Enhanced document chunk handling and API

- Updated `Scripts.sql` to add a new `[Index]` column to `[dbo].[DocumentChunks]` for order tracking.
- Modified the `DocumentChunk` entity (`DocumentChunk.cs`) to include a new `Index` property, and introduced a separate immutable `DocumentChunk` record class in `SqlDatabaseVectorSearch.Models` for returning document chunks.
- Introduced new API endpoints in `Program.cs` for document and chunk retrieval, including embedding details, with OpenAPI documentation enhancements.
- Updated an API endpoint description in `Program.cs` to clarify document embedding handling.
- Updated `VectorSearchService.cs` to reflect schema changes in service logic, adding methods for fetching document chunks and specific embeddings.
This commit is contained in:
Marco Minerva
2024-07-10 11:25:50 +02:00
parent ad6faee370
commit 45de38d87a
5 changed files with 56 additions and 2 deletions
+1
View File
@@ -1,6 +1,7 @@
CREATE TABLE [dbo].[DocumentChunks](
[Id] [uniqueidentifier] NOT NULL,
[DocumentId] [uniqueidentifier] NOT NULL,
[Index] INT NOT NULL,
[Content] [nvarchar](max) NOT NULL,
[Embedding] [varbinary](8000) NOT NULL,
CONSTRAINT [PK_DocumentChunks] PRIMARY KEY CLUSTERED
@@ -6,6 +6,8 @@ public partial class DocumentChunk
public Guid DocumentId { get; set; }
public int Index { get; set; }
public required string Content { get; set; }
public required float[] Embedding { get; set; }
@@ -0,0 +1,3 @@
namespace SqlDatabaseVectorSearch.Models;
/// <summary>
/// Describes a single chunk of a document as returned by the chunk retrieval methods.
/// </summary>
/// <param name="Id">The unique identifier of the chunk.</param>
/// <param name="Index">The zero-based position of the chunk within its document.</param>
/// <param name="Content">The text content of the chunk.</param>
/// <param name="Embedding">
/// The embedding of the chunk, or <see langword="null"/> (the default) when the embedding is not included.
/// </param>
public record class DocumentChunk(Guid Id, int Index, string Content, float[]? Embedding = null);
+30 -1
View File
@@ -1,3 +1,4 @@
using Microsoft.AspNetCore.Http.HttpResults;
using Microsoft.EntityFrameworkCore;
using Microsoft.OpenApi.Models;
using Microsoft.SemanticKernel;
@@ -74,7 +75,35 @@ documentsApiGroup.MapGet(string.Empty, async (VectorSearchService vectorSearchSe
.WithOpenApi(operation =>
{
operation.Summary = "Gets the list of documents";
return operation;
});
// GET {documentId}/chunks: returns the chunks of the given document wrapped in a 200 OK result.
documentsApiGroup.MapGet("{documentId:guid}/chunks", async (Guid documentId, VectorSearchService vectorSearchService) =>
    TypedResults.Ok(await vectorSearchService.GetDocumentChunksAsync(documentId)))
.WithOpenApi(openApiOperation =>
{
    // Point callers at the single-chunk endpoint, since this listing carries no embeddings.
    openApiOperation.Summary = "Gets the list of chunks of a given document";
    openApiOperation.Description = "The list does not contain embedding. Use '/api/documents/{documentId}/chunks/{documentChunkId}' to get the embedding for a given chunk.";

    return openApiOperation;
});
// GET {documentId}/chunks/{documentChunkId}: returns a single chunk together with its embedding,
// or 404 Not Found when no chunk matches both identifiers.
documentsApiGroup.MapGet("{documentId:guid}/chunks/{documentChunkId:guid}", async Task<Results<Ok<DocumentChunk>, NotFound>> (Guid documentId, Guid documentChunkId, VectorSearchService vectorSearchService) =>
{
    var chunk = await vectorSearchService.GetDocumentChunkEmbeddingAsync(documentId, documentChunkId);

    // The service returns null when the chunk does not exist or does not belong to the document.
    if (chunk is null)
    {
        return TypedResults.NotFound();
    }

    return TypedResults.Ok(chunk);
})
.WithOpenApi(operation =>
{
    // Fixed typo ("includings") in the summary that is published in the OpenAPI document.
    operation.Summary = "Gets the details of a given chunk, including its embedding";
    return operation;
});
@@ -89,7 +118,7 @@ documentsApiGroup.MapPost(string.Empty, async (IFormFile file, VectorSearchServi
.WithOpenApi(operation =>
{
operation.Summary = "Uploads a document";
operation.Description = "Uploads a document to SQL Server and saves its embeddings using Vector Support. The document will be indexed and used to answer questions. Currently, only PDF files are supported.";
operation.Description = "Uploads a document to SQL Server and saves its embedding using Vector Support. The document will be indexed and used to answer questions. Currently, only PDF files are supported.";
operation.Parameter("documentId").Description = "The unique identifier of the document. If not provided, a new one will be generated. If you specify an existing documentId, the document will be overridden.";
@@ -39,9 +39,10 @@ public class VectorSearchService(ApplicationDbContext dbContext, ITextEmbeddingG
var paragraphs = TextChunker.SplitPlainTextParagraphs(TextChunker.SplitPlainTextLines(content, appSettings.MaxTokensPerLine), appSettings.MaxTokensPerParagraph, appSettings.OverlapTokens);
var embeddings = await textEmbeddingGenerationService.GenerateEmbeddingsAsync(paragraphs);
var index = 0;
foreach (var (paragraph, embedding) in paragraphs.Zip(embeddings, (p, e) => (p, e.ToArray())))
{
var documentChunk = new Entities.DocumentChunk { DocumentId = documentId.Value, Content = paragraph, Embedding = embedding };
var documentChunk = new Entities.DocumentChunk { DocumentId = documentId.Value, Index = index++, Content = paragraph, Embedding = embedding };
dbContext.DocumentChunks.Add(documentChunk);
}
@@ -58,6 +59,24 @@ public class VectorSearchService(ApplicationDbContext dbContext, ITextEmbeddingG
return documents;
}
/// <summary>
/// Gets the chunks of the given document, ordered by their index.
/// </summary>
/// <param name="documentId">The identifier of the document whose chunks are requested.</param>
/// <returns>
/// The chunks of the document, each with a <see langword="null"/> embedding;
/// the sequence is empty when no chunk matches <paramref name="documentId"/>.
/// </returns>
public async Task<IEnumerable<DocumentChunk>> GetDocumentChunksAsync(Guid documentId)
{
    var query = dbContext.DocumentChunks
        .Where(chunk => chunk.DocumentId == documentId)
        .OrderBy(chunk => chunk.Index)
        .AsNoTracking()
        // The embedding is deliberately left out of the projection.
        .Select(chunk => new DocumentChunk(chunk.Id, chunk.Index, chunk.Content, null));

    return await query.ToListAsync();
}
/// <summary>
/// Gets a single chunk of a document, including its embedding.
/// </summary>
/// <param name="documentId">The identifier of the document the chunk must belong to.</param>
/// <param name="documentChunkId">The identifier of the chunk to retrieve.</param>
/// <returns>
/// The matching chunk with its embedding, or <see langword="null"/> when no chunk has the given
/// identifier within the given document.
/// </returns>
public async Task<DocumentChunk?> GetDocumentChunkEmbeddingAsync(Guid documentId, Guid documentChunkId)
{
    // Both identifiers must match, so a chunk cannot be read through another document's id.
    var query = dbContext.DocumentChunks
        .Where(chunk => chunk.Id == documentChunkId && chunk.DocumentId == documentId)
        .AsNoTracking()
        .Select(chunk => new DocumentChunk(chunk.Id, chunk.Index, chunk.Content, chunk.Embedding));

    return await query.FirstOrDefaultAsync();
}
public async Task DeleteDocumentAsync(Guid documentId, bool saveChanges = true)
{
var document = await dbContext.Documents.Include(d => d.Chunks).FirstOrDefaultAsync(d => d.Id == documentId);