Enhanced document chunk handling and API

- Updated `Scripts.sql` to add a new `[Index]` column to `[dbo].[DocumentChunks]` for order tracking.
- Modified `DocumentChunk.cs` to include a new `Index` property, and introduced a new immutable record class for document chunks.
- Introduced new API endpoints in `Program.cs` for document and chunk retrieval, including embedding details, with OpenAPI documentation enhancements.
- Updated an API endpoint description in `Program.cs` to clarify document embedding handling.
- Updated `VectorSearchService.cs` to reflect schema changes in service logic, adding methods for fetching document chunks and specific embeddings.
2026-06-20 12:23:10 +00:00 · 2024-07-10 11:25:50 +02:00
parent ad6faee370
commit 45de38d87a
5 changed files with 56 additions and 2 deletions
@@ -1,6 +1,7 @@
 CREATE TABLE [dbo].[DocumentChunks](
 	[Id] [uniqueidentifier] NOT NULL,
 	[DocumentId] [uniqueidentifier] NOT NULL,
+	[Index] INT NOT NULL,
 	[Content] [nvarchar](max) NOT NULL,
 	[Embedding] [varbinary](8000) NOT NULL,
 CONSTRAINT [PK_DocumentChunks] PRIMARY KEY CLUSTERED 
@@ -6,6 +6,8 @@ public partial class DocumentChunk

    public Guid DocumentId { get; set; }

+    public int Index { get; set; }
+
    public required string Content { get; set; }

    public required float[] Embedding { get; set; }
@@ -0,0 +1,3 @@
+namespace SqlDatabaseVectorSearch.Models;
+
+public record class DocumentChunk(Guid Id, int Index, string Content, float[]? Embedding = null);
@@ -1,3 +1,4 @@
+using Microsoft.AspNetCore.Http.HttpResults;
 using Microsoft.EntityFrameworkCore;
 using Microsoft.OpenApi.Models;
 using Microsoft.SemanticKernel;
@@ -74,7 +75,35 @@ documentsApiGroup.MapGet(string.Empty, async (VectorSearchService vectorSearchSe
 .WithOpenApi(operation =>
 {
    operation.Summary = "Gets the list of documents";
+    return operation;
+});

+documentsApiGroup.MapGet("{documentId:guid}/chunks", async (Guid documentId, VectorSearchService vectorSearchService) =>
+{
+    var documents = await vectorSearchService.GetDocumentChunksAsync(documentId);
+    return TypedResults.Ok(documents);
+})
+.WithOpenApi(operation =>
+{
+    operation.Summary = "Gets the list of chunks of a given document";
+    operation.Description = "The list does not contain embedding. Use '/api/documents/{documentId}/chunks/{documentChunkId}' to get the embedding for a given chunk.";
+
+    return operation;
+});
+
+documentsApiGroup.MapGet("{documentId:guid}/chunks/{documentChunkId:guid}", async Task<Results<Ok<DocumentChunk>, NotFound>> (Guid documentId, Guid documentChunkId, VectorSearchService vectorSearchService) =>
+{
+    var chunk = await vectorSearchService.GetDocumentChunkEmbeddingAsync(documentId, documentChunkId);
+    if (chunk is null)
+    {
+        return TypedResults.NotFound();
+    }
+
+    return TypedResults.Ok(chunk);
+})
+.WithOpenApi(operation =>
+{
+    operation.Summary = "Gets the details of a given chunk, includings its embedding";
    return operation;
 });

@@ -89,7 +118,7 @@ documentsApiGroup.MapPost(string.Empty, async (IFormFile file, VectorSearchServi
 .WithOpenApi(operation =>
 {
    operation.Summary = "Uploads a document";
-    operation.Description = "Uploads a document to SQL Server and saves its embeddings using Vector Support. The document will be indexed and used to answer questions. Currently, only PDF files are supported.";
+    operation.Description = "Uploads a document to SQL Server and saves its embedding using Vector Support. The document will be indexed and used to answer questions. Currently, only PDF files are supported.";

    operation.Parameter("documentId").Description = "The unique identifier of the document. If not provided, a new one will be generated. If you specify an existing documentId, the document will be overridden.";

@@ -39,9 +39,10 @@ public class VectorSearchService(ApplicationDbContext dbContext, ITextEmbeddingG
        var paragraphs = TextChunker.SplitPlainTextParagraphs(TextChunker.SplitPlainTextLines(content, appSettings.MaxTokensPerLine), appSettings.MaxTokensPerParagraph, appSettings.OverlapTokens);
        var embeddings = await textEmbeddingGenerationService.GenerateEmbeddingsAsync(paragraphs);

+        var index = 0;
        foreach (var (paragraph, embedding) in paragraphs.Zip(embeddings, (p, e) => (p, e.ToArray())))
        {
-            var documentChunk = new Entities.DocumentChunk { DocumentId = documentId.Value, Content = paragraph, Embedding = embedding };
+            var documentChunk = new Entities.DocumentChunk { DocumentId = documentId.Value, Index = index++, Content = paragraph, Embedding = embedding };
            dbContext.DocumentChunks.Add(documentChunk);
        }

@@ -58,6 +59,24 @@ public class VectorSearchService(ApplicationDbContext dbContext, ITextEmbeddingG
        return documents;
    }

+    public async Task<IEnumerable<DocumentChunk>> GetDocumentChunksAsync(Guid documentId)
+    {
+        var documentChunks = await dbContext.DocumentChunks.Where(c => c.DocumentId == documentId).OrderBy(c => c.Index).AsNoTracking()
+            .Select(c => new DocumentChunk(c.Id, c.Index, c.Content, null))
+            .ToListAsync();
+
+        return documentChunks;
+    }
+
+    public async Task<DocumentChunk?> GetDocumentChunkEmbeddingAsync(Guid documentId, Guid documentChunkId)
+    {
+        var documentChunk = await dbContext.DocumentChunks.Where(c => c.Id == documentChunkId && c.DocumentId == documentId).AsNoTracking()
+            .Select(c => new DocumentChunk(c.Id, c.Index, c.Content, c.Embedding))
+            .FirstOrDefaultAsync();
+
+        return documentChunk;
+    }
+
    public async Task DeleteDocumentAsync(Guid documentId, bool saveChanges = true)
    {
        var document = await dbContext.Documents.Include(d => d.Chunks).FirstOrDefaultAsync(d => d.Id == documentId);