From 45de38d87a9aea395639c5c15eb136c956661ac5 Mon Sep 17 00:00:00 2001 From: Marco Minerva Date: Wed, 10 Jul 2024 11:25:50 +0200 Subject: [PATCH] Enhanced document chunk handling and API - Updated `Scripts.sql` to add a new `[Index]` column to `[dbo].[DocumentChunks]` for order tracking. - Modified `DocumentChunk.cs` to include a new `Index` property, and introduced a new immutable record class for document chunks. - Introduced new API endpoints in `Program.cs` for document and chunk retrieval, including embedding details, with OpenAPI documentation enhancements. - Updated an API endpoint description in `Program.cs` to clarify document embedding handling. - Updated `VectorSearchService.cs` to reflect schema changes in service logic, adding methods for fetching document chunks and specific embeddings. --- Scripts.sql | 1 + .../DataAccessLayer/Entities/DocumentChunk.cs | 2 ++ .../Models/DocumentChunk.cs | 3 ++ SqlDatabaseVectorSearch/Program.cs | 31 ++++++++++++++++++- .../Services/VectorSearchService.cs | 21 ++++++++++++- 5 files changed, 56 insertions(+), 2 deletions(-) create mode 100644 SqlDatabaseVectorSearch/Models/DocumentChunk.cs diff --git a/Scripts.sql b/Scripts.sql index 2b62ad4..8765446 100644 --- a/Scripts.sql +++ b/Scripts.sql @@ -1,6 +1,7 @@ CREATE TABLE [dbo].[DocumentChunks]( [Id] [uniqueidentifier] NOT NULL, [DocumentId] [uniqueidentifier] NOT NULL, + [Index] INT NOT NULL, [Content] [nvarchar](max) NOT NULL, [Embedding] [varbinary](8000) NOT NULL, CONSTRAINT [PK_DocumentChunks] PRIMARY KEY CLUSTERED diff --git a/SqlDatabaseVectorSearch/DataAccessLayer/Entities/DocumentChunk.cs b/SqlDatabaseVectorSearch/DataAccessLayer/Entities/DocumentChunk.cs index 597a175..b6f96ed 100644 --- a/SqlDatabaseVectorSearch/DataAccessLayer/Entities/DocumentChunk.cs +++ b/SqlDatabaseVectorSearch/DataAccessLayer/Entities/DocumentChunk.cs @@ -6,6 +6,8 @@ public partial class DocumentChunk public Guid DocumentId { get; set; } + public int Index { get; set; } + public 
required string Content { get; set; } public required float[] Embedding { get; set; } diff --git a/SqlDatabaseVectorSearch/Models/DocumentChunk.cs b/SqlDatabaseVectorSearch/Models/DocumentChunk.cs new file mode 100644 index 0000000..3753352 --- /dev/null +++ b/SqlDatabaseVectorSearch/Models/DocumentChunk.cs @@ -0,0 +1,3 @@ +namespace SqlDatabaseVectorSearch.Models; + +public record class DocumentChunk(Guid Id, int Index, string Content, float[]? Embedding = null); diff --git a/SqlDatabaseVectorSearch/Program.cs b/SqlDatabaseVectorSearch/Program.cs index 1ee0cc3..da657a3 100644 --- a/SqlDatabaseVectorSearch/Program.cs +++ b/SqlDatabaseVectorSearch/Program.cs @@ -1,3 +1,4 @@ +using Microsoft.AspNetCore.Http.HttpResults; using Microsoft.EntityFrameworkCore; using Microsoft.OpenApi.Models; using Microsoft.SemanticKernel; @@ -74,7 +75,35 @@ documentsApiGroup.MapGet(string.Empty, async (VectorSearchService vectorSearchSe .WithOpenApi(operation => { operation.Summary = "Gets the list of documents"; + return operation; +}); +documentsApiGroup.MapGet("{documentId:guid}/chunks", async (Guid documentId, VectorSearchService vectorSearchService) => +{ + var documents = await vectorSearchService.GetDocumentChunksAsync(documentId); + return TypedResults.Ok(documents); +}) +.WithOpenApi(operation => +{ + operation.Summary = "Gets the list of chunks of a given document"; + operation.Description = "The list does not contain embedding. 
Use '/api/documents/{documentId}/chunks/{documentChunkId}' to get the embedding for a given chunk."; + + return operation; +}); + +documentsApiGroup.MapGet("{documentId:guid}/chunks/{documentChunkId:guid}", async Task<Results<Ok<DocumentChunk>, NotFound>> (Guid documentId, Guid documentChunkId, VectorSearchService vectorSearchService) => +{ + var chunk = await vectorSearchService.GetDocumentChunkEmbeddingAsync(documentId, documentChunkId); + if (chunk is null) + { + return TypedResults.NotFound(); + } + + return TypedResults.Ok(chunk); +}) +.WithOpenApi(operation => +{ + operation.Summary = "Gets the details of a given chunk, including its embedding"; return operation; }); @@ -89,7 +118,7 @@ documentsApiGroup.MapPost(string.Empty, async (IFormFile file, VectorSearchServi .WithOpenApi(operation => { operation.Summary = "Uploads a document"; - operation.Description = "Uploads a document to SQL Server and saves its embeddings using Vector Support. The document will be indexed and used to answer questions. Currently, only PDF files are supported."; + operation.Description = "Uploads a document to SQL Server and saves its embedding using Vector Support. The document will be indexed and used to answer questions. Currently, only PDF files are supported."; operation.Parameter("documentId").Description = "The unique identifier of the document. If not provided, a new one will be generated. 
If you specify an existing documentId, the document will be overridden."; diff --git a/SqlDatabaseVectorSearch/Services/VectorSearchService.cs b/SqlDatabaseVectorSearch/Services/VectorSearchService.cs index b7bd990..9936636 100644 --- a/SqlDatabaseVectorSearch/Services/VectorSearchService.cs +++ b/SqlDatabaseVectorSearch/Services/VectorSearchService.cs @@ -39,9 +39,10 @@ public class VectorSearchService(ApplicationDbContext dbContext, ITextEmbeddingG var paragraphs = TextChunker.SplitPlainTextParagraphs(TextChunker.SplitPlainTextLines(content, appSettings.MaxTokensPerLine), appSettings.MaxTokensPerParagraph, appSettings.OverlapTokens); var embeddings = await textEmbeddingGenerationService.GenerateEmbeddingsAsync(paragraphs); + var index = 0; foreach (var (paragraph, embedding) in paragraphs.Zip(embeddings, (p, e) => (p, e.ToArray()))) { - var documentChunk = new Entities.DocumentChunk { DocumentId = documentId.Value, Content = paragraph, Embedding = embedding }; + var documentChunk = new Entities.DocumentChunk { DocumentId = documentId.Value, Index = index++, Content = paragraph, Embedding = embedding }; dbContext.DocumentChunks.Add(documentChunk); } @@ -58,6 +59,24 @@ public class VectorSearchService(ApplicationDbContext dbContext, ITextEmbeddingG return documents; } + public async Task<IEnumerable<DocumentChunk>> GetDocumentChunksAsync(Guid documentId) + { + var documentChunks = await dbContext.DocumentChunks.Where(c => c.DocumentId == documentId).OrderBy(c => c.Index).AsNoTracking() + .Select(c => new DocumentChunk(c.Id, c.Index, c.Content, null)) + .ToListAsync(); + + return documentChunks; + } + + public async Task<DocumentChunk?> GetDocumentChunkEmbeddingAsync(Guid documentId, Guid documentChunkId) + { + var documentChunk = await dbContext.DocumentChunks.Where(c => c.Id == documentChunkId && c.DocumentId == documentId).AsNoTracking() + .Select(c => new DocumentChunk(c.Id, c.Index, c.Content, c.Embedding)) + .FirstOrDefaultAsync(); + + return documentChunk; + } + public async Task 
DeleteDocumentAsync(Guid documentId, bool saveChanges = true) { var document = await dbContext.Documents.Include(d => d.Chunks).FirstOrDefaultAsync(d => d.Id == documentId);