From b6c898a3f572e601b84f24f832cf97ba4e4e659a Mon Sep 17 00:00:00 2001 From: Marco Minerva Date: Fri, 14 Jun 2024 17:20:21 +0200 Subject: [PATCH] Refactor code and enhance API documentation - Converted `Question.cs` and `Search.cs` records to `record class` syntax for clarity. - Organized API endpoints with tags and added a new GET endpoint for listing documents in `Program.cs`, including OpenAPI documentation improvements (such as a corrected description for the existing DELETE endpoint). - Removed commented-out code in `Program.cs` and `VectorSearchService.cs` for a cleaner codebase. - Introduced `WithTags` for better API operation categorization in Swagger UI. - Added a TODO comment in `ChatService.cs` for a future improvement to the chunk length check. - Clarified `using` directives in `VectorSearchService.cs` with namespace aliasing to improve readability. - Refactored document deletion in `VectorSearchService.cs` to use a private helper method and expanded service capabilities with a new `GetDocumentsAsync` method. - Introduced a new `Document` model in the `Models` namespace to support document fetching functionality. - Simplified `appsettings.json` by removing the `MaxTokens` configuration for the `ChatCompletion` and `Embedding` services. 
--- SqlDatabaseVectorSearch/Models/Document.cs | 3 + SqlDatabaseVectorSearch/Models/Question.cs | 2 +- SqlDatabaseVectorSearch/Models/Search.cs | 2 +- SqlDatabaseVectorSearch/Program.cs | 38 ++++++------ .../Services/ChatService.cs | 2 +- .../Services/VectorSearchService.cs | 60 +++++++++---------- SqlDatabaseVectorSearch/appsettings.json | 6 +- 7 files changed, 52 insertions(+), 61 deletions(-) create mode 100644 SqlDatabaseVectorSearch/Models/Document.cs diff --git a/SqlDatabaseVectorSearch/Models/Document.cs b/SqlDatabaseVectorSearch/Models/Document.cs new file mode 100644 index 0000000..ca208ce --- /dev/null +++ b/SqlDatabaseVectorSearch/Models/Document.cs @@ -0,0 +1,3 @@ +namespace SqlDatabaseVectorSearch.Models; + +public record class Document(Guid Id, string Name, DateTimeOffset CreationDate, int ChunkCount); diff --git a/SqlDatabaseVectorSearch/Models/Question.cs b/SqlDatabaseVectorSearch/Models/Question.cs index e84a050..e853813 100644 --- a/SqlDatabaseVectorSearch/Models/Question.cs +++ b/SqlDatabaseVectorSearch/Models/Question.cs @@ -1,3 +1,3 @@ namespace SqlDatabaseVectorSearch.Models; -public record Question(Guid ConversationId, string Text) : Search(Text); +public record class Question(Guid ConversationId, string Text) : Search(Text); diff --git a/SqlDatabaseVectorSearch/Models/Search.cs b/SqlDatabaseVectorSearch/Models/Search.cs index 3b20c18..a3a3077 100644 --- a/SqlDatabaseVectorSearch/Models/Search.cs +++ b/SqlDatabaseVectorSearch/Models/Search.cs @@ -1,4 +1,4 @@ namespace SqlDatabaseVectorSearch.Models; -public record Search(string Text); +public record class Search(string Text); diff --git a/SqlDatabaseVectorSearch/Program.cs b/SqlDatabaseVectorSearch/Program.cs index 8cb7925..ab39ecb 100644 --- a/SqlDatabaseVectorSearch/Program.cs +++ b/SqlDatabaseVectorSearch/Program.cs @@ -63,7 +63,19 @@ if (app.Environment.IsDevelopment()) }); } -var documentsApiGroup = app.MapGroup("/api/documents"); +var documentsApiGroup = 
app.MapGroup("/api/documents").WithTags("Documents"); + +documentsApiGroup.MapGet(string.Empty, async (VectorSearchService vectorSearchService) => +{ + var documents = await vectorSearchService.GetDocumentsAsync(); + return TypedResults.Ok(documents); +}) +.WithOpenApi(operation => +{ + operation.Summary = "Gets the list of documents"; + + return operation; +}); documentsApiGroup.MapPost(string.Empty, async (IFormFile file, VectorSearchService vectorSearchService, LinkGenerator linkGenerator, Guid? documentId = null) => { @@ -79,8 +91,7 @@ documentsApiGroup.MapPost(string.Empty, async (IFormFile file, VectorSearchServi operation.Parameter("documentId").Description = "The unique identifier of the document. If not provided, a new one will be generated. If you specify an existing documentId, the document will be overridden."; return operation; -}) -; +}); documentsApiGroup.MapDelete("{documentId:guid}", async (Guid documentId, VectorSearchService vectorSearchService) => { @@ -90,27 +101,11 @@ documentsApiGroup.MapDelete("{documentId:guid}", async (Guid documentId, VectorS .WithOpenApi(operation => { operation.Summary = "Deletes a document"; - operation.Description = "This endpoint deletes the documents and all its chunks from SQL Server"; + operation.Description = "This endpoint deletes the document and all its chunks from SQL Server"; return operation; }); -//app.MapPost("/api/search", async (Search search, ApplicationMemoryService memory, double minimumRelevance = 0, string? index = null) => -//{ -// var response = await memory.SearchAsync(search, minimumRelevance, index); -// return TypedResults.Ok(response); -//}) -//.WithOpenApi(operation => -//{ -// operation.Summary = "Search into Kernel Memory"; -// operation.Description = "Search into Kernel Memory using the provided question and optional tags. 
If tags are provided, they will be used as filters with OR logic."; - -// operation.Parameter("minimumRelevance").Description = "The minimum Cosine Similarity required."; -// operation.Parameter("index").Description = "The index in which to search for documents. If not provided, the default index will be used ('default')."; - -// return operation; -//}); - app.MapPost("/api/ask", async (Question question, VectorSearchService vectorSearchService, bool reformulate = true) => { var response = await vectorSearchService.AskQuestionAsync(question, reformulate); @@ -124,6 +119,7 @@ app.MapPost("/api/ask", async (Question question, VectorSearchService vectorSear operation.Parameter("reformulate").Description = "If true, the question will be reformulated taking into account the context of the chat identified by the given ConversationId."; return operation; -}); +}) +.WithTags("Ask"); app.Run(); \ No newline at end of file diff --git a/SqlDatabaseVectorSearch/Services/ChatService.cs b/SqlDatabaseVectorSearch/Services/ChatService.cs index faeeba9..5c97052 100644 --- a/SqlDatabaseVectorSearch/Services/ChatService.cs +++ b/SqlDatabaseVectorSearch/Services/ChatService.cs @@ -45,6 +45,7 @@ public class ChatService(IMemoryCache cache, IChatCompletionService chatCompleti """); + // TODO: Ensure that the chunks are not too long, according to the model max token. 
foreach (var result in chunks.Select(c => c.Content)) { prompt.AppendLine(result); @@ -75,7 +76,6 @@ public class ChatService(IMemoryCache cache, IChatCompletionService chatCompleti } cache.Set(conversationId, chat, appSettingsOptions.Value.MessageExpiration); - return Task.CompletedTask; } } diff --git a/SqlDatabaseVectorSearch/Services/VectorSearchService.cs b/SqlDatabaseVectorSearch/Services/VectorSearchService.cs index e7ddc4d..eb81657 100644 --- a/SqlDatabaseVectorSearch/Services/VectorSearchService.cs +++ b/SqlDatabaseVectorSearch/Services/VectorSearchService.cs @@ -3,10 +3,10 @@ using Microsoft.EntityFrameworkCore; using Microsoft.SemanticKernel.Embeddings; using Microsoft.SemanticKernel.Text; using SqlDatabaseVectorSearch.DataAccessLayer; -using SqlDatabaseVectorSearch.DataAccessLayer.Entities; using SqlDatabaseVectorSearch.Models; using UglyToad.PdfPig; using UglyToad.PdfPig.DocumentLayoutAnalysis.TextExtractor; +using Entities = SqlDatabaseVectorSearch.DataAccessLayer.Entities; namespace SqlDatabaseVectorSearch.Services; @@ -24,11 +24,11 @@ public class VectorSearchService(ApplicationDbContext dbContext, ITextEmbeddingG } else { - // Creates a new document. + // Create a new document. documentId = Guid.NewGuid(); } - var document = new Document { Id = documentId.Value, Name = name, CreationDate = DateTimeOffset.UtcNow }; + var document = new Entities.Document { Id = documentId.Value, Name = name, CreationDate = DateTimeOffset.UtcNow }; dbContext.Documents.Add(document); // Split the content into chunks of at most 1024 tokens and generate the embeddings for each one. 
@@ -37,7 +37,7 @@ public class VectorSearchService(ApplicationDbContext dbContext, ITextEmbeddingG foreach (var (paragraph, embedding) in paragraphs.Zip(embeddings, (p, e) => (p, e.ToArray()))) { - var documentChunk = new DocumentChunk { DocumentId = documentId.Value, Content = paragraph, Embedding = embedding }; + var documentChunk = new Entities.DocumentChunk { DocumentId = documentId.Value, Content = paragraph, Embedding = embedding }; dbContext.DocumentChunks.Add(documentChunk); } @@ -45,17 +45,18 @@ public class VectorSearchService(ApplicationDbContext dbContext, ITextEmbeddingG return documentId.Value; } + public async Task<IEnumerable<Document>> GetDocumentsAsync() + { + var documents = await dbContext.Documents.OrderBy(d => d.Name).AsNoTracking() + .Select(d => new Document(d.Id, d.Name, d.CreationDate, d.DocumentChunks.Count)) + .ToListAsync(); + + return documents; + } + public async Task DeleteDocumentAsync(Guid documentId) { - var document = await dbContext.Documents.Include(d => d.DocumentChunks).FirstOrDefaultAsync(d => d.Id == documentId); - if (document is null) - { - return; - } - - dbContext.DocumentChunks.RemoveRange(document.DocumentChunks); - dbContext.Documents.Remove(document); - + await DeleteDocumentInternalAsync(documentId); await dbContext.SaveChangesAsync(); } @@ -69,13 +70,6 @@ public class VectorSearchService(ApplicationDbContext dbContext, ITextEmbeddingG var chunks = await dbContext.DocumentChunks .OrderBy(c => EF.Functions.VectorDistance("cosine", c.Embedding, questionEmbedding.ToArray())) - //.Select(c => new - //{ - // c.Id, - // c.DocumentId, - // c.Content, - // Distance = EF.Functions.VectorDistance("cosine", c.Embedding, questionEmbedding.ToArray()) - //}) .Take(5) .ToListAsync(); @@ -83,18 +77,6 @@ public class VectorSearchService(ApplicationDbContext dbContext, ITextEmbeddingG return new Response(reformulatedQuestion, answer); } - //public async Task SearchAsync(Search search, double minimumRelevance = 0, string? 
index = null) - //{ - // // Search using the embedding search via Kernel Memory . - // // If tags are provided, use them as filters with OR logic. - // var searchResult = await memory.SearchAsync(search.Text.TrimEnd([' ', '?']), index, filters: search.Tags.ToMemoryFilters(), minRelevance: minimumRelevance, limit: 50); - - // // If you want to use an AND logic, set the "filter" parameter (instead of "filters"). - // //var searchResult = await memory.SearchAsync(search.Text.TrimEnd([' ', '?']), index, filter: search.Tags.ToMemoryFilter(), minRelevance: minimumRelevance); - - // return searchResult; - //} - private static Task GetContentAsync(Stream stream) { var content = new StringBuilder(); @@ -102,7 +84,7 @@ public class VectorSearchService(ApplicationDbContext dbContext, ITextEmbeddingG // Reads the content of the PDF document using PdfPig. using var pdfDocument = PdfDocument.Open(stream); - foreach (var page in pdfDocument.GetPages().Where(x => x != null)) + foreach (var page in pdfDocument.GetPages().Where(x => x is not null)) { var pageContent = ContentOrderTextExtractor.GetText(page) ?? 
string.Empty; content.AppendLine(pageContent); @@ -110,4 +92,16 @@ public class VectorSearchService(ApplicationDbContext dbContext, ITextEmbeddingG return Task.FromResult(content.ToString()); } + + private async Task DeleteDocumentInternalAsync(Guid documentId) + { + var document = await dbContext.Documents.Include(d => d.DocumentChunks).FirstOrDefaultAsync(d => d.Id == documentId); + if (document is null) + { + return; + } + + dbContext.DocumentChunks.RemoveRange(document.DocumentChunks); + dbContext.Documents.Remove(document); + } } \ No newline at end of file diff --git a/SqlDatabaseVectorSearch/appsettings.json b/SqlDatabaseVectorSearch/appsettings.json index d9c99d7..84b9ab8 100644 --- a/SqlDatabaseVectorSearch/appsettings.json +++ b/SqlDatabaseVectorSearch/appsettings.json @@ -6,14 +6,12 @@ "ChatCompletion": { "Endpoint": "", "Deployment": "", - "ApiKey": "", - "MaxTokens": 32768 + "ApiKey": "" }, "Embedding": { "Endpoint": "", "Deployment": "", - "ApiKey": "", - "MaxTokens": 8191 + "ApiKey": "" } }, "AppSettings": {