From fa81f01c27121768ea083fe93199e2f583fc6a0e Mon Sep 17 00:00:00 2001 From: Marco Minerva Date: Tue, 27 May 2025 17:10:17 +0200 Subject: [PATCH 01/24] Refactor content decoders and restructure data layer Updated `DocxContentDecoder`, `PdfContentDecoder`, and `TextContentDecoder` to return `Task<IEnumerable<Chunk>>` instead of `Task<string>`, introducing a new `Chunk` record for structured output. Restructured the `ApplicationDbContext`, `Document`, and `DocumentChunk` classes by moving them to the `SqlDatabaseVectorSearch.Data` namespace for better organization. Updated database migration files to align with the new entity structure and modified references in `Program.cs`, `DocumentService.cs`, and `VectorSearchService.cs` to use the new namespace. --- .../ContentDecoders/DocxContentDecoder.cs | 4 +-- .../ContentDecoders/IContentDecoder.cs | 4 ++- .../ContentDecoders/PdfContentDecoder.cs | 31 ++++++++++++------- .../ContentDecoders/TextContentDecoder.cs | 4 +-- .../ApplicationDbContext.cs | 4 +-- .../Entities/Document.cs | 2 +- .../Entities/DocumentChunk.cs | 2 +- .../00000000000000_Initial.Designer.cs | 4 +-- .../Migrations/00000000000000_Initial.cs | 2 +- .../ApplicationDbContextModelSnapshot.cs | 4 +-- SqlDatabaseVectorSearch/Program.cs | 2 +- .../Services/DocumentService.cs | 2 +- .../Services/VectorSearchService.cs | 15 +++------ 13 files changed, 43 insertions(+), 37 deletions(-) rename SqlDatabaseVectorSearch/{DataAccessLayer => Data}/ApplicationDbContext.cs (93%) rename SqlDatabaseVectorSearch/{DataAccessLayer => Data}/Entities/Document.cs (78%) rename SqlDatabaseVectorSearch/{DataAccessLayer => Data}/Entities/DocumentChunk.cs (82%) rename SqlDatabaseVectorSearch/{DataAccessLayer => Data}/Migrations/00000000000000_Initial.Designer.cs (96%) rename SqlDatabaseVectorSearch/{DataAccessLayer => Data}/Migrations/00000000000000_Initial.cs (97%) rename SqlDatabaseVectorSearch/{DataAccessLayer => Data}/Migrations/ApplicationDbContextModelSnapshot.cs (96%) diff --git 
a/SqlDatabaseVectorSearch/ContentDecoders/DocxContentDecoder.cs b/SqlDatabaseVectorSearch/ContentDecoders/DocxContentDecoder.cs index b67a45b..d606fa1 100644 --- a/SqlDatabaseVectorSearch/ContentDecoders/DocxContentDecoder.cs +++ b/SqlDatabaseVectorSearch/ContentDecoders/DocxContentDecoder.cs @@ -6,7 +6,7 @@ namespace SqlDatabaseVectorSearch.ContentDecoders; public class DocxContentDecoder : IContentDecoder { - public Task DecodeAsync(Stream stream, string contentType, CancellationToken cancellationToken = default) + public Task> DecodeAsync(Stream stream, string contentType, CancellationToken cancellationToken = default) { // Open a Word document for read-only access. using var document = WordprocessingDocument.Open(stream, false); @@ -20,6 +20,6 @@ public class DocxContentDecoder : IContentDecoder content.AppendLine(p.InnerText); } - return Task.FromResult(content.ToString()); + return Task.FromResult(new List([new(1, 0, content.ToString())]).AsEnumerable()); } } diff --git a/SqlDatabaseVectorSearch/ContentDecoders/IContentDecoder.cs b/SqlDatabaseVectorSearch/ContentDecoders/IContentDecoder.cs index c5a46b5..4fc8293 100644 --- a/SqlDatabaseVectorSearch/ContentDecoders/IContentDecoder.cs +++ b/SqlDatabaseVectorSearch/ContentDecoders/IContentDecoder.cs @@ -2,5 +2,7 @@ public interface IContentDecoder { - Task DecodeAsync(Stream stream, string contentType, CancellationToken cancellationToken = default); + Task> DecodeAsync(Stream stream, string contentType, CancellationToken cancellationToken = default); } + +public record class Chunk(int PageNumber, int IndexOnPage, string Content); \ No newline at end of file diff --git a/SqlDatabaseVectorSearch/ContentDecoders/PdfContentDecoder.cs b/SqlDatabaseVectorSearch/ContentDecoders/PdfContentDecoder.cs index 60710f7..696b192 100644 --- a/SqlDatabaseVectorSearch/ContentDecoders/PdfContentDecoder.cs +++ b/SqlDatabaseVectorSearch/ContentDecoders/PdfContentDecoder.cs @@ -1,24 +1,33 @@ -using System.Text; +using 
SqlDatabaseVectorSearch.TextChunkers; using UglyToad.PdfPig; -using UglyToad.PdfPig.DocumentLayoutAnalysis.TextExtractor; +using UglyToad.PdfPig.Content; +using UglyToad.PdfPig.DocumentLayoutAnalysis.PageSegmenter; +using UglyToad.PdfPig.DocumentLayoutAnalysis.WordExtractor; namespace SqlDatabaseVectorSearch.ContentDecoders; -public class PdfContentDecoder : IContentDecoder +public class PdfContentDecoder(IServiceProvider serviceProvider) : IContentDecoder { - public Task DecodeAsync(Stream stream, string contentType, CancellationToken cancellationToken = default) + public Task> DecodeAsync(Stream stream, string contentType, CancellationToken cancellationToken = default) { - var content = new StringBuilder(); + var textChunker = serviceProvider.GetRequiredKeyedService(contentType); // Read the content of the PDF document. using var pdfDocument = PdfDocument.Open(stream); + var paragraphs = pdfDocument.GetPages().SelectMany(page => GetPageParagraphs(page, textChunker)).ToList(); - foreach (var page in pdfDocument.GetPages().Where(x => x is not null)) - { - var pageContent = ContentOrderTextExtractor.GetText(page) ?? 
string.Empty; - content.AppendLine(pageContent); - } + return Task.FromResult(paragraphs.AsEnumerable()); + } - return Task.FromResult(content.ToString()); + private static IEnumerable GetPageParagraphs(Page pdfPage, ITextChunker textChunker) + { + var letters = pdfPage.Letters; + var words = NearestNeighbourWordExtractor.Instance.GetWords(letters); + var textBlocks = DocstrumBoundingBoxes.Instance.GetBlocks(words); + var pageText = string.Join($"{Environment.NewLine}{Environment.NewLine}", textBlocks.Select(t => t.Text.ReplaceLineEndings(" "))); + + var paragraphs = textChunker.Split(pageText); + + return paragraphs.Select((text, index) => new Chunk(pdfPage.Number, index, text)); } } diff --git a/SqlDatabaseVectorSearch/ContentDecoders/TextContentDecoder.cs b/SqlDatabaseVectorSearch/ContentDecoders/TextContentDecoder.cs index 7b86637..3235b8d 100644 --- a/SqlDatabaseVectorSearch/ContentDecoders/TextContentDecoder.cs +++ b/SqlDatabaseVectorSearch/ContentDecoders/TextContentDecoder.cs @@ -2,11 +2,11 @@ public class TextContentDecoder : IContentDecoder { - public async Task DecodeAsync(Stream stream, string contentType, CancellationToken cancellationToken = default) + public async Task> DecodeAsync(Stream stream, string contentType, CancellationToken cancellationToken = default) { using var readStream = new StreamReader(stream); var content = await readStream.ReadToEndAsync(cancellationToken); - return content; + return [new(1, 0, content)]; } } diff --git a/SqlDatabaseVectorSearch/DataAccessLayer/ApplicationDbContext.cs b/SqlDatabaseVectorSearch/Data/ApplicationDbContext.cs similarity index 93% rename from SqlDatabaseVectorSearch/DataAccessLayer/ApplicationDbContext.cs rename to SqlDatabaseVectorSearch/Data/ApplicationDbContext.cs index 72dcfaf..e9e41a3 100644 --- a/SqlDatabaseVectorSearch/DataAccessLayer/ApplicationDbContext.cs +++ b/SqlDatabaseVectorSearch/Data/ApplicationDbContext.cs @@ -1,8 +1,8 @@ using EntityFramework.Exceptions.SqlServer; using 
Microsoft.EntityFrameworkCore; -using SqlDatabaseVectorSearch.DataAccessLayer.Entities; +using SqlDatabaseVectorSearch.Data.Entities; -namespace SqlDatabaseVectorSearch.DataAccessLayer; +namespace SqlDatabaseVectorSearch.Data; public class ApplicationDbContext(DbContextOptions options) : DbContext(options) { diff --git a/SqlDatabaseVectorSearch/DataAccessLayer/Entities/Document.cs b/SqlDatabaseVectorSearch/Data/Entities/Document.cs similarity index 78% rename from SqlDatabaseVectorSearch/DataAccessLayer/Entities/Document.cs rename to SqlDatabaseVectorSearch/Data/Entities/Document.cs index d90cf56..24e4818 100644 --- a/SqlDatabaseVectorSearch/DataAccessLayer/Entities/Document.cs +++ b/SqlDatabaseVectorSearch/Data/Entities/Document.cs @@ -1,4 +1,4 @@ -namespace SqlDatabaseVectorSearch.DataAccessLayer.Entities; +namespace SqlDatabaseVectorSearch.Data.Entities; public class Document { diff --git a/SqlDatabaseVectorSearch/DataAccessLayer/Entities/DocumentChunk.cs b/SqlDatabaseVectorSearch/Data/Entities/DocumentChunk.cs similarity index 82% rename from SqlDatabaseVectorSearch/DataAccessLayer/Entities/DocumentChunk.cs rename to SqlDatabaseVectorSearch/Data/Entities/DocumentChunk.cs index 0d1886c..580de9e 100644 --- a/SqlDatabaseVectorSearch/DataAccessLayer/Entities/DocumentChunk.cs +++ b/SqlDatabaseVectorSearch/Data/Entities/DocumentChunk.cs @@ -1,4 +1,4 @@ -namespace SqlDatabaseVectorSearch.DataAccessLayer.Entities; +namespace SqlDatabaseVectorSearch.Data.Entities; public class DocumentChunk { diff --git a/SqlDatabaseVectorSearch/DataAccessLayer/Migrations/00000000000000_Initial.Designer.cs b/SqlDatabaseVectorSearch/Data/Migrations/00000000000000_Initial.Designer.cs similarity index 96% rename from SqlDatabaseVectorSearch/DataAccessLayer/Migrations/00000000000000_Initial.Designer.cs rename to SqlDatabaseVectorSearch/Data/Migrations/00000000000000_Initial.Designer.cs index bb1f760..5789fb8 100644 --- 
a/SqlDatabaseVectorSearch/DataAccessLayer/Migrations/00000000000000_Initial.Designer.cs +++ b/SqlDatabaseVectorSearch/Data/Migrations/00000000000000_Initial.Designer.cs @@ -5,11 +5,11 @@ using Microsoft.EntityFrameworkCore.Infrastructure; using Microsoft.EntityFrameworkCore.Metadata; using Microsoft.EntityFrameworkCore.Migrations; using Microsoft.EntityFrameworkCore.Storage.ValueConversion; -using SqlDatabaseVectorSearch.DataAccessLayer; +using SqlDatabaseVectorSearch.Data; #nullable disable -namespace SqlDatabaseVectorSearch.DataAccessLayer.Migrations +namespace SqlDatabaseVectorSearch.Data.Migrations { [DbContext(typeof(ApplicationDbContext))] [Migration("20250224102351_Initial")] diff --git a/SqlDatabaseVectorSearch/DataAccessLayer/Migrations/00000000000000_Initial.cs b/SqlDatabaseVectorSearch/Data/Migrations/00000000000000_Initial.cs similarity index 97% rename from SqlDatabaseVectorSearch/DataAccessLayer/Migrations/00000000000000_Initial.cs rename to SqlDatabaseVectorSearch/Data/Migrations/00000000000000_Initial.cs index a44fde0..2a530b0 100644 --- a/SqlDatabaseVectorSearch/DataAccessLayer/Migrations/00000000000000_Initial.cs +++ b/SqlDatabaseVectorSearch/Data/Migrations/00000000000000_Initial.cs @@ -3,7 +3,7 @@ using Microsoft.EntityFrameworkCore.Migrations; #nullable disable -namespace SqlDatabaseVectorSearch.DataAccessLayer.Migrations +namespace SqlDatabaseVectorSearch.Data.Migrations { /// public partial class Initial : Migration diff --git a/SqlDatabaseVectorSearch/DataAccessLayer/Migrations/ApplicationDbContextModelSnapshot.cs b/SqlDatabaseVectorSearch/Data/Migrations/ApplicationDbContextModelSnapshot.cs similarity index 96% rename from SqlDatabaseVectorSearch/DataAccessLayer/Migrations/ApplicationDbContextModelSnapshot.cs rename to SqlDatabaseVectorSearch/Data/Migrations/ApplicationDbContextModelSnapshot.cs index 8be4784..6bb2ad9 100644 --- a/SqlDatabaseVectorSearch/DataAccessLayer/Migrations/ApplicationDbContextModelSnapshot.cs +++ 
b/SqlDatabaseVectorSearch/Data/Migrations/ApplicationDbContextModelSnapshot.cs @@ -4,11 +4,11 @@ using Microsoft.EntityFrameworkCore; using Microsoft.EntityFrameworkCore.Infrastructure; using Microsoft.EntityFrameworkCore.Metadata; using Microsoft.EntityFrameworkCore.Storage.ValueConversion; -using SqlDatabaseVectorSearch.DataAccessLayer; +using SqlDatabaseVectorSearch.Data; #nullable disable -namespace SqlDatabaseVectorSearch.DataAccessLayer.Migrations +namespace SqlDatabaseVectorSearch.Data.Migrations { [DbContext(typeof(ApplicationDbContext))] partial class ApplicationDbContextModelSnapshot : ModelSnapshot diff --git a/SqlDatabaseVectorSearch/Program.cs b/SqlDatabaseVectorSearch/Program.cs index 0652b74..f9d2e78 100644 --- a/SqlDatabaseVectorSearch/Program.cs +++ b/SqlDatabaseVectorSearch/Program.cs @@ -5,7 +5,7 @@ using Microsoft.EntityFrameworkCore; using Microsoft.SemanticKernel; using SqlDatabaseVectorSearch.Components; using SqlDatabaseVectorSearch.ContentDecoders; -using SqlDatabaseVectorSearch.DataAccessLayer; +using SqlDatabaseVectorSearch.Data; using SqlDatabaseVectorSearch.Extensions; using SqlDatabaseVectorSearch.Services; using SqlDatabaseVectorSearch.Settings; diff --git a/SqlDatabaseVectorSearch/Services/DocumentService.cs b/SqlDatabaseVectorSearch/Services/DocumentService.cs index 2a6c6ef..1955255 100644 --- a/SqlDatabaseVectorSearch/Services/DocumentService.cs +++ b/SqlDatabaseVectorSearch/Services/DocumentService.cs @@ -1,6 +1,6 @@ using System.Data; using Microsoft.EntityFrameworkCore; -using SqlDatabaseVectorSearch.DataAccessLayer; +using SqlDatabaseVectorSearch.Data; using SqlDatabaseVectorSearch.Models; namespace SqlDatabaseVectorSearch.Services; diff --git a/SqlDatabaseVectorSearch/Services/VectorSearchService.cs b/SqlDatabaseVectorSearch/Services/VectorSearchService.cs index beb0fde..c3c7e63 100644 --- a/SqlDatabaseVectorSearch/Services/VectorSearchService.cs +++ b/SqlDatabaseVectorSearch/Services/VectorSearchService.cs @@ -4,12 +4,11 @@ 
using Microsoft.EntityFrameworkCore; using Microsoft.Extensions.AI; using Microsoft.Extensions.Options; using SqlDatabaseVectorSearch.ContentDecoders; -using SqlDatabaseVectorSearch.DataAccessLayer; +using SqlDatabaseVectorSearch.Data; using SqlDatabaseVectorSearch.Models; using SqlDatabaseVectorSearch.Settings; -using SqlDatabaseVectorSearch.TextChunkers; using ChatResponse = SqlDatabaseVectorSearch.Models.ChatResponse; -using Entities = SqlDatabaseVectorSearch.DataAccessLayer.Entities; +using Entities = SqlDatabaseVectorSearch.Data.Entities; namespace SqlDatabaseVectorSearch.Services; @@ -21,10 +20,10 @@ public class VectorSearchService(IServiceProvider serviceProvider, ApplicationDb { // Extract the contents of the file. var decoder = serviceProvider.GetKeyedService(contentType) ?? throw new NotSupportedException($"Content type '{contentType}' is not supported."); - var content = await decoder.DecodeAsync(stream, contentType, cancellationToken); + var paragraphs = await decoder.DecodeAsync(stream, contentType, cancellationToken); // We get the token count of the whole document because it is the total number of token used by embedding (it may be necessary, for example, for cost analysis). - var tokenCount = tokenizerService.CountEmbeddingTokens(content); + var tokenCount = tokenizerService.CountEmbeddingTokens(string.Join(string.Empty, paragraphs.Select(p => p.Content))); var strategy = dbContext.Database.CreateExecutionStrategy(); var document = await strategy.ExecuteAsync(async (cancellationToken) => @@ -40,11 +39,7 @@ public class VectorSearchService(IServiceProvider serviceProvider, ApplicationDb var document = new Entities.Document { Id = documentId.GetValueOrDefault(), Name = name, CreationDate = timeProvider.GetUtcNow() }; dbContext.Documents.Add(document); - // Split the content into chunks and generate the embeddings for each one. 
- var textChunker = serviceProvider.GetRequiredKeyedService(contentType); - var paragraphs = textChunker.Split(content); - - var embeddings = await embeddingGenerator.GenerateAndZipAsync(paragraphs, cancellationToken: cancellationToken); + var embeddings = await embeddingGenerator.GenerateAndZipAsync(paragraphs.Select(p => p.Content), cancellationToken: cancellationToken); // Save the document chunks and the corresponding embedding in the database. foreach (var (index, embedding) in embeddings.Index()) From 1e531e5ad61dc28ee355148b65e009c7bf8cc59d Mon Sep 17 00:00:00 2001 From: Marco Minerva Date: Tue, 27 May 2025 17:19:25 +0200 Subject: [PATCH 02/24] Filter out empty paragraphs in PdfContentDecoder Updated the paragraph processing to exclude empty or whitespace-only entries before creating Chunk objects, ensuring only meaningful text is included. --- SqlDatabaseVectorSearch/ContentDecoders/PdfContentDecoder.cs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/SqlDatabaseVectorSearch/ContentDecoders/PdfContentDecoder.cs b/SqlDatabaseVectorSearch/ContentDecoders/PdfContentDecoder.cs index 696b192..c5cd0e9 100644 --- a/SqlDatabaseVectorSearch/ContentDecoders/PdfContentDecoder.cs +++ b/SqlDatabaseVectorSearch/ContentDecoders/PdfContentDecoder.cs @@ -28,6 +28,6 @@ public class PdfContentDecoder(IServiceProvider serviceProvider) : IContentDecod var paragraphs = textChunker.Split(pageText); - return paragraphs.Select((text, index) => new Chunk(pdfPage.Number, index, text)); + return paragraphs.Where(p => !string.IsNullOrWhiteSpace(p)).Select((text, index) => new Chunk(pdfPage.Number, index, text)); } } From 2fc070d0aa1027d8e79d09000dc1941c728e997d Mon Sep 17 00:00:00 2001 From: Marco Minerva Date: Wed, 4 Jun 2025 10:22:15 +0200 Subject: [PATCH 03/24] Refactor response handling and content decoding - Updated `TextContentDecoder` to use `ITextChunker` for paragraph splitting and return a list of `Chunk` objects. 
- Changed return type of `Stream` method in `AskEndpoints.cs` from `IAsyncEnumerable<QuestionResponse>` to `IAsyncEnumerable<Response>`. - Removed `QuestionResponse` class and introduced `Response` class to better handle streaming responses. - Modified `AskQuestionAsync` and `AskStreamingAsync` methods in `VectorSearchService` to return `Response` instead of `QuestionResponse`, and adjusted token count calculation. - Added namespace declaration in `Response.cs` and defined properties to align with new response structure. --- .../ContentDecoders/TextContentDecoder.cs | 11 ++++++++--- SqlDatabaseVectorSearch/Endpoints/AskEndpoints.cs | 2 +- SqlDatabaseVectorSearch/Models/QuestionResponse.cs | 10 ---------- SqlDatabaseVectorSearch/Models/Response.cs | 10 ++++++++++ .../Services/VectorSearchService.cs | 6 +++--- 5 files changed, 22 insertions(+), 17 deletions(-) delete mode 100644 SqlDatabaseVectorSearch/Models/QuestionResponse.cs create mode 100644 SqlDatabaseVectorSearch/Models/Response.cs diff --git a/SqlDatabaseVectorSearch/ContentDecoders/TextContentDecoder.cs b/SqlDatabaseVectorSearch/ContentDecoders/TextContentDecoder.cs index 3235b8d..d03e32f 100644 --- a/SqlDatabaseVectorSearch/ContentDecoders/TextContentDecoder.cs +++ b/SqlDatabaseVectorSearch/ContentDecoders/TextContentDecoder.cs @@ -1,12 +1,17 @@ -namespace SqlDatabaseVectorSearch.ContentDecoders; +using SqlDatabaseVectorSearch.TextChunkers; -public class TextContentDecoder : IContentDecoder +namespace SqlDatabaseVectorSearch.ContentDecoders; + +public class TextContentDecoder(IServiceProvider serviceProvider) : IContentDecoder { public async Task> DecodeAsync(Stream stream, string contentType, CancellationToken cancellationToken = default) { + var textChunker = serviceProvider.GetRequiredKeyedService(contentType); + using var readStream = new StreamReader(stream); var content = await readStream.ReadToEndAsync(cancellationToken); - return [new(1, 0, content)]; + var paragraphs = textChunker.Split(content); + return 
paragraphs.Select((text, index) => new Chunk(1, index, text)).ToList(); } } diff --git a/SqlDatabaseVectorSearch/Endpoints/AskEndpoints.cs b/SqlDatabaseVectorSearch/Endpoints/AskEndpoints.cs index 1976fe7..5b236ff 100644 --- a/SqlDatabaseVectorSearch/Endpoints/AskEndpoints.cs +++ b/SqlDatabaseVectorSearch/Endpoints/AskEndpoints.cs @@ -23,7 +23,7 @@ public class AskEndpoints : IEndpointRouteHandlerBuilder endpoints.MapPost("/api/ask-streaming", (Question question, VectorSearchService vectorSearchService, CancellationToken cancellationToken, [Description("If true, the question will be reformulated taking into account the context of the chat identified by the given ConversationId.")] bool reformulate = true) => { - async IAsyncEnumerable Stream() + async IAsyncEnumerable Stream() { // Requests a streaming response. var responseStream = vectorSearchService.AskStreamingAsync(question, reformulate, cancellationToken); diff --git a/SqlDatabaseVectorSearch/Models/QuestionResponse.cs b/SqlDatabaseVectorSearch/Models/QuestionResponse.cs deleted file mode 100644 index 958f9d1..0000000 --- a/SqlDatabaseVectorSearch/Models/QuestionResponse.cs +++ /dev/null @@ -1,10 +0,0 @@ -namespace SqlDatabaseVectorSearch.Models; - -// Question and Answer can be null when using response streaming. -public record class QuestionResponse(string? OriginalQuestion, string? ReformulatedQuestion, string? Answer, StreamState? StreamState = null, TokenUsageResponse? TokenUsage = null) -{ - public QuestionResponse(string? token, StreamState streamState, TokenUsageResponse? 
tokenUsageResponse = null) - : this(null, null, token, streamState, tokenUsageResponse) - { - } -} \ No newline at end of file diff --git a/SqlDatabaseVectorSearch/Models/Response.cs b/SqlDatabaseVectorSearch/Models/Response.cs new file mode 100644 index 0000000..54921ba --- /dev/null +++ b/SqlDatabaseVectorSearch/Models/Response.cs @@ -0,0 +1,10 @@ +namespace SqlDatabaseVectorSearch.Models; + +// Question and Answer can be null when using response streaming. +public record class Response(string? OriginalQuestion, string? ReformulatedQuestion, string? Answer, StreamState? StreamState = null, TokenUsageResponse? TokenUsage = null) +{ + public Response(string? token, StreamState streamState, TokenUsageResponse? tokenUsageResponse = null) + : this(null, null, token, streamState, tokenUsageResponse) + { + } +} \ No newline at end of file diff --git a/SqlDatabaseVectorSearch/Services/VectorSearchService.cs b/SqlDatabaseVectorSearch/Services/VectorSearchService.cs index c3c7e63..d2819ac 100644 --- a/SqlDatabaseVectorSearch/Services/VectorSearchService.cs +++ b/SqlDatabaseVectorSearch/Services/VectorSearchService.cs @@ -23,7 +23,7 @@ public class VectorSearchService(IServiceProvider serviceProvider, ApplicationDb var paragraphs = await decoder.DecodeAsync(stream, contentType, cancellationToken); // We get the token count of the whole document because it is the total number of token used by embedding (it may be necessary, for example, for cost analysis). 
- var tokenCount = tokenizerService.CountEmbeddingTokens(string.Join(string.Empty, paragraphs.Select(p => p.Content))); + var tokenCount = tokenizerService.CountEmbeddingTokens(string.Join(" ", paragraphs.Select(p => p.Content))); var strategy = dbContext.Database.CreateExecutionStrategy(); var document = await strategy.ExecuteAsync(async (cancellationToken) => @@ -59,7 +59,7 @@ public class VectorSearchService(IServiceProvider serviceProvider, ApplicationDb return new(document.Id, tokenCount); } - public async Task AskQuestionAsync(Question question, bool reformulate = true, CancellationToken cancellationToken = default) + public async Task AskQuestionAsync(Question question, bool reformulate = true, CancellationToken cancellationToken = default) { // It the user doesn't want to reforulate the question, CreateContextAsync returns the original one. var (reformulatedQuestion, embeddingTokenCount, chunks) = await CreateContextAsync(question, reformulate, cancellationToken); @@ -69,7 +69,7 @@ public class VectorSearchService(IServiceProvider serviceProvider, ApplicationDb return new(question.Text, reformulatedQuestion.Text!, answer, null, new(reformulatedQuestion.TokenUsage, embeddingTokenCount, tokenUsage)); } - public async IAsyncEnumerable AskStreamingAsync(Question question, bool reformulate = true, [EnumeratorCancellation] CancellationToken cancellationToken = default) + public async IAsyncEnumerable AskStreamingAsync(Question question, bool reformulate = true, [EnumeratorCancellation] CancellationToken cancellationToken = default) { // It the user doesn't want to reforulate the question, CreateContextAsync returns the original one. 
var (reformulatedQuestion, embeddingTokenCount, chunks) = await CreateContextAsync(question, reformulate, cancellationToken); From 0766103b9a3e373cee00aec3607f48547a46bc45 Mon Sep 17 00:00:00 2001 From: Marco Minerva Date: Wed, 4 Jun 2025 11:42:24 +0200 Subject: [PATCH 04/24] Refactor ChatService and VectorSearchService parameters Updated parameter types in ChatService and VectorSearchService from IEnumerable<string> to IEnumerable<Entities.DocumentChunk> for better structure. Enhanced citation formatting rules in ChatService. Increased MaxRelevantChunks and MaxInputTokens in appsettings.json to improve processing capabilities. --- .../Services/ChatService.cs | 53 ++++++++++++++++--- .../Services/VectorSearchService.cs | 5 +- SqlDatabaseVectorSearch/appsettings.json | 4 +- 3 files changed, 50 insertions(+), 12 deletions(-) diff --git a/SqlDatabaseVectorSearch/Services/ChatService.cs b/SqlDatabaseVectorSearch/Services/ChatService.cs index 049c08f..4e8d0fa 100644 --- a/SqlDatabaseVectorSearch/Services/ChatService.cs +++ b/SqlDatabaseVectorSearch/Services/ChatService.cs @@ -7,6 +7,7 @@ using Microsoft.SemanticKernel.Connectors.AzureOpenAI; using OpenAI.Chat; using SqlDatabaseVectorSearch.Models; using SqlDatabaseVectorSearch.Settings; +using Entities = SqlDatabaseVectorSearch.Data.Entities; namespace SqlDatabaseVectorSearch.Services; @@ -41,7 +42,7 @@ public class ChatService(IChatCompletionService chatCompletionService, Tokenizer return new(reformulatedQuestion.Content!, tokenUsage); } - public async Task AskQuestionAsync(Guid conversationId, IEnumerable chunks, string question, CancellationToken cancellationToken = default) + public async Task AskQuestionAsync(Guid conversationId, IEnumerable chunks, string question, CancellationToken cancellationToken = default) { var chat = CreateChatAsync(chunks, question); @@ -59,7 +60,7 @@ public class ChatService(IChatCompletionService chatCompletionService, Tokenizer return new(answer.Content!, tokenUsage); } - public async IAsyncEnumerable 
AskStreamingAsync(Guid conversationId, IEnumerable chunks, string question, [EnumeratorCancellation] CancellationToken cancellationToken = default) + public async IAsyncEnumerable AskStreamingAsync(Guid conversationId, IEnumerable chunks, string question, [EnumeratorCancellation] CancellationToken cancellationToken = default) { var chat = CreateChatAsync(chunks, question); @@ -110,19 +111,57 @@ public class ChatService(IChatCompletionService chatCompletionService, Tokenizer return null; } - private ChatHistory CreateChatAsync(IEnumerable chunks, string question) + private ChatHistory CreateChatAsync(IEnumerable chunks, string question) { var chat = new ChatHistory(""" You can use only the information provided in this chat to answer questions. If you don't know the answer, reply suggesting to refine the question. + For example, if the user asks "What is the capital of France?" and in this chat there isn't information about France, you should reply something like: - This information isn't available in the given context - I'm sorry, I don't know the answer to that question - I don't have that information - I don't know - Given the context, I can't answer that question - - I'my sorry, I don't have enough information to answer that question - Never answer to questions that are not related to this chat. - You must answer in the same language of the user's question. For example, it the user asks a question in English, the answer must be in English. + - I'm sorry, I don't have enough information to answer that question + + Never answer questions that are not related to this chat. + You must answer in the same language as the user's question. + + IMPORTANT - CITATION PLACEMENT AND LENGTH: + The quote in each MUST be MAXIMUM 5 words, taken word-for-word from the search result. If the quote is longer than 5 words, your answer is INVALID. + When you find an answer, you MUST place ALL citations ONLY at the very end of your response, never inside or between sentences. 
+ First provide your complete answer, then add a blank line, then list all citations. + + Use this XML format for citations: + exact quote here + + STRICT RULES for citations: + - Citations MUST NEVER appear inside, before, or between sentences of your answer. They MUST be grouped together ONLY at the end, after a blank line. + - If you include citations anywhere except at the end, your answer is WRONG and INVALID. + - Always include the citation(s) if there are results. If you don't know the answer, do NOT include citations. + - The quote must be max 5 words, taken word-for-word from the search result, and is the basis for why the citation is relevant. If the quote is longer than 5 words, your answer is INVALID. + - Do NOT refer to the presence of citations; just emit these tags right at the end, with no surrounding text. + - The citations must always be in a list at the end of the response, one after the other. Never add the citations between the actual response text or inside sentences. + - Do NOT add any text after the citations. + - ALWAYS leave a blank line between your answer and the first citation. + + Examples (CORRECT): + Here is my complete answer to your question. I'm providing all the information based on the context. + + Paris is the capital + largest city in France + + Examples (WRONG): + Here is my answer Paris is the capital of France and is known for the Eiffel Tower with more text. + Paris is the capital of France and is known for the Eiffel Tower Here is my answer. + Here is my answer. (without any citations when information is available) + Here is my answer. + Paris is the capital of France and is known for the Eiffel Tower More answer text. + + YOU MUST SEPARATE YOUR ANSWER FROM CITATIONS WITH A BLANK LINE. + NEVER INSERT CITATIONS WITHIN YOUR ANSWER TEXT. + CITATIONS MUST ONLY APPEAR AT THE END, AFTER A BLANK LINE. + IF YOU DO NOT FOLLOW THESE RULES, YOUR RESPONSE IS INVALID. 
"""); var prompt = new StringBuilder($""" @@ -141,7 +180,7 @@ public class ChatService(IChatCompletionService chatCompletionService, Tokenizer foreach (var chunk in chunks) { - var text = $"---{Environment.NewLine}{chunk}"; + var text = $"--- {chunk.Document.Name} (Document ID: {chunk.Document.Id} | Chunk ID: {chunk.Id}) {Environment.NewLine}{chunk.Content}{Environment.NewLine}"; var tokenCount = tokenizerService.CountChatCompletionTokens(text); if (tokenCount > availableTokens) diff --git a/SqlDatabaseVectorSearch/Services/VectorSearchService.cs b/SqlDatabaseVectorSearch/Services/VectorSearchService.cs index d2819ac..beda3cb 100644 --- a/SqlDatabaseVectorSearch/Services/VectorSearchService.cs +++ b/SqlDatabaseVectorSearch/Services/VectorSearchService.cs @@ -96,7 +96,7 @@ public class VectorSearchService(IServiceProvider serviceProvider, ApplicationDb } } - private async Task<(ChatResponse ReformulatedQuestion, int EmbeddingTokenCount, IEnumerable Chunks)> CreateContextAsync(Question question, bool reformulate, CancellationToken cancellationToken) + private async Task<(ChatResponse ReformulatedQuestion, int EmbeddingTokenCount, IEnumerable Chunks)> CreateContextAsync(Question question, bool reformulate, CancellationToken cancellationToken) { // Reformulate the question taking into account the context of the chat to perform keyword search and embeddings. var reformulatedQuestion = reformulate ? await chatService.CreateQuestionAsync(question.ConversationId, question.Text, cancellationToken) : new(question.Text); @@ -107,9 +107,8 @@ public class VectorSearchService(IServiceProvider serviceProvider, ApplicationDb // Perform Vector Search on SQL Database. 
var questionEmbedding = await embeddingGenerator.GenerateVectorAsync(reformulatedQuestion.Text!, cancellationToken: cancellationToken); - var chunks = await dbContext.DocumentChunks + var chunks = await dbContext.DocumentChunks.Include(c => c.Document) .OrderBy(c => EF.Functions.VectorDistance("cosine", c.Embedding, questionEmbedding.ToArray())) - .Select(c => c.Content) .Take(appSettings.MaxRelevantChunks) .ToListAsync(cancellationToken); diff --git a/SqlDatabaseVectorSearch/appsettings.json b/SqlDatabaseVectorSearch/appsettings.json index 52ff3cf..10a1967 100644 --- a/SqlDatabaseVectorSearch/appsettings.json +++ b/SqlDatabaseVectorSearch/appsettings.json @@ -23,8 +23,8 @@ "MaxTokensPerLine": 300, "MaxTokensPerParagraph": 1000, "OverlapTokens": 100, - "MaxRelevantChunks": 10, - "MaxInputTokens": 16384, + "MaxRelevantChunks": 50, + "MaxInputTokens": 32768, "MaxOutputTokens": 800, "MessageExpiration": "00:05:00", "MessageLimit": 20 From 1c24250a4212ebf005e60635ab80f95f05ae768a Mon Sep 17 00:00:00 2001 From: Marco Minerva Date: Wed, 4 Jun 2025 12:34:02 +0200 Subject: [PATCH 05/24] Add citation handling and styling in Ask.razor Updated Ask.razor to include regex for citation extraction and display. Introduced a new method to extract citations and updated the Message class to store them. Added a Citation class for individual citation representation. Enhanced app.css with styles for citation display. 
--- .../Components/Pages/Ask.razor | 67 +++++++++++++++++++ SqlDatabaseVectorSearch/wwwroot/css/app.css | 6 ++ 2 files changed, 73 insertions(+) diff --git a/SqlDatabaseVectorSearch/Components/Pages/Ask.razor b/SqlDatabaseVectorSearch/Components/Pages/Ask.razor index 9e07b45..60a2af1 100644 --- a/SqlDatabaseVectorSearch/Components/Pages/Ask.razor +++ b/SqlDatabaseVectorSearch/Components/Pages/Ask.razor @@ -1,4 +1,5 @@ @page "/ask" +@using System.Text.RegularExpressions @inject IServiceProvider ServiceProvider @inject IJSRuntime JSRuntime @@ -72,6 +73,23 @@ + @if (message.Citations is not null && message.Citations.Count() > 0) + { +
+ @foreach (var citation in message.Citations) + { +
+
+ @citation.FileName @if (!string.IsNullOrEmpty(citation.PageNumber)) + { + pag. @citation.PageNumber + } +
+
@citation.Quote
+
+ } +
+ } } } @@ -178,10 +196,17 @@ } else if (delta.StreamState == StreamState.Append) { + // Adds tokens to the assistant message as they are received assistantMessage.Text += delta.Answer; } else if (delta.StreamState == StreamState.End) { + // Extracts citations, if any. + var (cleanText, citations) = ExtractCitations(assistantMessage.Text); + + assistantMessage.Text = cleanText; + assistantMessage.Citations = citations; + assistantMessage.IsCompleted = true; assistantMessage.TokenUsage += FormatTokenUsage(delta.TokenUsage); } @@ -269,6 +294,36 @@ await JSRuntime.InvokeVoidAsync("scrollTo", chat); } + private static (string, IEnumerable) ExtractCitations(string? text) + { + var citations = new List(); + + if (string.IsNullOrEmpty(text)) + { + return (text ?? string.Empty, citations); + } + + var pattern = "(.*?)<\\/citation>"; + + var matches = Regex.Matches(text, pattern, RegexOptions.Singleline); + foreach (Match match in matches) + { + if (match.Success && match.Groups.Count == 4) + { + citations.Add(new Citation + { + FileName = match.Groups[1].Value, + PageNumber = match.Groups[2].Value, + Quote = match.Groups[3].Value + }); + } + } + + // Remove all tags from the text + var cleanText = Regex.Replace(text, pattern, string.Empty, RegexOptions.Singleline).TrimEnd(); + return (cleanText, citations); + } + public class Message { public string? Text { get; set; } @@ -278,5 +333,17 @@ public bool IsCompleted { get; set; } public string? TokenUsage { get; set; } + + // List of citations extracted from the answer + public IEnumerable? 
Citations { get; set; } + } + + public class Citation + { + public string FileName { get; set; } = null!; + + public string Quote { get; set; } = null!; + + public string PageNumber { get; set; } = null!; } } \ No newline at end of file diff --git a/SqlDatabaseVectorSearch/wwwroot/css/app.css b/SqlDatabaseVectorSearch/wwwroot/css/app.css index 1f3f172..3d4dd04 100644 --- a/SqlDatabaseVectorSearch/wwwroot/css/app.css +++ b/SqlDatabaseVectorSearch/wwwroot/css/app.css @@ -62,3 +62,9 @@ h1:focus { .blazor-error-boundary::after { content: "An error has occurred." } + +.citation-box { + width: fit-content; + max-width: 100%; + background-color: #f8f9fa; +} From aae42a1658ce924e594b1b7488a17e8e87417241 Mon Sep 17 00:00:00 2001 From: Marco Minerva Date: Wed, 4 Jun 2025 12:48:31 +0200 Subject: [PATCH 06/24] Enhance .editorconfig with new rules and adjustments - Added suggestion for collection expressions with loose type matches. - Introduced diagnostic rule IDE0305 for simplified collection initialization. 
--- .editorconfig | 36 ++++++++++++++++++++---------------- 1 file changed, 20 insertions(+), 16 deletions(-) diff --git a/.editorconfig b/.editorconfig index 263f5c7..f6b7d3a 100644 --- a/.editorconfig +++ b/.editorconfig @@ -22,6 +22,7 @@ dotnet_style_operator_placement_when_wrapping = beginning_of_line dotnet_style_object_initializer = true:suggestion dotnet_style_coalesce_expression = true:suggestion dotnet_style_collection_initializer = true:suggestion +dotnet_style_prefer_collection_expression = when_types_loosely_match:suggestion dotnet_style_prefer_simplified_boolean_expressions = true:suggestion dotnet_style_prefer_conditional_expression_over_assignment = false:silent dotnet_style_prefer_conditional_expression_over_return = false:silent @@ -244,23 +245,23 @@ dotnet_naming_rule.async_method_should_be_ends_with_async.style = ends_with_asyn dotnet_naming_symbols.interface.applicable_kinds = interface dotnet_naming_symbols.interface.applicable_accessibilities = public, internal, private, protected, protected_internal, private_protected -dotnet_naming_symbols.interface.required_modifiers = +dotnet_naming_symbols.interface.required_modifiers = dotnet_naming_symbols.method.applicable_kinds = method dotnet_naming_symbols.method.applicable_accessibilities = public -dotnet_naming_symbols.method.required_modifiers = +dotnet_naming_symbols.method.required_modifiers = dotnet_naming_symbols.private_or_internal_field.applicable_kinds = field dotnet_naming_symbols.private_or_internal_field.applicable_accessibilities = internal, private, private_protected -dotnet_naming_symbols.private_or_internal_field.required_modifiers = +dotnet_naming_symbols.private_or_internal_field.required_modifiers = dotnet_naming_symbols.types.applicable_kinds = class, struct, interface, enum dotnet_naming_symbols.types.applicable_accessibilities = public, internal, private, protected, protected_internal, private_protected -dotnet_naming_symbols.types.required_modifiers = 
+dotnet_naming_symbols.types.required_modifiers = dotnet_naming_symbols.non_field_members.applicable_kinds = property, event, method dotnet_naming_symbols.non_field_members.applicable_accessibilities = public, internal, private, protected, protected_internal, private_protected -dotnet_naming_symbols.non_field_members.required_modifiers = +dotnet_naming_symbols.non_field_members.required_modifiers = dotnet_naming_symbols.async_method.applicable_kinds = method dotnet_naming_symbols.async_method.applicable_accessibilities = * @@ -268,24 +269,24 @@ dotnet_naming_symbols.async_method.required_modifiers = async # Naming styles -dotnet_naming_style.pascal_case.required_prefix = -dotnet_naming_style.pascal_case.required_suffix = -dotnet_naming_style.pascal_case.word_separator = +dotnet_naming_style.pascal_case.required_prefix = +dotnet_naming_style.pascal_case.required_suffix = +dotnet_naming_style.pascal_case.word_separator = dotnet_naming_style.pascal_case.capitalization = pascal_case dotnet_naming_style.begins_with_i.required_prefix = I -dotnet_naming_style.begins_with_i.required_suffix = -dotnet_naming_style.begins_with_i.word_separator = +dotnet_naming_style.begins_with_i.required_suffix = +dotnet_naming_style.begins_with_i.word_separator = dotnet_naming_style.begins_with_i.capitalization = pascal_case -dotnet_naming_style.camel_case.required_prefix = -dotnet_naming_style.camel_case.required_suffix = -dotnet_naming_style.camel_case.word_separator = +dotnet_naming_style.camel_case.required_prefix = +dotnet_naming_style.camel_case.required_suffix = +dotnet_naming_style.camel_case.word_separator = dotnet_naming_style.camel_case.capitalization = camel_case -dotnet_naming_style.ends_with_async.required_prefix = +dotnet_naming_style.ends_with_async.required_prefix = dotnet_naming_style.ends_with_async.required_suffix = Async -dotnet_naming_style.ends_with_async.word_separator = +dotnet_naming_style.ends_with_async.word_separator = 
dotnet_naming_style.ends_with_async.capitalization = pascal_case # IDE0058: Expression value is never used @@ -295,4 +296,7 @@ dotnet_diagnostic.IDE0058.severity = none dotnet_diagnostic.IDE0010.severity = none # IDE0072: Add missing cases -dotnet_diagnostic.IDE0072.severity = none \ No newline at end of file +dotnet_diagnostic.IDE0072.severity = none + +# IDE0305: Simplify collection initialization +dotnet_diagnostic.IDE0305.severity = none \ No newline at end of file From 9f5bd02f78bd3a30be775c08d70c6e8ed25ebd34 Mon Sep 17 00:00:00 2001 From: Marco Minerva Date: Thu, 5 Jun 2025 11:48:18 +0200 Subject: [PATCH 07/24] Refactor citation handling in Ask.razor and ChatService.cs Updated the assistant message construction in `Ask.razor` to manage citations more effectively by introducing a `RawText` property and a new `RemoveCitations` method. The `ExtractCitations` method now processes raw input for citation extraction. Removed outdated comments in `ChatService.cs` regarding citation formatting rules, indicating a potential shift in how citation handling is enforced. --- .../Components/Pages/Ask.razor | 28 +++++++++++++++---- .../Services/ChatService.cs | 18 ------------ 2 files changed, 23 insertions(+), 23 deletions(-) diff --git a/SqlDatabaseVectorSearch/Components/Pages/Ask.razor b/SqlDatabaseVectorSearch/Components/Pages/Ask.razor index 60a2af1..dbc459e 100644 --- a/SqlDatabaseVectorSearch/Components/Pages/Ask.razor +++ b/SqlDatabaseVectorSearch/Components/Pages/Ask.razor @@ -196,15 +196,16 @@ } else if (delta.StreamState == StreamState.Append) { - // Adds tokens to the assistant message as they are received - assistantMessage.Text += delta.Answer; + // Adds tokens to the assistant message as they are received. + assistantMessage.RawText += delta.Answer; + + // Updates the Text property to remove citations, if any. + assistantMessage.Text = RemoveCitations(assistantMessage.RawText); } else if (delta.StreamState == StreamState.End) { // Extracts citations, if any. 
- var (cleanText, citations) = ExtractCitations(assistantMessage.Text); - - assistantMessage.Text = cleanText; + var (_, citations) = ExtractCitations(assistantMessage.RawText); assistantMessage.Citations = citations; assistantMessage.IsCompleted = true; @@ -294,6 +295,16 @@ await JSRuntime.InvokeVoidAsync("scrollTo", chat); } + private static string RemoveCitations(string? text) + { + if (string.IsNullOrEmpty(text)) + { + return string.Empty; + } + + return (text.AsSpan().IndexOf("= 0 ? text[..index] : text).TrimEnd(); + } + private static (string, IEnumerable) ExtractCitations(string? text) { var citations = new List(); @@ -326,6 +337,13 @@ public class Message { + private string? rawText; + public string? RawText + { + get => rawText ?? Text; + set => rawText = value; + } + public string? Text { get; set; } public required string Role { get; set; } diff --git a/SqlDatabaseVectorSearch/Services/ChatService.cs b/SqlDatabaseVectorSearch/Services/ChatService.cs index 4e8d0fa..77686cb 100644 --- a/SqlDatabaseVectorSearch/Services/ChatService.cs +++ b/SqlDatabaseVectorSearch/Services/ChatService.cs @@ -144,24 +144,6 @@ public class ChatService(IChatCompletionService chatCompletionService, Tokenizer - The citations must always be in a list at the end of the response, one after the other. Never add the citations between the actual response text or inside sentences. - Do NOT add any text after the citations. - ALWAYS leave a blank line between your answer and the first citation. - - Examples (CORRECT): - Here is my complete answer to your question. I'm providing all the information based on the context. - - Paris is the capital - largest city in France - - Examples (WRONG): - Here is my answer Paris is the capital of France and is known for the Eiffel Tower with more text. - Paris is the capital of France and is known for the Eiffel Tower Here is my answer. - Here is my answer. (without any citations when information is available) - Here is my answer. 
- Paris is the capital of France and is known for the Eiffel Tower More answer text. - - YOU MUST SEPARATE YOUR ANSWER FROM CITATIONS WITH A BLANK LINE. - NEVER INSERT CITATIONS WITHIN YOUR ANSWER TEXT. - CITATIONS MUST ONLY APPEAR AT THE END, AFTER A BLANK LINE. - IF YOU DO NOT FOLLOW THESE RULES, YOUR RESPONSE IS INVALID. """); var prompt = new StringBuilder($""" From 5530a84d82e96ef070fb8c21e4eda1e1f717cbc3 Mon Sep 17 00:00:00 2001 From: Marco Minerva Date: Thu, 5 Jun 2025 16:25:16 +0200 Subject: [PATCH 08/24] Update citation handling and formatting in Ask.razor Refactor regex pattern in `Ask.razor` to capture `document-id` and `chunk-id`. Update `Citation` class to include new properties and make `PageNumber` nullable. Adjust citation addition logic and citation format rules. Modify chunk text formatting in `ChatService.cs` to include page number. --- .../Components/Pages/Ask.razor | 18 ++++++++++++------ .../Services/ChatService.cs | 4 ++-- 2 files changed, 14 insertions(+), 8 deletions(-) diff --git a/SqlDatabaseVectorSearch/Components/Pages/Ask.razor b/SqlDatabaseVectorSearch/Components/Pages/Ask.razor index dbc459e..297ed39 100644 --- a/SqlDatabaseVectorSearch/Components/Pages/Ask.razor +++ b/SqlDatabaseVectorSearch/Components/Pages/Ask.razor @@ -314,18 +314,20 @@ return (text ?? 
string.Empty, citations); } - var pattern = "(.*?)<\\/citation>"; + var pattern = @"[^']*)'\s+chunk-id='(?[^']*)'\s+filename='(?[^']*)'\s+page-number='(?[^']*)'>\s*(?.*?)\s*"; var matches = Regex.Matches(text, pattern, RegexOptions.Singleline); foreach (Match match in matches) { - if (match.Success && match.Groups.Count == 4) + if (match.Success) { citations.Add(new Citation { - FileName = match.Groups[1].Value, - PageNumber = match.Groups[2].Value, - Quote = match.Groups[3].Value + DocumentId = Guid.Parse(match.Groups["documentId"].Value), + ChunkId = Guid.Parse(match.Groups["chunkId"].Value), + FileName = match.Groups["filename"].Value, + PageNumber = match.Groups["pageNumber"].Value, + Quote = match.Groups["quote"].Value }); } } @@ -358,10 +360,14 @@ public class Citation { + public Guid DocumentId { get; set; } + + public Guid ChunkId { get; set; } + public string FileName { get; set; } = null!; public string Quote { get; set; } = null!; - public string PageNumber { get; set; } = null!; + public string? PageNumber { get; set; } } } \ No newline at end of file diff --git a/SqlDatabaseVectorSearch/Services/ChatService.cs b/SqlDatabaseVectorSearch/Services/ChatService.cs index 77686cb..cc19d8a 100644 --- a/SqlDatabaseVectorSearch/Services/ChatService.cs +++ b/SqlDatabaseVectorSearch/Services/ChatService.cs @@ -133,7 +133,7 @@ public class ChatService(IChatCompletionService chatCompletionService, Tokenizer First provide your complete answer, then add a blank line, then list all citations. Use this XML format for citations: - exact quote here + exact quote here STRICT RULES for citations: - Citations MUST NEVER appear inside, before, or between sentences of your answer. They MUST be grouped together ONLY at the end, after a blank line. 
@@ -162,7 +162,7 @@ public class ChatService(IChatCompletionService chatCompletionService, Tokenizer foreach (var chunk in chunks) { - var text = $"--- {chunk.Document.Name} (Document ID: {chunk.Document.Id} | Chunk ID: {chunk.Id}) {Environment.NewLine}{chunk.Content}{Environment.NewLine}"; + var text = $"--- {chunk.Document.Name} (Document ID: {chunk.Document.Id} | Chunk ID: {chunk.Id} | Page Number: 1) {Environment.NewLine}{chunk.Content}{Environment.NewLine}"; var tokenCount = tokenizerService.CountChatCompletionTokens(text); if (tokenCount > availableTokens) From dc6bbfde910915d573cfaea2acca7d9c904250ad Mon Sep 17 00:00:00 2001 From: Marco Minerva Date: Fri, 6 Jun 2025 10:50:03 +0200 Subject: [PATCH 09/24] Update content decoding and validation logic - Changed `PageNumber` in `Chunk` to nullable `int?` in `IContentDecoder` and updated related logic in `TextContentDecoder`. - Revised citation rules in `ChatService` for stricter placement and formatting. - Introduced `QuestionValidator` class with validation rules for `Question` model's `Text` property. --- .../ContentDecoders/IContentDecoder.cs | 2 +- .../ContentDecoders/TextContentDecoder.cs | 2 +- SqlDatabaseVectorSearch/Services/ChatService.cs | 13 +------------ .../QuestionValidator.cs | 2 +- 4 files changed, 4 insertions(+), 15 deletions(-) rename SqlDatabaseVectorSearch/{Validators => Validations}/QuestionValidator.cs (84%) diff --git a/SqlDatabaseVectorSearch/ContentDecoders/IContentDecoder.cs b/SqlDatabaseVectorSearch/ContentDecoders/IContentDecoder.cs index 4fc8293..7cce285 100644 --- a/SqlDatabaseVectorSearch/ContentDecoders/IContentDecoder.cs +++ b/SqlDatabaseVectorSearch/ContentDecoders/IContentDecoder.cs @@ -5,4 +5,4 @@ public interface IContentDecoder Task> DecodeAsync(Stream stream, string contentType, CancellationToken cancellationToken = default); } -public record class Chunk(int PageNumber, int IndexOnPage, string Content); \ No newline at end of file +public record class Chunk(int? 
PageNumber, int IndexOnPage, string Content); \ No newline at end of file diff --git a/SqlDatabaseVectorSearch/ContentDecoders/TextContentDecoder.cs b/SqlDatabaseVectorSearch/ContentDecoders/TextContentDecoder.cs index d03e32f..29e76f1 100644 --- a/SqlDatabaseVectorSearch/ContentDecoders/TextContentDecoder.cs +++ b/SqlDatabaseVectorSearch/ContentDecoders/TextContentDecoder.cs @@ -12,6 +12,6 @@ public class TextContentDecoder(IServiceProvider serviceProvider) : IContentDeco var content = await readStream.ReadToEndAsync(cancellationToken); var paragraphs = textChunker.Split(content); - return paragraphs.Select((text, index) => new Chunk(1, index, text)).ToList(); + return paragraphs.Select((text, index) => new Chunk(null, index, text)).ToList(); } } diff --git a/SqlDatabaseVectorSearch/Services/ChatService.cs b/SqlDatabaseVectorSearch/Services/ChatService.cs index cc19d8a..4269efa 100644 --- a/SqlDatabaseVectorSearch/Services/ChatService.cs +++ b/SqlDatabaseVectorSearch/Services/ChatService.cs @@ -127,23 +127,12 @@ public class ChatService(IChatCompletionService chatCompletionService, Tokenizer Never answer questions that are not related to this chat. You must answer in the same language as the user's question. - IMPORTANT - CITATION PLACEMENT AND LENGTH: The quote in each MUST be MAXIMUM 5 words, taken word-for-word from the search result. If the quote is longer than 5 words, your answer is INVALID. When you find an answer, you MUST place ALL citations ONLY at the very end of your response, never inside or between sentences. - First provide your complete answer, then add a blank line, then list all citations. + First provide your complete answer, then list all citations. Use this XML format for citations: exact quote here - - STRICT RULES for citations: - - Citations MUST NEVER appear inside, before, or between sentences of your answer. They MUST be grouped together ONLY at the end, after a blank line. 
- - If you include citations anywhere except at the end, your answer is WRONG and INVALID. - - Always include the citation(s) if there are results. If you don't know the answer, do NOT include citations. - - The quote must be max 5 words, taken word-for-word from the search result, and is the basis for why the citation is relevant. If the quote is longer than 5 words, your answer is INVALID. - - Do NOT refer to the presence of citations; just emit these tags right at the end, with no surrounding text. - - The citations must always be in a list at the end of the response, one after the other. Never add the citations between the actual response text or inside sentences. - - Do NOT add any text after the citations. - - ALWAYS leave a blank line between your answer and the first citation. """); var prompt = new StringBuilder($""" diff --git a/SqlDatabaseVectorSearch/Validators/QuestionValidator.cs b/SqlDatabaseVectorSearch/Validations/QuestionValidator.cs similarity index 84% rename from SqlDatabaseVectorSearch/Validators/QuestionValidator.cs rename to SqlDatabaseVectorSearch/Validations/QuestionValidator.cs index 538f654..c0fa740 100644 --- a/SqlDatabaseVectorSearch/Validators/QuestionValidator.cs +++ b/SqlDatabaseVectorSearch/Validations/QuestionValidator.cs @@ -1,7 +1,7 @@ using FluentValidation; using SqlDatabaseVectorSearch.Models; -namespace SqlDatabaseVectorSearch.Validators; +namespace SqlDatabaseVectorSearch.Validations; public class QuestionValidator : AbstractValidator { From cdf8356e11333868aaf8702b8ecbfeac48e22e1b Mon Sep 17 00:00:00 2001 From: Marco Minerva Date: Fri, 6 Jun 2025 11:26:27 +0200 Subject: [PATCH 10/24] Enhance citation handling and document chunk structure - Updated `Ask.razor` to change `PageNumber` to a nullable integer and added `IndexOnPage` to the `Citation` class. Adjusted regex for citation parsing. - Introduced `PageNumber` and `IndexOnPage` properties in `DocumentChunk.cs`, marking `Content` as required. 
- Modified migration files to reflect changes in `DocumentChunk` and `Document` entities. - Updated citation format in `ChatService.cs` to include `index-on-page` and adjusted document chunk text formatting. - Changed embedding generation method in `VectorSearchService.cs` and updated document chunk creation to include new properties. --- .../Components/Pages/Ask.razor | 11 ++++++---- .../Data/Entities/DocumentChunk.cs | 4 ++++ .../00000000000000_Initial.Designer.cs | 22 ++++++++++++------- .../Data/Migrations/00000000000000_Initial.cs | 4 +++- .../ApplicationDbContextModelSnapshot.cs | 20 +++++++++++------ .../Services/ChatService.cs | 4 ++-- .../Services/VectorSearchService.cs | 16 +++++++++++--- 7 files changed, 56 insertions(+), 25 deletions(-) diff --git a/SqlDatabaseVectorSearch/Components/Pages/Ask.razor b/SqlDatabaseVectorSearch/Components/Pages/Ask.razor index 297ed39..4adc676 100644 --- a/SqlDatabaseVectorSearch/Components/Pages/Ask.razor +++ b/SqlDatabaseVectorSearch/Components/Pages/Ask.razor @@ -80,7 +80,7 @@ {
- @citation.FileName @if (!string.IsNullOrEmpty(citation.PageNumber)) + @citation.FileName @if (citation.PageNumber.GetValueOrDefault() > 0) { pag. @citation.PageNumber } @@ -314,7 +314,7 @@ return (text ?? string.Empty, citations); } - var pattern = @"[^']*)'\s+chunk-id='(?[^']*)'\s+filename='(?[^']*)'\s+page-number='(?[^']*)'>\s*(?.*?)\s*"; + var pattern = @"[^']*)'\s+chunk-id='(?[^']*)'\s+filename='(?[^']*)'\s+page-number='(?[^']*)'\s+index-on-page='(?[^']*)'>\s*(?.*?)\s*"; var matches = Regex.Matches(text, pattern, RegexOptions.Singleline); foreach (Match match in matches) @@ -326,7 +326,8 @@ DocumentId = Guid.Parse(match.Groups["documentId"].Value), ChunkId = Guid.Parse(match.Groups["chunkId"].Value), FileName = match.Groups["filename"].Value, - PageNumber = match.Groups["pageNumber"].Value, + PageNumber = int.TryParse(match.Groups["pageNumber"].Value, out var pageNumber) && pageNumber > 0 ? pageNumber : null, + IndexOnPage = int.TryParse(match.Groups["indexOnPage"].Value, out var indexOnPage) ? indexOnPage : 0, Quote = match.Groups["quote"].Value }); } @@ -368,6 +369,8 @@ public string Quote { get; set; } = null!; - public string? PageNumber { get; set; } + public int? PageNumber { get; set; } + + public int IndexOnPage { get; set; } } } \ No newline at end of file diff --git a/SqlDatabaseVectorSearch/Data/Entities/DocumentChunk.cs b/SqlDatabaseVectorSearch/Data/Entities/DocumentChunk.cs index 580de9e..bc67963 100644 --- a/SqlDatabaseVectorSearch/Data/Entities/DocumentChunk.cs +++ b/SqlDatabaseVectorSearch/Data/Entities/DocumentChunk.cs @@ -8,6 +8,10 @@ public class DocumentChunk public int Index { get; set; } + public int? 
PageNumber { get; set; } + + public int IndexOnPage { get; set; } + public required string Content { get; set; } public required float[] Embedding { get; set; } diff --git a/SqlDatabaseVectorSearch/Data/Migrations/00000000000000_Initial.Designer.cs b/SqlDatabaseVectorSearch/Data/Migrations/00000000000000_Initial.Designer.cs index 5789fb8..c071cf1 100644 --- a/SqlDatabaseVectorSearch/Data/Migrations/00000000000000_Initial.Designer.cs +++ b/SqlDatabaseVectorSearch/Data/Migrations/00000000000000_Initial.Designer.cs @@ -9,10 +9,10 @@ using SqlDatabaseVectorSearch.Data; #nullable disable -namespace SqlDatabaseVectorSearch.Data.Migrations +namespace SqlDatabaseVectorSearch.Migrations { [DbContext(typeof(ApplicationDbContext))] - [Migration("20250224102351_Initial")] + [Migration("20250606091336_Initial")] partial class Initial { /// @@ -20,12 +20,12 @@ namespace SqlDatabaseVectorSearch.Data.Migrations { #pragma warning disable 612, 618 modelBuilder - .HasAnnotation("ProductVersion", "9.0.2") + .HasAnnotation("ProductVersion", "9.0.5") .HasAnnotation("Relational:MaxIdentifierLength", 128); SqlServerModelBuilderExtensions.UseIdentityColumns(modelBuilder); - modelBuilder.Entity("SqlDatabaseVectorSearch.DataAccessLayer.Entities.Document", b => + modelBuilder.Entity("SqlDatabaseVectorSearch.Data.Entities.Document", b => { b.Property("Id") .ValueGeneratedOnAdd() @@ -44,7 +44,7 @@ namespace SqlDatabaseVectorSearch.Data.Migrations b.ToTable("Documents", (string)null); }); - modelBuilder.Entity("SqlDatabaseVectorSearch.DataAccessLayer.Entities.DocumentChunk", b => + modelBuilder.Entity("SqlDatabaseVectorSearch.Data.Entities.DocumentChunk", b => { b.Property("Id") .ValueGeneratedOnAdd() @@ -64,6 +64,12 @@ namespace SqlDatabaseVectorSearch.Data.Migrations b.Property("Index") .HasColumnType("int"); + b.Property("IndexOnPage") + .HasColumnType("int"); + + b.Property("PageNumber") + .HasColumnType("int"); + b.HasKey("Id"); b.HasIndex("DocumentId"); @@ -71,9 +77,9 @@ namespace 
SqlDatabaseVectorSearch.Data.Migrations b.ToTable("DocumentChunks", (string)null); }); - modelBuilder.Entity("SqlDatabaseVectorSearch.DataAccessLayer.Entities.DocumentChunk", b => + modelBuilder.Entity("SqlDatabaseVectorSearch.Data.Entities.DocumentChunk", b => { - b.HasOne("SqlDatabaseVectorSearch.DataAccessLayer.Entities.Document", "Document") + b.HasOne("SqlDatabaseVectorSearch.Data.Entities.Document", "Document") .WithMany("Chunks") .HasForeignKey("DocumentId") .OnDelete(DeleteBehavior.Cascade) @@ -83,7 +89,7 @@ namespace SqlDatabaseVectorSearch.Data.Migrations b.Navigation("Document"); }); - modelBuilder.Entity("SqlDatabaseVectorSearch.DataAccessLayer.Entities.Document", b => + modelBuilder.Entity("SqlDatabaseVectorSearch.Data.Entities.Document", b => { b.Navigation("Chunks"); }); diff --git a/SqlDatabaseVectorSearch/Data/Migrations/00000000000000_Initial.cs b/SqlDatabaseVectorSearch/Data/Migrations/00000000000000_Initial.cs index 2a530b0..590cad4 100644 --- a/SqlDatabaseVectorSearch/Data/Migrations/00000000000000_Initial.cs +++ b/SqlDatabaseVectorSearch/Data/Migrations/00000000000000_Initial.cs @@ -3,7 +3,7 @@ using Microsoft.EntityFrameworkCore.Migrations; #nullable disable -namespace SqlDatabaseVectorSearch.Data.Migrations +namespace SqlDatabaseVectorSearch.Migrations { /// public partial class Initial : Migration @@ -31,6 +31,8 @@ namespace SqlDatabaseVectorSearch.Data.Migrations Id = table.Column(type: "uniqueidentifier", nullable: false), DocumentId = table.Column(type: "uniqueidentifier", nullable: false), Index = table.Column(type: "int", nullable: false), + PageNumber = table.Column(type: "int", nullable: true), + IndexOnPage = table.Column(type: "int", nullable: false), Content = table.Column(type: "nvarchar(max)", nullable: false), Embedding = table.Column(type: "vector(1536)", nullable: false) }, diff --git a/SqlDatabaseVectorSearch/Data/Migrations/ApplicationDbContextModelSnapshot.cs 
b/SqlDatabaseVectorSearch/Data/Migrations/ApplicationDbContextModelSnapshot.cs index 6bb2ad9..aeb0666 100644 --- a/SqlDatabaseVectorSearch/Data/Migrations/ApplicationDbContextModelSnapshot.cs +++ b/SqlDatabaseVectorSearch/Data/Migrations/ApplicationDbContextModelSnapshot.cs @@ -8,7 +8,7 @@ using SqlDatabaseVectorSearch.Data; #nullable disable -namespace SqlDatabaseVectorSearch.Data.Migrations +namespace SqlDatabaseVectorSearch.Migrations { [DbContext(typeof(ApplicationDbContext))] partial class ApplicationDbContextModelSnapshot : ModelSnapshot @@ -17,12 +17,12 @@ namespace SqlDatabaseVectorSearch.Data.Migrations { #pragma warning disable 612, 618 modelBuilder - .HasAnnotation("ProductVersion", "9.0.2") + .HasAnnotation("ProductVersion", "9.0.5") .HasAnnotation("Relational:MaxIdentifierLength", 128); SqlServerModelBuilderExtensions.UseIdentityColumns(modelBuilder); - modelBuilder.Entity("SqlDatabaseVectorSearch.DataAccessLayer.Entities.Document", b => + modelBuilder.Entity("SqlDatabaseVectorSearch.Data.Entities.Document", b => { b.Property("Id") .ValueGeneratedOnAdd() @@ -41,7 +41,7 @@ namespace SqlDatabaseVectorSearch.Data.Migrations b.ToTable("Documents", (string)null); }); - modelBuilder.Entity("SqlDatabaseVectorSearch.DataAccessLayer.Entities.DocumentChunk", b => + modelBuilder.Entity("SqlDatabaseVectorSearch.Data.Entities.DocumentChunk", b => { b.Property("Id") .ValueGeneratedOnAdd() @@ -61,6 +61,12 @@ namespace SqlDatabaseVectorSearch.Data.Migrations b.Property("Index") .HasColumnType("int"); + b.Property("IndexOnPage") + .HasColumnType("int"); + + b.Property("PageNumber") + .HasColumnType("int"); + b.HasKey("Id"); b.HasIndex("DocumentId"); @@ -68,9 +74,9 @@ namespace SqlDatabaseVectorSearch.Data.Migrations b.ToTable("DocumentChunks", (string)null); }); - modelBuilder.Entity("SqlDatabaseVectorSearch.DataAccessLayer.Entities.DocumentChunk", b => + modelBuilder.Entity("SqlDatabaseVectorSearch.Data.Entities.DocumentChunk", b => { - 
b.HasOne("SqlDatabaseVectorSearch.DataAccessLayer.Entities.Document", "Document") + b.HasOne("SqlDatabaseVectorSearch.Data.Entities.Document", "Document") .WithMany("Chunks") .HasForeignKey("DocumentId") .OnDelete(DeleteBehavior.Cascade) @@ -80,7 +86,7 @@ namespace SqlDatabaseVectorSearch.Data.Migrations b.Navigation("Document"); }); - modelBuilder.Entity("SqlDatabaseVectorSearch.DataAccessLayer.Entities.Document", b => + modelBuilder.Entity("SqlDatabaseVectorSearch.Data.Entities.Document", b => { b.Navigation("Chunks"); }); diff --git a/SqlDatabaseVectorSearch/Services/ChatService.cs b/SqlDatabaseVectorSearch/Services/ChatService.cs index 4269efa..2bc3e51 100644 --- a/SqlDatabaseVectorSearch/Services/ChatService.cs +++ b/SqlDatabaseVectorSearch/Services/ChatService.cs @@ -132,7 +132,7 @@ public class ChatService(IChatCompletionService chatCompletionService, Tokenizer First provide your complete answer, then list all citations. Use this XML format for citations: - exact quote here + exact quote here """); var prompt = new StringBuilder($""" @@ -151,7 +151,7 @@ public class ChatService(IChatCompletionService chatCompletionService, Tokenizer foreach (var chunk in chunks) { - var text = $"--- {chunk.Document.Name} (Document ID: {chunk.Document.Id} | Chunk ID: {chunk.Id} | Page Number: 1) {Environment.NewLine}{chunk.Content}{Environment.NewLine}"; + var text = $"--- {chunk.Document.Name} (Document ID: {chunk.Document.Id} | Chunk ID: {chunk.Id} | Page Number: {chunk.PageNumber} | Index on Page: {chunk.IndexOnPage}) {Environment.NewLine}{chunk.Content}{Environment.NewLine}"; var tokenCount = tokenizerService.CountChatCompletionTokens(text); if (tokenCount > availableTokens) diff --git a/SqlDatabaseVectorSearch/Services/VectorSearchService.cs b/SqlDatabaseVectorSearch/Services/VectorSearchService.cs index beda3cb..2663c39 100644 --- a/SqlDatabaseVectorSearch/Services/VectorSearchService.cs +++ b/SqlDatabaseVectorSearch/Services/VectorSearchService.cs @@ -39,14 +39,24 @@ 
public class VectorSearchService(IServiceProvider serviceProvider, ApplicationDb var document = new Entities.Document { Id = documentId.GetValueOrDefault(), Name = name, CreationDate = timeProvider.GetUtcNow() }; dbContext.Documents.Add(document); - var embeddings = await embeddingGenerator.GenerateAndZipAsync(paragraphs.Select(p => p.Content), cancellationToken: cancellationToken); + var embeddings = await embeddingGenerator.GenerateAsync(paragraphs.Select(p => p.Content), cancellationToken: cancellationToken); // Save the document chunks and the corresponding embedding in the database. foreach (var (index, embedding) in embeddings.Index()) { - logger.LogDebug("Storing a paragraph of {TokenCount} tokens.", tokenizerService.CountChatCompletionTokens(embedding.Value)); + var paragraph = paragraphs.ElementAt(index); + logger.LogDebug("Storing a paragraph of {TokenCount} tokens.", tokenizerService.CountChatCompletionTokens(paragraph.Content)); + + var documentChunk = new Entities.DocumentChunk + { + Document = document, + Index = index, + PageNumber = paragraph.PageNumber, + IndexOnPage = paragraph.IndexOnPage, + Content = paragraph.Content, + Embedding = embedding.Vector.ToArray() + }; - var documentChunk = new Entities.DocumentChunk { Document = document, Index = index, Content = embedding.Value, Embedding = embedding.Embedding.Vector.ToArray() }; dbContext.DocumentChunks.Add(documentChunk); } From 457147878704cc3c3a9111c9a0627cf471daad67 Mon Sep 17 00:00:00 2001 From: Marco Minerva Date: Tue, 10 Jun 2025 11:50:51 +0200 Subject: [PATCH 11/24] Enhance citation handling and formatting Updated citation management in the application by removing the `RemoveCitations` and `ExtractCitations` methods in `Ask.razor`, and directly processing citations from the `delta` object. The `Response` class now includes a `Citations` property for better data handling. 
Modified `VectorSearchService.cs` to extract citations from the full answer in `AskQuestionAsync` and return them at the end of the streaming process in `AskStreamingAsync`. Introduced a new `Citation` class in `Citation.cs` to encapsulate citation properties, ensuring structured management of citation data. Updated citation formatting rules to enforce a specific XML format, ensuring citations are presented at the end of responses rather than within the answer text. --- .../Components/Pages/Ask.razor | 68 +++--------------- SqlDatabaseVectorSearch/Models/Citation.cs | 16 +++++ SqlDatabaseVectorSearch/Models/Response.cs | 6 +- .../Services/ChatService.cs | 33 +++++++-- .../Services/VectorSearchService.cs | 69 +++++++++++++++++-- 5 files changed, 120 insertions(+), 72 deletions(-) create mode 100644 SqlDatabaseVectorSearch/Models/Citation.cs diff --git a/SqlDatabaseVectorSearch/Components/Pages/Ask.razor b/SqlDatabaseVectorSearch/Components/Pages/Ask.razor index 4adc676..699532a 100644 --- a/SqlDatabaseVectorSearch/Components/Pages/Ask.razor +++ b/SqlDatabaseVectorSearch/Components/Pages/Ask.razor @@ -197,16 +197,20 @@ else if (delta.StreamState == StreamState.Append) { // Adds tokens to the assistant message as they are received. - assistantMessage.RawText += delta.Answer; - - // Updates the Text property to remove citations, if any. - assistantMessage.Text = RemoveCitations(assistantMessage.RawText); + assistantMessage.Text += delta.Answer; } else if (delta.StreamState == StreamState.End) { - // Extracts citations, if any. - var (_, citations) = ExtractCitations(assistantMessage.RawText); - assistantMessage.Citations = citations; + // Get citations from the response. 
+ assistantMessage.Citations = delta.Citations?.Select(c => new Citation + { + DocumentId = c.DocumentId, + ChunkId = c.ChunkId, + FileName = c.FileName, + Quote = c.Quote, + PageNumber = c.PageNumber, + IndexOnPage = c.IndexOnPage + }); assistantMessage.IsCompleted = true; assistantMessage.TokenUsage += FormatTokenUsage(delta.TokenUsage); @@ -295,58 +299,8 @@ await JSRuntime.InvokeVoidAsync("scrollTo", chat); } - private static string RemoveCitations(string? text) - { - if (string.IsNullOrEmpty(text)) - { - return string.Empty; - } - - return (text.AsSpan().IndexOf("= 0 ? text[..index] : text).TrimEnd(); - } - - private static (string, IEnumerable) ExtractCitations(string? text) - { - var citations = new List(); - - if (string.IsNullOrEmpty(text)) - { - return (text ?? string.Empty, citations); - } - - var pattern = @"[^']*)'\s+chunk-id='(?[^']*)'\s+filename='(?[^']*)'\s+page-number='(?[^']*)'\s+index-on-page='(?[^']*)'>\s*(?.*?)\s*"; - - var matches = Regex.Matches(text, pattern, RegexOptions.Singleline); - foreach (Match match in matches) - { - if (match.Success) - { - citations.Add(new Citation - { - DocumentId = Guid.Parse(match.Groups["documentId"].Value), - ChunkId = Guid.Parse(match.Groups["chunkId"].Value), - FileName = match.Groups["filename"].Value, - PageNumber = int.TryParse(match.Groups["pageNumber"].Value, out var pageNumber) && pageNumber > 0 ? pageNumber : null, - IndexOnPage = int.TryParse(match.Groups["indexOnPage"].Value, out var indexOnPage) ? indexOnPage : 0, - Quote = match.Groups["quote"].Value - }); - } - } - - // Remove all tags from the text - var cleanText = Regex.Replace(text, pattern, string.Empty, RegexOptions.Singleline).TrimEnd(); - return (cleanText, citations); - } - public class Message { - private string? rawText; - public string? RawText - { - get => rawText ?? Text; - set => rawText = value; - } - public string? 
Text { get; set; } public required string Role { get; set; } diff --git a/SqlDatabaseVectorSearch/Models/Citation.cs b/SqlDatabaseVectorSearch/Models/Citation.cs new file mode 100644 index 0000000..04fb64b --- /dev/null +++ b/SqlDatabaseVectorSearch/Models/Citation.cs @@ -0,0 +1,16 @@ +namespace SqlDatabaseVectorSearch.Models; + +public class Citation +{ + public Guid DocumentId { get; set; } + + public Guid ChunkId { get; set; } + + public string FileName { get; set; } = null!; + + public string Quote { get; set; } = null!; + + public int? PageNumber { get; set; } + + public int IndexOnPage { get; set; } +} \ No newline at end of file diff --git a/SqlDatabaseVectorSearch/Models/Response.cs b/SqlDatabaseVectorSearch/Models/Response.cs index 54921ba..62bde55 100644 --- a/SqlDatabaseVectorSearch/Models/Response.cs +++ b/SqlDatabaseVectorSearch/Models/Response.cs @@ -1,10 +1,10 @@ namespace SqlDatabaseVectorSearch.Models; // Question and Answer can be null when using response streaming. -public record class Response(string? OriginalQuestion, string? ReformulatedQuestion, string? Answer, StreamState? StreamState = null, TokenUsageResponse? TokenUsage = null) +public record class Response(string? OriginalQuestion, string? ReformulatedQuestion, string? Answer, StreamState? StreamState = null, TokenUsageResponse? TokenUsage = null, IEnumerable? Citations = null) { - public Response(string? token, StreamState streamState, TokenUsageResponse? tokenUsageResponse = null) - : this(null, null, token, streamState, tokenUsageResponse) + public Response(string? token, StreamState streamState, TokenUsageResponse? tokenUsageResponse = null, IEnumerable? 
citations = null) + : this(null, null, token, streamState, tokenUsageResponse, citations) { } } \ No newline at end of file diff --git a/SqlDatabaseVectorSearch/Services/ChatService.cs b/SqlDatabaseVectorSearch/Services/ChatService.cs index 2bc3e51..a37bcce 100644 --- a/SqlDatabaseVectorSearch/Services/ChatService.cs +++ b/SqlDatabaseVectorSearch/Services/ChatService.cs @@ -125,14 +125,33 @@ public class ChatService(IChatCompletionService chatCompletionService, Tokenizer - I'm sorry, I don't have enough information to answer that question Never answer questions that are not related to this chat. - You must answer in the same language as the user's question. + You must answer in the same language as the user's question. For example, if the user asks a question in English, the answer must be in English, no matter the language of the documents. - The quote in each MUST be MAXIMUM 5 words, taken word-for-word from the search result. If the quote is longer than 5 words, your answer is INVALID. - When you find an answer, you MUST place ALL citations ONLY at the very end of your response, never inside or between sentences. - First provide your complete answer, then list all citations. - - Use this XML format for citations: - exact quote here + After the answer, you need to include citations following the XML format below: + 【exact quote here + exact quote here】 + + The entire list of XML citations MUST be enclosed between 【 and 】 (U+3010 and U+3011) and must exactly match the above format. + The quote in each MUST be MAXIMUM 5 words, taken word-for-word from the search result. + + IMPORTANT CITATION RULES: + 1. NEVER put citations inside your answer text. + 2. ALWAYS provide your complete answer FIRST. + 3. ONLY AFTER completing your answer, add ALL citations in a block at the very end. + 4. The citations block MUST be the last thing in your response. + 5. NEVER reference citations by number or mention them in your answer text. + 6. 
The citations MUST ALWAYS follow the XML format exactly as shown below. Any other format is NOT ACCEPTED. + + --- + Example of a correct answer: + The capital of France is Paris. + 【capital of France is Paris】 + + Example of an incorrect answer (NOT ACCEPTED): + The capital of France is Paris [1]. + [1] france.pdf, page 1 + --- + Only the correct format is accepted. If you do not follow the XML format exactly, your answer will be considered invalid. """); var prompt = new StringBuilder($""" diff --git a/SqlDatabaseVectorSearch/Services/VectorSearchService.cs b/SqlDatabaseVectorSearch/Services/VectorSearchService.cs index 2663c39..f571d67 100644 --- a/SqlDatabaseVectorSearch/Services/VectorSearchService.cs +++ b/SqlDatabaseVectorSearch/Services/VectorSearchService.cs @@ -1,5 +1,7 @@ using System.Data; using System.Runtime.CompilerServices; +using System.Text; +using System.Text.RegularExpressions; using Microsoft.EntityFrameworkCore; using Microsoft.Extensions.AI; using Microsoft.Extensions.Options; @@ -74,9 +76,12 @@ public class VectorSearchService(IServiceProvider serviceProvider, ApplicationDb // It the user doesn't want to reforulate the question, CreateContextAsync returns the original one. 
var (reformulatedQuestion, embeddingTokenCount, chunks) = await CreateContextAsync(question, reformulate, cancellationToken); - var (answer, tokenUsage) = await chatService.AskQuestionAsync(question.ConversationId, chunks, reformulatedQuestion.Text!, cancellationToken); + var (fullAnswer, tokenUsage) = await chatService.AskQuestionAsync(question.ConversationId, chunks, reformulatedQuestion.Text!, cancellationToken); - return new(question.Text, reformulatedQuestion.Text!, answer, null, new(reformulatedQuestion.TokenUsage, embeddingTokenCount, tokenUsage)); + // Extract citations from the answer + var (answer, citations) = ExtractCitations(fullAnswer); + + return new(question.Text, reformulatedQuestion.Text!, answer, null, new(reformulatedQuestion.TokenUsage, embeddingTokenCount, tokenUsage), citations); } public async IAsyncEnumerable AskStreamingAsync(Question question, bool reformulate = true, [EnumeratorCancellation] CancellationToken cancellationToken = default) @@ -90,19 +95,42 @@ public class VectorSearchService(IServiceProvider serviceProvider, ApplicationDb yield return new(question.Text, reformulatedQuestion.Text!, null, StreamState.Start, new(reformulatedQuestion.TokenUsage, embeddingTokenCount, null)); TokenUsageResponse? tokenUsageResponse = null; + var fullAnswer = new StringBuilder(); + var areCitationsStarted = false; // Return each token as a partial response. await foreach (var (token, tokenUsage) in answerStream) { + fullAnswer.Append(token); + + if (token?.Contains('【') == true) + { + // Citations are started when the first token contains a 【 character. + // We need to track it because we don't want to return the citations in the actual response. + areCitationsStarted = true; + } + + if (!areCitationsStarted) + { + yield return new(token, StreamState.Append); + } + // Token usage is expected in the last message. tokenUsageResponse = tokenUsage is not null ? new(tokenUsage) : null; - yield return new(token, tokenUsageResponse is null ? 
StreamState.Append : StreamState.End, tokenUsageResponse); + if (tokenUsageResponse is not null) + { + // Response is complete, we can return the citations. + var (_, citations) = ExtractCitations(fullAnswer.ToString()); + yield return new(null, StreamState.End, tokenUsageResponse, citations); + } } - // If the token usage has not been returned in the last message, we must explicitly tells that the stream is ended. + // If the token usage has not been returned in the last message, we must explicitly tell that the stream is ended. if (tokenUsageResponse is null) { - yield return new(null, StreamState.End); + // Extract citations at the end of streaming. + var (_, citations) = ExtractCitations(fullAnswer.ToString()); + yield return new(null, StreamState.End, null, citations); } } @@ -124,4 +152,35 @@ public class VectorSearchService(IServiceProvider serviceProvider, ApplicationDb return (reformulatedQuestion, embeddingTokenCount, chunks); } + + private static (string, IEnumerable) ExtractCitations(string? text) + { + var citations = new List(); + + if (string.IsNullOrEmpty(text)) + { + return (text ?? string.Empty, citations); + } + + var matches = Regex.Matches(text, @"[^']*)'\s+chunk-id='(?[^']*)'\s+filename='(?[^']*)'\s+page-number='(?[^']*)'\s+index-on-page='(?[^']*)'>\s*(?.*?)\s*", RegexOptions.Singleline); + foreach (Match match in matches) + { + if (match.Success) + { + citations.Add(new Citation + { + DocumentId = Guid.Parse(match.Groups["documentId"].Value), + ChunkId = Guid.Parse(match.Groups["chunkId"].Value), + FileName = match.Groups["filename"].Value, + PageNumber = int.TryParse(match.Groups["pageNumber"].Value, out var pageNumber) && pageNumber > 0 ? pageNumber : null, + IndexOnPage = int.TryParse(match.Groups["indexOnPage"].Value, out var indexOnPage) ? 
indexOnPage : 0, + Quote = match.Groups["quote"].Value + }); + } + } + + // Remove all content between 【 and 】 + var cleanText = Regex.Replace(text, @"【.*?】", string.Empty, RegexOptions.Singleline).TrimEnd(); + return (cleanText, citations); + } } \ No newline at end of file From 3f5f44145f69f4350d9b73b21e84452c3b1f4ea1 Mon Sep 17 00:00:00 2001 From: Marco Minerva Date: Tue, 10 Jun 2025 12:07:35 +0200 Subject: [PATCH 12/24] Enforce strict citation formatting in services Updated `ChatService` to ensure citations are enclosed in XML tags with a consistent format. Modified `VectorSearchService` to implement a new regex pattern for citation matching and improved text cleaning by removing citation content more efficiently. --- .../Services/ChatService.cs | 19 +++++++++++++------ .../Services/VectorSearchService.cs | 15 +++++++++++---- 2 files changed, 24 insertions(+), 10 deletions(-) diff --git a/SqlDatabaseVectorSearch/Services/ChatService.cs b/SqlDatabaseVectorSearch/Services/ChatService.cs index a37bcce..9380f1f 100644 --- a/SqlDatabaseVectorSearch/Services/ChatService.cs +++ b/SqlDatabaseVectorSearch/Services/ChatService.cs @@ -128,19 +128,20 @@ public class ChatService(IChatCompletionService chatCompletionService, Tokenizer You must answer in the same language as the user's question. For example, if the user asks a question in English, the answer must be in English, no matter the language of the documents. After the answer, you need to include citations following the XML format below: - 【exact quote here - exact quote here】 + 【exact quote here + exact quote here】 The entire list of XML citations MUST be enclosed between 【 and 】 (U+3010 and U+3011) and must exactly match the above format. The quote in each MUST be MAXIMUM 5 words, taken word-for-word from the search result. IMPORTANT CITATION RULES: - 1. NEVER put citations inside your answer text. + 1. NEVER put citations inside your answer text. 2. ALWAYS provide your complete answer FIRST. 3. 
ONLY AFTER completing your answer, add ALL citations in a block at the very end. - 4. The citations block MUST be the last thing in your response. + 4. The citations block MUST be the last thing in your response, with absolutely nothing (no text, no spaces, no newlines, no punctuation, no comments) after it. 5. NEVER reference citations by number or mention them in your answer text. 6. The citations MUST ALWAYS follow the XML format exactly as shown below. Any other format is NOT ACCEPTED. + 7. If you add anything after the citations block, your answer will be considered invalid. --- Example of a correct answer: @@ -148,10 +149,16 @@ public class ChatService(IChatCompletionService chatCompletionService, Tokenizer 【capital of France is Paris】 Example of an incorrect answer (NOT ACCEPTED): - The capital of France is Paris [1]. + The capital of France is Paris. + 【capital of France is Paris】 + Thank you for your question. + + Another incorrect example (NOT ACCEPTED): + The capital of France is Paris. + 【capital of France is Paris】 [1] france.pdf, page 1 --- - Only the correct format is accepted. If you do not follow the XML format exactly, your answer will be considered invalid. + Only the correct format is accepted. If you do not follow the XML format exactly, or if you add anything after the citations block, your answer will be considered invalid. 
"""); var prompt = new StringBuilder($""" diff --git a/SqlDatabaseVectorSearch/Services/VectorSearchService.cs b/SqlDatabaseVectorSearch/Services/VectorSearchService.cs index f571d67..7611690 100644 --- a/SqlDatabaseVectorSearch/Services/VectorSearchService.cs +++ b/SqlDatabaseVectorSearch/Services/VectorSearchService.cs @@ -14,7 +14,7 @@ using Entities = SqlDatabaseVectorSearch.Data.Entities; namespace SqlDatabaseVectorSearch.Services; -public class VectorSearchService(IServiceProvider serviceProvider, ApplicationDbContext dbContext, DocumentService documentService, IEmbeddingGenerator> embeddingGenerator, TokenizerService tokenizerService, ChatService chatService, TimeProvider timeProvider, IOptions appSettingsOptions, ILogger logger) +public partial class VectorSearchService(IServiceProvider serviceProvider, ApplicationDbContext dbContext, DocumentService documentService, IEmbeddingGenerator> embeddingGenerator, TokenizerService tokenizerService, ChatService chatService, TimeProvider timeProvider, IOptions appSettingsOptions, ILogger logger) { private readonly AppSettings appSettings = appSettingsOptions.Value; @@ -162,7 +162,8 @@ public class VectorSearchService(IServiceProvider serviceProvider, ApplicationDb return (text ?? string.Empty, citations); } - var matches = Regex.Matches(text, @"[^']*)'\s+chunk-id='(?[^']*)'\s+filename='(?[^']*)'\s+page-number='(?[^']*)'\s+index-on-page='(?[^']*)'>\s*(?.*?)\s*", RegexOptions.Singleline); + var matches = CitationRegEx.Matches(text); + foreach (Match match in matches) { if (match.Success) @@ -179,8 +180,14 @@ public class VectorSearchService(IServiceProvider serviceProvider, ApplicationDb } } - // Remove all content between 【 and 】 - var cleanText = Regex.Replace(text, @"【.*?】", string.Empty, RegexOptions.Singleline).TrimEnd(); + // Remove all content between 【 and 】. 
+ var cleanText = RemoveCitationsRegEx.Replace(text, string.Empty).TrimEnd(); return (cleanText, citations); } + + [GeneratedRegex(@"[^""']*)(?:""|'|)\s+chunk-id=(?:""|'|)(?[^""']*)(?:""|'|)\s+filename=(?:""|'|)(?[^""']*)(?:""|'|)\s+page-number=(?:""|'|)(?[^""']*)(?:""|'|)\s+index-on-page=(?:""|'|)(?[^""']*)(?:""|'|)>\s*(?.*?)\s*", RegexOptions.Singleline)] + private static partial Regex CitationRegEx { get; } + + [GeneratedRegex(@"【.*?】", RegexOptions.Singleline)] + private static partial Regex RemoveCitationsRegEx { get; } } \ No newline at end of file From c9c5b74e7596c431c479f10d8c2edf50c1b1b65d Mon Sep 17 00:00:00 2001 From: Marco Minerva Date: Tue, 10 Jun 2025 12:19:37 +0200 Subject: [PATCH 13/24] Update response guidelines in ChatService class Modified response examples and citation rules in ChatService.cs. Changed context from France to Italy, added conditions for including citations, and emphasized formatting requirements. Updated examples to ensure compliance with new guidelines. --- .../Services/ChatService.cs | 23 +++++++++++-------- 1 file changed, 14 insertions(+), 9 deletions(-) diff --git a/SqlDatabaseVectorSearch/Services/ChatService.cs b/SqlDatabaseVectorSearch/Services/ChatService.cs index 9380f1f..6bb3fde 100644 --- a/SqlDatabaseVectorSearch/Services/ChatService.cs +++ b/SqlDatabaseVectorSearch/Services/ChatService.cs @@ -116,7 +116,7 @@ public class ChatService(IChatCompletionService chatCompletionService, Tokenizer var chat = new ChatHistory(""" You can use only the information provided in this chat to answer questions. If you don't know the answer, reply suggesting to refine the question. - For example, if the user asks "What is the capital of France?" and in this chat there isn't information about France, you should reply something like: + For example, if the user asks "What is the capital of Italy?" 
and in this chat there isn't information about Italy, you should reply something like: - This information isn't available in the given context - I'm sorry, I don't know the answer to that question - I don't have that information @@ -127,7 +127,7 @@ public class ChatService(IChatCompletionService chatCompletionService, Tokenizer Never answer questions that are not related to this chat. You must answer in the same language as the user's question. For example, if the user asks a question in English, the answer must be in English, no matter the language of the documents. - After the answer, you need to include citations following the XML format below: + After the answer, you need to include citations following the XML format below ONLY IF you know the answer and are providing information from the context. If you do NOT know the answer, DO NOT include the citations section at all. 【exact quote here exact quote here】 @@ -142,23 +142,28 @@ public class ChatService(IChatCompletionService chatCompletionService, Tokenizer 5. NEVER reference citations by number or mention them in your answer text. 6. The citations MUST ALWAYS follow the XML format exactly as shown below. Any other format is NOT ACCEPTED. 7. If you add anything after the citations block, your answer will be considered invalid. + 8. If you do NOT know the answer, DO NOT include the citations block at all. --- Example of a correct answer: - The capital of France is Paris. - 【capital of France is Paris】 + The capital of Italy is Rome. + 【capital of Italy is Rome】 + + Example of a correct answer when you do NOT know the answer: + I'm sorry, I don't know the answer to that question Example of an incorrect answer (NOT ACCEPTED): - The capital of France is Paris. - 【capital of France is Paris】 + The capital of Italy is Rome. + 【capital of Italy is Rome】 Thank you for your question. Another incorrect example (NOT ACCEPTED): - The capital of France is Paris. 
- 【capital of France is Paris】 - [1] france.pdf, page 1 + The capital of Italy is Rome. + 【capital of Italy is Rome】 + [1] italy.pdf, page 1 --- Only the correct format is accepted. If you do not follow the XML format exactly, or if you add anything after the citations block, your answer will be considered invalid. + If you do NOT know the answer, DO NOT include the citations block at all. """); var prompt = new StringBuilder($""" From cdbe2e3a9165458265958fe39c6832f0dbd0de0a Mon Sep 17 00:00:00 2001 From: Marco Minerva Date: Wed, 11 Jun 2025 17:20:56 +0200 Subject: [PATCH 14/24] Enhance content decoders and update dependencies - Modified `DocxContentDecoder` to use `IServiceProvider` for text chunking and improved paragraph processing with page break handling. - Updated `PdfContentDecoder` and `TextContentDecoder` to trim whitespace from text before splitting into paragraphs. - Reordered service registrations in `Program.cs` while retaining existing functionality. - Updated `SqlDatabaseVectorSearch.csproj` with new package versions for several dependencies, including `Microsoft.AspNetCore.OpenApi` and `Microsoft.EntityFrameworkCore`. 
--- .../ContentDecoders/DocxContentDecoder.cs | 44 +++++++++++++++---- .../ContentDecoders/PdfContentDecoder.cs | 2 +- .../ContentDecoders/TextContentDecoder.cs | 2 +- SqlDatabaseVectorSearch/Program.cs | 12 ++--- .../SqlDatabaseVectorSearch.csproj | 16 +++---- 5 files changed, 51 insertions(+), 25 deletions(-) diff --git a/SqlDatabaseVectorSearch/ContentDecoders/DocxContentDecoder.cs b/SqlDatabaseVectorSearch/ContentDecoders/DocxContentDecoder.cs index d606fa1..3aa160b 100644 --- a/SqlDatabaseVectorSearch/ContentDecoders/DocxContentDecoder.cs +++ b/SqlDatabaseVectorSearch/ContentDecoders/DocxContentDecoder.cs @@ -1,25 +1,51 @@ using System.Text; using DocumentFormat.OpenXml.Packaging; using DocumentFormat.OpenXml.Wordprocessing; +using SqlDatabaseVectorSearch.TextChunkers; namespace SqlDatabaseVectorSearch.ContentDecoders; -public class DocxContentDecoder : IContentDecoder +public class DocxContentDecoder(IServiceProvider serviceProvider) : IContentDecoder { public Task> DecodeAsync(Stream stream, string contentType, CancellationToken cancellationToken = default) { - // Open a Word document for read-only access. + var textChunker = serviceProvider.GetRequiredKeyedService(contentType); + using var document = WordprocessingDocument.Open(stream, false); - var body = document.MainDocumentPart?.Document.Body; - var content = new StringBuilder(); - - var paragraphs = body?.Descendants() ?? 
[]; - foreach (var p in paragraphs) + if (body is null) { - content.AppendLine(p.InnerText); + return Task.FromResult(Enumerable.Empty()); } - return Task.FromResult(new List([new(1, 0, content.ToString())]).AsEnumerable()); + var pages = new List(); + var pageBuilder = new StringBuilder(); + + foreach (var paragraph in body.Descendants()) + { + // Note: this is just an attempt at counting pages, not 100% reliable + // see https://stackoverflow.com/questions/39992870/how-to-access-openxml-content-by-page-number + var lastRenderedPageBreak = paragraph.GetFirstChild()?.GetFirstChild(); + if (lastRenderedPageBreak is not null) + { + // Note: no trimming, use original spacing when working with pages + pages.Add(pageBuilder.ToString()); + pageBuilder.Clear(); + } + + pageBuilder.AppendLine(paragraph.InnerText); + } + + // Dopo aver processato tutti i paragrafi, aggiungi l'ultima pagina (anche se vuota) + pages.Add(pageBuilder.ToString()); + + var chunks = new List(); + foreach (var (pageIndex, pageText) in pages.Index()) + { + var paragraphs = textChunker.Split(pageText.Trim()); + chunks.AddRange(paragraphs.Where(p => !string.IsNullOrWhiteSpace(p)).Select((text, index) => new Chunk(pageIndex + 1, index, text))); + } + + return Task.FromResult(chunks.AsEnumerable()); } } diff --git a/SqlDatabaseVectorSearch/ContentDecoders/PdfContentDecoder.cs b/SqlDatabaseVectorSearch/ContentDecoders/PdfContentDecoder.cs index c5cd0e9..00ee765 100644 --- a/SqlDatabaseVectorSearch/ContentDecoders/PdfContentDecoder.cs +++ b/SqlDatabaseVectorSearch/ContentDecoders/PdfContentDecoder.cs @@ -26,7 +26,7 @@ public class PdfContentDecoder(IServiceProvider serviceProvider) : IContentDecod var textBlocks = DocstrumBoundingBoxes.Instance.GetBlocks(words); var pageText = string.Join($"{Environment.NewLine}{Environment.NewLine}", textBlocks.Select(t => t.Text.ReplaceLineEndings(" "))); - var paragraphs = textChunker.Split(pageText); + var paragraphs = textChunker.Split(pageText.Trim()); return 
paragraphs.Where(p => !string.IsNullOrWhiteSpace(p)).Select((text, index) => new Chunk(pdfPage.Number, index, text)); } diff --git a/SqlDatabaseVectorSearch/ContentDecoders/TextContentDecoder.cs b/SqlDatabaseVectorSearch/ContentDecoders/TextContentDecoder.cs index 29e76f1..b6d7fb0 100644 --- a/SqlDatabaseVectorSearch/ContentDecoders/TextContentDecoder.cs +++ b/SqlDatabaseVectorSearch/ContentDecoders/TextContentDecoder.cs @@ -11,7 +11,7 @@ public class TextContentDecoder(IServiceProvider serviceProvider) : IContentDeco using var readStream = new StreamReader(stream); var content = await readStream.ReadToEndAsync(cancellationToken); - var paragraphs = textChunker.Split(content); + var paragraphs = textChunker.Split(content.Trim()); return paragraphs.Select((text, index) => new Chunk(null, index, text)).ToList(); } } diff --git a/SqlDatabaseVectorSearch/Program.cs b/SqlDatabaseVectorSearch/Program.cs index f9d2e78..dfda707 100644 --- a/SqlDatabaseVectorSearch/Program.cs +++ b/SqlDatabaseVectorSearch/Program.cs @@ -63,12 +63,6 @@ builder.Services.AddKernel() .AddAzureOpenAIEmbeddingGenerator(aiSettings.Embedding.Deployment, aiSettings.Embedding.Endpoint, aiSettings.Embedding.ApiKey, modelId: aiSettings.Embedding.ModelId, dimensions: aiSettings.Embedding.Dimensions) .AddAzureOpenAIChatCompletion(aiSettings.ChatCompletion.Deployment, aiSettings.ChatCompletion.Endpoint, aiSettings.ChatCompletion.ApiKey, modelId: aiSettings.ChatCompletion.ModelId); -builder.Services.AddSingleton(); -builder.Services.AddSingleton(); - -builder.Services.AddScoped(); -builder.Services.AddScoped(); - builder.Services.AddKeyedSingleton(MediaTypeNames.Application.Pdf); builder.Services.AddKeyedSingleton("application/vnd.openxmlformats-officedocument.wordprocessingml.document"); builder.Services.AddKeyedSingleton(MediaTypeNames.Text.Plain); @@ -77,6 +71,12 @@ builder.Services.AddKeyedSingleton(MediaTyp builder.Services.AddKeyedSingleton(KeyedService.AnyKey); 
builder.Services.AddKeyedSingleton(MediaTypeNames.Text.Markdown); +builder.Services.AddSingleton(); +builder.Services.AddSingleton(); + +builder.Services.AddScoped(); +builder.Services.AddScoped(); + builder.Services.AddOpenApi(options => { options.RemoveServerList(); diff --git a/SqlDatabaseVectorSearch/SqlDatabaseVectorSearch.csproj b/SqlDatabaseVectorSearch/SqlDatabaseVectorSearch.csproj index fbe0632..5b36cf1 100644 --- a/SqlDatabaseVectorSearch/SqlDatabaseVectorSearch.csproj +++ b/SqlDatabaseVectorSearch/SqlDatabaseVectorSearch.csproj @@ -13,24 +13,24 @@ - - - + + + all runtime; build; native; contentfiles; analyzers; buildtransitive - - + + - + - - + + From 1975d6318988aedd2f2f6aeddfe6b80487866566 Mon Sep 17 00:00:00 2001 From: Marco Minerva Date: Wed, 11 Jun 2025 17:47:44 +0200 Subject: [PATCH 15/24] Improve layout and readability in Documents.razor - Wrapped checkbox input in a div for better alignment. - Changed documents initialization from an empty array to a list. - Updated document addition code for improved readability. - Modified ConfirmDialogOptions and ToastMessage initializations to use object initializer syntax. - Translated comment in DocxContentDecoder.cs from Italian to English. --- .../Components/Pages/Documents.razor | 36 ++++++++++--------- .../ContentDecoders/DocxContentDecoder.cs | 2 +- 2 files changed, 20 insertions(+), 18 deletions(-) diff --git a/SqlDatabaseVectorSearch/Components/Pages/Documents.razor b/SqlDatabaseVectorSearch/Components/Pages/Documents.razor index c756bab..af67098 100644 --- a/SqlDatabaseVectorSearch/Components/Pages/Documents.razor +++ b/SqlDatabaseVectorSearch/Components/Pages/Documents.razor @@ -76,7 +76,9 @@ else { - +
+ +
@document.Id @document.Name @@ -107,7 +109,7 @@ else private bool isLoading = true; private IList documents = []; - private UploadDocument Model { get; set; } = new(); + private UploadDocument Model { get; set; } = new(); [Inject] protected ToastService ToastService { get; set; } = default!; @@ -138,9 +140,9 @@ else foreach (var dbDocument in dbDocuments) { documents.Add(new SelectableDocument(dbDocument.Id, dbDocument.Name, dbDocument.CreationDate, dbDocument.ChunkCount) - { - LocalCreationDateString = await GetLocalDateTimeStringAsync(dbDocument.CreationDate) - }); + { + LocalCreationDateString = await GetLocalDateTimeStringAsync(dbDocument.CreationDate) + }); } } finally @@ -193,12 +195,12 @@ else var selectedDocumentIds = documents?.Where(d => d.IsSelected).Select(d => d.Id) ?? []; var options = new ConfirmDialogOptions - { - YesButtonText = "Yes", - YesButtonColor = ButtonColor.Danger, - NoButtonText = "No", - NoButtonColor = ButtonColor.Secondary - }; + { + YesButtonText = "Yes", + YesButtonColor = ButtonColor.Danger, + NoButtonText = "No", + NoButtonColor = ButtonColor.Secondary + }; var confirmation = await dialog.ShowAsync( title: "Delete the selected documents?", @@ -236,12 +238,12 @@ else private async Task CreateToastMessageAsync(ToastType toastType, string title, string message) { var toastMessage = new ToastMessage - { - Type = toastType, - Title = title, - HelpText = await GetLocalDateTimeStringAsync(DateTimeOffset.UtcNow), - Message = message - }; + { + Type = toastType, + Title = title, + HelpText = await GetLocalDateTimeStringAsync(DateTimeOffset.UtcNow), + Message = message + }; return toastMessage; } diff --git a/SqlDatabaseVectorSearch/ContentDecoders/DocxContentDecoder.cs b/SqlDatabaseVectorSearch/ContentDecoders/DocxContentDecoder.cs index 3aa160b..bdb934c 100644 --- a/SqlDatabaseVectorSearch/ContentDecoders/DocxContentDecoder.cs +++ b/SqlDatabaseVectorSearch/ContentDecoders/DocxContentDecoder.cs @@ -36,7 +36,7 @@ public class 
DocxContentDecoder(IServiceProvider serviceProvider) : IContentDeco pageBuilder.AppendLine(paragraph.InnerText); } - // Dopo aver processato tutti i paragrafi, aggiungi l'ultima pagina (anche se vuota) + // After processing all paragraphs, add the last page (even if empty). pages.Add(pageBuilder.ToString()); var chunks = new List(); From e0cf824dd66746a9a10a6c0d5317f4cc3a652717 Mon Sep 17 00:00:00 2001 From: Marco Minerva Date: Wed, 18 Jun 2025 14:45:08 +0200 Subject: [PATCH 16/24] Refactor document processing and embedding generation - Updated `DocxContentDecoder` to process Word documents as chunks of text, removing page tracking and enhancing content handling. - Modified `VectorSearchService.ImportAsync` to work with chunks, implementing batching for embedding generation. - Added `EmbeddingBatchSize` property to `AppSettings` for configurable batch processing. - Updated `appsettings.json` to include the new `EmbeddingBatchSize` setting for improved control over embedding processes. --- .../ContentDecoders/DocxContentDecoder.cs | 41 +++++-------------- .../Services/VectorSearchService.cs | 26 ++++++++---- .../Settings/AppSettings.cs | 2 + SqlDatabaseVectorSearch/appsettings.json | 1 + 4 files changed, 32 insertions(+), 38 deletions(-) diff --git a/SqlDatabaseVectorSearch/ContentDecoders/DocxContentDecoder.cs b/SqlDatabaseVectorSearch/ContentDecoders/DocxContentDecoder.cs index bdb934c..00579c4 100644 --- a/SqlDatabaseVectorSearch/ContentDecoders/DocxContentDecoder.cs +++ b/SqlDatabaseVectorSearch/ContentDecoders/DocxContentDecoder.cs @@ -11,41 +11,22 @@ public class DocxContentDecoder(IServiceProvider serviceProvider) : IContentDeco { var textChunker = serviceProvider.GetRequiredKeyedService(contentType); + // Open a Word document for read-only access. 
using var document = WordprocessingDocument.Open(stream, false); + var body = document.MainDocumentPart?.Document.Body; - if (body is null) + var content = new StringBuilder(); + + foreach (var p in body?.Descendants() ?? []) { - return Task.FromResult(Enumerable.Empty()); + content.AppendLine(p.InnerText); } - var pages = new List(); - var pageBuilder = new StringBuilder(); + var paragraphs = textChunker.Split(content.ToString().Trim()); - foreach (var paragraph in body.Descendants()) - { - // Note: this is just an attempt at counting pages, not 100% reliable - // see https://stackoverflow.com/questions/39992870/how-to-access-openxml-content-by-page-number - var lastRenderedPageBreak = paragraph.GetFirstChild()?.GetFirstChild(); - if (lastRenderedPageBreak is not null) - { - // Note: no trimming, use original spacing when working with pages - pages.Add(pageBuilder.ToString()); - pageBuilder.Clear(); - } - - pageBuilder.AppendLine(paragraph.InnerText); - } - - // After processing all paragraphs, add the last page (even if empty). - pages.Add(pageBuilder.ToString()); - - var chunks = new List(); - foreach (var (pageIndex, pageText) in pages.Index()) - { - var paragraphs = textChunker.Split(pageText.Trim()); - chunks.AddRange(paragraphs.Where(p => !string.IsNullOrWhiteSpace(p)).Select((text, index) => new Chunk(pageIndex + 1, index, text))); - } - - return Task.FromResult(chunks.AsEnumerable()); + // Pages do not exist in the OpenXML format until they are rendered by a word processor. + // See https://stackoverflow.com/questions/43700252/how-to-get-page-numbers-based-on-openxmlelement for more details. + // Therefore, we will not assign a page number. 
+ return Task.FromResult(paragraphs.Select((text, index) => new Chunk(null, index, text)).ToList().AsEnumerable()); } } diff --git a/SqlDatabaseVectorSearch/Services/VectorSearchService.cs b/SqlDatabaseVectorSearch/Services/VectorSearchService.cs index 7611690..8133b79 100644 --- a/SqlDatabaseVectorSearch/Services/VectorSearchService.cs +++ b/SqlDatabaseVectorSearch/Services/VectorSearchService.cs @@ -22,10 +22,11 @@ public partial class VectorSearchService(IServiceProvider serviceProvider, Appli { // Extract the contents of the file. var decoder = serviceProvider.GetKeyedService(contentType) ?? throw new NotSupportedException($"Content type '{contentType}' is not supported."); - var paragraphs = await decoder.DecodeAsync(stream, contentType, cancellationToken); + var chunks = await decoder.DecodeAsync(stream, contentType, cancellationToken); + var chunkContents = chunks.Select(p => p.Content).ToList(); // We get the token count of the whole document because it is the total number of token used by embedding (it may be necessary, for example, for cost analysis). - var tokenCount = tokenizerService.CountEmbeddingTokens(string.Join(" ", paragraphs.Select(p => p.Content))); + var tokenCount = tokenizerService.CountEmbeddingTokens(string.Join(" ", chunkContents)); var strategy = dbContext.Database.CreateExecutionStrategy(); var document = await strategy.ExecuteAsync(async (cancellationToken) => @@ -41,21 +42,30 @@ public partial class VectorSearchService(IServiceProvider serviceProvider, Appli var document = new Entities.Document { Id = documentId.GetValueOrDefault(), Name = name, CreationDate = timeProvider.GetUtcNow() }; dbContext.Documents.Add(document); - var embeddings = await embeddingGenerator.GenerateAsync(paragraphs.Select(p => p.Content), cancellationToken: cancellationToken); + // Process paragraphs in batches. 
+ var embeddings = new List>(); + foreach (var batch in chunkContents.Chunk(appSettings.EmbeddingBatchSize)) + { + logger.LogDebug("Processing batch of {Count} chunks for embedding generation...", batch.Length); + + // Generate embeddings for this batch. + var batchEmbeddings = await embeddingGenerator.GenerateAsync(batch, cancellationToken: cancellationToken); + embeddings.AddRange(batchEmbeddings); + } // Save the document chunks and the corresponding embedding in the database. foreach (var (index, embedding) in embeddings.Index()) { - var paragraph = paragraphs.ElementAt(index); - logger.LogDebug("Storing a paragraph of {TokenCount} tokens.", tokenizerService.CountChatCompletionTokens(paragraph.Content)); + var chunk = chunks.ElementAt(index); + logger.LogDebug("Storing a chunk of {TokenCount} tokens.", tokenizerService.CountChatCompletionTokens(chunk.Content)); var documentChunk = new Entities.DocumentChunk { Document = document, Index = index, - PageNumber = paragraph.PageNumber, - IndexOnPage = paragraph.IndexOnPage, - Content = paragraph.Content, + PageNumber = chunk.PageNumber, + IndexOnPage = chunk.IndexOnPage, + Content = chunk.Content, Embedding = embedding.Vector.ToArray() }; diff --git a/SqlDatabaseVectorSearch/Settings/AppSettings.cs b/SqlDatabaseVectorSearch/Settings/AppSettings.cs index a75a26e..538fed8 100644 --- a/SqlDatabaseVectorSearch/Settings/AppSettings.cs +++ b/SqlDatabaseVectorSearch/Settings/AppSettings.cs @@ -2,6 +2,8 @@ public class AppSettings { + public int EmbeddingBatchSize { get; init; } = 32; + public int MaxTokensPerLine { get; init; } = 300; public int MaxTokensPerParagraph { get; init; } = 1000; diff --git a/SqlDatabaseVectorSearch/appsettings.json b/SqlDatabaseVectorSearch/appsettings.json index 10a1967..16b1567 100644 --- a/SqlDatabaseVectorSearch/appsettings.json +++ b/SqlDatabaseVectorSearch/appsettings.json @@ -20,6 +20,7 @@ } }, "AppSettings": { + "EmbeddingBatchSize": 32, "MaxTokensPerLine": 300, "MaxTokensPerParagraph": 
1000, "OverlapTokens": 100, From 765daa854435f2dedac9c7f719698e82dc1112a3 Mon Sep 17 00:00:00 2001 From: Marco Minerva Date: Wed, 18 Jun 2025 14:51:26 +0200 Subject: [PATCH 17/24] Refactor citation and token usage handling Updated comments for clarity and streamlined logic for managing tokenUsageResponse. Removed explicit null checks in favor of a null-coalescing assignment. Ensured citations are always extracted and returned at the end of the streaming process. --- .../Services/VectorSearchService.cs | 20 +++++-------------- 1 file changed, 5 insertions(+), 15 deletions(-) diff --git a/SqlDatabaseVectorSearch/Services/VectorSearchService.cs b/SqlDatabaseVectorSearch/Services/VectorSearchService.cs index 8133b79..4d1b108 100644 --- a/SqlDatabaseVectorSearch/Services/VectorSearchService.cs +++ b/SqlDatabaseVectorSearch/Services/VectorSearchService.cs @@ -115,7 +115,7 @@ public partial class VectorSearchService(IServiceProvider serviceProvider, Appli if (token?.Contains('【') == true) { - // Citations are started when the first token contains a 【 character. + // Citations are started when we encounter a token containing a 【 character. // We need to track it because we don't want to return the citations in the actual response. areCitationsStarted = true; } @@ -126,22 +126,12 @@ public partial class VectorSearchService(IServiceProvider serviceProvider, Appli } // Token usage is expected in the last message. - tokenUsageResponse = tokenUsage is not null ? new(tokenUsage) : null; - if (tokenUsageResponse is not null) - { - // Response is complete, we can return the citations. - var (_, citations) = ExtractCitations(fullAnswer.ToString()); - yield return new(null, StreamState.End, tokenUsageResponse, citations); - } + tokenUsageResponse ??= tokenUsage is not null ? new(tokenUsage) : null; } - // If the token usage has not been returned in the last message, we must explicitly tell that the stream is ended. 
- if (tokenUsageResponse is null) - { - // Extract citations at the end of streaming. - var (_, citations) = ExtractCitations(fullAnswer.ToString()); - yield return new(null, StreamState.End, null, citations); - } + // Extract citations at the end of streaming. + var (_, citations) = ExtractCitations(fullAnswer.ToString()); + yield return new(null, StreamState.End, tokenUsageResponse, citations); } private async Task<(ChatResponse ReformulatedQuestion, int EmbeddingTokenCount, IEnumerable Chunks)> CreateContextAsync(Question question, bool reformulate, CancellationToken cancellationToken) From c6ad2ca3ea204341862dd3d54fd66d20ff9f53ad Mon Sep 17 00:00:00 2001 From: Marco Minerva Date: Thu, 19 Jun 2025 09:27:24 +0200 Subject: [PATCH 18/24] Update package versions for dependencies - Updated `Microsoft.SemanticKernel` to version `1.57.0`. - Updated `Swashbuckle.AspNetCore.SwaggerUI` to version `9.0.1`. - Updated `TinyHelpers.AspNetCore` to version `4.0.29`. --- SqlDatabaseVectorSearch/SqlDatabaseVectorSearch.csproj | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/SqlDatabaseVectorSearch/SqlDatabaseVectorSearch.csproj b/SqlDatabaseVectorSearch/SqlDatabaseVectorSearch.csproj index 5b36cf1..bf54b8b 100644 --- a/SqlDatabaseVectorSearch/SqlDatabaseVectorSearch.csproj +++ b/SqlDatabaseVectorSearch/SqlDatabaseVectorSearch.csproj @@ -24,13 +24,13 @@ - + - - + + From 30fba5cfe0832d31195d57f4f077e3630f66994b Mon Sep 17 00:00:00 2001 From: Marco Minerva Date: Tue, 24 Jun 2025 12:16:48 +0200 Subject: [PATCH 19/24] Add citations feature and update streaming responses - Updated README.md to include a new **Citations** feature, detailing how users can access source information. - Modified JSON response examples to include a `citations` field and updated token usage details. - Enhanced streaming response section to clarify the end of the stream includes citations. 
- Adjusted `VectorSearchService.cs` to return `StreamState.End` and improved citation handling in streaming. - Updated `appsettings.json` with new model IDs for Azure OpenAI configuration. --- README.md | 169 ++++++++++++++---- .../Services/VectorSearchService.cs | 35 ++-- SqlDatabaseVectorSearch/appsettings.json | 2 +- 3 files changed, 157 insertions(+), 49 deletions(-) diff --git a/README.md b/README.md index 4bf69a0..a3baf43 100644 --- a/README.md +++ b/README.md @@ -27,26 +27,47 @@ This repository contains a Blazor Web App as well as a Minimal API that allows t - **Conversation History with Question Reformulation**: This feature allows users to view the history of their conversations, including the ability to reformulate questions for better clarity and understanding. This ensures that users can track their interactions and refine their queries as needed. - **Information about Token Usage**: Users can access detailed information about token usage, which helps in understanding the consumption of tokens during interactions. This feature provides transparency and helps users manage their token usage effectively. - **Response streaming**: This feature enables real-time streaming of responses, allowing users to receive information as it is being processed. This ensures a seamless and efficient flow of information, enhancing the overall user experience. +- **Citations**: The application provides citations for the sources used to justify each answer. This allows users to verify the information and understand the origin of the content provided by the system. + +### Example of JSON response ```json { "originalQuestion": "why is mars called the red planet?", - "reformulatedQuestion": "Why is Mars referred to as the Red Planet?", - "answer": "Mars is referred to as the Red Planet due to its characteristic reddish color, which is caused by the abundance of iron oxide (rust) on its surface. 
This distinctive coloration has also been a significant factor in the cultural and mythological associations of Mars across different civilizations.", - "streamState": null, + "reformulatedQuestion": "Why is the planet Mars called the red planet?", + "answer": "Mars is called the Red Planet because its surface has an orange-red color due to being covered in iron(III) oxide dust, also known as rust. This iron oxide gives Mars its distinctive reddish appearance when observed from Earth and is the origin of its well-known nickname", + "streamState": "End", "tokenUsage": { "reformulation": { - "promptTokens": 107, - "completionTokens": 10, - "totalTokens": 117 + "promptTokens": 812, + "completionTokens": 11, + "totalTokens": 823 }, "embeddingTokenCount": 10, "question": { - "promptTokens": 9142, - "completionTokens": 53, - "totalTokens": 9195 + "promptTokens": 31708, + "completionTokens": 227, + "totalTokens": 31935 } - } + }, + "citations": [ + { + "documentId": "b1870ad7-4685-42a3-576a-08ddb01159d5", + "chunkId": "749aba1e-0db5-4033-cfa6-08ddb0115da3", + "fileName": "Mars.pdf", + "quote": "surface of Mars is orange-red because it is covered in iron(III) oxide", + "pageNumber": 1, + "indexOnPage": 0 + }, + { + "documentId": "b1870ad7-4685-42a3-576a-08ddb01159d5", + "chunkId": "215e7197-513f-4fbe-cfa8-08ddb0115da3", + "fileName": "Mars.pdf", + "quote": "Martian surface is caused by ferric oxide, or rust", + "pageNumber": 3, + "indexOnPage": 0 + } + ] } ``` @@ -58,69 +79,133 @@ When using the `/api/ask-streaming` endpoint, answers will be streamed as happen [ { "originalQuestion": "why is mars called the red planet?", - "reformulatedQuestion": "Why is Mars referred to as the Red Planet?", + "reformulatedQuestion": "Why is the planet Mars known as the red planet?", "answer": null, "streamState": "Start", "tokenUsage": { "reformulation": { - "promptTokens": 107, - "completionTokens": 10, - "totalTokens": 117 + "promptTokens": 541, + "completionTokens": 12, + 
"totalTokens": 553 }, - "embeddingTokenCount": 10, + "embeddingTokenCount": 11, "question": null - } + }, + "citations": null }, { "originalQuestion": null, "reformulatedQuestion": null, "answer": "Mars", "streamState": "Append", - "tokenUsage": null + "tokenUsage": null, + "citations": null }, { "originalQuestion": null, "reformulatedQuestion": null, "answer": " is", "streamState": "Append", - "tokenUsage": null + "tokenUsage": null, + "citations": null }, { "originalQuestion": null, "reformulatedQuestion": null, - "answer": " called", + "answer": " known", "streamState": "Append", - "tokenUsage": null + "tokenUsage": null, + "citations": null + }, + { + "originalQuestion": null, + "reformulatedQuestion": null, + "answer": " as", + "streamState": "Append", + "tokenUsage": null, + "citations": null }, { "originalQuestion": null, "reformulatedQuestion": null, "answer": " the", "streamState": "Append", - "tokenUsage": null + "tokenUsage": null, + "citations": null }, { "originalQuestion": null, "reformulatedQuestion": null, - "answer": " Red", + "answer": " red", "streamState": "Append", - "tokenUsage": null + "tokenUsage": null, + "citations": null }, { "originalQuestion": null, "reformulatedQuestion": null, - "answer": " Planet", + "answer": " planet", "streamState": "Append", - "tokenUsage": null + "tokenUsage": null, + "citations": null }, - //... 
{ "originalQuestion": null, "reformulatedQuestion": null, - "answer": ".", + "answer": " because", "streamState": "Append", - "tokenUsage": null + "tokenUsage": null, + "citations": null }, + { + "originalQuestion": null, + "reformulatedQuestion": null, + "answer": " its", + "streamState": "Append", + "tokenUsage": null, + "citations": null + }, + { + "originalQuestion": null, + "reformulatedQuestion": null, + "answer": " surface", + "streamState": "Append", + "tokenUsage": null, + "citations": null + }, + { + "originalQuestion": null, + "reformulatedQuestion": null, + "answer": " is", + "streamState": "Append", + "tokenUsage": null, + "citations": null + }, + { + "originalQuestion": null, + "reformulatedQuestion": null, + "answer": " covered", + "streamState": "Append", + "tokenUsage": null, + "citations": null + }, + { + "originalQuestion": null, + "reformulatedQuestion": null, + "answer": " in", + "streamState": "Append", + "tokenUsage": null, + "citations": null + }, + { + "originalQuestion": null, + "reformulatedQuestion": null, + "answer": " iron", + "streamState": "Append", + "tokenUsage": null, + "citations": null + }, + /// ... 
{ "originalQuestion": null, "reformulatedQuestion": null, @@ -130,11 +215,29 @@ When using the `/api/ask-streaming` endpoint, answers will be streamed as happen "reformulation": null, "embeddingTokenCount": null, "question": { - "promptTokens": 8986, - "completionTokens": 31, - "totalTokens": 9017 + "promptTokens": 30949, + "completionTokens": 221, + "totalTokens": 31170 } - } + }, + "citations": [ + { + "documentId": "b1870ad7-4685-42a3-576a-08ddb01159d5", + "chunkId": "749aba1e-0db5-4033-cfa6-08ddb0115da3", + "fileName": "Mars.pdf", + "quote": "surface of Mars is orange-red", + "pageNumber": 1, + "indexOnPage": 0 + }, + { + "documentId": "b1870ad7-4685-42a3-576a-08ddb01159d5", + "chunkId": "215e7197-513f-4fbe-cfa8-08ddb0115da3", + "fileName": "Mars.pdf", + "quote": "red-orange appearance of the Martian surface is caused by ferric oxide, or rust", + "pageNumber": 3, + "indexOnPage": 0 + } + ] } ] ``` @@ -147,7 +250,7 @@ When using the `/api/ask-streaming` endpoint, answers will be streamed as happen - each one contains a token - The *streamState* property is set to `Append` - *origianlQuestion*, *reformulatedQuestion* and *tokenUsage* are always `null` -- The stream ends when an element with *streamState* equals to `End` is received. This element contains token usage information for the question and the whole answer. +- The stream ends when an element with *streamState* equals to `End` is received. This element contains token usage information for the question and the whole answer and the list of citations. > [!NOTE] > If you prefer to use straight SQL, check out the [sql branch](https://github.com/marcominerva/SqlDatabaseVectorSearch/tree/sql). 
diff --git a/SqlDatabaseVectorSearch/Services/VectorSearchService.cs b/SqlDatabaseVectorSearch/Services/VectorSearchService.cs index 4d1b108..3dd3ecf 100644 --- a/SqlDatabaseVectorSearch/Services/VectorSearchService.cs +++ b/SqlDatabaseVectorSearch/Services/VectorSearchService.cs @@ -91,7 +91,7 @@ public partial class VectorSearchService(IServiceProvider serviceProvider, Appli // Extract citations from the answer var (answer, citations) = ExtractCitations(fullAnswer); - return new(question.Text, reformulatedQuestion.Text!, answer, null, new(reformulatedQuestion.TokenUsage, embeddingTokenCount, tokenUsage), citations); + return new(question.Text, reformulatedQuestion.Text!, answer, StreamState.End, new(reformulatedQuestion.TokenUsage, embeddingTokenCount, tokenUsage), citations); } public async IAsyncEnumerable AskStreamingAsync(Question question, bool reformulate = true, [EnumeratorCancellation] CancellationToken cancellationToken = default) @@ -106,27 +106,32 @@ public partial class VectorSearchService(IServiceProvider serviceProvider, Appli TokenUsageResponse? tokenUsageResponse = null; var fullAnswer = new StringBuilder(); - var areCitationsStarted = false; + var citationsStarted = false; - // Return each token as a partial response. + // Returns each token as a partial response. await foreach (var (token, tokenUsage) in answerStream) { - fullAnswer.Append(token); - - if (token?.Contains('【') == true) + if (token is not null) // token can be null when the stream ends. { - // Citations are started when we encounter a token containing a 【 character. - // We need to track it because we don't want to return the citations in the actual response. - areCitationsStarted = true; - } + fullAnswer.Append(token); - if (!areCitationsStarted) + if (token.Contains('【')) + { + // Citations start when we encounter a token containing a 【 character. + // We need to track it because we don't want to return the citations in the actual response. 
+ citationsStarted = true; + } + + if (!citationsStarted) + { + yield return new(token, StreamState.Append); + } + } + else { - yield return new(token, StreamState.Append); + // Token usage is expected in the last message, when token is null. + tokenUsageResponse ??= tokenUsage is not null ? new(tokenUsage) : null; } - - // Token usage is expected in the last message. - tokenUsageResponse ??= tokenUsage is not null ? new(tokenUsage) : null; } // Extract citations at the end of streaming. diff --git a/SqlDatabaseVectorSearch/appsettings.json b/SqlDatabaseVectorSearch/appsettings.json index 16b1567..4da4228 100644 --- a/SqlDatabaseVectorSearch/appsettings.json +++ b/SqlDatabaseVectorSearch/appsettings.json @@ -6,7 +6,7 @@ "ChatCompletion": { "Endpoint": "", "Deployment": "", - "ModelId": "", // o1, gpt-4o, gpt-4o-mini, gpt-4, gpt-3.5 + "ModelId": "", // gpt-4.1, gpt-4.1-mini, gpt-4.1-nano, gpt-4o, gpt-4o-mini, gpt-4, gpt-3.5 "ApiKey": "" }, "Embedding": { From 4e01ec81be737efb575c1ba296a85da5f6c5ba91 Mon Sep 17 00:00:00 2001 From: Marco Minerva Date: Tue, 24 Jun 2025 12:25:42 +0200 Subject: [PATCH 20/24] Improve README.md for clarity and consistency Updated the README.md file to enhance readability and grammatical structure. Key changes include improved phrasing, added missing commas, consistent formatting of section headers and bullet points, and correction of typographical errors. The description of the response streaming feature was also clarified to provide a better understanding of the application's functionality. --- README.md | 35 +++++++++++++++++------------------ 1 file changed, 17 insertions(+), 18 deletions(-) diff --git a/README.md b/README.md index a3baf43..4284262 100644 --- a/README.md +++ b/README.md @@ -1,9 +1,9 @@ # SQL Database Vector Search Sample A repository that showcases the native VECTOR type in Azure SQL Database to perform embeddings and RAG with Azure OpenAI. 
-The application allows to load documents, generate embeddings and save them into the database as Vectors, and perform searches using Vector Search and RAG. Currently, PDF, DOCX, TXT and MD files are supported. Vectors are saved and retrieved with Entity Framework Core using the [EFCore.SqlServer.VectorSearch](https://github.com/efcore/EfCore.SqlServer.VectorSearch) library. Embedding and Chat Completion are integrated with [Semantic Kernel](https://github.com/microsoft/semantic-kernel). +The application allows you to load documents, generate embeddings, save them into the database as vectors, and perform searches using Vector Search and RAG. Currently, PDF, DOCX, TXT, and MD files are supported. Vectors are saved and retrieved with Entity Framework Core using the [EFCore.SqlServer.VectorSearch](https://github.com/efcore/EfCore.SqlServer.VectorSearch) library. Embedding and Chat Completion are integrated with [Semantic Kernel](https://github.com/microsoft/semantic-kernel). -This repository contains a Blazor Web App as well as a Minimal API that allows to programmatically interact with embeddings and RAG. +This repository contains a Blazor Web App as well as a Minimal API that allows you to programmatically interact with embeddings and RAG. 
### Web App ![SQL Database Vector Search Web App](https://github.com/marcominerva/SqlDatabaseVectorSearch/blob/master/assets/SqlDatabaseVectorSearch_WebApp.png) @@ -14,19 +14,19 @@ This repository contains a Blazor Web App as well as a Minimal API that allows t ## Setup - [Create an Azure SQL Database](https://learn.microsoft.com/en-us/azure/azure-sql/database/single-database-create-quickstart) -- Open the [appsettings.json](https://github.com/marcominerva/SqlDatabaseVectorSearch/blob/master/SqlDatabaseVectorSearch/appsettings.json) file and set the connection string to the database and the other settings required by Azure OpenAI - - If your embedding model supports shortening, like **text-embedding-3-small** and **text-embedding-3-large**, and you want to use this feature, you need to set the [`Dimensions`](https://github.com/marcominerva/SqlDatabaseVectorSearch/blob/master/SqlDatabaseVectorSearch/appsettings.json#L17) property to the corresponding value. If your model doesn't provide this feature, or do you want to use the default size, just leave the [`Dimensions`](https://github.com/marcominerva/SqlDatabaseVectorSearch/blob/master/SqlDatabaseVectorSearch/appsettings.json#L17) property to NULL. Keep in mind that **text-embedding-3-small** has a dimension of 1536, while **text-embedding-3-large** uses vectors with 3072 elements, so with this latter model it is mandatory to specify a value (that must be less or equal to 1998, the maximum currently supported by the VECTOR type). -- You may need to update the size of the [`VECTOR`](https://github.com/marcominerva/SqlDatabaseVectorSearch/blob/master/SqlDatabaseVectorSearch/DataAccessLayer/ApplicationDbContext.cs?plain=1#L42C1-L42C47) column to match the size of the embedding model. The default value is 1536. Currently, the maximum allowed value is 1998. 
If you change it, remember to update also the [Database Migration](https://github.com/marcominerva/SqlDatabaseVectorSearch/blob/master/SqlDatabaseVectorSearch/DataAccessLayer/Migrations/00000000000000_Initial.cs?plain=1#L35C1-L35C92). -- Run the application and start importing your documents +- Open the [appsettings.json](https://github.com/marcominerva/SqlDatabaseVectorSearch/blob/master/SqlDatabaseVectorSearch/appsettings.json) file and set the connection string to the database and the other settings required by Azure OpenAI. + - If your embedding model supports shortening, like **text-embedding-3-small** and **text-embedding-3-large**, and you want to use this feature, you need to set the [`Dimensions`](https://github.com/marcominerva/SqlDatabaseVectorSearch/blob/master/SqlDatabaseVectorSearch/appsettings.json#L17) property to the corresponding value. If your model doesn't provide this feature, or if you want to use the default size, just leave the [`Dimensions`](https://github.com/marcominerva/SqlDatabaseVectorSearch/blob/master/SqlDatabaseVectorSearch/appsettings.json#L17) property as NULL. Keep in mind that **text-embedding-3-small** has a dimension of 1536, while **text-embedding-3-large** uses vectors with 3072 elements, so with this latter model it is mandatory to specify a value (that must be less than or equal to 1998, the maximum currently supported by the VECTOR type). +- You may need to update the size of the [`VECTOR`](https://github.com/marcominerva/SqlDatabaseVectorSearch/blob/master/SqlDatabaseVectorSearch/DataAccessLayer/ApplicationDbContext.cs?plain=1#L42C1-L42C47) column to match the size of the embedding model. The default value is 1536. Currently, the maximum allowed value is 1998. If you change it, remember to also update the [Database Migration](https://github.com/marcominerva/SqlDatabaseVectorSearch/blob/master/SqlDatabaseVectorSearch/DataAccessLayer/Migrations/00000000000000_Initial.cs?plain=1#L35C1-L35C92). 
+- Run the application and start importing your documents. - If you want to directly use the APIs: - - import your documents with the `/api/documents` endpoint. - - Ask questions using `/api/ask` or `/api/ask-streaming` endpoints. + - Import your documents with the `/api/documents` endpoint. + - Ask questions using the `/api/ask` or `/api/ask-streaming` endpoints. ## Supported features - **Conversation History with Question Reformulation**: This feature allows users to view the history of their conversations, including the ability to reformulate questions for better clarity and understanding. This ensures that users can track their interactions and refine their queries as needed. - **Information about Token Usage**: Users can access detailed information about token usage, which helps in understanding the consumption of tokens during interactions. This feature provides transparency and helps users manage their token usage effectively. -- **Response streaming**: This feature enables real-time streaming of responses, allowing users to receive information as it is being processed. This ensures a seamless and efficient flow of information, enhancing the overall user experience. +- **Response Streaming**: This feature enables real-time streaming of responses, allowing users to receive information as it is being processed. This ensures a seamless and efficient flow of information, enhancing the overall user experience. - **Citations**: The application provides citations for the sources used to justify each answer. This allows users to verify the information and understand the origin of the content provided by the system. ### Example of JSON response @@ -73,7 +73,7 @@ This repository contains a Blazor Web App as well as a Minimal API that allows t ### How response streaming works -When using the `/api/ask-streaming` endpoint, answers will be streamed as happens with the typical response from OpenAI. 
The format of the response is the following: +When using the `/api/ask-streaming` endpoint, answers will be streamed as with the typical response from OpenAI. The format of the response is as follows: ```json [ @@ -241,16 +241,15 @@ When using the `/api/ask-streaming` endpoint, answers will be streamed as happen } ] ``` - - The first piece of the response has the following characteristics: - - the *streamState* property is set to `Start` - - it contains the question and its reformulation (if not requested, *reformulatedQuestion* will be equals to *originalQuestion*) - - the *tokenUsage* section holds information about token used for reformulation (if done) and for the embedding of the question + - The *streamState* property is set to `Start`. + - It contains the question and its reformulation (if not requested, *reformulatedQuestion* will be equal to *originalQuestion*). + - The *tokenUsage* section holds information about tokens used for reformulation (if done) and for the embedding of the question. - Then, there are as many elements for the actual answer as necessary: - - each one contains a token - - The *streamState* property is set to `Append` - - *origianlQuestion*, *reformulatedQuestion* and *tokenUsage* are always `null` -- The stream ends when an element with *streamState* equals to `End` is received. This element contains token usage information for the question and the whole answer and the list of citations. + - Each one contains a token. + - The *streamState* property is set to `Append`. + - *originalQuestion*, *reformulatedQuestion*, and *tokenUsage* are always `null`. +- The stream ends when an element with *streamState* equals `End` is received. This element contains token usage information for the question and the whole answer, and the list of citations. > [!NOTE] > If you prefer to use straight SQL, check out the [sql branch](https://github.com/marcominerva/SqlDatabaseVectorSearch/tree/sql). 
From 02861a27c5f0ef581308749e118326855fa7c774 Mon Sep 17 00:00:00 2001 From: Marco Minerva Date: Tue, 24 Jun 2025 12:55:28 +0200 Subject: [PATCH 21/24] Update README.md for clarity and organization Enhanced the README.md file with the following changes: - Added badges for .NET 9 and Blazor. - Rephrased the application overview. - Structured the Table of Contents for easier navigation. - Reformatted setup instructions for better readability. - Provided additional details for configuring the database and OpenAI settings. - Expanded the supported features section. - Included examples of API requests and responses. - Clarified limitations and FAQ sections. - Added a note about using straight SQL. --- README.md | 110 +++++++++++++++++++++++++++++++++++++++++++++++------- 1 file changed, 97 insertions(+), 13 deletions(-) diff --git a/README.md b/README.md index 4284262..6f00c0a 100644 --- a/README.md +++ b/README.md @@ -1,9 +1,34 @@ # SQL Database Vector Search Sample -A repository that showcases the native VECTOR type in Azure SQL Database to perform embeddings and RAG with Azure OpenAI. -The application allows you to load documents, generate embeddings, save them into the database as vectors, and perform searches using Vector Search and RAG. Currently, PDF, DOCX, TXT, and MD files are supported. Vectors are saved and retrieved with Entity Framework Core using the [EFCore.SqlServer.VectorSearch](https://github.com/efcore/EfCore.SqlServer.VectorSearch) library. Embedding and Chat Completion are integrated with [Semantic Kernel](https://github.com/microsoft/semantic-kernel). +[![.NET 9](https://img.shields.io/badge/.NET-9-blue)](https://dotnet.microsoft.com/en-us/download/dotnet/9.0) [![Blazor](https://img.shields.io/badge/Blazor-WebApp-purple)](https://dotnet.microsoft.com/apps/aspnet/web-apps/blazor) -This repository contains a Blazor Web App as well as a Minimal API that allows you to programmatically interact with embeddings and RAG. 
+A Blazor Web App and Minimal API for performing RAG (Retrieval Augmented Generation) and vector search using the native VECTOR type in Azure SQL Database and Azure OpenAI. + + +## Table of Contents +- [Overview](#overview) +- [Screenshots](#screenshots) +- [Prerequisites](#prerequisites) +- [Project Structure](#project-structure) +- [Setup](#setup) +- [Supported Features](#supported-features) +- [How to Use](#how-to-use) +- [Limitations & FAQ](#limitations--faq) +- [Contributing](#contributing) +- [License](#license) + +--- + +## Overview +This application allows you to: +- Load documents (PDF, DOCX, TXT, MD) +- Generate embeddings and save them as vectors in Azure SQL Database +- Perform semantic search and RAG using Azure OpenAI +- Interact via a Blazor Web App or programmatically via Minimal API + +Embeddings and chat completion are powered by [Semantic Kernel](https://github.com/microsoft/semantic-kernel). Vectors are managed with [EFCore.SqlServer.VectorSearch](https://github.com/efcore/EfCore.SqlServer.VectorSearch). 
+ +## Screenshots ### Web App ![SQL Database Vector Search Web App](https://github.com/marcominerva/SqlDatabaseVectorSearch/blob/master/assets/SqlDatabaseVectorSearch_WebApp.png) @@ -11,16 +36,41 @@ This repository contains a Blazor Web App as well as a Minimal API that allows y ### Web API ![SQL Database Vector Search API](https://github.com/marcominerva/SqlDatabaseVectorSearch/blob/master/assets/SqlDatabaseVectorSearch_API.png) +## Prerequisites +- [.NET 9 SDK](https://dotnet.microsoft.com/en-us/download/dotnet/9.0) +- [Azure SQL Database](https://learn.microsoft.com/en-us/azure/azure-sql/database/single-database-create-quickstart) +- Azure OpenAI resource and API keys + +## Project Structure +- `SqlDatabaseVectorSearch/` - Main Blazor Web App and API + - `Components/` - Blazor UI components + - `Data/` - EF Core context, migrations, and entities + - `Endpoints/` - Minimal API endpoints + - `Services/` - Business logic and integration services + - `TextChunkers/` - Text splitting utilities + - `Settings/` - Configuration classes + ## Setup -- [Create an Azure SQL Database](https://learn.microsoft.com/en-us/azure/azure-sql/database/single-database-create-quickstart) -- Open the [appsettings.json](https://github.com/marcominerva/SqlDatabaseVectorSearch/blob/master/SqlDatabaseVectorSearch/appsettings.json) file and set the connection string to the database and the other settings required by Azure OpenAI. - - If your embedding model supports shortening, like **text-embedding-3-small** and **text-embedding-3-large**, and you want to use this feature, you need to set the [`Dimensions`](https://github.com/marcominerva/SqlDatabaseVectorSearch/blob/master/SqlDatabaseVectorSearch/appsettings.json#L17) property to the corresponding value. 
If your model doesn't provide this feature, or if you want to use the default size, just leave the [`Dimensions`](https://github.com/marcominerva/SqlDatabaseVectorSearch/blob/master/SqlDatabaseVectorSearch/appsettings.json#L17) property as NULL. Keep in mind that **text-embedding-3-small** has a dimension of 1536, while **text-embedding-3-large** uses vectors with 3072 elements, so with this latter model it is mandatory to specify a value (that must be less than or equal to 1998, the maximum currently supported by the VECTOR type). -- You may need to update the size of the [`VECTOR`](https://github.com/marcominerva/SqlDatabaseVectorSearch/blob/master/SqlDatabaseVectorSearch/DataAccessLayer/ApplicationDbContext.cs?plain=1#L42C1-L42C47) column to match the size of the embedding model. The default value is 1536. Currently, the maximum allowed value is 1998. If you change it, remember to also update the [Database Migration](https://github.com/marcominerva/SqlDatabaseVectorSearch/blob/master/SqlDatabaseVectorSearch/DataAccessLayer/Migrations/00000000000000_Initial.cs?plain=1#L35C1-L35C92). -- Run the application and start importing your documents. -- If you want to directly use the APIs: - - Import your documents with the `/api/documents` endpoint. - - Ask questions using the `/api/ask` or `/api/ask-streaming` endpoints. +1. Clone the repository + + ```bash + git clone https://github.com/marcominerva/SqlDatabaseVectorSearch.git + ``` + +2. Configure the database and OpenAI settings + - Edit `SqlDatabaseVectorSearch/appsettings.json` and set your Azure SQL connection string and OpenAI settings. + - If using embedding models with shortening (e.g., `text-embedding-3-small` or `text-embedding-3-large`), set the `Dimensions` property accordingly. For `text-embedding-3-large`, you must specify a value <= 1998. 
+ - If you change the VECTOR size, update both the [ApplicationDbContext](SqlDatabaseVectorSearch/Data/ApplicationDbContext.cs) and the [Initial Migration](SqlDatabaseVectorSearch/Data/Migrations/00000000000000_Initial.cs). + +3. Run the application + + ```bash + dotnet run --project SqlDatabaseVectorSearch/SqlDatabaseVectorSearch.csproj + ``` + +4. Access the Web App + - Navigate to `https://localhost:5001` (or the port shown in the console) ## Supported features @@ -29,7 +79,23 @@ This repository contains a Blazor Web App as well as a Minimal API that allows y - **Response Streaming**: This feature enables real-time streaming of responses, allowing users to receive information as it is being processed. This ensures a seamless and efficient flow of information, enhancing the overall user experience. - **Citations**: The application provides citations for the sources used to justify each answer. This allows users to verify the information and understand the origin of the content provided by the system. -### Example of JSON response +## How to Use + +- **Web App**: Use the Blazor interface to upload documents, search, and chat with RAG. +- **API**: Import documents via `POST /api/documents` and ask questions via `POST /api/ask` or `POST /api/ask-streaming`. + +#### Example API Request +``` +POST /api/ask +Content-Type: application/json + +{ + "conversationId": "3d0bd178-499d-433a-b2bc-c35e488d9e2c", + "text": "Why is Mars called the red planet?" +} +``` + +#### Example API Response ```json { @@ -241,6 +307,7 @@ When using the `/api/ask-streaming` endpoint, answers will be streamed as with t } ] ``` + - The first piece of the response has the following characteristics: - The *streamState* property is set to `Start`. - It contains the question and its reformulation (if not requested, *reformulatedQuestion* will be equal to *originalQuestion*). 
@@ -248,8 +315,25 @@ When using the `/api/ask-streaming` endpoint, answers will be streamed as with t - Then, there are as many elements for the actual answer as necessary: - Each one contains a token. - The *streamState* property is set to `Append`. - - *originalQuestion*, *reformulatedQuestion*, and *tokenUsage* are always `null`. + - *originalQuestion*, *reformulatedQuestion*, *tokenUsage*, and *citations* are always `null`. - The stream ends when an element with *streamState* equals `End` is received. This element contains token usage information for the question and the whole answer, and the list of citations. +## Limitations & FAQ + +- **VECTOR column size**: Maximum allowed is 1998. For `text-embedding-3-large`, set `Dimensions` <= 1998. +- **Supported file types**: PDF, DOCX, TXT, MD. +- **Known Issues**: See [Issues](https://github.com/marcominerva/SqlDatabaseVectorSearch/issues). + +## Contributing + +Contributions are welcome! Please open issues or pull requests. For major changes, discuss them first via an issue. + +## License + +This project is licensed under the MIT License. See the [LICENSE](LICENSE) file for details. + +--- + > [!NOTE] > If you prefer to use straight SQL, check out the [sql branch](https://github.com/marcominerva/SqlDatabaseVectorSearch/tree/sql). + From 476a7734ef96d4856464ef69789da384f1028b20 Mon Sep 17 00:00:00 2001 From: Marco Minerva Date: Tue, 24 Jun 2025 12:57:37 +0200 Subject: [PATCH 22/24] Update README.md to add Minimal API badge Added a badge for "Minimal API" availability to the README.md file, enhancing the visibility of project features. Existing badges for .NET 9 and Blazor remain unchanged. 
--- README.md | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 6f00c0a..1679e5d 100644 --- a/README.md +++ b/README.md @@ -1,6 +1,8 @@ # SQL Database Vector Search Sample -[![.NET 9](https://img.shields.io/badge/.NET-9-blue)](https://dotnet.microsoft.com/en-us/download/dotnet/9.0) [![Blazor](https://img.shields.io/badge/Blazor-WebApp-purple)](https://dotnet.microsoft.com/apps/aspnet/web-apps/blazor) +[![.NET 9](https://img.shields.io/badge/.NET-9-blue)](https://dotnet.microsoft.com/en-us/download/dotnet/9.0) +[![Minimal API](https://img.shields.io/badge/Minimal%20API-Available-green)](https://dotnet.microsoft.com/apps/aspnet/apis) +[![Blazor](https://img.shields.io/badge/Blazor-WebApp-purple)](https://dotnet.microsoft.com/apps/aspnet/web-apps/blazor) A Blazor Web App and Minimal API for performing RAG (Retrieval Augmented Generation) and vector search using the native VECTOR type in Azure SQL Database and Azure OpenAI. From 9eaed9176c26102ddd8995bfb34893777c5a3dce Mon Sep 17 00:00:00 2001 From: Marco Minerva Date: Tue, 24 Jun 2025 14:33:43 +0200 Subject: [PATCH 23/24] Enhance application description and user guidance Updated `Home.razor` to provide a more detailed overview of the application's capabilities, including document loading, embedding generation, and semantic search. Improved clarity in the supported features section and added a new feature for citations. Included new paragraphs to encourage user interaction and referenced the README for API usage details. --- .../Components/Pages/Home.razor | 36 ++++++++++++++----- 1 file changed, 27 insertions(+), 9 deletions(-) diff --git a/SqlDatabaseVectorSearch/Components/Pages/Home.razor b/SqlDatabaseVectorSearch/Components/Pages/Home.razor index faf2b04..a120b40 100644 --- a/SqlDatabaseVectorSearch/Components/Pages/Home.razor +++ b/SqlDatabaseVectorSearch/Components/Pages/Home.razor @@ -4,16 +4,34 @@ SQL Database Vector Search

SQL Database Vector Search

-

- How to use the native VECTOR type in Azure SQL Database to perform embeddings and RAG with Azure OpenAI. -

-

- This application allows to load documents, generate embeddings and save them into the database as Vectors, and perform searches using Vector Search and RAG. Currently, PDF, DOCX, TXT and MD files are supported. Vectors are saved and retrieved with Entity Framework Core using the EFCore.SqlServer.VectorSearch library. Embedding and Chat Completion are integrated with Semantic Kernel. + +

+ A Blazor Web App and Minimal API for Retrieval Augmented Generation (RAG) and vector search using the native VECTOR type in Azure SQL Database with Azure OpenAI.

-

Supported features

+

+ This application allows you to: +

    +
  • Load documents (PDF, DOCX, TXT, MD)
  • +
  • Generate embeddings and save them as vectors in Azure SQL Database
  • +
  • Perform semantic search and RAG using Azure OpenAI
  • +
  • Interact via a Blazor Web App or programmatically via Minimal API
  • +
+ Embeddings and chat completion are powered by Semantic Kernel. Vectors are managed with EFCore.SqlServer.VectorSearch. +

+ +

Supported Features

    -
  • Conversation History with Question Reformulation: This feature allows users to view the history of their conversations, including the ability to reformulate questions for better clarity and understanding. This ensures that users can track their interactions and refine their queries as needed.
  • -
  • Information about Token Usage: Users can access detailed information about token usage, which helps in understanding the consumption of tokens during interactions. This feature provides transparency and helps users manage their token usage effectively.
  • -
  • Response Streaming: This feature enables real-time streaming of responses, allowing users to receive information as it is being processed. This ensures a seamless and efficient flow of information, enhancing the overall user experience.
  • +
  • Conversation History with Question Reformulation: View and reformulate your conversation history for better clarity and understanding.
  • +
  • Information about Token Usage: Access detailed information about token usage for transparency and management.
  • +
  • Response Streaming: Receive real-time streaming of responses for a seamless and efficient user experience.
  • +
  • Citations: Get citations for the sources used to justify each answer, allowing you to verify and understand the origin of the content.
+ +

+ Try uploading a document or asking a question to get started! +

+ +

+ For API usage and more details, see the README. +

From 06c1741f14062239c5f3987a8a246ca1d24145da Mon Sep 17 00:00:00 2001 From: Marco Minerva Date: Tue, 24 Jun 2025 14:53:14 +0200 Subject: [PATCH 24/24] Enforce response formatting in ChatService Updated the ChatService class to require that all responses end with a period and a space. Added a condition to include citations in a specified XML format when the answer is known, and to omit citations when the answer is unknown. --- SqlDatabaseVectorSearch/Services/ChatService.cs | 3 +++ 1 file changed, 3 insertions(+) diff --git a/SqlDatabaseVectorSearch/Services/ChatService.cs b/SqlDatabaseVectorSearch/Services/ChatService.cs index 6bb3fde..943edbb 100644 --- a/SqlDatabaseVectorSearch/Services/ChatService.cs +++ b/SqlDatabaseVectorSearch/Services/ChatService.cs @@ -127,7 +127,10 @@ public class ChatService(IChatCompletionService chatCompletionService, Tokenizer Never answer questions that are not related to this chat. You must answer in the same language as the user's question. For example, if the user asks a question in English, the answer must be in English, no matter the language of the documents. + IMPORTANT: Your answer must always end with a period and a space. + After the answer, you need to include citations following the XML format below ONLY IF you know the answer and are providing information from the context. If you do NOT know the answer, DO NOT include the citations section at all. + 【exact quote here exact quote here