From 457147878704cc3c3a9111c9a0627cf471daad67 Mon Sep 17 00:00:00 2001 From: Marco Minerva Date: Tue, 10 Jun 2025 11:50:51 +0200 Subject: [PATCH] Enhance citation handling and formatting Updated citation management in the application by removing the `RemoveCitations` and `ExtractCitations` methods in `Ask.razor`, and directly processing citations from the `delta` object. The `Response` class now includes a `Citations` property for better data handling. Modified `VectorSearchService.cs` to extract citations from the full answer in `AskQuestionAsync` and return them at the end of the streaming process in `AskStreamingAsync`. Introduced a new `Citation` class in `Citation.cs` to encapsulate citation properties, ensuring structured management of citation data. Updated citation formatting rules to enforce a specific XML format, ensuring citations are presented at the end of responses rather than within the answer text. --- .../Components/Pages/Ask.razor | 68 +++--------------- SqlDatabaseVectorSearch/Models/Citation.cs | 16 +++++ SqlDatabaseVectorSearch/Models/Response.cs | 6 +- .../Services/ChatService.cs | 33 +++++++-- .../Services/VectorSearchService.cs | 69 +++++++++++++++++-- 5 files changed, 120 insertions(+), 72 deletions(-) create mode 100644 SqlDatabaseVectorSearch/Models/Citation.cs diff --git a/SqlDatabaseVectorSearch/Components/Pages/Ask.razor b/SqlDatabaseVectorSearch/Components/Pages/Ask.razor index 4adc676..699532a 100644 --- a/SqlDatabaseVectorSearch/Components/Pages/Ask.razor +++ b/SqlDatabaseVectorSearch/Components/Pages/Ask.razor @@ -197,16 +197,20 @@ else if (delta.StreamState == StreamState.Append) { // Adds tokens to the assistant message as they are received. - assistantMessage.RawText += delta.Answer; - - // Updates the Text property to remove citations, if any. - assistantMessage.Text = RemoveCitations(assistantMessage.RawText); + assistantMessage.Text += delta.Answer; } else if (delta.StreamState == StreamState.End) { - // Extracts citations, if any. - var (_, citations) = ExtractCitations(assistantMessage.RawText); - assistantMessage.Citations = citations; + // Get citations from the response. + assistantMessage.Citations = delta.Citations?.Select(c => new Citation + { + DocumentId = c.DocumentId, + ChunkId = c.ChunkId, + FileName = c.FileName, + Quote = c.Quote, + PageNumber = c.PageNumber, + IndexOnPage = c.IndexOnPage + }); assistantMessage.IsCompleted = true; assistantMessage.TokenUsage += FormatTokenUsage(delta.TokenUsage); @@ -295,58 +299,8 @@ await JSRuntime.InvokeVoidAsync("scrollTo", chat); } - private static string RemoveCitations(string? text) - { - if (string.IsNullOrEmpty(text)) - { - return string.Empty; - } - - return (text.AsSpan().IndexOf("= 0 ? text[..index] : text).TrimEnd(); - } - - private static (string, IEnumerable) ExtractCitations(string? text) - { - var citations = new List(); - - if (string.IsNullOrEmpty(text)) - { - return (text ?? string.Empty, citations); - } - - var pattern = @"[^']*)'\s+chunk-id='(?[^']*)'\s+filename='(?[^']*)'\s+page-number='(?[^']*)'\s+index-on-page='(?[^']*)'>\s*(?.*?)\s*"; - - var matches = Regex.Matches(text, pattern, RegexOptions.Singleline); - foreach (Match match in matches) - { - if (match.Success) - { - citations.Add(new Citation - { - DocumentId = Guid.Parse(match.Groups["documentId"].Value), - ChunkId = Guid.Parse(match.Groups["chunkId"].Value), - FileName = match.Groups["filename"].Value, - PageNumber = int.TryParse(match.Groups["pageNumber"].Value, out var pageNumber) && pageNumber > 0 ? pageNumber : null, - IndexOnPage = int.TryParse(match.Groups["indexOnPage"].Value, out var indexOnPage) ? indexOnPage : 0, - Quote = match.Groups["quote"].Value - }); - } - } - - // Remove all tags from the text - var cleanText = Regex.Replace(text, pattern, string.Empty, RegexOptions.Singleline).TrimEnd(); - return (cleanText, citations); - } - public class Message { - private string? rawText; - public string? RawText - { - get => rawText ?? Text; - set => rawText = value; - } - public string? Text { get; set; } public required string Role { get; set; } diff --git a/SqlDatabaseVectorSearch/Models/Citation.cs b/SqlDatabaseVectorSearch/Models/Citation.cs new file mode 100644 index 0000000..04fb64b --- /dev/null +++ b/SqlDatabaseVectorSearch/Models/Citation.cs @@ -0,0 +1,16 @@ +namespace SqlDatabaseVectorSearch.Models; + +public class Citation +{ + public Guid DocumentId { get; set; } + + public Guid ChunkId { get; set; } + + public string FileName { get; set; } = null!; + + public string Quote { get; set; } = null!; + + public int? PageNumber { get; set; } + + public int IndexOnPage { get; set; } +} \ No newline at end of file diff --git a/SqlDatabaseVectorSearch/Models/Response.cs b/SqlDatabaseVectorSearch/Models/Response.cs index 54921ba..62bde55 100644 --- a/SqlDatabaseVectorSearch/Models/Response.cs +++ b/SqlDatabaseVectorSearch/Models/Response.cs @@ -1,10 +1,10 @@ namespace SqlDatabaseVectorSearch.Models; // Question and Answer can be null when using response streaming. -public record class Response(string? OriginalQuestion, string? ReformulatedQuestion, string? Answer, StreamState? StreamState = null, TokenUsageResponse? TokenUsage = null) +public record class Response(string? OriginalQuestion, string? ReformulatedQuestion, string? Answer, StreamState? StreamState = null, TokenUsageResponse? TokenUsage = null, IEnumerable? Citations = null) { - public Response(string? token, StreamState streamState, TokenUsageResponse? tokenUsageResponse = null) - : this(null, null, token, streamState, tokenUsageResponse) + public Response(string? token, StreamState streamState, TokenUsageResponse? tokenUsageResponse = null, IEnumerable? citations = null) + : this(null, null, token, streamState, tokenUsageResponse, citations) { } } \ No newline at end of file diff --git a/SqlDatabaseVectorSearch/Services/ChatService.cs b/SqlDatabaseVectorSearch/Services/ChatService.cs index 2bc3e51..a37bcce 100644 --- a/SqlDatabaseVectorSearch/Services/ChatService.cs +++ b/SqlDatabaseVectorSearch/Services/ChatService.cs @@ -125,14 +125,33 @@ public class ChatService(IChatCompletionService chatCompletionService, Tokenizer - I'm sorry, I don't have enough information to answer that question Never answer questions that are not related to this chat. - You must answer in the same language as the user's question. + You must answer in the same language as the user's question. For example, if the user asks a question in English, the answer must be in English, no matter the language of the documents. - The quote in each MUST be MAXIMUM 5 words, taken word-for-word from the search result. If the quote is longer than 5 words, your answer is INVALID. - When you find an answer, you MUST place ALL citations ONLY at the very end of your response, never inside or between sentences. - First provide your complete answer, then list all citations. - - Use this XML format for citations: - exact quote here + After the answer, you need to include citations following the XML format below: + 【exact quote here + exact quote here】 + + The entire list of XML citations MUST be enclosed between 【 and 】 (U+3010 and U+3011) and must exactly match the above format. + The quote in each MUST be MAXIMUM 5 words, taken word-for-word from the search result. + + IMPORTANT CITATION RULES: + 1. NEVER put citations inside your answer text. + 2. ALWAYS provide your complete answer FIRST. + 3. ONLY AFTER completing your answer, add ALL citations in a block at the very end. + 4. The citations block MUST be the last thing in your response. + 5. NEVER reference citations by number or mention them in your answer text. + 6. The citations MUST ALWAYS follow the XML format exactly as shown below. Any other format is NOT ACCEPTED. + + --- + Example of a correct answer: + The capital of France is Paris. + 【capital of France is Paris】 + + Example of an incorrect answer (NOT ACCEPTED): + The capital of France is Paris [1]. + [1] france.pdf, page 1 + --- + Only the correct format is accepted. If you do not follow the XML format exactly, your answer will be considered invalid. """); var prompt = new StringBuilder($""" diff --git a/SqlDatabaseVectorSearch/Services/VectorSearchService.cs b/SqlDatabaseVectorSearch/Services/VectorSearchService.cs index 2663c39..f571d67 100644 --- a/SqlDatabaseVectorSearch/Services/VectorSearchService.cs +++ b/SqlDatabaseVectorSearch/Services/VectorSearchService.cs @@ -1,5 +1,7 @@ using System.Data; using System.Runtime.CompilerServices; +using System.Text; +using System.Text.RegularExpressions; using Microsoft.EntityFrameworkCore; using Microsoft.Extensions.AI; using Microsoft.Extensions.Options; @@ -74,9 +76,12 @@ public class VectorSearchService(IServiceProvider serviceProvider, ApplicationDb // It the user doesn't want to reforulate the question, CreateContextAsync returns the original one. var (reformulatedQuestion, embeddingTokenCount, chunks) = await CreateContextAsync(question, reformulate, cancellationToken); - var (answer, tokenUsage) = await chatService.AskQuestionAsync(question.ConversationId, chunks, reformulatedQuestion.Text!, cancellationToken); + var (fullAnswer, tokenUsage) = await chatService.AskQuestionAsync(question.ConversationId, chunks, reformulatedQuestion.Text!, cancellationToken); - return new(question.Text, reformulatedQuestion.Text!, answer, null, new(reformulatedQuestion.TokenUsage, embeddingTokenCount, tokenUsage)); + // Extract citations from the answer + var (answer, citations) = ExtractCitations(fullAnswer); + + return new(question.Text, reformulatedQuestion.Text!, answer, null, new(reformulatedQuestion.TokenUsage, embeddingTokenCount, tokenUsage), citations); } public async IAsyncEnumerable AskStreamingAsync(Question question, bool reformulate = true, [EnumeratorCancellation] CancellationToken cancellationToken = default) @@ -90,19 +95,42 @@ public class VectorSearchService(IServiceProvider serviceProvider, ApplicationDb yield return new(question.Text, reformulatedQuestion.Text!, null, StreamState.Start, new(reformulatedQuestion.TokenUsage, embeddingTokenCount, null)); TokenUsageResponse? tokenUsageResponse = null; + var fullAnswer = new StringBuilder(); + var areCitationsStarted = false; // Return each token as a partial response. await foreach (var (token, tokenUsage) in answerStream) { + fullAnswer.Append(token); + + if (token?.Contains('【') == true) + { + // Citations are started when the first token contains a 【 character. + // We need to track it because we don't want to return the citations in the actual response. + areCitationsStarted = true; + } + + if (!areCitationsStarted) + { + yield return new(token, StreamState.Append); + } + // Token usage is expected in the last message. tokenUsageResponse = tokenUsage is not null ? new(tokenUsage) : null; - yield return new(token, tokenUsageResponse is null ? StreamState.Append : StreamState.End, tokenUsageResponse); + if (tokenUsageResponse is not null) + { + // Response is complete, we can return the citations. + var (_, citations) = ExtractCitations(fullAnswer.ToString()); + yield return new(null, StreamState.End, tokenUsageResponse, citations); + } } - // If the token usage has not been returned in the last message, we must explicitly tells that the stream is ended. + // If the token usage has not been returned in the last message, we must explicitly tell that the stream is ended. if (tokenUsageResponse is null) { - yield return new(null, StreamState.End); + // Extract citations at the end of streaming. + var (_, citations) = ExtractCitations(fullAnswer.ToString()); + yield return new(null, StreamState.End, null, citations); } } @@ -124,4 +152,35 @@ public class VectorSearchService(IServiceProvider serviceProvider, ApplicationDb return (reformulatedQuestion, embeddingTokenCount, chunks); } + + private static (string, IEnumerable) ExtractCitations(string? text) + { + var citations = new List(); + + if (string.IsNullOrEmpty(text)) + { + return (text ?? string.Empty, citations); + } + + var matches = Regex.Matches(text, @"[^']*)'\s+chunk-id='(?[^']*)'\s+filename='(?[^']*)'\s+page-number='(?[^']*)'\s+index-on-page='(?[^']*)'>\s*(?.*?)\s*", RegexOptions.Singleline); + foreach (Match match in matches) + { + if (match.Success) + { + citations.Add(new Citation + { + DocumentId = Guid.Parse(match.Groups["documentId"].Value), + ChunkId = Guid.Parse(match.Groups["chunkId"].Value), + FileName = match.Groups["filename"].Value, + PageNumber = int.TryParse(match.Groups["pageNumber"].Value, out var pageNumber) && pageNumber > 0 ? pageNumber : null, + IndexOnPage = int.TryParse(match.Groups["indexOnPage"].Value, out var indexOnPage) ? indexOnPage : 0, + Quote = match.Groups["quote"].Value + }); + } + } + + // Remove all content between 【 and 】 + var cleanText = Regex.Replace(text, @"【.*?】", string.Empty, RegexOptions.Singleline).TrimEnd(); + return (cleanText, citations); + } } \ No newline at end of file