From 3f5f44145f69f4350d9b73b21e84452c3b1f4ea1 Mon Sep 17 00:00:00 2001 From: Marco Minerva Date: Tue, 10 Jun 2025 12:07:35 +0200 Subject: [PATCH] Enforce strict citation formatting in services Updated `ChatService` to ensure citations are enclosed in XML tags with a consistent format. Modified `VectorSearchService` to implement a new regex pattern for citation matching and improved text cleaning by removing citation content more efficiently. --- .../Services/ChatService.cs | 19 +++++++++++++------ .../Services/VectorSearchService.cs | 15 +++++++++++---- 2 files changed, 24 insertions(+), 10 deletions(-) diff --git a/SqlDatabaseVectorSearch/Services/ChatService.cs b/SqlDatabaseVectorSearch/Services/ChatService.cs index a37bcce..9380f1f 100644 --- a/SqlDatabaseVectorSearch/Services/ChatService.cs +++ b/SqlDatabaseVectorSearch/Services/ChatService.cs @@ -128,19 +128,20 @@ public class ChatService(IChatCompletionService chatCompletionService, Tokenizer You must answer in the same language as the user's question. For example, if the user asks a question in English, the answer must be in English, no matter the language of the documents. After the answer, you need to include citations following the XML format below: - 【exact quote here - exact quote here】 + 【exact quote here + exact quote here】 The entire list of XML citations MUST be enclosed between 【 and 】 (U+3010 and U+3011) and must exactly match the above format. The quote in each MUST be MAXIMUM 5 words, taken word-for-word from the search result. IMPORTANT CITATION RULES: - 1. NEVER put citations inside your answer text. + 1. NEVER put citations inside your answer text. 2. ALWAYS provide your complete answer FIRST. 3. ONLY AFTER completing your answer, add ALL citations in a block at the very end. - 4. The citations block MUST be the last thing in your response. + 4. The citations block MUST be the last thing in your response, with absolutely nothing (no text, no spaces, no newlines, no punctuation, no comments) after it. 5. NEVER reference citations by number or mention them in your answer text. 6. The citations MUST ALWAYS follow the XML format exactly as shown below. Any other format is NOT ACCEPTED. + 7. If you add anything after the citations block, your answer will be considered invalid. --- Example of a correct answer: @@ -148,10 +149,16 @@ public class ChatService(IChatCompletionService chatCompletionService, Tokenizer 【capital of France is Paris】 Example of an incorrect answer (NOT ACCEPTED): - The capital of France is Paris [1]. + The capital of France is Paris. + 【capital of France is Paris】 + Thank you for your question. + + Another incorrect example (NOT ACCEPTED): + The capital of France is Paris. + 【capital of France is Paris】 [1] france.pdf, page 1 --- - Only the correct format is accepted. If you do not follow the XML format exactly, your answer will be considered invalid. + Only the correct format is accepted. If you do not follow the XML format exactly, or if you add anything after the citations block, your answer will be considered invalid. """); var prompt = new StringBuilder($""" diff --git a/SqlDatabaseVectorSearch/Services/VectorSearchService.cs b/SqlDatabaseVectorSearch/Services/VectorSearchService.cs index f571d67..7611690 100644 --- a/SqlDatabaseVectorSearch/Services/VectorSearchService.cs +++ b/SqlDatabaseVectorSearch/Services/VectorSearchService.cs @@ -14,7 +14,7 @@ using Entities = SqlDatabaseVectorSearch.Data.Entities; namespace SqlDatabaseVectorSearch.Services; -public class VectorSearchService(IServiceProvider serviceProvider, ApplicationDbContext dbContext, DocumentService documentService, IEmbeddingGenerator> embeddingGenerator, TokenizerService tokenizerService, ChatService chatService, TimeProvider timeProvider, IOptions appSettingsOptions, ILogger logger) +public partial class VectorSearchService(IServiceProvider serviceProvider, ApplicationDbContext dbContext, DocumentService documentService, IEmbeddingGenerator> embeddingGenerator, TokenizerService tokenizerService, ChatService chatService, TimeProvider timeProvider, IOptions appSettingsOptions, ILogger logger) { private readonly AppSettings appSettings = appSettingsOptions.Value; @@ -162,7 +162,8 @@ public class VectorSearchService(IServiceProvider serviceProvider, ApplicationDb return (text ?? string.Empty, citations); } - var matches = Regex.Matches(text, @"[^']*)'\s+chunk-id='(?[^']*)'\s+filename='(?[^']*)'\s+page-number='(?[^']*)'\s+index-on-page='(?[^']*)'>\s*(?.*?)\s*", RegexOptions.Singleline); + var matches = CitationRegEx.Matches(text); + foreach (Match match in matches) { if (match.Success) @@ -179,8 +180,14 @@ public class VectorSearchService(IServiceProvider serviceProvider, ApplicationDb } } - // Remove all content between 【 and 】 - var cleanText = Regex.Replace(text, @"【.*?】", string.Empty, RegexOptions.Singleline).TrimEnd(); + // Remove all content between 【 and 】. + var cleanText = RemoveCitationsRegEx.Replace(text, string.Empty).TrimEnd(); return (cleanText, citations); } + + [GeneratedRegex(@"[^""']*)(?:""|'|)\s+chunk-id=(?:""|'|)(?[^""']*)(?:""|'|)\s+filename=(?:""|'|)(?[^""']*)(?:""|'|)\s+page-number=(?:""|'|)(?[^""']*)(?:""|'|)\s+index-on-page=(?:""|'|)(?[^""']*)(?:""|'|)>\s*(?.*?)\s*", RegexOptions.Singleline)] + private static partial Regex CitationRegEx { get; } + + [GeneratedRegex(@"【.*?】", RegexOptions.Singleline)] + private static partial Regex RemoveCitationsRegEx { get; } } \ No newline at end of file