Enforce strict citation formatting in services

Updated `ChatService` to ensure citations are enclosed in XML tags with a consistent format. Modified `VectorSearchService` to implement a new regex pattern for citation matching and improved text cleaning by removing citation content more efficiently.
This commit is contained in:
Marco Minerva
2025-06-10 12:07:35 +02:00
parent 4571478787
commit 3f5f44145f
2 changed files with 24 additions and 10 deletions
@@ -128,8 +128,8 @@ public class ChatService(IChatCompletionService chatCompletionService, Tokenizer
You must answer in the same language as the user's question. For example, if the user asks a question in English, the answer must be in English, no matter the language of the documents.
After the answer, you need to include citations following the XML format below:
<citation document-id='document_id' chunk-id='chunk_id' filename='string' page-number='page_number' index-on-page='index_on-page'>exact quote here</citation>
<citation document-id='document_id' chunk-id='chunk_id' filename='string' page-number='page_number' index-on-page='index_on-page'>exact quote here</citation>
<citation document-id="document_id" chunk-id="chunk_id" filename="string" page-number="page_number" index-on-page="index_on-page">exact quote here</citation>
<citation document-id="document_id" chunk-id="chunk_id" filename="string" page-number="page_number" index-on-page="index_on-page">exact quote here</citation>
The entire list of XML citations MUST be enclosed between 【 and 】 (U+3010 and U+3011) and must exactly match the above format.
The quote in each <citation> MUST be MAXIMUM 5 words, taken word-for-word from the search result.
@@ -138,9 +138,10 @@ public class ChatService(IChatCompletionService chatCompletionService, Tokenizer
1. NEVER put citations inside your answer text.
2. ALWAYS provide your complete answer FIRST.
3. ONLY AFTER completing your answer, add ALL citations in a block at the very end.
4. The citations block MUST be the last thing in your response.
4. The citations block MUST be the last thing in your response, with absolutely nothing (no text, no spaces, no newlines, no punctuation, no comments) after it.
5. NEVER reference citations by number or mention them in your answer text.
6. The citations MUST ALWAYS follow the XML format exactly as shown below. Any other format is NOT ACCEPTED.
7. If you add anything after the citations block, your answer will be considered invalid.
---
Example of a correct answer:
@@ -148,10 +149,16 @@ public class ChatService(IChatCompletionService chatCompletionService, Tokenizer
<citation document-id='123' chunk-id='456' filename='france.pdf' page-number='1' index-on-page='1'>capital of France is Paris</citation>
Example of an incorrect answer (NOT ACCEPTED):
The capital of France is Paris [1].
The capital of France is Paris.
<citation document-id='123' chunk-id='456' filename='france.pdf' page-number='1' index-on-page='1'>capital of France is Paris</citation>
Thank you for your question.
Another incorrect example (NOT ACCEPTED):
The capital of France is Paris.
<citation document-id='123' chunk-id='456' filename='france.pdf' page-number='1' index-on-page='1'>capital of France is Paris</citation>
[1] france.pdf, page 1
---
Only the correct format is accepted. If you do not follow the XML format exactly, your answer will be considered invalid.
Only the correct format is accepted. If you do not follow the XML format exactly, or if you add anything after the citations block, your answer will be considered invalid.
""");
var prompt = new StringBuilder($"""
@@ -14,7 +14,7 @@ using Entities = SqlDatabaseVectorSearch.Data.Entities;
namespace SqlDatabaseVectorSearch.Services;
public class VectorSearchService(IServiceProvider serviceProvider, ApplicationDbContext dbContext, DocumentService documentService, IEmbeddingGenerator<string, Embedding<float>> embeddingGenerator, TokenizerService tokenizerService, ChatService chatService, TimeProvider timeProvider, IOptions<AppSettings> appSettingsOptions, ILogger<VectorSearchService> logger)
public partial class VectorSearchService(IServiceProvider serviceProvider, ApplicationDbContext dbContext, DocumentService documentService, IEmbeddingGenerator<string, Embedding<float>> embeddingGenerator, TokenizerService tokenizerService, ChatService chatService, TimeProvider timeProvider, IOptions<AppSettings> appSettingsOptions, ILogger<VectorSearchService> logger)
{
private readonly AppSettings appSettings = appSettingsOptions.Value;
@@ -162,7 +162,8 @@ public class VectorSearchService(IServiceProvider serviceProvider, ApplicationDb
return (text ?? string.Empty, citations);
}
var matches = Regex.Matches(text, @"<citation\s+document-id='(?<documentId>[^']*)'\s+chunk-id='(?<chunkId>[^']*)'\s+filename='(?<filename>[^']*)'\s+page-number='(?<pageNumber>[^']*)'\s+index-on-page='(?<indexOnPage>[^']*)'>\s*(?<quote>.*?)\s*</citation>", RegexOptions.Singleline);
var matches = CitationRegEx.Matches(text);
foreach (Match match in matches)
{
if (match.Success)
@@ -179,8 +180,14 @@ public class VectorSearchService(IServiceProvider serviceProvider, ApplicationDb
}
}
// Remove all content between 【 and 】
var cleanText = Regex.Replace(text, @"【.*?】", string.Empty, RegexOptions.Singleline).TrimEnd();
// Remove all content between 【 and 】.
var cleanText = RemoveCitationsRegEx.Replace(text, string.Empty).TrimEnd();
return (cleanText, citations);
}
/// <summary>
/// Source-generated regex that matches one <c>citation</c> element and captures its parts.
/// </summary>
/// <remarks>
/// The attributes must appear in this exact order, separated by whitespace:
/// <c>document-id</c>, <c>chunk-id</c>, <c>filename</c>, <c>page-number</c>, <c>index-on-page</c>.
/// Each value may be wrapped in double quotes, single quotes, or no quotes at all; the opening and
/// closing delimiters are matched independently, so they are not required to be the same character.
/// Named groups: <c>documentId</c>, <c>chunkId</c>, <c>filename</c>, <c>pageNumber</c>,
/// <c>indexOnPage</c> and <c>quote</c>. The <c>quote</c> group is matched lazily and excludes the
/// whitespace directly after the opening tag and directly before the closing tag.
/// <see cref="RegexOptions.Singleline"/> lets <c>.</c> match newlines, so a quote may span several lines.
/// NOTE(review): attribute values are matched with a character class that excludes both quote
/// characters, so a value containing an apostrophe or a double quote (for example a file name such as
/// O'Brien.pdf) will not match, even when it is wrapped in the other kind of quote.
/// </remarks>
[GeneratedRegex(@"<citation\s+document-id=(?:""|'|)(?<documentId>[^""']*)(?:""|'|)\s+chunk-id=(?:""|'|)(?<chunkId>[^""']*)(?:""|'|)\s+filename=(?:""|'|)(?<filename>[^""']*)(?:""|'|)\s+page-number=(?:""|'|)(?<pageNumber>[^""']*)(?:""|'|)\s+index-on-page=(?:""|'|)(?<indexOnPage>[^""']*)(?:""|'|)>\s*(?<quote>.*?)\s*</citation>", RegexOptions.Singleline)]
private static partial Regex CitationRegEx { get; }
/// <summary>
/// Source-generated regex that matches every span starting at 【 (U+3010) and ending at the nearest
/// following 】 (U+3011), with both bracket characters included in the match.
/// </summary>
/// <remarks>
/// The match is lazy, so several bracketed spans in the same text are matched separately rather than
/// as one span from the first 【 to the last 】. <see cref="RegexOptions.Singleline"/> lets <c>.</c>
/// match newlines, so a bracketed span may extend over multiple lines.
/// A 【 with no closing 】 after it is not matched.
/// </remarks>
[GeneratedRegex(@"【.*?】", RegexOptions.Singleline)]
private static partial Regex RemoveCitationsRegEx { get; }
}