mirror of
https://github.com/marcominerva/SqlDatabaseVectorSearch.git
synced 2026-06-20 12:23:10 +00:00
Enforce strict citation formatting in services
Updated `ChatService` to ensure citations are enclosed in XML tags with a consistent format. Modified `VectorSearchService` to implement a new regex pattern for citation matching and improved text cleaning by removing citation content more efficiently.
This commit is contained in:
@@ -128,8 +128,8 @@ public class ChatService(IChatCompletionService chatCompletionService, Tokenizer
|
|||||||
You must answer in the same language as the user's question. For example, if the user asks a question in English, the answer must be in English, no matter the language of the documents.
|
You must answer in the same language as the user's question. For example, if the user asks a question in English, the answer must be in English, no matter the language of the documents.
|
||||||
|
|
||||||
After the answer, you need to include citations following the XML format below:
|
After the answer, you need to include citations following the XML format below:
|
||||||
【<citation document-id='document_id' chunk-id='chunk_id' filename='string' page-number='page_number' index-on-page='index_on-page'>exact quote here</citation>
|
【<citation document-id="document_id" chunk-id="chunk_id" filename="string" page-number="page_number" index-on-page="index_on-page">exact quote here</citation>
|
||||||
<citation document-id='document_id' chunk-id='chunk_id' filename='string' page-number='page_number' index-on-page='index_on-page'>exact quote here</citation>】
|
<citation document-id="document_id" chunk-id="chunk_id" filename="string" page-number="page_number" index-on-page="index_on-page">exact quote here</citation>】
|
||||||
|
|
||||||
The entire list of XML citations MUST be enclosed between 【 and 】 (U+3010 and U+3011) and must exactly match the above format.
|
The entire list of XML citations MUST be enclosed between 【 and 】 (U+3010 and U+3011) and must exactly match the above format.
|
||||||
The quote in each <citation> MUST be MAXIMUM 5 words, taken word-for-word from the search result.
|
The quote in each <citation> MUST be MAXIMUM 5 words, taken word-for-word from the search result.
|
||||||
@@ -138,9 +138,10 @@ public class ChatService(IChatCompletionService chatCompletionService, Tokenizer
|
|||||||
1. NEVER put citations inside your answer text.
|
1. NEVER put citations inside your answer text.
|
||||||
2. ALWAYS provide your complete answer FIRST.
|
2. ALWAYS provide your complete answer FIRST.
|
||||||
3. ONLY AFTER completing your answer, add ALL citations in a block at the very end.
|
3. ONLY AFTER completing your answer, add ALL citations in a block at the very end.
|
||||||
4. The citations block MUST be the last thing in your response.
|
4. The citations block MUST be the last thing in your response, with absolutely nothing (no text, no spaces, no newlines, no punctuation, no comments) after it.
|
||||||
5. NEVER reference citations by number or mention them in your answer text.
|
5. NEVER reference citations by number or mention them in your answer text.
|
||||||
6. The citations MUST ALWAYS follow the XML format exactly as shown below. Any other format is NOT ACCEPTED.
|
6. The citations MUST ALWAYS follow the XML format exactly as shown below. Any other format is NOT ACCEPTED.
|
||||||
|
7. If you add anything after the citations block, your answer will be considered invalid.
|
||||||
|
|
||||||
---
|
---
|
||||||
Example of a correct answer:
|
Example of a correct answer:
|
||||||
@@ -148,10 +149,16 @@ public class ChatService(IChatCompletionService chatCompletionService, Tokenizer
|
|||||||
【<citation document-id='123' chunk-id='456' filename='france.pdf' page-number='1' index-on-page='1'>capital of France is Paris</citation>】
|
【<citation document-id='123' chunk-id='456' filename='france.pdf' page-number='1' index-on-page='1'>capital of France is Paris</citation>】
|
||||||
|
|
||||||
Example of an incorrect answer (NOT ACCEPTED):
|
Example of an incorrect answer (NOT ACCEPTED):
|
||||||
The capital of France is Paris [1].
|
The capital of France is Paris.
|
||||||
|
【<citation document-id='123' chunk-id='456' filename='france.pdf' page-number='1' index-on-page='1'>capital of France is Paris</citation>】
|
||||||
|
Thank you for your question.
|
||||||
|
|
||||||
|
Another incorrect example (NOT ACCEPTED):
|
||||||
|
The capital of France is Paris.
|
||||||
|
【<citation document-id='123' chunk-id='456' filename='france.pdf' page-number='1' index-on-page='1'>capital of France is Paris</citation>】
|
||||||
[1] france.pdf, page 1
|
[1] france.pdf, page 1
|
||||||
---
|
---
|
||||||
Only the correct format is accepted. If you do not follow the XML format exactly, your answer will be considered invalid.
|
Only the correct format is accepted. If you do not follow the XML format exactly, or if you add anything after the citations block, your answer will be considered invalid.
|
||||||
""");
|
""");
|
||||||
|
|
||||||
var prompt = new StringBuilder($"""
|
var prompt = new StringBuilder($"""
|
||||||
|
|||||||
@@ -14,7 +14,7 @@ using Entities = SqlDatabaseVectorSearch.Data.Entities;
|
|||||||
|
|
||||||
namespace SqlDatabaseVectorSearch.Services;
|
namespace SqlDatabaseVectorSearch.Services;
|
||||||
|
|
||||||
public class VectorSearchService(IServiceProvider serviceProvider, ApplicationDbContext dbContext, DocumentService documentService, IEmbeddingGenerator<string, Embedding<float>> embeddingGenerator, TokenizerService tokenizerService, ChatService chatService, TimeProvider timeProvider, IOptions<AppSettings> appSettingsOptions, ILogger<VectorSearchService> logger)
|
public partial class VectorSearchService(IServiceProvider serviceProvider, ApplicationDbContext dbContext, DocumentService documentService, IEmbeddingGenerator<string, Embedding<float>> embeddingGenerator, TokenizerService tokenizerService, ChatService chatService, TimeProvider timeProvider, IOptions<AppSettings> appSettingsOptions, ILogger<VectorSearchService> logger)
|
||||||
{
|
{
|
||||||
private readonly AppSettings appSettings = appSettingsOptions.Value;
|
private readonly AppSettings appSettings = appSettingsOptions.Value;
|
||||||
|
|
||||||
@@ -162,7 +162,8 @@ public class VectorSearchService(IServiceProvider serviceProvider, ApplicationDb
|
|||||||
return (text ?? string.Empty, citations);
|
return (text ?? string.Empty, citations);
|
||||||
}
|
}
|
||||||
|
|
||||||
var matches = Regex.Matches(text, @"<citation\s+document-id='(?<documentId>[^']*)'\s+chunk-id='(?<chunkId>[^']*)'\s+filename='(?<filename>[^']*)'\s+page-number='(?<pageNumber>[^']*)'\s+index-on-page='(?<indexOnPage>[^']*)'>\s*(?<quote>.*?)\s*</citation>", RegexOptions.Singleline);
|
var matches = CitationRegEx.Matches(text);
|
||||||
|
|
||||||
foreach (Match match in matches)
|
foreach (Match match in matches)
|
||||||
{
|
{
|
||||||
if (match.Success)
|
if (match.Success)
|
||||||
@@ -179,8 +180,14 @@ public class VectorSearchService(IServiceProvider serviceProvider, ApplicationDb
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// Remove all content between 【 and 】
|
// Remove all content between 【 and 】.
|
||||||
var cleanText = Regex.Replace(text, @"【.*?】", string.Empty, RegexOptions.Singleline).TrimEnd();
|
var cleanText = RemoveCitationsRegEx.Replace(text, string.Empty).TrimEnd();
|
||||||
return (cleanText, citations);
|
return (cleanText, citations);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// <summary>
/// Matches one <c>&lt;citation ...&gt;...&lt;/citation&gt;</c> element and exposes its parts through the named groups
/// <c>documentId</c>, <c>chunkId</c>, <c>filename</c>, <c>pageNumber</c>, <c>indexOnPage</c> and <c>quote</c>.
/// The five attributes must appear in exactly that order, separated by whitespace.
/// </summary>
/// <remarks>
/// Each attribute value may be delimited by a double quote, a single quote, or nothing at all. The opening and
/// closing delimiters are matched independently, so they are not required to be the same character.
/// Attribute values cannot contain either quote character. Whitespace surrounding the quote text is left out of
/// the <c>quote</c> group, and <see cref="RegexOptions.Singleline"/> lets the quote text span line breaks.
/// NOTE(review): because values exclude both quote characters, a filename containing an apostrophe
/// (e.g. O'Brien.pdf) will not match even when it is double-quoted — confirm whether such filenames can occur.
/// </remarks>
[GeneratedRegex(@"<citation\s+document-id=(?:""|'|)(?<documentId>[^""']*)(?:""|'|)\s+chunk-id=(?:""|'|)(?<chunkId>[^""']*)(?:""|'|)\s+filename=(?:""|'|)(?<filename>[^""']*)(?:""|'|)\s+page-number=(?:""|'|)(?<pageNumber>[^""']*)(?:""|'|)\s+index-on-page=(?:""|'|)(?<indexOnPage>[^""']*)(?:""|'|)>\s*(?<quote>.*?)\s*</citation>", RegexOptions.Singleline)]
|
||||||
|
private static partial Regex CitationRegEx { get; }
|
||||||
|
|
||||||
|
/// <summary>
/// Matches the shortest span that starts at 【 (U+3010) and ends at the next 】 (U+3011), delimiters included.
/// <see cref="RegexOptions.Singleline"/> lets the span cross line breaks.
/// </summary>
/// <remarks>
/// An opening 【 with no closing 】 after it produces no match.
/// </remarks>
[GeneratedRegex(@"【.*?】", RegexOptions.Singleline)]
|
||||||
|
private static partial Regex RemoveCitationsRegEx { get; }
|
||||||
}
|
}
|
||||||
Reference in New Issue
Block a user