Switch to CountEmbeddingTokens in chunkers

Updated the token counting method from CountChatCompletionTokens to CountEmbeddingTokens in VectorSearchService, DefaultTextChunker, and MarkdownTextChunker to align with embedding token counting. Added a new logging configuration for Microsoft.AspNetCore.Watch.BrowserRefresh in appsettings.Development.json to manage log verbosity during development.
This commit is contained in:
Marco Minerva
2025-10-16 12:32:48 +02:00
parent 6361bfd8d1
commit 0cc969c164
4 changed files with 6 additions and 5 deletions
@@ -58,7 +58,7 @@ public partial class VectorSearchService(IServiceProvider serviceProvider, Appli
foreach (var (index, embedding) in embeddings.Index())
{
var chunk = chunks.ElementAt(index);
logger.LogDebug("Storing a chunk of {TokenCount} tokens.", tokenizerService.CountChatCompletionTokens(chunk.Content));
logger.LogDebug("Storing a chunk of {TokenCount} tokens.", tokenizerService.CountEmbeddingTokens(chunk.Content));
var documentChunk = new Entities.DocumentChunk
{
@@ -11,8 +11,8 @@ public class DefaultTextChunker(TokenizerService tokenizerService, IOptions<AppS
public IList<string> Split(string text)
{
var lines = TextChunker.SplitPlainTextLines(text, appSettings.MaxTokensPerLine, tokenizerService.CountChatCompletionTokens);
var paragraphs = TextChunker.SplitPlainTextParagraphs(lines, appSettings.MaxTokensPerParagraph, appSettings.OverlapTokens, tokenCounter: tokenizerService.CountChatCompletionTokens);
var lines = TextChunker.SplitPlainTextLines(text, appSettings.MaxTokensPerLine, tokenizerService.CountEmbeddingTokens);
var paragraphs = TextChunker.SplitPlainTextParagraphs(lines, appSettings.MaxTokensPerParagraph, appSettings.OverlapTokens, tokenCounter: tokenizerService.CountEmbeddingTokens);
return paragraphs;
}
@@ -11,8 +11,8 @@ public class MarkdownTextChunker(TokenizerService tokenizerService, IOptions<App
public IList<string> Split(string text)
{
var lines = TextChunker.SplitMarkDownLines(text, appSettings.MaxTokensPerLine, tokenizerService.CountChatCompletionTokens);
var paragraphs = TextChunker.SplitMarkdownParagraphs(lines, appSettings.MaxTokensPerParagraph, appSettings.OverlapTokens, tokenCounter: tokenizerService.CountChatCompletionTokens);
var lines = TextChunker.SplitMarkDownLines(text, appSettings.MaxTokensPerLine, tokenizerService.CountEmbeddingTokens);
var paragraphs = TextChunker.SplitMarkdownParagraphs(lines, appSettings.MaxTokensPerParagraph, appSettings.OverlapTokens, tokenCounter: tokenizerService.CountEmbeddingTokens);
return paragraphs;
}
@@ -3,6 +3,7 @@
"LogLevel": {
"Default": "Information",
"Microsoft.AspNetCore": "Warning",
"Microsoft.AspNetCore.Watch.BrowserRefresh": "Warning",
"SqlDatabaseVectorSearch": "Debug"
}
}