From 0cc969c16462171592bb6855889396f824fa54d4 Mon Sep 17 00:00:00 2001 From: Marco Minerva Date: Thu, 16 Oct 2025 12:32:48 +0200 Subject: [PATCH] Switch to CountEmbeddingTokens in chunkers Updated the token counting method from CountChatCompletionTokens to CountEmbeddingTokens in VectorSearchService, DefaultTextChunker, and MarkdownTextChunker to align with embedding token counting. Added a new logging configuration for Microsoft.AspNetCore.Watch.BrowserRefresh in appsettings.Development.json to manage log verbosity during development. --- SqlDatabaseVectorSearch/Services/VectorSearchService.cs | 2 +- SqlDatabaseVectorSearch/TextChunkers/DefaultTextChunker.cs | 4 ++-- SqlDatabaseVectorSearch/TextChunkers/MarkdownTextChunker.cs | 4 ++-- SqlDatabaseVectorSearch/appsettings.Development.json | 1 + 4 files changed, 6 insertions(+), 5 deletions(-) diff --git a/SqlDatabaseVectorSearch/Services/VectorSearchService.cs b/SqlDatabaseVectorSearch/Services/VectorSearchService.cs index edaeea3..cc49944 100644 --- a/SqlDatabaseVectorSearch/Services/VectorSearchService.cs +++ b/SqlDatabaseVectorSearch/Services/VectorSearchService.cs @@ -58,7 +58,7 @@ public partial class VectorSearchService(IServiceProvider serviceProvider, Appli foreach (var (index, embedding) in embeddings.Index()) { var chunk = chunks.ElementAt(index); - logger.LogDebug("Storing a chunk of {TokenCount} tokens.", tokenizerService.CountChatCompletionTokens(chunk.Content)); + logger.LogDebug("Storing a chunk of {TokenCount} tokens.", tokenizerService.CountEmbeddingTokens(chunk.Content)); var documentChunk = new Entities.DocumentChunk { diff --git a/SqlDatabaseVectorSearch/TextChunkers/DefaultTextChunker.cs b/SqlDatabaseVectorSearch/TextChunkers/DefaultTextChunker.cs index 46ccd49..a5a6854 100644 --- a/SqlDatabaseVectorSearch/TextChunkers/DefaultTextChunker.cs +++ b/SqlDatabaseVectorSearch/TextChunkers/DefaultTextChunker.cs @@ -11,8 +11,8 @@ public class DefaultTextChunker(TokenizerService tokenizerService, IOptions Split(string text) { - var lines = TextChunker.SplitPlainTextLines(text, appSettings.MaxTokensPerLine, tokenizerService.CountChatCompletionTokens); - var paragraphs = TextChunker.SplitPlainTextParagraphs(lines, appSettings.MaxTokensPerParagraph, appSettings.OverlapTokens, tokenCounter: tokenizerService.CountChatCompletionTokens); + var lines = TextChunker.SplitPlainTextLines(text, appSettings.MaxTokensPerLine, tokenizerService.CountEmbeddingTokens); + var paragraphs = TextChunker.SplitPlainTextParagraphs(lines, appSettings.MaxTokensPerParagraph, appSettings.OverlapTokens, tokenCounter: tokenizerService.CountEmbeddingTokens); return paragraphs; } diff --git a/SqlDatabaseVectorSearch/TextChunkers/MarkdownTextChunker.cs b/SqlDatabaseVectorSearch/TextChunkers/MarkdownTextChunker.cs index fd3a8f6..cba6679 100644 --- a/SqlDatabaseVectorSearch/TextChunkers/MarkdownTextChunker.cs +++ b/SqlDatabaseVectorSearch/TextChunkers/MarkdownTextChunker.cs @@ -11,8 +11,8 @@ public class MarkdownTextChunker(TokenizerService tokenizerService, IOptions Split(string text) { - var lines = TextChunker.SplitMarkDownLines(text, appSettings.MaxTokensPerLine, tokenizerService.CountChatCompletionTokens); - var paragraphs = TextChunker.SplitMarkdownParagraphs(lines, appSettings.MaxTokensPerParagraph, appSettings.OverlapTokens, tokenCounter: tokenizerService.CountChatCompletionTokens); + var lines = TextChunker.SplitMarkDownLines(text, appSettings.MaxTokensPerLine, tokenizerService.CountEmbeddingTokens); + var paragraphs = TextChunker.SplitMarkdownParagraphs(lines, appSettings.MaxTokensPerParagraph, appSettings.OverlapTokens, tokenCounter: tokenizerService.CountEmbeddingTokens); return paragraphs; } diff --git a/SqlDatabaseVectorSearch/appsettings.Development.json b/SqlDatabaseVectorSearch/appsettings.Development.json index 19c6237..0fc47fd 100644 --- a/SqlDatabaseVectorSearch/appsettings.Development.json +++ b/SqlDatabaseVectorSearch/appsettings.Development.json @@ -3,6 +3,7 @@ "LogLevel": { "Default": "Information", "Microsoft.AspNetCore": "Warning", + "Microsoft.AspNetCore.Watch.BrowserRefresh": "Warning", "SqlDatabaseVectorSearch": "Debug" } }