mirror of
https://github.com/marcominerva/SqlDatabaseVectorSearch.git
synced 2026-06-20 12:23:10 +00:00
Switch to CountEmbeddingTokens in chunkers
Changed the token counter from CountChatCompletionTokens to CountEmbeddingTokens in VectorSearchService, DefaultTextChunker, and MarkdownTextChunker, so that chunk logging and text splitting both count tokens with the embedding counter. Also added a "Warning" log level for Microsoft.AspNetCore.Watch.BrowserRefresh in appsettings.Development.json to limit that category's log output during development.
This commit is contained in:
@@ -58,7 +58,7 @@ public partial class VectorSearchService(IServiceProvider serviceProvider, Appli
|
||||
foreach (var (index, embedding) in embeddings.Index())
|
||||
{
|
||||
var chunk = chunks.ElementAt(index);
|
||||
logger.LogDebug("Storing a chunk of {TokenCount} tokens.", tokenizerService.CountChatCompletionTokens(chunk.Content));
|
||||
logger.LogDebug("Storing a chunk of {TokenCount} tokens.", tokenizerService.CountEmbeddingTokens(chunk.Content));
|
||||
|
||||
var documentChunk = new Entities.DocumentChunk
|
||||
{
|
||||
|
||||
@@ -11,8 +11,8 @@ public class DefaultTextChunker(TokenizerService tokenizerService, IOptions<AppS
|
||||
|
||||
public IList<string> Split(string text)
|
||||
{
|
||||
var lines = TextChunker.SplitPlainTextLines(text, appSettings.MaxTokensPerLine, tokenizerService.CountChatCompletionTokens);
|
||||
var paragraphs = TextChunker.SplitPlainTextParagraphs(lines, appSettings.MaxTokensPerParagraph, appSettings.OverlapTokens, tokenCounter: tokenizerService.CountChatCompletionTokens);
|
||||
var lines = TextChunker.SplitPlainTextLines(text, appSettings.MaxTokensPerLine, tokenizerService.CountEmbeddingTokens);
|
||||
var paragraphs = TextChunker.SplitPlainTextParagraphs(lines, appSettings.MaxTokensPerParagraph, appSettings.OverlapTokens, tokenCounter: tokenizerService.CountEmbeddingTokens);
|
||||
|
||||
return paragraphs;
|
||||
}
|
||||
|
||||
@@ -11,8 +11,8 @@ public class MarkdownTextChunker(TokenizerService tokenizerService, IOptions<App
|
||||
|
||||
public IList<string> Split(string text)
|
||||
{
|
||||
var lines = TextChunker.SplitMarkDownLines(text, appSettings.MaxTokensPerLine, tokenizerService.CountChatCompletionTokens);
|
||||
var paragraphs = TextChunker.SplitMarkdownParagraphs(lines, appSettings.MaxTokensPerParagraph, appSettings.OverlapTokens, tokenCounter: tokenizerService.CountChatCompletionTokens);
|
||||
var lines = TextChunker.SplitMarkDownLines(text, appSettings.MaxTokensPerLine, tokenizerService.CountEmbeddingTokens);
|
||||
var paragraphs = TextChunker.SplitMarkdownParagraphs(lines, appSettings.MaxTokensPerParagraph, appSettings.OverlapTokens, tokenCounter: tokenizerService.CountEmbeddingTokens);
|
||||
|
||||
return paragraphs;
|
||||
}
|
||||
|
||||
@@ -3,6 +3,7 @@
|
||||
"LogLevel": {
|
||||
"Default": "Information",
|
||||
"Microsoft.AspNetCore": "Warning",
|
||||
"Microsoft.AspNetCore.Watch.BrowserRefresh": "Warning",
|
||||
"SqlDatabaseVectorSearch": "Debug"
|
||||
}
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user