From 5624f73640cced1aebdfef6c8f4fba9fb20b0f34 Mon Sep 17 00:00:00 2001 From: Marco Minerva Date: Mon, 15 Jun 2026 17:58:30 +0200 Subject: [PATCH] Refactor: replace SemanticKernel with Agents.AI.OpenAI Removed Microsoft.SemanticKernel dependencies in favor of Microsoft.Agents.AI.OpenAI for embedding and chat services. Updated DI registrations in Program.cs to use OpenAIClient. Reimplemented text chunking with a new PlainTextChunker class, updating DefaultTextChunker and MarkdownTextChunker accordingly. Updated .csproj to add new package references and suppress related analyzer warnings. --- SqlDatabaseVectorSearch/Program.cs | 29 +- .../SqlDatabaseVectorSearch.csproj | 8 +- .../TextChunkers/DefaultTextChunker.cs | 6 +- .../Implementations/PlainTextChunker.cs | 347 ++++++++++++++++++ .../TextChunkers/MarkdownTextChunker.cs | 6 +- 5 files changed, 383 insertions(+), 13 deletions(-) create mode 100644 SqlDatabaseVectorSearch/TextChunkers/Implementations/PlainTextChunker.cs diff --git a/SqlDatabaseVectorSearch/Program.cs b/SqlDatabaseVectorSearch/Program.cs index 655e93a..e99238c 100644 --- a/SqlDatabaseVectorSearch/Program.cs +++ b/SqlDatabaseVectorSearch/Program.cs @@ -1,8 +1,11 @@ +using System.ClientModel; using System.Net.Mime; using System.Text.Json.Serialization; using FluentValidation; using Microsoft.EntityFrameworkCore; -using Microsoft.SemanticKernel; +using Microsoft.Extensions.AI; +using OpenAI; +using OpenAI.Responses; using SqlDatabaseVectorSearch.Components; using SqlDatabaseVectorSearch.ContentDecoders; using SqlDatabaseVectorSearch.Data; @@ -54,11 +57,25 @@ builder.Services.ConfigureHttpClientDefaults(configure => }); }); -// Semantic Kernel is used to generate embeddings and to reformulate questions taking into account all the previous interactions, -// so that embeddings themselves can be generated more accurately. 
-builder.Services.AddKernel() - .AddAzureOpenAIEmbeddingGenerator(aiSettings.Embedding.Deployment, aiSettings.Embedding.Endpoint, aiSettings.Embedding.ApiKey, modelId: aiSettings.Embedding.ModelId, dimensions: aiSettings.Embedding.Dimensions) - .AddAzureOpenAIChatCompletion(aiSettings.ChatCompletion.Deployment, aiSettings.ChatCompletion.Endpoint, aiSettings.ChatCompletion.ApiKey, modelId: aiSettings.ChatCompletion.ModelId); +builder.Services.AddSingleton(_ => +{ + var embeddingClient = new OpenAIClient(new ApiKeyCredential(aiSettings.Embedding.ApiKey), new() + { + Endpoint = new(aiSettings.Embedding.Endpoint), + }).GetEmbeddingClient(aiSettings.Embedding.Deployment).AsIEmbeddingGenerator(aiSettings.Embedding.Dimensions); + + return embeddingClient; +}); + +builder.Services.AddChatClient(_ => +{ + var chatClient = new OpenAIClient(new ApiKeyCredential(aiSettings.ChatCompletion.ApiKey), new() + { + Endpoint = new(aiSettings.ChatCompletion.Endpoint), + }).GetResponsesClient().AsIChatClientWithStoredOutputDisabled(aiSettings.ChatCompletion.Deployment); + + return chatClient; +}); builder.Services.AddKeyedSingleton(MediaTypeNames.Application.Pdf); builder.Services.AddKeyedSingleton("application/vnd.openxmlformats-officedocument.wordprocessingml.document"); diff --git a/SqlDatabaseVectorSearch/SqlDatabaseVectorSearch.csproj b/SqlDatabaseVectorSearch/SqlDatabaseVectorSearch.csproj index c82f9d0..42361e4 100644 --- a/SqlDatabaseVectorSearch/SqlDatabaseVectorSearch.csproj +++ b/SqlDatabaseVectorSearch/SqlDatabaseVectorSearch.csproj @@ -4,7 +4,7 @@ net10.0 enable enable - $(NoWarn);SKEXP0010;SKEXP0050 + $(NoWarn);SKEXP0010;SKEXP0050;OPENAI001;MAAI001 @@ -12,7 +12,13 @@ + + + + all + runtime; build; native; contentfiles; analyzers; buildtransitive + all diff --git a/SqlDatabaseVectorSearch/TextChunkers/DefaultTextChunker.cs b/SqlDatabaseVectorSearch/TextChunkers/DefaultTextChunker.cs index a5a6854..777f48f 100644 --- 
a/SqlDatabaseVectorSearch/TextChunkers/DefaultTextChunker.cs +++ b/SqlDatabaseVectorSearch/TextChunkers/DefaultTextChunker.cs @@ -1,7 +1,7 @@ using Microsoft.Extensions.Options; -using Microsoft.SemanticKernel.Text; using SqlDatabaseVectorSearch.Services; using SqlDatabaseVectorSearch.Settings; +using SqlDatabaseVectorSearch.TextChunkers.Implementations; namespace SqlDatabaseVectorSearch.TextChunkers; @@ -11,8 +11,8 @@ public class DefaultTextChunker(TokenizerService tokenizerService, IOptions Split(string text) { - var lines = TextChunker.SplitPlainTextLines(text, appSettings.MaxTokensPerLine, tokenizerService.CountEmbeddingTokens); - var paragraphs = TextChunker.SplitPlainTextParagraphs(lines, appSettings.MaxTokensPerParagraph, appSettings.OverlapTokens, tokenCounter: tokenizerService.CountEmbeddingTokens); + var lines = PlainTextChunker.SplitPlainTextLines(text, appSettings.MaxTokensPerLine, tokenizerService.CountEmbeddingTokens); + var paragraphs = PlainTextChunker.SplitPlainTextParagraphs(lines, appSettings.MaxTokensPerParagraph, appSettings.OverlapTokens, tokenCounter: tokenizerService.CountEmbeddingTokens); return paragraphs; } diff --git a/SqlDatabaseVectorSearch/TextChunkers/Implementations/PlainTextChunker.cs b/SqlDatabaseVectorSearch/TextChunkers/Implementations/PlainTextChunker.cs new file mode 100644 index 0000000..be81fd9 --- /dev/null +++ b/SqlDatabaseVectorSearch/TextChunkers/Implementations/PlainTextChunker.cs @@ -0,0 +1,347 @@ +using System.Diagnostics; +using System.Text; + +namespace SqlDatabaseVectorSearch.TextChunkers.Implementations; + +/// +/// Split text in chunks, attempting to leave meaning intact. +/// For plain text, split looking at new lines first, then periods, and so on. +/// For markdown, split looking at punctuation first, and so on. +/// +internal static class PlainTextChunker +{ + /// + /// Represents a list of strings with token count. + /// Used to reduce the number of calls to the tokenizer. 
+ /// + private sealed class StringListWithTokenCount(TokenCounter? tokenCounter) + { + private readonly TokenCounter? tokenCounter = tokenCounter; + + public void Add(string value) => Values.Add((value, tokenCounter is null ? GetDefaultTokenCount(value.Length) : tokenCounter(value))); + + public void Add(string value, int tokenCount) => Values.Add((value, tokenCount)); + + public void AddRange(StringListWithTokenCount range) => Values.AddRange(range.Values); + + public void RemoveRange(int index, int count) => Values.RemoveRange(index, count); + + public int Count => Values.Count; + + public List ToStringList() => Values.Select(v => v.Value).ToList(); + + private List<(string Value, int TokenCount)> Values { get; } = []; + + public string ValueAt(int i) => Values[i].Value; + + public int TokenCountAt(int i) => Values[i].TokenCount; + } + + /// + /// Delegate for counting tokens in a string. + /// + /// The input string to count tokens in. + /// The number of tokens in the input string. + public delegate int TokenCounter(string input); + + private static readonly char[] spaceChar = [' ']; + private static readonly string?[] plainTextSplitOptions = ["\n", ".。.", "?!", ";", ":", ",,、", ")]}", " ", "-", null]; + private static readonly string?[] markdownSplitOptions = [".\u3002\uFF0E", "?!", ";", ":", ",\uFF0C\u3001", ")]}", " ", "-", "\n\r", null]; + + /// + /// Split plain text into lines. + /// + /// Text to split + /// Maximum number of tokens per line. + /// Function to count tokens in a string. If not supplied, the default counter will be used. + /// List of lines. + public static List SplitPlainTextLines(string text, int maxTokensPerLine, TokenCounter? tokenCounter = null) => + InternalSplitLines(text, maxTokensPerLine, trim: true, plainTextSplitOptions, tokenCounter); + + /// + /// Split markdown text into lines. + /// + /// Text to split + /// Maximum number of tokens per line. + /// Function to count tokens in a string. 
If not supplied, the default counter will be used. + /// List of lines. + public static List SplitMarkDownLines(string text, int maxTokensPerLine, TokenCounter? tokenCounter = null) => + InternalSplitLines(text, maxTokensPerLine, trim: true, markdownSplitOptions, tokenCounter); + + /// + /// Split plain text into paragraphs. + /// + /// Lines of text. + /// Maximum number of tokens per paragraph. + /// Number of tokens to overlap between paragraphs. + /// Text to be prepended to each individual chunk. + /// Function to count tokens in a string. If not supplied, the default counter will be used. + /// List of paragraphs. + public static List SplitPlainTextParagraphs(IEnumerable lines, int maxTokensPerParagraph, int overlapTokens = 0, string? chunkHeader = null, TokenCounter? tokenCounter = null) + => InternalSplitTextParagraphs(lines.Select(line => line.Replace("\r\n", "\n").Replace('\r', '\n')), maxTokensPerParagraph, overlapTokens, chunkHeader, + static (text, maxTokens, tokenCounter) => InternalSplitLines(text, maxTokens, trim: false, plainTextSplitOptions, tokenCounter), tokenCounter); + + /// + /// Split markdown text into paragraphs. + /// + /// Lines of text. + /// Maximum number of tokens per paragraph. + /// Number of tokens to overlap between paragraphs. + /// Text to be prepended to each individual chunk. + /// Function to count tokens in a string. If not supplied, the default counter will be used. + /// List of paragraphs. + public static List SplitMarkdownParagraphs(IEnumerable lines, int maxTokensPerParagraph, int overlapTokens = 0, string? chunkHeader = null, TokenCounter? tokenCounter = null) + => InternalSplitTextParagraphs(lines, maxTokensPerParagraph, overlapTokens, chunkHeader, + static (text, maxTokens, tokenCounter) => InternalSplitLines(text, maxTokens, trim: false, markdownSplitOptions, tokenCounter), tokenCounter); + + private static List InternalSplitTextParagraphs(IEnumerable lines, int maxTokensPerParagraph, int overlapTokens, string? 
chunkHeader, Func> longLinesSplitter, TokenCounter? tokenCounter) + { + if (maxTokensPerParagraph <= 0) + { + throw new ArgumentException("maxTokensPerParagraph should be a positive number", nameof(maxTokensPerParagraph)); + } + + if (maxTokensPerParagraph <= overlapTokens) + { + throw new ArgumentException("overlapTokens cannot be larger than maxTokensPerParagraph", nameof(maxTokensPerParagraph)); + } + + // Optimize empty inputs if we can efficiently determine they're empty + if (lines is ICollection c && c.Count == 0) + { + return []; + } + + var chunkHeaderTokens = chunkHeader is { Length: > 0 } ? GetTokenCount(chunkHeader, tokenCounter) : 0; + var adjustedMaxTokensPerParagraph = maxTokensPerParagraph - overlapTokens - chunkHeaderTokens; + + // Split long lines first + var truncatedLines = lines.SelectMany(line => longLinesSplitter(line, adjustedMaxTokensPerParagraph, tokenCounter)); + + var paragraphs = BuildParagraph(truncatedLines, adjustedMaxTokensPerParagraph, tokenCounter); + var processedParagraphs = ProcessParagraphs(paragraphs, adjustedMaxTokensPerParagraph, overlapTokens, chunkHeader, longLinesSplitter, tokenCounter); + + return processedParagraphs; + } + + private static List BuildParagraph(IEnumerable truncatedLines, int maxTokensPerParagraph, TokenCounter? tokenCounter) + { + StringBuilder paragraphBuilder = new(); + List paragraphs = []; + + foreach (var line in truncatedLines) + { + if (paragraphBuilder.Length > 0) + { + string? paragraph = null; + + var currentCount = GetTokenCount(line, tokenCounter) + 1; + if (currentCount < maxTokensPerParagraph) + { + currentCount += tokenCounter is null ? 
+ GetDefaultTokenCount(paragraphBuilder.Length) : + tokenCounter(paragraph = paragraphBuilder.ToString()); + } + + if (currentCount >= maxTokensPerParagraph) + { + // Complete the paragraph and prepare for the next + paragraph ??= paragraphBuilder.ToString(); + paragraphs.Add(paragraph.Trim()); + paragraphBuilder.Clear(); + } + } + + paragraphBuilder.AppendLine(line); + } + + if (paragraphBuilder.Length > 0) + { + // Add the final paragraph if there's anything remaining + paragraphs.Add(paragraphBuilder.ToString().Trim()); + } + + return paragraphs; + } + + private static List ProcessParagraphs(List paragraphs, int adjustedMaxTokensPerParagraph, int overlapTokens, string? chunkHeader, Func> longLinesSplitter, TokenCounter? tokenCounter) + { + // distribute text more evenly in the last paragraphs when the last paragraph is too short. + if (paragraphs.Count > 1) + { + var lastParagraph = paragraphs[^1]; + var secondLastParagraph = paragraphs[^2]; + + if (GetTokenCount(lastParagraph, tokenCounter) < adjustedMaxTokensPerParagraph / 4) + { + var lastParagraphTokens = lastParagraph.Split(spaceChar, StringSplitOptions.RemoveEmptyEntries); + var secondLastParagraphTokens = secondLastParagraph.Split(spaceChar, StringSplitOptions.RemoveEmptyEntries); + + var lastParagraphTokensCount = lastParagraphTokens.Length; + var secondLastParagraphTokensCount = secondLastParagraphTokens.Length; + + if (lastParagraphTokensCount + secondLastParagraphTokensCount <= adjustedMaxTokensPerParagraph) + { + var newSecondLastParagraph = string.Join(" ", secondLastParagraphTokens); + var newLastParagraph = string.Join(" ", lastParagraphTokens); + + paragraphs[^2] = $"{newSecondLastParagraph} {newLastParagraph}"; + paragraphs.RemoveAt(paragraphs.Count - 1); + } + } + } + + var processedParagraphs = new List(); + var paragraphStringBuilder = new StringBuilder(); + + for (var i = 0; i < paragraphs.Count; i++) + { + paragraphStringBuilder.Clear(); + + if (chunkHeader is not null) + { + 
paragraphStringBuilder.Append(chunkHeader); + } + + var paragraph = paragraphs[i]; + + if (overlapTokens > 0 && i < paragraphs.Count - 1) + { + var nextParagraph = paragraphs[i + 1]; + var split = longLinesSplitter(nextParagraph, overlapTokens, tokenCounter); + + paragraphStringBuilder.Append(paragraph); + + if (split.Count != 0) + { + paragraphStringBuilder.Append(' ').Append(split[0]); + } + } + else + { + paragraphStringBuilder.Append(paragraph); + } + + processedParagraphs.Add(paragraphStringBuilder.ToString()); + } + + return processedParagraphs; + } + + private static List InternalSplitLines(string text, int maxTokensPerLine, bool trim, string?[] splitOptions, TokenCounter? tokenCounter) + { + var result = new StringListWithTokenCount(tokenCounter); + + text = text.Replace("\r\n", "\n"); // normalize line endings + result.Add(text); + for (var i = 0; i < splitOptions.Length; i++) + { + var count = result.Count; // track where the original input left off + var (splits2, inputWasSplit2) = Split(result, maxTokensPerLine, splitOptions[i].AsSpan(), trim, tokenCounter); + result.AddRange(splits2); + result.RemoveRange(0, count); // remove the original input + if (!inputWasSplit2) + { + break; + } + } + + return result.ToStringList(); + } + + private static (StringListWithTokenCount, bool) Split(StringListWithTokenCount input, int maxTokens, ReadOnlySpan separators, bool trim, TokenCounter? tokenCounter) + { + var inputWasSplit = false; + StringListWithTokenCount result = new(tokenCounter); + var count = input.Count; + for (var i = 0; i < count; i++) + { + var (splits, split) = Split(input.ValueAt(i).AsSpan(), input.ValueAt(i), maxTokens, separators, trim, tokenCounter, input.TokenCountAt(i)); + result.AddRange(splits); + inputWasSplit |= split; + } + + return (result, inputWasSplit); + } + + private static (StringListWithTokenCount, bool) Split(ReadOnlySpan input, string? inputString, int maxTokens, ReadOnlySpan separators, bool trim, TokenCounter? 
tokenCounter, int inputTokenCount) + { + Debug.Assert(inputString is null || input.SequenceEqual(inputString.AsSpan())); + StringListWithTokenCount result = new(tokenCounter); + var inputWasSplit = false; + + if (inputTokenCount > maxTokens) + { + inputWasSplit = true; + + var half = input.Length / 2; + var cutPoint = -1; + + if (separators.IsEmpty) + { + cutPoint = half; + } + else if (input.Length > 2) + { + var pos = 0; + while (true) + { + var index = input[pos..^1].IndexOfAny(separators); + if (index < 0) + { + break; + } + + index += pos; + + if (Math.Abs(half - index) < Math.Abs(half - cutPoint)) + { + cutPoint = index + 1; + } + + pos = index + 1; + } + } + + if (cutPoint > 0) + { + var firstHalf = input[..cutPoint]; + var secondHalf = input[cutPoint..]; + if (trim) + { + firstHalf = firstHalf.Trim(); + secondHalf = secondHalf.Trim(); + } + + // Recursion + var (splits1, split1) = Split(firstHalf, null, maxTokens, separators, trim, tokenCounter, GetTokenCount(firstHalf.ToString(), tokenCounter)); + result.AddRange(splits1); + var (splits2, split2) = Split(secondHalf, null, maxTokens, separators, trim, tokenCounter, GetTokenCount(secondHalf.ToString(), tokenCounter)); + result.AddRange(splits2); + + inputWasSplit = split1 || split2; + return (result, inputWasSplit); + } + } + + var resultString = inputString ?? input.ToString(); + var resultTokenCount = inputTokenCount; + if (trim && !resultString.Trim().Equals(resultString, StringComparison.Ordinal)) + { + resultString = resultString.Trim(); + resultTokenCount = GetTokenCount(resultString, tokenCounter); + } + + result.Add(resultString, resultTokenCount); + + return (result, inputWasSplit); + } + + private static int GetTokenCount(string input, TokenCounter? tokenCounter) => tokenCounter is null ? 
GetDefaultTokenCount(input.Length) : tokenCounter(input); + + private static int GetDefaultTokenCount(int length) + { + Debug.Assert(length >= 0); + return length >> 2; + } +} \ No newline at end of file diff --git a/SqlDatabaseVectorSearch/TextChunkers/MarkdownTextChunker.cs b/SqlDatabaseVectorSearch/TextChunkers/MarkdownTextChunker.cs index cba6679..37a06e7 100644 --- a/SqlDatabaseVectorSearch/TextChunkers/MarkdownTextChunker.cs +++ b/SqlDatabaseVectorSearch/TextChunkers/MarkdownTextChunker.cs @@ -1,7 +1,7 @@ using Microsoft.Extensions.Options; -using Microsoft.SemanticKernel.Text; using SqlDatabaseVectorSearch.Services; using SqlDatabaseVectorSearch.Settings; +using SqlDatabaseVectorSearch.TextChunkers.Implementations; namespace SqlDatabaseVectorSearch.TextChunkers; @@ -11,8 +11,8 @@ public class MarkdownTextChunker(TokenizerService tokenizerService, IOptions Split(string text) { - var lines = TextChunker.SplitMarkDownLines(text, appSettings.MaxTokensPerLine, tokenizerService.CountEmbeddingTokens); - var paragraphs = TextChunker.SplitMarkdownParagraphs(lines, appSettings.MaxTokensPerParagraph, appSettings.OverlapTokens, tokenCounter: tokenizerService.CountEmbeddingTokens); + var lines = PlainTextChunker.SplitMarkDownLines(text, appSettings.MaxTokensPerLine, tokenizerService.CountEmbeddingTokens); + var paragraphs = PlainTextChunker.SplitMarkdownParagraphs(lines, appSettings.MaxTokensPerParagraph, appSettings.OverlapTokens, tokenCounter: tokenizerService.CountEmbeddingTokens); return paragraphs; }