mirror of
https://github.com/marcominerva/SqlDatabaseVectorSearch.git
synced 2026-06-20 12:23:10 +00:00
Refactor: replace SemanticKernel with Agents.AI.OpenAI
Removed Microsoft.SemanticKernel dependencies in favor of Microsoft.Agents.AI.OpenAI for embedding and chat services. Updated DI registrations in Program.cs to use OpenAIClient. Reimplemented text chunking with a new PlainTextChunker class, updating DefaultTextChunker and MarkdownTextChunker accordingly. Updated .csproj to add new package references and suppress related analyzer warnings.
This commit is contained in:
@@ -1,8 +1,11 @@
|
||||
using System.ClientModel;
|
||||
using System.Net.Mime;
|
||||
using System.Text.Json.Serialization;
|
||||
using FluentValidation;
|
||||
using Microsoft.EntityFrameworkCore;
|
||||
using Microsoft.SemanticKernel;
|
||||
using Microsoft.Extensions.AI;
|
||||
using OpenAI;
|
||||
using OpenAI.Responses;
|
||||
using SqlDatabaseVectorSearch.Components;
|
||||
using SqlDatabaseVectorSearch.ContentDecoders;
|
||||
using SqlDatabaseVectorSearch.Data;
|
||||
@@ -54,11 +57,25 @@ builder.Services.ConfigureHttpClientDefaults(configure =>
|
||||
});
|
||||
});
|
||||
|
||||
// Semantic Kernel is used to generate embeddings and to reformulate questions taking into account all the previous interactions,
|
||||
// so that embeddings themselves can be generated more accurately.
|
||||
builder.Services.AddKernel()
|
||||
.AddAzureOpenAIEmbeddingGenerator(aiSettings.Embedding.Deployment, aiSettings.Embedding.Endpoint, aiSettings.Embedding.ApiKey, modelId: aiSettings.Embedding.ModelId, dimensions: aiSettings.Embedding.Dimensions)
|
||||
.AddAzureOpenAIChatCompletion(aiSettings.ChatCompletion.Deployment, aiSettings.ChatCompletion.Endpoint, aiSettings.ChatCompletion.ApiKey, modelId: aiSettings.ChatCompletion.ModelId);
|
||||
builder.Services.AddSingleton(_ =>
|
||||
{
|
||||
var embeddingClient = new OpenAIClient(new ApiKeyCredential(aiSettings.Embedding.ApiKey), new()
|
||||
{
|
||||
Endpoint = new(aiSettings.Embedding.Endpoint),
|
||||
}).GetEmbeddingClient(aiSettings.Embedding.Deployment).AsIEmbeddingGenerator(aiSettings.Embedding.Dimensions);
|
||||
|
||||
return embeddingClient;
|
||||
});
|
||||
|
||||
builder.Services.AddChatClient(_ =>
|
||||
{
|
||||
var chatClient = new OpenAIClient(new ApiKeyCredential(aiSettings.ChatCompletion.ApiKey), new()
|
||||
{
|
||||
Endpoint = new(aiSettings.ChatCompletion.Endpoint),
|
||||
}).GetResponsesClient().AsIChatClientWithStoredOutputDisabled(aiSettings.ChatCompletion.Deployment);
|
||||
|
||||
return chatClient;
|
||||
});
|
||||
|
||||
builder.Services.AddKeyedSingleton<IContentDecoder, PdfContentDecoder>(MediaTypeNames.Application.Pdf);
|
||||
builder.Services.AddKeyedSingleton<IContentDecoder, DocxContentDecoder>("application/vnd.openxmlformats-officedocument.wordprocessingml.document");
|
||||
|
||||
@@ -4,7 +4,7 @@
|
||||
<TargetFramework>net10.0</TargetFramework>
|
||||
<ImplicitUsings>enable</ImplicitUsings>
|
||||
<Nullable>enable</Nullable>
|
||||
<NoWarn>$(NoWarn);SKEXP0010;SKEXP0050</NoWarn>
|
||||
<NoWarn>$(NoWarn);SKEXP0010;SKEXP0050;OPENAI001;MAAI001</NoWarn>
|
||||
</PropertyGroup>
|
||||
|
||||
<ItemGroup>
|
||||
@@ -12,7 +12,13 @@
|
||||
<PackageReference Include="DocumentFormat.OpenXml" Version="3.5.1" />
|
||||
<PackageReference Include="EntityFrameworkCore.Exceptions.SqlServer" Version="10.0.1" />
|
||||
<PackageReference Include="FluentValidation.DependencyInjectionExtensions" Version="12.1.1" />
|
||||
<PackageReference Include="Microsoft.Agents.AI.Hosting" Version="1.10.0-preview.260610.1" />
|
||||
<PackageReference Include="Microsoft.Agents.AI.OpenAI" Version="1.10.0" />
|
||||
<PackageReference Include="Microsoft.AspNetCore.OpenApi" Version="10.0.9" />
|
||||
<PackageReference Include="Microsoft.EntityFrameworkCore.Design" Version="10.0.9">
|
||||
<PrivateAssets>all</PrivateAssets>
|
||||
<IncludeAssets>runtime; build; native; contentfiles; analyzers; buildtransitive</IncludeAssets>
|
||||
</PackageReference>
|
||||
<PackageReference Include="Microsoft.EntityFrameworkCore.SqlServer" Version="10.0.9" />
|
||||
<PackageReference Include="Microsoft.EntityFrameworkCore.Tools" Version="10.0.9">
|
||||
<PrivateAssets>all</PrivateAssets>
|
||||
|
||||
@@ -1,7 +1,7 @@
|
||||
using Microsoft.Extensions.Options;
|
||||
using Microsoft.SemanticKernel.Text;
|
||||
using SqlDatabaseVectorSearch.Services;
|
||||
using SqlDatabaseVectorSearch.Settings;
|
||||
using SqlDatabaseVectorSearch.TextChunkers.Implementations;
|
||||
|
||||
namespace SqlDatabaseVectorSearch.TextChunkers;
|
||||
|
||||
@@ -11,8 +11,8 @@ public class DefaultTextChunker(TokenizerService tokenizerService, IOptions<AppS
|
||||
|
||||
public IList<string> Split(string text)
|
||||
{
|
||||
var lines = TextChunker.SplitPlainTextLines(text, appSettings.MaxTokensPerLine, tokenizerService.CountEmbeddingTokens);
|
||||
var paragraphs = TextChunker.SplitPlainTextParagraphs(lines, appSettings.MaxTokensPerParagraph, appSettings.OverlapTokens, tokenCounter: tokenizerService.CountEmbeddingTokens);
|
||||
var lines = PlainTextChunker.SplitPlainTextLines(text, appSettings.MaxTokensPerLine, tokenizerService.CountEmbeddingTokens);
|
||||
var paragraphs = PlainTextChunker.SplitPlainTextParagraphs(lines, appSettings.MaxTokensPerParagraph, appSettings.OverlapTokens, tokenCounter: tokenizerService.CountEmbeddingTokens);
|
||||
|
||||
return paragraphs;
|
||||
}
|
||||
|
||||
@@ -0,0 +1,347 @@
|
||||
using System.Diagnostics;
|
||||
using System.Text;
|
||||
|
||||
namespace SqlDatabaseVectorSearch.TextChunkers.Implementations;
|
||||
|
||||
/// <summary>
|
||||
/// Splits text into chunks, attempting to leave meaning intact.
|
||||
/// For plain text, split looking at new lines first, then periods, and so on.
|
||||
/// For markdown, split looking at punctuation first, and so on.
|
||||
/// </summary>
|
||||
internal static class PlainTextChunker
|
||||
{
|
||||
/// <summary>
/// A list of text fragments, each stored together with its token count.
/// Keeping the count next to the fragment avoids asking the tokenizer for it again.
/// </summary>
private sealed class StringListWithTokenCount(TokenCounter? tokenCounter)
{
    // Counter used by Add(string); when null the length-based default estimate is used instead.
    private readonly TokenCounter? _counter = tokenCounter;

    // Backing storage: fragment text paired with its token count.
    private List<(string Value, int TokenCount)> Values { get; } = [];

    /// <summary>Number of fragments currently stored.</summary>
    public int Count => Values.Count;

    /// <summary>Appends a fragment, computing its token count on the spot.</summary>
    public void Add(string value)
    {
        var tokenCount = _counter is null ? GetDefaultTokenCount(value.Length) : _counter(value);
        Values.Add((value, tokenCount));
    }

    /// <summary>Appends a fragment whose token count is already known.</summary>
    public void Add(string value, int tokenCount)
    {
        Values.Add((value, tokenCount));
    }

    /// <summary>Appends every fragment of another list, reusing its token counts.</summary>
    public void AddRange(StringListWithTokenCount range)
    {
        Values.AddRange(range.Values);
    }

    /// <summary>Removes <paramref name="count"/> fragments starting at <paramref name="index"/>.</summary>
    public void RemoveRange(int index, int count)
    {
        Values.RemoveRange(index, count);
    }

    /// <summary>Returns the text of the fragment at position <paramref name="i"/>.</summary>
    public string ValueAt(int i)
    {
        return Values[i].Value;
    }

    /// <summary>Returns the stored token count of the fragment at position <paramref name="i"/>.</summary>
    public int TokenCountAt(int i)
    {
        return Values[i].TokenCount;
    }

    /// <summary>Copies the fragment texts, in order, into a new list without the token counts.</summary>
    public List<string> ToStringList()
    {
        var strings = new List<string>(Values.Count);
        foreach (var (value, _) in Values)
        {
            strings.Add(value);
        }

        return strings;
    }
}
|
||||
|
||||
/// <summary>
/// Delegate for counting tokens in a string.
/// </summary>
/// <param name="input">The input string to count tokens in.</param>
/// <returns>The number of tokens in the input string.</returns>
public delegate int TokenCounter(string input);

// Single-space separator used when a paragraph is broken into words.
private static readonly char[] spaceChar = [' '];

// Separator sets, tried in order while fragments are still over the token limit.
// Each string is a set of candidate cut characters; the trailing null entry means
// "no separators", in which case the text is simply cut in the middle.
// Full-width punctuation is written as escapes (U+3002, U+FF0E, U+FF0C, U+3001) so the
// two tables stay in agreement and the literals cannot be damaged by encoding changes.
private static readonly string?[] plainTextSplitOptions = ["\n", ".\u3002\uFF0E", "?!", ";", ":", ",\uFF0C\u3001", ")]}", " ", "-", null];
private static readonly string?[] markdownSplitOptions = [".\u3002\uFF0E", "?!", ";", ":", ",\uFF0C\u3001", ")]}", " ", "-", "\n\r", null];
|
||||
|
||||
/// <summary>
/// Breaks plain text into lines that each fit within a token limit.
/// </summary>
/// <param name="text">The text to break up.</param>
/// <param name="maxTokensPerLine">Upper bound on the number of tokens in a line.</param>
/// <param name="tokenCounter">Optional token counting function; the built-in estimate is used when omitted.</param>
/// <returns>The resulting lines, trimmed.</returns>
public static List<string> SplitPlainTextLines(string text, int maxTokensPerLine, TokenCounter? tokenCounter = null)
{
    return InternalSplitLines(text, maxTokensPerLine, trim: true, plainTextSplitOptions, tokenCounter);
}
|
||||
|
||||
/// <summary>
/// Breaks markdown text into lines that each fit within a token limit.
/// </summary>
/// <param name="text">The markdown text to break up.</param>
/// <param name="maxTokensPerLine">Upper bound on the number of tokens in a line.</param>
/// <param name="tokenCounter">Optional token counting function; the built-in estimate is used when omitted.</param>
/// <returns>The resulting lines, trimmed.</returns>
public static List<string> SplitMarkDownLines(string text, int maxTokensPerLine, TokenCounter? tokenCounter = null)
{
    return InternalSplitLines(text, maxTokensPerLine, trim: true, markdownSplitOptions, tokenCounter);
}
|
||||
|
||||
/// <summary>
/// Groups plain-text lines into paragraphs that respect a token limit.
/// </summary>
/// <param name="lines">The lines to group; CR LF and lone CR line endings are converted to LF first.</param>
/// <param name="maxTokensPerParagraph">Upper bound on the number of tokens in a paragraph.</param>
/// <param name="overlapTokens">Number of tokens a paragraph borrows from the one that follows it.</param>
/// <param name="chunkHeader">Optional text placed at the start of every paragraph.</param>
/// <param name="tokenCounter">Optional token counting function; the built-in estimate is used when omitted.</param>
/// <returns>The resulting paragraphs.</returns>
public static List<string> SplitPlainTextParagraphs(IEnumerable<string> lines, int maxTokensPerParagraph, int overlapTokens = 0, string? chunkHeader = null, TokenCounter? tokenCounter = null)
{
    var normalizedLines = lines.Select(static line => line.Replace("\r\n", "\n").Replace('\r', '\n'));

    return InternalSplitTextParagraphs(normalizedLines, maxTokensPerParagraph, overlapTokens, chunkHeader, SplitLongLine, tokenCounter);

    // Splitter for lines that are too long on their own; no trimming at this stage.
    static List<string> SplitLongLine(string text, int maxTokens, TokenCounter? counter)
    {
        return InternalSplitLines(text, maxTokens, trim: false, plainTextSplitOptions, counter);
    }
}
|
||||
|
||||
/// <summary>
/// Groups markdown lines into paragraphs that respect a token limit.
/// </summary>
/// <param name="lines">The lines to group.</param>
/// <param name="maxTokensPerParagraph">Upper bound on the number of tokens in a paragraph.</param>
/// <param name="overlapTokens">Number of tokens a paragraph borrows from the one that follows it.</param>
/// <param name="chunkHeader">Optional text placed at the start of every paragraph.</param>
/// <param name="tokenCounter">Optional token counting function; the built-in estimate is used when omitted.</param>
/// <returns>The resulting paragraphs.</returns>
public static List<string> SplitMarkdownParagraphs(IEnumerable<string> lines, int maxTokensPerParagraph, int overlapTokens = 0, string? chunkHeader = null, TokenCounter? tokenCounter = null)
{
    return InternalSplitTextParagraphs(lines, maxTokensPerParagraph, overlapTokens, chunkHeader, SplitLongLine, tokenCounter);

    // Splitter for lines that are too long on their own; no trimming at this stage.
    static List<string> SplitLongLine(string text, int maxTokens, TokenCounter? counter)
    {
        return InternalSplitLines(text, maxTokens, trim: false, markdownSplitOptions, counter);
    }
}
|
||||
|
||||
/// <summary>
/// Shared paragraph-building pipeline: validates the limits, splits over-long lines,
/// groups the lines into paragraphs and finally applies header and overlap.
/// </summary>
/// <param name="lines">The lines to group into paragraphs.</param>
/// <param name="maxTokensPerParagraph">Upper bound on the number of tokens in a paragraph; must be positive.</param>
/// <param name="overlapTokens">Number of tokens borrowed from the following paragraph; must be smaller than <paramref name="maxTokensPerParagraph"/>.</param>
/// <param name="chunkHeader">Optional text placed at the start of every paragraph.</param>
/// <param name="longLinesSplitter">Function that splits a single piece of text to fit a token limit.</param>
/// <param name="tokenCounter">Optional token counting function; the built-in estimate is used when null.</param>
/// <returns>The resulting paragraphs.</returns>
/// <exception cref="ArgumentException">
/// <paramref name="maxTokensPerParagraph"/> is not positive, or <paramref name="overlapTokens"/> is not smaller than it.
/// </exception>
private static List<string> InternalSplitTextParagraphs(IEnumerable<string> lines, int maxTokensPerParagraph, int overlapTokens, string? chunkHeader, Func<string, int, TokenCounter?, List<string>> longLinesSplitter, TokenCounter? tokenCounter)
{
    if (maxTokensPerParagraph <= 0)
    {
        throw new ArgumentException("maxTokensPerParagraph should be a positive number", nameof(maxTokensPerParagraph));
    }

    // An overlap equal to the limit is rejected too: it would leave no budget for the paragraph's own text.
    if (overlapTokens >= maxTokensPerParagraph)
    {
        throw new ArgumentException("overlapTokens must be smaller than maxTokensPerParagraph", nameof(overlapTokens));
    }

    // Optimize empty inputs if we can efficiently determine they're empty
    if (lines is ICollection<string> { Count: 0 })
    {
        return [];
    }

    var chunkHeaderTokens = chunkHeader is { Length: > 0 } ? GetTokenCount(chunkHeader, tokenCounter) : 0;

    // Budget left for the paragraph's own text once the overlap and the header are reserved.
    // NOTE(review): a header that uses up the whole budget makes this zero or negative; there is no guard for that here.
    var adjustedMaxTokensPerParagraph = maxTokensPerParagraph - overlapTokens - chunkHeaderTokens;

    // Split long lines first
    var truncatedLines = lines.SelectMany(line => longLinesSplitter(line, adjustedMaxTokensPerParagraph, tokenCounter));

    var paragraphs = BuildParagraph(truncatedLines, adjustedMaxTokensPerParagraph, tokenCounter);

    return ProcessParagraphs(paragraphs, adjustedMaxTokensPerParagraph, overlapTokens, chunkHeader, longLinesSplitter, tokenCounter);
}
|
||||
|
||||
/// <summary>
/// Accumulates lines into paragraphs, closing the current paragraph whenever adding
/// the next line would reach the token limit.
/// </summary>
/// <param name="truncatedLines">Lines to accumulate.</param>
/// <param name="maxTokensPerParagraph">Token limit for a paragraph.</param>
/// <param name="tokenCounter">Optional token counting function; the built-in estimate is used when null.</param>
/// <returns>The trimmed paragraphs, in input order.</returns>
private static List<string> BuildParagraph(IEnumerable<string> truncatedLines, int maxTokensPerParagraph, TokenCounter? tokenCounter)
{
    var completed = new List<string>();
    var current = new StringBuilder();

    foreach (var line in truncatedLines)
    {
        if (current.Length != 0)
        {
            // Text of the paragraph so far; only materialised when it is actually needed.
            string? currentText = null;

            // Tokens of the incoming line, plus one.
            var tokens = GetTokenCount(line, tokenCounter) + 1;

            if (tokens < maxTokensPerParagraph)
            {
                // The line alone fits, so add the size of what has been collected so far.
                if (tokenCounter is null)
                {
                    tokens += GetDefaultTokenCount(current.Length);
                }
                else
                {
                    currentText = current.ToString();
                    tokens += tokenCounter(currentText);
                }
            }

            if (tokens >= maxTokensPerParagraph)
            {
                // Close the paragraph; the incoming line starts the next one.
                completed.Add((currentText ?? current.ToString()).Trim());
                current.Clear();
            }
        }

        current.AppendLine(line);
    }

    // Whatever is left over becomes the final paragraph.
    if (current.Length != 0)
    {
        completed.Add(current.ToString().Trim());
    }

    return completed;
}
|
||||
|
||||
/// <summary>
/// Post-processes the built paragraphs: folds a very short last paragraph into its predecessor,
/// then prefixes each paragraph with the chunk header and appends the overlap taken from the next paragraph.
/// </summary>
/// <param name="paragraphs">Paragraphs to process; the list may be modified when the last two are merged.</param>
/// <param name="adjustedMaxTokensPerParagraph">Token budget for a paragraph's own text.</param>
/// <param name="overlapTokens">Token budget for the overlap taken from the following paragraph; no overlap is added when not positive.</param>
/// <param name="chunkHeader">Optional text placed at the start of every paragraph.</param>
/// <param name="longLinesSplitter">Function used to cut the overlap out of the following paragraph.</param>
/// <param name="tokenCounter">Optional token counting function; the built-in estimate is used when null.</param>
/// <returns>The final paragraphs.</returns>
private static List<string> ProcessParagraphs(List<string> paragraphs, int adjustedMaxTokensPerParagraph, int overlapTokens, string? chunkHeader, Func<string, int, TokenCounter?, List<string>> longLinesSplitter, TokenCounter? tokenCounter)
{
    // Distribute text more evenly when the last paragraph is too short: fold it into the previous one.
    if (paragraphs.Count > 1)
    {
        var lastParagraph = paragraphs[^1];

        if (GetTokenCount(lastParagraph, tokenCounter) < adjustedMaxTokensPerParagraph / 4)
        {
            var lastWords = lastParagraph.Split(spaceChar, StringSplitOptions.RemoveEmptyEntries);
            var previousWords = paragraphs[^2].Split(spaceChar, StringSplitOptions.RemoveEmptyEntries);

            if (lastWords.Length + previousWords.Length <= adjustedMaxTokensPerParagraph)
            {
                var merged = $"{string.Join(" ", previousWords)} {string.Join(" ", lastWords)}";

                // The word count above is only a rough proxy for tokens. Check the merged text against
                // the real budget so that merging can never produce a paragraph that exceeds it.
                if (GetTokenCount(merged, tokenCounter) <= adjustedMaxTokensPerParagraph)
                {
                    paragraphs[^2] = merged;
                    paragraphs.RemoveAt(paragraphs.Count - 1);
                }
            }
        }
    }

    var processedParagraphs = new List<string>(paragraphs.Count);
    var builder = new StringBuilder();

    for (var i = 0; i < paragraphs.Count; i++)
    {
        builder.Clear();

        if (chunkHeader is not null)
        {
            builder.Append(chunkHeader);
        }

        builder.Append(paragraphs[i]);

        // Every paragraph except the last one borrows the leading part of its successor.
        if (overlapTokens > 0 && i < paragraphs.Count - 1)
        {
            var overlap = longLinesSplitter(paragraphs[i + 1], overlapTokens, tokenCounter);

            if (overlap.Count != 0)
            {
                builder.Append(' ').Append(overlap[0]);
            }
        }

        processedParagraphs.Add(builder.ToString());
    }

    return processedParagraphs;
}
|
||||
|
||||
/// <summary>
/// Splits text into fragments that fit a token limit by applying the separator sets one after
/// another, stopping as soon as a pass reports that nothing is left over the limit.
/// </summary>
/// <param name="text">The text to split; CR LF line endings are converted to LF first.</param>
/// <param name="maxTokensPerLine">Token limit for a fragment.</param>
/// <param name="trim">Whether fragments are trimmed.</param>
/// <param name="splitOptions">Separator sets, in the order they are tried.</param>
/// <param name="tokenCounter">Optional token counting function; the built-in estimate is used when null.</param>
/// <returns>The resulting fragments.</returns>
private static List<string> InternalSplitLines(string text, int maxTokensPerLine, bool trim, string?[] splitOptions, TokenCounter? tokenCounter)
{
    var fragments = new StringListWithTokenCount(tokenCounter);

    // normalize line endings
    fragments.Add(text.Replace("\r\n", "\n"));

    foreach (var separators in splitOptions)
    {
        // Number of entries that belong to the previous pass and must be dropped afterwards.
        var previousCount = fragments.Count;

        var (refined, stillOversized) = Split(fragments, maxTokensPerLine, separators.AsSpan(), trim, tokenCounter);

        fragments.AddRange(refined);
        fragments.RemoveRange(0, previousCount);

        if (!stillOversized)
        {
            break;
        }
    }

    return fragments.ToStringList();
}
|
||||
|
||||
/// <summary>
/// Applies the single-fragment split to every entry of <paramref name="input"/> and collects the results.
/// </summary>
/// <param name="input">Fragments to split, with their token counts.</param>
/// <param name="maxTokens">Token limit for a fragment.</param>
/// <param name="separators">Candidate cut characters; empty means cut in the middle.</param>
/// <param name="trim">Whether fragments are trimmed.</param>
/// <param name="tokenCounter">Optional token counting function; the built-in estimate is used when null.</param>
/// <returns>The collected fragments, and the flags of the individual splits combined with a logical OR.</returns>
private static (StringListWithTokenCount, bool) Split(StringListWithTokenCount input, int maxTokens, ReadOnlySpan<char> separators, bool trim, TokenCounter? tokenCounter)
{
    var collected = new StringListWithTokenCount(tokenCounter);
    var anyFlagged = false;

    for (int index = 0, total = input.Count; index < total; index++)
    {
        var value = input.ValueAt(index);
        var (pieces, flagged) = Split(value.AsSpan(), value, maxTokens, separators, trim, tokenCounter, input.TokenCountAt(index));

        collected.AddRange(pieces);
        anyFlagged = anyFlagged || flagged;
    }

    return (collected, anyFlagged);
}
|
||||
|
||||
/// <summary>
/// Recursively cuts one piece of text in two, at the separator nearest its middle, until every
/// fragment fits within <paramref name="maxTokens"/> or no cut point is available.
/// </summary>
/// <param name="input">The text to split.</param>
/// <param name="inputString">The same text as a string when the caller already has one; otherwise null.</param>
/// <param name="maxTokens">Token limit for a fragment.</param>
/// <param name="separators">Candidate cut characters; empty means cut exactly in the middle.</param>
/// <param name="trim">Whether fragments are trimmed.</param>
/// <param name="tokenCounter">Optional token counting function; the built-in estimate is used when null.</param>
/// <param name="inputTokenCount">Token count of <paramref name="input"/>, supplied by the caller.</param>
/// <returns>
/// The fragments with their token counts, and a flag that is true when at least one fragment
/// was over the limit and could not be cut any further with these separators.
/// </returns>
private static (StringListWithTokenCount, bool) Split(ReadOnlySpan<char> input, string? inputString, int maxTokens, ReadOnlySpan<char> separators, bool trim, TokenCounter? tokenCounter, int inputTokenCount)
{
    Debug.Assert(inputString is null || input.SequenceEqual(inputString.AsSpan()));

    StringListWithTokenCount result = new(tokenCounter);
    var exceedsLimit = inputTokenCount > maxTokens;

    if (exceedsLimit)
    {
        var half = input.Length / 2;
        var cutPoint = -1;

        if (separators.IsEmpty)
        {
            cutPoint = half;
        }
        else if (input.Length > 2)
        {
            // Scan all separators except one in the last position and keep the one nearest the middle.
            var pos = 0;
            while (true)
            {
                var index = input[pos..^1].IndexOfAny(separators);
                if (index < 0)
                {
                    break;
                }

                index += pos;

                if (Math.Abs(half - index) < Math.Abs(half - cutPoint))
                {
                    // Cut after the separator so it stays with the first half.
                    cutPoint = index + 1;
                }

                pos = index + 1;
            }
        }

        if (cutPoint > 0)
        {
            var firstHalf = input[..cutPoint];
            var secondHalf = input[cutPoint..];
            if (trim)
            {
                firstHalf = firstHalf.Trim();
                secondHalf = secondHalf.Trim();
            }

            // Each half is turned into a string once: it is needed for counting, and passing it
            // along lets the recursive call reuse it instead of allocating the same text again.
            var firstText = firstHalf.ToString();
            var (firstSplits, firstFlag) = Split(firstHalf, firstText, maxTokens, separators, trim, tokenCounter, GetTokenCount(firstText, tokenCounter));
            result.AddRange(firstSplits);

            var secondText = secondHalf.ToString();
            var (secondSplits, secondFlag) = Split(secondHalf, secondText, maxTokens, separators, trim, tokenCounter, GetTokenCount(secondText, tokenCounter));
            result.AddRange(secondSplits);

            return (result, firstFlag || secondFlag);
        }
    }

    // Either the text fits, or no cut point was found: emit it as a single fragment.
    var resultString = inputString ?? input.ToString();
    var resultTokenCount = inputTokenCount;

    if (trim)
    {
        var trimmed = resultString.Trim();
        if (!trimmed.Equals(resultString, StringComparison.Ordinal))
        {
            // Trimming changed the text, so the supplied count no longer applies.
            resultString = trimmed;
            resultTokenCount = GetTokenCount(trimmed, tokenCounter);
        }
    }

    result.Add(resultString, resultTokenCount);

    return (result, exceedsLimit);
}
|
||||
|
||||
/// <summary>
/// Counts the tokens of <paramref name="input"/> with the supplied counter, or with the
/// length-based default estimate when no counter is given.
/// </summary>
private static int GetTokenCount(string input, TokenCounter? tokenCounter)
{
    if (tokenCounter is not null)
    {
        return tokenCounter(input);
    }

    return GetDefaultTokenCount(input.Length);
}
|
||||
|
||||
/// <summary>
/// Default token estimate used when no counter is supplied: the text length shifted right
/// by two bits, i.e. one token per four characters, rounded down.
/// </summary>
/// <param name="length">Length of the text in characters; expected to be non-negative.</param>
/// <returns>The estimated number of tokens.</returns>
private static int GetDefaultTokenCount(int length)
{
    Debug.Assert(length >= 0);

    var estimate = length >> 2;
    return estimate;
}
|
||||
}
|
||||
@@ -1,7 +1,7 @@
|
||||
using Microsoft.Extensions.Options;
|
||||
using Microsoft.SemanticKernel.Text;
|
||||
using SqlDatabaseVectorSearch.Services;
|
||||
using SqlDatabaseVectorSearch.Settings;
|
||||
using SqlDatabaseVectorSearch.TextChunkers.Implementations;
|
||||
|
||||
namespace SqlDatabaseVectorSearch.TextChunkers;
|
||||
|
||||
@@ -11,8 +11,8 @@ public class MarkdownTextChunker(TokenizerService tokenizerService, IOptions<App
|
||||
|
||||
public IList<string> Split(string text)
|
||||
{
|
||||
var lines = TextChunker.SplitMarkDownLines(text, appSettings.MaxTokensPerLine, tokenizerService.CountEmbeddingTokens);
|
||||
var paragraphs = TextChunker.SplitMarkdownParagraphs(lines, appSettings.MaxTokensPerParagraph, appSettings.OverlapTokens, tokenCounter: tokenizerService.CountEmbeddingTokens);
|
||||
var lines = PlainTextChunker.SplitMarkDownLines(text, appSettings.MaxTokensPerLine, tokenizerService.CountEmbeddingTokens);
|
||||
var paragraphs = PlainTextChunker.SplitMarkdownParagraphs(lines, appSettings.MaxTokensPerParagraph, appSettings.OverlapTokens, tokenCounter: tokenizerService.CountEmbeddingTokens);
|
||||
|
||||
return paragraphs;
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user