Add Markdown support and refactor text chunking

Updated README.md to include Markdown file support.
Introduced new endpoint for uploading Markdown documents with MIME type handling.
Removed TextChunkerService and created DefaultTextChunker and MarkdownTextChunker classes implementing ITextChunker.
Updated VectorSearchService to utilize the new chunking interface.
Added MimeMapping package reference in the project file.
This commit is contained in:
Marco Minerva
2025-02-14 12:06:52 +01:00
parent e228d0bdbc
commit 5a507e972c
7 changed files with 56 additions and 20 deletions
@@ -0,0 +1,19 @@
using Microsoft.Extensions.Options;
using Microsoft.SemanticKernel.Text;
using SqlDatabaseVectorSearch.Services;
using SqlDatabaseVectorSearch.Settings;
namespace SqlDatabaseVectorSearch.TextChunkers;
/// <summary>
/// <see cref="ITextChunker"/> implementation that chunks text using the plain-text
/// splitting methods of <see cref="TextChunker"/>.
/// </summary>
/// <param name="tokenizerService">Supplies the token counter handed to the splitting methods.</param>
/// <param name="appSettingsOptions">Options wrapper whose value provides the token limits and overlap.</param>
public class DefaultTextChunker(TokenizerService tokenizerService, IOptions<AppSettings> appSettingsOptions) : ITextChunker
{
    // Snapshot of the settings taken once, when the instance is created.
    private readonly AppSettings settings = appSettingsOptions.Value;

    /// <summary>
    /// Splits <paramref name="text"/> into lines with <see cref="TextChunker.SplitPlainTextLines"/>
    /// and then feeds those lines to <see cref="TextChunker.SplitPlainTextParagraphs"/>.
    /// </summary>
    /// <param name="text">The text to split.</param>
    /// <returns>The paragraphs produced by the second splitting step.</returns>
    public IList<string> Split(string text)
        => TextChunker.SplitPlainTextParagraphs(
            // The line split is the first argument, so it is evaluated before the paragraph settings are read.
            TextChunker.SplitPlainTextLines(text, settings.MaxTokensPerLine, tokenizerService.CountChatCompletionTokens),
            settings.MaxTokensPerParagraph,
            settings.OverlapTokens,
            tokenCounter: tokenizerService.CountChatCompletionTokens);
}
@@ -0,0 +1,6 @@
namespace SqlDatabaseVectorSearch.TextChunkers;
/// <summary>
/// Contract for components that split a text into a list of string chunks.
/// </summary>
public interface ITextChunker
{
/// <summary>
/// Splits the given text into chunks.
/// </summary>
/// <param name="text">The text to split.</param>
/// <returns>The list of chunks obtained from <paramref name="text"/>.</returns>
IList<string> Split(string text);
}
@@ -0,0 +1,19 @@
using Microsoft.Extensions.Options;
using Microsoft.SemanticKernel.Text;
using SqlDatabaseVectorSearch.Services;
using SqlDatabaseVectorSearch.Settings;
namespace SqlDatabaseVectorSearch.TextChunkers;
/// <summary>
/// <see cref="ITextChunker"/> implementation that chunks text using the Markdown
/// splitting methods of <see cref="TextChunker"/>.
/// </summary>
/// <param name="tokenizerService">Supplies the token counter handed to the splitting methods.</param>
/// <param name="appSettingsOptions">Options wrapper whose value provides the token limits and overlap.</param>
public class MarkdownTextChunker(TokenizerService tokenizerService, IOptions<AppSettings> appSettingsOptions) : ITextChunker
{
    // Snapshot of the settings taken once, when the instance is created.
    private readonly AppSettings chunkingSettings = appSettingsOptions.Value;

    /// <summary>
    /// Splits <paramref name="text"/> into lines with <see cref="TextChunker.SplitMarkDownLines"/>
    /// and then feeds those lines to <see cref="TextChunker.SplitMarkdownParagraphs"/>.
    /// </summary>
    /// <param name="text">The Markdown text to split.</param>
    /// <returns>The paragraphs produced by the second splitting step.</returns>
    public IList<string> Split(string text)
    {
        // Step 1: break the input into lines bounded by MaxTokensPerLine.
        var markdownLines = TextChunker.SplitMarkDownLines(
            text,
            chunkingSettings.MaxTokensPerLine,
            tokenizerService.CountChatCompletionTokens);

        // Step 2: combine the lines into paragraphs using the paragraph limit and overlap settings.
        return TextChunker.SplitMarkdownParagraphs(
            markdownLines,
            chunkingSettings.MaxTokensPerParagraph,
            chunkingSettings.OverlapTokens,
            tokenCounter: tokenizerService.CountChatCompletionTokens);
    }
}