mirror of
https://github.com/marcominerva/SqlDatabaseVectorSearch.git
synced 2026-06-20 12:23:10 +00:00
Add Markdown support and refactor text chunking
Updated README.md to include Markdown file support. Introduced new endpoint for uploading Markdown documents with MIME type handling. Removed TextChunkerService and created DefaultTextChunker and MarkdownTextChunker classes implementing ITextChunker. Updated VectorSearchService to utilize the new chunking interface. Added MimeMapping package reference in the project file.
This commit is contained in:
@@ -1,7 +1,7 @@
|
||||
# SQL Database Vector Search Sample
|
||||
A repository that showcases the native VECTOR type in Azure SQL Database to perform embeddings and RAG with Azure OpenAI.
|
||||
|
||||
The application is a Minimal API that exposes endpoints to load documents, generate embeddings and save them into the database as Vectors, and perform searches using Vector Search and RAG. Currently, PDF, DOCX and TXT files are supported. Vectors are saved and retrieved with Entity Framework Core using the [EFCore.SqlServer.VectorSearch](https://github.com/efcore/EfCore.SqlServer.VectorSearch) library. Embedding and Chat Completion are integrated with [Semantic Kernel](https://github.com/microsoft/semantic-kernel).
|
||||
The application is a Minimal API that exposes endpoints to load documents, generate embeddings and save them into the database as Vectors, and perform searches using Vector Search and RAG. Currently, PDF, DOCX, TXT and MD files are supported. Vectors are saved and retrieved with Entity Framework Core using the [EFCore.SqlServer.VectorSearch](https://github.com/efcore/EfCore.SqlServer.VectorSearch) library. Embedding and Chat Completion are integrated with [Semantic Kernel](https://github.com/microsoft/semantic-kernel).
|
||||
|
||||
> [!NOTE]
|
||||
> If you prefer to use straight SQL, check out the [sql branch](https://github.com/marcominerva/SqlDatabaseVectorSearch/tree/sql).
|
||||
|
||||
@@ -4,11 +4,13 @@ using System.Text.Json.Serialization;
|
||||
using Microsoft.AspNetCore.Http.HttpResults;
|
||||
using Microsoft.EntityFrameworkCore;
|
||||
using Microsoft.SemanticKernel;
|
||||
using MimeMapping;
|
||||
using SqlDatabaseVectorSearch.ContentDecoders;
|
||||
using SqlDatabaseVectorSearch.DataAccessLayer;
|
||||
using SqlDatabaseVectorSearch.Models;
|
||||
using SqlDatabaseVectorSearch.Services;
|
||||
using SqlDatabaseVectorSearch.Settings;
|
||||
using SqlDatabaseVectorSearch.TextChunkers;
|
||||
using TinyHelpers.AspNetCore.Extensions;
|
||||
using TinyHelpers.AspNetCore.OpenApi;
|
||||
|
||||
@@ -53,7 +55,6 @@ builder.Services.AddKernel()
|
||||
.AddAzureOpenAITextEmbeddingGeneration(aiSettings.Embedding.Deployment, aiSettings.Embedding.Endpoint, aiSettings.Embedding.ApiKey, dimensions: aiSettings.Embedding.Dimensions)
|
||||
.AddAzureOpenAIChatCompletion(aiSettings.ChatCompletion.Deployment, aiSettings.ChatCompletion.Endpoint, aiSettings.ChatCompletion.ApiKey);
|
||||
|
||||
builder.Services.AddSingleton<TextChunkerService>();
|
||||
builder.Services.AddSingleton<TokenizerService>();
|
||||
builder.Services.AddSingleton<ChatService>();
|
||||
|
||||
@@ -63,11 +64,15 @@ builder.Services.AddScoped<VectorSearchService>();
|
||||
// Content decoders are registered as keyed singletons: the key is the MIME type of the content
// that each decoder is registered for.
builder.Services.AddKeyedSingleton<IContentDecoder, PdfContentDecoder>(MediaTypeNames.Application.Pdf);
|
||||
builder.Services.AddKeyedSingleton<IContentDecoder, DocxContentDecoder>("application/vnd.openxmlformats-officedocument.wordprocessingml.document");
|
||||
builder.Services.AddKeyedSingleton<IContentDecoder, TextContentDecoder>(MediaTypeNames.Text.Plain);
|
||||
// Markdown content is registered with the same TextContentDecoder that is used for plain text.
builder.Services.AddKeyedSingleton<IContentDecoder, TextContentDecoder>(MediaTypeNames.Text.Markdown);
|
||||
|
||||
// Text chunkers are keyed services too. DefaultTextChunker is registered with KeyedService.AnyKey
// (the key that matches any key), while Markdown content gets its own MarkdownTextChunker.
builder.Services.AddKeyedSingleton<ITextChunker, DefaultTextChunker>(KeyedService.AnyKey);
|
||||
builder.Services.AddKeyedSingleton<ITextChunker, MarkdownTextChunker>(MediaTypeNames.Text.Markdown);
|
||||
|
||||
builder.Services.AddOpenApi(options =>
|
||||
{
|
||||
options.RemoveServerList();
|
||||
options.AddDefaultResponse();
|
||||
options.AddDefaultProblemDetailsResponse();
|
||||
});
|
||||
|
||||
builder.Services.AddDefaultProblemDetails();
|
||||
@@ -135,6 +140,21 @@ documentsApiGroup.MapGet(string.Empty, async (DocumentService documentService, C
|
||||
})
|
||||
.WithSummary("Gets the list of documents");
|
||||
|
||||
// POST endpoint that receives a file upload and imports it through VectorSearchService.
documentsApiGroup.MapPost(string.Empty, async (IFormFile file, VectorSearchService vectorSearchService, CancellationToken cancellationToken,
    [Description("The unique identifier of the document. If not provided, a new one will be generated. If you specify an existing documentId, the corresponding document will be overwritten.")] Guid? documentId = null) =>
{
    using var stream = file.OpenReadStream();

    // The MIME type is computed from the file name instead of being read from file.ContentType,
    // because the latter is not 100% reliable (for example, for Markdown files).
    var contentType = MimeUtility.GetMimeMapping(file.FileName);

    var response = await vectorSearchService.ImportAsync(stream, file.FileName, contentType, documentId, cancellationToken);

    return TypedResults.Ok(response);
})
.DisableAntiforgery()
.ProducesProblem(StatusCodes.Status400BadRequest)
.WithSummary("Uploads a document")
.WithDescription("Uploads a document to SQL Database and saves its embedding using the native VECTOR type. The document will be indexed and used to answer questions. Currently, PDF, DOCX, TXT and MD files are supported.");
|
||||
|
||||
documentsApiGroup.MapGet("{documentId:guid}/chunks", async (Guid documentId, DocumentService documentService, CancellationToken cancellationToken) =>
|
||||
{
|
||||
var documents = await documentService.GetChunksAsync(documentId, cancellationToken);
|
||||
@@ -164,17 +184,4 @@ documentsApiGroup.MapDelete("{documentId:guid}", async (Guid documentId, Documen
|
||||
.WithSummary("Deletes a document")
|
||||
.WithDescription("This endpoint deletes the document and all its chunks.");
|
||||
|
||||
documentsApiGroup.MapPost(string.Empty, async (IFormFile file, VectorSearchService vectorSearchService, CancellationToken cancellationToken,
|
||||
[Description("The unique identifier of the document. If not provided, a new one will be generated. If you specify an existing documentId, the corresponding document will be overwritten.")] Guid? documentId = null) =>
|
||||
{
|
||||
using var stream = file.OpenReadStream();
|
||||
var response = await vectorSearchService.ImportAsync(stream, file.FileName, file.ContentType, documentId, cancellationToken);
|
||||
|
||||
return TypedResults.Ok(response);
|
||||
})
|
||||
.DisableAntiforgery()
|
||||
.ProducesProblem(StatusCodes.Status400BadRequest)
|
||||
.WithSummary("Uploads a document")
|
||||
.WithDescription("Uploads a document to SQL Database and saves its embedding using the native VECTOR type. The document will be indexed and used to answer questions. Currently, PDF, DOCX and TXT files are supported.");
|
||||
|
||||
app.Run();
|
||||
@@ -7,11 +7,12 @@ using SqlDatabaseVectorSearch.ContentDecoders;
|
||||
using SqlDatabaseVectorSearch.DataAccessLayer;
|
||||
using SqlDatabaseVectorSearch.Models;
|
||||
using SqlDatabaseVectorSearch.Settings;
|
||||
using SqlDatabaseVectorSearch.TextChunkers;
|
||||
using Entities = SqlDatabaseVectorSearch.DataAccessLayer.Entities;
|
||||
|
||||
namespace SqlDatabaseVectorSearch.Services;
|
||||
|
||||
public class VectorSearchService(IServiceProvider serviceProvider, ApplicationDbContext dbContext, DocumentService documentService, ITextEmbeddingGenerationService textEmbeddingGenerationService, TokenizerService tokenizerService, TextChunkerService textChunkerService, ChatService chatService, TimeProvider timeProvider, IOptions<AppSettings> appSettingsOptions, ILogger<VectorSearchService> logger)
|
||||
public class VectorSearchService(IServiceProvider serviceProvider, ApplicationDbContext dbContext, DocumentService documentService, ITextEmbeddingGenerationService textEmbeddingGenerationService, TokenizerService tokenizerService, ChatService chatService, TimeProvider timeProvider, IOptions<AppSettings> appSettingsOptions, ILogger<VectorSearchService> logger)
|
||||
{
|
||||
private readonly AppSettings appSettings = appSettingsOptions.Value;
|
||||
|
||||
@@ -39,7 +40,8 @@ public class VectorSearchService(IServiceProvider serviceProvider, ApplicationDb
|
||||
dbContext.Documents.Add(document);
|
||||
|
||||
// Split the content into chunks and generate the embeddings for each one.
|
||||
var paragraphs = textChunkerService.Split(content);
|
||||
var textChunker = serviceProvider.GetRequiredKeyedService<ITextChunker>(contentType);
|
||||
var paragraphs = textChunker.Split(content);
|
||||
var embeddings = await textEmbeddingGenerationService.GenerateEmbeddingsAsync(paragraphs, cancellationToken: cancellationToken);
|
||||
|
||||
// Save the document chunks and the corresponding embedding in the database.
|
||||
|
||||
@@ -19,6 +19,7 @@
|
||||
<PackageReference Include="Microsoft.ML.Tokenizers.Data.Cl100kBase" Version="1.0.1" />
|
||||
<PackageReference Include="Microsoft.ML.Tokenizers.Data.O200kBase" Version="1.0.1" />
|
||||
<PackageReference Include="Microsoft.SemanticKernel" Version="1.37.0" />
|
||||
<PackageReference Include="MimeMapping" Version="3.1.0" />
|
||||
<PackageReference Include="MinimalHelpers.OpenApi" Version="2.1.4" />
|
||||
<PackageReference Include="PdfPig" Version="0.1.9" />
|
||||
<PackageReference Include="Swashbuckle.AspNetCore.SwaggerUI" Version="7.2.0" />
|
||||
|
||||
+3
-2
@@ -1,10 +1,11 @@
|
||||
using Microsoft.Extensions.Options;
|
||||
using Microsoft.SemanticKernel.Text;
|
||||
using SqlDatabaseVectorSearch.Services;
|
||||
using SqlDatabaseVectorSearch.Settings;
|
||||
|
||||
namespace SqlDatabaseVectorSearch.Services;
|
||||
namespace SqlDatabaseVectorSearch.TextChunkers;
|
||||
|
||||
public class TextChunkerService(TokenizerService tokenizerService, IOptions<AppSettings> appSettingsOptions)
|
||||
public class DefaultTextChunker(TokenizerService tokenizerService, IOptions<AppSettings> appSettingsOptions) : ITextChunker
|
||||
{
|
||||
private readonly AppSettings appSettings = appSettingsOptions.Value;
|
||||
|
||||
@@ -0,0 +1,6 @@
|
||||
namespace SqlDatabaseVectorSearch.TextChunkers;
|
||||
|
||||
/// <summary>
/// Defines a strategy for splitting the text content of a document into chunks.
/// </summary>
public interface ITextChunker
{
    /// <summary>
    /// Splits the given text into a list of chunks.
    /// </summary>
    /// <param name="text">The text to split.</param>
    /// <returns>The chunks obtained from <paramref name="text"/>.</returns>
    IList<string> Split(string text);
}
|
||||
@@ -0,0 +1,19 @@
|
||||
using Microsoft.Extensions.Options;
|
||||
using Microsoft.SemanticKernel.Text;
|
||||
using SqlDatabaseVectorSearch.Services;
|
||||
using SqlDatabaseVectorSearch.Settings;
|
||||
|
||||
namespace SqlDatabaseVectorSearch.TextChunkers;
|
||||
|
||||
/// <summary>
/// <see cref="ITextChunker"/> implementation for Markdown content, based on the Markdown
/// splitting methods of the Semantic Kernel <see cref="TextChunker"/>.
/// </summary>
public class MarkdownTextChunker(TokenizerService tokenizerService, IOptions<AppSettings> appSettingsOptions) : ITextChunker
{
    private readonly AppSettings settings = appSettingsOptions.Value;

    /// <summary>
    /// Splits Markdown text into paragraphs in two steps: the text is first split into lines
    /// (limited by <c>MaxTokensPerLine</c>), then the lines are grouped into paragraphs
    /// (limited by <c>MaxTokensPerParagraph</c>, using <c>OverlapTokens</c> as overlap).
    /// Tokens are counted with <c>TokenizerService.CountChatCompletionTokens</c> in both steps.
    /// </summary>
    /// <param name="text">The Markdown text to split.</param>
    /// <returns>The list of paragraphs produced by the chunker.</returns>
    public IList<string> Split(string text)
        => TextChunker.SplitMarkdownParagraphs(
            TextChunker.SplitMarkDownLines(text, settings.MaxTokensPerLine, tokenizerService.CountChatCompletionTokens),
            settings.MaxTokensPerParagraph,
            settings.OverlapTokens,
            tokenCounter: tokenizerService.CountChatCompletionTokens);
}
|
||||
Reference in New Issue
Block a user