diff --git a/README.md b/README.md index 4c68e84..a62f962 100644 --- a/README.md +++ b/README.md @@ -1,7 +1,7 @@ # SQL Database Vector Search Sample A repository that showcases the native VECTOR type in Azure SQL Database to perform embeddings and RAG with Azure OpenAI. -The application is a Minimal API that exposes endpoints to load documents, generate embeddings and save them into the database as Vectors, and perform searches using Vector Search and RAG. Currently, PDF, DOCX and TXT files are supported. Vectors are saved and retrieved with Entity Framework Core using the [EFCore.SqlServer.VectorSearch](https://github.com/efcore/EfCore.SqlServer.VectorSearch) library. Embedding and Chat Completion are integrated with [Semantic Kernel](https://github.com/microsoft/semantic-kernel). +The application is a Minimal API that exposes endpoints to load documents, generate embeddings and save them into the database as Vectors, and perform searches using Vector Search and RAG. Currently, PDF, DOCX, TXT and MD files are supported. Vectors are saved and retrieved with Entity Framework Core using the [EFCore.SqlServer.VectorSearch](https://github.com/efcore/EfCore.SqlServer.VectorSearch) library. Embedding and Chat Completion are integrated with [Semantic Kernel](https://github.com/microsoft/semantic-kernel). > [!NOTE] > If you prefer to use straight SQL, check out the [sql branch](https://github.com/marcominerva/SqlDatabaseVectorSearch/tree/sql). 
diff --git a/SqlDatabaseVectorSearch/Program.cs b/SqlDatabaseVectorSearch/Program.cs index b5f03a8..1455b34 100644 --- a/SqlDatabaseVectorSearch/Program.cs +++ b/SqlDatabaseVectorSearch/Program.cs @@ -4,11 +4,13 @@ using System.Text.Json.Serialization; using Microsoft.AspNetCore.Http.HttpResults; using Microsoft.EntityFrameworkCore; using Microsoft.SemanticKernel; +using MimeMapping; using SqlDatabaseVectorSearch.ContentDecoders; using SqlDatabaseVectorSearch.DataAccessLayer; using SqlDatabaseVectorSearch.Models; using SqlDatabaseVectorSearch.Services; using SqlDatabaseVectorSearch.Settings; +using SqlDatabaseVectorSearch.TextChunkers; using TinyHelpers.AspNetCore.Extensions; using TinyHelpers.AspNetCore.OpenApi; @@ -53,7 +55,6 @@ builder.Services.AddKernel() .AddAzureOpenAITextEmbeddingGeneration(aiSettings.Embedding.Deployment, aiSettings.Embedding.Endpoint, aiSettings.Embedding.ApiKey, dimensions: aiSettings.Embedding.Dimensions) .AddAzureOpenAIChatCompletion(aiSettings.ChatCompletion.Deployment, aiSettings.ChatCompletion.Endpoint, aiSettings.ChatCompletion.ApiKey); -builder.Services.AddSingleton(); builder.Services.AddSingleton(); builder.Services.AddSingleton(); @@ -63,11 +64,15 @@ builder.Services.AddScoped(); builder.Services.AddKeyedSingleton(MediaTypeNames.Application.Pdf); builder.Services.AddKeyedSingleton("application/vnd.openxmlformats-officedocument.wordprocessingml.document"); builder.Services.AddKeyedSingleton(MediaTypeNames.Text.Plain); +builder.Services.AddKeyedSingleton(MediaTypeNames.Text.Markdown); + +builder.Services.AddKeyedSingleton(KeyedService.AnyKey); +builder.Services.AddKeyedSingleton(MediaTypeNames.Text.Markdown); builder.Services.AddOpenApi(options => { options.RemoveServerList(); - options.AddDefaultResponse(); + options.AddDefaultProblemDetailsResponse(); }); builder.Services.AddDefaultProblemDetails(); @@ -135,6 +140,21 @@ documentsApiGroup.MapGet(string.Empty, async (DocumentService documentService, C }) .WithSummary("Gets the 
list of documents"); +documentsApiGroup.MapPost(string.Empty, async (IFormFile file, VectorSearchService vectorSearchService, CancellationToken cancellationToken, + [Description("The unique identifier of the document. If not provided, a new one will be generated. If you specify an existing documentId, the corresponding document will be overwritten.")] Guid? documentId = null) => +{ + using var stream = file.OpenReadStream(); + + // Note: file.ContentType is not 100% reliable (for example, for Markdown files). + var response = await vectorSearchService.ImportAsync(stream, file.FileName, MimeUtility.GetMimeMapping(file.FileName), documentId, cancellationToken); + + return TypedResults.Ok(response); +}) +.DisableAntiforgery() +.ProducesProblem(StatusCodes.Status400BadRequest) +.WithSummary("Uploads a document") +.WithDescription("Uploads a document to SQL Database and saves its embedding using the native VECTOR type. The document will be indexed and used to answer questions. Currently, PDF, DOCX, TXT and MD files are supported."); + documentsApiGroup.MapGet("{documentId:guid}/chunks", async (Guid documentId, DocumentService documentService, CancellationToken cancellationToken) => { var documents = await documentService.GetChunksAsync(documentId, cancellationToken); @@ -164,17 +184,4 @@ documentsApiGroup.MapDelete("{documentId:guid}", async (Guid documentId, Documen .WithSummary("Deletes a document") .WithDescription("This endpoint deletes the document and all its chunks."); -documentsApiGroup.MapPost(string.Empty, async (IFormFile file, VectorSearchService vectorSearchService, CancellationToken cancellationToken, - [Description("The unique identifier of the document. If not provided, a new one will be generated. If you specify an existing documentId, the corresponding document will be overwritten.")] Guid? 
documentId = null) => -{ - using var stream = file.OpenReadStream(); - var response = await vectorSearchService.ImportAsync(stream, file.FileName, file.ContentType, documentId, cancellationToken); - - return TypedResults.Ok(response); -}) -.DisableAntiforgery() -.ProducesProblem(StatusCodes.Status400BadRequest) -.WithSummary("Uploads a document") -.WithDescription("Uploads a document to SQL Database and saves its embedding using the native VECTOR type. The document will be indexed and used to answer questions. Currently, PDF, DOCX and TXT files are supported."); - app.Run(); \ No newline at end of file diff --git a/SqlDatabaseVectorSearch/Services/VectorSearchService.cs b/SqlDatabaseVectorSearch/Services/VectorSearchService.cs index 682fa02..2c1a26d 100644 --- a/SqlDatabaseVectorSearch/Services/VectorSearchService.cs +++ b/SqlDatabaseVectorSearch/Services/VectorSearchService.cs @@ -7,11 +7,12 @@ using SqlDatabaseVectorSearch.ContentDecoders; using SqlDatabaseVectorSearch.DataAccessLayer; using SqlDatabaseVectorSearch.Models; using SqlDatabaseVectorSearch.Settings; +using SqlDatabaseVectorSearch.TextChunkers; using Entities = SqlDatabaseVectorSearch.DataAccessLayer.Entities; namespace SqlDatabaseVectorSearch.Services; -public class VectorSearchService(IServiceProvider serviceProvider, ApplicationDbContext dbContext, DocumentService documentService, ITextEmbeddingGenerationService textEmbeddingGenerationService, TokenizerService tokenizerService, TextChunkerService textChunkerService, ChatService chatService, TimeProvider timeProvider, IOptions appSettingsOptions, ILogger logger) +public class VectorSearchService(IServiceProvider serviceProvider, ApplicationDbContext dbContext, DocumentService documentService, ITextEmbeddingGenerationService textEmbeddingGenerationService, TokenizerService tokenizerService, ChatService chatService, TimeProvider timeProvider, IOptions appSettingsOptions, ILogger logger) { private readonly AppSettings appSettings = 
appSettingsOptions.Value; @@ -39,7 +40,8 @@ public class VectorSearchService(IServiceProvider serviceProvider, ApplicationDb dbContext.Documents.Add(document); // Split the content into chunks and generate the embeddings for each one. - var paragraphs = textChunkerService.Split(content); + var textChunker = serviceProvider.GetRequiredKeyedService(contentType); + var paragraphs = textChunker.Split(content); var embeddings = await textEmbeddingGenerationService.GenerateEmbeddingsAsync(paragraphs, cancellationToken: cancellationToken); // Save the document chunks and the corresponding embedding in the database. diff --git a/SqlDatabaseVectorSearch/SqlDatabaseVectorSearch.csproj b/SqlDatabaseVectorSearch/SqlDatabaseVectorSearch.csproj index 1f8939b..c8e83e5 100644 --- a/SqlDatabaseVectorSearch/SqlDatabaseVectorSearch.csproj +++ b/SqlDatabaseVectorSearch/SqlDatabaseVectorSearch.csproj @@ -19,6 +19,7 @@ + diff --git a/SqlDatabaseVectorSearch/Services/TextChunkerService.cs b/SqlDatabaseVectorSearch/TextChunkers/DefaultTextChunker.cs similarity index 73% rename from SqlDatabaseVectorSearch/Services/TextChunkerService.cs rename to SqlDatabaseVectorSearch/TextChunkers/DefaultTextChunker.cs index 851f08d..46ccd49 100644 --- a/SqlDatabaseVectorSearch/Services/TextChunkerService.cs +++ b/SqlDatabaseVectorSearch/TextChunkers/DefaultTextChunker.cs @@ -1,10 +1,11 @@ using Microsoft.Extensions.Options; using Microsoft.SemanticKernel.Text; +using SqlDatabaseVectorSearch.Services; using SqlDatabaseVectorSearch.Settings; -namespace SqlDatabaseVectorSearch.Services; +namespace SqlDatabaseVectorSearch.TextChunkers; -public class TextChunkerService(TokenizerService tokenizerService, IOptions appSettingsOptions) +public class DefaultTextChunker(TokenizerService tokenizerService, IOptions appSettingsOptions) : ITextChunker { private readonly AppSettings appSettings = appSettingsOptions.Value; diff --git a/SqlDatabaseVectorSearch/TextChunkers/ITextChunker.cs 
b/SqlDatabaseVectorSearch/TextChunkers/ITextChunker.cs new file mode 100644 index 0000000..b62ad35 --- /dev/null +++ b/SqlDatabaseVectorSearch/TextChunkers/ITextChunker.cs @@ -0,0 +1,6 @@ +namespace SqlDatabaseVectorSearch.TextChunkers; + +public interface ITextChunker +{ + IList Split(string text); +} diff --git a/SqlDatabaseVectorSearch/TextChunkers/MarkdownTextChunker.cs b/SqlDatabaseVectorSearch/TextChunkers/MarkdownTextChunker.cs new file mode 100644 index 0000000..fd3a8f6 --- /dev/null +++ b/SqlDatabaseVectorSearch/TextChunkers/MarkdownTextChunker.cs @@ -0,0 +1,19 @@ +using Microsoft.Extensions.Options; +using Microsoft.SemanticKernel.Text; +using SqlDatabaseVectorSearch.Services; +using SqlDatabaseVectorSearch.Settings; + +namespace SqlDatabaseVectorSearch.TextChunkers; + +public class MarkdownTextChunker(TokenizerService tokenizerService, IOptions appSettingsOptions) : ITextChunker +{ + private readonly AppSettings appSettings = appSettingsOptions.Value; + + public IList Split(string text) + { + var lines = TextChunker.SplitMarkDownLines(text, appSettings.MaxTokensPerLine, tokenizerService.CountChatCompletionTokens); + var paragraphs = TextChunker.SplitMarkdownParagraphs(lines, appSettings.MaxTokensPerParagraph, appSettings.OverlapTokens, tokenCounter: tokenizerService.CountChatCompletionTokens); + + return paragraphs; + } +}