From 110e21e1e07b2d72b0ad09c4b0e2b9de4390d438 Mon Sep 17 00:00:00 2001 From: Marco Minerva Date: Wed, 29 Jan 2025 09:43:22 +0100 Subject: [PATCH 1/3] Add content decoding for PDF and DOCX files - Added `using` statements in `Program.cs` for new content decoding. - Registered new content decoder services in `builder.Services`. - Modified `documentsApiGroup.MapPost` to pass `file.ContentType`. - Refactored `VectorSearchService` to use `IServiceProvider` and handle content types. - Added `DocumentFormat.OpenXml` package reference. - Created `DocxContentDecoder` and `PdfContentDecoder` classes. - Created `IContentDecoder` interface. --- .../ContentDecoders/DocxContentDecoder.cs | 61 +++++++++++++++++++ .../ContentDecoders/IContentDecoder.cs | 6 ++ .../ContentDecoders/PdfContentDecoder.cs | 24 ++++++++ SqlDatabaseVectorSearch/Program.cs | 7 ++- .../Services/VectorSearchService.cs | 29 ++------- .../SqlDatabaseVectorSearch.csproj | 1 + 6 files changed, 104 insertions(+), 24 deletions(-) create mode 100644 SqlDatabaseVectorSearch/ContentDecoders/DocxContentDecoder.cs create mode 100644 SqlDatabaseVectorSearch/ContentDecoders/IContentDecoder.cs create mode 100644 SqlDatabaseVectorSearch/ContentDecoders/PdfContentDecoder.cs diff --git a/SqlDatabaseVectorSearch/ContentDecoders/DocxContentDecoder.cs b/SqlDatabaseVectorSearch/ContentDecoders/DocxContentDecoder.cs new file mode 100644 index 0000000..35dfd51 --- /dev/null +++ b/SqlDatabaseVectorSearch/ContentDecoders/DocxContentDecoder.cs @@ -0,0 +1,61 @@ +using System.Text; +using DocumentFormat.OpenXml.Packaging; +using DocumentFormat.OpenXml.Wordprocessing; + +namespace SqlDatabaseVectorSearch.ContentDecoders; + +public class DocxContentDecoder : IContentDecoder +{ + public Task DecodeAsync(Stream stream, string contentType) + { + // Open a Word document for read-only access. + using var document = WordprocessingDocument.Open(stream, false); + + var body = document.MainDocumentPart?.Document.Body; + var content = new StringBuilder(); + + var paragraphs = body?.Descendants() ?? []; + foreach (var p in paragraphs) + { + content.AppendLine(p.InnerText); + } + + return Task.FromResult(content.ToString()); + + //foreach (var paragraph in body!.Elements()) + //{ + // foreach (var element in paragraph.Elements()) + // { + // if (element is Run run) + // { + // DecodeTextFromRun(run); + // } + // else if (element is Hyperlink hyperlink) + // { + // foreach (var hyperlinkRun in hyperlink.Elements()) + // { + // DecodeTextFromRun(hyperlinkRun); + // } + + // //var hyperlinkUri = doc.MainDocumentPart.HyperlinkRelationships.FirstOrDefault(r => r.Id == hyperlink.Id)?.Uri; + // //if (hyperlinkUri is not null) + // //{ + // // content.Append($" ({hyperlinkUri})"); + // //} + // } + // } + + // content.AppendLine(); // Preserve whitespace and blank lines. + //} + + //return Task.FromResult(content.ToString()); + + //void DecodeTextFromRun(Run run) + //{ + // foreach (var text in run.Elements()) + // { + // content.Append(text.Text); + // } + //} + } +} diff --git a/SqlDatabaseVectorSearch/ContentDecoders/IContentDecoder.cs b/SqlDatabaseVectorSearch/ContentDecoders/IContentDecoder.cs new file mode 100644 index 0000000..87a736f --- /dev/null +++ b/SqlDatabaseVectorSearch/ContentDecoders/IContentDecoder.cs @@ -0,0 +1,6 @@ +namespace SqlDatabaseVectorSearch.ContentDecoders; + +public interface IContentDecoder +{ + Task DecodeAsync(Stream stream, string contentType); +} diff --git a/SqlDatabaseVectorSearch/ContentDecoders/PdfContentDecoder.cs b/SqlDatabaseVectorSearch/ContentDecoders/PdfContentDecoder.cs new file mode 100644 index 0000000..ecb14a5 --- /dev/null +++ b/SqlDatabaseVectorSearch/ContentDecoders/PdfContentDecoder.cs @@ -0,0 +1,24 @@ +using System.Text; +using UglyToad.PdfPig; +using UglyToad.PdfPig.DocumentLayoutAnalysis.TextExtractor; + +namespace SqlDatabaseVectorSearch.ContentDecoders; + +public class PdfContentDecoder : IContentDecoder +{ + public Task DecodeAsync(Stream stream, string contentType) + { + var content = new StringBuilder(); + + // Read the content of the PDF document. + using var pdfDocument = PdfDocument.Open(stream); + + foreach (var page in pdfDocument.GetPages().Where(x => x is not null)) + { + var pageContent = ContentOrderTextExtractor.GetText(page) ?? string.Empty; + content.AppendLine(pageContent); + } + + return Task.FromResult(content.ToString()); + } +} diff --git a/SqlDatabaseVectorSearch/Program.cs b/SqlDatabaseVectorSearch/Program.cs index 24807d8..71301b6 100644 --- a/SqlDatabaseVectorSearch/Program.cs +++ b/SqlDatabaseVectorSearch/Program.cs @@ -1,8 +1,10 @@ using System.ComponentModel; +using System.Net.Mime; using System.Text.Json.Serialization; using Microsoft.AspNetCore.Http.HttpResults; using Microsoft.EntityFrameworkCore; using Microsoft.SemanticKernel; +using SqlDatabaseVectorSearch.ContentDecoders; using SqlDatabaseVectorSearch.DataAccessLayer; using SqlDatabaseVectorSearch.Models; using SqlDatabaseVectorSearch.Services; @@ -50,6 +52,9 @@ builder.Services.AddSingleton(); builder.Services.AddSingleton(); builder.Services.AddScoped(); +builder.Services.AddKeyedSingleton(MediaTypeNames.Application.Pdf); +builder.Services.AddKeyedSingleton("application/vnd.openxmlformats-officedocument.wordprocessingml.document"); + builder.Services.ConfigureHttpJsonOptions(options => { options.SerializerOptions.Converters.Add(new JsonStringEnumConverter()); @@ -113,7 +118,7 @@ documentsApiGroup.MapPost(string.Empty, async (IFormFile file, VectorSearchServi [Description("The unique identifier of the document. If not provided, a new one will be generated. If you specify an existing documentId, the corresponding document will be overwritten.")] Guid? documentId = null) => { using var stream = file.OpenReadStream(); - documentId = await vectorSearchService.ImportAsync(stream, file.FileName, documentId); + documentId = await vectorSearchService.ImportAsync(stream, file.FileName, file.ContentType, documentId); return TypedResults.Ok(new UploadDocumentResponse(documentId.Value)); }) diff --git a/SqlDatabaseVectorSearch/Services/VectorSearchService.cs b/SqlDatabaseVectorSearch/Services/VectorSearchService.cs index 362762c..87b7151 100644 --- a/SqlDatabaseVectorSearch/Services/VectorSearchService.cs +++ b/SqlDatabaseVectorSearch/Services/VectorSearchService.cs @@ -1,26 +1,25 @@ using System.Data; -using System.Text; using Microsoft.EntityFrameworkCore; using Microsoft.Extensions.Options; using Microsoft.SemanticKernel.Embeddings; using Microsoft.SemanticKernel.Text; +using SqlDatabaseVectorSearch.ContentDecoders; using SqlDatabaseVectorSearch.DataAccessLayer; using SqlDatabaseVectorSearch.Models; using SqlDatabaseVectorSearch.Settings; -using UglyToad.PdfPig; -using UglyToad.PdfPig.DocumentLayoutAnalysis.TextExtractor; using Entities = SqlDatabaseVectorSearch.DataAccessLayer.Entities; namespace SqlDatabaseVectorSearch.Services; -public class VectorSearchService(ApplicationDbContext dbContext, ITextEmbeddingGenerationService textEmbeddingGenerationService, ChatService chatService, TokenizerService tokenizerService, TimeProvider timeProvider, IOptions appSettingsOptions, ILogger logger) +public class VectorSearchService(IServiceProvider serviceProvider, ApplicationDbContext dbContext, ITextEmbeddingGenerationService textEmbeddingGenerationService, ChatService chatService, TokenizerService tokenizerService, TimeProvider timeProvider, IOptions appSettingsOptions, ILogger logger) { private readonly AppSettings appSettings = appSettingsOptions.Value; - public async Task ImportAsync(Stream stream, string name, Guid? documentId) + public async Task ImportAsync(Stream stream, string name, string contentType, Guid? documentId) { - // Extract the contents of the file (currently, only PDF files are supported). - var content = await GetContentAsync(stream); + // Extract the contents of the file. + var decoder = serviceProvider.GetRequiredKeyedService(contentType); + var content = await decoder.DecodeAsync(stream, contentType); await dbContext.Database.BeginTransactionAsync(); @@ -126,20 +125,4 @@ public class VectorSearchService(ApplicationDbContext dbContext, ITextEmbeddingG return (reformulatedQuestion, chunks); } - - private static Task GetContentAsync(Stream stream) - { - var content = new StringBuilder(); - - // Read the content of the PDF document. - using var pdfDocument = PdfDocument.Open(stream); - - foreach (var page in pdfDocument.GetPages().Where(x => x is not null)) - { - var pageContent = ContentOrderTextExtractor.GetText(page) ?? string.Empty; - content.AppendLine(pageContent); - } - - return Task.FromResult(content.ToString()); - } } \ No newline at end of file diff --git a/SqlDatabaseVectorSearch/SqlDatabaseVectorSearch.csproj b/SqlDatabaseVectorSearch/SqlDatabaseVectorSearch.csproj index 3c549ef..a337091 100644 --- a/SqlDatabaseVectorSearch/SqlDatabaseVectorSearch.csproj +++ b/SqlDatabaseVectorSearch/SqlDatabaseVectorSearch.csproj @@ -8,6 +8,7 @@ + From af9158873f82610c7961a628e6c49d0b0e2495fe Mon Sep 17 00:00:00 2001 From: Marco Minerva Date: Wed, 29 Jan 2025 09:58:22 +0100 Subject: [PATCH 2/3] Add support for DOCX and TXT files, update error handling Updated README.md to reflect support for PDF, DOCX, and TXT files. Removed commented-out code in DocxContentDecoder.cs. Added TextContentDecoder service in Program.cs and updated exception handling middleware. Updated document upload endpoint description in Program.cs. Modified VectorSearchService to throw NotSupportedException for unsupported content types. Added TextContentDecoder class in TextContentDecoder.cs. --- README.md | 4 +-- .../ContentDecoders/DocxContentDecoder.cs | 36 ------------------- .../ContentDecoders/TextContentDecoder.cs | 12 +++++++ SqlDatabaseVectorSearch/Program.cs | 13 +++++-- .../Services/VectorSearchService.cs | 2 +- 5 files changed, 26 insertions(+), 41 deletions(-) create mode 100644 SqlDatabaseVectorSearch/ContentDecoders/TextContentDecoder.cs diff --git a/README.md b/README.md index 044cf77..ee762b6 100644 --- a/README.md +++ b/README.md @@ -1,7 +1,7 @@ # SQL Database Vector Search Sample A repository that showcases the native VECTOR type in Azure SQL Database to perform embeddings and RAG with Azure OpenAI. -The application is a Minimal API that exposes endpoints to load documents, generate embeddings and save them into the database as Vectors, and perform searches using Vector Search and RAG. Currently, only PDF files are supported. Vectors are saved and retrieved with Entity Framework Core using the [EFCore.SqlServer.VectorSearch](https://github.com/efcore/EfCore.SqlServer.VectorSearch) library. Embedding and Chat Completion are integrated with [Semantic Kernel](https://github.com/microsoft/semantic-kernel). +The application is a Minimal API that exposes endpoints to load documents, generate embeddings and save them into the database as Vectors, and perform searches using Vector Search and RAG. Currently, PDF, DOCX and TXT files are supported. Vectors are saved and retrieved with Entity Framework Core using the [EFCore.SqlServer.VectorSearch](https://github.com/efcore/EfCore.SqlServer.VectorSearch) library. Embedding and Chat Completion are integrated with [Semantic Kernel](https://github.com/microsoft/semantic-kernel). > [!NOTE] > If you prefer to use straight SQL, check out the [sql branch](https://github.com/marcominerva/SqlDatabaseVectorSearch/tree/sql). @@ -15,4 +15,4 @@ The application is a Minimal API that exposes endpoints to load documents, gener - You may need to update the size of the [`VECTOR`](https://github.com/marcominerva/SqlDatabaseVectorSearch/blob/master/Scripts.sql#L17) column to match the size of the embedding model. Currently, the maximum allowed value is 1998. - Open the [appsettings.json](https://github.com/marcominerva/SqlDatabaseVectorSearch/blob/master/SqlDatabaseVectorSearch/appsettings.json) file and set the connection string to the database and the other settings required by Azure OpenAI - If your embedding model supports shortening, like **text-embedding-3-small** and **text-embedding-3-large**, and you want to use this feature, you need to set the [`Dimensions`](https://github.com/marcominerva/SqlDatabaseVectorSearch/blob/master/SqlDatabaseVectorSearch/appsettings.json#L17) property to match the value you have used in the SQL script. If your model doesn't provide this feature, or do you want to use the default size, just leave the [`Dimensions`](https://github.com/marcominerva/SqlDatabaseVectorSearch/blob/master/SqlDatabaseVectorSearch/appsettings.json#L17) property to NULL. Keep in mind that **text-embedding-3-small** has a dimension of 1536, while **text-embedding-3-large** uses vectors with 3072 elements, so with this latter model it is mandatory to specify a value (that, as said, must be less or equal to 1998). -- Run the application and start importing your PDF documents. +- Run the application and start importing your documents. diff --git a/SqlDatabaseVectorSearch/ContentDecoders/DocxContentDecoder.cs b/SqlDatabaseVectorSearch/ContentDecoders/DocxContentDecoder.cs index 35dfd51..36b6d06 100644 --- a/SqlDatabaseVectorSearch/ContentDecoders/DocxContentDecoder.cs +++ b/SqlDatabaseVectorSearch/ContentDecoders/DocxContentDecoder.cs @@ -21,41 +21,5 @@ public class DocxContentDecoder : IContentDecoder } return Task.FromResult(content.ToString()); - - //foreach (var paragraph in body!.Elements()) - //{ - // foreach (var element in paragraph.Elements()) - // { - // if (element is Run run) - // { - // DecodeTextFromRun(run); - // } - // else if (element is Hyperlink hyperlink) - // { - // foreach (var hyperlinkRun in hyperlink.Elements()) - // { - // DecodeTextFromRun(hyperlinkRun); - // } - - // //var hyperlinkUri = doc.MainDocumentPart.HyperlinkRelationships.FirstOrDefault(r => r.Id == hyperlink.Id)?.Uri; - // //if (hyperlinkUri is not null) - // //{ - // // content.Append($" ({hyperlinkUri})"); - // //} - // } - // } - - // content.AppendLine(); // Preserve whitespace and blank lines. - //} - - //return Task.FromResult(content.ToString()); - - //void DecodeTextFromRun(Run run) - //{ - // foreach (var text in run.Elements()) - // { - // content.Append(text.Text); - // } - //} } } diff --git a/SqlDatabaseVectorSearch/ContentDecoders/TextContentDecoder.cs b/SqlDatabaseVectorSearch/ContentDecoders/TextContentDecoder.cs new file mode 100644 index 0000000..d9d6406 --- /dev/null +++ b/SqlDatabaseVectorSearch/ContentDecoders/TextContentDecoder.cs @@ -0,0 +1,12 @@ +namespace SqlDatabaseVectorSearch.ContentDecoders; + +public class TextContentDecoder : IContentDecoder +{ + public async Task DecodeAsync(Stream stream, string contentType) + { + using var readStream = new StreamReader(stream); + var content = await readStream.ReadToEndAsync(); + + return content; + } +} diff --git a/SqlDatabaseVectorSearch/Program.cs b/SqlDatabaseVectorSearch/Program.cs index 71301b6..4176fca 100644 --- a/SqlDatabaseVectorSearch/Program.cs +++ b/SqlDatabaseVectorSearch/Program.cs @@ -54,6 +54,7 @@ builder.Services.AddScoped(); builder.Services.AddKeyedSingleton(MediaTypeNames.Application.Pdf); builder.Services.AddKeyedSingleton("application/vnd.openxmlformats-officedocument.wordprocessingml.document"); +builder.Services.AddKeyedSingleton(MediaTypeNames.Text.Plain); builder.Services.ConfigureHttpJsonOptions(options => { @@ -74,7 +75,15 @@ var app = builder.Build(); // Configure the HTTP request pipeline. app.UseHttpsRedirection(); -app.UseExceptionHandler(); +app.UseExceptionHandler(new ExceptionHandlerOptions +{ + StatusCodeSelector = exception => exception switch + { + NotSupportedException => StatusCodes.Status501NotImplemented, + _ => StatusCodes.Status500InternalServerError + } +}); + app.UseStatusCodePages(); app.MapOpenApi(); @@ -125,7 +134,7 @@ documentsApiGroup.MapPost(string.Empty, async (IFormFile file, VectorSearchServi .DisableAntiforgery() .ProducesProblem(StatusCodes.Status400BadRequest) .WithSummary("Uploads a document") -.WithDescription("Uploads a document to SQL Database and saves its embedding using the native VECTOR type. The document will be indexed and used to answer questions. Currently, only PDF files are supported."); +.WithDescription("Uploads a document to SQL Database and saves its embedding using the native VECTOR type. The document will be indexed and used to answer questions. Currently, PDF, DOCX and TXT files are supported."); documentsApiGroup.MapDelete("{documentId:guid}", async (Guid documentId, VectorSearchService vectorSearchService) => { diff --git a/SqlDatabaseVectorSearch/Services/VectorSearchService.cs b/SqlDatabaseVectorSearch/Services/VectorSearchService.cs index 87b7151..3e0eca1 100644 --- a/SqlDatabaseVectorSearch/Services/VectorSearchService.cs +++ b/SqlDatabaseVectorSearch/Services/VectorSearchService.cs @@ -18,7 +18,7 @@ public class VectorSearchService(IServiceProvider serviceProvider, ApplicationDb public async Task ImportAsync(Stream stream, string name, string contentType, Guid? documentId) { // Extract the contents of the file. - var decoder = serviceProvider.GetRequiredKeyedService(contentType); + var decoder = serviceProvider.GetKeyedService(contentType) ?? throw new NotSupportedException($"Content type '{contentType}' is not supported."); var content = await decoder.DecodeAsync(stream, contentType); await dbContext.Database.BeginTransactionAsync(); From b8aace05a5836dd74aaacbfcfb068e76a8e679b6 Mon Sep 17 00:00:00 2001 From: Marco Minerva Date: Wed, 29 Jan 2025 10:00:25 +0100 Subject: [PATCH 3/3] Update Microsoft.SemanticKernel to v1.35.0 Upgraded Microsoft.SemanticKernel package from version 1.34.0 to 1.35.0 in SqlDatabaseVectorSearch.csproj. This update includes new features, bug fixes, and other improvements provided in the latest version. --- SqlDatabaseVectorSearch/SqlDatabaseVectorSearch.csproj | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/SqlDatabaseVectorSearch/SqlDatabaseVectorSearch.csproj b/SqlDatabaseVectorSearch/SqlDatabaseVectorSearch.csproj index a337091..ba44d14 100644 --- a/SqlDatabaseVectorSearch/SqlDatabaseVectorSearch.csproj +++ b/SqlDatabaseVectorSearch/SqlDatabaseVectorSearch.csproj @@ -18,7 +18,7 @@ - +