From 110e21e1e07b2d72b0ad09c4b0e2b9de4390d438 Mon Sep 17 00:00:00 2001 From: Marco Minerva Date: Wed, 29 Jan 2025 09:43:22 +0100 Subject: [PATCH] Add content decoding for PDF and DOCX files - Added `using` statements in `Program.cs` for new content decoding. - Registered new content decoder services in `builder.Services`. - Modified `documentsApiGroup.MapPost` to pass `file.ContentType`. - Refactored `VectorSearchService` to use `IServiceProvider` and handle content types. - Added `DocumentFormat.OpenXml` package reference. - Created `DocxContentDecoder` and `PdfContentDecoder` classes. - Created `IContentDecoder` interface. --- .../ContentDecoders/DocxContentDecoder.cs | 61 +++++++++++++++++++ .../ContentDecoders/IContentDecoder.cs | 6 ++ .../ContentDecoders/PdfContentDecoder.cs | 24 ++++++++ SqlDatabaseVectorSearch/Program.cs | 7 ++- .../Services/VectorSearchService.cs | 29 ++------- .../SqlDatabaseVectorSearch.csproj | 1 + 6 files changed, 104 insertions(+), 24 deletions(-) create mode 100644 SqlDatabaseVectorSearch/ContentDecoders/DocxContentDecoder.cs create mode 100644 SqlDatabaseVectorSearch/ContentDecoders/IContentDecoder.cs create mode 100644 SqlDatabaseVectorSearch/ContentDecoders/PdfContentDecoder.cs diff --git a/SqlDatabaseVectorSearch/ContentDecoders/DocxContentDecoder.cs b/SqlDatabaseVectorSearch/ContentDecoders/DocxContentDecoder.cs new file mode 100644 index 0000000..35dfd51 --- /dev/null +++ b/SqlDatabaseVectorSearch/ContentDecoders/DocxContentDecoder.cs @@ -0,0 +1,61 @@ +using System.Text; +using DocumentFormat.OpenXml.Packaging; +using DocumentFormat.OpenXml.Wordprocessing; + +namespace SqlDatabaseVectorSearch.ContentDecoders; + +public class DocxContentDecoder : IContentDecoder +{ + public Task DecodeAsync(Stream stream, string contentType) + { + // Open a Word document for read-only access. + using var document = WordprocessingDocument.Open(stream, false); + + var body = document.MainDocumentPart?.Document.Body; + var content = new StringBuilder(); + + var paragraphs = body?.Descendants() ?? []; + foreach (var p in paragraphs) + { + content.AppendLine(p.InnerText); + } + + return Task.FromResult(content.ToString()); + + //foreach (var paragraph in body!.Elements()) + //{ + // foreach (var element in paragraph.Elements()) + // { + // if (element is Run run) + // { + // DecodeTextFromRun(run); + // } + // else if (element is Hyperlink hyperlink) + // { + // foreach (var hyperlinkRun in hyperlink.Elements()) + // { + // DecodeTextFromRun(hyperlinkRun); + // } + + // //var hyperlinkUri = doc.MainDocumentPart.HyperlinkRelationships.FirstOrDefault(r => r.Id == hyperlink.Id)?.Uri; + // //if (hyperlinkUri is not null) + // //{ + // // content.Append($" ({hyperlinkUri})"); + // //} + // } + // } + + // content.AppendLine(); // Preserve whitespace and blank lines. + //} + + //return Task.FromResult(content.ToString()); + + //void DecodeTextFromRun(Run run) + //{ + // foreach (var text in run.Elements()) + // { + // content.Append(text.Text); + // } + //} + } +} diff --git a/SqlDatabaseVectorSearch/ContentDecoders/IContentDecoder.cs b/SqlDatabaseVectorSearch/ContentDecoders/IContentDecoder.cs new file mode 100644 index 0000000..87a736f --- /dev/null +++ b/SqlDatabaseVectorSearch/ContentDecoders/IContentDecoder.cs @@ -0,0 +1,6 @@ +namespace SqlDatabaseVectorSearch.ContentDecoders; + +public interface IContentDecoder +{ + Task DecodeAsync(Stream stream, string contentType); +} diff --git a/SqlDatabaseVectorSearch/ContentDecoders/PdfContentDecoder.cs b/SqlDatabaseVectorSearch/ContentDecoders/PdfContentDecoder.cs new file mode 100644 index 0000000..ecb14a5 --- /dev/null +++ b/SqlDatabaseVectorSearch/ContentDecoders/PdfContentDecoder.cs @@ -0,0 +1,24 @@ +using System.Text; +using UglyToad.PdfPig; +using UglyToad.PdfPig.DocumentLayoutAnalysis.TextExtractor; + +namespace SqlDatabaseVectorSearch.ContentDecoders; + +public class PdfContentDecoder : IContentDecoder +{ + public Task DecodeAsync(Stream stream, string contentType) + { + var content = new StringBuilder(); + + // Read the content of the PDF document. + using var pdfDocument = PdfDocument.Open(stream); + + foreach (var page in pdfDocument.GetPages().Where(x => x is not null)) + { + var pageContent = ContentOrderTextExtractor.GetText(page) ?? string.Empty; + content.AppendLine(pageContent); + } + + return Task.FromResult(content.ToString()); + } +} diff --git a/SqlDatabaseVectorSearch/Program.cs b/SqlDatabaseVectorSearch/Program.cs index 24807d8..71301b6 100644 --- a/SqlDatabaseVectorSearch/Program.cs +++ b/SqlDatabaseVectorSearch/Program.cs @@ -1,8 +1,10 @@ using System.ComponentModel; +using System.Net.Mime; using System.Text.Json.Serialization; using Microsoft.AspNetCore.Http.HttpResults; using Microsoft.EntityFrameworkCore; using Microsoft.SemanticKernel; +using SqlDatabaseVectorSearch.ContentDecoders; using SqlDatabaseVectorSearch.DataAccessLayer; using SqlDatabaseVectorSearch.Models; using SqlDatabaseVectorSearch.Services; @@ -50,6 +52,9 @@ builder.Services.AddSingleton(); builder.Services.AddSingleton(); builder.Services.AddScoped(); +builder.Services.AddKeyedSingleton(MediaTypeNames.Application.Pdf); +builder.Services.AddKeyedSingleton("application/vnd.openxmlformats-officedocument.wordprocessingml.document"); + builder.Services.ConfigureHttpJsonOptions(options => { options.SerializerOptions.Converters.Add(new JsonStringEnumConverter()); @@ -113,7 +118,7 @@ documentsApiGroup.MapPost(string.Empty, async (IFormFile file, VectorSearchServi [Description("The unique identifier of the document. If not provided, a new one will be generated. If you specify an existing documentId, the corresponding document will be overwritten.")] Guid? documentId = null) => { using var stream = file.OpenReadStream(); - documentId = await vectorSearchService.ImportAsync(stream, file.FileName, documentId); + documentId = await vectorSearchService.ImportAsync(stream, file.FileName, file.ContentType, documentId); return TypedResults.Ok(new UploadDocumentResponse(documentId.Value)); }) diff --git a/SqlDatabaseVectorSearch/Services/VectorSearchService.cs b/SqlDatabaseVectorSearch/Services/VectorSearchService.cs index 362762c..87b7151 100644 --- a/SqlDatabaseVectorSearch/Services/VectorSearchService.cs +++ b/SqlDatabaseVectorSearch/Services/VectorSearchService.cs @@ -1,26 +1,25 @@ using System.Data; -using System.Text; using Microsoft.EntityFrameworkCore; using Microsoft.Extensions.Options; using Microsoft.SemanticKernel.Embeddings; using Microsoft.SemanticKernel.Text; +using SqlDatabaseVectorSearch.ContentDecoders; using SqlDatabaseVectorSearch.DataAccessLayer; using SqlDatabaseVectorSearch.Models; using SqlDatabaseVectorSearch.Settings; -using UglyToad.PdfPig; -using UglyToad.PdfPig.DocumentLayoutAnalysis.TextExtractor; using Entities = SqlDatabaseVectorSearch.DataAccessLayer.Entities; namespace SqlDatabaseVectorSearch.Services; -public class VectorSearchService(ApplicationDbContext dbContext, ITextEmbeddingGenerationService textEmbeddingGenerationService, ChatService chatService, TokenizerService tokenizerService, TimeProvider timeProvider, IOptions appSettingsOptions, ILogger logger) +public class VectorSearchService(IServiceProvider serviceProvider, ApplicationDbContext dbContext, ITextEmbeddingGenerationService textEmbeddingGenerationService, ChatService chatService, TokenizerService tokenizerService, TimeProvider timeProvider, IOptions appSettingsOptions, ILogger logger) { private readonly AppSettings appSettings = appSettingsOptions.Value; - public async Task ImportAsync(Stream stream, string name, Guid? documentId) + public async Task ImportAsync(Stream stream, string name, string contentType, Guid? documentId) { - // Extract the contents of the file (currently, only PDF files are supported). - var content = await GetContentAsync(stream); + // Extract the contents of the file. + var decoder = serviceProvider.GetRequiredKeyedService(contentType); + var content = await decoder.DecodeAsync(stream, contentType); await dbContext.Database.BeginTransactionAsync(); @@ -126,20 +125,4 @@ public class VectorSearchService(ApplicationDbContext dbContext, ITextEmbeddingG return (reformulatedQuestion, chunks); } - - private static Task GetContentAsync(Stream stream) - { - var content = new StringBuilder(); - - // Read the content of the PDF document. - using var pdfDocument = PdfDocument.Open(stream); - - foreach (var page in pdfDocument.GetPages().Where(x => x is not null)) - { - var pageContent = ContentOrderTextExtractor.GetText(page) ?? string.Empty; - content.AppendLine(pageContent); - } - - return Task.FromResult(content.ToString()); - } } \ No newline at end of file diff --git a/SqlDatabaseVectorSearch/SqlDatabaseVectorSearch.csproj b/SqlDatabaseVectorSearch/SqlDatabaseVectorSearch.csproj index 3c549ef..a337091 100644 --- a/SqlDatabaseVectorSearch/SqlDatabaseVectorSearch.csproj +++ b/SqlDatabaseVectorSearch/SqlDatabaseVectorSearch.csproj @@ -8,6 +8,7 @@ +