From 110e21e1e07b2d72b0ad09c4b0e2b9de4390d438 Mon Sep 17 00:00:00 2001
From: Marco Minerva <marco.minerva@gmail.com>
Date: Wed, 29 Jan 2025 09:43:22 +0100
Subject: [PATCH] Add content decoding for PDF and DOCX files

- Added `using` statements in `Program.cs` for new content decoding.
- Registered new content decoder services in `builder.Services`.
- Modified `documentsApiGroup.MapPost` to pass `file.ContentType`.
- Refactored `VectorSearchService` to resolve a keyed `IContentDecoder` from `IServiceProvider` based on the file's content type.
- Added `DocumentFormat.OpenXml` package reference.
- Created `DocxContentDecoder` and `PdfContentDecoder` classes.
- Created `IContentDecoder` interface.
---
 .../ContentDecoders/DocxContentDecoder.cs     | 61 +++++++++++++++++++
 .../ContentDecoders/IContentDecoder.cs        |  6 ++
 .../ContentDecoders/PdfContentDecoder.cs      | 24 ++++++++
 SqlDatabaseVectorSearch/Program.cs            |  7 ++-
 .../Services/VectorSearchService.cs           | 29 ++-------
 .../SqlDatabaseVectorSearch.csproj            |  1 +
 6 files changed, 104 insertions(+), 24 deletions(-)
 create mode 100644 SqlDatabaseVectorSearch/ContentDecoders/DocxContentDecoder.cs
 create mode 100644 SqlDatabaseVectorSearch/ContentDecoders/IContentDecoder.cs
 create mode 100644 SqlDatabaseVectorSearch/ContentDecoders/PdfContentDecoder.cs
diff --git a/SqlDatabaseVectorSearch/ContentDecoders/DocxContentDecoder.cs b/SqlDatabaseVectorSearch/ContentDecoders/DocxContentDecoder.cs
new file mode 100644
index 0000000..35dfd51
--- /dev/null
+++ b/SqlDatabaseVectorSearch/ContentDecoders/DocxContentDecoder.cs
@@ -0,0 +1,61 @@
+﻿using System.Text;
+using DocumentFormat.OpenXml.Packaging;
+using DocumentFormat.OpenXml.Wordprocessing;
+
+namespace SqlDatabaseVectorSearch.ContentDecoders;
+
+/// <summary>
+/// Decodes Word (DOCX) documents into plain text using the Open XML SDK.
+/// </summary>
+/// <remarks>
+/// The decoder keeps no state between calls: everything it needs is created inside
+/// <see cref="DecodeAsync"/> and released before the method returns.
+/// </remarks>
+public class DocxContentDecoder : IContentDecoder
+{
+    /// <summary>
+    /// Extracts the text of every paragraph found in the body of a Word document.
+    /// </summary>
+    /// <param name="stream">
+    /// The stream containing the DOCX package. It is opened for read-only access;
+    /// the document itself is never modified.
+    /// </param>
+    /// <param name="contentType">
+    /// The content type of the document. This decoder does not inspect it.
+    /// </param>
+    /// <returns>
+    /// A completed task whose result contains the inner text of each paragraph, one per
+    /// line, or an empty string when the document has no main part or body.
+    /// </returns>
+    /// <exception cref="ArgumentNullException"><paramref name="stream"/> is <see langword="null"/>.</exception>
+    /// <remarks>
+    /// The work is performed synchronously; the method returns an already completed task
+    /// only to satisfy the <see cref="IContentDecoder"/> contract.
+    /// </remarks>
+    public Task<string> DecodeAsync(Stream stream, string contentType)
+    {
+        ArgumentNullException.ThrowIfNull(stream);
+
+        // Open a Word document for read-only access.
+        using var document = WordprocessingDocument.Open(stream, false);
+
+        var body = document.MainDocumentPart?.Document.Body;
+        if (body is null)
+        {
+            // Nothing to decode: the package has no main document part or no body.
+            return Task.FromResult(string.Empty);
+        }
+
+        var content = new StringBuilder();
+
+        // Descendants (rather than Elements) also reaches paragraphs nested in tables
+        // and other containers, not just the direct children of the body.
+        foreach (var paragraph in body.Descendants<Paragraph>())
+        {
+            // AppendLine keeps paragraph boundaries (including blank paragraphs).
+            content.AppendLine(paragraph.InnerText);
+        }
+
+        return Task.FromResult(content.ToString());
+    }
+}
diff --git a/SqlDatabaseVectorSearch/ContentDecoders/IContentDecoder.cs b/SqlDatabaseVectorSearch/ContentDecoders/IContentDecoder.cs
new file mode 100644
index 0000000..87a736f
--- /dev/null
+++ b/SqlDatabaseVectorSearch/ContentDecoders/IContentDecoder.cs
@@ -0,0 +1,6 @@
+﻿namespace SqlDatabaseVectorSearch.ContentDecoders;
+
+public interface IContentDecoder // Converts an uploaded document into plain text; implemented once per supported format.
+{
+    Task<string> DecodeAsync(Stream stream, string contentType); // Returns the text decoded from stream; contentType is the document's MIME type.
+}
diff --git a/SqlDatabaseVectorSearch/ContentDecoders/PdfContentDecoder.cs b/SqlDatabaseVectorSearch/ContentDecoders/PdfContentDecoder.cs
new file mode 100644
index 0000000..ecb14a5
--- /dev/null
+++ b/SqlDatabaseVectorSearch/ContentDecoders/PdfContentDecoder.cs
@@ -0,0 +1,24 @@
+﻿using System.Text;
+using UglyToad.PdfPig;
+using UglyToad.PdfPig.DocumentLayoutAnalysis.TextExtractor;
+
+namespace SqlDatabaseVectorSearch.ContentDecoders;
+
+/// <summary>Decodes PDF documents into plain text, one extracted page after another.</summary>
+public class PdfContentDecoder : IContentDecoder
+{
+    /// <summary>Extracts the text of every page of the PDF contained in <paramref name="stream"/>.</summary>
+    /// <param name="stream">The stream holding the PDF document.</param>
+    /// <param name="contentType">The content type of the document; not inspected by this decoder.</param>
+    /// <returns>A completed task with the text of each page, each followed by a line terminator.</returns>
+    public Task<string> DecodeAsync(Stream stream, string contentType)
+    {
+        using var pdf = PdfDocument.Open(stream);
+        // Null pages are skipped and a null extraction result counts as empty text.
+        var pageTexts = pdf.GetPages().Where(page => page is not null)
+            .Select(page => ContentOrderTextExtractor.GetText(page) ?? string.Empty);
+
+        var text = pageTexts.Aggregate(new StringBuilder(), (builder, pageText) => builder.AppendLine(pageText));
+        return Task.FromResult(text.ToString());
+    }
+}
diff --git a/SqlDatabaseVectorSearch/Program.cs b/SqlDatabaseVectorSearch/Program.cs
index 24807d8..71301b6 100644
--- a/SqlDatabaseVectorSearch/Program.cs
+++ b/SqlDatabaseVectorSearch/Program.cs
@@ -1,8 +1,10 @@
 using System.ComponentModel;
+using System.Net.Mime;
 using System.Text.Json.Serialization;
 using Microsoft.AspNetCore.Http.HttpResults;
 using Microsoft.EntityFrameworkCore;
 using Microsoft.SemanticKernel;
+using SqlDatabaseVectorSearch.ContentDecoders;
 using SqlDatabaseVectorSearch.DataAccessLayer;
 using SqlDatabaseVectorSearch.Models;
 using SqlDatabaseVectorSearch.Services;
@@ -50,6 +52,9 @@ builder.Services.AddSingleton<TokenizerService>();
 builder.Services.AddSingleton<ChatService>();
 builder.Services.AddScoped<VectorSearchService>();
 
+builder.Services.AddKeyedSingleton<IContentDecoder, PdfContentDecoder>(MediaTypeNames.Application.Pdf);
+builder.Services.AddKeyedSingleton<IContentDecoder, DocxContentDecoder>("application/vnd.openxmlformats-officedocument.wordprocessingml.document");
+
 builder.Services.ConfigureHttpJsonOptions(options =>
 {
     options.SerializerOptions.Converters.Add(new JsonStringEnumConverter());
@@ -113,7 +118,7 @@ documentsApiGroup.MapPost(string.Empty, async (IFormFile file, VectorSearchServi
     [Description("The unique identifier of the document. If not provided, a new one will be generated. If you specify an existing documentId, the corresponding document will be overwritten.")] Guid? documentId = null) =>
 {
     using var stream = file.OpenReadStream();
-    documentId = await vectorSearchService.ImportAsync(stream, file.FileName, documentId);
+    documentId = await vectorSearchService.ImportAsync(stream, file.FileName, file.ContentType, documentId);
 
     return TypedResults.Ok(new UploadDocumentResponse(documentId.Value));
 })
diff --git a/SqlDatabaseVectorSearch/Services/VectorSearchService.cs b/SqlDatabaseVectorSearch/Services/VectorSearchService.cs
index 362762c..87b7151 100644
--- a/SqlDatabaseVectorSearch/Services/VectorSearchService.cs
+++ b/SqlDatabaseVectorSearch/Services/VectorSearchService.cs
@@ -1,26 +1,25 @@
 ﻿using System.Data;
-using System.Text;
 using Microsoft.EntityFrameworkCore;
 using Microsoft.Extensions.Options;
 using Microsoft.SemanticKernel.Embeddings;
 using Microsoft.SemanticKernel.Text;
+using SqlDatabaseVectorSearch.ContentDecoders;
 using SqlDatabaseVectorSearch.DataAccessLayer;
 using SqlDatabaseVectorSearch.Models;
 using SqlDatabaseVectorSearch.Settings;
-using UglyToad.PdfPig;
-using UglyToad.PdfPig.DocumentLayoutAnalysis.TextExtractor;
 using Entities = SqlDatabaseVectorSearch.DataAccessLayer.Entities;
 
 namespace SqlDatabaseVectorSearch.Services;
 
-public class VectorSearchService(ApplicationDbContext dbContext, ITextEmbeddingGenerationService textEmbeddingGenerationService, ChatService chatService, TokenizerService tokenizerService, TimeProvider timeProvider, IOptions<AppSettings> appSettingsOptions, ILogger<VectorSearchService> logger)
+public class VectorSearchService(IServiceProvider serviceProvider, ApplicationDbContext dbContext, ITextEmbeddingGenerationService textEmbeddingGenerationService, ChatService chatService, TokenizerService tokenizerService, TimeProvider timeProvider, IOptions<AppSettings> appSettingsOptions, ILogger<VectorSearchService> logger)
 {
     private readonly AppSettings appSettings = appSettingsOptions.Value;
 
-    public async Task<Guid> ImportAsync(Stream stream, string name, Guid? documentId)
+    public async Task<Guid> ImportAsync(Stream stream, string name, string contentType, Guid? documentId)
     {
-        // Extract the contents of the file (currently, only PDF files are supported).
-        var content = await GetContentAsync(stream);
+        // Extract the contents of the file.
+        var decoder = serviceProvider.GetRequiredKeyedService<IContentDecoder>(contentType);
+        var content = await decoder.DecodeAsync(stream, contentType);
 
         await dbContext.Database.BeginTransactionAsync();
 
@@ -126,20 +125,4 @@ public class VectorSearchService(ApplicationDbContext dbContext, ITextEmbeddingG
 
         return (reformulatedQuestion, chunks);
     }
-
-    private static Task<string> GetContentAsync(Stream stream)
-    {
-        var content = new StringBuilder();
-
-        // Read the content of the PDF document.
-        using var pdfDocument = PdfDocument.Open(stream);
-
-        foreach (var page in pdfDocument.GetPages().Where(x => x is not null))
-        {
-            var pageContent = ContentOrderTextExtractor.GetText(page) ?? string.Empty;
-            content.AppendLine(pageContent);
-        }
-
-        return Task.FromResult(content.ToString());
-    }
 }
\ No newline at end of file
diff --git a/SqlDatabaseVectorSearch/SqlDatabaseVectorSearch.csproj b/SqlDatabaseVectorSearch/SqlDatabaseVectorSearch.csproj
index 3c549ef..a337091 100644
--- a/SqlDatabaseVectorSearch/SqlDatabaseVectorSearch.csproj
+++ b/SqlDatabaseVectorSearch/SqlDatabaseVectorSearch.csproj
@@ -8,6 +8,7 @@
 	</PropertyGroup>
 
 	<ItemGroup>
+        <PackageReference Include="DocumentFormat.OpenXml" Version="3.2.0" />
         <PackageReference Include="EFCore.SqlServer.VectorSearch" Version="9.0.0-preview.2" />
         <PackageReference Include="EntityFrameworkCore.Exceptions.SqlServer" Version="8.1.3" />
         <PackageReference Include="Microsoft.AspNetCore.OpenApi" Version="9.0.1" />