Add content decoding for PDF and DOCX files

- Added `using` statements in `Program.cs` for new content decoding.
- Registered new content decoder services in `builder.Services`.
- Modified `documentsApiGroup.MapPost` to pass `file.ContentType`.
- Refactored `VectorSearchService` to use `IServiceProvider` and handle content types.
- Added `DocumentFormat.OpenXml` package reference.
- Created `DocxContentDecoder` and `PdfContentDecoder` classes.
- Created `IContentDecoder` interface.
2026-06-20 12:23:10 +00:00 · 2025-01-29 09:43:22 +01:00
parent f15f387510
commit 110e21e1e0
6 changed files with 104 additions and 24 deletions
@@ -0,0 +1,61 @@
+using System.Text;
+using DocumentFormat.OpenXml.Packaging;
+using DocumentFormat.OpenXml.Wordprocessing;
+
+namespace SqlDatabaseVectorSearch.ContentDecoders;
+
+/// <summary>
+/// Extracts the plain text of a Word (DOCX) document using the Open XML SDK.
+/// </summary>
+public class DocxContentDecoder : IContentDecoder
+{
+    /// <summary>
+    /// Reads the Word document in <paramref name="stream"/> and returns the text of its paragraphs,
+    /// each followed by a line terminator.
+    /// </summary>
+    /// <param name="stream">A stream containing the DOCX package. It is opened for read-only access.</param>
+    /// <param name="contentType">The content type of the file. This decoder does not use it.</param>
+    /// <returns>
+    /// An already completed task whose result is the extracted text, or an empty string when the
+    /// package has no main document part, root element or body.
+    /// </returns>
+    /// <exception cref="ArgumentNullException"><paramref name="stream"/> is <see langword="null"/>.</exception>
+    public Task<string> DecodeAsync(Stream stream, string contentType)
+    {
+        ArgumentNullException.ThrowIfNull(stream);
+
+        // Open the Word document for read-only access.
+        using var document = WordprocessingDocument.Open(stream, false);
+
+        // Every link of the chain is treated as optional so that a package without a body
+        // produces an empty result instead of a NullReferenceException.
+        var body = document.MainDocumentPart?.Document?.Body;
+        if (body is null)
+        {
+            return Task.FromResult(string.Empty);
+        }
+
+        var content = new StringBuilder();
+
+        // Descendants (rather than direct children) also reaches paragraphs nested in other
+        // elements, such as table cells.
+        // NOTE(review): a paragraph nested inside another paragraph (e.g. in a text box) is part of
+        // the outer InnerText and is then visited again on its own, so its text may appear twice —
+        // confirm whether that matters for indexing.
+        foreach (var paragraph in body.Descendants<Paragraph>())
+        {
+            content.AppendLine(paragraph.InnerText);
+        }
+
+        return Task.FromResult(content.ToString());
+    }
+}
@@ -0,0 +1,6 @@
+namespace SqlDatabaseVectorSearch.ContentDecoders;
+
+/// <summary>
+/// Contract for components that decode the content of a document stream into text.
+/// </summary>
+public interface IContentDecoder
+{
+    /// <summary>
+    /// Decodes the document held by <paramref name="stream"/> into a string.
+    /// </summary>
+    /// <param name="stream">The stream holding the document to decode.</param>
+    /// <param name="contentType">
+    /// The content type of the document; presumably lets an implementation distinguish
+    /// between formats — TODO confirm against the callers.
+    /// </param>
+    /// <returns>A task whose result is the decoded string.</returns>
+    Task<string> DecodeAsync(Stream stream, string contentType);
+}
@@ -0,0 +1,24 @@
+using System.Text;
+using UglyToad.PdfPig;
+using UglyToad.PdfPig.DocumentLayoutAnalysis.TextExtractor;
+
+namespace SqlDatabaseVectorSearch.ContentDecoders;
+
+/// <summary>
+/// Extracts the text of a PDF document using PdfPig.
+/// </summary>
+public class PdfContentDecoder : IContentDecoder
+{
+    /// <summary>
+    /// Opens the PDF in <paramref name="stream"/> and concatenates the text of its pages,
+    /// each page followed by a line terminator.
+    /// </summary>
+    /// <param name="stream">A stream containing the PDF document.</param>
+    /// <param name="contentType">The content type of the file. This decoder does not use it.</param>
+    /// <returns>An already completed task whose result is the extracted text.</returns>
+    public Task<string> DecodeAsync(Stream stream, string contentType)
+    {
+        // Load the PDF; the document is disposed when the method exits.
+        using var pdf = PdfDocument.Open(stream);
+
+        var text = new StringBuilder();
+        foreach (var page in pdf.GetPages())
+        {
+            // Null entries in the page sequence are skipped.
+            if (page is null)
+            {
+                continue;
+            }
+
+            // A null extraction result still contributes an (empty) line for the page.
+            var pageText = ContentOrderTextExtractor.GetText(page);
+            text.AppendLine(pageText ?? string.Empty);
+        }
+
+        var decoded = text.ToString();
+        return Task.FromResult(decoded);
+    }
+}