Enhance content decoders and update dependencies

- Modified `DocxContentDecoder` to use `IServiceProvider` for text chunking and improved paragraph processing with page break handling.
- Updated `PdfContentDecoder` and `TextContentDecoder` to trim whitespace from text before splitting into paragraphs.
- Reordered service registrations in `Program.cs` while retaining existing functionality.
- Updated `SqlDatabaseVectorSearch.csproj` with new package versions for several dependencies, including `Microsoft.AspNetCore.OpenApi` and `Microsoft.EntityFrameworkCore`.
2026-06-20 12:23:10 +00:00 · 2025-06-11 17:20:56 +02:00
parent c9c5b74e75
commit cdbe2e3a91
5 changed files with 51 additions and 25 deletions
@@ -1,25 +1,51 @@
 using System.Text;
 using DocumentFormat.OpenXml.Packaging;
 using DocumentFormat.OpenXml.Wordprocessing;
+using SqlDatabaseVectorSearch.TextChunkers;

 namespace SqlDatabaseVectorSearch.ContentDecoders;

-public class DocxContentDecoder : IContentDecoder
+public class DocxContentDecoder(IServiceProvider serviceProvider) : IContentDecoder
 {
    public Task<IEnumerable<Chunk>> DecodeAsync(Stream stream, string contentType, CancellationToken cancellationToken = default)
    {
-        // Open a Word document for read-only access.
+        var textChunker = serviceProvider.GetRequiredKeyedService<ITextChunker>(contentType);
+
        using var document = WordprocessingDocument.Open(stream, false);
-
        var body = document.MainDocumentPart?.Document.Body;
-        var content = new StringBuilder();
-
-        var paragraphs = body?.Descendants<Paragraph>() ?? [];
-        foreach (var p in paragraphs)
+        if (body is null)
        {
-            content.AppendLine(p.InnerText);
+            return Task.FromResult(Enumerable.Empty<Chunk>());
        }

-        return Task.FromResult(new List<Chunk>([new(1, 0, content.ToString())]).AsEnumerable());
+        var pages = new List<string>();
+        var pageBuilder = new StringBuilder();
+
+        foreach (var paragraph in body.Descendants<Paragraph>())
+        {
+            // Note: this is just an attempt at counting pages, not 100% reliable
+            // see https://stackoverflow.com/questions/39992870/how-to-access-openxml-content-by-page-number
+            var lastRenderedPageBreak = paragraph.GetFirstChild<Run>()?.GetFirstChild<LastRenderedPageBreak>();
+            if (lastRenderedPageBreak is not null)
+            {
+                // Note: no trimming, use original spacing when working with pages
+                pages.Add(pageBuilder.ToString());
+                pageBuilder.Clear();
+            }
+
+            pageBuilder.AppendLine(paragraph.InnerText);
+        }
+
+        // After processing all paragraphs, add the last page (even if empty)
+        pages.Add(pageBuilder.ToString());
+
+        var chunks = new List<Chunk>();
+        foreach (var (pageIndex, pageText) in pages.Index())
+        {
+            var paragraphs = textChunker.Split(pageText.Trim());
+            chunks.AddRange(paragraphs.Where(p => !string.IsNullOrWhiteSpace(p)).Select((text, index) => new Chunk(pageIndex + 1, index, text)));
+        }
+
+        return Task.FromResult(chunks.AsEnumerable());
    }
 }
@@ -26,7 +26,7 @@ public class PdfContentDecoder(IServiceProvider serviceProvider) : IContentDecod
        var textBlocks = DocstrumBoundingBoxes.Instance.GetBlocks(words);
        var pageText = string.Join($"{Environment.NewLine}{Environment.NewLine}", textBlocks.Select(t => t.Text.ReplaceLineEndings(" ")));

-        var paragraphs = textChunker.Split(pageText);
+        var paragraphs = textChunker.Split(pageText.Trim());

        return paragraphs.Where(p => !string.IsNullOrWhiteSpace(p)).Select((text, index) => new Chunk(pdfPage.Number, index, text));
    }
@@ -11,7 +11,7 @@ public class TextContentDecoder(IServiceProvider serviceProvider) : IContentDeco
        using var readStream = new StreamReader(stream);
        var content = await readStream.ReadToEndAsync(cancellationToken);

-        var paragraphs = textChunker.Split(content);
+        var paragraphs = textChunker.Split(content.Trim());
        return paragraphs.Select((text, index) => new Chunk(null, index, text)).ToList();
    }
 }