From cdbe2e3a9165458265958fe39c6832f0dbd0de0a Mon Sep 17 00:00:00 2001 From: Marco Minerva Date: Wed, 11 Jun 2025 17:20:56 +0200 Subject: [PATCH] Enhance content decoders and update dependencies - Modified `DocxContentDecoder` to use `IServiceProvider` for text chunking and improved paragraph processing with page break handling. - Updated `PdfContentDecoder` and `TextContentDecoder` to trim whitespace from text before splitting into paragraphs. - Reordered service registrations in `Program.cs` while retaining existing functionality. - Updated `SqlDatabaseVectorSearch.csproj` with new package versions for several dependencies, including `Microsoft.AspNetCore.OpenApi` and `Microsoft.EntityFrameworkCore`. --- .../ContentDecoders/DocxContentDecoder.cs | 44 +++++++++++++++---- .../ContentDecoders/PdfContentDecoder.cs | 2 +- .../ContentDecoders/TextContentDecoder.cs | 2 +- SqlDatabaseVectorSearch/Program.cs | 12 ++--- .../SqlDatabaseVectorSearch.csproj | 16 +++---- 5 files changed, 51 insertions(+), 25 deletions(-) diff --git a/SqlDatabaseVectorSearch/ContentDecoders/DocxContentDecoder.cs b/SqlDatabaseVectorSearch/ContentDecoders/DocxContentDecoder.cs index d606fa1..3aa160b 100644 --- a/SqlDatabaseVectorSearch/ContentDecoders/DocxContentDecoder.cs +++ b/SqlDatabaseVectorSearch/ContentDecoders/DocxContentDecoder.cs @@ -1,25 +1,51 @@ using System.Text; using DocumentFormat.OpenXml.Packaging; using DocumentFormat.OpenXml.Wordprocessing; +using SqlDatabaseVectorSearch.TextChunkers; namespace SqlDatabaseVectorSearch.ContentDecoders; -public class DocxContentDecoder : IContentDecoder +public class DocxContentDecoder(IServiceProvider serviceProvider) : IContentDecoder { public Task> DecodeAsync(Stream stream, string contentType, CancellationToken cancellationToken = default) { - // Open a Word document for read-only access. + var textChunker = serviceProvider.GetRequiredKeyedService(contentType); + using var document = WordprocessingDocument.Open(stream, false); - var body = document.MainDocumentPart?.Document.Body; - var content = new StringBuilder(); - - var paragraphs = body?.Descendants() ?? []; - foreach (var p in paragraphs) + if (body is null) { - content.AppendLine(p.InnerText); + return Task.FromResult(Enumerable.Empty()); } - return Task.FromResult(new List([new(1, 0, content.ToString())]).AsEnumerable()); + var pages = new List(); + var pageBuilder = new StringBuilder(); + + foreach (var paragraph in body.Descendants()) + { + // Note: this is just an attempt at counting pages, not 100% reliable + // see https://stackoverflow.com/questions/39992870/how-to-access-openxml-content-by-page-number + var lastRenderedPageBreak = paragraph.GetFirstChild()?.GetFirstChild(); + if (lastRenderedPageBreak is not null) + { + // Note: no trimming, use original spacing when working with pages + pages.Add(pageBuilder.ToString()); + pageBuilder.Clear(); + } + + pageBuilder.AppendLine(paragraph.InnerText); + } + + // Dopo aver processato tutti i paragrafi, aggiungi l'ultima pagina (anche se vuota) + pages.Add(pageBuilder.ToString()); + + var chunks = new List(); + foreach (var (pageIndex, pageText) in pages.Index()) + { + var paragraphs = textChunker.Split(pageText.Trim()); + chunks.AddRange(paragraphs.Where(p => !string.IsNullOrWhiteSpace(p)).Select((text, index) => new Chunk(pageIndex + 1, index, text))); + } + + return Task.FromResult(chunks.AsEnumerable()); } } diff --git a/SqlDatabaseVectorSearch/ContentDecoders/PdfContentDecoder.cs b/SqlDatabaseVectorSearch/ContentDecoders/PdfContentDecoder.cs index c5cd0e9..00ee765 100644 --- a/SqlDatabaseVectorSearch/ContentDecoders/PdfContentDecoder.cs +++ b/SqlDatabaseVectorSearch/ContentDecoders/PdfContentDecoder.cs @@ -26,7 +26,7 @@ public class PdfContentDecoder(IServiceProvider serviceProvider) : IContentDecod var textBlocks = DocstrumBoundingBoxes.Instance.GetBlocks(words); var pageText = string.Join($"{Environment.NewLine}{Environment.NewLine}", textBlocks.Select(t => t.Text.ReplaceLineEndings(" "))); - var paragraphs = textChunker.Split(pageText); + var paragraphs = textChunker.Split(pageText.Trim()); return paragraphs.Where(p => !string.IsNullOrWhiteSpace(p)).Select((text, index) => new Chunk(pdfPage.Number, index, text)); } diff --git a/SqlDatabaseVectorSearch/ContentDecoders/TextContentDecoder.cs b/SqlDatabaseVectorSearch/ContentDecoders/TextContentDecoder.cs index 29e76f1..b6d7fb0 100644 --- a/SqlDatabaseVectorSearch/ContentDecoders/TextContentDecoder.cs +++ b/SqlDatabaseVectorSearch/ContentDecoders/TextContentDecoder.cs @@ -11,7 +11,7 @@ public class TextContentDecoder(IServiceProvider serviceProvider) : IContentDeco using var readStream = new StreamReader(stream); var content = await readStream.ReadToEndAsync(cancellationToken); - var paragraphs = textChunker.Split(content); + var paragraphs = textChunker.Split(content.Trim()); return paragraphs.Select((text, index) => new Chunk(null, index, text)).ToList(); } } diff --git a/SqlDatabaseVectorSearch/Program.cs b/SqlDatabaseVectorSearch/Program.cs index f9d2e78..dfda707 100644 --- a/SqlDatabaseVectorSearch/Program.cs +++ b/SqlDatabaseVectorSearch/Program.cs @@ -63,12 +63,6 @@ builder.Services.AddKernel() .AddAzureOpenAIEmbeddingGenerator(aiSettings.Embedding.Deployment, aiSettings.Embedding.Endpoint, aiSettings.Embedding.ApiKey, modelId: aiSettings.Embedding.ModelId, dimensions: aiSettings.Embedding.Dimensions) .AddAzureOpenAIChatCompletion(aiSettings.ChatCompletion.Deployment, aiSettings.ChatCompletion.Endpoint, aiSettings.ChatCompletion.ApiKey, modelId: aiSettings.ChatCompletion.ModelId); -builder.Services.AddSingleton(); -builder.Services.AddSingleton(); - -builder.Services.AddScoped(); -builder.Services.AddScoped(); - builder.Services.AddKeyedSingleton(MediaTypeNames.Application.Pdf); builder.Services.AddKeyedSingleton("application/vnd.openxmlformats-officedocument.wordprocessingml.document"); builder.Services.AddKeyedSingleton(MediaTypeNames.Text.Plain); @@ -77,6 +71,12 @@ builder.Services.AddKeyedSingleton(MediaTyp builder.Services.AddKeyedSingleton(KeyedService.AnyKey); builder.Services.AddKeyedSingleton(MediaTypeNames.Text.Markdown); +builder.Services.AddSingleton(); +builder.Services.AddSingleton(); + +builder.Services.AddScoped(); +builder.Services.AddScoped(); + builder.Services.AddOpenApi(options => { options.RemoveServerList(); diff --git a/SqlDatabaseVectorSearch/SqlDatabaseVectorSearch.csproj b/SqlDatabaseVectorSearch/SqlDatabaseVectorSearch.csproj index fbe0632..5b36cf1 100644 --- a/SqlDatabaseVectorSearch/SqlDatabaseVectorSearch.csproj +++ b/SqlDatabaseVectorSearch/SqlDatabaseVectorSearch.csproj @@ -13,24 +13,24 @@ - - - + + + all runtime; build; native; contentfiles; analyzers; buildtransitive - - + + - + - - + +