From e0cf824dd66746a9a10a6c0d5317f4cc3a652717 Mon Sep 17 00:00:00 2001 From: Marco Minerva Date: Wed, 18 Jun 2025 14:45:08 +0200 Subject: [PATCH] Refactor document processing and embedding generation - Updated `DocxContentDecoder` to process Word documents as chunks of text, removing page tracking and enhancing content handling. - Modified `VectorSearchService.ImportAsync` to work with chunks, implementing batching for embedding generation. - Added `EmbeddingBatchSize` property to `AppSettings` for configurable batch processing. - Updated `appsettings.json` to include the new `EmbeddingBatchSize` setting for improved control over embedding processes. --- .../ContentDecoders/DocxContentDecoder.cs | 41 +++++-------------- .../Services/VectorSearchService.cs | 26 ++++++++---- .../Settings/AppSettings.cs | 2 + SqlDatabaseVectorSearch/appsettings.json | 1 + 4 files changed, 32 insertions(+), 38 deletions(-) diff --git a/SqlDatabaseVectorSearch/ContentDecoders/DocxContentDecoder.cs b/SqlDatabaseVectorSearch/ContentDecoders/DocxContentDecoder.cs index bdb934c..00579c4 100644 --- a/SqlDatabaseVectorSearch/ContentDecoders/DocxContentDecoder.cs +++ b/SqlDatabaseVectorSearch/ContentDecoders/DocxContentDecoder.cs @@ -11,41 +11,22 @@ public class DocxContentDecoder(IServiceProvider serviceProvider) : IContentDeco { var textChunker = serviceProvider.GetRequiredKeyedService(contentType); + // Open a Word document for read-only access. using var document = WordprocessingDocument.Open(stream, false); + var body = document.MainDocumentPart?.Document.Body; - if (body is null) + var content = new StringBuilder(); + + foreach (var p in body?.Descendants() ?? []) { - return Task.FromResult(Enumerable.Empty()); + content.AppendLine(p.InnerText); } - var pages = new List(); - var pageBuilder = new StringBuilder(); + var paragraphs = textChunker.Split(content.ToString().Trim()); - foreach (var paragraph in body.Descendants()) - { - // Note: this is just an attempt at counting pages, not 100% reliable - // see https://stackoverflow.com/questions/39992870/how-to-access-openxml-content-by-page-number - var lastRenderedPageBreak = paragraph.GetFirstChild()?.GetFirstChild(); - if (lastRenderedPageBreak is not null) - { - // Note: no trimming, use original spacing when working with pages - pages.Add(pageBuilder.ToString()); - pageBuilder.Clear(); - } - - pageBuilder.AppendLine(paragraph.InnerText); - } - - // After processing all paragraphs, add the last page (even if empty). - pages.Add(pageBuilder.ToString()); - - var chunks = new List(); - foreach (var (pageIndex, pageText) in pages.Index()) - { - var paragraphs = textChunker.Split(pageText.Trim()); - chunks.AddRange(paragraphs.Where(p => !string.IsNullOrWhiteSpace(p)).Select((text, index) => new Chunk(pageIndex + 1, index, text))); - } - - return Task.FromResult(chunks.AsEnumerable()); + // Pages do not exist in the OpenXML format until they are rendered by a word processor. + // See https://stackoverflow.com/questions/43700252/how-to-get-page-numbers-based-on-openxmlelement for more details. + // Therefore, we will not assign a page number. + return Task.FromResult(paragraphs.Select((text, index) => new Chunk(null, index, text)).ToList().AsEnumerable()); } } diff --git a/SqlDatabaseVectorSearch/Services/VectorSearchService.cs b/SqlDatabaseVectorSearch/Services/VectorSearchService.cs index 7611690..8133b79 100644 --- a/SqlDatabaseVectorSearch/Services/VectorSearchService.cs +++ b/SqlDatabaseVectorSearch/Services/VectorSearchService.cs @@ -22,10 +22,11 @@ public partial class VectorSearchService(IServiceProvider serviceProvider, Appli { // Extract the contents of the file. var decoder = serviceProvider.GetKeyedService(contentType) ?? throw new NotSupportedException($"Content type '{contentType}' is not supported."); - var paragraphs = await decoder.DecodeAsync(stream, contentType, cancellationToken); + var chunks = await decoder.DecodeAsync(stream, contentType, cancellationToken); + var chunkContents = chunks.Select(p => p.Content).ToList(); // We get the token count of the whole document because it is the total number of token used by embedding (it may be necessary, for example, for cost analysis). - var tokenCount = tokenizerService.CountEmbeddingTokens(string.Join(" ", paragraphs.Select(p => p.Content))); + var tokenCount = tokenizerService.CountEmbeddingTokens(string.Join(" ", chunkContents)); var strategy = dbContext.Database.CreateExecutionStrategy(); var document = await strategy.ExecuteAsync(async (cancellationToken) => @@ -41,21 +42,30 @@ public partial class VectorSearchService(IServiceProvider serviceProvider, Appli var document = new Entities.Document { Id = documentId.GetValueOrDefault(), Name = name, CreationDate = timeProvider.GetUtcNow() }; dbContext.Documents.Add(document); - var embeddings = await embeddingGenerator.GenerateAsync(paragraphs.Select(p => p.Content), cancellationToken: cancellationToken); + // Process paragraphs in batches. + var embeddings = new List>(); + foreach (var batch in chunkContents.Chunk(appSettings.EmbeddingBatchSize)) + { + logger.LogDebug("Processing batch of {Count} chunks for embedding generation...", batch.Length); + + // Generate embeddings for this batch. + var batchEmbeddings = await embeddingGenerator.GenerateAsync(batch, cancellationToken: cancellationToken); + embeddings.AddRange(batchEmbeddings); + } // Save the document chunks and the corresponding embedding in the database. foreach (var (index, embedding) in embeddings.Index()) { - var paragraph = paragraphs.ElementAt(index); - logger.LogDebug("Storing a paragraph of {TokenCount} tokens.", tokenizerService.CountChatCompletionTokens(paragraph.Content)); + var chunk = chunks.ElementAt(index); + logger.LogDebug("Storing a chunk of {TokenCount} tokens.", tokenizerService.CountChatCompletionTokens(chunk.Content)); var documentChunk = new Entities.DocumentChunk { Document = document, Index = index, - PageNumber = paragraph.PageNumber, - IndexOnPage = paragraph.IndexOnPage, - Content = paragraph.Content, + PageNumber = chunk.PageNumber, + IndexOnPage = chunk.IndexOnPage, + Content = chunk.Content, Embedding = embedding.Vector.ToArray() }; diff --git a/SqlDatabaseVectorSearch/Settings/AppSettings.cs b/SqlDatabaseVectorSearch/Settings/AppSettings.cs index a75a26e..538fed8 100644 --- a/SqlDatabaseVectorSearch/Settings/AppSettings.cs +++ b/SqlDatabaseVectorSearch/Settings/AppSettings.cs @@ -2,6 +2,8 @@ public class AppSettings { + public int EmbeddingBatchSize { get; init; } = 32; + public int MaxTokensPerLine { get; init; } = 300; public int MaxTokensPerParagraph { get; init; } = 1000; diff --git a/SqlDatabaseVectorSearch/appsettings.json b/SqlDatabaseVectorSearch/appsettings.json index 10a1967..16b1567 100644 --- a/SqlDatabaseVectorSearch/appsettings.json +++ b/SqlDatabaseVectorSearch/appsettings.json @@ -20,6 +20,7 @@ } }, "AppSettings": { + "EmbeddingBatchSize": 32, "MaxTokensPerLine": 300, "MaxTokensPerParagraph": 1000, "OverlapTokens": 100,