From 1e531e5ad61dc28ee355148b65e009c7bf8cc59d Mon Sep 17 00:00:00 2001 From: Marco Minerva Date: Tue, 27 May 2025 17:19:25 +0200 Subject: [PATCH] Filter out empty paragraphs in PdfContentDecoder Updated the paragraph processing to exclude empty or whitespace-only entries before creating Chunk objects, ensuring only meaningful text is included. --- SqlDatabaseVectorSearch/ContentDecoders/PdfContentDecoder.cs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/SqlDatabaseVectorSearch/ContentDecoders/PdfContentDecoder.cs b/SqlDatabaseVectorSearch/ContentDecoders/PdfContentDecoder.cs index 696b192..c5cd0e9 100644 --- a/SqlDatabaseVectorSearch/ContentDecoders/PdfContentDecoder.cs +++ b/SqlDatabaseVectorSearch/ContentDecoders/PdfContentDecoder.cs @@ -28,6 +28,6 @@ public class PdfContentDecoder(IServiceProvider serviceProvider) : IContentDecod var paragraphs = textChunker.Split(pageText); - return paragraphs.Select((text, index) => new Chunk(pdfPage.Number, index, text)); + return paragraphs.Where(p => !string.IsNullOrWhiteSpace(p)).Select((text, index) => new Chunk(pdfPage.Number, index, text)); } }