From 1e531e5ad61dc28ee355148b65e009c7bf8cc59d Mon Sep 17 00:00:00 2001
From: Marco Minerva <marco.minerva@gmail.com>
Date: Tue, 27 May 2025 17:19:25 +0200
Subject: [PATCH] Filter out empty paragraphs in PdfContentDecoder

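Update the paragraph processing to exclude empty or whitespace-only entries before creating Chunk objects, ensuring only meaningful text is included. Because the filter runs before the indexed projection, the chunk indexes of the remaining paragraphs stay contiguous.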
---
 SqlDatabaseVectorSearch/ContentDecoders/PdfContentDecoder.cs | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/SqlDatabaseVectorSearch/ContentDecoders/PdfContentDecoder.cs b/SqlDatabaseVectorSearch/ContentDecoders/PdfContentDecoder.cs
index 696b192..c5cd0e9 100644
--- a/SqlDatabaseVectorSearch/ContentDecoders/PdfContentDecoder.cs
+++ b/SqlDatabaseVectorSearch/ContentDecoders/PdfContentDecoder.cs
@@ -28,6 +28,6 @@ public class PdfContentDecoder(IServiceProvider serviceProvider) : IContentDecod
 
         var paragraphs = textChunker.Split(pageText);
 
-        return paragraphs.Select((text, index) => new Chunk(pdfPage.Number, index, text));
+        return paragraphs.Where(p => !string.IsNullOrWhiteSpace(p)).Select((text, index) => new Chunk(pdfPage.Number, index, text));
     }
 }