mirror of
https://github.com/marcominerva/SqlDatabaseVectorSearch.git
synced 2026-06-20 12:23:10 +00:00
Filter out empty paragraphs in PdfContentDecoder
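Updated the paragraph processing to exclude null, empty, or whitespace-only entries before creating Chunk objects, ensuring only meaningful text is included. Because the filter is applied before the indexed projection, chunk indices are assigned after filtering and remain contiguous.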
This commit is contained in:
@@ -28,6 +28,6 @@ public class PdfContentDecoder(IServiceProvider serviceProvider) : IContentDecod
|
|||||||
|
|
||||||
var paragraphs = textChunker.Split(pageText);
|
var paragraphs = textChunker.Split(pageText);
|
||||||
|
|
||||||
return paragraphs.Select((text, index) => new Chunk(pdfPage.Number, index, text));
|
return paragraphs.Where(p => !string.IsNullOrWhiteSpace(p)).Select((text, index) => new Chunk(pdfPage.Number, index, text));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
Reference in New Issue
Block a user