Files
SqlDatabaseVectorSearch/SqlDatabaseVectorSearch/ContentDecoders/PdfContentDecoder.cs
T
Marco Minerva cdbe2e3a91 Enhance content decoders and update dependencies
- Modified `DocxContentDecoder` to use `IServiceProvider` for text chunking and improved paragraph processing with page break handling.
- Updated `PdfContentDecoder` and `TextContentDecoder` to trim whitespace from text before splitting into paragraphs.
- Reordered service registrations in `Program.cs` while retaining existing functionality.
- Updated `SqlDatabaseVectorSearch.csproj` with new package versions for several dependencies, including `Microsoft.AspNetCore.OpenApi` and `Microsoft.EntityFrameworkCore`.
2025-06-11 17:20:56 +02:00

34 lines
1.5 KiB
C#

using SqlDatabaseVectorSearch.TextChunkers;
using UglyToad.PdfPig;
using UglyToad.PdfPig.Content;
using UglyToad.PdfPig.DocumentLayoutAnalysis.PageSegmenter;
using UglyToad.PdfPig.DocumentLayoutAnalysis.WordExtractor;
namespace SqlDatabaseVectorSearch.ContentDecoders;
/// <summary>
/// Content decoder that extracts the text of a PDF document and splits it into chunks, page by page.
/// </summary>
/// <param name="serviceProvider">
/// Service provider used to resolve the keyed <see cref="ITextChunker"/> registered for the content type being decoded.
/// </param>
public class PdfContentDecoder(IServiceProvider serviceProvider) : IContentDecoder
{
    /// <summary>
    /// Reads the PDF document contained in <paramref name="stream"/> and returns the text chunks of all its pages.
    /// </summary>
    /// <param name="stream">The stream that contains the PDF document.</param>
    /// <param name="contentType">The content type, used as the service key to resolve the <see cref="ITextChunker"/>.</param>
    /// <param name="cancellationToken">Token checked before decoding starts and before each page is processed.</param>
    /// <returns>A completed task whose result holds the chunks of every page, in page order.</returns>
    /// <exception cref="OperationCanceledException">Cancellation was requested via <paramref name="cancellationToken"/>.</exception>
    public Task<IEnumerable<Chunk>> DecodeAsync(Stream stream, string contentType, CancellationToken cancellationToken = default)
    {
        // Bail out before doing any work if the caller has already cancelled.
        cancellationToken.ThrowIfCancellationRequested();

        var textChunker = serviceProvider.GetRequiredKeyedService<ITextChunker>(contentType);

        // Read the content of the PDF document.
        using var pdfDocument = PdfDocument.Open(stream);

        // The chunks are collected eagerly: the per-page query is deferred, so it must be fully
        // enumerated before the using declaration disposes the document at the end of this method.
        var chunks = new List<Chunk>();
        foreach (var page in pdfDocument.GetPages())
        {
            // Layout analysis runs once per page, so this is the natural point to honor cancellation.
            cancellationToken.ThrowIfCancellationRequested();
            chunks.AddRange(GetPageParagraphs(page, textChunker));
        }

        return Task.FromResult<IEnumerable<Chunk>>(chunks);
    }

    /// <summary>
    /// Extracts the text of a single page and splits it into chunks.
    /// </summary>
    /// <param name="pdfPage">The page to process.</param>
    /// <param name="textChunker">The chunker used to split the page text.</param>
    /// <returns>
    /// A deferred sequence with one <see cref="Chunk"/> per non-blank piece of text, built from the page number,
    /// the position of the piece among the page's non-blank pieces, and the text itself.
    /// </returns>
    private static IEnumerable<Chunk> GetPageParagraphs(Page pdfPage, ITextChunker textChunker)
    {
        // Letters -> words -> text blocks, using PdfPig's layout analysis.
        var letters = pdfPage.Letters;
        var words = NearestNeighbourWordExtractor.Instance.GetWords(letters);
        var textBlocks = DocstrumBoundingBoxes.Instance.GetBlocks(words);

        // Line endings inside a block are replaced with a single space, and blocks are separated
        // by a blank line.
        var blockSeparator = $"{Environment.NewLine}{Environment.NewLine}";
        var pageText = string.Join(blockSeparator, textBlocks.Select(block => block.Text.ReplaceLineEndings(" ")));

        var paragraphs = textChunker.Split(pageText.Trim());

        // Blank pieces are dropped before numbering, so the index has no gaps.
        return paragraphs
            .Where(paragraph => !string.IsNullOrWhiteSpace(paragraph))
            .Select((text, index) => new Chunk(pdfPage.Number, index, text));
    }
}