mirror of
https://github.com/marcominerva/SqlDatabaseVectorSearch.git
synced 2026-06-20 12:23:10 +00:00
cdbe2e3a91
- Modified `DocxContentDecoder` to use `IServiceProvider` for text chunking and improved paragraph processing with page break handling.
- Updated `PdfContentDecoder` and `TextContentDecoder` to trim whitespace from text before splitting into paragraphs.
- Reordered service registrations in `Program.cs` while retaining existing functionality.
- Updated `SqlDatabaseVectorSearch.csproj` with new package versions for several dependencies, including `Microsoft.AspNetCore.OpenApi` and `Microsoft.EntityFrameworkCore`.
34 lines
1.5 KiB
C#
34 lines
1.5 KiB
C#
using SqlDatabaseVectorSearch.TextChunkers;
|
|
using UglyToad.PdfPig;
|
|
using UglyToad.PdfPig.Content;
|
|
using UglyToad.PdfPig.DocumentLayoutAnalysis.PageSegmenter;
|
|
using UglyToad.PdfPig.DocumentLayoutAnalysis.WordExtractor;
|
|
|
|
namespace SqlDatabaseVectorSearch.ContentDecoders;
|
|
|
|
/// <summary>
/// <see cref="IContentDecoder"/> implementation that reads a PDF document with PdfPig and
/// turns the text of each page into <see cref="Chunk"/> instances.
/// </summary>
/// <param name="serviceProvider">
/// Provider used to resolve the keyed <see cref="ITextChunker"/> for the content type being decoded.
/// </param>
public class PdfContentDecoder(IServiceProvider serviceProvider) : IContentDecoder
{
    /// <summary>
    /// Opens the PDF document contained in <paramref name="stream"/> and collects the chunks of every page.
    /// </summary>
    /// <param name="stream">Stream passed to <c>PdfDocument.Open</c>.</param>
    /// <param name="contentType">Key used to resolve the <see cref="ITextChunker"/> from the service provider.</param>
    /// <param name="cancellationToken">Checked on entry and before each page is processed.</param>
    /// <returns>
    /// An already-completed task whose result contains the chunks of all pages, in the order the pages are enumerated.
    /// </returns>
    /// <exception cref="OperationCanceledException">
    /// Thrown when <paramref name="cancellationToken"/> is cancelled before decoding starts or between two pages.
    /// </exception>
    public Task<IEnumerable<Chunk>> DecodeAsync(Stream stream, string contentType, CancellationToken cancellationToken = default)
    {
        // Do not resolve services or open the document for a request that is already cancelled.
        cancellationToken.ThrowIfCancellationRequested();

        var textChunker = serviceProvider.GetRequiredKeyedService<ITextChunker>(contentType);

        // Read the content of the PDF document.
        using var pdfDocument = PdfDocument.Open(stream);

        // The chunks are collected eagerly because the document is disposed when this method returns,
        // so the pages cannot be left to a lazy enumeration performed by the caller.
        var chunks = new List<Chunk>();

        foreach (var page in pdfDocument.GetPages())
        {
            // The per-page work below is synchronous, so cancellation can only be observed between pages.
            cancellationToken.ThrowIfCancellationRequested();

            chunks.AddRange(GetPageParagraphs(page, textChunker));
        }

        return Task.FromResult<IEnumerable<Chunk>>(chunks);
    }

    /// <summary>
    /// Builds the text of a single page from its text blocks and splits it into chunks.
    /// </summary>
    /// <param name="pdfPage">Page whose letters are analysed; its <c>Number</c> is stored in every chunk.</param>
    /// <param name="textChunker">Chunker used to split the page text.</param>
    /// <returns>
    /// A lazily evaluated sequence with one <see cref="Chunk"/> per non-blank piece of text returned by the chunker.
    /// </returns>
    private static IEnumerable<Chunk> GetPageParagraphs(Page pdfPage, ITextChunker textChunker)
    {
        var letters = pdfPage.Letters;
        var words = NearestNeighbourWordExtractor.Instance.GetWords(letters);
        var textBlocks = DocstrumBoundingBoxes.Instance.GetBlocks(words);

        // Line endings inside a block are replaced with spaces, and blocks are joined with two
        // consecutive newlines, i.e. separated by an empty line.
        var pageText = string.Join($"{Environment.NewLine}{Environment.NewLine}", textBlocks.Select(t => t.Text.ReplaceLineEndings(" ")));

        var paragraphs = textChunker.Split(pageText.Trim());

        // Blank entries are removed before the index is assigned, so indexes are contiguous within the page.
        return paragraphs.Where(p => !string.IsNullOrWhiteSpace(p)).Select((text, index) => new Chunk(pdfPage.Number, index, text));
    }
}
|