Files
SqlDatabaseVectorSearch/SqlDatabaseVectorSearch/ContentDecoders/PdfContentDecoder.cs
T
Marco Minerva 110e21e1e0 Add content decoding for PDF and DOCX files
- Added `using` statements in `Program.cs` for new content decoding.
- Registered new content decoder services in `builder.Services`.
- Modified `documentsApiGroup.MapPost` to pass `file.ContentType`.
- Refactored `VectorSearchService` to use `IServiceProvider` and handle content types.
- Added `DocumentFormat.OpenXml` package reference.
- Created `DocxContentDecoder` and `PdfContentDecoder` classes.
- Created `IContentDecoder` interface.
2025-01-29 09:43:22 +01:00

25 lines
730 B
C#

using System.Text;
using UglyToad.PdfPig;
using UglyToad.PdfPig.DocumentLayoutAnalysis.TextExtractor;
namespace SqlDatabaseVectorSearch.ContentDecoders;
/// <summary>
/// <see cref="IContentDecoder"/> implementation that extracts the plain text of a PDF document using PdfPig.
/// </summary>
public class PdfContentDecoder : IContentDecoder
{
    /// <summary>
    /// Extracts the text of every page of the PDF contained in <paramref name="stream"/>.
    /// </summary>
    /// <param name="stream">
    /// The stream holding the PDF content. It is not disposed by this method.
    /// Non-seekable streams are copied into memory before parsing.
    /// </param>
    /// <param name="contentType">The content type of the document. Not used by this decoder.</param>
    /// <returns>The text of all pages, in page order, each page followed by a line terminator.</returns>
    /// <exception cref="ArgumentNullException"><paramref name="stream"/> is <see langword="null"/>.</exception>
    public async Task<string> DecodeAsync(Stream stream, string contentType)
    {
        ArgumentNullException.ThrowIfNull(stream);

        if (stream.CanSeek)
        {
            return ExtractText(stream);
        }

        // PdfPig's stream-based Open is documented as requiring a readable, seekable stream,
        // so forward-only streams are buffered asynchronously before being handed to the parser.
        using var buffer = new MemoryStream();
        await stream.CopyToAsync(buffer);
        buffer.Position = 0;

        return ExtractText(buffer);
    }

    /// <summary>
    /// Opens the PDF from <paramref name="seekableStream"/> and concatenates the text of its pages.
    /// </summary>
    private static string ExtractText(Stream seekableStream)
    {
        var content = new StringBuilder();

        // Read the content of the PDF document.
        using var pdfDocument = PdfDocument.Open(seekableStream);

        foreach (var page in pdfDocument.GetPages())
        {
            // AppendLine treats a null value as empty, so no extra null handling is needed here.
            content.AppendLine(ContentOrderTextExtractor.GetText(page));
        }

        return content.ToString();
    }
}