mirror of
https://github.com/marcominerva/SqlDatabaseVectorSearch.git
synced 2026-06-20 12:23:10 +00:00
110e21e1e0
- Added `using` statements in `Program.cs` for new content decoding. - Registered new content decoder services in `builder.Services`. - Modified `documentsApiGroup.MapPost` to pass `file.ContentType`. - Refactored `VectorSearchService` to use `IServiceProvider` and handle content types. - Added `DocumentFormat.OpenXml` package reference. - Created `DocxContentDecoder` and `PdfContentDecoder` classes. - Created `IContentDecoder` interface.
25 lines
730 B
C#
25 lines
730 B
C#
using System.Text;
|
|
using UglyToad.PdfPig;
|
|
using UglyToad.PdfPig.DocumentLayoutAnalysis.TextExtractor;
|
|
|
|
namespace SqlDatabaseVectorSearch.ContentDecoders;
|
|
|
|
/// <summary>
/// Content decoder that extracts the text of PDF documents using PdfPig.
/// </summary>
public class PdfContentDecoder : IContentDecoder
{
    /// <summary>
    /// Reads a PDF document from <paramref name="stream"/> and returns the text of all its pages.
    /// </summary>
    /// <param name="stream">The stream that contains the PDF document.</param>
    /// <param name="contentType">The content type of the document. This decoder does not read it.</param>
    /// <returns>
    /// A task whose result is the extracted text, with the text of each page followed by a line terminator.
    /// </returns>
    /// <exception cref="ArgumentNullException"><paramref name="stream"/> is <see langword="null"/>.</exception>
    public async Task<string> DecodeAsync(Stream stream, string contentType)
    {
        ArgumentNullException.ThrowIfNull(stream);

        // Seekable streams are handed to PdfPig directly, exactly as before.
        if (stream.CanSeek)
        {
            return ExtractText(stream);
        }

        // PdfPig is documented to require a seekable stream, so a forward-only stream is copied
        // into memory first. The copy is asynchronous so that no blocking read is issued on the source.
        using var buffer = new MemoryStream();
        await stream.CopyToAsync(buffer);
        buffer.Position = 0;

        return ExtractText(buffer);
    }

    // Opens the PDF document and concatenates the text of every page, one page per line group.
    private static string ExtractText(Stream pdfStream)
    {
        // Read the content of the PDF document.
        using var pdfDocument = PdfDocument.Open(pdfStream);

        var content = new StringBuilder();

        foreach (var page in pdfDocument.GetPages())
        {
            // AppendLine always writes the line terminator, even when a page has no text.
            content.AppendLine(ContentOrderTextExtractor.GetText(page));
        }

        return content.ToString();
    }
}
|