mirror of
https://github.com/marcominerva/SqlDatabaseVectorSearch.git
synced 2026-06-20 12:23:10 +00:00
Add content decoding for PDF and DOCX files
- Added `using` statements in `Program.cs` for new content decoding.
- Registered new content decoder services in `builder.Services`.
- Modified `documentsApiGroup.MapPost` to pass `file.ContentType`.
- Refactored `VectorSearchService` to use `IServiceProvider` and handle content types.
- Added `DocumentFormat.OpenXml` package reference.
- Created `DocxContentDecoder` and `PdfContentDecoder` classes.
- Created `IContentDecoder` interface.
This commit is contained in:
@@ -0,0 +1,61 @@
|
||||
using System.Text;
|
||||
using DocumentFormat.OpenXml.Packaging;
|
||||
using DocumentFormat.OpenXml.Wordprocessing;
|
||||
|
||||
namespace SqlDatabaseVectorSearch.ContentDecoders;
|
||||
|
||||
/// <summary>
/// <see cref="IContentDecoder"/> implementation that extracts plain text from
/// a Word (DOCX) document using the Open XML SDK.
/// </summary>
public class DocxContentDecoder : IContentDecoder
{
    /// <summary>
    /// Reads the paragraphs of the document body and returns their text,
    /// one paragraph per line.
    /// </summary>
    /// <param name="stream">The stream containing the DOCX package.</param>
    /// <param name="contentType">The content type of the uploaded file. Not used by this decoder.</param>
    /// <returns>
    /// A completed task whose result is the concatenated paragraph text, or an
    /// empty string when the document has no main part or no body.
    /// </returns>
    /// <exception cref="ArgumentNullException"><paramref name="stream"/> is <see langword="null"/>.</exception>
    public Task<string> DecodeAsync(Stream stream, string contentType)
    {
        // Fail fast with a clear parameter name before handing the stream to the SDK.
        ArgumentNullException.ThrowIfNull(stream);

        // Open the Word document for read-only access.
        using var document = WordprocessingDocument.Open(stream, false);

        var body = document.MainDocumentPart?.Document.Body;
        if (body is null)
        {
            // Nothing to decode: same result the empty-sequence fallback used to produce.
            return Task.FromResult(string.Empty);
        }

        var content = new StringBuilder();

        // Descendants (rather than Elements) walks the whole body tree, so
        // paragraphs that are not direct children of the body are included too.
        foreach (var paragraph in body.Descendants<Paragraph>())
        {
            content.AppendLine(paragraph.InnerText);
        }

        return Task.FromResult(content.ToString());
    }
}
|
||||
@@ -0,0 +1,6 @@
|
||||
namespace SqlDatabaseVectorSearch.ContentDecoders;
|
||||
|
||||
/// <summary>
/// Contract for components that turn the raw bytes of an uploaded document
/// into plain text.
/// </summary>
public interface IContentDecoder
{
    /// <summary>
    /// Decodes the document contained in <paramref name="stream"/> into text.
    /// </summary>
    /// <param name="stream">The stream containing the document to decode.</param>
    /// <param name="contentType">The content type associated with the document.</param>
    /// <returns>A task whose result is the decoded text.</returns>
    Task<string> DecodeAsync(Stream stream, string contentType);
}
|
||||
@@ -0,0 +1,24 @@
|
||||
using System.Text;
|
||||
using UglyToad.PdfPig;
|
||||
using UglyToad.PdfPig.DocumentLayoutAnalysis.TextExtractor;
|
||||
|
||||
namespace SqlDatabaseVectorSearch.ContentDecoders;
|
||||
|
||||
/// <summary>
/// <see cref="IContentDecoder"/> implementation that extracts text from a PDF
/// document using PdfPig.
/// </summary>
public class PdfContentDecoder : IContentDecoder
{
    /// <summary>
    /// Extracts the text of every page and returns it with one line break
    /// appended after each page.
    /// </summary>
    /// <param name="stream">The stream containing the PDF document.</param>
    /// <param name="contentType">The content type of the uploaded file. Not used by this decoder.</param>
    /// <returns>A completed task whose result is the concatenated page text.</returns>
    public Task<string> DecodeAsync(Stream stream, string contentType)
    {
        // Open the PDF document; it is disposed when the method returns.
        using var pdf = PdfDocument.Open(stream);

        // Lazily project each page to its text; a missing result becomes an empty string.
        var pageTexts = pdf.GetPages()
            .Where(page => page is not null)
            .Select(page => ContentOrderTextExtractor.GetText(page) ?? string.Empty);

        var text = new StringBuilder();
        foreach (var pageText in pageTexts)
        {
            text.AppendLine(pageText);
        }

        return Task.FromResult(text.ToString());
    }
}
|
||||
Reference in New Issue
Block a user