Add content decoding for PDF and DOCX files

- Added `using` statements in `Program.cs` for new content decoding.
- Registered new content decoder services in `builder.Services`.
- Modified `documentsApiGroup.MapPost` to pass `file.ContentType`.
- Refactored `VectorSearchService` to use `IServiceProvider` and handle content types.
- Added `DocumentFormat.OpenXml` package reference.
- Created `DocxContentDecoder` and `PdfContentDecoder` classes.
- Created `IContentDecoder` interface.
This commit is contained in:
Marco Minerva
2025-01-29 09:43:22 +01:00
parent f15f387510
commit 110e21e1e0
6 changed files with 104 additions and 24 deletions
@@ -0,0 +1,61 @@
using System.Text;
using DocumentFormat.OpenXml.Packaging;
using DocumentFormat.OpenXml.Wordprocessing;
namespace SqlDatabaseVectorSearch.ContentDecoders;
/// <summary>
/// Extracts plain text from a Word (.docx) document using the Open XML SDK.
/// </summary>
public class DocxContentDecoder : IContentDecoder
{
    /// <summary>
    /// Reads every paragraph found under the document body and returns their text, one paragraph per line.
    /// </summary>
    /// <param name="stream">The stream containing the Word document. It is opened for read-only access.</param>
    /// <param name="contentType">The media type of the source file. Not used by this decoder.</param>
    /// <returns>
    /// A completed task holding the extracted text. The text is empty when the document
    /// has no main part, no root document element or no body.
    /// </returns>
    public Task<string> DecodeAsync(Stream stream, string contentType)
    {
        // Open the Word document for read-only access (second argument: isEditable = false).
        using var document = WordprocessingDocument.Open(stream, false);

        // Any link of the chain may be missing in a malformed or empty package:
        // fall back to an empty sequence instead of failing with a NullReferenceException.
        var body = document.MainDocumentPart?.Document?.Body;
        var paragraphs = body?.Descendants<Paragraph>() ?? [];

        var content = new StringBuilder();
        foreach (var paragraph in paragraphs)
        {
            // InnerText concatenates the text of all the descendants of the paragraph.
            content.AppendLine(paragraph.InnerText);
        }

        return Task.FromResult(content.ToString());
    }
}
@@ -0,0 +1,6 @@
namespace SqlDatabaseVectorSearch.ContentDecoders;
/// <summary>
/// Defines a decoder that turns the content of a file into plain text.
/// </summary>
public interface IContentDecoder
{
/// <summary>
/// Decodes the content read from <paramref name="stream"/> and returns it as text.
/// </summary>
/// <param name="stream">The stream that contains the file content to decode.</param>
/// <param name="contentType">The content type of the file, as supplied by the caller.</param>
/// <returns>A task whose result is the decoded text.</returns>
Task<string> DecodeAsync(Stream stream, string contentType);
}
@@ -0,0 +1,24 @@
using System.Text;
using UglyToad.PdfPig;
using UglyToad.PdfPig.DocumentLayoutAnalysis.TextExtractor;
namespace SqlDatabaseVectorSearch.ContentDecoders;
/// <summary>
/// Extracts plain text from a PDF document using PdfPig.
/// </summary>
public class PdfContentDecoder : IContentDecoder
{
    /// <summary>
    /// Walks the pages of the PDF document and returns their text, one page after the other.
    /// </summary>
    /// <param name="stream">The stream containing the PDF document.</param>
    /// <param name="contentType">The media type of the source file. Not used by this decoder.</param>
    /// <returns>A completed task holding the text of all the pages, each followed by a line break.</returns>
    public Task<string> DecodeAsync(Stream stream, string contentType)
    {
        var text = new StringBuilder();

        using var pdf = PdfDocument.Open(stream);
        foreach (var page in pdf.GetPages())
        {
            // Null pages are ignored.
            if (page is null)
            {
                continue;
            }

            // Text is extracted by the content-order extractor; a null result counts as an empty page.
            text.AppendLine(ContentOrderTextExtractor.GetText(page) ?? string.Empty);
        }

        return Task.FromResult(text.ToString());
    }
}
+6 -1
View File
@@ -1,8 +1,10 @@
using System.ComponentModel; using System.ComponentModel;
using System.Net.Mime;
using System.Text.Json.Serialization; using System.Text.Json.Serialization;
using Microsoft.AspNetCore.Http.HttpResults; using Microsoft.AspNetCore.Http.HttpResults;
using Microsoft.EntityFrameworkCore; using Microsoft.EntityFrameworkCore;
using Microsoft.SemanticKernel; using Microsoft.SemanticKernel;
using SqlDatabaseVectorSearch.ContentDecoders;
using SqlDatabaseVectorSearch.DataAccessLayer; using SqlDatabaseVectorSearch.DataAccessLayer;
using SqlDatabaseVectorSearch.Models; using SqlDatabaseVectorSearch.Models;
using SqlDatabaseVectorSearch.Services; using SqlDatabaseVectorSearch.Services;
@@ -50,6 +52,9 @@ builder.Services.AddSingleton<TokenizerService>();
builder.Services.AddSingleton<ChatService>(); builder.Services.AddSingleton<ChatService>();
builder.Services.AddScoped<VectorSearchService>(); builder.Services.AddScoped<VectorSearchService>();
builder.Services.AddKeyedSingleton<IContentDecoder, PdfContentDecoder>(MediaTypeNames.Application.Pdf);
builder.Services.AddKeyedSingleton<IContentDecoder, DocxContentDecoder>("application/vnd.openxmlformats-officedocument.wordprocessingml.document");
builder.Services.ConfigureHttpJsonOptions(options => builder.Services.ConfigureHttpJsonOptions(options =>
{ {
options.SerializerOptions.Converters.Add(new JsonStringEnumConverter()); options.SerializerOptions.Converters.Add(new JsonStringEnumConverter());
@@ -113,7 +118,7 @@ documentsApiGroup.MapPost(string.Empty, async (IFormFile file, VectorSearchServi
[Description("The unique identifier of the document. If not provided, a new one will be generated. If you specify an existing documentId, the corresponding document will be overwritten.")] Guid? documentId = null) => [Description("The unique identifier of the document. If not provided, a new one will be generated. If you specify an existing documentId, the corresponding document will be overwritten.")] Guid? documentId = null) =>
{ {
using var stream = file.OpenReadStream(); using var stream = file.OpenReadStream();
documentId = await vectorSearchService.ImportAsync(stream, file.FileName, documentId); documentId = await vectorSearchService.ImportAsync(stream, file.FileName, file.ContentType, documentId);
return TypedResults.Ok(new UploadDocumentResponse(documentId.Value)); return TypedResults.Ok(new UploadDocumentResponse(documentId.Value));
}) })
@@ -1,26 +1,25 @@
using System.Data; using System.Data;
using System.Text;
using Microsoft.EntityFrameworkCore; using Microsoft.EntityFrameworkCore;
using Microsoft.Extensions.Options; using Microsoft.Extensions.Options;
using Microsoft.SemanticKernel.Embeddings; using Microsoft.SemanticKernel.Embeddings;
using Microsoft.SemanticKernel.Text; using Microsoft.SemanticKernel.Text;
using SqlDatabaseVectorSearch.ContentDecoders;
using SqlDatabaseVectorSearch.DataAccessLayer; using SqlDatabaseVectorSearch.DataAccessLayer;
using SqlDatabaseVectorSearch.Models; using SqlDatabaseVectorSearch.Models;
using SqlDatabaseVectorSearch.Settings; using SqlDatabaseVectorSearch.Settings;
using UglyToad.PdfPig;
using UglyToad.PdfPig.DocumentLayoutAnalysis.TextExtractor;
using Entities = SqlDatabaseVectorSearch.DataAccessLayer.Entities; using Entities = SqlDatabaseVectorSearch.DataAccessLayer.Entities;
namespace SqlDatabaseVectorSearch.Services; namespace SqlDatabaseVectorSearch.Services;
public class VectorSearchService(ApplicationDbContext dbContext, ITextEmbeddingGenerationService textEmbeddingGenerationService, ChatService chatService, TokenizerService tokenizerService, TimeProvider timeProvider, IOptions<AppSettings> appSettingsOptions, ILogger<VectorSearchService> logger) public class VectorSearchService(IServiceProvider serviceProvider, ApplicationDbContext dbContext, ITextEmbeddingGenerationService textEmbeddingGenerationService, ChatService chatService, TokenizerService tokenizerService, TimeProvider timeProvider, IOptions<AppSettings> appSettingsOptions, ILogger<VectorSearchService> logger)
{ {
private readonly AppSettings appSettings = appSettingsOptions.Value; private readonly AppSettings appSettings = appSettingsOptions.Value;
public async Task<Guid> ImportAsync(Stream stream, string name, Guid? documentId) public async Task<Guid> ImportAsync(Stream stream, string name, string contentType, Guid? documentId)
{ {
// Extract the contents of the file (currently, only PDF files are supported). // Extract the contents of the file.
var content = await GetContentAsync(stream); var decoder = serviceProvider.GetRequiredKeyedService<IContentDecoder>(contentType);
var content = await decoder.DecodeAsync(stream, contentType);
await dbContext.Database.BeginTransactionAsync(); await dbContext.Database.BeginTransactionAsync();
@@ -126,20 +125,4 @@ public class VectorSearchService(ApplicationDbContext dbContext, ITextEmbeddingG
return (reformulatedQuestion, chunks); return (reformulatedQuestion, chunks);
} }
private static Task<string> GetContentAsync(Stream stream)
{
var content = new StringBuilder();
// Read the content of the PDF document.
using var pdfDocument = PdfDocument.Open(stream);
foreach (var page in pdfDocument.GetPages().Where(x => x is not null))
{
var pageContent = ContentOrderTextExtractor.GetText(page) ?? string.Empty;
content.AppendLine(pageContent);
}
return Task.FromResult(content.ToString());
}
} }
@@ -8,6 +8,7 @@
</PropertyGroup> </PropertyGroup>
<ItemGroup> <ItemGroup>
<PackageReference Include="DocumentFormat.OpenXml" Version="3.2.0" />
<PackageReference Include="EFCore.SqlServer.VectorSearch" Version="9.0.0-preview.2" /> <PackageReference Include="EFCore.SqlServer.VectorSearch" Version="9.0.0-preview.2" />
<PackageReference Include="EntityFrameworkCore.Exceptions.SqlServer" Version="8.1.3" /> <PackageReference Include="EntityFrameworkCore.Exceptions.SqlServer" Version="8.1.3" />
<PackageReference Include="Microsoft.AspNetCore.OpenApi" Version="9.0.1" /> <PackageReference Include="Microsoft.AspNetCore.OpenApi" Version="9.0.1" />