mirror of
https://github.com/marcominerva/SqlDatabaseVectorSearch.git
synced 2026-06-20 12:23:10 +00:00
Refactor content decoders and restructure data layer
Updated `DocxContentDecoder`, `PdfContentDecoder`, and `TextContentDecoder` to return `Task<IEnumerable<Chunk>>` instead of `Task<string>`, introducing a new `Chunk` record for structured output. Restructured the `ApplicationDbContext`, `Document`, and `DocumentChunk` classes by moving them to the `SqlDatabaseVectorSearch.Data` namespace for better organization. Updated database migration files to align with the new entity structure and modified references in `Program.cs`, `DocumentService.cs`, and `VectorSearchService.cs` to use the new namespace.
This commit is contained in:
@@ -6,7 +6,7 @@ namespace SqlDatabaseVectorSearch.ContentDecoders;
|
||||
|
||||
public class DocxContentDecoder : IContentDecoder
|
||||
{
|
||||
public Task<string> DecodeAsync(Stream stream, string contentType, CancellationToken cancellationToken = default)
|
||||
public Task<IEnumerable<Chunk>> DecodeAsync(Stream stream, string contentType, CancellationToken cancellationToken = default)
|
||||
{
|
||||
// Open a Word document for read-only access.
|
||||
using var document = WordprocessingDocument.Open(stream, false);
|
||||
@@ -20,6 +20,6 @@ public class DocxContentDecoder : IContentDecoder
|
||||
content.AppendLine(p.InnerText);
|
||||
}
|
||||
|
||||
return Task.FromResult(content.ToString());
|
||||
return Task.FromResult(new List<Chunk>([new(1, 0, content.ToString())]).AsEnumerable());
|
||||
}
|
||||
}
|
||||
|
||||
@@ -2,5 +2,7 @@
|
||||
|
||||
public interface IContentDecoder
|
||||
{
|
||||
Task<string> DecodeAsync(Stream stream, string contentType, CancellationToken cancellationToken = default);
|
||||
Task<IEnumerable<Chunk>> DecodeAsync(Stream stream, string contentType, CancellationToken cancellationToken = default);
|
||||
}
|
||||
|
||||
/// <summary>
/// A piece of decoded document text together with its position in the source document.
/// </summary>
/// <param name="PageNumber">Page the text came from. The PDF decoder passes the PdfPig page number; the DOCX and plain-text decoders always pass 1.</param>
/// <param name="IndexOnPage">Zero-based position of this chunk among the chunks produced for the same page.</param>
/// <param name="Content">The text of the chunk.</param>
public record class Chunk(int PageNumber, int IndexOnPage, string Content);
|
||||
@@ -1,24 +1,33 @@
|
||||
using System.Text;
|
||||
using SqlDatabaseVectorSearch.TextChunkers;
|
||||
using UglyToad.PdfPig;
|
||||
using UglyToad.PdfPig.DocumentLayoutAnalysis.TextExtractor;
|
||||
using UglyToad.PdfPig.Content;
|
||||
using UglyToad.PdfPig.DocumentLayoutAnalysis.PageSegmenter;
|
||||
using UglyToad.PdfPig.DocumentLayoutAnalysis.WordExtractor;
|
||||
|
||||
namespace SqlDatabaseVectorSearch.ContentDecoders;
|
||||
|
||||
public class PdfContentDecoder : IContentDecoder
|
||||
public class PdfContentDecoder(IServiceProvider serviceProvider) : IContentDecoder
|
||||
{
|
||||
public Task<string> DecodeAsync(Stream stream, string contentType, CancellationToken cancellationToken = default)
|
||||
public Task<IEnumerable<Chunk>> DecodeAsync(Stream stream, string contentType, CancellationToken cancellationToken = default)
|
||||
{
|
||||
var content = new StringBuilder();
|
||||
var textChunker = serviceProvider.GetRequiredKeyedService<ITextChunker>(contentType);
|
||||
|
||||
// Read the content of the PDF document.
|
||||
using var pdfDocument = PdfDocument.Open(stream);
|
||||
var paragraphs = pdfDocument.GetPages().SelectMany(page => GetPageParagraphs(page, textChunker)).ToList();
|
||||
|
||||
foreach (var page in pdfDocument.GetPages().Where(x => x is not null))
|
||||
{
|
||||
var pageContent = ContentOrderTextExtractor.GetText(page) ?? string.Empty;
|
||||
content.AppendLine(pageContent);
|
||||
}
|
||||
return Task.FromResult(paragraphs.AsEnumerable());
|
||||
}
|
||||
|
||||
return Task.FromResult(content.ToString());
|
||||
/// <summary>
/// Extracts the text of a single PDF page and splits it into <see cref="Chunk"/> items.
/// </summary>
/// <param name="pdfPage">The PdfPig page to read letters from.</param>
/// <param name="textChunker">The chunker used to split the page text into pieces.</param>
/// <returns>One <see cref="Chunk"/> per piece returned by the chunker, tagged with the page number and the piece's zero-based index.</returns>
private static IEnumerable<Chunk> GetPageParagraphs(Page pdfPage, ITextChunker textChunker)
|
||||
{
|
||||
var letters = pdfPage.Letters;
|
||||
// Group the page's letters into words, then group the words into text blocks.
var words = NearestNeighbourWordExtractor.Instance.GetWords(letters);
|
||||
var textBlocks = DocstrumBoundingBoxes.Instance.GetBlocks(words);
|
||||
// Flatten line endings inside each block to a space and separate blocks with a blank line.
var pageText = string.Join($"{Environment.NewLine}{Environment.NewLine}", textBlocks.Select(t => t.Text.ReplaceLineEndings(" ")));
|
||||
|
||||
// Split the assembled page text with the supplied chunker.
var paragraphs = textChunker.Split(pageText);
|
||||
|
||||
// The Select index becomes IndexOnPage: the zero-based position of each piece on this page.
return paragraphs.Select((text, index) => new Chunk(pdfPage.Number, index, text));
|
||||
}
|
||||
}
|
||||
|
||||
@@ -2,11 +2,11 @@
|
||||
|
||||
public class TextContentDecoder : IContentDecoder
|
||||
{
|
||||
public async Task<string> DecodeAsync(Stream stream, string contentType, CancellationToken cancellationToken = default)
|
||||
public async Task<IEnumerable<Chunk>> DecodeAsync(Stream stream, string contentType, CancellationToken cancellationToken = default)
|
||||
{
|
||||
using var readStream = new StreamReader(stream);
|
||||
var content = await readStream.ReadToEndAsync(cancellationToken);
|
||||
|
||||
return content;
|
||||
return [new(1, 0, content)];
|
||||
}
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user