mirror of
https://github.com/marcominerva/SqlDatabaseVectorSearch.git
synced 2026-06-20 12:23:10 +00:00
Refactor content decoders and restructure data layer
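Updated the `IContentDecoder` interface and its `DocxContentDecoder`, `PdfContentDecoder`, and `TextContentDecoder` implementations to return `Task<IEnumerable<Chunk>>` instead of `Task<string>`, introducing a new `Chunk` record (page number, index on page, content) for structured output. Text chunking now happens inside `PdfContentDecoder`, page by page, instead of in `VectorSearchService`; the DOCX and plain-text decoders return the whole content as a single chunk. Restructured the data layer for better organization by moving `ApplicationDbContext` to the `SqlDatabaseVectorSearch.Data` namespace and the `Document` and `DocumentChunk` entities to `SqlDatabaseVectorSearch.Data.Entities`. Updated database migration files to align with the new namespaces and modified references in `Program.cs`, `DocumentService.cs`, and `VectorSearchService.cs` to use the new namespace.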
This commit is contained in:
@@ -6,7 +6,7 @@ namespace SqlDatabaseVectorSearch.ContentDecoders;
|
|||||||
|
|
||||||
public class DocxContentDecoder : IContentDecoder
|
public class DocxContentDecoder : IContentDecoder
|
||||||
{
|
{
|
||||||
public Task<string> DecodeAsync(Stream stream, string contentType, CancellationToken cancellationToken = default)
|
public Task<IEnumerable<Chunk>> DecodeAsync(Stream stream, string contentType, CancellationToken cancellationToken = default)
|
||||||
{
|
{
|
||||||
// Open a Word document for read-only access.
|
// Open a Word document for read-only access.
|
||||||
using var document = WordprocessingDocument.Open(stream, false);
|
using var document = WordprocessingDocument.Open(stream, false);
|
||||||
@@ -20,6 +20,6 @@ public class DocxContentDecoder : IContentDecoder
|
|||||||
content.AppendLine(p.InnerText);
|
content.AppendLine(p.InnerText);
|
||||||
}
|
}
|
||||||
|
|
||||||
return Task.FromResult(content.ToString());
|
return Task.FromResult(new List<Chunk>([new(1, 0, content.ToString())]).AsEnumerable());
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -2,5 +2,7 @@
|
|||||||
|
|
||||||
public interface IContentDecoder
|
public interface IContentDecoder
|
||||||
{
|
{
|
||||||
Task<string> DecodeAsync(Stream stream, string contentType, CancellationToken cancellationToken = default);
|
Task<IEnumerable<Chunk>> DecodeAsync(Stream stream, string contentType, CancellationToken cancellationToken = default);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public record class Chunk(int PageNumber, int IndexOnPage, string Content);
|
||||||
@@ -1,24 +1,33 @@
|
|||||||
using System.Text;
|
using SqlDatabaseVectorSearch.TextChunkers;
|
||||||
using UglyToad.PdfPig;
|
using UglyToad.PdfPig;
|
||||||
using UglyToad.PdfPig.DocumentLayoutAnalysis.TextExtractor;
|
using UglyToad.PdfPig.Content;
|
||||||
|
using UglyToad.PdfPig.DocumentLayoutAnalysis.PageSegmenter;
|
||||||
|
using UglyToad.PdfPig.DocumentLayoutAnalysis.WordExtractor;
|
||||||
|
|
||||||
namespace SqlDatabaseVectorSearch.ContentDecoders;
|
namespace SqlDatabaseVectorSearch.ContentDecoders;
|
||||||
|
|
||||||
public class PdfContentDecoder : IContentDecoder
|
public class PdfContentDecoder(IServiceProvider serviceProvider) : IContentDecoder
|
||||||
{
|
{
|
||||||
public Task<string> DecodeAsync(Stream stream, string contentType, CancellationToken cancellationToken = default)
|
public Task<IEnumerable<Chunk>> DecodeAsync(Stream stream, string contentType, CancellationToken cancellationToken = default)
|
||||||
{
|
{
|
||||||
var content = new StringBuilder();
|
var textChunker = serviceProvider.GetRequiredKeyedService<ITextChunker>(contentType);
|
||||||
|
|
||||||
// Read the content of the PDF document.
|
// Read the content of the PDF document.
|
||||||
using var pdfDocument = PdfDocument.Open(stream);
|
using var pdfDocument = PdfDocument.Open(stream);
|
||||||
|
var paragraphs = pdfDocument.GetPages().SelectMany(page => GetPageParagraphs(page, textChunker)).ToList();
|
||||||
|
|
||||||
foreach (var page in pdfDocument.GetPages().Where(x => x is not null))
|
return Task.FromResult(paragraphs.AsEnumerable());
|
||||||
{
|
}
|
||||||
var pageContent = ContentOrderTextExtractor.GetText(page) ?? string.Empty;
|
|
||||||
content.AppendLine(pageContent);
|
|
||||||
}
|
|
||||||
|
|
||||||
return Task.FromResult(content.ToString());
|
private static IEnumerable<Chunk> GetPageParagraphs(Page pdfPage, ITextChunker textChunker)
|
||||||
|
{
|
||||||
|
var letters = pdfPage.Letters;
|
||||||
|
var words = NearestNeighbourWordExtractor.Instance.GetWords(letters);
|
||||||
|
var textBlocks = DocstrumBoundingBoxes.Instance.GetBlocks(words);
|
||||||
|
var pageText = string.Join($"{Environment.NewLine}{Environment.NewLine}", textBlocks.Select(t => t.Text.ReplaceLineEndings(" ")));
|
||||||
|
|
||||||
|
var paragraphs = textChunker.Split(pageText);
|
||||||
|
|
||||||
|
return paragraphs.Select((text, index) => new Chunk(pdfPage.Number, index, text));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -2,11 +2,11 @@
|
|||||||
|
|
||||||
public class TextContentDecoder : IContentDecoder
|
public class TextContentDecoder : IContentDecoder
|
||||||
{
|
{
|
||||||
public async Task<string> DecodeAsync(Stream stream, string contentType, CancellationToken cancellationToken = default)
|
public async Task<IEnumerable<Chunk>> DecodeAsync(Stream stream, string contentType, CancellationToken cancellationToken = default)
|
||||||
{
|
{
|
||||||
using var readStream = new StreamReader(stream);
|
using var readStream = new StreamReader(stream);
|
||||||
var content = await readStream.ReadToEndAsync(cancellationToken);
|
var content = await readStream.ReadToEndAsync(cancellationToken);
|
||||||
|
|
||||||
return content;
|
return [new(1, 0, content)];
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
+2
-2
@@ -1,8 +1,8 @@
|
|||||||
using EntityFramework.Exceptions.SqlServer;
|
using EntityFramework.Exceptions.SqlServer;
|
||||||
using Microsoft.EntityFrameworkCore;
|
using Microsoft.EntityFrameworkCore;
|
||||||
using SqlDatabaseVectorSearch.DataAccessLayer.Entities;
|
using SqlDatabaseVectorSearch.Data.Entities;
|
||||||
|
|
||||||
namespace SqlDatabaseVectorSearch.DataAccessLayer;
|
namespace SqlDatabaseVectorSearch.Data;
|
||||||
|
|
||||||
public class ApplicationDbContext(DbContextOptions<ApplicationDbContext> options) : DbContext(options)
|
public class ApplicationDbContext(DbContextOptions<ApplicationDbContext> options) : DbContext(options)
|
||||||
{
|
{
|
||||||
+1
-1
@@ -1,4 +1,4 @@
|
|||||||
namespace SqlDatabaseVectorSearch.DataAccessLayer.Entities;
|
namespace SqlDatabaseVectorSearch.Data.Entities;
|
||||||
|
|
||||||
public class Document
|
public class Document
|
||||||
{
|
{
|
||||||
+1
-1
@@ -1,4 +1,4 @@
|
|||||||
namespace SqlDatabaseVectorSearch.DataAccessLayer.Entities;
|
namespace SqlDatabaseVectorSearch.Data.Entities;
|
||||||
|
|
||||||
public class DocumentChunk
|
public class DocumentChunk
|
||||||
{
|
{
|
||||||
+2
-2
@@ -5,11 +5,11 @@ using Microsoft.EntityFrameworkCore.Infrastructure;
|
|||||||
using Microsoft.EntityFrameworkCore.Metadata;
|
using Microsoft.EntityFrameworkCore.Metadata;
|
||||||
using Microsoft.EntityFrameworkCore.Migrations;
|
using Microsoft.EntityFrameworkCore.Migrations;
|
||||||
using Microsoft.EntityFrameworkCore.Storage.ValueConversion;
|
using Microsoft.EntityFrameworkCore.Storage.ValueConversion;
|
||||||
using SqlDatabaseVectorSearch.DataAccessLayer;
|
using SqlDatabaseVectorSearch.Data;
|
||||||
|
|
||||||
#nullable disable
|
#nullable disable
|
||||||
|
|
||||||
namespace SqlDatabaseVectorSearch.DataAccessLayer.Migrations
|
namespace SqlDatabaseVectorSearch.Data.Migrations
|
||||||
{
|
{
|
||||||
[DbContext(typeof(ApplicationDbContext))]
|
[DbContext(typeof(ApplicationDbContext))]
|
||||||
[Migration("20250224102351_Initial")]
|
[Migration("20250224102351_Initial")]
|
||||||
+1
-1
@@ -3,7 +3,7 @@ using Microsoft.EntityFrameworkCore.Migrations;
|
|||||||
|
|
||||||
#nullable disable
|
#nullable disable
|
||||||
|
|
||||||
namespace SqlDatabaseVectorSearch.DataAccessLayer.Migrations
|
namespace SqlDatabaseVectorSearch.Data.Migrations
|
||||||
{
|
{
|
||||||
/// <inheritdoc />
|
/// <inheritdoc />
|
||||||
public partial class Initial : Migration
|
public partial class Initial : Migration
|
||||||
+2
-2
@@ -4,11 +4,11 @@ using Microsoft.EntityFrameworkCore;
|
|||||||
using Microsoft.EntityFrameworkCore.Infrastructure;
|
using Microsoft.EntityFrameworkCore.Infrastructure;
|
||||||
using Microsoft.EntityFrameworkCore.Metadata;
|
using Microsoft.EntityFrameworkCore.Metadata;
|
||||||
using Microsoft.EntityFrameworkCore.Storage.ValueConversion;
|
using Microsoft.EntityFrameworkCore.Storage.ValueConversion;
|
||||||
using SqlDatabaseVectorSearch.DataAccessLayer;
|
using SqlDatabaseVectorSearch.Data;
|
||||||
|
|
||||||
#nullable disable
|
#nullable disable
|
||||||
|
|
||||||
namespace SqlDatabaseVectorSearch.DataAccessLayer.Migrations
|
namespace SqlDatabaseVectorSearch.Data.Migrations
|
||||||
{
|
{
|
||||||
[DbContext(typeof(ApplicationDbContext))]
|
[DbContext(typeof(ApplicationDbContext))]
|
||||||
partial class ApplicationDbContextModelSnapshot : ModelSnapshot
|
partial class ApplicationDbContextModelSnapshot : ModelSnapshot
|
||||||
@@ -5,7 +5,7 @@ using Microsoft.EntityFrameworkCore;
|
|||||||
using Microsoft.SemanticKernel;
|
using Microsoft.SemanticKernel;
|
||||||
using SqlDatabaseVectorSearch.Components;
|
using SqlDatabaseVectorSearch.Components;
|
||||||
using SqlDatabaseVectorSearch.ContentDecoders;
|
using SqlDatabaseVectorSearch.ContentDecoders;
|
||||||
using SqlDatabaseVectorSearch.DataAccessLayer;
|
using SqlDatabaseVectorSearch.Data;
|
||||||
using SqlDatabaseVectorSearch.Extensions;
|
using SqlDatabaseVectorSearch.Extensions;
|
||||||
using SqlDatabaseVectorSearch.Services;
|
using SqlDatabaseVectorSearch.Services;
|
||||||
using SqlDatabaseVectorSearch.Settings;
|
using SqlDatabaseVectorSearch.Settings;
|
||||||
|
|||||||
@@ -1,6 +1,6 @@
|
|||||||
using System.Data;
|
using System.Data;
|
||||||
using Microsoft.EntityFrameworkCore;
|
using Microsoft.EntityFrameworkCore;
|
||||||
using SqlDatabaseVectorSearch.DataAccessLayer;
|
using SqlDatabaseVectorSearch.Data;
|
||||||
using SqlDatabaseVectorSearch.Models;
|
using SqlDatabaseVectorSearch.Models;
|
||||||
|
|
||||||
namespace SqlDatabaseVectorSearch.Services;
|
namespace SqlDatabaseVectorSearch.Services;
|
||||||
|
|||||||
@@ -4,12 +4,11 @@ using Microsoft.EntityFrameworkCore;
|
|||||||
using Microsoft.Extensions.AI;
|
using Microsoft.Extensions.AI;
|
||||||
using Microsoft.Extensions.Options;
|
using Microsoft.Extensions.Options;
|
||||||
using SqlDatabaseVectorSearch.ContentDecoders;
|
using SqlDatabaseVectorSearch.ContentDecoders;
|
||||||
using SqlDatabaseVectorSearch.DataAccessLayer;
|
using SqlDatabaseVectorSearch.Data;
|
||||||
using SqlDatabaseVectorSearch.Models;
|
using SqlDatabaseVectorSearch.Models;
|
||||||
using SqlDatabaseVectorSearch.Settings;
|
using SqlDatabaseVectorSearch.Settings;
|
||||||
using SqlDatabaseVectorSearch.TextChunkers;
|
|
||||||
using ChatResponse = SqlDatabaseVectorSearch.Models.ChatResponse;
|
using ChatResponse = SqlDatabaseVectorSearch.Models.ChatResponse;
|
||||||
using Entities = SqlDatabaseVectorSearch.DataAccessLayer.Entities;
|
using Entities = SqlDatabaseVectorSearch.Data.Entities;
|
||||||
|
|
||||||
namespace SqlDatabaseVectorSearch.Services;
|
namespace SqlDatabaseVectorSearch.Services;
|
||||||
|
|
||||||
@@ -21,10 +20,10 @@ public class VectorSearchService(IServiceProvider serviceProvider, ApplicationDb
|
|||||||
{
|
{
|
||||||
// Extract the contents of the file.
|
// Extract the contents of the file.
|
||||||
var decoder = serviceProvider.GetKeyedService<IContentDecoder>(contentType) ?? throw new NotSupportedException($"Content type '{contentType}' is not supported.");
|
var decoder = serviceProvider.GetKeyedService<IContentDecoder>(contentType) ?? throw new NotSupportedException($"Content type '{contentType}' is not supported.");
|
||||||
var content = await decoder.DecodeAsync(stream, contentType, cancellationToken);
|
var paragraphs = await decoder.DecodeAsync(stream, contentType, cancellationToken);
|
||||||
|
|
||||||
// We get the token count of the whole document because it is the total number of tokens used by the embedding (it may be necessary, for example, for cost analysis).
|
// We get the token count of the whole document because it is the total number of tokens used by the embedding (it may be necessary, for example, for cost analysis).
|
||||||
var tokenCount = tokenizerService.CountEmbeddingTokens(content);
|
var tokenCount = tokenizerService.CountEmbeddingTokens(string.Join(string.Empty, paragraphs.Select(p => p.Content)));
|
||||||
|
|
||||||
var strategy = dbContext.Database.CreateExecutionStrategy();
|
var strategy = dbContext.Database.CreateExecutionStrategy();
|
||||||
var document = await strategy.ExecuteAsync(async (cancellationToken) =>
|
var document = await strategy.ExecuteAsync(async (cancellationToken) =>
|
||||||
@@ -40,11 +39,7 @@ public class VectorSearchService(IServiceProvider serviceProvider, ApplicationDb
|
|||||||
var document = new Entities.Document { Id = documentId.GetValueOrDefault(), Name = name, CreationDate = timeProvider.GetUtcNow() };
|
var document = new Entities.Document { Id = documentId.GetValueOrDefault(), Name = name, CreationDate = timeProvider.GetUtcNow() };
|
||||||
dbContext.Documents.Add(document);
|
dbContext.Documents.Add(document);
|
||||||
|
|
||||||
// Split the content into chunks and generate the embeddings for each one.
|
var embeddings = await embeddingGenerator.GenerateAndZipAsync(paragraphs.Select(p => p.Content), cancellationToken: cancellationToken);
|
||||||
var textChunker = serviceProvider.GetRequiredKeyedService<ITextChunker>(contentType);
|
|
||||||
var paragraphs = textChunker.Split(content);
|
|
||||||
|
|
||||||
var embeddings = await embeddingGenerator.GenerateAndZipAsync(paragraphs, cancellationToken: cancellationToken);
|
|
||||||
|
|
||||||
// Save the document chunks and the corresponding embedding in the database.
|
// Save the document chunks and the corresponding embedding in the database.
|
||||||
foreach (var (index, embedding) in embeddings.Index())
|
foreach (var (index, embedding) in embeddings.Index())
|
||||||
|
|||||||
Reference in New Issue
Block a user