diff --git a/SqlDatabaseVectorSearch/ContentDecoders/DocxContentDecoder.cs b/SqlDatabaseVectorSearch/ContentDecoders/DocxContentDecoder.cs index b67a45b..d606fa1 100644 --- a/SqlDatabaseVectorSearch/ContentDecoders/DocxContentDecoder.cs +++ b/SqlDatabaseVectorSearch/ContentDecoders/DocxContentDecoder.cs @@ -6,7 +6,7 @@ namespace SqlDatabaseVectorSearch.ContentDecoders; public class DocxContentDecoder : IContentDecoder { - public Task DecodeAsync(Stream stream, string contentType, CancellationToken cancellationToken = default) + public Task> DecodeAsync(Stream stream, string contentType, CancellationToken cancellationToken = default) { // Open a Word document for read-only access. using var document = WordprocessingDocument.Open(stream, false); @@ -20,6 +20,6 @@ public class DocxContentDecoder : IContentDecoder content.AppendLine(p.InnerText); } - return Task.FromResult(content.ToString()); + return Task.FromResult(new List([new(1, 0, content.ToString())]).AsEnumerable()); } } diff --git a/SqlDatabaseVectorSearch/ContentDecoders/IContentDecoder.cs b/SqlDatabaseVectorSearch/ContentDecoders/IContentDecoder.cs index c5a46b5..4fc8293 100644 --- a/SqlDatabaseVectorSearch/ContentDecoders/IContentDecoder.cs +++ b/SqlDatabaseVectorSearch/ContentDecoders/IContentDecoder.cs @@ -2,5 +2,7 @@ public interface IContentDecoder { - Task DecodeAsync(Stream stream, string contentType, CancellationToken cancellationToken = default); + Task> DecodeAsync(Stream stream, string contentType, CancellationToken cancellationToken = default); } + +public record class Chunk(int PageNumber, int IndexOnPage, string Content); \ No newline at end of file diff --git a/SqlDatabaseVectorSearch/ContentDecoders/PdfContentDecoder.cs b/SqlDatabaseVectorSearch/ContentDecoders/PdfContentDecoder.cs index 60710f7..696b192 100644 --- a/SqlDatabaseVectorSearch/ContentDecoders/PdfContentDecoder.cs +++ b/SqlDatabaseVectorSearch/ContentDecoders/PdfContentDecoder.cs @@ -1,24 +1,33 @@ -using System.Text; +using SqlDatabaseVectorSearch.TextChunkers; using UglyToad.PdfPig; -using UglyToad.PdfPig.DocumentLayoutAnalysis.TextExtractor; +using UglyToad.PdfPig.Content; +using UglyToad.PdfPig.DocumentLayoutAnalysis.PageSegmenter; +using UglyToad.PdfPig.DocumentLayoutAnalysis.WordExtractor; namespace SqlDatabaseVectorSearch.ContentDecoders; -public class PdfContentDecoder : IContentDecoder +public class PdfContentDecoder(IServiceProvider serviceProvider) : IContentDecoder { - public Task DecodeAsync(Stream stream, string contentType, CancellationToken cancellationToken = default) + public Task> DecodeAsync(Stream stream, string contentType, CancellationToken cancellationToken = default) { - var content = new StringBuilder(); + var textChunker = serviceProvider.GetRequiredKeyedService(contentType); // Read the content of the PDF document. using var pdfDocument = PdfDocument.Open(stream); + var paragraphs = pdfDocument.GetPages().SelectMany(page => GetPageParagraphs(page, textChunker)).ToList(); - foreach (var page in pdfDocument.GetPages().Where(x => x is not null)) - { - var pageContent = ContentOrderTextExtractor.GetText(page) ?? string.Empty; - content.AppendLine(pageContent); - } + return Task.FromResult(paragraphs.AsEnumerable()); + } - return Task.FromResult(content.ToString()); + private static IEnumerable GetPageParagraphs(Page pdfPage, ITextChunker textChunker) + { + var letters = pdfPage.Letters; + var words = NearestNeighbourWordExtractor.Instance.GetWords(letters); + var textBlocks = DocstrumBoundingBoxes.Instance.GetBlocks(words); + var pageText = string.Join($"{Environment.NewLine}{Environment.NewLine}", textBlocks.Select(t => t.Text.ReplaceLineEndings(" "))); + + var paragraphs = textChunker.Split(pageText); + + return paragraphs.Select((text, index) => new Chunk(pdfPage.Number, index, text)); } } diff --git a/SqlDatabaseVectorSearch/ContentDecoders/TextContentDecoder.cs b/SqlDatabaseVectorSearch/ContentDecoders/TextContentDecoder.cs index 7b86637..3235b8d 100644 --- a/SqlDatabaseVectorSearch/ContentDecoders/TextContentDecoder.cs +++ b/SqlDatabaseVectorSearch/ContentDecoders/TextContentDecoder.cs @@ -2,11 +2,11 @@ public class TextContentDecoder : IContentDecoder { - public async Task DecodeAsync(Stream stream, string contentType, CancellationToken cancellationToken = default) + public async Task> DecodeAsync(Stream stream, string contentType, CancellationToken cancellationToken = default) { using var readStream = new StreamReader(stream); var content = await readStream.ReadToEndAsync(cancellationToken); - return content; + return [new(1, 0, content)]; } } diff --git a/SqlDatabaseVectorSearch/DataAccessLayer/ApplicationDbContext.cs b/SqlDatabaseVectorSearch/Data/ApplicationDbContext.cs similarity index 93% rename from SqlDatabaseVectorSearch/DataAccessLayer/ApplicationDbContext.cs rename to SqlDatabaseVectorSearch/Data/ApplicationDbContext.cs index 72dcfaf..e9e41a3 100644 --- a/SqlDatabaseVectorSearch/DataAccessLayer/ApplicationDbContext.cs +++ b/SqlDatabaseVectorSearch/Data/ApplicationDbContext.cs @@ -1,8 +1,8 @@ using EntityFramework.Exceptions.SqlServer; using Microsoft.EntityFrameworkCore; -using SqlDatabaseVectorSearch.DataAccessLayer.Entities; +using SqlDatabaseVectorSearch.Data.Entities; -namespace SqlDatabaseVectorSearch.DataAccessLayer; +namespace SqlDatabaseVectorSearch.Data; public class ApplicationDbContext(DbContextOptions options) : DbContext(options) { diff --git a/SqlDatabaseVectorSearch/DataAccessLayer/Entities/Document.cs b/SqlDatabaseVectorSearch/Data/Entities/Document.cs similarity index 78% rename from SqlDatabaseVectorSearch/DataAccessLayer/Entities/Document.cs rename to SqlDatabaseVectorSearch/Data/Entities/Document.cs index d90cf56..24e4818 100644 --- a/SqlDatabaseVectorSearch/DataAccessLayer/Entities/Document.cs +++ b/SqlDatabaseVectorSearch/Data/Entities/Document.cs @@ -1,4 +1,4 @@ -namespace SqlDatabaseVectorSearch.DataAccessLayer.Entities; +namespace SqlDatabaseVectorSearch.Data.Entities; public class Document { diff --git a/SqlDatabaseVectorSearch/DataAccessLayer/Entities/DocumentChunk.cs b/SqlDatabaseVectorSearch/Data/Entities/DocumentChunk.cs similarity index 82% rename from SqlDatabaseVectorSearch/DataAccessLayer/Entities/DocumentChunk.cs rename to SqlDatabaseVectorSearch/Data/Entities/DocumentChunk.cs index 0d1886c..580de9e 100644 --- a/SqlDatabaseVectorSearch/DataAccessLayer/Entities/DocumentChunk.cs +++ b/SqlDatabaseVectorSearch/Data/Entities/DocumentChunk.cs @@ -1,4 +1,4 @@ -namespace SqlDatabaseVectorSearch.DataAccessLayer.Entities; +namespace SqlDatabaseVectorSearch.Data.Entities; public class DocumentChunk { diff --git a/SqlDatabaseVectorSearch/DataAccessLayer/Migrations/00000000000000_Initial.Designer.cs b/SqlDatabaseVectorSearch/Data/Migrations/00000000000000_Initial.Designer.cs similarity index 96% rename from SqlDatabaseVectorSearch/DataAccessLayer/Migrations/00000000000000_Initial.Designer.cs rename to SqlDatabaseVectorSearch/Data/Migrations/00000000000000_Initial.Designer.cs index bb1f760..5789fb8 100644 --- a/SqlDatabaseVectorSearch/DataAccessLayer/Migrations/00000000000000_Initial.Designer.cs +++ b/SqlDatabaseVectorSearch/Data/Migrations/00000000000000_Initial.Designer.cs @@ -5,11 +5,11 @@ using Microsoft.EntityFrameworkCore.Infrastructure; using Microsoft.EntityFrameworkCore.Metadata; using Microsoft.EntityFrameworkCore.Migrations; using Microsoft.EntityFrameworkCore.Storage.ValueConversion; -using SqlDatabaseVectorSearch.DataAccessLayer; +using SqlDatabaseVectorSearch.Data; #nullable disable -namespace SqlDatabaseVectorSearch.DataAccessLayer.Migrations +namespace SqlDatabaseVectorSearch.Data.Migrations { [DbContext(typeof(ApplicationDbContext))] [Migration("20250224102351_Initial")] diff --git a/SqlDatabaseVectorSearch/DataAccessLayer/Migrations/00000000000000_Initial.cs b/SqlDatabaseVectorSearch/Data/Migrations/00000000000000_Initial.cs similarity index 97% rename from SqlDatabaseVectorSearch/DataAccessLayer/Migrations/00000000000000_Initial.cs rename to SqlDatabaseVectorSearch/Data/Migrations/00000000000000_Initial.cs index a44fde0..2a530b0 100644 --- a/SqlDatabaseVectorSearch/DataAccessLayer/Migrations/00000000000000_Initial.cs +++ b/SqlDatabaseVectorSearch/Data/Migrations/00000000000000_Initial.cs @@ -3,7 +3,7 @@ using Microsoft.EntityFrameworkCore.Migrations; #nullable disable -namespace SqlDatabaseVectorSearch.DataAccessLayer.Migrations +namespace SqlDatabaseVectorSearch.Data.Migrations { /// public partial class Initial : Migration diff --git a/SqlDatabaseVectorSearch/DataAccessLayer/Migrations/ApplicationDbContextModelSnapshot.cs b/SqlDatabaseVectorSearch/Data/Migrations/ApplicationDbContextModelSnapshot.cs similarity index 96% rename from SqlDatabaseVectorSearch/DataAccessLayer/Migrations/ApplicationDbContextModelSnapshot.cs rename to SqlDatabaseVectorSearch/Data/Migrations/ApplicationDbContextModelSnapshot.cs index 8be4784..6bb2ad9 100644 --- a/SqlDatabaseVectorSearch/DataAccessLayer/Migrations/ApplicationDbContextModelSnapshot.cs +++ b/SqlDatabaseVectorSearch/Data/Migrations/ApplicationDbContextModelSnapshot.cs @@ -4,11 +4,11 @@ using Microsoft.EntityFrameworkCore; using Microsoft.EntityFrameworkCore.Infrastructure; using Microsoft.EntityFrameworkCore.Metadata; using Microsoft.EntityFrameworkCore.Storage.ValueConversion; -using SqlDatabaseVectorSearch.DataAccessLayer; +using SqlDatabaseVectorSearch.Data; #nullable disable -namespace SqlDatabaseVectorSearch.DataAccessLayer.Migrations +namespace SqlDatabaseVectorSearch.Data.Migrations { [DbContext(typeof(ApplicationDbContext))] partial class ApplicationDbContextModelSnapshot : ModelSnapshot diff --git a/SqlDatabaseVectorSearch/Program.cs b/SqlDatabaseVectorSearch/Program.cs index 0652b74..f9d2e78 100644 --- a/SqlDatabaseVectorSearch/Program.cs +++ b/SqlDatabaseVectorSearch/Program.cs @@ -5,7 +5,7 @@ using Microsoft.EntityFrameworkCore; using Microsoft.SemanticKernel; using SqlDatabaseVectorSearch.Components; using SqlDatabaseVectorSearch.ContentDecoders; -using SqlDatabaseVectorSearch.DataAccessLayer; +using SqlDatabaseVectorSearch.Data; using SqlDatabaseVectorSearch.Extensions; using SqlDatabaseVectorSearch.Services; using SqlDatabaseVectorSearch.Settings; diff --git a/SqlDatabaseVectorSearch/Services/DocumentService.cs b/SqlDatabaseVectorSearch/Services/DocumentService.cs index 2a6c6ef..1955255 100644 --- a/SqlDatabaseVectorSearch/Services/DocumentService.cs +++ b/SqlDatabaseVectorSearch/Services/DocumentService.cs @@ -1,6 +1,6 @@ using System.Data; using Microsoft.EntityFrameworkCore; -using SqlDatabaseVectorSearch.DataAccessLayer; +using SqlDatabaseVectorSearch.Data; using SqlDatabaseVectorSearch.Models; namespace SqlDatabaseVectorSearch.Services; diff --git a/SqlDatabaseVectorSearch/Services/VectorSearchService.cs b/SqlDatabaseVectorSearch/Services/VectorSearchService.cs index beb0fde..c3c7e63 100644 --- a/SqlDatabaseVectorSearch/Services/VectorSearchService.cs +++ b/SqlDatabaseVectorSearch/Services/VectorSearchService.cs @@ -4,12 +4,11 @@ using Microsoft.EntityFrameworkCore; using Microsoft.Extensions.AI; using Microsoft.Extensions.Options; using SqlDatabaseVectorSearch.ContentDecoders; -using SqlDatabaseVectorSearch.DataAccessLayer; +using SqlDatabaseVectorSearch.Data; using SqlDatabaseVectorSearch.Models; using SqlDatabaseVectorSearch.Settings; -using SqlDatabaseVectorSearch.TextChunkers; using ChatResponse = SqlDatabaseVectorSearch.Models.ChatResponse; -using Entities = SqlDatabaseVectorSearch.DataAccessLayer.Entities; +using Entities = SqlDatabaseVectorSearch.Data.Entities; namespace SqlDatabaseVectorSearch.Services; @@ -21,10 +20,10 @@ public class VectorSearchService(IServiceProvider serviceProvider, ApplicationDb { // Extract the contents of the file. var decoder = serviceProvider.GetKeyedService(contentType) ?? throw new NotSupportedException($"Content type '{contentType}' is not supported."); - var content = await decoder.DecodeAsync(stream, contentType, cancellationToken); + var paragraphs = await decoder.DecodeAsync(stream, contentType, cancellationToken); // We get the token count of the whole document because it is the total number of token used by embedding (it may be necessary, for example, for cost analysis). - var tokenCount = tokenizerService.CountEmbeddingTokens(content); + var tokenCount = tokenizerService.CountEmbeddingTokens(string.Join(string.Empty, paragraphs.Select(p => p.Content))); var strategy = dbContext.Database.CreateExecutionStrategy(); var document = await strategy.ExecuteAsync(async (cancellationToken) => @@ -40,11 +39,7 @@ public class VectorSearchService(IServiceProvider serviceProvider, ApplicationDb var document = new Entities.Document { Id = documentId.GetValueOrDefault(), Name = name, CreationDate = timeProvider.GetUtcNow() }; dbContext.Documents.Add(document); - // Split the content into chunks and generate the embeddings for each one. - var textChunker = serviceProvider.GetRequiredKeyedService(contentType); - var paragraphs = textChunker.Split(content); - - var embeddings = await embeddingGenerator.GenerateAndZipAsync(paragraphs, cancellationToken: cancellationToken); + var embeddings = await embeddingGenerator.GenerateAndZipAsync(paragraphs.Select(p => p.Content), cancellationToken: cancellationToken); // Save the document chunks and the corresponding embedding in the database. foreach (var (index, embedding) in embeddings.Index())