diff --git a/SqlDatabaseVectorSearch/Components/Pages/Ask.razor b/SqlDatabaseVectorSearch/Components/Pages/Ask.razor index 297ed39..4adc676 100644 --- a/SqlDatabaseVectorSearch/Components/Pages/Ask.razor +++ b/SqlDatabaseVectorSearch/Components/Pages/Ask.razor @@ -80,7 +80,7 @@ {
- @citation.FileName @if (!string.IsNullOrEmpty(citation.PageNumber)) + @citation.FileName @if (citation.PageNumber.GetValueOrDefault() > 0) { pag. @citation.PageNumber } @@ -314,7 +314,7 @@ return (text ?? string.Empty, citations); } - var pattern = @"[^']*)'\s+chunk-id='(?[^']*)'\s+filename='(?[^']*)'\s+page-number='(?[^']*)'>\s*(?.*?)\s*"; + var pattern = @"[^']*)'\s+chunk-id='(?[^']*)'\s+filename='(?[^']*)'\s+page-number='(?[^']*)'\s+index-on-page='(?[^']*)'>\s*(?.*?)\s*"; var matches = Regex.Matches(text, pattern, RegexOptions.Singleline); foreach (Match match in matches) @@ -326,7 +326,8 @@ DocumentId = Guid.Parse(match.Groups["documentId"].Value), ChunkId = Guid.Parse(match.Groups["chunkId"].Value), FileName = match.Groups["filename"].Value, - PageNumber = match.Groups["pageNumber"].Value, + PageNumber = int.TryParse(match.Groups["pageNumber"].Value, out var pageNumber) && pageNumber > 0 ? pageNumber : null, + IndexOnPage = int.TryParse(match.Groups["indexOnPage"].Value, out var indexOnPage) ? indexOnPage : 0, Quote = match.Groups["quote"].Value }); } @@ -368,6 +369,8 @@ public string Quote { get; set; } = null!; - public string? PageNumber { get; set; } + public int? PageNumber { get; set; } + + public int IndexOnPage { get; set; } } } \ No newline at end of file diff --git a/SqlDatabaseVectorSearch/Data/Entities/DocumentChunk.cs b/SqlDatabaseVectorSearch/Data/Entities/DocumentChunk.cs index 580de9e..bc67963 100644 --- a/SqlDatabaseVectorSearch/Data/Entities/DocumentChunk.cs +++ b/SqlDatabaseVectorSearch/Data/Entities/DocumentChunk.cs @@ -8,6 +8,10 @@ public class DocumentChunk public int Index { get; set; } + public int? PageNumber { get; set; } + + public int IndexOnPage { get; set; } + public required string Content { get; set; } public required float[] Embedding { get; set; } diff --git a/SqlDatabaseVectorSearch/Data/Migrations/00000000000000_Initial.Designer.cs b/SqlDatabaseVectorSearch/Data/Migrations/00000000000000_Initial.Designer.cs index 5789fb8..c071cf1 100644 --- a/SqlDatabaseVectorSearch/Data/Migrations/00000000000000_Initial.Designer.cs +++ b/SqlDatabaseVectorSearch/Data/Migrations/00000000000000_Initial.Designer.cs @@ -9,10 +9,10 @@ using SqlDatabaseVectorSearch.Data; #nullable disable -namespace SqlDatabaseVectorSearch.Data.Migrations +namespace SqlDatabaseVectorSearch.Migrations { [DbContext(typeof(ApplicationDbContext))] - [Migration("20250224102351_Initial")] + [Migration("20250606091336_Initial")] partial class Initial { /// @@ -20,12 +20,12 @@ namespace SqlDatabaseVectorSearch.Data.Migrations { #pragma warning disable 612, 618 modelBuilder - .HasAnnotation("ProductVersion", "9.0.2") + .HasAnnotation("ProductVersion", "9.0.5") .HasAnnotation("Relational:MaxIdentifierLength", 128); SqlServerModelBuilderExtensions.UseIdentityColumns(modelBuilder); - modelBuilder.Entity("SqlDatabaseVectorSearch.DataAccessLayer.Entities.Document", b => + modelBuilder.Entity("SqlDatabaseVectorSearch.Data.Entities.Document", b => { b.Property("Id") .ValueGeneratedOnAdd() @@ -44,7 +44,7 @@ namespace SqlDatabaseVectorSearch.Data.Migrations b.ToTable("Documents", (string)null); }); - modelBuilder.Entity("SqlDatabaseVectorSearch.DataAccessLayer.Entities.DocumentChunk", b => + modelBuilder.Entity("SqlDatabaseVectorSearch.Data.Entities.DocumentChunk", b => { b.Property("Id") .ValueGeneratedOnAdd() @@ -64,6 +64,12 @@ namespace SqlDatabaseVectorSearch.Data.Migrations b.Property("Index") .HasColumnType("int"); + b.Property("IndexOnPage") + .HasColumnType("int"); + + b.Property("PageNumber") + .HasColumnType("int"); + b.HasKey("Id"); b.HasIndex("DocumentId"); @@ -71,9 +77,9 @@ namespace SqlDatabaseVectorSearch.Data.Migrations b.ToTable("DocumentChunks", (string)null); }); - modelBuilder.Entity("SqlDatabaseVectorSearch.DataAccessLayer.Entities.DocumentChunk", b => + modelBuilder.Entity("SqlDatabaseVectorSearch.Data.Entities.DocumentChunk", b => { - b.HasOne("SqlDatabaseVectorSearch.DataAccessLayer.Entities.Document", "Document") + b.HasOne("SqlDatabaseVectorSearch.Data.Entities.Document", "Document") .WithMany("Chunks") .HasForeignKey("DocumentId") .OnDelete(DeleteBehavior.Cascade) @@ -83,7 +89,7 @@ namespace SqlDatabaseVectorSearch.Data.Migrations b.Navigation("Document"); }); - modelBuilder.Entity("SqlDatabaseVectorSearch.DataAccessLayer.Entities.Document", b => + modelBuilder.Entity("SqlDatabaseVectorSearch.Data.Entities.Document", b => { b.Navigation("Chunks"); }); diff --git a/SqlDatabaseVectorSearch/Data/Migrations/00000000000000_Initial.cs b/SqlDatabaseVectorSearch/Data/Migrations/00000000000000_Initial.cs index 2a530b0..590cad4 100644 --- a/SqlDatabaseVectorSearch/Data/Migrations/00000000000000_Initial.cs +++ b/SqlDatabaseVectorSearch/Data/Migrations/00000000000000_Initial.cs @@ -3,7 +3,7 @@ using Microsoft.EntityFrameworkCore.Migrations; #nullable disable -namespace SqlDatabaseVectorSearch.Data.Migrations +namespace SqlDatabaseVectorSearch.Migrations { /// public partial class Initial : Migration @@ -31,6 +31,8 @@ namespace SqlDatabaseVectorSearch.Data.Migrations Id = table.Column(type: "uniqueidentifier", nullable: false), DocumentId = table.Column(type: "uniqueidentifier", nullable: false), Index = table.Column(type: "int", nullable: false), + PageNumber = table.Column(type: "int", nullable: true), + IndexOnPage = table.Column(type: "int", nullable: false), Content = table.Column(type: "nvarchar(max)", nullable: false), Embedding = table.Column(type: "vector(1536)", nullable: false) }, diff --git a/SqlDatabaseVectorSearch/Data/Migrations/ApplicationDbContextModelSnapshot.cs b/SqlDatabaseVectorSearch/Data/Migrations/ApplicationDbContextModelSnapshot.cs index 6bb2ad9..aeb0666 100644 --- a/SqlDatabaseVectorSearch/Data/Migrations/ApplicationDbContextModelSnapshot.cs +++ b/SqlDatabaseVectorSearch/Data/Migrations/ApplicationDbContextModelSnapshot.cs @@ -8,7 +8,7 @@ using SqlDatabaseVectorSearch.Data; #nullable disable -namespace SqlDatabaseVectorSearch.Data.Migrations +namespace SqlDatabaseVectorSearch.Migrations { [DbContext(typeof(ApplicationDbContext))] partial class ApplicationDbContextModelSnapshot : ModelSnapshot @@ -17,12 +17,12 @@ namespace SqlDatabaseVectorSearch.Data.Migrations { #pragma warning disable 612, 618 modelBuilder - .HasAnnotation("ProductVersion", "9.0.2") + .HasAnnotation("ProductVersion", "9.0.5") .HasAnnotation("Relational:MaxIdentifierLength", 128); SqlServerModelBuilderExtensions.UseIdentityColumns(modelBuilder); - modelBuilder.Entity("SqlDatabaseVectorSearch.DataAccessLayer.Entities.Document", b => + modelBuilder.Entity("SqlDatabaseVectorSearch.Data.Entities.Document", b => { b.Property("Id") .ValueGeneratedOnAdd() @@ -41,7 +41,7 @@ namespace SqlDatabaseVectorSearch.Data.Migrations b.ToTable("Documents", (string)null); }); - modelBuilder.Entity("SqlDatabaseVectorSearch.DataAccessLayer.Entities.DocumentChunk", b => + modelBuilder.Entity("SqlDatabaseVectorSearch.Data.Entities.DocumentChunk", b => { b.Property("Id") .ValueGeneratedOnAdd() @@ -61,6 +61,12 @@ namespace SqlDatabaseVectorSearch.Data.Migrations b.Property("Index") .HasColumnType("int"); + b.Property("IndexOnPage") + .HasColumnType("int"); + + b.Property("PageNumber") + .HasColumnType("int"); + b.HasKey("Id"); b.HasIndex("DocumentId"); @@ -68,9 +74,9 @@ namespace SqlDatabaseVectorSearch.Data.Migrations b.ToTable("DocumentChunks", (string)null); }); - modelBuilder.Entity("SqlDatabaseVectorSearch.DataAccessLayer.Entities.DocumentChunk", b => + modelBuilder.Entity("SqlDatabaseVectorSearch.Data.Entities.DocumentChunk", b => { - b.HasOne("SqlDatabaseVectorSearch.DataAccessLayer.Entities.Document", "Document") + b.HasOne("SqlDatabaseVectorSearch.Data.Entities.Document", "Document") .WithMany("Chunks") .HasForeignKey("DocumentId") .OnDelete(DeleteBehavior.Cascade) @@ -80,7 +86,7 @@ namespace SqlDatabaseVectorSearch.Data.Migrations b.Navigation("Document"); }); - modelBuilder.Entity("SqlDatabaseVectorSearch.DataAccessLayer.Entities.Document", b => + modelBuilder.Entity("SqlDatabaseVectorSearch.Data.Entities.Document", b => { b.Navigation("Chunks"); }); diff --git a/SqlDatabaseVectorSearch/Services/ChatService.cs b/SqlDatabaseVectorSearch/Services/ChatService.cs index 4269efa..2bc3e51 100644 --- a/SqlDatabaseVectorSearch/Services/ChatService.cs +++ b/SqlDatabaseVectorSearch/Services/ChatService.cs @@ -132,7 +132,7 @@ public class ChatService(IChatCompletionService chatCompletionService, Tokenizer First provide your complete answer, then list all citations. Use this XML format for citations: - exact quote here + exact quote here """); var prompt = new StringBuilder($""" @@ -151,7 +151,7 @@ public class ChatService(IChatCompletionService chatCompletionService, Tokenizer foreach (var chunk in chunks) { - var text = $"--- {chunk.Document.Name} (Document ID: {chunk.Document.Id} | Chunk ID: {chunk.Id} | Page Number: 1) {Environment.NewLine}{chunk.Content}{Environment.NewLine}"; + var text = $"--- {chunk.Document.Name} (Document ID: {chunk.Document.Id} | Chunk ID: {chunk.Id} | Page Number: {chunk.PageNumber} | Index on Page: {chunk.IndexOnPage}) {Environment.NewLine}{chunk.Content}{Environment.NewLine}"; var tokenCount = tokenizerService.CountChatCompletionTokens(text); if (tokenCount > availableTokens) diff --git a/SqlDatabaseVectorSearch/Services/VectorSearchService.cs b/SqlDatabaseVectorSearch/Services/VectorSearchService.cs index beda3cb..2663c39 100644 --- a/SqlDatabaseVectorSearch/Services/VectorSearchService.cs +++ b/SqlDatabaseVectorSearch/Services/VectorSearchService.cs @@ -39,14 +39,24 @@ public class VectorSearchService(IServiceProvider serviceProvider, ApplicationDb var document = new Entities.Document { Id = documentId.GetValueOrDefault(), Name = name, CreationDate = timeProvider.GetUtcNow() }; dbContext.Documents.Add(document); - var embeddings = await embeddingGenerator.GenerateAndZipAsync(paragraphs.Select(p => p.Content), cancellationToken: cancellationToken); + var embeddings = await embeddingGenerator.GenerateAsync(paragraphs.Select(p => p.Content), cancellationToken: cancellationToken); // Save the document chunks and the corresponding embedding in the database. foreach (var (index, embedding) in embeddings.Index()) { - logger.LogDebug("Storing a paragraph of {TokenCount} tokens.", tokenizerService.CountChatCompletionTokens(embedding.Value)); + var paragraph = paragraphs.ElementAt(index); + logger.LogDebug("Storing a paragraph of {TokenCount} tokens.", tokenizerService.CountChatCompletionTokens(paragraph.Content)); + + var documentChunk = new Entities.DocumentChunk + { + Document = document, + Index = index, + PageNumber = paragraph.PageNumber, + IndexOnPage = paragraph.IndexOnPage, + Content = paragraph.Content, + Embedding = embedding.Vector.ToArray() + }; - var documentChunk = new Entities.DocumentChunk { Document = document, Index = index, Content = embedding.Value, Embedding = embedding.Embedding.Vector.ToArray() }; dbContext.DocumentChunks.Add(documentChunk); }