Enhance citation handling and document chunk structure

- Updated `Ask.razor` to change `PageNumber` to a nullable integer and added `IndexOnPage` to the `Citation` class. Adjusted regex for citation parsing.
- Introduced the nullable `PageNumber` and the `IndexOnPage` properties in `DocumentChunk.cs`, next to the existing required `Content` property.
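- Regenerated the `Initial` migration and the model snapshot to reflect changes in the `DocumentChunk` and `Document` entities: added the `PageNumber` (nullable) and `IndexOnPage` columns to `DocumentChunks`, moved the migrations to the `SqlDatabaseVectorSearch.Migrations` namespace, and updated entity type names to `SqlDatabaseVectorSearch.Data.Entities`.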
- Updated citation format in `ChatService.cs` to include `index-on-page` and adjusted document chunk text formatting.
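- Switched embedding generation in `VectorSearchService.cs` from `GenerateAndZipAsync` to `GenerateAsync`, and updated document chunk creation to read `Content`, `PageNumber` and `IndexOnPage` from the source paragraph at the same index.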
This commit is contained in:
Marco Minerva
2025-06-06 11:26:27 +02:00
parent dc6bbfde91
commit cdf8356e11
7 changed files with 56 additions and 25 deletions
@@ -80,7 +80,7 @@
{ {
<div class="border rounded p-2 me-2 mb-2 citation-box small"> <div class="border rounded p-2 me-2 mb-2 citation-box small">
<div> <div>
<strong>@citation.FileName</strong> @if (!string.IsNullOrEmpty(citation.PageNumber)) <strong>@citation.FileName</strong> @if (citation.PageNumber.GetValueOrDefault() > 0)
{ {
<span class="ms-2">pag. @citation.PageNumber</span> <span class="ms-2">pag. @citation.PageNumber</span>
} }
@@ -314,7 +314,7 @@
return (text ?? string.Empty, citations); return (text ?? string.Empty, citations);
} }
var pattern = @"<citation\s+document-id='(?<documentId>[^']*)'\s+chunk-id='(?<chunkId>[^']*)'\s+filename='(?<filename>[^']*)'\s+page-number='(?<pageNumber>[^']*)'>\s*(?<quote>.*?)\s*</citation>"; var pattern = @"<citation\s+document-id='(?<documentId>[^']*)'\s+chunk-id='(?<chunkId>[^']*)'\s+filename='(?<filename>[^']*)'\s+page-number='(?<pageNumber>[^']*)'\s+index-on-page='(?<indexOnPage>[^']*)'>\s*(?<quote>.*?)\s*</citation>";
var matches = Regex.Matches(text, pattern, RegexOptions.Singleline); var matches = Regex.Matches(text, pattern, RegexOptions.Singleline);
foreach (Match match in matches) foreach (Match match in matches)
@@ -326,7 +326,8 @@
DocumentId = Guid.Parse(match.Groups["documentId"].Value), DocumentId = Guid.Parse(match.Groups["documentId"].Value),
ChunkId = Guid.Parse(match.Groups["chunkId"].Value), ChunkId = Guid.Parse(match.Groups["chunkId"].Value),
FileName = match.Groups["filename"].Value, FileName = match.Groups["filename"].Value,
PageNumber = match.Groups["pageNumber"].Value, PageNumber = int.TryParse(match.Groups["pageNumber"].Value, out var pageNumber) && pageNumber > 0 ? pageNumber : null,
IndexOnPage = int.TryParse(match.Groups["indexOnPage"].Value, out var indexOnPage) ? indexOnPage : 0,
Quote = match.Groups["quote"].Value Quote = match.Groups["quote"].Value
}); });
} }
@@ -368,6 +369,8 @@
public string Quote { get; set; } = null!; public string Quote { get; set; } = null!;
public string? PageNumber { get; set; } public int? PageNumber { get; set; }
public int IndexOnPage { get; set; }
} }
} }
@@ -8,6 +8,10 @@ public class DocumentChunk
public int Index { get; set; } public int Index { get; set; }
public int? PageNumber { get; set; }
public int IndexOnPage { get; set; }
public required string Content { get; set; } public required string Content { get; set; }
public required float[] Embedding { get; set; } public required float[] Embedding { get; set; }
@@ -9,10 +9,10 @@ using SqlDatabaseVectorSearch.Data;
#nullable disable #nullable disable
namespace SqlDatabaseVectorSearch.Data.Migrations namespace SqlDatabaseVectorSearch.Migrations
{ {
[DbContext(typeof(ApplicationDbContext))] [DbContext(typeof(ApplicationDbContext))]
[Migration("20250224102351_Initial")] [Migration("20250606091336_Initial")]
partial class Initial partial class Initial
{ {
/// <inheritdoc /> /// <inheritdoc />
@@ -20,12 +20,12 @@ namespace SqlDatabaseVectorSearch.Data.Migrations
{ {
#pragma warning disable 612, 618 #pragma warning disable 612, 618
modelBuilder modelBuilder
.HasAnnotation("ProductVersion", "9.0.2") .HasAnnotation("ProductVersion", "9.0.5")
.HasAnnotation("Relational:MaxIdentifierLength", 128); .HasAnnotation("Relational:MaxIdentifierLength", 128);
SqlServerModelBuilderExtensions.UseIdentityColumns(modelBuilder); SqlServerModelBuilderExtensions.UseIdentityColumns(modelBuilder);
modelBuilder.Entity("SqlDatabaseVectorSearch.DataAccessLayer.Entities.Document", b => modelBuilder.Entity("SqlDatabaseVectorSearch.Data.Entities.Document", b =>
{ {
b.Property<Guid>("Id") b.Property<Guid>("Id")
.ValueGeneratedOnAdd() .ValueGeneratedOnAdd()
@@ -44,7 +44,7 @@ namespace SqlDatabaseVectorSearch.Data.Migrations
b.ToTable("Documents", (string)null); b.ToTable("Documents", (string)null);
}); });
modelBuilder.Entity("SqlDatabaseVectorSearch.DataAccessLayer.Entities.DocumentChunk", b => modelBuilder.Entity("SqlDatabaseVectorSearch.Data.Entities.DocumentChunk", b =>
{ {
b.Property<Guid>("Id") b.Property<Guid>("Id")
.ValueGeneratedOnAdd() .ValueGeneratedOnAdd()
@@ -64,6 +64,12 @@ namespace SqlDatabaseVectorSearch.Data.Migrations
b.Property<int>("Index") b.Property<int>("Index")
.HasColumnType("int"); .HasColumnType("int");
b.Property<int>("IndexOnPage")
.HasColumnType("int");
b.Property<int?>("PageNumber")
.HasColumnType("int");
b.HasKey("Id"); b.HasKey("Id");
b.HasIndex("DocumentId"); b.HasIndex("DocumentId");
@@ -71,9 +77,9 @@ namespace SqlDatabaseVectorSearch.Data.Migrations
b.ToTable("DocumentChunks", (string)null); b.ToTable("DocumentChunks", (string)null);
}); });
modelBuilder.Entity("SqlDatabaseVectorSearch.DataAccessLayer.Entities.DocumentChunk", b => modelBuilder.Entity("SqlDatabaseVectorSearch.Data.Entities.DocumentChunk", b =>
{ {
b.HasOne("SqlDatabaseVectorSearch.DataAccessLayer.Entities.Document", "Document") b.HasOne("SqlDatabaseVectorSearch.Data.Entities.Document", "Document")
.WithMany("Chunks") .WithMany("Chunks")
.HasForeignKey("DocumentId") .HasForeignKey("DocumentId")
.OnDelete(DeleteBehavior.Cascade) .OnDelete(DeleteBehavior.Cascade)
@@ -83,7 +89,7 @@ namespace SqlDatabaseVectorSearch.Data.Migrations
b.Navigation("Document"); b.Navigation("Document");
}); });
modelBuilder.Entity("SqlDatabaseVectorSearch.DataAccessLayer.Entities.Document", b => modelBuilder.Entity("SqlDatabaseVectorSearch.Data.Entities.Document", b =>
{ {
b.Navigation("Chunks"); b.Navigation("Chunks");
}); });
@@ -3,7 +3,7 @@ using Microsoft.EntityFrameworkCore.Migrations;
#nullable disable #nullable disable
namespace SqlDatabaseVectorSearch.Data.Migrations namespace SqlDatabaseVectorSearch.Migrations
{ {
/// <inheritdoc /> /// <inheritdoc />
public partial class Initial : Migration public partial class Initial : Migration
@@ -31,6 +31,8 @@ namespace SqlDatabaseVectorSearch.Data.Migrations
Id = table.Column<Guid>(type: "uniqueidentifier", nullable: false), Id = table.Column<Guid>(type: "uniqueidentifier", nullable: false),
DocumentId = table.Column<Guid>(type: "uniqueidentifier", nullable: false), DocumentId = table.Column<Guid>(type: "uniqueidentifier", nullable: false),
Index = table.Column<int>(type: "int", nullable: false), Index = table.Column<int>(type: "int", nullable: false),
PageNumber = table.Column<int>(type: "int", nullable: true),
IndexOnPage = table.Column<int>(type: "int", nullable: false),
Content = table.Column<string>(type: "nvarchar(max)", nullable: false), Content = table.Column<string>(type: "nvarchar(max)", nullable: false),
Embedding = table.Column<string>(type: "vector(1536)", nullable: false) Embedding = table.Column<string>(type: "vector(1536)", nullable: false)
}, },
@@ -8,7 +8,7 @@ using SqlDatabaseVectorSearch.Data;
#nullable disable #nullable disable
namespace SqlDatabaseVectorSearch.Data.Migrations namespace SqlDatabaseVectorSearch.Migrations
{ {
[DbContext(typeof(ApplicationDbContext))] [DbContext(typeof(ApplicationDbContext))]
partial class ApplicationDbContextModelSnapshot : ModelSnapshot partial class ApplicationDbContextModelSnapshot : ModelSnapshot
@@ -17,12 +17,12 @@ namespace SqlDatabaseVectorSearch.Data.Migrations
{ {
#pragma warning disable 612, 618 #pragma warning disable 612, 618
modelBuilder modelBuilder
.HasAnnotation("ProductVersion", "9.0.2") .HasAnnotation("ProductVersion", "9.0.5")
.HasAnnotation("Relational:MaxIdentifierLength", 128); .HasAnnotation("Relational:MaxIdentifierLength", 128);
SqlServerModelBuilderExtensions.UseIdentityColumns(modelBuilder); SqlServerModelBuilderExtensions.UseIdentityColumns(modelBuilder);
modelBuilder.Entity("SqlDatabaseVectorSearch.DataAccessLayer.Entities.Document", b => modelBuilder.Entity("SqlDatabaseVectorSearch.Data.Entities.Document", b =>
{ {
b.Property<Guid>("Id") b.Property<Guid>("Id")
.ValueGeneratedOnAdd() .ValueGeneratedOnAdd()
@@ -41,7 +41,7 @@ namespace SqlDatabaseVectorSearch.Data.Migrations
b.ToTable("Documents", (string)null); b.ToTable("Documents", (string)null);
}); });
modelBuilder.Entity("SqlDatabaseVectorSearch.DataAccessLayer.Entities.DocumentChunk", b => modelBuilder.Entity("SqlDatabaseVectorSearch.Data.Entities.DocumentChunk", b =>
{ {
b.Property<Guid>("Id") b.Property<Guid>("Id")
.ValueGeneratedOnAdd() .ValueGeneratedOnAdd()
@@ -61,6 +61,12 @@ namespace SqlDatabaseVectorSearch.Data.Migrations
b.Property<int>("Index") b.Property<int>("Index")
.HasColumnType("int"); .HasColumnType("int");
b.Property<int>("IndexOnPage")
.HasColumnType("int");
b.Property<int?>("PageNumber")
.HasColumnType("int");
b.HasKey("Id"); b.HasKey("Id");
b.HasIndex("DocumentId"); b.HasIndex("DocumentId");
@@ -68,9 +74,9 @@ namespace SqlDatabaseVectorSearch.Data.Migrations
b.ToTable("DocumentChunks", (string)null); b.ToTable("DocumentChunks", (string)null);
}); });
modelBuilder.Entity("SqlDatabaseVectorSearch.DataAccessLayer.Entities.DocumentChunk", b => modelBuilder.Entity("SqlDatabaseVectorSearch.Data.Entities.DocumentChunk", b =>
{ {
b.HasOne("SqlDatabaseVectorSearch.DataAccessLayer.Entities.Document", "Document") b.HasOne("SqlDatabaseVectorSearch.Data.Entities.Document", "Document")
.WithMany("Chunks") .WithMany("Chunks")
.HasForeignKey("DocumentId") .HasForeignKey("DocumentId")
.OnDelete(DeleteBehavior.Cascade) .OnDelete(DeleteBehavior.Cascade)
@@ -80,7 +86,7 @@ namespace SqlDatabaseVectorSearch.Data.Migrations
b.Navigation("Document"); b.Navigation("Document");
}); });
modelBuilder.Entity("SqlDatabaseVectorSearch.DataAccessLayer.Entities.Document", b => modelBuilder.Entity("SqlDatabaseVectorSearch.Data.Entities.Document", b =>
{ {
b.Navigation("Chunks"); b.Navigation("Chunks");
}); });
@@ -132,7 +132,7 @@ public class ChatService(IChatCompletionService chatCompletionService, Tokenizer
First provide your complete answer, then list all citations. First provide your complete answer, then list all citations.
Use this XML format for citations: Use this XML format for citations:
<citation document-id='document_id' chunk-id='chunk_id' filename='string' page-number='1'>exact quote here</citation> <citation document-id='document_id' chunk-id='chunk_id' filename='string' page-number='page_number' index-on-page='index_on_page'>exact quote here</citation>
"""); """);
var prompt = new StringBuilder($""" var prompt = new StringBuilder($"""
@@ -151,7 +151,7 @@ public class ChatService(IChatCompletionService chatCompletionService, Tokenizer
foreach (var chunk in chunks) foreach (var chunk in chunks)
{ {
var text = $"--- {chunk.Document.Name} (Document ID: {chunk.Document.Id} | Chunk ID: {chunk.Id} | Page Number: 1) {Environment.NewLine}{chunk.Content}{Environment.NewLine}"; var text = $"--- {chunk.Document.Name} (Document ID: {chunk.Document.Id} | Chunk ID: {chunk.Id} | Page Number: {chunk.PageNumber} | Index on Page: {chunk.IndexOnPage}) {Environment.NewLine}{chunk.Content}{Environment.NewLine}";
var tokenCount = tokenizerService.CountChatCompletionTokens(text); var tokenCount = tokenizerService.CountChatCompletionTokens(text);
if (tokenCount > availableTokens) if (tokenCount > availableTokens)
@@ -39,14 +39,24 @@ public class VectorSearchService(IServiceProvider serviceProvider, ApplicationDb
var document = new Entities.Document { Id = documentId.GetValueOrDefault(), Name = name, CreationDate = timeProvider.GetUtcNow() }; var document = new Entities.Document { Id = documentId.GetValueOrDefault(), Name = name, CreationDate = timeProvider.GetUtcNow() };
dbContext.Documents.Add(document); dbContext.Documents.Add(document);
var embeddings = await embeddingGenerator.GenerateAndZipAsync(paragraphs.Select(p => p.Content), cancellationToken: cancellationToken); var embeddings = await embeddingGenerator.GenerateAsync(paragraphs.Select(p => p.Content), cancellationToken: cancellationToken);
// Save the document chunks and the corresponding embedding in the database. // Save the document chunks and the corresponding embedding in the database.
foreach (var (index, embedding) in embeddings.Index()) foreach (var (index, embedding) in embeddings.Index())
{ {
logger.LogDebug("Storing a paragraph of {TokenCount} tokens.", tokenizerService.CountChatCompletionTokens(embedding.Value)); var paragraph = paragraphs.ElementAt(index);
logger.LogDebug("Storing a paragraph of {TokenCount} tokens.", tokenizerService.CountChatCompletionTokens(paragraph.Content));
var documentChunk = new Entities.DocumentChunk
{
Document = document,
Index = index,
PageNumber = paragraph.PageNumber,
IndexOnPage = paragraph.IndexOnPage,
Content = paragraph.Content,
Embedding = embedding.Vector.ToArray()
};
var documentChunk = new Entities.DocumentChunk { Document = document, Index = index, Content = embedding.Value, Embedding = embedding.Embedding.Vector.ToArray() };
dbContext.DocumentChunks.Add(documentChunk); dbContext.DocumentChunks.Add(documentChunk);
} }