mirror of
https://github.com/marcominerva/SqlDatabaseVectorSearch.git
synced 2026-06-20 12:23:10 +00:00
Enhance citation handling and document chunk structure
- Updated `Ask.razor` to change `PageNumber` to a nullable integer and added `IndexOnPage` to the `Citation` class. Adjusted the citation-parsing regex to match the new `index-on-page` attribute.
- Introduced `PageNumber` and `IndexOnPage` properties in `DocumentChunk.cs`, marking `Content` as required.
- Modified migration files to reflect changes in the `DocumentChunk` and `Document` entities.
- Updated the citation format in `ChatService.cs` to include `index-on-page` and adjusted the document chunk text formatting.
- Changed the embedding generation call in `VectorSearchService.cs` from `GenerateAndZipAsync` to `GenerateAsync` and updated document chunk creation to include the new properties.
This commit is contained in:
@@ -80,7 +80,7 @@
|
||||
{
|
||||
<div class="border rounded p-2 me-2 mb-2 citation-box small">
|
||||
<div>
|
||||
<strong>@citation.FileName</strong> @if (!string.IsNullOrEmpty(citation.PageNumber))
|
||||
<strong>@citation.FileName</strong> @if (citation.PageNumber.GetValueOrDefault() > 0)
|
||||
{
|
||||
<span class="ms-2">pag. @citation.PageNumber</span>
|
||||
}
|
||||
@@ -314,7 +314,7 @@
|
||||
return (text ?? string.Empty, citations);
|
||||
}
|
||||
|
||||
var pattern = @"<citation\s+document-id='(?<documentId>[^']*)'\s+chunk-id='(?<chunkId>[^']*)'\s+filename='(?<filename>[^']*)'\s+page-number='(?<pageNumber>[^']*)'>\s*(?<quote>.*?)\s*</citation>";
|
||||
var pattern = @"<citation\s+document-id='(?<documentId>[^']*)'\s+chunk-id='(?<chunkId>[^']*)'\s+filename='(?<filename>[^']*)'\s+page-number='(?<pageNumber>[^']*)'\s+index-on-page='(?<indexOnPage>[^']*)'>\s*(?<quote>.*?)\s*</citation>";
|
||||
|
||||
var matches = Regex.Matches(text, pattern, RegexOptions.Singleline);
|
||||
foreach (Match match in matches)
|
||||
@@ -326,7 +326,8 @@
|
||||
DocumentId = Guid.Parse(match.Groups["documentId"].Value),
|
||||
ChunkId = Guid.Parse(match.Groups["chunkId"].Value),
|
||||
FileName = match.Groups["filename"].Value,
|
||||
PageNumber = match.Groups["pageNumber"].Value,
|
||||
PageNumber = int.TryParse(match.Groups["pageNumber"].Value, out var pageNumber) && pageNumber > 0 ? pageNumber : null,
|
||||
IndexOnPage = int.TryParse(match.Groups["indexOnPage"].Value, out var indexOnPage) ? indexOnPage : 0,
|
||||
Quote = match.Groups["quote"].Value
|
||||
});
|
||||
}
|
||||
@@ -368,6 +369,8 @@
|
||||
|
||||
public string Quote { get; set; } = null!;
|
||||
|
||||
public string? PageNumber { get; set; }
|
||||
public int? PageNumber { get; set; }
|
||||
|
||||
public int IndexOnPage { get; set; }
|
||||
}
|
||||
}
|
||||
@@ -8,6 +8,10 @@ public class DocumentChunk
|
||||
|
||||
public int Index { get; set; }
|
||||
|
||||
public int? PageNumber { get; set; }
|
||||
|
||||
public int IndexOnPage { get; set; }
|
||||
|
||||
public required string Content { get; set; }
|
||||
|
||||
public required float[] Embedding { get; set; }
|
||||
|
||||
+14
-8
@@ -9,10 +9,10 @@ using SqlDatabaseVectorSearch.Data;
|
||||
|
||||
#nullable disable
|
||||
|
||||
namespace SqlDatabaseVectorSearch.Data.Migrations
|
||||
namespace SqlDatabaseVectorSearch.Migrations
|
||||
{
|
||||
[DbContext(typeof(ApplicationDbContext))]
|
||||
[Migration("20250224102351_Initial")]
|
||||
[Migration("20250606091336_Initial")]
|
||||
partial class Initial
|
||||
{
|
||||
/// <inheritdoc />
|
||||
@@ -20,12 +20,12 @@ namespace SqlDatabaseVectorSearch.Data.Migrations
|
||||
{
|
||||
#pragma warning disable 612, 618
|
||||
modelBuilder
|
||||
.HasAnnotation("ProductVersion", "9.0.2")
|
||||
.HasAnnotation("ProductVersion", "9.0.5")
|
||||
.HasAnnotation("Relational:MaxIdentifierLength", 128);
|
||||
|
||||
SqlServerModelBuilderExtensions.UseIdentityColumns(modelBuilder);
|
||||
|
||||
modelBuilder.Entity("SqlDatabaseVectorSearch.DataAccessLayer.Entities.Document", b =>
|
||||
modelBuilder.Entity("SqlDatabaseVectorSearch.Data.Entities.Document", b =>
|
||||
{
|
||||
b.Property<Guid>("Id")
|
||||
.ValueGeneratedOnAdd()
|
||||
@@ -44,7 +44,7 @@ namespace SqlDatabaseVectorSearch.Data.Migrations
|
||||
b.ToTable("Documents", (string)null);
|
||||
});
|
||||
|
||||
modelBuilder.Entity("SqlDatabaseVectorSearch.DataAccessLayer.Entities.DocumentChunk", b =>
|
||||
modelBuilder.Entity("SqlDatabaseVectorSearch.Data.Entities.DocumentChunk", b =>
|
||||
{
|
||||
b.Property<Guid>("Id")
|
||||
.ValueGeneratedOnAdd()
|
||||
@@ -64,6 +64,12 @@ namespace SqlDatabaseVectorSearch.Data.Migrations
|
||||
b.Property<int>("Index")
|
||||
.HasColumnType("int");
|
||||
|
||||
b.Property<int>("IndexOnPage")
|
||||
.HasColumnType("int");
|
||||
|
||||
b.Property<int?>("PageNumber")
|
||||
.HasColumnType("int");
|
||||
|
||||
b.HasKey("Id");
|
||||
|
||||
b.HasIndex("DocumentId");
|
||||
@@ -71,9 +77,9 @@ namespace SqlDatabaseVectorSearch.Data.Migrations
|
||||
b.ToTable("DocumentChunks", (string)null);
|
||||
});
|
||||
|
||||
modelBuilder.Entity("SqlDatabaseVectorSearch.DataAccessLayer.Entities.DocumentChunk", b =>
|
||||
modelBuilder.Entity("SqlDatabaseVectorSearch.Data.Entities.DocumentChunk", b =>
|
||||
{
|
||||
b.HasOne("SqlDatabaseVectorSearch.DataAccessLayer.Entities.Document", "Document")
|
||||
b.HasOne("SqlDatabaseVectorSearch.Data.Entities.Document", "Document")
|
||||
.WithMany("Chunks")
|
||||
.HasForeignKey("DocumentId")
|
||||
.OnDelete(DeleteBehavior.Cascade)
|
||||
@@ -83,7 +89,7 @@ namespace SqlDatabaseVectorSearch.Data.Migrations
|
||||
b.Navigation("Document");
|
||||
});
|
||||
|
||||
modelBuilder.Entity("SqlDatabaseVectorSearch.DataAccessLayer.Entities.Document", b =>
|
||||
modelBuilder.Entity("SqlDatabaseVectorSearch.Data.Entities.Document", b =>
|
||||
{
|
||||
b.Navigation("Chunks");
|
||||
});
|
||||
|
||||
@@ -3,7 +3,7 @@ using Microsoft.EntityFrameworkCore.Migrations;
|
||||
|
||||
#nullable disable
|
||||
|
||||
namespace SqlDatabaseVectorSearch.Data.Migrations
|
||||
namespace SqlDatabaseVectorSearch.Migrations
|
||||
{
|
||||
/// <inheritdoc />
|
||||
public partial class Initial : Migration
|
||||
@@ -31,6 +31,8 @@ namespace SqlDatabaseVectorSearch.Data.Migrations
|
||||
Id = table.Column<Guid>(type: "uniqueidentifier", nullable: false),
|
||||
DocumentId = table.Column<Guid>(type: "uniqueidentifier", nullable: false),
|
||||
Index = table.Column<int>(type: "int", nullable: false),
|
||||
PageNumber = table.Column<int>(type: "int", nullable: true),
|
||||
IndexOnPage = table.Column<int>(type: "int", nullable: false),
|
||||
Content = table.Column<string>(type: "nvarchar(max)", nullable: false),
|
||||
Embedding = table.Column<string>(type: "vector(1536)", nullable: false)
|
||||
},
|
||||
|
||||
@@ -8,7 +8,7 @@ using SqlDatabaseVectorSearch.Data;
|
||||
|
||||
#nullable disable
|
||||
|
||||
namespace SqlDatabaseVectorSearch.Data.Migrations
|
||||
namespace SqlDatabaseVectorSearch.Migrations
|
||||
{
|
||||
[DbContext(typeof(ApplicationDbContext))]
|
||||
partial class ApplicationDbContextModelSnapshot : ModelSnapshot
|
||||
@@ -17,12 +17,12 @@ namespace SqlDatabaseVectorSearch.Data.Migrations
|
||||
{
|
||||
#pragma warning disable 612, 618
|
||||
modelBuilder
|
||||
.HasAnnotation("ProductVersion", "9.0.2")
|
||||
.HasAnnotation("ProductVersion", "9.0.5")
|
||||
.HasAnnotation("Relational:MaxIdentifierLength", 128);
|
||||
|
||||
SqlServerModelBuilderExtensions.UseIdentityColumns(modelBuilder);
|
||||
|
||||
modelBuilder.Entity("SqlDatabaseVectorSearch.DataAccessLayer.Entities.Document", b =>
|
||||
modelBuilder.Entity("SqlDatabaseVectorSearch.Data.Entities.Document", b =>
|
||||
{
|
||||
b.Property<Guid>("Id")
|
||||
.ValueGeneratedOnAdd()
|
||||
@@ -41,7 +41,7 @@ namespace SqlDatabaseVectorSearch.Data.Migrations
|
||||
b.ToTable("Documents", (string)null);
|
||||
});
|
||||
|
||||
modelBuilder.Entity("SqlDatabaseVectorSearch.DataAccessLayer.Entities.DocumentChunk", b =>
|
||||
modelBuilder.Entity("SqlDatabaseVectorSearch.Data.Entities.DocumentChunk", b =>
|
||||
{
|
||||
b.Property<Guid>("Id")
|
||||
.ValueGeneratedOnAdd()
|
||||
@@ -61,6 +61,12 @@ namespace SqlDatabaseVectorSearch.Data.Migrations
|
||||
b.Property<int>("Index")
|
||||
.HasColumnType("int");
|
||||
|
||||
b.Property<int>("IndexOnPage")
|
||||
.HasColumnType("int");
|
||||
|
||||
b.Property<int?>("PageNumber")
|
||||
.HasColumnType("int");
|
||||
|
||||
b.HasKey("Id");
|
||||
|
||||
b.HasIndex("DocumentId");
|
||||
@@ -68,9 +74,9 @@ namespace SqlDatabaseVectorSearch.Data.Migrations
|
||||
b.ToTable("DocumentChunks", (string)null);
|
||||
});
|
||||
|
||||
modelBuilder.Entity("SqlDatabaseVectorSearch.DataAccessLayer.Entities.DocumentChunk", b =>
|
||||
modelBuilder.Entity("SqlDatabaseVectorSearch.Data.Entities.DocumentChunk", b =>
|
||||
{
|
||||
b.HasOne("SqlDatabaseVectorSearch.DataAccessLayer.Entities.Document", "Document")
|
||||
b.HasOne("SqlDatabaseVectorSearch.Data.Entities.Document", "Document")
|
||||
.WithMany("Chunks")
|
||||
.HasForeignKey("DocumentId")
|
||||
.OnDelete(DeleteBehavior.Cascade)
|
||||
@@ -80,7 +86,7 @@ namespace SqlDatabaseVectorSearch.Data.Migrations
|
||||
b.Navigation("Document");
|
||||
});
|
||||
|
||||
modelBuilder.Entity("SqlDatabaseVectorSearch.DataAccessLayer.Entities.Document", b =>
|
||||
modelBuilder.Entity("SqlDatabaseVectorSearch.Data.Entities.Document", b =>
|
||||
{
|
||||
b.Navigation("Chunks");
|
||||
});
|
||||
|
||||
@@ -132,7 +132,7 @@ public class ChatService(IChatCompletionService chatCompletionService, Tokenizer
|
||||
First provide your complete answer, then list all citations.
|
||||
|
||||
Use this XML format for citations:
|
||||
<citation document-id='document_id' chunk-id='chunk_id' filename='string' page-number='1'>exact quote here</citation>
|
||||
<citation document-id='document_id' chunk-id='chunk_id' filename='string' page-number='page_number' index-on-page='index_on_page'>exact quote here</citation>
|
||||
""");
|
||||
|
||||
var prompt = new StringBuilder($"""
|
||||
@@ -151,7 +151,7 @@ public class ChatService(IChatCompletionService chatCompletionService, Tokenizer
|
||||
|
||||
foreach (var chunk in chunks)
|
||||
{
|
||||
var text = $"--- {chunk.Document.Name} (Document ID: {chunk.Document.Id} | Chunk ID: {chunk.Id} | Page Number: 1) {Environment.NewLine}{chunk.Content}{Environment.NewLine}";
|
||||
var text = $"--- {chunk.Document.Name} (Document ID: {chunk.Document.Id} | Chunk ID: {chunk.Id} | Page Number: {chunk.PageNumber} | Index on Page: {chunk.IndexOnPage}) {Environment.NewLine}{chunk.Content}{Environment.NewLine}";
|
||||
|
||||
var tokenCount = tokenizerService.CountChatCompletionTokens(text);
|
||||
if (tokenCount > availableTokens)
|
||||
|
||||
@@ -39,14 +39,24 @@ public class VectorSearchService(IServiceProvider serviceProvider, ApplicationDb
|
||||
var document = new Entities.Document { Id = documentId.GetValueOrDefault(), Name = name, CreationDate = timeProvider.GetUtcNow() };
|
||||
dbContext.Documents.Add(document);
|
||||
|
||||
var embeddings = await embeddingGenerator.GenerateAndZipAsync(paragraphs.Select(p => p.Content), cancellationToken: cancellationToken);
|
||||
var embeddings = await embeddingGenerator.GenerateAsync(paragraphs.Select(p => p.Content), cancellationToken: cancellationToken);
|
||||
|
||||
// Save the document chunks and the corresponding embedding in the database.
|
||||
foreach (var (index, embedding) in embeddings.Index())
|
||||
{
|
||||
logger.LogDebug("Storing a paragraph of {TokenCount} tokens.", tokenizerService.CountChatCompletionTokens(embedding.Value));
|
||||
var paragraph = paragraphs.ElementAt(index);
|
||||
logger.LogDebug("Storing a paragraph of {TokenCount} tokens.", tokenizerService.CountChatCompletionTokens(paragraph.Content));
|
||||
|
||||
var documentChunk = new Entities.DocumentChunk
|
||||
{
|
||||
Document = document,
|
||||
Index = index,
|
||||
PageNumber = paragraph.PageNumber,
|
||||
IndexOnPage = paragraph.IndexOnPage,
|
||||
Content = paragraph.Content,
|
||||
Embedding = embedding.Vector.ToArray()
|
||||
};
|
||||
|
||||
var documentChunk = new Entities.DocumentChunk { Document = document, Index = index, Content = embedding.Value, Embedding = embedding.Embedding.Vector.ToArray() };
|
||||
dbContext.DocumentChunks.Add(documentChunk);
|
||||
}
|
||||
|
||||
|
||||
Reference in New Issue
Block a user