mirror of
https://github.com/marcominerva/SqlDatabaseVectorSearch.git
synced 2026-06-20 12:23:10 +00:00
Enhance citation handling and document chunk structure
- Updated `Ask.razor` to change `PageNumber` to a nullable integer and added `IndexOnPage` to the `Citation` class. Adjusted regex for citation parsing.
- Introduced `PageNumber` and `IndexOnPage` properties in `DocumentChunk.cs`, marking `Content` as required.
- Modified migration files to reflect changes in `DocumentChunk` and `Document` entities.
- Updated citation format in `ChatService.cs` to include `index-on-page` and adjusted document chunk text formatting.
- Changed embedding generation method in `VectorSearchService.cs` and updated document chunk creation to include new properties.
This commit is contained in:
@@ -80,7 +80,7 @@
|
|||||||
{
|
{
|
||||||
<div class="border rounded p-2 me-2 mb-2 citation-box small">
|
<div class="border rounded p-2 me-2 mb-2 citation-box small">
|
||||||
<div>
|
<div>
|
||||||
<strong>@citation.FileName</strong> @if (!string.IsNullOrEmpty(citation.PageNumber))
|
<strong>@citation.FileName</strong> @if (citation.PageNumber.GetValueOrDefault() > 0)
|
||||||
{
|
{
|
||||||
<span class="ms-2">pag. @citation.PageNumber</span>
|
<span class="ms-2">pag. @citation.PageNumber</span>
|
||||||
}
|
}
|
||||||
@@ -314,7 +314,7 @@
|
|||||||
return (text ?? string.Empty, citations);
|
return (text ?? string.Empty, citations);
|
||||||
}
|
}
|
||||||
|
|
||||||
var pattern = @"<citation\s+document-id='(?<documentId>[^']*)'\s+chunk-id='(?<chunkId>[^']*)'\s+filename='(?<filename>[^']*)'\s+page-number='(?<pageNumber>[^']*)'>\s*(?<quote>.*?)\s*</citation>";
|
var pattern = @"<citation\s+document-id='(?<documentId>[^']*)'\s+chunk-id='(?<chunkId>[^']*)'\s+filename='(?<filename>[^']*)'\s+page-number='(?<pageNumber>[^']*)'\s+index-on-page='(?<indexOnPage>[^']*)'>\s*(?<quote>.*?)\s*</citation>";
|
||||||
|
|
||||||
var matches = Regex.Matches(text, pattern, RegexOptions.Singleline);
|
var matches = Regex.Matches(text, pattern, RegexOptions.Singleline);
|
||||||
foreach (Match match in matches)
|
foreach (Match match in matches)
|
||||||
@@ -326,7 +326,8 @@
|
|||||||
DocumentId = Guid.Parse(match.Groups["documentId"].Value),
|
DocumentId = Guid.Parse(match.Groups["documentId"].Value),
|
||||||
ChunkId = Guid.Parse(match.Groups["chunkId"].Value),
|
ChunkId = Guid.Parse(match.Groups["chunkId"].Value),
|
||||||
FileName = match.Groups["filename"].Value,
|
FileName = match.Groups["filename"].Value,
|
||||||
PageNumber = match.Groups["pageNumber"].Value,
|
PageNumber = int.TryParse(match.Groups["pageNumber"].Value, out var pageNumber) && pageNumber > 0 ? pageNumber : null,
|
||||||
|
IndexOnPage = int.TryParse(match.Groups["indexOnPage"].Value, out var indexOnPage) ? indexOnPage : 0,
|
||||||
Quote = match.Groups["quote"].Value
|
Quote = match.Groups["quote"].Value
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
@@ -368,6 +369,8 @@
|
|||||||
|
|
||||||
public string Quote { get; set; } = null!;
|
public string Quote { get; set; } = null!;
|
||||||
|
|
||||||
public string? PageNumber { get; set; }
|
public int? PageNumber { get; set; }
|
||||||
|
|
||||||
|
public int IndexOnPage { get; set; }
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -8,6 +8,10 @@ public class DocumentChunk
|
|||||||
|
|
||||||
public int Index { get; set; }
|
public int Index { get; set; }
|
||||||
|
|
||||||
|
public int? PageNumber { get; set; }
|
||||||
|
|
||||||
|
public int IndexOnPage { get; set; }
|
||||||
|
|
||||||
public required string Content { get; set; }
|
public required string Content { get; set; }
|
||||||
|
|
||||||
public required float[] Embedding { get; set; }
|
public required float[] Embedding { get; set; }
|
||||||
|
|||||||
+14
-8
@@ -9,10 +9,10 @@ using SqlDatabaseVectorSearch.Data;
|
|||||||
|
|
||||||
#nullable disable
|
#nullable disable
|
||||||
|
|
||||||
namespace SqlDatabaseVectorSearch.Data.Migrations
|
namespace SqlDatabaseVectorSearch.Migrations
|
||||||
{
|
{
|
||||||
[DbContext(typeof(ApplicationDbContext))]
|
[DbContext(typeof(ApplicationDbContext))]
|
||||||
[Migration("20250224102351_Initial")]
|
[Migration("20250606091336_Initial")]
|
||||||
partial class Initial
|
partial class Initial
|
||||||
{
|
{
|
||||||
/// <inheritdoc />
|
/// <inheritdoc />
|
||||||
@@ -20,12 +20,12 @@ namespace SqlDatabaseVectorSearch.Data.Migrations
|
|||||||
{
|
{
|
||||||
#pragma warning disable 612, 618
|
#pragma warning disable 612, 618
|
||||||
modelBuilder
|
modelBuilder
|
||||||
.HasAnnotation("ProductVersion", "9.0.2")
|
.HasAnnotation("ProductVersion", "9.0.5")
|
||||||
.HasAnnotation("Relational:MaxIdentifierLength", 128);
|
.HasAnnotation("Relational:MaxIdentifierLength", 128);
|
||||||
|
|
||||||
SqlServerModelBuilderExtensions.UseIdentityColumns(modelBuilder);
|
SqlServerModelBuilderExtensions.UseIdentityColumns(modelBuilder);
|
||||||
|
|
||||||
modelBuilder.Entity("SqlDatabaseVectorSearch.DataAccessLayer.Entities.Document", b =>
|
modelBuilder.Entity("SqlDatabaseVectorSearch.Data.Entities.Document", b =>
|
||||||
{
|
{
|
||||||
b.Property<Guid>("Id")
|
b.Property<Guid>("Id")
|
||||||
.ValueGeneratedOnAdd()
|
.ValueGeneratedOnAdd()
|
||||||
@@ -44,7 +44,7 @@ namespace SqlDatabaseVectorSearch.Data.Migrations
|
|||||||
b.ToTable("Documents", (string)null);
|
b.ToTable("Documents", (string)null);
|
||||||
});
|
});
|
||||||
|
|
||||||
modelBuilder.Entity("SqlDatabaseVectorSearch.DataAccessLayer.Entities.DocumentChunk", b =>
|
modelBuilder.Entity("SqlDatabaseVectorSearch.Data.Entities.DocumentChunk", b =>
|
||||||
{
|
{
|
||||||
b.Property<Guid>("Id")
|
b.Property<Guid>("Id")
|
||||||
.ValueGeneratedOnAdd()
|
.ValueGeneratedOnAdd()
|
||||||
@@ -64,6 +64,12 @@ namespace SqlDatabaseVectorSearch.Data.Migrations
|
|||||||
b.Property<int>("Index")
|
b.Property<int>("Index")
|
||||||
.HasColumnType("int");
|
.HasColumnType("int");
|
||||||
|
|
||||||
|
b.Property<int>("IndexOnPage")
|
||||||
|
.HasColumnType("int");
|
||||||
|
|
||||||
|
b.Property<int?>("PageNumber")
|
||||||
|
.HasColumnType("int");
|
||||||
|
|
||||||
b.HasKey("Id");
|
b.HasKey("Id");
|
||||||
|
|
||||||
b.HasIndex("DocumentId");
|
b.HasIndex("DocumentId");
|
||||||
@@ -71,9 +77,9 @@ namespace SqlDatabaseVectorSearch.Data.Migrations
|
|||||||
b.ToTable("DocumentChunks", (string)null);
|
b.ToTable("DocumentChunks", (string)null);
|
||||||
});
|
});
|
||||||
|
|
||||||
modelBuilder.Entity("SqlDatabaseVectorSearch.DataAccessLayer.Entities.DocumentChunk", b =>
|
modelBuilder.Entity("SqlDatabaseVectorSearch.Data.Entities.DocumentChunk", b =>
|
||||||
{
|
{
|
||||||
b.HasOne("SqlDatabaseVectorSearch.DataAccessLayer.Entities.Document", "Document")
|
b.HasOne("SqlDatabaseVectorSearch.Data.Entities.Document", "Document")
|
||||||
.WithMany("Chunks")
|
.WithMany("Chunks")
|
||||||
.HasForeignKey("DocumentId")
|
.HasForeignKey("DocumentId")
|
||||||
.OnDelete(DeleteBehavior.Cascade)
|
.OnDelete(DeleteBehavior.Cascade)
|
||||||
@@ -83,7 +89,7 @@ namespace SqlDatabaseVectorSearch.Data.Migrations
|
|||||||
b.Navigation("Document");
|
b.Navigation("Document");
|
||||||
});
|
});
|
||||||
|
|
||||||
modelBuilder.Entity("SqlDatabaseVectorSearch.DataAccessLayer.Entities.Document", b =>
|
modelBuilder.Entity("SqlDatabaseVectorSearch.Data.Entities.Document", b =>
|
||||||
{
|
{
|
||||||
b.Navigation("Chunks");
|
b.Navigation("Chunks");
|
||||||
});
|
});
|
||||||
|
|||||||
@@ -3,7 +3,7 @@ using Microsoft.EntityFrameworkCore.Migrations;
|
|||||||
|
|
||||||
#nullable disable
|
#nullable disable
|
||||||
|
|
||||||
namespace SqlDatabaseVectorSearch.Data.Migrations
|
namespace SqlDatabaseVectorSearch.Migrations
|
||||||
{
|
{
|
||||||
/// <inheritdoc />
|
/// <inheritdoc />
|
||||||
public partial class Initial : Migration
|
public partial class Initial : Migration
|
||||||
@@ -31,6 +31,8 @@ namespace SqlDatabaseVectorSearch.Data.Migrations
|
|||||||
Id = table.Column<Guid>(type: "uniqueidentifier", nullable: false),
|
Id = table.Column<Guid>(type: "uniqueidentifier", nullable: false),
|
||||||
DocumentId = table.Column<Guid>(type: "uniqueidentifier", nullable: false),
|
DocumentId = table.Column<Guid>(type: "uniqueidentifier", nullable: false),
|
||||||
Index = table.Column<int>(type: "int", nullable: false),
|
Index = table.Column<int>(type: "int", nullable: false),
|
||||||
|
PageNumber = table.Column<int>(type: "int", nullable: true),
|
||||||
|
IndexOnPage = table.Column<int>(type: "int", nullable: false),
|
||||||
Content = table.Column<string>(type: "nvarchar(max)", nullable: false),
|
Content = table.Column<string>(type: "nvarchar(max)", nullable: false),
|
||||||
Embedding = table.Column<string>(type: "vector(1536)", nullable: false)
|
Embedding = table.Column<string>(type: "vector(1536)", nullable: false)
|
||||||
},
|
},
|
||||||
|
|||||||
@@ -8,7 +8,7 @@ using SqlDatabaseVectorSearch.Data;
|
|||||||
|
|
||||||
#nullable disable
|
#nullable disable
|
||||||
|
|
||||||
namespace SqlDatabaseVectorSearch.Data.Migrations
|
namespace SqlDatabaseVectorSearch.Migrations
|
||||||
{
|
{
|
||||||
[DbContext(typeof(ApplicationDbContext))]
|
[DbContext(typeof(ApplicationDbContext))]
|
||||||
partial class ApplicationDbContextModelSnapshot : ModelSnapshot
|
partial class ApplicationDbContextModelSnapshot : ModelSnapshot
|
||||||
@@ -17,12 +17,12 @@ namespace SqlDatabaseVectorSearch.Data.Migrations
|
|||||||
{
|
{
|
||||||
#pragma warning disable 612, 618
|
#pragma warning disable 612, 618
|
||||||
modelBuilder
|
modelBuilder
|
||||||
.HasAnnotation("ProductVersion", "9.0.2")
|
.HasAnnotation("ProductVersion", "9.0.5")
|
||||||
.HasAnnotation("Relational:MaxIdentifierLength", 128);
|
.HasAnnotation("Relational:MaxIdentifierLength", 128);
|
||||||
|
|
||||||
SqlServerModelBuilderExtensions.UseIdentityColumns(modelBuilder);
|
SqlServerModelBuilderExtensions.UseIdentityColumns(modelBuilder);
|
||||||
|
|
||||||
modelBuilder.Entity("SqlDatabaseVectorSearch.DataAccessLayer.Entities.Document", b =>
|
modelBuilder.Entity("SqlDatabaseVectorSearch.Data.Entities.Document", b =>
|
||||||
{
|
{
|
||||||
b.Property<Guid>("Id")
|
b.Property<Guid>("Id")
|
||||||
.ValueGeneratedOnAdd()
|
.ValueGeneratedOnAdd()
|
||||||
@@ -41,7 +41,7 @@ namespace SqlDatabaseVectorSearch.Data.Migrations
|
|||||||
b.ToTable("Documents", (string)null);
|
b.ToTable("Documents", (string)null);
|
||||||
});
|
});
|
||||||
|
|
||||||
modelBuilder.Entity("SqlDatabaseVectorSearch.DataAccessLayer.Entities.DocumentChunk", b =>
|
modelBuilder.Entity("SqlDatabaseVectorSearch.Data.Entities.DocumentChunk", b =>
|
||||||
{
|
{
|
||||||
b.Property<Guid>("Id")
|
b.Property<Guid>("Id")
|
||||||
.ValueGeneratedOnAdd()
|
.ValueGeneratedOnAdd()
|
||||||
@@ -61,6 +61,12 @@ namespace SqlDatabaseVectorSearch.Data.Migrations
|
|||||||
b.Property<int>("Index")
|
b.Property<int>("Index")
|
||||||
.HasColumnType("int");
|
.HasColumnType("int");
|
||||||
|
|
||||||
|
b.Property<int>("IndexOnPage")
|
||||||
|
.HasColumnType("int");
|
||||||
|
|
||||||
|
b.Property<int?>("PageNumber")
|
||||||
|
.HasColumnType("int");
|
||||||
|
|
||||||
b.HasKey("Id");
|
b.HasKey("Id");
|
||||||
|
|
||||||
b.HasIndex("DocumentId");
|
b.HasIndex("DocumentId");
|
||||||
@@ -68,9 +74,9 @@ namespace SqlDatabaseVectorSearch.Data.Migrations
|
|||||||
b.ToTable("DocumentChunks", (string)null);
|
b.ToTable("DocumentChunks", (string)null);
|
||||||
});
|
});
|
||||||
|
|
||||||
modelBuilder.Entity("SqlDatabaseVectorSearch.DataAccessLayer.Entities.DocumentChunk", b =>
|
modelBuilder.Entity("SqlDatabaseVectorSearch.Data.Entities.DocumentChunk", b =>
|
||||||
{
|
{
|
||||||
b.HasOne("SqlDatabaseVectorSearch.DataAccessLayer.Entities.Document", "Document")
|
b.HasOne("SqlDatabaseVectorSearch.Data.Entities.Document", "Document")
|
||||||
.WithMany("Chunks")
|
.WithMany("Chunks")
|
||||||
.HasForeignKey("DocumentId")
|
.HasForeignKey("DocumentId")
|
||||||
.OnDelete(DeleteBehavior.Cascade)
|
.OnDelete(DeleteBehavior.Cascade)
|
||||||
@@ -80,7 +86,7 @@ namespace SqlDatabaseVectorSearch.Data.Migrations
|
|||||||
b.Navigation("Document");
|
b.Navigation("Document");
|
||||||
});
|
});
|
||||||
|
|
||||||
modelBuilder.Entity("SqlDatabaseVectorSearch.DataAccessLayer.Entities.Document", b =>
|
modelBuilder.Entity("SqlDatabaseVectorSearch.Data.Entities.Document", b =>
|
||||||
{
|
{
|
||||||
b.Navigation("Chunks");
|
b.Navigation("Chunks");
|
||||||
});
|
});
|
||||||
|
|||||||
@@ -132,7 +132,7 @@ public class ChatService(IChatCompletionService chatCompletionService, Tokenizer
|
|||||||
First provide your complete answer, then list all citations.
|
First provide your complete answer, then list all citations.
|
||||||
|
|
||||||
Use this XML format for citations:
|
Use this XML format for citations:
|
||||||
<citation document-id='document_id' chunk-id='chunk_id' filename='string' page-number='1'>exact quote here</citation>
|
<citation document-id='document_id' chunk-id='chunk_id' filename='string' page-number='page_number' index-on-page='index_on_page'>exact quote here</citation>
|
||||||
""");
|
""");
|
||||||
|
|
||||||
var prompt = new StringBuilder($"""
|
var prompt = new StringBuilder($"""
|
||||||
@@ -151,7 +151,7 @@ public class ChatService(IChatCompletionService chatCompletionService, Tokenizer
|
|||||||
|
|
||||||
foreach (var chunk in chunks)
|
foreach (var chunk in chunks)
|
||||||
{
|
{
|
||||||
var text = $"--- {chunk.Document.Name} (Document ID: {chunk.Document.Id} | Chunk ID: {chunk.Id} | Page Number: 1) {Environment.NewLine}{chunk.Content}{Environment.NewLine}";
|
var text = $"--- {chunk.Document.Name} (Document ID: {chunk.Document.Id} | Chunk ID: {chunk.Id} | Page Number: {chunk.PageNumber} | Index on Page: {chunk.IndexOnPage}) {Environment.NewLine}{chunk.Content}{Environment.NewLine}";
|
||||||
|
|
||||||
var tokenCount = tokenizerService.CountChatCompletionTokens(text);
|
var tokenCount = tokenizerService.CountChatCompletionTokens(text);
|
||||||
if (tokenCount > availableTokens)
|
if (tokenCount > availableTokens)
|
||||||
|
|||||||
@@ -39,14 +39,24 @@ public class VectorSearchService(IServiceProvider serviceProvider, ApplicationDb
|
|||||||
var document = new Entities.Document { Id = documentId.GetValueOrDefault(), Name = name, CreationDate = timeProvider.GetUtcNow() };
|
var document = new Entities.Document { Id = documentId.GetValueOrDefault(), Name = name, CreationDate = timeProvider.GetUtcNow() };
|
||||||
dbContext.Documents.Add(document);
|
dbContext.Documents.Add(document);
|
||||||
|
|
||||||
var embeddings = await embeddingGenerator.GenerateAndZipAsync(paragraphs.Select(p => p.Content), cancellationToken: cancellationToken);
|
var embeddings = await embeddingGenerator.GenerateAsync(paragraphs.Select(p => p.Content), cancellationToken: cancellationToken);
|
||||||
|
|
||||||
// Save the document chunks and the corresponding embedding in the database.
|
// Save the document chunks and the corresponding embedding in the database.
|
||||||
foreach (var (index, embedding) in embeddings.Index())
|
foreach (var (index, embedding) in embeddings.Index())
|
||||||
{
|
{
|
||||||
logger.LogDebug("Storing a paragraph of {TokenCount} tokens.", tokenizerService.CountChatCompletionTokens(embedding.Value));
|
var paragraph = paragraphs.ElementAt(index);
|
||||||
|
logger.LogDebug("Storing a paragraph of {TokenCount} tokens.", tokenizerService.CountChatCompletionTokens(paragraph.Content));
|
||||||
|
|
||||||
|
var documentChunk = new Entities.DocumentChunk
|
||||||
|
{
|
||||||
|
Document = document,
|
||||||
|
Index = index,
|
||||||
|
PageNumber = paragraph.PageNumber,
|
||||||
|
IndexOnPage = paragraph.IndexOnPage,
|
||||||
|
Content = paragraph.Content,
|
||||||
|
Embedding = embedding.Vector.ToArray()
|
||||||
|
};
|
||||||
|
|
||||||
var documentChunk = new Entities.DocumentChunk { Document = document, Index = index, Content = embedding.Value, Embedding = embedding.Embedding.Vector.ToArray() };
|
|
||||||
dbContext.DocumentChunks.Add(documentChunk);
|
dbContext.DocumentChunks.Add(documentChunk);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user