mirror of
https://github.com/marcominerva/SqlDatabaseVectorSearch.git
synced 2026-06-20 12:23:10 +00:00
Enhanced document chunk handling and API
- Updated `Scripts.sql` to add a new `[Index]` column to `[dbo].[DocumentChunks]` for order tracking.
- Modified `DocumentChunk.cs` to include a new `Index` property, and introduced a new immutable record class for document chunks.
- Introduced new API endpoints in `Program.cs` for document and chunk retrieval, including embedding details, with OpenAPI documentation enhancements.
- Updated an API endpoint description in `Program.cs` to clarify document embedding handling.
- Updated `VectorSearchService.cs` to reflect schema changes in service logic, adding methods for fetching document chunks and specific embeddings.
This commit is contained in:
@@ -1,6 +1,7 @@
|
|||||||
CREATE TABLE [dbo].[DocumentChunks](
|
CREATE TABLE [dbo].[DocumentChunks](
|
||||||
[Id] [uniqueidentifier] NOT NULL,
|
[Id] [uniqueidentifier] NOT NULL,
|
||||||
[DocumentId] [uniqueidentifier] NOT NULL,
|
[DocumentId] [uniqueidentifier] NOT NULL,
|
||||||
|
[Index] INT NOT NULL,
|
||||||
[Content] [nvarchar](max) NOT NULL,
|
[Content] [nvarchar](max) NOT NULL,
|
||||||
[Embedding] [varbinary](8000) NOT NULL,
|
[Embedding] [varbinary](8000) NOT NULL,
|
||||||
CONSTRAINT [PK_DocumentChunks] PRIMARY KEY CLUSTERED
|
CONSTRAINT [PK_DocumentChunks] PRIMARY KEY CLUSTERED
|
||||||
|
|||||||
@@ -6,6 +6,8 @@ public partial class DocumentChunk
|
|||||||
|
|
||||||
public Guid DocumentId { get; set; }
|
public Guid DocumentId { get; set; }
|
||||||
|
|
||||||
|
public int Index { get; set; }
|
||||||
|
|
||||||
public required string Content { get; set; }
|
public required string Content { get; set; }
|
||||||
|
|
||||||
public required float[] Embedding { get; set; }
|
public required float[] Embedding { get; set; }
|
||||||
|
|||||||
@@ -0,0 +1,3 @@
|
|||||||
|
namespace SqlDatabaseVectorSearch.Models;
|
||||||
|
|
||||||
|
/// <summary>
/// Positional record describing a single chunk of a document.
/// </summary>
/// <param name="Id">Identifier of the chunk.</param>
/// <param name="Index">Index value stored for the chunk.</param>
/// <param name="Content">Text content of the chunk.</param>
/// <param name="Embedding">Embedding values of the chunk; <c>null</c> when omitted.</param>
public record class DocumentChunk(
    Guid Id,
    int Index,
    string Content,
    float[]? Embedding = null);
|
||||||
@@ -1,3 +1,4 @@
|
|||||||
|
using Microsoft.AspNetCore.Http.HttpResults;
|
||||||
using Microsoft.EntityFrameworkCore;
|
using Microsoft.EntityFrameworkCore;
|
||||||
using Microsoft.OpenApi.Models;
|
using Microsoft.OpenApi.Models;
|
||||||
using Microsoft.SemanticKernel;
|
using Microsoft.SemanticKernel;
|
||||||
@@ -74,7 +75,35 @@ documentsApiGroup.MapGet(string.Empty, async (VectorSearchService vectorSearchSe
|
|||||||
.WithOpenApi(operation =>
|
.WithOpenApi(operation =>
|
||||||
{
|
{
|
||||||
operation.Summary = "Gets the list of documents";
|
operation.Summary = "Gets the list of documents";
|
||||||
|
return operation;
|
||||||
|
});
|
||||||
|
|
||||||
|
// GET {documentId}/chunks: responds with 200 OK and whatever the service returns for the requested document.
documentsApiGroup.MapGet("{documentId:guid}/chunks", async (Guid documentId, VectorSearchService vectorSearchService) =>
    TypedResults.Ok(await vectorSearchService.GetDocumentChunksAsync(documentId)))
.WithOpenApi(operation =>
{
    // Text shown in the generated OpenAPI document for this operation.
    operation.Summary = "Gets the list of chunks of a given document";
    operation.Description = "The list does not contain embedding. Use '/api/documents/{documentId}/chunks/{documentChunkId}' to get the embedding for a given chunk.";

    return operation;
});
|
||||||
|
|
||||||
|
// GET {documentId}/chunks/{documentChunkId}: responds with 200 OK and the chunk returned by the service,
// or 404 Not Found when the service returns null for the given document/chunk pair.
documentsApiGroup.MapGet("{documentId:guid}/chunks/{documentChunkId:guid}", async Task<Results<Ok<DocumentChunk>, NotFound>> (Guid documentId, Guid documentChunkId, VectorSearchService vectorSearchService) =>
{
    var chunk = await vectorSearchService.GetDocumentChunkEmbeddingAsync(documentId, documentChunkId);
    if (chunk is null)
    {
        return TypedResults.NotFound();
    }

    return TypedResults.Ok(chunk);
})
.WithOpenApi(operation =>
{
    // Text shown in the generated OpenAPI document for this operation.
    operation.Summary = "Gets the details of a given chunk, including its embedding";

    return operation;
});
|
||||||
|
|
||||||
@@ -89,7 +118,7 @@ documentsApiGroup.MapPost(string.Empty, async (IFormFile file, VectorSearchServi
|
|||||||
.WithOpenApi(operation =>
|
.WithOpenApi(operation =>
|
||||||
{
|
{
|
||||||
operation.Summary = "Uploads a document";
|
operation.Summary = "Uploads a document";
|
||||||
operation.Description = "Uploads a document to SQL Server and saves its embeddings using Vector Support. The document will be indexed and used to answer questions. Currently, only PDF files are supported.";
|
operation.Description = "Uploads a document to SQL Server and saves its embedding using Vector Support. The document will be indexed and used to answer questions. Currently, only PDF files are supported.";
|
||||||
|
|
||||||
operation.Parameter("documentId").Description = "The unique identifier of the document. If not provided, a new one will be generated. If you specify an existing documentId, the document will be overridden.";
|
operation.Parameter("documentId").Description = "The unique identifier of the document. If not provided, a new one will be generated. If you specify an existing documentId, the document will be overridden.";
|
||||||
|
|
||||||
|
|||||||
@@ -39,9 +39,10 @@ public class VectorSearchService(ApplicationDbContext dbContext, ITextEmbeddingG
|
|||||||
var paragraphs = TextChunker.SplitPlainTextParagraphs(TextChunker.SplitPlainTextLines(content, appSettings.MaxTokensPerLine), appSettings.MaxTokensPerParagraph, appSettings.OverlapTokens);
|
var paragraphs = TextChunker.SplitPlainTextParagraphs(TextChunker.SplitPlainTextLines(content, appSettings.MaxTokensPerLine), appSettings.MaxTokensPerParagraph, appSettings.OverlapTokens);
|
||||||
var embeddings = await textEmbeddingGenerationService.GenerateEmbeddingsAsync(paragraphs);
|
var embeddings = await textEmbeddingGenerationService.GenerateEmbeddingsAsync(paragraphs);
|
||||||
|
|
||||||
|
var index = 0;
|
||||||
foreach (var (paragraph, embedding) in paragraphs.Zip(embeddings, (p, e) => (p, e.ToArray())))
|
foreach (var (paragraph, embedding) in paragraphs.Zip(embeddings, (p, e) => (p, e.ToArray())))
|
||||||
{
|
{
|
||||||
var documentChunk = new Entities.DocumentChunk { DocumentId = documentId.Value, Content = paragraph, Embedding = embedding };
|
var documentChunk = new Entities.DocumentChunk { DocumentId = documentId.Value, Index = index++, Content = paragraph, Embedding = embedding };
|
||||||
dbContext.DocumentChunks.Add(documentChunk);
|
dbContext.DocumentChunks.Add(documentChunk);
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -58,6 +59,24 @@ public class VectorSearchService(ApplicationDbContext dbContext, ITextEmbeddingG
|
|||||||
return documents;
|
return documents;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// <summary>
/// Gets the chunks that belong to the given document, sorted by their <c>Index</c>.
/// </summary>
/// <param name="documentId">Identifier of the document whose chunks are requested.</param>
/// <returns>The matching chunks, each projected with a <c>null</c> embedding.</returns>
public async Task<IEnumerable<DocumentChunk>> GetDocumentChunksAsync(Guid documentId)
{
    // NOTE(review): the explicit null is kept on purpose - if this lambda is compiled to an
    // expression tree (as EF Core queries normally are), optional arguments cannot be omitted.
    var query = dbContext.DocumentChunks
        .Where(chunk => chunk.DocumentId == documentId)
        .OrderBy(chunk => chunk.Index)
        .AsNoTracking()
        .Select(chunk => new DocumentChunk(chunk.Id, chunk.Index, chunk.Content, null));

    return await query.ToListAsync();
}
|
||||||
|
|
||||||
|
/// <summary>
/// Gets a single chunk, including its embedding, identified by both its own id and its document id.
/// </summary>
/// <param name="documentId">Identifier of the document the chunk must belong to.</param>
/// <param name="documentChunkId">Identifier of the chunk to retrieve.</param>
/// <returns>The matching chunk, or <c>null</c> when no chunk satisfies both identifiers.</returns>
public async Task<DocumentChunk?> GetDocumentChunkEmbeddingAsync(Guid documentId, Guid documentChunkId)
{
    // Both ids must match, so a chunk id paired with another document's id yields no result.
    var query = dbContext.DocumentChunks
        .Where(chunk => chunk.Id == documentChunkId && chunk.DocumentId == documentId)
        .AsNoTracking()
        .Select(chunk => new DocumentChunk(chunk.Id, chunk.Index, chunk.Content, chunk.Embedding));

    return await query.FirstOrDefaultAsync();
}
|
||||||
|
|
||||||
public async Task DeleteDocumentAsync(Guid documentId, bool saveChanges = true)
|
public async Task DeleteDocumentAsync(Guid documentId, bool saveChanges = true)
|
||||||
{
|
{
|
||||||
var document = await dbContext.Documents.Include(d => d.Chunks).FirstOrDefaultAsync(d => d.Id == documentId);
|
var document = await dbContext.Documents.Include(d => d.Chunks).FirstOrDefaultAsync(d => d.Id == documentId);
|
||||||
|
|||||||
Reference in New Issue
Block a user