From 3e952514852c659400417bc40688b076c649c128 Mon Sep 17 00:00:00 2001 From: Marco Minerva Date: Mon, 30 Sep 2024 17:08:28 +0200 Subject: [PATCH 1/5] Refactor to use the native VECTOR type --- Scripts.sql | 20 ++- .../DataAccessLayer/ApplicationDbContext.cs | 52 ------- .../DataAccessLayer/Entities/Document.cs | 12 -- .../DataAccessLayer/Entities/DocumentChunk.cs | 16 -- SqlDatabaseVectorSearch/Program.cs | 10 +- .../Services/VectorSearchService.cs | 142 ++++++++++++++---- .../SqlDatabaseVectorSearch.csproj | 3 +- 7 files changed, 129 insertions(+), 126 deletions(-) delete mode 100644 SqlDatabaseVectorSearch/DataAccessLayer/ApplicationDbContext.cs delete mode 100644 SqlDatabaseVectorSearch/DataAccessLayer/Entities/Document.cs delete mode 100644 SqlDatabaseVectorSearch/DataAccessLayer/Entities/DocumentChunk.cs diff --git a/Scripts.sql b/Scripts.sql index bb7843f..d62c420 100644 --- a/Scripts.sql +++ b/Scripts.sql @@ -1,26 +1,32 @@ -CREATE TABLE [dbo].[DocumentChunks]( +CREATE TABLE [dbo].[DocumentChunks2]( [Id] [uniqueidentifier] NOT NULL, [DocumentId] [uniqueidentifier] NOT NULL, [Index] [int] NOT NULL, [Content] [nvarchar](max) NOT NULL, - [Embedding] [varbinary](8000) NOT NULL, - CONSTRAINT [PK_DocumentChunks] PRIMARY KEY CLUSTERED + [Embedding] [vector](1536) NOT NULL, + CONSTRAINT [PK_DocumentChunks2] PRIMARY KEY CLUSTERED ( [Id] ASC )) GO -CREATE TABLE [dbo].[Documents]( +CREATE TABLE [dbo].[Documents2]( [Id] [uniqueidentifier] NOT NULL, [Name] [nvarchar](255) NOT NULL, [CreationDate] [datetimeoffset](7) NOT NULL, - CONSTRAINT [PK_Documents] PRIMARY KEY CLUSTERED + CONSTRAINT [PK_Documents2] PRIMARY KEY CLUSTERED ( [Id] ASC )) GO -ALTER TABLE [dbo].[DocumentChunks] WITH CHECK ADD CONSTRAINT [FK_DocumentChunks_Documents] FOREIGN KEY([DocumentId]) -REFERENCES [dbo].[Documents] ([Id]) +ALTER TABLE [dbo].[DocumentChunks2] WITH CHECK ADD CONSTRAINT [FK_DocumentChunks2_Documents2] FOREIGN KEY([DocumentId]) +REFERENCES [dbo].[Documents2] ([Id]) ON DELETE CASCADE +GO 
+ +ALTER TABLE [dbo].[Documents2] ADD CONSTRAINT [DF_Documents2_Id] DEFAULT (newsequentialid()) FOR [Id] +GO + +ALTER TABLE [dbo].[DocumentChunks2] ADD CONSTRAINT [DF_DocumentChunks2_Id] DEFAULT (newsequentialid()) FOR [Id] GO \ No newline at end of file diff --git a/SqlDatabaseVectorSearch/DataAccessLayer/ApplicationDbContext.cs b/SqlDatabaseVectorSearch/DataAccessLayer/ApplicationDbContext.cs deleted file mode 100644 index f4d88ed..0000000 --- a/SqlDatabaseVectorSearch/DataAccessLayer/ApplicationDbContext.cs +++ /dev/null @@ -1,52 +0,0 @@ -using EntityFramework.Exceptions.SqlServer; -using Microsoft.EntityFrameworkCore; -using SqlDatabaseVectorSearch.DataAccessLayer.Entities; - -namespace SqlDatabaseVectorSearch.DataAccessLayer; - -public class ApplicationDbContext(DbContextOptions options) : DbContext(options) -{ - public virtual DbSet Documents { get; set; } - - public virtual DbSet DocumentChunks { get; set; } - - protected override void OnConfiguring(DbContextOptionsBuilder optionsBuilder) - { - base.OnConfiguring(optionsBuilder); - - optionsBuilder.UseExceptionProcessor(); - //optionsBuilder.EnableSensitiveDataLogging(); - } - - protected override void OnModelCreating(ModelBuilder modelBuilder) - { - modelBuilder.Entity(entity => - { - entity.ToTable("Documents"); - entity.HasKey(e => e.Id); - - entity.Property(e => e.Id).ValueGeneratedOnAdd(); - entity.Property(e => e.Name) - .IsRequired() - .HasMaxLength(255); - }); - - modelBuilder.Entity(entity => - { - entity.ToTable("DocumentChunks"); - entity.HasKey(e => e.Id); - - entity.Property(e => e.Id).ValueGeneratedOnAdd(); - entity.Property(e => e.Content).IsRequired(); - entity.Property(e => e.Embedding) - .IsRequired() - .HasMaxLength(8000) - .IsVector(); - - entity.HasOne(d => d.Document).WithMany(p => p.Chunks) - .HasForeignKey(d => d.DocumentId) - .OnDelete(DeleteBehavior.Cascade) - .HasConstraintName("FK_DocumentChunks_Documents"); - }); - } -} diff --git 
a/SqlDatabaseVectorSearch/DataAccessLayer/Entities/Document.cs b/SqlDatabaseVectorSearch/DataAccessLayer/Entities/Document.cs deleted file mode 100644 index 17f4b1a..0000000 --- a/SqlDatabaseVectorSearch/DataAccessLayer/Entities/Document.cs +++ /dev/null @@ -1,12 +0,0 @@ -namespace SqlDatabaseVectorSearch.DataAccessLayer.Entities; - -public class Document -{ - public Guid Id { get; set; } - - public required string Name { get; set; } - - public DateTimeOffset CreationDate { get; set; } - - public virtual ICollection Chunks { get; set; } = []; -} diff --git a/SqlDatabaseVectorSearch/DataAccessLayer/Entities/DocumentChunk.cs b/SqlDatabaseVectorSearch/DataAccessLayer/Entities/DocumentChunk.cs deleted file mode 100644 index ea21ebc..0000000 --- a/SqlDatabaseVectorSearch/DataAccessLayer/Entities/DocumentChunk.cs +++ /dev/null @@ -1,16 +0,0 @@ -namespace SqlDatabaseVectorSearch.DataAccessLayer.Entities; - -public class DocumentChunk -{ - public Guid Id { get; set; } - - public Guid DocumentId { get; set; } - - public int Index { get; set; } - - public required string Content { get; set; } - - public required float[] Embedding { get; set; } - - public virtual Document Document { get; set; } = null!; -} diff --git a/SqlDatabaseVectorSearch/Program.cs b/SqlDatabaseVectorSearch/Program.cs index df29d6f..8832861 100644 --- a/SqlDatabaseVectorSearch/Program.cs +++ b/SqlDatabaseVectorSearch/Program.cs @@ -1,9 +1,9 @@ using Microsoft.AspNetCore.Http.HttpResults; +using Microsoft.Data.SqlClient; using Microsoft.EntityFrameworkCore; using Microsoft.OpenApi.Models; using Microsoft.SemanticKernel; using MinimalHelpers.OpenApi; -using SqlDatabaseVectorSearch.DataAccessLayer; using SqlDatabaseVectorSearch.Models; using SqlDatabaseVectorSearch.Services; using SqlDatabaseVectorSearch.Settings; @@ -19,12 +19,10 @@ var appSettings = builder.Services.ConfigureAndGet(builder.Configur builder.Services.AddSingleton(TimeProvider.System); 
-builder.Services.AddSqlServer(builder.Configuration.GetConnectionString("SqlConnection"), options => +builder.Services.AddScoped(_ => { - options.UseVectorSearch(); -}, options => -{ - options.UseQueryTrackingBehavior(QueryTrackingBehavior.NoTracking); + var sqlConnection = new SqlConnection(builder.Configuration.GetConnectionString("SqlConnection")); + return sqlConnection; }); builder.Services.AddMemoryCache(); diff --git a/SqlDatabaseVectorSearch/Services/VectorSearchService.cs b/SqlDatabaseVectorSearch/Services/VectorSearchService.cs index d7e10ce..013a60b 100644 --- a/SqlDatabaseVectorSearch/Services/VectorSearchService.cs +++ b/SqlDatabaseVectorSearch/Services/VectorSearchService.cs @@ -1,18 +1,19 @@ -using System.Text; -using Microsoft.EntityFrameworkCore; +using System.Data; +using System.Data.Common; +using System.Text; +using System.Text.Json; +using Microsoft.Data.SqlClient; using Microsoft.Extensions.Options; using Microsoft.SemanticKernel.Embeddings; using Microsoft.SemanticKernel.Text; -using SqlDatabaseVectorSearch.DataAccessLayer; using SqlDatabaseVectorSearch.Models; using SqlDatabaseVectorSearch.Settings; using UglyToad.PdfPig; using UglyToad.PdfPig.DocumentLayoutAnalysis.TextExtractor; -using Entities = SqlDatabaseVectorSearch.DataAccessLayer.Entities; namespace SqlDatabaseVectorSearch.Services; -public class VectorSearchService(ApplicationDbContext dbContext, ITextEmbeddingGenerationService textEmbeddingGenerationService, ChatService chatService, TimeProvider timeProvider, IOptions appSettingsOptions) +public class VectorSearchService(SqlConnection sqlConnection, ITextEmbeddingGenerationService textEmbeddingGenerationService, ChatService chatService, TimeProvider timeProvider, IOptions appSettingsOptions) { private readonly AppSettings appSettings = appSettingsOptions.Value; @@ -21,16 +22,25 @@ public class VectorSearchService(ApplicationDbContext dbContext, ITextEmbeddingG // Extract the contents of the file (currently, only PDF files are 
supported). var content = await GetContentAsync(stream); - await dbContext.Database.BeginTransactionAsync(); + await sqlConnection.OpenAsync(); + await using var transaction = await sqlConnection.BeginTransactionAsync(); if (documentId.HasValue) { // If the user is importing a document that already exists, delete the previous one. - await DeleteDocumentAsync(documentId.Value); + await DeleteDocumentAsync(documentId.Value, transaction); } - var document = new Entities.Document { Id = documentId.GetValueOrDefault(), Name = name, CreationDate = timeProvider.GetUtcNow() }; - dbContext.Documents.Add(document); + await using var command = sqlConnection.CreateCommand(); + command.Transaction = (SqlTransaction)transaction; + + command.CommandText = "INSERT INTO Documents2 (Id, [Name], CreationDate) OUTPUT INSERTED.Id VALUES (@Id, @Name, @CreationDate)"; + command.Parameters.AddWithValue("@Id", documentId.GetValueOrDefault(Guid.NewGuid())); + command.Parameters.AddWithValue("@Name", name); + command.Parameters.AddWithValue("@CreationDate", timeProvider.GetUtcNow()); + + var insertedId = await command.ExecuteScalarAsync(); + documentId = (Guid)insertedId!; // Split the content into chunks and generate the embeddings for each one. 
var paragraphs = TextChunker.SplitPlainTextParagraphs(TextChunker.SplitPlainTextLines(content, appSettings.MaxTokensPerLine), appSettings.MaxTokensPerParagraph, appSettings.OverlapTokens); @@ -39,45 +49,105 @@ public class VectorSearchService(ApplicationDbContext dbContext, ITextEmbeddingG var index = 0; foreach (var (paragraph, embedding) in paragraphs.Zip(embeddings, (p, e) => (p, e.ToArray()))) { - var documentChunk = new Entities.DocumentChunk { Document = document, Index = index++, Content = paragraph, Embedding = embedding }; - dbContext.DocumentChunks.Add(documentChunk); + command.Parameters.Clear(); + + command.CommandText = "INSERT INTO DocumentChunks2 (DocumentId, [Index], Content, Embedding) VALUES (@DocumentId, @Index, @Content, CAST(@Embedding AS VECTOR(1536)))"; + command.Parameters.AddWithValue("@DocumentId", documentId); + command.Parameters.AddWithValue("@Index", index++); + command.Parameters.AddWithValue("@Content", paragraph); + command.Parameters.AddWithValue("@Embedding", JsonSerializer.Serialize(embedding)); + + await command.ExecuteNonQueryAsync(); } - await dbContext.SaveChangesAsync(); - await dbContext.Database.CommitTransactionAsync(); + await transaction.CommitAsync(); - return document.Id; + return documentId.Value; } public async Task> GetDocumentsAsync() { - var documents = await dbContext.Documents.OrderBy(d => d.Name) - .Select(d => new Document(d.Id, d.Name, d.CreationDate, d.Chunks.Count)) - .ToListAsync(); + await sqlConnection.OpenAsync(); + await using var command = sqlConnection.CreateCommand(); + + command.CommandText = "SELECT Id, [Name], CreationDate, ChunkCount = (SELECT COUNT(*) FROM DocumentChunks2 WHERE DocumentId = Documents2.Id) FROM Documents2 ORDER BY [Name]"; + + var documents = new List(); + + using var reader = await command.ExecuteReaderAsync(); + while (await reader.ReadAsync()) + { + var id = reader.GetGuid(0); + var name = reader.GetString(1); + var creationDate = reader.GetDateTimeOffset(2); + var 
chunkCount = reader.GetInt32(3); + + documents.Add(new(id, name, creationDate, chunkCount)); + } return documents; } public async Task> GetDocumentChunksAsync(Guid documentId) { - var documentChunks = await dbContext.DocumentChunks.Where(c => c.DocumentId == documentId).OrderBy(c => c.Index) - .Select(c => new DocumentChunk(c.Id, c.Index, c.Content, null)) - .ToListAsync(); + await sqlConnection.OpenAsync(); + await using var command = sqlConnection.CreateCommand(); + + command.CommandText = "SELECT Id, [Index], Content FROM DocumentChunks2 WHERE DocumentId = @DocumentId ORDER BY [Index]"; + command.Parameters.AddWithValue("@DocumentId", documentId); + + var documentChunks = new List(); + + using var reader = await command.ExecuteReaderAsync(); + while (await reader.ReadAsync()) + { + var id = reader.GetGuid(0); + var index = reader.GetInt32(1); + var content = reader.GetString(2); + + documentChunks.Add(new(id, index, content, null)); + } return documentChunks; } public async Task GetDocumentChunkEmbeddingAsync(Guid documentId, Guid documentChunkId) { - var documentChunk = await dbContext.DocumentChunks.Where(c => c.Id == documentChunkId && c.DocumentId == documentId) - .Select(c => new DocumentChunk(c.Id, c.Index, c.Content, c.Embedding)) - .FirstOrDefaultAsync(); + await sqlConnection.OpenAsync(); + await using var command = sqlConnection.CreateCommand(); - return documentChunk; + command.CommandText = "SELECT [Index], Content, CAST(Embedding AS NVARCHAR(MAX)) FROM DocumentChunks2 WHERE Id = @DocumentChunkId AND DocumentId = @DocumentId"; + command.Parameters.AddWithValue("@DocumentChunkId", documentChunkId); + command.Parameters.AddWithValue("@DocumentId", documentId); + + using var reader = await command.ExecuteReaderAsync(); + if (reader.HasRows && await reader.ReadAsync()) + { + var index = reader.GetInt32(0); + var content = reader.GetString(1); + var embedding = JsonSerializer.Deserialize(reader.GetString(2))!; + + return new(documentChunkId, index, 
content, embedding); + } + + return null; } - public Task DeleteDocumentAsync(Guid documentId) - => dbContext.Documents.Where(d => d.Id == documentId).ExecuteDeleteAsync(); + public async Task DeleteDocumentAsync(Guid documentId, DbTransaction? transaction = null) + { + if (sqlConnection.State == ConnectionState.Closed) + { + await sqlConnection.OpenAsync(); + } + + using var command = sqlConnection.CreateCommand(); + command.Transaction = transaction as SqlTransaction; + + command.CommandText = "DELETE FROM Documents2 WHERE Id = @DocumentId"; + command.Parameters.AddWithValue("@DocumentId", documentId); + + await command.ExecuteNonQueryAsync(); + } public async Task AskQuestionAsync(Question question, bool reformulate = true) { @@ -87,11 +157,21 @@ public class VectorSearchService(ApplicationDbContext dbContext, ITextEmbeddingG // Perform Vector Search on SQL Database. var questionEmbedding = await textEmbeddingGenerationService.GenerateEmbeddingAsync(reformulatedQuestion); - var chunks = await dbContext.DocumentChunks - .OrderBy(c => EF.Functions.VectorDistance("cosine", c.Embedding, questionEmbedding.ToArray())) - .Select(c => c.Content) - .Take(appSettings.MaxRelevantChunks) - .ToListAsync(); + await sqlConnection.OpenAsync(); + await using var command = sqlConnection.CreateCommand(); + + command.CommandText = "SELECT TOP (@MaxRelevantChunks) Content FROM DocumentChunks2 ORDER BY VECTOR_DISTANCE('cosine', Embedding, CAST(@QuestionEmbedding AS VECTOR(1536)))"; + command.Parameters.AddWithValue("@MaxRelevantChunks", appSettings.MaxRelevantChunks); + command.Parameters.AddWithValue("@QuestionEmbedding", JsonSerializer.Serialize(questionEmbedding)); + + var chunks = new List(); + + using var reader = await command.ExecuteReaderAsync(); + while (await reader.ReadAsync()) + { + var content = reader.GetString(0); + chunks.Add(content); + } var answer = await chatService.AskQuestionAsync(question.ConversationId, chunks, reformulatedQuestion); return new 
Response(reformulatedQuestion, answer); diff --git a/SqlDatabaseVectorSearch/SqlDatabaseVectorSearch.csproj b/SqlDatabaseVectorSearch/SqlDatabaseVectorSearch.csproj index a5caf99..3e07174 100644 --- a/SqlDatabaseVectorSearch/SqlDatabaseVectorSearch.csproj +++ b/SqlDatabaseVectorSearch/SqlDatabaseVectorSearch.csproj @@ -8,10 +8,9 @@ - - + From 2dff0aae55e0c1cce4dacf7f5616b17336e3efc0 Mon Sep 17 00:00:00 2001 From: Marco Minerva Date: Mon, 30 Sep 2024 17:53:59 +0200 Subject: [PATCH 2/5] Add dimensions parameter for embeddings; reformat SQL Updated Program.cs to include dimensions parameter for AddAzureOpenAITextEmbeddingGeneration sourced from aiSettings.Embedding.Dimensions. Reformatted SQL command texts in VectorSearchService.cs for better readability. Introduced EmbeddingServiceSettings class in AzureOpenAISettings.cs to allow optional dimensions configuration. Updated appsettings.json to include new Dimensions property under Embedding section. --- SqlDatabaseVectorSearch/Program.cs | 2 +- .../Services/VectorSearchService.cs | 49 ++++++++++++++----- .../Settings/AzureOpenAISettings.cs | 7 ++- SqlDatabaseVectorSearch/appsettings.json | 3 +- 4 files changed, 47 insertions(+), 14 deletions(-) diff --git a/SqlDatabaseVectorSearch/Program.cs b/SqlDatabaseVectorSearch/Program.cs index 8832861..244729a 100644 --- a/SqlDatabaseVectorSearch/Program.cs +++ b/SqlDatabaseVectorSearch/Program.cs @@ -30,7 +30,7 @@ builder.Services.AddMemoryCache(); // Semantic Kernel is used to generate embeddings and to reformulate questions taking into account all the previous interactions, // so that embeddings themselves can be generated more accurately. 
builder.Services.AddKernel() - .AddAzureOpenAITextEmbeddingGeneration(aiSettings.Embedding.Deployment, aiSettings.Embedding.Endpoint, aiSettings.Embedding.ApiKey) + .AddAzureOpenAITextEmbeddingGeneration(aiSettings.Embedding.Deployment, aiSettings.Embedding.Endpoint, aiSettings.Embedding.ApiKey, dimensions: aiSettings.Embedding.Dimensions) .AddAzureOpenAIChatCompletion(aiSettings.ChatCompletion.Deployment, aiSettings.ChatCompletion.Endpoint, aiSettings.ChatCompletion.ApiKey); builder.Services.AddScoped(); diff --git a/SqlDatabaseVectorSearch/Services/VectorSearchService.cs b/SqlDatabaseVectorSearch/Services/VectorSearchService.cs index 013a60b..207689d 100644 --- a/SqlDatabaseVectorSearch/Services/VectorSearchService.cs +++ b/SqlDatabaseVectorSearch/Services/VectorSearchService.cs @@ -34,7 +34,12 @@ public class VectorSearchService(SqlConnection sqlConnection, ITextEmbeddingGene await using var command = sqlConnection.CreateCommand(); command.Transaction = (SqlTransaction)transaction; - command.CommandText = "INSERT INTO Documents2 (Id, [Name], CreationDate) OUTPUT INSERTED.Id VALUES (@Id, @Name, @CreationDate)"; + command.CommandText = """ + INSERT INTO Documents2 (Id, [Name], CreationDate) + OUTPUT INSERTED.Id + VALUES (@Id, @Name, @CreationDate); + """; + command.Parameters.AddWithValue("@Id", documentId.GetValueOrDefault(Guid.NewGuid())); command.Parameters.AddWithValue("@Name", name); command.Parameters.AddWithValue("@CreationDate", timeProvider.GetUtcNow()); @@ -51,7 +56,11 @@ public class VectorSearchService(SqlConnection sqlConnection, ITextEmbeddingGene { command.Parameters.Clear(); - command.CommandText = "INSERT INTO DocumentChunks2 (DocumentId, [Index], Content, Embedding) VALUES (@DocumentId, @Index, @Content, CAST(@Embedding AS VECTOR(1536)))"; + command.CommandText = $""" + INSERT INTO DocumentChunks2 (DocumentId, [Index], Content, Embedding) + VALUES (@DocumentId, @Index, @Content, CAST(@Embedding AS VECTOR({embedding.Length}))); + """; + 
command.Parameters.AddWithValue("@DocumentId", documentId); command.Parameters.AddWithValue("@Index", index++); command.Parameters.AddWithValue("@Content", paragraph); @@ -70,11 +79,15 @@ public class VectorSearchService(SqlConnection sqlConnection, ITextEmbeddingGene await sqlConnection.OpenAsync(); await using var command = sqlConnection.CreateCommand(); - command.CommandText = "SELECT Id, [Name], CreationDate, ChunkCount = (SELECT COUNT(*) FROM DocumentChunks2 WHERE DocumentId = Documents2.Id) FROM Documents2 ORDER BY [Name]"; + command.CommandText = """ + SELECT Id, [Name], CreationDate, ChunkCount = (SELECT COUNT(*) FROM DocumentChunks2 + WHERE DocumentId = Documents2.Id) + FROM Documents2 ORDER BY [Name]; + """; var documents = new List(); - using var reader = await command.ExecuteReaderAsync(); + await using var reader = await command.ExecuteReaderAsync(); while (await reader.ReadAsync()) { var id = reader.GetGuid(0); @@ -93,12 +106,17 @@ public class VectorSearchService(SqlConnection sqlConnection, ITextEmbeddingGene await sqlConnection.OpenAsync(); await using var command = sqlConnection.CreateCommand(); - command.CommandText = "SELECT Id, [Index], Content FROM DocumentChunks2 WHERE DocumentId = @DocumentId ORDER BY [Index]"; + command.CommandText = """ + SELECT Id, [Index], Content + FROM DocumentChunks2 WHERE DocumentId = @DocumentId + ORDER BY [Index]; + """; + command.Parameters.AddWithValue("@DocumentId", documentId); var documentChunks = new List(); - using var reader = await command.ExecuteReaderAsync(); + await using var reader = await command.ExecuteReaderAsync(); while (await reader.ReadAsync()) { var id = reader.GetGuid(0); @@ -116,11 +134,15 @@ public class VectorSearchService(SqlConnection sqlConnection, ITextEmbeddingGene await sqlConnection.OpenAsync(); await using var command = sqlConnection.CreateCommand(); - command.CommandText = "SELECT [Index], Content, CAST(Embedding AS NVARCHAR(MAX)) FROM DocumentChunks2 WHERE Id = @DocumentChunkId 
AND DocumentId = @DocumentId"; + command.CommandText = """ + "SELECT [Index], Content, CAST(Embedding AS NVARCHAR(MAX)) + FROM DocumentChunks2 WHERE Id = @DocumentChunkId AND DocumentId = @DocumentId; + """; + command.Parameters.AddWithValue("@DocumentChunkId", documentChunkId); command.Parameters.AddWithValue("@DocumentId", documentId); - using var reader = await command.ExecuteReaderAsync(); + await using var reader = await command.ExecuteReaderAsync(); if (reader.HasRows && await reader.ReadAsync()) { var index = reader.GetInt32(0); @@ -140,7 +162,7 @@ public class VectorSearchService(SqlConnection sqlConnection, ITextEmbeddingGene await sqlConnection.OpenAsync(); } - using var command = sqlConnection.CreateCommand(); + await using var command = sqlConnection.CreateCommand(); command.Transaction = transaction as SqlTransaction; command.CommandText = "DELETE FROM Documents2 WHERE Id = @DocumentId"; @@ -160,13 +182,18 @@ public class VectorSearchService(SqlConnection sqlConnection, ITextEmbeddingGene await sqlConnection.OpenAsync(); await using var command = sqlConnection.CreateCommand(); - command.CommandText = "SELECT TOP (@MaxRelevantChunks) Content FROM DocumentChunks2 ORDER BY VECTOR_DISTANCE('cosine', Embedding, CAST(@QuestionEmbedding AS VECTOR(1536)))"; + command.CommandText = $""" + SELECT TOP (@MaxRelevantChunks) Content + FROM DocumentChunks2 + ORDER BY VECTOR_DISTANCE('cosine', Embedding, CAST(@QuestionEmbedding AS VECTOR({questionEmbedding.Length}))); + """; + command.Parameters.AddWithValue("@MaxRelevantChunks", appSettings.MaxRelevantChunks); command.Parameters.AddWithValue("@QuestionEmbedding", JsonSerializer.Serialize(questionEmbedding)); var chunks = new List(); - using var reader = await command.ExecuteReaderAsync(); + await using var reader = await command.ExecuteReaderAsync(); while (await reader.ReadAsync()) { var content = reader.GetString(0); diff --git a/SqlDatabaseVectorSearch/Settings/AzureOpenAISettings.cs 
b/SqlDatabaseVectorSearch/Settings/AzureOpenAISettings.cs index 5c9abb1..e85d51d 100644 --- a/SqlDatabaseVectorSearch/Settings/AzureOpenAISettings.cs +++ b/SqlDatabaseVectorSearch/Settings/AzureOpenAISettings.cs @@ -4,7 +4,7 @@ public class AzureOpenAISettings { public required ServiceSettings ChatCompletion { get; init; } - public required ServiceSettings Embedding { get; init; } + public required EmbeddingServiceSettings Embedding { get; init; } } public class ServiceSettings @@ -15,3 +15,8 @@ public class ServiceSettings public required string ApiKey { get; init; } } + +public class EmbeddingServiceSettings : ServiceSettings +{ + public int? Dimensions { get; set; } +} diff --git a/SqlDatabaseVectorSearch/appsettings.json b/SqlDatabaseVectorSearch/appsettings.json index 5c5c42b..75f72ec 100644 --- a/SqlDatabaseVectorSearch/appsettings.json +++ b/SqlDatabaseVectorSearch/appsettings.json @@ -11,7 +11,8 @@ "Embedding": { "Endpoint": "", "Deployment": "", - "ApiKey": "" + "ApiKey": "", + "Dimensions": null } }, "AppSettings": { From 4355f72dab64cfeda592120e58563b60d6c6d4ef Mon Sep 17 00:00:00 2001 From: Marco Minerva Date: Tue, 1 Oct 2024 11:39:21 +0200 Subject: [PATCH 3/5] Refactor DB operations, rename tables, add Dapper Refactored `VectorSearchService.cs` to use Dapper for DB operations, replacing raw ADO.NET commands. Updated methods for inserting, retrieving, and deleting documents and chunks. Modified vector search query to use Dapper's `QueryAsync`. Updated `SqlDatabaseVectorSearch.csproj` to include Dapper package reference, version `2.1.35`. 
--- Scripts.sql | 30 ++-- .../Models/DocumentChunk.cs | 15 +- .../Services/VectorSearchService.cs | 130 ++++++------------ .../SqlDatabaseVectorSearch.csproj | 1 + 4 files changed, 74 insertions(+), 102 deletions(-) diff --git a/Scripts.sql b/Scripts.sql index d62c420..a2e1e93 100644 --- a/Scripts.sql +++ b/Scripts.sql @@ -1,4 +1,14 @@ -CREATE TABLE [dbo].[DocumentChunks2]( +CREATE TABLE [dbo].[Documents]( + [Id] [uniqueidentifier] NOT NULL, + [Name] [nvarchar](255) NOT NULL, + [CreationDate] [datetimeoffset](7) NOT NULL, + CONSTRAINT [PK_Documents2] PRIMARY KEY CLUSTERED +( + [Id] ASC +)) +GO + +CREATE TABLE [dbo].[DocumentChunks]( [Id] [uniqueidentifier] NOT NULL, [DocumentId] [uniqueidentifier] NOT NULL, [Index] [int] NOT NULL, @@ -10,23 +20,13 @@ CREATE TABLE [dbo].[DocumentChunks2]( )) GO -CREATE TABLE [dbo].[Documents2]( - [Id] [uniqueidentifier] NOT NULL, - [Name] [nvarchar](255) NOT NULL, - [CreationDate] [datetimeoffset](7) NOT NULL, - CONSTRAINT [PK_Documents2] PRIMARY KEY CLUSTERED -( - [Id] ASC -)) -GO - -ALTER TABLE [dbo].[DocumentChunks2] WITH CHECK ADD CONSTRAINT [FK_DocumentChunks2_Documents2] FOREIGN KEY([DocumentId]) -REFERENCES [dbo].[Documents2] ([Id]) +ALTER TABLE [dbo].[DocumentChunks] WITH CHECK ADD CONSTRAINT [FK_DocumentChunks_Documents] FOREIGN KEY([DocumentId]) +REFERENCES [dbo].[Documents] ([Id]) ON DELETE CASCADE GO -ALTER TABLE [dbo].[Documents2] ADD CONSTRAINT [DF_Documents2_Id] DEFAULT (newsequentialid()) FOR [Id] +ALTER TABLE [dbo].[Documents] ADD CONSTRAINT [DF_Documents_Id] DEFAULT (newsequentialid()) FOR [Id] GO -ALTER TABLE [dbo].[DocumentChunks2] ADD CONSTRAINT [DF_DocumentChunks2_Id] DEFAULT (newsequentialid()) FOR [Id] +ALTER TABLE [dbo].[DocumentChunks] ADD CONSTRAINT [DF_DocumentChunks_Id] DEFAULT (newsequentialid()) FOR [Id] GO \ No newline at end of file diff --git a/SqlDatabaseVectorSearch/Models/DocumentChunk.cs b/SqlDatabaseVectorSearch/Models/DocumentChunk.cs index 3753352..15bfab7 100644 --- 
a/SqlDatabaseVectorSearch/Models/DocumentChunk.cs +++ b/SqlDatabaseVectorSearch/Models/DocumentChunk.cs @@ -1,3 +1,14 @@ -namespace SqlDatabaseVectorSearch.Models; +using System.Text.Json; -public record class DocumentChunk(Guid Id, int Index, string Content, float[]? Embedding = null); +namespace SqlDatabaseVectorSearch.Models; + +public record class DocumentChunk(Guid Id, int Index, string Content, float[]? Embedding) +{ + public DocumentChunk(Guid Id, int Index, string Content) : this(Id, Index, Content, (float[]?)null) + { + } + + public DocumentChunk(Guid Id, int Index, string Content, string Embedding) : this(Id, Index, Content, JsonSerializer.Deserialize(Embedding)) + { + } +} diff --git a/SqlDatabaseVectorSearch/Services/VectorSearchService.cs b/SqlDatabaseVectorSearch/Services/VectorSearchService.cs index 207689d..f1b57e0 100644 --- a/SqlDatabaseVectorSearch/Services/VectorSearchService.cs +++ b/SqlDatabaseVectorSearch/Services/VectorSearchService.cs @@ -2,12 +2,14 @@ using System.Data.Common; using System.Text; using System.Text.Json; +using Dapper; using Microsoft.Data.SqlClient; using Microsoft.Extensions.Options; using Microsoft.SemanticKernel.Embeddings; using Microsoft.SemanticKernel.Text; using SqlDatabaseVectorSearch.Models; using SqlDatabaseVectorSearch.Settings; +using TinyHelpers.Extensions; using UglyToad.PdfPig; using UglyToad.PdfPig.DocumentLayoutAnalysis.TextExtractor; @@ -35,7 +37,7 @@ public class VectorSearchService(SqlConnection sqlConnection, ITextEmbeddingGene command.Transaction = (SqlTransaction)transaction; command.CommandText = """ - INSERT INTO Documents2 (Id, [Name], CreationDate) + INSERT INTO Documents (Id, [Name], CreationDate) OUTPUT INSERTED.Id VALUES (@Id, @Name, @CreationDate); """; @@ -51,20 +53,19 @@ public class VectorSearchService(SqlConnection sqlConnection, ITextEmbeddingGene var paragraphs = TextChunker.SplitPlainTextParagraphs(TextChunker.SplitPlainTextLines(content, appSettings.MaxTokensPerLine), 
appSettings.MaxTokensPerParagraph, appSettings.OverlapTokens); var embeddings = await textEmbeddingGenerationService.GenerateEmbeddingsAsync(paragraphs); - var index = 0; - foreach (var (paragraph, embedding) in paragraphs.Zip(embeddings, (p, e) => (p, e.ToArray()))) + foreach (var (paragraph, index) in paragraphs.WithIndex()) { command.Parameters.Clear(); command.CommandText = $""" - INSERT INTO DocumentChunks2 (DocumentId, [Index], Content, Embedding) - VALUES (@DocumentId, @Index, @Content, CAST(@Embedding AS VECTOR({embedding.Length}))); + INSERT INTO DocumentChunks (DocumentId, [Index], Content, Embedding) + VALUES (@DocumentId, @Index, @Content, CAST(@Embedding AS VECTOR({embeddings[index].Length}))); """; command.Parameters.AddWithValue("@DocumentId", documentId); - command.Parameters.AddWithValue("@Index", index++); + command.Parameters.AddWithValue("@Index", index); command.Parameters.AddWithValue("@Content", paragraph); - command.Parameters.AddWithValue("@Embedding", JsonSerializer.Serialize(embedding)); + command.Parameters.AddWithValue("@Embedding", JsonSerializer.Serialize(embeddings[index])); await command.ExecuteNonQueryAsync(); } @@ -76,83 +77,36 @@ public class VectorSearchService(SqlConnection sqlConnection, ITextEmbeddingGene public async Task> GetDocumentsAsync() { - await sqlConnection.OpenAsync(); - await using var command = sqlConnection.CreateCommand(); - - command.CommandText = """ - SELECT Id, [Name], CreationDate, ChunkCount = (SELECT COUNT(*) FROM DocumentChunks2 - WHERE DocumentId = Documents2.Id) - FROM Documents2 ORDER BY [Name]; - """; - - var documents = new List(); - - await using var reader = await command.ExecuteReaderAsync(); - while (await reader.ReadAsync()) - { - var id = reader.GetGuid(0); - var name = reader.GetString(1); - var creationDate = reader.GetDateTimeOffset(2); - var chunkCount = reader.GetInt32(3); - - documents.Add(new(id, name, creationDate, chunkCount)); - } + var documents = await sqlConnection.QueryAsync(""" 
+ SELECT Id, [Name], CreationDate, ChunkCount = (SELECT COUNT(*) FROM DocumentChunks WHERE DocumentId = Documents.Id) + FROM Documents + ORDER BY [Name]; + """); return documents; } public async Task> GetDocumentChunksAsync(Guid documentId) { - await sqlConnection.OpenAsync(); - await using var command = sqlConnection.CreateCommand(); - - command.CommandText = """ + var documentChunks = await sqlConnection.QueryAsync(""" SELECT Id, [Index], Content - FROM DocumentChunks2 WHERE DocumentId = @DocumentId + FROM DocumentChunks + WHERE DocumentId = @DocumentId ORDER BY [Index]; - """; - - command.Parameters.AddWithValue("@DocumentId", documentId); - - var documentChunks = new List(); - - await using var reader = await command.ExecuteReaderAsync(); - while (await reader.ReadAsync()) - { - var id = reader.GetGuid(0); - var index = reader.GetInt32(1); - var content = reader.GetString(2); - - documentChunks.Add(new(id, index, content, null)); - } + """, new { documentId }); return documentChunks; } public async Task GetDocumentChunkEmbeddingAsync(Guid documentId, Guid documentChunkId) { - await sqlConnection.OpenAsync(); - await using var command = sqlConnection.CreateCommand(); + var documentChunk = await sqlConnection.QueryFirstOrDefaultAsync(""" + SELECT Id, [Index], Content, CAST(Embedding AS NVARCHAR(MAX)) AS Embedding + FROM DocumentChunks + WHERE Id = @DocumentChunkId AND DocumentId = @DocumentId; + """, new { documentId, documentChunkId }); - command.CommandText = """ - "SELECT [Index], Content, CAST(Embedding AS NVARCHAR(MAX)) - FROM DocumentChunks2 WHERE Id = @DocumentChunkId AND DocumentId = @DocumentId; - """; - - command.Parameters.AddWithValue("@DocumentChunkId", documentChunkId); - command.Parameters.AddWithValue("@DocumentId", documentId); - - await using var reader = await command.ExecuteReaderAsync(); - if (reader.HasRows && await reader.ReadAsync()) - { - var index = reader.GetInt32(0); - var content = reader.GetString(1); - var embedding = 
JsonSerializer.Deserialize(reader.GetString(2))!; - - return new(documentChunkId, index, content, embedding); - } - - return null; + return documentChunk; } public async Task DeleteDocumentAsync(Guid documentId, DbTransaction? transaction = null) @@ -165,7 +119,7 @@ public class VectorSearchService(SqlConnection sqlConnection, ITextEmbeddingGene await using var command = sqlConnection.CreateCommand(); command.Transaction = transaction as SqlTransaction; - command.CommandText = "DELETE FROM Documents2 WHERE Id = @DocumentId"; + command.CommandText = "DELETE FROM Documents WHERE Id = @DocumentId"; command.Parameters.AddWithValue("@DocumentId", documentId); await command.ExecuteNonQueryAsync(); @@ -179,26 +133,32 @@ public class VectorSearchService(SqlConnection sqlConnection, ITextEmbeddingGene // Perform Vector Search on SQL Database. var questionEmbedding = await textEmbeddingGenerationService.GenerateEmbeddingAsync(reformulatedQuestion); - await sqlConnection.OpenAsync(); - await using var command = sqlConnection.CreateCommand(); - - command.CommandText = $""" + var chunks = await sqlConnection.QueryAsync($""" SELECT TOP (@MaxRelevantChunks) Content - FROM DocumentChunks2 + FROM DocumentChunks ORDER BY VECTOR_DISTANCE('cosine', Embedding, CAST(@QuestionEmbedding AS VECTOR({questionEmbedding.Length}))); - """; + """, new { appSettings.MaxRelevantChunks, QuestionEmbedding = JsonSerializer.Serialize(questionEmbedding) }); - command.Parameters.AddWithValue("@MaxRelevantChunks", appSettings.MaxRelevantChunks); - command.Parameters.AddWithValue("@QuestionEmbedding", JsonSerializer.Serialize(questionEmbedding)); + //await sqlConnection.OpenAsync(); + //await using var command = sqlConnection.CreateCommand(); - var chunks = new List(); + //command.CommandText = $""" + // SELECT TOP (@MaxRelevantChunks) Content + // FROM DocumentChunks + // ORDER BY VECTOR_DISTANCE('cosine', Embedding, CAST(@QuestionEmbedding AS VECTOR({questionEmbedding.Length}))); + // """; - await using 
var reader = await command.ExecuteReaderAsync(); - while (await reader.ReadAsync()) - { - var content = reader.GetString(0); - chunks.Add(content); - } + //command.Parameters.AddWithValue("@MaxRelevantChunks", appSettings.MaxRelevantChunks); + //command.Parameters.AddWithValue("@QuestionEmbedding", JsonSerializer.Serialize(questionEmbedding)); + + //var chunks = new List(); + + //await using var reader = await command.ExecuteReaderAsync(); + //while (await reader.ReadAsync()) + //{ + // var content = reader.GetString(0); + // chunks.Add(content); + //} var answer = await chatService.AskQuestionAsync(question.ConversationId, chunks, reformulatedQuestion); return new Response(reformulatedQuestion, answer); diff --git a/SqlDatabaseVectorSearch/SqlDatabaseVectorSearch.csproj b/SqlDatabaseVectorSearch/SqlDatabaseVectorSearch.csproj index 3e07174..16d541a 100644 --- a/SqlDatabaseVectorSearch/SqlDatabaseVectorSearch.csproj +++ b/SqlDatabaseVectorSearch/SqlDatabaseVectorSearch.csproj @@ -8,6 +8,7 @@ + From 1b2ebbd6a3848bbe66fc56776a905f9344a34fd9 Mon Sep 17 00:00:00 2001 From: Marco Minerva Date: Tue, 1 Oct 2024 14:37:01 +0200 Subject: [PATCH 4/5] Update PK constraints, API description, and config comment Updated primary key constraint names in Scripts.sql for Documents and DocumentChunks tables. Enhanced document upload API endpoint description in Program.cs to reflect the use of the new native Vector type for embeddings. Added a comment in appsettings.json to clarify the Dimensions setting in the Embedding section. 
--- Scripts.sql | 4 ++-- SqlDatabaseVectorSearch/Program.cs | 2 +- SqlDatabaseVectorSearch/appsettings.json | 2 ++ 3 files changed, 5 insertions(+), 3 deletions(-) diff --git a/Scripts.sql b/Scripts.sql index a2e1e93..f0332cf 100644 --- a/Scripts.sql +++ b/Scripts.sql @@ -2,7 +2,7 @@ CREATE TABLE [dbo].[Documents]( [Id] [uniqueidentifier] NOT NULL, [Name] [nvarchar](255) NOT NULL, [CreationDate] [datetimeoffset](7) NOT NULL, - CONSTRAINT [PK_Documents2] PRIMARY KEY CLUSTERED + CONSTRAINT [PK_Documents] PRIMARY KEY CLUSTERED ( [Id] ASC )) @@ -14,7 +14,7 @@ CREATE TABLE [dbo].[DocumentChunks]( [Index] [int] NOT NULL, [Content] [nvarchar](max) NOT NULL, [Embedding] [vector](1536) NOT NULL, - CONSTRAINT [PK_DocumentChunks2] PRIMARY KEY CLUSTERED + CONSTRAINT [PK_DocumentChunks] PRIMARY KEY CLUSTERED ( [Id] ASC )) diff --git a/SqlDatabaseVectorSearch/Program.cs b/SqlDatabaseVectorSearch/Program.cs index 244729a..f572c42 100644 --- a/SqlDatabaseVectorSearch/Program.cs +++ b/SqlDatabaseVectorSearch/Program.cs @@ -118,7 +118,7 @@ documentsApiGroup.MapPost(string.Empty, async (IFormFile file, VectorSearchServi .WithOpenApi(operation => { operation.Summary = "Uploads a document"; - operation.Description = "Uploads a document to SQL Database and saves its embedding using Vector Support. The document will be indexed and used to answer questions. Currently, only PDF files are supported."; + operation.Description = "Uploads a document to SQL Database and saves its embedding using the new native Vector type. The document will be indexed and used to answer questions. Currently, only PDF files are supported."; operation.Parameter("documentId").Description = "The unique identifier of the document. If not provided, a new one will be generated. 
If you specify an existing documentId, the corresponding document will be overwritten."; diff --git a/SqlDatabaseVectorSearch/appsettings.json b/SqlDatabaseVectorSearch/appsettings.json index 75f72ec..79648fe 100644 --- a/SqlDatabaseVectorSearch/appsettings.json +++ b/SqlDatabaseVectorSearch/appsettings.json @@ -12,6 +12,8 @@ "Endpoint": "", "Deployment": "", "ApiKey": "", + // Set this value only if you're using a model that allows you to specify the dimensions of the embeddings + // (e.g. text-embedding-3-small or text-embedding-3-large). Currently, a maximum value of 1998 is supported. "Dimensions": null } }, From 8c6cc3c969e4f8e0603ec13a83c38110580af25d Mon Sep 17 00:00:00 2001 From: Marco Minerva Date: Tue, 1 Oct 2024 17:35:59 +0200 Subject: [PATCH 5/5] Improve README, add comments, and clean up VectorSearchService Updated README.md for clarity and additional setup instructions: - Refined repository description to highlight native Vector type. - Rephrased note on Vector Support feature for readability. - Removed mention of EFCore.SqlServer.VectorSearch library. - Added instructions for updating VECTOR column size and setting Dimension property. Added comment in Scripts.sql to guide vector size setting in Embedding column. Cleaned up VectorSearchService.cs by removing unused and commented-out SQL command execution code. --- README.md | 16 +++++++------- Scripts.sql | 1 + .../Services/VectorSearchService.cs | 21 ------------------- 3 files changed, 10 insertions(+), 28 deletions(-) diff --git a/README.md b/README.md index 3f8a0a3..1a522e1 100644 --- a/README.md +++ b/README.md @@ -1,16 +1,18 @@ # SQL Database Vector Search Sample -A repository that showcases the Native Vector Support in Azure SQL Database to perform embeddings and RAG with Azure OpenAI. +A repository that showcases the native Vector type in Azure SQL Database to perform embeddings and RAG with Azure OpenAI. 
> [!IMPORTANT] -> Usage of this application requires the Vector Support feature in Azure SQL Database, currently in EAP. [See this blog post](https://devblogs.microsoft.com/azure-sql/announcing-eap-native-vector-support-in-azure-sql-database/) for more details. +> Usage of this application requires the Vector support feature in Azure SQL Database, currently in EAP. [See this blog post](https://devblogs.microsoft.com/azure-sql/announcing-eap-native-vector-support-in-azure-sql-database/) for more details. -The application is a Minimal API that exposes endpoints to load documents, generate embeddings and save them into the database as Vectors, and perform searches using Vector Search and RAG. Currently, only PDF files are supported. Vectors are saved with Entity Framework Core using the [EFCore.SqlServer.VectorSearch](https://github.com/efcore/EfCore.SqlServer.VectorSearch) library. Embedding and Chat Completion are integrated with [Semantic Kernel](https://github.com/microsoft/semantic-kernel). +The application is a Minimal API that exposes endpoints to load documents, generate embeddings and save them into the database as Vectors, and perform searches using Vector Search and RAG. Currently, only PDF files are supported. Embedding and Chat Completion are integrated with [Semantic Kernel](https://github.com/microsoft/semantic-kernel). ![SQL Database Vector Search](https://github.com/marcominerva/SqlDatabaseVectorSearch/blob/master/SqlDatabaseVectorSearch.png) ### Setup -- [Create an Azure SQL Database](https://learn.microsoft.com/en-us/azure/azure-sql/database/single-database-create-quickstart) on a server that has the Vector Support feature enabled. -- Execute the [Scripts.sql](https://github.com/marcominerva/SqlDatabaseVectorSearch/blob/master/Scripts.sql) file to create the tables needed by the application. 
-- Open the [appsettings.json](https://github.com/marcominerva/SqlDatabaseVectorSearch/blob/master/SqlDatabaseVectorSearch/appsettings.json) file and set the connection string to the database and the other settings required by Azure OpenAI. -- Run the application and start importing your PDF documents. +- [Create an Azure SQL Database](https://learn.microsoft.com/en-us/azure/azure-sql/database/single-database-create-quickstart) on a server that has the Vector Support feature enabled +- Execute the [Scripts.sql](https://github.com/marcominerva/SqlDatabaseVectorSearch/blob/master/Scripts.sql) file to create the tables needed by the application + - You may need to update the size of the [`VECTOR`](https://github.com/marcominerva/SqlDatabaseVectorSearch/blob/master/Scripts.sql#L17) column to match the size of the embedding model. Currently, the maximum allowed value is 1998. +- Open the [appsettings.json](https://github.com/marcominerva/SqlDatabaseVectorSearch/blob/master/SqlDatabaseVectorSearch/appsettings.json) file and set the connection string to the database and the other settings required by Azure OpenAI + - If your embedding model supports shortening, like **text-embedding-3-small** and **text-embedding-3-large**, and you want to use this feature, you need to set the [`Dimensions`](https://github.com/marcominerva/SqlDatabaseVectorSearch/blob/master/SqlDatabaseVectorSearch/appsettings.json#L17) property to match the value you have used in the SQL script. If your model doesn't provide this feature, or you want to use the default size, just leave the [`Dimensions`](https://github.com/marcominerva/SqlDatabaseVectorSearch/blob/master/SqlDatabaseVectorSearch/appsettings.json#L17) property set to NULL. Keep in mind that **text-embedding-3-small** has a dimension of 1536, while **text-embedding-3-large** uses vectors with 3072 elements, so with this latter model it is mandatory to specify a value (that, as said, must be less than or equal to 1998). 
+- Run the application and start importing your PDF documents. \ No newline at end of file diff --git a/Scripts.sql b/Scripts.sql index f0332cf..34bdcb5 100644 --- a/Scripts.sql +++ b/Scripts.sql @@ -13,6 +13,7 @@ CREATE TABLE [dbo].[DocumentChunks]( [DocumentId] [uniqueidentifier] NOT NULL, [Index] [int] NOT NULL, [Content] [nvarchar](max) NOT NULL, + -- Set the size of the vector to match the size of your embedding model. [Embedding] [vector](1536) NOT NULL, CONSTRAINT [PK_DocumentChunks] PRIMARY KEY CLUSTERED ( diff --git a/SqlDatabaseVectorSearch/Services/VectorSearchService.cs b/SqlDatabaseVectorSearch/Services/VectorSearchService.cs index f1b57e0..bea3ece 100644 --- a/SqlDatabaseVectorSearch/Services/VectorSearchService.cs +++ b/SqlDatabaseVectorSearch/Services/VectorSearchService.cs @@ -139,27 +139,6 @@ public class VectorSearchService(SqlConnection sqlConnection, ITextEmbeddingGene ORDER BY VECTOR_DISTANCE('cosine', Embedding, CAST(@QuestionEmbedding AS VECTOR({questionEmbedding.Length}))); """, new { appSettings.MaxRelevantChunks, QuestionEmbedding = JsonSerializer.Serialize(questionEmbedding) }); - //await sqlConnection.OpenAsync(); - //await using var command = sqlConnection.CreateCommand(); - - //command.CommandText = $""" - // SELECT TOP (@MaxRelevantChunks) Content - // FROM DocumentChunks - // ORDER BY VECTOR_DISTANCE('cosine', Embedding, CAST(@QuestionEmbedding AS VECTOR({questionEmbedding.Length}))); - // """; - - //command.Parameters.AddWithValue("@MaxRelevantChunks", appSettings.MaxRelevantChunks); - //command.Parameters.AddWithValue("@QuestionEmbedding", JsonSerializer.Serialize(questionEmbedding)); - - //var chunks = new List(); - - //await using var reader = await command.ExecuteReaderAsync(); - //while (await reader.ReadAsync()) - //{ - // var content = reader.GetString(0); - // chunks.Add(content); - //} - var answer = await chatService.AskQuestionAsync(question.ConversationId, chunks, reformulatedQuestion); return new 
Response(reformulatedQuestion, answer); }