Merge pull request #1 from marcominerva/vector-type

Add support for native VECTOR type
This commit is contained in:
Marco Minerva
2024-10-01 17:39:00 +02:00
committed by GitHub
11 changed files with 137 additions and 145 deletions
+9 -7
View File
@@ -1,16 +1,18 @@
# SQL Database Vector Search Sample
A repository that showcases the Native Vector Support in Azure SQL Database to perform embeddings and RAG with Azure OpenAI.
A repository that showcases the native Vector type in Azure SQL Database to perform embeddings and RAG with Azure OpenAI.
> [!IMPORTANT]
> Usage of this application requires the Vector Support feature in Azure SQL Database, currently in EAP. [See this blog post](https://devblogs.microsoft.com/azure-sql/announcing-eap-native-vector-support-in-azure-sql-database/) for more details.
> Usage of this application requires the Vector support feature in Azure SQL Database, currently in EAP. [See this blog post](https://devblogs.microsoft.com/azure-sql/announcing-eap-native-vector-support-in-azure-sql-database/) for more details.
The application is a Minimal API that exposes endpoints to load documents, generate embeddings and save them into the database as Vectors, and perform searches using Vector Search and RAG. Currently, only PDF files are supported. Vectors are saved with Entity Framework Core using the [EFCore.SqlServer.VectorSearch](https://github.com/efcore/EfCore.SqlServer.VectorSearch) library. Embedding and Chat Completion are integrated with [Semantic Kernel](https://github.com/microsoft/semantic-kernel).
The application is a Minimal API that exposes endpoints to load documents, generate embeddings and save them into the database as Vectors, and perform searches using Vector Search and RAG. Currently, only PDF files are supported. Embedding and Chat Completion are integrated with [Semantic Kernel](https://github.com/microsoft/semantic-kernel).
![SQL Database Vector Search](https://github.com/marcominerva/SqlDatabaseVectorSearch/blob/master/SqlDatabaseVectorSearch.png)
### Setup
- [Create an Azure SQL Database](https://learn.microsoft.com/en-us/azure/azure-sql/database/single-database-create-quickstart) on a server that has the Vector Support feature enabled.
- Execute the [Scripts.sql](https://github.com/marcominerva/SqlDatabaseVectorSearch/blob/master/Scripts.sql) file to create the tables needed by the application.
- Open the [appsettings.json](https://github.com/marcominerva/SqlDatabaseVectorSearch/blob/master/SqlDatabaseVectorSearch/appsettings.json) file and set the connection string to the database and the other settings required by Azure OpenAI.
- Run the application and start importing your PDF documents.
- [Create an Azure SQL Database](https://learn.microsoft.com/en-us/azure/azure-sql/database/single-database-create-quickstart) on a server that has the Vector Support feature enabled
- Execute the [Scripts.sql](https://github.com/marcominerva/SqlDatabaseVectorSearch/blob/master/Scripts.sql) file to create the tables needed by the application
- You may need to update the size of the [`VECTOR`](https://github.com/marcominerva/SqlDatabaseVectorSearch/blob/master/Scripts.sql#L17) column to match the size of the embedding model. Currently, the maximum allowed value is 1998.
- Open the [appsettings.json](https://github.com/marcominerva/SqlDatabaseVectorSearch/blob/master/SqlDatabaseVectorSearch/appsettings.json) file and set the connection string to the database and the other settings required by Azure OpenAI
- If your embedding model supports shortening, like **text-embedding-3-small** and **text-embedding-3-large**, and you want to use this feature, you need to set the [`Dimensions`](https://github.com/marcominerva/SqlDatabaseVectorSearch/blob/master/SqlDatabaseVectorSearch/appsettings.json#L17) property to match the value you have used in the SQL script. If your model doesn't provide this feature, or you want to use the default size, just leave the [`Dimensions`](https://github.com/marcominerva/SqlDatabaseVectorSearch/blob/master/SqlDatabaseVectorSearch/appsettings.json#L17) property set to `null`. Keep in mind that **text-embedding-3-small** has a dimension of 1536, while **text-embedding-3-large** uses vectors with 3072 elements, so with the latter model it is mandatory to specify a value (which, as noted above, must be less than or equal to 1998).
- Run the application and start importing your PDF documents.
+19 -12
View File
@@ -1,15 +1,3 @@
-- Chunks of a document: each row holds the chunk position ([Index]), its text ([Content])
-- and its embedding, here stored in a varbinary(8000) column.
CREATE TABLE [dbo].[DocumentChunks](
[Id] [uniqueidentifier] NOT NULL,
[DocumentId] [uniqueidentifier] NOT NULL,
[Index] [int] NOT NULL,
[Content] [nvarchar](max) NOT NULL,
[Embedding] [varbinary](8000) NOT NULL,
CONSTRAINT [PK_DocumentChunks] PRIMARY KEY CLUSTERED
(
[Id] ASC
))
GO
CREATE TABLE [dbo].[Documents](
[Id] [uniqueidentifier] NOT NULL,
[Name] [nvarchar](255) NOT NULL,
@@ -20,7 +8,26 @@ CREATE TABLE [dbo].[Documents](
))
GO
-- Chunks of a document: each row holds the chunk position ([Index]), its text ([Content])
-- and its embedding, stored in a native vector column.
CREATE TABLE [dbo].[DocumentChunks](
[Id] [uniqueidentifier] NOT NULL,
[DocumentId] [uniqueidentifier] NOT NULL,
[Index] [int] NOT NULL,
[Content] [nvarchar](max) NOT NULL,
-- Set the size of the vector to match the number of dimensions produced by your embedding model.
[Embedding] [vector](1536) NOT NULL,
CONSTRAINT [PK_DocumentChunks] PRIMARY KEY CLUSTERED
(
[Id] ASC
))
GO
-- Link every chunk to its document; deleting a document also deletes its chunks (ON DELETE CASCADE).
ALTER TABLE [dbo].[DocumentChunks] WITH CHECK ADD CONSTRAINT [FK_DocumentChunks_Documents] FOREIGN KEY([DocumentId])
REFERENCES [dbo].[Documents] ([Id])
ON DELETE CASCADE
GO
-- Default the [Id] of new documents to newsequentialid() when no value is supplied.
ALTER TABLE [dbo].[Documents] ADD CONSTRAINT [DF_Documents_Id] DEFAULT (newsequentialid()) FOR [Id]
GO
-- Default the [Id] of new chunks to newsequentialid() when no value is supplied.
ALTER TABLE [dbo].[DocumentChunks] ADD CONSTRAINT [DF_DocumentChunks_Id] DEFAULT (newsequentialid()) FOR [Id]
GO
@@ -1,52 +0,0 @@
using EntityFramework.Exceptions.SqlServer;
using Microsoft.EntityFrameworkCore;
using SqlDatabaseVectorSearch.DataAccessLayer.Entities;
namespace SqlDatabaseVectorSearch.DataAccessLayer;
/// <summary>
/// Entity Framework Core context that maps the <c>Documents</c> and <c>DocumentChunks</c> tables.
/// </summary>
public class ApplicationDbContext(DbContextOptions<ApplicationDbContext> options) : DbContext(options)
{
    /// <summary>Set of stored documents.</summary>
    public virtual DbSet<Document> Documents { get; set; }

    /// <summary>Set of stored document chunks.</summary>
    public virtual DbSet<DocumentChunk> DocumentChunks { get; set; }

    /// <summary>
    /// Applies the base configuration and then calls <c>UseExceptionProcessor</c> on the options builder.
    /// </summary>
    protected override void OnConfiguring(DbContextOptionsBuilder optionsBuilder)
    {
        base.OnConfiguring(optionsBuilder);

        optionsBuilder.UseExceptionProcessor();

        // Sensitive data logging is left disabled:
        //optionsBuilder.EnableSensitiveDataLogging();
    }

    /// <summary>
    /// Configures table names, keys, column constraints and the document/chunk relationship.
    /// </summary>
    protected override void OnModelCreating(ModelBuilder modelBuilder)
    {
        // Documents: generated key and a required name of at most 255 characters.
        var documents = modelBuilder.Entity<Document>();
        documents.ToTable("Documents");
        documents.HasKey(document => document.Id);
        documents.Property(document => document.Id).ValueGeneratedOnAdd();
        documents.Property(document => document.Name).IsRequired().HasMaxLength(255);

        // Document chunks: generated key, required content and a required vector embedding.
        var chunks = modelBuilder.Entity<DocumentChunk>();
        chunks.ToTable("DocumentChunks");
        chunks.HasKey(chunk => chunk.Id);
        chunks.Property(chunk => chunk.Id).ValueGeneratedOnAdd();
        chunks.Property(chunk => chunk.Content).IsRequired();
        chunks.Property(chunk => chunk.Embedding).IsRequired().HasMaxLength(8000).IsVector();

        // Each chunk belongs to one document; deleting the document cascades to its chunks.
        chunks.HasOne(chunk => chunk.Document)
            .WithMany(document => document.Chunks)
            .HasForeignKey(chunk => chunk.DocumentId)
            .OnDelete(DeleteBehavior.Cascade)
            .HasConstraintName("FK_DocumentChunks_Documents");
    }
}
@@ -1,12 +0,0 @@
namespace SqlDatabaseVectorSearch.DataAccessLayer.Entities;
/// <summary>
/// Entity describing a document: its identifier, name, creation date and the chunks that belong to it.
/// </summary>
public class Document
{
/// <summary>Unique identifier of the document.</summary>
public Guid Id { get; set; }
/// <summary>Name of the document; must be set when the object is created.</summary>
public required string Name { get; set; }
/// <summary>Date and time associated with the creation of the document.</summary>
public DateTimeOffset CreationDate { get; set; }
/// <summary>Chunks of this document; starts as an empty collection.</summary>
public virtual ICollection<DocumentChunk> Chunks { get; set; } = [];
}
@@ -1,16 +0,0 @@
namespace SqlDatabaseVectorSearch.DataAccessLayer.Entities;
/// <summary>
/// Entity describing one chunk of a document: its position, text content and embedding.
/// </summary>
public class DocumentChunk
{
/// <summary>Unique identifier of the chunk.</summary>
public Guid Id { get; set; }
/// <summary>Identifier of the document this chunk belongs to.</summary>
public Guid DocumentId { get; set; }
/// <summary>Position of the chunk inside its document.</summary>
public int Index { get; set; }
/// <summary>Text of the chunk; must be set when the object is created.</summary>
public required string Content { get; set; }
/// <summary>Embedding of the chunk as an array of floats; must be set when the object is created.</summary>
public required float[] Embedding { get; set; }
/// <summary>Navigation property to the owning document (initialised with <c>null!</c>, so it is not set until assigned).</summary>
public virtual Document Document { get; set; } = null!;
}
@@ -1,3 +1,14 @@
namespace SqlDatabaseVectorSearch.Models;
using System.Text.Json;
public record class DocumentChunk(Guid Id, int Index, string Content, float[]? Embedding = null);
namespace SqlDatabaseVectorSearch.Models;
/// <summary>
/// Model of a document chunk: identifier, position in the document, text content and optional embedding.
/// </summary>
/// <param name="Id">Unique identifier of the chunk.</param>
/// <param name="Index">Position of the chunk inside its document.</param>
/// <param name="Content">Text of the chunk.</param>
/// <param name="Embedding">Embedding values, or <c>null</c> when the embedding is not available.</param>
public record class DocumentChunk(Guid Id, int Index, string Content, float[]? Embedding)
{
    /// <summary>
    /// Creates a chunk without an embedding.
    /// </summary>
    public DocumentChunk(Guid Id, int Index, string Content)
        // The cast selects the float[] overload; a bare null would be ambiguous with the string overload.
        : this(Id, Index, Content, (float[]?)null)
    {
    }

    /// <summary>
    /// Creates a chunk whose embedding is supplied as a JSON array of numbers (e.g. <c>"[0.1, 0.2]"</c>).
    /// </summary>
    public DocumentChunk(Guid Id, int Index, string Content, string Embedding)
        : this(Id, Index, Content, JsonSerializer.Deserialize<float[]?>(Embedding))
    {
    }
}
+6 -8
View File
@@ -1,9 +1,9 @@
using Microsoft.AspNetCore.Http.HttpResults;
using Microsoft.Data.SqlClient;
using Microsoft.EntityFrameworkCore;
using Microsoft.OpenApi.Models;
using Microsoft.SemanticKernel;
using MinimalHelpers.OpenApi;
using SqlDatabaseVectorSearch.DataAccessLayer;
using SqlDatabaseVectorSearch.Models;
using SqlDatabaseVectorSearch.Services;
using SqlDatabaseVectorSearch.Settings;
@@ -19,12 +19,10 @@ var appSettings = builder.Services.ConfigureAndGet<AppSettings>(builder.Configur
builder.Services.AddSingleton(TimeProvider.System);
builder.Services.AddSqlServer<ApplicationDbContext>(builder.Configuration.GetConnectionString("SqlConnection"), options =>
builder.Services.AddScoped(_ =>
{
options.UseVectorSearch();
}, options =>
{
options.UseQueryTrackingBehavior(QueryTrackingBehavior.NoTracking);
var sqlConnection = new SqlConnection(builder.Configuration.GetConnectionString("SqlConnection"));
return sqlConnection;
});
builder.Services.AddMemoryCache();
@@ -32,7 +30,7 @@ builder.Services.AddMemoryCache();
// Semantic Kernel is used to generate embeddings and to reformulate questions taking into account all the previous interactions,
// so that embeddings themselves can be generated more accurately.
builder.Services.AddKernel()
.AddAzureOpenAITextEmbeddingGeneration(aiSettings.Embedding.Deployment, aiSettings.Embedding.Endpoint, aiSettings.Embedding.ApiKey)
.AddAzureOpenAITextEmbeddingGeneration(aiSettings.Embedding.Deployment, aiSettings.Embedding.Endpoint, aiSettings.Embedding.ApiKey, dimensions: aiSettings.Embedding.Dimensions)
.AddAzureOpenAIChatCompletion(aiSettings.ChatCompletion.Deployment, aiSettings.ChatCompletion.Endpoint, aiSettings.ChatCompletion.ApiKey);
builder.Services.AddScoped<ChatService>();
@@ -120,7 +118,7 @@ documentsApiGroup.MapPost(string.Empty, async (IFormFile file, VectorSearchServi
.WithOpenApi(operation =>
{
operation.Summary = "Uploads a document";
operation.Description = "Uploads a document to SQL Database and saves its embedding using Vector Support. The document will be indexed and used to answer questions. Currently, only PDF files are supported.";
operation.Description = "Uploads a document to SQL Database and saves its embedding using the new native Vector type. The document will be indexed and used to answer questions. Currently, only PDF files are supported.";
operation.Parameter("documentId").Description = "The unique identifier of the document. If not provided, a new one will be generated. If you specify an existing documentId, the corresponding document will be overwritten.";
@@ -1,18 +1,21 @@
using System.Text;
using Microsoft.EntityFrameworkCore;
using System.Data;
using System.Data.Common;
using System.Text;
using System.Text.Json;
using Dapper;
using Microsoft.Data.SqlClient;
using Microsoft.Extensions.Options;
using Microsoft.SemanticKernel.Embeddings;
using Microsoft.SemanticKernel.Text;
using SqlDatabaseVectorSearch.DataAccessLayer;
using SqlDatabaseVectorSearch.Models;
using SqlDatabaseVectorSearch.Settings;
using TinyHelpers.Extensions;
using UglyToad.PdfPig;
using UglyToad.PdfPig.DocumentLayoutAnalysis.TextExtractor;
using Entities = SqlDatabaseVectorSearch.DataAccessLayer.Entities;
namespace SqlDatabaseVectorSearch.Services;
public class VectorSearchService(ApplicationDbContext dbContext, ITextEmbeddingGenerationService textEmbeddingGenerationService, ChatService chatService, TimeProvider timeProvider, IOptions<AppSettings> appSettingsOptions)
public class VectorSearchService(SqlConnection sqlConnection, ITextEmbeddingGenerationService textEmbeddingGenerationService, ChatService chatService, TimeProvider timeProvider, IOptions<AppSettings> appSettingsOptions)
{
private readonly AppSettings appSettings = appSettingsOptions.Value;
@@ -21,63 +24,106 @@ public class VectorSearchService(ApplicationDbContext dbContext, ITextEmbeddingG
// Extract the contents of the file (currently, only PDF files are supported).
var content = await GetContentAsync(stream);
await dbContext.Database.BeginTransactionAsync();
await sqlConnection.OpenAsync();
await using var transaction = await sqlConnection.BeginTransactionAsync();
if (documentId.HasValue)
{
// If the user is importing a document that already exists, delete the previous one.
await DeleteDocumentAsync(documentId.Value);
await DeleteDocumentAsync(documentId.Value, transaction);
}
var document = new Entities.Document { Id = documentId.GetValueOrDefault(), Name = name, CreationDate = timeProvider.GetUtcNow() };
dbContext.Documents.Add(document);
await using var command = sqlConnection.CreateCommand();
command.Transaction = (SqlTransaction)transaction;
command.CommandText = """
INSERT INTO Documents (Id, [Name], CreationDate)
OUTPUT INSERTED.Id
VALUES (@Id, @Name, @CreationDate);
""";
command.Parameters.AddWithValue("@Id", documentId.GetValueOrDefault(Guid.NewGuid()));
command.Parameters.AddWithValue("@Name", name);
command.Parameters.AddWithValue("@CreationDate", timeProvider.GetUtcNow());
var insertedId = await command.ExecuteScalarAsync();
documentId = (Guid)insertedId!;
// Split the content into chunks and generate the embeddings for each one.
var paragraphs = TextChunker.SplitPlainTextParagraphs(TextChunker.SplitPlainTextLines(content, appSettings.MaxTokensPerLine), appSettings.MaxTokensPerParagraph, appSettings.OverlapTokens);
var embeddings = await textEmbeddingGenerationService.GenerateEmbeddingsAsync(paragraphs);
var index = 0;
foreach (var (paragraph, embedding) in paragraphs.Zip(embeddings, (p, e) => (p, e.ToArray())))
foreach (var (paragraph, index) in paragraphs.WithIndex())
{
var documentChunk = new Entities.DocumentChunk { Document = document, Index = index++, Content = paragraph, Embedding = embedding };
dbContext.DocumentChunks.Add(documentChunk);
command.Parameters.Clear();
command.CommandText = $"""
INSERT INTO DocumentChunks (DocumentId, [Index], Content, Embedding)
VALUES (@DocumentId, @Index, @Content, CAST(@Embedding AS VECTOR({embeddings[index].Length})));
""";
command.Parameters.AddWithValue("@DocumentId", documentId);
command.Parameters.AddWithValue("@Index", index);
command.Parameters.AddWithValue("@Content", paragraph);
command.Parameters.AddWithValue("@Embedding", JsonSerializer.Serialize(embeddings[index]));
await command.ExecuteNonQueryAsync();
}
await dbContext.SaveChangesAsync();
await dbContext.Database.CommitTransactionAsync();
await transaction.CommitAsync();
return document.Id;
return documentId.Value;
}
/// <summary>
/// Gets all the documents, ordered by name, each one with the number of its chunks.
/// </summary>
/// <returns>The list of documents.</returns>
public async Task<IEnumerable<Document>> GetDocumentsAsync()
{
var documents = await dbContext.Documents.OrderBy(d => d.Name)
.Select(d => new Document(d.Id, d.Name, d.CreationDate, d.Chunks.Count))
.ToListAsync();
var documents = await sqlConnection.QueryAsync<Document>("""
SELECT Id, [Name], CreationDate, ChunkCount = (SELECT COUNT(*) FROM DocumentChunks WHERE DocumentId = Documents.Id)
FROM Documents
ORDER BY [Name];
""");
return documents;
}
/// <summary>
/// Gets the chunks of the given document, ordered by their index. Only the identifier, index and content
/// of each chunk are read; the embedding is not retrieved.
/// </summary>
/// <param name="documentId">The identifier of the document whose chunks are requested.</param>
/// <returns>The chunks of the document.</returns>
public async Task<IEnumerable<DocumentChunk>> GetDocumentChunksAsync(Guid documentId)
{
var documentChunks = await dbContext.DocumentChunks.Where(c => c.DocumentId == documentId).OrderBy(c => c.Index)
.Select(c => new DocumentChunk(c.Id, c.Index, c.Content, null))
.ToListAsync();
var documentChunks = await sqlConnection.QueryAsync<DocumentChunk>("""
SELECT Id, [Index], Content
FROM DocumentChunks
WHERE DocumentId = @DocumentId
ORDER BY [Index];
""", new { documentId });
return documentChunks;
}
/// <summary>
/// Gets a single chunk, including its embedding, identified by both the chunk and the document identifiers.
/// </summary>
/// <param name="documentId">The identifier of the document that owns the chunk.</param>
/// <param name="documentChunkId">The identifier of the chunk.</param>
/// <returns>The chunk with its embedding, or <c>null</c> when no chunk matches both identifiers.</returns>
public async Task<DocumentChunk?> GetDocumentChunkEmbeddingAsync(Guid documentId, Guid documentChunkId)
{
var documentChunk = await dbContext.DocumentChunks.Where(c => c.Id == documentChunkId && c.DocumentId == documentId)
.Select(c => new DocumentChunk(c.Id, c.Index, c.Content, c.Embedding))
.FirstOrDefaultAsync();
// The vector is read back as text (CAST to NVARCHAR(MAX)), so the Embedding column arrives as a string.
var documentChunk = await sqlConnection.QueryFirstOrDefaultAsync<DocumentChunk>("""
SELECT Id, [Index], Content, CAST(Embedding AS NVARCHAR(MAX)) AS Embedding
FROM DocumentChunks
WHERE Id = @DocumentChunkId AND DocumentId = @DocumentId;
""", new { documentId, documentChunkId });
return documentChunk;
}
public Task DeleteDocumentAsync(Guid documentId)
=> dbContext.Documents.Where(d => d.Id == documentId).ExecuteDeleteAsync();
/// <summary>
/// Deletes the document with the given identifier from the <c>Documents</c> table.
/// </summary>
/// <param name="documentId">The identifier of the document to delete.</param>
/// <param name="transaction">
/// The transaction the delete must take part in, or <c>null</c> to run it without an explicit transaction.
/// When supplied, it must be a <see cref="SqlTransaction"/>.
/// </param>
/// <exception cref="InvalidCastException">
/// <paramref name="transaction"/> is not <c>null</c> and is not a <see cref="SqlTransaction"/>.
/// </exception>
public async Task DeleteDocumentAsync(Guid documentId, DbTransaction? transaction = null)
{
    // Open the connection only when it is closed: a caller that already started a transaction
    // passes in a connection that is open.
    if (sqlConnection.State == ConnectionState.Closed)
    {
        await sqlConnection.OpenAsync();
    }

    await using var command = sqlConnection.CreateCommand();

    // Direct cast instead of 'as': a transaction of the wrong type must fail loudly rather than
    // silently become null, which would run the DELETE without the caller's transaction.
    command.Transaction = (SqlTransaction?)transaction;
    command.CommandText = "DELETE FROM Documents WHERE Id = @DocumentId";
    command.Parameters.AddWithValue("@DocumentId", documentId);

    await command.ExecuteNonQueryAsync();
}
public async Task<Response> AskQuestionAsync(Question question, bool reformulate = true)
{
@@ -87,11 +133,11 @@ public class VectorSearchService(ApplicationDbContext dbContext, ITextEmbeddingG
// Perform Vector Search on SQL Database.
var questionEmbedding = await textEmbeddingGenerationService.GenerateEmbeddingAsync(reformulatedQuestion);
var chunks = await dbContext.DocumentChunks
.OrderBy(c => EF.Functions.VectorDistance("cosine", c.Embedding, questionEmbedding.ToArray()))
.Select(c => c.Content)
.Take(appSettings.MaxRelevantChunks)
.ToListAsync();
var chunks = await sqlConnection.QueryAsync<string>($"""
SELECT TOP (@MaxRelevantChunks) Content
FROM DocumentChunks
ORDER BY VECTOR_DISTANCE('cosine', Embedding, CAST(@QuestionEmbedding AS VECTOR({questionEmbedding.Length})));
""", new { appSettings.MaxRelevantChunks, QuestionEmbedding = JsonSerializer.Serialize(questionEmbedding) });
var answer = await chatService.AskQuestionAsync(question.ConversationId, chunks, reformulatedQuestion);
return new Response(reformulatedQuestion, answer);
@@ -4,7 +4,7 @@ public class AzureOpenAISettings
{
public required ServiceSettings ChatCompletion { get; init; }
public required ServiceSettings Embedding { get; init; }
public required EmbeddingServiceSettings Embedding { get; init; }
}
public class ServiceSettings
@@ -15,3 +15,8 @@ public class ServiceSettings
public required string ApiKey { get; init; }
}
/// <summary>
/// Settings of the embedding service: everything in <see cref="ServiceSettings"/> plus an optional embedding size.
/// </summary>
public class EmbeddingServiceSettings : ServiceSettings
{
/// <summary>
/// Optional number of dimensions requested for the generated embeddings; <c>null</c> when no value is configured.
/// </summary>
public int? Dimensions { get; set; }
}
@@ -8,10 +8,10 @@
</PropertyGroup>
<ItemGroup>
<PackageReference Include="EFCore.SqlServer.VectorSearch" Version="0.1.1" />
<PackageReference Include="Dapper" Version="2.1.35" />
<PackageReference Include="EntityFrameworkCore.Exceptions.SqlServer" Version="8.1.3" />
<PackageReference Include="Microsoft.AspNetCore.OpenApi" Version="8.0.8" />
<PackageReference Include="Microsoft.EntityFrameworkCore.SqlServer" Version="8.0.8" />
<PackageReference Include="Microsoft.Data.SqlClient" Version="5.2.2" />
<PackageReference Include="Microsoft.SemanticKernel" Version="1.21.1" />
<PackageReference Include="MinimalHelpers.OpenApi" Version="2.0.15" />
<PackageReference Include="PdfPig" Version="0.1.8" />
+4 -1
View File
@@ -11,7 +11,10 @@
"Embedding": {
"Endpoint": "",
"Deployment": "",
"ApiKey": ""
"ApiKey": "",
// Set this value only if you're using a model that allows specifying the dimensions of the embeddings
// (e.g. text-embedding-3-small or text-embedding-3-large). Currently, a maximum value of 1998 is supported.
"Dimensions": null
}
},
"AppSettings": {