mirror of
https://github.com/marcominerva/SqlDatabaseVectorSearch.git
synced 2026-06-20 12:23:10 +00:00
Add support for DOCX and TXT files, update error handling
Updated README.md to reflect support for PDF, DOCX, and TXT files. Removed commented-out code in DocxContentDecoder.cs. Added TextContentDecoder service in Program.cs and updated exception handling middleware. Updated document upload endpoint description in Program.cs. Modified VectorSearchService to throw NotSupportedException for unsupported content types. Added TextContentDecoder class in TextContentDecoder.cs.
This commit is contained in:
@@ -1,7 +1,7 @@
|
|||||||
# SQL Database Vector Search Sample
|
# SQL Database Vector Search Sample
|
||||||
A repository that showcases the native VECTOR type in Azure SQL Database to perform embeddings and RAG with Azure OpenAI.
|
A repository that showcases the native VECTOR type in Azure SQL Database to perform embeddings and RAG with Azure OpenAI.
|
||||||
|
|
||||||
The application is a Minimal API that exposes endpoints to load documents, generate embeddings and save them into the database as Vectors, and perform searches using Vector Search and RAG. Currently, only PDF files are supported. Vectors are saved and retrieved with Entity Framework Core using the [EFCore.SqlServer.VectorSearch](https://github.com/efcore/EfCore.SqlServer.VectorSearch) library. Embedding and Chat Completion are integrated with [Semantic Kernel](https://github.com/microsoft/semantic-kernel).
|
The application is a Minimal API that exposes endpoints to load documents, generate embeddings and save them into the database as Vectors, and perform searches using Vector Search and RAG. Currently, PDF, DOCX and TXT files are supported. Vectors are saved and retrieved with Entity Framework Core using the [EFCore.SqlServer.VectorSearch](https://github.com/efcore/EfCore.SqlServer.VectorSearch) library. Embedding and Chat Completion are integrated with [Semantic Kernel](https://github.com/microsoft/semantic-kernel).
|
||||||
|
|
||||||
> [!NOTE]
|
> [!NOTE]
|
||||||
> If you prefer to use straight SQL, check out the [sql branch](https://github.com/marcominerva/SqlDatabaseVectorSearch/tree/sql).
|
> If you prefer to use straight SQL, check out the [sql branch](https://github.com/marcominerva/SqlDatabaseVectorSearch/tree/sql).
|
||||||
@@ -15,4 +15,4 @@ The application is a Minimal API that exposes endpoints to load documents, gener
|
|||||||
- You may need to update the size of the [`VECTOR`](https://github.com/marcominerva/SqlDatabaseVectorSearch/blob/master/Scripts.sql#L17) column to match the size of the embedding model. Currently, the maximum allowed value is 1998.
|
- You may need to update the size of the [`VECTOR`](https://github.com/marcominerva/SqlDatabaseVectorSearch/blob/master/Scripts.sql#L17) column to match the size of the embedding model. Currently, the maximum allowed value is 1998.
|
||||||
- Open the [appsettings.json](https://github.com/marcominerva/SqlDatabaseVectorSearch/blob/master/SqlDatabaseVectorSearch/appsettings.json) file and set the connection string to the database and the other settings required by Azure OpenAI
|
- Open the [appsettings.json](https://github.com/marcominerva/SqlDatabaseVectorSearch/blob/master/SqlDatabaseVectorSearch/appsettings.json) file and set the connection string to the database and the other settings required by Azure OpenAI
|
||||||
- If your embedding model supports shortening, like **text-embedding-3-small** and **text-embedding-3-large**, and you want to use this feature, you need to set the [`Dimensions`](https://github.com/marcominerva/SqlDatabaseVectorSearch/blob/master/SqlDatabaseVectorSearch/appsettings.json#L17) property to match the value you have used in the SQL script. If your model doesn't provide this feature, or if you want to use the default size, just leave the [`Dimensions`](https://github.com/marcominerva/SqlDatabaseVectorSearch/blob/master/SqlDatabaseVectorSearch/appsettings.json#L17) property set to NULL. Keep in mind that **text-embedding-3-small** has a dimension of 1536, while **text-embedding-3-large** uses vectors with 3072 elements, so with this latter model it is mandatory to specify a value (which, as noted above, must be less than or equal to 1998).
|
- If your embedding model supports shortening, like **text-embedding-3-small** and **text-embedding-3-large**, and you want to use this feature, you need to set the [`Dimensions`](https://github.com/marcominerva/SqlDatabaseVectorSearch/blob/master/SqlDatabaseVectorSearch/appsettings.json#L17) property to match the value you have used in the SQL script. If your model doesn't provide this feature, or if you want to use the default size, just leave the [`Dimensions`](https://github.com/marcominerva/SqlDatabaseVectorSearch/blob/master/SqlDatabaseVectorSearch/appsettings.json#L17) property set to NULL. Keep in mind that **text-embedding-3-small** has a dimension of 1536, while **text-embedding-3-large** uses vectors with 3072 elements, so with this latter model it is mandatory to specify a value (which, as noted above, must be less than or equal to 1998).
|
||||||
- Run the application and start importing your PDF documents.
|
- Run the application and start importing your documents.
|
||||||
|
|||||||
@@ -21,41 +21,5 @@ public class DocxContentDecoder : IContentDecoder
|
|||||||
}
|
}
|
||||||
|
|
||||||
return Task.FromResult(content.ToString());
|
return Task.FromResult(content.ToString());
|
||||||
|
|
||||||
//foreach (var paragraph in body!.Elements<Paragraph>())
|
|
||||||
//{
|
|
||||||
// foreach (var element in paragraph.Elements())
|
|
||||||
// {
|
|
||||||
// if (element is Run run)
|
|
||||||
// {
|
|
||||||
// DecodeTextFromRun(run);
|
|
||||||
// }
|
|
||||||
// else if (element is Hyperlink hyperlink)
|
|
||||||
// {
|
|
||||||
// foreach (var hyperlinkRun in hyperlink.Elements<Run>())
|
|
||||||
// {
|
|
||||||
// DecodeTextFromRun(hyperlinkRun);
|
|
||||||
// }
|
|
||||||
|
|
||||||
// //var hyperlinkUri = doc.MainDocumentPart.HyperlinkRelationships.FirstOrDefault(r => r.Id == hyperlink.Id)?.Uri;
|
|
||||||
// //if (hyperlinkUri is not null)
|
|
||||||
// //{
|
|
||||||
// // content.Append($" ({hyperlinkUri})");
|
|
||||||
// //}
|
|
||||||
// }
|
|
||||||
// }
|
|
||||||
|
|
||||||
// content.AppendLine(); // Preserve whitespace and blank lines.
|
|
||||||
//}
|
|
||||||
|
|
||||||
//return Task.FromResult(content.ToString());
|
|
||||||
|
|
||||||
//void DecodeTextFromRun(Run run)
|
|
||||||
//{
|
|
||||||
// foreach (var text in run.Elements<Text>())
|
|
||||||
// {
|
|
||||||
// content.Append(text.Text);
|
|
||||||
// }
|
|
||||||
//}
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -0,0 +1,12 @@
|
|||||||
|
namespace SqlDatabaseVectorSearch.ContentDecoders;
|
||||||
|
|
||||||
|
/// <summary>
/// Content decoder that returns the text of a stream as a single string.
/// </summary>
public class TextContentDecoder : IContentDecoder
{
    /// <summary>
    /// Reads <paramref name="stream"/> from its current position to the end and returns the text.
    /// </summary>
    /// <param name="stream">
    /// The stream to read. It is left open: the caller created it and remains responsible for disposing it.
    /// </param>
    /// <param name="contentType">The media type of the document. Not used by this decoder.</param>
    /// <returns>All the text read from the stream.</returns>
    public async Task<string> DecodeAsync(Stream stream, string contentType)
    {
        // By default, disposing a StreamReader also closes the underlying stream.
        // This decoder does not own the stream, so only the reader is released here.
        // Encoding handling is unchanged: UTF-8 unless a byte-order mark says otherwise.
        using var reader = new StreamReader(stream, leaveOpen: true);

        return await reader.ReadToEndAsync();
    }
}
|
||||||
@@ -54,6 +54,7 @@ builder.Services.AddScoped<VectorSearchService>();
|
|||||||
|
|
||||||
builder.Services.AddKeyedSingleton<IContentDecoder, PdfContentDecoder>(MediaTypeNames.Application.Pdf);
|
builder.Services.AddKeyedSingleton<IContentDecoder, PdfContentDecoder>(MediaTypeNames.Application.Pdf);
|
||||||
builder.Services.AddKeyedSingleton<IContentDecoder, DocxContentDecoder>("application/vnd.openxmlformats-officedocument.wordprocessingml.document");
|
builder.Services.AddKeyedSingleton<IContentDecoder, DocxContentDecoder>("application/vnd.openxmlformats-officedocument.wordprocessingml.document");
|
||||||
|
builder.Services.AddKeyedSingleton<IContentDecoder, TextContentDecoder>(MediaTypeNames.Text.Plain);
|
||||||
|
|
||||||
builder.Services.ConfigureHttpJsonOptions(options =>
|
builder.Services.ConfigureHttpJsonOptions(options =>
|
||||||
{
|
{
|
||||||
@@ -74,7 +75,15 @@ var app = builder.Build();
|
|||||||
// Configure the HTTP request pipeline.
|
// Configure the HTTP request pipeline.
|
||||||
app.UseHttpsRedirection();
|
app.UseHttpsRedirection();
|
||||||
|
|
||||||
app.UseExceptionHandler();
|
app.UseExceptionHandler(new ExceptionHandlerOptions
|
||||||
|
{
|
||||||
|
StatusCodeSelector = exception => exception switch
|
||||||
|
{
|
||||||
|
NotSupportedException => StatusCodes.Status501NotImplemented,
|
||||||
|
_ => StatusCodes.Status500InternalServerError
|
||||||
|
}
|
||||||
|
});
|
||||||
|
|
||||||
app.UseStatusCodePages();
|
app.UseStatusCodePages();
|
||||||
|
|
||||||
app.MapOpenApi();
|
app.MapOpenApi();
|
||||||
@@ -125,7 +134,7 @@ documentsApiGroup.MapPost(string.Empty, async (IFormFile file, VectorSearchServi
|
|||||||
.DisableAntiforgery()
|
.DisableAntiforgery()
|
||||||
.ProducesProblem(StatusCodes.Status400BadRequest)
|
.ProducesProblem(StatusCodes.Status400BadRequest)
|
||||||
.WithSummary("Uploads a document")
|
.WithSummary("Uploads a document")
|
||||||
.WithDescription("Uploads a document to SQL Database and saves its embedding using the native VECTOR type. The document will be indexed and used to answer questions. Currently, only PDF files are supported.");
|
.WithDescription("Uploads a document to SQL Database and saves its embedding using the native VECTOR type. The document will be indexed and used to answer questions. Currently, PDF, DOCX and TXT files are supported.");
|
||||||
|
|
||||||
documentsApiGroup.MapDelete("{documentId:guid}", async (Guid documentId, VectorSearchService vectorSearchService) =>
|
documentsApiGroup.MapDelete("{documentId:guid}", async (Guid documentId, VectorSearchService vectorSearchService) =>
|
||||||
{
|
{
|
||||||
|
|||||||
@@ -18,7 +18,7 @@ public class VectorSearchService(IServiceProvider serviceProvider, ApplicationDb
|
|||||||
public async Task<Guid> ImportAsync(Stream stream, string name, string contentType, Guid? documentId)
|
public async Task<Guid> ImportAsync(Stream stream, string name, string contentType, Guid? documentId)
|
||||||
{
|
{
|
||||||
// Extract the contents of the file.
|
// Extract the contents of the file.
|
||||||
var decoder = serviceProvider.GetRequiredKeyedService<IContentDecoder>(contentType);
|
var decoder = serviceProvider.GetKeyedService<IContentDecoder>(contentType) ?? throw new NotSupportedException($"Content type '{contentType}' is not supported.");
|
||||||
var content = await decoder.DecodeAsync(stream, contentType);
|
var content = await decoder.DecodeAsync(stream, contentType);
|
||||||
|
|
||||||
await dbContext.Database.BeginTransactionAsync();
|
await dbContext.Database.BeginTransactionAsync();
|
||||||
|
|||||||
Reference in New Issue
Block a user