mirror of
https://github.com/marcominerva/SqlDatabaseVectorSearch.git
synced 2026-06-20 12:23:10 +00:00
Enhance content decoders and update dependencies
- Modified `DocxContentDecoder` to use `IServiceProvider` for text chunking and improved paragraph processing with page break handling.
- Updated `PdfContentDecoder` and `TextContentDecoder` to trim whitespace from text before splitting into paragraphs.
- Reordered service registrations in `Program.cs` while retaining existing functionality.
- Updated `SqlDatabaseVectorSearch.csproj` with new package versions for several dependencies, including `Microsoft.AspNetCore.OpenApi` and `Microsoft.EntityFrameworkCore`.
This commit is contained in:
@@ -1,25 +1,51 @@
|
|||||||
using System.Text;
|
using System.Text;
|
||||||
using DocumentFormat.OpenXml.Packaging;
|
using DocumentFormat.OpenXml.Packaging;
|
||||||
using DocumentFormat.OpenXml.Wordprocessing;
|
using DocumentFormat.OpenXml.Wordprocessing;
|
||||||
|
using SqlDatabaseVectorSearch.TextChunkers;
|
||||||
|
|
||||||
namespace SqlDatabaseVectorSearch.ContentDecoders;
|
namespace SqlDatabaseVectorSearch.ContentDecoders;
|
||||||
|
|
||||||
public class DocxContentDecoder : IContentDecoder
|
public class DocxContentDecoder(IServiceProvider serviceProvider) : IContentDecoder
|
||||||
{
|
{
|
||||||
public Task<IEnumerable<Chunk>> DecodeAsync(Stream stream, string contentType, CancellationToken cancellationToken = default)
|
public Task<IEnumerable<Chunk>> DecodeAsync(Stream stream, string contentType, CancellationToken cancellationToken = default)
|
||||||
{
|
{
|
||||||
// Open a Word document for read-only access.
|
var textChunker = serviceProvider.GetRequiredKeyedService<ITextChunker>(contentType);
|
||||||
|
|
||||||
using var document = WordprocessingDocument.Open(stream, false);
|
using var document = WordprocessingDocument.Open(stream, false);
|
||||||
|
|
||||||
var body = document.MainDocumentPart?.Document.Body;
|
var body = document.MainDocumentPart?.Document.Body;
|
||||||
var content = new StringBuilder();
|
if (body is null)
|
||||||
|
|
||||||
var paragraphs = body?.Descendants<Paragraph>() ?? [];
|
|
||||||
foreach (var p in paragraphs)
|
|
||||||
{
|
{
|
||||||
content.AppendLine(p.InnerText);
|
return Task.FromResult(Enumerable.Empty<Chunk>());
|
||||||
}
|
}
|
||||||
|
|
||||||
return Task.FromResult(new List<Chunk>([new(1, 0, content.ToString())]).AsEnumerable());
|
var pages = new List<string>();
|
||||||
|
var pageBuilder = new StringBuilder();
|
||||||
|
|
||||||
|
foreach (var paragraph in body.Descendants<Paragraph>())
|
||||||
|
{
|
||||||
|
// Note: this is just an attempt at counting pages, not 100% reliable
|
||||||
|
// see https://stackoverflow.com/questions/39992870/how-to-access-openxml-content-by-page-number
|
||||||
|
var lastRenderedPageBreak = paragraph.GetFirstChild<Run>()?.GetFirstChild<LastRenderedPageBreak>();
|
||||||
|
if (lastRenderedPageBreak is not null)
|
||||||
|
{
|
||||||
|
// Note: no trimming, use original spacing when working with pages
|
||||||
|
pages.Add(pageBuilder.ToString());
|
||||||
|
pageBuilder.Clear();
|
||||||
|
}
|
||||||
|
|
||||||
|
pageBuilder.AppendLine(paragraph.InnerText);
|
||||||
|
}
|
||||||
|
|
||||||
|
// After processing all paragraphs, add the last page (even if it is empty)
|
||||||
|
pages.Add(pageBuilder.ToString());
|
||||||
|
|
||||||
|
var chunks = new List<Chunk>();
|
||||||
|
foreach (var (pageIndex, pageText) in pages.Index())
|
||||||
|
{
|
||||||
|
var paragraphs = textChunker.Split(pageText.Trim());
|
||||||
|
chunks.AddRange(paragraphs.Where(p => !string.IsNullOrWhiteSpace(p)).Select((text, index) => new Chunk(pageIndex + 1, index, text)));
|
||||||
|
}
|
||||||
|
|
||||||
|
return Task.FromResult(chunks.AsEnumerable());
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -26,7 +26,7 @@ public class PdfContentDecoder(IServiceProvider serviceProvider) : IContentDecod
|
|||||||
var textBlocks = DocstrumBoundingBoxes.Instance.GetBlocks(words);
|
var textBlocks = DocstrumBoundingBoxes.Instance.GetBlocks(words);
|
||||||
var pageText = string.Join($"{Environment.NewLine}{Environment.NewLine}", textBlocks.Select(t => t.Text.ReplaceLineEndings(" ")));
|
var pageText = string.Join($"{Environment.NewLine}{Environment.NewLine}", textBlocks.Select(t => t.Text.ReplaceLineEndings(" ")));
|
||||||
|
|
||||||
var paragraphs = textChunker.Split(pageText);
|
var paragraphs = textChunker.Split(pageText.Trim());
|
||||||
|
|
||||||
return paragraphs.Where(p => !string.IsNullOrWhiteSpace(p)).Select((text, index) => new Chunk(pdfPage.Number, index, text));
|
return paragraphs.Where(p => !string.IsNullOrWhiteSpace(p)).Select((text, index) => new Chunk(pdfPage.Number, index, text));
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -11,7 +11,7 @@ public class TextContentDecoder(IServiceProvider serviceProvider) : IContentDeco
|
|||||||
using var readStream = new StreamReader(stream);
|
using var readStream = new StreamReader(stream);
|
||||||
var content = await readStream.ReadToEndAsync(cancellationToken);
|
var content = await readStream.ReadToEndAsync(cancellationToken);
|
||||||
|
|
||||||
var paragraphs = textChunker.Split(content);
|
var paragraphs = textChunker.Split(content.Trim());
|
||||||
return paragraphs.Select((text, index) => new Chunk(null, index, text)).ToList();
|
return paragraphs.Select((text, index) => new Chunk(null, index, text)).ToList();
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -63,12 +63,6 @@ builder.Services.AddKernel()
|
|||||||
.AddAzureOpenAIEmbeddingGenerator(aiSettings.Embedding.Deployment, aiSettings.Embedding.Endpoint, aiSettings.Embedding.ApiKey, modelId: aiSettings.Embedding.ModelId, dimensions: aiSettings.Embedding.Dimensions)
|
.AddAzureOpenAIEmbeddingGenerator(aiSettings.Embedding.Deployment, aiSettings.Embedding.Endpoint, aiSettings.Embedding.ApiKey, modelId: aiSettings.Embedding.ModelId, dimensions: aiSettings.Embedding.Dimensions)
|
||||||
.AddAzureOpenAIChatCompletion(aiSettings.ChatCompletion.Deployment, aiSettings.ChatCompletion.Endpoint, aiSettings.ChatCompletion.ApiKey, modelId: aiSettings.ChatCompletion.ModelId);
|
.AddAzureOpenAIChatCompletion(aiSettings.ChatCompletion.Deployment, aiSettings.ChatCompletion.Endpoint, aiSettings.ChatCompletion.ApiKey, modelId: aiSettings.ChatCompletion.ModelId);
|
||||||
|
|
||||||
builder.Services.AddSingleton<TokenizerService>();
|
|
||||||
builder.Services.AddSingleton<ChatService>();
|
|
||||||
|
|
||||||
builder.Services.AddScoped<DocumentService>();
|
|
||||||
builder.Services.AddScoped<VectorSearchService>();
|
|
||||||
|
|
||||||
builder.Services.AddKeyedSingleton<IContentDecoder, PdfContentDecoder>(MediaTypeNames.Application.Pdf);
|
builder.Services.AddKeyedSingleton<IContentDecoder, PdfContentDecoder>(MediaTypeNames.Application.Pdf);
|
||||||
builder.Services.AddKeyedSingleton<IContentDecoder, DocxContentDecoder>("application/vnd.openxmlformats-officedocument.wordprocessingml.document");
|
builder.Services.AddKeyedSingleton<IContentDecoder, DocxContentDecoder>("application/vnd.openxmlformats-officedocument.wordprocessingml.document");
|
||||||
builder.Services.AddKeyedSingleton<IContentDecoder, TextContentDecoder>(MediaTypeNames.Text.Plain);
|
builder.Services.AddKeyedSingleton<IContentDecoder, TextContentDecoder>(MediaTypeNames.Text.Plain);
|
||||||
@@ -77,6 +71,12 @@ builder.Services.AddKeyedSingleton<IContentDecoder, TextContentDecoder>(MediaTyp
|
|||||||
builder.Services.AddKeyedSingleton<ITextChunker, DefaultTextChunker>(KeyedService.AnyKey);
|
builder.Services.AddKeyedSingleton<ITextChunker, DefaultTextChunker>(KeyedService.AnyKey);
|
||||||
builder.Services.AddKeyedSingleton<ITextChunker, MarkdownTextChunker>(MediaTypeNames.Text.Markdown);
|
builder.Services.AddKeyedSingleton<ITextChunker, MarkdownTextChunker>(MediaTypeNames.Text.Markdown);
|
||||||
|
|
||||||
|
builder.Services.AddSingleton<TokenizerService>();
|
||||||
|
builder.Services.AddSingleton<ChatService>();
|
||||||
|
|
||||||
|
builder.Services.AddScoped<DocumentService>();
|
||||||
|
builder.Services.AddScoped<VectorSearchService>();
|
||||||
|
|
||||||
builder.Services.AddOpenApi(options =>
|
builder.Services.AddOpenApi(options =>
|
||||||
{
|
{
|
||||||
options.RemoveServerList();
|
options.RemoveServerList();
|
||||||
|
|||||||
@@ -13,24 +13,24 @@
|
|||||||
<PackageReference Include="EFCore.SqlServer.VectorSearch" Version="9.0.0-preview.2" />
|
<PackageReference Include="EFCore.SqlServer.VectorSearch" Version="9.0.0-preview.2" />
|
||||||
<PackageReference Include="EntityFrameworkCore.Exceptions.SqlServer" Version="8.1.3" />
|
<PackageReference Include="EntityFrameworkCore.Exceptions.SqlServer" Version="8.1.3" />
|
||||||
<PackageReference Include="FluentValidation.DependencyInjectionExtensions" Version="12.0.0" />
|
<PackageReference Include="FluentValidation.DependencyInjectionExtensions" Version="12.0.0" />
|
||||||
<PackageReference Include="Microsoft.AspNetCore.OpenApi" Version="9.0.5" />
|
<PackageReference Include="Microsoft.AspNetCore.OpenApi" Version="9.0.6" />
|
||||||
<PackageReference Include="Microsoft.EntityFrameworkCore.SqlServer" Version="9.0.5" />
|
<PackageReference Include="Microsoft.EntityFrameworkCore.SqlServer" Version="9.0.6" />
|
||||||
<PackageReference Include="Microsoft.EntityFrameworkCore.Tools" Version="9.0.5">
|
<PackageReference Include="Microsoft.EntityFrameworkCore.Tools" Version="9.0.6">
|
||||||
<PrivateAssets>all</PrivateAssets>
|
<PrivateAssets>all</PrivateAssets>
|
||||||
<IncludeAssets>runtime; build; native; contentfiles; analyzers; buildtransitive</IncludeAssets>
|
<IncludeAssets>runtime; build; native; contentfiles; analyzers; buildtransitive</IncludeAssets>
|
||||||
</PackageReference>
|
</PackageReference>
|
||||||
<PackageReference Include="Microsoft.Extensions.Caching.Hybrid" Version="9.5.0" />
|
<PackageReference Include="Microsoft.Extensions.Caching.Hybrid" Version="9.6.0" />
|
||||||
<PackageReference Include="Microsoft.Extensions.Http.Resilience" Version="9.5.0" />
|
<PackageReference Include="Microsoft.Extensions.Http.Resilience" Version="9.6.0" />
|
||||||
<PackageReference Include="Microsoft.ML.Tokenizers" Version="1.0.2" />
|
<PackageReference Include="Microsoft.ML.Tokenizers" Version="1.0.2" />
|
||||||
<PackageReference Include="Microsoft.ML.Tokenizers.Data.Cl100kBase" Version="1.0.2" />
|
<PackageReference Include="Microsoft.ML.Tokenizers.Data.Cl100kBase" Version="1.0.2" />
|
||||||
<PackageReference Include="Microsoft.ML.Tokenizers.Data.O200kBase" Version="1.0.2" />
|
<PackageReference Include="Microsoft.ML.Tokenizers.Data.O200kBase" Version="1.0.2" />
|
||||||
<PackageReference Include="Microsoft.SemanticKernel" Version="1.55.0" />
|
<PackageReference Include="Microsoft.SemanticKernel" Version="1.56.0" />
|
||||||
<PackageReference Include="MimeMapping" Version="3.1.0" />
|
<PackageReference Include="MimeMapping" Version="3.1.0" />
|
||||||
<PackageReference Include="MinimalHelpers.FluentValidation" Version="1.1.3" />
|
<PackageReference Include="MinimalHelpers.FluentValidation" Version="1.1.3" />
|
||||||
<PackageReference Include="MinimalHelpers.Routing.Analyzers" Version="1.1.3" />
|
<PackageReference Include="MinimalHelpers.Routing.Analyzers" Version="1.1.3" />
|
||||||
<PackageReference Include="PdfPig" Version="0.1.10" />
|
<PackageReference Include="PdfPig" Version="0.1.10" />
|
||||||
<PackageReference Include="Swashbuckle.AspNetCore.SwaggerUI" Version="8.1.3" />
|
<PackageReference Include="Swashbuckle.AspNetCore.SwaggerUI" Version="8.1.4" />
|
||||||
<PackageReference Include="TinyHelpers.AspNetCore" Version="4.0.26" />
|
<PackageReference Include="TinyHelpers.AspNetCore" Version="4.0.27" />
|
||||||
</ItemGroup>
|
</ItemGroup>
|
||||||
|
|
||||||
</Project>
|
</Project>
|
||||||
|
|||||||
Reference in New Issue
Block a user