mirror of
https://github.com/marcominerva/SqlDatabaseVectorSearch.git
synced 2026-06-20 12:23:10 +00:00
Refactor document processing and embedding generation
- Updated `DocxContentDecoder` to process Word documents as chunks of text, removing page tracking and enhancing content handling.
- Modified `VectorSearchService.ImportAsync` to work with chunks, implementing batching for embedding generation.
- Added `EmbeddingBatchSize` property to `AppSettings` for configurable batch processing.
- Updated `appsettings.json` to include the new `EmbeddingBatchSize` setting for improved control over embedding processes.
This commit is contained in:
@@ -11,41 +11,22 @@ public class DocxContentDecoder(IServiceProvider serviceProvider) : IContentDeco
|
|||||||
{
|
{
|
||||||
var textChunker = serviceProvider.GetRequiredKeyedService<ITextChunker>(contentType);
|
var textChunker = serviceProvider.GetRequiredKeyedService<ITextChunker>(contentType);
|
||||||
|
|
||||||
|
// Open a Word document for read-only access.
|
||||||
using var document = WordprocessingDocument.Open(stream, false);
|
using var document = WordprocessingDocument.Open(stream, false);
|
||||||
|
|
||||||
var body = document.MainDocumentPart?.Document.Body;
|
var body = document.MainDocumentPart?.Document.Body;
|
||||||
if (body is null)
|
var content = new StringBuilder();
|
||||||
|
|
||||||
|
foreach (var p in body?.Descendants<Paragraph>() ?? [])
|
||||||
{
|
{
|
||||||
return Task.FromResult(Enumerable.Empty<Chunk>());
|
content.AppendLine(p.InnerText);
|
||||||
}
|
}
|
||||||
|
|
||||||
var pages = new List<string>();
|
var paragraphs = textChunker.Split(content.ToString().Trim());
|
||||||
var pageBuilder = new StringBuilder();
|
|
||||||
|
|
||||||
foreach (var paragraph in body.Descendants<Paragraph>())
|
// Pages do not exist in the OpenXML format until they are rendered by a word processor.
|
||||||
{
|
// See https://stackoverflow.com/questions/43700252/how-to-get-page-numbers-based-on-openxmlelement for more details.
|
||||||
// Note: this is just an attempt at counting pages, not 100% reliable
|
// Therefore, we will not assign a page number.
|
||||||
// see https://stackoverflow.com/questions/39992870/how-to-access-openxml-content-by-page-number
|
return Task.FromResult(paragraphs.Select((text, index) => new Chunk(null, index, text)).ToList().AsEnumerable());
|
||||||
var lastRenderedPageBreak = paragraph.GetFirstChild<Run>()?.GetFirstChild<LastRenderedPageBreak>();
|
|
||||||
if (lastRenderedPageBreak is not null)
|
|
||||||
{
|
|
||||||
// Note: no trimming, use original spacing when working with pages
|
|
||||||
pages.Add(pageBuilder.ToString());
|
|
||||||
pageBuilder.Clear();
|
|
||||||
}
|
|
||||||
|
|
||||||
pageBuilder.AppendLine(paragraph.InnerText);
|
|
||||||
}
|
|
||||||
|
|
||||||
// After processing all paragraphs, add the last page (even if empty).
|
|
||||||
pages.Add(pageBuilder.ToString());
|
|
||||||
|
|
||||||
var chunks = new List<Chunk>();
|
|
||||||
foreach (var (pageIndex, pageText) in pages.Index())
|
|
||||||
{
|
|
||||||
var paragraphs = textChunker.Split(pageText.Trim());
|
|
||||||
chunks.AddRange(paragraphs.Where(p => !string.IsNullOrWhiteSpace(p)).Select((text, index) => new Chunk(pageIndex + 1, index, text)));
|
|
||||||
}
|
|
||||||
|
|
||||||
return Task.FromResult(chunks.AsEnumerable());
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -22,10 +22,11 @@ public partial class VectorSearchService(IServiceProvider serviceProvider, Appli
|
|||||||
{
|
{
|
||||||
// Extract the contents of the file.
|
// Extract the contents of the file.
|
||||||
var decoder = serviceProvider.GetKeyedService<IContentDecoder>(contentType) ?? throw new NotSupportedException($"Content type '{contentType}' is not supported.");
|
var decoder = serviceProvider.GetKeyedService<IContentDecoder>(contentType) ?? throw new NotSupportedException($"Content type '{contentType}' is not supported.");
|
||||||
var paragraphs = await decoder.DecodeAsync(stream, contentType, cancellationToken);
|
var chunks = await decoder.DecodeAsync(stream, contentType, cancellationToken);
|
||||||
|
var chunkContents = chunks.Select(p => p.Content).ToList();
|
||||||
|
|
||||||
// We get the token count of the whole document because it is the total number of tokens used by the embedding (it may be necessary, for example, for cost analysis).
|
// We get the token count of the whole document because it is the total number of tokens used by the embedding (it may be necessary, for example, for cost analysis).
|
||||||
var tokenCount = tokenizerService.CountEmbeddingTokens(string.Join(" ", paragraphs.Select(p => p.Content)));
|
var tokenCount = tokenizerService.CountEmbeddingTokens(string.Join(" ", chunkContents));
|
||||||
|
|
||||||
var strategy = dbContext.Database.CreateExecutionStrategy();
|
var strategy = dbContext.Database.CreateExecutionStrategy();
|
||||||
var document = await strategy.ExecuteAsync(async (cancellationToken) =>
|
var document = await strategy.ExecuteAsync(async (cancellationToken) =>
|
||||||
@@ -41,21 +42,30 @@ public partial class VectorSearchService(IServiceProvider serviceProvider, Appli
|
|||||||
var document = new Entities.Document { Id = documentId.GetValueOrDefault(), Name = name, CreationDate = timeProvider.GetUtcNow() };
|
var document = new Entities.Document { Id = documentId.GetValueOrDefault(), Name = name, CreationDate = timeProvider.GetUtcNow() };
|
||||||
dbContext.Documents.Add(document);
|
dbContext.Documents.Add(document);
|
||||||
|
|
||||||
var embeddings = await embeddingGenerator.GenerateAsync(paragraphs.Select(p => p.Content), cancellationToken: cancellationToken);
|
// Process chunks in batches.
|
||||||
|
var embeddings = new List<Embedding<float>>();
|
||||||
|
foreach (var batch in chunkContents.Chunk(appSettings.EmbeddingBatchSize))
|
||||||
|
{
|
||||||
|
logger.LogDebug("Processing batch of {Count} chunks for embedding generation...", batch.Length);
|
||||||
|
|
||||||
|
// Generate embeddings for this batch.
|
||||||
|
var batchEmbeddings = await embeddingGenerator.GenerateAsync(batch, cancellationToken: cancellationToken);
|
||||||
|
embeddings.AddRange(batchEmbeddings);
|
||||||
|
}
|
||||||
|
|
||||||
// Save the document chunks and the corresponding embedding in the database.
|
// Save the document chunks and the corresponding embedding in the database.
|
||||||
foreach (var (index, embedding) in embeddings.Index())
|
foreach (var (index, embedding) in embeddings.Index())
|
||||||
{
|
{
|
||||||
var paragraph = paragraphs.ElementAt(index);
|
var chunk = chunks.ElementAt(index);
|
||||||
logger.LogDebug("Storing a paragraph of {TokenCount} tokens.", tokenizerService.CountChatCompletionTokens(paragraph.Content));
|
logger.LogDebug("Storing a chunk of {TokenCount} tokens.", tokenizerService.CountChatCompletionTokens(chunk.Content));
|
||||||
|
|
||||||
var documentChunk = new Entities.DocumentChunk
|
var documentChunk = new Entities.DocumentChunk
|
||||||
{
|
{
|
||||||
Document = document,
|
Document = document,
|
||||||
Index = index,
|
Index = index,
|
||||||
PageNumber = paragraph.PageNumber,
|
PageNumber = chunk.PageNumber,
|
||||||
IndexOnPage = paragraph.IndexOnPage,
|
IndexOnPage = chunk.IndexOnPage,
|
||||||
Content = paragraph.Content,
|
Content = chunk.Content,
|
||||||
Embedding = embedding.Vector.ToArray()
|
Embedding = embedding.Vector.ToArray()
|
||||||
};
|
};
|
||||||
|
|
||||||
|
|||||||
@@ -2,6 +2,8 @@
|
|||||||
|
|
||||||
public class AppSettings
|
public class AppSettings
|
||||||
{
|
{
|
||||||
|
public int EmbeddingBatchSize { get; init; } = 32;
|
||||||
|
|
||||||
public int MaxTokensPerLine { get; init; } = 300;
|
public int MaxTokensPerLine { get; init; } = 300;
|
||||||
|
|
||||||
public int MaxTokensPerParagraph { get; init; } = 1000;
|
public int MaxTokensPerParagraph { get; init; } = 1000;
|
||||||
|
|||||||
@@ -20,6 +20,7 @@
|
|||||||
}
|
}
|
||||||
},
|
},
|
||||||
"AppSettings": {
|
"AppSettings": {
|
||||||
|
"EmbeddingBatchSize": 32,
|
||||||
"MaxTokensPerLine": 300,
|
"MaxTokensPerLine": 300,
|
||||||
"MaxTokensPerParagraph": 1000,
|
"MaxTokensPerParagraph": 1000,
|
||||||
"OverlapTokens": 100,
|
"OverlapTokens": 100,
|
||||||
|
|||||||
Reference in New Issue
Block a user