Initial MVP
This commit is contained in:
@@ -0,0 +1,123 @@
|
||||
"""Docling-based ingestion for PDFs, DOCX, HTML, and images."""
|
||||
|
||||
import logging
|
||||
from pathlib import Path
|
||||
|
||||
# Quiet the chatty Docling / RapidOCR loggers. Entries are applied in order:
# the two OCR loggers are raised to ERROR, the top-level "docling" logger to
# WARNING.
for _logger_name, _min_level in (
    ("RapidOCR", logging.ERROR),
    ("docling.models.stages.ocr.rapid_ocr_model", logging.ERROR),
    ("docling", logging.WARNING),
):
    logging.getLogger(_logger_name).setLevel(_min_level)
|
||||
|
||||
|
||||
def chunk_document(file_path: Path, cfg: dict) -> list[dict]:
    """Convert a document with Docling and split it into chunk dicts.

    Args:
        file_path: Path of the document to convert; passed to Docling as a
            string.
        cfg: Configuration mapping. Reads ``cfg["ingestion"]["enable_ocr"]``
            ("never", "always", anything else is treated as "auto") and
            ``cfg["chunking"]["pdf"]`` (``strategy`` plus the options used by
            the fixed-size chunker). Missing keys fall back to defaults.

    Returns:
        The chunk list produced by the selected strategy. If that list is
        empty, the document's markdown export is chunked with the fixed-size
        text chunker instead, provided the export contains non-whitespace text.
    """
    # Docling is imported lazily, inside the function, as in the original.
    from docling.document_converter import DocumentConverter, PdfFormatOption
    from docling.datamodel.base_models import InputFormat
    from docling.datamodel.pipeline_options import PdfPipelineOptions, RapidOcrOptions

    # --- PDF pipeline / OCR configuration ---
    ocr_mode = cfg.get("ingestion", {}).get("enable_ocr", "auto")
    pipeline_options = PdfPipelineOptions()
    if ocr_mode == "never":
        pipeline_options.do_ocr = False
    else:
        pipeline_options.do_ocr = True
        if ocr_mode == "always":
            pipeline_options.ocr_options = RapidOcrOptions(force_full_page_ocr=True)
        else:
            # "auto" — OCR stays enabled but should only trigger on pages
            # with significant bitmap content.
            pipeline_options.ocr_options = RapidOcrOptions(bitmap_area_threshold=0.25)

    # Only the PDF input format receives explicit pipeline options.
    pdf_format = PdfFormatOption(pipeline_options=pipeline_options)
    converter = DocumentConverter(format_options={InputFormat.PDF: pdf_format})

    # --- Conversion ---
    document = converter.convert(str(file_path)).document

    # --- Chunking ---
    pdf_chunking = cfg.get("chunking", {}).get("pdf", {})
    if pdf_chunking.get("strategy", "hierarchy") == "hierarchy":
        chunks = _hierarchy_chunk(document)
    else:
        chunks = _fixed_chunk(document, pdf_chunking)

    if chunks:
        return chunks

    # Fallback: the chosen strategy yielded nothing, so chunk the raw
    # markdown export when it has any real content.
    markdown = document.export_to_markdown()
    if markdown and markdown.strip():
        return _fixed_chunk_text(markdown, pdf_chunking)
    return chunks
|
||||
|
||||
|
||||
def _first_page_no(doc_items):
    """Return the first non-None ``page_no`` found in the items' provenance, or None."""
    for item in doc_items or ():
        for prov in getattr(item, "prov", None) or ():
            page_no = getattr(prov, "page_no", None)
            if page_no is not None:
                return page_no
    return None


def _hierarchy_chunk(doc) -> list[dict]:
    """Chunk a Docling document with Docling's HierarchicalChunker.

    Args:
        doc: The converted Docling document to chunk.

    Returns:
        One dict per chunk with keys ``"text"``, ``"chunk_index"`` (position
        in the chunker's output) and ``"metadata"``. The metadata may contain:

        * ``"page"`` - page number of the first provenance entry found among
          the chunk's doc items. Previously the inner ``break`` only left the
          provenance loop, so later items kept overwriting the value and the
          last item's page was reported.
        * ``"section_header"`` - the chunk's headings joined with ``" > "``.

        Either key is omitted when the information is not available.
    """
    from docling_core.transforms.chunker import HierarchicalChunker

    chunks = []
    for index, chunk in enumerate(HierarchicalChunker().chunk(doc)):
        metadata = {}

        chunk_meta = getattr(chunk, "meta", None)
        if chunk_meta:
            # Page info, if any doc item carries provenance.
            page_no = _first_page_no(getattr(chunk_meta, "doc_items", None))
            if page_no is not None:
                metadata["page"] = page_no

            # Section headers, outermost first as provided by the chunker.
            headings = getattr(chunk_meta, "headings", None)
            if headings:
                metadata["section_header"] = " > ".join(headings)

        chunks.append({
            "text": chunk.text,
            "chunk_index": index,
            "metadata": metadata,
        })

    return chunks
|
||||
|
||||
|
||||
def _fixed_chunk(doc, chunking_cfg: dict) -> list[dict]:
    """Export the Docling document to markdown and chunk it at a fixed size.

    Args:
        doc: Docling document; only ``export_to_markdown()`` is called on it.
        chunking_cfg: Options forwarded unchanged to ``_fixed_chunk_text``.

    Returns:
        The chunk dicts produced by ``_fixed_chunk_text`` for the markdown.
    """
    return _fixed_chunk_text(doc.export_to_markdown(), chunking_cfg)
|
||||
|
||||
|
||||
def _fixed_chunk_text(text: str, chunking_cfg: dict) -> list[dict]:
|
||||
"""Fixed-size chunking from plain text."""
|
||||
max_tokens = chunking_cfg.get("max_tokens", 1024)
|
||||
overlap = chunking_cfg.get("overlap_tokens", 50)
|
||||
|
||||
# Approximate: 1 token ~= 4 chars
|
||||
max_chars = max_tokens * 4
|
||||
overlap_chars = overlap * 4
|
||||
|
||||
chunks = []
|
||||
start = 0
|
||||
idx = 0
|
||||
while start < len(text):
|
||||
end = start + max_chars
|
||||
chunk_text = text[start:end].strip()
|
||||
if chunk_text:
|
||||
chunks.append({
|
||||
"text": chunk_text,
|
||||
"chunk_index": idx,
|
||||
"metadata": {},
|
||||
})
|
||||
idx += 1
|
||||
start = end - overlap_chars
|
||||
|
||||
return chunks
|
||||
Reference in New Issue
Block a user