124 lines
3.9 KiB
Python
124 lines
3.9 KiB
Python
"""Docling-based ingestion for PDFs, DOCX, HTML, and images."""
|
|
|
|
import logging
|
|
from pathlib import Path
|
|
|
|
# Quieten third-party loggers: Docling as a whole only reports warnings and
# above, while the RapidOCR loggers are restricted to errors.
logging.getLogger("docling").setLevel(logging.WARNING)
logging.getLogger("docling.models.stages.ocr.rapid_ocr_model").setLevel(logging.ERROR)
logging.getLogger("RapidOCR").setLevel(logging.ERROR)
|
|
|
|
|
|
def chunk_document(file_path: Path, cfg: dict) -> list[dict]:
    """Convert a document with Docling and split the result into chunks.

    Args:
        file_path: Location of the document to convert.
        cfg: Configuration mapping. Reads ``ingestion.enable_ocr``
            (``"never"``, ``"always"``, anything else is treated as
            ``"auto"``) and ``chunking.pdf`` (``strategy`` plus the options
            consumed by the fixed-size chunker).

    Returns:
        The chunk dicts produced by the selected chunking strategy. When the
        strategy yields nothing, the document's Markdown export is chunked
        with the fixed-size splitter instead, if it contains any
        non-whitespace text.
    """
    from docling.document_converter import DocumentConverter, PdfFormatOption
    from docling.datamodel.base_models import InputFormat
    from docling.datamodel.pipeline_options import PdfPipelineOptions, RapidOcrOptions

    # Translate the OCR setting into PDF pipeline options.
    ocr_mode = cfg.get("ingestion", {}).get("enable_ocr", "auto")
    pipeline_options = PdfPipelineOptions()
    pipeline_options.do_ocr = ocr_mode != "never"
    if ocr_mode == "always":
        pipeline_options.ocr_options = RapidOcrOptions(force_full_page_ocr=True)
    elif ocr_mode != "never":
        # "auto": OCR stays on, but only pages with significant bitmap
        # content are meant to trigger it.
        pipeline_options.ocr_options = RapidOcrOptions(bitmap_area_threshold=0.25)

    pdf_format = PdfFormatOption(pipeline_options=pipeline_options)
    converter = DocumentConverter(format_options={InputFormat.PDF: pdf_format})
    document = converter.convert(str(file_path)).document

    # Pick the chunking strategy; hierarchy-aware chunking is the default.
    pdf_chunking = cfg.get("chunking", {}).get("pdf", {})
    if pdf_chunking.get("strategy", "hierarchy") == "hierarchy":
        chunks = _hierarchy_chunk(document)
    else:
        chunks = _fixed_chunk(document, pdf_chunking)

    if chunks:
        return chunks

    # Nothing came out of the chunker: fall back to the raw Markdown export.
    markdown = document.export_to_markdown()
    if markdown and markdown.strip():
        return _fixed_chunk_text(markdown, pdf_chunking)
    return chunks
|
|
|
|
|
|
def _hierarchy_chunk(doc) -> list[dict]:
    """Chunk a Docling document with Docling's ``HierarchicalChunker``.

    Args:
        doc: The converted Docling document to chunk.

    Returns:
        One dict per chunk with ``text``, ``chunk_index`` (position in the
        chunker's output) and ``metadata``. ``metadata`` may contain ``page``
        (the ``page_no`` of the first provenance entry found on the chunk's
        items) and ``section_header`` (the chunk's headings joined with
        ``" > "``); either key is omitted when the information is missing.
    """
    from docling_core.transforms.chunker import HierarchicalChunker

    missing = object()  # distinguishes "no provenance" from a None page_no
    chunks = []

    for index, chunk in enumerate(HierarchicalChunker().chunk(doc)):
        meta = {}
        chunk_meta = getattr(chunk, "meta", None)

        if chunk_meta:
            # Stop at the first page number: previously only the inner loop
            # was exited, so later items kept overwriting the page and the
            # chunk ended up tagged with its last item's page.
            page_no = next(_iter_page_numbers(chunk_meta), missing)
            if page_no is not missing:
                meta["page"] = page_no

            # Section headers
            headings = getattr(chunk_meta, "headings", None)
            if headings:
                meta["section_header"] = " > ".join(headings)

        chunks.append({
            "text": chunk.text,
            "chunk_index": index,
            "metadata": meta,
        })

    return chunks


def _iter_page_numbers(chunk_meta):
    """Yield ``page_no`` for each provenance entry of the chunk's items, in order."""
    for item in getattr(chunk_meta, "doc_items", None) or ():
        for prov in getattr(item, "prov", None) or ():
            if hasattr(prov, "page_no"):
                yield prov.page_no
|
|
|
|
|
|
def _fixed_chunk(doc, chunking_cfg: dict) -> list[dict]:
    """Export *doc* to Markdown and split that text into fixed-size chunks."""
    return _fixed_chunk_text(doc.export_to_markdown(), chunking_cfg)
|
|
|
|
|
|
def _fixed_chunk_text(text: str, chunking_cfg: dict) -> list[dict]:
    """Split plain text into fixed-size, overlapping chunks.

    Sizes are configured in tokens and converted to characters using the
    rough approximation of 4 characters per token.

    Args:
        text: The text to split.
        chunking_cfg: Chunking options. Reads ``max_tokens`` (default 1024)
            and ``overlap_tokens`` (default 50).

    Returns:
        A list of dicts with ``text`` (the window with surrounding whitespace
        stripped), ``chunk_index`` (0-based, counting only non-empty chunks)
        and an empty ``metadata`` dict. Empty or whitespace-only text yields
        an empty list.

    Raises:
        ValueError: If ``max_tokens`` is not positive.
    """
    max_tokens = chunking_cfg.get("max_tokens", 1024)
    overlap = chunking_cfg.get("overlap_tokens", 50)

    if max_tokens <= 0:
        raise ValueError(f"max_tokens must be positive, got {max_tokens!r}")

    # Approximate: 1 token ~= 4 chars
    max_chars = max_tokens * 4
    # Keep the overlap inside [0, max_chars - 1]: an overlap as large as the
    # window would never advance (infinite loop), and a negative one would
    # silently skip text between windows.
    overlap_chars = min(max(overlap * 4, 0), max_chars - 1)
    step = max_chars - overlap_chars

    chunks = []
    text_len = len(text)
    start = 0
    while start < text_len:
        end = start + max_chars
        piece = text[start:end].strip()
        if piece:
            chunks.append({
                "text": piece,
                "chunk_index": len(chunks),
                "metadata": {},
            })
        if end >= text_len:
            # This window already reached the end of the text; another pass
            # would only re-emit the overlap tail as a redundant chunk.
            break
        start += step

    return chunks
|