Files
kb/src/kb_search/ingest/docling.py
T
2026-03-23 20:38:42 +00:00

124 lines
3.9 KiB
Python

"""Docling-based ingestion for PDFs, DOCX, HTML, and images."""
import logging
from pathlib import Path
# Quiet the chatty Docling / RapidOCR loggers: each named logger gets the
# minimum level it is allowed to emit at.
for _quiet_name, _quiet_level in (
    ("RapidOCR", logging.ERROR),
    ("docling.models.stages.ocr.rapid_ocr_model", logging.ERROR),
    ("docling", logging.WARNING),
):
    logging.getLogger(_quiet_name).setLevel(_quiet_level)
del _quiet_name, _quiet_level
def chunk_document(file_path: Path, cfg: dict) -> list[dict]:
    """Convert a document with Docling and split the result into chunks.

    Args:
        file_path: Path of the document; passed to the converter as a string.
        cfg: Configuration mapping. Reads ``ingestion.enable_ocr``
            (``"never"``, ``"always"``, anything else behaves as ``"auto"``)
            and the ``chunking.pdf`` section (``strategy`` plus whatever the
            fixed-size chunker reads).

    Returns:
        The chunk list produced by the selected chunking strategy. When that
        list is empty and the document's Markdown export contains
        non-whitespace text, the fixed-size chunks of that export instead.
    """
    # Docling is imported lazily, inside the function.
    from docling.document_converter import DocumentConverter, PdfFormatOption
    from docling.datamodel.base_models import InputFormat
    from docling.datamodel.pipeline_options import PdfPipelineOptions, RapidOcrOptions

    # --- PDF pipeline / OCR configuration ---
    ocr_mode = cfg.get("ingestion", {}).get("enable_ocr", "auto")
    pipeline_options = PdfPipelineOptions()
    pipeline_options.do_ocr = ocr_mode != "never"
    if ocr_mode == "always":
        pipeline_options.ocr_options = RapidOcrOptions(force_full_page_ocr=True)
    elif ocr_mode != "never":
        # "auto": OCR stays enabled but is only meant to trigger on pages
        # with significant bitmap content.
        pipeline_options.ocr_options = RapidOcrOptions(bitmap_area_threshold=0.25)

    # Only the PDF format receives custom pipeline options.
    pdf_format = PdfFormatOption(pipeline_options=pipeline_options)
    converter = DocumentConverter(format_options={InputFormat.PDF: pdf_format})

    # --- Conversion ---
    document = converter.convert(str(file_path)).document

    # --- Chunking ---
    pdf_chunking = cfg.get("chunking", {}).get("pdf", {})
    if pdf_chunking.get("strategy", "hierarchy") == "hierarchy":
        pieces = _hierarchy_chunk(document)
    else:
        pieces = _fixed_chunk(document, pdf_chunking)
    if pieces:
        return pieces

    # Nothing came out of the chosen strategy: fall back to the raw
    # Markdown export, chunked by fixed size.
    markdown = document.export_to_markdown()
    if markdown and markdown.strip():
        return _fixed_chunk_text(markdown, pdf_chunking)
    return pieces
def _hierarchy_chunk(doc) -> list[dict]:
    """Chunk a Docling document with ``HierarchicalChunker``.

    Args:
        doc: Document object handed straight to ``HierarchicalChunker.chunk``.

    Returns:
        One dict per chunk with ``text``, ``chunk_index`` (enumeration order)
        and ``metadata``. ``metadata`` may contain ``page`` (taken from item
        provenance) and ``section_header`` (headings joined with ``" > "``).
    """
    from docling_core.transforms.chunker import HierarchicalChunker

    splitter = HierarchicalChunker()
    results = []
    for position, piece in enumerate(splitter.chunk(doc)):
        info = {}
        piece_meta = getattr(piece, "meta", None)
        if piece_meta:
            # Page number, when provenance is available.
            if hasattr(piece_meta, "doc_items"):
                for item in piece_meta.doc_items:
                    provenance = getattr(item, "prov", None)
                    if not provenance:
                        continue
                    # Use the first provenance entry exposing a page number.
                    # NOTE(review): every item with provenance overwrites
                    # "page", so the value kept comes from the last such
                    # item -- confirm that is intended.
                    for entry in provenance:
                        if hasattr(entry, "page_no"):
                            info["page"] = entry.page_no
                            break
            # Section header trail.
            headings = getattr(piece_meta, "headings", None)
            if headings:
                info["section_header"] = " > ".join(headings)
        results.append(
            {
                "text": piece.text,
                "chunk_index": position,
                "metadata": info,
            }
        )
    return results
def _fixed_chunk(doc, chunking_cfg: dict) -> list[dict]:
    """Chunk a Docling document by fixed size, using its Markdown export.

    Args:
        doc: Object exposing ``export_to_markdown()``.
        chunking_cfg: Options forwarded unchanged to ``_fixed_chunk_text``.

    Returns:
        Whatever ``_fixed_chunk_text`` returns for the exported Markdown.
    """
    return _fixed_chunk_text(doc.export_to_markdown(), chunking_cfg)
def _fixed_chunk_text(text: str, chunking_cfg: dict) -> list[dict]:
"""Fixed-size chunking from plain text."""
max_tokens = chunking_cfg.get("max_tokens", 1024)
overlap = chunking_cfg.get("overlap_tokens", 50)
# Approximate: 1 token ~= 4 chars
max_chars = max_tokens * 4
overlap_chars = overlap * 4
chunks = []
start = 0
idx = 0
while start < len(text):
end = start + max_chars
chunk_text = text[start:end].strip()
if chunk_text:
chunks.append({
"text": chunk_text,
"chunk_index": idx,
"metadata": {},
})
idx += 1
start = end - overlap_chars
return chunks