"""Docling-based ingestion for PDFs, DOCX, HTML, and images.""" import logging from pathlib import Path # Suppress noisy Docling/RapidOCR logging logging.getLogger("RapidOCR").setLevel(logging.ERROR) logging.getLogger("docling.models.stages.ocr.rapid_ocr_model").setLevel(logging.ERROR) logging.getLogger("docling").setLevel(logging.WARNING) def chunk_document(file_path: Path, cfg: dict) -> list[dict]: """Ingest a document using Docling and return chunks.""" from docling.document_converter import DocumentConverter, PdfFormatOption from docling.datamodel.base_models import InputFormat from docling.datamodel.pipeline_options import PdfPipelineOptions, RapidOcrOptions # Configure PDF pipeline ocr_setting = cfg.get("ingestion", {}).get("enable_ocr", "auto") pdf_opts = PdfPipelineOptions() if ocr_setting == "never": pdf_opts.do_ocr = False elif ocr_setting == "always": pdf_opts.do_ocr = True pdf_opts.ocr_options = RapidOcrOptions(force_full_page_ocr=True) else: # "auto" — enable OCR but only trigger on pages with significant bitmap content pdf_opts.do_ocr = True pdf_opts.ocr_options = RapidOcrOptions(bitmap_area_threshold=0.25) converter = DocumentConverter( format_options={ InputFormat.PDF: PdfFormatOption(pipeline_options=pdf_opts), } ) # Convert result = converter.convert(str(file_path)) doc = result.document # Chunk using hierarchy-aware chunker chunking_cfg = cfg.get("chunking", {}).get("pdf", {}) strategy = chunking_cfg.get("strategy", "hierarchy") if strategy == "hierarchy": chunks = _hierarchy_chunk(doc) else: chunks = _fixed_chunk(doc, chunking_cfg) if not chunks: # Fallback: try extracting raw text text = doc.export_to_markdown() if text and text.strip(): chunks = _fixed_chunk_text(text, chunking_cfg) return chunks def _hierarchy_chunk(doc) -> list[dict]: """Use Docling's HierarchicalChunker.""" from docling_core.transforms.chunker import HierarchicalChunker chunker = HierarchicalChunker() chunks = [] for i, chunk in enumerate(chunker.chunk(doc)): meta = {} # Extract page info if available if hasattr(chunk, "meta") and chunk.meta: if hasattr(chunk.meta, "doc_items"): for item in chunk.meta.doc_items: if hasattr(item, "prov") and item.prov: for prov in item.prov: if hasattr(prov, "page_no"): meta["page"] = prov.page_no break # Section headers if hasattr(chunk.meta, "headings") and chunk.meta.headings: meta["section_header"] = " > ".join(chunk.meta.headings) chunks.append({ "text": chunk.text, "chunk_index": i, "metadata": meta, }) return chunks def _fixed_chunk(doc, chunking_cfg: dict) -> list[dict]: """Fixed-size chunking from Docling document.""" text = doc.export_to_markdown() return _fixed_chunk_text(text, chunking_cfg) def _fixed_chunk_text(text: str, chunking_cfg: dict) -> list[dict]: """Fixed-size chunking from plain text.""" max_tokens = chunking_cfg.get("max_tokens", 1024) overlap = chunking_cfg.get("overlap_tokens", 50) # Approximate: 1 token ~= 4 chars max_chars = max_tokens * 4 overlap_chars = overlap * 4 chunks = [] start = 0 idx = 0 while start < len(text): end = start + max_chars chunk_text = text[start:end].strip() if chunk_text: chunks.append({ "text": chunk_text, "chunk_index": idx, "metadata": {}, }) idx += 1 start = end - overlap_chars return chunks