"""File type detection and routing.""" from pathlib import Path EXTENSION_MAP = { # Docling-handled formats ".pdf": ("pdf", None), ".docx": ("pdf", None), # Docling handles DOCX too ".html": ("pdf", None), ".htm": ("pdf", None), ".png": ("pdf", None), ".jpg": ("pdf", None), ".jpeg": ("pdf", None), ".tiff": ("pdf", None), ".bmp": ("pdf", None), ".webp": ("pdf", None), # Markdown / text ".md": ("markdown", None), ".markdown": ("markdown", None), ".txt": ("markdown", None), # Code ".py": ("code", "python"), ".sh": ("code", "bash"), ".bash": ("code", "bash"), ".go": ("code", "go"), } SUPPORTED_EXTENSIONS = set(EXTENSION_MAP.keys()) def detect_type(path: Path, force_type: str | None = None, force_language: str | None = None) -> tuple[str, str | None]: """Detect document type and language from file extension. Returns (doc_type, language) tuple. Raises ValueError for unsupported file types. """ if force_type: return force_type, force_language ext = path.suffix.lower() if ext not in EXTENSION_MAP: supported = ", ".join(sorted(SUPPORTED_EXTENSIONS)) raise ValueError(f"Unsupported file type '{ext}'. Supported: {supported}") doc_type, language = EXTENSION_MAP[ext] if force_language: language = force_language return doc_type, language def is_supported(path: Path) -> bool: """Check if a file has a supported extension.""" return path.suffix.lower() in SUPPORTED_EXTENSIONS