55 lines
1.5 KiB
Python
55 lines
1.5 KiB
Python
"""File type detection and routing."""
|
|
|
|
from pathlib import Path
|
|
|
|
# Docling-handled formats: every one of these suffixes maps to the "pdf"
# doc type (Docling handles DOCX too).
_DOCLING_SUFFIXES = (
    ".pdf",
    ".docx",
    ".html",
    ".htm",
    ".png",
    ".jpg",
    ".jpeg",
    ".tiff",
    ".bmp",
    ".webp",
)

# Markdown / text suffixes, all mapped to the "markdown" doc type.
_MARKDOWN_SUFFIXES = (".md", ".markdown", ".txt")

# Code suffixes, each paired with the language recorded for it.
_CODE_LANGUAGES = (
    (".py", "python"),
    (".sh", "bash"),
    (".bash", "bash"),
    (".go", "go"),
)

# Maps a lowercase file suffix (leading dot included) to a
# (doc_type, language) pair; language is None for everything except code.
EXTENSION_MAP = {
    **dict.fromkeys(_DOCLING_SUFFIXES, ("pdf", None)),
    **dict.fromkeys(_MARKDOWN_SUFFIXES, ("markdown", None)),
    **{suffix: ("code", language) for suffix, language in _CODE_LANGUAGES},
}

# Set of every suffix that has an entry in EXTENSION_MAP.
SUPPORTED_EXTENSIONS = set(EXTENSION_MAP)
|
|
|
|
|
|
def detect_type(path: Path, force_type: str | None = None,
|
|
force_language: str | None = None) -> tuple[str, str | None]:
|
|
"""Detect document type and language from file extension.
|
|
|
|
Returns (doc_type, language) tuple.
|
|
Raises ValueError for unsupported file types.
|
|
"""
|
|
if force_type:
|
|
return force_type, force_language
|
|
|
|
ext = path.suffix.lower()
|
|
if ext not in EXTENSION_MAP:
|
|
supported = ", ".join(sorted(SUPPORTED_EXTENSIONS))
|
|
raise ValueError(f"Unsupported file type '{ext}'. Supported: {supported}")
|
|
|
|
doc_type, language = EXTENSION_MAP[ext]
|
|
if force_language:
|
|
language = force_language
|
|
return doc_type, language
|
|
|
|
|
|
def is_supported(path: Path) -> bool:
    """Return True when the lowercased suffix of *path* is in SUPPORTED_EXTENSIONS."""
    suffix = path.suffix.lower()
    return suffix in SUPPORTED_EXTENSIONS
|