Files
kb/src/kb_search/ingest/detector.py
T
2026-03-23 20:38:42 +00:00

55 lines
1.5 KiB
Python

"""File type detection and routing."""
from pathlib import Path
# Lookup table: lowercase file suffix (leading dot included) -> (doc_type, language).
# ``language`` is None for every entry except the source-code suffixes.
EXTENSION_MAP = {
    # Docling-handled formats (DOCX, HTML and images share the "pdf" doc type)
    **dict.fromkeys(
        (
            ".pdf",
            ".docx",
            ".html",
            ".htm",
            ".png",
            ".jpg",
            ".jpeg",
            ".tiff",
            ".bmp",
            ".webp",
        ),
        ("pdf", None),
    ),
    # Markdown / text
    **dict.fromkeys((".md", ".markdown", ".txt"), ("markdown", None)),
    # Code, tagged with its language
    ".py": ("code", "python"),
    ".sh": ("code", "bash"),
    ".bash": ("code", "bash"),
    ".go": ("code", "go"),
}

# Every suffix that has an entry in EXTENSION_MAP.
SUPPORTED_EXTENSIONS = set(EXTENSION_MAP)
def detect_type(path: Path, force_type: str | None = None,
                force_language: str | None = None) -> tuple[str, str | None]:
    """Resolve the ``(doc_type, language)`` pair for *path*.

    A truthy ``force_type`` bypasses detection entirely: it is returned
    together with ``force_language`` exactly as given, and the file
    extension is never inspected. Otherwise the lowercased suffix of
    ``path`` is looked up in ``EXTENSION_MAP``, and a truthy
    ``force_language`` replaces the language found there.

    Returns:
        A ``(doc_type, language)`` tuple; ``language`` may be ``None``.

    Raises:
        ValueError: if ``force_type`` is not given and the suffix has no
            entry in ``EXTENSION_MAP``. The message lists the supported
            extensions in sorted order.
    """
    if force_type:
        return force_type, force_language

    suffix = path.suffix.lower()
    # Every value in EXTENSION_MAP is a tuple, so None reliably means "absent".
    entry = EXTENSION_MAP.get(suffix)
    if entry is None:
        known = ", ".join(sorted(SUPPORTED_EXTENSIONS))
        raise ValueError(f"Unsupported file type '{suffix}'. Supported: {known}")

    doc_type, mapped_language = entry
    # A caller-supplied language wins over the one implied by the extension.
    return doc_type, (force_language or mapped_language)
def is_supported(path: Path) -> bool:
    """Return True when the lowercased suffix of *path* is in SUPPORTED_EXTENSIONS."""
    suffix = path.suffix.lower()
    return suffix in SUPPORTED_EXTENSIONS