Initial MVP
This commit is contained in:
@@ -0,0 +1,54 @@
|
||||
"""File type detection and routing."""
|
||||
|
||||
from pathlib import Path
|
||||
|
||||
EXTENSION_MAP = {
|
||||
# Docling-handled formats
|
||||
".pdf": ("pdf", None),
|
||||
".docx": ("pdf", None), # Docling handles DOCX too
|
||||
".html": ("pdf", None),
|
||||
".htm": ("pdf", None),
|
||||
".png": ("pdf", None),
|
||||
".jpg": ("pdf", None),
|
||||
".jpeg": ("pdf", None),
|
||||
".tiff": ("pdf", None),
|
||||
".bmp": ("pdf", None),
|
||||
".webp": ("pdf", None),
|
||||
# Markdown / text
|
||||
".md": ("markdown", None),
|
||||
".markdown": ("markdown", None),
|
||||
".txt": ("markdown", None),
|
||||
# Code
|
||||
".py": ("code", "python"),
|
||||
".sh": ("code", "bash"),
|
||||
".bash": ("code", "bash"),
|
||||
".go": ("code", "go"),
|
||||
}
|
||||
|
||||
SUPPORTED_EXTENSIONS = set(EXTENSION_MAP.keys())
|
||||
|
||||
|
||||
def detect_type(path: Path, force_type: str | None = None,
|
||||
force_language: str | None = None) -> tuple[str, str | None]:
|
||||
"""Detect document type and language from file extension.
|
||||
|
||||
Returns (doc_type, language) tuple.
|
||||
Raises ValueError for unsupported file types.
|
||||
"""
|
||||
if force_type:
|
||||
return force_type, force_language
|
||||
|
||||
ext = path.suffix.lower()
|
||||
if ext not in EXTENSION_MAP:
|
||||
supported = ", ".join(sorted(SUPPORTED_EXTENSIONS))
|
||||
raise ValueError(f"Unsupported file type '{ext}'. Supported: {supported}")
|
||||
|
||||
doc_type, language = EXTENSION_MAP[ext]
|
||||
if force_language:
|
||||
language = force_language
|
||||
return doc_type, language
|
||||
|
||||
|
||||
def is_supported(path: Path) -> bool:
    """Check if a file has a supported extension.

    The suffix is lowercased before the lookup, so the check is
    case-insensitive. Only the final suffix of the path is considered.

    Args:
        path: File path to check.

    Returns:
        True if the lowercased suffix is in SUPPORTED_EXTENSIONS,
        otherwise False.
    """
    return path.suffix.lower() in SUPPORTED_EXTENSIONS
|
||||
Reference in New Issue
Block a user