"""Tests for file type detection, dedup, note creation.""" from pathlib import Path import pytest from kb_search.ingest.detector import detect_type, is_supported from kb_search.ingest.note import auto_title, chunk_note class TestDetector: def test_pdf(self, tmp_path): assert detect_type(tmp_path / "doc.pdf") == ("pdf", None) def test_markdown(self, tmp_path): assert detect_type(tmp_path / "notes.md") == ("markdown", None) def test_txt(self, tmp_path): assert detect_type(tmp_path / "notes.txt") == ("markdown", None) def test_python(self, tmp_path): assert detect_type(tmp_path / "main.py") == ("code", "python") def test_bash(self, tmp_path): assert detect_type(tmp_path / "deploy.sh") == ("code", "bash") def test_go(self, tmp_path): assert detect_type(tmp_path / "main.go") == ("code", "go") def test_unsupported(self, tmp_path): with pytest.raises(ValueError, match="Unsupported"): detect_type(tmp_path / "archive.zip") def test_force_type(self, tmp_path): assert detect_type(tmp_path / "data.txt", force_type="code", force_language="bash") == ("code", "bash") def test_force_language_only(self, tmp_path): doc_type, lang = detect_type(tmp_path / "script.py", force_language="go") assert doc_type == "code" assert lang == "go" def test_is_supported(self, tmp_path): assert is_supported(tmp_path / "test.pdf") assert is_supported(tmp_path / "test.py") assert not is_supported(tmp_path / "test.zip") def test_case_insensitive(self, tmp_path): assert detect_type(tmp_path / "DOC.PDF") == ("pdf", None) def test_image_files(self, tmp_path): assert detect_type(tmp_path / "scan.png") == ("pdf", None) assert detect_type(tmp_path / "photo.jpg") == ("pdf", None) def test_docx(self, tmp_path): assert detect_type(tmp_path / "report.docx") == ("pdf", None) class TestNote: def test_chunk_note(self): chunks = chunk_note("Hello world") assert len(chunks) == 1 assert chunks[0]["text"] == "Hello world" assert chunks[0]["chunk_index"] == 0 def test_auto_title_short(self): assert auto_title("Short note") == "Short note" def test_auto_title_long(self): long_text = "This is a very long note that exceeds the maximum title length and should be truncated at a word boundary" result = auto_title(long_text, max_len=50) assert len(result) <= 54 # 50 + "..." assert result.endswith("...") def test_auto_title_multiline(self): text = "First line\nSecond line\nThird line" assert auto_title(text) == "First line" def test_auto_title_no_space(self): text = "a" * 100 result = auto_title(text, max_len=80) assert result.endswith("...")