82 lines
2.8 KiB
Python
82 lines
2.8 KiB
Python
"""Tests for file type detection, dedup, note creation."""
|
|
|
|
from pathlib import Path
|
|
|
|
import pytest
|
|
|
|
from kb_search.ingest.detector import detect_type, is_supported
|
|
from kb_search.ingest.note import auto_title, chunk_note
|
|
|
|
|
|
class TestDetector:
    """Expectations for ``detect_type`` and ``is_supported``.

    Every test only builds a path underneath ``tmp_path``; the files
    themselves are never created on disk.
    """

    def test_pdf(self, tmp_path):
        """A ``.pdf`` path is expected to map to ``("pdf", None)``."""
        target = tmp_path / "doc.pdf"
        assert detect_type(target) == ("pdf", None)

    def test_markdown(self, tmp_path):
        """A ``.md`` path is expected to map to ``("markdown", None)``."""
        target = tmp_path / "notes.md"
        assert detect_type(target) == ("markdown", None)

    def test_txt(self, tmp_path):
        """A ``.txt`` path is expected to be reported as markdown."""
        target = tmp_path / "notes.txt"
        assert detect_type(target) == ("markdown", None)

    def test_python(self, tmp_path):
        """A ``.py`` path is expected to be code with language ``python``."""
        target = tmp_path / "main.py"
        assert detect_type(target) == ("code", "python")

    def test_bash(self, tmp_path):
        """A ``.sh`` path is expected to be code with language ``bash``."""
        target = tmp_path / "deploy.sh"
        assert detect_type(target) == ("code", "bash")

    def test_go(self, tmp_path):
        """A ``.go`` path is expected to be code with language ``go``."""
        target = tmp_path / "main.go"
        assert detect_type(target) == ("code", "go")

    def test_unsupported(self, tmp_path):
        """A ``.zip`` path is expected to raise ``ValueError``."""
        target = tmp_path / "archive.zip"
        with pytest.raises(ValueError, match="Unsupported"):
            detect_type(target)

    def test_force_type(self, tmp_path):
        """Forcing both type and language is expected to win over ``.txt``."""
        target = tmp_path / "data.txt"
        detected = detect_type(target, force_type="code", force_language="bash")
        assert detected == ("code", "bash")

    def test_force_language_only(self, tmp_path):
        """Forcing only the language on a ``.py`` path keeps the type ``code``."""
        target = tmp_path / "script.py"
        kind, language = detect_type(target, force_language="go")
        assert kind == "code"
        assert language == "go"

    def test_is_supported(self, tmp_path):
        """``is_supported`` accepts ``.pdf`` and ``.py`` but rejects ``.zip``."""
        for accepted_name in ("test.pdf", "test.py"):
            assert is_supported(tmp_path / accepted_name)
        assert not is_supported(tmp_path / "test.zip")

    def test_case_insensitive(self, tmp_path):
        """An upper-case ``.PDF`` name is expected to be detected like ``.pdf``."""
        target = tmp_path / "DOC.PDF"
        assert detect_type(target) == ("pdf", None)

    def test_image_files(self, tmp_path):
        """``.png`` and ``.jpg`` paths are expected to map to ``("pdf", None)``."""
        for image_name in ("scan.png", "photo.jpg"):
            assert detect_type(tmp_path / image_name) == ("pdf", None)

    def test_docx(self, tmp_path):
        """A ``.docx`` path is expected to map to ``("pdf", None)``."""
        target = tmp_path / "report.docx"
        assert detect_type(target) == ("pdf", None)
|
|
|
|
|
|
class TestNote:
    """Expectations for ``chunk_note`` and ``auto_title``."""

    def test_chunk_note(self):
        """A short note is expected to yield a single chunk with index 0."""
        chunks = chunk_note("Hello world")
        assert len(chunks) == 1
        assert chunks[0]["text"] == "Hello world"
        assert chunks[0]["chunk_index"] == 0

    def test_auto_title_short(self):
        """A short single-line note is expected to be returned unchanged."""
        assert auto_title("Short note") == "Short note"

    def test_auto_title_long(self):
        """A title longer than ``max_len`` is cut and ends with an ellipsis."""
        long_text = (
            "This is a very long note that exceeds the maximum title length "
            "and should be truncated at a word boundary"
        )
        max_len = 50
        result = auto_title(long_text, max_len=max_len)
        # Upper bound is max_len plus the three-character ellipsis; deriving
        # it avoids a hand-computed constant drifting from the intent.
        assert len(result) <= max_len + len("...")
        assert result.endswith("...")

    def test_auto_title_multiline(self):
        """Only the first line of a multi-line note is expected as the title."""
        text = "First line\nSecond line\nThird line"
        assert auto_title(text) == "First line"

    def test_auto_title_no_space(self):
        """Text with no spaces is still expected to be truncated."""
        max_len = 80
        text = "a" * 100
        result = auto_title(text, max_len=max_len)
        assert result.endswith("...")
        # Without a length check, an untruncated title with "..." appended
        # would satisfy the suffix assertion alone.
        assert len(result) <= max_len + len("...")
|