Initial MVP
This commit is contained in:
@@ -0,0 +1,81 @@
|
||||
"""Tests for file type detection, dedup, note creation."""
|
||||
|
||||
from pathlib import Path
|
||||
|
||||
import pytest
|
||||
|
||||
from kb_search.ingest.detector import detect_type, is_supported
|
||||
from kb_search.ingest.note import auto_title, chunk_note
|
||||
|
||||
|
||||
class TestDetector:
    """Checks for ``detect_type`` and ``is_supported``.

    None of the paths built from ``tmp_path`` are ever created on disk, so
    these tests only hand the detector a file name, never real content.
    """

    def test_pdf(self, tmp_path):
        """A ``.pdf`` name is expected to yield the ``pdf`` type, no language."""
        expected = ("pdf", None)
        assert detect_type(tmp_path / "doc.pdf") == expected

    def test_markdown(self, tmp_path):
        """A ``.md`` name is expected to yield the ``markdown`` type."""
        expected = ("markdown", None)
        assert detect_type(tmp_path / "notes.md") == expected

    def test_txt(self, tmp_path):
        """A ``.txt`` name is expected to be reported as ``markdown`` too."""
        expected = ("markdown", None)
        assert detect_type(tmp_path / "notes.txt") == expected

    def test_python(self, tmp_path):
        """A ``.py`` name is expected to yield ``code`` with language ``python``."""
        expected = ("code", "python")
        assert detect_type(tmp_path / "main.py") == expected

    def test_bash(self, tmp_path):
        """A ``.sh`` name is expected to yield ``code`` with language ``bash``."""
        expected = ("code", "bash")
        assert detect_type(tmp_path / "deploy.sh") == expected

    def test_go(self, tmp_path):
        """A ``.go`` name is expected to yield ``code`` with language ``go``."""
        expected = ("code", "go")
        assert detect_type(tmp_path / "main.go") == expected

    def test_unsupported(self, tmp_path):
        """A ``.zip`` name must raise ``ValueError`` mentioning "Unsupported"."""
        target = tmp_path / "archive.zip"
        with pytest.raises(ValueError, match="Unsupported"):
            detect_type(target)

    def test_force_type(self, tmp_path):
        """Explicit ``force_type``/``force_language`` win over the ``.txt`` name."""
        target = tmp_path / "data.txt"
        expected = ("code", "bash")
        assert detect_type(target, force_type="code", force_language="bash") == expected

    def test_force_language_only(self, tmp_path):
        """Forcing only the language keeps ``code`` but replaces the language."""
        kind, language = detect_type(tmp_path / "script.py", force_language="go")
        assert kind == "code"
        assert language == "go"

    def test_is_supported(self, tmp_path):
        """``is_supported`` is truthy for pdf/py names and falsy for a zip name."""
        for accepted_name in ("test.pdf", "test.py"):
            assert is_supported(tmp_path / accepted_name)
        assert not is_supported(tmp_path / "test.zip")

    def test_case_insensitive(self, tmp_path):
        """An upper-case ``.PDF`` name is expected to be treated like ``.pdf``."""
        expected = ("pdf", None)
        assert detect_type(tmp_path / "DOC.PDF") == expected

    def test_image_files(self, tmp_path):
        """``.png`` and ``.jpg`` names are expected to be reported as ``pdf``."""
        expected = ("pdf", None)
        for image_name in ("scan.png", "photo.jpg"):
            assert detect_type(tmp_path / image_name) == expected

    def test_docx(self, tmp_path):
        """A ``.docx`` name is expected to be reported as ``pdf``."""
        expected = ("pdf", None)
        assert detect_type(tmp_path / "report.docx") == expected
|
||||
|
||||
|
||||
class TestNote:
    """Checks for the note helpers ``chunk_note`` and ``auto_title``."""

    def test_chunk_note(self):
        """A short note yields exactly one chunk carrying the text and index 0."""
        pieces = chunk_note("Hello world")
        assert len(pieces) == 1
        only = pieces[0]
        assert only["text"] == "Hello world"
        assert only["chunk_index"] == 0

    def test_auto_title_short(self):
        """Text shorter than the limit is returned unchanged as the title."""
        note = "Short note"
        assert auto_title(note) == "Short note"

    def test_auto_title_long(self):
        """Over-long text is shortened and terminated with an ellipsis."""
        long_text = (
            "This is a very long note that exceeds the maximum title length "
            "and should be truncated at a word boundary"
        )
        title = auto_title(long_text, max_len=50)
        # NOTE(review): 50 characters plus "..." is 53, so this bound leaves
        # one character of slack -- confirm against auto_title before tightening.
        assert len(title) <= 54
        assert title.endswith("...")

    def test_auto_title_multiline(self):
        """Only the first line of a multi-line note is used as the title."""
        note = "\n".join(["First line", "Second line", "Third line"])
        assert auto_title(note) == "First line"

    def test_auto_title_no_space(self):
        """Text with no spaces at all still ends with an ellipsis when too long."""
        unbroken = "a" * 100
        title = auto_title(unbroken, max_len=80)
        assert title.endswith("...")
|
||||
Reference in New Issue
Block a user