Files
kb/tests/test_ingest_docling.py
T
2026-03-23 20:38:42 +00:00

34 lines
1.1 KiB
Python

"""Tests for Docling ingestion (fixed-size chunking logic, mocked Docling)."""
from kb_search.ingest.docling import _fixed_chunk_text
class TestFixedChunkText:
    """Checks on ``_fixed_chunk_text`` for short, long, empty and custom-sized input."""

    def test_short_text_single_chunk(self):
        """A short string comes back as one chunk carrying the text and index 0."""
        result = _fixed_chunk_text("Hello world", {})
        assert len(result) == 1
        only_chunk = result[0]
        assert only_chunk["text"] == "Hello world"
        assert only_chunk["chunk_index"] == 0

    def test_long_text_multiple_chunks(self):
        """A 10000-character string is split into several sequentially indexed chunks."""
        options = {"max_tokens": 512, "overlap_tokens": 50}
        long_text = "word " * 2000  # 5 chars * 2000 = 10000 chars
        result = _fixed_chunk_text(long_text, options)
        assert len(result) > 1
        # Only the numbering is verified here (0, 1, 2, ... in list order);
        # the overlap between neighbouring chunks is not asserted.
        indices = [chunk["chunk_index"] for chunk in result]
        assert indices == list(range(len(result)))

    def test_empty_text(self):
        """An empty string produces no chunks."""
        result = _fixed_chunk_text("", {})
        assert len(result) == 0

    def test_whitespace_only(self):
        """A string of only spaces and newlines produces no chunks."""
        result = _fixed_chunk_text(" \n\n ", {})
        assert len(result) == 0

    def test_custom_max_tokens(self):
        """A 1000-character string with ``max_tokens=100`` needs more than one chunk."""
        options = {"max_tokens": 100}
        sample = "a " * 500  # 2 chars * 500 = 1000 chars
        result = _fixed_chunk_text(sample, options)
        # Original author's note: 100 tokens * 4 chars = 400-char window, so
        # 1000 chars cannot fit in one chunk. NOTE(review): the 4 chars/token
        # ratio is an assumption about the implementation -- confirm there.
        assert len(result) > 1