34 lines
1.1 KiB
Python
34 lines
1.1 KiB
Python
"""Tests for Docling ingestion (fixed-size chunking logic, mocked Docling)."""
|
|
|
|
from kb_search.ingest.docling import _fixed_chunk_text
|
|
|
|
|
|
class TestFixedChunkText:
    """Behavioural checks for ``_fixed_chunk_text``.

    Every test hands the function a piece of raw text plus an options dict
    and inspects the returned sequence of chunk dicts (the ``"text"`` and
    ``"chunk_index"`` keys are the only fields examined here).
    """

    def test_short_text_single_chunk(self):
        """A short string with default options yields exactly one chunk."""
        result = _fixed_chunk_text("Hello world", {})

        assert len(result) == 1
        only_chunk = result[0]
        assert only_chunk["text"] == "Hello world"
        assert only_chunk["chunk_index"] == 0

    def test_long_text_multiple_chunks(self):
        """Long input is split into several chunks numbered 0, 1, 2, ..."""
        long_text = "word " * 2000  # 10,000 characters
        result = _fixed_chunk_text(
            long_text, {"max_tokens": 512, "overlap_tokens": 50}
        )

        assert len(result) > 1
        # NOTE(review): an overlap is requested via ``overlap_tokens`` but the
        # overlap itself is not verified here; only the numbering is checked.
        indices = [chunk["chunk_index"] for chunk in result]
        assert indices == list(range(len(result)))

    def test_empty_text(self):
        """An empty string produces no chunks."""
        result = _fixed_chunk_text("", {})

        assert len(result) == 0

    def test_whitespace_only(self):
        """Text made up solely of spaces and newlines produces no chunks."""
        result = _fixed_chunk_text(" \n\n ", {})

        assert len(result) == 0

    def test_custom_max_tokens(self):
        """A small ``max_tokens`` forces a 1,000-character input to be split."""
        short_words = "a " * 500  # 1,000 characters in total
        result = _fixed_chunk_text(short_words, {"max_tokens": 100})

        # Original sizing note: 100 tokens * 4 chars = 400-char window.
        # Presumably the chunker budgets ~4 characters per token -- TODO
        # confirm against the implementation.
        assert len(result) > 1
|