122 lines
4.1 KiB
Python
122 lines
4.1 KiB
Python
"""Tests for markdown header-based splitting."""
|
|
|
|
from kb_search.ingest.markdown import (
|
|
_fixed_chunk,
|
|
_has_headers,
|
|
_merge_small_sections,
|
|
_split_at_headers,
|
|
chunk_markdown,
|
|
)
|
|
|
|
|
|
def make_cfg(**overrides):
    """Build a chunking config dict for the markdown tests.

    The markdown section starts from the defaults ``strategy="header"``,
    ``min_tokens=50`` and ``max_tokens=1024``. Keyword arguments are layered
    on top, replacing a default of the same name or adding a new key.

    Returns a fresh ``{"chunking": {"markdown": {...}}}`` dict on every call.
    """
    markdown_cfg = {
        "strategy": "header",
        "min_tokens": 50,
        "max_tokens": 1024,
        **overrides,
    }
    return {"chunking": {"markdown": markdown_cfg}}
|
|
|
|
|
|
class TestHasHeaders:
    """Checks ``_has_headers`` against text with and without header lines."""

    def test_with_headers(self):
        """Text that opens with a ``##`` line is reported as having headers."""
        doc = "## Title\nContent"
        assert _has_headers(doc)

    def test_without_headers(self):
        """Plain multi-line text is reported as having no headers."""
        doc = "Just plain text\nNo headers here"
        assert not _has_headers(doc)

    def test_h3(self):
        """Text that opens with a ``###`` line is reported as having headers."""
        doc = "### Subsection\nStuff"
        assert _has_headers(doc)
|
|
|
|
|
|
class TestSplitAtHeaders:
    """Checks the sections returned by ``_split_at_headers``.

    Each section is read through its ``"header_chain"`` and ``"content"`` keys.
    """

    def test_basic_split(self):
        """Two ``##`` headers give two sections, each with a one-item chain."""
        parts = _split_at_headers(
            "## Section 1\nContent one\n\n## Section 2\nContent two"
        )
        assert len(parts) == 2
        head, tail = parts[0], parts[1]
        assert head["header_chain"] == ["Section 1"]
        assert "Content one" in head["content"]
        assert tail["header_chain"] == ["Section 2"]

    def test_nested_headers(self):
        """A ``###`` under a ``##`` carries both titles in its chain."""
        parts = _split_at_headers("## Config\nIntro\n\n### Advanced Options\nDetails")
        assert len(parts) == 2
        # The nested header is expected to report its parent first.
        nested = parts[1]
        assert nested["header_chain"] == ["Config", "Advanced Options"]

    def test_leading_content(self):
        """Text before the first header becomes a section with an empty chain."""
        parts = _split_at_headers("Preamble text\n\n## First Section\nContent")
        assert len(parts) == 2
        preamble = parts[0]
        assert preamble["header_chain"] == []
        assert "Preamble" in preamble["content"]

    def test_header_level_reset(self):
        """A new ``##`` drops the previous ``##``/``###`` titles from the chain."""
        parts = _split_at_headers("## A\n\n### B\n\n## C\n\n### D")
        expected_chains = ((2, ["C"]), (3, ["C", "D"]))
        for position, chain in expected_chains:
            assert parts[position]["header_chain"] == chain
|
|
|
|
|
|
class TestMergeSmallSections:
    """Checks ``_merge_small_sections`` with a ``min_tokens`` threshold of 10."""

    def test_merge_tiny_into_next(self):
        """A one-word section followed by a long one collapses to one section."""
        long_body = "This is a much longer section with plenty of words " * 5
        tiny = {"header_chain": ["A"], "content": "tiny"}
        big = {"header_chain": ["B"], "content": long_body}

        result = _merge_small_sections([tiny, big], min_tokens=10)

        assert len(result) == 1
        # The short section's text must survive inside the merged section.
        assert "tiny" in result[0]["content"]

    def test_no_merge_when_large_enough(self):
        """Two 100-word sections are both kept."""
        big_sections = [
            {"header_chain": [title], "content": "word " * 100}
            for title in ("A", "B")
        ]

        result = _merge_small_sections(big_sections, min_tokens=10)

        assert len(result) == 2
|
|
|
|
|
|
class TestChunkMarkdown:
    """End-to-end checks for ``chunk_markdown`` using configs from ``make_cfg``.

    Each chunk is read through its ``"text"`` and ``"chunk_index"`` keys.
    """

    def test_header_strategy(self):
        """Two headed sections give at least two sequentially indexed chunks."""
        # Repeat only the body sentences. Multiplying the whole literal would
        # also repeat the "## ..." header text, so the fixture would no longer
        # be one Intro section followed by one Details section.
        intro = (
            "## Intro\n"
            + "Some intro text with enough words to avoid merging. " * 5
        )
        details = (
            "\n\n## Details\n"
            + "Detailed content follows here with sufficient length. " * 5
        )
        text = intro + details
        cfg = make_cfg(min_tokens=5)

        chunks = chunk_markdown(text, cfg)

        assert len(chunks) >= 2
        # Verify chunk_index is assigned in output order, starting at 0.
        for i, c in enumerate(chunks):
            assert c["chunk_index"] == i

    def test_hierarchy_context(self):
        """A chunk from a nested section carries the ``Config > Advanced`` path."""
        text = "## Config\nIntro\n\n### Advanced\n" + "Details " * 60
        cfg = make_cfg(min_tokens=5)

        chunks = chunk_markdown(text, cfg)

        # Find the chunk(s) that mention the nested section.
        advanced = [c for c in chunks if "Advanced" in c["text"]]
        assert len(advanced) > 0
        assert "Config > Advanced" in advanced[0]["text"]

    def test_plain_text_fallback(self):
        """Header-less text still yields at least one chunk."""
        text = "No headers here, just plain text. " * 200
        cfg = make_cfg()

        chunks = chunk_markdown(text, cfg)

        assert len(chunks) >= 1

    def test_empty_text(self):
        """An empty document yields no chunks."""
        chunks = chunk_markdown("", make_cfg())
        assert len(chunks) == 0
|
|
|
|
|
|
class TestFixedChunk:
    """Checks how many chunks ``_fixed_chunk`` returns for various inputs."""

    def test_basic(self):
        """200 words with ``max_tokens=50`` split into more than one chunk."""
        options = {"max_tokens": 50, "overlap_tokens": 10}
        pieces = _fixed_chunk("word " * 200, options)
        assert len(pieces) > 1

    def test_empty(self):
        """Empty text with an empty options dict yields no chunks."""
        pieces = _fixed_chunk("", {})
        assert len(pieces) == 0

    def test_short_text(self):
        """Two words with ``max_tokens=512`` stay in a single chunk."""
        options = {"max_tokens": 512}
        pieces = _fixed_chunk("hello world", options)
        assert len(pieces) == 1
|