Files
kb/tests/test_ingest_markdown.py
T
2026-03-23 20:38:42 +00:00

122 lines
4.1 KiB
Python

"""Tests for markdown header-based splitting."""
from kb_search.ingest.markdown import (
_fixed_chunk,
_has_headers,
_merge_small_sections,
_split_at_headers,
chunk_markdown,
)
def make_cfg(**overrides):
    """Return a fresh chunking config for the markdown tests.

    The markdown settings default to the ``header`` strategy with
    ``min_tokens=50`` and ``max_tokens=1024``; any keyword argument replaces
    or adds a key inside ``cfg["chunking"]["markdown"]``.
    """
    markdown_settings = {
        "strategy": "header",
        "min_tokens": 50,
        "max_tokens": 1024,
        **overrides,
    }
    return {"chunking": {"markdown": markdown_settings}}
class TestHasHeaders:
    """Truthiness checks for ``_has_headers`` on small markdown snippets."""

    def test_with_headers(self):
        """Text that opens with an H2 line is reported as having headers."""
        doc = "## Title\nContent"
        assert _has_headers(doc)

    def test_without_headers(self):
        """Two lines of plain prose are reported as having no headers."""
        doc = "Just plain text\nNo headers here"
        assert not _has_headers(doc)

    def test_h3(self):
        """An H3 line counts as a header as well."""
        doc = "### Subsection\nStuff"
        assert _has_headers(doc)
class TestSplitAtHeaders:
    """Section-boundary and header-chain expectations for ``_split_at_headers``."""

    def test_basic_split(self):
        """Two sibling H2 sections yield two entries with one-item chains."""
        parts = _split_at_headers(
            "## Section 1\nContent one\n\n## Section 2\nContent two"
        )
        assert len(parts) == 2
        first = parts[0]
        assert first["header_chain"] == ["Section 1"]
        assert "Content one" in first["content"]
        second = parts[1]
        assert second["header_chain"] == ["Section 2"]

    def test_nested_headers(self):
        """An H3 under an H2 carries the parent title in its chain."""
        parts = _split_at_headers(
            "## Config\nIntro\n\n### Advanced Options\nDetails"
        )
        assert len(parts) == 2
        # The nested ### entry is expected to list its ## parent first.
        nested = parts[1]
        assert nested["header_chain"] == ["Config", "Advanced Options"]

    def test_leading_content(self):
        """Text before the first header becomes a section with an empty chain."""
        parts = _split_at_headers("Preamble text\n\n## First Section\nContent")
        assert len(parts) == 2
        preamble = parts[0]
        assert preamble["header_chain"] == []
        assert "Preamble" in preamble["content"]

    def test_header_level_reset(self):
        """A new H2 drops the previous H2/H3 titles from later chains."""
        parts = _split_at_headers("## A\n\n### B\n\n## C\n\n### D")
        third = parts[2]
        assert third["header_chain"] == ["C"]
        fourth = parts[3]
        assert fourth["header_chain"] == ["C", "D"]
class TestMergeSmallSections:
    """Checks ``_merge_small_sections`` with a ``min_tokens`` threshold of 10."""

    def test_merge_tiny_into_next(self):
        """A one-word section next to a long one collapses into a single entry."""
        tiny = {"header_chain": ["A"], "content": "tiny"}
        long_body = "This is a much longer section with plenty of words " * 5
        roomy = {"header_chain": ["B"], "content": long_body}

        result = _merge_small_sections([tiny, roomy], min_tokens=10)

        assert len(result) == 1
        # The tiny section's text must survive inside the merged entry.
        assert "tiny" in result[0]["content"]

    def test_no_merge_when_large_enough(self):
        """Two 100-word sections are both kept as separate entries."""
        big_sections = [
            {"header_chain": [title], "content": "word " * 100}
            for title in ("A", "B")
        ]

        result = _merge_small_sections(big_sections, min_tokens=10)

        assert len(result) == 2
class TestChunkMarkdown:
    """End-to-end checks for ``chunk_markdown`` using configs from ``make_cfg``."""

    def test_header_strategy(self):
        """Two H2 sections with long bodies yield at least two indexed chunks."""
        # Repeat only the body text. Multiplying the whole literal (header
        # included) would duplicate the header lines, because ``*`` binds
        # tighter than ``+`` / ``+=``.
        intro_body = "Some intro text with enough words to avoid merging. " * 5
        details_body = "Detailed content follows here with sufficient length. " * 5
        text = "## Intro\n" + intro_body
        text += "\n\n## Details\n" + details_body
        cfg = make_cfg(min_tokens=5)
        chunks = chunk_markdown(text, cfg)
        assert len(chunks) >= 2
        # Every chunk's chunk_index must equal its position in the result.
        for i, c in enumerate(chunks):
            assert c["chunk_index"] == i

    def test_hierarchy_context(self):
        """A chunk under a nested header includes the "Config > Advanced" path."""
        text = "## Config\nIntro\n\n### Advanced\n" + "Details " * 60
        cfg = make_cfg(min_tokens=5)
        chunks = chunk_markdown(text, cfg)
        # Pick out the chunk(s) that mention the nested section.
        advanced = [c for c in chunks if "Advanced" in c["text"]]
        assert len(advanced) > 0
        assert "Config > Advanced" in advanced[0]["text"]

    def test_plain_text_fallback(self):
        """Header-less text still produces at least one chunk."""
        text = "No headers here, just plain text. " * 200
        cfg = make_cfg()
        chunks = chunk_markdown(text, cfg)
        assert len(chunks) >= 1

    def test_empty_text(self):
        """An empty string produces no chunks."""
        chunks = chunk_markdown("", make_cfg())
        assert len(chunks) == 0
class TestFixedChunk:
    """Chunk-count checks for the ``_fixed_chunk`` splitter."""

    def test_basic(self):
        """200 words with ``max_tokens=50`` are split into more than one chunk."""
        options = {"max_tokens": 50, "overlap_tokens": 10}
        pieces = _fixed_chunk("word " * 200, options)
        assert len(pieces) > 1

    def test_empty(self):
        """An empty string with an empty options dict gives no chunks."""
        pieces = _fixed_chunk("", {})
        assert len(pieces) == 0

    def test_short_text(self):
        """Two words with ``max_tokens=512`` fit in exactly one chunk."""
        options = {"max_tokens": 512}
        pieces = _fixed_chunk("hello world", options)
        assert len(pieces) == 1