"""Tests for markdown header-based splitting.""" from kb_search.ingest.markdown import ( _fixed_chunk, _has_headers, _merge_small_sections, _split_at_headers, chunk_markdown, ) def make_cfg(**overrides): cfg = {"chunking": {"markdown": {"strategy": "header", "min_tokens": 50, "max_tokens": 1024}}} cfg["chunking"]["markdown"].update(overrides) return cfg class TestHasHeaders: def test_with_headers(self): assert _has_headers("## Title\nContent") def test_without_headers(self): assert not _has_headers("Just plain text\nNo headers here") def test_h3(self): assert _has_headers("### Subsection\nStuff") class TestSplitAtHeaders: def test_basic_split(self): text = "## Section 1\nContent one\n\n## Section 2\nContent two" sections = _split_at_headers(text) assert len(sections) == 2 assert sections[0]["header_chain"] == ["Section 1"] assert "Content one" in sections[0]["content"] assert sections[1]["header_chain"] == ["Section 2"] def test_nested_headers(self): text = "## Config\nIntro\n\n### Advanced Options\nDetails" sections = _split_at_headers(text) assert len(sections) == 2 # The ### should have full chain assert sections[1]["header_chain"] == ["Config", "Advanced Options"] def test_leading_content(self): text = "Preamble text\n\n## First Section\nContent" sections = _split_at_headers(text) assert len(sections) == 2 assert sections[0]["header_chain"] == [] assert "Preamble" in sections[0]["content"] def test_header_level_reset(self): text = "## A\n\n### B\n\n## C\n\n### D" sections = _split_at_headers(text) assert sections[2]["header_chain"] == ["C"] assert sections[3]["header_chain"] == ["C", "D"] class TestMergeSmallSections: def test_merge_tiny_into_next(self): sections = [ {"header_chain": ["A"], "content": "tiny"}, {"header_chain": ["B"], "content": "This is a much longer section with plenty of words " * 5}, ] merged = _merge_small_sections(sections, min_tokens=10) assert len(merged) == 1 assert "tiny" in merged[0]["content"] def test_no_merge_when_large_enough(self): sections = [ {"header_chain": ["A"], "content": "word " * 100}, {"header_chain": ["B"], "content": "word " * 100}, ] merged = _merge_small_sections(sections, min_tokens=10) assert len(merged) == 2 class TestChunkMarkdown: def test_header_strategy(self): text = "## Intro\nSome intro text with enough words to avoid merging. " * 5 text += "\n\n## Details\nDetailed content follows here with sufficient length. " * 5 cfg = make_cfg(min_tokens=5) chunks = chunk_markdown(text, cfg) assert len(chunks) >= 2 # Verify chunk_index assigned for i, c in enumerate(chunks): assert c["chunk_index"] == i def test_hierarchy_context(self): text = "## Config\nIntro\n\n### Advanced\n" + "Details " * 60 cfg = make_cfg(min_tokens=5) chunks = chunk_markdown(text, cfg) # Find the Advanced chunk advanced = [c for c in chunks if "Advanced" in c["text"]] assert len(advanced) > 0 assert "Config > Advanced" in advanced[0]["text"] def test_plain_text_fallback(self): text = "No headers here, just plain text. " * 200 cfg = make_cfg() chunks = chunk_markdown(text, cfg) assert len(chunks) >= 1 def test_empty_text(self): chunks = chunk_markdown("", make_cfg()) assert len(chunks) == 0 class TestFixedChunk: def test_basic(self): text = "word " * 200 chunks = _fixed_chunk(text, {"max_tokens": 50, "overlap_tokens": 10}) assert len(chunks) > 1 def test_empty(self): chunks = _fixed_chunk("", {}) assert len(chunks) == 0 def test_short_text(self): chunks = _fixed_chunk("hello world", {"max_tokens": 512}) assert len(chunks) == 1