# kb/tests/test_ingest_markdown.py

"""Tests for markdown header-based splitting."""

from kb_search.ingest.markdown import (
    _fixed_chunk,
    _has_headers,
    _merge_small_sections,
    _split_at_headers,
    chunk_markdown,
)


def make_cfg(**overrides):
    """Return a config dict for markdown chunking.

    The ``chunking.markdown`` section starts from the defaults below; any
    keyword argument replaces the default of the same name or is added as a
    new key. A fresh dict is built on every call.
    """
    defaults = {"strategy": "header", "min_tokens": 50, "max_tokens": 1024}
    markdown_settings = {**defaults, **overrides}
    return {"chunking": {"markdown": markdown_settings}}


class TestHasHeaders:
    """Header detection via ``_has_headers``."""

    def test_with_headers(self):
        """Text that opens with an H2 line is reported as having headers."""
        doc = "## Title\nContent"
        assert _has_headers(doc)

    def test_without_headers(self):
        """Text containing no header lines is reported as header-less."""
        doc = "Just plain text\nNo headers here"
        assert not _has_headers(doc)

    def test_h3(self):
        """An H3 line is recognised as a header as well."""
        doc = "### Subsection\nStuff"
        assert _has_headers(doc)


class TestSplitAtHeaders:
    """Section boundaries and header chains produced by ``_split_at_headers``."""

    def test_basic_split(self):
        """Two sibling H2 headers give two sections, each with its own chain."""
        result = _split_at_headers("## Section 1\nContent one\n\n## Section 2\nContent two")
        assert len(result) == 2
        first = result[0]
        assert first["header_chain"] == ["Section 1"]
        assert "Content one" in first["content"]
        second = result[1]
        assert second["header_chain"] == ["Section 2"]

    def test_nested_headers(self):
        """An H3 under an H2 carries the parent title in its chain."""
        result = _split_at_headers("## Config\nIntro\n\n### Advanced Options\nDetails")
        assert len(result) == 2
        # The nested section's chain lists the parent first, then itself.
        nested = result[1]
        assert nested["header_chain"] == ["Config", "Advanced Options"]

    def test_leading_content(self):
        """Text before the first header becomes a section with an empty chain."""
        result = _split_at_headers("Preamble text\n\n## First Section\nContent")
        assert len(result) == 2
        preamble = result[0]
        assert preamble["header_chain"] == []
        assert "Preamble" in preamble["content"]

    def test_header_level_reset(self):
        """A new H2 drops the previous H2/H3 titles from the chain."""
        result = _split_at_headers("## A\n\n### B\n\n## C\n\n### D")
        after_reset = result[2]
        assert after_reset["header_chain"] == ["C"]
        child_after_reset = result[3]
        assert child_after_reset["header_chain"] == ["C", "D"]


class TestMergeSmallSections:
    """Merging behaviour of ``_merge_small_sections`` around ``min_tokens``."""

    def test_merge_tiny_into_next(self):
        """A one-word section is folded together with the longer one after it."""
        tiny = {"header_chain": ["A"], "content": "tiny"}
        long_body = "This is a much longer section with plenty of words " * 5
        roomy = {"header_chain": ["B"], "content": long_body}
        result = _merge_small_sections([tiny, roomy], min_tokens=10)
        assert len(result) == 1
        # The tiny section's text must survive inside the merged section.
        assert "tiny" in result[0]["content"]

    def test_no_merge_when_large_enough(self):
        """Two 100-word sections stay separate with ``min_tokens=10``."""
        inputs = [
            {"header_chain": [title], "content": "word " * 100}
            for title in ("A", "B")
        ]
        result = _merge_small_sections(inputs, min_tokens=10)
        assert len(result) == 2


class TestChunkMarkdown:
    """End-to-end checks for ``chunk_markdown`` using configs from ``make_cfg``."""

    def test_header_strategy(self):
        """Two header-delimited sections yield at least two indexed chunks."""
        # Repeat only the body sentences. ``*`` binds tighter than ``+``/``+=``,
        # so multiplying the whole literal would repeat the header too: that
        # glues "## Intro" onto the end of body lines and emits five separate
        # "## Details" sections, letting the length check pass without Intro.
        text = "## Intro\n" + "Some intro text with enough words to avoid merging. " * 5
        text += "\n\n## Details\n" + "Detailed content follows here with sufficient length. " * 5
        cfg = make_cfg(min_tokens=5)
        chunks = chunk_markdown(text, cfg)
        assert len(chunks) >= 2
        # Verify chunk_index is the chunk's zero-based position in the result.
        for i, c in enumerate(chunks):
            assert c["chunk_index"] == i

    def test_hierarchy_context(self):
        """A nested section's chunk text includes the "Parent > Child" path."""
        text = "## Config\nIntro\n\n### Advanced\n" + "Details " * 60
        cfg = make_cfg(min_tokens=5)
        chunks = chunk_markdown(text, cfg)
        # Find the chunk(s) belonging to the Advanced subsection.
        advanced = [c for c in chunks if "Advanced" in c["text"]]
        assert len(advanced) > 0
        assert "Config > Advanced" in advanced[0]["text"]

    def test_plain_text_fallback(self):
        """Header-less text still produces at least one chunk."""
        text = "No headers here, just plain text. " * 200
        cfg = make_cfg()
        chunks = chunk_markdown(text, cfg)
        assert len(chunks) >= 1

    def test_empty_text(self):
        """Empty input produces no chunks."""
        chunks = chunk_markdown("", make_cfg())
        assert len(chunks) == 0


class TestFixedChunk:
    """Chunk counts returned by ``_fixed_chunk`` for a few input sizes."""

    def test_basic(self):
        """200 words with ``max_tokens=50`` are split into more than one chunk."""
        settings = {"max_tokens": 50, "overlap_tokens": 10}
        pieces = _fixed_chunk("word " * 200, settings)
        assert len(pieces) > 1

    def test_empty(self):
        """Empty text with an empty settings dict yields no chunks."""
        pieces = _fixed_chunk("", {})
        assert len(pieces) == 0

    def test_short_text(self):
        """Two words under a 512-token limit stay in a single chunk."""
        settings = {"max_tokens": 512}
        pieces = _fixed_chunk("hello world", settings)
        assert len(pieces) == 1