Chunk enrichment: prepend document title to embeddings

Adds enriched_text column to chunks table that prepends document title (and section header when present) to chunk text. Embeddings and FTS now use enriched text for better search relevance. Includes schema migration with backfill for existing data. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-03-29 21:03:48 +01:00
parent 5f9946efc9
commit b2176c36ea
10 changed files with 278 additions and 21 deletions
@@ -10,6 +10,60 @@ import struct
 from typing import Any, Optional
 def build_enriched_text(title: str, chunk_text: str, metadata: dict | None = None) -> str:
    """Build enriched text by prepending document title and optional section header.
    Format: "{title} > {section_header}\\n\\n{chunk_text}" or "{title}\\n\\n{chunk_text}".
    """
    section_header = (metadata or {}).get("section_header")
    if section_header:
        return f"{title} > {section_header}\n\n{chunk_text}"
    return f"{title}\n\n{chunk_text}"
 def _backfill_enriched_text(conn: sqlite3.Connection) -> None:
    """Backfill ``chunks.enriched_text`` for every chunk that has a document.

    Each chunk is joined to its document to obtain the title, the chunk's
    stored metadata is decoded to look for a section header, and the result of
    ``build_enriched_text`` is written back to the row.

    Args:
        conn: Open connection whose rows support access by column name
            (``row["id"]``), i.e. one configured with ``sqlite3.Row``.

    The function does not commit; the caller owns the transaction.
    """
    rows = conn.execute(
        "SELECT c.id, c.text, c.metadata, d.title "
        "FROM chunks c JOIN documents d ON c.document_id = d.id"
    ).fetchall()

    updates = []
    for row in rows:
        metadata = None
        if row["metadata"]:
            # One malformed metadata value must not abort the whole
            # migration: treat it as "no section header" and carry on.
            try:
                parsed = json.loads(row["metadata"])
            except (TypeError, ValueError):
                parsed = None
            # Valid JSON can still be a list/str/number; only an object can
            # hold a "section_header" key.
            if isinstance(parsed, dict):
                metadata = parsed
        # NULL title/text would otherwise be formatted as the word "None".
        enriched = build_enriched_text(row["title"] or "", row["text"] or "", metadata)
        updates.append((enriched, row["id"]))

    # A single batched statement instead of one execute() call per row.
    conn.executemany("UPDATE chunks SET enriched_text = ? WHERE id = ?", updates)
 def _rebuild_fts(conn: sqlite3.Connection) -> None:
    """Drop and recreate chunks_fts to index enriched_text, with updated triggers.

    Steps, in order:
      1. Drop the three sync triggers (chunks_ai, chunks_ad, chunks_au) and the
         chunks_fts table if they exist.
      2. Recreate chunks_fts as an FTS5 table with a single column named
         ``text``, declared with ``content=chunks`` and ``content_rowid=id``.
      3. Recreate the insert/delete/update triggers so that the value written
         to (or removed from) the FTS ``text`` column is ``enriched_text``.
      4. Re-insert one FTS row per existing chunk from ``chunks.enriched_text``.

    Args:
        conn: Open SQLite connection to the database holding ``chunks``.

    This function does not call ``commit()`` after the final INSERT.

    NOTE(review): ``executescript`` may implicitly commit a pending
    transaction before running the script, depending on the connection's
    transaction-control mode -- confirm against the sqlite3 docs for the
    Python version in use.
    NOTE(review): the FTS column is still named ``text`` while the indexed
    value is ``enriched_text``; FTS5's built-in 'rebuild' command presumably
    re-reads the content table's ``text`` column instead -- verify before
    relying on it for this table.
    """
    # The 'delete' rows in chunks_ad / chunks_au pass old.enriched_text, i.e.
    # the same column the insert triggers feed into the index.
    conn.executescript("""
        DROP TRIGGER IF EXISTS chunks_ai;
        DROP TRIGGER IF EXISTS chunks_ad;
        DROP TRIGGER IF EXISTS chunks_au;
        DROP TABLE IF EXISTS chunks_fts;
        CREATE VIRTUAL TABLE chunks_fts USING fts5(
            text,
            content=chunks,
            content_rowid=id
        );
        CREATE TRIGGER chunks_ai AFTER INSERT ON chunks BEGIN
            INSERT INTO chunks_fts(rowid, text) VALUES (new.id, new.enriched_text);
        END;
        CREATE TRIGGER chunks_ad AFTER DELETE ON chunks BEGIN
            INSERT INTO chunks_fts(chunks_fts, rowid, text) VALUES ('delete', old.id, old.enriched_text);
        END;
        CREATE TRIGGER chunks_au AFTER UPDATE ON chunks BEGIN
            INSERT INTO chunks_fts(chunks_fts, rowid, text) VALUES ('delete', old.id, old.enriched_text);
            INSERT INTO chunks_fts(rowid, text) VALUES (new.id, new.enriched_text);
        END;
    """)
    # The freshly created index is empty and the triggers only see future
    # changes, so existing chunks are indexed here from their enriched_text.
    conn.execute("INSERT INTO chunks_fts(rowid, text) SELECT id, enriched_text FROM chunks")
 def get_connection(db_path: str) -> sqlite3.Connection:
    """Return a sqlite3 connection with WAL mode, Row factory, and foreign keys enabled."""
    import sqlite_vec
@@ -44,6 +98,7 @@ def init_schema(conn: sqlite3.Connection, embedding_dim: int) -> None:
            document_id INTEGER REFERENCES documents(id) ON DELETE CASCADE,
            chunk_index INTEGER,
            text TEXT,
            enriched_text TEXT,
            token_count INTEGER,
            metadata TEXT DEFAULT '{{}}',
            UNIQUE(document_id, chunk_index)
@@ -55,18 +110,18 @@ def init_schema(conn: sqlite3.Connection, embedding_dim: int) -> None:
            content_rowid=id
        );
-        -- Triggers to keep FTS index in sync with chunks table
+        -- Triggers to keep FTS index in sync with chunks table (using enriched_text)
        CREATE TRIGGER IF NOT EXISTS chunks_ai AFTER INSERT ON chunks BEGIN
-            INSERT INTO chunks_fts(rowid, text) VALUES (new.id, new.text);
+            INSERT INTO chunks_fts(rowid, text) VALUES (new.id, new.enriched_text);
        END;
        CREATE TRIGGER IF NOT EXISTS chunks_ad AFTER DELETE ON chunks BEGIN
-            INSERT INTO chunks_fts(chunks_fts, rowid, text) VALUES ('delete', old.id, old.text);
+            INSERT INTO chunks_fts(chunks_fts, rowid, text) VALUES ('delete', old.id, old.enriched_text);
        END;
        CREATE TRIGGER IF NOT EXISTS chunks_au AFTER UPDATE ON chunks BEGIN
-            INSERT INTO chunks_fts(chunks_fts, rowid, text) VALUES ('delete', old.id, old.text);
+            INSERT INTO chunks_fts(chunks_fts, rowid, text) VALUES ('delete', old.id, old.enriched_text);
-            INSERT INTO chunks_fts(rowid, text) VALUES (new.id, new.text);
+            INSERT INTO chunks_fts(rowid, text) VALUES (new.id, new.enriched_text);
        END;
        CREATE TABLE IF NOT EXISTS tags (
@@ -123,6 +178,13 @@ def init_schema(conn: sqlite3.Connection, embedding_dim: int) -> None:
    if "original_filename" not in doc_cols:
        conn.execute("ALTER TABLE documents ADD COLUMN original_filename TEXT")
    # Migrate: add enriched_text to chunks and rebuild FTS to index it
    chunk_cols = {row[1] for row in conn.execute("PRAGMA table_info(chunks)").fetchall()}
    if "enriched_text" not in chunk_cols:
        conn.execute("ALTER TABLE chunks ADD COLUMN enriched_text TEXT")
        _backfill_enriched_text(conn)
        _rebuild_fts(conn)
    conn.commit()
@@ -205,6 +267,7 @@ def insert_chunk(
    document_id: int,
    chunk_index: int,
    text: str,
    enriched_text: str | None = None,
    token_count: Optional[int] = None,
    metadata: Any = None,
 ) -> int:
@@ -217,8 +280,8 @@ def insert_chunk(
        metadata_str = str(metadata)
    cur = conn.execute(
-        "INSERT INTO chunks(document_id, chunk_index, text, token_count, metadata) VALUES (?, ?, ?, ?, ?)",
+        "INSERT INTO chunks(document_id, chunk_index, text, enriched_text, token_count, metadata) VALUES (?, ?, ?, ?, ?, ?)",
-        (document_id, chunk_index, text, token_count, metadata_str),
+        (document_id, chunk_index, text, enriched_text or text, token_count, metadata_str),
    )
    conn.commit()
    return cur.lastrowid
@@ -19,10 +19,10 @@ async def reindex():
    conn = get_connection(cfg.db_path)
    try:
-        # Fetch all chunks
+        # Fetch all chunks — use enriched_text for embedding (includes title context)
-        rows = conn.execute("SELECT id, text FROM chunks ORDER BY id").fetchall()
+        rows = conn.execute("SELECT id, enriched_text FROM chunks ORDER BY id").fetchall()
        chunk_ids = [row["id"] for row in rows]
-        chunk_texts = [row["text"] for row in rows]
+        chunk_texts = [row["enriched_text"] or "" for row in rows]
        logger.info("Reindexing %d chunks with model '%s'", len(chunk_ids), cfg.model)
@@ -8,6 +8,7 @@ import shutil
 from pathlib import Path
 from kb import config, database, embeddings, staging
 from kb.database import build_enriched_text
 from kb.ingest import detector
 logger = logging.getLogger("kb.worker")
@@ -146,20 +147,30 @@ def _process_job(job_row) -> tuple[str, int | None, int]:
        )
        chunk_texts = [c if isinstance(c, str) else c["text"] for c in chunks]
-        vectors = embeddings.embed_texts(chunk_texts)
+        chunk_metas = []
        for idx, c in enumerate(chunks):
            if isinstance(c, str):
                chunk_metas.append(None)
            else:
                meta = {k: v for k, v in c.items() if k != "text"} or None
                chunk_metas.append(meta)
-        for idx, (chunk_text, vector) in enumerate(zip(chunk_texts, vectors)):
+        enriched_texts = [
-            metadata = None
+            build_enriched_text(title, ct, cm)
-            if not isinstance(chunks[idx], str):
+            for ct, cm in zip(chunk_texts, chunk_metas)
-                metadata = {
+        ]
-                    k: v for k, v in chunks[idx].items() if k != "text"
+        vectors = embeddings.embed_texts(enriched_texts)
-                } or None
+
        for idx, (chunk_text, enriched, vector) in enumerate(
            zip(chunk_texts, enriched_texts, vectors)
        ):
            chunk_id = database.insert_chunk(
                conn,
                document_id=doc_id,
                chunk_index=idx,
                text=chunk_text,
-                metadata=metadata,
+                enriched_text=enriched,
                metadata=chunk_metas[idx],
            )
            database.insert_embedding(conn, chunk_id, vector)
@@ -0,0 +1,2 @@
 schema: spec-driven
 created: 2026-03-29
@@ -0,0 +1,29 @@
 ## Context
 The root cobra command in `client/cmd/root.go` uses `cobra.ArbitraryArgs` and its `RunE` handler to catch any arguments not matching a subcommand. Currently, any non-empty args are joined and submitted as a note. This means a single mistyped word (e.g., `kb infow` instead of `kb info`) silently creates a junk note in the knowledge base.
 ## Goals / Non-Goals
 **Goals:**
 - Prevent single bare words from being silently ingested as notes
 - Provide a clear error message that helps the user correct their input
 - Preserve the multi-word implicit note shorthand (`kb remember to update dns`)
 **Non-Goals:**
 - Detecting "close matches" to real commands (fuzzy matching / did-you-mean)
 - Changing how quoted strings work at the shell level (we can't detect quotes after shell expansion)
 ## Decisions
 ### Guard on argument count in RunE
 When `len(args) == 1`, reject with an error message instead of submitting as a note. When `len(args) > 1`, continue treating as implicit note shorthand.
 **Rationale**: This is the simplest reliable heuristic. The shell strips quotes before cobra sees args, so we cannot distinguish `kb "singleword"` from `kb singleword`. However, single-word notes are rare in practice, and the error message tells the user how to work around it (use multiple words or the full note workflow). Multi-word input is almost certainly intentional note text, not a mistyped command.
 **Alternative considered**: Checking against a list of known subcommand names — rejected because it wouldn't catch typos of commands we don't know about and adds maintenance burden.
 ## Risks / Trade-offs
 - **Single-word notes no longer work via shorthand** → Users must use `kb add --note "singleword"` or include additional words. This is an acceptable trade-off since single-word notes are uncommon and the error message is clear.
 - **Shell quote stripping means we can't be perfect** → A quoted single word such as `kb "note"` is exactly one word after quote removal and will be rejected, because it is indistinguishable from the bare `kb note`. This is a known limitation but very rare in practice.
@@ -0,0 +1,24 @@
 ## Why
 A single unquoted word passed to `kb` (e.g., `kb infow`) is silently treated as a note and ingested. This is almost always a mistyped command, not an intentional note. Users lose trust when typos pollute their knowledge base.
 ## What Changes
 - The implicit note shorthand will require **more than one argument** to be treated as a note. A single bare word will be rejected with a helpful error suggesting the user check their command or quote a multi-word note.
 - This is a **BREAKING** change to the implicit note shorthand: `kb singleword` no longer creates a note. Users must write `kb "singleword is important"` or use multiple words.
 ## Capabilities
 ### New Capabilities
 _(none)_
 ### Modified Capabilities
 - `go-client`: The "Implicit note shorthand" requirement changes to reject single-word bare arguments and print an error instead of submitting them as notes.
 ## Impact
 - **Code**: `client/cmd/root.go` — `RunE` handler for the root command
 - **Tests**: `client/cmd/root_test.go` or equivalent — add/update tests for single-word rejection
 - **Users**: Anyone who intentionally used `kb singleword` as a note shorthand will need to use multiple words or quotes
@@ -0,0 +1,37 @@
 ## MODIFIED Requirements
 ### Requirement: Implicit note shorthand
 The client SHALL treat bare string arguments (with no subcommand) as an implicit note only when the input contains **more than one word** — either several unquoted arguments or a single quoted argument that contains whitespace. `kb "my note"` SHALL behave identically to submitting a note via `POST /api/v1/jobs`. All persistent flags (`--format`, `--engine`, `--api-key`) and the root `--tags` flag SHALL work with the shorthand form. A single bare word SHALL be rejected with an error message.
 #### Scenario: Quick note via bare argument
 - **WHEN** the user runs `kb "remember to update DNS"`
 - **THEN** the client SHALL submit the text as a note via `POST /api/v1/jobs` and print `Queued: note`
 #### Scenario: Bare argument with tags
 - **WHEN** the user runs `kb "server room is building 3" --tags ops`
 - **THEN** the client SHALL submit the note with the specified tags
 #### Scenario: Bare argument with JSON output
 - **WHEN** the user runs `kb "my note" --format json`
 - **THEN** the client SHALL output the raw JSON response from the engine
 #### Scenario: Bare argument duplicate detection
 - **WHEN** the user runs `kb "my note"` and the engine returns HTTP 409
 - **THEN** the client SHALL handle the duplicate response identically to the previous `kb add --note` behaviour
 #### Scenario: Multiple unquoted words
 - **WHEN** the user runs `kb remember to update dns` (without quotes)
 - **THEN** the client SHALL join all arguments into a single note string and submit it
 #### Scenario: Single bare word rejected
 - **WHEN** the user runs `kb infow` (a single unrecognized word)
 - **THEN** the client SHALL print to stderr: `Unknown command "infow". Run 'kb --help' for available commands.` followed by a hint about note usage, and exit with a non-zero code
 #### Scenario: No interference with subcommands
 - **WHEN** the user runs `kb search "query"` or any other existing subcommand
 - **THEN** the client SHALL route to the subcommand as before — the implicit note shorthand SHALL NOT interfere
 #### Scenario: No arguments
 - **WHEN** the user runs `kb` with no arguments
 - **THEN** the client SHALL display the help text
@@ -0,0 +1,10 @@
 ## 1. Core Implementation
 - [x] 1.1 Update `RunE` in `client/cmd/root.go` to reject single-word bare arguments with an error message and non-zero exit
 - [x] 1.2 Update usage template in `root.go` to reflect that note shorthand requires multiple words
 ## 2. Tests
 - [x] 2.1 Add test: single bare word prints error to stderr and exits non-zero
 - [x] 2.2 Add test: multiple bare words are submitted as a note (existing behavior preserved)
 - [x] 2.3 Add test: zero arguments shows help (existing behavior preserved)
@@ -0,0 +1,81 @@
 # Chunk Enrichment
 ## Purpose
 Chunk enrichment prepends document titles and section headers to chunk text before indexing and embedding, ensuring that document-level context participates in both full-text and semantic search.
 ## Requirements
 ### Requirement: Chunk text enrichment with document title
 The engine SHALL prepend the document title to each chunk's text before FTS indexing and vector embedding. The enriched text SHALL be stored in a dedicated `enriched_text` column on the `chunks` table. The original chunk text SHALL remain in the `text` column for display purposes.
 The enrichment format SHALL be:
 - Without section header: `"{title}\n\n{chunk_text}"`
 - With section header: `"{title} > {section_header}\n\n{chunk_text}"`
 Where `section_header` is the value from the chunk's metadata `section_header` field, when present.
 #### Scenario: Note ingestion with title enrichment
 - **WHEN** a note titled "Suitcase Locks" with content "Steve = 363" is ingested
 - **THEN** the `chunks.text` column SHALL contain "Steve = 363" and the `chunks.enriched_text` column SHALL contain "Suitcase Locks\n\nSteve = 363"
 #### Scenario: Markdown chunk with section header enrichment
 - **WHEN** a markdown document titled "DCG Lab Hardware" produces a chunk with section_header "GRIMDAWN > motherboard" and text "MSI X870 Tomahawk"
 - **THEN** the `chunks.enriched_text` SHALL contain "DCG Lab Hardware > GRIMDAWN > motherboard\n\nMSI X870 Tomahawk"
 #### Scenario: Chunk without section header
 - **WHEN** a document titled "Docker Tips" produces a chunk with no section_header in metadata and text "dbash() { docker exec -it $1 bash; }"
 - **THEN** the `chunks.enriched_text` SHALL contain "Docker Tips\n\ndbash() { docker exec -it $1 bash; }"
 ---
 ### Requirement: FTS5 indexes enriched text
 The FTS5 virtual table `chunks_fts` SHALL index the `enriched_text` column instead of the `text` column. All FTS sync triggers (insert, update, delete) SHALL operate on `enriched_text`.
 #### Scenario: FTS search matches document title
 - **WHEN** a user searches for "suitcase locks" and a document titled "Suitcase Locks" exists with chunk text "Steve = 363"
 - **THEN** the FTS5 search SHALL return that chunk as a match
 #### Scenario: FTS search still matches chunk content
 - **WHEN** a user searches for "MSI X870" and a chunk contains that text in its body
 - **THEN** the FTS5 search SHALL return that chunk as a match (enrichment does not break content matching)
 ---
 ### Requirement: Vector embeddings use enriched text
 The embedding model SHALL receive `enriched_text` (not raw `text`) when generating vectors during both initial ingestion and reindex operations.
 #### Scenario: Vector search matches document title
 - **WHEN** a user searches semantically for "luggage combination codes" and a document titled "Suitcase Locks" exists
 - **THEN** the vector search SHALL return that chunk with higher similarity than it would without title enrichment
 #### Scenario: Reindex uses enriched text
 - **WHEN** `POST /api/v1/reindex` is called
 - **THEN** the engine SHALL read `enriched_text` from the chunks table and embed that (not `text`)
 ---
 ### Requirement: Schema migration adds enriched_text column
 On startup, `init_schema` SHALL add the `enriched_text` column to the `chunks` table if it does not exist. It SHALL then backfill `enriched_text` for all existing chunks by joining with `documents.title` and parsing chunk metadata for section headers. It SHALL rebuild the FTS5 table and triggers to index `enriched_text`.
 #### Scenario: First startup after upgrade
 - **WHEN** the engine starts and `chunks.enriched_text` column does not exist
 - **THEN** the engine SHALL add the column, backfill all rows, drop and recreate `chunks_fts` to index `enriched_text`, and recreate the FTS sync triggers
 #### Scenario: Subsequent startup
 - **WHEN** the engine starts and `chunks.enriched_text` column already exists
 - **THEN** the engine SHALL not perform any migration and start normally
 ---
 ### Requirement: Search results return raw text
 Search results SHALL continue to return the original chunk text (from `chunks.text`) in the `text` field, not the enriched text. The document title is already returned as a separate `title` field.
 #### Scenario: Search result text field
 - **WHEN** a search returns a chunk from document "Suitcase Locks" with raw text "Steve = 363"
 - **THEN** the result `text` field SHALL be "Steve = 363" (not "Suitcase Locks\n\nSteve = 363")
@@ -128,11 +128,11 @@ The engine SHALL maintain job records in SQLite with status tracking. Jobs SHALL
 ### Requirement: Background ingestion worker
-The engine SHALL run a background worker that processes queued jobs. The worker SHALL process one job at a time. For each job, it SHALL: detect document type, run the appropriate chunking pipeline (Docling for PDFs, header-based for Markdown, AST-based for code, whole-text for notes), generate embeddings using the resident model, insert chunks and vectors into the database, and move the original file to persistent storage.
+The engine SHALL run a background worker that processes queued jobs. The worker SHALL process one job at a time. For each job, it SHALL: detect document type, run the appropriate chunking pipeline (Docling for PDFs, header-based for Markdown, AST-based for code, whole-text for notes), build enriched text by prepending the document title (and section header when present) to each chunk's text, generate embeddings using the enriched text and the resident model, insert chunks (with both raw text and enriched text) and vectors into the database, and move the original file to persistent storage.
 #### Scenario: Successful PDF ingestion
 - **WHEN** the background worker picks up a queued PDF job
- **THEN** it SHALL update the job status to `processing`, run Docling conversion and chunking, embed all chunks, insert document and chunks into the database, move the staged file to `{data_dir}/documents/{content_hash}.pdf`, update `documents.stored_path` with the permanent path, store the original filename in `documents.original_filename`, update the job status to `done` with the resulting document_id and chunk count, and clean up the staging entry
+- **THEN** it SHALL update the job status to `processing`, run Docling conversion and chunking, build enriched text for each chunk by prepending the document title, embed all chunks using enriched text, insert document and chunks into the database, move the staged file to `{data_dir}/documents/{content_hash}.pdf`, update `documents.stored_path` with the permanent path, store the original filename in `documents.original_filename`, update the job status to `done` with the resulting document_id and chunk count, and clean up the staging entry
 #### Scenario: Ingestion failure
 - **WHEN** the background worker encounters an error during processing (e.g., corrupt PDF)
@@ -202,7 +202,7 @@ The engine SHALL provide status information and support re-embedding all chunks.
 #### Scenario: Trigger reindex
 - **WHEN** a client sends `POST /api/v1/reindex`
- **THEN** the engine SHALL re-embed all existing chunks using the currently loaded model and return progress information. This operation SHALL NOT block search queries.
+- **THEN** the engine SHALL re-embed all existing chunks using the `enriched_text` column and the currently loaded model, and return progress information. This operation SHALL NOT block search queries.
 ---