Files
kb/engine/kb/routes/reindex.py
T
steve b2176c36ea Chunk enrichment: prepend document title to embeddings
Adds enriched_text column to chunks table that prepends document title
(and section header when present) to chunk text. Embeddings and FTS now
use enriched text for better search relevance. Includes schema migration
with backfill for existing data.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-03-29 21:03:48 +01:00

56 lines
1.7 KiB
Python

"""Reindex endpoint — re-embed all chunks with the current model."""
import logging
import struct
from main import app
from kb.config import cfg
from kb.database import get_connection, recreate_vec_table
from kb.embeddings import embed_texts, get_model_dim
# Module-level logger for the reindex route.
logger = logging.getLogger("kb.routes.reindex")
# Number of chunks handed to embed_texts() per call while reindexing.
BATCH_SIZE = 256
@app.post("/api/v1/reindex")
async def reindex():
    """Re-embed every chunk and rebuild the ``chunks_vec`` table.

    Reads ``id`` and ``enriched_text`` for all rows of ``chunks`` (ordered by
    id; a NULL/empty ``enriched_text`` is embedded as the empty string), calls
    ``recreate_vec_table(conn, dim)`` with the current model dimension, then
    embeds the texts in batches of ``BATCH_SIZE`` and inserts one packed
    single-precision float blob per chunk. A single commit is issued after
    all batches have been inserted.

    Returns:
        dict: ``{"chunks_reindexed": <number of chunks>, "model": cfg.model}``.

    Raises:
        RuntimeError: If ``embed_texts`` returns a different number of
            embeddings than the number of texts it was given. Without this
            check ``zip`` would silently drop the unmatched chunks and the
            response would over-report how many were reindexed.
    """
    # NOTE(review): nothing in this coroutine is awaited, so the embedding and
    # database work run synchronously inside it — confirm this is acceptable
    # for the server's event loop on large corpora.
    dim = get_model_dim()
    conn = get_connection(cfg.db_path)
    try:
        # Fetch all chunks — use enriched_text for embedding (includes title context)
        rows = conn.execute("SELECT id, enriched_text FROM chunks ORDER BY id").fetchall()
        chunk_ids = [row["id"] for row in rows]
        chunk_texts = [row["enriched_text"] or "" for row in rows]

        logger.info("Reindexing %d chunks with model '%s'", len(chunk_ids), cfg.model)

        # Recreate the vec table
        recreate_vec_table(conn, dim)

        # Embed and insert in batches
        for start in range(0, len(chunk_ids), BATCH_SIZE):
            batch_ids = chunk_ids[start : start + BATCH_SIZE]
            batch_texts = chunk_texts[start : start + BATCH_SIZE]

            # Materialize so the result can be counted whatever iterable
            # type embed_texts returns.
            embeddings = list(embed_texts(batch_texts))
            if len(embeddings) != len(batch_ids):
                raise RuntimeError(
                    f"embed_texts returned {len(embeddings)} embeddings "
                    f"for {len(batch_ids)} chunks"
                )

            # One (blob, chunk_id) parameter tuple per chunk; the blob is the
            # embedding packed as consecutive C floats.
            params = [
                (struct.pack(f"{len(embedding)}f", *embedding), chunk_id)
                for chunk_id, embedding in zip(batch_ids, embeddings)
            ]
            conn.executemany(
                "INSERT INTO chunks_vec(embedding, chunk_id) VALUES (?, ?)",
                params,
            )

        conn.commit()
        logger.info("Reindex complete: %d chunks", len(chunk_ids))
        return {
            "chunks_reindexed": len(chunk_ids),
            "model": cfg.model,
        }
    finally:
        # Always release the connection, including when embedding or an
        # insert raises part-way through.
        conn.close()