b2176c36ea
Adds enriched_text column to chunks table that prepends document title (and section header when present) to chunk text. Embeddings and FTS now use enriched text for better search relevance. Includes schema migration with backfill for existing data. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
56 lines
1.7 KiB
Python
56 lines
1.7 KiB
Python
"""Reindex endpoint — re-embed all chunks with the current model."""
|
|
|
|
import logging
|
|
import struct
|
|
|
|
from main import app
|
|
from kb.config import cfg
|
|
from kb.database import get_connection, recreate_vec_table
|
|
from kb.embeddings import embed_texts, get_model_dim
|
|
|
|
# Logger for this module, registered under the fixed name "kb.routes.reindex".
logger = logging.getLogger("kb.routes.reindex")
|
|
|
|
# Number of chunks handed to embed_texts() per call inside reindex().
BATCH_SIZE = 256
|
|
|
|
|
|
@app.post("/api/v1/reindex")
async def reindex():
    """Re-embed every chunk and repopulate ``chunks_vec``.

    Steps, in order:

    1. Ask ``get_model_dim()`` for the embedding dimension and open a
       connection to ``cfg.db_path``.
    2. Read ``id`` and ``enriched_text`` for all rows of ``chunks`` ordered by
       id; a NULL/empty ``enriched_text`` is embedded as ``""``.
    3. Call ``recreate_vec_table(conn, dim)``.
    4. Embed the texts ``BATCH_SIZE`` at a time and insert one packed float
       blob per chunk into ``chunks_vec``.
    5. Commit once after all batches, then close the connection (the
       connection is closed on failure as well).

    Returns:
        dict with ``chunks_reindexed`` (number of chunk rows read) and
        ``model`` (the value of ``cfg.model``).

    NOTE(review): this coroutine contains no ``await``; if the embedding and
    database calls are blocking, the event loop is presumably stalled for the
    whole reindex — confirm that is acceptable.
    NOTE(review): ``zip`` stops at the shorter input, so if ``embed_texts``
    ever returned fewer vectors than texts the surplus chunks would be
    skipped silently — verify against ``embed_texts``.
    """
    dim = get_model_dim()
    conn = get_connection(cfg.db_path)
    try:
        # Embeddings are built from enriched_text (includes title context).
        records = conn.execute(
            "SELECT id, enriched_text FROM chunks ORDER BY id"
        ).fetchall()
        entries = [
            (record["id"], record["enriched_text"] or "") for record in records
        ]
        total = len(entries)

        logger.info("Reindexing %d chunks with model '%s'", total, cfg.model)

        # Start from a fresh vec table sized for the current model.
        recreate_vec_table(conn, dim)

        insert_sql = "INSERT INTO chunks_vec(embedding, chunk_id) VALUES (?, ?)"
        for start in range(0, total, BATCH_SIZE):
            window = entries[start : start + BATCH_SIZE]
            window_ids = [chunk_id for chunk_id, _ in window]
            window_texts = [text for _, text in window]

            vectors = embed_texts(window_texts)

            for chunk_id, vector in zip(window_ids, vectors):
                # Pack the vector as consecutive C floats (native byte order).
                packed = struct.pack(f"{len(vector)}f", *vector)
                conn.execute(insert_sql, (packed, chunk_id))

        # Single commit covering every inserted row.
        conn.commit()

        logger.info("Reindex complete: %d chunks", total)

        return {
            "chunks_reindexed": total,
            "model": cfg.model,
        }
    finally:
        conn.close()
|