b2176c36ea
Adds enriched_text column to chunks table that prepends document title (and section header when present) to chunk text. Embeddings and FTS now use enriched text for better search relevance. Includes schema migration with backfill for existing data. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
211 lines
7.3 KiB
Python
211 lines
7.3 KiB
Python
"""Async background worker for processing ingestion jobs."""
|
|
|
|
import asyncio
|
|
import hashlib
|
|
import json
|
|
import logging
|
|
import shutil
|
|
from pathlib import Path
|
|
|
|
from kb import config, database, embeddings, staging
|
|
from kb.database import build_enriched_text
|
|
from kb.ingest import detector
|
|
|
|
# Module-level logger shared by the worker loop and the job processor below;
# it is registered under the explicit name "kb.worker".
logger = logging.getLogger("kb.worker")
|
|
|
|
|
|
async def ingestion_worker(*, poll_interval: float = 2.0) -> None:
    """Main background loop that processes queued ingestion jobs.

    Runs indefinitely until cancelled. Each iteration selects the oldest
    job whose status is ``'queued'``, marks it as *processing*, and
    delegates to :func:`_process_job` in a worker thread via
    :func:`asyncio.to_thread`. When the queue is empty the loop sleeps for
    *poll_interval* seconds before checking again; when jobs are available
    they are processed back to back without sleeping.

    Args:
        poll_interval: Seconds to wait when no job is queued, and also the
            back-off applied after an unexpected error in the loop itself.
            Defaults to 2 seconds.

    Raises:
        asyncio.CancelledError: Re-raised after logging when the task is
            cancelled.
    """
    logger.info("Ingestion worker started")
    while True:
        try:
            cfg = config.cfg

            # Claim the next job. The connection is closed before any
            # sleeping or processing so it is never held open while idle.
            conn = database.get_connection(cfg.db_path)
            try:
                row = conn.execute(
                    "SELECT * FROM jobs WHERE status = 'queued' "
                    "ORDER BY created_at ASC LIMIT 1"
                ).fetchone()
                if row is not None:
                    job_id = row["id"]
                    database.update_job_status(conn, job_id, "processing")
                    logger.info(
                        "Processing job %d (%s)", job_id, row["filename"]
                    )
            finally:
                conn.close()

            if row is None:
                await asyncio.sleep(poll_interval)
                continue

            # NOTE(review): CancelledError is not an ``Exception`` subclass,
            # so a cancellation arriving while this await is pending skips
            # the handler below and the job keeps its 'processing' status.
            try:
                status, doc_id, chunk_count = await asyncio.to_thread(
                    _process_job, row
                )
            except Exception as exc:
                logger.exception("Job %d failed", job_id)
                conn = database.get_connection(cfg.db_path)
                try:
                    database.update_job_status(
                        conn, job_id, "failed", error=str(exc)
                    )
                finally:
                    conn.close()
                continue

            # Record the outcome reported by the job processor.
            conn = database.get_connection(cfg.db_path)
            try:
                database.update_job_status(
                    conn,
                    job_id,
                    status,
                    document_id=doc_id,
                    chunk_count=chunk_count,
                )
            finally:
                conn.close()

            logger.info(
                "Job %d finished: status=%s doc_id=%s chunks=%s",
                job_id, status, doc_id, chunk_count,
            )

        except asyncio.CancelledError:
            logger.info("Ingestion worker cancelled")
            raise
        except Exception:
            # Keep the worker alive on unexpected errors (e.g. the database
            # being unavailable); back off briefly before retrying.
            logger.exception("Unexpected error in ingestion worker loop")
            await asyncio.sleep(poll_interval)
|
|
|
|
|
|
def _process_job(job_row) -> tuple[str, int | None, int]:
    """Synchronously process a single ingestion job.

    Steps, in order: resolve the document type, skip the job if a document
    with the same SHA-256 content hash already exists, chunk the staged
    file, embed the enriched chunk texts, persist the document with its
    chunks, embeddings and tags, and finally move the staged file into
    ``cfg.documents_dir``.

    Args:
        job_row: A row from the ``jobs`` table. The keys read here are
            ``id``, ``filename``, ``staging_path``, ``doc_type``, ``title``
            and ``tags_json``.

    Returns:
        A tuple of ``(status, document_id, chunk_count)`` where *status* is
        one of ``"done"``, ``"skipped"``. For ``"skipped"`` the document id
        is ``None`` and the chunk count is ``0``.

    Raises:
        ValueError: If the resolved document type is not one of ``note``,
            ``pdf``, ``markdown`` or ``code``, or if the number of embedding
            vectors does not match the number of chunks.
    """
    cfg = config.cfg
    # Read the row before opening the connection so a malformed row cannot
    # leak an open connection.
    staged_path = Path(job_row["staging_path"])
    conn = database.get_connection(cfg.db_path)

    try:
        # --- Determine document type and language -------------------------
        filename = job_row["filename"]
        forced_type = job_row["doc_type"]

        if staged_path.suffix == ".note":
            doc_type = "note"
            language = None
        elif forced_type:
            doc_type = forced_type
            language = None
        else:
            doc_type, language = detector.detect_type(Path(filename))

        # --- Duplicate detection via content hash -------------------------
        # Done before chunking so that duplicates are rejected without
        # paying for the (potentially expensive) chunking pipeline.
        content_hash = hashlib.sha256(staged_path.read_bytes()).hexdigest()

        if database.hash_exists(conn, content_hash):
            logger.info("Duplicate detected for job %d, skipping", job_row["id"])
            return ("skipped", None, 0)

        # --- Chunk the content --------------------------------------------
        # Chunker modules are imported lazily, only for the type in use.
        if doc_type == "note":
            text = staged_path.read_text(encoding="utf-8")
            from kb.ingest.note import chunk_note
            chunks = chunk_note(text)
        elif doc_type == "pdf":
            from kb.ingest.docling_pipeline import chunk_document
            chunks = chunk_document(staged_path, cfg.ingest_device)
        elif doc_type == "markdown":
            text = staged_path.read_text(encoding="utf-8")
            from kb.ingest.markdown import chunk_markdown
            chunks = chunk_markdown(text)
        elif doc_type == "code":
            text = staged_path.read_text(encoding="utf-8")
            if not language:
                # A forced doc_type leaves the language unset; detect it
                # from the filename.
                _, language = detector.detect_type(Path(filename))
            from kb.ingest.code import chunk_code
            chunks = chunk_code(text, language)
        else:
            raise ValueError(f"Unsupported doc_type: {doc_type}")

        # --- Persist document, chunks, and embeddings ---------------------
        title = job_row["title"] or filename
        # NOTE(review): source_path records the staging location, which is
        # moved or cleaned up below — confirm this is intended.
        doc_id = database.insert_document(
            conn,
            title=title,
            source_path=str(staged_path),
            content_hash=content_hash,
            doc_type=doc_type,
            language=language,
        )

        # A chunk is either a plain string or a mapping with a "text" key;
        # every other key of the mapping is kept as chunk metadata. The
        # chunks are walked exactly once so any iterable is acceptable.
        chunk_texts = []
        chunk_metas = []
        for chunk in chunks:
            if isinstance(chunk, str):
                chunk_texts.append(chunk)
                chunk_metas.append(None)
            else:
                chunk_texts.append(chunk["text"])
                chunk_metas.append(
                    {k: v for k, v in chunk.items() if k != "text"} or None
                )

        # Embeddings are computed from the enriched text, not the raw text.
        enriched_texts = [
            build_enriched_text(title, chunk_text, meta)
            for chunk_text, meta in zip(chunk_texts, chunk_metas)
        ]
        vectors = embeddings.embed_texts(enriched_texts)

        # strict=True: a vector/chunk count mismatch raises instead of
        # silently dropping chunks.
        for idx, (chunk_text, enriched, meta, vector) in enumerate(
            zip(chunk_texts, enriched_texts, chunk_metas, vectors, strict=True)
        ):
            chunk_id = database.insert_chunk(
                conn,
                document_id=doc_id,
                chunk_index=idx,
                text=chunk_text,
                enriched_text=enriched,
                metadata=meta,
            )
            database.insert_embedding(conn, chunk_id, vector)

        # --- Tags ---------------------------------------------------------
        tags = json.loads(job_row["tags_json"] or "[]")
        if tags:
            database.tag_document(conn, doc_id, tags)

        conn.commit()

        # --- Move original file to persistent storage ---------------------
        # Best effort: the document is already committed, so a failure here
        # is logged rather than failing the job.
        ext = Path(filename).suffix or staged_path.suffix
        dest = cfg.documents_dir / f"{content_hash}{ext}"
        try:
            cfg.documents_dir.mkdir(parents=True, exist_ok=True)
            shutil.move(str(staged_path), str(dest))
            conn_update = database.get_connection(cfg.db_path)
            try:
                conn_update.execute(
                    "UPDATE documents SET stored_path = ?, original_filename = ? WHERE id = ?",
                    (str(dest), filename, doc_id),
                )
                conn_update.commit()
            finally:
                conn_update.close()
            logger.info("Stored original file: %s", dest)
        except Exception as exc:
            # The staged file, if it is still present, is removed by the
            # guarded cleanup in the ``finally`` block below.
            logger.warning("Failed to store original file: %s", exc)

        return ("done", doc_id, len(chunk_texts))

    finally:
        conn.close()
        # Only clean up staging if the file is still there (not moved)
        if staged_path.exists():
            staging.cleanup(staged_path)
|