# kb/engine/kb/routes/documents.py

"""Document management endpoints — list, view, download, and delete documents."""

import json
import logging
import mimetypes
from pathlib import Path
from typing import Optional

from fastapi import HTTPException, Query
from fastapi.responses import FileResponse

from main import app
from kb.config import cfg
from kb.database import get_connection

logger = logging.getLogger("kb.routes.documents")


# Upper bound on ids bound into a single ``IN (...)`` list. Kept well below
# SQLite's historical default limit of 999 bound parameters per statement.
_TAG_LOOKUP_BATCH_SIZE = 500


def _load_tags_by_document(conn, doc_ids: list) -> dict:
    """Return ``{document_id: [tag names]}`` for the given document ids.

    Tags are fetched in batches instead of one query per document. Each
    document's names come back sorted by ``t.name`` because a document's id
    is bound in exactly one batch and every batch is ordered by name.
    Documents without tags map to an empty list.
    """
    tags_by_doc: dict = {doc_id: [] for doc_id in doc_ids}
    for start in range(0, len(doc_ids), _TAG_LOOKUP_BATCH_SIZE):
        batch = doc_ids[start:start + _TAG_LOOKUP_BATCH_SIZE]
        placeholders = ", ".join("?" * len(batch))
        tag_rows = conn.execute(
            f"""
            SELECT dt.document_id AS document_id, t.name AS name
            FROM tags t
            JOIN document_tags dt ON t.id = dt.tag_id
            WHERE dt.document_id IN ({placeholders})
            ORDER BY t.name
            """,
            batch,
        ).fetchall()
        for tag_row in tag_rows:
            tags_by_doc.setdefault(tag_row["document_id"], []).append(tag_row["name"])
    return tags_by_doc


@app.get("/api/v1/documents")
async def list_documents(
    type: Optional[str] = Query(None),
    tags: Optional[str] = Query(None),
):
    """List documents, optionally filtered by type and/or tags.

    Args:
        type: When given, only documents whose ``doc_type`` equals this value
            are returned. (The name shadows the builtin because it is the
            public query-string parameter name.)
        tags: Comma-separated tag names. A document is returned only if it
            carries *every* listed tag. Blank entries and duplicates are
            ignored.

    Returns:
        A list of dicts with ``id``, ``title``, ``doc_type``, ``tags``
        (sorted by name), ``chunk_count``, ``created_at`` and ``updated_at``,
        ordered by most recently updated (falling back to creation time).
    """
    conn = get_connection(cfg.db_path)
    try:
        sql = """
            SELECT d.id, d.title, d.doc_type,
                   (SELECT COUNT(*) FROM chunks c WHERE c.document_id = d.id) AS chunk_count,
                   d.created_at, d.updated_at
            FROM documents d
        """
        where: list[str] = []
        params: list = []

        if type:
            where.append("d.doc_type = ?")
            params.append(type)

        if tags:
            # dict.fromkeys de-duplicates while keeping the caller's order.
            tag_list = list(
                dict.fromkeys(t.strip() for t in tags.split(",") if t.strip())
            )
            # One correlated EXISTS per tag rather than two JOINs per tag:
            # SQLite allows at most 64 tables in a single join, so the JOIN
            # form fails once a request names a few dozen tags.
            for tag in tag_list:
                where.append(
                    "EXISTS (SELECT 1 FROM document_tags dt"
                    " JOIN tags t ON dt.tag_id = t.id"
                    " WHERE dt.document_id = d.id AND t.name = ?)"
                )
                params.append(tag)

        if where:
            sql += " WHERE " + " AND ".join(where)

        sql += " ORDER BY COALESCE(d.updated_at, d.created_at) DESC"

        rows = conn.execute(sql, params).fetchall()

        # Batched tag lookup for all listed documents (avoids an N+1 pattern).
        tags_by_doc = _load_tags_by_document(conn, [row["id"] for row in rows])

        return [
            {
                "id": row["id"],
                "title": row["title"],
                "doc_type": row["doc_type"],
                "tags": tags_by_doc.get(row["id"], []),
                "chunk_count": row["chunk_count"],
                "created_at": row["created_at"],
                "updated_at": row["updated_at"],
            }
            for row in rows
        ]
    finally:
        conn.close()


@app.get("/api/v1/documents/{doc_id}")
async def get_document(doc_id: int):
    """Return one document with its tags and chunks.

    Args:
        doc_id: Primary key of the document to fetch.

    Returns:
        A dict containing every column of the ``documents`` row plus:
        ``has_file`` (whether ``stored_path`` points at a regular file on
        disk), ``tags`` (tag names sorted alphabetically) and ``chunks``
        (all chunk rows ordered by ``chunk_index``).

    Raises:
        HTTPException: 404 when no document has the given id.
    """
    conn = get_connection(cfg.db_path)
    try:
        doc = conn.execute(
            "SELECT * FROM documents WHERE id = ?", (doc_id,)
        ).fetchone()
        if not doc:
            raise HTTPException(status_code=404, detail="Document not found.")

        chunks = conn.execute(
            "SELECT * FROM chunks WHERE document_id = ? ORDER BY chunk_index",
            (doc_id,),
        ).fetchall()

        tag_rows = conn.execute(
            """
            SELECT t.name FROM tags t
            JOIN document_tags dt ON t.id = dt.tag_id
            WHERE dt.document_id = ?
            ORDER BY t.name
            """,
            (doc_id,),
        ).fetchall()

        stored_path = doc["stored_path"]
        # is_file() rather than exists(): a directory at stored_path is not a
        # downloadable file, so it must not be reported as one.
        has_file = bool(stored_path and Path(stored_path).is_file())

        return {
            **dict(doc),
            "has_file": has_file,
            "tags": [t["name"] for t in tag_rows],
            "chunks": [dict(c) for c in chunks],
        }
    finally:
        conn.close()


@app.get("/api/v1/documents/{doc_id}/file")
async def download_document_file(doc_id: int):
    """Send the originally ingested file for a document as a download.

    Args:
        doc_id: Primary key of the document whose file is requested.

    Returns:
        A ``FileResponse`` for the stored file. The download name is the
        recorded ``original_filename``; if none was recorded it falls back
        to the document title (or ``"document"``) plus the stored file's
        extension. The media type is guessed from that name, defaulting to
        ``application/octet-stream``.

    Raises:
        HTTPException: 404 when the document does not exist, has no
            ``stored_path``, or the path is not a regular file on disk.
    """
    conn = get_connection(cfg.db_path)
    try:
        doc = conn.execute(
            "SELECT id, title, stored_path, original_filename FROM documents WHERE id = ?",
            (doc_id,),
        ).fetchone()
        if not doc:
            raise HTTPException(status_code=404, detail="Document not found.")

        stored_path = doc["stored_path"]
        if not stored_path:
            raise HTTPException(
                status_code=404,
                detail="Original file not available - ingested before document storage was enabled.",
            )

        file_path = Path(stored_path)
        # is_file() rather than exists(): FileResponse raises at send time
        # when the path is a directory, which would surface as a 500 instead
        # of the intended 404.
        if not file_path.is_file():
            raise HTTPException(
                status_code=404,
                detail="Stored file not found on disk.",
            )

        original_filename = doc["original_filename"]
        if not original_filename:
            ext = file_path.suffix
            original_filename = (doc["title"] or "document") + ext

        media_type = mimetypes.guess_type(original_filename)[0] or "application/octet-stream"

        return FileResponse(
            path=str(file_path),
            media_type=media_type,
            filename=original_filename,
        )
    finally:
        conn.close()


def _discard_stored_file(stored_path) -> None:
    """Best-effort removal of a document's stored file.

    Does nothing when *stored_path* is empty. A missing file and any
    ``OSError`` raised while checking or removing it are logged as warnings
    rather than propagated.
    """
    if not stored_path:
        return
    try:
        target = Path(stored_path)
        if not target.exists():
            logger.warning("Stored file already missing: %s", stored_path)
            return
        target.unlink()
        logger.info("Deleted stored file: %s", stored_path)
    except OSError as exc:
        logger.warning("Failed to delete stored file %s: %s", stored_path, exc)


@app.delete("/api/v1/documents/{doc_id}")
async def delete_document(doc_id: int):
    """Delete a document, its chunk embeddings, and its stored file.

    Args:
        doc_id: Primary key of the document to remove.

    Returns:
        ``{"status": "deleted", "document_id": ..., "title": ...}``.

    Raises:
        HTTPException: 404 when no document has the given id.
    """
    conn = get_connection(cfg.db_path)
    try:
        lookup = conn.execute(
            "SELECT id, title, stored_path FROM documents WHERE id = ?", (doc_id,)
        )
        doc = lookup.fetchone()
        if not doc:
            raise HTTPException(status_code=404, detail="Document not found.")

        # Embedding rows in chunks_vec are removed explicitly, one chunk id
        # at a time, before the document row itself is deleted.
        chunk_rows = conn.execute(
            "SELECT id FROM chunks WHERE document_id = ?", (doc_id,)
        ).fetchall()
        for chunk_row in chunk_rows:
            chunk_key = (chunk_row["id"],)
            conn.execute("DELETE FROM chunks_vec WHERE chunk_id = ?", chunk_key)

        # Removing the document row is expected to cascade to chunks and
        # document_tags (per the schema — not verifiable from this module).
        conn.execute("DELETE FROM documents WHERE id = ?", (doc_id,))
        conn.commit()

        # The on-disk file is only touched after the database commit.
        _discard_stored_file(doc["stored_path"])

        response = {"status": "deleted", "document_id": doc_id}
        response["title"] = doc["title"]
        return response
    finally:
        conn.close()