Files
kb/engine/kb/routes/documents.py
T
steve e7136a4a20 Add MCP server, note mutation endpoint, and updated_at tracking (v3.0.0)
New MCP server (mcp/) exposes kb operations as native MCP tools over
Streamable HTTP with Bearer token auth. Supports collections via tag
conventions, chunked file uploads, and agent-side search patterns.

Engine gains PATCH /api/v1/notes/{id} for in-place note updates with
transactional re-chunk/re-embed, and updated_at column on documents.

Go client adds updatenote command and Patch HTTP method.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-04-02 21:34:55 +01:00

209 lines
6.3 KiB
Python

"""Document management endpoints — list, view, and delete documents."""
import json
import logging
import mimetypes
from pathlib import Path
from typing import Optional
from fastapi import HTTPException, Query
from fastapi.responses import FileResponse
from main import app
from kb.config import cfg
from kb.database import get_connection
# Module-level logger for the document routes; used below to report the
# outcome of stored-file cleanup when a document is deleted.
logger = logging.getLogger("kb.routes.documents")
@app.get("/api/v1/documents")
async def list_documents(
    type: Optional[str] = Query(None),
    tags: Optional[str] = Query(None),
):
    """List documents, most recently touched first, with optional filters.

    Args:
        type: When given, only documents whose ``doc_type`` equals this value
            are returned. The name shadows the builtin, but it is the public
            query-parameter name and is kept for API compatibility.
        tags: Comma-separated tag names. A document must carry every listed
            tag to be returned; blank entries and repeated names are ignored.

    Returns:
        A list of dicts with ``id``, ``title``, ``doc_type``, ``tags`` (names
        sorted ascending), ``chunk_count``, ``created_at`` and ``updated_at``,
        ordered by ``COALESCE(updated_at, created_at)`` descending.
    """
    conn = get_connection(cfg.db_path)
    try:
        sql = """
            SELECT d.id, d.title, d.doc_type,
                   (SELECT COUNT(*) FROM chunks c WHERE c.document_id = d.id) AS chunk_count,
                   d.created_at, d.updated_at
            FROM documents d
        """
        joins: list[str] = []
        where: list[str] = []
        params: list = []

        if type:
            where.append("d.doc_type = ?")
            params.append(type)

        if tags:
            # Strip blanks and drop repeated names (first occurrence wins) so
            # that "a,a" does not add a redundant pair of joins.
            tag_list = list(
                dict.fromkeys(t.strip() for t in tags.split(",") if t.strip())
            )
            # One aliased join pair per tag: the document must match all tags.
            # Only the integer index is interpolated; tag values stay bound.
            for i, tag in enumerate(tag_list):
                joins.append(f"JOIN document_tags dt{i} ON d.id = dt{i}.document_id")
                joins.append(f"JOIN tags t{i} ON dt{i}.tag_id = t{i}.id")
                where.append(f"t{i}.name = ?")
                params.append(tag)

        if joins:
            sql += " " + " ".join(joins)
        if where:
            sql += " WHERE " + " AND ".join(where)
        sql += " ORDER BY COALESCE(d.updated_at, d.created_at) DESC"

        rows = conn.execute(sql, params).fetchall()

        # Load tags for every listed document in a few batched queries instead
        # of issuing one query per document (N+1). Batches are kept well under
        # SQLite's historical default limit of 999 bound parameters.
        batch_size = 500
        doc_ids = list(dict.fromkeys(row["id"] for row in rows))
        tags_by_doc: dict = {}
        for start in range(0, len(doc_ids), batch_size):
            batch = doc_ids[start:start + batch_size]
            # Only "?" placeholders are interpolated; ids are bound parameters.
            placeholders = ", ".join("?" * len(batch))
            tag_rows = conn.execute(
                f"""
                SELECT dt.document_id AS doc_id, t.name AS name
                FROM tags t
                JOIN document_tags dt ON t.id = dt.tag_id
                WHERE dt.document_id IN ({placeholders})
                ORDER BY t.name
                """,
                batch,
            ).fetchall()
            # Rows arrive sorted by name and each document lives in exactly
            # one batch, so every per-document list stays sorted by name.
            for tag_row in tag_rows:
                tags_by_doc.setdefault(tag_row["doc_id"], []).append(tag_row["name"])

        return [
            {
                "id": row["id"],
                "title": row["title"],
                "doc_type": row["doc_type"],
                "tags": list(tags_by_doc.get(row["id"], ())),
                "chunk_count": row["chunk_count"],
                "created_at": row["created_at"],
                "updated_at": row["updated_at"],
            }
            for row in rows
        ]
    finally:
        conn.close()
@app.get("/api/v1/documents/{doc_id}")
async def get_document(doc_id: int):
    """Return one document together with its tags, chunks and file status.

    Args:
        doc_id: Value of ``documents.id`` to look up.

    Returns:
        A dict holding every column of the ``documents`` row, plus:
        ``has_file`` (whether ``stored_path`` points at a regular file on
        disk), ``tags`` (tag names sorted ascending) and ``chunks`` (every
        column of each chunk row, ordered by ``chunk_index``).

    Raises:
        HTTPException: 404 when no document has this id.
    """
    conn = get_connection(cfg.db_path)
    try:
        doc = conn.execute(
            "SELECT * FROM documents WHERE id = ?", (doc_id,)
        ).fetchone()
        if not doc:
            raise HTTPException(status_code=404, detail="Document not found.")

        chunks = conn.execute(
            "SELECT * FROM chunks WHERE document_id = ? ORDER BY chunk_index",
            (doc_id,),
        ).fetchall()

        tag_rows = conn.execute(
            """
            SELECT t.name FROM tags t
            JOIN document_tags dt ON t.id = dt.tag_id
            WHERE dt.document_id = ?
            ORDER BY t.name
            """,
            (doc_id,),
        ).fetchall()

        stored_path = doc["stored_path"]
        # is_file() rather than exists(): a directory at stored_path is not a
        # downloadable file, so it must not be advertised as one.
        has_file = bool(stored_path and Path(stored_path).is_file())

        return {
            **dict(doc),
            "has_file": has_file,
            "tags": [t["name"] for t in tag_rows],
            "chunks": [dict(c) for c in chunks],
        }
    finally:
        conn.close()
@app.get("/api/v1/documents/{doc_id}/file")
async def download_document_file(doc_id: int):
    """Send the stored original file of a document as a download.

    The download name is the recorded ``original_filename``; when that is
    empty it falls back to the document title (or ``"document"``) combined
    with the stored file's extension. The media type is guessed from that
    name and defaults to ``application/octet-stream``.

    Args:
        doc_id: Value of ``documents.id`` whose file is requested.

    Returns:
        A ``FileResponse`` for the file at the document's ``stored_path``.

    Raises:
        HTTPException: 404 when the document does not exist, when it has no
            ``stored_path``, or when ``stored_path`` is not a regular file
            on disk.
    """
    conn = get_connection(cfg.db_path)
    try:
        doc = conn.execute(
            "SELECT id, title, stored_path, original_filename FROM documents WHERE id = ?",
            (doc_id,),
        ).fetchone()
        if not doc:
            raise HTTPException(status_code=404, detail="Document not found.")

        stored_path = doc["stored_path"]
        if not stored_path:
            raise HTTPException(
                status_code=404,
                detail="Original file not available - ingested before document storage was enabled.",
            )

        file_path = Path(stored_path)
        # is_file() rather than exists(): a directory would pass exists() and
        # then fail inside FileResponse while the response is being sent.
        if not file_path.is_file():
            raise HTTPException(
                status_code=404,
                detail="Stored file not found on disk.",
            )

        original_filename = doc["original_filename"]
        if not original_filename:
            ext = file_path.suffix
            base = doc["title"] or "document"
            # Do not produce "report.pdf.pdf" when the title already ends
            # with the stored file's extension (compared case-insensitively).
            if ext and base.lower().endswith(ext.lower()):
                original_filename = base
            else:
                original_filename = base + ext

        media_type = mimetypes.guess_type(original_filename)[0] or "application/octet-stream"
        return FileResponse(
            path=str(file_path),
            media_type=media_type,
            filename=original_filename,
        )
    finally:
        conn.close()
@app.delete("/api/v1/documents/{doc_id}")
async def delete_document(doc_id: int):
    """Delete a document, its chunk embeddings, and its stored file.

    Database rows are removed and committed first. Removing the stored file
    afterwards is best-effort: a missing file or an OS error is logged as a
    warning and does not fail the request.

    Args:
        doc_id: Value of ``documents.id`` to delete.

    Returns:
        ``{"status": "deleted", "document_id": <doc_id>, "title": <title>}``.

    Raises:
        HTTPException: 404 when no document has this id.
    """
    conn = get_connection(cfg.db_path)
    try:
        doc = conn.execute(
            "SELECT id, title, stored_path FROM documents WHERE id = ?", (doc_id,)
        ).fetchone()
        if not doc:
            raise HTTPException(status_code=404, detail="Document not found.")

        # Collect chunk IDs up front: embeddings are deleted explicitly, and
        # they must be looked up before the document delete below.
        chunk_ids = conn.execute(
            "SELECT id FROM chunks WHERE document_id = ?", (doc_id,)
        ).fetchall()

        # Delete embeddings from vec table
        for row in chunk_ids:
            conn.execute(
                "DELETE FROM chunks_vec WHERE chunk_id = ?", (row["id"],)
            )

        # Delete document (cascades to chunks, document_tags).
        # NOTE(review): the cascade relies on the schema / connection setup,
        # which is not visible in this file - confirm.
        conn.execute("DELETE FROM documents WHERE id = ?", (doc_id,))
        conn.commit()

        # Delete stored file from disk. Attempt the unlink directly instead
        # of checking exists() first, so a file that vanishes in between is
        # reported as "already missing" rather than as a failed delete.
        stored_path = doc["stored_path"]
        if stored_path:
            try:
                Path(stored_path).unlink()
            except (FileNotFoundError, NotADirectoryError):
                logger.warning("Stored file already missing: %s", stored_path)
            except OSError as exc:
                logger.warning("Failed to delete stored file %s: %s", stored_path, exc)
            else:
                logger.info("Deleted stored file: %s", stored_path)

        return {
            "status": "deleted",
            "document_id": doc_id,
            "title": doc["title"],
        }
    finally:
        conn.close()