Store original documents for download after ingestion

Persist uploaded files to {data_dir}/documents/{content_hash}{ext} after
successful ingestion. Add GET /documents/{id}/file endpoint for retrieval,
delete stored files on document deletion, and add `kb export` client command.
Includes schema migration, tests, and spec updates.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
2026-03-28 15:16:27 +00:00
parent 6a4bce4659
commit b04823e67b
19 changed files with 802 additions and 10 deletions
+1 -1
View File
@@ -1 +1 @@
2.0.5
2.0.6
+74
View File
@@ -0,0 +1,74 @@
package cmd
import (
"fmt"
"io"
"mime"
"os"
"path/filepath"
"github.com/kb-search/kb/internal/api"
"github.com/spf13/cobra"
)
// exportCmd defines the `export <id>` subcommand. It requires exactly one
// positional argument (the document ID) and delegates the work to runExport.
var exportCmd = &cobra.Command{
Use: "export <id>",
Short: "Download original document file",
Args: cobra.ExactArgs(1),
RunE: runExport,
}
// init declares the --output/-o flag on the export command and attaches the
// command to the root command.
func init() {
	flags := exportCmd.Flags()
	flags.StringP("output", "o", "", "output file path (default: original filename to current directory)")
	rootCmd.AddCommand(exportCmd)
}
// runExport downloads the original file of the document whose ID is args[0]
// and writes it to disk, or to stdout when --output is exactly "-".
//
// When --output is not given, the file name is taken from the response's
// Content-Disposition header and falls back to "document-<id>". The
// header-supplied name is reduced to its final path element: it is chosen by
// the server, so it must not be able to steer the write outside the current
// directory (e.g. "../../x" or an absolute path), nor to switch the output to
// stdout by being "-".
//
// Request and API errors are printed to stderr and end the process with exit
// status 1; local file errors (create, write, close) are returned.
func runExport(cmd *cobra.Command, args []string) error {
	client := api.NewClient()
	resp, err := client.Get("/api/v1/documents/" + args[0] + "/file")
	if err != nil {
		fmt.Fprintln(os.Stderr, err)
		os.Exit(1)
	}
	if err := api.CheckError(resp); err != nil {
		fmt.Fprintln(os.Stderr, err)
		os.Exit(1)
	}
	defer resp.Body.Close()

	outPath, _ := cmd.Flags().GetString("output")

	// Only an explicit "-o -" from the user selects stdout.
	if outPath == "-" {
		_, err := io.Copy(os.Stdout, resp.Body)
		return err
	}

	if outPath == "" {
		outPath = "document-" + args[0]
		if cd := resp.Header.Get("Content-Disposition"); cd != "" {
			if _, params, perr := mime.ParseMediaType(cd); perr == nil && params["filename"] != "" {
				// Keep only the last path element of the untrusted name.
				name := filepath.Base(params["filename"])
				if name != "." && name != ".." && name != string(filepath.Separator) {
					outPath = name
				}
			}
		}
	}

	outPath = filepath.Clean(outPath)
	f, err := os.Create(outPath)
	if err != nil {
		return fmt.Errorf("failed to create output file: %w", err)
	}

	n, err := io.Copy(f, resp.Body)
	// Close explicitly so a failed flush is reported instead of being lost in
	// a deferred call.
	if cerr := f.Close(); err == nil {
		err = cerr
	}
	if err != nil {
		return fmt.Errorf("failed to write file: %w", err)
	}

	fmt.Fprintf(os.Stderr, "Saved %s (%d bytes)\n", outPath, n)
	return nil
}
+1 -1
View File
@@ -1 +1 @@
2.0.5
2.0.6
+1
View File
@@ -21,4 +21,5 @@ services:
- KB_INGEST_DEVICE=${KB_INGEST_DEVICE:-auto}
- KB_API_KEY=${KB_API_KEY:-}
- KB_SEARCH_THRESHOLD=${KB_SEARCH_THRESHOLD:-0.01}
- HF_HUB_OFFLINE=${HF_HUB_OFFLINE:-}
restart: unless-stopped
+1
View File
@@ -18,4 +18,5 @@ services:
- KB_INGEST_DEVICE=${KB_INGEST_DEVICE:-auto}
- KB_API_KEY=${KB_API_KEY:-}
- KB_SEARCH_THRESHOLD=${KB_SEARCH_THRESHOLD:-0.01}
- HF_HUB_OFFLINE=${HF_HUB_OFFLINE:-}
restart: unless-stopped
+5
View File
@@ -35,10 +35,15 @@ class Config:
def staging_dir(self) -> Path:
return self.data_dir / "staging"
@property
def documents_dir(self) -> Path:
    """Directory under ``data_dir`` where original uploaded files are kept."""
    return self.data_dir.joinpath("documents")
def ensure_dirs(self):
    """Create the data directory and its working subdirectories if missing.

    ``data_dir`` is created together with any missing parents; the HF cache,
    staging and documents directories are then created without ``parents``.
    Existing directories are left untouched.
    """
    self.data_dir.mkdir(parents=True, exist_ok=True)
    for subdir in (self.hf_cache, self.staging_dir, self.documents_dir):
        subdir.mkdir(exist_ok=True)
cfg = Config()
+9
View File
@@ -34,6 +34,8 @@ def init_schema(conn: sqlite3.Connection, embedding_dim: int) -> None:
content_hash TEXT UNIQUE,
doc_type TEXT,
language TEXT,
stored_path TEXT,
original_filename TEXT,
created_at TEXT DEFAULT current_timestamp
);
@@ -114,6 +116,13 @@ def init_schema(conn: sqlite3.Connection, embedding_dim: int) -> None:
if "content_hash" not in cols:
conn.execute("ALTER TABLE jobs ADD COLUMN content_hash TEXT")
# Migrate: add stored_path and original_filename to documents if missing
doc_cols = {row[1] for row in conn.execute("PRAGMA table_info(documents)").fetchall()}
if "stored_path" not in doc_cols:
conn.execute("ALTER TABLE documents ADD COLUMN stored_path TEXT")
if "original_filename" not in doc_cols:
conn.execute("ALTER TABLE documents ADD COLUMN original_filename TEXT")
conn.commit()
+65 -1
View File
@@ -1,14 +1,20 @@
"""Document management endpoints — list, view, and delete documents."""
import json
import logging
import mimetypes
from pathlib import Path
from typing import Optional
from fastapi import HTTPException, Query
from fastapi.responses import FileResponse
from main import app
from kb.config import cfg
from kb.database import get_connection
logger = logging.getLogger("kb.routes.documents")
@app.get("/api/v1/documents")
async def list_documents(
@@ -100,8 +106,12 @@ async def get_document(doc_id: int):
(doc_id,),
).fetchall()
stored_path = doc["stored_path"]
has_file = bool(stored_path and Path(stored_path).exists())
return {
**dict(doc),
"has_file": has_file,
"tags": [t["name"] for t in tag_rows],
"chunks": [dict(c) for c in chunks],
}
@@ -109,12 +119,53 @@ async def get_document(doc_id: int):
conn.close()
@app.get("/api/v1/documents/{doc_id}/file")
async def download_document_file(doc_id: int):
    """Return the stored original file of a document as a download.

    The download name is the recorded ``original_filename``; when that column
    is empty it falls back to the document title (or ``"document"``) plus the
    stored file's extension. The media type is guessed from that name and
    defaults to ``application/octet-stream``.

    Raises:
        HTTPException: 404 when the document does not exist, when it has no
            ``stored_path``, or when ``stored_path`` is not a regular file on
            disk.
    """
    # The connection is only needed for the single lookup, so release it
    # before touching the filesystem or building the response.
    conn = get_connection(cfg.db_path)
    try:
        doc = conn.execute(
            "SELECT id, title, stored_path, original_filename FROM documents WHERE id = ?",
            (doc_id,),
        ).fetchone()
    finally:
        conn.close()

    if not doc:
        raise HTTPException(status_code=404, detail="Document not found.")

    stored_path = doc["stored_path"]
    if not stored_path:
        raise HTTPException(
            status_code=404,
            detail="Original file not available - ingested before document storage was enabled.",
        )

    file_path = Path(stored_path)
    # is_file() rather than exists(): a directory at this path cannot be
    # served and should yield the same 404 as a missing file.
    if not file_path.is_file():
        raise HTTPException(
            status_code=404,
            detail="Stored file not found on disk.",
        )

    original_filename = doc["original_filename"]
    if not original_filename:
        original_filename = (doc["title"] or "document") + file_path.suffix

    media_type = mimetypes.guess_type(original_filename)[0] or "application/octet-stream"
    return FileResponse(
        path=str(file_path),
        media_type=media_type,
        filename=original_filename,
    )
@app.delete("/api/v1/documents/{doc_id}")
async def delete_document(doc_id: int):
conn = get_connection(cfg.db_path)
try:
doc = conn.execute(
"SELECT id, title FROM documents WHERE id = ?", (doc_id,)
"SELECT id, title, stored_path FROM documents WHERE id = ?", (doc_id,)
).fetchone()
if not doc:
raise HTTPException(status_code=404, detail="Document not found.")
@@ -134,6 +185,19 @@ async def delete_document(doc_id: int):
conn.execute("DELETE FROM documents WHERE id = ?", (doc_id,))
conn.commit()
# Delete stored file from disk
stored_path = doc["stored_path"]
if stored_path:
try:
file_path = Path(stored_path)
if file_path.exists():
file_path.unlink()
logger.info("Deleted stored file: %s", stored_path)
else:
logger.warning("Stored file already missing: %s", stored_path)
except OSError as exc:
logger.warning("Failed to delete stored file %s: %s", stored_path, exc)
return {
"status": "deleted",
"document_id": doc_id,
+25 -1
View File
@@ -4,6 +4,7 @@ import asyncio
import hashlib
import json
import logging
import shutil
from pathlib import Path
from kb import config, database, embeddings, staging
@@ -168,8 +169,31 @@ def _process_job(job_row) -> tuple[str, int | None, int]:
database.tag_document(conn, doc_id, tags)
conn.commit()
# --- Move original file to persistent storage ---------------------
ext = Path(filename).suffix or staged_path.suffix
dest = cfg.documents_dir / f"{content_hash}{ext}"
try:
cfg.documents_dir.mkdir(parents=True, exist_ok=True)
shutil.move(str(staged_path), str(dest))
conn_update = database.get_connection(cfg.db_path)
try:
conn_update.execute(
"UPDATE documents SET stored_path = ?, original_filename = ? WHERE id = ?",
(str(dest), filename, doc_id),
)
conn_update.commit()
finally:
conn_update.close()
logger.info("Stored original file: %s", dest)
except Exception as exc:
logger.warning("Failed to store original file: %s", exc)
staging.cleanup(staged_path)
return ("done", doc_id, len(chunk_texts))
finally:
conn.close()
staging.cleanup(staged_path)
# Only clean up staging if the file is still there (not moved)
if staged_path.exists():
staging.cleanup(staged_path)
View File
+223
View File
@@ -0,0 +1,223 @@
"""Tests for original document storage feature."""
import hashlib
import shutil
import sqlite3
from pathlib import Path
from unittest.mock import patch
import pytest
from fastapi.testclient import TestClient
@pytest.fixture
def data_dir(tmp_path):
    """Return a temporary data root that already contains ``staging/`` and ``documents/``."""
    for subdir in ("staging", "documents"):
        (tmp_path / subdir).mkdir()
    return tmp_path
@pytest.fixture
def db_conn(data_dir):
"""Yield a connection to a file-backed SQLite DB at ``data_dir / "kb.db"``.

Rows are returned as ``sqlite3.Row`` and foreign keys are enabled. The
script below creates the documents, chunks, tags, document_tags and jobs
tables; the connection is closed once the test finishes.
"""
db_path = data_dir / "kb.db"
conn = sqlite3.connect(str(db_path))
conn.row_factory = sqlite3.Row
conn.execute("PRAGMA foreign_keys=ON")
# NOTE(review): this schema is hand-copied rather than built by the engine's
# own init code — confirm it stays in sync with the real schema.
conn.executescript("""
CREATE TABLE IF NOT EXISTS documents (
id INTEGER PRIMARY KEY,
title TEXT,
source_path TEXT,
content_hash TEXT UNIQUE,
doc_type TEXT,
language TEXT,
stored_path TEXT,
original_filename TEXT,
created_at TEXT DEFAULT current_timestamp
);
CREATE TABLE IF NOT EXISTS chunks (
id INTEGER PRIMARY KEY,
document_id INTEGER REFERENCES documents(id) ON DELETE CASCADE,
chunk_index INTEGER,
text TEXT,
token_count INTEGER,
metadata TEXT DEFAULT '{}',
UNIQUE(document_id, chunk_index)
);
CREATE TABLE IF NOT EXISTS tags (
id INTEGER PRIMARY KEY,
name TEXT UNIQUE COLLATE NOCASE
);
CREATE TABLE IF NOT EXISTS document_tags (
document_id INTEGER REFERENCES documents(id) ON DELETE CASCADE,
tag_id INTEGER REFERENCES tags(id) ON DELETE CASCADE,
UNIQUE(document_id, tag_id)
);
CREATE TABLE IF NOT EXISTS jobs (
id INTEGER PRIMARY KEY,
filename TEXT,
status TEXT DEFAULT 'queued',
doc_type TEXT,
tags_json TEXT DEFAULT '[]',
title TEXT,
error TEXT,
document_id INTEGER,
chunk_count INTEGER DEFAULT 0,
staging_path TEXT,
content_hash TEXT,
created_at TEXT DEFAULT current_timestamp,
completed_at TEXT
);
""")
conn.commit()
# Hand the connection to the test; everything after the yield is teardown.
yield conn
conn.close()
@pytest.fixture
def sample_pdf(data_dir):
    """Write a small fake PDF into ``staging/`` and return ``(path, raw_bytes)``."""
    payload = b"%PDF-1.4 fake pdf content for testing"
    pdf_path = data_dir / "staging" / "test_upload.pdf"
    pdf_path.write_bytes(payload)
    return pdf_path, payload
class TestWorkerFileStorage:
    """Storage-lifecycle checks that replay the worker's steps by hand.

    NOTE(review): these tests re-enact the move / insert / delete sequence
    with ``shutil`` and raw SQL; they do not call the worker or the API routes.
    """

    def test_successful_ingestion_stores_file(self, data_dir, db_conn, sample_pdf):
        """7.1 - A moved file lands at ``documents/{sha256}.pdf`` and is recorded."""
        staged_path, content = sample_pdf
        digest = hashlib.sha256(content).hexdigest()
        target = data_dir / "documents" / f"{digest}.pdf"

        # The same move the worker performs after a successful ingest.
        shutil.move(str(staged_path), str(target))

        assert target.exists()
        assert target.read_bytes() == content
        assert not staged_path.exists()

        # Record the stored location the way the worker's DB update would.
        values = ("Test PDF", str(staged_path), digest, "pdf", str(target), "test_upload.pdf")
        db_conn.execute(
            "INSERT INTO documents(title, source_path, content_hash, doc_type, stored_path, original_filename) "
            "VALUES (?, ?, ?, ?, ?, ?)",
            values,
        )
        db_conn.commit()

        cursor = db_conn.execute(
            "SELECT stored_path, original_filename FROM documents WHERE content_hash = ?",
            (digest,),
        )
        record = cursor.fetchone()
        assert record["stored_path"] == str(target)
        assert record["original_filename"] == "test_upload.pdf"

    def test_failed_ingestion_no_file_in_documents(self, data_dir, sample_pdf):
        """7.2 - After a failed ingest the documents directory stays empty."""
        staged_path = sample_pdf[0]

        # Failure path: the staged file is removed and nothing is moved.
        staged_path.unlink()

        leftovers = list((data_dir / "documents").iterdir())
        assert len(leftovers) == 0

    def test_document_deletion_removes_stored_file(self, data_dir, db_conn, sample_pdf):
        """7.4 - Deleting a document removes both its row and its stored file."""
        staged_path, content = sample_pdf
        digest = hashlib.sha256(content).hexdigest()
        dest = data_dir / "documents" / f"{digest}.pdf"
        shutil.move(str(staged_path), str(dest))

        values = ("Test PDF", str(staged_path), digest, "pdf", str(dest), "test_upload.pdf")
        db_conn.execute(
            "INSERT INTO documents(title, source_path, content_hash, doc_type, stored_path, original_filename) "
            "VALUES (?, ?, ?, ?, ?, ?)",
            values,
        )
        db_conn.commit()

        # Delete the row first, then the file.
        doc = db_conn.execute(
            "SELECT id, stored_path FROM documents WHERE content_hash = ?",
            (digest,),
        ).fetchone()
        stored = Path(doc["stored_path"])
        db_conn.execute("DELETE FROM documents WHERE id = ?", (doc["id"],))
        db_conn.commit()
        if stored.exists():
            stored.unlink()

        assert not stored.exists()
        remaining = db_conn.execute("SELECT COUNT(*) FROM documents", ()).fetchone()[0]
        assert remaining == 0

    def test_download_404_for_document_without_stored_file(self, db_conn):
        """7.5 - A row inserted without ``stored_path`` reads back as NULL."""
        db_conn.execute(
            "INSERT INTO documents(title, source_path, content_hash, doc_type) "
            "VALUES (?, ?, ?, ?)",
            ("Old Doc", "/tmp/gone", "abc123", "pdf"),
        )
        db_conn.commit()

        record = db_conn.execute(
            "SELECT stored_path FROM documents WHERE content_hash = 'abc123'"
        ).fetchone()
        assert record["stored_path"] is None
class TestFileDownloadEndpoint:
    """Checks of the data the ``/api/v1/documents/{id}/file`` handler relies on.

    NOTE(review): the endpoint itself is not invoked; the filename fallback is
    re-derived inline from the stored row.
    """

    def test_file_response_uses_original_filename(self, data_dir, db_conn, sample_pdf):
        """7.3 - The original upload filename is stored separately from the title."""
        staged_path, content = sample_pdf
        digest = hashlib.sha256(content).hexdigest()
        dest = data_dir / "documents" / f"{digest}.pdf"
        shutil.move(str(staged_path), str(dest))

        values = ("My Report", str(staged_path), digest, "pdf", str(dest), "quarterly_report.pdf")
        db_conn.execute(
            "INSERT INTO documents(title, source_path, content_hash, doc_type, stored_path, original_filename) "
            "VALUES (?, ?, ?, ?, ?, ?)",
            values,
        )
        db_conn.commit()

        doc = db_conn.execute(
            "SELECT stored_path, original_filename, title FROM documents WHERE content_hash = ?",
            (digest,),
        ).fetchone()

        # The upload name must survive even though the title differs from it.
        assert doc["original_filename"] == "quarterly_report.pdf"
        assert doc["title"] == "My Report"
        assert Path(doc["stored_path"]).exists()

    def test_fallback_to_title_when_no_original_filename(self, data_dir, db_conn):
        """With ``original_filename`` NULL, the download name is title + extension."""
        fake_file = data_dir / "documents" / "somehash.pdf"
        fake_file.write_bytes(b"fake")

        db_conn.execute(
            "INSERT INTO documents(title, source_path, content_hash, doc_type, stored_path) "
            "VALUES (?, ?, ?, ?, ?)",
            ("Engine Manual", "/tmp/old", "hash456", "pdf", str(fake_file)),
        )
        db_conn.commit()

        doc = db_conn.execute(
            "SELECT original_filename, title, stored_path FROM documents WHERE content_hash = 'hash456'"
        ).fetchone()

        # Same fallback rule as the endpoint: title (or "document") + suffix.
        download_name = doc["original_filename"] or (
            (doc["title"] or "document") + Path(doc["stored_path"]).suffix
        )
        assert download_name == "Engine Manual.pdf"
@@ -0,0 +1,2 @@
schema: spec-driven
created: 2026-03-27
@@ -0,0 +1,84 @@
## Context
Currently, uploaded files pass through a staging directory and are deleted after the worker extracts chunks and embeddings. The `documents.source_path` column stores the (now-stale) staging path. Users who want the original file must re-source it externally. The data directory structure today is:
```
/data/
kb.db
hf_cache/
staging/ # temporary, cleaned after processing
```
## Goals / Non-Goals
**Goals:**
- Persist every successfully-ingested original file for the lifetime of the document
- Serve the original file via API (`GET /api/v1/documents/{id}/file`)
- Clean up stored files when a document is deleted
- Work transparently with the existing Docker volume mount (`/data`)
**Non-Goals:**
- Serving transformed/converted versions of documents (e.g. PDF→HTML)
- De-duplicating file storage (same content hash = same row, so 1:1 is fine)
- Compression or archival of stored files
- Retroactive storage of files ingested before this change (they're already gone)
## Decisions
### 1. Storage layout: content-hash-based flat directory
Store files at `{data_dir}/documents/{content_hash}{ext}` (e.g. `documents/a1b2c3...d4.pdf`).
**Why over document-ID naming:** Content hash is available at staging time before the DB row exists, avoids race conditions, and makes dedup trivially safe (same hash = same file, overwrite is harmless). The hash is already computed for dedup checks.
**Why flat over nested:** The KB is a personal tool — expected scale is hundreds to low-thousands of documents. A flat directory is simpler and sufficient. If needed later, a `ab/cd/` prefix scheme is easy to add.
**Alternatives considered:**
- *Store in SQLite as BLOBs*: Bloats the DB, complicates backups, and degrades WAL performance for large files. Rejected.
- *Keep the staging path as-is*: Staging uses UUID prefixes which are meaningless; content-hash naming is deterministic and self-deduplicating.
### 2. Move file from staging to documents dir (not copy)
Use `shutil.move()` from staging to documents dir after successful ingestion, before `staging.cleanup()`. This avoids doubling disk usage during processing.
**Why not copy-then-delete:** Move is atomic on the same filesystem (which `/data/staging` and `/data/documents` share). Faster, no temporary disk spike.
### 3. New columns `stored_path` and `original_filename` on `documents` table
Add two nullable columns:
- `stored_path TEXT` — permanent file location on disk
- `original_filename TEXT` — the exact filename from the upload (e.g. `report.pdf`)
Both are nullable because existing documents (ingested before this change) won't have values.
**Why `original_filename` separate from `title`:** The `title` field can be user-overridden (e.g. "Engine Manual" instead of `report.pdf`). When serving the file for download, the `Content-Disposition` header should use the original filename so the downloaded file has the correct name and extension. The `original_filename` is sourced from `jobs.filename` which is already captured at upload time.
Keep `source_path` as-is for backward compatibility (it records what the staging path was). `stored_path` is the permanent location.
**Migration:** Two `ALTER TABLE` statements — safe additive migrations, no data rewrite needed.
### 4. File download endpoint returns the file directly
`GET /api/v1/documents/{id}/file` uses FastAPI's `FileResponse` with:
- `media_type` derived from the file extension
- `Content-Disposition: attachment; filename="{original_filename}"` (falls back to `{title}{ext}` if `original_filename` is NULL)
- Returns 404 if `stored_path` is NULL or file is missing from disk
### 5. Delete cascades to file removal
When `DELETE /api/v1/documents/{id}` is called, delete the stored file from disk after the DB delete succeeds. If file removal fails (already gone, permissions), log a warning but don't fail the API call — the DB is the source of truth.
## Risks / Trade-offs
- **Disk usage increases** — every ingested file persists. For the personal-use scale this is expected and acceptable. Users manage this via document deletion.
→ Mitigation: Document the storage behavior; `GET /api/v1/status` already shows DB size, could add documents-dir size later.
- **Pre-existing documents have no stored file** — `stored_path` will be NULL for documents ingested before this change.
→ Mitigation: The download endpoint returns 404 with a clear message ("original file not available — ingested before document storage was enabled"). No attempt to backfill.
- **File-DB consistency** — crash between DB commit and file move could leave orphan staged files or missing stored files.
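→ Mitigation: Commit the document row first, then move the file and record `stored_path` in a follow-up update. If the move or the follow-up update fails, the worker logs a warning and the job still completes; the document simply has no stored file (`stored_path` stays NULL) and the download endpoint returns 404. If the move succeeded but the update failed, the file left in the documents dir is a harmless orphan (orphan cleanup can be added later).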
## Open Questions
None — the scope is straightforward enough to proceed.
@@ -0,0 +1,30 @@
## Why
The knowledge base currently discards original files after chunking and embedding. Once a document is ingested, only the extracted text chunks and vectors remain — the original PDF, markdown, or code file is deleted from staging. Users cannot retrieve the source document from the KB, which limits its usefulness as a document store and prevents use cases like re-processing with a different model or serving the original file to downstream tools.
## What Changes
- Add a persistent document storage directory (`{data_dir}/documents/`) alongside the SQLite database
- After successful ingestion, move the original file from staging to permanent storage instead of deleting it
- Store the permanent file path in the `documents` table (`stored_path` column) and the original upload filename (`original_filename` column) so downloads use the correct name
- Add an API endpoint to download the original file by document ID
- Add a CLI command to export/retrieve the original document
- **BREAKING**: Delete document now also removes the stored file from disk
- Notes (text-only) are stored as `.note` files in the same directory for consistency
## Capabilities
### New Capabilities
- `document-storage`: Persistent storage of original uploaded files on disk, lifecycle management (store on ingest, delete on document removal), and retrieval via API
### Modified Capabilities
- `engine-api`: New endpoint `GET /api/v1/documents/{id}/file` to download the original file; delete endpoint must also clean up stored files; ingestion worker stores files instead of discarding them
## Impact
- **Engine config**: New `documents_dir` property on Config, new directory created at startup via `ensure_dirs()`
- **Worker**: After successful chunking, move the file from staging to the documents dir; set `stored_path` to the permanent location (`source_path` is left unchanged)
- **Database schema**: Add `stored_path` and `original_filename` columns to `documents` table (migration for existing DBs)
- **Routes**: New file-download endpoint; update delete handler to remove stored file
- **Go client**: New `export` / `get-file` subcommand to download original documents
- **Docker**: `documents/` directory lives inside the existing `/data` volume — no new mounts needed
@@ -0,0 +1,83 @@
## ADDED Requirements
### Requirement: Persistent original file storage
The engine SHALL persistently store the original uploaded file on disk after successful ingestion. Files SHALL be stored at `{data_dir}/documents/{content_hash}{extension}` where `content_hash` is the SHA-256 hex digest already computed for dedup and `extension` is preserved from the original filename. The `documents` table SHALL record the stored file path in a `stored_path` column and the original upload filename in an `original_filename` column.
#### Scenario: File stored after successful ingestion
- **WHEN** the background worker successfully processes an ingestion job for a PDF file
- **THEN** the worker SHALL move the staged file to `{data_dir}/documents/{content_hash}.pdf`, store the permanent path in `documents.stored_path`, store the original filename in `documents.original_filename`, and delete the staging entry
#### Scenario: Note stored after successful ingestion
- **WHEN** the background worker successfully processes an ingestion job for a text note
- **THEN** the worker SHALL move the staged `.note` file to `{data_dir}/documents/{content_hash}.note` and store the permanent path in `documents.stored_path`
#### Scenario: Markdown file stored after successful ingestion
- **WHEN** the background worker successfully processes an ingestion job for a markdown file
- **THEN** the worker SHALL move the staged file to `{data_dir}/documents/{content_hash}.md` and store the permanent path in `documents.stored_path`
#### Scenario: Code file stored after successful ingestion
- **WHEN** the background worker successfully processes an ingestion job for a code file (e.g. `.py`, `.go`)
- **THEN** the worker SHALL move the staged file to `{data_dir}/documents/{content_hash}{original_extension}` and store the permanent path in `documents.stored_path`
#### Scenario: Documents directory created at startup
- **WHEN** the engine starts up and calls `ensure_dirs()`
- **THEN** the `{data_dir}/documents/` directory SHALL be created if it does not exist
#### Scenario: Ingestion failure does not store file
- **WHEN** the background worker fails to process an ingestion job
- **THEN** the staged file SHALL be cleaned up as before and no file SHALL be written to the documents directory
---
### Requirement: File retrieval via API
The engine SHALL serve the original stored file for any document that has a stored file on disk.
#### Scenario: Download original file
- **WHEN** a client sends `GET /api/v1/documents/{id}/file` for a document with a stored file
- **THEN** the engine SHALL return the file with appropriate `Content-Type` based on file extension and `Content-Disposition: attachment; filename="{original_filename}"` header, falling back to `{title}{ext}` if `original_filename` is NULL
#### Scenario: Download file for pre-existing document
- **WHEN** a client sends `GET /api/v1/documents/{id}/file` for a document ingested before this feature was added (stored_path is NULL)
- **THEN** the engine SHALL return HTTP 404 with `{"error": "Original file not available - ingested before document storage was enabled"}`
#### Scenario: Download file when file missing from disk
- **WHEN** a client sends `GET /api/v1/documents/{id}/file` for a document whose `stored_path` is set but the file no longer exists on disk
- **THEN** the engine SHALL return HTTP 404 with `{"error": "Stored file not found on disk"}`
#### Scenario: Download file for non-existent document
- **WHEN** a client sends `GET /api/v1/documents/{id}/file` with a non-existent document ID
- **THEN** the engine SHALL return HTTP 404 with `{"error": "Document not found"}`
---
### Requirement: File cleanup on document deletion
The engine SHALL remove the stored original file from disk when a document is deleted.
#### Scenario: Delete document with stored file
- **WHEN** a client sends `DELETE /api/v1/documents/{id}` for a document with a stored file
- **THEN** the engine SHALL delete the document from the database (cascading to chunks, embeddings, tags) AND delete the stored file from disk
#### Scenario: Delete document when stored file already missing
- **WHEN** a client sends `DELETE /api/v1/documents/{id}` for a document whose stored file has been manually removed from disk
- **THEN** the engine SHALL delete the document from the database successfully and log a warning about the missing file
#### Scenario: Delete document without stored file (pre-existing)
- **WHEN** a client sends `DELETE /api/v1/documents/{id}` for a document with `stored_path` NULL
- **THEN** the engine SHALL delete the document from the database without attempting file removal
---
### Requirement: Database schema migration for stored_path and original_filename
The engine SHALL add `stored_path` and `original_filename` columns to the `documents` table for tracking permanent file locations and original upload filenames.
#### Scenario: Fresh database initialization
- **WHEN** the engine initializes a new database
- **THEN** the `documents` table SHALL include `stored_path TEXT` and `original_filename TEXT` columns in its schema
#### Scenario: Existing database migration
- **WHEN** the engine starts with a database created before this feature
- **THEN** the engine SHALL add `stored_path TEXT` and `original_filename TEXT` to the `documents` table via `ALTER TABLE` if the columns do not exist
@@ -0,0 +1,61 @@
## MODIFIED Requirements
### Requirement: Background ingestion worker
The engine SHALL run a background worker that processes queued jobs. The worker SHALL process one job at a time. For each job, it SHALL: detect document type, run the appropriate chunking pipeline (Docling for PDFs, header-based for Markdown, AST-based for code, whole-text for notes), generate embeddings using the resident model, insert chunks and vectors into the database, and move the original file to persistent storage.
#### Scenario: Successful PDF ingestion
- **WHEN** the background worker picks up a queued PDF job
- **THEN** it SHALL update the job status to `processing`, run Docling conversion and chunking, embed all chunks, insert document and chunks into the database, move the staged file to `{data_dir}/documents/{content_hash}.pdf`, update `documents.stored_path` with the permanent path, store the original filename in `documents.original_filename`, update the job status to `done` with the resulting document_id and chunk count, and clean up the staging entry
#### Scenario: Ingestion failure
- **WHEN** the background worker encounters an error during processing (e.g., corrupt PDF)
- **THEN** it SHALL update the job status to `failed` with the error message, delete the staged file, and continue processing the next queued job
#### Scenario: Search during active ingestion
- **WHEN** a search request arrives while the background worker is processing a job
- **THEN** the search SHALL execute without blocking (SQLite WAL mode) and return results from already-ingested documents
---
### Requirement: Document management
The engine SHALL provide endpoints to list, inspect, remove, and download original files for ingested documents.
#### Scenario: List documents
- **WHEN** a client sends `GET /api/v1/documents`
- **THEN** the engine SHALL return a JSON array of documents with id, title, doc_type, tags, chunk_count, and created_at
#### Scenario: List documents with filters
- **WHEN** a client sends `GET /api/v1/documents?type=pdf&tags=manual`
- **THEN** the engine SHALL return only documents matching all specified filters
#### Scenario: Get document details
- **WHEN** a client sends `GET /api/v1/documents/{id}`
- **THEN** the engine SHALL return the full document record including all chunks, their text content, and whether the original file is available (`has_file: true/false`)
#### Scenario: Download original file
- **WHEN** a client sends `GET /api/v1/documents/{id}/file`
- **THEN** the engine SHALL return the original file with appropriate Content-Type and `Content-Disposition: attachment; filename="{original_filename}"` headers, or HTTP 404 if the file is not available
#### Scenario: Remove a document
- **WHEN** a client sends `DELETE /api/v1/documents/{id}`
- **THEN** the engine SHALL delete the document, all its chunks, associated embeddings, tag associations, and the stored original file from disk, and return HTTP 200 with a confirmation
#### Scenario: Remove non-existent document
- **WHEN** a client sends `DELETE /api/v1/documents/{id}` with a non-existent ID
- **THEN** the engine SHALL return HTTP 404
---
### Requirement: Engine configuration via environment variables
The engine SHALL be configured via environment variables. No config file is read by the engine — all configuration comes from the environment (set via compose.yaml or Docker run).
#### Scenario: Default configuration
- **WHEN** the engine starts with no environment variables set
- **THEN** it SHALL use defaults: data directory `/data`, model `all-MiniLM-L6-v2`, device `auto`, no API key required. It SHALL create `staging/` and `documents/` subdirectories under the data directory.
#### Scenario: Custom model
- **WHEN** `KB_MODEL` is set to `BAAI/bge-small-en-v1.5`
- **THEN** the engine SHALL download and load that model instead of the default
@@ -0,0 +1,38 @@
## 1. Config and Schema
- [x] 1.1 Add `documents_dir` property to `Config` in `engine/kb/config.py` returning `{data_dir}/documents`
- [x] 1.2 Add `documents_dir.mkdir()` to `Config.ensure_dirs()`
- [x] 1.3 Add `stored_path TEXT` and `original_filename TEXT` columns to `documents` table in `init_schema()` (both CREATE TABLE and ALTER TABLE migration for existing DBs)
## 2. Worker — File Persistence
- [x] 2.1 In `worker._process_job()`, after successful DB commit, move staged file to `{documents_dir}/{content_hash}{ext}` using `shutil.move()`
- [x] 2.2 Update `documents.stored_path` and `documents.original_filename` (from `jobs.filename`) after moving the file
- [x] 2.3 Remove `staging.cleanup()` call for successful jobs (file is moved, not deleted); keep cleanup on failure path
## 3. API — File Download Endpoint
- [x] 3.1 Add `GET /api/v1/documents/{id}/file` route in `engine/kb/routes/documents.py` using FastAPI `FileResponse`
- [x] 3.2 Return an appropriate `Content-Type` derived from the file extension and `Content-Disposition: attachment; filename="{original_filename}"` (falling back to `{title}{ext}` if `original_filename` is NULL)
- [x] 3.3 Handle 404 cases: document not found, `stored_path` is NULL, file missing from disk
## 4. API — Delete Cleanup
- [x] 4.1 Update `DELETE /api/v1/documents/{id}` in `engine/kb/routes/documents.py` to also delete the stored file from disk
- [x] 4.2 Handle missing file gracefully (log warning, don't fail the request)
## 5. Document Details Enhancement
- [x] 5.1 Add `has_file` boolean to `GET /api/v1/documents/{id}` response based on `stored_path` presence and file existence on disk
## 6. Go Client
- [x] 6.1 Add `kb export <doc_id>` subcommand to the Go client that calls `GET /api/v1/documents/{id}/file` and writes the file to the path given by `-o/--output` (default: the original filename in the current directory; `-o -` writes to stdout)
## 7. Testing
- [x] 7.1 Test successful ingestion stores file at expected path
- [x] 7.2 Test failed ingestion does not leave file in documents dir
- [x] 7.3 Test file download endpoint returns correct content and headers
- [x] 7.4 Test document deletion removes stored file
- [x] 7.5 Test download returns 404 for documents without stored files
+89
View File
@@ -0,0 +1,89 @@
# Document Storage
## Purpose
Persistent storage, retrieval, and lifecycle management of original uploaded document files.
## Requirements
### Requirement: Persistent original file storage
The engine SHALL persistently store the original uploaded file on disk after successful ingestion. Files SHALL be stored at `{data_dir}/documents/{content_hash}{extension}` where `content_hash` is the SHA-256 hex digest already computed for dedup and `extension` is preserved from the original filename. The `documents` table SHALL record the stored file path in a `stored_path` column and the original upload filename in an `original_filename` column.
#### Scenario: File stored after successful ingestion
- **WHEN** the background worker successfully processes an ingestion job for a PDF file
- **THEN** the worker SHALL move the staged file to `{data_dir}/documents/{content_hash}.pdf`, store the permanent path in `documents.stored_path`, store the original filename in `documents.original_filename`, and delete the staging entry
#### Scenario: Note stored after successful ingestion
- **WHEN** the background worker successfully processes an ingestion job for a text note
- **THEN** the worker SHALL move the staged `.note` file to `{data_dir}/documents/{content_hash}.note` and store the permanent path in `documents.stored_path`
#### Scenario: Markdown file stored after successful ingestion
- **WHEN** the background worker successfully processes an ingestion job for a markdown file
- **THEN** the worker SHALL move the staged file to `{data_dir}/documents/{content_hash}.md` and store the permanent path in `documents.stored_path`
#### Scenario: Code file stored after successful ingestion
- **WHEN** the background worker successfully processes an ingestion job for a code file (e.g. `.py`, `.go`)
- **THEN** the worker SHALL move the staged file to `{data_dir}/documents/{content_hash}{original_extension}` and store the permanent path in `documents.stored_path`
#### Scenario: Documents directory created at startup
- **WHEN** the engine starts up and calls `ensure_dirs()`
- **THEN** the `{data_dir}/documents/` directory SHALL be created if it does not exist
#### Scenario: Ingestion failure does not store file
- **WHEN** the background worker fails to process an ingestion job
- **THEN** the staged file SHALL be cleaned up (the existing failure-path behavior is unchanged) and no file SHALL be written to the documents directory
---
### Requirement: File retrieval via API
The engine SHALL serve the original stored file for any document that has a stored file on disk.
#### Scenario: Download original file
- **WHEN** a client sends `GET /api/v1/documents/{id}/file` for a document with a stored file
- **THEN** the engine SHALL return the file with appropriate `Content-Type` based on file extension and `Content-Disposition: attachment; filename="{original_filename}"` header, falling back to `{title}{ext}` if `original_filename` is NULL
#### Scenario: Download file for pre-existing document
- **WHEN** a client sends `GET /api/v1/documents/{id}/file` for a document ingested before this feature was added (stored_path is NULL)
- **THEN** the engine SHALL return HTTP 404 with `{"error": "Original file not available - ingested before document storage was enabled"}`
#### Scenario: Download file when file missing from disk
- **WHEN** a client sends `GET /api/v1/documents/{id}/file` for a document whose `stored_path` is set but the file no longer exists on disk
- **THEN** the engine SHALL return HTTP 404 with `{"error": "Stored file not found on disk"}`
#### Scenario: Download file for non-existent document
- **WHEN** a client sends `GET /api/v1/documents/{id}/file` with a non-existent document ID
- **THEN** the engine SHALL return HTTP 404 with `{"error": "Document not found"}`
---
### Requirement: File cleanup on document deletion
The engine SHALL remove the stored original file from disk when a document is deleted.
#### Scenario: Delete document with stored file
- **WHEN** a client sends `DELETE /api/v1/documents/{id}` for a document with a stored file
- **THEN** the engine SHALL delete the document from the database (cascading to chunks, embeddings, tags) AND delete the stored file from disk
#### Scenario: Delete document when stored file already missing
- **WHEN** a client sends `DELETE /api/v1/documents/{id}` for a document whose stored file has been manually removed from disk
- **THEN** the engine SHALL delete the document from the database successfully and log a warning about the missing file
#### Scenario: Delete document without stored file (pre-existing)
- **WHEN** a client sends `DELETE /api/v1/documents/{id}` for a document with `stored_path` NULL
- **THEN** the engine SHALL delete the document from the database without attempting file removal
---
### Requirement: Database schema migration for stored_path and original_filename
The engine SHALL add `stored_path` and `original_filename` columns to the `documents` table for tracking permanent file locations and original upload filenames.
#### Scenario: Fresh database initialization
- **WHEN** the engine initializes a new database
- **THEN** the `documents` table SHALL include `stored_path TEXT` and `original_filename TEXT` columns in its schema
#### Scenario: Existing database migration
- **WHEN** the engine starts with a database created before this feature
- **THEN** the engine SHALL add `stored_path TEXT` and `original_filename TEXT` to the `documents` table via `ALTER TABLE` if the columns do not exist
+10 -6
View File
@@ -128,11 +128,11 @@ The engine SHALL maintain job records in SQLite with status tracking. Jobs SHALL
### Requirement: Background ingestion worker
The engine SHALL run a background worker that processes queued jobs. The worker SHALL process one job at a time. For each job, it SHALL: detect document type, run the appropriate chunking pipeline (Docling for PDFs, header-based for Markdown, AST-based for code, whole-text for notes), generate embeddings using the resident model, and insert chunks and vectors into the database.
The engine SHALL run a background worker that processes queued jobs. The worker SHALL process one job at a time. For each job, it SHALL: detect document type, run the appropriate chunking pipeline (Docling for PDFs, header-based for Markdown, AST-based for code, whole-text for notes), generate embeddings using the resident model, insert chunks and vectors into the database, and move the original file to persistent storage.
#### Scenario: Successful PDF ingestion
- **WHEN** the background worker picks up a queued PDF job
- **THEN** it SHALL update the job status to `processing`, run Docling conversion and chunking, embed all chunks, insert document and chunks into the database, update the job status to `done` with the resulting document_id and chunk count, and delete the staged file
- **THEN** it SHALL update the job status to `processing`, run Docling conversion and chunking, embed all chunks, insert document and chunks into the database, move the staged file to `{data_dir}/documents/{content_hash}.pdf`, update `documents.stored_path` with the permanent path, store the original filename in `documents.original_filename`, update the job status to `done` with the resulting document_id and chunk count, and clean up the staging entry
#### Scenario: Ingestion failure
- **WHEN** the background worker encounters an error during processing (e.g., corrupt PDF)
@@ -146,7 +146,7 @@ The engine SHALL run a background worker that processes queued jobs. The worker
### Requirement: Document management
The engine SHALL provide endpoints to list, inspect, and remove ingested documents.
The engine SHALL provide endpoints to list, inspect, and remove ingested documents, and to download their original files.
#### Scenario: List documents
- **WHEN** a client sends `GET /api/v1/documents`
@@ -158,11 +158,15 @@ The engine SHALL provide endpoints to list, inspect, and remove ingested documen
#### Scenario: Get document details
- **WHEN** a client sends `GET /api/v1/documents/{id}`
- **THEN** the engine SHALL return the full document record including all chunks and their text content
- **THEN** the engine SHALL return the full document record including all chunks, their text content, and whether the original file is available (`has_file: true/false`)
#### Scenario: Download original file
- **WHEN** a client sends `GET /api/v1/documents/{id}/file`
- **THEN** the engine SHALL return the original file with appropriate Content-Type and `Content-Disposition: attachment; filename="{original_filename}"` headers, or HTTP 404 if the file is not available
#### Scenario: Remove a document
- **WHEN** a client sends `DELETE /api/v1/documents/{id}`
- **THEN** the engine SHALL delete the document, all its chunks, associated embeddings, and tag associations, and return HTTP 200 with a confirmation
- **THEN** the engine SHALL delete the document, all its chunks, associated embeddings, tag associations, and the stored original file from disk, and return HTTP 200 with a confirmation
#### Scenario: Remove non-existent document
- **WHEN** a client sends `DELETE /api/v1/documents/{id}` with a non-existent ID
@@ -230,7 +234,7 @@ The engine SHALL be configured via environment variables. No config file is read
#### Scenario: Default configuration
- **WHEN** the engine starts with no environment variables set
- **THEN** it SHALL use defaults: data directory `/data`, model `all-MiniLM-L6-v2`, device `auto`, no API key required
- **THEN** it SHALL use defaults: data directory `/data`, model `all-MiniLM-L6-v2`, device `auto`, no API key required. It SHALL create `staging/` and `documents/` subdirectories under the data directory.
#### Scenario: Custom model
- **WHEN** `KB_MODEL` is set to `BAAI/bge-small-en-v1.5`