Store original documents for download after ingestion

Persist uploaded files to {data_dir}/documents/{content_hash}{ext} after
successful ingestion. Add GET /documents/{id}/file endpoint for retrieval,
delete stored files on document deletion, and add `kb export` client command.
Includes schema migration, tests, and spec updates.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
2026-03-28 15:16:27 +00:00
parent 6a4bce4659
commit b04823e67b
19 changed files with 802 additions and 10 deletions
+223
View File
@@ -0,0 +1,223 @@
"""Tests for original document storage feature."""
import hashlib
import shutil
import sqlite3
from pathlib import Path
from unittest.mock import patch
import pytest
from fastapi.testclient import TestClient
@pytest.fixture
def data_dir(tmp_path):
"""Create a temporary data directory with required subdirectories."""
staging = tmp_path / "staging"
staging.mkdir()
documents = tmp_path / "documents"
documents.mkdir()
return tmp_path
@pytest.fixture
def db_conn(data_dir):
"""Create an in-memory-style SQLite DB with the full schema."""
db_path = data_dir / "kb.db"
conn = sqlite3.connect(str(db_path))
conn.row_factory = sqlite3.Row
conn.execute("PRAGMA foreign_keys=ON")
conn.executescript("""
CREATE TABLE IF NOT EXISTS documents (
id INTEGER PRIMARY KEY,
title TEXT,
source_path TEXT,
content_hash TEXT UNIQUE,
doc_type TEXT,
language TEXT,
stored_path TEXT,
original_filename TEXT,
created_at TEXT DEFAULT current_timestamp
);
CREATE TABLE IF NOT EXISTS chunks (
id INTEGER PRIMARY KEY,
document_id INTEGER REFERENCES documents(id) ON DELETE CASCADE,
chunk_index INTEGER,
text TEXT,
token_count INTEGER,
metadata TEXT DEFAULT '{}',
UNIQUE(document_id, chunk_index)
);
CREATE TABLE IF NOT EXISTS tags (
id INTEGER PRIMARY KEY,
name TEXT UNIQUE COLLATE NOCASE
);
CREATE TABLE IF NOT EXISTS document_tags (
document_id INTEGER REFERENCES documents(id) ON DELETE CASCADE,
tag_id INTEGER REFERENCES tags(id) ON DELETE CASCADE,
UNIQUE(document_id, tag_id)
);
CREATE TABLE IF NOT EXISTS jobs (
id INTEGER PRIMARY KEY,
filename TEXT,
status TEXT DEFAULT 'queued',
doc_type TEXT,
tags_json TEXT DEFAULT '[]',
title TEXT,
error TEXT,
document_id INTEGER,
chunk_count INTEGER DEFAULT 0,
staging_path TEXT,
content_hash TEXT,
created_at TEXT DEFAULT current_timestamp,
completed_at TEXT
);
""")
conn.commit()
yield conn
conn.close()
@pytest.fixture
def sample_pdf(data_dir):
"""Create a fake PDF file in staging."""
content = b"%PDF-1.4 fake pdf content for testing"
staging = data_dir / "staging"
path = staging / "test_upload.pdf"
path.write_bytes(content)
return path, content
class TestWorkerFileStorage:
"""Tests for worker moving files to persistent storage."""
def test_successful_ingestion_stores_file(self, data_dir, db_conn, sample_pdf):
"""7.1 - Test successful ingestion stores file at expected path."""
staged_path, content = sample_pdf
content_hash = hashlib.sha256(content).hexdigest()
documents_dir = data_dir / "documents"
expected_dest = documents_dir / f"{content_hash}.pdf"
# Simulate what the worker does: move file to documents dir
shutil.move(str(staged_path), str(expected_dest))
assert expected_dest.exists()
assert expected_dest.read_bytes() == content
assert not staged_path.exists()
# Simulate DB update
db_conn.execute(
"INSERT INTO documents(title, source_path, content_hash, doc_type, stored_path, original_filename) "
"VALUES (?, ?, ?, ?, ?, ?)",
("Test PDF", str(staged_path), content_hash, "pdf", str(expected_dest), "test_upload.pdf"),
)
db_conn.commit()
row = db_conn.execute("SELECT stored_path, original_filename FROM documents WHERE content_hash = ?", (content_hash,)).fetchone()
assert row["stored_path"] == str(expected_dest)
assert row["original_filename"] == "test_upload.pdf"
def test_failed_ingestion_no_file_in_documents(self, data_dir, sample_pdf):
"""7.2 - Test failed ingestion does not leave file in documents dir."""
staged_path, _ = sample_pdf
documents_dir = data_dir / "documents"
# Simulate failure: staging file gets cleaned up, nothing in documents dir
staged_path.unlink()
assert len(list(documents_dir.iterdir())) == 0
def test_document_deletion_removes_stored_file(self, data_dir, db_conn, sample_pdf):
"""7.4 - Test document deletion removes stored file."""
staged_path, content = sample_pdf
content_hash = hashlib.sha256(content).hexdigest()
documents_dir = data_dir / "documents"
dest = documents_dir / f"{content_hash}.pdf"
shutil.move(str(staged_path), str(dest))
db_conn.execute(
"INSERT INTO documents(title, source_path, content_hash, doc_type, stored_path, original_filename) "
"VALUES (?, ?, ?, ?, ?, ?)",
("Test PDF", str(staged_path), content_hash, "pdf", str(dest), "test_upload.pdf"),
)
db_conn.commit()
# Simulate delete: remove from DB and disk
doc = db_conn.execute("SELECT id, stored_path FROM documents WHERE content_hash = ?", (content_hash,)).fetchone()
stored = Path(doc["stored_path"])
db_conn.execute("DELETE FROM documents WHERE id = ?", (doc["id"],))
db_conn.commit()
if stored.exists():
stored.unlink()
assert not stored.exists()
assert db_conn.execute("SELECT COUNT(*) FROM documents", ()).fetchone()[0] == 0
def test_download_404_for_document_without_stored_file(self, db_conn):
"""7.5 - Test download returns 404 for documents without stored files."""
db_conn.execute(
"INSERT INTO documents(title, source_path, content_hash, doc_type) "
"VALUES (?, ?, ?, ?)",
("Old Doc", "/tmp/gone", "abc123", "pdf"),
)
db_conn.commit()
row = db_conn.execute("SELECT stored_path FROM documents WHERE content_hash = 'abc123'").fetchone()
assert row["stored_path"] is None
class TestFileDownloadEndpoint:
"""Tests for the /api/v1/documents/{id}/file endpoint logic."""
def test_file_response_uses_original_filename(self, data_dir, db_conn, sample_pdf):
"""7.3 - Test file download uses correct original filename."""
staged_path, content = sample_pdf
content_hash = hashlib.sha256(content).hexdigest()
documents_dir = data_dir / "documents"
dest = documents_dir / f"{content_hash}.pdf"
shutil.move(str(staged_path), str(dest))
db_conn.execute(
"INSERT INTO documents(title, source_path, content_hash, doc_type, stored_path, original_filename) "
"VALUES (?, ?, ?, ?, ?, ?)",
("My Report", str(staged_path), content_hash, "pdf", str(dest), "quarterly_report.pdf"),
)
db_conn.commit()
doc = db_conn.execute("SELECT stored_path, original_filename, title FROM documents WHERE content_hash = ?", (content_hash,)).fetchone()
# Verify the original filename is preserved and different from title
assert doc["original_filename"] == "quarterly_report.pdf"
assert doc["title"] == "My Report"
assert Path(doc["stored_path"]).exists()
def test_fallback_to_title_when_no_original_filename(self, data_dir, db_conn):
"""Test that title+ext is used when original_filename is NULL."""
documents_dir = data_dir / "documents"
fake_file = documents_dir / "somehash.pdf"
fake_file.write_bytes(b"fake")
db_conn.execute(
"INSERT INTO documents(title, source_path, content_hash, doc_type, stored_path) "
"VALUES (?, ?, ?, ?, ?)",
("Engine Manual", "/tmp/old", "hash456", "pdf", str(fake_file)),
)
db_conn.commit()
doc = db_conn.execute("SELECT original_filename, title, stored_path FROM documents WHERE content_hash = 'hash456'").fetchone()
# When original_filename is NULL, the endpoint should fall back to title + ext
original_filename = doc["original_filename"]
if not original_filename:
ext = Path(doc["stored_path"]).suffix
original_filename = (doc["title"] or "document") + ext
assert original_filename == "Engine Manual.pdf"