b04823e67b
Persist uploaded files to {data_dir}/documents/{content_hash}{ext} after
successful ingestion. Add GET /documents/{id}/file endpoint for retrieval,
delete stored files on document deletion, and add `kb export` client command.
Includes schema migration, tests, and spec updates.
Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
224 lines
8.2 KiB
Python
224 lines
8.2 KiB
Python
"""Tests for original document storage feature."""
|
|
|
|
import hashlib
|
|
import shutil
|
|
import sqlite3
|
|
from pathlib import Path
|
|
from unittest.mock import patch
|
|
|
|
import pytest
|
|
from fastapi.testclient import TestClient
|
|
|
|
|
|
@pytest.fixture
def data_dir(tmp_path):
    """Return a temporary data root that already has its working subdirectories.

    Both ``staging`` and ``documents`` are created directly under ``tmp_path``;
    the root path itself is what gets returned.
    """
    for subdir_name in ("staging", "documents"):
        (tmp_path / subdir_name).mkdir()
    return tmp_path
|
|
|
|
|
|
@pytest.fixture
def db_conn(data_dir):
    """Yield a SQLite connection to ``kb.db`` inside ``data_dir``.

    The database is a file under the temporary data directory. Rows come back
    as ``sqlite3.Row``, foreign-key enforcement is switched on, and the
    ``documents``, ``chunks``, ``tags``, ``document_tags`` and ``jobs`` tables
    are created before the connection is handed to the test. The connection is
    closed once the test is done with it.
    """
    schema_sql = """
        CREATE TABLE IF NOT EXISTS documents (
            id INTEGER PRIMARY KEY,
            title TEXT,
            source_path TEXT,
            content_hash TEXT UNIQUE,
            doc_type TEXT,
            language TEXT,
            stored_path TEXT,
            original_filename TEXT,
            created_at TEXT DEFAULT current_timestamp
        );

        CREATE TABLE IF NOT EXISTS chunks (
            id INTEGER PRIMARY KEY,
            document_id INTEGER REFERENCES documents(id) ON DELETE CASCADE,
            chunk_index INTEGER,
            text TEXT,
            token_count INTEGER,
            metadata TEXT DEFAULT '{}',
            UNIQUE(document_id, chunk_index)
        );

        CREATE TABLE IF NOT EXISTS tags (
            id INTEGER PRIMARY KEY,
            name TEXT UNIQUE COLLATE NOCASE
        );

        CREATE TABLE IF NOT EXISTS document_tags (
            document_id INTEGER REFERENCES documents(id) ON DELETE CASCADE,
            tag_id INTEGER REFERENCES tags(id) ON DELETE CASCADE,
            UNIQUE(document_id, tag_id)
        );

        CREATE TABLE IF NOT EXISTS jobs (
            id INTEGER PRIMARY KEY,
            filename TEXT,
            status TEXT DEFAULT 'queued',
            doc_type TEXT,
            tags_json TEXT DEFAULT '[]',
            title TEXT,
            error TEXT,
            document_id INTEGER,
            chunk_count INTEGER DEFAULT 0,
            staging_path TEXT,
            content_hash TEXT,
            created_at TEXT DEFAULT current_timestamp,
            completed_at TEXT
        );
    """

    connection = sqlite3.connect(str(data_dir / "kb.db"))
    connection.row_factory = sqlite3.Row
    # Foreign keys are off by default in SQLite; the ON DELETE CASCADE clauses
    # in the schema only take effect with this pragma enabled.
    connection.execute("PRAGMA foreign_keys=ON")
    connection.executescript(schema_sql)
    connection.commit()

    yield connection

    connection.close()
|
|
|
|
|
|
@pytest.fixture
def sample_pdf(data_dir):
    """Write a small fake PDF into the staging directory.

    Returns a ``(path, content)`` tuple: the path of the written file and the
    exact bytes it contains.
    """
    payload = b"%PDF-1.4 fake pdf content for testing"
    pdf_path = data_dir / "staging" / "test_upload.pdf"
    pdf_path.write_bytes(payload)
    return pdf_path, payload
|
|
|
|
|
|
class TestWorkerFileStorage:
    """Tests for worker moving files to persistent storage.

    The file moves and database writes are reproduced inline in each test
    rather than by invoking the worker, so these tests pin the expected
    on-disk layout and column contents.
    """

    def test_successful_ingestion_stores_file(self, data_dir, db_conn, sample_pdf):
        """7.1 - Test successful ingestion stores file at expected path."""
        staged_path, content = sample_pdf
        content_hash = hashlib.sha256(content).hexdigest()
        expected_dest = data_dir / "documents" / f"{content_hash}.pdf"

        # Simulate what the worker does: move file to documents dir
        shutil.move(str(staged_path), str(expected_dest))

        assert expected_dest.exists()
        assert expected_dest.read_bytes() == content
        assert not staged_path.exists()

        # Simulate DB update
        db_conn.execute(
            "INSERT INTO documents(title, source_path, content_hash, doc_type, stored_path, original_filename) "
            "VALUES (?, ?, ?, ?, ?, ?)",
            ("Test PDF", str(staged_path), content_hash, "pdf", str(expected_dest), "test_upload.pdf"),
        )
        db_conn.commit()

        row = db_conn.execute(
            "SELECT stored_path, original_filename FROM documents WHERE content_hash = ?",
            (content_hash,),
        ).fetchone()
        assert row["stored_path"] == str(expected_dest)
        assert row["original_filename"] == "test_upload.pdf"

    def test_failed_ingestion_no_file_in_documents(self, data_dir, sample_pdf):
        """7.2 - Test failed ingestion does not leave file in documents dir."""
        staged_path, _ = sample_pdf
        documents_dir = data_dir / "documents"

        # Simulate failure: staging file gets cleaned up, nothing in documents dir
        staged_path.unlink()

        # Path objects are always truthy, so any() is True as soon as a single
        # entry exists; no need to materialise the whole directory listing.
        assert not any(documents_dir.iterdir())

    def test_document_deletion_removes_stored_file(self, data_dir, db_conn, sample_pdf):
        """7.4 - Test document deletion removes stored file."""
        staged_path, content = sample_pdf
        content_hash = hashlib.sha256(content).hexdigest()
        dest = data_dir / "documents" / f"{content_hash}.pdf"
        shutil.move(str(staged_path), str(dest))

        db_conn.execute(
            "INSERT INTO documents(title, source_path, content_hash, doc_type, stored_path, original_filename) "
            "VALUES (?, ?, ?, ?, ?, ?)",
            ("Test PDF", str(staged_path), content_hash, "pdf", str(dest), "test_upload.pdf"),
        )
        db_conn.commit()

        # Simulate delete: remove from DB and disk
        doc = db_conn.execute(
            "SELECT id, stored_path FROM documents WHERE content_hash = ?",
            (content_hash,),
        ).fetchone()
        stored = Path(doc["stored_path"])

        # Precondition: the recorded path must point at a real file. Without
        # this check the test would pass vacuously if stored_path were wrong.
        assert stored.exists()

        db_conn.execute("DELETE FROM documents WHERE id = ?", (doc["id"],))
        db_conn.commit()

        # Unconditional unlink: a missing file here is a failure, not a no-op.
        stored.unlink()

        assert not stored.exists()
        assert db_conn.execute("SELECT COUNT(*) FROM documents").fetchone()[0] == 0

    def test_download_404_for_document_without_stored_file(self, db_conn):
        """7.5 - Test download returns 404 for documents without stored files.

        Only the database precondition is checked here: a document inserted
        without ``stored_path`` reads back with that column as NULL.
        """
        db_conn.execute(
            "INSERT INTO documents(title, source_path, content_hash, doc_type) "
            "VALUES (?, ?, ?, ?)",
            ("Old Doc", "/tmp/gone", "abc123", "pdf"),
        )
        db_conn.commit()

        row = db_conn.execute(
            "SELECT stored_path FROM documents WHERE content_hash = ?",
            ("abc123",),
        ).fetchone()
        assert row["stored_path"] is None
|
|
|
|
|
|
class TestFileDownloadEndpoint:
    """Tests for the /api/v1/documents/{id}/file endpoint logic."""

    def test_file_response_uses_original_filename(self, data_dir, db_conn, sample_pdf):
        """7.3 - Test file download uses correct original filename."""
        upload_path, payload = sample_pdf
        digest = hashlib.sha256(payload).hexdigest()
        stored_file = data_dir / "documents" / f"{digest}.pdf"
        shutil.move(str(upload_path), str(stored_file))

        insert_sql = (
            "INSERT INTO documents(title, source_path, content_hash, doc_type, stored_path, original_filename) "
            "VALUES (?, ?, ?, ?, ?, ?)"
        )
        insert_params = (
            "My Report",
            str(upload_path),
            digest,
            "pdf",
            str(stored_file),
            "quarterly_report.pdf",
        )
        db_conn.execute(insert_sql, insert_params)
        db_conn.commit()

        record = db_conn.execute(
            "SELECT stored_path, original_filename, title FROM documents WHERE content_hash = ?",
            (digest,),
        ).fetchone()

        # The uploaded filename is kept separately from, and differs from, the title.
        assert record["original_filename"] == "quarterly_report.pdf"
        assert record["title"] == "My Report"
        assert Path(record["stored_path"]).exists()

    def test_fallback_to_title_when_no_original_filename(self, data_dir, db_conn):
        """Test that title+ext is used when original_filename is NULL."""
        placeholder = data_dir / "documents" / "somehash.pdf"
        placeholder.write_bytes(b"fake")

        db_conn.execute(
            "INSERT INTO documents(title, source_path, content_hash, doc_type, stored_path) "
            "VALUES (?, ?, ?, ?, ?)",
            ("Engine Manual", "/tmp/old", "hash456", "pdf", str(placeholder)),
        )
        db_conn.commit()

        record = db_conn.execute("SELECT original_filename, title, stored_path FROM documents WHERE content_hash = 'hash456'").fetchone()

        # With no original filename recorded, the download name is the title
        # (or "document" if that is empty too) plus the stored file's extension.
        download_name = record["original_filename"] or (
            (record["title"] or "document") + Path(record["stored_path"]).suffix
        )

        assert download_name == "Engine Manual.pdf"
|