"""Tests for original document storage feature.""" import hashlib import shutil import sqlite3 from pathlib import Path from unittest.mock import patch import pytest from fastapi.testclient import TestClient @pytest.fixture def data_dir(tmp_path): """Create a temporary data directory with required subdirectories.""" staging = tmp_path / "staging" staging.mkdir() documents = tmp_path / "documents" documents.mkdir() return tmp_path @pytest.fixture def db_conn(data_dir): """Create an in-memory-style SQLite DB with the full schema.""" db_path = data_dir / "kb.db" conn = sqlite3.connect(str(db_path)) conn.row_factory = sqlite3.Row conn.execute("PRAGMA foreign_keys=ON") conn.executescript(""" CREATE TABLE IF NOT EXISTS documents ( id INTEGER PRIMARY KEY, title TEXT, source_path TEXT, content_hash TEXT UNIQUE, doc_type TEXT, language TEXT, stored_path TEXT, original_filename TEXT, created_at TEXT DEFAULT current_timestamp ); CREATE TABLE IF NOT EXISTS chunks ( id INTEGER PRIMARY KEY, document_id INTEGER REFERENCES documents(id) ON DELETE CASCADE, chunk_index INTEGER, text TEXT, token_count INTEGER, metadata TEXT DEFAULT '{}', UNIQUE(document_id, chunk_index) ); CREATE TABLE IF NOT EXISTS tags ( id INTEGER PRIMARY KEY, name TEXT UNIQUE COLLATE NOCASE ); CREATE TABLE IF NOT EXISTS document_tags ( document_id INTEGER REFERENCES documents(id) ON DELETE CASCADE, tag_id INTEGER REFERENCES tags(id) ON DELETE CASCADE, UNIQUE(document_id, tag_id) ); CREATE TABLE IF NOT EXISTS jobs ( id INTEGER PRIMARY KEY, filename TEXT, status TEXT DEFAULT 'queued', doc_type TEXT, tags_json TEXT DEFAULT '[]', title TEXT, error TEXT, document_id INTEGER, chunk_count INTEGER DEFAULT 0, staging_path TEXT, content_hash TEXT, created_at TEXT DEFAULT current_timestamp, completed_at TEXT ); """) conn.commit() yield conn conn.close() @pytest.fixture def sample_pdf(data_dir): """Create a fake PDF file in staging.""" content = b"%PDF-1.4 fake pdf content for testing" staging = data_dir / "staging" path = staging / "test_upload.pdf" path.write_bytes(content) return path, content class TestWorkerFileStorage: """Tests for worker moving files to persistent storage.""" def test_successful_ingestion_stores_file(self, data_dir, db_conn, sample_pdf): """7.1 - Test successful ingestion stores file at expected path.""" staged_path, content = sample_pdf content_hash = hashlib.sha256(content).hexdigest() documents_dir = data_dir / "documents" expected_dest = documents_dir / f"{content_hash}.pdf" # Simulate what the worker does: move file to documents dir shutil.move(str(staged_path), str(expected_dest)) assert expected_dest.exists() assert expected_dest.read_bytes() == content assert not staged_path.exists() # Simulate DB update db_conn.execute( "INSERT INTO documents(title, source_path, content_hash, doc_type, stored_path, original_filename) " "VALUES (?, ?, ?, ?, ?, ?)", ("Test PDF", str(staged_path), content_hash, "pdf", str(expected_dest), "test_upload.pdf"), ) db_conn.commit() row = db_conn.execute("SELECT stored_path, original_filename FROM documents WHERE content_hash = ?", (content_hash,)).fetchone() assert row["stored_path"] == str(expected_dest) assert row["original_filename"] == "test_upload.pdf" def test_failed_ingestion_no_file_in_documents(self, data_dir, sample_pdf): """7.2 - Test failed ingestion does not leave file in documents dir.""" staged_path, _ = sample_pdf documents_dir = data_dir / "documents" # Simulate failure: staging file gets cleaned up, nothing in documents dir staged_path.unlink() assert len(list(documents_dir.iterdir())) == 0 def test_document_deletion_removes_stored_file(self, data_dir, db_conn, sample_pdf): """7.4 - Test document deletion removes stored file.""" staged_path, content = sample_pdf content_hash = hashlib.sha256(content).hexdigest() documents_dir = data_dir / "documents" dest = documents_dir / f"{content_hash}.pdf" shutil.move(str(staged_path), str(dest)) db_conn.execute( "INSERT INTO documents(title, source_path, content_hash, doc_type, stored_path, original_filename) " "VALUES (?, ?, ?, ?, ?, ?)", ("Test PDF", str(staged_path), content_hash, "pdf", str(dest), "test_upload.pdf"), ) db_conn.commit() # Simulate delete: remove from DB and disk doc = db_conn.execute("SELECT id, stored_path FROM documents WHERE content_hash = ?", (content_hash,)).fetchone() stored = Path(doc["stored_path"]) db_conn.execute("DELETE FROM documents WHERE id = ?", (doc["id"],)) db_conn.commit() if stored.exists(): stored.unlink() assert not stored.exists() assert db_conn.execute("SELECT COUNT(*) FROM documents", ()).fetchone()[0] == 0 def test_download_404_for_document_without_stored_file(self, db_conn): """7.5 - Test download returns 404 for documents without stored files.""" db_conn.execute( "INSERT INTO documents(title, source_path, content_hash, doc_type) " "VALUES (?, ?, ?, ?)", ("Old Doc", "/tmp/gone", "abc123", "pdf"), ) db_conn.commit() row = db_conn.execute("SELECT stored_path FROM documents WHERE content_hash = 'abc123'").fetchone() assert row["stored_path"] is None class TestFileDownloadEndpoint: """Tests for the /api/v1/documents/{id}/file endpoint logic.""" def test_file_response_uses_original_filename(self, data_dir, db_conn, sample_pdf): """7.3 - Test file download uses correct original filename.""" staged_path, content = sample_pdf content_hash = hashlib.sha256(content).hexdigest() documents_dir = data_dir / "documents" dest = documents_dir / f"{content_hash}.pdf" shutil.move(str(staged_path), str(dest)) db_conn.execute( "INSERT INTO documents(title, source_path, content_hash, doc_type, stored_path, original_filename) " "VALUES (?, ?, ?, ?, ?, ?)", ("My Report", str(staged_path), content_hash, "pdf", str(dest), "quarterly_report.pdf"), ) db_conn.commit() doc = db_conn.execute("SELECT stored_path, original_filename, title FROM documents WHERE content_hash = ?", (content_hash,)).fetchone() # Verify the original filename is preserved and different from title assert doc["original_filename"] == "quarterly_report.pdf" assert doc["title"] == "My Report" assert Path(doc["stored_path"]).exists() def test_fallback_to_title_when_no_original_filename(self, data_dir, db_conn): """Test that title+ext is used when original_filename is NULL.""" documents_dir = data_dir / "documents" fake_file = documents_dir / "somehash.pdf" fake_file.write_bytes(b"fake") db_conn.execute( "INSERT INTO documents(title, source_path, content_hash, doc_type, stored_path) " "VALUES (?, ?, ?, ?, ?)", ("Engine Manual", "/tmp/old", "hash456", "pdf", str(fake_file)), ) db_conn.commit() doc = db_conn.execute("SELECT original_filename, title, stored_path FROM documents WHERE content_hash = 'hash456'").fetchone() # When original_filename is NULL, the endpoint should fall back to title + ext original_filename = doc["original_filename"] if not original_filename: ext = Path(doc["stored_path"]).suffix original_filename = (doc["title"] or "document") + ext assert original_filename == "Engine Manual.pdf"