Upload-time duplicate detection, FTS5 query sanitization, release guard

- Reject duplicate uploads at the API boundary (HTTP 409) instead of silently skipping in the background worker. Checks both ingested documents and in-flight jobs via content_hash on the jobs table. - Go client handles 409 with distinct messages for already-imported documents vs already-queued jobs. - Sanitize FTS5 search queries by quoting each token to prevent syntax errors from special characters like ?, *, ", (), AND, OR, NOT. - Add try/except safety net around FTS5 execute for edge cases. - Add main branch guard to release.sh to prevent releasing from feature branches. - Update specs and README to reflect new behaviour. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
Fix tea asset upload syntax in release script
2026-03-26 23:05:07 +00:00 · 2026-03-26 22:10:48 +00:00
20 changed files with 538 additions and 32 deletions
@@ -1 +1,2 @@
 examples/
 .claude/
@@ -1,4 +1,4 @@
-# kb-search
+# kb
 Personal knowledge base with hybrid search (full-text + semantic vector search).
@@ -129,7 +129,7 @@ All endpoints are under `/api/v1/`. Requires `Authorization: Bearer <key>` heade
 |---|---|---|
 | `GET` | `/health` | Health check (bypasses auth) |
 | `POST` | `/search` | Hybrid search (JSON body) |
-| `POST` | `/jobs` | Upload file/note for ingestion (multipart, returns 202) |
+| `POST` | `/jobs` | Upload file/note for ingestion (multipart, returns 202 or 409 if duplicate) |
 | `GET` | `/jobs` | List ingestion jobs |
 | `GET` | `/jobs/{id}` | Job details |
 | `GET` | `/documents` | List documents |
@@ -1,7 +1,9 @@
 package cmd
 import (
 	"encoding/json"
 	"fmt"
 	"net/http"
 	"os"
 	"path/filepath"
 	"strings"
@@ -11,6 +13,21 @@ import (
 	"github.com/spf13/cobra"
 )
 type uploadResult struct {
 	Raw       interface{}
 	Duplicate bool
 	DocID     float64
 	JobID     float64
 	Title     string
 }
 func (r *uploadResult) duplicateMsg() string {
 	if r.DocID > 0 {
 		return fmt.Sprintf("Already imported: %s (doc ID: %.0f)", r.Title, r.DocID)
 	}
 	return fmt.Sprintf("Already queued: %s (job ID: %.0f)", r.Title, r.JobID)
 }
 var supportedExts = map[string]bool{
 	".pdf":  true,
 	".docx": true,
@@ -67,6 +84,26 @@ func runAdd(cmd *cobra.Command, args []string) error {
 			fmt.Fprintln(os.Stderr, err)
 			os.Exit(1)
 		}
 		if resp.StatusCode == http.StatusConflict {
 			var result interface{}
 			if err := api.DecodeJSON(resp, &result); err != nil {
 				return fmt.Errorf("failed to decode response: %w", err)
 			}
 			if output.IsJSON() {
 				output.PrintJSON(result)
 			} else {
 				if m, ok := result.(map[string]interface{}); ok {
 					if docID, ok := m["document_id"].(float64); ok {
 						fmt.Printf("Already imported: %s (doc ID: %.0f)\n", m["title"], docID)
 					} else if jobID, ok := m["job_id"].(float64); ok {
 						fmt.Printf("Already queued: %s (job ID: %.0f)\n", m["title"], jobID)
 					}
 				}
 			}
 			return nil
 		}
 		if err := api.CheckError(resp); err != nil {
 			fmt.Fprintln(os.Stderr, err)
 			os.Exit(1)
@@ -104,7 +141,9 @@ func runAdd(cmd *cobra.Command, args []string) error {
 		}
 		if output.IsJSON() {
-			output.PrintJSON([]interface{}{result})
+			output.PrintJSON([]interface{}{result.Raw})
 		} else if result.Duplicate {
 			fmt.Println(result.duplicateMsg())
 		} else {
 			fmt.Printf("Queued: %s\n", filepath.Base(path))
 		}
@@ -135,27 +174,39 @@ func runAdd(cmd *cobra.Command, args []string) error {
 	}
 	var results []interface{}
 	queued := 0
 	duplicates := 0
 	for _, f := range files {
 		result, err := uploadFile(client, f, tags, docType)
 		if err != nil {
 			fmt.Fprintf(os.Stderr, "Error uploading %s: %v\n", f, err)
 			continue
 		}
-		results = append(results, result)
+		results = append(results, result.Raw)
-		if !output.IsJSON() {
+		if result.Duplicate {
-			fmt.Printf("Queued: %s\n", filepath.Base(f))
+			duplicates++
 			if !output.IsJSON() {
 				fmt.Println(result.duplicateMsg())
 			}
 		} else {
 			queued++
 			if !output.IsJSON() {
 				fmt.Printf("Queued: %s\n", filepath.Base(f))
 			}
 		}
 	}
 	if output.IsJSON() {
 		output.PrintJSON(results)
 	} else if duplicates > 0 {
 		fmt.Printf("Queued: %d files, %d duplicates skipped\n", queued, duplicates)
 	} else {
-		fmt.Printf("Queued: %d files\n", len(results))
+		fmt.Printf("Queued: %d files\n", queued)
 	}
 	return nil
 }
-func uploadFile(client *api.Client, path, tags, docType string) (interface{}, error) {
+func uploadFile(client *api.Client, path, tags, docType string) (*uploadResult, error) {
 	f, err := os.Open(path)
 	if err != nil {
 		return nil, fmt.Errorf("cannot open %s: %w", path, err)
@@ -180,6 +231,29 @@ func uploadFile(client *api.Client, path, tags, docType string) (interface{}, er
 	if err != nil {
 		return nil, err
 	}
 	if resp.StatusCode == http.StatusConflict {
 		var raw json.RawMessage
 		if err := api.DecodeJSON(resp, &raw); err != nil {
 			return nil, fmt.Errorf("failed to decode response: %w", err)
 		}
 		var dupResp struct {
 			DocumentID float64 `json:"document_id"`
 			JobID      float64 `json:"job_id"`
 			Title      string  `json:"title"`
 		}
 		json.Unmarshal(raw, &dupResp)
 		var rawIface interface{}
 		json.Unmarshal(raw, &rawIface)
 		return &uploadResult{
 			Raw:       rawIface,
 			Duplicate: true,
 			DocID:     dupResp.DocumentID,
 			JobID:     dupResp.JobID,
 			Title:     dupResp.Title,
 		}, nil
 	}
 	if err := api.CheckError(resp); err != nil {
 		return nil, err
 	}
@@ -188,5 +262,5 @@ func uploadFile(client *api.Client, path, tags, docType string) (interface{}, er
 	if err := api.DecodeJSON(resp, &result); err != nil {
 		return nil, fmt.Errorf("failed to decode response: %w", err)
 	}
-	return result, nil
+	return &uploadResult{Raw: result}, nil
 }
@@ -94,6 +94,7 @@ def init_schema(conn: sqlite3.Connection, embedding_dim: int) -> None:
            document_id INTEGER,
            chunk_count INTEGER DEFAULT 0,
            staging_path TEXT,
            content_hash TEXT,
            created_at TEXT DEFAULT current_timestamp,
            completed_at TEXT
        );
@@ -108,6 +109,11 @@ def init_schema(conn: sqlite3.Connection, embedding_dim: int) -> None:
            f"CREATE VIRTUAL TABLE chunks_vec USING vec0(embedding float[{embedding_dim}], chunk_id integer)"
        )
    # Migrate: add content_hash to jobs if missing (added in v2.0.5)
    cols = {row[1] for row in conn.execute("PRAGMA table_info(jobs)").fetchall()}
    if "content_hash" not in cols:
        conn.execute("ALTER TABLE jobs ADD COLUMN content_hash TEXT")
    conn.commit()
@@ -142,6 +148,28 @@ def hash_exists(conn: sqlite3.Connection, content_hash: str) -> bool:
    return row is not None
 def get_document_by_hash(conn: sqlite3.Connection, content_hash: str) -> dict | None:
    """Return duplicate info for a given hash, or None.
    Checks both the documents table (already ingested) and the jobs table
    (queued/processing).  Returns a dict with either ``document_id`` or
    ``job_id`` so callers can distinguish the two cases.
    """
    row = conn.execute(
        "SELECT id, title FROM documents WHERE content_hash = ?", (content_hash,)
    ).fetchone()
    if row is not None:
        return {"document_id": row["id"], "title": row["title"]}
    # Also check pending/processing jobs that haven't been committed to documents yet
    row = conn.execute(
        "SELECT id, filename FROM jobs WHERE content_hash = ? AND status IN ('queued', 'processing')",
        (content_hash,),
    ).fetchone()
    if row is not None:
        return {"job_id": row["id"], "title": row["filename"]}
    return None
 def insert_document(
    conn: sqlite3.Connection,
    title: str,
@@ -252,11 +280,12 @@ def create_job(
    doc_type: Optional[str] = None,
    tags_json: str = "[]",
    title: Optional[str] = None,
    content_hash: Optional[str] = None,
 ) -> int:
    """Create a new ingest job and return its id."""
    cur = conn.execute(
-        "INSERT INTO jobs(filename, staging_path, doc_type, tags_json, title) VALUES (?, ?, ?, ?, ?)",
+        "INSERT INTO jobs(filename, staging_path, doc_type, tags_json, title, content_hash) VALUES (?, ?, ?, ?, ?, ?)",
-        (filename, staging_path, doc_type, tags_json, title),
+        (filename, staging_path, doc_type, tags_json, title, content_hash),
    )
    conn.commit()
    return cur.lastrowid
@@ -1,13 +1,15 @@
 """Job management endpoints — submit files/notes for ingestion and track progress."""
 import hashlib
 import json
 from typing import Optional
 from fastapi import HTTPException, UploadFile, File, Form, Query
 from fastapi.responses import JSONResponse
 from main import app
 from kb.config import cfg
-from kb.database import get_connection, create_job, get_job, list_jobs
+from kb.database import get_connection, create_job, get_job, list_jobs, get_document_by_hash
 from kb.staging import stage_file, stage_note
@@ -27,18 +29,32 @@ async def submit_job(
    if file:
        content = await file.read()
-        staging_path = stage_file(cfg.staging_dir, file.filename, content)
+        content_hash = hashlib.sha256(content).hexdigest()
        filename = file.filename
    else:
-        staging_path = stage_note(cfg.staging_dir, title or "note", note)
+        content = note.encode("utf-8")
-        filename = staging_path.name
+        content_hash = hashlib.sha256(content).hexdigest()
-
+        filename = None
    tags_list = [t.strip() for t in tags.split(",") if t.strip()] if tags else []
    tags_json = json.dumps(tags_list)
    conn = get_connection(cfg.db_path)
    try:
-        job_id = create_job(conn, filename, str(staging_path), doc_type, tags_json, title)
+        existing = get_document_by_hash(conn, content_hash)
        if existing:
            return JSONResponse(
                status_code=409,
                content={"error": "duplicate", **existing},
            )
        if file:
            staging_path = stage_file(cfg.staging_dir, file.filename, content)
        else:
            staging_path = stage_note(cfg.staging_dir, title or "note", note)
            filename = staging_path.name
        tags_list = [t.strip() for t in tags.split(",") if t.strip()] if tags else []
        tags_json = json.dumps(tags_list)
        job_id = create_job(conn, filename, str(staging_path), doc_type, tags_json, title, content_hash)
        return {"job_id": job_id, "status": "queued", "filename": filename}
    finally:
        conn.close()
@@ -1,9 +1,12 @@
 """Hybrid search — FTS5 + sqlite-vec with Reciprocal Rank Fusion."""
 import json
 import logging
 import struct
 import sqlite3
 logger = logging.getLogger("kb.search")
 def hybrid_search(
    conn: sqlite3.Connection,
@@ -74,6 +77,21 @@ def hybrid_search(
 # Internal helpers
 # ---------------------------------------------------------------------------
 def _sanitize_fts_query(query: str) -> str:
    """Escape a raw user query for safe use with FTS5 MATCH.
    Splits on whitespace, strips double quotes from each token, wraps each
    token in double quotes (making FTS5 treat all content as literals), and
    joins with spaces.  Returns empty string if no valid tokens remain.
    """
    tokens = []
    for token in query.split():
        token = token.replace('"', '')
        if token:
            tokens.append(f'"{token}"')
    return " ".join(tokens)
 def _fts_search(
    conn: sqlite3.Connection,
    query: str,
@@ -86,10 +104,14 @@ def _fts_search(
    Returns:
        {chunk_id: bm25_score} where scores are positive (higher = better).
    """
    safe_query = _sanitize_fts_query(query)
    if not safe_query:
        return {}
    sql = "SELECT f.rowid AS chunk_id, bm25(chunks_fts) AS rank FROM chunks_fts f"
    joins: list[str] = []
    where: list[str] = ["chunks_fts MATCH ?"]
-    params: list = [query]
+    params: list = [safe_query]
    if tags or doc_type:
        joins.append("JOIN chunks c ON f.rowid = c.id")
@@ -111,7 +133,11 @@ def _fts_search(
    sql += " ORDER BY rank LIMIT ?"
    params.append(limit)
-    rows = conn.execute(sql, params).fetchall()
+    try:
        rows = conn.execute(sql, params).fetchall()
    except sqlite3.OperationalError:
        logger.warning("FTS5 query failed for input: %r", query)
        return {}
    # BM25 returns negative values (lower = better match); negate so
    # higher = better.
@@ -0,0 +1,2 @@
 schema: spec-driven
 created: 2026-03-26
@@ -0,0 +1,40 @@
 ## Context
 FTS5 has its own query syntax. Characters like `?`, `*`, `"`, `(`, `)`, `+`, `-`, `^` and keywords like `AND`, `OR`, `NOT`, `NEAR` have special meaning. The current code passes the raw user query to `chunks_fts MATCH ?` — parameterized (safe from SQL injection) but not safe from FTS5 syntax errors.
 The fix point is `_fts_search()` in `engine/kb/search.py:92` where `params: list = [query]`.
 ## Goals / Non-Goals
 **Goals:**
 - Any user input to the search endpoint produces either valid results or an empty result set — never a 500 error
 - Preserve the user's search intent as much as possible (don't over-strip)
 **Non-Goals:**
 - Exposing FTS5 advanced syntax to users (they can't use AND/OR/NEAR operators intentionally)
 - Changing vector search (it already handles arbitrary strings via the embedding model)
 ## Decisions
 ### 1. Quote each token individually
 Split the query on whitespace, wrap each token in double quotes (`"token"`), and join with spaces. FTS5 interprets double-quoted strings as literal phrases, disabling all operator parsing within them. Any embedded double quotes in a token are stripped.
 Example: `what color is grass?` becomes `"what" "color" "is" "grass?"` — FTS5 treats `?` as a literal character inside quotes.
 **Alternative considered**: Strip all non-alphanumeric characters. Rejected because it would break searches for terms containing hyphens, dots, or other meaningful punctuation (e.g., searching for "v2.0" or "self-hosted").
 **Alternative considered**: Use a try/except to catch FTS5 errors and fall back. Rejected as a primary strategy because it silently degrades — but we'll add it as a safety net.
 ### 2. Handle empty/whitespace-only queries
 If after sanitization no tokens remain, skip FTS search entirely and return empty results. This prevents sending an empty string to MATCH which would also error.
 ### 3. Try/except safety net
 Wrap the FTS5 execute call in a try/except for `sqlite3.OperationalError`. If an edge case still slips through, return empty FTS results and log a warning rather than crashing with a 500.
 ## Risks / Trade-offs
 - **[Reduced FTS expressiveness]** Users cannot use FTS5 operators like `AND`, `OR`, phrase matching. → Acceptable trade-off for a personal knowledge base tool where natural language queries are the norm. The hybrid search (vector + FTS) compensates.
 - **[Edge cases]** Some Unicode or control characters might still cause issues. → The try/except safety net handles these.
@@ -0,0 +1,24 @@
 ## Why
 Searching with natural language queries containing characters like `?`, `"`, `*`, `(`, `)`, `-`, or FTS5 keywords (`AND`, `OR`, `NOT`, `NEAR`) causes a 500 error because the raw query string is passed directly to `chunks_fts MATCH ?` without escaping. Users should be able to type anything into a search query without triggering syntax errors.
 ## What Changes
 - **Sanitize FTS5 query input**: Escape or strip FTS5 special characters from the user's query before passing it to the MATCH operator
 - **Graceful fallback**: If the sanitized query produces no valid FTS5 terms, return empty results from FTS instead of erroring
 ## Capabilities
 ### New Capabilities
 _(none)_
 ### Modified Capabilities
 - `engine-api`: The "Hybrid search" requirement changes — the engine must sanitize user queries to prevent FTS5 syntax errors for any input
 ## Impact
 - **Engine search** (`engine/kb/search.py`): `_fts_search()` needs query sanitization before the MATCH parameter
 - **No client changes**: The client already displays results or errors correctly
 - **No schema changes**: No database modifications needed
@@ -0,0 +1,21 @@
 ## MODIFIED Requirements
 ### Requirement: Hybrid search
 The engine SHALL provide hybrid search combining BM25 full-text search (via FTS5) and vector similarity search (via sqlite-vec), merged using Reciprocal Rank Fusion. Search SHALL complete in under 100ms when the model is warm. The engine SHALL sanitize user query strings to prevent FTS5 syntax errors for any input.
 #### Scenario: Search with special characters
 - **WHEN** a client sends `POST /api/v1/search` with body `{"query": "what color is grass?"}`
 - **THEN** the engine SHALL sanitize the query for FTS5, execute the search successfully, and return results (not a 500 error)
 #### Scenario: Search with FTS5 operators in query
 - **WHEN** a client sends `POST /api/v1/search` with body `{"query": "NOT something OR (other)"}`
 - **THEN** the engine SHALL treat the input as literal search terms, not FTS5 operators, and return matching results
 #### Scenario: Search with only special characters
 - **WHEN** a client sends `POST /api/v1/search` with body `{"query": "??!@#"}`
 - **THEN** the engine SHALL return HTTP 200 with an empty result set (not a 500 error)
 #### Scenario: Search with quotes in query
 - **WHEN** a client sends `POST /api/v1/search` with body `{"query": "the \"quick\" fox"}`
 - **THEN** the engine SHALL sanitize embedded quotes and return results normally
@@ -0,0 +1,15 @@
 ## 1. Query Sanitization
 - [x] 1.1 Add `_sanitize_fts_query(query)` function to `engine/kb/search.py` that splits on whitespace, strips double quotes from each token, wraps each token in double quotes, and joins with spaces
 - [x] 1.2 Handle edge case: if no valid tokens remain after sanitization, return empty dict from `_fts_search` without executing the query
 ## 2. Integration
 - [x] 2.1 Call `_sanitize_fts_query()` in `_fts_search()` before adding the query to params (line 92)
 - [x] 2.2 Add try/except `sqlite3.OperationalError` around the FTS5 execute call — log a warning and return empty results on error
 ## 3. Testing
 - [x] 3.1 Test: `kb search "what color is grass?"` returns results, not a 500 error
 - [x] 3.2 Test: `kb search "NOT something OR (other)"` returns results, treating input as literal terms
 - [x] 3.3 Test: query with only special characters returns empty results, not an error
@@ -0,0 +1,2 @@
 schema: spec-driven
 created: 2026-03-26
@@ -0,0 +1,58 @@
 ## Context
 The engine currently accepts all uploads with HTTP 202, stages the file, creates a job record, and relies on the background worker to detect duplicates via SHA256 content hash. When a duplicate is found, the worker marks the job as `skipped` — but the user has already received a success response and must poll job status to discover the duplicate. This creates unnecessary I/O (staging), pollutes the job list, and provides poor UX.
 The `documents` table already has a `content_hash TEXT UNIQUE` column, and `database.hash_exists()` already exists. The infrastructure for dedup is in place — it just runs too late in the pipeline.
 ## Goals / Non-Goals
 **Goals:**
 - Reject duplicate uploads at the API boundary with HTTP 409 and useful context (existing document ID/title)
 - Avoid staging files or creating job records for duplicates
 - Apply to both file uploads and note submissions
 - Keep the worker-side hash check as a race condition safety net
 - Update the Go client to handle 409 and display a clear message
 **Non-Goals:**
 - Fuzzy/near-duplicate detection (e.g., same PDF with different metadata) — byte-identical only
 - Changing the hash algorithm (SHA256 is fine)
 - Adding a "force re-import" override flag (can be added later if needed)
 - Dedup across different file formats with identical content (e.g., .md and .pdf of same text)
 ## Decisions
 ### 1. Hash in the upload endpoint, before staging
 Compute SHA256 from the uploaded bytes in `submit_job()` before calling `stage_file()`. This avoids writing to disk or creating a DB job record for duplicates.
 **Alternative considered**: Hash after staging but before job creation. Rejected because it still wastes disk I/O for the staging write.
 ### 2. Return HTTP 409 Conflict with context-dependent metadata
 The 409 response includes `{"error": "duplicate", ...}` with a distinct shape depending on where the duplicate was found:
 - **Already-ingested document**: `{"error": "duplicate", "document_id": <id>, "title": "<title>"}`
 - **In-flight job (queued/processing)**: `{"error": "duplicate", "job_id": <id>, "title": "<filename>"}`
 This allows clients to distinguish between "this document already exists" and "this document is already being processed" and display appropriate messages.
 **Alternative considered**: Return 200 with a `"status": "duplicate"` field. Rejected because 409 is the semantically correct status code and allows clients to distinguish duplicates from successful uploads without parsing the body.
 ### 3. New database helper: `get_document_by_hash()`
 Returns a dict with duplicate info for a given hash, or `None`. Checks both the `documents` table (already ingested) and the `jobs` table (queued/processing), returning `document_id` or `job_id` accordingly. The `content_hash` column on the `jobs` table is populated at upload time to support this check. The boolean `hash_exists()` is retained for the worker safety net.
 **Alternative considered**: Modify `hash_exists()` to return the document row. Rejected to avoid changing the worker's existing interface — keep changes minimal.
 ### 4. Retain worker-side dedup as safety net
 The worker's `hash_exists()` check stays. In theory, two identical uploads could arrive in the same instant — both pass the API hash check before either commits. The jobs-table check closes most of this window (the hash is written at job creation), but a narrow race remains between the API check and the job insert. The UNIQUE constraint on `documents.content_hash` is the final backstop.
 ### 5. Note dedup: hash the text content
 For notes submitted via the `note` field, SHA256-hash the UTF-8 encoded text. This catches identical note resubmissions.
 ## Risks / Trade-offs
 - **[Race condition window]** Two identical files uploaded in the same millisecond could both pass the API hash check. → Mitigated by the worker-side `hash_exists()` check and the UNIQUE constraint. The second job would be `skipped`, not a crash.
 - **[Blocking I/O in async endpoint]** SHA256 hashing is CPU-bound but fast (~5ms for 10MB). → Acceptable for the upload endpoint which already reads the full file into memory. No need for `run_in_executor`.
 - **[Client compatibility]** Older clients not expecting 409 will see an error. → This is correct behavior — they'll see an HTTP error rather than silently accepting a duplicate. The Go client will be updated to handle it gracefully.
@@ -0,0 +1,31 @@
 ## Why
 Duplicate document detection currently happens in the background worker — the upload endpoint always returns HTTP 202, and the user only discovers a duplicate later when the job status is `skipped`. This wastes staging I/O, creates noise in the job list, and gives poor user feedback. Moving the SHA256 content hash check to the upload endpoint allows immediate rejection with a clear error, preventing unnecessary work and giving the user instant feedback.
 ## What Changes
 - **Compute content hash at upload time**: The `POST /api/v1/jobs` endpoint will SHA256-hash the uploaded file bytes before staging and check against `documents.content_hash`
 - **Reject duplicates immediately**: Return HTTP 409 Conflict with the existing document ID when a duplicate is detected, instead of accepting and later skipping
 - **No job created for duplicates**: Duplicate uploads will not create a job record or stage a file
 - **Remove worker-side dedup**: The background worker's `hash_exists()` check becomes redundant for the normal flow but should be retained as a safety net (race condition guard)
 - **Update Go client**: Surface the 409 response with a clear message (e.g., "Already imported: <title> (doc ID: <id>)")
 - **Note dedup**: Apply the same check to notes — hash the note text content
 ## Capabilities
 ### New Capabilities
 _(none — this modifies existing capabilities)_
 ### Modified Capabilities
 - `engine-api`: The "Async ingestion via job queue" requirement changes — duplicate content is now rejected at upload time (HTTP 409) instead of accepted and later skipped by the worker. The "Duplicate content detection" scenario moves from background to synchronous.
 - `go-client`: The "Add command" requirement changes — the client must handle HTTP 409 responses and display the duplicate document info to the user.
 ## Impact
 - **Engine API** (`engine/kb/routes/jobs.py`): `submit_job()` gains hash computation and DB lookup before staging/job creation
 - **Engine database** (`engine/kb/database.py`): Need a query to return the existing document ID/title for a given hash (not just boolean exists check)
 - **Engine worker** (`engine/kb/worker.py`): Dedup check retained as safety net but no longer the primary guard
 - **Go client** (`client/cmd/add.go`): Handle 409 response, display duplicate info
 - **API contract**: New HTTP 409 response on `POST /api/v1/jobs` — this is additive, not breaking, since no consumer expects 409 today
@@ -0,0 +1,41 @@
 ## MODIFIED Requirements
 ### Requirement: Async ingestion via job queue
 The engine SHALL accept file uploads and text notes for ingestion asynchronously. Uploaded content SHALL be written to a staging area and a job record created in the database. The engine SHALL return HTTP 202 immediately. A background worker SHALL process queued jobs sequentially. Before staging, the engine SHALL compute a SHA256 hash of the uploaded content and reject duplicates immediately.
 #### Scenario: Upload a PDF file
 - **WHEN** a client sends `POST /api/v1/jobs` with a multipart form containing a PDF file and optional fields (tags, doc_type)
 - **THEN** the engine SHALL compute the SHA256 hash of the file bytes, verify no existing document has the same hash, write the file to the staging directory, create a job record with status `queued`, and return HTTP 202 with `{"job_id": "<id>", "status": "queued", "filename": "report.pdf"}`
 #### Scenario: Upload a text note
 - **WHEN** a client sends `POST /api/v1/jobs` with a multipart form containing a `note` text field and optional `title` field
 - **THEN** the engine SHALL compute the SHA256 hash of the note text (UTF-8 encoded), verify no existing document has the same hash, write the note content to a staging file, create a job record with status `queued`, and return HTTP 202 with the job ID
 #### Scenario: Upload multiple files in sequence
 - **WHEN** a client sends multiple `POST /api/v1/jobs` requests in quick succession
 - **THEN** the engine SHALL queue each job independently and the background worker SHALL process them in FIFO order
 #### Scenario: Duplicate file detected at upload time (already ingested)
 - **WHEN** a client uploads a file whose SHA256 content hash matches an already-ingested document
 - **THEN** the engine SHALL NOT stage the file or create a job record, and SHALL return HTTP 409 with `{"error": "duplicate", "document_id": <id>, "title": "<title>"}`
 #### Scenario: Duplicate file detected at upload time (in-flight job)
 - **WHEN** a client uploads a file whose SHA256 content hash matches a queued or processing job
 - **THEN** the engine SHALL NOT stage the file or create a job record, and SHALL return HTTP 409 with `{"error": "duplicate", "job_id": <id>, "title": "<filename>"}`
 #### Scenario: Duplicate note detected at upload time (already ingested)
 - **WHEN** a client submits a note whose SHA256 content hash matches an already-ingested document
 - **THEN** the engine SHALL NOT stage the note or create a job record, and SHALL return HTTP 409 with `{"error": "duplicate", "document_id": <id>, "title": "<title>"}`
 #### Scenario: Duplicate note detected at upload time (in-flight job)
 - **WHEN** a client submits a note whose SHA256 content hash matches a queued or processing job
 - **THEN** the engine SHALL NOT stage the note or create a job record, and SHALL return HTTP 409 with `{"error": "duplicate", "job_id": <id>, "title": "<filename>"}`
 #### Scenario: Duplicate uploaded during concurrent request handling
 - **WHEN** two identical files are uploaded in the same instant, both passing the API hash check before either job is committed
 - **THEN** both jobs SHALL be queued, and the background worker SHALL process the first normally and mark the second as `skipped` (worker-side safety net via `hash_exists()` and UNIQUE constraint)
 #### Scenario: Upload failure due to unsupported file type
 - **WHEN** a client uploads a file with an unsupported extension
 - **THEN** the engine SHALL return HTTP 422 with an error message listing supported types
@@ -0,0 +1,49 @@
 ## MODIFIED Requirements
 ### Requirement: Add command (file and note ingestion)
 The client SHALL provide a `kb add` command that uploads files or notes to the engine for async ingestion. The client SHALL exit immediately after a successful upload. The client SHALL handle duplicate rejection (HTTP 409) and display the existing document information.
 #### Scenario: Add a single file
 - **WHEN** the user runs `kb add report.pdf`
 - **THEN** the client SHALL upload the file via `POST /api/v1/jobs` (multipart), print "Queued: report.pdf", and exit
 #### Scenario: Add a file with tags
 - **WHEN** the user runs `kb add manual.pdf --tags car,maintenance`
 - **THEN** the client SHALL include the tags in the multipart upload metadata
 #### Scenario: Add a directory recursively
 - **WHEN** the user runs `kb add ~/documents/ --recursive`
 - **THEN** the client SHALL discover all supported files in the directory tree, upload each one sequentially, and print "Queued: N files"
 #### Scenario: Add a text note
 - **WHEN** the user runs `kb add --note "The server room is in building 3, floor 2"`
 - **THEN** the client SHALL submit the note text via `POST /api/v1/jobs` (multipart with note field), print "Queued: note", and exit
 #### Scenario: Duplicate file rejected (already ingested)
 - **WHEN** the user runs `kb add report.pdf` and the engine returns HTTP 409 with `{"error": "duplicate", "document_id": 42, "title": "report.pdf"}`
 - **THEN** the client SHALL print "Already imported: report.pdf (doc ID: 42)" and exit with code 0
 #### Scenario: Duplicate file rejected (in-flight job)
 - **WHEN** the user runs `kb add report.pdf` and the engine returns HTTP 409 with `{"error": "duplicate", "job_id": 7, "title": "report.pdf"}`
 - **THEN** the client SHALL print "Already queued: report.pdf (job ID: 7)" and exit with code 0
 #### Scenario: Duplicate file in recursive add
 - **WHEN** the user runs `kb add ~/documents/ --recursive` and some files are rejected as duplicates
 - **THEN** the client SHALL print the duplicate message for each rejected file (distinguishing "Already imported" from "Already queued"), continue uploading remaining files, and include a summary (e.g., "Queued: 5 files, 2 duplicates skipped")
 #### Scenario: Duplicate with JSON output
 - **WHEN** the user runs `kb add report.pdf --format json` and the engine returns HTTP 409
 - **THEN** the client SHALL output the raw JSON response from the engine including the document_id and title
 #### Scenario: Add with JSON output
 - **WHEN** the user runs `kb add report.pdf --format json`
 - **THEN** the client SHALL output the JSON response from the engine including the job_id
 #### Scenario: File not found
 - **WHEN** the user runs `kb add nonexistent.pdf`
 - **THEN** the client SHALL print an error and exit with a non-zero code without making any API call
 #### Scenario: Upload failure
 - **WHEN** the upload fails (network error, engine returns 4xx/5xx other than 409)
 - **THEN** the client SHALL print the error and exit with a non-zero code
@@ -0,0 +1,22 @@
 ## 1. Database Layer
 - [x] 1.1 Add `get_document_by_hash(conn, content_hash)` function to `engine/kb/database.py` that returns `(document_id, title)` or `None`
 ## 2. Upload Endpoint
 - [x] 2.1 Update `submit_job()` in `engine/kb/routes/jobs.py` to compute SHA256 hash of uploaded file bytes before staging
 - [x] 2.2 Add duplicate check: call `get_document_by_hash()` and return HTTP 409 with `{"error": "duplicate", "document_id": <id>, "title": "<title>"}` if match found
 - [x] 2.3 Apply same hash check for note submissions (hash the UTF-8 encoded note text)
 ## 3. Go Client
 - [x] 3.1 Update `uploadFile()` in `client/cmd/add.go` to handle HTTP 409 responses — parse the JSON body and print "Already imported: <title> (doc ID: <id>)"
 - [x] 3.2 Update recursive directory upload to continue on 409, track duplicate count, and include in summary output
 - [x] 3.3 Handle 409 in JSON output mode — pass through the raw engine response
 ## 4. Testing
 - [x] 4.1 Test: upload a file, then upload the same file again — verify 409 with correct document_id and title
 - [x] 4.2 Test: upload a note, then upload the same note text — verify 409
 - [x] 4.3 Test: upload a file, then upload a different file — verify 202 as normal
 - [x] 4.4 Test: verify the worker-side `hash_exists()` safety net still works (direct job insertion bypassing API)
@@ -26,7 +26,7 @@ The engine SHALL load the embedding model eagerly at startup before accepting HT
 ### Requirement: Hybrid search
-The engine SHALL provide hybrid search combining BM25 full-text search (via FTS5) and vector similarity search (via sqlite-vec), merged using Reciprocal Rank Fusion. Search SHALL complete in under 100ms when the model is warm.
+The engine SHALL provide hybrid search combining BM25 full-text search (via FTS5) and vector similarity search (via sqlite-vec), merged using Reciprocal Rank Fusion. Search SHALL complete in under 100ms when the model is warm. The engine SHALL sanitize user query strings to prevent FTS5 syntax errors for any input.
 #### Scenario: Hybrid search with results
 - **WHEN** a client sends `POST /api/v1/search` with body `{"query": "how to change oil", "top": 5}`
@@ -44,27 +44,59 @@ The engine SHALL provide hybrid search combining BM25 full-text search (via FTS5
 - **WHEN** a client searches against an empty database
 - **THEN** the engine SHALL return HTTP 200 with `{"query": "...", "results": [], "total_matches": 0}`
 #### Scenario: Search with special characters
 - **WHEN** a client sends `POST /api/v1/search` with body `{"query": "what color is grass?"}`
 - **THEN** the engine SHALL sanitize the query for FTS5, execute the search successfully, and return results (not a 500 error)
 #### Scenario: Search with FTS5 operators in query
 - **WHEN** a client sends `POST /api/v1/search` with body `{"query": "NOT something OR (other)"}`
 - **THEN** the engine SHALL treat the input as literal search terms, not FTS5 operators, and return matching results
 #### Scenario: Search with only special characters
 - **WHEN** a client sends `POST /api/v1/search` with body `{"query": "??!@#"}`
 - **THEN** the engine SHALL return HTTP 200 with an empty result set (not a 500 error)
 #### Scenario: Search with quotes in query
 - **WHEN** a client sends `POST /api/v1/search` with body `{"query": "the \"quick\" fox"}`
 - **THEN** the engine SHALL sanitize embedded quotes and return results normally
 ---
 ### Requirement: Async ingestion via job queue
-The engine SHALL accept file uploads and text notes for ingestion asynchronously. Uploaded content SHALL be written to a staging area and a job record created in the database. The engine SHALL return HTTP 202 immediately. A background worker SHALL process queued jobs sequentially.
+The engine SHALL accept file uploads and text notes for ingestion asynchronously. Uploaded content SHALL be written to a staging area and a job record created in the database. The engine SHALL return HTTP 202 immediately. A background worker SHALL process queued jobs sequentially. Before staging, the engine SHALL compute a SHA256 hash of the uploaded content and reject duplicates immediately.
 #### Scenario: Upload a PDF file
 - **WHEN** a client sends `POST /api/v1/jobs` with a multipart form containing a PDF file and optional fields (tags, doc_type)
- **THEN** the engine SHALL write the file to the staging directory, create a job record with status `queued`, and return HTTP 202 with `{"job_id": "<id>", "status": "queued", "filename": "report.pdf"}`
+- **THEN** the engine SHALL compute the SHA256 hash of the file bytes, verify no existing document has the same hash, write the file to the staging directory, create a job record with status `queued`, and return HTTP 202 with `{"job_id": "<id>", "status": "queued", "filename": "report.pdf"}`
 #### Scenario: Upload a text note
 - **WHEN** a client sends `POST /api/v1/jobs` with a multipart form containing a `note` text field and optional `title` field
- **THEN** the engine SHALL write the note content to a staging file, create a job record with status `queued`, and return HTTP 202 with the job ID
+- **THEN** the engine SHALL compute the SHA256 hash of the note text (UTF-8 encoded), verify no existing document has the same hash, write the note content to a staging file, create a job record with status `queued`, and return HTTP 202 with the job ID
 #### Scenario: Upload multiple files in sequence
 - **WHEN** a client sends multiple `POST /api/v1/jobs` requests in quick succession
 - **THEN** the engine SHALL queue each job independently and the background worker SHALL process them in FIFO order
-#### Scenario: Duplicate content detection
+#### Scenario: Duplicate file detected at upload time (already ingested)
- **WHEN** a client uploads a file whose content hash matches an already-ingested document
+- **WHEN** a client uploads a file whose SHA256 content hash matches an already-ingested document
- **THEN** the engine SHALL return HTTP 202 but the background worker SHALL mark the job as `skipped` with reason `duplicate`
+- **THEN** the engine SHALL NOT stage the file or create a job record, and SHALL return HTTP 409 with `{"error": "duplicate", "document_id": <id>, "title": "<title>"}`
 #### Scenario: Duplicate file detected at upload time (in-flight job)
 - **WHEN** a client uploads a file whose SHA256 content hash matches a queued or processing job
 - **THEN** the engine SHALL NOT stage the file or create a job record, and SHALL return HTTP 409 with `{"error": "duplicate", "job_id": <id>, "title": "<filename>"}`
 #### Scenario: Duplicate note detected at upload time (already ingested)
 - **WHEN** a client submits a note whose SHA256 content hash matches an already-ingested document
 - **THEN** the engine SHALL NOT stage the note or create a job record, and SHALL return HTTP 409 with `{"error": "duplicate", "document_id": <id>, "title": "<title>"}`
 #### Scenario: Duplicate note detected at upload time (in-flight job)
 - **WHEN** a client submits a note whose SHA256 content hash matches a queued or processing job
 - **THEN** the engine SHALL NOT stage the note or create a job record, and SHALL return HTTP 409 with `{"error": "duplicate", "job_id": <id>, "title": "<filename>"}`
 #### Scenario: Duplicate uploaded during concurrent request handling
 - **WHEN** two identical files are uploaded in the same instant, both passing the API hash check before either job is committed
 - **THEN** both jobs SHALL be queued, and the background worker SHALL process the first normally and mark the second as `skipped` (worker-side safety net via `hash_exists()` and UNIQUE constraint)
 #### Scenario: Upload failure due to unsupported file type
 - **WHEN** a client uploads a file with an unsupported extension
@@ -66,7 +66,7 @@ The client SHALL provide a `kb search <query>` command that sends the query to t
 ### Requirement: Add command (file and note ingestion)
-The client SHALL provide a `kb add` command that uploads files or notes to the engine for async ingestion. The client SHALL exit immediately after a successful upload.
+The client SHALL provide a `kb add` command that uploads files or notes to the engine for async ingestion. The client SHALL exit immediately after a successful upload. The client SHALL handle duplicate rejection (HTTP 409) and display the existing document information.
 #### Scenario: Add a single file
 - **WHEN** the user runs `kb add report.pdf`
@@ -84,6 +84,22 @@ The client SHALL provide a `kb add` command that uploads files or notes to the e
 - **WHEN** the user runs `kb add --note "The server room is in building 3, floor 2"`
 - **THEN** the client SHALL submit the note text via `POST /api/v1/jobs` (multipart with note field), print "Queued: note", and exit
 #### Scenario: Duplicate file rejected (already ingested)
 - **WHEN** the user runs `kb add report.pdf` and the engine returns HTTP 409 with `{"error": "duplicate", "document_id": 42, "title": "report.pdf"}`
 - **THEN** the client SHALL print "Already imported: report.pdf (doc ID: 42)" and exit with code 0
 #### Scenario: Duplicate file rejected (in-flight job)
 - **WHEN** the user runs `kb add report.pdf` and the engine returns HTTP 409 with `{"error": "duplicate", "job_id": 7, "title": "report.pdf"}`
 - **THEN** the client SHALL print "Already queued: report.pdf (job ID: 7)" and exit with code 0
 #### Scenario: Duplicate file in recursive add
 - **WHEN** the user runs `kb add ~/documents/ --recursive` and some files are rejected as duplicates
 - **THEN** the client SHALL print the duplicate message for each rejected file (distinguishing "Already imported" from "Already queued"), continue uploading remaining files, and include a summary (e.g., "Queued: 5 files, 2 duplicates skipped")
 #### Scenario: Duplicate with JSON output
 - **WHEN** the user runs `kb add report.pdf --format json` and the engine returns HTTP 409
 - **THEN** the client SHALL output the raw JSON response from the engine including the document_id and title
 #### Scenario: Add with JSON output
 - **WHEN** the user runs `kb add report.pdf --format json`
 - **THEN** the client SHALL output the JSON response from the engine including the job_id
@@ -93,7 +109,7 @@ The client SHALL provide a `kb add` command that uploads files or notes to the e
 - **THEN** the client SHALL print an error and exit with a non-zero code without making any API call
 #### Scenario: Upload failure
- **WHEN** the upload fails (network error, engine returns 4xx/5xx)
+- **WHEN** the upload fails (network error, engine returns 4xx/5xx other than 409)
 - **THEN** the client SHALL print the error and exit with a non-zero code
 ---
@@ -65,6 +65,13 @@ if [[ -z "$FORGE" ]]; then
    exit 1
 fi
 # Ensure we're on main branch
 CURRENT_BRANCH="$(git -C "$SCRIPT_DIR" rev-parse --abbrev-ref HEAD)"
 if [[ "$CURRENT_BRANCH" != "main" ]]; then
    echo "Error: releases must be made from the main branch (currently on '$CURRENT_BRANCH')"
    exit 1
 fi
 if ! command -v "$FORGE" &>/dev/null; then
    echo "Error: '$FORGE' not found in PATH"
    exit 1
@@ -240,9 +247,9 @@ elif [[ "$FORGE" == "tea" ]]; then
        --title "$RELEASE_TITLE" \
        --note "$RELEASE_NOTES"
-    # tea attaches assets separately
+    # tea attaches assets as positional args: tea release asset create <tag> <file>...
    for f in "${ASSETS[@]+"${ASSETS[@]}"}"; do
-        run tea release asset create --tag "$TAG" "$f"
+        run tea release asset create "$TAG" "$f"
    done
 fi