Compare commits
2 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
| 6fec627503 | |||
| 63654a59b8 |
@@ -1 +1,2 @@
|
|||||||
examples/
|
examples/
|
||||||
|
.claude/
|
||||||
@@ -1,4 +1,4 @@
|
|||||||
# kb-search
|
# kb
|
||||||
|
|
||||||
Personal knowledge base with hybrid search (full-text + semantic vector search).
|
Personal knowledge base with hybrid search (full-text + semantic vector search).
|
||||||
|
|
||||||
@@ -129,7 +129,7 @@ All endpoints are under `/api/v1/`. Requires `Authorization: Bearer <key>` heade
|
|||||||
|---|---|---|
|
|---|---|---|
|
||||||
| `GET` | `/health` | Health check (bypasses auth) |
|
| `GET` | `/health` | Health check (bypasses auth) |
|
||||||
| `POST` | `/search` | Hybrid search (JSON body) |
|
| `POST` | `/search` | Hybrid search (JSON body) |
|
||||||
| `POST` | `/jobs` | Upload file/note for ingestion (multipart, returns 202) |
|
| `POST` | `/jobs` | Upload file/note for ingestion (multipart, returns 202 or 409 if duplicate) |
|
||||||
| `GET` | `/jobs` | List ingestion jobs |
|
| `GET` | `/jobs` | List ingestion jobs |
|
||||||
| `GET` | `/jobs/{id}` | Job details |
|
| `GET` | `/jobs/{id}` | Job details |
|
||||||
| `GET` | `/documents` | List documents |
|
| `GET` | `/documents` | List documents |
|
||||||
|
|||||||
+81
-7
@@ -1,7 +1,9 @@
|
|||||||
package cmd
|
package cmd
|
||||||
|
|
||||||
import (
|
import (
|
||||||
|
"encoding/json"
|
||||||
"fmt"
|
"fmt"
|
||||||
|
"net/http"
|
||||||
"os"
|
"os"
|
||||||
"path/filepath"
|
"path/filepath"
|
||||||
"strings"
|
"strings"
|
||||||
@@ -11,6 +13,21 @@ import (
|
|||||||
"github.com/spf13/cobra"
|
"github.com/spf13/cobra"
|
||||||
)
|
)
|
||||||
|
|
||||||
|
type uploadResult struct {
|
||||||
|
Raw interface{}
|
||||||
|
Duplicate bool
|
||||||
|
DocID float64
|
||||||
|
JobID float64
|
||||||
|
Title string
|
||||||
|
}
|
||||||
|
|
||||||
|
func (r *uploadResult) duplicateMsg() string {
|
||||||
|
if r.DocID > 0 {
|
||||||
|
return fmt.Sprintf("Already imported: %s (doc ID: %.0f)", r.Title, r.DocID)
|
||||||
|
}
|
||||||
|
return fmt.Sprintf("Already queued: %s (job ID: %.0f)", r.Title, r.JobID)
|
||||||
|
}
|
||||||
|
|
||||||
var supportedExts = map[string]bool{
|
var supportedExts = map[string]bool{
|
||||||
".pdf": true,
|
".pdf": true,
|
||||||
".docx": true,
|
".docx": true,
|
||||||
@@ -67,6 +84,26 @@ func runAdd(cmd *cobra.Command, args []string) error {
|
|||||||
fmt.Fprintln(os.Stderr, err)
|
fmt.Fprintln(os.Stderr, err)
|
||||||
os.Exit(1)
|
os.Exit(1)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if resp.StatusCode == http.StatusConflict {
|
||||||
|
var result interface{}
|
||||||
|
if err := api.DecodeJSON(resp, &result); err != nil {
|
||||||
|
return fmt.Errorf("failed to decode response: %w", err)
|
||||||
|
}
|
||||||
|
if output.IsJSON() {
|
||||||
|
output.PrintJSON(result)
|
||||||
|
} else {
|
||||||
|
if m, ok := result.(map[string]interface{}); ok {
|
||||||
|
if docID, ok := m["document_id"].(float64); ok {
|
||||||
|
fmt.Printf("Already imported: %s (doc ID: %.0f)\n", m["title"], docID)
|
||||||
|
} else if jobID, ok := m["job_id"].(float64); ok {
|
||||||
|
fmt.Printf("Already queued: %s (job ID: %.0f)\n", m["title"], jobID)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
|
||||||
if err := api.CheckError(resp); err != nil {
|
if err := api.CheckError(resp); err != nil {
|
||||||
fmt.Fprintln(os.Stderr, err)
|
fmt.Fprintln(os.Stderr, err)
|
||||||
os.Exit(1)
|
os.Exit(1)
|
||||||
@@ -104,7 +141,9 @@ func runAdd(cmd *cobra.Command, args []string) error {
|
|||||||
}
|
}
|
||||||
|
|
||||||
if output.IsJSON() {
|
if output.IsJSON() {
|
||||||
output.PrintJSON([]interface{}{result})
|
output.PrintJSON([]interface{}{result.Raw})
|
||||||
|
} else if result.Duplicate {
|
||||||
|
fmt.Println(result.duplicateMsg())
|
||||||
} else {
|
} else {
|
||||||
fmt.Printf("Queued: %s\n", filepath.Base(path))
|
fmt.Printf("Queued: %s\n", filepath.Base(path))
|
||||||
}
|
}
|
||||||
@@ -135,27 +174,39 @@ func runAdd(cmd *cobra.Command, args []string) error {
|
|||||||
}
|
}
|
||||||
|
|
||||||
var results []interface{}
|
var results []interface{}
|
||||||
|
queued := 0
|
||||||
|
duplicates := 0
|
||||||
for _, f := range files {
|
for _, f := range files {
|
||||||
result, err := uploadFile(client, f, tags, docType)
|
result, err := uploadFile(client, f, tags, docType)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
fmt.Fprintf(os.Stderr, "Error uploading %s: %v\n", f, err)
|
fmt.Fprintf(os.Stderr, "Error uploading %s: %v\n", f, err)
|
||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
results = append(results, result)
|
results = append(results, result.Raw)
|
||||||
if !output.IsJSON() {
|
if result.Duplicate {
|
||||||
fmt.Printf("Queued: %s\n", filepath.Base(f))
|
duplicates++
|
||||||
|
if !output.IsJSON() {
|
||||||
|
fmt.Println(result.duplicateMsg())
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
queued++
|
||||||
|
if !output.IsJSON() {
|
||||||
|
fmt.Printf("Queued: %s\n", filepath.Base(f))
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
if output.IsJSON() {
|
if output.IsJSON() {
|
||||||
output.PrintJSON(results)
|
output.PrintJSON(results)
|
||||||
|
} else if duplicates > 0 {
|
||||||
|
fmt.Printf("Queued: %d files, %d duplicates skipped\n", queued, duplicates)
|
||||||
} else {
|
} else {
|
||||||
fmt.Printf("Queued: %d files\n", len(results))
|
fmt.Printf("Queued: %d files\n", queued)
|
||||||
}
|
}
|
||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
|
|
||||||
func uploadFile(client *api.Client, path, tags, docType string) (interface{}, error) {
|
func uploadFile(client *api.Client, path, tags, docType string) (*uploadResult, error) {
|
||||||
f, err := os.Open(path)
|
f, err := os.Open(path)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return nil, fmt.Errorf("cannot open %s: %w", path, err)
|
return nil, fmt.Errorf("cannot open %s: %w", path, err)
|
||||||
@@ -180,6 +231,29 @@ func uploadFile(client *api.Client, path, tags, docType string) (interface{}, er
|
|||||||
if err != nil {
|
if err != nil {
|
||||||
return nil, err
|
return nil, err
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if resp.StatusCode == http.StatusConflict {
|
||||||
|
var raw json.RawMessage
|
||||||
|
if err := api.DecodeJSON(resp, &raw); err != nil {
|
||||||
|
return nil, fmt.Errorf("failed to decode response: %w", err)
|
||||||
|
}
|
||||||
|
var dupResp struct {
|
||||||
|
DocumentID float64 `json:"document_id"`
|
||||||
|
JobID float64 `json:"job_id"`
|
||||||
|
Title string `json:"title"`
|
||||||
|
}
|
||||||
|
json.Unmarshal(raw, &dupResp)
|
||||||
|
var rawIface interface{}
|
||||||
|
json.Unmarshal(raw, &rawIface)
|
||||||
|
return &uploadResult{
|
||||||
|
Raw: rawIface,
|
||||||
|
Duplicate: true,
|
||||||
|
DocID: dupResp.DocumentID,
|
||||||
|
JobID: dupResp.JobID,
|
||||||
|
Title: dupResp.Title,
|
||||||
|
}, nil
|
||||||
|
}
|
||||||
|
|
||||||
if err := api.CheckError(resp); err != nil {
|
if err := api.CheckError(resp); err != nil {
|
||||||
return nil, err
|
return nil, err
|
||||||
}
|
}
|
||||||
@@ -188,5 +262,5 @@ func uploadFile(client *api.Client, path, tags, docType string) (interface{}, er
|
|||||||
if err := api.DecodeJSON(resp, &result); err != nil {
|
if err := api.DecodeJSON(resp, &result); err != nil {
|
||||||
return nil, fmt.Errorf("failed to decode response: %w", err)
|
return nil, fmt.Errorf("failed to decode response: %w", err)
|
||||||
}
|
}
|
||||||
return result, nil
|
return &uploadResult{Raw: result}, nil
|
||||||
}
|
}
|
||||||
|
|||||||
+31
-2
@@ -94,6 +94,7 @@ def init_schema(conn: sqlite3.Connection, embedding_dim: int) -> None:
|
|||||||
document_id INTEGER,
|
document_id INTEGER,
|
||||||
chunk_count INTEGER DEFAULT 0,
|
chunk_count INTEGER DEFAULT 0,
|
||||||
staging_path TEXT,
|
staging_path TEXT,
|
||||||
|
content_hash TEXT,
|
||||||
created_at TEXT DEFAULT current_timestamp,
|
created_at TEXT DEFAULT current_timestamp,
|
||||||
completed_at TEXT
|
completed_at TEXT
|
||||||
);
|
);
|
||||||
@@ -108,6 +109,11 @@ def init_schema(conn: sqlite3.Connection, embedding_dim: int) -> None:
|
|||||||
f"CREATE VIRTUAL TABLE chunks_vec USING vec0(embedding float[{embedding_dim}], chunk_id integer)"
|
f"CREATE VIRTUAL TABLE chunks_vec USING vec0(embedding float[{embedding_dim}], chunk_id integer)"
|
||||||
)
|
)
|
||||||
|
|
||||||
|
# Migrate: add content_hash to jobs if missing (added in v2.0.5)
|
||||||
|
cols = {row[1] for row in conn.execute("PRAGMA table_info(jobs)").fetchall()}
|
||||||
|
if "content_hash" not in cols:
|
||||||
|
conn.execute("ALTER TABLE jobs ADD COLUMN content_hash TEXT")
|
||||||
|
|
||||||
conn.commit()
|
conn.commit()
|
||||||
|
|
||||||
|
|
||||||
@@ -142,6 +148,28 @@ def hash_exists(conn: sqlite3.Connection, content_hash: str) -> bool:
|
|||||||
return row is not None
|
return row is not None
|
||||||
|
|
||||||
|
|
||||||
|
def get_document_by_hash(conn: sqlite3.Connection, content_hash: str) -> dict | None:
|
||||||
|
"""Return duplicate info for a given hash, or None.
|
||||||
|
|
||||||
|
Checks both the documents table (already ingested) and the jobs table
|
||||||
|
(queued/processing). Returns a dict with either ``document_id`` or
|
||||||
|
``job_id`` so callers can distinguish the two cases.
|
||||||
|
"""
|
||||||
|
row = conn.execute(
|
||||||
|
"SELECT id, title FROM documents WHERE content_hash = ?", (content_hash,)
|
||||||
|
).fetchone()
|
||||||
|
if row is not None:
|
||||||
|
return {"document_id": row["id"], "title": row["title"]}
|
||||||
|
# Also check pending/processing jobs that haven't been committed to documents yet
|
||||||
|
row = conn.execute(
|
||||||
|
"SELECT id, filename FROM jobs WHERE content_hash = ? AND status IN ('queued', 'processing')",
|
||||||
|
(content_hash,),
|
||||||
|
).fetchone()
|
||||||
|
if row is not None:
|
||||||
|
return {"job_id": row["id"], "title": row["filename"]}
|
||||||
|
return None
|
||||||
|
|
||||||
|
|
||||||
def insert_document(
|
def insert_document(
|
||||||
conn: sqlite3.Connection,
|
conn: sqlite3.Connection,
|
||||||
title: str,
|
title: str,
|
||||||
@@ -252,11 +280,12 @@ def create_job(
|
|||||||
doc_type: Optional[str] = None,
|
doc_type: Optional[str] = None,
|
||||||
tags_json: str = "[]",
|
tags_json: str = "[]",
|
||||||
title: Optional[str] = None,
|
title: Optional[str] = None,
|
||||||
|
content_hash: Optional[str] = None,
|
||||||
) -> int:
|
) -> int:
|
||||||
"""Create a new ingest job and return its id."""
|
"""Create a new ingest job and return its id."""
|
||||||
cur = conn.execute(
|
cur = conn.execute(
|
||||||
"INSERT INTO jobs(filename, staging_path, doc_type, tags_json, title) VALUES (?, ?, ?, ?, ?)",
|
"INSERT INTO jobs(filename, staging_path, doc_type, tags_json, title, content_hash) VALUES (?, ?, ?, ?, ?, ?)",
|
||||||
(filename, staging_path, doc_type, tags_json, title),
|
(filename, staging_path, doc_type, tags_json, title, content_hash),
|
||||||
)
|
)
|
||||||
conn.commit()
|
conn.commit()
|
||||||
return cur.lastrowid
|
return cur.lastrowid
|
||||||
|
|||||||
@@ -1,13 +1,15 @@
|
|||||||
"""Job management endpoints — submit files/notes for ingestion and track progress."""
|
"""Job management endpoints — submit files/notes for ingestion and track progress."""
|
||||||
|
|
||||||
|
import hashlib
|
||||||
import json
|
import json
|
||||||
from typing import Optional
|
from typing import Optional
|
||||||
|
|
||||||
from fastapi import HTTPException, UploadFile, File, Form, Query
|
from fastapi import HTTPException, UploadFile, File, Form, Query
|
||||||
|
from fastapi.responses import JSONResponse
|
||||||
|
|
||||||
from main import app
|
from main import app
|
||||||
from kb.config import cfg
|
from kb.config import cfg
|
||||||
from kb.database import get_connection, create_job, get_job, list_jobs
|
from kb.database import get_connection, create_job, get_job, list_jobs, get_document_by_hash
|
||||||
from kb.staging import stage_file, stage_note
|
from kb.staging import stage_file, stage_note
|
||||||
|
|
||||||
|
|
||||||
@@ -27,18 +29,32 @@ async def submit_job(
|
|||||||
|
|
||||||
if file:
|
if file:
|
||||||
content = await file.read()
|
content = await file.read()
|
||||||
staging_path = stage_file(cfg.staging_dir, file.filename, content)
|
content_hash = hashlib.sha256(content).hexdigest()
|
||||||
filename = file.filename
|
filename = file.filename
|
||||||
else:
|
else:
|
||||||
staging_path = stage_note(cfg.staging_dir, title or "note", note)
|
content = note.encode("utf-8")
|
||||||
filename = staging_path.name
|
content_hash = hashlib.sha256(content).hexdigest()
|
||||||
|
filename = None
|
||||||
tags_list = [t.strip() for t in tags.split(",") if t.strip()] if tags else []
|
|
||||||
tags_json = json.dumps(tags_list)
|
|
||||||
|
|
||||||
conn = get_connection(cfg.db_path)
|
conn = get_connection(cfg.db_path)
|
||||||
try:
|
try:
|
||||||
job_id = create_job(conn, filename, str(staging_path), doc_type, tags_json, title)
|
existing = get_document_by_hash(conn, content_hash)
|
||||||
|
if existing:
|
||||||
|
return JSONResponse(
|
||||||
|
status_code=409,
|
||||||
|
content={"error": "duplicate", **existing},
|
||||||
|
)
|
||||||
|
|
||||||
|
if file:
|
||||||
|
staging_path = stage_file(cfg.staging_dir, file.filename, content)
|
||||||
|
else:
|
||||||
|
staging_path = stage_note(cfg.staging_dir, title or "note", note)
|
||||||
|
filename = staging_path.name
|
||||||
|
|
||||||
|
tags_list = [t.strip() for t in tags.split(",") if t.strip()] if tags else []
|
||||||
|
tags_json = json.dumps(tags_list)
|
||||||
|
|
||||||
|
job_id = create_job(conn, filename, str(staging_path), doc_type, tags_json, title, content_hash)
|
||||||
return {"job_id": job_id, "status": "queued", "filename": filename}
|
return {"job_id": job_id, "status": "queued", "filename": filename}
|
||||||
finally:
|
finally:
|
||||||
conn.close()
|
conn.close()
|
||||||
|
|||||||
+28
-2
@@ -1,9 +1,12 @@
|
|||||||
"""Hybrid search — FTS5 + sqlite-vec with Reciprocal Rank Fusion."""
|
"""Hybrid search — FTS5 + sqlite-vec with Reciprocal Rank Fusion."""
|
||||||
|
|
||||||
import json
|
import json
|
||||||
|
import logging
|
||||||
import struct
|
import struct
|
||||||
import sqlite3
|
import sqlite3
|
||||||
|
|
||||||
|
logger = logging.getLogger("kb.search")
|
||||||
|
|
||||||
|
|
||||||
def hybrid_search(
|
def hybrid_search(
|
||||||
conn: sqlite3.Connection,
|
conn: sqlite3.Connection,
|
||||||
@@ -74,6 +77,21 @@ def hybrid_search(
|
|||||||
# Internal helpers
|
# Internal helpers
|
||||||
# ---------------------------------------------------------------------------
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
def _sanitize_fts_query(query: str) -> str:
|
||||||
|
"""Escape a raw user query for safe use with FTS5 MATCH.
|
||||||
|
|
||||||
|
Splits on whitespace, strips double quotes from each token, wraps each
|
||||||
|
token in double quotes (making FTS5 treat all content as literals), and
|
||||||
|
joins with spaces. Returns empty string if no valid tokens remain.
|
||||||
|
"""
|
||||||
|
tokens = []
|
||||||
|
for token in query.split():
|
||||||
|
token = token.replace('"', '')
|
||||||
|
if token:
|
||||||
|
tokens.append(f'"{token}"')
|
||||||
|
return " ".join(tokens)
|
||||||
|
|
||||||
|
|
||||||
def _fts_search(
|
def _fts_search(
|
||||||
conn: sqlite3.Connection,
|
conn: sqlite3.Connection,
|
||||||
query: str,
|
query: str,
|
||||||
@@ -86,10 +104,14 @@ def _fts_search(
|
|||||||
Returns:
|
Returns:
|
||||||
{chunk_id: bm25_score} where scores are positive (higher = better).
|
{chunk_id: bm25_score} where scores are positive (higher = better).
|
||||||
"""
|
"""
|
||||||
|
safe_query = _sanitize_fts_query(query)
|
||||||
|
if not safe_query:
|
||||||
|
return {}
|
||||||
|
|
||||||
sql = "SELECT f.rowid AS chunk_id, bm25(chunks_fts) AS rank FROM chunks_fts f"
|
sql = "SELECT f.rowid AS chunk_id, bm25(chunks_fts) AS rank FROM chunks_fts f"
|
||||||
joins: list[str] = []
|
joins: list[str] = []
|
||||||
where: list[str] = ["chunks_fts MATCH ?"]
|
where: list[str] = ["chunks_fts MATCH ?"]
|
||||||
params: list = [query]
|
params: list = [safe_query]
|
||||||
|
|
||||||
if tags or doc_type:
|
if tags or doc_type:
|
||||||
joins.append("JOIN chunks c ON f.rowid = c.id")
|
joins.append("JOIN chunks c ON f.rowid = c.id")
|
||||||
@@ -111,7 +133,11 @@ def _fts_search(
|
|||||||
sql += " ORDER BY rank LIMIT ?"
|
sql += " ORDER BY rank LIMIT ?"
|
||||||
params.append(limit)
|
params.append(limit)
|
||||||
|
|
||||||
rows = conn.execute(sql, params).fetchall()
|
try:
|
||||||
|
rows = conn.execute(sql, params).fetchall()
|
||||||
|
except sqlite3.OperationalError:
|
||||||
|
logger.warning("FTS5 query failed for input: %r", query)
|
||||||
|
return {}
|
||||||
|
|
||||||
# BM25 returns negative values (lower = better match); negate so
|
# BM25 returns negative values (lower = better match); negate so
|
||||||
# higher = better.
|
# higher = better.
|
||||||
|
|||||||
@@ -0,0 +1,2 @@
|
|||||||
|
schema: spec-driven
|
||||||
|
created: 2026-03-26
|
||||||
@@ -0,0 +1,40 @@
|
|||||||
|
## Context
|
||||||
|
|
||||||
|
FTS5 has its own query syntax. Characters like `?`, `*`, `"`, `(`, `)`, `+`, `-`, `^` and keywords like `AND`, `OR`, `NOT`, `NEAR` have special meaning. The current code passes the raw user query to `chunks_fts MATCH ?` — parameterized (safe from SQL injection) but not safe from FTS5 syntax errors.
|
||||||
|
|
||||||
|
The fix point is `_fts_search()` in `engine/kb/search.py:92` where `params: list = [query]`.
|
||||||
|
|
||||||
|
## Goals / Non-Goals
|
||||||
|
|
||||||
|
**Goals:**
|
||||||
|
- Any user input to the search endpoint produces either valid results or an empty result set — never a 500 error
|
||||||
|
- Preserve the user's search intent as much as possible (don't over-strip)
|
||||||
|
|
||||||
|
**Non-Goals:**
|
||||||
|
- Exposing FTS5 advanced syntax to users (they can't use AND/OR/NEAR operators intentionally)
|
||||||
|
- Changing vector search (it already handles arbitrary strings via the embedding model)
|
||||||
|
|
||||||
|
## Decisions
|
||||||
|
|
||||||
|
### 1. Quote each token individually
|
||||||
|
|
||||||
|
Split the query on whitespace, wrap each token in double quotes (`"token"`), and join with spaces. FTS5 interprets double-quoted strings as literal phrases, disabling all operator parsing within them. Any embedded double quotes in a token are stripped.
|
||||||
|
|
||||||
|
Example: `what color is grass?` becomes `"what" "color" "is" "grass?"` — FTS5 treats `?` as a literal character inside quotes.
|
||||||
|
|
||||||
|
**Alternative considered**: Strip all non-alphanumeric characters. Rejected because it would break searches for terms containing hyphens, dots, or other meaningful punctuation (e.g., searching for "v2.0" or "self-hosted").
|
||||||
|
|
||||||
|
**Alternative considered**: Use a try/except to catch FTS5 errors and fall back. Rejected as a primary strategy because it silently degrades — but we'll add it as a safety net.
|
||||||
|
|
||||||
|
### 2. Handle empty/whitespace-only queries
|
||||||
|
|
||||||
|
If after sanitization no tokens remain, skip FTS search entirely and return empty results. This prevents sending an empty string to MATCH which would also error.
|
||||||
|
|
||||||
|
### 3. Try/except safety net
|
||||||
|
|
||||||
|
Wrap the FTS5 execute call in a try/except for `sqlite3.OperationalError`. If an edge case still slips through, return empty FTS results and log a warning rather than crashing with a 500.
|
||||||
|
|
||||||
|
## Risks / Trade-offs
|
||||||
|
|
||||||
|
- **[Reduced FTS expressiveness]** Users cannot use FTS5 operators like `AND`, `OR`, phrase matching. → Acceptable trade-off for a personal knowledge base tool where natural language queries are the norm. The hybrid search (vector + FTS) compensates.
|
||||||
|
- **[Edge cases]** Some Unicode or control characters might still cause issues. → The try/except safety net handles these.
|
||||||
@@ -0,0 +1,24 @@
|
|||||||
|
## Why
|
||||||
|
|
||||||
|
Searching with natural language queries containing characters like `?`, `"`, `*`, `(`, `)`, `-`, or FTS5 keywords (`AND`, `OR`, `NOT`, `NEAR`) causes a 500 error because the raw query string is passed directly to `chunks_fts MATCH ?` without escaping. Users should be able to type anything into a search query without triggering syntax errors.
|
||||||
|
|
||||||
|
## What Changes
|
||||||
|
|
||||||
|
- **Sanitize FTS5 query input**: Escape or strip FTS5 special characters from the user's query before passing it to the MATCH operator
|
||||||
|
- **Graceful fallback**: If the sanitized query produces no valid FTS5 terms, return empty results from FTS instead of erroring
|
||||||
|
|
||||||
|
## Capabilities
|
||||||
|
|
||||||
|
### New Capabilities
|
||||||
|
|
||||||
|
_(none)_
|
||||||
|
|
||||||
|
### Modified Capabilities
|
||||||
|
|
||||||
|
- `engine-api`: The "Hybrid search" requirement changes — the engine must sanitize user queries to prevent FTS5 syntax errors for any input
|
||||||
|
|
||||||
|
## Impact
|
||||||
|
|
||||||
|
- **Engine search** (`engine/kb/search.py`): `_fts_search()` needs query sanitization before the MATCH parameter
|
||||||
|
- **No client changes**: The client already displays results or errors correctly
|
||||||
|
- **No schema changes**: No database modifications needed
|
||||||
@@ -0,0 +1,21 @@
|
|||||||
|
## MODIFIED Requirements
|
||||||
|
|
||||||
|
### Requirement: Hybrid search
|
||||||
|
|
||||||
|
The engine SHALL provide hybrid search combining BM25 full-text search (via FTS5) and vector similarity search (via sqlite-vec), merged using Reciprocal Rank Fusion. Search SHALL complete in under 100ms when the model is warm. The engine SHALL sanitize user query strings to prevent FTS5 syntax errors for any input.
|
||||||
|
|
||||||
|
#### Scenario: Search with special characters
|
||||||
|
- **WHEN** a client sends `POST /api/v1/search` with body `{"query": "what color is grass?"}`
|
||||||
|
- **THEN** the engine SHALL sanitize the query for FTS5, execute the search successfully, and return results (not a 500 error)
|
||||||
|
|
||||||
|
#### Scenario: Search with FTS5 operators in query
|
||||||
|
- **WHEN** a client sends `POST /api/v1/search` with body `{"query": "NOT something OR (other)"}`
|
||||||
|
- **THEN** the engine SHALL treat the input as literal search terms, not FTS5 operators, and return matching results
|
||||||
|
|
||||||
|
#### Scenario: Search with only special characters
|
||||||
|
- **WHEN** a client sends `POST /api/v1/search` with body `{"query": "??!@#"}`
|
||||||
|
- **THEN** the engine SHALL return HTTP 200 with an empty result set (not a 500 error)
|
||||||
|
|
||||||
|
#### Scenario: Search with quotes in query
|
||||||
|
- **WHEN** a client sends `POST /api/v1/search` with body `{"query": "the \"quick\" fox"}`
|
||||||
|
- **THEN** the engine SHALL sanitize embedded quotes and return results normally
|
||||||
@@ -0,0 +1,15 @@
|
|||||||
|
## 1. Query Sanitization
|
||||||
|
|
||||||
|
- [x] 1.1 Add `_sanitize_fts_query(query)` function to `engine/kb/search.py` that splits on whitespace, strips double quotes from each token, wraps each token in double quotes, and joins with spaces
|
||||||
|
- [x] 1.2 Handle edge case: if no valid tokens remain after sanitization, return empty dict from `_fts_search` without executing the query
|
||||||
|
|
||||||
|
## 2. Integration
|
||||||
|
|
||||||
|
- [x] 2.1 Call `_sanitize_fts_query()` in `_fts_search()` before adding the query to params (line 92)
|
||||||
|
- [x] 2.2 Add try/except `sqlite3.OperationalError` around the FTS5 execute call — log a warning and return empty results on error
|
||||||
|
|
||||||
|
## 3. Testing
|
||||||
|
|
||||||
|
- [x] 3.1 Test: `kb search "what color is grass?"` returns results, not a 500 error
|
||||||
|
- [x] 3.2 Test: `kb search "NOT something OR (other)"` returns results, treating input as literal terms
|
||||||
|
- [x] 3.3 Test: query with only special characters returns empty results, not an error
|
||||||
@@ -0,0 +1,2 @@
|
|||||||
|
schema: spec-driven
|
||||||
|
created: 2026-03-26
|
||||||
@@ -0,0 +1,58 @@
|
|||||||
|
## Context
|
||||||
|
|
||||||
|
The engine currently accepts all uploads with HTTP 202, stages the file, creates a job record, and relies on the background worker to detect duplicates via SHA256 content hash. When a duplicate is found, the worker marks the job as `skipped` — but the user has already received a success response and must poll job status to discover the duplicate. This creates unnecessary I/O (staging), pollutes the job list, and provides poor UX.
|
||||||
|
|
||||||
|
The `documents` table already has a `content_hash TEXT UNIQUE` column, and `database.hash_exists()` already exists. The infrastructure for dedup is in place — it just runs too late in the pipeline.
|
||||||
|
|
||||||
|
## Goals / Non-Goals
|
||||||
|
|
||||||
|
**Goals:**
|
||||||
|
- Reject duplicate uploads at the API boundary with HTTP 409 and useful context (existing document ID/title)
|
||||||
|
- Avoid staging files or creating job records for duplicates
|
||||||
|
- Apply to both file uploads and note submissions
|
||||||
|
- Keep the worker-side hash check as a race condition safety net
|
||||||
|
- Update the Go client to handle 409 and display a clear message
|
||||||
|
|
||||||
|
**Non-Goals:**
|
||||||
|
- Fuzzy/near-duplicate detection (e.g., same PDF with different metadata) — byte-identical only
|
||||||
|
- Changing the hash algorithm (SHA256 is fine)
|
||||||
|
- Adding a "force re-import" override flag (can be added later if needed)
|
||||||
|
- Dedup across different file formats with identical content (e.g., .md and .pdf of same text)
|
||||||
|
|
||||||
|
## Decisions
|
||||||
|
|
||||||
|
### 1. Hash in the upload endpoint, before staging
|
||||||
|
|
||||||
|
Compute SHA256 from the uploaded bytes in `submit_job()` before calling `stage_file()`. This avoids writing to disk or creating a DB job record for duplicates.
|
||||||
|
|
||||||
|
**Alternative considered**: Hash after staging but before job creation. Rejected because it still wastes disk I/O for the staging write.
|
||||||
|
|
||||||
|
### 2. Return HTTP 409 Conflict with context-dependent metadata
|
||||||
|
|
||||||
|
The 409 response includes `{"error": "duplicate", ...}` with a distinct shape depending on where the duplicate was found:
|
||||||
|
- **Already-ingested document**: `{"error": "duplicate", "document_id": <id>, "title": "<title>"}`
|
||||||
|
- **In-flight job (queued/processing)**: `{"error": "duplicate", "job_id": <id>, "title": "<filename>"}`
|
||||||
|
|
||||||
|
This allows clients to distinguish between "this document already exists" and "this document is already being processed" and display appropriate messages.
|
||||||
|
|
||||||
|
**Alternative considered**: Return 200 with a `"status": "duplicate"` field. Rejected because 409 is the semantically correct status code and allows clients to distinguish duplicates from successful uploads without parsing the body.
|
||||||
|
|
||||||
|
### 3. New database helper: `get_document_by_hash()`
|
||||||
|
|
||||||
|
Returns a dict with duplicate info for a given hash, or `None`. Checks both the `documents` table (already ingested) and the `jobs` table (queued/processing), returning `document_id` or `job_id` accordingly. The `content_hash` column on the `jobs` table is populated at upload time to support this check. The boolean `hash_exists()` is retained for the worker safety net.
|
||||||
|
|
||||||
|
**Alternative considered**: Modify `hash_exists()` to return the document row. Rejected to avoid changing the worker's existing interface — keep changes minimal.
|
||||||
|
|
||||||
|
### 4. Retain worker-side dedup as safety net
|
||||||
|
|
||||||
|
The worker's `hash_exists()` check stays. In theory, two identical uploads could arrive in the same instant — both pass the API hash check before either commits. The jobs-table check closes most of this window (the hash is written at job creation), but a narrow race remains between the API check and the job insert. The UNIQUE constraint on `documents.content_hash` is the final backstop.
|
||||||
|
|
||||||
|
### 5. Note dedup: hash the text content
|
||||||
|
|
||||||
|
For notes submitted via the `note` field, SHA256-hash the UTF-8 encoded text. This catches identical note resubmissions.
|
||||||
|
|
||||||
|
## Risks / Trade-offs
|
||||||
|
|
||||||
|
- **[Race condition window]** Two identical files uploaded in the same millisecond could both pass the API hash check. → Mitigated by the worker-side `hash_exists()` check and the UNIQUE constraint. The second job would be `skipped`, not a crash.
|
||||||
|
- **[Blocking I/O in async endpoint]** SHA256 hashing is CPU-bound but fast (~5ms for 10MB). → Acceptable for the upload endpoint which already reads the full file into memory. No need for `run_in_executor`.
|
||||||
|
- **[Client compatibility]** Older clients not expecting 409 will see an error. → This is correct behavior — they'll see an HTTP error rather than silently accepting a duplicate. The Go client will be updated to handle it gracefully.
|
||||||
@@ -0,0 +1,31 @@
|
|||||||
|
## Why
|
||||||
|
|
||||||
|
Duplicate document detection currently happens in the background worker — the upload endpoint always returns HTTP 202, and the user only discovers a duplicate later when the job status is `skipped`. This wastes staging I/O, creates noise in the job list, and gives poor user feedback. Moving the SHA256 content hash check to the upload endpoint allows immediate rejection with a clear error, preventing unnecessary work and giving the user instant feedback.
|
||||||
|
|
||||||
|
## What Changes
|
||||||
|
|
||||||
|
- **Compute content hash at upload time**: The `POST /api/v1/jobs` endpoint will SHA256-hash the uploaded file bytes before staging and check against `documents.content_hash`
|
||||||
|
- **Reject duplicates immediately**: Return HTTP 409 Conflict with the existing document ID when a duplicate is detected, instead of accepting and later skipping
|
||||||
|
- **No job created for duplicates**: Duplicate uploads will not create a job record or stage a file
|
||||||
|
- **Remove worker-side dedup**: The background worker's `hash_exists()` check becomes redundant for the normal flow but should be retained as a safety net (race condition guard)
|
||||||
|
- **Update Go client**: Surface the 409 response with a clear message (e.g., "Already imported: <title> (doc ID: <id>)")
|
||||||
|
- **Note dedup**: Apply the same check to notes — hash the note text content
|
||||||
|
|
||||||
|
## Capabilities
|
||||||
|
|
||||||
|
### New Capabilities
|
||||||
|
|
||||||
|
_(none — this modifies existing capabilities)_
|
||||||
|
|
||||||
|
### Modified Capabilities
|
||||||
|
|
||||||
|
- `engine-api`: The "Async ingestion via job queue" requirement changes — duplicate content is now rejected at upload time (HTTP 409) instead of accepted and later skipped by the worker. The "Duplicate content detection" scenario moves from background to synchronous.
|
||||||
|
- `go-client`: The "Add command" requirement changes — the client must handle HTTP 409 responses and display the duplicate document info to the user.
|
||||||
|
|
||||||
|
## Impact
|
||||||
|
|
||||||
|
- **Engine API** (`engine/kb/routes/jobs.py`): `submit_job()` gains hash computation and DB lookup before staging/job creation
|
||||||
|
- **Engine database** (`engine/kb/database.py`): Need a query to return the existing document ID/title for a given hash (not just boolean exists check)
|
||||||
|
- **Engine worker** (`engine/kb/worker.py`): Dedup check retained as safety net but no longer the primary guard
|
||||||
|
- **Go client** (`client/cmd/add.go`): Handle 409 response, display duplicate info
|
||||||
|
- **API contract**: New HTTP 409 response on `POST /api/v1/jobs` — this is additive, not breaking, since no consumer expects 409 today
|
||||||
@@ -0,0 +1,41 @@
|
|||||||
|
## MODIFIED Requirements
|
||||||
|
|
||||||
|
### Requirement: Async ingestion via job queue
|
||||||
|
|
||||||
|
The engine SHALL accept file uploads and text notes for ingestion asynchronously. Uploaded content SHALL be written to a staging area and a job record created in the database. The engine SHALL return HTTP 202 immediately. A background worker SHALL process queued jobs sequentially. Before staging, the engine SHALL compute a SHA256 hash of the uploaded content and reject duplicates immediately.
|
||||||
|
|
||||||
|
#### Scenario: Upload a PDF file
|
||||||
|
- **WHEN** a client sends `POST /api/v1/jobs` with a multipart form containing a PDF file and optional fields (tags, doc_type)
|
||||||
|
- **THEN** the engine SHALL compute the SHA256 hash of the file bytes, verify no existing document has the same hash, write the file to the staging directory, create a job record with status `queued`, and return HTTP 202 with `{"job_id": "<id>", "status": "queued", "filename": "report.pdf"}`
|
||||||
|
|
||||||
|
#### Scenario: Upload a text note
|
||||||
|
- **WHEN** a client sends `POST /api/v1/jobs` with a multipart form containing a `note` text field and optional `title` field
|
||||||
|
- **THEN** the engine SHALL compute the SHA256 hash of the note text (UTF-8 encoded), verify no existing document has the same hash, write the note content to a staging file, create a job record with status `queued`, and return HTTP 202 with the job ID
|
||||||
|
|
||||||
|
#### Scenario: Upload multiple files in sequence
|
||||||
|
- **WHEN** a client sends multiple `POST /api/v1/jobs` requests in quick succession
|
||||||
|
- **THEN** the engine SHALL queue each job independently and the background worker SHALL process them in FIFO order
|
||||||
|
|
||||||
|
#### Scenario: Duplicate file detected at upload time (already ingested)
|
||||||
|
- **WHEN** a client uploads a file whose SHA256 content hash matches an already-ingested document
|
||||||
|
- **THEN** the engine SHALL NOT stage the file or create a job record, and SHALL return HTTP 409 with `{"error": "duplicate", "document_id": <id>, "title": "<title>"}`
|
||||||
|
|
||||||
|
#### Scenario: Duplicate file detected at upload time (in-flight job)
|
||||||
|
- **WHEN** a client uploads a file whose SHA256 content hash matches a queued or processing job
|
||||||
|
- **THEN** the engine SHALL NOT stage the file or create a job record, and SHALL return HTTP 409 with `{"error": "duplicate", "job_id": <id>, "title": "<filename>"}`
|
||||||
|
|
||||||
|
#### Scenario: Duplicate note detected at upload time (already ingested)
|
||||||
|
- **WHEN** a client submits a note whose SHA256 content hash matches an already-ingested document
|
||||||
|
- **THEN** the engine SHALL NOT stage the note or create a job record, and SHALL return HTTP 409 with `{"error": "duplicate", "document_id": <id>, "title": "<title>"}`
|
||||||
|
|
||||||
|
#### Scenario: Duplicate note detected at upload time (in-flight job)
|
||||||
|
- **WHEN** a client submits a note whose SHA256 content hash matches a queued or processing job
|
||||||
|
- **THEN** the engine SHALL NOT stage the note or create a job record, and SHALL return HTTP 409 with `{"error": "duplicate", "job_id": <id>, "title": "<filename>"}`
|
||||||
|
|
||||||
|
#### Scenario: Duplicate uploaded during concurrent request handling
|
||||||
|
- **WHEN** two identical files are uploaded in the same instant, both passing the API hash check before either job is committed
|
||||||
|
- **THEN** both jobs SHALL be queued, and the background worker SHALL process the first normally and mark the second as `skipped` (worker-side safety net via `hash_exists()` and UNIQUE constraint)
|
||||||
|
|
||||||
|
#### Scenario: Upload failure due to unsupported file type
|
||||||
|
- **WHEN** a client uploads a file with an unsupported extension
|
||||||
|
- **THEN** the engine SHALL return HTTP 422 with an error message listing supported types
|
||||||
@@ -0,0 +1,49 @@
|
|||||||
|
## MODIFIED Requirements
|
||||||
|
|
||||||
|
### Requirement: Add command (file and note ingestion)
|
||||||
|
|
||||||
|
The client SHALL provide a `kb add` command that uploads files or notes to the engine for async ingestion. The client SHALL exit immediately after a successful upload. The client SHALL handle duplicate rejection (HTTP 409) and display the existing document information.
|
||||||
|
|
||||||
|
#### Scenario: Add a single file
|
||||||
|
- **WHEN** the user runs `kb add report.pdf`
|
||||||
|
- **THEN** the client SHALL upload the file via `POST /api/v1/jobs` (multipart), print "Queued: report.pdf", and exit
|
||||||
|
|
||||||
|
#### Scenario: Add a file with tags
|
||||||
|
- **WHEN** the user runs `kb add manual.pdf --tags car,maintenance`
|
||||||
|
- **THEN** the client SHALL include the tags in the multipart upload metadata
|
||||||
|
|
||||||
|
#### Scenario: Add a directory recursively
|
||||||
|
- **WHEN** the user runs `kb add ~/documents/ --recursive`
|
||||||
|
- **THEN** the client SHALL discover all supported files in the directory tree, upload each one sequentially, and print "Queued: N files"
|
||||||
|
|
||||||
|
#### Scenario: Add a text note
|
||||||
|
- **WHEN** the user runs `kb add --note "The server room is in building 3, floor 2"`
|
||||||
|
- **THEN** the client SHALL submit the note text via `POST /api/v1/jobs` (multipart with note field), print "Queued: note", and exit
|
||||||
|
|
||||||
|
#### Scenario: Duplicate file rejected (already ingested)
|
||||||
|
- **WHEN** the user runs `kb add report.pdf` and the engine returns HTTP 409 with `{"error": "duplicate", "document_id": 42, "title": "report.pdf"}`
|
||||||
|
- **THEN** the client SHALL print "Already imported: report.pdf (doc ID: 42)" and exit with code 0
|
||||||
|
|
||||||
|
#### Scenario: Duplicate file rejected (in-flight job)
|
||||||
|
- **WHEN** the user runs `kb add report.pdf` and the engine returns HTTP 409 with `{"error": "duplicate", "job_id": 7, "title": "report.pdf"}`
|
||||||
|
- **THEN** the client SHALL print "Already queued: report.pdf (job ID: 7)" and exit with code 0
|
||||||
|
|
||||||
|
#### Scenario: Duplicate file in recursive add
|
||||||
|
- **WHEN** the user runs `kb add ~/documents/ --recursive` and some files are rejected as duplicates
|
||||||
|
- **THEN** the client SHALL print the duplicate message for each rejected file (distinguishing "Already imported" from "Already queued"), continue uploading remaining files, and include a summary (e.g., "Queued: 5 files, 2 duplicates skipped")
|
||||||
|
|
||||||
|
#### Scenario: Duplicate with JSON output
|
||||||
|
- **WHEN** the user runs `kb add report.pdf --format json` and the engine returns HTTP 409
|
||||||
|
- **THEN** the client SHALL output the raw JSON response from the engine including the document_id and title
|
||||||
|
|
||||||
|
#### Scenario: Add with JSON output
|
||||||
|
- **WHEN** the user runs `kb add report.pdf --format json`
|
||||||
|
- **THEN** the client SHALL output the JSON response from the engine including the job_id
|
||||||
|
|
||||||
|
#### Scenario: File not found
|
||||||
|
- **WHEN** the user runs `kb add nonexistent.pdf`
|
||||||
|
- **THEN** the client SHALL print an error and exit with a non-zero code without making any API call
|
||||||
|
|
||||||
|
#### Scenario: Upload failure
|
||||||
|
- **WHEN** the upload fails (network error, engine returns 4xx/5xx other than 409)
|
||||||
|
- **THEN** the client SHALL print the error and exit with a non-zero code
|
||||||
@@ -0,0 +1,22 @@
|
|||||||
|
## 1. Database Layer
|
||||||
|
|
||||||
|
- [x] 1.1 Add `get_document_by_hash(conn, content_hash)` function to `engine/kb/database.py` that returns `(document_id, title)` or `None`
|
||||||
|
|
||||||
|
## 2. Upload Endpoint
|
||||||
|
|
||||||
|
- [x] 2.1 Update `submit_job()` in `engine/kb/routes/jobs.py` to compute SHA256 hash of uploaded file bytes before staging
|
||||||
|
- [x] 2.2 Add duplicate check: call `get_document_by_hash()` and return HTTP 409 with `{"error": "duplicate", "document_id": <id>, "title": "<title>"}` if match found
|
||||||
|
- [x] 2.3 Apply same hash check for note submissions (hash the UTF-8 encoded note text)
|
||||||
|
|
||||||
|
## 3. Go Client
|
||||||
|
|
||||||
|
- [x] 3.1 Update `uploadFile()` in `client/cmd/add.go` to handle HTTP 409 responses — parse the JSON body and print "Already imported: <title> (doc ID: <id>)"
|
||||||
|
- [x] 3.2 Update recursive directory upload to continue on 409, track duplicate count, and include in summary output
|
||||||
|
- [x] 3.3 Handle 409 in JSON output mode — pass through the raw engine response
|
||||||
|
|
||||||
|
## 4. Testing
|
||||||
|
|
||||||
|
- [x] 4.1 Test: upload a file, then upload the same file again — verify 409 with correct document_id and title
|
||||||
|
- [x] 4.2 Test: upload a note, then upload the same note text — verify 409
|
||||||
|
- [x] 4.3 Test: upload a file, then upload a different file — verify 202 as normal
|
||||||
|
- [x] 4.4 Test: verify the worker-side `hash_exists()` safety net still works (direct job insertion bypassing API)
|
||||||
@@ -26,7 +26,7 @@ The engine SHALL load the embedding model eagerly at startup before accepting HT
|
|||||||
|
|
||||||
### Requirement: Hybrid search
|
### Requirement: Hybrid search
|
||||||
|
|
||||||
The engine SHALL provide hybrid search combining BM25 full-text search (via FTS5) and vector similarity search (via sqlite-vec), merged using Reciprocal Rank Fusion. Search SHALL complete in under 100ms when the model is warm.
|
The engine SHALL provide hybrid search combining BM25 full-text search (via FTS5) and vector similarity search (via sqlite-vec), merged using Reciprocal Rank Fusion. Search SHALL complete in under 100ms when the model is warm. The engine SHALL sanitize user query strings to prevent FTS5 syntax errors for any input.
|
||||||
|
|
||||||
#### Scenario: Hybrid search with results
|
#### Scenario: Hybrid search with results
|
||||||
- **WHEN** a client sends `POST /api/v1/search` with body `{"query": "how to change oil", "top": 5}`
|
- **WHEN** a client sends `POST /api/v1/search` with body `{"query": "how to change oil", "top": 5}`
|
||||||
@@ -44,27 +44,59 @@ The engine SHALL provide hybrid search combining BM25 full-text search (via FTS5
|
|||||||
- **WHEN** a client searches against an empty database
|
- **WHEN** a client searches against an empty database
|
||||||
- **THEN** the engine SHALL return HTTP 200 with `{"query": "...", "results": [], "total_matches": 0}`
|
- **THEN** the engine SHALL return HTTP 200 with `{"query": "...", "results": [], "total_matches": 0}`
|
||||||
|
|
||||||
|
#### Scenario: Search with special characters
|
||||||
|
- **WHEN** a client sends `POST /api/v1/search` with body `{"query": "what color is grass?"}`
|
||||||
|
- **THEN** the engine SHALL sanitize the query for FTS5, execute the search successfully, and return results (not a 500 error)
|
||||||
|
|
||||||
|
#### Scenario: Search with FTS5 operators in query
|
||||||
|
- **WHEN** a client sends `POST /api/v1/search` with body `{"query": "NOT something OR (other)"}`
|
||||||
|
- **THEN** the engine SHALL treat the input as literal search terms, not FTS5 operators, and return matching results
|
||||||
|
|
||||||
|
#### Scenario: Search with only special characters
|
||||||
|
- **WHEN** a client sends `POST /api/v1/search` with body `{"query": "??!@#"}`
|
||||||
|
- **THEN** the engine SHALL return HTTP 200 with an empty result set (not a 500 error)
|
||||||
|
|
||||||
|
#### Scenario: Search with quotes in query
|
||||||
|
- **WHEN** a client sends `POST /api/v1/search` with body `{"query": "the \"quick\" fox"}`
|
||||||
|
- **THEN** the engine SHALL sanitize embedded quotes and return results normally
|
||||||
|
|
||||||
---
|
---
|
||||||
|
|
||||||
### Requirement: Async ingestion via job queue
|
### Requirement: Async ingestion via job queue
|
||||||
|
|
||||||
The engine SHALL accept file uploads and text notes for ingestion asynchronously. Uploaded content SHALL be written to a staging area and a job record created in the database. The engine SHALL return HTTP 202 immediately. A background worker SHALL process queued jobs sequentially.
|
The engine SHALL accept file uploads and text notes for ingestion asynchronously. Uploaded content SHALL be written to a staging area and a job record created in the database. The engine SHALL return HTTP 202 immediately. A background worker SHALL process queued jobs sequentially. Before staging, the engine SHALL compute a SHA256 hash of the uploaded content and reject duplicates immediately.
|
||||||
|
|
||||||
#### Scenario: Upload a PDF file
|
#### Scenario: Upload a PDF file
|
||||||
- **WHEN** a client sends `POST /api/v1/jobs` with a multipart form containing a PDF file and optional fields (tags, doc_type)
|
- **WHEN** a client sends `POST /api/v1/jobs` with a multipart form containing a PDF file and optional fields (tags, doc_type)
|
||||||
- **THEN** the engine SHALL write the file to the staging directory, create a job record with status `queued`, and return HTTP 202 with `{"job_id": "<id>", "status": "queued", "filename": "report.pdf"}`
|
- **THEN** the engine SHALL compute the SHA256 hash of the file bytes, verify no existing document has the same hash, write the file to the staging directory, create a job record with status `queued`, and return HTTP 202 with `{"job_id": "<id>", "status": "queued", "filename": "report.pdf"}`
|
||||||
|
|
||||||
#### Scenario: Upload a text note
|
#### Scenario: Upload a text note
|
||||||
- **WHEN** a client sends `POST /api/v1/jobs` with a multipart form containing a `note` text field and optional `title` field
|
- **WHEN** a client sends `POST /api/v1/jobs` with a multipart form containing a `note` text field and optional `title` field
|
||||||
- **THEN** the engine SHALL write the note content to a staging file, create a job record with status `queued`, and return HTTP 202 with the job ID
|
- **THEN** the engine SHALL compute the SHA256 hash of the note text (UTF-8 encoded), verify no existing document has the same hash, write the note content to a staging file, create a job record with status `queued`, and return HTTP 202 with the job ID
|
||||||
|
|
||||||
#### Scenario: Upload multiple files in sequence
|
#### Scenario: Upload multiple files in sequence
|
||||||
- **WHEN** a client sends multiple `POST /api/v1/jobs` requests in quick succession
|
- **WHEN** a client sends multiple `POST /api/v1/jobs` requests in quick succession
|
||||||
- **THEN** the engine SHALL queue each job independently and the background worker SHALL process them in FIFO order
|
- **THEN** the engine SHALL queue each job independently and the background worker SHALL process them in FIFO order
|
||||||
|
|
||||||
#### Scenario: Duplicate content detection
|
#### Scenario: Duplicate file detected at upload time (already ingested)
|
||||||
- **WHEN** a client uploads a file whose content hash matches an already-ingested document
|
- **WHEN** a client uploads a file whose SHA256 content hash matches an already-ingested document
|
||||||
- **THEN** the engine SHALL return HTTP 202 but the background worker SHALL mark the job as `skipped` with reason `duplicate`
|
- **THEN** the engine SHALL NOT stage the file or create a job record, and SHALL return HTTP 409 with `{"error": "duplicate", "document_id": <id>, "title": "<title>"}`
|
||||||
|
|
||||||
|
#### Scenario: Duplicate file detected at upload time (in-flight job)
|
||||||
|
- **WHEN** a client uploads a file whose SHA256 content hash matches a queued or processing job
|
||||||
|
- **THEN** the engine SHALL NOT stage the file or create a job record, and SHALL return HTTP 409 with `{"error": "duplicate", "job_id": <id>, "title": "<filename>"}`
|
||||||
|
|
||||||
|
#### Scenario: Duplicate note detected at upload time (already ingested)
|
||||||
|
- **WHEN** a client submits a note whose SHA256 content hash matches an already-ingested document
|
||||||
|
- **THEN** the engine SHALL NOT stage the note or create a job record, and SHALL return HTTP 409 with `{"error": "duplicate", "document_id": <id>, "title": "<title>"}`
|
||||||
|
|
||||||
|
#### Scenario: Duplicate note detected at upload time (in-flight job)
|
||||||
|
- **WHEN** a client submits a note whose SHA256 content hash matches a queued or processing job
|
||||||
|
- **THEN** the engine SHALL NOT stage the note or create a job record, and SHALL return HTTP 409 with `{"error": "duplicate", "job_id": <id>, "title": "<filename>"}`
|
||||||
|
|
||||||
|
#### Scenario: Duplicate uploaded during concurrent request handling
|
||||||
|
- **WHEN** two identical files are uploaded in the same instant, both passing the API hash check before either job is committed
|
||||||
|
- **THEN** both jobs SHALL be queued, and the background worker SHALL process the first normally and mark the second as `skipped` (worker-side safety net via `hash_exists()` and UNIQUE constraint)
|
||||||
|
|
||||||
#### Scenario: Upload failure due to unsupported file type
|
#### Scenario: Upload failure due to unsupported file type
|
||||||
- **WHEN** a client uploads a file with an unsupported extension
|
- **WHEN** a client uploads a file with an unsupported extension
|
||||||
|
|||||||
@@ -66,7 +66,7 @@ The client SHALL provide a `kb search <query>` command that sends the query to t
|
|||||||
|
|
||||||
### Requirement: Add command (file and note ingestion)
|
### Requirement: Add command (file and note ingestion)
|
||||||
|
|
||||||
The client SHALL provide a `kb add` command that uploads files or notes to the engine for async ingestion. The client SHALL exit immediately after a successful upload.
|
The client SHALL provide a `kb add` command that uploads files or notes to the engine for async ingestion. The client SHALL exit immediately after a successful upload. The client SHALL handle duplicate rejection (HTTP 409) and display the existing document information.
|
||||||
|
|
||||||
#### Scenario: Add a single file
|
#### Scenario: Add a single file
|
||||||
- **WHEN** the user runs `kb add report.pdf`
|
- **WHEN** the user runs `kb add report.pdf`
|
||||||
@@ -84,6 +84,22 @@ The client SHALL provide a `kb add` command that uploads files or notes to the e
|
|||||||
- **WHEN** the user runs `kb add --note "The server room is in building 3, floor 2"`
|
- **WHEN** the user runs `kb add --note "The server room is in building 3, floor 2"`
|
||||||
- **THEN** the client SHALL submit the note text via `POST /api/v1/jobs` (multipart with note field), print "Queued: note", and exit
|
- **THEN** the client SHALL submit the note text via `POST /api/v1/jobs` (multipart with note field), print "Queued: note", and exit
|
||||||
|
|
||||||
|
#### Scenario: Duplicate file rejected (already ingested)
|
||||||
|
- **WHEN** the user runs `kb add report.pdf` and the engine returns HTTP 409 with `{"error": "duplicate", "document_id": 42, "title": "report.pdf"}`
|
||||||
|
- **THEN** the client SHALL print "Already imported: report.pdf (doc ID: 42)" and exit with code 0
|
||||||
|
|
||||||
|
#### Scenario: Duplicate file rejected (in-flight job)
|
||||||
|
- **WHEN** the user runs `kb add report.pdf` and the engine returns HTTP 409 with `{"error": "duplicate", "job_id": 7, "title": "report.pdf"}`
|
||||||
|
- **THEN** the client SHALL print "Already queued: report.pdf (job ID: 7)" and exit with code 0
|
||||||
|
|
||||||
|
#### Scenario: Duplicate file in recursive add
|
||||||
|
- **WHEN** the user runs `kb add ~/documents/ --recursive` and some files are rejected as duplicates
|
||||||
|
- **THEN** the client SHALL print the duplicate message for each rejected file (distinguishing "Already imported" from "Already queued"), continue uploading remaining files, and include a summary (e.g., "Queued: 5 files, 2 duplicates skipped")
|
||||||
|
|
||||||
|
#### Scenario: Duplicate with JSON output
|
||||||
|
- **WHEN** the user runs `kb add report.pdf --format json` and the engine returns HTTP 409
|
||||||
|
- **THEN** the client SHALL output the raw JSON response from the engine including the document_id and title
|
||||||
|
|
||||||
#### Scenario: Add with JSON output
|
#### Scenario: Add with JSON output
|
||||||
- **WHEN** the user runs `kb add report.pdf --format json`
|
- **WHEN** the user runs `kb add report.pdf --format json`
|
||||||
- **THEN** the client SHALL output the JSON response from the engine including the job_id
|
- **THEN** the client SHALL output the JSON response from the engine including the job_id
|
||||||
@@ -93,7 +109,7 @@ The client SHALL provide a `kb add` command that uploads files or notes to the e
|
|||||||
- **THEN** the client SHALL print an error and exit with a non-zero code without making any API call
|
- **THEN** the client SHALL print an error and exit with a non-zero code without making any API call
|
||||||
|
|
||||||
#### Scenario: Upload failure
|
#### Scenario: Upload failure
|
||||||
- **WHEN** the upload fails (network error, engine returns 4xx/5xx)
|
- **WHEN** the upload fails (network error, engine returns 4xx/5xx other than 409)
|
||||||
- **THEN** the client SHALL print the error and exit with a non-zero code
|
- **THEN** the client SHALL print the error and exit with a non-zero code
|
||||||
|
|
||||||
---
|
---
|
||||||
|
|||||||
+9
-2
@@ -65,6 +65,13 @@ if [[ -z "$FORGE" ]]; then
|
|||||||
exit 1
|
exit 1
|
||||||
fi
|
fi
|
||||||
|
|
||||||
|
# Ensure we're on main branch
|
||||||
|
CURRENT_BRANCH="$(git -C "$SCRIPT_DIR" rev-parse --abbrev-ref HEAD)"
|
||||||
|
if [[ "$CURRENT_BRANCH" != "main" ]]; then
|
||||||
|
echo "Error: releases must be made from the main branch (currently on '$CURRENT_BRANCH')"
|
||||||
|
exit 1
|
||||||
|
fi
|
||||||
|
|
||||||
if ! command -v "$FORGE" &>/dev/null; then
|
if ! command -v "$FORGE" &>/dev/null; then
|
||||||
echo "Error: '$FORGE' not found in PATH"
|
echo "Error: '$FORGE' not found in PATH"
|
||||||
exit 1
|
exit 1
|
||||||
@@ -240,9 +247,9 @@ elif [[ "$FORGE" == "tea" ]]; then
|
|||||||
--title "$RELEASE_TITLE" \
|
--title "$RELEASE_TITLE" \
|
||||||
--note "$RELEASE_NOTES"
|
--note "$RELEASE_NOTES"
|
||||||
|
|
||||||
# tea attaches assets separately
|
# tea attaches assets as positional args: tea release asset create <tag> <file>...
|
||||||
for f in "${ASSETS[@]+"${ASSETS[@]}"}"; do
|
for f in "${ASSETS[@]+"${ASSETS[@]}"}"; do
|
||||||
run tea release asset create --tag "$TAG" "$f"
|
run tea release asset create "$TAG" "$f"
|
||||||
done
|
done
|
||||||
fi
|
fi
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user