6 Commits

Author SHA1 Message Date
steve 528a09ca90 Independent client/engine versioning with compatibility check
Split release.sh into release-client.sh and release-engine.sh for
independent release cadences. Client checks engine version on first
API call and hard-fails if engine is below MinEngineVersion.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-03-28 15:59:16 +00:00
steve b04823e67b Store original documents for download after ingestion
Persist uploaded files to {data_dir}/documents/{content_hash}{ext} after
successful ingestion. Add GET /documents/{id}/file endpoint for retrieval,
delete stored files on document deletion, and add `kb export` client command.
Includes schema migration, tests, and spec updates.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-03-28 15:16:27 +00:00
steve 6a4bce4659 Bump version to 2.0.5 2026-03-26 23:08:48 +00:00
steve 4590c124ad Merge pull request 'Upload-time dedup, FTS5 query sanitization, release guard' (#1) from 2.0.5 into main
Reviewed-on: #1
2026-03-26 23:06:08 +00:00
steve 6fec627503 Upload-time duplicate detection, FTS5 query sanitization, release guard
- Reject duplicate uploads at the API boundary (HTTP 409) instead of
  silently skipping in the background worker. Checks both ingested
  documents and in-flight jobs via content_hash on the jobs table.
- Go client handles 409 with distinct messages for already-imported
  documents vs already-queued jobs.
- Sanitize FTS5 search queries by quoting each token to prevent syntax
  errors from special characters like ?, *, ", (), AND, OR, NOT.
- Add try/except safety net around FTS5 execute for edge cases.
- Add main branch guard to release.sh to prevent releasing from
  feature branches.
- Update specs and README to reflect new behaviour.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-03-26 23:05:07 +00:00
steve 63654a59b8 Fix tea asset upload syntax in release script
Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-03-26 22:10:48 +00:00
50 changed files with 2067 additions and 133 deletions
+1
View File
@@ -1 +1,2 @@
examples/
.claude/
+2 -2
View File
@@ -1,4 +1,4 @@
# kb-search
# kb
Personal knowledge base with hybrid search (full-text + semantic vector search).
@@ -129,7 +129,7 @@ All endpoints are under `/api/v1/`. Requires `Authorization: Bearer <key>` heade
|---|---|---|
| `GET` | `/health` | Health check (bypasses auth) |
| `POST` | `/search` | Hybrid search (JSON body) |
| `POST` | `/jobs` | Upload file/note for ingestion (multipart, returns 202) |
| `POST` | `/jobs` | Upload file/note for ingestion (multipart, returns 202 or 409 if duplicate) |
| `GET` | `/jobs` | List ingestion jobs |
| `GET` | `/jobs/{id}` | Job details |
| `GET` | `/documents` | List documents |
+1
View File
@@ -0,0 +1 @@
2.0.0
+2 -1
View File
@@ -1,5 +1,6 @@
VERSION ?= $(shell cat VERSION 2>/dev/null || echo "dev")
LDFLAGS := -ldflags "-s -w -X github.com/kb-search/kb/cmd.Version=$(VERSION)"
MIN_ENGINE_VERSION ?= $(shell cat MIN_ENGINE_VERSION 2>/dev/null || echo "dev")
LDFLAGS := -ldflags "-s -w -X github.com/kb-search/kb/cmd.Version=$(VERSION) -X github.com/kb-search/kb/cmd.MinEngineVersion=$(MIN_ENGINE_VERSION)"
PLATFORMS := linux/amd64 linux/arm64 darwin/amd64 darwin/arm64 windows/amd64
+1 -1
View File
@@ -1 +1 @@
2.0.4
2.0.6
+81 -7
View File
@@ -1,7 +1,9 @@
package cmd
import (
"encoding/json"
"fmt"
"net/http"
"os"
"path/filepath"
"strings"
@@ -11,6 +13,21 @@ import (
"github.com/spf13/cobra"
)
// uploadResult describes the outcome of a single upload attempt.
type uploadResult struct {
	// Raw is the decoded response body, kept for JSON output.
	Raw interface{}
	// Duplicate is true when the engine rejected the upload as a duplicate.
	Duplicate bool
	// DocID is the "document_id" of the already-imported document, if any.
	DocID float64
	// JobID is the "job_id" of the already-queued job, if any.
	JobID float64
	// Title is the "title" reported for the duplicate.
	Title string
}

// duplicateMsg renders the human-readable line for a duplicate upload. A
// positive DocID means the content is already imported; anything else is
// reported as an already-queued job.
func (r *uploadResult) duplicateMsg() string {
	switch {
	case r.DocID > 0:
		return fmt.Sprintf("Already imported: %s (doc ID: %.0f)", r.Title, r.DocID)
	default:
		return fmt.Sprintf("Already queued: %s (job ID: %.0f)", r.Title, r.JobID)
	}
}
var supportedExts = map[string]bool{
".pdf": true,
".docx": true,
@@ -67,6 +84,26 @@ func runAdd(cmd *cobra.Command, args []string) error {
fmt.Fprintln(os.Stderr, err)
os.Exit(1)
}
if resp.StatusCode == http.StatusConflict {
var result interface{}
if err := api.DecodeJSON(resp, &result); err != nil {
return fmt.Errorf("failed to decode response: %w", err)
}
if output.IsJSON() {
output.PrintJSON(result)
} else {
if m, ok := result.(map[string]interface{}); ok {
if docID, ok := m["document_id"].(float64); ok {
fmt.Printf("Already imported: %s (doc ID: %.0f)\n", m["title"], docID)
} else if jobID, ok := m["job_id"].(float64); ok {
fmt.Printf("Already queued: %s (job ID: %.0f)\n", m["title"], jobID)
}
}
}
return nil
}
if err := api.CheckError(resp); err != nil {
fmt.Fprintln(os.Stderr, err)
os.Exit(1)
@@ -104,7 +141,9 @@ func runAdd(cmd *cobra.Command, args []string) error {
}
if output.IsJSON() {
output.PrintJSON([]interface{}{result})
output.PrintJSON([]interface{}{result.Raw})
} else if result.Duplicate {
fmt.Println(result.duplicateMsg())
} else {
fmt.Printf("Queued: %s\n", filepath.Base(path))
}
@@ -135,27 +174,39 @@ func runAdd(cmd *cobra.Command, args []string) error {
}
var results []interface{}
queued := 0
duplicates := 0
for _, f := range files {
result, err := uploadFile(client, f, tags, docType)
if err != nil {
fmt.Fprintf(os.Stderr, "Error uploading %s: %v\n", f, err)
continue
}
results = append(results, result)
if !output.IsJSON() {
fmt.Printf("Queued: %s\n", filepath.Base(f))
results = append(results, result.Raw)
if result.Duplicate {
duplicates++
if !output.IsJSON() {
fmt.Println(result.duplicateMsg())
}
} else {
queued++
if !output.IsJSON() {
fmt.Printf("Queued: %s\n", filepath.Base(f))
}
}
}
if output.IsJSON() {
output.PrintJSON(results)
} else if duplicates > 0 {
fmt.Printf("Queued: %d files, %d duplicates skipped\n", queued, duplicates)
} else {
fmt.Printf("Queued: %d files\n", len(results))
fmt.Printf("Queued: %d files\n", queued)
}
return nil
}
func uploadFile(client *api.Client, path, tags, docType string) (interface{}, error) {
func uploadFile(client *api.Client, path, tags, docType string) (*uploadResult, error) {
f, err := os.Open(path)
if err != nil {
return nil, fmt.Errorf("cannot open %s: %w", path, err)
@@ -180,6 +231,29 @@ func uploadFile(client *api.Client, path, tags, docType string) (interface{}, er
if err != nil {
return nil, err
}
if resp.StatusCode == http.StatusConflict {
var raw json.RawMessage
if err := api.DecodeJSON(resp, &raw); err != nil {
return nil, fmt.Errorf("failed to decode response: %w", err)
}
var dupResp struct {
DocumentID float64 `json:"document_id"`
JobID float64 `json:"job_id"`
Title string `json:"title"`
}
json.Unmarshal(raw, &dupResp)
var rawIface interface{}
json.Unmarshal(raw, &rawIface)
return &uploadResult{
Raw: rawIface,
Duplicate: true,
DocID: dupResp.DocumentID,
JobID: dupResp.JobID,
Title: dupResp.Title,
}, nil
}
if err := api.CheckError(resp); err != nil {
return nil, err
}
@@ -188,5 +262,5 @@ func uploadFile(client *api.Client, path, tags, docType string) (interface{}, er
if err := api.DecodeJSON(resp, &result); err != nil {
return nil, fmt.Errorf("failed to decode response: %w", err)
}
return result, nil
return &uploadResult{Raw: result}, nil
}
+74
View File
@@ -0,0 +1,74 @@
package cmd
import (
"fmt"
"io"
"mime"
"os"
"path/filepath"
"github.com/kb-search/kb/internal/api"
"github.com/spf13/cobra"
)
// exportCmd is the `export <id>` subcommand. It requires exactly one
// positional argument (the document ID) and is executed by runExport.
var exportCmd = &cobra.Command{
Use: "export <id>",
Short: "Download original document file",
Args: cobra.ExactArgs(1),
RunE: runExport,
}
// init declares the -o/--output flag (empty by default) and registers
// exportCmd under rootCmd.
func init() {
exportCmd.Flags().StringP("output", "o", "", "output file path (default: original filename to current directory)")
rootCmd.AddCommand(exportCmd)
}
// runExport downloads the original file of the document whose ID is args[0]
// from GET /api/v1/documents/<id>/file and writes it to stdout or to disk.
//
// Destination, in priority order:
//   - "--output -": the body is streamed to stdout.
//   - "--output <path>": the body is written to that path.
//   - no flag: the body is written to the current directory, named after the
//     base name of the Content-Disposition filename, or "document-<id>" when
//     the header is missing or yields no usable name.
//
// A failed request or an error status is printed to stderr and terminates the
// process with exit code 1. File creation and write failures are returned as
// wrapped errors.
func runExport(cmd *cobra.Command, args []string) error {
	client := api.NewClient()
	resp, err := client.Get("/api/v1/documents/" + args[0] + "/file")
	if err != nil {
		fmt.Fprintln(os.Stderr, err)
		os.Exit(1)
	}
	if err := api.CheckError(resp); err != nil {
		fmt.Fprintln(os.Stderr, err)
		os.Exit(1)
	}
	defer resp.Body.Close()

	outPath, _ := cmd.Flags().GetString("output")

	// Only an explicit user request selects stdout; a server-supplied
	// filename of "-" must not be able to redirect the output.
	if outPath == "-" {
		_, err := io.Copy(os.Stdout, resp.Body)
		return err
	}

	if outPath == "" {
		outPath = "document-" + args[0]
		if cd := resp.Header.Get("Content-Disposition"); cd != "" {
			if _, params, err := mime.ParseMediaType(cd); err == nil {
				// The header is controlled by the server. Keep only the final
				// path element so a name such as "../../.profile" or an
				// absolute path cannot write outside the current directory.
				switch name := filepath.Base(params["filename"]); name {
				case ".", "..", string(filepath.Separator):
					// No usable file name; keep the "document-<id>" fallback.
				default:
					outPath = name
				}
			}
		}
	}

	outPath = filepath.Clean(outPath)
	f, err := os.Create(outPath)
	if err != nil {
		return fmt.Errorf("failed to create output file: %w", err)
	}

	n, err := io.Copy(f, resp.Body)
	// Close explicitly: for a file opened for writing, Close can be the call
	// that reports a failed flush, so its error must not be dropped.
	if cerr := f.Close(); err == nil {
		err = cerr
	}
	if err != nil {
		return fmt.Errorf("failed to write file: %w", err)
	}

	fmt.Fprintf(os.Stderr, "Saved %s (%d bytes)\n", outPath, n)
	return nil
}
+5
View File
@@ -4,6 +4,7 @@ import (
"fmt"
"os"
"github.com/kb-search/kb/internal/api"
"github.com/kb-search/kb/internal/config"
"github.com/spf13/cobra"
)
@@ -11,6 +12,9 @@ import (
// Version is set at build time via -ldflags.
var Version = "dev"
// MinEngineVersion is set at build time via -ldflags.
var MinEngineVersion = "dev"
var (
flagEngine string
flagFormat string
@@ -31,6 +35,7 @@ var rootCmd = &cobra.Command{
}
func init() {
api.SetVersionInfo(Version, MinEngineVersion)
rootCmd.Version = Version
rootCmd.PersistentFlags().StringVar(&flagEngine, "engine", "", "engine API URL")
rootCmd.PersistentFlags().StringVar(&flagFormat, "format", "", "output format (human|json)")
+86 -3
View File
@@ -7,6 +7,9 @@ import (
"io"
"mime/multipart"
"net/http"
"os"
"strconv"
"strings"
"github.com/kb-search/kb/internal/config"
)
@@ -18,11 +21,25 @@ type FileUpload struct {
Reader io.Reader
}
// Version strings consulted by the engine compatibility check. They are
// package-level and written once, by cmd.init through SetVersionInfo.
var clientVersion, minEngineVersion string

// SetVersionInfo stores the client's own version (cv) and the minimum engine
// version it accepts (minEV) for later compatibility checking.
// Called once from cmd package initialization.
func SetVersionInfo(cv, minEV string) {
	clientVersion, minEngineVersion = cv, minEV
}
// Client is an HTTP client for the kb-search engine API.
type Client struct {
baseURL string
apiKey string
httpClient *http.Client
baseURL string
apiKey string
httpClient *http.Client
versionChecked bool
}
// NewClient creates a Client from the current configuration.
@@ -48,6 +65,7 @@ func (c *Client) newRequest(method, path string, body io.Reader) (*http.Request,
}
func (c *Client) do(req *http.Request) (*http.Response, error) {
c.checkEngineVersion()
resp, err := c.httpClient.Do(req)
if err != nil {
return nil, fmt.Errorf("Cannot reach engine at %s: %v", c.baseURL, err)
@@ -55,6 +73,71 @@ func (c *Client) do(req *http.Request) (*http.Response, error) {
return resp, nil
}
// checkEngineVersion verifies, at most once per Client, that the connected
// engine reports a version of at least minEngineVersion.
//
// The check is skipped when minEngineVersion is empty or "dev". It is treated
// as inconclusive, and returns silently, when the status request cannot be
// built or sent, when the engine answers with a non-2xx status, or when the
// body is not valid JSON; in those cases the caller's real request is left to
// surface the underlying problem. Only a successfully fetched version that
// compares below the minimum prints an error to stderr and exits with code 1.
func (c *Client) checkEngineVersion() {
	if c.versionChecked {
		return
	}
	// Set the flag first so the check is never repeated, whatever its outcome.
	c.versionChecked = true

	minVer := minEngineVersion
	if minVer == "" || minVer == "dev" {
		return
	}

	statusReq, err := c.newRequest(http.MethodGet, "/api/v1/status", nil)
	if err != nil {
		return
	}
	resp, err := c.httpClient.Do(statusReq)
	if err != nil {
		return // unreachable — let the actual request surface the error
	}
	defer resp.Body.Close()

	// An error response (rejected API key, server fault, ...) carries no
	// version. Decoding it would produce an empty version string and report a
	// bogus incompatibility, hiding the real failure from the user.
	if resp.StatusCode < http.StatusOK || resp.StatusCode >= http.StatusMultipleChoices {
		return
	}

	var status struct {
		Version string `json:"version"`
	}
	if err := json.NewDecoder(resp.Body).Decode(&status); err != nil {
		return
	}

	if !semverAtLeast(status.Version, minVer) {
		fmt.Fprintf(os.Stderr, "Error: kb client v%s requires engine v%s+ (connected engine is v%s)\nUpdate your engine image to engine-v%s or later.\n",
			clientVersion, minVer, status.Version, minVer)
		os.Exit(1)
	}
}
// semverAtLeast returns true if version >= minimum, comparing major.minor.patch.
//
// Both inputs may carry surrounding whitespace and a leading "v". Anything
// from the first '-' or '+' onward (pre-release or build metadata) is ignored,
// so "2.0.6-rc1" compares as 2.0.6. Missing or non-numeric components count
// as 0.
func semverAtLeast(version, minimum string) bool {
	parse := func(s string) [3]int {
		s = strings.TrimPrefix(strings.TrimSpace(s), "v")
		// Drop the pre-release / build suffix before splitting, otherwise its
		// dots would be mistaken for component separators.
		if i := strings.IndexAny(s, "-+"); i >= 0 {
			s = s[:i]
		}
		var out [3]int
		for i, part := range strings.SplitN(s, ".", 3) {
			// Parse only the leading digits so trailing junk does not turn
			// the whole component into 0.
			end := 0
			for end < len(part) && part[end] >= '0' && part[end] <= '9' {
				end++
			}
			// An empty or overflowing component deliberately falls back to 0.
			out[i], _ = strconv.Atoi(part[:end])
		}
		return out
	}

	v, m := parse(version), parse(minimum)
	for i := range v {
		if v[i] != m[i] {
			return v[i] > m[i]
		}
	}
	return true
}
// Get performs a GET request to the given path.
func (c *Client) Get(path string) (*http.Response, error) {
req, err := c.newRequest(http.MethodGet, path, nil)
+136
View File
@@ -0,0 +1,136 @@
package api
import (
"encoding/json"
"net/http"
"net/http/httptest"
"testing"
)
// TestSemverAtLeast runs semverAtLeast against a table of version/minimum
// pairs, one subtest per pair.
func TestSemverAtLeast(t *testing.T) {
	type testCase struct {
		version  string
		minimum  string
		expected bool
	}
	cases := []testCase{
		{version: "2.1.0", minimum: "2.0.0", expected: true},
		{version: "2.0.0", minimum: "2.0.0", expected: true},
		{version: "2.0.5", minimum: "2.0.0", expected: true},
		{version: "2.1.5", minimum: "2.1.0", expected: true},
		{version: "2.0.9", minimum: "2.1.0", expected: false},
		{version: "1.9.9", minimum: "2.0.0", expected: false},
		{version: "3.0.0", minimum: "2.9.9", expected: true},
		{version: "2.0.0", minimum: "2.0.1", expected: false},
	}

	for _, tc := range cases {
		name := tc.version + ">=" + tc.minimum
		t.Run(name, func(t *testing.T) {
			if got := semverAtLeast(tc.version, tc.minimum); got != tc.expected {
				t.Errorf("semverAtLeast(%q, %q) = %v, want %v", tc.version, tc.minimum, got, tc.expected)
			}
		})
	}
}
// TestCheckEngineVersion_Compatible checks that an engine reporting exactly
// the minimum version passes the check and marks it as done.
func TestCheckEngineVersion_Compatible(t *testing.T) {
	// Stub engine: every request is answered with version 2.1.0.
	handler := http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
		json.NewEncoder(w).Encode(map[string]string{"version": "2.1.0"})
	})
	srv := httptest.NewServer(handler)
	defer srv.Close()

	clientVersion, minEngineVersion = "2.2.0", "2.1.0"
	defer func() { clientVersion, minEngineVersion = "", "" }()

	client := &Client{baseURL: srv.URL, httpClient: new(http.Client)}

	// Should not panic or exit
	client.checkEngineVersion()

	if checked := client.versionChecked; !checked {
		t.Error("versionChecked should be true after check")
	}
}
// TestCheckEngineVersion_SkipsWhenDev checks that a "dev" minimum version
// short-circuits the check while still marking it as done.
func TestCheckEngineVersion_SkipsWhenDev(t *testing.T) {
	clientVersion, minEngineVersion = "dev", "dev"
	defer func() { clientVersion, minEngineVersion = "", "" }()

	// The base URL is not dialable; the check must return before using it.
	client := &Client{baseURL: "http://localhost:99999", httpClient: new(http.Client)}

	// Should not attempt connection
	client.checkEngineVersion()

	if checked := client.versionChecked; !checked {
		t.Error("versionChecked should be true after skipping")
	}
}
// TestCheckEngineVersion_SkipsWhenEmpty checks that an empty minimum version
// short-circuits the check while still marking it as done.
func TestCheckEngineVersion_SkipsWhenEmpty(t *testing.T) {
	clientVersion, minEngineVersion = "1.0.0", ""
	defer func() { clientVersion, minEngineVersion = "", "" }()

	client := &Client{baseURL: "http://localhost:99999", httpClient: new(http.Client)}
	client.checkEngineVersion()

	if checked := client.versionChecked; !checked {
		t.Error("versionChecked should be true after skipping")
	}
}
// TestCheckEngineVersion_SkipsWhenUnreachable checks that a failing status
// request is tolerated: the check returns and is still marked as done.
func TestCheckEngineVersion_SkipsWhenUnreachable(t *testing.T) {
	clientVersion, minEngineVersion = "2.0.0", "2.0.0"
	defer func() { clientVersion, minEngineVersion = "", "" }()

	client := &Client{baseURL: "http://localhost:99999", httpClient: new(http.Client)}

	// Should not panic — just skip
	client.checkEngineVersion()

	if checked := client.versionChecked; !checked {
		t.Error("versionChecked should be true even when unreachable")
	}
}
// TestCheckEngineVersion_CachedAfterFirstCall checks that repeated calls on
// one Client hit the status endpoint only once.
func TestCheckEngineVersion_CachedAfterFirstCall(t *testing.T) {
	// hits counts the requests that reach the stub engine.
	hits := 0
	handler := http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
		hits++
		json.NewEncoder(w).Encode(map[string]string{"version": "2.1.0"})
	})
	srv := httptest.NewServer(handler)
	defer srv.Close()

	clientVersion, minEngineVersion = "2.1.0", "2.0.0"
	defer func() { clientVersion, minEngineVersion = "", "" }()

	client := &Client{baseURL: srv.URL, httpClient: new(http.Client)}

	for i := 0; i < 3; i++ {
		client.checkEngineVersion()
	}

	if hits != 1 {
		t.Errorf("expected 1 status call, got %d", hits)
	}
}
+1 -1
View File
@@ -1 +1 @@
2.0.4
2.0.6
+1
View File
@@ -21,4 +21,5 @@ services:
- KB_INGEST_DEVICE=${KB_INGEST_DEVICE:-auto}
- KB_API_KEY=${KB_API_KEY:-}
- KB_SEARCH_THRESHOLD=${KB_SEARCH_THRESHOLD:-0.01}
- HF_HUB_OFFLINE=${HF_HUB_OFFLINE:-}
restart: unless-stopped
+1
View File
@@ -18,4 +18,5 @@ services:
- KB_INGEST_DEVICE=${KB_INGEST_DEVICE:-auto}
- KB_API_KEY=${KB_API_KEY:-}
- KB_SEARCH_THRESHOLD=${KB_SEARCH_THRESHOLD:-0.01}
- HF_HUB_OFFLINE=${HF_HUB_OFFLINE:-}
restart: unless-stopped
+5
View File
@@ -35,10 +35,15 @@ class Config:
def staging_dir(self) -> Path:
return self.data_dir / "staging"
@property
def documents_dir(self) -> Path:
    """Return the ``documents`` directory located under ``data_dir``."""
    return self.data_dir.joinpath("documents")
def ensure_dirs(self):
    """Create the data, HF cache, staging and documents directories if missing.

    Only ``data_dir`` is created together with its missing parents; the other
    directories are created one level deep.
    """
    self.data_dir.mkdir(parents=True, exist_ok=True)
    for directory in (self.hf_cache, self.staging_dir, self.documents_dir):
        directory.mkdir(exist_ok=True)
cfg = Config()
+40 -2
View File
@@ -34,6 +34,8 @@ def init_schema(conn: sqlite3.Connection, embedding_dim: int) -> None:
content_hash TEXT UNIQUE,
doc_type TEXT,
language TEXT,
stored_path TEXT,
original_filename TEXT,
created_at TEXT DEFAULT current_timestamp
);
@@ -94,6 +96,7 @@ def init_schema(conn: sqlite3.Connection, embedding_dim: int) -> None:
document_id INTEGER,
chunk_count INTEGER DEFAULT 0,
staging_path TEXT,
content_hash TEXT,
created_at TEXT DEFAULT current_timestamp,
completed_at TEXT
);
@@ -108,6 +111,18 @@ def init_schema(conn: sqlite3.Connection, embedding_dim: int) -> None:
f"CREATE VIRTUAL TABLE chunks_vec USING vec0(embedding float[{embedding_dim}], chunk_id integer)"
)
# Migrate: add content_hash to jobs if missing (added in v2.0.5)
cols = {row[1] for row in conn.execute("PRAGMA table_info(jobs)").fetchall()}
if "content_hash" not in cols:
conn.execute("ALTER TABLE jobs ADD COLUMN content_hash TEXT")
# Migrate: add stored_path and original_filename to documents if missing
doc_cols = {row[1] for row in conn.execute("PRAGMA table_info(documents)").fetchall()}
if "stored_path" not in doc_cols:
conn.execute("ALTER TABLE documents ADD COLUMN stored_path TEXT")
if "original_filename" not in doc_cols:
conn.execute("ALTER TABLE documents ADD COLUMN original_filename TEXT")
conn.commit()
@@ -142,6 +157,28 @@ def hash_exists(conn: sqlite3.Connection, content_hash: str) -> bool:
return row is not None
def get_document_by_hash(conn: sqlite3.Connection, content_hash: str) -> dict | None:
"""Return duplicate info for a given hash, or None.
Checks both the documents table (already ingested) and the jobs table
(queued/processing). Returns a dict with either ``document_id`` or
``job_id`` so callers can distinguish the two cases.
"""
row = conn.execute(
"SELECT id, title FROM documents WHERE content_hash = ?", (content_hash,)
).fetchone()
if row is not None:
return {"document_id": row["id"], "title": row["title"]}
# Also check pending/processing jobs that haven't been committed to documents yet
row = conn.execute(
"SELECT id, filename FROM jobs WHERE content_hash = ? AND status IN ('queued', 'processing')",
(content_hash,),
).fetchone()
if row is not None:
return {"job_id": row["id"], "title": row["filename"]}
return None
def insert_document(
conn: sqlite3.Connection,
title: str,
@@ -252,11 +289,12 @@ def create_job(
doc_type: Optional[str] = None,
tags_json: str = "[]",
title: Optional[str] = None,
content_hash: Optional[str] = None,
) -> int:
"""Create a new ingest job and return its id."""
cur = conn.execute(
"INSERT INTO jobs(filename, staging_path, doc_type, tags_json, title) VALUES (?, ?, ?, ?, ?)",
(filename, staging_path, doc_type, tags_json, title),
"INSERT INTO jobs(filename, staging_path, doc_type, tags_json, title, content_hash) VALUES (?, ?, ?, ?, ?, ?)",
(filename, staging_path, doc_type, tags_json, title, content_hash),
)
conn.commit()
return cur.lastrowid
+65 -1
View File
@@ -1,14 +1,20 @@
"""Document management endpoints — list, view, and delete documents."""
import json
import logging
import mimetypes
from pathlib import Path
from typing import Optional
from fastapi import HTTPException, Query
from fastapi.responses import FileResponse
from main import app
from kb.config import cfg
from kb.database import get_connection
logger = logging.getLogger("kb.routes.documents")
@app.get("/api/v1/documents")
async def list_documents(
@@ -100,8 +106,12 @@ async def get_document(doc_id: int):
(doc_id,),
).fetchall()
stored_path = doc["stored_path"]
has_file = bool(stored_path and Path(stored_path).exists())
return {
**dict(doc),
"has_file": has_file,
"tags": [t["name"] for t in tag_rows],
"chunks": [dict(c) for c in chunks],
}
@@ -109,12 +119,53 @@ async def get_document(doc_id: int):
conn.close()
@app.get("/api/v1/documents/{doc_id}/file")
async def download_document_file(doc_id: int):
conn = get_connection(cfg.db_path)
try:
doc = conn.execute(
"SELECT id, title, stored_path, original_filename FROM documents WHERE id = ?",
(doc_id,),
).fetchone()
if not doc:
raise HTTPException(status_code=404, detail="Document not found.")
stored_path = doc["stored_path"]
if not stored_path:
raise HTTPException(
status_code=404,
detail="Original file not available - ingested before document storage was enabled.",
)
file_path = Path(stored_path)
if not file_path.exists():
raise HTTPException(
status_code=404,
detail="Stored file not found on disk.",
)
original_filename = doc["original_filename"]
if not original_filename:
ext = file_path.suffix
original_filename = (doc["title"] or "document") + ext
media_type = mimetypes.guess_type(original_filename)[0] or "application/octet-stream"
return FileResponse(
path=str(file_path),
media_type=media_type,
filename=original_filename,
)
finally:
conn.close()
@app.delete("/api/v1/documents/{doc_id}")
async def delete_document(doc_id: int):
conn = get_connection(cfg.db_path)
try:
doc = conn.execute(
"SELECT id, title FROM documents WHERE id = ?", (doc_id,)
"SELECT id, title, stored_path FROM documents WHERE id = ?", (doc_id,)
).fetchone()
if not doc:
raise HTTPException(status_code=404, detail="Document not found.")
@@ -134,6 +185,19 @@ async def delete_document(doc_id: int):
conn.execute("DELETE FROM documents WHERE id = ?", (doc_id,))
conn.commit()
# Delete stored file from disk
stored_path = doc["stored_path"]
if stored_path:
try:
file_path = Path(stored_path)
if file_path.exists():
file_path.unlink()
logger.info("Deleted stored file: %s", stored_path)
else:
logger.warning("Stored file already missing: %s", stored_path)
except OSError as exc:
logger.warning("Failed to delete stored file %s: %s", stored_path, exc)
return {
"status": "deleted",
"document_id": doc_id,
+24 -8
View File
@@ -1,13 +1,15 @@
"""Job management endpoints — submit files/notes for ingestion and track progress."""
import hashlib
import json
from typing import Optional
from fastapi import HTTPException, UploadFile, File, Form, Query
from fastapi.responses import JSONResponse
from main import app
from kb.config import cfg
from kb.database import get_connection, create_job, get_job, list_jobs
from kb.database import get_connection, create_job, get_job, list_jobs, get_document_by_hash
from kb.staging import stage_file, stage_note
@@ -27,18 +29,32 @@ async def submit_job(
if file:
content = await file.read()
staging_path = stage_file(cfg.staging_dir, file.filename, content)
content_hash = hashlib.sha256(content).hexdigest()
filename = file.filename
else:
staging_path = stage_note(cfg.staging_dir, title or "note", note)
filename = staging_path.name
tags_list = [t.strip() for t in tags.split(",") if t.strip()] if tags else []
tags_json = json.dumps(tags_list)
content = note.encode("utf-8")
content_hash = hashlib.sha256(content).hexdigest()
filename = None
conn = get_connection(cfg.db_path)
try:
job_id = create_job(conn, filename, str(staging_path), doc_type, tags_json, title)
existing = get_document_by_hash(conn, content_hash)
if existing:
return JSONResponse(
status_code=409,
content={"error": "duplicate", **existing},
)
if file:
staging_path = stage_file(cfg.staging_dir, file.filename, content)
else:
staging_path = stage_note(cfg.staging_dir, title or "note", note)
filename = staging_path.name
tags_list = [t.strip() for t in tags.split(",") if t.strip()] if tags else []
tags_json = json.dumps(tags_list)
job_id = create_job(conn, filename, str(staging_path), doc_type, tags_json, title, content_hash)
return {"job_id": job_id, "status": "queued", "filename": filename}
finally:
conn.close()
+28 -2
View File
@@ -1,9 +1,12 @@
"""Hybrid search — FTS5 + sqlite-vec with Reciprocal Rank Fusion."""
import json
import logging
import struct
import sqlite3
logger = logging.getLogger("kb.search")
def hybrid_search(
conn: sqlite3.Connection,
@@ -74,6 +77,21 @@ def hybrid_search(
# Internal helpers
# ---------------------------------------------------------------------------
def _sanitize_fts_query(query: str) -> str:
"""Escape a raw user query for safe use with FTS5 MATCH.
Splits on whitespace, strips double quotes from each token, wraps each
token in double quotes (making FTS5 treat all content as literals), and
joins with spaces. Returns empty string if no valid tokens remain.
"""
tokens = []
for token in query.split():
token = token.replace('"', '')
if token:
tokens.append(f'"{token}"')
return " ".join(tokens)
def _fts_search(
conn: sqlite3.Connection,
query: str,
@@ -86,10 +104,14 @@ def _fts_search(
Returns:
{chunk_id: bm25_score} where scores are positive (higher = better).
"""
safe_query = _sanitize_fts_query(query)
if not safe_query:
return {}
sql = "SELECT f.rowid AS chunk_id, bm25(chunks_fts) AS rank FROM chunks_fts f"
joins: list[str] = []
where: list[str] = ["chunks_fts MATCH ?"]
params: list = [query]
params: list = [safe_query]
if tags or doc_type:
joins.append("JOIN chunks c ON f.rowid = c.id")
@@ -111,7 +133,11 @@ def _fts_search(
sql += " ORDER BY rank LIMIT ?"
params.append(limit)
rows = conn.execute(sql, params).fetchall()
try:
rows = conn.execute(sql, params).fetchall()
except sqlite3.OperationalError:
logger.warning("FTS5 query failed for input: %r", query)
return {}
# BM25 returns negative values (lower = better match); negate so
# higher = better.
+25 -1
View File
@@ -4,6 +4,7 @@ import asyncio
import hashlib
import json
import logging
import shutil
from pathlib import Path
from kb import config, database, embeddings, staging
@@ -168,8 +169,31 @@ def _process_job(job_row) -> tuple[str, int | None, int]:
database.tag_document(conn, doc_id, tags)
conn.commit()
# --- Move original file to persistent storage ---------------------
ext = Path(filename).suffix or staged_path.suffix
dest = cfg.documents_dir / f"{content_hash}{ext}"
try:
cfg.documents_dir.mkdir(parents=True, exist_ok=True)
shutil.move(str(staged_path), str(dest))
conn_update = database.get_connection(cfg.db_path)
try:
conn_update.execute(
"UPDATE documents SET stored_path = ?, original_filename = ? WHERE id = ?",
(str(dest), filename, doc_id),
)
conn_update.commit()
finally:
conn_update.close()
logger.info("Stored original file: %s", dest)
except Exception as exc:
logger.warning("Failed to store original file: %s", exc)
staging.cleanup(staged_path)
return ("done", doc_id, len(chunk_texts))
finally:
conn.close()
staging.cleanup(staged_path)
# Only clean up staging if the file is still there (not moved)
if staged_path.exists():
staging.cleanup(staged_path)
View File
+223
View File
@@ -0,0 +1,223 @@
"""Tests for original document storage feature."""
import hashlib
import shutil
import sqlite3
from pathlib import Path
from unittest.mock import patch
import pytest
from fastapi.testclient import TestClient
@pytest.fixture
def data_dir(tmp_path):
    """Provide a temporary data directory containing ``staging`` and ``documents``."""
    for subdir in ("staging", "documents"):
        (tmp_path / subdir).mkdir()
    return tmp_path
@pytest.fixture
def db_conn(data_dir):
    """Yield a SQLite connection to ``data_dir / "kb.db"`` with the test schema.

    The database is a file inside the temporary data directory (not
    ``:memory:``). Rows are returned as ``sqlite3.Row`` and foreign keys are
    enforced. The documents, chunks, tags, document_tags and jobs tables are
    created; the connection is closed after the test finishes.
    """
    db_path = data_dir / "kb.db"
    conn = sqlite3.connect(str(db_path))
    # Row access by column name is what the tests below rely on.
    conn.row_factory = sqlite3.Row
    conn.execute("PRAGMA foreign_keys=ON")
    conn.executescript("""
CREATE TABLE IF NOT EXISTS documents (
id INTEGER PRIMARY KEY,
title TEXT,
source_path TEXT,
content_hash TEXT UNIQUE,
doc_type TEXT,
language TEXT,
stored_path TEXT,
original_filename TEXT,
created_at TEXT DEFAULT current_timestamp
);
CREATE TABLE IF NOT EXISTS chunks (
id INTEGER PRIMARY KEY,
document_id INTEGER REFERENCES documents(id) ON DELETE CASCADE,
chunk_index INTEGER,
text TEXT,
token_count INTEGER,
metadata TEXT DEFAULT '{}',
UNIQUE(document_id, chunk_index)
);
CREATE TABLE IF NOT EXISTS tags (
id INTEGER PRIMARY KEY,
name TEXT UNIQUE COLLATE NOCASE
);
CREATE TABLE IF NOT EXISTS document_tags (
document_id INTEGER REFERENCES documents(id) ON DELETE CASCADE,
tag_id INTEGER REFERENCES tags(id) ON DELETE CASCADE,
UNIQUE(document_id, tag_id)
);
CREATE TABLE IF NOT EXISTS jobs (
id INTEGER PRIMARY KEY,
filename TEXT,
status TEXT DEFAULT 'queued',
doc_type TEXT,
tags_json TEXT DEFAULT '[]',
title TEXT,
error TEXT,
document_id INTEGER,
chunk_count INTEGER DEFAULT 0,
staging_path TEXT,
content_hash TEXT,
created_at TEXT DEFAULT current_timestamp,
completed_at TEXT
);
""")
    conn.commit()
    yield conn
    conn.close()
@pytest.fixture
def sample_pdf(data_dir):
    """Write a small fake PDF into the staging directory.

    Returns a ``(path, content)`` tuple: the staged file and its exact bytes.
    """
    payload = b"%PDF-1.4 fake pdf content for testing"
    staged = data_dir / "staging" / "test_upload.pdf"
    staged.write_bytes(payload)
    return staged, payload
class TestWorkerFileStorage:
    """Tests for worker moving files to persistent storage.

    NOTE: every test here re-enacts the move / insert / delete steps inline
    with ``shutil`` and raw SQL; none of them calls the worker or the API
    handlers themselves.
    """

    def test_successful_ingestion_stores_file(self, data_dir, db_conn, sample_pdf):
        """7.1 - Test successful ingestion stores file at expected path.

        The expected path is ``documents/<sha256 of content>.pdf``.
        """
        staged_path, content = sample_pdf
        content_hash = hashlib.sha256(content).hexdigest()
        documents_dir = data_dir / "documents"
        expected_dest = documents_dir / f"{content_hash}.pdf"
        # Simulate what the worker does: move file to documents dir
        shutil.move(str(staged_path), str(expected_dest))
        assert expected_dest.exists()
        assert expected_dest.read_bytes() == content
        assert not staged_path.exists()
        # Simulate DB update
        db_conn.execute(
            "INSERT INTO documents(title, source_path, content_hash, doc_type, stored_path, original_filename) "
            "VALUES (?, ?, ?, ?, ?, ?)",
            ("Test PDF", str(staged_path), content_hash, "pdf", str(expected_dest), "test_upload.pdf"),
        )
        db_conn.commit()
        row = db_conn.execute("SELECT stored_path, original_filename FROM documents WHERE content_hash = ?", (content_hash,)).fetchone()
        assert row["stored_path"] == str(expected_dest)
        assert row["original_filename"] == "test_upload.pdf"

    def test_failed_ingestion_no_file_in_documents(self, data_dir, sample_pdf):
        """7.2 - Test failed ingestion does not leave file in documents dir.

        Only the staged file is removed here; nothing is ever written to the
        documents directory, so the assertion checks it is still empty.
        """
        staged_path, _ = sample_pdf
        documents_dir = data_dir / "documents"
        # Simulate failure: staging file gets cleaned up, nothing in documents dir
        staged_path.unlink()
        assert len(list(documents_dir.iterdir())) == 0

    def test_document_deletion_removes_stored_file(self, data_dir, db_conn, sample_pdf):
        """7.4 - Test document deletion removes stored file.

        Stores a file and its row, then deletes the row and unlinks the file,
        and asserts both are gone.
        """
        staged_path, content = sample_pdf
        content_hash = hashlib.sha256(content).hexdigest()
        documents_dir = data_dir / "documents"
        dest = documents_dir / f"{content_hash}.pdf"
        shutil.move(str(staged_path), str(dest))
        db_conn.execute(
            "INSERT INTO documents(title, source_path, content_hash, doc_type, stored_path, original_filename) "
            "VALUES (?, ?, ?, ?, ?, ?)",
            ("Test PDF", str(staged_path), content_hash, "pdf", str(dest), "test_upload.pdf"),
        )
        db_conn.commit()
        # Simulate delete: remove from DB and disk
        doc = db_conn.execute("SELECT id, stored_path FROM documents WHERE content_hash = ?", (content_hash,)).fetchone()
        stored = Path(doc["stored_path"])
        db_conn.execute("DELETE FROM documents WHERE id = ?", (doc["id"],))
        db_conn.commit()
        if stored.exists():
            stored.unlink()
        assert not stored.exists()
        assert db_conn.execute("SELECT COUNT(*) FROM documents", ()).fetchone()[0] == 0

    def test_download_404_for_document_without_stored_file(self, db_conn):
        """7.5 - Test download returns 404 for documents without stored files.

        Checks the precondition only: a row inserted without ``stored_path``
        reads back as ``None``. No HTTP request is made.
        """
        db_conn.execute(
            "INSERT INTO documents(title, source_path, content_hash, doc_type) "
            "VALUES (?, ?, ?, ?)",
            ("Old Doc", "/tmp/gone", "abc123", "pdf"),
        )
        db_conn.commit()
        row = db_conn.execute("SELECT stored_path FROM documents WHERE content_hash = 'abc123'").fetchone()
        assert row["stored_path"] is None
class TestFileDownloadEndpoint:
    """Tests for the /api/v1/documents/{id}/file endpoint logic.

    NOTE: the endpoint is not invoked; the tests verify the stored data and
    repeat the filename fallback expression inline.
    """

    def test_file_response_uses_original_filename(self, data_dir, db_conn, sample_pdf):
        """7.3 - Test file download uses correct original filename.

        Asserts that ``original_filename`` is stored separately from ``title``
        and that the stored file exists on disk.
        """
        staged_path, content = sample_pdf
        content_hash = hashlib.sha256(content).hexdigest()
        documents_dir = data_dir / "documents"
        dest = documents_dir / f"{content_hash}.pdf"
        shutil.move(str(staged_path), str(dest))
        db_conn.execute(
            "INSERT INTO documents(title, source_path, content_hash, doc_type, stored_path, original_filename) "
            "VALUES (?, ?, ?, ?, ?, ?)",
            ("My Report", str(staged_path), content_hash, "pdf", str(dest), "quarterly_report.pdf"),
        )
        db_conn.commit()
        doc = db_conn.execute("SELECT stored_path, original_filename, title FROM documents WHERE content_hash = ?", (content_hash,)).fetchone()
        # Verify the original filename is preserved and different from title
        assert doc["original_filename"] == "quarterly_report.pdf"
        assert doc["title"] == "My Report"
        assert Path(doc["stored_path"]).exists()

    def test_fallback_to_title_when_no_original_filename(self, data_dir, db_conn):
        """Test that title+ext is used when original_filename is NULL.

        The fallback is computed in the test body itself, not by the endpoint.
        """
        documents_dir = data_dir / "documents"
        fake_file = documents_dir / "somehash.pdf"
        fake_file.write_bytes(b"fake")
        db_conn.execute(
            "INSERT INTO documents(title, source_path, content_hash, doc_type, stored_path) "
            "VALUES (?, ?, ?, ?, ?)",
            ("Engine Manual", "/tmp/old", "hash456", "pdf", str(fake_file)),
        )
        db_conn.commit()
        doc = db_conn.execute("SELECT original_filename, title, stored_path FROM documents WHERE content_hash = 'hash456'").fetchone()
        # When original_filename is NULL, the endpoint should fall back to title + ext
        original_filename = doc["original_filename"]
        if not original_filename:
            ext = Path(doc["stored_path"]).suffix
            original_filename = (doc["title"] or "document") + ext
        assert original_filename == "Engine Manual.pdf"
@@ -0,0 +1,2 @@
schema: spec-driven
created: 2026-03-26
@@ -0,0 +1,40 @@
## Context
FTS5 has its own query syntax. Characters like `?`, `*`, `"`, `(`, `)`, `+`, `-`, `^` and keywords like `AND`, `OR`, `NOT`, `NEAR` have special meaning. The current code passes the raw user query to `chunks_fts MATCH ?` — parameterized (safe from SQL injection) but not safe from FTS5 syntax errors.
The fix point is `_fts_search()` in `engine/kb/search.py:92` where `params: list = [query]`.
## Goals / Non-Goals
**Goals:**
- Any user input to the search endpoint produces either valid results or an empty result set — never a 500 error
- Preserve the user's search intent as much as possible (don't over-strip)
**Non-Goals:**
- Exposing FTS5 advanced syntax to users (they can't use AND/OR/NEAR operators intentionally)
- Changing vector search (it already handles arbitrary strings via the embedding model)
## Decisions
### 1. Quote each token individually
Split the query on whitespace, wrap each token in double quotes (`"token"`), and join with spaces. FTS5 interprets double-quoted strings as literal phrases, disabling all operator parsing within them. Any embedded double quotes in a token are stripped.
Example: `what color is grass?` becomes `"what" "color" "is" "grass?"` — FTS5 treats `?` as a literal character inside quotes.
**Alternative considered**: Strip all non-alphanumeric characters. Rejected because it would break searches for terms containing hyphens, dots, or other meaningful punctuation (e.g., searching for "v2.0" or "self-hosted").
**Alternative considered**: Use a try/except to catch FTS5 errors and fall back. Rejected as a primary strategy because it silently degrades — but we'll add it as a safety net.
### 2. Handle empty/whitespace-only queries
If after sanitization no tokens remain, skip FTS search entirely and return empty results. This prevents sending an empty string to MATCH which would also error.
### 3. Try/except safety net
Wrap the FTS5 execute call in a try/except for `sqlite3.OperationalError`. If an edge case still slips through, return empty FTS results and log a warning rather than crashing with a 500.
## Risks / Trade-offs
- **[Reduced FTS expressiveness]** Users cannot use FTS5 operators such as `AND` and `OR`, or phrase matching. → Acceptable trade-off for a personal knowledge base tool where natural language queries are the norm. The hybrid search (vector + FTS) compensates.
- **[Edge cases]** Some Unicode or control characters might still cause issues. → The try/except safety net handles these.
@@ -0,0 +1,24 @@
## Why
Searching with natural language queries containing characters like `?`, `"`, `*`, `(`, `)`, `-`, or FTS5 keywords (`AND`, `OR`, `NOT`, `NEAR`) causes a 500 error because the raw query string is passed directly to `chunks_fts MATCH ?` without escaping. Users should be able to type anything into a search query without triggering syntax errors.
## What Changes
- **Sanitize FTS5 query input**: Escape or strip FTS5 special characters from the user's query before passing it to the MATCH operator
- **Graceful fallback**: If the sanitized query produces no valid FTS5 terms, return empty results from FTS instead of erroring
## Capabilities
### New Capabilities
_(none)_
### Modified Capabilities
- `engine-api`: The "Hybrid search" requirement changes — the engine must sanitize user queries to prevent FTS5 syntax errors for any input
## Impact
- **Engine search** (`engine/kb/search.py`): `_fts_search()` needs query sanitization before the MATCH parameter
- **No client changes**: The client already displays results or errors correctly
- **No schema changes**: No database modifications needed
@@ -0,0 +1,21 @@
## MODIFIED Requirements
### Requirement: Hybrid search
The engine SHALL provide hybrid search combining BM25 full-text search (via FTS5) and vector similarity search (via sqlite-vec), merged using Reciprocal Rank Fusion. Search SHALL complete in under 100ms when the model is warm. The engine SHALL sanitize user query strings to prevent FTS5 syntax errors for any input.
#### Scenario: Search with special characters
- **WHEN** a client sends `POST /api/v1/search` with body `{"query": "what color is grass?"}`
- **THEN** the engine SHALL sanitize the query for FTS5, execute the search successfully, and return results (not a 500 error)
#### Scenario: Search with FTS5 operators in query
- **WHEN** a client sends `POST /api/v1/search` with body `{"query": "NOT something OR (other)"}`
- **THEN** the engine SHALL treat the input as literal search terms, not FTS5 operators, and return matching results
#### Scenario: Search with only special characters
- **WHEN** a client sends `POST /api/v1/search` with body `{"query": "??!@#"}`
- **THEN** the engine SHALL return HTTP 200 with an empty result set (not a 500 error)
#### Scenario: Search with quotes in query
- **WHEN** a client sends `POST /api/v1/search` with body `{"query": "the \"quick\" fox"}`
- **THEN** the engine SHALL sanitize embedded quotes and return results normally
@@ -0,0 +1,15 @@
## 1. Query Sanitization
- [x] 1.1 Add `_sanitize_fts_query(query)` function to `engine/kb/search.py` that splits on whitespace, strips double quotes from each token, wraps each token in double quotes, and joins with spaces
- [x] 1.2 Handle edge case: if no valid tokens remain after sanitization, return empty dict from `_fts_search` without executing the query
## 2. Integration
- [x] 2.1 Call `_sanitize_fts_query()` in `_fts_search()` before adding the query to params (line 92)
- [x] 2.2 Add try/except `sqlite3.OperationalError` around the FTS5 execute call — log a warning and return empty results on error
## 3. Testing
- [x] 3.1 Test: `kb search "what color is grass?"` returns results, not a 500 error
- [x] 3.2 Test: `kb search "NOT something OR (other)"` returns results, treating input as literal terms
- [x] 3.3 Test: query with only special characters returns empty results, not an error
@@ -0,0 +1,2 @@
schema: spec-driven
created: 2026-03-26
@@ -0,0 +1,58 @@
## Context
The engine currently accepts all uploads with HTTP 202, stages the file, creates a job record, and relies on the background worker to detect duplicates via SHA256 content hash. When a duplicate is found, the worker marks the job as `skipped` — but the user has already received a success response and must poll job status to discover the duplicate. This creates unnecessary I/O (staging), pollutes the job list, and provides poor UX.
The `documents` table already has a `content_hash TEXT UNIQUE` column, and `database.hash_exists()` already exists. The infrastructure for dedup is in place — it just runs too late in the pipeline.
## Goals / Non-Goals
**Goals:**
- Reject duplicate uploads at the API boundary with HTTP 409 and useful context (existing document ID/title)
- Avoid staging files or creating job records for duplicates
- Apply to both file uploads and note submissions
- Keep the worker-side hash check as a race condition safety net
- Update the Go client to handle 409 and display a clear message
**Non-Goals:**
- Fuzzy/near-duplicate detection (e.g., same PDF with different metadata) — byte-identical only
- Changing the hash algorithm (SHA256 is fine)
- Adding a "force re-import" override flag (can be added later if needed)
- Dedup across different file formats with identical content (e.g., .md and .pdf of same text)
## Decisions
### 1. Hash in the upload endpoint, before staging
Compute SHA256 from the uploaded bytes in `submit_job()` before calling `stage_file()`. This avoids writing to disk or creating a DB job record for duplicates.
**Alternative considered**: Hash after staging but before job creation. Rejected because it still wastes disk I/O for the staging write.
### 2. Return HTTP 409 Conflict with context-dependent metadata
The 409 response includes `{"error": "duplicate", ...}` with a distinct shape depending on where the duplicate was found:
- **Already-ingested document**: `{"error": "duplicate", "document_id": <id>, "title": "<title>"}`
- **In-flight job (queued/processing)**: `{"error": "duplicate", "job_id": <id>, "title": "<filename>"}`
This allows clients to distinguish between "this document already exists" and "this document is already being processed" and display appropriate messages.
**Alternative considered**: Return 200 with a `"status": "duplicate"` field. Rejected because 409 is the semantically correct status code and allows clients to distinguish duplicates from successful uploads without parsing the body.
### 3. New database helper: `get_document_by_hash()`
Returns a dict with duplicate info for a given hash, or `None`. Checks both the `documents` table (already ingested) and the `jobs` table (queued/processing), returning `document_id` or `job_id` accordingly. The `content_hash` column on the `jobs` table is populated at upload time to support this check. The boolean `hash_exists()` is retained for the worker safety net.
**Alternative considered**: Modify `hash_exists()` to return the document row. Rejected to avoid changing the worker's existing interface — keep changes minimal.
### 4. Retain worker-side dedup as safety net
The worker's `hash_exists()` check stays. In theory, two identical uploads could arrive in the same instant — both pass the API hash check before either commits. The jobs-table check closes most of this window (the hash is written at job creation), but a narrow race remains between the API check and the job insert. The UNIQUE constraint on `documents.content_hash` is the final backstop.
### 5. Note dedup: hash the text content
For notes submitted via the `note` field, SHA256-hash the UTF-8 encoded text. This catches identical note resubmissions.
## Risks / Trade-offs
- **[Race condition window]** Two identical files uploaded in the same millisecond could both pass the API hash check. → Mitigated by the worker-side `hash_exists()` check and the UNIQUE constraint. The second job would be `skipped`, not a crash.
- **[Blocking I/O in async endpoint]** SHA256 hashing is CPU-bound but fast (~5ms for 10MB). → Acceptable for the upload endpoint which already reads the full file into memory. No need for `run_in_executor`.
- **[Client compatibility]** Older clients not expecting 409 will see an error. → This is correct behavior — they'll see an HTTP error rather than silently accepting a duplicate. The Go client will be updated to handle it gracefully.
@@ -0,0 +1,31 @@
## Why
Duplicate document detection currently happens in the background worker — the upload endpoint always returns HTTP 202, and the user only discovers a duplicate later when the job status is `skipped`. This wastes staging I/O, creates noise in the job list, and gives poor user feedback. Moving the SHA256 content hash check to the upload endpoint allows immediate rejection with a clear error, preventing unnecessary work and giving the user instant feedback.
## What Changes
- **Compute content hash at upload time**: The `POST /api/v1/jobs` endpoint will SHA256-hash the uploaded file bytes before staging and check against `documents.content_hash`
- **Reject duplicates immediately**: Return HTTP 409 Conflict with the existing document ID when a duplicate is detected, instead of accepting and later skipping
- **No job created for duplicates**: Duplicate uploads will not create a job record or stage a file
- **Demote worker-side dedup to a safety net**: The background worker's `hash_exists()` check becomes redundant for the normal flow, but it is retained (not removed) as a race condition guard
- **Update Go client**: Surface the 409 response with a clear message (e.g., "Already imported: <title> (doc ID: <id>)")
- **Note dedup**: Apply the same check to notes — hash the note text content
## Capabilities
### New Capabilities
_(none — this modifies existing capabilities)_
### Modified Capabilities
- `engine-api`: The "Async ingestion via job queue" requirement changes — duplicate content is now rejected at upload time (HTTP 409) instead of accepted and later skipped by the worker. The "Duplicate content detection" scenario moves from background to synchronous.
- `go-client`: The "Add command" requirement changes — the client must handle HTTP 409 responses and display the duplicate document info to the user.
## Impact
- **Engine API** (`engine/kb/routes/jobs.py`): `submit_job()` gains hash computation and DB lookup before staging/job creation
- **Engine database** (`engine/kb/database.py`): Need a query to return the existing document ID/title for a given hash (not just boolean exists check)
- **Engine worker** (`engine/kb/worker.py`): Dedup check retained as safety net but no longer the primary guard
- **Go client** (`client/cmd/add.go`): Handle 409 response, display duplicate info
- **API contract**: New HTTP 409 response on `POST /api/v1/jobs` — this is additive, not breaking, since no consumer expects 409 today
@@ -0,0 +1,41 @@
## MODIFIED Requirements
### Requirement: Async ingestion via job queue
The engine SHALL accept file uploads and text notes for ingestion asynchronously. Uploaded content SHALL be written to a staging area and a job record created in the database. The engine SHALL return HTTP 202 immediately. A background worker SHALL process queued jobs sequentially. Before staging, the engine SHALL compute a SHA256 hash of the uploaded content and reject duplicates immediately.
#### Scenario: Upload a PDF file
- **WHEN** a client sends `POST /api/v1/jobs` with a multipart form containing a PDF file and optional fields (tags, doc_type)
- **THEN** the engine SHALL compute the SHA256 hash of the file bytes, verify no existing document has the same hash, write the file to the staging directory, create a job record with status `queued`, and return HTTP 202 with `{"job_id": "<id>", "status": "queued", "filename": "report.pdf"}`
#### Scenario: Upload a text note
- **WHEN** a client sends `POST /api/v1/jobs` with a multipart form containing a `note` text field and optional `title` field
- **THEN** the engine SHALL compute the SHA256 hash of the note text (UTF-8 encoded), verify no existing document has the same hash, write the note content to a staging file, create a job record with status `queued`, and return HTTP 202 with the job ID
#### Scenario: Upload multiple files in sequence
- **WHEN** a client sends multiple `POST /api/v1/jobs` requests in quick succession
- **THEN** the engine SHALL queue each job independently and the background worker SHALL process them in FIFO order
#### Scenario: Duplicate file detected at upload time (already ingested)
- **WHEN** a client uploads a file whose SHA256 content hash matches an already-ingested document
- **THEN** the engine SHALL NOT stage the file or create a job record, and SHALL return HTTP 409 with `{"error": "duplicate", "document_id": <id>, "title": "<title>"}`
#### Scenario: Duplicate file detected at upload time (in-flight job)
- **WHEN** a client uploads a file whose SHA256 content hash matches a queued or processing job
- **THEN** the engine SHALL NOT stage the file or create a job record, and SHALL return HTTP 409 with `{"error": "duplicate", "job_id": <id>, "title": "<filename>"}`
#### Scenario: Duplicate note detected at upload time (already ingested)
- **WHEN** a client submits a note whose SHA256 content hash matches an already-ingested document
- **THEN** the engine SHALL NOT stage the note or create a job record, and SHALL return HTTP 409 with `{"error": "duplicate", "document_id": <id>, "title": "<title>"}`
#### Scenario: Duplicate note detected at upload time (in-flight job)
- **WHEN** a client submits a note whose SHA256 content hash matches a queued or processing job
- **THEN** the engine SHALL NOT stage the note or create a job record, and SHALL return HTTP 409 with `{"error": "duplicate", "job_id": <id>, "title": "<filename>"}`
#### Scenario: Duplicate uploaded during concurrent request handling
- **WHEN** two identical files are uploaded in the same instant, both passing the API hash check before either job is committed
- **THEN** both jobs SHALL be queued, and the background worker SHALL process the first normally and mark the second as `skipped` (worker-side safety net via `hash_exists()` and UNIQUE constraint)
#### Scenario: Upload failure due to unsupported file type
- **WHEN** a client uploads a file with an unsupported extension
- **THEN** the engine SHALL return HTTP 422 with an error message listing supported types
@@ -0,0 +1,49 @@
## MODIFIED Requirements
### Requirement: Add command (file and note ingestion)
The client SHALL provide a `kb add` command that uploads files or notes to the engine for async ingestion. The client SHALL exit immediately after a successful upload. The client SHALL handle duplicate rejection (HTTP 409) and display the existing document information.
#### Scenario: Add a single file
- **WHEN** the user runs `kb add report.pdf`
- **THEN** the client SHALL upload the file via `POST /api/v1/jobs` (multipart), print "Queued: report.pdf", and exit
#### Scenario: Add a file with tags
- **WHEN** the user runs `kb add manual.pdf --tags car,maintenance`
- **THEN** the client SHALL include the tags in the multipart upload metadata
#### Scenario: Add a directory recursively
- **WHEN** the user runs `kb add ~/documents/ --recursive`
- **THEN** the client SHALL discover all supported files in the directory tree, upload each one sequentially, and print "Queued: N files"
#### Scenario: Add a text note
- **WHEN** the user runs `kb add --note "The server room is in building 3, floor 2"`
- **THEN** the client SHALL submit the note text via `POST /api/v1/jobs` (multipart with note field), print "Queued: note", and exit
#### Scenario: Duplicate file rejected (already ingested)
- **WHEN** the user runs `kb add report.pdf` and the engine returns HTTP 409 with `{"error": "duplicate", "document_id": 42, "title": "report.pdf"}`
- **THEN** the client SHALL print "Already imported: report.pdf (doc ID: 42)" and exit with code 0
#### Scenario: Duplicate file rejected (in-flight job)
- **WHEN** the user runs `kb add report.pdf` and the engine returns HTTP 409 with `{"error": "duplicate", "job_id": 7, "title": "report.pdf"}`
- **THEN** the client SHALL print "Already queued: report.pdf (job ID: 7)" and exit with code 0
#### Scenario: Duplicate file in recursive add
- **WHEN** the user runs `kb add ~/documents/ --recursive` and some files are rejected as duplicates
- **THEN** the client SHALL print the duplicate message for each rejected file (distinguishing "Already imported" from "Already queued"), continue uploading remaining files, and include a summary (e.g., "Queued: 5 files, 2 duplicates skipped")
#### Scenario: Duplicate with JSON output
- **WHEN** the user runs `kb add report.pdf --format json` and the engine returns HTTP 409
- **THEN** the client SHALL output the raw JSON response from the engine, including the `document_id` (or the `job_id` when the duplicate is an in-flight job) and the title
#### Scenario: Add with JSON output
- **WHEN** the user runs `kb add report.pdf --format json`
- **THEN** the client SHALL output the JSON response from the engine including the job_id
#### Scenario: File not found
- **WHEN** the user runs `kb add nonexistent.pdf`
- **THEN** the client SHALL print an error and exit with a non-zero code without making any API call
#### Scenario: Upload failure
- **WHEN** the upload fails (network error, engine returns 4xx/5xx other than 409)
- **THEN** the client SHALL print the error and exit with a non-zero code
@@ -0,0 +1,22 @@
## 1. Database Layer
- [x] 1.1 Add `get_document_by_hash(conn, content_hash)` function to `engine/kb/database.py` that returns a dict of duplicate info (`document_id` for an ingested document or `job_id` for a queued/processing job, plus `title`) or `None`
## 2. Upload Endpoint
- [x] 2.1 Update `submit_job()` in `engine/kb/routes/jobs.py` to compute SHA256 hash of uploaded file bytes before staging
- [x] 2.2 Add duplicate check: call `get_document_by_hash()` and return HTTP 409 with `{"error": "duplicate", "document_id": <id>, "title": "<title>"}` if match found
- [x] 2.3 Apply same hash check for note submissions (hash the UTF-8 encoded note text)
## 3. Go Client
- [x] 3.1 Update `uploadFile()` in `client/cmd/add.go` to handle HTTP 409 responses — parse the JSON body and print "Already imported: <title> (doc ID: <id>)"
- [x] 3.2 Update recursive directory upload to continue on 409, track duplicate count, and include in summary output
- [x] 3.3 Handle 409 in JSON output mode — pass through the raw engine response
## 4. Testing
- [x] 4.1 Test: upload a file, then upload the same file again — verify 409 with correct document_id and title
- [x] 4.2 Test: upload a note, then upload the same note text — verify 409
- [x] 4.3 Test: upload a file, then upload a different file — verify 202 as normal
- [x] 4.4 Test: verify the worker-side `hash_exists()` safety net still works (direct job insertion bypassing API)
@@ -0,0 +1,2 @@
schema: spec-driven
created: 2026-03-27
@@ -0,0 +1,84 @@
## Context
Currently, uploaded files pass through a staging directory and are deleted after the worker extracts chunks and embeddings. The `documents.source_path` column stores the (now-stale) staging path. Users who want the original file must re-source it externally. The data directory structure today is:
```
/data/
kb.db
hf_cache/
staging/ # temporary, cleaned after processing
```
## Goals / Non-Goals
**Goals:**
- Persist every successfully-ingested original file for the lifetime of the document
- Serve the original file via API (`GET /api/v1/documents/{id}/file`)
- Clean up stored files when a document is deleted
- Work transparently with the existing Docker volume mount (`/data`)
**Non-Goals:**
- Serving transformed/converted versions of documents (e.g. PDF→HTML)
- De-duplicating file storage (same content hash = same row, so 1:1 is fine)
- Compression or archival of stored files
- Retroactive storage of files ingested before this change (they're already gone)
## Decisions
### 1. Storage layout: content-hash-based flat directory
Store files at `{data_dir}/documents/{content_hash}{ext}` (e.g. `documents/a1b2c3...d4.pdf`).
**Why over document-ID naming:** Content hash is available at staging time before the DB row exists, avoids race conditions, and makes dedup trivially safe (same hash = same file, overwrite is harmless). The hash is already computed for dedup checks.
**Why flat over nested:** The KB is a personal tool — expected scale is hundreds to low-thousands of documents. A flat directory is simpler and sufficient. If needed later, a `ab/cd/` prefix scheme is easy to add.
**Alternatives considered:**
- *Store in SQLite as BLOBs*: Bloats the DB, complicates backups, and degrades WAL performance for large files. Rejected.
- *Keep the staging path as-is*: Staging uses UUID prefixes which are meaningless; content-hash naming is deterministic and self-deduplicating.
### 2. Move file from staging to documents dir (not copy)
Use `shutil.move()` from staging to documents dir after successful ingestion, before `staging.cleanup()`. This avoids doubling disk usage during processing.
**Why not copy-then-delete:** Move is atomic on the same filesystem (which `/data/staging` and `/data/documents` share). Faster, no temporary disk spike.
### 3. New columns `stored_path` and `original_filename` on `documents` table
Add two nullable columns:
- `stored_path TEXT` — permanent file location on disk
- `original_filename TEXT` — the exact filename from the upload (e.g. `report.pdf`)
Both are nullable because existing documents (ingested before this change) won't have values.
**Why `original_filename` separate from `title`:** The `title` field can be user-overridden (e.g. "Engine Manual" instead of `report.pdf`). When serving the file for download, the `Content-Disposition` header should use the original filename so the downloaded file has the correct name and extension. The `original_filename` is sourced from `jobs.filename` which is already captured at upload time.
Keep `source_path` as-is for backward compatibility (it records what the staging path was). `stored_path` is the permanent location.
**Migration:** Two `ALTER TABLE` statements — safe additive migrations, no data rewrite needed.
### 4. File download endpoint returns the file directly
`GET /api/v1/documents/{id}/file` uses FastAPI's `FileResponse` with:
- `media_type` derived from the file extension
- `Content-Disposition: attachment; filename="{original_filename}"` (falls back to `{title}{ext}` if `original_filename` is NULL)
- Returns 404 if `stored_path` is NULL or file is missing from disk
### 5. Delete cascades to file removal
When `DELETE /api/v1/documents/{id}` is called, delete the stored file from disk after the DB delete succeeds. If file removal fails (already gone, permissions), log a warning but don't fail the API call — the DB is the source of truth.
## Risks / Trade-offs
- **Disk usage increases** — every ingested file persists. For the personal-use scale this is expected and acceptable. Users manage this via document deletion.
→ Mitigation: Document the storage behavior; `GET /api/v1/status` already shows DB size, could add documents-dir size later.
- **Pre-existing documents have no stored file** — `stored_path` will be NULL for documents ingested before this change.
→ Mitigation: The download endpoint returns 404 with a clear message ("original file not available — ingested before document storage was enabled"). No attempt to backfill.
- **File-DB consistency** — crash between DB commit and file move could leave orphan staged files or missing stored files.
→ Mitigation: Move file first, then commit DB. If DB commit fails, the file in documents dir is harmless (orphan cleanup can be added later). If move fails, the job fails and staged file remains for retry.
## Open Questions
None — the scope is straightforward enough to proceed.
@@ -0,0 +1,30 @@
## Why
The knowledge base currently discards original files after chunking and embedding. Once a document is ingested, only the extracted text chunks and vectors remain — the original PDF, markdown, or code file is deleted from staging. Users cannot retrieve the source document from the KB, which limits its usefulness as a document store and prevents use cases like re-processing with a different model or serving the original file to downstream tools.
## What Changes
- Add a persistent document storage directory (`{data_dir}/documents/`) alongside the SQLite database
- After successful ingestion, move the original file from staging to permanent storage instead of deleting it
- Store the permanent file path in the `documents` table (`stored_path` column) and the original upload filename (`original_filename` column) so downloads use the correct name
- Add an API endpoint to download the original file by document ID
- Add a CLI command to export/retrieve the original document
- **BREAKING**: Delete document now also removes the stored file from disk
- Notes (text-only) are stored as `.note` files in the same directory for consistency
## Capabilities
### New Capabilities
- `document-storage`: Persistent storage of original uploaded files on disk, lifecycle management (store on ingest, delete on document removal), and retrieval via API
### Modified Capabilities
- `engine-api`: New endpoint `GET /api/v1/documents/{id}/file` to download the original file; delete endpoint must also clean up stored files; ingestion worker stores files instead of discarding them
## Impact
- **Engine config**: New `documents_dir` property on Config, new directory created at startup via `ensure_dirs()`
- **Worker**: After successful chunking, move the file from staging to the documents dir; record the permanent location in the new `stored_path` column rather than in `source_path` (which keeps the staging path)
- **Database schema**: Add `stored_path` and `original_filename` columns to `documents` table (migration for existing DBs)
- **Routes**: New file-download endpoint; update delete handler to remove stored file
- **Go client**: New `export` / `get-file` subcommand to download original documents
- **Docker**: `documents/` directory lives inside the existing `/data` volume — no new mounts needed
@@ -0,0 +1,83 @@
## ADDED Requirements
### Requirement: Persistent original file storage
The engine SHALL persistently store the original uploaded file on disk after successful ingestion. Files SHALL be stored at `{data_dir}/documents/{content_hash}{extension}` where `content_hash` is the SHA-256 hex digest already computed for dedup and `extension` is preserved from the original filename. The `documents` table SHALL record the stored file path in a `stored_path` column and the original upload filename in an `original_filename` column.
#### Scenario: File stored after successful ingestion
- **WHEN** the background worker successfully processes an ingestion job for a PDF file
- **THEN** the worker SHALL move the staged file to `{data_dir}/documents/{content_hash}.pdf`, store the permanent path in `documents.stored_path`, store the original filename in `documents.original_filename`, and delete the staging entry
#### Scenario: Note stored after successful ingestion
- **WHEN** the background worker successfully processes an ingestion job for a text note
- **THEN** the worker SHALL move the staged `.note` file to `{data_dir}/documents/{content_hash}.note` and store the permanent path in `documents.stored_path`
#### Scenario: Markdown file stored after successful ingestion
- **WHEN** the background worker successfully processes an ingestion job for a markdown file
- **THEN** the worker SHALL move the staged file to `{data_dir}/documents/{content_hash}.md` and store the permanent path in `documents.stored_path`
#### Scenario: Code file stored after successful ingestion
- **WHEN** the background worker successfully processes an ingestion job for a code file (e.g. `.py`, `.go`)
- **THEN** the worker SHALL move the staged file to `{data_dir}/documents/{content_hash}{original_extension}` and store the permanent path in `documents.stored_path`
#### Scenario: Documents directory created at startup
- **WHEN** the engine starts up and calls `ensure_dirs()`
- **THEN** the `{data_dir}/documents/` directory SHALL be created if it does not exist
#### Scenario: Ingestion failure does not store file
- **WHEN** the background worker fails to process an ingestion job
- **THEN** the staged file SHALL be cleaned up as before and no file SHALL be written to the documents directory
---
### Requirement: File retrieval via API
The engine SHALL serve the original stored file for any document that has a stored file on disk.
#### Scenario: Download original file
- **WHEN** a client sends `GET /api/v1/documents/{id}/file` for a document with a stored file
- **THEN** the engine SHALL return the file with appropriate `Content-Type` based on file extension and `Content-Disposition: attachment; filename="{original_filename}"` header, falling back to `{title}{ext}` if `original_filename` is NULL
#### Scenario: Download file for pre-existing document
- **WHEN** a client sends `GET /api/v1/documents/{id}/file` for a document ingested before this feature was added (stored_path is NULL)
- **THEN** the engine SHALL return HTTP 404 with `{"error": "Original file not available - ingested before document storage was enabled"}`
#### Scenario: Download file when file missing from disk
- **WHEN** a client sends `GET /api/v1/documents/{id}/file` for a document whose `stored_path` is set but the file no longer exists on disk
- **THEN** the engine SHALL return HTTP 404 with `{"error": "Stored file not found on disk"}`
#### Scenario: Download file for non-existent document
- **WHEN** a client sends `GET /api/v1/documents/{id}/file` with a non-existent document ID
- **THEN** the engine SHALL return HTTP 404 with `{"error": "Document not found"}`
---
### Requirement: File cleanup on document deletion
The engine SHALL remove the stored original file from disk when a document is deleted.
#### Scenario: Delete document with stored file
- **WHEN** a client sends `DELETE /api/v1/documents/{id}` for a document with a stored file
- **THEN** the engine SHALL delete the document from the database (cascading to chunks, embeddings, tags) AND delete the stored file from disk
#### Scenario: Delete document when stored file already missing
- **WHEN** a client sends `DELETE /api/v1/documents/{id}` for a document whose stored file has been manually removed from disk
- **THEN** the engine SHALL delete the document from the database successfully and log a warning about the missing file
#### Scenario: Delete document without stored file (pre-existing)
- **WHEN** a client sends `DELETE /api/v1/documents/{id}` for a document with `stored_path` NULL
- **THEN** the engine SHALL delete the document from the database without attempting file removal
---
### Requirement: Database schema migration for stored_path and original_filename
The engine SHALL add `stored_path` and `original_filename` columns to the `documents` table for tracking permanent file locations and original upload filenames.
#### Scenario: Fresh database initialization
- **WHEN** the engine initializes a new database
- **THEN** the `documents` table SHALL include `stored_path TEXT` and `original_filename TEXT` columns in its schema
#### Scenario: Existing database migration
- **WHEN** the engine starts with a database created before this feature
- **THEN** the engine SHALL add `stored_path TEXT` and `original_filename TEXT` to the `documents` table via `ALTER TABLE` if the columns do not exist
@@ -0,0 +1,61 @@
## MODIFIED Requirements
### Requirement: Background ingestion worker
The engine SHALL run a background worker that processes queued jobs. The worker SHALL process one job at a time. For each job, it SHALL: detect document type, run the appropriate chunking pipeline (Docling for PDFs, header-based for Markdown, AST-based for code, whole-text for notes), generate embeddings using the resident model, insert chunks and vectors into the database, and move the original file to persistent storage.
#### Scenario: Successful PDF ingestion
- **WHEN** the background worker picks up a queued PDF job
- **THEN** it SHALL update the job status to `processing`, run Docling conversion and chunking, embed all chunks, insert document and chunks into the database, move the staged file to `{data_dir}/documents/{content_hash}.pdf`, update `documents.stored_path` with the permanent path, store the original filename in `documents.original_filename`, update the job status to `done` with the resulting document_id and chunk count, and clean up the staging entry
#### Scenario: Ingestion failure
- **WHEN** the background worker encounters an error during processing (e.g., corrupt PDF)
- **THEN** it SHALL update the job status to `failed` with the error message, delete the staged file, and continue processing the next queued job
#### Scenario: Search during active ingestion
- **WHEN** a search request arrives while the background worker is processing a job
- **THEN** the search SHALL execute without blocking (SQLite WAL mode) and return results from already-ingested documents
---
### Requirement: Document management
The engine SHALL provide endpoints to list, inspect, remove, and download original files for ingested documents.
#### Scenario: List documents
- **WHEN** a client sends `GET /api/v1/documents`
- **THEN** the engine SHALL return a JSON array of documents with id, title, doc_type, tags, chunk_count, and created_at
#### Scenario: List documents with filters
- **WHEN** a client sends `GET /api/v1/documents?type=pdf&tags=manual`
- **THEN** the engine SHALL return only documents matching all specified filters
#### Scenario: Get document details
- **WHEN** a client sends `GET /api/v1/documents/{id}`
- **THEN** the engine SHALL return the full document record including all chunks, their text content, and whether the original file is available (`has_file: true/false`)
#### Scenario: Download original file
- **WHEN** a client sends `GET /api/v1/documents/{id}/file`
- **THEN** the engine SHALL return the original file with appropriate Content-Type and `Content-Disposition: attachment; filename="{original_filename}"` headers, or HTTP 404 if the file is not available
#### Scenario: Remove a document
- **WHEN** a client sends `DELETE /api/v1/documents/{id}`
- **THEN** the engine SHALL delete the document, all its chunks, associated embeddings, tag associations, and the stored original file from disk, and return HTTP 200 with a confirmation
#### Scenario: Remove non-existent document
- **WHEN** a client sends `DELETE /api/v1/documents/{id}` with a non-existent ID
- **THEN** the engine SHALL return HTTP 404
---
### Requirement: Engine configuration via environment variables
The engine SHALL be configured via environment variables. No config file is read by the engine — all configuration comes from the environment (set via compose.yaml or Docker run).
#### Scenario: Default configuration
- **WHEN** the engine starts with no environment variables set
- **THEN** it SHALL use defaults: data directory `/data`, model `all-MiniLM-L6-v2`, device `auto`, no API key required. It SHALL create `staging/` and `documents/` subdirectories under the data directory.
#### Scenario: Custom model
- **WHEN** `KB_MODEL` is set to `BAAI/bge-small-en-v1.5`
- **THEN** the engine SHALL download and load that model instead of the default
@@ -0,0 +1,38 @@
## 1. Config and Schema
- [x] 1.1 Add `documents_dir` property to `Config` in `engine/kb/config.py` returning `{data_dir}/documents`
- [x] 1.2 Add `documents_dir.mkdir()` to `Config.ensure_dirs()`
- [x] 1.3 Add `stored_path TEXT` and `original_filename TEXT` columns to `documents` table in `init_schema()` (both CREATE TABLE and ALTER TABLE migration for existing DBs)
## 2. Worker — File Persistence
- [x] 2.1 In `worker._process_job()`, after successful DB commit, move staged file to `{documents_dir}/{content_hash}{ext}` using `shutil.move()`
- [x] 2.2 Update `documents.stored_path` and `documents.original_filename` (from `jobs.filename`) after moving the file
- [x] 2.3 Remove `staging.cleanup()` call for successful jobs (file is moved, not deleted); keep cleanup on failure path
## 3. API — File Download Endpoint
- [x] 3.1 Add `GET /api/v1/documents/{id}/file` route in `engine/kb/routes/documents.py` using FastAPI `FileResponse`
- [x] 3.2 Return appropriate `Content-Type` from file extension and `Content-Disposition: attachment; filename="{original_filename}"` (fall back to `{title}{ext}` if NULL)
- [x] 3.3 Handle 404 cases: document not found, `stored_path` is NULL, file missing from disk
## 4. API — Delete Cleanup
- [x] 4.1 Update `DELETE /api/v1/documents/{id}` in `engine/kb/routes/documents.py` to also delete the stored file from disk
- [x] 4.2 Handle missing file gracefully (log warning, don't fail the request)
## 5. Document Details Enhancement
- [x] 5.1 Add `has_file` boolean to `GET /api/v1/documents/{id}` response based on `stored_path` presence and file existence on disk
## 6. Go Client
- [x] 6.1 Add `kb export <doc_id>` subcommand to the Go client that calls `GET /api/v1/documents/{id}/file` and writes to stdout or a specified output path
## 7. Testing
- [x] 7.1 Test successful ingestion stores file at expected path
- [x] 7.2 Test failed ingestion does not leave file in documents dir
- [x] 7.3 Test file download endpoint returns correct content and headers
- [x] 7.4 Test document deletion removes stored file
- [x] 7.5 Test download returns 404 for documents without stored files
@@ -0,0 +1,2 @@
schema: spec-driven
created: 2026-03-28
@@ -0,0 +1,93 @@
## Context
Currently the project uses a single version number shared between client and engine, managed by `release.sh`. Both `client/VERSION` and `engine/VERSION` are always bumped to the same value. A single git tag `vX.Y.Z` is created, and a single Gitea release bundles Go client binaries and Docker engine image references. This means any change to either component forces a full release of both.
The client is a Go binary distributed as platform-specific downloads. The engine is a Python FastAPI server distributed as Docker images. They communicate over HTTP via `/api/v1/` endpoints. The engine already exposes its version via `GET /api/v1/status` → `{"version": "X.Y.Z", ...}`.
## Goals / Non-Goals
**Goals:**
- Allow client and engine to have independent version numbers and release cadences
- Provide a runtime compatibility check so users get a clear error when their client is too new for their engine
- Split release tooling so each component can be released without touching the other
**Non-Goals:**
- API versioning beyond the existing `/api/v1/` path prefix
- Backward-compatible negotiation or feature detection (client either works or fails)
- Automatic upgrades or update notifications
- Version checking in the other direction (engine requiring minimum client)
## Decisions
### 1. Tag naming: `client-vX.Y.Z` and `engine-vX.Y.Z`
Prefix-style tags clearly identify which component a release belongs to and sort well in git tag listings.
**Why over path-style (`client/vX.Y.Z`):** Slashes in git tags can cause issues with some tooling and are less conventional. Prefix-style is simpler and widely used in monorepos.
**Why over separate repos:** The project is small and tightly coupled at the API level. A monorepo with prefixed tags keeps everything together while allowing independent releases.
### 2. Two release scripts: `release-client.sh` and `release-engine.sh`
Each script handles its own component end-to-end: version bump, build, tag, release, push.
**Why over a single script with flags:** Two simple scripts are easier to understand and maintain than one script with component-selection logic. Each script is ~100 lines instead of one ~200-line script with branching. The shared logic (version helpers, pre-flight checks) is minimal and acceptable to duplicate.
**Shared structure for both scripts:**
1. Pre-flight checks (on main branch, tag doesn't exist)
2. Version bump (reads/writes component's VERSION file only)
3. Build artifacts (Go binaries or Docker images)
4. Commit version bump, create prefixed tag, push
5. Create Gitea release with assets
6. (Engine only) Push Docker images
### 3. `MinEngineVersion` as a build-time constant in the Go client
The client embeds a `MinEngineVersion` package-level string variable alongside the existing `Version` variable (Go's `-ldflags -X` can only set string variables, not constants). It is set via `-ldflags` at build time, sourced from a `client/MIN_ENGINE_VERSION` file.
**Why a separate file over embedding in `VERSION`:** The two values have different lifecycles. `VERSION` changes every release; `MIN_ENGINE_VERSION` changes only when the client starts using a new engine feature. A separate file makes the intent clear.
**Why ldflags over hardcoding in Go source:** Consistent with how `Version` is already injected. The value lives in a plain text file that's easy to bump manually.
### 4. Compatibility check on the first API call via the `Client` struct
The `api.Client` checks engine compatibility on its first HTTP call by hitting `GET /api/v1/status` and comparing the `version` field against `MinEngineVersion`. The result is cached on the `Client` instance — subsequent calls skip the check.
**Flow:**
1. First call to any `Client` method (Get/Post/Delete/Put)
2. Before the actual request, call `GET /api/v1/status`
3. Parse `version` from response
4. Compare against `MinEngineVersion` using semver major.minor.patch comparison
5. If engine version < min: print error to stderr, `os.Exit(1)`
6. If check passes: set `versionChecked = true`, proceed with original request
7. If status endpoint unreachable: proceed with original request (connectivity error will surface on the actual call)
**Why hard fail, no skip flag:** This is a personal tool. If the client needs a newer engine, the user needs to update. A skip flag adds complexity for a scenario where the outcome (broken behavior) is worse than the error.
**Why check on first API call, not at startup:** The `PersistentPreRunE` in cobra runs before every command, but some future commands might not need the engine (e.g. `kb version`, `kb help`). Checking in the `Client` ensures we only check when actually contacting the engine.
**Why proceed when status endpoint is unreachable:** If we can't reach `/status`, the actual API call will also fail with a connection error. No point in double-failing. The compatibility check is for version mismatch, not connectivity.
### 5. Compose files: use `build:` context, not pinned image tags
The compose files currently use `build:` directives, not pre-built image references. Users who build locally don't need pinned tags — they're building from source. Users pulling pre-built images will reference the image tag directly in their own compose file or `docker run` command.
**Decision:** Leave compose files as-is. Release notes for engine releases will include the exact `docker pull` command with the versioned tag.
### 6. Semver comparison: major.minor.patch, no pre-release
Compare versions as three integers. No support for pre-release suffixes (`-rc1`, `-beta`) — the project doesn't use them. If `MinEngineVersion` is `2.1.0` and engine reports `2.1.5`, the check passes. If engine reports `2.0.9`, it fails.
## Risks / Trade-offs
- **Extra HTTP round-trip on first command** — One additional `GET /api/v1/status` call per client invocation. Negligible for a local-network tool.
→ Mitigation: Cached after first check within the Client instance.
- **Developer must remember to bump `MIN_ENGINE_VERSION`** — When adding client code that depends on a new engine endpoint/field, the developer must manually update the file.
→ Mitigation: This is a conscious decision point. The file's existence serves as a reminder. Could add a CI check later if needed.
- **Breaking change to git tag format** — Existing `v2.0.x` tags won't match the new `client-v*` / `engine-v*` convention. Old tags remain in history.
→ Mitigation: No migration needed. Old tags stay as historical artifacts. New convention starts from the first independent release.
- **Two Gitea releases per coordinated release** — When both components change, two releases are created instead of one.
→ Mitigation: Acceptable trade-off. Each release is self-contained with its own assets and notes.
@@ -0,0 +1,32 @@
## Why
Client and engine are currently locked to the same version number and released together via a single script. This means a client-only bug fix (e.g. output formatting) forces a full engine Docker image rebuild and push, and vice versa. Decoupling versions allows each component to be released independently on its own cadence, while a compatibility check ensures users don't run a client that requires engine features not yet deployed.
## What Changes
- **Separate version files** — `client/VERSION` and `engine/VERSION` may diverge (they already exist as separate files, but are currently always set to the same value)
- **Split release script** — Replace single `release.sh` with `release-client.sh` (builds Go binaries, tags `client-vX.Y.Z`, creates release) and `release-engine.sh` (builds Docker images, tags `engine-vX.Y.Z`, creates release, pushes images)
- **Client compatibility check** — Client embeds a `MinEngineVersion` constant (set at build time or in code). On every command that contacts the engine, the client calls `GET /api/v1/status`, compares the engine's reported version against `MinEngineVersion`, and hard-fails with an actionable error if the engine is too old. No skip flag, no warning — just a clear error with upgrade instructions.
- **Tag naming convention** — `client-vX.Y.Z` and `engine-vX.Y.Z` replace the current `vX.Y.Z` tag format. **BREAKING** — existing tag format changes.
## Capabilities
### New Capabilities
(none)
### Modified Capabilities
- `go-client`: Add engine version compatibility check requirement (hard fail if engine version < MinEngineVersion)
- `engine-api`: Status endpoint already returns `version` — no change needed, but delta spec documents the contract that the version field is required for compatibility checking
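- `docker-deployment`: Compose files keep their `build:` context (no pinned engine image tag, per design decision 5); release notes document the versioned image tag; release script changes affect image tagging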
## Impact
- `release.sh` — replaced by `release-client.sh` + `release-engine.sh`
- `client/cmd/root.go` — new `MinEngineVersion` constant
- `client/internal/api/client.go` — version check on first API call
- `client/Makefile` — may inject `MinEngineVersion` via ldflags alongside `Version`
- Git tags — new naming convention (`client-v*`, `engine-v*`)
- Gitea releases — two separate releases per independent release cycle
- `engine/compose.nvidia.yaml`, `engine/compose.rocm.yaml` — left as-is (keep `build:` context; the pinned, versioned image tag is documented in engine release notes instead)
@@ -0,0 +1,25 @@
## MODIFIED Requirements
### Requirement: Compose files for deployment
The project SHALL provide Docker Compose files for single-command deployment. Compose files SHALL use `build:` context for local development. Release notes SHALL document the versioned image tag for users pulling pre-built images.
#### Scenario: Start NVIDIA deployment
- **WHEN** an admin runs `docker compose -f compose.nvidia.yaml up -d`
- **THEN** the engine SHALL start with GPU access, bind-mount the data directory, and be reachable on the configured port
#### Scenario: Start ROCm deployment
- **WHEN** an admin runs `docker compose -f compose.rocm.yaml up -d`
- **THEN** the engine SHALL start with GPU access via ROCm device passthrough, bind-mount the data directory, and be reachable on the configured port
#### Scenario: Automatic restart
- **WHEN** the engine process crashes or the host reboots
- **THEN** Docker SHALL automatically restart the container (restart policy `unless-stopped`)
#### Scenario: Configure via environment
- **WHEN** an admin sets environment variables in the compose file (KB_MODEL, KB_API_KEY, KB_DEVICE, etc.)
- **THEN** the engine SHALL use those values
#### Scenario: Pre-built image deployment
- **WHEN** an admin wants to use a pre-built engine image without building from source
- **THEN** the engine release notes SHALL include the exact `docker pull` command with the versioned tag (e.g. `docker.dcglab.co.uk/dcg/kb/engine:engine-v2.1.0-nvidia`)
@@ -0,0 +1,13 @@
## MODIFIED Requirements
### Requirement: Engine status and reindex
The engine SHALL provide status information and support re-embedding all chunks. The `version` field in the status response SHALL always be present and SHALL reflect the engine's release version as read from the `VERSION` file. This field is the contract used by clients for compatibility checking.
#### Scenario: Get engine status
- **WHEN** a client sends `GET /api/v1/status`
- **THEN** the engine SHALL return JSON with `version` (string, from VERSION file), model_name, embedding_dim, GPU device info, database stats (document count by type, total chunks, DB size), and queue stats (queued/processing job count)
#### Scenario: Trigger reindex
- **WHEN** a client sends `POST /api/v1/reindex`
- **THEN** the engine SHALL re-embed all existing chunks using the currently loaded model and return progress information. This operation SHALL NOT block search queries.
@@ -0,0 +1,45 @@
## ADDED Requirements
### Requirement: Engine version compatibility check
The client SHALL verify that the connected engine meets a minimum version requirement before executing any API command. The minimum required engine version SHALL be embedded in the client binary at build time. If the engine version is below the minimum, the client SHALL print an error message and exit with a non-zero code. There SHALL be no flag to skip or suppress this check.
#### Scenario: Compatible engine version
- **WHEN** the client connects to an engine reporting version `2.1.5` and `MinEngineVersion` is `2.1.0`
- **THEN** the client SHALL proceed with the command normally
#### Scenario: Incompatible engine version
- **WHEN** the client connects to an engine reporting version `2.0.3` and `MinEngineVersion` is `2.1.0`
- **THEN** the client SHALL print to stderr: `Error: kb client vX.Y.Z requires engine v2.1.0+ (connected engine is v2.0.3)` followed by an upgrade hint, and exit with code 1
#### Scenario: Engine unreachable during version check
- **WHEN** the client cannot reach the engine's `/api/v1/status` endpoint
- **THEN** the client SHALL skip the version check and proceed with the original command (the actual API call will surface the connectivity error)
#### Scenario: Version check is cached per session
- **WHEN** the client has already verified engine compatibility during the current invocation
- **THEN** subsequent API calls within the same invocation SHALL NOT repeat the version check
#### Scenario: Client version command does not check engine
- **WHEN** the user runs `kb --version`
- **THEN** the client SHALL print the client version without contacting the engine
#### Scenario: MinEngineVersion not set
- **WHEN** the client binary has `MinEngineVersion` set to empty string or `dev`
- **THEN** the client SHALL skip the version check entirely (development builds)
---
## MODIFIED Requirements
### Requirement: Single static binary with zero runtime dependencies
The Go client SHALL compile to a single static binary with no runtime dependencies. It SHALL support cross-compilation for Linux (amd64, arm64), macOS (amd64, arm64), and Windows (amd64). The build SHALL inject both `Version` and `MinEngineVersion` via ldflags.
#### Scenario: Install on a clean machine
- **WHEN** a user downloads the `kb` binary for their platform
- **THEN** they SHALL be able to run it immediately with no additional installs (no Python, no Docker, no shared libraries)
#### Scenario: Version and compatibility info embedded at build time
- **WHEN** the client is built with `make all VERSION=2.1.0 MIN_ENGINE_VERSION=2.0.0`
- **THEN** `kb --version` SHALL report `2.1.0` and the compatibility check SHALL use `2.0.0` as the minimum engine version
@@ -0,0 +1,35 @@
## 1. Client Compatibility Check
- [x] 1.1 Create `client/MIN_ENGINE_VERSION` file with initial value `2.0.0`
- [x] 1.2 Add `MinEngineVersion` variable to `client/cmd/root.go` (set via ldflags, default `dev`)
- [x] 1.3 Update `client/Makefile` to read `MIN_ENGINE_VERSION` file and inject via `-ldflags "-X cmd.MinEngineVersion=..."` alongside existing `Version`
- [x] 1.4 Add `CheckEngineVersion(minVersion string)` method to `client/internal/api/client.go` that calls `GET /api/v1/status`, parses `version` field, and compares against `minVersion` using semver major.minor.patch
- [x] 1.5 Add `versionChecked bool` field to `Client` struct; guard `CheckEngineVersion` so it runs at most once per Client instance
- [x] 1.6 Call `CheckEngineVersion` at the start of `Client.do()` (before executing the actual request); skip if `MinEngineVersion` is empty or `dev`
- [x] 1.7 On version mismatch: print `Error: kb client vX.Y.Z requires engine vM.N.P+ (connected engine is vA.B.C)\nUpdate your engine image to engine-vM.N.P or later.` to stderr and `os.Exit(1)`
- [x] 1.8 On status endpoint unreachable: skip version check silently (let the actual request surface the error)
## 2. Release Script — Client
- [x] 2.1 Create `release-client.sh` extracting client-specific logic from `release.sh`: version bump of `client/VERSION`, Go binary build, git tag `client-vX.Y.Z`, Gitea release with binary assets
- [x] 2.2 Release notes template: include `MinEngineVersion` requirement (e.g. "Requires engine v2.0.0+")
- [x] 2.3 Pass `MIN_ENGINE_VERSION` to `make all` in the build step
## 3. Release Script — Engine
- [x] 3.1 Create `release-engine.sh` extracting engine-specific logic from `release.sh`: version bump of `engine/VERSION`, Docker image build (nvidia + rocm), git tag `engine-vX.Y.Z`, Gitea release, image push
- [x] 3.2 Release notes template: include Docker pull commands with `engine-vX.Y.Z` prefixed tags
## 4. Cleanup
- [x] 4.1 Remove old `release.sh` (replaced by the two new scripts)
- [x] 4.2 Update Docker image tag format in release scripts from `vX.Y.Z-nvidia` to `engine-vX.Y.Z-nvidia` (and same for rocm/latest)
## 5. Testing
- [x] 5.1 Test client version check passes when engine version >= MinEngineVersion
- [x] 5.2 Test client version check fails with correct error message when engine version < MinEngineVersion
- [x] 5.3 Test client skips version check when MinEngineVersion is empty or `dev`
- [x] 5.4 Test client skips version check when engine is unreachable
- [x] 5.5 Dry-run `release-client.sh --dry-run --gitea` and verify correct tag format and build
- [x] 5.6 Dry-run `release-engine.sh --dry-run --gitea` and verify correct tag format and image names
+89
View File
@@ -0,0 +1,89 @@
# Document Storage
## Purpose
Persistent storage, retrieval, and lifecycle management of original uploaded document files.
## Requirements
### Requirement: Persistent original file storage
The engine SHALL persistently store the original uploaded file on disk after successful ingestion. Files SHALL be stored at `{data_dir}/documents/{content_hash}{extension}` where `content_hash` is the SHA-256 hex digest already computed for dedup and `extension` is preserved from the original filename. The `documents` table SHALL record the stored file path in a `stored_path` column and the original upload filename in an `original_filename` column.
#### Scenario: File stored after successful ingestion
- **WHEN** the background worker successfully processes an ingestion job for a PDF file
- **THEN** the worker SHALL move the staged file to `{data_dir}/documents/{content_hash}.pdf`, store the permanent path in `documents.stored_path`, store the original filename in `documents.original_filename`, and delete the staging entry
#### Scenario: Note stored after successful ingestion
- **WHEN** the background worker successfully processes an ingestion job for a text note
- **THEN** the worker SHALL move the staged `.note` file to `{data_dir}/documents/{content_hash}.note` and store the permanent path in `documents.stored_path`
#### Scenario: Markdown file stored after successful ingestion
- **WHEN** the background worker successfully processes an ingestion job for a markdown file
- **THEN** the worker SHALL move the staged file to `{data_dir}/documents/{content_hash}.md` and store the permanent path in `documents.stored_path`
#### Scenario: Code file stored after successful ingestion
- **WHEN** the background worker successfully processes an ingestion job for a code file (e.g. `.py`, `.go`)
- **THEN** the worker SHALL move the staged file to `{data_dir}/documents/{content_hash}{original_extension}` and store the permanent path in `documents.stored_path`
#### Scenario: Documents directory created at startup
- **WHEN** the engine starts up and calls `ensure_dirs()`
- **THEN** the `{data_dir}/documents/` directory SHALL be created if it does not exist
#### Scenario: Ingestion failure does not store file
- **WHEN** the background worker fails to process an ingestion job
- **THEN** the staged file SHALL be cleaned up as before and no file SHALL be written to the documents directory
---
### Requirement: File retrieval via API
The engine SHALL serve the original stored file for any document that has a stored file on disk.
#### Scenario: Download original file
- **WHEN** a client sends `GET /api/v1/documents/{id}/file` for a document with a stored file
- **THEN** the engine SHALL return the file with appropriate `Content-Type` based on file extension and `Content-Disposition: attachment; filename="{original_filename}"` header, falling back to `{title}{ext}` if `original_filename` is NULL
#### Scenario: Download file for pre-existing document
- **WHEN** a client sends `GET /api/v1/documents/{id}/file` for a document ingested before this feature was added (stored_path is NULL)
- **THEN** the engine SHALL return HTTP 404 with `{"error": "Original file not available - ingested before document storage was enabled"}`
#### Scenario: Download file when file missing from disk
- **WHEN** a client sends `GET /api/v1/documents/{id}/file` for a document whose `stored_path` is set but the file no longer exists on disk
- **THEN** the engine SHALL return HTTP 404 with `{"error": "Stored file not found on disk"}`
#### Scenario: Download file for non-existent document
- **WHEN** a client sends `GET /api/v1/documents/{id}/file` with a non-existent document ID
- **THEN** the engine SHALL return HTTP 404 with `{"error": "Document not found"}`
---
### Requirement: File cleanup on document deletion
The engine SHALL remove the stored original file from disk when a document is deleted.
#### Scenario: Delete document with stored file
- **WHEN** a client sends `DELETE /api/v1/documents/{id}` for a document with a stored file
- **THEN** the engine SHALL delete the document from the database (cascading to chunks, embeddings, tags) AND delete the stored file from disk
#### Scenario: Delete document when stored file already missing
- **WHEN** a client sends `DELETE /api/v1/documents/{id}` for a document whose stored file has been manually removed from disk
- **THEN** the engine SHALL delete the document from the database successfully and log a warning about the missing file
#### Scenario: Delete document without stored file (pre-existing)
- **WHEN** a client sends `DELETE /api/v1/documents/{id}` for a document with `stored_path` NULL
- **THEN** the engine SHALL delete the document from the database without attempting file removal
---
### Requirement: Database schema migration for stored_path and original_filename
The engine SHALL add `stored_path` and `original_filename` columns to the `documents` table for tracking permanent file locations and original upload filenames.
#### Scenario: Fresh database initialization
- **WHEN** the engine initializes a new database
- **THEN** the `documents` table SHALL include `stored_path TEXT` and `original_filename TEXT` columns in its schema
#### Scenario: Existing database migration
- **WHEN** the engine starts with a database created before this feature
- **THEN** the engine SHALL add `stored_path TEXT` and `original_filename TEXT` to the `documents` table via `ALTER TABLE` if the columns do not exist
+49 -13
View File
@@ -26,7 +26,7 @@ The engine SHALL load the embedding model eagerly at startup before accepting HT
### Requirement: Hybrid search
The engine SHALL provide hybrid search combining BM25 full-text search (via FTS5) and vector similarity search (via sqlite-vec), merged using Reciprocal Rank Fusion. Search SHALL complete in under 100ms when the model is warm.
The engine SHALL provide hybrid search combining BM25 full-text search (via FTS5) and vector similarity search (via sqlite-vec), merged using Reciprocal Rank Fusion. Search SHALL complete in under 100ms when the model is warm. The engine SHALL sanitize user query strings to prevent FTS5 syntax errors for any input.
#### Scenario: Hybrid search with results
- **WHEN** a client sends `POST /api/v1/search` with body `{"query": "how to change oil", "top": 5}`
@@ -44,27 +44,59 @@ The engine SHALL provide hybrid search combining BM25 full-text search (via FTS5
- **WHEN** a client searches against an empty database
- **THEN** the engine SHALL return HTTP 200 with `{"query": "...", "results": [], "total_matches": 0}`
#### Scenario: Search with special characters
- **WHEN** a client sends `POST /api/v1/search` with body `{"query": "what color is grass?"}`
- **THEN** the engine SHALL sanitize the query for FTS5, execute the search successfully, and return results (not a 500 error)
#### Scenario: Search with FTS5 operators in query
- **WHEN** a client sends `POST /api/v1/search` with body `{"query": "NOT something OR (other)"}`
- **THEN** the engine SHALL treat the input as literal search terms, not FTS5 operators, and return matching results
#### Scenario: Search with only special characters
- **WHEN** a client sends `POST /api/v1/search` with body `{"query": "??!@#"}`
- **THEN** the engine SHALL return HTTP 200 with an empty result set (not a 500 error)
#### Scenario: Search with quotes in query
- **WHEN** a client sends `POST /api/v1/search` with body `{"query": "the \"quick\" fox"}`
- **THEN** the engine SHALL sanitize embedded quotes and return results normally
---
### Requirement: Async ingestion via job queue
The engine SHALL accept file uploads and text notes for ingestion asynchronously. Uploaded content SHALL be written to a staging area and a job record created in the database. The engine SHALL return HTTP 202 immediately. A background worker SHALL process queued jobs sequentially.
The engine SHALL accept file uploads and text notes for ingestion asynchronously. Uploaded content SHALL be written to a staging area and a job record created in the database. The engine SHALL return HTTP 202 immediately. A background worker SHALL process queued jobs sequentially. Before staging, the engine SHALL compute a SHA256 hash of the uploaded content and reject duplicates immediately.
#### Scenario: Upload a PDF file
- **WHEN** a client sends `POST /api/v1/jobs` with a multipart form containing a PDF file and optional fields (tags, doc_type)
- **THEN** the engine SHALL write the file to the staging directory, create a job record with status `queued`, and return HTTP 202 with `{"job_id": "<id>", "status": "queued", "filename": "report.pdf"}`
- **THEN** the engine SHALL compute the SHA256 hash of the file bytes, verify no existing document has the same hash, write the file to the staging directory, create a job record with status `queued`, and return HTTP 202 with `{"job_id": "<id>", "status": "queued", "filename": "report.pdf"}`
#### Scenario: Upload a text note
- **WHEN** a client sends `POST /api/v1/jobs` with a multipart form containing a `note` text field and optional `title` field
- **THEN** the engine SHALL write the note content to a staging file, create a job record with status `queued`, and return HTTP 202 with the job ID
- **THEN** the engine SHALL compute the SHA256 hash of the note text (UTF-8 encoded), verify no existing document has the same hash, write the note content to a staging file, create a job record with status `queued`, and return HTTP 202 with the job ID
#### Scenario: Upload multiple files in sequence
- **WHEN** a client sends multiple `POST /api/v1/jobs` requests in quick succession
- **THEN** the engine SHALL queue each job independently and the background worker SHALL process them in FIFO order
#### Scenario: Duplicate content detection
- **WHEN** a client uploads a file whose content hash matches an already-ingested document
- **THEN** the engine SHALL return HTTP 202 but the background worker SHALL mark the job as `skipped` with reason `duplicate`
#### Scenario: Duplicate file detected at upload time (already ingested)
- **WHEN** a client uploads a file whose SHA256 content hash matches an already-ingested document
- **THEN** the engine SHALL NOT stage the file or create a job record, and SHALL return HTTP 409 with `{"error": "duplicate", "document_id": <id>, "title": "<title>"}`
#### Scenario: Duplicate file detected at upload time (in-flight job)
- **WHEN** a client uploads a file whose SHA256 content hash matches a queued or processing job
- **THEN** the engine SHALL NOT stage the file or create a job record, and SHALL return HTTP 409 with `{"error": "duplicate", "job_id": <id>, "title": "<filename>"}`
#### Scenario: Duplicate note detected at upload time (already ingested)
- **WHEN** a client submits a note whose SHA256 content hash matches an already-ingested document
- **THEN** the engine SHALL NOT stage the note or create a job record, and SHALL return HTTP 409 with `{"error": "duplicate", "document_id": <id>, "title": "<title>"}`
#### Scenario: Duplicate note detected at upload time (in-flight job)
- **WHEN** a client submits a note whose SHA256 content hash matches a queued or processing job
- **THEN** the engine SHALL NOT stage the note or create a job record, and SHALL return HTTP 409 with `{"error": "duplicate", "job_id": <id>, "title": "<filename>"}`
#### Scenario: Duplicate uploaded during concurrent request handling
- **WHEN** two identical files are uploaded in the same instant, both passing the API hash check before either job is committed
- **THEN** both jobs SHALL be queued, and the background worker SHALL process the first normally and mark the second as `skipped` (worker-side safety net via `hash_exists()` and UNIQUE constraint)
#### Scenario: Upload failure due to unsupported file type
- **WHEN** a client uploads a file with an unsupported extension
@@ -96,11 +128,11 @@ The engine SHALL maintain job records in SQLite with status tracking. Jobs SHALL
### Requirement: Background ingestion worker
The engine SHALL run a background worker that processes queued jobs. The worker SHALL process one job at a time. For each job, it SHALL: detect document type, run the appropriate chunking pipeline (Docling for PDFs, header-based for Markdown, AST-based for code, whole-text for notes), generate embeddings using the resident model, and insert chunks and vectors into the database.
The engine SHALL run a background worker that processes queued jobs. The worker SHALL process one job at a time. For each job, it SHALL: detect document type, run the appropriate chunking pipeline (Docling for PDFs, header-based for Markdown, AST-based for code, whole-text for notes), generate embeddings using the resident model, insert chunks and vectors into the database, and move the original file to persistent storage.
#### Scenario: Successful PDF ingestion
- **WHEN** the background worker picks up a queued PDF job
- **THEN** it SHALL update the job status to `processing`, run Docling conversion and chunking, embed all chunks, insert document and chunks into the database, update the job status to `done` with the resulting document_id and chunk count, and delete the staged file
- **THEN** it SHALL update the job status to `processing`, run Docling conversion and chunking, embed all chunks, insert document and chunks into the database, move the staged file to `{data_dir}/documents/{content_hash}.pdf`, update `documents.stored_path` with the permanent path, store the original filename in `documents.original_filename`, update the job status to `done` with the resulting document_id and chunk count, and clean up the staging entry
#### Scenario: Ingestion failure
- **WHEN** the background worker encounters an error during processing (e.g., corrupt PDF)
@@ -114,7 +146,7 @@ The engine SHALL run a background worker that processes queued jobs. The worker
### Requirement: Document management
The engine SHALL provide endpoints to list, inspect, and remove ingested documents.
The engine SHALL provide endpoints to list, inspect, remove, and download original files for ingested documents.
#### Scenario: List documents
- **WHEN** a client sends `GET /api/v1/documents`
@@ -126,11 +158,15 @@ The engine SHALL provide endpoints to list, inspect, and remove ingested documen
#### Scenario: Get document details
- **WHEN** a client sends `GET /api/v1/documents/{id}`
- **THEN** the engine SHALL return the full document record including all chunks and their text content
- **THEN** the engine SHALL return the full document record including all chunks, their text content, and whether the original file is available (`has_file: true/false`)
#### Scenario: Download original file
- **WHEN** a client sends `GET /api/v1/documents/{id}/file`
- **THEN** the engine SHALL return the original file with appropriate Content-Type and `Content-Disposition: attachment; filename="{original_filename}"` headers, or HTTP 404 if the file is not available
#### Scenario: Remove a document
- **WHEN** a client sends `DELETE /api/v1/documents/{id}`
- **THEN** the engine SHALL delete the document, all its chunks, associated embeddings, and tag associations, and return HTTP 200 with a confirmation
- **THEN** the engine SHALL delete the document, all its chunks, associated embeddings, tag associations, and the stored original file from disk, and return HTTP 200 with a confirmation
#### Scenario: Remove non-existent document
- **WHEN** a client sends `DELETE /api/v1/documents/{id}` with a non-existent ID
@@ -198,7 +234,7 @@ The engine SHALL be configured via environment variables. No config file is read
#### Scenario: Default configuration
- **WHEN** the engine starts with no environment variables set
- **THEN** it SHALL use defaults: data directory `/data`, model `all-MiniLM-L6-v2`, device `auto`, no API key required
- **THEN** it SHALL use defaults: data directory `/data`, model `all-MiniLM-L6-v2`, device `auto`, no API key required. It SHALL create `staging/` and `documents/` subdirectories under the data directory.
#### Scenario: Custom model
- **WHEN** `KB_MODEL` is set to `BAAI/bge-small-en-v1.5`
+18 -2
View File
@@ -66,7 +66,7 @@ The client SHALL provide a `kb search <query>` command that sends the query to t
### Requirement: Add command (file and note ingestion)
The client SHALL provide a `kb add` command that uploads files or notes to the engine for async ingestion. The client SHALL exit immediately after a successful upload.
The client SHALL provide a `kb add` command that uploads files or notes to the engine for async ingestion. The client SHALL exit immediately after a successful upload. The client SHALL handle duplicate rejection (HTTP 409) and display the existing document information.
#### Scenario: Add a single file
- **WHEN** the user runs `kb add report.pdf`
@@ -84,6 +84,22 @@ The client SHALL provide a `kb add` command that uploads files or notes to the e
- **WHEN** the user runs `kb add --note "The server room is in building 3, floor 2"`
- **THEN** the client SHALL submit the note text via `POST /api/v1/jobs` (multipart with note field), print "Queued: note", and exit
#### Scenario: Duplicate file rejected (already ingested)
- **WHEN** the user runs `kb add report.pdf` and the engine returns HTTP 409 with `{"error": "duplicate", "document_id": 42, "title": "report.pdf"}`
- **THEN** the client SHALL print "Already imported: report.pdf (doc ID: 42)" and exit with code 0
#### Scenario: Duplicate file rejected (in-flight job)
- **WHEN** the user runs `kb add report.pdf` and the engine returns HTTP 409 with `{"error": "duplicate", "job_id": 7, "title": "report.pdf"}`
- **THEN** the client SHALL print "Already queued: report.pdf (job ID: 7)" and exit with code 0
#### Scenario: Duplicate file in recursive add
- **WHEN** the user runs `kb add ~/documents/ --recursive` and some files are rejected as duplicates
- **THEN** the client SHALL print the duplicate message for each rejected file (distinguishing "Already imported" from "Already queued"), continue uploading remaining files, and include a summary (e.g., "Queued: 5 files, 2 duplicates skipped")
#### Scenario: Duplicate with JSON output
- **WHEN** the user runs `kb add report.pdf --format json` and the engine returns HTTP 409
- **THEN** the client SHALL output the raw JSON response from the engine, including the title and either the `document_id` (already-ingested duplicate) or the `job_id` (in-flight duplicate)
#### Scenario: Add with JSON output
- **WHEN** the user runs `kb add report.pdf --format json`
- **THEN** the client SHALL output the JSON response from the engine including the job_id
@@ -93,7 +109,7 @@ The client SHALL provide a `kb add` command that uploads files or notes to the e
- **THEN** the client SHALL print an error and exit with a non-zero code without making any API call
#### Scenario: Upload failure
- **WHEN** the upload fails (network error, engine returns 4xx/5xx)
- **WHEN** the upload fails (network error, engine returns 4xx/5xx other than 409)
- **THEN** the client SHALL print the error and exit with a non-zero code
---
+218
View File
@@ -0,0 +1,218 @@
#!/usr/bin/env bash
#
# release-client.sh — Build, tag, and release the Go client
#
# Usage:
# ./release-client.sh --gitea|--github [--dry-run] [--no-increment] [--patch|--minor|--major]
set -euo pipefail
#──────────────────────────────────────────────────────────────────────
# Config
#──────────────────────────────────────────────────────────────────────
# All paths are resolved relative to the directory containing this script,
# so the script can be invoked from any working directory.
SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)"
CLIENT_DIR="$SCRIPT_DIR/client"
# File holding the client's own version.
VERSION_FILE="$CLIENT_DIR/VERSION"
# File holding the minimum engine version; it is read and passed to the build
# but never modified by this script.
MIN_ENGINE_FILE="$CLIENT_DIR/MIN_ENGINE_VERSION"
#──────────────────────────────────────────────────────────────────────
# Parse args
#──────────────────────────────────────────────────────────────────────
# Option defaults; the flags parsed below override them.
DRY_RUN=false
INCREMENT=true
BUMP="patch"
FORGE=""

for arg in "$@"; do
    case "$arg" in
        --gitea)        FORGE="tea" ;;
        --github)       FORGE="gh" ;;
        --dry-run)      DRY_RUN=true ;;
        --no-increment) INCREMENT=false ;;
        --patch | --minor | --major)
            # The flag name without its leading dashes is the bump level.
            BUMP="${arg#--}"
            ;;
        *)
            echo "Unknown argument: $arg"
            echo "Usage: $0 --gitea|--github [--dry-run] [--no-increment] [--patch|--minor|--major]"
            exit 1
            ;;
    esac
done

# A forge must be chosen explicitly; there is no default.
[[ -n "$FORGE" ]] || {
    echo "Error: specify --gitea or --github"
    echo "Usage: $0 --gitea|--github [--dry-run] [--no-increment] [--patch|--minor|--major]"
    exit 1
}
# Refuse to release from anything other than the main branch.
CURRENT_BRANCH="$(git -C "$SCRIPT_DIR" rev-parse --abbrev-ref HEAD)"
case "$CURRENT_BRANCH" in
    main) ;;
    *)
        echo "Error: releases must be made from the main branch (currently on '$CURRENT_BRANCH')"
        exit 1
        ;;
esac

# The selected forge CLI (tea or gh) has to be installed.
command -v "$FORGE" &>/dev/null || {
    echo "Error: '$FORGE' not found in PATH"
    exit 1
}
#──────────────────────────────────────────────────────────────────────
# Version helpers
#──────────────────────────────────────────────────────────────────────
# read_version FILE
# Print the contents of FILE with every whitespace character removed.
# If FILE is not a regular file, report the problem on stderr and exit 1.
read_version() {
    local path="$1"
    [[ -f "$path" ]] || {
        echo "Error: version file not found: $path" >&2
        exit 1
    }
    tr -d '[:space:]' < "$path"
}
# bump_version VERSION PART
# Print VERSION with the requested component incremented.
#
#   VERSION  three dot-separated non-negative integers (MAJOR.MINOR.PATCH)
#   PART     one of: major, minor, patch
#
# Components below the bumped one are reset to 0. If VERSION is malformed
# (empty, missing a component, non-numeric) or PART is unknown, an error is
# written to stderr and the function returns 1, so a caller running under
# `set -e` aborts instead of tagging a bogus release.
bump_version() {
    local ver="$1" part="$2"
    local major minor patch

    if [[ ! "$ver" =~ ^[0-9]+\.[0-9]+\.[0-9]+$ ]]; then
        echo "Error: invalid version '$ver' (expected MAJOR.MINOR.PATCH)" >&2
        return 1
    fi
    IFS='.' read -r major minor patch <<< "$ver"

    # 10# forces base 10 so a component such as "08" is not rejected as
    # invalid octal by shell arithmetic.
    major=$((10#$major))
    minor=$((10#$minor))
    patch=$((10#$patch))

    case "$part" in
        major) echo "$((major + 1)).0.0" ;;
        minor) echo "${major}.$((minor + 1)).0" ;;
        patch) echo "${major}.${minor}.$((patch + 1))" ;;
        *)
            echo "Error: unknown version part '$part'" >&2
            return 1
            ;;
    esac
}
# write_version FILE VERSION
# Overwrite FILE so that it contains VERSION followed by a newline.
write_version() {
    local target="$1"
    local value="$2"
    echo "$value" > "$target"
}
# run COMMAND [ARGS...]
# Echo the command line, then execute it unless DRY_RUN is anything other
# than "false". Returns the command's exit status, or 0 when nothing is run.
run() {
    echo " $ $*"
    [[ "$DRY_RUN" != false ]] && return 0
    "$@"
}
#──────────────────────────────────────────────────────────────────────
# Determine release version
#──────────────────────────────────────────────────────────────────────
# Read the version currently on disk and the minimum engine version.
CURRENT_VERSION="$(read_version "$VERSION_FILE")"
MIN_ENGINE_VERSION="$(read_version "$MIN_ENGINE_FILE")"

# The release version is either the bumped version or, with --no-increment,
# the version file's content unchanged.
if [[ "$INCREMENT" == true ]]; then
    VERSION="$(bump_version "$CURRENT_VERSION" "$BUMP")"
    # Separate old and new versions so they do not run together in the log.
    echo "==> Client version bump: $CURRENT_VERSION → $VERSION ($BUMP)"
else
    VERSION="$CURRENT_VERSION"
    echo "==> Client version: $VERSION (no increment)"
fi

# Client tags carry their own prefix.
TAG="client-v${VERSION}"
echo " Tag: $TAG"
echo " Min engine: v$MIN_ENGINE_VERSION"
echo " Forge CLI: $FORGE"
echo " Dry run: $DRY_RUN"
echo ""
#──────────────────────────────────────────────────────────────────────
# 1. Pre-flight checks
#──────────────────────────────────────────────────────────────────────
echo "==> Pre-flight checks"
if [[ "$DRY_RUN" == false ]]; then
    # Look the name up under refs/tags/ with --verify so that only an actual
    # tag counts; a bare `rev-parse "$TAG"` also succeeds for a branch or a
    # path that happens to share the name.
    if git -C "$SCRIPT_DIR" rev-parse --verify --quiet "refs/tags/$TAG" >/dev/null; then
        echo "Error: tag $TAG already exists"
        exit 1
    fi
fi
echo " OK"
echo ""
#──────────────────────────────────────────────────────────────────────
# 2. Update version file
#──────────────────────────────────────────────────────────────────────
# Persist the bumped version; with --no-increment the file is left untouched.
case "$INCREMENT" in
    true)
        echo "==> Updating client version to $VERSION"
        run write_version "$VERSION_FILE" "$VERSION"
        echo ""
        ;;
esac
#──────────────────────────────────────────────────────────────────────
# 3. Build Go client binaries
#──────────────────────────────────────────────────────────────────────
echo "==> Building Go client binaries ($VERSION, min engine $MIN_ENGINE_VERSION)"
run make -C "$CLIENT_DIR" clean
run make -C "$CLIENT_DIR" all VERSION="$VERSION" MIN_ENGINE_VERSION="$MIN_ENGINE_VERSION"

# Collect release assets
ASSETS=()
if [[ "$DRY_RUN" == false ]]; then
    for bin in "$CLIENT_DIR"/dist/kb-*; do
        # When nothing matches, the glob stays as the literal pattern;
        # skip anything that is not an actual file.
        [[ -f "$bin" ]] || continue
        ASSETS+=("$bin")
    done
    # Stop here, before anything is committed, tagged or pushed, rather than
    # publishing a client release with no binaries.
    if [[ ${#ASSETS[@]} -eq 0 ]]; then
        echo "Error: no client binaries found in $CLIENT_DIR/dist" >&2
        exit 1
    fi
    echo " Built ${#ASSETS[@]} binaries"
else
    echo " (skipped — dry run)"
fi
echo ""
#──────────────────────────────────────────────────────────────────────
# 4. Commit, tag, and push
#──────────────────────────────────────────────────────────────────────
# Record the release in git. A commit is only made for a version bump; with
# --no-increment the tag is created on whatever HEAD already is.
# Every git command goes through `run`, so in dry-run mode it is only printed.
echo "==> Committing and tagging $TAG"
if [[ "$INCREMENT" == true ]]; then
run git -C "$SCRIPT_DIR" add "$VERSION_FILE"
run git -C "$SCRIPT_DIR" commit -m "Bump client version to $VERSION"
fi
# Create an annotated tag, then push the current branch followed by the tag.
run git -C "$SCRIPT_DIR" tag -a "$TAG" -m "Release $TAG"
run git -C "$SCRIPT_DIR" push origin HEAD
run git -C "$SCRIPT_DIR" push origin "$TAG"
echo ""
#──────────────────────────────────────────────────────────────────────
# 5. Create release with assets
#──────────────────────────────────────────────────────────────────────
echo "==> Creating release via $FORGE"
# Title and Markdown body passed to the forge CLI when creating the release.
# The body states the minimum engine version this client was built against.
RELEASE_TITLE="Client $TAG"
RELEASE_NOTES="## Go client v${VERSION}
Requires engine v${MIN_ENGINE_VERSION}+
## Client binaries
Download the binary for your platform from the assets below, rename to \`kb\`, and place on your PATH."
# Publish the release through whichever forge CLI was selected.
case "$FORGE" in
    gh)
        # gh receives the asset paths as trailing arguments of the create
        # call. The ${ASSETS[@]+...} form expands to nothing when the array
        # is empty (as it is in a dry run).
        run gh release create "$TAG" \
            --title "$RELEASE_TITLE" \
            --notes "$RELEASE_NOTES" \
            "${ASSETS[@]+"${ASSETS[@]}"}"
        ;;
    tea)
        run tea release create \
            --tag "$TAG" \
            --title "$RELEASE_TITLE" \
            --note "$RELEASE_NOTES"
        # Each asset is uploaded with its own tea invocation.
        for asset in "${ASSETS[@]+"${ASSETS[@]}"}"; do
            run tea release asset create "$TAG" "$asset"
        done
        ;;
esac

# Final summary.
echo ""
echo "==> Release $TAG complete!"
echo ""
echo " Binaries: ${#ASSETS[@]} platform(s) attached to release"
echo " Min engine: v$MIN_ENGINE_VERSION"
+43 -89
View File
@@ -1,21 +1,9 @@
#!/usr/bin/env bash
#
# release.sh — Build, tag, and release kb-search
#
# Builds Go client binaries, Docker engine images, creates a Git tag + release,
# and pushes container images to the registry.
# release-engine.sh — Build, tag, and release the engine Docker images
#
# Usage:
# ./release.sh # auto-increment patch, build, release
# ./release.sh --no-increment # release using current VERSION files as-is
# ./release.sh --dry-run # show what would happen without doing it
# ./release.sh --minor # bump minor version (e.g. 2.0.1 → 2.1.0)
# ./release.sh --major # bump major version (e.g. 2.1.0 → 3.0.0)
# ./release.sh --gitea # use Gitea (tea) for release creation
# ./release.sh --github # use GitHub (gh) for release creation
#
# One of --gitea or --github is required.
# Assumes Docker is already authenticated to the registry.
# ./release-engine.sh --gitea|--github [--dry-run] [--no-increment] [--patch|--minor|--major]
set -euo pipefail
@@ -23,11 +11,8 @@ set -euo pipefail
# Config
#──────────────────────────────────────────────────────────────────────
SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)"
CLIENT_DIR="$SCRIPT_DIR/client"
ENGINE_DIR="$SCRIPT_DIR/engine"
CLIENT_VERSION_FILE="$CLIENT_DIR/VERSION"
ENGINE_VERSION_FILE="$ENGINE_DIR/VERSION"
VERSION_FILE="$ENGINE_DIR/VERSION"
# Container registry
REGISTRY="${REGISTRY:-docker.dcglab.co.uk}"
@@ -65,6 +50,13 @@ if [[ -z "$FORGE" ]]; then
exit 1
fi
# Ensure we're on main branch
CURRENT_BRANCH="$(git -C "$SCRIPT_DIR" rev-parse --abbrev-ref HEAD)"
if [[ "$CURRENT_BRANCH" != "main" ]]; then
echo "Error: releases must be made from the main branch (currently on '$CURRENT_BRANCH')"
exit 1
fi
if ! command -v "$FORGE" &>/dev/null; then
echo "Error: '$FORGE' not found in PATH"
exit 1
@@ -99,27 +91,6 @@ write_version() {
echo "$ver" > "$file"
}
#──────────────────────────────────────────────────────────────────────
# Determine release version
#──────────────────────────────────────────────────────────────────────
CURRENT_VERSION="$(read_version "$CLIENT_VERSION_FILE")"
if [[ "$INCREMENT" == true ]]; then
VERSION="$(bump_version "$CURRENT_VERSION" "$BUMP")"
echo "==> Version bump: $CURRENT_VERSION$VERSION ($BUMP)"
else
VERSION="$CURRENT_VERSION"
echo "==> Version: $VERSION (no increment)"
fi
TAG="v${VERSION}"
echo " Tag: $TAG"
echo " Registry: $IMAGE_BASE"
echo " Forge CLI: $FORGE"
echo " Dry run: $DRY_RUN"
echo ""
run() {
echo " $ $*"
if [[ "$DRY_RUN" == false ]]; then
@@ -127,13 +98,33 @@ run() {
fi
}
#──────────────────────────────────────────────────────────────────────
# Determine release version
#──────────────────────────────────────────────────────────────────────
# Read the engine version currently on disk.
CURRENT_VERSION="$(read_version "$VERSION_FILE")"
if [[ "$INCREMENT" == true ]]; then
    VERSION="$(bump_version "$CURRENT_VERSION" "$BUMP")"
    # Separate old and new versions so they do not run together in the log.
    echo "==> Engine version bump: $CURRENT_VERSION → $VERSION ($BUMP)"
else
    VERSION="$CURRENT_VERSION"
    echo "==> Engine version: $VERSION (no increment)"
fi
TAG="engine-v${VERSION}"
echo " Tag: $TAG"
echo " Registry: $IMAGE_BASE"
echo " Forge CLI: $FORGE"
echo " Dry run: $DRY_RUN"
echo ""
#──────────────────────────────────────────────────────────────────────
# 1. Pre-flight checks
#──────────────────────────────────────────────────────────────────────
echo "==> Pre-flight checks"
if [[ "$DRY_RUN" == false ]]; then
# Check tag doesn't already exist
if git -C "$SCRIPT_DIR" rev-parse "$TAG" &>/dev/null; then
echo "Error: tag $TAG already exists"
exit 1
@@ -144,37 +135,16 @@ echo " OK"
echo ""
#──────────────────────────────────────────────────────────────────────
# 2. Update version files
# 2. Update version file
#──────────────────────────────────────────────────────────────────────
if [[ "$INCREMENT" == true ]]; then
echo "==> Updating version files to $VERSION"
run write_version "$CLIENT_VERSION_FILE" "$VERSION"
run write_version "$ENGINE_VERSION_FILE" "$VERSION"
echo "==> Updating engine version to $VERSION"
run write_version "$VERSION_FILE" "$VERSION"
echo ""
fi
#──────────────────────────────────────────────────────────────────────
# 3. Build Go client binaries
#──────────────────────────────────────────────────────────────────────
echo "==> Building Go client binaries ($VERSION)"
run make -C "$CLIENT_DIR" clean
run make -C "$CLIENT_DIR" all VERSION="$VERSION"
# Collect release assets
ASSETS=()
if [[ "$DRY_RUN" == false ]]; then
for bin in "$CLIENT_DIR"/dist/kb-*; do
ASSETS+=("$bin")
done
echo " Built ${#ASSETS[@]} binaries"
else
echo " (skipped — dry run)"
fi
echo ""
#──────────────────────────────────────────────────────────────────────
# 4. Build Docker engine images
# 3. Build Docker engine images
#──────────────────────────────────────────────────────────────────────
echo "==> Building Docker engine images ($VERSION)"
@@ -189,13 +159,13 @@ run docker build -t "$ROCM_IMAGE" -t "$ROCM_LATEST" -f "$ENGINE_DIR/Dockerfile.r
echo ""
#──────────────────────────────────────────────────────────────────────
# 5. Commit version bump, tag, and push
# 4. Commit, tag, and push
#──────────────────────────────────────────────────────────────────────
echo "==> Committing and tagging $TAG"
if [[ "$INCREMENT" == true ]]; then
run git -C "$SCRIPT_DIR" add "$CLIENT_VERSION_FILE" "$ENGINE_VERSION_FILE"
run git -C "$SCRIPT_DIR" commit -m "Bump version to $VERSION"
run git -C "$SCRIPT_DIR" add "$VERSION_FILE"
run git -C "$SCRIPT_DIR" commit -m "Bump engine version to $VERSION"
fi
run git -C "$SCRIPT_DIR" tag -a "$TAG" -m "Release $TAG"
@@ -205,11 +175,11 @@ run git -C "$SCRIPT_DIR" push origin "$TAG"
echo ""
#──────────────────────────────────────────────────────────────────────
# 6. Create release with assets
# 5. Create release
#──────────────────────────────────────────────────────────────────────
echo "==> Creating release via $FORGE"
RELEASE_TITLE="$TAG"
RELEASE_TITLE="Engine $TAG"
RELEASE_NOTES="## Docker images
\`\`\`bash
@@ -218,38 +188,24 @@ docker pull ${NVIDIA_IMAGE}
# AMD GPU (ROCm)
docker pull ${ROCM_IMAGE}
\`\`\`
## Client binaries
Download the binary for your platform from the assets below, rename to \`kb\`, and place on your PATH."
\`\`\`"
if [[ "$FORGE" == "gh" ]]; then
ASSET_FLAGS=()
for f in "${ASSETS[@]+"${ASSETS[@]}"}"; do
ASSET_FLAGS+=("$f")
done
run gh release create "$TAG" \
--title "$RELEASE_TITLE" \
--notes "$RELEASE_NOTES" \
"${ASSET_FLAGS[@]+"${ASSET_FLAGS[@]}"}"
--notes "$RELEASE_NOTES"
elif [[ "$FORGE" == "tea" ]]; then
run tea release create \
--tag "$TAG" \
--title "$RELEASE_TITLE" \
--note "$RELEASE_NOTES"
# tea attaches assets separately
for f in "${ASSETS[@]+"${ASSETS[@]}"}"; do
run tea release asset create --tag "$TAG" "$f"
done
fi
echo ""
#──────────────────────────────────────────────────────────────────────
# 7. Push Docker images to registry
# 6. Push Docker images to registry
#──────────────────────────────────────────────────────────────────────
echo "==> Pushing Docker images to $REGISTRY"
@@ -264,5 +220,3 @@ echo ""
echo " Images:"
echo " $NVIDIA_IMAGE"
echo " $ROCM_IMAGE"
echo ""
echo " Binaries: ${#ASSETS[@]} platform(s) attached to release"