a47a1504c8
Three-layer organization: primary topic (one per article, for ranking and brief balance) + grouping tags (1-4 per article from a controlled vocabulary, the organic "wandering" axis) + tonal flavor. - taxonomy: add technology + learning topics; 4 calm tag families (Discovery & Wonder, People & Kindness, Solutions & Progress, Mind & Craft) defined in code, not the DB; ALLOWED_TAGS union + coerce_tags validation. - db: article_tags(article_id, tag) join table + tag index. - llm: tags added to the classifier json_schema (enum-constrained, maxItems 4) and system prompt; normalize_scores coerces tags; upsert_article_score replaces a row's tags atomically on every (re)classification. - queries: feed gains a tag filter and exposes tags via group_concat; tag_counts. - api: Article.tags, feed tag param, and /api/families with per-tag counts. - tests: coerce/normalize/upsert/tag-filter/reclassify-replace/tag_counts + /api/families. 99 passing. Corpus reclassify (re-tag + new primary topics) runs separately against the local LLM. Frontend (B2) pairs with this; the live site is unchanged until then. Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
181 lines
5.9 KiB
Python
181 lines
5.9 KiB
Python
from __future__ import annotations
|
|
|
|
import sqlite3
|
|
from pathlib import Path
|
|
|
|
|
|
# Full current schema, executed as one script by init_db(). Every statement is
# idempotent (IF NOT EXISTS), so running it against an existing database only
# creates what is missing; columns added to tables that already exist are
# handled separately by _migrate().
#
# articles.source_id is declared as a foreign key exactly once, on the column
# itself (ON DELETE CASCADE). A second table-level FOREIGN KEY clause on the
# same column was redundant and carried a different (default) ON DELETE action.
#
# The index on articles.duplicate_of is intentionally absent here: _migrate()
# creates it after making sure the column exists on upgraded databases.
SCHEMA = """
PRAGMA foreign_keys = ON;

CREATE TABLE IF NOT EXISTS sources (
    id INTEGER PRIMARY KEY AUTOINCREMENT,
    name TEXT NOT NULL UNIQUE,
    homepage_url TEXT,
    feed_url TEXT NOT NULL UNIQUE,
    source_type TEXT NOT NULL DEFAULT 'rss',
    default_category TEXT,
    trust_score INTEGER NOT NULL DEFAULT 5,
    pr_risk_score INTEGER NOT NULL DEFAULT 3,
    active INTEGER NOT NULL DEFAULT 1,
    poll_interval_minutes INTEGER NOT NULL DEFAULT 60,
    notes TEXT,
    last_success_at TEXT,
    last_error_at TEXT,
    last_error TEXT,
    consecutive_failures INTEGER NOT NULL DEFAULT 0,
    review_flag INTEGER NOT NULL DEFAULT 0,
    review_reason TEXT,
    created_at TEXT NOT NULL DEFAULT CURRENT_TIMESTAMP,
    updated_at TEXT NOT NULL DEFAULT CURRENT_TIMESTAMP
);

CREATE TABLE IF NOT EXISTS articles (
    id INTEGER PRIMARY KEY AUTOINCREMENT,
    source_id INTEGER NOT NULL REFERENCES sources(id) ON DELETE CASCADE,
    canonical_url TEXT NOT NULL,
    title TEXT NOT NULL,
    description TEXT,
    author TEXT,
    published_at TEXT,
    discovered_at TEXT NOT NULL DEFAULT CURRENT_TIMESTAMP,
    image_url TEXT,
    language TEXT,
    raw_guid TEXT,
    url_hash TEXT NOT NULL UNIQUE,
    title_hash TEXT,
    duplicate_of INTEGER REFERENCES articles(id) ON DELETE SET NULL,
    image_checked_at TEXT
);

CREATE INDEX IF NOT EXISTS idx_articles_published_at ON articles(published_at);
CREATE INDEX IF NOT EXISTS idx_articles_source_id ON articles(source_id);
CREATE INDEX IF NOT EXISTS idx_articles_title_hash ON articles(title_hash);

CREATE TABLE IF NOT EXISTS article_scores (
    article_id INTEGER PRIMARY KEY REFERENCES articles(id) ON DELETE CASCADE,
    constructive_score INTEGER,
    cortisol_score INTEGER,
    ragebait_score INTEGER,
    agency_score INTEGER,
    human_benefit_score INTEGER,
    novelty_score INTEGER,
    pr_risk_score INTEGER,
    accepted INTEGER,
    reason_code TEXT,
    reason_text TEXT,
    topic TEXT,
    flavor TEXT,
    model_name TEXT,
    scored_at TEXT NOT NULL DEFAULT CURRENT_TIMESTAMP
);

CREATE TABLE IF NOT EXISTS article_tags (
    article_id INTEGER NOT NULL REFERENCES articles(id) ON DELETE CASCADE,
    tag TEXT NOT NULL,
    PRIMARY KEY (article_id, tag)
);

CREATE INDEX IF NOT EXISTS idx_article_tags_tag ON article_tags(tag);

CREATE TABLE IF NOT EXISTS article_embeddings (
    article_id INTEGER PRIMARY KEY REFERENCES articles(id) ON DELETE CASCADE,
    vector BLOB NOT NULL,
    dim INTEGER NOT NULL,
    model TEXT NOT NULL,
    created_at TEXT NOT NULL DEFAULT CURRENT_TIMESTAMP
);

CREATE TABLE IF NOT EXISTS ingest_runs (
    id INTEGER PRIMARY KEY AUTOINCREMENT,
    source_id INTEGER REFERENCES sources(id) ON DELETE SET NULL,
    started_at TEXT NOT NULL DEFAULT CURRENT_TIMESTAMP,
    finished_at TEXT,
    status TEXT NOT NULL DEFAULT 'running',
    items_seen INTEGER NOT NULL DEFAULT 0,
    items_inserted INTEGER NOT NULL DEFAULT 0,
    items_duplicate INTEGER NOT NULL DEFAULT 0,
    error TEXT
);

CREATE TABLE IF NOT EXISTS source_candidates (
    id INTEGER PRIMARY KEY AUTOINCREMENT,
    feed_url TEXT NOT NULL UNIQUE,
    homepage_url TEXT,
    name TEXT,
    status TEXT NOT NULL DEFAULT 'suggested',
    preview_json TEXT,
    notes TEXT,
    last_previewed_at TEXT,
    created_at TEXT NOT NULL DEFAULT CURRENT_TIMESTAMP,
    updated_at TEXT NOT NULL DEFAULT CURRENT_TIMESTAMP
);

CREATE TABLE IF NOT EXISTS daily_briefs (
    id INTEGER PRIMARY KEY AUTOINCREMENT,
    brief_date TEXT NOT NULL UNIQUE,
    title TEXT NOT NULL,
    created_at TEXT NOT NULL DEFAULT CURRENT_TIMESTAMP,
    notes TEXT
);

CREATE TABLE IF NOT EXISTS daily_brief_items (
    brief_id INTEGER NOT NULL REFERENCES daily_briefs(id) ON DELETE CASCADE,
    article_id INTEGER NOT NULL REFERENCES articles(id) ON DELETE CASCADE,
    rank INTEGER NOT NULL,
    selection_reason TEXT,
    PRIMARY KEY (brief_id, article_id),
    UNIQUE (brief_id, rank)
);
"""
|
|
|
|
|
|
def connect(db_path: Path | str) -> sqlite3.Connection:
    """Open the SQLite database at *db_path* and return a configured connection.

    The parent directory is created if it does not exist. The returned
    connection yields ``sqlite3.Row`` rows and has foreign-key enforcement
    switched on.
    """
    db_file = Path(db_path)
    # sqlite3 creates a missing database file, but not missing directories.
    db_file.parent.mkdir(parents=True, exist_ok=True)

    connection = sqlite3.connect(db_file)
    connection.row_factory = sqlite3.Row
    # Foreign-key enforcement is a per-connection setting in SQLite.
    connection.execute("PRAGMA foreign_keys = ON")
    return connection
|
|
|
|
|
|
def init_db(conn: sqlite3.Connection) -> None:
    """Bring the database behind *conn* up to the current schema and commit.

    Runs the idempotent SCHEMA script first, then the column migrations for
    databases created before those columns existed.
    """
    # Order matters: the migrations inspect tables that SCHEMA creates.
    conn.executescript(SCHEMA)
    _migrate(conn)
    conn.commit()
|
|
|
|
|
|
def _table_columns(conn: sqlite3.Connection, table: str) -> set[str]:
    """Return the names of the columns currently present on *table*."""
    # The table name is interpolated, so pass only hard-coded names.
    # Column 1 of each table_info row is the column name; reading it by
    # position works whether or not the connection uses sqlite3.Row.
    return {row[1] for row in conn.execute(f"PRAGMA table_info({table})")}


def _migrate(conn: sqlite3.Connection) -> None:
    """Add columns introduced after the initial schema to existing databases.

    CREATE TABLE IF NOT EXISTS never alters an existing table, so new columns
    need an explicit, idempotent ALTER guarded by the current column set.
    Nothing is committed here; the caller commits.
    """
    score_cols = _table_columns(conn, "article_scores")
    for column in ("topic", "flavor"):
        if column not in score_cols:
            conn.execute(f"ALTER TABLE article_scores ADD COLUMN {column} TEXT")

    article_cols = _table_columns(conn, "articles")
    if "duplicate_of" not in article_cols:
        # Use the same ON DELETE action as the column declared in SCHEMA, so
        # upgraded and freshly created databases behave alike when the
        # article a duplicate points at is deleted.
        conn.execute(
            "ALTER TABLE articles ADD COLUMN duplicate_of INTEGER "
            "REFERENCES articles(id) ON DELETE SET NULL"
        )
    if "image_checked_at" not in article_cols:
        conn.execute("ALTER TABLE articles ADD COLUMN image_checked_at TEXT")
    # Created here (not in SCHEMA) so it runs after the column exists on upgrades.
    conn.execute("CREATE INDEX IF NOT EXISTS idx_articles_duplicate_of ON articles(duplicate_of)")

    source_cols = _table_columns(conn, "sources")
    health_columns = {
        "last_success_at": "TEXT",
        "last_error_at": "TEXT",
        "last_error": "TEXT",
        "consecutive_failures": "INTEGER NOT NULL DEFAULT 0",
        "review_flag": "INTEGER NOT NULL DEFAULT 0",
        "review_reason": "TEXT",
    }
    for column, decl in health_columns.items():
        if column not in source_cols:
            conn.execute(f"ALTER TABLE sources ADD COLUMN {column} {decl}")
|