Files
upbeatBytes/goodnews/db.py
T
thejayman77 a47a1504c8 Phase B1: multi-tag groupings model (backend)
Three-layer organization: primary topic (one per article, for ranking and
brief balance) + grouping tags (1-4 per article from a controlled vocabulary,
the organic "wandering" axis) + tonal flavor.

- taxonomy: add technology + learning topics; 4 calm tag families
  (Discovery & Wonder, People & Kindness, Solutions & Progress, Mind & Craft)
  defined in code, not the DB; ALLOWED_TAGS union + coerce_tags validation.
- db: article_tags(article_id, tag) join table + tag index.
- llm: tags added to the classifier json_schema (enum-constrained, maxItems 4)
  and system prompt; normalize_scores coerces tags; upsert_article_score
  replaces a row's tags atomically on every (re)classification.
- queries: feed gains a tag filter and exposes tags via group_concat; tag_counts.
- api: Article.tags, feed tag param, and /api/families with per-tag counts.
- tests: coerce/normalize/upsert/tag-filter/reclassify-replace/tag_counts +
  /api/families. 99 passing.

Corpus reclassify (re-tag + new primary topics) runs separately against the
local LLM. Frontend (B2) pairs with this; the live site is unchanged until then.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
2026-06-01 18:35:25 +00:00

181 lines
5.9 KiB
Python

from __future__ import annotations
import sqlite3
from pathlib import Path
# Full DDL for a fresh database; init_db() feeds this string to
# conn.executescript(). Every statement uses IF NOT EXISTS, so running it
# against an already-initialised database leaves existing objects untouched.
# Because of that, a column added to a table here is NOT applied to existing
# databases -- it also needs a guarded ALTER in _migrate().
#
# Tables: sources, articles, article_scores, article_tags,
# article_embeddings, ingest_runs, source_candidates, daily_briefs,
# daily_brief_items.
#
# The index on articles(duplicate_of) is deliberately absent from this script;
# _migrate() creates it after making sure the column exists.
#
# NOTE(review): articles.source_id declares its foreign key twice -- once on
# the column (ON DELETE CASCADE) and once as a table-level FOREIGN KEY clause
# with no ON DELETE action. The second looks redundant; confirm before
# changing, since existing databases already carry this definition.
SCHEMA = """
PRAGMA foreign_keys = ON;
CREATE TABLE IF NOT EXISTS sources (
id INTEGER PRIMARY KEY AUTOINCREMENT,
name TEXT NOT NULL UNIQUE,
homepage_url TEXT,
feed_url TEXT NOT NULL UNIQUE,
source_type TEXT NOT NULL DEFAULT 'rss',
default_category TEXT,
trust_score INTEGER NOT NULL DEFAULT 5,
pr_risk_score INTEGER NOT NULL DEFAULT 3,
active INTEGER NOT NULL DEFAULT 1,
poll_interval_minutes INTEGER NOT NULL DEFAULT 60,
notes TEXT,
last_success_at TEXT,
last_error_at TEXT,
last_error TEXT,
consecutive_failures INTEGER NOT NULL DEFAULT 0,
review_flag INTEGER NOT NULL DEFAULT 0,
review_reason TEXT,
created_at TEXT NOT NULL DEFAULT CURRENT_TIMESTAMP,
updated_at TEXT NOT NULL DEFAULT CURRENT_TIMESTAMP
);
CREATE TABLE IF NOT EXISTS articles (
id INTEGER PRIMARY KEY AUTOINCREMENT,
source_id INTEGER NOT NULL REFERENCES sources(id) ON DELETE CASCADE,
canonical_url TEXT NOT NULL,
title TEXT NOT NULL,
description TEXT,
author TEXT,
published_at TEXT,
discovered_at TEXT NOT NULL DEFAULT CURRENT_TIMESTAMP,
image_url TEXT,
language TEXT,
raw_guid TEXT,
url_hash TEXT NOT NULL UNIQUE,
title_hash TEXT,
duplicate_of INTEGER REFERENCES articles(id) ON DELETE SET NULL,
image_checked_at TEXT,
FOREIGN KEY (source_id) REFERENCES sources(id)
);
CREATE INDEX IF NOT EXISTS idx_articles_published_at ON articles(published_at);
CREATE INDEX IF NOT EXISTS idx_articles_source_id ON articles(source_id);
CREATE INDEX IF NOT EXISTS idx_articles_title_hash ON articles(title_hash);
CREATE TABLE IF NOT EXISTS article_scores (
article_id INTEGER PRIMARY KEY REFERENCES articles(id) ON DELETE CASCADE,
constructive_score INTEGER,
cortisol_score INTEGER,
ragebait_score INTEGER,
agency_score INTEGER,
human_benefit_score INTEGER,
novelty_score INTEGER,
pr_risk_score INTEGER,
accepted INTEGER,
reason_code TEXT,
reason_text TEXT,
topic TEXT,
flavor TEXT,
model_name TEXT,
scored_at TEXT NOT NULL DEFAULT CURRENT_TIMESTAMP
);
CREATE TABLE IF NOT EXISTS article_tags (
article_id INTEGER NOT NULL REFERENCES articles(id) ON DELETE CASCADE,
tag TEXT NOT NULL,
PRIMARY KEY (article_id, tag)
);
CREATE INDEX IF NOT EXISTS idx_article_tags_tag ON article_tags(tag);
CREATE TABLE IF NOT EXISTS article_embeddings (
article_id INTEGER PRIMARY KEY REFERENCES articles(id) ON DELETE CASCADE,
vector BLOB NOT NULL,
dim INTEGER NOT NULL,
model TEXT NOT NULL,
created_at TEXT NOT NULL DEFAULT CURRENT_TIMESTAMP
);
CREATE TABLE IF NOT EXISTS ingest_runs (
id INTEGER PRIMARY KEY AUTOINCREMENT,
source_id INTEGER REFERENCES sources(id) ON DELETE SET NULL,
started_at TEXT NOT NULL DEFAULT CURRENT_TIMESTAMP,
finished_at TEXT,
status TEXT NOT NULL DEFAULT 'running',
items_seen INTEGER NOT NULL DEFAULT 0,
items_inserted INTEGER NOT NULL DEFAULT 0,
items_duplicate INTEGER NOT NULL DEFAULT 0,
error TEXT
);
CREATE TABLE IF NOT EXISTS source_candidates (
id INTEGER PRIMARY KEY AUTOINCREMENT,
feed_url TEXT NOT NULL UNIQUE,
homepage_url TEXT,
name TEXT,
status TEXT NOT NULL DEFAULT 'suggested',
preview_json TEXT,
notes TEXT,
last_previewed_at TEXT,
created_at TEXT NOT NULL DEFAULT CURRENT_TIMESTAMP,
updated_at TEXT NOT NULL DEFAULT CURRENT_TIMESTAMP
);
CREATE TABLE IF NOT EXISTS daily_briefs (
id INTEGER PRIMARY KEY AUTOINCREMENT,
brief_date TEXT NOT NULL UNIQUE,
title TEXT NOT NULL,
created_at TEXT NOT NULL DEFAULT CURRENT_TIMESTAMP,
notes TEXT
);
CREATE TABLE IF NOT EXISTS daily_brief_items (
brief_id INTEGER NOT NULL REFERENCES daily_briefs(id) ON DELETE CASCADE,
article_id INTEGER NOT NULL REFERENCES articles(id) ON DELETE CASCADE,
rank INTEGER NOT NULL,
selection_reason TEXT,
PRIMARY KEY (brief_id, article_id),
UNIQUE (brief_id, rank)
);
"""
def connect(db_path: Path | str) -> sqlite3.Connection:
    """Open (creating if necessary) the SQLite database at *db_path*.

    Missing parent directories are created first. The returned connection
    yields ``sqlite3.Row`` rows and has foreign-key enforcement switched on,
    which SQLite otherwise leaves off per connection.
    """
    db_file = Path(db_path)
    # sqlite3 creates the database file itself but not the folders above it.
    db_file.parent.mkdir(parents=True, exist_ok=True)

    connection = sqlite3.connect(db_file)
    connection.execute("PRAGMA foreign_keys = ON")
    connection.row_factory = sqlite3.Row
    return connection
def init_db(conn: sqlite3.Connection) -> None:
    """Bring the database behind *conn* up to the current schema.

    Runs the ``SCHEMA`` script to create any missing tables and indexes,
    then ``_migrate`` to add columns that tables created by an older
    version lack, and finally commits.
    """
    # Create whatever does not exist yet; existing objects are left alone.
    conn.executescript(SCHEMA)
    # Patch tables that predate newer columns.
    _migrate(conn)
    conn.commit()
def _migrate(conn: sqlite3.Connection) -> None:
    """Add columns introduced after the initial schema to existing databases.

    CREATE TABLE IF NOT EXISTS never alters an existing table, so new columns
    need an explicit, idempotent ALTER guarded by the current column set.

    The ``duplicate_of`` column is added with ``ON DELETE SET NULL`` so that an
    upgraded database gets the same foreign-key behaviour as one created
    directly from ``SCHEMA``.

    Args:
        conn: Open connection whose rows can be indexed by column name
            (``connect`` sets ``sqlite3.Row``); the tables being patched
            must already exist.
    """

    def add_missing_columns(table: str, columns: dict[str, str]) -> None:
        # Table and column names are literals from this function, never
        # external input, so interpolating them into the DDL is safe.
        present = {row["name"] for row in conn.execute(f"PRAGMA table_info({table})")}
        for column, decl in columns.items():
            if column not in present:
                conn.execute(f"ALTER TABLE {table} ADD COLUMN {column} {decl}")

    add_missing_columns("article_scores", {"topic": "TEXT", "flavor": "TEXT"})

    add_missing_columns(
        "articles",
        {
            # Must mirror the SCHEMA declaration, including the delete action.
            "duplicate_of": "INTEGER REFERENCES articles(id) ON DELETE SET NULL",
            "image_checked_at": "TEXT",
        },
    )
    # Created here (not in SCHEMA) so it runs after the column exists on upgrades.
    conn.execute("CREATE INDEX IF NOT EXISTS idx_articles_duplicate_of ON articles(duplicate_of)")

    add_missing_columns(
        "sources",
        {
            "last_success_at": "TEXT",
            "last_error_at": "TEXT",
            "last_error": "TEXT",
            "consecutive_failures": "INTEGER NOT NULL DEFAULT 0",
            "review_flag": "INTEGER NOT NULL DEFAULT 0",
            "review_reason": "TEXT",
        },
    )