Files
upbeatBytes/goodnews/db.py
T
thejayman77 337dc3f901 Article pages: structured "Why it belongs" editorial read
Per Codex — make /a/<id> feel like Upbeat Bytes has editorial judgment, not just
a summary wrapper. Trust-building, short, not an essay.

* article_summaries gains what_happened / why_matters / why_belongs (+ migration).
* summarize.explain_article: a separate, fallback-able LLM pass producing three
  short notes (parsed from a labelled WHAT/MATTERS/BELONGS format). generate_summary
  now stores them alongside the summary, and tops up older summaries on next view.
  get_explanation returns them only when all three are present.
* API: share_page + /api/summary expose the explanation.
* share.py: renders the three-part section (accent rule) when complete; otherwise
  the single "Why it's here" reason line is the calm fallback. The page polls and
  swaps in both the summary and the section as they cache.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
2026-06-09 20:05:26 -04:00

375 lines
15 KiB
Python

from __future__ import annotations
import sqlite3
from pathlib import Path
# Complete DDL for the database, run as one script by init_db() via
# conn.executescript(). Every statement is CREATE ... IF NOT EXISTS, so applying
# it to an existing database leaves existing tables untouched; columns added to
# a table after its first release are therefore handled separately in _migrate().
SCHEMA = """
PRAGMA foreign_keys = ON;
CREATE TABLE IF NOT EXISTS sources (
id INTEGER PRIMARY KEY AUTOINCREMENT,
name TEXT NOT NULL UNIQUE,
homepage_url TEXT,
feed_url TEXT NOT NULL UNIQUE,
source_type TEXT NOT NULL DEFAULT 'rss',
default_category TEXT,
trust_score INTEGER NOT NULL DEFAULT 5,
pr_risk_score INTEGER NOT NULL DEFAULT 3,
active INTEGER NOT NULL DEFAULT 1,
status TEXT NOT NULL DEFAULT 'active',
content_visible INTEGER NOT NULL DEFAULT 1,
poll_interval_minutes INTEGER NOT NULL DEFAULT 60,
notes TEXT,
last_success_at TEXT,
last_error_at TEXT,
last_error TEXT,
consecutive_failures INTEGER NOT NULL DEFAULT 0,
retry_after_at TEXT,
review_flag INTEGER NOT NULL DEFAULT 0,
review_reason TEXT,
created_at TEXT NOT NULL DEFAULT CURRENT_TIMESTAMP,
updated_at TEXT NOT NULL DEFAULT CURRENT_TIMESTAMP
);
CREATE TABLE IF NOT EXISTS articles (
id INTEGER PRIMARY KEY AUTOINCREMENT,
source_id INTEGER NOT NULL REFERENCES sources(id) ON DELETE CASCADE,
canonical_url TEXT NOT NULL,
title TEXT NOT NULL,
description TEXT,
author TEXT,
published_at TEXT,
discovered_at TEXT NOT NULL DEFAULT CURRENT_TIMESTAMP,
image_url TEXT,
language TEXT,
raw_guid TEXT,
url_hash TEXT NOT NULL UNIQUE,
title_hash TEXT,
duplicate_of INTEGER REFERENCES articles(id) ON DELETE SET NULL,
image_checked_at TEXT,
FOREIGN KEY (source_id) REFERENCES sources(id)
);
CREATE INDEX IF NOT EXISTS idx_articles_published_at ON articles(published_at);
CREATE INDEX IF NOT EXISTS idx_articles_source_id ON articles(source_id);
CREATE INDEX IF NOT EXISTS idx_articles_title_hash ON articles(title_hash);
CREATE TABLE IF NOT EXISTS article_scores (
article_id INTEGER PRIMARY KEY REFERENCES articles(id) ON DELETE CASCADE,
constructive_score INTEGER,
cortisol_score INTEGER,
ragebait_score INTEGER,
agency_score INTEGER,
human_benefit_score INTEGER,
novelty_score INTEGER,
pr_risk_score INTEGER,
accepted INTEGER,
reason_code TEXT,
reason_text TEXT,
topic TEXT,
flavor TEXT,
model_name TEXT,
scored_at TEXT NOT NULL DEFAULT CURRENT_TIMESTAMP
);
CREATE TABLE IF NOT EXISTS article_tags (
article_id INTEGER NOT NULL REFERENCES articles(id) ON DELETE CASCADE,
tag TEXT NOT NULL,
PRIMARY KEY (article_id, tag)
);
CREATE INDEX IF NOT EXISTS idx_article_tags_tag ON article_tags(tag);
CREATE TABLE IF NOT EXISTS article_embeddings (
article_id INTEGER PRIMARY KEY REFERENCES articles(id) ON DELETE CASCADE,
vector BLOB NOT NULL,
dim INTEGER NOT NULL,
model TEXT NOT NULL,
created_at TEXT NOT NULL DEFAULT CURRENT_TIMESTAMP
);
CREATE TABLE IF NOT EXISTS ingest_runs (
id INTEGER PRIMARY KEY AUTOINCREMENT,
source_id INTEGER REFERENCES sources(id) ON DELETE SET NULL,
started_at TEXT NOT NULL DEFAULT CURRENT_TIMESTAMP,
finished_at TEXT,
status TEXT NOT NULL DEFAULT 'running',
items_seen INTEGER NOT NULL DEFAULT 0,
items_inserted INTEGER NOT NULL DEFAULT 0,
items_duplicate INTEGER NOT NULL DEFAULT 0,
error TEXT
);
CREATE TABLE IF NOT EXISTS source_candidates (
id INTEGER PRIMARY KEY AUTOINCREMENT,
feed_url TEXT NOT NULL UNIQUE,
homepage_url TEXT,
name TEXT,
status TEXT NOT NULL DEFAULT 'suggested',
preview_json TEXT,
notes TEXT,
last_previewed_at TEXT,
created_at TEXT NOT NULL DEFAULT CURRENT_TIMESTAMP,
updated_at TEXT NOT NULL DEFAULT CURRENT_TIMESTAMP
);
CREATE TABLE IF NOT EXISTS daily_briefs (
id INTEGER PRIMARY KEY AUTOINCREMENT,
brief_date TEXT NOT NULL UNIQUE,
title TEXT NOT NULL,
created_at TEXT NOT NULL DEFAULT CURRENT_TIMESTAMP,
notes TEXT
);
CREATE TABLE IF NOT EXISTS daily_brief_items (
brief_id INTEGER NOT NULL REFERENCES daily_briefs(id) ON DELETE CASCADE,
article_id INTEGER NOT NULL REFERENCES articles(id) ON DELETE CASCADE,
rank INTEGER NOT NULL,
selection_reason TEXT,
PRIMARY KEY (brief_id, article_id),
UNIQUE (brief_id, rank)
);
-- ---- Accounts ----------------------------------------------------------------
-- Self-hosted, minimal-PII. The host ingestion owns the content tables above;
-- the API owns these (writes happen via the API, so the DB runs in WAL mode).
CREATE TABLE IF NOT EXISTS users (
id INTEGER PRIMARY KEY AUTOINCREMENT,
email TEXT NOT NULL UNIQUE,
display_name TEXT,
avatar_url TEXT,
is_admin INTEGER NOT NULL DEFAULT 0,
digest_enabled INTEGER NOT NULL DEFAULT 0,
digest_unsub_token TEXT,
created_at TEXT NOT NULL DEFAULT CURRENT_TIMESTAMP,
updated_at TEXT NOT NULL DEFAULT CURRENT_TIMESTAMP
);
-- One row per sign-in method linked to a user; lets Google + magic-link
-- (same verified email) resolve to a single account.
CREATE TABLE IF NOT EXISTS identities (
id INTEGER PRIMARY KEY AUTOINCREMENT,
user_id INTEGER NOT NULL REFERENCES users(id) ON DELETE CASCADE,
provider TEXT NOT NULL, -- 'email' | 'google' | 'apple'
provider_subject TEXT NOT NULL, -- email address, or the provider's stable user id
created_at TEXT NOT NULL DEFAULT CURRENT_TIMESTAMP,
UNIQUE (provider, provider_subject)
);
CREATE INDEX IF NOT EXISTS idx_identities_user ON identities(user_id);
-- Single-use, short-lived magic-link tokens (stored hashed).
CREATE TABLE IF NOT EXISTS login_tokens (
id INTEGER PRIMARY KEY AUTOINCREMENT,
email TEXT NOT NULL,
token_hash TEXT NOT NULL UNIQUE,
created_at TEXT NOT NULL DEFAULT CURRENT_TIMESTAMP,
expires_at TEXT NOT NULL,
consumed_at TEXT
);
CREATE INDEX IF NOT EXISTS idx_login_tokens_email ON login_tokens(email);
-- Active sessions (opaque token stored hashed); validated for cookie or bearer.
CREATE TABLE IF NOT EXISTS sessions (
id INTEGER PRIMARY KEY AUTOINCREMENT,
user_id INTEGER NOT NULL REFERENCES users(id) ON DELETE CASCADE,
token_hash TEXT NOT NULL UNIQUE,
created_at TEXT NOT NULL DEFAULT CURRENT_TIMESTAMP,
last_seen_at TEXT NOT NULL DEFAULT CURRENT_TIMESTAMP,
expires_at TEXT NOT NULL,
user_agent TEXT
);
CREATE INDEX IF NOT EXISTS idx_sessions_user ON sessions(user_id);
CREATE TABLE IF NOT EXISTS saved_articles (
user_id INTEGER NOT NULL REFERENCES users(id) ON DELETE CASCADE,
article_id INTEGER NOT NULL REFERENCES articles(id) ON DELETE CASCADE,
saved_at TEXT NOT NULL DEFAULT CURRENT_TIMESTAMP,
PRIMARY KEY (user_id, article_id)
);
CREATE TABLE IF NOT EXISTS user_history (
user_id INTEGER NOT NULL REFERENCES users(id) ON DELETE CASCADE,
article_id INTEGER NOT NULL REFERENCES articles(id) ON DELETE CASCADE,
event TEXT NOT NULL DEFAULT 'seen', -- 'seen' | 'dismissed'
at TEXT NOT NULL DEFAULT CURRENT_TIMESTAMP,
PRIMARY KEY (user_id, article_id, event)
);
CREATE TABLE IF NOT EXISTS user_prefs (
user_id INTEGER PRIMARY KEY REFERENCES users(id) ON DELETE CASCADE,
prefs_json TEXT NOT NULL DEFAULT '{}',
updated_at TEXT NOT NULL DEFAULT CURRENT_TIMESTAMP
);
-- Our OWN short summary of an article (generated on demand, cached forever).
-- We store only our derived summary text — never the publisher's article body.
CREATE TABLE IF NOT EXISTS article_summaries (
article_id INTEGER PRIMARY KEY REFERENCES articles(id) ON DELETE CASCADE,
summary TEXT NOT NULL,
what_happened TEXT,
why_matters TEXT,
why_belongs TEXT,
model TEXT,
created_at TEXT NOT NULL DEFAULT CURRENT_TIMESTAMP
);
-- Privacy-respecting, first-party analytics. NO IP / user-agent / referrer / raw
-- URL. visitor_hash is a hash of a random localStorage token (never email/IP).
-- The UNIQUE key dedups to one row per (kind, article, visitor, day) — that both
-- caps volume and makes counts mean "distinct visitor-days". Groupings are derived
-- from article_id at query time, never stored here.
CREATE TABLE IF NOT EXISTS events (
id INTEGER PRIMARY KEY AUTOINCREMENT,
kind TEXT NOT NULL,
article_id INTEGER NOT NULL DEFAULT 0,
visitor_hash TEXT NOT NULL DEFAULT '',
day TEXT NOT NULL,
created_at TEXT NOT NULL DEFAULT CURRENT_TIMESTAMP,
UNIQUE (kind, article_id, visitor_hash, day)
);
CREATE INDEX IF NOT EXISTS idx_events_day ON events(day);
CREATE INDEX IF NOT EXISTS idx_events_kind ON events(kind);
CREATE INDEX IF NOT EXISTS idx_events_article ON events(article_id);
-- User feedback (idea / concern / bug / praise). Anonymous-friendly; optional
-- contact email only if the person wants a reply. visitor_hash is for rate-limit
-- only (the same hashed anonymous token used by analytics).
CREATE TABLE IF NOT EXISTS feedback (
id INTEGER PRIMARY KEY AUTOINCREMENT,
category TEXT NOT NULL DEFAULT 'other',
message TEXT NOT NULL,
contact_email TEXT,
user_id INTEGER REFERENCES users(id) ON DELETE SET NULL,
visitor_hash TEXT NOT NULL DEFAULT '',
day TEXT NOT NULL,
created_at TEXT NOT NULL DEFAULT CURRENT_TIMESTAMP,
read_at TEXT
);
CREATE INDEX IF NOT EXISTS idx_feedback_created ON feedback(created_at);
CREATE TABLE IF NOT EXISTS feedback_replies (
id INTEGER PRIMARY KEY AUTOINCREMENT,
feedback_id INTEGER NOT NULL REFERENCES feedback(id) ON DELETE CASCADE,
user_id INTEGER REFERENCES users(id) ON DELETE SET NULL,
message TEXT NOT NULL,
message_html TEXT,
sent_to TEXT NOT NULL,
sent_at TEXT NOT NULL DEFAULT CURRENT_TIMESTAMP
);
CREATE INDEX IF NOT EXISTS idx_feedback_replies_fid ON feedback_replies(feedback_id);
CREATE TABLE IF NOT EXISTS user_follows (
id INTEGER PRIMARY KEY AUTOINCREMENT,
user_id INTEGER NOT NULL REFERENCES users(id) ON DELETE CASCADE,
kind TEXT NOT NULL, -- 'source' | 'tag'
value TEXT NOT NULL, -- source id (as text) or tag key
created_at TEXT NOT NULL DEFAULT CURRENT_TIMESTAMP,
UNIQUE (user_id, kind, value)
);
CREATE TABLE IF NOT EXISTS digest_sends (
id INTEGER PRIMARY KEY AUTOINCREMENT,
user_id INTEGER NOT NULL REFERENCES users(id) ON DELETE CASCADE,
brief_date TEXT NOT NULL,
item_count INTEGER NOT NULL DEFAULT 0,
sent_at TEXT NOT NULL DEFAULT CURRENT_TIMESTAMP,
UNIQUE (user_id, brief_date)
);
"""
def connect(db_path: Path | str) -> sqlite3.Connection:
    """Open the SQLite database at *db_path* and apply the per-connection settings.

    The parent directory is created when missing. The connection is opened with
    ``check_same_thread=False`` and returns rows as ``sqlite3.Row``.

    Args:
        db_path: Filesystem location of the database, or ``":memory:"``.

    Returns:
        A connection with foreign keys enforced and a 5000 ms busy timeout;
        for anything other than ``":memory:"`` it is also switched to WAL
        journaling with ``synchronous = NORMAL``.
    """
    db_file = Path(db_path)
    db_file.parent.mkdir(parents=True, exist_ok=True)

    conn = sqlite3.connect(db_file, check_same_thread=False)
    conn.row_factory = sqlite3.Row

    # busy_timeout rides out the brief moments the single writer lock is held.
    statements = [
        "PRAGMA foreign_keys = ON",
        "PRAGMA busy_timeout = 5000",
    ]
    on_disk = str(db_file) != ":memory:"
    if on_disk:
        # WAL lets the API write account data while the ingestion cycle writes
        # content concurrently (readers never block the writer). Setting it
        # again on an already-WAL database is a no-op.
        statements.append("PRAGMA journal_mode = WAL")
        statements.append("PRAGMA synchronous = NORMAL")

    for statement in statements:
        conn.execute(statement)
    return conn
def init_db(conn: sqlite3.Connection) -> None:
    """Bring the database behind *conn* up to the current schema and commit.

    Runs the SCHEMA script, then _migrate() for columns that were added to
    tables after they were first created, then commits the connection.

    Args:
        conn: An open connection to the database to initialise.
    """
    # Creates whatever tables and indexes are still missing.
    conn.executescript(SCHEMA)
    # Adds later-introduced columns to tables that already existed.
    _migrate(conn)
    conn.commit()
def _table_columns(conn: sqlite3.Connection, table: str) -> set[str]:
    """Return the column names of *table*; the set is empty if the table is absent.

    Rows are read by name, so *conn* must use a name-addressable row factory
    such as ``sqlite3.Row``.
    """
    return {row["name"] for row in conn.execute(f"PRAGMA table_info({table})")}


def _add_missing_columns(
    conn: sqlite3.Connection,
    table: str,
    existing: set[str],
    declarations: dict[str, str],
) -> None:
    """ALTER *table* to add every declared column whose name is not in *existing*.

    *declarations* maps column name to the SQL that follows it in
    ``ADD COLUMN`` (type, constraints, default). Names and declarations are
    trusted constants from this module, never user input.
    """
    for column, decl in declarations.items():
        if column not in existing:
            conn.execute(f"ALTER TABLE {table} ADD COLUMN {column} {decl}")


def _migrate(conn: sqlite3.Connection) -> None:
    """Add columns introduced after the initial schema to existing databases.

    CREATE TABLE IF NOT EXISTS never alters an existing table, so new columns
    need an explicit, idempotent ALTER guarded by the current column set.
    Calling this repeatedly is safe: a column is only added when it is missing.

    Column declarations here mirror the ones in SCHEMA so that an upgraded
    database behaves like a freshly created one. In particular
    ``articles.duplicate_of`` is added with ``ON DELETE SET NULL``; without it
    an upgraded database would refuse to delete an article that other rows
    reference as their duplicate.

    Args:
        conn: Connection whose rows are addressable by column name.
    """
    score_cols = _table_columns(conn, "article_scores")
    _add_missing_columns(
        conn, "article_scores", score_cols, {"topic": "TEXT", "flavor": "TEXT"}
    )

    # users.avatar_url (Google pictures) + is_admin (admin dashboard) and the
    # digest settings were added later. Skipped when the table does not exist.
    user_cols = _table_columns(conn, "users")
    if user_cols:
        _add_missing_columns(
            conn,
            "users",
            user_cols,
            {
                "avatar_url": "TEXT",
                "is_admin": "INTEGER NOT NULL DEFAULT 0",
                "digest_enabled": "INTEGER NOT NULL DEFAULT 0",
                "digest_unsub_token": "TEXT",
            },
        )

    article_cols = _table_columns(conn, "articles")
    _add_missing_columns(
        conn,
        "articles",
        article_cols,
        {
            # Same referential action as SCHEMA, so deleting the original of a
            # duplicate clears the pointer instead of failing the delete.
            "duplicate_of": "INTEGER REFERENCES articles(id) ON DELETE SET NULL",
            "image_checked_at": "TEXT",
        },
    )
    # Created here (not in SCHEMA) so it runs after the column exists on upgrades.
    conn.execute("CREATE INDEX IF NOT EXISTS idx_articles_duplicate_of ON articles(duplicate_of)")

    source_cols = _table_columns(conn, "sources")
    _add_missing_columns(
        conn,
        "sources",
        source_cols,
        {
            "last_success_at": "TEXT",
            "last_error_at": "TEXT",
            "last_error": "TEXT",
            "consecutive_failures": "INTEGER NOT NULL DEFAULT 0",
            "review_flag": "INTEGER NOT NULL DEFAULT 0",
            "review_reason": "TEXT",
        },
    )
    # Lifecycle: status (active/paused/retired) + content_visible. `active` is
    # kept as a synced mirror so legacy code (scheduler/CLI) keeps working.
    if "status" not in source_cols:
        conn.execute("ALTER TABLE sources ADD COLUMN status TEXT NOT NULL DEFAULT 'active'")
        # One-time backfill from the legacy flag, only when the column is new.
        conn.execute("UPDATE sources SET status = CASE WHEN active = 1 THEN 'active' ELSE 'paused' END")
    _add_missing_columns(
        conn,
        "sources",
        source_cols,
        {
            "content_visible": "INTEGER NOT NULL DEFAULT 1",
            "retry_after_at": "TEXT",
        },
    )

    # feedback.read_at (admin inbox read/unread) added later.
    fb_cols = _table_columns(conn, "feedback")
    if fb_cols:
        _add_missing_columns(conn, "feedback", fb_cols, {"read_at": "TEXT"})

    # feedback_replies.message_html (rendered Markdown subset) added later.
    rep_cols = _table_columns(conn, "feedback_replies")
    if rep_cols:
        _add_missing_columns(conn, "feedback_replies", rep_cols, {"message_html": "TEXT"})

    # article_summaries: structured "Why it belongs" fields added later.
    sum_cols = _table_columns(conn, "article_summaries")
    if sum_cols:
        _add_missing_columns(
            conn,
            "article_summaries",
            sum_cols,
            {"what_happened": "TEXT", "why_matters": "TEXT", "why_belongs": "TEXT"},
        )