Small joys backend: shared daily framework + On This Day engine

- goodnews/daily.py: shared helpers for the daily "small joys" (http_json, date-seeded
  deterministic pick, dedup key) so each joy is a small self-contained module.
- goodnews/onthisday.py: harvest today's MM-DD from Wikimedia's On-this-day feed →
  tone-filter to good/neutral (keyword floor + optional LLM refine) → pool → deterministic
  daily pick (idempotent, respects blocked/featured) → cached row. Network/LLM before any
  DB write. Multi-source ready (source column).
- db.py: onthisday_pool + daily_onthisday tables.
- api.py: GET /api/onthisday/today (edge-cacheable).
- cli.py: cycle step (run after Daily Art; --no-joys to skip), LLM client for tone refine.
- tests/test_onthisday.py: 7 tests (filter+dedup, pick idempotent, blocked/featured,
  never-empty, empty-pool, LLM-narrow). 382 backend tests green.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
This commit is contained in:
jay
2026-06-22 16:51:29 -04:00
parent 4739d87f4b
commit a7da8362ab
6 changed files with 351 additions and 2 deletions
+15 -1
View File
@@ -36,7 +36,7 @@ from fastapi.responses import FileResponse, HTMLResponse, RedirectResponse
from fastapi.staticfiles import StaticFiles
from pydantic import BaseModel
from . import art, auth, bloom, email_send, feeds, games, oauth_google, publishing, queries, share, sources, summarize
from . import art, auth, bloom, email_send, feeds, games, oauth_google, onthisday, publishing, queries, share, sources, summarize
from .localtime import local_today
from .markup import reply_html_to_text, sanitize_reply_html
from .db import connect
@@ -2290,6 +2290,20 @@ def create_app() -> FastAPI:
# Cached museum image: immutable for a given object id.
return FileResponse(str(matches[0]), headers={"Cache-Control": "public, max-age=31536000, immutable"})
@app.get("/api/onthisday/today")
def onthisday_today(response: Response) -> dict:
    """Return the current On This Day fact as JSON, or 404 when none has been picked.

    A hit is marked edge-cacheable (one fact a day, same for everyone); a miss is
    marked with the private cache policy so an empty answer is not pinned at the edge.
    """
    with get_conn() as conn:
        a = onthisday.get_today(conn)
    if not a:
        # Headers set on the injected Response are only merged into a normal return
        # value; an HTTPException produces its own response, so the cache header has
        # to travel on the exception itself.
        raise HTTPException(
            status_code=404,
            detail="No fact yet.",
            headers={"Cache-Control": _PRIVATE},
        )
    response.headers["Cache-Control"] = _EDGE_FEED  # one fact a day, same for everyone
    return {
        "date": a["feature_date"], "year": a["year"], "text": a["text"],
        "summary": a["summary"], "image_url": a["image_url"], "source_url": a["page_url"],
        "source": a["source"],
    }
@app.get("/api/replacement", response_model=Article | None)
def replacement(
exclude: str = Query("", description="comma-separated article ids already shown"),
+10 -1
View File
@@ -13,7 +13,7 @@ from .games import generate_daily_puzzles
from .localtime import local_today
from .dedup import DEFAULT_THRESHOLD, DEFAULT_WINDOW_DAYS, cluster_duplicates, dedup as run_dedup
from .geo import tag_articles as tag_geo
from . import art
from . import art, onthisday
from .enrich import enrich_brief_images, enrich_recent_images, enrich_summarized_images
from .summarize import generate_summary, get_summary
from .feeds import (
@@ -137,6 +137,7 @@ def main() -> None:
cycle_parser.add_argument("--no-geo", action="store_true", help="Skip tagging article subject-geography")
cycle_parser.add_argument("--geo-limit", type=int, default=60, help="Max articles to geo-tag per cycle")
cycle_parser.add_argument("--no-art", action="store_true", help="Skip the Daily Art pick")
cycle_parser.add_argument("--no-joys", action="store_true", help="Skip the small-joys picks (On This Day, etc.)")
cycle_parser.add_argument("--no-brief", action="store_true", help="Skip rebuilding today's brief")
cycle_parser.add_argument("--no-review", action="store_true", help="Skip recomputing source review flags")
cycle_parser.add_argument("--no-digest", action="store_true", help="Skip sending due daily digests")
@@ -560,6 +561,14 @@ def _run_cycle_locked(conn: sqlite3.Connection, args: argparse.Namespace) -> Non
except Exception as exc:
print(f"art: skipped ({exc})")
# On This Day: harvest + tone-filter today's date in history, then pick one good fact.
if not args.no_joys:
try:
o = onthisday.run_daily(conn, client=LocalModelClient.from_env())
print(f"onthisday: md={o['md']} picked={'yes' if o['picked'] else 'no'}")
except Exception as exc:
print(f"onthisday: skipped ({exc})")
if not args.no_brief:
today = local_today()
try:
+39
View File
@@ -0,0 +1,39 @@
"""Shared helpers for the daily "small joys" features — On This Day, Word of the Day,
Quote of the Day (and whatever calm little delights come next).
Each joy keeps its own pool + daily table, but they all share this skeleton:
harvest → pool → deterministic daily pick (date-seeded, least-recently-shown) →
cache row in a daily_* table → API → page.
This module holds only the genuinely shared bits (network + the deterministic pick), so a
new joy is a small self-contained module, not a copy-paste of plumbing. Network calls go
through http_json so tests can monkeypatch them.
"""
from __future__ import annotations
import hashlib
import json
import urllib.request
_UA = {"User-Agent": "upbeatBytes/1.0 (+https://upbeatbytes.com)"}
def http_json(url: str, timeout: int = 20) -> dict:
    """GET *url* and return its body parsed as JSON.

    The request is sent with this module's ``_UA`` headers and the body is decoded as
    UTF-8 before parsing. Errors raised by ``urllib`` or ``json`` propagate unchanged.
    """
    request = urllib.request.Request(url, headers=_UA)
    with urllib.request.urlopen(request, timeout=timeout) as response:
        body = response.read()
    return json.loads(body.decode("utf-8"))
def seeded_order(ids: list, date_str: str) -> list:
    """Return *ids* rotated by an offset derived from *date_str*.

    The offset is the SHA-256 of the date string, taken modulo ``len(ids)``, so the
    same date always yields the same rotation and different dates usually differ.
    An empty (falsy) *ids* is handed back as-is; otherwise a new list is returned and
    the input is left untouched.
    """
    if not ids:
        return ids
    digest = hashlib.sha256(date_str.encode()).digest()
    # Big-endian bytes -> int is the same number as parsing the hex digest base 16.
    offset = int.from_bytes(digest, "big") % len(ids)
    head, tail = ids[:offset], ids[offset:]
    return tail + head
def content_key(*parts) -> str:
    """Build a stable 24-hex-character key from *parts*.

    Each part is stringified (``None`` becomes the empty string), the pieces are joined
    with ``|`` and the first 24 hex characters of the SHA-256 of that text are returned.
    Used as a dedup key so re-harvesting the same item never adds a second row.
    """
    pieces = (str(part) if part is not None else "" for part in parts)
    joined = "|".join(pieces)
    digest = hashlib.sha256(joined.encode()).hexdigest()
    return digest[:24]
+30
View File
@@ -274,6 +274,36 @@ CREATE TABLE IF NOT EXISTS daily_art (
created_at TEXT NOT NULL DEFAULT CURRENT_TIMESTAMP
);
-- "Small joys" daily features. On This Day: a good/neutral thing that happened on
-- today's calendar date, harvested + tone-filtered into a pool, then one picked per day.
-- Candidate pool: one row per tone-filtered event, keyed by calendar day (md).
CREATE TABLE IF NOT EXISTS onthisday_pool (
id INTEGER PRIMARY KEY AUTOINCREMENT,
source TEXT NOT NULL DEFAULT 'wikimedia', -- multi-source ready (wikimedia | admin | ...)
md TEXT NOT NULL, -- 'MM-DD'
year INTEGER, -- year of the event as reported by the feed (nullable)
ckey TEXT NOT NULL UNIQUE, -- dedup hash so re-harvest never duplicates
text TEXT NOT NULL, -- the event sentence itself
summary TEXT, -- extract of the event's first linked page, if any
image_url TEXT, -- thumbnail of that page, if any
page_url TEXT, -- desktop URL of that page, if any
shown_at TEXT, -- last date this was the pick (no-soon-repeat)
blocked INTEGER NOT NULL DEFAULT 0, -- admin lever: never pick this
featured INTEGER NOT NULL DEFAULT 0, -- admin lever: prefer this for its date
added_at TEXT NOT NULL DEFAULT CURRENT_TIMESTAMP
);
-- The cached pick: at most one row per feature_date, holding a copy of the chosen
-- pool row's display fields as they were when the pick was made.
CREATE TABLE IF NOT EXISTS daily_onthisday (
feature_date TEXT PRIMARY KEY, -- 'YYYY-MM-DD'
pool_id INTEGER NOT NULL, -- onthisday_pool.id of the picked row (no FK declared)
source TEXT NOT NULL DEFAULT 'wikimedia',
md TEXT NOT NULL, -- 'MM-DD', copied from the pool row
year INTEGER,
text TEXT,
summary TEXT,
image_url TEXT,
page_url TEXT,
created_at TEXT NOT NULL DEFAULT CURRENT_TIMESTAMP
);
-- Privacy-respecting, first-party analytics. NO IP / user-agent / referrer / raw
-- URL. visitor_hash is a hash of a random localStorage token (never email/IP).
-- The UNIQUE key dedups to one row per (kind, article, visitor, day) — that both
+179
View File
@@ -0,0 +1,179 @@
"""On This Day — a good thing that happened on today's date in history.
Source: Wikimedia's "On this day" feed (free, CC) — comprehensive, and it carries a
summary extract + thumbnail per event, which makes for a rich page. Multi-source ready
(a `source` column), so admin-curated entries and any future source slot in cleanly.
Pipeline (mirrors Daily Art): harvest today's MM-DD events → tone-filter to good/neutral
(keyword floor + optional LLM refine) → pool → deterministic daily pick → cached row.
All network/LLM work happens before any DB write, so the write txn is brief.
"""
from __future__ import annotations
import json
import re
import sqlite3
from . import daily
from .localtime import local_today
WIKI_BASE = "https://en.wikipedia.org/api/rest_v1/feed/onthisday/events"
_NO_REPEAT_POOL = 40 # pick from the N least-recently-shown for today's date
# Keyword floor: drop the obviously grim before the LLM ever sees it (and a safety net
# for when the LLM is unavailable). Substring match on a lowercased event text.
# NOTE: because the test is a plain substring check, short entries also hit inside
# unrelated words ("war" in "award", "kill" in "skill", "dies" in "studies"), so the
# floor errs toward dropping events. Entries such as "assassinat", "invad" and "execut"
# are stems — presumably chosen to cover several word endings at once.
_NEG = (
"war", "kill", "died", "dies", "death", "deaths", "dead", "massacre", "genocide",
"disaster", "earthquake", "hurricane", "tsunami", "flood", "famine", "plague",
"bomb", "attack", "assassinat", "murder", "shooting", "shot dead", "riot", "crash",
"invad", "slaughter", "execut", "tragedy", "terror", "nuclear", "explosion",
"sank", "sink", "wreck", "epidemic", "pandemic", "outbreak", "hostage", "coup",
)
def _fetch_events(md: str) -> list[dict]:
    """Fetch the Wikimedia events for an 'MM-DD' day and normalize them.

    Each returned dict has the keys ``md``, ``year``, ``text``, ``summary``,
    ``image_url`` and ``page_url``. Events with no text are skipped; summary, image
    and page URL come from the event's first linked page and are ``None`` when absent.
    Network/parse errors from ``daily.http_json`` propagate to the caller.
    """
    month, day = md.split("-")
    payload = daily.http_json(f"{WIKI_BASE}/{month}/{day}")
    candidates = []
    for event in payload.get("events") or []:
        text = (event.get("text") or "").strip()
        if not text:
            continue
        # Only the first linked page is used; tolerate a missing/empty/null entry.
        pages = event.get("pages") or [{}]
        page = pages[0] or {}
        thumbnail = page.get("thumbnail") or {}
        desktop = (page.get("content_urls") or {}).get("desktop") or {}
        candidates.append({
            "md": md,
            "year": event.get("year"),
            "text": text,
            "summary": (page.get("extract") or "").strip() or None,
            "image_url": thumbnail.get("source") or None,
            "page_url": desktop.get("page") or None,
        })
    return candidates
def _keyword_ok(text: str) -> bool:
    """Return True when *text* contains none of the ``_NEG`` keywords.

    The comparison is a case-insensitive substring test, so a keyword also matches
    inside longer words (for example "war" inside "award").
    """
    lowered = text.lower()
    for needle in _NEG:
        if needle in lowered:
            return False
    return True
def _llm_keep(client, candidates: list[dict]) -> list[dict]:
"""Ask the LLM which candidates are genuinely positive/neutral. On any trouble,
keep the keyword-passed set (never lose the day to a model hiccup)."""
lines = [f"{i}: {c['text']}" for i, c in enumerate(candidates)]
user = (
"These are 'on this day' history events. Return the indices of the ones with a "
"POSITIVE or NEUTRAL, uplifting tone — discoveries, inventions, firsts, achievements, "
"peace, the arts, science, exploration, culture, milestones. EXCLUDE anything about "
"war, violence, disasters, death, or tragedy.\n\n" + "\n".join(lines) +
'\n\nReply with JSON only, exactly: {"keep": [<indices>]}'
)
txt = client.chat_text([{"role": "user", "content": user}])
m = re.search(r"\{.*\}", txt, re.S)
if not m:
return candidates
keep = json.loads(m.group(0)).get("keep", [])
idx = {int(i) for i in keep if str(i).lstrip("-").isdigit()}
sub = [c for i, c in enumerate(candidates) if i in idx]
return sub or candidates
def _tone_filter(candidates: list[dict], client=None) -> list[dict]:
    """Apply the keyword floor, then optionally let the LLM narrow the survivors.

    Without a client, or when nothing survives the keyword floor, the keyword-passed
    list is returned directly. If the LLM step raises, the keyword-passed list stands.
    """
    survivors = [cand for cand in candidates if _keyword_ok(cand["text"])]
    if not (client and survivors):
        return survivors
    try:
        return _llm_keep(client, survivors)
    except Exception:  # noqa: BLE001 — LLM is best-effort; keyword floor stands
        return survivors
def _pool_count(conn: sqlite3.Connection, md: str) -> int:
return conn.execute(
"SELECT COUNT(*) FROM onthisday_pool WHERE md=? AND blocked=0", (md,)
).fetchone()[0]
def harvest(conn: sqlite3.Connection, md: str | None = None, client=None) -> dict:
    """Fetch and tone-filter one day's events, then add the new ones to the pool.

    *md* is an 'MM-DD' string and defaults to today's local date. Rows are inserted
    with ``INSERT OR IGNORE`` on the content key, so repeating a harvest adds nothing
    new. If fetching fails for any reason, nothing is written and zeroed stats are
    returned. Returns ``{"md", "fetched", "kept", "added", "pool"}``.
    """
    md = md or local_today()[5:]
    try:
        events = _fetch_events(md)
    except Exception:  # noqa: BLE001
        return {"md": md, "fetched": 0, "kept": 0, "added": 0, "pool": _pool_count(conn, md)}

    # All network/LLM work finishes here, before the first write.
    kept = _tone_filter(events, client)
    rows = [
        (
            item["md"], item["year"],
            daily.content_key(item["md"], item["year"], item["text"]),
            item["text"], item["summary"], item["image_url"], item["page_url"],
        )
        for item in kept
    ]

    before = _pool_count(conn, md)
    conn.executemany(
        "INSERT OR IGNORE INTO onthisday_pool (source, md, year, ckey, text, summary, image_url, page_url) "
        "VALUES ('wikimedia', ?, ?, ?, ?, ?, ?, ?)",
        rows,
    )
    conn.commit()
    after = _pool_count(conn, md)
    return {
        "md": md,
        "fetched": len(events),
        "kept": len(kept),
        "added": after - before,
        "pool": after,
    }
def _candidates(conn: sqlite3.Connection, md: str) -> list[int]:
    """Return the pool ids eligible to be picked for the 'MM-DD' day *md*.

    If any non-blocked row is featured, only the featured ids are returned (ordered by
    id). Otherwise up to ``_NO_REPEAT_POOL`` non-blocked ids are returned, never-shown
    rows first, then oldest ``shown_at`` first, ties broken by id.
    """
    featured_ids = [
        row[0]
        for row in conn.execute(
            "SELECT id FROM onthisday_pool WHERE md=? AND blocked=0 AND featured=1 ORDER BY id", (md,)
        )
    ]
    if featured_ids:
        return featured_ids
    cursor = conn.execute(
        "SELECT id FROM onthisday_pool WHERE md=? AND blocked=0 "
        "ORDER BY shown_at IS NOT NULL, shown_at, id LIMIT ?",
        (md, _NO_REPEAT_POOL),
    )
    return [row[0] for row in cursor]
def pick_daily(conn: sqlite3.Connection, feature_date: str | None = None, force: bool = False) -> dict | None:
    """Choose and cache the fact for *feature_date* ('YYYY-MM-DD', default: today).

    An existing cached row is returned untouched unless *force* is set. Otherwise one
    id is taken from the date-seeded ordering of the candidates, its fields are copied
    into ``daily_onthisday`` (upsert on the date), and the pool row's ``shown_at`` is
    stamped with the date. Returns the stored row as a dict, or ``None`` when there
    are no candidates. Rows are read by column name, so the connection must use a
    mapping-style row factory.
    """
    feature_date = feature_date or local_today()
    md = feature_date[5:]
    select_cached = "SELECT * FROM daily_onthisday WHERE feature_date=?"

    cached = conn.execute(select_cached, (feature_date,)).fetchone()
    if cached and not force:
        return dict(cached)

    pool_ids = _candidates(conn, md)
    if not pool_ids:
        return None

    chosen_id = daily.seeded_order(pool_ids, feature_date)[0]
    chosen = conn.execute("SELECT * FROM onthisday_pool WHERE id=?", (chosen_id,)).fetchone()
    values = (
        feature_date, chosen["id"], chosen["source"], chosen["md"], chosen["year"],
        chosen["text"], chosen["summary"], chosen["image_url"], chosen["page_url"],
    )
    conn.execute(
        "INSERT INTO daily_onthisday (feature_date, pool_id, source, md, year, text, summary, image_url, page_url) "
        "VALUES (?,?,?,?,?,?,?,?,?) "
        "ON CONFLICT(feature_date) DO UPDATE SET pool_id=excluded.pool_id, source=excluded.source, "
        "year=excluded.year, text=excluded.text, summary=excluded.summary, image_url=excluded.image_url, "
        "page_url=excluded.page_url",
        values,
    )
    conn.execute("UPDATE onthisday_pool SET shown_at=? WHERE id=?", (feature_date, chosen_id))
    conn.commit()

    stored = conn.execute(select_cached, (feature_date,)).fetchone()
    return dict(stored)
def get_today(conn: sqlite3.Connection, feature_date: str | None = None) -> dict | None:
    """Return the cached fact for *feature_date*, falling back to the latest one.

    When no date is given, or that date has no row, the row with the greatest
    ``feature_date`` is returned instead. ``None`` only when the table is empty.
    """
    row = None
    if feature_date:
        row = conn.execute(
            "SELECT * FROM daily_onthisday WHERE feature_date=?", (feature_date,)
        ).fetchone()
    if not row:
        row = conn.execute(
            "SELECT * FROM daily_onthisday ORDER BY feature_date DESC LIMIT 1"
        ).fetchone()
    return dict(row) if row else None
def run_daily(conn: sqlite3.Connection, client=None) -> dict:
    """Cycle entry point: ensure today's date has a pool, then ensure it has a pick.

    Harvests only when the pool for today's 'MM-DD' has no non-blocked rows, then
    delegates to ``pick_daily`` (which returns the cached row if the day is already
    picked). *client* is the optional LLM client handed to the tone filter.

    Returns ``{"md", "harvested", "picked"}`` where ``harvested`` is the harvest stats
    dict (or ``None`` if no harvest ran) and ``picked`` is the picked fact's text (or
    ``None`` if nothing could be picked).
    """
    # Resolve "today" once so the harvest and the pick target the same date even if
    # the local day rolls over while the harvest's network/LLM calls are in flight.
    today = local_today()
    md = today[5:]
    harvested = None
    if _pool_count(conn, md) == 0:
        harvested = harvest(conn, md, client)
    picked = pick_daily(conn, today)
    return {"md": md, "harvested": harvested, "picked": picked["text"] if picked else None}