Small joys backend: shared daily framework + On This Day engine
- goodnews/daily.py: shared helpers for the daily "small joys" (http_json, date-seeded deterministic pick, dedup key) so each joy is a small self-contained module. - goodnews/onthisday.py: harvest today's MM-DD from Wikimedia's On-this-day feed → tone-filter to good/neutral (keyword floor + optional LLM refine) → pool → deterministic daily pick (idempotent, respects blocked/featured) → cached row. Network/LLM before any DB write. Multi-source ready (source column). - db.py: onthisday_pool + daily_onthisday tables. - api.py: GET /api/onthisday/today (edge-cacheable). - cli.py: cycle step (run after Daily Art; --no-joys to skip), LLM client for tone refine. - tests/test_onthisday.py: 7 tests (filter+dedup, pick idempotent, blocked/featured, never-empty, empty-pool, LLM-narrow). 382 backend tests green. Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
This commit is contained in:
+15
-1
@@ -36,7 +36,7 @@ from fastapi.responses import FileResponse, HTMLResponse, RedirectResponse
|
||||
from fastapi.staticfiles import StaticFiles
|
||||
from pydantic import BaseModel
|
||||
|
||||
from . import art, auth, bloom, email_send, feeds, games, oauth_google, publishing, queries, share, sources, summarize
|
||||
from . import art, auth, bloom, email_send, feeds, games, oauth_google, onthisday, publishing, queries, share, sources, summarize
|
||||
from .localtime import local_today
|
||||
from .markup import reply_html_to_text, sanitize_reply_html
|
||||
from .db import connect
|
||||
@@ -2290,6 +2290,20 @@ def create_app() -> FastAPI:
|
||||
# Cached museum image: immutable for a given object id.
|
||||
return FileResponse(str(matches[0]), headers={"Cache-Control": "public, max-age=31536000, immutable"})
|
||||
|
||||
@app.get("/api/onthisday/today")
def onthisday_today(response: Response) -> dict:
    """Serve the cached On This Day fact as JSON.

    The row comes from ``onthisday.get_today(conn)`` (called without a date).
    When a row exists, the response carries the ``_EDGE_FEED`` Cache-Control
    value and a flat payload: ``date``, ``year``, ``text``, ``summary``,
    ``image_url``, ``source_url`` (the row's ``page_url``) and ``source``.

    Raises:
        HTTPException: 404 with detail "No fact yet." when no row exists. The
            error response carries the ``_PRIVATE`` Cache-Control value.
    """
    with get_conn() as conn:
        fact = onthisday.get_today(conn)
        if not fact:
            # Headers set on the injected ``response`` are only merged into a
            # normally returned response. A raised HTTPException is rendered
            # as a separate response built from the exception itself, so the
            # cache policy must also be passed via ``headers=`` or the 404
            # goes out without it.
            response.headers["Cache-Control"] = _PRIVATE
            raise HTTPException(
                status_code=404,
                detail="No fact yet.",
                headers={"Cache-Control": _PRIVATE},
            )
        response.headers["Cache-Control"] = _EDGE_FEED  # one fact a day, same for everyone
        return {
            "date": fact["feature_date"],
            "year": fact["year"],
            "text": fact["text"],
            "summary": fact["summary"],
            "image_url": fact["image_url"],
            "source_url": fact["page_url"],
            "source": fact["source"],
        }
|
||||
|
||||
@app.get("/api/replacement", response_model=Article | None)
|
||||
def replacement(
|
||||
exclude: str = Query("", description="comma-separated article ids already shown"),
|
||||
|
||||
+10
-1
@@ -13,7 +13,7 @@ from .games import generate_daily_puzzles
|
||||
from .localtime import local_today
|
||||
from .dedup import DEFAULT_THRESHOLD, DEFAULT_WINDOW_DAYS, cluster_duplicates, dedup as run_dedup
|
||||
from .geo import tag_articles as tag_geo
|
||||
from . import art
|
||||
from . import art, onthisday
|
||||
from .enrich import enrich_brief_images, enrich_recent_images, enrich_summarized_images
|
||||
from .summarize import generate_summary, get_summary
|
||||
from .feeds import (
|
||||
@@ -137,6 +137,7 @@ def main() -> None:
|
||||
cycle_parser.add_argument("--no-geo", action="store_true", help="Skip tagging article subject-geography")
|
||||
cycle_parser.add_argument("--geo-limit", type=int, default=60, help="Max articles to geo-tag per cycle")
|
||||
cycle_parser.add_argument("--no-art", action="store_true", help="Skip the Daily Art pick")
|
||||
cycle_parser.add_argument("--no-joys", action="store_true", help="Skip the small-joys picks (On This Day, etc.)")
|
||||
cycle_parser.add_argument("--no-brief", action="store_true", help="Skip rebuilding today's brief")
|
||||
cycle_parser.add_argument("--no-review", action="store_true", help="Skip recomputing source review flags")
|
||||
cycle_parser.add_argument("--no-digest", action="store_true", help="Skip sending due daily digests")
|
||||
@@ -560,6 +561,14 @@ def _run_cycle_locked(conn: sqlite3.Connection, args: argparse.Namespace) -> Non
|
||||
except Exception as exc:
|
||||
print(f"art: skipped ({exc})")
|
||||
|
||||
# On This Day: harvest + tone-filter today's date in history, then pick one good fact.
|
||||
if not args.no_joys:
|
||||
try:
|
||||
o = onthisday.run_daily(conn, client=LocalModelClient.from_env())
|
||||
print(f"onthisday: md={o['md']} picked={'yes' if o['picked'] else 'no'}")
|
||||
except Exception as exc:
|
||||
print(f"onthisday: skipped ({exc})")
|
||||
|
||||
if not args.no_brief:
|
||||
today = local_today()
|
||||
try:
|
||||
|
||||
@@ -0,0 +1,39 @@
|
||||
"""Shared helpers for the daily "small joys" features — On This Day, Word of the Day,
|
||||
Quote of the Day (and whatever calm little delights come next).
|
||||
|
||||
Each joy keeps its own pool + daily table, but they all share this skeleton:
|
||||
harvest → pool → deterministic daily pick (date-seeded, least-recently-shown) →
|
||||
cache row in a daily_* table → API → page.
|
||||
|
||||
This module holds only the genuinely shared bits (network + the deterministic pick), so a
|
||||
new joy is a small self-contained module, not a copy-paste of plumbing. Network calls go
|
||||
through http_json so tests can monkeypatch them.
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import hashlib
|
||||
import json
|
||||
import urllib.request
|
||||
|
||||
# Header set sent with every outbound request made through http_json.
_UA = {"User-Agent": "upbeatBytes/1.0 (+https://upbeatbytes.com)"}


def http_json(url: str, timeout: int = 20) -> dict:
    """Fetch ``url`` and return its body parsed as JSON.

    The request is sent with the module's User-Agent header, the body is
    decoded as UTF-8 and handed to ``json.loads``. Nothing is caught here:
    errors from urllib, the decode, or the JSON parse reach the caller.

    Args:
        url: Address to fetch.
        timeout: Seconds passed to ``urlopen``.
    """
    request = urllib.request.Request(url, headers=_UA)
    with urllib.request.urlopen(request, timeout=timeout) as response:
        body = response.read()
    return json.loads(body.decode("utf-8"))
|
||||
|
||||
|
||||
def seeded_order(ids: list, date_str: str) -> list:
    """Return ``ids`` rotated by an offset derived from ``date_str``.

    The offset is the SHA-256 of the date string, read as one big integer,
    modulo the list length. The same date and list therefore always give the
    same rotation, while relative order inside the list is preserved. An empty
    (falsy) ``ids`` is handed back unchanged; otherwise a new list is built
    and the input is left untouched.
    """
    if not ids:
        return ids
    digest = hashlib.sha256(date_str.encode()).digest()
    # Big-endian bytes give the same integer as parsing the hex digest.
    offset = int.from_bytes(digest, "big") % len(ids)
    head, tail = ids[:offset], ids[offset:]
    return tail + head
|
||||
|
||||
|
||||
def content_key(*parts) -> str:
    """Build a stable 24-hex-character key from ``parts``.

    Each part is stringified (``None`` becomes the empty string), the pieces
    are joined with ``|``, and the first 24 characters of the SHA-256 hex
    digest are returned. Equal inputs always produce equal keys, which is what
    lets a pool use the key for deduplication.
    """
    fields = [str(part) if part is not None else "" for part in parts]
    joined = "|".join(fields)
    digest = hashlib.sha256(joined.encode()).hexdigest()
    return digest[:24]
|
||||
@@ -274,6 +274,36 @@ CREATE TABLE IF NOT EXISTS daily_art (
|
||||
created_at TEXT NOT NULL DEFAULT CURRENT_TIMESTAMP
|
||||
);
|
||||
|
||||
-- "Small joys" daily features. On This Day: a good/neutral thing that happened on
|
||||
-- today's calendar date, harvested + tone-filtered into a pool, then one picked per day.
|
||||
CREATE TABLE IF NOT EXISTS onthisday_pool (
|
||||
id INTEGER PRIMARY KEY AUTOINCREMENT,
|
||||
source TEXT NOT NULL DEFAULT 'wikimedia', -- multi-source ready (wikimedia | admin | ...)
|
||||
md TEXT NOT NULL, -- 'MM-DD'
|
||||
year INTEGER,
|
||||
ckey TEXT NOT NULL UNIQUE, -- dedup hash so re-harvest never duplicates
|
||||
text TEXT NOT NULL,
|
||||
summary TEXT,
|
||||
image_url TEXT,
|
||||
page_url TEXT,
|
||||
shown_at TEXT, -- last date this was the pick (no-soon-repeat)
|
||||
blocked INTEGER NOT NULL DEFAULT 0, -- admin lever: never pick this
|
||||
featured INTEGER NOT NULL DEFAULT 0, -- admin lever: prefer this for its date
|
||||
added_at TEXT NOT NULL DEFAULT CURRENT_TIMESTAMP
|
||||
);
|
||||
CREATE TABLE IF NOT EXISTS daily_onthisday (
|
||||
feature_date TEXT PRIMARY KEY, -- 'YYYY-MM-DD'
|
||||
pool_id INTEGER NOT NULL,
|
||||
source TEXT NOT NULL DEFAULT 'wikimedia',
|
||||
md TEXT NOT NULL,
|
||||
year INTEGER,
|
||||
text TEXT,
|
||||
summary TEXT,
|
||||
image_url TEXT,
|
||||
page_url TEXT,
|
||||
created_at TEXT NOT NULL DEFAULT CURRENT_TIMESTAMP
|
||||
);
|
||||
|
||||
-- Privacy-respecting, first-party analytics. NO IP / user-agent / referrer / raw
|
||||
-- URL. visitor_hash is a hash of a random localStorage token (never email/IP).
|
||||
-- The UNIQUE key dedups to one row per (kind, article, visitor, day) — that both
|
||||
|
||||
@@ -0,0 +1,179 @@
|
||||
"""On This Day — a good thing that happened on today's date in history.
|
||||
|
||||
Source: Wikimedia's "On this day" feed (free, CC) — comprehensive, and it carries a
|
||||
summary extract + thumbnail per event, which makes for a rich page. Multi-source ready
|
||||
(a `source` column), so admin-curated entries and any future source slot in cleanly.
|
||||
|
||||
Pipeline (mirrors Daily Art): harvest today's MM-DD events → tone-filter to good/neutral
|
||||
(keyword floor + optional LLM refine) → pool → deterministic daily pick → cached row.
|
||||
All network/LLM work happens before any DB write, so the write txn is brief.
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
import re
|
||||
import sqlite3
|
||||
|
||||
from . import daily
|
||||
from .localtime import local_today
|
||||
|
||||
# Root of the Wikimedia "On this day" events feed; /MM/DD is appended per request.
WIKI_BASE = "https://en.wikipedia.org/api/rest_v1/feed/onthisday/events"

# The daily pick is drawn from this many least-recently-shown pool rows for the date.
_NO_REPEAT_POOL = 40

# Keyword floor for the tone filter. An event is dropped when its lowercased
# text contains any of these fragments anywhere. This runs before the optional
# LLM pass and is the only filter when no LLM is available. Several entries are
# deliberate stems ("assassinat", "invad", "execut").
_NEG = (
    "war", "kill", "died", "dies", "death", "deaths",
    "dead", "massacre", "genocide",
    "disaster", "earthquake", "hurricane", "tsunami",
    "flood", "famine", "plague",
    "bomb", "attack", "assassinat", "murder",
    "shooting", "shot dead", "riot", "crash",
    "invad", "slaughter", "execut", "tragedy",
    "terror", "nuclear", "explosion",
    "sank", "sink", "wreck", "epidemic",
    "pandemic", "outbreak", "hostage", "coup",
)
|
||||
|
||||
|
||||
def _fetch_events(md: str) -> list[dict]:
    """Fetch one MM-DD from the Wikimedia feed and flatten it to candidates.

    Each returned dict has ``md``, ``year``, ``text``, ``summary``,
    ``image_url`` and ``page_url``. Events without text are dropped. Only the
    first entry of an event's ``pages`` list is used for the summary,
    thumbnail and desktop page URL; any of those that is missing or empty is
    stored as ``None``. Errors from the HTTP call are not caught here.
    """
    month, day = md.split("-")
    payload = daily.http_json(f"{WIKI_BASE}/{month}/{day}")
    candidates = []
    for event in payload.get("events") or []:
        text = (event.get("text") or "").strip()
        if not text:
            continue
        # Tolerate a missing/empty pages list as well as a null first entry.
        pages = event.get("pages") or [{}]
        page = pages[0] or {}
        thumbnail = page.get("thumbnail") or {}
        desktop_urls = (page.get("content_urls") or {}).get("desktop") or {}
        candidates.append({
            "md": md,
            "year": event.get("year"),
            "text": text,
            "summary": (page.get("extract") or "").strip() or None,
            "image_url": thumbnail.get("source") or None,
            "page_url": desktop_urls.get("page") or None,
        })
    return candidates
|
||||
|
||||
|
||||
def _keyword_ok(text: str) -> bool:
    """Return True when ``text`` contains none of the ``_NEG`` fragments.

    Matching is a plain case-insensitive substring test, so a fragment also
    hits inside a longer word (for example "award" contains "war").
    """
    lowered = text.lower()
    for fragment in _NEG:
        if fragment in lowered:
            return False
    return True
|
||||
|
||||
|
||||
def _llm_keep(client, candidates: list[dict]) -> list[dict]:
|
||||
"""Ask the LLM which candidates are genuinely positive/neutral. On any trouble,
|
||||
keep the keyword-passed set (never lose the day to a model hiccup)."""
|
||||
lines = [f"{i}: {c['text']}" for i, c in enumerate(candidates)]
|
||||
user = (
|
||||
"These are 'on this day' history events. Return the indices of the ones with a "
|
||||
"POSITIVE or NEUTRAL, uplifting tone — discoveries, inventions, firsts, achievements, "
|
||||
"peace, the arts, science, exploration, culture, milestones. EXCLUDE anything about "
|
||||
"war, violence, disasters, death, or tragedy.\n\n" + "\n".join(lines) +
|
||||
'\n\nReply with JSON only, exactly: {"keep": [<indices>]}'
|
||||
)
|
||||
txt = client.chat_text([{"role": "user", "content": user}])
|
||||
m = re.search(r"\{.*\}", txt, re.S)
|
||||
if not m:
|
||||
return candidates
|
||||
keep = json.loads(m.group(0)).get("keep", [])
|
||||
idx = {int(i) for i in keep if str(i).lstrip("-").isdigit()}
|
||||
sub = [c for i, c in enumerate(candidates) if i in idx]
|
||||
return sub or candidates
|
||||
|
||||
|
||||
def _tone_filter(candidates: list[dict], client=None) -> list[dict]:
    """Apply the keyword floor, then the optional LLM refinement.

    Candidates whose text fails ``_keyword_ok`` are dropped first. If a client
    is supplied and anything survived, ``_llm_keep`` narrows the survivors.
    Any exception from that step is swallowed on purpose and the
    keyword-filtered list is returned instead.
    """
    survivors = [cand for cand in candidates if _keyword_ok(cand["text"])]
    if not (client and survivors):
        return survivors
    try:
        return _llm_keep(client, survivors)
    except Exception:  # noqa: BLE001 — LLM is best-effort; keyword floor stands
        return survivors
|
||||
|
||||
|
||||
def _pool_count(conn: sqlite3.Connection, md: str) -> int:
|
||||
return conn.execute(
|
||||
"SELECT COUNT(*) FROM onthisday_pool WHERE md=? AND blocked=0", (md,)
|
||||
).fetchone()[0]
|
||||
|
||||
|
||||
def harvest(conn: sqlite3.Connection, md: str | None = None, client=None) -> dict:
    """Fetch, tone-filter and store one date's events in ``onthisday_pool``.

    ``md`` defaults to the MM-DD part of ``local_today()``. Rows are written
    with ``INSERT OR IGNORE`` under a content key built from (md, year, text),
    so repeating a harvest does not duplicate rows. Fetching and filtering
    finish before the insert and commit.

    Returns:
        A summary dict with ``md``, ``fetched``, ``kept``, ``added`` (growth
        of the non-blocked pool for this date) and ``pool`` (its size
        afterwards). If fetching raises anything, nothing is written and the
        three counters are 0.
    """
    md = md or local_today()[5:]
    try:
        events = _fetch_events(md)
    except Exception:  # noqa: BLE001
        return {"md": md, "fetched": 0, "kept": 0, "added": 0, "pool": _pool_count(conn, md)}

    # All network/LLM work is finished by the time the write below starts.
    kept = _tone_filter(events, client)
    rows = [
        (
            cand["md"],
            cand["year"],
            daily.content_key(cand["md"], cand["year"], cand["text"]),
            cand["text"],
            cand["summary"],
            cand["image_url"],
            cand["page_url"],
        )
        for cand in kept
    ]

    size_before = _pool_count(conn, md)
    conn.executemany(
        "INSERT OR IGNORE INTO onthisday_pool (source, md, year, ckey, text, summary, image_url, page_url) "
        "VALUES ('wikimedia', ?, ?, ?, ?, ?, ?, ?)",
        rows,
    )
    conn.commit()
    size_after = _pool_count(conn, md)

    return {
        "md": md,
        "fetched": len(events),
        "kept": len(kept),
        "added": size_after - size_before,
        "pool": size_after,
    }
|
||||
|
||||
|
||||
def _candidates(conn: sqlite3.Connection, md: str) -> list[int]:
    """Return the pool ids eligible to be picked for ``md``.

    Blocked rows are never eligible. If any remaining row is featured, only
    the featured ids are returned (ordered by id). Otherwise the result is up
    to ``_NO_REPEAT_POOL`` ids, never-shown rows first, then oldest
    ``shown_at`` first, ties broken by id.
    """
    featured_ids = [
        row[0]
        for row in conn.execute(
            "SELECT id FROM onthisday_pool WHERE md=? AND blocked=0 AND featured=1 ORDER BY id", (md,)
        )
    ]
    if featured_ids:
        return featured_ids

    # "shown_at IS NOT NULL" sorts as 0 for never-shown rows, putting them first.
    cursor = conn.execute(
        "SELECT id FROM onthisday_pool WHERE md=? AND blocked=0 "
        "ORDER BY shown_at IS NOT NULL, shown_at, id LIMIT ?",
        (md, _NO_REPEAT_POOL),
    )
    return [row[0] for row in cursor]
|
||||
|
||||
|
||||
def pick_daily(conn: sqlite3.Connection, feature_date: str | None = None, force: bool = False) -> dict | None:
    """Choose and cache the fact for ``feature_date`` (default: ``local_today()``).

    If a ``daily_onthisday`` row already exists for the date and ``force`` is
    false, that row is returned as-is. Otherwise the first id of the
    date-seeded ordering of ``_candidates`` is chosen, its pool row is upserted
    into ``daily_onthisday``, the pool row's ``shown_at`` is set to the date,
    and the transaction is committed.

    Rows are read by column name, so the connection must yield name-indexable
    rows (e.g. ``sqlite3.Row``).

    Returns:
        The stored ``daily_onthisday`` row as a dict, or ``None`` when the
        date's MM-DD has no eligible pool rows.
    """
    feature_date = feature_date or local_today()
    lookup = "SELECT * FROM daily_onthisday WHERE feature_date=?"

    cached = conn.execute(lookup, (feature_date,)).fetchone()
    if cached and not force:
        return dict(cached)

    ids = _candidates(conn, feature_date[5:])  # [5:] is the MM-DD part
    if not ids:
        return None

    pick_id = daily.seeded_order(ids, feature_date)[0]
    pool_row = conn.execute("SELECT * FROM onthisday_pool WHERE id=?", (pick_id,)).fetchone()
    copied = ("id", "source", "md", "year", "text", "summary", "image_url", "page_url")
    params = (feature_date,) + tuple(pool_row[column] for column in copied)

    conn.execute(
        "INSERT INTO daily_onthisday (feature_date, pool_id, source, md, year, text, summary, image_url, page_url) "
        "VALUES (?,?,?,?,?,?,?,?,?) "
        "ON CONFLICT(feature_date) DO UPDATE SET pool_id=excluded.pool_id, source=excluded.source, "
        "year=excluded.year, text=excluded.text, summary=excluded.summary, image_url=excluded.image_url, "
        "page_url=excluded.page_url",
        params,
    )
    conn.execute("UPDATE onthisday_pool SET shown_at=? WHERE id=?", (feature_date, pick_id))
    conn.commit()

    stored = conn.execute(lookup, (feature_date,)).fetchone()
    return dict(stored)
|
||||
|
||||
|
||||
def get_today(conn: sqlite3.Connection, feature_date: str | None = None) -> dict | None:
    """Return a cached ``daily_onthisday`` row as a dict.

    When ``feature_date`` is given and has a row, that row is returned. In
    every other case (no date given, or no row for it) the row with the
    greatest ``feature_date`` is returned, so callers get something whenever
    the table is non-empty. Returns ``None`` only when the table has no rows.
    """
    lookups = []
    if feature_date:
        lookups.append(("SELECT * FROM daily_onthisday WHERE feature_date=?", (feature_date,)))
    # Fallback: the most recent cached fact.
    lookups.append(("SELECT * FROM daily_onthisday ORDER BY feature_date DESC LIMIT 1", ()))

    for sql, params in lookups:
        row = conn.execute(sql, params).fetchone()
        if row:
            return dict(row)
    return None
|
||||
|
||||
|
||||
def run_daily(conn: sqlite3.Connection, client=None) -> dict:
    """Cycle entry point: make sure today has a pool, then make sure it has a pick.

    A harvest runs only when today's MM-DD has no non-blocked pool rows;
    ``pick_daily`` is then called for today either way.

    Returns:
        A dict with ``md``, ``harvested`` (the harvest summary, or ``None``
        when no harvest ran) and ``picked`` (the picked fact's text, or
        ``None`` when nothing could be picked).
    """
    md = local_today()[5:]
    pool_is_empty = _pool_count(conn, md) == 0
    harvested = harvest(conn, md, client) if pool_is_empty else None
    picked = pick_daily(conn)
    picked_text = picked["text"] if picked else None
    return {"md": md, "harvested": harvested, "picked": picked_text}
|
||||
Reference in New Issue
Block a user