Files
upbeatBytes/goodnews/onthisday.py
T
thejayman77 6c10ad99a9 On This Day: serve sharp images (originalimage, not the 330px thumbnail)
The Wikimedia feed's thumbnail is 330px, which upscales blurry in our hero. Use
originalimage.source instead — it's reliably sharp. (Can't just request a bigger
thumbnail width: for very large source images Wikimedia only serves pre-generated
bucket sizes and 400s on arbitrary widths — e.g. 500px ok, 800/1024px fail.)

- onthisday._best_image() prefers originalimage, falls back to the thumbnail.
- scripts/otd_image_upsize_backfill.py re-fetches each stored MM-DD and upgrades
  image_url in onthisday_pool + daily_onthisday in place (ran on host: pool + 6
  daily rows now sharp; today's hero verified 200). Only the /onthisday hero
  loads this image (home card is text-only), so larger files are a single-page,
  one-time load.
- test_best_image locks the prefer-original/fallback behavior.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
2026-06-27 17:07:37 -04:00

203 lines
9.4 KiB
Python

"""On This Day — a good thing that happened on today's date in history.
Source: Wikimedia's "On this day" feed (free, CC) — comprehensive, and it carries a
summary extract + thumbnail per event, which makes for a rich page. Multi-source ready
(a `source` column), so admin-curated entries and any future source slot in cleanly.
Pipeline (mirrors Daily Art): harvest today's MM-DD events → tone-filter to good/neutral
(keyword floor + optional LLM refine) → pool → deterministic daily pick → cached row.
All network/LLM work happens before any DB write, so the write txn is brief.
"""
from __future__ import annotations
import json
import re
import sqlite3
from . import daily
from .localtime import local_today
# Base URL of Wikimedia's "On this day" events feed; "/MM/DD" is appended per request.
WIKI_BASE = "https://en.wikipedia.org/api/rest_v1/feed/onthisday/events"

# Size of the pick window: choose among the N least-recently-shown entries for a date.
_NO_REPEAT_POOL = 40

# Keyword floor: drop the obviously grim before the LLM ever sees it (and a safety net
# for when the LLM is unavailable). Entries are lowercase and are matched as plain
# substrings of the lowercased event text, so stems such as "assassinat", "invad" and
# "execut" cover their inflected forms.
_NEG = (
    "war", "kill", "died", "dies", "death", "deaths", "dead",
    "massacre", "genocide", "disaster", "earthquake", "hurricane",
    "tsunami", "flood", "famine", "plague", "bomb", "attack",
    "assassinat", "murder", "shooting", "shot dead", "riot", "crash",
    "invad", "slaughter", "execut", "tragedy", "terror", "nuclear",
    "explosion", "sank", "sink", "wreck", "epidemic", "pandemic",
    "outbreak", "hostage", "coup",
)
# Why not just ask for a wider thumbnail? The feed's `thumbnail` is 330px and looks
# blurry when upscaled in our hero, but for very large source images Wikimedia only
# serves pre-generated bucket sizes and answers 400 to arbitrary widths (e.g. 500px ok,
# 800/1024px fail, 1280px ok). The feed's `originalimage` is a sharp, full-size URL
# that's always valid, so it is preferred, with the thumbnail as the fallback.
def _best_image(page: dict) -> str | None:
    """Return the sharpest reliably-served image URL for a feed page entry.

    Looks at ``page["originalimage"]["source"]`` first and then
    ``page["thumbnail"]["source"]``. Missing or null entries are tolerated.
    Returns ``None`` when neither yields a non-empty value.
    """
    # Both lookups are evaluated up front, then the first truthy one wins.
    sources = [
        (page.get(field) or {}).get("source")
        for field in ("originalimage", "thumbnail")
    ]
    return next((url for url in sources if url), None)
def _fetch_events(md: str) -> list[dict]:
    """Fetch all events for a MM-DD from Wikimedia, normalized to our candidate shape.

    Each returned dict has the keys ``md``, ``year``, ``text``, ``summary``,
    ``image_url`` and ``page_url``. Events whose text is empty after stripping
    are skipped. The summary, image and page link are taken from the event's
    first ``pages`` entry, and each is ``None`` when absent or empty.
    Errors from the HTTP helper are not caught here.
    """
    month, day = md.split("-")
    payload = daily.http_json(f"{WIKI_BASE}/{month}/{day}")

    normalized = []
    for event in payload.get("events") or []:
        headline = (event.get("text") or "").strip()
        if headline:
            # Only the first linked page is used; tolerate a missing/empty/null list entry.
            lead = (event.get("pages") or [{}])[0] or {}
            extract = (lead.get("extract") or "").strip()
            desktop = (lead.get("content_urls") or {}).get("desktop") or {}
            normalized.append({
                "md": md,
                "year": event.get("year"),
                "text": headline,
                "summary": extract or None,
                "image_url": _best_image(lead),
                "page_url": desktop.get("page") or None,
            })
    return normalized
def _keyword_ok(text: str) -> bool:
    """Return True when no term from the keyword floor occurs in ``text``.

    The comparison is a case-insensitive plain substring test, so a term also
    matches inside longer words (e.g. "war" is a substring of "award").
    """
    lowered = text.lower()
    for term in _NEG:
        if term in lowered:
            return False
    return True
def _llm_keep(client, candidates: list[dict]) -> list[dict]:
    """Ask the LLM which candidates are genuinely positive and return that subset.

    ``client`` must expose ``chat_text(messages)`` returning the model's reply
    as text. Each candidate must carry a ``"text"`` entry.

    The reply is expected to contain a JSON object ``{"keep": [<indices>]}``.
    Whenever the reply cannot be used — no JSON object in it, malformed JSON,
    ``keep`` missing or not a list, or no index matching a candidate — the
    input list is returned unchanged, so a model hiccup never loses the day.
    Exceptions raised by ``client.chat_text`` itself are not caught here.
    """
    if not candidates:
        return candidates  # nothing to judge; skip the model call

    lines = [f"{i}: {c['text']}" for i, c in enumerate(candidates)]
    user = (
        "These are 'on this day' history events. Return the indices of the ones that are "
        "GENUINELY UPLIFTING — a reader should feel a small lift of wonder, hope, or delight. "
        "Keep: discoveries, inventions, scientific breakthroughs, the arts and culture, "
        "exploration, human achievement, acts of courage or kindness, milestones of progress "
        "(rights won, things built, records set). EXCLUDE war, violence, disasters, death, or "
        "tragedy, AND exclude merely procedural or political-administrative events that carry no "
        "warmth (a coronation or accession, a treaty signing, an election, a law passed, a "
        "boundary or office change). When unsure whether something is truly uplifting, leave it "
        "out.\n\n" + "\n".join(lines) +
        '\n\nReply with JSON only, exactly: {"keep": [<indices>]}'
    )
    txt = client.chat_text([{"role": "user", "content": user}])
    if not isinstance(txt, str):
        return candidates

    # Models often wrap the JSON in prose; grab the outermost {...} span.
    m = re.search(r"\{.*\}", txt, re.S)
    if not m:
        return candidates
    try:
        payload = json.loads(m.group(0))
    except ValueError:  # json.JSONDecodeError is a ValueError subclass
        return candidates

    keep = payload.get("keep") if isinstance(payload, dict) else None
    if not isinstance(keep, list):
        return candidates

    # Accept plain ints and decimal strings; ignore anything else (null, floats, words).
    idx = set()
    for raw in keep:
        if isinstance(raw, bool):
            continue
        if isinstance(raw, int):
            idx.add(raw)
        elif isinstance(raw, str) and raw.isdecimal():
            idx.add(int(raw))

    sub = [c for i, c in enumerate(candidates) if i in idx]
    # An empty selection falls back to the full input rather than emptying the day.
    return sub or candidates
def _tone_filter(candidates: list[dict], client=None) -> list[dict]:
    """Reduce candidates to the good/neutral ones.

    The keyword floor is applied first. When a client is supplied and
    something survived, the LLM refines the survivors; if that step raises,
    the keyword-passed set is returned as-is.
    """
    survivors = [c for c in candidates if _keyword_ok(c["text"])]
    if not (client and survivors):
        return survivors
    try:
        return _llm_keep(client, survivors)
    except Exception:  # noqa: BLE001 — LLM is best-effort; keyword floor stands
        return survivors
def _pool_count(conn: sqlite3.Connection, md: str) -> int:
    """Count the non-blocked ``onthisday_pool`` rows stored for a MM-DD."""
    query = "SELECT COUNT(*) FROM onthisday_pool WHERE md=? AND blocked=0"
    first_row = conn.execute(query, (md,)).fetchone()
    return first_row[0]
def harvest(conn: sqlite3.Connection, md: str | None = None, client=None) -> dict:
    """Fetch + tone-filter a MM-DD's events into the pool.

    ``md`` defaults to today's local MM-DD. Rows are written with
    ``INSERT OR IGNORE`` under source ``'wikimedia'``, keyed by a content key,
    so re-running for the same day is meant to add nothing new (presumably via
    a uniqueness constraint on the pool table — schema not visible here).

    Returns a summary dict with ``md``, ``fetched``, ``kept``, ``added`` and
    ``pool`` (the non-blocked pool size afterwards). If the fetch raises, the
    failure is swallowed and a zero-count summary is returned instead.
    """
    md = md or local_today()[5:]

    try:
        events = _fetch_events(md)
    except Exception:  # noqa: BLE001
        return {"md": md, "fetched": 0, "kept": 0, "added": 0, "pool": _pool_count(conn, md)}

    # All network/LLM work finishes here, before the first write.
    survivors = _tone_filter(events, client)
    pool_before = _pool_count(conn, md)

    insert_sql = (
        "INSERT OR IGNORE INTO onthisday_pool (source, md, year, ckey, text, summary, image_url, page_url) "
        "VALUES ('wikimedia', ?, ?, ?, ?, ?, ?, ?)"
    )
    params = []
    for item in survivors:
        ckey = daily.content_key(item["md"], item["year"], item["text"])
        params.append((
            item["md"], item["year"], ckey,
            item["text"], item["summary"], item["image_url"], item["page_url"],
        ))
    conn.executemany(insert_sql, params)
    conn.commit()

    pool_after = _pool_count(conn, md)
    return {
        "md": md,
        "fetched": len(events),
        "kept": len(survivors),
        "added": pool_after - pool_before,
        "pool": pool_after,
    }
def _candidates(conn: sqlite3.Connection, md: str, avoid: int | None = None) -> list[int]:
    """Return the pool ids eligible to be picked for a MM-DD.

    If any non-blocked row for the date is marked featured, only those ids are
    eligible. Otherwise the window is the ``_NO_REPEAT_POOL`` non-blocked rows
    ordered never-shown first, then by oldest ``shown_at``, then by id.
    ``avoid`` removes one specific id (admin re-pick) unless that would leave
    nothing to pick from.
    """
    featured_sql = (
        "SELECT id FROM onthisday_pool WHERE md=? AND blocked=0 AND featured=1 ORDER BY id"
    )
    ids = [row[0] for row in conn.execute(featured_sql, (md,)).fetchall()]

    if not ids:
        # `shown_at IS NOT NULL` sorts 0 (never shown) ahead of 1 (shown before).
        window_sql = (
            "SELECT id FROM onthisday_pool WHERE md=? AND blocked=0 "
            "ORDER BY shown_at IS NOT NULL, shown_at, id LIMIT ?"
        )
        ids = [row[0] for row in conn.execute(window_sql, (md, _NO_REPEAT_POOL)).fetchall()]

    if avoid is None:
        return ids
    remaining = [pool_id for pool_id in ids if pool_id != avoid]
    return remaining or ids
def pick_daily(conn: sqlite3.Connection, feature_date: str | None = None, force: bool = False,
               avoid: int | None = None) -> dict | None:
    """Pick + cache the fact for a feature date (defaults to today's local date).

    Idempotent: an already-stored row for the date is returned untouched unless
    ``force`` is set. Otherwise one id is chosen from the date's candidates via
    a date-seeded ordering, its pool row is copied into ``daily_onthisday``
    (replacing any earlier pick for that date), and the pool row's ``shown_at``
    is stamped with the feature date. ``avoid`` is forwarded to the candidate
    selection.

    Returns the stored row as a dict, or ``None`` if the pool has nothing for
    the date. Rows are read by column name, so the connection must yield
    name-indexable rows (e.g. ``sqlite3.Row``).
    """
    feature_date = feature_date or local_today()
    lookup_sql = "SELECT * FROM daily_onthisday WHERE feature_date=?"

    cached = conn.execute(lookup_sql, (feature_date,)).fetchone()
    if cached and not force:
        return dict(cached)

    # Characters after the 5th form the MM-DD key (feature_date presumably "YYYY-MM-DD").
    pool_ids = _candidates(conn, feature_date[5:], avoid)
    if not pool_ids:
        return None

    chosen_id = daily.seeded_order(pool_ids, feature_date)[0]
    source_row = conn.execute("SELECT * FROM onthisday_pool WHERE id=?", (chosen_id,)).fetchone()

    copied_columns = ("id", "source", "md", "year", "text", "summary", "image_url", "page_url")
    upsert_sql = (
        "INSERT INTO daily_onthisday (feature_date, pool_id, source, md, year, text, summary, image_url, page_url) "
        "VALUES (?,?,?,?,?,?,?,?,?) "
        "ON CONFLICT(feature_date) DO UPDATE SET pool_id=excluded.pool_id, source=excluded.source, "
        "year=excluded.year, text=excluded.text, summary=excluded.summary, image_url=excluded.image_url, "
        "page_url=excluded.page_url"
    )
    conn.execute(upsert_sql, (feature_date, *(source_row[col] for col in copied_columns)))
    conn.execute("UPDATE onthisday_pool SET shown_at=? WHERE id=?", (feature_date, chosen_id))
    conn.commit()

    stored = conn.execute(lookup_sql, (feature_date,)).fetchone()
    return dict(stored)
def get_today(conn: sqlite3.Connection, feature_date: str | None = None) -> dict | None:
    """Return the stored fact for ``feature_date``, else the most recent one.

    With no ``feature_date``, or when that date has no row, the row with the
    greatest ``feature_date`` is returned so the room is never empty. Returns
    ``None`` only when ``daily_onthisday`` holds no rows at all.
    """
    found = None
    if feature_date:
        found = conn.execute(
            "SELECT * FROM daily_onthisday WHERE feature_date=?", (feature_date,)
        ).fetchone()
    if not found:
        # Fallback: newest stored row by feature_date.
        found = conn.execute(
            "SELECT * FROM daily_onthisday ORDER BY feature_date DESC LIMIT 1"
        ).fetchone()
    if not found:
        return None
    return dict(found)
def run_daily(conn: sqlite3.Connection, client=None) -> dict:
    """Cycle entry point: ensure today's date has a pool, then ensure it has a pick.

    A harvest runs only while today's MM-DD has no non-blocked pool rows; the
    pick step then returns the already-stored row if the day is done. Returns
    ``{"md", "harvested", "picked"}`` where ``harvested`` is the harvest
    summary (or ``None`` if skipped) and ``picked`` is the chosen event text
    (or ``None`` if nothing could be picked).
    """
    md = local_today()[5:]

    pool_is_empty = _pool_count(conn, md) == 0
    harvested = harvest(conn, md, client) if pool_is_empty else None

    # NOTE(review): pick_daily() reads local_today() again on its own, so a run
    # that straddles local midnight could harvest one date and pick for the next.
    picked = pick_daily(conn)
    headline = picked["text"] if picked else None
    return {"md": md, "harvested": harvested, "picked": headline}