6c10ad99a9
The Wikimedia feed's thumbnail is 330px, which upscales blurry in our hero. Use originalimage.source instead — it's reliably sharp. (Can't just request a bigger thumbnail width: for very large source images Wikimedia only serves pre-generated bucket sizes and 400s on arbitrary widths — e.g. 500px ok, 800/1024px fail.) - onthisday._best_image() prefers originalimage, falls back to the thumbnail. - scripts/otd_image_upsize_backfill.py re-fetches each stored MM-DD and upgrades image_url in onthisday_pool + daily_onthisday in place (ran on host: pool + 6 daily rows now sharp; today's hero verified 200). Only the /onthisday hero loads this image (home card is text-only), so larger files are a single-page, one-time load. - test_best_image locks the prefer-original/fallback behavior. Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
203 lines
9.4 KiB
Python
203 lines
9.4 KiB
Python
"""On This Day — a good thing that happened on today's date in history.
|
|
|
|
Source: Wikimedia's "On this day" feed (free, CC) — comprehensive, and it carries a
|
|
summary extract + thumbnail per event, which makes for a rich page. Multi-source ready
|
|
(a `source` column), so admin-curated entries and any future source slot in cleanly.
|
|
|
|
Pipeline (mirrors Daily Art): harvest today's MM-DD events → tone-filter to good/neutral
|
|
(keyword floor + optional LLM refine) → pool → deterministic daily pick → cached row.
|
|
All network/LLM work happens before any DB write, so the write txn is brief.
|
|
"""
|
|
from __future__ import annotations
|
|
|
|
import json
|
|
import re
|
|
import sqlite3
|
|
|
|
from . import daily
|
|
from .localtime import local_today
|
|
|
|
# Wikimedia REST endpoint for the "on this day" events feed; "/MM/DD" is appended per request.
WIKI_BASE = "https://en.wikipedia.org/api/rest_v1/feed/onthisday/events"

# Size of the no-repeat window: the daily pick is drawn from this many
# least-recently-shown pool rows for the date.
_NO_REPEAT_POOL = 40

# Keyword floor. Any event whose lowercased text contains one of these substrings is
# dropped before the LLM is consulted, and this list is the only filter when no LLM
# client is available. Several entries are deliberate stems ("assassinat", "invad",
# "execut") so that one entry covers every inflection.
_NEG = (
    # conflict and loss of life
    "war", "kill", "died", "dies", "death", "deaths", "dead", "massacre", "genocide",
    # natural and large-scale disasters
    "disaster", "earthquake", "hurricane", "tsunami", "flood", "famine", "plague",
    # violence
    "bomb", "attack", "assassinat", "murder", "shooting", "shot dead", "riot", "crash",
    # invasion, execution, terror
    "invad", "slaughter", "execut", "tragedy", "terror", "nuclear", "explosion",
    # wrecks, disease, upheaval
    "sank", "sink", "wreck", "epidemic", "pandemic", "outbreak", "hostage", "coup",
)
|
|
|
|
|
|
# Wikimedia's feed hands us a 330px `thumbnail`, which upscales (blurry) in our hero. It also
|
|
# gives `originalimage` — a sharp, full-size URL that's always valid. We can't just request a
|
|
# bigger thumbnail width: for very large source images Wikimedia only serves pre-generated
|
|
# bucket sizes and 400s on arbitrary widths (e.g. 500px ok, 800/1024px fail, 1280px ok). So
|
|
# prefer the originalimage (reliably sharp), falling back to the thumbnail.
|
|
def _best_image(page: dict) -> str | None:
|
|
"""The sharpest reliably-served image URL: originalimage, else the 330px thumbnail."""
|
|
orig = (page.get("originalimage") or {}).get("source")
|
|
thumb = (page.get("thumbnail") or {}).get("source")
|
|
return orig or thumb or None
|
|
|
|
|
|
def _fetch_events(md: str) -> list[dict]:
    """Fetch the Wikimedia events for one MM-DD and reshape them into candidates.

    Each candidate is a dict with keys md, year, text, summary, image_url and
    page_url. Events with empty text are skipped. Only the first entry of an
    event's `pages` list is used for the summary, image and link.
    """
    month, day = md.split("-")
    payload = daily.http_json(f"{WIKI_BASE}/{month}/{day}")

    normalized = []
    for event in payload.get("events") or []:
        headline = (event.get("text") or "").strip()
        if headline:
            # `pages` may be missing, empty, or start with a null entry.
            lead = (event.get("pages") or [{}])[0] or {}
            normalized.append({
                "md": md,
                "year": event.get("year"),
                "text": headline,
                "summary": (lead.get("extract") or "").strip() or None,
                "image_url": _best_image(lead),
                "page_url": ((lead.get("content_urls") or {}).get("desktop") or {}).get("page") or None,
            })
    return normalized
|
|
|
|
|
|
def _keyword_ok(text: str) -> bool:
    """True when the lowercased text contains none of the `_NEG` keywords.

    NOTE(review): this is a plain substring test, so a keyword also matches inside
    a longer word (e.g. "war" is a substring of "award").
    """
    lowered = text.lower()
    for keyword in _NEG:
        if keyword in lowered:
            return False
    return True
|
|
|
|
|
|
def _llm_keep(client, candidates: list[dict]) -> list[dict]:
    """Ask the LLM which candidates are genuinely positive/neutral. On any trouble
    with the reply, keep the keyword-passed set (never lose the day to a model hiccup).

    `client` must expose `chat_text(messages)` returning the model's reply text.
    The reply is expected to contain a JSON object `{"keep": [<indices>]}`, where
    the indices refer to positions in `candidates`.

    Returns the selected candidates in their original order. Returns `candidates`
    unchanged when the reply has no JSON object, the JSON is malformed or has an
    unexpected shape, or the selection comes out empty. Exceptions raised by
    `client.chat_text` itself are not caught here.
    """
    lines = [f"{i}: {c['text']}" for i, c in enumerate(candidates)]
    user = (
        "These are 'on this day' history events. Return the indices of the ones that are "
        "GENUINELY UPLIFTING — a reader should feel a small lift of wonder, hope, or delight. "
        "Keep: discoveries, inventions, scientific breakthroughs, the arts and culture, "
        "exploration, human achievement, acts of courage or kindness, milestones of progress "
        "(rights won, things built, records set). EXCLUDE war, violence, disasters, death, or "
        "tragedy, AND exclude merely procedural or political-administrative events that carry no "
        "warmth (a coronation or accession, a treaty signing, an election, a law passed, a "
        "boundary or office change). When unsure whether something is truly uplifting, leave it "
        "out.\n\n" + "\n".join(lines) +
        '\n\nReply with JSON only, exactly: {"keep": [<indices>]}'
    )
    txt = client.chat_text([{"role": "user", "content": user}])
    if not isinstance(txt, str):
        return candidates

    # Models often wrap the JSON in chatter; take the outermost {...} span.
    m = re.search(r"\{.*\}", txt, re.S)
    if not m:
        return candidates

    try:
        keep = json.loads(m.group(0)).get("keep", [])
        # Accept ints and digit strings; anything else is ignored rather than trusted.
        idx = {int(i) for i in keep if str(i).lstrip("-").isdigit()}
    except (ValueError, TypeError, AttributeError):
        # Malformed JSON, "keep" not iterable, or an index int() cannot parse.
        return candidates

    sub = [c for i, c in enumerate(candidates) if i in idx]
    # An empty selection is treated as a model miss, not as "nothing is uplifting".
    return sub or candidates
|
|
|
|
|
|
def _tone_filter(candidates: list[dict], client=None) -> list[dict]:
    """Two-stage tone filter: the keyword floor always runs, then the LLM refines
    the survivors when a client is supplied.

    If the LLM step raises for any reason, the keyword-passed list is returned as-is.
    """
    survivors = [cand for cand in candidates if _keyword_ok(cand["text"])]
    if not (client and survivors):
        return survivors
    try:
        return _llm_keep(client, survivors)
    except Exception:  # noqa: BLE001 — LLM is best-effort; keyword floor stands
        return survivors
|
|
|
|
|
|
def _pool_count(conn: sqlite3.Connection, md: str) -> int:
    """Number of unblocked `onthisday_pool` rows stored for the given MM-DD."""
    cursor = conn.execute(
        "SELECT COUNT(*) FROM onthisday_pool WHERE md=? AND blocked=0", (md,)
    )
    (total,) = cursor.fetchone()
    return total
|
|
|
|
|
|
def harvest(conn: sqlite3.Connection, md: str | None = None, client=None) -> dict:
    """Fetch + tone-filter a date's events into the pool. Per-day, idempotent
    (INSERT OR IGNORE, with a content key per row). Non-fatal on fetch failure.

    `md` is the MM-DD to harvest and defaults to today's local date. `client` is an
    optional LLM client handed to the tone filter.

    Returns a stats dict: md, fetched (events from the feed), kept (events that
    passed the tone filter), added (growth of the unblocked pool for `md`), and
    pool (unblocked pool size afterwards). If the fetch raises, the counts are
    zero and `pool` is the current size.
    """
    md = md or local_today()[5:]
    try:
        events = _fetch_events(md)
    except Exception:  # noqa: BLE001 — harvest is best-effort; report an empty run
        return {"md": md, "fetched": 0, "kept": 0, "added": 0, "pool": _pool_count(conn, md)}

    kept = _tone_filter(events, client)  # all network/LLM done before the write
    before = _pool_count(conn, md)
    rows = [
        (c["md"], c["year"], daily.content_key(c["md"], c["year"], c["text"]),
         c["text"], c["summary"], c["image_url"], c["page_url"])
        for c in kept
    ]
    # The connection context manager commits on success and rolls back if the
    # insert raises, so a failed write never leaves a transaction open.
    with conn:
        conn.executemany(
            "INSERT OR IGNORE INTO onthisday_pool (source, md, year, ckey, text, summary, image_url, page_url) "
            "VALUES ('wikimedia', ?, ?, ?, ?, ?, ?, ?)",
            rows,
        )
    after = _pool_count(conn, md)
    return {"md": md, "fetched": len(events), "kept": len(kept), "added": after - before, "pool": after}
|
|
|
|
|
|
def _candidates(conn: sqlite3.Connection, md: str, avoid: int | None = None) -> list[int]:
|
|
"""The pick pool for a date: if admin has featured any, pick only among those;
|
|
otherwise the N least-recently-shown. `avoid` drops a specific id (admin re-pick)
|
|
unless it's the only option."""
|
|
featured = conn.execute(
|
|
"SELECT id FROM onthisday_pool WHERE md=? AND blocked=0 AND featured=1 ORDER BY id", (md,)
|
|
).fetchall()
|
|
if featured:
|
|
ids = [r[0] for r in featured]
|
|
else:
|
|
rows = conn.execute(
|
|
"SELECT id FROM onthisday_pool WHERE md=? AND blocked=0 "
|
|
"ORDER BY shown_at IS NOT NULL, shown_at, id LIMIT ?",
|
|
(md, _NO_REPEAT_POOL),
|
|
).fetchall()
|
|
ids = [r[0] for r in rows]
|
|
if avoid is not None:
|
|
ids = [i for i in ids if i != avoid] or ids
|
|
return ids
|
|
|
|
|
|
def pick_daily(conn: sqlite3.Connection, feature_date: str | None = None, force: bool = False,
|
|
avoid: int | None = None) -> dict | None:
|
|
"""Pick + cache today's fact. Idempotent (skips if today's done unless force).
|
|
Returns the stored row, or None if the pool has nothing for today's date."""
|
|
feature_date = feature_date or local_today()
|
|
md = feature_date[5:]
|
|
existing = conn.execute("SELECT * FROM daily_onthisday WHERE feature_date=?", (feature_date,)).fetchone()
|
|
if existing and not force:
|
|
return dict(existing)
|
|
ids = _candidates(conn, md, avoid)
|
|
if not ids:
|
|
return None
|
|
pick_id = daily.seeded_order(ids, feature_date)[0]
|
|
row = conn.execute("SELECT * FROM onthisday_pool WHERE id=?", (pick_id,)).fetchone()
|
|
conn.execute(
|
|
"INSERT INTO daily_onthisday (feature_date, pool_id, source, md, year, text, summary, image_url, page_url) "
|
|
"VALUES (?,?,?,?,?,?,?,?,?) "
|
|
"ON CONFLICT(feature_date) DO UPDATE SET pool_id=excluded.pool_id, source=excluded.source, "
|
|
"year=excluded.year, text=excluded.text, summary=excluded.summary, image_url=excluded.image_url, "
|
|
"page_url=excluded.page_url",
|
|
(feature_date, row["id"], row["source"], row["md"], row["year"], row["text"],
|
|
row["summary"], row["image_url"], row["page_url"]),
|
|
)
|
|
conn.execute("UPDATE onthisday_pool SET shown_at=? WHERE id=?", (feature_date, pick_id))
|
|
conn.commit()
|
|
return dict(conn.execute("SELECT * FROM daily_onthisday WHERE feature_date=?", (feature_date,)).fetchone())
|
|
|
|
|
|
def get_today(conn: sqlite3.Connection, feature_date: str | None = None) -> dict | None:
|
|
"""Today's fact if present, else the most recent (the room is never empty)."""
|
|
if feature_date:
|
|
row = conn.execute("SELECT * FROM daily_onthisday WHERE feature_date=?", (feature_date,)).fetchone()
|
|
if row:
|
|
return dict(row)
|
|
row = conn.execute("SELECT * FROM daily_onthisday ORDER BY feature_date DESC LIMIT 1").fetchone()
|
|
return dict(row) if row else None
|
|
|
|
|
|
def run_daily(conn: sqlite3.Connection, client=None) -> dict:
    """Cycle entry point: ensure today's date has a pool, then ensure it has a pick.

    Harvesting only happens when the unblocked pool for today's MM-DD is empty, and
    `pick_daily` returns the cached row once the day is picked, so repeat calls
    within a day are cheap.

    Returns {"md", "harvested", "picked"}: `harvested` is the harvest stats dict,
    or None when no harvest was needed; `picked` is the chosen event text, or None
    when nothing could be picked.
    """
    # Read the date once and reuse it for both steps, so a run that straddles
    # midnight cannot harvest one date and pick for another.
    today = local_today()
    md = today[5:]
    harvested = None
    if _pool_count(conn, md) == 0:
        harvested = harvest(conn, md, client)
    picked = pick_daily(conn, today)
    return {"md": md, "harvested": harvested, "picked": picked["text"] if picked else None}
|