"""On This Day — a good thing that happened on today's date in history.

Source: Wikimedia's "On this day" feed (free, CC) — comprehensive, and it carries a summary
extract + thumbnail per event, which makes for a rich page. Multi-source ready (a `source`
column), so admin-curated entries and any future source slot in cleanly.

Pipeline (mirrors Daily Art): harvest today's MM-DD events → tone-filter to good/neutral
(keyword floor + optional LLM refine) → pool → deterministic daily pick → cached row.
All network/LLM work happens before any DB write, so the write txn is brief.
"""
from __future__ import annotations

import json
import re
import sqlite3

from . import daily
from .localtime import local_today

# Feed endpoint; `_fetch_events` appends "/MM/DD" to it.
WIKI_BASE = "https://en.wikipedia.org/api/rest_v1/feed/onthisday/events"
_NO_REPEAT_POOL = 40  # pick from the N least-recently-shown for today's date

# Keyword floor: drop the obviously grim before the LLM ever sees it (and a safety net
# for when the LLM is unavailable). Substring match on a lowercased event text.
# Several entries are deliberate stems ("assassinat", "invad", "execut") so that every
# inflection matches. Because `_keyword_ok` does a plain substring test, short terms also
# reject unrelated words that merely contain them (e.g. "war" in "award", "sink" in
# "Helsinki") — the floor errs on the side of dropping too much.
_NEG = (
    "war", "kill", "died", "dies", "death", "deaths", "dead", "massacre", "genocide",
    "disaster", "earthquake", "hurricane", "tsunami", "flood", "famine", "plague", "bomb",
    "attack", "assassinat", "murder", "shooting", "shot dead", "riot", "crash", "invad",
    "slaughter", "execut", "tragedy", "terror", "nuclear", "explosion", "sank", "sink",
    "wreck", "epidemic", "pandemic", "outbreak", "hostage", "coup",
)


# Wikimedia's feed hands us a 330px `thumbnail`, which upscales (blurry) in our hero. It also
# gives `originalimage` — a sharp, full-size URL that's always valid. We can't just request a
# bigger thumbnail width: for very large source images Wikimedia only serves pre-generated
# bucket sizes and 400s on arbitrary widths (e.g. 500px ok, 800/1024px fail, 1280px ok). So
# prefer the originalimage (reliably sharp), falling back to the thumbnail.
def _best_image(page: dict) -> str | None:
    """The sharpest reliably-served image URL: originalimage, else the 330px thumbnail.

    Reads ``page["originalimage"]["source"]`` and ``page["thumbnail"]["source"]``;
    a missing or null sub-object is treated as empty. Returns ``None`` when
    neither yields a non-empty value.
    """
    orig = (page.get("originalimage") or {}).get("source")
    thumb = (page.get("thumbnail") or {}).get("source")
    return orig or thumb or None


def _fetch_events(md: str) -> list[dict]:
    """All events for a MM-DD from Wikimedia, normalized to our candidate shape.

    Each candidate is a dict with keys ``md``, ``year``, ``text``, ``summary``,
    ``image_url`` and ``page_url``. Events without text are skipped. Summary,
    image and link are taken from the event's first linked page, if any, and
    are ``None`` when absent.

    No exception is handled here: a malformed ``md`` or a failure inside
    ``daily.http_json`` propagates to the caller (``harvest`` treats it as an
    empty fetch).
    """
    mm, dd = md.split("-")
    # presumably returns the decoded JSON object of the feed — confirm in `daily`
    data = daily.http_json(f"{WIKI_BASE}/{mm}/{dd}")
    out = []
    for e in (data.get("events") or []):
        text = (e.get("text") or "").strip()
        if not text:
            continue
        # First linked page only; tolerate a missing/empty list or a null entry.
        page = (e.get("pages") or [{}])[0] or {}
        out.append({
            "md": md,
            "year": e.get("year"),
            "text": text,
            "summary": (page.get("extract") or "").strip() or None,
            "image_url": _best_image(page),
            "page_url": (((page.get("content_urls") or {}).get("desktop") or {}).get("page")) or None,
        })
    return out


def _keyword_ok(text: str) -> bool:
    """True when the lowercased text contains none of the ``_NEG`` terms (substring test)."""
    t = text.lower()
    return not any(neg in t for neg in _NEG)


def _llm_keep(client, candidates: list[dict]) -> list[dict]:
    """Ask the LLM which candidates are genuinely positive/neutral.

    ``client`` must expose ``chat_text(messages)`` returning the reply text.
    The reply is searched for a JSON object whose ``"keep"`` list holds indices
    into ``candidates``.

    On any trouble with the *reply* — not a string, no JSON object, invalid
    JSON, ``"keep"`` missing or not a list, or an empty selection — the
    keyword-passed set is returned unchanged (never lose the day to a model
    hiccup). Individual entries that are not integer-like, or that are out of
    range, are ignored. Exceptions raised by ``client.chat_text`` itself still
    propagate; ``_tone_filter`` absorbs those.
    """
    lines = [f"{i}: {c['text']}" for i, c in enumerate(candidates)]
    # NOTE(review): the format example at the end shows an empty list; the model is
    # expected to fill it with the chosen indices — confirm the wording is intended.
    user = (
        "These are 'on this day' history events. Return the indices of the ones that are "
        "GENUINELY UPLIFTING — a reader should feel a small lift of wonder, hope, or delight. "
        "Keep: discoveries, inventions, scientific breakthroughs, the arts and culture, "
        "exploration, human achievement, acts of courage or kindness, milestones of progress "
        "(rights won, things built, records set). EXCLUDE war, violence, disasters, death, or "
        "tragedy, AND exclude merely procedural or political-administrative events that carry no "
        "warmth (a coronation or accession, a treaty signing, an election, a law passed, a "
        "boundary or office change). When unsure whether something is truly uplifting, leave it "
        "out.\n\n" + "\n".join(lines) + '\n\nReply with JSON only, exactly: {"keep": []}'
    )
    txt = client.chat_text([{"role": "user", "content": user}])
    if not isinstance(txt, str):
        return candidates
    # Greedy + DOTALL: take everything from the first "{" to the last "}", so prose
    # around the JSON is tolerated.
    m = re.search(r"\{.*\}", txt, re.S)
    if not m:
        return candidates
    try:
        payload = json.loads(m.group(0))
    except ValueError:  # json.JSONDecodeError is a ValueError subclass
        return candidates
    keep = payload.get("keep") if isinstance(payload, dict) else None
    if not isinstance(keep, list):
        return candidates

    idx: set[int] = set()
    for item in keep:
        # Accept ints and digit strings (optionally negative); skip everything else.
        if not str(item).lstrip("-").isdigit():
            continue
        try:
            idx.add(int(item))
        except (TypeError, ValueError):
            # e.g. "--1" or a Unicode digit such as "²": passes isdigit(), fails int().
            continue
    sub = [c for i, c in enumerate(candidates) if i in idx]
    return sub or candidates


def _tone_filter(candidates: list[dict], client=None) -> list[dict]:
    """Apply the keyword floor, then (when a client is given) the LLM refine.

    The LLM step is best-effort: if it raises, the keyword-filtered list is
    returned as is. The LLM is not called when the keyword floor leaves nothing.
    """
    kept = [c for c in candidates if _keyword_ok(c["text"])]
    if client and kept:
        try:
            kept = _llm_keep(client, kept)
        except Exception:  # noqa: BLE001 — LLM is best-effort; keyword floor stands
            pass
    return kept


def _pool_count(conn: sqlite3.Connection, md: str) -> int:
    """Number of non-blocked pool rows for a MM-DD."""
    return conn.execute(
        "SELECT COUNT(*) FROM onthisday_pool WHERE md=? AND blocked=0", (md,)
    ).fetchone()[0]


def harvest(conn: sqlite3.Connection, md: str | None = None, client=None) -> dict:
    """Fetch + tone-filter today's MM-DD events into the pool.

    Per-day, idempotent: rows go in with ``INSERT OR IGNORE`` and a
    ``daily.content_key(md, year, text)`` value in ``ckey`` (presumably a unique
    column — confirm against the schema). Non-fatal on network failure: any
    exception from the fetch yields a zero-count result instead of raising.

    Args:
        conn: open SQLite connection.
        md: "MM-DD" to harvest; defaults to today's (``local_today()[5:]``).
        client: optional LLM client forwarded to the tone filter.

    Returns:
        ``{"md", "fetched", "kept", "added", "pool"}`` where ``added`` is the
        change in the non-blocked pool count and ``pool`` is the count afterwards.
    """
    md = md or local_today()[5:]
    try:
        events = _fetch_events(md)
    except Exception:  # noqa: BLE001 — feed is best-effort; report an empty harvest
        return {"md": md, "fetched": 0, "kept": 0, "added": 0, "pool": _pool_count(conn, md)}
    kept = _tone_filter(events, client)  # all network/LLM done before the write
    before = _pool_count(conn, md)
    rows = [
        (c["md"], c["year"], daily.content_key(c["md"], c["year"], c["text"]),
         c["text"], c["summary"], c["image_url"], c["page_url"])
        for c in kept
    ]
    # Connection context manager: commit on success, roll back if the insert fails,
    # so an error never leaves a half-open transaction on the shared connection.
    with conn:
        conn.executemany(
            "INSERT OR IGNORE INTO onthisday_pool (source, md, year, ckey, text, summary, image_url, page_url) "
            "VALUES ('wikimedia', ?, ?, ?, ?, ?, ?, ?)",
            rows,
        )
    after = _pool_count(conn, md)
    return {"md": md, "fetched": len(events), "kept": len(kept), "added": after - before, "pool": after}


def _candidates(conn: sqlite3.Connection, md: str, avoid: int | None = None) -> list[int]:
    """The pick pool (row ids) for a date.

    If admin has featured any non-blocked rows, pick only among those;
    otherwise take the ``_NO_REPEAT_POOL`` least-recently-shown non-blocked rows
    (never-shown rows sort first, then oldest ``shown_at``, ties by id).
    ``avoid`` drops a specific id (admin re-pick) unless that would leave
    nothing to choose from.
    """
    featured = conn.execute(
        "SELECT id FROM onthisday_pool WHERE md=? AND blocked=0 AND featured=1 ORDER BY id", (md,)
    ).fetchall()
    if featured:
        ids = [r[0] for r in featured]
    else:
        rows = conn.execute(
            "SELECT id FROM onthisday_pool WHERE md=? AND blocked=0 "
            "ORDER BY shown_at IS NOT NULL, shown_at, id LIMIT ?",
            (md, _NO_REPEAT_POOL),
        ).fetchall()
        ids = [r[0] for r in rows]
    if avoid is not None:
        ids = [i for i in ids if i != avoid] or ids
    return ids


def pick_daily(conn: sqlite3.Connection, feature_date: str | None = None,
               force: bool = False, avoid: int | None = None) -> dict | None:
    """Pick + cache today's fact. Idempotent (skips if today's done unless force).

    Rows are read by column name and converted with ``dict(row)``, so ``conn``
    needs a mapping-style row factory (e.g. ``sqlite3.Row``).

    Args:
        conn: open SQLite connection.
        feature_date: ISO "YYYY-MM-DD"; defaults to ``local_today()``.
        force: re-pick even if a row for ``feature_date`` already exists.
        avoid: pool id to steer away from on a re-pick (see ``_candidates``).

    Returns:
        The stored ``daily_onthisday`` row as a dict, or ``None`` if the pool has
        nothing for that date.
    """
    feature_date = feature_date or local_today()
    md = feature_date[5:]
    existing = conn.execute(
        "SELECT * FROM daily_onthisday WHERE feature_date=?", (feature_date,)
    ).fetchone()
    if existing and not force:
        return dict(existing)
    ids = _candidates(conn, md, avoid)
    if not ids:
        return None
    # Ordering is seeded by the date, so the same candidates give the same pick.
    pick_id = daily.seeded_order(ids, feature_date)[0]
    row = conn.execute("SELECT * FROM onthisday_pool WHERE id=?", (pick_id,)).fetchone()
    # One transaction for both writes: the cached pick and its shown_at stamp are
    # committed together, or rolled back together if either statement fails.
    with conn:
        conn.execute(
            "INSERT INTO daily_onthisday (feature_date, pool_id, source, md, year, text, summary, image_url, page_url) "
            "VALUES (?,?,?,?,?,?,?,?,?) "
            "ON CONFLICT(feature_date) DO UPDATE SET pool_id=excluded.pool_id, source=excluded.source, "
            "year=excluded.year, text=excluded.text, summary=excluded.summary, image_url=excluded.image_url, "
            "page_url=excluded.page_url",
            (feature_date, row["id"], row["source"], row["md"], row["year"], row["text"],
             row["summary"], row["image_url"], row["page_url"]),
        )
        conn.execute("UPDATE onthisday_pool SET shown_at=? WHERE id=?", (feature_date, pick_id))
    return dict(conn.execute(
        "SELECT * FROM daily_onthisday WHERE feature_date=?", (feature_date,)
    ).fetchone())


def get_today(conn: sqlite3.Connection, feature_date: str | None = None) -> dict | None:
    """Today's fact if present, else the most recent (the room is never empty).

    With a ``feature_date`` that has a cached row, that row is returned. In every
    other case the row with the greatest ``feature_date`` is returned, or
    ``None`` when the table is empty.
    """
    if feature_date:
        row = conn.execute(
            "SELECT * FROM daily_onthisday WHERE feature_date=?", (feature_date,)
        ).fetchone()
        if row:
            return dict(row)
    row = conn.execute("SELECT * FROM daily_onthisday ORDER BY feature_date DESC LIMIT 1").fetchone()
    return dict(row) if row else None


def run_daily(conn: sqlite3.Connection, client=None) -> dict:
    """Cycle entry point: ensure today's date has a pool, then ensure it has a pick.

    Bounded + non-fatal — safe to call every cycle: the harvest only runs while
    the date's pool is empty, and ``pick_daily`` returns the cached row once the
    day is picked.

    Returns:
        ``{"md", "harvested", "picked"}`` — ``harvested`` is the ``harvest``
        result or ``None`` if no harvest ran; ``picked`` is the picked text or
        ``None``.
    """
    # Read the date once so the harvest and the pick target the same day even if
    # the cycle straddles midnight.
    today = local_today()
    md = today[5:]
    harvested = None
    if _pool_count(conn, md) == 0:
        harvested = harvest(conn, md, client)
    picked = pick_daily(conn, today)
    return {"md": md, "harvested": harvested, "picked": picked["text"] if picked else None}