On This Day: serve sharp images (originalimage, not the 330px thumbnail)

The Wikimedia feed's thumbnail is 330px, which looks blurry when upscaled in our hero. Use originalimage.source instead — it's reliably sharp. (Can't just request a bigger thumbnail width: for very large source images Wikimedia only serves pre-generated bucket sizes and returns HTTP 400 for arbitrary widths — e.g. 500px ok, 800/1024px fail.)

- onthisday._best_image() prefers originalimage, falls back to the thumbnail.
- scripts/otd_image_upsize_backfill.py re-fetches each stored MM-DD and upgrades image_url in onthisday_pool + daily_onthisday in place (ran on host: pool + 6 daily rows now sharp; today's hero verified 200). Only the /onthisday hero loads this image (home card is text-only), so larger files are a single-page, one-time load.
- test_best_image locks the prefer-original/fallback behavior.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
2026-06-27 17:07:37 -04:00
parent e3e6f24753
commit 6c10ad99a9
3 changed files with 75 additions and 1 deletions
@@ -0,0 +1,50 @@
+"""One-off: upgrade stored On This Day images from the blurry 330px thumbnail to the sharp
+originalimage URL. Re-fetches the Wikimedia events for each stored MM-DD, matches by page_url,
+and rewrites image_url in onthisday_pool + daily_onthisday in place (no re-picking). Idempotent.
+
+Run on the host:  python -m scripts.otd_image_upsize_backfill
+"""
+import os
+
+from goodnews import daily, onthisday
+from goodnews.db import connect
+
+conn = connect(os.environ.get("GOODNEWS_DB", "data/goodnews.sqlite3"))
+
+# distinct MM-DD values whose pool rows still hold a thumbnail-style image_url
+mds = [r[0] for r in conn.execute(
+    "SELECT DISTINCT md FROM onthisday_pool WHERE image_url LIKE '%/thumb/%px-%'").fetchall()]
+print(f"dates to refresh: {mds}")
+
+# page_url -> image_url chosen by onthisday._best_image(), from a fresh feed fetch per date
+best: dict[str, str] = {}
+for md in mds:
+    mm, dd = md.split("-")
+    data = daily.http_json(f"{onthisday.WIKI_BASE}/{mm}/{dd}")
+    for e in (data.get("events") or []):
+        page = (e.get("pages") or [{}])[0] or {}  # only the event's first page is considered
+        page_url = (((page.get("content_urls") or {}).get("desktop") or {}).get("page")) or None
+        img = onthisday._best_image(page)
+        if page_url and img:
+            best[page_url] = img
+    print(f"  {md}: {len(data.get('events') or [])} events fetched")
+
+updated_pool = updated_daily = 0
+for table in ("onthisday_pool", "daily_onthisday"):
+    for page_url, new in best.items():
+        cur = conn.execute(
+            f"UPDATE {table} SET image_url=? "
+            f"WHERE page_url=? AND image_url LIKE '%/thumb/%px-%' AND image_url<>?",
+            (new, page_url, new))  # only thumbnail-style rows whose URL would actually change
+        if table == "onthisday_pool":
+            updated_pool += cur.rowcount
+        else:
+            updated_daily += cur.rowcount
+conn.commit()  # single commit after both tables have been processed
+print(f"updated: onthisday_pool={updated_pool}, daily_onthisday={updated_daily}")
+
+# print up to 6 pool rows that have an image (ordered by md) as a quick visual check
+for r in conn.execute("SELECT md, year, image_url FROM onthisday_pool "
+                      "WHERE image_url IS NOT NULL ORDER BY md LIMIT 6").fetchall():
+    print(f"  {r['md']} {r['year']}: {r['image_url']}")
+conn.close()