From 6c10ad99a91869f5bc2060b1f30f7c6227c29ea7 Mon Sep 17 00:00:00 2001 From: jay Date: Sat, 27 Jun 2026 17:07:37 -0400 Subject: [PATCH] On This Day: serve sharp images (originalimage, not the 330px thumbnail) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The Wikimedia feed's thumbnail is 330px, which upscales blurry in our hero. Use originalimage.source instead — it's reliably sharp. (Can't just request a bigger thumbnail width: for very large source images Wikimedia only serves pre-generated bucket sizes and 400s on arbitrary widths — e.g. 500px ok, 800/1024px fail.) - onthisday._best_image() prefers originalimage, falls back to the thumbnail. - scripts/otd_image_upsize_backfill.py re-fetches each stored MM-DD and upgrades image_url in onthisday_pool + daily_onthisday in place (ran on host: pool + 6 daily rows now sharp; today's hero verified 200). Only the /onthisday hero loads this image (home card is text-only), so larger files are a single-page, one-time load. - test_best_image locks the prefer-original/fallback behavior. Co-Authored-By: Claude Opus 4.8 --- goodnews/onthisday.py | 14 +++++++- scripts/otd_image_upsize_backfill.py | 50 ++++++++++++++++++++++++++++ tests/test_onthisday.py | 12 +++++++ 3 files changed, 75 insertions(+), 1 deletion(-) create mode 100644 scripts/otd_image_upsize_backfill.py diff --git a/goodnews/onthisday.py b/goodnews/onthisday.py index e5eeb8b..25139b6 100644 --- a/goodnews/onthisday.py +++ b/goodnews/onthisday.py @@ -31,6 +31,18 @@ _NEG = ( ) +# Wikimedia's feed hands us a 330px `thumbnail`, which upscales (blurry) in our hero. It also +# gives `originalimage` — a sharp, full-size URL that's always valid. We can't just request a +# bigger thumbnail width: for very large source images Wikimedia only serves pre-generated +# bucket sizes and 400s on arbitrary widths (e.g. 500px ok, 800/1024px fail, 1280px ok). So +# prefer the originalimage (reliably sharp), falling back to the thumbnail. 
+def _best_image(page: dict) -> str | None: + """Best image URL for a page: originalimage.source, else thumbnail.source, else None.""" + orig = (page.get("originalimage") or {}).get("source") + thumb = (page.get("thumbnail") or {}).get("source") + return orig or thumb or None + + def _fetch_events(md: str) -> list[dict]: """All events for a MM-DD from Wikimedia, normalized to our candidate shape.""" mm, dd = md.split("-") @@ -46,7 +58,7 @@ def _fetch_events(md: str) -> list[dict]: "year": e.get("year"), "text": text, "summary": (page.get("extract") or "").strip() or None, - "image_url": ((page.get("thumbnail") or {}).get("source")) or None, + "image_url": _best_image(page), "page_url": (((page.get("content_urls") or {}).get("desktop") or {}).get("page")) or None, }) return out diff --git a/scripts/otd_image_upsize_backfill.py b/scripts/otd_image_upsize_backfill.py new file mode 100644 index 0000000..0894ed1 --- /dev/null +++ b/scripts/otd_image_upsize_backfill.py @@ -0,0 +1,50 @@ +"""One-off: upgrade stored On This Day images from the blurry 330px thumbnail to the URL +chosen by onthisday._best_image (originalimage, else thumbnail). Re-fetches the Wikimedia events for each stored MM-DD, matches by page_url, +and rewrites image_url in onthisday_pool + daily_onthisday in place (no re-picking). Idempotent. 
+ +Run on the host: python -m scripts.otd_image_upsize_backfill +""" +import os + +from goodnews import daily, onthisday +from goodnews.db import connect + +conn = connect(os.environ.get("GOODNEWS_DB", "data/goodnews.sqlite3")) + +# distinct pool dates whose image_url still looks like a sized thumbnail (/thumb/...px-) +mds = [r[0] for r in conn.execute( + "SELECT DISTINCT md FROM onthisday_pool WHERE image_url LIKE '%/thumb/%px-%'").fetchall()] +print(f"dates to refresh: {mds}") + +# page_url -> _best_image() URL, from a fresh feed fetch per date (first page of each event only) +best: dict[str, str] = {} +for md in mds: + mm, dd = md.split("-") + data = daily.http_json(f"{onthisday.WIKI_BASE}/{mm}/{dd}") + for e in (data.get("events") or []): + page = (e.get("pages") or [{}])[0] or {} + page_url = (((page.get("content_urls") or {}).get("desktop") or {}).get("page")) or None + img = onthisday._best_image(page) + if page_url and img: + best[page_url] = img + print(f" {md}: {len(data.get('events') or [])} events fetched") + +updated_pool = updated_daily = 0 +for table in ("onthisday_pool", "daily_onthisday"): + for page_url, new in best.items(): + cur = conn.execute( + f"UPDATE {table} SET image_url=? " + f"WHERE page_url=? 
AND image_url LIKE '%/thumb/%px-%' AND image_url<>?", + (new, page_url, new)) + if table == "onthisday_pool": + updated_pool += cur.rowcount + else: + updated_daily += cur.rowcount +conn.commit() +print(f"updated: onthisday_pool={updated_pool}, daily_onthisday={updated_daily}") + +# spot check: print up to 6 pool rows that have an image_url, ordered by md (not limited to rows updated above) +for r in conn.execute("SELECT md, year, image_url FROM onthisday_pool " + "WHERE image_url IS NOT NULL ORDER BY md LIMIT 6").fetchall(): + print(f" {r['md']} {r['year']}: {r['image_url']}") +conn.close() diff --git a/tests/test_onthisday.py b/tests/test_onthisday.py index ed98f9d..ba5a5da 100644 --- a/tests/test_onthisday.py +++ b/tests/test_onthisday.py @@ -76,3 +76,15 @@ def test_tone_filter_llm_narrows(conn): kept = onthisday._tone_filter([dict(e) for e in FAKE], client=FakeClient()) # keyword floor drops the invasion (3 remain), then the LLM narrows to 1 assert len(kept) == 1 and kept[0]["year"] == 1611 + + +def test_best_image_prefers_original_over_thumbnail(): + # the 330px thumbnail upscales (blurry); originalimage is reliably sharp → prefer it + page = { + "thumbnail": {"source": "https://x/thumb/Foo.jpg/330px-Foo.jpg", "width": 330}, + "originalimage": {"source": "https://x/thumb/Foo.jpg/3840px-Foo.jpg", "width": 7000}, + } + assert onthisday._best_image(page) == "https://x/thumb/Foo.jpg/3840px-Foo.jpg" + # falls back to the thumbnail when there's no originalimage, and to None when neither exists + assert onthisday._best_image({"thumbnail": {"source": "https://x/330px-Foo.jpg"}}) == "https://x/330px-Foo.jpg" + assert onthisday._best_image({}) is None