On This Day: serve sharp images (originalimage, not the 330px thumbnail)
The Wikimedia feed's thumbnail is 330px, which upscales blurry in our hero. Use originalimage.source instead — it's reliably sharp. (Can't just request a bigger thumbnail width: for very large source images Wikimedia only serves pre-generated bucket sizes and 400s on arbitrary widths — e.g. 500px ok, 800/1024px fail.) - onthisday._best_image() prefers originalimage, falls back to the thumbnail. - scripts/otd_image_upsize_backfill.py re-fetches each stored MM-DD and upgrades image_url in onthisday_pool + daily_onthisday in place (ran on host: pool + 6 daily rows now sharp; today's hero verified 200). Only the /onthisday hero loads this image (home card is text-only), so larger files are a single-page, one-time load. - test_best_image locks the prefer-original/fallback behavior. Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
This commit is contained in:
+13
-1
@@ -31,6 +31,18 @@ _NEG = (
|
|||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
|
# Wikimedia's feed hands us a 330px `thumbnail`, which upscales (blurry) in our hero. It also
|
||||||
|
# gives `originalimage` — a sharp, full-size URL that's always valid. We can't just request a
|
||||||
|
# bigger thumbnail width: for very large source images Wikimedia only serves pre-generated
|
||||||
|
# bucket sizes and 400s on arbitrary widths (e.g. 500px ok, 800/1024px fail, 1280px ok). So
|
||||||
|
# prefer the originalimage (reliably sharp), falling back to the thumbnail.
|
||||||
|
def _best_image(page: dict) -> str | None:
|
||||||
|
"""The sharpest reliably-served image URL: originalimage, else the 330px thumbnail."""
|
||||||
|
orig = (page.get("originalimage") or {}).get("source")
|
||||||
|
thumb = (page.get("thumbnail") or {}).get("source")
|
||||||
|
return orig or thumb or None
|
||||||
|
|
||||||
|
|
||||||
def _fetch_events(md: str) -> list[dict]:
|
def _fetch_events(md: str) -> list[dict]:
|
||||||
"""All events for a MM-DD from Wikimedia, normalized to our candidate shape."""
|
"""All events for a MM-DD from Wikimedia, normalized to our candidate shape."""
|
||||||
mm, dd = md.split("-")
|
mm, dd = md.split("-")
|
||||||
@@ -46,7 +58,7 @@ def _fetch_events(md: str) -> list[dict]:
|
|||||||
"year": e.get("year"),
|
"year": e.get("year"),
|
||||||
"text": text,
|
"text": text,
|
||||||
"summary": (page.get("extract") or "").strip() or None,
|
"summary": (page.get("extract") or "").strip() or None,
|
||||||
"image_url": ((page.get("thumbnail") or {}).get("source")) or None,
|
"image_url": _best_image(page),
|
||||||
"page_url": (((page.get("content_urls") or {}).get("desktop") or {}).get("page")) or None,
|
"page_url": (((page.get("content_urls") or {}).get("desktop") or {}).get("page")) or None,
|
||||||
})
|
})
|
||||||
return out
|
return out
|
||||||
|
|||||||
@@ -0,0 +1,50 @@
|
|||||||
"""One-off: upgrade stored On This Day images from the blurry 330px thumbnail to the sharp
originalimage URL. Re-fetches the Wikimedia events for each stored MM-DD, matches by page_url,
and rewrites image_url in onthisday_pool + daily_onthisday in place (no re-picking). Idempotent.

Run on the host:  python -m scripts.otd_image_upsize_backfill
"""
import os

from goodnews import daily, onthisday
from goodnews.db import connect

conn = connect(os.environ.get("GOODNEWS_DB", "data/goodnews.sqlite3"))
try:
    # distinct dates we've harvested that still hold at least one thumbnail-style URL
    mds = [r[0] for r in conn.execute(
        "SELECT DISTINCT md FROM onthisday_pool WHERE image_url LIKE '%/thumb/%px-%'").fetchall()]
    print(f"dates to refresh: {mds}")

    # page_url -> sharp image_url, from a fresh feed fetch per date
    best: dict[str, str] = {}
    for md in mds:
        mm, dd = md.split("-")
        data = daily.http_json(f"{onthisday.WIKI_BASE}/{mm}/{dd}")
        events = data.get("events") or []
        for e in events:
            # only the first linked page of an event is considered
            page = (e.get("pages") or [{}])[0] or {}
            page_url = (((page.get("content_urls") or {}).get("desktop") or {}).get("page")) or None
            img = onthisday._best_image(page)
            if page_url and img:
                best[page_url] = img
        print(f" {md}: {len(events)} events fetched")

    updated_pool = updated_daily = 0
    for table in ("onthisday_pool", "daily_onthisday"):
        for page_url, new in best.items():
            # Touch only rows still holding a thumbnail-style URL that differs from the new
            # value, so rowcount reflects real changes.
            cur = conn.execute(
                f"UPDATE {table} SET image_url=? "
                f"WHERE page_url=? AND image_url LIKE '%/thumb/%px-%' AND image_url<>?",
                (new, page_url, new))
            if table == "onthisday_pool":
                updated_pool += cur.rowcount
            else:
                updated_daily += cur.rowcount
    # single commit after both tables have been rewritten
    conn.commit()
    print(f"updated: onthisday_pool={updated_pool}, daily_onthisday={updated_daily}")

    # show a few results
    for r in conn.execute("SELECT md, year, image_url FROM onthisday_pool "
                          "WHERE image_url IS NOT NULL ORDER BY md LIMIT 6").fetchall():
        print(f" {r['md']} {r['year']}: {r['image_url']}")
finally:
    # release the connection even when a fetch or UPDATE raises part-way through
    conn.close()
|
||||||
@@ -76,3 +76,15 @@ def test_tone_filter_llm_narrows(conn):
|
|||||||
kept = onthisday._tone_filter([dict(e) for e in FAKE], client=FakeClient())
|
kept = onthisday._tone_filter([dict(e) for e in FAKE], client=FakeClient())
|
||||||
# keyword floor drops the invasion (3 remain), then the LLM narrows to 1
|
# keyword floor drops the invasion (3 remain), then the LLM narrows to 1
|
||||||
assert len(kept) == 1 and kept[0]["year"] == 1611
|
assert len(kept) == 1 and kept[0]["year"] == 1611
|
||||||
|
|
||||||
|
|
||||||
|
def test_best_image_prefers_original_over_thumbnail():
    """_best_image picks originalimage, then the thumbnail, then None."""
    thumb = {"source": "https://x/thumb/Foo.jpg/330px-Foo.jpg", "width": 330}
    # the 330px thumbnail upscales (blurry); originalimage is reliably sharp → prefer it
    page = {
        "thumbnail": thumb,
        "originalimage": {"source": "https://x/thumb/Foo.jpg/3840px-Foo.jpg", "width": 7000},
    }
    assert onthisday._best_image(page) == "https://x/thumb/Foo.jpg/3840px-Foo.jpg"
    # falls back to the thumbnail when there's no originalimage, and to None when neither exists
    assert onthisday._best_image({"thumbnail": {"source": "https://x/330px-Foo.jpg"}}) == "https://x/330px-Foo.jpg"
    assert onthisday._best_image({}) is None
    # keys that are present but null, or that carry no source, behave like missing keys
    assert onthisday._best_image({"originalimage": None, "thumbnail": thumb}) == thumb["source"]
    assert onthisday._best_image({"originalimage": {}, "thumbnail": thumb}) == thumb["source"]
    assert onthisday._best_image({"originalimage": None, "thumbnail": None}) is None
|
||||||
|
|||||||
Reference in New Issue
Block a user