upbeatBytes/scripts/otd_image_upsize_backfill.py

"""One-off: upgrade stored On This Day images from the blurry 330px thumbnail to a sharp,
size-capped URL. Re-fetches the Wikimedia events for each stored MM-DD, matches by page_url,
and rewrites image_url in onthisday_pool + daily_onthisday in place (no re-picking). Idempotent.

Run on the host:  python -m scripts.otd_image_upsize_backfill
"""
import os

from goodnews import daily, onthisday
from goodnews.db import connect

conn = connect(os.environ.get("GOODNEWS_DB", "data/goodnews.sqlite3"))

# distinct dates we've harvested
mds = [r[0] for r in conn.execute(
    "SELECT DISTINCT md FROM onthisday_pool WHERE image_url LIKE '%/thumb/%px-%'").fetchall()]
print(f"dates to refresh: {mds}")

# page_url -> sharp image_url, from a fresh feed fetch per date
best: dict[str, str] = {}
for md in mds:
    mm, dd = md.split("-")
    data = daily.http_json(f"{onthisday.WIKI_BASE}/{mm}/{dd}")
    for e in (data.get("events") or []):
        page = (e.get("pages") or [{}])[0] or {}
        page_url = (((page.get("content_urls") or {}).get("desktop") or {}).get("page")) or None
        img = onthisday._best_image(page)
        if page_url and img:
            best[page_url] = img
    print(f"  {md}: {len(data.get('events') or [])} events fetched")

updated_pool = updated_daily = 0
for table in ("onthisday_pool", "daily_onthisday"):
    for page_url, new in best.items():
        cur = conn.execute(
            f"UPDATE {table} SET image_url=? "
            f"WHERE page_url=? AND image_url LIKE '%/thumb/%px-%' AND image_url<>?",
            (new, page_url, new))
        if table == "onthisday_pool":
            updated_pool += cur.rowcount
        else:
            updated_daily += cur.rowcount
conn.commit()
print(f"updated: onthisday_pool={updated_pool}, daily_onthisday={updated_daily}")

# show a few results
for r in conn.execute("SELECT md, year, image_url FROM onthisday_pool "
                      "WHERE image_url IS NOT NULL ORDER BY md LIMIT 6").fetchall():
    print(f"  {r['md']} {r['year']}: {r['image_url']}")
conn.close()