Daily Art backend: curated Met pool, daily cached pick, /api/art (prototype)

The engine for the /art room (design-independent; deploy held for Codex review).

- goodnews/art.py: harvest a curated pool of public-domain HIGHLIGHT artworks from the
  Met (isHighlight+isPublicDomain+hasImages -> masterworks, never potsherds; CC0). Daily
  deterministic pick from the least-recently-shown (no soon-repeats, same for everyone),
  fetch metadata + download the image to OUR cache (data/art_cache) so the homepage never
  waits on or hotlinks the museum. Bulletproof: bad object/image falls through candidates;
  a failed day keeps the last piece (room never empty). Injectable HTTP for tests.
- Schema: art_pool + daily_art. /api/art/today (edge-cacheable) + /api/art/image/{id}
  (served from cache, immutable). CLI `art [--harvest] [--force]` + a non-fatal cycle step.
- Tests (5, mocked HTTP) + verified live against the Met: harvested 1641 works,
  picked/cached "Repose" by John White Alexander. 371 tests green.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
This commit is contained in:
jay
2026-06-21 14:50:20 -04:00
parent 0c68c22221
commit 308516a263
6 changed files with 341 additions and 2 deletions
+1
View File
@@ -9,3 +9,4 @@ data/*.db
data/geo_audit*.json
logs/
data/art_cache/
+26 -2
View File
@@ -32,11 +32,11 @@ from pathlib import Path
from fastapi import BackgroundTasks, FastAPI, HTTPException, Query, Request, Response
from fastapi.middleware.cors import CORSMiddleware
from fastapi.responses import HTMLResponse, RedirectResponse
from fastapi.responses import FileResponse, HTMLResponse, RedirectResponse
from fastapi.staticfiles import StaticFiles
from pydantic import BaseModel
from . import auth, bloom, email_send, feeds, games, oauth_google, publishing, queries, share, sources, summarize
from . import art, auth, bloom, email_send, feeds, games, oauth_google, publishing, queries, share, sources, summarize
from .localtime import local_today
from .markup import reply_html_to_text, sanitize_reply_html
from .db import connect
@@ -2258,6 +2258,30 @@ def create_app() -> FastAPI:
with get_conn() as conn:
return queries.available_dates(conn, limit=limit)
# --- Daily Art (the /art room) -----------------------------------------
@app.get("/api/art/today")
def art_today(response: Response) -> dict:
    """Return the metadata for the current Daily Art piece.

    Looks the piece up via ``art.get_today`` and exposes its stored fields plus an
    ``image_url`` pointing at the cached-image route for the same object id.

    Raises:
        HTTPException: 404 when no piece has been stored yet.
    """
    with get_conn() as conn:
        a = art.get_today(conn)
    if not a:
        # Headers set on the injected ``response`` are discarded once an exception is
        # raised, so the cache policy for the 404 has to travel on the exception itself.
        raise HTTPException(status_code=404, detail="No art yet.",
                            headers={"Cache-Control": _PRIVATE})
    response.headers["Cache-Control"] = _EDGE_FEED  # one piece a day, same for everyone
    return {
        "date": a["art_date"], "object_id": a["object_id"], "title": a["title"],
        "artist": a["artist"], "date_text": a["date_text"], "medium": a["medium"],
        "department": a["department"], "credit": a["credit"], "source_url": a["source_url"],
        "source": a["source"], "image_url": f"/api/art/image/{a['object_id']}",
    }
@app.get("/api/art/image/{object_id}")
def art_image(object_id: int) -> FileResponse:
    """Serve the cached image file for ``object_id``.

    Any file in the art cache directory named ``<object_id>.<something>`` qualifies;
    the alphabetically first match is returned. Responds 404 when nothing is cached.
    """
    cached = sorted(art.cache_dir().glob(f"{object_id}.*"))
    if cached:
        # Cached museum image: immutable for a given object id.
        long_lived = {"Cache-Control": "public, max-age=31536000, immutable"}
        return FileResponse(str(cached[0]), headers=long_lived)
    raise HTTPException(status_code=404, detail="Not cached.")
@app.get("/api/replacement", response_model=Article | None)
def replacement(
exclude: str = Query("", description="comma-separated article ids already shown"),
+187
View File
@@ -0,0 +1,187 @@
"""Daily Art — the /art room. One gorgeous public-domain masterwork a day, picked from
a curated pool of museum highlights and cached to OUR origin (image + metadata), so the
homepage never waits on, nor hotlinks, the museum.
Source: The Met Collection API (no key; public-domain works are CC0 — free, unrestricted,
caching encouraged). Curation = isHighlight + isPublicDomain + hasImages, so the pool is
masterworks, never potsherds. Built to be bulletproof: a failed pick falls through to the
next candidate, and a failed day keeps yesterday's piece — the room is never empty.
Network calls go through module-level _http_* helpers so tests can monkeypatch them.
"""
from __future__ import annotations
import hashlib
import json
import os
import sqlite3
import urllib.error
import urllib.request
from pathlib import Path
from .localtime import local_today
MET_BASE = "https://collectionapi.metmuseum.org/public/collection/v1"
# Broad, visual, museum-grade terms. Each is filtered to public-domain highlights with
# images, then deduped — a diverse pool of a few thousand masterworks.
HARVEST_QUERIES = ("painting", "portrait", "landscape", "still life", "flowers",
"sculpture", "drawing", "garden", "river", "sunset")
_UA = {"User-Agent": "upbeatBytes/1.0 (+https://upbeatbytes.com)"}
_PICK_ATTEMPTS = 8 # candidates to try before giving up for the day
_NO_REPEAT_POOL = 40 # pick the daily piece from the N least-recently-shown
_MIN_IMAGE_BYTES = 3000 # smaller than this = not a real image
def _http_json(url: str, timeout: int = 20) -> dict:
    """GET ``url`` with the module's User-Agent header and parse the body as UTF-8 JSON.

    Kept as a module-level helper so tests can monkeypatch it.
    """
    request = urllib.request.Request(url, headers=_UA)
    with urllib.request.urlopen(request, timeout=timeout) as resp:
        body = resp.read()
    return json.loads(body.decode("utf-8"))
def _http_bytes(url: str, timeout: int = 30) -> tuple[bytes, str]:
    """GET ``url`` and return ``(body, content_type)``.

    ``content_type`` is the response's Content-Type header, or ``""`` when the header
    is missing or empty. Kept as a module-level helper so tests can monkeypatch it.
    """
    request = urllib.request.Request(url, headers=_UA)
    with urllib.request.urlopen(request, timeout=timeout) as resp:
        payload = resp.read()
        content_type = resp.headers.get("Content-Type") or ""
    return payload, content_type
def cache_dir() -> Path:
    """Return the directory holding cached images, creating it if necessary.

    ``GOODNEWS_ART_CACHE`` wins when set to a non-empty value. Otherwise the cache is
    an ``art_cache`` folder next to the database file named by ``GOODNEWS_DB``
    (default ``data/goodnews.sqlite3``), so the host cycle writes and the API
    container reads the same mounted volume.
    """
    explicit = os.environ.get("GOODNEWS_ART_CACHE")
    if explicit:
        target = Path(explicit)
    else:
        db_path = Path(os.environ.get("GOODNEWS_DB", "data/goodnews.sqlite3"))
        target = db_path.parent / "art_cache"
    target.mkdir(parents=True, exist_ok=True)
    return target
# --- harvest: build the curated pool of object IDs -------------------------------
def harvest_pool(conn: sqlite3.Connection, queries=HARVEST_QUERIES, source: str = "met") -> dict:
    """Search the Met for public-domain highlight works with images and store their IDs.

    One search call is made per term in ``queries``; the integer object IDs from all
    calls are deduplicated and inserted into ``art_pool`` (existing rows are left
    untouched). A failing query is counted and skipped rather than aborting the harvest.

    Args:
        conn: Open database connection with the ``art_pool`` table.
        queries: Iterable of search terms.
        source: Value stored in the ``source`` column for the harvested rows.

    Returns:
        A dict with ``queried`` (number of terms), ``errors`` (failed queries),
        ``found`` (distinct IDs seen), ``added`` (rows newly inserted) and ``pool``
        (total rows for ``source`` afterwards).
    """
    # quote() lives in urllib.parse; urllib.request only happens to re-export it.
    from urllib.parse import quote

    terms = tuple(queries)  # materialise once so any iterable works with len() below
    found: set[int] = set()
    errors = 0
    for term in terms:
        url = (f"{MET_BASE}/search?isHighlight=true&hasImages=true&isPublicDomain=true"
               f"&q={quote(term)}")
        try:
            data = _http_json(url)
            # "objectIDs" may be null when a search has no hits.
            found.update(oid for oid in (data.get("objectIDs") or []) if isinstance(oid, int))
        except Exception:  # noqa: BLE001 — non-fatal per query
            errors += 1

    count_sql = "SELECT COUNT(*) FROM art_pool WHERE source=?"
    before = conn.execute(count_sql, (source,)).fetchone()[0]
    conn.executemany(
        "INSERT OR IGNORE INTO art_pool (source, object_id) VALUES (?, ?)",
        [(source, oid) for oid in found],
    )
    conn.commit()
    after = conn.execute(count_sql, (source,)).fetchone()[0]
    return {"queried": len(terms), "errors": errors, "found": len(found),
            "added": after - before, "pool": after}
# --- daily pick: choose, fetch, cache --------------------------------------------
def _object(object_id: int) -> dict:
    """Fetch the Met object record for ``object_id`` as parsed JSON."""
    endpoint = f"{MET_BASE}/objects/{object_id}"
    return _http_json(endpoint)
def _download_image(obj: dict, object_id: int) -> str | None:
    """Download the object's image into the cache and return its filename.

    Tries ``primaryImageSmall`` first, then ``primaryImage``. A candidate is skipped
    when its URL is missing, the download raises, the Content-Type is not ``image/*``,
    or the body is smaller than ``_MIN_IMAGE_BYTES``.

    The file is written under a temporary name and then renamed into place, so a
    concurrent reader of the cache directory never sees a partially written image.

    Args:
        obj: The object record; only the two image-URL keys are read.
        object_id: Used as the cached file's base name.

    Returns:
        ``"<object_id>.png"`` or ``"<object_id>.jpg"`` on success; ``None`` when no
        candidate produced a usable image or the file could not be written.
    """
    for key in ("primaryImageSmall", "primaryImage"):
        url = obj.get(key)
        if not url:
            continue
        try:
            data, ctype = _http_bytes(url)
        except Exception:  # noqa: BLE001 — try the next image URL
            continue
        ctype = ctype.strip().lower()  # media types are case-insensitive
        if not ctype.startswith("image/") or len(data) < _MIN_IMAGE_BYTES:
            continue
        ext = ".png" if "png" in ctype else ".jpg"
        fname = f"{object_id}{ext}"
        tmp = None
        try:
            folder = cache_dir()
            # The dot-prefixed temp name deliberately does not match "<object_id>.*",
            # so a lookup by object id can never pick up the in-progress file.
            tmp = folder / f".tmp-{os.getpid()}-{fname}"
            tmp.write_bytes(data)
            os.replace(tmp, folder / fname)
        except OSError:
            if tmp is not None:
                try:
                    tmp.unlink(missing_ok=True)  # don't leave a partial file behind
                except OSError:
                    pass
            return None
        return fname
    return None
def _candidates(conn: sqlite3.Connection, art_date: str, source: str) -> list[int]:
    """Return up to ``_NO_REPEAT_POOL`` least-recently-shown pool IDs, rotated by date.

    Never-shown rows sort first, then oldest ``shown_at``, then ``object_id``. The
    list is rotated by an offset derived from a SHA-256 of ``art_date``, so the order
    is the same for everyone on a given day but differs between days.
    """
    cursor = conn.execute(
        "SELECT object_id FROM art_pool WHERE source=? ORDER BY shown_at IS NOT NULL, shown_at, object_id LIMIT ?",
        (source, _NO_REPEAT_POOL),
    )
    pool = [row[0] for row in cursor]
    if not pool:
        return pool
    digest = hashlib.sha256(art_date.encode()).hexdigest()
    offset = int(digest, 16) % len(pool)
    # rotate so the daily choice is stable but varies
    return pool[offset:] + pool[:offset]
def pick_daily(conn: sqlite3.Connection, art_date: str | None = None, source: str = "met",
               force: bool = False) -> dict | None:
    """Pick and cache the art for ``art_date`` (defaults to ``local_today()``).

    Idempotent: if a row already exists for the date it is returned unchanged unless
    ``force`` is set. Otherwise up to ``_PICK_ATTEMPTS`` candidates are tried in order;
    one is skipped when its record cannot be fetched, is not a public-domain object,
    or its image cannot be downloaded. The first usable candidate is upserted into
    ``daily_art`` and stamped in ``art_pool.shown_at``.

    Args:
        conn: Open database connection with ``art_pool`` and ``daily_art``.
        art_date: Date key for the pick; falls back to ``local_today()``.
        source: Pool to pick from.
        force: Re-pick even when the date already has a row.

    Returns:
        The stored ``daily_art`` row as a dict, or ``None`` when no candidate could be
        fetched (the caller keeps the prior day's piece).
    """
    art_date = art_date or local_today()
    existing = conn.execute("SELECT * FROM daily_art WHERE art_date=?", (art_date,)).fetchone()
    if existing and not force:
        return dict(existing)
    # Bound the work per call: every candidate costs network round-trips, so stop
    # after _PICK_ATTEMPTS instead of walking the whole candidate window.
    for oid in _candidates(conn, art_date, source)[:_PICK_ATTEMPTS]:
        try:
            obj = _object(oid)
        except Exception:  # noqa: BLE001 — a bad object must not break the day
            continue
        # An unexpected payload shape is treated like any other bad object.
        if not isinstance(obj, dict) or not obj.get("isPublicDomain"):
            continue
        fname = _download_image(obj, oid)
        if not fname:
            continue
        conn.execute(
            "INSERT INTO daily_art (art_date, source, object_id, title, artist, date_text, medium, "
            "department, credit, source_url, image_file) VALUES (?,?,?,?,?,?,?,?,?,?,?) "
            # source is refreshed too, so a forced re-pick never leaves a stale value.
            "ON CONFLICT(art_date) DO UPDATE SET source=excluded.source, "
            "object_id=excluded.object_id, title=excluded.title, "
            "artist=excluded.artist, date_text=excluded.date_text, medium=excluded.medium, "
            "department=excluded.department, credit=excluded.credit, source_url=excluded.source_url, "
            "image_file=excluded.image_file",
            (art_date, source, oid, obj.get("title") or "Untitled",
             obj.get("artistDisplayName") or None, obj.get("objectDate") or None,
             obj.get("medium") or None, obj.get("department") or None,
             obj.get("creditLine") or None, obj.get("objectURL") or None, fname),
        )
        conn.execute("UPDATE art_pool SET shown_at=? WHERE source=? AND object_id=?",
                     (art_date, source, oid))
        conn.commit()
        return dict(conn.execute("SELECT * FROM daily_art WHERE art_date=?", (art_date,)).fetchone())
    return None  # nothing fetched today — get_today falls back to the latest piece
def get_today(conn: sqlite3.Connection, art_date: str | None = None) -> dict | None:
"""Today's piece if present, else the most recent one cached (room is never empty)."""
if art_date:
row = conn.execute("SELECT * FROM daily_art WHERE art_date=?", (art_date,)).fetchone()
if row:
return dict(row)
row = conn.execute("SELECT * FROM daily_art ORDER BY art_date DESC LIMIT 1").fetchone()
return dict(row) if row else None
def run_daily(conn: sqlite3.Connection, source: str = "met") -> dict:
    """Cycle entry point: harvest the pool if it is empty, then ensure today has a piece.

    Returns a dict with ``pool`` (row count for ``source`` after the run),
    ``harvested`` (the harvest summary, or ``None`` when no harvest ran) and
    ``picked_object`` (the picked object id, or ``None`` when nothing was picked).
    """
    count_sql = "SELECT COUNT(*) FROM art_pool WHERE source=?"
    harvested = None
    if conn.execute(count_sql, (source,)).fetchone()[0] == 0:
        harvested = harvest_pool(conn, source=source)
    picked = pick_daily(conn, source=source)
    pool_size = conn.execute(count_sql, (source,)).fetchone()[0]
    picked_object = picked.get("object_id") if picked else None
    return {"pool": pool_size, "harvested": harvested, "picked_object": picked_object}
+26
View File
@@ -13,6 +13,7 @@ from .games import generate_daily_puzzles
from .localtime import local_today
from .dedup import DEFAULT_THRESHOLD, DEFAULT_WINDOW_DAYS, cluster_duplicates, dedup as run_dedup
from .geo import tag_articles as tag_geo
from . import art
from .enrich import enrich_brief_images, enrich_recent_images, enrich_summarized_images
from .summarize import generate_summary, get_summary
from .feeds import (
@@ -135,6 +136,7 @@ def main() -> None:
cycle_parser.add_argument("--no-dedup", action="store_true", help="Skip the embedding dedup step")
cycle_parser.add_argument("--no-geo", action="store_true", help="Skip tagging article subject-geography")
cycle_parser.add_argument("--geo-limit", type=int, default=60, help="Max articles to geo-tag per cycle")
cycle_parser.add_argument("--no-art", action="store_true", help="Skip the Daily Art pick")
cycle_parser.add_argument("--no-brief", action="store_true", help="Skip rebuilding today's brief")
cycle_parser.add_argument("--no-review", action="store_true", help="Skip recomputing source review flags")
cycle_parser.add_argument("--no-digest", action="store_true", help="Skip sending due daily digests")
@@ -150,6 +152,10 @@ def main() -> None:
)
enrich_images_parser.add_argument("--limit", type=int, default=50, help="Max articles to fetch this batch")
art_parser = subparsers.add_parser("art", help="Daily Art: harvest the pool and/or pick today's cached piece")
art_parser.add_argument("--harvest", action="store_true", help="(Re)harvest the curated museum pool")
art_parser.add_argument("--force", action="store_true", help="Re-pick today's art even if already chosen")
geo_parser = subparsers.add_parser("geo", help="Tag article subject-geography (backfill / manual). Cycle-locked.")
geo_parser.add_argument("--limit", type=int, default=200, help="Max articles to tag this batch")
geo_parser.add_argument("--reclassify", action="store_true", help="Re-tag even rows already at the current geo version")
@@ -307,6 +313,17 @@ def main() -> None:
elif args.command == "enrich-images":
found = enrich_summarized_images(conn, limit=args.limit)
print(f"enrich-images: {found} new image(s) for summarized articles")
elif args.command == "art":
init_db(conn)
if args.harvest:
h = art.harvest_pool(conn)
print(f"art harvest: found={h['found']} added={h['added']} pool={h['pool']} errors={h['errors']}")
picked = art.pick_daily(conn, force=args.force)
if picked:
print(f"art pick: {picked['art_date']} -> #{picked['object_id']} "
f"\"{picked['title']}\"{picked['artist'] or 'Unknown'}")
else:
print("art pick: nothing fetched (kept the last piece)")
elif args.command == "geo":
init_db(conn)
# Cycle-locked so a manual backfill can't contend with the scheduled cycle.
@@ -534,6 +551,15 @@ def _run_cycle_locked(conn: sqlite3.Connection, args: argparse.Namespace) -> Non
except Exception as exc:
print(f"geo: skipped ({exc})")
# Daily Art: ensure the pool exists, then ensure today has a cached piece. No-ops
# once the day is picked; non-fatal like every other step.
if not args.no_art:
try:
a = art.run_daily(conn)
print(f"art: pool={a['pool']} picked={a['picked_object']}")
except Exception as exc:
print(f"art: skipped ({exc})")
if not args.no_brief:
today = local_today()
try:
+26
View File
@@ -245,6 +245,32 @@ CREATE INDEX IF NOT EXISTS idx_article_places_article ON article_places(article_
CREATE INDEX IF NOT EXISTS idx_article_places_country ON article_places(country_code);
CREATE INDEX IF NOT EXISTS idx_article_geo_breadth ON article_geo(breadth);
-- Daily Art (the /art room). art_pool = a curated set of public-domain, highlighted
-- museum object IDs (so the daily pick never hits a potsherd). daily_art = one cached
-- piece per day (metadata + a locally-cached image), so the homepage never waits on or
-- hotlinks the museum. shown_at lets us avoid repeating a piece too soon.
-- art_pool: one row per (source, object_id) that is eligible to be the daily pick.
CREATE TABLE IF NOT EXISTS art_pool (
-- The museum's object id, as returned by the harvest search.
object_id INTEGER NOT NULL,
-- Which collection the id belongs to; only 'met' is harvested today.
source TEXT NOT NULL DEFAULT 'met',
-- The art_date on which this object was last picked; NULL until it has been shown.
-- Candidate ordering puts NULL rows first, then the oldest shown_at.
shown_at TEXT,
-- When the row was inserted into the pool.
added_at TEXT NOT NULL DEFAULT CURRENT_TIMESTAMP,
-- Composite key: re-harvesting the same id for the same source is a no-op insert.
PRIMARY KEY (source, object_id)
);
-- daily_art: the cached piece for each date (at most one row per art_date).
CREATE TABLE IF NOT EXISTS daily_art (
-- Date key for the pick; the primary key, so a re-pick updates the row in place.
art_date TEXT PRIMARY KEY,
source TEXT NOT NULL DEFAULT 'met',
-- The picked object's id; also the base name of the cached image file.
object_id INTEGER NOT NULL,
-- Descriptive metadata copied from the museum's object record at pick time.
-- All are nullable; the picker stores 'Untitled' when the record has no title.
title TEXT,
artist TEXT,
date_text TEXT,
medium TEXT,
department TEXT,
credit TEXT,
-- Link back to the object's page at the source museum.
source_url TEXT,
-- File name of the locally cached image inside the art cache directory.
image_file TEXT,
created_at TEXT NOT NULL DEFAULT CURRENT_TIMESTAMP
);
-- Privacy-respecting, first-party analytics. NO IP / user-agent / referrer / raw
-- URL. visitor_hash is a hash of a random localStorage token (never email/IP).
-- The UNIQUE key dedups to one row per (kind, article, visitor, day) — that both
+75
View File
@@ -0,0 +1,75 @@
"""Daily Art: curated harvest, bulletproof daily pick (skips non-public-domain / bad
images, falls through candidates), local image cache, and never-empty get_today."""
import pytest
from goodnews import art
from goodnews.db import connect, init_db
OBJECTS = {
1: {"objectID": 1, "isPublicDomain": True, "title": "Sunflowers", "artistDisplayName": "Van Gogh",
"objectDate": "1887", "medium": "Oil on canvas", "department": "European Paintings",
"creditLine": "Gift", "objectURL": "https://met/1",
"primaryImageSmall": "https://img/1-web.jpg", "primaryImage": "https://img/1.jpg"},
2: {"objectID": 2, "isPublicDomain": False, "primaryImageSmall": "https://img/2.jpg"}, # not CC0 -> skip
3: {"objectID": 3, "isPublicDomain": True, "title": "Irises", "artistDisplayName": "Van Gogh",
"primaryImageSmall": "https://img/3-web.jpg"},
}
def _fake_json(url, timeout=20):
    """Stand-in for ``art._http_json``: canned search results and object records."""
    if "/search" in url:
        return {"total": 3, "objectIDs": [1, 2, 3]}
    if "/objects/" not in url:
        raise AssertionError(url)
    object_id = int(url.rstrip("/").rsplit("/", 1)[-1])
    return OBJECTS[object_id]
def _fake_bytes(url, timeout=30):
    """Stand-in for ``art._http_bytes``: always a valid-looking jpeg payload."""
    jpeg_magic = b"\xff\xd8\xff"
    padding = b"x" * 5000
    return jpeg_magic + padding, "image/jpeg"
@pytest.fixture
def conn(tmp_path, monkeypatch):
    """In-memory database with the schema applied, a temp art cache, and faked HTTP."""
    monkeypatch.setenv("GOODNEWS_ART_CACHE", str(tmp_path / "art"))
    for helper_name, fake in (("_http_json", _fake_json), ("_http_bytes", _fake_bytes)):
        monkeypatch.setattr(art, helper_name, fake)
    db = connect(":memory:")
    init_db(db)
    yield db
    db.close()
def test_harvest_dedupes_into_pool(conn):
    """Ten queries all returning the same three ids yield a pool of three; re-running adds none."""
    first = art.harvest_pool(conn)
    assert first["pool"] == 3
    assert first["added"] == 3
    second = art.harvest_pool(conn)
    assert second["added"] == 0  # idempotent
def test_pick_caches_image_metadata_and_marks_shown(conn):
    """A pick stores metadata, writes the image to the cache, and stamps shown_at."""
    art.harvest_pool(conn)
    picked = art.pick_daily(conn, art_date="2026-06-21")
    assert picked
    assert picked["object_id"] in (1, 3)
    assert picked["title"] in ("Sunflowers", "Irises")
    assert picked["artist"] == "Van Gogh"
    assert picked["image_file"]
    cached_files = list(art.cache_dir().glob(f"{picked['object_id']}.*"))
    assert cached_files  # image cached locally
    row = conn.execute(
        "SELECT shown_at FROM art_pool WHERE object_id=?", (picked["object_id"],)
    ).fetchone()
    assert row[0] == "2026-06-21"
def test_pick_skips_non_public_domain(conn):
    """With only a non-public-domain object in the pool, the pick yields nothing."""
    # only the non-CC0 one
    conn.execute("INSERT INTO art_pool (source, object_id) VALUES ('met', 2)")
    conn.commit()
    result = art.pick_daily(conn, art_date="2026-06-21")
    assert result is None  # nothing fetched, not an error
def test_pick_is_idempotent_and_get_today_never_empty(conn):
    """Re-picking the same day changes nothing; get_today falls back to the latest piece."""
    art.harvest_pool(conn)
    day = "2026-06-21"
    first = art.pick_daily(conn, art_date=day)
    second = art.pick_daily(conn, art_date=day)  # same day -> unchanged
    expected_id = first["object_id"]
    assert expected_id == second["object_id"]
    assert art.get_today(conn, day)["object_id"] == expected_id
    # an unknown date falls back to the most recent cached piece (room never empty)
    assert art.get_today(conn, "2099-01-01")["object_id"] == expected_id
def test_run_daily_bootstraps_pool_then_picks(conn):
    """On an empty database run_daily harvests the pool first, then picks a usable piece."""
    summary = art.run_daily(conn)
    assert summary["pool"] == 3
    assert summary["picked_object"] in (1, 3)