Daily Art backend: curated Met pool, daily cached pick, /api/art (prototype)
The engine for the /art room (design-independent; deploy held for Codex review).
- goodnews/art.py: harvest a curated pool of public-domain HIGHLIGHT artworks from the
Met (isHighlight+isPublicDomain+hasImages -> masterworks, never potsherds; CC0). Daily
deterministic pick from the least-recently-shown (no soon-repeats, same for everyone),
fetch metadata + download the image to OUR cache (data/art_cache) so the homepage never
waits on or hotlinks the museum. Bulletproof: bad object/image falls through candidates;
a failed day keeps the last piece (room never empty). Injectable HTTP for tests.
- Schema: art_pool + daily_art. /api/art/today (edge-cacheable) + /api/art/image/{id}
(served from cache, immutable). CLI `art [--harvest] [--force]` + a non-fatal cycle step.
- Tests (5, mocked HTTP) + verified live against the Met: harvested 1641 works,
picked/cached "Repose" by John White Alexander. 371 tests green.
Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
This commit is contained in:
+26
-2
@@ -32,11 +32,11 @@ from pathlib import Path
|
||||
|
||||
from fastapi import BackgroundTasks, FastAPI, HTTPException, Query, Request, Response
|
||||
from fastapi.middleware.cors import CORSMiddleware
|
||||
from fastapi.responses import HTMLResponse, RedirectResponse
|
||||
from fastapi.responses import FileResponse, HTMLResponse, RedirectResponse
|
||||
from fastapi.staticfiles import StaticFiles
|
||||
from pydantic import BaseModel
|
||||
|
||||
from . import auth, bloom, email_send, feeds, games, oauth_google, publishing, queries, share, sources, summarize
|
||||
from . import art, auth, bloom, email_send, feeds, games, oauth_google, publishing, queries, share, sources, summarize
|
||||
from .localtime import local_today
|
||||
from .markup import reply_html_to_text, sanitize_reply_html
|
||||
from .db import connect
|
||||
@@ -2258,6 +2258,30 @@ def create_app() -> FastAPI:
|
||||
with get_conn() as conn:
|
||||
return queries.available_dates(conn, limit=limit)
|
||||
|
||||
# --- Daily Art (the /art room) -----------------------------------------
|
||||
@app.get("/api/art/today")
def art_today(response: Response) -> dict:
    """Return the current Daily Art piece as JSON for the /art room.

    The piece comes from ``art.get_today``; ``image_url`` points at this app's own
    image endpoint rather than the museum. A successful response carries the
    ``_EDGE_FEED`` cache policy (one piece a day, same for everyone).

    Raises:
        HTTPException: 404 when ``art.get_today`` returns nothing. The error response
            carries the ``_PRIVATE`` cache policy.
    """
    with get_conn() as conn:
        piece = art.get_today(conn)
    if not piece:
        # Headers set on the injected ``response`` only reach a normally-returned
        # response; once an exception is raised the error response is built from the
        # exception instead, so the cache policy has to travel on the HTTPException.
        raise HTTPException(
            status_code=404,
            detail="No art yet.",
            headers={"Cache-Control": _PRIVATE},
        )
    response.headers["Cache-Control"] = _EDGE_FEED  # one piece a day, same for everyone
    return {
        "date": piece["art_date"],
        "object_id": piece["object_id"],
        "title": piece["title"],
        "artist": piece["artist"],
        "date_text": piece["date_text"],
        "medium": piece["medium"],
        "department": piece["department"],
        "credit": piece["credit"],
        "source_url": piece["source_url"],
        "source": piece["source"],
        "image_url": f"/api/art/image/{piece['object_id']}",
    }
|
||||
|
||||
@app.get("/api/art/image/{object_id}")
def art_image(object_id: int) -> FileResponse:
    """Serve the locally cached image for ``object_id``.

    Looks in ``art.cache_dir()`` for any file named ``<object_id>.<ext>`` and returns
    the first match in sorted order. Responds 404 ("Not cached.") when there is none.
    """
    cached = sorted(art.cache_dir().glob(f"{object_id}.*"))
    if cached:
        # Cached museum image: immutable for a given object id.
        long_lived = {"Cache-Control": "public, max-age=31536000, immutable"}
        return FileResponse(str(cached[0]), headers=long_lived)
    raise HTTPException(status_code=404, detail="Not cached.")
|
||||
|
||||
@app.get("/api/replacement", response_model=Article | None)
|
||||
def replacement(
|
||||
exclude: str = Query("", description="comma-separated article ids already shown"),
|
||||
|
||||
+187
@@ -0,0 +1,187 @@
|
||||
"""Daily Art — the /art room. One gorgeous public-domain masterwork a day, picked from
|
||||
a curated pool of museum highlights and cached to OUR origin (image + metadata), so the
|
||||
homepage never waits on, nor hotlinks, the museum.
|
||||
|
||||
Source: The Met Collection API (no key; public-domain works are CC0 — free, unrestricted,
|
||||
caching encouraged). Curation = isHighlight + isPublicDomain + hasImages, so the pool is
|
||||
masterworks, never potsherds. Built to be bulletproof: a failed pick falls through to the
|
||||
next candidate, and a failed day keeps yesterday's piece — the room is never empty.
|
||||
|
||||
Network calls go through module-level _http_* helpers so tests can monkeypatch them.
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import hashlib
|
||||
import json
|
||||
import os
|
||||
import sqlite3
|
||||
import urllib.error
|
||||
import urllib.request
|
||||
from pathlib import Path
|
||||
|
||||
from .localtime import local_today
|
||||
|
||||
# Root of the Met Collection API; every request URL in this module is built on it.
MET_BASE = "https://collectionapi.metmuseum.org/public/collection/v1"

# Broad, visual, museum-grade search terms. Each search is restricted to public-domain
# highlights with images and the results are deduped into one diverse pool.
HARVEST_QUERIES = (
    "painting",
    "portrait",
    "landscape",
    "still life",
    "flowers",
    "sculpture",
    "drawing",
    "garden",
    "river",
    "sunset",
)

# Request headers sent with every outbound HTTP call.
_UA = {"User-Agent": "upbeatBytes/1.0 (+https://upbeatbytes.com)"}

_PICK_ATTEMPTS = 8        # candidates to try before giving up for the day
_NO_REPEAT_POOL = 40      # pick the daily piece from the N least-recently-shown
_MIN_IMAGE_BYTES = 3000   # smaller than this = not a real image
|
||||
|
||||
|
||||
def _http_json(url: str, timeout: int = 20) -> dict:
    """GET ``url`` with the module's User-Agent and return the UTF-8 body parsed as JSON.

    Kept as a module-level helper so tests can monkeypatch it. Network and parse errors
    propagate to the caller.
    """
    request = urllib.request.Request(url, headers=_UA)
    with urllib.request.urlopen(request, timeout=timeout) as resp:
        raw = resp.read()
    return json.loads(raw.decode("utf-8"))
|
||||
|
||||
|
||||
def _http_bytes(url: str, timeout: int = 30) -> tuple[bytes, str]:
    """GET ``url`` with the module's User-Agent; return ``(body, content_type)``.

    ``content_type`` is the raw ``Content-Type`` header, or ``""`` when the server sent
    none. Kept as a module-level helper so tests can monkeypatch it.
    """
    request = urllib.request.Request(url, headers=_UA)
    with urllib.request.urlopen(request, timeout=timeout) as resp:
        payload = resp.read()
        content_type = resp.headers.get("Content-Type") or ""
    return payload, content_type
|
||||
|
||||
|
||||
def cache_dir() -> Path:
    """Return the image cache directory, creating it (and any parents) if needed.

    ``GOODNEWS_ART_CACHE`` wins when set and non-empty. Otherwise the cache is an
    ``art_cache`` folder beside the database file named by ``GOODNEWS_DB`` (default
    ``data/goodnews.sqlite3``), so the host cycle writes and the API container reads
    the same mounted volume.
    """
    explicit = os.environ.get("GOODNEWS_ART_CACHE")
    if explicit:
        folder = Path(explicit)
    else:
        db_path = Path(os.environ.get("GOODNEWS_DB", "data/goodnews.sqlite3"))
        folder = db_path.parent / "art_cache"
    folder.mkdir(parents=True, exist_ok=True)
    return folder
|
||||
|
||||
|
||||
# --- harvest: build the curated pool of object IDs -------------------------------
|
||||
|
||||
def harvest_pool(conn: sqlite3.Connection, queries=HARVEST_QUERIES, source: str = "met") -> dict:
    """Query the Met for public-domain highlight images across broad art terms and store
    the deduped object IDs in ``art_pool``.

    Cheap: each search returns all matching IDs in one call. A failing query is counted
    and skipped rather than aborting the harvest. IDs already in the pool are left
    untouched (``INSERT OR IGNORE``), and the insert is committed before returning.

    Returns a summary dict: ``queried`` (number of search terms), ``errors`` (searches
    that failed), ``found`` (distinct IDs returned this run), ``added`` (rows new to the
    pool) and ``pool`` (pool size for ``source`` afterwards).
    """
    # ``urllib.request.quote`` only works because that module happens to import the
    # name; the documented function lives in ``urllib.parse``.
    from urllib.parse import quote

    found: set[int] = set()
    errors = 0
    for q in queries:
        url = (f"{MET_BASE}/search?isHighlight=true&hasImages=true&isPublicDomain=true"
               f"&q={quote(q)}")
        try:
            data = _http_json(url)
            # "objectIDs" may be null when nothing matches; ignore non-integer entries.
            found.update(oid for oid in (data.get("objectIDs") or []) if isinstance(oid, int))
        except Exception:  # noqa: BLE001 — non-fatal per query
            errors += 1

    count_sql = "SELECT COUNT(*) FROM art_pool WHERE source=?"
    before = conn.execute(count_sql, (source,)).fetchone()[0]
    conn.executemany(
        "INSERT OR IGNORE INTO art_pool (source, object_id) VALUES (?, ?)",
        [(source, oid) for oid in found],
    )
    conn.commit()
    after = conn.execute(count_sql, (source,)).fetchone()[0]
    return {"queried": len(queries), "errors": errors, "found": len(found),
            "added": after - before, "pool": after}
|
||||
|
||||
|
||||
# --- daily pick: choose, fetch, cache --------------------------------------------
|
||||
|
||||
def _object(object_id: int) -> dict:
    """Fetch the Met's JSON record for one object ID."""
    endpoint = f"{MET_BASE}/objects/{object_id}"
    return _http_json(endpoint)
|
||||
|
||||
|
||||
def _download_image(obj: dict, object_id: int) -> str | None:
    """Download the web-large (then full) image to our cache; return the filename or None.

    ``obj`` is the Met object record. ``primaryImageSmall`` is tried first, then
    ``primaryImage``. A URL is skipped when the download fails, the response is not an
    ``image/*`` content type, or the body is under ``_MIN_IMAGE_BYTES``. The file is
    stored as ``<object_id>.png`` for PNG responses and ``<object_id>.jpg`` otherwise.

    The bytes go to a hidden temp file that is then renamed into place, so a reader of
    the cache directory never sees a half-written image. Returns None when no URL
    yields a usable image or the cache cannot be written.
    """
    for key in ("primaryImageSmall", "primaryImage"):
        url = obj.get(key)
        if not url:
            continue
        try:
            data, ctype = _http_bytes(url)
        except Exception:  # noqa: BLE001 — a bad URL just falls through to the next one
            continue
        ctype = ctype.lower()  # media types are case-insensitive
        if not ctype.startswith("image/") or len(data) < _MIN_IMAGE_BYTES:
            continue
        ext = ".png" if "png" in ctype else ".jpg"
        fname = f"{object_id}{ext}"
        partial = None
        try:
            folder = cache_dir()
            # Leading dot keeps the temp file out of any "<object_id>.*" lookup.
            partial = folder / f".{fname}.part"
            partial.write_bytes(data)
            os.replace(partial, folder / fname)
        except OSError:
            # A disk problem is not specific to this URL, so give up on the object.
            if partial is not None:
                try:
                    partial.unlink()
                except OSError:
                    pass  # best-effort cleanup; the temp file may never have existed
            return None
        return fname
    return None
|
||||
|
||||
|
||||
def _candidates(conn: sqlite3.Connection, art_date: str, source: str) -> list[int]:
    """Return the day's candidate object IDs, in the order they should be tried.

    Takes the ``_NO_REPEAT_POOL`` least-recently-shown pool entries for ``source``
    (never-shown first, then oldest ``shown_at``, ties broken by ``object_id``) and
    rotates that list by an offset derived from a SHA-256 of ``art_date``. The result is
    therefore the same for everyone on a given day, and pieces don't repeat soon.
    Returns an empty list when the pool is empty.
    """
    query = ("SELECT object_id FROM art_pool WHERE source=? "
             "ORDER BY shown_at IS NOT NULL, shown_at, object_id LIMIT ?")
    window = [row[0] for row in conn.execute(query, (source, _NO_REPEAT_POOL)).fetchall()]
    if not window:
        return window
    digest = hashlib.sha256(art_date.encode()).hexdigest()
    offset = int(digest, 16) % len(window)
    # Rotate so the daily choice is stable but varies from day to day.
    return window[offset:] + window[:offset]
|
||||
|
||||
|
||||
def pick_daily(conn: sqlite3.Connection, art_date: str | None = None, source: str = "met",
|
||||
force: bool = False) -> dict | None:
|
||||
"""Pick + cache the day's art. Idempotent (skips if today's already done unless force).
|
||||
Tries successive candidates so a bad object/image never breaks the day; returns the
|
||||
stored row, or None if nothing could be fetched (caller keeps the prior day's piece)."""
|
||||
art_date = art_date or local_today()
|
||||
existing = conn.execute("SELECT * FROM daily_art WHERE art_date=?", (art_date,)).fetchone()
|
||||
if existing and not force:
|
||||
return dict(existing)
|
||||
for oid in _candidates(conn, art_date, source):
|
||||
try:
|
||||
obj = _object(oid)
|
||||
except Exception: # noqa: BLE001
|
||||
continue
|
||||
if not obj.get("isPublicDomain"):
|
||||
continue
|
||||
fname = _download_image(obj, oid)
|
||||
if not fname:
|
||||
continue
|
||||
conn.execute(
|
||||
"INSERT INTO daily_art (art_date, source, object_id, title, artist, date_text, medium, "
|
||||
"department, credit, source_url, image_file) VALUES (?,?,?,?,?,?,?,?,?,?,?) "
|
||||
"ON CONFLICT(art_date) DO UPDATE SET object_id=excluded.object_id, title=excluded.title, "
|
||||
"artist=excluded.artist, date_text=excluded.date_text, medium=excluded.medium, "
|
||||
"department=excluded.department, credit=excluded.credit, source_url=excluded.source_url, "
|
||||
"image_file=excluded.image_file",
|
||||
(art_date, source, oid, obj.get("title") or "Untitled",
|
||||
obj.get("artistDisplayName") or None, obj.get("objectDate") or None,
|
||||
obj.get("medium") or None, obj.get("department") or None,
|
||||
obj.get("creditLine") or None, obj.get("objectURL") or None, fname),
|
||||
)
|
||||
conn.execute("UPDATE art_pool SET shown_at=? WHERE source=? AND object_id=?",
|
||||
(art_date, source, oid))
|
||||
conn.commit()
|
||||
return dict(conn.execute("SELECT * FROM daily_art WHERE art_date=?", (art_date,)).fetchone())
|
||||
return None # nothing fetched today — get_today falls back to the latest piece
|
||||
|
||||
|
||||
def get_today(conn: sqlite3.Connection, art_date: str | None = None) -> dict | None:
|
||||
"""Today's piece if present, else the most recent one cached (room is never empty)."""
|
||||
if art_date:
|
||||
row = conn.execute("SELECT * FROM daily_art WHERE art_date=?", (art_date,)).fetchone()
|
||||
if row:
|
||||
return dict(row)
|
||||
row = conn.execute("SELECT * FROM daily_art ORDER BY art_date DESC LIMIT 1").fetchone()
|
||||
return dict(row) if row else None
|
||||
|
||||
|
||||
def run_daily(conn: sqlite3.Connection, source: str = "met") -> dict:
    """Cycle entry point: make sure the pool exists, then make sure today has a piece.

    Harvests only when the pool for ``source`` is empty, then delegates to
    ``pick_daily`` (which returns the existing row once the day is picked), so it is
    safe to call every cycle.

    Returns ``{"pool": <pool size>, "harvested": <harvest summary or None>,
    "picked_object": <object id or None>}``.
    """
    count_sql = "SELECT COUNT(*) FROM art_pool WHERE source=?"

    def pool_size() -> int:
        return conn.execute(count_sql, (source,)).fetchone()[0]

    harvested = harvest_pool(conn, source=source) if pool_size() == 0 else None
    picked = pick_daily(conn, source=source)
    picked_object = picked.get("object_id") if picked else None
    return {"pool": pool_size(), "harvested": harvested, "picked_object": picked_object}
|
||||
@@ -13,6 +13,7 @@ from .games import generate_daily_puzzles
|
||||
from .localtime import local_today
|
||||
from .dedup import DEFAULT_THRESHOLD, DEFAULT_WINDOW_DAYS, cluster_duplicates, dedup as run_dedup
|
||||
from .geo import tag_articles as tag_geo
|
||||
from . import art
|
||||
from .enrich import enrich_brief_images, enrich_recent_images, enrich_summarized_images
|
||||
from .summarize import generate_summary, get_summary
|
||||
from .feeds import (
|
||||
@@ -135,6 +136,7 @@ def main() -> None:
|
||||
cycle_parser.add_argument("--no-dedup", action="store_true", help="Skip the embedding dedup step")
|
||||
cycle_parser.add_argument("--no-geo", action="store_true", help="Skip tagging article subject-geography")
|
||||
cycle_parser.add_argument("--geo-limit", type=int, default=60, help="Max articles to geo-tag per cycle")
|
||||
cycle_parser.add_argument("--no-art", action="store_true", help="Skip the Daily Art pick")
|
||||
cycle_parser.add_argument("--no-brief", action="store_true", help="Skip rebuilding today's brief")
|
||||
cycle_parser.add_argument("--no-review", action="store_true", help="Skip recomputing source review flags")
|
||||
cycle_parser.add_argument("--no-digest", action="store_true", help="Skip sending due daily digests")
|
||||
@@ -150,6 +152,10 @@ def main() -> None:
|
||||
)
|
||||
enrich_images_parser.add_argument("--limit", type=int, default=50, help="Max articles to fetch this batch")
|
||||
|
||||
art_parser = subparsers.add_parser("art", help="Daily Art: harvest the pool and/or pick today's cached piece")
|
||||
art_parser.add_argument("--harvest", action="store_true", help="(Re)harvest the curated museum pool")
|
||||
art_parser.add_argument("--force", action="store_true", help="Re-pick today's art even if already chosen")
|
||||
|
||||
geo_parser = subparsers.add_parser("geo", help="Tag article subject-geography (backfill / manual). Cycle-locked.")
|
||||
geo_parser.add_argument("--limit", type=int, default=200, help="Max articles to tag this batch")
|
||||
geo_parser.add_argument("--reclassify", action="store_true", help="Re-tag even rows already at the current geo version")
|
||||
@@ -307,6 +313,17 @@ def main() -> None:
|
||||
elif args.command == "enrich-images":
|
||||
found = enrich_summarized_images(conn, limit=args.limit)
|
||||
print(f"enrich-images: {found} new image(s) for summarized articles")
|
||||
elif args.command == "art":
|
||||
init_db(conn)
|
||||
if args.harvest:
|
||||
h = art.harvest_pool(conn)
|
||||
print(f"art harvest: found={h['found']} added={h['added']} pool={h['pool']} errors={h['errors']}")
|
||||
picked = art.pick_daily(conn, force=args.force)
|
||||
if picked:
|
||||
print(f"art pick: {picked['art_date']} -> #{picked['object_id']} "
|
||||
f"\"{picked['title']}\" — {picked['artist'] or 'Unknown'}")
|
||||
else:
|
||||
print("art pick: nothing fetched (kept the last piece)")
|
||||
elif args.command == "geo":
|
||||
init_db(conn)
|
||||
# Cycle-locked so a manual backfill can't contend with the scheduled cycle.
|
||||
@@ -534,6 +551,15 @@ def _run_cycle_locked(conn: sqlite3.Connection, args: argparse.Namespace) -> Non
|
||||
except Exception as exc:
|
||||
print(f"geo: skipped ({exc})")
|
||||
|
||||
# Daily Art: ensure the pool exists, then ensure today has a cached piece. No-ops
|
||||
# once the day is picked; non-fatal like every other step.
|
||||
if not args.no_art:
|
||||
try:
|
||||
a = art.run_daily(conn)
|
||||
print(f"art: pool={a['pool']} picked={a['picked_object']}")
|
||||
except Exception as exc:
|
||||
print(f"art: skipped ({exc})")
|
||||
|
||||
if not args.no_brief:
|
||||
today = local_today()
|
||||
try:
|
||||
|
||||
@@ -245,6 +245,32 @@ CREATE INDEX IF NOT EXISTS idx_article_places_article ON article_places(article_
|
||||
CREATE INDEX IF NOT EXISTS idx_article_places_country ON article_places(country_code);
|
||||
CREATE INDEX IF NOT EXISTS idx_article_geo_breadth ON article_geo(breadth);
|
||||
|
||||
-- Daily Art (the /art room). art_pool = a curated set of public-domain, highlighted
|
||||
-- museum object IDs (so the daily pick never hits a potsherd). daily_art = one cached
|
||||
-- piece per day (metadata + a locally-cached image), so the homepage never waits on or
|
||||
-- hotlinks the museum. shown_at lets us avoid repeating a piece too soon.
|
||||
CREATE TABLE IF NOT EXISTS art_pool (
|
||||
object_id INTEGER NOT NULL,
|
||||
source TEXT NOT NULL DEFAULT 'met',
|
||||
shown_at TEXT,
|
||||
added_at TEXT NOT NULL DEFAULT CURRENT_TIMESTAMP,
|
||||
PRIMARY KEY (source, object_id)
|
||||
);
|
||||
CREATE TABLE IF NOT EXISTS daily_art (
|
||||
art_date TEXT PRIMARY KEY,
|
||||
source TEXT NOT NULL DEFAULT 'met',
|
||||
object_id INTEGER NOT NULL,
|
||||
title TEXT,
|
||||
artist TEXT,
|
||||
date_text TEXT,
|
||||
medium TEXT,
|
||||
department TEXT,
|
||||
credit TEXT,
|
||||
source_url TEXT,
|
||||
image_file TEXT,
|
||||
created_at TEXT NOT NULL DEFAULT CURRENT_TIMESTAMP
|
||||
);
|
||||
|
||||
-- Privacy-respecting, first-party analytics. NO IP / user-agent / referrer / raw
|
||||
-- URL. visitor_hash is a hash of a random localStorage token (never email/IP).
|
||||
-- The UNIQUE key dedups to one row per (kind, article, visitor, day) — that both
|
||||
|
||||
Reference in New Issue
Block a user