ed814c97b9
- daily_art gains blurb + palette columns (idempotent migration). - art._palette: Pillow median-cut to ~5 hex colors from the cached image (best- effort → [] on any failure). art._blurb: a warm 2-3 sentence "what you're looking at" note grounded in the Met catalogue (title/artist/bio/date/medium/ classification/culture/tags). Prompt leans on context/significance and the title+tags for subject — explicitly NOT asserting literal composition (figure counts/poses) it can't see, since the model can't view the image. Markdown stripped from the output. - pick_daily generates both (client optional → blurb skipped when absent); cycle + art CLI pass an LLM client. /api/art/today exposes blurb + palette. - Backfilled the last 3 days on host (Veteran / Magnolia Vase / Bierstadt). - scripts/art_blurb_palette_backfill.py for in-place backfill (no re-pick). Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
277 lines
13 KiB
Python
277 lines
13 KiB
Python
"""Daily Art — the /art room. One gorgeous public-domain masterwork a day, picked from
|
|
a curated pool of museum highlights and cached to OUR origin (image + metadata), so the
|
|
homepage never waits on, nor hotlinks, the museum.
|
|
|
|
Source: The Met Collection API (no key; public-domain works are CC0 — free, unrestricted,
|
|
caching encouraged). Curation = isHighlight + isPublicDomain + hasImages, so the pool is
|
|
masterworks, never potsherds. Built to be bulletproof: a failed pick falls through to the
|
|
next candidate, and a failed day keeps yesterday's piece — the room is never empty.
|
|
|
|
Network calls go through module-level _http_* helpers so tests can monkeypatch them.
|
|
"""
|
|
from __future__ import annotations
|
|
|
|
import hashlib
|
|
import json
|
|
import os
|
|
import sqlite3
|
|
import urllib.error
|
|
import urllib.request
|
|
from pathlib import Path
|
|
|
|
from .localtime import local_today
|
|
|
|
MET_BASE = "https://collectionapi.metmuseum.org/public/collection/v1"
|
|
# Broad, visual, museum-grade terms. Each is filtered to public-domain highlights with
|
|
# images, then deduped — a diverse pool of a few thousand masterworks.
|
|
HARVEST_QUERIES = ("painting", "portrait", "landscape", "still life", "flowers",
|
|
"sculpture", "drawing", "garden", "river", "sunset")
|
|
_UA = {"User-Agent": "upbeatBytes/1.0 (+https://upbeatbytes.com)"}
|
|
_PICK_ATTEMPTS = 8 # candidates to try before giving up for the day
|
|
_NO_REPEAT_POOL = 40 # pick the daily piece from the N least-recently-shown
|
|
_MIN_IMAGE_BYTES = 3000 # smaller than this = not a real image
|
|
|
|
|
|
def _http_json(url: str, timeout: int = 20) -> dict:
    """GET *url* with the module's User-Agent header and return the decoded JSON body.

    The response is read as UTF-8. Network and parse errors are not handled here;
    they propagate to the caller.
    """
    request = urllib.request.Request(url, headers=_UA)
    with urllib.request.urlopen(request, timeout=timeout) as response:
        raw = response.read()
    return json.loads(raw.decode("utf-8"))
|
|
|
|
|
|
def _http_bytes(url: str, timeout: int = 30) -> tuple[bytes, str]:
    """GET *url* and return ``(body, content_type)``.

    ``content_type`` is the raw ``Content-Type`` response header, or ``""`` when the
    server did not send one. Network errors propagate to the caller.
    """
    request = urllib.request.Request(url, headers=_UA)
    with urllib.request.urlopen(request, timeout=timeout) as response:
        payload = response.read()
        content_type = response.headers.get("Content-Type") or ""
    return payload, content_type
|
|
|
|
|
|
def cache_dir() -> Path:
    """Return the directory holding cached images, creating it if needed.

    ``GOODNEWS_ART_CACHE`` wins when set. Otherwise the cache is an ``art_cache``
    folder beside the database file named by ``GOODNEWS_DB`` (default
    ``data/goodnews.sqlite3``), so the host cycle writes and the API container reads
    the same mounted volume.
    """
    explicit = os.environ.get("GOODNEWS_ART_CACHE")
    target = (
        Path(explicit)
        if explicit
        else Path(os.environ.get("GOODNEWS_DB", "data/goodnews.sqlite3")).parent / "art_cache"
    )
    target.mkdir(parents=True, exist_ok=True)
    return target
|
|
|
|
|
|
# --- harvest: build the curated pool of object IDs -------------------------------
|
|
|
|
def harvest_pool(conn: sqlite3.Connection, queries=HARVEST_QUERIES, source: str = "met") -> dict:
    """Search the Met for public-domain highlight images and store the deduped IDs.

    One search request is made per term in *queries*; each returns all matching
    object IDs in a single call, so the harvest is cheap. A failing query is counted
    and skipped rather than aborting the harvest.

    Args:
        conn: Open SQLite connection with an ``art_pool`` table.
        queries: Iterable of search terms (any iterable, including a generator).
        source: Value written to ``art_pool.source`` for every harvested ID.

    Returns:
        A summary dict with ``queried`` (terms attempted), ``errors`` (terms that
        failed), ``found`` (distinct IDs returned), ``added`` (rows newly inserted)
        and ``pool`` (total rows for *source* afterwards).
    """
    # quote() is documented in urllib.parse; urllib.request merely re-exports it.
    from urllib.parse import quote

    # Materialise once so a one-shot iterable can be both iterated and counted.
    queries = tuple(queries)
    found, errors = set(), 0
    for q in queries:
        url = (f"{MET_BASE}/search?isHighlight=true&hasImages=true&isPublicDomain=true"
               f"&q={quote(q)}")
        try:
            data = _http_json(url)
            # "objectIDs" may be missing or null when a term matches nothing.
            for oid in (data.get("objectIDs") or []):
                if isinstance(oid, int):
                    found.add(oid)
        except Exception:  # noqa: BLE001 — non-fatal per query
            errors += 1

    count_sql = "SELECT COUNT(*) FROM art_pool WHERE source=?"
    before = conn.execute(count_sql, (source,)).fetchone()[0]
    # INSERT OR IGNORE leaves already-pooled IDs untouched.
    conn.executemany(
        "INSERT OR IGNORE INTO art_pool (source, object_id) VALUES (?, ?)",
        [(source, oid) for oid in found],
    )
    conn.commit()
    after = conn.execute(count_sql, (source,)).fetchone()[0]
    return {"queried": len(queries), "errors": errors, "found": len(found),
            "added": after - before, "pool": after}
|
|
|
|
|
|
# --- daily pick: choose, fetch, cache --------------------------------------------
|
|
|
|
def _object(object_id: int) -> dict:
    """Fetch the catalogue record for *object_id* from the Met objects endpoint."""
    endpoint = f"{MET_BASE}/objects/{object_id}"
    return _http_json(endpoint)
|
|
|
|
|
|
def _fetch_to_cache(url: str | None, stem: str) -> str | None:
|
|
"""Download one image URL to cache as {stem}.{ext}; return the filename or None.
|
|
Writes to a temp file then atomically renames, so a reader never sees a half-file."""
|
|
if not url:
|
|
return None
|
|
try:
|
|
data, ctype = _http_bytes(url)
|
|
except Exception: # noqa: BLE001
|
|
return None
|
|
if not ctype.startswith("image/") or len(data) < _MIN_IMAGE_BYTES:
|
|
return None
|
|
ext = ".png" if "png" in ctype else ".jpg"
|
|
fname = f"{stem}{ext}"
|
|
cdir = cache_dir()
|
|
tmp = cdir / f".{stem}.tmp"
|
|
try:
|
|
tmp.write_bytes(data)
|
|
os.replace(tmp, cdir / fname) # atomic
|
|
except OSError:
|
|
try:
|
|
tmp.unlink()
|
|
except OSError:
|
|
pass
|
|
return None
|
|
return fname
|
|
|
|
|
|
def _download_image(obj: dict, object_id: int) -> str | None:
    """Cache the piece's images locally and return the display filename.

    The display copy (``primaryImageSmall``, falling back to ``primaryImage``) is
    stored as ``{id}.{ext}`` — this is what the page shows. When a distinct
    ``primaryImage`` exists it is also stored as ``{id}-full.{ext}`` for the lightbox
    zoom; that second fetch is best-effort and its result is ignored. Returns None
    if the display copy could not be fetched.
    """
    small_url = obj.get("primaryImageSmall")
    full_url = obj.get("primaryImage")

    shown = _fetch_to_cache(small_url or full_url, str(object_id))
    if shown:
        if full_url and full_url != small_url:
            _fetch_to_cache(full_url, f"{object_id}-full")  # best-effort hi-res for zoom
        return shown
    return None
|
|
|
|
|
|
def _palette(image_path: "Path", n: int = 5) -> list[str]:
|
|
"""Extract ~n representative hex colors from the cached image (for the 'colors in this
|
|
piece' strip). Best-effort: any failure → empty list (the strip just hides)."""
|
|
try:
|
|
from PIL import Image
|
|
with Image.open(image_path) as im:
|
|
im = im.convert("RGB")
|
|
im.thumbnail((120, 120)) # tiny — palette, not fidelity
|
|
# Adaptive median-cut to a small palette, then order by how much of the image each covers.
|
|
q = im.quantize(colors=max(n * 2, 8), method=Image.Quantize.MEDIANCUT)
|
|
pal = q.getpalette()
|
|
counts = sorted(q.getcolors(), reverse=True) # [(count, index), ...] most-used first
|
|
out, seen = [], set()
|
|
for _count, idx in counts:
|
|
r, g, b = pal[idx * 3], pal[idx * 3 + 1], pal[idx * 3 + 2]
|
|
hexc = f"#{r:02x}{g:02x}{b:02x}"
|
|
if hexc in seen:
|
|
continue
|
|
seen.add(hexc)
|
|
out.append(hexc)
|
|
if len(out) >= n:
|
|
break
|
|
return out
|
|
except Exception: # noqa: BLE001 — palette is decorative; never break the pick
|
|
return []
|
|
|
|
|
|
# System prompt sent by _blurb as the "system" message. It asks for a 2-3 sentence note
# and tells the model to rely on the catalogue details (title, 'Depicts' tags, medium,
# date, artist) rather than visual specifics, because the model never sees the image.
_BLURB_SYSTEM = (
    "You are the calm, knowledgeable curator of a daily-art feature for a general audience — "
    "people who enjoy a beautiful painting but aren't art historians. In 2 to 3 warm, plain "
    "sentences, help them appreciate the piece and why it's worth a moment: its mood, the "
    "artist, the era or movement, and a little real context or significance.\n"
    "GROUNDING (important): the catalogue details below — especially the title and the "
    "'Depicts' tags — are your only reliable guide to the SUBJECT. You cannot actually see the "
    "image, so do NOT assert literal visual specifics you can't verify: do not state how many "
    "figures are shown, their exact poses or actions, colors, or background details. Lean on "
    "what's certain (title, tags, medium, date, artist, movement) and on feeling/significance. "
    "If you don't recognize the exact work, stay general and contextual rather than inventing. "
    "No preamble, no title repetition, no hype, no markdown — just the note."
)
|
|
|
|
|
|
def _blurb(client, obj: dict) -> str | None:
|
|
"""A short 'museum guide' note for the piece, grounded in the Met catalogue metadata.
|
|
Best-effort + cached by the caller; returns None on any trouble."""
|
|
tags = ", ".join(t.get("term", "") for t in (obj.get("tags") or []) if t.get("term"))[:200]
|
|
facts = "\n".join(f"{k}: {v}" for k, v in (
|
|
("Title", obj.get("title")), ("Artist", obj.get("artistDisplayName")),
|
|
("Artist bio", obj.get("artistDisplayBio")), ("Date", obj.get("objectDate")),
|
|
("Medium", obj.get("medium")), ("Type", obj.get("objectName")),
|
|
("Classification", obj.get("classification")), ("Culture", obj.get("culture")),
|
|
("Period", obj.get("period")), ("Depicts", tags),
|
|
) if v)
|
|
user = f"Catalogue details:\n{facts}\n\nWrite the note."
|
|
try:
|
|
out = client.chat_text([{"role": "system", "content": _BLURB_SYSTEM},
|
|
{"role": "user", "content": user}]) or ""
|
|
except Exception: # noqa: BLE001
|
|
return None
|
|
out = " ".join(out.replace("*", "").replace("_", " ").split()).strip()[:600] # no stray markdown
|
|
return out or None
|
|
|
|
|
|
def _candidates(conn: sqlite3.Connection, art_date: str, source: str) -> list[int]:
    """Return today's candidate object IDs in the order they should be tried.

    Takes the ``_NO_REPEAT_POOL`` unblocked pool entries for *source* that were shown
    least recently (never-shown first, ties broken by object ID), then rotates the
    list by an offset derived from a SHA-256 of *art_date*. The same date therefore
    always yields the same order, while different dates start at different entries.
    """
    cursor = conn.execute(
        "SELECT object_id FROM art_pool WHERE source=? AND blocked=0 "
        "ORDER BY shown_at IS NOT NULL, shown_at, object_id LIMIT ?",
        (source, _NO_REPEAT_POOL),
    )
    pool_ids = [row[0] for row in cursor.fetchall()]
    if pool_ids:
        digest = hashlib.sha256(art_date.encode()).hexdigest()
        offset = int(digest, 16) % len(pool_ids)
        # rotate so the daily choice is stable but varies
        pool_ids = pool_ids[offset:] + pool_ids[:offset]
    return pool_ids
|
|
|
|
|
|
def pick_daily(conn: sqlite3.Connection, art_date: str | None = None, source: str = "met",
               force: bool = False, client=None) -> dict | None:
    """Pick and cache the day's art, returning the stored ``daily_art`` row.

    Idempotent: if a row for *art_date* already exists it is returned untouched unless
    *force* is set. Otherwise up to ``_PICK_ATTEMPTS`` candidates are tried in order;
    one that cannot be fetched, is not public domain, or has no downloadable image is
    skipped, so a bad object never breaks the day.

    Args:
        conn: Open SQLite connection. Rows are converted with ``dict(row)``, so this
            presumably needs ``row_factory = sqlite3.Row`` — confirm against callers.
        art_date: Date key for the pick; defaults to ``local_today()``.
        source: Pool/source name to pick from.
        force: Re-pick even if *art_date* already has a row (the row is overwritten).
        client: Optional LLM client; when absent the blurb is stored as NULL.

    Returns:
        The stored row as a dict, or None if no candidate could be fetched (the
        caller keeps showing the prior day's piece).
    """
    art_date = art_date or local_today()
    existing = conn.execute("SELECT * FROM daily_art WHERE art_date=?", (art_date,)).fetchone()
    if existing and not force:
        return dict(existing)

    # Cap the attempts so a bad run of candidates cannot stall the cycle on timeouts.
    for oid in _candidates(conn, art_date, source)[:_PICK_ATTEMPTS]:
        try:
            obj = _object(oid)
        except Exception:  # noqa: BLE001
            continue
        # A non-dict payload is as unusable as a failed fetch; skip it too.
        if not isinstance(obj, dict) or not obj.get("isPublicDomain"):
            continue
        fname = _download_image(obj, oid)
        if not fname:
            continue

        # All network/LLM/compute is done up front; only then a brief write txn + commit.
        colors = _palette(cache_dir() / fname)
        # json.dumps([]) is the truthy string "[]", so test the list itself to store NULL.
        palette = json.dumps(colors) if colors else None
        blurb = _blurb(client, obj) if client else None

        conn.execute(
            "INSERT INTO daily_art (art_date, source, object_id, title, artist, date_text, medium, "
            "department, credit, source_url, image_file, image_url_full, is_public_domain, blurb, palette) "
            "VALUES (?,?,?,?,?,?,?,?,?,?,?,?,?,?,?) "
            # source is updated too, so a forced re-pick never leaves a stale source.
            "ON CONFLICT(art_date) DO UPDATE SET source=excluded.source, "
            "object_id=excluded.object_id, title=excluded.title, "
            "artist=excluded.artist, date_text=excluded.date_text, medium=excluded.medium, "
            "department=excluded.department, credit=excluded.credit, source_url=excluded.source_url, "
            "image_file=excluded.image_file, image_url_full=excluded.image_url_full, "
            "is_public_domain=excluded.is_public_domain, blurb=excluded.blurb, palette=excluded.palette",
            (art_date, source, oid, obj.get("title") or "Untitled",
             obj.get("artistDisplayName") or None, obj.get("objectDate") or None,
             obj.get("medium") or None, obj.get("department") or None,
             obj.get("creditLine") or None, obj.get("objectURL") or None, fname,
             obj.get("primaryImage") or None, 1 if obj.get("isPublicDomain") else 0, blurb, palette),
        )
        # Stamp the pool entry so _candidates ranks it as most recently shown.
        conn.execute("UPDATE art_pool SET shown_at=? WHERE source=? AND object_id=?",
                     (art_date, source, oid))
        conn.commit()
        return dict(conn.execute("SELECT * FROM daily_art WHERE art_date=?", (art_date,)).fetchone())

    return None  # nothing fetched today — get_today falls back to the latest piece
|
|
|
|
|
|
def get_today(conn: sqlite3.Connection, art_date: str | None = None) -> dict | None:
    """Return the row for *art_date* if one exists, else the most recent row.

    With no *art_date*, or no row for it, the newest ``daily_art`` row by date is
    returned so the room is never empty. Returns None only when the table is empty.
    """
    lookups = []
    if art_date:
        lookups.append(("SELECT * FROM daily_art WHERE art_date=?", (art_date,)))
    lookups.append(("SELECT * FROM daily_art ORDER BY art_date DESC LIMIT 1", ()))

    # Try the exact date first (when given), then fall back to the latest piece.
    for sql, params in lookups:
        hit = conn.execute(sql, params).fetchone()
        if hit:
            return dict(hit)
    return None
|
|
|
|
|
|
def run_daily(conn: sqlite3.Connection, source: str = "met", client=None) -> dict:
    """Cycle entry point: make sure the pool exists, then make sure today has a piece.

    Harvests only when the pool for *source* is empty, then delegates to
    ``pick_daily``. Returns a summary dict with ``pool`` (pool size afterwards),
    ``harvested`` (the harvest summary, or None if no harvest ran) and
    ``picked_object`` (the picked object ID, or None if nothing was picked).
    """
    def _pool_size() -> int:
        row = conn.execute("SELECT COUNT(*) FROM art_pool WHERE source=?", (source,)).fetchone()
        return row[0]

    harvested = harvest_pool(conn, source=source) if _pool_size() == 0 else None
    picked = pick_daily(conn, source=source, client=client)

    summary = {"pool": _pool_size(), "harvested": harvested}
    summary["picked_object"] = picked.get("object_id") if picked else None
    return summary
|