Files
thejayman77 a55ba185a8 images: harden the cache per Codex audit (SSRF-safe, cache-only endpoint, WebP-only)
Blocker fixes for the image cache:
- /api/img/{id} now serves cache HITS ONLY and is restricted to ACCEPTED, CANONICAL
  articles. It never fetches — the cycle (newsimg.warm) owns all fetching — so the
  public endpoint has no SSRF/worker-exhaustion surface. Dropped 1-year immutable
  caching (image_url can change) → public, max-age=86400.
- newsimg._safe_fetch: SSRF-safe (reuses enrich._host_is_public + _NoRedirect, http(s)
  only, every redirect hop re-validated, body capped). _FetchError distinguishes
  permanent refusals (negative-cached via a .fail marker) from transient errors (retry).
- _encode re-encodes only decoded RASTER images to WebP and REJECTS everything else
  (SVG, undecodable, decompression bombs via MAX_IMAGE_PIXELS, pathological dimensions);
  originals are never retained. prune() also sweeps stale .fail markers.
- Concurrency: fetching only runs inside the cycle lock; writes stay atomic.

Smaller fixes:
- share.py visible image has onerror→this.remove() (degrade to the text unfurl, no
  broken icon when an image isn't cached yet).
- share-page Back follows history only on a SAME-ORIGIN referrer (never bounce to an
  external site); menu now honors Escape + resets crossing back to desktop (HubBar parity).

Tests: private host, redirect-to-private, hostile SVG/non-image, transient-vs-permanent
failure, LRU prune, warm (accepted+canonical only, idempotent), cache-only endpoint
(404 on not-cached/unaccepted/duplicate, never fetches), share chrome parity. 441 pass.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
2026-06-30 12:19:57 -04:00

120 lines
5.7 KiB
Python

import pytest
from fastapi.testclient import TestClient
@pytest.fixture
def client(tmp_path, monkeypatch):
db = tmp_path / "t.sqlite3"
monkeypatch.setenv("GOODNEWS_DB", str(db))
monkeypatch.setenv("GOODNEWS_PUBLIC_BASE_URL", "https://upbeatbytes.com")
import importlib
import goodnews.api as api
importlib.reload(api)
from goodnews.db import connect, init_db
c = connect(str(db)); init_db(c)
c.execute("INSERT INTO sources (id,name,feed_url,trust_score) VALUES (1,'BBC','http://s/f',5)")
# 1: accepted (renders), 2: rejected, 3: duplicate
c.execute("INSERT INTO articles (id,source_id,canonical_url,title,url_hash,image_url) "
"VALUES (1,1,'https://bbc.com/x','Water voles return','h1','https://img/v.jpg')")
c.execute("INSERT INTO articles (id,source_id,canonical_url,title,url_hash) VALUES (2,1,'https://bbc.com/y','Meh','h2')")
c.execute("INSERT INTO articles (id,source_id,canonical_url,title,url_hash,duplicate_of) VALUES (3,1,'https://bbc.com/z','Dup','h3',1)")
c.execute("INSERT INTO article_scores (article_id,accepted,reason_text) VALUES (1,1,'A hopeful restoration story.')")
c.execute("INSERT INTO article_scores (article_id,accepted) VALUES (2,0)")
c.execute("INSERT INTO article_scores (article_id,accepted) VALUES (3,1)")
c.commit(); c.close()
return api.create_app()
def test_share_page_renders(client):
r = TestClient(client).get("/a/1")
assert r.status_code == 200
html = r.text
assert "Water voles return" in html
assert 'property="og:title"' in html and 'name="twitter:card"' in html
assert 'rel="canonical" href="https://upbeatbytes.com/a/1"' in html
assert 'https://bbc.com/x' in html # source link present
assert "A hopeful restoration story." in html
assert 'content="https://img/v.jpg"' in html # og image
def test_share_page_missing_and_malformed(client):
tc = TestClient(client)
assert tc.get("/a/999").status_code == 404 # unknown
assert tc.get("/a/not-a-number").status_code == 404 # malformed → calm 404
assert tc.get("/a/2").status_code == 404 # rejected article
def test_share_page_duplicate_redirects_to_canonical(client):
# article 3 is a duplicate of the live article 1 — its URL may be indexed, so it
# 301s to the canonical (consolidates) rather than 404ing and dropping from Google.
r = TestClient(client).get("/a/3", follow_redirects=False)
assert r.status_code == 301 and r.headers["location"] == "/a/1"
def test_share_page_creates_visitor_token_for_beacons(client):
# /a/ is server-rendered outside the SPA, so its inline analytics must CREATE the
# visitor token when absent (mirroring visitorId) — else a first-time /a/ landing
# beacons an empty token and is dropped from visitor stats.
html = TestClient(client).get("/a/1").text
assert "localStorage.setItem('goodnews:visitor'" in html
assert "crypto.randomUUID" in html
assert "kind:'visit'" in html # daily visit beacon present
def test_share_page_no_image_uses_summary_card(client, tmp_path, monkeypatch):
# article 1 has an image → large card
assert 'summary_large_image' in TestClient(client).get("/a/1").text
def test_incomplete_page_is_not_cached(client):
# article 1 has no summary/explanation → "generating" page must not be cached,
# and carries no-cache so it re-fetches once the summary lands.
import goodnews.api as api
r = TestClient(client).get("/a/1")
assert r.status_code == 200
assert r.headers.get("cache-control") == "no-cache"
assert 1 not in api._SHARE_CACHE
@pytest.fixture
def app_complete(tmp_path, monkeypatch):
db = tmp_path / "t.sqlite3"
monkeypatch.setenv("GOODNEWS_DB", str(db))
monkeypatch.setenv("GOODNEWS_PUBLIC_BASE_URL", "https://upbeatbytes.com")
import importlib
import goodnews.api as api
importlib.reload(api)
from goodnews.db import connect, init_db
c = connect(str(db)); init_db(c)
c.execute("INSERT INTO sources (id,name,feed_url,trust_score) VALUES (1,'BBC','http://s/f',5)")
c.execute("INSERT INTO articles (id,source_id,canonical_url,title,url_hash,image_url) "
"VALUES (1,1,'https://bbc.com/x','Water voles return','h1','https://img/v.jpg')")
c.execute("INSERT INTO article_scores (article_id,accepted,reason_text) VALUES (1,1,'Hopeful.')")
c.execute("INSERT INTO article_summaries (article_id,summary,what_happened,why_matters,why_belongs) "
"VALUES (1,'Voles are back.','They returned to the river.','Biodiversity rebound.','Quietly hopeful.')")
c.commit(); c.close()
return api
def test_complete_page_is_cached_and_served_from_cache(app_complete):
api = app_complete
tc = TestClient(api.create_app())
r1 = tc.get("/a/1")
assert r1.status_code == 200
assert r1.headers.get("cache-control") == "public, max-age=300"
assert 1 in api._SHARE_CACHE # finished page cached
r2 = tc.get("/a/1") # second hit served from cache
assert r2.status_code == 200 and r2.text == r1.text
def test_share_chrome_parity_and_fallbacks():
"""HubBar-replica parity Codex asked for: Back only follows a SAME-ORIGIN referrer,
the menu honors Escape, and a failed image removes itself (clean text unfurl)."""
from goodnews import share
html = share.render_share_page(
{"id": 7, "title": "T", "image_url": "https://x/p.jpg", "canonical_url": "https://s/x",
"source_name": "S"}, "https://upbeatbytes.com")
assert "origin===location.origin" in html # Back requires same-origin referrer
assert "'Escape'" in html or "Escape" in html # menu closes on Escape (parity)
assert 'onerror="this.remove()"' in html # broken image degrades to text unfurl