Image quality gate: reject too-small images (no more blurry thumbnails)

Some cards showed blurry photos — feed RSS thumbnails (~90×90, e.g. Phys.org's
/tmb/ path) that load fine but upscale to mush in the banner. Add a header-based
dimension parser (PNG/GIF/JPEG/WebP, stdlib only) and fold a minimum-size gate
(450×250) into the image validation, alongside the existing load check. Images
we can't measure (SVG/AVIF) still pass on content-type. A re-prune clears the
small ones already stored so those cards fall back to the clean placeholder.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
This commit is contained in:
jay
2026-06-07 15:40:04 -04:00
parent 8b44e559e4
commit b134c2dab6
2 changed files with 68 additions and 8 deletions
+58 -8
View File
@@ -20,6 +20,7 @@ import ipaddress
import re
import socket
import sqlite3
import struct
import urllib.error
import urllib.request
from urllib.parse import urljoin, urlsplit
@@ -30,6 +31,10 @@ USER_AGENT = "goodNews/0.1 (+local constructive news prototype)"
# Network limits for the fetches in this module.
# NOTE(review): TIMEOUT is presumably in seconds and MAX_BYTES presumably caps
# how much of a page is read — neither use is visible here; confirm at the call sites.
TIMEOUT = 6
MAX_BYTES = 300_000
# _image_loads tries at most MAX_REDIRECTS + 1 requests while following redirects.
MAX_REDIRECTS = 3
# Below this, a feed thumbnail upscales to mush in the card banner. Real share
# images (og:image) are ~1200×630; tiny RSS thumbnails (~90px) are what we reject.
MIN_IMG_WIDTH = 450
MIN_IMG_HEIGHT = 250
# Byte-level, case-insensitive patterns: a single <meta ...> tag, and the
# closing </head> tag.
_META_RE = re.compile(rb"<meta\b[^>]*>", re.IGNORECASE)
_HEAD_END_RE = re.compile(rb"</head>", re.IGNORECASE)
@@ -169,14 +174,53 @@ def fetch_og_image(url: str | None) -> str | None:
return None # too many redirects
def _image_loads(url: str) -> bool:
"""Confirm an image URL truly returns an image (HTTP 200 + image/* type).
def _image_dimensions(data: bytes) -> "tuple[int, int] | None":
"""Best-effort (width, height) from an image file's header bytes — PNG, GIF,
JPEG, WebP. Returns None for formats we can't cheaply measure (e.g. SVG)."""
if len(data) < 10:
return None
if len(data) >= 24 and data[:8] == b"\x89PNG\r\n\x1a\n" and data[12:16] == b"IHDR":
return struct.unpack(">II", data[16:24])
if data[:6] in (b"GIF87a", b"GIF89a"):
return struct.unpack("<HH", data[6:10])
if data[:2] == b"\xff\xd8": # JPEG: scan for a Start-Of-Frame marker
i, n = 2, len(data)
while i < n - 9:
if data[i] != 0xFF:
i += 1
continue
marker = data[i + 1]
if marker in (0xC0, 0xC1, 0xC2, 0xC3, 0xC5, 0xC6, 0xC7, 0xC9, 0xCA, 0xCB, 0xCD, 0xCE, 0xCF):
h, w = struct.unpack(">HH", data[i + 5:i + 9])
return (w, h)
if marker == 0xD8 or marker == 0xD9 or 0xD0 <= marker <= 0xD7:
i += 2
continue
i += 2 + struct.unpack(">H", data[i + 2:i + 4])[0]
return None
if data[:4] == b"RIFF" and data[8:12] == b"WEBP":
fmt = data[12:16]
try:
if fmt == b"VP8 ":
return (struct.unpack("<H", data[26:28])[0] & 0x3FFF,
struct.unpack("<H", data[28:30])[0] & 0x3FFF)
if fmt == b"VP8X":
return ((data[24] | data[25] << 8 | data[26] << 16) + 1,
(data[27] | data[28] << 8 | data[29] << 16) + 1)
except (struct.error, IndexError):
return None
return None
Many publishers serve a signed or hotlink-protected og:image that 401/403s
on a direct request (e.g. the Guardian's i.guim.co.uk), so storing the URL
would overstate coverage and the card would never render it. We request as
the browser does — no referrer — with the same per-hop host safety as the
page fetch. Returns False on any error.
def _image_loads(url: str) -> bool:
"""Confirm an image URL returns a real, big-enough image (HTTP 200 + image/*
+ dimensions ≥ the minimum).
Two failure modes this guards against: signed/hotlink-protected URLs that
401/403 on a direct load (e.g. the Guardian's i.guim.co.uk), and tiny feed
thumbnails (~90px) that upscale to mush in the card banner. We request as the
browser does — no referrer — with the same per-hop host safety as the page
fetch. Images we can't measure (SVG/AVIF) pass on content-type alone.
"""
opener = urllib.request.build_opener(_NoRedirect)
for _ in range(MAX_REDIRECTS + 1):
@@ -199,9 +243,15 @@ def _image_loads(url: str) -> bool:
url = urljoin(url, location)
continue
ctype = (response.headers.get("Content-Type") or "").lower()
return status == 200 and ctype.startswith("image/")
if status != 200 or not ctype.startswith("image/"):
return False
head = response.read(200_000)
finally:
response.close()
dims = _image_dimensions(head)
if dims and (dims[0] < MIN_IMG_WIDTH or dims[1] < MIN_IMG_HEIGHT):
return False # too small — would upscale to mush
return True
return False
+10
View File
@@ -68,3 +68,13 @@ def test_backfill_only_targets_summarized_accepted_imageless(tmp_path):
n = enrich.enrich_summarized_images(c, fetch=lambda url: "http://og.jpg", limit=50)
assert n == 1
assert c.execute("SELECT image_url FROM articles WHERE id=2").fetchone()[0] is None
def test_image_dimensions_parses_headers():
    """PNG and GIF headers yield (width, height); unrecognised bytes yield None."""
    import struct
    from goodnews import enrich

    measure = enrich._image_dimensions

    # PNG: signature, IHDR chunk length (13) and type, then big-endian size.
    png_header = b"".join((
        b"\x89PNG\r\n\x1a\n",
        b"\x00\x00\x00\x0d",
        b"IHDR",
        struct.pack(">II", 1200, 630),
    ))
    # GIF: version tag followed by the little-endian logical screen size.
    gif_header = b"GIF89a" + struct.pack("<HH", 90, 90)

    assert measure(png_header) == (1200, 630)
    assert measure(gif_header) == (90, 90)
    assert measure(b"not an image at all") is None