Image quality gate: reject too-small images (no more blurry thumbnails)
Some cards showed blurry photos — feed RSS thumbnails (~90×90, e.g. Phys.org's /tmb/ path) that load fine but upscale to mush in the banner. Add a header-based dimension parser (PNG/GIF/JPEG/WebP, stdlib only) and fold a minimum-size gate (450×250) into the image validation, alongside the existing load check. Images we can't measure (SVG/AVIF) still pass on content-type. A re-prune clears the small ones already stored so those cards fall back to the clean placeholder. Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
This commit is contained in:
+58
-8
@@ -20,6 +20,7 @@ import ipaddress
|
||||
import re
|
||||
import socket
|
||||
import sqlite3
|
||||
import struct
|
||||
import urllib.error
|
||||
import urllib.request
|
||||
from urllib.parse import urljoin, urlsplit
|
||||
@@ -30,6 +31,10 @@ USER_AGENT = "goodNews/0.1 (+local constructive news prototype)"
|
||||
TIMEOUT = 6
|
||||
MAX_BYTES = 300_000
|
||||
MAX_REDIRECTS = 3
|
||||
# Below this, a feed thumbnail upscales to mush in the card banner. Real share
|
||||
# images (og:image) are ~1200×630; tiny RSS thumbnails (~90px) are what we reject.
|
||||
MIN_IMG_WIDTH = 450
|
||||
MIN_IMG_HEIGHT = 250
|
||||
|
||||
_META_RE = re.compile(rb"<meta\b[^>]*>", re.IGNORECASE)
|
||||
_HEAD_END_RE = re.compile(rb"</head>", re.IGNORECASE)
|
||||
@@ -169,14 +174,53 @@ def fetch_og_image(url: str | None) -> str | None:
|
||||
return None # too many redirects
|
||||
|
||||
|
||||
def _image_loads(url: str) -> bool:
|
||||
"""Confirm an image URL truly returns an image (HTTP 200 + image/* type).
|
||||
def _image_dimensions(data: bytes) -> "tuple[int, int] | None":
    """Best-effort (width, height) from an image file's header bytes.

    Understands PNG, GIF, JPEG and WebP (lossy ``VP8 ``, lossless ``VP8L`` and
    extended ``VP8X``). Only the leading bytes of the file are needed, so a
    truncated download is fine as long as the header made it through.

    Args:
        data: The first bytes of the image file.

    Returns:
        ``(width, height)`` in pixels, or ``None`` when the format is not
        recognised, cannot be cheaply measured (e.g. SVG), or the header is
        truncated before the dimension fields.
    """
    if len(data) < 10:
        return None

    # PNG: 8-byte signature, then the IHDR chunk (4-byte length, 4-byte type,
    # big-endian 32-bit width and height).
    if len(data) >= 24 and data[:8] == b"\x89PNG\r\n\x1a\n" and data[12:16] == b"IHDR":
        width, height = struct.unpack(">II", data[16:24])
        return (width, height)

    # GIF: logical screen width/height follow the 6-byte signature,
    # little-endian 16-bit each.
    if data[:6] in (b"GIF87a", b"GIF89a"):
        width, height = struct.unpack("<HH", data[6:10])
        return (width, height)

    if data[:2] == b"\xff\xd8":  # JPEG: scan for a Start-Of-Frame marker
        sof_markers = (
            0xC0, 0xC1, 0xC2, 0xC3, 0xC5, 0xC6, 0xC7,
            0xC9, 0xCA, 0xCB, 0xCD, 0xCE, 0xCF,
        )
        i, n = 2, len(data)
        # A SOF needs 9 bytes from its 0xFF: marker(2) + length(2) +
        # precision(1) + height(2) + width(2).
        while i + 9 <= n:
            if data[i] != 0xFF:
                i += 1
                continue
            marker = data[i + 1]
            if marker == 0xFF:
                # Fill byte: any marker may be preceded by extra 0xFF bytes.
                # Step over it instead of misreading it as a segment.
                i += 1
                continue
            if marker in sof_markers:
                height, width = struct.unpack(">HH", data[i + 5:i + 9])
                return (width, height)
            # Standalone markers carry no length field: TEM, SOI, EOI, RST0-7.
            if marker in (0x01, 0xD8, 0xD9) or 0xD0 <= marker <= 0xD7:
                i += 2
                continue
            # Every other segment: skip the marker plus its declared length.
            i += 2 + struct.unpack(">H", data[i + 2:i + 4])[0]
        return None

    if data[:4] == b"RIFF" and data[8:12] == b"WEBP":
        fmt = data[12:16]
        try:
            if fmt == b"VP8 ":
                # Lossy: 3-byte frame tag, 3-byte start code, then 14-bit
                # width/height (the top 2 bits of each are a scale factor).
                if data[23:26] != b"\x9d\x01\x2a":
                    return None  # not a keyframe header; don't trust the bytes
                raw_w, raw_h = struct.unpack("<HH", data[26:30])
                return (raw_w & 0x3FFF, raw_h & 0x3FFF)
            if fmt == b"VP8L":
                # Lossless: signature byte 0x2F, then 14 bits of width-1 and
                # 14 bits of height-1 packed little-endian.
                if data[20] != 0x2F:
                    return None
                bits = struct.unpack("<I", data[21:25])[0]
                return ((bits & 0x3FFF) + 1, ((bits >> 14) & 0x3FFF) + 1)
            if fmt == b"VP8X":
                # Extended: 24-bit canvas width-1 and height-1.
                return ((data[24] | data[25] << 8 | data[26] << 16) + 1,
                        (data[27] | data[28] << 8 | data[29] << 16) + 1)
        except (struct.error, IndexError):
            return None  # header truncated before the dimension fields
    return None
|
||||
|
||||
Many publishers serve a signed or hotlink-protected og:image that 401/403s
|
||||
on a direct request (e.g. the Guardian's i.guim.co.uk), so storing the URL
|
||||
would overstate coverage and the card would never render it. We request as
|
||||
the browser does — no referrer — with the same per-hop host safety as the
|
||||
page fetch. Returns False on any error.
|
||||
|
||||
def _image_loads(url: str) -> bool:
|
||||
"""Confirm an image URL returns a real, big-enough image (HTTP 200 + image/*
|
||||
+ dimensions ≥ the minimum).
|
||||
|
||||
Two failure modes this guards against: signed/hotlink-protected URLs that
|
||||
401/403 on a direct load (e.g. the Guardian's i.guim.co.uk), and tiny feed
|
||||
thumbnails (~90px) that upscale to mush in the card banner. We request as the
|
||||
browser does — no referrer — with the same per-hop host safety as the page
|
||||
fetch. Images we can't measure (SVG/AVIF) pass on content-type alone.
|
||||
"""
|
||||
opener = urllib.request.build_opener(_NoRedirect)
|
||||
for _ in range(MAX_REDIRECTS + 1):
|
||||
@@ -199,9 +243,15 @@ def _image_loads(url: str) -> bool:
|
||||
url = urljoin(url, location)
|
||||
continue
|
||||
ctype = (response.headers.get("Content-Type") or "").lower()
|
||||
return status == 200 and ctype.startswith("image/")
|
||||
if status != 200 or not ctype.startswith("image/"):
|
||||
return False
|
||||
head = response.read(200_000)
|
||||
finally:
|
||||
response.close()
|
||||
dims = _image_dimensions(head)
|
||||
if dims and (dims[0] < MIN_IMG_WIDTH or dims[1] < MIN_IMG_HEIGHT):
|
||||
return False # too small — would upscale to mush
|
||||
return True
|
||||
return False
|
||||
|
||||
|
||||
|
||||
@@ -68,3 +68,13 @@ def test_backfill_only_targets_summarized_accepted_imageless(tmp_path):
|
||||
n = enrich.enrich_summarized_images(c, fetch=lambda url: "http://og.jpg", limit=50)
|
||||
assert n == 1
|
||||
assert c.execute("SELECT image_url FROM articles WHERE id=2").fetchone()[0] is None
|
||||
|
||||
|
||||
def test_image_dimensions_parses_headers():
    """PNG and GIF headers yield (width, height); unrecognised bytes yield None."""
    import struct
    from goodnews import enrich

    measure = enrich._image_dimensions

    # Minimal PNG: signature, IHDR chunk length, chunk type, then the
    # big-endian width and height.
    png_header = b"".join((
        b"\x89PNG\r\n\x1a\n",
        b"\x00\x00\x00\x0d",
        b"IHDR",
        struct.pack(">II", 1200, 630),
    ))
    assert measure(png_header) == (1200, 630)

    # Minimal GIF: signature followed by the little-endian screen size.
    gif_header = b"GIF89a" + struct.pack("<HH", 90, 90)
    assert measure(gif_header) == (90, 90)

    # Bytes matching no known signature cannot be measured.
    assert measure(b"not an image at all") is None
|
||||
|
||||
Reference in New Issue
Block a user