89c0fbe1f6
The deploy pipeline runs from the working tree, so a wave of shipped features
had never been committed. This snapshots git to what's actually running.
SEO impression recovery (live + verified):
- Duplicate /a/{id} pages now 301-redirect to their canonical twin instead of returning a 404
(a hard 404 silently dropped already-indexed URLs and tanked impressions).
- Dedup representative selection reworked: accepted/serveable -> established
rep (URL stability) -> quality score, so an accepted page never retires to a
rejected rep and an indexed canonical doesn't churn when a newer twin arrives.
- HEAD /a/{id} returns the same status as GET (api_route GET+HEAD) instead of
falling through to the static mount and 404ing.
- `dedup --force-recluster`: cycle-locked, model-free re-cluster to re-apply the
policy to the existing corpus (shared cycle_lock context manager).
- CLI honors GOODNEWS_DB for its default --db (was silently ignored).
Publishing Desk (admin tool to post highlights to X via Web Intents):
- publishing.py queue/rank/handle-resolution; admin UI; full searchable emoji
picker (bundled data, no CDN) for the blurb editor.
Play games + site:
- Bloom (word-wheel), Memory Match, daily ritual set, Zen Den (dev-gated).
- English-only language gate; source prospecting; paywall + dedup hardening.
Tests: full suite green (349). Ignores tightened (node_modules, data/*.db).
Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
108 lines
4.3 KiB
Python
108 lines
4.3 KiB
Python
"""Domain-level paywall hints.
|
|
|
|
We never fetch article pages, so a paywall can only be inferred from the host.
|
|
This is a curated, conservative list of hard/soft paywalls — enough to label a
|
|
card "subscription may be required" and to prefer readable stories for the lead
|
|
and for replacements. It will never be perfect; it's an honest hint, not a gate.
|
|
"""
|
|
|
|
from __future__ import annotations
|
|
|
|
import re
|
|
from urllib.parse import urlsplit
|
|
|
|
# Curated host suffixes that are treated as paywalled. Matching is done by
# suffix, so subdomains are covered too (news.nature.com → nature.com).
# Kept in alphabetical order to make additions and reviews easy to scan.
PAYWALL_DOMAINS = {
    "bloomberg.com",
    "economist.com",
    "foreignpolicy.com",
    "ft.com",
    "harpers.org",
    "hbr.org",
    "nationalgeographic.com",
    "nature.com",
    "newscientist.com",
    "newyorker.com",
    "nytimes.com",
    "scientificamerican.com",
    "technologyreview.com",
    "telegraph.co.uk",
    "theatlantic.com",
    "thetimes.co.uk",
    "washingtonpost.com",
    "wired.com",
    "wsj.com",
}
|
|
|
|
|
|
def is_paywalled(url: str | None) -> bool:
    """Low-level DOMAIN rule: is the URL's host on the curated paywall list?

    Keep this distinct from the source-aware decision so callers can tell
    'domain says paywalled' from 'this source is overridden'.

    Args:
        url: Article URL. ``None``, empty, host-less (relative) and malformed
            URLs have no host to judge and are reported as not paywalled.

    Returns:
        True when the host equals an entry of ``PAYWALL_DOMAINS`` or is a
        subdomain of one; False otherwise.
    """
    try:
        # Use .hostname rather than .netloc: it drops any "user:pass@" prefix
        # and ":port" suffix and is already lower-cased, so a URL such as
        # https://www.nytimes.com:443/... still matches its domain.
        host = urlsplit(url or "").hostname or ""
    except ValueError:
        # urlsplit rejects some malformed netlocs (e.g. an unbalanced "[").
        # This is only a hint, so an unparseable URL is simply "not paywalled".
        return False
    # A fully-qualified host may carry a trailing dot ("nytimes.com.").
    host = host.rstrip(".")
    if not host:
        return False
    # No special-casing of a leading "www." is needed: the subdomain suffix
    # test below already covers it.
    return any(host == d or host.endswith("." + d) for d in PAYWALL_DOMAINS)
|
|
|
|
|
|
def is_paywalled_for_source(url: str | None, override: str | None = None) -> bool:
    """Return the EFFECTIVE paywall decision used for ranking/lead/badges.

    A per-source override (set in admin after inspecting the articles) takes
    precedence over the domain rule:

    * ``'free'`` clears a false positive (e.g. NY Times Learning);
    * ``'paywalled'`` flags a false negative;
    * anything else (including ``None`` / NULL) defers to ``is_paywalled(url)``.
    """
    if override in ("free", "paywalled"):
        # An explicit override decides the outcome without consulting the URL.
        return override == "paywalled"
    return is_paywalled(url)
|
|
|
|
|
|
# --- Content-level accessibility (deep-preview only; the live pipeline still never
|
|
# fetches article pages) -----------------------------------------------------
|
|
|
|
# Wall phrases that appear in the rendered, walled state. Kept specific so a footer
# "subscribe to our newsletter" doesn't read as a paywall.
# All entries are lower-case because they are compared against lower-cased HTML.
_WALL_MARKERS = (
    "subscribe to continue", "subscribe to keep reading", "subscribe to read",
    "to continue reading", "already a subscriber", "subscribers only",
    "this article is for subscribers", "this content is for subscribers",
    "create a free account to continue", "create an account to keep reading",
    "unlock this article", "register to continue reading",
)
# "isAccessibleForFree": false (any letter case; the value may be bare or
# double-quoted — the back-reference requires the quotes to be balanced).
_ACCESS_FALSE = re.compile(r'"isaccessibleforfree"\s*:\s*("?)(false)\1', re.I)
# Same key with the value true.
_ACCESS_TRUE = re.compile(r'"isaccessibleforfree"\s*:\s*("?)(true)\1', re.I)
# An attribute value ending in content-tier / content_tier followed, inside the
# same tag (no ">" in between), by content="locked...
_CONTENT_LOCKED = re.compile(r'content[_-]tier"[^>]*content="locked', re.I)
# Whole <script>/<style>/<noscript>/<template> elements, including their contents
# (case-insensitive; "." also matches newlines).
_STRIP_BLOCKS = re.compile(r"(?is)<(script|style|noscript|template)[^>]*>.*?</\1>")
# Any single remaining tag, which may span several lines.
_STRIP_TAGS = re.compile(r"(?s)<[^>]+>")
# Runs of whitespace, collapsed to a single space by the caller.
_WS = re.compile(r"\s+")
|
|
|
|
|
|
def check_article_access(url: str, fetcher, timeout: int = 8) -> str:
    """Best-effort readability verdict for ONE article URL (deep-preview sample).

    Conservative and evidence-led; the result informs a verdict and never
    auto-rejects (domain rules already proved they can lie).

    Args:
        url: Article URL handed to ``fetcher``.
        fetcher: Callable invoked as ``fetcher(url, timeout=timeout)``; its
            result must offer ``.decode`` (i.e. the raw page bytes).
        timeout: Passed through to ``fetcher`` unchanged.

    Returns:
        One of:

        * ``'blocked'``   — the fetch raised;
        * ``'paywalled'`` — an explicit signal was found (schema.org
          isAccessibleForFree=false, content-tier=locked, or a clear wall phrase);
        * ``'readable'``  — no wall signal and substantial body text;
        * ``'unknown'``   — undecodable, thin or ambiguous pages.
    """
    try:
        payload = fetcher(url, timeout=timeout)
    except Exception:  # noqa: BLE001 — any fetch failure = can't read it right now
        return "blocked"

    try:
        page = payload.decode("utf-8", "ignore")
    except Exception:  # noqa: BLE001 — nothing decodable means nothing to judge
        return "unknown"

    # Explicit machine-readable lock signals come first.
    explicitly_locked = (
        _ACCESS_FALSE.search(page) is not None
        or _CONTENT_LOCKED.search(page) is not None
    )
    if explicitly_locked:
        return "paywalled"

    # Then human-readable wall phrases, matched case-insensitively on the raw HTML.
    lowered = page.lower()
    for phrase in _WALL_MARKERS:
        if phrase in lowered:
            return "paywalled"

    # No wall signal — judge by how much real article text is present.
    without_blocks = _STRIP_BLOCKS.sub(" ", page)
    without_tags = _STRIP_TAGS.sub(" ", without_blocks)
    body_text = _WS.sub(" ", without_tags).strip()

    # A page that declares itself free needs less text to count as readable.
    required_length = 600 if _ACCESS_TRUE.search(page) else 1500
    return "readable" if len(body_text) >= required_length else "unknown"
|