Files
thejayman77 89c0fbe1f6 Sync repo to deployed state: SEO recovery, Publishing Desk, Play games, emoji picker
The deploy pipeline runs from the working tree, so a wave of shipped features
had never been committed. This snapshots git to what's actually running.

SEO impression recovery (live + verified):
- Duplicate /a/{id} pages now 301-redirect to their canonical twin instead of
  returning 404 (a hard 404 silently dropped already-indexed URLs and tanked
  impressions).
- Dedup representative selection reworked: accepted/serveable -> established
  rep (URL stability) -> quality score, so an accepted page never retires to a
  rejected rep and an indexed canonical doesn't churn when a newer twin arrives.
- HEAD /a/{id} returns the same status as GET (api_route GET+HEAD) instead of
  falling through to the static mount and 404ing.
- `dedup --force-recluster`: cycle-locked, model-free re-cluster to re-apply the
  policy to the existing corpus (shared cycle_lock context manager).
- CLI honors GOODNEWS_DB for its default --db (was silently ignored).

Publishing Desk (admin tool to post highlights to X via Web Intents):
- publishing.py queue/rank/handle-resolution; admin UI; full searchable emoji
  picker (bundled data, no CDN) for the blurb editor.

Play games + site:
- Bloom (word-wheel), Memory Match, daily ritual set, Zen Den (dev-gated).
- English-only language gate; source prospecting; paywall + dedup hardening.

Tests: full suite green (349). Ignores tightened (node_modules, data/*.db).

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
2026-06-18 11:32:27 -04:00

108 lines
4.3 KiB
Python

"""Domain-level paywall hints.
We never fetch article pages, so a paywall can only be inferred from the host.
This is a curated, conservative list of hard/soft paywalls — enough to label a
card "subscription may be required" and to prefer readable stories for the lead
and for replacements. It will never be perfect; it's an honest hint, not a gate.
"""
from __future__ import annotations
import re
from urllib.parse import urlsplit
# Curated set of paywalled domains, kept in alphabetical order. Entries are host
# suffixes: a subdomain counts as its parent (news.nature.com → nature.com).
PAYWALL_DOMAINS = {
    "bloomberg.com",
    "economist.com",
    "foreignpolicy.com",
    "ft.com",
    "harpers.org",
    "hbr.org",
    "nationalgeographic.com",
    "nature.com",
    "newscientist.com",
    "newyorker.com",
    "nytimes.com",
    "scientificamerican.com",
    "technologyreview.com",
    "telegraph.co.uk",
    "theatlantic.com",
    "thetimes.co.uk",
    "washingtonpost.com",
    "wired.com",
    "wsj.com",
}
def is_paywalled(url: str | None) -> bool:
    """Return True when the URL's host is on the curated paywall domain list.

    Low-level DOMAIN rule. Keep this distinct from the source-aware decision so
    callers can tell 'domain says paywalled' from 'this source is overridden'.

    Args:
        url: Absolute article URL, or None/empty when unknown.

    Returns:
        True if the host equals an entry of ``PAYWALL_DOMAINS`` or is a
        subdomain of one; False otherwise, including for missing, host-less or
        malformed URLs (this is a hint, so it never raises on bad input).
    """
    try:
        # ``hostname`` (unlike ``netloc``) excludes any ``user:pass@`` prefix and
        # ``:port`` suffix and is already lower-cased, so
        # "https://www.nytimes.com:443/x" is still recognised.
        host = urlsplit(url or "").hostname or ""
    except ValueError:
        # urlsplit rejects a malformed authority (e.g. an unbalanced IPv6
        # bracket); there is no usable host to judge.
        return False
    # A fully-qualified host may carry a trailing root dot ("nytimes.com.").
    host = host.rstrip(".")
    if host.startswith("www."):
        host = host[4:]
    if not host:
        return False
    return any(host == d or host.endswith("." + d) for d in PAYWALL_DOMAINS)
def is_paywalled_for_source(url: str | None, override: str | None = None) -> bool:
    """Return the EFFECTIVE paywall decision used for ranking/lead/badges.

    A per-source override (set in admin after inspecting the articles) takes
    precedence over the domain rule:

    * ``'free'`` clears a false positive (e.g. NY Times Learning).
    * ``'paywalled'`` flags a false negative.
    * Anything else (including None) falls back to the domain rule for ``url``.
    """
    if override in ("free", "paywalled"):
        # An explicit admin verdict wins; the URL is not consulted at all.
        return override == "paywalled"
    return is_paywalled(url)
# --- Content-level accessibility (deep-preview only; the live pipeline still never
# fetches article pages) -----------------------------------------------------

# Phrases shown when a page is rendered in its walled state. They are kept
# specific on purpose, so a footer "subscribe to our newsletter" is not mistaken
# for a paywall. All lower-case: they are compared against lower-cased HTML.
_WALL_MARKERS = (
    "subscribe to continue",
    "subscribe to keep reading",
    "subscribe to read",
    "to continue reading",
    "already a subscriber",
    "subscribers only",
    "this article is for subscribers",
    "this content is for subscribers",
    "create a free account to continue",
    "create an account to keep reading",
    "unlock this article",
    "register to continue reading",
)

# schema.org "isAccessibleForFree" key with a false / true value, quoted or bare.
_ACCESS_FALSE = re.compile(
    r'"isaccessibleforfree"\s*:\s*("?)(false)\1', re.IGNORECASE
)
_ACCESS_TRUE = re.compile(
    r'"isaccessibleforfree"\s*:\s*("?)(true)\1', re.IGNORECASE
)

# A content-tier / content_tier attribute followed, inside the same tag, by
# content="locked".
_CONTENT_LOCKED = re.compile(
    r'content[_-]tier"[^>]*content="locked', re.IGNORECASE
)

# Whole non-visible elements (script/style/noscript/template) with their content.
_STRIP_BLOCKS = re.compile(
    r"(?is)<(script|style|noscript|template)[^>]*>.*?</\1>"
)

# Any single remaining tag.
_STRIP_TAGS = re.compile(r"(?s)<[^>]+>")

# A run of whitespace, used to collapse it to a single space.
_WS = re.compile(r"\s+")
def check_article_access(url: str, fetcher, timeout: int = 8) -> str:
    """Classify how readable ONE article URL is, for the deep-preview sample.

    Args:
        url: Article URL to probe.
        fetcher: Callable invoked as ``fetcher(url, timeout=timeout)``; its
            result is decoded as UTF-8 (undecodable bytes are dropped).
        timeout: Passed through to ``fetcher`` unchanged.

    Returns:
        One of:
        * ``'blocked'``   — the fetch raised.
        * ``'paywalled'`` — an explicit signal was found: schema.org
          isAccessibleForFree=false, content-tier=locked, or a wall phrase.
        * ``'readable'``  — no wall signal and enough visible text (1500+
          characters, or 600+ when the page declares isAccessibleForFree=true).
        * ``'unknown'``   — the payload could not be decoded, or the page is
          too thin/ambiguous to call.

    Heuristic by nature: the result informs the verdict, it never auto-rejects
    (domain rules already proved they can lie).
    """
    try:
        payload = fetcher(url, timeout=timeout)
    except Exception:  # noqa: BLE001 — any fetch failure = can't read it right now
        return "blocked"

    try:
        markup = payload.decode("utf-8", "ignore")
    except Exception:  # noqa: BLE001
        return "unknown"

    # Explicit machine-readable lock signals come first.
    for signal in (_ACCESS_FALSE, _CONTENT_LOCKED):
        if signal.search(markup):
            return "paywalled"

    # Then human-readable wall phrases, compared case-insensitively.
    lowered = markup.lower()
    for phrase in _WALL_MARKERS:
        if phrase in lowered:
            return "paywalled"

    # No wall signal — judge by how much real article text is present: drop
    # non-visible elements, then tags, then collapse whitespace.
    without_blocks = _STRIP_BLOCKS.sub(" ", markup)
    without_tags = _STRIP_TAGS.sub(" ", without_blocks)
    visible = _WS.sub(" ", without_tags).strip()

    # A page that declares itself free needs less text to be believed.
    needed = 600 if _ACCESS_TRUE.search(markup) else 1500
    return "readable" if len(visible) >= needed else "unknown"