Files
upbeatBytes/goodnews/geo.py
T
thejayman77 3486f3102a Scope dial v2: Nearby / Region / Country / World radius on the homepage
Codex-approved evolution: the reader controls the "emotional radius" of the landing.

- Census-region "Regional" grain (geo.region_of / region_states). Scope-aware tiering
  (queries.home_tiers): closest->widest lead, confidence-gated on state + region, never
  a hard filter — blends outward so the set is always full. 'world' = the global brief.
- queries.home_brief takes a scope; /api/brief gains a scope param (nearby|region|
  country|world). Country-only / non-US homes collapse to country.
- Homepage dial replaces the 2-button toggle: adaptive stops (4 with a US state, else
  Country/World), persisted scope, "Good news closest first" framing. Concrete, soft
  section labels (Around New Jersey / Across the Northeast / Across the US / Around the
  world) so the reader sees the dial worked.

Backend 366 + frontend tests green. (Latest feed still on v1 local-first; aligning it
to the dial is the immediate follow-up.)

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
2026-06-19 21:59:32 -04:00

257 lines
12 KiB
Python

"""Subject-geography for articles ("where is this story ABOUT").
Kept deliberately separate from scoring (see article_geo / article_places in db.py):
geography is durable metadata, scoring is volatile. The LLM proposes place NAMES;
this module disposes by normalizing to ISO codes in code, never trusting the model's
free text (so "Europe" never gets stored as a country). 'global'/placeless is a real,
first-class result, not a failure. "local" is NOT stored — it's relative to the reader;
the UI decides "Near you" by comparing these places to the visitor's chosen home.
"""
from __future__ import annotations

import json
import logging
import re
import sqlite3
import unicodedata

from .llm import LocalModelClient, parse_classifier_json
# Bump when the prompt/taxonomy changes, so a re-backfill can target stale rows.
GEO_VERSION = "geo-v1"

# Closed vocabularies for the classifier's "breadth" and "confidence" fields.
# classify_geo coerces any other value to "unknown" / "low" respectively.
BREADTHS = (
    "locality",
    "regional",
    "national",
    "multinational",
    "global",
    "unknown",
)
CONFIDENCES = (
    "high",
    "medium",
    "low",
)
# --- normalization data (LLM returns names; we map to ISO, drop the unmappable) ----
# Keys are spelled the way _norm_key emits them (lowercase, punctuation turned into
# spaces), which is why "Washington, D.C." is listed as "washington d c".
US_STATES = {
    "alabama": "AL", "alaska": "AK", "arizona": "AZ", "arkansas": "AR", "california": "CA",
    "colorado": "CO", "connecticut": "CT", "delaware": "DE", "florida": "FL", "georgia": "GA",
    "hawaii": "HI", "idaho": "ID", "illinois": "IL", "indiana": "IN", "iowa": "IA",
    "kansas": "KS", "kentucky": "KY", "louisiana": "LA", "maine": "ME", "maryland": "MD",
    "massachusetts": "MA", "michigan": "MI", "minnesota": "MN", "mississippi": "MS",
    "missouri": "MO", "montana": "MT", "nebraska": "NE", "nevada": "NV", "new hampshire": "NH",
    "new jersey": "NJ", "new mexico": "NM", "new york": "NY", "north carolina": "NC",
    "north dakota": "ND", "ohio": "OH", "oklahoma": "OK", "oregon": "OR", "pennsylvania": "PA",
    "rhode island": "RI", "south carolina": "SC", "south dakota": "SD", "tennessee": "TN",
    "texas": "TX", "utah": "UT", "vermont": "VT", "virginia": "VA", "washington": "WA",
    "west virginia": "WV", "wisconsin": "WI", "wyoming": "WY",
    "district of columbia": "DC", "washington dc": "DC", "washington d c": "DC",
    # A bare "DC" / "D.C." (without "Washington") normalizes to these two keys.
    "dc": "DC", "d c": "DC",
}
# US Census Bureau regions — the "Regional" grain for the scope dial. Standard,
# explainable, not arbitrary. DC sits in the South (South Atlantic) per Census.
# Region name -> set of two-letter state codes; the sets do not overlap.
US_REGIONS = {
    "Northeast": {
        "CT", "ME", "MA", "NH", "RI", "VT",
        "NJ", "NY", "PA",
    },
    "Midwest": {
        "IL", "IN", "MI", "OH", "WI",
        "IA", "KS", "MN", "MO", "NE", "ND", "SD",
    },
    "South": {
        "DE", "FL", "GA", "MD", "NC", "SC", "VA", "DC", "WV",
        "AL", "KY", "MS", "TN",
        "AR", "LA", "OK", "TX",
    },
    "West": {
        "AZ", "CO", "ID", "MT", "NV", "NM", "UT", "WY",
        "AK", "CA", "HI", "OR", "WA",
    },
}
def region_of(state_code: str | None) -> str | None:
    """Return the name of the Census region that contains a US state code.

    The lookup is case-insensitive. Returns None for a missing/empty code or a
    code that belongs to no region in US_REGIONS.
    """
    if not state_code:
        return None
    wanted = state_code.upper()
    # First region whose member set holds the code; None when nothing matches.
    return next(
        (region for region, members in US_REGIONS.items() if wanted in members),
        None,
    )
def region_states(state_code: str | None) -> list[str]:
    """Return the sorted state codes sharing a Census region with ``state_code``.

    The given state itself is included. An unknown or missing code yields [].
    """
    region = region_of(state_code)
    if not region:
        return []
    return sorted(US_REGIONS[region])
# Common countries + aliases (extensible). Anything not here returns None -> we drop
# the country rather than store garbage. breadth still captures national/global, etc.
# Keys are spelled the way _norm_key emits them: lowercase, with punctuation turned
# into spaces. Dotted abbreviations therefore need their own spaced aliases
# ("U.S." -> "u s"), the same way US_STATES carries "washington d c".
COUNTRY_TO_ISO = {
    "united states": "US", "united states of america": "US", "usa": "US", "us": "US", "america": "US",
    "u s": "US", "u s a": "US",
    "united kingdom": "GB", "uk": "GB", "britain": "GB", "great britain": "GB", "england": "GB",
    "scotland": "GB", "wales": "GB", "northern ireland": "GB",
    "u k": "GB",
    "canada": "CA", "australia": "AU", "new zealand": "NZ", "ireland": "IE",
    "france": "FR", "germany": "DE", "spain": "ES", "portugal": "PT", "italy": "IT",
    "netherlands": "NL", "belgium": "BE", "luxembourg": "LU", "switzerland": "CH", "austria": "AT",
    "denmark": "DK", "sweden": "SE", "norway": "NO", "finland": "FI", "iceland": "IS",
    "poland": "PL", "czech republic": "CZ", "czechia": "CZ", "slovakia": "SK", "hungary": "HU",
    "greece": "GR", "romania": "RO", "bulgaria": "BG", "croatia": "HR", "serbia": "RS",
    "ukraine": "UA", "russia": "RU", "turkey": "TR", "turkiye": "TR",
    "china": "CN", "japan": "JP", "south korea": "KR", "korea": "KR", "north korea": "KP",
    "india": "IN", "pakistan": "PK", "bangladesh": "BD", "sri lanka": "LK", "nepal": "NP",
    "indonesia": "ID", "malaysia": "MY", "singapore": "SG", "thailand": "TH", "vietnam": "VN",
    "philippines": "PH", "taiwan": "TW", "hong kong": "HK",
    "israel": "IL", "palestine": "PS", "saudi arabia": "SA", "united arab emirates": "AE",
    "uae": "AE", "u a e": "AE",
    "qatar": "QA", "iran": "IR", "iraq": "IQ", "egypt": "EG", "jordan": "JO",
    "south africa": "ZA", "nigeria": "NG", "kenya": "KE", "ethiopia": "ET", "ghana": "GH",
    "tanzania": "TZ", "uganda": "UG", "rwanda": "RW", "morocco": "MA", "tunisia": "TN",
    "mexico": "MX", "brazil": "BR", "argentina": "AR", "chile": "CL", "colombia": "CO",
    "peru": "PE", "venezuela": "VE", "ecuador": "EC", "bolivia": "BO", "uruguay": "UY",
    "costa rica": "CR", "panama": "PA", "guatemala": "GT", "cuba": "CU", "jamaica": "JM",
}
# Words that look like countries but are regions/continents -> never a country_code.
_NON_COUNTRY = {"europe", "asia", "africa", "north america", "south america", "latin america",
"the americas", "middle east", "scandinavia", "eu", "european union", "world",
"global", "international", "earth", "the world"}
def _norm_key(name) -> str:
s = re.sub(r"[^a-z0-9 ]", " ", str(name or "").lower())
s = re.sub(r"\bthe\b", " ", s)
return re.sub(r"\s+", " ", s).strip()
def normalize_country(name) -> str | None:
    """Map a free-text country name to its code in COUNTRY_TO_ISO, else None.

    Empty names, names listed in _NON_COUNTRY (continents, "world", ...) and
    names absent from COUNTRY_TO_ISO all return None.
    """
    key = _norm_key(name)
    if key and key not in _NON_COUNTRY:
        return COUNTRY_TO_ISO.get(key)
    return None
def normalize_state(name, country_code) -> str | None:
    """Map a state/province name to a US state code, or None.

    Only US subdivisions are resolved in v1: any ``country_code`` other than
    "US" (including None) short-circuits to None without looking at ``name``.
    """
    is_us = country_code == "US"
    return US_STATES.get(_norm_key(name)) if is_us else None
def normalize_places(raw) -> list[dict]:
    """Clean the LLM's place dicts into deduped ``{country_code, state_code, locality}``.

    Non-list input yields []. Non-dict items are skipped, as are items where
    country, state and locality all come out empty. Duplicates are detected on
    (country_code, state_code, lowercased locality); the first occurrence wins
    and input order is preserved.
    """
    if not isinstance(raw, list):
        return []

    cleaned: list[dict] = []
    fingerprints = set()
    for entry in raw:
        if not isinstance(entry, dict):
            continue
        country = normalize_country(entry.get("country"))
        state = normalize_state(entry.get("state_province"), country)
        locality = str(entry.get("locality") or "").strip() or None
        if not any((country, state, locality)):
            continue  # entirely empty -> drop
        fingerprint = (country, state, locality.lower() if locality else "")
        if fingerprint in fingerprints:
            continue
        fingerprints.add(fingerprint)
        cleaned.append({"country_code": country, "state_code": state, "locality": locality})
    return cleaned
# --- LLM extraction (separate pass; does not touch the scoring prompt) ------------
# System prompt for the geo pass: classify_geo sends it as the "system" message.
# It tells the model to tag only where a story is ABOUT / HAPPENED, to use 'global'
# for placeless stories and 'unknown' for incidental/unclear ones, and to reply with
# a bare JSON object. The text is runtime input to the model — edit deliberately, and
# bump GEO_VERSION when it changes (see the note on GEO_VERSION).
SYSTEM = (
"You tag the real-world geography of a news story for a calm good-news site. "
"Identify the place(s) the story is fundamentally ABOUT or where it HAPPENED, "
"NOT places mentioned only in passing. Many good-news stories (general science, "
"space, broad research, health) have no specific place; those are 'global'. If a "
"location is only incidental or genuinely unclear, use 'unknown'. Never guess. "
"Reply with ONLY a JSON object, no prose."
)
# Output-format instructions: classify_geo appends this to the article text in the
# "user" message. The breadth and confidence values spelled out here are the same
# ones listed in BREADTHS and CONFIDENCES, and the place keys ("country",
# "state_province", "locality") are the ones normalize_places reads — keep all
# three in sync when editing.
INSTRUCT = (
"Return JSON exactly like:\n"
'{"breadth": "<locality|regional|national|multinational|global|unknown>", '
'"places": [{"country": "<name or null>", "state_province": "<name or null>", '
'"locality": "<city/town or null>"}], "confidence": "<high|medium|low>", '
'"rationale": "<one short clause: where it happened and why>"}\n'
"breadth: locality=a specific city/town/county; regional=a state/province/region; "
"national=about a whole country; multinational=a few specific countries; "
"global=worldwide or no specific country; unknown=incidental/unclear. "
"places may list more than one when a story genuinely spans regions; use null for parts you can't support.\n"
"confidence: use 'high' ONLY when the location is explicitly stated or unmistakable; "
"'medium' when reasonably inferred; 'low' when shaky. Do NOT default to high."
)
def _article_text(row) -> str:
parts = [f"TITLE: {row['title']}"]
for label, key in (("SUMMARY", "summary"), ("WHAT HAPPENED", "what_happened"),
("WHY IT MATTERS", "why_matters"), ("PUBLISHER BLURB", "description")):
try:
v = row[key]
except (KeyError, IndexError):
v = None
if v:
parts.append(f"{label}: {v}")
return "\n".join(parts)
def classify_geo(client: LocalModelClient, row) -> dict:
    """Run one geo pass over an article row and return the normalized result.

    Sends SYSTEM plus the article text and INSTRUCT to ``client.chat_text`` and
    parses the reply with ``parse_classifier_json``. Raises on unparseable.

    Returns a dict with:
      breadth     a member of BREADTHS ("unknown" if the model's value is not one)
      confidence  a member of CONFIDENCES ("low" if the model's value is not one)
      rationale   the model's rationale cut to 300 characters, or None if empty
      places      the output of normalize_places on the model's "places"
    """
    prompt = _article_text(row) + "\n\n" + INSTRUCT
    reply = client.chat_text([
        {"role": "system", "content": SYSTEM},
        {"role": "user", "content": prompt},
    ])
    data = parse_classifier_json(reply)

    breadth = data.get("breadth")
    confidence = data.get("confidence")
    rationale = str(data.get("rationale") or "")[:300]
    return {
        # Off-vocabulary answers fall back to the most conservative member.
        "breadth": breadth if breadth in BREADTHS else "unknown",
        "confidence": confidence if confidence in CONFIDENCES else "low",
        "rationale": rationale or None,
        "places": normalize_places(data.get("places")),
    }
def store_geo(conn: sqlite3.Connection, article_id: int, result: dict, version: str = GEO_VERSION) -> None:
    """Persist one geo result: upsert its article_geo row, then rewrite its places.

    Geo is fully re-derivable, so wiping and re-inserting article_places (unlike
    scores, which we never delete) is safe. Each place is stored with its list
    position in ``ord``. Does not commit; the caller owns the transaction.
    """
    upsert_sql = (
        "INSERT INTO article_geo (article_id, breadth, confidence, rationale, geo_version, updated_at) "
        "VALUES (?,?,?,?,?, datetime('now')) "
        "ON CONFLICT(article_id) DO UPDATE SET breadth=excluded.breadth, confidence=excluded.confidence, "
        "rationale=excluded.rationale, geo_version=excluded.geo_version, updated_at=excluded.updated_at"
    )
    geo_params = (article_id, result["breadth"], result["confidence"], result.get("rationale"), version)
    conn.execute(upsert_sql, geo_params)

    conn.execute("DELETE FROM article_places WHERE article_id=?", (article_id,))
    # Fed lazily, so rows are inserted one at a time in list order.
    place_rows = (
        (article_id, place.get("country_code"), place.get("state_code"), place.get("locality"), position)
        for position, place in enumerate(result.get("places") or [])
    )
    conn.executemany(
        "INSERT INTO article_places (article_id, country_code, state_code, locality, ord) VALUES (?,?,?,?,?)",
        place_rows,
    )
def tag_articles(conn: sqlite3.Connection, client: LocalModelClient, limit: int = 200,
                 reclassify: bool = False) -> dict:
    """Tag accepted, non-duplicate articles that lack current geo.

    Idempotent: skips rows already at GEO_VERSION unless reclassify=True. Used both
    by the cycle (new articles) and the backfill (existing ones). Candidates are
    taken newest-first (by discovered_at), at most ``limit`` per call.

    Per-article failure is non-fatal: the article's partial writes are rolled back,
    the failure is logged with its traceback, and the loop moves on.

    Returns ``{"candidates": n, "tagged": n, "errors": n}``.
    """
    log = logging.getLogger(__name__)

    # `where` is one of two fixed fragments (never caller text), so interpolating it
    # into the statement is safe; every value still travels as a bound parameter.
    if reclassify:
        where = "1=1"
        params = (limit,)
    else:
        where = "(g.article_id IS NULL OR g.geo_version IS NOT ?)"
        params = (GEO_VERSION, limit)
    rows = conn.execute(
        f"""SELECT a.id, a.title, a.description,
                   sm.summary, sm.what_happened, sm.why_matters
            FROM articles a
            JOIN article_scores s ON s.article_id = a.id
            LEFT JOIN article_summaries sm ON sm.article_id = a.id
            LEFT JOIN article_geo g ON g.article_id = a.id
            WHERE s.accepted = 1 AND a.duplicate_of IS NULL AND {where}
            ORDER BY a.discovered_at DESC
            LIMIT ?""",
        params,
    ).fetchall()

    tagged = errors = 0
    for r in rows:
        article_id = None  # stays None if even reading the id fails
        try:
            article_id = r["id"]
            store_geo(conn, article_id, classify_geo(client, r))
            # Keep live auth/admin writes healthy while the scheduled cycle runs.
            # Geo classification calls the LLM per article; if we batch commits, the
            # first stored article opens a write transaction that can stay open while
            # the next several LLM calls run. That starves login/session writes long
            # enough to trip SQLite's busy timeout. Commit each successful article so
            # the writer lock is held for milliseconds, not minutes.
            conn.commit()
            tagged += 1
        except Exception:  # noqa: BLE001 — non-fatal, like other cycle steps
            conn.rollback()
            errors += 1
            # Counting alone hides the cause; keep the traceback so a systematic
            # failure (model down, schema drift) can be diagnosed from the logs.
            log.warning("geo tagging failed for article %s", article_id, exc_info=True)
    conn.commit()
    return {"candidates": len(rows), "tagged": tagged, "errors": errors}