Geo Stage 1-2: subject-geography model + classifier + pipeline wiring

"Closer to Home" foundation (audit greenlit by Codex). Durable geography, kept decoupled from volatile scoring. - Schema: article_geo (breadth/confidence/rationale/geo_version) + article_places (0..N ISO-coded places), separate from article_scores so re-runs/audits never disturb scoring or acceptance. "local" is never stored — it's relative to the reader; the UI computes "Near you" later. - geo.py: LLM proposes place NAMES, code disposes to ISO codes (country alpha-2, US state 2-letter); region words like "Europe" can never become a country. 'global'/placeless is first-class, not failure. Confidence calibrated so 'high' needs an explicit location. Geo is its OWN LLM pass, not merged into the scoring prompt (durable metadata, re-runnable, keeps the sensitive prompt untouched). - store_geo replaces places (geo is re-derivable, unlike scores). tag_articles is idempotent by geo_version, only touches accepted non-duplicate articles. - CLI `geo` command (cycle-locked, --limit/--reclassify) for backfill, plus a bounded geo step in the cycle (--geo-limit 60, --no-geo). scripts/geo_audit.py is the prototype audit tool. 360 tests green; live smoke tagged real articles correctly (Gaza->PS, London->GB, placeless science->global). No UI / SEO pages yet — ranking/personalization only. Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
2026-06-19 16:56:49 -04:00
parent 59ff48ae90
commit 1c05554a28
7 changed files with 613 additions and 1 deletions
@@ -12,6 +12,7 @@ from .digest import send_due_digests
 from .games import generate_daily_puzzles
 from .localtime import local_today
 from .dedup import DEFAULT_THRESHOLD, DEFAULT_WINDOW_DAYS, cluster_duplicates, dedup as run_dedup
+from .geo import tag_articles as tag_geo
 from .enrich import enrich_brief_images, enrich_recent_images, enrich_summarized_images
 from .summarize import generate_summary, get_summary
 from .feeds import (
@@ -132,6 +133,8 @@ def main() -> None:
    cycle_parser.add_argument("--classify-limit", type=int, default=40)
    cycle_parser.add_argument("--no-classify", action="store_true", help="Skip the LLM classify step")
    cycle_parser.add_argument("--no-dedup", action="store_true", help="Skip the embedding dedup step")
+    cycle_parser.add_argument("--no-geo", action="store_true", help="Skip tagging article subject-geography")
+    cycle_parser.add_argument("--geo-limit", type=int, default=60, help="Max articles to geo-tag per cycle")
    cycle_parser.add_argument("--no-brief", action="store_true", help="Skip rebuilding today's brief")
    cycle_parser.add_argument("--no-review", action="store_true", help="Skip recomputing source review flags")
    cycle_parser.add_argument("--no-digest", action="store_true", help="Skip sending due daily digests")
@@ -147,6 +150,12 @@ def main() -> None:
    )
    enrich_images_parser.add_argument("--limit", type=int, default=50, help="Max articles to fetch this batch")

+    geo_parser = subparsers.add_parser("geo", help="Tag article subject-geography (backfill / manual). Cycle-locked.")
+    geo_parser.add_argument("--limit", type=int, default=200, help="Max articles to tag this batch")
+    geo_parser.add_argument("--reclassify", action="store_true", help="Re-tag even rows already at the current geo version")
+    geo_parser.add_argument("--base-url", help="OpenAI-compatible base URL")
+    geo_parser.add_argument("--model", help="Local model name")
+
    dedup_parser = subparsers.add_parser("dedup", help="Cluster near-duplicate stories via local embeddings")
    dedup_parser.add_argument("--threshold", type=float, default=DEFAULT_THRESHOLD, help="Cosine similarity cutoff")
    dedup_parser.add_argument("--window-days", type=int, default=DEFAULT_WINDOW_DAYS)
@@ -298,6 +307,15 @@ def main() -> None:
    elif args.command == "enrich-images":
        found = enrich_summarized_images(conn, limit=args.limit)
        print(f"enrich-images: {found} new image(s) for summarized articles")
+    elif args.command == "geo":
+        init_db(conn)
+        # Cycle-locked so a manual backfill can't contend with the scheduled cycle.
+        with cycle_lock(args.db) as acquired:
+            if not acquired:
+                print("geo: a cycle is already running; try again after it finishes")
+                return
+            g = tag_geo(conn, llm_client_from_args(args), limit=args.limit, reclassify=args.reclassify)
+        print(f"geo: tagged={g['tagged']} errors={g['errors']} (of {g['candidates']} candidates)")
    elif args.command == "dedup":
        init_db(conn)
        if args.force_recluster:
@@ -506,6 +524,16 @@ def _run_cycle_locked(conn: sqlite3.Connection, args: argparse.Namespace) -> Non
        except Exception as exc:
            print(f"dedup: skipped ({exc})")

+    # Geo: tag newly-accepted, non-duplicate articles with subject geography (its own
+    # LLM pass, decoupled from scoring). Bounded per cycle; idempotent (skips rows
+    # already at the current GEO_VERSION). Non-fatal like every other step.
+    if not args.no_geo:
+        try:
+            g = tag_geo(conn, llm_client_from_args(args), limit=args.geo_limit)
+            print(f"geo: tagged={g['tagged']} errors={g['errors']} (of {g['candidates']} untagged)")
+        except Exception as exc:
+            print(f"geo: skipped ({exc})")
+
    if not args.no_brief:
        today = local_today()
        try:
@@ -217,6 +217,34 @@ CREATE TABLE IF NOT EXISTS article_summaries (
    created_at TEXT NOT NULL DEFAULT CURRENT_TIMESTAMP
 );

+-- Where a story is ABOUT (subject geography), kept SEPARATE from article_scores so
+-- durable geography isn't coupled to volatile scoring/acceptance. "local" is never
+-- stored here — it's relative to the reader; the UI computes "Near you" by comparing
+-- these places to the visitor's chosen home. geo_version lets us re-backfill cleanly
+-- when the prompt/taxonomy changes. 'global' is a real category, not a failure.
+CREATE TABLE IF NOT EXISTS article_geo (
+    article_id INTEGER PRIMARY KEY REFERENCES articles(id) ON DELETE CASCADE,
+    breadth TEXT NOT NULL DEFAULT 'unknown',   -- locality|regional|national|multinational|global|unknown
+    confidence TEXT NOT NULL DEFAULT 'low',    -- high|medium|low
+    rationale TEXT,
+    geo_version TEXT,
+    updated_at TEXT NOT NULL DEFAULT CURRENT_TIMESTAMP
+);
+-- 0..N normalized places per article (a story can span regions). Codes are ISO
+-- (country = alpha-2, state = US 2-letter / ISO-3166-2 subdivision), normalized in
+-- code — never trusting the model's free text.
+CREATE TABLE IF NOT EXISTS article_places (
+    id INTEGER PRIMARY KEY AUTOINCREMENT,
+    article_id INTEGER NOT NULL REFERENCES articles(id) ON DELETE CASCADE,
+    country_code TEXT,
+    state_code TEXT,
+    locality TEXT,
+    ord INTEGER NOT NULL DEFAULT 0
+);
+CREATE INDEX IF NOT EXISTS idx_article_places_article ON article_places(article_id);
+CREATE INDEX IF NOT EXISTS idx_article_places_country ON article_places(country_code);
+CREATE INDEX IF NOT EXISTS idx_article_geo_breadth ON article_geo(breadth);
+
 -- Privacy-respecting, first-party analytics. NO IP / user-agent / referrer / raw
 -- URL. visitor_hash is a hash of a random localStorage token (never email/IP).
 -- The UNIQUE key dedups to one row per (kind, article, visitor, day) — that both
@@ -0,0 +1,222 @@
+"""Subject-geography for articles ("where is this story ABOUT").
+
+Kept deliberately separate from scoring (see article_geo / article_places in db.py):
+geography is durable metadata, scoring is volatile. The LLM proposes place NAMES;
+this module disposes by normalizing to ISO codes in code, never trusting the model's
+free text (so "Europe" never gets stored as a country). 'global'/placeless is a real,
+first-class result, not a failure. "local" is NOT stored — it's relative to the reader;
+the UI decides "Near you" by comparing these places to the visitor's chosen home.
+"""
+from __future__ import annotations
+
+import json
+import re
+import sqlite3
+
+from .llm import LocalModelClient, parse_classifier_json
+
+# Bump when the prompt/taxonomy changes, so a re-backfill can target stale rows.
+GEO_VERSION = "geo-v1"
+
+# Allowed breadth labels; classify_geo stores "unknown" for any other value.
+BREADTHS = ("locality", "regional", "national", "multinational", "global", "unknown")
+# Allowed confidence labels; classify_geo stores "low" for any other value.
+CONFIDENCES = ("high", "medium", "low")
+
+# --- normalization data (LLM returns names; we map to ISO, drop the unmappable) ----
+
+# US state / DC name -> 2-letter code. Looked up with the output of _norm_key(), so
+# every key must already be in that normalized form (lowercase, single-spaced, no
+# punctuation). Only consulted when the place's country resolved to "US" (see
+# normalize_state), which is why "georgia" here cannot clash with a country name.
+US_STATES = {
+    "alabama": "AL", "alaska": "AK", "arizona": "AZ", "arkansas": "AR", "california": "CA",
+    "colorado": "CO", "connecticut": "CT", "delaware": "DE", "florida": "FL", "georgia": "GA",
+    "hawaii": "HI", "idaho": "ID", "illinois": "IL", "indiana": "IN", "iowa": "IA",
+    "kansas": "KS", "kentucky": "KY", "louisiana": "LA", "maine": "ME", "maryland": "MD",
+    "massachusetts": "MA", "michigan": "MI", "minnesota": "MN", "mississippi": "MS",
+    "missouri": "MO", "montana": "MT", "nebraska": "NE", "nevada": "NV", "new hampshire": "NH",
+    "new jersey": "NJ", "new mexico": "NM", "new york": "NY", "north carolina": "NC",
+    "north dakota": "ND", "ohio": "OH", "oklahoma": "OK", "oregon": "OR", "pennsylvania": "PA",
+    "rhode island": "RI", "south carolina": "SC", "south dakota": "SD", "tennessee": "TN",
+    "texas": "TX", "utah": "UT", "vermont": "VT", "virginia": "VA", "washington": "WA",
+    "west virginia": "WV", "wisconsin": "WI", "wyoming": "WY",
+    "district of columbia": "DC", "washington dc": "DC", "washington d c": "DC",
+}
+
+# Common countries + aliases (extensible). Anything not here returns None -> we drop
+# the country rather than store garbage. breadth still captures national/global, etc.
+# Keys are looked up with the output of _norm_key() (see normalize_country), so new
+# entries must be written in that normalized form. Values are ISO 3166-1 alpha-2.
+# Note the deliberate collapses: the UK's constituent nations all map to GB, and a
+# bare "korea" resolves to KR.
+COUNTRY_TO_ISO = {
+    "united states": "US", "united states of america": "US", "usa": "US", "us": "US", "america": "US",
+    "united kingdom": "GB", "uk": "GB", "britain": "GB", "great britain": "GB", "england": "GB",
+    "scotland": "GB", "wales": "GB", "northern ireland": "GB",
+    "canada": "CA", "australia": "AU", "new zealand": "NZ", "ireland": "IE",
+    "france": "FR", "germany": "DE", "spain": "ES", "portugal": "PT", "italy": "IT",
+    "netherlands": "NL", "belgium": "BE", "luxembourg": "LU", "switzerland": "CH", "austria": "AT",
+    "denmark": "DK", "sweden": "SE", "norway": "NO", "finland": "FI", "iceland": "IS",
+    "poland": "PL", "czech republic": "CZ", "czechia": "CZ", "slovakia": "SK", "hungary": "HU",
+    "greece": "GR", "romania": "RO", "bulgaria": "BG", "croatia": "HR", "serbia": "RS",
+    "ukraine": "UA", "russia": "RU", "turkey": "TR", "turkiye": "TR",
+    "china": "CN", "japan": "JP", "south korea": "KR", "korea": "KR", "north korea": "KP",
+    "india": "IN", "pakistan": "PK", "bangladesh": "BD", "sri lanka": "LK", "nepal": "NP",
+    "indonesia": "ID", "malaysia": "MY", "singapore": "SG", "thailand": "TH", "vietnam": "VN",
+    "philippines": "PH", "taiwan": "TW", "hong kong": "HK",
+    "israel": "IL", "palestine": "PS", "saudi arabia": "SA", "united arab emirates": "AE",
+    "uae": "AE", "qatar": "QA", "iran": "IR", "iraq": "IQ", "egypt": "EG", "jordan": "JO",
+    "south africa": "ZA", "nigeria": "NG", "kenya": "KE", "ethiopia": "ET", "ghana": "GH",
+    "tanzania": "TZ", "uganda": "UG", "rwanda": "RW", "morocco": "MA", "tunisia": "TN",
+    "mexico": "MX", "brazil": "BR", "argentina": "AR", "chile": "CL", "colombia": "CO",
+    "peru": "PE", "venezuela": "VE", "ecuador": "EC", "bolivia": "BO", "uruguay": "UY",
+    "costa rica": "CR", "panama": "PA", "guatemala": "GT", "cuba": "CU", "jamaica": "JM",
+}
+
+# Words that look like countries but are regions/continents -> never a country_code.
+_NON_COUNTRY = {"europe", "asia", "africa", "north america", "south america", "latin america",
+                "the americas", "middle east", "scandinavia", "eu", "european union", "world",
+                "global", "international", "earth", "the world"}
+
+
+def _norm_key(name) -> str:
+    s = re.sub(r"[^a-z0-9 ]", " ", str(name or "").lower())
+    s = re.sub(r"\bthe\b", " ", s)
+    return re.sub(r"\s+", " ", s).strip()
+
+
+def normalize_country(name) -> str | None:
+    key = _norm_key(name)
+    if not key or key in _NON_COUNTRY:
+        return None
+    return COUNTRY_TO_ISO.get(key)
+
+
+def normalize_state(name, country_code) -> str | None:
+    if country_code != "US":
+        return None  # only US subdivisions for v1
+    return US_STATES.get(_norm_key(name))
+
+
+def normalize_places(raw) -> list[dict]:
+    """LLM place dicts -> cleaned, deduped [{country_code, state_code, locality}]."""
+    out, seen = [], set()
+    if not isinstance(raw, list):
+        return out
+    for p in raw:
+        if not isinstance(p, dict):
+            continue
+        cc = normalize_country(p.get("country"))
+        sc = normalize_state(p.get("state_province"), cc)
+        loc = str(p.get("locality") or "").strip() or None
+        if not (cc or sc or loc):
+            continue  # entirely empty -> drop
+        key = (cc, sc, (loc or "").lower())
+        if key in seen:
+            continue
+        seen.add(key)
+        out.append({"country_code": cc, "state_code": sc, "locality": loc})
+    return out
+
+
+# --- LLM extraction (separate pass; does not touch the scoring prompt) ------------
+
+# System message for the geo pass: frames the task and demands JSON-only output.
+# Per the note on GEO_VERSION, changing this text or INSTRUCT calls for a version bump.
+SYSTEM = (
+    "You tag the real-world geography of a news story for a calm good-news site. "
+    "Identify the place(s) the story is fundamentally ABOUT or where it HAPPENED, "
+    "NOT places mentioned only in passing. Many good-news stories (general science, "
+    "space, broad research, health) have no specific place; those are 'global'. If a "
+    "location is only incidental or genuinely unclear, use 'unknown'. Never guess. "
+    "Reply with ONLY a JSON object, no prose."
+)
+
+# Appended after the article text in the user message (see classify_geo). The JSON
+# keys named here (breadth, places[country/state_province/locality], confidence,
+# rationale) are exactly the keys classify_geo and normalize_places read back, and
+# the breadth/confidence vocabularies mirror BREADTHS / CONFIDENCES — keep in sync.
+INSTRUCT = (
+    "Return JSON exactly like:\n"
+    '{"breadth": "<locality|regional|national|multinational|global|unknown>", '
+    '"places": [{"country": "<name or null>", "state_province": "<name or null>", '
+    '"locality": "<city/town or null>"}], "confidence": "<high|medium|low>", '
+    '"rationale": "<one short clause: where it happened and why>"}\n'
+    "breadth: locality=a specific city/town/county; regional=a state/province/region; "
+    "national=about a whole country; multinational=a few specific countries; "
+    "global=worldwide or no specific country; unknown=incidental/unclear. "
+    "places may list more than one when a story genuinely spans regions; use null for parts you can't support.\n"
+    "confidence: use 'high' ONLY when the location is explicitly stated or unmistakable; "
+    "'medium' when reasonably inferred; 'low' when shaky. Do NOT default to high."
+)
+
+
+def _article_text(row) -> str:
+    parts = [f"TITLE: {row['title']}"]
+    for label, key in (("SUMMARY", "summary"), ("WHAT HAPPENED", "what_happened"),
+                       ("WHY IT MATTERS", "why_matters"), ("PUBLISHER BLURB", "description")):
+        try:
+            v = row[key]
+        except (KeyError, IndexError):
+            v = None
+        if v:
+            parts.append(f"{label}: {v}")
+    return "\n".join(parts)
+
+
+def classify_geo(client: LocalModelClient, row) -> dict:
+    """One geo pass over an article row -> normalized result. Raises on unparseable."""
+    messages = [
+        {"role": "system", "content": SYSTEM},
+        {"role": "user", "content": _article_text(row) + "\n\n" + INSTRUCT},
+    ]
+    data = parse_classifier_json(client.chat_text(messages))
+    breadth = data.get("breadth")
+    if breadth not in BREADTHS:
+        breadth = "unknown"
+    confidence = data.get("confidence")
+    if confidence not in CONFIDENCES:
+        confidence = "low"
+    return {
+        "breadth": breadth,
+        "confidence": confidence,
+        "rationale": (str(data.get("rationale") or "")[:300]) or None,
+        "places": normalize_places(data.get("places")),
+    }
+
+
+def store_geo(conn: sqlite3.Connection, article_id: int, result: dict, version: str = GEO_VERSION) -> None:
+    """Upsert article_geo and replace article_places. Geo is fully re-derivable, so
+    replacing places (unlike scores, which we never delete) is safe."""
+    conn.execute(
+        "INSERT INTO article_geo (article_id, breadth, confidence, rationale, geo_version, updated_at) "
+        "VALUES (?,?,?,?,?, datetime('now')) "
+        "ON CONFLICT(article_id) DO UPDATE SET breadth=excluded.breadth, confidence=excluded.confidence, "
+        "rationale=excluded.rationale, geo_version=excluded.geo_version, updated_at=excluded.updated_at",
+        (article_id, result["breadth"], result["confidence"], result.get("rationale"), version),
+    )
+    conn.execute("DELETE FROM article_places WHERE article_id=?", (article_id,))
+    for i, p in enumerate(result.get("places") or []):
+        conn.execute(
+            "INSERT INTO article_places (article_id, country_code, state_code, locality, ord) VALUES (?,?,?,?,?)",
+            (article_id, p.get("country_code"), p.get("state_code"), p.get("locality"), i),
+        )
+
+
+def tag_articles(conn: sqlite3.Connection, client: LocalModelClient, limit: int = 200,
+                 reclassify: bool = False) -> dict:
+    """Tag accepted, non-duplicate articles that lack current geo. Idempotent: skips
+    rows already at GEO_VERSION unless reclassify=True. Used both by the cycle (new
+    articles) and the backfill (existing ones). Per-article failure is non-fatal."""
+    if reclassify:
+        where = "1=1"
+    else:
+        where = "(g.article_id IS NULL OR g.geo_version IS NOT ?)"
+    rows = conn.execute(
+        f"""SELECT a.id, a.title, a.description,
+                   sm.summary, sm.what_happened, sm.why_matters
+            FROM articles a
+            JOIN article_scores s ON s.article_id = a.id
+            LEFT JOIN article_summaries sm ON sm.article_id = a.id
+            LEFT JOIN article_geo g ON g.article_id = a.id
+            WHERE s.accepted = 1 AND a.duplicate_of IS NULL AND {where}
+            ORDER BY a.discovered_at DESC
+            LIMIT ?""",
+        (() if reclassify else (GEO_VERSION,)) + (limit,),
+    ).fetchall()
+    tagged = errors = 0
+    for r in rows:
+        try:
+            store_geo(conn, r["id"], classify_geo(client, r))
+            tagged += 1
+        except Exception:  # noqa: BLE001 — non-fatal, like other cycle steps
+            errors += 1
+        if (tagged + errors) % 25 == 0:
+            conn.commit()
+    conn.commit()
+    return {"candidates": len(rows), "tagged": tagged, "errors": errors}