"""Subject-geography for articles ("where is this story ABOUT"). Kept deliberately separate from scoring (see article_geo / article_places in db.py): geography is durable metadata, scoring is volatile. The LLM proposes place NAMES; this module disposes by normalizing to ISO codes in code, never trusting the model's free text (so "Europe" never gets stored as a country). 'global'/placeless is a real, first-class result, not a failure. "local" is NOT stored — it's relative to the reader; the UI decides "Near you" by comparing these places to the visitor's chosen home. """ from __future__ import annotations import json import re import sqlite3 from .llm import LocalModelClient, parse_classifier_json # Bump when the prompt/taxonomy changes, so a re-backfill can target stale rows. GEO_VERSION = "geo-v1" BREADTHS = ("locality", "regional", "national", "multinational", "global", "unknown") CONFIDENCES = ("high", "medium", "low") # --- normalization data (LLM returns names; we map to ISO, drop the unmappable) ---- US_STATES = { "alabama": "AL", "alaska": "AK", "arizona": "AZ", "arkansas": "AR", "california": "CA", "colorado": "CO", "connecticut": "CT", "delaware": "DE", "florida": "FL", "georgia": "GA", "hawaii": "HI", "idaho": "ID", "illinois": "IL", "indiana": "IN", "iowa": "IA", "kansas": "KS", "kentucky": "KY", "louisiana": "LA", "maine": "ME", "maryland": "MD", "massachusetts": "MA", "michigan": "MI", "minnesota": "MN", "mississippi": "MS", "missouri": "MO", "montana": "MT", "nebraska": "NE", "nevada": "NV", "new hampshire": "NH", "new jersey": "NJ", "new mexico": "NM", "new york": "NY", "north carolina": "NC", "north dakota": "ND", "ohio": "OH", "oklahoma": "OK", "oregon": "OR", "pennsylvania": "PA", "rhode island": "RI", "south carolina": "SC", "south dakota": "SD", "tennessee": "TN", "texas": "TX", "utah": "UT", "vermont": "VT", "virginia": "VA", "washington": "WA", "west virginia": "WV", "wisconsin": "WI", "wyoming": "WY", "district of columbia": "DC", "washington dc": "DC", "washington d c": "DC", } # Common countries + aliases (extensible). Anything not here returns None -> we drop # the country rather than store garbage. breadth still captures national/global, etc. COUNTRY_TO_ISO = { "united states": "US", "united states of america": "US", "usa": "US", "us": "US", "america": "US", "united kingdom": "GB", "uk": "GB", "britain": "GB", "great britain": "GB", "england": "GB", "scotland": "GB", "wales": "GB", "northern ireland": "GB", "canada": "CA", "australia": "AU", "new zealand": "NZ", "ireland": "IE", "france": "FR", "germany": "DE", "spain": "ES", "portugal": "PT", "italy": "IT", "netherlands": "NL", "belgium": "BE", "luxembourg": "LU", "switzerland": "CH", "austria": "AT", "denmark": "DK", "sweden": "SE", "norway": "NO", "finland": "FI", "iceland": "IS", "poland": "PL", "czech republic": "CZ", "czechia": "CZ", "slovakia": "SK", "hungary": "HU", "greece": "GR", "romania": "RO", "bulgaria": "BG", "croatia": "HR", "serbia": "RS", "ukraine": "UA", "russia": "RU", "turkey": "TR", "turkiye": "TR", "china": "CN", "japan": "JP", "south korea": "KR", "korea": "KR", "north korea": "KP", "india": "IN", "pakistan": "PK", "bangladesh": "BD", "sri lanka": "LK", "nepal": "NP", "indonesia": "ID", "malaysia": "MY", "singapore": "SG", "thailand": "TH", "vietnam": "VN", "philippines": "PH", "taiwan": "TW", "hong kong": "HK", "israel": "IL", "palestine": "PS", "saudi arabia": "SA", "united arab emirates": "AE", "uae": "AE", "qatar": "QA", "iran": "IR", "iraq": "IQ", "egypt": "EG", "jordan": "JO", "south africa": "ZA", "nigeria": "NG", "kenya": "KE", "ethiopia": "ET", "ghana": "GH", "tanzania": "TZ", "uganda": "UG", "rwanda": "RW", "morocco": "MA", "tunisia": "TN", "mexico": "MX", "brazil": "BR", "argentina": "AR", "chile": "CL", "colombia": "CO", "peru": "PE", "venezuela": "VE", "ecuador": "EC", "bolivia": "BO", "uruguay": "UY", "costa rica": "CR", "panama": "PA", "guatemala": "GT", "cuba": "CU", "jamaica": "JM", } # Words that look like countries but are regions/continents -> never a country_code. _NON_COUNTRY = {"europe", "asia", "africa", "north america", "south america", "latin america", "the americas", "middle east", "scandinavia", "eu", "european union", "world", "global", "international", "earth", "the world"} def _norm_key(name) -> str: s = re.sub(r"[^a-z0-9 ]", " ", str(name or "").lower()) s = re.sub(r"\bthe\b", " ", s) return re.sub(r"\s+", " ", s).strip() def normalize_country(name) -> str | None: key = _norm_key(name) if not key or key in _NON_COUNTRY: return None return COUNTRY_TO_ISO.get(key) def normalize_state(name, country_code) -> str | None: if country_code != "US": return None # only US subdivisions for v1 return US_STATES.get(_norm_key(name)) def normalize_places(raw) -> list[dict]: """LLM place dicts -> cleaned, deduped [{country_code, state_code, locality}].""" out, seen = [], set() if not isinstance(raw, list): return out for p in raw: if not isinstance(p, dict): continue cc = normalize_country(p.get("country")) sc = normalize_state(p.get("state_province"), cc) loc = str(p.get("locality") or "").strip() or None if not (cc or sc or loc): continue # entirely empty -> drop key = (cc, sc, (loc or "").lower()) if key in seen: continue seen.add(key) out.append({"country_code": cc, "state_code": sc, "locality": loc}) return out # --- LLM extraction (separate pass; does not touch the scoring prompt) ------------ SYSTEM = ( "You tag the real-world geography of a news story for a calm good-news site. " "Identify the place(s) the story is fundamentally ABOUT or where it HAPPENED, " "NOT places mentioned only in passing. Many good-news stories (general science, " "space, broad research, health) have no specific place; those are 'global'. If a " "location is only incidental or genuinely unclear, use 'unknown'. Never guess. " "Reply with ONLY a JSON object, no prose." ) INSTRUCT = ( "Return JSON exactly like:\n" '{"breadth": "", ' '"places": [{"country": "", "state_province": "", ' '"locality": ""}], "confidence": "", ' '"rationale": ""}\n' "breadth: locality=a specific city/town/county; regional=a state/province/region; " "national=about a whole country; multinational=a few specific countries; " "global=worldwide or no specific country; unknown=incidental/unclear. " "places may list more than one when a story genuinely spans regions; use null for parts you can't support.\n" "confidence: use 'high' ONLY when the location is explicitly stated or unmistakable; " "'medium' when reasonably inferred; 'low' when shaky. Do NOT default to high." ) def _article_text(row) -> str: parts = [f"TITLE: {row['title']}"] for label, key in (("SUMMARY", "summary"), ("WHAT HAPPENED", "what_happened"), ("WHY IT MATTERS", "why_matters"), ("PUBLISHER BLURB", "description")): try: v = row[key] except (KeyError, IndexError): v = None if v: parts.append(f"{label}: {v}") return "\n".join(parts) def classify_geo(client: LocalModelClient, row) -> dict: """One geo pass over an article row -> normalized result. Raises on unparseable.""" messages = [ {"role": "system", "content": SYSTEM}, {"role": "user", "content": _article_text(row) + "\n\n" + INSTRUCT}, ] data = parse_classifier_json(client.chat_text(messages)) breadth = data.get("breadth") if breadth not in BREADTHS: breadth = "unknown" confidence = data.get("confidence") if confidence not in CONFIDENCES: confidence = "low" return { "breadth": breadth, "confidence": confidence, "rationale": (str(data.get("rationale") or "")[:300]) or None, "places": normalize_places(data.get("places")), } def store_geo(conn: sqlite3.Connection, article_id: int, result: dict, version: str = GEO_VERSION) -> None: """Upsert article_geo and replace article_places. Geo is fully re-derivable, so replacing places (unlike scores, which we never delete) is safe.""" conn.execute( "INSERT INTO article_geo (article_id, breadth, confidence, rationale, geo_version, updated_at) " "VALUES (?,?,?,?,?, datetime('now')) " "ON CONFLICT(article_id) DO UPDATE SET breadth=excluded.breadth, confidence=excluded.confidence, " "rationale=excluded.rationale, geo_version=excluded.geo_version, updated_at=excluded.updated_at", (article_id, result["breadth"], result["confidence"], result.get("rationale"), version), ) conn.execute("DELETE FROM article_places WHERE article_id=?", (article_id,)) for i, p in enumerate(result.get("places") or []): conn.execute( "INSERT INTO article_places (article_id, country_code, state_code, locality, ord) VALUES (?,?,?,?,?)", (article_id, p.get("country_code"), p.get("state_code"), p.get("locality"), i), ) def tag_articles(conn: sqlite3.Connection, client: LocalModelClient, limit: int = 200, reclassify: bool = False) -> dict: """Tag accepted, non-duplicate articles that lack current geo. Idempotent: skips rows already at GEO_VERSION unless reclassify=True. Used both by the cycle (new articles) and the backfill (existing ones). Per-article failure is non-fatal.""" if reclassify: where = "1=1" else: where = "(g.article_id IS NULL OR g.geo_version IS NOT ?)" rows = conn.execute( f"""SELECT a.id, a.title, a.description, sm.summary, sm.what_happened, sm.why_matters FROM articles a JOIN article_scores s ON s.article_id = a.id LEFT JOIN article_summaries sm ON sm.article_id = a.id LEFT JOIN article_geo g ON g.article_id = a.id WHERE s.accepted = 1 AND a.duplicate_of IS NULL AND {where} ORDER BY a.discovered_at DESC LIMIT ?""", (() if reclassify else (GEO_VERSION,)) + (limit,), ).fetchall() tagged = errors = 0 for r in rows: try: store_geo(conn, r["id"], classify_geo(client, r)) tagged += 1 except Exception: # noqa: BLE001 — non-fatal, like other cycle steps errors += 1 if (tagged + errors) % 25 == 0: conn.commit() conn.commit() return {"candidates": len(rows), "tagged": tagged, "errors": errors}