upbeatBytes/scripts/geo_audit.py

#!/usr/bin/env python3
"""PROTOTYPE geo audit (not production).

Codex/Claude plan: before building any "Closer to Home" UI or touching the
production classify schema, measure what subject-geography the LLM can actually
extract from recent good-news articles, and whether it understands WHERE A STORY
HAPPENED vs. merely spotting place names.

Key taxonomy decision: "local" is relative to the VIEWER, so we do NOT store it.
We store the article's intrinsic geographic BREADTH (locality/regional/national/
multinational/global/unknown) plus the actual place(s). The UI later decides
"Near you" by comparing those places to the visitor's chosen home.

This writes results to a scratch JSON file and prints a coverage report. It does
not migrate the DB, change the classify pipeline, or backfill anything.

Run (host can reach the LAN model):
  .venv/bin/python scripts/geo_audit.py --limit 400 --base-url http://127.0.0.1:8080/v1
Resumable: re-running skips article ids already in the out file.
"""
from __future__ import annotations

import argparse
import json
import statistics
from collections import Counter
from pathlib import Path

from goodnews.cli import _default_db
from goodnews.db import connect
from goodnews.llm import LocalModelClient, parse_classifier_json

# Closed set of breadth labels an article can carry. extract() replaces any
# model-supplied value outside this set with "unknown".
BREADTHS = {"locality", "regional", "national", "multinational", "global", "unknown"}

# System prompt: sent by extract() as the first chat message. It steers the
# model toward where the story happened rather than every place name mentioned.
SYSTEM = (
    "You tag the real-world geography of a news story for a calm good-news site. "
    "Identify the place(s) the story is fundamentally ABOUT or where it HAPPENED — "
    "NOT places mentioned only in passing. Many good-news stories (general science, "
    "space, broad research) have no specific place; those are 'global'. If a location "
    "is only incidental or genuinely unclear, use 'unknown'. Do not guess. "
    "Reply with ONLY a JSON object, no prose."
)

# Output-format instructions: extract() appends this after the article text in
# the user message. The breadth options spelled out in the template repeat the
# members of BREADTHS by hand, so the two must be kept in sync.
INSTRUCT = (
    "Return JSON exactly like:\n"
    '{"breadth": "<locality|regional|national|multinational|global|unknown>", '
    '"places": [{"country": "<name or null>", "state_province": "<name or null>", '
    '"locality": "<city/town or null>"}], "confidence": "<high|medium|low>", '
    '"rationale": "<one short clause: where it happened and why>"}\n'
    "breadth guide: locality=a specific city/town/county; regional=a state/province/region; "
    "national=about a whole country; multinational=a few specific countries; "
    "global=worldwide or no specific country; unknown=incidental/unclear. "
    "places may list more than one when a story genuinely spans regions; use null for parts you can't support."
)


def fetch(conn, limit):
    """Load the most recently discovered accepted, non-duplicate articles.

    Each row carries the article's id, title, description and timestamps plus
    the summary columns, which are NULL when no summary row exists (LEFT JOIN).

    Args:
        conn: database connection exposing ``execute(sql, params)``.
        limit: maximum number of rows, bound to the query's ``LIMIT ?``.

    Returns:
        The list produced by ``fetchall()``, ordered by ``discovered_at``
        descending.
    """
    query = """
        SELECT a.id, a.title, a.description, a.published_at, a.discovered_at,
               sm.summary, sm.what_happened, sm.why_matters
        FROM articles a
        JOIN article_scores s ON s.article_id = a.id
        LEFT JOIN article_summaries sm ON sm.article_id = a.id
        WHERE s.accepted = 1 AND a.duplicate_of IS NULL
        ORDER BY a.discovered_at DESC
        LIMIT ?
        """
    params = (limit,)
    cursor = conn.execute(query, params)
    return cursor.fetchall()


def article_text(r):
    """Render an article row as the labelled plain-text block sent to the model.

    The title line is always present; each remaining field contributes a
    ``LABEL: value`` line only when its value is truthy.

    Args:
        r: mapping-like row indexed by column name.

    Returns:
        The lines joined with newlines.
    """
    optional_fields = {
        "SUMMARY": "summary",
        "WHAT HAPPENED": "what_happened",
        "WHY IT MATTERS": "why_matters",
        "PUBLISHER BLURB": "description",
    }
    lines = [f"TITLE: {r['title']}"]
    lines.extend(
        f"{label}: {r[column]}"
        for label, column in optional_fields.items()
        if r[column]
    )
    return "\n".join(lines)


def extract(client, r):
    """Ask the model for one article's subject geography and normalize the reply.

    The model's JSON is untrusted, so every field is validated by type before
    use; a malformed field degrades to its fallback instead of raising.

    Args:
        client: object exposing ``chat_text(messages)`` that returns the raw
            reply text.
        r: article row, rendered into the prompt by ``article_text``.

    Returns:
        dict with keys:
            breadth: a member of ``BREADTHS`` ("unknown" if missing/invalid).
            places: list containing only the dict entries the model returned.
            confidence: "high", "medium" or "low" ("low" if missing/invalid).
            rationale: string of at most 300 characters.

    Raises:
        ValueError: if the parsed reply is not a JSON object.
        Exception: whatever ``parse_classifier_json`` raises on an
            unparseable reply, or ``client.chat_text`` raises.
    """
    messages = [
        {"role": "system", "content": SYSTEM},
        {"role": "user", "content": article_text(r) + "\n\n" + INSTRUCT},
    ]
    raw = client.chat_text(messages)
    data = parse_classifier_json(raw)  # raises on unparseable
    if not isinstance(data, dict):
        raise ValueError(f"expected a JSON object, got {type(data).__name__}")

    def label(key):
        # Only strings can be valid labels. Checking the type first avoids a
        # TypeError from testing an unhashable value (list/dict) against a set,
        # and tolerates harmless case/whitespace drift such as "National ".
        value = data.get(key)
        return value.strip().lower() if isinstance(value, str) else None

    breadth = label("breadth")
    if breadth not in BREADTHS:
        breadth = "unknown"

    conf = label("confidence")
    if conf not in {"high", "medium", "low"}:
        conf = "low"

    places = data.get("places")
    places = [p for p in places if isinstance(p, dict)] if isinstance(places, list) else []

    # A truthy non-string rationale (number, list, dict) cannot be sliced
    # reliably; stringify it so the 300-character cap always applies.
    rationale = data.get("rationale") or ""
    if not isinstance(rationale, str):
        rationale = str(rationale)

    return {
        "breadth": breadth,
        "places": places,
        "confidence": conf,
        "rationale": rationale[:300],
    }


# Spellings treated as the United States when counting US-state coverage.
# Candidates are compared after casefolding and dropping periods, so "U.S."
# and "u.s.a." match as well.
_US_NAMES = frozenset({"united states", "united states of america", "usa", "us"})


def _place_part(place, key):
    """Return ``place[key]`` as a stripped string, or "" when missing/falsy."""
    value = place.get(key)
    return str(value).strip() if value else ""


def report(rows, results):
    """Print the coverage report for the audited articles.

    Sections: breadth distribution, unknown/no-place rate, confidence counts,
    top 12 countries and states/provinces, the share of articles with a US
    state attached, and the discovered-date range of the audited rows.

    Args:
        rows: article rows indexable by "id" and "discovered_at"; used only
            for the freshness line.
        results: mapping of article id (as stored in the out file) to a result
            dict with "breadth", "confidence" and "places" (list of dicts).

    Returns:
        None. Only the header is printed when ``results`` is empty.
    """
    # Key by str(id), the same way main() keys `results`, so the freshness
    # lookup needs no int() conversion and tolerates non-numeric ids.
    by_id = {str(r["id"]): r for r in rows}
    n = len(results)
    print(f"\n===== GEO AUDIT REPORT (n={n}) =====")
    if not n:
        return
    breadth = Counter(v["breadth"] for v in results.values())
    conf = Counter(v["confidence"] for v in results.values())
    countries = Counter()
    states = Counter()
    unknown = 0
    us_local = 0
    for v in results.values():
        if v["breadth"] == "unknown" or not v["places"]:
            unknown += 1
        has_us_state = False
        for p in v["places"]:
            # Model output may hold non-string values; normalize once and use
            # the same cleaned value for both the tallies and the US check.
            country = _place_part(p, "country")
            state = _place_part(p, "state_province")
            if country:
                countries[country] += 1
            if state:
                states[state] += 1
            if state and country.casefold().replace(".", "") in _US_NAMES:
                has_us_state = True
        # Count each article at most once, however many US places it lists.
        us_local += has_us_state

    def pct(x):
        return f"{100*x/n:.0f}%"

    print("\nBreadth:")
    for k in ("locality", "regional", "national", "multinational", "global", "unknown"):
        print(f"  {k:<13} {breadth.get(k,0):>4}  {pct(breadth.get(k,0))}")
    print(f"\nUnknown/no-place rate: {unknown}/{n}  {pct(unknown)}")
    print("Confidence:", dict(conf))
    print("\nTop countries:")
    for name, c in countries.most_common(12):
        print(f"  {name:<22} {c}")
    print("\nTop states/provinces:")
    for name, c in states.most_common(12):
        print(f"  {name:<22} {c}")
    # US-local fuel check: how many map to a US state (the "Near you" payload for Americans)
    print(f"\nArticles with a US state attached (US 'Near you' fuel): {us_local}  {pct(us_local)}")
    # freshness: only results whose article is among the fetched rows contribute
    days = []
    for rid in results:
        row = by_id.get(str(rid))
        if row is not None and row["discovered_at"]:
            days.append(str(row["discovered_at"])[:10])
    if days:
        print(f"\nFreshness: {min(days)} .. {max(days)} ({len(set(days))} distinct days)")


def main():
    """CLI entry point: audit recent articles, checkpoint results, print a report.

    Steps:
        1. Fetch up to ``--limit`` accepted, non-duplicate articles.
        2. Load any earlier results from ``--out`` and skip ids that already
           have a successful result. Entries recorded with ``"error": True``
           are retried, so a run made while the model was unreachable does
           not permanently mark those articles as unknown.
        3. Call ``extract`` per remaining article, recording failures as
           error entries, and checkpoint to ``--out`` every 25 new results.
        4. Print up to ``--inspect`` samples for manual review, then the
           coverage report.

    Results are saved even when the loop is interrupted (e.g. Ctrl-C).
    """
    ap = argparse.ArgumentParser()
    ap.add_argument("--db", default=None)
    ap.add_argument("--limit", type=int, default=400)
    ap.add_argument("--inspect", type=int, default=8, help="print N samples for manual eyeballing")
    ap.add_argument("--out", default="data/geo_audit.json")
    ap.add_argument("--base-url", default=None)
    ap.add_argument("--model", default=None)
    args = ap.parse_args()

    # The connection is only needed for the fetch; fetchall() has already
    # materialized the rows, so release it before the long model loop.
    conn = connect(args.db or str(_default_db()))
    try:
        rows = fetch(conn, args.limit)
    finally:
        conn.close()

    client = LocalModelClient.from_env()
    if args.base_url:
        client.base_url = args.base_url.rstrip("/")
    if args.model:
        client.model = args.model

    out = Path(args.out)
    # Create the output directory up front so a missing directory fails now,
    # not at the first checkpoint after 25 model calls.
    out.parent.mkdir(parents=True, exist_ok=True)
    results = json.loads(out.read_text()) if out.exists() else {}
    print(f"Fetched {len(rows)} accepted non-dup articles; {len(results)} already done.")

    def save():
        # Write to a sibling temp file and rename it over the target so an
        # interrupted write never leaves a truncated, unloadable resume file.
        tmp = out.with_name(out.name + ".tmp")
        tmp.write_text(json.dumps(results, indent=1))
        tmp.replace(out)

    done = 0
    try:
        for r in rows:
            rid = str(r["id"])
            previous = results.get(rid)
            if previous is not None and not (isinstance(previous, dict) and previous.get("error")):
                continue
            try:
                results[rid] = extract(client, r)
            except Exception as exc:  # noqa: BLE001 — prototype; record + continue
                results[rid] = {"breadth": "unknown", "places": [], "confidence": "low",
                                "rationale": f"ERROR: {type(exc).__name__}: {exc}"[:300], "error": True}
            done += 1
            if done % 25 == 0:
                save()
                print(f"  ...{done} new, {len(results)} total")
    finally:
        # Runs on normal completion and on interruption alike, so results
        # gathered since the last checkpoint are not lost.
        save()

    # Manual-inspection sample: the step Codex flagged as essential — eyeball whether
    # the model captured WHERE IT HAPPENED, not just place-name recognition.
    print(f"\n----- SAMPLE FOR MANUAL INSPECTION (first {args.inspect}) -----")
    shown = 0
    for r in rows:
        # Check the quota before printing so --inspect 0 shows no samples.
        if shown >= args.inspect:
            break
        rid = str(r["id"])
        if rid not in results:
            continue
        v = results[rid]
        print(f"\n[{rid}] {r['title']}")
        print(f"   breadth={v['breadth']} conf={v['confidence']} places={v['places']}")
        print(f"   why: {v['rationale']}")
        shown += 1

    report(rows, results)


# Run the audit only when executed as a script, not when imported.
if __name__ == "__main__":
    main()