Files
upbeatBytes/scripts/geo_audit.py
T
thejayman77 1c05554a28 Geo Stage 1-2: subject-geography model + classifier + pipeline wiring
"Closer to Home" foundation (audit greenlit by Codex). Durable geography, kept
decoupled from volatile scoring.

- Schema: article_geo (breadth/confidence/rationale/geo_version) + article_places
  (0..N ISO-coded places), separate from article_scores so re-runs/audits never
  disturb scoring or acceptance. "local" is never stored — it's relative to the
  reader; the UI computes "Near you" later.
- geo.py: LLM proposes place NAMES, code disposes to ISO codes (country alpha-2,
  US state 2-letter); region words like "Europe" can never become a country.
  'global'/placeless is first-class, not failure. Confidence calibrated so 'high'
  needs an explicit location. Geo is its OWN LLM pass, not merged into the scoring
  prompt (durable metadata, re-runnable, keeps the sensitive prompt untouched).
- store_geo replaces places (geo is re-derivable, unlike scores). tag_articles is
  idempotent by geo_version, only touches accepted non-duplicate articles.
- CLI `geo` command (cycle-locked, --limit/--reclassify) for backfill, plus a
  bounded geo step in the cycle (--geo-limit 60, --no-geo). scripts/geo_audit.py
  is the prototype audit tool.

360 tests green; live smoke tagged real articles correctly (Gaza->PS, London->GB,
placeless science->global). No UI / SEO pages yet — ranking/personalization only.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
2026-06-19 16:56:49 -04:00

209 lines
8.1 KiB
Python

#!/usr/bin/env python3
"""PROTOTYPE geo audit (not production).
Codex/Claude plan: before building any "Closer to Home" UI or touching the
production classify schema, measure what subject-geography the LLM can actually
extract from recent good-news articles, and whether it understands WHERE A STORY
HAPPENED vs. merely spotting place names.
Key taxonomy decision: "local" is relative to the VIEWER, so we do NOT store it.
We store the article's intrinsic geographic BREADTH (locality/regional/national/
multinational/global/unknown) plus the actual place(s). The UI later decides
"Near you" by comparing those places to the visitor's chosen home.
This writes results to a scratch JSON file and prints a coverage report. It does
not migrate the DB, change the classify pipeline, or backfill anything.
Run (host can reach the LAN model):
.venv/bin/python scripts/geo_audit.py --limit 400 --base-url http://127.0.0.1:8080/v1
Resumable: re-running skips article ids already in the out file.
"""
from __future__ import annotations
import argparse
import json
import statistics
from collections import Counter
from pathlib import Path
from goodnews.cli import _default_db
from goodnews.db import connect
from goodnews.llm import LocalModelClient, parse_classifier_json
# Closed set of breadth labels the audit accepts from the model; extract()
# falls back to "unknown" for anything outside it.
BREADTHS = {"locality", "regional", "national", "multinational", "global", "unknown"}
# System prompt: frames the task as tagging where a story is ABOUT / HAPPENED
# (not incidental mentions) and demands a JSON-only reply.
SYSTEM = (
    "You tag the real-world geography of a news story for a calm good-news site. "
    "Identify the place(s) the story is fundamentally ABOUT or where it HAPPENED — "
    "NOT places mentioned only in passing. Many good-news stories (general science, "
    "space, broad research) have no specific place; those are 'global'. If a location "
    "is only incidental or genuinely unclear, use 'unknown'. Do not guess. "
    "Reply with ONLY a JSON object, no prose."
)
# User-turn instructions appended after the article text in extract(): pins the
# expected JSON shape (breadth / places / confidence / rationale) and gives a
# one-line definition of each breadth label.
INSTRUCT = (
    "Return JSON exactly like:\n"
    '{"breadth": "<locality|regional|national|multinational|global|unknown>", '
    '"places": [{"country": "<name or null>", "state_province": "<name or null>", '
    '"locality": "<city/town or null>"}], "confidence": "<high|medium|low>", '
    '"rationale": "<one short clause: where it happened and why>"}\n'
    "breadth guide: locality=a specific city/town/county; regional=a state/province/region; "
    "national=about a whole country; multinational=a few specific countries; "
    "global=worldwide or no specific country; unknown=incidental/unclear. "
    "places may list more than one when a story genuinely spans regions; use null for parts you can't support."
)
def fetch(conn, limit):
    """Return up to *limit* accepted, non-duplicate articles, newest first.

    Rows are ordered by ``discovered_at`` descending. Each row carries the
    article's id, title, description and timestamps plus the three summary
    fields; the summary table is LEFT JOINed, so those fields are NULL for
    articles without a summary row.
    """
    query = """
        SELECT a.id, a.title, a.description, a.published_at, a.discovered_at,
               sm.summary, sm.what_happened, sm.why_matters
        FROM articles a
        JOIN article_scores s ON s.article_id = a.id
        LEFT JOIN article_summaries sm ON sm.article_id = a.id
        WHERE s.accepted = 1 AND a.duplicate_of IS NULL
        ORDER BY a.discovered_at DESC
        LIMIT ?
    """
    cursor = conn.execute(query, (limit,))
    return cursor.fetchall()
def article_text(r):
    """Render one article row as labelled lines for the user prompt.

    The title line is always emitted; each of the summary, what-happened,
    why-it-matters and publisher-blurb fields is added only when its value
    is truthy. Lines are joined with newlines.
    """
    optional_fields = (
        ("SUMMARY", "summary"),
        ("WHAT HAPPENED", "what_happened"),
        ("WHY IT MATTERS", "why_matters"),
        ("PUBLISHER BLURB", "description"),
    )
    lines = [f"TITLE: {r['title']}"]
    lines.extend(
        f"{heading}: {r[column]}"
        for heading, column in optional_fields
        if r[column]
    )
    return "\n".join(lines)
def extract(client, r):
    """Ask the model for one article's geography and normalise its reply.

    Sends the system prompt plus the rendered article text and instructions
    through ``client.chat_text``, parses the reply with
    ``parse_classifier_json`` and coerces it into a fixed shape.

    Returns a dict with:
      * ``breadth``    - one of BREADTHS; anything else becomes "unknown"
      * ``places``     - the dict entries of the reply's ``places`` list
                         (non-dict entries dropped; a non-list becomes [])
      * ``confidence`` - "high" / "medium" / "low"; anything else is "low"
      * ``rationale``  - text truncated to 300 characters

    Raises whatever ``parse_classifier_json`` raises on unparseable output,
    and ValueError when the parsed reply is not a JSON object.
    """
    messages = [
        {"role": "system", "content": SYSTEM},
        {"role": "user", "content": article_text(r) + "\n\n" + INSTRUCT},
    ]
    raw = client.chat_text(messages)
    data = parse_classifier_json(raw)  # raises on unparseable
    if not isinstance(data, dict):
        # Fail with a clear message instead of an AttributeError on .get().
        raise ValueError(f"expected a JSON object, got {type(data).__name__}")

    def label(key, allowed, fallback):
        # Models vary case/padding ("Global", "High "); accept those, and
        # never let a non-string (possibly unhashable) value reach the set
        # membership test, where it would raise TypeError.
        value = data.get(key)
        if not isinstance(value, str):
            return fallback
        value = value.strip().lower()
        return value if value in allowed else fallback

    places = data.get("places")
    if isinstance(places, list):
        places = [p for p in places if isinstance(p, dict)]
    else:
        places = []

    rationale = data.get("rationale")
    if not isinstance(rationale, str):
        # Slicing an int would raise and slicing a list would return a list;
        # stringify anything non-empty so the field is always text.
        rationale = str(rationale) if rationale else ""

    return {
        "breadth": label("breadth", BREADTHS, "unknown"),
        "places": places,
        "confidence": label("confidence", {"high", "medium", "low"}, "low"),
        "rationale": rationale[:300],
    }
def report(rows, results):
    """Print a coverage report for the audit results to stdout.

    Args:
        rows: fetched article rows; only ``id`` and ``discovered_at`` are
            read here (for the freshness line).
        results: mapping of article id (as a string) to the dict produced
            per article (``breadth``, ``places``, ``confidence``, ...).

    Prints the breadth distribution, the unknown/no-place rate, confidence
    counts, the most common countries and states/provinces, how many
    articles carry a US state, and the discovery-date span. Percentages are
    relative to ``len(results)``. Returns None; with no results only the
    header is printed.
    """
    n = len(results)
    print(f"\n===== GEO AUDIT REPORT (n={n}) =====")
    if not n:
        return

    def pct(x):
        return f"{100*x/n:.0f}%"

    def clean(value):
        # Place parts are model-supplied and may be None or a non-string;
        # stringify before stripping so a stray number can't crash the report.
        return str(value).strip() if value else ""

    # Spellings the model may use for the United States, compared casefolded.
    us_names = {"united states", "united states of america", "usa", "us",
                "u.s.", "u.s.a."}

    breadth = Counter(v["breadth"] for v in results.values())
    conf = Counter(v["confidence"] for v in results.values())
    countries = Counter()
    states = Counter()
    unknown = 0
    us_local = 0
    errors = 0
    for v in results.values():
        if v.get("error"):
            errors += 1
        places = v["places"]
        if v["breadth"] == "unknown" or not places:
            unknown += 1
        has_us_state = False
        for p in places:
            country = clean(p.get("country"))
            state = clean(p.get("state_province"))
            if country:
                countries[country] += 1
            if state:
                states[state] += 1
            if state and country.casefold() in us_names:
                has_us_state = True
        # Count each article once, however many US places it lists.
        us_local += has_us_state

    print("\nBreadth:")
    for k in ("locality", "regional", "national", "multinational", "global", "unknown"):
        print(f" {k:<13} {breadth.get(k,0):>4} {pct(breadth.get(k,0))}")
    print(f"\nUnknown/no-place rate: {unknown}/{n} {pct(unknown)}")
    print("Confidence:", dict(conf))
    if errors:
        # Entries flagged "error" are included in the figures above; surface
        # them so a model outage isn't mistaken for genuinely placeless news.
        print(f"Extraction errors (included in the counts above): {errors}/{n} {pct(errors)}")
    print("\nTop countries:")
    for name, c in countries.most_common(12):
        print(f" {name:<22} {c}")
    print("\nTop states/provinces:")
    for name, c in states.most_common(12):
        print(f" {name:<22} {c}")
    # US-local fuel check: how many map to a US state (the "Near you" payload for Americans)
    print(f"\nArticles with a US state attached (US 'Near you' fuel): {us_local} {pct(us_local)}")

    # freshness: span of discovery dates for results that match a fetched row
    by_id = {r["id"]: r for r in rows}
    days = []
    for key in results:
        row = by_id.get(int(key))
        if row and row["discovered_at"]:
            days.append(row["discovered_at"][:10])
    if days:
        print(f"\nFreshness: {min(days)} .. {max(days)} ({len(set(days))} distinct days)")
def _save_results(out, results):
    """Write *results* as JSON to *out*, creating its directory if needed."""
    out.parent.mkdir(parents=True, exist_ok=True)
    # Write a sibling temp file and rename it over the target so an
    # interrupted write cannot leave a truncated, unloadable resume file.
    tmp = out.with_name(out.name + ".tmp")
    tmp.write_text(json.dumps(results, indent=1))
    tmp.replace(out)


def main():
    """CLI entry point: tag recent articles, checkpoint to JSON, then report.

    Flags: --db, --limit (articles to fetch), --inspect (samples to print),
    --out (results file), --base-url / --model (override the client's
    settings). Articles already present in the out file are skipped, except
    entries flagged as errors, which are retried. Progress is saved every
    25 new results and once more at the end.
    """
    ap = argparse.ArgumentParser()
    ap.add_argument("--db", default=None)
    ap.add_argument("--limit", type=int, default=400)
    ap.add_argument("--inspect", type=int, default=8, help="print N samples for manual eyeballing")
    ap.add_argument("--out", default="data/geo_audit.json")
    ap.add_argument("--base-url", default=None)
    ap.add_argument("--model", default=None)
    args = ap.parse_args()

    conn = connect(args.db or str(_default_db()))
    try:
        client = LocalModelClient.from_env()
        if args.base_url:
            client.base_url = args.base_url.rstrip("/")
        if args.model:
            client.model = args.model

        out = Path(args.out)
        results = json.loads(out.read_text()) if out.exists() else {}
        rows = fetch(conn, args.limit)
        print(f"Fetched {len(rows)} accepted non-dup articles; {len(results)} already done.")

        done = 0
        for r in rows:
            rid = str(r["id"])
            previous = results.get(rid)
            # Skip finished work, but retry entries recorded as errors so a
            # transient model outage doesn't permanently mark them unknown.
            if previous is not None and not previous.get("error"):
                continue
            try:
                results[rid] = extract(client, r)
            except Exception as exc:  # noqa: BLE001 — prototype; record + continue
                results[rid] = {"breadth": "unknown", "places": [], "confidence": "low",
                                "rationale": f"ERROR: {type(exc).__name__}: {exc}"[:300],
                                "error": True}
            done += 1
            if done % 25 == 0:
                _save_results(out, results)
                print(f" ...{done} new, {len(results)} total")
        _save_results(out, results)
    finally:
        # Release the DB handle even when the run is interrupted or fails.
        conn.close()

    # Manual-inspection sample: the step Codex flagged as essential — eyeball whether
    # the model captured WHERE IT HAPPENED, not just place-name recognition.
    print(f"\n----- SAMPLE FOR MANUAL INSPECTION (first {args.inspect}) -----")
    shown = 0
    for r in rows:
        # Check the quota before printing so --inspect 0 shows nothing.
        if shown >= args.inspect:
            break
        rid = str(r["id"])
        if rid not in results:
            continue
        v = results[rid]
        print(f"\n[{rid}] {r['title']}")
        print(f" breadth={v['breadth']} conf={v['confidence']} places={v['places']}")
        print(f" why: {v['rationale']}")
        shown += 1
    report(rows, results)
# Run the audit only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()