#!/usr/bin/env python3 """Geo coverage report over tagged articles (article_geo / article_places). Codex's post-backfill checklist: breadth counts, top countries/states, unknown & global rates, low-confidence + multi-place examples, normalization gaps, and a random spot-check. Read-only; no LLM. Also the seed for the Stage-6 admin report. .venv/bin/python scripts/geo_report.py [--spot 20] """ from __future__ import annotations import argparse import sqlite3 from collections import Counter from goodnews.cli import _default_db def main(): ap = argparse.ArgumentParser() ap.add_argument("--db", default=None) ap.add_argument("--spot", type=int, default=20, help="random tagged articles to spot-check") args = ap.parse_args() c = sqlite3.connect(args.db or str(_default_db())) c.row_factory = sqlite3.Row total_accepted = c.execute( "SELECT COUNT(*) FROM articles a JOIN article_scores s ON s.article_id=a.id " "WHERE s.accepted=1 AND a.duplicate_of IS NULL" ).fetchone()[0] tagged = c.execute("SELECT COUNT(*) FROM article_geo").fetchone()[0] n = tagged or 1 def pct(x): return f"{100*x/n:.0f}%" print(f"\n===== GEO COVERAGE ({tagged} tagged / {total_accepted} accepted non-dup = " f"{100*tagged/(total_accepted or 1):.0f}% covered) =====") print("\nBreadth:") br = Counter(r["breadth"] for r in c.execute("SELECT breadth FROM article_geo")) for k in ("locality", "regional", "national", "multinational", "global", "unknown"): print(f" {k:<13} {br.get(k,0):>4} {pct(br.get(k,0))}") unknown = br.get("unknown", 0) glob = br.get("global", 0) print(f"\nUnknown rate: {unknown} ({pct(unknown)}) Global/placeless: {glob} ({pct(glob)})") conf = Counter(r["confidence"] for r in c.execute("SELECT confidence FROM article_geo")) print("Confidence:", {k: conf.get(k, 0) for k in ("high", "medium", "low")}) print("\nTop countries:") for r in c.execute("SELECT country_code, COUNT(*) n FROM article_places " "WHERE country_code IS NOT NULL GROUP BY country_code ORDER BY n DESC LIMIT 15"): print(f" {r['country_code']} {r['n']}") print("\nTop US states:") for r in c.execute("SELECT state_code, COUNT(*) n FROM article_places " "WHERE state_code IS NOT NULL GROUP BY state_code ORDER BY n DESC LIMIT 15"): print(f" {r['state_code']} {r['n']}") # US "Near you" fuel: tagged articles with a US state attached us_state = c.execute("SELECT COUNT(DISTINCT article_id) FROM article_places WHERE state_code IS NOT NULL").fetchone()[0] print(f"\nArticles with a US state (US 'Near you' fuel): {us_state} ({pct(us_state)})") # Normalization gap: breadth implies a place but none stored (name didn't map, or # the model named no place). A proxy for where normalization/extraction is weak. gap = c.execute( "SELECT COUNT(*) FROM article_geo g WHERE g.breadth IN ('locality','regional','national','multinational') " "AND NOT EXISTS (SELECT 1 FROM article_places p WHERE p.article_id=g.article_id)" ).fetchone()[0] print(f"Normalization/extraction gaps (place-bearing breadth, 0 places stored): {gap} ({pct(gap)})") print("\n--- low-confidence examples (up to 8) ---") for r in c.execute( "SELECT g.article_id, a.title, g.breadth, g.rationale FROM article_geo g " "JOIN articles a ON a.id=g.article_id WHERE g.confidence='low' LIMIT 8"): print(f" [{r['article_id']}] {r['breadth']} | {r['title'][:55]} | {r['rationale'] or ''}") print("\n--- multi-place examples (up to 8) ---") for r in c.execute( "SELECT article_id, COUNT(*) n FROM article_places GROUP BY article_id HAVING n>1 ORDER BY n DESC LIMIT 8"): a = c.execute("SELECT title FROM articles WHERE id=?", (r["article_id"],)).fetchone() places = c.execute("SELECT country_code, state_code, locality FROM article_places WHERE article_id=?", (r["article_id"],)).fetchall() print(f" [{r['article_id']}] ({r['n']}) {a['title'][:45]} | {[dict(p) for p in places]}") print(f"\n--- random spot-check ({args.spot}) ---") for r in c.execute( "SELECT g.article_id, a.title, g.breadth, g.confidence, g.rationale FROM article_geo g " "JOIN articles a ON a.id=g.article_id ORDER BY RANDOM() LIMIT ?", (args.spot,)): pl = c.execute("SELECT country_code, state_code, locality FROM article_places WHERE article_id=?", (r["article_id"],)).fetchall() tag = ", ".join( f"{p['country_code'] or '?'}{('/'+p['state_code']) if p['state_code'] else ''}" f"{(':'+p['locality']) if p['locality'] else ''}" for p in pl) or "(no place)" print(f" [{r['article_id']}] {r['breadth']}/{r['confidence']} [{tag}] {r['title'][:50]}") if r["rationale"]: print(f" why: {r['rationale']}") c.close() if __name__ == "__main__": main()