upbeatBytes/scripts/geo_report.py

#!/usr/bin/env python3
"""Geo coverage report over tagged articles (article_geo / article_places).

Codex's post-backfill checklist: breadth counts, top countries/states, unknown &
global rates, low-confidence + multi-place examples, normalization gaps, and a
random spot-check. Read-only; no LLM. Also the seed for the Stage-6 admin report.

  .venv/bin/python scripts/geo_report.py [--spot 20]
"""
from __future__ import annotations

import argparse
import sqlite3
from collections import Counter

from goodnews.cli import _default_db


def main():
    ap = argparse.ArgumentParser()
    ap.add_argument("--db", default=None)
    ap.add_argument("--spot", type=int, default=20, help="random tagged articles to spot-check")
    args = ap.parse_args()

    c = sqlite3.connect(args.db or str(_default_db()))
    c.row_factory = sqlite3.Row

    total_accepted = c.execute(
        "SELECT COUNT(*) FROM articles a JOIN article_scores s ON s.article_id=a.id "
        "WHERE s.accepted=1 AND a.duplicate_of IS NULL"
    ).fetchone()[0]
    tagged = c.execute("SELECT COUNT(*) FROM article_geo").fetchone()[0]
    n = tagged or 1

    def pct(x):
        return f"{100*x/n:.0f}%"

    print(f"\n===== GEO COVERAGE ({tagged} tagged / {total_accepted} accepted non-dup = "
          f"{100*tagged/(total_accepted or 1):.0f}% covered) =====")

    print("\nBreadth:")
    br = Counter(r["breadth"] for r in c.execute("SELECT breadth FROM article_geo"))
    for k in ("locality", "regional", "national", "multinational", "global", "unknown"):
        print(f"  {k:<13} {br.get(k,0):>4}  {pct(br.get(k,0))}")
    unknown = br.get("unknown", 0)
    glob = br.get("global", 0)
    print(f"\nUnknown rate: {unknown} ({pct(unknown)})   Global/placeless: {glob} ({pct(glob)})")

    conf = Counter(r["confidence"] for r in c.execute("SELECT confidence FROM article_geo"))
    print("Confidence:", {k: conf.get(k, 0) for k in ("high", "medium", "low")})

    print("\nTop countries:")
    for r in c.execute("SELECT country_code, COUNT(*) n FROM article_places "
                       "WHERE country_code IS NOT NULL GROUP BY country_code ORDER BY n DESC LIMIT 15"):
        print(f"  {r['country_code']}  {r['n']}")
    print("\nTop US states:")
    for r in c.execute("SELECT state_code, COUNT(*) n FROM article_places "
                       "WHERE state_code IS NOT NULL GROUP BY state_code ORDER BY n DESC LIMIT 15"):
        print(f"  {r['state_code']}  {r['n']}")

    # US "Near you" fuel: tagged articles with a US state attached
    us_state = c.execute("SELECT COUNT(DISTINCT article_id) FROM article_places WHERE state_code IS NOT NULL").fetchone()[0]
    print(f"\nArticles with a US state (US 'Near you' fuel): {us_state} ({pct(us_state)})")

    # Normalization gap: breadth implies a place but none stored (name didn't map, or
    # the model named no place). A proxy for where normalization/extraction is weak.
    gap = c.execute(
        "SELECT COUNT(*) FROM article_geo g WHERE g.breadth IN ('locality','regional','national','multinational') "
        "AND NOT EXISTS (SELECT 1 FROM article_places p WHERE p.article_id=g.article_id)"
    ).fetchone()[0]
    print(f"Normalization/extraction gaps (place-bearing breadth, 0 places stored): {gap} ({pct(gap)})")

    print("\n--- low-confidence examples (up to 8) ---")
    for r in c.execute(
        "SELECT g.article_id, a.title, g.breadth, g.rationale FROM article_geo g "
        "JOIN articles a ON a.id=g.article_id WHERE g.confidence='low' LIMIT 8"):
        print(f"  [{r['article_id']}] {r['breadth']} | {r['title'][:55]} | {r['rationale'] or ''}")

    print("\n--- multi-place examples (up to 8) ---")
    for r in c.execute(
        "SELECT article_id, COUNT(*) n FROM article_places GROUP BY article_id HAVING n>1 ORDER BY n DESC LIMIT 8"):
        a = c.execute("SELECT title FROM articles WHERE id=?", (r["article_id"],)).fetchone()
        places = c.execute("SELECT country_code, state_code, locality FROM article_places WHERE article_id=?",
                           (r["article_id"],)).fetchall()
        print(f"  [{r['article_id']}] ({r['n']}) {a['title'][:45]} | {[dict(p) for p in places]}")

    print(f"\n--- random spot-check ({args.spot}) ---")
    for r in c.execute(
        "SELECT g.article_id, a.title, g.breadth, g.confidence, g.rationale FROM article_geo g "
        "JOIN articles a ON a.id=g.article_id ORDER BY RANDOM() LIMIT ?", (args.spot,)):
        pl = c.execute("SELECT country_code, state_code, locality FROM article_places WHERE article_id=?",
                       (r["article_id"],)).fetchall()
        tag = ", ".join(
            f"{p['country_code'] or '?'}{('/'+p['state_code']) if p['state_code'] else ''}"
            f"{(':'+p['locality']) if p['locality'] else ''}" for p in pl) or "(no place)"
        print(f"  [{r['article_id']}] {r['breadth']}/{r['confidence']} [{tag}] {r['title'][:50]}")
        if r["rationale"]:
            print(f"        why: {r['rationale']}")
    c.close()


if __name__ == "__main__":
    main()