Files
thejayman77 ad4e88c8f2 Geo Stage 4 (data layer): geo on feed responses + home-scope query filters
Foundation for "Closer to Home" (server-side, Codex-approved). No behavior change
yet — geo_scope defaults None, so the default/edge-cached feed is identical.

- queries.feed now returns each article's geo (breadth, confidence, and ISO-coded
  places) via a LEFT JOIN + places subquery. Article.from_row parses geo_places
  into [{country, state}]. Brief query doesn't select geo, so the Brief stays bare.
- queries.feed gains home-scope filters (home_country/home_state/geo_scope =
  near|country|world): STATE match only counts on high/medium geo confidence;
  untagged articles fall to 'world' so nothing is lost during backfill.

Next: API composition (home param + near/country/world sectioning with soft/blended
headers + a next_offset pagination model) and the Home picker UI. 360 tests green.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
2026-06-19 19:30:43 -04:00

104 lines
4.9 KiB
Python

#!/usr/bin/env python3
"""Geo coverage report over tagged articles (article_geo / article_places).
Codex's post-backfill checklist: breadth counts, top countries/states, unknown &
global rates, low-confidence + multi-place examples, normalization gaps, and a
random spot-check. Read-only; no LLM. Also the seed for the Stage-6 admin report.
.venv/bin/python scripts/geo_report.py [--spot 20]
"""
from __future__ import annotations
import argparse
import sqlite3
from collections import Counter
from goodnews.cli import _default_db
def main():
ap = argparse.ArgumentParser()
ap.add_argument("--db", default=None)
ap.add_argument("--spot", type=int, default=20, help="random tagged articles to spot-check")
args = ap.parse_args()
c = sqlite3.connect(args.db or str(_default_db()))
c.row_factory = sqlite3.Row
total_accepted = c.execute(
"SELECT COUNT(*) FROM articles a JOIN article_scores s ON s.article_id=a.id "
"WHERE s.accepted=1 AND a.duplicate_of IS NULL"
).fetchone()[0]
tagged = c.execute("SELECT COUNT(*) FROM article_geo").fetchone()[0]
n = tagged or 1
def pct(x):
return f"{100*x/n:.0f}%"
print(f"\n===== GEO COVERAGE ({tagged} tagged / {total_accepted} accepted non-dup = "
f"{100*tagged/(total_accepted or 1):.0f}% covered) =====")
print("\nBreadth:")
br = Counter(r["breadth"] for r in c.execute("SELECT breadth FROM article_geo"))
for k in ("locality", "regional", "national", "multinational", "global", "unknown"):
print(f" {k:<13} {br.get(k,0):>4} {pct(br.get(k,0))}")
unknown = br.get("unknown", 0)
glob = br.get("global", 0)
print(f"\nUnknown rate: {unknown} ({pct(unknown)}) Global/placeless: {glob} ({pct(glob)})")
conf = Counter(r["confidence"] for r in c.execute("SELECT confidence FROM article_geo"))
print("Confidence:", {k: conf.get(k, 0) for k in ("high", "medium", "low")})
print("\nTop countries:")
for r in c.execute("SELECT country_code, COUNT(*) n FROM article_places "
"WHERE country_code IS NOT NULL GROUP BY country_code ORDER BY n DESC LIMIT 15"):
print(f" {r['country_code']} {r['n']}")
print("\nTop US states:")
for r in c.execute("SELECT state_code, COUNT(*) n FROM article_places "
"WHERE state_code IS NOT NULL GROUP BY state_code ORDER BY n DESC LIMIT 15"):
print(f" {r['state_code']} {r['n']}")
# US "Near you" fuel: tagged articles with a US state attached
us_state = c.execute("SELECT COUNT(DISTINCT article_id) FROM article_places WHERE state_code IS NOT NULL").fetchone()[0]
print(f"\nArticles with a US state (US 'Near you' fuel): {us_state} ({pct(us_state)})")
# Normalization gap: breadth implies a place but none stored (name didn't map, or
# the model named no place). A proxy for where normalization/extraction is weak.
gap = c.execute(
"SELECT COUNT(*) FROM article_geo g WHERE g.breadth IN ('locality','regional','national','multinational') "
"AND NOT EXISTS (SELECT 1 FROM article_places p WHERE p.article_id=g.article_id)"
).fetchone()[0]
print(f"Normalization/extraction gaps (place-bearing breadth, 0 places stored): {gap} ({pct(gap)})")
print("\n--- low-confidence examples (up to 8) ---")
for r in c.execute(
"SELECT g.article_id, a.title, g.breadth, g.rationale FROM article_geo g "
"JOIN articles a ON a.id=g.article_id WHERE g.confidence='low' LIMIT 8"):
print(f" [{r['article_id']}] {r['breadth']} | {r['title'][:55]} | {r['rationale'] or ''}")
print("\n--- multi-place examples (up to 8) ---")
for r in c.execute(
"SELECT article_id, COUNT(*) n FROM article_places GROUP BY article_id HAVING n>1 ORDER BY n DESC LIMIT 8"):
a = c.execute("SELECT title FROM articles WHERE id=?", (r["article_id"],)).fetchone()
places = c.execute("SELECT country_code, state_code, locality FROM article_places WHERE article_id=?",
(r["article_id"],)).fetchall()
print(f" [{r['article_id']}] ({r['n']}) {a['title'][:45]} | {[dict(p) for p in places]}")
print(f"\n--- random spot-check ({args.spot}) ---")
for r in c.execute(
"SELECT g.article_id, a.title, g.breadth, g.confidence, g.rationale FROM article_geo g "
"JOIN articles a ON a.id=g.article_id ORDER BY RANDOM() LIMIT ?", (args.spot,)):
pl = c.execute("SELECT country_code, state_code, locality FROM article_places WHERE article_id=?",
(r["article_id"],)).fetchall()
tag = ", ".join(
f"{p['country_code'] or '?'}{('/'+p['state_code']) if p['state_code'] else ''}"
f"{(':'+p['locality']) if p['locality'] else ''}" for p in pl) or "(no place)"
print(f" [{r['article_id']}] {r['breadth']}/{r['confidence']} [{tag}] {r['title'][:50]}")
if r["rationale"]:
print(f" why: {r['rationale']}")
c.close()
if __name__ == "__main__":
main()