1c05554a28
"Closer to Home" foundation (audit greenlit by Codex). Durable geography, kept decoupled from volatile scoring. - Schema: article_geo (breadth/confidence/rationale/geo_version) + article_places (0..N ISO-coded places), separate from article_scores so re-runs/audits never disturb scoring or acceptance. "local" is never stored — it's relative to the reader; the UI computes "Near you" later. - geo.py: LLM proposes place NAMES, code disposes to ISO codes (country alpha-2, US state 2-letter); region words like "Europe" can never become a country. 'global'/placeless is first-class, not failure. Confidence calibrated so 'high' needs an explicit location. Geo is its OWN LLM pass, not merged into the scoring prompt (durable metadata, re-runnable, keeps the sensitive prompt untouched). - store_geo replaces places (geo is re-derivable, unlike scores). tag_articles is idempotent by geo_version, only touches accepted non-duplicate articles. - CLI `geo` command (cycle-locked, --limit/--reclassify) for backfill, plus a bounded geo step in the cycle (--geo-limit 60, --no-geo). scripts/geo_audit.py is the prototype audit tool. 360 tests green; live smoke tagged real articles correctly (Gaza->PS, London->GB, placeless science->global). No UI / SEO pages yet — ranking/personalization only. Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
209 lines
8.1 KiB
Python
209 lines
8.1 KiB
Python
#!/usr/bin/env python3
|
|
"""PROTOTYPE geo audit (not production).
|
|
|
|
Codex/Claude plan: before building any "Closer to Home" UI or touching the
|
|
production classify schema, measure what subject-geography the LLM can actually
|
|
extract from recent good-news articles, and whether it understands WHERE A STORY
|
|
HAPPENED vs. merely spotting place names.
|
|
|
|
Key taxonomy decision: "local" is relative to the VIEWER, so we do NOT store it.
|
|
We store the article's intrinsic geographic BREADTH (locality/regional/national/
|
|
multinational/global/unknown) plus the actual place(s). The UI later decides
|
|
"Near you" by comparing those places to the visitor's chosen home.
|
|
|
|
This writes results to a scratch JSON file and prints a coverage report. It does
|
|
not migrate the DB, change the classify pipeline, or backfill anything.
|
|
|
|
Run (host can reach the LAN model):
|
|
.venv/bin/python scripts/geo_audit.py --limit 400 --base-url http://127.0.0.1:8080/v1
|
|
Resumable: re-running skips article ids already in the out file.
|
|
"""
|
|
from __future__ import annotations
|
|
|
|
import argparse
|
|
import json
|
|
import statistics
|
|
from collections import Counter
|
|
from pathlib import Path
|
|
|
|
from goodnews.cli import _default_db
|
|
from goodnews.db import connect
|
|
from goodnews.llm import LocalModelClient, parse_classifier_json
|
|
|
|
# Closed set of breadth labels accepted from the model; extract() maps anything
# outside this set to "unknown".
BREADTHS = {"locality", "regional", "national", "multinational", "global", "unknown"}

# System prompt: asks for the place(s) a story is about or happened in (not
# passing mentions), treats placeless stories as 'global', and demands a bare
# JSON object as the reply.
SYSTEM = (
    "You tag the real-world geography of a news story for a calm good-news site. "
    "Identify the place(s) the story is fundamentally ABOUT or where it HAPPENED — "
    "NOT places mentioned only in passing. Many good-news stories (general science, "
    "space, broad research) have no specific place; those are 'global'. If a location "
    "is only incidental or genuinely unclear, use 'unknown'. Do not guess. "
    "Reply with ONLY a JSON object, no prose."
)

# Per-article instruction appended after the article text in the user message.
# It spells out the JSON shape (breadth / places / confidence / rationale) and a
# short guide for choosing a breadth value; the keys named here are the ones
# extract() reads back out of the parsed reply.
INSTRUCT = (
    "Return JSON exactly like:\n"
    '{"breadth": "<locality|regional|national|multinational|global|unknown>", '
    '"places": [{"country": "<name or null>", "state_province": "<name or null>", '
    '"locality": "<city/town or null>"}], "confidence": "<high|medium|low>", '
    '"rationale": "<one short clause: where it happened and why>"}\n'
    "breadth guide: locality=a specific city/town/county; regional=a state/province/region; "
    "national=about a whole country; multinational=a few specific countries; "
    "global=worldwide or no specific country; unknown=incidental/unclear. "
    "places may list more than one when a story genuinely spans regions; use null for parts you can't support."
)
|
|
|
|
|
|
def fetch(conn, limit):
    """Return the newest accepted, non-duplicate articles with their summaries.

    Args:
        conn: Open database connection. Callers index the returned rows by
            column name (``row["title"]``), so the connection is expected to
            use a name-addressable row factory.
        limit: Maximum number of rows to return (bound as a SQL parameter).

    Returns:
        A list of rows with ``id``, ``title``, ``description``,
        ``published_at``, ``discovered_at`` and the summary columns
        ``summary``, ``what_happened``, ``why_matters``. The summary columns
        are NULL when an article has no summary row (LEFT JOIN).
    """
    # The secondary sort on a.id makes the result deterministic when several
    # articles share a discovered_at value. Without it, rows tied at the LIMIT
    # boundary could differ between runs, so a resumed audit might see a
    # different article set than the run it is resuming.
    return conn.execute(
        """
        SELECT a.id, a.title, a.description, a.published_at, a.discovered_at,
               sm.summary, sm.what_happened, sm.why_matters
        FROM articles a
        JOIN article_scores s ON s.article_id = a.id
        LEFT JOIN article_summaries sm ON sm.article_id = a.id
        WHERE s.accepted = 1 AND a.duplicate_of IS NULL
        ORDER BY a.discovered_at DESC, a.id DESC
        LIMIT ?
        """,
        (limit,),
    ).fetchall()
|
|
|
|
|
|
def article_text(r):
    """Render one article row as labelled plain text for the model prompt.

    The title line is always emitted. Each optional field (summary, what
    happened, why it matters, publisher blurb) gets its own ``LABEL: value``
    line, in that order, and is skipped when its value is empty or None.

    Args:
        r: Mapping-like row indexable by column name.

    Returns:
        The labelled lines joined with newlines.
    """
    optional_fields = (
        ("SUMMARY", "summary"),
        ("WHAT HAPPENED", "what_happened"),
        ("WHY IT MATTERS", "why_matters"),
        ("PUBLISHER BLURB", "description"),
    )
    lines = [f"TITLE: {r['title']}"]
    for label, column in optional_fields:
        value = r[column]
        if not value:
            continue
        lines.append(f"{label}: {value}")
    return "\n".join(lines)
|
|
|
|
|
|
def extract(client, r):
    """Ask the model for one article's geography and normalise its reply.

    Sends the system prompt plus the article text and JSON instructions via
    ``client.chat_text``, parses the reply with ``parse_classifier_json``, and
    coerces each field to a safe value so that one malformed field does not
    discard the rest of the answer.

    Args:
        client: Model client exposing ``chat_text(messages)``.
        r: Article row accepted by ``article_text``.

    Returns:
        A dict with:
          * ``breadth``: a member of ``BREADTHS`` ("unknown" if missing/invalid)
          * ``places``: list of the dict entries from the reply's ``places``
          * ``confidence``: "high", "medium" or "low" ("low" if missing/invalid)
          * ``rationale``: text truncated to 300 characters

    Raises:
        Whatever ``client.chat_text`` or ``parse_classifier_json`` raise; the
        latter raises on an unparseable reply.
    """
    messages = [
        {"role": "system", "content": SYSTEM},
        {"role": "user", "content": article_text(r) + "\n\n" + INSTRUCT},
    ]
    raw = client.chat_text(messages)
    data = parse_classifier_json(raw)  # raises on unparseable

    # Check the type before the set-membership test: an unhashable value
    # (e.g. a list or dict from the model) would make `in` raise TypeError
    # instead of falling back to the default.
    breadth = data.get("breadth")
    if not isinstance(breadth, str) or breadth not in BREADTHS:
        breadth = "unknown"

    places = data.get("places")
    places = [p for p in places if isinstance(p, dict)] if isinstance(places, list) else []

    conf = data.get("confidence")
    if not isinstance(conf, str) or conf not in {"high", "medium", "low"}:
        conf = "low"

    # A non-string rationale (number, list, ...) is stringified so it can be
    # truncated and printed; empty/None becomes "".
    rationale = data.get("rationale")
    if not isinstance(rationale, str):
        rationale = str(rationale) if rationale else ""

    return {
        "breadth": breadth,
        "places": places,
        "confidence": conf,
        "rationale": rationale[:300],
    }
|
|
|
|
|
|
def report(rows, results):
    """Print a coverage report for the audit results to stdout.

    Sections, in order: breadth distribution, unknown/no-place rate,
    confidence counts, top 12 countries, top 12 states/provinces, number of
    articles with a US state attached, and the discovered-date range of the
    audited articles that are present in ``rows``.

    Args:
        rows: Article rows (indexable by ``"id"`` and ``"discovered_at"``);
            only used for the freshness line.
        results: Mapping of article id -> result dict with ``breadth``,
            ``places`` (list of dicts) and ``confidence``. All percentages
            are relative to ``len(results)``.

    Returns:
        None. Prints only the header when ``results`` is empty.
    """
    # Key by str(id): result keys are strings (they round-trip through JSON),
    # so this avoids an int() conversion that would raise on non-numeric ids.
    by_id = {str(r["id"]): r for r in rows}
    n = len(results)
    print(f"\n===== GEO AUDIT REPORT (n={n}) =====")
    if not n:
        return

    # Common spellings of the United States, compared case-insensitively.
    us_names = {"united states", "united states of america", "usa", "us", "u.s.", "u.s.a."}

    def pct(x):
        return f"{100*x/n:.0f}%"

    def is_us_state_place(p):
        # str() mirrors the coercion used for the country counter below, so a
        # non-string country from the model cannot crash the report.
        country = str(p.get("country") or "").strip().casefold()
        return country in us_names and bool(p.get("state_province"))

    breadth = Counter(v["breadth"] for v in results.values())
    conf = Counter(v["confidence"] for v in results.values())
    countries = Counter()
    states = Counter()
    unknown = 0
    for v in results.values():
        if v["breadth"] == "unknown" or not v["places"]:
            unknown += 1
        for p in v["places"]:
            if p.get("country"):
                countries[str(p["country"]).strip()] += 1
            if p.get("state_province"):
                states[str(p["state_province"]).strip()] += 1

    print("\nBreadth:")
    for k in ("locality", "regional", "national", "multinational", "global", "unknown"):
        count = breadth.get(k, 0)
        print(f" {k:<13} {count:>4} {pct(count)}")
    print(f"\nUnknown/no-place rate: {unknown}/{n} {pct(unknown)}")
    print("Confidence:", dict(conf))
    print("\nTop countries:")
    for name, c in countries.most_common(12):
        print(f" {name:<22} {c}")
    print("\nTop states/provinces:")
    for name, c in states.most_common(12):
        print(f" {name:<22} {c}")

    # US-local fuel check: how many map to a US state (the "Near you" payload for Americans)
    us_local = sum(
        1 for v in results.values() if any(is_us_state_place(p) for p in v["places"])
    )
    print(f"\nArticles with a US state attached (US 'Near you' fuel): {us_local} {pct(us_local)}")

    # freshness: date part (first 10 chars) of discovered_at for audited
    # articles that are present in rows and have a timestamp
    days = []
    for i in results:
        row = by_id.get(str(i))
        if row is not None and row["discovered_at"]:
            days.append(row["discovered_at"][:10])
    if days:
        print(f"\nFreshness: {min(days)} .. {max(days)} ({len(set(days))} distinct days)")
|
|
|
|
|
|
def main():
    """Run the prototype geo audit from the command line.

    Steps: parse arguments, open the DB, build the model client (optionally
    overriding its base URL and model), load any previous results from the
    ``--out`` file, tag every fetched article not already present there,
    checkpoint the results file every 25 new articles, then print a manual
    inspection sample followed by the coverage report.

    Options:
        --db        database path (defaults to ``_default_db()``)
        --limit     number of newest articles to fetch (default 400)
        --inspect   number of samples to print for eyeballing (default 8;
                    0 prints none)
        --out       scratch JSON results file (default data/geo_audit.json)
        --base-url  override the model client's base URL
        --model     override the model client's model name
    """
    ap = argparse.ArgumentParser()
    ap.add_argument("--db", default=None)
    ap.add_argument("--limit", type=int, default=400)
    ap.add_argument("--inspect", type=int, default=8, help="print N samples for manual eyeballing")
    ap.add_argument("--out", default="data/geo_audit.json")
    ap.add_argument("--base-url", default=None)
    ap.add_argument("--model", default=None)
    args = ap.parse_args()

    conn = connect(args.db or str(_default_db()))
    try:
        client = LocalModelClient.from_env()
        if args.base_url:
            client.base_url = args.base_url.rstrip("/")
        if args.model:
            client.model = args.model

        out = Path(args.out)
        results = json.loads(out.read_text()) if out.exists() else {}
        # Create the output directory up front; otherwise the first
        # checkpoint fails only after 25 model calls have been spent.
        out.parent.mkdir(parents=True, exist_ok=True)
        rows = fetch(conn, args.limit)
        print(f"Fetched {len(rows)} accepted non-dup articles; {len(results)} already done.")

        done = 0
        try:
            for r in rows:
                rid = str(r["id"])
                if rid in results:
                    continue
                try:
                    results[rid] = extract(client, r)
                except Exception as exc:  # noqa: BLE001 — prototype; record + continue
                    results[rid] = {"breadth": "unknown", "places": [], "confidence": "low",
                                    "rationale": f"ERROR: {type(exc).__name__}: {exc}"[:300], "error": True}
                done += 1
                if done % 25 == 0:
                    out.write_text(json.dumps(results, indent=1))
                    print(f" ...{done} new, {len(results)} total")
        finally:
            # Always persist what was gathered, so an interrupt (Ctrl-C) does
            # not lose the work done since the last checkpoint.
            out.write_text(json.dumps(results, indent=1))
    finally:
        conn.close()

    # Manual-inspection sample: the step Codex flagged as essential — eyeball whether
    # the model captured WHERE IT HAPPENED, not just place-name recognition.
    print(f"\n----- SAMPLE FOR MANUAL INSPECTION (first {args.inspect}) -----")
    shown = 0
    for r in rows:
        # Check the budget before printing so --inspect 0 prints no samples.
        if shown >= args.inspect:
            break
        rid = str(r["id"])
        if rid not in results:
            continue
        v = results[rid]
        print(f"\n[{rid}] {r['title']}")
        print(f" breadth={v['breadth']} conf={v['confidence']} places={v['places']}")
        print(f" why: {v['rationale']}")
        shown += 1

    report(rows, results)
|
|
|
|
|
|
# Script entry point: run the audit only when this file is executed directly,
# not when it is imported.
if __name__ == "__main__":
    main()
|