From ad4e88c8f263987358c3ab831ef65cd429f6e857 Mon Sep 17 00:00:00 2001 From: jay Date: Fri, 19 Jun 2026 19:30:43 -0400 Subject: [PATCH] Geo Stage 4 (data layer): geo on feed responses + home-scope query filters MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Foundation for "Closer to Home" (server-side, Codex-approved). No behavior change yet — geo_scope defaults None, so the default/edge-cached feed is identical. - queries.feed now returns each article's geo (breadth, confidence, and ISO-coded places) via a LEFT JOIN + places subquery. Article.from_row parses geo_places into [{country, state}]. Brief query doesn't select geo, so the Brief stays bare. - queries.feed gains home-scope filters (home_country/home_state/geo_scope = near|country|world): STATE match only counts on high/medium geo confidence; untagged articles fall to 'world' so nothing is lost during backfill. Next: API composition (home param + near/country/world sectioning with soft/blended headers + a next_offset pagination model) and the Home picker UI. 360 tests green. Co-Authored-By: Claude Opus 4.8 --- goodnews/api.py | 15 ++++++ goodnews/queries.py | 34 +++++++++++++- scripts/geo_report.py | 103 ++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 151 insertions(+), 1 deletion(-) create mode 100644 scripts/geo_report.py diff --git a/goodnews/api.py b/goodnews/api.py index 4b04750..ab93b36 100644 --- a/goodnews/api.py +++ b/goodnews/api.py @@ -322,11 +322,26 @@ class Article(BaseModel): paywalled: bool = False tags: list[str] = [] summary: str | None = None # our own cached summary (present on the brief) + # Subject geography (present on feed rows; absent/empty on the brief). breadth is + # locality|regional|national|multinational|global|unknown; places are ISO codes. + geo_breadth: str | None = None + geo_confidence: str | None = None + geo_places: list[dict] = [] # e.g. [{"country": "US", "state": "NY"}, {"country": "GB", "state": None}] @classmethod def from_row(cls, row: dict) -> "Article": raw_tags = row.get("tags") + places = [] + for tok in (row.get("geo_places") or "").split(","): + tok = tok.strip() + if not tok: + continue + cc, _, sc = tok.partition("-") + places.append({"country": cc, "state": sc or None}) return cls( + geo_breadth=row.get("geo_breadth"), + geo_confidence=row.get("geo_confidence"), + geo_places=places, summary=row.get("summary"), id=row["id"], title=row["title"], diff --git a/goodnews/queries.py b/goodnews/queries.py index 3fc5736..ec24b63 100644 --- a/goodnews/queries.py +++ b/goodnews/queries.py @@ -80,6 +80,9 @@ def feed( follow_tags: list[str] | None = None, since: str | None = None, match: str | None = None, + home_country: str | None = None, + home_state: str | None = None, + geo_scope: str | None = None, # 'near' | 'country' | 'world' relative to the reader's home ) -> list[dict]: """Return articles with categorical filters applied in SQL. @@ -162,6 +165,30 @@ def feed( params.extend(ftags) clauses.append("(" + " OR ".join(ors) + ")" if ors else "0") + # Home-aware scoping for "Closer to Home" (server-side). Relative to the reader's + # chosen home; geo_scope=None leaves the feed exactly as it is today. A STATE match + # only counts when geo confidence is high/medium (don't surface "Near you" on a + # shaky location). Untagged articles have no places, so they land in 'world' — never + # lost while the backfill is still running. + if geo_scope == "near": + if home_state and home_country: + clauses.append( + "g.confidence IN ('high','medium') AND EXISTS (SELECT 1 FROM article_places p " + "WHERE p.article_id = a.id AND p.country_code = ? AND p.state_code = ?)") + params.extend([home_country, home_state]) + elif home_country: + clauses.append("EXISTS (SELECT 1 FROM article_places p WHERE p.article_id = a.id AND p.country_code = ?)") + params.append(home_country) + elif geo_scope == "country" and home_country: + clauses.append("EXISTS (SELECT 1 FROM article_places p WHERE p.article_id = a.id AND p.country_code = ?)") + params.append(home_country) + if home_state: # "elsewhere in your country" = your country, but not your state + clauses.append("NOT EXISTS (SELECT 1 FROM article_places p2 WHERE p2.article_id = a.id AND p2.state_code = ?)") + params.append(home_state) + elif geo_scope == "world" and home_country: + clauses.append("NOT EXISTS (SELECT 1 FROM article_places p WHERE p.article_id = a.id AND p.country_code = ?)") + params.append(home_country) + where = "WHERE " + " AND ".join(clauses) params.extend([limit, offset]) @@ -173,10 +200,15 @@ def feed( order_by = "rank_score DESC, COALESCE(a.published_at, a.discovered_at) DESC" rows = conn.execute( f""" - SELECT {_ARTICLE_COLUMNS} + SELECT {_ARTICLE_COLUMNS}, + g.breadth AS geo_breadth, g.confidence AS geo_confidence, + (SELECT group_concat( + p.country_code || CASE WHEN p.state_code IS NOT NULL THEN '-' || p.state_code ELSE '' END, ',') + FROM article_places p WHERE p.article_id = a.id) AS geo_places FROM articles a JOIN sources src ON src.id = a.source_id JOIN article_scores s ON s.article_id = a.id + LEFT JOIN article_geo g ON g.article_id = a.id {fts_join} {where} ORDER BY {order_by} diff --git a/scripts/geo_report.py b/scripts/geo_report.py new file mode 100644 index 0000000..441c81a --- /dev/null +++ b/scripts/geo_report.py @@ -0,0 +1,103 @@ +#!/usr/bin/env python3 +"""Geo coverage report over tagged articles (article_geo / article_places). + +Codex's post-backfill checklist: breadth counts, top countries/states, unknown & +global rates, low-confidence + multi-place examples, normalization gaps, and a +random spot-check. Read-only; no LLM. Also the seed for the Stage-6 admin report. + + .venv/bin/python scripts/geo_report.py [--spot 20] +""" +from __future__ import annotations + +import argparse +import sqlite3 +from collections import Counter + +from goodnews.cli import _default_db + + +def main(): + ap = argparse.ArgumentParser() + ap.add_argument("--db", default=None) + ap.add_argument("--spot", type=int, default=20, help="random tagged articles to spot-check") + args = ap.parse_args() + + c = sqlite3.connect(args.db or str(_default_db())) + c.row_factory = sqlite3.Row + + total_accepted = c.execute( + "SELECT COUNT(*) FROM articles a JOIN article_scores s ON s.article_id=a.id " + "WHERE s.accepted=1 AND a.duplicate_of IS NULL" + ).fetchone()[0] + tagged = c.execute("SELECT COUNT(*) FROM article_geo").fetchone()[0] + n = tagged or 1 + + def pct(x): + return f"{100*x/n:.0f}%" + + print(f"\n===== GEO COVERAGE ({tagged} tagged / {total_accepted} accepted non-dup = " + f"{100*tagged/(total_accepted or 1):.0f}% covered) =====") + + print("\nBreadth:") + br = Counter(r["breadth"] for r in c.execute("SELECT breadth FROM article_geo")) + for k in ("locality", "regional", "national", "multinational", "global", "unknown"): + print(f" {k:<13} {br.get(k,0):>4} {pct(br.get(k,0))}") + unknown = br.get("unknown", 0) + glob = br.get("global", 0) + print(f"\nUnknown rate: {unknown} ({pct(unknown)}) Global/placeless: {glob} ({pct(glob)})") + + conf = Counter(r["confidence"] for r in c.execute("SELECT confidence FROM article_geo")) + print("Confidence:", {k: conf.get(k, 0) for k in ("high", "medium", "low")}) + + print("\nTop countries:") + for r in c.execute("SELECT country_code, COUNT(*) n FROM article_places " + "WHERE country_code IS NOT NULL GROUP BY country_code ORDER BY n DESC LIMIT 15"): + print(f" {r['country_code']} {r['n']}") + print("\nTop US states:") + for r in c.execute("SELECT state_code, COUNT(*) n FROM article_places " + "WHERE state_code IS NOT NULL GROUP BY state_code ORDER BY n DESC LIMIT 15"): + print(f" {r['state_code']} {r['n']}") + + # US "Near you" fuel: tagged articles with a US state attached + us_state = c.execute("SELECT COUNT(DISTINCT article_id) FROM article_places WHERE state_code IS NOT NULL").fetchone()[0] + print(f"\nArticles with a US state (US 'Near you' fuel): {us_state} ({pct(us_state)})") + + # Normalization gap: breadth implies a place but none stored (name didn't map, or + # the model named no place). A proxy for where normalization/extraction is weak. + gap = c.execute( + "SELECT COUNT(*) FROM article_geo g WHERE g.breadth IN ('locality','regional','national','multinational') " + "AND NOT EXISTS (SELECT 1 FROM article_places p WHERE p.article_id=g.article_id)" + ).fetchone()[0] + print(f"Normalization/extraction gaps (place-bearing breadth, 0 places stored): {gap} ({pct(gap)})") + + print("\n--- low-confidence examples (up to 8) ---") + for r in c.execute( + "SELECT g.article_id, a.title, g.breadth, g.rationale FROM article_geo g " + "JOIN articles a ON a.id=g.article_id WHERE g.confidence='low' LIMIT 8"): + print(f" [{r['article_id']}] {r['breadth']} | {r['title'][:55]} | {r['rationale'] or ''}") + + print("\n--- multi-place examples (up to 8) ---") + for r in c.execute( + "SELECT article_id, COUNT(*) n FROM article_places GROUP BY article_id HAVING n>1 ORDER BY n DESC LIMIT 8"): + a = c.execute("SELECT title FROM articles WHERE id=?", (r["article_id"],)).fetchone() + places = c.execute("SELECT country_code, state_code, locality FROM article_places WHERE article_id=?", + (r["article_id"],)).fetchall() + print(f" [{r['article_id']}] ({r['n']}) {a['title'][:45]} | {[dict(p) for p in places]}") + + print(f"\n--- random spot-check ({args.spot}) ---") + for r in c.execute( + "SELECT g.article_id, a.title, g.breadth, g.confidence, g.rationale FROM article_geo g " + "JOIN articles a ON a.id=g.article_id ORDER BY RANDOM() LIMIT ?", (args.spot,)): + pl = c.execute("SELECT country_code, state_code, locality FROM article_places WHERE article_id=?", + (r["article_id"],)).fetchall() + tag = ", ".join( + f"{p['country_code'] or '?'}{('/'+p['state_code']) if p['state_code'] else ''}" + f"{(':'+p['locality']) if p['locality'] else ''}" for p in pl) or "(no place)" + print(f" [{r['article_id']}] {r['breadth']}/{r['confidence']} [{tag}] {r['title'][:50]}") + if r["rationale"]: + print(f" why: {r['rationale']}") + c.close() + + +if __name__ == "__main__": + main()