Geo Stage 4 (data layer): geo on feed responses + home-scope query filters
Foundation for "Closer to Home" (server-side, Codex-approved). No behavior change
yet — geo_scope defaults to None, so the default/edge-cached feed is identical.
- queries.feed now returns each article's geo (breadth, confidence, and ISO-coded
places) via a LEFT JOIN + places subquery. Article.from_row parses geo_places
into [{country, state}]. Brief query doesn't select geo, so the Brief stays bare.
- queries.feed gains home-scope filters (home_country/home_state/geo_scope =
near|country|world): STATE match only counts on high/medium geo confidence;
untagged articles fall to 'world' so nothing is lost during backfill.
Next: API composition (home param + near/country/world sectioning with soft/blended
headers + a next_offset pagination model) and the Home picker UI. 360 tests green.
Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
This commit is contained in:
@@ -322,11 +322,26 @@ class Article(BaseModel):
|
||||
paywalled: bool = False
|
||||
tags: list[str] = []
|
||||
summary: str | None = None # our own cached summary (present on the brief)
|
||||
# Subject geography (present on feed rows; absent/empty on the brief). breadth is
|
||||
# locality|regional|national|multinational|global|unknown; places are ISO codes.
|
||||
geo_breadth: str | None = None
|
||||
geo_confidence: str | None = None
|
||||
geo_places: list[dict] = [] # e.g. [{"country": "US", "state": "NY"}, {"country": "GB", "state": None}]
|
||||
|
||||
@classmethod
|
||||
def from_row(cls, row: dict) -> "Article":
|
||||
raw_tags = row.get("tags")
|
||||
places = []
|
||||
for tok in (row.get("geo_places") or "").split(","):
|
||||
tok = tok.strip()
|
||||
if not tok:
|
||||
continue
|
||||
cc, _, sc = tok.partition("-")
|
||||
places.append({"country": cc, "state": sc or None})
|
||||
return cls(
|
||||
geo_breadth=row.get("geo_breadth"),
|
||||
geo_confidence=row.get("geo_confidence"),
|
||||
geo_places=places,
|
||||
summary=row.get("summary"),
|
||||
id=row["id"],
|
||||
title=row["title"],
|
||||
|
||||
+33
-1
@@ -80,6 +80,9 @@ def feed(
|
||||
follow_tags: list[str] | None = None,
|
||||
since: str | None = None,
|
||||
match: str | None = None,
|
||||
home_country: str | None = None,
|
||||
home_state: str | None = None,
|
||||
geo_scope: str | None = None, # 'near' | 'country' | 'world' relative to the reader's home
|
||||
) -> list[dict]:
|
||||
"""Return articles with categorical filters applied in SQL.
|
||||
|
||||
@@ -162,6 +165,30 @@ def feed(
|
||||
params.extend(ftags)
|
||||
clauses.append("(" + " OR ".join(ors) + ")" if ors else "0")
|
||||
|
||||
# Home-aware scoping for "Closer to Home" (server-side). Relative to the reader's
|
||||
# chosen home; geo_scope=None leaves the feed exactly as it is today. A STATE match
|
||||
# only counts when geo confidence is high/medium (don't surface "Near you" on a
|
||||
# shaky location). Untagged articles have no places, so they land in 'world' — never
|
||||
# lost while the backfill is still running.
|
||||
if geo_scope == "near":
|
||||
if home_state and home_country:
|
||||
clauses.append(
|
||||
"g.confidence IN ('high','medium') AND EXISTS (SELECT 1 FROM article_places p "
|
||||
"WHERE p.article_id = a.id AND p.country_code = ? AND p.state_code = ?)")
|
||||
params.extend([home_country, home_state])
|
||||
elif home_country:
|
||||
clauses.append("EXISTS (SELECT 1 FROM article_places p WHERE p.article_id = a.id AND p.country_code = ?)")
|
||||
params.append(home_country)
|
||||
elif geo_scope == "country" and home_country:
|
||||
clauses.append("EXISTS (SELECT 1 FROM article_places p WHERE p.article_id = a.id AND p.country_code = ?)")
|
||||
params.append(home_country)
|
||||
if home_state: # "elsewhere in your country" = your country, but not your state
|
||||
clauses.append("NOT EXISTS (SELECT 1 FROM article_places p2 WHERE p2.article_id = a.id AND p2.state_code = ?)")
|
||||
params.append(home_state)
|
||||
elif geo_scope == "world" and home_country:
|
||||
clauses.append("NOT EXISTS (SELECT 1 FROM article_places p WHERE p.article_id = a.id AND p.country_code = ?)")
|
||||
params.append(home_country)
|
||||
|
||||
where = "WHERE " + " AND ".join(clauses)
|
||||
params.extend([limit, offset])
|
||||
|
||||
@@ -173,10 +200,15 @@ def feed(
|
||||
order_by = "rank_score DESC, COALESCE(a.published_at, a.discovered_at) DESC"
|
||||
rows = conn.execute(
|
||||
f"""
|
||||
SELECT {_ARTICLE_COLUMNS}
|
||||
SELECT {_ARTICLE_COLUMNS},
|
||||
g.breadth AS geo_breadth, g.confidence AS geo_confidence,
|
||||
(SELECT group_concat(
|
||||
p.country_code || CASE WHEN p.state_code IS NOT NULL THEN '-' || p.state_code ELSE '' END, ',')
|
||||
FROM article_places p WHERE p.article_id = a.id) AS geo_places
|
||||
FROM articles a
|
||||
JOIN sources src ON src.id = a.source_id
|
||||
JOIN article_scores s ON s.article_id = a.id
|
||||
LEFT JOIN article_geo g ON g.article_id = a.id
|
||||
{fts_join}
|
||||
{where}
|
||||
ORDER BY {order_by}
|
||||
|
||||
@@ -0,0 +1,103 @@
|
||||
#!/usr/bin/env python3
|
||||
"""Geo coverage report over tagged articles (article_geo / article_places).
|
||||
|
||||
Codex's post-backfill checklist: breadth counts, top countries/states, unknown &
|
||||
global rates, low-confidence + multi-place examples, normalization gaps, and a
|
||||
random spot-check. Read-only; no LLM. Also the seed for the Stage-6 admin report.
|
||||
|
||||
.venv/bin/python scripts/geo_report.py [--spot 20]
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
import sqlite3
|
||||
from collections import Counter
|
||||
|
||||
from goodnews.cli import _default_db
|
||||
|
||||
|
||||
def main():
|
||||
ap = argparse.ArgumentParser()
|
||||
ap.add_argument("--db", default=None)
|
||||
ap.add_argument("--spot", type=int, default=20, help="random tagged articles to spot-check")
|
||||
args = ap.parse_args()
|
||||
|
||||
c = sqlite3.connect(args.db or str(_default_db()))
|
||||
c.row_factory = sqlite3.Row
|
||||
|
||||
total_accepted = c.execute(
|
||||
"SELECT COUNT(*) FROM articles a JOIN article_scores s ON s.article_id=a.id "
|
||||
"WHERE s.accepted=1 AND a.duplicate_of IS NULL"
|
||||
).fetchone()[0]
|
||||
tagged = c.execute("SELECT COUNT(*) FROM article_geo").fetchone()[0]
|
||||
n = tagged or 1
|
||||
|
||||
def pct(x):
|
||||
return f"{100*x/n:.0f}%"
|
||||
|
||||
print(f"\n===== GEO COVERAGE ({tagged} tagged / {total_accepted} accepted non-dup = "
|
||||
f"{100*tagged/(total_accepted or 1):.0f}% covered) =====")
|
||||
|
||||
print("\nBreadth:")
|
||||
br = Counter(r["breadth"] for r in c.execute("SELECT breadth FROM article_geo"))
|
||||
for k in ("locality", "regional", "national", "multinational", "global", "unknown"):
|
||||
print(f" {k:<13} {br.get(k,0):>4} {pct(br.get(k,0))}")
|
||||
unknown = br.get("unknown", 0)
|
||||
glob = br.get("global", 0)
|
||||
print(f"\nUnknown rate: {unknown} ({pct(unknown)}) Global/placeless: {glob} ({pct(glob)})")
|
||||
|
||||
conf = Counter(r["confidence"] for r in c.execute("SELECT confidence FROM article_geo"))
|
||||
print("Confidence:", {k: conf.get(k, 0) for k in ("high", "medium", "low")})
|
||||
|
||||
print("\nTop countries:")
|
||||
for r in c.execute("SELECT country_code, COUNT(*) n FROM article_places "
|
||||
"WHERE country_code IS NOT NULL GROUP BY country_code ORDER BY n DESC LIMIT 15"):
|
||||
print(f" {r['country_code']} {r['n']}")
|
||||
print("\nTop US states:")
|
||||
for r in c.execute("SELECT state_code, COUNT(*) n FROM article_places "
|
||||
"WHERE state_code IS NOT NULL GROUP BY state_code ORDER BY n DESC LIMIT 15"):
|
||||
print(f" {r['state_code']} {r['n']}")
|
||||
|
||||
# US "Near you" fuel: tagged articles with a US state attached
|
||||
us_state = c.execute("SELECT COUNT(DISTINCT article_id) FROM article_places WHERE state_code IS NOT NULL").fetchone()[0]
|
||||
print(f"\nArticles with a US state (US 'Near you' fuel): {us_state} ({pct(us_state)})")
|
||||
|
||||
# Normalization gap: breadth implies a place but none stored (name didn't map, or
|
||||
# the model named no place). A proxy for where normalization/extraction is weak.
|
||||
gap = c.execute(
|
||||
"SELECT COUNT(*) FROM article_geo g WHERE g.breadth IN ('locality','regional','national','multinational') "
|
||||
"AND NOT EXISTS (SELECT 1 FROM article_places p WHERE p.article_id=g.article_id)"
|
||||
).fetchone()[0]
|
||||
print(f"Normalization/extraction gaps (place-bearing breadth, 0 places stored): {gap} ({pct(gap)})")
|
||||
|
||||
print("\n--- low-confidence examples (up to 8) ---")
|
||||
for r in c.execute(
|
||||
"SELECT g.article_id, a.title, g.breadth, g.rationale FROM article_geo g "
|
||||
"JOIN articles a ON a.id=g.article_id WHERE g.confidence='low' LIMIT 8"):
|
||||
print(f" [{r['article_id']}] {r['breadth']} | {r['title'][:55]} | {r['rationale'] or ''}")
|
||||
|
||||
print("\n--- multi-place examples (up to 8) ---")
|
||||
for r in c.execute(
|
||||
"SELECT article_id, COUNT(*) n FROM article_places GROUP BY article_id HAVING n>1 ORDER BY n DESC LIMIT 8"):
|
||||
a = c.execute("SELECT title FROM articles WHERE id=?", (r["article_id"],)).fetchone()
|
||||
places = c.execute("SELECT country_code, state_code, locality FROM article_places WHERE article_id=?",
|
||||
(r["article_id"],)).fetchall()
|
||||
print(f" [{r['article_id']}] ({r['n']}) {a['title'][:45]} | {[dict(p) for p in places]}")
|
||||
|
||||
print(f"\n--- random spot-check ({args.spot}) ---")
|
||||
for r in c.execute(
|
||||
"SELECT g.article_id, a.title, g.breadth, g.confidence, g.rationale FROM article_geo g "
|
||||
"JOIN articles a ON a.id=g.article_id ORDER BY RANDOM() LIMIT ?", (args.spot,)):
|
||||
pl = c.execute("SELECT country_code, state_code, locality FROM article_places WHERE article_id=?",
|
||||
(r["article_id"],)).fetchall()
|
||||
tag = ", ".join(
|
||||
f"{p['country_code'] or '?'}{('/'+p['state_code']) if p['state_code'] else ''}"
|
||||
f"{(':'+p['locality']) if p['locality'] else ''}" for p in pl) or "(no place)"
|
||||
print(f" [{r['article_id']}] {r['breadth']}/{r['confidence']} [{tag}] {r['title'][:50]}")
|
||||
if r["rationale"]:
|
||||
print(f" why: {r['rationale']}")
|
||||
c.close()
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
Reference in New Issue
Block a user