1c05554a28
"Closer to Home" foundation (audit greenlit by Codex). Durable geography, kept decoupled from volatile scoring. - Schema: article_geo (breadth/confidence/rationale/geo_version) + article_places (0..N ISO-coded places), separate from article_scores so re-runs/audits never disturb scoring or acceptance. "local" is never stored — it's relative to the reader; the UI computes "Near you" later. - geo.py: LLM proposes place NAMES, code disposes to ISO codes (country alpha-2, US state 2-letter); region words like "Europe" can never become a country. 'global'/placeless is first-class, not failure. Confidence calibrated so 'high' needs an explicit location. Geo is its OWN LLM pass, not merged into the scoring prompt (durable metadata, re-runnable, keeps the sensitive prompt untouched). - store_geo replaces places (geo is re-derivable, unlike scores). tag_articles is idempotent by geo_version, only touches accepted non-duplicate articles. - CLI `geo` command (cycle-locked, --limit/--reclassify) for backfill, plus a bounded geo step in the cycle (--geo-limit 60, --no-geo). scripts/geo_audit.py is the prototype audit tool. 360 tests green; live smoke tagged real articles correctly (Gaza->PS, London->GB, placeless science->global). No UI / SEO pages yet — ranking/personalization only. Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
125 lines
6.0 KiB
Python
125 lines
6.0 KiB
Python
"""Subject-geography: ISO normalization (model proposes names, code disposes to codes),
|
|
storage into the decoupled article_geo/article_places tables, and idempotent tagging."""
|
|
import json
|
|
|
|
import pytest
|
|
|
|
from goodnews import geo
|
|
from goodnews.db import connect, init_db
|
|
|
|
|
|
@pytest.fixture
def conn():
    """Yield an in-memory database initialised by ``init_db`` plus one seed source row.

    The ``sources`` row with id 1 is inserted here because ``_article`` always
    writes ``source_id`` 1. The connection is closed on teardown, and also when
    initialisation or seeding raises before the test ever receives it.
    """
    c = connect(":memory:")
    try:
        init_db(c)
        c.execute("INSERT INTO sources (id,name,feed_url,trust_score) VALUES (1,'S','http://s/f',5)")
        yield c
    finally:
        # pytest does not run post-yield teardown when setup itself raises,
        # so the close lives in ``finally`` to cover that path as well.
        c.close()
|
|
|
|
|
|
def _article(c, aid, *, accepted=1, dup=None):
|
|
c.execute("INSERT INTO articles (id,source_id,canonical_url,title,url_hash,discovered_at) "
|
|
"VALUES (?,1,?,?,?,datetime('now'))", (aid, f"http://s/{aid}", f"Title {aid}", f"h{aid}"))
|
|
if dup is not None:
|
|
c.execute("UPDATE articles SET duplicate_of=? WHERE id=?", (dup, aid))
|
|
c.execute("INSERT INTO article_scores (article_id,accepted,reason_code) VALUES (?,?, 'ok')", (aid, accepted))
|
|
c.execute("INSERT INTO article_summaries (article_id,summary) VALUES (?,?)", (aid, f"Summary {aid}"))
|
|
c.commit()
|
|
|
|
|
|
class FakeGeo:
    """Stand-in LLM client whose every reply is one canned payload, JSON-encoded."""

    def __init__(self, payload):
        # Kept as the raw object; it is serialised on each chat_text call.
        self._payload = payload

    def chat_text(self, messages):
        """Ignore ``messages`` and return the canned payload as a JSON string."""
        canned = self._payload
        return json.dumps(canned)
|
|
|
|
|
|
# --- normalization: names -> ISO codes, garbage dropped --------------------------
|
|
|
|
def test_country_normalization_and_aliases():
    """Country names and aliases map to two-letter codes; an unknown name gives None."""
    expected_codes = {
        "United States": "US",
        "the USA": "US",
        "uganda": "UG",
        "United Kingdom": "GB",
    }
    for name, code in expected_codes.items():
        assert geo.normalize_country(name) == code
    # unknown -> drop, never guess
    assert geo.normalize_country("Atlantis") is None
|
|
|
|
|
|
def test_region_words_never_become_a_country():
    """Region words, blocs, and "global" must normalize to no country at all."""
    # the exact "Europe as country" leak Codex flagged
    region_words = ("Europe", "Asia", "the Middle East", "European Union", "global")
    # map() is lazy, so the check still stops at the first word that leaks through.
    for code in map(geo.normalize_country, region_words):
        assert code is None
|
|
|
|
|
|
def test_state_only_for_us():
    """A state code is produced for a US context and None for the non-US contexts tried."""
    to_state = geo.normalize_state
    assert to_state("California", "US") == "CA"
    # not a US state context
    assert to_state("California", "GB") is None
    # v1 = US subdivisions only
    assert to_state("Ontario", "CA") is None
|
|
|
|
|
|
def test_normalize_places_maps_dedupes_and_drops_empty():
    """normalize_places maps names to codes, drops all-empty entries, and removes repeats."""
    galveston = {"country": "United States", "state_province": "Texas", "locality": "Galveston"}
    # region -> no country, keep locality
    brussels = {"country": "Europe", "state_province": None, "locality": "Brussels"}
    # empty -> dropped
    blank = {"country": None, "state_province": None, "locality": None}
    # dup -> dropped (a separate dict object with equal content)
    galveston_again = dict(galveston)

    normalized = geo.normalize_places([galveston, brussels, blank, galveston_again])

    expected = [
        {"country_code": "US", "state_code": "TX", "locality": "Galveston"},
        {"country_code": None, "state_code": None, "locality": "Brussels"},
    ]
    assert normalized == expected
|
|
|
|
|
|
# --- classify_geo: validates breadth/confidence, normalizes places ---------------
|
|
|
|
def test_classify_geo_validates_and_normalizes(conn):
    """A well-formed reply keeps its breadth/confidence and has its place names coded."""
    reply = {
        "breadth": "locality",
        "confidence": "high",
        "rationale": "about Galveston",
        "places": [{"country": "USA", "state_province": "Texas", "locality": "Galveston"}],
    }
    # A literal SELECT yields a row with the needed column names without touching any table.
    article_row = conn.execute(
        "SELECT 'x' AS title, NULL AS description, NULL AS summary, "
        "NULL AS what_happened, NULL AS why_matters"
    ).fetchone()

    result = geo.classify_geo(FakeGeo(reply), article_row)

    assert result["breadth"] == "locality"
    assert result["confidence"] == "high"
    assert result["places"] == [{"country_code": "US", "state_code": "TX", "locality": "Galveston"}]
|
|
|
|
|
|
def test_classify_geo_falls_back_on_bad_enum(conn):
    """Unrecognised breadth/confidence values and a non-list ``places`` fall back to defaults."""
    bad_reply = {"breadth": "planetary", "confidence": "absolute", "places": "nope"}
    # A literal SELECT yields a row with the needed column names without touching any table.
    article_row = conn.execute(
        "SELECT 'x' AS title, NULL AS description, NULL AS summary, "
        "NULL AS what_happened, NULL AS why_matters"
    ).fetchone()

    result = geo.classify_geo(FakeGeo(bad_reply), article_row)

    assert result["breadth"] == "unknown"
    assert result["confidence"] == "low"
    assert result["places"] == []
|
|
|
|
|
|
# --- storage: decoupled tables, places replaced on re-store ----------------------
|
|
|
|
def test_store_geo_writes_both_tables_and_replaces_places(conn):
    """store_geo fills article_geo and article_places; a second call swaps the places."""
    _article(conn, 1)

    national = {
        "breadth": "national",
        "confidence": "medium",
        "rationale": "US story",
        "places": [{"country_code": "US", "state_code": None, "locality": None}],
    }
    geo.store_geo(conn, 1, national)

    geo_row = conn.execute(
        "SELECT breadth, confidence, geo_version FROM article_geo WHERE article_id=1"
    ).fetchone()
    assert geo_row["breadth"] == "national"
    assert geo_row["confidence"] == "medium"
    assert geo_row["geo_version"] == geo.GEO_VERSION
    place_count = conn.execute("SELECT COUNT(*) FROM article_places WHERE article_id=1").fetchone()[0]
    assert place_count == 1

    # re-store with different places REPLACES (geo is re-derivable, unlike scores)
    local = {
        "breadth": "locality",
        "confidence": "high",
        "rationale": "city",
        "places": [{"country_code": "US", "state_code": "CA", "locality": "Oakland"}],
    }
    geo.store_geo(conn, 1, local)

    places = conn.execute(
        "SELECT country_code, state_code, locality FROM article_places WHERE article_id=1"
    ).fetchall()
    assert len(places) == 1
    assert places[0]["state_code"] == "CA"
    assert places[0]["locality"] == "Oakland"
|
|
|
|
|
|
# --- tag_articles: only accepted non-dup, idempotent by version ------------------
|
|
|
|
def test_tag_articles_targets_eligible_and_is_idempotent(conn):
    """tag_articles tags only the accepted, non-duplicate article and skips it on a re-run."""
    _article(conn, 1)              # eligible
    _article(conn, 2, accepted=0)  # rejected -> skip
    _article(conn, 3, dup=1)       # duplicate -> skip
    canned = {"breadth": "global", "confidence": "high", "rationale": "science", "places": []}
    client = FakeGeo(canned)

    first_run = geo.tag_articles(conn, client, limit=50)
    # only article 1
    assert first_run["candidates"] == 1
    assert first_run["tagged"] == 1
    stored = conn.execute("SELECT breadth FROM article_geo WHERE article_id=1").fetchone()
    assert stored["breadth"] == "global"

    # second run: already at current version -> nothing to do
    second_run = geo.tag_articles(conn, client, limit=50)
    assert second_run["candidates"] == 0

    # reclassify forces a re-tag
    forced_run = geo.tag_articles(conn, client, limit=50, reclassify=True)
    assert forced_run["tagged"] == 1
|