Files
upbeatBytes/tests/test_geo.py
T
thejayman77 1c05554a28 Geo Stage 1-2: subject-geography model + classifier + pipeline wiring
"Closer to Home" foundation (audit greenlit by Codex). Durable geography, kept
decoupled from volatile scoring.

- Schema: article_geo (breadth/confidence/rationale/geo_version) + article_places
  (0..N ISO-coded places), separate from article_scores so re-runs/audits never
  disturb scoring or acceptance. "local" is never stored — it's relative to the
  reader; the UI computes "Near you" later.
- geo.py: LLM proposes place NAMES, code disposes to ISO codes (country alpha-2,
  US state 2-letter); region words like "Europe" can never become a country.
  'global'/placeless is first-class, not failure. Confidence calibrated so 'high'
  needs an explicit location. Geo is its OWN LLM pass, not merged into the scoring
  prompt (durable metadata, re-runnable, keeps the sensitive prompt untouched).
- store_geo replaces places (geo is re-derivable, unlike scores). tag_articles is
  idempotent by geo_version, only touches accepted non-duplicate articles.
- CLI `geo` command (cycle-locked, --limit/--reclassify) for backfill, plus a
  bounded geo step in the cycle (--geo-limit 60, --no-geo). scripts/geo_audit.py
  is the prototype audit tool.

360 tests green; live smoke tagged real articles correctly (Gaza->PS, London->GB,
placeless science->global). No UI / SEO pages yet — ranking/personalization only.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
2026-06-19 16:56:49 -04:00

125 lines
6.0 KiB
Python

"""Subject-geography: ISO normalization (model proposes names, code disposes to codes),
storage into the decoupled article_geo/article_places tables, and idempotent tagging."""
import json
import pytest
from goodnews import geo
from goodnews.db import connect, init_db
@pytest.fixture
def conn():
    """Yield an in-memory database with the schema applied and one seeded source.

    The connection is opened with ``connect(":memory:")`` and passed through
    ``init_db``. A single ``sources`` row with id 1 is inserted because the
    ``_article`` helper in this module always writes ``source_id = 1``.

    The connection is closed on teardown, and also when schema setup or the
    seed insert raises before the test body ever runs.
    """
    c = connect(":memory:")
    try:
        init_db(c)
        c.execute(
            "INSERT INTO sources (id,name,feed_url,trust_score) VALUES (1,'S','http://s/f',5)"
        )
        yield c
    finally:
        # Runs for normal teardown and for a failure during setup alike.
        c.close()
def _article(c, aid, *, accepted=1, dup=None):
    """Insert article ``aid`` with its score and summary rows, then commit.

    Args:
        c: open database connection exposing ``execute`` and ``commit``.
        aid: article id; the URL, title, url_hash and summary text are derived from it.
        accepted: value written to ``article_scores.accepted``.
        dup: when not None, written to ``articles.duplicate_of`` for this article.
    """
    # Statements are collected in execution order and run in one loop.
    statements = [
        (
            "INSERT INTO articles (id,source_id,canonical_url,title,url_hash,discovered_at) "
            "VALUES (?,1,?,?,?,datetime('now'))",
            (aid, f"http://s/{aid}", f"Title {aid}", f"h{aid}"),
        ),
    ]
    if dup is not None:
        statements.append(
            ("UPDATE articles SET duplicate_of=? WHERE id=?", (dup, aid))
        )
    statements.append(
        (
            "INSERT INTO article_scores (article_id,accepted,reason_code) VALUES (?,?, 'ok')",
            (aid, accepted),
        )
    )
    statements.append(
        (
            "INSERT INTO article_summaries (article_id,summary) VALUES (?,?)",
            (aid, f"Summary {aid}"),
        )
    )
    for sql, params in statements:
        c.execute(sql, params)
    c.commit()
class FakeGeo:
    """Stand-in chat client whose ``chat_text`` always answers with one canned payload."""

    def __init__(self, payload):
        # Stored untouched; it is serialized afresh on every chat_text call.
        self._payload = payload

    def chat_text(self, messages):
        """Ignore ``messages`` and return the canned payload encoded as a JSON string."""
        encoded = json.dumps(self._payload)
        return encoded
# --- normalization: names -> ISO codes, garbage dropped --------------------------
def test_country_normalization_and_aliases():
    """Country names and aliases normalize to two-letter codes; unknown names give None."""
    expected_codes = [
        ("United States", "US"),
        ("the USA", "US"),
        ("uganda", "UG"),
        ("United Kingdom", "GB"),
    ]
    for name, code in expected_codes:
        assert geo.normalize_country(name) == code
    # unknown -> drop, never guess
    unknown = geo.normalize_country("Atlantis")
    assert unknown is None
def test_region_words_never_become_a_country():
    """Region and bloc words must normalize to None, never to a country code."""
    # the exact "Europe as country" leak Codex flagged
    region_words = ("Europe", "Asia", "the Middle East", "European Union", "global")
    for word in region_words:
        resolved = geo.normalize_country(word)
        assert resolved is None
def test_state_only_for_us():
    """A state code is produced only when the country context is the US."""
    california_in_us = geo.normalize_state("California", "US")
    assert california_in_us == "CA"

    # not a US state context
    california_in_gb = geo.normalize_state("California", "GB")
    assert california_in_gb is None

    # v1 = US subdivisions only
    ontario_in_canada = geo.normalize_state("Ontario", "CA")
    assert ontario_in_canada is None
def test_normalize_places_maps_dedupes_and_drops_empty():
    """normalize_places maps names to codes, drops empty entries and removes duplicates."""
    galveston = {"country": "United States", "state_province": "Texas", "locality": "Galveston"}
    # region -> no country, but the locality is kept
    brussels = {"country": "Europe", "state_province": None, "locality": "Brussels"}
    # nothing usable -> dropped
    empty = {"country": None, "state_province": None, "locality": None}
    # equal to the first entry (separate dict object) -> dropped as a duplicate
    galveston_again = dict(galveston)

    normalized = geo.normalize_places([galveston, brussels, empty, galveston_again])

    expected = [
        {"country_code": "US", "state_code": "TX", "locality": "Galveston"},
        {"country_code": None, "state_code": None, "locality": "Brussels"},
    ]
    assert normalized == expected
# --- classify_geo: validates breadth/confidence, normalizes places ---------------
def test_classify_geo_validates_and_normalizes(conn):
    """Valid breadth/confidence pass through and proposed places come back normalized."""
    payload = {
        "breadth": "locality",
        "confidence": "high",
        "rationale": "about Galveston",
        "places": [{"country": "USA", "state_province": "Texas", "locality": "Galveston"}],
    }
    client = FakeGeo(payload)
    # The database is only used to build a row with the columns selected here.
    row_sql = (
        "SELECT 'x' AS title, NULL AS description, NULL AS summary, "
        "NULL AS what_happened, NULL AS why_matters"
    )
    row = conn.execute(row_sql).fetchone()

    result = geo.classify_geo(client, row)

    assert result["breadth"] == "locality"
    assert result["confidence"] == "high"
    assert result["places"] == [{"country_code": "US", "state_code": "TX", "locality": "Galveston"}]
def test_classify_geo_falls_back_on_bad_enum(conn):
    """Unrecognized breadth/confidence and a non-list places value fall back to defaults."""
    bad_payload = {"breadth": "planetary", "confidence": "absolute", "places": "nope"}
    client = FakeGeo(bad_payload)
    row_sql = (
        "SELECT 'x' AS title, NULL AS description, NULL AS summary, "
        "NULL AS what_happened, NULL AS why_matters"
    )
    row = conn.execute(row_sql).fetchone()

    result = geo.classify_geo(client, row)

    assert result["breadth"] == "unknown"
    assert result["confidence"] == "low"
    assert result["places"] == []
# --- storage: decoupled tables, places replaced on re-store ----------------------
def test_store_geo_writes_both_tables_and_replaces_places(conn):
    """store_geo writes article_geo and article_places; a second store replaces the places."""
    _article(conn, 1)

    national = {
        "breadth": "national",
        "confidence": "medium",
        "rationale": "US story",
        "places": [{"country_code": "US", "state_code": None, "locality": None}],
    }
    geo.store_geo(conn, 1, national)

    stored = conn.execute(
        "SELECT breadth, confidence, geo_version FROM article_geo WHERE article_id=1"
    ).fetchone()
    assert stored["breadth"] == "national"
    assert stored["confidence"] == "medium"
    assert stored["geo_version"] == geo.GEO_VERSION
    place_count = conn.execute(
        "SELECT COUNT(*) FROM article_places WHERE article_id=1"
    ).fetchone()[0]
    assert place_count == 1

    # re-store with different places REPLACES (geo is re-derivable, unlike scores)
    local = {
        "breadth": "locality",
        "confidence": "high",
        "rationale": "city",
        "places": [{"country_code": "US", "state_code": "CA", "locality": "Oakland"}],
    }
    geo.store_geo(conn, 1, local)

    places = conn.execute(
        "SELECT country_code, state_code, locality FROM article_places WHERE article_id=1"
    ).fetchall()
    assert len(places) == 1
    assert places[0]["state_code"] == "CA"
    assert places[0]["locality"] == "Oakland"
# --- tag_articles: only accepted non-dup, idempotent by version ------------------
def test_tag_articles_targets_eligible_and_is_idempotent(conn):
    """tag_articles tags only accepted, non-duplicate articles and skips already-tagged ones."""
    _article(conn, 1)              # eligible
    _article(conn, 2, accepted=0)  # rejected -> skip
    _article(conn, 3, dup=1)       # duplicate -> skip
    client = FakeGeo(
        {"breadth": "global", "confidence": "high", "rationale": "science", "places": []}
    )

    first_run = geo.tag_articles(conn, client, limit=50)
    # only article 1
    assert first_run["candidates"] == 1
    assert first_run["tagged"] == 1
    tagged_row = conn.execute(
        "SELECT breadth FROM article_geo WHERE article_id=1"
    ).fetchone()
    assert tagged_row["breadth"] == "global"

    # second run: already at current version -> nothing to do
    second_run = geo.tag_articles(conn, client, limit=50)
    assert second_run["candidates"] == 0

    # reclassify forces a re-tag
    forced_run = geo.tag_articles(conn, client, limit=50, reclassify=True)
    assert forced_run["tagged"] == 1