"""Subject-geography tests.

Covers ISO normalization (model proposes names, code disposes to codes),
storage into the decoupled article_geo/article_places tables, and idempotent
tagging.
"""
import json

import pytest

from goodnews import geo
from goodnews.db import connect, init_db


@pytest.fixture
def conn():
    """Yield a connection opened on ``:memory:``, initialised and seeded with source id 1.

    The connection is closed on teardown, and also when initialisation or
    seeding raises, so a failing setup does not leak the handle.
    """
    c = connect(":memory:")
    try:
        init_db(c)
        c.execute(
            "INSERT INTO sources (id,name,feed_url,trust_score) "
            "VALUES (1,'S','http://s/f',5)"
        )
        yield c
    finally:
        c.close()


def _article(c, aid, *, accepted=1, dup=None):
    """Insert article ``aid`` for source 1 together with its score and summary rows.

    Args:
        c: open database connection.
        aid: article id; also used to derive the URL, title and url_hash.
        accepted: value stored in ``article_scores.accepted``.
        dup: when not None, stored in ``articles.duplicate_of`` for this article.
    """
    c.execute(
        "INSERT INTO articles (id,source_id,canonical_url,title,url_hash,discovered_at) "
        "VALUES (?,1,?,?,?,datetime('now'))",
        (aid, f"http://s/{aid}", f"Title {aid}", f"h{aid}"),
    )
    if dup is not None:
        c.execute("UPDATE articles SET duplicate_of=? WHERE id=?", (dup, aid))
    c.execute(
        "INSERT INTO article_scores (article_id,accepted,reason_code) "
        "VALUES (?,?, 'ok')",
        (aid, accepted),
    )
    c.execute(
        "INSERT INTO article_summaries (article_id,summary) VALUES (?,?)",
        (aid, f"Summary {aid}"),
    )
    c.commit()


def _blank_row(conn):
    """Return one row with title 'x' and every other text column NULL.

    Used as the article row handed to ``geo.classify_geo``; the column names
    are presumably the ones it reads (confirm against goodnews.geo).
    """
    return conn.execute(
        "SELECT 'x' AS title, NULL AS description, NULL AS summary, "
        "NULL AS what_happened, NULL AS why_matters"
    ).fetchone()


class FakeGeo:
    """Stand-in model client whose ``chat_text`` always returns a fixed payload."""

    def __init__(self, payload):
        """Remember the payload that every ``chat_text`` call will return."""
        self._p = payload

    def chat_text(self, messages):
        """Return the stored payload as a JSON string; ``messages`` is ignored."""
        return json.dumps(self._p)


# --- normalization: names -> ISO codes, garbage dropped --------------------------
def test_country_normalization_and_aliases():
    """Country names and aliases map to two-letter codes; unknown names give None."""
    assert geo.normalize_country("United States") == "US"
    assert geo.normalize_country("the USA") == "US"
    assert geo.normalize_country("uganda") == "UG"
    assert geo.normalize_country("United Kingdom") == "GB"
    assert geo.normalize_country("Atlantis") is None  # unknown -> drop, never guess


def test_region_words_never_become_a_country():
    """Region and scope words must normalize to None rather than to a country code."""
    # the exact "Europe as country" leak Codex flagged
    for word in ("Europe", "Asia", "the Middle East", "European Union", "global"):
        # The word is the assertion message so a failure names the offending input.
        assert geo.normalize_country(word) is None, word


def test_state_only_for_us():
    """State names resolve only when the country code is US."""
    assert geo.normalize_state("California", "US") == "CA"
    assert geo.normalize_state("California", "GB") is None  # not a US state context
    assert geo.normalize_state("Ontario", "CA") is None  # v1 = US subdivisions only


def test_normalize_places_maps_dedupes_and_drops_empty():
    """normalize_places maps names to codes, drops empty entries and removes duplicates."""
    raw = [
        {"country": "United States", "state_province": "Texas", "locality": "Galveston"},
        # region->no country, keep locality
        {"country": "Europe", "state_province": None, "locality": "Brussels"},
        # empty -> dropped
        {"country": None, "state_province": None, "locality": None},
        # dup -> dropped
        {"country": "United States", "state_province": "Texas", "locality": "Galveston"},
    ]
    out = geo.normalize_places(raw)
    assert out == [
        {"country_code": "US", "state_code": "TX", "locality": "Galveston"},
        {"country_code": None, "state_code": None, "locality": "Brussels"},
    ]


# --- classify_geo: validates breadth/confidence, normalizes places ---------------
def test_classify_geo_validates_and_normalizes(conn):
    """Valid breadth/confidence pass through and the places are normalized to codes."""
    client = FakeGeo({
        "breadth": "locality",
        "confidence": "high",
        "rationale": "about Galveston",
        "places": [{"country": "USA", "state_province": "Texas", "locality": "Galveston"}],
    })
    r = geo.classify_geo(client, _blank_row(conn))
    assert r["breadth"] == "locality"
    assert r["confidence"] == "high"
    assert r["places"] == [{"country_code": "US", "state_code": "TX", "locality": "Galveston"}]


def test_classify_geo_falls_back_on_bad_enum(conn):
    """Unrecognised enum values and a non-list places value fall back to safe defaults."""
    client = FakeGeo({"breadth": "planetary", "confidence": "absolute", "places": "nope"})
    r = geo.classify_geo(client, _blank_row(conn))
    assert r["breadth"] == "unknown"
    assert r["confidence"] == "low"
    assert r["places"] == []


# --- storage: decoupled tables, places replaced on re-store ----------------------
def test_store_geo_writes_both_tables_and_replaces_places(conn):
    """store_geo writes article_geo and article_places; a second store replaces the places."""
    _article(conn, 1)
    geo.store_geo(conn, 1, {
        "breadth": "national",
        "confidence": "medium",
        "rationale": "US story",
        "places": [{"country_code": "US", "state_code": None, "locality": None}],
    })
    g = conn.execute(
        "SELECT breadth, confidence, geo_version FROM article_geo WHERE article_id=1"
    ).fetchone()
    assert g["breadth"] == "national"
    assert g["confidence"] == "medium"
    assert g["geo_version"] == geo.GEO_VERSION
    place_count = conn.execute(
        "SELECT COUNT(*) FROM article_places WHERE article_id=1"
    ).fetchone()[0]
    assert place_count == 1

    # re-store with different places REPLACES (geo is re-derivable, unlike scores)
    geo.store_geo(conn, 1, {
        "breadth": "locality",
        "confidence": "high",
        "rationale": "city",
        "places": [{"country_code": "US", "state_code": "CA", "locality": "Oakland"}],
    })
    rows = conn.execute(
        "SELECT country_code, state_code, locality "
        "FROM article_places WHERE article_id=1"
    ).fetchall()
    assert len(rows) == 1
    assert rows[0]["state_code"] == "CA"
    assert rows[0]["locality"] == "Oakland"


# --- tag_articles: only accepted non-dup, idempotent by version ------------------
def test_tag_articles_targets_eligible_and_is_idempotent(conn):
    """tag_articles tags only accepted non-duplicates and re-tags only with reclassify."""
    _article(conn, 1)  # eligible
    _article(conn, 2, accepted=0)  # rejected -> skip
    _article(conn, 3, dup=1)  # duplicate -> skip
    client = FakeGeo(
        {"breadth": "global", "confidence": "high", "rationale": "science", "places": []}
    )

    r1 = geo.tag_articles(conn, client, limit=50)
    # only article 1
    assert r1["candidates"] == 1
    assert r1["tagged"] == 1
    stored = conn.execute("SELECT breadth FROM article_geo WHERE article_id=1").fetchone()
    assert stored["breadth"] == "global"

    # second run: already at current version -> nothing to do
    assert geo.tag_articles(conn, client, limit=50)["candidates"] == 0
    # reclassify forces a re-tag
    assert geo.tag_articles(conn, client, limit=50, reclassify=True)["tagged"] == 1