"""English-only gate: non-English articles are HELD (reason_code='non_english'), preserved (not deleted) and distinct from calm-filter rejections, so they don't penalize a multilingual source and can be revisited when translation lands.""" from goodnews import queries from goodnews.db import connect, init_db from goodnews.llm import normalize_scores, upsert_article_score def _data(**kw): base = { "constructive_score": 7, "cortisol_score": 1, "ragebait_score": 1, "agency_score": 5, "human_benefit_score": 6, "novelty_score": 4, "pr_risk_score": 2, "accepted": True, "topic": "science", "flavor": "discovery", "tags": [], "reason_code": "ok", "reason_text": "good", } base.update(kw) return base def test_english_passes_through(): s = normalize_scores(_data(language="en"), "m") assert s["accepted"] == 1 and s["reason_code"] == "ok" and s["language"] == "en" def test_en_variants_count_as_english(): for lang in ("en-US", "EN", "en_us", "en-GB"): assert normalize_scores(_data(language=lang), "m")["accepted"] == 1 def test_non_english_is_held_not_a_rejection(): s = normalize_scores(_data(language="de"), "m") assert s["accepted"] == 0 assert s["reason_code"] == "non_english" # distinct bucket, not a calm-filter reject assert s["language"] == "de" assert "non-English" in s["reason_text"] def test_missing_or_unknown_language_defaults_to_english(): # a model hiccup must never silently drop genuine English content assert normalize_scores(_data(language=""), "m")["accepted"] == 1 assert normalize_scores(_data(language="und"), "m")["accepted"] == 1 assert normalize_scores(_data(), "m")["accepted"] == 1 # no language key at all def test_non_english_buckets_even_a_content_reject(): # a non-English item that was also content-rejected is still 'held', so source # metrics can separate language-holds from calm rejections cleanly s = normalize_scores(_data(language="es", accepted=False, reason_code="ragebait"), "m") assert s["accepted"] == 0 and s["reason_code"] == "non_english" def test_language_persisted_structurally_and_inspector_marks_held(): c = connect(":memory:"); init_db(c) c.execute("INSERT INTO sources (id,name,feed_url,trust_score) VALUES (1,'S','http://s/f',5)") c.execute("INSERT INTO articles (id,source_id,canonical_url,title,url_hash) VALUES (1,1,'http://x','T','h1')") c.commit() upsert_article_score(c, 1, normalize_scores(_data(language="de"), "m")) row = c.execute("SELECT accepted, reason_code, language FROM article_scores WHERE article_id=1").fetchone() assert row["language"] == "de" and row["reason_code"] == "non_english" and row["accepted"] == 0 # structured, not parsed # inspector: shows under 'held', flagged held=True, and NOT under 'rejected' held = queries.source_articles(c, 1, filter="held") assert len(held) == 1 and held[0]["held"] is True assert queries.source_articles(c, 1, filter="rejected") == []