Files
upbeatBytes/tests/test_language_gate.py
thejayman77 89c0fbe1f6 Sync repo to deployed state: SEO recovery, Publishing Desk, Play games, emoji picker
The deploy pipeline runs from the working tree, so a wave of shipped features
had never been committed. This snapshots git to what's actually running.

SEO impression recovery (live + verified):
- Duplicate /a/{id} pages now 301-redirect to their canonical twin instead of returning 404
  (a hard 404 silently dropped already-indexed URLs and tanked impressions).
- Dedup representative selection reworked: accepted/serveable -> established
  rep (URL stability) -> quality score, so an accepted page never retires to a
  rejected rep and an indexed canonical doesn't churn when a newer twin arrives.
- HEAD /a/{id} returns the same status as GET (api_route GET+HEAD) instead of
  falling through to the static mount and 404ing.
- `dedup --force-recluster`: cycle-locked, model-free re-cluster to re-apply the
  policy to the existing corpus (shared cycle_lock context manager).
- CLI honors GOODNEWS_DB for its default --db (was silently ignored).

Publishing Desk (admin tool to post highlights to X via Web Intents):
- publishing.py queue/rank/handle-resolution; admin UI; full searchable emoji
  picker (bundled data, no CDN) for the blurb editor.

Play games + site:
- Bloom (word-wheel), Memory Match, daily ritual set, Zen Den (dev-gated).
- English-only language gate; source prospecting; paywall + dedup hardening.

Tests: full suite green (349). Ignores tightened (node_modules, data/*.db).

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
2026-06-18 11:32:27 -04:00

64 lines
3.0 KiB
Python

"""English-only gate: non-English articles are HELD (reason_code='non_english'),
preserved (not deleted) and distinct from calm-filter rejections, so they don't
penalize a multilingual source and can be revisited when translation lands."""
from goodnews import queries
from goodnews.db import connect, init_db
from goodnews.llm import normalize_scores, upsert_article_score
def _data(**kw):
base = {
"constructive_score": 7, "cortisol_score": 1, "ragebait_score": 1, "agency_score": 5,
"human_benefit_score": 6, "novelty_score": 4, "pr_risk_score": 2, "accepted": True,
"topic": "science", "flavor": "discovery", "tags": [],
"reason_code": "ok", "reason_text": "good",
}
base.update(kw)
return base
def test_english_passes_through():
    """An explicit ``language="en"`` payload stays accepted and keeps its reason code."""
    s = normalize_scores(_data(language="en"), "m")
    # one condition per assert so a failure names the field that regressed
    assert s["accepted"] == 1
    assert s["reason_code"] == "ok"
    assert s["language"] == "en"
def test_en_variants_count_as_english():
    """Region-tagged and differently-cased ``en`` codes are all expected to be accepted."""
    for lang in ("en-US", "EN", "en_us", "en-GB"):
        # the message identifies which variant failed; a bare assert in a loop would not
        assert normalize_scores(_data(language=lang), "m")["accepted"] == 1, lang
def test_non_english_is_held_not_a_rejection():
    """A ``language="de"`` payload is expected to come back unaccepted under 'non_english'."""
    s = normalize_scores(_data(language="de"), "m")
    assert s["accepted"] == 0
    assert s["reason_code"] == "non_english"  # distinct bucket, not a calm-filter reject
    assert s["language"] == "de"
    assert "non-English" in s["reason_text"]
def test_missing_or_unknown_language_defaults_to_english():
    """Empty, ``"und"`` and absent language values must all leave the item accepted."""
    # a model hiccup must never silently drop genuine English content
    assert normalize_scores(_data(language=""), "m")["accepted"] == 1
    assert normalize_scores(_data(language="und"), "m")["accepted"] == 1
    assert normalize_scores(_data(), "m")["accepted"] == 1  # no language key at all
def test_non_english_buckets_even_a_content_reject():
    """A non-English payload that was already rejected is re-bucketed as 'non_english'."""
    # a non-English item that was also content-rejected is still 'held', so source
    # metrics can separate language-holds from calm rejections cleanly
    s = normalize_scores(_data(language="es", accepted=False, reason_code="ragebait"), "m")
    assert s["accepted"] == 0
    assert s["reason_code"] == "non_english"
def test_language_persisted_structurally_and_inspector_marks_held():
    """The held language is stored as its own column and surfaces under the 'held' filter.

    Seeds one source and one article in an in-memory database, stores a
    normalized ``language="de"`` score for it, then checks both the raw
    ``article_scores`` row and the ``queries.source_articles`` views.
    """
    c = connect(":memory:")
    init_db(c)
    c.execute("INSERT INTO sources (id,name,feed_url,trust_score) VALUES (1,'S','http://s/f',5)")
    c.execute("INSERT INTO articles (id,source_id,canonical_url,title,url_hash) VALUES (1,1,'http://x','T','h1')")
    c.commit()
    upsert_article_score(c, 1, normalize_scores(_data(language="de"), "m"))

    row = c.execute("SELECT accepted, reason_code, language FROM article_scores WHERE article_id=1").fetchone()
    # structured, not parsed: the values are read straight from dedicated columns
    assert row["language"] == "de"
    assert row["reason_code"] == "non_english"
    assert row["accepted"] == 0

    # inspector: shows under 'held', flagged held=True, and NOT under 'rejected'
    held = queries.source_articles(c, 1, filter="held")
    assert len(held) == 1
    assert held[0]["held"] is True
    assert queries.source_articles(c, 1, filter="rejected") == []