"""Share page ``/a/{id}``: duplicates redirect to their canonical article.

A duplicate article 301-redirects to its canonical twin instead of 404ing. A
hard 404 silently drops already-indexed URLs from Google and tanked impressions
when a newer duplicate retired an older, indexed page.
"""
import importlib

import pytest
from fastapi.testclient import TestClient


def _insert_article(conn, article_id, *, accepted=1, dup=None, summary=True):
    """Insert one article row plus its score (and optionally its summary).

    Args:
        conn: open database connection used for seeding.
        article_id: primary key of the article; also used to derive its URL,
            title, URL hash and summary text.
        accepted: value stored in ``article_scores.accepted`` (1 or 0).
        dup: id stored in ``articles.duplicate_of``; ``None`` leaves the
            column untouched.
        summary: when true, also insert an ``article_summaries`` row.
    """
    conn.execute(
        "INSERT INTO articles (id,source_id,canonical_url,title,url_hash,published_at) "
        "VALUES (?,1,?,?,?,'2026-06-05T08:00:00')",
        (article_id, f"https://bbc.com/{article_id}", f"Story {article_id}", f"h{article_id}"),
    )
    if dup is not None:
        # Set in a second statement, after the row exists.
        conn.execute("UPDATE articles SET duplicate_of=? WHERE id=?", (dup, article_id))
    conn.execute(
        "INSERT INTO article_scores (article_id,accepted,reason_text) VALUES (?,?,'x')",
        (article_id, accepted),
    )
    if summary:
        conn.execute(
            "INSERT INTO article_summaries (article_id,summary) VALUES (?,?)",
            (article_id, f"Summary {article_id}."),
        )


@pytest.fixture
def client(tmp_path, monkeypatch):
    """Return the app from ``api.create_app()`` backed by a seeded temp database.

    Despite the fixture name, the returned value is the application object;
    each test wraps it in its own ``TestClient``.

    Seeded articles:
        10  accepted canonical
        11  accepted duplicate of 10
        12  rejected duplicate of 10
        20  rejected canonical
        21  accepted duplicate of 20
        30  rejected, not a duplicate
    """
    db_path = tmp_path / "t.sqlite3"
    monkeypatch.setenv("GOODNEWS_DB", str(db_path))
    monkeypatch.setenv("GOODNEWS_PUBLIC_BASE_URL", "https://upbeatbytes.com")

    # Imported and reloaded only after the environment is patched.
    # NOTE(review): presumably goodnews.api reads these variables at import
    # time -- confirm against the module.
    import goodnews.api as api
    importlib.reload(api)
    from goodnews.db import connect, init_db

    conn = connect(str(db_path))
    try:
        init_db(conn)
        conn.execute(
            "INSERT INTO sources (id,name,feed_url,trust_score) VALUES (1,'BBC','http://s/f',5)"
        )
        _insert_article(conn, 10)                      # canonical, live -> 200
        _insert_article(conn, 11, dup=10)              # duplicate of a live canonical -> 301 /a/10
        _insert_article(conn, 12, accepted=0, dup=10)  # REJECTED follower of an accepted rep -> still 301 /a/10
        _insert_article(conn, 20, accepted=0)          # rejected canonical
        _insert_article(conn, 21, dup=20)              # dup of a REJECTED canonical -> 404 (genuinely gone)
        _insert_article(conn, 30, accepted=0)          # rejected, not a duplicate -> 404
        conn.commit()
    finally:
        # Release the seeding connection even when a statement above fails, so
        # a broken schema does not also leave the temp database file open.
        conn.close()
    return api.create_app()


# Every request below passes follow_redirects=False: these tests exist to tell
# 200, 301 and 404 apart, and a followed redirect would hide a wrong 301 behind
# the status of whatever page it lands on.


def test_canonical_serves_200(client):
    resp = TestClient(client).get("/a/10", follow_redirects=False)
    assert resp.status_code == 200
    assert "Story 10" in resp.text


def test_duplicate_301s_to_canonical(client):
    resp = TestClient(client).get("/a/11", follow_redirects=False)
    assert resp.status_code == 301
    assert resp.headers["location"] == "/a/10"  # consolidates onto the survivor


def test_rejected_follower_of_accepted_rep_still_301s(client):
    # Policy: the route resolves duplicate_of BEFORE the follower's own acceptance, so a
    # rejected article that points at an ACCEPTED representative 301s to it rather than
    # 404ing. That's intentional -- it sends the visitor/crawler to a serveable equivalent.
    resp = TestClient(client).get("/a/12", follow_redirects=False)
    assert resp.status_code == 301
    assert resp.headers["location"] == "/a/10"


def test_duplicate_of_rejected_canonical_404s(client):
    resp = TestClient(client).get("/a/21", follow_redirects=False)
    assert resp.status_code == 404  # nothing serveable to redirect to


def test_rejected_article_404s(client):
    resp = TestClient(client).get("/a/30", follow_redirects=False)
    assert resp.status_code == 404


def test_missing_article_404s(client):
    resp = TestClient(client).get("/a/9999", follow_redirects=False)
    assert resp.status_code == 404


def test_head_matches_get_status(client):
    # HEAD must return the same status as GET (not fall through to the static mount and
    # 404). Some crawlers/link-checkers probe with HEAD.
    http = TestClient(client)

    canonical = http.head("/a/10", follow_redirects=False)
    assert canonical.status_code == 200

    duplicate = http.head("/a/11", follow_redirects=False)
    assert duplicate.status_code == 301
    assert duplicate.headers["location"] == "/a/10"

    missing = http.head("/a/9999", follow_redirects=False)
    assert missing.status_code == 404