89c0fbe1f6
The deploy pipeline runs from the working tree, so a wave of shipped features
had never been committed. This snapshots git to what's actually running.
SEO impression recovery (live + verified):
- Duplicate /a/{id} pages now 301-redirect to their canonical twin instead of returning 404
(a hard 404 silently dropped already-indexed URLs and tanked impressions).
- Dedup representative selection reworked: accepted/serveable -> established
rep (URL stability) -> quality score, so an accepted page never retires to a
rejected rep and an indexed canonical doesn't churn when a newer twin arrives.
- HEAD /a/{id} returns the same status as GET (api_route GET+HEAD) instead of
falling through to the static mount and 404ing.
- `dedup --force-recluster`: cycle-locked, model-free re-cluster to re-apply the
policy to the existing corpus (shared cycle_lock context manager).
- CLI honors GOODNEWS_DB for its default --db (was silently ignored).
Publishing Desk (admin tool to post highlights to X via Web Intents):
- publishing.py queue/rank/handle-resolution; admin UI; full searchable emoji
picker (bundled data, no CDN) for the blurb editor.
Play games + site:
- Bloom (word-wheel), Memory Match, daily ritual set, Zen Den (dev-gated).
- English-only language gate; source prospecting; paywall + dedup hardening.
Tests: full suite green (349). Ignores tightened (node_modules, data/*.db).
Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
80 lines
3.6 KiB
Python
80 lines
3.6 KiB
Python
"""Share page /a/{id}: a duplicate article 301-redirects to its canonical twin
|
|
instead of 404ing. A hard 404 silently drops already-indexed URLs from Google, which
tanked impressions when a newer duplicate retired an older, indexed page."""
|
|
import pytest
|
|
from fastapi.testclient import TestClient
|
|
|
|
|
|
@pytest.fixture
def client(tmp_path, monkeypatch):
    """Return the app built on a throwaway SQLite database seeded with share-page cases.

    Despite the fixture's name, the return value is whatever
    ``api.create_app()`` produces; each test wraps it in its own ``TestClient``.
    """
    import importlib

    db_path = str(tmp_path / "t.sqlite3")
    monkeypatch.setenv("GOODNEWS_DB", db_path)
    monkeypatch.setenv("GOODNEWS_PUBLIC_BASE_URL", "https://upbeatbytes.com")

    # The API module is reloaded only after the environment is patched
    # (presumably so module-level configuration picks up the values above).
    import goodnews.api as api

    importlib.reload(api)

    from goodnews.db import connect, init_db

    conn = connect(db_path)
    init_db(conn)
    conn.execute(
        "INSERT INTO sources (id,name,feed_url,trust_score) VALUES (1,'BBC','http://s/f',5)"
    )

    # One row per scenario; the trailing comment is the outcome the tests in
    # this module expect for GET /a/{id}.
    scenarios = (
        # (article id, accepted flag, duplicate_of)
        (10, 1, None),  # canonical, live -> 200
        (11, 1, 10),    # duplicate of a live canonical -> 301 /a/10
        (12, 0, 10),    # REJECTED follower of an accepted rep -> still 301 /a/10
        (20, 0, None),  # rejected canonical
        (21, 1, 20),    # dup of a REJECTED canonical -> 404 (genuinely gone)
        (30, 0, None),  # rejected, not a duplicate -> 404
    )
    for article_id, accepted, duplicate_of in scenarios:
        conn.execute(
            "INSERT INTO articles (id,source_id,canonical_url,title,url_hash,published_at) "
            "VALUES (?,1,?,?,?,'2026-06-05T08:00:00')",
            (
                article_id,
                f"https://bbc.com/{article_id}",
                f"Story {article_id}",
                f"h{article_id}",
            ),
        )
        # The duplicate link is applied as a follow-up UPDATE, only for followers.
        if duplicate_of is not None:
            conn.execute(
                "UPDATE articles SET duplicate_of=? WHERE id=?",
                (duplicate_of, article_id),
            )
        conn.execute(
            "INSERT INTO article_scores (article_id,accepted,reason_text) VALUES (?,?,'x')",
            (article_id, accepted),
        )
        # Every seeded article gets a summary row.
        conn.execute(
            "INSERT INTO article_summaries (article_id,summary) VALUES (?,?)",
            (article_id, f"Summary {article_id}."),
        )

    conn.commit()
    conn.close()
    return api.create_app()
|
|
|
|
|
|
def test_canonical_serves_200(client):
    """GET on the live canonical article answers 200 and the body shows its title."""
    response = TestClient(client).get("/a/10")
    assert response.status_code == 200
    assert "Story 10" in response.text
|
|
|
|
|
|
def test_duplicate_301s_to_canonical(client):
    """GET on a duplicate answers 301 with a Location pointing at its canonical twin."""
    http = TestClient(client)
    # Redirects are not followed so the 301 itself can be inspected.
    response = http.get("/a/11", follow_redirects=False)
    assert response.status_code == 301
    # The duplicate consolidates onto the surviving page.
    assert response.headers["location"] == "/a/10"
|
|
|
|
|
|
def test_rejected_follower_of_accepted_rep_still_301s(client):
    """A rejected duplicate of an ACCEPTED representative redirects instead of 404ing.

    Policy: the route resolves ``duplicate_of`` BEFORE the follower's own
    acceptance. That is intentional: it sends the visitor/crawler to a
    serveable equivalent.
    """
    http = TestClient(client)
    response = http.get("/a/12", follow_redirects=False)
    assert response.status_code == 301
    assert response.headers["location"] == "/a/10"
|
|
|
|
|
|
def test_duplicate_of_rejected_canonical_404s(client):
    """A duplicate whose canonical is rejected answers 404: nothing serveable to redirect to."""
    http = TestClient(client)
    response = http.get("/a/21", follow_redirects=False)
    assert response.status_code == 404
|
|
|
|
|
|
def test_rejected_article_404s(client):
    """GET on a rejected article that is not a duplicate answers 404."""
    http = TestClient(client)
    response = http.get("/a/30")
    assert response.status_code == 404
|
|
|
|
|
|
def test_missing_article_404s(client):
    """GET on an id that was never seeded answers 404."""
    http = TestClient(client)
    response = http.get("/a/9999")
    assert response.status_code == 404
|
|
|
|
|
|
def test_head_matches_get_status(client):
    """HEAD on /a/{id} yields the statuses the GET tests expect.

    HEAD must not fall through to the static mount and 404; some
    crawlers/link-checkers probe with HEAD.
    """
    http = TestClient(client)

    # Canonical article.
    canonical = http.head("/a/10")
    assert canonical.status_code == 200

    # Duplicate: the redirect is inspected rather than followed.
    duplicate = http.head("/a/11", follow_redirects=False)
    assert duplicate.status_code == 301
    assert duplicate.headers["location"] == "/a/10"

    # Missing article.
    missing = http.head("/a/9999")
    assert missing.status_code == 404
|