89c0fbe1f6
The deploy pipeline runs from the working tree, so a wave of shipped features
had never been committed. This snapshots git to what's actually running.
SEO impression recovery (live + verified):
- Duplicate /a/{id} pages now 301-redirect to their canonical twin instead of returning 404
(a hard 404 silently dropped already-indexed URLs and tanked impressions).
- Dedup representative selection reworked: accepted/serveable -> established
rep (URL stability) -> quality score, so an accepted page never retires to a
rejected rep and an indexed canonical doesn't churn when a newer twin arrives.
- HEAD /a/{id} returns the same status as GET (api_route GET+HEAD) instead of
falling through to the static mount and 404ing.
- `dedup --force-recluster`: cycle-locked, model-free re-cluster to re-apply the
policy to the existing corpus (shared cycle_lock context manager).
- CLI honors GOODNEWS_DB for its default --db (was silently ignored).
Publishing Desk (admin tool to post highlights to X via Web Intents):
- publishing.py queue/rank/handle-resolution; admin UI; full searchable emoji
picker (bundled data, no CDN) for the blurb editor.
Play games + site:
- Bloom (word-wheel), Memory Match, daily ritual set, Zen Den (dev-gated).
- English-only language gate; source prospecting; paywall + dedup hardening.
Tests: full suite green (349). Ignores tightened (node_modules, data/*.db).
Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
112 lines
4.7 KiB
Python
112 lines
4.7 KiB
Python
import math
|
|
from array import array
|
|
|
|
import pytest
|
|
|
|
from goodnews.db import connect, init_db
|
|
from goodnews.dedup import _day_ordinal, _unit, cluster_duplicates
|
|
|
|
|
|
def test_unit_normalizes_to_length_one():
    """_unit scales the 3-4-5 vector so its components are close to 0.6 and 0.8."""
    normalized = _unit([3.0, 4.0])
    first, second = normalized[0], normalized[1]
    assert math.isclose(first, 0.6)
    assert math.isclose(second, 0.8)
|
|
|
|
|
|
def test_unit_handles_zero_vector():
    """_unit returns an all-zero vector for an all-zero input."""
    result = _unit([0.0, 0.0])
    assert result == [0.0, 0.0]
|
|
|
|
|
|
def test_day_ordinal_parsing():
    """_day_ordinal maps an ISO timestamp to its date ordinal and bad input to 0."""
    from datetime import date

    expected = date(2026, 5, 30).toordinal()
    assert _day_ordinal("2026-05-30T12:00:00+00:00") == expected

    # A missing timestamp and an unparseable string both map to 0.
    for bad_value in (None, "not-a-date"):
        assert _day_ordinal(bad_value) == 0
|
|
|
|
|
|
@pytest.fixture
def conn():
    """Yield an initialised in-memory database seeded with one source row (id 1).

    The connection is closed when the test finishes. Setup runs inside the
    same ``try`` as the ``yield`` so the connection is also closed if
    ``init_db`` or the seed INSERT raises before the test ever starts.
    """
    c = connect(":memory:")
    try:
        init_db(c)
        # Every article inserted by the tests references this source (source_id 1).
        c.execute(
            "INSERT INTO sources (id, name, feed_url, trust_score) VALUES (1, 'S1', 'http://s1/feed', 5)"
        )
        yield c
    finally:
        c.close()
|
|
|
|
|
|
def _add(conn, article_id, vector, constructive, when="2026-05-30T10:00:00+00:00", accepted=1):
    """Insert one article with its score and embedding rows, then commit.

    The article is attached to source 1 with a URL, title and hash derived
    from ``article_id``; ``when`` is stored as ``published_at``. Only
    ``constructive_score`` and ``accepted`` vary in the score row; the other
    score columns are written as 0. ``vector`` is stored as the raw bytes of
    an ``array("f", ...)`` together with its length as ``dim``.
    """
    article_params = (
        article_id,
        f"http://s1/{article_id}",
        f"Title {article_id}",
        when,
        f"hash{article_id}",
    )
    conn.execute(
        "INSERT INTO articles (id, source_id, canonical_url, title, published_at, url_hash) "
        "VALUES (?, 1, ?, ?, ?, ?)",
        article_params,
    )

    score_params = (article_id, constructive, accepted)
    conn.execute(
        "INSERT INTO article_scores (article_id, constructive_score, agency_score, "
        "human_benefit_score, cortisol_score, ragebait_score, pr_risk_score, accepted) "
        "VALUES (?, ?, 0, 0, 0, 0, 0, ?)",
        score_params,
    )

    # Pack the embedding only now, after the first two inserts, as the original did.
    packed = array("f", vector).tobytes()
    conn.execute(
        "INSERT INTO article_embeddings (article_id, vector, dim, model) VALUES (?, ?, ?, 'test')",
        (article_id, packed, len(vector)),
    )

    conn.commit()
|
|
|
|
|
|
def test_near_duplicates_collapse_to_highest_ranked(conn):
    """Two near-identical articles collapse onto the higher-scoring one."""
    # Articles 1 ("A") and 2 ("B") are near-identical; A has the higher
    # constructive score so it wins. Article 3 ("C") is distinct.
    _add(conn, 1, [1.0, 0.0, 0.0, 0.0], constructive=9)
    _add(conn, 2, [0.99, 0.02, 0.0, 0.0], constructive=3)
    _add(conn, 3, [0.0, 1.0, 0.0, 0.0], constructive=8)

    result = cluster_duplicates(conn, threshold=0.86, window_days=3)
    assert result["duplicates"] == 1

    rows = conn.execute("SELECT id, duplicate_of FROM articles")
    parent_of = {row["id"]: row["duplicate_of"] for row in rows}
    assert parent_of[2] == 1  # B points at A
    assert parent_of[1] is None  # A is the representative
    assert parent_of[3] is None  # C stands alone
|
|
|
|
|
|
def test_accepted_member_beats_a_higher_quality_rejected_one(conn):
    """An accepted article becomes the rep even when a rejected twin scores higher."""
    # The representative must be SERVEABLE: an accepted page may never be
    # retired to a rejected representative (that page would 404 with nothing
    # to 301 to). So acceptance outranks the quality score here.
    rejected_id, accepted_id = 1, 2
    _add(conn, rejected_id, [1.0, 0.0, 0.0, 0.0], constructive=9, accepted=0)
    _add(conn, accepted_id, [0.99, 0.02, 0.0, 0.0], constructive=3, accepted=1)

    cluster_duplicates(conn, threshold=0.86, window_days=3)

    rows = conn.execute("SELECT id, duplicate_of FROM articles")
    parent_of = {row["id"]: row["duplicate_of"] for row in rows}
    # The accepted article is the representative (serves 200) ...
    assert parent_of[accepted_id] is None
    # ... and the rejected one points at it.
    assert parent_of[rejected_id] == accepted_id
|
|
|
|
|
|
def test_established_rep_stays_stable_when_a_better_twin_arrives(conn):
    """An existing representative keeps its role when a stronger twin shows up later."""
    # An already-indexed canonical shouldn't churn just because a higher-quality
    # near duplicate arrives afterwards. First establish 1 as rep with follower 3.
    _add(conn, 1, [1.0, 0.0, 0.0, 0.0], constructive=5)
    _add(conn, 3, [0.99, 0.01, 0.0, 0.0], constructive=1)

    # Run 1: article 1 is the representative (score 5 > 1).
    cluster_duplicates(conn, threshold=0.86, window_days=3)
    first_row = conn.execute("SELECT duplicate_of FROM articles WHERE id=1").fetchone()
    assert first_row[0] is None

    # A higher-quality newcomer arrives, then run 2.
    _add(conn, 2, [0.995, 0.01, 0.0, 0.0], constructive=9)
    cluster_duplicates(conn, threshold=0.86, window_days=3)

    rows = conn.execute("SELECT id, duplicate_of FROM articles")
    parent_of = {row["id"]: row["duplicate_of"] for row in rows}
    # The incumbent stays canonical despite 2's higher score (URL stability).
    assert parent_of[1] is None
    assert parent_of[2] == 1
    assert parent_of[3] == 1
|
|
|
|
|
|
def test_distinct_articles_are_not_clustered(conn):
    """Articles with orthogonal embeddings produce no duplicates."""
    orthogonal_vectors = {1: [1.0, 0.0, 0.0, 0.0], 2: [0.0, 1.0, 0.0, 0.0]}
    for article_id, embedding in orthogonal_vectors.items():
        _add(conn, article_id, embedding, constructive=5)

    result = cluster_duplicates(conn, threshold=0.86, window_days=3)
    assert result["duplicates"] == 0
|
|
|
|
|
|
def test_outside_time_window_not_clustered(conn):
    """Identical embeddings published 20 days apart are not clustered with a 3-day window."""
    same_vector = [1.0, 0.0, 0.0, 0.0]
    _add(conn, 1, list(same_vector), constructive=9, when="2026-05-30T10:00:00+00:00")
    _add(conn, 2, list(same_vector), constructive=3, when="2026-05-10T10:00:00+00:00")

    result = cluster_duplicates(conn, threshold=0.86, window_days=3)
    # Identical vectors, but 20 days apart.
    assert result["duplicates"] == 0
|