"""Tests for the helpers and duplicate clustering exposed by ``goodnews.dedup``."""

import math
from array import array

import pytest

from goodnews.db import connect, init_db
from goodnews.dedup import _day_ordinal, _unit, cluster_duplicates


def test_unit_normalizes_to_length_one():
    """``_unit([3, 4])`` yields components 0.6 and 0.8."""
    scaled = _unit([3.0, 4.0])
    assert math.isclose(scaled[0], 0.6)
    assert math.isclose(scaled[1], 0.8)


def test_unit_handles_zero_vector():
    """An all-zero vector comes back unchanged."""
    zero = [0.0, 0.0]
    assert _unit(zero) == [0.0, 0.0]


def test_day_ordinal_parsing():
    """ISO timestamps map to the date's ordinal; ``None`` and junk map to 0."""
    from datetime import date

    expected = date(2026, 5, 30).toordinal()
    assert _day_ordinal("2026-05-30T12:00:00+00:00") == expected
    for unparseable in (None, "not-a-date"):
        assert _day_ordinal(unparseable) == 0


@pytest.fixture
def conn():
    """Yield a ``:memory:`` connection with the schema set up and source 1 seeded.

    The connection is closed once the test using it has finished.
    """
    db = connect(":memory:")
    init_db(db)
    seed_source = (
        "INSERT INTO sources (id, name, feed_url, trust_score) VALUES (1, 'S1', 'http://s1/feed', 5)"
    )
    db.execute(seed_source)
    yield db
    db.close()


def _add(conn, article_id, vector, constructive, when="2026-05-30T10:00:00+00:00", accepted=1):
    """Insert one article plus its score row and embedding row, then commit.

    Args:
        conn: Open database connection (the ``conn`` fixture).
        article_id: Primary key; also used to derive the URL, title and URL hash.
        vector: Embedding components, stored as packed C floats.
        constructive: Value for ``constructive_score``; the other scores are 0.
        when: ``published_at`` timestamp string.
        accepted: Value for the ``accepted`` column of ``article_scores``.
    """
    article_params = (
        article_id,
        f"http://s1/{article_id}",
        f"Title {article_id}",
        when,
        f"hash{article_id}",
    )
    conn.execute(
        "INSERT INTO articles (id, source_id, canonical_url, title, published_at, url_hash) "
        "VALUES (?, 1, ?, ?, ?, ?)",
        article_params,
    )

    score_params = (article_id, constructive, accepted)
    conn.execute(
        "INSERT INTO article_scores (article_id, constructive_score, agency_score, "
        "human_benefit_score, cortisol_score, ragebait_score, pr_risk_score, accepted) "
        "VALUES (?, ?, 0, 0, 0, 0, 0, ?)",
        score_params,
    )

    packed_vector = array("f", vector).tobytes()
    conn.execute(
        "INSERT INTO article_embeddings (article_id, vector, dim, model) VALUES (?, ?, ?, 'test')",
        (article_id, packed_vector, len(vector)),
    )
    conn.commit()


def _cluster(conn):
    """Run ``cluster_duplicates`` with the settings shared by every test here."""
    return cluster_duplicates(conn, threshold=0.86, window_days=3)


def _duplicate_map(conn):
    """Return ``{article id: duplicate_of}`` for every row in ``articles``."""
    rows = conn.execute("SELECT id, duplicate_of FROM articles")
    return {row["id"]: row["duplicate_of"] for row in rows}


def test_near_duplicates_collapse_to_highest_ranked(conn):
    """Of two near-identical articles, the higher constructive score wins."""
    _add(conn, 1, [1.0, 0.0, 0.0, 0.0], constructive=9)  # A: expected representative
    _add(conn, 2, [0.99, 0.02, 0.0, 0.0], constructive=3)  # B: near-copy of A
    _add(conn, 3, [0.0, 1.0, 0.0, 0.0], constructive=8)  # C: unrelated

    stats = _cluster(conn)

    assert stats["duplicates"] == 1
    dup_of = _duplicate_map(conn)
    assert dup_of[2] == 1  # B is folded into A
    assert dup_of[1] is None  # A remains the representative
    assert dup_of[3] is None  # C is left on its own


def test_accepted_member_beats_a_higher_quality_rejected_one(conn):
    """An accepted article is chosen as representative over a rejected twin.

    The representative has to be serveable: retiring an accepted page in favour
    of a rejected one would leave that page returning 404 with nothing to 301
    to. Acceptance therefore outranks the rejected twin's higher quality score.
    """
    _add(conn, 1, [1.0, 0.0, 0.0, 0.0], constructive=9, accepted=0)  # better score, REJECTED
    _add(conn, 2, [0.99, 0.02, 0.0, 0.0], constructive=3, accepted=1)  # worse score, accepted

    _cluster(conn)

    dup_of = _duplicate_map(conn)
    assert dup_of[2] is None  # accepted article is the representative (serves 200)
    assert dup_of[1] == 2  # rejected article points at it


def test_established_rep_stays_stable_when_a_better_twin_arrives(conn):
    """An existing representative keeps its role when a stronger duplicate appears.

    A canonical that is already indexed should not churn merely because a
    higher-quality near duplicate turns up later. Article 1 is first established
    as representative (with follower 3); the stronger article 2 then arrives and
    article 1 must stay canonical for URL stability.
    """
    _add(conn, 1, [1.0, 0.0, 0.0, 0.0], constructive=5)
    _add(conn, 3, [0.99, 0.01, 0.0, 0.0], constructive=1)
    _cluster(conn)  # first run: 1 becomes representative (score 5 > 1)
    assert conn.execute("SELECT duplicate_of FROM articles WHERE id=1").fetchone()[0] is None

    _add(conn, 2, [0.995, 0.01, 0.0, 0.0], constructive=9)  # stronger latecomer
    _cluster(conn)  # second run

    dup_of = _duplicate_map(conn)
    assert dup_of[1] is None  # incumbent stays canonical despite 2's higher score
    assert dup_of[2] == 1 and dup_of[3] == 1


def test_distinct_articles_are_not_clustered(conn):
    """Articles with dissimilar vectors produce no duplicates."""
    _add(conn, 1, [1.0, 0.0, 0.0, 0.0], constructive=5)
    _add(conn, 2, [0.0, 1.0, 0.0, 0.0], constructive=5)

    stats = _cluster(conn)

    assert stats["duplicates"] == 0


def test_outside_time_window_not_clustered(conn):
    """Identical vectors published 20 days apart are not merged with a 3-day window."""
    _add(conn, 1, [1.0, 0.0, 0.0, 0.0], constructive=9, when="2026-05-30T10:00:00+00:00")
    _add(conn, 2, [1.0, 0.0, 0.0, 0.0], constructive=3, when="2026-05-10T10:00:00+00:00")

    stats = _cluster(conn)

    assert stats["duplicates"] == 0  # same vector, but 20 days apart