# upbeatBytes/tests/test_dedup.py

import math
from array import array

import pytest

from goodnews.db import connect, init_db
from goodnews.dedup import _day_ordinal, _unit, cluster_duplicates


def test_unit_normalizes_to_length_one():
    """`_unit` scales the 3-4-5 vector [3, 4] to components 0.6 and 0.8."""
    normalized = _unit([3.0, 4.0])
    # Compare each component separately with a float tolerance.
    assert math.isclose(normalized[0], 0.6)
    assert math.isclose(normalized[1], 0.8)


def test_unit_handles_zero_vector():
    """`_unit` returns all zeros for the zero vector instead of failing."""
    # Presumably guards against dividing by a zero norm -- confirm in goodnews.dedup.
    result = _unit([0.0, 0.0])
    assert result == [0.0, 0.0]


def test_day_ordinal_parsing():
    """`_day_ordinal` maps an ISO timestamp to its date ordinal, and bad input to 0."""
    from datetime import date

    expected = date(2026, 5, 30).toordinal()
    assert _day_ordinal("2026-05-30T12:00:00+00:00") == expected

    # A missing value and an unparseable string both fall back to 0.
    for unusable in (None, "not-a-date"):
        assert _day_ordinal(unusable) == 0


@pytest.fixture
def conn():
    """Yield a fresh ``:memory:`` database connection seeded with one source row.

    The connection is opened with ``connect(":memory:")``, prepared with
    ``init_db`` and given a single ``sources`` row (id 1) that the articles
    inserted by ``_add`` reference through ``source_id``.

    The connection is closed when the test finishes, and also when the setup
    itself raises, so a failing ``init_db`` or seed insert cannot leak it.
    """
    c = connect(":memory:")
    try:
        init_db(c)
        c.execute(
            "INSERT INTO sources (id, name, feed_url, trust_score) VALUES (1, 'S1', 'http://s1/feed', 5)"
        )
        yield c
    finally:
        # Runs for normal teardown and for any error raised during setup.
        c.close()


def _add(conn, article_id, vector, constructive, when="2026-05-30T10:00:00+00:00", accepted=1):
    """Insert one article with its score row and embedding row, then commit.

    Args:
        conn: Connection exposing ``execute`` and ``commit``.
        article_id: Primary key; also used to derive the URL, title and hash.
        vector: Sequence of floats stored as the article embedding.
        constructive: Value for ``constructive_score``; every other score is 0.
        when: Value stored in ``published_at``.
        accepted: Value stored in the ``accepted`` column of the score row.
    """
    # Article row; source_id is always 1.
    article_sql = (
        "INSERT INTO articles (id, source_id, canonical_url, title, published_at, url_hash) "
        "VALUES (?, 1, ?, ?, ?, ?)"
    )
    url = f"http://s1/{article_id}"
    title = f"Title {article_id}"
    url_hash = f"hash{article_id}"
    conn.execute(article_sql, (article_id, url, title, when, url_hash))

    # Score row; only constructive_score and accepted vary between articles.
    score_sql = (
        "INSERT INTO article_scores (article_id, constructive_score, agency_score, "
        "human_benefit_score, cortisol_score, ragebait_score, pr_risk_score, accepted) "
        "VALUES (?, ?, 0, 0, 0, 0, 0, ?)"
    )
    conn.execute(score_sql, (article_id, constructive, accepted))

    # Embedding row; the vector is packed as C floats (array typecode "f").
    embedding_sql = (
        "INSERT INTO article_embeddings (article_id, vector, dim, model) VALUES (?, ?, ?, 'test')"
    )
    packed = array("f", vector).tobytes()
    conn.execute(embedding_sql, (article_id, packed, len(vector)))

    conn.commit()


def test_near_duplicates_collapse_to_highest_ranked(conn):
    """Near-identical articles collapse onto the one with the higher constructive score."""
    # Articles 1 and 2 point in almost the same direction; article 3 is orthogonal to 1.
    _add(conn, 1, [1.0, 0.0, 0.0, 0.0], constructive=9)  # expected representative
    _add(conn, 2, [0.99, 0.02, 0.0, 0.0], constructive=3)  # expected duplicate of 1
    _add(conn, 3, [0.0, 1.0, 0.0, 0.0], constructive=8)  # unrelated story

    result = cluster_duplicates(conn, threshold=0.86, window_days=3)
    assert result["duplicates"] == 1

    # Map every article id to the id it was marked a duplicate of (None for a representative).
    links = {}
    for row in conn.execute("SELECT id, duplicate_of FROM articles"):
        links[row["id"]] = row["duplicate_of"]

    assert links[2] == 1  # lower-scored twin points at article 1
    assert links[1] is None  # article 1 is the representative
    assert links[3] is None  # article 3 is not part of any cluster


def test_accepted_member_beats_a_higher_quality_rejected_one(conn):
    """An accepted article becomes representative even over a higher-scored rejected twin."""
    # The representative has to be serveable. Retiring an accepted page onto a
    # rejected representative would leave that page returning 404 with nothing to
    # 301 to, so acceptance outranks the quality score here.
    _add(conn, 1, [1.0, 0.0, 0.0, 0.0], constructive=9, accepted=0)  # better score, rejected
    _add(conn, 2, [0.99, 0.02, 0.0, 0.0], constructive=3, accepted=1)  # worse score, accepted

    cluster_duplicates(conn, threshold=0.86, window_days=3)

    links = {}
    for row in conn.execute("SELECT id, duplicate_of FROM articles"):
        links[row["id"]] = row["duplicate_of"]

    assert links[2] is None  # accepted article is the representative (serves 200)
    assert links[1] == 2  # rejected article points at it


def test_established_rep_stays_stable_when_a_better_twin_arrives(conn):
    """An existing representative keeps its role when a higher-scored twin shows up later."""
    # A canonical that is already indexed should not churn. First pass: article 1
    # (score 5) becomes representative with article 3 (score 1) as its follower.
    _add(conn, 1, [1.0, 0.0, 0.0, 0.0], constructive=5)
    _add(conn, 3, [0.99, 0.01, 0.0, 0.0], constructive=1)
    cluster_duplicates(conn, threshold=0.86, window_days=3)
    incumbent = conn.execute("SELECT duplicate_of FROM articles WHERE id=1").fetchone()
    assert incumbent[0] is None

    # Second pass: a stronger near-duplicate arrives after article 1 is established.
    _add(conn, 2, [0.995, 0.01, 0.0, 0.0], constructive=9)
    cluster_duplicates(conn, threshold=0.86, window_days=3)

    links = {}
    for row in conn.execute("SELECT id, duplicate_of FROM articles"):
        links[row["id"]] = row["duplicate_of"]

    assert links[1] is None  # incumbent stays canonical for URL stability
    assert links[2] == 1  # the newcomer folds into the incumbent
    assert links[3] == 1  # the original follower is unchanged


def test_distinct_articles_are_not_clustered(conn):
    """Two orthogonal embeddings with equal scores yield no duplicates."""
    orthogonal_vectors = {1: [1.0, 0.0, 0.0, 0.0], 2: [0.0, 1.0, 0.0, 0.0]}
    for article_id, embedding in orthogonal_vectors.items():
        _add(conn, article_id, embedding, constructive=5)

    result = cluster_duplicates(conn, threshold=0.86, window_days=3)
    assert result["duplicates"] == 0


def test_outside_time_window_not_clustered(conn):
    """Identical embeddings published 20 days apart are not merged with a 3-day window."""
    same_vector = [1.0, 0.0, 0.0, 0.0]
    _add(conn, 1, list(same_vector), constructive=9, when="2026-05-30T10:00:00+00:00")
    _add(conn, 2, list(same_vector), constructive=3, when="2026-05-10T10:00:00+00:00")

    result = cluster_duplicates(conn, threshold=0.86, window_days=3)
    # The vectors match exactly, so only the publication gap keeps them apart.
    assert result["duplicates"] == 0