Files
upbeatBytes/tests/test_dedup.py
T
thejayman77 9cdcda5e02 Durability pass: tests, clearer diversity/classify behavior, Calm Filters foundation
- Add pytest suite (34 tests) covering scoring thresholds, dedup clustering +
  representative selection + time window, brief source/category diversity,
  avoid-term phrase matching, and text canonicalization/truncation.
- Rewrite _select_diverse with an explicit, tested contract (best-first, one
  per source, backfill, then inject a second category by evicting the
  lowest-ranked pick).
- classify_articles now returns attempted/succeeded/skipped (ClassifyReport) so
  silent model failures are visible in both the cycle and classify output.
- Fix clean_text truncation to stay within max_len (ellipsis no longer
  overshoots).
- New filters.py: canonical FilterPrefs shape (include/mute topics+flavors,
  avoid_terms, pauses) and pure word/phrase-boundary matching engine seeding
  Calm Filters. Not yet wired into the API.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
2026-05-30 19:07:31 +00:00

84 lines
2.9 KiB
Python

import math
from array import array
import pytest
from goodnews.db import connect, init_db
from goodnews.dedup import _day_ordinal, _unit, cluster_duplicates
def test_unit_normalizes_to_length_one():
    """``_unit`` scales the 3-4-5 vector to (0.6, 0.8), which has length one."""
    u = _unit([3.0, 4.0])
    # One assert per component so a failure names the value that is off.
    assert math.isclose(u[0], 0.6)
    assert math.isclose(u[1], 0.8)
    # The property the test name promises: the Euclidean norm of the result is 1.
    assert math.isclose(math.sqrt(sum(x * x for x in u)), 1.0)
def test_unit_handles_zero_vector():
    """Normalizing the all-zero vector must give back an all-zero vector."""
    normalized = _unit([0.0, 0.0])
    assert normalized == [0.0, 0.0]
def test_day_ordinal_parsing():
    """``_day_ordinal`` maps an ISO timestamp to its date ordinal and unusable input to 0."""
    from datetime import date

    expected = date(2026, 5, 30).toordinal()
    assert _day_ordinal("2026-05-30T12:00:00+00:00") == expected
    # A missing timestamp and an unparseable one are both expected to yield 0.
    for unusable in (None, "not-a-date"):
        assert _day_ordinal(unusable) == 0
@pytest.fixture
def conn():
    """Yield an in-memory database prepared by ``init_db`` and seeded with source id 1.

    The connection is closed in a ``finally`` block so it is released even
    when ``init_db`` or the seed insert raises before the test body runs;
    statements placed after a bare ``yield`` would be skipped in that case.
    """
    c = connect(":memory:")
    try:
        init_db(c)
        # Single source row (id 1) for the article rows the tests insert.
        c.execute(
            "INSERT INTO sources (id, name, feed_url, trust_score) VALUES (1, 'S1', 'http://s1/feed', 5)"
        )
        yield c
    finally:
        c.close()
def _add(conn, article_id, vector, constructive, when="2026-05-30T10:00:00+00:00"):
    """Insert one accepted article for source 1, plus its score and embedding rows.

    Args:
        conn: Open database connection exposing ``execute`` and ``commit``.
        article_id: Row id of the article; also used to derive its URL, title
            and URL hash so every inserted article is unique.
        vector: Embedding components. Any iterable of numbers is accepted
            (list, tuple, generator); it is stored as packed 32-bit floats.
        constructive: Value written to ``constructive_score``; the remaining
            score columns are written as 0 and ``accepted`` as 1.
        when: Timestamp string written to ``published_at``.
    """
    # Pack once, before touching the database: ``dim`` is taken from the packed
    # array so it always matches the stored blob, one-shot iterables work, and
    # a bad vector fails before any row has been inserted.
    packed = array("f", vector)
    conn.execute(
        "INSERT INTO articles (id, source_id, canonical_url, title, published_at, url_hash) "
        "VALUES (?, 1, ?, ?, ?, ?)",
        (article_id, f"http://s1/{article_id}", f"Title {article_id}", when, f"hash{article_id}"),
    )
    conn.execute(
        "INSERT INTO article_scores (article_id, constructive_score, agency_score, "
        "human_benefit_score, cortisol_score, ragebait_score, pr_risk_score, accepted) "
        "VALUES (?, ?, 0, 0, 0, 0, 0, 1)",
        (article_id, constructive),
    )
    conn.execute(
        "INSERT INTO article_embeddings (article_id, vector, dim, model) VALUES (?, ?, ?, 'test')",
        (article_id, packed.tobytes(), len(packed)),
    )
    conn.commit()
def test_near_duplicates_collapse_to_highest_ranked(conn):
    """Of two near-identical articles, the one with the higher constructive score is kept."""
    # (article id, embedding, constructive score)
    seeded = (
        (1, [1.0, 0.0, 0.0, 0.0], 9),    # A: expected representative
        (2, [0.99, 0.02, 0.0, 0.0], 3),  # B: near-identical to A, lower score
        (3, [0.0, 1.0, 0.0, 0.0], 8),    # C: distinct from both
    )
    for article_id, embedding, score in seeded:
        _add(conn, article_id, embedding, constructive=score)

    stats = cluster_duplicates(conn, threshold=0.86, window_days=3)
    assert stats["duplicates"] == 1

    # Map every article id to the representative it was folded into (if any).
    dup_of = {}
    for row in conn.execute("SELECT id, duplicate_of FROM articles"):
        dup_of[row["id"]] = row["duplicate_of"]
    assert dup_of[2] == 1  # B points at A
    assert dup_of[1] is None  # A is the representative
    assert dup_of[3] is None  # C stands alone
def test_distinct_articles_are_not_clustered(conn):
    """Two articles with orthogonal embeddings produce no duplicates."""
    orthogonal = ((1, [1.0, 0.0, 0.0, 0.0]), (2, [0.0, 1.0, 0.0, 0.0]))
    for article_id, embedding in orthogonal:
        _add(conn, article_id, embedding, constructive=5)
    outcome = cluster_duplicates(conn, threshold=0.86, window_days=3)
    assert outcome["duplicates"] == 0
def test_outside_time_window_not_clustered(conn):
    """Identical embeddings published 20 days apart are not merged with a 3-day window."""
    # (article id, constructive score, published_at)
    articles = (
        (1, 9, "2026-05-30T10:00:00+00:00"),
        (2, 3, "2026-05-10T10:00:00+00:00"),
    )
    for article_id, score, published in articles:
        _add(conn, article_id, [1.0, 0.0, 0.0, 0.0], constructive=score, when=published)
    outcome = cluster_duplicates(conn, threshold=0.86, window_days=3)
    # Same vector for both rows; only the publication gap keeps them apart.
    assert outcome["duplicates"] == 0