def _select_diverse(rows: list[sqlite3.Row], limit: int) -> list[sqlite3.Row]:
    """Pick up to `limit` items from `rows` (already ranked best-first).

    Contract:
      1. Prefer higher-ranked items.
      2. Source diversity: take at most one item per source while other
         sources remain; only repeat a source once distinct sources are
         exhausted.
      3. Category diversity: if the result ended up single-category and a
         different category is available in the pool, swap the
         highest-ranked off-category candidate into the last slot. The swap
         is only attempted when at least two items were selected: with a
         single slot no breadth can be gained, so the top-ranked pick is
         kept as is.

    Args:
        rows: Candidate rows, best first. Each row must support ``[]``
            access for ``"id"``, ``"source_name"`` and ``"default_category"``.
        limit: Maximum number of rows to return. Zero or negative yields an
            empty list.

    Returns:
        A new list with at most `limit` rows; `rows` itself is not modified.
    """
    # Nothing to pick. This also keeps the category swap below from indexing
    # into an empty selection.
    if limit <= 0:
        return []

    selected: list[sqlite3.Row] = []
    seen_sources: set = set()

    # Pass 1: best-first, one per source.
    for row in rows:
        if len(selected) >= limit:
            break
        if row["source_name"] in seen_sources:
            continue
        selected.append(row)
        seen_sources.add(row["source_name"])

    # Pass 2: if short on distinct sources, backfill best-first regardless.
    if len(selected) < limit:
        selected_ids = {row["id"] for row in selected}
        for row in rows:
            if len(selected) >= limit:
                break
            if row["id"] in selected_ids:
                continue
            selected.append(row)
            selected_ids.add(row["id"])

    # Pass 3: ensure >= 2 categories when the pool allows it. Requires at
    # least two picks; replacing a lone pick would only trade the best item
    # for a worse one without adding a second category.
    categories = {row["default_category"] for row in selected}
    if len(selected) >= 2 and len(categories) < 2:
        selected_ids = {row["id"] for row in selected}
        for row in rows:
            if row["id"] in selected_ids:
                continue
            if row["default_category"] not in categories:
                # Overwrite the last slot: the lowest-ranked pass-1 pick, or
                # a repeat-source backfill item when pass 2 ran.
                selected[-1] = row
                break

    return selected
endpoint down, timeout, etc. — keep going print(f"classify: skipped ({exc})", flush=True) diff --git a/goodnews/filters.py b/goodnews/filters.py new file mode 100644 index 0000000..96259d9 --- /dev/null +++ b/goodnews/filters.py @@ -0,0 +1,123 @@ +"""Calm Filters — the canonical preference model and pure matching engine. + +Everything (localStorage today, query params on the API, a user_preferences row +later) speaks this one shape, so the surfaces never drift. The functions here are +deliberately pure and side-effect-free so they are easy to test and reuse from +both the API and the CLI. + +The humane surface ("Not today" / "Less like this" / "Always hide this") maps onto +this machinery: a pause is a topic/flavor muted *until* a timestamp; a mute is a +standing exclusion; avoid-terms drop anything mentioning a phrase the reader would +rather not see. +""" + +from __future__ import annotations + +import re +from dataclasses import dataclass, field +from datetime import datetime + +# Split on any run of non-alphanumerics so matching is punctuation- and +# case-insensitive, and anchored to whole words/phrases (no substring surprises: +# "pan" must not match "pandemic", and "stock market" matches as a phrase). 
+_NONWORD = re.compile(r"[^a-z0-9]+") + + +def _normalize(text: str) -> str: + """Lowercase, collapse non-alphanumerics to single spaces, pad with spaces.""" + return " " + _NONWORD.sub(" ", text.lower()).strip() + " " + + +def text_matches_avoid_terms(text: str | None, terms: list[str]) -> bool: + """True if text contains any avoid term as a whole word or phrase.""" + if not text or not terms: + return False + haystack = _normalize(text) + for term in terms: + needle = _normalize(term).strip() + if needle and f" {needle} " in haystack: + return True + return False + + +@dataclass +class Pause: + kind: str # "topic" or "flavor" + value: str + until: str # ISO 8601 UTC timestamp + + def active(self, now: datetime) -> bool: + try: + until = datetime.fromisoformat(self.until.replace("Z", "+00:00")) + except (ValueError, AttributeError): + return False + return until > now + + +@dataclass +class FilterPrefs: + include_topics: list[str] = field(default_factory=list) + include_flavors: list[str] = field(default_factory=list) + mute_topics: list[str] = field(default_factory=list) + mute_flavors: list[str] = field(default_factory=list) + avoid_terms: list[str] = field(default_factory=list) + pauses: list[Pause] = field(default_factory=list) + + @classmethod + def from_dict(cls, data: dict | None) -> "FilterPrefs": + data = data or {} + return cls( + include_topics=list(data.get("include_topics") or []), + include_flavors=list(data.get("include_flavors") or []), + mute_topics=list(data.get("mute_topics") or []), + mute_flavors=list(data.get("mute_flavors") or []), + avoid_terms=list(data.get("avoid_terms") or []), + pauses=[Pause(**p) for p in (data.get("pauses") or [])], + ) + + def muted_topics(self, now: datetime) -> set[str]: + """Standing mutes plus any topic currently paused.""" + muted = set(self.mute_topics) + muted |= {p.value for p in self.pauses if p.kind == "topic" and p.active(now)} + return muted + + def muted_flavors(self, now: datetime) -> set[str]: + muted 
= set(self.mute_flavors) + muted |= {p.value for p in self.pauses if p.kind == "flavor" and p.active(now)} + return muted + + def is_empty(self) -> bool: + return not ( + self.include_topics + or self.include_flavors + or self.mute_topics + or self.mute_flavors + or self.avoid_terms + or self.pauses + ) + + +def allows(article: dict, prefs: FilterPrefs, now: datetime) -> bool: + """True if an article (a feed/brief row dict) survives the preferences.""" + topic = article.get("topic") + flavor = article.get("flavor") + + if prefs.include_topics and topic not in prefs.include_topics: + return False + if prefs.include_flavors and flavor not in prefs.include_flavors: + return False + if topic in prefs.muted_topics(now): + return False + if flavor in prefs.muted_flavors(now): + return False + blob = f"{article.get('title') or ''} {article.get('description') or ''}" + if text_matches_avoid_terms(blob, prefs.avoid_terms): + return False + return True + + +def filter_articles(articles: list[dict], prefs: FilterPrefs, now: datetime) -> list[dict]: + """Apply preferences to a list of article rows, preserving order.""" + if prefs.is_empty(): + return articles + return [a for a in articles if allows(a, prefs, now)] diff --git a/goodnews/llm.py b/goodnews/llm.py index 4b89fce..acf724f 100644 --- a/goodnews/llm.py +++ b/goodnews/llm.py @@ -220,6 +220,14 @@ class LocalModelClient: return parse_classifier_json(content) +@dataclass +class ClassifyReport: + results: list[tuple[int, dict]] + attempted: int + succeeded: int + skipped: int + + def classify_articles( conn: sqlite3.Connection, client: LocalModelClient, @@ -228,17 +236,19 @@ def classify_articles( dry_run: bool = False, only_unclassified: bool = False, progress: "Callable[[int, int, int], None] | None" = None, -) -> list[tuple[int, dict]]: +) -> ClassifyReport: rows = _classification_candidates( conn, limit=limit, include_rejected=include_rejected, only_unclassified=only_unclassified ) results = [] + skipped = 0 for index, 
row in enumerate(rows, start=1): try: scores = client.classify(row) except RuntimeError as exc: # One slow/failed article (timeout, bad response) shouldn't sink the # whole batch or discard work already committed. Skip and continue. + skipped += 1 print(f"[{row['id']}] skipped: {exc}") continue scores = normalize_scores(scores, model_name=client.model) @@ -248,7 +258,7 @@ def classify_articles( conn.commit() if progress is not None: progress(index, len(rows), row["id"]) - return results + return ClassifyReport(results=results, attempted=len(rows), succeeded=len(results), skipped=skipped) def parse_classifier_json(content: str) -> dict: diff --git a/goodnews/text.py b/goodnews/text.py index 54bd3d4..1242048 100644 --- a/goodnews/text.py +++ b/goodnews/text.py @@ -26,7 +26,8 @@ def clean_text(value: str | None, max_len: int = 1000) -> str | None: text = html.unescape(text) text = WHITESPACE_RE.sub(" ", text).strip() if len(text) > max_len: - return text[: max_len - 1].rstrip() + "..." + # Keep the ellipsis inside max_len rather than overshooting by 3. + return text[: max_len - 3].rstrip() + "..." return text or None diff --git a/pyproject.toml b/pyproject.toml index 22eab94..1e90a1f 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -16,6 +16,9 @@ web = [ "fastapi>=0.110", "uvicorn[standard]>=0.29", ] +test = [ + "pytest>=8", +] [project.scripts] goodnews = "goodnews.cli:main" diff --git a/tests/test_briefs.py b/tests/test_briefs.py new file mode 100644 index 0000000..a601879 --- /dev/null +++ b/tests/test_briefs.py @@ -0,0 +1,50 @@ +from goodnews.briefs import _select_diverse + + +def row(id, source, category): + # _select_diverse only reads these three keys; plain dicts support [] access. 
+ return {"id": id, "source_name": source, "default_category": category} + + +def test_prefers_distinct_sources_best_first(): + rows = [ + row(1, "A", "science"), + row(2, "A", "science"), # same source as #1 — should be skipped while others remain + row(3, "B", "science"), + row(4, "C", "environment"), + ] + selected = _select_diverse(rows, limit=3) + ids = [r["id"] for r in selected] + assert ids == [1, 3, 4] # one per source, ranked order preserved + + +def test_backfills_when_sources_exhausted(): + rows = [row(1, "A", "science"), row(2, "A", "science"), row(3, "A", "science")] + selected = _select_diverse(rows, limit=2) + assert len(selected) == 2 # repeats source A only because no others exist + + +def test_injects_second_category_without_shrinking(): + rows = [ + row(1, "A", "science"), + row(2, "B", "science"), + row(3, "C", "science"), + row(4, "D", "environment"), # the only other category, lowest ranked + ] + selected = _select_diverse(rows, limit=3) + cats = {r["default_category"] for r in selected} + assert len(selected) == 3 + assert len(cats) >= 2 # environment swapped in for diversity + assert any(r["default_category"] == "environment" for r in selected) + + +def test_keeps_single_category_when_no_alternative_exists(): + rows = [row(1, "A", "science"), row(2, "B", "science"), row(3, "C", "science")] + selected = _select_diverse(rows, limit=3) + assert len(selected) == 3 + assert {r["default_category"] for r in selected} == {"science"} + + +def test_never_returns_more_than_limit(): + rows = [row(i, f"S{i}", "science") for i in range(10)] + assert len(_select_diverse(rows, limit=5)) == 5 diff --git a/tests/test_dedup.py b/tests/test_dedup.py new file mode 100644 index 0000000..c4de502 --- /dev/null +++ b/tests/test_dedup.py @@ -0,0 +1,83 @@ +import math +from array import array + +import pytest + +from goodnews.db import connect, init_db +from goodnews.dedup import _day_ordinal, _unit, cluster_duplicates + + +def test_unit_normalizes_to_length_one(): + 
u = _unit([3.0, 4.0]) + assert math.isclose(u[0], 0.6) and math.isclose(u[1], 0.8) + + +def test_unit_handles_zero_vector(): + assert _unit([0.0, 0.0]) == [0.0, 0.0] + + +def test_day_ordinal_parsing(): + from datetime import date + + assert _day_ordinal("2026-05-30T12:00:00+00:00") == date(2026, 5, 30).toordinal() + assert _day_ordinal(None) == 0 + assert _day_ordinal("not-a-date") == 0 + + +@pytest.fixture +def conn(): + c = connect(":memory:") + init_db(c) + c.execute( + "INSERT INTO sources (id, name, feed_url, trust_score) VALUES (1, 'S1', 'http://s1/feed', 5)" + ) + yield c + c.close() + + +def _add(conn, article_id, vector, constructive, when="2026-05-30T10:00:00+00:00"): + conn.execute( + "INSERT INTO articles (id, source_id, canonical_url, title, published_at, url_hash) " + "VALUES (?, 1, ?, ?, ?, ?)", + (article_id, f"http://s1/{article_id}", f"Title {article_id}", when, f"hash{article_id}"), + ) + conn.execute( + "INSERT INTO article_scores (article_id, constructive_score, agency_score, " + "human_benefit_score, cortisol_score, ragebait_score, pr_risk_score, accepted) " + "VALUES (?, ?, 0, 0, 0, 0, 0, 1)", + (article_id, constructive), + ) + conn.execute( + "INSERT INTO article_embeddings (article_id, vector, dim, model) VALUES (?, ?, ?, 'test')", + (article_id, array("f", vector).tobytes(), len(vector)), + ) + conn.commit() + + +def test_near_duplicates_collapse_to_highest_ranked(conn): + # A and B are near-identical; A has the higher constructive score so it wins. 
+ _add(conn, 1, [1.0, 0.0, 0.0, 0.0], constructive=9) # A (rep) + _add(conn, 2, [0.99, 0.02, 0.0, 0.0], constructive=3) # B (dup of A) + _add(conn, 3, [0.0, 1.0, 0.0, 0.0], constructive=8) # C (distinct) + + stats = cluster_duplicates(conn, threshold=0.86, window_days=3) + assert stats["duplicates"] == 1 + + dup_of = {r["id"]: r["duplicate_of"] for r in conn.execute("SELECT id, duplicate_of FROM articles")} + assert dup_of[2] == 1 # B points at A + assert dup_of[1] is None # A is representative + assert dup_of[3] is None # C stands alone + + +def test_distinct_articles_are_not_clustered(conn): + _add(conn, 1, [1.0, 0.0, 0.0, 0.0], constructive=5) + _add(conn, 2, [0.0, 1.0, 0.0, 0.0], constructive=5) + stats = cluster_duplicates(conn, threshold=0.86, window_days=3) + assert stats["duplicates"] == 0 + + +def test_outside_time_window_not_clustered(conn): + _add(conn, 1, [1.0, 0.0, 0.0, 0.0], constructive=9, when="2026-05-30T10:00:00+00:00") + _add(conn, 2, [1.0, 0.0, 0.0, 0.0], constructive=3, when="2026-05-10T10:00:00+00:00") + stats = cluster_duplicates(conn, threshold=0.86, window_days=3) + assert stats["duplicates"] == 0 # identical vectors, but 20 days apart diff --git a/tests/test_filters.py b/tests/test_filters.py new file mode 100644 index 0000000..8dfda55 --- /dev/null +++ b/tests/test_filters.py @@ -0,0 +1,88 @@ +from datetime import datetime, timezone + +from goodnews.filters import ( + FilterPrefs, + Pause, + filter_articles, + text_matches_avoid_terms, +) + +NOW = datetime(2026, 6, 1, tzinfo=timezone.utc) + + +def art(topic="science", flavor="discovery", title="A calm discovery", description=""): + return {"topic": topic, "flavor": flavor, "title": title, "description": description} + + +# --- avoid-term matching: the trust-critical pure function --- + +def test_single_word_matches_whole_word_only(): + assert text_matches_avoid_terms("New cancer drug approved", ["cancer"]) + assert not text_matches_avoid_terms("Cancerous growth studied", ["cancer"]) + + 
+def test_substring_does_not_match(): + # "pan" must not match "pandemic" + assert not text_matches_avoid_terms("Pandemic preparedness improves", ["pan"]) + + +def test_phrase_matches_as_phrase(): + assert text_matches_avoid_terms("The stock market crashed today", ["stock market"]) + assert not text_matches_avoid_terms("Stocks and other markets", ["stock market"]) + + +def test_punctuation_and_case_normalized(): + assert text_matches_avoid_terms("An Anti-Aging breakthrough", ["anti aging"]) + assert text_matches_avoid_terms("ELECTION results", ["election"]) + + +def test_empty_inputs_are_safe(): + assert not text_matches_avoid_terms("", ["cancer"]) + assert not text_matches_avoid_terms("anything", []) + assert not text_matches_avoid_terms(None, ["cancer"]) + + +# --- filter_articles over the canonical prefs --- + +def test_empty_prefs_pass_everything_through(): + items = [art(), art(topic="health")] + assert filter_articles(items, FilterPrefs(), NOW) == items + + +def test_mute_topic_drops_matching_articles(): + items = [art(topic="science"), art(topic="health")] + prefs = FilterPrefs.from_dict({"mute_topics": ["health"]}) + out = filter_articles(items, prefs, NOW) + assert [a["topic"] for a in out] == ["science"] + + +def test_include_topics_keeps_only_those(): + items = [art(topic="science"), art(topic="animals"), art(topic="health")] + prefs = FilterPrefs.from_dict({"include_topics": ["science", "animals"]}) + out = filter_articles(items, prefs, NOW) + assert {a["topic"] for a in out} == {"science", "animals"} + + +def test_avoid_terms_match_title_and_description(): + items = [art(title="Update on the election"), art(description="about an election too"), art()] + prefs = FilterPrefs.from_dict({"avoid_terms": ["election"]}) + out = filter_articles(items, prefs, NOW) + assert len(out) == 1 + + +def test_active_pause_hides_topic_but_expired_does_not(): + items = [art(topic="health")] + active = FilterPrefs.from_dict( + {"pauses": [{"kind": "topic", "value": 
"health", "until": "2026-06-02T00:00:00Z"}]} + ) + expired = FilterPrefs.from_dict( + {"pauses": [{"kind": "topic", "value": "health", "until": "2026-05-01T00:00:00Z"}]} + ) + assert filter_articles(items, active, NOW) == [] + assert filter_articles(items, expired, NOW) == items + + +def test_pause_active_helper(): + assert Pause("topic", "health", "2026-06-02T00:00:00Z").active(NOW) + assert not Pause("topic", "health", "2026-05-01T00:00:00Z").active(NOW) + assert not Pause("topic", "health", "garbage").active(NOW) diff --git a/tests/test_scoring.py b/tests/test_scoring.py new file mode 100644 index 0000000..c49a6e6 --- /dev/null +++ b/tests/test_scoring.py @@ -0,0 +1,48 @@ +from goodnews.scoring import score_article + + +def test_constructive_story_is_accepted(): + s = score_article("Community volunteers restore creek habitat", "A hopeful recovery effort", 3) + assert s["accepted"] == 1 + assert s["constructive_score"] >= 5 + assert s["reason_code"] == "heuristic_constructive_candidate" + + +def test_neutral_story_needs_review(): + s = score_article("The weather report for tomorrow", None, 3) + assert s["accepted"] == 0 + assert s["reason_code"] == "heuristic_needs_review" + + +def test_cortisol_heavy_is_rejected(): + s = score_article("War and death as murder and attack escalate", None, 3) + assert s["accepted"] == 0 + assert s["cortisol_score"] > 5 + assert s["reason_code"] == "heuristic_reject_cortisol_heavy" + + +def test_ragebait_is_rejected_before_cortisol(): + s = score_article("Senator slams rival and sparks backlash", None, 3) + assert s["accepted"] == 0 + assert s["ragebait_score"] > 3 + assert s["reason_code"] == "heuristic_reject_ragebait_language" + + +def test_pr_risk_from_source_and_terms_rejects(): + s = score_article("Startup announces funding round and unveils brand", None, 6) + assert s["pr_risk_score"] > 7 + assert s["accepted"] == 0 + + +def test_all_scores_within_bounds(): + s = score_article("breakthrough cure restores hope " * 10, 
"progress " * 20, 3) + for key in ( + "constructive_score", + "cortisol_score", + "ragebait_score", + "agency_score", + "human_benefit_score", + "novelty_score", + "pr_risk_score", + ): + assert 0 <= s[key] <= 10, key diff --git a/tests/test_text.py b/tests/test_text.py new file mode 100644 index 0000000..76a595a --- /dev/null +++ b/tests/test_text.py @@ -0,0 +1,36 @@ +from goodnews.text import canonicalize_url, clean_text, sha256_text + + +def test_clean_text_strips_tags_and_entities(): + assert clean_text("
Hello& world
") == "Hello& world" + + +def test_clean_text_truncates(): + out = clean_text("x" * 50, max_len=10) + assert out.endswith("...") and len(out) <= 10 + + +def test_clean_text_empty_is_none(): + assert clean_text("") is None + assert clean_text(None) is None + + +def test_canonicalize_strips_tracking_params(): + url = "https://Example.com/story?utm_source=x&id=7&fbclid=abc" + out = canonicalize_url(url) + assert "utm_source" not in out and "fbclid" not in out + assert "id=7" in out + assert out.startswith("https://example.com") # scheme/host lowercased + + +def test_canonicalize_sorts_query_for_stable_hash(): + a = canonicalize_url("https://e.com/p?b=2&a=1") + b = canonicalize_url("https://e.com/p?a=1&b=2") + assert a == b + assert sha256_text(a) == sha256_text(b) + + +def test_canonicalize_rejects_non_http(): + assert canonicalize_url("ftp://e.com/x") is None + assert canonicalize_url("javascript:alert(1)") is None + assert canonicalize_url(None) is None