Add semantic cross-source dedup via local embeddings

- LocalModelClient.embed() calls the OpenAI-compatible /embeddings endpoint
  (local nomic model); base_url shared with chat, model via GOODNEWS_EMBED_MODEL.
- New article_embeddings table and articles.duplicate_of column (+ migration).
- dedup module: embeds missing articles, clusters near-identical stories within
  a date window by cosine similarity (pure-stdlib, vectors normalised once), and
  marks all but the highest-ranked member of each cluster as a duplicate.
- 'dedup' CLI command; cycle now runs poll -> classify -> dedup -> brief.
- Feed and brief queries hide duplicates, so a story carried by multiple
  outlets shows once.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
This commit is contained in:
jay
2026-05-30 15:40:55 +00:00
parent 2a9c49e2a9
commit 5d44072fca
7 changed files with 259 additions and 4 deletions
+15 -1
View File
@@ -15,6 +15,8 @@ python3 -m goodnews poll --limit 3
python3 -m goodnews rescore
python3 -m goodnews check-llm --base-url http://127.0.0.1:1234/v1 --model gpt-oss
python3 -m goodnews classify --limit 10 --base-url http://127.0.0.1:1234/v1 --model gpt-oss
python3 -m goodnews dedup --base-url http://127.0.0.1:1234/v1
python3 -m goodnews check-feeds
python3 -m goodnews build-brief --date 2026-05-27 --replace
python3 -m goodnews show-brief
python3 -m goodnews list-recent --limit 10
@@ -49,6 +51,18 @@ and one **flavor**, allowing browsable category feeds (e.g. "feel-good animals",
The allowed values live in `goodnews/taxonomy.py`. The accept/reject gate is kept
deliberately broad ("not dreary"); ranking and category filters do the curation.
## Deduplication
Two layers:
- **Exact**: a URL hash UNIQUE constraint drops the literal same link at ingest.
- **Semantic**: `dedup` embeds each article's title+snippet with the local
embedding model, clusters near-identical stories within a few-day window
(cosine similarity), and marks all but the highest-ranked in each cluster as
`duplicate_of` the representative. Feed and brief queries hide duplicates, so
the same story carried by several outlets appears once. This runs as part of
`cycle`, so the scheduler keeps the corpus deduped automatically.
## Stored Article Data
For each article, the database stores:
@@ -112,7 +126,7 @@ often as you like — it only polls sources that are *due* (per each source's
rebuilds the current day's brief:
```bash
python3 -m goodnews cycle # poll due -> classify new -> rebuild today's brief
python3 -m goodnews cycle # poll due -> classify new -> dedup -> rebuild today's brief
python3 -m goodnews cycle --force # poll every active source regardless of interval
python3 -m goodnews cycle --no-classify # skip the LLM step (e.g. model box offline)
```
+1
View File
@@ -118,6 +118,7 @@ def _candidate_articles(
JOIN sources src ON src.id = a.source_id
JOIN article_scores s ON s.article_id = a.id
WHERE s.accepted = 1
AND a.duplicate_of IS NULL
AND date(COALESCE(a.published_at, a.discovered_at)) <= date(?)
AND date(COALESCE(a.published_at, a.discovered_at)) > date(?, '-' || ? || ' days')
AND a.id NOT IN (
+27
View File
@@ -8,6 +8,7 @@ from pathlib import Path
from .briefs import build_daily_brief, show_brief
from .db import connect, init_db
from .dedup import DEFAULT_THRESHOLD, DEFAULT_WINDOW_DAYS, dedup as run_dedup
from .feeds import fetch_feed, parse_feed, poll_all_sources, poll_due_sources, poll_source
from .llm import LocalModelClient, classify_articles
from .scoring import score_article
@@ -68,11 +69,19 @@ def main() -> None:
)
cycle_parser.add_argument("--classify-limit", type=int, default=40)
cycle_parser.add_argument("--no-classify", action="store_true", help="Skip the LLM classify step")
cycle_parser.add_argument("--no-dedup", action="store_true", help="Skip the embedding dedup step")
cycle_parser.add_argument("--no-brief", action="store_true", help="Skip rebuilding today's brief")
cycle_parser.add_argument("--force", action="store_true", help="Poll all active sources, ignoring intervals")
cycle_parser.add_argument("--base-url", help="OpenAI-compatible base URL for classify")
cycle_parser.add_argument("--model", help="Local model name for classify")
dedup_parser = subparsers.add_parser("dedup", help="Cluster near-duplicate stories via local embeddings")
dedup_parser.add_argument("--threshold", type=float, default=DEFAULT_THRESHOLD, help="Cosine similarity cutoff")
dedup_parser.add_argument("--window-days", type=int, default=DEFAULT_WINDOW_DAYS)
dedup_parser.add_argument("--embed-limit", type=int, help="Cap how many missing embeddings to compute")
dedup_parser.add_argument("--base-url", help="OpenAI-compatible base URL")
dedup_parser.add_argument("--model", help="Chat model name (unused for embeddings)")
check_llm_parser = subparsers.add_parser("check-llm", help="Check local OpenAI-compatible model endpoint")
check_llm_parser.add_argument("--base-url", help="OpenAI-compatible base URL, e.g. http://127.0.0.1:1234/v1")
check_llm_parser.add_argument("--model", help="Expected local model name")
@@ -153,6 +162,17 @@ def main() -> None:
print("Dry run only; database was not updated.")
elif args.command == "cycle":
run_cycle(conn, args)
elif args.command == "dedup":
init_db(conn)
client = llm_client_from_args(args)
stats = run_dedup(
conn, client, threshold=args.threshold, window_days=args.window_days, embed_limit=args.embed_limit
)
print(
f"dedup: embedded={stats['embedded']} articles={stats['articles']} "
f"clusters={stats['clusters']} duplicate_clusters={stats['duplicate_clusters']} "
f"duplicates_hidden={stats['duplicates']}"
)
elif args.command == "check-llm":
client = llm_client_from_args(args)
try:
@@ -256,6 +276,13 @@ def run_cycle(conn: sqlite3.Connection, args: argparse.Namespace) -> None:
except Exception as exc: # endpoint down, timeout, etc. — keep going
print(f"classify: skipped ({exc})")
if not args.no_dedup:
try:
stats = run_dedup(conn, llm_client_from_args(args))
print(f"dedup: embedded={stats['embedded']} duplicates_hidden={stats['duplicates']}")
except Exception as exc:
print(f"dedup: skipped ({exc})")
if not args.no_brief:
today = date.today().isoformat()
try:
+19 -2
View File
@@ -37,6 +37,7 @@ CREATE TABLE IF NOT EXISTS articles (
raw_guid TEXT,
url_hash TEXT NOT NULL UNIQUE,
title_hash TEXT,
duplicate_of INTEGER REFERENCES articles(id) ON DELETE SET NULL,
FOREIGN KEY (source_id) REFERENCES sources(id)
);
@@ -62,6 +63,14 @@ CREATE TABLE IF NOT EXISTS article_scores (
scored_at TEXT NOT NULL DEFAULT CURRENT_TIMESTAMP
);
CREATE TABLE IF NOT EXISTS article_embeddings (
article_id INTEGER PRIMARY KEY REFERENCES articles(id) ON DELETE CASCADE,
vector BLOB NOT NULL,
dim INTEGER NOT NULL,
model TEXT NOT NULL,
created_at TEXT NOT NULL DEFAULT CURRENT_TIMESTAMP
);
CREATE TABLE IF NOT EXISTS ingest_runs (
id INTEGER PRIMARY KEY AUTOINCREMENT,
source_id INTEGER REFERENCES sources(id) ON DELETE SET NULL,
@@ -114,7 +123,15 @@ def _migrate(conn: sqlite3.Connection) -> None:
CREATE TABLE IF NOT EXISTS never alters an existing table, so new columns
need an explicit, idempotent ALTER guarded by the current column set.
"""
cols = {row["name"] for row in conn.execute("PRAGMA table_info(article_scores)")}
score_cols = {row["name"] for row in conn.execute("PRAGMA table_info(article_scores)")}
for column in ("topic", "flavor"):
if column not in cols:
if column not in score_cols:
conn.execute(f"ALTER TABLE article_scores ADD COLUMN {column} TEXT")
article_cols = {row["name"] for row in conn.execute("PRAGMA table_info(articles)")}
if "duplicate_of" not in article_cols:
conn.execute(
"ALTER TABLE articles ADD COLUMN duplicate_of INTEGER REFERENCES articles(id)"
)
# Created here (not in SCHEMA) so it runs after the column exists on upgrades.
conn.execute("CREATE INDEX IF NOT EXISTS idx_articles_duplicate_of ON articles(duplicate_of)")
+171
View File
@@ -0,0 +1,171 @@
"""Cross-source near-duplicate detection via local embeddings.
The exact-URL dedupe in feeds.py only catches the literal same link. The same
story carried by several outlets slips through as separate articles. Here we
embed each article's title+snippet with the local embedding model, cluster
near-identical ones within a short time window, and mark all but the best in
each cluster as duplicates (articles.duplicate_of). Feed and brief queries then
hide duplicates, keeping the single strongest version.
Pure-stdlib math: vectors are normalised once so cosine similarity is a dot
product, and comparisons are restricted to a date window, so no numpy is needed.
"""
from __future__ import annotations
import math
import sqlite3
from array import array
from datetime import date
from .llm import LocalModelClient
# Minimum cosine similarity (dot product of unit vectors) for an article to
# join a cluster; also the default of the CLI's --threshold option.
DEFAULT_THRESHOLD = 0.86
# Maximum distance, in calendar days, between an article and a cluster's
# anchor for the two to be compared at all.
DEFAULT_WINDOW_DAYS = 3
# Number of article texts sent to the embedding endpoint per request.
_EMBED_BATCH = 16
def _embed_text(title: str, description: str | None) -> str:
text = title.strip()
if description:
text += ". " + description.strip()
return text[:2000]
def ensure_embeddings(
    conn: sqlite3.Connection, client: LocalModelClient, limit: int | None = None
) -> int:
    """Embed and store any articles that lack an embedding.

    Articles without a row in ``article_embeddings`` are selected in id order,
    embedded in batches of ``_EMBED_BATCH`` via ``client.embed`` and stored as
    packed 32-bit floats together with the vector length and
    ``client.embed_model``.

    Args:
        conn: Open database connection; rows must be addressable by column
            name (``row["id"]``).
        client: Embedding client providing ``embed(texts)`` and ``embed_model``.
        limit: Maximum number of articles to embed in this call. ``None``
            means no cap; zero or a negative value embeds nothing.

    Returns:
        Number of embeddings written.

    Raises:
        RuntimeError: If the client returns a different number of vectors than
            texts were sent, which would otherwise mis-assign or silently drop
            embeddings.
    """
    # SQLite treats a negative LIMIT as "no upper bound", so -1 stands in for
    # "no cap" and the cap is applied in SQL instead of after fetching
    # every candidate row.
    row_cap = -1 if limit is None else max(limit, 0)
    rows = conn.execute(
        """
        SELECT a.id, a.title, a.description
        FROM articles a
        LEFT JOIN article_embeddings e ON e.article_id = a.id
        WHERE e.article_id IS NULL
        ORDER BY a.id
        LIMIT ?
        """,
        (row_cap,),
    ).fetchall()
    if not rows:
        return 0

    added = 0
    for start in range(0, len(rows), _EMBED_BATCH):
        batch = rows[start : start + _EMBED_BATCH]
        vectors = client.embed([_embed_text(r["title"], r["description"]) for r in batch])
        if len(vectors) != len(batch):
            # zip() would quietly truncate here and leave articles unembedded.
            raise RuntimeError(
                f"embedding count mismatch: sent {len(batch)} texts, got {len(vectors)} vectors"
            )
        conn.executemany(
            "INSERT OR REPLACE INTO article_embeddings (article_id, vector, dim, model) "
            "VALUES (?, ?, ?, ?)",
            [
                (row["id"], array("f", vector).tobytes(), len(vector), client.embed_model)
                for row, vector in zip(batch, vectors)
            ],
        )
        # Commit per batch so a failure in a later request does not discard
        # the embeddings that were already computed.
        conn.commit()
        added += len(batch)
    return added
def _unit(vector: list[float]) -> list[float]:
norm = math.sqrt(sum(x * x for x in vector))
if norm == 0:
return vector
return [x / norm for x in vector]
def _day_ordinal(value: str | None) -> int:
if not value:
return 0
try:
return date.fromisoformat(value[:10]).toordinal()
except ValueError:
return 0
def cluster_duplicates(
    conn: sqlite3.Connection,
    threshold: float = DEFAULT_THRESHOLD,
    window_days: int = DEFAULT_WINDOW_DAYS,
) -> dict:
    """Group near-identical articles and record duplicate_of links.

    Greedy anchor-based clustering: articles are visited in the order of their
    stored timestamp (published_at, falling back to discovered_at) and each one
    joins the first existing cluster whose *anchor* -- the article that founded
    the cluster -- it matches (cosine >= threshold, at most window_days apart).
    Members are only ever compared with the anchor, never with one another.
    An article that matches no anchor founds a new cluster.

    The highest-ranked member of each cluster becomes the representative (ties
    go to the lowest id); every other member gets ``duplicate_of`` set to it.
    ``duplicate_of`` is first cleared for every article considered, so the
    stored links always reflect this run only.

    Args:
        conn: Open database connection; rows must be addressable by column
            name (``row["id"]``).
        threshold: Minimum cosine similarity to the anchor.
        window_days: Maximum day distance to the anchor.

    Returns:
        Dict with ``articles`` (articles considered), ``clusters`` (all
        clusters, singletons included), ``duplicate_clusters`` (clusters with
        at least two members) and ``duplicates`` (articles marked duplicate).
    """
    # trust_score is coalesced like the other terms: a NULL there would turn
    # the whole sum into NULL and make the max() ranking below fail when
    # None is compared with a number.
    rows = conn.execute(
        """
        SELECT
            a.id,
            COALESCE(a.published_at, a.discovered_at) AS dt,
            e.vector,
            (COALESCE(s.constructive_score,0) + COALESCE(s.agency_score,0)
             + COALESCE(s.human_benefit_score,0) + COALESCE(src.trust_score,0)
             - COALESCE(s.cortisol_score,0) - COALESCE(s.ragebait_score,0)
             - COALESCE(s.pr_risk_score,0)) AS rank_score
        FROM articles a
        JOIN article_embeddings e ON e.article_id = a.id
        JOIN sources src ON src.id = a.source_id
        LEFT JOIN article_scores s ON s.article_id = a.id
        ORDER BY dt
        """
    ).fetchall()

    # Vectors are stored as packed 32-bit floats; normalise once so cosine
    # similarity below is a plain dot product.
    items = [
        {
            "id": r["id"],
            "ord": _day_ordinal(r["dt"]),
            "vec": _unit(array("f", r["vector"]).tolist()),
            "score": r["rank_score"],
        }
        for r in rows
    ]

    clusters: list[dict] = []  # {anchor_vec, anchor_ord, members:[item]}
    for it in items:
        vec = it["vec"]
        for cl in clusters:
            if abs(it["ord"] - cl["anchor_ord"]) > window_days:
                continue
            anchor_vec = cl["anchor_vec"]
            if len(anchor_vec) != len(vec):
                # Different dimensionality (e.g. produced by another embedding
                # model): zip() would truncate and yield a meaningless score.
                continue
            if sum(x * y for x, y in zip(vec, anchor_vec)) >= threshold:
                cl["members"].append(it)
                break
        else:
            clusters.append({"anchor_vec": vec, "anchor_ord": it["ord"], "members": [it]})

    # Reset prior decisions for everything we considered, then re-apply.
    conn.executemany(
        "UPDATE articles SET duplicate_of = NULL WHERE id = ?",
        [(it["id"],) for it in items],
    )

    dup_clusters = 0
    links: list[tuple[int, int]] = []  # (representative id, duplicate id)
    for cl in clusters:
        members = cl["members"]
        if len(members) < 2:
            continue
        dup_clusters += 1
        # Highest score wins; -id makes the lowest id win a tie.
        rep = max(members, key=lambda m: (m["score"], -m["id"]))
        links.extend((rep["id"], m["id"]) for m in members if m["id"] != rep["id"])
    conn.executemany("UPDATE articles SET duplicate_of = ? WHERE id = ?", links)
    conn.commit()

    return {
        "articles": len(items),
        "clusters": len(clusters),
        "duplicate_clusters": dup_clusters,
        "duplicates": len(links),
    }
def dedup(
    conn: sqlite3.Connection,
    client: LocalModelClient,
    threshold: float = DEFAULT_THRESHOLD,
    window_days: int = DEFAULT_WINDOW_DAYS,
    embed_limit: int | None = None,
) -> dict:
    """Run the full dedup pass: embed missing articles, then cluster.

    Returns the statistics dict produced by ``cluster_duplicates`` with an
    extra ``embedded`` key holding the number of embeddings added by
    ``ensure_embeddings``.
    """
    # Embedding must come first so newly embedded articles take part in
    # this run's clustering.
    newly_embedded = ensure_embeddings(conn, client, limit=embed_limit)
    result = cluster_duplicates(conn, threshold=threshold, window_days=window_days)
    result["embedded"] = newly_embedded
    return result
+25
View File
@@ -19,6 +19,7 @@ from .taxonomy import (
DEFAULT_BASE_URL = "http://127.0.0.1:1234/v1"
DEFAULT_MODEL = "gpt-oss"
DEFAULT_EMBED_MODEL = "text-embedding-nomic-embed-text-v1.5"
DEFAULT_TIMEOUT = 180
@@ -106,6 +107,7 @@ class LocalModelClient:
model: str
api_key: str | None = None
timeout: int = DEFAULT_TIMEOUT
embed_model: str = DEFAULT_EMBED_MODEL
# Index into _RESPONSE_FORMATS that the server accepts; discovered lazily.
_response_format_idx: int | None = None
@@ -116,8 +118,31 @@ class LocalModelClient:
model=os.environ.get("GOODNEWS_LLM_MODEL", DEFAULT_MODEL),
api_key=os.environ.get("GOODNEWS_LLM_API_KEY"),
timeout=int(os.environ.get("GOODNEWS_LLM_TIMEOUT", DEFAULT_TIMEOUT)),
embed_model=os.environ.get("GOODNEWS_EMBED_MODEL", DEFAULT_EMBED_MODEL),
)
def embed(self, texts: list[str]) -> list[list[float]]:
    """Return embedding vectors for a batch of texts via /embeddings.

    POSTs ``{"model": self.embed_model, "input": texts}`` to
    ``{self.base_url}/embeddings`` and returns one vector per input text, in
    input order.

    Args:
        texts: Texts to embed. An empty list returns ``[]`` without
            contacting the server.

    Returns:
        A list of embedding vectors, the same length as ``texts``.

    Raises:
        RuntimeError: On an HTTP error status, an unreachable endpoint, a
            body that is not valid JSON, a response without the expected
            ``data[].embedding`` shape, or a vector count that differs from
            the number of texts sent.
    """
    if not texts:
        # Nothing to embed; skip the round trip.
        return []

    body = json.dumps({"model": self.embed_model, "input": texts}).encode("utf-8")
    headers = {"Content-Type": "application/json"}
    if self.api_key:
        headers["Authorization"] = f"Bearer {self.api_key}"
    request = urllib.request.Request(
        f"{self.base_url}/embeddings", data=body, headers=headers, method="POST"
    )
    try:
        with urllib.request.urlopen(request, timeout=self.timeout) as response:
            payload = response.read()
    except urllib.error.HTTPError as exc:
        detail = exc.read().decode("utf-8", errors="replace")
        raise RuntimeError(f"HTTP {exc.code} from embeddings: {detail}") from exc
    except urllib.error.URLError as exc:
        raise RuntimeError(f"could not reach embeddings at {self.base_url}: {exc.reason}") from exc

    try:
        data = json.loads(payload.decode("utf-8"))
    except ValueError as exc:  # covers JSONDecodeError and UnicodeDecodeError
        raise RuntimeError(f"invalid JSON from embeddings: {exc}") from exc

    try:
        items = list(data["data"])
        # OpenAI-compatible servers tag each vector with the position of its
        # input; honour that when every item carries one so vectors cannot be
        # attributed to the wrong text. sorted() is stable otherwise.
        if all(isinstance(item.get("index"), int) for item in items):
            items = sorted(items, key=lambda item: item["index"])
        vectors = [item["embedding"] for item in items]
    except (KeyError, TypeError, AttributeError) as exc:
        raise RuntimeError(f"unexpected embeddings response: {data}") from exc

    if len(vectors) != len(texts):
        raise RuntimeError(
            f"unexpected embeddings response: sent {len(texts)} texts, got {len(vectors)} vectors"
        )
    return vectors
def classify(self, article: sqlite3.Row) -> dict:
messages = [
{"role": "system", "content": SYSTEM_PROMPT},
+1 -1
View File
@@ -49,7 +49,7 @@ def feed(
offset: int = 0,
) -> list[dict]:
"""Return ranked articles, optionally filtered by topic and/or flavor."""
clauses = []
clauses = ["a.duplicate_of IS NULL"]
params: list = []
if accepted_only:
clauses.append("s.accepted = 1")