Files
thejayman77 89c0fbe1f6 Sync repo to deployed state: SEO recovery, Publishing Desk, Play games, emoji picker
The deploy pipeline runs from the working tree, so a wave of shipped features
had never been committed. This snapshots git to what's actually running.

SEO impression recovery (live + verified):
- Duplicate /a/{id} now 301-redirect to their canonical twin instead of 404
  (a hard 404 silently dropped already-indexed URLs and tanked impressions).
- Dedup representative selection reworked: accepted/serveable -> established
  rep (URL stability) -> quality score, so an accepted page never retires to a
  rejected rep and an indexed canonical doesn't churn when a newer twin arrives.
- HEAD /a/{id} returns the same status as GET (api_route GET+HEAD) instead of
  falling through to the static mount and 404ing.
- `dedup --force-recluster`: cycle-locked, model-free re-cluster to re-apply the
  policy to the existing corpus (shared cycle_lock context manager).
- CLI honors GOODNEWS_DB for its default --db (was silently ignored).

Publishing Desk (admin tool to post highlights to X via Web Intents):
- publishing.py queue/rank/handle-resolution; admin UI; full searchable emoji
  picker (bundled data, no CDN) for the blurb editor.

Play games + site:
- Bloom (word-wheel), Memory Match, daily ritual set, Zen Den (dev-gated).
- English-only language gate; source prospecting; paywall + dedup hardening.

Tests: full suite green (349). Ignores tightened (node_modules, data/*.db).

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
2026-06-18 11:32:27 -04:00

204 lines
7.4 KiB
Python

"""Cross-source near-duplicate detection via local embeddings.
The exact-URL dedupe in feeds.py only catches the literal same link. The same
story carried by several outlets slips through as separate articles. Here we
embed each article's title+snippet with the local embedding model, cluster
near-identical ones within a short time window, and mark all but the best in
each cluster as duplicates (articles.duplicate_of). Feed and brief queries then
hide duplicates, keeping the single strongest version.
Pure-stdlib math: vectors are normalised once so cosine similarity is a dot
product, and comparisons are restricted to a date window, so no numpy is needed.
"""
from __future__ import annotations
import math
import sqlite3
from array import array
from datetime import date
from .llm import LocalModelClient
DEFAULT_THRESHOLD = 0.86
DEFAULT_WINDOW_DAYS = 3
_EMBED_BATCH = 16
def _embed_text(title: str, description: str | None) -> str:
text = title.strip()
if description:
text += ". " + description.strip()
return text[:2000]
def ensure_embeddings(
conn: sqlite3.Connection, client: LocalModelClient, limit: int | None = None
) -> int:
"""Embed and store any articles that lack an embedding. Returns count added."""
rows = conn.execute(
"""
SELECT a.id, a.title, a.description
FROM articles a
LEFT JOIN article_embeddings e ON e.article_id = a.id
WHERE e.article_id IS NULL
ORDER BY a.id
"""
).fetchall()
if limit is not None:
rows = rows[:limit]
if not rows:
return 0
added = 0
for start in range(0, len(rows), _EMBED_BATCH):
batch = rows[start : start + _EMBED_BATCH]
vectors = client.embed([_embed_text(r["title"], r["description"]) for r in batch])
for row, vector in zip(batch, vectors):
conn.execute(
"INSERT OR REPLACE INTO article_embeddings (article_id, vector, dim, model) "
"VALUES (?, ?, ?, ?)",
(row["id"], array("f", vector).tobytes(), len(vector), client.embed_model),
)
added += 1
conn.commit()
return added
def _unit(vector: list[float]) -> list[float]:
norm = math.sqrt(sum(x * x for x in vector))
if norm == 0:
return vector
return [x / norm for x in vector]
def _day_ordinal(value: str | None) -> int:
if not value:
return 0
try:
return date.fromisoformat(value[:10]).toordinal()
except ValueError:
return 0
def cluster_duplicates(
conn: sqlite3.Connection,
threshold: float = DEFAULT_THRESHOLD,
window_days: int = DEFAULT_WINDOW_DAYS,
) -> dict:
"""Group near-identical articles and record duplicate_of links.
Greedy single-link clustering: each article joins the first existing cluster
whose anchor it matches (cosine >= threshold, within window_days); otherwise
it starts a new cluster. The highest-ranked member of each cluster becomes
the representative; the rest point at it.
"""
rows = conn.execute(
"""
SELECT
a.id,
COALESCE(a.published_at, a.discovered_at) AS dt,
e.vector,
(COALESCE(s.constructive_score,0) + COALESCE(s.agency_score,0)
+ COALESCE(s.human_benefit_score,0) + src.trust_score
- COALESCE(s.cortisol_score,0) - COALESCE(s.ragebait_score,0)
- COALESCE(s.pr_risk_score,0)) AS rank_score,
COALESCE(s.accepted, 0) AS accepted
FROM articles a
JOIN article_embeddings e ON e.article_id = a.id
JOIN sources src ON src.id = a.source_id
LEFT JOIN article_scores s ON s.article_id = a.id
ORDER BY dt
"""
).fetchall()
items = []
for r in rows:
vec = _unit(array("f", r["vector"]).tolist())
items.append({"id": r["id"], "ord": _day_ordinal(r["dt"]), "vec": vec,
"score": r["rank_score"], "accepted": bool(r["accepted"])})
clusters: list[dict] = [] # {anchor_vec, anchor_ord, members:[item]}
for it in items:
placed = False
for cl in clusters:
if abs(it["ord"] - cl["anchor_ord"]) > window_days:
continue
dot = sum(x * y for x, y in zip(it["vec"], cl["anchor_vec"]))
if dot >= threshold:
cl["members"].append(it)
placed = True
break
if not placed:
clusters.append({"anchor_vec": it["vec"], "anchor_ord": it["ord"], "members": [it]})
# Which articles are CURRENTLY a representative (something points at them)? Captured
# BEFORE we reset, so we can keep an established canonical stable across runs.
prior_reps = {
row[0] for row in conn.execute(
"SELECT DISTINCT duplicate_of FROM articles WHERE duplicate_of IS NOT NULL"
)
}
# Reset prior decisions for everything we considered, then re-apply.
considered = [it["id"] for it in items]
conn.executemany(
"UPDATE articles SET duplicate_of = NULL WHERE id = ?", [(i,) for i in considered]
)
dup_clusters = 0
duplicates = 0
for cl in clusters:
if len(cl["members"]) < 2:
continue
dup_clusters += 1
# Representative priority (highest wins), in order:
# 1. accepted/serveable — an accepted page must never be retired to a REJECTED
# rep (that page would 404 with nothing to redirect to).
# 2. established rep — if a member is already the cluster's canonical, keep it,
# so an indexed URL doesn't churn when a newer twin arrives.
# 3. quality score — decides genuinely-new clusters.
# 4. -id — deterministic final tiebreak (older wins).
rep = max(cl["members"], key=lambda m: (
1 if m["accepted"] else 0,
1 if m["id"] in prior_reps else 0,
m["score"],
-m["id"],
))
for m in cl["members"]:
if m["id"] != rep["id"]:
conn.execute(
"UPDATE articles SET duplicate_of = ? WHERE id = ?", (rep["id"], m["id"])
)
duplicates += 1
conn.commit()
return {
"articles": len(items),
"clusters": len(clusters),
"duplicate_clusters": dup_clusters,
"duplicates": duplicates,
}
def dedup(
conn: sqlite3.Connection,
client: LocalModelClient,
threshold: float = DEFAULT_THRESHOLD,
window_days: int = DEFAULT_WINDOW_DAYS,
embed_limit: int | None = None,
) -> dict:
embedded = ensure_embeddings(conn, client, limit=embed_limit)
if embedded == 0:
# Nothing new entered the corpus → the clusters and duplicate_of links are
# unchanged, so skip the full re-cluster. It was re-running an O(n²) cosine
# pass over EVERY article and rewriting duplicate_of for all ~3.7k of them
# every cycle (~53s + a large WAL commit), which starved live API reads
# (/api/brief 2-7s). Most cycles find no new articles, so this makes the
# cycle near-instant and keeps reads fast. A real new article re-runs it.
dups = conn.execute("SELECT COUNT(*) FROM articles WHERE duplicate_of IS NOT NULL").fetchone()[0]
return {"embedded": 0, "articles": 0, "clusters": 0, "duplicate_clusters": 0,
"duplicates": dups, "skipped": True}
stats = cluster_duplicates(conn, threshold=threshold, window_days=window_days)
stats["embedded"] = embedded
return stats