Files
upbeatBytes/goodnews/summarize.py
T
thejayman77 008364e922 Why-it-belongs: top-up requires all three fields (idempotency fix)
Per Codex: generate_summary treated why_belongs alone as a complete explanation,
but get_explanation requires all three — so a partial older row (e.g. only
why_belongs) would never top up and the page would fall back forever. Now the
fully-cached check requires summary + what_happened + why_matters + why_belongs.
Test covers the partial-row top-up.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
2026-06-09 20:10:27 -04:00

176 lines
7.5 KiB
Python

"""On-demand, cached article summaries.
Lazy: generated only for articles that actually get shared/viewed, then cached in
article_summaries forever. We fetch the article text *transiently* and ask the
local LLM for a short, ORIGINAL summary in our own words. We store only that
summary — never the publisher's article body — and the page always credits and
links to the source.
"""
from __future__ import annotations
import re
import sqlite3
import urllib.error
import urllib.request
from urllib.parse import urljoin, urlsplit
from .enrich import USER_AGENT, _NoRedirect, _host_is_public
from .llm import LocalModelClient
# Upper bound on the number of bytes _fetch_text reads from a response body.
_FETCH_BYTES = 600_000
# Timeout handed to opener.open() for each request (urllib timeouts are in seconds).
_FETCH_TIMEOUT = 8
# Maximum number of redirect hops _fetch_text follows by hand.
_MAX_REDIRECTS = 3
# Whole non-content elements (opening tag, contents, matching closing tag); bytes pattern.
_DROP = re.compile(rb"<(script|style|noscript|template|svg)[^>]*>.*?</\1>", re.IGNORECASE | re.DOTALL)
# Any single remaining tag; bytes pattern.
_TAGS = re.compile(rb"<[^>]+>")
# A run of whitespace; callers substitute a single space for it.
_WS = re.compile(r"\s+")
# System prompt used by summarize_article.
_SYSTEM = (
"You write a short, ORIGINAL summary of a news story for a calm, constructive news "
"site. Summarize the underlying FACTS in your own words. Do NOT quote, and do not "
"closely paraphrase the article's sentences — no lifted phrases. 2 to 4 plain "
"sentences covering what happened and why it's encouraging. No preamble, no markdown, "
"no headline, no opinion — just the factual summary."
)
def _fetch_text(url: str) -> str:
    """SSRF-guarded fetch of an article page, reduced to plain text (capped).

    Redirects are followed by hand, at most ``_MAX_REDIRECTS`` hops, and every
    hop is re-validated: only http/https URLs whose host passes
    ``_host_is_public`` are requested. (Assumes ``_NoRedirect`` makes the opener
    hand back 3xx responses instead of following them — confirm in ``enrich``.)

    Args:
        url: Address of the article page. May be empty.

    Returns:
        Up to 4000 characters of whitespace-collapsed text with tags and
        script/style-like elements removed, or ``""`` on any failure: bad or
        non-public URL, network/protocol error, non-HTML content type, missing
        redirect target, or too many redirects. Never raises for fetch errors.
    """
    import http.client  # function-scope: only its exception types are needed

    # Redirect targets come from the remote server, so a malformed one must be
    # treated like any other fetch failure. http.client.HTTPException covers
    # InvalidURL / BadStatusLine, which urllib does not wrap in URLError.
    fetch_errors = (urllib.error.URLError, http.client.HTTPException, OSError, ValueError)
    opener = urllib.request.build_opener(_NoRedirect)
    for _ in range(_MAX_REDIRECTS + 1):
        if not url:
            return ""
        try:
            parts = urlsplit(url)
        except ValueError:
            # e.g. an unbalanced IPv6 literal in the netloc
            return ""
        if parts.scheme not in ("http", "https") or not _host_is_public(parts.hostname):
            return ""
        try:
            request = urllib.request.Request(url, headers={"User-Agent": USER_AGENT, "Accept": "text/html"})
            response = opener.open(request, timeout=_FETCH_TIMEOUT)
        except fetch_errors:
            return ""
        status = getattr(response, "status", 200) or 200
        if status in (301, 302, 303, 307, 308):
            location = response.headers.get("Location")
            response.close()
            if not location:
                return ""
            try:
                # Resolve relative targets; the next iteration re-checks the host.
                url = urljoin(url, location)
            except ValueError:
                return ""
            continue
        if "html" not in response.headers.get("Content-Type", "").lower():
            response.close()
            return ""
        try:
            raw = response.read(_FETCH_BYTES)
        except (OSError, http.client.HTTPException):
            # Timeout or broken transfer mid-body: treat as "no text available".
            return ""
        finally:
            response.close()
        # Strip on bytes first, then decode leniently so odd encodings cannot raise.
        raw = _DROP.sub(b" ", raw)
        text = _TAGS.sub(b" ", raw).decode("utf-8", "replace")
        return _WS.sub(" ", text).strip()[:4000]
    return ""
def summarize_article(client: LocalModelClient, title: str, snippet: str, body_text: str) -> str:
    """Ask the model for a short summary of one article.

    The source material is the first non-empty of ``body_text``, ``snippet``
    and ``title``, cut to 4000 characters. The reply is stripped and capped at
    1200 characters; a falsy reply yields ``""``.
    """
    source_text = body_text or snippet or title or ""
    prompt = f"Title: {title}\n\nArticle text:\n{source_text[:4000]}\n\nWrite the summary."
    reply = client.chat_text(
        [
            {"role": "system", "content": _SYSTEM},
            {"role": "user", "content": prompt},
        ]
    )
    if not reply:
        return ""
    return reply.strip()[:1200]
# System prompt used by explain_article. The WHAT / MATTERS / BELONGS labels it
# requests are the same labels _parse_explain searches for, so keep them in sync.
_EXPLAIN_SYSTEM = (
"You are the calm editor of a constructive-news site. For the given story write three "
"very short, plain-language notes — 1 to 2 factual sentences each, in your own words, no "
"markdown, no preamble, no quotes, no hype. Use EXACTLY this format and these labels, each "
"on its own line:\n"
"WHAT: <what actually happened — the plain gist>\n"
"MATTERS: <why it matters — the real-world, constructive significance>\n"
"BELONGS: <why it fits a calm, constructive news site — the human benefit, agency, or "
"grounded hope in it>"
)
def _parse_explain(text: str) -> dict:
    """Pull the WHAT / MATTERS / BELONGS notes out of a model reply.

    Labels are matched case-insensitively. Each note runs until the next line
    that looks like ``LABEL:`` or the end of the text, then has its whitespace
    collapsed, surrounding bullet characters removed, and is capped at 400
    characters. A missing or empty note is reported as ``None``.

    Returns:
        A dict with keys ``what_happened``, ``why_matters`` and ``why_belongs``.
    """
    haystack = text or ""
    flags = re.IGNORECASE | re.DOTALL
    fields = (
        ("what_happened", "WHAT"),
        ("why_matters", "MATTERS"),
        ("why_belongs", "BELONGS"),
    )
    notes = {}
    for key, label in fields:
        found = re.search(rf"\b{label}\s*:\s*(.+?)(?=\n\s*[A-Z]+\s*:|\Z)", haystack, flags)
        note = None
        if found:
            collapsed = _WS.sub(" ", found.group(1)).strip()
            # Drop list markers the model may have wrapped around the note.
            note = collapsed.strip("-•* ").strip()[:400] or None
        notes[key] = note
    return notes
def explain_article(client: LocalModelClient, title: str, snippet: str, body_text: str) -> dict:
    """Produce the three editorial notes for one article.

    Sends the first non-empty of ``body_text``, ``snippet`` and ``title`` (cut
    to 4000 characters) to the model and parses the reply with
    ``_parse_explain``; a falsy reply is parsed as an empty string.

    Returns:
        The dict built by ``_parse_explain`` (what happened / why it matters /
        why it belongs).
    """
    source_text = (body_text or snippet or title or "")[:4000]
    conversation = [
        {"role": "system", "content": _EXPLAIN_SYSTEM},
        {
            "role": "user",
            "content": f"Title: {title}\n\nArticle text:\n{source_text}\n\nWrite the three notes.",
        },
    ]
    reply = client.chat_text(conversation)
    return _parse_explain(reply or "")
def get_summary(conn: sqlite3.Connection, article_id: int) -> str | None:
row = conn.execute(
"SELECT summary FROM article_summaries WHERE article_id = ?", (article_id,)
).fetchone()
return row["summary"] if row else None
def get_explanation(conn: sqlite3.Connection, article_id: int) -> dict | None:
"""The structured 'Why it belongs' notes — only if all three are present (else
the page falls back to summary + reason_text)."""
row = conn.execute(
"SELECT what_happened, why_matters, why_belongs FROM article_summaries WHERE article_id = ?",
(article_id,),
).fetchone()
if row and row["what_happened"] and row["why_matters"] and row["why_belongs"]:
return dict(row)
return None
def generate_summary(conn: sqlite3.Connection, article_id: int, client: LocalModelClient | None = None) -> str | None:
    """Generate + cache a summary AND the structured explanation for one article.

    Idempotent: a fully-cached article (summary + all three notes) is returned
    as-is with no fetch or model call. A row that is missing the summary or any
    note is topped up on the next call, keeping whatever it already had.

    Args:
        conn: Database connection; rows are read by column name.
        article_id: Id of the article in ``articles``.
        client: Model client to use; built with ``LocalModelClient.from_env()``
            when omitted.

    Returns:
        The summary text. For an article that is unknown, a duplicate, or not
        accepted, whatever summary is already cached (possibly ``None``) is
        returned without generating anything. ``None`` is returned when no
        summary could be produced.
    """
    note_fields = ("what_happened", "why_matters", "why_belongs")
    existing = conn.execute(
        "SELECT summary, what_happened, why_matters, why_belongs FROM article_summaries WHERE article_id = ?",
        (article_id,),
    ).fetchone()
    cached_summary = existing["summary"] if existing else None
    # Fully cached only when the explanation is COMPLETE (all three) — matches
    # get_explanation(), so a partial older row gets topped up on the next call.
    if cached_summary and all(existing[field] for field in note_fields):
        return cached_summary

    row = conn.execute(
        "SELECT a.title, a.description, a.canonical_url, a.duplicate_of, s.accepted "
        "FROM articles a LEFT JOIN article_scores s ON s.article_id = a.id WHERE a.id = ?",
        (article_id,),
    ).fetchone()
    if not row or row["duplicate_of"] is not None or not row["accepted"]:
        return cached_summary

    client = client or LocalModelClient.from_env()
    body = _fetch_text(row["canonical_url"])
    description = row["description"] or ""
    # Reuse a cached summary only if it is non-empty: a row holding a NULL/empty
    # summary must be regenerated rather than make every call return None.
    summary = cached_summary or summarize_article(client, row["title"], description, body)
    if not summary:
        return None

    fresh = explain_article(client, row["title"], description, body)
    # INSERT OR REPLACE rewrites the whole row, so carry over any note that is
    # already cached when this generation failed to produce it.
    notes = {
        field: fresh[field] or (existing[field] if existing else None)
        for field in note_fields
    }
    conn.execute(
        "INSERT OR REPLACE INTO article_summaries "
        "(article_id, summary, what_happened, why_matters, why_belongs, model) VALUES (?, ?, ?, ?, ?, ?)",
        (article_id, summary, notes["what_happened"], notes["why_matters"], notes["why_belongs"], client.model),
    )
    conn.commit()

    # Attention-triggered image enrichment: a summarized article is one a reader
    # has reached, so it's worth a real image. Best-effort — an image fetch
    # failure must never break summarization.
    try:
        from .enrich import enrich_article_image

        enrich_article_image(conn, article_id)
    except Exception:
        import logging

        # Still best-effort, but leave a trace instead of failing silently.
        logging.getLogger(__name__).warning(
            "image enrichment failed for article %s", article_id, exc_info=True
        )
    return summary