337dc3f901
Per Codex — make /a/<id> feel like Upbeat Bytes has editorial judgment, not just a summary wrapper. Trust-building, short, not an essay. * article_summaries gains what_happened / why_matters / why_belongs (+ migration). * summarize.explain_article: a separate, fallback-able LLM pass producing three short notes (parsed from a labelled WHAT/MATTERS/BELONGS format). generate_summary now stores them alongside the summary, and tops up older summaries on next view. get_explanation returns them only when all three are present. * API: share_page + /api/summary expose the explanation. * share.py: renders the three-part section (accent rule) when complete; otherwise the single "Why it's here" reason line is the calm fallback. The page polls and swaps in both the summary and the section as they cache. Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
173 lines
7.3 KiB
Python
173 lines
7.3 KiB
Python
"""On-demand, cached article summaries.
|
|
|
|
Lazy: generated only for articles that actually get shared/viewed, then cached in
|
|
article_summaries forever. We fetch the article text *transiently* and ask the
|
|
local LLM for a short, ORIGINAL summary in our own words. We store only that
|
|
summary — never the publisher's article body — and the page always credits and
|
|
links to the source.
|
|
"""
|
|
|
|
from __future__ import annotations
|
|
|
|
import re
|
|
import sqlite3
|
|
import urllib.error
|
|
import urllib.request
|
|
from urllib.parse import urljoin, urlsplit
|
|
|
|
from .enrich import USER_AGENT, _NoRedirect, _host_is_public
|
|
from .llm import LocalModelClient
|
|
|
|
_FETCH_BYTES = 600_000
|
|
_FETCH_TIMEOUT = 8
|
|
_MAX_REDIRECTS = 3
|
|
_DROP = re.compile(rb"<(script|style|noscript|template|svg)[^>]*>.*?</\1>", re.IGNORECASE | re.DOTALL)
|
|
_TAGS = re.compile(rb"<[^>]+>")
|
|
_WS = re.compile(r"\s+")
|
|
|
|
_SYSTEM = (
|
|
"You write a short, ORIGINAL summary of a news story for a calm, constructive news "
|
|
"site. Summarize the underlying FACTS in your own words. Do NOT quote, and do not "
|
|
"closely paraphrase the article's sentences — no lifted phrases. 2 to 4 plain "
|
|
"sentences covering what happened and why it's encouraging. No preamble, no markdown, "
|
|
"no headline, no opinion — just the factual summary."
|
|
)
|
|
|
|
|
|
def _fetch_text(url: str) -> str:
    """SSRF-guarded fetch of an article page, reduced to plain text (capped).

    Redirects are followed by hand (at most ``_MAX_REDIRECTS`` hops) so that the
    scheme and host of every hop are re-validated before a request is sent.
    Any failure is treated as "no text available" rather than raised, so the
    caller can fall back to other material.

    Args:
        url: Absolute http(s) URL of the article page.

    Returns:
        Up to 4000 characters of whitespace-collapsed text with markup removed,
        or ``""`` when the URL is malformed or rejected, the request or the body
        read fails, the response is not HTML, or the redirect budget runs out.
    """
    import http.client  # local import: only the exception type is needed here

    # NOTE(review): _NoRedirect is presumed to stop urllib from following
    # redirects on its own, so 3xx responses reach the status check below.
    opener = urllib.request.build_opener(_NoRedirect)
    for _ in range(_MAX_REDIRECTS + 1):
        if not url:
            return ""
        try:
            parts = urlsplit(url)
        except ValueError:
            # e.g. an unbalanced "[" in the host part
            return ""
        if parts.scheme not in ("http", "https") or not _host_is_public(parts.hostname):
            return ""
        try:
            request = urllib.request.Request(
                url, headers={"User-Agent": USER_AGENT, "Accept": "text/html"}
            )
            response = opener.open(request, timeout=_FETCH_TIMEOUT)
        except urllib.error.HTTPError as err:
            # An HTTPError doubles as the response object; release its socket.
            err.close()
            return ""
        except (urllib.error.URLError, OSError, ValueError, http.client.HTTPException):
            return ""
        status = getattr(response, "status", 200) or 200
        if status in (301, 302, 303, 307, 308):
            location = response.headers.get("Location")
            response.close()
            if not location:
                return ""
            try:
                url = urljoin(url, location)
            except ValueError:
                # a malformed Location header must not crash the fetch
                return ""
            continue
        if "html" not in response.headers.get("Content-Type", "").lower():
            response.close()
            return ""
        try:
            raw = response.read(_FETCH_BYTES)
        except (OSError, http.client.HTTPException):
            # timeout, connection reset or truncated body while reading
            return ""
        finally:
            response.close()
        raw = _DROP.sub(b" ", raw)
        text = _TAGS.sub(b" ", raw).decode("utf-8", "replace")
        return _WS.sub(" ", text).strip()[:4000]
    return ""
|
|
|
|
|
|
def summarize_article(client: LocalModelClient, title: str, snippet: str, body_text: str) -> str:
    """Ask the model for a short summary of one story.

    The prompt material is the first non-empty of ``body_text``, ``snippet`` and
    ``title``, cut to 4000 characters. The reply is stripped and cut to 1200
    characters; an empty or missing reply yields ``""``.
    """
    source_text = body_text or snippet or title or ""
    material = source_text[:4000]
    user = f"Title: {title}\n\nArticle text:\n{material}\n\nWrite the summary."
    reply = client.chat_text(
        [
            {"role": "system", "content": _SYSTEM},
            {"role": "user", "content": user},
        ]
    )
    if not reply:
        return ""
    return reply.strip()[:1200]
|
|
|
|
|
|
_EXPLAIN_SYSTEM = (
|
|
"You are the calm editor of a constructive-news site. For the given story write three "
|
|
"very short, plain-language notes — 1 to 2 factual sentences each, in your own words, no "
|
|
"markdown, no preamble, no quotes, no hype. Use EXACTLY this format and these labels, each "
|
|
"on its own line:\n"
|
|
"WHAT: <what actually happened — the plain gist>\n"
|
|
"MATTERS: <why it matters — the real-world, constructive significance>\n"
|
|
"BELONGS: <why it fits a calm, constructive news site — the human benefit, agency, or "
|
|
"grounded hope in it>"
|
|
)
|
|
|
|
|
|
def _parse_explain(text: str) -> dict:
|
|
def grab(label: str) -> str | None:
|
|
m = re.search(rf"\b{label}\s*:\s*(.+?)(?=\n\s*[A-Z]+\s*:|\Z)", text or "", re.IGNORECASE | re.DOTALL)
|
|
if not m:
|
|
return None
|
|
val = _WS.sub(" ", m.group(1)).strip().strip("-•* ").strip()
|
|
return val[:400] or None
|
|
|
|
return {"what_happened": grab("WHAT"), "why_matters": grab("MATTERS"), "why_belongs": grab("BELONGS")}
|
|
|
|
|
|
def explain_article(client: LocalModelClient, title: str, snippet: str, body_text: str) -> dict:
    """Three short editorial notes (what happened / why it matters / why it belongs).

    The prompt material is the first non-empty of ``body_text``, ``snippet`` and
    ``title``, cut to 4000 characters. The model reply (or ``""`` when there is
    none) is handed to ``_parse_explain``, whose dict is returned unchanged.
    """
    source_text = body_text or snippet or title or ""
    material = source_text[:4000]
    user = f"Title: {title}\n\nArticle text:\n{material}\n\nWrite the three notes."
    messages = [
        {"role": "system", "content": _EXPLAIN_SYSTEM},
        {"role": "user", "content": user},
    ]
    reply = client.chat_text(messages)
    return _parse_explain(reply or "")
|
|
|
|
|
|
def get_summary(conn: sqlite3.Connection, article_id: int) -> str | None:
    """Return the cached summary for ``article_id``, or ``None`` if no row exists.

    Rows are read by column name, so ``conn`` must use a name-addressable row
    factory such as ``sqlite3.Row``.
    """
    cursor = conn.execute(
        "SELECT summary FROM article_summaries WHERE article_id = ?", (article_id,)
    )
    hit = cursor.fetchone()
    if hit is None:
        return None
    return hit["summary"]
|
|
|
|
|
|
def get_explanation(conn: sqlite3.Connection, article_id: int) -> dict | None:
    """The structured 'Why it belongs' notes — only if all three are present (else
    the page falls back to summary + reason_text).

    Returns a dict keyed ``what_happened`` / ``why_matters`` / ``why_belongs``,
    or ``None`` when the article has no row or any of the three notes is empty.
    """
    hit = conn.execute(
        "SELECT what_happened, why_matters, why_belongs FROM article_summaries WHERE article_id = ?",
        (article_id,),
    ).fetchone()
    if hit is None:
        return None
    required = ("what_happened", "why_matters", "why_belongs")
    # A single missing or empty note makes the whole explanation unusable.
    if not all(hit[column] for column in required):
        return None
    return dict(hit)
|
|
|
|
|
|
def generate_summary(conn: sqlite3.Connection, article_id: int, client: LocalModelClient | None = None) -> str | None:
    """Generate + cache a summary AND the structured explanation for one article.

    Idempotent: a fully-cached article (summary + all three explanation notes)
    is returned as-is. "Fully cached" uses the same rule as ``get_explanation``
    (every note present), so a row whose explanation is missing or only partly
    parsed is topped up on the next call instead of being treated as complete.
    An already-cached summary is reused during a top-up; only the explanation
    is regenerated.

    Args:
        conn: Database connection; rows are read by column name, so it must use
            a name-addressable row factory such as ``sqlite3.Row``.
        article_id: Id of the article to summarize.
        client: Model client to use; built with ``LocalModelClient.from_env()``
            when omitted.

    Returns:
        The summary text. When the article is unknown, is a duplicate, or is
        not accepted, nothing is generated and the cached summary (if any) is
        returned. ``None`` is returned when no summary exists or could be
        produced.
    """
    cached = conn.execute(
        "SELECT summary, what_happened, why_matters, why_belongs "
        "FROM article_summaries WHERE article_id = ?",
        (article_id,),
    ).fetchone()
    cached_summary = cached["summary"] if cached else None
    if (
        cached_summary
        and cached["what_happened"]
        and cached["why_matters"]
        and cached["why_belongs"]
    ):
        return cached_summary  # summary + a complete explanation already cached
    row = conn.execute(
        "SELECT a.title, a.description, a.canonical_url, a.duplicate_of, s.accepted "
        "FROM articles a LEFT JOIN article_scores s ON s.article_id = a.id WHERE a.id = ?",
        (article_id,),
    ).fetchone()
    if not row or row["duplicate_of"] is not None or not row["accepted"]:
        return cached_summary
    client = client or LocalModelClient.from_env()
    body = _fetch_text(row["canonical_url"])
    # Reuse a cached summary; regenerate when there is none (including a row
    # whose stored summary is empty).
    summary = cached_summary or summarize_article(
        client, row["title"], row["description"] or "", body
    )
    if not summary:
        return None
    ex = explain_article(client, row["title"], row["description"] or "", body)
    conn.execute(
        "INSERT OR REPLACE INTO article_summaries "
        "(article_id, summary, what_happened, why_matters, why_belongs, model) VALUES (?, ?, ?, ?, ?, ?)",
        (article_id, summary, ex["what_happened"], ex["why_matters"], ex["why_belongs"], client.model),
    )
    conn.commit()
    # Attention-triggered image enrichment: a summarized article is one a reader
    # has reached, so it's worth a real image. Best-effort — an image fetch
    # failure must never break summarization.
    try:
        from .enrich import enrich_article_image

        enrich_article_image(conn, article_id)
    except Exception:
        # Deliberately swallowed: the summary is already committed at this point.
        pass
    return summary
|