Sparse-day-proof briefs, feed health check, and 16 new sources

- Briefs now fill from a rolling window (prefer today's articles, then backfill
  from earlier days inside a window_days-day window ending on the brief date)
  and exclude anything featured in another brief within the last 7 days, so
  slow days still produce five items without stories lingering day to day.
- New 'check-feeds' command fetches and parses every feed to catch dead ones.
- Added 16 validated sources (science, environment, animals, culture),
  expanding coverage from 12 to 28 feeds to reduce staleness.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
This commit is contained in:
jay
2026-05-30 15:30:03 +00:00
parent cef272a8fc
commit 2a9c49e2a9
3 changed files with 192 additions and 6 deletions
+144
View File
@@ -118,3 +118,147 @@ pr_risk_score = 4
poll_interval_minutes = 120 poll_interval_minutes = 120
notes = "University research stories; watch PR framing." notes = "University research stories; watch PR framing."
[[sources]]
name = "Phys.org"
feed_url = "https://phys.org/rss-feed/"
default_category = "science"
trust_score = 7
pr_risk_score = 3
poll_interval_minutes = 120
notes = "Broad science aggregator."
[[sources]]
name = "Nature News"
feed_url = "https://www.nature.com/nature.rss"
default_category = "science"
trust_score = 9
pr_risk_score = 2
poll_interval_minutes = 180
notes = "Top-tier science journal news."
[[sources]]
name = "Ars Technica Science"
feed_url = "https://feeds.arstechnica.com/arstechnica/science"
default_category = "science"
trust_score = 7
pr_risk_score = 3
poll_interval_minutes = 120
notes = "Science/tech reporting."
[[sources]]
name = "The Guardian Science"
feed_url = "https://www.theguardian.com/science/rss"
default_category = "science"
trust_score = 8
pr_risk_score = 3
poll_interval_minutes = 120
notes = "Mainstream science; needs filtering."
[[sources]]
name = "The Guardian Environment"
feed_url = "https://www.theguardian.com/environment/rss"
default_category = "environment"
trust_score = 8
pr_risk_score = 3
poll_interval_minutes = 120
notes = "Environment; needs filtering."
[[sources]]
name = "ScienceDaily Plants & Animals"
feed_url = "https://www.sciencedaily.com/rss/plants_animals.xml"
default_category = "animals"
trust_score = 6
pr_risk_score = 3
poll_interval_minutes = 120
notes = "Animal/biology discoveries."
[[sources]]
name = "ScienceDaily Space & Time"
feed_url = "https://www.sciencedaily.com/rss/space_time.xml"
default_category = "science"
trust_score = 6
pr_risk_score = 3
poll_interval_minutes = 120
notes = "Space discoveries."
[[sources]]
name = "Smithsonian Magazine"
feed_url = "https://www.smithsonianmag.com/rss/latest_articles/"
default_category = "culture"
trust_score = 7
pr_risk_score = 3
poll_interval_minutes = 180
notes = "Culture, history, science human-interest."
[[sources]]
name = "Yale Environment 360"
feed_url = "https://e360.yale.edu/feed.xml"
default_category = "environment"
trust_score = 8
pr_risk_score = 2
poll_interval_minutes = 180
notes = "In-depth environment/solutions."
[[sources]]
name = "Anthropocene Magazine"
feed_url = "https://www.anthropocenemagazine.org/feed/"
default_category = "environment"
trust_score = 7
pr_risk_score = 3
poll_interval_minutes = 180
notes = "Sustainability solutions journalism."
[[sources]]
name = "The Conversation (US)"
feed_url = "https://theconversation.com/us/articles.atom"
default_category = "science"
trust_score = 7
pr_risk_score = 2
poll_interval_minutes = 120
notes = "Academic-authored explainers."
[[sources]]
name = "Colossal"
feed_url = "https://www.thisiscolossal.com/feed/"
default_category = "culture"
trust_score = 6
pr_risk_score = 3
poll_interval_minutes = 240
notes = "Art and visual culture."
[[sources]]
name = "Atlas Obscura"
feed_url = "https://www.atlasobscura.com/feeds/latest"
default_category = "culture"
trust_score = 6
pr_risk_score = 3
poll_interval_minutes = 240
notes = "Curiosities, places, culture."
[[sources]]
name = "New Scientist"
feed_url = "https://www.newscientist.com/feed/home/"
default_category = "science"
trust_score = 7
pr_risk_score = 3
poll_interval_minutes = 120
notes = "Science weekly."
[[sources]]
name = "ScienceAlert"
feed_url = "https://www.sciencealert.com/feed"
default_category = "science"
trust_score = 6
pr_risk_score = 4
poll_interval_minutes = 120
notes = "Pop science; watch hype."
[[sources]]
name = "Grist"
feed_url = "https://grist.org/feed/"
default_category = "environment"
trust_score = 6
pr_risk_score = 3
poll_interval_minutes = 180
notes = "Climate solutions reporting."
+28 -5
View File
@@ -9,6 +9,7 @@ def build_daily_brief(
brief_date: str | None = None, brief_date: str | None = None,
limit: int = 5, limit: int = 5,
replace: bool = False, replace: bool = False,
window_days: int = 3,
) -> int: ) -> int:
target_date = brief_date or date.today().isoformat() target_date = brief_date or date.today().isoformat()
existing = conn.execute("SELECT id FROM daily_briefs WHERE brief_date = ?", (target_date,)).fetchone() existing = conn.execute("SELECT id FROM daily_briefs WHERE brief_date = ?", (target_date,)).fetchone()
@@ -22,7 +23,7 @@ def build_daily_brief(
(target_date, f"Five Good Things Today - {target_date}"), (target_date, f"Five Good Things Today - {target_date}"),
).lastrowid ).lastrowid
rows = _candidate_articles(conn, target_date) rows = _candidate_articles(conn, target_date, window_days)
selected = _select_diverse(rows, limit) selected = _select_diverse(rows, limit)
for index, row in enumerate(selected, start=1): for index, row in enumerate(selected, start=1):
conn.execute( conn.execute(
@@ -78,7 +79,17 @@ def show_brief(conn: sqlite3.Connection, brief_date: str | None = None, limit: i
).fetchall() ).fetchall()
def _candidate_articles(conn: sqlite3.Connection, target_date: str) -> list[sqlite3.Row]: def _candidate_articles(
conn: sqlite3.Connection, target_date: str, window_days: int = 3
) -> list[sqlite3.Row]:
"""Brief candidates, sparse-day-proof.
Prefers articles dated on target_date, but widens to the preceding
`window_days` so the brief still fills on slow news days. Anything already
featured in a brief within the last 7 days (other than this same date, which
is being rebuilt) is excluded so backfilled stories cannot linger across
consecutive days.
"""
return conn.execute( return conn.execute(
""" """
SELECT SELECT
@@ -100,19 +111,31 @@ def _candidate_articles(conn: sqlite3.Connection, target_date: str) -> list[sqli
s.pr_risk_score, s.pr_risk_score,
s.reason_code, s.reason_code,
s.reason_text, s.reason_text,
s.model_name s.model_name,
CASE WHEN date(COALESCE(a.published_at, a.discovered_at)) = date(?)
THEN 1 ELSE 0 END AS is_today
FROM articles a FROM articles a
JOIN sources src ON src.id = a.source_id JOIN sources src ON src.id = a.source_id
JOIN article_scores s ON s.article_id = a.id JOIN article_scores s ON s.article_id = a.id
WHERE s.accepted = 1 WHERE s.accepted = 1
AND date(COALESCE(a.published_at, a.discovered_at)) = date(?) AND date(COALESCE(a.published_at, a.discovered_at)) <= date(?)
AND date(COALESCE(a.published_at, a.discovered_at)) > date(?, '-' || ? || ' days')
AND a.id NOT IN (
SELECT bi.article_id
FROM daily_brief_items bi
JOIN daily_briefs b ON b.id = bi.brief_id
WHERE b.brief_date <> ?
AND b.brief_date <= date(?)
AND b.brief_date > date(?, '-7 days')
)
ORDER BY ORDER BY
is_today DESC,
(s.constructive_score + s.agency_score + s.human_benefit_score + src.trust_score (s.constructive_score + s.agency_score + s.human_benefit_score + src.trust_score
- s.cortisol_score - s.ragebait_score - s.pr_risk_score) DESC, - s.cortisol_score - s.ragebait_score - s.pr_risk_score) DESC,
COALESCE(a.published_at, a.discovered_at) DESC COALESCE(a.published_at, a.discovered_at) DESC
LIMIT 50 LIMIT 50
""", """,
(target_date,), (target_date, target_date, target_date, window_days, target_date, target_date, target_date),
).fetchall() ).fetchall()
+20 -1
View File
@@ -8,7 +8,7 @@ from pathlib import Path
from .briefs import build_daily_brief, show_brief from .briefs import build_daily_brief, show_brief
from .db import connect, init_db from .db import connect, init_db
from .feeds import poll_all_sources, poll_due_sources, poll_source from .feeds import fetch_feed, parse_feed, poll_all_sources, poll_due_sources, poll_source
from .llm import LocalModelClient, classify_articles from .llm import LocalModelClient, classify_articles
from .scoring import score_article from .scoring import score_article
from .sources import load_sources, upsert_sources from .sources import load_sources, upsert_sources
@@ -48,6 +48,9 @@ def main() -> None:
subparsers.add_parser("source-report", help="Show source-level ingestion and scoring stats") subparsers.add_parser("source-report", help="Show source-level ingestion and scoring stats")
check_feeds_parser = subparsers.add_parser("check-feeds", help="Fetch and parse each feed, reporting health")
check_feeds_parser.add_argument("--all", action="store_true", help="Include inactive sources")
runs_parser = subparsers.add_parser("list-runs", help="Show recent ingest runs") runs_parser = subparsers.add_parser("list-runs", help="Show recent ingest runs")
runs_parser.add_argument("--limit", type=int, default=20) runs_parser.add_argument("--limit", type=int, default=20)
@@ -122,6 +125,8 @@ def main() -> None:
list_category(conn, topic=args.topic, flavor=args.flavor, limit=args.limit, accepted_only=not args.all) list_category(conn, topic=args.topic, flavor=args.flavor, limit=args.limit, accepted_only=not args.all)
elif args.command == "source-report": elif args.command == "source-report":
source_report(conn) source_report(conn)
elif args.command == "check-feeds":
check_feeds(conn, include_inactive=args.all)
elif args.command == "list-runs": elif args.command == "list-runs":
list_runs(conn, limit=args.limit) list_runs(conn, limit=args.limit)
elif args.command == "rescore": elif args.command == "rescore":
@@ -214,6 +219,20 @@ def list_recent(conn: sqlite3.Connection, limit: int, accepted_only: bool) -> No
print(f" {row['canonical_url']}") print(f" {row['canonical_url']}")
def check_feeds(conn: sqlite3.Connection, include_inactive: bool = False) -> None:
    """Fetch and parse every source feed, printing one health line per feed.

    Args:
        conn: Open database connection. Result rows must support lookup by
            column name, since ``name`` and ``feed_url`` are read that way.
        include_inactive: When true, every row of ``sources`` is checked;
            otherwise only rows with ``active = 1``.

    For each source an ``OK`` line with the parsed item count or a ``FAIL``
    line with the error text is printed, followed by a summary of how many
    feeds were healthy out of the total checked.
    """
    if include_inactive:
        where = ""
    else:
        where = "WHERE active = 1"
    query = f"SELECT name, feed_url FROM sources {where} ORDER BY name"
    sources = conn.execute(query).fetchall()

    healthy = 0
    for source in sources:
        # A failing feed is reported and skipped so the remaining feeds are
        # still checked.
        try:
            payload = fetch_feed(source["feed_url"])
            entries = parse_feed(payload)
            healthy += 1
            print(f"OK {source['name']}: {len(entries)} items")
        except Exception as error:
            print(f"FAIL {source['name']}: {error}")

    print(f"--- {healthy}/{len(sources)} feeds healthy ---")
def run_cycle(conn: sqlite3.Connection, args: argparse.Namespace) -> None: def run_cycle(conn: sqlite3.Connection, args: argparse.Namespace) -> None:
"""One end-to-end pass for a scheduler: poll due sources, classify the new """One end-to-end pass for a scheduler: poll due sources, classify the new
arrivals, rebuild today's brief. Each step is independent and non-fatal so a arrivals, rebuild today's brief. Each step is independent and non-fatal so a