Sparse-day-proof briefs, feed health check, and 16 new sources

- Briefs now fill from a rolling window (prefer today, backfill up to window_days) and exclude anything featured in the last 7 days of briefs, so slow days still produce five items without stories lingering day to day.
- New 'check-feeds' command fetches and parses every active feed (every feed with --all) to catch dead ones.
- Added 16 validated sources (science, environment, animals, culture), expanding coverage from 12 to 28 feeds to reduce staleness.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
2026-05-30 15:30:03 +00:00
parent cef272a8fc
commit 2a9c49e2a9
3 changed files with 192 additions and 6 deletions
@@ -118,3 +118,147 @@ pr_risk_score = 4
 poll_interval_minutes = 120
 notes = "University research stories; watch PR framing."
 [[sources]]
 name = "Phys.org"
 feed_url = "https://phys.org/rss-feed/"
 default_category = "science"
 trust_score = 7
 pr_risk_score = 3
 poll_interval_minutes = 120
 notes = "Broad science aggregator."
 [[sources]]
 name = "Nature News"
 feed_url = "https://www.nature.com/nature.rss"
 default_category = "science"
 trust_score = 9
 pr_risk_score = 2
 poll_interval_minutes = 180
 notes = "Top-tier science journal news."
 [[sources]]
 name = "Ars Technica Science"
 feed_url = "https://feeds.arstechnica.com/arstechnica/science"
 default_category = "science"
 trust_score = 7
 pr_risk_score = 3
 poll_interval_minutes = 120
 notes = "Science/tech reporting."
 [[sources]]
 name = "The Guardian Science"
 feed_url = "https://www.theguardian.com/science/rss"
 default_category = "science"
 trust_score = 8
 pr_risk_score = 3
 poll_interval_minutes = 120
 notes = "Mainstream science; needs filtering."
 [[sources]]
 name = "The Guardian Environment"
 feed_url = "https://www.theguardian.com/environment/rss"
 default_category = "environment"
 trust_score = 8
 pr_risk_score = 3
 poll_interval_minutes = 120
 notes = "Environment; needs filtering."
 [[sources]]
 name = "ScienceDaily Plants & Animals"
 feed_url = "https://www.sciencedaily.com/rss/plants_animals.xml"
 default_category = "animals"
 trust_score = 6
 pr_risk_score = 3
 poll_interval_minutes = 120
 notes = "Animal/biology discoveries."
 [[sources]]
 name = "ScienceDaily Space & Time"
 feed_url = "https://www.sciencedaily.com/rss/space_time.xml"
 default_category = "science"
 trust_score = 6
 pr_risk_score = 3
 poll_interval_minutes = 120
 notes = "Space discoveries."
 [[sources]]
 name = "Smithsonian Magazine"
 feed_url = "https://www.smithsonianmag.com/rss/latest_articles/"
 default_category = "culture"
 trust_score = 7
 pr_risk_score = 3
 poll_interval_minutes = 180
 notes = "Culture, history, science human-interest."
 [[sources]]
 name = "Yale Environment 360"
 feed_url = "https://e360.yale.edu/feed.xml"
 default_category = "environment"
 trust_score = 8
 pr_risk_score = 2
 poll_interval_minutes = 180
 notes = "In-depth environment/solutions."
 [[sources]]
 name = "Anthropocene Magazine"
 feed_url = "https://www.anthropocenemagazine.org/feed/"
 default_category = "environment"
 trust_score = 7
 pr_risk_score = 3
 poll_interval_minutes = 180
 notes = "Sustainability solutions journalism."
 [[sources]]
 name = "The Conversation (US)"
 feed_url = "https://theconversation.com/us/articles.atom"
 default_category = "science"
 trust_score = 7
 pr_risk_score = 2
 poll_interval_minutes = 120
 notes = "Academic-authored explainers."
 [[sources]]
 name = "Colossal"
 feed_url = "https://www.thisiscolossal.com/feed/"
 default_category = "culture"
 trust_score = 6
 pr_risk_score = 3
 poll_interval_minutes = 240
 notes = "Art and visual culture."
 [[sources]]
 name = "Atlas Obscura"
 feed_url = "https://www.atlasobscura.com/feeds/latest"
 default_category = "culture"
 trust_score = 6
 pr_risk_score = 3
 poll_interval_minutes = 240
 notes = "Curiosities, places, culture."
 [[sources]]
 name = "New Scientist"
 feed_url = "https://www.newscientist.com/feed/home/"
 default_category = "science"
 trust_score = 7
 pr_risk_score = 3
 poll_interval_minutes = 120
 notes = "Science weekly."
 [[sources]]
 name = "ScienceAlert"
 feed_url = "https://www.sciencealert.com/feed"
 default_category = "science"
 trust_score = 6
 pr_risk_score = 4
 poll_interval_minutes = 120
 notes = "Pop science; watch hype."
 [[sources]]
 name = "Grist"
 feed_url = "https://grist.org/feed/"
 default_category = "environment"
 trust_score = 6
 pr_risk_score = 3
 poll_interval_minutes = 180
 notes = "Climate solutions reporting."
@@ -9,6 +9,7 @@ def build_daily_brief(
    brief_date: str | None = None,
    limit: int = 5,
    replace: bool = False,
    window_days: int = 3,
 ) -> int:
    target_date = brief_date or date.today().isoformat()
    existing = conn.execute("SELECT id FROM daily_briefs WHERE brief_date = ?", (target_date,)).fetchone()
@@ -22,7 +23,7 @@ def build_daily_brief(
        (target_date, f"Five Good Things Today - {target_date}"),
    ).lastrowid
-    rows = _candidate_articles(conn, target_date)
+    rows = _candidate_articles(conn, target_date, window_days)
    selected = _select_diverse(rows, limit)
    for index, row in enumerate(selected, start=1):
        conn.execute(
@@ -78,7 +79,17 @@ def show_brief(conn: sqlite3.Connection, brief_date: str | None = None, limit: i
    ).fetchall()
-def _candidate_articles(conn: sqlite3.Connection, target_date: str) -> list[sqlite3.Row]:
+def _candidate_articles(
    conn: sqlite3.Connection, target_date: str, window_days: int = 3
 ) -> list[sqlite3.Row]:
    """Brief candidates, sparse-day-proof.
    Prefers articles dated on target_date, but widens to the preceding
    `window_days` so the brief still fills on slow news days. Anything already
    featured in a brief within the last 7 days (other than this same date, which
    is being rebuilt) is excluded so backfilled stories cannot linger across
    consecutive days.
    """
    return conn.execute(
        """
        SELECT
@@ -100,19 +111,31 @@ def _candidate_articles(conn: sqlite3.Connection, target_date: str) -> list[sqli
            s.pr_risk_score,
            s.reason_code,
            s.reason_text,
-            s.model_name
+            s.model_name,
            CASE WHEN date(COALESCE(a.published_at, a.discovered_at)) = date(?)
                 THEN 1 ELSE 0 END AS is_today
        FROM articles a
        JOIN sources src ON src.id = a.source_id
        JOIN article_scores s ON s.article_id = a.id
        WHERE s.accepted = 1
-          AND date(COALESCE(a.published_at, a.discovered_at)) = date(?)
+          AND date(COALESCE(a.published_at, a.discovered_at)) <= date(?)
          AND date(COALESCE(a.published_at, a.discovered_at)) > date(?, '-' || ? || ' days')
          AND a.id NOT IN (
              SELECT bi.article_id
              FROM daily_brief_items bi
              JOIN daily_briefs b ON b.id = bi.brief_id
              WHERE b.brief_date <> ?
                AND b.brief_date <= date(?)
                AND b.brief_date > date(?, '-7 days')
          )
        ORDER BY
            is_today DESC,
            (s.constructive_score + s.agency_score + s.human_benefit_score + src.trust_score
             - s.cortisol_score - s.ragebait_score - s.pr_risk_score) DESC,
            COALESCE(a.published_at, a.discovered_at) DESC
        LIMIT 50
        """,
-        (target_date,),
+        (target_date, target_date, target_date, window_days, target_date, target_date, target_date),
    ).fetchall()
@@ -8,7 +8,7 @@ from pathlib import Path
 from .briefs import build_daily_brief, show_brief
 from .db import connect, init_db
-from .feeds import poll_all_sources, poll_due_sources, poll_source
+from .feeds import fetch_feed, parse_feed, poll_all_sources, poll_due_sources, poll_source
 from .llm import LocalModelClient, classify_articles
 from .scoring import score_article
 from .sources import load_sources, upsert_sources
@@ -48,6 +48,9 @@ def main() -> None:
    subparsers.add_parser("source-report", help="Show source-level ingestion and scoring stats")
    check_feeds_parser = subparsers.add_parser("check-feeds", help="Fetch and parse each feed, reporting health")
    check_feeds_parser.add_argument("--all", action="store_true", help="Include inactive sources")
    runs_parser = subparsers.add_parser("list-runs", help="Show recent ingest runs")
    runs_parser.add_argument("--limit", type=int, default=20)
@@ -122,6 +125,8 @@ def main() -> None:
        list_category(conn, topic=args.topic, flavor=args.flavor, limit=args.limit, accepted_only=not args.all)
    elif args.command == "source-report":
        source_report(conn)
    elif args.command == "check-feeds":
        check_feeds(conn, include_inactive=args.all)
    elif args.command == "list-runs":
        list_runs(conn, limit=args.limit)
    elif args.command == "rescore":
@@ -214,6 +219,20 @@ def list_recent(conn: sqlite3.Connection, limit: int, accepted_only: bool) -> No
        print(f"  {row['canonical_url']}")
 def check_feeds(conn: sqlite3.Connection, include_inactive: bool = False) -> None:
    """Fetch and parse each source's feed and print a health report.

    Prints one ``OK``/``FAIL`` line per source, ordered by name, followed by a
    summary line giving the healthy/total count.

    Args:
        conn: Database connection. Rows must support access by column name
            (``row["name"]``, ``row["feed_url"]``).
        include_inactive: When true, every row in ``sources`` is checked;
            otherwise only rows with ``active = 1``.
    """
    # `where` is one of two fixed literals, never caller-supplied text, so
    # interpolating it into the statement is safe.
    where = "" if include_inactive else "WHERE active = 1"
    rows = conn.execute(f"SELECT name, feed_url FROM sources {where} ORDER BY name").fetchall()
    ok = 0
    for row in rows:
        try:
            items = parse_feed(fetch_feed(row["feed_url"]))
            # Measure inside the try so a parser result that cannot be sized
            # is reported as a failure instead of slipping through.
            item_count = len(items)
        except Exception as exc:
            # Deliberately broad: this is a reporting boundary, and one bad
            # feed must not stop the remaining feeds from being checked.
            print(f"FAIL {row['name']}: {exc}")
        else:
            # Count the feed only once every step has succeeded, so a feed can
            # never be both printed as FAIL and tallied as healthy.
            ok += 1
            print(f"OK   {row['name']}: {item_count} items")
    print(f"--- {ok}/{len(rows)} feeds healthy ---")
 def run_cycle(conn: sqlite3.Connection, args: argparse.Namespace) -> None:
    """One end-to-end pass for a scheduler: poll due sources, classify the new
    arrivals, rebuild today's brief. Each step is independent and non-fatal so a