diff --git a/config/sources.toml b/config/sources.toml index f25e98e..49e150b 100644 --- a/config/sources.toml +++ b/config/sources.toml @@ -118,3 +118,147 @@ pr_risk_score = 4 poll_interval_minutes = 120 notes = "University research stories; watch PR framing." + +[[sources]] +name = "Phys.org" +feed_url = "https://phys.org/rss-feed/" +default_category = "science" +trust_score = 7 +pr_risk_score = 3 +poll_interval_minutes = 120 +notes = "Broad science aggregator." + +[[sources]] +name = "Nature News" +feed_url = "https://www.nature.com/nature.rss" +default_category = "science" +trust_score = 9 +pr_risk_score = 2 +poll_interval_minutes = 180 +notes = "Top-tier science journal news." + +[[sources]] +name = "Ars Technica Science" +feed_url = "https://feeds.arstechnica.com/arstechnica/science" +default_category = "science" +trust_score = 7 +pr_risk_score = 3 +poll_interval_minutes = 120 +notes = "Science/tech reporting." + +[[sources]] +name = "The Guardian Science" +feed_url = "https://www.theguardian.com/science/rss" +default_category = "science" +trust_score = 8 +pr_risk_score = 3 +poll_interval_minutes = 120 +notes = "Mainstream science; needs filtering." + +[[sources]] +name = "The Guardian Environment" +feed_url = "https://www.theguardian.com/environment/rss" +default_category = "environment" +trust_score = 8 +pr_risk_score = 3 +poll_interval_minutes = 120 +notes = "Environment; needs filtering." + +[[sources]] +name = "ScienceDaily Plants & Animals" +feed_url = "https://www.sciencedaily.com/rss/plants_animals.xml" +default_category = "animals" +trust_score = 6 +pr_risk_score = 3 +poll_interval_minutes = 120 +notes = "Animal/biology discoveries." + +[[sources]] +name = "ScienceDaily Space & Time" +feed_url = "https://www.sciencedaily.com/rss/space_time.xml" +default_category = "science" +trust_score = 6 +pr_risk_score = 3 +poll_interval_minutes = 120 +notes = "Space discoveries." 
+ +[[sources]] +name = "Smithsonian Magazine" +feed_url = "https://www.smithsonianmag.com/rss/latest_articles/" +default_category = "culture" +trust_score = 7 +pr_risk_score = 3 +poll_interval_minutes = 180 +notes = "Culture, history, science human-interest." + +[[sources]] +name = "Yale Environment 360" +feed_url = "https://e360.yale.edu/feed.xml" +default_category = "environment" +trust_score = 8 +pr_risk_score = 2 +poll_interval_minutes = 180 +notes = "In-depth environment/solutions." + +[[sources]] +name = "Anthropocene Magazine" +feed_url = "https://www.anthropocenemagazine.org/feed/" +default_category = "environment" +trust_score = 7 +pr_risk_score = 3 +poll_interval_minutes = 180 +notes = "Sustainability solutions journalism." + +[[sources]] +name = "The Conversation (US)" +feed_url = "https://theconversation.com/us/articles.atom" +default_category = "science" +trust_score = 7 +pr_risk_score = 2 +poll_interval_minutes = 120 +notes = "Academic-authored explainers." + +[[sources]] +name = "Colossal" +feed_url = "https://www.thisiscolossal.com/feed/" +default_category = "culture" +trust_score = 6 +pr_risk_score = 3 +poll_interval_minutes = 240 +notes = "Art and visual culture." + +[[sources]] +name = "Atlas Obscura" +feed_url = "https://www.atlasobscura.com/feeds/latest" +default_category = "culture" +trust_score = 6 +pr_risk_score = 3 +poll_interval_minutes = 240 +notes = "Curiosities, places, culture." + +[[sources]] +name = "New Scientist" +feed_url = "https://www.newscientist.com/feed/home/" +default_category = "science" +trust_score = 7 +pr_risk_score = 3 +poll_interval_minutes = 120 +notes = "Science weekly." + +[[sources]] +name = "ScienceAlert" +feed_url = "https://www.sciencealert.com/feed" +default_category = "science" +trust_score = 6 +pr_risk_score = 4 +poll_interval_minutes = 120 +notes = "Pop science; watch hype." 
+ +[[sources]] +name = "Grist" +feed_url = "https://grist.org/feed/" +default_category = "environment" +trust_score = 6 +pr_risk_score = 3 +poll_interval_minutes = 180 +notes = "Climate solutions reporting." diff --git a/goodnews/briefs.py b/goodnews/briefs.py index e031ed7..1cbdd9f 100644 --- a/goodnews/briefs.py +++ b/goodnews/briefs.py @@ -9,6 +9,7 @@ def build_daily_brief( brief_date: str | None = None, limit: int = 5, replace: bool = False, + window_days: int = 3, ) -> int: target_date = brief_date or date.today().isoformat() existing = conn.execute("SELECT id FROM daily_briefs WHERE brief_date = ?", (target_date,)).fetchone() @@ -22,7 +23,7 @@ def build_daily_brief( (target_date, f"Five Good Things Today - {target_date}"), ).lastrowid - rows = _candidate_articles(conn, target_date) + rows = _candidate_articles(conn, target_date, window_days) selected = _select_diverse(rows, limit) for index, row in enumerate(selected, start=1): conn.execute( @@ -78,7 +79,17 @@ def show_brief(conn: sqlite3.Connection, brief_date: str | None = None, limit: i ).fetchall() -def _candidate_articles(conn: sqlite3.Connection, target_date: str) -> list[sqlite3.Row]: +def _candidate_articles( + conn: sqlite3.Connection, target_date: str, window_days: int = 3 +) -> list[sqlite3.Row]: + """Brief candidates, sparse-day-proof. + + Prefers articles dated on target_date, but widens to the `window_days`-day + window ending on (and including) target_date so the brief still fills on slow + news days. Articles featured in a different brief dated within the 7 days + ending on target_date are excluded so backfilled stories cannot linger across + consecutive days (target_date's own brief is being rebuilt, so it is skipped). + """ return conn.execute( """ SELECT @@ -100,19 +111,31 @@ def _candidate_articles(conn: sqlite3.Connection, target_date: str) -> list[sqli s.pr_risk_score, s.reason_code, s.reason_text, - s.model_name + s.model_name, + CASE WHEN date(COALESCE(a.published_at, a.discovered_at)) = date(?) 
+ THEN 1 ELSE 0 END AS is_today FROM articles a JOIN sources src ON src.id = a.source_id JOIN article_scores s ON s.article_id = a.id WHERE s.accepted = 1 - AND date(COALESCE(a.published_at, a.discovered_at)) = date(?) + AND date(COALESCE(a.published_at, a.discovered_at)) <= date(?) + AND date(COALESCE(a.published_at, a.discovered_at)) > date(?, '-' || ? || ' days') + AND a.id NOT IN ( + SELECT bi.article_id + FROM daily_brief_items bi + JOIN daily_briefs b ON b.id = bi.brief_id + WHERE b.brief_date <> ? + AND b.brief_date <= date(?) + AND b.brief_date > date(?, '-7 days') + ) ORDER BY + is_today DESC, (s.constructive_score + s.agency_score + s.human_benefit_score + src.trust_score - s.cortisol_score - s.ragebait_score - s.pr_risk_score) DESC, COALESCE(a.published_at, a.discovered_at) DESC LIMIT 50 """, - (target_date,), + (target_date, target_date, target_date, window_days, target_date, target_date, target_date), ).fetchall() diff --git a/goodnews/cli.py b/goodnews/cli.py index a72f003..0175e78 100644 --- a/goodnews/cli.py +++ b/goodnews/cli.py @@ -8,7 +8,7 @@ from pathlib import Path from .briefs import build_daily_brief, show_brief from .db import connect, init_db -from .feeds import poll_all_sources, poll_due_sources, poll_source +from .feeds import fetch_feed, parse_feed, poll_all_sources, poll_due_sources, poll_source from .llm import LocalModelClient, classify_articles from .scoring import score_article from .sources import load_sources, upsert_sources @@ -48,6 +48,9 @@ def main() -> None: subparsers.add_parser("source-report", help="Show source-level ingestion and scoring stats") + check_feeds_parser = subparsers.add_parser("check-feeds", help="Fetch and parse each feed, reporting health") + check_feeds_parser.add_argument("--all", action="store_true", help="Include inactive sources") + runs_parser = subparsers.add_parser("list-runs", help="Show recent ingest runs") runs_parser.add_argument("--limit", type=int, default=20) @@ -122,6 +125,8 @@ def main() -> 
None: list_category(conn, topic=args.topic, flavor=args.flavor, limit=args.limit, accepted_only=not args.all) elif args.command == "source-report": source_report(conn) + elif args.command == "check-feeds": + check_feeds(conn, include_inactive=args.all) elif args.command == "list-runs": list_runs(conn, limit=args.limit) elif args.command == "rescore": @@ -214,6 +219,20 @@ def list_recent(conn: sqlite3.Connection, limit: int, accepted_only: bool) -> No print(f" {row['canonical_url']}") +def check_feeds(conn: sqlite3.Connection, include_inactive: bool = False) -> None: + where = "" if include_inactive else "WHERE active = 1" + rows = conn.execute(f"SELECT name, feed_url FROM sources {where} ORDER BY name").fetchall() + ok = 0 + for row in rows: + try: + items = parse_feed(fetch_feed(row["feed_url"])) + ok += 1 + print(f"OK {row['name']}: {len(items)} items") + except Exception as exc: + print(f"FAIL {row['name']}: {exc}") + print(f"--- {ok}/{len(rows)} feeds healthy ---") + + def run_cycle(conn: sqlite3.Connection, args: argparse.Namespace) -> None: """One end-to-end pass for a scheduler: poll due sources, classify the new arrivals, rebuild today's brief. Each step is independent and non-fatal so a