Sparse-day-proof briefs, feed health check, and 16 new sources
- Briefs now fill from a rolling window (prefer today, backfill up to window_days) and exclude anything featured in the last 7 days of briefs, so slow days still produce five items without stories lingering day to day.
- New 'check-feeds' command fetches and parses every feed to catch dead ones.
- Added 16 validated sources (science, environment, animals, culture), expanding coverage from 12 to 28 feeds to reduce staleness.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -118,3 +118,147 @@ pr_risk_score = 4
|
|||||||
poll_interval_minutes = 120
|
poll_interval_minutes = 120
|
||||||
notes = "University research stories; watch PR framing."
|
notes = "University research stories; watch PR framing."
|
||||||
|
|
||||||
|
|
||||||
|
[[sources]]
|
||||||
|
name = "Phys.org"
|
||||||
|
feed_url = "https://phys.org/rss-feed/"
|
||||||
|
default_category = "science"
|
||||||
|
trust_score = 7
|
||||||
|
pr_risk_score = 3
|
||||||
|
poll_interval_minutes = 120
|
||||||
|
notes = "Broad science aggregator."
|
||||||
|
|
||||||
|
[[sources]]
|
||||||
|
name = "Nature News"
|
||||||
|
feed_url = "https://www.nature.com/nature.rss"
|
||||||
|
default_category = "science"
|
||||||
|
trust_score = 9
|
||||||
|
pr_risk_score = 2
|
||||||
|
poll_interval_minutes = 180
|
||||||
|
notes = "Top-tier science journal news."
|
||||||
|
|
||||||
|
[[sources]]
|
||||||
|
name = "Ars Technica Science"
|
||||||
|
feed_url = "https://feeds.arstechnica.com/arstechnica/science"
|
||||||
|
default_category = "science"
|
||||||
|
trust_score = 7
|
||||||
|
pr_risk_score = 3
|
||||||
|
poll_interval_minutes = 120
|
||||||
|
notes = "Science/tech reporting."
|
||||||
|
|
||||||
|
[[sources]]
|
||||||
|
name = "The Guardian Science"
|
||||||
|
feed_url = "https://www.theguardian.com/science/rss"
|
||||||
|
default_category = "science"
|
||||||
|
trust_score = 8
|
||||||
|
pr_risk_score = 3
|
||||||
|
poll_interval_minutes = 120
|
||||||
|
notes = "Mainstream science; needs filtering."
|
||||||
|
|
||||||
|
[[sources]]
|
||||||
|
name = "The Guardian Environment"
|
||||||
|
feed_url = "https://www.theguardian.com/environment/rss"
|
||||||
|
default_category = "environment"
|
||||||
|
trust_score = 8
|
||||||
|
pr_risk_score = 3
|
||||||
|
poll_interval_minutes = 120
|
||||||
|
notes = "Environment; needs filtering."
|
||||||
|
|
||||||
|
[[sources]]
|
||||||
|
name = "ScienceDaily Plants & Animals"
|
||||||
|
feed_url = "https://www.sciencedaily.com/rss/plants_animals.xml"
|
||||||
|
default_category = "animals"
|
||||||
|
trust_score = 6
|
||||||
|
pr_risk_score = 3
|
||||||
|
poll_interval_minutes = 120
|
||||||
|
notes = "Animal/biology discoveries."
|
||||||
|
|
||||||
|
[[sources]]
|
||||||
|
name = "ScienceDaily Space & Time"
|
||||||
|
feed_url = "https://www.sciencedaily.com/rss/space_time.xml"
|
||||||
|
default_category = "science"
|
||||||
|
trust_score = 6
|
||||||
|
pr_risk_score = 3
|
||||||
|
poll_interval_minutes = 120
|
||||||
|
notes = "Space discoveries."
|
||||||
|
|
||||||
|
[[sources]]
|
||||||
|
name = "Smithsonian Magazine"
|
||||||
|
feed_url = "https://www.smithsonianmag.com/rss/latest_articles/"
|
||||||
|
default_category = "culture"
|
||||||
|
trust_score = 7
|
||||||
|
pr_risk_score = 3
|
||||||
|
poll_interval_minutes = 180
|
||||||
|
notes = "Culture, history, science human-interest."
|
||||||
|
|
||||||
|
[[sources]]
|
||||||
|
name = "Yale Environment 360"
|
||||||
|
feed_url = "https://e360.yale.edu/feed.xml"
|
||||||
|
default_category = "environment"
|
||||||
|
trust_score = 8
|
||||||
|
pr_risk_score = 2
|
||||||
|
poll_interval_minutes = 180
|
||||||
|
notes = "In-depth environment/solutions."
|
||||||
|
|
||||||
|
[[sources]]
|
||||||
|
name = "Anthropocene Magazine"
|
||||||
|
feed_url = "https://www.anthropocenemagazine.org/feed/"
|
||||||
|
default_category = "environment"
|
||||||
|
trust_score = 7
|
||||||
|
pr_risk_score = 3
|
||||||
|
poll_interval_minutes = 180
|
||||||
|
notes = "Sustainability solutions journalism."
|
||||||
|
|
||||||
|
[[sources]]
|
||||||
|
name = "The Conversation (US)"
|
||||||
|
feed_url = "https://theconversation.com/us/articles.atom"
|
||||||
|
default_category = "science"
|
||||||
|
trust_score = 7
|
||||||
|
pr_risk_score = 2
|
||||||
|
poll_interval_minutes = 120
|
||||||
|
notes = "Academic-authored explainers."
|
||||||
|
|
||||||
|
[[sources]]
|
||||||
|
name = "Colossal"
|
||||||
|
feed_url = "https://www.thisiscolossal.com/feed/"
|
||||||
|
default_category = "culture"
|
||||||
|
trust_score = 6
|
||||||
|
pr_risk_score = 3
|
||||||
|
poll_interval_minutes = 240
|
||||||
|
notes = "Art and visual culture."
|
||||||
|
|
||||||
|
[[sources]]
|
||||||
|
name = "Atlas Obscura"
|
||||||
|
feed_url = "https://www.atlasobscura.com/feeds/latest"
|
||||||
|
default_category = "culture"
|
||||||
|
trust_score = 6
|
||||||
|
pr_risk_score = 3
|
||||||
|
poll_interval_minutes = 240
|
||||||
|
notes = "Curiosities, places, culture."
|
||||||
|
|
||||||
|
[[sources]]
|
||||||
|
name = "New Scientist"
|
||||||
|
feed_url = "https://www.newscientist.com/feed/home/"
|
||||||
|
default_category = "science"
|
||||||
|
trust_score = 7
|
||||||
|
pr_risk_score = 3
|
||||||
|
poll_interval_minutes = 120
|
||||||
|
notes = "Science weekly."
|
||||||
|
|
||||||
|
[[sources]]
|
||||||
|
name = "ScienceAlert"
|
||||||
|
feed_url = "https://www.sciencealert.com/feed"
|
||||||
|
default_category = "science"
|
||||||
|
trust_score = 6
|
||||||
|
pr_risk_score = 4
|
||||||
|
poll_interval_minutes = 120
|
||||||
|
notes = "Pop science; watch hype."
|
||||||
|
|
||||||
|
[[sources]]
|
||||||
|
name = "Grist"
|
||||||
|
feed_url = "https://grist.org/feed/"
|
||||||
|
default_category = "environment"
|
||||||
|
trust_score = 6
|
||||||
|
pr_risk_score = 3
|
||||||
|
poll_interval_minutes = 180
|
||||||
|
notes = "Climate solutions reporting."
|
||||||
|
|||||||
+28
-5
@@ -9,6 +9,7 @@ def build_daily_brief(
|
|||||||
brief_date: str | None = None,
|
brief_date: str | None = None,
|
||||||
limit: int = 5,
|
limit: int = 5,
|
||||||
replace: bool = False,
|
replace: bool = False,
|
||||||
|
window_days: int = 3,
|
||||||
) -> int:
|
) -> int:
|
||||||
target_date = brief_date or date.today().isoformat()
|
target_date = brief_date or date.today().isoformat()
|
||||||
existing = conn.execute("SELECT id FROM daily_briefs WHERE brief_date = ?", (target_date,)).fetchone()
|
existing = conn.execute("SELECT id FROM daily_briefs WHERE brief_date = ?", (target_date,)).fetchone()
|
||||||
@@ -22,7 +23,7 @@ def build_daily_brief(
|
|||||||
(target_date, f"Five Good Things Today - {target_date}"),
|
(target_date, f"Five Good Things Today - {target_date}"),
|
||||||
).lastrowid
|
).lastrowid
|
||||||
|
|
||||||
rows = _candidate_articles(conn, target_date)
|
rows = _candidate_articles(conn, target_date, window_days)
|
||||||
selected = _select_diverse(rows, limit)
|
selected = _select_diverse(rows, limit)
|
||||||
for index, row in enumerate(selected, start=1):
|
for index, row in enumerate(selected, start=1):
|
||||||
conn.execute(
|
conn.execute(
|
||||||
@@ -78,7 +79,17 @@ def show_brief(conn: sqlite3.Connection, brief_date: str | None = None, limit: i
|
|||||||
).fetchall()
|
).fetchall()
|
||||||
|
|
||||||
|
|
||||||
def _candidate_articles(conn: sqlite3.Connection, target_date: str) -> list[sqlite3.Row]:
|
def _candidate_articles(
|
||||||
|
conn: sqlite3.Connection, target_date: str, window_days: int = 3
|
||||||
|
) -> list[sqlite3.Row]:
|
||||||
|
"""Brief candidates, sparse-day-proof.
|
||||||
|
|
||||||
|
Prefers articles dated on target_date, but widens to the preceding
|
||||||
|
`window_days` so the brief still fills on slow news days. Anything already
|
||||||
|
featured in a brief within the last 7 days (other than this same date, which
|
||||||
|
is being rebuilt) is excluded so backfilled stories cannot linger across
|
||||||
|
consecutive days.
|
||||||
|
"""
|
||||||
return conn.execute(
|
return conn.execute(
|
||||||
"""
|
"""
|
||||||
SELECT
|
SELECT
|
||||||
@@ -100,19 +111,31 @@ def _candidate_articles(conn: sqlite3.Connection, target_date: str) -> list[sqli
|
|||||||
s.pr_risk_score,
|
s.pr_risk_score,
|
||||||
s.reason_code,
|
s.reason_code,
|
||||||
s.reason_text,
|
s.reason_text,
|
||||||
s.model_name
|
s.model_name,
|
||||||
|
CASE WHEN date(COALESCE(a.published_at, a.discovered_at)) = date(?)
|
||||||
|
THEN 1 ELSE 0 END AS is_today
|
||||||
FROM articles a
|
FROM articles a
|
||||||
JOIN sources src ON src.id = a.source_id
|
JOIN sources src ON src.id = a.source_id
|
||||||
JOIN article_scores s ON s.article_id = a.id
|
JOIN article_scores s ON s.article_id = a.id
|
||||||
WHERE s.accepted = 1
|
WHERE s.accepted = 1
|
||||||
AND date(COALESCE(a.published_at, a.discovered_at)) = date(?)
|
AND date(COALESCE(a.published_at, a.discovered_at)) <= date(?)
|
||||||
|
AND date(COALESCE(a.published_at, a.discovered_at)) > date(?, '-' || ? || ' days')
|
||||||
|
AND a.id NOT IN (
|
||||||
|
SELECT bi.article_id
|
||||||
|
FROM daily_brief_items bi
|
||||||
|
JOIN daily_briefs b ON b.id = bi.brief_id
|
||||||
|
WHERE b.brief_date <> ?
|
||||||
|
AND b.brief_date <= date(?)
|
||||||
|
AND b.brief_date > date(?, '-7 days')
|
||||||
|
)
|
||||||
ORDER BY
|
ORDER BY
|
||||||
|
is_today DESC,
|
||||||
(s.constructive_score + s.agency_score + s.human_benefit_score + src.trust_score
|
(s.constructive_score + s.agency_score + s.human_benefit_score + src.trust_score
|
||||||
- s.cortisol_score - s.ragebait_score - s.pr_risk_score) DESC,
|
- s.cortisol_score - s.ragebait_score - s.pr_risk_score) DESC,
|
||||||
COALESCE(a.published_at, a.discovered_at) DESC
|
COALESCE(a.published_at, a.discovered_at) DESC
|
||||||
LIMIT 50
|
LIMIT 50
|
||||||
""",
|
""",
|
||||||
(target_date,),
|
(target_date, target_date, target_date, window_days, target_date, target_date, target_date),
|
||||||
).fetchall()
|
).fetchall()
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
+20
-1
@@ -8,7 +8,7 @@ from pathlib import Path
|
|||||||
|
|
||||||
from .briefs import build_daily_brief, show_brief
|
from .briefs import build_daily_brief, show_brief
|
||||||
from .db import connect, init_db
|
from .db import connect, init_db
|
||||||
from .feeds import poll_all_sources, poll_due_sources, poll_source
|
from .feeds import fetch_feed, parse_feed, poll_all_sources, poll_due_sources, poll_source
|
||||||
from .llm import LocalModelClient, classify_articles
|
from .llm import LocalModelClient, classify_articles
|
||||||
from .scoring import score_article
|
from .scoring import score_article
|
||||||
from .sources import load_sources, upsert_sources
|
from .sources import load_sources, upsert_sources
|
||||||
@@ -48,6 +48,9 @@ def main() -> None:
|
|||||||
|
|
||||||
subparsers.add_parser("source-report", help="Show source-level ingestion and scoring stats")
|
subparsers.add_parser("source-report", help="Show source-level ingestion and scoring stats")
|
||||||
|
|
||||||
|
check_feeds_parser = subparsers.add_parser("check-feeds", help="Fetch and parse each feed, reporting health")
|
||||||
|
check_feeds_parser.add_argument("--all", action="store_true", help="Include inactive sources")
|
||||||
|
|
||||||
runs_parser = subparsers.add_parser("list-runs", help="Show recent ingest runs")
|
runs_parser = subparsers.add_parser("list-runs", help="Show recent ingest runs")
|
||||||
runs_parser.add_argument("--limit", type=int, default=20)
|
runs_parser.add_argument("--limit", type=int, default=20)
|
||||||
|
|
||||||
@@ -122,6 +125,8 @@ def main() -> None:
|
|||||||
list_category(conn, topic=args.topic, flavor=args.flavor, limit=args.limit, accepted_only=not args.all)
|
list_category(conn, topic=args.topic, flavor=args.flavor, limit=args.limit, accepted_only=not args.all)
|
||||||
elif args.command == "source-report":
|
elif args.command == "source-report":
|
||||||
source_report(conn)
|
source_report(conn)
|
||||||
|
elif args.command == "check-feeds":
|
||||||
|
check_feeds(conn, include_inactive=args.all)
|
||||||
elif args.command == "list-runs":
|
elif args.command == "list-runs":
|
||||||
list_runs(conn, limit=args.limit)
|
list_runs(conn, limit=args.limit)
|
||||||
elif args.command == "rescore":
|
elif args.command == "rescore":
|
||||||
@@ -214,6 +219,20 @@ def list_recent(conn: sqlite3.Connection, limit: int, accepted_only: bool) -> No
|
|||||||
print(f" {row['canonical_url']}")
|
print(f" {row['canonical_url']}")
|
||||||
|
|
||||||
|
|
||||||
|
def check_feeds(conn: sqlite3.Connection, include_inactive: bool = False) -> None:
    """Fetch and parse each source's feed and print a per-source health line.

    Sources are read from the ``sources`` table ordered by name. Rows with
    ``active = 1`` are checked by default; pass ``include_inactive=True`` to
    check every row. A source is counted as healthy when fetching and parsing
    its feed completes without raising. One ``OK``/``FAIL`` line is printed
    per source, followed by a summary of healthy versus total feeds.
    """
    where = "WHERE active = 1"
    if include_inactive:
        where = ""
    sources = conn.execute(f"SELECT name, feed_url FROM sources {where} ORDER BY name").fetchall()

    healthy = 0
    for source in sources:
        try:
            entries = parse_feed(fetch_feed(source["feed_url"]))
            healthy += 1
            print(f"OK {source['name']}: {len(entries)} items")
        except Exception as err:
            # Report the failure and keep going so one dead feed does not
            # stop the remaining sources from being checked.
            print(f"FAIL {source['name']}: {err}")

    total = len(sources)
    print(f"--- {healthy}/{total} feeds healthy ---")
|
||||||
|
|
||||||
|
|
||||||
def run_cycle(conn: sqlite3.Connection, args: argparse.Namespace) -> None:
|
def run_cycle(conn: sqlite3.Connection, args: argparse.Namespace) -> None:
|
||||||
"""One end-to-end pass for a scheduler: poll due sources, classify the new
|
"""One end-to-end pass for a scheduler: poll due sources, classify the new
|
||||||
arrivals, rebuild today's brief. Each step is independent and non-fatal so a
|
arrivals, rebuild today's brief. Each step is independent and non-fatal so a
|
||||||
|
|||||||
Reference in New Issue
Block a user