Initial commit: goodNews constructive-news ingestion prototype
Local-first RSS/Atom ingestion pipeline with metadata-only storage, heuristic + local-LLM scoring, and daily brief builder. Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -0,0 +1,6 @@
|
||||
__pycache__/
|
||||
*.py[cod]
|
||||
.venv/
|
||||
data/*.sqlite3
|
||||
data/*.sqlite3-*
|
||||
|
||||
@@ -0,0 +1,79 @@
|
||||
# goodNews
|
||||
|
||||
Local-first constructive news ingestion prototype.
|
||||
|
||||
The first milestone is intentionally small: collect public RSS/Atom metadata, dedupe it, store short source-provided snippets, and attach early reason-coded heuristic scores. It does not store full article bodies.
|
||||
|
||||
## Commands
|
||||
|
||||
From this directory:
|
||||
|
||||
```bash
|
||||
python3 -m goodnews init-db
|
||||
python3 -m goodnews import-sources
|
||||
python3 -m goodnews poll --limit 3
|
||||
python3 -m goodnews rescore
|
||||
python3 -m goodnews check-llm --base-url http://127.0.0.1:1234/v1 --model gpt-oss
|
||||
python3 -m goodnews classify --limit 10 --base-url http://127.0.0.1:1234/v1 --model gpt-oss
|
||||
python3 -m goodnews build-brief --date 2026-05-27 --replace
|
||||
python3 -m goodnews show-brief
|
||||
python3 -m goodnews list-recent --limit 10
|
||||
python3 -m goodnews list-recent --accepted-only --limit 10
|
||||
python3 -m goodnews source-report
|
||||
python3 -m goodnews list-runs
|
||||
```
|
||||
|
||||
The SQLite database lives at:
|
||||
|
||||
```txt
|
||||
data/goodnews.sqlite3
|
||||
```
|
||||
|
||||
Sources live at:
|
||||
|
||||
```txt
|
||||
config/sources.toml
|
||||
```
|
||||
|
||||
## Stored Article Data
|
||||
|
||||
For each article, the database stores:
|
||||
|
||||
- source
|
||||
- canonical URL
|
||||
- title
|
||||
- short RSS/Atom description or summary
|
||||
- author, if present
|
||||
- published timestamp, if present
|
||||
- image URL, if present
|
||||
- language, if present
|
||||
- hashes used for dedupe
|
||||
- heuristic scores and reason codes
|
||||
|
||||
## Next Steps
|
||||
|
||||
1. Run the poller for a few days and inspect which sources produce useful candidates.
|
||||
2. Add source-level quality notes and deactivate noisy feeds.
|
||||
3. Evaluate the local model classifier (`classify`) against `heuristic-v0` and decide whether it should replace or supplement the heuristic scores.
|
||||
4. Tune the daily brief builder (`build-brief`), which selects 5 items using scores and source diversity.
|
||||
5. Add a small web/API layer once the ingest data looks trustworthy.
|
||||
|
||||
## Local Model Configuration
|
||||
|
||||
The `classify` command expects an OpenAI-compatible local chat-completions server.
|
||||
|
||||
You can pass settings directly:
|
||||
|
||||
```bash
|
||||
python3 -m goodnews classify --base-url http://127.0.0.1:1234/v1 --model gpt-oss --limit 10
|
||||
```
|
||||
|
||||
Or use environment variables:
|
||||
|
||||
```bash
|
||||
export GOODNEWS_LLM_BASE_URL=http://127.0.0.1:1234/v1
|
||||
export GOODNEWS_LLM_MODEL=gpt-oss
|
||||
python3 -m goodnews classify --limit 10
|
||||
```
|
||||
|
||||
`classify` rewrites the current score/reason row for selected candidates. `rescore` can restore the fast heuristic scores.
|
||||
@@ -0,0 +1,120 @@
|
||||
[[sources]]
|
||||
name = "Good News Network"
|
||||
homepage_url = "https://www.goodnewsnetwork.org/"
|
||||
feed_url = "https://www.goodnewsnetwork.org/feed/"
|
||||
default_category = "constructive"
|
||||
trust_score = 6
|
||||
pr_risk_score = 3
|
||||
poll_interval_minutes = 120
|
||||
notes = "Explicit good-news source; useful for early calibration."
|
||||
|
||||
[[sources]]
|
||||
name = "Positive News"
|
||||
homepage_url = "https://www.positive.news/"
|
||||
feed_url = "https://www.positive.news/feed/"
|
||||
default_category = "constructive"
|
||||
trust_score = 7
|
||||
pr_risk_score = 3
|
||||
poll_interval_minutes = 180
|
||||
notes = "Constructive journalism source."
|
||||
|
||||
[[sources]]
|
||||
name = "Reasons to be Cheerful"
|
||||
homepage_url = "https://reasonstobecheerful.world/"
|
||||
feed_url = "https://reasonstobecheerful.world/feed/"
|
||||
default_category = "constructive"
|
||||
trust_score = 7
|
||||
pr_risk_score = 3
|
||||
poll_interval_minutes = 180
|
||||
notes = "Solutions-oriented reporting."
|
||||
|
||||
[[sources]]
|
||||
name = "Happy Eco News"
|
||||
homepage_url = "https://happyeconews.com/"
|
||||
feed_url = "https://happyeconews.com/feed/"
|
||||
default_category = "environment"
|
||||
trust_score = 5
|
||||
pr_risk_score = 4
|
||||
poll_interval_minutes = 180
|
||||
notes = "Environmental good-news candidate source."
|
||||
|
||||
[[sources]]
|
||||
name = "Mongabay"
|
||||
homepage_url = "https://news.mongabay.com/"
|
||||
feed_url = "https://news.mongabay.com/feed/"
|
||||
default_category = "environment"
|
||||
trust_score = 8
|
||||
pr_risk_score = 2
|
||||
poll_interval_minutes = 120
|
||||
notes = "Environmental reporting; not always low-cortisol, but often constructive."
|
||||
|
||||
[[sources]]
|
||||
name = "ScienceDaily Top Science"
|
||||
homepage_url = "https://www.sciencedaily.com/"
|
||||
feed_url = "https://www.sciencedaily.com/rss/top/science.xml"
|
||||
default_category = "science"
|
||||
trust_score = 6
|
||||
pr_risk_score = 3
|
||||
poll_interval_minutes = 120
|
||||
notes = "Science discovery feed."
|
||||
|
||||
[[sources]]
|
||||
name = "ScienceDaily Health"
|
||||
homepage_url = "https://www.sciencedaily.com/news/health_medicine/"
|
||||
feed_url = "https://www.sciencedaily.com/rss/health_medicine.xml"
|
||||
default_category = "health-progress"
|
||||
trust_score = 6
|
||||
pr_risk_score = 3
|
||||
poll_interval_minutes = 120
|
||||
notes = "Health and medicine research feed."
|
||||
|
||||
[[sources]]
|
||||
name = "ScienceDaily Environment"
|
||||
homepage_url = "https://www.sciencedaily.com/news/earth_climate/environmental_science/"
|
||||
feed_url = "https://www.sciencedaily.com/rss/earth_climate/environmental_science.xml"
|
||||
default_category = "environment"
|
||||
trust_score = 6
|
||||
pr_risk_score = 3
|
||||
poll_interval_minutes = 120
|
||||
notes = "Environment research feed."
|
||||
|
||||
[[sources]]
|
||||
name = "NPR Science"
|
||||
homepage_url = "https://www.npr.org/sections/science/"
|
||||
feed_url = "https://feeds.npr.org/1007/rss.xml"
|
||||
default_category = "science"
|
||||
trust_score = 8
|
||||
pr_risk_score = 2
|
||||
poll_interval_minutes = 90
|
||||
notes = "Mainstream science feed; mix of constructive and general coverage."
|
||||
|
||||
[[sources]]
|
||||
name = "NPR Health"
|
||||
homepage_url = "https://www.npr.org/sections/health/"
|
||||
feed_url = "https://feeds.npr.org/1128/rss.xml"
|
||||
default_category = "health-progress"
|
||||
trust_score = 8
|
||||
pr_risk_score = 2
|
||||
poll_interval_minutes = 90
|
||||
notes = "Health feed; needs cortisol filtering."
|
||||
|
||||
[[sources]]
|
||||
name = "BBC Science and Environment"
|
||||
homepage_url = "https://www.bbc.com/news/science_and_environment"
|
||||
feed_url = "https://feeds.bbci.co.uk/news/science_and_environment/rss.xml"
|
||||
default_category = "science"
|
||||
trust_score = 8
|
||||
pr_risk_score = 2
|
||||
poll_interval_minutes = 90
|
||||
notes = "Broad science/environment feed; needs filtering."
|
||||
|
||||
[[sources]]
|
||||
name = "Futurity"
|
||||
homepage_url = "https://www.futurity.org/"
|
||||
feed_url = "https://www.futurity.org/feed/"
|
||||
default_category = "science"
|
||||
trust_score = 6
|
||||
pr_risk_score = 4
|
||||
poll_interval_minutes = 120
|
||||
notes = "University research stories; watch PR framing."
|
||||
|
||||
@@ -0,0 +1,2 @@
|
||||
# Package version string.
__version__ = "0.1.0"
|
||||
|
||||
@@ -0,0 +1,6 @@
|
||||
from .cli import main
|
||||
|
||||
|
||||
# Entry point for `python -m goodnews`: hand control to the CLI.
if __name__ == "__main__":
    main()
|
||||
|
||||
@@ -0,0 +1,167 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import sqlite3
|
||||
from datetime import date
|
||||
|
||||
|
||||
def build_daily_brief(
    conn: sqlite3.Connection,
    brief_date: str | None = None,
    limit: int = 5,
    replace: bool = False,
) -> int:
    """Create (or reuse) the stored brief for one day and return its id.

    Args:
        conn: Open database connection whose rows support access by column
            name (``existing["id"]`` is read below).
        brief_date: Day to build, as an ISO ``YYYY-MM-DD`` string. Defaults
            to today's local date.
        limit: Maximum number of articles placed in the brief.
        replace: When a brief already exists for the day, delete and rebuild
            it instead of returning the existing one.

    Returns:
        The ``daily_briefs.id`` of the existing or newly built brief.
    """
    target_date = brief_date or date.today().isoformat()
    existing = conn.execute(
        "SELECT id FROM daily_briefs WHERE brief_date = ?",
        (target_date,),
    ).fetchone()
    if existing and not replace:
        # A brief is frozen once built unless the caller asks to replace it.
        return int(existing["id"])

    # Run delete + rebuild as one unit: the connection context manager
    # commits on success and rolls back if any statement raises, so a failed
    # rebuild never leaves a deleted or half-populated brief pending.
    with conn:
        if existing:
            # NOTE(review): the old items are expected to disappear through
            # the ON DELETE CASCADE on daily_brief_items, which needs
            # foreign-key enforcement to be enabled on this connection.
            conn.execute("DELETE FROM daily_briefs WHERE id = ?", (existing["id"],))

        brief_id = conn.execute(
            "INSERT INTO daily_briefs (brief_date, title) VALUES (?, ?)",
            (target_date, f"Five Good Things Today - {target_date}"),
        ).lastrowid

        selected = _select_diverse(_candidate_articles(conn, target_date), limit)
        conn.executemany(
            """
            INSERT INTO daily_brief_items (brief_id, article_id, rank, selection_reason)
            VALUES (?, ?, ?, ?)
            """,
            [
                (brief_id, row["id"], rank, _selection_reason(row))
                for rank, row in enumerate(selected, start=1)
            ],
        )
    return int(brief_id)
|
||||
|
||||
|
||||
def show_brief(conn: sqlite3.Connection, brief_date: str | None = None, limit: int = 10) -> list[sqlite3.Row]:
    """Return the ranked items of a stored brief, joined with article data.

    Args:
        conn: Open database connection.
        brief_date: Day to show (``YYYY-MM-DD``); when omitted the most
            recent stored brief is used.
        limit: Maximum number of items returned.

    Returns:
        Rows ordered by rank, or an empty list when no brief date can be
        resolved. Score columns are ``None`` for articles with no score row
        (the scores table is LEFT JOINed).
    """
    target_date = brief_date or _latest_brief_date(conn)
    if not target_date:
        return []

    query = """
        SELECT
            b.brief_date,
            bi.rank,
            bi.selection_reason,
            a.title,
            a.description,
            a.canonical_url,
            a.published_at,
            src.name AS source_name,
            src.default_category,
            s.constructive_score,
            s.cortisol_score,
            s.ragebait_score,
            s.agency_score,
            s.human_benefit_score,
            s.reason_code,
            s.reason_text,
            s.model_name
        FROM daily_briefs b
        JOIN daily_brief_items bi ON bi.brief_id = b.id
        JOIN articles a ON a.id = bi.article_id
        JOIN sources src ON src.id = a.source_id
        LEFT JOIN article_scores s ON s.article_id = a.id
        WHERE b.brief_date = ?
        ORDER BY bi.rank
        LIMIT ?
    """
    cursor = conn.execute(query, (target_date, limit))
    return cursor.fetchall()
|
||||
|
||||
|
||||
def _candidate_articles(conn: sqlite3.Connection, target_date: str) -> list[sqlite3.Row]:
    """Fetch up to 50 accepted articles dated on ``target_date``, best first.

    An article's date is its ``published_at`` value, falling back to
    ``discovered_at`` when no publication time was stored. Rows are ordered
    by a composite score (constructive + agency + human benefit + source
    trust, minus cortisol, ragebait and PR risk) and then by recency.
    """
    sql = """
        SELECT
            a.id,
            a.title,
            a.description,
            a.canonical_url,
            a.published_at,
            a.discovered_at,
            src.name AS source_name,
            src.default_category,
            src.trust_score,
            s.constructive_score,
            s.cortisol_score,
            s.ragebait_score,
            s.agency_score,
            s.human_benefit_score,
            s.novelty_score,
            s.pr_risk_score,
            s.reason_code,
            s.reason_text,
            s.model_name
        FROM articles a
        JOIN sources src ON src.id = a.source_id
        JOIN article_scores s ON s.article_id = a.id
        WHERE s.accepted = 1
          AND date(COALESCE(a.published_at, a.discovered_at)) = date(?)
        ORDER BY
            (s.constructive_score + s.agency_score + s.human_benefit_score + src.trust_score
             - s.cortisol_score - s.ragebait_score - s.pr_risk_score) DESC,
            COALESCE(a.published_at, a.discovered_at) DESC
        LIMIT 50
    """
    params = (target_date,)
    return conn.execute(sql, params).fetchall()
|
||||
|
||||
|
||||
def _select_diverse(rows: list[sqlite3.Row], limit: int) -> list[sqlite3.Row]:
    """Pick up to ``limit`` rows, preferring distinct sources and categories.

    ``rows`` is expected to be ordered best-first. Each row must expose
    ``"id"``, ``"source_name"`` and ``"default_category"`` by key.

    Selection happens in three steps:

    1. Walk the rows in order, taking at most one row per source. Source
       de-duplication only applies when there are more candidates than
       slots.
    2. If slots remain, backfill with the best rows not yet chosen
       (appended after the first-pass picks).
    3. If the result has two or more items that all share one category and
       spare candidates exist, swap the last pick for the best unchosen row
       from a different category.

    Args:
        rows: Candidate rows, best first.
        limit: Maximum number of rows to return; values ``<= 0`` yield an
            empty list.

    Returns:
        The selected rows (possibly fewer than ``limit``).
    """
    if limit <= 0 or not rows:
        # Guard: previously a non-positive limit reached `selected[-1]` on
        # an empty list and raised IndexError.
        return []

    surplus = len(rows) > limit
    selected: list[sqlite3.Row] = []
    seen_sources = set()

    # Step 1: one article per source while there are candidates to spare.
    for row in rows:
        if len(selected) >= limit:
            break
        source = row["source_name"]
        if surplus and source in seen_sources:
            continue
        selected.append(row)
        seen_sources.add(source)

    # Step 2: backfill with the best remaining rows regardless of source.
    if len(selected) < limit:
        chosen_ids = {row["id"] for row in selected}
        for row in rows:
            if len(selected) >= limit:
                break
            if row["id"] in chosen_ids:
                continue
            selected.append(row)
            chosen_ids.add(row["id"])

    # Step 3: category diversity. Categories are computed from the final
    # selection (including backfilled rows), and the swap is skipped for a
    # single-item brief so the top-ranked article is never displaced.
    categories = {row["default_category"] for row in selected}
    if surplus and len(selected) >= 2 and len(categories) < 2:
        chosen_ids = {row["id"] for row in selected}
        for row in rows:
            if row["id"] in chosen_ids:
                continue
            if row["default_category"] not in categories:
                selected[-1] = row
                break

    return selected
|
||||
|
||||
|
||||
def _selection_reason(row: sqlite3.Row) -> str:
    """Summarise why an article was picked: reason code plus key scores."""
    details = ", ".join(
        [
            f"constructive={row['constructive_score']}",
            f"agency={row['agency_score']}",
            f"human_benefit={row['human_benefit_score']}",
            f"cortisol={row['cortisol_score']}",
            f"source={row['source_name']}",
        ]
    )
    return f"{row['reason_code']}; {details}"
|
||||
|
||||
|
||||
def _latest_brief_date(conn: sqlite3.Connection) -> str | None:
    """Return the greatest stored ``brief_date``, or ``None`` if no brief exists."""
    newest = conn.execute(
        "SELECT brief_date FROM daily_briefs ORDER BY brief_date DESC LIMIT 1"
    ).fetchone()
    if not newest:
        return None
    return newest["brief_date"]
|
||||
+352
@@ -0,0 +1,352 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
import sqlite3
|
||||
from pathlib import Path
|
||||
|
||||
from .briefs import build_daily_brief, show_brief
|
||||
from .db import connect, init_db
|
||||
from .feeds import poll_all_sources, poll_source
|
||||
from .llm import LocalModelClient, classify_articles
|
||||
from .scoring import score_article
|
||||
from .sources import load_sources, upsert_sources
|
||||
|
||||
|
||||
# Directory one level above this package (two levels above this file).
ROOT = Path(__file__).resolve().parents[1]
# Default SQLite database path; overridable with --db.
DEFAULT_DB = ROOT / "data" / "goodnews.sqlite3"
# Default source list; overridable with `import-sources --sources`.
DEFAULT_SOURCES = ROOT / "config" / "sources.toml"
|
||||
|
||||
|
||||
def _build_parser() -> argparse.ArgumentParser:
    """Build the ``goodnews`` argument parser with one sub-command per action."""
    parser = argparse.ArgumentParser(prog="goodnews")
    parser.add_argument("--db", type=Path, default=DEFAULT_DB, help="SQLite database path")
    subparsers = parser.add_subparsers(dest="command", required=True)

    subparsers.add_parser("init-db", help="Create or update the SQLite schema")

    import_parser = subparsers.add_parser("import-sources", help="Load sources from TOML")
    import_parser.add_argument("--sources", type=Path, default=DEFAULT_SOURCES)

    poll_parser = subparsers.add_parser("poll", help="Poll active RSS/Atom sources")
    poll_parser.add_argument("--source", help="Poll one source by exact name")
    poll_parser.add_argument("--limit", type=int, help="Poll only the first N active sources")

    list_parser = subparsers.add_parser("list-recent", help="Show recently discovered articles")
    list_parser.add_argument("--limit", type=int, default=20)
    list_parser.add_argument("--accepted-only", action="store_true")

    source_parser = subparsers.add_parser("list-sources", help="Show configured sources")
    source_parser.add_argument("--active-only", action="store_true")

    subparsers.add_parser("source-report", help="Show source-level ingestion and scoring stats")

    runs_parser = subparsers.add_parser("list-runs", help="Show recent ingest runs")
    runs_parser.add_argument("--limit", type=int, default=20)

    subparsers.add_parser("rescore", help="Re-run heuristic scores for stored articles")

    classify_parser = subparsers.add_parser("classify", help="Classify candidates with a local LLM")
    classify_parser.add_argument("--limit", type=int, default=10)
    classify_parser.add_argument("--include-rejected", action="store_true")
    classify_parser.add_argument("--dry-run", action="store_true")
    classify_parser.add_argument("--base-url", help="OpenAI-compatible base URL, e.g. http://127.0.0.1:1234/v1")
    classify_parser.add_argument("--model", help="Local model name")

    check_llm_parser = subparsers.add_parser("check-llm", help="Check local OpenAI-compatible model endpoint")
    check_llm_parser.add_argument("--base-url", help="OpenAI-compatible base URL, e.g. http://127.0.0.1:1234/v1")
    check_llm_parser.add_argument("--model", help="Expected local model name")

    brief_parser = subparsers.add_parser("build-brief", help="Build/freeze a daily brief")
    brief_parser.add_argument("--date", help="Brief date in YYYY-MM-DD format; defaults to today")
    brief_parser.add_argument("--limit", type=int, default=5)
    brief_parser.add_argument("--replace", action="store_true")

    show_brief_parser = subparsers.add_parser("show-brief", help="Show a stored daily brief")
    show_brief_parser.add_argument("--date", help="Brief date in YYYY-MM-DD format; defaults to latest brief")
    show_brief_parser.add_argument("--limit", type=int, default=10)

    return parser


def _check_llm(args: argparse.Namespace) -> None:
    """List the models served by the configured endpoint, marking the selected one."""
    client = llm_client_from_args(args)
    try:
        models = client.list_models()
    except RuntimeError as exc:
        # Exit with the error text as the message; keep the cause chained.
        raise SystemExit(str(exc)) from exc
    print(f"Connected to {client.base_url}")
    if not models:
        print("Endpoint responded, but no models were listed.")
        return
    print("Models:")
    for model in models:
        marker = " *" if model == client.model else ""
        print(f" {model}{marker}")


def main() -> None:
    """Parse the command line and run the selected ``goodnews`` sub-command.

    ``check-llm`` runs without touching the database. Every other command
    opens the database, ensures the schema exists (so read-only commands
    work on a fresh database instead of failing with "no such table"), and
    closes the connection when the command finishes or fails.

    Raises:
        SystemExit: On argument errors, an unknown ``poll --source`` name,
            or a failed ``check-llm`` request.
    """
    args = _build_parser().parse_args()

    if args.command == "check-llm":
        _check_llm(args)
        return

    conn = connect(args.db)
    try:
        # The schema script only uses CREATE ... IF NOT EXISTS, so running it
        # before every command is safe.
        init_db(conn)

        if args.command == "init-db":
            print(f"Initialized {args.db}")
        elif args.command == "import-sources":
            sources = load_sources(args.sources)
            count = upsert_sources(conn, sources)
            print(f"Imported {count} sources from {args.sources}")
        elif args.command == "poll":
            if args.source:
                source = conn.execute("SELECT * FROM sources WHERE name = ?", (args.source,)).fetchone()
                if not source:
                    raise SystemExit(f"No source named {args.source!r}")
                result = poll_source(conn, source)
            else:
                result = poll_all_sources(conn, limit=args.limit)
            print(_format_result(result))
        elif args.command == "list-recent":
            list_recent(conn, limit=args.limit, accepted_only=args.accepted_only)
        elif args.command == "list-sources":
            list_sources(conn, active_only=args.active_only)
        elif args.command == "source-report":
            source_report(conn)
        elif args.command == "list-runs":
            list_runs(conn, limit=args.limit)
        elif args.command == "rescore":
            count = rescore_articles(conn)
            print(f"Rescored {count} articles")
        elif args.command == "classify":
            client = llm_client_from_args(args)
            results = classify_articles(
                conn,
                client,
                limit=args.limit,
                include_rejected=args.include_rejected,
                dry_run=args.dry_run,
            )
            for article_id, scores in results:
                accepted = "yes" if scores["accepted"] else "no"
                print(f"[{article_id}] accepted={accepted} reason={scores['reason_code']}")
                print(f" {scores['reason_text']}")
            if args.dry_run:
                print("Dry run only; database was not updated.")
        elif args.command == "build-brief":
            brief_id = build_daily_brief(
                conn,
                brief_date=args.date,
                limit=args.limit,
                replace=args.replace,
            )
            print(f"Built brief {brief_id}")
            print_brief(show_brief(conn, brief_date=args.date, limit=args.limit))
        elif args.command == "show-brief":
            print_brief(show_brief(conn, brief_date=args.date, limit=args.limit))
    finally:
        conn.close()
|
||||
|
||||
|
||||
def list_recent(conn: sqlite3.Connection, limit: int, accepted_only: bool) -> None:
    """Print the newest articles with their headline scores.

    Args:
        conn: Open database connection.
        limit: Maximum number of articles printed.
        accepted_only: Restrict the listing to articles whose score row has
            ``accepted = 1``.
    """
    # The filter is one of two fixed strings, never user input.
    where = ""
    if accepted_only:
        where = "WHERE s.accepted = 1"

    query = f"""
        SELECT
            a.id,
            a.published_at,
            src.name AS source_name,
            a.title,
            a.canonical_url,
            s.accepted,
            s.constructive_score,
            s.cortisol_score,
            s.ragebait_score,
            s.reason_code
        FROM articles a
        JOIN sources src ON src.id = a.source_id
        LEFT JOIN article_scores s ON s.article_id = a.id
        {where}
        ORDER BY COALESCE(a.published_at, a.discovered_at) DESC
        LIMIT ?
    """
    for article in conn.execute(query, (limit,)).fetchall():
        # Unscored articles have accepted = NULL and are shown as "no".
        verdict = "yes" if article["accepted"] else "no"
        when = article["published_at"] or "no date"
        print(f"[{article['id']}] {when} | {article['source_name']} | accepted={verdict}")
        print(f" {article['title']}")
        score_line = (
            " scores: "
            f"constructive={article['constructive_score']} "
            f"cortisol={article['cortisol_score']} "
            f"ragebait={article['ragebait_score']} "
            f"reason={article['reason_code']}"
        )
        print(score_line)
        print(f" {article['canonical_url']}")
|
||||
|
||||
|
||||
def llm_client_from_args(args: argparse.Namespace) -> LocalModelClient:
    """Create a client from the environment, then apply CLI overrides.

    ``--base-url`` (with trailing slashes removed) and ``--model`` replace
    the environment-derived values only when they were given and non-empty.
    """
    client = LocalModelClient.from_env()

    base_url = getattr(args, "base_url", None)
    if base_url:
        client.base_url = base_url.rstrip("/")

    model = getattr(args, "model", None)
    if model:
        client.model = model

    return client
|
||||
|
||||
|
||||
def list_sources(conn: sqlite3.Connection, active_only: bool) -> None:
    """Print configured sources in name order, optionally only active ones."""
    # The filter is one of two fixed strings, never user input.
    where = ""
    if active_only:
        where = "WHERE active = 1"

    query = f"""
        SELECT id, name, active, default_category, trust_score, pr_risk_score, feed_url
        FROM sources
        {where}
        ORDER BY name
    """
    for src in conn.execute(query).fetchall():
        state = "inactive"
        if src["active"]:
            state = "active"
        summary = (
            f"[{src['id']}] {src['name']} ({state}, {src['default_category']}, "
            f"trust={src['trust_score']}, pr={src['pr_risk_score']})"
        )
        print(summary)
        print(f" {src['feed_url']}")
|
||||
|
||||
|
||||
def source_report(conn: sqlite3.Connection) -> None:
    """Print per-source article counts, acceptance rate and average scores.

    Sources with no articles are still listed (the joins are LEFT JOINs);
    their averages print as ``None`` and their newest article as ``none``.
    """
    query = """
        SELECT
            src.name,
            src.default_category,
            src.trust_score,
            src.pr_risk_score AS source_pr_risk,
            COUNT(a.id) AS articles,
            SUM(CASE WHEN s.accepted = 1 THEN 1 ELSE 0 END) AS accepted,
            ROUND(AVG(s.constructive_score), 1) AS avg_constructive,
            ROUND(AVG(s.cortisol_score), 1) AS avg_cortisol,
            ROUND(AVG(s.ragebait_score), 1) AS avg_ragebait,
            MAX(a.published_at) AS newest_article
        FROM sources src
        LEFT JOIN articles a ON a.source_id = src.id
        LEFT JOIN article_scores s ON s.article_id = a.id
        GROUP BY src.id
        ORDER BY accepted DESC, articles DESC, src.name
    """
    for stats in conn.execute(query).fetchall():
        total = stats["articles"] or 0
        kept = stats["accepted"] or 0
        # Avoid dividing by zero for sources that have no articles yet.
        if total:
            rate = kept / total * 100
        else:
            rate = 0
        print(
            f"{stats['name']} | {stats['default_category']} | "
            f"articles={total} accepted={kept} ({rate:.1f}%)"
        )
        print(
            f" trust={stats['trust_score']} pr={stats['source_pr_risk']} "
            f"avg_constructive={stats['avg_constructive']} "
            f"avg_cortisol={stats['avg_cortisol']} "
            f"avg_ragebait={stats['avg_ragebait']}"
        )
        newest = stats["newest_article"] or "none"
        print(f" newest={newest}")
|
||||
|
||||
|
||||
def list_runs(conn: sqlite3.Connection, limit: int) -> None:
    """Print the most recent ingest runs (highest id first) with their counters.

    Runs without a matching source row are labelled ``unknown``; a run's
    error text, when present, is printed on a following line.
    """
    query = """
        SELECT r.id, r.started_at, r.finished_at, r.status, src.name AS source_name,
               r.items_seen, r.items_inserted, r.items_duplicate, r.error
        FROM ingest_runs r
        LEFT JOIN sources src ON src.id = r.source_id
        ORDER BY r.id DESC
        LIMIT ?
    """
    for run in conn.execute(query, (limit,)).fetchall():
        source_label = run["source_name"] or "unknown"
        counters = (
            f"seen={run['items_seen']} inserted={run['items_inserted']} "
            f"duplicate={run['items_duplicate']}"
        )
        print(f"[{run['id']}] {run['status']} | {source_label} | {counters}")
        if not run["error"]:
            continue
        print(f" error: {run['error']}")
|
||||
|
||||
|
||||
def rescore_articles(conn: sqlite3.Connection) -> int:
    """Recompute heuristic scores for every stored article.

    Each article is scored from its title, description and its source's PR
    risk score; the result is upserted into ``article_scores`` (one row per
    article) and all changes are committed together at the end.

    Returns:
        The number of articles processed.
    """
    articles = conn.execute(
        """
        SELECT a.id, a.title, a.description, src.pr_risk_score
        FROM articles a
        JOIN sources src ON src.id = a.source_id
        ORDER BY a.id
        """
    ).fetchall()

    upsert_sql = """
        INSERT INTO article_scores (
            article_id, constructive_score, cortisol_score, ragebait_score,
            agency_score, human_benefit_score, novelty_score, pr_risk_score,
            accepted, reason_code, reason_text, model_name, scored_at
        )
        VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, CURRENT_TIMESTAMP)
        ON CONFLICT(article_id) DO UPDATE SET
            constructive_score = excluded.constructive_score,
            cortisol_score = excluded.cortisol_score,
            ragebait_score = excluded.ragebait_score,
            agency_score = excluded.agency_score,
            human_benefit_score = excluded.human_benefit_score,
            novelty_score = excluded.novelty_score,
            pr_risk_score = excluded.pr_risk_score,
            accepted = excluded.accepted,
            reason_code = excluded.reason_code,
            reason_text = excluded.reason_text,
            model_name = excluded.model_name,
            scored_at = CURRENT_TIMESTAMP
    """
    # Column order of the score values bound after article_id.
    score_fields = (
        "constructive_score",
        "cortisol_score",
        "ragebait_score",
        "agency_score",
        "human_benefit_score",
        "novelty_score",
        "pr_risk_score",
        "accepted",
        "reason_code",
        "reason_text",
        "model_name",
    )

    for article in articles:
        scores = score_article(
            article["title"],
            article["description"],
            int(article["pr_risk_score"]),
        )
        values = (article["id"], *(scores[field] for field in score_fields))
        conn.execute(upsert_sql, values)

    conn.commit()
    return len(articles)
|
||||
|
||||
|
||||
def print_brief(rows: list[sqlite3.Row]) -> None:
    """Print a brief: a dated heading, then four lines per ranked item.

    Prints a placeholder message when ``rows`` is empty. The heading date is
    taken from the first row.
    """
    if len(rows) == 0:
        print("No brief items found.")
        return

    brief_date = rows[0]["brief_date"]
    print(f"Five Good Things Today - {brief_date}")
    for item in rows:
        byline = f"{item['source_name']} | {item['default_category']} | {item['model_name']}"
        print(f"{item['rank']}. {item['title']}")
        print(f" {byline}")
        print(f" reason: {item['reason_code']}")
        print(f" {item['canonical_url']}")
|
||||
|
||||
|
||||
def _format_result(result: dict) -> str:
    """Render a poll result dict as a one-line summary.

    A dict with a ``"sources"`` key is an aggregate over several sources;
    otherwise it is a single-source result whose ``"status"`` decides
    between the failure and success wording.
    """
    counts = (
        f"seen={result['seen']} inserted={result['inserted']} "
        f"duplicate={result['duplicate']}"
    )
    if "sources" in result:
        return f"Polled {result['sources']} sources: {counts} failed={result['failed']}"
    if result.get("status") == "failed":
        return f"Poll failed: {counts} error={result['error']}"
    return f"Poll ok: {counts}"
|
||||
|
||||
|
||||
# Allow running this module directly as a script.
if __name__ == "__main__":
    main()
|
||||
+105
@@ -0,0 +1,105 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import sqlite3
|
||||
from pathlib import Path
|
||||
|
||||
|
||||
# Full database schema. Every statement is idempotent (IF NOT EXISTS), so the
# script can be executed against an existing database.
#
# articles.source_id carries a single foreign key with ON DELETE CASCADE; an
# earlier duplicate table-level FOREIGN KEY on the same column (with no
# ON DELETE action) was redundant and has been dropped. Databases created
# before this change keep their original table definition.
SCHEMA = """
PRAGMA foreign_keys = ON;

CREATE TABLE IF NOT EXISTS sources (
    id INTEGER PRIMARY KEY AUTOINCREMENT,
    name TEXT NOT NULL UNIQUE,
    homepage_url TEXT,
    feed_url TEXT NOT NULL UNIQUE,
    source_type TEXT NOT NULL DEFAULT 'rss',
    default_category TEXT,
    trust_score INTEGER NOT NULL DEFAULT 5,
    pr_risk_score INTEGER NOT NULL DEFAULT 3,
    active INTEGER NOT NULL DEFAULT 1,
    poll_interval_minutes INTEGER NOT NULL DEFAULT 60,
    notes TEXT,
    created_at TEXT NOT NULL DEFAULT CURRENT_TIMESTAMP,
    updated_at TEXT NOT NULL DEFAULT CURRENT_TIMESTAMP
);

CREATE TABLE IF NOT EXISTS articles (
    id INTEGER PRIMARY KEY AUTOINCREMENT,
    source_id INTEGER NOT NULL REFERENCES sources(id) ON DELETE CASCADE,
    canonical_url TEXT NOT NULL,
    title TEXT NOT NULL,
    description TEXT,
    author TEXT,
    published_at TEXT,
    discovered_at TEXT NOT NULL DEFAULT CURRENT_TIMESTAMP,
    image_url TEXT,
    language TEXT,
    raw_guid TEXT,
    url_hash TEXT NOT NULL UNIQUE,
    title_hash TEXT
);

CREATE INDEX IF NOT EXISTS idx_articles_published_at ON articles(published_at);
CREATE INDEX IF NOT EXISTS idx_articles_source_id ON articles(source_id);
CREATE INDEX IF NOT EXISTS idx_articles_title_hash ON articles(title_hash);

CREATE TABLE IF NOT EXISTS article_scores (
    article_id INTEGER PRIMARY KEY REFERENCES articles(id) ON DELETE CASCADE,
    constructive_score INTEGER,
    cortisol_score INTEGER,
    ragebait_score INTEGER,
    agency_score INTEGER,
    human_benefit_score INTEGER,
    novelty_score INTEGER,
    pr_risk_score INTEGER,
    accepted INTEGER,
    reason_code TEXT,
    reason_text TEXT,
    model_name TEXT,
    scored_at TEXT NOT NULL DEFAULT CURRENT_TIMESTAMP
);

CREATE TABLE IF NOT EXISTS ingest_runs (
    id INTEGER PRIMARY KEY AUTOINCREMENT,
    source_id INTEGER REFERENCES sources(id) ON DELETE SET NULL,
    started_at TEXT NOT NULL DEFAULT CURRENT_TIMESTAMP,
    finished_at TEXT,
    status TEXT NOT NULL DEFAULT 'running',
    items_seen INTEGER NOT NULL DEFAULT 0,
    items_inserted INTEGER NOT NULL DEFAULT 0,
    items_duplicate INTEGER NOT NULL DEFAULT 0,
    error TEXT
);

CREATE TABLE IF NOT EXISTS daily_briefs (
    id INTEGER PRIMARY KEY AUTOINCREMENT,
    brief_date TEXT NOT NULL UNIQUE,
    title TEXT NOT NULL,
    created_at TEXT NOT NULL DEFAULT CURRENT_TIMESTAMP,
    notes TEXT
);

CREATE TABLE IF NOT EXISTS daily_brief_items (
    brief_id INTEGER NOT NULL REFERENCES daily_briefs(id) ON DELETE CASCADE,
    article_id INTEGER NOT NULL REFERENCES articles(id) ON DELETE CASCADE,
    rank INTEGER NOT NULL,
    selection_reason TEXT,
    PRIMARY KEY (brief_id, article_id),
    UNIQUE (brief_id, rank)
);
"""
|
||||
|
||||
|
||||
def connect(db_path: Path | str) -> sqlite3.Connection:
    """Open the SQLite database at ``db_path``, creating parent directories.

    The returned connection yields ``sqlite3.Row`` rows (access by column
    name) and has foreign-key enforcement switched on.
    """
    database_file = Path(db_path)
    database_file.parent.mkdir(parents=True, exist_ok=True)

    connection = sqlite3.connect(database_file)
    connection.row_factory = sqlite3.Row
    # Foreign keys are a per-connection setting in SQLite, so enable them here.
    connection.execute("PRAGMA foreign_keys = ON")
    return connection
|
||||
|
||||
|
||||
def init_db(conn: sqlite3.Connection) -> None:
    """Run the schema script on ``conn`` and commit."""
    schema_script = SCHEMA
    conn.executescript(schema_script)
    conn.commit()
|
||||
@@ -0,0 +1,324 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import email.utils
|
||||
import sqlite3
|
||||
import urllib.error
|
||||
import urllib.request
|
||||
import xml.etree.ElementTree as ET
|
||||
from dataclasses import dataclass
|
||||
from datetime import UTC, datetime
|
||||
|
||||
from .scoring import score_article
|
||||
from .text import canonicalize_url, clean_text, sha256_text
|
||||
|
||||
|
||||
# User-Agent header value sent with feed requests.
USER_AGENT = "goodNews/0.1 (+local constructive news prototype)"
|
||||
|
||||
|
||||
@dataclass
class FeedItem:
    """One feed entry's metadata; only ``title`` and ``url`` are required."""

    # Required fields.
    title: str
    url: str

    # Optional metadata; None when not supplied.
    description: str | None = None
    author: str | None = None
    published_at: str | None = None
    image_url: str | None = None
    language: str | None = None
    raw_guid: str | None = None
|
||||
|
||||
|
||||
def poll_all_sources(conn: sqlite3.Connection, limit: int | None = None) -> dict:
    """Poll active sources in id order and return aggregate counters.

    Args:
        conn: Open database connection.
        limit: When given, only the first ``limit`` active sources (a list
            slice) are polled.

    Returns:
        A dict with ``sources`` (number polled), ``seen``, ``inserted`` and
        ``duplicate`` item totals, and ``failed`` (sources whose poll
        reported status ``"failed"``).
    """
    active_sources = conn.execute(
        """
        SELECT *
        FROM sources
        WHERE active = 1
        ORDER BY id
        """
    ).fetchall()
    if limit is not None:
        active_sources = active_sources[:limit]

    totals = dict.fromkeys(("sources", "seen", "inserted", "duplicate", "failed"), 0)
    for source in active_sources:
        outcome = poll_source(conn, source)
        totals["sources"] += 1
        for counter in ("seen", "inserted", "duplicate"):
            totals[counter] += outcome[counter]
        if outcome["status"] == "failed":
            totals["failed"] += 1
    return totals
|
||||
|
||||
|
||||
def poll_source(conn: sqlite3.Connection, source: sqlite3.Row) -> dict:
    """Fetch one source's feed, store its new items, and log the run.

    An ``ingest_runs`` row is inserted and committed before any network
    access, so the attempt is recorded even when the fetch fails. That row is
    then updated to ``ok`` or ``failed`` together with the item counters.

    Returns a dict with ``status``, ``seen``, ``inserted`` and ``duplicate``;
    failed runs additionally carry ``error``. Exceptions raised while
    fetching, parsing or inserting are captured in the result rather than
    re-raised.
    """
    run_id = conn.execute(
        "INSERT INTO ingest_runs (source_id) VALUES (?)",
        (source["id"],),
    ).lastrowid
    conn.commit()

    seen = inserted = duplicate = 0
    try:
        xml = fetch_feed(source["feed_url"])
        items = parse_feed(xml)
        seen = len(items)
        for item in items:
            if insert_article(conn, source, item):
                inserted += 1
            else:
                duplicate += 1

        conn.execute(
            """
            UPDATE ingest_runs
            SET finished_at = CURRENT_TIMESTAMP,
                status = 'ok',
                items_seen = ?,
                items_inserted = ?,
                items_duplicate = ?
            WHERE id = ?
            """,
            (seen, inserted, duplicate, run_id),
        )
        conn.commit()
        return {"status": "ok", "seen": seen, "inserted": inserted, "duplicate": duplicate}
    except Exception as exc:
        # Drop whatever the failing step left pending (for example an article
        # row whose score insert never ran); otherwise the commit below would
        # persist that half-written article. Articles inserted earlier in the
        # loop were already committed and are unaffected.
        conn.rollback()
        conn.execute(
            """
            UPDATE ingest_runs
            SET finished_at = CURRENT_TIMESTAMP,
                status = 'failed',
                items_seen = ?,
                items_inserted = ?,
                items_duplicate = ?,
                error = ?
            WHERE id = ?
            """,
            (seen, inserted, duplicate, str(exc), run_id),
        )
        conn.commit()
        return {
            "status": "failed",
            "seen": seen,
            "inserted": inserted,
            "duplicate": duplicate,
            "error": str(exc),
        }
|
||||
|
||||
|
||||
def fetch_feed(url: str, timeout: int = 20) -> bytes:
    """Download ``url`` and return the raw response body.

    The request is sent with the module's ``USER_AGENT`` header.

    Raises:
        RuntimeError: for an HTTP error status or any other ``URLError``.
    """
    feed_request = urllib.request.Request(url, headers={"User-Agent": USER_AGENT})
    try:
        with urllib.request.urlopen(feed_request, timeout=timeout) as reply:
            return reply.read()
    except urllib.error.URLError as exc:
        # HTTPError is a URLError subclass; report its status code specifically.
        if isinstance(exc, urllib.error.HTTPError):
            raise RuntimeError(f"HTTP {exc.code} fetching {url}") from exc
        raise RuntimeError(f"failed fetching {url}: {exc.reason}") from exc
|
||||
|
||||
|
||||
def parse_feed(xml: bytes) -> list[FeedItem]:
    """Parse a feed document into items.

    A root element whose local name is ``feed`` is handled as Atom; every
    other root is handled as RSS.
    """
    document = ET.fromstring(xml)
    parser = _parse_atom if _local_name(document.tag) == "feed" else _parse_rss
    return parser(document)
|
||||
|
||||
|
||||
def insert_article(conn: sqlite3.Connection, source: sqlite3.Row, item: FeedItem) -> bool:
    """Store ``item`` plus its heuristic scores; return ``True`` if a row was added.

    Returns ``False`` without committing when the item has no usable URL or
    title, or when the ``articles`` insert raises ``sqlite3.IntegrityError``.
    On success the article row and its score row are committed together.
    """
    link = canonicalize_url(item.url)
    if not (link and item.title):
        return False

    headline = clean_text(item.title, max_len=500)
    snippet = clean_text(item.description, max_len=1000)
    if not headline:
        return False

    article_values = (
        source["id"],
        link,
        headline,
        snippet,
        clean_text(item.author, max_len=250),
        item.published_at,
        canonicalize_url(item.image_url),
        item.language,
        item.raw_guid,
        sha256_text(link),
        sha256_text(headline),
    )
    try:
        cursor = conn.execute(
            """
            INSERT INTO articles (
                source_id, canonical_url, title, description, author,
                published_at, image_url, language, raw_guid, url_hash, title_hash
            )
            VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
            """,
            article_values,
        )
    except sqlite3.IntegrityError:
        return False

    scores = score_article(headline, snippet, int(source["pr_risk_score"]))
    # Same order as the column list in the INSERT below.
    score_columns = (
        "constructive_score",
        "cortisol_score",
        "ragebait_score",
        "agency_score",
        "human_benefit_score",
        "novelty_score",
        "pr_risk_score",
        "accepted",
        "reason_code",
        "reason_text",
        "model_name",
    )
    conn.execute(
        """
        INSERT INTO article_scores (
            article_id, constructive_score, cortisol_score, ragebait_score,
            agency_score, human_benefit_score, novelty_score, pr_risk_score,
            accepted, reason_code, reason_text, model_name
        )
        VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
        """,
        (cursor.lastrowid, *(scores[column] for column in score_columns)),
    )
    conn.commit()
    return True
|
||||
|
||||
|
||||
def _parse_rss(root: ET.Element) -> list[FeedItem]:
    """Extract items from an RSS-style tree.

    The feed language is read from the ``channel`` child when there is one,
    otherwise from ``root`` itself. Every ``item`` element anywhere in the
    tree is considered; items without a title, or without both a link and a
    guid, are skipped.
    """
    # Compare with None explicitly: an Element's truth value reflects its
    # child count (and testing it is deprecated), so ``channel or root``
    # would misread an empty <channel> as missing.
    channel = _first_child(root, "channel")
    if channel is None:
        channel = root
    language = _first_text(channel, "language")

    parsed = []
    for item in root.iter():
        if _local_name(item.tag) != "item":
            continue
        title = _first_text(item, "title")
        link = _first_text(item, "link")
        guid = _first_text(item, "guid")
        url = link or guid  # fall back to the guid when there is no <link>
        if not title or not url:
            continue
        parsed.append(
            FeedItem(
                title=title,
                url=url,
                description=_first_text(item, "description", "summary", "encoded"),
                author=_first_text(item, "author", "creator"),
                published_at=_parse_date(_first_text(item, "pubDate", "published", "updated", "date")),
                image_url=_find_image_url(item),
                language=language,
                raw_guid=guid,
            )
        )
    return parsed
|
||||
|
||||
|
||||
def _parse_atom(root: ET.Element) -> list[FeedItem]:
    """Extract items from an Atom ``feed`` element.

    Only direct ``entry`` children are read. Entries without a title or a
    usable link are skipped. The language comes from the root's ``xml:lang``
    attribute.
    """
    language = root.attrib.get("{http://www.w3.org/XML/1998/namespace}lang")
    collected = []
    for entry in [node for node in root if _local_name(node.tag) == "entry"]:
        title = _first_text(entry, "title")
        url = _atom_link(entry)
        if not (title and url):
            continue

        author_el = _first_child(entry, "author")
        if author_el is None:
            author = None
        else:
            # Prefer <author><name>; otherwise use the author element's own text.
            author = _first_text(author_el, "name") or _text(author_el)

        collected.append(
            FeedItem(
                title=title,
                url=url,
                description=_first_text(entry, "summary", "content"),
                author=author,
                published_at=_parse_date(_first_text(entry, "published", "updated")),
                image_url=_find_image_url(entry),
                language=language,
                raw_guid=_first_text(entry, "id"),
            )
        )
    return collected
|
||||
|
||||
|
||||
def _atom_link(entry: ET.Element) -> str | None:
    """Pick the article URL from an Atom entry's ``link`` children.

    The first link with a non-empty ``href`` whose ``rel`` is ``alternate``
    (or absent) wins. If there is none, the first non-empty ``href`` of any
    other link is returned, or ``None`` when no link carries an ``href``.
    """
    first_other = None
    links = (node for node in entry if _local_name(node.tag) == "link")
    for node in links:
        href = node.attrib.get("href")
        if not href:
            continue
        if node.attrib.get("rel", "alternate") == "alternate":
            return href
        if not first_other:
            first_other = href
    return first_other
|
||||
|
||||
|
||||
def _find_image_url(element: ET.Element) -> str | None:
    """Return the first image URL found in ``element`` or its descendants.

    Recognised carriers, checked in document order:
    ``thumbnail`` / ``content`` elements with a ``url`` attribute whose
    ``medium`` is absent or ``image``, and ``enclosure`` elements with a
    ``url`` and an ``image/*`` ``type``.
    """
    for node in element.iter():
        tag = _local_name(node.tag)
        link = node.attrib.get("url")
        if not link:
            continue
        if tag in {"thumbnail", "content"}:
            if node.attrib.get("medium") in {None, "image"}:
                return node.attrib["url"]
        elif tag == "enclosure":
            if node.attrib.get("type", "").startswith("image/"):
                return node.attrib["url"]
    return None
|
||||
|
||||
|
||||
def _parse_date(value: str | None) -> str | None:
|
||||
if not value:
|
||||
return None
|
||||
value = value.strip()
|
||||
try:
|
||||
parsed = email.utils.parsedate_to_datetime(value)
|
||||
if parsed.tzinfo is None:
|
||||
parsed = parsed.replace(tzinfo=UTC)
|
||||
return parsed.astimezone(UTC).isoformat()
|
||||
except (TypeError, ValueError):
|
||||
pass
|
||||
|
||||
try:
|
||||
parsed = datetime.fromisoformat(value.replace("Z", "+00:00"))
|
||||
if parsed.tzinfo is None:
|
||||
parsed = parsed.replace(tzinfo=UTC)
|
||||
return parsed.astimezone(UTC).isoformat()
|
||||
except ValueError:
|
||||
return None
|
||||
|
||||
|
||||
def _first_child(element: ET.Element, name: str) -> ET.Element | None:
    """Return the first direct child whose namespace-stripped tag is ``name``."""
    matches = (node for node in element if _local_name(node.tag) == name)
    return next(matches, None)
|
||||
|
||||
|
||||
def _first_text(element: ET.Element, *names: str) -> str | None:
    """Return the text of the first direct child named in ``names`` that has any.

    Children are visited in document order, not in the order of ``names``;
    matching children with no text are skipped.
    """
    candidates = (node for node in element if _local_name(node.tag) in names)
    for node in candidates:
        content = _text(node)
        if content:
            return content
    return None
|
||||
|
||||
|
||||
def _text(element: ET.Element) -> str | None:
|
||||
if element.text:
|
||||
return element.text.strip()
|
||||
return None
|
||||
|
||||
|
||||
def _local_name(tag: str) -> str:
    """Strip a ``{namespace}`` prefix from an ElementTree tag name."""
    # rpartition yields the whole string as the tail when "}" is absent.
    return tag.rpartition("}")[2]
|
||||
|
||||
+265
@@ -0,0 +1,265 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
import os
|
||||
import sqlite3
|
||||
import urllib.error
|
||||
import urllib.request
|
||||
from dataclasses import dataclass
|
||||
|
||||
|
||||
DEFAULT_BASE_URL = "http://127.0.0.1:1234/v1"
|
||||
DEFAULT_MODEL = "gpt-oss"
|
||||
|
||||
|
||||
SYSTEM_PROMPT = """You classify article metadata for a calm constructive-news digest.
|
||||
|
||||
Judge emotional aftertaste, not simple positivity. Accept stories that leave a reader informed without feeling drained, especially when they include repair, progress, agency, resilience, human benefit, scientific discovery, environmental improvement, community action, or useful perspective.
|
||||
|
||||
Reject stories centered on fear, outrage, partisan conflict, crime, tragedy, disaster repetition, celebrity drama, market panic, or corporate PR without clear public benefit.
|
||||
|
||||
Return only JSON with this exact shape:
|
||||
{
|
||||
"constructive_score": 0,
|
||||
"cortisol_score": 0,
|
||||
"ragebait_score": 0,
|
||||
"agency_score": 0,
|
||||
"human_benefit_score": 0,
|
||||
"novelty_score": 0,
|
||||
"pr_risk_score": 0,
|
||||
"accepted": false,
|
||||
"reason_code": "short_snake_case",
|
||||
"reason_text": "one concise sentence"
|
||||
}
|
||||
"""
|
||||
|
||||
|
||||
@dataclass
|
||||
class LocalModelClient:
|
||||
base_url: str
|
||||
model: str
|
||||
api_key: str | None = None
|
||||
timeout: int = 90
|
||||
|
||||
@classmethod
|
||||
def from_env(cls) -> "LocalModelClient":
|
||||
return cls(
|
||||
base_url=os.environ.get("GOODNEWS_LLM_BASE_URL", DEFAULT_BASE_URL).rstrip("/"),
|
||||
model=os.environ.get("GOODNEWS_LLM_MODEL", DEFAULT_MODEL),
|
||||
api_key=os.environ.get("GOODNEWS_LLM_API_KEY"),
|
||||
)
|
||||
|
||||
def classify(self, article: sqlite3.Row) -> dict:
|
||||
payload = {
|
||||
"model": self.model,
|
||||
"temperature": 0.1,
|
||||
"messages": [
|
||||
{"role": "system", "content": SYSTEM_PROMPT},
|
||||
{"role": "user", "content": _article_prompt(article)},
|
||||
],
|
||||
"response_format": {"type": "json_object"},
|
||||
}
|
||||
try:
|
||||
return self._chat(payload)
|
||||
except RuntimeError as exc:
|
||||
if "HTTP 400" not in str(exc):
|
||||
raise
|
||||
payload.pop("response_format", None)
|
||||
return self._chat(payload)
|
||||
|
||||
def list_models(self) -> list[str]:
|
||||
headers = {}
|
||||
if self.api_key:
|
||||
headers["Authorization"] = f"Bearer {self.api_key}"
|
||||
request = urllib.request.Request(f"{self.base_url}/models", headers=headers)
|
||||
try:
|
||||
with urllib.request.urlopen(request, timeout=10) as response:
|
||||
data = json.loads(response.read().decode("utf-8"))
|
||||
except urllib.error.HTTPError as exc:
|
||||
detail = exc.read().decode("utf-8", errors="replace")
|
||||
raise RuntimeError(f"HTTP {exc.code} from local model: {detail}") from exc
|
||||
except urllib.error.URLError as exc:
|
||||
raise RuntimeError(f"could not reach local model at {self.base_url}: {exc.reason}") from exc
|
||||
|
||||
models = data.get("data", [])
|
||||
names = []
|
||||
for model in models:
|
||||
if isinstance(model, dict) and model.get("id"):
|
||||
names.append(str(model["id"]))
|
||||
return names
|
||||
|
||||
def _chat(self, payload: dict) -> dict:
|
||||
body = json.dumps(payload).encode("utf-8")
|
||||
headers = {"Content-Type": "application/json"}
|
||||
if self.api_key:
|
||||
headers["Authorization"] = f"Bearer {self.api_key}"
|
||||
request = urllib.request.Request(
|
||||
f"{self.base_url}/chat/completions",
|
||||
data=body,
|
||||
headers=headers,
|
||||
method="POST",
|
||||
)
|
||||
try:
|
||||
with urllib.request.urlopen(request, timeout=self.timeout) as response:
|
||||
data = json.loads(response.read().decode("utf-8"))
|
||||
except urllib.error.HTTPError as exc:
|
||||
detail = exc.read().decode("utf-8", errors="replace")
|
||||
raise RuntimeError(f"HTTP {exc.code} from local model: {detail}") from exc
|
||||
except urllib.error.URLError as exc:
|
||||
raise RuntimeError(f"could not reach local model at {self.base_url}: {exc.reason}") from exc
|
||||
|
||||
try:
|
||||
content = data["choices"][0]["message"]["content"]
|
||||
except (KeyError, IndexError, TypeError) as exc:
|
||||
raise RuntimeError(f"unexpected local model response: {data}") from exc
|
||||
return parse_classifier_json(content)
|
||||
|
||||
|
||||
def classify_articles(
    conn: sqlite3.Connection,
    client: LocalModelClient,
    limit: int,
    include_rejected: bool = False,
    dry_run: bool = False,
) -> list[tuple[int, dict]]:
    """Score up to ``limit`` candidate articles with the local model.

    Returns ``(article_id, normalized_scores)`` pairs in processing order.
    Unless ``dry_run`` is set, each result is upserted into
    ``article_scores``.

    If the model call fails partway through, the exception propagates, but
    the scores already written for earlier articles are still committed, so
    one bad response does not throw away the completed work.
    """
    rows = _classification_candidates(conn, limit=limit, include_rejected=include_rejected)
    results = []
    try:
        for row in rows:
            scores = normalize_scores(client.classify(row), model_name=client.model)
            results.append((row["id"], scores))
            if not dry_run:
                upsert_article_score(conn, row["id"], scores)
    finally:
        if not dry_run:
            conn.commit()
    return results
|
||||
|
||||
|
||||
def parse_classifier_json(content: str) -> dict:
    """Parse the model's reply into a dict.

    The reply is parsed as JSON directly. If that fails, the text between
    the first ``{`` and the last ``}`` is tried, which copes with replies
    wrapped in prose or code fences.

    Raises:
        RuntimeError: when the reply is not a string, contains no parseable
            JSON, or parses to something other than a JSON object.
    """
    if not isinstance(content, str):
        raise RuntimeError(f"model did not return JSON: {content}")
    text = content.strip()
    try:
        parsed = json.loads(text)
    except json.JSONDecodeError:
        start = text.find("{")
        end = text.rfind("}")
        if start == -1 or end == -1 or end <= start:
            raise RuntimeError(f"model did not return JSON: {text}")
        try:
            parsed = json.loads(text[start : end + 1])
        except json.JSONDecodeError as exc:
            # Keep the documented failure type instead of leaking JSONDecodeError.
            raise RuntimeError(f"model did not return JSON: {text}") from exc
    if not isinstance(parsed, dict):
        # A bare list, string or number would break the score lookups downstream.
        raise RuntimeError(f"model did not return a JSON object: {text}")
    return parsed
|
||||
|
||||
|
||||
def normalize_scores(data: dict, model_name: str) -> dict:
    """Coerce a raw model reply into the score record stored per article.

    Each ``*_score`` field is passed through ``_bounded_int``. ``accepted``
    becomes ``1`` or ``0``, ``reason_code`` defaults to ``model_no_reason``
    and is cut to 120 characters, and ``reason_text`` is cut to 1000.
    """
    accepted = data.get("accepted")
    if isinstance(accepted, str):
        # bool("false") is True, so interpret string replies by their content.
        accepted = accepted.strip().lower() in {"true", "yes", "1"}

    score_fields = (
        "constructive_score",
        "cortisol_score",
        "ragebait_score",
        "agency_score",
        "human_benefit_score",
        "novelty_score",
        "pr_risk_score",
    )
    record = {name: _bounded_int(data.get(name)) for name in score_fields}
    record["accepted"] = 1 if bool(accepted) else 0
    record["reason_code"] = str(data.get("reason_code") or "model_no_reason")[:120]
    record["reason_text"] = str(data.get("reason_text") or "")[:1000]
    record["model_name"] = model_name
    return record
|
||||
|
||||
|
||||
def upsert_article_score(conn: sqlite3.Connection, article_id: int, scores: dict) -> None:
    """Insert or replace the score row for ``article_id``.

    An existing row for the same article has every score column overwritten
    and ``scored_at`` refreshed. The caller is responsible for committing.
    """
    # Same order as the column list in the INSERT below.
    columns = (
        "constructive_score",
        "cortisol_score",
        "ragebait_score",
        "agency_score",
        "human_benefit_score",
        "novelty_score",
        "pr_risk_score",
        "accepted",
        "reason_code",
        "reason_text",
        "model_name",
    )
    parameters = (article_id, *(scores[column] for column in columns))
    statement = """
        INSERT INTO article_scores (
            article_id, constructive_score, cortisol_score, ragebait_score,
            agency_score, human_benefit_score, novelty_score, pr_risk_score,
            accepted, reason_code, reason_text, model_name, scored_at
        )
        VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, CURRENT_TIMESTAMP)
        ON CONFLICT(article_id) DO UPDATE SET
            constructive_score = excluded.constructive_score,
            cortisol_score = excluded.cortisol_score,
            ragebait_score = excluded.ragebait_score,
            agency_score = excluded.agency_score,
            human_benefit_score = excluded.human_benefit_score,
            novelty_score = excluded.novelty_score,
            pr_risk_score = excluded.pr_risk_score,
            accepted = excluded.accepted,
            reason_code = excluded.reason_code,
            reason_text = excluded.reason_text,
            model_name = excluded.model_name,
            scored_at = CURRENT_TIMESTAMP
        """
    conn.execute(statement, parameters)
|
||||
|
||||
|
||||
def _classification_candidates(
    conn: sqlite3.Connection,
    limit: int,
    include_rejected: bool,
) -> list[sqlite3.Row]:
    """Select up to ``limit`` articles to send to the model.

    Unless ``include_rejected`` is set, only articles whose current score row
    is accepted or has ``constructive_score >= 4`` are returned. Rows whose
    ``model_name`` starts with ``heuristic-`` sort first, then newest first
    by ``published_at`` (falling back to ``discovered_at``).
    """
    if include_rejected:
        where = ""
    else:
        where = "WHERE s.accepted = 1 OR s.constructive_score >= 4"

    query = f"""
        SELECT
            a.id,
            a.title,
            a.description,
            a.published_at,
            a.canonical_url,
            src.name AS source_name,
            src.default_category,
            src.trust_score AS source_trust_score,
            src.pr_risk_score AS source_pr_risk_score,
            s.constructive_score,
            s.cortisol_score,
            s.ragebait_score,
            s.agency_score,
            s.human_benefit_score,
            s.pr_risk_score,
            s.accepted,
            s.reason_code
        FROM articles a
        JOIN sources src ON src.id = a.source_id
        LEFT JOIN article_scores s ON s.article_id = a.id
        {where}
        ORDER BY
            CASE WHEN s.model_name LIKE 'heuristic-%' THEN 0 ELSE 1 END,
            COALESCE(a.published_at, a.discovered_at) DESC
        LIMIT ?
        """
    cursor = conn.execute(query, (limit,))
    return cursor.fetchall()
|
||||
|
||||
|
||||
def _article_prompt(article: sqlite3.Row) -> str:
    """Render the article metadata as the newline-separated user message.

    A missing category or publish date is shown as ``unknown``; a missing
    description becomes an empty snippet.
    """
    category = article["default_category"] or "unknown"
    published = article["published_at"] or "unknown"
    snippet = article["description"] or ""
    fields = (
        ("Source", article["source_name"]),
        ("Source category", category),
        ("Source trust score", f"{article['source_trust_score']}/10"),
        ("Source PR risk score", f"{article['source_pr_risk_score']}/10"),
        ("Published", published),
        ("Title", article["title"]),
        ("Snippet", snippet),
        ("URL", article["canonical_url"]),
    )
    return "\n".join(f"{label}: {value}" for label, value in fields)
|
||||
|
||||
|
||||
def _bounded_int(value: object) -> int:
    """Coerce ``value`` to an int clamped to the range 0..10.

    Numeric strings such as ``"7.5"`` are truncated toward zero. Anything
    that cannot be read as a finite number (``None``, free text, NaN,
    infinity) yields ``0``.
    """
    try:
        parsed = int(value)
    except (TypeError, ValueError, OverflowError):
        # int() rejects "7.5" and raises OverflowError for infinity; retry
        # through float so decimal strings still count.
        try:
            parsed = int(float(value))
        except (TypeError, ValueError, OverflowError):
            parsed = 0
    return max(0, min(10, parsed))
|
||||
@@ -0,0 +1,169 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import re
|
||||
|
||||
|
||||
POSITIVE_TERMS = {
|
||||
"breakthrough",
|
||||
"progress",
|
||||
"improve",
|
||||
"improves",
|
||||
"improved",
|
||||
"solution",
|
||||
"solutions",
|
||||
"recovery",
|
||||
"restore",
|
||||
"restores",
|
||||
"rescued",
|
||||
"rescue",
|
||||
"volunteer",
|
||||
"community",
|
||||
"donate",
|
||||
"donation",
|
||||
"cure",
|
||||
"treatment",
|
||||
"therapy",
|
||||
"clean energy",
|
||||
"renewable",
|
||||
"conservation",
|
||||
"protect",
|
||||
"protects",
|
||||
"restoration",
|
||||
"kindness",
|
||||
"hope",
|
||||
"first",
|
||||
"record",
|
||||
}
|
||||
|
||||
AGENCY_TERMS = {
|
||||
"how",
|
||||
"helps",
|
||||
"helping",
|
||||
"protect",
|
||||
"protects",
|
||||
"builds",
|
||||
"creates",
|
||||
"launches",
|
||||
"teaches",
|
||||
"learn",
|
||||
"guide",
|
||||
"tool",
|
||||
"program",
|
||||
"initiative",
|
||||
"effort",
|
||||
"plan",
|
||||
"rebuild",
|
||||
}
|
||||
|
||||
CORTISOL_TERMS = {
|
||||
"war",
|
||||
"killed",
|
||||
"dead",
|
||||
"death",
|
||||
"murder",
|
||||
"shooting",
|
||||
"attack",
|
||||
"crisis",
|
||||
"catastrophe",
|
||||
"disaster",
|
||||
"collapse",
|
||||
"panic",
|
||||
"warning",
|
||||
"threat",
|
||||
"fear",
|
||||
"fears",
|
||||
"lawsuit",
|
||||
"scandal",
|
||||
}
|
||||
|
||||
RAGEBAIT_TERMS = {
|
||||
"slams",
|
||||
"blasts",
|
||||
"furious",
|
||||
"outrage",
|
||||
"rage",
|
||||
"shocking",
|
||||
"you won't believe",
|
||||
"sparks backlash",
|
||||
"destroyed",
|
||||
"humiliates",
|
||||
}
|
||||
|
||||
PR_TERMS = {
|
||||
"announces",
|
||||
"unveils",
|
||||
"funding round",
|
||||
"raises",
|
||||
"partnership",
|
||||
"brand",
|
||||
"sponsored",
|
||||
"press release",
|
||||
}
|
||||
|
||||
WORD_RE = re.compile(r"[a-z0-9']+")
|
||||
|
||||
|
||||
def _count_terms(text: str, terms: set[str]) -> int:
    """Count how many distinct ``terms`` occur in ``text``, case-insensitively.

    Single-word terms must match a whole token found by ``WORD_RE``; terms
    containing a space are matched as substrings of the lowered text. Each
    term counts at most once.
    """
    haystack = text.lower()
    tokens = set(WORD_RE.findall(haystack))
    return sum(
        1
        for term in terms
        if (term in haystack if " " in term else term in tokens)
    )
|
||||
|
||||
|
||||
def score_article(title: str, description: str | None, source_pr_risk: int) -> dict:
    """Score an article from keyword hits in its title and description.

    Every score is capped at 10. An article is accepted when it is
    constructive enough (>= 5) while staying at or below the cortisol (5),
    ragebait (3) and PR-risk (7) limits. The returned dict carries the
    scores, ``accepted`` as 1/0, a reason code and text, and the model name
    ``heuristic-v0``.
    """
    text = f"{title}. {description or ''}"
    hits = {
        label: _count_terms(text, vocabulary)
        for label, vocabulary in (
            ("positive", POSITIVE_TERMS),
            ("agency", AGENCY_TERMS),
            ("cortisol", CORTISOL_TERMS),
            ("ragebait", RAGEBAIT_TERMS),
            ("pr", PR_TERMS),
        )
    }

    constructive_score = min(10, 2 + hits["positive"] * 2 + hits["agency"])
    agency_score = min(10, 1 + hits["agency"] * 2)
    cortisol_score = min(10, hits["cortisol"] * 3)
    ragebait_score = min(10, hits["ragebait"] * 4)
    pr_risk_score = min(10, source_pr_risk + hits["pr"] * 2)
    human_benefit_score = min(10, hits["positive"] * 2 + hits["agency"])
    novelty_score = 5  # fixed value; the heuristic never varies it

    accepted = all(
        (
            constructive_score >= 5,
            cortisol_score <= 5,
            ragebait_score <= 3,
            pr_risk_score <= 7,
        )
    )

    # First matching rule wins; the final entry is the catch-all.
    verdicts = (
        (
            accepted,
            "heuristic_constructive_candidate",
            "Constructive or agency-oriented language with low obvious cortisol/ragebait signals.",
        ),
        (
            ragebait_score > 3,
            "heuristic_reject_ragebait_language",
            "Headline or snippet contains outrage-oriented language.",
        ),
        (
            cortisol_score > 5,
            "heuristic_reject_cortisol_heavy",
            "Headline or snippet appears tragedy, threat, conflict, or crisis centered.",
        ),
        (
            pr_risk_score > 7,
            "heuristic_reject_pr_risk",
            "Headline or source has signs of corporate PR framing.",
        ),
        (
            True,
            "heuristic_needs_review",
            "Not enough constructive signal for automatic acceptance.",
        ),
    )
    reason_code, reason_text = next((code, why) for applies, code, why in verdicts if applies)

    return {
        "constructive_score": constructive_score,
        "cortisol_score": cortisol_score,
        "ragebait_score": ragebait_score,
        "agency_score": agency_score,
        "human_benefit_score": human_benefit_score,
        "novelty_score": novelty_score,
        "pr_risk_score": pr_risk_score,
        "accepted": 1 if accepted else 0,
        "reason_code": reason_code,
        "reason_text": reason_text,
        "model_name": "heuristic-v0",
    }
|
||||
@@ -0,0 +1,55 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import sqlite3
|
||||
import tomllib
|
||||
from pathlib import Path
|
||||
|
||||
|
||||
def load_sources(path: Path | str) -> list[dict]:
|
||||
data = tomllib.loads(Path(path).read_text(encoding="utf-8"))
|
||||
sources = data.get("sources", [])
|
||||
if not isinstance(sources, list):
|
||||
raise ValueError("sources.toml must contain [[sources]] entries")
|
||||
return sources
|
||||
|
||||
|
||||
def upsert_sources(conn: sqlite3.Connection, source_defs: list[dict]) -> int:
    """Insert or update one ``sources`` row per definition, keyed by ``feed_url``.

    ``name`` and ``feed_url`` are required. Missing optional fields default
    to type ``rss``, trust 5, PR risk 3, active, and a 60-minute poll
    interval. Commits once after all rows and returns how many definitions
    were processed.
    """
    statement = """
        INSERT INTO sources (
            name, homepage_url, feed_url, source_type, default_category,
            trust_score, pr_risk_score, active, poll_interval_minutes, notes,
            updated_at
        )
        VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, CURRENT_TIMESTAMP)
        ON CONFLICT(feed_url) DO UPDATE SET
            name = excluded.name,
            homepage_url = excluded.homepage_url,
            source_type = excluded.source_type,
            default_category = excluded.default_category,
            trust_score = excluded.trust_score,
            pr_risk_score = excluded.pr_risk_score,
            active = excluded.active,
            poll_interval_minutes = excluded.poll_interval_minutes,
            notes = excluded.notes,
            updated_at = CURRENT_TIMESTAMP
        """
    processed = 0
    for definition in source_defs:
        lookup = definition.get
        row = (
            definition["name"],
            lookup("homepage_url"),
            definition["feed_url"],
            lookup("source_type", "rss"),
            lookup("default_category"),
            int(lookup("trust_score", 5)),
            int(lookup("pr_risk_score", 3)),
            1 if lookup("active", True) else 0,
            int(lookup("poll_interval_minutes", 60)),
            lookup("notes"),
        )
        conn.execute(statement, row)
        processed += 1
    conn.commit()
    return processed
|
||||
|
||||
@@ -0,0 +1,62 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import hashlib
|
||||
import html
|
||||
import re
|
||||
from urllib.parse import parse_qsl, urlencode, urlsplit, urlunsplit
|
||||
|
||||
|
||||
TAG_RE = re.compile(r"<[^>]+>")
|
||||
WHITESPACE_RE = re.compile(r"\s+")
|
||||
TRACKING_PREFIXES = ("utm_",)
|
||||
TRACKING_PARAMS = {
|
||||
"fbclid",
|
||||
"gclid",
|
||||
"mc_cid",
|
||||
"mc_eid",
|
||||
"igshid",
|
||||
"ref",
|
||||
}
|
||||
|
||||
|
||||
def clean_text(value: str | None, max_len: int = 1000) -> str | None:
    """Reduce feed markup to plain single-spaced text of at most ``max_len`` chars.

    Tags matched by ``TAG_RE`` are replaced with spaces, HTML entities are
    unescaped, and whitespace runs collapse to one space. Longer text is cut
    and ends in ``...``; the result including that marker never exceeds
    ``max_len``. Returns ``None`` when nothing is left.
    """
    if not value:
        return None
    text = TAG_RE.sub(" ", value)
    text = html.unescape(text)
    text = WHITESPACE_RE.sub(" ", text).strip()
    if len(text) > max_len:
        marker = "..."
        if max_len <= len(marker):
            # No room for the marker: hard-cut instead of overshooting.
            return text[: max(max_len, 0)] or None
        # Reserve room for the marker so the limit holds for the final string.
        return text[: max_len - len(marker)].rstrip() + marker
    return text or None
|
||||
|
||||
|
||||
def canonicalize_url(url: str | None) -> str | None:
|
||||
if not url:
|
||||
return None
|
||||
url = html.unescape(url).strip()
|
||||
if not url:
|
||||
return None
|
||||
parts = urlsplit(url)
|
||||
if parts.scheme not in {"http", "https"} or not parts.netloc:
|
||||
return None
|
||||
|
||||
query = []
|
||||
for key, value in parse_qsl(parts.query, keep_blank_values=True):
|
||||
lowered = key.lower()
|
||||
if lowered in TRACKING_PARAMS or lowered.startswith(TRACKING_PREFIXES):
|
||||
continue
|
||||
query.append((key, value))
|
||||
|
||||
normalized = parts._replace(
|
||||
scheme=parts.scheme.lower(),
|
||||
netloc=parts.netloc.lower(),
|
||||
query=urlencode(sorted(query), doseq=True),
|
||||
fragment="",
|
||||
)
|
||||
return urlunsplit(normalized)
|
||||
|
||||
|
||||
def sha256_text(value: str | None) -> str:
|
||||
normalized = (value or "").strip().lower()
|
||||
return hashlib.sha256(normalized.encode("utf-8")).hexdigest()
|
||||
|
||||
@@ -0,0 +1,10 @@
|
||||
[project]
|
||||
name = "goodnews"
|
||||
version = "0.1.0"
|
||||
description = "Local-first constructive news ingestion and filtering prototype."
|
||||
requires-python = ">=3.11"
|
||||
dependencies = []
|
||||
|
||||
[project.scripts]
|
||||
goodnews = "goodnews.cli:main"
|
||||
|
||||
Reference in New Issue
Block a user