068073423f
Local-first RSS/Atom ingestion pipeline with metadata-only storage, heuristic + local-LLM scoring, and daily brief builder. Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
106 lines
3.1 KiB
Python
106 lines
3.1 KiB
Python
from __future__ import annotations
|
|
|
|
import sqlite3
|
|
from pathlib import Path
|
|
|
|
|
|
# DDL for the whole database, applied in one go via ``executescript``.
# Every statement uses IF NOT EXISTS, so re-running the script against an
# already-initialised database is harmless.
#
# Tables:
#   sources            - one row per feed (unique name and feed_url).
#   articles           - feed items, unique on url_hash; rows are removed
#                        together with their source (ON DELETE CASCADE).
#   article_scores     - at most one score row per article (article_id is
#                        the primary key), removed together with the article.
#   ingest_runs        - per-run bookkeeping; source_id is set to NULL when
#                        the source is deleted so the run history survives.
#   daily_briefs       - one brief per brief_date.
#   daily_brief_items  - ranked articles of a brief; an article appears at
#                        most once per brief and each rank is used once.
#
# articles.source_id carries exactly one foreign key (the inline one with
# ON DELETE CASCADE); declaring a second, table-level FOREIGN KEY on the same
# column would add a NO ACTION constraint that conflicts with that intent.
SCHEMA = """
PRAGMA foreign_keys = ON;

CREATE TABLE IF NOT EXISTS sources (
    id INTEGER PRIMARY KEY AUTOINCREMENT,
    name TEXT NOT NULL UNIQUE,
    homepage_url TEXT,
    feed_url TEXT NOT NULL UNIQUE,
    source_type TEXT NOT NULL DEFAULT 'rss',
    default_category TEXT,
    trust_score INTEGER NOT NULL DEFAULT 5,
    pr_risk_score INTEGER NOT NULL DEFAULT 3,
    active INTEGER NOT NULL DEFAULT 1,
    poll_interval_minutes INTEGER NOT NULL DEFAULT 60,
    notes TEXT,
    created_at TEXT NOT NULL DEFAULT CURRENT_TIMESTAMP,
    updated_at TEXT NOT NULL DEFAULT CURRENT_TIMESTAMP
);

CREATE TABLE IF NOT EXISTS articles (
    id INTEGER PRIMARY KEY AUTOINCREMENT,
    source_id INTEGER NOT NULL REFERENCES sources(id) ON DELETE CASCADE,
    canonical_url TEXT NOT NULL,
    title TEXT NOT NULL,
    description TEXT,
    author TEXT,
    published_at TEXT,
    discovered_at TEXT NOT NULL DEFAULT CURRENT_TIMESTAMP,
    image_url TEXT,
    language TEXT,
    raw_guid TEXT,
    url_hash TEXT NOT NULL UNIQUE,
    title_hash TEXT
);

CREATE INDEX IF NOT EXISTS idx_articles_published_at ON articles(published_at);
CREATE INDEX IF NOT EXISTS idx_articles_source_id ON articles(source_id);
CREATE INDEX IF NOT EXISTS idx_articles_title_hash ON articles(title_hash);

CREATE TABLE IF NOT EXISTS article_scores (
    article_id INTEGER PRIMARY KEY REFERENCES articles(id) ON DELETE CASCADE,
    constructive_score INTEGER,
    cortisol_score INTEGER,
    ragebait_score INTEGER,
    agency_score INTEGER,
    human_benefit_score INTEGER,
    novelty_score INTEGER,
    pr_risk_score INTEGER,
    accepted INTEGER,
    reason_code TEXT,
    reason_text TEXT,
    model_name TEXT,
    scored_at TEXT NOT NULL DEFAULT CURRENT_TIMESTAMP
);

CREATE TABLE IF NOT EXISTS ingest_runs (
    id INTEGER PRIMARY KEY AUTOINCREMENT,
    source_id INTEGER REFERENCES sources(id) ON DELETE SET NULL,
    started_at TEXT NOT NULL DEFAULT CURRENT_TIMESTAMP,
    finished_at TEXT,
    status TEXT NOT NULL DEFAULT 'running',
    items_seen INTEGER NOT NULL DEFAULT 0,
    items_inserted INTEGER NOT NULL DEFAULT 0,
    items_duplicate INTEGER NOT NULL DEFAULT 0,
    error TEXT
);

CREATE TABLE IF NOT EXISTS daily_briefs (
    id INTEGER PRIMARY KEY AUTOINCREMENT,
    brief_date TEXT NOT NULL UNIQUE,
    title TEXT NOT NULL,
    created_at TEXT NOT NULL DEFAULT CURRENT_TIMESTAMP,
    notes TEXT
);

CREATE TABLE IF NOT EXISTS daily_brief_items (
    brief_id INTEGER NOT NULL REFERENCES daily_briefs(id) ON DELETE CASCADE,
    article_id INTEGER NOT NULL REFERENCES articles(id) ON DELETE CASCADE,
    rank INTEGER NOT NULL,
    selection_reason TEXT,
    PRIMARY KEY (brief_id, article_id),
    UNIQUE (brief_id, rank)
);
"""
|
|
|
|
|
|
def connect(db_path: Path | str) -> sqlite3.Connection:
    """Open the SQLite database at *db_path* and return the connection.

    The parent directory of the database file is created first (including
    any missing ancestors). The returned connection yields ``sqlite3.Row``
    rows and has foreign-key enforcement switched on, since SQLite leaves
    it off by default for each new connection.
    """
    db_file = Path(db_path)
    db_file.parent.mkdir(parents=True, exist_ok=True)

    connection = sqlite3.connect(db_file)
    connection.execute("PRAGMA foreign_keys = ON")
    connection.row_factory = sqlite3.Row
    return connection
|
|
|
|
|
|
def init_db(conn: sqlite3.Connection) -> None:
    """Apply the module's ``SCHEMA`` script to *conn* and commit."""
    ddl_script = SCHEMA
    conn.executescript(ddl_script)
    conn.commit()
|