Geo Stage 1-2: subject-geography model + classifier + pipeline wiring
"Closer to Home" foundation (audit greenlit by Codex). Durable geography, kept decoupled from volatile scoring. - Schema: article_geo (breadth/confidence/rationale/geo_version) + article_places (0..N ISO-coded places), separate from article_scores so re-runs/audits never disturb scoring or acceptance. "local" is never stored — it's relative to the reader; the UI computes "Near you" later. - geo.py: LLM proposes place NAMES, code disposes to ISO codes (country alpha-2, US state 2-letter); region words like "Europe" can never become a country. 'global'/placeless is first-class, not failure. Confidence calibrated so 'high' needs an explicit location. Geo is its OWN LLM pass, not merged into the scoring prompt (durable metadata, re-runnable, keeps the sensitive prompt untouched). - store_geo replaces places (geo is re-derivable, unlike scores). tag_articles is idempotent by geo_version, only touches accepted non-duplicate articles. - CLI `geo` command (cycle-locked, --limit/--reclassify) for backfill, plus a bounded geo step in the cycle (--geo-limit 60, --no-geo). scripts/geo_audit.py is the prototype audit tool. 360 tests green; live smoke tagged real articles correctly (Gaza->PS, London->GB, placeless science->global). No UI / SEO pages yet — ranking/personalization only. Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
This commit is contained in:
@@ -6,5 +6,6 @@ node_modules/
|
|||||||
data/*.sqlite3
|
data/*.sqlite3
|
||||||
data/*.sqlite3-*
|
data/*.sqlite3-*
|
||||||
data/*.db
|
data/*.db
|
||||||
|
data/geo_audit*.json
|
||||||
|
|
||||||
logs/
|
logs/
|
||||||
|
|||||||
@@ -12,6 +12,7 @@ from .digest import send_due_digests
|
|||||||
from .games import generate_daily_puzzles
|
from .games import generate_daily_puzzles
|
||||||
from .localtime import local_today
|
from .localtime import local_today
|
||||||
from .dedup import DEFAULT_THRESHOLD, DEFAULT_WINDOW_DAYS, cluster_duplicates, dedup as run_dedup
|
from .dedup import DEFAULT_THRESHOLD, DEFAULT_WINDOW_DAYS, cluster_duplicates, dedup as run_dedup
|
||||||
|
from .geo import tag_articles as tag_geo
|
||||||
from .enrich import enrich_brief_images, enrich_recent_images, enrich_summarized_images
|
from .enrich import enrich_brief_images, enrich_recent_images, enrich_summarized_images
|
||||||
from .summarize import generate_summary, get_summary
|
from .summarize import generate_summary, get_summary
|
||||||
from .feeds import (
|
from .feeds import (
|
||||||
@@ -132,6 +133,8 @@ def main() -> None:
|
|||||||
cycle_parser.add_argument("--classify-limit", type=int, default=40)
|
cycle_parser.add_argument("--classify-limit", type=int, default=40)
|
||||||
cycle_parser.add_argument("--no-classify", action="store_true", help="Skip the LLM classify step")
|
cycle_parser.add_argument("--no-classify", action="store_true", help="Skip the LLM classify step")
|
||||||
cycle_parser.add_argument("--no-dedup", action="store_true", help="Skip the embedding dedup step")
|
cycle_parser.add_argument("--no-dedup", action="store_true", help="Skip the embedding dedup step")
|
||||||
|
cycle_parser.add_argument("--no-geo", action="store_true", help="Skip tagging article subject-geography")
|
||||||
|
cycle_parser.add_argument("--geo-limit", type=int, default=60, help="Max articles to geo-tag per cycle")
|
||||||
cycle_parser.add_argument("--no-brief", action="store_true", help="Skip rebuilding today's brief")
|
cycle_parser.add_argument("--no-brief", action="store_true", help="Skip rebuilding today's brief")
|
||||||
cycle_parser.add_argument("--no-review", action="store_true", help="Skip recomputing source review flags")
|
cycle_parser.add_argument("--no-review", action="store_true", help="Skip recomputing source review flags")
|
||||||
cycle_parser.add_argument("--no-digest", action="store_true", help="Skip sending due daily digests")
|
cycle_parser.add_argument("--no-digest", action="store_true", help="Skip sending due daily digests")
|
||||||
@@ -147,6 +150,12 @@ def main() -> None:
|
|||||||
)
|
)
|
||||||
enrich_images_parser.add_argument("--limit", type=int, default=50, help="Max articles to fetch this batch")
|
enrich_images_parser.add_argument("--limit", type=int, default=50, help="Max articles to fetch this batch")
|
||||||
|
|
||||||
|
geo_parser = subparsers.add_parser("geo", help="Tag article subject-geography (backfill / manual). Cycle-locked.")
|
||||||
|
geo_parser.add_argument("--limit", type=int, default=200, help="Max articles to tag this batch")
|
||||||
|
geo_parser.add_argument("--reclassify", action="store_true", help="Re-tag even rows already at the current geo version")
|
||||||
|
geo_parser.add_argument("--base-url", help="OpenAI-compatible base URL")
|
||||||
|
geo_parser.add_argument("--model", help="Local model name")
|
||||||
|
|
||||||
dedup_parser = subparsers.add_parser("dedup", help="Cluster near-duplicate stories via local embeddings")
|
dedup_parser = subparsers.add_parser("dedup", help="Cluster near-duplicate stories via local embeddings")
|
||||||
dedup_parser.add_argument("--threshold", type=float, default=DEFAULT_THRESHOLD, help="Cosine similarity cutoff")
|
dedup_parser.add_argument("--threshold", type=float, default=DEFAULT_THRESHOLD, help="Cosine similarity cutoff")
|
||||||
dedup_parser.add_argument("--window-days", type=int, default=DEFAULT_WINDOW_DAYS)
|
dedup_parser.add_argument("--window-days", type=int, default=DEFAULT_WINDOW_DAYS)
|
||||||
@@ -298,6 +307,15 @@ def main() -> None:
|
|||||||
elif args.command == "enrich-images":
|
elif args.command == "enrich-images":
|
||||||
found = enrich_summarized_images(conn, limit=args.limit)
|
found = enrich_summarized_images(conn, limit=args.limit)
|
||||||
print(f"enrich-images: {found} new image(s) for summarized articles")
|
print(f"enrich-images: {found} new image(s) for summarized articles")
|
||||||
|
elif args.command == "geo":
|
||||||
|
init_db(conn)
|
||||||
|
# Cycle-locked so a manual backfill can't contend with the scheduled cycle.
|
||||||
|
with cycle_lock(args.db) as acquired:
|
||||||
|
if not acquired:
|
||||||
|
print("geo: a cycle is already running; try again after it finishes")
|
||||||
|
return
|
||||||
|
g = tag_geo(conn, llm_client_from_args(args), limit=args.limit, reclassify=args.reclassify)
|
||||||
|
print(f"geo: tagged={g['tagged']} errors={g['errors']} (of {g['candidates']} candidates)")
|
||||||
elif args.command == "dedup":
|
elif args.command == "dedup":
|
||||||
init_db(conn)
|
init_db(conn)
|
||||||
if args.force_recluster:
|
if args.force_recluster:
|
||||||
@@ -506,6 +524,16 @@ def _run_cycle_locked(conn: sqlite3.Connection, args: argparse.Namespace) -> Non
|
|||||||
except Exception as exc:
|
except Exception as exc:
|
||||||
print(f"dedup: skipped ({exc})")
|
print(f"dedup: skipped ({exc})")
|
||||||
|
|
||||||
|
# Geo: tag newly-accepted, non-duplicate articles with subject geography (its own
|
||||||
|
# LLM pass, decoupled from scoring). Bounded per cycle; idempotent (skips rows
|
||||||
|
# already at the current GEO_VERSION). Non-fatal like every other step.
|
||||||
|
if not args.no_geo:
|
||||||
|
try:
|
||||||
|
g = tag_geo(conn, llm_client_from_args(args), limit=args.geo_limit)
|
||||||
|
print(f"geo: tagged={g['tagged']} errors={g['errors']} (of {g['candidates']} untagged)")
|
||||||
|
except Exception as exc:
|
||||||
|
print(f"geo: skipped ({exc})")
|
||||||
|
|
||||||
if not args.no_brief:
|
if not args.no_brief:
|
||||||
today = local_today()
|
today = local_today()
|
||||||
try:
|
try:
|
||||||
|
|||||||
@@ -217,6 +217,34 @@ CREATE TABLE IF NOT EXISTS article_summaries (
|
|||||||
created_at TEXT NOT NULL DEFAULT CURRENT_TIMESTAMP
|
created_at TEXT NOT NULL DEFAULT CURRENT_TIMESTAMP
|
||||||
);
|
);
|
||||||
|
|
||||||
|
-- Where a story is ABOUT (subject geography), kept SEPARATE from article_scores so
|
||||||
|
-- durable geography isn't coupled to volatile scoring/acceptance. "local" is never
|
||||||
|
-- stored here — it's relative to the reader; the UI computes "Near you" by comparing
|
||||||
|
-- these places to the visitor's chosen home. geo_version lets us re-backfill cleanly
|
||||||
|
-- when the prompt/taxonomy changes. 'global' is a real category, not a failure.
|
||||||
|
CREATE TABLE IF NOT EXISTS article_geo (
|
||||||
|
article_id INTEGER PRIMARY KEY REFERENCES articles(id) ON DELETE CASCADE,
|
||||||
|
breadth TEXT NOT NULL DEFAULT 'unknown', -- locality|regional|national|multinational|global|unknown
|
||||||
|
confidence TEXT NOT NULL DEFAULT 'low', -- high|medium|low
|
||||||
|
rationale TEXT,
|
||||||
|
geo_version TEXT,
|
||||||
|
updated_at TEXT NOT NULL DEFAULT CURRENT_TIMESTAMP
|
||||||
|
);
|
||||||
|
-- 0..N normalized places per article (a story can span regions). Codes are ISO
|
||||||
|
-- (country = alpha-2, state = US 2-letter / ISO-3166-2 subdivision), normalized in
|
||||||
|
-- code — never trusting the model's free text.
|
||||||
|
CREATE TABLE IF NOT EXISTS article_places (
|
||||||
|
id INTEGER PRIMARY KEY AUTOINCREMENT,
|
||||||
|
article_id INTEGER NOT NULL REFERENCES articles(id) ON DELETE CASCADE,
|
||||||
|
country_code TEXT,
|
||||||
|
state_code TEXT,
|
||||||
|
locality TEXT,
|
||||||
|
ord INTEGER NOT NULL DEFAULT 0
|
||||||
|
);
|
||||||
|
CREATE INDEX IF NOT EXISTS idx_article_places_article ON article_places(article_id);
|
||||||
|
CREATE INDEX IF NOT EXISTS idx_article_places_country ON article_places(country_code);
|
||||||
|
CREATE INDEX IF NOT EXISTS idx_article_geo_breadth ON article_geo(breadth);
|
||||||
|
|
||||||
-- Privacy-respecting, first-party analytics. NO IP / user-agent / referrer / raw
|
-- Privacy-respecting, first-party analytics. NO IP / user-agent / referrer / raw
|
||||||
-- URL. visitor_hash is a hash of a random localStorage token (never email/IP).
|
-- URL. visitor_hash is a hash of a random localStorage token (never email/IP).
|
||||||
-- The UNIQUE key dedups to one row per (kind, article, visitor, day) — that both
|
-- The UNIQUE key dedups to one row per (kind, article, visitor, day) — that both
|
||||||
|
|||||||
+222
@@ -0,0 +1,222 @@
|
|||||||
|
"""Subject-geography for articles ("where is this story ABOUT").
|
||||||
|
|
||||||
|
Kept deliberately separate from scoring (see article_geo / article_places in db.py):
|
||||||
|
geography is durable metadata, scoring is volatile. The LLM proposes place NAMES;
|
||||||
|
this module disposes by normalizing to ISO codes in code, never trusting the model's
|
||||||
|
free text (so "Europe" never gets stored as a country). 'global'/placeless is a real,
|
||||||
|
first-class result, not a failure. "local" is NOT stored — it's relative to the reader;
|
||||||
|
the UI decides "Near you" by comparing these places to the visitor's chosen home.
|
||||||
|
"""
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import json
|
||||||
|
import re
|
||||||
|
import sqlite3
|
||||||
|
|
||||||
|
from .llm import LocalModelClient, parse_classifier_json
|
||||||
|
|
||||||
|
# Bump when the prompt/taxonomy changes, so a re-backfill can target stale rows.
|
||||||
|
GEO_VERSION = "geo-v1"
|
||||||
|
|
||||||
|
BREADTHS = ("locality", "regional", "national", "multinational", "global", "unknown")
|
||||||
|
CONFIDENCES = ("high", "medium", "low")
|
||||||
|
|
||||||
|
# --- normalization data (LLM returns names; we map to ISO, drop the unmappable) ----
|
||||||
|
|
||||||
|
US_STATES = {
|
||||||
|
"alabama": "AL", "alaska": "AK", "arizona": "AZ", "arkansas": "AR", "california": "CA",
|
||||||
|
"colorado": "CO", "connecticut": "CT", "delaware": "DE", "florida": "FL", "georgia": "GA",
|
||||||
|
"hawaii": "HI", "idaho": "ID", "illinois": "IL", "indiana": "IN", "iowa": "IA",
|
||||||
|
"kansas": "KS", "kentucky": "KY", "louisiana": "LA", "maine": "ME", "maryland": "MD",
|
||||||
|
"massachusetts": "MA", "michigan": "MI", "minnesota": "MN", "mississippi": "MS",
|
||||||
|
"missouri": "MO", "montana": "MT", "nebraska": "NE", "nevada": "NV", "new hampshire": "NH",
|
||||||
|
"new jersey": "NJ", "new mexico": "NM", "new york": "NY", "north carolina": "NC",
|
||||||
|
"north dakota": "ND", "ohio": "OH", "oklahoma": "OK", "oregon": "OR", "pennsylvania": "PA",
|
||||||
|
"rhode island": "RI", "south carolina": "SC", "south dakota": "SD", "tennessee": "TN",
|
||||||
|
"texas": "TX", "utah": "UT", "vermont": "VT", "virginia": "VA", "washington": "WA",
|
||||||
|
"west virginia": "WV", "wisconsin": "WI", "wyoming": "WY",
|
||||||
|
"district of columbia": "DC", "washington dc": "DC", "washington d c": "DC",
|
||||||
|
}
|
||||||
|
|
||||||
|
# Common countries + aliases (extensible). Anything not here returns None -> we drop
|
||||||
|
# the country rather than store garbage. breadth still captures national/global, etc.
|
||||||
|
COUNTRY_TO_ISO = {
|
||||||
|
"united states": "US", "united states of america": "US", "usa": "US", "us": "US", "america": "US",
|
||||||
|
"united kingdom": "GB", "uk": "GB", "britain": "GB", "great britain": "GB", "england": "GB",
|
||||||
|
"scotland": "GB", "wales": "GB", "northern ireland": "GB",
|
||||||
|
"canada": "CA", "australia": "AU", "new zealand": "NZ", "ireland": "IE",
|
||||||
|
"france": "FR", "germany": "DE", "spain": "ES", "portugal": "PT", "italy": "IT",
|
||||||
|
"netherlands": "NL", "belgium": "BE", "luxembourg": "LU", "switzerland": "CH", "austria": "AT",
|
||||||
|
"denmark": "DK", "sweden": "SE", "norway": "NO", "finland": "FI", "iceland": "IS",
|
||||||
|
"poland": "PL", "czech republic": "CZ", "czechia": "CZ", "slovakia": "SK", "hungary": "HU",
|
||||||
|
"greece": "GR", "romania": "RO", "bulgaria": "BG", "croatia": "HR", "serbia": "RS",
|
||||||
|
"ukraine": "UA", "russia": "RU", "turkey": "TR", "turkiye": "TR",
|
||||||
|
"china": "CN", "japan": "JP", "south korea": "KR", "korea": "KR", "north korea": "KP",
|
||||||
|
"india": "IN", "pakistan": "PK", "bangladesh": "BD", "sri lanka": "LK", "nepal": "NP",
|
||||||
|
"indonesia": "ID", "malaysia": "MY", "singapore": "SG", "thailand": "TH", "vietnam": "VN",
|
||||||
|
"philippines": "PH", "taiwan": "TW", "hong kong": "HK",
|
||||||
|
"israel": "IL", "palestine": "PS", "saudi arabia": "SA", "united arab emirates": "AE",
|
||||||
|
"uae": "AE", "qatar": "QA", "iran": "IR", "iraq": "IQ", "egypt": "EG", "jordan": "JO",
|
||||||
|
"south africa": "ZA", "nigeria": "NG", "kenya": "KE", "ethiopia": "ET", "ghana": "GH",
|
||||||
|
"tanzania": "TZ", "uganda": "UG", "rwanda": "RW", "morocco": "MA", "tunisia": "TN",
|
||||||
|
"mexico": "MX", "brazil": "BR", "argentina": "AR", "chile": "CL", "colombia": "CO",
|
||||||
|
"peru": "PE", "venezuela": "VE", "ecuador": "EC", "bolivia": "BO", "uruguay": "UY",
|
||||||
|
"costa rica": "CR", "panama": "PA", "guatemala": "GT", "cuba": "CU", "jamaica": "JM",
|
||||||
|
}
|
||||||
|
|
||||||
|
# Words that look like countries but are regions/continents -> never a country_code.
|
||||||
|
_NON_COUNTRY = {"europe", "asia", "africa", "north america", "south america", "latin america",
|
||||||
|
"the americas", "middle east", "scandinavia", "eu", "european union", "world",
|
||||||
|
"global", "international", "earth", "the world"}
|
||||||
|
|
||||||
|
|
||||||
|
def _norm_key(name) -> str:
|
||||||
|
s = re.sub(r"[^a-z0-9 ]", " ", str(name or "").lower())
|
||||||
|
s = re.sub(r"\bthe\b", " ", s)
|
||||||
|
return re.sub(r"\s+", " ", s).strip()
|
||||||
|
|
||||||
|
|
||||||
|
def normalize_country(name) -> str | None:
    """Map a free-text country name to its ISO alpha-2 code, or None.

    None is returned for empty input, for region/continent words listed in
    _NON_COUNTRY, and for any name that has no entry in COUNTRY_TO_ISO.
    """
    lookup = _norm_key(name)
    is_candidate = bool(lookup) and lookup not in _NON_COUNTRY
    return COUNTRY_TO_ISO.get(lookup) if is_candidate else None
|
||||||
|
|
||||||
|
|
||||||
|
def normalize_state(name, country_code) -> str | None:
    """Map a US state name to its 2-letter code; None for any non-US country.

    Only US subdivisions are supported in v1, so the name is looked up in
    US_STATES solely when country_code is exactly "US".
    """
    return US_STATES.get(_norm_key(name)) if country_code == "US" else None
|
||||||
|
|
||||||
|
|
||||||
|
# Strings a model emits when it means "no value". The prompt template itself shows
# "<... or null>", so a quoted "null" is a realistic reply and must not be stored
# as a place name.
_EMPTY_LOCALITY_TOKENS = frozenset({"null", "none", "n/a", "unknown"})


def normalize_places(raw) -> list[dict]:
    """LLM place dicts -> cleaned, deduped [{country_code, state_code, locality}].

    Anything that is not a list yields []. Non-dict items are skipped. Country and
    state go through normalize_country / normalize_state. Locality is kept as
    stripped free text, but only when it is a real string: non-string values and
    placeholder tokens ("null", "none", "n/a", "unknown", any case) count as
    missing. A place with no country, state or locality is dropped. Duplicates
    (same codes and case-insensitive locality) keep their first occurrence.
    """
    cleaned: list[dict] = []
    if not isinstance(raw, list):
        return cleaned
    seen = set()
    for place in raw:
        if not isinstance(place, dict):
            continue
        cc = normalize_country(place.get("country"))
        sc = normalize_state(place.get("state_province"), cc)
        raw_locality = place.get("locality")
        loc = raw_locality.strip() if isinstance(raw_locality, str) else ""
        if loc.lower() in _EMPTY_LOCALITY_TOKENS:
            loc = ""
        loc = loc or None
        if not (cc or sc or loc):
            continue  # entirely empty -> drop
        key = (cc, sc, (loc or "").lower())
        if key in seen:
            continue
        seen.add(key)
        cleaned.append({"country_code": cc, "state_code": sc, "locality": loc})
    return cleaned
|
||||||
|
|
||||||
|
|
||||||
|
# --- LLM extraction (separate pass; does not touch the scoring prompt) ------------
|
||||||
|
|
||||||
|
SYSTEM = (
|
||||||
|
"You tag the real-world geography of a news story for a calm good-news site. "
|
||||||
|
"Identify the place(s) the story is fundamentally ABOUT or where it HAPPENED, "
|
||||||
|
"NOT places mentioned only in passing. Many good-news stories (general science, "
|
||||||
|
"space, broad research, health) have no specific place; those are 'global'. If a "
|
||||||
|
"location is only incidental or genuinely unclear, use 'unknown'. Never guess. "
|
||||||
|
"Reply with ONLY a JSON object, no prose."
|
||||||
|
)
|
||||||
|
|
||||||
|
INSTRUCT = (
|
||||||
|
"Return JSON exactly like:\n"
|
||||||
|
'{"breadth": "<locality|regional|national|multinational|global|unknown>", '
|
||||||
|
'"places": [{"country": "<name or null>", "state_province": "<name or null>", '
|
||||||
|
'"locality": "<city/town or null>"}], "confidence": "<high|medium|low>", '
|
||||||
|
'"rationale": "<one short clause: where it happened and why>"}\n'
|
||||||
|
"breadth: locality=a specific city/town/county; regional=a state/province/region; "
|
||||||
|
"national=about a whole country; multinational=a few specific countries; "
|
||||||
|
"global=worldwide or no specific country; unknown=incidental/unclear. "
|
||||||
|
"places may list more than one when a story genuinely spans regions; use null for parts you can't support.\n"
|
||||||
|
"confidence: use 'high' ONLY when the location is explicitly stated or unmistakable; "
|
||||||
|
"'medium' when reasonably inferred; 'low' when shaky. Do NOT default to high."
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
def _article_text(row) -> str:
|
||||||
|
parts = [f"TITLE: {row['title']}"]
|
||||||
|
for label, key in (("SUMMARY", "summary"), ("WHAT HAPPENED", "what_happened"),
|
||||||
|
("WHY IT MATTERS", "why_matters"), ("PUBLISHER BLURB", "description")):
|
||||||
|
try:
|
||||||
|
v = row[key]
|
||||||
|
except (KeyError, IndexError):
|
||||||
|
v = None
|
||||||
|
if v:
|
||||||
|
parts.append(f"{label}: {v}")
|
||||||
|
return "\n".join(parts)
|
||||||
|
|
||||||
|
|
||||||
|
def _geo_choice(value, allowed, default):
    """Return value lower-cased and stripped if it is one of allowed, else default."""
    if not isinstance(value, str):
        return default
    token = value.strip().lower()
    return token if token in allowed else default


def classify_geo(client: LocalModelClient, row) -> dict:
    """One geo pass over an article row -> normalized result.

    Sends SYSTEM plus the article text and INSTRUCT to client.chat_text and parses
    the reply with parse_classifier_json. Returns a dict with keys breadth,
    confidence, rationale and places:

    - breadth / confidence are matched case- and whitespace-insensitively against
      BREADTHS / CONFIDENCES ("Global" -> "global"); anything else falls back to
      "unknown" / "low".
    - rationale is coerced to str and cut to 300 characters; empty becomes None.
    - places is the output of normalize_places.

    Raises whatever the client or parser raises, and ValueError when the parsed
    reply is not a JSON object.
    """
    messages = [
        {"role": "system", "content": SYSTEM},
        {"role": "user", "content": _article_text(row) + "\n\n" + INSTRUCT},
    ]
    data = parse_classifier_json(client.chat_text(messages))
    if not isinstance(data, dict):
        # Fail with a clear message instead of an AttributeError on .get below.
        raise ValueError("geo classifier reply is not a JSON object")
    return {
        "breadth": _geo_choice(data.get("breadth"), BREADTHS, "unknown"),
        "confidence": _geo_choice(data.get("confidence"), CONFIDENCES, "low"),
        "rationale": (str(data.get("rationale") or "")[:300]) or None,
        "places": normalize_places(data.get("places")),
    }
|
||||||
|
|
||||||
|
|
||||||
|
def store_geo(conn: sqlite3.Connection, article_id: int, result: dict, version: str = GEO_VERSION) -> None:
    """Write one article's geo result: upsert article_geo, then replace its places.

    The article_geo row is inserted or, on conflict, updated in place with the new
    breadth, confidence, rationale, version and timestamp. Existing article_places
    rows for the article are deleted and the places in result are re-inserted in
    list order (ord = list index). Places are replaced rather than merged because
    geo is fully re-derivable. Nothing is committed here; that is left to the caller.
    """
    upsert_geo = (
        "INSERT INTO article_geo (article_id, breadth, confidence, rationale, geo_version, updated_at) "
        "VALUES (?,?,?,?,?, datetime('now')) "
        "ON CONFLICT(article_id) DO UPDATE SET breadth=excluded.breadth, confidence=excluded.confidence, "
        "rationale=excluded.rationale, geo_version=excluded.geo_version, updated_at=excluded.updated_at"
    )
    insert_place = (
        "INSERT INTO article_places (article_id, country_code, state_code, locality, ord) VALUES (?,?,?,?,?)"
    )

    geo_values = (article_id, result["breadth"], result["confidence"], result.get("rationale"), version)
    conn.execute(upsert_geo, geo_values)

    conn.execute("DELETE FROM article_places WHERE article_id=?", (article_id,))
    places = result.get("places") or []
    for position, place in enumerate(places):
        place_values = (
            article_id,
            place.get("country_code"),
            place.get("state_code"),
            place.get("locality"),
            position,
        )
        conn.execute(insert_place, place_values)
|
||||||
|
|
||||||
|
|
||||||
|
def tag_articles(conn: sqlite3.Connection, client: LocalModelClient, limit: int = 200,
                 reclassify: bool = False) -> dict:
    """Tag accepted, non-duplicate articles that lack current geo.

    Candidates are articles with accepted = 1 and no duplicate_of, newest
    discovered first, capped at limit. Unless reclassify is True, rows whose
    article_geo.geo_version already equals GEO_VERSION are skipped, which makes
    repeated runs idempotent. Each candidate is classified with classify_geo and
    written with store_geo.

    A failure on one article is non-fatal: it is counted, logged as a warning with
    the article id and the exception, and the loop moves on. Work is committed
    every 25 processed articles and once more at the end.

    Returns {"candidates": n, "tagged": n, "errors": n}.
    """
    # Function-scope import so this block stands alone without touching the
    # file's top-level imports.
    import logging

    log = logging.getLogger(__name__)

    if reclassify:
        where, params = "1=1", (limit,)
    else:
        # IS NOT (rather than !=) also selects rows whose geo_version is NULL.
        where, params = "(g.article_id IS NULL OR g.geo_version IS NOT ?)", (GEO_VERSION, limit)
    rows = conn.execute(
        f"""SELECT a.id, a.title, a.description,
                   sm.summary, sm.what_happened, sm.why_matters
            FROM articles a
            JOIN article_scores s ON s.article_id = a.id
            LEFT JOIN article_summaries sm ON sm.article_id = a.id
            LEFT JOIN article_geo g ON g.article_id = a.id
            WHERE s.accepted = 1 AND a.duplicate_of IS NULL AND {where}
            ORDER BY a.discovered_at DESC
            LIMIT ?""",
        params,
    ).fetchall()

    tagged = errors = 0
    for r in rows:
        article_id = None  # stays None if even reading the id fails
        try:
            article_id = r["id"]
            store_geo(conn, article_id, classify_geo(client, r))
            tagged += 1
        except Exception as exc:  # noqa: BLE001 — non-fatal, like other cycle steps
            errors += 1
            # Previously swallowed silently, leaving only a bare error count.
            log.warning("geo: article %s not tagged (%s: %s)", article_id, type(exc).__name__, exc)
        if (tagged + errors) % 25 == 0:
            conn.commit()
    conn.commit()
    return {"candidates": len(rows), "tagged": tagged, "errors": errors}
|
||||||
@@ -16,7 +16,8 @@ $ = informational
|
|||||||
- Date showed 6/2/2026 while it was still 6/1/2026 at 10:32pm
|
- Date showed 6/2/2026 while it was still 6/1/2026 at 10:32pm
|
||||||
- For account-based usage, we should have a thumbs up button that shows up to track the articles the user likes the most. We can then curate a special feed of articles that match the categories the user likes the most. Not social-based, just for seeing news that means the most to you.
|
- For account-based usage, we should have a thumbs up button that shows up to track the articles the user likes the most. We can then curate a special feed of articles that match the categories the user likes the most. Not social-based, just for seeing news that means the most to you.
|
||||||
- Feasibility of allowing users to add their own custom feeds for news sources
|
- Feasibility of allowing users to add their own custom feeds for news sources
|
||||||
- Joke corner: a curated, clean, non-offensive daily/rotating joke spot. On-brand "escape the grind" — light, professional-but-fun. Curation bar same as the rest of UB (nothing mean or edgy).
|
- Joke corner: a curated, clean, non-offensive daily/rotating joke spot. On-brand "escape the grind" — light, professional-but-fun. Curation bar same as the rest of UB (nothing mean or edgy). PARTICIPATION LOOP: let people SUBMIT jokes → AI pre-screen (clean/non-insulting/actually-funny, conservative gate) → human batch-approval queue (user is fine doing batches to drive engagement) → approved ones go live. Same "LLM proposes, code disposes" + admin-approval-queue pattern already used for Bloom words, Daily Word pool, and source candidates — known architecture, not net-new. Drivers: submission gives a reason to RETURN ("did mine get approved?"), attribution ("submitted by …") deepens ownership, approved jokes are shareable. Guardrails: jokes are an offense minefield (punching-down/stereotypes) so AI gate stays conservative + human is final say; reuse feedback-form anti-abuse (honeypot + rate-limit) on the submit endpoint.
|
||||||
|
- Bubble shooter / "bubble blaster" game for /play (casual, calm-satisfying arcade — different fun than the word/brain games). Strategic point: own the destination + widen the funnel, NOT literally steal a clone's community. Make it feed the share loop: DAILY SEEDED board + shareable SCORE ("I scored 14,200 🫧") deep-linked like the other games. Scope flag: bigger than the turn-based grid games — it's a real-time CANVAS game (aim, projectile physics, collision, color-cluster pop, cascade/drop, animation loop). Post-launch build, our own art/calm aesthetic (no cloned name/assets).
|
||||||
- Text adventure that SAVES YOUR SPOT in time (resume where you left off — a reason to come back). Start single-player/choose-your-path; dream stretch goal = broaden to co-op/multiplayer where people work through it together. Theme TBD. Fits "UB isn't just news — it's somewhere between professional and fun, a place to escape." (Would live under /play.)
|
- Text adventure that SAVES YOUR SPOT in time (resume where you left off — a reason to come back). Start single-player/choose-your-path; dream stretch goal = broaden to co-op/multiplayer where people work through it together. Theme TBD. Fits "UB isn't just news — it's somewhere between professional and fun, a place to escape." (Would live under /play.)
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
@@ -0,0 +1,208 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
"""PROTOTYPE geo audit (not production).
|
||||||
|
|
||||||
|
Codex/Claude plan: before building any "Closer to Home" UI or touching the
|
||||||
|
production classify schema, measure what subject-geography the LLM can actually
|
||||||
|
extract from recent good-news articles, and whether it understands WHERE A STORY
|
||||||
|
HAPPENED vs. merely spotting place names.
|
||||||
|
|
||||||
|
Key taxonomy decision: "local" is relative to the VIEWER, so we do NOT store it.
|
||||||
|
We store the article's intrinsic geographic BREADTH (locality/regional/national/
|
||||||
|
multinational/global/unknown) plus the actual place(s). The UI later decides
|
||||||
|
"Near you" by comparing those places to the visitor's chosen home.
|
||||||
|
|
||||||
|
This writes results to a scratch JSON file and prints a coverage report. It does
|
||||||
|
not migrate the DB, change the classify pipeline, or backfill anything.
|
||||||
|
|
||||||
|
Run (host can reach the LAN model):
|
||||||
|
.venv/bin/python scripts/geo_audit.py --limit 400 --base-url http://127.0.0.1:8080/v1
|
||||||
|
Resumable: re-running skips article ids already in the out file.
|
||||||
|
"""
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import argparse
|
||||||
|
import json
|
||||||
|
import statistics
|
||||||
|
from collections import Counter
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
from goodnews.cli import _default_db
|
||||||
|
from goodnews.db import connect
|
||||||
|
from goodnews.llm import LocalModelClient, parse_classifier_json
|
||||||
|
|
||||||
|
BREADTHS = {"locality", "regional", "national", "multinational", "global", "unknown"}
|
||||||
|
|
||||||
|
SYSTEM = (
|
||||||
|
"You tag the real-world geography of a news story for a calm good-news site. "
|
||||||
|
"Identify the place(s) the story is fundamentally ABOUT or where it HAPPENED — "
|
||||||
|
"NOT places mentioned only in passing. Many good-news stories (general science, "
|
||||||
|
"space, broad research) have no specific place; those are 'global'. If a location "
|
||||||
|
"is only incidental or genuinely unclear, use 'unknown'. Do not guess. "
|
||||||
|
"Reply with ONLY a JSON object, no prose."
|
||||||
|
)
|
||||||
|
|
||||||
|
INSTRUCT = (
|
||||||
|
"Return JSON exactly like:\n"
|
||||||
|
'{"breadth": "<locality|regional|national|multinational|global|unknown>", '
|
||||||
|
'"places": [{"country": "<name or null>", "state_province": "<name or null>", '
|
||||||
|
'"locality": "<city/town or null>"}], "confidence": "<high|medium|low>", '
|
||||||
|
'"rationale": "<one short clause: where it happened and why>"}\n'
|
||||||
|
"breadth guide: locality=a specific city/town/county; regional=a state/province/region; "
|
||||||
|
"national=about a whole country; multinational=a few specific countries; "
|
||||||
|
"global=worldwide or no specific country; unknown=incidental/unclear. "
|
||||||
|
"places may list more than one when a story genuinely spans regions; use null for parts you can't support."
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
def fetch(conn, limit):
    """Return up to `limit` accepted, non-duplicate articles, newest discovered first.

    Each row carries the article id, title, description, published_at and
    discovered_at, plus summary / what_happened / why_matters from
    article_summaries (NULL when the article has no summary row).
    """
    query = """
        SELECT a.id, a.title, a.description, a.published_at, a.discovered_at,
               sm.summary, sm.what_happened, sm.why_matters
        FROM articles a
        JOIN article_scores s ON s.article_id = a.id
        LEFT JOIN article_summaries sm ON sm.article_id = a.id
        WHERE s.accepted = 1 AND a.duplicate_of IS NULL
        ORDER BY a.discovered_at DESC
        LIMIT ?
        """
    cursor = conn.execute(query, (limit,))
    return cursor.fetchall()
|
||||||
|
|
||||||
|
|
||||||
|
def article_text(r):
    """Build the labelled text block sent to the model for one article row.

    The title line is always emitted; each optional field follows as
    "LABEL: value" only when its value is truthy. Every key must exist on the row.
    """
    optional_sections = (
        ("SUMMARY", "summary"),
        ("WHAT HAPPENED", "what_happened"),
        ("WHY IT MATTERS", "why_matters"),
        ("PUBLISHER BLURB", "description"),
    )
    lines = [f"TITLE: {r['title']}"]
    lines.extend(
        f"{label}: {value}"
        for label, key in optional_sections
        if (value := r[key])
    )
    return "\n".join(lines)
|
||||||
|
|
||||||
|
|
||||||
|
def extract(client, r):
    """Run one geo extraction for article row r and return the raw-but-validated result.

    Sends SYSTEM plus the article text and INSTRUCT to client.chat_text and parses
    the reply with parse_classifier_json (which raises on unparseable output).
    Returns a dict with:

    - breadth: the model's value if it is a string in BREADTHS, else "unknown";
    - places: the model's place dicts as given (non-dict items dropped, no ISO
      normalization), or [] when the field is not a list;
    - confidence: "high" / "medium" / "low", defaulting to "low";
    - rationale: the model's rationale coerced to str and cut to 300 characters.
    """
    messages = [
        {"role": "system", "content": SYSTEM},
        {"role": "user", "content": article_text(r) + "\n\n" + INSTRUCT},
    ]
    raw = client.chat_text(messages)
    data = parse_classifier_json(raw)  # raises on unparseable

    breadth = data.get("breadth")
    # BREADTHS is a set: testing an unhashable value (list/dict) for membership
    # would raise TypeError, so only strings are looked up.
    if not (isinstance(breadth, str) and breadth in BREADTHS):
        breadth = "unknown"

    places = data.get("places")
    places = [p for p in places if isinstance(p, dict)] if isinstance(places, list) else []

    conf = data.get("confidence")
    if not (isinstance(conf, str) and conf in {"high", "medium", "low"}):
        conf = "low"

    return {
        "breadth": breadth,
        "places": places,
        "confidence": conf,
        # str() so a non-string rationale (number, list) cannot break the slice.
        "rationale": str(data.get("rationale") or "")[:300],
    }
|
||||||
|
|
||||||
|
|
||||||
|
def report(rows, results):
    """Print a coverage report for the audit results to stdout.

    rows are the fetched article rows (used only to look up discovered_at by id);
    results maps article id to an extract() result. Prints the breadth
    distribution, the unknown/no-place rate, confidence counts, the 12 most common
    country and state strings exactly as the model wrote them, how many articles
    carry a US state, and the discovered_at date span. With no results only the
    header line is printed.
    """
    rows_by_id = {row["id"]: row for row in rows}
    total = len(results)
    print(f"\n===== GEO AUDIT REPORT (n={total}) =====")
    if total == 0:
        return

    entries = list(results.values())
    breadth_counts = Counter(entry["breadth"] for entry in entries)
    confidence_counts = Counter(entry["confidence"] for entry in entries)

    country_counts = Counter()
    state_counts = Counter()
    no_place = 0
    for entry in entries:
        places = entry["places"]
        if entry["breadth"] == "unknown" or not places:
            no_place += 1
        for place in places:
            if place.get("country"):
                country_counts[str(place["country"]).strip()] += 1
            if place.get("state_province"):
                state_counts[str(place["state_province"]).strip()] += 1

    def share(count):
        # Whole-number percentage of all audited articles.
        return f"{100*count/total:.0f}%"

    print("\nBreadth:")
    for label in ("locality", "regional", "national", "multinational", "global", "unknown"):
        hits = breadth_counts.get(label, 0)
        print(f" {label:<13} {hits:>4} {share(hits)}")
    print(f"\nUnknown/no-place rate: {no_place}/{total} {share(no_place)}")
    print("Confidence:", dict(confidence_counts))

    print("\nTop countries:")
    for name, count in country_counts.most_common(12):
        print(f" {name:<22} {count}")
    print("\nTop states/provinces:")
    for name, count in state_counts.most_common(12):
        print(f" {name:<22} {count}")

    # US-local fuel check: how many map to a US state (the "Near you" payload for Americans)
    us_names = ("United States", "USA", "US")

    def has_us_state(entry):
        return any(
            (place.get("country") or "").strip() in us_names and place.get("state_province")
            for place in entry["places"]
        )

    us_local = sum(1 for entry in entries if has_us_state(entry))
    print(f"\nArticles with a US state attached (US 'Near you' fuel): {us_local} {share(us_local)}")

    # freshness: result keys may be strings (JSON), row ids are ints
    days = []
    for key in results:
        row = rows_by_id.get(int(key))
        if row and row["discovered_at"]:
            days.append(row["discovered_at"][:10])
    if days:
        print(f"\nFreshness: {min(days)} .. {max(days)} ({len(set(days))} distinct days)")
|
||||||
|
|
||||||
|
|
||||||
|
def main():
    """CLI entry point: classify a batch of articles, persist results, report.

    Results are kept in the ``--out`` JSON file keyed by article id, so a
    re-run skips any article that already has an entry there. Progress is
    checkpointed to that file every 25 newly processed articles and once
    more at the end.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--db", default=None)
    parser.add_argument("--limit", type=int, default=400)
    parser.add_argument("--inspect", type=int, default=8, help="print N samples for manual eyeballing")
    parser.add_argument("--out", default="data/geo_audit.json")
    parser.add_argument("--base-url", default=None)
    parser.add_argument("--model", default=None)
    args = parser.parse_args()

    conn = connect(args.db or str(_default_db()))
    client = LocalModelClient.from_env()
    # Command-line flags override whatever the environment configured.
    if args.base_url:
        client.base_url = args.base_url.rstrip("/")
    if args.model:
        client.model = args.model

    out = Path(args.out)
    if out.exists():
        results = json.loads(out.read_text())
    else:
        results = {}

    def checkpoint():
        out.write_text(json.dumps(results, indent=1))

    rows = fetch(conn, args.limit)
    print(f"Fetched {len(rows)} accepted non-dup articles; {len(results)} already done.")

    processed = 0
    for row in rows:
        key = str(row["id"])
        if key in results:
            continue
        try:
            results[key] = extract(client, row)
        except Exception as exc:  # noqa: BLE001 — prototype; record + continue
            results[key] = {
                "breadth": "unknown",
                "places": [],
                "confidence": "low",
                "rationale": f"ERROR: {type(exc).__name__}: {exc}"[:300],
                "error": True,
            }
        processed += 1
        if processed % 25 == 0:
            checkpoint()
            print(f" ...{processed} new, {len(results)} total")
    checkpoint()
    conn.close()

    # Manual-inspection sample: the step Codex flagged as essential — eyeball whether
    # the model captured WHERE IT HAPPENED, not just place-name recognition.
    print(f"\n----- SAMPLE FOR MANUAL INSPECTION (first {args.inspect}) -----")
    printed = 0
    for row in rows:
        key = str(row["id"])
        if key not in results:
            continue
        entry = results[key]
        print(f"\n[{key}] {row['title']}")
        print(f" breadth={entry['breadth']} conf={entry['confidence']} places={entry['places']}")
        print(f" why: {entry['rationale']}")
        printed += 1
        # The limit is checked after printing, so at least one sample is shown.
        if printed >= args.inspect:
            break

    report(rows, results)
|
||||||
|
|
||||||
|
|
||||||
|
# Run the audit only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
|
||||||
@@ -0,0 +1,124 @@
|
|||||||
|
"""Subject-geography: ISO normalization (model proposes names, code disposes to codes),
|
||||||
|
storage into the decoupled article_geo/article_places tables, and idempotent tagging."""
|
||||||
|
import json
|
||||||
|
|
||||||
|
import pytest
|
||||||
|
|
||||||
|
from goodnews import geo
|
||||||
|
from goodnews.db import connect, init_db
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.fixture
def conn():
    """Yield an initialised in-memory database holding one source row (id=1).

    The connection is closed once the test using it has finished.
    """
    db = connect(":memory:")
    init_db(db)
    db.execute("INSERT INTO sources (id,name,feed_url,trust_score) VALUES (1,'S','http://s/f',5)")
    yield db
    db.close()
|
||||||
|
|
||||||
|
|
||||||
|
def _article(c, aid, *, accepted=1, dup=None):
    """Insert article ``aid`` together with its score and summary rows, then commit.

    Args:
        c: database connection (anything with ``execute`` and ``commit``).
        aid: article id; also used to derive the URL, title, hash and summary.
        accepted: value stored in ``article_scores.accepted``.
        dup: when not None, stored as the article's ``duplicate_of``.
    """
    insert_article = (
        "INSERT INTO articles (id,source_id,canonical_url,title,url_hash,discovered_at) "
        "VALUES (?,1,?,?,?,datetime('now'))"
    )
    article_params = (aid, f"http://s/{aid}", f"Title {aid}", f"h{aid}")
    c.execute(insert_article, article_params)

    # Mark as a duplicate only when the caller names the original article.
    if dup is not None:
        c.execute("UPDATE articles SET duplicate_of=? WHERE id=?", (dup, aid))

    c.execute(
        "INSERT INTO article_scores (article_id,accepted,reason_code) VALUES (?,?, 'ok')",
        (aid, accepted),
    )
    c.execute(
        "INSERT INTO article_summaries (article_id,summary) VALUES (?,?)",
        (aid, f"Summary {aid}"),
    )
    c.commit()
|
||||||
|
|
||||||
|
|
||||||
|
class FakeGeo:
    """Stand-in model client whose every reply is the same payload, JSON-encoded."""

    def __init__(self, payload):
        self._payload = payload

    def chat_text(self, messages):
        """Ignore ``messages`` and return the stored payload serialised as JSON."""
        encoded = json.dumps(self._payload)
        return encoded
|
||||||
|
|
||||||
|
|
||||||
|
# --- normalization: names -> ISO codes, garbage dropped --------------------------
|
||||||
|
|
||||||
|
def test_country_normalization_and_aliases():
    """Country names and aliases map to ISO alpha-2 codes; unknown names map to None."""
    expected_codes = [
        ("United States", "US"),
        ("the USA", "US"),
        ("uganda", "UG"),
        ("United Kingdom", "GB"),
    ]
    for name, code in expected_codes:
        assert geo.normalize_country(name) == code
    # unknown -> drop, never guess
    assert geo.normalize_country("Atlantis") is None
|
||||||
|
|
||||||
|
|
||||||
|
def test_region_words_never_become_a_country():
    """Region-level words must normalize to None rather than to a country code."""
    # the exact "Europe as country" leak Codex flagged
    region_words = ("Europe", "Asia", "the Middle East", "European Union", "global")
    leaked = [word for word in region_words if geo.normalize_country(word) is not None]
    assert leaked == []
|
||||||
|
|
||||||
|
|
||||||
|
def test_state_only_for_us():
    """State names resolve to a code only when the country code is US."""
    assert geo.normalize_state("California", "US") == "CA"
    rejected = [
        ("California", "GB"),  # not a US state context
        ("Ontario", "CA"),     # v1 = US subdivisions only
    ]
    for state_name, country_code in rejected:
        assert geo.normalize_state(state_name, country_code) is None
|
||||||
|
|
||||||
|
|
||||||
|
def test_normalize_places_maps_dedupes_and_drops_empty():
    """normalize_places maps names to codes, drops empty entries and removes repeats."""
    galveston = {"country": "United States", "state_province": "Texas", "locality": "Galveston"}
    # region -> no country, keep locality
    brussels = {"country": "Europe", "state_province": None, "locality": "Brussels"}
    # empty -> dropped
    nowhere = {"country": None, "state_province": None, "locality": None}
    # the second Galveston entry is a dup -> dropped
    proposed = [galveston, brussels, nowhere, dict(galveston)]

    normalized = geo.normalize_places(proposed)

    expected = [
        {"country_code": "US", "state_code": "TX", "locality": "Galveston"},
        {"country_code": None, "state_code": None, "locality": "Brussels"},
    ]
    assert normalized == expected
|
||||||
|
|
||||||
|
|
||||||
|
# --- classify_geo: validates breadth/confidence, normalizes places ---------------
|
||||||
|
|
||||||
|
def test_classify_geo_validates_and_normalizes(conn):
    """A well-formed model reply keeps its breadth/confidence and gets ISO-coded places."""
    model_reply = {
        "breadth": "locality",
        "confidence": "high",
        "rationale": "about Galveston",
        "places": [{"country": "USA", "state_province": "Texas", "locality": "Galveston"}],
    }
    # Synthetic row: a title plus NULL text columns, built without touching any table.
    row = conn.execute(
        "SELECT 'x' AS title, NULL AS description, NULL AS summary, "
        "NULL AS what_happened, NULL AS why_matters"
    ).fetchone()

    result = geo.classify_geo(FakeGeo(model_reply), row)

    assert result["breadth"] == "locality"
    assert result["confidence"] == "high"
    assert result["places"] == [{"country_code": "US", "state_code": "TX", "locality": "Galveston"}]
|
||||||
|
|
||||||
|
|
||||||
|
def test_classify_geo_falls_back_on_bad_enum(conn):
    """Out-of-vocabulary breadth/confidence and a non-list places value fall back to defaults."""
    bad_reply = {"breadth": "planetary", "confidence": "absolute", "places": "nope"}
    row = conn.execute(
        "SELECT 'x' AS title, NULL AS description, NULL AS summary, "
        "NULL AS what_happened, NULL AS why_matters"
    ).fetchone()

    result = geo.classify_geo(FakeGeo(bad_reply), row)

    assert result["breadth"] == "unknown"
    assert result["confidence"] == "low"
    assert result["places"] == []
|
||||||
|
|
||||||
|
|
||||||
|
# --- storage: decoupled tables, places replaced on re-store ----------------------
|
||||||
|
|
||||||
|
def test_store_geo_writes_both_tables_and_replaces_places(conn):
    """store_geo writes article_geo and article_places; a second store replaces the places."""
    _article(conn, 1)

    national = {
        "breadth": "national",
        "confidence": "medium",
        "rationale": "US story",
        "places": [{"country_code": "US", "state_code": None, "locality": None}],
    }
    geo.store_geo(conn, 1, national)

    stored = conn.execute(
        "SELECT breadth, confidence, geo_version FROM article_geo WHERE article_id=1"
    ).fetchone()
    assert stored["breadth"] == "national"
    assert stored["confidence"] == "medium"
    assert stored["geo_version"] == geo.GEO_VERSION
    place_count = conn.execute("SELECT COUNT(*) FROM article_places WHERE article_id=1").fetchone()[0]
    assert place_count == 1

    # re-store with different places REPLACES (geo is re-derivable, unlike scores)
    local = {
        "breadth": "locality",
        "confidence": "high",
        "rationale": "city",
        "places": [{"country_code": "US", "state_code": "CA", "locality": "Oakland"}],
    }
    geo.store_geo(conn, 1, local)

    places = conn.execute(
        "SELECT country_code, state_code, locality FROM article_places WHERE article_id=1"
    ).fetchall()
    assert len(places) == 1
    assert places[0]["state_code"] == "CA"
    assert places[0]["locality"] == "Oakland"
|
||||||
|
|
||||||
|
|
||||||
|
# --- tag_articles: only accepted non-dup, idempotent by version ------------------
|
||||||
|
|
||||||
|
def test_tag_articles_targets_eligible_and_is_idempotent(conn):
    """tag_articles tags only accepted non-duplicates and skips already-tagged ones."""
    _article(conn, 1)              # eligible
    _article(conn, 2, accepted=0)  # rejected -> skip
    _article(conn, 3, dup=1)       # duplicate -> skip
    client = FakeGeo({"breadth": "global", "confidence": "high", "rationale": "science", "places": []})

    first_run = geo.tag_articles(conn, client, limit=50)
    # only article 1
    assert first_run["candidates"] == 1
    assert first_run["tagged"] == 1
    stored = conn.execute("SELECT breadth FROM article_geo WHERE article_id=1").fetchone()
    assert stored["breadth"] == "global"

    # second run: already at current version -> nothing to do
    second_run = geo.tag_articles(conn, client, limit=50)
    assert second_run["candidates"] == 0

    # reclassify forces a re-tag
    forced_run = geo.tag_articles(conn, client, limit=50, reclassify=True)
    assert forced_run["tagged"] == 1
|
||||||
Reference in New Issue
Block a user