Files
upbeatBytes/tests/test_events.py
T
thejayman77 ee43bb0df6 analytics: filter known-bot User-Agents at /api/events (honest visitor counts)
Many modern crawlers (AI scrapers, headless Chrome, link-preview fetchers) run JS and
fire the visit/summary_viewed beacon, inflating "visitors" even though there's no
human discovery channel. Apply queries.is_bot_ua() at /api/events — the same filter
the load-error beacon uses — so honest bot UAs (GPTBot, AhrefsBot, headless Chrome,
python/curl, …) are dropped before recording. Response is identical so a bot can't
detect it. Counts read lower but truer going forward (past rows unchanged). Won't catch
UA-spoofing bots; that needs a heavier heuristic. Tests: bot UAs dropped, real browser
counted; existing event tests send a real UA (default client UA contains "python").

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
2026-06-30 11:19:51 -04:00

106 lines
4.9 KiB
Python

import pytest
from fastapi.testclient import TestClient
# The events beacon now drops known-bot User-Agents (queries.is_bot_ua), and the test
# client's default UA contains "python" → would be filtered. Send a real browser UA so
# these record like a genuine visitor; the bot case is covered explicitly below.
_BROWSER = {"user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
"(KHTML, like Gecko) Chrome/120.0 Safari/537.36"}
@pytest.fixture
def app_db(tmp_path, monkeypatch):
    """Provide a fresh application backed by an empty, initialised database.

    Points ``GOODNEWS_DB`` at a per-test file under ``tmp_path`` and sets
    ``GOODNEWS_SESSION_SECRET``, reloads ``goodnews.api``, runs ``init_db`` on the
    new database file, and builds the app.

    Returns:
        A ``(app, db_path)`` tuple: the result of ``api.create_app()`` and the path
        of the database file, so tests can inspect stored rows directly.
    """
    import importlib
    from contextlib import closing

    db = tmp_path / "t.sqlite3"
    monkeypatch.setenv("GOODNEWS_DB", str(db))
    monkeypatch.setenv("GOODNEWS_SESSION_SECRET", "test-secret")

    # The environment is patched before the (re)import on purpose.
    # NOTE(review): presumably goodnews.api reads these variables at import time,
    # hence the reload — confirm against the module.
    import goodnews.api as api
    importlib.reload(api)

    from goodnews.db import connect, init_db

    # A single connection suffices: the former extra connect()/close() pair did
    # nothing the connection below does not also do. closing() releases the
    # handle even when init_db raises.
    with closing(connect(str(db))) as conn:
        init_db(conn)
    return api.create_app(), db
def _count(db, **where):
    """Count rows in the ``events`` table, optionally filtered by column equality.

    Args:
        db: Path of the database file to open.
        **where: ``column=value`` pairs, AND-ed together. Column names are
            interpolated into the SQL text (they come from test code only);
            values are bound as ``?`` parameters. With no pairs, every row counts.

    Returns:
        The ``COUNT(*)`` result for the matching rows.
    """
    from contextlib import closing
    from goodnews.db import connect

    clause = " AND ".join(f"{k}=?" for k in where)
    sql = "SELECT COUNT(*) FROM events" + (f" WHERE {clause}" if where else "")
    # closing() guarantees the connection is released even if the query raises
    # (e.g. the table is missing), instead of leaking the handle.
    with closing(connect(str(db))) as conn:
        return conn.execute(sql, tuple(where.values())).fetchone()[0]
def test_event_recorded_and_deduped(app_db):
    """Identical repeated events collapse to one row; a new visitor adds a second."""
    app, db = app_db
    client = TestClient(app, headers=_BROWSER)
    payload = {"kind": "open", "article_id": 5, "visitor": "tok"}

    # Posting the same (kind, article, visitor, day) three times stores one row.
    for _attempt in range(3):
        reply = client.post("/api/events", json=payload)
        assert reply.json() == {"ok": True}
    assert _count(db, kind="open", article_id=5) == 1

    # Changing only the visitor token produces a distinct row.
    client.post("/api/events", json={**payload, "visitor": "other"})
    assert _count(db, kind="open", article_id=5) == 2
def test_visitor_token_is_hashed_not_stored_raw(app_db):
    """The stored ``visitor_hash`` is a 64-character digest, never the raw token."""
    from contextlib import closing

    app, db = app_db
    TestClient(app, headers=_BROWSER).post("/api/events", json={"kind": "visit", "visitor": "secret-token"})

    from goodnews.db import connect

    # closing() releases the connection even if the query itself fails.
    with closing(connect(str(db))) as conn:
        row = conn.execute("SELECT visitor_hash FROM events").fetchone()

    # Without this check an unrecorded event surfaces as an opaque
    # "'NoneType' object is not subscriptable" instead of a readable failure.
    assert row is not None, "visit event was not recorded"
    vh = row[0]
    assert vh and vh != "secret-token" and len(vh) == 64  # sha256 hex
def test_unknown_kind_is_ignored(app_db):
    """An unrecognised kind still gets the normal OK reply but stores no row."""
    app, db = app_db
    client = TestClient(app, headers=_BROWSER)
    reply = client.post("/api/events", json={"kind": "evil", "visitor": "x"})
    assert reply.json() == {"ok": True}
    assert _count(db) == 0
def test_bot_user_agents_are_not_counted(app_db):
    """Requests carrying a known-bot User-Agent are answered but never recorded.

    Crawlers that execute JavaScript fire the beacon as well. The reply must stay
    ``{"ok": True}`` so a bot cannot tell its event was dropped, while a real
    browser hitting the same endpoint is counted.
    """
    app, db = app_db
    bot_agents = (
        "Mozilla/5.0 (compatible; GPTBot/1.0; +https://openai.com/gptbot)",
        "Mozilla/5.0 (compatible; AhrefsBot/7.0; +http://ahrefs.com/robot/)",
        "Mozilla/5.0 (X11; Linux x86_64) HeadlessChrome/120.0 Safari/537.36",
    )
    bot_visit = {"kind": "visit", "visitor": "b"}

    for agent in bot_agents:
        crawler = TestClient(app, headers={"user-agent": agent})
        reply = crawler.post("/api/events", json=bot_visit)
        assert reply.json() == {"ok": True}
    assert _count(db, kind="visit") == 0  # none of the bot hits was stored

    # The same beacon from a real browser UA does produce a row.
    human = TestClient(app, headers=_BROWSER)
    human.post("/api/events", json={"kind": "visit", "visitor": "human"})
    assert _count(db, kind="visit") == 1
def test_game_event_kinds_are_allowed(app_db):
    """Each per-game funnel kind is stored once; an invented game kind is not."""
    app, db = app_db
    client = TestClient(app, headers=_BROWSER)

    # Funnel kinds, including the share-loop arrivals, must pass the allowlist.
    funnel_kinds = ("word_started", "word_completed", "word_shared", "word_arrival", "match_arrival")
    for kind in funnel_kinds:
        reply = client.post("/api/events", json={"kind": kind, "article_id": 0, "visitor": "t"})
        assert reply.json() == {"ok": True}
        assert _count(db, kind=kind) == 1

    # A made-up game kind leaves no row behind.
    client.post("/api/events", json={"kind": "chess_started", "visitor": "t"})
    assert _count(db, kind="chess_started") == 0
def test_admin_stats_games_funnel_aggregates(app_db):
    """``queries.admin_stats`` rolls game events up per game and in totals.

    Scenario: two visitors arrive at Daily Word via a shared link, one of them
    starts and shares, and a Match is completed.
    """
    from contextlib import closing

    app, db = app_db
    tc = TestClient(app, headers=_BROWSER)

    for v in ("a", "b"):
        tc.post("/api/events", json={"kind": "word_arrival", "article_id": 0, "visitor": v})
    tc.post("/api/events", json={"kind": "word_started", "article_id": 0, "visitor": "a"})
    tc.post("/api/events", json={"kind": "word_shared", "article_id": 0, "visitor": "a"})
    tc.post("/api/events", json={"kind": "match_completed", "article_id": 0, "visitor": "a"})

    from goodnews.db import connect
    from goodnews import queries

    # closing() releases the connection even if admin_stats raises or the
    # "games" key is missing, so a failing run does not leak the handle.
    with closing(connect(str(db))) as conn:
        games = queries.admin_stats(conn, days=30)["games"]

    assert games["by_game"]["word"] == {"arrival": 2, "started": 1, "completed": 0, "shared": 1}
    assert games["by_game"]["match"]["completed"] == 1
    assert games["totals"]["arrival"] == 2 and games["totals"]["shared"] == 1