analytics: filter known-bot User-Agents at /api/events (honest visitor counts)
Many modern crawlers (AI scrapers, headless Chrome, link-preview fetchers) run JS and fire the visit/summary_viewed beacon, inflating "visitors" even though there's no human discovery channel. Apply queries.is_bot_ua() at /api/events — the same filter the load-error beacon uses — so honest bot UAs (GPTBot, AhrefsBot, headless Chrome, python/curl, …) are dropped before recording. The response is identical whether or not the event is recorded, so a bot can't detect that it was dropped. Counts read lower but truer going forward (past rows unchanged). Won't catch UA-spoofing bots; that needs a heavier heuristic. Tests: bot UAs are dropped and a real browser is counted; existing event tests now send a real browser UA, because the test client's default UA contains "python" and would otherwise be filtered. Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
|
After Width: | Height: | Size: 16 KiB |
|
After Width: | Height: | Size: 52 KiB |
|
After Width: | Height: | Size: 41 KiB |
|
After Width: | Height: | Size: 27 KiB |
|
After Width: | Height: | Size: 56 KiB |
|
After Width: | Height: | Size: 88 KiB |
|
After Width: | Height: | Size: 91 KiB |
|
After Width: | Height: | Size: 74 KiB |
|
After Width: | Height: | Size: 66 KiB |
|
After Width: | Height: | Size: 39 KiB |
|
After Width: | Height: | Size: 29 KiB |
|
After Width: | Height: | Size: 51 KiB |
@@ -1160,8 +1160,13 @@ def create_app() -> FastAPI:
|
|||||||
# --- Privacy-respecting first-party analytics -------------------------
|
# --- Privacy-respecting first-party analytics -------------------------
|
||||||
|
|
||||||
@app.post("/api/events")
|
@app.post("/api/events")
|
||||||
def record_event(body: EventBody) -> dict:
|
def record_event(body: EventBody, request: Request) -> dict:
|
||||||
if body.kind in _EVENT_KINDS:
|
# Don't let crawlers inflate visitor/funnel counts. Many modern bots run JS and
|
||||||
|
# DO fire this beacon, so filter by User-Agent (same check the load-error beacon
|
||||||
|
# uses) — catches honest bot UAs (GPTBot, AhrefsBot, headless Chrome, …). The
|
||||||
|
# response is identical either way, so a bot can't tell it was dropped.
|
||||||
|
ua = request.headers.get("user-agent", "")
|
||||||
|
if body.kind in _EVENT_KINDS and not queries.is_bot_ua(ua):
|
||||||
with get_conn() as conn:
|
with get_conn() as conn:
|
||||||
conn.execute(
|
conn.execute(
|
||||||
"INSERT OR IGNORE INTO events (kind, article_id, visitor_hash, day) "
|
"INSERT OR IGNORE INTO events (kind, article_id, visitor_hash, day) "
|
||||||
|
|||||||
@@ -1,6 +1,12 @@
|
|||||||
import pytest
|
import pytest
|
||||||
from fastapi.testclient import TestClient
|
from fastapi.testclient import TestClient
|
||||||
|
|
||||||
|
# The events beacon now drops known-bot User-Agents (queries.is_bot_ua), and the test
|
||||||
|
# client's default UA contains "python" → would be filtered. Send a real browser UA so
|
||||||
|
# these record like a genuine visitor; the bot case is covered explicitly below.
|
||||||
|
_BROWSER = {"user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
|
||||||
|
"(KHTML, like Gecko) Chrome/120.0 Safari/537.36"}
|
||||||
|
|
||||||
|
|
||||||
@pytest.fixture
|
@pytest.fixture
|
||||||
def app_db(tmp_path, monkeypatch):
|
def app_db(tmp_path, monkeypatch):
|
||||||
@@ -28,7 +34,7 @@ def _count(db, **where):
|
|||||||
|
|
||||||
def test_event_recorded_and_deduped(app_db):
|
def test_event_recorded_and_deduped(app_db):
|
||||||
app, db = app_db
|
app, db = app_db
|
||||||
tc = TestClient(app)
|
tc = TestClient(app, headers=_BROWSER)
|
||||||
for _ in range(3): # same (kind, article, visitor, day) → one row
|
for _ in range(3): # same (kind, article, visitor, day) → one row
|
||||||
assert tc.post("/api/events", json={"kind": "open", "article_id": 5, "visitor": "tok"}).json() == {"ok": True}
|
assert tc.post("/api/events", json={"kind": "open", "article_id": 5, "visitor": "tok"}).json() == {"ok": True}
|
||||||
assert _count(db, kind="open", article_id=5) == 1
|
assert _count(db, kind="open", article_id=5) == 1
|
||||||
@@ -39,7 +45,7 @@ def test_event_recorded_and_deduped(app_db):
|
|||||||
|
|
||||||
def test_visitor_token_is_hashed_not_stored_raw(app_db):
|
def test_visitor_token_is_hashed_not_stored_raw(app_db):
|
||||||
app, db = app_db
|
app, db = app_db
|
||||||
TestClient(app).post("/api/events", json={"kind": "visit", "visitor": "secret-token"})
|
TestClient(app, headers=_BROWSER).post("/api/events", json={"kind": "visit", "visitor": "secret-token"})
|
||||||
from goodnews.db import connect
|
from goodnews.db import connect
|
||||||
c = connect(str(db))
|
c = connect(str(db))
|
||||||
vh = c.execute("SELECT visitor_hash FROM events").fetchone()[0]
|
vh = c.execute("SELECT visitor_hash FROM events").fetchone()[0]
|
||||||
@@ -49,13 +55,28 @@ def test_visitor_token_is_hashed_not_stored_raw(app_db):
|
|||||||
|
|
||||||
def test_unknown_kind_is_ignored(app_db):
|
def test_unknown_kind_is_ignored(app_db):
|
||||||
app, db = app_db
|
app, db = app_db
|
||||||
assert TestClient(app).post("/api/events", json={"kind": "evil", "visitor": "x"}).json() == {"ok": True}
|
assert TestClient(app, headers=_BROWSER).post("/api/events", json={"kind": "evil", "visitor": "x"}).json() == {"ok": True}
|
||||||
assert _count(db) == 0
|
assert _count(db) == 0
|
||||||
|
|
||||||
|
|
||||||
|
def test_bot_user_agents_are_not_counted(app_db):
|
||||||
|
"""JS-capable crawlers fire this beacon too; honest bot UAs must not inflate counts.
|
||||||
|
Response stays {ok:true} so a bot can't tell it was dropped."""
|
||||||
|
app, db = app_db
|
||||||
|
for bot_ua in ("Mozilla/5.0 (compatible; GPTBot/1.0; +https://openai.com/gptbot)",
|
||||||
|
"Mozilla/5.0 (compatible; AhrefsBot/7.0; +http://ahrefs.com/robot/)",
|
||||||
|
"Mozilla/5.0 (X11; Linux x86_64) HeadlessChrome/120.0 Safari/537.36"):
|
||||||
|
tc = TestClient(app, headers={"user-agent": bot_ua})
|
||||||
|
assert tc.post("/api/events", json={"kind": "visit", "visitor": "b"}).json() == {"ok": True}
|
||||||
|
assert _count(db, kind="visit") == 0 # none recorded
|
||||||
|
# a real browser on the same beacon IS counted
|
||||||
|
TestClient(app, headers=_BROWSER).post("/api/events", json={"kind": "visit", "visitor": "human"})
|
||||||
|
assert _count(db, kind="visit") == 1
|
||||||
|
|
||||||
|
|
||||||
def test_game_event_kinds_are_allowed(app_db):
|
def test_game_event_kinds_are_allowed(app_db):
|
||||||
app, db = app_db
|
app, db = app_db
|
||||||
tc = TestClient(app)
|
tc = TestClient(app, headers=_BROWSER)
|
||||||
# the per-game funnel kinds (incl. the share-loop arrival) pass the allowlist
|
# the per-game funnel kinds (incl. the share-loop arrival) pass the allowlist
|
||||||
for kind in ("word_started", "word_completed", "word_shared", "word_arrival", "match_arrival"):
|
for kind in ("word_started", "word_completed", "word_shared", "word_arrival", "match_arrival"):
|
||||||
assert tc.post("/api/events", json={"kind": kind, "article_id": 0, "visitor": "t"}).json() == {"ok": True}
|
assert tc.post("/api/events", json={"kind": kind, "article_id": 0, "visitor": "t"}).json() == {"ok": True}
|
||||||
@@ -67,7 +88,7 @@ def test_game_event_kinds_are_allowed(app_db):
|
|||||||
|
|
||||||
def test_admin_stats_games_funnel_aggregates(app_db):
|
def test_admin_stats_games_funnel_aggregates(app_db):
|
||||||
app, db = app_db
|
app, db = app_db
|
||||||
tc = TestClient(app)
|
tc = TestClient(app, headers=_BROWSER)
|
||||||
# two visitors arrive at Daily Word via a shared link; one engages + shares; a Match completes
|
# two visitors arrive at Daily Word via a shared link; one engages + shares; a Match completes
|
||||||
for v in ("a", "b"):
|
for v in ("a", "b"):
|
||||||
tc.post("/api/events", json={"kind": "word_arrival", "article_id": 0, "visitor": v})
|
tc.post("/api/events", json={"kind": "word_arrival", "article_id": 0, "visitor": v})
|
||||||
|
|||||||