diff --git a/data/img_cache/13bc90ce0d95af422fa129823766aad5621f502f.webp b/data/img_cache/13bc90ce0d95af422fa129823766aad5621f502f.webp new file mode 100644 index 0000000..1ea05c9 Binary files /dev/null and b/data/img_cache/13bc90ce0d95af422fa129823766aad5621f502f.webp differ diff --git a/data/img_cache/13f15407c6a4e508aebadad75ba9b1479be092c8.webp b/data/img_cache/13f15407c6a4e508aebadad75ba9b1479be092c8.webp new file mode 100644 index 0000000..57b54d4 Binary files /dev/null and b/data/img_cache/13f15407c6a4e508aebadad75ba9b1479be092c8.webp differ diff --git a/data/img_cache/57f7e8ab32447b098e33d3f0b425c85b1dd08b19.webp b/data/img_cache/57f7e8ab32447b098e33d3f0b425c85b1dd08b19.webp new file mode 100644 index 0000000..7ceb39f Binary files /dev/null and b/data/img_cache/57f7e8ab32447b098e33d3f0b425c85b1dd08b19.webp differ diff --git a/data/img_cache/605016d188825b397bae21cbd8562174bdce7f5d.webp b/data/img_cache/605016d188825b397bae21cbd8562174bdce7f5d.webp new file mode 100644 index 0000000..170cf98 Binary files /dev/null and b/data/img_cache/605016d188825b397bae21cbd8562174bdce7f5d.webp differ diff --git a/data/img_cache/7227748af1997c6c91b263289d291cb1f7b4b8a5.webp b/data/img_cache/7227748af1997c6c91b263289d291cb1f7b4b8a5.webp new file mode 100644 index 0000000..92fdeaf Binary files /dev/null and b/data/img_cache/7227748af1997c6c91b263289d291cb1f7b4b8a5.webp differ diff --git a/data/img_cache/79f3d3e07f102cf16bf15aee2f96a680fe33a20d.webp b/data/img_cache/79f3d3e07f102cf16bf15aee2f96a680fe33a20d.webp new file mode 100644 index 0000000..5488272 Binary files /dev/null and b/data/img_cache/79f3d3e07f102cf16bf15aee2f96a680fe33a20d.webp differ diff --git a/data/img_cache/9c77d9a96d84b8673a53562ff30bb769dfed5cbd.webp b/data/img_cache/9c77d9a96d84b8673a53562ff30bb769dfed5cbd.webp new file mode 100644 index 0000000..2906b34 Binary files /dev/null and b/data/img_cache/9c77d9a96d84b8673a53562ff30bb769dfed5cbd.webp differ diff --git a/data/img_cache/a9d2a33bb3ff008d3cb38592576c6987ec3b2467.webp b/data/img_cache/a9d2a33bb3ff008d3cb38592576c6987ec3b2467.webp new file mode 100644 index 0000000..361c563 Binary files /dev/null and b/data/img_cache/a9d2a33bb3ff008d3cb38592576c6987ec3b2467.webp differ diff --git a/data/img_cache/ba2ad7e9d1f929b3173a1bce3626c5027bb33ce8.webp b/data/img_cache/ba2ad7e9d1f929b3173a1bce3626c5027bb33ce8.webp new file mode 100644 index 0000000..d5c1731 Binary files /dev/null and b/data/img_cache/ba2ad7e9d1f929b3173a1bce3626c5027bb33ce8.webp differ diff --git a/data/img_cache/e41490017ad2fd7f566930a456c1d9311359ac71.webp b/data/img_cache/e41490017ad2fd7f566930a456c1d9311359ac71.webp new file mode 100644 index 0000000..a73c2bc Binary files /dev/null and b/data/img_cache/e41490017ad2fd7f566930a456c1d9311359ac71.webp differ diff --git a/data/img_cache/edc0b99254d0dda68dfdd12ea8686a5951588e29.webp b/data/img_cache/edc0b99254d0dda68dfdd12ea8686a5951588e29.webp new file mode 100644 index 0000000..fe47034 Binary files /dev/null and b/data/img_cache/edc0b99254d0dda68dfdd12ea8686a5951588e29.webp differ diff --git a/data/img_cache/f72fd31b95f68ad7ec17825f1315268815a81f0f.webp b/data/img_cache/f72fd31b95f68ad7ec17825f1315268815a81f0f.webp new file mode 100644 index 0000000..2f2cbcb Binary files /dev/null and b/data/img_cache/f72fd31b95f68ad7ec17825f1315268815a81f0f.webp differ diff --git a/goodnews/api.py b/goodnews/api.py index 3cbf5e4..0507628 100644 --- a/goodnews/api.py +++ b/goodnews/api.py @@ -1160,8 +1160,13 @@ def create_app() -> FastAPI: # --- Privacy-respecting first-party analytics ------------------------- @app.post("/api/events") - def record_event(body: EventBody) -> dict: - if body.kind in _EVENT_KINDS: + def record_event(body: EventBody, request: Request) -> dict: + # Don't let crawlers inflate visitor/funnel counts. Many modern bots run JS and + # DO fire this beacon, so filter by User-Agent (same check the load-error beacon + # uses) — catches honest bot UAs (GPTBot, AhrefsBot, headless Chrome, …). The + # response is identical either way, so a bot can't tell it was dropped. + ua = request.headers.get("user-agent", "") + if body.kind in _EVENT_KINDS and not queries.is_bot_ua(ua): with get_conn() as conn: conn.execute( "INSERT OR IGNORE INTO events (kind, article_id, visitor_hash, day) " diff --git a/tests/test_events.py b/tests/test_events.py index 91571e5..d422527 100644 --- a/tests/test_events.py +++ b/tests/test_events.py @@ -1,6 +1,12 @@ import pytest from fastapi.testclient import TestClient +# The events beacon now drops known-bot User-Agents (queries.is_bot_ua), and the test +# client's default UA contains "python" → would be filtered. Send a real browser UA so +# these record like a genuine visitor; the bot case is covered explicitly below. +_BROWSER = {"user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 " + "(KHTML, like Gecko) Chrome/120.0 Safari/537.36"} + @pytest.fixture def app_db(tmp_path, monkeypatch): @@ -28,7 +34,7 @@ def _count(db, **where): def test_event_recorded_and_deduped(app_db): app, db = app_db - tc = TestClient(app) + tc = TestClient(app, headers=_BROWSER) for _ in range(3): # same (kind, article, visitor, day) → one row assert tc.post("/api/events", json={"kind": "open", "article_id": 5, "visitor": "tok"}).json() == {"ok": True} assert _count(db, kind="open", article_id=5) == 1 @@ -39,7 +45,7 @@ def test_event_recorded_and_deduped(app_db): def test_visitor_token_is_hashed_not_stored_raw(app_db): app, db = app_db - TestClient(app).post("/api/events", json={"kind": "visit", "visitor": "secret-token"}) + TestClient(app, headers=_BROWSER).post("/api/events", json={"kind": "visit", "visitor": "secret-token"}) from goodnews.db import connect c = connect(str(db)) vh = c.execute("SELECT visitor_hash FROM events").fetchone()[0] @@ -49,13 +55,28 @@ def test_visitor_token_is_hashed_not_stored_raw(app_db): def test_unknown_kind_is_ignored(app_db): app, db = app_db - assert TestClient(app).post("/api/events", json={"kind": "evil", "visitor": "x"}).json() == {"ok": True} + assert TestClient(app, headers=_BROWSER).post("/api/events", json={"kind": "evil", "visitor": "x"}).json() == {"ok": True} assert _count(db) == 0 +def test_bot_user_agents_are_not_counted(app_db): + """JS-capable crawlers fire this beacon too; honest bot UAs must not inflate counts. + Response stays {ok:true} so a bot can't tell it was dropped.""" + app, db = app_db + for bot_ua in ("Mozilla/5.0 (compatible; GPTBot/1.0; +https://openai.com/gptbot)", + "Mozilla/5.0 (compatible; AhrefsBot/7.0; +http://ahrefs.com/robot/)", + "Mozilla/5.0 (X11; Linux x86_64) HeadlessChrome/120.0 Safari/537.36"): + tc = TestClient(app, headers={"user-agent": bot_ua}) + assert tc.post("/api/events", json={"kind": "visit", "visitor": "b"}).json() == {"ok": True} + assert _count(db, kind="visit") == 0 # none recorded + # a real browser on the same beacon IS counted + TestClient(app, headers=_BROWSER).post("/api/events", json={"kind": "visit", "visitor": "human"}) + assert _count(db, kind="visit") == 1 + + def test_game_event_kinds_are_allowed(app_db): app, db = app_db - tc = TestClient(app) + tc = TestClient(app, headers=_BROWSER) # the per-game funnel kinds (incl. the share-loop arrival) pass the allowlist for kind in ("word_started", "word_completed", "word_shared", "word_arrival", "match_arrival"): assert tc.post("/api/events", json={"kind": kind, "article_id": 0, "visitor": "t"}).json() == {"ok": True} @@ -67,7 +88,7 @@ def test_game_event_kinds_are_allowed(app_db): def test_admin_stats_games_funnel_aggregates(app_db): app, db = app_db - tc = TestClient(app) + tc = TestClient(app, headers=_BROWSER) # two visitors arrive at Daily Word via a shared link; one engages + shares; a Match completes for v in ("a", "b"): tc.post("/api/events", json={"kind": "word_arrival", "article_id": 0, "visitor": v})