Reliability: slow ≠ failed — SW nav timeout, slow-boot telemetry, de-bot stats

Root cause of the intermittent white screen: the shell HTML is no-cache
(cf-cache-status: DYNAMIC), so every page-open does a synchronous round-trip
to the residential origin before any pixel renders — and the SW's network-first
navigation only fell back to the cached shell on REJECTION, never on slowness.
A stalled fetch meant staring at white with a perfectly good shell in cache.
The boot seatbelt couldn't see it either: it lives inside the HTML that hadn't
arrived yet, so slow boots left no telemetry.

- service-worker: race navigation fetch vs 2.5s grace timer. Network wins →
  fresh HTML as before; timer/non-OK response (5xx, but also any other
  non-2xx)/failure → cached shell instantly, while a successful network
  response still refreshes the cache in the background. Safe due to the 14-day
  immutable-chunk grace window. Caps the white screen at ~2.5s for repeat
  visitors on any network.
- app.html: beacon `boot-slow: Nms (html Nms) on 4g` when mount takes >4s —
  the "white screen, then it loaded" glitches finally leave a trace, with
  HTML-arrival timing to separate slow-origin from slow-JS.
- admin: bot UAs (HeadlessChrome/bot/spider/crawl/…) excluded from the
  headline "Load errors today" count — throttled crawlers trip the 10s boot
  check routinely (the one recorded error was HeadlessChrome on X11, not a
  phone). Bots stay visible in the list, tagged + dimmed.

Tests: telemetry test extended for bot flag + filtered counts. 223 pytest +
11 vitest green.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
This commit is contained in:
jay
2026-06-11 19:23:33 -04:00
parent 90da4be083
commit 628cc5722c
6 changed files with 84 additions and 17 deletions
+15
View File
@@ -61,6 +61,21 @@
var el = document.getElementById('boot-fallback');
if (el && el.parentNode) el.parentNode.removeChild(el);
try { sessionStorage.removeItem('ub_reloaded'); } catch (e) {}
// Slow-but-successful boots (the "white screen, then it loaded" glitch)
// would otherwise leave no trace — beacon the timing so they're visible.
// performance.now() counts from navigation start, so a slow-arriving
// HTML document is included, not just slow JS.
try {
var ms = Math.round(performance.now());
if (ms > 4000) {
var nav = performance.getEntriesByType && performance.getEntriesByType('navigation')[0];
var detail = 'boot-slow: ' + ms + 'ms';
if (nav && nav.responseStart) detail += ' (html ' + Math.round(nav.responseStart) + 'ms)';
if (navigator.connection && navigator.connection.effectiveType)
detail += ' on ' + navigator.connection.effectiveType;
report(detail);
}
} catch (e) { /* timing is best-effort */ }
};
addEventListener('vite:preloadError', function (e) {
report('preloadError: ' + ((e && e.payload && e.payload.message) || ''));
+7 -2
View File
@@ -458,9 +458,9 @@
<h2>Recent load errors <span class="count">(last {clientErrors.length})</span></h2>
<ul class="cerrs">
{#each clientErrors as e (e.created_at + e.reason)}
<li>
<li class:bot={e.bot}>
<span class="ce-when">{fdate(e.created_at)}</span>
<span class="ce-reason">{e.reason || '—'}</span>
<span class="ce-reason">{e.reason || '—'}{#if e.bot}<span class="ce-bot">bot</span>{/if}</span>
<span class="ce-path">{e.path || '/'}</span>
<span class="ce-ua">{e.user_agent}</span>
</li>
@@ -1198,6 +1198,11 @@
font-size: 0.82rem; padding: 8px 12px; background: var(--surface); border: 1px solid var(--line); border-radius: 8px; }
/* Cells of one load-error row: timestamp, reason, path, user agent. */
.ce-when {
  color: var(--muted);
  white-space: nowrap;
}
.ce-reason {
  font-family: var(--label);
  color: #9a3b3b;
}

/* Rows carrying the `bot` class are dimmed and their reason loses the error red. */
.cerrs li.bot {
  opacity: 0.6;
}
.cerrs li.bot .ce-reason {
  color: var(--muted);
}

/* Small uppercase pill rendered after the reason text on bot rows. */
.ce-bot {
  display: inline-block;
  margin-left: 8px;
  padding: 1px 8px;
  border-radius: 999px;
  background: var(--accent-soft);
  color: var(--accent-deep);
  font-size: 0.68rem;
  font-weight: 600;
  text-transform: uppercase;
  letter-spacing: 0.05em;
}

.ce-path {
  color: var(--accent-deep);
  white-space: nowrap;
}

/* User agent spans the full row width and is truncated to one line with an ellipsis. */
.ce-ua {
  grid-column: 1 / -1;
  color: var(--muted);
  font-size: 0.72rem;
  overflow: hidden;
  text-overflow: ellipsis;
  white-space: nowrap;
}
+28 -11
View File
@@ -8,6 +8,10 @@
import { version } from '$service-worker';
// Cache bucket name; it embeds the build `version` imported from
// '$service-worker', so the name changes whenever that version string changes.
const CACHE = `upbeat-${version}`;
// How long (in milliseconds) a navigation may wait on the network before the
// cached shell is served instead. Long enough for a healthy fetch, short enough
// that a stalled cellular/origin hop never reads as a broken site.
const NAV_TIMEOUT_MS = 2500;
// Paths the FastAPI server owns — the SW must NOT intercept or cache these.
function isServerPath(p) {
@@ -38,19 +42,32 @@ self.addEventListener('fetch', (event) => {
if (url.origin !== location.origin) return;
if (isServerPath(url.pathname)) return; // let the network/server handle these
// Navigations: network-first; keep the freshest real HTML shell as the offline
// fallback; on a failed fetch, serve that cached shell (never blank).
// Navigations: network-first, but a SLOW network must not mean a white screen —
// "slow" and "failed" both fall back to the cached shell. We race the fetch
// against a short grace timer: network wins → freshest HTML as usual; timer
// wins (or 5xx/failure) → serve the cached shell instantly while the network
// response still lands in the cache for next time. A slightly stale shell is
// safe: deploys keep old immutable chunks for a 14-day grace window.
if (request.mode === 'navigate') {
event.respondWith(
fetch(request)
.then((res) => {
if (res && res.ok && (res.headers.get('content-type') || '').includes('text/html')) {
const copy = res.clone();
caches.open(CACHE).then((c) => c.put('/', copy)).catch(() => {});
}
return res;
})
.catch(() => caches.match('/'))
(async () => {
const cache = await caches.open(CACHE);
const cached = await cache.match('/');
const network = fetch(request)
.then((res) => {
if (res && res.ok && (res.headers.get('content-type') || '').includes('text/html')) {
cache.put('/', res.clone()).catch(() => {});
}
return res;
})
.catch(() => null);
if (!cached) return (await network) || Response.error(); // first visit: network only
const winner = await Promise.race([
network,
new Promise((resolve) => setTimeout(() => resolve('slow'), NAV_TIMEOUT_MS)),
]);
return winner && winner !== 'slow' && winner.ok ? winner : cached;
})()
);
return;
}
+3 -1
View File
@@ -961,7 +961,9 @@ def create_app() -> FastAPI:
rows = conn.execute(
"SELECT reason, path, user_agent, created_at FROM client_errors ORDER BY id DESC LIMIT 20"
).fetchall()
return [dict(r) for r in rows]
# Bots stay visible in the list (tagged) but are excluded from the
# headline counts — see queries.admin_stats.
return [{**dict(r), "bot": queries.is_bot_ua(r["user_agent"])} for r in rows]
@app.post("/api/feedback")
def submit_feedback(body: FeedbackBody, request: Request, background_tasks: BackgroundTasks) -> dict:
+22 -3
View File
@@ -13,6 +13,18 @@ from datetime import UTC, datetime, timedelta
from .feeds import MAX_BACKOFF_MINUTES
from .paywall import is_paywalled
# UA substrings that mark automated clients. Crawlers run JS on a throttled
# budget and trip the boot-failure beacon routinely — without this filter they
# read as real users seeing blank screens.
BOT_UA_MARKS = ("headlesschrome", "bot", "spider", "crawl", "python", "curl", "wget", "phantomjs")
_NOT_BOT_SQL = " AND ".join(f"instr(lower(user_agent), '{m}')=0" for m in BOT_UA_MARKS)
def is_bot_ua(ua: str | None) -> bool:
low = (ua or "").lower()
return any(m in low for m in BOT_UA_MARKS)
# Composite ranking used everywhere a "best first" order is needed. Kept as one
# expression so brief, category feeds, and the API all rank identically.
RANK_SCORE_SQL = (
@@ -565,10 +577,17 @@ def admin_stats(conn: sqlite3.Connection, days: int = 30) -> dict:
"top_topics": top_topics,
"shares": shares,
"daily": daily,
# Boot-failure seatbelt signal — blank-screen risk surfacing.
# Boot-failure seatbelt signal — blank-screen risk surfacing. Bots are
# excluded from the headline counts: throttled crawlers fail the boot
# check routinely and would read as real users seeing blank screens.
"client_errors": {
"today": scalar("SELECT COUNT(*) FROM client_errors WHERE date(created_at)=date('now')"),
"window": scalar("SELECT COUNT(*) FROM client_errors WHERE created_at>=date('now',?)", (since,)),
"today": scalar(
f"SELECT COUNT(*) FROM client_errors WHERE date(created_at)=date('now') AND {_NOT_BOT_SQL}"
),
"window": scalar(
f"SELECT COUNT(*) FROM client_errors WHERE created_at>=date('now',?) AND {_NOT_BOT_SQL}",
(since,),
),
},
}
+9
View File
@@ -418,7 +418,16 @@ def test_client_error_telemetry(tmp_path, monkeypatch):
rows = tc.get("/api/admin/client-errors").json()
assert len(rows) == 1 and rows[0]["reason"] == "boot-timeout" and rows[0]["path"] == "/play"
assert rows[0]["user_agent"] # captured from the request header
assert rows[0]["bot"] is False
assert tc.get("/api/admin/stats").json()["client_errors"]["today"] == 1
# A throttled crawler tripping the beacon must NOT inflate the headline count,
# but stays visible (tagged) in the list.
anon.post("/api/client-error", json={"reason": "boot-timeout", "path": "/"},
headers={"user-agent": "Mozilla/5.0 (X11; Linux x86_64) HeadlessChrome/138.0 Safari/537.36"})
rows = tc.get("/api/admin/client-errors").json()
assert len(rows) == 2 and rows[0]["bot"] is True
stats = tc.get("/api/admin/stats").json()["client_errors"]
assert stats["today"] == 1 and stats["window"] == 1 # bot excluded from both
def test_wordsearch_theme_admin(tmp_path, monkeypatch):