Reply sanitizer: cap raw input, auto-close open tags (no severed HTML)

Per Codex: slicing the SANITIZED HTML with [:8000] could cut through a tag or
entity. Instead, cap the RAW editor HTML at 20,000 characters before sanitizing,
and have sanitize_reply_html auto-close any still-open allowed tags, so that
malformed input can never leave a dangling/severed tag in message_html or the
email body.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
This commit is contained in:
jay
2026-06-09 09:22:41 -04:00
parent a5cea7cd74
commit ba92c0a04b
3 changed files with 18 additions and 2 deletions
+3 -2
View File
@@ -843,8 +843,9 @@ def create_app() -> FastAPI:
@app.post("/api/admin/feedback/{fid}/reply")
def admin_feedback_reply(fid: int, body: FeedbackReplyBody, request: Request) -> dict:
# Sanitize the editor HTML to our allowlist; derive the plain-text fallback.
reply_html = sanitize_reply_html(body.html)[:8000]
# Cap the RAW editor HTML first (slicing sanitized output could sever a
# tag), then sanitize the whole thing.
reply_html = sanitize_reply_html((body.html or "")[:20000])
reply_text = reply_html_to_text(reply_html)
if not reply_text:
raise HTTPException(status_code=422, detail="Reply message is required.")
+5
View File
@@ -118,6 +118,11 @@ def sanitize_reply_html(raw: str) -> str:
p = _Sanitizer()
p.feed(raw)
p.close()
# Close any still-open allowed tags (malformed input → never emit a dangling
# or severed tag into stored HTML / the email body).
for canon in reversed(p.open):
if canon:
p.out.append(f"</{canon}>")
html = "".join(p.out)
# If nothing but markup/whitespace survived, treat as empty.
if not re.sub(r"<[^>]+>", "", html).strip():
+10
View File
@@ -41,3 +41,13 @@ def test_html_to_text():
assert t2("<p>hi <strong>there</strong></p>") == "hi there"
assert t2("<ul><li>a</li><li>b</li></ul>") == "- a\n- b"
assert t2('<span style="font-size:18px">big</span>') == "big"
def test_autocloses_unclosed_tags():
    """Check that `s` appends closing tags for any tags left open in its input."""
    # Each pair is (raw input, expected output). Cases run in order and the
    # first mismatch fails the test.
    cases = [
        ("<strong>bold", "<strong>bold</strong>"),
        ("<p>hi", "<p>hi</p>"),
        # unclosed <ul> gets closed
        ("<ul><li>a</li>", "<ul><li>a</li></ul>"),
        (
            '<span style="font-size:18px">big',
            '<span style="font-size:18px">big</span>',
        ),
        # pathological input still never leaves a dangling open tag: nested
        # open tags are closed innermost-first, giving balanced output
        ("<strong><em>x", "<strong><em>x</em></strong>"),
    ]
    for raw, expected in cases:
        assert s(raw) == expected