ba92c0a04b
Per Codex: slicing the SANITIZED html with [:8000] could cut through a tag or entity. Cap the RAW editor HTML (20k) before sanitizing instead, and have sanitize_reply_html auto-close any still-open allowed tags so malformed input can never leave a dangling/severed tag in message_html or the email body. Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
144 lines
4.8 KiB
Python
144 lines
4.8 KiB
Python
"""Server-side sanitizing for admin feedback replies.

The reply composer is a small admin-only WYSIWYG (contenteditable + execCommand).
Browsers emit inconsistent HTML, so the server is the adult in the room: it
takes whatever the editor produced and rebuilds it from a strict allowlist —

    strong, em, p, br, ul, ol, li, and span ONLY with a whitelisted font-size.

Everything else (links, images, scripts, arbitrary styles, colors, fonts) is
dropped; disallowed-tag *content* is kept as escaped text, except script/style
whose content is discarded entirely. No raw HTML is ever trusted downstream —
we store and send only this sanitized output.
"""
|
|
|
|
from __future__ import annotations
|
|
|
|
import html as _html
|
|
import re
|
|
from html.parser import HTMLParser
|
|
|
|
# Canonicalise the tags we keep (b→strong, i→em, div→p); anything not here is dropped.
_TAG_MAP = {
    "b": "strong", "strong": "strong",
    "i": "em", "em": "em",
    "p": "p", "div": "p",
    "br": "br",
    "ul": "ul", "ol": "ol", "li": "li",
    "span": "span",
}
# The only font sizes a <span> is allowed to carry.
_ALLOWED_SIZES = {"13px", "15px", "18px", "22px"}
# Elements whose *content* is discarded, not just the tag itself.
_DROP_CONTENT = {"script", "style"}
_FONT_SIZE_RE = re.compile(r"font-size:\s*(\d+px)", re.I)
# execCommand fontSize emits <font size="1..7">; map the common ones to our scale.
_FONT_TAG_SIZE = {
    "1": "13px", "2": "13px", "3": "15px", "4": "18px",
    "5": "22px", "6": "22px", "7": "22px",
}


class _Sanitizer(HTMLParser):
    """Rebuild fed HTML from the allowlist, collecting the result in ``out``.

    Attributes:
        out: Output fragments (canonical tags and escaped text), in order.
        open: One entry per element that is currently open in the input:
            the canonical tag that was emitted for it, or ``None`` when the
            element itself was dropped and only its children are kept.
        skip: Nesting depth inside script/style; while non-zero, everything
            is discarded.
    """

    def __init__(self) -> None:
        super().__init__(convert_charrefs=True)
        self.out: list[str] = []
        self.open: list[str | None] = []  # emitted canonical tag, or None for a dropped wrapper
        # Source tag name for each entry of ``open`` (kept in lockstep), so an
        # end tag only ever closes the element it actually names.
        self._names: list[str] = []
        self.skip = 0  # depth inside script/style (content discarded)

    def handle_starttag(self, tag, attrs):
        """Emit the canonical form of an allowed start tag and record it as open."""
        t = tag.lower()
        if t in _DROP_CONTENT:
            # Tracked only through the counter: nothing is emitted for these,
            # so they need no entry on the open-element stack.
            self.skip += 1
            return
        if self.skip:
            return
        if t == "br":
            self.out.append("<br>")
            return  # void — no stack entry
        if t in ("span", "font"):
            # Both collapse to a span, and only when a whitelisted size is
            # present; otherwise the wrapper is dropped and its text kept.
            size = self._size(t, attrs)
            canon = "span" if size else None
            if size:
                self.out.append(f'<span style="font-size:{size}">')
        else:
            canon = _TAG_MAP.get(t)  # None → disallowed tag: drop tag, keep children
            if canon:
                self.out.append(f"<{canon}>")
        self._names.append(t)
        self.open.append(canon)

    def handle_startendtag(self, tag, attrs):
        """Handle ``<x/>``: only a self-closed ``<br/>`` produces output."""
        if tag.lower() == "br" and not self.skip:
            self.out.append("<br>")

    def handle_endtag(self, tag):
        """Close the nearest open element named ``tag``, plus anything nested in it.

        End tags that match no open element are ignored. Popping blindly
        here would let a stray ``</script>`` or ``</i>`` discard or close an
        unrelated element and leave the output with a dangling or misnested
        tag.
        """
        t = tag.lower()
        if t in _DROP_CONTENT:
            if self.skip:
                self.skip -= 1
            return
        if self.skip or t == "br":
            return
        if t not in self._names:
            return  # stray end tag — nothing of that name is open
        # Unwind to the matching element. Entries above it are elements that
        # were never closed (e.g. <img>, or a truncated inline tag); close
        # the ones we emitted so the output stays well-formed.
        while self._names:
            name = self._names.pop()
            canon = self.open.pop()
            if canon:
                self.out.append(f"</{canon}>")
            if name == t:
                break

    def handle_data(self, data):
        """Emit text as escaped HTML unless it sits inside script/style."""
        if not self.skip:
            self.out.append(_html.escape(data, quote=False))

    def _size(self, tag, attrs):
        """Return the whitelisted font size requested by ``attrs``, or ``None``.

        Looks at a ``style`` attribute's ``font-size`` declaration and, for
        ``<font>``, the legacy ``size`` attribute. The first attribute that
        yields an allowed size wins.
        """
        for k, v in attrs:
            if not v:
                continue
            key = k.lower()
            size = None
            if key == "style":
                m = _FONT_SIZE_RE.search(v)
                if m:
                    # The regex is case-insensitive, so normalise before the
                    # whitelist lookup ("18PX" is the same size as "18px").
                    candidate = m.group(1).lower()
                    if candidate in _ALLOWED_SIZES:
                        size = candidate
            elif tag == "font" and key == "size":
                size = _FONT_TAG_SIZE.get(v.strip())
            if size:
                return size
        return None


def sanitize_reply_html(raw: str) -> str:
    """Rebuild editor HTML from the strict allowlist (or '' if it has no content).

    Args:
        raw: Untrusted HTML as produced by the reply editor.

    Returns:
        Sanitized HTML containing only canonical allowlisted tags and escaped
        text, with every emitted tag closed; ``""`` when the input is empty
        or nothing but markup/whitespace survives.
    """
    if not raw:
        return ""
    p = _Sanitizer()
    p.feed(raw)
    p.close()
    # Close any still-open allowed tags (malformed input → never emit a dangling
    # or severed tag into stored HTML / the email body).
    for canon in reversed(p.open):
        if canon:
            p.out.append(f"</{canon}>")
    html = "".join(p.out)
    # If nothing but markup/whitespace survived, treat as empty.
    if not re.sub(r"<[^>]+>", "", html).strip():
        return ""
    return html.strip()
|
|
|
|
|
|
def reply_html_to_text(clean_html: str) -> str:
    """Derive the plain-text fallback from already-sanitized reply HTML.

    List items become "- " bullets on their own lines, paragraph ends become
    blank lines, ``<br>`` becomes a newline, all remaining tags are removed,
    entities are decoded, and runs of three or more newlines collapse to two.

    Args:
        clean_html: HTML that has already been through the sanitizer.

    Returns:
        The stripped plain-text rendering, or ``""`` for empty input.
    """
    if not clean_html:
        return ""
    # Applied strictly in this order: structural tags are turned into
    # whitespace first, then whatever markup is left is removed.
    rewrites = (
        (r"</li>", "\n"),
        (r"<li>", "- "),
        (r"</p>", "\n\n"),
        (r"<br\s*/?>", "\n"),
        (r"<[^>]+>", ""),
    )
    text = clean_html
    for pattern, replacement in rewrites:
        text = re.sub(pattern, replacement, text)
    # Decode entities only after the tags are gone, so escaped "<" in the
    # message text is never mistaken for markup.
    text = _html.unescape(text)
    collapsed = re.sub(r"\n{3,}", "\n\n", text)
    return collapsed.strip()
|