Files
upbeatBytes/goodnews/markup.py
T
thejayman77 ba92c0a04b Reply sanitizer: cap raw input, auto-close open tags (no severed HTML)
Per Codex: slicing the SANITIZED html with [:8000] could cut through a tag or
entity. Cap the RAW editor HTML (20k) before sanitizing instead, and have
sanitize_reply_html auto-close any still-open allowed tags so malformed input
can never leave a dangling/severed tag in message_html or the email body.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
2026-06-09 09:22:41 -04:00

144 lines
4.8 KiB
Python

"""Server-side sanitizing for admin feedback replies.
The reply composer is a small admin-only WYSIWYG (contenteditable + execCommand).
Browsers emit inconsistent HTML, so the server is the adult in the room: it
takes whatever the editor produced and rebuilds it from a strict allowlist —
strong, em, p, br, ul, ol, li, and span ONLY with a whitelisted font-size.
Everything else (links, images, scripts, arbitrary styles, colors, fonts) is
dropped; disallowed-tag *content* is kept as escaped text, except script/style
whose content is discarded entirely. No raw HTML is ever trusted downstream —
we store and send only this sanitized output.
"""
from __future__ import annotations
import html as _html
import re
from html.parser import HTMLParser
# Canonicalise the tags we keep (b→strong, i→em, div→p); anything not here is dropped.
_TAG_MAP = {
    "b": "strong", "strong": "strong",
    "i": "em", "em": "em",
    "p": "p", "div": "p",
    "br": "br",
    "ul": "ul", "ol": "ol", "li": "li",
    "span": "span",
}
_ALLOWED_SIZES = {"13px", "15px", "18px", "22px"}
_DROP_CONTENT = {"script", "style"}
# HTML void elements never receive an end tag, so they must never occupy a slot
# on the open-element stack (otherwise every later close pops the wrong entry).
_VOID_TAGS = frozenset({
    "area", "base", "br", "col", "embed", "hr", "img", "input",
    "link", "meta", "param", "source", "track", "wbr",
})
_FONT_SIZE_RE = re.compile(r"font-size:\s*(\d+px)", re.I)
# execCommand fontSize emits <font size="1..7">; map the common ones to our scale.
_FONT_TAG_SIZE = {"1": "13px", "2": "13px", "3": "15px", "4": "18px", "5": "22px", "6": "22px", "7": "22px"}


class _Sanitizer(HTMLParser):
    """Streaming allowlist filter that rebuilds editor HTML from scratch.

    Nothing from the input is copied through verbatim: allowed tags are
    re-emitted in canonical form without attributes (except a whitelisted
    font-size on span), text is re-escaped, and everything else is dropped.

    Attributes:
        out: Output fragments; join them to get the sanitized HTML.
        open: Stack of currently open non-void source elements as
            ``(source_tag, emitted_tag)`` pairs. ``emitted_tag`` is None when
            the wrapper itself was dropped (its children are still kept).
        skip: Depth inside script/style, whose content is discarded.
    """

    def __init__(self) -> None:
        super().__init__(convert_charrefs=True)
        self.out: list[str] = []
        self.open: list[tuple[str, str | None]] = []
        self.skip = 0

    def handle_starttag(self, tag, attrs):
        """Emit the canonical form of an allowed start tag and track it."""
        name = tag.lower()
        if name in _DROP_CONTENT:
            self.skip += 1
            self.open.append((name, None))
            return
        if self.skip:
            return
        if name in _VOID_TAGS:
            # Void elements have no end tag: emit <br>, drop the rest, and
            # never push a stack entry for either.
            if name == "br":
                self.out.append("<br>")
            return
        canon = None
        if name in ("span", "font"):
            # <font size> is normalised to a safe span; a span/font without a
            # whitelisted size loses its wrapper but keeps its text.
            size = self._size(name, attrs)
            if size:
                self.out.append(f'<span style="font-size:{size}">')
                canon = "span"
        else:
            canon = _TAG_MAP.get(name)
            if canon:
                self.out.append(f"<{canon}>")
        self.open.append((name, canon))

    def handle_startendtag(self, tag, attrs):
        """Handle XHTML-style self-closing tags; only ``<br/>`` produces output."""
        if tag.lower() == "br" and not self.skip:
            self.out.append("<br>")

    def handle_endtag(self, tag):
        """Close the matching open element; stray end tags are ignored."""
        name = tag.lower()
        if name in _VOID_TAGS:
            return  # e.g. a stray </br>
        if name in _DROP_CONTENT:
            # Only a close that really ends a skipped region may touch the
            # stack; a stray </script> must not discard an open allowed tag.
            if self.skip:
                self.skip -= 1
                self._close_through(name)
            return
        if self.skip:
            return
        self._close_through(name)

    def handle_data(self, data):
        """Re-escape text content (charrefs were already decoded by the parser)."""
        if not self.skip:
            self.out.append(_html.escape(data, quote=False))

    def _close_through(self, name):
        """Pop the stack down to and including the nearest open ``name``.

        Elements left open inside it are closed first so the output stays
        well nested. If ``name`` is not open at all, nothing happens.
        """
        depth = len(self.open) - 1
        while depth >= 0 and self.open[depth][0] != name:
            depth -= 1
        if depth < 0:
            return
        while len(self.open) > depth:
            _, canon = self.open.pop()
            if canon:
                self.out.append(f"</{canon}>")

    def _size(self, tag, attrs):
        """Return the whitelisted font size requested by ``attrs``, or None.

        A ``style`` attribute counts only if its font-size is in the allowlist
        (compared case-insensitively, returned in lower case). For ``font``
        tags the legacy ``size`` attribute is mapped onto the same scale.
        """
        for k, v in attrs:
            if not v:
                continue
            if k.lower() == "style":
                m = _FONT_SIZE_RE.search(v)
                if m and m.group(1).lower() in _ALLOWED_SIZES:
                    return m.group(1).lower()
            elif tag == "font" and k.lower() == "size":
                return _FONT_TAG_SIZE.get(v.strip())
        return None


def sanitize_reply_html(raw: str) -> str:
    """Rebuild editor HTML from the strict allowlist.

    Args:
        raw: HTML as produced by the reply editor (untrusted).

    Returns:
        Sanitized HTML with surrounding whitespace stripped, or ``""`` when
        the input is empty or nothing but markup/whitespace survives.
    """
    if not raw:
        return ""
    parser = _Sanitizer()
    parser.feed(raw)
    parser.close()
    # Close any still-open allowed tags (malformed input → never emit a
    # dangling or severed tag into stored HTML / the email body).
    for _, canon in reversed(parser.open):
        if canon:
            parser.out.append(f"</{canon}>")
    cleaned = "".join(parser.out)
    # If nothing but markup/whitespace survived, treat as empty.
    if not re.sub(r"<[^>]+>", "", cleaned).strip():
        return ""
    return cleaned.strip()
def reply_html_to_text(clean_html: str) -> str:
    """Derive the plain-text fallback from already-sanitized reply HTML.

    List items become ``- `` bullets on their own lines, paragraphs are
    separated by a blank line, ``<br>`` becomes a newline, all remaining tags
    are removed, and entities are decoded last so escaped text is never
    mistaken for markup.

    Args:
        clean_html: HTML in the sanitizer's output form (attribute-free tags
            apart from the font-size span).

    Returns:
        The plain-text rendering with surrounding whitespace stripped, or
        ``""`` for empty input.
    """
    if not clean_html:
        return ""
    substitutions = (
        # A paragraph opening a list item must not split the bullet from its text.
        (r"<li>\s*<p>", "<li>"),
        (r"</li>", "\n"),
        (r"<li>", "- "),
        # Start lists on a fresh line so preceding text does not run into the
        # first bullet.
        (r"<[ou]l>", "\n"),
        # Break on BOTH edges of a paragraph: the editor often leaves the first
        # line as bare text followed directly by <p>…</p>.
        (r"</?p>", "\n\n"),
        (r"<br\s*/?>", "\n"),
        (r"<[^>]+>", ""),
    )
    text = clean_html
    for pattern, replacement in substitutions:
        text = re.sub(pattern, replacement, text)
    text = _html.unescape(text)
    return re.sub(r"\n{3,}", "\n\n", text).strip()