"""Server-side sanitizing for admin feedback replies. The reply composer is a small admin-only WYSIWYG (contenteditable + execCommand). Browsers emit inconsistent HTML, so the server is the adult in the room: it takes whatever the editor produced and rebuilds it from a strict allowlist — strong, em, p, br, ul, ol, li, and span ONLY with a whitelisted font-size. Everything else (links, images, scripts, arbitrary styles, colors, fonts) is dropped; disallowed-tag *content* is kept as escaped text, except script/style whose content is discarded entirely. No raw HTML is ever trusted downstream — we store and send only this sanitized output. """ from __future__ import annotations import html as _html import re from html.parser import HTMLParser # Canonicalise the tags we keep (b→strong, i→em, div→p); anything not here is dropped. _TAG_MAP = { "b": "strong", "strong": "strong", "i": "em", "em": "em", "p": "p", "div": "p", "br": "br", "ul": "ul", "ol": "ol", "li": "li", "span": "span", } _ALLOWED_SIZES = {"13px", "15px", "18px", "22px"} _DROP_CONTENT = {"script", "style"} _FONT_SIZE_RE = re.compile(r"font-size:\s*(\d+px)", re.I) # execCommand fontSize emits ; map the common ones to our scale. _FONT_TAG_SIZE = {"1": "13px", "2": "13px", "3": "15px", "4": "18px", "5": "22px", "6": "22px", "7": "22px"} class _Sanitizer(HTMLParser): def __init__(self) -> None: super().__init__(convert_charrefs=True) self.out: list[str] = [] self.open: list[str | None] = [] # emitted canonical tag, or None for a dropped wrapper self.skip = 0 # depth inside script/style (content discarded) def handle_starttag(self, tag, attrs): t = tag.lower() if t in _DROP_CONTENT: self.skip += 1 self.open.append(None) return if self.skip: return if t == "br": self.out.append("
") return # void — no stack entry size = self._size(t, attrs) if t == "span": if size: self.out.append(f'') self.open.append("span") else: self.open.append(None) # unstyled/!whitelisted span → drop wrapper, keep text return if t == "font": if size: self.out.append(f'') self.open.append("span") # normalise → safe span else: self.open.append(None) return canon = _TAG_MAP.get(t) if canon: self.out.append(f"<{canon}>") self.open.append(canon) else: self.open.append(None) # disallowed tag — drop tag, keep children def handle_startendtag(self, tag, attrs): if tag.lower() == "br" and not self.skip: self.out.append("
") def handle_endtag(self, tag): t = tag.lower() if t in _DROP_CONTENT: if self.skip: self.skip -= 1 if self.open: self.open.pop() return if self.skip or t == "br": return if not self.open: return canon = self.open.pop() if canon: self.out.append(f"") def handle_data(self, data): if not self.skip: self.out.append(_html.escape(data, quote=False)) def _size(self, tag, attrs): for k, v in attrs: if not v: continue if k.lower() == "style": m = _FONT_SIZE_RE.search(v) if m and m.group(1) in _ALLOWED_SIZES: return m.group(1) elif tag == "font" and k.lower() == "size": return _FONT_TAG_SIZE.get(v.strip()) return None def sanitize_reply_html(raw: str) -> str: """Rebuild editor HTML from the strict allowlist (or '' if it has no content).""" if not raw: return "" p = _Sanitizer() p.feed(raw) p.close() # Close any still-open allowed tags (malformed input → never emit a dangling # or severed tag into stored HTML / the email body). for canon in reversed(p.open): if canon: p.out.append(f"") html = "".join(p.out) # If nothing but markup/whitespace survived, treat as empty. if not re.sub(r"<[^>]+>", "", html).strip(): return "" return html.strip() def reply_html_to_text(clean_html: str) -> str: """Plain-text fallback derived from the already-sanitized HTML.""" if not clean_html: return "" s = re.sub(r"", "\n", clean_html) s = re.sub(r"
  • ", "- ", s) s = re.sub(r"

    ", "\n\n", s) s = re.sub(r"", "\n", s) s = re.sub(r"<[^>]+>", "", s) s = _html.unescape(s) return re.sub(r"\n{3,}", "\n\n", s).strip()