# upbeatBytes/goodnews/markup.py

"""Server-side sanitizing for admin feedback replies.

The reply composer is a small admin-only WYSIWYG (contenteditable + execCommand).
Browsers emit inconsistent HTML, so the server is the adult in the room: it
takes whatever the editor produced and rebuilds it from a strict allowlist —

  strong, em, p, br, ul, ol, li, and span ONLY with a whitelisted font-size.

Everything else (links, images, scripts, arbitrary styles, colors, fonts) is
dropped; disallowed-tag *content* is kept as escaped text, except script/style
whose content is discarded entirely. No raw HTML is ever trusted downstream —
we store and send only this sanitized output.
"""

from __future__ import annotations

import html as _html
import re
from html.parser import HTMLParser

# Source tag → canonical tag we emit (b→strong, i→em, div→p). A tag that is
# missing from this table is never written to the output.
_TAG_MAP = {
    "b": "strong",
    "strong": "strong",
    "i": "em",
    "em": "em",
    "p": "p",
    "div": "p",
    "br": "br",
    "ul": "ul",
    "ol": "ol",
    "li": "li",
    "span": "span",
}
# The only font-size values a kept <span> is allowed to carry.
_ALLOWED_SIZES = {"13px", "15px", "18px", "22px"}
# Elements whose text content is discarded together with the tag itself.
_DROP_CONTENT = {"script", "style"}
# Captures the first pixel font-size found in an inline style attribute value.
_FONT_SIZE_RE = re.compile(r"font-size:\s*(\d+px)", re.IGNORECASE)
# execCommand fontSize emits <font size="1..7">; fold those onto our scale.
_FONT_TAG_SIZE = {
    "1": "13px",
    "2": "13px",
    "3": "15px",
    "4": "18px",
    "5": "22px",
    "6": "22px",
    "7": "22px",
}


class _Sanitizer(HTMLParser):
    """Rebuild editor HTML from the allowlist as the parser streams events.

    Nothing from the input is copied verbatim: allowed tags are re-emitted in
    canonical form without attributes (a span gets only a whitelisted
    font-size), text is re-escaped, and script/style content is discarded.

    Attributes:
        out: Output fragments (canonical tags and escaped text), in order.
        open: One entry per source element that is still open — the canonical
            tag emitted for it, or None when the wrapper was dropped and only
            its children are kept. Callers close the leftovers after parsing.
        skip: Nesting depth inside script/style; text is discarded while > 0.
    """

    # Void elements other than <br> (which is emitted directly). They never
    # receive an end tag, so giving them a stack slot would make the next real
    # end tag close the wrong element.
    _VOID_TAGS = frozenset({
        "area", "base", "col", "embed", "hr", "img", "input",
        "link", "meta", "param", "source", "track", "wbr",
    })

    def __init__(self) -> None:
        super().__init__(convert_charrefs=True)
        self.out: list[str] = []
        self.open: list[str | None] = []  # emitted canonical tag, or None for a dropped wrapper
        self._names: list[str] = []  # source tag names, kept parallel to self.open
        self.skip = 0  # depth inside script/style (content discarded)

    def _push(self, name, canon):
        """Record an open source element and the canonical tag emitted for it."""
        self._names.append(name)
        self.open.append(canon)

    def handle_starttag(self, tag, attrs):
        """Emit the canonical form of an allowed start tag; drop anything else."""
        t = tag.lower()
        if t in _DROP_CONTENT:
            self.skip += 1
            self._push(t, None)
            return
        if self.skip:
            return
        if t == "br":
            self.out.append("<br>")
            return  # void — no stack entry
        if t in self._VOID_TAGS:
            return  # disallowed void element — nothing to emit, nothing to close
        if t in ("span", "font"):
            # <font size> is normalised to the same safe span; a span/font
            # without a whitelisted size loses its wrapper but keeps its text.
            size = self._size(t, attrs)
            canon = "span" if size else None
            if size:
                self.out.append(f'<span style="font-size:{size}">')
        else:
            canon = _TAG_MAP.get(t)  # None → disallowed tag: drop tag, keep children
            if canon:
                self.out.append(f"<{canon}>")
        self._push(t, canon)

    def handle_startendtag(self, tag, attrs):
        """Handle XHTML-style self-closing tags; only <br/> produces output."""
        if tag.lower() == "br" and not self.skip:
            self.out.append("<br>")

    def handle_endtag(self, tag):
        """Close the matching open element, plus anything left open inside it.

        An end tag with no matching open element (a stray </b>, </script>,
        </br>, ...) is ignored rather than closing an unrelated element.
        """
        t = tag.lower()
        if t not in self._names:
            return
        if self.skip and t not in _DROP_CONTENT:
            return
        # Unwind to the matching element so the output always stays well nested.
        while self._names:
            name = self._names.pop()
            canon = self.open.pop()
            if name in _DROP_CONTENT and self.skip:
                self.skip -= 1
            if canon:
                self.out.append(f"</{canon}>")
            if name == t:
                break

    def handle_data(self, data):
        """Append text re-escaped, unless it sits inside script/style."""
        if not self.skip:
            self.out.append(_html.escape(data, quote=False))

    def _size(self, tag, attrs):
        """Return the whitelisted font-size requested by *attrs*, or None.

        Attributes are checked in source order. An inline style counts when its
        first pixel font-size is on the allowlist (units compared
        case-insensitively); for <font>, a size="1..7" attribute is mapped onto
        the same scale. An attribute that yields nothing does not stop the scan.
        """
        for k, v in attrs:
            if not v:
                continue
            key = k.lower()
            if key == "style":
                m = _FONT_SIZE_RE.search(v)
                if m:
                    px = m.group(1).lower()
                    if px in _ALLOWED_SIZES:
                        return px
            elif tag == "font" and key == "size":
                mapped = _FONT_TAG_SIZE.get(v.strip())
                if mapped:
                    return mapped
        return None


def sanitize_reply_html(raw: str) -> str:
    """Return *raw* rebuilt from the strict allowlist, or '' when nothing is left.

    The input is run through ``_Sanitizer``; any allowed tags still open at the
    end are closed, and a result that holds only markup and whitespace is
    reported as empty.
    """
    if not raw:
        return ""
    parser = _Sanitizer()
    parser.feed(raw)
    parser.close()
    # Malformed input may leave allowed tags open: close them innermost-first so
    # no dangling tag reaches stored HTML or the email body.
    closers = [f"</{tag}>" for tag in reversed(parser.open) if tag]
    cleaned = "".join(parser.out + closers)
    # Strip the markup to see whether any visible text survived.
    visible = re.sub(r"<[^>]+>", "", cleaned)
    if visible.strip():
        return cleaned.strip()
    return ""


def reply_html_to_text(clean_html: str) -> str:
    """Derive the plain-text fallback from already-sanitized reply HTML.

    Paragraphs become blank-line separated blocks, <br> becomes a newline and
    list items become "- " bullets on their own lines. Every remaining tag is
    removed and character references are decoded. Runs of three or more
    newlines are collapsed to one blank line and the result is stripped.

    Block-level *opening* tags also introduce a break, so text that precedes a
    paragraph or list (e.g. ``first<p>second</p>``) is not glued to it.

    Args:
        clean_html: Output of the sanitizer (lower-case, attribute-free tags
            apart from the font-size span).

    Returns:
        The plain-text rendering, or '' for empty input.
    """
    if not clean_html:
        return ""
    # A paragraph that merely wraps a list item's content must not push the
    # text off the bullet's line.
    text = re.sub(r"<li>\s*<p>", "<li>", clean_html)
    text = re.sub(r"<br\s*/?>", "\n", text)
    # Break on both edges of a paragraph; surplus newlines are collapsed below.
    text = re.sub(r"</?p>", "\n\n", text)
    # A list starts on its own line, even directly after text or inside an item.
    text = re.sub(r"<(?:ul|ol)>", "\n", text)
    text = text.replace("<li>", "- ").replace("</li>", "\n")
    text = re.sub(r"<[^>]+>", "", text)
    # Decode entities only after the tags are gone, so escaped "<" stays text.
    text = _html.unescape(text)
    return re.sub(r"\n{3,}", "\n\n", text).strip()