scripts/ftg/formatter.py

# A set of filters to format HTML to text with barebones Markdown-style markup.
import re


class HtmlToFText:
    """Convert an HTML string to text with barebones Markdown-style markup.

    The conversion is a pipeline of independent string filters (see
    `convert` for the order). The class-level tables below drive the
    tag-stripping and tag-translation filters; the remaining filters handle
    elements whose output depends on their attributes (images, links, embeds,
    abbreviations, timestamps) or on their children (lists).
    """

    # Removed first: `opening text -> closing text`, everything in between
    # (inclusive) is deleted.
    strip_tags1 = {
            # Remove comments first, which may wrap around other tags
            "<!--": "-->",
            # Remove classes, ids and extraneous attributes
            " class=\"": "\"", " id=\"": "\"",
            }

    # Removed second, in this order, with the same `opening -> closing` rule.
    strip_tags2 = {
            # Remove doctype, tags and inner html
            "<!": ">",
            "<applet": "</applet>",
            "<aside": "</aside>",
            "<base": "</base>",
            "<canvas": "</canvas>",
            "<form": "</form>",
            "<button": "</button>",
            "<input": "</input>",
            "<label": "</label>",
            "<head": "</head>",
            "<iframe": "</iframe>",
            "<menu": "</menu>",
            "<nav": "</nav>",
            "<noscript": "</noscript>",
            "<param": "</param>",
            "<progress": "</progress>",
            "<rp": "</rp>",
            "<script": "</script>",
            "<style": "</style>",
            # Remove non-functional empty links after stripping classes/ids
            "<a href=\"#\"": "</a>",
            "<a>": "</a>",
            # Remove the tags themselves but not the inner html
            "<article": ">", "</article": ">",
            "<body": ">", "</body": ">",
            "<div": ">", "</div": ">",
            "<footer": ">", "</footer": ">",
            "<header": ">", "</header": ">",
            "<html": ">", "</html": ">",
            "<main": ">", "</main": ">",
            "<section": ">", "</section": ">",
            "<span": ">", "</span": ">",
            "<title": ">", "</title": ">",
            # Remove currently unsupported tags
            "<center": ">", "</center": ">",
            "<frame": ">", "</frame": ">",
            "<small": ">", "</small": ">",
            # "<audio": ">", "</audio": ">",
            # "<video": ">", "</video": ">",
            # "<map": ">", "</map": ">",
            }

    # Whitespace sequences deleted outright by `filter_whitespace`.
    strip_ws = ["\n\n", "\t", "  "]

    # Literal replacements applied before `format_tags2`.
    format_tags1 = {
            ">\n<": "><",
            "<blockquote>": "\n>",
            }

    # Literal `html -> markup` replacements (tags and a few entities).
    format_tags2 = {
            "<address>": "[[address]]\n", "</address>": "\n",
            "<em>": "*", "</em>": "*",
            "<i>": "*", "</i>": "*",
            "<h1>": "\n# ", "</h1>": "\n",
            "<h2>": "\n## ", "</h2>": "\n",
            "<h3>": "\n### ", "</h3>": "\n",
            "<h4>": "\n#### ", "</h4>": "\n",
            "<h5>": "\n##### ", "</h5>": "\n",
            "<h6>": "\n###### ", "</h6>": "\n",
            "<hr>": "\n---\n", "<hr/>": "\n---\n", "<hr />": "\n---\n",
            "<br>": "\n", "<br/>": "\n", "<br />": "\n",
            "<blockquote>": "\n>", "</blockquote>": "\n",
            "<cite>": "**", "</cite>": "**",
            "<code>": "`", "</code>": "`",
            "<del>": "~~", "</del>": "~~",
            "<ins>": "**", "</ins>": "**",
            "<dl>": "\n", "</dl>": "",
            "<dt>": "", "</dt>": ": ",
            "<dd>": "", "</dd>": "\n",
            "<figcaption>": "*", "</figcaption>": "*",
            "<figure>": "Fig. ", "</figure>": "",
            "<mark>": "***", "</mark>": "***",
            "<p>": "\n", "</p>": "\n",
            "<pre>": "\n```\n", "</pre>": "\n```\n",
            "<q>": "«", "</q>": "»",
            "<ruby>": "", "</ruby>": "",
            "<rt>": " (", "</rt>": ")",
            "<s>": "[[~~", "</s>": "~~]]",
            "<strong>": "**", "</strong>": "**",
            "<sub>": "⏝", "</sub>": "⏝",
            "<sup>": "^", "</sup>": "^",
            "<b>": "**", "</b>": "**",
            "<u>": "__", "</u>": "__",
            "&nbsp;": "\n\n",
            "&#39;": "'",
            # The left quote used to be a duplicate "&rsquo;" key, so
            # "&lsquo;" was never translated.
            "&lsquo;": "'", "&rsquo;": "'",
            "&ldquo;": "\"", "&rdquo;": "\"",
            "&ndash;": "—",
            "&copy;": "©",
            }

    # Regex tail matching `="value"`, `='value'` or a bare `=value` right
    # after an attribute name; exactly one of the three groups is set.
    _attr_value = r"""=(?:"([^"]*)"|'([^']*)'|([^\s>]*))"""

    @classmethod
    def _attr(cls, tag, name):
        """Return the value of attribute `name` in the opening `tag`, or an
        empty string when the attribute is absent."""
        found = re.search(r"\s" + re.escape(name) + cls._attr_value, tag)
        if found is None:
            return ""
        return next((g for g in found.groups() if g is not None), "")

    @staticmethod
    def _strip_pairs(txt, pairs):
        """Delete every shortest `opening ... closing` span listed in the
        `pairs` mapping, processing the mapping in insertion order."""
        for opening, closing in pairs.items():
            # The table entries are literal text, not regex fragments.
            pattern = re.escape(opening) + ".*?" + re.escape(closing)
            txt = re.sub(pattern, "", txt, flags=re.DOTALL)
        return txt

    def filter_strip_tags(self, html):
        """Strip extraneous html tags."""
        # Some tags need to be stripped before others
        txt = self._strip_pairs(html, self.strip_tags1)
        return self._strip_pairs(txt, self.strip_tags2)

    def filter_whitespace(self, html):
        """Strip extra whitespaces often found in dynamically-generated source
        files."""
        txt = html
        for ws in self.strip_ws:
            txt = txt.replace(ws, "")
        # Removing tabs/spaces above can create new blank lines; drop those
        # as well.
        return txt.replace("\n\n", "")

    def filter_format_tags(self, html):
        """Translate select structure and format-related tags to Markdown-like
        syntax."""
        txt = html
        for table in (self.format_tags1, self.format_tags2):
            for tag, markup in table.items():
                txt = txt.replace(tag, markup)
        return txt

    def filter_img(self, html):
        """Translate image tags to Markdown syntax.

        Both `<img ...>` and `<img ... />` are handled. Attributes are read
        per image, so an image without a title never inherits the title of a
        previous one.
        """
        def to_markdown(found):
            tag = found.group(0)
            src = self._attr(tag, "src")
            alt = self._attr(tag, "alt")
            title = self._attr(tag, "title")
            if title != "":
                return "![" + alt + "](" + src + " \"" + title + "\")"
            return "![" + alt + "](" + src + ")"

        return re.sub(r"<img\s[^>]*>", to_markdown, html)

    def filter_links(self, html):
        """Translate links to Markdown syntax.

        `<a href="url">text</a>` becomes `[text](url)`. A link that has no
        href or no text is removed.
        """
        def to_markdown(found):
            href = self._attr(found.group(1), "href")
            text = found.group(2)
            if href != "" and text != "":
                return "[" + text + "](" + href + ")"
            return ""

        return re.sub(r"(<a\s[^>]*>)(.*?)</a>", to_markdown, html,
                flags=re.DOTALL)

    def filter_embed(self, html):
        """Translate embed tags to Markdown links. Embed tags without a src
        attribute are left untouched."""
        def to_markdown(found):
            src = self._attr(found.group(0), "src")
            if src != "":
                return "[embed](" + src + ")"
            return found.group(0)

        return re.sub(r"<embed\s[^>]*>", to_markdown, html)

    def filter_abbr(self, html):
        """Format abbr tags, e.g. `<abbr title="Hypertext Markup
        Language">HTML</abbr>` -> `HTML[[abbr: Hypertext Markup Language]]`.
        An abbr tag without a title is reduced to its text."""
        def to_text(found):
            title = self._attr(found.group(1), "title")
            abbr = found.group(2)
            if title != "" and abbr != "":
                return abbr + "[[abbr: " + title + "]]"
            return abbr

        return re.sub(r"(<abbr(?:\s[^>]*)?>)(.*?)</abbr>", to_text, html,
                flags=re.DOTALL)

    def filter_time(self, html):
        """Format time tags, e.g. `<time datetime="1970-01-01">Today</time>` ->
        `Today (1970-01-01)`. Without a datetime attribute only the text is
        kept."""
        def to_text(found):
            datetime = self._attr(found.group(1), "datetime")
            text = found.group(2)
            if datetime != "":
                return text + " (" + datetime + ")"
            return text

        return re.sub(r"(<time(?:\s[^>]*)?>)(.*?)</time>", to_text, html,
                flags=re.DOTALL)

    def filter_ol(self, html):
        """Parse ordered lists. Only single-level lists are currently
        supported. Each item is written on its own line as `N. text`."""
        def to_markdown(found):
            # Whatever precedes the first <li> is not a list item.
            items = found.group(1).replace("</li>", "").split("<li>")[1:]
            lines = [str(number) + ". " + item.strip() + "\n"
                     for number, item in enumerate(items, 1)]
            return "\n" + "".join(lines)

        return re.sub(r"<ol>(.*?)</ol>", to_markdown, html, flags=re.DOTALL)

    def filter_ul(self, html):
        """Parse unordered lists. Only single-level lists are currently
        supported (nested lists will be flattened)."""
        txt = html
        # str.replace handles every occurrence, so one pass is enough.
        if ("<ul>" in txt) or ("<li>" in txt):
            txt = txt.replace("<li></li>", "")
            txt = txt.replace("<ul>", "\n").replace("</ul>", "")
            txt = txt.replace("<li>", "- ").replace("</li>", "\n")
        return txt

    def convert(self, html):
        """Run format filters on html string."""
        # Order matters: tags are stripped and whitespace normalised before
        # any translation, and ordered lists are consumed before the
        # unordered-list filter rewrites the remaining <li> tags.
        pipeline = (
            self.filter_strip_tags,
            self.filter_whitespace,
            self.filter_format_tags,
            self.filter_img,
            self.filter_links,
            self.filter_embed,
            self.filter_abbr,
            self.filter_time,
            self.filter_ol,
            self.filter_ul,
        )
        text = html
        for apply_filter in pipeline:
            text = apply_filter(text)
        return text