# scripts/deprecated/ftg/formatter.py

# A set of filters to format HTML to text with barebones Markdown-style markup.
import re


class HtmlToFText:
    """Convert an HTML string to text with barebones Markdown-style markup.

    The conversion is a pipeline of regex/string filters (see ``convert``),
    not a real HTML parser: tags are matched case-sensitively, attribute
    values containing ``>`` are not supported, and nested lists are not
    preserved.

    The class-level tables below drive the generic filters and may be
    overridden or extended by subclasses.
    """

    # Stripped first.  Keys and values are *regex fragments*: everything from
    # the key up to (and including) the first following value is removed.
    # Iteration order matters (comments first), so dict insertion order is
    # relied upon.
    strip_tags1 = {
        # Remove comments first, which may wrap around other tags
        "<!--": "-->",
        # Remove classes, ids and extraneous attributes
        " class=\"": "\"", " id=\"": "\"",
    }

    # Stripped second, same key/value convention as strip_tags1.
    strip_tags2 = {
        # Remove doctype, tags and inner html
        "<!": ">",
        "<applet": "</applet>",
        "<aside": "</aside>",
        "<base": "</base>",
        "<canvas": "</canvas>",
        "<form": "</form>",
        "<button": "</button>",
        "<input": "</input>",
        "<label": "</label>",
        "<head": "</head>",
        "<iframe": "</iframe>",
        "<menu": "</menu>",
        "<nav": "</nav>",
        "<noscript": "</noscript>",
        "<param": "</param>",
        "<progress": "</progress>",
        "<rp": "</rp>",
        "<script": "</script>",
        "<style": "</style>",
        # Remove non-functional empty links after stripping classes/ids
        "<a href=\"#\"": "</a>",
        "<a>": "</a>",
        # Remove the tags themselves but not the inner html
        "<article": ">", "</article": ">",
        "<body": ">", "</body": ">",
        "<div": ">", "</div": ">",
        "<footer": ">", "</footer": ">",
        "<header": ">", "</header": ">",
        "<html": ">", "</html": ">",
        "<main": ">", "</main": ">",
        "<section": ">", "</section": ">",
        "<span": ">", "</span": ">",
        "<title": ">", "</title": ">",
        # Remove currently unsupported tags
        "<center": ">", "</center": ">",
        "<frame": ">", "</frame": ">",
        "<small": ">", "</small": ">",
        # "<audio": ">", "</audio": ">",
        # "<video": ">", "</video": ">",
        # "<map": ">", "</map": ">",
    }

    # Literal substrings removed by filter_whitespace, in this order.
    strip_ws = ["\n\n", "\t", "  "]

    # Literal replacements applied before format_tags2.
    format_tags1 = {
        ">\n<": "><",
        "<blockquote>": "\n>",
    }

    # Literal tag/entity -> markup replacements.
    format_tags2 = {
        "<address>": "[[address]]\n", "</address>": "\n",
        "<em>": "*", "</em>": "*",
        "<i>": "*", "</i>": "*",
        "<h1>": "\n# ", "</h1>": "\n",
        "<h2>": "\n## ", "</h2>": "\n",
        "<h3>": "\n### ", "</h3>": "\n",
        "<h4>": "\n#### ", "</h4>": "\n",
        "<h5>": "\n##### ", "</h5>": "\n",
        "<h6>": "\n###### ", "</h6>": "\n",
        "<hr>": "\n---\n", "<hr/>": "\n---\n", "<hr />": "\n---\n",
        "<br>": "\n", "<br/>": "\n", "<br />": "\n",
        "<blockquote>": "\n>", "</blockquote>": "\n",
        "<cite>": "**", "</cite>": "**",
        "<code>": "`", "</code>": "`",
        "<del>": "~~", "</del>": "~~",
        "<ins>": "**", "</ins>": "**",
        "<dl>": "\n", "</dl>": "",
        "<dt>": "", "</dt>": ": ",
        "<dd>": "", "</dd>": "\n",
        "<figcaption>": "*", "</figcaption>": "*",
        "<figure>": "Fig. ", "</figure>": "",
        "<mark>": "***", "</mark>": "***",
        "<p>": "\n", "</p>": "\n",
        "<pre>": "\n```\n", "</pre>": "\n```\n",
        "<q>": "«", "</q>": "»",
        "<ruby>": "", "</ruby>": "",
        "<rt>": " (", "</rt>": ")",
        "<s>": "[[~~", "</s>": "~~]]",
        "<strong>": "**", "</strong>": "**",
        "<sub>": "⏝", "</sub>": "⏝",
        "<sup>": "^", "</sup>": "^",
        "<b>": "**", "</b>": "**",
        "<u>": "__", "</u>": "__",
        "&nbsp;": "\n\n",
        "&#39;": "'",
        # The left quote used to be a duplicated "&rsquo;" key, so "&lsquo;"
        # was never translated.
        "&lsquo;": "'", "&rsquo;": "'",
        "&ldquo;": "\"", "&rdquo;": "\"",
        "&ndash;": "—",
        "&copy;": "©",
    }

    # One attribute: name, then optionally = "double" | 'single' | bare value.
    _ATTR_RE = re.compile(
        r"""([^\s=/"'<>]+)(?:\s*=\s*(?:"([^"]*)"|'([^']*)'|([^\s"'<>]+)))?""")
    # Void elements: the optional trailing "/" is kept out of the attributes.
    _IMG_RE = re.compile(r"<img\s([^>]*?)\s*/?>", re.DOTALL)
    _EMBED_RE = re.compile(r"<embed\s([^>]*?)\s*/?>", re.DOTALL)
    # Paired elements: group 1 = attribute string, group 2 = inner text.
    _LINK_RE = re.compile(r"<a\s([^>]*)>(.*?)</a>", re.DOTALL)
    _ABBR_RE = re.compile(r"<abbr(\s[^>]*)?>(.*?)</abbr>", re.DOTALL)
    _TIME_RE = re.compile(r"<time(\s[^>]*)?>(.*?)</time>", re.DOTALL)
    _OL_RE = re.compile(r"<ol(?:\s[^>]*)?>(.*?)</ol>", re.DOTALL)
    _LI_OPEN_RE = re.compile(r"<li(?:\s[^>]*)?>")

    @classmethod
    def _parse_attrs(cls, attributes):
        """Parse a tag's attribute string into a ``{name: value}`` dict.

        Handles double-quoted, single-quoted and unquoted values; attributes
        without a value map to ``""``.  When a name is repeated the first
        occurrence wins.  ``attributes`` may be ``None`` or empty.
        """
        attrs = {}
        for match in cls._ATTR_RE.finditer(attributes or ""):
            name, double, single, bare = match.groups()
            value = ""
            for candidate in (double, single, bare):
                if candidate is not None:
                    value = candidate
                    break
            attrs.setdefault(name, value)
        return attrs

    def filter_strip_tags(self, html):
        """Strip extraneous html tags.

        Applies ``strip_tags1`` and then ``strip_tags2``: for every entry the
        shortest span starting at the key and ending at the value is removed
        (newlines included).  Keys/values are used as regex fragments.
        """
        txt = html
        # Some tags need to be stripped before others
        for table in (self.strip_tags1, self.strip_tags2):
            for opening, closing in table.items():
                txt = re.sub(opening + ".*?" + closing, "", txt,
                             flags=re.DOTALL)
        return txt

    def filter_whitespace(self, html):
        """Strip extra whitespaces often found in dynamically-generated source
        files.

        Every occurrence of the ``strip_ws`` substrings is *removed* (not
        collapsed), then blank lines created by those removals are removed as
        well.
        """
        txt = html
        for ws in self.strip_ws:
            txt = txt.replace(ws, "")
        # Removing tabs/spaces can bring two newlines together again.
        return txt.replace("\n\n", "")

    def filter_format_tags(self, html):
        """Translate select structure and format-related tags to Markdown-like
        syntax using the literal ``format_tags1`` / ``format_tags2`` tables."""
        txt = html
        for table in (self.format_tags1, self.format_tags2):
            for tag, markup in table.items():
                txt = txt.replace(tag, markup)
        return txt

    def filter_img(self, html):
        """Translate image tags to Markdown syntax.

        ``<img src=S alt=A title=T>`` becomes ``![A](S "T")``; the title part
        is omitted when the tag has no (or an empty) title.  Both ``<img ...>``
        and ``<img .../>`` are recognised, and each image is parsed
        independently of the others.
        """
        def to_markdown(match):
            attrs = self._parse_attrs(match.group(1))
            alt = attrs.get("alt", "")
            src = attrs.get("src", "")
            title = attrs.get("title", "")
            if title:
                return "![" + alt + "](" + src + " \"" + title + "\")"
            return "![" + alt + "](" + src + ")"

        return self._IMG_RE.sub(to_markdown, html)

    def filter_links(self, html):
        """Translate links to Markdown syntax.

        ``<a href=H>text</a>`` becomes ``[text](H)``.  An anchor without an
        href, or without any text, is replaced by its inner text only.
        """
        def to_markdown(match):
            href = self._parse_attrs(match.group(1)).get("href", "")
            text = match.group(2)
            if href and text.strip():
                return "[" + text + "](" + href + ")"
            return text

        return self._LINK_RE.sub(to_markdown, html)

    def filter_embed(self, html):
        """Translate embed tags to Markdown links (``[embed](src)``).

        Embed tags without a src are left untouched.
        """
        def to_markdown(match):
            src = self._parse_attrs(match.group(1)).get("src", "")
            if src:
                return "[embed](" + src + ")"
            return match.group(0)

        return self._EMBED_RE.sub(to_markdown, html)

    def filter_abbr(self, html):
        """Format abbr tags, e.g. `<abbr title="Hypertext Markup
        Language">HTML</abbr>` -> `HTML [[abbr: Hypertext Markup Language]]`.

        An abbr without a title (or without text) is replaced by its inner
        text only.
        """
        def to_text(match):
            title = self._parse_attrs(match.group(1)).get("title", "")
            text = match.group(2)
            if title and text.strip():
                return text + " [[abbr: " + title + "]]"
            return text

        return self._ABBR_RE.sub(to_text, html)

    def filter_time(self, html):
        """Format time tags, e.g. `<time datetime="1970-01-01">Today</time>` ->
        `Today (1970-01-01)`.

        Without a datetime attribute only the inner text is kept; without
        inner text only the datetime value is kept.
        """
        def to_text(match):
            stamp = self._parse_attrs(match.group(1)).get("datetime", "")
            text = match.group(2)
            if not stamp:
                return text
            if not text.strip():
                return stamp
            return text + " (" + stamp + ")"

        return self._TIME_RE.sub(to_text, html)

    def filter_ol(self, html):
        """Parse ordered lists. Only single-level lists are currently
        supported.

        Each non-empty ``<li>`` becomes a ``N. item`` line; the list is
        preceded by a newline and every item ends with one.
        """
        def to_markdown(match):
            # Element 0 is whatever precedes the first <li>; drop it.
            chunks = self._LI_OPEN_RE.split(match.group(1))[1:]
            items = [chunk.replace("</li>", "").strip() for chunk in chunks]
            lines = [
                str(number) + ". " + item + "\n"
                for number, item in enumerate(filter(None, items), start=1)
            ]
            return "\n" + "".join(lines)

        return self._OL_RE.sub(to_markdown, html)

    def filter_ul(self, html):
        """Parse unordered lists. Only single-level lists are currently
        supported (nested lists will be flattened)."""
        txt = html
        if ("<ul>" in txt) or ("<li>" in txt):
            # Drop empty items first so they do not become bare "- " lines.
            txt = txt.replace("<li></li>", "")
            txt = txt.replace("<ul>", "\n").replace("</ul>", "")
            txt = txt.replace("<li>", "- ").replace("</li>", "\n")
        return txt

    def convert(self, html):
        """Run format filters on html string.

        Order matters: tags are stripped and whitespace removed before the
        literal tag table is applied, and ordered lists are handled before
        unordered ones so that ``<li>`` inside ``<ol>`` gets numbered.
        """
        text = self.filter_strip_tags(html)
        text = self.filter_whitespace(text)
        text = self.filter_format_tags(text)
        text = self.filter_img(text)
        text = self.filter_links(text)
        text = self.filter_embed(text)
        text = self.filter_abbr(text)
        text = self.filter_time(text)
        text = self.filter_ol(text)
        text = self.filter_ul(text)
        return text
Initial commit 2023-04-21 02:31:52 +00:00			`# A set of filters to format HTML to text with barebones Markdown-style markup.`
			`import re`


			`class HtmlToFText:`

			`strip_tags1 = {`
			`# Remove comments first, which may wrap around other tags`
			`"<!--": "-->",`
			`# Remove classes, ids and extraneous attributes`
			`" class=\"": "\"", " id=\"": "\"",`
			`}`

			`strip_tags2 = {`
			`# Remove doctype, tags and inner html`
			`"<!": ">",`
			`"<applet": "</applet>",`
			`"<aside": "</aside>",`
			`"<base": "</base>",`
			`"<canvas": "</canvas>",`
			`"<form": "</form>",`
			`"<button": "</button>",`
			`"<input": "</input>",`
			`"<label": "</label>",`
			`"<head": "</head>",`
			`"<iframe": "</iframe>",`
			`"<menu": "</menu>",`
			`"<nav": "</nav>",`
			`"<noscript": "</noscript>",`
			`"<param": "</param>",`
			`"<progress": "</progress>",`
			`"<rp": "</rp>",`
			`"<script": "</script>",`
			`"<style": "</style>",`
			`# Remove non-functional empty links after stripping classes/ids`
			`"<a href=\"#\"": "</a>",`
			`"<a>": "</a>",`
			`# Remove the tags themselves but not the inner html`
			`"<article": ">", "</article": ">",`
			`"<body": ">", "</body": ">",`
			`"<div": ">", "</div": ">",`
			`"<footer": ">", "</footer": ">",`
			`"<header": ">", "</header": ">",`
			`"<html": ">", "</html": ">",`
			`"<main": ">", "</main": ">",`
			`"<section": ">", "</section": ">",`
			`"<span": ">", "</span": ">",`
			`"<title": ">", "</title": ">",`
			`# Remove currently unsupported tags`
			`"<center": ">", "</center": ">",`
			`"<frame": ">", "</frame": ">",`
			`"<small": ">", "</small": ">",`
			`# "<audio": ">", "</audio": ">",`
			`# "<video": ">", "</video": ">",`
			`# "<map": ">", "</map": ">",`
			`}`

			`strip_ws = ["\n\n", "\t", " "]`

			`format_tags1 = {`
			`">\n<": "><",`
			`"<blockquote>": "\n>",`
			`}`

			`format_tags2 = {`
			`"<address>": "[[address]]\n", "</address>": "\n",`
			`"<em>": "", "</em>": "",`
			`"<i>": "", "</i>": "",`
			`"<h1>": "\n# ", "</h1>": "\n",`
			`"<h2>": "\n## ", "</h2>": "\n",`
			`"<h3>": "\n### ", "</h3>": "\n",`
			`"<h4>": "\n#### ", "</h4>": "\n",`
			`"<h5>": "\n##### ", "</h5>": "\n",`
			`"<h6>": "\n###### ", "</h6>": "\n",`
			`"<hr>": "\n---\n", "<hr/>": "\n---\n", "<hr />": "\n---\n",`
			`"<br>": "\n", "<br/>": "\n", "<br />": "\n",`
			`"<blockquote>": "\n>", "</blockquote>": "\n",`
			`"<cite>": "", "</cite>": "",`
			"<code>": "`", "</code>": "`",
			`"<del>": "~~", "</del>": "~~",`
			`"<ins>": "", "</ins>": "",`
			`"<dl>": "\n", "</dl>": "",`
			`"<dt>": "", "</dt>": ": ",`
			`"<dd>": "", "</dd>": "\n",`
			`"<figcaption>": "", "</figcaption>": "",`
			`"<figure>": "Fig. ", "</figure>": "",`
			`"<mark>": "*", "</mark>": "*",`
			`"<p>": "\n", "</p>": "\n",`
			"<pre>": "\n```\n", "</pre>": "\n```\n",
			`"<q>": "«", "</q>": "»",`
			`"<ruby>": "", "</ruby>": "",`
			`"<rt>": " (", "</rt>": ")",`
			`"<s>": "[[~~", "</s>": "~~]]",`
			`"<strong>": "", "</strong>": "",`
			`"<sub>": "⏝", "</sub>": "⏝",`
			`"<sup>": "^", "</sup>": "^",`
			`"<b>": "", "</b>": "",`
			`"<u>": "__", "</u>": "__",`
			`" ": "\n\n",`
			`"'": "'",`
			`"’": "'", "’": "'",`
			`"“": "\"", "”": "\"",`
			`"–": "—",`
			`"©": "©",`
			`}`

			`def filter_strip_tags(self, html):`
			`"""Strip extraneous html tags."""`
			`txt = html`
			`# Some tags need to be stripped before others`
			`for tag in self.strip_tags1:`
			`txt = re.sub(tag + ".*?" + self.strip_tags1[tag], "", txt,`
			`flags=re.DOTALL)`
			`for tag in self.strip_tags2:`
			`txt = re.sub(tag + ".*?" + self.strip_tags2[tag], "", txt,`
			`flags=re.DOTALL)`
			`return txt`

			`def filter_whitespace(self, html):`
			`"""Strip extra whitespaces often found in dynamically-generated source`
			`files."""`
			`txt = html`
			`for ws in self.strip_ws:`
			`txt = txt.replace(ws, "")`
			`return "".join(txt.split("\n\n"))`

			`def filter_format_tags(self, html):`
			`"""Translate select structure and format-related tags to Markdown-like`
			`syntax."""`
			`txt = html`
			`for tag in self.format_tags1:`
			`txt = txt.replace(tag, self.format_tags1[tag])`
			`for tag in self.format_tags2:`
			`txt = txt.replace(tag, self.format_tags2[tag])`
			`return txt`

			`def filter_img(self, html):`
			`"""Translate image tags to Markdown syntax."""`
			`txt = html`
			`attrs = {"src": "", "title": "", "alt": ""}`
			`imgs = re.findall("<img [a-z].*?/>", txt, flags=re.DOTALL)`
			`for i in imgs:`
			`for a in attrs:`
			`if (" " + a + "=\"") in i:`
			`attrs[a] = i.split(" " + a + "=\"")[1].split("\"")[0]`
			`elif (" " + a + "='") in i:`
			`attrs[a] = i.split(" " + a + "='")[1].split("'")[0]`
			`elif (" " + a + "=") in i:`
			`attrs[a] = i.split(" " + a + "=")[1].split(" ")[0]`
			`if attrs["title"] != "":`
			`md_link = "![" + attrs["alt"] + "](" + attrs["src"] + " \"" + \`
			`attrs["title"] + "\")"`
			`else:`
			`md_link = "![" + attrs["alt"] + "](" + attrs["src"] + ")"`
			`txt = txt.replace(i, md_link)`
			`return txt`

			`def filter_links(self, html):`
			`"""Translate links to Markdown syntax."""`
			`txt = html`
			`links = re.findall("<a [a-z].*?</a>", txt, flags=re.DOTALL)`
			`attrs = {"href": "", "title": ""}`
			`md_link = ""`
			`for l in links:`
			`if " href=\"" in l:`
			`attrs["href"] = l.split(" href=\"")[1].split("\"")[0]`
			`attrs["title"] = l.split(">")[1].strip("</a>")`
			`elif " href='" in l:`
			`attrs["href"] = l.split(" href='")[1].split("'")[0]`
			`attrs["title"] = l.split(">")[1].strip("</a>")`
			`elif " href=" in l:`
			`attrs["href"] = l.split(" href=")[1].split(" ")[0]`
			`attrs["title"] = l.split(">")[1].strip("</a>")`
			`if (attrs["href"] != "") and (attrs["title"] != ""):`
			`md_link = "[" + attrs["title"] + "](" + attrs["href"] + ")"`
			`txt = txt.replace(l, md_link)`
			`return txt`

			`def filter_embed(self, html):`
			`"""Translate embed tags to Markdown links."""`
			`txt = html`
			`embeds = re.findall("<embed [a-z].*?>", txt, flags=re.DOTALL)`
			`src = ""`
			`for e in embeds:`
			`if " src=\"" in e:`
			`src = e.split(" src=\"")[1].split("\"")[0]`
			`elif " src='" in e:`
			`src = e.split(" src='")[1].split("'")[0]`
			`elif " src=" in e:`
			`src = e.split(" src=")[1].split(">")[0]`
			`if src != "":`
			`txt = txt.replace(e, "[embed](" + src + ")")`
			`return txt`

			`def filter_abbr(self, html):`
			"""Format abbr tags, e.g. `<abbr title="Hypertext Markup
			Language">HTML</abbr>` -> `HTML [[abbr: Hypertext Markup Language]]`"""
			`txt = html`
			`abbrs = re.findall("<abbr [a-z].*?</abbr>", txt, flags=re.DOTALL)`
			`attrs = {"abbr": "", "title": ""}`
			`abbrev = ""`
			`for a in abbrs:`
			`if " title=\"" in a:`
			`attrs["title"] = a.split(" title=\"")[1].split("\"")[0]`
			`attrs["abbr"] = a.split(">")[1].strip("</abbr>")`
			`elif " title='" in a:`
			`attrs["title"] = a.split(" title='")[1].split("'")[0]`
			`attrs["abbr"] = a.split(">")[1].strip("</abbr>")`
			`elif " title=" in a:`
			`attrs["title"] = a.split(" title=")[1].split(" ")[0]`
			`attrs["abbr"] = a.split(">")[1].strip("</abbr>")`
			`if (attrs["title"] != "") and (attrs["abbr"] != ""):`
			`abbrev = attrs["abbr"] + "[[abbr: " + attrs["title"] + "]]"`
			`txt = txt.replace(l, abbrev)`
			`return txt`

			`def filter_time(self, html):`
			"""Format time tags, e.g. `<time datetime="1970-01-01">Today</time>` ->
			`Today (1970-01-01)`."""
			`txt = html`
			`timestamps = re.findall("<time.*?</time>", txt)`
			`attrs = {"title": "", "datetime": ""}`
			`for t in timestamps:`
			`attrs["title"] = t.split(">")[1].strip("</time>")`
			`if " datetime=\"" in t:`
			`attrs["datetime"] = t.split(" datetime=\"")[1].split("\"")[0]`
			`elif " datetime='" in t:`
			`attrs["datetime"] = t.split(" datetime='")[1].split("'")[0]`
			`elif " datetime=" in t:`
			`attrs["datetime"] = t.split(" datetime=")[1].split(">")[0]`
			`ts = attrs["title"]`
			`if attrs["datetime"] != "":`
			`ts += " (" + attrs["datetime"] + ")"`
			`txt = txt.replace(t, ts)`
			`return txt`

			`def filter_ol(self, html):`
			`"""Parse ordered lists. Only single-level lists are currently`
			`supported."""`
			`txt = html`
			`ol = re.findall("<ol>.*?</ol>", txt, flags=re.DOTALL)`
			`for o in ol:`
			`li = o.replace("</li>", "").replace("</ol>", "").split("<li>")`
			`md = ""`
			`for l in range(1, len(li)):`
			`md += str(l) + ". " + li[l].lstrip()`
			`txt = txt.replace(o, "\n" + md)`
			`return txt`

			`def filter_ul(self, html):`
			`"""Parse unordered lists. Only single-level lists are currently`
			`supported (nested lists will be flattened)."""`
			`txt = html`
			`while ("<ul>" in txt) or ("<li>" in txt):`
			`txt = txt.replace("<li></li>","")`
			`txt = txt.replace("<ul>", "\n").replace("</ul>", "")`
			`txt = txt.replace("<li>", "- ").replace("</li>", "\n")`
			`return txt`

			`def convert(self, html):`
			`"""Run format filters on html string."""`
			`text = self.filter_strip_tags(html)`
			`text = self.filter_whitespace(text)`
			`text = self.filter_format_tags(text)`
			`text = self.filter_img(text)`
			`text = self.filter_links(text)`
			`text = self.filter_embed(text)`
			`text = self.filter_abbr(text)`
			`text = self.filter_time(text)`
			`text = self.filter_ol(text)`
			`text = self.filter_ul(text)`
			`return text`