# A set of filters to format HTML to text with barebones Markdown-style markup.

import re


class HtmlToFText:
    """Convert an HTML string to plain text with minimal Markdown-like markup.

    The conversion is a pipeline of string/regex filters (see ``convert``);
    no HTML parser is involved, so malformed or deeply nested markup is
    handled on a best-effort basis only.

    NOTE(review): the copy of this file under review had every HTML tag and
    entity literal stripped out of its string constants. The keys of the
    tables below were reconstructed from the surviving replacement values,
    comments and docstrings. Entries marked "assumed" could not be recovered
    with certainty and must be confirmed against the upstream source.
    """

    # Opening regex fragment -> closing regex fragment. Everything from the
    # opening to the closing fragment (inclusive) is deleted. These run first.
    strip_tags1 = {
        # Remove comments first, which may wrap around other tags
        "<!--": "-->",
        # Remove classes, ids and extraneous attributes
        " class=\"": "\"",
        " id=\"": "\"",
    }

    # Same format as strip_tags1; applied after it. Keys are concatenated
    # into a regex unescaped, so they may contain regex syntax.
    # NOTE(review): the exact tag lists are assumed -- confirm upstream.
    strip_tags2 = {
        # Remove doctype, tags and inner html
        "<!DOCTYPE": ">",
        "<!doctype": ">",
        r"<head\b": "</head>",  # \b keeps <header> from matching
        "<script": "</script>",
        "<noscript": "</noscript>",
        "<style": "</style>",
        "<nav": "</nav>",
        "<form": "</form>",
        "<button": "</button>",
        "<select": "</select>",
        "<textarea": "</textarea>",
        "<svg": "</svg>",
        "<canvas": "</canvas>",
        "<object": "</object>",
        "<audio": "</audio>",
        "<video": "</video>",
        "<map": "</map>",
        "<template": "</template>",
        "<dialog": "</dialog>",
        # Remove non-functional empty links after stripping classes/ids
        "<a></a>": "",
        # Remove the tags themselves but not the inner html
        "<html": ">",
        "</html>": "",
        "<body": ">",
        "</body>": "",
        "<div": ">",
        "</div>": "",
        "<span": ">",
        "</span>": "",
        "<main": ">",
        "</main>": "",
        "<section": ">",
        "</section>": "",
        "<article": ">",
        "</article>": "",
        "<aside": ">",
        "</aside>": "",
        "<header": ">",
        "</header>": "",
        "<footer": ">",
        "</footer>": "",
        # Remove currently unsupported tags
        "<table": "</table>",
        "<iframe": "</iframe>",
        "<input": ">",
        "<label": "</label>",
        "<progress": "</progress>",
        "<meter": "</meter>",
    }

    # Whitespace sequences deleted outright by filter_whitespace.
    # NOTE(review): the last entry is assumed to be a run of two spaces
    # (source indentation); a single space would delete all word breaks.
    strip_ws = ["\n\n", "\t", "  "]

    # Literal replacements applied before format_tags2.
    format_tags1 = {
        ">\n<": "><",
        # assumed key: a paragraph opening directly inside a blockquote
        "<blockquote><p>": "\n>",
    }

    # Literal tag/entity -> Markdown-like replacement.
    # NOTE(review): keys are reconstructed from the replacement values.
    format_tags2 = {
        "<address>": "[[address]]\n",
        "</address>": "\n",
        "<em>": "*",
        "</em>": "*",
        "<i>": "*",
        "</i>": "*",
        "<h1>": "\n# ",
        "</h1>": "\n",
        "<h2>": "\n## ",
        "</h2>": "\n",
        "<h3>": "\n### ",
        "</h3>": "\n",
        "<h4>": "\n#### ",
        "</h4>": "\n",
        "<h5>": "\n##### ",
        "</h5>": "\n",
        "<h6>": "\n###### ",
        "</h6>": "\n",
        "<hr>": "\n---\n",
        "<hr/>": "\n---\n",
        "<hr />": "\n---\n",
        "<br>": "\n",
        "<br/>": "\n",
        "<br />": "\n",
        "<blockquote>": "\n>",
        "</blockquote>": "\n",
        "<b>": "**",
        "</b>": "**",
        "<code>": "`",
        "</code>": "`",
        "<del>": "~~",
        "</del>": "~~",
        "<dfn>": "**",  # assumed
        "</dfn>": "**",  # assumed
        "<dl>": "\n",
        "</dl>": "",
        "<dt>": "",
        "</dt>": ": ",
        "<dd>": "",
        "</dd>": "\n",
        "<figcaption>": "*",
        "</figcaption>": "*",
        "<figure>": "Fig. ",
        "</figure>": "",
        "<mark>": "***",  # assumed
        "</mark>": "***",  # assumed
        "<p>": "\n",
        "</p>": "\n",
        "<pre>": "\n```\n",
        "</pre>": "\n```\n",
        "<q>": "«",
        "</q>": "»",
        "<ruby>": "",  # assumed
        "</ruby>": "",  # assumed
        "<rt>": " (",  # assumed
        "</rt>": ")",  # assumed
        "<s>": "[[~~",
        "</s>": "~~]]",
        "<strong>": "**",
        "</strong>": "**",
        "<sub>": "⏝",
        "</sub>": "⏝",
        "<sup>": "^",
        "</sup>": "^",
        "<var>": "**",  # assumed
        "</var>": "**",  # assumed
        "<u>": "__",
        "</u>": "__",
        # NOTE(review): key assumed to be the nbsp entity; the "\n\n" value
        # is what the reviewed copy showed -- confirm this is intended.
        "&nbsp;": "\n\n",
        "&#39;": "'",
        "&rsquo;": "'",
        "&#8217;": "'",
        "&ldquo;": "\"",
        "&rdquo;": "\"",
        "&ndash;": "—",
        "&copy;": "©",
    }

    @staticmethod
    def _attr(tag, name):
        """Return the value of attribute ``name`` in the opening ``tag``.

        Handles double-quoted, single-quoted and unquoted values. The
        attribute must be preceded by whitespace and written as ``name=``
        with no spaces around the equals sign. Returns "" when absent.
        """
        pattern = (
            r"\s" + re.escape(name) + r"="
            r"(?:\"([^\"]*)\"|'([^']*)'|([^\s>]+))"
        )
        match = re.search(pattern, tag)
        if match is None:
            return ""
        return next((g for g in match.groups() if g is not None), "")

    def filter_strip_tags(self, html):
        """Strip extraneous html tags."""
        txt = html
        # Some tags need to be stripped before others
        for table in (self.strip_tags1, self.strip_tags2):
            for opening, closing in table.items():
                txt = re.sub(opening + ".*?" + closing, "", txt,
                             flags=re.DOTALL)
        return txt

    def filter_whitespace(self, html):
        """Strip extra whitespaces often found in dynamically-generated
        source files."""
        txt = html
        for ws in self.strip_ws:
            txt = txt.replace(ws, "")
        # Removing tabs/spaces can create new blank lines; drop those too.
        return "".join(txt.split("\n\n"))

    def filter_format_tags(self, html):
        """Translate select structure and format-related tags to
        Markdown-like syntax."""
        txt = html
        for table in (self.format_tags1, self.format_tags2):
            for tag, replacement in table.items():
                txt = txt.replace(tag, replacement)
        return txt

    def filter_img(self, html):
        """Translate image tags to Markdown syntax.

        Produces ``![alt](src)`` or ``![alt](src "title")``. Attributes are
        read per tag, so values never leak from one image into the next.
        """
        def to_markdown(match):
            tag = match.group(0)
            src = self._attr(tag, "src")
            alt = self._attr(tag, "alt")
            title = self._attr(tag, "title")
            if title:
                return "![" + alt + "](" + src + " \"" + title + "\")"
            return "![" + alt + "](" + src + ")"

        return re.sub(r"<img\b.*?>", to_markdown, html, flags=re.DOTALL)

    def filter_links(self, html):
        """Translate links to Markdown syntax.

        An anchor with both an href and inner text becomes ``[text](href)``;
        any other anchor is reduced to its inner text.
        """
        def to_markdown(match):
            href = self._attr(match.group(1), "href")
            text = match.group(2)
            if href and text:
                return "[" + text + "](" + href + ")"
            return text

        return re.sub(r"(<a(?:\s[^>]*)?>)(.*?)</a>", to_markdown, html,
                      flags=re.DOTALL)

    def filter_embed(self, html):
        """Translate embed tags to Markdown links.

        Tags without a src attribute are left untouched.
        """
        def to_markdown(match):
            src = self._attr(match.group(0), "src")
            if src:
                return "[embed](" + src + ")"
            return match.group(0)

        return re.sub(r"<embed\b.*?>", to_markdown, html, flags=re.DOTALL)

    def filter_abbr(self, html):
        """Format abbr tags, e.g.
        `<abbr title="Hypertext Markup Language">HTML</abbr>` ->
        `HTML[[abbr: Hypertext Markup Language]]`.

        An abbr without a title is reduced to its inner text.
        """
        def expand(match):
            title = self._attr(match.group(1), "title")
            text = match.group(2)
            if title and text:
                return text + "[[abbr: " + title + "]]"
            return text

        return re.sub(r"(<abbr(?:\s[^>]*)?>)(.*?)</abbr>", expand, html,
                      flags=re.DOTALL)

    def filter_time(self, html):
        """Format time tags, e.g.
        `<time datetime="1970-01-01">Today</time>` -> `Today (1970-01-01)`.

        Only elements contained on a single line are matched.
        """
        def expand(match):
            stamp = self._attr(match.group(1), "datetime")
            text = match.group(2)
            if stamp:
                return text + " (" + stamp + ")"
            return text

        return re.sub(r"(<time(?:\s[^>]*)?>)(.*?)</time>", expand, html)

    def filter_ol(self, html):
        """Parse ordered lists. Only single-level lists are currently
        supported.

        Each item is emitted on its own line as ``N. text``.
        """
        def to_markdown(match):
            body = match.group(1).replace("</li>", "")
            # Element 0 is whatever precedes the first <li>; skip it.
            items = re.split(r"<li(?:\s[^>]*)?>", body)[1:]
            lines = [
                str(number) + ". " + item.strip() + "\n"
                for number, item in enumerate(items, start=1)
            ]
            return "\n" + "".join(lines)

        return re.sub(r"<ol(?:\s[^>]*)?>(.*?)</ol>", to_markdown, html,
                      flags=re.DOTALL)

    def filter_ul(self, html):
        """Parse unordered lists. Only single-level lists are currently
        supported (nested lists will be flattened)."""
        # Drop empty items so they do not produce bare "- " bullets.
        txt = re.sub(r"<li(?:\s[^>]*)?>\s*</li>", "", html)
        txt = re.sub(r"<ul(?:\s[^>]*)?>", "\n", txt).replace("</ul>", "")
        txt = re.sub(r"<li(?:\s[^>]*)?>", "- ", txt).replace("</li>", "\n")
        return txt

    def convert(self, html):
        """Run format filters on html string."""
        # Order matters: images are converted before links so that linked
        # images nest correctly, and <ol> items are consumed before the
        # generic <li> handling in filter_ul.
        text = self.filter_strip_tags(html)
        text = self.filter_whitespace(text)
        text = self.filter_format_tags(text)
        text = self.filter_img(text)
        text = self.filter_links(text)
        text = self.filter_embed(text)
        text = self.filter_abbr(text)
        text = self.filter_time(text)
        text = self.filter_ol(text)
        text = self.filter_ul(text)
        return text