# A set of filters to format HTML to text with barebones Markdown-style markup.

import re
|
|
|
|
|
|
class HtmlToFText:
    """Convert HTML to plain text with barebones Markdown-style markup.

    The conversion is a pipeline of small string/regex filters (see
    `convert`). It is not an HTML parser: tags are matched textually, so
    unusual markup (``>`` inside attribute values, nested lists, ...) is only
    handled on a best-effort basis.
    """

    # ``{opening: closing}`` regex fragments. Everything from ``opening`` up
    # to and including the nearest ``closing`` is deleted. This table runs
    # first; dicts keep insertion order, so comments go before attributes.
    strip_tags1 = {
        # Remove comments first, which may wrap around other tags
        "<!--": "-->",
        # Remove classes, ids and extraneous attributes
        " class=\"": "\"", " id=\"": "\"",
    }

    # Same format as ``strip_tags1``; applied second.
    # NOTE(review): openings are prefixes, so e.g. "<head" also matches the
    # start of "<header". Harmless as long as no "</head>" follows it.
    strip_tags2 = {
        # Remove doctype, tags and inner html
        "<!": ">",
        "<applet": "</applet>",
        "<aside": "</aside>",
        "<base": "</base>",
        "<canvas": "</canvas>",
        "<form": "</form>",
        "<button": "</button>",
        "<input": "</input>",
        "<label": "</label>",
        "<head": "</head>",
        "<iframe": "</iframe>",
        "<menu": "</menu>",
        "<nav": "</nav>",
        "<noscript": "</noscript>",
        "<param": "</param>",
        "<progress": "</progress>",
        "<rp": "</rp>",
        "<script": "</script>",
        "<style": "</style>",
        # Remove non-functional empty links after stripping classes/ids
        "<a href=\"#\"": "</a>",
        "<a>": "</a>",
        # Remove the tags themselves but not the inner html
        "<article": ">", "</article": ">",
        "<body": ">", "</body": ">",
        "<div": ">", "</div": ">",
        "<footer": ">", "</footer": ">",
        "<header": ">", "</header": ">",
        "<html": ">", "</html": ">",
        "<main": ">", "</main": ">",
        "<section": ">", "</section": ">",
        "<span": ">", "</span": ">",
        "<title": ">", "</title": ">",
        # Remove currently unsupported tags
        "<center": ">", "</center": ">",
        "<frame": ">", "</frame": ">",
        "<small": ">", "</small": ">",
        # Not stripped for now:
        # "<audio": ">", "</audio": ">",
        # "<video": ">", "</video": ">",
        # "<map": ">", "</map": ">",
    }

    # Literal substrings deleted by `filter_whitespace`.
    # NOTE(review): the space entry is a run of two spaces. A single space
    # must not be listed here: removing every space glues words together and
    # breaks the "<img " / "<a " matching done by later filters. Confirm the
    # intended run length against the upstream file.
    strip_ws = ["\n\n", "\t", "  "]

    # Literal ``{needle: replacement}`` pairs applied before ``format_tags2``.
    format_tags1 = {
        ">\n<": "><",
        "<blockquote>": "\n>",
    }

    # Literal ``{needle: replacement}`` pairs mapping tags to markup.
    format_tags2 = {
        "<address>": "[[address]]\n", "</address>": "\n",
        "<em>": "*", "</em>": "*",
        "<i>": "*", "</i>": "*",
        "<h1>": "\n# ", "</h1>": "\n",
        "<h2>": "\n## ", "</h2>": "\n",
        "<h3>": "\n### ", "</h3>": "\n",
        "<h4>": "\n#### ", "</h4>": "\n",
        "<h5>": "\n##### ", "</h5>": "\n",
        "<h6>": "\n###### ", "</h6>": "\n",
        "<hr>": "\n---\n", "<hr/>": "\n---\n", "<hr />": "\n---\n",
        "<br>": "\n", "<br/>": "\n", "<br />": "\n",
        "<blockquote>": "\n>", "</blockquote>": "\n",
        "<cite>": "**", "</cite>": "**",
        "<code>": "`", "</code>": "`",
        "<del>": "~~", "</del>": "~~",
        "<ins>": "**", "</ins>": "**",
        "<dl>": "\n", "</dl>": "",
        "<dt>": "", "</dt>": ": ",
        "<dd>": "", "</dd>": "\n",
        "<figcaption>": "*", "</figcaption>": "*",
        "<figure>": "Fig. ", "</figure>": "",
        "<mark>": "***", "</mark>": "***",
        "<p>": "\n", "</p>": "\n",
        "<pre>": "\n```\n", "</pre>": "\n```\n",
        "<q>": "«", "</q>": "»",
        "<ruby>": "", "</ruby>": "",
        "<rt>": " (", "</rt>": ")",
        "<s>": "[[~~", "</s>": "~~]]",
        "<strong>": "**", "</strong>": "**",
        "<sub>": "⏝", "</sub>": "⏝",
        "<sup>": "^", "</sup>": "^",
        "<b>": "**", "</b>": "**",
        "<u>": "__", "</u>": "__",
        # HTML entities and their literal characters.
        # NOTE(review): the entity spellings were reconstructed (the keys
        # were no-ops or duplicates otherwise); "&nbsp;" in particular is an
        # assumption -- confirm against the upstream file.
        "&nbsp;": "\n\n", "\u00a0": "\n\n",
        "&#39;": "'",
        "&rsquo;": "'", "&#8217;": "'", "’": "'",
        "&ldquo;": "\"", "&rdquo;": "\"", "“": "\"", "”": "\"",
        "&ndash;": "—", "–": "—",
        "&copy;": "©",
    }

    @staticmethod
    def _attr(tag, name):
        """Return the value of attribute *name* in *tag*, or "" if absent.

        Handles double-quoted, single-quoted and unquoted values; an unquoted
        value ends at the first whitespace or ``>``.
        """
        match = re.search(
            r"\s" + re.escape(name) + r"""=(?:"([^"]*)"|'([^']*)'|([^\s>]+))""",
            tag)
        if match is None:
            return ""
        # Exactly one of the three alternatives took part in the match.
        return next(g for g in match.groups() if g is not None)

    @staticmethod
    def _split_element(element, name):
        """Split ``<name ...>inner</name>`` into (opening tag, inner html)."""
        opening, _, inner = element.partition(">")
        closing = "</" + name + ">"
        # Remove the closing tag as a suffix. str.strip() would instead eat
        # any of its characters from both ends of the text.
        if inner.endswith(closing):
            inner = inner[:-len(closing)]
        return opening + ">", inner

    def filter_strip_tags(self, html):
        """Strip extraneous html tags.

        Applies ``strip_tags1`` then ``strip_tags2``; each entry removes the
        shortest span running from its opening to its closing fragment.
        """
        txt = html
        # Some tags need to be stripped before others
        for table in (self.strip_tags1, self.strip_tags2):
            for opening, closing in table.items():
                txt = re.sub(opening + ".*?" + closing, "", txt,
                             flags=re.DOTALL)
        return txt

    def filter_whitespace(self, html):
        """Strip extra whitespaces often found in dynamically-generated source
        files."""
        txt = html
        for ws in self.strip_ws:
            txt = txt.replace(ws, "")
        # Removing tabs/spaces can bring newlines together; drop those too.
        return txt.replace("\n\n", "")

    def filter_format_tags(self, html):
        """Translate select structure and format-related tags to Markdown-like
        syntax."""
        txt = html
        for table in (self.format_tags1, self.format_tags2):
            for needle, replacement in table.items():
                txt = txt.replace(needle, replacement)
        return txt

    def filter_img(self, html):
        """Translate image tags to Markdown syntax.

        ``<img src="s" alt="a" title="t">`` becomes ``![a](s "t")``; the
        title part is omitted when the tag has no title.
        """
        txt = html
        # [^>]* keeps each match inside a single tag, so <img ...> without a
        # trailing "/>" is accepted and cannot swallow following content.
        for tag in re.findall(r"<img [a-z][^>]*>", txt):
            src = self._attr(tag, "src")
            alt = self._attr(tag, "alt")
            title = self._attr(tag, "title")
            if title:
                md_img = "![" + alt + "](" + src + " \"" + title + "\")"
            else:
                md_img = "![" + alt + "](" + src + ")"
            txt = txt.replace(tag, md_img)
        return txt

    def filter_links(self, html):
        """Translate links to Markdown syntax.

        ``<a href="u">text</a>`` becomes ``[text](u)``. An anchor without an
        href (or without text) is reduced to its inner text.
        """
        txt = html
        for link in re.findall(r"<a [a-z].*?</a>", txt, flags=re.DOTALL):
            opening, text = self._split_element(link, "a")
            # Only the opening tag is searched, so " href=" occurring in the
            # link text cannot be mistaken for the attribute.
            href = self._attr(opening, "href")
            if href and text:
                md_link = "[" + text + "](" + href + ")"
            else:
                md_link = text
            txt = txt.replace(link, md_link)
        return txt

    def filter_embed(self, html):
        """Translate embed tags to Markdown links.

        Tags without a ``src`` attribute are left untouched.
        """
        txt = html
        for tag in re.findall(r"<embed [a-z].*?>", txt, flags=re.DOTALL):
            src = self._attr(tag, "src")
            if src:
                txt = txt.replace(tag, "[embed](" + src + ")")
        return txt

    def filter_abbr(self, html):
        """Format abbr tags, e.g. `<abbr title="Hypertext Markup
        Language">HTML</abbr>` -> `HTML[[abbr: Hypertext Markup Language]]`.

        An abbr without a title (or without text) is reduced to its inner
        text.
        """
        txt = html
        for element in re.findall(r"<abbr [a-z].*?</abbr>", txt,
                                  flags=re.DOTALL):
            opening, text = self._split_element(element, "abbr")
            title = self._attr(opening, "title")
            if title and text:
                abbrev = text + "[[abbr: " + title + "]]"
            else:
                abbrev = text
            txt = txt.replace(element, abbrev)
        return txt

    def filter_time(self, html):
        """Format time tags, e.g. `<time datetime="1970-01-01">Today</time>` ->
        `Today (1970-01-01)`.

        Without a ``datetime`` attribute only the inner text is kept.
        """
        txt = html
        for element in re.findall(r"<time.*?</time>", txt, flags=re.DOTALL):
            opening, text = self._split_element(element, "time")
            stamp = self._attr(opening, "datetime")
            if stamp:
                text += " (" + stamp + ")"
            txt = txt.replace(element, text)
        return txt

    def filter_ol(self, html):
        """Parse ordered lists. Only single-level lists are currently
        supported.

        Each ``<li>`` becomes one ``N. item`` line, numbered from 1.
        """
        txt = html
        for block in re.findall(r"<ol>.*?</ol>", txt, flags=re.DOTALL):
            pieces = block.replace("</li>", "").replace("</ol>", "")
            # pieces.split("<li>")[0] is the "<ol>" prefix, not an item.
            items = pieces.split("<li>")[1:]
            lines = [str(number) + ". " + item.strip() + "\n"
                     for number, item in enumerate(items, 1)]
            txt = txt.replace(block, "\n" + "".join(lines))
        return txt

    def filter_ul(self, html):
        """Parse unordered lists. Only single-level lists are currently
        supported (nested lists will be flattened)."""
        txt = html
        # Loop because removing an empty item can join text into a new tag.
        while ("<ul>" in txt) or ("<li>" in txt):
            txt = txt.replace("<li></li>", "")
            txt = txt.replace("<ul>", "\n").replace("</ul>", "")
            txt = txt.replace("<li>", "- ").replace("</li>", "\n")
        return txt

    def convert(self, html):
        """Run format filters on html string.

        Order matters: tags are stripped and whitespace normalised first,
        simple tags are translated next, then attribute-bearing elements,
        and lists last (ordered before unordered, since `filter_ul` would
        otherwise consume the ``<li>`` items of ordered lists).
        """
        text = self.filter_strip_tags(html)
        text = self.filter_whitespace(text)
        text = self.filter_format_tags(text)
        text = self.filter_img(text)
        text = self.filter_links(text)
        text = self.filter_embed(text)
        text = self.filter_abbr(text)
        text = self.filter_time(text)
        text = self.filter_ol(text)
        text = self.filter_ul(text)
        return text
|