scripts/ftg/formatter.py

273 lines
11 KiB
Python

# A set of filters to format HTML to text with barebones Markdown-style markup.
import re
class HtmlToFText:
    """Convert an HTML string to text with barebones Markdown-style markup.

    The conversion is a pipeline of independent string filters (see
    `convert`). Each filter takes a string and returns a new string, so
    filters can also be called individually.

    The lookup tables below are class attributes. Their insertion order
    matters: entries are applied in the order they are listed.
    """

    # First stripping pass: "opening pattern" -> "closing pattern".
    # Everything from the opening to the closing pattern (inclusive) is
    # removed. Both sides are used as regular-expression fragments.
    strip_tags1 = {
        # Remove comments first, which may wrap around other tags
        "<!--": "-->",
        # Remove classes, ids and extraneous attributes
        " class=\"": "\"", " id=\"": "\"",
    }

    # Second stripping pass, same format as `strip_tags1`.
    # NOTE: keys are matched as plain prefixes, so e.g. "<head" also matches
    # the beginning of "<header".
    strip_tags2 = {
        # Remove doctype, tags and inner html
        "<!": ">",
        "<applet": "</applet>",
        "<aside": "</aside>",
        "<base": "</base>",
        "<canvas": "</canvas>",
        "<form": "</form>",
        "<button": "</button>",
        "<input": "</input>",
        "<label": "</label>",
        "<head": "</head>",
        "<iframe": "</iframe>",
        "<menu": "</menu>",
        "<nav": "</nav>",
        "<noscript": "</noscript>",
        "<param": "</param>",
        "<progress": "</progress>",
        "<rp": "</rp>",
        "<script": "</script>",
        "<style": "</style>",
        # Remove non-functional empty links after stripping classes/ids
        "<a href=\"#\"": "</a>",
        "<a>": "</a>",
        # Remove the tags themselves but not the inner html
        "<article": ">", "</article": ">",
        "<body": ">", "</body": ">",
        "<div": ">", "</div": ">",
        "<footer": ">", "</footer": ">",
        "<header": ">", "</header": ">",
        "<html": ">", "</html": ">",
        "<main": ">", "</main": ">",
        "<section": ">", "</section": ">",
        "<span": ">", "</span": ">",
        "<title": ">", "</title": ">",
        # Remove currently unsupported tags
        "<center": ">", "</center": ">",
        "<frame": ">", "</frame": ">",
        "<small": ">", "</small": ">",
        # "<audio": ">", "</audio": ">",
        # "<video": ">", "</video": ">",
        # "<map": ">", "</map": ">",
    }

    # Literal substrings deleted by `filter_whitespace`.
    # The last entry is a run of two spaces (indentation), NOT a single
    # space: deleting single spaces would glue words together and break the
    # "<a ", "<img " and "<embed " patterns used by the later filters.
    strip_ws = ["\n\n", "\t", "  "]

    # Literal replacements applied before `format_tags2`.
    format_tags1 = {
        ">\n<": "><",
        "<blockquote>": "\n>",
    }

    # Literal tag/entity -> Markdown-like replacement.
    format_tags2 = {
        "<address>": "[[address]]\n", "</address>": "\n",
        "<em>": "*", "</em>": "*",
        "<i>": "*", "</i>": "*",
        "<h1>": "\n# ", "</h1>": "\n",
        "<h2>": "\n## ", "</h2>": "\n",
        "<h3>": "\n### ", "</h3>": "\n",
        "<h4>": "\n#### ", "</h4>": "\n",
        "<h5>": "\n##### ", "</h5>": "\n",
        "<h6>": "\n###### ", "</h6>": "\n",
        "<hr>": "\n---\n", "<hr/>": "\n---\n", "<hr />": "\n---\n",
        "<br>": "\n", "<br/>": "\n", "<br />": "\n",
        "<blockquote>": "\n>", "</blockquote>": "\n",
        "<cite>": "**", "</cite>": "**",
        "<code>": "`", "</code>": "`",
        "<del>": "~~", "</del>": "~~",
        "<ins>": "**", "</ins>": "**",
        "<dl>": "\n", "</dl>": "",
        "<dt>": "", "</dt>": ": ",
        "<dd>": "", "</dd>": "\n",
        "<figcaption>": "*", "</figcaption>": "*",
        "<figure>": "Fig. ", "</figure>": "",
        "<mark>": "***", "</mark>": "***",
        "<p>": "\n", "</p>": "\n",
        "<pre>": "\n```\n", "</pre>": "\n```\n",
        "<q>": "«", "</q>": "»",
        "<ruby>": "", "</ruby>": "",
        "<rt>": " (", "</rt>": ")",
        "<s>": "[[~~", "</s>": "~~]]",
        "<strong>": "**", "</strong>": "**",
        "<sub>": "", "</sub>": "",
        "<sup>": "^", "</sup>": "^",
        "<b>": "**", "</b>": "**",
        "<u>": "__", "</u>": "__",
        "&nbsp;": "\n\n",
        "&#39;": "'",
        # The original table listed "&rsquo;" twice; the first entry was
        # evidently meant to be the left single quote.
        "&lsquo;": "'", "&rsquo;": "'",
        "&ldquo;": "\"", "&rdquo;": "\"",
        # NOTE(review): the en dash is dropped entirely, not replaced by a
        # dash character -- confirm this is intended.
        "&ndash;": "",
        "&copy;": "©",
    }

    # Element patterns used by the filters below. Group 1 is always the raw
    # attribute text of the opening tag; group 2 (when present) is the inner
    # content of the element.
    _IMG_RE = re.compile(r"<img\s+([a-z][^>]*?)\s*/?>")
    _LINK_RE = re.compile(r"<a\s+([a-z][^>]*)>(.*?)</a>", re.DOTALL)
    _EMBED_RE = re.compile(r"<embed\s+([a-z][^>]*?)\s*/?>")
    _ABBR_RE = re.compile(r"<abbr\b([^>]*)>(.*?)</abbr>", re.DOTALL)
    _TIME_RE = re.compile(r"<time\b([^>]*)>(.*?)</time>", re.DOTALL)
    _OL_RE = re.compile(r"<ol>(.*?)</ol>", re.DOTALL)

    @staticmethod
    def _attr_value(attr_text, name):
        """Return the value of attribute `name` found in `attr_text`.

        `attr_text` is the attribute part of an opening tag. Double-quoted,
        single-quoted and unquoted values are supported. The attribute name
        must be preceded by whitespace or the start of the string, so that
        e.g. `src` does not match inside `data-src`. Returns "" when the
        attribute is absent.
        """
        pattern = (r"(?:^|\s)" + re.escape(name)
                   + r"""\s*=\s*(?:"([^"]*)"|'([^']*)'|([^\s"'>]+))""")
        match = re.search(pattern, attr_text)
        if match is None:
            return ""
        # Exactly one of the three alternatives captured the value.
        return next((g for g in match.groups() if g is not None), "")

    def filter_strip_tags(self, html):
        """Strip extraneous html tags.

        Applies `strip_tags1` first, then `strip_tags2`; for every entry the
        shortest span from the opening to the closing pattern is deleted
        (newlines included).
        """
        txt = html
        # Some tags need to be stripped before others
        for table in (self.strip_tags1, self.strip_tags2):
            for opening, closing in table.items():
                txt = re.sub(opening + ".*?" + closing, "", txt,
                             flags=re.DOTALL)
        return txt

    def filter_whitespace(self, html):
        """Strip extra whitespaces often found in dynamically-generated
        source files.

        Every substring listed in `strip_ws` is deleted. Single spaces and
        single newlines are kept.
        """
        txt = html
        for ws in self.strip_ws:
            txt = txt.replace(ws, "")
        # Removing tabs/indentation can bring newlines next to each other,
        # so blank lines are removed once more.
        return txt.replace("\n\n", "")

    def filter_format_tags(self, html):
        """Translate select structure and format-related tags to
        Markdown-like syntax, using `format_tags1` then `format_tags2`."""
        txt = html
        for table in (self.format_tags1, self.format_tags2):
            for tag, replacement in table.items():
                txt = txt.replace(tag, replacement)
        return txt

    def filter_img(self, html):
        """Translate image tags to Markdown syntax.

        `<img src="S" alt="A" title="T"/>` becomes `![A](S "T")`; without a
        title it becomes `![A](S)`. Both `<img ...>` and `<img ... />` are
        recognised, and attributes are read per image (a missing attribute
        never inherits the value of a previous image).
        """
        def to_markdown(match):
            attr_text = match.group(1)
            src = self._attr_value(attr_text, "src")
            alt = self._attr_value(attr_text, "alt")
            title = self._attr_value(attr_text, "title")
            if title:
                return "![" + alt + "](" + src + " \"" + title + "\")"
            return "![" + alt + "](" + src + ")"

        return self._IMG_RE.sub(to_markdown, html)

    def filter_links(self, html):
        """Translate links to Markdown syntax.

        `<a href="URL">text</a>` becomes `[text](URL)`. An anchor without a
        usable href (or without text) is reduced to its inner text instead
        of being replaced by data from a previous link.
        """
        def to_markdown(match):
            href = self._attr_value(match.group(1), "href")
            label = match.group(2).strip()
            if href and label:
                return "[" + label + "](" + href + ")"
            return label

        return self._LINK_RE.sub(to_markdown, html)

    def filter_embed(self, html):
        """Translate embed tags to Markdown links.

        `<embed src="URL">` becomes `[embed](URL)`. Embed tags without a src
        attribute are left untouched.
        """
        def to_markdown(match):
            src = self._attr_value(match.group(1), "src")
            if src:
                return "[embed](" + src + ")"
            return match.group(0)

        return self._EMBED_RE.sub(to_markdown, html)

    def filter_abbr(self, html):
        """Format abbr tags, e.g. `<abbr title="Hypertext Markup
        Language">HTML</abbr>` -> `HTML [[abbr: Hypertext Markup Language]]`.

        An abbr element without a title is reduced to its inner text.
        """
        def to_text(match):
            title = self._attr_value(match.group(1), "title")
            abbr = match.group(2).strip()
            if title and abbr:
                return abbr + " [[abbr: " + title + "]]"
            return abbr

        return self._ABBR_RE.sub(to_text, html)

    def filter_time(self, html):
        """Format time tags, e.g. `<time datetime="1970-01-01">Today</time>`
        -> `Today (1970-01-01)`.

        Without a datetime attribute only the inner text is kept.
        """
        def to_text(match):
            stamp = self._attr_value(match.group(1), "datetime")
            label = match.group(2).strip()
            if stamp:
                return label + " (" + stamp + ")"
            return label

        return self._TIME_RE.sub(to_text, html)

    def filter_ol(self, html):
        """Parse ordered lists. Only single-level lists are currently
        supported.

        Each non-empty `<li>` becomes its own `N. item` line, numbered from
        1; the list is preceded by a newline.
        """
        def to_markdown(match):
            # Text before the first <li> is discarded, as is every </li>.
            chunks = match.group(1).replace("</li>", "").split("<li>")[1:]
            items = [chunk.strip() for chunk in chunks]
            lines = [
                str(number) + ". " + item + "\n"
                for number, item in enumerate(filter(None, items), start=1)
            ]
            return "\n" + "".join(lines)

        return self._OL_RE.sub(to_markdown, html)

    def filter_ul(self, html):
        """Parse unordered lists. Only single-level lists are currently
        supported (nested lists will be flattened).

        Empty items are dropped, every remaining `<li>` becomes a `- item`
        line. Text containing neither `<ul>` nor `<li>` is returned as is.
        """
        txt = html
        # One pass removes every occurrence, so no loop is needed.
        if ("<ul>" in txt) or ("<li>" in txt):
            txt = txt.replace("<li></li>", "")
            txt = txt.replace("<ul>", "\n").replace("</ul>", "")
            txt = txt.replace("<li>", "- ").replace("</li>", "\n")
        return txt

    def convert(self, html):
        """Run format filters on html string.

        The order matters: tags are stripped and whitespace normalised
        before any markup is translated, and ordered lists are handled
        before the generic `<li>` handling of unordered lists.
        """
        text = self.filter_strip_tags(html)
        text = self.filter_whitespace(text)
        text = self.filter_format_tags(text)
        text = self.filter_img(text)
        text = self.filter_links(text)
        text = self.filter_embed(text)
        text = self.filter_abbr(text)
        text = self.filter_time(text)
        text = self.filter_ol(text)
        text = self.filter_ul(text)
        return text