tilde-wiki/tildewiki/compilation.py

205 lines
7.4 KiB
Python

import os
import re
from datetime import datetime
from shutil import copy
from typing import Optional, Callable
from markdown import markdown
from markdown.extensions.toc import TocExtension
# A blank line in a plain-text source; compile_plaintext turns each one into a
# paragraph break.
DOUBLE_NEWLINE_RE = re.compile(r'\n\n', flags=re.MULTILINE|re.DOTALL)

# An <h1> or <h2> element. Group 1 is the heading level, group 2 the heading's
# inner HTML. The opening tag is allowed to carry attributes because
# compile_markdown enables the markdown toc extension, which adds an id="..."
# attribute to headers; a pattern that only accepts a bare <h1> would never
# match the markdown output and page titles would silently stay at the default.
HEADER_TITLE_RE = re.compile(r'<h([12])(?:\s[^>]*)?>(.*?)</h\1>')

# The <title> element of the shared page header.
TITLE_RE = re.compile(r'<title>.*?</title>')

# Absolute links and asset references into the wiki, rewritten by
# relativize_links.
LINK_RE = re.compile(r'href="\/wiki')
SRC_RE = re.compile(r'src="\/wiki')

# Default on_create callback for compile_wiki: accept a path and do nothing.
DEFAULT_ON_CREATE = lambda _: None
def relativize_links(content:str, depth:int) -> str:
    """Given compiled html content, change URLs that start in "/wiki" to be
    relative instead of absolute.

    Depth indicates how many pairs of dots we should use to traverse upward:
    with depth 2, href="/wiki/x" becomes href="../../wiki/x". A depth of zero
    (or less) yields a plain "wiki/..." path relative to the current
    directory. Both href and src attributes are rewritten.
    """
    # URLs always use forward slashes, so the prefix is built by hand rather
    # than with os.path.join: on Windows that would emit backslashes, which are
    # wrong in a URL and are also read as escape sequences by re.sub. Building
    # it this way also copes with depth == 0, where os.path.join() would be
    # called with no arguments and raise TypeError.
    target = '../' * max(depth, 0) + 'wiki'
    href_repl = 'href="{}'.format(target)
    src_repl = 'src="{}'.format(target)
    out = LINK_RE.sub(href_repl, content)
    return SRC_RE.sub(src_repl, out)
def depth_from(root: str, path: str) -> int:
    """Given a root path and a path below it, return the link depth of path.

    The root itself has depth 1 and every directory level below it adds one,
    so root/a is 2 and root/a/b is 3. compile_wiki hands this number to
    relativize_links as the count of ".." segments needed by pages written
    from that directory.
    """
    relpath = os.path.relpath(path, root)
    # Decide "is this the root?" on the normalised relative path rather than by
    # comparing the raw strings, so that equivalent spellings of the root (a
    # trailing slash, a redundant "./") are not counted as one level deeper.
    if relpath == os.curdir:
        return 1
    # relpath is normalised, so it holds exactly one separator between each
    # pair of components.
    return 1 + len(relpath.split(os.sep))
def generate_toc(header_content, articles):
    """Build the HTML content of the table-of-contents page.

    header_content is the compiled page header; its <title> is set to
    "table of contents". articles is a list of dicts with the keys title, href
    and path, where path is the article's directory relative to the articles
    root ('' for articles at the top level).

    Top-level articles are listed first under "unsorted articles". The
    remaining articles are grouped by directory, each group introduced by a
    heading named after the last path component whose level grows with the
    nesting depth. Returns the page content as a string, without a footer.
    """
    toplevel_articles = [a for a in articles if a['path'] == '']
    # sorted() returns a new list, so its result has to be kept; otherwise the
    # sections come out in whatever order the articles were collected and one
    # directory can end up with several headings' worth of scattered entries.
    # Comparing component-wise keeps a directory's subdirectories right after
    # it ('a', 'a/b', 'a-b') instead of in raw string order ('a', 'a-b', 'a/b').
    nested_articles = sorted(
        (a for a in articles if a['path'] != ''),
        key=lambda a: a['path'].split('/'))

    parts = [
        '{}\n'.format(update_title(header_content, 'table of contents')),
        '<h1>Table of Contents</h1>\n',
        '<h2>unsorted articles</h2>\n<ul>\n',
    ]
    parts.extend(
        '<li><a href="{}">{}</a></li>\n'.format(a['href'], a['title'])
        for a in toplevel_articles)

    seen = set()
    for article in nested_articles:
        path = article['path']
        if path not in seen:
            # First article of a new directory: close the previous list and
            # open a new section.
            seen.add(path)
            components = path.split('/')
            # HTML has no heading element beyond <h6>, so deeper nesting
            # reuses the lowest level.
            hlvl = min(len(components) + 1, 6)
            parts.append('</ul>')
            parts.append('<h{0}>{1}</h{0}>'.format(hlvl, components[-1]))
            parts.append('<ul>')
        parts.append('<li><a href="{href}">{title}</a></li>'.format(**article))
    parts.append('</ul>')
    return ''.join(parts)
def compile_wiki(source_path: str,
                 dest_path: str,
                 on_create: Callable[[str], None]=DEFAULT_ON_CREATE) -> None:
    """Given a source path (presumably a git repository) and a destination
    path, compile the files found in {source_path}/src/articles to
    {dest_path}/, mirroring the directory layout.

    Besides the articles, this copies src/logo.png and src/main.css into
    dest_path and writes a toc.html listing every compiled article. Every
    page is wrapped in the compiled src/header.md and src/footer.md, with a
    "last compiled" timestamp prepended to the footer.

    This function does not clear or create dest_path itself. It must already
    exist, and subdirectories are created with os.mkdir, so a leftover
    directory from a previous run raises FileExistsError. Files that already
    exist are overwritten.

    If passed, on_create will be called with the path of each directory and
    each compiled file (including toc.html) created by the compiler. The
    default is to take no action.
    """
    last_compiled = '<hr><p><em>last compiled: {}</em></p>'.format(datetime.utcnow())
    header_content = compile_markdown(os.path.join(source_path, 'src/header.md'))
    footer_content = last_compiled + compile_markdown(os.path.join(source_path, 'src/footer.md'))
    logo_path = os.path.join(source_path, 'src/logo.png')
    css_path = os.path.join(source_path, 'src/main.css')
    articles_root = os.path.join(source_path, 'src/articles')
    articles = []

    copy(logo_path, dest_path)
    copy(css_path, dest_path)

    for source_root, dirs, files in os.walk(articles_root):
        depth = depth_from(articles_root, source_root)
        # Directory of the current articles relative to the articles root
        # ('' at the top level). relpath strips only the leading root, whereas
        # str.replace would also remove any later occurrence of the root
        # string inside the path and relied on '/' being the separator.
        relative_root = os.path.relpath(source_root, articles_root)
        current_suffix = '' if relative_root == os.curdir else relative_root
        dest_root = os.path.join(dest_path, current_suffix)

        # os.walk is top-down, so creating the children here guarantees that
        # dest_root exists by the time its own files are written.
        for directory in dirs:
            dir_path = os.path.join(dest_root, directory)
            os.mkdir(dir_path)
            on_create(dir_path)

        for source_filename in files:
            if source_filename.startswith('.'):
                continue
            source_file_path = os.path.join(source_root, source_filename)
            output = compile_source_file(
                source_file_path,
                header_content,
                footer_content)
            output = relativize_links(output, depth)
            # Drop only the final extension, so that "foo.bar.md" becomes
            # "foo.bar.html" instead of being cut down to "foo.html" (and
            # colliding with any other "foo.*" source in the same directory).
            stem = os.path.splitext(source_filename)[0]
            dest_filename = stem + '.html'
            articles.append({
                'title': stem,
                'href': os.path.join(current_suffix, dest_filename),
                'path': current_suffix})
            final_path = os.path.join(dest_root, dest_filename)
            with open(final_path, 'w') as f:
                f.write(output)
            on_create(final_path)

    toc_content = generate_toc(header_content, articles)
    toc_path = os.path.join(dest_path, 'toc.html')
    with open(toc_path, 'w') as f:
        f.write(relativize_links(toc_content, 1))
        # The footer is written as-is; only the toc body has its links
        # relativized.
        f.write(footer_content)
    on_create(toc_path)
def slurp(file_path:str) -> str:
    """Open the file at file_path in text mode and return everything in it."""
    with open(file_path, 'r') as handle:
        return handle.read()
def compile_source_file(source_file_path:str, header_content:str, footer_content:str) -> str:
    """Compile one source file into a complete page.

    The compiler is chosen from the file name: ".md" files go through
    compile_markdown, ".txt" files through compile_plaintext, and anything
    else is read verbatim with slurp. If the compiled content opens with a
    title header, the header's <title> is updated to match. The result is the
    header, the content and the footer joined by newlines.

    Raises ValueError if source_file_path is not an absolute path.
    """
    if not os.path.isabs(source_file_path):
        raise ValueError(
            '{} is not an absolute path.'.format(source_file_path))

    # Pick the first compiler whose suffix matches; unrecognised files are
    # passed through without conversion.
    known_compilers = (
        ('.md', compile_markdown),
        ('.txt', compile_plaintext),
    )
    for suffix, candidate in known_compilers:
        if source_file_path.endswith(suffix):
            render = candidate
            break
    else:
        render = slurp

    body = render(source_file_path)
    page_title = extract_title(body)
    page_header = (header_content if page_title is None
                   else update_title(header_content, page_title))
    return '{}\n{}\n{}'.format(page_header, body, footer_content)
def update_title(content:str, title:str) -> str:
    """Given a chunk of HTML, replace the text of every <title> element with
    the given title and return the result. If there is no title element, the
    content is returned unmodified."""
    replacement = '<title>{}</title>'.format(title)
    # Pass a callable so the title is inserted literally. A plain replacement
    # string is treated by re.sub as a template, so a backslash in the title
    # would be expanded as an escape or group reference (and can raise
    # re.error).
    return TITLE_RE.sub(lambda _match: replacement, content)
def extract_title(content:str) -> Optional[str]:
    """Return the page title found at the start of the content's first line.

    The first line must begin with a header matched by HEADER_TITLE_RE; the
    header's inner text is returned. Returns None when the first line does
    not start with such a header."""
    opening_line, _, _ = content.partition('\n')
    found = HEADER_TITLE_RE.match(opening_line)
    # Group 1 is the heading level, group 2 the heading text.
    return None if found is None else found.group(2)
def compile_markdown(file_path:str) -> str:
    """Read the markdown file at file_path and return it rendered as HTML5."""
    source = slurp(file_path)
    # Allow tables of contents, leaving <h1> tags (page titles) out of them.
    toc = TocExtension(toc_depth='2-6')
    return markdown(source, output_format='html5', extensions=[toc])
def compile_plaintext(file_path:str) -> str:
    """Read the plain-text file at file_path and return it as HTML paragraphs.

    The whole text is wrapped in a single <p> element, and every blank line
    (two consecutive newlines) becomes a paragraph break. The text itself is
    inserted as-is."""
    paragraphs = DOUBLE_NEWLINE_RE.sub('</p><p>', slurp(file_path))
    return ''.join(('<p>\n', paragraphs, '\n</p>\n'))