# tilde-wiki/tildewiki/compilation.py

import os
import re
from datetime import datetime
from shutil import copy
from typing import Optional, Callable

from markdown import markdown
from markdown.extensions.toc import TocExtension

# Paragraph separator in plaintext articles: two consecutive newlines.
DOUBLE_NEWLINE_RE = re.compile(r'\n\n', flags=re.MULTILINE|re.DOTALL)
# An <h1> or <h2> element. Group 1 is the heading level, group 2 the heading
# text. Attributes on the opening tag are tolerated: Python-Markdown's toc
# extension adds an id="..." attribute to headings, and a pattern that only
# accepts the bare `<h1>` form would never match such output.
HEADER_TITLE_RE = re.compile(r'<h([12])(?:\s[^>]*)?>(.*?)</h\1>')
# A <title> element that sits on a single line (`.` does not cross newlines).
TITLE_RE = re.compile(r'<title>.*?</title>')
# Absolute href/src attributes pointing into /wiki; rewritten to relative
# form by relativize_links.
LINK_RE = re.compile(r'href="\/wiki')
SRC_RE = re.compile(r'src="\/wiki')

# No-op callback used when compile_wiki is called without on_create.
DEFAULT_ON_CREATE = lambda _: None

def relativize_links(content:str, depth:int) -> str:
    """Rewrite absolute "/wiki" URLs in compiled HTML as relative URLs.

    Every `href="/wiki...` and `src="/wiki...` attribute prefix in content is
    replaced so that it climbs `depth` directories (one "../" per level)
    before descending into "wiki". A depth of zero or less yields a plain
    "wiki" prefix with no upward traversal.

    Returns the rewritten HTML; content itself is not modified.
    """
    # URLs always use forward slashes, so the prefix is built by hand rather
    # than with os.path.join, which is platform-dependent and cannot be
    # called with zero components.
    prefix = '../' * max(depth, 0)
    href_repl = 'href="{}wiki'.format(prefix)
    src_repl = 'src="{}wiki'.format(prefix)
    # Callable replacements are inserted literally; a plain string would be
    # parsed as a regex replacement template.
    out = LINK_RE.sub(lambda _: href_repl, content)
    return SRC_RE.sub(lambda _: src_repl, out)

def depth_from(root: str, path: str) -> int:
    """Return the 1-based depth of path relative to root.

    The root itself has depth 1, a direct child has depth 2, a grandchild
    depth 3, and so on. compile_wiki passes this value to relativize_links
    as the number of ".." segments to emit.

    Paths are compared after normalisation, so spellings of the same
    directory that differ only by a trailing separator or redundant "."
    components are treated as equal.
    """
    # Fast path: also covers inputs os.path.relpath would reject (e.g. '').
    if root == path:
        return 1
    relpath = os.path.relpath(path, root)
    if relpath == os.curdir:
        # Same directory, just spelled differently.
        return 1
    # relpath is normalised, so every separator marks one more level.
    return len(relpath.split(os.sep)) + 1

def generate_toc(header_content, articles):
    """Build the HTML body of the table-of-contents page.

    header_content is the compiled page header; its <title> is replaced with
    'table of contents'. articles is a list of dicts with the keys 'title',
    'href' and 'path', where 'path' is the '/'-separated directory of the
    article relative to the articles root ('' for top-level articles).

    Top-level articles are listed first under "unsorted articles". The
    remaining articles are grouped by directory, each group introduced by a
    heading whose level reflects the directory's nesting depth.

    Returns the generated HTML as a string. The footer is not included.
    """
    toplevel_articles = [a for a in articles if a['path'] == '']
    # sorted() returns a new list, so its result must be kept. Sorting on the
    # path components (rather than the raw string) keeps each directory next
    # to its own subdirectories; the sort is stable, so articles within one
    # directory keep their incoming order.
    nested_articles = sorted(
        (a for a in articles if a['path'] != ''),
        key=lambda a: a['path'].split('/'))

    parts = [
        '{}\n'.format(update_title(header_content, 'table of contents')),
        '<h1>Table of Contents</h1>\n',
        '<h2>unsorted articles</h2>\n<ul>\n',
    ]
    for a in toplevel_articles:
        parts.append('<li><a href="{}">{}</a></li>\n'.format(a['href'], a['title']))

    current_path = None
    for article in nested_articles:
        path = article['path']
        if path != current_path:
            # Entering a new directory: close the previous list and open a
            # new section named after the last path component.
            components = path.split('/')
            # HTML defines no heading beyond <h6>, so deep nesting is capped.
            hlvl = min(len(components) + 1, 6)
            parts.append('</ul>')
            parts.append(f'<h{hlvl}>' + components[-1] + f'</h{hlvl}>')
            parts.append('<ul>')
            current_path = path
        parts.append('<li><a href="{href}">{title}</a></li>'.format(**article))

    parts.append('</ul>')
    return ''.join(parts)

def compile_wiki(source_path: str,
                 dest_path: str,
                 on_create: Callable[[str], None]=DEFAULT_ON_CREATE) -> None:
    """Compile the wiki sources under source_path into dest_path.

    Reads {source_path}/src/header.md and {source_path}/src/footer.md,
    copies {source_path}/src/logo.png and {source_path}/src/main.css into
    dest_path, compiles every non-hidden file found under
    {source_path}/src/articles into an .html page at the mirrored location
    under dest_path, and finally writes {dest_path}/toc.html.

    EXISTING FILES IN {dest_path}/ ARE OVERWRITTEN!

    Be absolutely sure you know what you are doing when you call this ^_^

    This function does not itself empty dest_path, and os.mkdir raises
    FileExistsError if an article subdirectory is already present there.
    NOTE(review): callers presumably clear dest_path first -- confirm.

    on_create is called with the path of each directory created, each
    article page written, and toc.html. It is not called for the copied
    logo and stylesheet. The default is to take no action.
    """
    last_compiled = '<hr><p><em>last compiled: {}</em></p>'.format(datetime.utcnow())

    header_content = compile_markdown(os.path.join(source_path, 'src/header.md'))
    footer_content = last_compiled + compile_markdown(os.path.join(source_path, 'src/footer.md'))
    logo_path = os.path.join(source_path, 'src/logo.png')
    css_path = os.path.join(source_path, 'src/main.css')

    articles_root = os.path.join(source_path, 'src/articles')

    # One dict per compiled page (title, href, path); feeds generate_toc.
    articles = []

    copy(logo_path, dest_path)
    copy(css_path, dest_path)

    for source_root, dirs, files in os.walk(articles_root):
        depth = depth_from(articles_root, source_root)
        # Directory of this walk step relative to the articles root; '' for
        # the root itself. relpath strips only the leading root, whereas
        # str.replace would remove every occurrence of the root string.
        rel_root = os.path.relpath(source_root, articles_root)
        current_suffix = '' if rel_root == os.curdir else rel_root

        dest_root = os.path.join(dest_path, current_suffix)

        # Create the mirror of each subdirectory before os.walk descends
        # into it, so the files written on later iterations have a home.
        for directory in dirs:
            dir_path = os.path.join(dest_root, directory)
            os.mkdir(dir_path)
            on_create(dir_path)

        for source_filename in files:
            if source_filename.startswith('.'):
                continue
            source_file_path = os.path.join(source_root, source_filename)
            output = compile_source_file(
                source_file_path,
                header_content,
                footer_content)
            output = relativize_links(output, depth)
            # Everything after the first '.' is dropped, so 'a.b.md' becomes
            # 'a.html'.
            dest_filename = source_filename.split('.')[0] + '.html'
            articles.append({
                'title': dest_filename.split('.')[0],
                'href': os.path.join(current_suffix, dest_filename),
                'path': current_suffix})
            final_path = os.path.join(dest_root, dest_filename)
            with open(final_path, 'w') as f:
                f.write(output)
            on_create(final_path)

    toc_content = generate_toc(header_content, articles)
    toc_path = os.path.join(dest_path, 'toc.html')
    with open(toc_path, 'w') as f:
        # The footer is relativized together with the body, exactly as for
        # article pages, so "/wiki" links in the footer work on the TOC too.
        f.write(relativize_links(toc_content + footer_content, 1))
    on_create(toc_path)

def slurp(file_path:str) -> str:
    """Read the text file at file_path and return its entire contents.

    The file is opened in text mode with the platform's default encoding.
    """
    with open(file_path, 'r') as handle:
        return handle.read()

def compile_source_file(source_file_path:str, header_content:str, footer_content:str) -> str:
    """Compile one source file into a complete page.

    The compiler is chosen from the file name's suffix: '.md' files go
    through compile_markdown, '.txt' files through compile_plaintext, and
    anything else is read verbatim with slurp. If the compiled content opens
    with a heading, that heading replaces the <title> of header_content.

    Returns header, content and footer joined by newlines.

    Raises ValueError if source_file_path is not absolute.
    """
    if not os.path.isabs(source_file_path):
        raise ValueError(
            '{} is not an absolute path.'.format(source_file_path))

    # Files we do not know how to convert are passed through unchanged.
    compiler = slurp
    for suffix, candidate in (('.md', compile_markdown),
                              ('.txt', compile_plaintext)):
        if source_file_path.endswith(suffix):
            compiler = candidate
            break

    body = compiler(source_file_path)

    page_title = extract_title(body)
    if page_title is None:
        header = header_content
    else:
        header = update_title(header_content, page_title)

    return f'{header}\n{body}\n{footer_content}'

def update_title(content:str, title:str) -> str:
    """Replace every <title>...</title> element in content with the given title.

    The title is inserted literally: backslashes and sequences such as "\\1"
    in it are not interpreted as regex escapes or group references. If
    content has no title element it is returned unmodified.
    """
    replacement = '<title>{}</title>'.format(title)
    # A callable replacement bypasses re's replacement-template parsing,
    # which would otherwise mangle or reject titles containing backslashes.
    return TITLE_RE.sub(lambda _: replacement, content)

def extract_title(content:str) -> Optional[str]:
    """Return the heading text found at the start of content's first line.

    Only the first line is inspected, and it must begin with a heading
    matched by HEADER_TITLE_RE. Returns None when there is no such heading.
    """
    opening_line = content.partition('\n')[0]
    found = HEADER_TITLE_RE.match(opening_line)
    # Group 1 is the heading level; group 2 is the text we want.
    return found.group(2) if found is not None else None

def compile_markdown(file_path:str) -> str:
    """Read the markdown file at file_path and return it compiled to HTML5."""
    # Allow tables of contents and exclude <h1> tags (page titles)
    toc_extension = TocExtension(toc_depth='2-6')
    source_text = slurp(file_path)
    return markdown(
        source_text,
        output_format='html5',
        extensions=[toc_extension])

def compile_plaintext(file_path:str) -> str:
    """Read the plaintext file at file_path and wrap it in HTML paragraphs.

    Each pair of consecutive newlines becomes a paragraph boundary and the
    whole text is enclosed in one outer <p>...</p>. The text is inserted
    as-is; no HTML escaping is performed.
    """
    paragraphs = DOUBLE_NEWLINE_RE.sub('</p><p>', slurp(file_path))
    return '<p>\n' + paragraphs + '\n</p>\n'