#!/usr/bin/env python
"""Convert the Glossatory archive from an ActivityPub collection to gemini.

Reads ``outbox.json`` from an ActivityPub archive directory, copies the first
media attachment of every ``Create`` activity into ``OUTPUT/YYYY/MM/DD/`` and
writes gemtext ``index.gmi`` pages for the archive root, each year and each
month.

Usage:
    script -a ARCHIVE -o OUTPUT -c CONFIG [-t TEXT] [-d]

The config file is JSON with these keys:
    url_re     regular expression matched against each attachment URL; its
               first group is used as the media path relative to ARCHIVE.
    title_res  optional list of regular expressions tried in order against
               the attachment's alt text; the groups of the first match are
               joined with spaces to form the link title.
"""

import argparse
import datetime
import json
import re
import sys
from pathlib import Path
from shutil import copy

MNAMES = {
    "01": "January",
    "02": "February",
    "03": "March",
    "04": "April",
    "05": "May",
    "06": "June",
    "07": "July",
    "08": "August",
    "09": "September",
    "10": "October",
    "11": "November",
    "12": "December",
}

# Heading of the top-level index page.
SITE_TITLE = "Glossatory"
# Absolute path of the archive root, used for the "up" link on month pages.
SITE_ROOT = "/glossatory"


class MediaPost:
    """A single status reduced to its date, media file and title."""

    def __init__(self, year, month, day, file, title, title_res=None):
        """Store the post fields and, if patterns are given, parse the title.

        Args:
            year, month, day: date components as strings (they are used as
                path segments by ``copy_image``).
            file: path of the media file inside the archive.
            title: alt text of the media item.
            title_res: optional iterable of compiled regular expressions
                used by ``try_parse_title``.
        """
        self.year = year
        self.month = month
        self.day = day
        self.file = file
        self.fname = Path(file).name
        self.title = title
        self.title_res = title_res
        # Not populated anywhere in this file; kept for compatibility.
        self.defn = ""
        self.desc = ""
        if self.title_res:
            self.try_parse_title()

    def __str__(self):
        return f"{self.year}-{self.month}-{self.day}: {self.file}"

    def copy_image(self, root):
        """Copy the media file to ``root/year/month/day/`` unless present.

        The target directory is created if needed. A missing source file is
        reported on stderr rather than raised, so one absent image does not
        abort the whole conversion.
        """
        target_dir = Path(root) / self.year / self.month / self.day
        target = target_dir / self.fname
        if target.exists():
            return
        target_dir.mkdir(parents=True, exist_ok=True)
        try:
            copy(self.file, target)
        except FileNotFoundError:
            print(f"Image file missing: {self}", file=sys.stderr)

    def try_parse_title(self):
        """Replace the title with the joined groups of the first match.

        Patterns in ``title_res`` are tried in order with ``match``. Groups
        that did not participate in the match are skipped. If nothing
        matches, the title is left unchanged and a message goes to stderr.
        """
        for pattern in self.title_res:
            if m := pattern.match(self.title):
                self.title = " ".join(g for g in m.groups() if g is not None)
                return
        print(f"{self.file} Couldn't match alt text {self.title}", file=sys.stderr)


def process_post(cf, archive, obj, debug=False):
    """Build a MediaPost from one ActivityPub object.

    Args:
        cf: config mapping; ``cf["url_re"]`` must be a compiled pattern and
            ``cf["title_res"]`` (optional) a list of compiled patterns.
        archive: archive root directory.
        obj: the ``object`` of a ``Create`` activity.
        debug: when true, dump ``obj`` to stderr.

    Returns:
        A MediaPost for the first attachment of ``obj``.

    Raises:
        ValueError: if the object has no attachments, the date cannot be
            parsed, or the attachment URL does not match ``url_re``.
        KeyError: if ``published`` or the attachment ``url`` is missing.
    """
    if debug:
        print(f"Processing {obj}", file=sys.stderr)
    # Only the leading YYYY-MM-DD part of the timestamp is used.
    date = datetime.datetime.strptime(obj["published"][:10], "%Y-%m-%d")
    attachments = obj.get("attachment") or []
    if not attachments:
        raise ValueError('No attachments on this status')
    if len(attachments) > 1:
        status_id = obj.get("id")
        n = len(attachments)
        print(
            f"Warning: only one media item copied from post {status_id} which has {n}",
            file=sys.stderr,
        )
    attachment = attachments[0]
    url = attachment["url"]
    m = cf["url_re"].match(url)
    if not m:
        raise ValueError(f"Couldn't match url {url}")
    # NOTE: group 1 must be a relative path; pathlib discards `archive` when
    # the right-hand operand is absolute.
    file = Path(archive) / m.group(1)
    # Alt text may be absent or null; fall back to an empty title instead of
    # failing inside the regex match.
    alt = attachment.get("name") or ""
    return MediaPost(
        f"{date.year}",
        f"{date.month:02}",
        f"{date.day:02}",
        file,
        alt,
        cf.get("title_res"),
    )


def ensure_dir(gmdir):
    """Create directory ``gmdir`` (and its parents) if it does not exist."""
    Path(gmdir).mkdir(parents=True, exist_ok=True)


def load_colophon(cfile):
    """Return the contents of ``cfile``, or None when no file is given."""
    if not cfile:
        return None
    with open(cfile, "r", encoding="utf-8") as cfh:
        return cfh.read()


def load_config(config):
    """Load the JSON config and compile its regular expressions.

    ``url_re`` is required; ``title_res`` is an optional list. Both are
    replaced in the returned mapping by their compiled patterns.
    """
    with open(config, "r", encoding="utf-8") as cfh:
        cf = json.load(cfh)
    cf["url_re"] = re.compile(cf["url_re"])
    if "title_res" in cf:
        cf["title_res"] = [re.compile(r) for r in cf["title_res"]]
    return cf


def _one_line(text):
    """Collapse all whitespace runs in ``text`` to single spaces."""
    return " ".join(str(text).split())


def write_gemfile(gmdir, colophon, title, items):
    """Write ``gmdir/index.gmi`` with a heading and one link line per item.

    Args:
        gmdir: directory to write into; created if missing.
        colophon: optional text placed before the heading.
        title: text of the ``#`` heading.
        items: iterable of ``(link, text)`` pairs.
    """
    gmdir = Path(gmdir)
    ensure_dir(gmdir)
    lines = []
    if colophon:
        lines.append(f"{colophon}\n\n")
    lines.append(f"# {title}\n\n")
    # A gemtext link must fit on one line, so embedded newlines in the label
    # (common in alt text) are flattened.
    lines.extend(f"=> {link} {_one_line(text)}\n" for link, text in items)
    with open(gmdir / "index.gmi", "w", encoding="utf-8") as gfh:
        gfh.writelines(lines)


def _group_posts(cf, archive, items, debug=False):
    """Return ``{year: {month: {day: [MediaPost, ...]}}}`` for Create items.

    Items are kept in the order they appear in ``items``. Items that cannot
    be processed are reported on stderr and skipped.
    """
    posts = {}
    for item in items:
        if item.get("type") != "Create":
            continue
        if debug:
            print(item, file=sys.stderr)
        try:
            post = process_post(cf, archive, item["object"], debug)
        except Exception as e:
            # Deliberate best-effort boundary: one bad status must not stop
            # the rest of the archive from being converted.
            print(f"Processing failed: {item.get('id')}: {e}", file=sys.stderr)
            continue
        days = posts.setdefault(post.year, {}).setdefault(post.month, {})
        days.setdefault(post.day, []).append(post)
    return posts


def apub2gmi(cf, archive, output, colophon, debug=False):
    """Convert the archive at ``archive`` into a gemini tree under ``output``.

    Args:
        cf: config mapping as returned by ``load_config``.
        archive: directory containing ``outbox.json`` and the media files.
        output: directory the gemini tree is written to.
        colophon: optional text placed at the top of every index page.
        debug: when true, dump each processed item to stderr.
    """
    with open(Path(archive) / "outbox.json", "r", encoding="utf-8") as fh:
        js = json.load(fh)
    posts = _group_posts(cf, archive, js["orderedItems"], debug)

    out = Path(output)
    write_gemfile(out, colophon, SITE_TITLE, [(f"{year}/", year) for year in posts])
    for year, months in posts.items():
        ydir = out / year
        month_links = [(f"{month}/", MNAMES[month]) for month in months]
        write_gemfile(ydir, colophon, year, month_links)
        for month, days in months.items():
            # Each month page starts with a link back up to its year.
            links = [(f"{SITE_ROOT}/{year}/", year)]
            for day, day_posts in days.items():
                for post in day_posts:
                    post.copy_image(output)
                    links.append((f"{day}/{post.fname}", post.title))
            write_gemfile(ydir / month, colophon, f"{MNAMES[month]} {year}", links)


def main(argv=None):
    """Parse command-line arguments and run the conversion."""
    ap = argparse.ArgumentParser()
    ap.add_argument(
        '-a', '--archive', required=True, type=str, help="ActivityPub archive"
    )
    ap.add_argument(
        '-o', '--output', required=True, type=str, help="Output directory"
    )
    ap.add_argument(
        '-c', '--config', required=True, type=str, help="Config file"
    )
    ap.add_argument(
        '-t', '--text', required=False, type=str,
        help="File with text to be included at the top of each index page"
    )
    ap.add_argument(
        '-d', '--debug', action="store_true", default=False,
        help="Print debug output"
    )
    args = ap.parse_args(argv)
    cf = load_config(args.config)
    colophon = load_colophon(args.text)
    apub2gmi(cf, args.archive, args.output, colophon, args.debug)


if __name__ == "__main__":
    main()