# apub2gmi/apub2gmi.py

#!/usr/bin/env python

# convert the Glossatory archive from an ActivityPub collection to
# gemini

import json
import datetime
import re
from pathlib import Path
from shutil import copy

# English month names keyed by zero-padded two-digit month number, the same
# form process_post produces for a post's month. Used as the title and link
# label of the generated year and month indexes.
MNAMES = {
	"01": "January",
	"02": "February",
	"03": "March",
	"04": "April",
	"05": "May",
	"06": "June",
	"07": "July",
	"08": "August",
	"09": "September",
	"10": "October",
	"11": "November",
	"12": "December",
}

# Directory holding the archive: outbox.json is read from here and attachment
# file paths are resolved relative to it.
ARCHIVE = "archive-20230604031441-05906d0df7f3f14777089c2fd7d0175a"
# Root directory under which the index.gmi pages and copied images are written.
OUTDIR = "gemini"

# Preamble written at the top of every generated index.gmi, ahead of the page
# heading: a one-paragraph description followed by gemtext link lines.
HEADER = """This is the archive of GLOSSATORY, an illustrated companion to a bot which generates dictionary entries based on an RNN trained on the WordNet lexical database.

=> https://weirder.earth/@GLOSSATORY Follow the drawings on Mastodon
=> https://botsin.space/@GLOSSATORY Follow the words on Mastodon
=> https://oulipo.social/@GLOSSATORY Follow the words without the letter "e"
=> / Back to Mike's gemini
=> /glossatory/ Glossatory archive home

"""

# Matches an attachment URL of the form /files.weirder.earth/<path> and
# captures <path>, which process_post joins onto ARCHIVE to locate the file.
URL_RE = re.compile(r"^/files\.weirder\.earth/(.*)$")
# Patterns tried in order against a post's alt text by
# GlossatoryPost.try_parse; the first one that matches wins. Group 1 becomes
# the definition and group 2, where the pattern has one, the description.
NAME_RES = [
	# "<defn>. <desc>" - split at the first full stop
	re.compile(r"^(.*?)\.\s*(.*)$"),
	# "<defn> The drawing ..." - description starts at "The drawing"
	re.compile(r"^(.*?)\s*(The drawing.*)$"),
	# "A line drawing depicting <defn>" - definition only, no description
	re.compile(r"^A line drawing depicting (.*)$"),
	# "<defn>: <desc>" - split at the first colon followed by a space
	re.compile(r"^(.*?): (.*)$"),
	]


class GlossatoryPost():
	"""A single archived post: its date, its image file and its alt text.

	Attributes:
		year, month, day: date components as given by the caller; they are
			used as path segments, so they must be strings.
		file: path of the image file to copy from.
		fname: final path component of ``file``.
		title: the full alt text of the image.
		defn: definition part of the title, or the whole title when no
			pattern in NAME_RES matches.
		desc: description part of the title, or "" when the matching pattern
			captures only a definition (or nothing matches).
	"""

	def __init__(self, year, month, day, file, title):
		"""Store the fields and split ``title`` into ``defn`` and ``desc``."""
		self.year = year
		self.month = month
		self.day = day
		self.file = file
		self.fname = Path(file).name
		self.title = title
		self.defn = ""
		self.desc = ""
		self.try_parse()

	def copy_image(self, root):
		"""Copy the image to ``root/year/month/day/fname``.

		The destination directory is created if it does not exist yet, and an
		existing destination file is left untouched.
		"""
		target_dir = Path(root) / self.year / self.month / self.day
		# Don't rely on the caller having prepared the directory tree.
		target_dir.mkdir(parents=True, exist_ok=True)
		target = target_dir / self.fname
		if not target.exists():
			copy(self.file, target)

	def try_parse(self):
		"""Split ``self.title`` using the first pattern in NAME_RES that matches.

		Sets ``self.defn`` from group 1 and, for two-group patterns,
		``self.desc`` from group 2. When nothing matches, a message is printed
		and the whole title is used as the definition.
		"""
		# Named "pattern" rather than "re" so the re module isn't shadowed.
		for pattern in NAME_RES:
			if m := pattern.match(self.title):
				self.defn = m.group(1)
				if pattern.groups == 2:
					self.desc = m.group(2)
				return
		print(f"{self.file} Couldn't match title {self.title}")
		self.defn = self.title


def process_post(obj):
	"""Build a GlossatoryPost from the object of a Create activity.

	Args:
		obj: dict with a "published" timestamp whose first ten characters are
			YYYY-MM-DD, and an "attachment" list whose first entry has a
			"url" and a "name" (the alt text).

	Returns:
		A GlossatoryPost with the year as a string, zero-padded two-digit
		month and day strings, the attachment's path under ARCHIVE, and the
		alt text as its title.

	Raises:
		ValueError: if the date can't be parsed, the post has no attachment,
			or the attachment URL doesn't match URL_RE.
	"""
	# Only the leading YYYY-MM-DD of the timestamp is needed.
	date = datetime.datetime.strptime(obj["published"][:10], "%Y-%m-%d")
	year = f"{date.year}"
	month = f"{date.month:02}"
	day = f"{date.day:02}"
	# Fail with a readable message rather than a bare IndexError/KeyError
	# when the post carries no attachment.
	attachments = obj.get("attachment") or []
	if not attachments:
		raise ValueError("Post has no attachment")
	attachment = attachments[0]
	url = attachment["url"]
	m = URL_RE.match(url)
	if m is None:
		raise ValueError(f"Couldn't match url {url}")
	file = Path(ARCHIVE) / m.group(1)
	alt = attachment["name"]
	return GlossatoryPost(year, month, day, file, alt)


def ensure_dir(gmdir):
	"""Create the directory ``gmdir`` (a Path) and any missing parents.

	Does nothing when the directory already exists. Raises FileExistsError if
	the path exists but is not a directory.
	"""
	# exist_ok avoids the check-then-create race of testing is_dir() first.
	gmdir.mkdir(parents=True, exist_ok=True)


def write_gemfile(gmdir, title, items):
	"""Write ``gmdir/index.gmi``: HEADER, a heading, then one link per item.

	Args:
		gmdir: Path of the directory to write into; created if missing.
		title: text of the level-one heading.
		items: iterable of ``(link, text)`` pairs, each written as a gemtext
			link line ``=> link text``.
	"""
	ensure_dir(gmdir)
	gmi = gmdir / "index.gmi"
	# Link text comes from free-form alt text, so write UTF-8 explicitly
	# instead of depending on the platform's default encoding.
	with open(gmi, "w", encoding="utf-8") as gfh:
		gfh.write(HEADER)
		gfh.write(f"# {title}\n\n")
		for link, text in items:
			gfh.write(f"=> {link} {text}\n")


# Load the ActivityPub outbox. JSON is read as UTF-8 regardless of the
# platform default, and the file is closed as soon as it has been parsed.
with open(f"{ARCHIVE}/outbox.json", "r", encoding="utf-8") as fh:
	js = json.load(fh)

# Group the posts as posts[year][month][day] -> list of GlossatoryPost,
# keeping the order in which they appear in the outbox.
posts = {}
for item in js["orderedItems"]:
	if item["type"] != "Create":
		continue
	try:
		post = process_post(item["object"])
	except Exception as e:
		# Best effort: report the post that couldn't be converted and carry
		# on. .get() so a missing id can't raise from inside the handler.
		i = item.get("id")
		print(f"Processing failed: {i}: {e}")
		continue
	by_day = posts.setdefault(post.year, {}).setdefault(post.month, {})
	by_day.setdefault(post.day, []).append(post)

# Top-level index: one link per year.
years = [ ( f"{year}/", year ) for year in posts ]
write_gemfile(Path(OUTDIR), "Glossatory", years)

for year, year_posts in posts.items():
	# Year index: one link per month.
	ydir = Path(OUTDIR) / year
	months = [ ( f"{month}/", MNAMES[month] ) for month in year_posts ]
	write_gemfile(ydir, year, months)
	for month, month_posts in year_posts.items():
		mname = MNAMES[month]
		mdir = ydir / month
		# Month index: a link back to the year, then one link per image.
		links = [
			( f"/glossatory/{year}/", year),
		]
		for day, day_posts in month_posts.items():
			# The images live in a per-day directory below the month.
			(mdir / day).mkdir(parents=True, exist_ok=True)
			for post in day_posts:
				post.copy_image(OUTDIR)
				links.append((f"{day}/{post.fname}", post.title))
		write_gemfile(mdir, f"{mname} {year}", links)