diff --git a/README.md b/README.md
index 86856e1..9c76a77 100644
--- a/README.md
+++ b/README.md
@@ -2,8 +2,51 @@
 This is a script which takes an archive exported from a Mastodon account, looks
 for media attachments and uses them to build an archive for a Gemini server.
 
-I use it to update the [Glossatory archives](gemini://gemini.mikelynch.org/glossatory/) and it assumes that all of the media attachments have a name with
-two parts you want to pull out of it and use as the human-readable form of the
-index.
+I use it to update the [Glossatory archives](gemini://gemini.mikelynch.org/glossatory/) from my account [@GLOSSATORY](https://weirder.earth/@GLOSSATORY).
 
-Pushing a minor change as a test
+It builds a hierarchy of folders with the structure YYYY/MM/DD and copies
+media attachments into the appropriate day's folder.
+
+It also adds index.gmi files at each level. Index files at the top and year
+levels have links to the next level down. Index files at the month level have
+links to all of the attachments for that month.
+
+It assumes that there's only one media attachment per post. If it finds a post with more than one attachment, it will only copy the first and issue a warning.
+
+The alt-text is used as the title of each image in the month-level index file. If you want to use only part of the alt-text, you can provide a list of regular expressions in the config file which will be matched against it.
+
+## Usage
+
+    python apub2gmi.py --archive PATH_OF_YOUR_ACTIVITYPUB_ARCHIVE/ --output GEMINI_OUTPUT --config CONFIG.json [--text OPTIONAL_COLOPHON_TEXT] [--debug]
+
+
+
+## Config
+
+The configuration file is JSON as follows:
+
+```
+{
+    "url_re": "^/YOUR_SERVERS_MEDIA_ATTACHMENT_PATH/(.*)$",
+    "title_res": [
+        "^(.*?)\\.\\s*(.*)$"
+    ]
+}
+```
+
+`url_re` should match the URLs of media attachments in the ActivityPub JSON. This will depend on your server - here's an example from the GLOSSATORY archive.
+
+    "attachment":
+    [
+        {
+            "type": "Document",
+            "mediaType": "image/jpeg",
+            "url": "/weirderearth/media_attachments/files/105/839/131/582/626/008/original/9e2423c3ffd70dd0.jpeg",
+            "name": "BILLET: an unmarried working person (often used for making tying) The drawing depicts a person seated at a bench tying knots in a long cord.",
+            "blurhash": "U2Ss50M{~qt7-;t7IUt7_3-;RjM{RjD%-;WB",
+            "width": 1280,
+            "height": 1280
+        }
+    ],
+
+`title_res` is optional: it's a list of Python regular expressions which will be matched against the alt-text. The text used for the index page is the `()` group or groups from the first regexp which matches. If there's more than one group in the regexp, the results are joined with spaces.
\ No newline at end of file
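To illustrate the `title_res` behaviour described in the README changes above, here is a minimal sketch of the matching rule. It uses the example pattern from the config; the alt-text is made up, and this is a sketch of the logic, not the script's own code:

```python
import re

# The example pattern from the config above: text before the first full stop,
# then everything after it.
title_res = [re.compile(r"^(.*?)\.\s*(.*)$")]

# A made-up alt-text in the GLOSSATORY style.
alt = "BILLET: an unmarried working person. The drawing depicts a person seated at a bench tying knots in a long cord."

for pattern in title_res:
    if m := pattern.match(alt):
        # All () groups from the first matching pattern, joined with spaces.
        title = " ".join(m.groups())
        break
else:
    # No pattern matched: the full alt-text is kept (the script also prints a warning).
    title = alt

print(title)
# BILLET: an unmarried working person The drawing depicts a person seated at a bench tying knots in a long cord.
```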
diff --git a/apub2gmi.py b/apub2gmi.py
index 2694097..4978424 100755
--- a/apub2gmi.py
+++ b/apub2gmi.py
@@ -9,6 +9,7 @@ import json
 import re
 from pathlib import Path
 from shutil import copy
+import sys
 
 MNAMES = {
     "01": "January",
@@ -28,40 +29,52 @@ MNAMES = {
 
 class MediaPost():
 
-    def __init__(self, name_res, year, month, day, file, title):
-        self.name_res = name_res
+    def __init__(self, year, month, day, file, title, title_res=None):
         self.year = year
         self.month = month
         self.day = day
         self.file = file
         self.fname = Path(file).name
         self.title = title
+        self.title_res = title_res
         self.defn = ""
         self.desc = ""
-        self.try_parse()
+        if self.title_res:
+            self.try_parse_title()
+
+    def __str__(self):
+        return f"{self.year}-{self.month}-{self.day}: {self.file}"
 
     def copy_image(self, root):
         d = Path(root) / self.year / self.month / self.day
         target = d / self.fname
-        if not target.exists():
-            copy(self.file, target)
+        try:
+            if not target.exists():
+                copy(self.file, target)
+        except FileNotFoundError as e:
+            print(f"Image file missing: {self}", file=sys.stderr)
 
-    def try_parse(self):
-        for re in self.name_res:
+    def try_parse_title(self):
+        for re in self.title_res:
             if m := re.match(self.title):
-                self.defn = m.group(1)
-                if len(m.groups()) == 2:
-                    self.desc = m.group(2)
+                self.title = ' '.join(m.groups())
                 return
-        print(f"{self.file} Couldn't match title {self.title}")
-        self.defn = self.title
+        print(f"{self.file} Couldn't match alt text {self.title}", file=sys.stderr)
 
 
-def process_post(cf, archive, obj):
+def process_post(cf, archive, obj, debug=False):
+    if debug:
+        print(f"Processing {obj}", file=sys.stderr)
     date = datetime.datetime.strptime(obj["published"][:10], "%Y-%m-%d")
     year = f"{date.year}"
     month = f"{date.month:02}"
     day = f"{date.day:02}"
+    if "attachment" not in obj or len(obj["attachment"]) < 1:
+        raise ValueError('No attachments on this status')
+    if len(obj["attachment"]) > 1:
+        status_id = obj["id"]
+        n = len(obj["attachment"])
+        print(f"Warning: only one media item copied from post {status_id} which has {n}", file=sys.stderr)
     attachment = obj["attachment"][0]
     url = attachment["url"]
     if m:= cf["url_re"].match(url):
@@ -69,7 +82,7 @@
     else:
         raise ValueError(f"Couldn't match url {url}")
     alt = attachment["name"]
-    return MediaPost(cf["title_res"], year, month, day, file, alt)
+    return MediaPost(year, month, day, file, alt, cf.get("title_res", None))
 
 
 def ensure_dir(gmdir):
@@ -89,7 +102,8 @@ def load_config(config):
     with open(config, "r") as cfh:
         cf = json.load(cfh)
     cf["url_re"] = re.compile(cf["url_re"])
-    cf["title_res"] = [ re.compile(r) for r in cf["title_res"] ]
+    if "title_res" in cf:
+        cf["title_res"] = [ re.compile(r) for r in cf["title_res"] ]
     return cf
 
 
@@ -105,14 +119,16 @@ def write_gemfile(gmdir, colophon, title, items):
             gfh.write(f"=> {link} {text}\n")
 
 
-def apub2gmi(cf, archive, output, colophon):
+def apub2gmi(cf, archive, output, colophon, debug=False):
     with open(f"{archive}/outbox.json", "r") as fh:
         js = json.load(fh)
     posts = {}
     for item in js["orderedItems"]:
         if item["type"] == "Create":
+            if debug:
+                print(item)
             try:
-                post = process_post(cf, archive, item["object"])
+                post = process_post(cf, archive, item["object"], debug)
                 if not post.year in posts:
                     posts[post.year] = {}
                 if not post.month in posts[post.year]:
@@ -166,7 +182,11 @@ if __name__ == "__main__":
         '-t', '--text', required=False, type=str,
         help="File with text to be included at the top of each index page"
     )
+    ap.add_argument(
+        '-d', '--debug', action="store_true", default=False,
+        help="Print debug output"
+    )
     args = ap.parse_args()
     cf = load_config(args.config)
     colophon = load_colophon(args.text)
-    apub2gmi(cf, args.archive, args.output, colophon)
+    apub2gmi(cf, args.archive, args.output, colophon, args.debug)