Added Licence, made the title_re optional, completed docs
This commit is contained in:
parent
d2a39ea202
commit
39057d7a4c
51
README.md
51
README.md
@ -2,8 +2,51 @@
|
|||||||
|
|
||||||
This is a script which takes an archive exported from a Mastodon account, looks for media attachments and uses them to build an archive for a Gemini server.
|
This is a script which takes an archive exported from a Mastodon account, looks for media attachments and uses them to build an archive for a Gemini server.
|
||||||
|
|
||||||
I use it to update the [Glossatory archives](gemini://gemini.mikelynch.org/glossatory/) and it assumes that all of the media attachments have a name with
|
I use it to update the [Glossatory archives](gemini://gemini.mikelynch.org/glossatory/) from my account [@GLOSSATORY](https://weirder.earth/@GLOSSATORY).
|
||||||
two parts you want to pull out of it and use as the human-readable form of the
|
|
||||||
index.
|
|
||||||
|
|
||||||
Pushing a minor change as a test
|
It builds a hierarchy of folders with the structure YYYY/MM/DD and copies
|
||||||
|
media attachments into the appropriate day's folder.
|
||||||
|
|
||||||
|
It also adds index.gmi files at each level. Index files at the top and year
|
||||||
|
levels have links to the next level down. Index files at the month level have
|
||||||
|
links to all of the attachments for that month.
|
||||||
|
|
||||||
|
It assumes that there's only one media attachment per post. If it finds a post with more than one attachment, it will copy only the first and issue a warning.
|
||||||
|
|
||||||
|
The alt-text is used as the title of each image in the month-level index file. If you want to use only part of the alt-text, you can provide a list of regular expressions in the config file which will be matched against it.
|
||||||
|
|
||||||
|
## Usage
|
||||||
|
|
||||||
|
python apub2gmi.py --archive PATH_OF_YOUR_ACTIVITYPUB_ARCHIVE/ --output GEMINI_OUTPUT --config CONFIG.json [--text OPTIONAL_COLOPHON_TEXT] [--debug]
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
## Config
|
||||||
|
|
||||||
|
The configuration file is JSON as follows:
|
||||||
|
|
||||||
|
```
|
||||||
|
{
|
||||||
|
"url_re": "^/YOUR_SERVERS_MEDIA_ATTACHMENT_PATH/(.*)$",
|
||||||
|
"title_res": [
|
||||||
|
"^(.*?)\\.\\s*(.*)$"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
`url_re` should match the URLs of media attachments in the ActivityPub JSON. This will depend on your server - here's an example from the GLOSSATORY archive.
|
||||||
|
|
||||||
|
"attachment":
|
||||||
|
[
|
||||||
|
{
|
||||||
|
"type": "Document",
|
||||||
|
"mediaType": "image/jpeg",
|
||||||
|
"url": "/weirderearth/media_attachments/files/105/839/131/582/626/008/original/9e2423c3ffd70dd0.jpeg",
|
||||||
|
"name": "BILLET: an unmarried working person (often used for making tying) The drawing depicts a person seated at a bench tying knots in a long cord.",
|
||||||
|
"blurhash": "U2Ss50M{~qt7-;t7IUt7_3-;RjM{RjD%-;WB",
|
||||||
|
"width": 1280,
|
||||||
|
"height": 1280
|
||||||
|
}
|
||||||
|
],
|
||||||
|
|
||||||
|
`title_res` is optional: it's a list of Python regular expressions which will be matched against the alt-text. The text used for the index page is the `()` group or groups from the first regexp which matches. If there's more than one group in the regexp, the results are joined with spaces.
|
50
apub2gmi.py
50
apub2gmi.py
@ -9,6 +9,7 @@ import json
|
|||||||
import re
|
import re
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from shutil import copy
|
from shutil import copy
|
||||||
|
import sys
|
||||||
|
|
||||||
MNAMES = {
|
MNAMES = {
|
||||||
"01": "January",
|
"01": "January",
|
||||||
@ -28,40 +29,52 @@ MNAMES = {
|
|||||||
|
|
||||||
|
|
||||||
class MediaPost():
|
class MediaPost():
|
||||||
def __init__(self, name_res, year, month, day, file, title):
|
def __init__(self, year, month, day, file, title, title_res=None):
|
||||||
self.name_res = name_res
|
|
||||||
self.year = year
|
self.year = year
|
||||||
self.month = month
|
self.month = month
|
||||||
self.day = day
|
self.day = day
|
||||||
self.file = file
|
self.file = file
|
||||||
self.fname = Path(file).name
|
self.fname = Path(file).name
|
||||||
self.title = title
|
self.title = title
|
||||||
|
self.title_res = title_res
|
||||||
self.defn = ""
|
self.defn = ""
|
||||||
self.desc = ""
|
self.desc = ""
|
||||||
self.try_parse()
|
if self.title_res:
|
||||||
|
self.try_parse_title()
|
||||||
|
|
||||||
|
def __str__(self):
|
||||||
|
return f"{self.year}-{self.month}-{self.day}: {self.file}"
|
||||||
|
|
||||||
def copy_image(self, root):
|
def copy_image(self, root):
|
||||||
d = Path(root) / self.year / self.month / self.day
|
d = Path(root) / self.year / self.month / self.day
|
||||||
target = d / self.fname
|
target = d / self.fname
|
||||||
|
try:
|
||||||
if not target.exists():
|
if not target.exists():
|
||||||
copy(self.file, target)
|
copy(self.file, target)
|
||||||
|
except FileNotFoundError as e:
|
||||||
|
print(f"Image file missing: {self}", file=sys.stderr)
|
||||||
|
|
||||||
def try_parse(self):
|
def try_parse_title(self):
|
||||||
for re in self.name_res:
|
for re in self.title_res:
|
||||||
if m := re.match(self.title):
|
if m := re.match(self.title):
|
||||||
self.defn = m.group(1)
|
self.title = ' '.join(m.groups())
|
||||||
if len(m.groups()) == 2:
|
|
||||||
self.desc = m.group(2)
|
|
||||||
return
|
return
|
||||||
print(f"{self.file} Couldn't match title {self.title}")
|
print(f"{self.file} Couldn't match alt text {self.title}", file=sys.stderr)
|
||||||
self.defn = self.title
|
|
||||||
|
|
||||||
|
|
||||||
def process_post(cf, archive, obj):
|
def process_post(cf, archive, obj, debug=False):
|
||||||
|
if debug:
|
||||||
|
print(f"Processing {obj}", file=sys.stderr)
|
||||||
date = datetime.datetime.strptime(obj["published"][:10], "%Y-%m-%d")
|
date = datetime.datetime.strptime(obj["published"][:10], "%Y-%m-%d")
|
||||||
year = f"{date.year}"
|
year = f"{date.year}"
|
||||||
month = f"{date.month:02}"
|
month = f"{date.month:02}"
|
||||||
day = f"{date.day:02}"
|
day = f"{date.day:02}"
|
||||||
|
if "attachment" not in obj or len(obj["attachment"]) < 1:
|
||||||
|
raise ValueError('No attachments on this status')
|
||||||
|
if len(obj["attachment"]) > 1:
|
||||||
|
status_id = obj["id"]
|
||||||
|
n = len(obj["attachment"])
|
||||||
|
print(f"Warning: only one media item copied from post {status_id} which has {n}", file=sys.stderr)
|
||||||
attachment = obj["attachment"][0]
|
attachment = obj["attachment"][0]
|
||||||
url = attachment["url"]
|
url = attachment["url"]
|
||||||
if m:= cf["url_re"].match(url):
|
if m:= cf["url_re"].match(url):
|
||||||
@ -69,7 +82,7 @@ def process_post(cf, archive, obj):
|
|||||||
else:
|
else:
|
||||||
raise ValueError(f"Couldn't match url {url}")
|
raise ValueError(f"Couldn't match url {url}")
|
||||||
alt = attachment["name"]
|
alt = attachment["name"]
|
||||||
return MediaPost(cf["title_res"], year, month, day, file, alt)
|
return MediaPost(year, month, day, file, alt, cf.get("title_res", None))
|
||||||
|
|
||||||
|
|
||||||
def ensure_dir(gmdir):
|
def ensure_dir(gmdir):
|
||||||
@ -89,6 +102,7 @@ def load_config(config):
|
|||||||
with open(config, "r") as cfh:
|
with open(config, "r") as cfh:
|
||||||
cf = json.load(cfh)
|
cf = json.load(cfh)
|
||||||
cf["url_re"] = re.compile(cf["url_re"])
|
cf["url_re"] = re.compile(cf["url_re"])
|
||||||
|
if "title_res" in cf:
|
||||||
cf["title_res"] = [ re.compile(r) for r in cf["title_res"] ]
|
cf["title_res"] = [ re.compile(r) for r in cf["title_res"] ]
|
||||||
return cf
|
return cf
|
||||||
|
|
||||||
@ -105,14 +119,16 @@ def write_gemfile(gmdir, colophon, title, items):
|
|||||||
gfh.write(f"=> {link} {text}\n")
|
gfh.write(f"=> {link} {text}\n")
|
||||||
|
|
||||||
|
|
||||||
def apub2gmi(cf, archive, output, colophon):
|
def apub2gmi(cf, archive, output, colophon, debug=False):
|
||||||
with open(f"{archive}/outbox.json", "r") as fh:
|
with open(f"{archive}/outbox.json", "r") as fh:
|
||||||
js = json.load(fh)
|
js = json.load(fh)
|
||||||
posts = {}
|
posts = {}
|
||||||
for item in js["orderedItems"]:
|
for item in js["orderedItems"]:
|
||||||
if item["type"] == "Create":
|
if item["type"] == "Create":
|
||||||
|
if debug:
|
||||||
|
print(item)
|
||||||
try:
|
try:
|
||||||
post = process_post(cf, archive, item["object"])
|
post = process_post(cf, archive, item["object"], debug)
|
||||||
if not post.year in posts:
|
if not post.year in posts:
|
||||||
posts[post.year] = {}
|
posts[post.year] = {}
|
||||||
if not post.month in posts[post.year]:
|
if not post.month in posts[post.year]:
|
||||||
@ -166,7 +182,11 @@ if __name__ == "__main__":
|
|||||||
'-t', '--text', required=False, type=str,
|
'-t', '--text', required=False, type=str,
|
||||||
help="File with text to be included at the top of each index page"
|
help="File with text to be included at the top of each index page"
|
||||||
)
|
)
|
||||||
|
ap.add_argument(
|
||||||
|
'-d', '--debug', action="store_true", default=False,
|
||||||
|
help="Print debug output"
|
||||||
|
)
|
||||||
args = ap.parse_args()
|
args = ap.parse_args()
|
||||||
cf = load_config(args.config)
|
cf = load_config(args.config)
|
||||||
colophon = load_colophon(args.text)
|
colophon = load_colophon(args.text)
|
||||||
apub2gmi(cf, args.archive, args.output, colophon)
|
apub2gmi(cf, args.archive, args.output, colophon, args.debug)
|
||||||
|
Loading…
x
Reference in New Issue
Block a user