Merge pull request 'feature-licence-and-better-docs' (#1) from feature-licence-and-better-docs into main

Reviewed-on: #1
This commit is contained in:
bombinans 2025-06-08 09:42:38 +00:00
commit 71f1856199
2 changed files with 86 additions and 21 deletions

View File

@ -2,6 +2,51 @@
This is a script which takes an archive exported from a Mastodon account, looks for media attachments and uses them to build an archive for a Gemini server.
I use it to update the [Glossatory archives](gemini://gemini.mikelynch.org/glossatory/) from my account [@GLOSSATORY](https://weirder.earth/@GLOSSATORY).
It builds a hierarchy of folders with the structure YYYY/MM/DD and copies
media attachments into the appropriate day's folder.
It also adds index.gmi files at each level. Index files at the top and year
levels have links to the next level down. Index files at the month level have
links to all of the attachments for that month.
It assumes that there's only one media attachment per post. If it finds a post with more than one attachment it will only copy the first and issue a warning.
The alt-text is used as the title of each image in the month-level index file. If you want to only use part of the alt-text, you can provide a list of regular expressions in the config file which will be matched against it.
## Usage
python apub2gmi.py --archive PATH_OF_YOUR_ACTIVITYPUB_ARCHIVE/ --output GEMINI_OUTPUT --config CONFIG.json [--text OPTIONAL_COLOPHON_TEXT] [--debug]
## Config
The configuration file is JSON as follows:
```
{
    "url_re": "^/YOUR_SERVERS_MEDIA_ATTACHMENT_PATH/(.*)$",
    "title_res": [
        "^(.*?)\\.\\s*(.*)$"
    ]
}
```

`url_re` should match the URLs of media attachments in the ActivityPub JSON. This will depend on your server - here's an example from the GLOSSATORY archive:

```
"attachment":
    [
        {
            "type": "Document",
            "mediaType": "image/jpeg",
            "url": "/weirderearth/media_attachments/files/105/839/131/582/626/008/original/9e2423c3ffd70dd0.jpeg",
            "name": "BILLET: an unmarried working person (often used for making tying) The drawing depicts a person seated at a bench tying knots in a long cord.",
            "blurhash": "U2Ss50M{~qt7-;t7IUt7_3-;RjM{RjD%-;WB",
            "width": 1280,
            "height": 1280
        }
    ],
```

`title_res` is optional: it's a list of Python regular expressions which will be matched against the alt-text. The text used for the index page is the `()` group or groups from the first regexp which matches. If there's more than one group in the re, the results are joined with spaces.

View File

@ -9,6 +9,7 @@ import json
import re import re
from pathlib import Path from pathlib import Path
from shutil import copy from shutil import copy
import sys
MNAMES = { MNAMES = {
"01": "January", "01": "January",
@ -28,40 +29,52 @@ MNAMES = {
class MediaPost(): class MediaPost():
def __init__(self, name_res, year, month, day, file, title): def __init__(self, year, month, day, file, title, title_res=None):
self.name_res = name_res
self.year = year self.year = year
self.month = month self.month = month
self.day = day self.day = day
self.file = file self.file = file
self.fname = Path(file).name self.fname = Path(file).name
self.title = title self.title = title
self.title_res = title_res
self.defn = "" self.defn = ""
self.desc = "" self.desc = ""
self.try_parse() if self.title_res:
self.try_parse_title()
def __str__(self):
return f"{self.year}-{self.month}-{self.day}: {self.file}"
def copy_image(self, root): def copy_image(self, root):
d = Path(root) / self.year / self.month / self.day d = Path(root) / self.year / self.month / self.day
target = d / self.fname target = d / self.fname
if not target.exists(): try:
copy(self.file, target) if not target.exists():
copy(self.file, target)
except FileNotFoundError as e:
print(f"Image file missing: {self}", file=sys.stderr)
def try_parse(self): def try_parse_title(self):
for re in self.name_res: for re in self.title_res:
if m := re.match(self.title): if m := re.match(self.title):
self.defn = m.group(1) self.title = ' '.join(m.groups())
if len(m.groups()) == 2:
self.desc = m.group(2)
return return
print(f"{self.file} Couldn't match title {self.title}") print(f"{self.file} Couldn't match alt text {self.title}", file=sys.stderr)
self.defn = self.title
def process_post(cf, archive, obj): def process_post(cf, archive, obj, debug=False):
if debug:
print(f"Processing {obj}", file=sys.stderr)
date = datetime.datetime.strptime(obj["published"][:10], "%Y-%m-%d") date = datetime.datetime.strptime(obj["published"][:10], "%Y-%m-%d")
year = f"{date.year}" year = f"{date.year}"
month = f"{date.month:02}" month = f"{date.month:02}"
day = f"{date.day:02}" day = f"{date.day:02}"
if "attachment" not in obj or len(obj["attachment"]) < 1:
raise ValueError('No attachments on this status')
if len(obj["attachment"]) > 1:
status_id = obj["id"]
n = len(obj["attachment"])
print(f"Warning: only one media item copied from post {status_id} which has {n}", file=sys.stderr)
attachment = obj["attachment"][0] attachment = obj["attachment"][0]
url = attachment["url"] url = attachment["url"]
if m:= cf["url_re"].match(url): if m:= cf["url_re"].match(url):
@ -69,7 +82,7 @@ def process_post(cf, archive, obj):
else: else:
raise ValueError(f"Couldn't match url {url}") raise ValueError(f"Couldn't match url {url}")
alt = attachment["name"] alt = attachment["name"]
return MediaPost(cf["title_res"], year, month, day, file, alt) return MediaPost(year, month, day, file, alt, cf.get("title_res", None))
def ensure_dir(gmdir): def ensure_dir(gmdir):
@ -89,7 +102,8 @@ def load_config(config):
with open(config, "r") as cfh: with open(config, "r") as cfh:
cf = json.load(cfh) cf = json.load(cfh)
cf["url_re"] = re.compile(cf["url_re"]) cf["url_re"] = re.compile(cf["url_re"])
cf["title_res"] = [ re.compile(r) for r in cf["title_res"] ] if "title_res" in cf:
cf["title_res"] = [ re.compile(r) for r in cf["title_res"] ]
return cf return cf
@ -105,14 +119,16 @@ def write_gemfile(gmdir, colophon, title, items):
gfh.write(f"=> {link} {text}\n") gfh.write(f"=> {link} {text}\n")
def apub2gmi(cf, archive, output, colophon): def apub2gmi(cf, archive, output, colophon, debug=False):
with open(f"{archive}/outbox.json", "r") as fh: with open(f"{archive}/outbox.json", "r") as fh:
js = json.load(fh) js = json.load(fh)
posts = {} posts = {}
for item in js["orderedItems"]: for item in js["orderedItems"]:
if item["type"] == "Create": if item["type"] == "Create":
if debug:
print(item)
try: try:
post = process_post(cf, archive, item["object"]) post = process_post(cf, archive, item["object"], debug)
if not post.year in posts: if not post.year in posts:
posts[post.year] = {} posts[post.year] = {}
if not post.month in posts[post.year]: if not post.month in posts[post.year]:
@ -166,7 +182,11 @@ if __name__ == "__main__":
'-t', '--text', required=False, type=str, '-t', '--text', required=False, type=str,
help="File with text to be included at the top of each index page" help="File with text to be included at the top of each index page"
) )
ap.add_argument(
'-d', '--debug', action="store_true", default=False,
help="Print debug output"
)
args = ap.parse_args() args = ap.parse_args()
cf = load_config(args.config) cf = load_config(args.config)
colophon = load_colophon(args.text) colophon = load_colophon(args.text)
apub2gmi(cf, args.archive, args.output, colophon) apub2gmi(cf, args.archive, args.output, colophon, args.debug)