Updated README.md

Moved the regexps out to a config file
2024-05-04 14:40:14 +10:00 · 2024-05-04 14:38:10 +10:00
3 changed files with 34 additions and 30 deletions
--- a/README.md
+++ b/README.md
@@ -2,4 +2,6 @@
 This is a script which takes an archive exported from a Mastodon account, looks for media attachments and uses them to build an archive for a Gemini server.
-I use it to update the [Glossatory archives](gemini://gemini.mikelynch.org/glossatory/) and there's still a few things hard-coded in it which I haven't moved out to a config file, specifically the regular expressions which match attachment URLs and pull bits of text out for the index links.
+I use it to update the [Glossatory archives](gemini://gemini.mikelynch.org/glossatory/) and it assumes that all of the media attachments have a name with
+two parts you want to pull out of it and use as the human-readable form of the
+index.
--- a/apub2gmi.py
+++ b/apub2gmi.py
@@ -26,28 +26,10 @@ MNAMES = {
 }
-HEADER = """This is the archive of GLOSSATORY, an illustrated companion to a bot which generates dictionary entries based on an RNN trained on the WordNet lexical database.
-=> https://weirder.earth/@GLOSSATORY Follow the drawings on Mastodon
-=> https://botsin.space/@GLOSSATORY Follow the words on Mastodon
-=> https://oulipo.social/@GLOSSATORY Follow the words without the letter "e"
-=> / Back to Mike's gemini
-=> /glossatory/ Glossatory archive home
-"""
-URL_RE = re.compile(r"^/weirderearth/(.*)$")
-NAME_RES = [
-	re.compile(r"^(.*?)\.\s*(.*)$"),
-	re.compile(r"^(.*?)\s*(The drawing.*)$"),
-	re.compile(r"^A line drawing depicting (.*)$"),
-	re.compile(r"^(.*?): (.*)$"),
-	]
 class MediaPost():
-	def __init__(self, year, month, day, file, title):
+	def __init__(self, name_res, year, month, day, file, title):
+		self.name_res = name_res
 		self.year = year
 		self.month = month
 		self.day = day
@@ -65,7 +47,7 @@ class MediaPost():
 			copy(self.file, target)
 	def try_parse(self):
-		for re in NAME_RES:
+		for re in self.name_res:
 			if m := re.match(self.title):
 				self.defn = m.group(1)
 				if len(m.groups()) == 2:
@@ -75,19 +57,19 @@ class MediaPost():
 		self.defn = self.title
-def process_post(archive, obj):
+def process_post(cf, archive, obj):
 	date = datetime.datetime.strptime(obj["published"][:10], "%Y-%m-%d")
 	year = f"{date.year}"
 	month = f"{date.month:02}"
 	day = f"{date.day:02}"
 	attachment = obj["attachment"][0]
 	url = attachment["url"]
-	if m:= URL_RE.match(url):
+	if m:= cf["url_re"].match(url):
 		file = Path(archive) / m.group(1)
 	else:
 		raise ValueError(f"Couldn't match url {url}")
 	alt = attachment["name"]
-	return MediaPost(year, month, day, file, alt)
+	return MediaPost(cf["title_res"], year, month, day, file, alt)
 def ensure_dir(gmdir):
@@ -102,6 +84,15 @@ def load_colophon(cfile):
 			return "".join(colophon)
 	return None
+def load_config(config):
+	with open(config, "r") as cfh:
+		cf = json.load(cfh)
+		cf["url_re"] = re.compile(cf["url_re"])
+		cf["title_res"] = [ re.compile(r) for r in cf["title_res"] ]
+	return cf
 def write_gemfile(gmdir, colophon, title, items):
 	ensure_dir(gmdir)
 	gmi = gmdir / "index.gmi"
@@ -114,14 +105,14 @@ def write_gemfile(gmdir, colophon, title, items):
 			gfh.write(f"=> {link} {text}\n")
-def apub2gmi(archive, output, colophon):
+def apub2gmi(cf, archive, output, colophon):
 	with open(f"{archive}/outbox.json", "r") as fh:
 		js = json.load(fh)
 		posts = {}
 		for item in js["orderedItems"]:
 			if item["type"] == "Create":
 				try:
-					post = process_post(archive, item["object"])
+					post = process_post(cf, archive, item["object"])
 					if not post.year in posts:
 						posts[post.year] = {}
 					if not post.month in posts[post.year]:
@@ -169,9 +160,13 @@ if __name__ == "__main__":
 		'-o', '--output', required=True, type=str, help="Output directory"
 		)
 	ap.add_argument(
-		'-c', '--colophon', required=False, type=str,
+		'-c', '--config', required=True, type=str, help="Config file"
+	)
+	ap.add_argument(
+		'-t', '--text', required=False, type=str,
 		help="File with text to be included at the top of each index page"
 		)
 	args = ap.parse_args()
-	colophon = load_colophon(args.colophon)
+	cf = load_config(args.config)
-	apub2gmi(args.archive, args.output, colophon)
+	colophon = load_colophon(args.text)
+	apub2gmi(cf, args.archive, args.output, colophon)
--- a/config.json
+++ b/config.json
@@ -0,0 +1,7 @@
+{
+	"url_re": "^/some_pattern/(.*)$",
+	"title_res": [
+		"^(.*?)\\.\\s*(.*)$",
+		"^(.*?): (.*)$"
+	]
+}
Author	SHA1	Message	Date
Mike Lynch	60b3dc619e	Updated README.md	2024-05-04 14:40:14 +10:00
Mike Lynch	f6d6a5135e	Moved the regexps out to a config file	2024-05-04 14:38:10 +10:00