Updated README.md

Moved the regexps out to a config file
2024-05-04 14:40:14 +10:00 · 2024-05-04 14:38:10 +10:00
3 changed files with 34 additions and 30 deletions
--- a/README.md
+++ b/README.md
@@ -2,4 +2,6 @@

 This is a script which takes an archive exported from a Mastodon account, looks for media attachments and uses them to build an archive for a Gemini server.

-I use it to update the [Glossatory archives](gemini://gemini.mikelynch.org/glossatory/) and there's still a few things hard-coded in it which I haven't moved out to a config file, specifically the regular expressions which match attachment URLs and pull bits of text out for the index links.
+I use it to update the [Glossatory archives](gemini://gemini.mikelynch.org/glossatory/) and it assumes that all of the media attachments have a name with
+two parts you want to pull out of it and use as the human-readable form of the
+index.
--- a/apub2gmi.py
+++ b/apub2gmi.py
@@ -26,28 +26,10 @@ MNAMES = {
 }


-HEADER = """This is the archive of GLOSSATORY, an illustrated companion to a bot which generates dictionary entries based on an RNN trained on the WordNet lexical database.
-
-=> https://weirder.earth/@GLOSSATORY Follow the drawings on Mastodon
-=> https://botsin.space/@GLOSSATORY Follow the words on Mastodon
-=> https://oulipo.social/@GLOSSATORY Follow the words without the letter "e"
-=> / Back to Mike's gemini
-=> /glossatory/ Glossatory archive home
-
-"""
-
-URL_RE = re.compile(r"^/weirderearth/(.*)$")
-NAME_RES = [
-	re.compile(r"^(.*?)\.\s*(.*)$"),
-	re.compile(r"^(.*?)\s*(The drawing.*)$"),
-	re.compile(r"^A line drawing depicting (.*)$"),
-	re.compile(r"^(.*?): (.*)$"),
-	]
-
-

 class MediaPost():
-	def __init__(self, year, month, day, file, title):
+	def __init__(self, name_res, year, month, day, file, title):
+		self.name_res = name_res
 		self.year = year
 		self.month = month
 		self.day = day
@@ -65,7 +47,7 @@ class MediaPost():
 			copy(self.file, target)

 	def try_parse(self):
-		for re in NAME_RES:
+		for re in self.name_res:
 			if m := re.match(self.title):
 				self.defn = m.group(1)
 				if len(m.groups()) == 2:
@@ -75,19 +57,19 @@ class MediaPost():
 		self.defn = self.title


-def process_post(archive, obj):
+def process_post(cf, archive, obj):
 	date = datetime.datetime.strptime(obj["published"][:10], "%Y-%m-%d")
 	year = f"{date.year}"
 	month = f"{date.month:02}"
 	day = f"{date.day:02}"
 	attachment = obj["attachment"][0]
 	url = attachment["url"]
-	if m:= URL_RE.match(url):
+	if m:= cf["url_re"].match(url):
 		file = Path(archive) / m.group(1)
 	else:
 		raise ValueError(f"Couldn't match url {url}")
 	alt = attachment["name"]
-	return MediaPost(year, month, day, file, alt)
+	return MediaPost(cf["title_res"], year, month, day, file, alt)


 def ensure_dir(gmdir):
@@ -102,6 +84,15 @@ def load_colophon(cfile):
 			return "".join(colophon)
 	return None

+
+def load_config(config):
+	with open(config, "r") as cfh:
+		cf = json.load(cfh)
+		cf["url_re"] = re.compile(cf["url_re"])
+		cf["title_res"] = [ re.compile(r) for r in cf["title_res"] ]
+	return cf
+
+
 def write_gemfile(gmdir, colophon, title, items):
 	ensure_dir(gmdir)
 	gmi = gmdir / "index.gmi"
@@ -114,14 +105,14 @@ def write_gemfile(gmdir, colophon, title, items):
 			gfh.write(f"=> {link} {text}\n")


-def apub2gmi(archive, output, colophon):
+def apub2gmi(cf, archive, output, colophon):
 	with open(f"{archive}/outbox.json", "r") as fh:
 		js = json.load(fh)
 		posts = {}
 		for item in js["orderedItems"]:
 			if item["type"] == "Create":
 				try:
-					post = process_post(archive, item["object"])
+					post = process_post(cf, archive, item["object"])
 					if not post.year in posts:
 						posts[post.year] = {}
 					if not post.month in posts[post.year]:
@@ -169,9 +160,13 @@ if __name__ == "__main__":
 		'-o', '--output', required=True, type=str, help="Output directory"
 		)
 	ap.add_argument(
-		'-c', '--colophon', required=False, type=str,
+		'-c', '--config', required=True, type=str, help="Config file"
+	)
+	ap.add_argument(
+		'-t', '--text', required=False, type=str,
 		help="File with text to be included at the top of each index page"
 		)
 	args = ap.parse_args()
-	colophon = load_colophon(args.colophon)
-	apub2gmi(args.archive, args.output, colophon)
+	cf = load_config(args.config)
+	colophon = load_colophon(args.text)
+	apub2gmi(cf, args.archive, args.output, colophon)
--- a/config.json
+++ b/config.json
@@ -0,0 +1,7 @@
+{
+	"url_re": "^/some_pattern/(.*)$",
+	"title_res": [
+		"^(.*?)\\.\\s*(.*)$",
+		"^(.*?): (.*)$"
+	]
+}
Author	SHA1	Message	Date
Mike Lynch	60b3dc619e	Updated README.md	2024-05-04 14:40:14 +10:00
Mike Lynch	f6d6a5135e	Moved the regexps out to a config file	2024-05-04 14:38:10 +10:00