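Made the script more mature (archive and output paths are now command-line arguments instead of hard-coded constants) and updated the media attachment regular expression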

main
Mike Lynch 2024-05-04 13:52:19 +10:00
parent 4e3405727d
commit 564f366ae2
1 changed file with 59 additions and 51 deletions

View File

@ -3,8 +3,9 @@
# convert the Glossatory archive from an ActivityPub collection to # convert the Glossatory archive from an ActivityPub collection to
# gemini # gemini
import json import argparse
import datetime import datetime
import json
import re import re
from pathlib import Path from pathlib import Path
from shutil import copy from shutil import copy
@ -24,8 +25,6 @@ MNAMES = {
"12": "December", "12": "December",
} }
ARCHIVE = "archive-20230604031441-05906d0df7f3f14777089c2fd7d0175a"
OUTDIR = "gemini"
HEADER = """This is the archive of GLOSSATORY, an illustrated companion to a bot which generates dictionary entries based on an RNN trained on the WordNet lexical database. HEADER = """This is the archive of GLOSSATORY, an illustrated companion to a bot which generates dictionary entries based on an RNN trained on the WordNet lexical database.
@ -37,7 +36,7 @@ HEADER = """This is the archive of GLOSSATORY, an illustrated companion to a bot
""" """
URL_RE = re.compile(r"^/files\.weirder\.earth/(.*)$") URL_RE = re.compile(r"^/weirderearth/(.*)$")
NAME_RES = [ NAME_RES = [
re.compile(r"^(.*?)\.\s*(.*)$"), re.compile(r"^(.*?)\.\s*(.*)$"),
re.compile(r"^(.*?)\s*(The drawing.*)$"), re.compile(r"^(.*?)\s*(The drawing.*)$"),
@ -76,7 +75,7 @@ class GlossatoryPost():
self.defn = self.title self.defn = self.title
def process_post(obj): def process_post(archive, obj):
date = datetime.datetime.strptime(obj["published"][:10], "%Y-%m-%d") date = datetime.datetime.strptime(obj["published"][:10], "%Y-%m-%d")
year = f"{date.year}" year = f"{date.year}"
month = f"{date.month:02}" month = f"{date.month:02}"
@ -84,7 +83,7 @@ def process_post(obj):
attachment = obj["attachment"][0] attachment = obj["attachment"][0]
url = attachment["url"] url = attachment["url"]
if m:= URL_RE.match(url): if m:= URL_RE.match(url):
file = Path(ARCHIVE) / m.group(1) file = Path(archive) / m.group(1)
else: else:
raise ValueError(f"Couldn't match url {url}") raise ValueError(f"Couldn't match url {url}")
alt = attachment["name"] alt = attachment["name"]
@ -106,16 +105,14 @@ def write_gemfile(gmdir, title, items):
gfh.write(f"=> {link} {text}\n") gfh.write(f"=> {link} {text}\n")
def apub2gmi(archive, output):
with open(f"{archive}/outbox.json", "r") as fh:
with open(f"{ARCHIVE}/outbox.json", "r") as fh:
js = json.load(fh) js = json.load(fh)
posts = {} posts = {}
for item in js["orderedItems"]: for item in js["orderedItems"]:
if item["type"] == "Create": if item["type"] == "Create":
try: try:
post = process_post(item["object"]) post = process_post(archive, item["object"])
if not post.year in posts: if not post.year in posts:
posts[post.year] = {} posts[post.year] = {}
if not post.month in posts[post.year]: if not post.month in posts[post.year]:
@ -130,10 +127,10 @@ with open(f"{ARCHIVE}/outbox.json", "r") as fh:
print(f"Processing failed: {i}: {e}") print(f"Processing failed: {i}: {e}")
years = [ ( f"{year}/", year ) for year in posts ] years = [ ( f"{year}/", year ) for year in posts ]
write_gemfile(Path(OUTDIR), "Glossatory", years) write_gemfile(Path(output), "Glossatory", years)
for year in posts: for year in posts:
ydir = Path(OUTDIR) / year ydir = Path(output) / year
months = [ ( f"{month}/", MNAMES[month] ) for month in posts[year] ] months = [ ( f"{month}/", MNAMES[month] ) for month in posts[year] ]
write_gemfile(ydir, year, months) write_gemfile(ydir, year, months)
for month in posts[year]: for month in posts[year]:
@ -143,7 +140,7 @@ with open(f"{ARCHIVE}/outbox.json", "r") as fh:
ddir = mdir / day ddir = mdir / day
ddir.mkdir(parents=True, exist_ok=True) ddir.mkdir(parents=True, exist_ok=True)
for post in posts[year][month][day]: for post in posts[year][month][day]:
post.copy_image(OUTDIR) post.copy_image(output)
gmi = mdir / "index.gmi" gmi = mdir / "index.gmi"
links = [ links = [
( f"/glossatory/{year}/", year), ( f"/glossatory/{year}/", year),
@ -153,3 +150,14 @@ with open(f"{ARCHIVE}/outbox.json", "r") as fh:
links.append((f"{day}/{post.fname}", post.title)) links.append((f"{day}/{post.fname}", post.title))
write_gemfile(mdir, f"{mname} {year}", links) write_gemfile(mdir, f"{mname} {year}", links)
if __name__ == "__main__":
    # Command-line entry point: both the archive location and the output
    # directory are mandatory options, then the conversion is run once.
    parser = argparse.ArgumentParser()
    cli_options = (
        ('-a', '--archive', "ActivityPub archive"),
        ('-o', '--output', "Output directory"),
    )
    for short_flag, long_flag, description in cli_options:
        parser.add_argument(
            short_flag, long_flag, required=True, type=str, help=description
        )
    options = parser.parse_args()
    apub2gmi(options.archive, options.output)