Made the script more mature, updated the media attachment RE

main
Mike Lynch 2024-05-04 13:52:19 +10:00
parent 4e3405727d
commit 564f366ae2
1 changed files with 59 additions and 51 deletions

View File

@ -3,8 +3,9 @@
# convert the Glossatory archive from an ActivityPub collection to # convert the Glossatory archive from an ActivityPub collection to
# gemini # gemini
import json import argparse
import datetime import datetime
import json
import re import re
from pathlib import Path from pathlib import Path
from shutil import copy from shutil import copy
@ -24,8 +25,6 @@ MNAMES = {
"12": "December", "12": "December",
} }
ARCHIVE = "archive-20230604031441-05906d0df7f3f14777089c2fd7d0175a"
OUTDIR = "gemini"
HEADER = """This is the archive of GLOSSATORY, an illustrated companion to a bot which generates dictionary entries based on an RNN trained on the WordNet lexical database. HEADER = """This is the archive of GLOSSATORY, an illustrated companion to a bot which generates dictionary entries based on an RNN trained on the WordNet lexical database.
@ -37,7 +36,7 @@ HEADER = """This is the archive of GLOSSATORY, an illustrated companion to a bot
""" """
URL_RE = re.compile(r"^/files\.weirder\.earth/(.*)$") URL_RE = re.compile(r"^/weirderearth/(.*)$")
NAME_RES = [ NAME_RES = [
re.compile(r"^(.*?)\.\s*(.*)$"), re.compile(r"^(.*?)\.\s*(.*)$"),
re.compile(r"^(.*?)\s*(The drawing.*)$"), re.compile(r"^(.*?)\s*(The drawing.*)$"),
@ -76,7 +75,7 @@ class GlossatoryPost():
self.defn = self.title self.defn = self.title
def process_post(obj): def process_post(archive, obj):
date = datetime.datetime.strptime(obj["published"][:10], "%Y-%m-%d") date = datetime.datetime.strptime(obj["published"][:10], "%Y-%m-%d")
year = f"{date.year}" year = f"{date.year}"
month = f"{date.month:02}" month = f"{date.month:02}"
@ -84,7 +83,7 @@ def process_post(obj):
attachment = obj["attachment"][0] attachment = obj["attachment"][0]
url = attachment["url"] url = attachment["url"]
if m:= URL_RE.match(url): if m:= URL_RE.match(url):
file = Path(ARCHIVE) / m.group(1) file = Path(archive) / m.group(1)
else: else:
raise ValueError(f"Couldn't match url {url}") raise ValueError(f"Couldn't match url {url}")
alt = attachment["name"] alt = attachment["name"]
@ -106,50 +105,59 @@ def write_gemfile(gmdir, title, items):
gfh.write(f"=> {link} {text}\n") gfh.write(f"=> {link} {text}\n")
def apub2gmi(archive, output):
    """Convert an ActivityPub archive into a tree of gemtext index pages.

    Reads ``outbox.json`` from *archive*, turns every ``Create`` activity
    into a post with ``process_post``, and groups the posts by year, month
    and day.  It then writes a top-level index under *output*, one index per
    year and one per month, and asks each post to copy its image into the
    output tree.

    Args:
        archive: Directory of the unpacked ActivityPub archive; it must
            contain ``outbox.json``.
        output: Directory the indexes and per-day directories are written
            under.

    Items that fail to process are reported on stdout and skipped, so one
    bad post does not abort the whole conversion.
    """
    # JSON is UTF-8 by specification; do not depend on the platform default.
    with open(f"{archive}/outbox.json", "r", encoding="utf-8") as fh:
        js = json.load(fh)

    # posts[year][month][day] -> list of posts, in outbox order.
    posts = {}
    for item in js["orderedItems"]:
        if item["type"] != "Create":
            continue
        try:
            post = process_post(archive, item["object"])
            days = posts.setdefault(post.year, {}).setdefault(post.month, {})
            days.setdefault(post.day, []).append(post)
        except Exception as e:
            # Deliberately broad: this is a best-effort batch conversion.
            # .get() keeps the handler itself from raising on a missing id.
            i = item.get("id")
            print(f"Processing failed: {i}: {e}")

    outdir = Path(output)
    years = [(f"{year}/", year) for year in posts]
    write_gemfile(outdir, "Glossatory", years)

    for year, by_month in posts.items():
        ydir = outdir / year
        months = [(f"{month}/", MNAMES[month]) for month in by_month]
        write_gemfile(ydir, year, months)

        for month, by_day in by_month.items():
            mname = MNAMES[month]
            mdir = ydir / month

            # Make every day directory and copy its images before the
            # month index that links to them is written.
            for day, day_posts in by_day.items():
                (mdir / day).mkdir(parents=True, exist_ok=True)
                for post in day_posts:
                    post.copy_image(output)

            # The first link of each month page points back to the year.
            links = [(f"/glossatory/{year}/", year)]
            for day, day_posts in by_day.items():
                for post in day_posts:
                    links.append((f"{day}/{post.fname}", post.title))
            write_gemfile(mdir, f"{mname} {year}", links)
if __name__ == "__main__":
with open(f"{ARCHIVE}/outbox.json", "r") as fh: ap = argparse.ArgumentParser()
js = json.load(fh) ap.add_argument(
posts = {} '-a', '--archive', required=True, type=str, help="ActivityPub archive"
for item in js["orderedItems"]: )
if item["type"] == "Create": ap.add_argument(
try: '-o', '--output', required=True, type=str, help="Output directory"
post = process_post(item["object"]) )
if not post.year in posts: args = ap.parse_args()
posts[post.year] = {} apub2gmi(args.archive, args.output)
if not post.month in posts[post.year]:
posts[post.year][post.month] = {}
m = posts[post.year][post.month]
if not post.day in m:
m[post.day] = [ post ]
else:
m[post.day].append(post)
except Exception as e:
i = item["id"]
print(f"Processing failed: {i}: {e}")
years = [ ( f"{year}/", year ) for year in posts ]
write_gemfile(Path(OUTDIR), "Glossatory", years)
for year in posts:
ydir = Path(OUTDIR) / year
months = [ ( f"{month}/", MNAMES[month] ) for month in posts[year] ]
write_gemfile(ydir, year, months)
for month in posts[year]:
mname = MNAMES[month]
mdir = ydir / month
for day in posts[year][month]:
ddir = mdir / day
ddir.mkdir(parents=True, exist_ok=True)
for post in posts[year][month][day]:
post.copy_image(OUTDIR)
gmi = mdir / "index.gmi"
links = [
( f"/glossatory/{year}/", year),
]
for day in posts[year][month]:
for post in posts[year][month][day]:
links.append((f"{day}/{post.fname}", post.title))
write_gemfile(mdir, f"{mname} {year}", links)