Made the script more mature, updated the media attachment RE

main
Mike Lynch 2024-05-04 13:52:19 +10:00
parent 4e3405727d
commit 564f366ae2
1 changed files with 59 additions and 51 deletions

View File

@ -3,8 +3,9 @@
# convert the Glossatory archive from an ActivityPub collection to
# gemini
import json
import argparse
import datetime
import json
import re
from pathlib import Path
from shutil import copy
@ -24,8 +25,6 @@ MNAMES = {
"12": "December",
}
ARCHIVE = "archive-20230604031441-05906d0df7f3f14777089c2fd7d0175a"
OUTDIR = "gemini"
HEADER = """This is the archive of GLOSSATORY, an illustrated companion to a bot which generates dictionary entries based on an RNN trained on the WordNet lexical database.
@ -37,7 +36,7 @@ HEADER = """This is the archive of GLOSSATORY, an illustrated companion to a bot
"""
URL_RE = re.compile(r"^/files\.weirder\.earth/(.*)$")
URL_RE = re.compile(r"^/weirderearth/(.*)$")
NAME_RES = [
re.compile(r"^(.*?)\.\s*(.*)$"),
re.compile(r"^(.*?)\s*(The drawing.*)$"),
@ -76,7 +75,7 @@ class GlossatoryPost():
self.defn = self.title
def process_post(obj):
def process_post(archive, obj):
date = datetime.datetime.strptime(obj["published"][:10], "%Y-%m-%d")
year = f"{date.year}"
month = f"{date.month:02}"
@ -84,7 +83,7 @@ def process_post(obj):
attachment = obj["attachment"][0]
url = attachment["url"]
if m:= URL_RE.match(url):
file = Path(ARCHIVE) / m.group(1)
file = Path(archive) / m.group(1)
else:
raise ValueError(f"Couldn't match url {url}")
alt = attachment["name"]
@ -106,50 +105,59 @@ def write_gemfile(gmdir, title, items):
gfh.write(f"=> {link} {text}\n")
def apub2gmi(archive, output):
    """Convert an ActivityPub archive into a tree of gemini index pages.

    Reads ``outbox.json`` from *archive*, turns every ``Create`` activity
    into a post via ``process_post`` and groups the posts by year, month
    and day.  It then writes, through ``write_gemfile``:

    * a top-level index under *output* linking to each year,
    * one index per year linking to each month,
    * one index per month linking to every post of that month.

    A ``<year>/<month>/<day>`` directory is created for each day that has
    posts, and ``copy_image`` is called on each post with *output*.

    Args:
        archive: Path of the unpacked ActivityPub archive directory.
        output: Path of the directory the gemini tree is written to.

    Items that fail to process are reported on stdout and skipped, so a
    single malformed post does not abort the whole conversion.
    """
    # JSON is UTF-8 by specification; do not depend on the locale default.
    with open(Path(archive) / "outbox.json", "r", encoding="utf-8") as fh:
        js = json.load(fh)

    # posts[year][month][day] -> list of posts, in outbox order.
    posts = {}
    for item in js["orderedItems"]:
        if item["type"] != "Create":
            continue
        try:
            post = process_post(archive, item["object"])
            by_month = posts.setdefault(post.year, {})
            by_day = by_month.setdefault(post.month, {})
            by_day.setdefault(post.day, []).append(post)
        except Exception as e:
            # Deliberately broad: best-effort conversion, report and go on.
            # .get() keeps the handler itself from raising on a missing id.
            i = item.get("id", "<no id>")
            print(f"Processing failed: {i}: {e}")

    root = Path(output)
    years = [(f"{year}/", year) for year in posts]
    write_gemfile(root, "Glossatory", years)

    for year, by_month in posts.items():
        ydir = root / year
        months = [(f"{month}/", MNAMES[month]) for month in by_month]
        write_gemfile(ydir, year, months)

        for month, by_day in by_month.items():
            mname = MNAMES[month]
            mdir = ydir / month

            # Make the per-day directories before handing posts their
            # image destination.
            for day, day_posts in by_day.items():
                (mdir / day).mkdir(parents=True, exist_ok=True)
                for post in day_posts:
                    post.copy_image(output)

            # First link points back to the year index; the absolute
            # "/glossatory/" prefix is hard-coded here.
            links = [(f"/glossatory/{year}/", year)]
            for day, day_posts in by_day.items():
                for post in day_posts:
                    links.append((f"{day}/{post.fname}", post.title))
            write_gemfile(mdir, f"{mname} {year}", links)
# NOTE(review): these statements mirror the body of apub2gmi() but read the
# module-level ARCHIVE and OUTDIR constants and call process_post() with a
# single argument. They appear to be the pre-change (removed) side of the
# diff - confirm before treating them as live code.
# Load the outbox and group every "Create" activity's post by year, month
# and day.
with open(f"{ARCHIVE}/outbox.json", "r") as fh:
js = json.load(fh)
posts = {}
for item in js["orderedItems"]:
if item["type"] == "Create":
try:
post = process_post(item["object"])
if not post.year in posts:
posts[post.year] = {}
if not post.month in posts[post.year]:
posts[post.year][post.month] = {}
m = posts[post.year][post.month]
if not post.day in m:
m[post.day] = [ post ]
else:
m[post.day].append(post)
except Exception as e:
# Best-effort: report the failing item and carry on with the rest.
i = item["id"]
print(f"Processing failed: {i}: {e}")
# Top-level index: one link per year.
years = [ ( f"{year}/", year ) for year in posts ]
write_gemfile(Path(OUTDIR), "Glossatory", years)
for year in posts:
# Year index: one link per month, labelled with the month name.
ydir = Path(OUTDIR) / year
months = [ ( f"{month}/", MNAMES[month] ) for month in posts[year] ]
write_gemfile(ydir, year, months)
for month in posts[year]:
mname = MNAMES[month]
mdir = ydir / month
# Create each day directory, then let every post copy its image.
for day in posts[year][month]:
ddir = mdir / day
ddir.mkdir(parents=True, exist_ok=True)
for post in posts[year][month][day]:
post.copy_image(OUTDIR)
# NOTE(review): gmi is assigned but not used in the visible code.
gmi = mdir / "index.gmi"
# Month index: a link back to the year, then one link per post.
links = [
( f"/glossatory/{year}/", year),
]
for day in posts[year][month]:
for post in posts[year][month][day]:
links.append((f"{day}/{post.fname}", post.title))
write_gemfile(mdir, f"{mname} {year}", links)
if __name__ == "__main__":
    # Command-line entry point: both the archive location and the output
    # directory are mandatory options.
    cli = argparse.ArgumentParser()
    for short_flag, long_flag, description in (
        ("-a", "--archive", "ActivityPub archive"),
        ("-o", "--output", "Output directory"),
    ):
        cli.add_argument(
            short_flag, long_flag, required=True, type=str, help=description
        )
    opts = cli.parse_args()
    apub2gmi(opts.archive, opts.output)