Compare commits

..

2 Commits

Author SHA1 Message Date
Mike Lynch 60b3dc619e Updated README.md 2024-05-04 14:40:14 +10:00
Mike Lynch f6d6a5135e Moved the regexps out to a config file 2024-05-04 14:38:10 +10:00
3 changed files with 34 additions and 30 deletions

View File

@ -2,4 +2,6 @@
This is a script which takes an archive exported from a Mastodon account, looks for media attachments and uses them to build an archive for a Gemini server. This is a script which takes an archive exported from a Mastodon account, looks for media attachments and uses them to build an archive for a Gemini server.
I use it to update the [Glossatory archives](gemini://gemini.mikelynch.org/glossatory/) and there's still a few things hard-coded in it which I haven't moved out to a config file, specifically the regular expressions which match attachment URLs and pull bits of text out for the index links. I use it to update the [Glossatory archives](gemini://gemini.mikelynch.org/glossatory/). It assumes that every media attachment has a name containing
two parts, which are pulled out of it and used as the human-readable text of the
index links.

View File

@ -26,28 +26,10 @@ MNAMES = {
} }
HEADER = """This is the archive of GLOSSATORY, an illustrated companion to a bot which generates dictionary entries based on an RNN trained on the WordNet lexical database.
=> https://weirder.earth/@GLOSSATORY Follow the drawings on Mastodon
=> https://botsin.space/@GLOSSATORY Follow the words on Mastodon
=> https://oulipo.social/@GLOSSATORY Follow the words without the letter "e"
=> / Back to Mike's gemini
=> /glossatory/ Glossatory archive home
"""
URL_RE = re.compile(r"^/weirderearth/(.*)$")
NAME_RES = [
re.compile(r"^(.*?)\.\s*(.*)$"),
re.compile(r"^(.*?)\s*(The drawing.*)$"),
re.compile(r"^A line drawing depicting (.*)$"),
re.compile(r"^(.*?): (.*)$"),
]
class MediaPost(): class MediaPost():
def __init__(self, year, month, day, file, title): def __init__(self, name_res, year, month, day, file, title):
self.name_res = name_res
self.year = year self.year = year
self.month = month self.month = month
self.day = day self.day = day
@ -65,7 +47,7 @@ class MediaPost():
copy(self.file, target) copy(self.file, target)
def try_parse(self): def try_parse(self):
for re in NAME_RES: for re in self.name_res:
if m := re.match(self.title): if m := re.match(self.title):
self.defn = m.group(1) self.defn = m.group(1)
if len(m.groups()) == 2: if len(m.groups()) == 2:
@ -75,19 +57,19 @@ class MediaPost():
self.defn = self.title self.defn = self.title
def process_post(archive, obj): def process_post(cf, archive, obj):
date = datetime.datetime.strptime(obj["published"][:10], "%Y-%m-%d") date = datetime.datetime.strptime(obj["published"][:10], "%Y-%m-%d")
year = f"{date.year}" year = f"{date.year}"
month = f"{date.month:02}" month = f"{date.month:02}"
day = f"{date.day:02}" day = f"{date.day:02}"
attachment = obj["attachment"][0] attachment = obj["attachment"][0]
url = attachment["url"] url = attachment["url"]
if m:= URL_RE.match(url): if m:= cf["url_re"].match(url):
file = Path(archive) / m.group(1) file = Path(archive) / m.group(1)
else: else:
raise ValueError(f"Couldn't match url {url}") raise ValueError(f"Couldn't match url {url}")
alt = attachment["name"] alt = attachment["name"]
return MediaPost(year, month, day, file, alt) return MediaPost(cf["title_res"], year, month, day, file, alt)
def ensure_dir(gmdir): def ensure_dir(gmdir):
@ -102,6 +84,15 @@ def load_colophon(cfile):
return "".join(colophon) return "".join(colophon)
return None return None
def load_config(config):
    """Load the JSON config file and compile its regular expressions.

    The file must contain a JSON object with two keys:

    * ``"url_re"`` - a single pattern string
    * ``"title_res"`` - a list of pattern strings

    Both are replaced in the returned dict by their compiled equivalents;
    any other keys in the file are passed through untouched.

    Args:
        config: Path to the JSON config file.

    Returns:
        The parsed config dict, with ``"url_re"`` as a compiled pattern and
        ``"title_res"`` as a list of compiled patterns.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file is not valid JSON.
        KeyError: If ``"url_re"`` or ``"title_res"`` is missing.
        re.error: If one of the patterns is not a valid regular expression.
    """
    # Pin the encoding: without it open() uses the platform locale, which
    # garbles non-ASCII characters in the patterns on non-UTF-8 systems.
    with open(config, "r", encoding="utf-8") as cfh:
        cf = json.load(cfh)
    cf["url_re"] = re.compile(cf["url_re"])
    cf["title_res"] = [re.compile(pattern) for pattern in cf["title_res"]]
    return cf
def write_gemfile(gmdir, colophon, title, items): def write_gemfile(gmdir, colophon, title, items):
ensure_dir(gmdir) ensure_dir(gmdir)
gmi = gmdir / "index.gmi" gmi = gmdir / "index.gmi"
@ -114,14 +105,14 @@ def write_gemfile(gmdir, colophon, title, items):
gfh.write(f"=> {link} {text}\n") gfh.write(f"=> {link} {text}\n")
def apub2gmi(archive, output, colophon): def apub2gmi(cf, archive, output, colophon):
with open(f"{archive}/outbox.json", "r") as fh: with open(f"{archive}/outbox.json", "r") as fh:
js = json.load(fh) js = json.load(fh)
posts = {} posts = {}
for item in js["orderedItems"]: for item in js["orderedItems"]:
if item["type"] == "Create": if item["type"] == "Create":
try: try:
post = process_post(archive, item["object"]) post = process_post(cf, archive, item["object"])
if not post.year in posts: if not post.year in posts:
posts[post.year] = {} posts[post.year] = {}
if not post.month in posts[post.year]: if not post.month in posts[post.year]:
@ -169,9 +160,13 @@ if __name__ == "__main__":
'-o', '--output', required=True, type=str, help="Output directory" '-o', '--output', required=True, type=str, help="Output directory"
) )
ap.add_argument( ap.add_argument(
'-c', '--colophon', required=False, type=str, '-c', '--config', required=True, type=str, help="Config file"
)
ap.add_argument(
'-t', '--text', required=False, type=str,
help="File with text to be included at the top of each index page" help="File with text to be included at the top of each index page"
) )
args = ap.parse_args() args = ap.parse_args()
colophon = load_colophon(args.colophon) cf = load_config(args.config)
apub2gmi(args.archive, args.output, colophon) colophon = load_colophon(args.text)
apub2gmi(cf, args.archive, args.output, colophon)

7
config.json 100644
View File

@ -0,0 +1,7 @@
{
"url_re": "^/some_pattern/(.*)$",
"title_res": [
"^(.*?)\\.\\s*(.*)$",
"^(.*?): (.*)$"
]
}