Compare commits

...

2 Commits

Author SHA1 Message Date
Mike Lynch 60b3dc619e Updated README.md 2024-05-04 14:40:14 +10:00
Mike Lynch f6d6a5135e Moved the regexps out to a config file 2024-05-04 14:38:10 +10:00
3 changed files with 34 additions and 30 deletions

View File

@ -2,4 +2,6 @@
This is a script which takes an archive exported from a Mastodon account, looks for media attachments and uses them to build an archive for a Gemini server.
I use it to update the [Glossatory archives](gemini://gemini.mikelynch.org/glossatory/), and there are still a few things hard-coded in it which I haven't moved out to a config file, specifically the regular expressions which match attachment URLs and pull bits of text out for the index links.
I use it to update the [Glossatory archives](gemini://gemini.mikelynch.org/glossatory/). It assumes
that every media attachment has a name made up of two parts, which are pulled
out of it and used as the human-readable text of the index links.

View File

@ -26,28 +26,10 @@ MNAMES = {
}
HEADER = """This is the archive of GLOSSATORY, an illustrated companion to a bot which generates dictionary entries based on an RNN trained on the WordNet lexical database.
=> https://weirder.earth/@GLOSSATORY Follow the drawings on Mastodon
=> https://botsin.space/@GLOSSATORY Follow the words on Mastodon
=> https://oulipo.social/@GLOSSATORY Follow the words without the letter "e"
=> / Back to Mike's gemini
=> /glossatory/ Glossatory archive home
"""
URL_RE = re.compile(r"^/weirderearth/(.*)$")
NAME_RES = [
re.compile(r"^(.*?)\.\s*(.*)$"),
re.compile(r"^(.*?)\s*(The drawing.*)$"),
re.compile(r"^A line drawing depicting (.*)$"),
re.compile(r"^(.*?): (.*)$"),
]
class MediaPost():
def __init__(self, year, month, day, file, title):
def __init__(self, name_res, year, month, day, file, title):
self.name_res = name_res
self.year = year
self.month = month
self.day = day
@ -65,7 +47,7 @@ class MediaPost():
copy(self.file, target)
def try_parse(self):
for re in NAME_RES:
for re in self.name_res:
if m := re.match(self.title):
self.defn = m.group(1)
if len(m.groups()) == 2:
@ -75,19 +57,19 @@ class MediaPost():
self.defn = self.title
# Build a MediaPost from one "object" entry of the outbox: the publication
# date is split into year/month/day strings and the first attachment supplies
# the file path and the title.
# NOTE(review): this listing is a diff view without +/- markers. Where two
# near-identical lines appear in a row, the first looks like the removed line
# and the second like its replacement -- confirm against the repository.
def process_post(archive, obj):
def process_post(cf, archive, obj):
# Only the first 10 characters of "published" are parsed, as YYYY-MM-DD.
date = datetime.datetime.strptime(obj["published"][:10], "%Y-%m-%d")
# Kept as strings; month and day are zero-padded to two digits.
year = f"{date.year}"
month = f"{date.month:02}"
day = f"{date.day:02}"
# Only the first attachment of the post is used.
attachment = obj["attachment"][0]
url = attachment["url"]
# Group 1 of the URL pattern is joined onto the archive directory to give
# the attachment's file path.
if m:= URL_RE.match(url):
if m:= cf["url_re"].match(url):
file = Path(archive) / m.group(1)
else:
# A URL the pattern does not match is reported as a ValueError.
raise ValueError(f"Couldn't match url {url}")
# The attachment's "name" is handed to MediaPost as its title.
alt = attachment["name"]
return MediaPost(year, month, day, file, alt)
return MediaPost(cf["title_res"], year, month, day, file, alt)
def ensure_dir(gmdir):
@ -102,6 +84,15 @@ def load_colophon(cfile):
return "".join(colophon)
return None
def load_config(config):
    """Load the JSON config file and compile the regular expressions in it.

    Args:
        config: Path of a JSON file holding a "url_re" pattern string and a
            "title_res" list of pattern strings.

    Returns:
        The parsed config dict, with "url_re" replaced by a compiled pattern
        and "title_res" replaced by a list of compiled patterns. Any other
        keys are returned untouched.

    Raises:
        OSError: if the file cannot be opened.
        json.JSONDecodeError: if the file is not valid JSON.
        KeyError: if "url_re" or "title_res" is missing.
        re.error: if one of the patterns does not compile.
    """
    # JSON text is UTF-8; reading it with the locale's default encoding can
    # garble non-ASCII characters in the patterns on some platforms.
    with open(config, "r", encoding="utf-8") as cfh:
        cf = json.load(cfh)
    cf["url_re"] = re.compile(cf["url_re"])
    cf["title_res"] = [re.compile(pattern) for pattern in cf["title_res"]]
    return cf
def write_gemfile(gmdir, colophon, title, items):
ensure_dir(gmdir)
gmi = gmdir / "index.gmi"
@ -114,14 +105,14 @@ def write_gemfile(gmdir, colophon, title, items):
gfh.write(f"=> {link} {text}\n")
def apub2gmi(archive, output, colophon):
def apub2gmi(cf, archive, output, colophon):
with open(f"{archive}/outbox.json", "r") as fh:
js = json.load(fh)
posts = {}
for item in js["orderedItems"]:
if item["type"] == "Create":
try:
post = process_post(archive, item["object"])
post = process_post(cf, archive, item["object"])
if not post.year in posts:
posts[post.year] = {}
if not post.month in posts[post.year]:
@ -169,9 +160,13 @@ if __name__ == "__main__":
'-o', '--output', required=True, type=str, help="Output directory"
)
ap.add_argument(
'-c', '--colophon', required=False, type=str,
'-c', '--config', required=True, type=str, help="Config file"
)
ap.add_argument(
'-t', '--text', required=False, type=str,
help="File with text to be included at the top of each index page"
)
args = ap.parse_args()
colophon = load_colophon(args.colophon)
apub2gmi(args.archive, args.output, colophon)
cf = load_config(args.config)
colophon = load_colophon(args.text)
apub2gmi(cf, args.archive, args.output, colophon)

7
config.json 100644
View File

@ -0,0 +1,7 @@
{
"url_re": "^/some_pattern/(.*)$",
"title_res": [
"^(.*?)\\.\\s*(.*)$",
"^(.*?): (.*)$"
]
}