Merge pull request 'feature-licence-and-better-docs' (#1) from feature-licence-and-better-docs into main

Reviewed-on: #1
This commit is contained in:
bombinans 2025-06-08 09:42:38 +00:00
commit 71f1856199
2 changed files with 86 additions and 21 deletions

View File

@ -2,6 +2,51 @@
This is a script which takes an archive exported from a Mastodon account, looks for media attachments and uses them to build an archive for a Gemini server.
I use it to update the [Glossatory archives](gemini://gemini.mikelynch.org/glossatory/) from my account [@GLOSSATORY](https://weirder.earth/@GLOSSATORY).
It builds a hierarchy of folders with the structure YYYY/MM/DD and copies
media attachments into the appropriate day's folder.
It also adds index.gmi files at each level. Index files at the top and year
levels have links to the next level down. Index files at the month level have
links to all of the attachments for that month.
It assumes that there's only one media attachment per post. If it finds a post with more than one attachment it will only copy the first and issue a warning.
The alt-text is used as the title of each image in the month-level index file. If you want to only use part of the alt-text, you can provide a list of regular expressions in the config file which will be matched against it.
## Usage
python apub2gmi.py --archive PATH_OF_YOUR_ACTIVITYPUB_ARCHIVE/ --output GEMINI_OUTPUT --config CONFIG.json [--text OPTIONAL_COLOPHON_TEXT] [--debug]
## Config
The configuration file is JSON as follows:
```
{
    "url_re": "^/YOUR_SERVERS_MEDIA_ATTACHMENT_PATH/(.*)$",
    "title_res": [
        "^(.*?)\\.\\s*(.*)$"
    ]
}
```

`url_re` should match the URLs of media attachments in the ActivityPub JSON. This will depend on your server - here's an example from the GLOSSATORY archive:

```
"attachment":
    [
        {
            "type": "Document",
            "mediaType": "image/jpeg",
            "url": "/weirderearth/media_attachments/files/105/839/131/582/626/008/original/9e2423c3ffd70dd0.jpeg",
            "name": "BILLET: an unmarried working person (often used for making tying) The drawing depicts a person seated at a bench tying knots in a long cord.",
            "blurhash": "U2Ss50M{~qt7-;t7IUt7_3-;RjM{RjD%-;WB",
            "width": 1280,
            "height": 1280
        }
    ],
```

`title_res` is optional: it's a list of Python regular expressions which will be matched against the alt-text. The text used for the index page is the `()` group or groups from the first regexp which matches. If there's more than one group in the re, the results are joined with spaces.

View File

@ -9,6 +9,7 @@ import json
import re import re
from pathlib import Path from pathlib import Path
from shutil import copy from shutil import copy
import sys
MNAMES = { MNAMES = {
"01": "January", "01": "January",
@ -28,40 +29,52 @@ MNAMES = {
class MediaPost(): class MediaPost():
def __init__(self, name_res, year, month, day, file, title): def __init__(self, year, month, day, file, title, title_res=None):
self.name_res = name_res
self.year = year self.year = year
self.month = month self.month = month
self.day = day self.day = day
self.file = file self.file = file
self.fname = Path(file).name self.fname = Path(file).name
self.title = title self.title = title
self.title_res = title_res
self.defn = "" self.defn = ""
self.desc = "" self.desc = ""
self.try_parse() if self.title_res:
self.try_parse_title()
def __str__(self):
return f"{self.year}-{self.month}-{self.day}: {self.file}"
def copy_image(self, root): def copy_image(self, root):
d = Path(root) / self.year / self.month / self.day d = Path(root) / self.year / self.month / self.day
target = d / self.fname target = d / self.fname
if not target.exists(): try:
copy(self.file, target) if not target.exists():
copy(self.file, target)
except FileNotFoundError as e:
print(f"Image file missing: {self}", file=sys.stderr)
def try_parse(self): def try_parse_title(self):
for re in self.name_res: for re in self.title_res:
if m := re.match(self.title): if m := re.match(self.title):
self.defn = m.group(1) self.title = ' '.join(m.groups())
if len(m.groups()) == 2:
self.desc = m.group(2)
return return
print(f"{self.file} Couldn't match title {self.title}") print(f"{self.file} Couldn't match alt text {self.title}", file=sys.stderr)
self.defn = self.title
def process_post(cf, archive, obj): def process_post(cf, archive, obj, debug=False):
if debug:
print(f"Processing {obj}", file=sys.stderr)
date = datetime.datetime.strptime(obj["published"][:10], "%Y-%m-%d") date = datetime.datetime.strptime(obj["published"][:10], "%Y-%m-%d")
year = f"{date.year}" year = f"{date.year}"
month = f"{date.month:02}" month = f"{date.month:02}"
day = f"{date.day:02}" day = f"{date.day:02}"
if "attachment" not in obj or len(obj["attachment"]) < 1:
raise ValueError('No attachments on this status')
if len(obj["attachment"]) > 1:
status_id = obj["id"]
n = len(obj["attachment"])
print(f"Warning: only one media item copied from post {status_id} which has {n}", file=sys.stderr)
attachment = obj["attachment"][0] attachment = obj["attachment"][0]
url = attachment["url"] url = attachment["url"]
if m:= cf["url_re"].match(url): if m:= cf["url_re"].match(url):
@ -69,7 +82,7 @@ def process_post(cf, archive, obj):
else: else:
raise ValueError(f"Couldn't match url {url}") raise ValueError(f"Couldn't match url {url}")
alt = attachment["name"] alt = attachment["name"]
return MediaPost(cf["title_res"], year, month, day, file, alt) return MediaPost(year, month, day, file, alt, cf.get("title_res", None))
def ensure_dir(gmdir): def ensure_dir(gmdir):
@ -89,7 +102,8 @@ def load_config(config):
with open(config, "r") as cfh: with open(config, "r") as cfh:
cf = json.load(cfh) cf = json.load(cfh)
cf["url_re"] = re.compile(cf["url_re"]) cf["url_re"] = re.compile(cf["url_re"])
cf["title_res"] = [ re.compile(r) for r in cf["title_res"] ] if "title_res" in cf:
cf["title_res"] = [ re.compile(r) for r in cf["title_res"] ]
return cf return cf
@ -105,14 +119,16 @@ def write_gemfile(gmdir, colophon, title, items):
gfh.write(f"=> {link} {text}\n") gfh.write(f"=> {link} {text}\n")
def apub2gmi(cf, archive, output, colophon): def apub2gmi(cf, archive, output, colophon, debug=False):
with open(f"{archive}/outbox.json", "r") as fh: with open(f"{archive}/outbox.json", "r") as fh:
js = json.load(fh) js = json.load(fh)
posts = {} posts = {}
for item in js["orderedItems"]: for item in js["orderedItems"]:
if item["type"] == "Create": if item["type"] == "Create":
if debug:
print(item)
try: try:
post = process_post(cf, archive, item["object"]) post = process_post(cf, archive, item["object"], debug)
if not post.year in posts: if not post.year in posts:
posts[post.year] = {} posts[post.year] = {}
if not post.month in posts[post.year]: if not post.month in posts[post.year]:
@ -166,7 +182,11 @@ if __name__ == "__main__":
'-t', '--text', required=False, type=str, '-t', '--text', required=False, type=str,
help="File with text to be included at the top of each index page" help="File with text to be included at the top of each index page"
) )
ap.add_argument(
'-d', '--debug', action="store_true", default=False,
help="Print debug output"
)
args = ap.parse_args() args = ap.parse_args()
cf = load_config(args.config) cf = load_config(args.config)
colophon = load_colophon(args.text) colophon = load_colophon(args.text)
apub2gmi(cf, args.archive, args.output, colophon) apub2gmi(cf, args.archive, args.output, colophon, args.debug)