Added Licence, made the title_re optional, completed docs
This commit is contained in:
parent
d2a39ea202
commit
39057d7a4c
51
README.md
51
README.md
@ -2,8 +2,51 @@
|
|||||||
|
|
||||||
This is a script which takes an archive exported from a Mastodon account, looks for media attachments and uses them to build an archive for a Gemini server.
|
This is a script which takes an archive exported from a Mastodon account, looks for media attachments and uses them to build an archive for a Gemini server.
|
||||||
|
|
||||||
I use it to update the [Glossatory archives](gemini://gemini.mikelynch.org/glossatory/) and it assumes that all of the media attachments have a name with
|
I use it to update the [Glossatory archives](gemini://gemini.mikelynch.org/glossatory/) from my account [@GLOSSATORY](https://weirder.earth/@GLOSSATORY).
|
||||||
two parts you want to pull out of it and use as the human-readable form of the
|
|
||||||
index.
|
|
||||||
|
|
||||||
Pushing a minor change as a test
|
It builds a hierarchy of folders with the structure YYYY/MM/DD and copies
|
||||||
|
media attachments into the appropriate day's folder.
|
||||||
|
|
||||||
|
It also adds index.gmi files at each level. Index files at the top and year
|
||||||
|
levels have links to the next level down. Index files at the month level have
|
||||||
|
links to all of the attachments for that month.
|
||||||
|
|
||||||
|
It assumes that there's only one media attachment per post. If it finds a post with more than one attachment, it will copy only the first and issue a warning.
|
||||||
|
|
||||||
|
The alt-text is used as the title of each image in the month-level index file. If you want to use only part of the alt-text, you can provide a list of regular expressions in the config file which will be matched against it.
|
||||||
|
|
||||||
|
## Usage
|
||||||
|
|
||||||
|
python apub2gmi.py --archive PATH_OF_YOUR_ACTIVITYPUB_ARCHIVE/ --output GEMINI_OUTPUT --config CONFIG.json [--text OPTIONAL_COLOPHON_TEXT] [--debug]
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
## Config
|
||||||
|
|
||||||
|
The configuration file is JSON as follows:
|
||||||
|
|
||||||
|
```
|
||||||
|
{
|
||||||
|
"url_re": "^/YOUR_SERVERS_MEDIA_ATTACHMENT_PATH/(.*)$",
|
||||||
|
"title_res": [
|
||||||
|
"^(.*?)\\.\\s*(.*)$"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
`url_re` should match the URLs of media attachments in the ActivityPub JSON. This will depend on your server - here's an example from the GLOSSATORY archive.
|
||||||
|
|
||||||
|
"attachment":
|
||||||
|
[
|
||||||
|
{
|
||||||
|
"type": "Document",
|
||||||
|
"mediaType": "image/jpeg",
|
||||||
|
"url": "/weirderearth/media_attachments/files/105/839/131/582/626/008/original/9e2423c3ffd70dd0.jpeg",
|
||||||
|
"name": "BILLET: an unmarried working person (often used for making tying) The drawing depicts a person seated at a bench tying knots in a long cord.",
|
||||||
|
"blurhash": "U2Ss50M{~qt7-;t7IUt7_3-;RjM{RjD%-;WB",
|
||||||
|
"width": 1280,
|
||||||
|
"height": 1280
|
||||||
|
}
|
||||||
|
],
|
||||||
|
|
||||||
|
`title_res` is optional: it's a list of Python regular expressions which will be matched against the alt-text. The text used for the index page is the `()` group or groups from the first regexp which matches. If there's more than one group in the regexp, the results are joined with spaces.
|
50
apub2gmi.py
50
apub2gmi.py
@ -9,6 +9,7 @@ import json
|
|||||||
import re
|
import re
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from shutil import copy
|
from shutil import copy
|
||||||
|
import sys
|
||||||
|
|
||||||
MNAMES = {
|
MNAMES = {
|
||||||
"01": "January",
|
"01": "January",
|
||||||
@ -28,40 +29,52 @@ MNAMES = {
|
|||||||
|
|
||||||
|
|
||||||
class MediaPost():
|
class MediaPost():
|
||||||
def __init__(self, name_res, year, month, day, file, title):
|
def __init__(self, year, month, day, file, title, title_res=None):
|
||||||
self.name_res = name_res
|
|
||||||
self.year = year
|
self.year = year
|
||||||
self.month = month
|
self.month = month
|
||||||
self.day = day
|
self.day = day
|
||||||
self.file = file
|
self.file = file
|
||||||
self.fname = Path(file).name
|
self.fname = Path(file).name
|
||||||
self.title = title
|
self.title = title
|
||||||
|
self.title_res = title_res
|
||||||
self.defn = ""
|
self.defn = ""
|
||||||
self.desc = ""
|
self.desc = ""
|
||||||
self.try_parse()
|
if self.title_res:
|
||||||
|
self.try_parse_title()
|
||||||
|
|
||||||
|
def __str__(self):
|
||||||
|
return f"{self.year}-{self.month}-{self.day}: {self.file}"
|
||||||
|
|
||||||
def copy_image(self, root):
|
def copy_image(self, root):
|
||||||
d = Path(root) / self.year / self.month / self.day
|
d = Path(root) / self.year / self.month / self.day
|
||||||
target = d / self.fname
|
target = d / self.fname
|
||||||
|
try:
|
||||||
if not target.exists():
|
if not target.exists():
|
||||||
copy(self.file, target)
|
copy(self.file, target)
|
||||||
|
except FileNotFoundError as e:
|
||||||
|
print(f"Image file missing: {self}", file=sys.stderr)
|
||||||
|
|
||||||
def try_parse(self):
|
def try_parse_title(self):
|
||||||
for re in self.name_res:
|
for re in self.title_res:
|
||||||
if m := re.match(self.title):
|
if m := re.match(self.title):
|
||||||
self.defn = m.group(1)
|
self.title = ' '.join(m.groups())
|
||||||
if len(m.groups()) == 2:
|
|
||||||
self.desc = m.group(2)
|
|
||||||
return
|
return
|
||||||
print(f"{self.file} Couldn't match title {self.title}")
|
print(f"{self.file} Couldn't match alt text {self.title}", file=sys.stderr)
|
||||||
self.defn = self.title
|
|
||||||
|
|
||||||
|
|
||||||
def process_post(cf, archive, obj):
|
def process_post(cf, archive, obj, debug=False):
|
||||||
|
if debug:
|
||||||
|
print(f"Processing {obj}", file=sys.stderr)
|
||||||
date = datetime.datetime.strptime(obj["published"][:10], "%Y-%m-%d")
|
date = datetime.datetime.strptime(obj["published"][:10], "%Y-%m-%d")
|
||||||
year = f"{date.year}"
|
year = f"{date.year}"
|
||||||
month = f"{date.month:02}"
|
month = f"{date.month:02}"
|
||||||
day = f"{date.day:02}"
|
day = f"{date.day:02}"
|
||||||
|
if "attachment" not in obj or len(obj["attachment"]) < 1:
|
||||||
|
raise ValueError('No attachments on this status')
|
||||||
|
if len(obj["attachment"]) > 1:
|
||||||
|
status_id = obj["id"]
|
||||||
|
n = len(obj["attachment"])
|
||||||
|
print(f"Warning: only one media item copied from post {status_id} which has {n}", file=sys.stderr)
|
||||||
attachment = obj["attachment"][0]
|
attachment = obj["attachment"][0]
|
||||||
url = attachment["url"]
|
url = attachment["url"]
|
||||||
if m:= cf["url_re"].match(url):
|
if m:= cf["url_re"].match(url):
|
||||||
@ -69,7 +82,7 @@ def process_post(cf, archive, obj):
|
|||||||
else:
|
else:
|
||||||
raise ValueError(f"Couldn't match url {url}")
|
raise ValueError(f"Couldn't match url {url}")
|
||||||
alt = attachment["name"]
|
alt = attachment["name"]
|
||||||
return MediaPost(cf["title_res"], year, month, day, file, alt)
|
return MediaPost(year, month, day, file, alt, cf.get("title_res", None))
|
||||||
|
|
||||||
|
|
||||||
def ensure_dir(gmdir):
|
def ensure_dir(gmdir):
|
||||||
@ -89,6 +102,7 @@ def load_config(config):
|
|||||||
with open(config, "r") as cfh:
|
with open(config, "r") as cfh:
|
||||||
cf = json.load(cfh)
|
cf = json.load(cfh)
|
||||||
cf["url_re"] = re.compile(cf["url_re"])
|
cf["url_re"] = re.compile(cf["url_re"])
|
||||||
|
if "title_res" in cf:
|
||||||
cf["title_res"] = [ re.compile(r) for r in cf["title_res"] ]
|
cf["title_res"] = [ re.compile(r) for r in cf["title_res"] ]
|
||||||
return cf
|
return cf
|
||||||
|
|
||||||
@ -105,14 +119,16 @@ def write_gemfile(gmdir, colophon, title, items):
|
|||||||
gfh.write(f"=> {link} {text}\n")
|
gfh.write(f"=> {link} {text}\n")
|
||||||
|
|
||||||
|
|
||||||
def apub2gmi(cf, archive, output, colophon):
|
def apub2gmi(cf, archive, output, colophon, debug=False):
|
||||||
with open(f"{archive}/outbox.json", "r") as fh:
|
with open(f"{archive}/outbox.json", "r") as fh:
|
||||||
js = json.load(fh)
|
js = json.load(fh)
|
||||||
posts = {}
|
posts = {}
|
||||||
for item in js["orderedItems"]:
|
for item in js["orderedItems"]:
|
||||||
if item["type"] == "Create":
|
if item["type"] == "Create":
|
||||||
|
if debug:
|
||||||
|
print(item)
|
||||||
try:
|
try:
|
||||||
post = process_post(cf, archive, item["object"])
|
post = process_post(cf, archive, item["object"], debug)
|
||||||
if not post.year in posts:
|
if not post.year in posts:
|
||||||
posts[post.year] = {}
|
posts[post.year] = {}
|
||||||
if not post.month in posts[post.year]:
|
if not post.month in posts[post.year]:
|
||||||
@ -166,7 +182,11 @@ if __name__ == "__main__":
|
|||||||
'-t', '--text', required=False, type=str,
|
'-t', '--text', required=False, type=str,
|
||||||
help="File with text to be included at the top of each index page"
|
help="File with text to be included at the top of each index page"
|
||||||
)
|
)
|
||||||
|
ap.add_argument(
|
||||||
|
'-d', '--debug', action="store_true", default=False,
|
||||||
|
help="Print debug output"
|
||||||
|
)
|
||||||
args = ap.parse_args()
|
args = ap.parse_args()
|
||||||
cf = load_config(args.config)
|
cf = load_config(args.config)
|
||||||
colophon = load_colophon(args.text)
|
colophon = load_colophon(args.text)
|
||||||
apub2gmi(cf, args.archive, args.output, colophon)
|
apub2gmi(cf, args.archive, args.output, colophon, args.debug)
|
||||||
|
Loading…
x
Reference in New Issue
Block a user