tdp.py->stats.py

master
Michael F. Lamb 2015-10-20 15:50:23 -04:00
parent 470bef5f05
commit 65e1ec1a5d
2 changed files with 179 additions and 272 deletions

View File

@ -1,204 +0,0 @@
#!/usr/local/bin/python3
# tdp.py - tilde data in tilde data protocol format.
# Copyright 2015 Michael F. Lamb <http://datagrok.org>
# License: GPLv3+
"""
Outputs JSON data conforming to "~dp (Tilde Description Protocol)" as defined
at: http://protocol.club/~datagrok/beta-wiki/tdp.html
It is a JSON structure of the form:
{
'name': (string) the name of the server.
'url': (string) the URL of the server.
'signup_url': (string) the URL of a page describing the process required to request an account on the server.
'user_count': (number) the number of users currently registered on the server.
'want_users': (boolean) whether the server is currently accepting new user requests.
'admin_email': (string) the email address of the primary server administrator.
'description': (string) a free-form description for the server.
'users': [ (array) an array of users on the server.
{
'username': (string) the username of the user.
'title': (string) the HTML title of the users index.html page.
'mtime': (number) a timestamp representing the last time the users index.html was modified.
},
...
]
}
We also overload this with the preexisting data format we were using in
/var/local/tildetown/tildetown-py/stats.py, which is of the form:
{
'all_users': [ (array) of users on the server.
{
'username': (string) the username of the user.
'default': (boolean) Is the user still using their unmodified default index.html?
'favicon': (string) a url to an image representing the user
},
...
]
'num_users': (number) count of all_users
'live_users': [ (array) an array of live users, same format as all_users. Users may appear in both arrays.
...
],
'num_live_users': (number) count of live users
'active_user_count': (number) count of currently logged in users
'generated_at': (string) the time this JSON was generated in '%Y-%m-%d %H:%M:%S' format.
'generated_at_msec': (number) the time this JSON was generated, in milliseconds since the epoch.
'site_name': (same as 'name' above)
'site_url': (same as 'url' above)
'uptime': (string) output of `uptime -p`
}
Usage: tdp.py > /var/www/html/tilde.json
"""
# I suppose I could import /var/local/tildetown/tildetown-py/stats.py which
# does much of the same work, but I wanted to try to make one that needs no
# venv nor 'sh' module. (Success.) Bonus: this runs in 0.127s, vs 5.2s
# for 'stats'
# FIXME: unlike stats.py, we calculate last modified only on index.html.
# FIXME: we output quite a bit of redundant data. I think we should lose
# 'live_users' and do that filtering on the client side.
# FIXME: If we're the only consumer of the stats.py data, let's change the
# client side to use 'users' and drop 'all_users'.
import datetime
import hashlib
import json
import os
import pwd
import re
import struct
import subprocess
title_re = re.compile(r'<title[^>]*>(.*)</title>', re.DOTALL)
defaultindex_hash = None
# modified from https://gist.github.com/likexian/f9da722585036d372dca
XTMP_STRUCT_FMT = 'hi32s4s32s256shhiii4i20x'
XTMP_STRUCT_SIZE = struct.calcsize(XTMP_STRUCT_FMT)
XTMP_STRUCT_KEYS = [
'type', 'pid', 'line', 'id', 'user', 'host', 'e_termination', 'e_exit',
'session', 'sec', 'usec', 'addr_v6', 'unused',
]
def read_xtmp(filename):
"""Pure-python replacement for who(1) and w(1); parses the data structure
in /var/run/utmp or /var/run/wtmp, generating a dict for each entry. See
man 5 utmp for meaning of fields.
"""
# This was fun but probably not worth the trouble, since we end up having
# to use subprocess.check_output() elsewhere anyway.
with open(filename, 'rb') as fp:
for entry in iter((lambda: fp.read(XTMP_STRUCT_SIZE)), b''):
yield dict(zip(
XTMP_STRUCT_KEYS,
(i.decode('UTF-8').partition('\x00')[0] if hasattr(i, 'partition') else i
for i in struct.unpack(XTMP_STRUCT_FMT, entry))))
def active_user_count():
"""Return the count of unique usernames logged in."""
return len(set(r['user'] for r in read_xtmp('/var/run/utmp') if r['type'] == 7))
def md5sum(filename):
"""Return the md5 hash of the contents of filename as a hexidecimal string."""
# This doesn't slurp the whole file in; it reads 4k at a time.
h = hashlib.md5()
with open(filename, 'rb') as fp:
for data in iter((lambda: fp.read(4096)), b''):
h.update(data)
return h.hexdigest()
def get_title(indexhtml):
"""Given an html file, return the content of its <title>"""
print(indexhtml)
fp = open(indexhtml, 'rt', errors='ignore')
title = title_re.search(fp.read())
if title:
return title.group(1)
def get_users():
"""Generate tuples of the form (username, homedir) for all normal
users on this system.
"""
return ((p.pw_name, p.pw_dir) for p in pwd.getpwall() if
p.pw_uid >= 1000 and
p.pw_shell != '/bin/false' and
p.pw_name not in ['nobody', 'ubuntu', 'poetry'])
def tdp_user(username, homedir):
"""Given a unix username, and their home directory, return a TDP format
dict with information about that user.
"""
indexhtml = os.path.join(homedir, 'public_html', 'index.html')
return {
'username': username,
'title': get_title(indexhtml),
'mtime': int(os.path.getmtime(indexhtml) * 1000),
# tilde.town extensions and backward compatibility
# FIXME: just shelling out to diff -q might be way faster than all
# these hashes.
'default': md5sum(indexhtml) == defaultindex_hash,
'favicon': 'TODO',
}
def tdp():
now = datetime.datetime.now()
users = [tdp_user(username, homedir) for username, homedir in get_users()]
# TDP format data
data = {
'name': 'tilde.town',
'url': 'http://tilde.town',
'signup_url': 'http://goo.gl/forms/8IvQFTDjlo',
'want_users': True,
'admin_email': 'nks@lambdaphil.es',
'description': " ".join(l.strip() for l in """
an intentional digital community for creating and sharing works of
art, educating peers, and technological anachronism. we are a
completely non-commercial, donation supported, and committed to
rejecting false technological progress in favor of empathy and
sustainable computing.
""".splitlines()),
'user_count': len(users),
'users': users,
}
# tilde.town extensions and backward compatibility
data.update({
'active_user_count': active_user_count(),
'generated_at': now.strftime('%Y-%m-%d %H:%M:%S'),
'generated_at_msec': int(now.timestamp() * 1000),
'uptime': subprocess.check_output(['uptime', '-p'], universal_newlines=True),
})
# redundant entries we should drop after changing homepage template
data.update({
'all_users': data['users'],
'num_users': data['user_count'],
'live_users': [u for u in data['users'] if not u['default']],
'site_name': data['name'],
'site_url': data['url'],
})
data.update({
'num_live_users': len(data['live_users']),
})
return data
def main():
global defaultindex_hash
defaultindex_hash = md5sum("/etc/skel/public_html/index.html")
print(json.dumps(tdp(), sort_keys=True, indent=2))
if __name__ == '__main__':
raise SystemExit(main())

247
tildetown/stats.py 100644 → 100755
View File

@ -1,87 +1,198 @@
#!/usr/local/bin/python3
# tdp.py - tilde data in tilde data protocol format.
# Copyright 2015 Michael F. Lamb <http://datagrok.org>
# License: GPLv3+
"""
Outputs JSON data conforming to "~dp (Tilde Description Protocol)" as defined
at: http://protocol.club/~datagrok/beta-wiki/tdp.html
It is a JSON structure of the form:
{
'name': (string) the name of the server.
'url': (string) the URL of the server.
'signup_url': (string) the URL of a page describing the process required to request an account on the server.
'user_count': (number) the number of users currently registered on the server.
'want_users': (boolean) whether the server is currently accepting new user requests.
'admin_email': (string) the email address of the primary server administrator.
'description': (string) a free-form description for the server.
'users': [ (array) an array of users on the server.
{
'username': (string) the username of the user.
'title': (string) the HTML title of the users index.html page.
'mtime': (number) a timestamp representing the last time the users index.html was modified.
},
...
]
}
We also overload this with the preexisting data format we were using in
/var/local/tildetown/tildetown-py/stats.py, which is of the form:
{
'all_users': [ (array) of users on the server.
{
'username': (string) the username of the user.
'default': (boolean) Is the user still using their unmodified default index.html?
'favicon': (string) a url to an image representing the user
},
...
]
'num_users': (number) count of all_users
'live_users': [ (array) an array of live users, same format as all_users. Users may appear in both arrays.
...
],
'num_live_users': (number) count of live users
'active_user_count': (number) count of currently logged in users
'generated_at': (string) the time this JSON was generated in '%Y-%m-%d %H:%M:%S' format.
'generated_at_msec': (number) the time this JSON was generated, in milliseconds since the epoch.
'site_name': (same as 'name' above)
'site_url': (same as 'url' above)
'uptime': (string) output of `uptime -p`
}
Usage: tdp.py > /var/www/html/tilde.json
"""
# I suppose I could import /var/local/tildetown/tildetown-py/stats.py which
# does much of the same work, but I wanted to try to make one that needs no
# venv nor 'sh' module. (Success.) Bonus: this runs in 0.127s, vs 5.2s
# for 'stats'
# FIXME: we output quite a bit of redundant data. I think we should lose
# 'live_users' and do that filtering on the client side.
# FIXME: If we're the only consumer of the stats.py data, let's change the
# client side to use 'users' and drop 'all_users'.
import datetime
import hashlib
import json
from functools import partial
from os import listdir
from os.path import getmtime, join
from datetime import datetime
from sh import find, uptime, who, sort, wc, cut
from tildetown.util import slurp, thread, p
import os
import pwd
import re
import struct
import subprocess
# this script emits json on standard out that has information about tilde.town
# users. It denotes who has not updated their page from the default. It also
# reports the time this script was run. The user list is sorted by public_html update time.
SYSTEM_USERS = ['wiki', 'root', 'ubuntu', 'nate']
SYSTEM_USERS = ['wiki', 'root', 'ubuntu', 'nate', 'nobody']
DEFAULT_HTML_FILENAME = "/etc/skel/public_html/index.html"
title_re = re.compile(r'<title[^>]*>(.*)</title>', re.DOTALL)
username_to_html_path = lambda u: "/home/{}/public_html".format(u)
def active_user_count():
"""Return the count of unique usernames logged in."""
return len(set(line.split()[0] for line in
subprocess.check_output(
["who"], universal_newlines=True).splitlines()))
def default_p(username):
return subprocess.call(
['diff', '-q', DEFAULT_HTML_FILENAME, user_html_filename],
stdout=subprocess.DEVNULL) == 0
def get_title(indexhtml):
"""Given an html file, return the content of its <title>"""
with open(indexhtml, 'rt', errors='ignore') as fp:
title = title_re.search(fp.read())
if title:
return title.group(1)
def bounded_find(path):
# find might return 1 but still have worked fine.
return find(path, "-maxdepth", "3", _ok_code=[0,1])
def get_users():
"""Generate tuples of the form (username, homedir) for all normal
users on this system.
def get_active_user_count():
return int(wc(sort(cut(who(), "-d", " ", "-f1"), "-u"), "-l"))
"""
return ((p.pw_name, p.pw_dir) for p in pwd.getpwall() if
p.pw_uid >= 1000 and
p.pw_shell != '/bin/false' and
p.pw_name not in SYSTEM_USERS)
def guarded_mtime(path):
try:
return getmtime(path.rstrip())
except Exception as _:
return 0
def most_recent_within(path):
"""Return the most recent timestamp among all files within path, 3
levels deep.
"""
return max(modified_times(path), maxdepth=3)
def modify_time(username):
files_to_mtimes = partial(map, guarded_mtime)
return thread(username,
username_to_html_path,
bounded_find,
files_to_mtimes,
list,
max)
def modified_times(path, maxdepth=None):
"""Walk the directories in path, generating timestamps for all
files.
"""
for root, dirs, files in os.walk(path):
if maxdepth and len(root[len(path):].split(os.sep)) == maxdepth:
dirs.clear()
for f in files:
try:
yield os.path.getmtime(os.path.join(root, f))
except FileNotFoundError:
pass
def sort_user_list(usernames):
return sorted(usernames, key=modify_time)
def tdp_user(username, homedir):
"""Given a unix username, and their home directory, return a TDP format
dict with information about that user.
def user_generator():
ignore_system_users = lambda un: un not in SYSTEM_USERS
return filter(ignore_system_users, listdir("/home"))
"""
public_html = os.path.join(homedir, 'public_html')
index_html = os.path.join(public_html, 'index.html')
if os.path.exists(index_html):
return {
'username': username,
'title': get_title(index_html),
'mtime': int(most_recent_within(public_html) * 1000),
# tilde.town extensions and backward compatibility
# FIXME: just shelling out to diff -q might be way faster than all
# these hashes.
'favicon': 'TODO',
'default': subprocess.call(
['diff', '-q', DEFAULT_HTML_FILENAME, index_html],
stdout=subprocess.DEVNULL) == 0,
}
else:
return {
'username': username,
'default': False
}
def get_user_data():
username_to_data = lambda u: {'username': u,
'default': default_p(u),
'favicon':'TODO'}
live_p = lambda user: not user['default']
def tdp():
now = datetime.datetime.now()
users = [tdp_user(u, h) for u, h in get_users()]
all_users = thread(user_generator(),
sort_user_list,
reversed,
partial(map, username_to_data),
list)
# TDP format data
data = {
'name': 'tilde.town',
'url': 'http://tilde.town',
'signup_url': 'http://goo.gl/forms/8IvQFTDjlo',
'want_users': True,
'admin_email': 'nks@lambdaphil.es',
'description': " ".join(l.strip() for l in """
an intentional digital community for creating and sharing works of
art, educating peers, and technological anachronism. we are a
completely non-commercial, donation supported, and committed to
rejecting false technological progress in favor of empathy and
sustainable computing.
""".splitlines()),
'user_count': len(users),
'users': users,
}
live_users = list(filter(live_p, all_users))
# tilde.town extensions and backward compatibility
data.update({
'active_user_count': active_user_count(),
'generated_at': now.strftime('%Y-%m-%d %H:%M:%S'),
'generated_at_msec': int(now.timestamp() * 1000),
'uptime': subprocess.check_output(['uptime', '-p'], universal_newlines=True),
})
# redundant entries we should drop after changing homepage template
data.update({
'all_users': data['users'],
'num_users': data['user_count'],
'live_users': [u for u in data['users'] if not u['default']],
'site_name': data['name'],
'site_url': data['url'],
})
data.update({
'num_live_users': len(data['live_users']),
})
active_user_count = get_active_user_count()
return {'all_users': all_users,
'num_users': len(all_users),
'num_live_users': len(live_users),
'active_user_count': active_user_count,
'live_users': live_users,}
def get_data():
user_data = get_user_data()
data = {'generated_at': datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
'site_name': 'tilde.town',
'site_url': 'http://tilde.town',
'uptime': str(uptime('-p')).rstrip(),}
data.update(user_data)
return data
def main():
print(json.dumps(tdp(), sort_keys=True, indent=2))
if __name__ == '__main__':
print(json.dumps(get_data()))
raise SystemExit(main())