From 65e1ec1a5d951df07d18a0c3b3563c4fcd01d628 Mon Sep 17 00:00:00 2001 From: "Michael F. Lamb" Date: Tue, 20 Oct 2015 15:50:23 -0400 Subject: [PATCH] tdp.py->stats.py --- scripts/tdp.py | 204 ------------------------------------- tildetown/stats.py | 247 ++++++++++++++++++++++++++++++++------------- 2 files changed, 179 insertions(+), 272 deletions(-) delete mode 100755 scripts/tdp.py mode change 100644 => 100755 tildetown/stats.py diff --git a/scripts/tdp.py b/scripts/tdp.py deleted file mode 100755 index b3b3d18..0000000 --- a/scripts/tdp.py +++ /dev/null @@ -1,204 +0,0 @@ -#!/usr/local/bin/python3 - -# tdp.py - tilde data in tilde data protocol format. -# Copyright 2015 Michael F. Lamb -# License: GPLv3+ - -""" -Outputs JSON data conforming to "~dp (Tilde Description Protocol)" as defined -at: http://protocol.club/~datagrok/beta-wiki/tdp.html - -It is a JSON structure of the form: - -{ - 'name': (string) the name of the server. - 'url': (string) the URL of the server. - 'signup_url': (string) the URL of a page describing the process required to request an account on the server. - 'user_count': (number) the number of users currently registered on the server. - 'want_users': (boolean) whether the server is currently accepting new user requests. - 'admin_email': (string) the email address of the primary server administrator. - 'description': (string) a free-form description for the server. - 'users': [ (array) an array of users on the server. - { - 'username': (string) the username of the user. - 'title': (string) the HTML title of the user’s index.html page. - 'mtime': (number) a timestamp representing the last time the user’s index.html was modified. - }, - ... - ] - } - -We also overload this with the preexisting data format we were using in -/var/local/tildetown/tildetown-py/stats.py, which is of the form: - -{ - 'all_users': [ (array) of users on the server. - { - 'username': (string) the username of the user. - 'default': (boolean) Is the user still using their unmodified default index.html? - 'favicon': (string) a url to an image representing the user - }, - ... - ] - 'num_users': (number) count of all_users - 'live_users': [ (array) an array of live users, same format as all_users. Users may appear in both arrays. - ... - ], - 'num_live_users': (number) count of live users - 'active_user_count': (number) count of currently logged in users - 'generated_at': (string) the time this JSON was generated in '%Y-%m-%d %H:%M:%S' format. - 'generated_at_msec': (number) the time this JSON was generated, in milliseconds since the epoch. - 'site_name': (same as 'name' above) - 'site_url': (same as 'url' above) - 'uptime': (string) output of `uptime -p` - -} -Usage: tdp.py > /var/www/html/tilde.json -""" - -# I suppose I could import /var/local/tildetown/tildetown-py/stats.py which -# does much of the same work, but I wanted to try to make one that needs no -# venv nor 'sh' module. (Success.) Bonus: this runs in 0.127s, vs 5.2s -# for 'stats' - -# FIXME: unlike stats.py, we calculate last modified only on index.html. - -# FIXME: we output quite a bit of redundant data. I think we should lose -# 'live_users' and do that filtering on the client side. - -# FIXME: If we're the only consumer of the stats.py data, let's change the -# client side to use 'users' and drop 'all_users'. - -import datetime -import hashlib -import json -import os -import pwd -import re -import struct -import subprocess - -title_re = re.compile(r']*>(.*)', re.DOTALL) -defaultindex_hash = None - -# modified from https://gist.github.com/likexian/f9da722585036d372dca -XTMP_STRUCT_FMT = 'hi32s4s32s256shhiii4i20x' -XTMP_STRUCT_SIZE = struct.calcsize(XTMP_STRUCT_FMT) -XTMP_STRUCT_KEYS = [ - 'type', 'pid', 'line', 'id', 'user', 'host', 'e_termination', 'e_exit', - 'session', 'sec', 'usec', 'addr_v6', 'unused', - ] - -def read_xtmp(filename): - """Pure-python replacement for who(1) and w(1); parses the data structure - in /var/run/utmp or /var/run/wtmp, generating a dict for each entry. See - man 5 utmp for meaning of fields. - - """ - # This was fun but probably not worth the trouble, since we end up having - # to use subprocess.check_output() elsewhere anyway. - with open(filename, 'rb') as fp: - for entry in iter((lambda: fp.read(XTMP_STRUCT_SIZE)), b''): - yield dict(zip( - XTMP_STRUCT_KEYS, - (i.decode('UTF-8').partition('\x00')[0] if hasattr(i, 'partition') else i - for i in struct.unpack(XTMP_STRUCT_FMT, entry)))) - -def active_user_count(): - """Return the count of unique usernames logged in.""" - return len(set(r['user'] for r in read_xtmp('/var/run/utmp') if r['type'] == 7)) - -def md5sum(filename): - """Return the md5 hash of the contents of filename as a hexidecimal string.""" - # This doesn't slurp the whole file in; it reads 4k at a time. - h = hashlib.md5() - with open(filename, 'rb') as fp: - for data in iter((lambda: fp.read(4096)), b''): - h.update(data) - return h.hexdigest() - -def get_title(indexhtml): - """Given an html file, return the content of its """ - print(indexhtml) - fp = open(indexhtml, 'rt', errors='ignore') - title = title_re.search(fp.read()) - if title: - return title.group(1) - -def get_users(): - """Generate tuples of the form (username, homedir) for all normal - users on this system. - - """ - return ((p.pw_name, p.pw_dir) for p in pwd.getpwall() if - p.pw_uid >= 1000 and - p.pw_shell != '/bin/false' and - p.pw_name not in ['nobody', 'ubuntu', 'poetry']) - -def tdp_user(username, homedir): - """Given a unix username, and their home directory, return a TDP format - dict with information about that user. - - """ - indexhtml = os.path.join(homedir, 'public_html', 'index.html') - return { - 'username': username, - 'title': get_title(indexhtml), - 'mtime': int(os.path.getmtime(indexhtml) * 1000), - # tilde.town extensions and backward compatibility - # FIXME: just shelling out to diff -q might be way faster than all - # these hashes. - 'default': md5sum(indexhtml) == defaultindex_hash, - 'favicon': 'TODO', - } - -def tdp(): - now = datetime.datetime.now() - users = [tdp_user(username, homedir) for username, homedir in get_users()] - - # TDP format data - data = { - 'name': 'tilde.town', - 'url': 'http://tilde.town', - 'signup_url': 'http://goo.gl/forms/8IvQFTDjlo', - 'want_users': True, - 'admin_email': 'nks@lambdaphil.es', - 'description': " ".join(l.strip() for l in """ - an intentional digital community for creating and sharing works of - art, educating peers, and technological anachronism. we are a - completely non-commercial, donation supported, and committed to - rejecting false technological progress in favor of empathy and - sustainable computing. - """.splitlines()), - 'user_count': len(users), - 'users': users, - } - - # tilde.town extensions and backward compatibility - data.update({ - 'active_user_count': active_user_count(), - 'generated_at': now.strftime('%Y-%m-%d %H:%M:%S'), - 'generated_at_msec': int(now.timestamp() * 1000), - 'uptime': subprocess.check_output(['uptime', '-p'], universal_newlines=True), - }) - # redundant entries we should drop after changing homepage template - data.update({ - 'all_users': data['users'], - 'num_users': data['user_count'], - 'live_users': [u for u in data['users'] if not u['default']], - 'site_name': data['name'], - 'site_url': data['url'], - }) - data.update({ - 'num_live_users': len(data['live_users']), - }) - - return data - -def main(): - global defaultindex_hash - defaultindex_hash = md5sum("/etc/skel/public_html/index.html") - print(json.dumps(tdp(), sort_keys=True, indent=2)) - -if __name__ == '__main__': - raise SystemExit(main()) diff --git a/tildetown/stats.py b/tildetown/stats.py old mode 100644 new mode 100755 index d8a8ec6..766e2f9 --- a/tildetown/stats.py +++ b/tildetown/stats.py @@ -1,87 +1,198 @@ +#!/usr/local/bin/python3 + +# tdp.py - tilde data in tilde data protocol format. +# Copyright 2015 Michael F. Lamb <http://datagrok.org> +# License: GPLv3+ + +""" +Outputs JSON data conforming to "~dp (Tilde Description Protocol)" as defined +at: http://protocol.club/~datagrok/beta-wiki/tdp.html + +It is a JSON structure of the form: + +{ + 'name': (string) the name of the server. + 'url': (string) the URL of the server. + 'signup_url': (string) the URL of a page describing the process required to request an account on the server. + 'user_count': (number) the number of users currently registered on the server. + 'want_users': (boolean) whether the server is currently accepting new user requests. + 'admin_email': (string) the email address of the primary server administrator. + 'description': (string) a free-form description for the server. + 'users': [ (array) an array of users on the server. + { + 'username': (string) the username of the user. + 'title': (string) the HTML title of the user’s index.html page. + 'mtime': (number) a timestamp representing the last time the user’s index.html was modified. + }, + ... + ] + } + +We also overload this with the preexisting data format we were using in +/var/local/tildetown/tildetown-py/stats.py, which is of the form: + +{ + 'all_users': [ (array) of users on the server. + { + 'username': (string) the username of the user. + 'default': (boolean) Is the user still using their unmodified default index.html? + 'favicon': (string) a url to an image representing the user + }, + ... + ] + 'num_users': (number) count of all_users + 'live_users': [ (array) an array of live users, same format as all_users. Users may appear in both arrays. + ... + ], + 'num_live_users': (number) count of live users + 'active_user_count': (number) count of currently logged in users + 'generated_at': (string) the time this JSON was generated in '%Y-%m-%d %H:%M:%S' format. + 'generated_at_msec': (number) the time this JSON was generated, in milliseconds since the epoch. + 'site_name': (same as 'name' above) + 'site_url': (same as 'url' above) + 'uptime': (string) output of `uptime -p` + +} +Usage: tdp.py > /var/www/html/tilde.json +""" + +# I suppose I could import /var/local/tildetown/tildetown-py/stats.py which +# does much of the same work, but I wanted to try to make one that needs no +# venv nor 'sh' module. (Success.) Bonus: this runs in 0.127s, vs 5.2s +# for 'stats' + +# FIXME: we output quite a bit of redundant data. I think we should lose +# 'live_users' and do that filtering on the client side. + +# FIXME: If we're the only consumer of the stats.py data, let's change the +# client side to use 'users' and drop 'all_users'. + +import datetime +import hashlib import json -from functools import partial -from os import listdir -from os.path import getmtime, join -from datetime import datetime -from sh import find, uptime, who, sort, wc, cut -from tildetown.util import slurp, thread, p +import os +import pwd +import re +import struct import subprocess -# this script emits json on standard out that has information about tilde.town -# users. It denotes who has not updated their page from the default. It also -# reports the time this script was run. The user list is sorted by public_html update time. - -SYSTEM_USERS = ['wiki', 'root', 'ubuntu', 'nate'] - +SYSTEM_USERS = ['wiki', 'root', 'ubuntu', 'nate', 'nobody'] DEFAULT_HTML_FILENAME = "/etc/skel/public_html/index.html" +title_re = re.compile(r'<title[^>]*>(.*)', re.DOTALL) -username_to_html_path = lambda u: "/home/{}/public_html".format(u) +def active_user_count(): + """Return the count of unique usernames logged in.""" + return len(set(line.split()[0] for line in + subprocess.check_output( + ["who"], universal_newlines=True).splitlines())) -def default_p(username): - return subprocess.call( - ['diff', '-q', DEFAULT_HTML_FILENAME, user_html_filename], - stdout=subprocess.DEVNULL) == 0 +def get_title(indexhtml): + """Given an html file, return the content of its """ + with open(indexhtml, 'rt', errors='ignore') as fp: + title = title_re.search(fp.read()) + if title: + return title.group(1) -def bounded_find(path): - # find might return 1 but still have worked fine. - return find(path, "-maxdepth", "3", _ok_code=[0,1]) +def get_users(): + """Generate tuples of the form (username, homedir) for all normal + users on this system. -def get_active_user_count(): - return int(wc(sort(cut(who(), "-d", " ", "-f1"), "-u"), "-l")) + """ + return ((p.pw_name, p.pw_dir) for p in pwd.getpwall() if + p.pw_uid >= 1000 and + p.pw_shell != '/bin/false' and + p.pw_name not in SYSTEM_USERS) -def guarded_mtime(path): - try: - return getmtime(path.rstrip()) - except Exception as _: - return 0 +def most_recent_within(path): + """Return the most recent timestamp among all files within path, 3 + levels deep. + """ + return max(modified_times(path), maxdepth=3) -def modify_time(username): - files_to_mtimes = partial(map, guarded_mtime) - return thread(username, - username_to_html_path, - bounded_find, - files_to_mtimes, - list, - max) +def modified_times(path, maxdepth=None): + """Walk the directories in path, generating timestamps for all + files. + """ + for root, dirs, files in os.walk(path): + if maxdepth and len(root[len(path):].split(os.sep)) == maxdepth: + dirs.clear() + for f in files: + try: + yield os.path.getmtime(os.path.join(root, f)) + except FileNotFoundError: + pass -def sort_user_list(usernames): - return sorted(usernames, key=modify_time) +def tdp_user(username, homedir): + """Given a unix username, and their home directory, return a TDP format + dict with information about that user. -def user_generator(): - ignore_system_users = lambda un: un not in SYSTEM_USERS - return filter(ignore_system_users, listdir("/home")) + """ + public_html = os.path.join(homedir, 'public_html') + index_html = os.path.join(public_html, 'index.html') + if os.path.exists(index_html): + return { + 'username': username, + 'title': get_title(index_html), + 'mtime': int(most_recent_within(public_html) * 1000), + # tilde.town extensions and backward compatibility + # FIXME: just shelling out to diff -q might be way faster than all + # these hashes. + 'favicon': 'TODO', + 'default': subprocess.call( + ['diff', '-q', DEFAULT_HTML_FILENAME, index_html], + stdout=subprocess.DEVNULL) == 0, + } + else: + return { + 'username': username, + 'default': False + } -def get_user_data(): - username_to_data = lambda u: {'username': u, - 'default': default_p(u), - 'favicon':'TODO'} - live_p = lambda user: not user['default'] +def tdp(): + now = datetime.datetime.now() + users = [tdp_user(u, h) for u, h in get_users()] - all_users = thread(user_generator(), - sort_user_list, - reversed, - partial(map, username_to_data), - list) + # TDP format data + data = { + 'name': 'tilde.town', + 'url': 'http://tilde.town', + 'signup_url': 'http://goo.gl/forms/8IvQFTDjlo', + 'want_users': True, + 'admin_email': 'nks@lambdaphil.es', + 'description': " ".join(l.strip() for l in """ + an intentional digital community for creating and sharing works of + art, educating peers, and technological anachronism. we are a + completely non-commercial, donation supported, and committed to + rejecting false technological progress in favor of empathy and + sustainable computing. + """.splitlines()), + 'user_count': len(users), + 'users': users, + } - live_users = list(filter(live_p, all_users)) + # tilde.town extensions and backward compatibility + data.update({ + 'active_user_count': active_user_count(), + 'generated_at': now.strftime('%Y-%m-%d %H:%M:%S'), + 'generated_at_msec': int(now.timestamp() * 1000), + 'uptime': subprocess.check_output(['uptime', '-p'], universal_newlines=True), + }) + # redundant entries we should drop after changing homepage template + data.update({ + 'all_users': data['users'], + 'num_users': data['user_count'], + 'live_users': [u for u in data['users'] if not u['default']], + 'site_name': data['name'], + 'site_url': data['url'], + }) + data.update({ + 'num_live_users': len(data['live_users']), + }) - active_user_count = get_active_user_count() - - return {'all_users': all_users, - 'num_users': len(all_users), - 'num_live_users': len(live_users), - 'active_user_count': active_user_count, - 'live_users': live_users,} - -def get_data(): - user_data = get_user_data() - data = {'generated_at': datetime.now().strftime('%Y-%m-%d %H:%M:%S'), - 'site_name': 'tilde.town', - 'site_url': 'http://tilde.town', - 'uptime': str(uptime('-p')).rstrip(),} - - data.update(user_data) return data +def main(): + print(json.dumps(tdp(), sort_keys=True, indent=2)) + if __name__ == '__main__': - print(json.dumps(get_data())) + raise SystemExit(main())