From d2d599318fcdec2ac36d57b07c831d7d0d6790df Mon Sep 17 00:00:00 2001
From: magical <magical@tilde.town>
Date: Wed, 10 Aug 2022 03:24:07 +0000
Subject: [PATCH] thread grouping, more or less

---
 clients/nntp_client.py | 131 ++++++++++++++++++++++++++++++-----------
 1 file changed, 96 insertions(+), 35 deletions(-)

diff --git a/clients/nntp_client.py b/clients/nntp_client.py
index aeb6845..7e8e54c 100644
--- a/clients/nntp_client.py
+++ b/clients/nntp_client.py
@@ -7,9 +7,12 @@ import nntplib
 import time
 import json
 import ssl
+import re
 
 import os
 
+__all__ = ('BBJNews','URLError')
+
 class BBJNews(object):
     # this module isnt exactly complete. The below description claims
     # `all of its endpoints are mapped to native methods` though this
@@ -506,35 +509,8 @@ class BBJNews(object):
         #       :bytes - the number of bytes in the article
         #       :lines - the number of lines in the body (deprecated)
 
-        if False:
-            # build up a map of message references
-            # we use a disjoint-set data structure
-            # to find the root of each message
-            threadmap = {}
-            rank = {}
-            for num, ov in overviews:
-                msgid = nntplib.decode_header(ov['message-id'])
-                # RFC5536 suggests that whitespace should not occur inside
-                # a message id, which (if true) makes it pretty easy to split
-                # the list of message ids in the references header
-                refs = nntplib.decode_header(ov['references']).split()
-                for r in refs:
-                    threadmap[msgid] = r
-                    rank[msgid] = 1
-                    # TODO
-        else:
-            # make every message its own thread, for prototyping purposes
-            #t = {
-            #    'title': str,
-            #    'reply_count': int, # does this include the OP?
-            #    'pinned': bool,
-            #    'thread_id': uuid
-            #    'author': user_uuid,
-            #    'created': time,
-            #    'last_mod': time,
-            #    'last_author': user_uuid,
-            #}
-            threads = _overview_to_threads(overviews)
+        # see also: https://www.jwz.org/doc/threading.html
+        threads = _overviews_to_threads_fancy(overviews)
 
         # make usermap
         usermap = {}
@@ -545,12 +521,13 @@ class BBJNews(object):
             addr = _parse_single_address(userid)
             usermap[userid] = {
                 'user_id': userid,
-                'user_name': addr.name,
+                'user_name': addr.name or addr.user,
                 'address': addr.address,
                 'color': colorhash(userid),
                 'is_admin': False, # TODO: LIST MODERATORS?
             }
 
+        threads.sort(key=lambda x: x['last_mod'], reverse=True)
         return threads, usermap
 
 
@@ -565,7 +542,8 @@ class BBJNews(object):
               print(usermap[author_id]["user_name"])
               print(message["body"])
         """
-        return {}, {}
+        m = self.fake_message('oops...')
+        return {"title":"", "messages":[m], "author":m['author']}, {m['author']: self.user}
 
         response = self("thread_load",
             format=format, thread_id=thread_id, op_only=op_only)
@@ -615,14 +593,14 @@ class BBJNews(object):
         }
 
 
-    # unused
     def format_message(self, body, format="sequential"):
         """
         Send `body` to the server to be formatted according to `format`,
         defaulting to the sequential parser. Returns the body object.
         """
-        response = self("format_message", body=body, format=format)
-        return response["data"]
+        return [[(None, body)]]
+        #response = self("format_message", body=body, format=format)
+        #return response["data"]
 
     # unsupported
     def message_delete(self, thread_id, post_id):
@@ -747,8 +725,78 @@ class BBJNews(object):
             "messages": response["data"]["messages"]
         }
 
+def _overviews_to_threads_fancy(overviews):
+    """Group (article number, overview) pairs into a list of thread dicts.
+
+    Messages linked through their References headers are merged with a
+    disjoint-set map; each set becomes one thread. Messages and threads whose
+    required headers are missing or unparsable are skipped, not raised.
+    """
+    threadmap = {}  # message id -> parent id; a root maps to itself
+    def find(msgid):
+        # iterative: a long reference chain must not hit the recursion limit
+        root = msgid
+        while threadmap.setdefault(root, root) != root:
+            root = threadmap[root]
+        while threadmap[msgid] != root:  # path compression
+            threadmap[msgid], msgid = root, threadmap[msgid]
+        return root
+
+    by_id = {}
+    for num, ov in overviews:
+        try:
+            msgid = nntplib.decode_header(ov['message-id']).strip()
+            refs = _parse_message_ids(nntplib.decode_header(ov.get('references', '')))
+        except (ValueError, KeyError):
+            continue
+        by_id[msgid] = (num, msgid, ov)
+        for r in refs:
+            threadmap[find(msgid)] = find(r)
+
+    thread_messages = {}
+    for msgid, entry in by_id.items():
+        thread_messages.setdefault(find(msgid), []).append(entry)
+
+    threads = []
+    for members in thread_messages.values():
+        members.sort(key=lambda x: x[0])
+        first = members[0][2]
+        last = members[-1][2]
+        try:
+            d = nntplib.decode_header(first['date'])
+            d = email.utils.mktime_tz(email.utils.parsedate_tz(d))
+            d2 = nntplib.decode_header(last['date'])
+            d2 = email.utils.mktime_tz(email.utils.parsedate_tz(d2))
+            t = {
+                'pinned': False,
+                'title': nntplib.decode_header(first['subject']),
+                'reply_count': len(members),
+                'thread_id': nntplib.decode_header(first['message-id']),
+                'author': nntplib.decode_header(first['from']),
+                'created': d,
+                'last_author': nntplib.decode_header(last['from']),
+                'last_mod': d2,
+            }
+        except (ValueError, KeyError, IndexError, TypeError):
+            # parsedate_tz() gives None for a bad date -> TypeError in mktime_tz
+            continue
+        threads.append(t)
+
+    return threads
+
 
 def _overview_to_threads(overviews):
+    # make every message its own thread, for prototyping purposes
+    #t = {
+    #    'title': str,
+    #    'reply_count': int, # does this include the OP?
+    #    'pinned': bool,
+    #    'thread_id': uuid
+    #    'author': user_uuid,
+    #    'created': time,
+    #    'last_mod': time,
+    #    'last_author': user_uuid,
+    #}
     threads = []
     for num, ov in overviews:
         try:
@@ -782,7 +830,20 @@ def _test_overview_to_threads():
         print(t)
 
 
-Address = namedtuple('Address', 'name, address')
+_atext = r"[a-zA-Z0-9!#$%&'\*\+\-/=?^_`{|}~]" # RFC 5322 §3.2.3
+_dotatext = r"%s+(?:\.%s+)*" % (_atext, _atext)
+_mdtext = r"\[[!-=\?-Z^-~]*\]" # no-fold-literal = "[" *mdtext "]": zero or more chars, not exactly one
+_msg_id_re = re.compile(r'<%s@(?:%s|%s)>' % (_dotatext, _dotatext, _mdtext)) # RFC 5536 §3.1.3
+
+def _parse_message_ids(s):
+    """Return every RFC 5536 message id (angle brackets included) found in s, in order; other text is ignored."""
+    return _msg_id_re.findall(s)
+
+class Address(namedtuple('Address', 'name, address')):
+    """A parsed mailbox (name, address) that also exposes the address's local part as .user."""
+    @property
+    def user(self):
+        return self.address.partition("@")[0]  # everything before the first "@"
 
 def _parse_single_address(value):
     # the email.headerregistry api is truly bizarre