From 2d431f03fe2c1ff8bee9c230b2e1882862ba698c Mon Sep 17 00:00:00 2001
From: gamerdonkey <gamerdonkey@tilde.town>
Date: Tue, 9 Aug 2022 05:20:36 +0000
Subject: [PATCH] Improving stop word handling, also improving sentence
 parsing.

---
 test.py | 25 ++++++++++++++++++++++---
 1 file changed, 22 insertions(+), 3 deletions(-)

diff --git a/test.py b/test.py
index 030e7ef..75c8b2f 100644
--- a/test.py
+++ b/test.py
@@ -1,5 +1,6 @@
 import json
 import random
+import re
 import requests
 import time
 
@@ -29,6 +30,21 @@ adlib_tags = {
 }
 
 stop_words = set(stopwords.words("english"))
+months = {  # lowercase English month names, merged into stop_words below
+    "january",
+    "february",
+    "march",
+    "april",
+    "may",
+    "june",
+    "july",
+    "august",
+    "september",
+    "october",
+    "november",
+    "december",
+}
+stop_words.update(months)  # the token loop skips any token whose lowercase form is in stop_words, so month names never become ad-lib words
 
 def get_random_wikipedia_title() -> str:
     random_result = json.loads(requests.get('https://en.wikipedia.org/api/rest_v1/page/random/title').text)
@@ -51,7 +67,8 @@ print(wiki_page.title)
 print(wiki_page.displaytitle)
 print(wiki_page.canonicalurl)
 
-summary = wiki_page.summary
+summary = re.sub(r"(\w+)\.([A-Z])", r"\1. \2", wiki_page.summary)  # Insert a space where a word's period is immediately followed by a capital letter, so run-together sentences ("end.Next") can be split by sent_tokenize; NOTE(review): this also matches inside dotted abbreviations such as "U.S."
+title = wiki_page.title  # kept so the token loop can skip words that appear in the page title
 sentences = sent_tokenize(summary)
 tagged_sentences = []
 for sentence in sentences:
@@ -66,12 +83,14 @@ for sentence in tagged_sentences:
         output_tokens.append({"id": i, "token": token, "tag": tag})
         adlib_tag = adlib_tags.get(tag)
         if adlib_tag is not None:
-            if random.randint(0, adlib_word_counter) > min_words and token not in stop_words:
+            if random.randint(0, adlib_word_counter) > min_words \
+                    and token.lower() not in stop_words \
+                    and token.lower() not in title.lower():  # case-insensitive substring test against the whole title string, so a token contained inside a longer title word is also skipped
                 output_tokens[-1]["adlib_tag"] = adlib_tag
                 adlib_word_counter = 0
             else:
                 adlib_word_counter += 1
-
+        print(f"{token}: {adlib_tag} {adlib_word_counter}")  # per-token trace of tag and counter; NOTE(review): looks like debugging output - confirm it should ship
         i += 1
 
 with open("article.js", "w") as json_file: