Improving stop-word handling and sentence parsing.

2022-08-09 05:20:36 +00:00 · 2022-08-09 05:20:36 +00:00 · 2d431f03fe
commit 2d431f03fe
parent 54d18d7eeb
1 changed file with 22 additions and 3 deletions
--- a/test.py
+++ b/test.py
@@ -1,5 +1,6 @@
 import json
 import random
+import re
 import requests
 import time

@@ -29,6 +30,21 @@ adlib_tags = {
 }

 stop_words = set(stopwords.words("english"))
+months = {
+    "january",
+    "february",
+    "march",
+    "april",
+    "may",
+    "june",
+    "july",
+    "august",
+    "september",
+    "october",
+    "november",
+    "december",
+}
+stop_words.update(months)

 def get_random_wikipedia_title() -> str:
    random_result = json.loads(requests.get('https://en.wikipedia.org/api/rest_v1/page/random/title').text)
@@ -51,7 +67,8 @@ print(wiki_page.title)
 print(wiki_page.displaytitle)
 print(wiki_page.canonicalurl)

-summary = wiki_page.summary
+summary = re.sub(r"(\w+)\.([A-Z])", r"\1. \2", wiki_page.summary)  # Try to break up sentences mashed together by stripping strings
+title = wiki_page.title
 sentences = sent_tokenize(summary)
 tagged_sentences = []
 for sentence in sentences:
@@ -66,12 +83,14 @@ for sentence in tagged_sentences:
        output_tokens.append({"id": i, "token": token, "tag": tag})
        adlib_tag = adlib_tags.get(tag)
        if adlib_tag is not None:
-            if random.randint(0, adlib_word_counter) > min_words and token not in stop_words:
+            if random.randint(0, adlib_word_counter) > min_words \
+                    and token.lower() not in stop_words \
+                    and token.lower() not in title.lower():
                output_tokens[-1]["adlib_tag"] = adlib_tag
                adlib_word_counter = 0
            else:
                adlib_word_counter += 1
-
+        print(f"{token}: {adlib_tag} {adlib_word_counter}")
        i += 1

 with open("article.js", "w") as json_file: