From c7767fc3a424809be904aa834b090b2fe6751685 Mon Sep 17 00:00:00 2001
From: gamerdonkey
Date: Wed, 17 Aug 2022 05:01:43 +0000
Subject: [PATCH] Refactoring the parsing script and adding functionality to
 put multi-word proper nouns together, improving styling and metadata.

---
 test.py        | 61 +++++++++++++++++++++++++-------------------
 web/index.html | 68 +++++++++++++++++++++++++++++++++++++++++++++++---
 web/script.js  | 10 +++++---
 3 files changed, 108 insertions(+), 31 deletions(-)

diff --git a/test.py b/test.py
index 75c8b2f..8431dc6 100644
--- a/test.py
+++ b/test.py
@@ -46,8 +46,11 @@ months = {
 }
 stop_words.update(months)
 
+# More stop words: becomes, become, became, well
+
 def get_random_wikipedia_title() -> str:
     random_result = json.loads(requests.get('https://en.wikipedia.org/api/rest_v1/page/random/title').text)
+
     return random_result['items'][0]['title']
 
 def get_random_wikipedia_page(wikipedia: Wikipedia, min_length: int = None) -> WikipediaPage:
@@ -57,41 +60,49 @@ def get_random_wikipedia_page(wikipedia: Wikipedia, min_length: int = None) -> W
         if(min_length and len(page.summary) < min_length):
             print(f"{page.displaytitle} is too short. Retrying...")
             page = None
-        time.sleep(1)
+        time.sleep(3)
+
     return page
 
+def sentence_tokenize_and_tag(text: str):
+    text = re.sub(r"(\w+)\.([A-Z])", r"\1. \2", text) # Try to break up sentences mashed together by stripping strings
+    sentences = sent_tokenize(text)
+    tagged_sentences = []
+    for sentence in sentences:
+        tagged_sentences.append(pos_tag(word_tokenize(sentence)))
+
+    return tagged_sentences
+
 wikipedia = Wikipedia('en')
 
 wiki_page = get_random_wikipedia_page(wikipedia, 500)
 print(wiki_page.title)
-print(wiki_page.displaytitle)
-print(wiki_page.canonicalurl)
-
-summary = re.sub(r"(\w+)\.([A-Z])", r"\1. \2", wiki_page.summary) # Try to break up sentences mashed together by stripping strings
-title = wiki_page.title
-sentences = sent_tokenize(summary)
-tagged_sentences = []
-for sentence in sentences:
-    tagged_sentences.append(pos_tag(word_tokenize(sentence)))
-
+lowered_title = wiki_page.title.lower()
 i = adlib_word_counter = 0
 min_words = 4
 output_tokens = []
-for sentence in tagged_sentences:
+for sentence in sentence_tokenize_and_tag(wiki_page.summary):
     for token, tag in sentence:
-        output_tokens.append({"id": i, "token": token, "tag": tag})
-        adlib_tag = adlib_tags.get(tag)
-        if adlib_tag is not None:
-            if random.randint(0, adlib_word_counter) > min_words \
-                    and token.lower() not in stop_words \
-                    and token.lower() not in title.lower():
-                output_tokens[-1]["adlib_tag"] = adlib_tag
-                adlib_word_counter = 0
-            else:
-                adlib_word_counter += 1
-            print(f"{token}: {adlib_tag} {adlib_word_counter}")
-        i += 1
+        if tag in ["NNP", "NNPS"] and i > 0 and output_tokens[-1]["tag"] in ["NNP", "NNPS"]:
+            output_tokens[-1]["token"] += f" {token}"
+
+        else:
+            output_token = {"id": i, "token": token, "tag": tag}
+            adlib_tag = adlib_tags.get(tag)
+            if adlib_tag is not None:
+                if random.randint(0, adlib_word_counter) > min_words \
+                        and token.lower() not in stop_words \
+                        and token.lower() not in lowered_title:
+                    output_token["adlib_tag"] = adlib_tag
+                    adlib_word_counter = 0
+                else:
+                    adlib_word_counter += 1
+            output_tokens.append(output_token)
+            i += 1
+
+article = {'title': wiki_page.displaytitle, 'url': wiki_page.canonicalurl}
article['tokens'] = output_tokens
 
 with open("article.js", "w") as json_file:
-    json_file.write(f"article = {json.dumps(output_tokens)}")
+    json_file.write(f"article = {json.dumps(article)}")

diff --git a/web/index.html b/web/index.html
index a4348db..f9e8108 100644
--- a/web/index.html
+++ b/web/index.html
@@ -3,12 +3,74 @@
 [hunk body lost to tag stripping; the surviving text shows the page title
 "WAD-LIBS" and the heading "WAD-LIBS: Wikipedia Ad-Libbed", and the new
 markup adds the title, link, and summary containers and the "visible"
 class styling that web/script.js below relies on]
diff --git a/web/script.js b/web/script.js
index 7c9259a..2f3014c 100644
--- a/web/script.js
+++ b/web/script.js
@@ -16,7 +16,7 @@ function parseArticleJSON_old() {
 
 function createInputs(article) {
     inputs = ''
-    article.forEach(function(token) {
+    article.tokens.forEach(function(token) {
         if(token.adlib_tag) {
             inputs += `[input markup lost to tag stripping]`
@@ -30,8 +30,10 @@ function createInputs(article) {
 }
 
 function showArticle(article) {
+    document.getElementById('title').innerHTML = article.title
+    document.getElementById('link').innerHTML = `<a href="${article.url}">[ Original Article ]</a>`
     let output = ''
-    article.forEach(function(token) {
+    article.tokens.forEach(function(token) {
         let adlib_input = document.getElementById(`token_${token.id}`);
         if(adlib_input && adlib_input.value) {
             output += `${adlib_input.value} `
@@ -40,7 +42,9 @@ function showArticle(article) {
         } else {
             output += `${token.token} `
         }
     })
-    document.getElementById('article').innerHTML = output
+    document.getElementById('summary').innerHTML = output
+
+    document.getElementById('article').classList.add('visible')
 }
 
 createInputs(article)
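
Note: after this patch, the generated article.js carries a single object rather
than a bare token list, which is why both forEach calls in script.js move to
article.tokens. A sketch of the payload shape, with made-up field values (the
adlib_tag strings come from the adlib_tags mapping defined elsewhere in
test.py):

    article = {"title": "Ada Lovelace",
               "url": "https://en.wikipedia.org/wiki/Ada_Lovelace",
               "tokens": [{"id": 0, "token": "Ada Lovelace", "tag": "NNP"},
                          {"id": 1, "token": "wrote", "tag": "VBD", "adlib_tag": "verb"}]}
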