From 2c846fdd46590a03f0d08e4b011eb97ea26118df Mon Sep 17 00:00:00 2001
From: gamerdonkey
Date: Sat, 20 Aug 2022 03:48:12 +0000
Subject: [PATCH] Cleaned up the batch process to create articles and moved it to a script I can easily run with cron.

---
 .gitignore                    |   1 +
 batch/create_adlib_article.sh |   5 ++
 batch/process_wiki_article.py | 116 ++++++++++++++++++++++++++++++++++
 3 files changed, 122 insertions(+)
 create mode 100755 batch/create_adlib_article.sh
 create mode 100644 batch/process_wiki_article.py

diff --git a/.gitignore b/.gitignore
index d85cd63..9dcc417 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1 +1,2 @@
 wiki-mad-libs-env/
+batch/archive/
diff --git a/batch/create_adlib_article.sh b/batch/create_adlib_article.sh
new file mode 100755
index 0000000..b265b76
--- /dev/null
+++ b/batch/create_adlib_article.sh
@@ -0,0 +1,5 @@
+cd /home/gamerdonkey/code/wiki-mad-libs/batch
+source ../wiki-mad-libs-env/bin/activate
+python process_wiki_article.py
+cp article.js ./archive/$(date +%Y%m%dT%H%M%S).js
+mv article.js /home/gamerdonkey/public_html/waki-libs/
diff --git a/batch/process_wiki_article.py b/batch/process_wiki_article.py
new file mode 100644
index 0000000..9d93c94
--- /dev/null
+++ b/batch/process_wiki_article.py
@@ -0,0 +1,116 @@
+import json
+import random
+import re
+import requests
+import time
+
+from nltk import pos_tag, sent_tokenize, word_tokenize
+from nltk.corpus import stopwords
+from wikipediaapi import Wikipedia, WikipediaPage
+
+# Info about the default pos_tag tags
+# https://www.ling.upenn.edu/courses/Fall_2003/ling001/penn_treebank_pos.html
+adlib_tags = {
+    "JJ": "Adjective",
+    "JJR": "Adjective ending in 'er'",
+    "JJS": "Adjective ending in 'est'",
+    "NN": "Noun",
+    "NNS": "Plural Noun",
+    "NNP": "Proper Noun",
+    "NNPS": "Plural Proper Noun",
+    "RB": "Adverb",
+    "RBR": "Adverb ending in 'er'",
+    "RBS": "Adverb ending in 'est'",
+    "VB": "Verb",
+    "VBD": "Past Tense Verb",
+    "VBG": "Verb ending in 'ing'",
+    "VBN": "Past Tense Verb",
+    "VBP": "Present Tense Verb",
+    "VBZ": "Present Tense Verb ending in 's'",
+}
+
+stop_words = set(stopwords.words("english"))
+months = {
+    "january",
+    "february",
+    "march",
+    "april",
+    "may",
+    "june",
+    "july",
+    "august",
+    "september",
+    "october",
+    "november",
+    "december",
+}
+stop_words.update(months)
+
+# More stop words: becomes, become, became, well
+
+def get_random_wikipedia_title() -> str:
+    random_result = json.loads(requests.get('https://en.wikipedia.org/api/rest_v1/page/random/title').text)
+
+    return random_result['items'][0]['title']
+
+def get_random_wikipedia_page(wikipedia: Wikipedia, min_length: int = None) -> WikipediaPage:
+    page = None
+    while(page is None):
+        page = wikipedia.page(get_random_wikipedia_title())
+        if(min_length and len(page.summary) < min_length):
+            print(f"{page.displaytitle} is too short. Retrying...")
+            page = None
+            time.sleep(3)
+
+    return page
+
+def sentence_tokenize_and_tag(text: str):
+    text = re.sub(r"(\w+)\.([A-Z])", r"\1. \2", text)  # Try to break up sentences mashed together by stripping strings
+    sentences = sent_tokenize(text)
+    tagged_sentences = []
+    for sentence in sentences:
+        tagged_sentences.append(pos_tag(word_tokenize(sentence)))
+
+    return tagged_sentences
+
+def adlibify(wiki_page, min_adlib_rest):
+    lowered_title = wiki_page.title.lower()
+    i = adlib_word_counter = 0
+    output_tokens = []
+    for sentence in sentence_tokenize_and_tag(wiki_page.summary):
+        for token, tag in sentence:
+            if tag in ["NNP", "NNPS"] and i > 0 and output_tokens[-1]["tag"] in ["NNP", "NNPS"]:
+                output_tokens[-1]["token"] += f" {token}"
+
+            else:
+                output_token = {"id": i, "token": token, "tag": tag}
+                adlib_tag = adlib_tags.get(tag)
+                if adlib_tag is not None:
+                    if random.randint(0, adlib_word_counter) > min_adlib_rest \
+                            and token.lower() not in stop_words \
+                            and token.lower() not in lowered_title:
+                        output_token["adlib_tag"] = adlib_tag
+                        adlib_word_counter = 0
+                    else:
+                        adlib_word_counter += 1
+                output_tokens.append(output_token)
+                i += 1
+
+    article = {'title': wiki_page.displaytitle, 'url': wiki_page.canonicalurl}
+    article['tokens'] = output_tokens
+
+    return article
+
+
+def main():
+    wikipedia = Wikipedia('en')
+    wiki_page = get_random_wikipedia_page(wikipedia, 500)
+
+    print(wiki_page.title)
+
+    article = adlibify(wiki_page, 4)
+    with open("article.js", "w") as json_file:
+        json_file.write(f"article = {json.dumps(article)}")
+
+if __name__ == '__main__':
+    main()