From 2c846fdd46590a03f0d08e4b011eb97ea26118df Mon Sep 17 00:00:00 2001
From: gamerdonkey
Date: Sat, 20 Aug 2022 03:48:12 +0000
Subject: [PATCH] Cleaned up the batch process to create articles and moved it to a script I can easily run with cron.

---
 .gitignore                    |   1 +
 batch/create_adlib_article.sh |   5 ++
 batch/process_wiki_article.py | 116 ++++++++++++++++++++++++++++++++++
 3 files changed, 122 insertions(+)
 create mode 100755 batch/create_adlib_article.sh
 create mode 100644 batch/process_wiki_article.py

diff --git a/.gitignore b/.gitignore
index d85cd63..9dcc417 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1 +1,2 @@
 wiki-mad-libs-env/
+batch/archive/
diff --git a/batch/create_adlib_article.sh b/batch/create_adlib_article.sh
new file mode 100755
index 0000000..b265b76
--- /dev/null
+++ b/batch/create_adlib_article.sh
@@ -0,0 +1,5 @@
+cd /home/gamerdonkey/code/wiki-mad-libs/batch
+source ../wiki-mad-libs-env/bin/activate
+python process_wiki_article.py
+cp article.js ./archive/$(date +%Y%m%dT%H%M%S).js
+mv article.js /home/gamerdonkey/public_html/waki-libs/
diff --git a/batch/process_wiki_article.py b/batch/process_wiki_article.py
new file mode 100644
index 0000000..9d93c94
--- /dev/null
+++ b/batch/process_wiki_article.py
@@ -0,0 +1,116 @@
+import json
+import random
+import re
+import requests
+import time
+
+from nltk import pos_tag, sent_tokenize, word_tokenize
+from nltk.corpus import stopwords
+from wikipediaapi import Wikipedia, WikipediaPage
+
+# Info about the default pos_tag tags
+# https://www.ling.upenn.edu/courses/Fall_2003/ling001/penn_treebank_pos.html
+adlib_tags = {
+    "JJ": "Adjective",
+    "JJR": "Adjective ending in 'er'",
+    "JJS": "Adjective ending in 'est'",
+    "NN": "Noun",
+    "NNS": "Plural Noun",
+    "NNP": "Proper Noun",
+    "NNPS": "Plural Proper Noun",
+    "RB": "Adverb",
+    "RBR": "Adverb ending in 'er'",
+    "RBS": "Adverb ending in 'est'",
+    "VB": "Verb",
+    "VBD": "Past Tense Verb",
+    "VBG": "Verb ending in 'ing'",
+    "VBN": "Past Tense Verb",
+    "VBP": "Present Tense Verb",
+    "VBZ": "Present Tense Verb ending in 's'",
+}
+
+stop_words = set(stopwords.words("english"))
+months = {
+    "january",
+    "february",
+    "march",
+    "april",
+    "may",
+    "june",
+    "july",
+    "august",
+    "september",
+    "october",
+    "november",
+    "december",
+}
+stop_words.update(months)
+
+# More stop words: becomes, become, became, well
+
+def get_random_wikipedia_title() -> str:
+    random_result = json.loads(requests.get('https://en.wikipedia.org/api/rest_v1/page/random/title').text)
+
+    return random_result['items'][0]['title']
+
+def get_random_wikipedia_page(wikipedia: Wikipedia, min_length: int = None) -> WikipediaPage:
+    page = None
+    while(page is None):
+        page = wikipedia.page(get_random_wikipedia_title())
+        if(min_length and len(page.summary) < min_length):
+            print(f"{page.displaytitle} is too short. Retrying...")
+            page = None
+            time.sleep(3)
+
+    return page
+
+def sentence_tokenize_and_tag(text: str):
+    text = re.sub(r"(\w+)\.([A-Z])", r"\1. \2", text)  # Try to break up sentences mashed together by stripping strings
+    sentences = sent_tokenize(text)
+    tagged_sentences = []
+    for sentence in sentences:
+        tagged_sentences.append(pos_tag(word_tokenize(sentence)))
+
+    return tagged_sentences
+
+def adlibify(wiki_page, min_adlib_rest):
+    lowered_title = wiki_page.title.lower()
+    i = adlib_word_counter = 0
+    output_tokens = []
+    for sentence in sentence_tokenize_and_tag(wiki_page.summary):
+        for token, tag in sentence:
+            if tag in ["NNP", "NNPS"] and i > 0 and output_tokens[-1]["tag"] in ["NNP", "NNPS"]:
+                output_tokens[-1]["token"] += f" {token}"
+
+            else:
+                output_token = {"id": i, "token": token, "tag": tag}
+                adlib_tag = adlib_tags.get(tag)
+                if adlib_tag is not None:
+                    if random.randint(0, adlib_word_counter) > min_adlib_rest \
+                            and token.lower() not in stop_words \
+                            and token.lower() not in lowered_title:
+                        output_token["adlib_tag"] = adlib_tag
+                        adlib_word_counter = 0
+                    else:
+                        adlib_word_counter += 1
+                output_tokens.append(output_token)
+                i += 1
+
+    article = {'title': wiki_page.displaytitle, 'url': wiki_page.canonicalurl}
+    article['tokens'] = output_tokens
+
+    return article
+
+
+def main():
+    wikipedia = Wikipedia('en')
+    wiki_page = get_random_wikipedia_page(wikipedia, 500)
+
+    print(wiki_page.title)
+
+    article = adlibify(wiki_page, 4)
+    with open("article.js", "w") as json_file:
+        json_file.write(f"article = {json.dumps(article)}")
+
+if __name__ == '__main__':
+    main()