Cleaned up the batch process to create articles and moved it to a script I can easily run with cron.
parent 1e4de11f4c
commit 2c846fdd46
@@ -1 +1,2 @@
wiki-mad-libs-env/
batch/archive/
@@ -0,0 +1,5 @@
cd /home/gamerdonkey/code/wiki-mad-libs/batch
source ../wiki-mad-libs-env/bin/activate
python process_wiki_article.py
cp article.js ./archive/$(date +%Y%m%dT%H%M%S).js
mv article.js /home/gamerdonkey/public_html/waki-libs/
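The commit message says this is meant to be run from cron. A minimal sketch of a crontab entry, assuming the five-line script above is saved as batch/run.sh (that filename is not shown in this diff) and should run once a day at 06:00:

0 6 * * * /bin/bash /home/gamerdonkey/code/wiki-mad-libs/batch/run.sh

Invoking it through bash matters because the script uses source, which a plain POSIX sh may not provide.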
@@ -0,0 +1,116 @@
import json
import random
import re
import requests
import time

from nltk import pos_tag, sent_tokenize, word_tokenize
from nltk.corpus import stopwords
from wikipediaapi import Wikipedia, WikipediaPage

# Info about the default pos_tag tags
# https://www.ling.upenn.edu/courses/Fall_2003/ling001/penn_treebank_pos.html
adlib_tags = {
    "JJ": "Adjective",
    "JJR": "Adjective ending in 'er'",
    "JJS": "Adjective ending in 'est'",
    "NN": "Noun",
    "NNS": "Plural Noun",
    "NNP": "Proper Noun",
    "NNPS": "Plural Proper Noun",
    "RB": "Adverb",
    "RBR": "Adverb ending in 'er'",
    "RBS": "Adverb ending in 'est'",
    "VB": "Verb",
    "VBD": "Past Tense Verb",
    "VBG": "Verb ending in 'ing'",
    "VBN": "Past Tense Verb",
    "VBP": "Present Tense Verb",
    "VBZ": "Present Tense Verb ending in 's'",
}
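
# (Illustration only, not part of the committed file: pos_tag takes a
# tokenized sentence and returns (token, tag) pairs from the Penn Treebank
# tagset linked above, roughly like
#     pos_tag(word_tokenize("She quickly wrote letters"))
#     -> [('She', 'PRP'), ('quickly', 'RB'), ('wrote', 'VBD'), ('letters', 'NNS')]
# adlib_tags maps the subset of tags that make good fill-in-the-blank prompts
# to human-readable labels.)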

stop_words = set(stopwords.words("english"))
months = {
    "january",
    "february",
    "march",
    "april",
    "may",
    "june",
    "july",
    "august",
    "september",
    "october",
    "november",
    "december",
}
stop_words.update(months)

# More stop words: becomes, become, became, well


def get_random_wikipedia_title() -> str:
    random_result = json.loads(requests.get('https://en.wikipedia.org/api/rest_v1/page/random/title').text)

    return random_result['items'][0]['title']


def get_random_wikipedia_page(wikipedia: Wikipedia, min_length: int = None) -> WikipediaPage:
    page = None
    while page is None:
        page = wikipedia.page(get_random_wikipedia_title())
        if min_length and len(page.summary) < min_length:
            print(f"{page.displaytitle} is too short. Retrying...")
            page = None
        time.sleep(3)

    return page


def sentence_tokenize_and_tag(text: str):
    text = re.sub(r"(\w+)\.([A-Z])", r"\1. \2", text)  # Try to break up sentences mashed together by stripping strings
    sentences = sent_tokenize(text)
    tagged_sentences = []
    for sentence in sentences:
        tagged_sentences.append(pos_tag(word_tokenize(sentence)))

    return tagged_sentences


def adlibify(wiki_page, min_adlib_rest):
    lowered_title = wiki_page.title.lower()
    i = adlib_word_counter = 0
    output_tokens = []
    for sentence in sentence_tokenize_and_tag(wiki_page.summary):
        for token, tag in sentence:
            if tag in ["NNP", "NNPS"] and i > 0 and output_tokens[-1]["tag"] in ["NNP", "NNPS"]:
                output_tokens[-1]["token"] += f" {token}"

            else:
                output_token = {"id": i, "token": token, "tag": tag}
                adlib_tag = adlib_tags.get(tag)
                if adlib_tag is not None:
                    if random.randint(0, adlib_word_counter) > min_adlib_rest \
                            and token.lower() not in stop_words \
                            and token.lower() not in lowered_title:
                        output_token["adlib_tag"] = adlib_tag
                        adlib_word_counter = 0
                    else:
                        adlib_word_counter += 1
                output_tokens.append(output_token)
                i += 1

    article = {'title': wiki_page.displaytitle, 'url': wiki_page.canonicalurl}
    article['tokens'] = output_tokens

    return article


def main():
    wikipedia = Wikipedia('en')
    wiki_page = get_random_wikipedia_page(wikipedia, 500)

    print(wiki_page.title)

    article = adlibify(wiki_page, 4)
    with open("article.js", "w") as json_file:
        json_file.write(f"article = {json.dumps(article)}")


if __name__ == '__main__':
    main()
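For reference, a rough sketch of the article.js payload that main() writes. The field names follow adlibify() above; the title, URL, and token values here are invented for illustration:

article = {"title": "Example Article", "url": "https://en.wikipedia.org/wiki/Example_Article", "tokens": [{"id": 0, "token": "Example Article", "tag": "NNP"}, {"id": 1, "token": "is", "tag": "VBZ"}, {"id": 2, "token": "described", "tag": "VBN", "adlib_tag": "Past Tense Verb"}, ...]}

Tokens that fall inside the page title or the stop word list keep their tag but never get an adlib_tag, which matches the checks in adlibify().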