Cleaned up the batch process to create articles and moved it to a script I can easily run with cron.
parent 1e4de11f4c
commit 2c846fdd46
@@ -1 +1,2 @@
wiki-mad-libs-env/
batch/archive/
@@ -0,0 +1,5 @@
cd /home/gamerdonkey/code/wiki-mad-libs/batch
source ../wiki-mad-libs-env/bin/activate
python process_wiki_article.py
cp article.js ./archive/$(date +%Y%m%dT%H%M%S).js
mv article.js /home/gamerdonkey/public_html/waki-libs/
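The commit message says this is meant to be run from cron. A minimal sketch of a crontab entry, assuming the five-line script above is saved as batch/run.sh (that filename is not shown in this diff) and should run once a day at 06:00:

0 6 * * * /bin/bash /home/gamerdonkey/code/wiki-mad-libs/batch/run.sh

Invoking it through bash matters because the script uses source, which a plain POSIX sh may not provide.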
@@ -0,0 +1,116 @@
import json
import random
import re
import requests
import time

from nltk import pos_tag, sent_tokenize, word_tokenize
from nltk.corpus import stopwords
from wikipediaapi import Wikipedia, WikipediaPage

# Info about the default pos_tag tags
# https://www.ling.upenn.edu/courses/Fall_2003/ling001/penn_treebank_pos.html
adlib_tags = {
    "JJ": "Adjective",
    "JJR": "Adjective ending in 'er'",
    "JJS": "Adjective ending in 'est'",
    "NN": "Noun",
    "NNS": "Plural Noun",
    "NNP": "Proper Noun",
    "NNPS": "Plural Proper Noun",
    "RB": "Adverb",
    "RBR": "Adverb ending in 'er'",
    "RBS": "Adverb ending in 'est'",
    "VB": "Verb",
    "VBD": "Past Tense Verb",
    "VBG": "Verb ending in 'ing'",
    "VBN": "Past Tense Verb",
    "VBP": "Present Tense Verb",
    "VBZ": "Present Tense Verb ending in 's'",
}
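
# (Illustration only, not part of the committed file: pos_tag takes a
# tokenized sentence and returns (token, tag) pairs from the Penn Treebank
# tagset linked above, roughly like
#     pos_tag(word_tokenize("She quickly wrote letters"))
#     -> [('She', 'PRP'), ('quickly', 'RB'), ('wrote', 'VBD'), ('letters', 'NNS')]
# adlib_tags maps the subset of tags that make good fill-in-the-blank prompts
# to human-readable labels.)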

stop_words = set(stopwords.words("english"))
months = {
    "january",
    "february",
    "march",
    "april",
    "may",
    "june",
    "july",
    "august",
    "september",
    "october",
    "november",
    "december",
}
stop_words.update(months)

# More stop words: becomes, become, became, well


def get_random_wikipedia_title() -> str:
    random_result = json.loads(requests.get('https://en.wikipedia.org/api/rest_v1/page/random/title').text)

    return random_result['items'][0]['title']


def get_random_wikipedia_page(wikipedia: Wikipedia, min_length: int = None) -> WikipediaPage:
    page = None
    while page is None:
        page = wikipedia.page(get_random_wikipedia_title())
        if min_length and len(page.summary) < min_length:
            print(f"{page.displaytitle} is too short. Retrying...")
            page = None
        time.sleep(3)

    return page


def sentence_tokenize_and_tag(text: str):
    text = re.sub(r"(\w+)\.([A-Z])", r"\1. \2", text)  # Try to break up sentences mashed together by stripping strings
    sentences = sent_tokenize(text)
    tagged_sentences = []
    for sentence in sentences:
        tagged_sentences.append(pos_tag(word_tokenize(sentence)))

    return tagged_sentences


def adlibify(wiki_page, min_adlib_rest):
    lowered_title = wiki_page.title.lower()
    i = adlib_word_counter = 0
    output_tokens = []
    for sentence in sentence_tokenize_and_tag(wiki_page.summary):
        for token, tag in sentence:
            if tag in ["NNP", "NNPS"] and i > 0 and output_tokens[-1]["tag"] in ["NNP", "NNPS"]:
                output_tokens[-1]["token"] += f" {token}"

            else:
                output_token = {"id": i, "token": token, "tag": tag}
                adlib_tag = adlib_tags.get(tag)
                if adlib_tag is not None:
                    if random.randint(0, adlib_word_counter) > min_adlib_rest \
                            and token.lower() not in stop_words \
                            and token.lower() not in lowered_title:
                        output_token["adlib_tag"] = adlib_tag
                        adlib_word_counter = 0
                    else:
                        adlib_word_counter += 1
                output_tokens.append(output_token)
                i += 1

    article = {'title': wiki_page.displaytitle, 'url': wiki_page.canonicalurl}
    article['tokens'] = output_tokens

    return article


def main():
    wikipedia = Wikipedia('en')
    wiki_page = get_random_wikipedia_page(wikipedia, 500)

    print(wiki_page.title)

    article = adlibify(wiki_page, 4)
    with open("article.js", "w") as json_file:
        json_file.write(f"article = {json.dumps(article)}")


if __name__ == '__main__':
    main()
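For reference, a rough sketch of the article.js payload that main() writes. The field names follow adlibify() above; the title, URL, and token values here are invented for illustration:

article = {"title": "Example Article", "url": "https://en.wikipedia.org/wiki/Example_Article", "tokens": [{"id": 0, "token": "Example Article", "tag": "NNP"}, {"id": 1, "token": "is", "tag": "VBZ"}, {"id": 2, "token": "described", "tag": "VBN", "adlib_tag": "Past Tense Verb"}, ...]}

Tokens that fall inside the page title or the stop word list keep their tag but never get an adlib_tag, which matches the checks in adlibify().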