Cleaned up the batch process to create articles and moved it to a script I can easily run with cron.
parent
1e4de11f4c
commit
2c846fdd46
@ -1 +1,2 @@ |
||||
wiki-mad-libs-env/ |
||||
batch/archive/ |
||||
|
@ -0,0 +1,5 @@ |
||||
#!/usr/bin/env bash
# Generate a fresh article, archive a timestamped copy of it, and publish it
# to the web root. Intended to be run unattended (e.g. from cron).

# Stop at the first failing step so a failed `cd` or a failed generation run
# never archives or publishes a stale or missing article.js.
set -e

cd /home/gamerdonkey/code/wiki-mad-libs/batch

# `source` is a bashism; the shebang above keeps it working when this script
# is executed directly and the caller's default shell is a plain POSIX sh.
source ../wiki-mad-libs-env/bin/activate

python process_wiki_article.py

# archive/ is git-ignored, so it does not exist on a fresh checkout.
mkdir -p ./archive
cp article.js "./archive/$(date +%Y%m%dT%H%M%S).js"
mv article.js /home/gamerdonkey/public_html/waki-libs/
@ -0,0 +1,116 @@ |
||||
import json |
||||
import random |
||||
import re |
||||
import requests |
||||
import time |
||||
|
||||
from nltk import pos_tag, sent_tokenize, word_tokenize |
||||
from nltk.corpus import stopwords |
||||
from wikipediaapi import Wikipedia, WikipediaPage |
||||
|
||||
# Part-of-speech tags (as produced by the default pos_tag tagger) that are
# eligible to become blanks, mapped to the human-readable label stored on a
# token when it is chosen. Tag reference:
# https://www.ling.upenn.edu/courses/Fall_2003/ling001/penn_treebank_pos.html
adlib_tags = dict(
    # Adjectives
    JJ="Adjective",
    JJR="Adjective ending in 'er'",
    JJS="Adjective ending in 'est'",
    # Nouns
    NN="Noun",
    NNS="Plural Noun",
    NNP="Proper Noun",
    NNPS="Plural Proper Noun",
    # Adverbs
    RB="Adverb",
    RBR="Adverb ending in 'er'",
    RBS="Adverb ending in 'est'",
    # Verbs
    VB="Verb",
    VBD="Past Tense Verb",
    VBG="Verb ending in 'ing'",
    VBN="Past Tense Verb",
    VBP="Present Tense Verb",
    VBZ="Present Tense Verb ending in 's'",
)
||||
|
||||
# Lower-case month names; these are never offered as blanks.
months = set(
    "january february march april may june "
    "july august september october november december".split()
)

# Words that must never be turned into a blank: NLTK's English stop words
# plus the month names above.
stop_words = set(stopwords.words("english")) | months

# TODO: candidate additional stop words: becomes, become, became, well
||||
|
||||
def get_random_wikipedia_title() -> str:
    """Return the title of a randomly selected English Wikipedia page.

    Requests the REST API's random-title endpoint and returns the ``title``
    field of the first entry in the response's ``items`` list.

    Raises:
        requests.RequestException: If the request fails, times out, or the
            server answers with an HTTP error status.
    """
    response = requests.get(
        'https://en.wikipedia.org/api/rest_v1/page/random/title',
        # Without a timeout an unattended run can hang forever on a stalled
        # connection.
        timeout=30,
    )
    # Surface HTTP errors directly instead of failing later with a confusing
    # JSON decoding error or KeyError on an error page.
    response.raise_for_status()

    return response.json()['items'][0]['title']
||||
|
||||
def get_random_wikipedia_page(wikipedia: Wikipedia, min_length: int = None) -> WikipediaPage:
    """Fetch random pages until one has a sufficiently long summary.

    Args:
        wikipedia: Client whose ``page(title)`` method is used to look up
            each randomly chosen title.
        min_length: Minimum number of characters the page summary must
            contain. When ``None`` (or ``0``) the first page fetched is
            accepted.

    Returns:
        The first randomly chosen page whose summary satisfies
        ``min_length``. The loop is unbounded: it keeps retrying until such
        a page is found.
    """
    while True:
        page = wikipedia.page(get_random_wikipedia_title())
        if not min_length or len(page.summary) >= min_length:
            return page

        print(f"{page.displaytitle} is too short. Retrying...")
        # Pause only before a retry: this spaces out repeated requests
        # without delaying the return of an accepted page.
        time.sleep(3)
||||
|
||||
def sentence_tokenize_and_tag(text: str):
    """Split *text* into sentences and part-of-speech tag each of them.

    Returns:
        A list with one entry per sentence; each entry is the result of
        ``pos_tag`` applied to that sentence's word tokens.
    """
    # Stripped text can leave sentences glued together ("end.Next"); put the
    # missing space back so the sentence tokenizer can see the boundary.
    spaced_text = re.sub(r"(\w+)\.([A-Z])", r"\1. \2", text)

    return [
        pos_tag(word_tokenize(sentence))
        for sentence in sent_tokenize(spaced_text)
    ]
||||
|
||||
def adlibify(wiki_page, min_adlib_rest):
    """Turn a page summary into a token list with some words marked as blanks.

    Args:
        wiki_page: Page object providing ``title``, ``summary``,
            ``displaytitle`` and ``canonicalurl``.
        min_adlib_rest: Threshold controlling blank spacing. A candidate word
            is only blanked when ``random.randint(0, n)`` exceeds this value,
            where ``n`` counts the candidate words passed over since the last
            blank.

    Returns:
        A dict with keys ``title``, ``url`` and ``tokens``. Each token is a
        dict with ``id``, ``token`` and ``tag``; tokens chosen as blanks also
        carry ``adlib_tag`` with the label from ``adlib_tags``.
    """
    proper_noun_tags = ("NNP", "NNPS")
    title_lower = wiki_page.title.lower()
    tokens_out = []
    next_id = 0
    skipped_candidates = 0

    for tagged_sentence in sentence_tokenize_and_tag(wiki_page.summary):
        for word, pos in tagged_sentence:
            # A proper noun directly following another proper noun is folded
            # into the previous token, so multi-word names stay together.
            if pos in proper_noun_tags and next_id > 0:
                previous = tokens_out[-1]
                if previous["tag"] in proper_noun_tags:
                    previous["token"] += " " + word
                    continue

            entry = {"id": next_id, "token": word, "tag": pos}
            label = adlib_tags.get(pos)
            if label is not None:
                # The random draw comes first, as in the original condition;
                # the word must also not be a stop word and must not occur
                # (as a substring) in the lower-cased page title.
                chosen = (
                    random.randint(0, skipped_candidates) > min_adlib_rest
                    and word.lower() not in stop_words
                    and word.lower() not in title_lower
                )
                if chosen:
                    entry["adlib_tag"] = label
                    skipped_candidates = 0
                else:
                    skipped_candidates += 1

            tokens_out.append(entry)
            next_id += 1

    return {
        'title': wiki_page.displaytitle,
        'url': wiki_page.canonicalurl,
        'tokens': tokens_out,
    }
||||
|
||||
|
||||
def main():
    """Pick a random page, adlibify its summary, and write ``article.js``.

    The output file holds a JavaScript assignment (``article = {...}``)
    rather than bare JSON.
    """
    client = Wikipedia('en')
    page = get_random_wikipedia_page(client, 500)
    print(page.title)

    adlibbed = adlibify(page, 4)
    with open("article.js", "w") as out_file:
        out_file.write("article = " + json.dumps(adlibbed))


if __name__ == '__main__':
    main()
Loading…
Reference in new issue