"""Fetch a random English Wikipedia article and POS-tag its summary.

The script prints the article's title, display title and canonical URL,
then a JSON array with one object per token of the summary. Each object
has an ``id`` (running index across all sentences), the ``token`` text
and its Penn Treebank ``tag``.

Requires the third-party packages ``requests``, ``wikipediaapi`` and
``nltk``; the NLTK tokenizer and tagger data must already be installed.
"""

# Reconstructed from a one-line diff that created this file (test.py).
# The same diff also added a .gitignore containing: wiki-mad-libs-env/

import json
from itertools import chain

import requests
import wikipediaapi
from nltk import pos_tag
from nltk import sent_tokenize, word_tokenize

# REST endpoint that returns the title of one random article.
RANDOM_TITLE_URL = 'https://en.wikipedia.org/api/rest_v1/page/random/title'

# Upper bound (seconds) on the HTTP request so the script cannot hang.
REQUEST_TIMEOUT_SECONDS = 10

# Info about the default pos_tag tags
# https://www.ling.upenn.edu/courses/Fall_2003/ling001/penn_treebank_pos.html
#
# Maps a Penn Treebank tag to the prompt shown to a player. Tags that are
# absent from this table have no prompt defined.
# NOTE(review): VBN is the Penn Treebank past-participle tag, yet it shares
# the "Past Tense Verb" prompt with VBD -- confirm that is intended.
adlib_tags = {
    "JJ": "Adjective",
    "JJR": "Adjective ending in 'er'",
    "JJS": "Adjective ending in 'est'",
    "NN": "Noun",
    "NNS": "Plural Noun",
    "NNP": "Proper Noun",
    "NNPS": "Plural Proper Noun",
    "RB": "Adverb",
    "RBR": "Adverb ending in 'er'",
    "RBS": "Adverb ending in 'est'",
    "VB": "Verb",
    "VBD": "Past Tense Verb",
    "VBG": "Verb ending in 'ing'",
    "VBN": "Past Tense Verb",
    "VBP": "Present Tense Verb",
    "VBZ": "Present Tense Verb ending in 's'",
}


def get_random_wikipedia_title(timeout=REQUEST_TIMEOUT_SECONDS) -> str:
    """Return the title of a random English Wikipedia article.

    Args:
        timeout: Seconds to wait for the Wikipedia REST API before
            giving up.

    Returns:
        The ``title`` field of the first item in the API response.

    Raises:
        requests.HTTPError: If the API answers with an error status.
        requests.RequestException: On connection failure or timeout.
    """
    response = requests.get(RANDOM_TITLE_URL, timeout=timeout)
    # Fail loudly on 4xx/5xx instead of trying to parse an error body.
    response.raise_for_status()
    return response.json()['items'][0]['title']


def _tag_sentences(text) -> list:
    """Split *text* into sentences and POS-tag the tokens of each one."""
    return [
        pos_tag(word_tokenize(sentence))
        for sentence in sent_tokenize(text)
    ]


def _number_tokens(tagged_sentences) -> list:
    """Flatten tagged sentences into dicts with a running ``id``.

    The ``id`` keeps counting across sentence boundaries, so it is the
    token's position within the whole text.
    """
    flat_tokens = chain.from_iterable(tagged_sentences)
    return [
        {"id": index, "token": token, "tag": tag}
        for index, (token, tag) in enumerate(flat_tokens)
    ]


def main() -> None:
    """Fetch a random article, print its metadata and tagged tokens."""
    # NOTE(review): recent Wikipedia-API releases require a user_agent
    # argument here -- confirm against the installed version.
    wikipedia = wikipediaapi.Wikipedia('en')
    wiki_page = wikipedia.page(get_random_wikipedia_title())

    print(wiki_page.title)
    print(wiki_page.displaytitle)
    print(wiki_page.canonicalurl)

    tagged_sentences = _tag_sentences(wiki_page.summary)
    output_tokens = _number_tokens(tagged_sentences)

    print(json.dumps(output_tokens))


if __name__ == "__main__":
    main()