import json

import requests
import wikipediaapi
from nltk import pos_tag
from nltk import sent_tokenize, word_tokenize

# Requires the NLTK "punkt" and "averaged_perceptron_tagger" data packages,
# e.g. nltk.download('punkt') and nltk.download('averaged_perceptron_tagger').

# Info about the default pos_tag tags:
# https://www.ling.upenn.edu/courses/Fall_2003/ling001/penn_treebank_pos.html
adlib_tags = {
    "JJ": "Adjective",
    "JJR": "Adjective ending in 'er'",
    "JJS": "Adjective ending in 'est'",
    "NN": "Noun",
    "NNS": "Plural Noun",
    "NNP": "Proper Noun",
    "NNPS": "Plural Proper Noun",
    "RB": "Adverb",
    "RBR": "Adverb ending in 'er'",
    "RBS": "Adverb ending in 'est'",
    "VB": "Verb",
    "VBD": "Past Tense Verb",
    "VBG": "Verb ending in 'ing'",
    "VBN": "Past Tense Verb",
    "VBP": "Present Tense Verb",
    "VBZ": "Present Tense Verb ending in 's'",
}


def get_random_wikipedia_title():
    # Ask the Wikipedia REST API for a single random article title.
    random_result = json.loads(
        requests.get('https://en.wikipedia.org/api/rest_v1/page/random/title').text
    )
    return random_result['items'][0]['title']


# Fetch the random page and print some basic metadata about it.
wikipedia = wikipediaapi.Wikipedia('en')
wiki_page = wikipedia.page(get_random_wikipedia_title())

print(wiki_page.title)
print(wiki_page.displaytitle)
print(wiki_page.canonicalurl)

summary = wiki_page.summary

# Split the summary into sentences, then POS-tag the words in each sentence.
sentences = sent_tokenize(summary)
tagged_sentences = []
for sentence in sentences:
    tagged_sentences.append(pos_tag(word_tokenize(sentence)))

# Flatten the tagged sentences into a single list of numbered tokens.
i = 0
output_tokens = []
for sentence in tagged_sentences:
    for token, tag in sentence:
        output_tokens.append({"id": i, "token": token, "tag": tag})
        i += 1

print(json.dumps(output_tokens))
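

# A minimal sketch of how the tagged tokens and the adlib_tags mapping might
# feed an ad-lib style prompt loop. The original script only emits the tagged
# tokens as JSON; the function name prompt_for_adlibs and the choice to
# replace every fifth eligible token are assumptions, not part of the source.
def prompt_for_adlibs(tokens, every_nth=5):
    """Ask the user for replacement words and return the rebuilt text."""
    words = []
    eligible_seen = 0
    for entry in tokens:
        tag = entry["tag"]
        if tag in adlib_tags:
            eligible_seen += 1
            if eligible_seen % every_nth == 0:
                # Prompt with the human-readable tag name, e.g. "Plural Noun".
                replacement = input(f"Enter a {adlib_tags[tag]}: ")
                words.append(replacement)
                continue
        words.append(entry["token"])
    # Naive join with spaces; punctuation spacing is not handled here.
    return " ".join(words)


# Example usage (uncomment to run interactively):
# print(prompt_for_adlibs(output_tokens))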