Updating the test program to break up a Wikipedia page into tokens.
parent ca57f9b604
commit 44a2b90ec7
test.py (56 changed lines)
@@ -2,28 +2,54 @@ import json
 import requests
 import wikipediaapi
 
-from nltk import pos_tag, map_tag
-from nltk import word_tokenize
+from nltk import pos_tag
+from nltk import sent_tokenize, word_tokenize
 
+# Info about the default pos_tag tags
+# https://www.ling.upenn.edu/courses/Fall_2003/ling001/penn_treebank_pos.html
+adlib_tags = {
+    "JJ": "Adjective",
+    "JJR": "Adjective ending in 'er'",
+    "JJS": "Adjective ending in 'est'",
+    "NN": "Noun",
+    "NNS": "Plural Noun",
+    "NNP": "Proper Noun",
+    "NNPS": "Plural Proper Noun",
+    "RB": "Adverb",
+    "RBR": "Adverb ending in 'er'",
+    "RBS": "Adverb ending in 'est'",
+    "VB": "Verb",
+    "VBD": "Past Tense Verb",
+    "VBG": "Verb ending in 'ing'",
+    "VBN": "Past Participle Verb",
+    "VBP": "Present Tense Verb",
+    "VBZ": "Present Tense Verb ending in 's'",
+}
+
+
 def get_random_wikipedia_title():
     random_result = json.loads(requests.get('https://en.wikipedia.org/api/rest_v1/page/random/title').text)
     return random_result['items'][0]['title']
 
-data = "The quick brown fox jumps over the lazy dog."
-
-data_pos_tagged = pos_tag(word_tokenize(data))
-
-for tagged_word in data_pos_tagged:
-    print(tagged_word)
-
 wikipedia = wikipediaapi.Wikipedia('en')
-random_page = wikipedia.page(get_random_wikipedia_title())
+wiki_page = wikipedia.page(get_random_wikipedia_title())
 
-print(random_page.title)
+print(wiki_page.title)
+print(wiki_page.displaytitle)
+print(wiki_page.canonicalurl)
 
-random_page_summary_tagged = pos_tag(word_tokenize(random_page.summary))
-simple_tags = [(word, map_tag('en-ptb', 'universal', tag)) for word, tag in random_page_summary_tagged]
+summary = wiki_page.summary
+sentences = sent_tokenize(summary)
+tagged_sentences = []
+for sentence in sentences:
+    tagged_sentences.append(pos_tag(word_tokenize(sentence)))
 
-print(random_page_summary_tagged)
-print(simple_tags)
+i = 0
+output_tokens = []
+for sentence in tagged_sentences:
+    for token, tag in sentence:
+        output_tokens.append({"id": i, "token": token, "tag": tag})
+        i += 1
 
+print(json.dumps(output_tokens))
+
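One note on get_random_wikipedia_title: it parses the REST response without any error handling, so a failed request surfaces as a confusing KeyError or JSONDecodeError. A possible hardening, not part of this commit (RANDOM_TITLE_URL is just a local name for the same endpoint):

import requests

RANDOM_TITLE_URL = 'https://en.wikipedia.org/api/rest_v1/page/random/title'

def get_random_wikipedia_title():
    # Same endpoint as test.py, with a timeout and an HTTP status check
    # so network failures raise clear exceptions instead of parse errors.
    response = requests.get(RANDOM_TITLE_URL, timeout=10)
    response.raise_for_status()
    # Expected response shape: {"items": [{"title": "...", ...}]}
    return response.json()['items'][0]['title']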
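The adlib_tags table isn't referenced anywhere in this commit yet; presumably it marks which tokens a later ad-lib step can blank out. A rough sketch of such a consumer of the printed JSON (the token values are made up, and only a few tag entries are repeated here for brevity):

import json

# Hypothetical consumer of the JSON that test.py prints.
# Assumption: tokens whose tag appears in adlib_tags are the ones
# an ad-lib game would blank out.
adlib_tags = {
    "JJ": "Adjective",
    "NNP": "Proper Noun",
    "VBZ": "Present Tense Verb ending in 's'",
}

# Stand-in for test.py's output.
tokens = json.loads(
    '[{"id": 0, "token": "Paris", "tag": "NNP"},'
    ' {"id": 1, "token": "is", "tag": "VBZ"},'
    ' {"id": 2, "token": "old", "tag": "JJ"},'
    ' {"id": 3, "token": ".", "tag": "."}]'
)

for t in tokens:
    if t["tag"] in adlib_tags:
        # Print each blankable token with its human-readable tag name.
        print(f'{t["id"]}: {t["token"]} ({adlib_tags[t["tag"]]})')

Each qualifying token comes out on its own line, e.g. "0: Paris (Proper Noun)".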