From 44a2b90ec78effe6bc149ba0c69f66a170f37deb Mon Sep 17 00:00:00 2001
From: gamerdonkey
Date: Fri, 17 Jun 2022 22:16:18 -0500
Subject: [PATCH] Updating the test program to break up a wikipedia page into
 tokens.

---
 test.py | 56 +++++++++++++++++++++++++++++++++++++++++---------------
 1 file changed, 41 insertions(+), 15 deletions(-)

diff --git a/test.py b/test.py
index 2e1923e..614205b 100644
--- a/test.py
+++ b/test.py
@@ -2,28 +2,54 @@ import json
 import requests
 import wikipediaapi
 
-from nltk import pos_tag, map_tag
-from nltk import word_tokenize
+from nltk import pos_tag
+from nltk import sent_tokenize, word_tokenize
+
+# Info about the default pos_tag tags
+# https://www.ling.upenn.edu/courses/Fall_2003/ling001/penn_treebank_pos.html
+adlib_tags = {
+    "JJ": "Adjective",
+    "JJR": "Adjective ending in 'er'",
+    "JJS": "Adjective ending in 'est'",
+    "NN": "Noun",
+    "NNS": "Plural Noun",
+    "NNP": "Proper Noun",
+    "NNPS": "Plural Proper Noun",
+    "RB": "Adverb",
+    "RBR": "Adverb ending in 'er'",
+    "RBS": "Adverb ending in 'est'",
+    "VB": "Verb",
+    "VBD": "Past Tense Verb",
+    "VBG": "Verb ending in 'ing'",
+    "VBN": "Past Tense Verb",
+    "VBP": "Present Tense Verb",
+    "VBZ": "Present Tense Verb ending in 's'",
+}
 
 def get_random_wikipedia_title():
     random_result = json.loads(requests.get('https://en.wikipedia.org/api/rest_v1/page/random/title').text)
 
     return random_result['items'][0]['title']
 
-data = "The quick brown fox jumps over the lazy dog."
-
-data_pos_tagged = pos_tag(word_tokenize(data))
-
-for tagged_word in data_pos_tagged:
-    print(tagged_word)
-
 wikipedia = wikipediaapi.Wikipedia('en')
-random_page = wikipedia.page(get_random_wikipedia_title())
+wiki_page = wikipedia.page(get_random_wikipedia_title())
 
-print(random_page.title)
+print(wiki_page.title)
+print(wiki_page.displaytitle)
+print(wiki_page.canonicalurl)
 
-random_page_summary_tagged = pos_tag(word_tokenize(random_page.summary))
-simple_tags = [(word, map_tag('en-ptb', 'universal', tag)) for word, tag in random_page_summary_tagged]
+summary = wiki_page.summary
+sentences = sent_tokenize(summary)
+tagged_sentences = []
+for sentence in sentences:
+    tagged_sentences.append(pos_tag(word_tokenize(sentence)))
 
-print(random_page_summary_tagged)
-print(simple_tags)
+
+i = 0
+output_tokens = []
+for sentence in tagged_sentences:
+    for token, tag in sentence:
+        output_tokens.append({"id": i, "token": token, "tag": tag})
+        i += 1
+
+print(json.dumps(output_tokens))
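Note (not part of the patch): the adlib_tags table added above is defined but not yet consumed by test.py. A minimal sketch of one possible next step is below, filtering the printed token list down to the tags that have ad-lib descriptions. Everything in it (the build_adlib_prompts function, the tokens.json file name, the trimmed tag table) is hypothetical and only illustrates the idea, assuming the JSON array printed by test.py has been saved to a file.

import json

# Trimmed copy of adlib_tags from test.py, just for this example.
adlib_tags = {
    "NN": "Noun",
    "JJ": "Adjective",
    "VB": "Verb",
}

def build_adlib_prompts(output_tokens):
    # Keep only the tokens whose Penn Treebank tag has an ad-lib description,
    # remembering each token's id so it can be swapped back in later.
    prompts = []
    for entry in output_tokens:
        description = adlib_tags.get(entry["tag"])
        if description is not None:
            prompts.append({"id": entry["id"], "prompt": description})
    return prompts

if __name__ == "__main__":
    # tokens.json is a hypothetical file holding the JSON that test.py prints.
    with open("tokens.json") as f:
        tokens = json.load(f)
    print(build_adlib_prompts(tokens))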