From ca57f9b604317b3e056b40276b70c98a7df61860 Mon Sep 17 00:00:00 2001 From: gamerdonkey Date: Tue, 7 Jun 2022 21:43:51 -0500 Subject: [PATCH 1/3] Initial commit. --- .gitignore | 1 + test.py | 29 +++++++++++++++++++++++++++++ 2 files changed, 30 insertions(+) create mode 100644 .gitignore create mode 100644 test.py diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..bdaab25 --- /dev/null +++ b/.gitignore @@ -0,0 +1 @@ +env/ diff --git a/test.py b/test.py new file mode 100644 index 0000000..2e1923e --- /dev/null +++ b/test.py @@ -0,0 +1,29 @@ +import json +import requests +import wikipediaapi + +from nltk import pos_tag, map_tag +from nltk import word_tokenize + + +def get_random_wikipedia_title(): + random_result = json.loads(requests.get('https://en.wikipedia.org/api/rest_v1/page/random/title').text) + return random_result['items'][0]['title'] + +data = "The quick brown fox jumps over the lazy dog." + +data_pos_tagged = pos_tag(word_tokenize(data)) + +for tagged_word in data_pos_tagged: + print(tagged_word) + +wikipedia = wikipediaapi.Wikipedia('en') +random_page = wikipedia.page(get_random_wikipedia_title()) + +print(random_page.title) + +random_page_summary_tagged = pos_tag(word_tokenize(random_page.summary)) +simple_tags = [(word, map_tag('en-ptb', 'universal', tag)) for word, tag in random_page_summary_tagged] + +print(random_page_summary_tagged) +print(simple_tags) From 44a2b90ec78effe6bc149ba0c69f66a170f37deb Mon Sep 17 00:00:00 2001 From: gamerdonkey Date: Fri, 17 Jun 2022 22:16:18 -0500 Subject: [PATCH 2/3] Updating the test program to break up a Wikipedia page into tokens. 
--- test.py | 56 +++++++++++++++++++++++++++++++++++++++++--------------- 1 file changed, 41 insertions(+), 15 deletions(-) diff --git a/test.py b/test.py index 2e1923e..614205b 100644 --- a/test.py +++ b/test.py @@ -2,28 +2,54 @@ import json import requests import wikipediaapi -from nltk import pos_tag, map_tag -from nltk import word_tokenize +from nltk import pos_tag +from nltk import sent_tokenize, word_tokenize + +# Info about the default pos_tag tags +# https://www.ling.upenn.edu/courses/Fall_2003/ling001/penn_treebank_pos.html +adlib_tags = { + "JJ": "Adjective", + "JJR": "Adjective ending in 'er'", + "JJS": "Adjective ending in 'est'", + "NN": "Noun", + "NNS": "Plural Noun", + "NNP": "Proper Noun", + "NNPS": "Plural Proper Noun", + "RB": "Adverb", + "RBR": "Adverb ending in 'er'", + "RBS": "Adverb ending in 'est'", + "VB": "Verb", + "VBD": "Past Tense Verb", + "VBG": "Verb ending in 'ing'", + "VBN": "Past Tense Verb", + "VBP": "Present Tense Verb", + "VBZ": "Present Tense Verb ending in 's'", +} def get_random_wikipedia_title(): random_result = json.loads(requests.get('https://en.wikipedia.org/api/rest_v1/page/random/title').text) return random_result['items'][0]['title'] -data = "The quick brown fox jumps over the lazy dog." 
- -data_pos_tagged = pos_tag(word_tokenize(data)) - -for tagged_word in data_pos_tagged: - print(tagged_word) - wikipedia = wikipediaapi.Wikipedia('en') -random_page = wikipedia.page(get_random_wikipedia_title()) +wiki_page = wikipedia.page(get_random_wikipedia_title()) -print(random_page.title) +print(wiki_page.title) +print(wiki_page.displaytitle) +print(wiki_page.canonicalurl) -random_page_summary_tagged = pos_tag(word_tokenize(random_page.summary)) -simple_tags = [(word, map_tag('en-ptb', 'universal', tag)) for word, tag in random_page_summary_tagged] +summary = wiki_page.summary +sentences = sent_tokenize(summary) +tagged_sentences = [] +for sentence in sentences: + tagged_sentences.append(pos_tag(word_tokenize(sentence))) -print(random_page_summary_tagged) -print(simple_tags) + +i = 0 +output_tokens = [] +for sentence in tagged_sentences: + for token, tag in sentence: + output_tokens.append({"id": i, "token": token, "tag": tag}) + i += 1 + +print(json.dumps(output_tokens)) From 1db6e08b116a8f91c31ffe918807d62e897b2f56 Mon Sep 17 00:00:00 2001 From: gamerdonkey Date: Sat, 18 Jun 2022 03:38:24 +0000 Subject: [PATCH 3/3] Renaming Python venv dir. --- .gitignore | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.gitignore b/.gitignore index bdaab25..d85cd63 100644 --- a/.gitignore +++ b/.gitignore @@ -1 +1 @@ -env/ +wiki-mad-libs-env/