Merge branch 'initial-branch' into main

2022-06-18 03:43:23 +00:00 · 2022-06-18 03:43:23 +00:00 · 79f25bf5f9
commit 79f25bf5f9
parent 6584db9bd5 1db6e08b11
2 changed files with 56 additions and 0 deletions
--- a/.gitignore
+++ b/.gitignore
@@ -0,0 +1 @@
+wiki-mad-libs-env/
--- a/test.py
+++ b/test.py
@@ -0,0 +1,55 @@
+import json
+import requests
+import wikipediaapi
+
+from nltk import pos_tag
+from nltk import sent_tokenize, word_tokenize
+
+# Maps the part-of-speech tags produced by the default pos_tag tagger (Penn
+# Treebank tagset) to a human-readable label for that word class.
+# NOTE(review): presumably these labels are the prompts shown to a player;
+# the mapping is not used anywhere in the visible code -- confirm.
+# Tag reference:
+# https://www.ling.upenn.edu/courses/Fall_2003/ling001/penn_treebank_pos.html
+adlib_tags = {
+    "JJ": "Adjective",
+    "JJR": "Adjective ending in 'er'",
+    "JJS": "Adjective ending in 'est'",
+    "NN": "Noun",
+    "NNS": "Plural Noun",
+    "NNP": "Proper Noun",
+    "NNPS": "Plural Proper Noun",
+    "RB": "Adverb",
+    "RBR": "Adverb ending in 'er'",
+    "RBS": "Adverb ending in 'est'",
+    "VB": "Verb",
+    "VBD": "Past Tense Verb",
+    "VBG": "Verb ending in 'ing'",
+    # VBN is the past participle ("eaten"), not the simple past ("ate", which
+    # is VBD), so it gets its own label instead of duplicating VBD's.
+    "VBN": "Past Participle Verb",
+    "VBP": "Present Tense Verb",
+    "VBZ": "Present Tense Verb ending in 's'",
+}
+
+
+def get_random_wikipedia_title():
+    """Return the title of a randomly selected English Wikipedia page.
+
+    Requests the Wikipedia REST API's ``page/random/title`` endpoint and
+    returns the ``title`` field of the first entry in the response's
+    ``items`` list.
+
+    Raises:
+        requests.RequestException: if the request fails, times out, or the
+            server responds with an HTTP error status.
+        ValueError: if the response body is not valid JSON.
+        KeyError, IndexError: if the JSON lacks ``items[0]["title"]``.
+    """
+    # requests applies no timeout by default; without one a stalled
+    # connection would block the script forever.
+    # NOTE(review): Wikimedia asks API clients to send a descriptive
+    # User-Agent header -- consider adding one here.
+    response = requests.get(
+        'https://en.wikipedia.org/api/rest_v1/page/random/title',
+        timeout=10,
+    )
+    # Surface 4xx/5xx responses as HTTP errors instead of letting them show
+    # up later as a confusing JSON or KeyError failure.
+    response.raise_for_status()
+    return response.json()['items'][0]['title']
+
+# Look up a random English Wikipedia page and print its identifying details.
+# NOTE(review): newer wikipedia-api releases appear to expect a user agent as
+# the first constructor argument -- confirm against the installed version.
+wikipedia = wikipediaapi.Wikipedia('en')
+wiki_page = wikipedia.page(get_random_wikipedia_title())
+
+# Attributes are read one at a time, each printed before the next is read.
+for detail_name in ("title", "displaytitle", "canonicalurl"):
+    print(getattr(wiki_page, detail_name))
+
+# Split the page summary into sentences, then tokenize and POS-tag each one.
+summary = wiki_page.summary
+sentences = sent_tokenize(summary)
+tagged_sentences = [pos_tag(word_tokenize(text)) for text in sentences]
+
+# Flatten the per-sentence (token, tag) pairs into one list of records whose
+# ids number the tokens consecutively from 0 across the whole summary.
+flat_pairs = [pair for tagged in tagged_sentences for pair in tagged]
+output_tokens = [
+    {"id": position, "token": word, "tag": pos}
+    for position, (word, pos) in enumerate(flat_pairs)
+]
+
+print(json.dumps(output_tokens))