比较提交

...

4 次代码提交

作者 SHA1 备注 提交日期
79f25bf5f9 Merge branch 'initial-branch' into main 2022-06-18 03:43:23 +00:00
1db6e08b11 Renaming python venv dir. 2022-06-18 03:38:24 +00:00
gamerdonkey
44a2b90ec7 Updating the test program to break up a wikipedia page into tokens. 2022-06-17 22:16:18 -05:00
gamerdonkey
ca57f9b604 Initial commit. 2022-06-07 21:43:51 -05:00
共有 2 个文件被更改，包括 56 次插入和 0 次删除

1
.gitignore vendored 普通文件
查看文件

@ -0,0 +1 @@
wiki-mad-libs-env/

55
test.py 普通文件
查看文件

@ -0,0 +1,55 @@
import json
import requests
import wikipediaapi
from nltk import pos_tag
from nltk import sent_tokenize, word_tokenize
# Info about the default pos_tag tags
# https://www.ling.upenn.edu/courses/Fall_2003/ling001/penn_treebank_pos.html
# Maps a Penn Treebank part-of-speech tag to a human-readable description of
# the kind of word it marks. Only adjective, noun, adverb and verb tags are
# listed.
adlib_tags = {
    "JJ": "Adjective",
    "JJR": "Adjective ending in 'er'",
    "JJS": "Adjective ending in 'est'",
    "NN": "Noun",
    "NNS": "Plural Noun",
    "NNP": "Proper Noun",
    "NNPS": "Plural Proper Noun",
    "RB": "Adverb",
    "RBR": "Adverb ending in 'er'",
    "RBS": "Adverb ending in 'est'",
    "VB": "Verb",
    "VBD": "Past Tense Verb",
    "VBG": "Verb ending in 'ing'",
    # VBN is the past participle ("eaten"), distinct from the simple past
    # ("ate", VBD); the two must not share a description.
    "VBN": "Past Participle Verb",
    "VBP": "Present Tense Verb",
    "VBZ": "Present Tense Verb ending in 's'",
}
def get_random_wikipedia_title(timeout=10.0):
    """Return the title of a random English Wikipedia page.

    Requests the ``page/random/title`` endpoint of the English Wikipedia
    REST API and reads the title of the first item in the JSON response.

    Args:
        timeout: Seconds to wait on the HTTP request before giving up.
            Without a timeout the call could block indefinitely.

    Returns:
        The value of ``items[0]["title"]`` in the decoded response.

    Raises:
        requests.RequestException: If the request fails, times out, or the
            server responds with an HTTP error status.
    """
    response = requests.get(
        'https://en.wikipedia.org/api/rest_v1/page/random/title',
        timeout=timeout,
    )
    # Surface 4xx/5xx responses directly rather than as a confusing
    # KeyError/JSON error while parsing an error body.
    response.raise_for_status()
    random_result = json.loads(response.text)
    return random_result['items'][0]['title']
# Pick a random English Wikipedia page and print its identifying fields.
wikipedia = wikipediaapi.Wikipedia('en')
wiki_page = wikipedia.page(get_random_wikipedia_title())
print(wiki_page.title)
print(wiki_page.displaytitle)
print(wiki_page.canonicalurl)

# Split the page summary into sentences, then tokenize each sentence and tag
# every token with its part of speech.
summary = wiki_page.summary
sentences = sent_tokenize(summary)
tagged_sentences = [pos_tag(word_tokenize(text)) for text in sentences]

# Flatten the per-sentence (token, tag) pairs into one list, numbering the
# tokens consecutively from 0 across the whole summary.
output_tokens = [
    {"id": position, "token": word, "tag": pos}
    for position, (word, pos) in enumerate(
        pair for tagged in tagged_sentences for pair in tagged
    )
]
print(json.dumps(output_tokens))