Updating the test program to break up a Wikipedia page into tokens.
Commit 44a2b90ec7

test.py (56 changes)
--- a/test.py
+++ b/test.py
@@ -2,28 +2,54 @@ import json
 import requests
 import wikipediaapi
 
-from nltk import pos_tag, map_tag
-from nltk import word_tokenize
+from nltk import pos_tag
+from nltk import sent_tokenize, word_tokenize
+
+# Info about the default pos_tag tags
+# https://www.ling.upenn.edu/courses/Fall_2003/ling001/penn_treebank_pos.html
+adlib_tags = {
+    "JJ": "Adjective",
+    "JJR": "Adjective ending in 'er'",
+    "JJS": "Adjective ending in 'est'",
+    "NN": "Noun",
+    "NNS": "Plural Noun",
+    "NNP": "Proper Noun",
+    "NNPS": "Plural Proper Noun",
+    "RB": "Adverb",
+    "RBR": "Adverb ending in 'er'",
+    "RBS": "Adverb ending in 'est'",
+    "VB": "Verb",
+    "VBD": "Past Tense Verb",
+    "VBG": "Verb ending in 'ing'",
+    "VBN": "Past Tense Verb",
+    "VBP": "Present Tense Verb",
+    "VBZ": "Present Tense Verb ending in 's'",
+}
 
 
 def get_random_wikipedia_title():
     random_result = json.loads(requests.get('https://en.wikipedia.org/api/rest_v1/page/random/title').text)
     return random_result['items'][0]['title']
 
-data = "The quick brown fox jumps over the lazy dog."
-
-data_pos_tagged = pos_tag(word_tokenize(data))
-
-for tagged_word in data_pos_tagged:
-    print(tagged_word)
-
 wikipedia = wikipediaapi.Wikipedia('en')
-random_page = wikipedia.page(get_random_wikipedia_title())
+wiki_page = wikipedia.page(get_random_wikipedia_title())
 
-print(random_page.title)
+print(wiki_page.title)
+print(wiki_page.displaytitle)
+print(wiki_page.canonicalurl)
 
-random_page_summary_tagged = pos_tag(word_tokenize(random_page.summary))
-simple_tags = [(word, map_tag('en-ptb', 'universal', tag)) for word, tag in random_page_summary_tagged]
+summary = wiki_page.summary
+sentences = sent_tokenize(summary)
+tagged_sentences = []
+for sentence in sentences:
+    tagged_sentences.append(pos_tag(word_tokenize(sentence)))
 
-print(random_page_summary_tagged)
-print(simple_tags)
+
+i = 0
+output_tokens = []
+for sentence in tagged_sentences:
+    for token, tag in sentence:
+        output_tokens.append({"id": i, "token": token, "tag": tag})
+        i += 1
+
+print(json.dumps(output_tokens))
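For reference, a minimal standalone sketch of the tokenization pipeline this commit introduces, run against a hard-coded sample string instead of a live Wikipedia summary. It assumes the NLTK 'punkt' and 'averaged_perceptron_tagger' data packages are already downloaded (e.g. via nltk.download()); the sample text is illustrative only.

import json

from nltk import pos_tag, sent_tokenize, word_tokenize

# Stand-in for wiki_page.summary; any English prose works here.
text = "The quick brown fox jumps over the lazy dog. It was not amused."

# Sentence-split first, then POS-tag each sentence's word tokens,
# mirroring the loop structure added to test.py in this commit.
i = 0
output_tokens = []
for sentence in sent_tokenize(text):
    for token, tag in pos_tag(word_tokenize(sentence)):
        output_tokens.append({"id": i, "token": token, "tag": tag})
        i += 1

print(json.dumps(output_tokens))
# Expected shape of the first entries (exact tags depend on the tagger model):
# [{"id": 0, "token": "The", "tag": "DT"}, {"id": 1, "token": "quick", "tag": "JJ"}, ...]

Keeping the sentence split before word tokenization means the tagger sees one sentence at a time, which is how the Penn Treebank tagger is meant to be applied; the flat "id" counter then gives every token a stable position across the whole summary.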