Updating the test program to break up a Wikipedia page into tokens.
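
Note: sent_tokenize, word_tokenize, and pos_tag all rely on NLTK's downloadable data packages; a fresh environment will raise LookupError until they are fetched. A one-time setup along these lines is assumed (package names are the standard NLTK 3.x identifiers, not something this commit installs):

import nltk

# One-time model downloads assumed by the script below:
nltk.download('punkt')                       # sentence/word tokenizer models
nltk.download('averaged_perceptron_tagger')  # default pos_tag model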
parent ca57f9b604
commit 44a2b90ec7
test.py | 56
@@ -2,28 +2,54 @@ import json
 import requests
 import wikipediaapi
 
-from nltk import pos_tag, map_tag
-from nltk import word_tokenize
+from nltk import pos_tag
+from nltk import sent_tokenize, word_tokenize
 
+# Info about the default pos_tag tags
+# https://www.ling.upenn.edu/courses/Fall_2003/ling001/penn_treebank_pos.html
+adlib_tags = {
+    "JJ": "Adjective",
+    "JJR": "Adjective ending in 'er'",
+    "JJS": "Adjective ending in 'est'",
+    "NN": "Noun",
+    "NNS": "Plural Noun",
+    "NNP": "Proper Noun",
+    "NNPS": "Plural Proper Noun",
+    "RB": "Adverb",
+    "RBR": "Adverb ending in 'er'",
+    "RBS": "Adverb ending in 'est'",
+    "VB": "Verb",
+    "VBD": "Past Tense Verb",
+    "VBG": "Verb ending in 'ing'",
+    "VBN": "Past Tense Verb",
+    "VBP": "Present Tense Verb",
+    "VBZ": "Present Tense Verb ending in 's'",
+}
+
+
 def get_random_wikipedia_title():
     random_result = json.loads(requests.get('https://en.wikipedia.org/api/rest_v1/page/random/title').text)
     return random_result['items'][0]['title']
 
 data = "The quick brown fox jumps over the lazy dog."
 
 data_pos_tagged = pos_tag(word_tokenize(data))
 
 for tagged_word in data_pos_tagged:
     print(tagged_word)
 
 wikipedia = wikipediaapi.Wikipedia('en')
-random_page = wikipedia.page(get_random_wikipedia_title())
+wiki_page = wikipedia.page(get_random_wikipedia_title())
 
-print(random_page.title)
+print(wiki_page.title)
+print(wiki_page.displaytitle)
+print(wiki_page.canonicalurl)
 
-random_page_summary_tagged = pos_tag(word_tokenize(random_page.summary))
-simple_tags = [(word, map_tag('en-ptb', 'universal', tag)) for word, tag in random_page_summary_tagged]
+summary = wiki_page.summary
+sentences = sent_tokenize(summary)
+tagged_sentences = []
+for sentence in sentences:
+    tagged_sentences.append(pos_tag(word_tokenize(sentence)))
 
-print(random_page_summary_tagged)
-print(simple_tags)
 
+i = 0
+output_tokens = []
+for sentence in tagged_sentences:
+    for token, tag in sentence:
+        output_tokens.append({"id": i, "token": token, "tag": tag})
+        i += 1
+
+print(json.dumps(output_tokens))
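
The adlib_tags table is defined in this commit but not yet consumed. One plausible next step, sketched here purely as an assumption about where the ad-lib idea is heading (describe_token is a hypothetical helper, not part of this commit), is mapping each output token's Penn Treebank tag to its readable name:

def describe_token(entry):
    # Hypothetical helper: look up the human-readable tag name,
    # falling back to the raw Penn Treebank tag when unmapped.
    return adlib_tags.get(entry["tag"], entry["tag"])

for entry in output_tokens:
    print(entry["token"], "->", describe_token(entry))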
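
Had the fixed sample sentence gone through the same sentence-level pipeline, the JSON array emitted at the end would start along these lines (tags shown are the typical ones from NLTK's default English tagger; exact output depends on the model version):

[{"id": 0, "token": "The", "tag": "DT"}, {"id": 1, "token": "quick", "tag": "JJ"}, ...]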