From 50de741ec4254cbe48b8bdddc9d2a0dec777ef3e Mon Sep 17 00:00:00 2001
From: gamerdonkey
Date: Sat, 20 Aug 2022 03:50:08 +0000
Subject: [PATCH] Deleting old batch file. This is now
 batch/process_wiki_article.py

---
 test.py | 108 --------------------------------------------------------
 1 file changed, 108 deletions(-)
 delete mode 100644 test.py

diff --git a/test.py b/test.py
deleted file mode 100644
index 8431dc6..0000000
--- a/test.py
+++ /dev/null
@@ -1,108 +0,0 @@
-import json
-import random
-import re
-import requests
-import time
-
-from nltk import pos_tag, sent_tokenize, word_tokenize
-from nltk.corpus import stopwords
-from wikipediaapi import Wikipedia, WikipediaPage
-
-# Info about the default pos_tag tags
-# https://www.ling.upenn.edu/courses/Fall_2003/ling001/penn_treebank_pos.html
-adlib_tags = {
-    "JJ": "Adjective",
-    "JJR": "Adjective ending in 'er'",
-    "JJS": "Adjective ending in 'est'",
-    "NN": "Noun",
-    "NNS": "Plural Noun",
-    "NNP": "Proper Noun",
-    "NNPS": "Plural Proper Noun",
-    "RB": "Adverb",
-    "RBR": "Adverb ending in 'er'",
-    "RBS": "Adverb ending in 'est'",
-    "VB": "Verb",
-    "VBD": "Past Tense Verb",
-    "VBG": "Verb ending in 'ing'",
-    "VBN": "Past Tense Verb",
-    "VBP": "Present Tense Verb",
-    "VBZ": "Present Tense Verb ending in 's'",
-}
-
-stop_words = set(stopwords.words("english"))
-months = {
-    "january",
-    "february",
-    "march",
-    "april",
-    "may",
-    "june",
-    "july",
-    "august",
-    "september",
-    "october",
-    "november",
-    "december",
-}
-stop_words.update(months)
-
-# More stop words: becomes, become, became, well
-
-def get_random_wikipedia_title() -> str:
-    random_result = json.loads(requests.get('https://en.wikipedia.org/api/rest_v1/page/random/title').text)
-
-    return random_result['items'][0]['title']
-
-def get_random_wikipedia_page(wikipedia: Wikipedia, min_length: int = None) -> WikipediaPage:
-    page = None
-    while(page is None):
-        page = wikipedia.page(get_random_wikipedia_title())
-        if(min_length and len(page.summary) < min_length):
-            print(f"{page.displaytitle} is too short. Retrying...")
-            page = None
-            time.sleep(3)
-
-    return page
-
-def sentence_tokenize_and_tag(text: str):
-    text = re.sub(r"(\w+)\.([A-Z])", r"\1. \2", text)  # Try to break up sentences mashed together by stripping strings
-    sentences = sent_tokenize(text)
-    tagged_sentences = []
-    for sentence in sentences:
-        tagged_sentences.append(pos_tag(word_tokenize(sentence)))
-
-    return tagged_sentences
-
-wikipedia = Wikipedia('en')
-wiki_page = get_random_wikipedia_page(wikipedia, 500)
-
-print(wiki_page.title)
-
-lowered_title = wiki_page.title.lower()
-i = adlib_word_counter = 0
-min_words = 4
-output_tokens = []
-for sentence in sentence_tokenize_and_tag(wiki_page.summary):
-    for token, tag in sentence:
-        if tag in ["NNP", "NNPS"] and i > 0 and output_tokens[-1]["tag"] in ["NNP", "NNPS"]:
-            output_tokens[-1]["token"] += f" {token}"
-
-        else:
-            output_token = {"id": i, "token": token, "tag": tag}
-            adlib_tag = adlib_tags.get(tag)
-            if adlib_tag is not None:
-                if random.randint(0, adlib_word_counter) > min_words \
-                        and token.lower() not in stop_words \
-                        and token.lower() not in lowered_title:
-                    output_token["adlib_tag"] = adlib_tag
-                    adlib_word_counter = 0
-                else:
-                    adlib_word_counter += 1
-            output_tokens.append(output_token)
-            i += 1
-
-article = {'title': wiki_page.displaytitle, 'url': wiki_page.canonicalurl}
-article['tokens'] = output_tokens
-
-with open("article.js", "w") as json_file:
-    json_file.write(f"article = {json.dumps(article)}")