Deleting old batch file. This is now batch/process_wiki_article.py

2022-08-20 03:50:08 +00:00 · 2022-08-20 03:50:08 +00:00 · 50de741ec4
commit 50de741ec4
parent 2c846fdd46
1 changed file with 0 additions and 108 deletions
--- a/test.py
+++ b/test.py
@@ -1,108 +0,0 @@
-import json
-import random
-import re
-import requests
-import time
-
-from nltk import pos_tag, sent_tokenize, word_tokenize
-from nltk.corpus import stopwords
-from wikipediaapi import Wikipedia, WikipediaPage
-
-# Info about the default pos_tag tags
-# https://www.ling.upenn.edu/courses/Fall_2003/ling001/penn_treebank_pos.html
-adlib_tags = {
-    "JJ": "Adjective",
-    "JJR": "Adjective ending in 'er'",
-    "JJS": "Adjective ending in 'est'",
-    "NN": "Noun",
-    "NNS": "Plural Noun",
-    "NNP": "Proper Noun",
-    "NNPS": "Plural Proper Noun",
-    "RB": "Adverb",
-    "RBR": "Adverb ending in 'er'",
-    "RBS": "Adverb ending in 'est'",
-    "VB": "Verb",
-    "VBD": "Past Tense Verb",
-    "VBG": "Verb ending in 'ing'",
-    "VBN": "Past Tense Verb",
-    "VBP": "Present Tense Verb",
-    "VBZ": "Present Tense Verb ending in 's'",
-}
-
-stop_words = set(stopwords.words("english"))
-months = {
-    "january",
-    "february",
-    "march",
-    "april",
-    "may",
-    "june",
-    "july",
-    "august",
-    "september",
-    "october",
-    "november",
-    "december",
-}
-stop_words.update(months)
-
-# More stop words: becomes, become, became, well
-
-def get_random_wikipedia_title() -> str:
-    random_result = json.loads(requests.get('https://en.wikipedia.org/api/rest_v1/page/random/title').text)
-
-    return random_result['items'][0]['title']
-
-def get_random_wikipedia_page(wikipedia: Wikipedia, min_length: int = None) -> WikipediaPage:
-    page = None
-    while(page is None):
-        page = wikipedia.page(get_random_wikipedia_title())
-        if(min_length and len(page.summary) < min_length):
-            print(f"{page.displaytitle} is too short. Retrying...")
-            page = None
-            time.sleep(3)
-
-    return page
-
-def sentence_tokenize_and_tag(text: str):
-    text = re.sub(r"(\w+)\.([A-Z])", r"\1. \2", text)  # Try to break up sentences mashed together by stripping strings
-    sentences = sent_tokenize(text)
-    tagged_sentences = []
-    for sentence in sentences:
-        tagged_sentences.append(pos_tag(word_tokenize(sentence)))
-
-    return tagged_sentences
-
-wikipedia = Wikipedia('en')
-wiki_page = get_random_wikipedia_page(wikipedia, 500)
-
-print(wiki_page.title)
-
-lowered_title = wiki_page.title.lower()
-i = adlib_word_counter = 0
-min_words = 4
-output_tokens = []
-for sentence in sentence_tokenize_and_tag(wiki_page.summary):
-    for token, tag in sentence:
-        if tag in ["NNP", "NNPS"] and i > 0 and output_tokens[-1]["tag"] in ["NNP", "NNPS"]:
-            output_tokens[-1]["token"] += f" {token}"
-
-        else:
-            output_token = {"id": i, "token": token, "tag": tag}
-            adlib_tag = adlib_tags.get(tag)
-            if adlib_tag is not None:
-                if random.randint(0, adlib_word_counter) > min_words \
-                        and token.lower() not in stop_words \
-                        and token.lower() not in lowered_title:
-                    output_token["adlib_tag"] = adlib_tag
-                    adlib_word_counter = 0
-                else:
-                    adlib_word_counter += 1
-            output_tokens.append(output_token)
-            i += 1
-
-article = {'title': wiki_page.displaytitle, 'url': wiki_page.canonicalurl}
-article['tokens'] = output_tokens
-
-with open("article.js", "w") as json_file:
-    json_file.write(f"article = {json.dumps(article)}")