From c7767fc3a424809be904aa834b090b2fe6751685 Mon Sep 17 00:00:00 2001
From: gamerdonkey
Date: Wed, 17 Aug 2022 05:01:43 +0000
Subject: [PATCH] Refactoring the parsing script and adding functionality to
 put multi-word proper nouns together, improving styling and metadata.

---
 test.py        | 61 +++++++++++++++++++++++++-------------------
 web/index.html | 68 +++++++++++++++++++++++++++++++++++++++++++++++---
 web/script.js  | 10 +++++---
 3 files changed, 108 insertions(+), 31 deletions(-)

diff --git a/test.py b/test.py
index 75c8b2f..8431dc6 100644
--- a/test.py
+++ b/test.py
@@ -46,8 +46,11 @@ months = {
 }
 stop_words.update(months)
 
+# More stop words: becomes, become, became, well
+
 def get_random_wikipedia_title() -> str:
     random_result = json.loads(requests.get('https://en.wikipedia.org/api/rest_v1/page/random/title').text)
+
     return random_result['items'][0]['title']
 
 def get_random_wikipedia_page(wikipedia: Wikipedia, min_length: int = None) -> WikipediaPage:
@@ -57,41 +60,49 @@ def get_random_wikipedia_page(wikipedia: Wikipedia, min_length: int = None) -> W
         if(min_length and len(page.summary) < min_length):
             print(f"{page.displaytitle} is too short. Retrying...")
             page = None
-        time.sleep(1)
+        time.sleep(3)
+
     return page
 
+def sentence_tokenize_and_tag(text: str):
+    text = re.sub(r"(\w+)\.([A-Z])", r"\1. \2", text) # Try to break up sentences mashed together by stripping strings
+    sentences = sent_tokenize(text)
+    tagged_sentences = []
+    for sentence in sentences:
+        tagged_sentences.append(pos_tag(word_tokenize(sentence)))
+
+    return tagged_sentences
+
 wikipedia = Wikipedia('en')
 
 wiki_page = get_random_wikipedia_page(wikipedia, 500)
 print(wiki_page.title)
-print(wiki_page.displaytitle)
-print(wiki_page.canonicalurl)
-
-summary = re.sub(r"(\w+)\.([A-Z])", r"\1. \2", wiki_page.summary) # Try to break up sentences mashed together by stripping strings
-title = wiki_page.title
-sentences = sent_tokenize(summary)
-tagged_sentences = []
-for sentence in sentences:
-    tagged_sentences.append(pos_tag(word_tokenize(sentence)))
-
+lowered_title = wiki_page.title.lower()
 i = adlib_word_counter = 0
 min_words = 4
 output_tokens = []
-for sentence in tagged_sentences:
+for sentence in sentence_tokenize_and_tag(wiki_page.summary):
     for token, tag in sentence:
-        output_tokens.append({"id": i, "token": token, "tag": tag})
-        adlib_tag = adlib_tags.get(tag)
-        if adlib_tag is not None:
-            if random.randint(0, adlib_word_counter) > min_words \
-                    and token.lower() not in stop_words \
-                    and token.lower() not in title.lower():
-                output_tokens[-1]["adlib_tag"] = adlib_tag
-                adlib_word_counter = 0
-            else:
-                adlib_word_counter += 1
-            print(f"{token}: {adlib_tag} {adlib_word_counter}")
-        i += 1
+        if tag in ["NNP", "NNPS"] and i > 0 and output_tokens[-1]["tag"] in ["NNP", "NNPS"]:
+            output_tokens[-1]["token"] += f" {token}"
+
+        else:
+            output_token = {"id": i, "token": token, "tag": tag}
+            adlib_tag = adlib_tags.get(tag)
+            if adlib_tag is not None:
+                if random.randint(0, adlib_word_counter) > min_words \
+                        and token.lower() not in stop_words \
+                        and token.lower() not in lowered_title:
+                    output_token["adlib_tag"] = adlib_tag
+                    adlib_word_counter = 0
+                else:
+                    adlib_word_counter += 1
+            output_tokens.append(output_token)
+            i += 1
+
+article = {'title': wiki_page.displaytitle, 'url': wiki_page.canonicalurl}
article['tokens'] = output_tokens
 
 with open("article.js", "w") as json_file:
-    json_file.write(f"article = {json.dumps(output_tokens)}")
+    json_file.write(f"article = {json.dumps(article)}")

diff --git a/web/index.html b/web/index.html
index a4348db..f9e8108 100644
--- a/web/index.html
+++ b/web/index.html
@@ -3,12 +3,74 @@
 [hunk body lost to tag stripping; the surviving text shows the page title
 "WAD-LIBS" and the heading "WAD-LIBS: Wikipedia Ad-Libbed", and the new
 markup adds the title, link, and summary containers and the "visible"
 class styling that web/script.js below relies on]
diff --git a/web/script.js b/web/script.js
index 7c9259a..2f3014c 100644
--- a/web/script.js
+++ b/web/script.js
@@ -16,7 +16,7 @@ function parseArticleJSON_old() {
 
 function createInputs(article) {
     inputs = ''
-    article.forEach(function(token) {
+    article.tokens.forEach(function(token) {
         if(token.adlib_tag) {
             inputs += `[input markup lost to tag stripping]`
@@ -30,8 +30,10 @@ function createInputs(article) {
 }
 
 function showArticle(article) {
+    document.getElementById('title').innerHTML = article.title
+    document.getElementById('link').innerHTML = `<a href="${article.url}">[ Original Article ]</a>`
     let output = ''
-    article.forEach(function(token) {
+    article.tokens.forEach(function(token) {
         let adlib_input = document.getElementById(`token_${token.id}`);
         if(adlib_input && adlib_input.value) {
             output += `${adlib_input.value} `
@@ -40,7 +42,9 @@ function showArticle(article) {
         } else {
             output += `${token.token} `
         }
     })
-    document.getElementById('article').innerHTML = output
+    document.getElementById('summary').innerHTML = output
+
+    document.getElementById('article').classList.add('visible')
 }
 
 createInputs(article)
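
Note: after this patch, the generated article.js carries a single object rather
than a bare token list, which is why both forEach calls in script.js move to
article.tokens. A sketch of the payload shape, with made-up field values (the
adlib_tag strings come from the adlib_tags mapping defined elsewhere in
test.py):

    article = {"title": "Ada Lovelace",
               "url": "https://en.wikipedia.org/wiki/Ada_Lovelace",
               "tokens": [{"id": 0, "token": "Ada Lovelace", "tag": "NNP"},
                          {"id": 1, "token": "wrote", "tag": "VBD", "adlib_tag": "verb"}]}
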