Refactoring the parsing script and adding functionality to put multi-word proper nouns together, improving styling and metadata.

main
gamerdonkey 2022-08-17 05:01:43 +00:00
parent 2d431f03fe
commit c7767fc3a4
3 changed files with 108 additions and 31 deletions

61
test.py
View File

@ -46,8 +46,11 @@ months = {
} }
stop_words.update(months) stop_words.update(months)
# More stop words: becomes, become, became, well
def get_random_wikipedia_title() -> str: def get_random_wikipedia_title() -> str:
random_result = json.loads(requests.get('https://en.wikipedia.org/api/rest_v1/page/random/title').text) random_result = json.loads(requests.get('https://en.wikipedia.org/api/rest_v1/page/random/title').text)
return random_result['items'][0]['title'] return random_result['items'][0]['title']
def get_random_wikipedia_page(wikipedia: Wikipedia, min_length: int = None) -> WikipediaPage: def get_random_wikipedia_page(wikipedia: Wikipedia, min_length: int = None) -> WikipediaPage:
@ -57,41 +60,49 @@ def get_random_wikipedia_page(wikipedia: Wikipedia, min_length: int = None) -> W
if(min_length and len(page.summary) < min_length): if(min_length and len(page.summary) < min_length):
print(f"{page.displaytitle} is too short. Retrying...") print(f"{page.displaytitle} is too short. Retrying...")
page = None page = None
time.sleep(1) time.sleep(3)
return page return page
def sentence_tokenize_and_tag(text: str):
    """Split *text* into sentences and part-of-speech tag each one.

    Returns a list with one entry per sentence; each entry is the result of
    ``pos_tag(word_tokenize(sentence))`` for that sentence.
    """
    # Insert a space after a period that sits directly between a word and a
    # capital letter, to try to break up sentences that were mashed together.
    spaced_text = re.sub(r"(\w+)\.([A-Z])", r"\1. \2", text)
    return [
        pos_tag(word_tokenize(sentence))
        for sentence in sent_tokenize(spaced_text)
    ]
wikipedia = Wikipedia('en') wikipedia = Wikipedia('en')
wiki_page = get_random_wikipedia_page(wikipedia, 500) wiki_page = get_random_wikipedia_page(wikipedia, 500)
print(wiki_page.title) print(wiki_page.title)
print(wiki_page.displaytitle)
print(wiki_page.canonicalurl)
summary = re.sub(r"(\w+)\.([A-Z])", r"\1. \2", wiki_page.summary) # Try to break up sentences mashed together by stripping strings
title = wiki_page.title
sentences = sent_tokenize(summary)
tagged_sentences = []
for sentence in sentences:
tagged_sentences.append(pos_tag(word_tokenize(sentence)))
lowered_title = wiki_page.title.lower()
i = adlib_word_counter = 0 i = adlib_word_counter = 0
min_words = 4 min_words = 4
output_tokens = [] output_tokens = []
for sentence in tagged_sentences: for sentence in sentence_tokenize_and_tag(wiki_page.summary):
for token, tag in sentence: for token, tag in sentence:
output_tokens.append({"id": i, "token": token, "tag": tag}) if tag in ["NNP", "NNPS"] and i > 0 and output_tokens[-1]["tag"] in ["NNP", "NNPS"]:
adlib_tag = adlib_tags.get(tag) output_tokens[-1]["token"] += f" {token}"
if adlib_tag is not None:
if random.randint(0, adlib_word_counter) > min_words \ else:
and token.lower() not in stop_words \ output_token = {"id": i, "token": token, "tag": tag}
and token.lower() not in title.lower(): adlib_tag = adlib_tags.get(tag)
output_tokens[-1]["adlib_tag"] = adlib_tag if adlib_tag is not None:
adlib_word_counter = 0 if random.randint(0, adlib_word_counter) > min_words \
else: and token.lower() not in stop_words \
adlib_word_counter += 1 and token.lower() not in lowered_title:
print(f"{token}: {adlib_tag} {adlib_word_counter}") output_token["adlib_tag"] = adlib_tag
i += 1 adlib_word_counter = 0
else:
adlib_word_counter += 1
output_tokens.append(output_token)
i += 1
article = {'title': wiki_page.displaytitle, 'url': wiki_page.canonicalurl}
article['tokens'] = output_tokens
with open("article.js", "w") as json_file: with open("article.js", "w") as json_file:
json_file.write(f"article = {json.dumps(output_tokens)}") json_file.write(f"article = {json.dumps(article)}")

View File

@ -3,12 +3,74 @@
<head> <head>
<meta charset="UTF-8" /> <meta charset="UTF-8" />
<title>WAD-LIBS</title> <title>WAD-LIBS</title>
<style>
/* Container for the ad-lib inputs: a column flex layout that wraps into
   further columns once it reaches its maximum height. */
#inputs {
display: flex;
flex-flow: column wrap;
max-height: 400px;
}
/* Each paragraph inside the inputs container is capped in width. */
#inputs p {
max-width: 30%;
margin: 0.5em;
}
/* Float the input fields to the right edge of their paragraph. */
#inputs p input {
float: right;
}
button {
font-weight: bold;
padding: 0.5em 1em;
margin: 0.5em 1em;
}
/* The article starts fully transparent; the opacity transition makes it
   fade in over one second when the `visible` class is added. */
#article {
font-family: sans-serif;
font-size: 11pt;
margin: 2em 0 2em 2em;
max-width: 1000px;
transition: opacity 1s ease-in;
/*display: none;*/
opacity: 0;
}
/* State applied to reveal the article. */
#article.visible {
display: block;
opacity: 1;
}
/* Article heading: larger serif type. */
#article #title {
font-family: 'Georgia', 'Times', serif;
font-size: 24pt;
margin-right: 0.3em;
}
/* Smaller type for the link element in the header. */
#article #link {
font-size: 10pt;
}
/* Replace the default rule styling with a single 1px top border. */
#article hr {
border: none;
border-top: 1px solid;
margin-bottom: 1em;
}
</style>
</head> </head>
<body> <body>
<h1>WAD-LIBS: Wikipedia Ad-Libbed</h1> <h1>WAD-LIBS: Wikipedia Ad-Libbed</h1>
<div id="inputs"></div> <div id="inputs"></div>
<p><button id="show-article">WAD-LIB!</button> <p><button id="show-article">WAD-LIB!</button>
<div id="article"></div> <div id="article">
<div id="header">
<span id="title"></span>
<span id="link"></span>
</div>
<hr>
<div id="summary"></div>
</div>
<script src="article.js"></script> <script src="article.js"></script>
<script src="script.js"></script> <script src="script.js"></script>
</body> </body>

View File

@ -16,7 +16,7 @@ function parseArticleJSON_old() {
function createInputs(article) { function createInputs(article) {
inputs = '' inputs = ''
article.forEach(function(token) { article.tokens.forEach(function(token) {
if(token.adlib_tag) { if(token.adlib_tag) {
inputs += ` inputs += `
<p> <p>
@ -30,8 +30,10 @@ function createInputs(article) {
} }
function showArticle(article) { function showArticle(article) {
document.getElementById('title').innerHTML = article.title
document.getElementById('link').innerHTML = `[ <a href='${article.url}'>Original Article</a> ]`
let output = '' let output = ''
article.forEach(function(token) { article.tokens.forEach(function(token) {
let adlib_input = document.getElementById(`token_${token.id}`); let adlib_input = document.getElementById(`token_${token.id}`);
if(adlib_input && adlib_input.value) { if(adlib_input && adlib_input.value) {
output += `<strong>${adlib_input.value}</strong> ` output += `<strong>${adlib_input.value}</strong> `
@ -40,7 +42,9 @@ function showArticle(article) {
output += `${token.token} ` output += `${token.token} `
} }
}) })
document.getElementById('article').innerHTML = output document.getElementById('summary').innerHTML = output
document.getElementById('article').classList.add('visible')
} }
createInputs(article) createInputs(article)