Refactoring the parsing script and adding functionality to put multi-word proper nouns together, improving styling and metadata.
parent
2d431f03fe
commit
c7767fc3a4
45
test.py
45
test.py
|
@ -46,8 +46,11 @@ months = {
|
||||||
}
|
}
|
||||||
stop_words.update(months)
|
stop_words.update(months)
|
||||||
|
|
||||||
|
# More stop words: becomes, become, became, well
|
||||||
|
|
||||||
def get_random_wikipedia_title() -> str:
|
def get_random_wikipedia_title() -> str:
|
||||||
random_result = json.loads(requests.get('https://en.wikipedia.org/api/rest_v1/page/random/title').text)
|
random_result = json.loads(requests.get('https://en.wikipedia.org/api/rest_v1/page/random/title').text)
|
||||||
|
|
||||||
return random_result['items'][0]['title']
|
return random_result['items'][0]['title']
|
||||||
|
|
||||||
def get_random_wikipedia_page(wikipedia: Wikipedia, min_length: int = None) -> WikipediaPage:
|
def get_random_wikipedia_page(wikipedia: Wikipedia, min_length: int = None) -> WikipediaPage:
|
||||||
|
@ -57,41 +60,49 @@ def get_random_wikipedia_page(wikipedia: Wikipedia, min_length: int = None) -> W
|
||||||
if(min_length and len(page.summary) < min_length):
|
if(min_length and len(page.summary) < min_length):
|
||||||
print(f"{page.displaytitle} is too short. Retrying...")
|
print(f"{page.displaytitle} is too short. Retrying...")
|
||||||
page = None
|
page = None
|
||||||
time.sleep(1)
|
time.sleep(3)
|
||||||
|
|
||||||
return page
|
return page
|
||||||
|
|
||||||
|
def sentence_tokenize_and_tag(text: str):
|
||||||
|
text = re.sub(r"(\w+)\.([A-Z])", r"\1. \2", text) # Try to break up sentences mashed together by stripping strings
|
||||||
|
sentences = sent_tokenize(text)
|
||||||
|
tagged_sentences = []
|
||||||
|
for sentence in sentences:
|
||||||
|
tagged_sentences.append(pos_tag(word_tokenize(sentence)))
|
||||||
|
|
||||||
|
return tagged_sentences
|
||||||
|
|
||||||
wikipedia = Wikipedia('en')
|
wikipedia = Wikipedia('en')
|
||||||
wiki_page = get_random_wikipedia_page(wikipedia, 500)
|
wiki_page = get_random_wikipedia_page(wikipedia, 500)
|
||||||
|
|
||||||
print(wiki_page.title)
|
print(wiki_page.title)
|
||||||
print(wiki_page.displaytitle)
|
|
||||||
print(wiki_page.canonicalurl)
|
|
||||||
|
|
||||||
summary = re.sub(r"(\w+)\.([A-Z])", r"\1. \2", wiki_page.summary) # Try to break up sentences mashed together by stripping strings
|
|
||||||
title = wiki_page.title
|
|
||||||
sentences = sent_tokenize(summary)
|
|
||||||
tagged_sentences = []
|
|
||||||
for sentence in sentences:
|
|
||||||
tagged_sentences.append(pos_tag(word_tokenize(sentence)))
|
|
||||||
|
|
||||||
|
|
||||||
|
lowered_title = wiki_page.title.lower()
|
||||||
i = adlib_word_counter = 0
|
i = adlib_word_counter = 0
|
||||||
min_words = 4
|
min_words = 4
|
||||||
output_tokens = []
|
output_tokens = []
|
||||||
for sentence in tagged_sentences:
|
for sentence in sentence_tokenize_and_tag(wiki_page.summary):
|
||||||
for token, tag in sentence:
|
for token, tag in sentence:
|
||||||
output_tokens.append({"id": i, "token": token, "tag": tag})
|
if tag in ["NNP", "NNPS"] and i > 0 and output_tokens[-1]["tag"] in ["NNP", "NNPS"]:
|
||||||
|
output_tokens[-1]["token"] += f" {token}"
|
||||||
|
|
||||||
|
else:
|
||||||
|
output_token = {"id": i, "token": token, "tag": tag}
|
||||||
adlib_tag = adlib_tags.get(tag)
|
adlib_tag = adlib_tags.get(tag)
|
||||||
if adlib_tag is not None:
|
if adlib_tag is not None:
|
||||||
if random.randint(0, adlib_word_counter) > min_words \
|
if random.randint(0, adlib_word_counter) > min_words \
|
||||||
and token.lower() not in stop_words \
|
and token.lower() not in stop_words \
|
||||||
and token.lower() not in title.lower():
|
and token.lower() not in lowered_title:
|
||||||
output_tokens[-1]["adlib_tag"] = adlib_tag
|
output_token["adlib_tag"] = adlib_tag
|
||||||
adlib_word_counter = 0
|
adlib_word_counter = 0
|
||||||
else:
|
else:
|
||||||
adlib_word_counter += 1
|
adlib_word_counter += 1
|
||||||
print(f"{token}: {adlib_tag} {adlib_word_counter}")
|
output_tokens.append(output_token)
|
||||||
i += 1
|
i += 1
|
||||||
|
|
||||||
|
article = {'title': wiki_page.displaytitle, 'url': wiki_page.canonicalurl}
|
||||||
|
article['tokens'] = output_tokens
|
||||||
|
|
||||||
with open("article.js", "w") as json_file:
|
with open("article.js", "w") as json_file:
|
||||||
json_file.write(f"article = {json.dumps(output_tokens)}")
|
json_file.write(f"article = {json.dumps(article)}")
|
||||||
|
|
|
@ -3,12 +3,74 @@
|
||||||
<head>
|
<head>
|
||||||
<meta charset="UTF-8" />
|
<meta charset="UTF-8" />
|
||||||
<title>WAD-LIBS</title>
|
<title>WAD-LIBS</title>
|
||||||
|
<style>
|
||||||
|
|
||||||
|
#inputs {
|
||||||
|
display: flex;
|
||||||
|
flex-flow: column wrap;
|
||||||
|
max-height: 400px;
|
||||||
|
}
|
||||||
|
|
||||||
|
#inputs p {
|
||||||
|
max-width: 30%;
|
||||||
|
margin: 0.5em;
|
||||||
|
}
|
||||||
|
|
||||||
|
#inputs p input {
|
||||||
|
float: right;
|
||||||
|
}
|
||||||
|
|
||||||
|
button {
|
||||||
|
font-weight: bold;
|
||||||
|
padding: 0.5em 1em;
|
||||||
|
margin: 0.5em 1em;
|
||||||
|
}
|
||||||
|
|
||||||
|
#article {
|
||||||
|
font-family: sans-serif;
|
||||||
|
font-size: 11pt;
|
||||||
|
margin: 2em 0 2em 2em;
|
||||||
|
max-width: 1000px;
|
||||||
|
transition: opacity 1s ease-in;
|
||||||
|
/*display: none;*/
|
||||||
|
opacity: 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
#article.visible {
|
||||||
|
display: block;
|
||||||
|
opacity: 1;
|
||||||
|
}
|
||||||
|
|
||||||
|
#article #title {
|
||||||
|
font-family: 'Georgia', 'Times', serif;
|
||||||
|
font-size: 24pt;
|
||||||
|
margin-right: 0.3em;
|
||||||
|
}
|
||||||
|
|
||||||
|
#article #link {
|
||||||
|
font-size: 10pt;
|
||||||
|
}
|
||||||
|
|
||||||
|
#article hr {
|
||||||
|
border: none;
|
||||||
|
border-top: 1px solid;
|
||||||
|
margin-bottom: 1em;
|
||||||
|
}
|
||||||
|
|
||||||
|
</style>
|
||||||
</head>
|
</head>
|
||||||
<body>
|
<body>
|
||||||
<h1>WAD-LIBS: Wikipedia Ad-Libbed</h1>
|
<h1>WAD-LIBS: Wikipedia Ad-Libbed</h1>
|
||||||
<div id="inputs"></div>
|
<div id="inputs"></div>
|
||||||
<p><button id="show-article">WAD-LIB!</button>
|
<p><button id="show-article">WAD-LIB!</button>
|
||||||
<div id="article"></div>
|
<div id="article">
|
||||||
|
<div id="header">
|
||||||
|
<span id="title"></span>
|
||||||
|
<span id="link"></span>
|
||||||
|
</div>
|
||||||
|
<hr>
|
||||||
|
<div id="summary"></div>
|
||||||
|
</div>
|
||||||
<script src="article.js"></script>
|
<script src="article.js"></script>
|
||||||
<script src="script.js"></script>
|
<script src="script.js"></script>
|
||||||
</body>
|
</body>
|
||||||
|
|
|
@ -16,7 +16,7 @@ function parseArticleJSON_old() {
|
||||||
|
|
||||||
function createInputs(article) {
|
function createInputs(article) {
|
||||||
inputs = ''
|
inputs = ''
|
||||||
article.forEach(function(token) {
|
article.tokens.forEach(function(token) {
|
||||||
if(token.adlib_tag) {
|
if(token.adlib_tag) {
|
||||||
inputs += `
|
inputs += `
|
||||||
<p>
|
<p>
|
||||||
|
@ -30,8 +30,10 @@ function createInputs(article) {
|
||||||
}
|
}
|
||||||
|
|
||||||
function showArticle(article) {
|
function showArticle(article) {
|
||||||
|
document.getElementById('title').innerHTML = article.title
|
||||||
|
document.getElementById('link').innerHTML = `[ <a href='${article.url}'>Original Article</a> ]`
|
||||||
let output = ''
|
let output = ''
|
||||||
article.forEach(function(token) {
|
article.tokens.forEach(function(token) {
|
||||||
let adlib_input = document.getElementById(`token_${token.id}`);
|
let adlib_input = document.getElementById(`token_${token.id}`);
|
||||||
if(adlib_input && adlib_input.value) {
|
if(adlib_input && adlib_input.value) {
|
||||||
output += `<strong>${adlib_input.value}</strong> `
|
output += `<strong>${adlib_input.value}</strong> `
|
||||||
|
@ -40,7 +42,9 @@ function showArticle(article) {
|
||||||
output += `${token.token} `
|
output += `${token.token} `
|
||||||
}
|
}
|
||||||
})
|
})
|
||||||
document.getElementById('article').innerHTML = output
|
document.getElementById('summary').innerHTML = output
|
||||||
|
|
||||||
|
document.getElementById('article').classList.add('visible')
|
||||||
}
|
}
|
||||||
|
|
||||||
createInputs(article)
|
createInputs(article)
|
||||||
|
|
Loading…
Reference in New Issue