Improve stop-word handling and sentence parsing.
parent
54d18d7eeb
commit
2d431f03fe
25
test.py
25
test.py
|
@ -1,5 +1,6 @@
|
||||||
import json
|
import json
|
||||||
import random
|
import random
|
||||||
|
import re
|
||||||
import requests
|
import requests
|
||||||
import time
|
import time
|
||||||
|
|
||||||
|
@ -29,6 +30,21 @@ adlib_tags = {
|
||||||
}
|
}
|
||||||
|
|
||||||
stop_words = set(stopwords.words("english"))
|
stop_words = set(stopwords.words("english"))
|
||||||
|
months = {
|
||||||
|
"january",
|
||||||
|
"february",
|
||||||
|
"march",
|
||||||
|
"april",
|
||||||
|
"may",
|
||||||
|
"june",
|
||||||
|
"july",
|
||||||
|
"august",
|
||||||
|
"september",
|
||||||
|
"october",
|
||||||
|
"november",
|
||||||
|
"december",
|
||||||
|
}
|
||||||
|
stop_words.update(months)
|
||||||
|
|
||||||
def get_random_wikipedia_title() -> str:
|
def get_random_wikipedia_title() -> str:
|
||||||
random_result = json.loads(requests.get('https://en.wikipedia.org/api/rest_v1/page/random/title').text)
|
random_result = json.loads(requests.get('https://en.wikipedia.org/api/rest_v1/page/random/title').text)
|
||||||
|
@ -51,7 +67,8 @@ print(wiki_page.title)
|
||||||
print(wiki_page.displaytitle)
|
print(wiki_page.displaytitle)
|
||||||
print(wiki_page.canonicalurl)
|
print(wiki_page.canonicalurl)
|
||||||
|
|
||||||
summary = wiki_page.summary
|
summary = re.sub(r"(\w+)\.([A-Z])", r"\1. \2", wiki_page.summary) # Try to break up sentences mashed together by stripping strings
|
||||||
|
title = wiki_page.title
|
||||||
sentences = sent_tokenize(summary)
|
sentences = sent_tokenize(summary)
|
||||||
tagged_sentences = []
|
tagged_sentences = []
|
||||||
for sentence in sentences:
|
for sentence in sentences:
|
||||||
|
@ -66,12 +83,14 @@ for sentence in tagged_sentences:
|
||||||
output_tokens.append({"id": i, "token": token, "tag": tag})
|
output_tokens.append({"id": i, "token": token, "tag": tag})
|
||||||
adlib_tag = adlib_tags.get(tag)
|
adlib_tag = adlib_tags.get(tag)
|
||||||
if adlib_tag is not None:
|
if adlib_tag is not None:
|
||||||
if random.randint(0, adlib_word_counter) > min_words and token not in stop_words:
|
if random.randint(0, adlib_word_counter) > min_words \
|
||||||
|
and token.lower() not in stop_words \
|
||||||
|
and token.lower() not in title.lower():
|
||||||
output_tokens[-1]["adlib_tag"] = adlib_tag
|
output_tokens[-1]["adlib_tag"] = adlib_tag
|
||||||
adlib_word_counter = 0
|
adlib_word_counter = 0
|
||||||
else:
|
else:
|
||||||
adlib_word_counter += 1
|
adlib_word_counter += 1
|
||||||
|
print(f"{token}: {adlib_tag} {adlib_word_counter}")
|
||||||
i += 1
|
i += 1
|
||||||
|
|
||||||
with open("article.js", "w") as json_file:
|
with open("article.js", "w") as json_file:
|
||||||
|
|
Loading…
Reference in New Issue