Improving stop word handling, also improving sentence parsing.

main
gamerdonkey 2022-08-09 05:20:36 +00:00
parent 54d18d7eeb
commit 2d431f03fe
1 changed files with 22 additions and 3 deletions

25
test.py
View File

@ -1,5 +1,6 @@
import json
import random
import re
import requests
import time
@ -29,6 +30,21 @@ adlib_tags = {
}
# Words that must never be turned into ad-lib blanks. Starts from the English
# stop-word list supplied by `stopwords` (presumably NLTK's corpus reader; its
# import is not visible here -- confirm).
stop_words = set(stopwords.words("english"))
# Month names are excluded from ad-lib replacement as well. They are written in
# lowercase because the membership test elsewhere in this script compares
# against `token.lower()`.
months = {
"january",
"february",
"march",
"april",
"may",
"june",
"july",
"august",
"september",
"october",
"november",
"december",
}
# Merge the month names into the stop-word set in place.
stop_words.update(months)
def get_random_wikipedia_title() -> str:
random_result = json.loads(requests.get('https://en.wikipedia.org/api/rest_v1/page/random/title').text)
@ -51,7 +67,8 @@ print(wiki_page.title)
# Show the page's display title and canonical URL on stdout.
print(wiki_page.displaytitle)
print(wiki_page.canonicalurl)
# NOTE(review): this plain assignment is overwritten by the very next
# statement, which re-reads wiki_page.summary itself.
summary = wiki_page.summary
# Insert a space after any period that sits directly between word characters
# and an uppercase letter ("end.Next" -> "end. Next") so that sent_tokenize can
# separate sentences that arrived glued together.
# NOTE(review): the same pattern also matches dotted abbreviations such as
# "U.S." (-> "U. S.") -- confirm that is acceptable.
summary = re.sub(r"(\w+)\.([A-Z])", r"\1. \2", wiki_page.summary) # Try to break up sentences mashed together by stripping strings
# Kept so that tokens appearing in the title can be skipped when choosing
# ad-lib words later in the script.
title = wiki_page.title
# Split the cleaned summary into individual sentences.
sentences = sent_tokenize(summary)
# Presumably populated by the loop that follows (its body is not fully
# visible here).
tagged_sentences = []
for sentence in sentences:
@ -66,12 +83,14 @@ for sentence in tagged_sentences:
output_tokens.append({"id": i, "token": token, "tag": tag})
adlib_tag = adlib_tags.get(tag)
if adlib_tag is not None:
if random.randint(0, adlib_word_counter) > min_words and token not in stop_words:
if random.randint(0, adlib_word_counter) > min_words \
and token.lower() not in stop_words \
and token.lower() not in title.lower():
output_tokens[-1]["adlib_tag"] = adlib_tag
adlib_word_counter = 0
else:
adlib_word_counter += 1
print(f"{token}: {adlib_tag} {adlib_word_counter}")
i += 1
with open("article.js", "w") as json_file: