From 2d431f03fe2c1ff8bee9c230b2e1882862ba698c Mon Sep 17 00:00:00 2001 From: gamerdonkey Date: Tue, 9 Aug 2022 05:20:36 +0000 Subject: [PATCH] Improving stop word handling, also improving sentence parsing. --- test.py | 25 ++++++++++++++++++++++--- 1 file changed, 22 insertions(+), 3 deletions(-) diff --git a/test.py b/test.py index 030e7ef..75c8b2f 100644 --- a/test.py +++ b/test.py @@ -1,5 +1,6 @@ import json import random +import re import requests import time @@ -29,6 +30,21 @@ adlib_tags = { } stop_words = set(stopwords.words("english")) +months = { + "january", + "february", + "march", + "april", + "may", + "june", + "july", + "august", + "september", + "october", + "november", + "december", +} +stop_words.update(months) def get_random_wikipedia_title() -> str: random_result = json.loads(requests.get('https://en.wikipedia.org/api/rest_v1/page/random/title').text) @@ -51,7 +67,8 @@ print(wiki_page.title) print(wiki_page.displaytitle) print(wiki_page.canonicalurl) -summary = wiki_page.summary +summary = re.sub(r"(\w+)\.([A-Z])", r"\1. \2", wiki_page.summary) # Try to break up sentences mashed together by stripping strings +title = wiki_page.title sentences = sent_tokenize(summary) tagged_sentences = [] for sentence in sentences: @@ -66,12 +83,14 @@ for sentence in tagged_sentences: output_tokens.append({"id": i, "token": token, "tag": tag}) adlib_tag = adlib_tags.get(tag) if adlib_tag is not None: - if random.randint(0, adlib_word_counter) > min_words and token not in stop_words: + if random.randint(0, adlib_word_counter) > min_words \ + and token.lower() not in stop_words \ + and token.lower() not in title.lower(): output_tokens[-1]["adlib_tag"] = adlib_tag adlib_word_counter = 0 else: adlib_word_counter += 1 - + print(f"{token}: {adlib_tag} {adlib_word_counter}") i += 1 with open("article.js", "w") as json_file: