import hashlib
import json
import random
import re
import time
from typing import Optional

import requests
from nltk import pos_tag, sent_tokenize, word_tokenize
from nltk.corpus import stopwords
from wikipediaapi import Wikipedia, WikipediaPage

# Info about the default pos_tag tags:
# https://www.ling.upenn.edu/courses/Fall_2003/ling001/penn_treebank_pos.html
adlib_tags = {
    "JJ": "Adjective",
    "JJR": "Adjective ending in 'er'",
    "JJS": "Adjective ending in 'est'",
    "NN": "Noun",
    "NNS": "Plural Noun",
    "NNP": "Proper Noun",
    "NNPS": "Plural Proper Noun",
    "RB": "Adverb",
    "RBR": "Adverb ending in 'er'",
    "RBS": "Adverb ending in 'est'",
    "VB": "Verb",
    "VBD": "Past Tense Verb",
    "VBG": "Verb ending in 'ing'",
    "VBN": "Past Tense Verb",
    "VBP": "Present Tense Verb",
    "VBZ": "Present Tense Verb ending in 's'",
}

stop_words = set(stopwords.words("english"))
months = {
    "january", "february", "march", "april", "may", "june",
    "july", "august", "september", "october", "november", "december",
}
stop_words.update(months)
# More stop words to consider: becomes, become, became, well


def get_random_wikipedia_title() -> str:
    """Fetch the title of a random article from the Wikipedia REST API."""
    random_result = json.loads(
        requests.get("https://en.wikipedia.org/api/rest_v1/page/random/title").text
    )
    return random_result["items"][0]["title"]


def get_random_wikipedia_page(wikipedia: Wikipedia, min_length: Optional[int] = None) -> WikipediaPage:
    """Fetch random pages until one has a summary of at least min_length characters."""
    page = None
    while page is None:
        page = wikipedia.page(get_random_wikipedia_title())
        if min_length and len(page.summary) < min_length:
            print(f"{page.displaytitle} is too short. Retrying...")
            page = None
            time.sleep(3)
    return page


def sentence_tokenize_and_tag(text: str):
    """Split text into sentences and part-of-speech tag each token."""
    # Try to break up sentences mashed together by stripped whitespace,
    # e.g. "end.Start" -> "end. Start"
    text = re.sub(r"(\w+)\.([A-Z])", r"\1. \2", text)
    sentences = sent_tokenize(text)
    tagged_sentences = []
    for sentence in sentences:
        tagged_sentences.append(pos_tag(word_tokenize(sentence)))
    return tagged_sentences


def adlibify(wiki_page, min_adlib_rest):
    """Turn a Wikipedia page summary into a list of ad-lib tokens.

    Eligible words are randomly marked with an adlib_tag so a player can
    replace them; min_adlib_rest keeps marked words from clustering together.
    """
    lowered_title = wiki_page.title.lower()
    i = adlib_word_counter = 0
    output_tokens = []
    for sentence in sentence_tokenize_and_tag(wiki_page.summary):
        for token, tag in sentence:
            # Merge runs of proper nouns (e.g. "New" + "York") into a single token.
            if tag in ["NNP", "NNPS"] and i > 0 and output_tokens[-1]["tag"] in ["NNP", "NNPS"]:
                output_tokens[-1]["token"] += f" {token}"
            else:
                output_token = {"id": i, "token": token, "tag": tag}
                adlib_tag = adlib_tags.get(tag)
                if adlib_tag is not None:
                    # The more words since the last blank, the more likely this one
                    # is marked; skip stop words and words from the article title.
                    if random.randint(0, adlib_word_counter) > min_adlib_rest \
                            and token.lower() not in stop_words \
                            and token.lower() not in lowered_title:
                        output_token["adlib_tag"] = adlib_tag
                        adlib_word_counter = 0
                    else:
                        adlib_word_counter += 1
                output_tokens.append(output_token)
                i += 1

    url = wiki_page.canonicalurl
    url_hash = hashlib.sha1(url.encode("utf-8")).hexdigest()[:8]
    article = {"title": wiki_page.displaytitle, "url": url, "hash": url_hash}
    article["tokens"] = output_tokens
    return article


def main():
    wikipedia = Wikipedia("en")
    wiki_page = get_random_wikipedia_page(wikipedia, 500)
    print(wiki_page.title)
    article = adlibify(wiki_page, 4)
    with open("article.js", "w") as json_file:
        json_file.write(f"article = {json.dumps(article)}")


if __name__ == "__main__":
    main()