From 116be618a060cad6236ad8b1dd65665de2ee3560 Mon Sep 17 00:00:00 2001 From: endorphant Date: Fri, 3 Jun 2016 04:16:12 -0400 Subject: [PATCH] doing some repo cleanup --- README.html | 8 + README.md | 5 + bin/chatter.py | 59 - bin/inflect.py | 3130 ------------------------------------------------ bin/mistune.py | 1154 ------------------ lib/lang.json | 39 - 6 files changed, 13 insertions(+), 4382 deletions(-) delete mode 100644 bin/chatter.py delete mode 100644 bin/inflect.py delete mode 100644 bin/mistune.py delete mode 100644 lib/lang.json diff --git a/README.html b/README.html index c6c5aa3..04b5fc6 100644 --- a/README.html +++ b/README.html @@ -89,6 +89,14 @@ your local timezone yet. here are some +

dependencies

+ + + +

future features

these are a few ideas being kicked around, or under active development:

diff --git a/README.md b/README.md index 89ae742..ddf93e5 100644 --- a/README.md +++ b/README.md @@ -72,6 +72,11 @@ this.** your local timezone yet. here are some [timezone setting instructions](http://www.cyberciti.biz/faq/linux-unix-set-tz-environment-variable/) +### dependencies + +* [mistune](https://pypi.python.org/pypi/mistune) +* [inflect](https://pypi.python.org/pypi/inflect) + ### future features these are a few ideas being kicked around, or under active development: diff --git a/bin/chatter.py b/bin/chatter.py deleted file mode 100644 index 7e1a034..0000000 --- a/bin/chatter.py +++ /dev/null @@ -1,59 +0,0 @@ -#!/usr/bin/python - -''' -ttbp: tilde town blogging platform -(also known as the feels engine) -a console-based blogging program developed for tilde.town -copyright (c) 2016 ~endorphant (endorphant@tilde.town) - -chatter.py: -some text processing utilities - -GNU GPL BOILERPLATE: -This program is free software: you can redistribute it and/or modify -it under the terms of the GNU General Public License as published by -the Free Software Foundation, either version 3 of the License, or -(at your option) any later version. - -This program is distributed in the hope that it will be useful, -but WITHOUT ANY WARRANTY; without even the implied warranty of -MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -GNU General Public License for more details. - -You should have received a copy of the GNU General Public License -along with this program. If not, see . - -the complete codebase is available at: -https://github.com/modgethanc/ttbp -''' - -import os - -import random -import json -import os - -SOURCE = os.path.join("/home", "endorphant", "projects", "ttbp") -langfile = open(os.path.join(SOURCE, "lib", "lang.json"), 'r') -LANG = json.load(langfile) -langfile.close() - -def say(keyword): - ''' - takes a keyword and randomly returns from language dictionary to match that keyword - - returns None if keyword doesn't exist - - TODO: validate keyword? - ''' - - return random.choice(LANG.get(keyword)) - -def month(num): - ''' - takes a MM and returns lovercase full name of that month - - TODO: validate num? - ''' - - return LANG["months"].get(num) diff --git a/bin/inflect.py b/bin/inflect.py deleted file mode 100644 index 64382a2..0000000 --- a/bin/inflect.py +++ /dev/null @@ -1,3130 +0,0 @@ -''' - inflect.py: correctly generate plurals, ordinals, indefinite articles; - convert numbers to words - Copyright (C) 2010 Paul Dyson - - Based upon the Perl module Lingua::EN::Inflect by Damian Conway. - - This program is free software: you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation, either version 3 of the License, or - (at your option) any later version. - - This program is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program. If not, see . - - The original Perl module Lingua::EN::Inflect by Damian Conway is - available from http://search.cpan.org/~dconway/ - - This module can be downloaded at http://pypi.python.org/pypi/inflect - -methods: - classical inflect - plural plural_noun plural_verb plural_adj singular_noun no num a an - compare compare_nouns compare_verbs compare_adjs - present_participle - ordinal - number_to_words - join - defnoun defverb defadj defa defan - - INFLECTIONS: classical inflect - plural plural_noun plural_verb plural_adj singular_noun compare - no num a an present_participle - - PLURALS: classical inflect - plural plural_noun plural_verb plural_adj singular_noun no num - compare compare_nouns compare_verbs compare_adjs - - COMPARISONS: classical - compare compare_nouns compare_verbs compare_adjs - - ARTICLES: classical inflect num a an - - NUMERICAL: ordinal number_to_words - - USER_DEFINED: defnoun defverb defadj defa defan - -Exceptions: - UnknownClassicalModeError - BadNumValueError - BadChunkingOptionError - NumOutOfRangeError - BadUserDefinedPatternError - BadRcFileError - BadGenderError - -''' - -from re import match, search, subn, IGNORECASE, VERBOSE -from re import split as splitre -from re import error as reerror -from re import sub as resub - - -class UnknownClassicalModeError(Exception): - pass - - -class BadNumValueError(Exception): - pass - - -class BadChunkingOptionError(Exception): - pass - - -class NumOutOfRangeError(Exception): - pass - - -class BadUserDefinedPatternError(Exception): - pass - - -class BadRcFileError(Exception): - pass - - -class BadGenderError(Exception): - pass - -__ver_major__ = 0 -__ver_minor__ = 2 -__ver_patch__ = 4 -__ver_sub__ = "" -__version__ = "%d.%d.%d%s" % (__ver_major__, __ver_minor__, - __ver_patch__, __ver_sub__) - - -STDOUT_ON = False - - -def print3(txt): - if STDOUT_ON: - print(txt) - - -def enclose(s): - return "(?:%s)" % s - - -def joinstem(cutpoint=0, words=''): - ''' - join stem of each word in words into a string for regex - each word is truncated at cutpoint - cutpoint is usually negative indicating the number of letters to remove - from the end of each word - - e.g. - joinstem(-2, ["ephemeris", "iris", ".*itis"]) returns - (?:ephemer|ir|.*it) - - ''' - return enclose('|'.join(w[:cutpoint] for w in words)) - - -def bysize(words): - ''' - take a list of words and return a dict of sets sorted by word length - e.g. - ret[3]=set(['ant', 'cat', 'dog', 'pig']) - ret[4]=set(['frog', 'goat']) - ret[5]=set(['horse']) - ret[8]=set(['elephant']) - ''' - ret = {} - for w in words: - if len(w) not in ret: - ret[len(w)] = set() - ret[len(w)].add(w) - return ret - - -def make_pl_si_lists(lst, plending, siendingsize, dojoinstem=True): - ''' - given a list of singular words: lst - an ending to append to make the plural: plending - the number of characters to remove from the singular before appending plending: siendingsize - a flag whether to create a joinstem: dojoinstem - - return: - a list of pluralised words: si_list (called si because this is what you need to - look for to make the singular) - the pluralised words as a dict of sets sorted by word length: si_bysize - the singular words as a dict of sets sorted by word length: pl_bysize - if dojoinstem is True: a regular expression that matches any of the stems: stem - ''' - if siendingsize is not None: - siendingsize = -siendingsize - si_list = [w[:siendingsize] + plending for w in lst] - pl_bysize = bysize(lst) - si_bysize = bysize(si_list) - if dojoinstem: - stem = joinstem(siendingsize, lst) - return si_list, si_bysize, pl_bysize, stem - else: - return si_list, si_bysize, pl_bysize - - -# 1. PLURALS - -pl_sb_irregular_s = { - "corpus": "corpuses|corpora", - "opus": "opuses|opera", - "genus": "genera", - "mythos": "mythoi", - "penis": "penises|penes", - "testis": "testes", - "atlas": "atlases|atlantes", - "yes": "yeses", -} - -pl_sb_irregular = { - "child": "children", - "brother": "brothers|brethren", - "loaf": "loaves", - "hoof": "hoofs|hooves", - "beef": "beefs|beeves", - "thief": "thiefs|thieves", - "money": "monies", - "mongoose": "mongooses", - "ox": "oxen", - "cow": "cows|kine", - "graffito": "graffiti", - "octopus": "octopuses|octopodes", - "genie": "genies|genii", - "ganglion": "ganglions|ganglia", - "trilby": "trilbys", - "turf": "turfs|turves", - "numen": "numina", - "atman": "atmas", - "occiput": "occiputs|occipita", - "sabretooth": "sabretooths", - "sabertooth": "sabertooths", - "lowlife": "lowlifes", - "flatfoot": "flatfoots", - "tenderfoot": "tenderfoots", - "romany": "romanies", - "jerry": "jerries", - "mary": "maries", - "talouse": "talouses", - "blouse": "blouses", - "rom": "roma", - "carmen": "carmina", -} - -pl_sb_irregular.update(pl_sb_irregular_s) -# pl_sb_irregular_keys = enclose('|'.join(pl_sb_irregular.keys())) - -pl_sb_irregular_caps = { - 'Romany': 'Romanies', - 'Jerry': 'Jerrys', - 'Mary': 'Marys', - 'Rom': 'Roma', -} - -pl_sb_irregular_compound = { - "prima donna": "prima donnas|prime donne", -} - -si_sb_irregular = dict([(v, k) for (k, v) in pl_sb_irregular.items()]) -keys = list(si_sb_irregular.keys()) -for k in keys: - if '|' in k: - k1, k2 = k.split('|') - si_sb_irregular[k1] = si_sb_irregular[k2] = si_sb_irregular[k] - del si_sb_irregular[k] -si_sb_irregular_caps = dict([(v, k) for (k, v) in pl_sb_irregular_caps.items()]) -si_sb_irregular_compound = dict([(v, k) for (k, v) in pl_sb_irregular_compound.items()]) -keys = list(si_sb_irregular_compound.keys()) -for k in keys: - if '|' in k: - k1, k2 = k.split('|') - si_sb_irregular_compound[k1] = si_sb_irregular_compound[k2] = si_sb_irregular_compound[k] - del si_sb_irregular_compound[k] - -# si_sb_irregular_keys = enclose('|'.join(si_sb_irregular.keys())) - -# Z's that don't double - -pl_sb_z_zes_list = ( - "quartz", "topaz", -) -pl_sb_z_zes_bysize = bysize(pl_sb_z_zes_list) - -pl_sb_ze_zes_list = ('snooze',) -pl_sb_ze_zes_bysize = bysize(pl_sb_ze_zes_list) - - -# CLASSICAL "..is" -> "..ides" - -pl_sb_C_is_ides_complete = [ - # GENERAL WORDS... - "ephemeris", "iris", "clitoris", - "chrysalis", "epididymis", -] - -pl_sb_C_is_ides_endings = [ - # INFLAMATIONS... - "itis", -] - -pl_sb_C_is_ides = joinstem(-2, pl_sb_C_is_ides_complete + ['.*%s' % w for w in pl_sb_C_is_ides_endings]) - -pl_sb_C_is_ides_list = pl_sb_C_is_ides_complete + pl_sb_C_is_ides_endings - -(si_sb_C_is_ides_list, si_sb_C_is_ides_bysize, - pl_sb_C_is_ides_bysize) = make_pl_si_lists(pl_sb_C_is_ides_list, 'ides', 2, dojoinstem=False) - - -# CLASSICAL "..a" -> "..ata" - -pl_sb_C_a_ata_list = ( - "anathema", "bema", "carcinoma", "charisma", "diploma", - "dogma", "drama", "edema", "enema", "enigma", "lemma", - "lymphoma", "magma", "melisma", "miasma", "oedema", - "sarcoma", "schema", "soma", "stigma", "stoma", "trauma", - "gumma", "pragma", -) - -(si_sb_C_a_ata_list, si_sb_C_a_ata_bysize, - pl_sb_C_a_ata_bysize, pl_sb_C_a_ata) = make_pl_si_lists(pl_sb_C_a_ata_list, 'ata', 1) - -# UNCONDITIONAL "..a" -> "..ae" - -pl_sb_U_a_ae_list = ( - "alumna", "alga", "vertebra", "persona" -) -(si_sb_U_a_ae_list, si_sb_U_a_ae_bysize, - pl_sb_U_a_ae_bysize, pl_sb_U_a_ae) = make_pl_si_lists(pl_sb_U_a_ae_list, 'e', None) - -# CLASSICAL "..a" -> "..ae" - -pl_sb_C_a_ae_list = ( - "amoeba", "antenna", "formula", "hyperbola", - "medusa", "nebula", "parabola", "abscissa", - "hydra", "nova", "lacuna", "aurora", "umbra", - "flora", "fauna", -) -(si_sb_C_a_ae_list, si_sb_C_a_ae_bysize, - pl_sb_C_a_ae_bysize, pl_sb_C_a_ae) = make_pl_si_lists(pl_sb_C_a_ae_list, 'e', None) - - -# CLASSICAL "..en" -> "..ina" - -pl_sb_C_en_ina_list = ( - "stamen", "foramen", "lumen", -) - -(si_sb_C_en_ina_list, si_sb_C_en_ina_bysize, - pl_sb_C_en_ina_bysize, pl_sb_C_en_ina) = make_pl_si_lists(pl_sb_C_en_ina_list, 'ina', 2) - - -# UNCONDITIONAL "..um" -> "..a" - -pl_sb_U_um_a_list = ( - "bacterium", "agendum", "desideratum", "erratum", - "stratum", "datum", "ovum", "extremum", - "candelabrum", -) -(si_sb_U_um_a_list, si_sb_U_um_a_bysize, - pl_sb_U_um_a_bysize, pl_sb_U_um_a) = make_pl_si_lists(pl_sb_U_um_a_list, 'a', 2) - -# CLASSICAL "..um" -> "..a" - -pl_sb_C_um_a_list = ( - "maximum", "minimum", "momentum", "optimum", - "quantum", "cranium", "curriculum", "dictum", - "phylum", "aquarium", "compendium", "emporium", - "enconium", "gymnasium", "honorarium", "interregnum", - "lustrum", "memorandum", "millennium", "rostrum", - "spectrum", "speculum", "stadium", "trapezium", - "ultimatum", "medium", "vacuum", "velum", - "consortium", "arboretum", -) - -(si_sb_C_um_a_list, si_sb_C_um_a_bysize, - pl_sb_C_um_a_bysize, pl_sb_C_um_a) = make_pl_si_lists(pl_sb_C_um_a_list, 'a', 2) - - -# UNCONDITIONAL "..us" -> "i" - -pl_sb_U_us_i_list = ( - "alumnus", "alveolus", "bacillus", "bronchus", - "locus", "nucleus", "stimulus", "meniscus", - "sarcophagus", -) -(si_sb_U_us_i_list, si_sb_U_us_i_bysize, - pl_sb_U_us_i_bysize, pl_sb_U_us_i) = make_pl_si_lists(pl_sb_U_us_i_list, 'i', 2) - -# CLASSICAL "..us" -> "..i" - -pl_sb_C_us_i_list = ( - "focus", "radius", "genius", - "incubus", "succubus", "nimbus", - "fungus", "nucleolus", "stylus", - "torus", "umbilicus", "uterus", - "hippopotamus", "cactus", -) - -(si_sb_C_us_i_list, si_sb_C_us_i_bysize, - pl_sb_C_us_i_bysize, pl_sb_C_us_i) = make_pl_si_lists(pl_sb_C_us_i_list, 'i', 2) - - -# CLASSICAL "..us" -> "..us" (ASSIMILATED 4TH DECLENSION LATIN NOUNS) - -pl_sb_C_us_us = ( - "status", "apparatus", "prospectus", "sinus", - "hiatus", "impetus", "plexus", -) -pl_sb_C_us_us_bysize = bysize(pl_sb_C_us_us) - -# UNCONDITIONAL "..on" -> "a" - -pl_sb_U_on_a_list = ( - "criterion", "perihelion", "aphelion", - "phenomenon", "prolegomenon", "noumenon", - "organon", "asyndeton", "hyperbaton", -) -(si_sb_U_on_a_list, si_sb_U_on_a_bysize, - pl_sb_U_on_a_bysize, pl_sb_U_on_a) = make_pl_si_lists(pl_sb_U_on_a_list, 'a', 2) - -# CLASSICAL "..on" -> "..a" - -pl_sb_C_on_a_list = ( - "oxymoron", -) - -(si_sb_C_on_a_list, si_sb_C_on_a_bysize, - pl_sb_C_on_a_bysize, pl_sb_C_on_a) = make_pl_si_lists(pl_sb_C_on_a_list, 'a', 2) - - -# CLASSICAL "..o" -> "..i" (BUT NORMALLY -> "..os") - -pl_sb_C_o_i = [ - "solo", "soprano", "basso", "alto", - "contralto", "tempo", "piano", "virtuoso", -] # list not tuple so can concat for pl_sb_U_o_os - -pl_sb_C_o_i_bysize = bysize(pl_sb_C_o_i) -si_sb_C_o_i_bysize = bysize(['%si' % w[:-1] for w in pl_sb_C_o_i]) - -pl_sb_C_o_i_stems = joinstem(-1, pl_sb_C_o_i) - -# ALWAYS "..o" -> "..os" - -pl_sb_U_o_os_complete = set(( - "ado", "ISO", "NATO", "NCO", "NGO", "oto", -)) -si_sb_U_o_os_complete = set('%ss' % w for w in pl_sb_U_o_os_complete) - - -pl_sb_U_o_os_endings = [ - "aficionado", "aggro", - "albino", "allegro", "ammo", - "Antananarivo", "archipelago", "armadillo", - "auto", "avocado", "Bamako", - "Barquisimeto", "bimbo", "bingo", - "Biro", "bolero", "Bolzano", - "bongo", "Boto", "burro", - "Cairo", "canto", "cappuccino", - "casino", "cello", "Chicago", - "Chimango", "cilantro", "cochito", - "coco", "Colombo", "Colorado", - "commando", "concertino", "contango", - "credo", "crescendo", "cyano", - "demo", "ditto", "Draco", - "dynamo", "embryo", "Esperanto", - "espresso", "euro", "falsetto", - "Faro", "fiasco", "Filipino", - "flamenco", "furioso", "generalissimo", - "Gestapo", "ghetto", "gigolo", - "gizmo", "Greensboro", "gringo", - "Guaiabero", "guano", "gumbo", - "gyro", "hairdo", "hippo", - "Idaho", "impetigo", "inferno", - "info", "intermezzo", "intertrigo", - "Iquico", "jumbo", - "junto", "Kakapo", "kilo", - "Kinkimavo", "Kokako", "Kosovo", - "Lesotho", "libero", "libido", - "libretto", "lido", "Lilo", - "limbo", "limo", "lineno", - "lingo", "lino", "livedo", - "loco", "logo", "lumbago", - "macho", "macro", "mafioso", - "magneto", "magnifico", "Majuro", - "Malabo", "manifesto", "Maputo", - "Maracaibo", "medico", "memo", - "metro", "Mexico", "micro", - "Milano", "Monaco", "mono", - "Montenegro", "Morocco", "Muqdisho", - "myo", - "neutrino", "Ningbo", - "octavo", "oregano", "Orinoco", - "Orlando", "Oslo", - "panto", "Paramaribo", "Pardusco", - "pedalo", "photo", "pimento", - "pinto", "pleco", "Pluto", - "pogo", "polo", "poncho", - "Porto-Novo", "Porto", "pro", - "psycho", "pueblo", "quarto", - "Quito", "rhino", "risotto", - "rococo", "rondo", "Sacramento", - "saddo", "sago", "salvo", - "Santiago", "Sapporo", "Sarajevo", - "scherzando", "scherzo", "silo", - "sirocco", "sombrero", "staccato", - "sterno", "stucco", "stylo", - "sumo", "Taiko", "techno", - "terrazzo", "testudo", "timpano", - "tiro", "tobacco", "Togo", - "Tokyo", "torero", "Torino", - "Toronto", "torso", "tremolo", - "typo", "tyro", "ufo", - "UNESCO", "vaquero", "vermicello", - "verso", "vibrato", "violoncello", - "Virgo", "weirdo", "WHO", - "WTO", "Yamoussoukro", "yo-yo", - "zero", "Zibo", -] + pl_sb_C_o_i - -pl_sb_U_o_os_bysize = bysize(pl_sb_U_o_os_endings) -si_sb_U_o_os_bysize = bysize(['%ss' % w for w in pl_sb_U_o_os_endings]) - - -# UNCONDITIONAL "..ch" -> "..chs" - -pl_sb_U_ch_chs_list = ( - "czech", "eunuch", "stomach" -) - -(si_sb_U_ch_chs_list, si_sb_U_ch_chs_bysize, - pl_sb_U_ch_chs_bysize, pl_sb_U_ch_chs) = make_pl_si_lists(pl_sb_U_ch_chs_list, 's', None) - - -# UNCONDITIONAL "..[ei]x" -> "..ices" - -pl_sb_U_ex_ices_list = ( - "codex", "murex", "silex", -) -(si_sb_U_ex_ices_list, si_sb_U_ex_ices_bysize, - pl_sb_U_ex_ices_bysize, pl_sb_U_ex_ices) = make_pl_si_lists(pl_sb_U_ex_ices_list, 'ices', 2) - -pl_sb_U_ix_ices_list = ( - "radix", "helix", -) -(si_sb_U_ix_ices_list, si_sb_U_ix_ices_bysize, - pl_sb_U_ix_ices_bysize, pl_sb_U_ix_ices) = make_pl_si_lists(pl_sb_U_ix_ices_list, 'ices', 2) - -# CLASSICAL "..[ei]x" -> "..ices" - -pl_sb_C_ex_ices_list = ( - "vortex", "vertex", "cortex", "latex", - "pontifex", "apex", "index", "simplex", -) - -(si_sb_C_ex_ices_list, si_sb_C_ex_ices_bysize, - pl_sb_C_ex_ices_bysize, pl_sb_C_ex_ices) = make_pl_si_lists(pl_sb_C_ex_ices_list, 'ices', 2) - - -pl_sb_C_ix_ices_list = ( - "appendix", -) - -(si_sb_C_ix_ices_list, si_sb_C_ix_ices_bysize, - pl_sb_C_ix_ices_bysize, pl_sb_C_ix_ices) = make_pl_si_lists(pl_sb_C_ix_ices_list, 'ices', 2) - - -# ARABIC: ".." -> "..i" - -pl_sb_C_i_list = ( - "afrit", "afreet", "efreet", -) - -(si_sb_C_i_list, si_sb_C_i_bysize, - pl_sb_C_i_bysize, pl_sb_C_i) = make_pl_si_lists(pl_sb_C_i_list, 'i', None) - - -# HEBREW: ".." -> "..im" - -pl_sb_C_im_list = ( - "goy", "seraph", "cherub", -) - -(si_sb_C_im_list, si_sb_C_im_bysize, - pl_sb_C_im_bysize, pl_sb_C_im) = make_pl_si_lists(pl_sb_C_im_list, 'im', None) - - -# UNCONDITIONAL "..man" -> "..mans" - -pl_sb_U_man_mans_list = """ - ataman caiman cayman ceriman - desman dolman farman harman hetman - human leman ottoman shaman talisman -""".split() -pl_sb_U_man_mans_caps_list = """ - Alabaman Bahaman Burman German - Hiroshiman Liman Nakayaman Norman Oklahoman - Panaman Roman Selman Sonaman Tacoman Yakiman - Yokohaman Yuman -""".split() - -(si_sb_U_man_mans_list, si_sb_U_man_mans_bysize, - pl_sb_U_man_mans_bysize) = make_pl_si_lists(pl_sb_U_man_mans_list, 's', None, dojoinstem=False) -(si_sb_U_man_mans_caps_list, si_sb_U_man_mans_caps_bysize, - pl_sb_U_man_mans_caps_bysize) = make_pl_si_lists(pl_sb_U_man_mans_caps_list, 's', None, dojoinstem=False) - - -pl_sb_uninflected_s_complete = [ - # PAIRS OR GROUPS SUBSUMED TO A SINGULAR... - "breeches", "britches", "pajamas", "pyjamas", "clippers", "gallows", - "hijinks", "headquarters", "pliers", "scissors", "testes", "herpes", - "pincers", "shears", "proceedings", "trousers", - - # UNASSIMILATED LATIN 4th DECLENSION - - "cantus", "coitus", "nexus", - - # RECENT IMPORTS... - "contretemps", "corps", "debris", - "siemens", - - # DISEASES - "mumps", - - # MISCELLANEOUS OTHERS... - "diabetes", "jackanapes", "series", "species", "subspecies", "rabies", - "chassis", "innings", "news", "mews", "haggis", -] - -pl_sb_uninflected_s_endings = [ - # RECENT IMPORTS... - "ois", - - # DISEASES - "measles", -] - -pl_sb_uninflected_s = pl_sb_uninflected_s_complete + ['.*%s' % w for w in pl_sb_uninflected_s_endings] - -pl_sb_uninflected_herd = ( - # DON'T INFLECT IN CLASSICAL MODE, OTHERWISE NORMAL INFLECTION - "wildebeest", "swine", "eland", "bison", "buffalo", - "elk", "rhinoceros", 'zucchini', - 'caribou', 'dace', 'grouse', 'guinea fowl', 'guinea-fowl', - 'haddock', 'hake', 'halibut', 'herring', 'mackerel', - 'pickerel', 'pike', 'roe', 'seed', 'shad', - 'snipe', 'teal', 'turbot', 'water fowl', 'water-fowl', -) - -pl_sb_uninflected_complete = [ - # SOME FISH AND HERD ANIMALS - "tuna", "salmon", "mackerel", "trout", - "bream", "sea-bass", "sea bass", "carp", "cod", "flounder", "whiting", - "moose", - - # OTHER ODDITIES - "graffiti", "djinn", 'samuri', - 'offspring', 'pence', 'quid', 'hertz', -] + pl_sb_uninflected_s_complete -# SOME WORDS ENDING IN ...s (OFTEN PAIRS TAKEN AS A WHOLE) - -pl_sb_uninflected_caps = [ - # ALL NATIONALS ENDING IN -ese - "Portuguese", "Amoyese", "Borghese", "Congoese", "Faroese", - "Foochowese", "Genevese", "Genoese", "Gilbertese", "Hottentotese", - "Kiplingese", "Kongoese", "Lucchese", "Maltese", "Nankingese", - "Niasese", "Pekingese", "Piedmontese", "Pistoiese", "Sarawakese", - "Shavese", "Vermontese", "Wenchowese", "Yengeese", -] - - -pl_sb_uninflected_endings = [ - # SOME FISH AND HERD ANIMALS - "fish", - - "deer", "sheep", - - # ALL NATIONALS ENDING IN -ese - "nese", "rese", "lese", "mese", - - # DISEASES - "pox", - - - # OTHER ODDITIES - 'craft', -] + pl_sb_uninflected_s_endings -# SOME WORDS ENDING IN ...s (OFTEN PAIRS TAKEN AS A WHOLE) - - -pl_sb_uninflected_bysize = bysize(pl_sb_uninflected_endings) - - -# SINGULAR WORDS ENDING IN ...s (ALL INFLECT WITH ...es) - -pl_sb_singular_s_complete = [ - "acropolis", "aegis", "alias", "asbestos", "bathos", "bias", - "bronchitis", "bursitis", "caddis", "cannabis", - "canvas", "chaos", "cosmos", "dais", "digitalis", - "epidermis", "ethos", "eyas", "gas", "glottis", - "hubris", "ibis", "lens", "mantis", "marquis", "metropolis", - "pathos", "pelvis", "polis", "rhinoceros", - "sassafras", "trellis", -] + pl_sb_C_is_ides_complete - - -pl_sb_singular_s_endings = [ - "ss", "us", -] + pl_sb_C_is_ides_endings - -pl_sb_singular_s_bysize = bysize(pl_sb_singular_s_endings) - -si_sb_singular_s_complete = ['%ses' % w for w in pl_sb_singular_s_complete] -si_sb_singular_s_endings = ['%ses' % w for w in pl_sb_singular_s_endings] -si_sb_singular_s_bysize = bysize(si_sb_singular_s_endings) - -pl_sb_singular_s_es = [ - "[A-Z].*es", -] - -pl_sb_singular_s = enclose('|'.join(pl_sb_singular_s_complete + - ['.*%s' % w for w in pl_sb_singular_s_endings] + - pl_sb_singular_s_es)) - - -# PLURALS ENDING IN uses -> use - - -si_sb_ois_oi_case = ( - 'Bolshois', 'Hanois' -) - -si_sb_uses_use_case = ( - 'Betelgeuses', 'Duses', 'Meuses', 'Syracuses', 'Toulouses', -) - -si_sb_uses_use = ( - 'abuses', 'applauses', 'blouses', - 'carouses', 'causes', 'chartreuses', 'clauses', - 'contuses', 'douses', 'excuses', 'fuses', - 'grouses', 'hypotenuses', 'masseuses', - 'menopauses', 'misuses', 'muses', 'overuses', 'pauses', - 'peruses', 'profuses', 'recluses', 'reuses', - 'ruses', 'souses', 'spouses', 'suffuses', 'transfuses', 'uses', -) - -si_sb_ies_ie_case = ( - 'Addies', 'Aggies', 'Allies', 'Amies', 'Angies', 'Annies', - 'Annmaries', 'Archies', 'Arties', 'Aussies', 'Barbies', - 'Barries', 'Basies', 'Bennies', 'Bernies', 'Berties', 'Bessies', - 'Betties', 'Billies', 'Blondies', 'Bobbies', 'Bonnies', - 'Bowies', 'Brandies', 'Bries', 'Brownies', 'Callies', - 'Carnegies', 'Carries', 'Cassies', 'Charlies', 'Cheries', - 'Christies', 'Connies', 'Curies', 'Dannies', 'Debbies', 'Dixies', - 'Dollies', 'Donnies', 'Drambuies', 'Eddies', 'Effies', 'Ellies', - 'Elsies', 'Eries', 'Ernies', 'Essies', 'Eugenies', 'Fannies', - 'Flossies', 'Frankies', 'Freddies', 'Gillespies', 'Goldies', - 'Gracies', 'Guthries', 'Hallies', 'Hatties', 'Hetties', - 'Hollies', 'Jackies', 'Jamies', 'Janies', 'Jannies', 'Jeanies', - 'Jeannies', 'Jennies', 'Jessies', 'Jimmies', 'Jodies', 'Johnies', - 'Johnnies', 'Josies', 'Julies', 'Kalgoorlies', 'Kathies', 'Katies', - 'Kellies', 'Kewpies', 'Kristies', 'Laramies', 'Lassies', 'Lauries', - 'Leslies', 'Lessies', 'Lillies', 'Lizzies', 'Lonnies', 'Lories', - 'Lorries', 'Lotties', 'Louies', 'Mackenzies', 'Maggies', 'Maisies', - 'Mamies', 'Marcies', 'Margies', 'Maries', 'Marjories', 'Matties', - 'McKenzies', 'Melanies', 'Mickies', 'Millies', 'Minnies', 'Mollies', - 'Mounties', 'Nannies', 'Natalies', 'Nellies', 'Netties', 'Ollies', - 'Ozzies', 'Pearlies', 'Pottawatomies', 'Reggies', 'Richies', 'Rickies', - 'Robbies', 'Ronnies', 'Rosalies', 'Rosemaries', 'Rosies', 'Roxies', - 'Rushdies', 'Ruthies', 'Sadies', 'Sallies', 'Sammies', 'Scotties', - 'Selassies', 'Sherries', 'Sophies', 'Stacies', 'Stefanies', 'Stephanies', - 'Stevies', 'Susies', 'Sylvies', 'Tammies', 'Terries', 'Tessies', - 'Tommies', 'Tracies', 'Trekkies', 'Valaries', 'Valeries', 'Valkyries', - 'Vickies', 'Virgies', 'Willies', 'Winnies', 'Wylies', 'Yorkies', -) - -si_sb_ies_ie = ( - 'aeries', 'baggies', 'belies', 'biggies', 'birdies', 'bogies', - 'bonnies', 'boogies', 'bookies', 'bourgeoisies', 'brownies', - 'budgies', 'caddies', 'calories', 'camaraderies', 'cockamamies', - 'collies', 'cookies', 'coolies', 'cooties', 'coteries', 'crappies', - 'curies', 'cutesies', 'dogies', 'eyrie', 'floozies', 'footsies', - 'freebies', 'genies', 'goalies', 'groupies', - 'hies', 'jalousies', 'junkies', - 'kiddies', 'laddies', 'lassies', 'lies', - 'lingeries', 'magpies', 'menageries', 'mommies', 'movies', 'neckties', - 'newbies', 'nighties', 'oldies', 'organdies', 'overlies', - 'pies', 'pinkies', 'pixies', 'potpies', 'prairies', - 'quickies', 'reveries', 'rookies', 'rotisseries', 'softies', 'sorties', - 'species', 'stymies', 'sweeties', 'ties', 'underlies', 'unties', - 'veggies', 'vies', 'yuppies', 'zombies', -) - - -si_sb_oes_oe_case = ( - 'Chloes', 'Crusoes', 'Defoes', 'Faeroes', 'Ivanhoes', 'Joes', - 'McEnroes', 'Moes', 'Monroes', 'Noes', 'Poes', 'Roscoes', - 'Tahoes', 'Tippecanoes', 'Zoes', -) - -si_sb_oes_oe = ( - 'aloes', 'backhoes', 'canoes', - 'does', 'floes', 'foes', 'hoes', 'mistletoes', - 'oboes', 'pekoes', 'roes', 'sloes', - 'throes', 'tiptoes', 'toes', 'woes', -) - -si_sb_z_zes = ( - "quartzes", "topazes", -) - -si_sb_zzes_zz = ( - 'buzzes', 'fizzes', 'frizzes', 'razzes' -) - -si_sb_ches_che_case = ( - 'Andromaches', 'Apaches', 'Blanches', 'Comanches', - 'Nietzsches', 'Porsches', 'Roches', -) - -si_sb_ches_che = ( - 'aches', 'avalanches', 'backaches', 'bellyaches', 'caches', - 'cloches', 'creches', 'douches', 'earaches', 'fiches', - 'headaches', 'heartaches', 'microfiches', - 'niches', 'pastiches', 'psyches', 'quiches', - 'stomachaches', 'toothaches', -) - -si_sb_xes_xe = ( - 'annexes', 'axes', 'deluxes', 'pickaxes', -) - -si_sb_sses_sse_case = ( - 'Hesses', 'Jesses', 'Larousses', 'Matisses', -) -si_sb_sses_sse = ( - 'bouillabaisses', 'crevasses', 'demitasses', 'impasses', - 'mousses', 'posses', -) - -si_sb_ves_ve_case = ( - # *[nwl]ives -> [nwl]live - 'Clives', 'Palmolives', -) -si_sb_ves_ve = ( - # *[^d]eaves -> eave - 'interweaves', 'weaves', - - # *[nwl]ives -> [nwl]live - 'olives', - - # *[eoa]lves -> [eoa]lve - 'bivalves', 'dissolves', 'resolves', 'salves', 'twelves', 'valves', -) - - -plverb_special_s = enclose('|'.join( - [pl_sb_singular_s] + - pl_sb_uninflected_s + - list(pl_sb_irregular_s.keys()) + [ - '(.*[csx])is', - '(.*)ceps', - '[A-Z].*s', - ] -)) - -pl_sb_postfix_adj = { - 'general': ['(?!major|lieutenant|brigadier|adjutant|.*star)\S+'], - 'martial': ['court'], -} - -for k in list(pl_sb_postfix_adj.keys()): - pl_sb_postfix_adj[k] = enclose( - enclose('|'.join(pl_sb_postfix_adj[k])) + - "(?=(?:-|\\s+)%s)" % k) - -pl_sb_postfix_adj_stems = '(' + '|'.join(list(pl_sb_postfix_adj.values())) + ')(.*)' - - -# PLURAL WORDS ENDING IS es GO TO SINGULAR is - -si_sb_es_is = ( - 'amanuenses', 'amniocenteses', 'analyses', 'antitheses', - 'apotheoses', 'arterioscleroses', 'atheroscleroses', 'axes', - # 'bases', # bases -> basis - 'catalyses', 'catharses', 'chasses', 'cirrhoses', - 'cocces', 'crises', 'diagnoses', 'dialyses', 'diereses', - 'electrolyses', 'emphases', 'exegeses', 'geneses', - 'halitoses', 'hydrolyses', 'hypnoses', 'hypotheses', 'hystereses', - 'metamorphoses', 'metastases', 'misdiagnoses', 'mitoses', - 'mononucleoses', 'narcoses', 'necroses', 'nemeses', 'neuroses', - 'oases', 'osmoses', 'osteoporoses', 'paralyses', 'parentheses', - 'parthenogeneses', 'periphrases', 'photosyntheses', 'probosces', - 'prognoses', 'prophylaxes', 'prostheses', 'preces', 'psoriases', - 'psychoanalyses', 'psychokineses', 'psychoses', 'scleroses', - 'scolioses', 'sepses', 'silicoses', 'symbioses', 'synopses', - 'syntheses', 'taxes', 'telekineses', 'theses', 'thromboses', - 'tuberculoses', 'urinalyses', -) - -pl_prep_list = """ - about above across after among around at athwart before behind - below beneath beside besides between betwixt beyond but by - during except for from in into near of off on onto out over - since till to under until unto upon with""".split() - -pl_prep_list_da = pl_prep_list + ['de', 'du', 'da'] - -pl_prep_bysize = bysize(pl_prep_list_da) - -pl_prep = enclose('|'.join(pl_prep_list_da)) - -pl_sb_prep_dual_compound = r'(.*?)((?:-|\s+)(?:' + pl_prep + r')(?:-|\s+))a(?:-|\s+)(.*)' - - -singular_pronoun_genders = set(['neuter', - 'feminine', - 'masculine', - 'gender-neutral', - 'feminine or masculine', - 'masculine or feminine']) - -pl_pron_nom = { - # NOMINATIVE REFLEXIVE - "i": "we", "myself": "ourselves", - "you": "you", "yourself": "yourselves", - "she": "they", "herself": "themselves", - "he": "they", "himself": "themselves", - "it": "they", "itself": "themselves", - "they": "they", "themself": "themselves", - - # POSSESSIVE - "mine": "ours", - "yours": "yours", - "hers": "theirs", - "his": "theirs", - "its": "theirs", - "theirs": "theirs", -} - -si_pron = {} -si_pron['nom'] = dict([(v, k) for (k, v) in pl_pron_nom.items()]) -si_pron['nom']['we'] = 'I' - - -pl_pron_acc = { - # ACCUSATIVE REFLEXIVE - "me": "us", "myself": "ourselves", - "you": "you", "yourself": "yourselves", - "her": "them", "herself": "themselves", - "him": "them", "himself": "themselves", - "it": "them", "itself": "themselves", - "them": "them", "themself": "themselves", -} - -pl_pron_acc_keys = enclose('|'.join(list(pl_pron_acc.keys()))) -pl_pron_acc_keys_bysize = bysize(list(pl_pron_acc.keys())) - -si_pron['acc'] = dict([(v, k) for (k, v) in pl_pron_acc.items()]) - -for thecase, plur, gend, sing in ( - ('nom', 'they', 'neuter', 'it'), - ('nom', 'they', 'feminine', 'she'), - ('nom', 'they', 'masculine', 'he'), - ('nom', 'they', 'gender-neutral', 'they'), - ('nom', 'they', 'feminine or masculine', 'she or he'), - ('nom', 'they', 'masculine or feminine', 'he or she'), - ('nom', 'themselves', 'neuter', 'itself'), - ('nom', 'themselves', 'feminine', 'herself'), - ('nom', 'themselves', 'masculine', 'himself'), - ('nom', 'themselves', 'gender-neutral', 'themself'), - ('nom', 'themselves', 'feminine or masculine', 'herself or himself'), - ('nom', 'themselves', 'masculine or feminine', 'himself or herself'), - ('nom', 'theirs', 'neuter', 'its'), - ('nom', 'theirs', 'feminine', 'hers'), - ('nom', 'theirs', 'masculine', 'his'), - ('nom', 'theirs', 'gender-neutral', 'theirs'), - ('nom', 'theirs', 'feminine or masculine', 'hers or his'), - ('nom', 'theirs', 'masculine or feminine', 'his or hers'), - ('acc', 'them', 'neuter', 'it'), - ('acc', 'them', 'feminine', 'her'), - ('acc', 'them', 'masculine', 'him'), - ('acc', 'them', 'gender-neutral', 'them'), - ('acc', 'them', 'feminine or masculine', 'her or him'), - ('acc', 'them', 'masculine or feminine', 'him or her'), - ('acc', 'themselves', 'neuter', 'itself'), - ('acc', 'themselves', 'feminine', 'herself'), - ('acc', 'themselves', 'masculine', 'himself'), - ('acc', 'themselves', 'gender-neutral', 'themself'), - ('acc', 'themselves', 'feminine or masculine', 'herself or himself'), - ('acc', 'themselves', 'masculine or feminine', 'himself or herself'), -): - try: - si_pron[thecase][plur][gend] = sing - except TypeError: - si_pron[thecase][plur] = {} - si_pron[thecase][plur][gend] = sing - - -si_pron_acc_keys = enclose('|'.join(list(si_pron['acc'].keys()))) -si_pron_acc_keys_bysize = bysize(list(si_pron['acc'].keys())) - - -def get_si_pron(thecase, word, gender): - try: - sing = si_pron[thecase][word] - except KeyError: - raise # not a pronoun - try: - return sing[gender] # has several types due to gender - except TypeError: - return sing # answer independent of gender - -plverb_irregular_pres = { - # 1st PERS. SING. 2ND PERS. SING. 3RD PERS. SINGULAR - # 3RD PERS. (INDET.) - "am": "are", "are": "are", "is": "are", - "was": "were", "were": "were", "was": "were", - "have": "have", "have": "have", "has": "have", - "do": "do", "do": "do", "does": "do", -} - -plverb_ambiguous_pres = { - # 1st PERS. SING. 2ND PERS. SING. 3RD PERS. SINGULAR - # 3RD PERS. (INDET.) - "act": "act", "act": "act", "acts": "act", - "blame": "blame", "blame": "blame", "blames": "blame", - "can": "can", "can": "can", "can": "can", - "must": "must", "must": "must", "must": "must", - "fly": "fly", "fly": "fly", "flies": "fly", - "copy": "copy", "copy": "copy", "copies": "copy", - "drink": "drink", "drink": "drink", "drinks": "drink", - "fight": "fight", "fight": "fight", "fights": "fight", - "fire": "fire", "fire": "fire", "fires": "fire", - "like": "like", "like": "like", "likes": "like", - "look": "look", "look": "look", "looks": "look", - "make": "make", "make": "make", "makes": "make", - "reach": "reach", "reach": "reach", "reaches": "reach", - "run": "run", "run": "run", "runs": "run", - "sink": "sink", "sink": "sink", "sinks": "sink", - "sleep": "sleep", "sleep": "sleep", "sleeps": "sleep", - "view": "view", "view": "view", "views": "view", -} - -plverb_ambiguous_pres_keys = enclose('|'.join(list(plverb_ambiguous_pres.keys()))) - - -plverb_irregular_non_pres = ( - "did", "had", "ate", "made", "put", - "spent", "fought", "sank", "gave", "sought", - "shall", "could", "ought", "should", -) - -plverb_ambiguous_non_pres = enclose('|'.join(( - "thought", "saw", "bent", "will", "might", "cut", -))) - -# "..oes" -> "..oe" (the rest are "..oes" -> "o") - -pl_v_oes_oe = ('canoes', 'floes', 'oboes', 'roes', 'throes', 'woes') -pl_v_oes_oe_endings_size4 = ('hoes', 'toes') -pl_v_oes_oe_endings_size5 = ('shoes') - - -pl_count_zero = ( - "0", "no", "zero", "nil" -) - - -pl_count_one = ( - "1", "a", "an", "one", "each", "every", "this", "that", -) - -pl_adj_special = { - "a": "some", "an": "some", - "this": "these", "that": "those", -} - -pl_adj_special_keys = enclose('|'.join(list(pl_adj_special.keys()))) - -pl_adj_poss = { - "my": "our", - "your": "your", - "its": "their", - "her": "their", - "his": "their", - "their": "their", -} - -pl_adj_poss_keys = enclose('|'.join(list(pl_adj_poss.keys()))) - - -# 2. INDEFINITE ARTICLES - -# THIS PATTERN MATCHES STRINGS OF CAPITALS STARTING WITH A "VOWEL-SOUND" -# CONSONANT FOLLOWED BY ANOTHER CONSONANT, AND WHICH ARE NOT LIKELY -# TO BE REAL WORDS (OH, ALL RIGHT THEN, IT'S JUST MAGIC!) - -A_abbrev = r""" -(?! FJO | [HLMNS]Y. | RY[EO] | SQU - | ( F[LR]? | [HL] | MN? | N | RH? | S[CHKLMNPTVW]? | X(YL)?) [AEIOU]) -[FHLMNRSX][A-Z] -""" - -# THIS PATTERN CODES THE BEGINNINGS OF ALL ENGLISH WORDS BEGINING WITH A -# 'y' FOLLOWED BY A CONSONANT. ANY OTHER Y-CONSONANT PREFIX THEREFORE -# IMPLIES AN ABBREVIATION. - -A_y_cons = 'y(b[lor]|cl[ea]|fere|gg|p[ios]|rou|tt)' - -# EXCEPTIONS TO EXCEPTIONS - -A_explicit_a = enclose('|'.join(( - "unabomber", "unanimous", "US", -))) - -A_explicit_an = enclose('|'.join(( - "euler", - "hour(?!i)", "heir", "honest", "hono[ur]", - "mpeg", -))) - -A_ordinal_an = enclose('|'.join(( - "[aefhilmnorsx]-?th", -))) - -A_ordinal_a = enclose('|'.join(( - "[bcdgjkpqtuvwyz]-?th", -))) - - -# NUMERICAL INFLECTIONS - -nth = { - 0: 'th', - 1: 'st', - 2: 'nd', - 3: 'rd', - 4: 'th', - 5: 'th', - 6: 'th', - 7: 'th', - 8: 'th', - 9: 'th', - 11: 'th', - 12: 'th', - 13: 'th', -} - -ordinal = dict(ty='tieth', - one='first', - two='second', - three='third', - five='fifth', - eight='eighth', - nine='ninth', - twelve='twelfth') - -ordinal_suff = '|'.join(list(ordinal.keys())) - - -# NUMBERS - -unit = ['', 'one', 'two', 'three', 'four', 'five', - 'six', 'seven', 'eight', 'nine'] -teen = ['ten', 'eleven', 'twelve', 'thirteen', 'fourteen', - 'fifteen', 'sixteen', 'seventeen', 'eighteen', 'nineteen'] -ten = ['', '', 'twenty', 'thirty', 'forty', - 'fifty', 'sixty', 'seventy', 'eighty', 'ninety'] -mill = [' ', ' thousand', ' million', ' billion', ' trillion', ' quadrillion', - ' quintillion', ' sextillion', ' septillion', ' octillion', - ' nonillion', ' decillion'] - - -# SUPPORT CLASSICAL PLURALIZATIONS - -def_classical = dict( - all=False, - zero=False, - herd=False, - names=True, - persons=False, - ancient=False, -) - -all_classical = dict((k, True) for k in list(def_classical.keys())) -no_classical = dict((k, False) for k in list(def_classical.keys())) - - -# TODO: .inflectrc file does not work -# can't just execute methods from another file like this - -# for rcfile in (pathjoin(dirname(__file__), '.inflectrc'), -# expanduser(pathjoin(('~'), '.inflectrc'))): -# if isfile(rcfile): -# try: -# execfile(rcfile) -# except: -# print3("\nBad .inflectrc file (%s):\n" % rcfile) -# raise BadRcFileError - - -class engine: - - def __init__(self): - - self.classical_dict = def_classical.copy() - self.persistent_count = None - self.mill_count = 0 - self.pl_sb_user_defined = [] - self.pl_v_user_defined = [] - self.pl_adj_user_defined = [] - self.si_sb_user_defined = [] - self.A_a_user_defined = [] - self.thegender = 'neuter' - - deprecated_methods = dict(pl='plural', - plnoun='plural_noun', - plverb='plural_verb', - pladj='plural_adj', - sinoun='single_noun', - prespart='present_participle', - numwords='number_to_words', - plequal='compare', - plnounequal='compare_nouns', - plverbequal='compare_verbs', - pladjequal='compare_adjs', - wordlist='join', - ) - - def __getattr__(self, meth): - if meth in self.deprecated_methods: - print3('%s() deprecated, use %s()' % (meth, self.deprecated_methods[meth])) - raise DeprecationWarning - raise AttributeError - - def defnoun(self, singular, plural): - ''' - Set the noun plural of singular to plural. - - ''' - self.checkpat(singular) - self.checkpatplural(plural) - self.pl_sb_user_defined.extend((singular, plural)) - self.si_sb_user_defined.extend((plural, singular)) - return 1 - - def defverb(self, s1, p1, s2, p2, s3, p3): - ''' - Set the verb plurals for s1, s2 and s3 to p1, p2 and p3 respectively. - - Where 1, 2 and 3 represent the 1st, 2nd and 3rd person forms of the verb. - - ''' - self.checkpat(s1) - self.checkpat(s2) - self.checkpat(s3) - self.checkpatplural(p1) - self.checkpatplural(p2) - self.checkpatplural(p3) - self.pl_v_user_defined.extend((s1, p1, s2, p2, s3, p3)) - return 1 - - def defadj(self, singular, plural): - ''' - Set the adjective plural of singular to plural. - - ''' - self.checkpat(singular) - self.checkpatplural(plural) - self.pl_adj_user_defined.extend((singular, plural)) - return 1 - - def defa(self, pattern): - ''' - Define the indefinate article as 'a' for words matching pattern. - - ''' - self.checkpat(pattern) - self.A_a_user_defined.extend((pattern, 'a')) - return 1 - - def defan(self, pattern): - ''' - Define the indefinate article as 'an' for words matching pattern. - - ''' - self.checkpat(pattern) - self.A_a_user_defined.extend((pattern, 'an')) - return 1 - - def checkpat(self, pattern): - ''' - check for errors in a regex pattern - ''' - if pattern is None: - return - try: - match(pattern, '') - except reerror: - print3("\nBad user-defined singular pattern:\n\t%s\n" % pattern) - raise BadUserDefinedPatternError - - def checkpatplural(self, pattern): - ''' - check for errors in a regex replace pattern - ''' - return - # can't find a pattern that doesn't pass the following test: - # if pattern is None: - # return - # try: - # resub('', pattern, '') - # except reerror: - # print3("\nBad user-defined plural pattern:\n\t%s\n" % pattern) - # raise BadUserDefinedPatternError - - def ud_match(self, word, wordlist): - for i in range(len(wordlist) - 2, -2, -2): # backwards through even elements - mo = search(r'^%s$' % wordlist[i], word, IGNORECASE) - if mo: - if wordlist[i + 1] is None: - return None - pl = resub(r'\$(\d+)', r'\\1', wordlist[i + 1]) # change $n to \n for expand - return mo.expand(pl) - return None - - def classical(self, **kwargs): - """ - turn classical mode on and off for various categories - - turn on all classical modes: - classical() - classical(all=True) - - turn on or off specific claassical modes: - e.g. - classical(herd=True) - classical(names=False) - - By default all classical modes are off except names. - - unknown value in args or key in kwargs rasies exception: UnknownClasicalModeError - - """ - classical_mode = list(def_classical.keys()) - if not kwargs: - self.classical_dict = all_classical.copy() - return - if 'all' in kwargs: - if kwargs['all']: - self.classical_dict = all_classical.copy() - else: - self.classical_dict = no_classical.copy() - - for k, v in list(kwargs.items()): - if k in classical_mode: - self.classical_dict[k] = v - else: - raise UnknownClassicalModeError - - def num(self, count=None, show=None): # (;$count,$show) - ''' - Set the number to be used in other method calls. - - Returns count. - - Set show to False to return '' instead. - - ''' - if count is not None: - try: - self.persistent_count = int(count) - except ValueError: - raise BadNumValueError - if (show is None) or show: - return str(count) - else: - self.persistent_count = None - return '' - - def gender(self, gender): - ''' - set the gender for the singular of plural pronouns - - can be one of: - 'neuter' ('they' -> 'it') - 'feminine' ('they' -> 'she') - 'masculine' ('they' -> 'he') - 'gender-neutral' ('they' -> 'they') - 'feminine or masculine' ('they' -> 'she or he') - 'masculine or feminine' ('they' -> 'he or she') - ''' - if gender in singular_pronoun_genders: - self.thegender = gender - else: - raise BadGenderError - - def nummo(self, matchobject): - ''' - num but take a matchobject - use groups 1 and 2 in matchobject - ''' - return self.num(matchobject.group(1), matchobject.group(2)) - - def plmo(self, matchobject): - ''' - plural but take a matchobject - use groups 1 and 3 in matchobject - ''' - return self.plural(matchobject.group(1), matchobject.group(3)) - - def plnounmo(self, matchobject): - ''' - plural_noun but take a matchobject - use groups 1 and 3 in matchobject - ''' - return self.plural_noun(matchobject.group(1), matchobject.group(3)) - - def plverbmo(self, matchobject): - ''' - plural_verb but take a matchobject - use groups 1 and 3 in matchobject - ''' - return self.plural_verb(matchobject.group(1), matchobject.group(3)) - - def pladjmo(self, matchobject): - ''' - plural_adj but take a matchobject - use groups 1 and 3 in matchobject - ''' - return self.plural_adj(matchobject.group(1), matchobject.group(3)) - - def sinounmo(self, matchobject): - ''' - singular_noun but take a matchobject - use groups 1 and 3 in matchobject - ''' - return self.singular_noun(matchobject.group(1), matchobject.group(3)) - - def amo(self, matchobject): - ''' - A but take a matchobject - use groups 1 and 3 in matchobject - ''' - if matchobject.group(3) is None: - return self.a(matchobject.group(1)) - return self.a(matchobject.group(1), matchobject.group(3)) - - def nomo(self, matchobject): - ''' - NO but take a matchobject - use groups 1 and 3 in matchobject - ''' - return self.no(matchobject.group(1), matchobject.group(3)) - - def ordinalmo(self, matchobject): - ''' - ordinal but take a matchobject - use group 1 - ''' - return self.ordinal(matchobject.group(1)) - - def numwordsmo(self, matchobject): - ''' - number_to_words but take a matchobject - use group 1 - ''' - return self.number_to_words(matchobject.group(1)) - - def prespartmo(self, matchobject): - ''' - prespart but take a matchobject - use group 1 - ''' - return self.present_participle(matchobject.group(1)) - -# 0. PERFORM GENERAL INFLECTIONS IN A STRING - - def inflect(self, text): - ''' - Perform inflections in a string. - - e.g. inflect('The plural of cat is plural(cat)') returns - 'The plural of cat is cats' - - can use plural, plural_noun, plural_verb, plural_adj, singular_noun, a, an, no, ordinal, - number_to_words and prespart - - ''' - save_persistent_count = self.persistent_count - sections = splitre(r"(num\([^)]*\))", text) - inflection = [] - - for section in sections: - (section, count) = subn(r"num\(\s*?(?:([^),]*)(?:,([^)]*))?)?\)", self.nummo, section) - if not count: - total = -1 - while total: - (section, total) = subn( - r"(?x)\bplural \( ([^),]*) (, ([^)]*) )? \) ", - self.plmo, section) - (section, count) = subn( - r"(?x)\bplural_noun \( ([^),]*) (, ([^)]*) )? \) ", - self.plnounmo, section) - total += count - (section, count) = subn( - r"(?x)\bplural_verb \( ([^),]*) (, ([^)]*) )? \) ", - self.plverbmo, section) - total += count - (section, count) = subn( - r"(?x)\bplural_adj \( ([^),]*) (, ([^)]*) )? \) ", - self.pladjmo, section) - total += count - (section, count) = subn( - r"(?x)\bsingular_noun \( ([^),]*) (, ([^)]*) )? \) ", - self.sinounmo, section) - total += count - (section, count) = subn( - r"(?x)\ban? \( ([^),]*) (, ([^)]*) )? \) ", - self.amo, section) - total += count - (section, count) = subn( - r"(?x)\bno \( ([^),]*) (, ([^)]*) )? \) ", - self.nomo, section) - total += count - (section, count) = subn( - r"(?x)\bordinal \( ([^)]*) \) ", - self.ordinalmo, section) - total += count - (section, count) = subn( - r"(?x)\bnumber_to_words \( ([^)]*) \) ", - self.numwordsmo, section) - total += count - (section, count) = subn( - r"(?x)\bpresent_participle \( ([^)]*) \) ", - self.prespartmo, section) - total += count - - inflection.append(section) - - self.persistent_count = save_persistent_count - return "".join(inflection) - -# ## PLURAL SUBROUTINES - - def postprocess(self, orig, inflected): - """ - FIX PEDANTRY AND CAPITALIZATION :-) - """ - if '|' in inflected: - inflected = inflected.split('|')[self.classical_dict['all']] - if orig == "I": - return inflected - if orig == orig.upper(): - return inflected.upper() - if orig[0] == orig[0].upper(): - return '%s%s' % (inflected[0].upper(), - inflected[1:]) - return inflected - - def partition_word(self, text): - mo = search(r'\A(\s*)(.+?)(\s*)\Z', text) - try: - return mo.group(1), mo.group(2), mo.group(3) - except AttributeError: # empty string - return '', '', '' - -# def pl(self, *args, **kwds): -# print 'pl() deprecated, use plural()' -# raise DeprecationWarning -# return self.plural(*args, **kwds) -# -# def plnoun(self, *args, **kwds): -# print 'plnoun() deprecated, use plural_noun()' -# raise DeprecationWarning -# return self.plural_noun(*args, **kwds) -# -# def plverb(self, *args, **kwds): -# print 'plverb() deprecated, use plural_verb()' -# raise DeprecationWarning -# return self.plural_verb(*args, **kwds) -# -# def pladj(self, *args, **kwds): -# print 'pladj() deprecated, use plural_adj()' -# raise DeprecationWarning -# return self.plural_adj(*args, **kwds) -# -# def sinoun(self, *args, **kwds): -# print 'sinoun() deprecated, use singular_noun()' -# raise DeprecationWarning -# return self.singular_noun(*args, **kwds) -# -# def prespart(self, *args, **kwds): -# print 'prespart() deprecated, use present_participle()' -# raise DeprecationWarning -# return self.present_participle(*args, **kwds) -# -# def numwords(self, *args, **kwds): -# print 'numwords() deprecated, use number_to_words()' -# raise DeprecationWarning -# return self.number_to_words(*args, **kwds) - - def plural(self, text, count=None): - ''' - Return the plural of text. - - If count supplied, then return text if count is one of: - 1, a, an, one, each, every, this, that - otherwise return the plural. - - Whitespace at the start and end is preserved. - - ''' - pre, word, post = self.partition_word(text) - if not word: - return text - plural = self.postprocess( - word, - self._pl_special_adjective(word, count) or - self._pl_special_verb(word, count) or - self._plnoun(word, count)) - return "%s%s%s" % (pre, plural, post) - - def plural_noun(self, text, count=None): - ''' - Return the plural of text, where text is a noun. - - If count supplied, then return text if count is one of: - 1, a, an, one, each, every, this, that - otherwise return the plural. - - Whitespace at the start and end is preserved. - - ''' - pre, word, post = self.partition_word(text) - if not word: - return text - plural = self.postprocess(word, self._plnoun(word, count)) - return "%s%s%s" % (pre, plural, post) - - def plural_verb(self, text, count=None): - ''' - Return the plural of text, where text is a verb. - - If count supplied, then return text if count is one of: - 1, a, an, one, each, every, this, that - otherwise return the plural. - - Whitespace at the start and end is preserved. - - ''' - pre, word, post = self.partition_word(text) - if not word: - return text - plural = self.postprocess(word, self._pl_special_verb(word, count) or - self._pl_general_verb(word, count)) - return "%s%s%s" % (pre, plural, post) - - def plural_adj(self, text, count=None): - ''' - Return the plural of text, where text is an adjective. - - If count supplied, then return text if count is one of: - 1, a, an, one, each, every, this, that - otherwise return the plural. - - Whitespace at the start and end is preserved. - - ''' - pre, word, post = self.partition_word(text) - if not word: - return text - plural = self.postprocess(word, self._pl_special_adjective(word, count) or word) - return "%s%s%s" % (pre, plural, post) - - def compare(self, word1, word2): - ''' - compare word1 and word2 for equality regardless of plurality - - return values: - eq - the strings are equal - p:s - word1 is the plural of word2 - s:p - word2 is the plural of word1 - p:p - word1 and word2 are two different plural forms of the one word - False - otherwise - - ''' - return ( - self._plequal(word1, word2, self.plural_noun) or - self._plequal(word1, word2, self.plural_verb) or - self._plequal(word1, word2, self.plural_adj)) - - def compare_nouns(self, word1, word2): - ''' - compare word1 and word2 for equality regardless of plurality - word1 and word2 are to be treated as nouns - - return values: - eq - the strings are equal - p:s - word1 is the plural of word2 - s:p - word2 is the plural of word1 - p:p - word1 and word2 are two different plural forms of the one word - False - otherwise - - ''' - return self._plequal(word1, word2, self.plural_noun) - - def compare_verbs(self, word1, word2): - ''' - compare word1 and word2 for equality regardless of plurality - word1 and word2 are to be treated as verbs - - return values: - eq - the strings are equal - p:s - word1 is the plural of word2 - s:p - word2 is the plural of word1 - p:p - word1 and word2 are two different plural forms of the one word - False - otherwise - - ''' - return self._plequal(word1, word2, self.plural_verb) - - def compare_adjs(self, word1, word2): - ''' - compare word1 and word2 for equality regardless of plurality - word1 and word2 are to be treated as adjectives - - return values: - eq - the strings are equal - p:s - word1 is the plural of word2 - s:p - word2 is the plural of word1 - p:p - word1 and word2 are two different plural forms of the one word - False - otherwise - - ''' - return self._plequal(word1, word2, self.plural_adj) - - def singular_noun(self, text, count=None, gender=None): - ''' - Return the singular of text, where text is a plural noun. - - If count supplied, then return the singular if count is one of: - 1, a, an, one, each, every, this, that or if count is None - otherwise return text unchanged. - - Whitespace at the start and end is preserved. - - ''' - pre, word, post = self.partition_word(text) - if not word: - return text - sing = self._sinoun(word, count=count, gender=gender) - if sing is not False: - plural = self.postprocess(word, self._sinoun(word, count=count, gender=gender)) - return "%s%s%s" % (pre, plural, post) - return False - - def _plequal(self, word1, word2, pl): - classval = self.classical_dict.copy() - self.classical_dict = all_classical.copy() - if word1 == word2: - return "eq" - if word1 == pl(word2): - return "p:s" - if pl(word1) == word2: - return "s:p" - self.classical_dict = no_classical.copy() - if word1 == pl(word2): - return "p:s" - if pl(word1) == word2: - return "s:p" - self.classical_dict = classval.copy() - - if pl == self.plural or pl == self.plural_noun: - if self._pl_check_plurals_N(word1, word2): - return "p:p" - if self._pl_check_plurals_N(word2, word1): - return "p:p" - if pl == self.plural or pl == self.plural_adj: - if self._pl_check_plurals_adj(word1, word2): - return "p:p" - return False - - def _pl_reg_plurals(self, pair, stems, end1, end2): - if search(r"(%s)(%s\|\1%s|%s\|\1%s)" % (stems, end1, end2, end2, end1), pair): - return True - return False - - def _pl_check_plurals_N(self, word1, word2): - pair = "%s|%s" % (word1, word2) - if pair in list(pl_sb_irregular_s.values()): - return True - if pair in list(pl_sb_irregular.values()): - return True - if pair in list(pl_sb_irregular_caps.values()): - return True - - for (stems, end1, end2) in ( - (pl_sb_C_a_ata, "as", "ata"), - (pl_sb_C_is_ides, "is", "ides"), - (pl_sb_C_a_ae, "s", "e"), - (pl_sb_C_en_ina, "ens", "ina"), - (pl_sb_C_um_a, "ums", "a"), - (pl_sb_C_us_i, "uses", "i"), - (pl_sb_C_on_a, "ons", "a"), - (pl_sb_C_o_i_stems, "os", "i"), - (pl_sb_C_ex_ices, "exes", "ices"), - (pl_sb_C_ix_ices, "ixes", "ices"), - (pl_sb_C_i, "s", "i"), - (pl_sb_C_im, "s", "im"), - ('.*eau', "s", "x"), - ('.*ieu', "s", "x"), - ('.*tri', "xes", "ces"), - ('.{2,}[yia]n', "xes", "ges") - ): - if self._pl_reg_plurals(pair, stems, end1, end2): - return True - return False - - def _pl_check_plurals_adj(self, word1, word2): -# VERSION: tuple in endswith requires python 2.5 - word1a = word1[:word1.rfind("'")] if word1.endswith(("'s", "'")) else '' - word2a = word2[:word2.rfind("'")] if word2.endswith(("'s", "'")) else '' - # TODO: BUG? report upstream. I don't think you should chop off the s' - # word1b = word1[:-2] if word1.endswith("s'") else '' - # word2b = word2[:-2] if word2.endswith("s'") else '' - - # TODO: dresses', dresses's -> dresses, dresses when chop off letters - # then they return False because they are the same. Need to fix this. - - if word1a: - if word2a and (self._pl_check_plurals_N(word1a, word2a) - or self._pl_check_plurals_N(word2a, word1a)): - return True - # if word2b and ( self._pl_check_plurals_N(word1a, word2b) - # or self._pl_check_plurals_N(word2b, word1a) ): - # return True - - # if word1b: - # if word2a and ( self._pl_check_plurals_N(word1b, word2a) - # or self._pl_check_plurals_N(word2a, word1b) ): - # return True - # if word2b and ( self._pl_check_plurals_N(word1b, word2b) - # or self._pl_check_plurals_N(word2b, word1b) ): - # return True - - return False - - def get_count(self, count=None): - if count is None and self.persistent_count is not None: - count = self.persistent_count - - if count is not None: - count = 1 if ((str(count) in pl_count_one) or - (self.classical_dict['zero'] and str(count).lower() in pl_count_zero)) else 2 - else: - count = '' - return count - - # @profile - def _plnoun(self, word, count=None): - count = self.get_count(count) - -# DEFAULT TO PLURAL - - if count == 1: - return word - -# HANDLE USER-DEFINED NOUNS - - value = self.ud_match(word, self.pl_sb_user_defined) - if value is not None: - return value - -# HANDLE EMPTY WORD, SINGULAR COUNT AND UNINFLECTED PLURALS - - if word == '': - return word - - lowerword = word.lower() - - if lowerword in pl_sb_uninflected_complete: - return word - - if word in pl_sb_uninflected_caps: - return word - - for k, v in pl_sb_uninflected_bysize.items(): - if lowerword[-k:] in v: - return word - - if (self.classical_dict['herd'] and lowerword in pl_sb_uninflected_herd): - return word - -# HANDLE COMPOUNDS ("Governor General", "mother-in-law", "aide-de-camp", ETC.) - - mo = search(r"^(?:%s)$" % pl_sb_postfix_adj_stems, word, IGNORECASE) - if mo and mo.group(2) != '': - return "%s%s" % (self._plnoun(mo.group(1), 2), mo.group(2)) - - if ' a ' in lowerword or '-a-' in lowerword: - mo = search(r"^(?:%s)$" % pl_sb_prep_dual_compound, word, IGNORECASE) - if mo and mo.group(2) != '' and mo.group(3) != '': - return "%s%s%s" % (self._plnoun(mo.group(1), 2), - mo.group(2), - self._plnoun(mo.group(3))) - - lowersplit = lowerword.split(' ') - if len(lowersplit) >= 3: - for numword in range(1, len(lowersplit) - 1): - if lowersplit[numword] in pl_prep_list_da: - return ' '.join( - lowersplit[:numword - 1] + - [self._plnoun(lowersplit[numword - 1], 2)] + lowersplit[numword:]) - - lowersplit = lowerword.split('-') - if len(lowersplit) >= 3: - for numword in range(1, len(lowersplit) - 1): - if lowersplit[numword] in pl_prep_list_da: - return ' '.join( - lowersplit[:numword - 1] + - [self._plnoun(lowersplit[numword - 1], 2) + - '-' + lowersplit[numword] + '-']) + ' '.join(lowersplit[(numword + 1):]) - -# HANDLE PRONOUNS - - for k, v in pl_pron_acc_keys_bysize.items(): - if lowerword[-k:] in v: # ends with accusivate pronoun - for pk, pv in pl_prep_bysize.items(): - if lowerword[:pk] in pv: # starts with a prep - if lowerword.split() == [lowerword[:pk], lowerword[-k:]]: # only whitespace in between - return lowerword[:-k] + pl_pron_acc[lowerword[-k:]] - - try: - return pl_pron_nom[word.lower()] - except KeyError: - pass - - try: - return pl_pron_acc[word.lower()] - except KeyError: - pass - -# HANDLE ISOLATED IRREGULAR PLURALS - - wordsplit = word.split() - wordlast = wordsplit[-1] - lowerwordlast = wordlast.lower() - - if wordlast in list(pl_sb_irregular_caps.keys()): - llen = len(wordlast) - return '%s%s' % (word[:-llen], - pl_sb_irregular_caps[wordlast]) - - if lowerwordlast in list(pl_sb_irregular.keys()): - llen = len(lowerwordlast) - return '%s%s' % (word[:-llen], - pl_sb_irregular[lowerwordlast]) - - if (' '.join(wordsplit[-2:])).lower() in list(pl_sb_irregular_compound.keys()): - llen = len(' '.join(wordsplit[-2:])) # TODO: what if 2 spaces between these words? - return '%s%s' % (word[:-llen], - pl_sb_irregular_compound[(' '.join(wordsplit[-2:])).lower()]) - - if lowerword[-3:] == 'quy': - return word[:-1] + 'ies' - - if lowerword[-6:] == 'person': - if self.classical_dict['persons']: - return word + 's' - else: - return word[:-4] + 'ople' - -# HANDLE FAMILIES OF IRREGULAR PLURALS - - if lowerword[-3:] == 'man': - for k, v in pl_sb_U_man_mans_bysize.items(): - if lowerword[-k:] in v: - return word + 's' - for k, v in pl_sb_U_man_mans_caps_bysize.items(): - if word[-k:] in v: - return word + 's' - return word[:-3] + 'men' - if lowerword[-5:] == 'mouse': - return word[:-5] + 'mice' - if lowerword[-5:] == 'louse': - return word[:-5] + 'lice' - if lowerword[-5:] == 'goose': - return word[:-5] + 'geese' - if lowerword[-5:] == 'tooth': - return word[:-5] + 'teeth' - if lowerword[-4:] == 'foot': - return word[:-4] + 'feet' - - if lowerword == 'die': - return 'dice' - -# HANDLE UNASSIMILATED IMPORTS - - if lowerword[-4:] == 'ceps': - return word - if lowerword[-4:] == 'zoon': - return word[:-2] + 'a' - if lowerword[-3:] in ('cis', 'sis', 'xis'): - return word[:-2] + 'es' - - for lastlet, d, numend, post in ( - ('h', pl_sb_U_ch_chs_bysize, None, 's'), - ('x', pl_sb_U_ex_ices_bysize, -2, 'ices'), - ('x', pl_sb_U_ix_ices_bysize, -2, 'ices'), - ('m', pl_sb_U_um_a_bysize, -2, 'a'), - ('s', pl_sb_U_us_i_bysize, -2, 'i'), - ('n', pl_sb_U_on_a_bysize, -2, 'a'), - ('a', pl_sb_U_a_ae_bysize, None, 'e'), - ): - if lowerword[-1] == lastlet: # this test to add speed - for k, v in d.items(): - if lowerword[-k:] in v: - return word[:numend] + post - -# HANDLE INCOMPLETELY ASSIMILATED IMPORTS - - if (self.classical_dict['ancient']): - if lowerword[-4:] == 'trix': - return word[:-1] + 'ces' - if lowerword[-3:] in ('eau', 'ieu'): - return word + 'x' - if lowerword[-3:] in ('ynx', 'inx', 'anx') and len(word) > 4: - return word[:-1] + 'ges' - - for lastlet, d, numend, post in ( - ('n', pl_sb_C_en_ina_bysize, -2, 'ina'), - ('x', pl_sb_C_ex_ices_bysize, -2, 'ices'), - ('x', pl_sb_C_ix_ices_bysize, -2, 'ices'), - ('m', pl_sb_C_um_a_bysize, -2, 'a'), - ('s', pl_sb_C_us_i_bysize, -2, 'i'), - ('s', pl_sb_C_us_us_bysize, None, ''), - ('a', pl_sb_C_a_ae_bysize, None, 'e'), - ('a', pl_sb_C_a_ata_bysize, None, 'ta'), - ('s', pl_sb_C_is_ides_bysize, -1, 'des'), - ('o', pl_sb_C_o_i_bysize, -1, 'i'), - ('n', pl_sb_C_on_a_bysize, -2, 'a'), - ): - if lowerword[-1] == lastlet: # this test to add speed - for k, v in d.items(): - if lowerword[-k:] in v: - return word[:numend] + post - - for d, numend, post in ( - (pl_sb_C_i_bysize, None, 'i'), - (pl_sb_C_im_bysize, None, 'im'), - ): - for k, v in d.items(): - if lowerword[-k:] in v: - return word[:numend] + post - -# HANDLE SINGULAR NOUNS ENDING IN ...s OR OTHER SILIBANTS - - if lowerword in pl_sb_singular_s_complete: - return word + 'es' - - for k, v in pl_sb_singular_s_bysize.items(): - if lowerword[-k:] in v: - return word + 'es' - - if lowerword[-2:] == 'es' and word[0] == word[0].upper(): - return word + 'es' - -# Wouldn't special words -# ending with 's' always have been caught, regardless of them starting -# with a capital letter (i.e. being names) -# It makes sense below to do this for words ending in 'y' so that -# Sally -> Sallys. But not sure it makes sense here. Where is the case -# of a word ending in s that is caught here and would otherwise have been -# caught below? -# -# removing it as I can't find a case that executes it -# TODO: check this again -# -# if (self.classical_dict['names']): -# mo = search(r"([A-Z].*s)$", word) -# if mo: -# return "%ses" % mo.group(1) - - if lowerword[-1] == 'z': - for k, v in pl_sb_z_zes_bysize.items(): - if lowerword[-k:] in v: - return word + 'es' - - if lowerword[-2:-1] != 'z': - return word + 'zes' - - if lowerword[-2:] == 'ze': - for k, v in pl_sb_ze_zes_bysize.items(): - if lowerword[-k:] in v: - return word + 's' - - if lowerword[-2:] in ('ch', 'sh', 'zz', 'ss') or lowerword[-1] == 'x': - return word + 'es' - -# ## (r"(.*)(us)$", "%s%ses"), TODO: why is this commented? - -# HANDLE ...f -> ...ves - - if lowerword[-3:] in ('elf', 'alf', 'olf'): - return word[:-1] + 'ves' - if lowerword[-3:] == 'eaf' and lowerword[-4:-3] != 'd': - return word[:-1] + 'ves' - if lowerword[-4:] in ('nife', 'life', 'wife'): - return word[:-2] + 'ves' - if lowerword[-3:] == 'arf': - return word[:-1] + 'ves' - -# HANDLE ...y - - if lowerword[-1] == 'y': - if lowerword[-2:-1] in 'aeiou' or len(word) == 1: - return word + 's' - - if (self.classical_dict['names']): - if lowerword[-1] == 'y' and word[0] == word[0].upper(): - return word + 's' - - return word[:-1] + 'ies' - -# HANDLE ...o - - if lowerword in pl_sb_U_o_os_complete: - return word + 's' - - for k, v in pl_sb_U_o_os_bysize.items(): - if lowerword[-k:] in v: - return word + 's' - - if lowerword[-2:] in ('ao', 'eo', 'io', 'oo', 'uo'): - return word + 's' - - if lowerword[-1] == 'o': - return word + 'es' - -# OTHERWISE JUST ADD ...s - - return "%ss" % word - - def _pl_special_verb(self, word, count=None): - if (self.classical_dict['zero'] and - str(count).lower() in pl_count_zero): - return False - count = self.get_count(count) - - if count == 1: - return word - -# HANDLE USER-DEFINED VERBS - - value = self.ud_match(word, self.pl_v_user_defined) - if value is not None: - return value - -# HANDLE IRREGULAR PRESENT TENSE (SIMPLE AND COMPOUND) - - lowerword = word.lower() - try: - firstword = lowerword.split()[0] - except IndexError: - return False # word is '' - - if firstword in list(plverb_irregular_pres.keys()): - return "%s%s" % (plverb_irregular_pres[firstword], word[len(firstword):]) - -# HANDLE IRREGULAR FUTURE, PRETERITE AND PERFECT TENSES - - if firstword in plverb_irregular_non_pres: - return word - -# HANDLE PRESENT NEGATIONS (SIMPLE AND COMPOUND) - - if firstword.endswith("n't") and firstword[:-3] in list(plverb_irregular_pres.keys()): - return "%sn't%s" % (plverb_irregular_pres[firstword[:-3]], word[len(firstword):]) - - if firstword.endswith("n't"): - return word - -# HANDLE SPECIAL CASES - - mo = search(r"^(%s)$" % plverb_special_s, word) - if mo: - return False - if search(r"\s", word): - return False - if lowerword == 'quizzes': - return 'quiz' - -# HANDLE STANDARD 3RD PERSON (CHOP THE ...(e)s OFF SINGLE WORDS) - - if lowerword[-4:] in ('ches', 'shes', 'zzes', 'sses') or \ - lowerword[-3:] == 'xes': - return word[:-2] - -# # mo = search(r"^(.*)([cs]h|[x]|zz|ss)es$", -# # word, IGNORECASE) -# # if mo: -# # return "%s%s" % (mo.group(1), mo.group(2)) - - if lowerword[-3:] == 'ies' and len(word) > 3: - return lowerword[:-3] + 'y' - - if (lowerword in pl_v_oes_oe or - lowerword[-4:] in pl_v_oes_oe_endings_size4 or - lowerword[-5:] in pl_v_oes_oe_endings_size5): - return word[:-1] - - if lowerword.endswith('oes') and len(word) > 3: - return lowerword[:-2] - - mo = search(r"^(.*[^s])s$", word, IGNORECASE) - if mo: - return mo.group(1) - -# OTHERWISE, A REGULAR VERB (HANDLE ELSEWHERE) - - return False - - def _pl_general_verb(self, word, count=None): - count = self.get_count(count) - - if count == 1: - return word - -# HANDLE AMBIGUOUS PRESENT TENSES (SIMPLE AND COMPOUND) - - mo = search(r"^(%s)((\s.*)?)$" % plverb_ambiguous_pres_keys, word, IGNORECASE) - if mo: - return "%s%s" % (plverb_ambiguous_pres[mo.group(1).lower()], mo.group(2)) - -# HANDLE AMBIGUOUS PRETERITE AND PERFECT TENSES - - mo = search(r"^(%s)((\s.*)?)$" % plverb_ambiguous_non_pres, word, IGNORECASE) - if mo: - return word - -# OTHERWISE, 1st OR 2ND PERSON IS UNINFLECTED - - return word - - def _pl_special_adjective(self, word, count=None): - count = self.get_count(count) - - if count == 1: - return word - -# HANDLE USER-DEFINED ADJECTIVES - - value = self.ud_match(word, self.pl_adj_user_defined) - if value is not None: - return value - -# HANDLE KNOWN CASES - - mo = search(r"^(%s)$" % pl_adj_special_keys, - word, IGNORECASE) - if mo: - return "%s" % (pl_adj_special[mo.group(1).lower()]) - -# HANDLE POSSESSIVES - - mo = search(r"^(%s)$" % pl_adj_poss_keys, - word, IGNORECASE) - if mo: - return "%s" % (pl_adj_poss[mo.group(1).lower()]) - - mo = search(r"^(.*)'s?$", - word) - if mo: - pl = self.plural_noun(mo.group(1)) - trailing_s = "" if pl[-1] == 's' else "s" - return "%s'%s" % (pl, trailing_s) - -# OTHERWISE, NO IDEA - - return False - - # @profile - def _sinoun(self, word, count=None, gender=None): - count = self.get_count(count) - -# DEFAULT TO PLURAL - - if count == 2: - return word - -# SET THE GENDER - - try: - if gender is None: - gender = self.thegender - elif gender not in singular_pronoun_genders: - raise BadGenderError - except (TypeError, IndexError): - raise BadGenderError - -# HANDLE USER-DEFINED NOUNS - - value = self.ud_match(word, self.si_sb_user_defined) - if value is not None: - return value - -# HANDLE EMPTY WORD, SINGULAR COUNT AND UNINFLECTED PLURALS - - if word == '': - return word - - lowerword = word.lower() - - if word in si_sb_ois_oi_case: - return word[:-1] - - if lowerword in pl_sb_uninflected_complete: - return word - - if word in pl_sb_uninflected_caps: - return word - - for k, v in pl_sb_uninflected_bysize.items(): - if lowerword[-k:] in v: - return word - - if (self.classical_dict['herd'] and lowerword in pl_sb_uninflected_herd): - return word - -# HANDLE COMPOUNDS ("Governor General", "mother-in-law", "aide-de-camp", ETC.) - - mo = search(r"^(?:%s)$" % pl_sb_postfix_adj_stems, word, IGNORECASE) - if mo and mo.group(2) != '': - return "%s%s" % (self._sinoun(mo.group(1), 1, gender=gender), mo.group(2)) - - # how to reverse this one? - # mo = search(r"^(?:%s)$" % pl_sb_prep_dual_compound, word, IGNORECASE) - # if mo and mo.group(2) != '' and mo.group(3) != '': - # return "%s%s%s" % (self._sinoun(mo.group(1), 1), - # mo.group(2), - # self._sinoun(mo.group(3), 1)) - - lowersplit = lowerword.split(' ') - if len(lowersplit) >= 3: - for numword in range(1, len(lowersplit) - 1): - if lowersplit[numword] in pl_prep_list_da: - return ' '.join(lowersplit[:numword - 1] + - [self._sinoun(lowersplit[numword - 1], 1, gender=gender)] + - lowersplit[numword:]) - - lowersplit = lowerword.split('-') - if len(lowersplit) >= 3: - for numword in range(1, len(lowersplit) - 1): - if lowersplit[numword] in pl_prep_list_da: - return ' '.join( - lowersplit[:numword - 1] + - [self._sinoun(lowersplit[numword - 1], 1, gender=gender) + - '-' + lowersplit[numword] + '-']) + ' '.join(lowersplit[(numword + 1):]) - -# HANDLE PRONOUNS - - for k, v in si_pron_acc_keys_bysize.items(): - if lowerword[-k:] in v: # ends with accusivate pronoun - for pk, pv in pl_prep_bysize.items(): - if lowerword[:pk] in pv: # starts with a prep - if lowerword.split() == [lowerword[:pk], lowerword[-k:]]: # only whitespace in between - return lowerword[:-k] + get_si_pron('acc', lowerword[-k:], gender) - - try: - return get_si_pron('nom', word.lower(), gender) - except KeyError: - pass - - try: - return get_si_pron('acc', word.lower(), gender) - except KeyError: - pass - -# HANDLE ISOLATED IRREGULAR PLURALS - - wordsplit = word.split() - wordlast = wordsplit[-1] - lowerwordlast = wordlast.lower() - - if wordlast in list(si_sb_irregular_caps.keys()): - llen = len(wordlast) - return '%s%s' % (word[:-llen], - si_sb_irregular_caps[wordlast]) - - if lowerwordlast in list(si_sb_irregular.keys()): - llen = len(lowerwordlast) - return '%s%s' % (word[:-llen], - si_sb_irregular[lowerwordlast]) - - if (' '.join(wordsplit[-2:])).lower() in list(si_sb_irregular_compound.keys()): - llen = len(' '.join(wordsplit[-2:])) # TODO: what if 2 spaces between these words? - return '%s%s' % (word[:-llen], - si_sb_irregular_compound[(' '.join(wordsplit[-2:])).lower()]) - - if lowerword[-5:] == 'quies': - return word[:-3] + 'y' - - if lowerword[-7:] == 'persons': - return word[:-1] - if lowerword[-6:] == 'people': - return word[:-4] + 'rson' - -# HANDLE FAMILIES OF IRREGULAR PLURALS - - if lowerword[-4:] == 'mans': - for k, v in si_sb_U_man_mans_bysize.items(): - if lowerword[-k:] in v: - return word[:-1] - for k, v in si_sb_U_man_mans_caps_bysize.items(): - if word[-k:] in v: - return word[:-1] - if lowerword[-3:] == 'men': - return word[:-3] + 'man' - if lowerword[-4:] == 'mice': - return word[:-4] + 'mouse' - if lowerword[-4:] == 'lice': - return word[:-4] + 'louse' - if lowerword[-5:] == 'geese': - return word[:-5] + 'goose' - if lowerword[-5:] == 'teeth': - return word[:-5] + 'tooth' - if lowerword[-4:] == 'feet': - return word[:-4] + 'foot' - - if lowerword == 'dice': - return 'die' - -# HANDLE UNASSIMILATED IMPORTS - - if lowerword[-4:] == 'ceps': - return word - if lowerword[-3:] == 'zoa': - return word[:-1] + 'on' - - for lastlet, d, numend, post in ( - ('s', si_sb_U_ch_chs_bysize, -1, ''), - ('s', si_sb_U_ex_ices_bysize, -4, 'ex'), - ('s', si_sb_U_ix_ices_bysize, -4, 'ix'), - ('a', si_sb_U_um_a_bysize, -1, 'um'), - ('i', si_sb_U_us_i_bysize, -1, 'us'), - ('a', si_sb_U_on_a_bysize, -1, 'on'), - ('e', si_sb_U_a_ae_bysize, -1, ''), - ): - if lowerword[-1] == lastlet: # this test to add speed - for k, v in d.items(): - if lowerword[-k:] in v: - return word[:numend] + post - -# HANDLE INCOMPLETELY ASSIMILATED IMPORTS - - if (self.classical_dict['ancient']): - - if lowerword[-6:] == 'trices': - return word[:-3] + 'x' - if lowerword[-4:] in ('eaux', 'ieux'): - return word[:-1] - if lowerword[-5:] in ('ynges', 'inges', 'anges') and len(word) > 6: - return word[:-3] + 'x' - - for lastlet, d, numend, post in ( - ('a', si_sb_C_en_ina_bysize, -3, 'en'), - ('s', si_sb_C_ex_ices_bysize, -4, 'ex'), - ('s', si_sb_C_ix_ices_bysize, -4, 'ix'), - ('a', si_sb_C_um_a_bysize, -1, 'um'), - ('i', si_sb_C_us_i_bysize, -1, 'us'), - ('s', pl_sb_C_us_us_bysize, None, ''), - ('e', si_sb_C_a_ae_bysize, -1, ''), - ('a', si_sb_C_a_ata_bysize, -2, ''), - ('s', si_sb_C_is_ides_bysize, -3, 's'), - ('i', si_sb_C_o_i_bysize, -1, 'o'), - ('a', si_sb_C_on_a_bysize, -1, 'on'), - ('m', si_sb_C_im_bysize, -2, ''), - ('i', si_sb_C_i_bysize, -1, ''), - ): - if lowerword[-1] == lastlet: # this test to add speed - for k, v in d.items(): - if lowerword[-k:] in v: - return word[:numend] + post - -# HANDLE PLURLS ENDING IN uses -> use - - if (lowerword[-6:] == 'houses' or - word in si_sb_uses_use_case or - lowerword in si_sb_uses_use): - return word[:-1] - -# HANDLE PLURLS ENDING IN ies -> ie - - if word in si_sb_ies_ie_case or lowerword in si_sb_ies_ie: - return word[:-1] - -# HANDLE PLURLS ENDING IN oes -> oe - - if (lowerword[-5:] == 'shoes' or - word in si_sb_oes_oe_case or - lowerword in si_sb_oes_oe): - return word[:-1] - -# HANDLE SINGULAR NOUNS ENDING IN ...s OR OTHER SILIBANTS - - if (word in si_sb_sses_sse_case or - lowerword in si_sb_sses_sse): - return word[:-1] - - if lowerword in si_sb_singular_s_complete: - return word[:-2] - - for k, v in si_sb_singular_s_bysize.items(): - if lowerword[-k:] in v: - return word[:-2] - - if lowerword[-4:] == 'eses' and word[0] == word[0].upper(): - return word[:-2] - -# Wouldn't special words -# ending with 's' always have been caught, regardless of them starting -# with a capital letter (i.e. being names) -# It makes sense below to do this for words ending in 'y' so that -# Sally -> Sallys. But not sure it makes sense here. Where is the case -# of a word ending in s that is caught here and would otherwise have been -# caught below? -# -# removing it as I can't find a case that executes it -# TODO: check this again -# -# if (self.classical_dict['names']): -# mo = search(r"([A-Z].*ses)$", word) -# if mo: -# return "%s" % mo.group(1) - - if lowerword in si_sb_z_zes: - return word[:-2] - - if lowerword in si_sb_zzes_zz: - return word[:-2] - - if lowerword[-4:] == 'zzes': - return word[:-3] - - if (word in si_sb_ches_che_case or - lowerword in si_sb_ches_che): - return word[:-1] - - if lowerword[-4:] in ('ches', 'shes'): - return word[:-2] - - if lowerword in si_sb_xes_xe: - return word[:-1] - - if lowerword[-3:] == 'xes': - return word[:-2] -# (r"(.*)(us)es$", "%s%s"), TODO: why is this commented? - -# HANDLE ...f -> ...ves - - if (word in si_sb_ves_ve_case or - lowerword in si_sb_ves_ve): - return word[:-1] - - if lowerword[-3:] == 'ves': - if lowerword[-5:-3] in ('el', 'al', 'ol'): - return word[:-3] + 'f' - if lowerword[-5:-3] == 'ea' and word[-6:-5] != 'd': - return word[:-3] + 'f' - if lowerword[-5:-3] in ('ni', 'li', 'wi'): - return word[:-3] + 'fe' - if lowerword[-5:-3] == 'ar': - return word[:-3] + 'f' - -# HANDLE ...y - - if lowerword[-2:] == 'ys': - if len(lowerword) > 2 and lowerword[-3] in 'aeiou': - return word[:-1] - - if (self.classical_dict['names']): - if lowerword[-2:] == 'ys' and word[0] == word[0].upper(): - return word[:-1] - - if lowerword[-3:] == 'ies': - return word[:-3] + 'y' - -# HANDLE ...o - - if lowerword[-2:] == 'os': - - if lowerword in si_sb_U_o_os_complete: - return word[:-1] - - for k, v in si_sb_U_o_os_bysize.items(): - if lowerword[-k:] in v: - return word[:-1] - - if lowerword[-3:] in ('aos', 'eos', 'ios', 'oos', 'uos'): - return word[:-1] - - if lowerword[-3:] == 'oes': - return word[:-2] - -# UNASSIMILATED IMPORTS FINAL RULE - - if word in si_sb_es_is: - return word[:-2] + 'is' - -# OTHERWISE JUST REMOVE ...s - - if lowerword[-1] == 's': - return word[:-1] - -# COULD NOT FIND SINGULAR - - return False - -# ADJECTIVES - - def a(self, text, count=1): - ''' - Return the appropriate indefinite article followed by text. - - The indefinite article is either 'a' or 'an'. - - If count is not one, then return count followed by text - instead of 'a' or 'an'. - - Whitespace at the start and end is preserved. - - ''' - mo = search(r"\A(\s*)(?:an?\s+)?(.+?)(\s*)\Z", - text, IGNORECASE) - if mo: - word = mo.group(2) - if not word: - return text - pre = mo.group(1) - post = mo.group(3) - result = self._indef_article(word, count) - return "%s%s%s" % (pre, result, post) - return '' - - an = a - - def _indef_article(self, word, count): - mycount = self.get_count(count) - - if mycount != 1: - return "%s %s" % (count, word) - -# HANDLE USER-DEFINED VARIANTS - - value = self.ud_match(word, self.A_a_user_defined) - if value is not None: - return "%s %s" % (value, word) - -# HANDLE ORDINAL FORMS - - for a in ( - (r"^(%s)" % A_ordinal_a, "a"), - (r"^(%s)" % A_ordinal_an, "an"), - ): - mo = search(a[0], word, IGNORECASE) - if mo: - return "%s %s" % (a[1], word) - -# HANDLE SPECIAL CASES - - for a in ( - (r"^(%s)" % A_explicit_an, "an"), - (r"^[aefhilmnorsx]$", "an"), - (r"^[bcdgjkpqtuvwyz]$", "a"), - ): - mo = search(a[0], word, IGNORECASE) - if mo: - return "%s %s" % (a[1], word) - -# HANDLE ABBREVIATIONS - - for a in ( - (r"(%s)" % A_abbrev, "an", VERBOSE), - (r"^[aefhilmnorsx][.-]", "an", IGNORECASE), - (r"^[a-z][.-]", "a", IGNORECASE), - ): - mo = search(a[0], word, a[2]) - if mo: - return "%s %s" % (a[1], word) - -# HANDLE CONSONANTS - - mo = search(r"^[^aeiouy]", word, IGNORECASE) - if mo: - return "a %s" % word - -# HANDLE SPECIAL VOWEL-FORMS - - for a in ( - (r"^e[uw]", "a"), - (r"^onc?e\b", "a"), - (r"^onetime\b", "a"), - (r"^uni([^nmd]|mo)", "a"), - (r"^u[bcfghjkqrst][aeiou]", "a"), - (r"^ukr", "a"), - (r"^(%s)" % A_explicit_a, "a"), - ): - mo = search(a[0], word, IGNORECASE) - if mo: - return "%s %s" % (a[1], word) - -# HANDLE SPECIAL CAPITALS - - mo = search(r"^U[NK][AIEO]?", word) - if mo: - return "a %s" % word - -# HANDLE VOWELS - - mo = search(r"^[aeiou]", word, IGNORECASE) - if mo: - return "an %s" % word - -# HANDLE y... (BEFORE CERTAIN CONSONANTS IMPLIES (UNNATURALIZED) "i.." SOUND) - - mo = search(r"^(%s)" % A_y_cons, word, IGNORECASE) - if mo: - return "an %s" % word - -# OTHERWISE, GUESS "a" - return "a %s" % word - -# 2. TRANSLATE ZERO-QUANTIFIED $word TO "no plural($word)" - - def no(self, text, count=None): - ''' - If count is 0, no, zero or nil, return 'no' followed by the plural - of text. - - If count is one of: - 1, a, an, one, each, every, this, that - return count followed by text. - - Otherwise return count follow by the plural of text. - - In the return value count is always followed by a space. - - Whitespace at the start and end is preserved. - - ''' - if count is None and self.persistent_count is not None: - count = self.persistent_count - - if count is None: - count = 0 - mo = search(r"\A(\s*)(.+?)(\s*)\Z", text) - pre = mo.group(1) - word = mo.group(2) - post = mo.group(3) - - if str(count).lower() in pl_count_zero: - return "%sno %s%s" % (pre, self.plural(word, 0), post) - else: - return "%s%s %s%s" % (pre, count, self.plural(word, count), post) - -# PARTICIPLES - - def present_participle(self, word): - ''' - Return the present participle for word. - - word is the 3rd person singular verb. - - ''' - plv = self.plural_verb(word, 2) - - for pat, repl in ( - (r"ie$", r"y"), - (r"ue$", r"u"), # TODO: isn't ue$ -> u encompassed in the following rule? - (r"([auy])e$", r"\g<1>"), - (r"ski$", r"ski"), - (r"[^b]i$", r""), - (r"^(are|were)$", r"be"), - (r"^(had)$", r"hav"), - (r"^(hoe)$", r"\g<1>"), - (r"([^e])e$", r"\g<1>"), - (r"er$", r"er"), - (r"([^aeiou][aeiouy]([bdgmnprst]))$", "\g<1>\g<2>"), - ): - (ans, num) = subn(pat, repl, plv) - if num: - return "%sing" % ans - return "%sing" % ans - -# NUMERICAL INFLECTIONS - - def ordinal(self, num): - ''' - Return the ordinal of num. - - num can be an integer or text - - e.g. ordinal(1) returns '1st' - ordinal('one') returns 'first' - - ''' - if match(r"\d", str(num)): - try: - num % 2 - n = num - except TypeError: - if '.' in str(num): - try: - n = int(num[-1]) # numbers after decimal, so only need last one for ordinal - except ValueError: # ends with '.', so need to use whole string - n = int(num[:-1]) - else: - n = int(num) - try: - post = nth[n % 100] - except KeyError: - post = nth[n % 10] - return "%s%s" % (num, post) - else: - mo = search(r"(%s)\Z" % ordinal_suff, num) - try: - post = ordinal[mo.group(1)] - return resub(r"(%s)\Z" % ordinal_suff, post, num) - except AttributeError: - return "%sth" % num - - def millfn(self, ind=0): - if ind > len(mill) - 1: - print3("number out of range") - raise NumOutOfRangeError - return mill[ind] - - def unitfn(self, units, mindex=0): - return "%s%s" % (unit[units], self.millfn(mindex)) - - def tenfn(self, tens, units, mindex=0): - if tens != 1: - return "%s%s%s%s" % (ten[tens], - '-' if tens and units else '', - unit[units], - self.millfn(mindex)) - return "%s%s" % (teen[units], mill[mindex]) - - def hundfn(self, hundreds, tens, units, mindex): - if hundreds: - return "%s hundred%s%s%s, " % (unit[hundreds], # use unit not unitfn as simpler - " %s " % self.number_args['andword'] if tens or units else '', - self.tenfn(tens, units), - self.millfn(mindex)) - if tens or units: - return "%s%s, " % (self.tenfn(tens, units), self.millfn(mindex)) - return '' - - def group1sub(self, mo): - units = int(mo.group(1)) - if units == 1: - return " %s, " % self.number_args['one'] - elif units: - # TODO: bug one and zero are padded with a space but other numbers aren't. check this in perl - return "%s, " % unit[units] - else: - return " %s, " % self.number_args['zero'] - - def group1bsub(self, mo): - units = int(mo.group(1)) - if units: - # TODO: bug one and zero are padded with a space but other numbers aren't. check this in perl - return "%s, " % unit[units] - else: - return " %s, " % self.number_args['zero'] - - def group2sub(self, mo): - tens = int(mo.group(1)) - units = int(mo.group(2)) - if tens: - return "%s, " % self.tenfn(tens, units) - if units: - return " %s %s, " % (self.number_args['zero'], unit[units]) - return " %s %s, " % (self.number_args['zero'], self.number_args['zero']) - - def group3sub(self, mo): - hundreds = int(mo.group(1)) - tens = int(mo.group(2)) - units = int(mo.group(3)) - if hundreds == 1: - hunword = " %s" % self.number_args['one'] - elif hundreds: - hunword = "%s" % unit[hundreds] - # TODO: bug one and zero are padded with a space but other numbers aren't. check this in perl - else: - hunword = " %s" % self.number_args['zero'] - if tens: - tenword = self.tenfn(tens, units) - elif units: - tenword = " %s %s" % (self.number_args['zero'], unit[units]) - else: - tenword = " %s %s" % (self.number_args['zero'], self.number_args['zero']) - return "%s %s, " % (hunword, tenword) - - def hundsub(self, mo): - ret = self.hundfn(int(mo.group(1)), int(mo.group(2)), int(mo.group(3)), self.mill_count) - self.mill_count += 1 - return ret - - def tensub(self, mo): - return "%s, " % self.tenfn(int(mo.group(1)), int(mo.group(2)), self.mill_count) - - def unitsub(self, mo): - return "%s, " % self.unitfn(int(mo.group(1)), self.mill_count) - - def enword(self, num, group): - # import pdb - # pdb.set_trace() - - if group == 1: - num = resub(r"(\d)", self.group1sub, num) - elif group == 2: - num = resub(r"(\d)(\d)", self.group2sub, num) - num = resub(r"(\d)", self.group1bsub, num, 1) - # group1bsub same as - # group1sub except it doesn't use the default word for one. - # Is this required? i.e. is the default word not to beused when - # grouping in pairs? - # - # No. This is a bug. Fixed. TODO: report upstream. - elif group == 3: - num = resub(r"(\d)(\d)(\d)", self.group3sub, num) - num = resub(r"(\d)(\d)", self.group2sub, num, 1) - num = resub(r"(\d)", self.group1sub, num, 1) - elif int(num) == 0: - num = self.number_args['zero'] - elif int(num) == 1: - num = self.number_args['one'] - else: - num = num.lstrip().lstrip('0') - self.mill_count = 0 - # surely there's a better way to do the next bit - mo = search(r"(\d)(\d)(\d)(?=\D*\Z)", num) - while mo: - num = resub(r"(\d)(\d)(\d)(?=\D*\Z)", self.hundsub, num, 1) - mo = search(r"(\d)(\d)(\d)(?=\D*\Z)", num) - num = resub(r"(\d)(\d)(?=\D*\Z)", self.tensub, num, 1) - num = resub(r"(\d)(?=\D*\Z)", self.unitsub, num, 1) - return num - - def blankfn(self, mo): - ''' do a global blank replace - TODO: surely this can be done with an option to resub - rather than this fn - ''' - return '' - - def commafn(self, mo): - ''' do a global ',' replace - TODO: surely this can be done with an option to resub - rather than this fn - ''' - return ',' - - def spacefn(self, mo): - ''' do a global ' ' replace - TODO: surely this can be done with an option to resub - rather than this fn - ''' - return ' ' - - def number_to_words(self, num, wantlist=False, - group=0, comma=',', andword='and', - zero='zero', one='one', decimal='point', - threshold=None): - ''' - Return a number in words. - - group = 1, 2 or 3 to group numbers before turning into words - comma: define comma - andword: word for 'and'. Can be set to ''. - e.g. "one hundred and one" vs "one hundred one" - zero: word for '0' - one: word for '1' - decimal: word for decimal point - threshold: numbers above threshold not turned into words - - parameters not remembered from last call. Departure from Perl version. - ''' - self.number_args = dict(andword=andword, zero=zero, one=one) - num = '%s' % num - - # Handle "stylistic" conversions (up to a given threshold)... - if (threshold is not None and float(num) > threshold): - spnum = num.split('.', 1) - while (comma): - (spnum[0], n) = subn(r"(\d)(\d{3}(?:,|\Z))", r"\1,\2", spnum[0]) - if n == 0: - break - try: - return "%s.%s" % (spnum[0], spnum[1]) - except IndexError: - return "%s" % spnum[0] - - if group < 0 or group > 3: - raise BadChunkingOptionError - nowhite = num.lstrip() - if nowhite[0] == '+': - sign = "plus" - elif nowhite[0] == '-': - sign = "minus" - else: - sign = "" - - myord = (num[-2:] in ('st', 'nd', 'rd', 'th')) - if myord: - num = num[:-2] - finalpoint = False - if decimal: - if group != 0: - chunks = num.split('.') - else: - chunks = num.split('.', 1) - if chunks[-1] == '': # remove blank string if nothing after decimal - chunks = chunks[:-1] - finalpoint = True # add 'point' to end of output - else: - chunks = [num] - - first = 1 - loopstart = 0 - - if chunks[0] == '': - first = 0 - if len(chunks) > 1: - loopstart = 1 - - for i in range(loopstart, len(chunks)): - chunk = chunks[i] - # remove all non numeric \D - chunk = resub(r"\D", self.blankfn, chunk) - if chunk == "": - chunk = "0" - - if group == 0 and (first == 0 or first == ''): - chunk = self.enword(chunk, 1) - else: - chunk = self.enword(chunk, group) - - if chunk[-2:] == ', ': - chunk = chunk[:-2] - chunk = resub(r"\s+,", self.commafn, chunk) - - if group == 0 and first: - chunk = resub(r", (\S+)\s+\Z", " %s \\1" % andword, chunk) - chunk = resub(r"\s+", self.spacefn, chunk) - # chunk = resub(r"(\A\s|\s\Z)", self.blankfn, chunk) - chunk = chunk.strip() - if first: - first = '' - chunks[i] = chunk - - numchunks = [] - if first != 0: - numchunks = chunks[0].split("%s " % comma) - - if myord and numchunks: - # TODO: can this be just one re as it is in perl? - mo = search(r"(%s)\Z" % ordinal_suff, numchunks[-1]) - if mo: - numchunks[-1] = resub(r"(%s)\Z" % ordinal_suff, ordinal[mo.group(1)], - numchunks[-1]) - else: - numchunks[-1] += 'th' - - for chunk in chunks[1:]: - numchunks.append(decimal) - numchunks.extend(chunk.split("%s " % comma)) - - if finalpoint: - numchunks.append(decimal) - - # wantlist: Perl list context. can explictly specify in Python - if wantlist: - if sign: - numchunks = [sign] + numchunks - return numchunks - elif group: - signout = "%s " % sign if sign else '' - return "%s%s" % (signout, ", ".join(numchunks)) - else: - signout = "%s " % sign if sign else '' - num = "%s%s" % (signout, numchunks.pop(0)) - if decimal is None: - first = True - else: - first = not num.endswith(decimal) - for nc in numchunks: - if nc == decimal: - num += " %s" % nc - first = 0 - elif first: - num += "%s %s" % (comma, nc) - else: - num += " %s" % nc - return num - -# Join words with commas and a trailing 'and' (when appropriate)... - - def join(self, words, sep=None, sep_spaced=True, - final_sep=None, conj='and', conj_spaced=True): - ''' - Join words into a list. - - e.g. join(['ant', 'bee', 'fly']) returns 'ant, bee, and fly' - - options: - conj: replacement for 'and' - sep: separator. default ',', unless ',' is in the list then ';' - final_sep: final separator. default ',', unless ',' is in the list then ';' - conj_spaced: boolean. Should conj have spaces around it - - ''' - if not words: - return "" - if len(words) == 1: - return words[0] - - if conj_spaced: - if conj == '': - conj = ' ' - else: - conj = ' %s ' % conj - - if len(words) == 2: - return "%s%s%s" % (words[0], conj, words[1]) - - if sep is None: - if ',' in ''.join(words): - sep = ';' - else: - sep = ',' - if final_sep is None: - final_sep = sep - - final_sep = "%s%s" % (final_sep, conj) - - if sep_spaced: - sep += ' ' - - return "%s%s%s" % (sep.join(words[0:-1]), final_sep, words[-1]) diff --git a/bin/mistune.py b/bin/mistune.py deleted file mode 100644 index 3c82c8e..0000000 --- a/bin/mistune.py +++ /dev/null @@ -1,1154 +0,0 @@ -# coding: utf-8 -""" - mistune - ~~~~~~~ - - The fastest markdown parser in pure Python with renderer feature. - - :copyright: (c) 2014 - 2015 by Hsiaoming Yang. -""" - -import re -import inspect - -__version__ = '0.7.2' -__author__ = 'Hsiaoming Yang ' -__all__ = [ - 'BlockGrammar', 'BlockLexer', - 'InlineGrammar', 'InlineLexer', - 'Renderer', 'Markdown', - 'markdown', 'escape', -] - - -_key_pattern = re.compile(r'\s+') -_nonalpha_pattern = re.compile(r'\W') -_escape_pattern = re.compile(r'&(?!#?\w+;)') -_newline_pattern = re.compile(r'\r\n|\r') -_block_quote_leading_pattern = re.compile(r'^ *> ?', flags=re.M) -_block_code_leading_pattern = re.compile(r'^ {4}', re.M) -_inline_tags = [ - 'a', 'em', 'strong', 'small', 's', 'cite', 'q', 'dfn', 'abbr', 'data', - 'time', 'code', 'var', 'samp', 'kbd', 'sub', 'sup', 'i', 'b', 'u', 'mark', - 'ruby', 'rt', 'rp', 'bdi', 'bdo', 'span', 'br', 'wbr', 'ins', 'del', - 'img', 'font', -] -_pre_tags = ['pre', 'script', 'style'] -_valid_end = r'(?!:/|[^\w\s@]*@)\b' -_valid_attr = r'''"[^"]*"|'[^']*'|[^'">]''' -_block_tag = r'(?!(?:%s)\b)\w+%s' % ('|'.join(_inline_tags), _valid_end) -_scheme_blacklist = ('javascript', 'data', 'vbscript') - - -def _pure_pattern(regex): - pattern = regex.pattern - if pattern.startswith('^'): - pattern = pattern[1:] - return pattern - - -def _keyify(key): - return _key_pattern.sub(' ', key.lower()) - - -def escape(text, quote=False, smart_amp=True): - """Replace special characters "&", "<" and ">" to HTML-safe sequences. - - The original cgi.escape will always escape "&", but you can control - this one for a smart escape amp. - - :param quote: if set to True, " and ' will be escaped. - :param smart_amp: if set to False, & will always be escaped. - """ - if smart_amp: - text = _escape_pattern.sub('&', text) - else: - text = text.replace('&', '&') - text = text.replace('<', '<') - text = text.replace('>', '>') - if quote: - text = text.replace('"', '"') - text = text.replace("'", ''') - return text - - -def escape_link(url, **kwargs): - """Remove dangerous URL schemes like javascript: and escape afterwards.""" - if ':' in url: - scheme, _ = url.split(':', 1) - scheme = _nonalpha_pattern.sub('', scheme) - # whitelist would be better but mistune's use case is too general - if scheme.lower() in _scheme_blacklist: - return '' - # escape &entities; to &entities; - kwargs['smart_amp'] = False - return escape(url, **kwargs) - - -def preprocessing(text, tab=4): - text = _newline_pattern.sub('\n', text) - text = text.replace('\t', ' ' * tab) - text = text.replace('\u00a0', ' ') - text = text.replace('\u2424', '\n') - pattern = re.compile(r'^ +$', re.M) - return pattern.sub('', text) - - -class BlockGrammar(object): - """Grammars for block level tokens.""" - - def_links = re.compile( - r'^ *\[([^^\]]+)\]: *' # [key]: - r']+)>?' # or link - r'(?: +["(]([^\n]+)[")])? *(?:\n+|$)' - ) - def_footnotes = re.compile( - r'^\[\^([^\]]+)\]: *(' - r'[^\n]*(?:\n+|$)' # [^key]: - r'(?: {1,}[^\n]*(?:\n+|$))*' - r')' - ) - - newline = re.compile(r'^\n+') - block_code = re.compile(r'^( {4}[^\n]+\n*)+') - fences = re.compile( - r'^ *(`{3,}|~{3,}) *(\S+)? *\n' # ```lang - r'([\s\S]+?)\s*' - r'\1 *(?:\n+|$)' # ``` - ) - hrule = re.compile(r'^ {0,3}[-*_](?: *[-*_]){2,} *(?:\n+|$)') - heading = re.compile(r'^ *(#{1,6}) *([^\n]+?) *#* *(?:\n+|$)') - lheading = re.compile(r'^([^\n]+)\n *(=|-)+ *(?:\n+|$)') - block_quote = re.compile(r'^( *>[^\n]+(\n[^\n]+)*\n*)+') - list_block = re.compile( - r'^( *)([*+-]|\d+\.) [\s\S]+?' - r'(?:' - r'\n+(?=\1?(?:[-*_] *){3,}(?:\n+|$))' # hrule - r'|\n+(?=%s)' # def links - r'|\n+(?=%s)' # def footnotes - r'|\n{2,}' - r'(?! )' - r'(?!\1(?:[*+-]|\d+\.) )\n*' - r'|' - r'\s*$)' % ( - _pure_pattern(def_links), - _pure_pattern(def_footnotes), - ) - ) - list_item = re.compile( - r'^(( *)(?:[*+-]|\d+\.) [^\n]*' - r'(?:\n(?!\2(?:[*+-]|\d+\.) )[^\n]*)*)', - flags=re.M - ) - list_bullet = re.compile(r'^ *(?:[*+-]|\d+\.) +') - paragraph = re.compile( - r'^((?:[^\n]+\n?(?!' - r'%s|%s|%s|%s|%s|%s|%s|%s|%s' - r'))+)\n*' % ( - _pure_pattern(fences).replace(r'\1', r'\2'), - _pure_pattern(list_block).replace(r'\1', r'\3'), - _pure_pattern(hrule), - _pure_pattern(heading), - _pure_pattern(lheading), - _pure_pattern(block_quote), - _pure_pattern(def_links), - _pure_pattern(def_footnotes), - '<' + _block_tag, - ) - ) - block_html = re.compile( - r'^ *(?:%s|%s|%s) *(?:\n{2,}|\s*$)' % ( - r'', - r'<(%s)((?:%s)*?)>([\s\S]+?)<\/\1>' % (_block_tag, _valid_attr), - r'<%s(?:%s)*?>' % (_block_tag, _valid_attr), - ) - ) - table = re.compile( - r'^ *\|(.+)\n *\|( *[-:]+[-| :]*)\n((?: *\|.*(?:\n|$))*)\n*' - ) - nptable = re.compile( - r'^ *(\S.*\|.*)\n *([-:]+ *\|[-| :]*)\n((?:.*\|.*(?:\n|$))*)\n*' - ) - text = re.compile(r'^[^\n]+') - - -class BlockLexer(object): - """Block level lexer for block grammars.""" - grammar_class = BlockGrammar - - default_rules = [ - 'newline', 'hrule', 'block_code', 'fences', 'heading', - 'nptable', 'lheading', 'block_quote', - 'list_block', 'block_html', 'def_links', - 'def_footnotes', 'table', 'paragraph', 'text' - ] - - list_rules = ( - 'newline', 'block_code', 'fences', 'lheading', 'hrule', - 'block_quote', 'list_block', 'block_html', 'text', - ) - - footnote_rules = ( - 'newline', 'block_code', 'fences', 'heading', - 'nptable', 'lheading', 'hrule', 'block_quote', - 'list_block', 'block_html', 'table', 'paragraph', 'text' - ) - - def __init__(self, rules=None, **kwargs): - self.tokens = [] - self.def_links = {} - self.def_footnotes = {} - - if not rules: - rules = self.grammar_class() - - self.rules = rules - - def __call__(self, text, rules=None): - return self.parse(text, rules) - - def parse(self, text, rules=None): - text = text.rstrip('\n') - - if not rules: - rules = self.default_rules - - def manipulate(text): - for key in rules: - rule = getattr(self.rules, key) - m = rule.match(text) - if not m: - continue - getattr(self, 'parse_%s' % key)(m) - return m - return False # pragma: no cover - - while text: - m = manipulate(text) - if m is not False: - text = text[len(m.group(0)):] - continue - if text: # pragma: no cover - raise RuntimeError('Infinite loop at: %s' % text) - return self.tokens - - def parse_newline(self, m): - length = len(m.group(0)) - if length > 1: - self.tokens.append({'type': 'newline'}) - - def parse_block_code(self, m): - # clean leading whitespace - code = _block_code_leading_pattern.sub('', m.group(0)) - self.tokens.append({ - 'type': 'code', - 'lang': None, - 'text': code, - }) - - def parse_fences(self, m): - self.tokens.append({ - 'type': 'code', - 'lang': m.group(2), - 'text': m.group(3), - }) - - def parse_heading(self, m): - self.tokens.append({ - 'type': 'heading', - 'level': len(m.group(1)), - 'text': m.group(2), - }) - - def parse_lheading(self, m): - """Parse setext heading.""" - self.tokens.append({ - 'type': 'heading', - 'level': 1 if m.group(2) == '=' else 2, - 'text': m.group(1), - }) - - def parse_hrule(self, m): - self.tokens.append({'type': 'hrule'}) - - def parse_list_block(self, m): - bull = m.group(2) - self.tokens.append({ - 'type': 'list_start', - 'ordered': '.' in bull, - }) - cap = m.group(0) - self._process_list_item(cap, bull) - self.tokens.append({'type': 'list_end'}) - - def _process_list_item(self, cap, bull): - cap = self.rules.list_item.findall(cap) - - _next = False - length = len(cap) - - for i in range(length): - item = cap[i][0] - - # remove the bullet - space = len(item) - item = self.rules.list_bullet.sub('', item) - - # outdent - if '\n ' in item: - space = space - len(item) - pattern = re.compile(r'^ {1,%d}' % space, flags=re.M) - item = pattern.sub('', item) - - # determine whether item is loose or not - loose = _next - if not loose and re.search(r'\n\n(?!\s*$)', item): - loose = True - - rest = len(item) - if i != length - 1 and rest: - _next = item[rest-1] == '\n' - if not loose: - loose = _next - - if loose: - t = 'loose_item_start' - else: - t = 'list_item_start' - - self.tokens.append({'type': t}) - # recurse - self.parse(item, self.list_rules) - self.tokens.append({'type': 'list_item_end'}) - - def parse_block_quote(self, m): - self.tokens.append({'type': 'block_quote_start'}) - # clean leading > - cap = _block_quote_leading_pattern.sub('', m.group(0)) - self.parse(cap) - self.tokens.append({'type': 'block_quote_end'}) - - def parse_def_links(self, m): - key = _keyify(m.group(1)) - self.def_links[key] = { - 'link': m.group(2), - 'title': m.group(3), - } - - def parse_def_footnotes(self, m): - key = _keyify(m.group(1)) - if key in self.def_footnotes: - # footnote is already defined - return - - self.def_footnotes[key] = 0 - - self.tokens.append({ - 'type': 'footnote_start', - 'key': key, - }) - - text = m.group(2) - - if '\n' in text: - lines = text.split('\n') - whitespace = None - for line in lines[1:]: - space = len(line) - len(line.lstrip()) - if space and (not whitespace or space < whitespace): - whitespace = space - newlines = [lines[0]] - for line in lines[1:]: - newlines.append(line[whitespace:]) - text = '\n'.join(newlines) - - self.parse(text, self.footnote_rules) - - self.tokens.append({ - 'type': 'footnote_end', - 'key': key, - }) - - def parse_table(self, m): - item = self._process_table(m) - - cells = re.sub(r'(?: *\| *)?\n$', '', m.group(3)) - cells = cells.split('\n') - for i, v in enumerate(cells): - v = re.sub(r'^ *\| *| *\| *$', '', v) - cells[i] = re.split(r' *\| *', v) - - item['cells'] = cells - self.tokens.append(item) - - def parse_nptable(self, m): - item = self._process_table(m) - - cells = re.sub(r'\n$', '', m.group(3)) - cells = cells.split('\n') - for i, v in enumerate(cells): - cells[i] = re.split(r' *\| *', v) - - item['cells'] = cells - self.tokens.append(item) - - def _process_table(self, m): - header = re.sub(r'^ *| *\| *$', '', m.group(1)) - header = re.split(r' *\| *', header) - align = re.sub(r' *|\| *$', '', m.group(2)) - align = re.split(r' *\| *', align) - - for i, v in enumerate(align): - if re.search(r'^ *-+: *$', v): - align[i] = 'right' - elif re.search(r'^ *:-+: *$', v): - align[i] = 'center' - elif re.search(r'^ *:-+ *$', v): - align[i] = 'left' - else: - align[i] = None - - item = { - 'type': 'table', - 'header': header, - 'align': align, - } - return item - - def parse_block_html(self, m): - tag = m.group(1) - if not tag: - text = m.group(0) - self.tokens.append({ - 'type': 'close_html', - 'text': text - }) - else: - attr = m.group(2) - text = m.group(3) - self.tokens.append({ - 'type': 'open_html', - 'tag': tag, - 'extra': attr, - 'text': text - }) - - def parse_paragraph(self, m): - text = m.group(1).rstrip('\n') - self.tokens.append({'type': 'paragraph', 'text': text}) - - def parse_text(self, m): - text = m.group(0) - self.tokens.append({'type': 'text', 'text': text}) - - -class InlineGrammar(object): - """Grammars for inline level tokens.""" - - escape = re.compile(r'^\\([\\`*{}\[\]()#+\-.!_>~|])') # \* \+ \! .... - inline_html = re.compile( - r'^(?:%s|%s|%s)' % ( - r'', - r'<(\w+%s)((?:%s)*?)>([\s\S]*?)<\/\1>' % (_valid_end, _valid_attr), - r'<\w+%s(?:%s)*?>' % (_valid_end, _valid_attr), - ) - ) - autolink = re.compile(r'^<([^ >]+(@|:)[^ >]+)>') - link = re.compile( - r'^!?\[(' - r'(?:\[[^^\]]*\]|[^\[\]]|\](?=[^\[]*\]))*' - r')\]\(' - r'''\s*(<)?([\s\S]*?)(?(2)>)(?:\s+['"]([\s\S]*?)['"])?\s*''' - r'\)' - ) - reflink = re.compile( - r'^!?\[(' - r'(?:\[[^^\]]*\]|[^\[\]]|\](?=[^\[]*\]))*' - r')\]\s*\[([^^\]]*)\]' - ) - nolink = re.compile(r'^!?\[((?:\[[^\]]*\]|[^\[\]])*)\]') - url = re.compile(r'''^(https?:\/\/[^\s<]+[^<.,:;"')\]\s])''') - double_emphasis = re.compile( - r'^_{2}([\s\S]+?)_{2}(?!_)' # __word__ - r'|' - r'^\*{2}([\s\S]+?)\*{2}(?!\*)' # **word** - ) - emphasis = re.compile( - r'^\b_((?:__|[^_])+?)_\b' # _word_ - r'|' - r'^\*((?:\*\*|[^\*])+?)\*(?!\*)' # *word* - ) - code = re.compile(r'^(`+)\s*([\s\S]*?[^`])\s*\1(?!`)') # `code` - linebreak = re.compile(r'^ {2,}\n(?!\s*$)') - strikethrough = re.compile(r'^~~(?=\S)([\s\S]*?\S)~~') # ~~word~~ - footnote = re.compile(r'^\[\^([^\]]+)\]') - text = re.compile(r'^[\s\S]+?(?=[\\%s' % (tag, extra, text, tag) - else: - html = m.group(0) - return self.renderer.inline_html(html) - - def output_footnote(self, m): - key = _keyify(m.group(1)) - if key not in self.footnotes: - return None - if self.footnotes[key]: - return None - self.footnote_index += 1 - self.footnotes[key] = self.footnote_index - return self.renderer.footnote_ref(key, self.footnote_index) - - def output_link(self, m): - return self._process_link(m, m.group(3), m.group(4)) - - def output_reflink(self, m): - key = _keyify(m.group(2) or m.group(1)) - if key not in self.links: - return None - ret = self.links[key] - return self._process_link(m, ret['link'], ret['title']) - - def output_nolink(self, m): - key = _keyify(m.group(1)) - if key not in self.links: - return None - ret = self.links[key] - return self._process_link(m, ret['link'], ret['title']) - - def _process_link(self, m, link, title=None): - line = m.group(0) - text = m.group(1) - if line[0] == '!': - return self.renderer.image(link, title, text) - - self._in_link = True - text = self.output(text) - self._in_link = False - return self.renderer.link(link, title, text) - - def output_double_emphasis(self, m): - text = m.group(2) or m.group(1) - text = self.output(text) - return self.renderer.double_emphasis(text) - - def output_emphasis(self, m): - text = m.group(2) or m.group(1) - text = self.output(text) - return self.renderer.emphasis(text) - - def output_code(self, m): - text = m.group(2) - return self.renderer.codespan(text) - - def output_linebreak(self, m): - return self.renderer.linebreak() - - def output_strikethrough(self, m): - text = self.output(m.group(1)) - return self.renderer.strikethrough(text) - - def output_text(self, m): - text = m.group(0) - return self.renderer.text(text) - - -class Renderer(object): - """The default HTML renderer for rendering Markdown. - """ - - def __init__(self, **kwargs): - self.options = kwargs - - def placeholder(self): - """Returns the default, empty output value for the renderer. - - All renderer methods use the '+=' operator to append to this value. - Default is a string so rendering HTML can build up a result string with - the rendered Markdown. - - Can be overridden by Renderer subclasses to be types like an empty - list, allowing the renderer to create a tree-like structure to - represent the document (which can then be reprocessed later into a - separate format like docx or pdf). - """ - return '' - - def block_code(self, code, lang=None): - """Rendering block level code. ``pre > code``. - - :param code: text content of the code block. - :param lang: language of the given code. - """ - code = code.rstrip('\n') - if not lang: - code = escape(code, smart_amp=False) - return '
%s\n
\n' % code - code = escape(code, quote=True, smart_amp=False) - return '
%s\n
\n' % (lang, code) - - def block_quote(self, text): - """Rendering
with the given text. - - :param text: text content of the blockquote. - """ - return '
%s\n
\n' % text.rstrip('\n') - - def block_html(self, html): - """Rendering block level pure html content. - - :param html: text content of the html snippet. - """ - if self.options.get('skip_style') and \ - html.lower().startswith('`` ``

``. - - :param text: rendered text content for the header. - :param level: a number for the header level, for example: 1. - :param raw: raw text content of the header. - """ - return '%s\n' % (level, text, level) - - def hrule(self): - """Rendering method for ``
`` tag.""" - if self.options.get('use_xhtml'): - return '
\n' - return '
\n' - - def list(self, body, ordered=True): - """Rendering list tags like ``
    `` and ``
      ``. - - :param body: body contents of the list. - :param ordered: whether this list is ordered or not. - """ - tag = 'ul' - if ordered: - tag = 'ol' - return '<%s>\n%s\n' % (tag, body, tag) - - def list_item(self, text): - """Rendering list item snippet. Like ``
    1. ``.""" - return '
    2. %s
    3. \n' % text - - def paragraph(self, text): - """Rendering paragraph tags. Like ``

      ``.""" - return '

      %s

      \n' % text.strip(' ') - - def table(self, header, body): - """Rendering table element. Wrap header and body in it. - - :param header: header part of the table. - :param body: body part of the table. - """ - return ( - '\n%s\n' - '\n%s\n
      \n' - ) % (header, body) - - def table_row(self, content): - """Rendering a table row. Like ````. - - :param content: content of current table row. - """ - return '\n%s\n' % content - - def table_cell(self, content, **flags): - """Rendering a table cell. Like ```` ````. - - :param content: content of current table cell. - :param header: whether this is header or not. - :param align: align of current table cell. - """ - if flags['header']: - tag = 'th' - else: - tag = 'td' - align = flags['align'] - if not align: - return '<%s>%s\n' % (tag, content, tag) - return '<%s style="text-align:%s">%s\n' % ( - tag, align, content, tag - ) - - def double_emphasis(self, text): - """Rendering **strong** text. - - :param text: text content for emphasis. - """ - return '%s' % text - - def emphasis(self, text): - """Rendering *emphasis* text. - - :param text: text content for emphasis. - """ - return '%s' % text - - def codespan(self, text): - """Rendering inline `code` text. - - :param text: text content for inline code. - """ - text = escape(text.rstrip(), smart_amp=False) - return '%s' % text - - def linebreak(self): - """Rendering line break like ``
      ``.""" - if self.options.get('use_xhtml'): - return '
      \n' - return '
      \n' - - def strikethrough(self, text): - """Rendering ~~strikethrough~~ text. - - :param text: text content for strikethrough. - """ - return '%s' % text - - def text(self, text): - """Rendering unformatted text. - - :param text: text content. - """ - return escape(text) - - def autolink(self, link, is_email=False): - """Rendering a given link or email address. - - :param link: link content or email address. - :param is_email: whether this is an email or not. - """ - text = link = escape(link) - if is_email: - link = 'mailto:%s' % link - return '%s' % (link, text) - - def link(self, link, title, text): - """Rendering a given link with content and title. - - :param link: href link for ```` tag. - :param title: title content for `title` attribute. - :param text: text content for description. - """ - link = escape_link(link, quote=True) - if not title: - return '%s' % (link, text) - title = escape(title, quote=True) - return '%s' % (link, title, text) - - def image(self, src, title, text): - """Rendering a image with title and text. - - :param src: source link of the image. - :param title: title text of the image. - :param text: alt text of the image. - """ - src = escape_link(src, quote=True) - text = escape(text, quote=True) - if title: - title = escape(title, quote=True) - html = '%s' % html - return '%s>' % html - - def inline_html(self, html): - """Rendering span level pure html content. - - :param html: text content of the html snippet. - """ - if self.options.get('escape'): - return escape(html) - return html - - def newline(self): - """Rendering newline element.""" - return '' - - def footnote_ref(self, key, index): - """Rendering the ref anchor of a footnote. - - :param key: identity key for the footnote. - :param index: the index count of current footnote. - """ - html = ( - '' - '%d' - ) % (escape(key), escape(key), index) - return html - - def footnote_item(self, key, text): - """Rendering a footnote item. - - :param key: identity key for the footnote. - :param text: text content of the footnote. - """ - back = ( - '' - ) % escape(key) - text = text.rstrip() - if text.endswith('

      '): - text = re.sub(r'<\/p>$', r'%s

      ' % back, text) - else: - text = '%s

      %s

      ' % (text, back) - html = '
    4. %s
    5. \n' % (escape(key), text) - return html - - def footnotes(self, text): - """Wrapper for all footnotes. - - :param text: contents of all footnotes. - """ - html = '
      \n%s
        %s
      \n
      \n' - return html % (self.hrule(), text) - - -class Markdown(object): - """The Markdown parser. - - :param renderer: An instance of ``Renderer``. - :param inline: An inline lexer class or instance. - :param block: A block lexer class or instance. - """ - def __init__(self, renderer=None, inline=None, block=None, **kwargs): - if not renderer: - renderer = Renderer(**kwargs) - else: - kwargs.update(renderer.options) - - self.renderer = renderer - - if inline and inspect.isclass(inline): - inline = inline(renderer, **kwargs) - if block and inspect.isclass(block): - block = block(**kwargs) - - if inline: - self.inline = inline - else: - self.inline = InlineLexer(renderer, **kwargs) - - self.block = block or BlockLexer(BlockGrammar()) - self.footnotes = [] - self.tokens = [] - - # detect if it should parse text in block html - self._parse_block_html = kwargs.get('parse_block_html') - - def __call__(self, text): - return self.parse(text) - - def render(self, text): - """Render the Markdown text. - - :param text: markdown formatted text content. - """ - return self.parse(text) - - def parse(self, text): - out = self.output(preprocessing(text)) - - keys = self.block.def_footnotes - - # reset block - self.block.def_links = {} - self.block.def_footnotes = {} - - # reset inline - self.inline.links = {} - self.inline.footnotes = {} - - if not self.footnotes: - return out - - footnotes = filter(lambda o: keys.get(o['key']), self.footnotes) - self.footnotes = sorted( - footnotes, key=lambda o: keys.get(o['key']), reverse=True - ) - - body = self.renderer.placeholder() - while self.footnotes: - note = self.footnotes.pop() - body += self.renderer.footnote_item( - note['key'], note['text'] - ) - - out += self.renderer.footnotes(body) - return out - - def pop(self): - if not self.tokens: - return None - self.token = self.tokens.pop() - return self.token - - def peek(self): - if self.tokens: - return self.tokens[-1] - return None # pragma: no cover - - def output(self, text, rules=None): - self.tokens = self.block(text, rules) - self.tokens.reverse() - - self.inline.setup(self.block.def_links, self.block.def_footnotes) - - out = self.renderer.placeholder() - while self.pop(): - out += self.tok() - return out - - def tok(self): - t = self.token['type'] - - # sepcial cases - if t.endswith('_start'): - t = t[:-6] - - return getattr(self, 'output_%s' % t)() - - def tok_text(self): - text = self.token['text'] - while self.peek()['type'] == 'text': - text += '\n' + self.pop()['text'] - return self.inline(text) - - def output_newline(self): - return self.renderer.newline() - - def output_hrule(self): - return self.renderer.hrule() - - def output_heading(self): - return self.renderer.header( - self.inline(self.token['text']), - self.token['level'], - self.token['text'], - ) - - def output_code(self): - return self.renderer.block_code( - self.token['text'], self.token['lang'] - ) - - def output_table(self): - aligns = self.token['align'] - aligns_length = len(aligns) - cell = self.renderer.placeholder() - - # header part - header = self.renderer.placeholder() - for i, value in enumerate(self.token['header']): - align = aligns[i] if i < aligns_length else None - flags = {'header': True, 'align': align} - cell += self.renderer.table_cell(self.inline(value), **flags) - - header += self.renderer.table_row(cell) - - # body part - body = self.renderer.placeholder() - for i, row in enumerate(self.token['cells']): - cell = self.renderer.placeholder() - for j, value in enumerate(row): - align = aligns[j] if j < aligns_length else None - flags = {'header': False, 'align': align} - cell += self.renderer.table_cell(self.inline(value), **flags) - body += self.renderer.table_row(cell) - - return self.renderer.table(header, body) - - def output_block_quote(self): - body = self.renderer.placeholder() - while self.pop()['type'] != 'block_quote_end': - body += self.tok() - return self.renderer.block_quote(body) - - def output_list(self): - ordered = self.token['ordered'] - body = self.renderer.placeholder() - while self.pop()['type'] != 'list_end': - body += self.tok() - return self.renderer.list(body, ordered) - - def output_list_item(self): - body = self.renderer.placeholder() - while self.pop()['type'] != 'list_item_end': - if self.token['type'] == 'text': - body += self.tok_text() - else: - body += self.tok() - - return self.renderer.list_item(body) - - def output_loose_item(self): - body = self.renderer.placeholder() - while self.pop()['type'] != 'list_item_end': - body += self.tok() - return self.renderer.list_item(body) - - def output_footnote(self): - self.inline._in_footnote = True - body = self.renderer.placeholder() - key = self.token['key'] - while self.pop()['type'] != 'footnote_end': - body += self.tok() - self.footnotes.append({'key': key, 'text': body}) - self.inline._in_footnote = False - return self.renderer.placeholder() - - def output_close_html(self): - text = self.token['text'] - return self.renderer.block_html(text) - - def output_open_html(self): - text = self.token['text'] - tag = self.token['tag'] - if self._parse_block_html and tag not in _pre_tags: - text = self.inline(text, rules=self.inline.inline_html_rules) - extra = self.token.get('extra') or '' - html = '<%s%s>%s' % (tag, extra, text, tag) - return self.renderer.block_html(html) - - def output_paragraph(self): - return self.renderer.paragraph(self.inline(self.token['text'])) - - def output_text(self): - return self.renderer.paragraph(self.tok_text()) - - -def markdown(text, escape=True, **kwargs): - """Render markdown formatted text to html. - - :param text: markdown formatted text content. - :param escape: if set to False, all html tags will not be escaped. - :param use_xhtml: output with xhtml tags. - :param hard_wrap: if set to True, it will use the GFM line breaks feature. - :param parse_block_html: parse text only in block level html. - :param parse_inline_html: parse text only in inline level html. - """ - return Markdown(escape=escape, **kwargs)(text) diff --git a/lib/lang.json b/lib/lang.json deleted file mode 100644 index b0cb469..0000000 --- a/lib/lang.json +++ /dev/null @@ -1,39 +0,0 @@ -{ - "greet":[ - "hi", - "hey", - "howdy", - "good morning", - "good afternoon", - "good day", - "good evening", - "welcome back", - "nice to see you" - ], - "bye":[ - "see you later, space cowboy", - "bye, townie", - "until next time, friend", - "come back whenever" - ], - "friend":[ - "friend", - "pal", - "buddy", - "townie" - ], - "months":{ - "01":"january", - "02":"february", - "03":"march", - "04":"april", - "05":"may", - "06":"june", - "07":"july", - "08":"august", - "09":"september", - "10":"october", - "11":"november", - "12":"december" - } -}