From cf5e5d2fd4578e6ff654e00627f77a360b378bf0 Mon Sep 17 00:00:00 2001 From: Oliver Sherouse Date: Fri, 28 Sep 2018 14:33:05 -0400 Subject: [PATCH 1/7] Inaugurate 0.6.0 development --- quantgov/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/quantgov/__init__.py b/quantgov/__init__.py index 37f7d26..407d4c3 100644 --- a/quantgov/__init__.py +++ b/quantgov/__init__.py @@ -4,4 +4,4 @@ from . import corpus, nlp, ml, utils from .utils import load_driver -__version__ = '0.5.0' +__version__ = '0.6.0.dev' From 2d1599dab043d1216903cbbefb3367f18409e5a3 Mon Sep 17 00:00:00 2001 From: Scott Date: Thu, 17 Jan 2019 13:11:26 -0500 Subject: [PATCH 2/7] adding pos metrics analysis, design word count analysis --- .gitignore | 79 ++++++++++++ quantgov/nlp.py | 181 +++++++++++++++++++++++++-- quantgov/resources/design_words.txt | 1 + quantgov/resources/nltk_pos_tags.txt | 35 ++++++ 4 files changed, 289 insertions(+), 7 deletions(-) create mode 100755 quantgov/resources/design_words.txt create mode 100644 quantgov/resources/nltk_pos_tags.txt diff --git a/.gitignore b/.gitignore index d9d3190..108ee8d 100644 --- a/.gitignore +++ b/.gitignore @@ -88,3 +88,82 @@ ENV/ # Rope project settings .ropeproject + +# PyCharm stuff + +.idea/* +.idea/codeStyles/Project.xml +.idea/encodings.xml +.idea/misc.xml +.idea/modules.xml +.idea/quantgov.iml +.idea/vcs.xml + +queries.sql + +# via JetBrains: +# https://github.com/github/gitignore/blob/master/Global/JetBrains.gitignore + +# User-specific stuff +.idea/**/workspace.xml +.idea/**/tasks.xml +.idea/**/usage.statistics.xml +.idea/**/dictionaries +.idea/**/shelf + +# Generated files +.idea/**/contentModel.xml + +# Sensitive or high-churn files +.idea/**/dataSources/ +.idea/**/dataSources.ids +.idea/**/dataSources.local.xml +.idea/**/sqlDataSources.xml +.idea/**/dynamic.xml +.idea/**/uiDesigner.xml +.idea/**/dbnavigator.xml + +# Gradle +.idea/**/gradle.xml +.idea/**/libraries + +# Gradle and Maven with auto-import +# When using Gradle or Maven with auto-import, you should exclude module files, +# since they will be recreated, and may cause churn. Uncomment if using +# auto-import. +# .idea/modules.xml +# .idea/*.iml +# .idea/modules + +# CMake +cmake-build-*/ + +# Mongo Explorer plugin +.idea/**/mongoSettings.xml + +# File-based project format +*.iws + +# IntelliJ +out/ + +# mpeltonen/sbt-idea plugin +.idea_modules/ + +# JIRA plugin +atlassian-ide-plugin.xml + +# Cursive Clojure plugin +.idea/replstate.xml + +# Crashlytics plugin (for Android Studio and IntelliJ) +com_crashlytics_export_strings.xml +crashlytics.properties +crashlytics-build.properties +fabric.properties + +# Editor-based Rest Client +.idea/httpRequests + +# Android studio 3.1+ serialized cache file +.idea/caches/build_file_checksums.ser \ No newline at end of file diff --git a/quantgov/nlp.py b/quantgov/nlp.py index 8022fd8..0477863 100644 --- a/quantgov/nlp.py +++ b/quantgov/nlp.py @@ -1,6 +1,9 @@ """ quantgov.nlp: Text-based analysis of documents """ + +from __future__ import division + import re import collections import math @@ -11,6 +14,7 @@ try: import nltk.corpus + from nltk import word_tokenize, sent_tokenize, bigrams, trigrams, pos_tag NLTK = True except ImportError: NLTK = None @@ -26,6 +30,10 @@ except LookupError: nltk.download('wordnet') nltk.corpus.wordnet.ensure_loaded() + try: + nltk.pos_tag('A test.') + except LookupError: + nltk.download('averaged_perceptron_tagger') commands = {} @@ -44,7 +52,7 @@ def check_textblob(func, *args, **kwargs): return func(*args, **kwargs) -class WordCounter(): +class WordCounter: cli = utils.CLISpec( help='Word Counter', @@ -62,7 +70,7 @@ class WordCounter(): @staticmethod def get_columns(args): - return ('words',) + return ('words', ) @staticmethod def process_document(doc, word_pattern): @@ -72,7 +80,7 @@ def process_document(doc, word_pattern): commands['count_words'] = WordCounter -class OccurrenceCounter(): +class OccurrenceCounter: cli = utils.CLISpec( help="Term Counter for Specific Words", @@ -130,7 +138,7 @@ def process_document(doc, terms, pattern, total_label): commands['count_occurrences'] = OccurrenceCounter -class ShannonEntropy(): +class ShannonEntropy: lemmas = {} cli = utils.CLISpec( help='Shannon Entropy', @@ -197,7 +205,7 @@ def lemmatize(word): commands['shannon_entropy'] = ShannonEntropy -class ConditionalCounter(): +class ConditionalCounter: cli = utils.CLISpec( help=('Count conditional words and phrases. Included terms are: ' ' "if", "but", "except", "provided", "when", "where", ' @@ -224,7 +232,7 @@ def process_document(doc): commands['count_conditionals'] = ConditionalCounter -class SentenceLength(): +class SentenceLength: cli = utils.CLISpec( help='Sentence Length', @@ -262,7 +270,7 @@ def process_document(doc, precision): commands['sentence_length'] = SentenceLength -class SentimentAnalysis(): +class SentimentAnalysis: cli = utils.CLISpec( help='Performs sentiment analysis on the text', @@ -308,3 +316,162 @@ def process_document(doc, backend, precision): commands['sentiment_analysis'] = SentimentAnalysis + + +class DesignWords: + + cli = utils.CLISpec( + help='Searches for a pre-defined list of words potentially ' + 'associated with design-based standards in text.', + arguments=[ + utils.CLIArg( + flags=('--precision'), + kwargs={ + 'help': 'decimal places to round', + 'default': 2 + } + ) + ] + ) + + @staticmethod + def get_columns(args): + # column names to return + return ('design_word_count', 'design_word_ratio', + 'design_word_ratio2', ) + + @staticmethod + @check_nltk + def process_document(doc, precision): + + # load in design words + # aka weights and measures, chemical compounds, etc. + design_words = [] + with open("quantgov/resources/design_words.txt", 'r') as d: + for l in d: + design_words.append(l.strip()) + design_words = [x.lower().strip() + for x in design_words if x != ""] + + # kill stopwords + stw = set(nltk.corpus.stopwords.words('english')) + design_words = [x for x in design_words if x not in stw] + + # 1-3 grams in design words list + dw1 = set([x for x in design_words + if len(word_tokenize(x)) == 1]) + dw2 = set([x for x in design_words + if len(word_tokenize(x)) == 2]) + dw3 = set([x for x in design_words + if len(word_tokenize(x)) == 3]) + + # tokenize document + tokenized = word_tokenize(doc.text) + + # silly count based on words that might indicate design standards + # aka best practices, etc. + maybe_relevant_count = len([x for x in tokenized + if x in ['standard', + 'practice', + 'best practice']]) + + # single words + token_count = collections.Counter(tokenized) + dw1_count = sum([token_count[x] + for x in token_count.keys() if x in dw1]) + + # bigrams, trigrams + bigrams = [' '.join(x) for x in nltk.bigrams(tokenized)] + trigrams = [' '.join(x) for x in nltk.trigrams(tokenized)] + bigrams = collections.Counter(bigrams) + trigrams = collections.Counter(trigrams) + dw2_count = sum([bigrams[x] + for x in bigrams.keys() if x in dw2]) + dw3_count = sum([trigrams[x] + for x in trigrams.keys() if x in dw3]) + + # final counts + design_word_count = dw1_count + dw2_count + dw3_count + design_word_ratio = design_word_count / sum(token_count.values()) + design_word_ratio2 = design_word_count / len(set(tokenized)) + + # rounds + if precision: + design_word_ratio = round(design_word_ratio, precision) + design_word_ratio2 = round(design_word_ratio2, precision) + + return doc.index + (design_word_count, design_word_ratio, + design_word_ratio2, ) + + +commands['design_words'] = DesignWords + + +class PartsOfSpeech: + + cli = utils.CLISpec( + help='Part of speech tagging and derived metrics', + arguments=[ + utils.CLIArg( + flags=('--precision'), + kwargs={ + 'help': 'decimal places to round', + 'default': 2 + } + ) + ] + ) + + @staticmethod + def get_columns(args): + # column names to return + return ('', '', ) + + @staticmethod + @check_nltk + def process_document(doc, precision): + + # NLTK part of speech tagging + nltk_tags = pos_tag(word_tokenize(doc.text)) + + # all tags + all_tags = [] + with open('quantgov/resources/nltk_pos_tags.txt', 'r') as o: + for x in o.readlines(): + all_tags.append(x.split('|')[0]) + + # count up tags + count_tags = {} + for x in all_tags: + count_tags[x.strip()] = 0 + for x in nltk_tags: + try: + count_tags[x[1]] += 1 + except KeyError: + continue + + word_count = sum(count_tags.values()) + nouns_count = (count_tags['NN'] + count_tags['NNS'] + + count_tags['NNP']) + verbs_count = (count_tags['VB'] + count_tags['VBD'] + + count_tags['VBG'] + count_tags['VBN'] + + count_tags['VBP'] + count_tags['VBZ']) + noun_verb_ratio = (nouns_count + 1) / (verbs_count + 1) + nouns_ratio = (nouns_count + 1) / (word_count + 1) + verbs_ratio = (verbs_count + 1) / (word_count + 1) + proper_nouns_count = count_tags['NNP'] + count_tags['NNPS'] + proper_nouns_ratio = (proper_nouns_count + 1) / (word_count + 1) + + if precision: + noun_verb_ratio = round(noun_verb_ratio, precision) + nouns_ratio = round(nouns_ratio, precision) + proper_nouns_ratio = round(proper_nouns_ratio, precision) + verbs_ratio = round(verbs_ratio, precision) + + return (doc.index + + (noun_verb_ratio, nouns_count, verbs_count, + nouns_ratio, verbs_ratio, + proper_nouns_count, proper_nouns_ratio, )) + + +commands['pos_metrics'] = PartsOfSpeech diff --git a/quantgov/resources/design_words.txt b/quantgov/resources/design_words.txt new file mode 100755 index 0000000..9bfd8b9 --- /dev/null +++ b/quantgov/resources/design_words.txt @@ -0,0 +1 @@ +nickel(II) zirconate vanadium(III) bromide potassium bitartrate calcium oxide germanium(IV) methoxide boron carbide sodium oxide silver telluride vanadium(III) sulfate gallium(III) fluoride sodium hydromethylglycinate hafnium carbide lanthanum chloride silver sulfide Sodium carbonate Sodium hydroxide thorium(III) fluoride potassium persulfate Iron(III) nitrate lanthanum niobate palladium(II) bromide caesium hydride arsenic triselenide phosphorus hexasulfide indium(III) iodide lutetium iodide iron(III) pyrophosphate molybdenum(IV) bromide rhenium(IV) sulfide zinc chloride sodium cyanoborohydride aluminum silicate calcium aluminate sulfur trioxide potassium cyanate Acetic acid calcium acetate technetium(VII) oxide zinc fumarate palladium(IV) sulfide caesium tungstate californium(III) fluoride sodium hydroxide zinc dichromate bromine monochloride zinc dimethyldithiocarbamate cadmium antimonide rhenium(III) iodide barium tartrate tellurium monoxide indium(III) fluoride nonahydrate hafnium phosphide scandium(III) chloride germanium(II) sulfide Ammonium dichromate calcium perchlorate erbium(III) chloride cerium(III) bromide sodium dithionate lanthanum fluoride zinc hydride zinc glycerophosphate platinum(III) iodide titanium(IV) sulfide niobium(IV) carbide potassium tetrafluoroaluminate sodium tartrate praseodymium(IV) oxide lithium metasilicate praseodymium(III) formate lead(II) tungstate arsenic tetrasulfide iron chromite lead(II) orthosilicate iron(III) sulfate iron(II) oxide manganese(II) sulfate Oxalic acid lead(II) bromide sodium chlorate calcium nitrite indium(III) phosphide potassium hexafluorophosphate gallium(III) sulfate ruthenium(V) fluoride sodium sulfate europium(II) fluoride calcium formate krypton difluoride tantalum(III) chloride barium hydrosulfide potassium chromate calcium sulfide uranium(VI) telluride sodium amide Potassium hexacyanoferrate(III) calcium hydroxide indium(I) hydride oxygen(II) fluoride caesium chloride hafnium oxide bismuth(III) oxide Potassium hydrogen phosphate dysprosium(III) bromide uranium(IV) fluoride nickel(II) boride aluminium chloride radium nitrate phosphorus tetrachloride strontium bromide hexahydrate tungsten(IV) fluoride Sodium formate ammonia sodium bifluoride molybdenum(IV) fluoride phosphorus tetrabromide terbium(III) fluoride iridium(II) bromide lithium selenide boron tribromide Lactose bismuth(III) subcarbonate copper(II) oleate lanthanum oxide lithium chloride monohydrate phosphorus decaoxide tungsten(VI) oxytetrabromide sodium vanadate scandium(III) hydride molybdenum(II) iodide zinc hydroxide manganese(VII) oxide protactinium(IV) iodide lead(II) hypophosphite cadmium nitride Zinc chloride protactinium(IV) oxide indium(III) acetate magnesium peroxide potassium hexafluorosilicate niobium(II) oxide plutonium(IV) sulfide copper(II) arsenite sodium cobaltnitrite erbium(III) sulfide zirconium(IV) iodide Ammonium nitrate molybdenum(VI) fluoride sulfur monoxide gallium(I) oxide tungsten(II) chloride chlorine monofluoride magnesium hydroxide copper(I) hydride lanthanum iodide titanium(II) chloride magnesium perchlorate lead(II) formate vanadium(IV) sulfide sodium sulfide gallium(III) nitride iron(III) hypophosphite Potassium permanganate germanium(II) iodide potassium bromate antimony pentachloride nitrogen tribromide lead(II) chloride barium dichromate Calcium sulfate terbium(II) fluoride dinitrogen tetroxide caesium oxide lithium borohydride copper(II) acetylide Acetone dysprosium(II) bromide calcium sulfate potassium tellurite Dimethyl sulfoxide (DMSO) arsenic trihydride zinc chromite radium carbonate strontium selenide tantalum(III) aluminide Zinc nitrate chromium(III) fluoride dinitrogen tetrafluoride strontium sulfite yttrium(III) hydride magnesium antimonide gold(III) iodide barium disilicate hafnium sulfate mercury(II) oxide yttrium(III) boride iron(II) tungstate Sodium tartrate copper(II) tungstate potassium cyanide thorium(IV) bromide Barium chloride germanium(II) fluoride lead(II) sulfate rhenium(IV) fluoride radium chloride gold(III) bromide gallium(III) bromide zirconium(II) oxide sodium perchlorate mercury(I) oxide thallium(I) hexafluorophosphate indium(II) selenide einsteinium(III) chloride thallium(II) chloride Sodium chloride lead(II) carbonate tantalum(IV) oxide niobium(V) oxide iridium(IV) oxide Potassium chloride rubidium selenide barium azide barium fluoride boron nitride lanthanum vanadate niobium(V) bromide lithium metavanadate zirconium(IV) selenide uranium(II) nitride titanium(IV) fluoride germanium(IV) fluoride arsenic diiodide beryllium phosphate nickel(II) bromide sodium iodide tetraarsenic tetrasulfide barium arsenite lanthanum bromide iron(II) sulfide Tin(IV) chloride silver dibromide silver bromate Hydrofluoric acid copper(II) azide bismuth(III) stannate californium(III) bromide bismuth(III) hydride erbium(III) fluoride sodium selenide potassium benzoate ruthenium(IV) telluride arsenic trichloride boron arsenate terbium(III) chloride titanium(III) bromide neptunium(IV) chloride plutonium(III) fluoride platinum(VI) fluoride barium selenite sodium benzosulfonate Lead(II) nitrate Potassium carbonate terbium(III) bromide sodium tripolyphosphate ruthenium(II) iodide tellurium disulfide aluminium antimonide nickel(II) chloride ruthenium(IV) selenide lutetium nitrate potassium formate neodymium(III) telluride Potassium hexacyanoferrate(II) barium molybdate silver cyanide zinc carbonate zinc orthosilicate Hydrazine technetium(IV) chloride palladium(IV) telluride lead(II) stearate beryllium nitrate antimony triiodide zinc pyrophosphate Ammonium acetate cobalt(III) titanate disulfur monoxide technetium(V) fluoride holmium vanadate cadmium sulfide cadmium arsenide samarium(III) bromide potassium niobate lutetium telluride strontium chloride ytterbium(III) sulfide zinc dimethyldithiocarbonate lithium borate lutetium chloride platinum(II) bromide Maleic acid sodium adipate thallium(I) molybdate cobalt(II) oxalate calcium tungstate zirconium(IV) silicate Ammonium chloride Sodium hydrogen tartrate technetium(IV) bromide aluminium nitride iron(III) phosphate arsenic monoxide tellurium trioxide sodium perrhenate aluminium bromide hexahydrate iron sodium acetate barium perrhenate iron(II) bromide vanadium(IV) chloride potassium perrhenate titanium(II) oxide aluminium sulfide Hydrogen peroxide antimony tritelluride indium(III) fluoride hafnium(IV) fluoride praseodymium(III) iodide nitrogen dioxide sodium borofluoride cadmium titanate chromium(III) chloride yttrium(III) bromide tantalum(V) fluoride lanthanum formate silver carbonate zinc nitride lithium nitride dysprosium(II) iodide boron trifluoride dysprosium(III) iodide niobium(V) iodide ytterbium(III) bromide strontium nitrite Chloroplatinic acid potassium ferrate sodium hypochlorite phosphorus hexoxide copper(II) selenide iodine monoazide tellurium tetraiodide sodium antimonate bismuth(III) oxychloride radium iodide Calcium chloride barium formate curium(III) oxide curium(III) chloride sodium hexafluoroaluminate calcium chromate bismuth(III) oxybromide selenium disulfide Glucose lithium tetrahydridoaluminate yttrium(III) trifluoromethanesulfonate Ammonium oxalate osmium(IV) fluoride potassium dimethyldithiocarbamate iridium(III) bromide iron(III) ferrocyanide zinc stannate calcium permanganate platinum(III) bromide potassium peroxide rhenium(IV) iodide caesium acetate Nicotine beryllium oxide sodium bismuthate zirconium(IV) hydroxide caesium iodide einsteinium(II) iodide calcium bromide nickel(II) nitrate thallium(I) oxalate nickel(II) sulfide Sodium sulfide protactinium(IV) chloride einsteinium(III) iodide nickel(II) iodide lanthanum nitride calcium hydrogenorthophosphate silver chromate gallium(II) oxide zinc bromide zinc iodide zirconium(IV) sulfide thulium(III) bromide Mercury(II) chloride sodium selenite cadmium niobate gold(III) selenate aluminium selenide lanthanum silicide lithium peroxide bismuth(III) citrate titanium(II) fluoride germanium(IV) nitride molybdenum(IV) sulfide sodium tetracholoraluminate ytterbium(II) telluride molybdenum(VI) chloride praseodymium(II) selenide silicon bromide erbium(III) chloride hexahydrate tungsten(V) bromide Ammonium carbonate barium pyrophosphate barium thiocyanate indium(III) fluoride trihydrate silicon sulfide sodium chloride magnesium chloride titanium(III) nitride chromium(III) nitride iron(II) chloride tetrahydrate cobalt(III) acetylacetonate lithium zirconate praseodymium(III) nitride fluorine nitrate gold(III) selenide indium(III) arsenide ruthenium(VI) oxide cerium(III) oxide zinc phosphate Perchloric acid iridium(II) iodide potassium iodide arsenic monophosphide disulfur diiodide magnesium arsenate lithium phosphate molybdenum(III) chloride lanthanum boride molybdenum(III) iodide gadolinium(III) telluride sodium arsenite magnesium technetate manganese(II,III) oxide disulfur dibromide Silver nitrate phosphorus thiochloride sodium orthosilicate cerium(II) hydride sodium nitrate neptunium(V) oxide beryllium aluminate thallium(I) nitrate zinc telluride phosphorus tetrafluoride rubidium fluoride technetium(VI) chloride nickel(II) chromate lead(II) antimonate dinitrogen difluoride Lead(IV) acetate tellurium tetrafluoride rhenium(VII) fluoride gadolinium(III) chloride iron(III) chloride lithium titanate manganese(II) nitrate Sodium dihydrogen phosphate osmium(IV) bromide lead(II) iodide gallium(II) telluride manganese(II) sulfide cobalt(II) telluride terbium(IV) fluoride Sodium sulfite Mercury(II) sulfate zinc thiocyanate thulium(II) niobate antimony tribromide cobalt(II) chloride hexahydrate tantalum(IV) sulfide lead(II) oleate zirconium(II) chloride neodymium(III) nitride silver hexafluorophosphate Aluminium sulfate technetium(IV) sulfide titanium(III) sulfate Iron(II) sulfate sodium formaldehydesulfoxylate chromium(II) bromide Maltose sodium tungstate Sodium chromate boron triiodide copper(II) borate ruthenium(III) fluoride calcium telluride thallium(I) fluoride xenon trioxide rhenium(IV) oxide chlorine heptoxide silicon dioxide vanadium(II) bromide caesium azide aluminium lanthanum hydride indium(III) phosphate thallium(III) iodate cadmium iodate aluminum carbide lead(IV) bromide tin(IV) sulfide Sodium arsenate niobium(IV) oxide caesium fluoride copper(II) oxalate thorium(III) iodide sodium tetraphenylborate manganese(II) carbonate thulium(III) chloride gallium(III) antimonide iron(II) orthosilicate iron(II) selenide potassium hexafluorozirconate cobalt(II) fluoride copper(II) formate platinum(IV) bromide lanthanum chloride heptahydrate germanium(IV) sulfide Hydrobromic acid gadolinium(II) silicide calcium silicate europium(II) chloride einsteinium(II) bromide antimony trinitride thorium(IV) oxide lead(II) selenide dysprosium(II) chloride lead(III) acetate yttrium(III) arsenide thorium(II) sulfide thorium(IV) orthosilicate thallium(I) oxide platinum(IV) selenide neodymium(III) formate calcium propionate arsenic pentafluoride Copper(II) nitrate thorium(IV) silicide erbium(III) sulfate lanthanum chloride trihydrate silver fulminate bismuth(III) orthoniobate potassium bifluoride potassium hexafluoroaluminate palladium(II) nitrate niobium(VI) selenide Resorcinol caesium iodate silver iodate magnesium selenide aluminium diboride magnesium telluride tungsten(VI) dioxydibromide Lithium chloride tellurium dioxide antimony trihydride silver acetate osmium(V) chloride caesium amide aluminium chloride hexahydrate vanadium(IV) iodide manganese(IV) fluoride sodium phenylacetate magnesium stearate Sodium thiosulfate ammonium hydroxide sodium ethenesulfonate holmium iodide manganese(III) fluoride selenium hexafluoride rubidium periodate copper(II) chloride Aluminium nitrate bismuth(III) iodide cobalt(II) iodate Diethyl ether niobium(IV) sulfide terbium(III) formate Potassium dihydrogen phosphate cobalt(II) chromite neodymium(II) selenide rhenium(V) bromide xenon oxytetrafluoride trisilver trichloride arsenic triphosphide chromium(IV) chloride platinum(V) fluoride sodium fluorophosphate sulfur hexafluoride Sodium phosphate silver selenate berkelium(III) iodide samarium(II) fluoride chromium(III) sulfide phosphorus triselenide barium acetate sodium carbonate zinc permanganate neodymium(III) vanadate californium(III) iodide arsenic trioxide niobium(III) iodide rubidium iodide titanium(III) chloride strontium bromide zirconium(IV) sulfate bismuth(III) titanate Copper(II) sulfate strontium silicate tin(II) oxalate samarium(III) telluride holmium fluoride yttrium(III) oxide cobalt(II) orthosilicate barium nitrite bromine pentafluoride bromine azide copper(I) bromide yttrium(III) sulfide iodine trichloride zinc fluoride berkelium(IV) oxide mercury(II) thiocyanate calcium molybdate tungsten(II) bromide chlorine hexoxide iron(II) aluminate cobalt(III) oxide Barium nitrate indium(III) selenide iron(II) telluride praseodymium(II) sulfide phosphorus oxybromide titanium(IV) sulfate vanadium(II) oxide ytterbium(III) selenide berkelium(III) bromide zinc succinate dinitrogen monoxide strontium stannate arsenic pentasulfide tantalum(IV) telluride sodium formate barium hydroxide uranium(IV) carbide lithium chloride sodium thiosulfate thallium(I) cyanide europium(III) bromide chromium(III) oxide iron(III) chloride hexahydrate arsenic oxybromide sodium ethoxide calcium chloride calcium titanate curium(IV) fluoride europium(II) vanadate nickel(II) oxide thallium(I) perchlorate tin(IV) chloride bismuth(III) bromide cadmium azide caesium perchlorate arsenic tribromide gadolinium(III) iodide barium silicide dysprosium(III) formate sodium caprylate iron(II) carbonate calcium hypochlorite xenon monofluoride thorium(III) nitride iron(II) molybdate radium oxide Tin(II) chloride nickel(II) cyanide bismuth(III) selenide erbium(II) sulfide scandium(II) hydride potassium azide boric acid sodium stearate europium(II) telluride germanium(II) selenide lead(IV) hydride caesium tetraiodozincate lutetium nitride palladium(II) trifluoroacetate thallium(I) hydroxide germanium(II) oxide lithium methoxide lutetium oxide alumimum nitrate cerium(III) fluoride lithium hexafluorophosphate lead(II) nitrate Pentan-1-ol osmium(III) chloride trihydrate rubidium oxide cerium(III) formate rhenium(VI) chloride copper(I) iodide uranium(V) fluoride mercury(II) nitrate indium(II) chloride potassium metaphosphate mercury(II) bromide potassium fluoride strontium carbonate copper(II) tetrafluoroborate calcium fumarate caesium cyanide phosphorus pentachloride beryllium sulfate rhenium(V) fluoride silicon chloride zinc arsenate vanadium(II) iodide tungsten(VI) sulfide dysprosium(II) silicide molybdenum(IV) silicide silver perchlorate calcium phosphate sodium methoxide diiodine tetroxide lead(II) titanate bromine monofluoride sodium propionate niobium(III) chloride iron(II) fluoride sodium silicate Chloroform cerium(II) sulfide silicon carbide tungsten(III) chloride ruthenium(IV) oxide barium selenide lithium dihydrogenphosphate iridium(III) sulfide antimony trisulfate Butan-1-ol calcium oxalate lead(II) tantalate Sulfuric acid protactinium(IV) bromide calcium iodate zinc peroxide chromium(II) chloride barium permanganate gadolinium(III) formate iridium(V) fluoride samarium(III) iodide strontium thiocyanate californium(IV) oxide vanadium(IV) carbide zinc chlorate sodium aluminate potassium phosphate bismuth(V) fluoride iron(II) iodide mercury(II) hydride zirconium(IV) bromide cadmium carbonate potassium ethoxide copper(II) fluoride sodium malonate silicon fluoride cobalt(II) perchlorate Potassium tartrate selenium oxybromide barium chlorate lithium aluminate niobium(IV) chloride cobalt(III) nitrate iron(III) hydroxide nickel(II) stannate thorium(IV) selenide tin(IV) hydride potassium metavanadate ruthenium(II) bromide promethium(III) chloride thorium(IV) carbide tantalum(V) chloride sodium thioantimonide calcium carbonate plutonium(IV) fluoride erbium(III) telluride calcium tetrahydroaluminate lead(II) hydroxide neptunium(IV) fluoride yttrium(III) vanadate sodium sulfide nonahydrate caesium nitrate silver subfluoride iron(III) oxide lead(II) chromate Cadmium nitrate Iron(II) ammonium sulfate sodium iodate chlorine dioxide iron(II,III) oxide selenium bromide gadolinium(III) fluoride lithium molybdate cadmium nitrate Strontium nitrate potassium periodate radium bromide manganese(IV) telluride protactinium(III) iodide arsenic tetraselenide protactinium(V) oxide iron(II) chloride sodium thiocyanate ytterbium(II) chloride tungsten(VI) dioxydiiodide osmium(VI) fluoride einsteinium(III) bromide vanadium(III) chloride lead(II) zirconate germanium(II) telluride barium peroxide iron(II) hydroxide Sodium citrate thallium(I,III) bromide cerium(III) oxalate potassium bromide Strontium sulfate indium(I) iodide copper(I) sulfide germanium(IV) chloride gadolinium(III) nitride vanadium(I) hydride barium hydride titanium(II) iodide sodium dithionite cadmium telluride tin(II) fluoroborate iridium(IV) selenide iron(II) phosphate tantalum(V) oxide Acetamide Potassium sulfite Sodium hydrogen phosphate bismuth(III) sulfide barium bromide iodine monochloride phosphorus pentabromide nickel(III) sulfide samarium(II) chloride magnesium silicite Methyl acetate tantalum(IV) carbide platinum(IV) iodide Ethylene glycol gold(III) oxide potassium hydride Ammonia chromium(III) picolinate copper(II) perchlorate silver sulfate potassium tetrachlorocuprate calcium fluoride sodium lactate potassium tetrachloropalladate promethium(III) bromide antimony pentafluoride bismuth(III) oxalate caesium bromide tungsten(IV) sulfide magnesium bromide hexahydrate scandium(III) bromide Chromium(VI) oxide mercury(I) chloride sodium hexametaphosphate silver oxalate potassium hexafluorotitanate palladium(II) oxide lithium oxide arsenic tritelluride cerium(III) chloride protactinium(IV) fluoride Lead(II) chloride calcium boride nickel(II) hydroxide ytterbium(III) chloride hexahydrate chromium(VI) carbonyl Zinc bromide osmium(III) bromide lithium perrhenate molybdenum(II) oxide uranium(VI) carbide Dichloroacetic acid caesium selenide copper(II) stearate europium(III) nitride nickel(II) antimonide xenon hexafluoride lead(II) lactate tantalum(IV) chloride rhenium(IV) telluride lithium hydride Citric acid lead(II) oxide europium(II) selenide sodium hexafluorophosphate thallium(I) selenide plutonium(III) chloride uranium(V) bromide tungsten(VI) dioxydichloride Aluminium chloride Magnesium sulfate ytterbium(II) iodide iron(II) fluoride tetrahydrate fluorine perchlorate rhodium(VI) fluoride disulfur decafluoride Lead(II) acetate cobalt(II) chromate silver nitrate mercury(II) fluoride Potassium iodide berkelium(II) oxide ruthenium(VI) fluoride samarium(III) sulfide scandium(III) oxide osmium(III) iodide cadmium cyanide gallium(III) selenide tantalum(III) nitride sodium sulfite rhodium(IV) oxide iron(II) sulfate Sodium sulfate chromium(III) sulfate phosphorus pentasulfide arsenic oxychloride potassium carbonate yttrium(III) antimonide protactinium(V) fluoride gold(I) chloride strontium peroxide thallium(I) sulfide molybdenum(III) fluoride cobalt(II) molybdate lithium bromide germanium(IV) bromide rhenium(VII) sulfide sodium sulfide pentahydrate gadolinium(III) chloride hexahydrate lithium hydroxide dysprosium(III) fluoride chromium(II,III) oxide magnesium succinate iridium(IV) bromide indium(II) telluride titanium(IV) bromide lithium metaborate bismuth(III) subnitrate cadmium acetate calcium nitrate plutonium(III) hydride barium selenate Potassium nitrite copper(II) chromate arsenic pentachloride beryllium sulfide rhenium(VI) oxide thorium(IV) sulfide manganese(II) acetate potassium nitrate beryllium sulfate trihydrate chromium(III) iodide scandium(III) nitrate palladium(II) telluride Potassium bicarbonate gallium(III) hydroxide chromium(III) bromide rhodium(IV) fluoride palladium(II) sulfate plutonium(VI) fluoride cobalt(III) sulfide radium hydroxide terbium(IV) oxide phosphorus triiodide gadolinium(III) sulfide mercury(II) acetate barium manganate thallium(III) oxide iridium(III) fluoride gallium(III) hydride palladium(II) fluoride lithium benzoate copper(I) chloride indium(I,III) iodide Hydrocyanic acid lead(II) metavandate barium perchlorate Cobalt(II) nitrate thallium(I) chlorate silver chloride sulfur tetrachloride berkelium(III) chloride Copper(II) chloride hafnium nitride gold(II) selenide gallium(III) phosphide curium(II) oxide potassium nitrite potassium adipate sodium tetraborate tungsten(VI) chloride chromium(IV) fluoride niobium(IV) selenide Copper(I) chloride indium(II) sulfide rhenium(VI) fluoride fluorine monoxide americium dioxide lead(IV) oxide bismuth(III) oxynitrate osmium(I) iodide cadmium iodide caesium chlorate manganese(II) telluride calcium silicide nickel(II) arsenide chlorine trioxide terbium(IV) silicide barium titanate zinc tungstate terbium(III) sulfide molybdenum(III) hexacarbonyl xenon tetroxide uranium(III) bromide Barium hydroxide lithium iodide Glycerol technetium(IV) oxide silver azide lead(II) perchlorate zinc cyanide barium tungstate germanium(IV) selenide zirconium(IV) silicide sodium chlorite silver bromide neodymium(III) iodide hafnium chloride zinc stearate Lysergic acid diethylamide (LSD) zirconium(III) nitride arsenic trifluoride californium(IV) fluoride barium sulfate Phenol molybdenum(VI) oxide potassium chlorate strontium titanate antimony trifluoride barium bromide dihydrate uranium(IV) bromide phosphorus trichloride nickel(II) acetate zirconium(IV) carbide aluminium phosphate selenium tetrachloride potassium superoxide Lactic acid boron arsenide platinum(IV) chloride tantalum hydride potassium binoxalate berkelium(III) oxide rubidium hydroxide zirconium(IV) phosphate selenium monosulfide tellurium hexafluoride niobium(III) bromide Potassium sulfate palladium(II) iodide zinc caprylate potassium sulfate gold(I) sulfide dysprosium(III) sulfide protactinium(V) bromide Nitric acid tin(II) chloride potassium pyrophosphate Sodium bromide silver tetrachloroaluminate cerium(III) sulfide indium(I) chloride titanium(III) iodide copper(II) bromide gallium(III) telluride iron(II) chloride dihydrate yttrium(II) carbide titanium(III) oxide Potassium chlorate iron(III) fluoride tin(IV) selenide zirconium(IV) fluoride Urea vanadium(V) fluoride gold(III) fluoride tantalum(V) bromide chromium(III) telluride copper(II) chloride dihydrate lithium sulfate zirconium(IV) chloride Carbon disulfide sodium persulfate thulium(III) iodide caesium nitrite silicon boride neptunium(IV) oxide copper(II) nitrate beryllium hydride rhodium(V) fluoride bismuth(III) formate antimony trisulfide boron phosphate uranium(IV) iodide arsenic diphosphide calcium cyanamide erbium(III) boride tin(IV) oxide barium chromate praseodymium(III) fluoride gold(I,III) chloride germanium(IV) hydride erbium(III) formate tungsten(VI) bromide barium boride plutonium(III) oxide niobium(III) nitride americium(III) chloride disulfur dichloride uranium(IV) chloride mercury(II) perchlorate barium thiosulfate lanthanum carbide samarium(III) fluoride potassium trifluoromethanesulfonate europium(III) sulfate iron(II) titanate lithium formate strontium hydride tetrasulfur tetranitride chromium(III) phosphate ruthenium(II) chloride samarium(II) iodide zinc titanate sodium antimonide lead(II) cyanide antimony pentasulfide potassium thiocyanate platinum(II) oxide californium(II) bromide germanium(IV) iodide palladium(IV) selenide Potassium dichromate sulfur tetrafluoride caesium vanadate strontium tungstate dysprosium(III) chloride chromium(II) selenide Acetaldehyde praseodymium(III) chloride Hexafluorosilicic acid aluminium formate copper(II) sulfate iron(III) iodide thallium(I) selenate vanadium(IV) fluoride sodium tosylate europium(II) sulfide Chromium(III) nitrate lead(II) selenate lead(II) molybdate copper(I) selenide strontium molybdate cobalt(II) tungstate calcium citrate manganese(II) oxide thulium(III) fluoride Ammonium hydroxide gadolinium(III) bromide vanadium(IV) telluride fluorine dioxide silver selenide bromine dioxide molybdenum(IV) oxide uranium(V) chloride silver thiocyanate sodium acrylate lead(II) sulfide cadmium oxalate mercury(I) bromide aluminium sulfate lithium diisopropylamide nickel(II) tungstate zinc nitrate tin(IV) bromide tungsten(VI) oxytetrafluoride Iron(III) chloride rubidium peroxide Tartaric acid thulium(II) chloride bismuth(III) vanadate berkelium(IV) fluoride lithium sulfide chlorine pentafluoride beryllium chloride caesium peroxide sodium azide actinium(III) oxide gold(I) bromide barium chloride zirconium(IV) tungstate zinc chromate bismuth(IV) peroxide platinum(II) chloride barium niobate cobalt(II) nitrate hexahydrate caesium periodate nickel(III) oxide gallium(III) iodide silver acetylide cobalt(II) chloride bromine trifluoride rubidium nitrate europium(III) oxide potassium dichromate selenium difluoride dysprosium(III) hydride sodium peroxide selenium dichloride iodine monobromide thulium(III) oxide selenium trioxide lithium perchlorate gold(I) iodide Potassium chromate barium sulfide Magnesium nitrate platinum(II) cyanide cobalt(III) oxide monohydrate silver molybdate neptunium(III) chloride Nickel nitrate iron(III) arsenic nitrogen trifluoride Cobalt(II) sulfate cobalt(IV) sulfide iridium(III) iodide tantalum(II) oxide Potassium iodate zirconium(IV) oxide xenon tetrafluoride molybdenum(V) chloride antimony pentoxide silver tribromide sodium phosphate thallium(I) carbonate holmium chloride rhenium(III) chloride californium(III) oxide gold(I) cyanide tin(II) fluoride Strontium chloride potassium pyrosulfate calcium nitride cobalt(II) naphthenate lead(II) butanoate zinc oxalate thallium(I) iodide technetium(VI) fluoride platinum(II) telluride sodium cyanide osmium(III) chloride platinum(IV) telluride potassium acetate xenon dioxydifluoride cobalt(II) oxide plutonium(III) bromide caesium molybdate potassium perchlorate berkelium(III) fluoride cobalt(II) nitrate sodium tetrafluoroaluminate beryllium nitrate trihydrate beryllium selenide molybdenum(II) chloride sodium selenate europium(III) iodide silicon hydride arsenic triiodide aluminium hydroxide tin(IV) iodide rubidium chloride Chloroacetic acid bismuth(III) fluoride gallium(III) oxide tungsten(V) oxytrichloride Potassium bromide uranium(V) oxide silver perrhenate Iron(III) sulfate vanadium(II) chloride thallium(III) chloride rubidium iodate strontium nitrate potassium fluoroborate ruthenium(III) bromide silver fluoride gold(II) telluride molybdenum(V) oxide lutetium bromide molybdenum(IV) iodide rubidium hydride platinum(III) chloride terbium(III) selenide magnesium nitrate curium(III) bromide indium(II) bromide neptunium(II) oxide vanadium(III) sulfide uranium(VI) chloride phosphorus tetrasulfide iridium(VI) telluride indium(III) bromide iron(III) dichromate curium(III) fluoride sodium bicarbonate cobalt(II) stannate uranium(II) sulfide palladium(IV) fluoride platinum(IV) oxide bismuth(III) phosphate strontium chromate caesium chromate potassium perruthenate potassium aluminate caesium sulfide holmium sulfide cobalt(II) cyanide calcium iodide thorium(IV) fluoride phosphorus tetrahydride platinum(II) sulfide potassium hexachlororuthenate yttrium(III) formate molybdenum(III) nitride phosphorus pentafluoride holmium formate gadolinium(III) oxide platinum(IV) chloride pentaahydrate Calcium hydroxide sodium superoxide cadmium oxide selenium tetrabromide zinc oxide thallium(I) acetate lithium telluride copper(II) cyanide caesium oxalate ferric nitrate gold(V) fluoride beryllium acetylacetonate Chloroauric acid uranium(III) hydride strontium zirconate selenium tetrafluoride thallium(III) nitrate neodymium(III) fluoride sodium cyanate rhenium(IV) bromide thorium(II) carbonate vanadium(IV) oxide iron(III) oxalate iron(III) metavanadate manganese(II) selenide boron phosphide beryllium iodide osmium(VIII) oxide europium(III) fluoride sodium periodate water sodium bisulfide zinc ferrocyanide lead(II) telluride rubidium superoxide cadmium chloride rubidium acetylacetonate uranyl nitrate bismuth(III) molybdate Saccharose cobalt(II) aluminate scandium(III) telluride diboron tetrachloride lead(II) chlorate magnesium carbonate neodymium(II) chloride plutonium(II) sulfide silver(I,III) oxide zirconium(II) hydride germanium(II) bromide Manganese(II) chloride tellurium tetrabromide plutonium(III) nitride selenium dibromide iodine monofluoride thulium(II) iodide chlorine trifluoride molybdenum(III) sulfide neptunium(V) fluoride Potassium thiocyanate dysprosium(III) nitride vanadium(V) oxytrifluoride gallium(III) fluoride trihydrate lithium fluoride scandium(III) sulfide arsenic pentoxide TRIS silicon iodide strontium dithionate titanium(III) fluoride vanadium(III) iodide samarium(II) bromide rhodium(III) iodide curium(III) iodide lutetium sulfide promethium(III) oxide titanium(IV) ethoxide beryllium nitrate tetrahydrate sodium metavanadate copper(I) telluride manganese(II) fluoride neptunium(III) fluoride uranium(IV,V) oxide chromium(VI) fluoride ytterbium(III) iodide boron decahydride calcium dihydrogen phosphate monohydrate beryllium telluride sulfur dichloride samarium(III) oxide ammonium sulfate tungsten(VI) oxide vanadium(V) oxytrichloride zinc perchlorate magnesium sulfate thorium(II) iodide thallium(I) triiodide silver selenite vanadyl sulfate yttrium(II) hydride copper(I) fluoride samarium(III) formate Bismuth(III) chloride uranium(III) iodide osmium(II) cyclopentadienide zinc borate magnesium silicide bismuth(III) orthotantalate sodium borohydride uranium(VI) fluoride hafnium orthosilicate potassium xanthogenate osmium(V) fluoride holmium silicide rhenium(VII) oxide erbium(III) nitride magnesium bromide indium(III) chloride barium oxalate neodymium(III) sulfide bismuth(III) oxyiodide samarium(III) chloride cadmium silicate iridium(IV) oxide dihydrate vanadium(V) oxide lanthanum sulfide neodymium(III) nitrate titanium(IV) iodide zirconium(IV) telluride potassium sulfide praseodymium(III) bromide rhodium(VII) sulfide cadmium sulfate lead(II) thiocyanate neodymium(III) gallate zinc iodate magnesium iodide europium(III) formate erbium(III) niobate thulium(II) chloride heptahydrate cerium(III) vanadate cerium(IV) oxide uranium(VI) oxide barium hexafluorosilicate platinum(IV) oxide monohydrate thulium(II) bromide arsenic trisulfide beryllium hydroxide diboron trisulfide phosphorus tetraiodide nitrogen monoxide platinum(II) iodide zirconium(III) iodide caesium hydroxide europium(III) chloride potassium oxide zinc lactate beryllium acetate aluminium phosphide potassium metabisulfite magnesium hydride Fructose rhodium(III) sulfide praseodymium(II) iodide plutonium(II) hydride cadmium phosphide Zinc sulfate potassium telluride cobalt(II) titanate sodium fluorosilicate mercury(II) iodide tantalum(IV) iodide neodymium(II) bromide potassium tartrate sodium pyrosulfate Phosphoric acid gold sulfuric acid barium metasilicate Chromium(III) sulfate indium(III) sulfate lithium superoxide molybdenum(III) oxide cadmium chromate EDTA, disodium salt caesium sulfate strontium selenate silver difluoride barium arsenate erbium(II) telluride silver phosphate uranium(IV) telluride iron(III) fluoride trihydrate cyanide thallium(I) bromide sodium molybdate cadmium fluoride lutetium fluoride cadmium selenide barium metaphosphate sodium diacetate curium(IV) oxide zirconium(IV) phosphide Ammonium sulfate phosphorus trisulfide lead(II,IV) oxide europium(II) iodide indium(I) bromide titanium(II) sulfide nickel(II) carbonate diboron tetrafluoride Sodium nitrate gallium trichloride neptunium(VI) fluoride Hydroiodic acid tantalum(II) sulfide Calcium nitrate magnesium nitride copper(I) thiocyanate caesium tetraiodocadamate ytterbium(V) silicide zinc phosphide manganese(II) phosphate molybdenum(IV) selenide tungsten(V) chloride einsteinium(II) chloride zinc sulfate nickel(II) carbonyl Silver sulfate uranium(VI) selenide plutonium(II) selenide indium(I) fluoride vanadium(II) silicide antimony orthoniobate beryllium carbide chromium(IV) oxide nickel(II) chloride hexahydrate zinc sulfide calcium hypophosphite Ethanol Pyridine titanium(IV) carbide radium sulfate plutonium(III) iodide barium iodide dihydrate selenium dioxide Propan-2-ol titanium(IV) silicide gadolinium(II) iodide chlorine monoxide germanium(IV) oxide holmium oxide lithium niobate lithium hexafluoroaluminate disilver dichloride antimony trioxide cobalt(II) hydroxide Trichloroacetic acid cobalt(III) hydroxide vanadium(III) fluoride plutonium(II) oxide niobium(IV) bromide copper(II) acetate thorium(IV) iodide einsteinium(III) fluoride ytterbium(II) bromide potassium thiosulfate silver thioantimonate palladium(II) sulfide niobium(III) fluoride phosphorus tribromide manganese(II) iodide tetrahydrate iridium(II) chloride sodium monofluoroacetate osmium(VII) fluoride Sodium acetate terbium(III) iodide copper(II) pyrophosphate bismuth(III) telluride uranium(III) chloride potassium permanganate strontium acetate neodymium(III) chloride calcium carbide ruthenium(III) chloride Formaldehyde Mannitol chlorine perchlorate aluminium arsenide Potassium cyanide palladium(IV) oxide titanium(III) phosphide gallium(III) nitrate gallium(II) selenide tellurium tetrachloride holmium nitride tungsten(IV) selenide bismuth(III) hydroxide calcium hydride sodium bromide tungsten(III) iodide Magnesium chloride osmium(II) iodide erbium(III) oxide vanadium(IV) bromide dysprosium(II) boride praseodymium(III) telluride cadmium hydroxide cobalt(II) selenide palladium(II) chloride arsenic mercury(II) sulfate indium(III) hydroxide caesium superoxide phosphorus trihydride caesium telluride mercury(II) fulminate chromium(II) fluoride lead(II) phosphate copper(I) acetate sodium polyphosphate yttrium(III) nitride tin(II) bromide chromium(VI) oxide nickel(II) fluoborate calcium selenide thorium(IV) chloride antimony tetroxide samarium(III) selenide calcium arsenate copper(II) sulfide indium(III) oxide barium aluminate lithium carbonate tantalum(III) fluoride copper(I) cyanide Sodium potassium tartrate Thiourea scandium(III) iodide sodium bromate boron trichloride sodium metatitanate antimony triselenide thallium(I) ethoxide iridium(IV) sulfide potassium hydroxide Cadmium sulfate scandium(III) formate barium zirconate barium stannate cobalt(III) chloride barium oxide sodium nitrite Chromium(III) chloride caesium permanganate lead(II) fluoride sodium xylenesulfonate osmium(IV) telluride einsteinium(III) oxide lead(II,II,IV) oxide potassium chloride ytterbium(II) selenide dysprosium(III) oxide rubidium tetrachloroaluminate hafnium boride Sodium dichromate cerium(II) iodide hafnium sulfide lithium arsenate beryllium fluoride iodine(III) fluoride uranium(II) oxide potassium iodate strontium metaborate uranium(III) fluoride protactinium(V) chloride lithium nitrate vanadium(III) nitride terbium(III) nitride calcium peroxide barium orthovanadate neptunium(III) iodide tin(IV) fluoride niobium(IV) iodide osmium(IV) oxide copper(II) oxide rhodium(III) sulfate neodymium(III) bromide arsenic pentaselenide sodium chromate phosphorus trifluoride thorium(II) hydride niobium(V) fluoride dinitrogen trioxide sodium niobate palladium(II) selenide Sodium hydrogen carbonate platinum(IV) sulfide sodium fluoride thallium(I) sulfate Nickel sulfate tin(II) oxide barium nitride osmium(IV) selenide antimony dichlorotrifluoride ytterbium(II) fluoride indium(III) antimonide rhodium(III) chloride silver chlorate nitrogen trichloride sodium pyrophosphate diarsenic tetrahydride niobium(IV) telluride arsenic dioxide europium(II) silicide ammonium bicarbonate praseodymium(II) telluride cobalt(II) acetate holmium bromide ruthenium(IV) fluoride titanium(IV) oxide ammonium nitrate lithium tantalate barium carbide californium(II) iodide gold(III) hydroxide barium chloride dihydrate cerium(II) fluoride manganese(II) bromide caesium carbonate osmium(IV) chloride uranium(V,VI) oxide radium fluoride dinitrogen pentoxide neptunium(IV) bromide praseodymium(II) fluoride sulfur dioxide thallium(I) formate strontium hydroxide tungsten(IV) chloride lutetium boride erbium(III) hydride tellurium monoiodide erbium(III) bromide selenium oxychloride osmium(VIII) fluoride chromyl fluoride lithium amide arsenic phosphide iodine heptafluoride potassium oxalate ytterbium(III) fluoride iron(II) nitrate ammonium chloride barium cyanide Bismuth(III) nitrate xenon difluoride tin(II) iodide magnesium sulfide rubidium tetrafluoroaluminate scandium(III) fluoride zinc acetate beryllium nitride Sodium chlorate rhodium(III) fluoride iridium(III) oxide zirconium(III) bromide sodium stannate mercury(I) fluoride plutonium(IV) oxide cerium(III) nitride molybdenum(IV) chloride nickel(II) selenide lithium iodide trihydrate thallium(III) fluoride gold(III) sulfide copper(II) telluride potassium selenide palladium(II) cyanide sucrose lanthanum aluminate mercury(II) telluride barium carbonate beryllium silicate bismuth(III) chloride tungsten(II) iodide tantalum(III) bromide hafnium silicide rubidium sulfide copper(I) acetylide lead(II) arsenite tungsten(III) bromide arsenic hemiselenide tungsten(V) oxytribromide calcium perrhenate neptunium(III) sulfide rhodium(III) nitrate cobalt(III) fluoride manganese(IV) selenide silicon nitride uranium(III) nitride iridium(IV) fluoride neptunium(III) bromide tungsten(VI) fluoride rhodium(III) bromide californium(II) chloride lead(II) azide tungsten(IV) silicide lead(II) acetate strontium aluminate chromium(III) nitrate ruthenium(III) iodide titanium(III) sulfide Formic acid gold(III) chloride yttrium(III) phosphide calcium chloride hexahydrate cobalt(II) iodide calcium pyrophosphate tin(II) telluride lead(IV) fluoride sodium benzoate Butyric acid Manganese(II) sulfate niobium(IV) fluoride strontium boride iridium(VI) fluoride platinum(IV) fluoride neodymium(II) telluride tantalum(IV) silicide mercury(II) iodate barium sulfite antimony trichloride mercury(I) iodide phosphorus heptasulfide vanadium(III) oxide manganese(II) chloride rhodium(IV) sulfide nitrogen triiodide gallium(III) arsenide rhodium(IV) selenide magnesium glycerophosphate strontium fluoride tantalum(IV) bromide copper(II) molybdate magnesium titanate praseodymium(IV) fluoride manganese(III) oxide vanadium(II) fluoride nickel(II) sulfate chromium(IV) bromide lead(II) iodate selenium dioxydifluoride zinc propionate tellurium dichloride cobalt(II) sulfate thallium(I) telluride titanium(IV) chloride sodium ferrocyanide potassium diphosphate aluminium fluoride aluminium oxide lead(II) sulfite Iodic acid tantalum(V) iodide silver iodide copper(II) arsenate palladium(II) acetate calcium cyanide gallium(I,III) chloride neodymium(II) sulfide tungsten(IV) carbide copper(II) hydroxide niobium(V) chloride aluminium iodide arsenic tetraoxide mercury(II) selenide nickel(II) titanate osmium(IV) sulfide phosphorus oxychloride yttrium(III) fluoride sodium telluride iodine pentafluoride Potassium nitrate magnesium fluoride beryllium borohydride mercury(II) chloride iridium(IV) iodide Antimony(III) chloride barium iodide Acetonitrile lithium citrate aluminium bromide beryllium formate titanium(II) hydride Potassium hydroxide ytterbium(III) oxide calcium metasilicate iron(II) acetate rubidium bromide strontium oxide zinc antimonide aluminium dodecaboride cobalt(II) ferricyanide tungsten(IV) oxide rhenium(III) oxide selenium hexasulfide diboron trioxide rubidium acetate rhodium(III) oxide oxygen(I) fluoride indium(III) nitride tungsten(VI) oxytetrachloride rhodium(IV) telluride tungsten(IV) iodide tungsten(IV) telluride sodium metabisulfite Dimethylglyoxime beryllium bromide copper(II) ferrocyanide potassium propionate gallium(III) sulfide vanadium(IV) silicide phosphorus nonasulfide yttrium(III) chloride tungsten(V) fluoride lithium azide thulium(III) sulfide Malonic acid potassium bisulfate praseodymium(III) sulfide titanium(III,IV) oxide diboron hexahydride gallium(II) chloride erbium(III) iodide tin(II) sulfate cobalt(II) sulfide silver dichromate potassium ferricyanide hafnium iodide lead(II) metasilicate disulfur tetrafluoride lead(II) niobate caesium metaborate manganese(II) chloride tetrahydrate tantalum(IV) selenide phosphorus pentoxide zinc arsenite molybdenum(II) bromide copper(I) azide cerium(IV) fluoride yttrium(III) iodide chromium(II) iodide indium(III) sulfide gadolinium(II) selenide strontium sulfide calcium chlorate nickel(II) fluoride thorium(III) sulfide barium tetraiodomercurate europium(II) bromide ytterbium(III) chloride Antimony(V) chloride protactinium(V) iodide tin(II) sulfide sodium hydride caesium bromate lead(II) oxalate Mercury(II) nitrate gallium(II) sulfide cadmium bromide lead(II) selenite magnesium oxide tin(II) selenide Methanol Hydrochloric acid cerium(III) iodide vanadium(IV) selenide mercury(II) sulfide trisilver triiodide potassium borohydride Sulfurous acid neodymium(III) oxide cadmium molybdate neptunium(III) nitride thallium(I) chloride titanium(II) bromide molybdenum(IV) telluride rhenium(III) bromide sulfur difluoride diiodine pentoxide manganese(II) iodide manganese(III) chloride copper(I) oxide rhenium(IV) chloride gold(I,III) fluoride neodymium(II) iodide potassium silicate silver oxide Propan-1-ol lithium antimonide uranium(IV) oxide protactinium(II) oxide chromium(II) carbide manganese(IV) oxide strontium iodide iron(III) bromide hafnium hydride barium nitrate potassium bicarbonate europium(II) niobate manganese(III) antimonide cadmium tungstate iridium(IV) chloride germanium(II) chloride lead(II) fluoroborate lead(II) arsenate thorium(IV) nitrate zinc selenide zinc nitrite molybdenum(III) bromide iridium(III) chloride iron(III) chromate nickel(III) boride cobalt(II) bromide erbium(III) vanadate chromium(IV) iodide lead(IV) chloride Potassium bromate potassium methoxide terbium(III) oxide praseodymium(III) oxide beryllium boride zinc molybdate calcium phosphide tellurium dibromide lithium tetrafluoroborate molybdenum(V) fluoride indium(III) telluride Nickel chloride Urethane Isobutanol rubidium telluride californium(III) chloride barium stearate nickel(II) telluride silver permanganate ruthenium(III) acetylacetonate chromyl chloride barium vanadate cobalt(II) acetate tetrahydrate bromine monoxide zirconium(II) iodide strontium phosphide chromium(V) fluoride cobalt(IV) fluoride rhenium(V) chloride tungsten(IV) bromide hafnium selenide scandium(III) trifluoromethanesulfonate Sodium nitrite sodium bisulfite hafnium bromide ruthenium(IV) sulfide acre apothecaries' drachm apothecaries' ounce apothecaries' pound avoirdupois bag barleycorn barn barn-megaparsec barrel beard-second beatment Beaufort scale bing bob bolls bolt broadsheet bushel butt cable carat Celsius cental chain chain, square chaldron candlepower chopin clockface angle cloth-yard compass points cord cord foot crown cubic fathom cubic foot cubic inch cubic yard cubit cup cwt hundredweight degree Delisle dessert spoon dicker drachm apothecaries' dram dram weight, avoirdupois drum east ell engineer's chain faggot Fahrenheit farthing fathom fathom cubic finger firkin five pounds fiver florin fluid drachm fluid ounce fodder folio foot foot, cubic foot, square fother furlong furlong, square gallon gallon Scots gill gill Scots goad grad gradian grain grand groat guinea Gunter's chain half crown half penny hand ha'penny hogshead Hoppus foot horsepower hundredweight inch inch, cubic inch, square iron jill keel Kelvin kennings kilderkin knot last league ledger left letter lb pound weight lb wt lbf lbm line link load measured rick megalithic yard megaparsec Mercalli mil mile mile, square mille millihelen minim Mohr scale monkey mutchkin nail nautical mile Newton degree north o'clock angle octavo ounce ounce apothecaries' ounce troy oz ounce pace pack palm peck penny pennyweight perch perch square perch pica pig pin pint pint Scots pipe point point font points of compass pole pole square pony poppy seed port pottle pound pound pound apothecaries' pound force lbf pound mass lbm pound troy pound weight lb wt poundal puncheon quart quarter length quarter quarter quarto quid quire radian Ramsden's chain Rankine ream R�aumur Richter rick right R�mer rood rod rod square rope sack sardine scam score score Scottish units scruple seam shackle shilling shipping ton six pence sixpenny joint slug smoot south sovereign span square chain square foot square furlong square inch square mile spoon square perch square pole square rod square yard starboard step stone sterling sterling Tudor surveyor's chain tablespoon tabloid tanner teaspoon ten bob ten pounds ten shillings tenner tertian thou three pence thruppence thumb tierce tod ton ton ton troy ounce troy pound truss tun tuppence two shillings two and six U.S. units of volume vergee west windle yard yard, cubic yard, square \ No newline at end of file diff --git a/quantgov/resources/nltk_pos_tags.txt b/quantgov/resources/nltk_pos_tags.txt new file mode 100644 index 0000000..6ef8326 --- /dev/null +++ b/quantgov/resources/nltk_pos_tags.txt @@ -0,0 +1,35 @@ +CC | coordinating conjunction +CD | cardinal digit +DT | determiner +EX | existential there (like: "there is" ... think of it like "there exists") +FW | foreign word +IN | preposition/subordinating conjunction +JJ | adjective 'big' +JJR | adjective, comparative 'bigger' +JJS | adjective, superlative 'biggest' +LS | list marker 1) +MD | modal could, will +NN | noun, singular 'desk' +NNS | noun plural 'desks' +NNP | proper noun, singular 'Harrison' +NNPS | proper noun, plural 'Americans' +PDT | predeterminer 'all the kids' +POS | possessive ending parent's +PRP | personal pronoun I, he, she +PRP$ | possessive pronoun my, his, hers +RB | adverb; i.e. very, silently, +RBR | adverb, comparative better +RBS | adverb, superlative best +RP | particle give up +TO | to go 'to' the store. +UH | interjection errrrrrrrm +VB | verb, base form take +VBD | verb, past tense took +VBG | verb, gerund/present participle taking +VBN | verb, past participle taken +VBP | verb, sing. present, non-3d take +VBZ | verb, 3rd person sing. present takes +WDT | wh-determiner which +WP | wh-pronoun who, what +WP$ | possessive wh-pronoun whose +WRB | wh-abverb where, when \ No newline at end of file From 8a360e3f572a65182c450c7793da5ec848f9e0d4 Mon Sep 17 00:00:00 2001 From: Stephen Strosko Date: Thu, 17 Jan 2019 15:09:51 -0500 Subject: [PATCH 3/7] Revert "Merge pull request #60 from QuantGov/pdesign_pos_features" This reverts commit 7ae8ebb54e48edc1edeaa2fa04ff9ac949bb2244, reversing changes made to cf5e5d2fd4578e6ff654e00627f77a360b378bf0. --- .gitignore | 79 ------------ quantgov/nlp.py | 181 ++------------------------- quantgov/resources/design_words.txt | 1 - quantgov/resources/nltk_pos_tags.txt | 35 ------ 4 files changed, 7 insertions(+), 289 deletions(-) delete mode 100755 quantgov/resources/design_words.txt delete mode 100644 quantgov/resources/nltk_pos_tags.txt diff --git a/.gitignore b/.gitignore index 108ee8d..d9d3190 100644 --- a/.gitignore +++ b/.gitignore @@ -88,82 +88,3 @@ ENV/ # Rope project settings .ropeproject - -# PyCharm stuff - -.idea/* -.idea/codeStyles/Project.xml -.idea/encodings.xml -.idea/misc.xml -.idea/modules.xml -.idea/quantgov.iml -.idea/vcs.xml - -queries.sql - -# via JetBrains: -# https://github.com/github/gitignore/blob/master/Global/JetBrains.gitignore - -# User-specific stuff -.idea/**/workspace.xml -.idea/**/tasks.xml -.idea/**/usage.statistics.xml -.idea/**/dictionaries -.idea/**/shelf - -# Generated files -.idea/**/contentModel.xml - -# Sensitive or high-churn files -.idea/**/dataSources/ -.idea/**/dataSources.ids -.idea/**/dataSources.local.xml -.idea/**/sqlDataSources.xml -.idea/**/dynamic.xml -.idea/**/uiDesigner.xml -.idea/**/dbnavigator.xml - -# Gradle -.idea/**/gradle.xml -.idea/**/libraries - -# Gradle and Maven with auto-import -# When using Gradle or Maven with auto-import, you should exclude module files, -# since they will be recreated, and may cause churn. Uncomment if using -# auto-import. -# .idea/modules.xml -# .idea/*.iml -# .idea/modules - -# CMake -cmake-build-*/ - -# Mongo Explorer plugin -.idea/**/mongoSettings.xml - -# File-based project format -*.iws - -# IntelliJ -out/ - -# mpeltonen/sbt-idea plugin -.idea_modules/ - -# JIRA plugin -atlassian-ide-plugin.xml - -# Cursive Clojure plugin -.idea/replstate.xml - -# Crashlytics plugin (for Android Studio and IntelliJ) -com_crashlytics_export_strings.xml -crashlytics.properties -crashlytics-build.properties -fabric.properties - -# Editor-based Rest Client -.idea/httpRequests - -# Android studio 3.1+ serialized cache file -.idea/caches/build_file_checksums.ser \ No newline at end of file diff --git a/quantgov/nlp.py b/quantgov/nlp.py index 0477863..8022fd8 100644 --- a/quantgov/nlp.py +++ b/quantgov/nlp.py @@ -1,9 +1,6 @@ """ quantgov.nlp: Text-based analysis of documents """ - -from __future__ import division - import re import collections import math @@ -14,7 +11,6 @@ try: import nltk.corpus - from nltk import word_tokenize, sent_tokenize, bigrams, trigrams, pos_tag NLTK = True except ImportError: NLTK = None @@ -30,10 +26,6 @@ except LookupError: nltk.download('wordnet') nltk.corpus.wordnet.ensure_loaded() - try: - nltk.pos_tag('A test.') - except LookupError: - nltk.download('averaged_perceptron_tagger') commands = {} @@ -52,7 +44,7 @@ def check_textblob(func, *args, **kwargs): return func(*args, **kwargs) -class WordCounter: +class WordCounter(): cli = utils.CLISpec( help='Word Counter', @@ -70,7 +62,7 @@ class WordCounter: @staticmethod def get_columns(args): - return ('words', ) + return ('words',) @staticmethod def process_document(doc, word_pattern): @@ -80,7 +72,7 @@ def process_document(doc, word_pattern): commands['count_words'] = WordCounter -class OccurrenceCounter: +class OccurrenceCounter(): cli = utils.CLISpec( help="Term Counter for Specific Words", @@ -138,7 +130,7 @@ def process_document(doc, terms, pattern, total_label): commands['count_occurrences'] = OccurrenceCounter -class ShannonEntropy: +class ShannonEntropy(): lemmas = {} cli = utils.CLISpec( help='Shannon Entropy', @@ -205,7 +197,7 @@ def lemmatize(word): commands['shannon_entropy'] = ShannonEntropy -class ConditionalCounter: +class ConditionalCounter(): cli = utils.CLISpec( help=('Count conditional words and phrases. Included terms are: ' ' "if", "but", "except", "provided", "when", "where", ' @@ -232,7 +224,7 @@ def process_document(doc): commands['count_conditionals'] = ConditionalCounter -class SentenceLength: +class SentenceLength(): cli = utils.CLISpec( help='Sentence Length', @@ -270,7 +262,7 @@ def process_document(doc, precision): commands['sentence_length'] = SentenceLength -class SentimentAnalysis: +class SentimentAnalysis(): cli = utils.CLISpec( help='Performs sentiment analysis on the text', @@ -316,162 +308,3 @@ def process_document(doc, backend, precision): commands['sentiment_analysis'] = SentimentAnalysis - - -class DesignWords: - - cli = utils.CLISpec( - help='Searches for a pre-defined list of words potentially ' - 'associated with design-based standards in text.', - arguments=[ - utils.CLIArg( - flags=('--precision'), - kwargs={ - 'help': 'decimal places to round', - 'default': 2 - } - ) - ] - ) - - @staticmethod - def get_columns(args): - # column names to return - return ('design_word_count', 'design_word_ratio', - 'design_word_ratio2', ) - - @staticmethod - @check_nltk - def process_document(doc, precision): - - # load in design words - # aka weights and measures, chemical compounds, etc. - design_words = [] - with open("quantgov/resources/design_words.txt", 'r') as d: - for l in d: - design_words.append(l.strip()) - design_words = [x.lower().strip() - for x in design_words if x != ""] - - # kill stopwords - stw = set(nltk.corpus.stopwords.words('english')) - design_words = [x for x in design_words if x not in stw] - - # 1-3 grams in design words list - dw1 = set([x for x in design_words - if len(word_tokenize(x)) == 1]) - dw2 = set([x for x in design_words - if len(word_tokenize(x)) == 2]) - dw3 = set([x for x in design_words - if len(word_tokenize(x)) == 3]) - - # tokenize document - tokenized = word_tokenize(doc.text) - - # silly count based on words that might indicate design standards - # aka best practices, etc. - maybe_relevant_count = len([x for x in tokenized - if x in ['standard', - 'practice', - 'best practice']]) - - # single words - token_count = collections.Counter(tokenized) - dw1_count = sum([token_count[x] - for x in token_count.keys() if x in dw1]) - - # bigrams, trigrams - bigrams = [' '.join(x) for x in nltk.bigrams(tokenized)] - trigrams = [' '.join(x) for x in nltk.trigrams(tokenized)] - bigrams = collections.Counter(bigrams) - trigrams = collections.Counter(trigrams) - dw2_count = sum([bigrams[x] - for x in bigrams.keys() if x in dw2]) - dw3_count = sum([trigrams[x] - for x in trigrams.keys() if x in dw3]) - - # final counts - design_word_count = dw1_count + dw2_count + dw3_count - design_word_ratio = design_word_count / sum(token_count.values()) - design_word_ratio2 = design_word_count / len(set(tokenized)) - - # rounds - if precision: - design_word_ratio = round(design_word_ratio, precision) - design_word_ratio2 = round(design_word_ratio2, precision) - - return doc.index + (design_word_count, design_word_ratio, - design_word_ratio2, ) - - -commands['design_words'] = DesignWords - - -class PartsOfSpeech: - - cli = utils.CLISpec( - help='Part of speech tagging and derived metrics', - arguments=[ - utils.CLIArg( - flags=('--precision'), - kwargs={ - 'help': 'decimal places to round', - 'default': 2 - } - ) - ] - ) - - @staticmethod - def get_columns(args): - # column names to return - return ('', '', ) - - @staticmethod - @check_nltk - def process_document(doc, precision): - - # NLTK part of speech tagging - nltk_tags = pos_tag(word_tokenize(doc.text)) - - # all tags - all_tags = [] - with open('quantgov/resources/nltk_pos_tags.txt', 'r') as o: - for x in o.readlines(): - all_tags.append(x.split('|')[0]) - - # count up tags - count_tags = {} - for x in all_tags: - count_tags[x.strip()] = 0 - for x in nltk_tags: - try: - count_tags[x[1]] += 1 - except KeyError: - continue - - word_count = sum(count_tags.values()) - nouns_count = (count_tags['NN'] + count_tags['NNS'] + - count_tags['NNP']) - verbs_count = (count_tags['VB'] + count_tags['VBD'] + - count_tags['VBG'] + count_tags['VBN'] + - count_tags['VBP'] + count_tags['VBZ']) - noun_verb_ratio = (nouns_count + 1) / (verbs_count + 1) - nouns_ratio = (nouns_count + 1) / (word_count + 1) - verbs_ratio = (verbs_count + 1) / (word_count + 1) - proper_nouns_count = count_tags['NNP'] + count_tags['NNPS'] - proper_nouns_ratio = (proper_nouns_count + 1) / (word_count + 1) - - if precision: - noun_verb_ratio = round(noun_verb_ratio, precision) - nouns_ratio = round(nouns_ratio, precision) - proper_nouns_ratio = round(proper_nouns_ratio, precision) - verbs_ratio = round(verbs_ratio, precision) - - return (doc.index + - (noun_verb_ratio, nouns_count, verbs_count, - nouns_ratio, verbs_ratio, - proper_nouns_count, proper_nouns_ratio, )) - - -commands['pos_metrics'] = PartsOfSpeech diff --git a/quantgov/resources/design_words.txt b/quantgov/resources/design_words.txt deleted file mode 100755 index 9bfd8b9..0000000 --- a/quantgov/resources/design_words.txt +++ /dev/null @@ -1 +0,0 @@ -nickel(II) zirconate vanadium(III) bromide potassium bitartrate calcium oxide germanium(IV) methoxide boron carbide sodium oxide silver telluride vanadium(III) sulfate gallium(III) fluoride sodium hydromethylglycinate hafnium carbide lanthanum chloride silver sulfide Sodium carbonate Sodium hydroxide thorium(III) fluoride potassium persulfate Iron(III) nitrate lanthanum niobate palladium(II) bromide caesium hydride arsenic triselenide phosphorus hexasulfide indium(III) iodide lutetium iodide iron(III) pyrophosphate molybdenum(IV) bromide rhenium(IV) sulfide zinc chloride sodium cyanoborohydride aluminum silicate calcium aluminate sulfur trioxide potassium cyanate Acetic acid calcium acetate technetium(VII) oxide zinc fumarate palladium(IV) sulfide caesium tungstate californium(III) fluoride sodium hydroxide zinc dichromate bromine monochloride zinc dimethyldithiocarbamate cadmium antimonide rhenium(III) iodide barium tartrate tellurium monoxide indium(III) fluoride nonahydrate hafnium phosphide scandium(III) chloride germanium(II) sulfide Ammonium dichromate calcium perchlorate erbium(III) chloride cerium(III) bromide sodium dithionate lanthanum fluoride zinc hydride zinc glycerophosphate platinum(III) iodide titanium(IV) sulfide niobium(IV) carbide potassium tetrafluoroaluminate sodium tartrate praseodymium(IV) oxide lithium metasilicate praseodymium(III) formate lead(II) tungstate arsenic tetrasulfide iron chromite lead(II) orthosilicate iron(III) sulfate iron(II) oxide manganese(II) sulfate Oxalic acid lead(II) bromide sodium chlorate calcium nitrite indium(III) phosphide potassium hexafluorophosphate gallium(III) sulfate ruthenium(V) fluoride sodium sulfate europium(II) fluoride calcium formate krypton difluoride tantalum(III) chloride barium hydrosulfide potassium chromate calcium sulfide uranium(VI) telluride sodium amide Potassium hexacyanoferrate(III) calcium hydroxide indium(I) hydride oxygen(II) fluoride caesium chloride hafnium oxide bismuth(III) oxide Potassium hydrogen phosphate dysprosium(III) bromide uranium(IV) fluoride nickel(II) boride aluminium chloride radium nitrate phosphorus tetrachloride strontium bromide hexahydrate tungsten(IV) fluoride Sodium formate ammonia sodium bifluoride molybdenum(IV) fluoride phosphorus tetrabromide terbium(III) fluoride iridium(II) bromide lithium selenide boron tribromide Lactose bismuth(III) subcarbonate copper(II) oleate lanthanum oxide lithium chloride monohydrate phosphorus decaoxide tungsten(VI) oxytetrabromide sodium vanadate scandium(III) hydride molybdenum(II) iodide zinc hydroxide manganese(VII) oxide protactinium(IV) iodide lead(II) hypophosphite cadmium nitride Zinc chloride protactinium(IV) oxide indium(III) acetate magnesium peroxide potassium hexafluorosilicate niobium(II) oxide plutonium(IV) sulfide copper(II) arsenite sodium cobaltnitrite erbium(III) sulfide zirconium(IV) iodide Ammonium nitrate molybdenum(VI) fluoride sulfur monoxide gallium(I) oxide tungsten(II) chloride chlorine monofluoride magnesium hydroxide copper(I) hydride lanthanum iodide titanium(II) chloride magnesium perchlorate lead(II) formate vanadium(IV) sulfide sodium sulfide gallium(III) nitride iron(III) hypophosphite Potassium permanganate germanium(II) iodide potassium bromate antimony pentachloride nitrogen tribromide lead(II) chloride barium dichromate Calcium sulfate terbium(II) fluoride dinitrogen tetroxide caesium oxide lithium borohydride copper(II) acetylide Acetone dysprosium(II) bromide calcium sulfate potassium tellurite Dimethyl sulfoxide (DMSO) arsenic trihydride zinc chromite radium carbonate strontium selenide tantalum(III) aluminide Zinc nitrate chromium(III) fluoride dinitrogen tetrafluoride strontium sulfite yttrium(III) hydride magnesium antimonide gold(III) iodide barium disilicate hafnium sulfate mercury(II) oxide yttrium(III) boride iron(II) tungstate Sodium tartrate copper(II) tungstate potassium cyanide thorium(IV) bromide Barium chloride germanium(II) fluoride lead(II) sulfate rhenium(IV) fluoride radium chloride gold(III) bromide gallium(III) bromide zirconium(II) oxide sodium perchlorate mercury(I) oxide thallium(I) hexafluorophosphate indium(II) selenide einsteinium(III) chloride thallium(II) chloride Sodium chloride lead(II) carbonate tantalum(IV) oxide niobium(V) oxide iridium(IV) oxide Potassium chloride rubidium selenide barium azide barium fluoride boron nitride lanthanum vanadate niobium(V) bromide lithium metavanadate zirconium(IV) selenide uranium(II) nitride titanium(IV) fluoride germanium(IV) fluoride arsenic diiodide beryllium phosphate nickel(II) bromide sodium iodide tetraarsenic tetrasulfide barium arsenite lanthanum bromide iron(II) sulfide Tin(IV) chloride silver dibromide silver bromate Hydrofluoric acid copper(II) azide bismuth(III) stannate californium(III) bromide bismuth(III) hydride erbium(III) fluoride sodium selenide potassium benzoate ruthenium(IV) telluride arsenic trichloride boron arsenate terbium(III) chloride titanium(III) bromide neptunium(IV) chloride plutonium(III) fluoride platinum(VI) fluoride barium selenite sodium benzosulfonate Lead(II) nitrate Potassium carbonate terbium(III) bromide sodium tripolyphosphate ruthenium(II) iodide tellurium disulfide aluminium antimonide nickel(II) chloride ruthenium(IV) selenide lutetium nitrate potassium formate neodymium(III) telluride Potassium hexacyanoferrate(II) barium molybdate silver cyanide zinc carbonate zinc orthosilicate Hydrazine technetium(IV) chloride palladium(IV) telluride lead(II) stearate beryllium nitrate antimony triiodide zinc pyrophosphate Ammonium acetate cobalt(III) titanate disulfur monoxide technetium(V) fluoride holmium vanadate cadmium sulfide cadmium arsenide samarium(III) bromide potassium niobate lutetium telluride strontium chloride ytterbium(III) sulfide zinc dimethyldithiocarbonate lithium borate lutetium chloride platinum(II) bromide Maleic acid sodium adipate thallium(I) molybdate cobalt(II) oxalate calcium tungstate zirconium(IV) silicate Ammonium chloride Sodium hydrogen tartrate technetium(IV) bromide aluminium nitride iron(III) phosphate arsenic monoxide tellurium trioxide sodium perrhenate aluminium bromide hexahydrate iron sodium acetate barium perrhenate iron(II) bromide vanadium(IV) chloride potassium perrhenate titanium(II) oxide aluminium sulfide Hydrogen peroxide antimony tritelluride indium(III) fluoride hafnium(IV) fluoride praseodymium(III) iodide nitrogen dioxide sodium borofluoride cadmium titanate chromium(III) chloride yttrium(III) bromide tantalum(V) fluoride lanthanum formate silver carbonate zinc nitride lithium nitride dysprosium(II) iodide boron trifluoride dysprosium(III) iodide niobium(V) iodide ytterbium(III) bromide strontium nitrite Chloroplatinic acid potassium ferrate sodium hypochlorite phosphorus hexoxide copper(II) selenide iodine monoazide tellurium tetraiodide sodium antimonate bismuth(III) oxychloride radium iodide Calcium chloride barium formate curium(III) oxide curium(III) chloride sodium hexafluoroaluminate calcium chromate bismuth(III) oxybromide selenium disulfide Glucose lithium tetrahydridoaluminate yttrium(III) trifluoromethanesulfonate Ammonium oxalate osmium(IV) fluoride potassium dimethyldithiocarbamate iridium(III) bromide iron(III) ferrocyanide zinc stannate calcium permanganate platinum(III) bromide potassium peroxide rhenium(IV) iodide caesium acetate Nicotine beryllium oxide sodium bismuthate zirconium(IV) hydroxide caesium iodide einsteinium(II) iodide calcium bromide nickel(II) nitrate thallium(I) oxalate nickel(II) sulfide Sodium sulfide protactinium(IV) chloride einsteinium(III) iodide nickel(II) iodide lanthanum nitride calcium hydrogenorthophosphate silver chromate gallium(II) oxide zinc bromide zinc iodide zirconium(IV) sulfide thulium(III) bromide Mercury(II) chloride sodium selenite cadmium niobate gold(III) selenate aluminium selenide lanthanum silicide lithium peroxide bismuth(III) citrate titanium(II) fluoride germanium(IV) nitride molybdenum(IV) sulfide sodium tetracholoraluminate ytterbium(II) telluride molybdenum(VI) chloride praseodymium(II) selenide silicon bromide erbium(III) chloride hexahydrate tungsten(V) bromide Ammonium carbonate barium pyrophosphate barium thiocyanate indium(III) fluoride trihydrate silicon sulfide sodium chloride magnesium chloride titanium(III) nitride chromium(III) nitride iron(II) chloride tetrahydrate cobalt(III) acetylacetonate lithium zirconate praseodymium(III) nitride fluorine nitrate gold(III) selenide indium(III) arsenide ruthenium(VI) oxide cerium(III) oxide zinc phosphate Perchloric acid iridium(II) iodide potassium iodide arsenic monophosphide disulfur diiodide magnesium arsenate lithium phosphate molybdenum(III) chloride lanthanum boride molybdenum(III) iodide gadolinium(III) telluride sodium arsenite magnesium technetate manganese(II,III) oxide disulfur dibromide Silver nitrate phosphorus thiochloride sodium orthosilicate cerium(II) hydride sodium nitrate neptunium(V) oxide beryllium aluminate thallium(I) nitrate zinc telluride phosphorus tetrafluoride rubidium fluoride technetium(VI) chloride nickel(II) chromate lead(II) antimonate dinitrogen difluoride Lead(IV) acetate tellurium tetrafluoride rhenium(VII) fluoride gadolinium(III) chloride iron(III) chloride lithium titanate manganese(II) nitrate Sodium dihydrogen phosphate osmium(IV) bromide lead(II) iodide gallium(II) telluride manganese(II) sulfide cobalt(II) telluride terbium(IV) fluoride Sodium sulfite Mercury(II) sulfate zinc thiocyanate thulium(II) niobate antimony tribromide cobalt(II) chloride hexahydrate tantalum(IV) sulfide lead(II) oleate zirconium(II) chloride neodymium(III) nitride silver hexafluorophosphate Aluminium sulfate technetium(IV) sulfide titanium(III) sulfate Iron(II) sulfate sodium formaldehydesulfoxylate chromium(II) bromide Maltose sodium tungstate Sodium chromate boron triiodide copper(II) borate ruthenium(III) fluoride calcium telluride thallium(I) fluoride xenon trioxide rhenium(IV) oxide chlorine heptoxide silicon dioxide vanadium(II) bromide caesium azide aluminium lanthanum hydride indium(III) phosphate thallium(III) iodate cadmium iodate aluminum carbide lead(IV) bromide tin(IV) sulfide Sodium arsenate niobium(IV) oxide caesium fluoride copper(II) oxalate thorium(III) iodide sodium tetraphenylborate manganese(II) carbonate thulium(III) chloride gallium(III) antimonide iron(II) orthosilicate iron(II) selenide potassium hexafluorozirconate cobalt(II) fluoride copper(II) formate platinum(IV) bromide lanthanum chloride heptahydrate germanium(IV) sulfide Hydrobromic acid gadolinium(II) silicide calcium silicate europium(II) chloride einsteinium(II) bromide antimony trinitride thorium(IV) oxide lead(II) selenide dysprosium(II) chloride lead(III) acetate yttrium(III) arsenide thorium(II) sulfide thorium(IV) orthosilicate thallium(I) oxide platinum(IV) selenide neodymium(III) formate calcium propionate arsenic pentafluoride Copper(II) nitrate thorium(IV) silicide erbium(III) sulfate lanthanum chloride trihydrate silver fulminate bismuth(III) orthoniobate potassium bifluoride potassium hexafluoroaluminate palladium(II) nitrate niobium(VI) selenide Resorcinol caesium iodate silver iodate magnesium selenide aluminium diboride magnesium telluride tungsten(VI) dioxydibromide Lithium chloride tellurium dioxide antimony trihydride silver acetate osmium(V) chloride caesium amide aluminium chloride hexahydrate vanadium(IV) iodide manganese(IV) fluoride sodium phenylacetate magnesium stearate Sodium thiosulfate ammonium hydroxide sodium ethenesulfonate holmium iodide manganese(III) fluoride selenium hexafluoride rubidium periodate copper(II) chloride Aluminium nitrate bismuth(III) iodide cobalt(II) iodate Diethyl ether niobium(IV) sulfide terbium(III) formate Potassium dihydrogen phosphate cobalt(II) chromite neodymium(II) selenide rhenium(V) bromide xenon oxytetrafluoride trisilver trichloride arsenic triphosphide chromium(IV) chloride platinum(V) fluoride sodium fluorophosphate sulfur hexafluoride Sodium phosphate silver selenate berkelium(III) iodide samarium(II) fluoride chromium(III) sulfide phosphorus triselenide barium acetate sodium carbonate zinc permanganate neodymium(III) vanadate californium(III) iodide arsenic trioxide niobium(III) iodide rubidium iodide titanium(III) chloride strontium bromide zirconium(IV) sulfate bismuth(III) titanate Copper(II) sulfate strontium silicate tin(II) oxalate samarium(III) telluride holmium fluoride yttrium(III) oxide cobalt(II) orthosilicate barium nitrite bromine pentafluoride bromine azide copper(I) bromide yttrium(III) sulfide iodine trichloride zinc fluoride berkelium(IV) oxide mercury(II) thiocyanate calcium molybdate tungsten(II) bromide chlorine hexoxide iron(II) aluminate cobalt(III) oxide Barium nitrate indium(III) selenide iron(II) telluride praseodymium(II) sulfide phosphorus oxybromide titanium(IV) sulfate vanadium(II) oxide ytterbium(III) selenide berkelium(III) bromide zinc succinate dinitrogen monoxide strontium stannate arsenic pentasulfide tantalum(IV) telluride sodium formate barium hydroxide uranium(IV) carbide lithium chloride sodium thiosulfate thallium(I) cyanide europium(III) bromide chromium(III) oxide iron(III) chloride hexahydrate arsenic oxybromide sodium ethoxide calcium chloride calcium titanate curium(IV) fluoride europium(II) vanadate nickel(II) oxide thallium(I) perchlorate tin(IV) chloride bismuth(III) bromide cadmium azide caesium perchlorate arsenic tribromide gadolinium(III) iodide barium silicide dysprosium(III) formate sodium caprylate iron(II) carbonate calcium hypochlorite xenon monofluoride thorium(III) nitride iron(II) molybdate radium oxide Tin(II) chloride nickel(II) cyanide bismuth(III) selenide erbium(II) sulfide scandium(II) hydride potassium azide boric acid sodium stearate europium(II) telluride germanium(II) selenide lead(IV) hydride caesium tetraiodozincate lutetium nitride palladium(II) trifluoroacetate thallium(I) hydroxide germanium(II) oxide lithium methoxide lutetium oxide alumimum nitrate cerium(III) fluoride lithium hexafluorophosphate lead(II) nitrate Pentan-1-ol osmium(III) chloride trihydrate rubidium oxide cerium(III) formate rhenium(VI) chloride copper(I) iodide uranium(V) fluoride mercury(II) nitrate indium(II) chloride potassium metaphosphate mercury(II) bromide potassium fluoride strontium carbonate copper(II) tetrafluoroborate calcium fumarate caesium cyanide phosphorus pentachloride beryllium sulfate rhenium(V) fluoride silicon chloride zinc arsenate vanadium(II) iodide tungsten(VI) sulfide dysprosium(II) silicide molybdenum(IV) silicide silver perchlorate calcium phosphate sodium methoxide diiodine tetroxide lead(II) titanate bromine monofluoride sodium propionate niobium(III) chloride iron(II) fluoride sodium silicate Chloroform cerium(II) sulfide silicon carbide tungsten(III) chloride ruthenium(IV) oxide barium selenide lithium dihydrogenphosphate iridium(III) sulfide antimony trisulfate Butan-1-ol calcium oxalate lead(II) tantalate Sulfuric acid protactinium(IV) bromide calcium iodate zinc peroxide chromium(II) chloride barium permanganate gadolinium(III) formate iridium(V) fluoride samarium(III) iodide strontium thiocyanate californium(IV) oxide vanadium(IV) carbide zinc chlorate sodium aluminate potassium phosphate bismuth(V) fluoride iron(II) iodide mercury(II) hydride zirconium(IV) bromide cadmium carbonate potassium ethoxide copper(II) fluoride sodium malonate silicon fluoride cobalt(II) perchlorate Potassium tartrate selenium oxybromide barium chlorate lithium aluminate niobium(IV) chloride cobalt(III) nitrate iron(III) hydroxide nickel(II) stannate thorium(IV) selenide tin(IV) hydride potassium metavanadate ruthenium(II) bromide promethium(III) chloride thorium(IV) carbide tantalum(V) chloride sodium thioantimonide calcium carbonate plutonium(IV) fluoride erbium(III) telluride calcium tetrahydroaluminate lead(II) hydroxide neptunium(IV) fluoride yttrium(III) vanadate sodium sulfide nonahydrate caesium nitrate silver subfluoride iron(III) oxide lead(II) chromate Cadmium nitrate Iron(II) ammonium sulfate sodium iodate chlorine dioxide iron(II,III) oxide selenium bromide gadolinium(III) fluoride lithium molybdate cadmium nitrate Strontium nitrate potassium periodate radium bromide manganese(IV) telluride protactinium(III) iodide arsenic tetraselenide protactinium(V) oxide iron(II) chloride sodium thiocyanate ytterbium(II) chloride tungsten(VI) dioxydiiodide osmium(VI) fluoride einsteinium(III) bromide vanadium(III) chloride lead(II) zirconate germanium(II) telluride barium peroxide iron(II) hydroxide Sodium citrate thallium(I,III) bromide cerium(III) oxalate potassium bromide Strontium sulfate indium(I) iodide copper(I) sulfide germanium(IV) chloride gadolinium(III) nitride vanadium(I) hydride barium hydride titanium(II) iodide sodium dithionite cadmium telluride tin(II) fluoroborate iridium(IV) selenide iron(II) phosphate tantalum(V) oxide Acetamide Potassium sulfite Sodium hydrogen phosphate bismuth(III) sulfide barium bromide iodine monochloride phosphorus pentabromide nickel(III) sulfide samarium(II) chloride magnesium silicite Methyl acetate tantalum(IV) carbide platinum(IV) iodide Ethylene glycol gold(III) oxide potassium hydride Ammonia chromium(III) picolinate copper(II) perchlorate silver sulfate potassium tetrachlorocuprate calcium fluoride sodium lactate potassium tetrachloropalladate promethium(III) bromide antimony pentafluoride bismuth(III) oxalate caesium bromide tungsten(IV) sulfide magnesium bromide hexahydrate scandium(III) bromide Chromium(VI) oxide mercury(I) chloride sodium hexametaphosphate silver oxalate potassium hexafluorotitanate palladium(II) oxide lithium oxide arsenic tritelluride cerium(III) chloride protactinium(IV) fluoride Lead(II) chloride calcium boride nickel(II) hydroxide ytterbium(III) chloride hexahydrate chromium(VI) carbonyl Zinc bromide osmium(III) bromide lithium perrhenate molybdenum(II) oxide uranium(VI) carbide Dichloroacetic acid caesium selenide copper(II) stearate europium(III) nitride nickel(II) antimonide xenon hexafluoride lead(II) lactate tantalum(IV) chloride rhenium(IV) telluride lithium hydride Citric acid lead(II) oxide europium(II) selenide sodium hexafluorophosphate thallium(I) selenide plutonium(III) chloride uranium(V) bromide tungsten(VI) dioxydichloride Aluminium chloride Magnesium sulfate ytterbium(II) iodide iron(II) fluoride tetrahydrate fluorine perchlorate rhodium(VI) fluoride disulfur decafluoride Lead(II) acetate cobalt(II) chromate silver nitrate mercury(II) fluoride Potassium iodide berkelium(II) oxide ruthenium(VI) fluoride samarium(III) sulfide scandium(III) oxide osmium(III) iodide cadmium cyanide gallium(III) selenide tantalum(III) nitride sodium sulfite rhodium(IV) oxide iron(II) sulfate Sodium sulfate chromium(III) sulfate phosphorus pentasulfide arsenic oxychloride potassium carbonate yttrium(III) antimonide protactinium(V) fluoride gold(I) chloride strontium peroxide thallium(I) sulfide molybdenum(III) fluoride cobalt(II) molybdate lithium bromide germanium(IV) bromide rhenium(VII) sulfide sodium sulfide pentahydrate gadolinium(III) chloride hexahydrate lithium hydroxide dysprosium(III) fluoride chromium(II,III) oxide magnesium succinate iridium(IV) bromide indium(II) telluride titanium(IV) bromide lithium metaborate bismuth(III) subnitrate cadmium acetate calcium nitrate plutonium(III) hydride barium selenate Potassium nitrite copper(II) chromate arsenic pentachloride beryllium sulfide rhenium(VI) oxide thorium(IV) sulfide manganese(II) acetate potassium nitrate beryllium sulfate trihydrate chromium(III) iodide scandium(III) nitrate palladium(II) telluride Potassium bicarbonate gallium(III) hydroxide chromium(III) bromide rhodium(IV) fluoride palladium(II) sulfate plutonium(VI) fluoride cobalt(III) sulfide radium hydroxide terbium(IV) oxide phosphorus triiodide gadolinium(III) sulfide mercury(II) acetate barium manganate thallium(III) oxide iridium(III) fluoride gallium(III) hydride palladium(II) fluoride lithium benzoate copper(I) chloride indium(I,III) iodide Hydrocyanic acid lead(II) metavandate barium perchlorate Cobalt(II) nitrate thallium(I) chlorate silver chloride sulfur tetrachloride berkelium(III) chloride Copper(II) chloride hafnium nitride gold(II) selenide gallium(III) phosphide curium(II) oxide potassium nitrite potassium adipate sodium tetraborate tungsten(VI) chloride chromium(IV) fluoride niobium(IV) selenide Copper(I) chloride indium(II) sulfide rhenium(VI) fluoride fluorine monoxide americium dioxide lead(IV) oxide bismuth(III) oxynitrate osmium(I) iodide cadmium iodide caesium chlorate manganese(II) telluride calcium silicide nickel(II) arsenide chlorine trioxide terbium(IV) silicide barium titanate zinc tungstate terbium(III) sulfide molybdenum(III) hexacarbonyl xenon tetroxide uranium(III) bromide Barium hydroxide lithium iodide Glycerol technetium(IV) oxide silver azide lead(II) perchlorate zinc cyanide barium tungstate germanium(IV) selenide zirconium(IV) silicide sodium chlorite silver bromide neodymium(III) iodide hafnium chloride zinc stearate Lysergic acid diethylamide (LSD) zirconium(III) nitride arsenic trifluoride californium(IV) fluoride barium sulfate Phenol molybdenum(VI) oxide potassium chlorate strontium titanate antimony trifluoride barium bromide dihydrate uranium(IV) bromide phosphorus trichloride nickel(II) acetate zirconium(IV) carbide aluminium phosphate selenium tetrachloride potassium superoxide Lactic acid boron arsenide platinum(IV) chloride tantalum hydride potassium binoxalate berkelium(III) oxide rubidium hydroxide zirconium(IV) phosphate selenium monosulfide tellurium hexafluoride niobium(III) bromide Potassium sulfate palladium(II) iodide zinc caprylate potassium sulfate gold(I) sulfide dysprosium(III) sulfide protactinium(V) bromide Nitric acid tin(II) chloride potassium pyrophosphate Sodium bromide silver tetrachloroaluminate cerium(III) sulfide indium(I) chloride titanium(III) iodide copper(II) bromide gallium(III) telluride iron(II) chloride dihydrate yttrium(II) carbide titanium(III) oxide Potassium chlorate iron(III) fluoride tin(IV) selenide zirconium(IV) fluoride Urea vanadium(V) fluoride gold(III) fluoride tantalum(V) bromide chromium(III) telluride copper(II) chloride dihydrate lithium sulfate zirconium(IV) chloride Carbon disulfide sodium persulfate thulium(III) iodide caesium nitrite silicon boride neptunium(IV) oxide copper(II) nitrate beryllium hydride rhodium(V) fluoride bismuth(III) formate antimony trisulfide boron phosphate uranium(IV) iodide arsenic diphosphide calcium cyanamide erbium(III) boride tin(IV) oxide barium chromate praseodymium(III) fluoride gold(I,III) chloride germanium(IV) hydride erbium(III) formate tungsten(VI) bromide barium boride plutonium(III) oxide niobium(III) nitride americium(III) chloride disulfur dichloride uranium(IV) chloride mercury(II) perchlorate barium thiosulfate lanthanum carbide samarium(III) fluoride potassium trifluoromethanesulfonate europium(III) sulfate iron(II) titanate lithium formate strontium hydride tetrasulfur tetranitride chromium(III) phosphate ruthenium(II) chloride samarium(II) iodide zinc titanate sodium antimonide lead(II) cyanide antimony pentasulfide potassium thiocyanate platinum(II) oxide californium(II) bromide germanium(IV) iodide palladium(IV) selenide Potassium dichromate sulfur tetrafluoride caesium vanadate strontium tungstate dysprosium(III) chloride chromium(II) selenide Acetaldehyde praseodymium(III) chloride Hexafluorosilicic acid aluminium formate copper(II) sulfate iron(III) iodide thallium(I) selenate vanadium(IV) fluoride sodium tosylate europium(II) sulfide Chromium(III) nitrate lead(II) selenate lead(II) molybdate copper(I) selenide strontium molybdate cobalt(II) tungstate calcium citrate manganese(II) oxide thulium(III) fluoride Ammonium hydroxide gadolinium(III) bromide vanadium(IV) telluride fluorine dioxide silver selenide bromine dioxide molybdenum(IV) oxide uranium(V) chloride silver thiocyanate sodium acrylate lead(II) sulfide cadmium oxalate mercury(I) bromide aluminium sulfate lithium diisopropylamide nickel(II) tungstate zinc nitrate tin(IV) bromide tungsten(VI) oxytetrafluoride Iron(III) chloride rubidium peroxide Tartaric acid thulium(II) chloride bismuth(III) vanadate berkelium(IV) fluoride lithium sulfide chlorine pentafluoride beryllium chloride caesium peroxide sodium azide actinium(III) oxide gold(I) bromide barium chloride zirconium(IV) tungstate zinc chromate bismuth(IV) peroxide platinum(II) chloride barium niobate cobalt(II) nitrate hexahydrate caesium periodate nickel(III) oxide gallium(III) iodide silver acetylide cobalt(II) chloride bromine trifluoride rubidium nitrate europium(III) oxide potassium dichromate selenium difluoride dysprosium(III) hydride sodium peroxide selenium dichloride iodine monobromide thulium(III) oxide selenium trioxide lithium perchlorate gold(I) iodide Potassium chromate barium sulfide Magnesium nitrate platinum(II) cyanide cobalt(III) oxide monohydrate silver molybdate neptunium(III) chloride Nickel nitrate iron(III) arsenic nitrogen trifluoride Cobalt(II) sulfate cobalt(IV) sulfide iridium(III) iodide tantalum(II) oxide Potassium iodate zirconium(IV) oxide xenon tetrafluoride molybdenum(V) chloride antimony pentoxide silver tribromide sodium phosphate thallium(I) carbonate holmium chloride rhenium(III) chloride californium(III) oxide gold(I) cyanide tin(II) fluoride Strontium chloride potassium pyrosulfate calcium nitride cobalt(II) naphthenate lead(II) butanoate zinc oxalate thallium(I) iodide technetium(VI) fluoride platinum(II) telluride sodium cyanide osmium(III) chloride platinum(IV) telluride potassium acetate xenon dioxydifluoride cobalt(II) oxide plutonium(III) bromide caesium molybdate potassium perchlorate berkelium(III) fluoride cobalt(II) nitrate sodium tetrafluoroaluminate beryllium nitrate trihydrate beryllium selenide molybdenum(II) chloride sodium selenate europium(III) iodide silicon hydride arsenic triiodide aluminium hydroxide tin(IV) iodide rubidium chloride Chloroacetic acid bismuth(III) fluoride gallium(III) oxide tungsten(V) oxytrichloride Potassium bromide uranium(V) oxide silver perrhenate Iron(III) sulfate vanadium(II) chloride thallium(III) chloride rubidium iodate strontium nitrate potassium fluoroborate ruthenium(III) bromide silver fluoride gold(II) telluride molybdenum(V) oxide lutetium bromide molybdenum(IV) iodide rubidium hydride platinum(III) chloride terbium(III) selenide magnesium nitrate curium(III) bromide indium(II) bromide neptunium(II) oxide vanadium(III) sulfide uranium(VI) chloride phosphorus tetrasulfide iridium(VI) telluride indium(III) bromide iron(III) dichromate curium(III) fluoride sodium bicarbonate cobalt(II) stannate uranium(II) sulfide palladium(IV) fluoride platinum(IV) oxide bismuth(III) phosphate strontium chromate caesium chromate potassium perruthenate potassium aluminate caesium sulfide holmium sulfide cobalt(II) cyanide calcium iodide thorium(IV) fluoride phosphorus tetrahydride platinum(II) sulfide potassium hexachlororuthenate yttrium(III) formate molybdenum(III) nitride phosphorus pentafluoride holmium formate gadolinium(III) oxide platinum(IV) chloride pentaahydrate Calcium hydroxide sodium superoxide cadmium oxide selenium tetrabromide zinc oxide thallium(I) acetate lithium telluride copper(II) cyanide caesium oxalate ferric nitrate gold(V) fluoride beryllium acetylacetonate Chloroauric acid uranium(III) hydride strontium zirconate selenium tetrafluoride thallium(III) nitrate neodymium(III) fluoride sodium cyanate rhenium(IV) bromide thorium(II) carbonate vanadium(IV) oxide iron(III) oxalate iron(III) metavanadate manganese(II) selenide boron phosphide beryllium iodide osmium(VIII) oxide europium(III) fluoride sodium periodate water sodium bisulfide zinc ferrocyanide lead(II) telluride rubidium superoxide cadmium chloride rubidium acetylacetonate uranyl nitrate bismuth(III) molybdate Saccharose cobalt(II) aluminate scandium(III) telluride diboron tetrachloride lead(II) chlorate magnesium carbonate neodymium(II) chloride plutonium(II) sulfide silver(I,III) oxide zirconium(II) hydride germanium(II) bromide Manganese(II) chloride tellurium tetrabromide plutonium(III) nitride selenium dibromide iodine monofluoride thulium(II) iodide chlorine trifluoride molybdenum(III) sulfide neptunium(V) fluoride Potassium thiocyanate dysprosium(III) nitride vanadium(V) oxytrifluoride gallium(III) fluoride trihydrate lithium fluoride scandium(III) sulfide arsenic pentoxide TRIS silicon iodide strontium dithionate titanium(III) fluoride vanadium(III) iodide samarium(II) bromide rhodium(III) iodide curium(III) iodide lutetium sulfide promethium(III) oxide titanium(IV) ethoxide beryllium nitrate tetrahydrate sodium metavanadate copper(I) telluride manganese(II) fluoride neptunium(III) fluoride uranium(IV,V) oxide chromium(VI) fluoride ytterbium(III) iodide boron decahydride calcium dihydrogen phosphate monohydrate beryllium telluride sulfur dichloride samarium(III) oxide ammonium sulfate tungsten(VI) oxide vanadium(V) oxytrichloride zinc perchlorate magnesium sulfate thorium(II) iodide thallium(I) triiodide silver selenite vanadyl sulfate yttrium(II) hydride copper(I) fluoride samarium(III) formate Bismuth(III) chloride uranium(III) iodide osmium(II) cyclopentadienide zinc borate magnesium silicide bismuth(III) orthotantalate sodium borohydride uranium(VI) fluoride hafnium orthosilicate potassium xanthogenate osmium(V) fluoride holmium silicide rhenium(VII) oxide erbium(III) nitride magnesium bromide indium(III) chloride barium oxalate neodymium(III) sulfide bismuth(III) oxyiodide samarium(III) chloride cadmium silicate iridium(IV) oxide dihydrate vanadium(V) oxide lanthanum sulfide neodymium(III) nitrate titanium(IV) iodide zirconium(IV) telluride potassium sulfide praseodymium(III) bromide rhodium(VII) sulfide cadmium sulfate lead(II) thiocyanate neodymium(III) gallate zinc iodate magnesium iodide europium(III) formate erbium(III) niobate thulium(II) chloride heptahydrate cerium(III) vanadate cerium(IV) oxide uranium(VI) oxide barium hexafluorosilicate platinum(IV) oxide monohydrate thulium(II) bromide arsenic trisulfide beryllium hydroxide diboron trisulfide phosphorus tetraiodide nitrogen monoxide platinum(II) iodide zirconium(III) iodide caesium hydroxide europium(III) chloride potassium oxide zinc lactate beryllium acetate aluminium phosphide potassium metabisulfite magnesium hydride Fructose rhodium(III) sulfide praseodymium(II) iodide plutonium(II) hydride cadmium phosphide Zinc sulfate potassium telluride cobalt(II) titanate sodium fluorosilicate mercury(II) iodide tantalum(IV) iodide neodymium(II) bromide potassium tartrate sodium pyrosulfate Phosphoric acid gold sulfuric acid barium metasilicate Chromium(III) sulfate indium(III) sulfate lithium superoxide molybdenum(III) oxide cadmium chromate EDTA, disodium salt caesium sulfate strontium selenate silver difluoride barium arsenate erbium(II) telluride silver phosphate uranium(IV) telluride iron(III) fluoride trihydrate cyanide thallium(I) bromide sodium molybdate cadmium fluoride lutetium fluoride cadmium selenide barium metaphosphate sodium diacetate curium(IV) oxide zirconium(IV) phosphide Ammonium sulfate phosphorus trisulfide lead(II,IV) oxide europium(II) iodide indium(I) bromide titanium(II) sulfide nickel(II) carbonate diboron tetrafluoride Sodium nitrate gallium trichloride neptunium(VI) fluoride Hydroiodic acid tantalum(II) sulfide Calcium nitrate magnesium nitride copper(I) thiocyanate caesium tetraiodocadamate ytterbium(V) silicide zinc phosphide manganese(II) phosphate molybdenum(IV) selenide tungsten(V) chloride einsteinium(II) chloride zinc sulfate nickel(II) carbonyl Silver sulfate uranium(VI) selenide plutonium(II) selenide indium(I) fluoride vanadium(II) silicide antimony orthoniobate beryllium carbide chromium(IV) oxide nickel(II) chloride hexahydrate zinc sulfide calcium hypophosphite Ethanol Pyridine titanium(IV) carbide radium sulfate plutonium(III) iodide barium iodide dihydrate selenium dioxide Propan-2-ol titanium(IV) silicide gadolinium(II) iodide chlorine monoxide germanium(IV) oxide holmium oxide lithium niobate lithium hexafluoroaluminate disilver dichloride antimony trioxide cobalt(II) hydroxide Trichloroacetic acid cobalt(III) hydroxide vanadium(III) fluoride plutonium(II) oxide niobium(IV) bromide copper(II) acetate thorium(IV) iodide einsteinium(III) fluoride ytterbium(II) bromide potassium thiosulfate silver thioantimonate palladium(II) sulfide niobium(III) fluoride phosphorus tribromide manganese(II) iodide tetrahydrate iridium(II) chloride sodium monofluoroacetate osmium(VII) fluoride Sodium acetate terbium(III) iodide copper(II) pyrophosphate bismuth(III) telluride uranium(III) chloride potassium permanganate strontium acetate neodymium(III) chloride calcium carbide ruthenium(III) chloride Formaldehyde Mannitol chlorine perchlorate aluminium arsenide Potassium cyanide palladium(IV) oxide titanium(III) phosphide gallium(III) nitrate gallium(II) selenide tellurium tetrachloride holmium nitride tungsten(IV) selenide bismuth(III) hydroxide calcium hydride sodium bromide tungsten(III) iodide Magnesium chloride osmium(II) iodide erbium(III) oxide vanadium(IV) bromide dysprosium(II) boride praseodymium(III) telluride cadmium hydroxide cobalt(II) selenide palladium(II) chloride arsenic mercury(II) sulfate indium(III) hydroxide caesium superoxide phosphorus trihydride caesium telluride mercury(II) fulminate chromium(II) fluoride lead(II) phosphate copper(I) acetate sodium polyphosphate yttrium(III) nitride tin(II) bromide chromium(VI) oxide nickel(II) fluoborate calcium selenide thorium(IV) chloride antimony tetroxide samarium(III) selenide calcium arsenate copper(II) sulfide indium(III) oxide barium aluminate lithium carbonate tantalum(III) fluoride copper(I) cyanide Sodium potassium tartrate Thiourea scandium(III) iodide sodium bromate boron trichloride sodium metatitanate antimony triselenide thallium(I) ethoxide iridium(IV) sulfide potassium hydroxide Cadmium sulfate scandium(III) formate barium zirconate barium stannate cobalt(III) chloride barium oxide sodium nitrite Chromium(III) chloride caesium permanganate lead(II) fluoride sodium xylenesulfonate osmium(IV) telluride einsteinium(III) oxide lead(II,II,IV) oxide potassium chloride ytterbium(II) selenide dysprosium(III) oxide rubidium tetrachloroaluminate hafnium boride Sodium dichromate cerium(II) iodide hafnium sulfide lithium arsenate beryllium fluoride iodine(III) fluoride uranium(II) oxide potassium iodate strontium metaborate uranium(III) fluoride protactinium(V) chloride lithium nitrate vanadium(III) nitride terbium(III) nitride calcium peroxide barium orthovanadate neptunium(III) iodide tin(IV) fluoride niobium(IV) iodide osmium(IV) oxide copper(II) oxide rhodium(III) sulfate neodymium(III) bromide arsenic pentaselenide sodium chromate phosphorus trifluoride thorium(II) hydride niobium(V) fluoride dinitrogen trioxide sodium niobate palladium(II) selenide Sodium hydrogen carbonate platinum(IV) sulfide sodium fluoride thallium(I) sulfate Nickel sulfate tin(II) oxide barium nitride osmium(IV) selenide antimony dichlorotrifluoride ytterbium(II) fluoride indium(III) antimonide rhodium(III) chloride silver chlorate nitrogen trichloride sodium pyrophosphate diarsenic tetrahydride niobium(IV) telluride arsenic dioxide europium(II) silicide ammonium bicarbonate praseodymium(II) telluride cobalt(II) acetate holmium bromide ruthenium(IV) fluoride titanium(IV) oxide ammonium nitrate lithium tantalate barium carbide californium(II) iodide gold(III) hydroxide barium chloride dihydrate cerium(II) fluoride manganese(II) bromide caesium carbonate osmium(IV) chloride uranium(V,VI) oxide radium fluoride dinitrogen pentoxide neptunium(IV) bromide praseodymium(II) fluoride sulfur dioxide thallium(I) formate strontium hydroxide tungsten(IV) chloride lutetium boride erbium(III) hydride tellurium monoiodide erbium(III) bromide selenium oxychloride osmium(VIII) fluoride chromyl fluoride lithium amide arsenic phosphide iodine heptafluoride potassium oxalate ytterbium(III) fluoride iron(II) nitrate ammonium chloride barium cyanide Bismuth(III) nitrate xenon difluoride tin(II) iodide magnesium sulfide rubidium tetrafluoroaluminate scandium(III) fluoride zinc acetate beryllium nitride Sodium chlorate rhodium(III) fluoride iridium(III) oxide zirconium(III) bromide sodium stannate mercury(I) fluoride plutonium(IV) oxide cerium(III) nitride molybdenum(IV) chloride nickel(II) selenide lithium iodide trihydrate thallium(III) fluoride gold(III) sulfide copper(II) telluride potassium selenide palladium(II) cyanide sucrose lanthanum aluminate mercury(II) telluride barium carbonate beryllium silicate bismuth(III) chloride tungsten(II) iodide tantalum(III) bromide hafnium silicide rubidium sulfide copper(I) acetylide lead(II) arsenite tungsten(III) bromide arsenic hemiselenide tungsten(V) oxytribromide calcium perrhenate neptunium(III) sulfide rhodium(III) nitrate cobalt(III) fluoride manganese(IV) selenide silicon nitride uranium(III) nitride iridium(IV) fluoride neptunium(III) bromide tungsten(VI) fluoride rhodium(III) bromide californium(II) chloride lead(II) azide tungsten(IV) silicide lead(II) acetate strontium aluminate chromium(III) nitrate ruthenium(III) iodide titanium(III) sulfide Formic acid gold(III) chloride yttrium(III) phosphide calcium chloride hexahydrate cobalt(II) iodide calcium pyrophosphate tin(II) telluride lead(IV) fluoride sodium benzoate Butyric acid Manganese(II) sulfate niobium(IV) fluoride strontium boride iridium(VI) fluoride platinum(IV) fluoride neodymium(II) telluride tantalum(IV) silicide mercury(II) iodate barium sulfite antimony trichloride mercury(I) iodide phosphorus heptasulfide vanadium(III) oxide manganese(II) chloride rhodium(IV) sulfide nitrogen triiodide gallium(III) arsenide rhodium(IV) selenide magnesium glycerophosphate strontium fluoride tantalum(IV) bromide copper(II) molybdate magnesium titanate praseodymium(IV) fluoride manganese(III) oxide vanadium(II) fluoride nickel(II) sulfate chromium(IV) bromide lead(II) iodate selenium dioxydifluoride zinc propionate tellurium dichloride cobalt(II) sulfate thallium(I) telluride titanium(IV) chloride sodium ferrocyanide potassium diphosphate aluminium fluoride aluminium oxide lead(II) sulfite Iodic acid tantalum(V) iodide silver iodide copper(II) arsenate palladium(II) acetate calcium cyanide gallium(I,III) chloride neodymium(II) sulfide tungsten(IV) carbide copper(II) hydroxide niobium(V) chloride aluminium iodide arsenic tetraoxide mercury(II) selenide nickel(II) titanate osmium(IV) sulfide phosphorus oxychloride yttrium(III) fluoride sodium telluride iodine pentafluoride Potassium nitrate magnesium fluoride beryllium borohydride mercury(II) chloride iridium(IV) iodide Antimony(III) chloride barium iodide Acetonitrile lithium citrate aluminium bromide beryllium formate titanium(II) hydride Potassium hydroxide ytterbium(III) oxide calcium metasilicate iron(II) acetate rubidium bromide strontium oxide zinc antimonide aluminium dodecaboride cobalt(II) ferricyanide tungsten(IV) oxide rhenium(III) oxide selenium hexasulfide diboron trioxide rubidium acetate rhodium(III) oxide oxygen(I) fluoride indium(III) nitride tungsten(VI) oxytetrachloride rhodium(IV) telluride tungsten(IV) iodide tungsten(IV) telluride sodium metabisulfite Dimethylglyoxime beryllium bromide copper(II) ferrocyanide potassium propionate gallium(III) sulfide vanadium(IV) silicide phosphorus nonasulfide yttrium(III) chloride tungsten(V) fluoride lithium azide thulium(III) sulfide Malonic acid potassium bisulfate praseodymium(III) sulfide titanium(III,IV) oxide diboron hexahydride gallium(II) chloride erbium(III) iodide tin(II) sulfate cobalt(II) sulfide silver dichromate potassium ferricyanide hafnium iodide lead(II) metasilicate disulfur tetrafluoride lead(II) niobate caesium metaborate manganese(II) chloride tetrahydrate tantalum(IV) selenide phosphorus pentoxide zinc arsenite molybdenum(II) bromide copper(I) azide cerium(IV) fluoride yttrium(III) iodide chromium(II) iodide indium(III) sulfide gadolinium(II) selenide strontium sulfide calcium chlorate nickel(II) fluoride thorium(III) sulfide barium tetraiodomercurate europium(II) bromide ytterbium(III) chloride Antimony(V) chloride protactinium(V) iodide tin(II) sulfide sodium hydride caesium bromate lead(II) oxalate Mercury(II) nitrate gallium(II) sulfide cadmium bromide lead(II) selenite magnesium oxide tin(II) selenide Methanol Hydrochloric acid cerium(III) iodide vanadium(IV) selenide mercury(II) sulfide trisilver triiodide potassium borohydride Sulfurous acid neodymium(III) oxide cadmium molybdate neptunium(III) nitride thallium(I) chloride titanium(II) bromide molybdenum(IV) telluride rhenium(III) bromide sulfur difluoride diiodine pentoxide manganese(II) iodide manganese(III) chloride copper(I) oxide rhenium(IV) chloride gold(I,III) fluoride neodymium(II) iodide potassium silicate silver oxide Propan-1-ol lithium antimonide uranium(IV) oxide protactinium(II) oxide chromium(II) carbide manganese(IV) oxide strontium iodide iron(III) bromide hafnium hydride barium nitrate potassium bicarbonate europium(II) niobate manganese(III) antimonide cadmium tungstate iridium(IV) chloride germanium(II) chloride lead(II) fluoroborate lead(II) arsenate thorium(IV) nitrate zinc selenide zinc nitrite molybdenum(III) bromide iridium(III) chloride iron(III) chromate nickel(III) boride cobalt(II) bromide erbium(III) vanadate chromium(IV) iodide lead(IV) chloride Potassium bromate potassium methoxide terbium(III) oxide praseodymium(III) oxide beryllium boride zinc molybdate calcium phosphide tellurium dibromide lithium tetrafluoroborate molybdenum(V) fluoride indium(III) telluride Nickel chloride Urethane Isobutanol rubidium telluride californium(III) chloride barium stearate nickel(II) telluride silver permanganate ruthenium(III) acetylacetonate chromyl chloride barium vanadate cobalt(II) acetate tetrahydrate bromine monoxide zirconium(II) iodide strontium phosphide chromium(V) fluoride cobalt(IV) fluoride rhenium(V) chloride tungsten(IV) bromide hafnium selenide scandium(III) trifluoromethanesulfonate Sodium nitrite sodium bisulfite hafnium bromide ruthenium(IV) sulfide acre apothecaries' drachm apothecaries' ounce apothecaries' pound avoirdupois bag barleycorn barn barn-megaparsec barrel beard-second beatment Beaufort scale bing bob bolls bolt broadsheet bushel butt cable carat Celsius cental chain chain, square chaldron candlepower chopin clockface angle cloth-yard compass points cord cord foot crown cubic fathom cubic foot cubic inch cubic yard cubit cup cwt hundredweight degree Delisle dessert spoon dicker drachm apothecaries' dram dram weight, avoirdupois drum east ell engineer's chain faggot Fahrenheit farthing fathom fathom cubic finger firkin five pounds fiver florin fluid drachm fluid ounce fodder folio foot foot, cubic foot, square fother furlong furlong, square gallon gallon Scots gill gill Scots goad grad gradian grain grand groat guinea Gunter's chain half crown half penny hand ha'penny hogshead Hoppus foot horsepower hundredweight inch inch, cubic inch, square iron jill keel Kelvin kennings kilderkin knot last league ledger left letter lb pound weight lb wt lbf lbm line link load measured rick megalithic yard megaparsec Mercalli mil mile mile, square mille millihelen minim Mohr scale monkey mutchkin nail nautical mile Newton degree north o'clock angle octavo ounce ounce apothecaries' ounce troy oz ounce pace pack palm peck penny pennyweight perch perch square perch pica pig pin pint pint Scots pipe point point font points of compass pole pole square pony poppy seed port pottle pound pound pound apothecaries' pound force lbf pound mass lbm pound troy pound weight lb wt poundal puncheon quart quarter length quarter quarter quarto quid quire radian Ramsden's chain Rankine ream R�aumur Richter rick right R�mer rood rod rod square rope sack sardine scam score score Scottish units scruple seam shackle shilling shipping ton six pence sixpenny joint slug smoot south sovereign span square chain square foot square furlong square inch square mile spoon square perch square pole square rod square yard starboard step stone sterling sterling Tudor surveyor's chain tablespoon tabloid tanner teaspoon ten bob ten pounds ten shillings tenner tertian thou three pence thruppence thumb tierce tod ton ton ton troy ounce troy pound truss tun tuppence two shillings two and six U.S. units of volume vergee west windle yard yard, cubic yard, square \ No newline at end of file diff --git a/quantgov/resources/nltk_pos_tags.txt b/quantgov/resources/nltk_pos_tags.txt deleted file mode 100644 index 6ef8326..0000000 --- a/quantgov/resources/nltk_pos_tags.txt +++ /dev/null @@ -1,35 +0,0 @@ -CC | coordinating conjunction -CD | cardinal digit -DT | determiner -EX | existential there (like: "there is" ... think of it like "there exists") -FW | foreign word -IN | preposition/subordinating conjunction -JJ | adjective 'big' -JJR | adjective, comparative 'bigger' -JJS | adjective, superlative 'biggest' -LS | list marker 1) -MD | modal could, will -NN | noun, singular 'desk' -NNS | noun plural 'desks' -NNP | proper noun, singular 'Harrison' -NNPS | proper noun, plural 'Americans' -PDT | predeterminer 'all the kids' -POS | possessive ending parent's -PRP | personal pronoun I, he, she -PRP$ | possessive pronoun my, his, hers -RB | adverb; i.e. very, silently, -RBR | adverb, comparative better -RBS | adverb, superlative best -RP | particle give up -TO | to go 'to' the store. -UH | interjection errrrrrrrm -VB | verb, base form take -VBD | verb, past tense took -VBG | verb, gerund/present participle taking -VBN | verb, past participle taken -VBP | verb, sing. present, non-3d take -VBZ | verb, 3rd person sing. present takes -WDT | wh-determiner which -WP | wh-pronoun who, what -WP$ | possessive wh-pronoun whose -WRB | wh-abverb where, when \ No newline at end of file From a21800b3ddc4eecdedf7410421ba6a569d08636f Mon Sep 17 00:00:00 2001 From: Stephen Strosko Date: Wed, 28 Aug 2019 18:00:32 -0400 Subject: [PATCH 4/7] flesch reading (#63) * flesch reading * textstat final * textstat optional import * flake8 fixes --- .gitignore | 2 + Pipfile | 3 +- Pipfile.lock | 489 +++++++++++++++++---------------- quantgov/nlp.py | 675 +++++++++++++++++++++++++--------------------- setup.py | 157 +++++------ tests/test_nlp.py | 10 +- 6 files changed, 708 insertions(+), 628 deletions(-) diff --git a/.gitignore b/.gitignore index d9d3190..c4a3838 100644 --- a/.gitignore +++ b/.gitignore @@ -88,3 +88,5 @@ ENV/ # Rope project settings .ropeproject + +.DS_Store diff --git a/Pipfile b/Pipfile index 66e4657..003b9ef 100644 --- a/Pipfile +++ b/Pipfile @@ -4,7 +4,8 @@ verify_ssl = true name = "pypi" [packages] -"e1839a8" = {path = ".", extras = ["nlp", "s3driver"], editable = true} +e1839a8 = {path = ".",extras = ["nlp", "s3driver"],editable = true} +textstat = "*" [dev-packages] "pytest-flake8" = "*" diff --git a/Pipfile.lock b/Pipfile.lock index 75fd26b..57ca94c 100644 --- a/Pipfile.lock +++ b/Pipfile.lock @@ -1,7 +1,7 @@ { "_meta": { "hash": { - "sha256": "02bcc79cf52a20e5172c477a0efb44d8d0d455235abefd4bfbc641e8c2453af2" + "sha256": "fb05682cf6faa7e15048ad5a8688ff5675dc87a47e496516bc7395c4c66a68f8" }, "pipfile-spec": 6, "requires": {}, @@ -16,24 +16,24 @@ "default": { "boto3": { "hashes": [ - "sha256:95ac50b1905e0aa0344a2a733d76c44af81b2cc51304386b94b0ef669d8d19bc", - "sha256:b227764ab3dcb4b55d54dd90c7676846f153b1e29ed259081ffc34b064a6ff21" + "sha256:366a1f3ec37b9434f25247cbe876f9ca1b53d35e35af18f74c735445100b4bc4", + "sha256:e7718b48cd073ad59a99a33d14252319dfaf550be3682b0c6a58da052fb05fcc" ], - "version": "==1.8.5" + "version": "==1.9.217" }, "botocore": { "hashes": [ - "sha256:4a2d4fc68fdc7113957cfc51b733a9900a9ba35e19e6d841a8b11fd6c20732f9", - "sha256:dcad4db0349dd11278d094a91434faf11500aae1991890a62d47a79923ca7ba3" + "sha256:68a0a22ca4e0e7e7ab482f63e21debfe402841fc49b8503dec0a7307b565d774", + "sha256:7a213b876e58b1b5380cf30faa05ba45073692ad4a3cc803ba763082a36436bb" ], - "version": "==1.11.5" + "version": "==1.12.217" }, "certifi": { "hashes": [ - "sha256:376690d6f16d32f9d1fe8932551d80b23e9d393a8578c5633a2ed39a64861638", - "sha256:456048c7e371c089d0a77a5212fb37a2c2dce1e24146e3b7e0261736aaeaa22a" + "sha256:046832c04d4e752f37383b628bc601a7ea7211496b4638f6514d0e5b9acc4939", + "sha256:945e3ba63a0b9f577b1395204e13c3a231f9bc0223888be653286534e5873695" ], - "version": "==2018.8.24" + "version": "==2019.6.16" }, "chardet": { "hashes": [ @@ -44,18 +44,18 @@ }, "decorator": { "hashes": [ - "sha256:2c51dff8ef3c447388fe5e4453d24a2bf128d3a4c32af3fabef1f01c6851ab82", - "sha256:c39efa13fbdeb4506c476c9b3babf6a718da943dab7811c206005a4a956c080c" + "sha256:86156361c50488b84a3f148056ea716ca587df2f0de1d34750d35c21312725de", + "sha256:f069f3a01830ca754ba5258fde2278454a0b5b79e0d7f5c13b3b97e57d4acff6" ], - "version": "==4.3.0" + "version": "==4.4.0" }, "docutils": { "hashes": [ - "sha256:02aec4bd92ab067f6ff27a38a38a41173bf01bed8f89157768c1573f53e474a6", - "sha256:51e64ef2ebfb29cae1faa133b3710143496eca21c530f3f71424d77687764274", - "sha256:7a4bd47eaf6596e1295ecb11361139febe29b084a87bf005bf899f9a42edc3c6" + "sha256:6c4f696463b79f1fb8ba0c594b63840ebd41f059e92b31957c46b74a4599b6d0", + "sha256:9e4d7ecfc600058e07ba661411a2b7de2fd0fafa17d1a7f7361cd47b1175c827", + "sha256:a2aeea129088da402665e92e0b25b04b073c04b2dce4ab65caaa38b7ce2e1a99" ], - "version": "==0.14" + "version": "==0.15.2" }, "e1839a8": { "editable": true, @@ -67,224 +67,217 @@ }, "idna": { "hashes": [ - "sha256:156a6814fb5ac1fc6850fb002e0852d56c0c8d2531923a51032d1b70760e186e", - "sha256:684a38a6f903c1d71d6d5fac066b58d7768af4de2b832e426ec79c30daa94a16" + "sha256:c357b3f628cf53ae2c4c05627ecc484553142ca23264e593d327bcde5e9c3407", + "sha256:ea8b7f6188e6fa117537c3df7da9fc686d485087abf6ac197f9c46432f7e4a3c" ], - "version": "==2.7" + "version": "==2.8" }, "jmespath": { "hashes": [ - "sha256:6a81d4c9aa62caf061cb517b4d9ad1dd300374cd4706997aff9cd6aedd61fc64", - "sha256:f11b4461f425740a1d908e9a3f7365c3d2e569f6ca68a2ff8bc5bcd9676edd63" + "sha256:3720a4b1bd659dd2eecad0666459b9788813e032b83e7ba58578e48254e0a0e6", + "sha256:bde2aef6f44302dfb30320115b17d030798de8c4110e28d5cf6cf91a7a31074c" ], - "version": "==0.9.3" + "version": "==0.9.4" }, "joblib": { "hashes": [ - "sha256:333b9bf16ff015d6b56bf80b9831afdd243443cb84c7ff7b6e342f117e354c42", - "sha256:3e650621a6ec2b9cdda72ec3e0b0f04101f605a56ae0d0e54e3d18b16fcf29f4" + "sha256:21e0c34a69ad7fde4f2b1f3402290e9ec46f545f15f1541c582edfe05d87b63a", + "sha256:315d6b19643ec4afd4c41c671f9f2d65ea9d787da093487a81ead7b0bac94524" ], - "version": "==0.12.3" + "version": "==0.13.2" }, "nltk": { "hashes": [ - "sha256:fe0eda251be65843be86d7de9abfbf7161732256f742e623b21243ec47bdb718" + "sha256:bed45551259aa2101381bbdd5df37d44ca2669c5c3dad72439fa459b29137d94" ], - "version": "==3.3.0" + "version": "==3.4.5" }, "numpy": { "hashes": [ - "sha256:1c362ad12dd09a43b348bb28dd2295dd9cdf77f41f0f45965e04ba97f525b864", - "sha256:2156a06bd407918df4ac0122df6497a9c137432118f585e5b17d543e593d1587", - "sha256:24e4149c38489b51fc774b1e1faa9103e82f73344d7a00ba66f6845ab4769f3f", - "sha256:340ec1697d9bb3a9c464028af7a54245298502e91178bddb4c37626d36e197b7", - "sha256:35db8d419345caa4eeaa65cd63f34a15208acd87530a30f0bc25fc84f55c8c80", - "sha256:361370e9b7f5e44c41eee29f2bb5cb3b755abb4b038bce6d6cbe08db7ff9cb74", - "sha256:36e8dcd1813ca92ce7e4299120cee6c03adad33d89b54862c1b1a100443ac399", - "sha256:378378973546ecc1dfaf9e24c160d683dd04df871ecd2dcc86ce658ca20f92c0", - "sha256:419e6faee16097124ee627ed31572c7e80a1070efa25260b78097cca240e219a", - "sha256:4287104c24e6a09b9b418761a1e7b1bbde65105f110690ca46a23600a3c606b8", - "sha256:549f3e9778b148a47f4fb4682955ed88057eb627c9fe5467f33507c536deda9d", - "sha256:5e359e9c531075220785603e5966eef20ccae9b3b6b8a06fdfb66c084361ce92", - "sha256:5ee7f3dbbdba0da75dec7e94bd7a2b10fe57a83e1b38e678200a6ad8e7b14fdc", - "sha256:62d55e96ec7b117d3d5e618c15efcf769e70a6effaee5842857b64fb4883887a", - "sha256:719b6789acb2bc86ea9b33a701d7c43dc2fc56d95107fd3c5b0a8230164d4dfb", - "sha256:7a70f2b60d48828cba94a54a8776b61a9c2657a803d47f5785f8062e3a9c7c55", - "sha256:7b9e37f194f8bcdca8e9e6af92e2cbad79e360542effc2dd6b98d63955d8d8a3", - "sha256:83b8fc18261b70f45bece2d392537c93dc81eb6c539a16c9ac994c47fc79f09a", - "sha256:9473ad28375710ab18378e72b59422399b27e957e9339c413bf00793b4b12df0", - "sha256:95b085b253080e5d09f7826f5e27dce067bae813a132023a77b739614a29de6e", - "sha256:98b86c62c08c2e5dc98a9c856d4a95329d11b1c6058cb9b5191d5ea6891acd09", - "sha256:a3bd01d6d3ed3d7c06d7f9979ba5d68281f15383fafd53b81aa44b9191047cf8", - "sha256:c81a6afc1d2531a9ada50b58f8c36197f8418ef3d0611d4c1d7af93fdcda764f", - "sha256:ce75ed495a746e3e78cfa22a77096b3bff2eda995616cb7a542047f233091268", - "sha256:dae8618c0bcbfcf6cf91350f8abcdd84158323711566a8c5892b5c7f832af76f", - "sha256:df0b02c6705c5d1c25cc35c7b5d6b6f9b3b30833f9d178843397ae55ecc2eebb", - "sha256:e3660744cda0d94b90141cdd0db9308b958a372cfeee8d7188fdf5ad9108ea82", - "sha256:f2362d0ca3e16c37782c1054d7972b8ad2729169567e3f0f4e5dd3cdf85f188e" - ], - "version": "==1.15.1" + "sha256:03f2ebcbffcce2dec8860633b89a93e80c6a239d21a77ae8b241450dc21e8c35", + "sha256:078c8025da5ab9e8657edc9c2a1e9642e06e953bc7baa2e65c1aa9d9dfb7e98b", + "sha256:0fbfa98c5d5c3c6489cc1e852ec94395d51f35d9ebe70c6850e47f465038cdf4", + "sha256:1c841033f4fe6801648180c3033c45b3235a8bbd09bc7249010f99ea27bb6790", + "sha256:2c0984a01ddd0aeec89f0ce46ef21d64761048cd76c0074d0658c91f9131f154", + "sha256:4c166dcb0fff7cb3c0bbc682dfb5061852a2547efb6222e043a7932828c08fb5", + "sha256:8c2d98d0623bd63fb883b65256c00454d5f53127a5a7bcdaa8bdc582814e8cb4", + "sha256:8cb4b6ae45aad6d26712a1ce0a3f2556c5e1484867f9649e03496e45d6a5eba4", + "sha256:93050e73c446c82065b7410221b07682e475ac51887cd9368227a5d944afae80", + "sha256:a3f6b3024f8826d8b1490e6e2a9b99e841cd2c375791b1df62991bd8f4c00b89", + "sha256:bede70fd8699695363f39e86c1e869b2c8b74fb5ef135a67b9e1eeebff50322a", + "sha256:c304b2221f33489cd15a915237a84cdfe9420d7e4d4828c78a0820f9d990395c", + "sha256:f11331530f0eff69a758d62c2461cd98cdc2eae0147279d8fc86e0464eb7e8ca", + "sha256:fa5f2a8ef1e07ba258dc07d4dd246de23ef4ab920ae0f3fa2a1cc5e90f0f1888", + "sha256:fb6178b0488b0ce6a54bc4accbdf5225e937383586555604155d64773f6beb2b", + "sha256:fd5e830d4dc31658d61a6452cd3e842213594d8c15578cdae6829e36ad9c0930" + ], + "version": "==1.17.1" }, "pandas": { "hashes": [ - "sha256:11975fad9edbdb55f1a560d96f91830e83e29bed6ad5ebf506abda09818eaf60", - "sha256:12e13d127ca1b585dd6f6840d3fe3fa6e46c36a6afe2dbc5cb0b57032c902e31", - "sha256:1c87fcb201e1e06f66e23a61a5fea9eeebfe7204a66d99df24600e3f05168051", - "sha256:242e9900de758e137304ad4b5663c2eff0d798c2c3b891250bd0bd97144579da", - "sha256:26c903d0ae1542890cb9abadb4adcb18f356b14c2df46e4ff657ae640e3ac9e7", - "sha256:2e1e88f9d3e5f107b65b59cd29f141995597b035d17cc5537e58142038942e1a", - "sha256:31b7a48b344c14691a8e92765d4023f88902ba3e96e2e4d0364d3453cdfd50db", - "sha256:4fd07a932b4352f8a8973761ab4e84f965bf81cc750fb38e04f01088ab901cb8", - "sha256:5b24ca47acf69222e82530e89111dd9d14f9b970ab2cd3a1c2c78f0c4fbba4f4", - "sha256:647b3b916cc8f6aeba240c8171be3ab799c3c1b2ea179a3be0bd2712c4237553", - "sha256:66b060946046ca27c0e03e9bec9bba3e0b918bafff84c425ca2cc2e157ce121e", - "sha256:6efa9fa6e1434141df8872d0fa4226fc301b17aacf37429193f9d70b426ea28f", - "sha256:be4715c9d8367e51dbe6bc6d05e205b1ae234f0dc5465931014aa1c4af44c1ba", - "sha256:bea90da782d8e945fccfc958585210d23de374fa9294a9481ed2abcef637ebfc", - "sha256:d785fc08d6f4207437e900ffead930a61e634c5e4f980ba6d3dc03c9581748c7", - "sha256:de9559287c4fe8da56e8c3878d2374abc19d1ba2b807bfa7553e912a8e5ba87c", - "sha256:f4f98b190bb918ac0bc0e3dd2ab74ff3573da9f43106f6dba6385406912ec00f", - "sha256:f71f1a7e2d03758f6e957896ed696254e2bc83110ddbc6942018f1a232dd9dad", - "sha256:fb944c8f0b0ab5c1f7846c686bc4cdf8cde7224655c12edcd59d5212cd57bec0" - ], - "version": "==0.23.4" + "sha256:18d91a9199d1dfaa01ad645f7540370ba630bdcef09daaf9edf45b4b1bca0232", + "sha256:3f26e5da310a0c0b83ea50da1fd397de2640b02b424aa69be7e0784228f656c9", + "sha256:4182e32f4456d2c64619e97c58571fa5ca0993d1e8c2d9ca44916185e1726e15", + "sha256:426e590e2eb0e60f765271d668a30cf38b582eaae5ec9b31229c8c3c10c5bc21", + "sha256:5eb934a8f0dc358f0e0cdf314072286bbac74e4c124b64371395e94644d5d919", + "sha256:717928808043d3ea55b9bcde636d4a52d2236c246f6df464163a66ff59980ad8", + "sha256:8145f97c5ed71827a6ec98ceaef35afed1377e2d19c4078f324d209ff253ecb5", + "sha256:8744c84c914dcc59cbbb2943b32b7664df1039d99e834e1034a3372acb89ea4d", + "sha256:c1ac1d9590d0c9314ebf01591bd40d4c03d710bfc84a3889e5263c97d7891dee", + "sha256:cb2e197b7b0687becb026b84d3c242482f20cbb29a9981e43604eb67576da9f6", + "sha256:d4001b71ad2c9b84ff18b182cea22b7b6cbf624216da3ea06fb7af28d1f93165", + "sha256:d8930772adccb2882989ab1493fa74bd87d47c8ac7417f5dd3dd834ba8c24dc9", + "sha256:dfbb0173ee2399bc4ed3caf2d236e5c0092f948aafd0a15fbe4a0e77ee61a958", + "sha256:eebfbba048f4fa8ac711b22c78516e16ff8117d05a580e7eeef6b0c2be554c18", + "sha256:f1b21bc5cf3dbea53d33615d1ead892dfdae9d7052fa8898083bec88be20dcd2" + ], + "version": "==0.25.1" + }, + "pyphen": { + "hashes": [ + "sha256:3b633a50873156d777e1f1075ba4d8e96a6ad0a3ca42aa3ea9a6259f93f18921", + "sha256:e172faf10992c8c9d369bdc83e36dbcf1121f4ed0d881f1a0b521935aee583b5" + ], + "version": "==0.9.5" }, "python-dateutil": { "hashes": [ - "sha256:1adb80e7a782c12e52ef9a8182bebeb73f1d7e24e374397af06fb4956c8dc5c0", - "sha256:e27001de32f627c22380a688bcc43ce83504a7bc5da472209b4c70f02829f0b8" + "sha256:7e6584c74aeed623791615e26efd690f29817a27c73085b78e4bad02493df2fb", + "sha256:c89805f6f4d64db21ed966fda138f8a5ed7a4fdbc1a8ee329ce1b74e3c74da9e" ], - "version": "==2.7.3" + "markers": "python_version >= '2.7'", + "version": "==2.8.0" }, "pytz": { "hashes": [ - "sha256:a061aa0a9e06881eb8b3b2b43f05b9439d6583c206d0a6c340ff72a7b6669053", - "sha256:ffb9ef1de172603304d9d2819af6f5ece76f2e85ec10692a524dd876e72bf277" + "sha256:26c0b32e437e54a18161324a2fca3c4b9846b74a8dccddd843113109e1116b32", + "sha256:c894d57500a4cd2d5c71114aaab77dbab5eabd9022308ce5ac9bb93a60a6f0c7" + ], + "version": "==2019.2" + }, + "repoze.lru": { + "hashes": [ + "sha256:0429a75e19380e4ed50c0694e26ac8819b4ea7851ee1fc7583c8572db80aff77", + "sha256:f77bf0e1096ea445beadd35f3479c5cff2aa1efe604a133e67150bc8630a62ea" ], - "version": "==2018.5" + "version": "==0.7" }, "requests": { "hashes": [ - "sha256:63b52e3c866428a224f97cab011de738c36aec0185aa91cfacd418b5d58911d1", - "sha256:ec22d826a36ed72a7358ff3fe56cbd4ba69dd7a6718ffd450ff0e9df7a47ce6a" + "sha256:11e007a8a2aa0323f5a921e9e6a2d7e4e67d9877e85773fba9ba6419025cbeb4", + "sha256:9cf5292fcd0f598c671cfc1e0d7d1a7f13bb8085e9a590f48c010551dc6c4b31" ], - "version": "==2.19.1" + "version": "==2.22.0" }, "s3transfer": { "hashes": [ - "sha256:90dc18e028989c609146e241ea153250be451e05ecc0c2832565231dacdf59c1", - "sha256:c7a9ec356982d5e9ab2d4b46391a7d6a950e2b04c472419f5fdec70cc0ada72f" + "sha256:6efc926738a3cd576c2a79725fed9afde92378aa5c6a957e3af010cb019fac9d", + "sha256:b780f2411b824cb541dbcd2c713d0cb61c7d1bcadae204cdddda2b35cef493ba" ], - "version": "==0.1.13" + "version": "==0.2.1" }, "scikit-learn": { "hashes": [ - "sha256:0a718b5ffbd5053fb3f9e1a2e20b7c4f256dd8035e246b907d3117d20bac0260", - "sha256:1725540b754a9967778e9385e1ee2c8db50d5ab70ed835c9f5e36002ffabc169", - "sha256:3e3ce307d7c5c5811658ba8686b24b571a8244eaafe707665ad601f400d5ce98", - "sha256:42ad71502237c9fe300ecf157f5a394df717789a2dde541dd7034b539c70bdcc", - "sha256:42cba716db197e0d1670e2fc13c4cc4a86d5c5358120ccfee6ec427b154e74ff", - "sha256:47b4090b7686642e41176becb7c42ef3cc665d7ee0db5e7ea5d307ec9779327e", - "sha256:51d99a08c8bf689cf60c9d8dca6e3d3e5f6d762def85ad735dcea11fb528a89b", - "sha256:5f7577fbb2399a4712e96cf0e786638168940a876c33735a1b5d5a86ba4b1370", - "sha256:66bfc2b6b15db1725d03ea657ec9184ff09dcbf1ecd834ef85f2edc2c9cbba97", - "sha256:69a34d389d9ca4687ad00af4e11d53686771f484c37366f68617ef656bab16ab", - "sha256:75297f3dd6685f01555f1bb75846995d45650af417280b69c81bf11b6987aed5", - "sha256:9ebb38ab1d0ee143982aed561811903ac6c1abb512ae2b9019b3b65bde63ffb9", - "sha256:a402c1484fe65df42d5dbc22a58e0695fe3afe2b0b229aee2a09c6d60ba8e5c2", - "sha256:aad6b9aac1617bd7efa0450643888bbd3410679a94bc8680d9863825686ef369", - "sha256:ad4db28d3dc16c01df75ed6efb72524537de3839a5d179fcf94094359fc72ec5", - "sha256:b276739a5f863ccacb61999a3067d0895ee291c95502929b2ae56ea1f882e888", - "sha256:b3dc88c4d2bcb26ffc5afe16d053ae28317d7d1de083651defcd5453a04f1563", - "sha256:b3e4681253e95da5aa5c231889a32b084fd997962bf8beda6f796bf422f734b2", - "sha256:c3d852d49d6c1710089d4513702099fa6f8e1aebfedf222319d80c47b0a195f8", - "sha256:c6612e7e43988b8b5e1957150449493a55f9c059de641083df7a964f86f2d1e7", - "sha256:c69e5c6051366a6ac9600d730276db939b1a205e42504ec0b8371f154b0058db", - "sha256:ce121baa8e85ec27c3065281657dcd78adaab7dcb046c7fe96ad4e5a9dcb6610", - "sha256:ed2a9a9bea6ec443b7effe5695c9c168b7bf9a67df6d880729760feda871b6a3", - "sha256:efd842d70b87e3ef3429c3149840b9189d4441ca951ab0cec62c94a964e219d9", - "sha256:f1428af5c381f6eef30ffbc7e047b7c713d4efa5d7bf5e57b62b3fc8d387044b", - "sha256:f6c7bf8cd4de1640b760b47f4d28deb26dbbf9acbe0194cdff54a898e190d872", - "sha256:f8329ac2160ad8bbbac6a507374685ceca3f24ca427fa9ee61a501280e1972d9", - "sha256:fefba2a43b92f8393366093b60efbe984a72a2b41cce16b4002005e4104ef938" - ], - "version": "==0.19.2" + "sha256:1ac81293d261747c25ea5a0ee8cd2bb1f3b5ba9ec05421a7f9f0feb4eb7c4116", + "sha256:289361cf003d90b007f5066b27fcddc2d71324c82f1c88e316fedacb0dfdd516", + "sha256:3a14d0abd4281fc3fd2149c486c3ec7cedad848b8d5f7b6f61522029d65a29f8", + "sha256:5083a5e50d9d54548e4ada829598ae63a05651dd2bb319f821ffd9e8388384a6", + "sha256:777cdd5c077b7ca9cb381396c81990cf41d2fa8350760d3cad3b4c460a7db644", + "sha256:8bf2ff63da820d09b96b18e88f9625228457bff8df4618f6b087e12442ef9e15", + "sha256:8d319b71c449627d178f21c57614e21747e54bb3fc9602b6f42906c3931aa320", + "sha256:928050b65781fea9542dfe9bfe02d8c4f5530baa8472ec60782ea77347d2c836", + "sha256:92c903613ff50e22aa95d589f9fff5deb6f34e79f7f21f609680087f137bb524", + "sha256:ae322235def5ce8fae645b439e332e6f25d34bb90d6a6c8e261f17eb476457b7", + "sha256:c1cd6b29eb1fd1cc672ac5e4a8be5f6ea936d094a3dc659ada0746d6fac750b1", + "sha256:c41a6e2685d06bcdb0d26533af2540f54884d40db7e48baed6a5bcbf1a7cc642", + "sha256:d07fcb0c0acbc043faa0e7cf4d2037f71193de3fb04fb8ed5c259b089af1cf5c", + "sha256:d146d5443cda0a41f74276e42faf8c7f283fef49e8a853b832885239ef544e05", + "sha256:eb2b7bed0a26ba5ce3700e15938b28a4f4513578d3e54a2156c29df19ac5fd01", + "sha256:eb9b8ebf59eddd8b96366428238ab27d05a19e89c5516ce294abc35cea75d003" + ], + "version": "==0.21.3" }, "scipy": { "hashes": [ - "sha256:0611ee97296265af4a21164a5323f8c1b4e8e15c582d3dfa7610825900136bb7", - "sha256:08237eda23fd8e4e54838258b124f1cd141379a5f281b0a234ca99b38918c07a", - "sha256:0e645dbfc03f279e1946cf07c9c754c2a1859cb4a41c5f70b25f6b3a586b6dbd", - "sha256:0e9bb7efe5f051ea7212555b290e784b82f21ffd0f655405ac4f87e288b730b3", - "sha256:108c16640849e5827e7d51023efb3bd79244098c3f21e4897a1007720cb7ce37", - "sha256:340ef70f5b0f4e2b4b43c8c8061165911bc6b2ad16f8de85d9774545e2c47463", - "sha256:3ad73dfc6f82e494195144bd3a129c7241e761179b7cb5c07b9a0ede99c686f3", - "sha256:3b243c77a822cd034dad53058d7c2abf80062aa6f4a32e9799c95d6391558631", - "sha256:404a00314e85eca9d46b80929571b938e97a143b4f2ddc2b2b3c91a4c4ead9c5", - "sha256:423b3ff76957d29d1cce1bc0d62ebaf9a3fdfaf62344e3fdec14619bb7b5ad3a", - "sha256:42d9149a2fff7affdd352d157fa5717033767857c11bd55aa4a519a44343dfef", - "sha256:625f25a6b7d795e8830cb70439453c9f163e6870e710ec99eba5722775b318f3", - "sha256:698c6409da58686f2df3d6f815491fd5b4c2de6817a45379517c92366eea208f", - "sha256:729f8f8363d32cebcb946de278324ab43d28096f36593be6281ca1ee86ce6559", - "sha256:8190770146a4c8ed5d330d5b5ad1c76251c63349d25c96b3094875b930c44692", - "sha256:878352408424dffaa695ffedf2f9f92844e116686923ed9aa8626fc30d32cfd1", - "sha256:8b984f0821577d889f3c7ca8445564175fb4ac7c7f9659b7c60bef95b2b70e76", - "sha256:8f841bbc21d3dad2111a94c490fb0a591b8612ffea86b8e5571746ae76a3deac", - "sha256:c22b27371b3866c92796e5d7907e914f0e58a36d3222c5d436ddd3f0e354227a", - "sha256:d0cdd5658b49a722783b8b4f61a6f1f9c75042d0e29a30ccb6cacc9b25f6d9e2", - "sha256:d40dc7f494b06dcee0d303e51a00451b2da6119acbeaccf8369f2d29e28917ac", - "sha256:d8491d4784aceb1f100ddb8e31239c54e4afab8d607928a9f7ef2469ec35ae01", - "sha256:dfc5080c38dde3f43d8fbb9c0539a7839683475226cf83e4b24363b227dfe552", - "sha256:e24e22c8d98d3c704bb3410bce9b69e122a8de487ad3dbfe9985d154e5c03a40", - "sha256:e7a01e53163818d56eabddcafdc2090e9daba178aad05516b20c6591c4811020", - "sha256:ee677635393414930541a096fc8e61634304bb0153e4e02b75685b11eba14cae", - "sha256:f0521af1b722265d824d6ad055acfe9bd3341765735c44b5a4d0069e189a0f40", - "sha256:f25c281f12c0da726c6ed00535ca5d1622ec755c30a3f8eafef26cf43fede694" - ], - "version": "==1.1.0" + "sha256:0baa64bf42592032f6f6445a07144e355ca876b177f47ad8d0612901c9375bef", + "sha256:243b04730d7223d2b844bda9500310eecc9eda0cba9ceaf0cde1839f8287dfa8", + "sha256:2643cfb46d97b7797d1dbdb6f3c23fe3402904e3c90e6facfe6a9b98d808c1b5", + "sha256:396eb4cdad421f846a1498299474f0a3752921229388f91f60dc3eda55a00488", + "sha256:3ae3692616975d3c10aca6d574d6b4ff95568768d4525f76222fb60f142075b9", + "sha256:435d19f80b4dcf67dc090cc04fde2c5c8a70b3372e64f6a9c58c5b806abfa5a8", + "sha256:46a5e55850cfe02332998b3aef481d33f1efee1960fe6cfee0202c7dd6fc21ab", + "sha256:75b513c462e58eeca82b22fc00f0d1875a37b12913eee9d979233349fce5c8b2", + "sha256:7ccfa44a08226825126c4ef0027aa46a38c928a10f0a8a8483c80dd9f9a0ad44", + "sha256:89dd6a6d329e3f693d1204d5562dd63af0fd7a17854ced17f9cbc37d5b853c8d", + "sha256:a81da2fe32f4eab8b60d56ad43e44d93d392da228a77e229e59b51508a00299c", + "sha256:a9d606d11eb2eec7ef893eb825017fbb6eef1e1d0b98a5b7fc11446ebeb2b9b1", + "sha256:ac37eb652248e2d7cbbfd89619dce5ecfd27d657e714ed049d82f19b162e8d45", + "sha256:cbc0611699e420774e945f6a4e2830f7ca2b3ee3483fca1aa659100049487dd5", + "sha256:d02d813ec9958ed63b390ded463163685af6025cb2e9a226ec2c477df90c6957", + "sha256:dd3b52e00f93fd1c86f2d78243dfb0d02743c94dd1d34ffea10055438e63b99d" + ], + "version": "==1.3.1" }, "six": { "hashes": [ - "sha256:70e8a77beed4562e7f14fe23a786b54f6296e34344c23bc42f07b15018ff98e9", - "sha256:832dc0e10feb1aa2c68dcc57dbb658f1c7e65b9b61af69048abc87a2db00a0eb" + "sha256:3350809f0555b11f552448330d0b52d5f24c91a322ea4a15ef22629740f3761c", + "sha256:d16a0141ec1a18405cd4ce8b4613101da75da0e9a7aec5bdd4fa804d0e0eba73" ], - "version": "==1.11.0" + "version": "==1.12.0" }, "sqlalchemy": { "hashes": [ - "sha256:ef6569ad403520ee13e180e1bfd6ed71a0254192a934ec1dbd3dbf48f4aa9524" + "sha256:2f8ff566a4d3a92246d367f2e9cd6ed3edeef670dcd6dda6dfdc9efed88bcd80" ], - "version": "==1.2.11" + "version": "==1.3.8" }, "textblob": { "hashes": [ - "sha256:7c9ff21a47a382fa4f235e84ce9be10cca4b9d46b012b79af6e47ea81b478a18", - "sha256:8301812cbef9b2f288e14df904854f7457fccf2c52020b66d3f9bc1448cf042a" + "sha256:7ff3c00cb5a85a30132ee6768b8c68cb2b9d76432fec18cd1b3ffe2f8594ec8c", + "sha256:b0eafd8b129c9b196c8128056caed891d64b7fa20ba570e1fcde438f4f7dd312" ], - "version": "==0.15.1" + "version": "==0.15.3" + }, + "textstat": { + "hashes": [ + "sha256:5e1342bf87b4660f5437a36ce0a12cc987885187527c97c6b1f19557811df4d6", + "sha256:c50ad2691763c74508e35e554da2ad8aee748537c999388f93e218fcab9ab12f", + "sha256:fd225f95cb558fa2923b2bea4991f77e8dcd66eb9b544824a297b04a6a0d4425" + ], + "index": "pypi", + "version": "==0.5.6" }, "urllib3": { "hashes": [ - "sha256:a68ac5e15e76e7e5dd2b8f94007233e01effe3e50e8daddf69acfd81cb686baf", - "sha256:b5725a0bd4ba422ab0e66e89e030c806576753ea3ee08554382c14e685d117b5" + "sha256:b246607a25ac80bedac05c6f282e3cdaf3afb65420fd024ac94435cabe6e18d1", + "sha256:dbe59173209418ae49d485b87d1681aefa36252ee85884c31346debd19463232" ], - "version": "==1.23" + "markers": "python_version >= '3.4'", + "version": "==1.25.3" } }, "develop": { + "appnope": { + "hashes": [ + "sha256:5b26757dc6f79a3b7dc9fab95359328d5747fcb2409d331ea66d0272b90ab2a0", + "sha256:8b995ffe925347a2138d7ac0fe77155e4311a0ea6d6da4f5128fe4b3cbe5ed71" + ], + "markers": "sys_platform == 'darwin'", + "version": "==0.1.0" + }, "atomicwrites": { "hashes": [ - "sha256:0312ad34fcad8fac3704d441f7b317e50af620823353ec657a53e981f92920c0", - "sha256:ec9ae8adaae229e4f8446952d204a3e4b5fdd2d099f9be3aaf556120135fb3ee" + "sha256:03472c30eb2c5d1ba9227e4c2ca66ab8287fbfbbda3888aa93dc2e28fc6811b4", + "sha256:75a9445bac02d8d058d5e1fe689654ba5a6556a1dfd8ce6ec55a0ed79866cfa6" ], - "version": "==1.2.1" + "version": "==1.3.0" }, "attrs": { "hashes": [ - "sha256:4b90b09eeeb9b88c35bc642cbac057e45a5fd85367b985bd2809c62b7b939265", - "sha256:e0d0eb91441a3b53dab4d9b743eafc1ac44476296a2053b6ca3af0b139faf87b" + "sha256:69c0dbf2ed392de1cb5ec704444b08a5ef81680a61cb899dc08127123af36a79", + "sha256:f0b870f674851ecbfbbbd364d6b5cbdff9dcedbc7f3f5e18a6891057f21fe399" ], - "version": "==18.1.0" + "version": "==19.1.0" }, "backcall": { "hashes": [ @@ -293,35 +286,42 @@ ], "version": "==0.1.0" }, - "colorama": { + "decorator": { "hashes": [ - "sha256:463f8483208e921368c9f306094eb6f725c6ca42b0f97e313cb5d5512459feda", - "sha256:48eb22f4f8461b1df5734a074b57042430fb06e1d61bd1e11b078c0fe6d7a1f1" + "sha256:86156361c50488b84a3f148056ea716ca587df2f0de1d34750d35c21312725de", + "sha256:f069f3a01830ca754ba5258fde2278454a0b5b79e0d7f5c13b3b97e57d4acff6" ], - "markers": "sys_platform == 'win32'", - "version": "==0.3.9" + "version": "==4.4.0" }, - "decorator": { + "entrypoints": { "hashes": [ - "sha256:2c51dff8ef3c447388fe5e4453d24a2bf128d3a4c32af3fabef1f01c6851ab82", - "sha256:c39efa13fbdeb4506c476c9b3babf6a718da943dab7811c206005a4a956c080c" + "sha256:589f874b313739ad35be6e0cd7efde2a4e9b6fea91edcc34e58ecbb8dbe56d19", + "sha256:c70dd71abe5a8c85e55e12c19bd91ccfeec11a6e99044204511f9ed547d48451" ], - "version": "==4.3.0" + "version": "==0.3" }, "flake8": { "hashes": [ - "sha256:7253265f7abd8b313e3892944044a365e3f4ac3fcdcfb4298f55ee9ddf188ba0", - "sha256:c7841163e2b576d435799169b78703ad6ac1bbb0f199994fc05f700b2a90ea37" + "sha256:19241c1cbc971b9962473e4438a2ca19749a7dd002dd1a946eaba171b4114548", + "sha256:8e9dfa3cecb2400b3738a42c54c3043e821682b9c840b0448c0503f781130696" + ], + "version": "==3.7.8" + }, + "importlib-metadata": { + "hashes": [ + "sha256:23d3d873e008a513952355379d93cbcab874c58f4f034ff657c7a87422fa64e8", + "sha256:80d2de76188eabfbfcf27e6a37342c2827801e59c4cc14b0371c56fed43820e3" ], - "version": "==3.5.0" + "markers": "python_version < '3.8'", + "version": "==0.19" }, "ipython": { "hashes": [ - "sha256:007dcd929c14631f83daff35df0147ea51d1af420da303fd078343878bd5fb62", - "sha256:b0f2ef9eada4a68ef63ee10b6dde4f35c840035c50fd24265f8052c98947d5a4" + "sha256:1d3a1692921e932751bc1a1f7bb96dc38671eeefdc66ed33ee4cbc57e92a410e", + "sha256:537cd0176ff6abd06ef3e23f2d0c4c2c8a4d9277b7451544c6cbf56d1c79a83d" ], "index": "pypi", - "version": "==6.5.0" + "version": "==7.7.0" }, "ipython-genutils": { "hashes": [ @@ -332,10 +332,10 @@ }, "jedi": { "hashes": [ - "sha256:b409ed0f6913a701ed474a614a3bb46e6953639033e31f769ca7581da5bd1ec1", - "sha256:c254b135fb39ad76e78d4d8f92765ebc9bf92cbc76f49e97ade1d5f5121e1f6f" + "sha256:786b6c3d80e2f06fd77162a07fed81b8baa22dde5d62896a790a331d6ac21a27", + "sha256:ba859c74fa3c966a22f2aeebe1b74ee27e2a462f56d3f5f7ca4a59af61bfe42e" ], - "version": "==0.12.1" + "version": "==0.15.1" }, "mccabe": { "hashes": [ @@ -346,96 +346,118 @@ }, "more-itertools": { "hashes": [ - "sha256:c187a73da93e7a8acc0001572aebc7e3c69daf7bf6881a2cea10650bd4420092", - "sha256:c476b5d3a34e12d40130bc2f935028b5f636df8f372dc2c1c01dc19681b2039e", - "sha256:fcbfeaea0be121980e15bc97b3817b5202ca73d0eae185b4550cbfce2a3ebb3d" + "sha256:409cd48d4db7052af495b09dec721011634af3753ae1ef92d2b32f73a745f832", + "sha256:92b8c4b06dac4f0611c0729b2f2ede52b2e1bac1ab48f089c7ddc12e26bb60c4" + ], + "version": "==7.2.0" + }, + "packaging": { + "hashes": [ + "sha256:a7ac867b97fdc07ee80a8058fe4435ccd274ecc3b0ed61d852d7d53055528cf9", + "sha256:c491ca87294da7cc01902edbe30a5bc6c4c28172b5138ab4e4aa1b9d7bfaeafe" ], - "version": "==4.3.0" + "version": "==19.1" }, "parso": { "hashes": [ - "sha256:35704a43a3c113cce4de228ddb39aab374b8004f4f2407d070b6a2ca784ce8a2", - "sha256:895c63e93b94ac1e1690f5fdd40b65f07c8171e3e53cbd7793b5b96c0e0a7f24" + "sha256:63854233e1fadb5da97f2744b6b24346d2750b85965e7e399bec1620232797dc", + "sha256:666b0ee4a7a1220f65d367617f2cd3ffddff3e205f3f16a0284df30e774c2a9c" ], - "version": "==0.3.1" + "version": "==0.5.1" + }, + "pexpect": { + "hashes": [ + "sha256:2094eefdfcf37a1fdbfb9aa090862c1a4878e5c7e0e7e7088bdb511c558e5cd1", + "sha256:9e2c1fd0e6ee3a49b28f95d4b33bc389c89b20af6a1255906e90ff1262ce62eb" + ], + "markers": "sys_platform != 'win32'", + "version": "==4.7.0" }, "pickleshare": { "hashes": [ - "sha256:84a9257227dfdd6fe1b4be1319096c20eb85ff1e82c7932f36efccfe1b09737b", - "sha256:c9a2541f25aeabc070f12f452e1f2a8eae2abd51e1cd19e8430402bdf4c1d8b5" + "sha256:87683d47965c1da65cdacaf31c8441d12b8044cdec9aca500cd78fc2c683afca", + "sha256:9649af414d74d4df115d5d718f82acb59c9d418196b7b4290ed47a12ce62df56" ], - "version": "==0.7.4" + "version": "==0.7.5" }, "pluggy": { "hashes": [ - "sha256:6e3836e39f4d36ae72840833db137f7b7d35105079aee6ec4a62d9f80d594dd1", - "sha256:95eb8364a4708392bae89035f45341871286a333f749c3141c20573d2b3876e1" + "sha256:0825a152ac059776623854c1543d65a4ad408eb3d33ee114dff91e57ec6ae6fc", + "sha256:b9817417e95936bf75d85d3f8767f7df6cdde751fc40aed3bb3074cbcb77757c" ], - "version": "==0.7.1" + "version": "==0.12.0" }, "prompt-toolkit": { "hashes": [ - "sha256:1df952620eccb399c53ebb359cc7d9a8d3a9538cb34c5a1344bdbeb29fbcc381", - "sha256:3f473ae040ddaa52b52f97f6b4a493cfa9f5920c255a12dc56a7d34397a398a4", - "sha256:858588f1983ca497f1cf4ffde01d978a3ea02b01c8a26a8bbc5cd2e66d816917" + "sha256:11adf3389a996a6d45cc277580d0d53e8a5afd281d0c9ec71b28e6f121463780", + "sha256:2519ad1d8038fd5fc8e770362237ad0364d16a7650fb5724af6997ed5515e3c1", + "sha256:977c6583ae813a37dc1c2e1b715892461fcbdaa57f6fc62f33a528c4886c8f55" + ], + "version": "==2.0.9" + }, + "ptyprocess": { + "hashes": [ + "sha256:923f299cc5ad920c68f2bc0bc98b75b9f838b93b599941a6b63ddbc2476394c0", + "sha256:d7cc528d76e76342423ca640335bd3633420dc1366f258cb31d05e865ef5ca1f" ], - "version": "==1.0.15" + "version": "==0.6.0" }, "py": { "hashes": [ - "sha256:06a30435d058473046be836d3fc4f27167fd84c45b99704f2fb5509ef61f9af1", - "sha256:50402e9d1c9005d759426988a492e0edaadb7f4e68bcddfea586bc7432d009c6" + "sha256:64f65755aee5b381cea27766a3a147c3f15b9b6b9ac88676de66ba2ae36793fa", + "sha256:dc639b046a6e2cff5bbe40194ad65936d6ba360b52b3c3fe1d08a82dd50b5e53" ], - "version": "==1.6.0" + "version": "==1.8.0" }, "pycodestyle": { "hashes": [ - "sha256:682256a5b318149ca0d2a9185d365d8864a768a28db66a84a2ea946bcc426766", - "sha256:6c4245ade1edfad79c3446fadfc96b0de2759662dc29d07d80a6f27ad1ca6ba9" + "sha256:95a2219d12372f05704562a14ec30bc76b05a5b297b21a5dfe3f6fac3491ae56", + "sha256:e40a936c9a450ad81df37f549d676d127b1b66000a6c500caa2b085bc0ca976c" ], - "version": "==2.3.1" + "version": "==2.5.0" }, "pyflakes": { "hashes": [ - "sha256:08bd6a50edf8cffa9fa09a463063c425ecaaf10d1eb0335a7e8b1401aef89e6f", - "sha256:8d616a382f243dbf19b54743f280b80198be0bca3a5396f1d2e1fca6223e8805" + "sha256:17dbeb2e3f4d772725c777fabc446d5634d1038f234e77343108ce445ea69ce0", + "sha256:d976835886f8c5b31d47970ed689944a0262b5f3afa00a5a7b4dc81e5449f8a2" ], - "version": "==1.6.0" + "version": "==2.1.1" }, "pygments": { "hashes": [ - "sha256:78f3f434bcc5d6ee09020f92ba487f95ba50f1e3ef83ae96b9d5ffa1bab25c5d", - "sha256:dbae1046def0efb574852fab9e90209b23f556367b5a320c0bcb871c77c3e8cc" + "sha256:71e430bc85c88a430f000ac1d9b331d2407f681d6f6aec95e8bcfbc3df5b0127", + "sha256:881c4c157e45f30af185c1ffe8d549d48ac9127433f2c380c24b84572ad66297" ], - "version": "==2.2.0" + "version": "==2.4.2" }, - "pytest": { + "pyparsing": { "hashes": [ - "sha256:2d7c49e931316cc7d1638a3e5f54f5d7b4e5225972b3c9838f3584788d27f349", - "sha256:ad0c7db7b5d4081631e0155f5c61b80ad76ce148551aaafe3a718d65a7508b18" + "sha256:6f98a7b9397e206d78cc01df10131398f1c8b8510a2f4d97d9abd82e1aacdd80", + "sha256:d9338df12903bbf5d65a0e4e87c2161968b10d2e489652bb47001d82a9b028b4" ], - "version": "==3.7.4" + "version": "==2.4.2" }, - "pytest-flake8": { + "pytest": { "hashes": [ - "sha256:4f30f5be3efb89755f38f11bdb2a5e22d19a6f5faa73428f703a3292a9572cd3", - "sha256:c740ad6aa19e3958947d2118f70bed218caf1d2097039fb7318573a2a72f89a1" + "sha256:95b1f6db806e5b1b5b443efeb58984c24945508f93a866c1719e1a507a957d7c", + "sha256:c3d5020755f70c82eceda3feaf556af9a341334414a8eca521a18f463bcead88" ], - "index": "pypi", - "version": "==1.0.2" + "version": "==5.1.1" }, - "simplegeneric": { + "pytest-flake8": { "hashes": [ - "sha256:dc972e06094b9af5b855b3df4a646395e43d1c9d0d39ed345b7393560d0b9173" + "sha256:4d225c13e787471502ff94409dcf6f7927049b2ec251c63b764a4b17447b60c0", + "sha256:d7e2b6b274a255b7ae35e9224c85294b471a83b76ecb6bd53c337ae977a499af" ], - "version": "==0.8.1" + "index": "pypi", + "version": "==1.0.4" }, "six": { "hashes": [ - "sha256:70e8a77beed4562e7f14fe23a786b54f6296e34344c23bc42f07b15018ff98e9", - "sha256:832dc0e10feb1aa2c68dcc57dbb658f1c7e65b9b61af69048abc87a2db00a0eb" + "sha256:3350809f0555b11f552448330d0b52d5f24c91a322ea4a15ef22629740f3761c", + "sha256:d16a0141ec1a18405cd4ce8b4613101da75da0e9a7aec5bdd4fa804d0e0eba73" ], - "version": "==1.11.0" + "version": "==1.12.0" }, "traitlets": { "hashes": [ @@ -450,6 +472,13 @@ "sha256:f4ebe71925af7b40a864553f761ed559b43544f8f71746c2d756c7fe788ade7c" ], "version": "==0.1.7" + }, + "zipp": { + "hashes": [ + "sha256:3718b1cbcd963c7d4c5511a8240812904164b7f381b647143a89d3b98f9bcd8e", + "sha256:f06903e9f1f43b12d371004b4ac7b06ab39a44adc747266928ae6debfa7b3335" + ], + "version": "==0.6.0" } } } diff --git a/quantgov/nlp.py b/quantgov/nlp.py index 8022fd8..0622c21 100644 --- a/quantgov/nlp.py +++ b/quantgov/nlp.py @@ -1,310 +1,365 @@ -""" -quantgov.nlp: Text-based analysis of documents -""" -import re -import collections -import math - -from decorator import decorator - -from . import utils - -try: - import nltk.corpus - NLTK = True -except ImportError: - NLTK = None - -try: - import textblob -except ImportError: - textblob = None - -if NLTK: - try: - nltk.corpus.wordnet.ensure_loaded() - except LookupError: - nltk.download('wordnet') - nltk.corpus.wordnet.ensure_loaded() - -commands = {} - - -@decorator -def check_nltk(func, *args, **kwargs): - if NLTK is None: - raise RuntimeError('Must install NLTK to use {}'.format(func)) - return func(*args, **kwargs) - - -@decorator -def check_textblob(func, *args, **kwargs): - if textblob is None: - raise RuntimeError('Must install textblob to use {}'.format(func)) - return func(*args, **kwargs) - - -class WordCounter(): - - cli = utils.CLISpec( - help='Word Counter', - arguments=[ - utils.CLIArg( - flags=('--word_pattern', '-wp'), - kwargs={ - 'help': 'regular expression defining a "word"', - 'type': re.compile, - 'default': re.compile(r'\b\w+\b') - } - ) - ] - ) - - @staticmethod - def get_columns(args): - return ('words',) - - @staticmethod - def process_document(doc, word_pattern): - return doc.index + (len(word_pattern.findall(doc.text)),) - - -commands['count_words'] = WordCounter - - -class OccurrenceCounter(): - - cli = utils.CLISpec( - help="Term Counter for Specific Words", - arguments=[ - utils.CLIArg( - flags=('terms'), - kwargs={ - 'help': 'list of terms to be counted', - 'nargs': '+' - } - ), - utils.CLIArg( - flags=('--total_label'), - kwargs={ - 'metavar': 'LABEL', - 'help': ( - 'output a column with sum of occurrences of all terms' - ' with column name LABEL' - ), - } - ), - utils.CLIArg( - flags=('--pattern'), - kwargs={ - 'help': 'pattern to use in identifying words', - 'default': r'\b(?P{})\b' - } - ) - ] - ) - - @staticmethod - def get_columns(args): - if args['total_label'] is not None: - return tuple(args['terms']) + (args['total_label'],) - return tuple(args['terms']) - - @staticmethod - def process_document(doc, terms, pattern, total_label): - text = ' '.join(doc.text.split()).lower() - terms_sorted = sorted(terms, key=len, reverse=True) - combined_pattern = re.compile(pattern.format('|'.join(terms_sorted))) - term_counts = collections.Counter( - i.groupdict()['match'] for i in combined_pattern.finditer(text) - ) - if total_label is not None: - return ( - doc.index - + tuple(term_counts[i] for i in terms) - + (sum(term_counts.values()),) - ) - return (doc.index + tuple(term_counts[i] for i in terms)) - - -commands['count_occurrences'] = OccurrenceCounter - - -class ShannonEntropy(): - lemmas = {} - cli = utils.CLISpec( - help='Shannon Entropy', - arguments=[ - utils.CLIArg( - flags=('--word_pattern', '-wp'), - kwargs={ - 'help': 'regular expression defining a "word"', - 'type': re.compile, - 'default': re.compile(r'\b\w+\b') - } - ), - utils.CLIArg( - flags=('--stopwords', '-sw'), - kwargs={ - 'help': 'stopwords to ignore', - 'default': ( - None if not NLTK else - nltk.corpus.stopwords.words('english') - ) - } - ), - utils.CLIArg( - flags=('--precision'), - kwargs={ - 'help': 'decimal places to round', - 'default': 2 - } - ) - ] - ) - - @staticmethod - def get_columns(args): - return ('shannon_entropy',) - - @staticmethod - @check_nltk - @check_textblob - def process_document(doc, word_pattern, precision, stopwords, - textblob=textblob, nltk=NLTK): - words = word_pattern.findall(doc.text) - lemmas = [ - lemma for lemma in ( - ShannonEntropy.lemmatize(word) for word in words - ) - if lemma not in stopwords - ] - counts = collections.Counter(lemmas) - return doc.index + (round(sum( - -(count / len(lemmas) * math.log(count / len(lemmas), 2)) - for count in counts.values() - ), int(precision)),) - - def lemmatize(word): - if word in ShannonEntropy.lemmas: - lemma = ShannonEntropy.lemmas[word] - else: - lemma = textblob.Word(word).lemmatize() - ShannonEntropy.lemmas[word] = lemma - return lemma - - -commands['shannon_entropy'] = ShannonEntropy - - -class ConditionalCounter(): - cli = utils.CLISpec( - help=('Count conditional words and phrases. Included terms are: ' - ' "if", "but", "except", "provided", "when", "where", ' - '"whenever", "unless", "notwithstanding", "in the event", ' - 'and "in no event"'), - arguments=[] - ) - pattern = re.compile( - r'\b(if|but|except|provided|when|where' - r'|whenever|unless|notwithstanding' - r'|in\s+the\s+event|in\s+no\s+event)\b' - ) - - @staticmethod - def get_columns(args): - return ('conditionals',) - - @staticmethod - def process_document(doc): - return doc.index + (len(ConditionalCounter.pattern.findall( - ' '.join((doc.text).splitlines()))),) - - -commands['count_conditionals'] = ConditionalCounter - - -class SentenceLength(): - - cli = utils.CLISpec( - help='Sentence Length', - arguments=[ - utils.CLIArg( - flags=('--precision'), - kwargs={ - 'help': 'decimal places to round', - 'default': 2 - } - ) - ] - ) - - @staticmethod - def get_columns(args): - return ('sentence_length',) - - @staticmethod - @check_nltk - @check_textblob - def process_document(doc, precision): - sentences = textblob.TextBlob(doc.text).sentences - # Allows for rounding to a specified number of decimals - if precision: - return doc.index + (round(sum(len( - sentence.words) for sentence in sentences) / - len(sentences), int(precision)),) - else: - return doc.index + (sum(len( - sentence.words) for sentence in sentences) / - len(sentences),) - - -commands['sentence_length'] = SentenceLength - - -class SentimentAnalysis(): - - cli = utils.CLISpec( - help='Performs sentiment analysis on the text', - arguments=[ - utils.CLIArg( - flags=('--backend'), - kwargs={ - 'help': 'which program to use for the analysis', - 'default': 'textblob' - } - ), - utils.CLIArg( - flags=('--precision'), - kwargs={ - 'help': 'decimal places to round', - 'default': 2 - } - ) - ] - ) - - @staticmethod - def get_columns(args): - if args['backend'] == 'textblob': - return ('sentiment_polarity', 'sentiment_subjectivity',) - else: - raise NotImplementedError - - @staticmethod - @check_nltk - @check_textblob - def process_document(doc, backend, precision): - if backend == 'textblob': - sentiment = textblob.TextBlob(doc.text) - # Allows for rounding to a specified number of decimals - if precision: - return (doc.index + - (round(sentiment.polarity, int(precision)), - round(sentiment.subjectivity, int(precision)),)) - else: - return (doc.index + - (sentiment.polarity, sentiment.subjectivity,)) - - -commands['sentiment_analysis'] = SentimentAnalysis +""" +quantgov.nlp: Text-based analysis of documents +""" +import collections +import math +import re + +from decorator import decorator + +from . import utils + +try: + import nltk.corpus + NLTK = True +except ImportError: + NLTK = None + +try: + import textblob +except ImportError: + textblob = None + +try: + import textstat +except ImportError: + textstat = None + +if NLTK: + try: + nltk.corpus.wordnet.ensure_loaded() + except LookupError: + nltk.download('wordnet') + nltk.corpus.wordnet.ensure_loaded() + +commands = {} + + +@decorator +def check_nltk(func, *args, **kwargs): + if NLTK is None: + raise RuntimeError('Must install NLTK to use {}'.format(func)) + return func(*args, **kwargs) + + +@decorator +def check_textblob(func, *args, **kwargs): + if textblob is None: + raise RuntimeError('Must install textblob to use {}'.format(func)) + return func(*args, **kwargs) + + +@decorator +def check_textstat(func, *args, **kwargs): + if textstat is None: + raise RuntimeError('Must install teststat to use {}'.format(func)) + return func(*args, **kwargs) + + +class WordCounter(): + + cli = utils.CLISpec( + help='Word Counter', + arguments=[ + utils.CLIArg( + flags=('--word_pattern', '-wp'), + kwargs={ + 'help': 'regular expression defining a "word"', + 'type': re.compile, + 'default': re.compile(r'\b\w+\b') + } + ) + ] + ) + + @staticmethod + def get_columns(args): + return ('words',) + + @staticmethod + def process_document(doc, word_pattern): + return doc.index + (len(word_pattern.findall(doc.text)),) + + +commands['count_words'] = WordCounter + + +class OccurrenceCounter(): + + cli = utils.CLISpec( + help="Term Counter for Specific Words", + arguments=[ + utils.CLIArg( + flags=('terms'), + kwargs={ + 'help': 'list of terms to be counted', + 'nargs': '+' + } + ), + utils.CLIArg( + flags=('--total_label'), + kwargs={ + 'metavar': 'LABEL', + 'help': ( + 'output a column with sum of occurrences of all terms' + ' with column name LABEL' + ), + } + ), + utils.CLIArg( + flags=('--pattern'), + kwargs={ + 'help': 'pattern to use in identifying words', + 'default': r'\b(?P{})\b' + } + ) + ] + ) + + @staticmethod + def get_columns(args): + if args['total_label'] is not None: + return tuple(args['terms']) + (args['total_label'],) + return tuple(args['terms']) + + @staticmethod + def process_document(doc, terms, pattern, total_label): + text = ' '.join(doc.text.split()).lower() + terms_sorted = sorted(terms, key=len, reverse=True) + combined_pattern = re.compile(pattern.format('|'.join(terms_sorted))) + term_counts = collections.Counter( + i.groupdict()['match'] for i in combined_pattern.finditer(text) + ) + if total_label is not None: + return ( + doc.index + + tuple(term_counts[i] for i in terms) + + (sum(term_counts.values()),) + ) + return (doc.index + tuple(term_counts[i] for i in terms)) + + +commands['count_occurrences'] = OccurrenceCounter + + +class ShannonEntropy(): + lemmas = {} + cli = utils.CLISpec( + help='Shannon Entropy', + arguments=[ + utils.CLIArg( + flags=('--word_pattern', '-wp'), + kwargs={ + 'help': 'regular expression defining a "word"', + 'type': re.compile, + 'default': re.compile(r'\b\w+\b') + } + ), + utils.CLIArg( + flags=('--stopwords', '-sw'), + kwargs={ + 'help': 'stopwords to ignore', + 'default': ( + None if not NLTK else + nltk.corpus.stopwords.words('english') + ) + } + ), + utils.CLIArg( + flags=('--precision'), + kwargs={ + 'help': 'decimal places to round', + 'default': 2 + } + ) + ] + ) + + @staticmethod + def get_columns(args): + return ('shannon_entropy',) + + @staticmethod + @check_nltk + @check_textblob + def process_document(doc, word_pattern, precision, stopwords, + textblob=textblob, nltk=NLTK): + words = word_pattern.findall(doc.text) + lemmas = [ + lemma for lemma in ( + ShannonEntropy.lemmatize(word) for word in words + ) + if lemma not in stopwords + ] + counts = collections.Counter(lemmas) + return doc.index + (round(sum( + -(count / len(lemmas) * math.log(count / len(lemmas), 2)) + for count in counts.values() + ), int(precision)),) + + def lemmatize(word): + if word in ShannonEntropy.lemmas: + lemma = ShannonEntropy.lemmas[word] + else: + lemma = textblob.Word(word).lemmatize() + ShannonEntropy.lemmas[word] = lemma + return lemma + + +commands['shannon_entropy'] = ShannonEntropy + + +class ConditionalCounter(): + cli = utils.CLISpec( + help=('Count conditional words and phrases. Included terms are: ' + ' "if", "but", "except", "provided", "when", "where", ' + '"whenever", "unless", "notwithstanding", "in the event", ' + 'and "in no event"'), + arguments=[] + ) + pattern = re.compile( + r'\b(if|but|except|provided|when|where' + r'|whenever|unless|notwithstanding' + r'|in\s+the\s+event|in\s+no\s+event)\b' + ) + + @staticmethod + def get_columns(args): + return ('conditionals',) + + @staticmethod + def process_document(doc): + return doc.index + (len(ConditionalCounter.pattern.findall( + ' '.join((doc.text).splitlines()))),) + + +commands['count_conditionals'] = ConditionalCounter + + +class SentenceLength(): + + cli = utils.CLISpec( + help='Sentence Length', + arguments=[ + utils.CLIArg( + flags=('--precision'), + kwargs={ + 'help': 'decimal places to round', + 'default': 2 + } + ) + ] + ) + + @staticmethod + def get_columns(args): + return ('sentence_length',) + + @staticmethod + @check_nltk + @check_textblob + def process_document(doc, precision): + sentences = textblob.TextBlob(doc.text).sentences + # Allows for rounding to a specified number of decimals + if precision: + return doc.index + (round(sum(len( + sentence.words) for sentence in sentences) / len(sentences), + int(precision)),) + else: + return doc.index + (sum(len( + sentence.words) for sentence in sentences) / len(sentences),) + + +commands['sentence_length'] = SentenceLength + + +class SentimentAnalysis(): + + cli = utils.CLISpec( + help='Performs sentiment analysis on the text', + arguments=[ + utils.CLIArg( + flags=('--backend'), + kwargs={ + 'help': 'which program to use for the analysis', + 'default': 'textblob' + } + ), + utils.CLIArg( + flags=('--precision'), + kwargs={ + 'help': 'decimal places to round', + 'default': 2 + } + ) + ] + ) + + @staticmethod + def get_columns(args): + if args['backend'] == 'textblob': + return ('sentiment_polarity', 'sentiment_subjectivity',) + else: + raise NotImplementedError + + @staticmethod + @check_nltk + @check_textblob + def process_document(doc, backend, precision): + if backend == 'textblob': + sentiment = textblob.TextBlob(doc.text) + # Allows for rounding to a specified number of decimals + if precision: + return (doc.index + (round( + sentiment.polarity, int(precision)), + round(sentiment.subjectivity, int(precision)),)) + else: + return (doc.index + (sentiment.polarity, + sentiment.subjectivity,)) + + +commands['sentiment_analysis'] = SentimentAnalysis + + +class FleschReadingEase(): + + cli = utils.CLISpec( + help='Flesch Reading Ease metric', + arguments=[] + ) + + @staticmethod + def get_columns(args): + return ('flesch_reading_ease',) + + @staticmethod + @check_textstat + def process_document(doc): + score = textstat.flesch_reading_ease(doc.text) + # Allows for rounding to a specified number of decimals + return doc.index + (int(score),) + + +commands['flesch_reading_ease'] = FleschReadingEase + + +class TextStandard(): + + cli = utils.CLISpec( + help='combines all of the readability metrics in textstats', + arguments=[] + ) + + @staticmethod + def get_columns(args): + return ('text_standard',) + + @staticmethod + @check_textstat + def process_document(doc): + score = textstat.text_standard(doc.text) + # Allows for rounding to a specified number of decimals + return doc.index + (score,) + + +commands['text_standard'] = TextStandard diff --git a/setup.py b/setup.py index 672fc3f..bfa812e 100644 --- a/setup.py +++ b/setup.py @@ -1,78 +1,79 @@ -""" -A setuptools-based setup module. -""" - -import os -import re - -from setuptools import setup, find_packages -from codecs import open - - -def read(*names, **kwargs): - with open( - os.path.join(os.path.dirname(__file__), *names), - encoding=kwargs.get("encoding", "utf8") - ) as fp: - return fp.read() - - -def find_version(*file_paths): - version_file = read(*file_paths) - version_match = re.search(r"^__version__ = ['\"]([^'\"]*)['\"]", - version_file, re.M) - if version_match: - return version_match.group(1) - raise RuntimeError("Unable to find version string.") - - -long_description = read("README.rst") -version = find_version("quantgov", "__init__.py") - -setup( - name='quantgov', - version=version, - - description='A Policy Analytics Framework', - long_description=long_description, - url='https://www.quantgov.org', - author='Oliver Sherouse', - author_email='osherouse@mercatus.gmu.edu', - license='MIT', - classifiers=[ - 'Development Status :: 3 - Alpha', - 'Intended Audience :: Science/Research', - 'Topic :: Scientific/Engineering :: Information Analysis', - 'License :: OSI Approved :: MIT License', - 'Programming Language :: Python :: 3', - 'Programming Language :: Python :: 3.5', - 'Programming Language :: Python :: 3.6', - ], - keywords='quantgov economics policy government machine learning', - packages=find_packages( - exclude=["*.tests", "*.tests.*", "tests.*", "tests"]), - install_requires=[ - 'decorator', - 'joblib', - 'pandas', - 'requests', - 'scikit-learn', - 'scipy', - ], - extras_require={ - 'testing': ['pytest-flake8'], - 'nlp': [ - 'textblob', - 'nltk', - ], - 's3driver': [ - 'sqlalchemy', - 'boto3' - ] - }, - entry_points={ - 'console_scripts': [ - 'quantgov=quantgov.__main__:main', - ], - }, -) +""" +A setuptools-based setup module. +""" + +import os +import re + +from setuptools import setup, find_packages +from codecs import open + + +def read(*names, **kwargs): + with open( + os.path.join(os.path.dirname(__file__), *names), + encoding=kwargs.get("encoding", "utf8") + ) as fp: + return fp.read() + + +def find_version(*file_paths): + version_file = read(*file_paths) + version_match = re.search(r"^__version__ = ['\"]([^'\"]*)['\"]", + version_file, re.M) + if version_match: + return version_match.group(1) + raise RuntimeError("Unable to find version string.") + + +long_description = read("README.rst") +version = find_version("quantgov", "__init__.py") + +setup( + name='quantgov', + version=version, + + description='A Policy Analytics Framework', + long_description=long_description, + url='https://www.quantgov.org', + author='Oliver Sherouse', + author_email='osherouse@mercatus.gmu.edu', + license='MIT', + classifiers=[ + 'Development Status :: 3 - Alpha', + 'Intended Audience :: Science/Research', + 'Topic :: Scientific/Engineering :: Information Analysis', + 'License :: OSI Approved :: MIT License', + 'Programming Language :: Python :: 3', + 'Programming Language :: Python :: 3.5', + 'Programming Language :: Python :: 3.6', + ], + keywords='quantgov economics policy government machine learning', + packages=find_packages( + exclude=["*.tests", "*.tests.*", "tests.*", "tests"]), + install_requires=[ + 'decorator', + 'joblib', + 'pandas', + 'requests', + 'scikit-learn', + 'scipy', + 'textstat' + ], + extras_require={ + 'testing': ['pytest-flake8'], + 'nlp': [ + 'textblob', + 'nltk', + ], + 's3driver': [ + 'sqlalchemy', + 'boto3' + ] + }, + entry_points={ + 'console_scripts': [ + 'quantgov=quantgov.__main__:main', + ], + }, +) diff --git a/tests/test_nlp.py b/tests/test_nlp.py index db14cf5..fe171fc 100644 --- a/tests/test_nlp.py +++ b/tests/test_nlp.py @@ -115,7 +115,7 @@ def test_wordcount(): def test_wordcount_pattern(): output = check_output( ['quantgov', 'nlp', 'count_words', str(PSEUDO_CORPUS_PATH), - '--word_pattern', '\S+'] + '--word_pattern', r'\S+'] ) assert output == 'file,words\ncfr,333237\nmoby,210130\n' @@ -183,14 +183,6 @@ def test_sentencelength(): assert output == 'file,sentence_length\ncfr,18.68\nmoby,25.09\n' -def test_sentencelength_4decimals(): - output = check_output( - ['quantgov', 'nlp', 'sentence_length', str(PSEUDO_CORPUS_PATH), - '--precision', '4'], - ) - assert output == 'file,sentence_length\ncfr,18.6827\nmoby,25.0936\n' - - def test_sentiment_analysis(): output = check_output( ['quantgov', 'nlp', 'sentiment_analysis', str(PSEUDO_CORPUS_PATH)], From 476f4e054a4486ec26fb33faf95d6ab52423c129 Mon Sep 17 00:00:00 2001 From: Jonathan Nelson Date: Thu, 12 Sep 2019 11:14:28 -0400 Subject: [PATCH 5/7] catch div by zero error --- quantgov/nlp.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/quantgov/nlp.py b/quantgov/nlp.py index 0622c21..5748e1b 100644 --- a/quantgov/nlp.py +++ b/quantgov/nlp.py @@ -261,6 +261,8 @@ def get_columns(args): def process_document(doc, precision): sentences = textblob.TextBlob(doc.text).sentences # Allows for rounding to a specified number of decimals + if len(sentences) == 0: + return doc.index + (0,) if precision: return doc.index + (round(sum(len( sentence.words) for sentence in sentences) / len(sentences), From 6f0c182d8f400fe5ef123593344a559d99e01f0a Mon Sep 17 00:00:00 2001 From: Jonathan Nelson Date: Thu, 12 Sep 2019 11:15:26 -0400 Subject: [PATCH 6/7] v0.6.1 --- quantgov/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/quantgov/__init__.py b/quantgov/__init__.py index 5122416..05f32de 100644 --- a/quantgov/__init__.py +++ b/quantgov/__init__.py @@ -4,4 +4,4 @@ from . import corpus, nlp, ml, utils from .utils import load_driver -__version__ = '0.7.0.dev' +__version__ = '0.6.1' From 810ef1f4d4c8625f14a9a00948e9835d7ef210c2 Mon Sep 17 00:00:00 2001 From: Jonathan Nelson Date: Thu, 12 Sep 2019 11:24:39 -0400 Subject: [PATCH 7/7] inline doc --- quantgov/nlp.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/quantgov/nlp.py b/quantgov/nlp.py index 5748e1b..d76708c 100644 --- a/quantgov/nlp.py +++ b/quantgov/nlp.py @@ -260,9 +260,10 @@ def get_columns(args): @check_textblob def process_document(doc, precision): sentences = textblob.TextBlob(doc.text).sentences - # Allows for rounding to a specified number of decimals + # Returns sentence_length = 0 if no complete sentences are found if len(sentences) == 0: return doc.index + (0,) + # Allows for rounding to a specified number of decimals if precision: return doc.index + (round(sum(len( sentence.words) for sentence in sentences) / len(sentences),