From 1fd7c39ca9de848c87d65275b8d372ec564d64d7 Mon Sep 17 00:00:00 2001 From: Emily Scher Date: Wed, 15 Jul 2020 15:11:50 +0100 Subject: [PATCH 01/35] python script incorporating predictions from the intial version of multinomial logistic regression model. --- pangolin/scripts/pangolearn.py | 146 +++++++++++++++++++++++++++++++++ 1 file changed, 146 insertions(+) create mode 100644 pangolin/scripts/pangolearn.py diff --git a/pangolin/scripts/pangolearn.py b/pangolin/scripts/pangolearn.py new file mode 100644 index 0000000..f3b835c --- /dev/null +++ b/pangolin/scripts/pangolearn.py @@ -0,0 +1,146 @@ +import pandas as pd +from sklearn.model_selection import train_test_split +from sklearn.linear_model import LogisticRegression +from sklearn import metrics +from sklearn.datasets import make_classification +from sklearn.model_selection import StratifiedShuffleSplit +from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, classification_report, confusion_matrix +from datetime import datetime +import pickle + +dataList = [] +tempDataLines = [] + +sequencesFile = snakemake.input[0] +modelFile = snakemake.input[1] +headerFile = snakemake.input[2] + +# small class to store vector objects +class VectorObject: + def __init__(self, vector): + self.vector = vector + + def equals(self, vector): + for i in range(len(vector.vector)): + if vector.vector[i] != self.vector[i]: + return False + + return True + + +# produces the proper one-hot encoding for a particular genomic site +def getOneHotEncoding(char): + # ATCGN- + if char == "A": + return VectorObject([1, 0, 0, 0, 0, 0]) + elif char == "T": + return VectorObject([0, 1, 0, 0, 0, 0]) + elif char == "C": + return VectorObject([0, 0, 1, 0, 0, 0]) + elif char == "G": + return VectorObject([0, 0, 0, 1, 0, 0]) + elif char == "-": + return VectorObject([0, 0, 0, 0, 0, 1]) + else: + return VectorObject([0, 0, 0, 0, 1, 0]) + + +# reads in the two data files +def readInAndFormatData(): + # open sequencesFile, which is the first snakemake input file + with open(sequencesFile) as f: + currentSeq = "" + + # go line by line through the file, collecting a list of the sequences + for line in f: + # if the line isn't the header line + if "taxon,lineage" not in line: + line = line.strip() + + if ">" in line: + # starting new entry, gotta save the old one + if currentSeq: + tempDataLines.append(currentSeq) + + currentSeq = "" + + else: + currentSeq = currentSeq + line + + # gotta get the last one + if currentSeq: + tempDataLines.append(currentSeq) + + # get a list of viral genomes, represented as a list of one-hot encoded vectors + for line in tempDataLines: + dataLine = [] + for index in range(len(line)): + char = line[index] + + dataLine.append(getOneHotEncoding(char)) + + dataList.append(dataLine) + + # close the file + f.close() + + +# remove the indicies representing genomic locations which aren't used by the model +def removeIndices(headersFile): + # will hold the final data from which the model will make predictions + finalList = [] + # list of headers representing the genomic locations of each entry + headers = list(range(len(dataList[0]))) + # list which will hold the indicies of interest + indiciesToKeep = [] + + # loading the list of headers the model needs. + # example: '29657-A', '29657-T', '29657-C', '29657-G', '29657-N', '29657-gap' + model_headers = pickle.load(open(headersFile, 'rb')) + + # by cycling through model_headers, get which column indicies we need to keep in the test data + for h in model_headers: + if h != "lineage" and "-A" in h: + # -1 because in the training data the 0 position is the lineage + index = int(h.split("-")[0]) - 1 + indiciesToKeep.append(index) + + # for each entry in dataList, remove the irrelevant columns + while len(dataList) > 0: + line = dataList.pop(0) + + finalLine = [] + + for index in range(len(line)): + if index in indiciesToKeep: + finalLine.extend(line[index].vector) + + finalList.append(finalLine) + + # return the final data, along with the relevant headers + # model_headers[1:] because the first is just "lineage" which we won't have for this test data + return finalList, model_headers[1:] + + +print("reading in data " + datetime.now().strftime("%m/%d/%Y, %H:%M:%S")); + +readInAndFormatData() + +print("removing unnecessary columns " + datetime.now().strftime("%m/%d/%Y, %H:%M:%S")); + +dataList, headers = removeIndices(headerFile) + +df = pd.DataFrame(dataList, columns=headers) + +print("loading model " + datetime.now().strftime("%m/%d/%Y, %H:%M:%S")); + +loaded_model = pickle.load(open(modelFile, 'rb')) + +print("generating predictions " + datetime.now().strftime("%m/%d/%Y, %H:%M:%S")); + +predictions = loaded_model.predict(df) + +# write predictions to a file +f = open(snakemake.output[0], "w") +f.write(predictions) +f.close() \ No newline at end of file From 6e0ab25ce3aa18d7476281607aa972f3214c1938 Mon Sep 17 00:00:00 2001 From: aineniamh Date: Wed, 15 Jul 2020 18:09:02 +0100 Subject: [PATCH 02/35] pangolearn integrate script --- pangolin/command.py | 19 ++++-- pangolin/scripts/pangolearn.smk | 102 ++++++++++++++++++++++++++++++++ 2 files changed, 116 insertions(+), 5 deletions(-) create mode 100644 pangolin/scripts/pangolearn.smk diff --git a/pangolin/command.py b/pangolin/command.py index aaba44c..ed50c10 100644 --- a/pangolin/command.py +++ b/pangolin/command.py @@ -12,6 +12,7 @@ import setuptools from Bio import SeqIO + from . import _program @@ -33,7 +34,8 @@ def main(sysargs = sys.argv[1:]): parser.add_argument('--tempdir',action="store",help="Specify where you want the temp stuff to go. Default: $TMPDIR") parser.add_argument('--max-ambig', action="store", default=0.5, type=float,help="Maximum proportion of Ns allowed for pangolin to attempt assignment. Default: 0.5",dest="maxambig") parser.add_argument('--min-length', action="store", default=10000, type=int,help="Minimum query length allowed for pangolin to attempt assignment. Default: 10000",dest="minlen") - parser.add_argument('--panGUIlin', action='store_true',help="Run web-app version of pangolin") + parser.add_argument('--panGUIlin', action='store_true',help="Run web-app version of pangolin",dest="assign_using_tree") + parser.add_argument('--assign-using-tree',action='store_true',help="Use original phylogenetic assignment methods with guide tree. Note: will be significantly slower than pangoLEARN") parser.add_argument('--write-tree', action='store_true',help="Output a phylogeny for each query sequence placed in the guide tree",dest="write_tree") parser.add_argument('-t', '--threads', action='store',type=int,help="Number of threads") parser.add_argument("-p","--include-putative",action="store_true",help="Include the bleeding edge lineage definitions in assignment",dest="include_putative") @@ -47,8 +49,11 @@ def main(sysargs = sys.argv[1:]): else: args = parser.parse_args(sysargs) + if args.assign_using_tree: + snakefile = os.path.join(thisdir, 'scripts','pangoLEARN.smk') # find the Snakefile - snakefile = os.path.join(thisdir, 'scripts','Snakefile') + else: + snakefile = os.path.join(thisdir, 'scripts','Snakefile') if not os.path.exists(snakefile): sys.stderr.write('Error: cannot find Snakefile at {}\n'.format(snakefile)) sys.exit(-1) @@ -149,8 +154,11 @@ def main(sysargs = sys.argv[1:]): representative_aln = "" guide_tree = "" lineages_csv = "" + trained_model = "" for r,d,f in os.walk(data_dir): for fn in f: + if "training" in fn: + trained_model = os.path.join(r, fn) if args.include_putative: if fn.endswith("putative.fasta"): representative_aln = os.path.join(r, fn) @@ -170,15 +178,16 @@ def main(sysargs = sys.argv[1:]): print(f"Sequence alignment:\t{representative_aln}") print(f"Guide tree:\t\t{guide_tree}") print(f"Lineages csv:\t\t{lineages_csv}") - if representative_aln=="" or guide_tree=="" or lineages_csv=="": - print("""Didn't find appropriate files.\nTreefile must end with `.treefile`.\nAlignment must be in `.fasta` format.\n \ + if representative_aln=="" or guide_tree=="" or lineages_csv=="" or trained_model="": + print("""Didn't find appropriate files.\nTreefile must end with `.treefile`.\nAlignment must be in `.fasta` format.\n Trained model must exist. \ If you've specified --include-putative \n you must have files ending in putative.fasta.treefile\nExiting.""") exit(1) else: config["representative_aln"]=representative_aln config["guide_tree"]=guide_tree - + config["trained_model"] = trained_model + if args.write_tree: config["write_tree"]="True" diff --git a/pangolin/scripts/pangolearn.smk b/pangolin/scripts/pangolearn.smk new file mode 100644 index 0000000..4521b8c --- /dev/null +++ b/pangolin/scripts/pangolearn.smk @@ -0,0 +1,102 @@ +#!/usr/bin/env python + +# config["trained_model"] = trained_model +import csv +from Bio import SeqIO +import os + +##### Configuration ##### + +if config.get("trained_model"): + config["trained_model"] = os.path.join(workflow.current_basedir,'..', config["trained_model"]) + +if config.get("lineages_csv"): + lineages_csv_path = os.path.join(workflow.current_basedir,'..', config["lineages_csv"]) + config["lineages_csv"]=f"lineages_csv={lineages_csv_path} " +else: + config["lineages_csv"]="" + +if config.get("force"): + config["force"] = "--forceall " +else: + config["force"] = "" + +if config.get("lineages_csv"): + print("Going to run the global report summary") +else: + config["lineages_csv"]="" + +config["pid"] = str(os.getpid()) + +##### Target rules ##### + +if config["lineages_csv"] != "": + rule all: + input: + config["outfile"], + os.path.join(config["outdir"],"global_lineage_information.csv") +else: + rule all: + input: + config["outfile"] + + +rule all: + input: + config["outfile"] + +rule pangolearn: + input: + config["post_qc_query"], + config["trained_model"], + config["header_file"] + output: + os.path.join(config["tempdir"],"lineage_report.pass_qc.csv") + script: + # should output a csv file with no headers but with columns similar to: + # "taxon,lineage,SH-alrt,UFbootstrap" + """ + pangolearn.py + """ + +rule add_failed_seqs: + input: + qcpass= os.path.join(config["tempdir"],"lineage_report.pass_qc.csv"), + qcfail= config["qc_fail"] + params: + version = config["lineages_version"] + output: + config["outfile"] + run: + fw = open(output[0],"w") + fw.write("taxon,lineage,SH-alrt,UFbootstrap,lineages_version,status,note\n") + + with open(input.qcpass, "r") as f: + for l in f: + l=l.rstrip('\n') + fw.write(f"{l},{params.version},passed_qc,\n") + + for record in SeqIO.parse(input.qcfail,"fasta"): + desc_list = record.description.split(" ") + note = "" + for i in desc_list: + if i.startswith("fail="): + note = i.lstrip("fail=") + # needs to mirror the structure of the output from pangolearn + fw.write(f"{record.id},None,0,0,{params.version},fail,{note}\n") + + fw.close() + +rule report_results: + input: + csv = config["outfile"], + lineages_csv = config["lineages_csv"] + output: + os.path.join(config["outdir"],"global_lineage_information.csv") + shell: + """ + report_results.py \ + -p {input.csv} \ + -b {input.lineages_csv} \ + -o {output:q} + """ From 0a95d8bd7e47c8333a090246b6709d3ec8b2cab8 Mon Sep 17 00:00:00 2001 From: emilyscher Date: Thu, 16 Jul 2020 14:06:09 +0100 Subject: [PATCH 03/35] switching from pickle to joblib to allow for better compression of the model files --- pangolin/scripts/pangolearn.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pangolin/scripts/pangolearn.py b/pangolin/scripts/pangolearn.py index f3b835c..bff7bbd 100644 --- a/pangolin/scripts/pangolearn.py +++ b/pangolin/scripts/pangolearn.py @@ -6,7 +6,7 @@ from sklearn.model_selection import StratifiedShuffleSplit from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, classification_report, confusion_matrix from datetime import datetime -import pickle +import joblib dataList = [] tempDataLines = [] @@ -96,7 +96,7 @@ def removeIndices(headersFile): # loading the list of headers the model needs. # example: '29657-A', '29657-T', '29657-C', '29657-G', '29657-N', '29657-gap' - model_headers = pickle.load(open(headersFile, 'rb')) + model_headers = joblib.load(headersFile) # by cycling through model_headers, get which column indicies we need to keep in the test data for h in model_headers: @@ -134,7 +134,7 @@ def removeIndices(headersFile): print("loading model " + datetime.now().strftime("%m/%d/%Y, %H:%M:%S")); -loaded_model = pickle.load(open(modelFile, 'rb')) +loaded_model = joblib.load(modelFile) print("generating predictions " + datetime.now().strftime("%m/%d/%Y, %H:%M:%S")); From 0a86bfddef89fa8afa77a289845af4fb62e51757 Mon Sep 17 00:00:00 2001 From: aineniamh Date: Fri, 17 Jul 2020 16:25:56 +0100 Subject: [PATCH 04/35] integrating script and adding in args for pangolearn --- pangolin/command.py | 17 ++++++++++------- pangolin/scripts/pangolearn.py | 21 +++++++++++++++++---- pangolin/scripts/pangolearn.smk | 18 ++++++++---------- setup.py | 3 +++ 4 files changed, 38 insertions(+), 21 deletions(-) diff --git a/pangolin/command.py b/pangolin/command.py index ed50c10..89f1637 100644 --- a/pangolin/command.py +++ b/pangolin/command.py @@ -34,7 +34,7 @@ def main(sysargs = sys.argv[1:]): parser.add_argument('--tempdir',action="store",help="Specify where you want the temp stuff to go. Default: $TMPDIR") parser.add_argument('--max-ambig', action="store", default=0.5, type=float,help="Maximum proportion of Ns allowed for pangolin to attempt assignment. Default: 0.5",dest="maxambig") parser.add_argument('--min-length', action="store", default=10000, type=int,help="Minimum query length allowed for pangolin to attempt assignment. Default: 10000",dest="minlen") - parser.add_argument('--panGUIlin', action='store_true',help="Run web-app version of pangolin",dest="assign_using_tree") + parser.add_argument('--panGUIlin', action='store_true',help="Run web-app version of pangolin",dest="panGUIlin") parser.add_argument('--assign-using-tree',action='store_true',help="Use original phylogenetic assignment methods with guide tree. Note: will be significantly slower than pangoLEARN") parser.add_argument('--write-tree', action='store_true',help="Output a phylogeny for each query sequence placed in the guide tree",dest="write_tree") parser.add_argument('-t', '--threads', action='store',type=int,help="Number of threads") @@ -50,10 +50,10 @@ def main(sysargs = sys.argv[1:]): args = parser.parse_args(sysargs) if args.assign_using_tree: - snakefile = os.path.join(thisdir, 'scripts','pangoLEARN.smk') + snakefile = os.path.join(thisdir, 'scripts','Snakefile') # find the Snakefile else: - snakefile = os.path.join(thisdir, 'scripts','Snakefile') + snakefile = os.path.join(thisdir, 'scripts','pangoLEARN.smk') if not os.path.exists(snakefile): sys.stderr.write('Error: cannot find Snakefile at {}\n'.format(snakefile)) sys.exit(-1) @@ -155,9 +155,12 @@ def main(sysargs = sys.argv[1:]): guide_tree = "" lineages_csv = "" trained_model = "" + header_file = "" for r,d,f in os.walk(data_dir): for fn in f: - if "training" in fn: + if fn == "multinomialLogRegHeaders_v1.joblib": + header_file = os.path.join(r, fn) + elif fn == "multinomialLogReg_v1.joblib": trained_model = os.path.join(r, fn) if args.include_putative: if fn.endswith("putative.fasta"): @@ -178,8 +181,8 @@ def main(sysargs = sys.argv[1:]): print(f"Sequence alignment:\t{representative_aln}") print(f"Guide tree:\t\t{guide_tree}") print(f"Lineages csv:\t\t{lineages_csv}") - if representative_aln=="" or guide_tree=="" or lineages_csv=="" or trained_model="": - print("""Didn't find appropriate files.\nTreefile must end with `.treefile`.\nAlignment must be in `.fasta` format.\n Trained model must exist. \ + if representative_aln=="" or guide_tree=="" or lineages_csv=="" or trained_model=="" or header_file=="": + print("""Check your environment, didn't find appropriate files from the lineages repo.\nTreefile must end with `.treefile`.\nAlignment must be in `.fasta` format.\n Trained model must exist. \ If you've specified --include-putative \n you must have files ending in putative.fasta.treefile\nExiting.""") exit(1) @@ -187,7 +190,7 @@ def main(sysargs = sys.argv[1:]): config["representative_aln"]=representative_aln config["guide_tree"]=guide_tree config["trained_model"] = trained_model - + config["header_file"] = header_file if args.write_tree: config["write_tree"]="True" diff --git a/pangolin/scripts/pangolearn.py b/pangolin/scripts/pangolearn.py index bff7bbd..54424f8 100644 --- a/pangolin/scripts/pangolearn.py +++ b/pangolin/scripts/pangolearn.py @@ -6,14 +6,27 @@ from sklearn.model_selection import StratifiedShuffleSplit from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, classification_report, confusion_matrix from datetime import datetime +import argparse import joblib +import argparse dataList = [] tempDataLines = [] -sequencesFile = snakemake.input[0] -modelFile = snakemake.input[1] -headerFile = snakemake.input[2] +def parse_args(): + parser = argparse.ArgumentParser(description='pangoLEARN.') + + parser.add_argument("--header-file", action="store", type=str, dest="header_file") + parser.add_argument("--model-file", action="store", type=str, dest="model_file") + parser.add_argument("--fasta", action="store", type=str, dest="sequences_file") + parser.add_argument("-o","--outfile", action="store", type=str, dest="outfile") + return parser.parse_args() + +args = parse_args() + +sequencesFile = args.sequences_file +modelFile = args.model_file +headerFile = args.header_file # small class to store vector objects class VectorObject: @@ -141,6 +154,6 @@ def removeIndices(headersFile): predictions = loaded_model.predict(df) # write predictions to a file -f = open(snakemake.output[0], "w") +f = open(args.outfile, "w") f.write(predictions) f.close() \ No newline at end of file diff --git a/pangolin/scripts/pangolearn.smk b/pangolin/scripts/pangolearn.smk index 4521b8c..c9d3547 100644 --- a/pangolin/scripts/pangolearn.smk +++ b/pangolin/scripts/pangolearn.smk @@ -10,6 +10,9 @@ import os if config.get("trained_model"): config["trained_model"] = os.path.join(workflow.current_basedir,'..', config["trained_model"]) +if config.get("header_file"): + config["header_file"] = os.path.join(workflow.current_basedir,'..', config["header_file"]) + if config.get("lineages_csv"): lineages_csv_path = os.path.join(workflow.current_basedir,'..', config["lineages_csv"]) config["lineages_csv"]=f"lineages_csv={lineages_csv_path} " @@ -40,23 +43,18 @@ else: input: config["outfile"] - -rule all: - input: - config["outfile"] - rule pangolearn: input: - config["post_qc_query"], - config["trained_model"], - config["header_file"] + fasta = config["query_fasta"], + model = config["trained_model"], + header = config["header_file"] output: os.path.join(config["tempdir"],"lineage_report.pass_qc.csv") - script: + shell: # should output a csv file with no headers but with columns similar to: # "taxon,lineage,SH-alrt,UFbootstrap" """ - pangolearn.py + pangolearn.py --header-file {input.header} --model-file {input.model} --fasta {input.fasta} -o {output[0]} """ rule add_failed_seqs: diff --git a/setup.py b/setup.py index 0685fdf..3189b65 100644 --- a/setup.py +++ b/setup.py @@ -13,6 +13,8 @@ 'pangolin/scripts/assign_query_lineage.smk', 'pangolin/scripts/prepare_package_data.smk', 'pangolin/scripts/Snakefile', + 'pangolin/scripts/pangolearn.smk', + 'pangolin/scripts/pangolearn.py', 'pangolin/scripts/assign_lineage.py', 'pangolin/scripts/lineage_finder.py', 'pangolin/scripts/utils.py', @@ -31,6 +33,7 @@ "dendropy>=4.4.0", "pytools>=2020.1", 'pandas>=1.0.1', + 'sklearn>=0.0', 'pysam>=0.15.4' ], description='hcov-2019 subtyping command line tool', From 0f1b86bbbdbb3178d29d00aeb8a5e4ed0efd9783 Mon Sep 17 00:00:00 2001 From: aineniamh Date: Fri, 17 Jul 2020 17:01:49 +0100 Subject: [PATCH 05/35] reference map and pad added pre pangolearn --- pangolin/command.py | 5 +++++ pangolin/data/reference.fasta | 2 ++ pangolin/scripts/pangolearn.smk | 35 +++++++++++++++++++++++++++++++++ 3 files changed, 42 insertions(+) create mode 100644 pangolin/data/reference.fasta diff --git a/pangolin/command.py b/pangolin/command.py index 89f1637..4893a9c 100644 --- a/pangolin/command.py +++ b/pangolin/command.py @@ -136,6 +136,8 @@ def main(sysargs = sys.argv[1:]): "outdir":outdir, "outfile":outfile, "tempdir":tempdir, + "trim_start":265, # where to pad to using datafunk + "trim_end":29674, # where to pad after using datafunk "qc_fail":qc_fail, "lineages_version":lineages.__version__ } @@ -150,6 +152,9 @@ def main(sysargs = sys.argv[1:]): lineages_dir = lineages.__path__[0] data_dir = os.path.join(lineages_dir,"data") + reference_fasta = pkg_resources.resource_filename('pangolin', 'data/reference.fasta') + config["reference_fasta"] = reference_fasta + print(f"Looking in {data_dir} for data files...") representative_aln = "" guide_tree = "" diff --git a/pangolin/data/reference.fasta b/pangolin/data/reference.fasta new file mode 100644 index 0000000..99422f2 --- /dev/null +++ b/pangolin/data/reference.fasta @@ -0,0 +1,2 @@ +>outgroup_A +NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNATGGAGAGCCTTGTCCCTGGTTTCAACGAGAAAACACACGTCCAACTCAGTTTGCCTGTTTTACAGGTTCGCGACGTGCTCGTACGTGGCTTTGGAGACTCCGTGGAGGAGGTCTTATCAGAGGCACGTCAACATCTTAAAGATGGCACTTGTGGCTTAGTAGAAGTTGAAAAAGGCGTTTTGCCTCAACTTGAACAGCCCTATGTGTTCATCAAACGTTCGGATGCTCGAACTGCACCTCATGGTCATGTTATGGTTGAGCTGGTAGCAGAACTCGAAGGCATTCAGTACGGTCGTAGTGGTGAGACACTTGGTGTCCTTGTCCCTCATGTGGGCGAAATACCAGTGGCTTACCGCAAGGTTCTTCTTCGTAAGAACGGTAATAAAGGAGCTGGTGGCCATAGTTACGGCGCCGATCTAAAGTCATTTGACTTAGGCGACGAGCTTGGCACTGATCCTTATGAAGATTTTCAAGAAAACTGGAACACTAAACATAGCAGTGGTGTTACCCGTGAACTCATGCGTGAGCTTAACGGAGGGGCATACACTCGCTATGTCGATAACAACTTCTGTGGCCCTGATGGCTACCCTCTTGAGTGCATTAAAGACCTTCTAGCACGTGCTGGTAAAGCTTCATGCACTTTGTCCGAACAACTGGACTTTATTGACACTAAGAGGGGTGTATACTGCTGCCGTGAACATGAGCATGAAATTGCTTGGTACACGGAACGTTCTGAAAAGAGCTATGAATTGCAGACACCTTTTGAAATTAAATTGGCAAAGAAATTTGACACCTTCAATGGGGAATGTCCAAATTTTGTATTTCCCTTAAATTCCATAATCAAGACTATTCAACCAAGGGTTGAAAAGAAAAAGCTTGATGGCTTTATGGGTAGAATTCGATCTGTCTATCCAGTTGCGTCACCAAATGAATGCAACCAAATGTGCCTTTCAACTCTCATGAAGTGTGATCATTGTGGTGAAACTTCATGGCAGACGGGCGATTTTGTTAAAGCCACTTGCGAATTTTGTGGCACTGAGAATTTGACTAAAGAAGGTGCCACTACTTGTGGTTACTTACCCCAAAATGCTGTTGTTAAAATTTATTGTCCAGCATGTCACAATTCAGAAGTAGGACCTGAGCATAGTCTTGCCGAATACCATAATGAATCTGGCTTGAAAACCATTCTTCGTAAGGGTGGTCGCACTATTGCCTTTGGAGGCTGTGTGTTCTCTTATGTTGGTTGCCATAACAAGTGTGCCTATTGGGTTCCACGTGCTAGCGCTAACATAGGTTGTAACCATACAGGTGTTGTTGGAGAAGGTTCCGAAGGTCTTAATGACAACCTTCTTGAAATACTCCAAAAAGAGAAAGTCAACATCAATATTGTTGGTGACTTTAAACTTAATGAAGAGATCGCCATTATTTTGGCATCTTTTTCTGCTTCCACAAGTGCTTTTGTGGAAACTGTGAAAGGTTTGGATTATAAAGCATTCAAACAAATTGTTGAATCCTGTGGTAATTTTAAAGTTACAAAAGGAAAAGCTAAAAAAGGTGCCTGGAATATTGGTGAACAGAAATCAATACTGAGTCCTCTTTATGCATTTGCATCAGAGGCTGCTCGTGTTGTACGATCAATTTTCTCCCGCACTCTTGAAACTGCTCAAAATTCTGTGCGTGTTTTACAGAAGGCCGCTATAACAATACTAGATGGAATTTCACAGTATTCACTGAGACTCATTGATGCTATGATGTTCACATCTGATTTGGCTACTAACAATCTAGTTGTAATGGCCTACATTACAGGTGGTGTTGTTCAGTTGACTTCGCAGTGGCTAACTAACATCTTTGGCACTGTTTATGAAAAACTCAAACCCGTCCTTGATTGGCTTGAAGAGAAGTTTAAGGAAGGTGTAGAGTTTCTTAGAGACGGTTGGGAAATTGTTAAATTTATCTCAACCTGTGCTTGTGAAATTGTCGGTGGACAAATTGTCACCTGTGCAAAGGAAATTAAGGAGAGTGTTCAGACATTCTTTAAGCTTGTAAATAAATTTTTGGCTTTGTGTGCTGACTCTATCATTATTGGTGGAGCTAAACTTAAAGCCTTGAATTTAGGTGAAACATTTGTCACGCACTCAAAGGGATTGTACAGAAAGTGTGTTAAATCCAGAGAAGAAACTGGCCTACTCATGCCTCTAAAAGCCCCAAAAGAAATTATCTTCTTAGAGGGAGAAACACTTCCCACAGAAGTGTTAACAGAGGAAGTTGTCTTGAAAACTGGTGATTTACAACCATTAGAACAACCTACTAGTGAAGCTGTTGAAGCTCCATTGGTTGGTACACCAGTTTGTATTAACGGGCTTATGTTGCTCGAAATCAAAGACACAGAAAAGTACTGTGCCCTTGCACCTAATATGATGGTAACAAACAATACCTTCACACTCAAAGGCGGTGCACCAACAAAGGTTACTTTTGGTGATGACACTGTGATAGAAGTGCAAGGTTACAAGAGTGTGAATATCACTTTTGAACTTGATGAAAGGATTGATAAAGTACTTAATGAGAAGTGCTCTGCCTATACAGTTGAACTCGGTACAGAAGTAAATGAGTTCGCCTGTGTTGTGGCAGATGCTGTCATAAAAACTTTGCAACCAGTATCTGAATTACTTACACCACTGGGCATTGATTTAGATGAGTGGAGTATGGCTACATACTACTTATTTGATGAGTCTGGTGAGTTTAAATTGGCTTCACATATGTATTGTTCTTTCTACCCTCCAGATGAGGATGAAGAAGAAGGTGATTGTGAAGAAGAAGAGTTTGAGCCATCAACTCAATATGAGTATGGTACTGAAGATGATTACCAAGGTAAACCTTTGGAATTTGGTGCCACTTCTGCTGCTCTTCAACCTGAAGAAGAGCAAGAAGAAGATTGGTTAGATGATGATAGTCAACAAACTGTTGGTCAACAAGACGGCAGTGAGGACAATCAGACAACTACTATTCAAACAATTGTTGAGGTTCAACCTCAATTAGAGATGGAACTTACACCAGTTGTTCAGACTATTGAAGTGAATAGTTTTAGTGGTTATTTAAAACTTACTGACAATGTATACATTAAAAATGCAGACATTGTGGAAGAAGCTAAAAAGGTAAAACCAACAGTGGTTGTTAATGCAGCCAATGTTTACCTTAAACATGGAGGAGGTGTTGCAGGAGCCTTAAATAAGGCTACTAACAATGCCATGCAAGTTGAATCTGATGATTACATAGCTACTAATGGACCACTTAAAGTGGGTGGTAGTTGTGTTTTAAGCGGACACAATCTTGCTAAACACTGTCTTCATGTTGTCGGCCCAAATGTTAACAAAGGTGAAGACATTCAACTTCTTAAGAGTGCTTATGAAAATTTTAATCAGCACGAAGTTCTACTTGCACCATTATTATCAGCTGGTATTTTTGGTGCTGACCCTATACATTCTTTAAGAGTTTGTGTAGATACTGTTCGCACAAATGTCTACTTAGCTGTCTTTGATAAAAATCTCTATGACAAACTTGTTTCAAGCTTTTTGGAAATGAAGAGTGAAAAGCAAGTTGAACAAAAGATCGCTGAGATTCCTAAAGAGGAAGTTAAGCCATTTATAACTGAAAGTAAACCTTCAGTTGAACAGAGAAAACAAGATGATAAGAAAATCAAAGCTTGTGTTGAAGAAGTTACAACAACTCTGGAAGAAACTAAGTTCCTCACAGAAAACTTGTTACTTTATATTGACATTAATGGCAATCTTCATCCAGATTCTGCCACTCTTGTTAGTGACATTGACATCACTTTCTTAAAGAAAGATGCTCCATATATAGTGGGTGATGTTGTTCAAGAGGGTGTTTTAACTGCTGTGGTTATACCTACTAAAAAGGCTGGTGGCACTACTGAAATGCTAGCGAAAGCTTTGAGAAAAGTGCCAACAGACAATTATATAACCACTTACCCGGGTCAGGGTTTAAATGGTTACACTGTAGAGGAGGCAAAGACAGTGCTTAAAAAGTGTAAAAGTGCCTTTTACATTCTACCATCTATTATCTCTAATGAGAAGCAAGAAATTCTTGGAACTGTTTCTTGGAATTTGCGAGAAATGCTTGCACATGCAGAAGAAACACGCAAATTAATGCCTGTCTGTGTGGAAACTAAAGCCATAGTTTCAACTATACAGCGTAAATATAAGGGTATTAAAATACAAGAGGGTGTGGTTGATTATGGTGCTAGATTTTACTTTTACACCAGTAAAACAACTGTAGCGTCACTTATCAACACACTTAACGATCTAAATGAAACTCTTGTTACAATGCCACTTGGCTATGTAACACATGGCTTAAATTTGGAAGAAGCTGCTCGGTATATGAGATCTCTCAAAGTGCCAGCTACAGTTTCTGTTTCTTCACCTGATGCTGTTACAGCGTATAATGGTTATCTTACTTCTTCTTCTAAAACACCTGAAGAACATTTTATTGAAACCATCTCACTTGCTGGTTCCTATAAAGATTGGTCCTATTCTGGACAATCTACACAACTAGGTATAGAATTTCTTAAGAGAGGTGATAAAAGTGTATATTACACTAGTAATCCTACCACATTCCACCTAGATGGTGAAGTTATCACCTTTGACAATCTTAAGACACTTCTTTCTTTGAGAGAAGTGAGGACTATTAAGGTGTTTACAACAGTAGACAACATTAACCTCCACACGCAAGTTGTGGACATGTCAATGACATATGGACAACAGTTTGGTCCAACTTATTTGGATGGAGCTGATGTTACTAAAATAAAACCTCATAATTCACATGAAGGTAAAACATTTTATGTTTTACCTAATGATGACACTCTACGTGTTGAGGCTTTTGAGTACTACCACACAACTGATCCTAGTTTTCTGGGTAGGTACATGTCAGCATTAAATCACACTAAAAAGTGGAAATACCCACAAGTTAATGGTTTAACTTCTATTAAATGGGCAGATAACAACTGTTATCTTGCCACTGCATTGTTAACACTCCAACAAATAGAGTTGAAGTTTAATCCACCTGCTCTACAAGATGCTTATTACAGAGCAAGGGCTGGTGAAGCTGCTAACTTTTGTGCACTTATCTTAGCCTACTGTAATAAGACAGTAGGTGAGTTAGGTGATGTTAGAGAAACAATGAGTTACTTGTTTCAACATGCCAATTTAGATTCTTGCAAAAGAGTCTTGAACGTGGTGTGTAAAACTTGTGGACAACAGCAGACAACCCTTAAGGGTGTAGAAGCTGTTATGTACATGGGCACACTTTCTTATGAACAATTTAAGAAAGGTGTTCAGATACCTTGTACGTGTGGTAAACAAGCTACAAAATATCTAGTACAACAGGAGTCACCTTTTGTTATGATGTCAGCACCACCTGCTCAGTATGAACTTAAGCATGGTACATTTACTTGTGCTAGTGAGTACACTGGTAATTACCAGTGTGGTCACTATAAACATATAACTTCTAAAGAAACTTTGTATTGCATAGACGGTGCTTTACTTACAAAGTCCTCAGAATACAAAGGTCCTATTACGGATGTTTTCTACAAAGAAAACAGTTACACAACAACCATAAAACCAGTTACTTATAAATTGGATGGTGTTGTTTGTACAGAAATTGACCCTAAGTTGGACAATTATTATAAGAAAGACAATTCTTATTTCACAGAGCAACCAATTGATCTTGTACCAAACCAACCATATCCAAACGCAAGCTTCGATAATTTTAAGTTTGTATGTGATAATATCAAATTTGCTGATGATTTAAACCAGTTAACTGGTTATAAGAAACCTGCTTCAAGAGAGCTTAAAGTTACATTTTTCCCTGACTTAAATGGTGATGTGGTGGCTATTGATTATAAACACTACACACCCTCTTTTAAGAAAGGAGCTAAATTGTTACATAAACCTATTGTTTGGCATGTTAACAATGCAACTAATAAAGCCACGTATAAACCAAATACCTGGTGTATACGTTGTCTTTGGAGCACAAAACCAGTTGAAACATCAAATTCGTTTGATGTACTGAAGTCAGAGGACGCGCAGGGAATGGATAATCTTGCCTGCGAAGATCTAAAACCAGTCTCTGAAGAAGTAGTGGAAAATCCTACCATACAGAAAGACGTTCTTGAGTGTAATGTGAAAACTACCGAAGTTGTAGGAGACATTATACTTAAACCAGCAAATAATAGTTTAAAAATTACAGAAGAGGTTGGCCACACAGATCTAATGGCTGCTTATGTAGACAATTCTAGTCTTACTATTAAGAAACCTAATGAATTATCTAGAGTATTAGGTTTGAAAACCCTTGCTACTCATGGTTTAGCTGCTGTTAATAGTGTCCCTTGGGATACTATAGCTAATTATGCTAAGCCTTTTCTTAACAAAGTTGTTAGTACAACTACTAACATAGTTACACGGTGTTTAAACCGTGTTTGTACTAATTATATGCCTTATTTCTTTACTTTATTGCTACAATTGTGTACTTTTACTAGAAGTACAAATTCTAGAATTAAAGCATCTATGCCGACTACTATAGCAAAGAATACTGTTAAGAGTGTCGGTAAATTTTGTCTAGAGGCTTCATTTAATTATTTGAAGTCACCTAATTTTTCTAAACTGATAAATATTATAATTTGGTTTTTACTATTAAGTGTTTGCCTAGGTTCTTTAATCTACTCAACCGCTGCTTTAGGTGTTTTAATGTCTAATTTAGGCATGCCTTCTTACTGTACTGGTTACAGAGAAGGCTATTTGAACTCTACTAATGTCACTATTGCAACCTACTGTACTGGTTCTATACCTTGTAGTGTTTGTCTTAGTGGTTTAGATTCTTTAGACACCTATCCTTCTTTAGAAACTATACAAATTACCATTTCATCTTTTAAATGGGATTTAACTGCTTTTGGCTTAGTTGCAGAGTGGTTTTTGGCATATATTCTTTTCACTAGGTTTTTCTATGTACTTGGATTGGCTGCAATCATGCAATTGTTTTTCAGCTATTTTGCAGTACATTTTATTAGTAATTCTTGGCTTATGTGGTTAATAATTAATCTTGTACAAATGGCCCCGATTTCAGCTATGGTTAGAATGTACATCTTCTTTGCATCATTTTATTATGTATGGAAAAGTTATGTGCATGTTGTAGACGGTTGTAATTCATCAACTTGTATGATGTGTTACAAACGTAATAGAGCAACAAGAGTCGAATGTACAACTATTGTTAATGGTGTTAGAAGGTCCTTTTATGTCTATGCTAATGGAGGTAAAGGCTTTTGCAAACTACACAATTGGAATTGTGTTAATTGTGATACATTCTGTGCTGGTAGTACATTTATTAGTGATGAAGTTGCGAGAGACTTGTCACTACAGTTTAAAAGACCAATAAATCCTACTGACCAGTCTTCTTACATCGTTGATAGTGTTACAGTGAAGAATGGTTCCATCCATCTTTACTTTGATAAAGCTGGTCAAAAGACTTATGAAAGACATTCTCTCTCTCATTTTGTTAACTTAGACAACCTGAGAGCTAATAACACTAAAGGTTCATTGCCTATTAATGTTATAGTTTTTGATGGTAAATCAAAATGTGAAGAATCATCTGCAAAATCAGCGTCTGTTTACTACAGTCAGCTTATGTGTCAACCTATACTGTTACTAGATCAGGCATTAGTGTCTGATGTTGGTGATAGTGCGGAAGTTGCAGTTAAAATGTTTGATGCTTACGTTAATACGTTTTCATCAACTTTTAACGTACCAATGGAAAAACTCAAAACACTAGTTGCAACTGCAGAAGCTGAACTTGCAAAGAATGTGTCCTTAGACAATGTCTTATCTACTTTTATTTCAGCAGCTCGGCAAGGGTTTGTTGATTCAGATGTAGAAACTAAAGATGTTGTTGAATGTCTTAAATTGTCACATCAATCTGACATAGAAGTTACTGGCGATAGTTGTAATAACTATATGCTCACCTATAACAAAGTTGAAAACATGACACCCCGTGACCTTGGTGCTTGTATTGACTGTAGTGCGCGTCATATTAATGCGCAGGTAGCAAAAAGTCACAACATTGCTTTGATATGGAACGTTAAAGATTTCATGTCATTGTCTGAACAACTACGAAAACAAATACGTAGTGCTGCTAAAAAGAATAACTTACCTTTTAAGTTGACATGTGCAACTACTAGACAAGTTGTTAATGTTGTAACAACAAAGATAGCACTTAAGGGTGGTAAAATTGTTAATAATTGGTTGAAGCAGTTAATTAAAGTTACACTTGTGTTCCTTTTTGTTGCTGCTATTTTCTATTTAATAACACCTGTTCATGTCATGTCTAAACATACTGACTTTTCAAGTGAAATCATAGGATACAAGGCTATTGATGGTGGTGTCACTCGTGACATAGCATCTACAGATACTTGTTTTGCTAACAAACATGCTGATTTTGACACATGGTTTAGTCAGCGTGGTGGTAGTTATACTAATGACAAAGCTTGCCCATTGATTGCTGCAGTCATAACAAGAGAAGTGGGTTTTGTCGTGCCTGGTTTGCCTGGCACGATATTACGCACAACTAATGGTGACTTTTTGCATTTCTTACCTAGAGTTTTTAGTGCAGTTGGTAACATCTGTTACACACCATCAAAACTTATAGAGTACACTGACTTTGCAACATCAGCTTGTGTTTTGGCTGCTGAATGTACAATTTTTAAAGATGCTTCTGGTAAGCCAGTACCATATTGTTATGATACCAATGTACTAGAAGGTTCTGTTGCTTATGAAAGTTTACGCCCTGACACACGTTATGTGCTCATGGATGGCTCTATTATTCAATTTCCTAACACCTACCTTGAAGGTTCTGTTAGAGTGGTAACAACTTTTGATTCTGAGTACTGTAGGCACGGCACTTGTGAAAGATCAGAAGCTGGTGTTTGTGTATCTACTAGTGGTAGATGGGTACTTAACAATGATTATTACAGATCTTTACCAGGAGTTTTCTGTGGTGTAGATGCTGTAAATTTACTTACTAATATGTTTACACCACTAATTCAACCTATTGGTGCTTTGGACATATCAGCATCTATAGTAGCTGGTGGTATTGTAGCTATCGTAGTAACATGCCTTGCCTACTATTTTATGAGGTTTAGAAGAGCTTTTGGTGAATACAGTCATGTAGTTGCCTTTAATACTTTACTATTCCTTATGTCATTCACTGTACTCTGTTTAACACCAGTTTACTCATTCTTACCTGGTGTTTATTCTGTTATTTACTTGTACTTGACATTTTATCTTACTAATGATGTTTCTTTTTTAGCACATATTCAGTGGATGGTTATGTTCACACCTTTAGTACCTTTCTGGATAACAATTGCTTATATCATTTGTATTTCCACAAAGCATTTCTATTGGTTCTTTAGTAATTACCTAAAGAGACGTGTAGTCTTTAATGGTGTTTCCTTTAGTACTTTTGAAGAAGCTGCGCTGTGCACCTTTTTGTTAAATAAAGAAATGTATCTAAAGTTGCGTAGTGATGTGCTATTACCTCTTACGCAATATAATAGATACTTAGCTCTTTATAATAAGTACAAGTATTTTAGTGGAGCAATGGATACAACTAGCTACAGAGAAGCTGCTTGTTGTCATCTCGCAAAGGCTCTCAATGACTTCAGTAACTCAGGTTCTGATGTTCTTTACCAACCACCACAAACCTCTATCACCTCAGCTGTTTTGCAGAGTGGTTTTAGAAAAATGGCATTCCCATCTGGTAAAGTTGAGGGTTGTATGGTACAAGTAACTTGTGGTACAACTACACTTAACGGTCTTTGGCTTGATGACGTAGTTTACTGTCCAAGACATGTGATCTGCACCTCTGAAGACATGCTTAACCCTAATTATGAAGATTTACTCATTCGTAAGTCTAATCATAATTTCTTGGTACAGGCTGGTAATGTTCAACTCAGGGTTATTGGACATTCTATGCAAAATTGTGTACTTAAGCTTAAGGTTGATACAGCCAATCCTAAGACACCTAAGTATAAGTTTGTTCGCATTCAACCAGGACAGACTTTTTCAGTGTTAGCTTGTTACAATGGTTCACCATCTGGTGTTTACCAATGTGCTATGAGGCCCAATTTCACTATTAAGGGTTCATTCCTTAATGGTTCATGTGGTAGTGTTGGTTTTAACATAGATTATGACTGTGTCTCTTTTTGTTACATGCACCATATGGAATTACCAACTGGAGTTCATGCTGGCACAGACTTAGAAGGTAACTTTTATGGACCTTTTGTTGACAGGCAAACAGCACAAGCAGCTGGTACGGACACAACTATTACAGTTAATGTTTTAGCTTGGTTGTACGCTGCTGTTATAAATGGAGACAGGTGGTTTCTCAATCGATTTACCACAACTCTTAATGACTTTAACCTTGTGGCTATGAAGTACAATTATGAACCTCTAACACAAGACCATGTTGACATACTAGGACCTCTTTCTGCTCAAACTGGAATTGCCGTTTTAGATATGTGTGCTTCATTAAAAGAATTACTGCAAAATGGTATGAATGGACGTACCATATTGGGTAGTGCTTTATTAGAAGATGAATTTACACCTTTTGATGTTGTTAGACAATGCTCAGGTGTTACTTTCCAAAGTGCAGTGAAAAGAACAATCAAGGGTACACACCACTGGTTGTTACTCACAATTTTGACTTCACTTTTAGTTTTAGTCCAGAGTACTCAATGGTCTTTGTTCTTTTTTTTGTATGAAAATGCCTTTTTACCTTTTGCTATGGGTATTATTGCTATGTCTGCTTTTGCAATGATGTTTGTCAAACATAAGCATGCATTTCTCTGTTTGTTTTTGTTACCTTCTCTTGCCACTGTAGCTTATTTTAATATGGTCTATATGCCTGCTAGTTGGGTGATGCGTATTATGACATGGTTGGATATGGTTGATACTAGTTTGTCTGGTTTTAAGCTAAAAGACTGTGTTATGTATGCATCAGCTGTAGTGTTACTAATCCTTATGACAGCAAGAACTGTGTATGATGATGGTGCTAGGAGAGTGTGGACACTTATGAATGTCTTGACACTCGTTTATAAAGTTTATTATGGTAATGCTTTAGATCAAGCCATTTCCATGTGGGCTCTTATAATCTCTGTTACTTCTAACTACTCAGGTGTAGTTACAACTGTCATGTTTTTGGCCAGAGGTATTGTTTTTATGTGTGTTGAGTATTGCCCTATTTTCTTCATAACTGGTAATACACTTCAGTGTATAATGCTAGTTTATTGTTTCTTAGGCTATTTTTGTACTTGTTACTTTGGCCTCTTTTGTTTACTCAACCGCTACTTTAGACTGACTCTTGGTGTTTATGATTACTTAGTTTCTACACAGGAGTTTAGATATATGAATTCACAGGGACTACTCCCACCCAAGAATAGCATAGATGCCTTCAAACTCAACATTAAATTGTTGGGTGTTGGTGGCAAACCTTGTATCAAAGTAGCCACTGTACAGTCTAAAATGTCAGATGTAAAGTGCACATCAGTAGTCTTACTCTCAGTTTTGCAACAACTCAGAGTAGAATCATCATCTAAATTGTGGGCTCAATGTGTCCAGTTACACAATGACATTCTCTTAGCTAAAGATACTACTGAAGCCTTTGAAAAAATGGTTTCACTACTTTCTGTTTTGCTTTCCATGCAGGGTGCTGTAGACATAAACAAGCTTTGTGAAGAAATGCTGGACAACAGGGCAACCTTACAAGCTATAGCCTCAGAGTTTAGTTCCCTTCCATCATATGCAGCTTTTGCTACTGCTCAAGAAGCTTATGAGCAGGCTGTTGCTAATGGTGATTCTGAAGTTGTTCTTAAAAAGTTGAAGAAGTCTTTGAATGTGGCTAAATCTGAATTTGACCGTGATGCAGCCATGCAACGTAAGTTGGAAAAGATGGCTGATCAAGCTATGACCCAAATGTATAAACAGGCTAGATCTGAGGACAAGAGGGCAAAAGTTACTAGTGCTATGCAGACAATGCTTTTCACTATGCTTAGAAAGTTGGATAATGATGCACTCAACAACATTATCAACAATGCAAGAGATGGTTGTGTTCCCTTGAACATAATACCTCTTACAACAGCAGCCAAACTAATGGTTGTCATACCAGACTATAACACATATAAAAATACGTGTGATGGTACAACATTTACTTATGCATCAGCATTGTGGGAAATCCAACAGGTTGTAGATGCAGATAGTAAAATTGTTCAACTTAGTGAAATTAGTATGGACAATTCACCTAATTTAGCATGGCCTCTTATTGTAACAGCTTTAAGGGCCAATTCTGCTGTCAAATTACAGAATAATGAGCTTAGTCCTGTTGCACTACGACAGATGTCTTGTGCTGCCGGTACTACACAAACTGCTTGCACTGATGACAATGCGTTAGCTTACTACAACACAACAAAGGGAGGTAGGTTTGTACTTGCACTGTTATCCGATTTACAGGATTTGAAATGGGCTAGATTCCCTAAGAGTGATGGAACTGGTACTATCTATACAGAACTGGAACCACCTTGTAGGTTTGTTACAGACACACCTAAAGGTCCTAAAGTGAAGTATTTATACTTTATTAAAGGATTAAACAACCTAAATAGAGGTATGGTACTTGGTAGTTTAGCTGCCACAGTACGTCTACAAGCTGGTAATGCAACAGAAGTGCCTGCCAATTCAACTGTATTATCTTTCTGTGCTTTTGCTGTAGATGCTGCTAAAGCTTACAAAGATTATCTAGCTAGTGGGGGACAACCAATCACTAATTGTGTTAAGATGTTGTGTACACACACTGGTACTGGTCAGGCAATAACAGTTACACCGGAAGCCAATATGGATCAAGAATCCTTTGGTGGTGCATCGTGTTGTCTGTACTGCCGTTGCCACATAGATCATCCAAATCCTAAAGGATTTTGTGACTTAAAAGGTAAGTATGTACAAATACCTACAACTTGTGCTAATGACCCTGTGGGTTTTACACTTAAAAACACAGTCTGTACCGTCTGCGGTATGTGGAAAGGTTATGGCTGTAGTTGTGATCAACTCCGCGAACCCATGCTTCAGTCAGCTGATGCACAATCGTTTTTAAACGGGTTTGCGGTGTAAGTGCAGCCCGTCTTACACCGTGCGGCACAGGCACTAGTACTGATGTCGTATACAGGGCTTTTGACATCTACAATGATAAAGTAGCTGGTTTTGCTAAATTCCTAAAAACTAATTGTTGTCGCTTCCAAGAAAAGGACGAAGATGACAATTTAATTGATTCTTACTTTGTAGTTAAGAGACACACTTTCTCTAACTACCAACATGAAGAAACAATTTATAATTTACTTAAGGATTGTCCAGCTGTTGCTAAACATGACTTCTTTAAGTTTAGAATAGACGGTGACATGGTACCACATATATCACGTCAACGTCTTACTAAATACACAATGGCAGACCTCGTCTATGCTTTAAGGCATTTTGATGAAGGTAATTGTGACACATTAAAAGAAATACTTGTCACATACAATTGTTGTGATGATGATTATTTCAATAAAAAGGACTGGTATGATTTTGTAGAAAACCCAGATATATTACGCGTATACGCCAACTTAGGTGAACGTGTACGCCAAGCTTTGTTAAAAACAGTACAATTCTGTGATGCCATGCGAAATGCTGGTATTGTTGGTGTACTGACATTAGATAATCAAGATCTCAATGGTAACTGGTATGATTTCGGTGATTTCATACAAACCACGCCAGGTAGTGGAGTTCCTGTTGTAGATTCTTATTATTCATTGTTAATGCCTATATTAACCTTGACCAGGGCTTTAACTGCAGAGTCACATGTTGACACTGACTTAACAAAGCCTTACATTAAGTGGGATTTGTTAAAATATGACTTCACGGAAGAGAGGTTAAAACTCTTTGACCGTTATTTTAAATATTGGGATCAGACATACCACCCAAATTGTGTTAACTGTTTGGATGACAGATGCATTCTGCATTGTGCAAACTTTAATGTTTTATTCTCTACAGTGTTCCCACCTACAAGTTTTGGACCACTAGTGAGAAAAATATTTGTTGATGGTGTTCCATTTGTAGTTTCAACTGGATACCACTTCAGAGAGCTAGGTGTTGTACATAATCAGGATGTAAACTTACATAGCTCTAGACTTAGTTTTAAGGAATTACTTGTGTATGCTGCTGACCCTGCTATGCACGCTGCTTCTGGTAATCTATTACTAGATAAACGCACTACGTGCTTTTCAGTAGCTGCACTTACTAACAATGTTGCTTTTCAAACTGTCAAACCCGGTAATTTTAACAAAGACTTCTATGACTTTGCTGTGTCTAAGGGTTTCTTTAAGGAAGGAAGTTCTGTTGAATTAAAACACTTCTTCTTTGCTCAGGATGGTAATGCTGCTATCAGCGATTATGACTACTATCGTTATAATCTACCAACAATGTGTGATATCAGACAACTACTATTTGTAGTTGAAGTTGTTGATAAGTACTTTGATTGTTACGATGGTGGCTGTATTAATGCTAACCAAGTCATCGTCAACAACCTAGACAAATCAGCTGGTTTTCCATTTAATAAATGGGGTAAGGCTAGACTTTATTATGATTCAATGAGTTATGAGGATCAAGATGCACTTTTCGCATATACAAAACGTAATGTCATCCCTACTATAACTCAAATGAATCTTAAGTATGCCATTAGTGCAAAGAATAGAGCTCGCACCGTAGCTGGTGTCTCTATCTGTAGTACTATGACCAATAGACAGTTTCATCAAAAATTATTGAAATCAATAGCCGCCACTAGAGGAGCTACTGTAGTAATTGGAACAAGCAAATTCTATGGTGGTTGGCACAACATGTTAAAAACTGTTTATAGTGATGTAGAAAACCCTCACCTTATGGGTTGGGATTATCCTAAATGTGATAGAGCCATGCCTAACATGCTTAGAATTATGGCCTCACTTGTTCTTGCTCGCAAACATACAACGTGTTGTAGCTTGTCACACCGTTTCTATAGATTAGCTAATGAGTGTGCTCAAGTATTGAGTGAAATGGTCATGTGTGGCGGTTCACTATATGTTAAACCAGGTGGAACCTCATCAGGAGATGCCACAACTGCTTATGCTAATAGTGTTTTTAACATTTGTCAAGCTGTCACGGCCAATGTTAATGCACTTTTATCTACTGATGGTAACAAAATTGCCGATAAGTATGTCCGCAATTTACAACACAGACTTTATGAGTGTCTCTATAGAAATAGAGATGTTGACACAGACTTTGTGAATGAGTTTTACGCATATTTGCGTAAACATTTCTCAATGATGATACTCTCTGACGATGCTGTTGTGTGTTTCAATAGCACTTATGCATCTCAAGGTCTAGTGGCTAGCATAAAGAACTTTAAGTCAGTTCTTTATTATCAAAACAATGTTTTTATGTCTGAAGCAAAATGTTGGACTGAGACTGACCTTACTAAAGGACCTCATGAATTTTGCTCTCAACATACAATGCTAGTTAAACAGGGTGATGATTATGTGTACCTTCCTTACCCAGATCCATCAAGAATCCTAGGGGCCGGCTGTTTTGTAGATGATATCGTAAAAACAGATGGTACACTTATGATTGAACGGTTCGTGTCTTTAGCTATAGATGCTTACCCACTTACTAAACATCCTAATCAGGAGTATGCTGATGTCTTTCATTTGTACTTACAATACATAAGAAAGCTACATGATGAGTTAACAGGACACATGTTAGACATGTATTCTGTTATGCTTACTAATGATAACACTTCAAGGTATTGGGAACCTGAGTTTTATGAGGCTATGTACACACCGCATACAGTCTTACAGGCTGTTGGGGCTTGTGTTCTTTGCAATTCACAGACTTCATTAAGATGTGGTGCTTGCATACGTAGACCATTCTTATGTTGTAAATGCTGTTACGACCATGTCATATCAACATCACATAAATTAGTCTTGTCTGTTAATCCGTATGTTTGCAATGCTCCAGGTTGTGATGTCACAGATGTGACTCAACTTTACTTAGGAGGTATGAGCTATTATTGTAAATCACATAAACCACCCATTAGTTTTCCATTGTGTGCTAATGGACAAGTTTTTGGTTTATATAAAAATACATGTGTTGGTAGCGATAATGTTACTGACTTTAATGCAATTGCAACATGTGACTGGACAAATGCTGGTGATTACATTTTAGCTAACACCTGTACTGAAAGACTCAAGCTTTTTGCAGCAGAAACGCTCAAAGCTACTGAGGAGACATTTAAACTGTCTTATGGTATTGCTACTGTACGTGAAGTGCTGTCTGACAGAGAATTACATCTTTCATGGGAAGTTGGTAAACCTAGACCACCACTTAACCGAAATTATGTCTTTACTGGTTATCGTGTAACTAAAAACAGTAAAGTACAAATAGGAGAGTACACCTTTGAAAAAGGTGACTATGGTGATGCTGTTGTTTACCGAGGTACAACAACTTACAAATTAAATGTTGGTGATTATTTTGTGCTGACATCACATACAGTAATGCCATTAAGTGCACCTACACTAGTGCCACAAGAGCACTATGTTAGAATTACTGGCTTATACCCAACACTCAATATCTCAGATGAGTTTTCTAGCAATGTTGCAAATTATCAAAAGGTTGGTATGCAAAAGTATTCTACACTCCAGGGACCACCTGGTACTGGTAAGAGTCATTTTGCTATTGGCCTAGCTCTCTACTACCCTTCTGCTCGCATAGTGTATACAGCTTGCTCTCATGCCGCTGTTGATGCACTATGTGAGAAGGCATTAAAATATTTGCCTATAGATAAATGTAGTAGAATTATACCTGCACGTGCTCGTGTAGAGTGTTTTGATAAATTCAAAGTGAATTCAACATTAGAACAGTATGTCTTTTGTACTGTAAATGCATTGCCTGAGACGACAGCAGATATAGTTGTCTTTGATGAAATTTCAATGGCCACAAATTATGATTTGAGTGTTGTCAATGCCAGATTACGTGCTAAGCACTATGTGTACATTGGCGACCCTGCTCAATTACCTGCACCACGCACATTGCTAACTAAGGGCACACTAGAACCAGAATATTTCAATTCAGTGTGTAGACTTATGAAAACTATAGGTCCAGACATGTTCCTCGGAACTTGTCGGCGTTGTCCTGCTGAAATTGTTGACACTGTGAGTGCTTTGGTTTATGATAATAAGCTTAAAGCACATAAAGACAAATCAGCTCAATGCTTTAAAATGTTTTATAAGGGTGTTATCACGCATGATGTTTCATCTGCAATTAACAGGCCACAAATAGGCGTGGTAAGAGAATTCCTTACACGTAACCCTGCTTGGAGAAAAGCTGTCTTTATTTCACCTTATAATTCACAGAATGCTGTAGCCTCAAAGATTTTGGGACTACCAACTCAAACTGTTGATTCATCACAGGGCTCAGAATATGACTATGTCATATTCACTCAAACCACTGAAACAGCTCACTCTTGTAATGTAAACAGATTTAATGTTGCTATTACCAGAGCAAAAGTAGGCATACTTTGCATAATGTCTGATAGAGACCTTTATGACAAGTTGCAATTTACAAGTCTTGAAATTCCACGTAGGAATGTGGCAACTTTACAAGCTGAAAATGTAACAGGACTCTTTAAAGATTGTAGTAAGGTAATCACTGGGTTACATCCTACACAGGCACCTACACACCTCAGTGTTGACACTAAATTCAAAACTGAAGGTTTATGTGTTGACATACCTGGCATACCTAAGGACATGACCTATAGAAGACTCATCTCTATGATGGGTTTTAAAATGAATTATCAAGTTAATGGTTACCCTAACATGTTTATCACCCGCGAAGAAGCTATAAGACATGTACGTGCATGGATTGGCTTCGATGTCGAGGGGTGTCATGCTACTAGAGAAGCTGTTGGTACCAATTTACCTTTACAGCTAGGTTTTTCTACAGGTGTTAACCTAGTTGCTGTACCTACAGGTTATGTTGATACACCTAATAATACAGATTTTTCCAGAGTTAGTGCTAAACCACCGCCTGGAGATCAATTTAAACACCTCATACCACTTATGTACAAAGGACTTCCTTGGAATGTAGTGCGTATAAAGATTGTACAAATGTTAAGTGACACACTTAAAAATCTCTCTGACAGAGTCGTATTTGTCTTATGGGCACATGGCTTTGAGTTGACATCTATGAAGTATTTTGTGAAAATAGGACCTGAGCGCACCTGTTGTCTATGTGATAGACGTGCCACATGCTTTTCCACTGCTTCAGACACTTATGCCTGTTGGCATCATTCTATTGGATTTGATTACGTCTATAATCCGTTTATGATTGATGTTCAACAATGGGGTTTTACAGGTAACCTACAAAGCAACCATGATCTGTATTGTCAAGTCCATGGTAATGCACATGTAGCTAGTTGTGATGCAATCATGACTAGGTGTCTAGCTGTCCACGAGTGCTTTGTTAAGCGTGTTGACTGGACTATTGAATATCCTATAATTGGTGATGAACTGAAGATTAATGCGGCTTGTAGAAAGGTTCAACACATGGTTGTTAAAGCTGCATTATTAGCAGACAAATTCCCAGTTCTTCACGACATTGGTAACCCTAAAGCTATTAAGTGTGTACCTCAAGCTGATGTAGAATGGAAGTTCTATGATGCACAGCCTTGTAGTGACAAAGCTTATAAAATAGAAGAATTATTCTATTCTTATGCCACACATTCTGACAAATTCACAGATGGTGTATGCCTATTTTGGAATTGCAATGTCGATAGATATCCTGCTAATTCCATTGTTTGTAGATTTGACACTAGAGTGCTATCTAACCTTAACTTGCCTGGTTGTGATGGTGGCAGTTTGTATGTAAATAAACATGCATTCCACACACCAGCTTTTGATAAAAGTGCTTTTGTTAATTTAAAACAATTACCATTTTTCTATTACTCTGACAGTCCATGTGAGTCTCATGGAAAACAAGTAGTGTCAGATATAGATTATGTACCACTAAAGTCTGCTACGTGTATAACACGTTGCAATTTAGGTGGTGCTGTCTGTAGACATCATGCTAATGAGTACAGATTGTATCTCGATGCTTATAACATGATGATCTCAGCTGGCTTTAGCTTGTGGGTTTACAAACAATTTGATACTTATAACCTCTGGAACACTTTTACAAGACTTCAGAGTTTAGAAAATGTGGCTTTTAATGTTGTAAATAAGGGACACTTTGATGGACAACAGGGTGAAGTACCAGTTTCTATCATTAATAACACTGTTTACACAAAAGTTGATGGTGTTGATGTAGAATTGTTTGAAAATAAAACAACATTACCTGTTAATGTAGCATTTGAGCTTTGGGCTAAGCGCAACATTAAACCAGTACCAGAGGTGAAAATACTCAATAATTTGGGTGTGGACATTGCTGCTAATACTGTGATCTGGGACTACAAAAGAGATGCTCCAGCACATATATCTACTATTGGTGTTTGTTCTATGACTGACATAGCCAAGAAACCAACTGAAACGATTTGTGCACCACTCACTGTCTTTTTTGATGGTAGAGTTGATGGTCAAGTAGACTTATTTAGAAATGCCCGTAATGGTGTTCTTATTACAGAAGGTAGTGTTAAAGGTTTACAACCATCTGTAGGTCCCAAACAAGCTAGTCTTAATGGAGTCACATTAATTGGAGAAGCCGTAAAAACACAGTTCAATTATTATAAGAAAGTTGATGGTGTTGTCCAACAATTACCTGAAACTTACTTTACTCAGAGTAGAAATTTACAAGAATTTAAACCCAGGAGTCAAATGGAAATTGATTTCTTAGAATTAGCTATGGATGAATTCATTGAACGGTATAAATTAGAAGGCTATGCCTTCGAACATATCGTTTATGGAGATTTTAGTCATAGTCAGTTAGGTGGTTTACATCTACTGATTGGACTAGCTAAACGTTTTAAGGAATCACCTTTTGAATTAGAAGATTTTATTCCTATGGACAGTACAGTTAAAAACTATTTCATAACAGATGCGCAAACAGGTTCATCTAAGTGTGTGTGTTCTGTTATTGATTTATTACTTGATGATTTTGTTGAAATAATAAAATCCCAAGATTTATCTGTAGTTTCTAAGGTTGTCAAAGTGACTATTGACTATACAGAAATTTCATTTATGCTTTGGTGTAAAGATGGCCATGTAGAAACATTTTACCCAAAATTACAATCTAGTCAAGCGTGGCAACCGGGTGTTGCTATGCCTAATCTTTACAAAATGCAAAGAATGCTATTAGAAAAGTGTGACCTTCAAAATTATGGTGATAGTGCAACATTACCTAAAGGCATAATGATGAATGTCGCAAAATATACTCAACTGTGTCAATATTTAAACACATTAACATTAGCTGTACCCTATAATATGAGAGTTATACATTTTGGTGCTGGTTCTGATAAAGGAGTTGCACCAGGTACAGCTGTTTTAAGACAGTGGTTGCCTACGGGTACGCTGCTTGTCGATTCAGATCTTAATGACTTTGTCTCTGATGCAGATTCAACTTTGATTGGTGATTGTGCAACTGTACATACAGCTAATAAATGGGATCTCATTATTAGTGATATGTACGACCCTAAGACTAAAAATGTTACAAAAGAAAATGACTCTAAAGAGGGTTTTTTCACTTACATTTGTGGGTTTATACAACAAAAGCTAGCTCTTGGAGGTTCCGTGGCTATAAAGATAACAGAACATTCTTGGAATGCTGATCTTTATAAGCTCATGGGACACTTCGCATGGTGGACAGCCTTTGTTACTAATGTGAATGCGTCATCATCTGAAGCATTTTTAATTGGATGTAATTATCTTGGCAAACCACGCGAACAAATAGATGGTTATGTCATGCATGCAAATTACATATTTTGGAGGAATACAAATCCAATTCAGTTGTCTTCCTATTCTTTATTTGACATGAGTAAATTTCCCCTTAAATTAAGGGGTACTGCTGTTATGTCTTTAAAAGAAGGTCAAATCAATGATATGATTTTATCTCTTCTTAGTAAAGGTAGACTTATAATTAGAGAAAACAACAGAGTTGTTATTTCTAGTGATGTTCTTGTTAACAACTAAACGAACAATGTTTGTTTTTCTTGTTTTATTGCCACTAGTCTCTAGTCAGTGTGTTAATCTTACAACCAGAACTCAATTACCCCCTGCATACACTAATTCTTTCACACGTGGTGTTTATTACCCTGACAAAGTTTTCAGATCCTCAGTTTTACATTCAACTCAGGACTTGTTCTTACCTTTCTTTTCCAATGTTACTTGGTTCCATGCTATACATGTCTCTGGGACCAATGGTACTAAGAGGTTTGATAACCCTGTCCTACCATTTAATGATGGTGTTTATTTTGCTTCCACTGAGAAGTCTAACATAATAAGAGGCTGGATTTTTGGTACTACTTTAGATTCGAAGACCCAGTCCCTACTTATTGTTAATAACGCTACTAATGTTGTTATTAAAGTCTGTGAATTTCAATTTTGTAATGATCCATTTTTGGGTGTTTATTACCACAAAAACAACAAAAGTTGGATGGAAAGTGAGTTCAGAGTTTATTCTAGTGCGAATAATTGCACTTTTGAATATGTCTCTCAGCCTTTTCTTATGGACCTTGAAGGAAAACAGGGTAATTTCAAAAATCTTAGGGAATTTGTGTTTAAGAATATTGATGGTTATTTTAAAATATATTCTAAGCACACGCCTATTAATTTAGTGCGTGATCTCCCTCAGGGTTTTTCGGCTTTAGAACCATTGGTAGATTTGCCAATAGGTATTAACATCACTAGGTTTCAAACTTTACTTGCTTTACATAGAAGTTATTTGACTCCTGGTGATTCTTCTTCAGGTTGGACAGCTGGTGCTGCAGCTTATTATGTGGGTTATCTTCAACCTAGGACTTTTCTATTAAAATATAATGAAAATGGAACCATTACAGATGCTGTAGACTGTGCACTTGACCCTCTCTCAGAAACAAAGTGTACGTTGAAATCCTTCACTGTAGAAAAAGGAATCTATCAAACTTCTAACTTTAGAGTCCAACCAACAGAATCTATTGTTAGATTTCCTAATATTACAAACTTGTGCCCTTTTGGTGAAGTTTTTAACGCCACCAGATTTGCATCTGTTTATGCTTGGAACAGGAAGAGAATCAGCAACTGTGTTGCTGATTATTCTGTCCTATATAATTCCGCATCATTTTCCACTTTTAAGTGTTATGGAGTGTCTCCTACTAAATTAAATGATCTCTGCTTTACTAATGTCTATGCAGATTCATTTGTAATTAGAGGTGATGAAGTCAGACAAATCGCTCCAGGGCAAACTGGAAAGATTGCTGATTATAATTATAAATTACCAGATGATTTTACAGGCTGCGTTATAGCTTGGAATTCTAACAATCTTGATTCTAAGGTTGGTGGTAATTATAATTACCTGTATAGATTGTTTAGGAAGTCTAATCTCAAACCTTTTGAGAGAGATATTTCAACTGAAATCTATCAGGCCGGTAGCACACCTTGTAATGGTGTTGAAGGTTTTAATTGTTACTTTCCTTTACAATCATATGGTTTCCAACCCACTAATGGTGTTGGTTACCAACCATACAGAGTAGTAGTACTTTCTTTTGAACTTCTACATGCACCAGCAACTGTTTGTGGACCTAAAAAGTCTACTAATTTGGTTAAAAACAAATGTGTCAATTTCAACTTCAATGGTTTAACAGGCACAGGTGTTCTTACTGAGTCTAACAAAAAGTTTCTGCCTTTCCAACAATTTGGCAGAGACATTGCTGACACTACTGATGCTGTCCGTGATCCACAGACACTTGAGATTCTTGACATTACACCATGTTCTTTTGGTGGTGTCAGTGTTATAACACCAGGAACAAATACTTCTAACCAGGTTGCTGTTCTTTATCAGGATGTTAACTGCACAGAAGTCCCTGTTGCTATTCATGCAGATCAACTTACTCCTACTTGGCGTGTTTATTCTACAGGTTCTAATGTTTTTCAAACACGTGCAGGCTGTTTAATAGGGGCTGAACATGTCAACAACTCATATGAGTGTGACATACCCATTGGTGCAGGTATATGCGCTAGTTATCAGACTCAGACTAATTCTCCTCGGCGGGCACGTAGTGTAGCTAGTCAATCCATCATTGCCTACACTATGTCACTTGGTGCAGAAAATTCAGTTGCTTACTCTAATAACTCTATTGCCATACCCACAAATTTTACTATTAGTGTTACCACAGAAATTCTACCAGTGTCTATGACCAAGACATCAGTAGATTGTACAATGTACATTTGTGGTGATTCAACTGAATGCAGCAATCTTTTGTTGCAATATGGCAGTTTTTGTACACAATTAAACCGTGCTTTAACTGGAATAGCTGTTGAACAAGACAAAAACACCCAAGAAGTTTTTGCACAAGTCAAACAAATTTACAAAACACCACCAATTAAAGATTTTGGTGGTTTTAATTTTTCACAAATATTACCAGATCCATCAAAACCAAGCAAGAGGTCATTTATTGAAGATCTACTTTTCAACAAAGTGACACTTGCAGATGCTGGCTTCATCAAACAATATGGTGATTGCCTTGGTGATATTGCTGCTAGAGACCTCATTTGTGCACAAAAGTTTAACGGCCTTACTGTTTTGCCACCTTTGCTCACAGATGAAATGATTGCTCAATACACTTCTGCACTGTTAGCGGGTACAATCACTTCTGGTTGGACCTTTGGTGCAGGTGCTGCATTACAAATACCATTTGCTATGCAAATGGCTTATAGGTTTAATGGTATTGGAGTTACACAGAATGTTCTCTATGAGAACCAAAAATTGATTGCCAACCAATTTAATAGTGCTATTGGCAAAATTCAAGACTCACTTTCTTCCACAGCAAGTGCACTTGGAAAACTTCAAGATGTGGTCAACCAAAATGCACAAGCTTTAAACACGCTTGTTAAACAACTTAGCTCCAATTTTGGTGCAATTTCAAGTGTTTTAAATGATATCCTTTCACGTCTTGACAAAGTTGAGGCTGAAGTGCAAATTGATAGGTTGATCACAGGCAGACTTCAAAGTTTGCAGACATATGTGACTCAACAATTAATTAGAGCTGCAGAAATCAGAGCTTCTGCTAATCTTGCTGCTACTAAAATGTCAGAGTGTGTACTTGGACAATCAAAAAGAGTTGATTTTTGTGGAAAGGGCTATCATCTTATGTCCTTCCCTCAGTCAGCACCTCATGGTGTAGTCTTCTTGCATGTGACTTATGTCCCTGCACAAGAAAAGAACTTCACAACTGCTCCTGCCATTTGTCATGATGGAAAAGCACACTTTCCTCGTGAAGGTGTCTTTGTTTCAAATGGCACACACTGGTTTGTAACACAAAGGAATTTTTATGAACCACAAATCATTACTACAGACAACACATTTGTGTCTGGTAACTGTGATGTTGTAATAGGAATTGTCAACAACACAGTTTATGATCCTTTGCAACCTGAATTAGACTCATTCAAGGAGGAGTTAGATAAATATTTTAAGAATCATACATCACCAGATGTTGATTTAGGTGACATCTCTGGCATTAATGCTTCAGTTGTAAACATTCAAAAAGAAATTGACCGCCTCAATGAGGTTGCCAAGAATTTAAATGAATCTCTCATCGATCTCCAAGAACTTGGAAAGTATGAGCAGTATATAAAATGGCCATGGTACATTTGGCTAGGTTTTATAGCTGGCTTGATTGCCATAGTAATGGTGACAATTATGCTTTGCTGTATGACCAGTTGCTGTAGTTGTCTCAAGGGCTGTTGTTCTTGTGGATCCTGCTGCAAATTTGATGAAGACGACTCTGAGCCAGTGCTCAAAGGAGTCAAATTACATTACACATAAACGAACTTATGGATTTGTTTATGAGAATCTTCACAATTGGAACTGTAACTTTGAAGCAAGGTGAAATCAAGGATGCTACTCCTTCAGATTTTGTTCGCGCTACTGCAACGATACCGATACAAGCCTCACTCCCTTTCGGATGGCTTATTGTTGGCGTTGCACTTCTTGCTGTTTTTCAGAGCGCTTCCAAAATCATAACCCTCAAAAAGAGATGGCAACTAGCACTCTCCAAGGGTGTTCACTTTGTTTGCAACTTGCTGTTGTTGTTTGTAACAGTTTACTCACACCTTTTGCTCGTTGCTGCTGGCCTTGAAGCCCCTTTTCTCTATCTTTATGCTTTAGTCTACTTCTTGCAGAGTATAAACTTTGTAAGAATAATAATGAGGCTTTGGCTTTGCTGGAAATGCCGTTCCAAAAACCCATTACTTTATGATGCCAACTATTTTCTTTGCTGGCATACTAATTGTTACGACTATTGTATACCTTACAATAGTGTAACTTCTTCAATTGTCATTACTTCAGGTGATGGCACAACAAGTCCTATTTCTGAACATGACTACCAGATTGGTGGTTATACTGAAAAATGGGAATCTGGAGTAAAAGACTGTGTTGTATTACACAGTTACTTCACTTCAGACTATTACCAGCTGTACTCAACTCAATTGAGTACAGACACTGGTGTTGAACATGTTACCTTCTTCATCTACAATAAAATTGTTGATGAGCCTGAAGAACATGTCCAAATTCACACAATCGACGGTTCATCCGGAGTTGTTAATCCAGTAATGGAACCAATTTATGATGAACCGACGACGACTACTAGCGTGCCTTTGTAAGCACAAGCTGATGAGTACGAACTTATGTACTCATTCGTTTCGGAAGAGACAGGTACGTTAATAGTTAATAGCGTACTTCTTTTTCTTGCTTTCGTGGTATTCTTGCTAGTTACACTAGCCATCCTTACTGCGCTTCGATTGTGTGCGTACTGCTGCAATATTGTTAACGTGAGTCTTGTAAAACCTTCTTTTTACGTTTACTCTCGTGTTAAAAATCTGAATTCTTCTAGAGTTCCTGATCTTCTGGTCTAAACGAACTAAATATTATATTAGTTTTTCTGTTTGGAACTTTAATTTTAGCCATGGCAGATTCCAACGGTACTATTACCGTTGAAGAGCTTAAAAAGCTCCTTGAACAATGGAACCTAGTAATAGGTTTCCTATTCCTTACATGGATTTGTCTTCTACAATTTGCCTATGCCAACAGGAATAGGTTTTTGTATATAATTAAGTTAATTTTCCTCTGGCTGTTATGGCCAGTAACTTTAGCTTGTTTTGTGCTTGCTGCTGTTTACAGAATAAATTGGATCACCGGTGGAATTGCTATCGCAATGGCTTGTCTTGTAGGCTTGATGTGGCTCAGCTACTTCATTGCTTCTTTCAGACTGTTTGCGCGTACGCGTTCCATGTGGTCATTCAATCCAGAAACTAACATTCTTCTCAACGTGCCACTCCATGGCACTATTCTGACCAGACCGCTTCTAGAAAGTGAACTCGTAATCGGAGCTGTGATCCTTCGTGGACATCTTCGTATTGCTGGACACCATCTAGGACGCTGTGACATCAAGGACCTGCCTAAAGAAATCACTGTTGCTACATCACGAACGCTTTCTTATTACAAATTGGGAGCTTCGCAGCGTGTAGCAGGTGACTCAGGTTTTGCTGCATACAGTCGCTACAGGATTGGCAACTATAAATTAAACACAGACCATTCCAGTAGCAGTGACAATATTGCTTTGCTTGTACAGTAAGTGACAACAGATGTTTCATCTCGTTGACTTTCAGGTTACTATAGCAGAGATATTACTAATTATTATGAGGACTTTTAAAGTTTCCATTTGGAATCTTGATTACATCATAAACCTCATAATTAAAAATTTATCTAAGTCACTAACTGAGAATAAATATTCTCAATTAGATGAAGAGCAACCAATGGAGATTGATTAAACGAACATGAAAATTATTCTTTTCTTGGCACTGATAACACTCGCTACTTGTGAGCTTTATCACTACCAAGAGTGTGTTAGAGGTACAACAGTACTTTTAAAAGAACCTTGCTCTTCTGGAACATACGAGGGCAATTCACCATTTCATCCTCTAGCTGATAACAAATTTGCACTGACTTGCTTTAGCACTCAATTTGCTTTTGCTTGTCCTGACGGCGTAAAACACGTCTATCAGTTACGTGCCAGATCAGTTTCACCTAAACTGTTCATCAGACAAGAGGAAGTTCAAGAACTTTACTCTCCAATTTTTCTTATTGTTGCGGCAATAGTGTTTATAACACTTTGCTTCACACTCAAAAGAAAGACAGAATGATTGAACTTTCATTAATTGACTTCTATTTGTGCTTTTTAGCCTTTCTGCTATTCCTTGTTTTAATTATGCTTATTATCTTTTGGTTCTCACTTGAACTGCAAGATCATAATGAAACTTGTCACGCCTAAACGAACATGAAATTTCTTGTTTTCTTAGGAATCATCACAACTGTAGCTGCATTTCACCAAGAATGTAGTTTACAGTCATGTACTCAACATCAACCATATGTAGTTGATGACCCGTGTCCTATTCACTTCTATTCTAAATGGTATATTAGAGTAGGAGCTAGAAAATCAGCACCTTTAATTGAATTGTGCGTGGATGAGGCTGGTTCTAAATCACCCATTCAGTACATCGATATCGGTAATTATACAGTTTCCTGTTCACCTTTTACAATTAATTGCCAGGAACCTAAATTGGGTAGTCTTGTAGTGCGTTGTTCGTTCTATGAAGACTTTTTAGAGTATCATGACGTTCGTGTTGTTTTAGATTTCATCTAAACGAACAAACTAAAATGTCTGATAATGGACCCCAAAATCAGCGAAATGCACCCCGCATTACGTTTGGTGGACCCTCAGATTCAACTGGCAGTAACCAGAATGGAGAACGCAGTGGGGCGCGATCAAAACAACGTCGGCCCCAAGGTTTACCCAATAATACTGCGTCTTGGTTCACCGCTCTCACTCAACATGGCAAGGAAGACCTTAAATTCCCTCGAGGACAAGGCGTTCCAATTAACACCAATAGCAGTCCAGATGACCAAATTGGCTACTACCGAAGAGCTACCAGACGAATTCGTGGTGGTGACGGTAAAATGAAAGATCTCAGTCCAAGATGGTATTTCTACTACCTAGGAACTGGGCCAGAAGCTGGACTTCCCTATGGTGCTAACAAAGACGGCATCATATGGGTTGCAACTGAGGGAGCCTTGAATACACCAAAAGATCACATTGGCACCCGCAATCCTGCTAACAATGCTGCAATCGTGCTACAACTTCCTCAAGGAACAACATTGCCAAAAGGCTTCTACGCAGAAGGGAGCAGAGGCGGCAGTCAAGCCTCTTCTCGTTCCTCATCACGTAGTCGCAACAGTTCAAGAAATTCAACTCCAGGCAGCAGTAGGGGAACTTCTCCTGCTAGAATGGCTGGCAATGGCGGTGATGCTGCTCTTGCTTTGCTGCTGCTTGACAGATTGAACCAGCTTGAGAGCAAAATGTCTGGTAAAGGCCAACAACAACAAGGCCAAACTGTCACTAAGAAATCTGCTGCTGAGGCTTCTAAGAAGCCTCGGCAAAAACGTACTGCCACTAAAGCATACAATGTAACACAAGCTTTCGGCAGACGTGGTCCAGAACAAACCCAAGGAAATTTTGGGGACCAGGAACTAATCAGACAAGGAACTGATTACAAACATTGGCCGCAAATTGCACAATTTGCCCCCAGCGCTTCAGCGTTCTTCGGAATGTCGCGCATTGGCATGGAAGTCACACCTTCGGGAACGTGGTTGACCTACACAGGTGCCATCAAATTGGATGACAAAGATCCAAATTTCAAAGATCAAGTCATTTTGCTGAATAAGCATATTGACGCATACAAAACATTCCCACCAACAGAGCCTAAAAAGGACAAAAAGAAGAAGGCTGATGAAACTCAAGCCTTACCGCAGAGACAGAAGAAACAGCAAACTGTGACTCTTCTTCCTGCTGCAGATTTGGATGATTTCTCCAAACAATTGCAACAATCCATGAGCAGTGCTGACTCAACTCAGGCCTAAACTCATGCAGACCACACAAGGCAGATGGGCTATATAAACGTTTTCGCTTTTCCGTTTACGATATATAGTCTACTCTTGTGCAGAATGAATTCTCGTAACTACATAGCACAAGTAGATGTAGTTAACTTTAATCTCACATAGNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN \ No newline at end of file diff --git a/pangolin/scripts/pangolearn.smk b/pangolin/scripts/pangolearn.smk index c9d3547..8a83ed2 100644 --- a/pangolin/scripts/pangolearn.smk +++ b/pangolin/scripts/pangolearn.smk @@ -43,6 +43,41 @@ else: input: config["outfile"] +rule minimap2_to_reference: + input: + fasta = config["to_find_closest"], + reference = config["reference_fasta"] + output: + sam = os.path.join(config["tempdir"],"post_qc_query.reference_mapped.sam") + shell: + """ + minimap2 -a -x asm5 {input.reference:q} {input.fasta:q} > {output.sam:q} + """ + +rule datafunk_trim_and_pad: + input: + sam = rules.minimap2_to_reference.output.sam, + reference = config["reference_fasta"] + params: + trim_start = config["trim_start"], + trim_end = config["trim_end"], + insertions = os.path.join(config["tempdir"],"post_qc_query.insertions.txt") + output: + fasta = os.path.join(config["tempdir"],"post_qc_query.aligned.fasta") + shell: + """ + datafunk sam_2_fasta \ + -s {input.sam:q} \ + -r {input.reference:q} \ + -o {output.fasta:q} \ + -t [{params.trim_start}:{params.trim_end}] \ + --pad \ + --log-inserts + """ + +rule datafunk_trim_and_pad: + + rule pangolearn: input: fasta = config["query_fasta"], From bc84407d60095300ac4be74607120f005b81c601 Mon Sep 17 00:00:00 2001 From: aineniamh Date: Fri, 17 Jul 2020 17:03:16 +0100 Subject: [PATCH 06/35] linking up mapped files in smake --- pangolin/scripts/pangolearn.smk | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/pangolin/scripts/pangolearn.smk b/pangolin/scripts/pangolearn.smk index 8a83ed2..87a302f 100644 --- a/pangolin/scripts/pangolearn.smk +++ b/pangolin/scripts/pangolearn.smk @@ -45,10 +45,10 @@ else: rule minimap2_to_reference: input: - fasta = config["to_find_closest"], + fasta = config["query_fasta"], reference = config["reference_fasta"] output: - sam = os.path.join(config["tempdir"],"post_qc_query.reference_mapped.sam") + sam = os.path.join(config["tempdir"],"reference_mapped.sam") shell: """ minimap2 -a -x asm5 {input.reference:q} {input.fasta:q} > {output.sam:q} @@ -61,7 +61,7 @@ rule datafunk_trim_and_pad: params: trim_start = config["trim_start"], trim_end = config["trim_end"], - insertions = os.path.join(config["tempdir"],"post_qc_query.insertions.txt") + insertions = os.path.join(config["tempdir"],"insertions.txt") output: fasta = os.path.join(config["tempdir"],"post_qc_query.aligned.fasta") shell: @@ -80,7 +80,7 @@ rule datafunk_trim_and_pad: rule pangolearn: input: - fasta = config["query_fasta"], + fasta = rules.datafunk_trim_and_pad.output.fasta, model = config["trained_model"], header = config["header_file"] output: From 1dfd7bb8027035ef0aed947e93bb55d0117df7fe Mon Sep 17 00:00:00 2001 From: aineniamh Date: Fri, 17 Jul 2020 17:04:42 +0100 Subject: [PATCH 07/35] package data now installs anonymised reference --- setup.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/setup.py b/setup.py index 3189b65..69b1097 100644 --- a/setup.py +++ b/setup.py @@ -28,16 +28,18 @@ 'pangolin/scripts/report_classes.py', 'pangolin/scripts/report_results.py' ], + package_data={"pangolin":["data/reference.fasta"]}, install_requires=[ "biopython>=1.70", "dendropy>=4.4.0", "pytools>=2020.1", 'pandas>=1.0.1', - 'sklearn>=0.0', + "wheel>=0.34", + 'sklearn', 'pysam>=0.15.4' ], description='hcov-2019 subtyping command line tool', - url='https://github.com/hCov-2019/pangolin', + url='https://github.com/cov-lineages/pangolin', author='Aine OToole and JT McCrone', author_email='aine.otoole@ed.ac.uk', entry_points=""" From 6534467c04fe4ea76dd6be1c6ac58b1ae2ad414f Mon Sep 17 00:00:00 2001 From: aineniamh Date: Fri, 17 Jul 2020 17:05:01 +0100 Subject: [PATCH 08/35] datafunk as a part of the environment --- environment.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/environment.yml b/environment.yml index 123854d..49cd55a 100644 --- a/environment.yml +++ b/environment.yml @@ -14,4 +14,5 @@ dependencies: - pandas==1.0.1 - pytools==2020.1 - dendropy>=4.4.0 + - git+https://github.com/cov-ert/datafunk.git - git+https://github.com/hCoV-2019/lineages.git From 9f0332274ffebdd81a69e43b2dfb05a0d252e763 Mon Sep 17 00:00:00 2001 From: aineniamh Date: Fri, 17 Jul 2020 17:19:13 +0100 Subject: [PATCH 09/35] importing pkg resources for ref file --- pangolin/command.py | 1 + 1 file changed, 1 insertion(+) diff --git a/pangolin/command.py b/pangolin/command.py index 4893a9c..477e39a 100644 --- a/pangolin/command.py +++ b/pangolin/command.py @@ -10,6 +10,7 @@ import json import lineages import setuptools +import pkg_resources from Bio import SeqIO From bb27898b78774c3c0bab2b7131067de514151ff1 Mon Sep 17 00:00:00 2001 From: aineniamh Date: Fri, 17 Jul 2020 17:20:19 +0100 Subject: [PATCH 10/35] removing redundant rule head --- pangolin/scripts/pangolearn.smk | 3 --- 1 file changed, 3 deletions(-) diff --git a/pangolin/scripts/pangolearn.smk b/pangolin/scripts/pangolearn.smk index 87a302f..eaca46c 100644 --- a/pangolin/scripts/pangolearn.smk +++ b/pangolin/scripts/pangolearn.smk @@ -75,9 +75,6 @@ rule datafunk_trim_and_pad: --log-inserts """ -rule datafunk_trim_and_pad: - - rule pangolearn: input: fasta = rules.datafunk_trim_and_pad.output.fasta, From d5f8f5445dfce2a995fe6fb54d88d99dbf6b3d4f Mon Sep 17 00:00:00 2001 From: aineniamh Date: Fri, 17 Jul 2020 17:30:50 +0100 Subject: [PATCH 11/35] no temp --- pangolin/command.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/pangolin/command.py b/pangolin/command.py index 477e39a..a87d6f6 100644 --- a/pangolin/command.py +++ b/pangolin/command.py @@ -33,6 +33,7 @@ def main(sysargs = sys.argv[1:]): parser.add_argument('-n', '--dry-run', action='store_true',help="Go through the motions but don't actually run") parser.add_argument('-f', '--force', action='store_true',help="Overwrite all output",dest="force") parser.add_argument('--tempdir',action="store",help="Specify where you want the temp stuff to go. Default: $TMPDIR") + parser.add_argument("--no-temp",action="store_true",help="Output all intermediate files, for dev purposes.") parser.add_argument('--max-ambig', action="store", default=0.5, type=float,help="Maximum proportion of Ns allowed for pangolin to attempt assignment. Default: 0.5",dest="maxambig") parser.add_argument('--min-length', action="store", default=10000, type=int,help="Minimum query length allowed for pangolin to attempt assignment. Default: 10000",dest="minlen") parser.add_argument('--panGUIlin', action='store_true',help="Run web-app version of pangolin",dest="panGUIlin") @@ -92,6 +93,10 @@ def main(sysargs = sys.argv[1:]): else: temporary_directory = tempfile.TemporaryDirectory(suffix=None, prefix=None, dir=None) tempdir = temporary_directory.name + + if args.no_temp: + print(f"--no-temp: All intermediate files will be written to {outdir}") + tempdir = outdir """ QC steps: @@ -197,6 +202,9 @@ def main(sysargs = sys.argv[1:]): config["guide_tree"]=guide_tree config["trained_model"] = trained_model config["header_file"] = header_file + + # if no temp, just write everything to outdir + if args.write_tree: config["write_tree"]="True" From f233ccd425374fb7f2bbb25f727c1b0d7ffb5cdb Mon Sep 17 00:00:00 2001 From: emilyscher Date: Fri, 17 Jul 2020 18:18:55 +0100 Subject: [PATCH 12/35] adding confidence scores to output file --- pangolin/scripts/pangolearn.py | 28 ++++++++++++++++++++++++---- 1 file changed, 24 insertions(+), 4 deletions(-) diff --git a/pangolin/scripts/pangolearn.py b/pangolin/scripts/pangolearn.py index 54424f8..9202956 100644 --- a/pangolin/scripts/pangolearn.py +++ b/pangolin/scripts/pangolearn.py @@ -12,16 +12,17 @@ dataList = [] tempDataLines = [] +idList = [] def parse_args(): parser = argparse.ArgumentParser(description='pangoLEARN.') - parser.add_argument("--header-file", action="store", type=str, dest="header_file") parser.add_argument("--model-file", action="store", type=str, dest="model_file") parser.add_argument("--fasta", action="store", type=str, dest="sequences_file") parser.add_argument("-o","--outfile", action="store", type=str, dest="outfile") return parser.parse_args() + args = parse_args() sequencesFile = args.sequences_file @@ -71,6 +72,9 @@ def readInAndFormatData(): line = line.strip() if ">" in line: + # this is a fasta line designating an id, but we don't want to keep the > + idList.append(line[1:]) + # starting new entry, gotta save the old one if currentSeq: tempDataLines.append(currentSeq) @@ -151,9 +155,25 @@ def removeIndices(headersFile): print("generating predictions " + datetime.now().strftime("%m/%d/%Y, %H:%M:%S")); -predictions = loaded_model.predict(df) +predictions = loaded_model.predict_proba(df) # write predictions to a file f = open(args.outfile, "w") -f.write(predictions) -f.close() \ No newline at end of file +for index in range(len(predictions)): + + maxScore = 0 + maxIndex = -1 + + # get the max probability score and its assosciated index + for i in range(len(predictions[index])): + if predictions[index][i] > maxScore: + maxScore = predictions[index][i] + maxIndex = i + + score = maxScore + prediction = loaded_model.classes_[maxIndex] + seqId = idList[index] + + f.write(seqId + "," + prediction + "," + str(score) + "\n") + +f.close() From e84f9964ab3f473628c889dd7bf008874a9e5970 Mon Sep 17 00:00:00 2001 From: aineniamh Date: Sun, 19 Jul 2020 16:05:27 +0100 Subject: [PATCH 13/35] minimap2 added to env --- environment.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/environment.yml b/environment.yml index 49cd55a..72f90ee 100644 --- a/environment.yml +++ b/environment.yml @@ -7,6 +7,7 @@ dependencies: - biopython=1.74 - iqtree<2 - mafft + - minimap2 - pip=19.3.1 - python=3.6 - snakemake-minimal=5.13 From a912b7f3292505d5689eb90abb41d095c6bbedda Mon Sep 17 00:00:00 2001 From: aineniamh Date: Sun, 19 Jul 2020 16:05:59 +0100 Subject: [PATCH 14/35] premapping to parse paf to check if seq can be mapped and produce a sam file for datafunk --- pangolin/scripts/pangolearn.smk | 50 ++++++++++++++++++++++++++++++--- 1 file changed, 46 insertions(+), 4 deletions(-) diff --git a/pangolin/scripts/pangolearn.smk b/pangolin/scripts/pangolearn.smk index eaca46c..23b06a1 100644 --- a/pangolin/scripts/pangolearn.smk +++ b/pangolin/scripts/pangolearn.smk @@ -43,9 +43,44 @@ else: input: config["outfile"] -rule minimap2_to_reference: + +rule minimap2_check_distance: + input: + fasta = config["query_fasta"], + reference = config["reference_fasta"] + output: + paf = os.path.join(config["tempdir"],"reference_mapped.paf") + shell: + """ + minimap2 -x asm5 {input.reference:q} {input.fasta:q} > {output.paf:q} + """ + +rule parse_paf: input: + paf = rules.minimap2_check_distance.output.paf, fasta = config["query_fasta"], + output: + fasta = os.path.join(config["tempdir"], "mappable.fasta"), + mapfail = os.path.join(config["tempdir"],"mapfail.csv") + run: + mapped = [] + with open(input.paf, "r") as f: + for l in f: + tokens = l.rstrip("\n").split('\t') + mapped.append(tokens[0]) + unmapped = open(output.mapfail, "w") + with open(output.fasta, 'w') as fw: + for record in SeqIO.parse(input.fasta,"fasta"): + if record.id in mapped: + fw.write(f">{record.description}\n{record.seq}\n") + else: + unmapped.write(f"{record.id},failed to map\n") + + + +rule minimap2_to_reference: + input: + fasta = rules.parse_paf.output.fasta, reference = config["reference_fasta"] output: sam = os.path.join(config["tempdir"],"reference_mapped.sam") @@ -92,14 +127,15 @@ rule pangolearn: rule add_failed_seqs: input: qcpass= os.path.join(config["tempdir"],"lineage_report.pass_qc.csv"), - qcfail= config["qc_fail"] + qcfail= config["qc_fail"], + mapfail = rules.parse_paf.output.mapfail params: version = config["lineages_version"] output: config["outfile"] run: fw = open(output[0],"w") - fw.write("taxon,lineage,SH-alrt,UFbootstrap,lineages_version,status,note\n") + fw.write("taxon,lineage,support,lineages_version,status,note\n") with open(input.qcpass, "r") as f: for l in f: @@ -113,7 +149,13 @@ rule add_failed_seqs: if i.startswith("fail="): note = i.lstrip("fail=") # needs to mirror the structure of the output from pangolearn - fw.write(f"{record.id},None,0,0,{params.version},fail,{note}\n") + fw.write(f"{record.id},None,0,{params.version},fail,{note}\n") + + with open(input.mapfail,"r") as f: + for l in f: + l = l.rstrip("\n") + name,fail = l.split(",") + fw.write(f"{name},None,0,{param.version},fail,{fail}\n") fw.close() From f1adfb95205f95342a4dcfe8b93501d78913c89f Mon Sep 17 00:00:00 2001 From: aineniamh Date: Mon, 20 Jul 2020 10:07:07 +0100 Subject: [PATCH 15/35] make outdir if it doesn't exist --- pangolin/command.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/pangolin/command.py b/pangolin/command.py index a87d6f6..196e350 100644 --- a/pangolin/command.py +++ b/pangolin/command.py @@ -74,6 +74,12 @@ def main(sysargs = sys.argv[1:]): outdir = '' if args.outdir: outdir = os.path.join(cwd, args.outdir) + if not os.path.exists(outdir): + try: + os.mkdir(outdir) + except: + sys.stderr.write(f'Error: cannot create directory {outdir}') + sys.exit(-1) else: outdir = cwd From 90a68bae20390e8446bde84aca0c7891354cc5d6 Mon Sep 17 00:00:00 2001 From: aineniamh Date: Mon, 20 Jul 2020 10:07:30 +0100 Subject: [PATCH 16/35] typo in output joining rule --- pangolin/scripts/pangolearn.smk | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pangolin/scripts/pangolearn.smk b/pangolin/scripts/pangolearn.smk index 23b06a1..2280e74 100644 --- a/pangolin/scripts/pangolearn.smk +++ b/pangolin/scripts/pangolearn.smk @@ -155,7 +155,7 @@ rule add_failed_seqs: for l in f: l = l.rstrip("\n") name,fail = l.split(",") - fw.write(f"{name},None,0,{param.version},fail,{fail}\n") + fw.write(f"{name},None,0,{params.version},fail,{fail}\n") fw.close() From 43c2aaf63da24245ec4e49cda0b7eef272a50858 Mon Sep 17 00:00:00 2001 From: aineniamh Date: Mon, 20 Jul 2020 10:15:47 +0100 Subject: [PATCH 17/35] rounding support value --- pangolin/scripts/pangolearn.smk | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/pangolin/scripts/pangolearn.smk b/pangolin/scripts/pangolearn.smk index 2280e74..876322a 100644 --- a/pangolin/scripts/pangolearn.smk +++ b/pangolin/scripts/pangolearn.smk @@ -140,7 +140,9 @@ rule add_failed_seqs: with open(input.qcpass, "r") as f: for l in f: l=l.rstrip('\n') - fw.write(f"{l},{params.version},passed_qc,\n") + name,lineage,support = l.split(",") + support = round(float(support), 2) + fw.write(f"{name},{lineage},{support},{params.version},passed_qc,\n") for record in SeqIO.parse(input.qcfail,"fasta"): desc_list = record.description.split(" ") From 13df4c0b53f1d6e0db69a93cbd87423b3da9b3ee Mon Sep 17 00:00:00 2001 From: aineniamh Date: Mon, 20 Jul 2020 13:43:03 +0100 Subject: [PATCH 18/35] specific version of lineages repo and pangoLEARN in env --- environment.yml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/environment.yml b/environment.yml index 72f90ee..49695e7 100644 --- a/environment.yml +++ b/environment.yml @@ -16,4 +16,5 @@ dependencies: - pytools==2020.1 - dendropy>=4.4.0 - git+https://github.com/cov-ert/datafunk.git - - git+https://github.com/hCoV-2019/lineages.git + - git+https://github.com/cov-lineages/pangoLEARN.git + - git+https://github.com/cov-lineages/lineages.git@2020-05-19-2 From eb6955acb163eb471da7db9f041eeac4c84da7d6 Mon Sep 17 00:00:00 2001 From: aineniamh Date: Mon, 20 Jul 2020 13:43:55 +0100 Subject: [PATCH 19/35] legacy description and separate lineage and pangoLEARN data sources parsed --- pangolin/command.py | 115 ++++++++++++++++++++++++++------------------ 1 file changed, 67 insertions(+), 48 deletions(-) diff --git a/pangolin/command.py b/pangolin/command.py index 196e350..c3ab8db 100644 --- a/pangolin/command.py +++ b/pangolin/command.py @@ -9,7 +9,8 @@ import pprint import json import lineages -import setuptools +import pangoLEARN + import pkg_resources from Bio import SeqIO @@ -37,13 +38,14 @@ def main(sysargs = sys.argv[1:]): parser.add_argument('--max-ambig', action="store", default=0.5, type=float,help="Maximum proportion of Ns allowed for pangolin to attempt assignment. Default: 0.5",dest="maxambig") parser.add_argument('--min-length', action="store", default=10000, type=int,help="Minimum query length allowed for pangolin to attempt assignment. Default: 10000",dest="minlen") parser.add_argument('--panGUIlin', action='store_true',help="Run web-app version of pangolin",dest="panGUIlin") - parser.add_argument('--assign-using-tree',action='store_true',help="Use original phylogenetic assignment methods with guide tree. Note: will be significantly slower than pangoLEARN") + parser.add_argument('--assign-using-tree',action='store_true',help="LEGACY: Use original phylogenetic assignment methods with guide tree. Note, will be significantly slower than pangoLEARN") parser.add_argument('--write-tree', action='store_true',help="Output a phylogeny for each query sequence placed in the guide tree",dest="write_tree") parser.add_argument('-t', '--threads', action='store',type=int,help="Number of threads") parser.add_argument("-p","--include-putative",action="store_true",help="Include the bleeding edge lineage definitions in assignment",dest="include_putative") parser.add_argument("--verbose",action="store_true",help="Print lots of stuff to screen") parser.add_argument("-v","--version", action='version', version=f"pangolin {__version__}") parser.add_argument("-lv","--lineages-version", action='version', version=f"lineages {lineages.__version__}",help="show lineages's version number and exit") + parser.add_argument("-pv","--pangoLEARN-version", action='version', version=f"lineages {pangoLEARN.__version__}",help="show pangoLEARN's version number and exit") if len(sysargs)<1: parser.print_help() @@ -151,7 +153,8 @@ def main(sysargs = sys.argv[1:]): "trim_start":265, # where to pad to using datafunk "trim_end":29674, # where to pad after using datafunk "qc_fail":qc_fail, - "lineages_version":lineages.__version__ + "lineages_version":lineages.__version__, + "pangoLEARN_version":pangoLEARN.__version__ } if args.force: @@ -161,55 +164,71 @@ def main(sysargs = sys.argv[1:]): if args.data: data_dir = os.path.join(cwd, args.data) else: - lineages_dir = lineages.__path__[0] - data_dir = os.path.join(lineages_dir,"data") + if args.assign_using_tree: + lineages_dir = lineages.__path__[0] + data_dir = os.path.join(lineages_dir,"data") - reference_fasta = pkg_resources.resource_filename('pangolin', 'data/reference.fasta') - config["reference_fasta"] = reference_fasta + representative_aln = "" + guide_tree = "" + lineages_csv = "" - print(f"Looking in {data_dir} for data files...") - representative_aln = "" - guide_tree = "" - lineages_csv = "" - trained_model = "" - header_file = "" - for r,d,f in os.walk(data_dir): - for fn in f: - if fn == "multinomialLogRegHeaders_v1.joblib": - header_file = os.path.join(r, fn) - elif fn == "multinomialLogReg_v1.joblib": - trained_model = os.path.join(r, fn) - if args.include_putative: - if fn.endswith("putative.fasta"): - representative_aln = os.path.join(r, fn) - elif fn.endswith("putative.fasta.treefile"): - guide_tree = os.path.join(r, fn) - elif fn.endswith(".csv") and fn.startswith("lineages"): - lineages_csv = os.path.join(r, fn) - else: - if fn.endswith("safe.fasta"): - representative_aln = os.path.join(r, fn) - elif fn.endswith("safe.fasta.treefile"): - guide_tree = os.path.join(r, fn) - elif fn.endswith(".csv") and fn.startswith("lineages"): - lineages_csv = os.path.join(r, fn) - - print("\nData files found") - print(f"Sequence alignment:\t{representative_aln}") - print(f"Guide tree:\t\t{guide_tree}") - print(f"Lineages csv:\t\t{lineages_csv}") - if representative_aln=="" or guide_tree=="" or lineages_csv=="" or trained_model=="" or header_file=="": - print("""Check your environment, didn't find appropriate files from the lineages repo.\nTreefile must end with `.treefile`.\nAlignment must be in `.fasta` format.\n Trained model must exist. \ -If you've specified --include-putative \n + for r,d,f in os.walk(data_dir): + for fn in f: + if args.include_putative: + if fn.endswith("putative.fasta"): + representative_aln = os.path.join(r, fn) + elif fn.endswith("putative.fasta.treefile"): + guide_tree = os.path.join(r, fn) + elif fn.endswith(".csv") and fn.startswith("lineages"): + lineages_csv = os.path.join(r, fn) + else: + if fn.endswith("safe.fasta"): + representative_aln = os.path.join(r, fn) + elif fn.endswith("safe.fasta.treefile"): + guide_tree = os.path.join(r, fn) + elif fn.endswith(".csv") and fn.startswith("lineages"): + lineages_csv = os.path.join(r, fn) + + + if representative_aln=="" or guide_tree=="" or lineages_csv=="": + print("""Check your environment, didn't find appropriate files from the lineages repo.\nTreefile must end with `.treefile`.\ +\nAlignment must be in `.fasta` format.\n Trained model must exist. \ +If you've specified --include-putative\n \ you must have files ending in putative.fasta.treefile\nExiting.""") - exit(1) - else: - config["representative_aln"]=representative_aln - config["guide_tree"]=guide_tree - config["trained_model"] = trained_model - config["header_file"] = header_file + exit(1) + else: + print("\nData files found") + print(f"Sequence alignment:\t{representative_aln}") + print(f"Guide tree:\t\t{guide_tree}") + print(f"Lineages csv:\t\t{lineages_csv}") + config["representative_aln"]=representative_aln + config["guide_tree"]=guide_tree - # if no temp, just write everything to outdir + else: + pangoLEARN_dir = pangoLEARN.__path__[0] + data_dir = os.path.join(pangoLEARN_dir,"data") + print(f"Looking in {data_dir} for data files...") + trained_model = "" + header_file = "" + + for r,d,f in os.walk(data_dir): + for fn in f: + if fn == "multinomialLogRegHeaders_v1.joblib": + header_file = os.path.join(r, fn) + elif fn == "multinomialLogReg_v1.joblib": + trained_model = os.path.join(r, fn) + if trained_model=="" or header_file=="": + print("""Check your environment, didn't find appropriate files from the pangoLEARN repo.\n Trained model must be installed.""") + exit(1) + else: + print("\nData files found") + print(f"Trained model:\t{trained_model}") + print(f"Header file:\t{header_file}") + config["trained_model"] = trained_model + config["header_file"] = header_file + + reference_fasta = pkg_resources.resource_filename('pangolin', 'data/reference.fasta') + config["reference_fasta"] = reference_fasta if args.write_tree: config["write_tree"]="True" From 542f69972ac6d64ab0318c8e64e8d50732200261 Mon Sep 17 00:00:00 2001 From: aineniamh Date: Mon, 20 Jul 2020 13:44:25 +0100 Subject: [PATCH 20/35] writing pangoLEARN release version --- pangolin/scripts/pangolearn.smk | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/pangolin/scripts/pangolearn.smk b/pangolin/scripts/pangolearn.smk index 876322a..ea34e3d 100644 --- a/pangolin/scripts/pangolearn.smk +++ b/pangolin/scripts/pangolearn.smk @@ -29,8 +29,6 @@ if config.get("lineages_csv"): else: config["lineages_csv"]="" -config["pid"] = str(os.getpid()) - ##### Target rules ##### if config["lineages_csv"] != "": @@ -77,7 +75,6 @@ rule parse_paf: unmapped.write(f"{record.id},failed to map\n") - rule minimap2_to_reference: input: fasta = rules.parse_paf.output.fasta, @@ -130,7 +127,7 @@ rule add_failed_seqs: qcfail= config["qc_fail"], mapfail = rules.parse_paf.output.mapfail params: - version = config["lineages_version"] + version = config["pangoLEARN_version"] output: config["outfile"] run: From e273ac8a7f0dc650a54aeb33f54e7b1c755f787b Mon Sep 17 00:00:00 2001 From: aineniamh Date: Mon, 20 Jul 2020 13:45:01 +0100 Subject: [PATCH 21/35] writing pangoLEARN release version --- pangolin/scripts/pangolearn.smk | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pangolin/scripts/pangolearn.smk b/pangolin/scripts/pangolearn.smk index ea34e3d..e0dc355 100644 --- a/pangolin/scripts/pangolearn.smk +++ b/pangolin/scripts/pangolearn.smk @@ -132,7 +132,7 @@ rule add_failed_seqs: config["outfile"] run: fw = open(output[0],"w") - fw.write("taxon,lineage,support,lineages_version,status,note\n") + fw.write("taxon,lineage,support,pangoLEARN_version,status,note\n") with open(input.qcpass, "r") as f: for l in f: From e7a6c8077f2d0bb4e2021aa0f91c8cf647c94c80 Mon Sep 17 00:00:00 2001 From: aineniamh Date: Mon, 20 Jul 2020 13:45:23 +0100 Subject: [PATCH 22/35] adding emily to the authors list of pangolin --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 69b1097..2638fb3 100644 --- a/setup.py +++ b/setup.py @@ -40,7 +40,7 @@ ], description='hcov-2019 subtyping command line tool', url='https://github.com/cov-lineages/pangolin', - author='Aine OToole and JT McCrone', + author='Aine OToole, JT McCrone & Emily Scher', author_email='aine.otoole@ed.ac.uk', entry_points=""" [console_scripts] From 741feaaab9e6cad6a98dbd7029bf27f792ac2bf8 Mon Sep 17 00:00:00 2001 From: aineniamh Date: Mon, 20 Jul 2020 13:57:15 +0100 Subject: [PATCH 23/35] updating version number --- pangolin/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pangolin/__init__.py b/pangolin/__init__.py index 63f8c34..3ce9cf2 100644 --- a/pangolin/__init__.py +++ b/pangolin/__init__.py @@ -1,2 +1,2 @@ _program = "pangolin" -__version__ = "1.1.14" \ No newline at end of file +__version__ = "2.0" \ No newline at end of file From 5e990bc106a0d94bb33e5ce5bcd916192ad77b46 Mon Sep 17 00:00:00 2001 From: aineniamh Date: Mon, 20 Jul 2020 13:57:39 +0100 Subject: [PATCH 24/35] looking for lineages csv both times --- pangolin/command.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/pangolin/command.py b/pangolin/command.py index c3ab8db..8b2ce40 100644 --- a/pangolin/command.py +++ b/pangolin/command.py @@ -45,7 +45,7 @@ def main(sysargs = sys.argv[1:]): parser.add_argument("--verbose",action="store_true",help="Print lots of stuff to screen") parser.add_argument("-v","--version", action='version', version=f"pangolin {__version__}") parser.add_argument("-lv","--lineages-version", action='version', version=f"lineages {lineages.__version__}",help="show lineages's version number and exit") - parser.add_argument("-pv","--pangoLEARN-version", action='version', version=f"lineages {pangoLEARN.__version__}",help="show pangoLEARN's version number and exit") + parser.add_argument("-pv","--pangoLEARN-version", action='version', version=f"pangoLEARN {pangoLEARN.__version__}",help="show pangoLEARN's version number and exit") if len(sysargs)<1: parser.print_help() @@ -210,6 +210,7 @@ def main(sysargs = sys.argv[1:]): print(f"Looking in {data_dir} for data files...") trained_model = "" header_file = "" + lineages_csv = "" for r,d,f in os.walk(data_dir): for fn in f: @@ -217,13 +218,16 @@ def main(sysargs = sys.argv[1:]): header_file = os.path.join(r, fn) elif fn == "multinomialLogReg_v1.joblib": trained_model = os.path.join(r, fn) - if trained_model=="" or header_file=="": + elif fn.endswith(".csv") and fn.startswith("lineages"): + lineages_csv = os.path.join(r, fn) + if trained_model=="" or header_file=="" or lineages_csv=="": print("""Check your environment, didn't find appropriate files from the pangoLEARN repo.\n Trained model must be installed.""") exit(1) else: print("\nData files found") print(f"Trained model:\t{trained_model}") print(f"Header file:\t{header_file}") + print(f"Lineages csv:\t{lineages_csv}") config["trained_model"] = trained_model config["header_file"] = header_file From 813893421a4976e32e7392ef64cd7d26de566016 Mon Sep 17 00:00:00 2001 From: aineniamh Date: Mon, 20 Jul 2020 14:08:19 +0100 Subject: [PATCH 25/35] removing whitespace from fn to prevent warning msgs --- pangolin/command.py | 2 +- pangolin/scripts/pangolearn.smk | 11 +++-------- 2 files changed, 4 insertions(+), 9 deletions(-) diff --git a/pangolin/command.py b/pangolin/command.py index 8b2ce40..57b240d 100644 --- a/pangolin/command.py +++ b/pangolin/command.py @@ -227,7 +227,7 @@ def main(sysargs = sys.argv[1:]): print("\nData files found") print(f"Trained model:\t{trained_model}") print(f"Header file:\t{header_file}") - print(f"Lineages csv:\t{lineages_csv}") + print(f"Lineages csv:\t'{lineages_csv}'") config["trained_model"] = trained_model config["header_file"] = header_file diff --git a/pangolin/scripts/pangolearn.smk b/pangolin/scripts/pangolearn.smk index e0dc355..ac4c9fd 100644 --- a/pangolin/scripts/pangolearn.smk +++ b/pangolin/scripts/pangolearn.smk @@ -1,6 +1,5 @@ #!/usr/bin/env python -# config["trained_model"] = trained_model import csv from Bio import SeqIO import os @@ -14,8 +13,9 @@ if config.get("header_file"): config["header_file"] = os.path.join(workflow.current_basedir,'..', config["header_file"]) if config.get("lineages_csv"): + print("Going to run the global report summary") lineages_csv_path = os.path.join(workflow.current_basedir,'..', config["lineages_csv"]) - config["lineages_csv"]=f"lineages_csv={lineages_csv_path} " + config["lineages_csv"]=f"lineages_csv='{lineages_csv_path}'" else: config["lineages_csv"]="" @@ -24,11 +24,6 @@ if config.get("force"): else: config["force"] = "" -if config.get("lineages_csv"): - print("Going to run the global report summary") -else: - config["lineages_csv"]="" - ##### Target rules ##### if config["lineages_csv"] != "": @@ -167,7 +162,7 @@ rule report_results: shell: """ report_results.py \ - -p {input.csv} \ + -p {input.csv:q} \ -b {input.lineages_csv} \ -o {output:q} """ From d2c68d291a8f0d89838eb7dc00746b3562d68bd6 Mon Sep 17 00:00:00 2001 From: aineniamh Date: Mon, 20 Jul 2020 14:15:36 +0100 Subject: [PATCH 26/35] outputting minimap2 to log rather than to screen --- pangolin/command.py | 7 ++----- pangolin/scripts/pangolearn.smk | 14 ++++++-------- 2 files changed, 8 insertions(+), 13 deletions(-) diff --git a/pangolin/command.py b/pangolin/command.py index 57b240d..3bbc16a 100644 --- a/pangolin/command.py +++ b/pangolin/command.py @@ -32,7 +32,6 @@ def main(sysargs = sys.argv[1:]): parser.add_argument('--outfile', action="store",help="Optional output file name. Default: lineage_report.csv") parser.add_argument('-d', '--data', action='store',help="Data directory minimally containing a fasta alignment and guide tree") parser.add_argument('-n', '--dry-run', action='store_true',help="Go through the motions but don't actually run") - parser.add_argument('-f', '--force', action='store_true',help="Overwrite all output",dest="force") parser.add_argument('--tempdir',action="store",help="Specify where you want the temp stuff to go. Default: $TMPDIR") parser.add_argument("--no-temp",action="store_true",help="Output all intermediate files, for dev purposes.") parser.add_argument('--max-ambig', action="store", default=0.5, type=float,help="Maximum proportion of Ns allowed for pangolin to attempt assignment. Default: 0.5",dest="maxambig") @@ -157,8 +156,6 @@ def main(sysargs = sys.argv[1:]): "pangoLEARN_version":pangoLEARN.__version__ } - if args.force: - config["force"]="forceall" # find the data data_dir = "" if args.data: @@ -227,7 +224,7 @@ def main(sysargs = sys.argv[1:]): print("\nData files found") print(f"Trained model:\t{trained_model}") print(f"Header file:\t{header_file}") - print(f"Lineages csv:\t'{lineages_csv}'") + print(f"Lineages csv:\t{lineages_csv}") config["trained_model"] = trained_model config["header_file"] = header_file @@ -247,7 +244,7 @@ def main(sysargs = sys.argv[1:]): # run subtyping status = snakemake.snakemake(snakefile, printshellcmds=True, - dryrun=args.dry_run, forceall=args.force,force_incomplete=True, + dryrun=args.dry_run, forceall=True,force_incomplete=True, config=config, cores=threads,lock=False,quiet=quiet_mode,workdir=tempdir ) diff --git a/pangolin/scripts/pangolearn.smk b/pangolin/scripts/pangolearn.smk index ac4c9fd..7afe649 100644 --- a/pangolin/scripts/pangolearn.smk +++ b/pangolin/scripts/pangolearn.smk @@ -13,17 +13,11 @@ if config.get("header_file"): config["header_file"] = os.path.join(workflow.current_basedir,'..', config["header_file"]) if config.get("lineages_csv"): - print("Going to run the global report summary") lineages_csv_path = os.path.join(workflow.current_basedir,'..', config["lineages_csv"]) config["lineages_csv"]=f"lineages_csv='{lineages_csv_path}'" else: config["lineages_csv"]="" -if config.get("force"): - config["force"] = "--forceall " -else: - config["force"] = "" - ##### Target rules ##### if config["lineages_csv"] != "": @@ -43,9 +37,11 @@ rule minimap2_check_distance: reference = config["reference_fasta"] output: paf = os.path.join(config["tempdir"],"reference_mapped.paf") + log: + os.path.join(config["tempdir"], "logs/minimap2_check.log") shell: """ - minimap2 -x asm5 {input.reference:q} {input.fasta:q} > {output.paf:q} + minimap2 -x asm5 {input.reference:q} {input.fasta:q} -o {output.paf:q} &> {log} """ rule parse_paf: @@ -76,9 +72,11 @@ rule minimap2_to_reference: reference = config["reference_fasta"] output: sam = os.path.join(config["tempdir"],"reference_mapped.sam") + log: + os.path.join(config["tempdir"], "logs/minimap2_sam.log") shell: """ - minimap2 -a -x asm5 {input.reference:q} {input.fasta:q} > {output.sam:q} + minimap2 -a -x asm5 {input.reference:q} {input.fasta:q} -o {output.sam:q} &> {log} """ rule datafunk_trim_and_pad: From 11295b417aa1a2ca382a6239e665b00af1f77202 Mon Sep 17 00:00:00 2001 From: aineniamh Date: Mon, 20 Jul 2020 14:17:43 +0100 Subject: [PATCH 27/35] updating cli --- pangolin/command.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pangolin/command.py b/pangolin/command.py index 3bbc16a..a5a6e30 100644 --- a/pangolin/command.py +++ b/pangolin/command.py @@ -38,7 +38,7 @@ def main(sysargs = sys.argv[1:]): parser.add_argument('--min-length', action="store", default=10000, type=int,help="Minimum query length allowed for pangolin to attempt assignment. Default: 10000",dest="minlen") parser.add_argument('--panGUIlin', action='store_true',help="Run web-app version of pangolin",dest="panGUIlin") parser.add_argument('--assign-using-tree',action='store_true',help="LEGACY: Use original phylogenetic assignment methods with guide tree. Note, will be significantly slower than pangoLEARN") - parser.add_argument('--write-tree', action='store_true',help="Output a phylogeny for each query sequence placed in the guide tree",dest="write_tree") + parser.add_argument('--write-tree', action='store_true',help="Output a phylogeny for each query sequence placed in the guide tree. Only works in combination with legacy `--assign-using-tree`",dest="write_tree") parser.add_argument('-t', '--threads', action='store',type=int,help="Number of threads") parser.add_argument("-p","--include-putative",action="store_true",help="Include the bleeding edge lineage definitions in assignment",dest="include_putative") parser.add_argument("--verbose",action="store_true",help="Print lots of stuff to screen") From e0d7984fc343dac817e086e1952eb2bda4bf0ba3 Mon Sep 17 00:00:00 2001 From: aineniamh Date: Mon, 20 Jul 2020 15:19:55 +0100 Subject: [PATCH 28/35] half updated readme --- README.md | 108 ++++++++++++++++++++++++++++++++++++------------------ 1 file changed, 73 insertions(+), 35 deletions(-) diff --git a/README.md b/README.md index 7062197..4edac6e 100644 --- a/README.md +++ b/README.md @@ -2,7 +2,7 @@ **P**hylogenetic **A**ssignment of **N**amed **G**lobal **O**utbreak **LIN**eages - + >pangolin now comes in web-application form thanks to the [Centre for Genomic Pathogen and Surveillance](https://www.pathogensurveillance.net/)! Find it here at https://pangolin.cog-uk.io/. @@ -19,7 +19,7 @@ * [Recall rate](#recall-rate) * [Source data](#source-data) * [Authors](#authors) - * [Citing ``pangolin``](#citing-pangolin) + * [Citing pangolin](#citing-pangolin) * [References](#references) @@ -35,10 +35,12 @@ Pangolin runs on MacOS and Linux. The conda environment recipe may not build on 1. Clone this repository and ``cd pangolin`` 2. ``conda env create -f environment.yml`` 3. ``conda activate pangolin`` -4. ``python setup.py install`` or ``pip install .`` +4. ``python setup.py install`` 5. That's it -> Note: we recommend using pangolin in the conda environment specified in the ``environment.yml`` file as per the instructions above. If you can't use conda for some reason, bear in mind the data files are now hosted in a separate repository at [``hCoV-2019/lineages``](https://github.com/hCoV-2019/lineages.git) and you will need to pip install that alongside the other dependencies for pangolin (details found in [``environment.yml``](https://github.com/hCoV-2019/pangolin/blob/master/environment.yml)). +> Troubleshooting install see the [pangolin wiki](https://github.com/cov-lineages/pangolin/wiki) + +> Note: we recommend using pangolin in the conda environment specified in the ``environment.yml`` file as per the instructions above. If you can't use conda for some reason, bear in mind the data files are now hosted in a separate repository at [``cov-lineages/lineages``](https://github.com/cov-lineages/lineages.git) and you will need to pip install that alongside the other dependencies for pangolin (details found in [``environment.yml``](https://github.com/cov-lineages/pangolin/blob/master/environment.yml)). ### Check the install worked @@ -46,13 +48,14 @@ Type (in the pangolin environment): ``` pangolin -v +pangolin -pv pangolin -lv ``` -and you should see the versions of pangolin and lineages data release printed respectively. +and you should see the versions of pangolin , pangoLEARN and lineages data release printed respectively. ### Updating pangolin -> Note: Even if you have previously installed ``pangolin``, as it is being worked on intensively, we recommend you check for updates before running. +> Note: Even if you have previously installed pangolin, as it is being worked on intensively, we recommend you check for updates before running. To update: @@ -60,12 +63,13 @@ To update: 2. ``git pull`` \ pulls the latest changes from github 3. ``python setup.py install`` \ -re-installs pangolin -4. ``pip install git+https://github.com/hCoV-2019/lineages.git --upgrade`` \ -updates if there is a new data release -5. ``conda env update -f environment.yml`` \ +re-installs pangolin. +4. ``conda env update -f environment.yml`` \ updates the conda environment (you're unlikely to need to do this, but just in case!) - +5. ``pip install git+https://github.com/cov-lineages/pangoLEARN.git --upgrade`` \ +updates if there is a new data release +6. ``pip install git+https://github.com/cov-lineages/lineages.git --upgrade`` \ +updates if there is a new data release, this is the legacy data repo and is unlikely to have tagged releases in the future ### Usage @@ -76,26 +80,42 @@ updates the conda environment (you're unlikely to need to do this, but just in c pangolin: Phylogenetic Assignment of Named Global Outbreak LINeages positional arguments: - query + query Query fasta file of sequences to analyse. optional arguments: - -h, --help show this help message and exit - -o OUTDIR, --outdir OUTDIR Output directory - -d DATA, --data DATA Data directory minimally containing a fasta alignment - and guide tree - -n, --dry-run Go through the motions but don't actually run - -f, --force Overwrite all output - --tempdir TEMPDIR Specify where you want the temp stuff to go. Default: - $TMPDIR - --panGUIlin Run web-app version of pangolin - --max-ambig MAXAMBIG Maximum proportion of Ns allowed for pangolin to - attempt assignment. Default: 0.5 - --min-length MINLEN Minimum query length allowed for pangolin to attempt - assignment. Default: 10000 - -t THREADS, --threads THREADS - Number of threads - -v, --version show program's version number and exit - -lv, --lineages-version show lineages's version number and exit + -h, --help show this help message and exit + -o OUTDIR, + --outdir OUTDIR + Output directory. Default: current working directory + --outfile OUTFILE Optional output file name. Default: lineage_report.csv + -d DATA, --data DATA Data directory minimally containing a fasta alignment + and guide tree + -n, --dry-run Go through the motions but don't actually run + --tempdir TEMPDIR Specify where you want the temp stuff to go. Default: + $TMPDIR + --no-temp Output all intermediate files, for dev purposes. + --max-ambig MAXAMBIG Maximum proportion of Ns allowed for pangolin to + attempt assignment. Default: 0.5 + --min-length MINLEN Minimum query length allowed for pangolin to attempt + assignment. Default: 10000 + --panGUIlin Run web-app version of pangolin + --assign-using-tree LEGACY: Use original phylogenetic assignment methods + with guide tree. Note, will be significantly slower + than pangoLEARN + --write-tree Output a phylogeny for each query sequence placed in + the guide tree. Only works in combination with legacy + `--assign-using-tree` + -t THREADS, + --threads THREADS Number of threads + -p, --include-putative + Include the bleeding edge lineage definitions in + assignment + --verbose Print lots of stuff to screen + -v, --version show program's version number and exit + -lv, --lineages-version + show lineages's version number and exit + -pv, --pangoLEARN-version + show pangoLEARN's version number and exit ``` ### Output @@ -104,6 +124,18 @@ Your output will be a csv file with taxon name and lineage assigned, one line co Example: +| Taxon | Lineage | support | pangoLEARN_version |status | note | +| ----------- |:---------:|:----------:| :----------:|:----------:| :----------:| +| Virus1 | B.1 | 80 | 2020-04-27 | passed_qc | | +| Virus2 | A.1 | 65 | 2020-04-27 | passed_qc | | +| Virus3 | A.3 | 100 | 2020-04-27 | passed_qc | | +| Virus4 | B.1.4 | 82 | 2020-04-27 | passed_qc | | +| Virus5 | None | 0 | 2020-04-27 | fail | N_content:0.80 | +| Virus6 | None | 0 | 2020-04-27 | fail | seq_len:0 | +| Virus7 | None | 0 | 2020-04-27 | fail | failed to map | + +Legacy phylogenetics output example: + | Taxon | Lineage | aLRT | UFbootstrap | lineages_version |status | note | | ----------- |:---------:|:----------:|:----------:| :----------:|:----------:| :----------:| | Virus1 | B.1 | 80 | 82 | 2020-04-27 | passed_qc | | @@ -118,22 +150,24 @@ Resources for interpreting the aLRT and UFbootstrap output can be found [here](h ### Recall rate Of 9,843 GISAID sequences assigned lineages by hand (taking sequence, phylogeny and metadata into account), pangolin accurately assigns the lineage of 97.85% of those sequences. Of the sequences that were not recalled correctly, 74.5% had 0 bootstrap and 0 alrt. We're continuing to work to improve this recall rate, but recommend interpreting the pangolin output cautiously with due attention to the UFbootstrap and aLRT values. -Given hCoV-2019 is relatively slow evolving for an RNA virus and there is still not a huge amount of diversity, missing or ambiguous data at key residues may lead to incorrect placement within the guide tree. We have a filter in place that by default with not call a lineage for any sequence with >50% N-content, but this can be made more conservative with the command line option `--max-ambig`. +Given cov-lineages is relatively slow evolving for an RNA virus and there is still not a huge amount of diversity, missing or ambiguous data at key residues may lead to incorrect placement within the guide tree. We have a filter in place that by default with not call a lineage for any sequence with >50% N-content, but this can be made more conservative with the command line option `--max-ambig`. ### Source data -``pangolin`` runs using a guide tree and alignment hosted at [``hCoV-2019/lineages``](https://github.com/hCoV-2019/lineages.git). Some of this data is sourced from GISAID, but anonymised and encrypted to fit with guidelines. Appropriate permissions have been given and acknowledgements for the teams that have worked to provide the original SARS-CoV-2 genome sequences to GISAID are also hosted in [``hCoV-2019/lineages``](https://raw.githubusercontent.com/hCoV-2019/lineages/master/gisaid_acknowledgements.tsv). +pangolin runs a model trained against lineage assignments based on GISAID data. + +Legacy pangolin runs using a guide tree and alignment hosted at [``cov-lineages/lineages``](https://github.com/cov-lineages/lineages.git). Some of this data is sourced from GISAID, but anonymised and encrypted to fit with guidelines. Appropriate permissions have been given and acknowledgements for the teams that have worked to provide the original SARS-CoV-2 genome sequences to GISAID are also hosted in [``cov-lineages.org``](https://cov-lineages.org/gisaid_acknowledgements.html). ### Authors -Pangolin was created by [Áine O'Toole](https://aineotoole.co.uk/) and [JT McCrone](https://jtmccr1.github.io/). -It uses lineages from [Rambaut et al.](https://www.biorxiv.org/content/10.1101/2020.04.17.046086v1). +Pangolin was created by [Áine O'Toole](https://aineotoole.co.uk/), [JT McCrone](https://jtmccr1.github.io/) and [Emily Scher](http://www.scherscherscher.com/). +It uses lineages from [Rambaut et al. 2020](https://www.nature.com/articles/s41564-020-0770-5). -### Citing ``pangolin`` +### Citing pangolin -There is a publication in prep for ``pangolin``, but in the meantime please to link to this github [github.com/hCoV-2019/pangolin](github.com/hCoV-2019/pangolin) if you have used ``pangolin`` in your research. +There is a publication in prep for pangolin, but in the meantime please to link to this github [github.com/cov-lineages/pangolin](github.com/cov-lineages/pangolin) if you have used pangolin in your research. ### References @@ -153,6 +187,10 @@ Katoh, Standley 2013 (Molecular Biology and Evolution 30:772-780) MAFFT multiple sequence alignment software version 7: improvements in performance and usability. (outlines version 7) +[minimap2](https://github.com/lh3/minimap2) + +Heng Li, Minimap2: pairwise alignment for nucleotide sequences, Bioinformatics, Volume 34, Issue 18, 15 September 2018, Pages 3094–3100, https://doi.org/10.1093/bioinformatics/bty191 + [snakemake](https://snakemake.readthedocs.io/en/stable/index.html) Köster, Johannes and Rahmann, Sven. “Snakemake - A scalable bioinformatics workflow engine”. Bioinformatics 2012. From 3b46fe11df91516a449cd11df8ef8bb791bb948f Mon Sep 17 00:00:00 2001 From: aineniamh Date: Mon, 20 Jul 2020 17:03:47 +0100 Subject: [PATCH 29/35] update readme --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 4edac6e..51edc6d 100644 --- a/README.md +++ b/README.md @@ -154,7 +154,7 @@ Given cov-lineages is relatively slow evolving for an RNA virus and there is sti ### Source data -pangolin runs a model trained against lineage assignments based on GISAID data. +pangolin runs a multinomial logistic regression model trained against lineage assignments based on GISAID data. Legacy pangolin runs using a guide tree and alignment hosted at [``cov-lineages/lineages``](https://github.com/cov-lineages/lineages.git). Some of this data is sourced from GISAID, but anonymised and encrypted to fit with guidelines. Appropriate permissions have been given and acknowledgements for the teams that have worked to provide the original SARS-CoV-2 genome sequences to GISAID are also hosted in [``cov-lineages.org``](https://cov-lineages.org/gisaid_acknowledgements.html). From c970e6276d0bd550be7c167b32ff05aab4470601 Mon Sep 17 00:00:00 2001 From: aineniamh Date: Mon, 20 Jul 2020 17:04:14 +0100 Subject: [PATCH 30/35] now compatible with --panGUIlin --- pangolin/command.py | 1 + pangolin/scripts/pangolearn.smk | 13 ++++++------- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/pangolin/command.py b/pangolin/command.py index a5a6e30..e3f5850 100644 --- a/pangolin/command.py +++ b/pangolin/command.py @@ -236,6 +236,7 @@ def main(sysargs = sys.argv[1:]): if args.panGUIlin: config["lineages_csv"]=lineages_csv + if args.verbose: quiet_mode = False diff --git a/pangolin/scripts/pangolearn.smk b/pangolin/scripts/pangolearn.smk index 7afe649..14ecae3 100644 --- a/pangolin/scripts/pangolearn.smk +++ b/pangolin/scripts/pangolearn.smk @@ -12,13 +12,13 @@ if config.get("trained_model"): if config.get("header_file"): config["header_file"] = os.path.join(workflow.current_basedir,'..', config["header_file"]) +##### Target rules ##### + if config.get("lineages_csv"): - lineages_csv_path = os.path.join(workflow.current_basedir,'..', config["lineages_csv"]) - config["lineages_csv"]=f"lineages_csv='{lineages_csv_path}'" + print("Going to run the global report summary") else: config["lineages_csv"]="" -##### Target rules ##### if config["lineages_csv"] != "": rule all: @@ -30,7 +30,6 @@ else: input: config["outfile"] - rule minimap2_check_distance: input: fasta = config["query_fasta"], @@ -122,10 +121,10 @@ rule add_failed_seqs: params: version = config["pangoLEARN_version"] output: - config["outfile"] + csv = config["outfile"] run: fw = open(output[0],"w") - fw.write("taxon,lineage,support,pangoLEARN_version,status,note\n") + fw.write("taxon,lineage,probability,pangoLEARN_version,status,note\n") with open(input.qcpass, "r") as f: for l in f: @@ -160,7 +159,7 @@ rule report_results: shell: """ report_results.py \ - -p {input.csv:q} \ + -p {input.csv} \ -b {input.lineages_csv} \ -o {output:q} """ From c571ebb69e4b1674845677f341137f11210dfc2e Mon Sep 17 00:00:00 2001 From: aineniamh Date: Mon, 20 Jul 2020 18:02:11 +0100 Subject: [PATCH 31/35] changing to --legacy --- pangolin/command.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/pangolin/command.py b/pangolin/command.py index e3f5850..062af17 100644 --- a/pangolin/command.py +++ b/pangolin/command.py @@ -37,7 +37,7 @@ def main(sysargs = sys.argv[1:]): parser.add_argument('--max-ambig', action="store", default=0.5, type=float,help="Maximum proportion of Ns allowed for pangolin to attempt assignment. Default: 0.5",dest="maxambig") parser.add_argument('--min-length', action="store", default=10000, type=int,help="Minimum query length allowed for pangolin to attempt assignment. Default: 10000",dest="minlen") parser.add_argument('--panGUIlin', action='store_true',help="Run web-app version of pangolin",dest="panGUIlin") - parser.add_argument('--assign-using-tree',action='store_true',help="LEGACY: Use original phylogenetic assignment methods with guide tree. Note, will be significantly slower than pangoLEARN") + parser.add_argument('--legacy',action='store_true',help="LEGACY: Use original phylogenetic assignment methods with guide tree. Note, will be significantly slower than pangoLEARN") parser.add_argument('--write-tree', action='store_true',help="Output a phylogeny for each query sequence placed in the guide tree. Only works in combination with legacy `--assign-using-tree`",dest="write_tree") parser.add_argument('-t', '--threads', action='store',type=int,help="Number of threads") parser.add_argument("-p","--include-putative",action="store_true",help="Include the bleeding edge lineage definitions in assignment",dest="include_putative") @@ -52,7 +52,7 @@ def main(sysargs = sys.argv[1:]): else: args = parser.parse_args(sysargs) - if args.assign_using_tree: + if args.legacy: snakefile = os.path.join(thisdir, 'scripts','Snakefile') # find the Snakefile else: @@ -161,7 +161,7 @@ def main(sysargs = sys.argv[1:]): if args.data: data_dir = os.path.join(cwd, args.data) else: - if args.assign_using_tree: + if args.legacy: lineages_dir = lineages.__path__[0] data_dir = os.path.join(lineages_dir,"data") @@ -236,7 +236,7 @@ def main(sysargs = sys.argv[1:]): if args.panGUIlin: config["lineages_csv"]=lineages_csv - + if args.verbose: quiet_mode = False From d25f91330b1def4bc88994386682363d9644b394 Mon Sep 17 00:00:00 2001 From: aineniamh Date: Tue, 21 Jul 2020 19:23:27 +0100 Subject: [PATCH 32/35] updating readme --- README.md | 37 +++++++++++++++++++++++++++++++++---- 1 file changed, 33 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index 51edc6d..65810a0 100644 --- a/README.md +++ b/README.md @@ -4,6 +4,7 @@ +>pangolin 2.0 comes with major updates, including a significant speedup and assignment based on machine learning (affectionately described as pangoLEARN). >pangolin now comes in web-application form thanks to the [Centre for Genomic Pathogen and Surveillance](https://www.pathogensurveillance.net/)! Find it here at https://pangolin.cog-uk.io/. @@ -40,18 +41,22 @@ Pangolin runs on MacOS and Linux. The conda environment recipe may not build on > Troubleshooting install see the [pangolin wiki](https://github.com/cov-lineages/pangolin/wiki) -> Note: we recommend using pangolin in the conda environment specified in the ``environment.yml`` file as per the instructions above. If you can't use conda for some reason, bear in mind the data files are now hosted in a separate repository at [``cov-lineages/lineages``](https://github.com/cov-lineages/lineages.git) and you will need to pip install that alongside the other dependencies for pangolin (details found in [``environment.yml``](https://github.com/cov-lineages/pangolin/blob/master/environment.yml)). +> Note: we recommend using pangolin in the conda environment specified in the ``environment.yml`` file as per the instructions above. If you can't use conda for some reason, bear in mind the data files are hosted in two separate repositories at +- [cov-lineages/lineages](https://github.com/cov-lineages/lineages.git) +- [cov-lineages/pangoLEARN](https://github.com/cov-lineages/pangoLEARN.git)
+you will need to pip install them alongside the other dependencies for pangolin (details found in [environment.yml](https://github.com/cov-lineages/pangolin/blob/master/environment.yml)). ### Check the install worked -Type (in the pangolin environment): +Type (in the pangolin environment): ``` pangolin -v pangolin -pv pangolin -lv ``` -and you should see the versions of pangolin , pangoLEARN and lineages data release printed respectively. +and you should see the versions of pangolin, and pangoLEARN and lineages data release printed respectively. + ### Updating pangolin @@ -147,16 +152,40 @@ Legacy phylogenetics output example: Resources for interpreting the aLRT and UFbootstrap output can be found [here](http://www.iqtree.org/doc/Tutorial#assessing-branch-supports-with-single-branch-tests) and [here](http://www.iqtree.org/doc/Command-Reference). + +### pangoLEARN description +

pangoLEARN is an alternative algorithm for lineage assignment, which uses machine learning, that is implemented as of pangolin 2.0. Benefits of the new algorithm include a major speed up, as the phylogenetic approach was struggling to scale with the increase in number of lineages needing to be represented in the guide tree, and that this new approach takes into account all of the diversity present within a lineage rather than just selecting a representative few. The consequences of this approach mean that for large lineages, we have improved our recall and precision significantly and we are continuing to develop more sophisticated approaches to machine learning for lineage assignment.

+ +

The current version of pangoLEARN uses multinomial logistic regression, but the pipeline has been written so that as more complex models are developed,the user will be able to choose which model to use to assign their lineages.

+ +

To explain the model we're currently using, while a standard regression fits a line to a set of training data to model a linear relationship between variables of interest, a logistic regressions fits a sigmoid (S-shaped) function to the training data, in order to tell two different classes apart. A multinomial logistic regression is an extension of a standard logistic regression in that it can be used to classify more than two classes. Each potential assignment (i.e. lineage) is modeled as a set of n-1 independent binary choices (sigmoid functions), where n is the number of classes.

+ +The model was trained using 30,000 SARS-CoV-2 sequences from GISAID, acknowledgements [here](./gisaid_acknowledgements.html), with their lineages by manually curating the global ML tree, as is the standard lineages data release procedure for pangolin. Each base of each genome was [one-hot](https://www.hackernoon.com/what-is-one-hot-encoding-why-and-when-do-you-have-to-use-it-e3c6186d008f) encoded. This left us with a large number of parameters to train, which is why training this model takes approximately 14 hours on our systems (may change with different hardware). +This model was built using the standard [sci-kit learn implementation](https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html) of multinomial logistic regression. The code for this process is available in the [cov-lineages/cov-support](https://github.com/cov-lineages/cov-support/blob/master/cov_support/scripts/pangoLEARNMultinomialLogReg_v1.py) repository. + +

Multinomial logistic regression is an extremely commonly used model as it is able to simply and intuitively assign probabilities to class assignments. However, it does not incorporate any hierarchical structure. We are currently developing new models that do incorporate hierarchical structure. However, given the limitations of this simple model, it has performed surprisingly well with this data. While more complex models may offer improvements in assignment accuracies for smaller lineages, the logistic regression has the advantages of being intuitive, easy to implement, and relatively fast to train.

+ ### Recall rate +Pre-pangolin 2.0: Of 9,843 GISAID sequences assigned lineages by hand (taking sequence, phylogeny and metadata into account), pangolin accurately assigns the lineage of 97.85% of those sequences. Of the sequences that were not recalled correctly, 74.5% had 0 bootstrap and 0 alrt. We're continuing to work to improve this recall rate, but recommend interpreting the pangolin output cautiously with due attention to the UFbootstrap and aLRT values. Given cov-lineages is relatively slow evolving for an RNA virus and there is still not a huge amount of diversity, missing or ambiguous data at key residues may lead to incorrect placement within the guide tree. We have a filter in place that by default with not call a lineage for any sequence with >50% N-content, but this can be made more conservative with the command line option `--max-ambig`. +pangolin 2.0 onwards: + +

Recall and supporting statistics were generated using the same procedure as above to train a model using 75% of the data, while 25% of the data was used as testing data. Smaller lineages may have lower recall rates due to the very small sample sizes in the test set.

+ + +### SNPs associated with a given lineage + +The model trains coefficients for each input parameter, for each potential lineage assignment. A particularly large coefficient in a particular lineage’s sigmoid function indicates a stronger association between that location and that lineage. A particularly negative coefficient in a particular lineage’s sigmoid function indicates the opposite. In other words, we can pick up SNPs that are strongly associated with or strongly negatively associated with a given lineage. This information is hosted for download from the [pangoLEARN](https://github.com/cov-lineages/pangoLEARN) data repository. + + ### Source data pangolin runs a multinomial logistic regression model trained against lineage assignments based on GISAID data. -Legacy pangolin runs using a guide tree and alignment hosted at [``cov-lineages/lineages``](https://github.com/cov-lineages/lineages.git). Some of this data is sourced from GISAID, but anonymised and encrypted to fit with guidelines. Appropriate permissions have been given and acknowledgements for the teams that have worked to provide the original SARS-CoV-2 genome sequences to GISAID are also hosted in [``cov-lineages.org``](https://cov-lineages.org/gisaid_acknowledgements.html). +Legacy pangolin runs using a guide tree and alignment hosted at [cov-lineages/lineages](https://github.com/cov-lineages/lineages.git). Some of this data is sourced from GISAID, but anonymised and encrypted to fit with guidelines. Appropriate permissions have been given and acknowledgements for the teams that have worked to provide the original SARS-CoV-2 genome sequences to GISAID are also hosted [here](https://cov-lineages.org/gisaid_acknowledgements.html). ### Authors From a8cb5ee3fb37461a5b3c26861602234096a141ba Mon Sep 17 00:00:00 2001 From: aineniamh Date: Tue, 21 Jul 2020 19:26:15 +0100 Subject: [PATCH 33/35] updating links in readme --- README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/README.md b/README.md index 65810a0..2a2f5b4 100644 --- a/README.md +++ b/README.md @@ -18,6 +18,7 @@ * [Usage](#usage) * [Output](#output) * [Recall rate](#recall-rate) + * [SNPs associated with a given lineage](#snps-associated-with-a-given-lineage) * [Source data](#source-data) * [Authors](#authors) * [Citing pangolin](#citing-pangolin) From 336b1ebc772e08afe7a8399514edb61fdec7d029 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=81ine=20O=27Toole?= Date: Tue, 21 Jul 2020 19:27:18 +0100 Subject: [PATCH 34/35] Update README.md --- README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/README.md b/README.md index 2a2f5b4..d8937ee 100644 --- a/README.md +++ b/README.md @@ -17,6 +17,7 @@ * [Updating pangolin](#updating-pangolin) * [Usage](#usage) * [Output](#output) + * [pangoLEARN description](#pangolearn-description) * [Recall rate](#recall-rate) * [SNPs associated with a given lineage](#snps-associated-with-a-given-lineage) * [Source data](#source-data) From 944857875ad1f7c9ddd6df54b75fe5cca24442ab Mon Sep 17 00:00:00 2001 From: aineniamh Date: Wed, 22 Jul 2020 12:02:21 +0100 Subject: [PATCH 35/35] informative output if data not installed correctly --- pangolin/command.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pangolin/command.py b/pangolin/command.py index 062af17..fa4e0c8 100644 --- a/pangolin/command.py +++ b/pangolin/command.py @@ -188,7 +188,7 @@ def main(sysargs = sys.argv[1:]): if representative_aln=="" or guide_tree=="" or lineages_csv=="": - print("""Check your environment, didn't find appropriate files from the lineages repo.\nTreefile must end with `.treefile`.\ + print("""Check your environment, didn't find appropriate files from the lineages repo, please see https://cov-lineages.org/pangolin.html for installation instructions. \nTreefile must end with `.treefile`.\ \nAlignment must be in `.fasta` format.\n Trained model must exist. \ If you've specified --include-putative\n \ you must have files ending in putative.fasta.treefile\nExiting.""") @@ -218,7 +218,7 @@ def main(sysargs = sys.argv[1:]): elif fn.endswith(".csv") and fn.startswith("lineages"): lineages_csv = os.path.join(r, fn) if trained_model=="" or header_file=="" or lineages_csv=="": - print("""Check your environment, didn't find appropriate files from the pangoLEARN repo.\n Trained model must be installed.""") + print("""Check your environment, didn't find appropriate files from the pangoLEARN repo.\n Trained model must be installed, please see https://cov-lineages.org/pangolin.html for installation instructions.""") exit(1) else: print("\nData files found")