diff --git a/docs/modes.rst b/docs/modes.rst
index 23d4c2ae..4e37f499 100644
--- a/docs/modes.rst
+++ b/docs/modes.rst
@@ -25,7 +25,7 @@ Build mode
~~~~~~~~~~
.. argparse::
- :module: immunopepper.get_parsers
+ :module: immunopepper.ip
:func: get_build_parser
:prog: immunopepper build
:passparser:
@@ -45,7 +45,7 @@ Build mode
Samplespecif mode
~~~~~~~~~~~~~~~~~
.. argparse::
- :module: immunopepper.get_parsers
+ :module: immunopepper.ip
:func: get_samplespecif_parser
:prog: immunopepper samplespecif
:passparser:
@@ -55,7 +55,7 @@ Samplespecif mode
Cancerspecif mode
~~~~~~~~~~~~~~~~~
.. argparse::
- :module: immunopepper.get_parsers
+ :module: immunopepper.ip
:func: get_cancerspecif_parser
:prog: immunopepper cancerspecif
:passparser:
@@ -143,7 +143,7 @@ Mhcbind mode
~~~~~~~~~~~~~~
.. argparse::
- :module: immunopepper.get_parsers
+ :module: immunopepper.ip
:func: get_mhcbind_parser
:prog: immunopepper mhcbind
:passparser:
@@ -155,7 +155,7 @@ Pepquery mode
~~~~~~~~~~~~~~
.. argparse::
- :module: immunopepper.get_parsers
+ :module: immunopepper.ip
:func: get_pepquery_parser
:prog: immunopepper pepquery
:passparser:
diff --git a/immunopepper/get_parsers.py b/immunopepper/get_parsers.py
deleted file mode 100644
index 08973b96..00000000
--- a/immunopepper/get_parsers.py
+++ /dev/null
@@ -1,299 +0,0 @@
-
-# Python libraries
-import argparse
-import logging
-import os
-import sys
-
-from datetime import datetime
-
-from immunopepper.mode_build import mode_build
-from immunopepper.mode_samplespecif import mode_samplespecif
-from immunopepper.mode_cancerspecif import mode_cancerspecif
-from immunopepper.mode_pepQuery import mode_pepquery
-
-def _add_general_args(parser):
- general = parser.add_argument_group('GENERAL')
- general.add_argument("--verbose", type=int, help="specify output verbosity (0 - warn, 1 - info, 2 - debug) [1]", required=False, default=1)
-
-def get_subparsers(parser):
-
- """
- Creates the different subparsers and returns them to fill them with arguments.
-
- Parameters
- ----------
- parser : argparse.ArgumentParser. Main parser of immunopepper
-
- """
-
- subparsers = parser.add_subparsers(help='Running modes', metavar='{build, samplespecif, cancerspecif, mhcbind, pepquery}')
-
- #I will create the different subparsers
- parser_build = subparsers.add_parser('build', help='Core part of ImmunoPepper. Traverses the input splice graph and generates all possible peptides/kmers.')
- parser_samplespecif = subparsers.add_parser('samplespecif', help='Performs removal of the annotation to make the kmer list sample specific')
- parser_cancerspecif = subparsers.add_parser('cancerspecif', help='Performs differential filtering against a panel of normal samples')
- parser_mhcbind = subparsers.add_parser('mhcbind', help='Perform MHC binding prediction with a wrapper for MHCtools')
- parser_pepquery = subparsers.add_parser("pepquery", help="Perform peptide validation with a wrapper for the tool PepQuery")
- return parser_build, parser_samplespecif, parser_cancerspecif, parser_mhcbind, parser_pepquery
-
-def get_build_parser(parser):
-
- """
- Function that creates the subparser for the build mode with its arguments
- """
-
- required = parser.add_argument_group('Mandatory arguments')
- required.add_argument("--output-dir", help="absolute path of the output directory. All output files generated by immunopepper will be saved in this directory.", required=True, default='output')
- required.add_argument("--ann-path", help="absolute path for the annotation file. Accepted file formats: *.gtf*, *.gff* and *.gff3*. Annotation files can be downloaded from various databases such as `GENCODE `_ ", required=True)
- required.add_argument("--splice-path", help="absolute path of the input `SplAdder `_ splice graph.", required=True)
- required.add_argument("--ref-path", help="absolute path of the reference genome file in FASTA format. Reference Please ensure that the reference genome is compatible with the gene annotation file being used. For example, if the annotation file is based on GRCh38, the reference genome should also be based on GRCh38. You can check `here `_ gencode annotation releases and their corresponding major genome assembly releases. For example, if you decide to use genome assembly version GRCh38.p13, you need to use its compatible annotation file from release 43 in GENCODE.", required=True)
- required.add_argument("--kmer", type=int, help="length of the kmers for kmer output.", required=True, default=9)
-
- submodes = parser.add_argument_group('Submodes parameters', 'Commands for conceptual information about the processing.')
- submodes.add_argument("--libsize-extract",help="Set this parameter to True to generate library sizes and gene quantifications and skip neontigen generation. **Note:** If set to True, the program will only output files 3 and 7 of the :ref:`build output section `.",action="store_true", required=False, default=False)
- submodes.add_argument("--all-read-frames", help="Set this parameter to True to switch to exhaustive translation and study all possible reading frames instead of just the annotated ones in the annotation file.", action="store_true", required=False, default=False)
- submodes.add_argument("--count-path", help="Absolute path for the second output of `SplAdder `_ containing the graph expression quantification. If provided, expression quantification of genes will take place. **Format:** hdf5.", required=False, default=None)
- submodes.add_argument("--output-samples", nargs='+', help="List of sample names to output. **Note:** *Names should match the file name of the splice graphs. If not provided all samples are processed and program runs faster.*", required=False, default=[])
- submodes.add_argument("--heter-code", type=int, help="It specifies the heterozygous allele.", default=0, choices = ['0', '2']) #TODO: Add more info about this parameter?
-
- parameters = parser.add_argument_group('Technical parameters' , 'Commands for optimization of the software.')
- parameters.add_argument("--compressed", help="Compress output files", action="store_true", default=True)
- parameters.add_argument("--parallel", type=int, help="Number of cores to be used.", required=False, default=1)
- parameters.add_argument("--batch-size", type=int, help="Number of genes to be processed in each batch. If bigger batches are selected, the program will be faster, but it will require more memory.", default=1000)
- parameters.add_argument("--pickle-samples", nargs='+', help="List of samples to be pickled. Needed if `--use-mut-pickle` is set to True. It will create intermediate files containing mutation information of the selected samples. This command is useful because mutation/variant information needs to be parsed in every run of the software, which is a time-consuming operation. By pickling the mutations, when mutation information is needed, it will be directly loaded from the intermediate pickled files instead than from the original mutation files provided under `--somatic` and `--germline`. This will speed up software re-runs. When dealing with large cohorts, this command is useful to select exactly what files should be pickled. If not provided, all the samples passed in `--mutation-sample` will be pickled. Names should match the sample names of the graph/counts files but an equivalence can be set using `--sample-name-map` from mutation parameters.", required=False, default=[])
-
- subset = parser.add_argument_group('Subset parameters', 'Commands to select a subset of the genes to be processed.')
- subset.add_argument("--process-chr", nargs='+',help="List of chromosomes to be processed. If not provided all chromosomes are processed. The chromosomes names should be provided in the same format as in FASTA and annotation files. For annotations downloaded from GENCODE, this format is **chrX**, X being the chromosome number.", required=False, default=None)
- subset.add_argument("--complexity-cap", type=int, help="Maximum edge complexity of the graph to be processed. If not provided all graphs are processed.", required=False, default=None)
- subset.add_argument("--genes-interest", help="Genes to be processed. **Format:** Input is a csv file containing a gencode gene id per line, with no header. **Technical note:** The gencode gene id must match the gencode gene ids of the splice graph. Therefore, the format for this argument must match the format of the *gene_id* field of the annotation file used in the build mode of SplAdder and passed under `--ann-path` in immunopepper. If not provided all genes are processed.", required=False, default=None)
- subset.add_argument("--start-id", type=int, help="Id of the first gene in the splice graph to be processed. If not provided all genes are processed.", required=False, default=0)
- subset.add_argument("--process-num", metavar='N', type=int, help="Number of genes to be processed. If provided, the first process-num genes are processed.", required=False, default=0)
-
- outputs = parser.add_argument_group('Output parameters', 'Commands to select output formatting and filtering.')
- outputs.add_argument("--skip-annotation", help='Set this parameter to True to skip the generation of a background kmer and peptide files.', action="store_true", default=False)
- outputs.add_argument("--keep-tmpfiles", help='If set to True, we will keep the intermediate directories and temporal files generated in paralell mode', action="store_true", default=False)
- outputs.add_argument("--output-fasta", help="Set this parameter to True to output the foreground peptides in fasta format. If set to True output number 4 from the :ref:`build output section ` will be generated. If set to False only the kmers will be output.", action="store_true", required=False, default=False)
- outputs.add_argument("--force-ref-peptides", help="Set this parameter to True to output mutated peptides even if they are the same as the ones in the reference. The reference in this case are the peptides without any mutations or variants application.", action="store_true", default=False)
- outputs.add_argument("--filter-redundant", help="If set to true, a redundancy filter will be applied to the exon list. If two or more exons span the same juction, their coordinates will be combined so that the longest spanning combination is kept.", action="store_true", required=False, default=False)
- outputs.add_argument("--kmer-database", help="Absolute path of a file containing kmers in one column, without header. If the kmers contained in this database contain the aminoacid isoleucine (I), it will be converted into leucine (L). A file from uniprot or any other standard library can be provided. The kmers provided in this file will not be output if found in the foreground peptides. Please note that is a standard proteome is downloaded from an online resource the proteins should be cut into the kmers length selected under `--kmer`.", required=False, default=None)
- outputs.add_argument("--gtex-junction-path", help="Absolute path of whitelist junction path. The junctions of this file will be the only ones output from the tool. \n **Format:** hdf5 file with 'chrm', 'pos' and 'strand' as keys. *'Chrm'* contains the chromosome name in the same format as in the annotation. *'pos'* contains coordinates. The coordinates are end_e1 and start_e2. This means that end_e1 > start_e2 if strand is '-' and end_e1 < start_e2 if strand '+'. *strand* will be either '-' or '+'. \n \n **Pipeline relevance**: When a whitelist file is provided, the field 'isJunctionList' in the :ref:`metadata output file ` will contain a 1 if the junction is contained in this list, and 0 otherwise.", required=False, default=None)
- parameters.add_argument("--disable-concat", help="Disable the generation of kmers from combinations of more than 2 exons. In this mode, any kmer shorter than `--kmer` will be discarded. By setting this command to False, the generation of kmers from 3 exons is allowed. This might ensure that kmers generated from shorter exons are kept, but one should take into account that kmers translated from 3 exons might have lower confidence. By setting the argument to True the generation of kmers is faster. " , action="store_true", default=False)
- parameters.add_argument("--disable-process-libsize", help="Set to True to generate the libsize file (file 7 from the :ref:`build output section `).", action="store_true", default=False)
-
-
- mutation = parser.add_argument_group('Mutation parameters', 'Commands to add somatic and germline mutation information.')
- mutation.add_argument("--mutation-sample", help="Sample id of the files to which mutations are added. The ids should match the graphs/counts names, but an equivalence can be set with --sample-name-map.", required=False, default=None)
- mutation.add_argument("--germline", help="Absolute path of the germline mutation file. **Format:** VCF, MAF or h5.", required=False, default='')
- mutation.add_argument("--somatic", help="Absolute path of the somatic mutation file. **Format:** VCF, MAF or h5.", required=False, default='')
- mutation.add_argument("--sample-name-map", help="Name mapping to sample names from graphs/counts files. **Format:** No header. Two columns: *[name of count/graphs file \t name of mutation/pickle file]*. Three columns: *[name of count/graphs file \t name of germline file \t name of somatic file]*.", required=False, default=None)
- mutation.add_argument("--use-mut-pickle", help="Set to True to save and use pickled mutation dictionary. This command is useful because mutation/variant information needs to be parsed in every run of the software, which is a time-consuming operation. By pickling the mutations, when mutation information is needed, it will be directly loaded from the intermediate pickled files instead than from the original mutation files provided under `--somatic` and `--germline`. This will speed up software re-runs.", action="store_true", default=False)
-
- _add_general_args(parser)
-
- return parser
-
-def get_samplespecif_parser(parser):
- required = parser.add_argument_group('Mandatory arguments')
- required.add_argument("--annot-kmer-files", nargs='+', help="List of absolute paths to the annotation kmer files. The files should have the name **\[mut_mode\]_annot_kmer.gz** (Files 2 from the :ref:`build output section `)", required=True, default='')
- required.add_argument("--output-dir", help=' Path to the output directory.', required=True)
- required.add_argument("--junction-kmer-files", nargs='+', help="Absolute path to the folder containing the foreground kmers. The possible folders one can use for this mode are: **\[mut_mode\]_graph_kmer_JuncExpr** or **\[mut_mode\]_graph_kmer_SegmExpr** (Files 5 and 6 from the :ref:`build output section `)",required=True, default='')
- required.add_argument("--bg-file-path", help="Absolute path to the intermediate pooled annotation file. This file is the set of unique kmers in `--annot-kmer-files` files. If the file is not provided it will be generated. In order to be generated one needs to provide the folder and the file name where the file will be saved. **Note:** It should be a non existent folder. Format: One column with header 'kmer'.", required=True, default='')
- required.add_argument("--output-suffix", help="Suffix to be appended to the filtered `--junction-kmer-files`, e.g. samplespecif", required=True, default='no-annot')
-
- optional = parser.add_argument_group('Optional argument')
- optional.add_argument("--remove-bg", help="Set to True to remove from `--junction-kmer-files` the kmers in `--bg-file-path` . If set to False, a new column `is_neo_flag` will be added to the `--junction-kmer-files` files. The column will contain False if the kmer is in `--bg-file-path` and True otherwise.", action="store_true", required=False, default=False)
-
- _add_general_args(parser)
- return parser
-
-def get_cancerspecif_parser(parser):
- mandatory = parser.add_argument_group('Mandatory arguments', 'These arguments belong to three main groups: \n\n**Technical parameters:** Due to the heavy amount of data, this mode uses spark. These are parameters to control spark processing \n\n**Input helper parameters:** These parameters are used for parsing the input files by the software \n\n**General output files:** Parameters for the files that are output by the software regardless of the filtering method.')
-
- mandatory.add_argument("--cores", type=int, help="Technical parameter. Number of cores to use", required=True, default='')
- mandatory.add_argument("--mem-per-core", type=int, help="Technical parameter. Memory required per core", required=True)
- mandatory.add_argument("--parallelism", type=int, help="Technical parameter. Parallelism parameter for Spark Java Virtual Machine (JVM). It is the default number of partitions in RDDs returned by certain transformations. Check `--spark.default.parallelism` `here `_ for more information.", required=True, default='3')
-
- mandatory.add_argument("--kmer", help='Input helper parameter. Kmer length', required=True)
-
- mandatory.add_argument("--output-dir", help="General output file. Absolute path to the output directory to save the filtered data.", required=True, default='')
-
- technical = parser.add_argument_group('Optional technical parameters', 'Due to the heavy amount of data, this mode uses spark. These are parameters to control spark processing')
- technical.add_argument("--out-partitions", type=int, help="This argument is used to select the number of partitions in which the final output file will be saved. If not provided, the results are saved in a single file", required=False, default=None)
- technical.add_argument("--scratch-dir", help="Os environment variable name containing the cluster scratch directory path. If specified, all the intermediate files will be saved to this directory. If not specified, the intermediate files will be saved to the output directory, specified under `--output-dir`. If the scratch directory is provided, `--interm-dir-norm` and `--interm-dir-cancer` are ignored.", required=False, default='')
- technical.add_argument("--interm-dir-norm", help="Custom scratch directory path to save the intermediate files for the normal samples. If not specified, the intermediate files will be saved to the output directory, specified under `--output-dir`.", required=False, default='')
- technical.add_argument("--interm-dir-canc", help="Custom scratch directory path to save the intermediate files for the cancer samples. If not specified, the intermediate files will be saved to the output directory, specified under `--output-dir`.", required=False, default='')
-
- input_help = parser.add_argument_group('Optional input helper parameters' , 'These parameters are used for parsing the input files by the software.')
- input_help.add_argument("--ids-cancer-samples", nargs='+', help="List with the cancer sample IDs of interest. The samples should be contained in the input cancer expression matrix, and follow the same format. ", required=False, default='')
- input_help.add_argument("--mut-cancer-samples", nargs='+', help="List of mutation modes corresponding to each cancer sample. They will be used to tag the output files. The list should have the same number of entries as `--ids-cancer-samples`.", choices=['ref', 'somatic', 'germline', 'somatic_and_germline'], required=False, default='')
-
- inputs = parser.add_argument_group('Optional general input files' , 'Parameters for the input files that are used regardless of the filtering method.')
- inputs.add_argument("--whitelist-normal", help="File containing the whitelist of normal samples. If provided, only the samples in the whitelist will be retrieved and further studied. **Format:** Tab separated file without a header, with a single column containing sample names.", required=False, default=None)
- inputs.add_argument("--whitelist-cancer", help="File containing the whitelist of cancer samples. If provided, only the samples in the whitelist will be retrieved and further studied. **Format:** Tab separated file without a header, with a single column containing sample names.", required=False, default=None)
- inputs.add_argument("--path-cancer-libsize", help="Path for the libsize file of the selected cancer samples. It corresponds to the output 7 of :ref:`build output section `", required=False, default=None)
- inputs.add_argument("--path-normal-libsize", help="Path for the libsize of the selected normal samples. It corresponds to the output 7 of :ref:`build output section `", required=False, default=None)
- inputs.add_argument("--normalizer-cancer-libsize", type=float, help="**Default =** median of the libsize across all input samples. Custom normalization factor for the cancer libsize. Normalization is used to make all the samples comparable and correct for possible biases in data acquisition. The normalization is done according to the following formula: (read count / 75th quantile in sample) * normalizer value.", required=False, default=None)
- inputs.add_argument("--normalizer-normal-libsize", type=float, help="**Default =** median of the libsize across all input samples. Custom normalization factor for the normal libsize. Normalization is used to make all the samples comparable and correct for possible biases in data acquisition. The normalization is done according to the following formula: (read count / 75th quantile in sample) * normalizer value.", required=False, default=None)
-
- outputs = parser.add_argument_group('Optional general output files', 'Parameters for the files that are output by the software regardless of the filtering method.')
-
- outputs.add_argument("--output-count", help="Path and name where the intermediate numbers of kmers remaining after each filtering step will be written. If selected, a file will be written containing the number of kmers present after each filtering step. It might slow down the computations. However, it is useful if there is an interest on intermediate filtering steps. The output can be seen :ref:`here ` in the output section.", required=False, default='')
- outputs.add_argument("--tag-normals", help="Name for the normal cohort used for filtering. Needed when there are various normal cohorts. It will be added to the final output name in order to identify the normal cohort against which the cancer samples were filtered.", required=False, default='')
- outputs.add_argument("--tag-prefix", help="Prefix used for the output files. It is recommended when there are different conditions being studied.", required=False, default='')
-
- nrf = parser.add_argument_group('Optional parameters for normal samples filtering', 'Parameters to perform the step 2 of the normal filtering pipeline.')
- nrf.add_argument("--path-normal-matrix-segm", nargs='+', help="Path to the matrix of segment expression of kmers in normal samples. This corresponds to output 5 of :ref:`build output section `", required=False, default=None)
- nrf.add_argument("--path-normal-matrix-edge", nargs='+', help="Path to the matrix of junction expression of kmers in normal samples. This corresponds to output 6 of :ref:`build output section `", required=False, default=None)
- nrf.add_argument("--n-samples-lim-normal", type=int, help="This is the value for threshold b) in step 2 of normal filtering pipeline. This number will set the number of samples in which we need to see any expression of a specific kmer, i.e. expression > 0, to consider it a normal kmer and exclude it as cancer candidate.", required=False, default=None)
- nrf.add_argument("--cohort-expr-support-normal", type=float, help="This is the value for threshold a) in step 2 of normal filtering pipeline. This number corresponds to the normalized expression we need to see in at least one sample in order to consider a kmer as a normal kmer. If a kmer is found with an expression level higher or equal than this threshold in one or more normal samples it will be considered a normal kmer and excluded as cancer candidate.", required=False, default=None)
-
- crf = parser.add_argument_group('Optional parameters for cancer samples filtering', 'Parameters to perform the step 2 of the cancer filtering pipeline.')
- crf.add_argument("--sample-expr-support-cancer", type=float, help="This parameter corresponds to sample specific filtering in the cancer pipeline. The value will correspond to the normalized expression threshold that needs to be reached by each kmer in order to be considered a kmer candidate. Kmers with an expression higher or equal than this threshold in the cancer sample of interest will be considered as cancer candidates. Cancer samples of interest are provided under --ids_cancer_samples.", default = None, required = False)
- crf.add_argument("--cohort-expr-support-cancer", type=float, help="This parameter corresponds to the expression threshold in cohort specific filtering. It indicates the normalized expression value that needs to be observed in at least `--n-samples-lim-cancer` in order to consider a kmer as a cancer candidate. Kmers with an expression higher than this threshold in at least `--n-samples-lim-cancer` samples will be considered as cancer candidates. For each cancer sample of interest, provided under `--ids_cancer_samples`, the expression threshold --cohort-expr-support-cancer will be assessed in the rest of the cohort, excluding the sample of interest.", required=False, default=None)
- crf.add_argument("--n-samples-lim-cancer", type=int, help="This parameter corresponds to the number of samples threshold in cohort specific filtering. It indicated the minimum number of cancer samples in which one should see an expression higher than `--cohort-expr-support-cancer` in order to consider the kmer as a cancer candidate. Kmers with an expression higher than `--cohort-expr-support-cancer` in at least `--n-samples-lim-cancer` samples will be considered as cancer candidates. For each cancer sample of interest, provided under `--ids_cancer_samples`, the expression threshold `--cohort-expr-support-cancer` will be assessed in the rest of the cohort, excluding the sample of interest.", required=False, default=None)
- crf.add_argument("--path-cancer-matrix-segm", nargs='+', help="Path to the cancer matrix containing segment expression from samples belonging to a cohort. The matrix will have the following dimensions: [kmers * samples]. When only the junction overlapping cancer kmers are of interest, the user should provide only `--path-cancer-matrix-edge`, and skip the inclusion of this file.If both matrices are provided, junction expression will be chosen in case there is expression information for the same kmer in both matrices. This will be the output 5 of :ref:`build output section `", required=False, default=None)
- crf.add_argument("--path-cancer-matrix-edge", nargs='+', help="Path to the cancer matrix containing junction expression from samples belonging to a cohort. The matrix will have the following dimensions: [kmers * samples]. When only the junction overlapping cancer kmers are of interest, the user should provide only this file, and skip the inclusion of `--path-cancer-matrix-segm`. If both matrices are provided, junction expression will be chosen in case there is expression information for the same kmer in both matrices. This will be the ouput 6 of :ref:`build output section `.", required=False, default=None)
- crf.add_argument("--cancer-support-union", help="Parameter to choose how the sample specific filtering and the cohort specific filtering are combined. By default, they are combined by choosing the common kmers to both filtering steps, i.e. performing an intersection. If this parameter is set to True, the union of both filtering steps will be performed, i.e. the kmers that pass either the sample specific filtering or the cohort specific filtering will be kept.", action="store_true", required=False, default=False)
-
- more_backgrounds = parser.add_argument_group('Optional parameters for the addition of additional backgrounds' , 'Parameters to add additional backgrounds that will be removed.')
- more_backgrounds.add_argument("--path-normal-kmer-list", nargs='+', help="List of kmers to be added as part of the normal samples. The kmers provided in this list will be included in the normal background, without having to pass any of the filtering steps for the normal data. Format: It can be either a *tsv* or *parquet* file. If *parquet* is the used format, the file should contain the header 'kmer' in the first column.", required=False, default=None)
- more_backgrounds.add_argument("--uniprot", help="Path to file containing uniprot kmers. The kmers contained in this databased will be assumed to be not novel peptides, and they will be removed from the cancer filtering output. *Note: It is important to kmerize the peptides downloaded from uniprot database into the length specified in `--kmer`*.", required=False, default=None)
-
- more_filters = parser.add_argument_group('Optional parameters to add additional filters' , 'Parameters to add the additional filters shown in step 1 of both pipelines. ')
- more_filters.add_argument("--filterNeojuncCoord", choices=['C', 'N', 'A'], required=False, default='', help=" This argument will filter the neojunctions, and it will retain only the kmers that are generated from neojunctions. These are peptides whose junction coordinates were not part of the annotation files. Selecting this option means that only those kmers with junctionAnnotated = False will be considered for further filtering. This filter is used in the preprocessing (step 1 of cancer and normal pipelines). If 'C' is selected, the filter is only applied to the cancer sample. If 'N' is selected, the filter is only applied to the normal samples. If 'A' is selected, the filter is applied to both cancer and normal samples.")
- more_filters.add_argument("--filterAnnotatedRF", choices=['C', 'N', 'A'], required=False, default='', help="This argument will only retrieve kmers that were generated from annotated reading frames and discard those that were obtained by reading frame propagation through the graph. By selecting this option, only kmers generated from reading frames present in the annotation are kept. This means that selecting this option means that only those kmers with ReadFrameAnnotated = True will be selected. This filter is used in the preprocessing (step 1 of cancer and normal pipelines). If 'C' is selected, the filter is only applied to the cancer sample. If 'N' is selected, the filter is only applied to the normal samples. If 'A' is selected, the filter is applied to both cancer and normal samples.")
-
- development = parser.add_argument_group('Optional development parameters')
- development.add_argument("--tot-batches", type=int, help="If selected, the filtering of the background and foreground will be based on hash functions. This parameter will set the total number of batches in which we will divide the foreground and background files to filter, and each of those batches will be assigned a hash value. If `--batch-id` is specified, `--tot-batches` should also be specified.", required=False, default=None)
- development.add_argument("--batch-id", type=int, help="If selected, the filtering of the background and foreground will be based on hash functions. This parameter will set the batch id of the current batch that is being filtered. The batch id should be an integer between 0 and `--tot-batches`. It shows the specific batch that we want to process, out of the `--tot-batches`. If `--batch-id` is specified, `--tot-batches` should also be specified.", required=False, default=None)
- development.add_argument("--on-the-fly", help="If set to true, all the filtering steps will be done on the fly, without the creation of intermediate files. The creation of intermediate files would speed up the computations in the case where full or partial reruns are planned. An example of a partial rerun would be when seveal normal and cancer filtering parameters are applied to the same cohort. Choosing not to save intermediate files will trade speed for disk space.", action="store_true", default=False)
-
- _add_general_args(parser)
-
- return parser
-
-def get_mhcbind_parser(parser):
- required = parser.add_argument_group('Mandatory arguments')
-
- required.add_argument("--mhc-software-path", help="Path for the MHC prediction software.", required=True, default=None)
- required.add_argument("--argstring", help="Complete command line for the MHC prediction tool passed as a string. One should include here the command that will be directly passed to the selected MHC tool. The four mandatory arguments are: \n 1.--mhc-predictor: This argument will specify the name of the software tool that will be used. The name should be in the format accepted by the library `mhc_tools `_ \n \n 2.--output-csv: This argument will contain the path and filename where the MHC prediction tool will save the results. The format of the file should be *.csv*.\n \n 3.--input-peptides-file: This argument will have the path and filename to the file containing the set of kmers on which MHC binding affinity prediction will be performed. The format of the file should be *.csv*.\n \n 4. --mhc-alleles or --mhc-alleles-file: This argument should contain the alleles that will be used for the analysis of mhc binding affinity.\n \n If `--partitioned-tsv` files are provided, an intermediate file will be created and stored under the path `--input-peptides-file` . This intermediate file will contain the set of all unique kmers present in the partitioned files obtained from `cancerspecif` mode. If one does not want to use the output of `cancerspecif` mode for prediction, the path to the file that will be used for prediction will be directly provided under `--input-peptides-list`.", required=True, default='')
- required.add_argument("--output-dir", help="General output file. Absolute path to the output directory to save the MHC predictions. It should match the directory provided in --output-csv of the argstring, but without the file name.", required=True, default='')
-
- optional = parser.add_argument_group('Optional argument')
- optional.add_argument("--partitioned-tsv", help="The input to this command is the path to the folder containing the partitioned tsv files from `cancerspecif` mode. This corresponds to the files 1 and 2 found in the :ref:`output section `. If this parameter is set the tool will directly accept the files from cancerspecif mode as input.", required=False, default=None)
- optional.add_argument("--bind-score-method", help="Scoring method to filter the MHC tools predictions. E.g. score, affinity, percentile_rank (this last one is only for netmhcpan).", required=False, default=None)
- optional.add_argument("--bind-score-threshold", type=float, help="Threshold to filter the MHC tools predictions. All the peptides with a score lower than the threshold will be filtered out and only the ones with a score higher than the threshold will be kept.", required=False, default=None)
- optional.add_argument("--less-than", help="If set to True the `--bind-score-threshold` will be considered as an upper bound instead of a lower bound. This means that peptides with a score higher than this threshold will be filtered out.", action="store_true", required=False, default=False)
- _add_general_args(parser)
- return parser
-
-def get_pepquery_parser(parser):
- required = parser.add_argument_group('Mandatory arguments')
- required.add_argument("--output-dir", help="Absolute path to the output directory to save the results of the validation.", required=True, default=None)
- required.add_argument("--argstring", help="Complete command line for the pepQuery MS based validation software passed as a string. The command provided here will be directly passed to the pepQuery software. It must contain several mandatory arguments: \n 1. -o: Output directory. The results from pepQuery will be saved in this folder. \n 2. -i: Input peptides for the validation. The user can either provide their own peptides or take the expanded peptides from the kmers provided in --partitioned-tsv. If --partitioned-tsv is provided, the expanded peptides will be saved under the path and filename provided in this argument, and they will be taken as the input of pepQuery.\n 3. -db: Path to the reference database used in the analysis. This reference database will be used to identify those MS/MS spectra that match better a reference peptide than the query peptides. \n 4. -ms: Path to the MS/MS spectra file.", required=True, default=None)
- required.add_argument("--pepquery-software-path", help="Path for the pepQuery software. It must include the path to the directory with the software and the software name itself. eg. pepquery/pepquery.jar", required=True, default=None)
-
- optional = parser.add_argument_group('Optional argument')
- optional.add_argument("--partitioned-tsv", help="The input to this command is the path to the folder containing the partitioned tsv files from `cancerspecif` mode. This corresponds to the files 1 and 2 found in the :ref:`output section `. If this parameter is set the tool will directly accept the files from cancerspecif mode as input. An intermediate file will be saved in the output directory containing the kmers in a bigger peptide context. This argument accepts files generated from junctions and from segments.", required=False, default=None)
- optional.add_argument("--metadata-path", help="Absolute path to the metadata file created in build mode. This file is required whether the user chooses to work with junctions of with kmers.", required=False, default=None)
- optional.add_argument("--kmer-type", help="Type of the kmers introduced under --partitioned-tsv. This will show whether the user chooses to work with junctions or with segments. The peptide retrieval strategy will vary depending on the input type.", choices=['junctions', 'segments'], required=False, default=None)
- _add_general_args(parser)
- return parser
-def parse_arguments(argv):
- parser = argparse.ArgumentParser(prog='immunopepper')
- parser_build, parser_samplespecif, parser_cancerspecif, parser_mhcbind, parser_pepquery = get_subparsers(parser)
-
- #Now I will fill the different parsers with the arguments. I will create a function for each parser, and that will be the function displayed in the documentation part.
-
- parser_build = get_build_parser(parser_build)
- parser_samplespecif = get_samplespecif_parser(parser_samplespecif)
- parser_cancerspecif = get_cancerspecif_parser(parser_cancerspecif)
- parser_mhcbind = get_mhcbind_parser(parser_mhcbind)
- parser_pepquery = get_pepquery_parser(parser_pepquery)
-
- if len(argv) < 1:
- parser.print_help()
- sys.exit(1)
-
- if len(argv) < 2:
- if argv[0] == 'build':
- parser_build.print_help()
- elif argv[0] == 'samplespecif':
- parser_samplespecif.print_help()
- elif argv[0] == "cancerspecif":
- parser_cancerspecif.print_help()
- elif argv[0] == "mhcbind":
- sys.stdout.write("------------------------------ MHCBIND IMMUNOPEPPER USAGE ------------------------------ \n \n ")
- parser_mhcbind.print_help()
- sys.stdout.write("\n------------------------------ MHCTOOLS AVAILABLE COMMAND LINE OPTIONS ------------------------------ \n \n ")
- from mhctools.mhctools.cli.args import make_mhc_arg_parser
- parser_mhc = make_mhc_arg_parser(prog="mhctools",description=("Predict MHC ligands from protein sequences")) #TODO: uncmment this line
- parser_mhc.print_help()
- elif argv[0] == 'pepquery':
- sys.stdout.write("------------------------------ PEPQUERY IMMUNOPEPPER USAGE ------------------------------ \n \n ")
- parser_pepquery.print_help()
- else:
- parser.print_help()
-
-
- pargs = parser.parse_args(argv)
- return pargs
-
-def split_mode(options):
-
- arg = parse_arguments(options)
- mode = options[0]
- if not os.path.isdir(arg.output_dir):
- os.makedirs(arg.output_dir)
- now = datetime.now()
-
- if arg.verbose > 0:
- stdout_handler = logging.StreamHandler(sys.stdout)
- handlers = [stdout_handler]
- ### set log level
- if arg.verbose == 0:
- log_level = logging.WARNING
- elif arg.verbose == 1:
- log_level = logging.INFO
- else:
- log_level = logging.DEBUG
-
- logging.basicConfig(
- level=log_level,
- handlers=handlers,
- format="%(asctime)-15s %(levelname)-8s %(message)s")
-
- #stdout_handler.setFormatter(logging.Formatter("%(asctime)-15s %(levelname)-8s %(message)s"))
- #stdout_handler.setLevel(log_level)
- #logging.getLogger().addHandler(stdout_handler)
-
- logging.info("Command line"+str(arg))
- if mode == 'build':
- pass
- mode_build(arg)
- if mode == 'samplespecif':
- pass
- mode_samplespecif(arg)
- if mode == "cancerspecif":
- pass
- mode_cancerspecif(arg)
- if mode == "mhcbind":
- pass
- from immunopepper.mode_mhcbind import mode_mhcbind #import here due to logging conflict
- mode_mhcbind(arg)
- if mode == 'pepquery':
- mode_pepquery(arg)
-
-def cmd_entry():
- #pr = cProfile.Profile()
- #pr.enable()
- options = sys.argv[1:]
- split_mode(options)
- #pr.disable()
- #pr.print_stats()
- #pr.dump_stats('/cluster/work/grlab/projects/tmp_laurie/test_memory_time_mx/cProfile.stats')
-
-if __name__ == "__main__":
- cmd_entry()
-
diff --git a/immunopepper/ip.py b/immunopepper/ip.py
index 74066eca..359af75f 100755
--- a/immunopepper/ip.py
+++ b/immunopepper/ip.py
@@ -6,152 +6,210 @@
import sys
from datetime import datetime
-from mhctools.cli.args import make_mhc_arg_parser
from immunopepper.mode_build import mode_build
from immunopepper.mode_samplespecif import mode_samplespecif
from immunopepper.mode_cancerspecif import mode_cancerspecif
+from immunopepper.mode_pepQuery import mode_pepquery
def _add_general_args(parser):
general = parser.add_argument_group('GENERAL')
general.add_argument("--verbose", type=int, help="specify output verbosity (0 - warn, 1 - info, 2 - debug) [1]", required=False, default=1)
-def parse_arguments(argv):
- parser = argparse.ArgumentParser(prog='immunopepper')
- subparsers = parser.add_subparsers(help='Running modes', metavar='{build, samplespecif, cancerspecif, mhcbind}')
-
- ### mode_build
- parser_build = subparsers.add_parser('build', help='generate kmers library from a splicegraph')
-
- required = parser_build.add_argument_group('MANDATORY')
- required.add_argument("--output-dir", help="output directory [default: output]", required=True, default='output')
- required.add_argument("--ann-path", help="absolute path of reference gene annotation file", required=True)
- required.add_argument("--splice-path", help="absolute path of splicegraph file", required=True)
- required.add_argument("--ref-path", help="absolute path of reference genome file", required=True)
- required.add_argument("--kmer", type=int, help="specifies the desired kmer length output", required=True, default=9)
-
- submodes = parser_build.add_argument_group('SUBMODES PROCESS: Conceptual choices about the processing required')
- submodes.add_argument("--libsize-extract", help="goes through the graph only to output gene quantifications and library sizes. Skips neoepitope generation", action="store_true", required=False, default=False)
- submodes.add_argument("--all-read-frames", help="switch from annotated reading frames to exhaustive translation", action="store_true", required=False, default=False)
- submodes.add_argument("--count-path", help="absolute path of count hdf5 file. If not provided candidates are output without expression quantification", required=False, default=None)
- submodes.add_argument("--output-samples", nargs='+', help="list of sample names to output, names must match the graph samples. If not provided, all count samples will be output (runs faster)", required=False, default=[])
- submodes.add_argument("--heter-code", type=int, help=argparse.SUPPRESS, default=0) # if count expression data is provided in h5 format, specify the code for heterzygous
-
- parameters = parser_build.add_argument_group('TECHNICAL PARAMETERS: Parameters to dump intermediate data or to optimize processing')
- parameters.add_argument("--compressed", help="compress output files", action="store_true", default=True)
- parameters.add_argument("--parallel", type=int, help="number of threads to be used [1]", required=False, default=1)
- parameters.add_argument("--batch-size", type=int, help="batch size for parallel processing", default=1000)
- parameters.add_argument("--pickle-samples", nargs='+', help="list of sample names to pickle, ids should match the count/graphs but equivalence with the mutation files can be specified (see --sample-name-map)", required=False, default=[])
-
- subset = parser_build.add_argument_group('SUBSET: Process subsets of the graph')
- subset.add_argument("--process-chr", nargs='+', help="Only process the list of given chromosomes from the splicegraph, default: process all", required=False, default=None)
- subset.add_argument("--complexity-cap", type=int, help="limit the processing of the foreground to genes with complexity less than the cap", required=False, default=None)
- subset.add_argument("--genes-interest", help="only process the genes given as input. One gene-ID per line, no header.", required=False, default=None)
- subset.add_argument("--start-id", type=int, help="development feature: start processing the graph at the given id (Tmp folder numbering in parallel mode is conserved)", required=False, default=0)
- subset.add_argument("--process-num", metavar='N', type=int, help="Only process the first N genes in the splicegraph, default: process all", required=False, default=0)
-
- outputs = parser_build.add_argument_group('OUTPUT OPTIONS: Optional choices about the output formatting and filtering')
- outputs.add_argument("--skip-annotation", help='skip the generation of the annotated peptides and kmers', action="store_true", default=False)
- outputs.add_argument("--libsize-path", nargs='?', help=argparse.SUPPRESS, required=False, default=None) #specify the absolute path to expression library sizes if we want to append to a file
- outputs.add_argument("--output-fasta", help="if True, outputs both the sample peptide metadata and the fasta, else outputs only metadata", action="store_true", required=False, default=False)
- outputs.add_argument("--force-ref-peptides", help="output mutated peptide even if it is the same as reference peptide", action="store_true", default=False)
- outputs.add_argument("--filter-redundant", help="apply redundancy filter to the exon list", action="store_true", required=False, default=False)
- outputs.add_argument("--kmer-database", help="absolute path of database file with kmers on one column (no header). If provided, the kmers matching the given database with I and L equivalence will not be output.", required=False, default=None)
- outputs.add_argument("--gtex-junction-path", help="absolute path of whitelist junction file, currently only support hdf5 format.", required=False, default=None)
- parameters.add_argument("--disable-concat", help="switch off concatenation of short exons to increase speed", action="store_true", default=False)
- parameters.add_argument("--disable-process-libsize", help="sub-sample library size generation to increase speed", action="store_true", default=False)
-
- mutation = parser_build.add_argument_group('MUTATION FILES: Arguments needed for integration of the germline variants or somatic mutations')
- mutation.add_argument("--mutation-sample", help="sample id for the somatic and germline application, ids should match the count/graphs but equivalence with the mutation files can be specified (see --sample-name-map)", required=False, default=None)
- mutation.add_argument("--germline", help="Absolute path to an optional germline VCF or MAF mutation file", required=False, default='')
- mutation.add_argument("--somatic", help="Absolute path to an optional somatic VCF or MAF mutation file", required=False, default='')
- mutation.add_argument("--sample-name-map", help="provide a name mapping between the count/graphs files, the germline and the somatic mutation file Format:[ no header, 2 or 3 columns]. If 2 columns [ name in count/graphs files \t name in mutations files ] If 3 columns [name in count/graphs files \t name in germline file \t name in somatic file]", required=False, default=None)
- mutation.add_argument("--use-mut-pickle", help="save and use pickled mutation dict without processing the original files", action="store_true", default=False)
-
- _add_general_args(parser_build)
-
- ### mode_samplespecif
+def get_subparsers(parser):
+
+ """
+ Creates the different subparsers and returns them to fill them with arguments.
+
+ Parameters
+ ----------
+ parser : argparse.ArgumentParser. Main parser of immunopepper
+
+ """
+
+ subparsers = parser.add_subparsers(help='Running modes', metavar='{build, samplespecif, cancerspecif, mhcbind, pepquery}')
+
+ #I will create the different subparsers
+ parser_build = subparsers.add_parser('build', help='Core part of ImmunoPepper. Traverses the input splice graph and generates all possible peptides/kmers.')
parser_samplespecif = subparsers.add_parser('samplespecif', help='Performs removal of the annotation to make the kmer list sample specific')
- required = parser_samplespecif.add_argument_group('MANDATORY')
- required.add_argument("--annot-kmer-files", nargs='+', help="list of absolute paths to the annotation kmer files", required=True, default='')
- required.add_argument("--output-dir",help='directory to store the log file',required=True)
- required.add_argument("--junction-kmer-files", nargs='+', help="absolute paths to the sample kmer files", required=True, default='')
- required.add_argument("--bg-file-path", help="absolute path to the intermediate pooled annotation file", required=True, default='')
- required.add_argument("--output-suffix", help="suffix to be append to output file path", required=True, default='no-annot')
- required.add_argument("--remove-bg", help="choose to simply remove background rows or add a new flag column to indicate"
- " if the kmer exists in the background kmers", action="store_true", required=False, default=False)
- _add_general_args(parser_samplespecif)
-
- ### mode_cancerspecif
parser_cancerspecif = subparsers.add_parser('cancerspecif', help='Performs differential filtering against a panel of normal samples')
+ parser_mhcbind = subparsers.add_parser('mhcbind', help='Perform MHC binding prediction with a wrapper for MHCtools')
+ parser_pepquery = subparsers.add_parser("pepquery", help="Perform peptide validation with a wrapper for the tool PepQuery")
+ return parser_build, parser_samplespecif, parser_cancerspecif, parser_mhcbind, parser_pepquery
+
+def get_build_parser(parser):
+
+ """
+ Function that creates the subparser for the build mode with its arguments
+ """
+
+ required = parser.add_argument_group('Mandatory arguments')
+ required.add_argument("--output-dir", help="absolute path of the output directory. All output files generated by immunopepper will be saved in this directory.", required=True, default='output')
+ required.add_argument("--ann-path", help="absolute path for the annotation file. Accepted file formats: *.gtf*, *.gff* and *.gff3*. Annotation files can be downloaded from various databases such as `GENCODE `_ ", required=True)
+ required.add_argument("--splice-path", help="absolute path of the input `SplAdder `_ splice graph.", required=True)
+ required.add_argument("--ref-path", help="absolute path of the reference genome file in FASTA format. Reference Please ensure that the reference genome is compatible with the gene annotation file being used. For example, if the annotation file is based on GRCh38, the reference genome should also be based on GRCh38. You can check `here `_ gencode annotation releases and their corresponding major genome assembly releases. For example, if you decide to use genome assembly version GRCh38.p13, you need to use its compatible annotation file from release 43 in GENCODE.", required=True)
+ required.add_argument("--kmer", type=int, help="length of the kmers for kmer output.", required=True, default=9)
+
+ submodes = parser.add_argument_group('Submodes parameters', 'Commands for conceptual information about the processing.')
+ submodes.add_argument("--libsize-extract",help="Set this parameter to True to generate library sizes and gene quantifications and skip neontigen generation. **Note:** If set to True, the program will only output files 3 and 7 of the :ref:`build output section `.",action="store_true", required=False, default=False)
+ submodes.add_argument("--all-read-frames", help="Set this parameter to True to switch to exhaustive translation and study all possible reading frames instead of just the annotated ones in the annotation file.", action="store_true", required=False, default=False)
+ submodes.add_argument("--count-path", help="Absolute path for the second output of `SplAdder `_ containing the graph expression quantification. If provided, expression quantification of genes will take place. **Format:** hdf5.", required=False, default=None)
+ submodes.add_argument("--output-samples", nargs='+', help="List of sample names to output. **Note:** *Names should match the file name of the splice graphs. If not provided all samples are processed and program runs faster.*", required=False, default=[])
+ submodes.add_argument("--heter-code", type=int, help="It specifies the heterozygous allele.", default=0, choices = ['0', '2']) #TODO: Add more info about this parameter?
+
+ parameters = parser.add_argument_group('Technical parameters' , 'Commands for optimization of the software.')
+ parameters.add_argument("--compressed", help="Compress output files", action="store_true", default=True)
+ parameters.add_argument("--parallel", type=int, help="Number of cores to be used.", required=False, default=1)
+ parameters.add_argument("--batch-size", type=int, help="Number of genes to be processed in each batch. If bigger batches are selected, the program will be faster, but it will require more memory.", default=1000)
+ parameters.add_argument("--pickle-samples", nargs='+', help="List of samples to be pickled. Needed if `--use-mut-pickle` is set to True. It will create intermediate files containing mutation information of the selected samples. This command is useful because mutation/variant information needs to be parsed in every run of the software, which is a time-consuming operation. By pickling the mutations, when mutation information is needed, it will be directly loaded from the intermediate pickled files instead than from the original mutation files provided under `--somatic` and `--germline`. This will speed up software re-runs. When dealing with large cohorts, this command is useful to select exactly what files should be pickled. If not provided, all the samples passed in `--mutation-sample` will be pickled. Names should match the sample names of the graph/counts files but an equivalence can be set using `--sample-name-map` from mutation parameters.", required=False, default=[])
+
+ subset = parser.add_argument_group('Subset parameters', 'Commands to select a subset of the genes to be processed.')
+ subset.add_argument("--process-chr", nargs='+',help="List of chromosomes to be processed. If not provided all chromosomes are processed. The chromosomes names should be provided in the same format as in FASTA and annotation files. For annotations downloaded from GENCODE, this format is **chrX**, X being the chromosome number.", required=False, default=None)
+ subset.add_argument("--complexity-cap", type=int, help="Maximum edge complexity of the graph to be processed. If not provided all graphs are processed.", required=False, default=None)
+ subset.add_argument("--genes-interest", help="Genes to be processed. **Format:** Input is a csv file containing a gencode gene id per line, with no header. **Technical note:** The gencode gene id must match the gencode gene ids of the splice graph. Therefore, the format for this argument must match the format of the *gene_id* field of the annotation file used in the build mode of SplAdder and passed under `--ann-path` in immunopepper. If not provided all genes are processed.", required=False, default=None)
+ subset.add_argument("--start-id", type=int, help="Id of the first gene in the splice graph to be processed. If not provided all genes are processed.", required=False, default=0)
+ subset.add_argument("--process-num", metavar='N', type=int, help="Number of genes to be processed. If provided, the first process-num genes are processed.", required=False, default=0)
+
+ outputs = parser.add_argument_group('Output parameters', 'Commands to select output formatting and filtering.')
+ outputs.add_argument("--skip-annotation", help='Set this parameter to True to skip the generation of a background kmer and peptide files.', action="store_true", default=False)
+ outputs.add_argument("--output-fasta", help="Set this parameter to True to output the foreground peptides in fasta format. If set to True output number 4 from the :ref:`build output section ` will be generated. If set to False only the kmers will be output.", action="store_true", required=False, default=False)
+ outputs.add_argument("--force-ref-peptides", help="Set this parameter to True to output mutated peptides even if they are the same as the ones in the reference. The reference in this case are the peptides without any mutations or variants application.", action="store_true", default=False)
+ outputs.add_argument("--filter-redundant", help="If set to true, a redundancy filter will be applied to the exon list. If two or more exons span the same juction, their coordinates will be combined so that the longest spanning combination is kept.", action="store_true", required=False, default=False)
+ outputs.add_argument("--kmer-database", help="Absolute path of a file containing kmers in one column, without header. If the kmers contained in this database contain the aminoacid isoleucine (I), it will be converted into leucine (L). A file from uniprot or any other standard library can be provided. The kmers provided in this file will not be output if found in the foreground peptides. Please note that is a standard proteome is downloaded from an online resource the proteins should be cut into the kmers length selected under `--kmer`.", required=False, default=None)
+ outputs.add_argument("--gtex-junction-path", help="Absolute path of whitelist junction path. The junctions of this file will be the only ones output from the tool. \n **Format:** hdf5 file with 'chrm', 'pos' and 'strand' as keys. *'Chrm'* contains the chromosome name in the same format as in the annotation. *'pos'* contains coordinates. The coordinates are end_e1 and start_e2. This means that end_e1 > start_e2 if strand is '-' and end_e1 < start_e2 if strand '+'. *strand* will be either '-' or '+'. \n \n **Pipeline relevance**: When a whitelist file is provided, the field 'isJunctionList' in the :ref:`metadata output file ` will contain a 1 if the junction is contained in this list, and 0 otherwise.", required=False, default=None)
+ parameters.add_argument("--disable-concat", help="Disable the generation of kmers from combinations of more than 2 exons. In this mode, any kmer shorter than `--kmer` will be discarded. By setting this command to False, the generation of kmers from 3 exons is allowed. This might ensure that kmers generated from shorter exons are kept, but one should take into account that kmers translated from 3 exons might have lower confidence. By setting the argument to True the generation of kmers is faster. " , action="store_true", default=False)
+ parameters.add_argument("--disable-process-libsize", help="Set to True to generate the libsize file (file 7 from the :ref:`build output section `).", action="store_true", default=False)
+
+
+ mutation = parser.add_argument_group('Mutation parameters', 'Commands to add somatic and germline mutation information.')
+ mutation.add_argument("--mutation-sample", help="Sample id of the files to which mutations are added. The ids should match the graphs/counts names, but an equivalence can be set with --sample-name-map.", required=False, default=None)
+ mutation.add_argument("--germline", help="Absolute path of the germline mutation file. **Format:** VCF, MAF or h5.", required=False, default='')
+ mutation.add_argument("--somatic", help="Absolute path of the somatic mutation file. **Format:** VCF, MAF or h5.", required=False, default='')
+ mutation.add_argument("--sample-name-map", help="Name mapping to sample names from graphs/counts files. **Format:** No header. Two columns: *[name of count/graphs file \t name of mutation/pickle file]*. Three columns: *[name of count/graphs file \t name of germline file \t name of somatic file]*.", required=False, default=None)
+ mutation.add_argument("--use-mut-pickle", help="Set to True to save and use pickled mutation dictionary. This command is useful because mutation/variant information needs to be parsed in every run of the software, which is a time-consuming operation. By pickling the mutations, when mutation information is needed, it will be directly loaded from the intermediate pickled files instead than from the original mutation files provided under `--somatic` and `--germline`. This will speed up software re-runs.", action="store_true", default=False)
+
+ _add_general_args(parser)
+
+ return parser
+
+def get_samplespecif_parser(parser):
+ required = parser.add_argument_group('Mandatory arguments')
+ required.add_argument("--annot-kmer-files", nargs='+', help="List of absolute paths to the annotation kmer files. The files should have the name **\[mut_mode\]_annot_kmer.gz** (Files 2 from the :ref:`build output section `)", required=True, default='')
+ required.add_argument("--output-dir", help=' Path to the output directory.', required=True)
+ required.add_argument("--junction-kmer-files", nargs='+', help="Absolute path to the folder containing the foreground kmers. The possible folders one can use for this mode are: **\[mut_mode\]_graph_kmer_JuncExpr** or **\[mut_mode\]_graph_kmer_SegmExpr** (Files 5 and 6 from the :ref:`build output section `)",required=True, default='')
+ required.add_argument("--bg-file-path", help="Absolute path to the intermediate pooled annotation file. This file is the set of unique kmers in `--annot-kmer-files` files. If the file is not provided it will be generated. In order to be generated one needs to provide the folder and the file name where the file will be saved. **Note:** It should be a non existent folder. Format: One column with header 'kmer'.", required=True, default='')
+ required.add_argument("--output-suffix", help="Suffix to be appended to the filtered `--junction-kmer-files`, e.g. samplespecif", required=True, default='no-annot')
+
+ optional = parser.add_argument_group('Optional argument')
+ optional.add_argument("--remove-bg", help="Set to True to remove from `--junction-kmer-files` the kmers in `--bg-file-path` . If set to False, a new column `is_neo_flag` will be added to the `--junction-kmer-files` files. The column will contain False if the kmer is in `--bg-file-path` and True otherwise.", action="store_true", required=False, default=False)
+
+ _add_general_args(parser)
+ return parser
+
+def get_cancerspecif_parser(parser):
+ mandatory = parser.add_argument_group('Mandatory arguments', 'These arguments belong to three main groups: \n\n**Technical parameters:** Due to the heavy amount of data, this mode uses spark. These are parameters to control spark processing \n\n**Input helper parameters:** These parameters are used for parsing the input files by the software \n\n**General output files:** Parameters for the files that are output by the software regardless of the filtering method.')
+
+ mandatory.add_argument("--cores", type=int, help="Technical parameter. Number of cores to use", required=True, default='')
+ mandatory.add_argument("--mem-per-core", type=int, help="Technical parameter. Memory required per core", required=True)
+ mandatory.add_argument("--parallelism", type=int, help="Technical parameter. Parallelism parameter for Spark Java Virtual Machine (JVM). It is the default number of partitions in RDDs returned by certain transformations. Check `--spark.default.parallelism` `here `_ for more information.", required=True, default='3')
+
+ mandatory.add_argument("--kmer", help='Input helper parameter. Kmer length', required=True)
+
+ mandatory.add_argument("--output-dir", help="General output file. Absolute path to the output directory to save the filtered data.", required=True, default='')
+
+ technical = parser.add_argument_group('Optional technical parameters', 'Due to the heavy amount of data, this mode uses spark. These are parameters to control spark processing')
+ technical.add_argument("--out-partitions", type=int, help="This argument is used to select the number of partitions in which the final output file will be saved. If not provided, the results are saved in a single file", required=False, default=None)
+ technical.add_argument("--scratch-dir", help="Os environment variable name containing the cluster scratch directory path. If specified, all the intermediate files will be saved to this directory. If not specified, the intermediate files will be saved to the output directory, specified under `--output-dir`. If the scratch directory is provided, `--interm-dir-norm` and `--interm-dir-cancer` are ignored.", required=False, default='')
+ technical.add_argument("--interm-dir-norm", help="Custom scratch directory path to save the intermediate files for the normal samples. If not specified, the intermediate files will be saved to the output directory, specified under `--output-dir`.", required=False, default='')
+ technical.add_argument("--interm-dir-canc", help="Custom scratch directory path to save the intermediate files for the cancer samples. If not specified, the intermediate files will be saved to the output directory, specified under `--output-dir`.", required=False, default='')
+
+ input_help = parser.add_argument_group('Optional input helper parameters' , 'These parameters are used for parsing the input files by the software.')
+ input_help.add_argument("--ids-cancer-samples", nargs='+', help="List with the cancer sample IDs of interest. The samples should be contained in the input cancer expression matrix, and follow the same format. ", required=False, default='')
+ input_help.add_argument("--mut-cancer-samples", nargs='+', help="List of mutation modes corresponding to each cancer sample. They will be used to tag the output files. The list should have the same number of entries as `--ids-cancer-samples`.", choices=['ref', 'somatic', 'germline', 'somatic_and_germline'], required=False, default='')
+
+ inputs = parser.add_argument_group('Optional general input files' , 'Parameters for the input files that are used regardless of the filtering method.')
+ inputs.add_argument("--whitelist-normal", help="File containing the whitelist of normal samples. If provided, only the samples in the whitelist will be retrieved and further studied. **Format:** Tab separated file without a header, with a single column containing sample names.", required=False, default=None)
+ inputs.add_argument("--whitelist-cancer", help="File containing the whitelist of cancer samples. If provided, only the samples in the whitelist will be retrieved and further studied. **Format:** Tab separated file without a header, with a single column containing sample names.", required=False, default=None)
+ inputs.add_argument("--path-cancer-libsize", help="Path for the libsize file of the selected cancer samples. It corresponds to the output 7 of :ref:`build output section `", required=False, default=None)
+ inputs.add_argument("--path-normal-libsize", help="Path for the libsize of the selected normal samples. It corresponds to the output 7 of :ref:`build output section `", required=False, default=None)
+ inputs.add_argument("--normalizer-cancer-libsize", type=float, help="**Default =** median of the libsize across all input samples. Custom normalization factor for the cancer libsize. Normalization is used to make all the samples comparable and correct for possible biases in data acquisition. The normalization is done according to the following formula: (read count / 75th quantile in sample) * normalizer value.", required=False, default=None)
+ inputs.add_argument("--normalizer-normal-libsize", type=float, help="**Default =** median of the libsize across all input samples. Custom normalization factor for the normal libsize. Normalization is used to make all the samples comparable and correct for possible biases in data acquisition. The normalization is done according to the following formula: (read count / 75th quantile in sample) * normalizer value.", required=False, default=None)
+
+ outputs = parser.add_argument_group('Optional general output files', 'Parameters for the files that are output by the software regardless of the filtering method.')
+
+ outputs.add_argument("--output-count", help="Path and name where the intermediate numbers of kmers remaining after each filtering step will be written. If selected, a file will be written containing the number of kmers present after each filtering step. It might slow down the computations. However, it is useful if there is an interest on intermediate filtering steps. The output can be seen :ref:`here ` in the output section.", required=False, default='')
+ outputs.add_argument("--tag-normals", help="Name for the normal cohort used for filtering. Needed when there are various normal cohorts. It will be added to the final output name in order to identify the normal cohort against which the cancer samples were filtered.", required=False, default='')
+ outputs.add_argument("--tag-prefix", help="Prefix used for the output files. It is recommended when there are different conditions being studied.", required=False, default='')
+
+ nrf = parser.add_argument_group('Optional parameters for normal samples filtering', 'Parameters to perform the step 2 of the normal filtering pipeline.')
+ nrf.add_argument("--path-normal-matrix-segm", nargs='+', help="Path to the matrix of segment expression of kmers in normal samples. This corresponds to output 5 of :ref:`build output section `", required=False, default=None)
+ nrf.add_argument("--path-normal-matrix-edge", nargs='+', help="Path to the matrix of junction expression of kmers in normal samples. This corresponds to output 6 of :ref:`build output section `", required=False, default=None)
+ nrf.add_argument("--n-samples-lim-normal", type=int, help="This is the value for threshold b) in step 2 of normal filtering pipeline. This number will set the number of samples in which we need to see any expression of a specific kmer, i.e. expression > 0, to consider it a normal kmer and exclude it as cancer candidate.", required=False, default=None)
+ nrf.add_argument("--cohort-expr-support-normal", type=float, help="This is the value for threshold a) in step 2 of normal filtering pipeline. This number corresponds to the normalized expression we need to see in at least one sample in order to consider a kmer as a normal kmer. If a kmer is found with an expression level higher or equal than this threshold in one or more normal samples it will be considered a normal kmer and excluded as cancer candidate.", required=False, default=None)
+
+ crf = parser.add_argument_group('Optional parameters for cancer samples filtering', 'Parameters to perform the step 2 of the cancer filtering pipeline.')
+ crf.add_argument("--sample-expr-support-cancer", type=float, help="This parameter corresponds to sample specific filtering in the cancer pipeline. The value will correspond to the normalized expression threshold that needs to be reached by each kmer in order to be considered a kmer candidate. Kmers with an expression higher or equal than this threshold in the cancer sample of interest will be considered as cancer candidates. Cancer samples of interest are provided under --ids_cancer_samples.", default = None, required = False)
+ crf.add_argument("--cohort-expr-support-cancer", type=float, help="This parameter corresponds to the expression threshold in cohort specific filtering. It indicates the normalized expression value that needs to be observed in at least `--n-samples-lim-cancer` in order to consider a kmer as a cancer candidate. Kmers with an expression higher than this threshold in at least `--n-samples-lim-cancer` samples will be considered as cancer candidates. For each cancer sample of interest, provided under `--ids_cancer_samples`, the expression threshold --cohort-expr-support-cancer will be assessed in the rest of the cohort, excluding the sample of interest.", required=False, default=None)
+ crf.add_argument("--n-samples-lim-cancer", type=int, help="This parameter corresponds to the number of samples threshold in cohort specific filtering. It indicated the minimum number of cancer samples in which one should see an expression higher than `--cohort-expr-support-cancer` in order to consider the kmer as a cancer candidate. Kmers with an expression higher than `--cohort-expr-support-cancer` in at least `--n-samples-lim-cancer` samples will be considered as cancer candidates. For each cancer sample of interest, provided under `--ids_cancer_samples`, the expression threshold `--cohort-expr-support-cancer` will be assessed in the rest of the cohort, excluding the sample of interest.", required=False, default=None)
+ crf.add_argument("--path-cancer-matrix-segm", nargs='+', help="Path to the cancer matrix containing segment expression from samples belonging to a cohort. The matrix will have the following dimensions: [kmers * samples]. When only the junction overlapping cancer kmers are of interest, the user should provide only `--path-cancer-matrix-edge`, and skip the inclusion of this file.If both matrices are provided, junction expression will be chosen in case there is expression information for the same kmer in both matrices. This will be the output 5 of :ref:`build output section `", required=False, default=None)
+ crf.add_argument("--path-cancer-matrix-edge", nargs='+', help="Path to the cancer matrix containing junction expression from samples belonging to a cohort. The matrix will have the following dimensions: [kmers * samples]. When only the junction overlapping cancer kmers are of interest, the user should provide only this file, and skip the inclusion of `--path-cancer-matrix-segm`. If both matrices are provided, junction expression will be chosen in case there is expression information for the same kmer in both matrices. This will be the ouput 6 of :ref:`build output section `.", required=False, default=None)
+ crf.add_argument("--cancer-support-union", help="Parameter to choose how the sample specific filtering and the cohort specific filtering are combined. By default, they are combined by choosing the common kmers to both filtering steps, i.e. performing an intersection. If this parameter is set to True, the union of both filtering steps will be performed, i.e. the kmers that pass either the sample specific filtering or the cohort specific filtering will be kept.", action="store_true", required=False, default=False)
+
+ more_backgrounds = parser.add_argument_group('Optional parameters for the addition of additional backgrounds' , 'Parameters to add additional backgrounds that will be removed.')
+ more_backgrounds.add_argument("--path-normal-kmer-list", nargs='+', help="List of kmers to be added as part of the normal samples. The kmers provided in this list will be included in the normal background, without having to pass any of the filtering steps for the normal data. Format: It can be either a *tsv* or *parquet* file. If *parquet* is the used format, the file should contain the header 'kmer' in the first column.", required=False, default=None)
+ more_backgrounds.add_argument("--uniprot", help="Path to file containing uniprot kmers. The kmers contained in this databased will be assumed to be not novel peptides, and they will be removed from the cancer filtering output. *Note: It is important to kmerize the peptides downloaded from uniprot database into the length specified in `--kmer`*.", required=False, default=None)
+
+ more_filters = parser.add_argument_group('Optional parameters to add additional filters' , 'Parameters to add the additional filters shown in step 1 of both pipelines. ')
+ more_filters.add_argument("--filterNeojuncCoord", choices=['C', 'N', 'A'], required=False, default='', help=" This argument will filter the neojunctions, and it will retain only the kmers that are generated from neojunctions. These are peptides whose junction coordinates were not part of the annotation files. Selecting this option means that only those kmers with junctionAnnotated = False will be considered for further filtering. This filter is used in the preprocessing (step 1 of cancer and normal pipelines). If 'C' is selected, the filter is only applied to the cancer sample. If 'N' is selected, the filter is only applied to the normal samples. If 'A' is selected, the filter is applied to both cancer and normal samples.")
+ more_filters.add_argument("--filterAnnotatedRF", choices=['C', 'N', 'A'], required=False, default='', help="This argument will only retrieve kmers that were generated from annotated reading frames and discard those that were obtained by reading frame propagation through the graph. By selecting this option, only kmers generated from reading frames present in the annotation are kept. This means that selecting this option means that only those kmers with ReadFrameAnnotated = True will be selected. This filter is used in the preprocessing (step 1 of cancer and normal pipelines). If 'C' is selected, the filter is only applied to the cancer sample. If 'N' is selected, the filter is only applied to the normal samples. If 'A' is selected, the filter is applied to both cancer and normal samples.")
+
+ development = parser.add_argument_group('Optional development parameters')
+ development.add_argument("--tot-batches", type=int, help="If selected, the filtering of the background and foreground will be based on hash functions. This parameter will set the total number of batches in which we will divide the foreground and background files to filter, and each of those batches will be assigned a hash value. If `--batch-id` is specified, `--tot-batches` should also be specified.", required=False, default=None)
+ development.add_argument("--batch-id", type=int, help="If selected, the filtering of the background and foreground will be based on hash functions. This parameter will set the batch id of the current batch that is being filtered. The batch id should be an integer between 0 and `--tot-batches`. It shows the specific batch that we want to process, out of the `--tot-batches`. If `--batch-id` is specified, `--tot-batches` should also be specified.", required=False, default=None)
+ development.add_argument("--on-the-fly", help="If set to true, all the filtering steps will be done on the fly, without the creation of intermediate files. The creation of intermediate files would speed up the computations in the case where full or partial reruns are planned. An example of a partial rerun would be when seveal normal and cancer filtering parameters are applied to the same cohort. Choosing not to save intermediate files will trade speed for disk space.", action="store_true", default=False)
+
+ _add_general_args(parser)
+
+ return parser
+
+def get_mhcbind_parser(parser):
+ required = parser.add_argument_group('Mandatory arguments')
+
+ required.add_argument("--mhc-software-path", help="Path for the MHC prediction software.", required=True, default=None)
+ required.add_argument("--argstring", help="Complete command line for the MHC prediction tool passed as a string. One should include here the command that will be directly passed to the selected MHC tool. The four mandatory arguments are: \n 1.--mhc-predictor: This argument will specify the name of the software tool that will be used. The name should be in the format accepted by the library `mhc_tools `_ \n \n 2.--output-csv: This argument will contain the path and filename where the MHC prediction tool will save the results. The format of the file should be *.csv*.\n \n 3.--input-peptides-file: This argument will have the path and filename to the file containing the set of kmers on which MHC binding affinity prediction will be performed. The format of the file should be *.csv*.\n \n 4. --mhc-alleles or --mhc-alleles-file: This argument should contain the alleles that will be used for the analysis of mhc binding affinity.\n \n If `--partitioned-tsv` files are provided, an intermediate file will be created and stored under the path `--input-peptides-file` . This intermediate file will contain the set of all unique kmers present in the partitioned files obtained from `cancerspecif` mode. If one does not want to use the output of `cancerspecif` mode for prediction, the path to the file that will be used for prediction will be directly provided under `--input-peptides-list`.", required=True, default='')
+ required.add_argument("--output-dir", help="General output file. Absolute path to the output directory to save the MHC predictions. It should match the directory provided in --output-csv of the argstring, but without the file name.", required=True, default='')
+
+ optional = parser.add_argument_group('Optional argument')
+ optional.add_argument("--partitioned-tsv", help="The input to this command is the path to the folder containing the partitioned tsv files from `cancerspecif` mode. This corresponds to the files 1 and 2 found in the :ref:`output section `. If this parameter is set the tool will directly accept the files from cancerspecif mode as input.", required=False, default=None)
+ optional.add_argument("--bind-score-method", help="Scoring method to filter the MHC tools predictions. E.g. score, affinity, percentile_rank (this last one is only for netmhcpan).", required=False, default=None)
+ optional.add_argument("--bind-score-threshold", type=float, help="Threshold to filter the MHC tools predictions. All the peptides with a score lower than the threshold will be filtered out and only the ones with a score higher than the threshold will be kept.", required=False, default=None)
+ optional.add_argument("--less-than", help="If set to True the `--bind-score-threshold` will be considered as an upper bound instead of a lower bound. This means that peptides with a score higher than this threshold will be filtered out.", action="store_true", required=False, default=False)
+ _add_general_args(parser)
+ return parser
+
+def get_pepquery_parser(parser):
+ required = parser.add_argument_group('Mandatory arguments')
+ required.add_argument("--output-dir", help="Absolute path to the output directory to save the results of the validation.", required=True, default=None)
+ required.add_argument("--argstring", help="Complete command line for the pepQuery MS based validation software passed as a string. The command provided here will be directly passed to the pepQuery software. It must contain several mandatory arguments: \n 1. -o: Output directory. The results from pepQuery will be saved in this folder. \n 2. -i: Input peptides for the validation. The user can either provide their own peptides or take the expanded peptides from the kmers provided in --partitioned-tsv. If --partitioned-tsv is provided, the expanded peptides will be saved under the path and filename provided in this argument, and they will be taken as the input of pepQuery.\n 3. -db: Path to the reference database used in the analysis. This reference database will be used to identify those MS/MS spectra that match better a reference peptide than the query peptides. \n 4. -ms: Path to the MS/MS spectra file.", required=True, default=None)
+ required.add_argument("--pepquery-software-path", help="Path for the pepQuery software. It must include the path to the directory with the software and the software name itself. eg. pepquery/pepquery.jar", required=True, default=None)
+
+ optional = parser.add_argument_group('Optional argument')
+ optional.add_argument("--partitioned-tsv", help="The input to this command is the path to the folder containing the partitioned tsv files from `cancerspecif` mode. This corresponds to the files 1 and 2 found in the :ref:`output section `. If this parameter is set the tool will directly accept the files from cancerspecif mode as input. An intermediate file will be saved in the output directory containing the kmers in a bigger peptide context. This argument accepts files generated from junctions and from segments.", required=False, default=None)
+ optional.add_argument("--metadata-path", help="Absolute path to the metadata file created in build mode. This file is required whether the user chooses to work with junctions of with kmers.", required=False, default=None)
+ optional.add_argument("--kmer-type", help="Type of the kmers introduced under --partitioned-tsv. This will show whether the user chooses to work with junctions or with segments. The peptide retrieval strategy will vary depending on the input type.", choices=['junctions', 'segments'], required=False, default=None)
+ _add_general_args(parser)
+ return parser
+def parse_arguments(argv):
+ parser = argparse.ArgumentParser(prog='immunopepper')
+ parser_build, parser_samplespecif, parser_cancerspecif, parser_mhcbind, parser_pepquery = get_subparsers(parser)
- spark = parser_cancerspecif.add_argument_group('TECHNICAL PARAMETERS: parameters to control spark processing')
- spark.add_argument("--cores", type=int, help="number of cores", required=True, default='')
- spark.add_argument("--mem-per-core", type=int, help="memory per core", required=True)
- spark.add_argument("--parallelism", type=int, help="parallelism parameter for spark JVM", required=True, default='3')
- spark.add_argument("--out-partitions", type=int, help="number of partitions to save the final tsv file, correspond to a coalesce operation", required=False, default=None)
- spark.add_argument("--scratch-dir", help="os environement variable name containing the cluster scratch directory path", required=False, default='')
- spark.add_argument("--interm-dir-norm", help="custom scatch dir path to save intermediate cancer files", required=False, default='')
- spark.add_argument("--interm-dir-canc", help="custom scatch dir path to save intermediate normal files", required=False, default='')
-
- helpers = parser_cancerspecif.add_argument_group('INPUT HELPERS: Help the software understand the input files')
- helpers.add_argument("--kmer", help='kmer', required=True)
- helpers.add_argument("--ids-cancer-samples", nargs='+',help=" list of all cancer samples on which to apply the filtering. If --paths-cancer-samples provided they should be given in same order. \n If not provided the software filters the normals only.", required=False, default='')
- helpers.add_argument("--mut-cancer-samples", nargs='+', help=" list of mutation modes corresponding to cancer samples. If --paths-cancer-samples provided they should be given in same order", required=False, default='')
-
- inputs = parser_cancerspecif.add_argument_group('GENERAL INPUT FILES: Files and parameters to be provided to the software regardless of the filtering strategy')
- inputs.add_argument("--whitelist-normal", help="file containg whitelist for normal samples", required=False, default=None)
- inputs.add_argument("--whitelist-cancer", help="file containg whitelist for cancer samples", required=False, default=None)
- inputs.add_argument("--path-cancer-libsize", help="libsize file path for cancer samples", required=False, default=None)
- inputs.add_argument("--path-normal-libsize", help="libsize file path for normal samples", required=False, default=None)
- inputs.add_argument("--normalizer-cancer-libsize", type=float, help="Input custom rescaling factor for cancer libsize. Default: Median of libsize", required=False, default=None)
- inputs.add_argument("--normalizer-normal-libsize", type=float, help="Input custom rescaling factor for normal libsize. Default: Median of libsize", required=False, default=None)
-
- outputs = parser_cancerspecif.add_argument_group('GENERAL OUTPUT FILES: Files and parameters to be output by the software regardless of the filtering strategy')
- outputs.add_argument("--output-dir", help="output directory for the filtered matrix" , required=True, default='')
- outputs.add_argument("--output-count", help="request to write the intermediate number of kmer at each each step to the given path (risk of slowdown)" , required=False, default='')
- outputs.add_argument("--tag-normals", help="name for the normal cohort output files, use when various normal cohorts", required=False, default='')
- outputs.add_argument("--tag-prefix", help="prefix to use for the output files, use when several conditions", required=False, default='')
-
- nrf = parser_cancerspecif.add_argument_group('NORMAL SAMPLES: "Submode Recurrence hard Filter". Normal background inclusion is based on a combination of two filters (a) Number of reads to be expressed in any sample (b) Number of samples to have any read. The filters can be requested independently.')
- nrf.add_argument("--path-normal-matrix-segm", nargs='+', help="Segment expression integrated matrix of kmers * samples for background", required=False, default=None)
- nrf.add_argument("--path-normal-matrix-edge", nargs='+', help="Edge expression integrated matrix of kmers * samples for background", required=False, default=None)
- nrf.add_argument("--n-samples-lim-normal", type=int, help="Number of normal samples in which any number of reads is required (>0)", required=False, default=None)
- nrf.add_argument("--cohort-expr-support-normal", type=float, help="Normalized expression threshold for the normal cohort required in any sample (>=1)", required=False, default=None)
-
- crf = parser_cancerspecif.add_argument_group('CANCER SAMPLES: "Submode Recurrence and support filter". Cancer foreground inclusion is based on a combination of two filters (a) Number of reads to be expressed in the cancer "target" sample (b) Number of reads to be expressed in more than n samples in other cancer cohort samples. The filters can be requested independently. Note that use of (b) requires matrix input files.' )
- crf.add_argument("--sample-expr-support-cancer", type=float, help="Normalized expression threshold for the cancer target sample. (Can always be specified)")
- crf.add_argument("--cohort-expr-support-cancer", type=float, help="Normalized expression threshold for the cancer cohort excluding the target sample which should be met in n samples (if --expr-n-limit-cancer and path-cancer-matrix-segm resp. edge are provided)", required=False, default=None)
- crf.add_argument("--n-samples-lim-cancer", type=int, help="Number of cancer samples in which the cancer cohort expression threshold should be met (if --cohort-expr-support-cancer and path-cancer-matrix-segm resp. edge are provided)", required=False, default=None)
- crf.add_argument("--path-cancer-matrix-segm", nargs='+', help="List of cohort cancer matrix files containing segment expression in [kmers x samples]", required=False, default=None)
- crf.add_argument("--path-cancer-matrix-edge", nargs='+', help="List of cohort cancer matrix files containing edge expression in [kmers x samples]", required=False, default=None)
- crf.add_argument("--cancer-support-union", help="Choose how to combine sample-expr and cohort-expr support in cancer. Request union, default intersection", action="store_true", required=False, default=False)
-
- more_backgrouds = parser_cancerspecif.add_argument_group('ADDITIONAL BACKGROUNDS: Other lists of kmers to be removed')
- more_backgrouds.add_argument("--path-normal-kmer-list", nargs='+', help="Provide a list of kmer to be used independantly as background. Format tsv: 1 file/folder allowed,'.tsv' in filenames, format parquet: one or multiple parquets with kmer in first column", required=False, default=None)
- more_backgrouds.add_argument("--uniprot", help="file containg uniprot k-mers. k-mers length should match the one of the cancer and normal files", required=False, default=None)
-
- more_filters = parser_cancerspecif.add_argument_group('ADDITIONAL FILTERS: Currently filters on the annotation status of the junction coordinate and the reading frame of the kmers are supported')
- more_filters.add_argument("--filterNeojuncCoord", choices=['C', 'N', 'A'], required=False, default='', help="Retain kmers generated from neojunctions i.e. whose junction coordinates are not found in the annotation. Values: 'C', 'N', 'A' to perform filtering in cancer, normal or both sets respectively")
- more_filters.add_argument("--filterAnnotatedRF", choices=['C', 'N', 'A'], required=False, default='', help="Retain kmers generated from annotated reading frames i.e. whose reading frames are taken from annotated transcript and not propagated through the graph. Values: 'C', 'N', 'A' to perform filtering in cancer, normal or both sets respectively")
-
- development = parser_cancerspecif.add_argument_group('DEVELOPMENT PARAMETERS')
- development.add_argument("--tot-batches", type=int, help="Filter foreground and in background kmers based on hash function. Set number of batches",required=False, default=None)
- development.add_argument("--batch-id", type=int, help="Filter foreground and in background kmers based on hash function. Set 0<= batch_id >>>>>>> Preprocessing libsizes")
@@ -69,58 +68,45 @@ def mode_cancerspecif(arg):
launch_preprocess_normal, path_normal_for_express_threshold, path_normal_for_sample_threshold, \
path_interm_kmers_annotOnly = check_interm_files(normal_out, arg.cohort_expr_support_normal,
arg.n_samples_lim_normal, tag='normals', batch_tag=batch_tag)
+
+ ### Preprocessing Cancers
+ if cancer_files:
+ cancer_matrix = apply_preprocess(spark, 'C', index_name, coord_name, jct_col, jct_annot_col, rf_annot_col,
+ arg.whitelist_cancer, arg.path_cancer_matrix_segm, arg.path_cancer_matrix_edge,
+ arg.filterNeojuncCoord, arg.filterAnnotatedRF, arg.tot_batches, arg.batch_id)
+
### Preprocessing Normals
- if (arg.path_normal_matrix_segm is not None) or (arg.path_normal_matrix_edge is not None):
+ if normal_files:
if launch_preprocess_normal: # else do not need to launch because intermediate files are present
logging.info("\n \n >>>>>>>> Preprocessing Normal samples")
- normal_segm = process_build_outputs(spark, index_name, coord_name, jct_col,
- jct_annot_col, rf_annot_col,
- path_matrix=arg.path_normal_matrix_segm,
- whitelist=arg.whitelist_normal,
- cross_junction=0,
- filterNeojuncCoord=True if (arg.filterNeojuncCoord == 'N')
- or (arg.filterNeojuncCoord == 'A') else False,
- filterAnnotatedRF=True if (arg.filterAnnotatedRF == 'N')
- or (arg.filterAnnotatedRF == 'A') else False,
- output_dir=normal_out,
- separate_back_annot=path_interm_kmers_annotOnly,
- tot_batches=arg.tot_batches, batch_id=arg.batch_id)
- normal_junc = process_build_outputs(spark, index_name, coord_name, jct_col,
- jct_annot_col, rf_annot_col,
- path_matrix=arg.path_normal_matrix_edge,
- whitelist=arg.whitelist_normal,
- cross_junction=1,
- filterNeojuncCoord=True if (arg.filterNeojuncCoord == 'N')
- or (arg.filterNeojuncCoord == 'A') else False,
- filterAnnotatedRF=True if (arg.filterAnnotatedRF == 'N')
- or (arg.filterAnnotatedRF == 'A') else False,
- output_dir=normal_out,
- separate_back_annot=path_interm_kmers_annotOnly if not normal_segm else None, # The kmer only from the annotation need be extracted only once. Ideally from the segment expression matrix
- tot_batches=arg.tot_batches, batch_id=arg.batch_id)
- normal_matrix = combine_normals(normal_segm, normal_junc)
-
-
+ normal_matrix = apply_preprocess(spark, 'N', index_name, coord_name, jct_col, jct_annot_col, rf_annot_col,
+ arg.whitelist_normal, arg.path_normal_matrix_segm, arg.path_normal_matrix_edge,
+ arg.filterNeojuncCoord, arg.filterAnnotatedRF, arg.tot_batches, arg.batch_id,
+ normal_out, path_interm_kmers_annotOnly)
+ if cancer_files and normal_matrix:
+ normal_matrix = normal_matrix.join(cancer_matrix.select([index_name]), ["kmer"], how='inner') #lightweight background
# Hard Filtering
- logging.info((f'\n \n >>>>>>>> Normals: Perform Hard Filtering \n '
- f'(expressed in {arg.n_samples_lim_normal} samples'
- f' with {arg.cohort_expr_support_normal} normalized counts'))
- logging.info("expression filter")
- inter_matrix_expr, inter_matrix_sample = filter_hard_threshold(normal_matrix, index_name, coord_name, jct_annot_col,
- rf_annot_col, libsize_n,
- arg.cohort_expr_support_normal,
- arg.n_samples_lim_normal,
- path_normal_for_express_threshold,
- path_normal_for_sample_threshold,
- on_the_fly=arg.on_the_fly, tag='normals')
-
- normal_matrix = combine_hard_threshold_normals(spark, path_normal_for_express_threshold,
- path_normal_for_sample_threshold,
- inter_matrix_expr, inter_matrix_sample,
- arg.n_samples_lim_normal, index_name)
-
- # Add back kmer annot
- normal_matrix = remove_external_kmer_list(spark, path_interm_kmers_annotOnly,
+ if recurrence_normal:
+ logging.info((f'\n \n >>>>>>>> Normals: Perform Hard Filtering \n '
+ f'(expressed in {arg.n_samples_lim_normal} samples'
+ f' with {arg.cohort_expr_support_normal} normalized counts'))
+ logging.info("expression filter")
+ inter_matrix_expr, inter_matrix_sample = filter_hard_threshold(normal_matrix, index_name, coord_name, jct_annot_col,
+ rf_annot_col, libsize_n,
+ arg.cohort_expr_support_normal,
+ arg.n_samples_lim_normal,
+ path_normal_for_express_threshold,
+ path_normal_for_sample_threshold,
+ on_the_fly=arg.on_the_fly, tag='normals')
+
+ normal_matrix = combine_hard_threshold_normals(spark, path_normal_for_express_threshold,
+ path_normal_for_sample_threshold,
+ inter_matrix_expr, inter_matrix_sample,
+ arg.n_samples_lim_normal, index_name)
+
+ # Add back kmer annot
+ normal_matrix = remove_external_kmer_list(spark, path_interm_kmers_annotOnly,
normal_matrix, index_name, header=True)
# Additional kmer backgrounds filtering
@@ -128,144 +114,106 @@ def mode_cancerspecif(arg):
normal_matrix = remove_external_kmer_list(spark, arg.path_normal_kmer_list, normal_matrix, index_name, header=True)
### Apply filtering to foreground
- for cix, cancer_sample_ori in enumerate(arg.ids_cancer_samples):
- cancer_sample = cancer_sample_ori.replace('-', '').replace('.', '').replace('_', '')
- mutation_mode = arg.mut_cancer_samples[cix]
-
- launch_filter_cancer, path_cancer_for_express_threshold, path_cancer_for_sample_threshold, _ \
- = check_interm_files(cancer_out, arg.cohort_expr_support_cancer, arg.n_samples_lim_cancer, \
- target_sample=cancer_sample, tag=f'cancer_{mutation_mode}', batch_tag=batch_tag)
-
- ## Cancer file checks
- if arg.path_cancer_matrix_segm or arg.path_cancer_matrix_edge or arg.paths_cancer_samples:
- logging.info("\n \n >>>>>>>> Preprocessing Cancer sample {} ".format(cancer_sample_ori))
- mutation_mode = arg.mut_cancer_samples[0]
-
- # Preprocess cancer samples
- cancer_segm = process_build_outputs(spark, index_name, coord_name, jct_col,
- jct_annot_col, rf_annot_col,
- path_matrix=arg.path_cancer_matrix_segm,
- whitelist=arg.whitelist_cancer,
- cross_junction=0,
- filterNeojuncCoord=True if (arg.filterNeojuncCoord == 'C')
- or (arg.filterNeojuncCoord == 'A')
- else False,
- filterAnnotatedRF=True if (arg.filterAnnotatedRF == 'C')
- or (arg.filterAnnotatedRF == 'A')
- else False,
- tot_batches=arg.tot_batches, batch_id=arg.batch_id)
- cancer_junc = process_build_outputs(spark, index_name, coord_name, jct_col,
- jct_annot_col, rf_annot_col,
- path_matrix=arg.path_cancer_matrix_edge,
- whitelist=arg.whitelist_cancer,
- cross_junction=1,
- filterNeojuncCoord=True if (arg.filterNeojuncCoord == 'C')
- or (arg.filterNeojuncCoord == 'A')
- else False,
- filterAnnotatedRF=True if (arg.filterAnnotatedRF == 'C')
- or (arg.filterAnnotatedRF == 'A')
- else False,
- tot_batches=arg.tot_batches, batch_id=arg.batch_id)
- cancer_matrix = combine_cancer(cancer_segm, cancer_junc, index_name)
-
- # cancer sample-specific filter
-
- cancer_sample_filter = cancer_matrix.select([index_name, coord_name, cancer_sample, jct_annot_col, rf_annot_col])
-
- cancer_sample_filter = filter_expr_kmer(cancer_sample_filter, cancer_sample, 0) #Keep kmers expressed
- # Counting step: Retrieve initial number of kmers in sample
- output_count(arg.output_count, cancer_sample_filter, report_count, report_steps, 'Init_cancer')
-
- if arg.output_count and (arg.sample_expr_support_cancer != 0):
- cancer_sample_filter = filter_expr_kmer(cancer_sample_filter, cancer_sample,
- arg.sample_expr_support_cancer, libsize_c) #Keep kmers expressed >= threshold
-
- # Counting step: Retrieve number of kmers in sample after filtering on cancer expression
- output_count(arg.output_count, cancer_sample_filter, report_count, report_steps, 'Filter_Sample')
-
+ if cancer_files:
+ if not arg.ids_cancer_samples:
+ arg.ids_cancer_samples = ["full_cohort"]
+ if arg.mut_cancer_samples:
+ mutation_mode = arg.mut_cancer_samples[0]
else:
- output_count(arg.output_count, cancer_sample_filter, report_count, report_steps, 'Filter_Sample')
-
- # cancer cross-cohort filter
- if (arg.cohort_expr_support_cancer is not None) and (arg.n_samples_lim_cancer is not None):
- inter_matrix_expr_c, inter_matrix_sample_c = filter_hard_threshold(cancer_matrix, index_name,coord_name,
- jct_annot_col, rf_annot_col,
- libsize_c,
- arg.cohort_expr_support_cancer,
- arg.n_samples_lim_cancer,
- path_cancer_for_express_threshold,
- path_cancer_for_sample_threshold,
- target_sample=cancer_sample,
- on_the_fly=arg.on_the_fly,
- tag=f'cancer_{mutation_mode}')
- cancer_cross_filter = combine_hard_threshold_cancers(spark, cancer_matrix,
- path_cancer_for_express_threshold,
- path_cancer_for_sample_threshold,
- inter_matrix_expr_c, inter_matrix_sample_c,
- arg.cohort_expr_support_cancer,
- arg.n_samples_lim_cancer,
- index_name)
-
- cancer_cross_filter = cancer_cross_filter.select([index_name, coord_name, cancer_sample,
- jct_annot_col, rf_annot_col])
- if arg.cancer_support_union:
- logging.info("support union")
- cancer_kmers = cancer_cross_filter.union(cancer_sample_filter).distinct()
+ mutation_mode = 'ref'
+ cancer_kmers = cancer_matrix
+ for cix, cancer_sample_ori in enumerate(arg.ids_cancer_samples):
+ if cancer_sample_ori is not "full_cohort":
+ ## Cancer file filters
+ logging.info("\n \n >>>>>>>> Preprocessing Cancer sample {} ".format(cancer_sample_ori))
+ mutation_mode = arg.mut_cancer_samples[0]
+ cancer_sample = cancer_sample_ori.replace('-', '').replace('.', '').replace('_', '')
+ mutation_mode = arg.mut_cancer_samples[cix]
+
+ # Filter sample-specific kmers
+ cancer_sample_filter = cancer_matrix.select([index_name, coord_name, cancer_sample, jct_annot_col, rf_annot_col])
+ cancer_sample_filter = filter_expr_kmer(cancer_sample_filter, cancer_sample, 0) #Keep kmers expressed
+ output_count(arg.output_count, cancer_sample_filter, report_count, report_steps, 'Init_cancer')
+
+ # Filter sample-specific kmers expressed higher than threshold
+ if arg.output_count and (arg.sample_expr_support_cancer != 0):
+ cancer_sample_filter = filter_expr_kmer(cancer_sample_filter, cancer_sample,
+ arg.sample_expr_support_cancer, libsize_c) #Keep kmers expressed >= threshold
+ output_count(arg.output_count, cancer_sample_filter, report_count, report_steps, 'Filter_Sample')
else:
- logging.info("support intersect")
- cancer_kmers = cancer_cross_filter.join(cancer_sample_filter.select([index_name]), ["kmer"],
- how='inner')
- else:
- cancer_kmers = cancer_sample_filter
-
-
- output_count(arg.output_count, cancer_kmers, report_count, report_steps, 'Filter_Sample_Cohort')
-
-
- ## Cancer \ normals
- logging.info("\n \n >>>>>>>> Cancers: Perform differential filtering")
- partitions_ = cancer_kmers.rdd.getNumPartitions()
- logging.info(f'partitions: {partitions_}')
-
- # outpaths
- base_path_final = os.path.join(arg.output_dir,
- (f'{arg.tag_prefix}{cancer_sample_ori}_{mutation_mode}_'
- f'SampleLim{arg.sample_expr_support_cancer}'
- f'CohortLim{arg.cohort_expr_support_cancer}'
- f'Across{arg.n_samples_lim_cancer}_'
- f'FiltNormals{arg.tag_normals}'
- f'Cohortlim{arg.cohort_expr_support_normal}'
- f'Across{arg.n_samples_lim_normal}'))
-
- path_filter_final = base_path_final + batch_tag + extension
- path_filter_final_uniprot = base_path_final + '_FiltUniprot'+ batch_tag + extension
-
- # Remove background from foreground
- logging.info("Filtering normal background")
- cancer_kmers = broadcast(cancer_kmers)
- cancer_kmers = cancer_kmers.join(normal_matrix, cancer_kmers["kmer"] == normal_matrix["kmer"],
- how='left_anti')
- #partitions_ = cancer_kmers.rdd.getNumPartitions()
- #logging.info(f'partitions: {partitions_}')
- save_spark(cancer_kmers, arg.output_dir, path_filter_final, outpartitions=arg.out_partitions)
- output_count(arg.output_count, cancer_kmers, report_count, report_steps,
- 'Filter_Sample_Cohort_CohortNormal')
-
- # Remove Uniprot
- if arg.uniprot is not None:
- logging.info("Filtering kmers in uniprot")
- cancer_kmers = remove_uniprot(spark, cancer_kmers, arg.uniprot, index_name)
- save_spark(cancer_kmers, arg.output_dir, path_filter_final_uniprot, outpartitions=arg.out_partitions)
+ output_count(arg.output_count, cancer_sample_filter, report_count, report_steps, 'Filter_Sample')
+
+ resave_interm_cancer, path_cancer_for_express_threshold, path_cancer_for_sample_threshold, _ \
+ = check_interm_files(cancer_out, arg.cohort_expr_support_cancer, arg.n_samples_lim_cancer, \
+ target_sample=cancer_sample, tag=f'cancer_{mutation_mode}',
+ batch_tag=batch_tag)
+ # Filter cross-cancer cohort
+ if recurrence_cancer:
+ if resave_interm_cancer:
+ inter_matrix_expr_c, inter_matrix_sample_c = filter_hard_threshold(cancer_matrix, index_name,
+ coord_name, jct_annot_col,
+ rf_annot_col, libsize_c,
+ arg.cohort_expr_support_cancer,
+ arg.n_samples_lim_cancer,
+ path_cancer_for_express_threshold,
+ path_cancer_for_sample_threshold,
+ target_sample=cancer_sample,
+ on_the_fly=arg.on_the_fly,
+ tag=f'cancer_{mutation_mode}')
+ else:
+ inter_matrix_expr_c, inter_matrix_sample_c = None, None
+ cancer_cross_filter = combine_hard_threshold_cancers(spark, cancer_matrix,
+ path_cancer_for_express_threshold,
+ path_cancer_for_sample_threshold,
+ inter_matrix_expr_c, inter_matrix_sample_c,
+ arg.cohort_expr_support_cancer,
+ arg.n_samples_lim_cancer,
+ index_name)
+
+ cancer_cross_filter = cancer_cross_filter.select([index_name, coord_name, cancer_sample,
+ jct_annot_col, rf_annot_col])
+ if arg.cancer_support_union:
+ logging.info("support union")
+ cancer_kmers = cancer_cross_filter.union(cancer_sample_filter).distinct()
+ else:
+ logging.info("support intersect")
+ cancer_kmers = cancer_cross_filter.join(cancer_sample_filter.select([index_name]), ["kmer"],
+ how='inner')
+ else:
+ cancer_kmers = cancer_sample_filter
+ output_count(arg.output_count, cancer_kmers, report_count, report_steps, 'Filter_Sample_Cohort')
+
+ # Outpaths
+ path_filter_final, path_filter_final_uniprot = filtered_path(arg, cancer_sample_ori, mutation_mode,
+ normal_files, batch_tag, extension)
+ # Remove Background
+ if normal_files:
+ logging.info("\n \n >>>>>>>> Cancers: Perform differential filtering")
+ partitions_ = cancer_kmers.rdd.getNumPartitions()
+ logging.info(f'partitions: {partitions_}')
+ logging.info("Filtering normal background")
+ cancer_kmers = broadcast(cancer_kmers)
+ cancer_kmers = cancer_kmers.join(normal_matrix, cancer_kmers["kmer"] == normal_matrix["kmer"],
+ how='left_anti')
+
+ # Save main output
+ save_spark(cancer_kmers, arg.output_dir, path_filter_final, outpartitions=arg.out_partitions)
output_count(arg.output_count, cancer_kmers, report_count, report_steps,
- 'Filter_Sample_Cohort_CohortNormal_Uniprot')
+ 'Filter_Sample_Cohort_CohortNormal')
+
+ # Remove Uniprot #TODO redondant with external database
+ if arg.uniprot is not None:
+ logging.info("Filtering kmers in uniprot")
+ cancer_kmers = remove_uniprot(spark, cancer_kmers, arg.uniprot, index_name)
+ save_spark(cancer_kmers, arg.output_dir, path_filter_final_uniprot, outpartitions=arg.out_partitions)
+ output_count(arg.output_count, cancer_kmers, report_count, report_steps,
+ 'Filter_Sample_Cohort_CohortNormal_Uniprot')
- save_output_count(arg.output_count, report_count, report_steps, arg.tag_normals,
- cancer_sample_ori, mutation_mode, arg.sample_expr_support_cancer,
- arg.cohort_expr_support_cancer, arg.n_samples_lim_cancer,
- arg.cohort_expr_support_normal, arg.n_samples_lim_normal, arg.tag_normals)
+ save_output_count(arg.output_count, report_count, report_steps, arg.tag_normals,
+ cancer_sample_ori, mutation_mode, arg.sample_expr_support_cancer,
+ arg.cohort_expr_support_cancer, arg.n_samples_lim_cancer,
+ arg.cohort_expr_support_normal, arg.n_samples_lim_normal, arg.tag_normals)
- #if arg.paths_cancer_samples and os.path.exists(cancer_path_tmp) and rename:
- # os.remove(cancer_path_tmp)
diff --git a/immunopepper/mode_pepQuery.py b/immunopepper/mode_pepQuery.py
index 8d22d0fb..70560454 100644
--- a/immunopepper/mode_pepQuery.py
+++ b/immunopepper/mode_pepQuery.py
@@ -129,7 +129,7 @@ def mode_pepquery(arg):
output_peptides_file_idx = [i + 1 for i, j in enumerate(args_list) if j == '-o']
output_peptides_file = args_list[output_peptides_file_idx[0]]
- if f'{output_peptides_file}psm_rank.txt' not in glob.glob(f'{output_peptides_file}*'):
+ if f'{output_peptides_file}/psm_rank.txt' not in glob.glob(f'{output_peptides_file}/*'):
logging.error(">>>>> PepQuery failed to generate the psm_rank.txt file. Please check the output directory. Maybe the provided spectra file does not contain any of the input peptides")
sys.exit(1)
else:
@@ -180,7 +180,7 @@ def mode_pepquery(arg):
ip_out["confident"] = pd.Categorical(ip_out["confident"], categories=confident_categories)
ip_out.sort_values(by=["confident", "score"], ascending=[True, False], inplace=True)
ip_out.to_csv(f'{arg.output_dir}/peptides_validated.tsv.gz', sep='\t', index=False, compression='gzip')
- logging.info(">>>>> Processed output file saved to {}peptides_validated.tsv.gz \n".format(arg.output_dir))
+ logging.info(">>>>> Processed output file saved to {}/peptides_validated.tsv.gz \n".format(arg.output_dir))
logging.info(">>>>> Finished running immunopepper in pepquery mode \n")
diff --git a/immunopepper/sdisk.py b/immunopepper/sdisk.py
new file mode 100644
index 00000000..d15010e7
--- /dev/null
+++ b/immunopepper/sdisk.py
@@ -0,0 +1,200 @@
+import logging
+import numpy as np
+import os
+import pathlib
+
+def save_spark(cancer_kmers, output_dir, path_final_fil, outpartitions=None):
+ '''
+ Saves a spark dataframe as a single or partitioned csv file
+ :param cancer_kmers: spark dataframe matrix with expression counts for cancer
+ :param output_dir: str path for output directory
+ :param path_final_fil: str path to save the spark dataframe
+ :param outpartitions: int number of partitions for saving
+ '''
+ # save
+ logging.info(f'>>>> Save to {path_final_fil}')
+ pathlib.Path(output_dir).mkdir(exist_ok=True, parents=True)
+ if outpartitions is not None:
+ cancer_kmers.repartition(outpartitions).write.mode('overwrite')\
+ .options(header="true", sep="\t", compression="gzip").format("tsv.gz").csv(path_final_fil)
+ else:
+ cancer_kmers.write.mode('overwrite')\
+ .options(header="true", sep="\t", compression="gzip").format("tsv.gz").csv(path_final_fil)
+
+
+def output_count(perform_count, matrix, report_count, report_step, step_string):
+ '''
+ Performs a count operation on the number of kmers present in spark dataframe after a given filtering step
+ Note: This operation is expensive but useful if the user is interested in intermediate filtering steps
+ :param perform_count: bool whether to perform a count operation
+ :param matrix: spark dataframe with kmer expression counts
+ :param report_count: list to store result of successive counting operations
+ :param report_step: list to store name of successive counting operations
+ :param step_string: str name of the counting operation
+ '''
+ if perform_count:
+ mycount = matrix.count()
+ report_count.append(mycount)
+ report_step.append(step_string)
+ logging.info(f'# {step_string} n = {mycount} kmers')
+
+
+def save_output_count(output_count, report_count, report_steps, prefix, cancer_sample_ori, mutation_mode,
+ sample_expr_support_cancer, cohort_expr_support_cancer, n_samples_lim_cancer,
+ cohort_expr_support_normal, n_samples_lim_normal, id_normals):
+ '''
+ Saves the number of kmers present in spark dataframe after each filtering step in a tabular file
+ :param output_count: str path for count file of intermediate filtering steps
+ :param report_count: list to store result of successive counting operations
+ :param report_step: list to store name of successive counting operations
+ :param prefix: str information to be added to the result line in an info column
+ :param cancer_sample_ori: str id of target cancer sample which was filtered
+ :param mutation_mode: str information about whether mutations where applied or not
+ :param sample_expr_support_cancer: float normalized expression threshold for the cancer target sample
+ :param cohort_expr_support_cancer: float normalized expression threshold for the cancer cohort
+ excluding the target sample
+ hich should be met in n samples
+ :param n_samples_lim_cancer: int number of cancer samples in which the cancer cohort expression threshold
+ should be met
+ :param cohort_expr_support_normal: float normalized expression threshold for the normal cohort
+ required in any sample (>=1)
+ :param n_samples_lim_normal: int number of normal samples in which any number of reads is required (>0)
+ :param id_normals: str id of the normal cohort (example gtex)
+ '''
+ if output_count:
+ header = (f'{"sample"}\t{"mutation_mode"}\t{"min_sample_reads"}\t{"#_of_cohort_samples"}\t'
+ f'{"reads_per_cohort_sample"}\t{"#_normal_samples_allowed"}\t{"normal_cohort_id"}'
+ f'\t{"reads_per_normal_sample"}')
+ line = (f'{cancer_sample_ori}\t{mutation_mode}\t{sample_expr_support_cancer}\t{n_samples_lim_cancer}'
+ f'\t{cohort_expr_support_cancer}\t{n_samples_lim_normal}\t{id_normals}'
+ f'\t{cohort_expr_support_normal}')
+
+ for idx in np.arange(len(report_count)):
+ header += f'\t{report_steps[idx]}'
+ line += f'\t{report_count[idx]}'
+ if prefix:
+ header += f'\t{"info"}'
+ line += f'\t{prefix}'
+ header += "\n"
+ line += "\n"
+ if not os.path.exists(output_count):
+ with open(output_count,"w") as f:
+ f.write(header)
+ with open(output_count, "a") as f:
+ f.write(line)
+ logging.info(f'Save intermediate info to {output_count}')
+
+
+def redirect_interm(interm_dir_norm, interm_dir_canc, output_dir):
+ '''
+ Set the directory to save intermediary file
+ - The output directory
+ - Any other specified normal or cancer directory
+ Default. Uses output directory
+ :param interm_dir_norm: str custom scatch dir path to save intermediate normal files
+ :param interm_dir_canc: str custom scatch dir path to save intermediate cancer files
+ :param output_dir: str output directory for the filtered matrix
+ :return:
+ '''
+
+ if interm_dir_canc:
+ cancer_out = interm_dir_canc
+ else:
+ cancer_out = output_dir
+ if interm_dir_norm:
+ normal_out = interm_dir_norm
+ else:
+ normal_out = output_dir
+ return normal_out, cancer_out
+
+
+def check_interm_files(out_dir, expr_limit, n_samples_lim, target_sample='', tag='normals', batch_tag=''):
+ '''
+ Filtering steps for normal (resp. cancer) samples are saved as intermediate files because it is an expensive operation
+ The function checks the presence of the intermediate filtering files to decide whether to perform
+ - the full preprocessing + threshold filtering steps
+ - or simply re-load the intermediate files
+ :param out_dir: str path for output directory
+ :param expr_limit: float expression limit threshold to keep a kmer
+ :param n_samples_lim: int number of samples that need to pass the expression limit
+ :param target_sample: str name of the sample of interest.
+ To be excluded in the number of samples that pass the expression limit
+ :param tag: str tag related to the type of samples. Example cancer or normal
+ :param batch_tag: str batch mode, batch tag to be appended to intermediate file
+ :returns:
+ - launch_preprocess: bool, whether to perform the full preprocessing + threshold filtering steps
+ or simply re-load the intermediate files
+ - path_interm_matrix_for_express_threshold, path_interm_matrix_for_sample_threshold,
+ path_interm_kmers_annotOnly are respectively the path (str) where
+ the expression-filtered matrix, the sample-filtered matrix and
+ the kmers derived solely from the annotation are saved
+ '''
+ base_n_samples = 1
+ base_expr = 0.0
+ format_tag = '.tsv.gz'
+ # For cancer matrix intermediate file the recurrence filter is not applied to the target sample
+ if target_sample:
+ suffix = f'Except{target_sample}'
+ else:
+ suffix = ''
+
+ # For normal samples the expression threshold filtering is not applied to the kmers found only in the annotation
+ # but not in the background samples. These kmers will be by default removed from the foreground matrix.
+ if tag == 'normals':
+ path_interm_kmers_annotOnly = os.path.join(out_dir, f'kmers_derived_solely_from_annotation{format_tag}')
+ else:
+ path_interm_kmers_annotOnly = None
+
+ # Saving paths
+
+ path_interm_matrix_for_express_threshold = os.path.join(out_dir,
+ f'interm_{tag}_combiExprCohortLim{expr_limit}Across{base_n_samples}{suffix}{batch_tag}{format_tag}')
+ path_interm_matrix_for_sample_threshold = os.path.join(out_dir,
+ f'interm_{tag}_combiExprCohortLim{base_expr}Across{base_n_samples}{suffix}{batch_tag}{format_tag}')
+ # Check existence
+ if (expr_limit and os.path.isfile(os.path.join(path_interm_matrix_for_express_threshold, '_SUCCESS'))) \
+ and (n_samples_lim is not None and os.path.isfile(os.path.join(path_interm_matrix_for_sample_threshold, '_SUCCESS'))):
+
+ logging.info((f'Intermediate {tag} filtering already performed in: {path_interm_matrix_for_express_threshold} '
+ f' and {path_interm_matrix_for_sample_threshold}. Re-loading {tag} intermediate data...'))
+ logging.info((f'Proceed with care! Using intermediate files means ignoring --filterNeojuncCoord, '
+ f'--filterAnnotatedRF parameter.'))
+ launch_preprocess = False
+ else:
+ logging.info(f'At least one intermediate {tag} filtering file is missing.')
+ logging.info(f'Will compute full filtering steps according to user input parameters')
+ launch_preprocess = True
+
+ return launch_preprocess, \
+ path_interm_matrix_for_express_threshold, \
+ path_interm_matrix_for_sample_threshold, \
+ path_interm_kmers_annotOnly
+
+
+
+def filtered_path(arg, cancer_sample_ori, mutation_mode, normal_files, batch_tag, extension):
+ '''
+ :param arg: argument class from argparse
+ :param cancer_sample_ori: str. name of the cancer sample
+ :param mutation_mode: str. mutation mode flag
+ :param normal_files: bool. whether the normal files are used as an input
+ :param batch_tag: str. tag for the batch
+ :param extension: str. saving format extension
+ :return:
+ path_filter_final
+ path_filter_final_uniprot
+ '''
+ base_path_final = os.path.join(arg.output_dir,
+ (f'{arg.tag_prefix}{cancer_sample_ori}_{mutation_mode}_'
+ f'SampleLim{arg.sample_expr_support_cancer}'
+ f'CohortLim{arg.cohort_expr_support_cancer}'
+ f'Across{arg.n_samples_lim_cancer}'))
+ if normal_files:
+ base_path_final += (f'_FiltNormals{arg.tag_normals}'
+ f'Cohortlim{arg.cohort_expr_support_normal}'
+ f'Across{arg.n_samples_lim_normal}')
+
+ path_filter_final = base_path_final + batch_tag + extension
+ path_filter_final_uniprot = base_path_final + '_FiltUniprot' + batch_tag + extension
+
+ return path_filter_final, path_filter_final_uniprot
\ No newline at end of file
diff --git a/immunopepper/sloaders.py b/immunopepper/sloaders.py
new file mode 100644
index 00000000..89952b4d
--- /dev/null
+++ b/immunopepper/sloaders.py
@@ -0,0 +1,244 @@
+import logging
+import numpy as np
+import pandas as pd
+import sys
+from pyspark.sql import functions as sf
+from immunopepper.spark import combine_cancer
+from immunopepper.spark import combine_normals
+from immunopepper.spark import split_only_found_annotation_backbone
+from immunopepper.spark import filter_on_junction_kmer_annotated_flag
+
+
+def apply_preprocess(spark, condition, index_name, coord_name, jct_col, jct_annot_col, rf_annot_col,
+ whitelist, path_segm, path_edge, flag_jct, flag_rf, tot_batches, batch_id,
+ normal_out=None, path_interm_kmers_annotOnly=None):
+ # Load cancer and normal kmers and join them
+ df_segm = process_build_outputs(spark, index_name, coord_name, jct_col,
+ jct_annot_col, rf_annot_col,
+ path_matrix=path_segm,
+ whitelist=whitelist,
+ cross_junction=0,
+ filterNeojuncCoord=True if (flag_jct == condition)
+ or (flag_jct == 'A')
+ else False,
+ filterAnnotatedRF=True if (flag_rf == condition)
+ or (flag_rf == 'A')
+ else False,
+ tot_batches=tot_batches,
+ batch_id=batch_id,
+ output_dir=normal_out
+ if normal_out is not None else None,
+ separate_back_annot=path_interm_kmers_annotOnly
+ if path_interm_kmers_annotOnly is not None else None
+ )
+
+ df_junc = process_build_outputs(spark, index_name, coord_name, jct_col,
+ jct_annot_col, rf_annot_col,
+ path_matrix=path_edge,
+ whitelist=whitelist,
+ cross_junction=1,
+ filterNeojuncCoord=True if (flag_jct == condition)
+ or (flag_jct == 'A')
+ else False,
+ filterAnnotatedRF=True if (flag_rf == condition)
+ or (flag_rf == 'A')
+ else False,
+ tot_batches=tot_batches,
+ batch_id=batch_id,
+ output_dir=normal_out
+ if normal_out is not None else None,
+ separate_back_annot=path_interm_kmers_annotOnly
+ if (path_interm_kmers_annotOnly is not None) and (not path_segm) #not done above
+ else None
+ )
+
+ if condition == 'C':
+ kmer_matrix = combine_cancer(df_segm, df_junc, index_name)
+ elif condition == 'N':
+ kmer_matrix = combine_normals(df_segm, df_junc)
+ return kmer_matrix
+
+
+def process_libsize(path_lib, custom_normalizer):
+ '''
+ Loads and returns the normalisation values per sample
+ Normalisation formulae: (count / 75 quantile expression in sample) * A
+ If no normalisation factor is provided: A = median across samples
+ If normalisation factor is provided: A = normalisation factor
+ :param path_lib: str path with library size file
+ :param custom_normalizer: custum normalisation factor
+ :return: dataframe with 75 quantile expression in sample / A values
+ '''
+ lib = pd.read_csv(path_lib, sep='\t')
+ if custom_normalizer:
+ lib['libsize_75percent'] = lib['libsize_75percent'] / custom_normalizer
+ else:
+ lib['libsize_75percent'] = lib['libsize_75percent'] / np.median(lib['libsize_75percent'])
+ lib['sample'] = [sample.replace('-', '').replace('.', '').replace('_','') for sample in lib['sample']]
+ lib = lib.set_index('sample')
+ return lib
+
+
+def process_build_outputs(spark, index_name, coord_name, jct_col, jct_annot_col, rf_annot_col,
+ path_matrix=None, whitelist=None, cross_junction=None,
+ filterNeojuncCoord=None, filterAnnotatedRF=None,
+ output_dir=None, separate_back_annot=None, tot_batches=None, batch_id=None):
+ '''
+ Various preprocessing steps including
+ - Loading of data from immunopepper mode build
+ - Separation of the kmers derived solely from the annotation (expression is null in samples)
+ - Filtering on junction and reading frame annotated status
+ - Select whitelist samples
+ :param spark: spark context
+ :param index_name: str kmer column name
+ :param jct_col: str junction column name
+ :param jct_annot_col: str junction annotated flag column name
+ :param rf_annot_col: str reading frame annotated flag column name
+ :param path_matrix: str path for multi-sample count matrix from immunopepper mode build: kmer|is_cross_junction|junctionAnnotated|readFrameAnnotated|sample_1|sample_2|...
+ :param path_kmer_file: str path for single sample kmer file from immunopepper mode build: kmer|id|segmentexpr|iscrossjunction|junctionexpr|junctionAnnotated|readFrameAnnotated|
+ :param col_expr_kmer_file: str name of the expression column of interest in path_kmer_file. e.g. 'segmentexpr' or 'junctionexpr'
+ :param target_sample: str sample for which the path_kmer_file has been computed
+ :param whitelist: list whitelist for samples
+ :param cross_junction: bool whether the count matrix contains junction counts or segment counts
+ :param annot_flag: list with the intruction codes on how to treat the reading frame and junction annotated flags
+ :param output_dir: str path for output directory
+ :param filterNeojuncCoord: bool if True, filter for kmers from neojunctions,
+ i.e. with non-annotated junction coordinates
+ :param filterAnnotatedRF: bool if True, filter for kmers from annotated reading frames,
+ i.e. with reading frames found in transcripts from the annotation, not propagated
+ :param separate_back_annot: str, if provided, the kmers derived from the annotation only (i.e. without read support
+ in any cohort sample) are saved into this path.
+ - The kmers are thereby skipped from expression threshold filtering in the background matrix.
+ - They will be by default removed from the foreground matrix as "annotated kmers"
+ :param tot_batches: int batch mode only, total number of batches
+ :param batch_id: int batch mode only, id of the batch
+ :return: Preprocessed spark dataframe
+ '''
+
+ def RenamedCols(df):
+ old_name = df.columns
+ new_names = [name_.replace('-', '').replace('.', '').replace('_', '') for name_ in old_name]
+ df = df.toDF(*new_names)
+ return df
+
+
+ def filter_whitelist(matrix, name_list, index_name, coord_name, jct_annot_col, rf_annot_col):
+ name_list.extend([index_name, coord_name, jct_annot_col, rf_annot_col])
+ return matrix.select([sf.col(name_) for name_ in name_list])
+
+
+ if (path_matrix is not None):
+ # Load immunopepper kmer candidates
+ rename = False # For development
+ logging.info(f'Load input {path_matrix}')
+ matrix = loader(spark, path_matrix, header=True)
+ partitions_ = matrix.rdd.getNumPartitions()
+ logging.info(f'...partitions: {partitions_}')
+
+ if rename:
+ logging.info("Rename")
+ matrix = RenamedCols(matrix)
+
+ # In batch mode: Dev remove kmers for the matrix based on the hash function
+ if tot_batches:
+ logging.info((f'Filter foreground and background based on Hash ; '
+ f'making {tot_batches} batches, select batch number {batch_id}'))
+
+ matrix = matrix.filter(sf.abs(sf.hash('kmer') % tot_batches) == batch_id)
+
+ # Filter on junction status depending on the content on the matrix
+ matrix = matrix.drop(jct_col)
+
+ # Separate kmers only present in the backbone annotation from the ones supported by the reads of any sample
+ partitions_ = matrix.rdd.getNumPartitions()
+ logging.info(f'...partitions: {partitions_}')
+ if separate_back_annot:
+ logging.info("Isolating kmers only in backbone annotation")
+ matrix = split_only_found_annotation_backbone(separate_back_annot, output_dir, matrix, index_name,
+ jct_annot_col, rf_annot_col)
+
+ # Filter according to annotation flag
+ matrix = filter_on_junction_kmer_annotated_flag(matrix, jct_annot_col, rf_annot_col,
+ filterNeojuncCoord, filterAnnotatedRF)
+
+ # Reduce samples (columns) to whitelist
+ logging.info("Cast types")
+ if whitelist is not None:
+ whitelist = pd.read_csv(whitelist, sep='\t', header=None)[0].to_list()
+ whitelist = [name_.replace('-', '').replace('.', '').replace('_', '') for name_ in whitelist]
+ matrix = filter_whitelist(matrix, whitelist, index_name, coord_name, jct_annot_col, rf_annot_col)
+
+ partitions_ = matrix.rdd.getNumPartitions()
+ logging.info(f'...partitions: {partitions_}')
+ return matrix
+ else:
+ return None
+
+
+def remove_external_kmer_list(spark, path_external_kmer_database, spark_matrix, index_name, header=False):
+ '''
+ Takes an expression matrix of the format [kmers] x [samples, metadata] and removes an external database
+ :param spark: spark context
+ :param path_normal_kmer_list: str. path of file containing kmers to be removed
+ :param spark_matrix: str path for count matrix
+ :param index_name: str kmer column name
+ :param header: bool. whether the file contains a header
+ :return: Filtered matrix for external kmer database
+ '''
+ logging.info("Load input {}".format(path_external_kmer_database))
+ external_database = loader(spark, path_external_kmer_database, header=header)
+ external_database = external_database.select(sf.col(index_name))
+ if spark_matrix:
+ spark_matrix = spark_matrix.union(external_database)
+ else:
+ spark_matrix = external_database
+ return spark_matrix
+
+
+def loader(spark, path_kmer, header=False):
+ '''
+ Loads a parquet, csv or tsv (partitioned) file as a spark dataframe
+ The user can provide either
+ - a list of files with the same schema
+ - a string. folder containing the files
+ :param spark: spark context
+ :param path_kmer: Either (1) a list of files or (b) a string with the basefolder where the files are placed
+ :param header: bool. whether the file contains a header
+ :return: loaded spark dataframe
+
+ '''
+ if ('parquet' in path_kmer) or ('pq' in path_kmer): #single file or folder
+ matrix = spark.read.parquet(path_kmer)
+ elif ('parquet' in path_kmer[0]) or ('pq' in path_kmer[0]): # list
+ matrix = spark.read.parquet(*path_kmer)
+ elif ('csv' in path_kmer[0]) or ('csv' in path_kmer):
+ matrix = spark.read.csv(path_kmer, sep=',', header=header)
+ elif ('tsv' in path_kmer[0]) or ('tsv' in path_kmer):
+ matrix = spark.read.csv(path_kmer, sep=r'\t', header=header)
+ elif ('gzip' in path_kmer[0]) or ('gzip' in path_kmer): #If tsv or csv not in path -> assume gzip are tab separated
+ matrix = spark.read.csv(path_kmer, sep=r'\t', header=header)
+ elif ('gz' in path_kmer[0]) or ('gz' in path_kmer):
+ matrix = spark.read.csv(path_kmer, sep=r'\t', header=header)
+ else:
+ logging.error(f'Cannot determine file type of {path_kmer}. Please include .parquet .pq .tsv .csv suffix to the files or the folder (partitionned)')
+ sys.exit()
+
+ return matrix
+
+
+def inputs_to_modes(arg):
+ recurrence_cancer = False
+ recurrence_normal = False
+ cancer_files = False
+ normal_files = False
+
+ if (arg.path_normal_matrix_segm is not None) or (arg.path_normal_matrix_edge is not None):
+ recurrence_normal = True
+ normal_files = True
+ if arg.path_normal_kmer_list is not None:
+ normal_files = True
+ if arg.path_cancer_matrix_segm or arg.path_cancer_matrix_edge:
+ cancer_files = True
+ if (arg.cohort_expr_support_cancer is not None) and (arg.n_samples_lim_cancer is not None):
+ recurrence_cancer = True
+ return recurrence_cancer, recurrence_normal, cancer_files, normal_files
diff --git a/immunopepper/spark.py b/immunopepper/spark.py
index c4c4cedc..a32183a5 100644
--- a/immunopepper/spark.py
+++ b/immunopepper/spark.py
@@ -1,173 +1,10 @@
import logging
-import numpy as np
import os
-import pandas as pd
-import pathlib
-import sys
+from immunopepper.sdisk import save_spark
from pyspark.sql import functions as sf
from pyspark.sql import types as st
-def process_libsize(path_lib, custom_normalizer):
- '''
- Loads and returns the normalisation values per sample
- Normalisation formulae: (count / 75 quantile expression in sample) * A
- If no normalisation factor is provided: A = median across samples
- If normalisation factor is provided: A = normalisation factor
- :param path_lib: str path with library size file
- :param custom_normalizer: custum normalisation factor
- :return: dataframe with 75 quantile expression in sample / A values
- '''
- lib = pd.read_csv(path_lib, sep='\t')
- if custom_normalizer:
- lib['libsize_75percent'] = lib['libsize_75percent'] / custom_normalizer
- else:
- lib['libsize_75percent'] = lib['libsize_75percent'] / np.median(lib['libsize_75percent'])
- lib['sample'] = [sample.replace('-', '').replace('.', '').replace('_','') for sample in lib['sample']]
- lib = lib.set_index('sample')
- return lib
-
-
-def process_build_outputs(spark, index_name, coord_name, jct_col, jct_annot_col, rf_annot_col,
- path_matrix=None, whitelist=None, cross_junction=None,
- filterNeojuncCoord=None, filterAnnotatedRF=None,
- output_dir=None, separate_back_annot=None, tot_batches=None, batch_id=None):
- '''
- Various preprocessing steps including
- - Loading of data from immunopepper mode build
- - Separation of the kmers derived solely from the annotation (expression is null in samples)
- - Filtering on junction and reading frame annotated status
- - Select whitelist samples
- :param spark: spark context
- :param index_name: str kmer column name
- :param jct_col: str junction column name
- :param jct_annot_col: str junction annotated flag column name
- :param rf_annot_col: str reading frame annotated flag column name
- :param path_matrix: str path for multi-sample count matrix from immunopepper mode build: kmer|is_cross_junction|junctionAnnotated|readFrameAnnotated|sample_1|sample_2|...
- :param path_kmer_file: str path for single sample kmer file from immunopepper mode build: kmer|id|segmentexpr|iscrossjunction|junctionexpr|junctionAnnotated|readFrameAnnotated|
- :param col_expr_kmer_file: str name of the expression column of interest in path_kmer_file. e.g. 'segmentexpr' or 'junctionexpr'
- :param target_sample: str sample for which the path_kmer_file has been computed
- :param whitelist: list whitelist for samples
- :param cross_junction: bool whether the count matrix contains junction counts or segment counts
- :param annot_flag: list with the intruction codes on how to treat the reading frame and junction annotated flags
- :param output_dir: str path for output directory
- :param filterNeojuncCoord: bool if True, filter for kmers from neojunctions,
- i.e. with non-annotated junction coordinates
- :param filterAnnotatedRF: bool if True, filter for kmers from annotated reading frames,
- i.e. with reading frames found in transcripts from the annotation, not propagated
- :param separate_back_annot: str, if provided, the kmers derived from the annotation only (i.e. without read support
- in any cohort sample) are saved into this path.
- - The kmers are thereby skipped from expression threshold filtering in the background matrix.
- - They will be by default removed from the foreground matrix as "annotated kmers"
- :param tot_batches: int batch mode only, total number of batches
- :param batch_id: int batch mode only, id of the batch
- :return: Preprocessed spark dataframe
- '''
-
- def RenamedCols(df):
- old_name = df.columns
- new_names = [name_.replace('-', '').replace('.', '').replace('_', '') for name_ in old_name]
- df = df.toDF(*new_names)
- return df
-
-
- def filter_whitelist(matrix, name_list, index_name, coord_name, jct_annot_col, rf_annot_col):
- name_list.extend([index_name, coord_name, jct_annot_col, rf_annot_col])
- return matrix.select([sf.col(name_) for name_ in name_list])
-
-
- if (path_matrix is not None):
- # Load immunopepper kmer candidates
- rename = False # For development
- logging.info(f'Load input {path_matrix}')
- matrix = loader(spark, path_matrix, header=True)
- partitions_ = matrix.rdd.getNumPartitions()
- logging.info(f'...partitions: {partitions_}')
-
- if rename:
- logging.info("Rename")
- matrix = RenamedCols(matrix)
-
- # In batch mode: Dev remove kmers for the matrix based on the hash function
- if tot_batches:
- logging.info((f'Filter foreground and background based on Hash ; '
- f'making {tot_batches} batches, select batch number {batch_id}'))
-
- matrix = matrix.filter(sf.abs(sf.hash('kmer') % tot_batches) == batch_id)
-
- # Filter on junction status depending on the content on the matrix
- matrix = matrix.drop(jct_col)
-
- # Separate kmers only present in the backbone annotation from the ones supported by the reads of any sample
- partitions_ = matrix.rdd.getNumPartitions()
- logging.info(f'...partitions: {partitions_}')
- if separate_back_annot:
- logging.info("Isolating kmers only in backbone annotation")
- matrix = split_only_found_annotation_backbone(separate_back_annot, output_dir, matrix, index_name,
- jct_annot_col, rf_annot_col)
-
- # Filter according to annotation flag
- matrix = filter_on_junction_kmer_annotated_flag(matrix, jct_annot_col, rf_annot_col,
- filterNeojuncCoord, filterAnnotatedRF)
-
- # Reduce samples (columns) to whitelist
- logging.info("Cast types")
- if whitelist is not None:
- whitelist = pd.read_csv(whitelist, sep='\t', header=None)[0].to_list()
- whitelist = [name_.replace('-', '').replace('.', '').replace('_', '') for name_ in whitelist]
- matrix = filter_whitelist(matrix, whitelist, index_name, coord_name, jct_annot_col, rf_annot_col)
-
- partitions_ = matrix.rdd.getNumPartitions()
- logging.info(f'...partitions: {partitions_}')
- return matrix
- else:
- return None
-
-
-def split_only_found_annotation_backbone(separate_back_annot, output_dir, matrix, index_name,
- jct_annot_col, rf_annot_col):
- '''
- Separate kmers only present in the backbone annotation from the ones supported by the reads of any sample:
- A kmer is solely derived from the backbone annotation if (all samples have zero expression)
- The opposite condition is implemented to make use of short-circuit evaluation
-
- :param matrix: spark dataframe matrix to filter
- :param index_name: str kmer column name
- :param jct_annot_col: string junction is annotated column
- :param rf_annot_col: string reading frame is annotated column
- :return: spark matrix with expressed kmers, spark serie with kmers only in backbone annotation
- '''
- expressed = ' OR '.join( [f'({col_name} != 0.0)'
- for col_name in matrix.schema.names
- if col_name not in [index_name, jct_annot_col, rf_annot_col]]) # SQL style because many cols
- matrix = matrix.withColumn('expressed',
- sf.when(sf.expr(f"{expressed}"), True).otherwise(False))
-
- kmers_AnnotOnly = matrix.select(index_name).where(sf.col('expressed') == False)
- matrix_Expressed = matrix.filter(sf.col('expressed') == True)
- matrix_Expressed = matrix_Expressed.drop(sf.col('expressed'))
- save_spark(kmers_AnnotOnly, output_dir, separate_back_annot)
- return matrix_Expressed
-
-
-def filter_on_junction_kmer_annotated_flag(matrix, jct_annot_col, rf_annot_col, filterNeojuncCoord, filterAnnotatedRF):
- '''
- Filters according to the junction and kmer annotated flag
- :param matrix: spark dataframe matrix to filter
- :param jct_annot_col: string junction is annotated column
- :param rf_annot_col: string reading frame is annotated column
- :param filterNeojuncCoord: bool if True, filter for kmers from neojunctions,
- i.e. with non-annotated junction coordinates
- :param filterAnnotatedRF: bool if True, filter for kmers from annotated reading frames,
- i.e. with reading frames found in transcripts from the annotation, not propagated
- '''
- # Keep k-mers according to annotation flag
- if filterNeojuncCoord:
- matrix = matrix.filter(f'({jct_annot_col} == {False})')
- if filterAnnotatedRF:
- matrix = matrix.filter(f'({rf_annot_col} == {True})')
- return matrix
-
def combine_normals(normal_segm, normal_junc):
''' Given two matrices [kmers] x [samples, metadata] containing segment expression and junction expression,
@@ -186,69 +23,6 @@ def combine_normals(normal_segm, normal_junc):
return normal_segm
-def check_interm_files(out_dir, expr_limit, n_samples_lim, target_sample='', tag='normals', batch_tag=''):
- '''
- Filtering steps for normal (resp. cancer) samples are saved as intermediate files because it is an expensive operation
- The function checks the presence of the intermediate filtering files to decide whether to perform
- - the full preprocessing + threshold filtering steps
- - or simply re-load the intermediate files
- :param out_dir: str path for output directory
- :param expr_limit: float expression limit threshold to keep a kmer
- :param n_samples_lim: int number of samples that need to pass the expression limit
- :param target_sample: str name of the sample of interest.
- To be excluded in the number of samples that pass the expression limit
- :param tag: str tag related to the type of samples. Example cancer or normal
- :param batch_tag: str batch mode, batch tag to be appended to intermediate file
- :returns:
- - launch_preprocess: bool, whether to perform the full preprocessing + threshold filtering steps
- or simply re-load the intermediate files
- - path_interm_matrix_for_express_threshold, path_interm_matrix_for_sample_threshold,
- path_interm_kmers_annotOnly are respectively the path (str) where
- the expression-filtered matrix, the sample-filtered matrix and
- the kmers derived solely from the annotation are saved
- '''
- base_n_samples = 1
- base_expr = 0.0
- format_tag = '.tsv.gz'
- # For cancer matrix intermediate file the recurrence filter is not applied to the target sample
- if target_sample:
- suffix = f'Except{target_sample}'
- else:
- suffix = ''
-
- # For normal samples the expression threshold filtering is not applied to the kmers found only in the annotation
- # but not in the background samples. These kmers will be by default removed from the foreground matrix.
- if tag == 'normals':
- path_interm_kmers_annotOnly = os.path.join(out_dir, f'kmers_derived_solely_from_annotation{format_tag}')
- else:
- path_interm_kmers_annotOnly = None
-
- # Saving paths
-
- path_interm_matrix_for_express_threshold = os.path.join(out_dir,
- f'interm_{tag}_combiExprCohortLim{expr_limit}Across{base_n_samples}{suffix}{batch_tag}{format_tag}')
- path_interm_matrix_for_sample_threshold = os.path.join(out_dir,
- f'interm_{tag}_combiExprCohortLim{base_expr}Across{base_n_samples}{suffix}{batch_tag}{format_tag}')
- # Check existence
- if (expr_limit and os.path.isfile(os.path.join(path_interm_matrix_for_express_threshold, '_SUCCESS'))) \
- and (n_samples_lim is not None and os.path.isfile(os.path.join(path_interm_matrix_for_sample_threshold, '_SUCCESS'))):
-
- logging.info((f'Intermediate {tag} filtering already performed in: {path_interm_matrix_for_express_threshold} '
- f' and {path_interm_matrix_for_sample_threshold}. Re-loading {tag} intermediate data...'))
- logging.info((f'Proceed with care! Using intermediate files means ignoring --filterNeojuncCoord, '
- f'--filterAnnotatedRF parameter.'))
- launch_preprocess = False
- else:
- logging.info(f'At least one intermediate {tag} filtering file is missing.')
- logging.info(f'Will compute full filtering steps according to user input parameters')
- launch_preprocess = True
-
- return launch_preprocess, \
- path_interm_matrix_for_express_threshold, \
- path_interm_matrix_for_sample_threshold, \
- path_interm_kmers_annotOnly
-
-
def filter_hard_threshold(matrix, index_name, coord_name, jct_annot_col, rf_annot_col, libsize,
expr_limit, n_samples_lim, path_e, path_s, target_sample='',
tag='normals', on_the_fly=False):
@@ -332,8 +106,6 @@ def filter_hard_threshold(matrix, index_name, coord_name, jct_annot_col, rf_anno
return matrix_e, matrix_s
-
-
def combine_hard_threshold_normals(spark, path_normal_kmers_e, path_normal_kmers_s,
normal_matrix_e, normal_matrix_s,
n_samples_lim_normal, index_name):
@@ -490,26 +262,6 @@ def combine_cancer(cancer_kmers_segm, cancer_kmers_edge, index_name):
return cancer_kmers_segm
-def remove_external_kmer_list(spark, path_external_kmer_database, spark_matrix, index_name, header=False):
- '''
- Takes an expression matrix of the format [kmers] x [samples, metadata] and removes an external database
- :param spark: spark context
- :param path_normal_kmer_list: str. path of file containing kmers to be removed
- :param spark_matrix: str path for count matrix
- :param index_name: str kmer column name
- :param header: bool. whether the file contains a header
- :return: Filtered matrix for external kmer database
- '''
- logging.info("Load input {}".format(path_external_kmer_database))
- external_database = loader(spark, path_external_kmer_database, header=header)
- external_database = external_database.select(sf.col(index_name))
- if spark_matrix:
- spark_matrix = spark_matrix.union(external_database)
- else:
- spark_matrix = external_database
- return spark_matrix
-
-
def remove_uniprot(spark, cancer_kmers, uniprot, index_name):
'''
Filters a spark dataframe against uniprot or any peptide database.
@@ -536,138 +288,51 @@ def I_L_replace(value):
return cancer_kmers
-def save_spark(cancer_kmers, output_dir, path_final_fil, outpartitions=None):
+def filter_on_junction_kmer_annotated_flag(matrix, jct_annot_col, rf_annot_col, filterNeojuncCoord, filterAnnotatedRF):
'''
- Saves a spark dataframe as a single or partitioned csv file
- :param cancer_kmers: spark dataframe matrix with expression counts for cancer
- :param output_dir: str path for output directory
- :param path_final_fil: str path to save the spark dataframe
- :param outpartitions: int number of partitions for saving
+ Filters according to the junction and kmer annotated flag
+ :param matrix: spark dataframe matrix to filter
+ :param jct_annot_col: string junction is annotated column
+ :param rf_annot_col: string reading frame is annotated column
+ :param filterNeojuncCoord: bool if True, filter for kmers from neojunctions,
+ i.e. with non-annotated junction coordinates
+ :param filterAnnotatedRF: bool if True, filter for kmers from annotated reading frames,
+ i.e. with reading frames found in transcripts from the annotation, not propagated
'''
- # save
- logging.info(f'>>>> Save to {path_final_fil}')
- pathlib.Path(output_dir).mkdir(exist_ok=True, parents=True)
- if outpartitions is not None:
- cancer_kmers.repartition(outpartitions).write.mode('overwrite')\
- .options(header="true", sep="\t", compression="gzip").format("tsv.gz").csv(path_final_fil)
- else:
- cancer_kmers.write.mode('overwrite')\
- .options(header="true", sep="\t", compression="gzip").format("tsv.gz").csv(path_final_fil)
+ # Keep k-mers according to annotation flag
+ if filterNeojuncCoord:
+ matrix = matrix.filter(f'({jct_annot_col} == {False})')
+ if filterAnnotatedRF:
+ matrix = matrix.filter(f'({rf_annot_col} == {True})')
+ return matrix
-def loader(spark, path_kmer, header=False):
+def split_only_found_annotation_backbone(separate_back_annot, output_dir, matrix, index_name,
+ jct_annot_col, rf_annot_col):
'''
- Loads a parquet, csv or tsv (partitioned) file as a spark dataframe
- The user can provide either
- - a list of files with the same schema
- - a string. folder containing the files
- :param spark: spark context
- :param path_kmer: Either (1) a list of files or (b) a string with the basefolder where the files are placed
- :param header: bool. whether the file contains a header
- :return: loaded spark dataframe
+ Separate kmers only present in the backbone annotation from the ones supported by the reads of any sample:
+ A kmer is solely derived from the backbone annotation if (all samples have zero expression)
+ The opposite condition is implemented to make use of short-circuit evaluation
+ :param matrix: spark dataframe matrix to filter
+ :param index_name: str kmer column name
+ :param jct_annot_col: string junction is annotated column
+ :param rf_annot_col: string reading frame is annotated column
+ :return: spark matrix with expressed kmers, spark serie with kmers only in backbone annotation
'''
- if ('parquet' in path_kmer) or ('pq' in path_kmer): #single file or folder
- matrix = spark.read.parquet(path_kmer)
- elif ('parquet' in path_kmer[0]) or ('pq' in path_kmer[0]): # list
- matrix = spark.read.parquet(*path_kmer)
- elif ('csv' in path_kmer[0]) or ('csv' in path_kmer):
- matrix = spark.read.csv(path_kmer, sep=',', header=header)
- elif ('tsv' in path_kmer[0]) or ('tsv' in path_kmer):
- matrix = spark.read.csv(path_kmer, sep=r'\t', header=header)
- elif ('gzip' in path_kmer[0]) or ('gzip' in path_kmer): #If tsv or csv not in path -> assume gzip are tab separated
- matrix = spark.read.csv(path_kmer, sep=r'\t', header=header)
- elif ('gz' in path_kmer[0]) or ('gz' in path_kmer):
- matrix = spark.read.csv(path_kmer, sep=r'\t', header=header)
- else:
- logging.error(f'Cannot determine file type of {path_kmer}. Please include .parquet .pq .tsv .csv suffix to the files or the folder (partitionned)')
- sys.exit()
-
+ expressed = ' OR '.join( [f'({col_name} != 0.0)'
+ for col_name in matrix.schema.names
+ if col_name not in [index_name, jct_annot_col, rf_annot_col]]) # SQL style because many cols
+ matrix = matrix.withColumn('expressed',
+ sf.when(sf.expr(f"{expressed}"), True).otherwise(False))
- return matrix
+ kmers_AnnotOnly = matrix.select(index_name).where(sf.col('expressed') == False)
+ matrix_Expressed = matrix.filter(sf.col('expressed') == True)
+ matrix_Expressed = matrix_Expressed.drop(sf.col('expressed'))
+ save_spark(kmers_AnnotOnly, output_dir, separate_back_annot)
+ return matrix_Expressed
-def output_count(perform_count, matrix, report_count, report_step, step_string):
- '''
- Performs a count operation on the number of kmers present in spark dataframe after a given filtering step
- Note: This operation is expensive but useful if the user is interested in intermediate filtering steps
- :param perform_count: bool whether to perform a count operation
- :param matrix: spark dataframe with kmer expression counts
- :param report_count: list to store result of successive counting operations
- :param report_step: list to store name of successive counting operations
- :param step_string: str name of the counting operation
- '''
- if perform_count:
- mycount = matrix.count()
- report_count.append(mycount)
- report_step.append(step_string)
- logging.info(f'# {step_string} n = {mycount} kmers')
-def save_output_count(output_count, report_count, report_steps, prefix, cancer_sample_ori, mutation_mode,
- sample_expr_support_cancer, cohort_expr_support_cancer, n_samples_lim_cancer,
- cohort_expr_support_normal, n_samples_lim_normal, id_normals):
- '''
- Saves the number of kmers present in spark dataframe after each filtering step in a tabular file
- :param output_count: str path for count file of intermediate filtering steps
- :param report_count: list to store result of successive counting operations
- :param report_step: list to store name of successive counting operations
- :param prefix: str information to be added to the result line in an info column
- :param cancer_sample_ori: str id of target cancer sample which was filtered
- :param mutation_mode: str information about whether mutations where applied or not
- :param sample_expr_support_cancer: float normalized expression threshold for the cancer target sample
- :param cohort_expr_support_cancer: float normalized expression threshold for the cancer cohort
- excluding the target sample
- hich should be met in n samples
- :param n_samples_lim_cancer: int number of cancer samples in which the cancer cohort expression threshold
- should be met
- :param cohort_expr_support_normal: float normalized expression threshold for the normal cohort
- required in any sample (>=1)
- :param n_samples_lim_normal: int number of normal samples in which any number of reads is required (>0)
- :param id_normals: str id of the normal cohort (example gtex)
- '''
- if output_count:
- header = (f'{"sample"}\t{"mutation_mode"}\t{"min_sample_reads"}\t{"#_of_cohort_samples"}\t'
- f'{"reads_per_cohort_sample"}\t{"#_normal_samples_allowed"}\t{"normal_cohort_id"}'
- f'\t{"reads_per_normal_sample"}')
- line = (f'{cancer_sample_ori}\t{mutation_mode}\t{sample_expr_support_cancer}\t{n_samples_lim_cancer}'
- f'\t{cohort_expr_support_cancer}\t{n_samples_lim_normal}\t{id_normals}'
- f'\t{cohort_expr_support_normal}')
-
- for idx in np.arange(len(report_count)):
- header += f'\t{report_steps[idx]}'
- line += f'\t{report_count[idx]}'
- if prefix:
- header += f'\t{"info"}'
- line += f'\t{prefix}'
- header += "\n"
- line += "\n"
- if not os.path.exists(output_count):
- with open(output_count,"w") as f:
- f.write(header)
- with open(output_count, "a") as f:
- f.write(line)
- logging.info(f'Save intermediate info to {output_count}')
-
-
-def redirect_interm(interm_dir_norm, interm_dir_canc, output_dir):
- '''
- Set the directory to save intermediary file
- - The output directory
- - Any other specified normal or cancer directory
- Default. Uses output directory
- :param interm_dir_norm: str custom scatch dir path to save intermediate normal files
- :param interm_dir_canc: str custom scatch dir path to save intermediate cancer files
- :param output_dir: str output directory for the filtered matrix
- :return:
- '''
- if interm_dir_canc:
- cancer_out = interm_dir_canc
- else:
- cancer_out = output_dir
- if interm_dir_norm:
- normal_out = interm_dir_norm
- else:
- normal_out = output_dir
- return normal_out, cancer_out
diff --git a/tests/test_struct.py b/tests/test_struct.py
index 973ae3c2..9ddffc39 100644
--- a/tests/test_struct.py
+++ b/tests/test_struct.py
@@ -93,7 +93,7 @@ def test_end_to_end_cancerspecif_mx(outdir):
#'--on-the-fly',
#"--interm-dir-norm", os.path.join(outdir, "test_inter_normal"),
#"--interm-dir-canc", os.path.join(outdir, "test_inter_cancer"),
- '--whitelist-cancer', os.path.join(basedir, 'cancer', 'whitelist_cancer.txt'),
+ #'--whitelist-cancer', os.path.join(basedir, 'cancer', 'whitelist_cancer.txt'),
"--uniprot", os.path.join(basedir, 'uniprot.txt'),
"--parallelism", "3",
"--out-partitions", "2",
@@ -104,6 +104,58 @@ def test_end_to_end_cancerspecif_mx(outdir):
+#immunopepper cancerspecif --cores 2 --mem-per-core 2048 --parallelism 3 --kmer 9 --output-dir immunopepper_usecase/filter_case/
+# --interm-dir-norm immunopepper_usecase/filter_case --interm-dir-canc immunopepper_usecase/filter_case
+# --ids-cancer-samples simulated_Ipp_1_sample3 --mut-cancer-samples ref
+# --output-count immunopepper_usecase/filter_case/output-count.txt
+# --path-normal-matrix-edge immunopepper/tests/data_simulated/data/cancerspecif_mode/ref_graph_kmer_NormalExpr/normal_junctions.gz
+# --n-samples-lim-normal 15 --cohort-expr-support-normal 100 --sample-expr-support-cancer 20 --cohort-expr-support-cancer 110
+# --n-samples-lim-cancer 2
+# --path-cancer-matrix-edge immunopepper/tests/data_simulated/data/cancerspecif_mode/ref_graph_kmer_CancerExpr/cancer_junctions.gz
+# --filterNeojuncCoord C --verbose 1
+
+def test_end_to_end_cancerspecif_small_data(outdir):
+ basedir = os.path.join(os.path.dirname(os.path.dirname(__file__)), 'immunopepper_usecase/filter_case')
+ datadir = os.path.join(os.path.dirname(__file__), 'data_simulated')
+ old_dir = os.path.join(os.path.dirname(__file__), 'filter_case')
+ if not os.path.exists(os.path.join(outdir, "test_inter_normal")):
+ os.mkdir(os.path.join(outdir, "test_inter_normal"))
+ if not os.path.exists(os.path.join(outdir, "test_inter_cancer")):
+ os.mkdir(os.path.join(outdir, "test_inter_cancer"))
+ my_args =["cancerspecif",
+ "--cores", "2",
+ "--mem-per-core", "2048",
+ "--kmer", "9",
+ "--path-normal-matrix-edge", os.path.join(datadir, 'data/cancerspecif_mode/ref_graph_kmer_NormalExpr/normal_junctions.gz'),
+ "--path-cancer-matrix-edge", os.path.join(datadir, 'data/cancerspecif_mode/ref_graph_kmer_CancerExpr/cancer_junctions.gz'),
+ #"--path-normal-kmer-list", os.path.join(old_dir, 'normal', 'simple_annotation.pq'),
+
+ '--ids-cancer-samples', "simulated_Ipp_1_sample3",
+
+ "--output-dir", os.path.join(outdir, 'filter_out'),
+ "--output-count", os.path.join(outdir, 'filter_out', "collect.txt"),
+
+ '--cohort-expr-support-normal', "100",
+ "--n-samples-lim-normal", "15",
+ '--sample-expr-support-cancer', "20",
+ '--cohort-expr-support-cancer', "110",
+ "--n-samples-lim-cancer", "2",
+ "--filterNeojuncCoord", "C",
+ #"--filterAnnotatedRF", "",
+ # "--tot-batches", "4",
+ # "--batch-id", "0",
+ "--tag-prefix", 'G_',
+ #'--on-the-fly',
+ "--interm-dir-norm", os.path.join(outdir, "test_inter_normal"),
+ "--interm-dir-canc", os.path.join(outdir, "test_inter_cancer"),
+ #'--whitelist-cancer', os.path.join(basedir, 'cancer', 'whitelist_cancer.txt'),
+ #"--uniprot", os.path.join(old_dir, 'uniprot.txt'),
+ "--parallelism", "3",
+ "--out-partitions", "2",
+ "--mut-cancer-samples", "ref"]
+
+ print(my_args)
+ ip.split_mode(my_args)
### Mouse Test
@@ -112,13 +164,14 @@ def test_end_to_end_cancerspecif_mx(outdir):
mutation_mode ='germline'
#pr = cProfile.Profile()
#pr.enable()
-test_end_to_end_build_mouse(tmpdir, mutation_mode, is_parallel=False) #TODO add back
+#test_end_to_end_build_mouse(tmpdir, mutation_mode, is_parallel=False) #TODO add back
#test_end_to_end_samplespecif('ERR2130621', tmpdir, "9", mutation_mode) # TEST DEPRECATED
#for mutation_mode in ['ref', 'germline', 'somatic', 'somatic_and_germline']:
# test_end_to_end_makebg('ERR2130621', tmpdir, "9")
# test_end_to_end_diff(tmpdir, 'ERR2130621', "9", mutation_mode)
outdir_filter = os.path.join(tmpdir, "filter_test")
#test_end_to_end_cancerspecif_mx(outdir_filter)
+test_end_to_end_cancerspecif_small_data(outdir_filter)
#pr.disable()
#pr.dump_stats(os.path.join(tmpdir, 'cProfile.pstats'))