Skip to content

Commit

Permalink
version and compatibility updates
Browse files Browse the repository at this point in the history
  • Loading branch information
spond committed Nov 15, 2022
1 parent 53d7dec commit 128af4b
Show file tree
Hide file tree
Showing 539 changed files with 27,171 additions and 799,748 deletions.
Binary file modified HBL/.DS_Store
Binary file not shown.
21 changes: 14 additions & 7 deletions HBL/data_filter-2.bf
Original file line number Diff line number Diff line change
@@ -1,10 +1,13 @@
RequireVersion ("2.4.0");

filter.retain_all_sequences = "QVOA|OGV";

LoadFunctionLibrary ("libv3/tasks/alignments.bf");
LoadFunctionLibrary ("libv3/convenience/regexp.bf");
LoadFunctionLibrary ("libv3/IOFunctions.bf");



filter.analysis_description = {terms.io.info :
"
Map a protein MSA back onto nucleotide sequences
Expand Down Expand Up @@ -74,18 +77,22 @@ fprintf (filter.path, CLEAR_FILE);

utility.SetEnvVariable ("DATA_FILE_PRINT_FORMAT",9);


utility.ForEach (filter.splits, "_sequences_", 'filter.compress ( _sequences_)');
filter.splits["filter.compress"][""];
//utility.ForEachPair (filter.splits, "_key_", "_sequences_", 'filter.compress ( _sequences_, _key_)');

filter.count = 0;


function filter.compress (s) {
function filter.compress (k, s) {
if (utility.Array1D (s)) {
filter.selector = utility.SwapKeysAndValues (s);
DataSetFilter filter.subfilter = CreateFilter (filter.dataset, 1, "", filter.selector / filter.names[speciesIndex]);
io.ReportProgressMessage ("UNIQUE SEQUENCES", "Retained `alignments.CompressDuplicateSequences ('filter.subfilter','filter.subfilter.compressed', TRUE)` unique sequences");
if (filter.count > 0) {
if (None == regexp.Find (k, filter.retain_all_sequences) ) {
DataSetFilter filter.subfilter = CreateFilter (filter.dataset, 1, "", filter.selector / filter.names[speciesIndex]);
io.ReportProgressMessage ("UNIQUE SEQUENCES", "`k` : retained `alignments.CompressDuplicateSequences ('filter.subfilter','filter.subfilter.compressed', TRUE)` unique sequences out of " + filter.subfilter.species + " total sequences");
} else {
DataSetFilter filter.subfilter.compressed = CreateFilter (filter.dataset, 1, "", filter.selector / filter.names[speciesIndex]);
io.ReportProgressMessage ("UNIQUE SEQUENCES", "`k` ALL " + filter.subfilter.compressed.species + " sequences");
}
if (filter.count > 0) {
fprintf (filter.path, "\n", filter.subfilter.compressed);
} else {
fprintf (filter.path, filter.subfilter.compressed);
Expand Down
23 changes: 16 additions & 7 deletions HBL/data_filter.bf
Original file line number Diff line number Diff line change
Expand Up @@ -38,20 +38,21 @@ io.ReportProgressMessage ("Data QC", "Loaded `filter.nuc_data[terms.data.sequenc

/* split off QVOA reads */

filter.partitionining_expressions = {"QVOA" : "(QVOA)|(OGV)"};
filter.regex = "(QVOA)|(OGV)|(DNA)|(xxxx)|(BAR[0-9]+)";
filter.partitionining_expressions = {"QVOA" : filter.regex};
filter.partitoned_sequence_names = regexp.PartitionByRegularExpressions(alignments.GetSequenceNames ("filter.raw_data"), filter.partitionining_expressions);

//io.CheckAssertion ("Abs(filter.partitoned_sequence_names['(QVOA)|(OGV)'])>0", "There were no sequences marked as QVOA in the input file");

KeywordArgument ("qvoa", "Write QVOA sequences here ", filter.nuc_data[terms.data.file] + "_qvoa.fas");
filter.qvoa_path = io.PromptUserForFilePath ("Save QVOA sequences to");
io.ReportProgressMessage ("Data QC", "Found `Abs(filter.partitoned_sequence_names['(QVOA)|(OGV)'])` QVOA sequences in the file; writing them to **`filter.qvoa_path`**");
io.ReportProgressMessage ("Data QC", "Found `Abs(filter.partitoned_sequence_names[filter.regex])` QVOA sequences in the file; writing them to **`filter.qvoa_path`**");
alignments.GetSequenceByName ("filter.raw_data", None);
fprintf (filter.qvoa_path, CLEAR_FILE, KEEP_OPEN);

filter.qvoa = {};

utility.ForEach (filter.partitoned_sequence_names["(QVOA)|(OGV)"], "_seq_record_",
utility.ForEach (filter.partitoned_sequence_names[filter.regex], "_seq_record_",
'
sanitized_name = ((_seq_record_ ^ {{"_xxxx_000WPI"}{""}}) ^ {{"OGV_"}{""}}) + "_QVOA";
filter.qvoa [sanitized_name] = alignments.StripGaps (alignments.GetSequenceByName ("filter.raw_data", _seq_record_));
Expand Down Expand Up @@ -165,11 +166,13 @@ if (Abs(filter.frameshifted)) {
io.ReportProgressBar ("filter","Processing sequence " + filter.seq_count);
filter.cleaned = IgSCUEAL.align_sequence_to_reference_set (filter.RNA_reads[_sequence_], filter.ref_seq, filter.options);

//if (_sequence_ == "CAP206_159wpi_ENV_2_all_aligned_loops_trimmed_98_2") {
//if (_sequence_ == "PIDCAP380_PH380_F1_2_23_W5half_post_ART_DNA_BAR1_Seq1_Phase3_NumReads240_P17_QVOA") {
// console.log (filter.cleaned);
//}

//console.log (filter.cleaned);

filtered.aa_seq = alignments.StripGaps(filter.cleaned["AA"]);
filtered.aa_seq = alignments.StripGaps(filter.cleaned["AA"])^ {{"\\?","X"}};
filtered.na_seq = IgSCUEAL.strip_in_frame_indels(filter.cleaned["QRY"]);

(filter.sequences_with_copies[filter.RNA_reads[_sequence_]])["_write_to_file"][""];
Expand All @@ -192,7 +195,7 @@ utility.ForEachPair (filter.clean_seqs, "_sequence_", "_value_",
// console.log (filter.cleaned);
//}

filtered.aa_seq = alignments.StripGaps(filter.cleaned["AA"]);
filtered.aa_seq = alignments.StripGaps(filter.cleaned["AA"]) ^ {{"\\?","X"}};
filtered.na_seq = IgSCUEAL.strip_in_frame_indels(filter.cleaned["QRY"]);
(filter.sequences_with_copies[filter.RNA_reads[_sequence_]])["_write_to_file"][""];
filter.seq_count += 1;
Expand All @@ -206,7 +209,13 @@ io.ReportProgressMessage ("Data QC", "Performing QA on QVOA reads");
utility.ForEachPair (filter.qvoa, "_sequence_", "_value_",
'
filter.cleaned = IgSCUEAL.align_sequence_to_reference_set (_value_, filter.ref_seq, filter.options);
filtered.aa_seq = alignments.StripGaps(filter.cleaned["AA"]);

if (None == filter.cleaned) {
console.log (_sequence_ + " failed to map to RNA references");
return;
}

filtered.aa_seq = alignments.StripGaps(filter.cleaned["AA"]) ^ {{"\\?","X"}};
filtered.na_seq = IgSCUEAL.strip_in_frame_indels(filter.cleaned["QRY"]);

fprintf (filter.combined_protein_path, ">", _sequence_, "\n", filtered.aa_seq, "\n");
Expand Down
27 changes: 25 additions & 2 deletions HBL/lib/igscueal.bf
Original file line number Diff line number Diff line change
Expand Up @@ -627,7 +627,24 @@ namespace IgSCUEAL {
return {'aligned': sequence, 'stripped' : sequence ^ {{"\-",""}}};
}

lfunction strip_non_letters (sequence) {

lfunction strip_in_frame_gaps (sequence) {
stripped = ""; stripped * Abs (sequence);
for (i = 0; i < Abs (sequence); i+=3) {
codon = sequence[i][i+2];
if ((codon $ "[ACGT][ACGT][ACGT]" )[0] == 0) {
stripped * codon;
} else {
if (codon != '---') {
return null;
}
}
}
stripped * 0;
return {'aligned': sequence, 'stripped' : stripped};
}

lfunction strip_non_letters (sequence) {
return {'aligned': sequence, 'stripped' : sequence ^ {{"[^A-Z,a-z]",""}}};
}

Expand Down Expand Up @@ -664,7 +681,13 @@ namespace IgSCUEAL {
result = {'SITES' : data["sites"]};

utility.ForEach (align_with_these, "_value_",
"`&result`[_value_] = IgSCUEAL.strip_gaps (alignments.GetSequenceByName ('`namespace`', _value_));"
"
`&result`[_value_] = IgSCUEAL.strip_in_frame_gaps (alignments.GetSequenceByName ('`namespace`', _value_));
if (`&result`[_value_] == None) {
`&result` - _value_;
}
"

);


Expand Down
94 changes: 72 additions & 22 deletions README.md
Original file line number Diff line number Diff line change
@@ -1,24 +1,25 @@
# Timing of QVOA strains using RNA pre-treatment data

<a href="https://zenodo.org/badge/latestdoi/181484600"><img src="https://zenodo.org/badge/181484600.svg" alt="DOI"></a>
> Updated Nov 2022 to provide compatibility with HyPhy 2.5.x and support additional options, like not compressing identical sequences
## Overview

This pipeline **times** outgrowth virus (OGV) strains from a single host using serially sampled RNA data. Four different approaches are used to assign dates to unobserved strains. First, each tree is rooted to maximize the root-to-tip to sampling time correlation coefficient.
This pipeline **times** outgrowth virus (OGV) strains from a single host using longtitudinally sampled RNA data to provide temporal information. Four different approaches are used to time strains with unobserved dates (OGV or DNA strains). The phylogenetic tree for each genomic segment is rooted to maximize the root-to-tip to sampling time correlation coefficient first.

### Patristic distance
Given a tree with OGV and RNA strains, this approach simply finds the nearest NGS neighbor (in partistic, i.e. path length sense) for each OGV sequence. To assess reliability of this assignment, we also identify all **K≥1** sequences within a factor of 2x of this smallest distance, and extract the consensus vote to assign the date. The **support** value if the proportion of **K** neighboring sequences that come from the same timepoint. In the example below, there are **9** RNA neighbors to the blue QVOA sequence. The closest one is `CAP288_4260_184WPI_ENV_C1C2_NGS_020_0_006` and the other two are with 2x distance. The blue sequence will be assigned to the `184WPI` timepoint, because most of the neighbors are from that timepoint, and the support value will be `8/9`, because one of teh neighbors has a timestamp of `174WPI`
Given a tree with OGV and RNA strains, this approach simply finds the nearest RNA neighbor (in [patristic](https://link.springer.com/referenceworkentry/10.1007/978-1-4020-6754-9_12406), i.e., path length sense) for each OGV sequence. To assess reliability of this assignment, we also identify all **K≥1** sequences within a factor of `2x` of this smallest distance, and extract the consensus vote to assign the date. The **support** value if the proportion of **K** neighboring sequences that come from the same timepoint. In the example below, there are **9** RNA neighbors to the blue QVOA sequence. The closest one is `CAP288_4260_184WPI_ENV_C1C2_NGS_020_0_006` and the other two are with 2x distance. The blue sequence will be assigned to the `184WPI` timepoint (WPI = weeks post infection), because most of the neighbors are from that timepoint, and the support value will be `8/9`, because one of teh neighbors has a timestamp of `174WPI`

![img1](img/1.png)

### Clade support
Here, we start with a QVOA sequence, and go up the tree until we encounter the first internal node that has both QVOA and NGS descendants and that has a minimum of 0.90 bootstrap support. We then assign a timestamp and support to the QVOA sequence using the same majority rule as for the patristic distance for all the nodes in the clade. In the example below, to date `CAP288_NEF_1_W11_QVOA`, we walk up the tree to the first supported node (blue) and look at its RNA descendants, of which there are **6** AND 5/6 come from `184WPI`, yielding the assignment of `184WPI` for the QVOA sequence and support of 5/6.
Here, we start with a QVOA sequence, and go up the tree until we encounter the first internal node that has **both** QVOA and RNA descendants and that has a minimum of `0.90` bootstrap support. We then assign a timestamp and support to the QVOA sequence using the same majority rule as for the patristic distance for all the nodes in the clade. In the example below, to date `CAP288_NEF_1_W11_QVOA`, we walk up the tree to the first supported node (blue) and look at its RNA descendants, of which there are **6** AND 5/6 come from `184WPI`, yielding the assignment of `184WPI` for the QVOA sequence and support of 5/6.

![img1](img/2.png)

### Placement

Phylogenetic placement using modified IgSCUEAL. The support value is the model weight ([0-1]) assigned to the branches with teh corresponding label
Phylogenetic placement using a modified [IgSCUEAL](https://royalsocietypublishing.org/doi/full/10.1098/rstb.2014.0240) tool. The support value is the model weight (in [0-1]) assigned to the branches with teh corresponding label

### Regression

Expand All @@ -30,18 +31,53 @@ The linear regression molecular clock method, essentially the same as in [the Jo

### Dependancies

1. `HyPhy v2.4` -- the `develop` branch from [github.com/veg/hyphy](github.com/veg/hyphy)
2. `muscle` for MSA generation
1. `HyPhy v2.5.42` -- or the `develop` branch from [github.com/veg/hyphy](github.com/veg/hyphy)
2. `MAFFT` for MSA generation
3. `FastTree` for phylogeny inference
4. `phylotree.js` for tree processing and visualization [note; there is a custom JS script for performing dating steps that is included in `scripts`]
4. `phylotree.js` (an `npm` package) for tree processing and visualization [note; there is a custom JS script for performing dating steps that is included in `scripts`]
5. `snakemake` for workflow management
6. `node` for executing JavaScript
7. `BioPython`

See notes on configuration below and please run
```
npm install
```
from the base directory


#### Workflow

The workflow for sample classification is implemented as a [Snakemake](https://snakemake.readthedocs.io/en/stable/) workflow, encoded in the `Snakemake` file, and can be graphically summarized as follows (for a specific sample):
The workflow for sample classification is implemented as a [Snakemake](https://snakemake.readthedocs.io/en/stable/) workflow, encoded in the `Snakemake` file, and can be graphically summarized as follows (for a specific sample)...

![img1](img/3.png)

The pipeline requires an input `samples.json` file, with pathnames of files to process specified relative to the `data` directory, e.g.

```
{
'samples': [
"CAP188/CAP188_ENV_2_all_hap.fasta",
"CAP188/CAP188_ENV_3_all_hap.fasta",
"CAP188/CAP188_ENV_4_all_hap.fasta",
"CAP188/CAP188_NEF_1_all_hap.fasta",
"CAP206/CAP206_ENV_2_all_hap.fasta",
"CAP206/CAP206_ENV_4_all_hap.fasta",
"CAP206/CAP206_GAG_1_all_hap.fasta",
"CAP217/CAP217_ENV_2_all_hap.fasta",
"CAP217/CAP217_ENV_3_all_hap.fasta",
"CAP217/CAP217_ENV_4_all_hap.fasta",
"CAP257/CAP257_ENV_2_all_hap.fasta",
"CAP257/CAP257_ENV_3_all_hap.fasta",
"CAP257/CAP257_ENV_4_all_hap.fasta",
```

The pipeline is executed by calling `snakemake`, e.g.

```
$snakemake --cores 4
```

After the pipeline has completed, you need to run

```
Expand All @@ -55,38 +91,43 @@ to generate the overall summary CSV file.
The high level directory structure contains the following components

```
├── HBL [HyPhy scripts]
│   ├── data
├── HBL [HyPhy scripts]
│   ├── data_filter-2.bf
│   ├── data_filter.bf
│   ├── lib
│   └── scripts
├── README.md [This file]
├── Snakefile [Snakemake configuration file]
├── data [unprocessed FASTA files]
├── LICENSE
├── package.json [npm package manifest]
├── README.md [This file]
├── Snakefile [Snakemake configuration file]
├── data [unprocessed FASTA files]
│   ├── CAP188
│   ├── CAP206
│   ├── CAP217
│   ├── CAP257
│   ├── CAP268
│   ├── CAP280
│   ├── CAP287
│   ├── CAP288
│   ├── CAP302
│   ├── CAP316
│   ├── CAP336
│   └── conversion.json. [Color codes and WPI to pre-ART conversions]
├── img [Images for this file]
│   ├── CAP372
│   └── conversion.json [Color codes and WPI to pre-ART conversions]
├── img [Images for this file]
│   ├── 1.png
│   ├── 2.png
│   └── 3.pdf
│   └── 3.png
├── results [Results generated by the pipeline]
│   ├── alignments [Alignments]
│   ├── dating [QVOA/OGV dating results]
│   └── trees [Trees inferred by the pipeline]
├── samples.json [Manifest of files to be processed by the pipeline]
├── scripts
│   └── result-summary.py[Penultimate summary script]
└── scripts
├── compute-distance.js [Classifier/timing script]
└── result-summary.py [Penultimate summary script]
```

```

Within specific resutls directories, the following files are created **for each input FASTA**

Expand Down Expand Up @@ -159,13 +200,22 @@ CAP188_4260_185WPI_ENV_C2C3_NGS_046_0.002

3. The resulting in-frame sequences were translated to amino-acids, mapping all ambiguous codons to `?` (e.g. `A-A`).

4. A multiple sequence alignment (MSA) was generated from translated protein sequences using `muscle`
4. A multiple sequence alignment (MSA) was generated from translated protein sequences using `MAFFT`

5. Original codon sequences are mapped to aligned protein sequences using a script in `HyPhy`. Identical sequences (which do not contribute any information to phylogenetic analyses) are removed at this stage.

6. ML phylognetic trees are reconstructed using `FastTree` (slow option) for RNA sequences **only** (these are later used for phylogenetic placement) and for RNA+QVOA

7. The pipeline will **compress** identical sequences, i.e. only one copy will be retained. The copy number will (including single copies, i.e. `1`) will be appended to sequence names.
7. The pipeline will **compress** identical RNA sequences, i.e. only one copy will be retained. The copy number will (including single copies, i.e. `1`) will be appended to sequence names. It will NOT compress QVOA sequences, defined as those with `QVOA` or `OGV` in their name.

> **Important:** adjust the paths to various executables, like HYPHY and FastTree, in the `Snakefile` before running the pipeline
> **Important:** adjust the paths to various executables, like `hyphy` and `FastTree`, in the `Snakefile` before running the pipeline
```
callables = {
'hyphy' : '/usr/local/bin/hyphy',
'mafft' : '/usr/local/bin/mafft --auto ',
'raxml' : '/usr/local/bin/raxml-ng',
'fasttree' : '/usr/local/bin/FastTree',
'classifier' : 'scripts/compute-distance.js'
}
```
Loading

0 comments on commit 128af4b

Please sign in to comment.