-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
1 parent
8497062
commit 3222935
Showing
18 changed files
with
698 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,20 @@ | ||
The MIT License (MIT) | ||
|
||
Copyright (c) 2022 Youpu-Chen | ||
|
||
Permission is hereby granted, free of charge, to any person obtaining a copy of | ||
this software and associated documentation files (the "Software"), to deal in | ||
the Software without restriction, including without limitation the rights to | ||
use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of | ||
the Software, and to permit persons to whom the Software is furnished to do so, | ||
subject to the following conditions: | ||
|
||
The above copyright notice and this permission notice shall be included in all | ||
copies or substantial portions of the Software. | ||
|
||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR | ||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS | ||
FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR | ||
COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER | ||
IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN | ||
CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,48 @@ | ||
# AlignmentUtilis | ||
|
||
`AlignmentUtilis` is a collection of utilities for sequence alignment, including: | ||
|
||
- Needleman-Wunsch and Smith-Waterman algorithms for pairwise sequence alignment with affine gap penalties | ||
- Naive exact matching for aligning reads against a genome | ||
- ... | ||
|
||
## How to get it? | ||
|
||
```shell | ||
pip install AlignmentUtilis | ||
``` | ||
|
||
|
||
|
||
## How to use it? | ||
|
||
```python | ||
# 1. PairwiseSequenceAlignment | ||
from AlignmentUtilis.Pairwise import * | ||
# Test | ||
seq1 = "TCGTAGACGA" | ||
seq2 = "ATAGAATGCGG" | ||
# Run Global Alignment | ||
PairwiseSequenceAlignment.Runalignment(seq1, seq2, 1, -1, -2, -1, local=False) | ||
# Run Local Alignment | ||
PairwiseSequenceAlignment.Runalignment(seq1, seq2, 1, -1, -2, -1, local=True) | ||
|
||
# 2. Naive exact matching | ||
from AlignmentUtilis.Naive import * | ||
# Naive Exact Matching Basic Utility Test | ||
test_occurrences = Naive.naive_exact_matching('AG', 'AGCTTAGATAGC') | ||
print('The pattern is AG') | ||
print('The target sequence is AGCTTAGATAGC') | ||
print(f'The start position of exact matching is {test_occurrences}') | ||
``` | ||
|
||
|
||
|
||
## License | ||
|
||
MIT License | ||
Copyright (c) 2022 Youpu Chen | ||
Permission is hereby granted, free of charge, to any person obtaining a copy | ||
of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: | ||
The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. | ||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. |
Binary file added
BIN
+8.33 KB
Sequence_Handle/AlignmentUtilis/dist/AlignmentUtilis-0.0.3-py3-none-any.whl
Binary file not shown.
Binary file not shown.
Binary file added
BIN
+8.69 KB
Sequence_Handle/AlignmentUtilis/dist/AlignmentUtilis-0.0.4-py3-none-any.whl
Binary file not shown.
Binary file not shown.
Binary file added
BIN
+8.69 KB
Sequence_Handle/AlignmentUtilis/dist/AlignmentUtilis-0.0.5-py3-none-any.whl
Binary file not shown.
Binary file not shown.
Binary file added
BIN
+8.81 KB
Sequence_Handle/AlignmentUtilis/dist/AlignmentUtilis-0.0.6-py3-none-any.whl
Binary file not shown.
Binary file not shown.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,32 @@ | ||
[build-system] | ||
requires = ["setuptools"] | ||
build-backend = "setuptools.build_meta" | ||
|
||
[project] | ||
name = "AlignmentUtilis" | ||
version = "0.0.6" | ||
authors = [ | ||
{ name="Youpu Chen", email="[email protected]" }, | ||
] | ||
description = "Simple application of sequence alignment algorithms" | ||
readme = "README.md" | ||
requires-python = ">=3.8" | ||
classifiers = [ # Optional | ||
# How mature is this project? Common values are | ||
# 3 - Alpha | ||
# 4 - Beta | ||
# 5 - Production/Stable | ||
'Development Status :: 3 - Alpha', | ||
|
||
# Indicate who your project is intended for | ||
'Intended Audience :: Information Technology', | ||
'Topic :: Scientific/Engineering :: Bio-Informatics', | ||
|
||
# Pick your license as you wish | ||
'License :: OSI Approved :: MIT License', | ||
|
||
# Specify the Python versions you support here. In particular, ensure | ||
# that you indicate whether you support Python 2, Python 3 or both. | ||
'Programming Language :: Python :: 3', | ||
'Programming Language :: Python :: 3.8', | ||
] |
63 changes: 63 additions & 0 deletions
63
Sequence_Handle/AlignmentUtilis/src/AlignmentUtilis.egg-info/PKG-INFO
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,63 @@ | ||
Metadata-Version: 2.1 | ||
Name: AlignmentUtilis | ||
Version: 0.0.6 | ||
Summary: Simple application of sequence alignment algorithms | ||
Author-email: Youpu Chen <[email protected]> | ||
Classifier: Development Status :: 3 - Alpha | ||
Classifier: Intended Audience :: Information Technology | ||
Classifier: Topic :: Scientific/Engineering :: Bio-Informatics | ||
Classifier: License :: OSI Approved :: MIT License | ||
Classifier: Programming Language :: Python :: 3 | ||
Classifier: Programming Language :: Python :: 3.8 | ||
Requires-Python: >=3.8 | ||
Description-Content-Type: text/markdown | ||
License-File: LICENSE | ||
|
||
# AlignmentUtilis | ||
|
||
`AlignmentUtilis` is a collection utilities of sequence alignment algorithms, | ||
|
||
- Needleman-Wunsch and Smith-Watermen algorithms to conduct sequence alignment with affine gap penalty | ||
- Naive exact matching to conduct reads alignment problem | ||
- ... | ||
|
||
## How to get it? | ||
|
||
```shell | ||
pip install AlignmentUtilis | ||
``` | ||
|
||
|
||
|
||
## How to use it? | ||
|
||
```shell | ||
# 1. PairwiseSequenceAlignment | ||
from AlignmentUtilis.Pairwise import * | ||
# Test | ||
seq1 = "TCGTAGACGA" | ||
seq2 = "ATAGAATGCGG" | ||
# Run Global Alignment | ||
PairwiseSequenceAlignment.Runalignment(seq1, seq2, 1, -1, -2, -1, local=False) | ||
# Run Local Alignment | ||
PairwiseSequenceAlignment.Runalignment(seq1, seq2, 1, -1, -2, -1, local=True) | ||
|
||
# 2. Naive exact matching | ||
from AlignmentUtilis.Naive import * | ||
# Naive Exact Macthing Basic Utility Test | ||
test_occurrences = Naive.naive_exact_matching('AG', 'AGCTTAGATAGC') | ||
print('The pattern is AG') | ||
print('The target sequence is AGCTTAGATAGC') | ||
print(f'The start position of exact matching is {test_occurrences}') | ||
``` | ||
|
||
|
||
|
||
## License | ||
|
||
MIT License | ||
Copyright (c) 2022 Youpu Chen | ||
Permission is hereby granted, free of charge, to any person obtaining a copy | ||
of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: | ||
The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. | ||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. |
10 changes: 10 additions & 0 deletions
10
Sequence_Handle/AlignmentUtilis/src/AlignmentUtilis.egg-info/SOURCES.txt
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,10 @@ | ||
LICENSE | ||
README.md | ||
pyproject.toml | ||
src/AlignmentUtilis/Naive.py | ||
src/AlignmentUtilis/Pairwise.py | ||
src/AlignmentUtilis/__init__.py | ||
src/AlignmentUtilis.egg-info/PKG-INFO | ||
src/AlignmentUtilis.egg-info/SOURCES.txt | ||
src/AlignmentUtilis.egg-info/dependency_links.txt | ||
src/AlignmentUtilis.egg-info/top_level.txt |
1 change: 1 addition & 0 deletions
1
Sequence_Handle/AlignmentUtilis/src/AlignmentUtilis.egg-info/dependency_links.txt
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1 @@ | ||
|
1 change: 1 addition & 0 deletions
1
Sequence_Handle/AlignmentUtilis/src/AlignmentUtilis.egg-info/top_level.txt
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1 @@ | ||
AlignmentUtilis |
147 changes: 147 additions & 0 deletions
147
Sequence_Handle/AlignmentUtilis/src/AlignmentUtilis/Naive.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,147 @@ | ||
import random | ||
import subprocess | ||
import os | ||
|
||
class Naive():
    """Naive exact-matching helpers for aligning reads against a genome.

    The static methods are stand-alone utilities (FASTA/FASTQ parsing,
    reverse complement, exact matching, read simulation).  An instance pairs
    a list of reads with a genome string so that ``runNaive`` can report how
    many of the reads occur in the genome.
    """

    @staticmethod
    def read_Cicurlar_Genome(filename):
        """Read a FASTA file and return its sequence as a single string.

        Lines starting with '>' are skipped and all remaining lines are
        concatenated, so the file is expected to hold exactly one sequence
        (e.g. a chloroplast, mitochondrial or microbial genome).
        """
        chunks = []
        with open(filename, 'r') as f:
            for line in f:
                # ignore header lines carrying the genome description
                if not line.startswith('>'):
                    chunks.append(line.rstrip())
        return ''.join(chunks)

    @staticmethod
    def readFastq(filename):
        """Read an uncompressed FASTQ file.

        Returns a tuple ``(sequence, qualities)`` of two parallel lists: the
        read sequences and their corresponding quality strings.  Reading
        stops at the first record whose sequence line is empty.
        """
        sequence = []
        qualities = []
        with open(filename) as fh:
            while True:
                # each FASTQ record spans four lines
                fh.readline()  # header line, not used
                seq = fh.readline().rstrip()
                fh.readline()  # separator line, not used
                qual = fh.readline().rstrip()

                if not seq:
                    break
                sequence.append(seq)
                qualities.append(qual)
        return sequence, qualities

    @staticmethod
    def reverseComplement(s):
        """Return the reverse complement of DNA string ``s``.

        Only the upper-case bases A, C, G, T and N are supported; any other
        character raises ``KeyError``.
        """
        complement = {'A': 'T', 'C': 'G', 'G': 'C', 'T': 'A', 'N': 'N'}  # N represents an ambiguous base
        return ''.join(complement[base] for base in reversed(s))

    @staticmethod
    def naive_exact_matching(pattern, target):
        """Return the 0-based start offsets of every exact occurrence of
        ``pattern`` in ``target``.

        Note: mismatches are not tolerated in this version.
        """
        width = len(pattern)
        return [
            i
            for i in range(len(target) - width + 1)
            if target[i:i + width] == pattern
        ]

    @staticmethod
    def generateReads(genome, numReads, readLen):
        """Return a list of ``numReads`` random substrings of ``genome``,
        each exactly ``readLen`` characters long.

        Raises ``ValueError`` (from ``random.randint``) when ``readLen`` is
        larger than the genome.
        """
        reads = []
        for _ in range(numReads):
            # randint is inclusive on both ends, so this already covers every
            # valid window; subtracting 1 could give start == -1 and produce
            # an empty or truncated read.
            start = random.randint(0, len(genome) - readLen)
            reads.append(genome[start:start + readLen])
        return reads

    def __init__(self, reads, genome):
        """Store the reads (list of strings) and the genome (string) to search."""
        self.reads = reads
        self.genome = genome  # the reverse complement should also be considered for the most reliable results

    def runNaive(self, seed_flag=True, reverse_flag=True, seed_length=30):
        """Match every read against the genome and print a summary line.

        seed_flag    -- when true, only the first ``seed_length`` bases of
                        each read are matched; otherwise the whole read is.
        reverse_flag -- when true, the reverse complement of each read is
                        searched as well (recommended).
        seed_length  -- prefix length used when ``seed_flag`` is true.

        Prints '<matched> / <total> reads matched the genome exactly!' and
        returns None.
        """
        numMatched = 0
        n = 0
        for read in self.reads:
            query = read[:seed_length] if seed_flag else read
            matches = Naive.naive_exact_matching(query, self.genome)
            if reverse_flag:
                matches.extend(
                    Naive.naive_exact_matching(Naive.reverseComplement(query), self.genome)
                )
            n += 1
            if matches:
                numMatched += 1
        print('%d / %d reads matched the genome exactly!' % (numMatched, n))
|
||
|
||
def runCMD(cmd, verbose = False, *args, **kwargs):
    """Run a command line (e.g. wget, curl) through the shell and wait for it.

    cmd     -- the command line, passed to the shell as a single string.
    verbose -- when true, print the captured stdout (stripped) and stderr.

    Extra positional and keyword arguments are accepted but not used.
    The exit status is not checked and nothing is returned.
    Note: from Roel Peters, "Using Python and wget to Download Web Pages and Files"
    """
    completed = subprocess.run(
        cmd,
        shell=True,
        capture_output=True,  # same as stdout=PIPE, stderr=PIPE
        text=True,
    )
    if verbose:
        print(completed.stdout.strip(), completed.stderr)
|
||
def getExampleDatasets():
    """Download the example genome and reads into ``exampledata/``.

    Each file is fetched with ``wget`` (via ``runCMD``) only when it is
    missing or empty, so repeated calls do not download the data again.
    Download failures are not reported here.
    """
    base_url = 'http://d28rh4a8wq0iu5.cloudfront.net/ads1/data/'
    # Portable replacement for shelling out to `mkdir`; a no-op if it exists.
    os.makedirs('exampledata', exist_ok=True)
    for name in ('phix.fa', 'ERR266411_1.first1000.fastq'):
        dest = 'exampledata/' + name
        # `wget -O` leaves an empty file behind after a failed download, so a
        # zero-byte file is treated as missing and fetched again.
        if os.path.isfile(dest) and os.path.getsize(dest) > 0:
            continue
        runCMD('wget ' + base_url + name + ' -O ' + dest)
|
||
|
||
if __name__ == "__main__":
    # Demo / smoke test: runs only when the module is executed as a script.

    # Basic utility check on a tiny hand-made example
    print('------------------------------- Naive Exact Macthing Basic Utility Test -------------------------------')
    hits = Naive.naive_exact_matching('AG', 'AGCTTAGATAGC')
    print('The pattern is AG')
    print('The target sequence is AGCTTAGATAGC')
    print(f'The start position of exact matching is {hits}')

    # Integrated check: fetch the example datasets first
    getExampleDatasets()

    # 1. Real sequencing reads against the example genome
    print('------------------------------- Naive Exact Macthing: Using Actual Reads -------------------------------')
    genome = Naive.read_Cicurlar_Genome('exampledata/phix.fa')
    real_reads = Naive.readFastq('exampledata/ERR266411_1.first1000.fastq')[0]  # quality strings are not needed
    Naive(real_reads, genome).runNaive(seed_flag=True, reverse_flag=True, seed_length=30)

    # 2. Reads simulated from the genome itself: 100 reads of 100 bp
    print('------------------------------- Naive Exact Macthing Basic Utility Test: Using Simulated Reads -------------------------------')
    fake_reads = Naive.generateReads(genome, 100, 100)
    Naive(fake_reads, genome).runNaive(seed_flag=True, reverse_flag=True, seed_length=30)
Oops, something went wrong.