Skip to content

Commit

Permalink
Add --action=crop
Browse files Browse the repository at this point in the history
  • Loading branch information
marcelm committed Jun 16, 2024
1 parent b5e74c1 commit 22079ba
Show file tree
Hide file tree
Showing 6 changed files with 49 additions and 6 deletions.
4 changes: 4 additions & 0 deletions CHANGES.rst
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,10 @@ Changelog
development version
-------------------

* Added ``--action=crop`` for removing everything *except* the adapter sequence.
(The sequence before and after the adapter is removed.) This can be useful if the
"adapter" contains wildcards (such as ``N`` nucleotides) that you are interested
in.
* :issue:`788`: Added option ``-L`` as a counterpart to ``-l``/``--length``,
which allows shortening R1 and R2 to different lengths.
* :issue:`784`: Fix some unexpected trimming results for anchored 5'
Expand Down
13 changes: 11 additions & 2 deletions doc/guide.rst
Original file line number Diff line number Diff line change
Expand Up @@ -526,7 +526,8 @@ will result in ::
The 3' adapter in the last read is not trimmed because the anchored 5' adapter is required, but
is missing from the read.

Linked adapters do not work when used in combination with ``--info-file`` and ``--action=mask``.
Linked adapters do not work in combination with ``--info-file``, ``--action=mask`` or
``--action=crop``.

To provide :ref:`adapter-search parameters <search-parameters>`
for linked adapters, they need to be set for each constituent adapter separately, as in
Expand Down Expand Up @@ -1025,7 +1026,7 @@ a 5' and 3' adapter, in effect only the sequence between the 5' and the 3'
adapter matches is kept.

With ``--action=retain``, the read is trimmed, but the adapter sequence itself
is not removed. Up- and downstream sequences are removed in the same way as
is not removed. Up- or downstream sequences are removed in the same way as
for the ``trim`` action. For linked adapters, both adapter sequences are kept.

.. note::
Expand All @@ -1044,12 +1045,20 @@ the read.
Use ``--action=mask`` to write ``N`` characters to those parts of the read
that would otherwise have been removed.

Use ``--action=crop`` to remove everything from the read *except* the adapter sequence.
That is, the sequence before and after the adapter is removed.
This is useful if the "adapter" contains wildcards (such as ``N`` nucleotides)
that you are interested in.

Use ``--action=lowercase`` to change to lowercase those parts of the read that
would otherwise have been removed. The rest is converted to uppercase.

.. versionadded:: 3.1
The ``retain`` action.

.. versionadded:: 4.9
The ``crop`` action.


.. _cut-bases:

Expand Down
3 changes: 2 additions & 1 deletion src/cutadapt/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -238,13 +238,14 @@ def get_argument_parser() -> ArgumentParser:
group.add_argument("-N", "--no-match-adapter-wildcards", action="store_false",
default=True, dest="match_adapter_wildcards",
help="Do not interpret IUPAC wildcards in adapters.")
group.add_argument("--action", choices=("trim", "retain", "mask", "lowercase", "none"),
group.add_argument("--action", choices=("trim", "retain", "mask", "lowercase", "crop", "none"),
default="trim",
help="What to do if a match was found. "
"trim: trim adapter and up- or downstream sequence; "
"retain: trim, but retain adapter; "
"mask: replace with 'N' characters; "
"lowercase: convert to lowercase; "
"crop: trim up and downstream sequence; "
"none: leave unchanged. Default: %(default)s")
group.add_argument("--rc", "--revcomp", dest="reverse_complement", default=False,
action="store_true",
Expand Down
13 changes: 10 additions & 3 deletions src/cutadapt/modifiers.py
Original file line number Diff line number Diff line change
Expand Up @@ -107,7 +107,7 @@ def __init__(
index: bool = True,
):
self.times = times
assert action in ("trim", "mask", "lowercase", "retain", None)
assert action in ("trim", "mask", "lowercase", "retain", "crop", None)
self.action = action
self.with_adapters = 0
self.adapter_statistics = {a: a.create_statistics() for a in adapters}
Expand All @@ -117,8 +117,8 @@ def __init__(
)
else:
self.adapters = MultipleAdapters(adapters)
if action == "retain" and times > 1:
raise ValueError("'retain' cannot be combined with times > 1")
if action in {"retain", "crop"} and times > 1:
raise ValueError("'retain' and 'crop' cannot be combined with times > 1")
if self.times == 1 and self.action == "trim":
self.match_and_trim = self._match_and_trim_once_action_trim # type: ignore

Expand Down Expand Up @@ -196,6 +196,11 @@ def lowercased_read(read, matches: Sequence[Match]):
)
return result

@staticmethod
def cropped_read(read, matches: Sequence[Match]):
    """
    Return only the part of *read* that is covered by the last match.

    The slice runs from ``rstart`` to ``rstop`` of the final element of
    *matches*; whatever lies before or after that span is dropped.
    *matches* must not be empty.
    """
    last_match = matches[-1]
    start, stop = last_match.rstart, last_match.rstop
    return read[start:stop]  # type: ignore

def __call__(self, read, info: ModificationInfo):
trimmed_read, matches = self.match_and_trim(read)
if matches:
Expand Down Expand Up @@ -242,6 +247,8 @@ def match_and_trim(self, read):
elif self.action == "lowercase":
trimmed_read = self.lowercased_read(read, matches)
assert len(trimmed_read.sequence) == len(read)
elif self.action == "crop":
trimmed_read = self.cropped_read(read, matches)
elif self.action is None:
trimmed_read = read[:]

Expand Down
14 changes: 14 additions & 0 deletions tests/cut/action_crop.fasta
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
>r1
caag
>r2
caag
>r3
caag
>r4
caa
>r5
ggttaa
>r6
ggttaa
>r7
ttaa
8 changes: 8 additions & 0 deletions tests/test_commandline.py
Original file line number Diff line number Diff line change
Expand Up @@ -327,6 +327,14 @@ def test_action_retain_times():
main(["-a", "ACGT", "--times=2", "--action=retain", datapath("small.fastq")])


def test_action_crop(run):
    """Run cutadapt with --action=crop on a 5' and a 3' adapter.

    NOTE(review): presumably ``run`` takes (parameters, expected output
    file, input file) -- confirm against the fixture definition.
    """
    params = "-g GGTTAA -a CAAG --action=crop --discard-untrimmed"
    run(params, "action_crop.fasta", "action_retain.fasta")


def test_gz_multiblock(run):
"""compressed gz file with multiple blocks (created by concatenating two .gz files)"""
run("-b TTAGACATATCTCCGTCG", "small.fastq", "multiblock.fastq.gz")
Expand Down

0 comments on commit 22079ba

Please sign in to comment.