Merge pull request #827 from marcelm/ambiguous

Do not report a match for ambiguous sequences
marcelm · Dec 13, 2024 · 647c442 · 647c442
2 parents ec36e00 + 0cab596
commit 647c442
Show file tree

Hide file tree

Showing 4 changed files with 81 additions and 28 deletions.
diff --git a/CHANGES.rst b/CHANGES.rst
@@ -5,6 +5,12 @@ Changelog
 development version
 -------------------
 
+This release increases the major version number because results when
+demultiplexing may change with this release.
+
+* :pr:`827`: When matching multiple adapters (typically when demultiplexing
+  using barcodes), Cutadapt now no longer assigns ambiguous matches to one
+  of the adapters/barcodes.
 * :issue:`808`: Made gzip compression level 1 the default, which improves
   runtime significantly in many cases. (Compressing the output is often a
   bottleneck when using multiple threads.) Output files will be larger, but because

diff --git a/doc/guide.rst b/doc/guide.rst
@@ -1915,6 +1915,7 @@ To demultiplex this type of data, the
 
 
 .. _speed-up-demultiplexing:
+.. _adapter-indexing:
 
 Speeding up demultiplexing/adapter indexing
 -------------------------------------------
@@ -1961,6 +1962,24 @@ Cutadapt’s output::
 .. versionadded:: 5.0
    An index is created up to three (instead of two) allowed errors.
 
+Ambiguous sequences
+~~~~~~~~~~~~~~~~~~~
+
+When :ref:`an index is created for multiple anchored adapters <adapter-indexing>`, Cutadapt checks
+whether there are any possible input sequences that lead to ambiguous matches, that is, which
+would match two or more adapter sequences equally well.
+If there are ambiguous sequences, Cutadapt prints a warning like this::
+
+    WARNING: The adapters are too similar. When creating the index, 31 ambiguous sequences were found that cannot be assigned uniquely.
+    WARNING: For example, 'TAGTGCTTGA', when found in a read, would result in 10 matches for both bc3 'TAGTGCTTGA' and bc11 'TAGTGCTTGA'
+    WARNING: Reads with ambiguous sequence will *not* be trimmed.
+
+If you use ``-no-index``, Cutadapt will not print this warning and instead assign ambiguous reads
+to the first of the adapters that match equally well.
+
+.. versionadded:: 5.0
+   Ambiguous sequences were not handled specially in earlier versions.
+
 Demultiplexing paired-end reads in mixed orientation
 ----------------------------------------------------
 

diff --git a/src/cutadapt/adapters.py b/src/cutadapt/adapters.py
@@ -1255,10 +1255,11 @@ def __init__(self, adapters, prefix: bool):
         for adapter in adapters:
             self._accept(adapter, prefix)
         self._adapters = adapters
-        self._lengths, self._index = self._make_index()
+        self._lengths, self._index, self._ambiguous = self._make_index()
         logger.debug(
             "String lengths in the index: %s", sorted(self._lengths, reverse=True)
         )
+
         if len(self._lengths) == 1:
             self._length = self._lengths[0]
             self.match_to = self._match_to_one_length
@@ -1337,7 +1338,7 @@ def is_acceptable(cls, adapter: SingleAdapter, prefix: bool):
             return False
         return True
 
-    def _make_index(self) -> Tuple[List[int], "AdapterIndexDict"]:
+    def _make_index(self) -> Tuple[List[int], "AdapterIndexDict", int]:
         start_time = time.time()
         max_k = max(
             (
@@ -1356,7 +1357,7 @@ def _make_index(self) -> Tuple[List[int], "AdapterIndexDict"]:
             )
         index: Dict[str, Tuple[SingleAdapter, int, int]] = dict()
         lengths = set()
-        has_warned = False
+        ambiguous = []
         for adapter in self._adapters:
             sequence = adapter.sequence
             k = int(adapter.max_error_rate * len(sequence))
@@ -1367,47 +1368,54 @@ def _make_index(self) -> Tuple[List[int], "AdapterIndexDict"]:
                         other_adapter, other_errors, other_matches = index[s]
                         if matches < other_matches:
                             continue
-                        if other_matches == matches and not has_warned:
-                            self._warn_similar(adapter, other_adapter, k, s, matches)
-                            has_warned = True
+                        if other_matches == matches:
+                            ambiguous.append((s, adapter, other_adapter, k, matches))
                     index[s] = (adapter, errors, matches)
                     lengths.add(len(s))
             else:
                 n = len(sequence)
                 for errors in range(k + 1):
+                    matches = n - errors
                     for s in hamming_sphere(sequence, errors):
-                        matches = n - errors
                         if s in index:
                             other_adapter, other_errors, other_matches = index[s]
                             if matches < other_matches:
                                 continue
-                            if other_matches == matches and not has_warned:
-                                self._warn_similar(
-                                    adapter, other_adapter, k, s, matches
+                            if other_matches == matches:
+                                ambiguous.append(
+                                    (s, adapter, other_adapter, k, matches)
                                 )
-                                has_warned = True
                         index[s] = (adapter, errors, matches)
                 lengths.add(n)
+
+        if ambiguous:
+            logger.warning(
+                "WARNING: The adapters are too similar. When creating the index, "
+                "%d ambiguous sequences were found that cannot be assigned uniquely.",
+                len(ambiguous),
+            )
+            s, adapter, other_adapter, k, matches = ambiguous[0]
+            logger.warning(
+                "WARNING: For example, %r, when found in a read, would result in "
+                "%s matches for both %s %r and %s %r",
+                s,
+                matches,
+                other_adapter.name,
+                other_adapter.sequence,
+                adapter.name,
+                adapter.sequence,
+            )
+            logger.warning(
+                "WARNING: Reads with ambiguous sequence will *not* be trimmed."
+            )
+            for s, adapter, other_adapter, k, matches in ambiguous:
+                del index[s]
+
         elapsed = time.time() - start_time
         logger.info("Built an index containing %s strings.", len(index))
         logger.debug("Building the index took %.1f s", elapsed)
 
-        return sorted(lengths, reverse=True), index
-
-    @staticmethod
-    def _warn_similar(adapter, other_adapter, k, s, matches):
-        logger.warning(
-            "Adapters %s %r and %s %r are very similar. At %s allowed errors, "
-            "the sequence %r cannot be assigned uniquely because the number of "
-            "matches is %s compared to both adapters.",
-            other_adapter.name,
-            other_adapter.sequence,
-            adapter.name,
-            adapter.sequence,
-            k,
-            s,
-            matches,
-        )
+        return sorted(lengths, reverse=True), index, len(ambiguous)
 
     def _match_to_one_length(self, sequence: str):
         """

diff --git a/tests/test_adapters.py b/tests/test_adapters.py
@@ -509,7 +509,7 @@ def test_indexed_very_similar(caplog):
             PrefixAdapter("GAAG", max_errors=1, indels=False),
         ]
     )
-    assert "cannot be assigned uniquely" in caplog.text
+    assert "ambiguous sequences" in caplog.text
 
 
 def test_indexed_too_high_k():
@@ -582,6 +582,26 @@ def test_indexed_prefix_adapters_with_n_collision(sequence):
     assert result.adapter is a2
 
 
+def test_indexed_prefix_adapters_ignore_ambiguous_matches():
+    a1 = PrefixAdapter("AAAAA", max_errors=1, indels=False)
+    a2 = PrefixAdapter("TTAAA", max_errors=1, indels=False)
+    ipa = IndexedPrefixAdapters([a1, a2])
+
+    result = ipa.match_to("ATAAA")
+
+    assert result is None
+
+
+def test_indexed_prefix_adapters_ignore_ambiguous_matches_with_indels():
+    a1 = PrefixAdapter("AGTACGT", max_errors=1, indels=True)
+    a2 = PrefixAdapter("ACGTAGT", max_errors=1, indels=True)
+    ipa = IndexedPrefixAdapters([a1, a2])
+
+    result = ipa.match_to("ACGTACGT")
+
+    assert result is None
+
+
 def test_inosine_wildcard():
     adapter = BackAdapter("CTGIAIT", max_errors=0, min_overlap=3)
     match = adapter.match_to("GGCTGAATTGGG")