diff --git a/docs/python-api.md b/docs/python-api.md index 12f40ada6b..49f49accf3 100644 --- a/docs/python-api.md +++ b/docs/python-api.md @@ -598,6 +598,7 @@ Functions and static methods ```{eval-rst} .. autosummary:: Tree.kc_distance + Tree.rf_distance ``` (sec_python_api_trees_balance)= diff --git a/python/CHANGELOG.rst b/python/CHANGELOG.rst index ff20dd06b7..c56924323f 100644 --- a/python/CHANGELOG.rst +++ b/python/CHANGELOG.rst @@ -93,6 +93,8 @@ - Add ``resources`` section to provenance schema. (:user:`benjeffery`, :pr:`3016`) +- Add ``Tree.rf_distance`` method to calculate the unweighted Robinson-Foulds distance + between two trees. (:user:`Billyzhang1229`, :issue:`995`, :pr:`2643`, :pr:`3032`) -------------------- [0.5.8] - 2024-06-27 diff --git a/python/tests/test_distance_metrics.py b/python/tests/test_distance_metrics.py index d4ca30a85c..c08351962f 100644 --- a/python/tests/test_distance_metrics.py +++ b/python/tests/test_distance_metrics.py @@ -1420,6 +1420,10 @@ def test_ignores_subtrees_with_no_samples(self): assert t1.kc_distance(t2, 1) == 0 +# Test the RF distance metrics: +# TODO: integrate with the KC tests + + class TestTreeSameSamples: # Tree1 # 2.00┊ 6 ┊ @@ -1569,7 +1573,7 @@ def tree(self): return tables.tree_sequence().first() def test_rf_distance(self): - with pytest.raises(ValueError): + with pytest.raises(ValueError, match="single root"): self.tree().rf_distance(self.tree()) diff --git a/python/tskit/trees.py b/python/tskit/trees.py index ce046c9e87..a4f72393b7 100644 --- a/python/tskit/trees.py +++ b/python/tskit/trees.py @@ -2975,15 +2975,30 @@ def _get_sample_sets(self): def rf_distance(self, other): """ - Returns the Robinson-Foulds distance between the specified pair of trees. + Returns the (unweighted) Robinson-Foulds distance between the specified pair + of trees, where corresponding samples between the two trees are identified by + node ID. 
The Robinson-Foulds distance (also known as the symmetric difference) + is defined as the number of bipartitions that are present in one tree but + not the other (see + `Robinson & Foulds (1981) <https://doi.org/10.1016/0025-5564(81)90043-2>`_). + This method returns the unnormalised RF distance: if the + trees are strictly bifurcating, i.e. binary, the value can be + normalised by dividing by the maximum, which is $2n-4$ for two rooted + trees of $n$ samples (however, if the trees contain polytomies, the maximum + RF distance is less easily defined). - .. seealso:: - See `Robinson & Foulds (1981) - <https://doi.org/10.1016/0025-5564(81)90043-2>`_ for more details. - - :param Tree other: The other tree to compare to. - :return: The computed Robinson-Foulds distance between this tree and other. + .. note:: + The RF distance can be sensitive to small changes in topology: in some + cases, changing the position of a single leaf can result in the maximum + RF distance. Therefore even if adjacent trees in a tree sequence differ + by a single subtree-prune-and-regraft operation, the RF distance + between them can be large. + + :param Tree other: The other tree to compare to. Trees are treated as rooted. + :return: The unweighted Robinson-Foulds distance between this tree and ``other``. :rtype: int + :raises ValueError: If either tree has multiple roots, or the trees have + different sample nodes. """ if self.num_roots != 1 or other.num_roots != 1: raise ValueError("Trees must have a single root")