From 63e8ff0cdd971cd561b020d8d857d1cc3e9830f9 Mon Sep 17 00:00:00 2001 From: Jerome Kelleher Date: Sat, 13 Jan 2018 12:08:45 +0000 Subject: [PATCH 01/10] Updated EdgeTable API docs. --- docs/api.rst | 5 +++ msprime/tables.py | 110 ++++++++++++++++++++++++++++++---------------- 2 files changed, 78 insertions(+), 37 deletions(-) diff --git a/docs/api.rst b/docs/api.rst index 9649d3df6..d4589f440 100644 --- a/docs/api.rst +++ b/docs/api.rst @@ -298,14 +298,19 @@ Table classes :members: .. autoclass:: msprime.EdgeTable + :members: .. autoclass:: msprime.MigrationTable + :members: .. autoclass:: msprime.SiteTable + :members: .. autoclass:: msprime.MutationTable + :members: .. autoclass:: msprime.ProvenanceTable + :members: +++++++++++++++ Table functions diff --git a/msprime/tables.py b/msprime/tables.py index 4c0fe1022..95d344fa3 100644 --- a/msprime/tables.py +++ b/msprime/tables.py @@ -190,7 +190,7 @@ def add_row(self, flags=0, time=0, population=-1, metadata=None): def set_columns( self, flags, time, population=None, metadata=None, metadata_offset=None): """ - Sets the values for each column in this NodeTable using the values in + Sets the values for each column in this :class:`.NodeTable` using the values in the specified arrays. Overwrites any data currently stored in the table. The ``flags``, ``time`` and ``population`` arrays must all be of the same length, @@ -222,8 +222,8 @@ def set_columns( def append_columns( self, flags, time, population=None, metadata=None, metadata_offset=None): """ - Appends the specified arrays to the end of the columns of this table. - This allows many new rows to be added at once. + Appends the specified arrays to the end of the columns in this + :class:`NodeTable`. This allows many new rows to be added at once. The ``flags``, ``time`` and ``population`` arrays must all be of the same length, which is equal to the number of nodes that will be added to the table. The @@ -266,43 +266,26 @@ def _pickle_node_table(table): class EdgeTable(_msprime.EdgeTable): """ - Class for tables describing all edges in a tree sequence, of the form - - left right parent child - 0.0 0.4 3 0 - 0.0 0.4 3 2 - 0.4 1.0 3 0 - 0.4 1.0 3 1 - 0.4 1.0 3 2 - 0.0 0.4 4 1 - 0.0 0.4 4 3 - - These describe the half-open genomic interval affected: `[left, right)`, - the `parent` and the `child` on that interval. - - Requirements: to describe a valid tree sequence, a `EdgeTable` (and - corresponding `NodeTable`, to provide birth times) must satisfy: - - 1. any two edges that share a child must be nonoverlapping, and - 2. the birth times of the `parent` in an edge must be strictly - greater than the birth times of the `child` in that edge. - - Furthermore, for algorithmic requirements - - 4. the smallest `left` coordinate must be 0.0, - 5. the table must be sorted so that birth time of the `parent` increases - with table row, and - 6. any two edges corresponding to the same `parent` must be - nonoverlapping. + A table defining the edges in a tree sequence. See the + :ref:`definitions ` for details on the columns + in this table and the + :ref:`tree sequence requirements ` section + for the properties needed for an edge table to be a part of a valid tree sequence. - It is an additional requirement that the complete ancestry of each sample - must be specified, but this is harder to verify. + :warning: The numpy arrays returned by table attribute accesses are **copies** + of the underlying data. In particular, this means that you cannot edit + the values in the columns by updating the attribute arrays. - It is not required that all records corresponding to the same parent be - adjacent in the table. + **NOTE:** this behaviour may change in future. - `simplify_tables()` may be used to convert noncontradictory tables - into tables satisfying the full set of requirements. + :ivar left: The array of left coordinates. + :vartype left: numpy.ndarray, dtype=np.float64 + :ivar right: The array of right coordinates. + :vartype right: numpy.ndarray, dtype=np.float64 + :ivar parent: The array of parent node IDs. + :vartype parent: numpy.ndarray, dtype=np.int32 + :ivar child: The array of child node IDs. + :vartype child: numpy.ndarray, dtype=np.int32 """ def __str__(self): left = self.left @@ -361,6 +344,59 @@ def reset(self): # Deprecated alias for clear self.clear() + def add_row(self, left, right, parent, child): + """ + Adds a new row to this :class:`EdgeTable` and returns the ID of the + corresponding edge. + + :param float left: The left coordinate (inclusive). + :param float right: The right coordinate (exclusive). + :param int parent: The ID of parent node. + :param int child: The ID of child node. + :return: The ID of the newly added edge. + :rtype: int + """ + return super(EdgeTable, self).add_row(left, right, parent, child) + + def set_columns(self, left, right, parent, child): + """ + Sets the values for each column in this :class:`.EdgeTable` using the values + in the specified arrays. Overwrites any data currently stored in the table. + + All four parameters are mandatory, and must be numpy arrays of the + same length (which is equal to the number of edges the table will contain). + + :param left: The left coordinates (inclusive). + :type left: numpy.ndarray, dtype=np.float64 + :param right: The right coordinates (exclusive). + :type right: numpy.ndarray, dtype=np.float64 + :param parent: The parent node IDs. + :type parent: numpy.ndarray, dtype=np.int32 + :param child: The child node IDs. + :type child: numpy.ndarray, dtype=np.int32 + """ + super(EdgeTable, self).set_columns(left, right, parent, child) + + def append_columns(self, left, right, parent, child): + """ + Appends the specified arrays to the end of the columns of this + :class:`EdgeTable`. This allows many new rows to be added at once. + + All four parameters are mandatory, and must be numpy arrays of the + same length (which is equal to the number of additional edges to + add to the table). + + :param left: The left coordinates (inclusive). + :type left: numpy.ndarray, dtype=np.float64 + :param right: The right coordinates (exclusive). + :type right: numpy.ndarray, dtype=np.float64 + :param parent: The parent node IDs. + :type parent: numpy.ndarray, dtype=np.int32 + :param child: The child node IDs. + :type child: numpy.ndarray, dtype=np.int32 + """ + super(EdgeTable, self).append_columns(left, right, parent, child) + # Pickle support. See copyreg registration for this function below. def _edge_table_pickle(table): From 8265395de7c783a6c8c7593d359f5c7f5bae55f5 Mon Sep 17 00:00:00 2001 From: Jerome Kelleher Date: Sat, 13 Jan 2018 12:26:21 +0000 Subject: [PATCH 02/10] Added docs for MigrationTable API. --- msprime/tables.py | 92 +++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 92 insertions(+) diff --git a/msprime/tables.py b/msprime/tables.py index 95d344fa3..98af49f4c 100644 --- a/msprime/tables.py +++ b/msprime/tables.py @@ -410,6 +410,34 @@ def _edge_table_pickle(table): class MigrationTable(_msprime.MigrationTable): + """ + A table defining the migrations in a tree sequence. See the + :ref:`definitions ` for details on the columns + in this table and the + :ref:`tree sequence requirements ` section + for the properties needed for a migration table to be a part of a valid tree + sequence. + + :warning: The numpy arrays returned by table attribute accesses are **copies** + of the underlying data. In particular, this means that you cannot edit + the values in the columns by updating the attribute arrays. + + **NOTE:** this behaviour may change in future. + + :ivar left: The array of left coordinates. + :vartype left: numpy.ndarray, dtype=np.float64 + :ivar right: The array of right coordinates. + :vartype right: numpy.ndarray, dtype=np.float64 + :ivar node: The array of node IDs. + :vartype node: numpy.ndarray, dtype=np.int32 + :ivar source: The array of source population IDs. + :vartype source: numpy.ndarray, dtype=np.int32 + :ivar dest: The array of destination population IDs. + :vartype dest: numpy.ndarray, dtype=np.int32 + :ivar time: The array of time values. + :vartype time: numpy.ndarray, dtype=np.float64 + """ + def __str__(self): left = self.left right = self.right @@ -472,6 +500,70 @@ def reset(self): # Deprecated alias for clear self.clear() + def add_row(self, left, right, node, source, dest, time): + """ + Adds a new row to this :class:`MigrationTable` and returns the ID of the + corresponding migration. + + :param float left: The left coordinate (inclusive). + :param float right: The right coordinate (exclusive). + :param int node: The node ID. + :param int source: The ID of the source population. + :param int dest: The ID of the destination population. + :param float time: The time of the migration event. + :return: The ID of the newly added migration. + :rtype: int + """ + return super(MigrationTable, self).add_row(left, right, node, source, dest, time) + + def set_columns(self, left, right, node, source, dest, time): + """ + Sets the values for each column in this :class:`.MigrationTable` using the values + in the specified arrays. Overwrites any data currently stored in the table. + + All six parameters are mandatory, and must be numpy arrays of the + same length (which is equal to the number of migrations the table will contain). + + :param left: The left coordinates (inclusive). + :type left: numpy.ndarray, dtype=np.float64 + :param right: The right coordinates (exclusive). + :type right: numpy.ndarray, dtype=np.float64 + :param node: The node IDs. + :type node: numpy.ndarray, dtype=np.int32 + :param source: The source population IDs. + :type source: numpy.ndarray, dtype=np.int32 + :param dest: The destination population IDs. + :type dest: numpy.ndarray, dtype=np.int32 + :param time: The time of each migration. + :type time: numpy.ndarray, dtype=np.int64 + """ + super(MigrationTable, self).set_columns(left, right, node, source, dest, time) + + def append_columns(self, left, right, node, source, dest, time): + """ + Appends the specified arrays to the end of the columns of this + :class:`MigrationTable`. This allows many new rows to be added at once. + + All six parameters are mandatory, and must be numpy arrays of the + same length (which is equal to the number of additional migrations + to add to the table). + + :param left: The left coordinates (inclusive). + :type left: numpy.ndarray, dtype=np.float64 + :param right: The right coordinates (exclusive). + :type right: numpy.ndarray, dtype=np.float64 + :param node: The node IDs. + :type node: numpy.ndarray, dtype=np.int32 + :param source: The source population IDs. + :type source: numpy.ndarray, dtype=np.int32 + :param dest: The destination population IDs. + :type dest: numpy.ndarray, dtype=np.int32 + :param time: The time of each migration. + :type time: numpy.ndarray, dtype=np.int64 + """ + super(MigrationTable, self).append_columns( + left, right, node, source, dest, time) + # Pickle support. See copyreg registration for this function below. def _migration_table_pickle(table): From 0ab76974a6042d5d1e53ef0c26de33784b8e7b0a Mon Sep 17 00:00:00 2001 From: Jerome Kelleher Date: Sun, 14 Jan 2018 18:17:03 +0000 Subject: [PATCH 03/10] General docs for ragged columns. --- docs/api.rst | 130 +++++++++++++++++++++++++++++++++ docs/interchange.rst | 167 ++++++++++++++++++++++++------------------- msprime/tables.py | 82 ++++++++++++++++++--- 3 files changed, 296 insertions(+), 83 deletions(-) diff --git a/docs/api.rst b/docs/api.rst index d4589f440..ed1ab6159 100644 --- a/docs/api.rst +++ b/docs/api.rst @@ -290,6 +290,128 @@ held in the columns:: >>> t == t2 False + + +.. _sec-tables-api-text-columns: + +++++++++++++ +Text columns +++++++++++++ + +As described in the :ref:`sec-encoding-ragged-columns`, working with +variable length columns is somewhat more involved. Columns +encoding text data store the **encoded bytes** of the flattened +strings, and the offsets into this column in two separate +arrays. + +Consider the following example:: + + >>> t = msprime.SiteTable() + >>> t.add_row(0, "A") + >>> t.add_row(1, "BB") + >>> t.add_row(2, "") + >>> t.add_row(3, "CCC") + >>> print(t) + id position ancestral_state metadata + 0 0.00000000 A + 1 1.00000000 BB + 2 2.00000000 + 3 3.00000000 CCC + >>> t[0] + SiteTableRow(position=0.0, ancestral_state='A', metadata=b'') + >>> t[1] + SiteTableRow(position=1.0, ancestral_state='BB', metadata=b'') + >>> t[2] + SiteTableRow(position=2.0, ancestral_state='', metadata=b'') + >>> t[3] + SiteTableRow(position=3.0, ancestral_state='CCC', metadata=b'') + +Here we create a :class:`.SiteTable` and add four rows, each with a different +``ancestral_state``. We can then access this information from each +row in a straightforward manner. Working with the data in the columns +is a little trickier, however:: + + >>> t.ancestral_state + array([65, 66, 66, 67, 67, 67], dtype=int8) + >>> t.ancestral_state_offset + array([0, 1, 3, 3, 6], dtype=uint32) + >>> msprime.unpack_strings(t.ancestral_state, t.ancestral_state_offset) + ['A', 'BB', '', 'CCC'] + +Here, the ``ancestral_state`` array is the UTF8 encoded bytes of the flattened +strings, and the ``ancestral_state_offset`` is the offset into this array +for each row. The :func:`.unpack_strings` function, however, is a convient +way to recover the original strings from this encoding. We can also use the +:func:`.pack_strings` to insert data using this approach:: + + >>> a, off = msprime.pack_strings(["0", "12", ""]) + >>> t.set_columns(position=[0, 1, 2], ancestral_state=a, ancestral_state_offset=off) + >>> print(t) + id position ancestral_state metadata + 0 0.00000000 0 + 1 1.00000000 12 + 2 2.00000000 + +.. _sec-tables-api-binary-columns: + +++++++++++++++ +Binary columns +++++++++++++++ + +Columns storing binary data are handled in a very similar manner to +:ref:`sec-tables-api-text-columns`. The difference between the two is +only raw :class:`bytes` values are accepted: no character encoding or +decoding is done on the data. Consider the following example:: + + + >>> t = msprime.NodeTable() + >>> t.add_row(metadata=b"raw bytes") + >>> t.add_row(metadata=pickle.dumps({"x": 1.1})) + >>> t[0].metadata + b'raw bytes' + >>> t[1].metadata + b'\x80\x03}q\x00X\x01\x00\x00\x00xq\x01G?\xf1\x99\x99\x99\x99\x99\x9as.' + >>> pickle.loads(t[1].metadata) + {'x': 1.1} + >>> print(t) + id flags population time metadata + 0 0 -1 0.00000000000000 cmF3IGJ5dGVz + 1 0 -1 0.00000000000000 gAN9cQBYAQAAAHhxAUc/8ZmZmZmZmnMu + >>> t.metadata + array([ 114, 97, 119, 32, 98, 121, 116, 101, 115, -128, 3, + 125, 113, 0, 88, 1, 0, 0, 0, 120, 113, 1, + 71, 63, -15, -103, -103, -103, -103, -103, -102, 115, 46], dtype=int8) + >>> t.metadata_offset + array([ 0, 9, 33], dtype=uint32) + + +Here we add two rows to a :class:`.NodeTable`, with different +:ref:`metadata `. The first row contains a simple +byte string, and the second contains a Python dictionary serialised using +:mod:`pickle`. We then show several different (and seemingly incompatible!) +different views on the same data. + +When we access the data in a row (e.g., ``t[0].metadata``) we are returned +a Python bytes object containing precisely the bytes that were inserted. +The pickled dictionary is encoded in 24 bytes containing unprintable +characters, and when we unpickle it using :func:`pickle.loads`, we obtain +the original dictionary. + +When we print the table, however, we see some data which is seemingly +unrelated to the original contents. This is because the binary data is +`base64 encoded `_ to ensure +that it is print-safe (and doesn't break your terminal). (See the +:ref:`sec-metadata-definition` section for more information on the +use of base64 encoding.). + +Finally, when we print the ``metadata`` column, we see the raw byte values +encoded as signed integers. As for :ref:`sec-tables-api-text-columns`, +the ``metadata_offset`` column encodes the offsets into this array. So, we +see that the metadata value is 9 bytes long and the second is 24. + +The :func:`pack_bytes` and :func:`unpack_bytes` functions are also useful +for encoding data in these columns. + +++++++++++++ Table classes +++++++++++++ @@ -327,3 +449,11 @@ Table functions .. autofunction:: msprime.parse_sites .. autofunction:: msprime.parse_mutations + +.. autofunction:: msprime.pack_strings + +.. autofunction:: msprime.unpack_strings + +.. autofunction:: msprime.pack_bytes + +.. autofunction:: msprime.unpack_bytes diff --git a/docs/interchange.rst b/docs/interchange.rst index d29fe9d6a..2d4b04ec1 100644 --- a/docs/interchange.rst +++ b/docs/interchange.rst @@ -280,12 +280,38 @@ record char Provenance record. ================ ============== =========== - .. _sec-metadata-definition: Metadata ======== +Users of the tables API sometimes need to store auxiliary information for +the various entities defined here. For example, in a forwards-time simulation, +the simulation engine may wish to store the time at which a particular mutation +arose or some other pertinent information. If we are representing real data, +we may wish to store information derived from a VCF INFO field, or associate +information relating to samples or populations. The columns defined in tables +here are deliberately minimal: we define columns only for information which +the library itself can use. All other information is considered to be +**metadata**, and is stored in the ``metadata`` columns of the various +tables. + +Arbitrary binary data can be stored in ``metadata`` columns, and the +``msprime`` library makes no attempt to interpret this information. How the +information held in this field is encoded is entirely the choice of client code. + +To ensure that metadata can be safely interchanged using the :ref:`sec-text-file-format`, +each row is `base 64 encoded `_. Thus, +binary information can be safely printed and exchanged, but may not be +human readable. + +.. todo:: + We plan on providing more sophisticated tools for working with metadata + in future, including the auto decoding metadata via pluggable + functions and the ability to store metadata schemas so that metadata + is self-describing. + + .. _sec-valid-tree-sequence-requirements: Valid tree sequence requirements @@ -458,84 +484,77 @@ In this section we describe the high-level details of the API for interchanging table data via numpy arrays. Please see the :ref:`sec-tables-api` for detailed description of the functions and methods. +The tables API is based on **columnar** storage of the data. In memory, each +table is organised as a number of blocks of contiguous storage, one for +each column. There are many advantages to this approach, but the key +property for us is that allows for very efficient transfer of data +in and out of tables. Rather than inserting data into tables row-by-row +(which can be done using the ``add_row`` methods), it is much more +efficient to add many rows at the same time by providing pointers to blocks of +contigous memory. By taking +this approach, we can work with tables containing gigabytes of data very +efficiently. + +We use the `numpy Array API `_ +to allow us to define and work with numeric arrays of the required types. +Node IDs, for example, are defined using 32 bit integers. Thus, the +``parent`` column of an :ref:`sec-edge-table-definition`'s with ``n`` rows +is a block ``4n`` bytes. + +This approach is very straightforward for columns in which each row contains +a fixed number of values. However, dealing with columns containing a +**variable** number of values is more problematic. .. _sec-encoding-ragged-columns: Encoding ragged columns ======================= - **todo: This section will define how to work with ragged columns. It's not clear - This has been referred to from elsewhere, but we should probably rename it** - -.. _sec-variable-length-columns: - -Variable length columns -======================= - -.. Keeping this for now as we're referring to it below. Merge these two into one - section and get rid of duplicate refs. - - - -.. Sorting and simplifying tables -.. ============================== - -.. Tables that are noncontradictory but do not satisfy all algorithmic requirements -.. listed above may be converted to a TreeSequence by first sorting, then simplifying -.. them (both operate on the tables **in place**): - -.. .. autofunction:: msprime.sort_tables(nodes, edges[, migrations, sites, mutations, edge_start]) - -.. **Note:** the following function is more general than -.. ``TreeSequence.simplify()``, since it can be applied to tables not satisfying -.. all criteria above (and that hence could not be loaded into a TreeSequence). - - - -.. NodeTable -.. ========= - -.. .. autoclass:: msprime.NodeTable - - -.. EdgeTable -.. ============ - -.. .. autoclass:: msprime.EdgeTable - - -.. SiteTable -.. ========= - -.. .. autoclass:: msprime.SiteTable - - -.. MutationTable -.. ============= - -.. .. autoclass:: msprime.MutationTable - - -.. Import and export -.. ================= - -.. This section describes how to extract tables from a ``TreeSequence``, and how -.. to construct a ``TreeSequence`` from tables. Since tree sequences are -.. immutible, often the best way to modify a ``TreeSequence`` is something along -.. the lines of (for ``ts`` a ``TreeSequence``):: - -.. nodes = msprime.NodeTable() -.. edges = msprime.EdgeTable() -.. ts.dump_tables(nodes=nodes, edges=edges) -.. # (modify nodes and edges) -.. ts.load_tables(nodes=nodes, edges=edges) - - -.. .. automethod:: msprime.TreeSequence.load_tables - -.. .. automethod:: msprime.TreeSequence.dump_tables -.. :noindex: - +A **ragged** column is a column in which the rows are not of a fixed length. +For example, :ref:`sec-metadata-definition` columns contain binary of data of arbitrary +length. To encode such columns in the tables API, we store **two** columns: +one contains the flattened array of data and another stores the **offsets** +of each row into this flattened array. Consider an example:: + + >>> s = msprime.SiteTable() + >>> s.add_row(0, "A") + >>> s.add_row(0, "") + >>> s.add_row(0, "TTT") + >>> s.add_row(0, "G") + >>> print(s) + id position ancestral_state metadata + 0 0.00000000 A + 1 0.00000000 + 2 0.00000000 TTT + 3 0.00000000 G + >>> s.ancestral_state + array([65, 84, 84, 84, 71], dtype=int8) + >>> s.ancestral_state.tobytes() + b'ATTTG' + >>> s.ancestral_state_offset + array([0, 1, 1, 4, 5], dtype=uint32) + >>> s.ancestral_state[s.ancestral_state_offset[2]: s.ancestral_state_offset[3]].tobytes() + b'TTT' + +In this example we create a :ref:`sec-site-table-definition` with four rows, +and then print out this table. We can see that the second row has the +empty string as its ``ancestral_state``, and the third row's +``ancestral_state`` is ``TTT``. When we print out the tables ``ancestral_state`` +column, we see that its a numpy array of length 5: this is the +flattened array of `ASCII encoded `_ +values for these rows. When we decode these bytes using the +numpy ``tobytes`` method, we get the string 'ATTTG'. This flattened array +can now be transferred efficiently in memory like any other column. We +then use the ``ancestral_state_offset`` column to allow us find the +individual rows. For a row ``j``:: + + ancestral_state[ancestral_state_offset[j]: ancestral_state_offset[j + 1]] + +gives us the array of bytes for the ancestral state in that row. + +Note that for a table with ``n`` rows, any offset column must have ``n + 1`` +values. The values in this column must be non-decreasing, and cannot exceed +the length of the ragged column in question. .. _sec-hdf5-file-format: @@ -561,7 +580,7 @@ with nodes, the ``metadata`` column in the node table will be empty, and the corresponding ``metadata`` dataset will not be present in the HDF5 file. Variable length data is handled in the same manner as the -:ref:`Tables API ` +:ref:`Tables API ` above: we store two arrays, one containing the flattened data, and another storing offsets into this array. diff --git a/msprime/tables.py b/msprime/tables.py index 98af49f4c..eec16a1cd 100644 --- a/msprime/tables.py +++ b/msprime/tables.py @@ -580,18 +580,32 @@ def _migration_table_pickle(table): class SiteTable(_msprime.SiteTable): """ - Class for tables describing all sites at which mutations have occurred in a - tree sequence, of the form + A table defining the sites in a tree sequence. See the + :ref:`definitions ` for details on the columns + in this table and the + :ref:`tree sequence requirements ` section + for the properties needed for a site table to be a part of a valid tree + sequence. + + :warning: The numpy arrays returned by table attribute accesses are **copies** + of the underlying data. In particular, this means that you cannot edit + the values in the columns by updating the attribute arrays. - id position ancestral_state - 0 0.1 0 - 1 0.5 0 + **NOTE:** this behaviour may change in future. - Here ``id`` is not stored directly, but is determined by the row index in - the table. ``position`` is the position along the genome, and - ``ancestral_state`` gives the allele at the root of the tree at that - position. + :ivar position: The array of site position coordinates. + :vartype position: numpy.ndarray, dtype=np.float64 + :ivar ancestral_state: The flattened array of ancestral state strings. + :vartype ancestral_state: numpy.ndarray, dtype=np.int8 + :ivar ancestral_state_offset: The offsets of rows in the ancestral_state + array. + :vartype ancestral_state_offset: numpy.ndarray, dtype=np.uint32 + :ivar metadata: The flattened array of metadata binary byte strings. + :vartype metadata: numpy.ndarray, dtype=np.int8 + :ivar metadata_offset: The offsets of rows in the metadata array. + :vartype metadata_offset: numpy.ndarray, dtype=np.uint32 """ + def __str__(self): position = self.position ancestral_state = unpack_strings( @@ -660,6 +674,56 @@ def reset(self): # Deprecated alias for clear self.clear() + def add_row(self, position, ancestral_state, metadata=None): + """ + Adds a new row to this :class:`SiteTable` and returns the ID of the + corresponding site. + + :param float position: The position of this site in genome coordinates. + :param str ancestral_state: The state of this site at the root of the tree. + :param bytes metadata: The binary metadata for this site. + :return: The ID of the newly added site. + :rtype: int + """ + return super(SiteTable, self).add_row(position, ancestral_state, metadata) + + def set_columns( + self, position, ancestral_state, ancestral_state_offset, + metadata=None, metadata_offset=None): + """ + Sets the values for each column in this :class:`.SiteTable` using the values + in the specified arrays. Overwrites any data currently stored in the table. + + The ``position``, ``ancestral_state`` and ``ancestral_state_offset`` + parameters are mandatory, and must be 1D numpy arrays. The length + of the ``position`` array determines the number of rows in table. + + :param position: The position of each site in genome coordinates. + :type position: numpy.ndarray, dtype=np.float64 + """ + super(SiteTable, self).set_columns( + position, ancestral_state=ancestral_state, + ancestral_state_offset=ancestral_state_offset, + metadata=metadata, metadata_offset=metadata_offset) + + def append_columns( + self, position, ancestral_state, ancestral_state_offset, + metadata=None, metadata_offset=None): + """ + Appends the specified arrays to the end of the columns of this + :class:`SiteTable`. This allows many new rows to be added at once. + + The ``position``, ``ancestral_state`` and ``ancestral_state_offset`` + parameters are mandatory, and must be 1D numpy arrays. The length + of the ``position`` array determines the number of additional rows + to add the table. + + """ + super(SiteTable, self).append_columns( + position, ancestral_state=ancestral_state, + ancestral_state_offset=ancestral_state_offset, + metadata=metadata, metadata_offset=metadata_offset) + # Pickle support. See copyreg registration for this function below. def _site_table_pickle(table): From 51d11e2c753d60c14ee473a91c26f46d5a82b775 Mon Sep 17 00:00:00 2001 From: Jerome Kelleher Date: Mon, 15 Jan 2018 11:26:58 +0000 Subject: [PATCH 04/10] Added example for ord("0") trick. --- docs/api.rst | 64 ++++++++++++++++++++++++++++++++++++++++++++--- msprime/tables.py | 5 ++-- 2 files changed, 64 insertions(+), 5 deletions(-) diff --git a/docs/api.rst b/docs/api.rst index ed1ab6159..ec7a4fe0d 100644 --- a/docs/api.rst +++ b/docs/api.rst @@ -352,14 +352,72 @@ way to recover the original strings from this encoding. We can also use the 1 1.00000000 12 2 2.00000000 -.. _sec-tables-api-binary-columns: +When inserting many rows with standard infinite sites mutations (i.e., +ancestral state is "0"), it is more efficient to construct the +numpy arrays directly than to create a list of strings and use +:func:`.pack_strings`. When doing this, it is important to note that +it is the **encoded** byte values that are stored; by default, we +use UTF8 (which corresponds to ASCII for simple printable characters).:: + + >>> t_s = msprime.SiteTable() + >>> m = 10 + >>> a = ord("0") + np.zeros(m, dtype=np.int8) + >>> off = np.arange(m + 1, dtype=np.uint32) + >>> t_s.set_columns(position=np.arange(m), ancestral_state=a, ancestral_state_offset=off) + >>> print(t_s) + id position ancestral_state metadata + 0 0.00000000 0 + 1 1.00000000 0 + 2 2.00000000 0 + 3 3.00000000 0 + 4 4.00000000 0 + 5 5.00000000 0 + 6 6.00000000 0 + 7 7.00000000 0 + 8 8.00000000 0 + 9 9.00000000 0 + >>> t_s.ancestral_state + array([48, 48, 48, 48, 48, 48, 48, 48, 48, 48], dtype=int8) + >>> t_s.ancestral_state_offset + array([ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10], dtype=uint32) + +Here we create 10 sites at regular positions, each with ancestral state equal to +"0". Note that we use ``ord("0")`` to get the ASCII code for "0" (48), and create +10 copies of this by adding it to an array of zeros. + +Mutations can be handled similarly:: + + >>> t_m = msprime.MutationTable() + >>> site = np.arange(m, dtype=np.int32) + >>> d = ord("1") + np.zeros(m, dtype=np.int8) + >>> off = np.arange(m + 1, dtype=np.uint32) + >>> node = np.zeros(m, dtype=np.int32) + >>> t_m.set_columns(site=site, node=node, derived_state=d, derived_state_offset=off) + >>> print(t_m) + id site node derived_state parent metadata + 0 0 0 1 -1 + 1 1 0 1 -1 + 2 2 0 1 -1 + 3 3 0 1 -1 + 4 4 0 1 -1 + 5 5 0 1 -1 + 6 6 0 1 -1 + 7 7 0 1 -1 + 8 8 0 1 -1 + 9 9 0 1 -1 + >>> + + +.. _sec_tables_api_binary_columns: ++++++++++++++ Binary columns ++++++++++++++ -Columns storing binary data are handled in a very similar manner to -:ref:`sec-tables-api-text-columns`. The difference between the two is +Columns storing binary data take the same approach as +:ref:`sec-tables-api-text-columns` to encoding +:ref:`variable length data `. +The difference between the two is only raw :class:`bytes` values are accepted: no character encoding or decoding is done on the data. Consider the following example:: diff --git a/msprime/tables.py b/msprime/tables.py index eec16a1cd..e4c478bb4 100644 --- a/msprime/tables.py +++ b/msprime/tables.py @@ -105,10 +105,10 @@ class NodeTable(_msprime.NodeTable): :ivar population: The array of population IDs. :vartype population: numpy.ndarray, dtype=np.int32 :ivar metadata: The flattened array of binary metadata values. See - :ref:`sec-encoding-ragged-columns` for more details. + :ref:`sec_tables_api_binary_columns` for more details. :vartype metadata: numpy.ndarray, dtype=np.int8 :ivar metadata_offset: The array of offsets into the metadata column. See - :ref:`sec-encoding-ragged-columns` for more details. + :ref:`sec_tables_api_binary_columns` for more details. :vartype metadata_offset: numpy.ndarray, dtype=np.uint32 """ @@ -197,6 +197,7 @@ def set_columns( which is equal to the number of nodes the table will contain. The ``metadata`` and ``metadata_offset`` must be supplied together, and meet the requirements for :ref:`sec-encoding-ragged-columns`. + See :ref:`sec_tables_api_binary_columns` :param flags: The bitwise flags for each node. Required. :type flags: numpy.ndarray, dtype=np.uint32 From 22bd4658a5013a2ff2519a10c93bf67c3775d50d Mon Sep 17 00:00:00 2001 From: Jerome Kelleher Date: Mon, 15 Jan 2018 11:51:01 +0000 Subject: [PATCH 05/10] Changed sphinx labels to use _ rather than -. This is so that vim will recognise the labels as identifiers and tab complete them. --- docs/_build/.README | 1 - docs/api.rst | 32 +++++++------- docs/cli.rst | 20 ++++----- docs/interchange.rst | 100 +++++++++++++++++++++--------------------- docs/introduction.rst | 8 ++-- docs/tutorial.rst | 10 ++--- msprime/stats.py | 2 +- msprime/tables.py | 22 +++++----- msprime/trees.py | 38 ++++++++-------- 9 files changed, 117 insertions(+), 116 deletions(-) delete mode 100644 docs/_build/.README diff --git a/docs/_build/.README b/docs/_build/.README deleted file mode 100644 index d43aca390..000000000 --- a/docs/_build/.README +++ /dev/null @@ -1 +0,0 @@ -Empty file to force git to include this directory. diff --git a/docs/api.rst b/docs/api.rst index ec7a4fe0d..9c9cc8e29 100644 --- a/docs/api.rst +++ b/docs/api.rst @@ -1,11 +1,11 @@ -.. _sec-api: +.. _sec_api: ================= API Documentation ================= This is the API documentation for ``msprime``, and provides detailed information -on the Python programming interface. See the :ref:`sec-tutorial` for an +on the Python programming interface. See the :ref:`sec_tutorial` for an introduction to using this API to run simulations and analyse the results. **************** @@ -181,11 +181,11 @@ Loading data There are several methods for loading data into the msprime API. The simplest and most convenient is the use the :func:`msprime.load` function to load -a :ref:`HDF ancestry file `. For small scale data +a :ref:`HDF ancestry file `. For small scale data and debugging, it is often convenient to use the :func:`msprime.load_text` -to read data in the :ref:`text file format `. +to read data in the :ref:`text file format `. The :func:`msprime.load_tables` function efficiently loads large volumes -of data using the :ref:`Tables API `. +of data using the :ref:`Tables API `. .. autofunction:: msprime.load @@ -219,14 +219,14 @@ population genetics statistics from a given :class:`.TreeSequence`. :members: -.. _sec-tables-api: +.. _sec_tables_api: *********** Tables API *********** -The :ref:`tables API ` provides an efficient way of working -with and interchanging :ref:`tree sequence data `. Each table +The :ref:`tables API ` provides an efficient way of working +with and interchanging :ref:`tree sequence data `. Each table class (e.g, :class:`.NodeTable`, :class:`.EdgeTable`) has a specific set of columns with fixed types, and a set of methods for setting and getting the data in these columns. The number of rows in the table ``t`` is given by ``len(t)``. @@ -271,7 +271,7 @@ computations using the :mod:`multiprocessing` module). :: 1 1.00000000 2.00000000 9 11 However, pickling will not be as efficient as storing tables -in the native :ref:`HDF5 format `. +in the native :ref:`HDF5 format `. Tables support the equality operator ``==`` based on the data held in the columns:: @@ -292,13 +292,13 @@ held in the columns:: -.. _sec-tables-api-text-columns: +.. _sec_tables_api_text_columns: ++++++++++++ Text columns ++++++++++++ -As described in the :ref:`sec-encoding-ragged-columns`, working with +As described in the :ref:`sec_encoding_ragged_columns`, working with variable length columns is somewhat more involved. Columns encoding text data store the **encoded bytes** of the flattened strings, and the offsets into this column in two separate @@ -415,8 +415,8 @@ Binary columns ++++++++++++++ Columns storing binary data take the same approach as -:ref:`sec-tables-api-text-columns` to encoding -:ref:`variable length data `. +:ref:`sec_tables_api_text_columns` to encoding +:ref:`variable length data `. The difference between the two is only raw :class:`bytes` values are accepted: no character encoding or decoding is done on the data. Consider the following example:: @@ -444,7 +444,7 @@ decoding is done on the data. Consider the following example:: Here we add two rows to a :class:`.NodeTable`, with different -:ref:`metadata `. The first row contains a simple +:ref:`metadata `. The first row contains a simple byte string, and the second contains a Python dictionary serialised using :mod:`pickle`. We then show several different (and seemingly incompatible!) different views on the same data. @@ -459,11 +459,11 @@ When we print the table, however, we see some data which is seemingly unrelated to the original contents. This is because the binary data is `base64 encoded `_ to ensure that it is print-safe (and doesn't break your terminal). (See the -:ref:`sec-metadata-definition` section for more information on the +:ref:`sec_metadata_definition` section for more information on the use of base64 encoding.). Finally, when we print the ``metadata`` column, we see the raw byte values -encoded as signed integers. As for :ref:`sec-tables-api-text-columns`, +encoded as signed integers. As for :ref:`sec_tables_api_text_columns`, the ``metadata_offset`` column encodes the offsets into this array. So, we see that the metadata value is 9 bytes long and the second is 24. diff --git a/docs/cli.rst b/docs/cli.rst index 6e2be2e6b..df50e48ae 100644 --- a/docs/cli.rst +++ b/docs/cli.rst @@ -1,28 +1,28 @@ -.. _sec-cli: +.. _sec_cli: ====================== Command line interface ====================== -Two command-line applications are provided with ``msprime``: :ref:`sec-msp` and -:ref:`sec-mspms`. The :command:`msp` program is an experimental interface for +Two command_line applications are provided with ``msprime``: :ref:`sec_msp` and +:ref:`sec_mspms`. The :command:`msp` program is an experimental interface for interacting with the library, and is a POSIX compliant command line interface. The :command:`mspms` program is a fully-:command:`ms` compatible interface. This is useful for those who wish to get started quickly with using the library, and also as a means of plugging ``msprime`` into existing work flows. However, there is a substantial overhead involved in translating data from ``msprime``'s native history file into legacy formats, and so new code -should use the :ref:`Python API ` where possible. +should use the :ref:`Python API ` where possible. -.. _sec-msp: +.. _sec_msp: *** msp *** The ``msp`` program provides a convenient interface to the :ref:`msprime API -`. It is based on subcommands that either generate or consume a -:ref:`history file `. The ``simulate`` subcommand runs a +`. It is based on subcommands that either generate or consume a +:ref:`history file `. The ``simulate`` subcommand runs a simulation storing the results in a file. The other commands are concerned with converting this file into other formats. @@ -54,7 +54,7 @@ to the file provided as an argument. -.. _sec-msp-upgrade: +.. _sec_msp_upgrade: +++++++++++ msp upgrade @@ -105,7 +105,7 @@ sequence in newick format. .. todo:: Document the nodes, edges, sites and mutations commands. -.. _sec-mspms: +.. _sec_mspms: ***** mspms @@ -116,7 +116,7 @@ command line interface to the ``msprime`` library. This interface should be useful for legacy applications, where it can be used as a drop-in replacement for :command:`ms`. This interface is not recommended for new applications, particularly if the simulated trees are required as part of the output -as Newick is very inefficient. The :ref:`Python API ` is the recommended interface, +as Newick is very inefficient. The :ref:`Python API ` is the recommended interface, providing direct access to the structures used within ``msprime``. diff --git a/docs/interchange.rst b/docs/interchange.rst index 2d4b04ec1..0f5d83dc0 100644 --- a/docs/interchange.rst +++ b/docs/interchange.rst @@ -1,4 +1,4 @@ -.. _sec-interchange: +.. _sec_interchange: ######################### Tree sequence interchange @@ -19,7 +19,7 @@ we describe the HDF5-based file format using by msprime to efficiently store tree sequences in the `HDF5 file format`_ section. -.. _sec-data-model: +.. _sec_data_model: ********** Data model @@ -47,7 +47,7 @@ sample Those nodes in the tree that we have obtained data from. These are distinguished from other nodes by the fact that a tree sequence *must* describe the genealogical history of all samples at every point on the - genome. (See :ref:`sec-node-table-definition` for information on how the sample + genome. (See :ref:`sec_node_table_definition` for information on how the sample status a node is encoded in the ``flags`` column.) edge @@ -101,7 +101,7 @@ Table definitions ================= -.. _sec-node-table-definition: +.. _sec_node_table_definition: Node Table ---------- @@ -118,7 +118,7 @@ Column Type Description flags uint32 Bitwise flags. time double Birth time of node population int32 Birth population of node. -metadata binary Node :ref:`sec-metadata-definition` +metadata binary Node :ref:`sec_metadata_definition` ================ ============== =========== The ``time`` column records the birth time of the individual in question, @@ -134,14 +134,14 @@ a particular node as a sample means, for example, that the mutational state of the node will be included in the genotypes produced by :meth:``TreeSequence.variants``. -For convenience, the :ref:`text format ` for nodes +For convenience, the :ref:`text format ` for nodes decomposes the ``flags`` value into it's separate values. Thus, in the text format we have a column for ``is_sample``, which corresponds to the the ``flags`` column in the underlying table. As more flags values are defined, these will be added to the text file format. The ``metadata`` column provides a location for client code to store -information about each node. See the :ref:`sec-metadata-definition` section for +information about each node. See the :ref:`sec_metadata_definition` section for more details on how metadata columns should be used. .. note:: @@ -152,7 +152,7 @@ more details on how metadata columns should be used. not necessary for the core tree sequence algorithms. -.. _sec-edge-table-definition: +.. _sec_edge_table_definition: Edge Table ---------- @@ -174,10 +174,10 @@ Each row in an edge table describes the half-open genomic interval affected ``[left, right)``, the ``parent`` and the ``child`` on that interval. The ``left`` and ``right`` columns are defined using double precision floating point values for flexibility. The ``parent`` and ``child`` -columns specify integer IDs in the associated :ref:`sec-node-table-definition`. +columns specify integer IDs in the associated :ref:`sec_node_table_definition`. -.. _sec-site-table-definition: +.. _sec_site_table_definition: Site Table ---------- @@ -192,7 +192,7 @@ Column Type Description ================ ============== =========== position double Position of site in genome coordinates. ancestral_state text The state at the root of the tree. -metadata binary Site :ref:`sec-metadata-definition`. +metadata binary Site :ref:`sec_metadata_definition`. ================ ============== =========== The ``position`` column is a floating point value defining the location @@ -203,11 +203,11 @@ of the tree, thus defining the state that nodes inherit (unless mutations occur). The column stores text character data of arbitrary length. The ``metadata`` column provides a location for client code to store -information about each site. See the :ref:`sec-metadata-definition` section for +information about each site. See the :ref:`sec_metadata_definition` section for more details on how metadata columns should be used. -.. _sec-mutation-table-definition: +.. _sec_mutation_table_definition: Mutation Table -------------- @@ -219,11 +219,11 @@ site int32 The ID of the site the mutation occurs a node int32 The node this mutation occurs at. parent int32 The ID of the parent mutation. derived_state char The mutational state at the defined node. -metadata char Mutation :ref:`sec-metadata-definition`. +metadata char Mutation :ref:`sec_metadata_definition`. ================ ============== =========== -.. _sec-migration-table-definition: +.. _sec_migration_table_definition: Migration Table --------------- @@ -267,7 +267,7 @@ time double Time of migration event. -.. _sec-provenance-table-definition: +.. _sec_provenance_table_definition: Provenance Table ---------------- @@ -280,7 +280,7 @@ record char Provenance record. ================ ============== =========== -.. _sec-metadata-definition: +.. _sec_metadata_definition: Metadata ======== @@ -300,7 +300,7 @@ Arbitrary binary data can be stored in ``metadata`` columns, and the ``msprime`` library makes no attempt to interpret this information. How the information held in this field is encoded is entirely the choice of client code. -To ensure that metadata can be safely interchanged using the :ref:`sec-text-file-format`, +To ensure that metadata can be safely interchanged using the :ref:`sec_text_file_format`, each row is `base 64 encoded `_. Thus, binary information can be safely printed and exchanged, but may not be human readable. @@ -312,7 +312,7 @@ human readable. is self-describing. -.. _sec-valid-tree-sequence-requirements: +.. _sec_valid_tree_sequence_requirements: Valid tree sequence requirements ================================ @@ -320,7 +320,7 @@ Valid tree sequence requirements **Explain and list the requirements for a set of tables to form a valid tree sequence**. -.. _sec-structural-requirements: +.. _sec_structural_requirements: Structural requirements ----------------------- @@ -339,7 +339,7 @@ and for algorithmic reasons: 4. Node times must be strictly greater than zero. -.. _sec-ordering-requirements: +.. _sec_ordering_requirements: Ordering requirements --------------------- @@ -369,7 +369,7 @@ To allow for efficent algorithms, it is required that 8. Sites are sorted by increasing position, 9. and mutations are sorted by site. -.. _sec-text-file-format: +.. _sec_text_file_format: ***************** Text file formats @@ -391,7 +391,7 @@ ignored (IDs are always determined by the position of the row in a table). with (say) 4 nodes and two trees, and include a picture. This example can also be used in the binary interchange section also. -.. _sec-node-text-format: +.. _sec_node_text_format: Node text format ================ @@ -399,7 +399,7 @@ Node text format The node text format must contain the columns ``is_sample`` and ``time``. Optionally, there may also be a ``population`` and ``metadata`` columns. See the :ref:`node table definitions -` for details on these columns. +` for details on these columns. Note that we do not have a ``flags`` column in the text file format, but instead use ``is_sample`` (which may be 0 or 1). Currently, ``IS_SAMPLE`` is @@ -420,15 +420,15 @@ An example node table:: 0 0.253 0 -.. _sec-edge-text-format: +.. _sec_edge_text_format: Edge text format ================ The edge text format must contain the columns ``left``, ``right``, ``parent`` and ``child``. -See the :ref:`edge table definitions -` for details on these columns. +See the :ref:`edge table definitions ` +for details on these columns. An example edge table:: @@ -440,15 +440,16 @@ An example edge table:: 0 2 8 2 -.. _sec-site-text-format: +.. _sec_site_text_format: Site text format ================ The site text format must contain the columns ``position`` and ``ancestral_state``. The ``metadata`` column may also be optionally -present. See the :ref:`site table definitions -` for details on these columns. +present. See the +:ref:`site table definitions ` +for details on these columns. sites:: @@ -456,15 +457,16 @@ sites:: 0.1 A 8.5 AT -.. _sec-mutation-text-format: +.. _sec_mutation_text_format: Mutation text format ==================== The mutation text format must contain the columns ``site``, ``node`` and ``derived_state``. The ``parent`` and ``metadata`` columns -may also be optionally present. See the :ref:`mutation table definitions -` for details on these columns. +may also be optionally present. See the +:ref:`mutation table definitions ` +for details on these columns. mutations:: @@ -474,14 +476,14 @@ mutations:: 1 0 A -.. _sec-binary-interchange: +.. _sec_binary_interchange: ****************** Binary interchange ****************** In this section we describe the high-level details of the API for interchanging -table data via numpy arrays. Please see the :ref:`sec-tables-api` for detailed +table data via numpy arrays. Please see the :ref:`sec_tables_api` for detailed description of the functions and methods. The tables API is based on **columnar** storage of the data. In memory, each @@ -498,20 +500,20 @@ efficiently. We use the `numpy Array API `_ to allow us to define and work with numeric arrays of the required types. Node IDs, for example, are defined using 32 bit integers. Thus, the -``parent`` column of an :ref:`sec-edge-table-definition`'s with ``n`` rows +``parent`` column of an :ref:`sec_edge_table_definition`'s with ``n`` rows is a block ``4n`` bytes. This approach is very straightforward for columns in which each row contains a fixed number of values. However, dealing with columns containing a **variable** number of values is more problematic. -.. _sec-encoding-ragged-columns: +.. _sec_encoding_ragged_columns: Encoding ragged columns ======================= A **ragged** column is a column in which the rows are not of a fixed length. -For example, :ref:`sec-metadata-definition` columns contain binary of data of arbitrary +For example, :ref:`sec_metadata_definition` columns contain binary of data of arbitrary length. To encode such columns in the tables API, we store **two** columns: one contains the flattened array of data and another stores the **offsets** of each row into this flattened array. Consider an example:: @@ -536,7 +538,7 @@ of each row into this flattened array. Consider an example:: >>> s.ancestral_state[s.ancestral_state_offset[2]: s.ancestral_state_offset[3]].tobytes() b'TTT' -In this example we create a :ref:`sec-site-table-definition` with four rows, +In this example we create a :ref:`sec_site_table_definition` with four rows, and then print out this table. We can see that the second row has the empty string as its ``ancestral_state``, and the third row's ``ancestral_state`` is ``TTT``. When we print out the tables ``ancestral_state`` @@ -553,10 +555,10 @@ individual rows. For a row ``j``:: gives us the array of bytes for the ancestral state in that row. Note that for a table with ``n`` rows, any offset column must have ``n + 1`` -values. The values in this column must be non-decreasing, and cannot exceed +values. The values in this column must be nondecreasing, and cannot exceed the length of the ragged column in question. -.. _sec-hdf5-file-format: +.. _sec_hdf5_file_format: **************** HDF5 file format @@ -580,7 +582,7 @@ with nodes, the ``metadata`` column in the node table will be empty, and the corresponding ``metadata`` dataset will not be present in the HDF5 file. Variable length data is handled in the same manner as the -:ref:`Tables API ` +:ref:`Tables API ` above: we store two arrays, one containing the flattened data, and another storing offsets into this array. @@ -600,7 +602,7 @@ Path Type Dim Description Nodes group =========== -The ``/nodes`` group stores the :ref:`sec-node-table-definition`. +The ``/nodes`` group stores the :ref:`sec_node_table_definition`. ======================= ============== Path Type @@ -615,7 +617,7 @@ Path Type Edges group =========== -The ``/edges`` group stores the :ref:`sec-edge-table-definition`. +The ``/edges`` group stores the :ref:`sec_edge_table_definition`. =================== ============== Path Type @@ -645,7 +647,7 @@ Path Type Sites group =========== -The sites group stores the :ref:`sec-site-table-definition`. +The sites group stores the :ref:`sec_site_table_definition`. ============================= ============== Path Type @@ -660,7 +662,7 @@ Path Type Mutations group =============== -The mutations group stores the :ref:`sec-mutation-table-definition`. +The mutations group stores the :ref:`sec_mutation_table_definition`. =============================== ============== Path Type @@ -677,7 +679,7 @@ Path Type Migrations group ================ -The ``/migrations`` group stores the :ref:`sec-migration-table-definition`. +The ``/migrations`` group stores the :ref:`sec_migration_table_definition`. =================== ============== Path Type @@ -693,7 +695,7 @@ Path Type Provenances group ================= -The provenances group stores the :ref:`sec-provenance-table-definition`. +The provenances group stores the :ref:`sec_provenance_table_definition`. =============================== ============== Path Type @@ -709,7 +711,7 @@ Legacy Versions =============== Tree sequence files written by older versions of msprime are not readable by -newer versions of msprime. For major releases of msprime, :ref:`sec-msp-upgrade` +newer versions of msprime. For major releases of msprime, :ref:`sec_msp_upgrade` will convert older tree sequence files to the latest version. However many changes to the tree sequence format are not part of major diff --git a/docs/introduction.rst b/docs/introduction.rst index 1259bf064..b62fd74bd 100644 --- a/docs/introduction.rst +++ b/docs/introduction.rst @@ -1,4 +1,4 @@ -.. _sec-introduction: +.. _sec_introduction: ============ Introduction @@ -18,9 +18,9 @@ scenarios. The library is a reimplementation of Hudson's seminal chromosome sized regions for hundreds of thousands of samples. 2. ``msprime`` is primarily designed to be used through its - :ref:`Python API ` to simplify the workflow associated with + :ref:`Python API ` to simplify the workflow associated with running and analysing simulations. (However, we do provide an - ``ms``-compatible :ref:`command line interface ` to + ``ms``_compatible :ref:`command line interface ` to plug in to existing workflows.) For many simulations we first write a script to generate the command line parameters we want to run, then fork shell processes to run the simulations, @@ -31,7 +31,7 @@ scenarios. The library is a reimplementation of Hudson's seminal 3. ``msprime`` does not use Newick trees for interchange as they are extremely inefficient in terms of the time required to generate and parse, as well as the space required to store them. - Instead, we use a :ref:`well-defined ` format using the + Instead, we use a :ref:`well_defined ` format using the powerful `HDF5 `_ standard. This format allows us to store genealogical data very concisely, particularly for large sample sizes. diff --git a/docs/tutorial.rst b/docs/tutorial.rst index e2aeb4797..147268c2f 100644 --- a/docs/tutorial.rst +++ b/docs/tutorial.rst @@ -1,12 +1,12 @@ -.. _sec-tutorial: +.. _sec_tutorial: ======== Tutorial ======== This is the tutorial for the Python interface to the ``msprime`` -library. Detailed :ref:`sec-api` is also available for this -library. An :program:`ms`-compatible :ref:`command line interface ` +library. Detailed :ref:`sec_api` is also available for this +library. An :program:`ms`_compatible :ref:`command line interface ` is also available if you wish to use ``msprime`` directly within an existing work flow. @@ -113,7 +113,7 @@ The ``length`` parameter specifies the length of the simulated sequence in bases, and may be a floating point number. If ``length`` is not supplied, it is assumed to be 1. The ``recombination_rate`` parameter specifies the rate of crossing over per base per generation, -and is zero by default. See the :ref:`sec-api` for a discussion of the precise +and is zero by default. See the :ref:`sec_api` for a discussion of the precise recombination model used. Here, we simulate the trees across over a 10kb region with a recombination @@ -732,7 +732,7 @@ used here.) :width: 800px :alt: An example LD matrix plot. -.. _sec-tutorial-threads: +.. _sec_tutorial_threads: ******************** Working with threads diff --git a/msprime/stats.py b/msprime/stats.py index de17dd862..b53e9eea2 100644 --- a/msprime/stats.py +++ b/msprime/stats.py @@ -50,7 +50,7 @@ class LdCalculator(object): This class supports multithreaded access using the Python :mod:`threading` module. Separate instances of :class:`.LdCalculator` referencing the same tree sequence can operate in parallel in multiple threads. - See the :ref:`sec-tutorial-threads` section in the :ref:`sec-tutorial` + See the :ref:`sec_tutorial_threads` section in the :ref:`sec_tutorial` for an example of how use multiple threads to calculate LD values efficiently. diff --git a/msprime/tables.py b/msprime/tables.py index e4c478bb4..a56f7e7f1 100644 --- a/msprime/tables.py +++ b/msprime/tables.py @@ -87,9 +87,9 @@ def text_decode_metadata(encoded): class NodeTable(_msprime.NodeTable): """ A table defining the nodes in a tree sequence. See the - :ref:`definitions ` for details on the columns + :ref:`definitions ` for details on the columns in this table and the - :ref:`tree sequence requirements ` section + :ref:`tree sequence requirements ` section for the properties needed for a node table to be a part of a valid tree sequence. :warning: The numpy arrays returned by table attribute accesses are **copies** @@ -196,7 +196,7 @@ def set_columns( The ``flags``, ``time`` and ``population`` arrays must all be of the same length, which is equal to the number of nodes the table will contain. The ``metadata`` and ``metadata_offset`` must be supplied together, and - meet the requirements for :ref:`sec-encoding-ragged-columns`. + meet the requirements for :ref:`sec_encoding_ragged_columns`. See :ref:`sec_tables_api_binary_columns` :param flags: The bitwise flags for each node. Required. @@ -229,7 +229,7 @@ def append_columns( The ``flags``, ``time`` and ``population`` arrays must all be of the same length, which is equal to the number of nodes that will be added to the table. The ``metadata`` and ``metadata_offset`` must be supplied together, and - meet the requirements for :ref:`sec-encoding-ragged-columns`. + meet the requirements for :ref:`sec_encoding_ragged_columns`. :param flags: The bitwise flags for each node. Required. :type flags: numpy.ndarray, dtype=np.uint32 @@ -268,9 +268,9 @@ def _pickle_node_table(table): class EdgeTable(_msprime.EdgeTable): """ A table defining the edges in a tree sequence. See the - :ref:`definitions ` for details on the columns + :ref:`definitions ` for details on the columns in this table and the - :ref:`tree sequence requirements ` section + :ref:`tree sequence requirements ` section for the properties needed for an edge table to be a part of a valid tree sequence. :warning: The numpy arrays returned by table attribute accesses are **copies** @@ -413,9 +413,9 @@ def _edge_table_pickle(table): class MigrationTable(_msprime.MigrationTable): """ A table defining the migrations in a tree sequence. See the - :ref:`definitions ` for details on the columns + :ref:`definitions ` for details on the columns in this table and the - :ref:`tree sequence requirements ` section + :ref:`tree sequence requirements ` section for the properties needed for a migration table to be a part of a valid tree sequence. @@ -582,9 +582,9 @@ def _migration_table_pickle(table): class SiteTable(_msprime.SiteTable): """ A table defining the sites in a tree sequence. See the - :ref:`definitions ` for details on the columns + :ref:`definitions ` for details on the columns in this table and the - :ref:`tree sequence requirements ` section + :ref:`tree sequence requirements ` section for the properties needed for a site table to be a part of a valid tree sequence. @@ -1010,7 +1010,7 @@ def sort_tables( provenances=None, edge_start=0): """ Sorts the given tables **in place**, ensuring that all tree - sequence :ref:`ordering requirements ` are + sequence :ref:`ordering requirements ` are met. If the ``edge_start`` parameter is provided, this specifies the index diff --git a/msprime/trees.py b/msprime/trees.py index 7c5e1bb70..97034455d 100644 --- a/msprime/trees.py +++ b/msprime/trees.py @@ -802,7 +802,7 @@ def __str__(self): def load(path): """ Loads a tree sequence from the specified file path. This file must be in the - :ref:`HDF5 file format ` produced by the + :ref:`HDF5 file format ` produced by the :meth:`.TreeSequence.dump` method. :param str path: The file path of the HDF5 file containing the @@ -821,7 +821,7 @@ def load_tables( Loads the tree sequence data from the specified table objects, and returns the resulting :class:`.TreeSequence` object. These tables must fulfil the properties required for an input tree sequence as - described in the :ref:`sec-valid-tree-sequence-requirements` section. + described in the :ref:`sec_valid_tree_sequence_requirements` section. The ``sequence_length`` parameter determines the :attr:`.TreeSequence.sequence_length` of the returned tree sequence. If it @@ -829,19 +829,19 @@ def load_tables( coordinate of the input edges. This parameter is useful in degenerate situations (such as when there are zero edges), but can usually be ignored. - :param NodeTable nodes: The :ref:`node table ` + :param NodeTable nodes: The :ref:`node table ` (required). - :param EdgeTable edges: The :ref:`edge table ` + :param EdgeTable edges: The :ref:`edge table ` (required). :param MigrationTable migrations: The :ref:`migration table - ` (optional). - :param SiteTable sites: The :ref:`site table ` + ` (optional). + :param SiteTable sites: The :ref:`site table ` (optional; but if supplied, ``mutations`` must also be specified). :param MutationTable mutations: The :ref:`mutation table - ` (optional; but if supplied, ``sites`` + ` (optional; but if supplied, ``sites`` must also be specified). :param ProvenanceTable provenances: The :ref:`provenance table - ` (optional). + ` (optional). :param float sequence_length: The sequence length of the returned tree sequence. If not supplied or zero this will be inferred from the set of edges. :return: A :class:`.TreeSequence` consistent with the specified tables. @@ -864,9 +864,9 @@ def parse_nodes(source, strict=True): """ Parse the specified file-like object containing a whitespace delimited description of a node table and returns the corresponding :class:`NodeTable` - instance. See the :ref:`node text format ` section + instance. See the :ref:`node text format ` section for the details of the required format and the - :ref:`node table definition ` section for the + :ref:`node table definition ` section for the required properties of the contents. See :func:`.load_text` for a detailed explanation of the ``strict`` parameter. @@ -916,9 +916,9 @@ def parse_edges(source, strict=True): """ Parse the specified file-like object containing a whitespace delimited description of a edge table and returns the corresponding :class:`EdgeTable` - instance. See the :ref:`edge text format ` section + instance. See the :ref:`edge text format ` section for the details of the required format and the - :ref:`edge table definition ` section for the + :ref:`edge table definition ` section for the required properties of the contents. See :func:`.load_text` for a detailed explanation of the ``strict`` parameter. @@ -953,9 +953,9 @@ def parse_sites(source, strict=True): """ Parse the specified file-like object containing a whitespace delimited description of a site table and returns the corresponding :class:`SiteTable` - instance. See the :ref:`site text format ` section + instance. See the :ref:`site text format ` section for the details of the required format and the - :ref:`site table definition ` section for the + :ref:`site table definition ` section for the required properties of the contents. See :func:`.load_text` for a detailed explanation of the ``strict`` parameter. @@ -993,9 +993,9 @@ def parse_mutations(source, strict=True): """ Parse the specified file-like object containing a whitespace delimited description of a mutation table and returns the corresponding :class:`MutationTable` - instance. See the :ref:`mutation text format ` section + instance. See the :ref:`mutation text format ` section for the details of the required format and the - :ref:`mutation table definition ` section for the + :ref:`mutation table definition ` section for the required properties of the contents. See :func:`.load_text` for a detailed explanation of the ``strict`` parameter. @@ -1044,10 +1044,10 @@ def load_text(nodes, edges, sites=None, mutations=None, sequence_length=0, stric """ Parses the tree sequence data from the specified file-like objects, and returns the resulting :class:`.TreeSequence` object. The format - for these files is documented in the :ref:`sec-text-file-format` section, + for these files is documented in the :ref:`sec_text_file_format` section, and is produced by the :meth:`.TreeSequence.dump_text` method. Further properties required for an input tree sequence are described in the - :ref:`sec-valid-tree-sequence-requirements` section. This method is intended as a + :ref:`sec_valid_tree_sequence_requirements` section. This method is intended as a convenient interface for importing external data into msprime; the HDF5 based file format using by :meth:`msprime.load` is many times more efficient than this text format. @@ -1075,7 +1075,7 @@ def load_text(nodes, edges, sites=None, mutations=None, sequence_length=0, stric After parsing the tables, :func:`sort_tables` is called to ensure that the loaded tables satisfy the tree sequence :ref:`ordering requirements - `. Note that this may result in the IDs of various + `. Note that this may result in the IDs of various entities changing from their positions in the input file. :param stream nodes: The file-like object containing text describing a From 2d20f56a44c067b40e07660c46c6deba243dff25 Mon Sep 17 00:00:00 2001 From: Jerome Kelleher Date: Mon, 15 Jan 2018 12:08:11 +0000 Subject: [PATCH 06/10] Documented SiteTable API. --- msprime/tables.py | 57 +++++++++++++++++++++++++++++++++++++++++------ 1 file changed, 50 insertions(+), 7 deletions(-) diff --git a/msprime/tables.py b/msprime/tables.py index a56f7e7f1..fd6cec0fe 100644 --- a/msprime/tables.py +++ b/msprime/tables.py @@ -195,9 +195,9 @@ def set_columns( The ``flags``, ``time`` and ``population`` arrays must all be of the same length, which is equal to the number of nodes the table will contain. The - ``metadata`` and ``metadata_offset`` must be supplied together, and + ``metadata`` and ``metadata_offset`` parameters must be supplied together, and meet the requirements for :ref:`sec_encoding_ragged_columns`. - See :ref:`sec_tables_api_binary_columns` + See :ref:`sec_tables_api_binary_columns` for more information. :param flags: The bitwise flags for each node. Required. :type flags: numpy.ndarray, dtype=np.uint32 @@ -228,8 +228,9 @@ def append_columns( The ``flags``, ``time`` and ``population`` arrays must all be of the same length, which is equal to the number of nodes that will be added to the table. The - ``metadata`` and ``metadata_offset`` must be supplied together, and + ``metadata`` and ``metadata_offset`` parameters must be supplied together, and meet the requirements for :ref:`sec_encoding_ragged_columns`. + See :ref:`sec_tables_api_binary_columns` for more information. :param flags: The bitwise flags for each node. Required. :type flags: numpy.ndarray, dtype=np.uint32 @@ -597,13 +598,16 @@ class SiteTable(_msprime.SiteTable): :ivar position: The array of site position coordinates. :vartype position: numpy.ndarray, dtype=np.float64 :ivar ancestral_state: The flattened array of ancestral state strings. + See :ref:`sec_tables_api_text_columns` for more details. :vartype ancestral_state: numpy.ndarray, dtype=np.int8 :ivar ancestral_state_offset: The offsets of rows in the ancestral_state - array. + array. See :ref:`sec_tables_api_text_columns` for more details. :vartype ancestral_state_offset: numpy.ndarray, dtype=np.uint32 - :ivar metadata: The flattened array of metadata binary byte strings. + :ivar metadata: The flattened array of binary metadata values. See + :ref:`sec_tables_api_binary_columns` for more details. :vartype metadata: numpy.ndarray, dtype=np.int8 - :ivar metadata_offset: The offsets of rows in the metadata array. + :ivar metadata_offset: The array of offsets into the metadata column. See + :ref:`sec_tables_api_binary_columns` for more details. :vartype metadata_offset: numpy.ndarray, dtype=np.uint32 """ @@ -682,7 +686,8 @@ def add_row(self, position, ancestral_state, metadata=None): :param float position: The position of this site in genome coordinates. :param str ancestral_state: The state of this site at the root of the tree. - :param bytes metadata: The binary metadata for this site. + :param bytes metadata: The binary-encoded metadata for the new node. If not + specified or None, a zero-length byte string is stored. :return: The ID of the newly added site. :rtype: int """ @@ -698,9 +703,27 @@ def set_columns( The ``position``, ``ancestral_state`` and ``ancestral_state_offset`` parameters are mandatory, and must be 1D numpy arrays. The length of the ``position`` array determines the number of rows in table. + The ``ancestral_state`` and ``ancestral_state_offset`` parameters must + be supplied together, and meet the requirements for + :ref:`sec_encoding_ragged_columns` (see + :ref:`sec_tables_api_text_columns` for more information). The + ``metadata`` and ``metadata_offset`` parameters must be supplied + together, and meet the requirements for + :ref:`sec_encoding_ragged_columns` (see + :ref:`sec_tables_api_binary_columns` for more information). :param position: The position of each site in genome coordinates. :type position: numpy.ndarray, dtype=np.float64 + :param ancestral_state: The flattened ancestral_state array. Required. + :type ancestral_state: numpy.ndarray, dtype=np.int8 + :param ancestral_state_offset: The offsets into the ``ancestral_state`` array. + :type ancestral_state_offset: numpy.ndarray, dtype=np.uint32. + :param metadata: The flattened metadata array. Must be specified along + with ``metadata_offset``. If not specified or None, an empty metadata + value is stored for each node. + :type metadata: numpy.ndarray, dtype=np.int8 + :param metadata_offset: The offsets into the ``metadata`` array. + :type metadata_offset: numpy.ndarray, dtype=np.uint32. """ super(SiteTable, self).set_columns( position, ancestral_state=ancestral_state, @@ -718,7 +741,27 @@ def append_columns( parameters are mandatory, and must be 1D numpy arrays. The length of the ``position`` array determines the number of additional rows to add the table. + The ``ancestral_state`` and ``ancestral_state_offset`` parameters must + be supplied together, and meet the requirements for + :ref:`sec_encoding_ragged_columns` (see + :ref:`sec_tables_api_text_columns` for more information). The + ``metadata`` and ``metadata_offset`` parameters must be supplied + together, and meet the requirements for + :ref:`sec_encoding_ragged_columns` (see + :ref:`sec_tables_api_binary_columns` for more information). + :param position: The position of each site in genome coordinates. + :type position: numpy.ndarray, dtype=np.float64 + :param ancestral_state: The flattened ancestral_state array. Required. + :type ancestral_state: numpy.ndarray, dtype=np.int8 + :param ancestral_state_offset: The offsets into the ``ancestral_state`` array. + :type ancestral_state_offset: numpy.ndarray, dtype=np.uint32. + :param metadata: The flattened metadata array. Must be specified along + with ``metadata_offset``. If not specified or None, an empty metadata + value is stored for each node. + :type metadata: numpy.ndarray, dtype=np.int8 + :param metadata_offset: The offsets into the ``metadata`` array. + :type metadata_offset: numpy.ndarray, dtype=np.uint32. """ super(SiteTable, self).append_columns( position, ancestral_state=ancestral_state, From 9bf2d3075e585874b3d62c7b1f37d59f75a8de57 Mon Sep 17 00:00:00 2001 From: Jerome Kelleher Date: Mon, 15 Jan 2018 12:38:57 +0000 Subject: [PATCH 07/10] Documented MutationTable API and encoding functions. --- _msprimemodule.c | 4 +- docs/api.rst | 1 - msprime/tables.py | 187 +++++++++++++++++++++++++++++++++++++++++----- 3 files changed, 172 insertions(+), 20 deletions(-) diff --git a/_msprimemodule.c b/_msprimemodule.c index 5b7d8c9db..5abbe035a 100644 --- a/_msprimemodule.c +++ b/_msprimemodule.c @@ -2607,7 +2607,7 @@ MutationTable_add_row(MutationTable *self, PyObject *args, PyObject *kwds) int parent = MSP_NULL_MUTATION; char *derived_state; Py_ssize_t derived_state_length; - PyObject *py_metadata = NULL; + PyObject *py_metadata = Py_None; char *metadata = NULL; Py_ssize_t metadata_length = 0; static char *kwlist[] = {"site", "node", "derived_state", "parent", "metadata", NULL}; @@ -2620,7 +2620,7 @@ MutationTable_add_row(MutationTable *self, PyObject *args, PyObject *kwds) if (MutationTable_check_state(self) != 0) { goto out; } - if (py_metadata != NULL) { + if (py_metadata != Py_None) { if (PyBytes_AsStringAndSize(py_metadata, &metadata, &metadata_length) < 0) { goto out; } diff --git a/docs/api.rst b/docs/api.rst index 9c9cc8e29..0ebbcdeb6 100644 --- a/docs/api.rst +++ b/docs/api.rst @@ -490,7 +490,6 @@ Table classes :members: .. autoclass:: msprime.ProvenanceTable - :members: +++++++++++++++ Table functions diff --git a/msprime/tables.py b/msprime/tables.py index fd6cec0fe..813212e7b 100644 --- a/msprime/tables.py +++ b/msprime/tables.py @@ -783,22 +783,39 @@ def _site_table_pickle(table): class MutationTable(_msprime.MutationTable): """ - Class for tables describing all mutations that have occurred in a tree - sequence, of the form + A table defining the mutations in a tree sequence. See the + :ref:`definitions ` for details on the columns + in this table and the + :ref:`tree sequence requirements ` section + for the properties needed for a mutation table to be a part of a valid tree + sequence. - site node derived_state - 0 4 1 - 1 3 1 - 1 2 0 + :warning: The numpy arrays returned by table attribute accesses are **copies** + of the underlying data. In particular, this means that you cannot edit + the values in the columns by updating the attribute arrays. - Here ``site`` is the index in the SiteTable of the site at which the - mutation occurred, ``node`` is the index in the NodeTable of the node who - is the first node inheriting the mutation, and ``derived_state`` is the - allele resulting from this mutation. + **NOTE:** this behaviour may change in future. - It is required that mutations occurring at the same node are sorted in - reverse time order. + :ivar site: The array of site IDs. + :vartype site: numpy.ndarray, dtype=np.int32 + :ivar node: The array of node IDs. + :vartype node: numpy.ndarray, dtype=np.int32 + :ivar derived_state: The flattened array of derived state strings. + See :ref:`sec_tables_api_text_columns` for more details. + :vartype derived_state: numpy.ndarray, dtype=np.int8 + :ivar derived_state_offset: The offsets of rows in the derived_state + array. See :ref:`sec_tables_api_text_columns` for more details. + :vartype derived_state_offset: numpy.ndarray, dtype=np.uint32 + :ivar parent: The array of parent mutation IDs. + :vartype parent: numpy.ndarray, dtype=np.int32 + :ivar metadata: The flattened array of binary metadata values. See + :ref:`sec_tables_api_binary_columns` for more details. + :vartype metadata: numpy.ndarray, dtype=np.int8 + :ivar metadata_offset: The array of offsets into the metadata column. See + :ref:`sec_tables_api_binary_columns` for more details. + :vartype metadata_offset: numpy.ndarray, dtype=np.uint32 """ + def __str__(self): site = self.site node = self.node @@ -868,6 +885,109 @@ def reset(self): # Deprecated alias for clear self.clear() + def add_row(self, site, node, derived_state, parent=-1, metadata=None): + """ + Adds a new row to this :class:`MutationTable` and returns the ID of the + corresponding mutation. + + :param int site: The ID of the site that this mutation occurs at. + :param int node: The ID of the first node inheriting this mutation. + :param str derived_state: The state of the site at this mutation's node. + :param int parent: The ID of the parent mutation. If not specified, + defaults to the :attr:`NULL_MUTATION`. + :param bytes metadata: The binary-encoded metadata for the new node. If not + specified or None, a zero-length byte string is stored. + :return: The ID of the newly added mutation. + :rtype: int + """ + return super(MutationTable, self).add_row( + site, node, derived_state, parent, metadata) + + def set_columns( + self, site, node, derived_state, derived_state_offset, + parent=None, metadata=None, metadata_offset=None): + """ + Sets the values for each column in this :class:`.MutationTable` using the values + in the specified arrays. Overwrites any data currently stored in the table. + + The ``site``, ``node``, ``derived_state`` and ``derived_state_offset`` + parameters are mandatory, and must be 1D numpy arrays. The + ``site`` and ``node`` (also ``parent``, if supplied) arrays + must be of equal length, and determine the number of rows in the table. + The ``derived_state`` and ``derived_state_offset`` parameters must + be supplied together, and meet the requirements for + :ref:`sec_encoding_ragged_columns` (see + :ref:`sec_tables_api_text_columns` for more information). The + ``metadata`` and ``metadata_offset`` parameters must be supplied + together, and meet the requirements for + :ref:`sec_encoding_ragged_columns` (see + :ref:`sec_tables_api_binary_columns` for more information). + + :param site: The ID of the site each mutation occurs at. + :type site: numpy.ndarray, dtype=np.int32 + :param node: The ID of the node each mutation is associated with. + :type node: numpy.ndarray, dtype=np.int32 + :param derived_state: The flattened derived_state array. Required. + :type derived_state: numpy.ndarray, dtype=np.int8 + :param derived_state_offset: The offsets into the ``derived_state`` array. + :type derived_state_offset: numpy.ndarray, dtype=np.uint32. + :param parent: The ID of the parent mutation for each mutation. + :type parent: numpy.ndarray, dtype=np.int32 + :param metadata: The flattened metadata array. Must be specified along + with ``metadata_offset``. If not specified or None, an empty metadata + value is stored for each node. + :type metadata: numpy.ndarray, dtype=np.int8 + :param metadata_offset: The offsets into the ``metadata`` array. + :type metadata_offset: numpy.ndarray, dtype=np.uint32. + """ + super(MutationTable, self).set_columns( + site=site, node=node, parent=parent, + derived_state=derived_state, derived_state_offset=derived_state_offset, + metadata=metadata, metadata_offset=metadata_offset) + + def append_columns( + self, site, node, derived_state, derived_state_offset, + parent=None, metadata=None, metadata_offset=None): + """ + Appends the specified arrays to the end of the columns of this + :class:`MutationTable`. This allows many new rows to be added at once. + + The ``site``, ``node``, ``derived_state`` and ``derived_state_offset`` + parameters are mandatory, and must be 1D numpy arrays. The + ``site`` and ``node`` (also ``parent``, if supplied) arrays + must be of equal length, and determine the number of additional + rows to add to the table. + The ``derived_state`` and ``derived_state_offset`` parameters must + be supplied together, and meet the requirements for + :ref:`sec_encoding_ragged_columns` (see + :ref:`sec_tables_api_text_columns` for more information). The + ``metadata`` and ``metadata_offset`` parameters must be supplied + together, and meet the requirements for + :ref:`sec_encoding_ragged_columns` (see + :ref:`sec_tables_api_binary_columns` for more information). + + :param site: The ID of the site each mutation occurs at. + :type site: numpy.ndarray, dtype=np.int32 + :param node: The ID of the node each mutation is associated with. + :type node: numpy.ndarray, dtype=np.int32 + :param derived_state: The flattened derived_state array. Required. + :type derived_state: numpy.ndarray, dtype=np.int8 + :param derived_state_offset: The offsets into the ``derived_state`` array. + :type derived_state_offset: numpy.ndarray, dtype=np.uint32. + :param parent: The ID of the parent mutation for each mutation. + :type parent: numpy.ndarray, dtype=np.int32 + :param metadata: The flattened metadata array. Must be specified along + with ``metadata_offset``. If not specified or None, an empty metadata + value is stored for each node. + :type metadata: numpy.ndarray, dtype=np.int8 + :param metadata_offset: The offsets into the ``metadata`` array. + :type metadata_offset: numpy.ndarray, dtype=np.uint32. + """ + super(MutationTable, self).append_columns( + site=site, node=node, parent=parent, + derived_state=derived_state, derived_state_offset=derived_state_offset, + metadata=metadata, metadata_offset=metadata_offset) + # Pickle support. See copyreg registration for this function below. def _mutation_table_pickle(table): @@ -885,7 +1005,8 @@ def _mutation_table_pickle(table): class ProvenanceTable(_msprime.ProvenanceTable): """ - TODO Document + .. todo:: + This class is provisional, and the API may change in the future. """ def add_row(self, record, timestamp=None): """ @@ -1154,7 +1275,13 @@ def simplify_tables( def pack_bytes(data): """ Packs the specified list of bytes into a flattened numpy array of 8 bit integers - and corresponding lengths. + and corresponding offsets. See :ref:`sec_encoding_ragged_columns` for details + of this encoding. + + :param list[bytes] data: The list of bytes values to encode. + :return: The tuple (packed, offset) of numpy arrays representing the flattened + input data and offsets. + :rtype: numpy.array (dtype=np.int8), numpy.array (dtype=np.uint32). """ n = len(data) offsets = np.zeros(n + 1, dtype=np.uint32) @@ -1169,7 +1296,13 @@ def pack_bytes(data): def unpack_bytes(packed, offset): """ Unpacks a list of bytes from the specified numpy arrays of packed byte - data and corresponding offsets. + data and corresponding offsets. See :ref:`sec_encoding_ragged_columns` for details + of this encoding. + + :param numpy.ndarray packed: The flattened array of byte values. + :param numpy.ndarray offset: The array of offsets into the ``packed`` array. + :return: The list of bytes values unpacked from the parameter arrays. + :rtype: list[bytes] """ # This could be done a lot more efficiently... ret = [] @@ -1182,7 +1315,17 @@ def unpack_bytes(packed, offset): def pack_strings(strings, encoding="utf8"): """ Packs the specified list of strings into a flattened numpy array of 8 bit integers - and corresponding lengths. + and corresponding offsets using the specified text encoding. + See :ref:`sec_encoding_ragged_columns` for details of this encoding of + columns of variable length data. + + :param list[str] data: The list of strings to encode. + :param str encoding: The text encoding to use when converting string data + to bytes. See the :mod:`codecs` module for information on available + string encodings. + :return: The tuple (packed, offset) of numpy arrays representing the flattened + input data and offsets. + :rtype: numpy.array (dtype=np.int8), numpy.array (dtype=np.uint32). """ return pack_bytes([bytearray(s.encode(encoding)) for s in strings]) @@ -1190,6 +1333,16 @@ def pack_strings(strings, encoding="utf8"): def unpack_strings(packed, offset, encoding="utf8"): """ Unpacks a list of strings from the specified numpy arrays of packed byte - data and corresponding offsets. + data and corresponding offsets using the specified text encoding. + See :ref:`sec_encoding_ragged_columns` for details of this encoding of + columns of variable length data. + + :param numpy.ndarray packed: The flattened array of byte values. + :param numpy.ndarray offset: The array of offsets into the ``packed`` array. + :param str encoding: The text encoding to use when converting string data + to bytes. See the :mod:`codecs` module for information on available + string encodings. + :return: The list of strings unpacked from the parameter arrays. + :rtype: list[str] """ return [b.decode(encoding) for b in unpack_bytes(packed, offset)] From 946a0c38c5127ffb8e2cbd8e413ce5a2dbd5fef8 Mon Sep 17 00:00:00 2001 From: Jerome Kelleher Date: Mon, 15 Jan 2018 16:33:17 +0000 Subject: [PATCH 08/10] Added tree sequence requirements section. --- docs/interchange.rst | 240 ++++++++++++++++++++++++++++++++----------- msprime/tables.py | 5 +- msprime/trees.py | 4 +- 3 files changed, 186 insertions(+), 63 deletions(-) diff --git a/docs/interchange.rst b/docs/interchange.rst index 0f5d83dc0..18209f5d1 100644 --- a/docs/interchange.rst +++ b/docs/interchange.rst @@ -89,8 +89,14 @@ Sequence length .. todo:: Define migration and provenance types. -A tree sequence can be stored in a collection of six tables: Node, Edge, Site, -Mutation, Migration, and Provenance. The first two store the genealogical +A tree sequence can be stored in a collection of six tables: +:ref:`Node `, +:ref:`Edge `, +:ref:`Site `, +:ref:`Mutation `, +:ref:`Migration `, and +:ref:`Provenance `. +The first two store the genealogical relationships that define the trees; the next two describe where mutations fall on those trees; the Migration table describes how lineages move across space; and the Provenance table contains information on where the data came from. @@ -132,12 +138,15 @@ is composed of 32 bitwise boolean values. Currently, the only flag defined is ``IS_SAMPLE = 1``, which defines the sample status of nodes. Marking a particular node as a sample means, for example, that the mutational state of the node will be included in the genotypes produced by -:meth:``TreeSequence.variants``. +:meth:`.TreeSequence.variants`. + +See the :ref:`sec_node_requirements` section for details on the properties +required for a valid set of nodes. For convenience, the :ref:`text format ` for nodes -decomposes the ``flags`` value into it's separate values. Thus, in the +decomposes the ``flags`` value into its separate values. Thus, in the text format we have a column for ``is_sample``, which corresponds to the -the ``flags`` column in the underlying table. As more flags values are +``flags`` column in the underlying table. As more flags values are defined, these will be added to the text file format. The ``metadata`` column provides a location for client code to store @@ -171,11 +180,13 @@ child int32 Child node ID. ================ ============== =========== Each row in an edge table describes the half-open genomic interval -affected ``[left, right)``, the ``parent`` and the ``child`` on that interval. +affected ``[left, right)``, and the ``parent`` and ``child`` on that interval. The ``left`` and ``right`` columns are defined using double precision floating point values for flexibility. The ``parent`` and ``child`` columns specify integer IDs in the associated :ref:`sec_node_table_definition`. +See the :ref:`sec_edge_requirements` section for details on the properties +required for a valid set of edges. .. _sec_site_table_definition: @@ -206,12 +217,18 @@ The ``metadata`` column provides a location for client code to store information about each site. See the :ref:`sec_metadata_definition` section for more details on how metadata columns should be used. +See the :ref:`sec_site_requirements` section for details on the properties +required for a valid set of sites. .. _sec_mutation_table_definition: Mutation Table -------------- +A **mutation** defines a change of mutational state on a tree at a particular site. +The mutation table contains five columns, of which ``site``, ``node`` and +``derived_state`` are mandatory. + ================ ============== =========== Column Type Description ================ ============== =========== @@ -222,37 +239,42 @@ derived_state char The mutational state at the defined node metadata char Mutation :ref:`sec_metadata_definition`. ================ ============== =========== +The ``site`` column is an integer value defining the ID of the +:ref:`site ` at which this mutation occured. -.. _sec_migration_table_definition: +The ``node`` column is an integer value defining the ID of the +first :ref:`node ` in the tree to inherit this mutation. -Migration Table ---------------- +The ``derived_state`` column specifies the mutational state at the specified node, +thus defining the state that nodes in the subtree inherit (unless further mutations +occur). The column stores text character data of arbitrary length. -In simulations, trees can be thought of as spread across space, and it is -helpful for inferring demographic history to record this history. This is -stored using the following type. +The ``parent`` column is an integer value defining the ID of the +mutation from which this mutation inherits. If there is no mutation at the +site in question on the path back to root, then this field is set to the +null ID (-1). (The ``parent`` column is only required in situations +where there are multiple mutations at a given site. For +simple infinite sites mutations, it can be ignored.) -migration - Migrations are performed by individual ancestors, but most likely not by an - individual tracked as a ``node`` (as in a discrete-deme model they are - unlikely to be both a migrant and a most recent common ancestor). So, - ``msprime`` records when a segment of ancestry has moved between - populations:: +The ``metadata`` column provides a location for client code to store +information about each site. See the :ref:`sec_metadata_definition` section for +more details on how metadata columns should be used. - left right node source dest time - 0.0 0.3 3 0 1 2.1 +See the :ref:`sec_mutation_requirements` section for details on the properties +required for a valid set of mutations. - This ``migration`` records that the ancestor who was alive 2.1 time units - in the past from which ``node`` 3 inherited the segment of genome between - 0.0 and 0.3 migrated from population 0 to population 1. +.. _sec_migration_table_definition: -A valid ``migration``: +Migration Table +--------------- -1. Has ``time`` strictly between the time of its ``node`` and the time of any - ancestral node from which that node inherits on the segment ``[left, - right)``. -2. Has the ``population`` of any such ancestor matching ``source``, if another - ``migration`` does not intervene. +In simulations, trees can be thought of as spread across space, and it is +helpful for inferring demographic history to record this history. +Migrations are performed by individual ancestors, but most likely not by an +individual tracked as a ``node`` (as in a discrete-deme model they are +unlikely to be both a migrant and a most recent common ancestor). So, +``msprime`` records when a segment of ancestry has moved between +populations. ================ ============== =========== Column Type Description @@ -266,12 +288,29 @@ time double Time of migration event. ================ ============== =========== +The ``left`` and ``right`` columns are floating point values defining the +half-open segment of genome affected. + + +.. todo:: + Document the remaining fields + +See the :ref:`sec_migration_requirements` section for details on the properties +required for a valid set of mutations. + +.. This ``migration`` records that the ancestor who was alive 2.1 time units +.. in the past from which ``node`` 3 inherited the segment of genome between +.. 0.0 and 0.3 migrated from population 0 to population 1. + .. _sec_provenance_table_definition: Provenance Table ---------------- +.. todo:: + Document the provenance table. + ================ ============== =========== Column Type Description ================ ============== =========== @@ -317,57 +356,140 @@ human readable. Valid tree sequence requirements ================================ -**Explain and list the requirements for a set of tables to form a valid tree -sequence**. +Arbitrary data can be stored in tables using the classes in the +:ref:`sec_tables_api`. However, only tables that fulfil a set of +requirements represent a valid :class:`.TreeSequence` object, and +can be loaded using :func:`.load_tables`. In this +section we list these requirements, and explain their rationale. +Violations of most of these requirements are detected when the +user attempts to load a tree sequence via :func:`.load` or +:func:`.load_tables`, raising an informative error message. Some +more complex requirements may not be detectable at load-time, +and errors may not occur until certain operations are attempted. +These are documented below. + + +.. _sec_node_requirements: + +Node requirements +----------------- + +Nodes are the most basic type in a tree sequence, and are not defined with +respect to any other tables. Therefore, the requirements for nodes are +trivial. + +- Node times must be non-negative. + +There are no requirements regarding the ordering of nodes with respect to time +or any other field. Sorting a set of tables using :func:`.sort_tables` has +no effect on the nodes. + +.. _sec_edge_requirements: + +Edge requirements +----------------- + +Given a valid set of nodes and a sequence length :math:`L`, the simple +requirements for each edge are: -.. _sec_structural_requirements: +- We must have :math:`0 \leq` ``left`` :math:`<` ``right`` :math:`\leq L`; +- ``parent`` and ``child`` must be valid node IDs; +- ``time[parent]`` > ``time[child]``; +- edges must be unique (i.e., no duplicate edges are allowed). -Structural requirements ------------------------ +The first requirement simply ensures that the interval makes sense. The +third requirement ensures that we cannot have loops, since time is +always increasing as we ascend the tree. +Semantically, to ensure a valid tree sequence there is one further requirement: -1. All birth times must be greater than or equal to zero. +- The set of intervals on which each node is a child must be disjoint. -To disallow time travel and multiple inheritance: +This guarantees that we cannot have contradictory edges (i.e., +where a node ``a`` is a child of both ``b`` and ``c``), and ensures that +at each point on the sequence we have a well-formed forest of trees. +Because this is a more complex semantic requirement, it is **not** detected +at load time. This error is detected during tree traversal, via, e.g., +the :meth:`.TreeSequence.trees` iterator. -1. Offspring must be born after their parents (and hence, no loops). -2. The set of intervals on which each individual is a child must be disjoint. +In the interest of algorithmic efficiency, edges must have the following +sortedness properties: -and for algorithmic reasons: +- All edges for a given parent must be contiguous; +- Edges must be listed in nondecreasing order of ``parent`` time; +- Within the edges for a given ``parent``, edges must be sorted + first by ``child`` ID and then by ``left`` coordinate. -3. The leftmost endpoint of each chromosome is 0.0. -4. Node times must be strictly greater than zero. +Violations of these requirements are detected at load time. +The :func:`.sort_tables` function will ensure that these sortedness +properties are fulfilled. +.. _sec_site_requirements: -.. _sec_ordering_requirements: +Site requirements +----------------- -Ordering requirements +Given a valid set of nodes and a sequence length :math:`L`, the simple +requirements for a valid set of sites are: + +- We must have :math:`0 \leq` ``position`` :math:`< L`; +- ``position`` values must be unique. + +For simplicity and algorithmic efficiency, sites must also: + +- Be sorted in increasing order of ``position``. + +Violations of these requirements are detected at load time. +The :func:`.sort_tables` function ensures that sites are sorted +according to these criteria. + +.. _sec_mutation_requirements: + +Mutation requirements --------------------- -Edges are ordered by +Given a valid set of nodes, edges and sites, the +requirements for a valid set of mutations are: + +- ``site`` must refer to a valid site ID; +- ``node`` must refer to a valid node ID; +- ``parent`` must either be the null ID (-1) or a valid mutation ID within the + current table + +For simplicity and algorithmic efficiency, mutations must also: -- time of parent, then -- parent node ID, then -- child node ID, then -- left endpoint. +- be sorted by site ID; +- when there are multiple mutations per site, parent mutations must occur + **before** their children (i.e. if a mutation with ID :math:`x` has + ``parent`` with ID :math:`y`, then we must have :math:`y < x`). -Sites are ordered by position, and Mutations are ordered by site. +Violations of these sorting requirements are detected at load time. +The :func:`.sort_tables` function ensures that mutationsare sorted +according to these criteria. -5. Edges must be sorted in nondecreasing time order. -6. The set of intervals on which each individual is a parent must be disjoint. +Mutations also have the requirement that they must result in a +change of state. For example, if we have a site with ancestral state +of "A" and a single mutation with derived state "A", then this +mutation does not result in any change of state. This error is +raised at run-time when we reconstruct sample genotypes, for example +in the :meth:`.TreeSequence.variants` iterator. -A set of tables satisfying requirements 1-4 can be transformed into a completely -valid set of tables by applying first ``sort_tables()`` (which ensures 5) -and then ``simplify_tables()`` (which ensures 6). +.. _sec_migration_requirements: -Note that since each node time is equal to the (birth) time of the -corresponding parent, time is measured in clock time (not meioses). +Migration requirements +---------------------- + +.. todo:: + Add requirements for valid migrations. +.. A valid ``migration``: -To allow for efficent algorithms, it is required that +.. 1. Has ``time`` strictly between the time of its ``node`` and the time of any +.. ancestral node from which that node inherits on the segment ``[left, +.. right)``. +.. 2. Has the ``population`` of any such ancestor matching ``source``, if another +.. ``migration`` does not intervene. -8. Sites are sorted by increasing position, -9. and mutations are sorted by site. .. _sec_text_file_format: diff --git a/msprime/tables.py b/msprime/tables.py index 813212e7b..5102f6d29 100644 --- a/msprime/tables.py +++ b/msprime/tables.py @@ -1174,8 +1174,9 @@ def sort_tables( provenances=None, edge_start=0): """ Sorts the given tables **in place**, ensuring that all tree - sequence :ref:`ordering requirements ` are - met. + sequence ordering requirements are met. See + the :ref:`sec_valid_tree_sequence_requirements` section for details on these + requirements. If the ``edge_start`` parameter is provided, this specifies the index in the edge table where sorting should start. Only rows with index diff --git a/msprime/trees.py b/msprime/trees.py index 97034455d..52cb5211e 100644 --- a/msprime/trees.py +++ b/msprime/trees.py @@ -1075,8 +1075,8 @@ def load_text(nodes, edges, sites=None, mutations=None, sequence_length=0, stric After parsing the tables, :func:`sort_tables` is called to ensure that the loaded tables satisfy the tree sequence :ref:`ordering requirements - `. Note that this may result in the IDs of various - entities changing from their positions in the input file. + `. Note that this may result in the + IDs of various entities changing from their positions in the input file. :param stream nodes: The file-like object containing text describing a :class:`.NodeTable`. From 491c94dee2579e178d9a586afaebb037a6d4a512 Mon Sep 17 00:00:00 2001 From: Jerome Kelleher Date: Mon, 15 Jan 2018 17:33:00 +0000 Subject: [PATCH 09/10] Added example of using tables API to tutorial. Closes #292. --- docs/interchange.rst | 2 + docs/tutorial.rst | 134 ++++++++++++++++++++++++++++++++----------- 2 files changed, 103 insertions(+), 33 deletions(-) diff --git a/docs/interchange.rst b/docs/interchange.rst index 18209f5d1..4af7bea7b 100644 --- a/docs/interchange.rst +++ b/docs/interchange.rst @@ -103,6 +103,8 @@ and the Provenance table contains information on where the data came from. In the following sections we define these components of a tree sequence in more detail. +.. _sec_table_definitions: + Table definitions ================= diff --git a/docs/tutorial.rst b/docs/tutorial.rst index 147268c2f..7a9a40c13 100644 --- a/docs/tutorial.rst +++ b/docs/tutorial.rst @@ -849,11 +849,79 @@ Running this example we get:: 100%|████████████████████████████████████████████████| 4045/4045 [00:09<00:00, 440.29it/s] Found LD sites for 4045 doubleton mutations out of 60100 +********************** +Editing tree sequences +********************** + +Sometimes we wish to make some minor modifications to a tree sequence that has +been generated by a simulation. However, tree sequence objects are **immutable** +and so we cannot edit a them in place. The answer is to use the +:ref:`sec_tables_api`: we export the tree sequence to a set of +:ref:`tables `, edit these tables, and then create +a new tree sequence from them. In the following example, we use this approach +to remove all singleton sites from a given tree sequence. + +.. code:: + + def strip_singletons(ts): + sites = msprime.SiteTable() + mutations = msprime.MutationTable() + for tree in ts.trees(): + for site in tree.sites(): + assert len(site.mutations) == 1 # Only supports infinite sites muts. + mut = site.mutations[0] + if tree.num_samples(mut.node) > 1: + site_id = sites.add_row( + position=site.position, + ancestral_state=site.ancestral_state) + mutations.add_row( + site=site_id, node=mut.node, derived_state=mut.derived_state) + tables = ts.dump_tables() + new_ts = msprime.load_tables( + nodes=tables.nodes, edges=tables.edges, sites=sites, mutations=mutations) + return new_ts + + +This function takes a tree sequence containing some infinite sites mutations as +input, and returns a copy in which all singleton sites have been removed. +The approach is very simple: we allocate :class:`.SiteTable` and +:class:`.MutationTable` instances to hold the new sites and mutations that +we define, and then consider each site in turn. If the allele frequency of +the mutation is greater than one, we add the site and mutation to our +output tables using :meth:`.SiteTable.add_row` and :meth:`.MutationTable.add_row`. +(In this case we consider only simple infinite sites mutations, +where we cannot have back or recurrent mutations. These would require a slightly +more involved approach where we keep a map of mutation IDs so that +mutation ``parent`` values could be computed.) + +After considering each site, we then create a new tree sequence using +:func:`.load_tables` using the node and edge tables from the original +tree sequence and the just-created site and mutation tables. Using +this function then, we get:: + + >>> ts = msprime.simulate(10, mutation_rate=10) + >>> ts.num_sites + 50 + >>> ts_new = strip_singletons(ts) + >>> ts_new.num_sites + 44 + >>> + +Thus, we have removed 6 singleton sites from the tree sequence. + +.. todo:: + + Add another example here where we use the array oriented API to edit + the nodes and edges of a tree sequence. Perhaps decapitating would be a + good example? ******************* Working with Tables ******************* +.. todo:: + This section is a work in progress and needs to be revised. + Tables provide a convenient method for viewing, importing and exporting tree sequences. ``msprime`` provides direct access to the the columns of a table as ``numpy`` arrays: for instance, if ``n`` is a ``NodeTable``, then ``n.time`` @@ -986,47 +1054,47 @@ samples:: 2 11 -*************************** -Stuff copied from elsewhere -*************************** +.. *************************** +.. Stuff copied from elsewhere +.. *************************** -In addition to genealogical relationships, ``msprime`` generates and stores -mutations. Associating these with nodes means that a variant shared by many -individuals need only be stored once, allowing retrieval and processing of -variant information much more efficiently than if every individual's genotype -was stored directly. +.. In addition to genealogical relationships, ``msprime`` generates and stores +.. mutations. Associating these with nodes means that a variant shared by many +.. individuals need only be stored once, allowing retrieval and processing of +.. variant information much more efficiently than if every individual's genotype +.. was stored directly. -Rather than storing a position on the genome directly, a ``mutation`` -stores the index of a ``site``, that describes that position. This is to -allow efficient processing of multiple mutations at the same genomic -position. A ``site`` records a position on the genome where a mutation has -occurred along with the ancestral state (i.e., the state at the root of the -tree at that position):: +.. Rather than storing a position on the genome directly, a ``mutation`` +.. stores the index of a ``site``, that describes that position. This is to +.. allow efficient processing of multiple mutations at the same genomic +.. position. A ``site`` records a position on the genome where a mutation has +.. occurred along with the ancestral state (i.e., the state at the root of the +.. tree at that position):: - id position ancestral_state - 0 0.1 0 +.. id position ancestral_state +.. 0 0.1 0 -As with nodes, the ``id`` is not stored directly, but is implied by its -index in the site table. +.. As with nodes, the ``id`` is not stored directly, but is implied by its +.. index in the site table. -This type records a mutation that has occurred at some point in the -genealogical history. Each mutation is associated with a particular -``node`` (i.e., a particular ancestor), so that any sample which inherits -from that node will also inherit that mutation, unless another mutation -intervenes. The type records:: +.. This type records a mutation that has occurred at some point in the +.. genealogical history. Each mutation is associated with a particular +.. ``node`` (i.e., a particular ancestor), so that any sample which inherits +.. from that node will also inherit that mutation, unless another mutation +.. intervenes. The type records:: - site node derived_state - 0 14 1 - -Here ``site`` is the index of the ``site`` at which the mutation occurred, -``node`` records the ID of the ancestral node associated with the mutation, -and ``derived_state`` is the allele that any sample inheriting from that -node at this site will have if another mutation does not intervene. The -``node`` is not necessarily the ancestor in whom the mutation occurred, but -rather the ancestor at the bottom of the branch in the tree at that site on -which the mutation occurred. +.. site node derived_state +.. 0 14 1 + +.. Here ``site`` is the index of the ``site`` at which the mutation occurred, +.. ``node`` records the ID of the ancestral node associated with the mutation, +.. and ``derived_state`` is the allele that any sample inheriting from that +.. node at this site will have if another mutation does not intervene. The +.. ``node`` is not necessarily the ancestor in whom the mutation occurred, but +.. rather the ancestor at the bottom of the branch in the tree at that site on +.. which the mutation occurred. From 4360ca7b2bb32c40a3b3793310b08bd003cb1251 Mon Sep 17 00:00:00 2001 From: Jerome Kelleher Date: Mon, 15 Jan 2018 17:48:00 +0000 Subject: [PATCH 10/10] Updated error messages. Closes #280. --- lib/msprime.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/lib/msprime.c b/lib/msprime.c index 8d22ae391..6ed2776e6 100644 --- a/lib/msprime.c +++ b/lib/msprime.c @@ -150,11 +150,11 @@ msp_strerror(int err) ret = "All edges for a given parent must be contiguous"; break; case MSP_ERR_EDGES_NOT_SORTED_CHILD: - ret = "Edges must be listed in (time[parent], parent, child, left) order;" + ret = "Edges must be listed in (time[parent], child, left) order;" " child order violated"; break; case MSP_ERR_EDGES_NOT_SORTED_LEFT: - ret = "Edges must be listed in (time[parent], parent, child, left) order;" + ret = "Edges must be listed in (time[parent], child, left) order;" " left order violated"; break; case MSP_ERR_NULL_PARENT: