Merge pull request #116 from RECETOX/refactoring

Refactoring data reading and writing
RECETOX · Feb 5, 2024 · 7684d26 · 7684d26
2 parents 52dec89 + fe40202
commit 7684d26
Show file tree

Hide file tree

Showing 27 changed files with 590 additions and 939,196 deletions.
diff --git a/.github/workflows/python-package.yml b/.github/workflows/python-package.yml
@@ -14,12 +14,12 @@ jobs:
     strategy:
       matrix:
         os: ['ubuntu-latest', 'macos-latest', 'windows-latest']
-        python-version: ['3.8', '3.9', '3.10', '3.11']
+        python-version: ['3.10', '3.11']
 
     steps:
-    - uses: actions/checkout@v2
+    - uses: actions/checkout@v3
     - name: Set up Python ${{ matrix.python-version }}
-      uses: actions/setup-python@v2
+      uses: actions/setup-python@v3
       with:
         python-version: ${{ matrix.python-version }}
     - name: Install dependencies

diff --git a/.gitignore b/.gitignore
@@ -132,3 +132,6 @@ tests/test_pandas_data.py
 
 # vscode
 .vscode
+
+# poetry
+poetry.lock
diff --git a/RIAssigner/__version__.py b/RIAssigner/__version__.py
@@ -1 +1 @@
-__version__ = '0.3.3'
+__version__ = '0.4.0'
diff --git a/RIAssigner/compute/ComputationMethod.py b/RIAssigner/compute/ComputationMethod.py
@@ -30,6 +30,12 @@ def _check_data_args(self, query: Data, reference: Data):
             raise ValueError("Query data is not defined.")
         if reference is None:
             raise ValueError("Reference data is not defined.")
+        if not query.has_retention_times():
+            raise ValueError("Query data has no retention times.")
+        if not reference.has_retention_times():
+            raise ValueError("Reference data has no retention times.")
+        if not reference.has_retention_indices():
+            raise ValueError("Reference data has no retention indices.")
 
     def __eq__(self, o: object) -> bool:
         return type(o) == type(self)
diff --git a/RIAssigner/data/Data.py b/RIAssigner/data/Data.py
@@ -3,16 +3,14 @@
 import pandas as pd
 
 from pint import Quantity, UnitRegistry
-from pint.unit import build_unit_class
 
 
 class Data(ABC):
     """ Base class for data managers. """
-    RetentionTimeType = Optional[float]
-    RetentionIndexType = Optional[float]
+    RetentionTimeType = float
+    RetentionIndexType = float
     CommentFieldType = Optional[str]
     URegistry = UnitRegistry()
-    Unit = build_unit_class(URegistry)
 
     _rt_possible_keys = {'RT', 'rt', 'rts', 'retention_times', 'retention_time', 'retention', 'time', 'retentiontime'}
     _ri_possible_keys = {'RI', 'ri', 'ris', 'retention_indices', 'retention_index', 'kovats', 'retentionindex'}
@@ -27,7 +25,7 @@ def is_valid(value: Union[RetentionTimeType, RetentionIndexType]) -> bool:
         Returns:
             bool: State of validity (True/False).
         """
-        result = value is not None and Data.can_be_float(value) and value >= 0.0
+        result = value is not None and Data.can_be_float(value) and value > 0
         return result
 
     @staticmethod
@@ -68,7 +66,7 @@ def __init__(self, filename: str, filetype: str, rt_unit: str):
         self._filename = filename
         self._filetype = filetype
         self._rt_unit = rt_unit
-        self._unit = Data.Unit(self._rt_unit)
+        self._unit = Data.URegistry(self._rt_unit)
 
     @abstractmethod
     def write(self, filename):
@@ -129,7 +127,7 @@ def has_retention_indices(self) -> bool:
         Returns:
             bool: True if all retention indices exist, False otherwise.
         """
-        return all([Data.is_valid(rt) for rt in self.retention_indices])
+        return len(self.retention_indices) > 0 and all([Data.is_valid(rt) for rt in self.retention_indices])
 
     def has_retention_times(self) -> bool:
         """
@@ -142,7 +140,7 @@ def has_retention_times(self) -> bool:
         Returns:
             bool: True if all retention times exist, False otherwise.
         """
-        return all([Data.is_valid(rt) for rt in self.retention_times])
+        return len(self.retention_times) > 0 and all([Data.is_valid(rt) for rt in self.retention_times])
 
 
     @property
@@ -155,7 +153,7 @@ def comment(self) -> Iterable[CommentFieldType]:
         """
         ...
 
-    def extract_ri_from_comment(self, ri_source: str):
+    def init_ri_from_comment(self, ri_source: str):
         """ Extract RI from comment field.
         Extracts the RI from the comment field of the data file. The RI is expected to be
         in the format 'ri_source=RI_value'. The function extracts the RI value and
@@ -168,8 +166,6 @@ def extract_ri_from_comment(self, ri_source: str):
         ri_source:
             String that is expected to be in the comment field before the RI value.
         """
-
-
         mask = pd.Series(self.comment).str.contains(rf'\b{ri_source}\b', na=False)
         extracted_values = pd.Series(self.comment).str.extract(rf'\b{ri_source}=(\d+)\b')[0].astype(float)
         self.retention_indices = extracted_values.where(mask, None).tolist()

diff --git a/RIAssigner/data/MatchMSData.py b/RIAssigner/data/MatchMSData.py
@@ -1,18 +1,18 @@
-from typing import Iterable, Optional
+from typing import Iterable, List, Optional, Tuple
+import numpy as np
 
 from matchms import Spectrum
-from matchms.exporting import save_as_msp
-from matchms.importing import load_from_msp
+from matchms.exporting import save_spectra
+from matchms.exporting.metadata_export import get_metadata_as_array
+from matchms.importing import load_spectra
 from RIAssigner.utils import get_first_common_element
 
 from .Data import Data
 
 
 class MatchMSData(Data):
     """ Class to handle data from filetypes which can be imported
-        using 'matchMS'.
-
-    Currently only supports 'msp'.
+        using 'matchms'.
     """
 
     def __init__(self, filename: str, filetype: str, rt_unit: str):
@@ -22,7 +22,9 @@ def __init__(self, filename: str, filetype: str, rt_unit: str):
     def _read(self):
         """Load data into object and initialize properties.
         """
-        self._read_spectra(self._filename, self._filetype)
+        self._spectra = list(load_spectra(self._filename, True, self._filetype))
+        _, self._keys = get_metadata_as_array(self._spectra)
+
         self._init_rt_key()
         self._init_ri_key()
 
@@ -32,35 +34,27 @@ def _read(self):
         self._read_retention_indices()
 
     def write(self, filename: str):
-        """Write data to back to 'msp' file
+        """Write data back to the spectra file
 
         Args:
             filename (str): Path to filename under which to store the data.
         """
-        save_as_msp(self._spectra, filename)
-
-    def _read_spectra(self, filename: str, filetype: str):
-        """Read spectra from 'msp' file into data.
-
-        Args:
-            filename (str): Path to filename from which to load the data.
+        self._write_RIs_to_spectra()
+        save_spectra(self._spectra, filename)
 
-        Raises:
-            NotImplementedError: For filetypes other tahn 'msp'.
+    def _write_RIs_to_spectra(self):
+        """Write the RI values stored in the object to the spectra metadata.
         """
-        if filetype == 'msp':
-            self._spectra = list(load_from_msp(filename))
-        else:
-            raise NotImplementedError("Currently only supports 'msp'.")
+        list(map(_assign_ri_value, self._spectra, [self._ri_key] * len(self._spectra), self._retention_indices))
 
     def _init_rt_key(self):
         """ Identify retention-time key from spectrum metadata. """
-        rt_key = get_first_common_element(self._rt_possible_keys, self._spectra[0].metadata.keys())
+        rt_key = get_first_common_element(self._rt_possible_keys, self._keys)
         self._rt_key = rt_key or 'retentiontime'
 
     def _init_ri_key(self):
         """ Identify retention-index key from spectrum metadata. """
-        ri_key = get_first_common_element(self._ri_possible_keys, self._spectra[0].metadata.keys())
+        ri_key = get_first_common_element(self._ri_possible_keys, self._keys)
         self._ri_key = ri_key or 'retentionindex'
 
     def _read_retention_times(self):
@@ -112,9 +106,6 @@ def retention_indices(self, values: Iterable[Data.RetentionIndexType]):
         """ Set retention indices. """
         if len(values) == len(self._spectra):
             self._retention_indices = values
-            list(
-                map(_assign_ri_value, self._spectra, [self._ri_key] * len(self._spectra), values)
-            )
         else:
             raise ValueError('There is different numbers of computed indices and peaks.')
 
@@ -125,11 +116,15 @@ def comment(self) -> Iterable[Data.CommentFieldType]:
         content = [spectrum.get(self.comment_keys, default=None) for spectrum in self._spectra]
         return content
 
+    @property
+    def spectra_metadata(self) -> Tuple[np.array, List[str]]:
+        return get_metadata_as_array(self._spectra)
+
 
-def safe_read_key(spectrum: Spectrum, key: str) -> Optional[float]:
-    """ Read key from spectrum and convert to float or return 'None'.
+def safe_read_key(spectrum: Spectrum, key: str) -> float:
+    """ Read key from spectrum and convert to float or return 0.0.
     Tries to read the given key from the spectrum metadata and convert it to a float.
-    In case an exception is thrown or the key is not present, returns 'None'.
+    In case an exception is thrown or the key is not present, returns 0.0.
 
     Parameters
     ----------
@@ -140,16 +135,18 @@ def safe_read_key(spectrum: Spectrum, key: str) -> Optional[float]:
 
     Returns
     -------
-        Either the key's value converted to float or 'None'.
+        Either the key's value converted to float or 0.0.
     """
 
-    value = spectrum.get(key, default=None)
-    if value is not None:
+    value = spectrum.get(key, default=0.0)
+    if isinstance(value, str):
         try:
             value = float(value)
         except ValueError:
-            # RT is in format that can't be converted to float -> set rt to None
-            value = None
+            # RT is in format that can't be converted to float -> set rt to 0.0
+            value = 0.0
+    if not Data.can_be_float(value):
+        value = 0.0
     return value
 
 def _assign_ri_value(spectrum: Spectrum, key: str, value: Data.RetentionIndexType):
@@ -159,6 +156,6 @@ def _assign_ri_value(spectrum: Spectrum, key: str, value: Data.RetentionIndexTyp
         spectrum (Spectrum): Spectrum to add RI to
         value (Data.RetentionIndexType): RI to be added to Spectrum
     """
-    if value is not None:
+    if value > 0:
         retention_index = ('%f' % float(value)).rstrip('0').rstrip('.')
         spectrum.set(key=key, value=retention_index)
diff --git a/RIAssigner/data/PandasData.py b/RIAssigner/data/PandasData.py
@@ -1,6 +1,6 @@
 from typing import Iterable
 
-from pandas import read_csv
+from pandas import read_csv, read_parquet
 from RIAssigner.utils import define_separator, get_first_common_element
 
 from .Data import Data
@@ -23,12 +23,16 @@ def _read(self):
         self._init_rt_column_info()
         self._init_ri_column_info()
         self._init_ri_indices()
+
         self._sort_by_rt()
+        self._replace_nans_with_0s()
 
     def _read_into_dataframe(self):
-        """ Read the data from file into dataframe. """
+        """ Read the data from a 'csv', 'tsv' or 'parquet' file into a dataframe. """
         if(self._filetype in ['csv', 'tsv']):
             self._data = read_csv(self._filename, sep=None, engine="python")
+        elif self._filetype == 'parquet':
+            self._data = read_parquet(self._filename)
         else:
-            raise NotImplementedError("File formats different from ['csv', 'tsv'] are not implemented yet.")
+            raise NotImplementedError("File formats different from ['csv', 'tsv', 'parquet'] are not implemented yet.")
 
@@ -72,6 +76,13 @@ def _sort_by_rt(self):
         """ Sort peaks by their retention times. """
         if self._rt_index is not None:
             self._data.sort_values(by=self._rt_index, axis=0, inplace=True)
+
+    def _replace_nans_with_0s(self):
+        """ Replace NaN values in the RT and RI columns (when present) with 0s. """
+        # Assign the result back: `fillna(inplace=True)` on a selected column may act on a copy.
+        for column in (self._rt_index, self._ri_index):
+            if column is not None:
+                self._data[column] = self._data[column].fillna(0)
 
     def __eq__(self, o: object) -> bool:
         """Comparison operator `==`.
@@ -105,9 +116,7 @@ def retention_indices(self) -> Iterable[Data.RetentionIndexType]:
         """ Get retention indices from data or computed from carbon numbers. """
         if self._carbon_number_index is not None:
             return self._ri_from_carbon_numbers()
-        if not self._data[self._ri_index].isnull().all():
-            return self._data[self._ri_index]
-        raise KeyError("Dataset does not contain retention indices!")
+        return self._data[self._ri_index]
 
     def _ri_from_carbon_numbers(self):
         """ Returns the RI of compound based on carbon number. """

diff --git a/RIAssigner/data/SimpleData.py b/RIAssigner/data/SimpleData.py
@@ -17,14 +17,8 @@ def __init__(self, retention_times: Iterable[float], rt_unit: str, retention_ind
         """
         super().__init__(None, None, rt_unit)
 
-        self._validate_input_type(retention_times)
-
         self._read(retention_times, retention_indices)
 
-    def _validate_input_type(self, retention_times):
-        if not isinstance(retention_times, list) or None in retention_times:
-            raise TypeError("Retention times must be a list and cannot contain None.")
-
     def _read(self, retention_times, retention_indices):
         self._retention_times = Data.URegistry.Quantity(retention_times, self._unit)
         self._retention_indices = copy(retention_indices)

diff --git a/RIAssigner/data/ValidateSimpleData.py b/RIAssigner/data/ValidateSimpleData.py
@@ -19,6 +19,8 @@ def __init__(self, retention_times: Iterable[float], rt_unit: str, retention_ind
         self._read(retention_times, retention_indices)
 
     def _validate_input(self, retention_times, retention_indices):
+        if not isinstance(retention_times, list) or None in retention_times:
+            raise TypeError("Retention times must be a list and cannot contain None.")
         if not all(map(Data.is_valid, retention_times)):
             raise ValueError("Retention time data is invalid.")
         if not is_sorted(retention_times):

diff --git a/pyproject.toml b/pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "RIAssigner"
-version = '0.3.3'
+version = "0.4.0"
 description = "Python library for retention index calculation."
 authors = ["Helge Hecht <[email protected]>", "Maksym Skoryk <[email protected]>"]
 
@@ -19,22 +19,22 @@ classifiers = [
     "Intended Audience :: Developers",
     "License :: OSI Approved :: MIT License",
     "Natural Language :: English",
-    "Programming Language :: Python :: 3",
-    "Programming Language :: Python :: 3.7",
-    "Programming Language :: Python :: 3.8"
+    "Programming Language :: Python :: 3.10",
 ]
 
 packages = [
     { include = "RIAssigner" },
 ]
 
 [tool.poetry.dependencies]
-python = "^3.8"
-matchms = "^0.14.0, <0.18.0"
+python = "^3.10, <3.13"
+matchms = "^0.24.1"
 numpy = "*"
 pandas = "*"
-pint = "^0.17, <0.20"
+pint = "^0.23"
 scipy = "*"
+urllib3 = "1.26.15"
+fastparquet = "^2023.10.1"
 
 [tool.poetry.group.dev.dependencies]
 pytest = "*"