Skip to content

Commit

Permalink
Merge pull request #116 from RECETOX/refactoring
Browse files Browse the repository at this point in the history
Refactoring data reading and writing
  • Loading branch information
hechth authored Feb 5, 2024
2 parents 52dec89 + fe40202 commit 7684d26
Show file tree
Hide file tree
Showing 27 changed files with 590 additions and 939,196 deletions.
6 changes: 3 additions & 3 deletions .github/workflows/python-package.yml
Original file line number Diff line number Diff line change
Expand Up @@ -14,12 +14,12 @@ jobs:
strategy:
matrix:
os: ['ubuntu-latest', 'macos-latest', 'windows-latest']
python-version: ['3.8', '3.9', '3.10', '3.11']
python-version: ['3.10', '3.11']

steps:
- uses: actions/checkout@v2
- uses: actions/checkout@v3
- name: Set up Python ${{ matrix.python-version }}
uses: actions/setup-python@v2
uses: actions/setup-python@v3
with:
python-version: ${{ matrix.python-version }}
- name: Install dependencies
Expand Down
3 changes: 3 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -132,3 +132,6 @@ tests/test_pandas_data.py

# vscode
.vscode

# poetry
poetry.lock
2 changes: 1 addition & 1 deletion RIAssigner/__version__.py
Original file line number Diff line number Diff line change
@@ -1 +1 @@
__version__ = '0.3.3'
__version__ = '0.4.0'
6 changes: 6 additions & 0 deletions RIAssigner/compute/ComputationMethod.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,12 @@ def _check_data_args(self, query: Data, reference: Data):
raise ValueError("Query data is not defined.")
if reference is None:
raise ValueError("Reference data is not defined.")
if not query.has_retention_times():
raise ValueError("Query data has no retention times.")
if not reference.has_retention_times():
raise ValueError("Reference data has no retention times.")
if not reference.has_retention_indices():
raise ValueError("Reference data has no retention indices.")

def __eq__(self, o: object) -> bool:
return type(o) == type(self)
18 changes: 7 additions & 11 deletions RIAssigner/data/Data.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,16 +3,14 @@
import pandas as pd

from pint import Quantity, UnitRegistry
from pint.unit import build_unit_class


class Data(ABC):
""" Base class for data managers. """
RetentionTimeType = Optional[float]
RetentionIndexType = Optional[float]
RetentionTimeType = float
RetentionIndexType = float
CommentFieldType = Optional[str]
URegistry = UnitRegistry()
Unit = build_unit_class(URegistry)

_rt_possible_keys = {'RT', 'rt', 'rts', 'retention_times', 'retention_time', 'retention', 'time', 'retentiontime'}
_ri_possible_keys = {'RI', 'ri', 'ris', 'retention_indices', 'retention_index', 'kovats', 'retentionindex'}
Expand All @@ -27,7 +25,7 @@ def is_valid(value: Union[RetentionTimeType, RetentionIndexType]) -> bool:
Returns:
bool: State of validity (True/False).
"""
result = value is not None and Data.can_be_float(value) and value >= 0.0
result = value is not None and Data.can_be_float(value) and value > 0
return result

@staticmethod
Expand Down Expand Up @@ -68,7 +66,7 @@ def __init__(self, filename: str, filetype: str, rt_unit: str):
self._filename = filename
self._filetype = filetype
self._rt_unit = rt_unit
self._unit = Data.Unit(self._rt_unit)
self._unit = Data.URegistry(self._rt_unit)

@abstractmethod
def write(self, filename):
Expand Down Expand Up @@ -129,7 +127,7 @@ def has_retention_indices(self) -> bool:
Returns:
bool: True if all retention indices exist, False otherwise.
"""
return all([Data.is_valid(rt) for rt in self.retention_indices])
return len(self.retention_indices) > 0 and all([Data.is_valid(rt) for rt in self.retention_indices])

def has_retention_times(self) -> bool:
"""
Expand All @@ -142,7 +140,7 @@ def has_retention_times(self) -> bool:
Returns:
bool: True if all retention times exist, False otherwise.
"""
return all([Data.is_valid(rt) for rt in self.retention_times])
return len(self.retention_times) > 0 and all([Data.is_valid(rt) for rt in self.retention_times])


@property
Expand All @@ -155,7 +153,7 @@ def comment(self) -> Iterable[CommentFieldType]:
"""
...

def extract_ri_from_comment(self, ri_source: str):
def init_ri_from_comment(self, ri_source: str):
""" Extract RI from comment field.
Extracts the RI from the comment field of the data file. The RI is expected to be
in the format 'ri_source=RI_value'. The function extracts the RI value and
Expand All @@ -168,8 +166,6 @@ def extract_ri_from_comment(self, ri_source: str):
ri_source:
String that is expected to be in the comment field before the RI value.
"""


mask = pd.Series(self.comment).str.contains(rf'\b{ri_source}\b', na=False)
extracted_values = pd.Series(self.comment).str.extract(rf'\b{ri_source}=(\d+)\b')[0].astype(float)
self.retention_indices = extracted_values.where(mask, None).tolist()
Expand Down
67 changes: 32 additions & 35 deletions RIAssigner/data/MatchMSData.py
Original file line number Diff line number Diff line change
@@ -1,18 +1,18 @@
from typing import Iterable, Optional
from typing import Iterable, List, Optional, Tuple
import numpy as np

from matchms import Spectrum
from matchms.exporting import save_as_msp
from matchms.importing import load_from_msp
from matchms.exporting import save_spectra
from matchms.exporting.metadata_export import get_metadata_as_array
from matchms.importing import load_spectra
from RIAssigner.utils import get_first_common_element

from .Data import Data


class MatchMSData(Data):
""" Class to handle data from filetypes which can be imported
using 'matchMS'.
Currently only supports 'msp'.
using 'matchms'.
"""

def __init__(self, filename: str, filetype: str, rt_unit: str):
Expand All @@ -22,7 +22,9 @@ def __init__(self, filename: str, filetype: str, rt_unit: str):
def _read(self):
"""Load data into object and initialize properties.
"""
self._read_spectra(self._filename, self._filetype)
self._spectra = list(load_spectra(self._filename, True, self._filetype))
_, self._keys = get_metadata_as_array(self._spectra)

self._init_rt_key()
self._init_ri_key()

Expand All @@ -32,35 +34,27 @@ def _read(self):
self._read_retention_indices()

def write(self, filename: str):
"""Write data to back to 'msp' file
"""Write data back to the spectra file
Args:
filename (str): Path to filename under which to store the data.
"""
save_as_msp(self._spectra, filename)

def _read_spectra(self, filename: str, filetype: str):
"""Read spectra from 'msp' file into data.
Args:
filename (str): Path to filename from which to load the data.
self._write_RIs_to_spectra()
save_spectra(self._spectra, filename)

Raises:
NotImplementedError: For filetypes other tahn 'msp'.
def _write_RIs_to_spectra(self):
"""Write the RI values stored in the object to the spectra metadata.
"""
if filetype == 'msp':
self._spectra = list(load_from_msp(filename))
else:
raise NotImplementedError("Currently only supports 'msp'.")
list(map(_assign_ri_value, self._spectra, [self._ri_key] * len(self._spectra), self._retention_indices))

def _init_rt_key(self):
""" Identify retention-time key from spectrum metadata. """
rt_key = get_first_common_element(self._rt_possible_keys, self._spectra[0].metadata.keys())
rt_key = get_first_common_element(self._rt_possible_keys, self._keys)
self._rt_key = rt_key or 'retentiontime'

def _init_ri_key(self):
""" Identify retention-index key from spectrum metadata. """
ri_key = get_first_common_element(self._ri_possible_keys, self._spectra[0].metadata.keys())
ri_key = get_first_common_element(self._ri_possible_keys, self._keys)
self._ri_key = ri_key or 'retentionindex'

def _read_retention_times(self):
Expand Down Expand Up @@ -112,9 +106,6 @@ def retention_indices(self, values: Iterable[Data.RetentionIndexType]):
""" Set retention indices. """
if len(values) == len(self._spectra):
self._retention_indices = values
list(
map(_assign_ri_value, self._spectra, [self._ri_key] * len(self._spectra), values)
)
else:
raise ValueError('There is different numbers of computed indices and peaks.')

Expand All @@ -125,11 +116,15 @@ def comment(self) -> Iterable[Data.CommentFieldType]:
content = [spectrum.get(self.comment_keys, default=None) for spectrum in self._spectra]
return content

@property
def spectra_metadata(self) -> Tuple[np.array, List[str]]:
    """Return the metadata of the loaded spectra.

    Delegates to matchms' ``get_metadata_as_array`` on ``self._spectra``
    and returns its result unchanged.

    Returns:
        Per the annotation, a tuple of an array and a list of strings.
        Presumably the list holds the metadata keys (``_read`` stores the
        second element as ``self._keys``) -- confirm against the matchms
        documentation.
    """
    return get_metadata_as_array(self._spectra)


def safe_read_key(spectrum: Spectrum, key: str) -> Optional[float]:
""" Read key from spectrum and convert to float or return 'None'.
def safe_read_key(spectrum: Spectrum, key: str) -> float:
""" Read key from spectrum and convert to float or return 0.0.
Tries to read the given key from the spectrum metadata and convert it to a float.
In case an exception is thrown or the key is not present, returns 'None'.
In case an exception is thrown or the key is not present, returns 0.0.
Parameters
----------
Expand All @@ -140,16 +135,18 @@ def safe_read_key(spectrum: Spectrum, key: str) -> Optional[float]:
Returns
-------
Either the key's value converted to float or 'None'.
Either the key's value converted to float or 0.0.
"""

value = spectrum.get(key, default=None)
if value is not None:
value = spectrum.get(key, default=0.0)
if isinstance(value, str):
try:
value = float(value)
except ValueError:
# RT is in format that can't be converted to float -> set rt to None
value = None
# RT is in format that can't be converted to float -> set rt to 0.0
value = 0.0
if not Data.can_be_float(value):
value = 0.0
return value

def _assign_ri_value(spectrum: Spectrum, key: str, value: Data.RetentionIndexType):
Expand All @@ -159,6 +156,6 @@ def _assign_ri_value(spectrum: Spectrum, key: str, value: Data.RetentionIndexTyp
spectrum (Spectrum): Spectrum to add RI to
value (Data.RetentionIndexType): RI to be added to Spectrum
"""
if value is not None:
if value > 0:
retention_index = ('%f' % float(value)).rstrip('0').rstrip('.')
spectrum.set(key=key, value=retention_index)
17 changes: 13 additions & 4 deletions RIAssigner/data/PandasData.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
from typing import Iterable

from pandas import read_csv
from pandas import read_csv, read_parquet
from RIAssigner.utils import define_separator, get_first_common_element

from .Data import Data
Expand All @@ -23,12 +23,16 @@ def _read(self):
self._init_rt_column_info()
self._init_ri_column_info()
self._init_ri_indices()

self._sort_by_rt()
self._replace_nans_with_0s()

def _read_into_dataframe(self):
""" Read the data from file into dataframe. """
if(self._filetype in ['csv', 'tsv']):
self._data = read_csv(self._filename, sep=None, engine="python")
elif self._filetype == 'parquet':
self._data = read_parquet(self._filename)
else:
raise NotImplementedError("File formats different from ['csv', 'tsv'] are not implemented yet.")

Expand Down Expand Up @@ -72,6 +76,13 @@ def _sort_by_rt(self):
""" Sort peaks by their retention times. """
if self._rt_index is not None:
self._data.sort_values(by=self._rt_index, axis=0, inplace=True)

def _replace_nans_with_0s(self):
""" Replace NaN values with 0s. """
if self._rt_index is not None:
self._data[self._rt_index].fillna(0, inplace=True)
if self._ri_index is not None:
self._data[self._ri_index].fillna(0, inplace=True)

def __eq__(self, o: object) -> bool:
"""Comparison operator `==`.
Expand Down Expand Up @@ -105,9 +116,7 @@ def retention_indices(self) -> Iterable[Data.RetentionIndexType]:
""" Get retention indices from data or computed from carbon numbers. """
if self._carbon_number_index is not None:
return self._ri_from_carbon_numbers()
if not self._data[self._ri_index].isnull().all():
return self._data[self._ri_index]
raise KeyError("Dataset does not contain retention indices!")
return self._data[self._ri_index]

def _ri_from_carbon_numbers(self):
""" Returns the RI of compound based on carbon number. """
Expand Down
6 changes: 0 additions & 6 deletions RIAssigner/data/SimpleData.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,14 +17,8 @@ def __init__(self, retention_times: Iterable[float], rt_unit: str, retention_ind
"""
super().__init__(None, None, rt_unit)

self._validate_input_type(retention_times)

self._read(retention_times, retention_indices)

def _validate_input_type(self, retention_times):
if not isinstance(retention_times, list) or None in retention_times:
raise TypeError("Retention times must be a list and cannot contain None.")

def _read(self, retention_times, retention_indices):
self._retention_times = Data.URegistry.Quantity(retention_times, self._unit)
self._retention_indices = copy(retention_indices)
Expand Down
2 changes: 2 additions & 0 deletions RIAssigner/data/ValidateSimpleData.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,8 @@ def __init__(self, retention_times: Iterable[float], rt_unit: str, retention_ind
self._read(retention_times, retention_indices)

def _validate_input(self, retention_times, retention_indices):
if not isinstance(retention_times, list) or None in retention_times:
raise TypeError("Retention times must be a list and cannot contain None.")
if not all(map(Data.is_valid, retention_times)):
raise ValueError("Retention time data is invalid.")
if not is_sorted(retention_times):
Expand Down
14 changes: 7 additions & 7 deletions pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[tool.poetry]
name = "RIAssigner"
version = '0.3.3'
version = "0.4.0"
description = "Python library for retention index calculation."
authors = ["Helge Hecht <[email protected]>", "Maksym Skoryk <[email protected]>"]

Expand All @@ -19,22 +19,22 @@ classifiers = [
"Intended Audience :: Developers",
"License :: OSI Approved :: MIT License",
"Natural Language :: English",
"Programming Language :: Python :: 3",
"Programming Language :: Python :: 3.7",
"Programming Language :: Python :: 3.8"
"Programming Language :: Python :: 3.10",
]

packages = [
{ include = "RIAssigner" },
]

[tool.poetry.dependencies]
python = "^3.8"
matchms = "^0.14.0, <0.18.0"
python = "^3.10, <3.13"
matchms = "^0.24.1"
numpy = "*"
pandas = "*"
pint = "^0.17, <0.20"
pint = "^0.23"
scipy = "*"
urllib3 = "1.26.15"
fastparquet = "^2023.10.1"

[tool.poetry.group.dev.dependencies]
pytest = "*"
Expand Down
Loading

0 comments on commit 7684d26

Please sign in to comment.