Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Read RI values from comments field into RI field & added unit test #112

Merged
merged 17 commits into from
Jan 3, 2024
Merged
Show file tree
Hide file tree
Changes from 10 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
35 changes: 35 additions & 0 deletions RIAssigner/data/Data.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
from abc import ABC, abstractmethod
from typing import Iterable, List, Optional
import pandas as pd

from pint import Quantity, UnitRegistry
from pint.unit import build_unit_class
Expand All @@ -9,6 +10,7 @@ class Data(ABC):
""" Base class for data managers. """
RetentionTimeType = Optional[float]
RetentionIndexType = Optional[float]
CommentFieldType = Optional[str]
URegistry = UnitRegistry()
Unit = build_unit_class(URegistry)

Expand Down Expand Up @@ -115,3 +117,36 @@ def retention_indices(self, value: Iterable[RetentionIndexType]):
value (Iterable[RetentionIndexType]): Values to assign to property.
"""
...

@property
@abstractmethod
def comment(self) -> Iterable[CommentFieldType]:
    """Getter for `comment` property.

    Abstract: concrete data managers must provide the implementation.

    Returns:
        Iterable[CommentFieldType]: Comment field values stored in data;
            each entry is a string or `None` (`CommentFieldType` is `Optional[str]`).
    """
    ...

def extract_ri_from_comment(self, specific_string):
    """ Extract RI values from the comment field and store them as retention indices.
    Each comment is searched for a token of the form 'specific_string=RI_value'
    (for example 'SemiStdNP=317'), where RI_value is a run of digits. The first such
    token in a comment is used. The extracted values are assigned to
    `self.retention_indices`, one entry per comment; nothing is returned.

    Parameters
    ----------
    specific_string:
        Label that is expected in the comment field directly before '=' and the
        RI value. It is matched literally (regex metacharacters are escaped)
        and as a whole word.

    Notes
    -----
    Entries whose comment is missing, is not a string, or does not contain the
    label followed by '=<digits>' are set to `None` (not NaN), so that setters
    which skip `None` values leave those entries untouched.
    """
    import re  # local import so the method does not depend on a module-level one

    # Escape the label: it is documented as a plain string, not as a pattern.
    pattern = re.compile(rf'\b{re.escape(specific_string)}=(\d+)\b')

    comments = self.comment
    if comments is None:
        # Data types without comment support may expose `None` instead of a list.
        comments = []

    extracted_values = []
    for text in comments:
        match = pattern.search(text) if isinstance(text, str) else None
        extracted_values.append(float(match.group(1)) if match else None)

    self.retention_indices = extracted_values

39 changes: 36 additions & 3 deletions RIAssigner/data/MatchMSData.py
Original file line number Diff line number Diff line change
Expand Up @@ -70,9 +70,9 @@ def _read_retention_indices(self):
""" Read retention indices from spectrum metadata. """
self.retention_indices = [safe_read_key(spectrum, self._ri_key) for spectrum in self._spectra]

def _sort_spectra_by_rt(self):
def _sort_spectra_by_rt(self):
""" Sort objects (peaks) in spectra list by their retention times. """
self._spectra.sort(key=lambda spectrum: safe_read_key(spectrum, self._rt_key))
self._spectra.sort(key=lambda spectrum: safe_read_key(spectrum, self._rt_key) or 0)

def __eq__(self, o: object) -> bool:
"""Comparison operator `==`.
Expand Down Expand Up @@ -116,6 +116,38 @@ def retention_indices(self, values: Iterable[Data.RetentionIndexType]):
else:
raise ValueError('There is different numbers of computed indices and peaks.')

@property
def comment(self) -> Iterable[Data.CommentFieldType]:
    """ Get the comment metadata entry of every spectrum.

    Returns
    -------
    One value per spectrum, as produced by `safe_read_comment_key` for the
    'comment' key.
    """
    # Record the metadata key on the instance before reading the spectra.
    self.comment_keys = "comment"
    comments = []
    for entry in self._spectra:
        comments.append(safe_read_comment_key(entry, self.comment_keys))
    return comments

def safe_read_comment_key(spectrum: Spectrum, key: str) -> Optional[str]:
    """ Look up a metadata key on a spectrum and return its value as a string.
    The value is fetched with `spectrum.get(key, default=None)`. A missing key
    (or a stored `None`) yields `None`; so does a `ValueError` raised while
    converting the value to `str`.

    Parameters
    ----------
    spectrum:
        Spectrum from which to read the key.
    key:
        Key to be read from the spectrum metadata.

    Returns
    -------
    The key's value converted to string or 'None'.
    """
    raw_value = spectrum.get(key, default=None)
    if raw_value is None:
        return None

    try:
        return str(raw_value)
    except ValueError:
        return None
hechth marked this conversation as resolved.
Show resolved Hide resolved


def safe_read_key(spectrum: Spectrum, key: str) -> Optional[float]:
""" Read key from spectrum and convert to float or return 'None'.
Expand Down Expand Up @@ -143,7 +175,6 @@ def safe_read_key(spectrum: Spectrum, key: str) -> Optional[float]:
value = None
return value


def _assign_ri_value(spectrum: Spectrum, key: str, value: Data.RetentionIndexType):
"""Assign RI value to Spectrum object

Expand All @@ -153,4 +184,6 @@ def _assign_ri_value(spectrum: Spectrum, key: str, value: Data.RetentionIndexTyp
"""
if value is not None:
retention_index = ('%f' % float(value)).rstrip('0').rstrip('.')
if key == 'retentionindex':
key = 'retention_index'
spectrum.set(key=key, value=retention_index)
hechth marked this conversation as resolved.
Show resolved Hide resolved
17 changes: 15 additions & 2 deletions RIAssigner/data/PandasData.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,10 @@ def _init_carbon_number_index(self):
def _init_rt_column_info(self):
""" Find key of retention time column and store it. """
self._rt_index = get_first_common_element(self._data.columns, self._rt_possible_keys)
self._rt_position = self._data.columns.tolist().index(self._rt_index)
if self._rt_index is not None:
self._rt_position = self._data.columns.tolist().index(self._rt_index)
else:
self._rt_position = None

def _init_ri_column_info(self):
""" Initialize retention index column name and set its position next to the retention time column. """
Expand All @@ -64,9 +67,11 @@ def _init_ri_indices(self):
self._ri_position = self._rt_position + 1
self._data.insert(loc=self._ri_position, column=self._ri_index, value=None)

# Temporarily adapted to handle the case where _rt_index is None
def _sort_by_rt(self):
""" Sort peaks by their retention times. """
self._data.sort_values(by=self._rt_index, axis=0, inplace=True)
if self._rt_index is not None:
self._data.sort_values(by=self._rt_index, axis=0, inplace=True)

def __eq__(self, o: object) -> bool:
"""Comparison operator `==`.
Expand Down Expand Up @@ -116,3 +121,11 @@ def retention_indices(self, values: Iterable[int]):
values (Iterable[int]): Values to assign.
"""
self._data[self._ri_index] = values

@property
def comment(self) -> Iterable[Data.CommentFieldType]:
    """ Get the values of the 'comment' column as a list."""
    # Record the column name on the instance, then read that column.
    self._comment_keys = "comment"
    comment_column = self._data[self._comment_keys]
    return comment_column.tolist()

5 changes: 4 additions & 1 deletion RIAssigner/data/SimpleData.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
from copy import copy
from typing import Iterable
from typing import Iterable, Optional

from RIAssigner.utils import is_sorted

Expand Down Expand Up @@ -51,3 +51,6 @@ def retention_times(self) -> Iterable[Data.RetentionTimeType]:
def retention_indices(self, values: Iterable[Data.RetentionIndexType]):
raise NotImplementedError()

@property
def comment(self) -> Iterable[Optional[str]]:
    """ Get comments. This implementation always returns `None`."""
    return None
40 changes: 40 additions & 0 deletions tests/data/ri_from_comment/NIST_EI_MS_2mols.msp
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
NAME: Water
FORMULA: H2O
MW: 18
CASNO: 7732185
ID: 9
COMMENT: SemiStdNP=317 StdNP=323/4/2 StdPolar=1053/14/2 ; NIST MS# 7, Seq# M67
NUM PEAKS: 5
STDINCHI: InChI=1S/H2O/h1H2
SMILES: O
16.0 8.99
17.0 211.81
18.0 999.0
19.0 5.0
20.0 3.0

NAME: Methyl Alcohol
FORMULA: CH4O
MW: 32
CASNO: 67561
ID: 32
COMMENT: SemiStdNP=354/16/10 StdNP=379/7/34 StdPolar=903/8/35 ; NIST MS# 229809, Seq# M1806
NUM PEAKS: 16
STDINCHI: InChI=1S/CH4O/c1-2/h2H,1H3
SMILES: CO
2.0 3.0
12.0 2.0
13.0 5.99
14.0 15.99
15.0 122.89
16.0 1.0
17.0 3.0
18.0 6.99
19.0 1.0
28.0 44.96
29.0 444.6
30.0 63.94
31.0 999.0
32.0 742.33
33.0 10.99
34.0 1.0
3 changes: 3 additions & 0 deletions tests/data/ri_from_comment/nist_to_ri_2mols.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
casno,num_peaks,compound_name,retention_index,formula,id,comment,inchi,smiles,nominal_mass,
7732185,5,Water,,H2O,9,"SemiStdNP=317 StdNP=323/4/2 StdPolar=1053/14/2 ; NIST MS# 7, Seq# M67",InChI=1S/H2O/h1H2,O,18
67561,16,Methyl Alcohol,,CH4O,32,"SemiStdNP=354/16/10 StdNP=379/7/34 StdPolar=903/8/35 ; NIST MS# 229809, Seq# M1806","InChI=1S/CH4O/c1-2/h2H,1H3",CO,32
3 changes: 3 additions & 0 deletions tests/data/ri_from_comment/peaks_with_rt_ref_SemiStdNP.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
casno,num_peaks,compound_name,retention_index,formula,id,comment,inchi,smiles,nominal_mass,Unnamed: 10
7732185,5,Water,317.0,H2O,9,"SemiStdNP=317 StdNP=323/4/2 StdPolar=1053/14/2 ; NIST MS# 7, Seq# M67",InChI=1S/H2O/h1H2,O,18,
67561,16,Methyl Alcohol,354.0,CH4O,32,"SemiStdNP=354/16/10 StdNP=379/7/34 StdPolar=903/8/35 ; NIST MS# 229809, Seq# M1806","InChI=1S/CH4O/c1-2/h2H,1H3",CO,32,
43 changes: 43 additions & 0 deletions tests/data/ri_from_comment/peaks_with_rt_ref_SemiStdNP_msp.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
FORMULA: H2O
MW: 18
CASNO: 7732185
ID: 9
COMMENT: SemiStdNP=317 StdNP=323/4/2 StdPolar=1053/14/2 ; NIST MS# 7, Seq# M67
STDINCHI: InChI=1S/H2O/h1H2
SMILES: O
COMPOUND_NAME: Water
RETENTION_INDEX: 317
NUM PEAKS: 5
16.0 8.99
17.0 211.81
18.0 999.0
19.0 5.0
20.0 3.0

FORMULA: CH4O
MW: 32
CASNO: 67561
ID: 32
COMMENT: SemiStdNP=354/16/10 StdNP=379/7/34 StdPolar=903/8/35 ; NIST MS# 229809, Seq# M1806
STDINCHI: InChI=1S/CH4O/c1-2/h2H,1H3
SMILES: CO
COMPOUND_NAME: Methyl Alcohol
RETENTION_INDEX: 354
NUM PEAKS: 16
2.0 3.0
12.0 2.0
13.0 5.99
14.0 15.99
15.0 122.89
16.0 1.0
17.0 3.0
18.0 6.99
19.0 1.0
28.0 44.96
29.0 444.6
30.0 63.94
31.0 999.0
32.0 742.33
33.0 10.99
34.0 1.0

3 changes: 3 additions & 0 deletions tests/data/ri_from_comment/peaks_with_rt_ref_StdNP.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
casno,num_peaks,compound_name,retention_index,formula,id,comment,inchi,smiles,nominal_mass,Unnamed: 10
7732185,5,Water,323.0,H2O,9,"SemiStdNP=317 StdNP=323/4/2 StdPolar=1053/14/2 ; NIST MS# 7, Seq# M67",InChI=1S/H2O/h1H2,O,18,
67561,16,Methyl Alcohol,379.0,CH4O,32,"SemiStdNP=354/16/10 StdNP=379/7/34 StdPolar=903/8/35 ; NIST MS# 229809, Seq# M1806","InChI=1S/CH4O/c1-2/h2H,1H3",CO,32,
43 changes: 43 additions & 0 deletions tests/data/ri_from_comment/peaks_with_rt_ref_StdNP_msp.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
FORMULA: H2O
MW: 18
CASNO: 7732185
ID: 9
COMMENT: SemiStdNP=317 StdNP=323/4/2 StdPolar=1053/14/2 ; NIST MS# 7, Seq# M67
STDINCHI: InChI=1S/H2O/h1H2
SMILES: O
COMPOUND_NAME: Water
RETENTION_INDEX: 323
NUM PEAKS: 5
16.0 8.99
17.0 211.81
18.0 999.0
19.0 5.0
20.0 3.0

FORMULA: CH4O
MW: 32
CASNO: 67561
ID: 32
COMMENT: SemiStdNP=354/16/10 StdNP=379/7/34 StdPolar=903/8/35 ; NIST MS# 229809, Seq# M1806
STDINCHI: InChI=1S/CH4O/c1-2/h2H,1H3
SMILES: CO
COMPOUND_NAME: Methyl Alcohol
RETENTION_INDEX: 379
NUM PEAKS: 16
2.0 3.0
12.0 2.0
13.0 5.99
14.0 15.99
15.0 122.89
16.0 1.0
17.0 3.0
18.0 6.99
19.0 1.0
28.0 44.96
29.0 444.6
30.0 63.94
31.0 999.0
32.0 742.33
33.0 10.99
34.0 1.0

3 changes: 3 additions & 0 deletions tests/data/ri_from_comment/peaks_with_rt_ref_StdPolar.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
casno,num_peaks,compound_name,retention_index,formula,id,comment,inchi,smiles,nominal_mass,Unnamed: 10
7732185,5,Water,1053.0,H2O,9,"SemiStdNP=317 StdNP=323/4/2 StdPolar=1053/14/2 ; NIST MS# 7, Seq# M67",InChI=1S/H2O/h1H2,O,18,
67561,16,Methyl Alcohol,903.0,CH4O,32,"SemiStdNP=354/16/10 StdNP=379/7/34 StdPolar=903/8/35 ; NIST MS# 229809, Seq# M1806","InChI=1S/CH4O/c1-2/h2H,1H3",CO,32,
43 changes: 43 additions & 0 deletions tests/data/ri_from_comment/peaks_with_rt_ref_StdPolar_msp.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
FORMULA: H2O
MW: 18
CASNO: 7732185
ID: 9
COMMENT: SemiStdNP=317 StdNP=323/4/2 StdPolar=1053/14/2 ; NIST MS# 7, Seq# M67
STDINCHI: InChI=1S/H2O/h1H2
SMILES: O
COMPOUND_NAME: Water
RETENTION_INDEX: 1053
NUM PEAKS: 5
16.0 8.99
17.0 211.81
18.0 999.0
19.0 5.0
20.0 3.0

FORMULA: CH4O
MW: 32
CASNO: 67561
ID: 32
COMMENT: SemiStdNP=354/16/10 StdNP=379/7/34 StdPolar=903/8/35 ; NIST MS# 229809, Seq# M1806
STDINCHI: InChI=1S/CH4O/c1-2/h2H,1H3
SMILES: CO
COMPOUND_NAME: Methyl Alcohol
RETENTION_INDEX: 903
NUM PEAKS: 16
2.0 3.0
12.0 2.0
13.0 5.99
14.0 15.99
15.0 122.89
16.0 1.0
17.0 3.0
18.0 6.99
19.0 1.0
28.0 44.96
29.0 444.6
30.0 63.94
31.0 999.0
32.0 742.33
33.0 10.99
34.0 1.0

7 changes: 6 additions & 1 deletion tests/fixtures/mocks/DataStub.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,8 @@ class DataStub(Data):
def __init__(self, retention_times: Iterable[float], retention_indices: Iterable[float]):
self._retention_times = retention_times
self._retention_indices = retention_indices

self._comment = []

def _read(self):
pass

Expand All @@ -29,3 +30,7 @@ def retention_indices(self) -> Iterable[Optional[float]]:
@retention_indices.setter
def retention_indices(self, value: Iterable[float]):
self._retention_indices = value

@property
def comment(self) -> Iterable[Optional[str]]:
    """ Get the comment values held in `self._comment`."""
    return self._comment
Loading
Loading