From 8cb23128693973097048a72a78088c69b7d6d0d7 Mon Sep 17 00:00:00 2001
From: wverastegui
Date: Mon, 4 Dec 2023 15:30:11 +0100
Subject: [PATCH] Read RI values from comments field into RI field

---
Review notes (patch re-wrapped; whitespace of context lines reconstructed):
- extract_ri_from_comment: escape the key with re.escape, accept decimal RI
  values, return None for entries without an RI, reject a missing/empty key.
- Data.py uses the standard-library re module for this helper instead of pandas.
- PandasData.ri_from_comment calls the inherited helper via self, like MatchMSData.
- PandasData.py keeps its trailing newline.
- Blob ids on the index lines still refer to the original revision of this patch.

 RIAssigner/data/Data.py        | 38 +++++++++++++++++++++++++++++++++++++-
 RIAssigner/data/MatchMSData.py | 10 ++++++++++
 RIAssigner/data/PandasData.py  | 14 +++++++++-----
 3 files changed, 56 insertions(+), 6 deletions(-)

diff --git a/RIAssigner/data/Data.py b/RIAssigner/data/Data.py
index ba7fe54..2ae6efd 100644
--- a/RIAssigner/data/Data.py
+++ b/RIAssigner/data/Data.py
@@ -1,2 +1,3 @@
+import re
 from abc import ABC, abstractmethod
 from typing import Iterable, List, Optional
@@ -125,4 +126,39 @@ def comment(self) -> Iterable[CommentFieldType]:
         Returns:
             Iterable[CommentFieldType]: Comment field values stored in data.
         """
-        ...
\ No newline at end of file
+        ...
+
+    def extract_ri_from_comment(self, content_comment, specific_string):
+        """ Extract RI values from the comment field.
+
+        Each comment is searched for the first token of the form
+        'specific_string=RI_value', where RI_value is an integer or decimal
+        number. Entries that are not strings or hold no such token yield None.
+
+        Parameters
+        ----------
+        content_comment:
+            Comment field values of the data file.
+        specific_string:
+            Key that is expected in the comment directly before '=RI_value'.
+
+        Returns
+        -------
+        List with one entry per comment: the RI as float, or None if not found.
+
+        Raises
+        ------
+        ValueError
+            If specific_string is None or empty.
+        """
+        if not specific_string:
+            raise ValueError("specific_string must be a non-empty string.")
+
+        # Escape the key so that regex metacharacters in it are matched literally.
+        pattern = re.compile(rf'\b{re.escape(specific_string)}=(\d+(?:\.\d+)?)\b')
+
+        ri_values = []
+        for comment in content_comment:
+            match = pattern.search(comment) if isinstance(comment, str) else None
+            ri_values.append(float(match.group(1)) if match else None)
+        return ri_values
diff --git a/RIAssigner/data/MatchMSData.py b/RIAssigner/data/MatchMSData.py
index 24e02f4..89ed779 100644
--- a/RIAssigner/data/MatchMSData.py
+++ b/RIAssigner/data/MatchMSData.py
@@ -127,6 +127,16 @@ def comment(self) -> Iterable[Data.CommentFieldType]:
         self.comment_keys = "comment"
         content = [safe_read_comment_key(spectrum, self.comment_keys) for spectrum in self._spectra]
         return content
+
+    def ri_from_comment(self, specific_string: str = None):
+        """ Read RI values from the comment field into the RI field.
+
+        Parameters
+        ----------
+        specific_string:
+            Key that precedes '=RI_value' in the comment field. Must be given.
+        """
+        self.retention_indices = self.extract_ri_from_comment(self.comment, specific_string)
 
 def safe_read_comment_key(spectrum: Spectrum, key: str) -> Optional[str]:
     """ Read key from spectrum and convert to str or return 'None'.
diff --git a/RIAssigner/data/PandasData.py b/RIAssigner/data/PandasData.py
index d0a5461..318e903 100644
--- a/RIAssigner/data/PandasData.py
+++ b/RIAssigner/data/PandasData.py
@@ -140,8 +140,12 @@ def comment(self) -> Iterable[Data.CommentFieldType]:
         content = self._data[self._comment_keys].tolist()
         return content
 
-    def extract_ri_from_comment(self, specific_string: str): # incomplete
-        """ Extract RI from comment field.
-        """
-        extracted_strings = [s[s.find(specific_string):] for s in specific_string if specific_string in s]
-        self._data[self._comment_keys] = extracted_strings
+    def ri_from_comment(self, specific_string: str = None):
+        """ Read RI values from the comment field into the RI field.
+
+        Parameters
+        ----------
+        specific_string:
+            Key that precedes '=RI_value' in the comment field. Must be given.
+        """
+        self.retention_indices = self.extract_ri_from_comment(self.comment, specific_string)