From 8cb23128693973097048a72a78088c69b7d6d0d7 Mon Sep 17 00:00:00 2001
From: wverastegui <wudmir@gmail.com>
Date: Mon, 4 Dec 2023 15:30:11 +0100
Subject: [PATCH] Read RI values from comments field into RI field

---
 RIAssigner/data/Data.py        | 31 ++++++++++++++++++++++++++++++-
 RIAssigner/data/MatchMSData.py |  5 +++++
 RIAssigner/data/PandasData.py  |  9 ++++-----
 3 files changed, 39 insertions(+), 6 deletions(-)

diff --git a/RIAssigner/data/Data.py b/RIAssigner/data/Data.py
index ba7fe54..2ae6efd 100644
--- a/RIAssigner/data/Data.py
+++ b/RIAssigner/data/Data.py
@@ -1,5 +1,6 @@
 from abc import ABC, abstractmethod
 from typing import Iterable, List, Optional
+import pandas as pd
 
 from pint import Quantity, UnitRegistry
 from pint.unit import build_unit_class
@@ -125,4 +126,32 @@ def comment(self) -> Iterable[CommentFieldType]:
         Returns:
             Iterable[CommentFieldType]: Comment field values stored in data.
         """
-        ...
\ No newline at end of file
+        ...
+
+    def extract_ri_from_comment(self, content_comment, specific_string):
+        """ Extract RI from comment field.
+        Extracts the RI from the comment field of the data file. The RI is expected to be
+        in the format 'specific_string=RI_value', where RI_value is an integer or decimal
+        number. Entries that do not contain such a token yield None.
+
+        Parameters
+        ----------
+        content_comment:
+            Comment field values of the data file, one entry per record.
+        specific_string:
+            Literal key expected in the comment field directly before '=RI_value'.
+
+        Returns
+        -------
+            RI values as a list of floats, with None where no RI was found.
+        """
+        import re  # local import: the key is escaped so it is matched literally
+        # dtype=object keeps the .str accessor usable for empty or all-None input
+        comments_series = pd.Series(content_comment, dtype=object)
+        if specific_string is None:  # no key given, so no RI can be located
+            return [None] * len(comments_series)
+        pattern = rf'\b{re.escape(specific_string)}=(\d+(?:\.\d+)?)\b'
+        extracted_values = comments_series.str.extract(pattern)[0].astype(float)
+        # Non-matching entries come back as NaN; report them as None instead
+        return [None if pd.isna(value) else value for value in extracted_values.tolist()]
+        
diff --git a/RIAssigner/data/MatchMSData.py b/RIAssigner/data/MatchMSData.py
index 24e02f4..89ed779 100644
--- a/RIAssigner/data/MatchMSData.py
+++ b/RIAssigner/data/MatchMSData.py
@@ -127,6 +127,11 @@ def comment(self) -> Iterable[Data.CommentFieldType]:
         self.comment_keys = "comment"
         content = [safe_read_comment_key(spectrum, self.comment_keys) for spectrum in self._spectra]
         return content
+    
+    def ri_from_comment(self, specific_string: str = None):
+        """ Parse 'specific_string=RI_value' tokens from the comments and store them as retention indices. """
+        extracted = self.extract_ri_from_comment(self.comment, specific_string)
+        self.retention_indices = extracted
 
 def safe_read_comment_key(spectrum: Spectrum, key: str) -> Optional[str]:
     """ Read key from spectrum and convert to str or return 'None'.
diff --git a/RIAssigner/data/PandasData.py b/RIAssigner/data/PandasData.py
index d0a5461..318e903 100644
--- a/RIAssigner/data/PandasData.py
+++ b/RIAssigner/data/PandasData.py
@@ -140,8 +140,7 @@ def comment(self) -> Iterable[Data.CommentFieldType]:
         content = self._data[self._comment_keys].tolist()
         return content
     
-    def extract_ri_from_comment(self, specific_string: str): #  incomplete
-        """ Extract RI from comment field.
-        """
-        extracted_strings = [s[s.find(specific_string):] for s in specific_string if specific_string in s]
-        self._data[self._comment_keys] = extracted_strings
+    def ri_from_comment(self, specific_string: str = None):
+        """ Parse 'specific_string=RI_value' tokens from the comments and store them as retention indices. """
+        extracted = super().extract_ri_from_comment(self.comment, specific_string)
+        self.retention_indices = extracted
\ No newline at end of file