Merge pull request #1533 from anuprulez/tabpfn

Add a Galaxy tool for TabPFN package (by Prof. Hutter's group)
bgruening · Jan 15, 2025 · bce8b02 · bce8b02
2 parents 49709e6 + 7fe2287
commit bce8b02
Show file tree

Hide file tree

Showing 5 changed files with 970 additions and 0 deletions.
diff --git a/tools/tabpfn/.shed.yml b/tools/tabpfn/.shed.yml
@@ -0,0 +1,13 @@
+name: tabpfn
+owner: bgruening
+description: Tabular data prediction using TabPFN using Pytorch.
+long_description: |
+  The TabPFN is a neural network that learned to do tabular data prediction.
+  This is the original CUDA-supporting pytorch impelementation.
+remote_repository_url: https://github.com/bgruening/galaxytools/tree/master/tools/tabpfn
+homepage_url: https://github.com/bgruening/galaxytools/tree/master/tools/tabpfn
+type:
+categories:
+  - Machine Learning
+maintainers:
+  anuprulez
diff --git a/tools/tabpfn/main.py b/tools/tabpfn/main.py
@@ -0,0 +1,54 @@
+"""
+Tabular data prediction using TabPFN
+"""
+import argparse
+import time
+
+import matplotlib.pyplot as plt
+import pandas as pd
+from sklearn.metrics import accuracy_score, average_precision_score, precision_recall_curve
+from tabpfn import TabPFNClassifier
+
+
+def separate_features_labels(data):
+    df = pd.read_csv(data, sep="\t")
+    labels = df.iloc[:, -1]
+    features = df.iloc[:, :-1]
+    return features, labels
+
+
+def train_evaluate(args):
+    """
+    Train TabPFN
+    """
+    tr_features, tr_labels = separate_features_labels(args["train_data"])
+    te_features, te_labels = separate_features_labels(args["test_data"])
+    classifier = TabPFNClassifier(device='cpu')
+    s_time = time.time()
+    classifier.fit(tr_features, tr_labels)
+    e_time = time.time()
+    print("Time taken by TabPFN for training: {} seconds".format(e_time - s_time))
+    y_eval = classifier.predict(te_features)
+    print('Accuracy', accuracy_score(te_labels, y_eval))
+    pred_probas_test = classifier.predict_proba(te_features)
+    te_features["predicted_labels"] = y_eval
+    te_features.to_csv("output_predicted_data", sep="\t", index=None)
+    precision, recall, thresholds = precision_recall_curve(te_labels, pred_probas_test[:, 1])
+    average_precision = average_precision_score(te_labels, pred_probas_test[:, 1])
+    plt.figure(figsize=(8, 6))
+    plt.plot(recall, precision, label=f'Precision-Recall Curve (AP={average_precision:.2f})')
+    plt.xlabel('Recall')
+    plt.ylabel('Precision')
+    plt.title('Precision-Recall Curve')
+    plt.legend(loc='lower left')
+    plt.grid(True)
+    plt.savefig("output_prec_recall_curve.png")
+
+
+if __name__ == "__main__":
+    arg_parser = argparse.ArgumentParser()
+    arg_parser.add_argument("-trdata", "--train_data", required=True, help="Train data")
+    arg_parser.add_argument("-tedata", "--test_data", required=True, help="Test data")
+    # get argument values
+    args = vars(arg_parser.parse_args())
+    train_evaluate(args)
diff --git a/tools/tabpfn/tabpfn.xml b/tools/tabpfn/tabpfn.xml
@@ -0,0 +1,62 @@
+<tool id="tabpfn" name="Tabular data prediction using TabPFN" version="@TOOL_VERSION@+galaxy@VERSION_SUFFIX@" profile="23.0">
+    <description>with PyTorch</description>
+    <macros>
+        <token name="@TOOL_VERSION@">2.0.3</token>
+        <token name="@VERSION_SUFFIX@">0</token>
+    </macros>
+    <creator>
+        <organization name="European Galaxy Team" url="https://galaxyproject.org/eu/" />
+        <person givenName="Anup" familyName="Kumar" email="[email protected]" />
+        <person givenName="Frank" familyName="Hutter" email="[email protected]" />
+    </creator>
+    <requirements>
+	<requirement type="package" version="@TOOL_VERSION@">tabpfn</requirement>
+	<requirement type="package" version="2.2.2">pandas</requirement>
+	<requirement type="package" version="3.9.2">matplotlib</requirement>
+    </requirements>
+    <version_command>echo "@VERSION@"</version_command>
+    <command detect_errors="aggressive">
+    <![CDATA[
+        python '$__tool_directory__/main.py'
+            --train_data '$train_data'
+            --test_data '$test_data'
+    ]]>
+    </command>
+    <inputs>
+	<param name="train_data" type="data" format="tabular" label="Train data" help="Please provide training data for training model."/>
+        <param name="test_data" type="data" format="tabular" label="Test data" help="Please provide test data for evaluating model."/>
+    </inputs>
+    <outputs>
+        <data format="tabular" name="output_predicted_data" from_work_dir="output_predicted_data" label="Predicted data"></data>
+        <data format="png" name="output_prec_recall_curve" from_work_dir="output_prec_recall_curve" label="Precision-recall curve"></data>
+    </outputs>
+    <tests>
+        <test>
+	    <param name="train_data" value="local_train_rows.tabular" ftype="tabular" />
+	    <param name="test_data" value="local_test_rows.tabular" ftype="tabular" />
+            <output name="output_predicted_data">
+	        <assert_contents>
+		    <has_n_columns n="42" />
+                    <has_n_lines n="3" />
+	        </assert_contents>
+	    </output>
+        </test>
+    </tests>
+    <help>
+        <![CDATA[
+            **What it does**
+
+            Classification on tabular data by TabPFN
+
+            **Input files**
+            - Training data: the training data should contain features and the last column should be the class labels. It could either be tabular or in CSV format.
+            - Test data: the test data should also contain the same features as the training data and the last column should be the class labels. It could either be tabular or in CSV format.
+
+            **Output files**
+            - Predicted data along with predicted labels
+        ]]>
+    </help>
+    <citations>
+        <citation type="doi">10.1038/s41586-024-08328-6</citation>
+    </citations>
+</tool>
diff --git a/tools/tabpfn/test-data/local_test_rows.tabular b/tools/tabpfn/test-data/local_test_rows.tabular
@@ -0,0 +1,3 @@
+SpMax_L	J_Dz(e)	nHM	F01[N-N]	F04[C-N]	NssssC	nCb-	C%	nCp	nO	F03[C-N]	SdssC	HyWi_B(m)	LOC	SM6_L	F03[C-O]	Me	Mi	nN-N	nArNO2	nCRX3	SpPosA_B(p)	nCIR	B01[C-Br]	B03[C-Cl]	N-073	SpMax_A	Psi_i_1d	B04[C-Br]	SdO	TI2_L	nCrt	C-026	F02[C-N]	nHDon	SpMax_B(m)	Psi_i_A	nN	SM6_B(m)	nArCOOR	nX	predicted_labels
+3.919	2.6909	0	0	0	0	0	31.4	2	0	0	0	3.106	2.55	9.002	0	0.96	1.142	0	0	0	1.201	0	0	0	0	1.932	0.011	0	0	4.489	0	0	0	0	2.949	1.591	0	7.253	0	0	1
+4.17	2.1144	0	0	0	0	0	30.8	1	1	0	0	2.461	1.393	8.723	1	0.989	1.144	0	0	0	1.104	1	0	0	0	2.214	-0.204	0	0	1.542	0	0	0	0	3.315	1.967	0	7.257	0	0	1