From 94c79b7c8eaeaa13bd9a9b797307fb0362b3b7d2 Mon Sep 17 00:00:00 2001 From: Jalil Nourisa Date: Sat, 23 Nov 2024 21:43:55 +0100 Subject: [PATCH] bug in api fixed --- runs.ipynb | 1080 ++++++++++++++++++++++---- src/api/file_evaluation_h5ad.yaml | 4 +- src/process_data/test_data/script.py | 8 +- 3 files changed, 955 insertions(+), 137 deletions(-) diff --git a/runs.ipynb b/runs.ipynb index 1978ab80..75855173 100644 --- a/runs.ipynb +++ b/runs.ipynb @@ -43,7 +43,7 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": 1, "metadata": {}, "outputs": [], "source": [ @@ -74,7 +74,18 @@ " 'methods': [ 'collectri', 'negative_control', 'positive_control', 'pearson_corr', 'portia', 'ppcor', 'grnboost2', 'scenic', 'scglue', 'celloracle', 'scenicplus'],\n", " 'models_dir': 'resources/grn_models/',\n", " 'scores_dir': 'resources/scores'\n", - "}" + "}\n", + "\n", + "datasets = ['op', 'replogle2', 'nakatake', 'norman', 'adamson']" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "ad.read('resources/')" ] }, { @@ -1174,6 +1185,13 @@ "![image.png](attachment:image.png)" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### nakatake" + ] + }, { "cell_type": "code", "execution_count": 7, @@ -1405,6 +1423,13 @@ "![image-3.png](attachment:image-3.png)" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### norman " + ] + }, { "cell_type": "code", "execution_count": 8, @@ -1662,6 +1687,13 @@ "!ls output/temp/adamson/" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### adamson" + ] + }, { "cell_type": "code", "execution_count": 12, @@ -1872,6 +1904,623 @@ "![image.png](attachment:image.png)" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Global models" + ] + }, + { + "cell_type": "code", + "execution_count": 119, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "50000-skeleton_False-binarize_True-GB.csv\n", + "50000-skeleton_False-binarize_True-ridge.csv\n", + "lognorm-50000-skeleton_False-binarize_True-ridge-global-False.csv\n", + "nets\n", + "X_norm-50000-skeleton_False-binarize_True-ridge-global-False.csv\n", + "X_norm-50000-skeleton_False-binarize_True-ridge-global-True.csv\n" + ] + } + ], + "source": [ + "!ls output/temp/op/" + ] + }, + { + "cell_type": "code", + "execution_count": 121, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
 S1S2static-theta-0.0static-theta-0.5static-theta-1.0rank
collectri0.0583150.1268990.2297790.2724090.29076913
negative_control-0.000923-0.0009610.1936400.2606670.29087719
positive_control0.7217501.1888030.6542970.4116750.3101242
pearson_corr0.5703331.0483550.5805830.3798910.3035944
portia0.4655270.6740970.5138850.3172060.2980957
ppcor0.1966890.2384860.3621900.2897430.29119010
grnboost20.7355400.9230760.5812650.4714910.3299401
scenic0.3188730.4670080.5191030.4065250.3174896
granie0.1475220.1939770.1654000.2114970.28323921
scglue0.1556420.6789080.4992500.2938630.2939398
celloracle0.4661190.7624380.5778430.4124850.3094745
figr0.2250300.5344160.2886750.3012670.2956269
scenicplus0.5885780.7537050.6121030.4672260.3247253
ANANSE_tissue_networks_lung.parquet0.0114920.0427440.1903390.2644410.29062717
ANANSE_tissue_networks_stomach.parquet0.0006240.0031180.1775990.2552150.28880222
ANANSE_tissue_networks_heart.parquet0.0020720.0108720.1856170.2606970.28933420
ANANSE_tissue_networks_bone_marrow.parquet0.0080270.0348960.1637630.2668240.29147618
gtex_rna_networks_Whole_Blood.parquet0.0994960.3797540.2232880.2694200.29055811
gtex_rna_networks_Brain_Amygdala.parquet0.0298520.1438700.2170350.2542170.28761616
gtex_rna_networks_Breast_Mammary_Tissue.parquet0.0565480.2376850.2065020.2590420.28837514
gtex_rna_networks_Lung.parquet0.0654440.2860070.2169730.2652780.29007712
gtex_rna_networks_Stomach.parquet0.0571470.2369250.1849440.2600950.28761915
\n" + ], + "text/plain": [ + "" + ] + }, + "execution_count": 121, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df_scores_gb = pd.read_csv(f\"output/temp/op/X_norm-50000-skeleton_False-binarize_True-ridge-global-True.csv\", index_col=0)\n", + "df_scores = pd.read_csv(f\"output/temp/op/X_norm-50000-skeleton_False-binarize_True-ridge-global-False.csv\", index_col=0)\n", + "\n", + "df_scores = pd.concat([df_scores, df_scores_gb])\n", + "# df_scores[df_scores<0] = 0\n", + "df_all_n = (df_scores-df_scores.min(axis=0))/(df_scores.max(axis=0)-df_scores.min(axis=0))\n", + "df_scores['rank'] = df_all_n.mean(axis=1).rank(ascending=False).astype(int)\n", + "df_scores.style.background_gradient()" + ] + }, { "cell_type": "markdown", "metadata": {}, @@ -1925,41 +2574,75 @@ }, { "cell_type": "code", - "execution_count": 19, + "execution_count": 112, "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "50000-skeleton_False-binarize_True-GB.csv\n", - "50000-skeleton_False-binarize_True-ridge.csv\n", - "lognorm-50000-skeleton_False-binarize_True-ridge-global-False.csv\n", - "nets\n", - "X_norm-50000-skeleton_False-binarize_True-ridge-global-False.csv\n" - ] - } - ], + "outputs": [], "source": [ - "!ls output/temp/op/X_norm-50000-skeleton_False-binarize_True-ridge-global-False.csv" + "# - collect all the scores\n", + "for i, dataset in enumerate(datasets):\n", + " df_scores = pd.read_csv(f\"output/temp/{dataset}/X_norm-50000-skeleton_False-binarize_True-ridge-global-False.csv\", index_col=0)\n", + " # - normalize scores \n", + " df_scores = df_scores.fillna(0)\n", + " df_scores[df_scores < 0] = 0\n", + " df_scores = (df_scores-df_scores.min(axis=0))/(df_scores.max(axis=0)-df_scores.min(axis=0))\n", + " df_scores = df_scores.reset_index().melt(id_vars='index', var_name='metric', value_name='r2score').rename(columns={'index':'model'})\n", + " df_scores['dataset'] = dataset\n", + " if i == 0:\n", + " df_all = df_scores\n", + " else:\n", + " df_all = pd.concat([df_all, df_scores], axis=0)\n" ] }, { "cell_type": "code", - "execution_count": 20, + "execution_count": 115, "metadata": {}, "outputs": [], "source": [ - "for dataset in datasets:\n", - " df_scores = pd.read_csv(f\"output/temp/{dataset}/X_norm-50000-skeleton_False-binarize_True-ridge-global-False.csv\", index_col=0)\n", - " # - normalize per method\n", + "df_all = df_all[~(df_all['model'] == 'collectri')]" + ] + }, + { + "cell_type": "code", + "execution_count": 116, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/vol/tmp/users/jnourisa/ipykernel_1636782/208795827.py:9: DeprecationWarning: DataFrameGroupBy.apply operated on the grouping columns. This behavior is deprecated, and in a future version of pandas the grouping columns will be excluded from the operation. Either pass `include_groups=False` to exclude the groupings or explicitly select the grouping columns after groupby to silence this warning.\n", + " df_metrics = df_all.groupby(['model','metric']).apply(lambda df: mean_for_metrics(df)).reset_index().pivot(index='model', columns='metric', values='r2score')\n", + "/vol/tmp/users/jnourisa/ipykernel_1636782/208795827.py:19: DeprecationWarning: DataFrameGroupBy.apply operated on the grouping columns. This behavior is deprecated, and in a future version of pandas the grouping columns will be excluded from the operation. Either pass `include_groups=False` to exclude the groupings or explicitly select the grouping columns after groupby to silence this warning.\n", + " df_datasets = df_all.groupby(['model','dataset']).apply(lambda df: mean_for_datasets(df)).reset_index().pivot(index='model', columns='dataset', values='r2score')\n" + ] + } + ], + "source": [ + "# - mean scores for metrics\n", + "def mean_for_metrics(df):\n", + " metric = df['metric'].values.flatten()[0]\n", + " if metric in ['S1','S2']:\n", + " df = df[df['dataset']=='op']\n", + " else:\n", + " pass \n", + " return df[['r2score']].mean()\n", + "df_metrics = df_all.groupby(['model','metric']).apply(lambda df: mean_for_metrics(df)).reset_index().pivot(index='model', columns='metric', values='r2score')\n", "\n", - " # - normalize per dataset\n" + "# - mean scores for datasets\n", + "def mean_for_datasets(df):\n", + " dataset = df['dataset'].values.flatten()[0]\n", + " if dataset != 'op':\n", + " df = df[~df['metric'].isin(['S1','S2'])]\n", + " else:\n", + " pass \n", + " return df[['r2score']].mean()\n", + "df_datasets = df_all.groupby(['model','dataset']).apply(lambda df: mean_for_datasets(df)).reset_index().pivot(index='model', columns='dataset', values='r2score')" ] }, { "cell_type": "code", - "execution_count": 21, + "execution_count": 117, "metadata": {}, "outputs": [ { @@ -1989,167 +2672,246 @@ " static-theta-0.0\n", " static-theta-0.5\n", " static-theta-1.0\n", + " adamson\n", + " nakatake\n", + " norman\n", + " op\n", + " replogle2\n", " overall_score\n", " Duration (hour)\n", " Peak memory (GB)\n", + " User-friendly\n", + " Complexity\n", " \n", " \n", " \n", " \n", - " 6\n", + " 3\n", " GRNBoost2\n", " 1.000000\n", " 0.776475\n", - " 0.850620\n", + " 0.826432\n", + " 0.980936\n", " 1.000000\n", " 1.000000\n", + " 0.787585\n", + " 0.941155\n", " 0.925419\n", + " 1.000000\n", + " 0.923800\n", " 7.510556\n", " 7.378796\n", + " 8\n", + " 2\n", " \n", " \n", - " 2\n", + " 7\n", " Positive Control\n", " 0.981252\n", " 1.000000\n", - " 1.000000\n", - " 0.769936\n", - " 0.575695\n", + " 0.958346\n", + " 0.860857\n", + " 0.633513\n", + " 0.832683\n", + " 0.857764\n", + " 0.886458\n", " 0.865377\n", + " 0.729077\n", + " 0.860533\n", " 0.015000\n", " 11.601669\n", + " 10\n", + " 0\n", " \n", " \n", - " 12\n", + " 10\n", " Scenic+\n", " 0.800199\n", " 0.634004\n", " 0.913697\n", " 0.983595\n", " 0.888349\n", + " 0.000000\n", + " 0.000000\n", + " 0.000000\n", + " 0.843969\n", + " 0.000000\n", " 0.843969\n", " 11.740556\n", " 131.342854\n", + " 1\n", + " 9\n", " \n", " \n", - " 3\n", + " 5\n", " Pearson corr.\n", " 0.775394\n", " 0.881858\n", - " 0.849224\n", - " 0.647685\n", - " 0.435855\n", + " 0.880661\n", + " 0.792117\n", + " 0.556792\n", + " 0.814796\n", + " 0.729421\n", + " 0.834992\n", " 0.718003\n", + " 0.692485\n", + " 0.767652\n", " 0.041389\n", " 23.801899\n", + " 10\n", + " 0\n", " \n", " \n", - " 10\n", + " 0\n", " CellOracle\n", " 0.633710\n", " 0.641349\n", " 0.843619\n", " 0.773048\n", " 0.561766\n", + " 0.000000\n", + " 0.000000\n", + " 0.000000\n", + " 0.690699\n", + " 0.000000\n", " 0.690699\n", " 3.765000\n", " 41.601166\n", + " 6\n", + " 4\n", " \n", " \n", - " 7\n", - " Scenic\n", - " 0.433522\n", - " 0.392839\n", - " 0.723472\n", - " 0.750125\n", - " 0.733396\n", - " 0.606671\n", - " 24.008611\n", - " 35.954300\n", - " \n", - " \n", - " 4\n", + " 6\n", " Portia\n", " 0.632905\n", " 0.567038\n", - " 0.712799\n", - " 0.406583\n", - " 0.318112\n", + " 0.523610\n", + " 0.337081\n", + " 0.235700\n", + " 0.000000\n", + " 0.511919\n", + " 0.266277\n", " 0.527488\n", + " 0.569957\n", + " 0.417198\n", " 2.491111\n", " 55.685230\n", + " 9\n", + " 1\n", " \n", " \n", - " 9\n", + " 11\n", " scGLUE\n", " 0.211603\n", " 0.571086\n", " 0.682864\n", " 0.316799\n", " 0.229125\n", + " 0.000000\n", + " 0.000000\n", + " 0.000000\n", + " 0.402295\n", + " 0.000000\n", " 0.402295\n", " 11.097500\n", " 61.677879\n", + " 6\n", + " 4\n", " \n", " \n", - " 11\n", + " 9\n", + " Scenic\n", + " 0.433522\n", + " 0.392839\n", + " 0.437852\n", + " 0.350487\n", + " 0.345024\n", + " 0.000000\n", + " 0.289197\n", + " 0.240945\n", + " 0.606671\n", + " 0.245343\n", + " 0.371320\n", + " 24.008611\n", + " 35.954300\n", + " 7\n", + " 3\n", + " \n", + " \n", + " 1\n", " FigR\n", " 0.305938\n", " 0.449542\n", " 0.252149\n", " 0.345278\n", " 0.265239\n", + " 0.000000\n", + " 0.000000\n", + " 0.000000\n", + " 0.323629\n", + " 0.000000\n", " 0.323629\n", " 6.731667\n", " 225.208725\n", + " 6\n", + " 4\n", " \n", " \n", - " 5\n", + " 8\n", " PPCOR\n", " 0.267408\n", " 0.200610\n", - " 0.402517\n", - " 0.300952\n", - " 0.170267\n", + " 0.360093\n", + " 0.285175\n", + " 0.165767\n", + " 0.566048\n", + " 0.041881\n", + " 0.418685\n", " 0.268351\n", + " 0.033865\n", + " 0.260788\n", " 13.425833\n", " 64.136433\n", + " 7\n", + " 3\n", " \n", " \n", - " 0\n", - " CollectRI\n", - " 0.079282\n", - " 0.106745\n", - " 0.131682\n", - " 0.234284\n", - " 0.161236\n", - " 0.142646\n", + " 4\n", + " Negative Control\n", " 0.000000\n", " 0.000000\n", - " \n", - " \n", - " 1\n", - " Negative Control\n", + " 0.073181\n", + " 0.139694\n", + " 0.080763\n", + " 0.352581\n", " 0.000000\n", " 0.000000\n", - " 0.057763\n", - " 0.189121\n", - " 0.163558\n", " 0.082089\n", + " 0.000000\n", + " 0.072831\n", " 0.003889\n", " 2.216045\n", + " 10\n", + " 0\n", " \n", " \n", - " 8\n", + " 2\n", " GRaNIE\n", " 0.200563\n", " 0.163170\n", " 0.000000\n", " 0.000000\n", " 0.000000\n", + " 0.000000\n", + " 0.000000\n", + " 0.000000\n", + " 0.072747\n", + " 0.000000\n", " 0.072747\n", " 1.012038\n", " 41.000000\n", + " 6\n", + " 4\n", " \n", " \n", "\n", @@ -2157,59 +2919,99 @@ ], "text/plain": [ " method_name S1 S2 static-theta-0.0 static-theta-0.5 \\\n", - "6 GRNBoost2 1.000000 0.776475 0.850620 1.000000 \n", - "2 Positive Control 0.981252 1.000000 1.000000 0.769936 \n", - "12 Scenic+ 0.800199 0.634004 0.913697 0.983595 \n", - "3 Pearson corr. 0.775394 0.881858 0.849224 0.647685 \n", - "10 CellOracle 0.633710 0.641349 0.843619 0.773048 \n", - "7 Scenic 0.433522 0.392839 0.723472 0.750125 \n", - "4 Portia 0.632905 0.567038 0.712799 0.406583 \n", - "9 scGLUE 0.211603 0.571086 0.682864 0.316799 \n", - "11 FigR 0.305938 0.449542 0.252149 0.345278 \n", - "5 PPCOR 0.267408 0.200610 0.402517 0.300952 \n", - "0 CollectRI 0.079282 0.106745 0.131682 0.234284 \n", - "1 Negative Control 0.000000 0.000000 0.057763 0.189121 \n", - "8 GRaNIE 0.200563 0.163170 0.000000 0.000000 \n", + "3 GRNBoost2 1.000000 0.776475 0.826432 0.980936 \n", + "7 Positive Control 0.981252 1.000000 0.958346 0.860857 \n", + "10 Scenic+ 0.800199 0.634004 0.913697 0.983595 \n", + "5 Pearson corr. 0.775394 0.881858 0.880661 0.792117 \n", + "0 CellOracle 0.633710 0.641349 0.843619 0.773048 \n", + "6 Portia 0.632905 0.567038 0.523610 0.337081 \n", + "11 scGLUE 0.211603 0.571086 0.682864 0.316799 \n", + "9 Scenic 0.433522 0.392839 0.437852 0.350487 \n", + "1 FigR 0.305938 0.449542 0.252149 0.345278 \n", + "8 PPCOR 0.267408 0.200610 0.360093 0.285175 \n", + "4 Negative Control 0.000000 0.000000 0.073181 0.139694 \n", + "2 GRaNIE 0.200563 0.163170 0.000000 0.000000 \n", + "\n", + " static-theta-1.0 adamson nakatake norman op replogle2 \\\n", + "3 1.000000 1.000000 0.787585 0.941155 0.925419 1.000000 \n", + "7 0.633513 0.832683 0.857764 0.886458 0.865377 0.729077 \n", + "10 0.888349 0.000000 0.000000 0.000000 0.843969 0.000000 \n", + "5 0.556792 0.814796 0.729421 0.834992 0.718003 0.692485 \n", + "0 0.561766 0.000000 0.000000 0.000000 0.690699 0.000000 \n", + "6 0.235700 0.000000 0.511919 0.266277 0.527488 0.569957 \n", + "11 0.229125 0.000000 0.000000 0.000000 0.402295 0.000000 \n", + "9 0.345024 0.000000 0.289197 0.240945 0.606671 0.245343 \n", + "1 0.265239 0.000000 0.000000 0.000000 0.323629 0.000000 \n", + "8 0.165767 0.566048 0.041881 0.418685 0.268351 0.033865 \n", + "4 0.080763 0.352581 0.000000 0.000000 0.082089 0.000000 \n", + "2 0.000000 0.000000 0.000000 0.000000 0.072747 0.000000 \n", + "\n", + " overall_score Duration (hour) Peak memory (GB) User-friendly \\\n", + "3 0.923800 7.510556 7.378796 8 \n", + "7 0.860533 0.015000 11.601669 10 \n", + "10 0.843969 11.740556 131.342854 1 \n", + "5 0.767652 0.041389 23.801899 10 \n", + "0 0.690699 3.765000 41.601166 6 \n", + "6 0.417198 2.491111 55.685230 9 \n", + "11 0.402295 11.097500 61.677879 6 \n", + "9 0.371320 24.008611 35.954300 7 \n", + "1 0.323629 6.731667 225.208725 6 \n", + "8 0.260788 13.425833 64.136433 7 \n", + "4 0.072831 0.003889 2.216045 10 \n", + "2 0.072747 1.012038 41.000000 6 \n", "\n", - " static-theta-1.0 overall_score Duration (hour) Peak memory (GB) \n", - "6 1.000000 0.925419 7.510556 7.378796 \n", - "2 0.575695 0.865377 0.015000 11.601669 \n", - "12 0.888349 0.843969 11.740556 131.342854 \n", - "3 0.435855 0.718003 0.041389 23.801899 \n", - "10 0.561766 0.690699 3.765000 41.601166 \n", - "7 0.733396 0.606671 24.008611 35.954300 \n", - "4 0.318112 0.527488 2.491111 55.685230 \n", - "9 0.229125 0.402295 11.097500 61.677879 \n", - "11 0.265239 0.323629 6.731667 225.208725 \n", - "5 0.170267 0.268351 13.425833 64.136433 \n", - "0 0.161236 0.142646 0.000000 0.000000 \n", - "1 0.163558 0.082089 0.003889 2.216045 \n", - "8 0.000000 0.072747 1.012038 41.000000 " + " Complexity \n", + "3 2 \n", + "7 0 \n", + "10 9 \n", + "5 0 \n", + "0 4 \n", + "6 1 \n", + "11 4 \n", + "9 3 \n", + "1 4 \n", + "8 3 \n", + "4 0 \n", + "2 4 " ] }, - "execution_count": 21, + "execution_count": 117, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "# create ranking \n", - "df_scores = df_scores.fillna(0)\n", - "df_scores[df_scores < 0] = 0\n", - "df_scores = (df_scores-df_scores.min(axis=0))/(df_scores.max(axis=0)-df_scores.min(axis=0))\n", + "# - calculate over scores\n", + "df_scores = pd.concat([df_metrics, df_datasets], axis=1)\n", + "# df_scores = df_metrics\n", "df_scores['overall_score'] = df_scores.mean(axis=1)\n", - "df_scores\n", - "# df_scores['rank'] = df_scores.mean(axis=1).rank(ascending=False).astype(int)\n", "\n", - "df_all = pd.concat([df_scores, df_res], axis=1)\n", - "df_all = df_all.fillna(0)\n", - "df_all.index.name = 'method_name' \n", - "df_all = df_all.reset_index()\n", + "# - merge scores with resources \n", + "df_summary = pd.concat([df_scores, df_res], axis=1)\n", + "df_summary = df_summary.fillna(0)\n", + "df_summary.index.name = 'method_name' \n", + "df_summary = df_summary.reset_index()\n", "\n", - "df_all = df_all.sort_values(by='overall_score', ascending=False) \n", + "df_summary = df_summary.sort_values(by='overall_score', ascending=False) \n", "\n", - "df_all.method_name = df_all.method_name.map(surragate_names)\n", - "df_all" + "df_summary.method_name = df_summary.method_name.map(surragate_names)\n", + "# - add user complexity \n", + "df_summary['User-friendly'] = df_summary['method_name'].map({\n", + " 'Scenic+': 1, \n", + " 'GRNBoost2': 8, \n", + " 'Positive Control': 10, \n", + " 'Pearson corr.': 10,\n", + " 'CellOracle': 6,\n", + " 'Portia': 9,\n", + " 'scGLUE': 6,\n", + " 'Scenic': 7,\n", + " 'FigR': 6,\n", + " 'PPCOR': 7,\n", + " 'Negative Control': 10,\n", + " 'GRaNIE': 6,\n", + " })\n", + "df_summary['Complexity'] = df_summary['User-friendly'].max() - df_summary['User-friendly']\n", + "df_summary" ] }, { @@ -2221,7 +3023,7 @@ }, { "cell_type": "code", - "execution_count": 24, + "execution_count": 118, "metadata": {}, "outputs": [ { @@ -2229,9 +3031,9 @@ "output_type": "stream", "text": [ "── \u001b[1mAttaching packages\u001b[22m ─────────────────────────────────────── tidyverse 1.3.1 ──\n", - "\u001b[32m✔\u001b[39m \u001b[34mggplot2\u001b[39m 3.5.1 \u001b[32m✔\u001b[39m \u001b[34mpurrr \u001b[39m 0.3.4\n", + "\u001b[32m✔\u001b[39m \u001b[34mggplot2\u001b[39m 3.5.1 \u001b[32m✔\u001b[39m \u001b[34mpurrr \u001b[39m 1.0.2\n", "\u001b[32m✔\u001b[39m \u001b[34mtibble \u001b[39m 3.2.1 \u001b[32m✔\u001b[39m \u001b[34mdplyr \u001b[39m 1.1.4\n", - "\u001b[32m✔\u001b[39m \u001b[34mtidyr \u001b[39m 1.2.0 \u001b[32m✔\u001b[39m \u001b[34mstringr\u001b[39m 1.4.0\n", + "\u001b[32m✔\u001b[39m \u001b[34mtidyr \u001b[39m 1.3.1 \u001b[32m✔\u001b[39m \u001b[34mstringr\u001b[39m 1.5.1\n", "\u001b[32m✔\u001b[39m \u001b[34mreadr \u001b[39m 2.1.2 \u001b[32m✔\u001b[39m \u001b[34mforcats\u001b[39m 0.5.1\n", "── \u001b[1mConflicts\u001b[22m ────────────────────────────────────────── tidyverse_conflicts() ──\n", "\u001b[31m✖\u001b[39m \u001b[34mdplyr\u001b[39m::\u001b[32mfilter()\u001b[39m masks \u001b[34mstats\u001b[39m::filter()\n", @@ -2241,15 +3043,15 @@ "\u001b[36mℹ\u001b[39m Please use `whereami::thisfile()` instead. \n", "\u001b[?25h\u001b[?25h\u001b[?25h\u001b[?25h\u001b[?25h\u001b[?25h\u001b[1m\u001b[22mNew names:\n", "\u001b[36m•\u001b[39m `` -> `...1`\n", - "\u001b[1mRows: \u001b[22m\u001b[34m13\u001b[39m \u001b[1mColumns: \u001b[22m\u001b[34m13\u001b[39m\n", + "\u001b[1mRows: \u001b[22m\u001b[34m12\u001b[39m \u001b[1mColumns: \u001b[22m\u001b[34m22\u001b[39m\n", "\u001b[36m──\u001b[39m \u001b[1mColumn specification\u001b[22m \u001b[36m────────────────────────────────────────────────────────\u001b[39m\n", "\u001b[1mDelimiter:\u001b[22m \"\\t\"\n", "\u001b[31mchr\u001b[39m (1): method_name\n", - "\u001b[32mdbl\u001b[39m (12): ...1, S1, S2, static-theta-0.0, static-theta-0.5, overall_score, D...\n", + "\u001b[32mdbl\u001b[39m (21): ...1, S1, S2, static-theta-0.0, static-theta-0.5, static-theta-1.0...\n", "\n", "\u001b[36mℹ\u001b[39m Use `spec()` to retrieve the full column specification for this data.\n", "\u001b[36mℹ\u001b[39m Specify the column types or set `show_col_types = FALSE` to quiet this message.\n", - "\u001b[?25h\u001b[?25h\u001b[90m# A tibble: 10 × 7\u001b[39m\n", + "\u001b[?25h\u001b[?25h\u001b[90m# A tibble: 18 × 7\u001b[39m\n", " id id_color name group geom palette options \n", " \u001b[3m\u001b[90m\u001b[39m\u001b[23m \u001b[3m\u001b[90m\u001b[39m\u001b[23m \u001b[3m\u001b[90m\u001b[39m\u001b[23m \u001b[3m\u001b[90m\u001b[39m\u001b[23m \u001b[3m\u001b[90m\u001b[39m\u001b[23m \u001b[3m\u001b[90m\u001b[39m\u001b[23m \u001b[3m\u001b[90m\u001b[39m\u001b[23m \n", "\u001b[90m 1\u001b[39m method_name \u001b[31mNA\u001b[39m \u001b[90m\"\u001b[39mName\u001b[90m\"\u001b[39m meth… text \u001b[31mNA\u001b[39m \u001b[90m\u001b[39m\n", @@ -2258,16 +3060,30 @@ "\u001b[90m 4\u001b[39m S2 S2 \u001b[90m\"\u001b[39mS2\u001b[90m\"\u001b[39m metr… funk… metric… \u001b[90m\u001b[39m\n", "\u001b[90m 5\u001b[39m static-theta-0.0 static-theta-0.0 \u001b[90m\"\u001b[39mTheta (m… metr… funk… metric… \u001b[90m\u001b[39m\n", "\u001b[90m 6\u001b[39m static-theta-0.5 static-theta-0.5 \u001b[90m\"\u001b[39mTheta (m… metr… funk… metric… \u001b[90m\u001b[39m\n", - "\u001b[90m 7\u001b[39m memory_log \u001b[31mNA\u001b[39m \u001b[90m\"\u001b[39mPeak mem… reso… rect resour… \u001b[90m\u001b[39m\n", - "\u001b[90m 8\u001b[39m memory_str \u001b[31mNA\u001b[39m \u001b[90m\"\u001b[39m\u001b[90m\"\u001b[39m reso… text \u001b[31mNA\u001b[39m \u001b[90m\u001b[39m\n", - "\u001b[90m 9\u001b[39m duration_log \u001b[31mNA\u001b[39m \u001b[90m\"\u001b[39mDuration… reso… rect resour… \u001b[90m\u001b[39m\n", - "\u001b[90m10\u001b[39m duration_str \u001b[31mNA\u001b[39m \u001b[90m\"\u001b[39m\u001b[90m\"\u001b[39m reso… text \u001b[31mNA\u001b[39m \u001b[90m\u001b[39m\n", + "\u001b[90m 7\u001b[39m static-theta-1.0 static-theta-1.0 \u001b[90m\"\u001b[39mTheta (m… metr… funk… metric… \u001b[90m\u001b[39m\n", + "\u001b[90m 8\u001b[39m op op \u001b[90m\"\u001b[39mOPSCA\u001b[90m\"\u001b[39m data… funk… dataset \u001b[90m\u001b[39m\n", + "\u001b[90m 9\u001b[39m adamson adamson \u001b[90m\"\u001b[39mAdamson\u001b[90m\"\u001b[39m data… funk… dataset \u001b[90m\u001b[39m\n", + "\u001b[90m10\u001b[39m nakatake nakatake \u001b[90m\"\u001b[39mNakatake\u001b[90m\"\u001b[39m data… funk… dataset \u001b[90m\u001b[39m\n", + "\u001b[90m11\u001b[39m norman norman \u001b[90m\"\u001b[39mNorman\u001b[90m\"\u001b[39m data… funk… dataset \u001b[90m\u001b[39m\n", + "\u001b[90m12\u001b[39m replogle2 replogle2 \u001b[90m\"\u001b[39mReplogle\u001b[90m\"\u001b[39m data… funk… dataset \u001b[90m\u001b[39m\n", + "\u001b[90m13\u001b[39m memory_log \u001b[31mNA\u001b[39m \u001b[90m\"\u001b[39mPeak mem… reso… rect resour… \u001b[90m\u001b[39m\n", + "\u001b[90m14\u001b[39m memory_str \u001b[31mNA\u001b[39m \u001b[90m\"\u001b[39m\u001b[90m\"\u001b[39m reso… text \u001b[31mNA\u001b[39m \u001b[90m\u001b[39m\n", + "\u001b[90m15\u001b[39m duration_log \u001b[31mNA\u001b[39m \u001b[90m\"\u001b[39mDuration… reso… rect resour… \u001b[90m\u001b[39m\n", + "\u001b[90m16\u001b[39m duration_str \u001b[31mNA\u001b[39m \u001b[90m\"\u001b[39m\u001b[90m\"\u001b[39m reso… text \u001b[31mNA\u001b[39m \u001b[90m\u001b[39m\n", + "\u001b[90m17\u001b[39m complexity_log \u001b[31mNA\u001b[39m \u001b[90m\"\u001b[39mComplexi… reso… rect resour… \u001b[90m\u001b[39m\n", + "\u001b[90m18\u001b[39m Complexity \u001b[31mNA\u001b[39m \u001b[90m\"\u001b[39m\u001b[90m\"\u001b[39m reso… text \u001b[31mNA\u001b[39m \u001b[90m\u001b[39m\n", "\u001b[?25h\u001b[?25h\u001b[?25h\u001b[?25h\u001b[36mℹ\u001b[39m Could not find column 'id' in data. Using rownames as 'id'.\n", "\u001b[36mℹ\u001b[39m Column info did not contain a column called 'legend', generating options based on the 'geom' column.\n", "\u001b[36mℹ\u001b[39m No row info was provided, assuming all rows in `data` are to be plotted.\n", "\u001b[36mℹ\u001b[39m Row info did not contain group information, assuming rows are ungrouped.\n", + "\u001b[36mℹ\u001b[39m Palette named 'dataset' was not defined. Assuming palette is numerical. Automatically selected palette 'Blues'.\n", + "\u001b[36mℹ\u001b[39m Some palettes were not used in the column info, adding legends for them.\n", "\u001b[36mℹ\u001b[39m Legend 1 did not contain color, inferring from the palette.\n", "\u001b[36mℹ\u001b[39m Legend 2 did not contain color, inferring from the palette.\n", + "\u001b[36mℹ\u001b[39m Legend 6 did not contain a geom, inferring from the column info.\n", + "\u001b[36mℹ\u001b[39m Legend 6 did not contain labels, inferring from the geom.\n", + "\u001b[36mℹ\u001b[39m Legend 6 did not contain size, inferring from the labels.\n", + "\u001b[36mℹ\u001b[39m Legend 6 did not contain color, inferring from the palette.\n", "\u001b[?25h\u001b[?25h\u001b[?25h" ] } @@ -2277,17 +3093,19 @@ "summary_file = \"output/summary.tsv\"\n", "summary_figure = \"output/summary_figure.pdf\"\n", "\n", - "df_all['memory_log'] = np.log(df_all['Peak memory (GB)']+1)\n", - "df_all['memory_log'] = np.max(df_all['memory_log'])-df_all['memory_log']\n", + "df_summary['memory_log'] = np.log(df_summary['Peak memory (GB)']+1)\n", + "df_summary['memory_log'] = np.max(df_summary['memory_log'])-df_summary['memory_log']\n", "\n", + "df_summary['complexity_log'] = np.log(df_summary['Complexity']+1)\n", + "df_summary['complexity_log'] = np.max(df_summary['complexity_log'])-df_summary['complexity_log']\n", "\n", - "df_all[\"duration_log\"] = np.log(df_all['Duration (hour)']+1)\n", - "df_all['duration_log'] = np.max(df_all['duration_log'])-df_all['duration_log']\n", + "df_summary[\"duration_log\"] = np.log(df_summary['Duration (hour)']+1)\n", + "df_summary['duration_log'] = np.max(df_summary['duration_log'])-df_summary['duration_log']\n", "\n", - "df_all[\"duration_str\"] = df_all['Duration (hour)'].round(1).astype(str)\n", - "df_all['memory_str'] = df_all['Peak memory (GB)'].round(1).astype(str)\n", + "df_summary[\"duration_str\"] = df_summary['Duration (hour)'].round(1).astype(str)\n", + "df_summary['memory_str'] = df_summary['Peak memory (GB)'].round(1).astype(str)\n", "\n", - "df_all.to_csv(summary_file, sep='\\t')\n", + "df_summary.to_csv(summary_file, sep='\\t')\n", "\n", "!Rscript ../grn_benchmark/src/summary_figure.R {summary_file} {summary_figure}" ] diff --git a/src/api/file_evaluation_h5ad.yaml b/src/api/file_evaluation_h5ad.yaml index 6376cff6..8ccc8d82 100644 --- a/src/api/file_evaluation_h5ad.yaml +++ b/src/api/file_evaluation_h5ad.yaml @@ -17,11 +17,11 @@ info: - name: donor_id type: string description: "Donor id" - required: true + required: false - name: perturbation_type type: string description: "Name of the column indicating perturbation type" - required: true + required: false layers: - name: X_norm diff --git a/src/process_data/test_data/script.py b/src/process_data/test_data/script.py index 5ad5e8b1..4665fa93 100644 --- a/src/process_data/test_data/script.py +++ b/src/process_data/test_data/script.py @@ -21,11 +21,11 @@ 'perturbation_data': 'resources/evaluation_datasets/op_perturbation.h5ad', 'perturbation_data_test': 'resources_test/evaluation_datasets/op_perturbation.h5ad', - 'multiomics_counts': 'resources/datasets_raw/op_multiome_counts.h5ad', - 'multiomics_counts_test': 'resources_test/datasets_raw/op_multiome_counts.h5ad', + 'multiomics_counts': 'resources/datasets_raw/op_multiome_sc_counts.h5ad', + 'multiomics_counts_test': 'resources_test/datasets_raw/op_multiome_sc_counts.h5ad', - 'perturbation_counts': 'resources/datasets_raw/op_perturbation_counts.h5ad', - 'perturbation_counts_test': 'resources_test/datasets_raw/op_perturbation_counts.h5ad', + 'perturbation_counts': 'resources/datasets_raw/op_perturbation_sc_counts.h5ad', + 'perturbation_counts_test': 'resources_test/datasets_raw/op_perturbation_sc_counts.h5ad', } ## VIASH END