From f3881ceb0ed8cfd5c964d932fdbb07d0967fe577 Mon Sep 17 00:00:00 2001 From: Jess White <50890758+jessicaw9910@users.noreply.github.com> Date: Tue, 3 Dec 2024 22:51:31 +0100 Subject: [PATCH] Databases (#64) * removed comment * removed kinase_schema.CollectionKinaseInfo * comment on PRKD2 and AlphaMissense * temporary scratch for aligning sequences to DiscoverX --- .../databases/kinase_schema.py | 1 + .../missense_kinase_toolkit/databases/plot.py | 6 +- notebooks/databases.ipynb | 4 +- notebooks/klifs_pocket.ipynb | 157 +++++++++++++++++- notebooks/pkis2_km_atp.ipynb | 30 +++- 5 files changed, 183 insertions(+), 15 deletions(-) diff --git a/missense_kinase_toolkit/databases/missense_kinase_toolkit/databases/kinase_schema.py b/missense_kinase_toolkit/databases/missense_kinase_toolkit/databases/kinase_schema.py index 910469a..3fb12c9 100644 --- a/missense_kinase_toolkit/databases/missense_kinase_toolkit/databases/kinase_schema.py +++ b/missense_kinase_toolkit/databases/missense_kinase_toolkit/databases/kinase_schema.py @@ -439,6 +439,7 @@ def create_kinase_models_from_df( if df is None: df = concatenate_source_dataframe() + # concatenate_source_dataframe could return None if df is None: logger.error("Dataframe is None. Cannot create kinase models.") return None diff --git a/missense_kinase_toolkit/databases/missense_kinase_toolkit/databases/plot.py b/missense_kinase_toolkit/databases/missense_kinase_toolkit/databases/plot.py index 29c7563..a3bd47e 100644 --- a/missense_kinase_toolkit/databases/missense_kinase_toolkit/databases/plot.py +++ b/missense_kinase_toolkit/databases/missense_kinase_toolkit/databases/plot.py @@ -125,7 +125,6 @@ def generate_alignment(self) -> None: text="text", text_align="center", text_color="black", - # text_font = "monospace", text_font_size=f"{str(self.font_size)}pt", ) rects = Rect( @@ -150,9 +149,10 @@ def show_plot(self) -> None: """Show sequence alignment plot via Bokeh.""" from bokeh.plotting import show + # show in separate window + show(self.plot) + # notebook alternative # import panel as pn # pn.extension() # pn.pane.Bokeh(alignment_klifs_min.plot) - - show(self.plot) diff --git a/notebooks/databases.ipynb b/notebooks/databases.ipynb index c63473f..caba6bc 100644 --- a/notebooks/databases.ipynb +++ b/notebooks/databases.ipynb @@ -977,9 +977,7 @@ ")\n", "df_merge.to_csv(\"../data/kinhub_uniprot_merge.csv\", index=False)\n", "\n", - "dict_kin = kinase_schema.create_kinase_models_from_df(df_merge)\n", - "\n", - "model_kinase = kinase_schema.CollectionKinaseInfo(kinase_dict=dict_kin)" + "dict_kin = kinase_schema.create_kinase_models_from_df(df_merge)" ] } ], diff --git a/notebooks/klifs_pocket.ipynb b/notebooks/klifs_pocket.ipynb index 17835d4..3730f14 100644 --- a/notebooks/klifs_pocket.ipynb +++ b/notebooks/klifs_pocket.ipynb @@ -49,7 +49,7 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": 3, "id": "d01ab1c6-1d6e-465a-9f1e-e5462f0ac264", "metadata": {}, "outputs": [], @@ -63,7 +63,7 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 4, "id": "cf9e691e-a7aa-4005-aece-ae6183323d4a", "metadata": {}, "outputs": [], @@ -342,6 +342,159 @@ "dict_seq = {hgnc: \"\".join([*klifs_pocket.KLIFS2UniProtSeq.values()])\\\n", " for hgnc, klifs_pocket in dict_klifs.items()}" ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4411c6fd-0a18-4f1c-a83f-915da1f51d39", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "41f09832-087c-4856-adbe-d3fe90077a5c", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "fbfc696b-8138-4491-a890-3713f63b75b8", + "metadata": {}, + "outputs": [], + "source": [ + "import json\n", + "import glob" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "id": "402fdb8c-433e-4750-b7f5-931b4baabb4c", + "metadata": {}, + "outputs": [], + "source": [ + "for key, val in dict_kinase.items():\n", + " with open(f\"../data/KinaseInfo/{key}.json\", \"w\") as outfile: \n", + " json.dump(val.json(), outfile)" + ] + }, + { + "cell_type": "code", + "execution_count": 38, + "id": "62c4675a-8fcf-42b9-b941-1cb908f539c8", + "metadata": {}, + "outputs": [], + "source": [ + "list_json = glob.glob(\"../data/KinaseInfo/*\")\n", + "\n", + "dict_import = {}\n", + "\n", + "for file in list_json:\n", + " with open(file, \"r\") as openfile:\n", + " json_obj = json.load(openfile)\n", + " kinase_obj = kinase_schema.KinaseInfo.parse_raw(json_obj)\n", + " dict_import[kinase_obj.hgnc_name] = kinase_obj\n", + "\n", + "dict_import = {key: dict_import[key] for key in sorted(dict_import.keys())}" + ] + }, + { + "cell_type": "code", + "execution_count": 46, + "id": "f0d4c415-60a4-4cc3-a96e-8d6623ecb5e3", + "metadata": {}, + "outputs": [], + "source": [ + "temp = \"PFWKILNPLLERGTYYYFMGQQPGKVLGDQRRPSLPALHFIKGAGKKESSRHGGPHCNVFVEHEALQRPVASDFEPQGLSEAARWNSKENLLAGPSENDPNLFVALYDFVASGDNTLSITKGEKLRVLGYNHNGEWCEAQTKNGQGWVPSNYITPVNSLEKHSWYHGPVSRNAAEYLLSSGINGSFLVRESESSPGQRSISLRYEGRVYHYRINTASDGKLYVSSESRFNTLAELVHHHSTVADGLITTLHYPAPKRNKPTVYGVSPNYDKWEMERTDITMKHKLGGGQYGEVYEGVWKKYSLTVAVKTLKEDTMEVEEFLKEAAVMKEIKHPNLVQLLGVCTREPPFYIITEFMTYGNLLDYLRECNRQEVNAVVLLYMATQISSAMEYLEKKNFIHRDLAARNCLVGENHLVKVADFGLSRLMTGDTYTAHAGAKFPIKWTAPESLAYNKFSIKSDVWAFGVLLWEIATYGMSPYPGIDLSQVYELLEKDYRMERPEGCPEKVYELMRACWQWNPSDRPSFAEIHQAFETMFQESSISDEVEKELGKQGVRGAVSTLLQAPELPTKTRTSRRAAEHRDTTDVPEMPHSKGQGESDPLDHEPAVSPLLPRKERGPPEGGLNEDERLLPKDKKTNLFSALIKKKKKTAPTPPKRSSSFREMDGQPERRGAGEEEGRDISNGALAFTPLDTADPAKSPKPSNGAGVPNGALRESGGSGFRSPHLWKKSSTLTSSRLATGEEEGGGSSSKRFLRSCSASCVPHGAKDTEWRSVTLPRDLQSTGRQFDSSTFGGHKSEKPALPRKRAGENRSDQVTRGTVTPPPRLVKKNEEAADEVFKDIMESSPGSSPPNLTPKPLRRQVTVAPASGLPHKEEAGKGSALGTPAAAEPVTPTSKAGSGAPGGTSKGPAEESRVRRHKHSSESPGRDKGKLSRLKPAPPPPPAASAGKAGGKPSQSPSQEAAGEAVLGAKTKATSLVDAVNSDAAKPSQPGEGLKKPVLPATPKPQSAKPSGTPISPAPVPSTLPSASSALAGDQPSSTAFIPLISTRVSLRKTRQPPERIASGAITKGVVLDSTEALCLAISRNSEQMASHSAVLEAGKNLYTFCVSYVDSIQQMRNKFAFREAINKLENNLRELQICPATAGSGPAATQDFSKLLSSVKEISDIVQR\"" + ] + }, + { + "cell_type": "code", + "execution_count": 48, + "id": "bf5b5dc7-8ff4-404d-b656-6fc0b56a62da", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "1167" + ] + }, + "execution_count": 48, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "len(temp)" + ] + }, + { + "cell_type": "code", + "execution_count": 49, + "id": "05475fe0-a66b-4bf8-968f-6521f9fcbf93", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "1130" + ] + }, + "execution_count": 49, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "len(dict_kinase[\"ABL1\"].UniProt.canonical_seq)" + ] + }, + { + "cell_type": "code", + "execution_count": 50, + "id": "2885ec65-4ce5-47a9-8eae-84045c33e674", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "False" + ] + }, + "execution_count": 50, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "dict_kinase[\"ABL1\"].UniProt.canonical_seq in temp" + ] + }, + { + "cell_type": "code", + "execution_count": 43, + "id": "85f1a780-a747-4ca9-a170-be5e99509fd0", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "True" + ] + }, + "execution_count": 43, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "\"VKEISDIVQ\" in dict_kinase[\"ABL1\"].UniProt.canonical_seq" + ] } ], "metadata": { diff --git a/notebooks/pkis2_km_atp.ipynb b/notebooks/pkis2_km_atp.ipynb index 151ca6d..2fdca58 100644 --- a/notebooks/pkis2_km_atp.ipynb +++ b/notebooks/pkis2_km_atp.ipynb @@ -214,6 +214,7 @@ " if idx is not np.nan else np.nan for idx in list_concat]\n", "\n", "# manual fix\n", + "# cannot tell difference between PRKD2 using UniProt ID\n", "# df_pkis_copy.loc[df_pkis_copy[\"Assay Name\"] == \"PKD2\", \"uniprot\"] = \"Q13563\"\n", "df_pkis_copy.loc[df_pkis_copy[\"Assay Name\"] == \"PRKD2\", \"uniprot\"] = \"Q9BZL6\"\n", "df_pkis_copy.loc[df_pkis_copy[\"Assay Name\"] == \"RSK1\", \"uniprot\"] = \"Q15418\"" @@ -422,7 +423,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 15, "id": "ed5a72db-94be-4490-a90f-3d7af677955e", "metadata": {}, "outputs": [], @@ -464,7 +465,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 17, "id": "78c846e6-1ba0-4f55-99a4-aab09cda3108", "metadata": {}, "outputs": [], @@ -490,7 +491,7 @@ }, { "cell_type": "code", - "execution_count": 26, + "execution_count": 18, "id": "206fbb7a-fa14-4c2a-bfe7-c39bfdde7f08", "metadata": {}, "outputs": [ @@ -556,7 +557,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 19, "id": "f9e4593a-6d29-4c7c-932a-d4c95f182f87", "metadata": {}, "outputs": [], @@ -580,10 +581,25 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 20, "id": "0c487f05-a65a-474a-a544-b0e69236abc4", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "8a5c8d0b5df747e5b0373714f5fafe8a", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + " 0%| | 0/684 [00:00