-
Notifications
You must be signed in to change notification settings - Fork 155
/
Copy pathesol.py
207 lines (185 loc) · 8.45 KB
/
esol.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
# -*- coding: utf-8 -*-
#
# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
# SPDX-License-Identifier: Apache-2.0
#
# ESOL from MoleculeNet for the prediction of water solubility
import pandas as pd
from dgl.data.utils import get_download_dir, download, _get_dgl_url, extract_archive
from .csv_dataset import MoleculeCSVDataset
__all__ = ['ESOL']
class ESOL(MoleculeCSVDataset):
    r"""ESOL from MoleculeNet for the prediction of water solubility

    Quoting [1], " ESOL is a small dataset consisting of water solubility data for 1128 compounds.
    The dataset has been used to train models that estimate solubility directly from chemical
    structures (as encoded in SMILES strings). Note that these structures don't include 3D
    coordinates, since solubility is a property of a molecule and not of its particular
    conformers."

    References:

        * [1] MoleculeNet: A Benchmark for Molecular Machine Learning.
        * [2] ESOL: estimating aqueous solubility directly from molecular structure.
        * [3] DeepChem

    Parameters
    ----------
    smiles_to_graph: callable, str -> DGLGraph
        A function turning a SMILES string into a DGLGraph. If None, it uses
        :func:`dgllife.utils.SMILESToBigraph` by default.
    node_featurizer : callable, rdkit.Chem.rdchem.Mol -> dict
        Featurization for nodes like atoms in a molecule, which can be used to update
        ndata for a DGLGraph. Default to None.
    edge_featurizer : callable, rdkit.Chem.rdchem.Mol -> dict
        Featurization for edges like bonds in a molecule, which can be used to update
        edata for a DGLGraph. Default to None.
    load : bool
        Whether to load the previously pre-processed dataset or pre-process from scratch.
        ``load`` should be False when we want to try different graph construction and
        featurization methods and need to preprocess from scratch. Default to False.
    log_every : int
        Print a message every time ``log_every`` molecules are processed. Default to 1000.
    cache_file_path : str
        Path to the cached DGLGraphs, default to './esol_dglgraph.bin'.
    n_jobs : int
        The maximum number of concurrently running jobs for graph construction and featurization,
        using joblib backend. Default to 1.

    Examples
    --------

    >>> from dgllife.data import ESOL
    >>> from dgllife.utils import SMILESToBigraph, CanonicalAtomFeaturizer

    >>> smiles_to_g = SMILESToBigraph(node_featurizer=CanonicalAtomFeaturizer())
    >>> dataset = ESOL(smiles_to_g)
    >>> # Get size of the dataset
    >>> len(dataset)
    1128
    >>> # Get the 0th datapoint, consisting of SMILES, DGLGraph and solubility
    >>> dataset[0]
    ('OCC3OC(OCC2OC(OC(C#N)c1ccccc1)C(O)C(O)C2O)C(O)C(O)C3O ',
     DGLGraph(num_nodes=32, num_edges=68,
              ndata_schemes={}
              edata_schemes={}),
     tensor([-0.7700]))

    We also provide information for the name, estimated solubility, minimum atom
    degree, molecular weight, number of h bond donors, number of rings,
    number of rotatable bonds, and polar surface area of the compound

    >>> # Access the information mentioned above for the ith datapoint
    >>> dataset.compound_names[i]
    >>> dataset.estimated_solubility[i]
    >>> dataset.min_degree[i]
    >>> dataset.mol_weight[i]
    >>> dataset.num_h_bond_donors[i]
    >>> dataset.num_rings[i]
    >>> dataset.num_rotatable_bonds[i]
    >>> dataset.polar_surface_area[i]

    We can also get all these information along with SMILES, DGLGraph and solubility
    at once.

    >>> dataset.load_full = True
    >>> dataset[0]
    ('OCC3OC(OCC2OC(OC(C#N)c1ccccc1)C(O)C(O)C2O)C(O)C(O)C3O ',
     DGLGraph(num_nodes=32, num_edges=68,
              ndata_schemes={}
              edata_schemes={}),
     tensor([-0.7700]),
     'Amigdalin',
     -0.974,
     1,
     457.43200000000013,
     7,
     3,
     7,
     202.32)
    """

    def __init__(self,
                 smiles_to_graph=None,
                 node_featurizer=None,
                 edge_featurizer=None,
                 load=False,
                 log_every=1000,
                 cache_file_path='./esol_dglgraph.bin',
                 n_jobs=1):
        """Download the ESOL archive, read its CSV and build the dataset.

        See the class docstring for the meaning of each parameter.
        """
        self._url = 'dataset/ESOL.zip'
        download_dir = get_download_dir()
        data_path = download_dir + '/ESOL.zip'
        dir_path = download_dir + '/ESOL'
        # overwrite=False: the download call is asked not to overwrite an
        # archive that is already present at data_path.
        download(_get_dgl_url(self._url), path=data_path, overwrite=False)
        extract_archive(data_path, dir_path)
        df = pd.read_csv(dir_path + '/delaney-processed.csv')

        super(ESOL, self).__init__(df=df,
                                   smiles_to_graph=smiles_to_graph,
                                   node_featurizer=node_featurizer,
                                   edge_featurizer=edge_featurizer,
                                   smiles_column='smiles',
                                   cache_file_path=cache_file_path,
                                   task_names=['measured log solubility in mols per litre'],
                                   load=load,
                                   log_every=log_every,
                                   init_mask=False,
                                   n_jobs=n_jobs)

        # When False, __getitem__ returns only (SMILES, graph, label); when True
        # it additionally returns the per-compound descriptors stored below.
        self.load_full = False

        def _valid_values(column):
            """Return the values of ``column`` restricted to ``self.valid_ids``."""
            # self.valid_ids is provided by the parent constructor; presumably it
            # holds the row indices of molecules that were kept -- confirm against
            # MoleculeCSVDataset. Indexing by it keeps every descriptor list
            # aligned with self.smiles / self.graphs / self.labels.
            values = df[column].tolist()
            return [values[i] for i in self.valid_ids]

        # Compound names in PubChem
        self.compound_names = _valid_values('Compound ID')
        # Estimated solubility
        self.estimated_solubility = _valid_values(
            'ESOL predicted log solubility in mols per litre')
        # Minimum atom degree
        self.min_degree = _valid_values('Minimum Degree')
        # Molecular weight
        self.mol_weight = _valid_values('Molecular Weight')
        # Number of H-Bond Donors
        self.num_h_bond_donors = _valid_values('Number of H-Bond Donors')
        # Number of rings
        self.num_rings = _valid_values('Number of Rings')
        # Number of rotatable bonds
        self.num_rotatable_bonds = _valid_values('Number of Rotatable Bonds')
        # Polar Surface Area
        self.polar_surface_area = _valid_values('Polar Surface Area')

    def __getitem__(self, item):
        """Get datapoint with index

        Parameters
        ----------
        item : int
            Datapoint index

        Returns
        -------
        str
            SMILES for the ith datapoint
        DGLGraph
            DGLGraph for the ith datapoint
        Tensor of dtype float32 and shape (1)
            Labels of the ith datapoint
        str, optional
            Name for the ith compound, returned only when ``self.load_full`` is True.
        float, optional
            Estimated solubility for the ith compound,
            returned only when ``self.load_full`` is True.
        int, optional
            Minimum atom degree for the ith datapoint, returned only when
            ``self.load_full`` is True.
        float, optional
            Molecular weight for the ith datapoint, returned only when
            ``self.load_full`` is True.
        int, optional
            Number of h bond donors for the ith datapoint, returned only when
            ``self.load_full`` is True.
        int, optional
            Number of rings in the ith datapoint, returned only when
            ``self.load_full`` is True.
        int, optional
            Number of rotatable bonds in the ith datapoint, returned only when
            ``self.load_full`` is True.
        float, optional
            Polar surface area for the ith datapoint, returned only when
            ``self.load_full`` is True.
        """
        # The first three entries are returned in both modes.
        datapoint = (self.smiles[item], self.graphs[item], self.labels[item])
        if not self.load_full:
            return datapoint

        # Full mode: append the eight extra descriptors in the documented order.
        return datapoint + (
            self.compound_names[item],
            self.estimated_solubility[item],
            self.min_degree[item],
            self.mol_weight[item],
            self.num_h_bond_donors[item],
            self.num_rings[item],
            self.num_rotatable_bonds[item],
            self.polar_surface_area[item],
        )