-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathparse_conformations.py
89 lines (71 loc) · 3.09 KB
/
parse_conformations.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
import sys, re, glob, cctk, argparse, copy
import numpy as np
import pandas as pd
import multiprocessing as mp
from multiprocessing import freeze_support
from tabulate import tabulate
from tqdm import tqdm
# usage: python parse_conformations.py -c eneryg_cutoff prefix_of_new_files path/to/output/*.out
def read(file):
if re.match("slurm", file):
return
try:
output_file = cctk.GaussianFile.read_file(file)
if isinstance(output_file, list):
if len(output_file) > 0:
output_file = output_file[-1]
else:
return
return output_file
except Exception as e:
print(f"Error reading {file}\n{e}")
return
def main():
e = cctk.ConformationalEnsemble()
parser = argparse.ArgumentParser(prog="parse_conformations.py")
parser.add_argument("--cutoff", "-c", type=float, default=15)
parser.add_argument("--rmsd_cutoff", "-C", type=float, default=0.5)
parser.add_argument("prefix", type=str)
parser.add_argument("files", nargs='+')
args = vars(parser.parse_args(sys.argv[1:]))
print("\n\033[3mreading files:\033[0m")
pool = mp.Pool(processes=16)
for output_file in tqdm(pool.imap(read, args['files']), total=len(args['files'])):
molecule = output_file.get_molecule()
# there has got to be a better way to do this
e.add_molecule(*list(output_file.ensemble.items())[-1])
e[molecule, "iters"] = len(output_file.ensemble)
e[molecule, "success"] = output_file.successful_terminations
e[molecule, "imaginary"] = output_file.imaginaries()
if len(e) == 0:
print("no jobs to analyze!")
exit()
print(f"{len(e)} files read.")
e, rmsds = e.eliminate_redundant(RMSD_cutoff=args['rmsd_cutoff'], return_RMSD=True)
print(f"{len(e)} distinct conformations identified (RMSD cutoff of {args['rmsd_cutoff']:.2f}).")
print("\n\033[3manalysis:\033[0m")
property_names = ["filename", "new_filename", "rmsd", "iters", "energy", "success", "imaginary"]
values = e[:, property_names]
if not isinstance(values[0], list):
values = [values]
df = pd.DataFrame(values, columns=property_names).fillna(0)
df["rmsd"] = rmsds
df["new_filename"] = [f"{args['prefix']}_c{i:03}.gjf" for i in range(len(df))]
df["rel_energy"] = (df.energy - df.energy.min()) * 627.509469
df.sort_values("rel_energy", inplace=True)
normal_col_names = copy.deepcopy(df.columns)
df.columns = [f"\033[1m{c}\033[0m" for c in df.columns]
print(tabulate(df, headers="keys", tablefmt="presto", floatfmt=".5f"))
df.columns = normal_col_names
template_file = read(args['files'][0])
print("\n\033[3moutput:\033[0m")
for m, n, e in zip(e.molecule_list(), df["new_filename"], df["rel_energy"]):
if e <= args["cutoff"]:
template_file.write_file(n, molecule=m)
print(f"Wrote {n}")
else:
print(f"Skipping {n} and all subsequent rows: relative energy of {e:.2f} exceeds cutoff of {args['cutoff']:.2f}")
break
if __name__ == '__main__':
freeze_support()
main()