-
Notifications
You must be signed in to change notification settings - Fork 3
Expand file tree
/
Copy pathrun_screening.py
More file actions
94 lines (74 loc) · 3.22 KB
/
run_screening.py
File metadata and controls
94 lines (74 loc) · 3.22 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
import argparse
from datetime import datetime
import pandas as pd
import os
from screening.screening_utils import find_latest_subdir, preprocess_data, get_fitted_model, clustering_data, find_mols_tosimulate
from screening.metrics import training_data
import random
# Seed the standard-library `random` generator with a fixed value at import time.
# NOTE(review): presumably for run-to-run reproducibility; `random` is not used
# directly in the code visible here, so confirm the imported helpers rely on it.
random.seed(42)
def run(
    path_batches,
    train_firstbatch_path,
    path_simulationdata,
    elbow=False,
    cluster_data=True,
    clusters=20,
    molspercluster=6,  # controls how many molecules per cluster are added to be potentially simulated
):
    """Set up the next screening batch and pick molecules to simulate.

    The steps, in order:

    1. Look up the latest sub-directory of ``path_batches`` and derive the
       previous batch number from its name (``batch<N>`` or
       ``batch<N>_<yymmdd>``); the new batch id is ``N + 1``.
    2. Create ``<path_batches>/batch<id>_<yymmdd>`` if it does not exist.
    3. Load the initial training CSV and the simulation-results CSV and hand
       them to ``training_data``.
       NOTE(review): ``training_data`` is presumably what writes
       ``train_batch<id>.csv`` / ``test_batch<id>.csv`` into the new batch
       directory, since they are read right afterwards - confirm.
    4. Preprocess the train/test CSVs and fit a model on the training data.
    5. If ``cluster_data`` is true, cluster the evaluation features, select
       molecules via ``find_mols_tosimulate`` and write them (no header, no
       index) to ``mols_simulate_<molspercluster>molspercluster.csv`` in the
       batch directory.

    Args:
        path_batches: Directory that contains the ``batch*`` sub-directories.
        train_firstbatch_path: CSV file with the initial training data.
        path_simulationdata: CSV file with the simulation data.
        elbow: Forwarded unchanged to ``clustering_data``.
        cluster_data: When false, the clustering/selection/save step is
            skipped.
        clusters: Cluster count forwarded to ``clustering_data`` and
            ``find_mols_tosimulate``.
        molspercluster: Forwarded as ``amount`` to ``find_mols_tosimulate``;
            also embedded in the output file name.

    Raises:
        ValueError: If the latest sub-directory name does not yield an
            integer batch number.
    """
    todays_date = datetime.today().strftime('%y%m%d')

    latest_subdir = find_latest_subdir(path_batches)
    # normpath + basename (rather than split('/')[-1]) so a trailing slash or
    # a non-'/' path separator does not produce an empty/garbled name.
    latest_name = os.path.basename(os.path.normpath(latest_subdir))
    try:
        prev_batch = int(latest_name.split('_')[0].replace('batch', ''))
    except ValueError as err:
        raise ValueError(
            f"Cannot parse a batch number from directory name {latest_name!r}"
        ) from err
    batch_id = prev_batch + 1

    # Kept as a plain '/'-joined string so it matches the path layout used in
    # the other f-strings of this function.
    directory = f'{path_batches}/batch{batch_id}_{todays_date}'
    # exist_ok avoids the check-then-create race of os.path.exists + makedirs.
    os.makedirs(directory, exist_ok=True)

    test_firstbatch_path = f"{path_batches}/batch0/test_batch0.csv"
    train_firstbatch = pd.read_csv(train_firstbatch_path)
    simulated_values = pd.read_csv(path_simulationdata)

    # Note: the first-batch test set is passed as a path, the other two inputs
    # as already-loaded DataFrames.
    training_data(
        test_firstbatch=test_firstbatch_path,
        train_firstbatch=train_firstbatch,
        simulated_values=simulated_values,
        batch_id=batch_id,
        todays_date=todays_date,
        path_batches=path_batches,
    )

    train_data_path = f'{directory}/train_batch{batch_id}.csv'
    test_data_path = f'{directory}/test_batch{batch_id}.csv'

    # Local is named `train_property` so the builtin `property` is not shadowed.
    features_scaled, train_property, features_scaled_eval = preprocess_data(
        train_data_path=train_data_path,
        test_data_path=test_data_path,
    )

    # view(-1, 1) reshapes the target values into a single column for fitting.
    model = get_fitted_model(features_scaled, train_property.view(-1, 1))

    # NOTE(review): the original nesting is not recoverable from this view.
    # Selection needs `cluster_labelling`, which only exists when clustering
    # ran, so selection and saving are kept under the same guard.
    if cluster_data:
        cluster_labelling = clustering_data(
            test_data=features_scaled_eval,
            clusters=clusters,
            elbow=elbow,
        )
        new_mols = find_mols_tosimulate(
            amount=molspercluster,
            test_data_path=test_data_path,
            features_scaled_eval=features_scaled_eval,
            cluster_labelling=cluster_labelling,
            model=model,
            property=train_property,
            clusters=clusters,
        )
        new_mols.to_csv(
            f'{directory}/mols_simulate_{molspercluster}molspercluster.csv',
            header=False,
            index=False,
        )
        print("Saved new smiles to simulate")
if __name__ == '__main__':
    # Command-line entry point: collect the three input paths and run one
    # screening iteration with the default clustering settings of `run`.
    parser = argparse.ArgumentParser(description="Run GPR for high throughput screening")
    # All three paths are mandatory: `run` has no defaults for them, and a
    # missing flag would otherwise reach it as None and fail much later.
    parser.add_argument(
        "-p", "--path_batches", required=True,
        help="Path to the where the batches are saved"
    )
    parser.add_argument(
        "-fb", "--train_firstbatch_path", required=True,
        help="Path to the initial training data"
    )
    parser.add_argument(
        "-sd", "--path_simulationdata", required=True,
        help="Path to all the simulation data including smiles of failed simulations"
    )
    args = parser.parse_args()

    run(
        path_batches=args.path_batches,
        train_firstbatch_path=args.train_firstbatch_path,
        path_simulationdata=args.path_simulationdata,
    )