ChemTS
ChemTS copied to clipboard
How to Generate Molecules?
Hi, after saving the model in train_RNN.py,
I loaded the model using model_from_json("model.json")
and model.load_weights("model.h5")
When I called generate_smile(model, 'CCCC'), I got the following error:
ValueError: substring not found
What's wrong with my generate_smile()?
Hi, what is the generate_smile(model, 'CCCC') function? Can you paste your code here so that I can check what is wrong?
# Experimental sampling routine quoted from train_RNN.py; calling it as
# generate_smile(model, 'CCCC') is what produced the ValueError reported above.
# NOTE(review): the indentation of this paste was lost, so the nesting of the
# statements below cannot be determined from here.
# NOTE: every return statement is commented out, so nothing is returned.
def generate_smile(model,val):
new_smile = []
# Position of the first "C" in `val`; raises ValueError if there is none.
start_smile_index = [val.index("C")]
print(start_smile_index)
# `val.index("\n")` raises ValueError when `val` holds no newline (for example
# the plain string 'CCCC'), which matches "substring not found".
while not start_smile_index[-1] == val.index("\n"):
predictions = model.predict(start_smile_index)
##next atom probability
smf = []
# NOTE(review): `X` is not defined in this function - presumably a
# module-level array in train_RNN.py; confirm.
for i in range (len(X)):
sm = []
for j in range(len(X[i])):
#if np.argmax(predictions[i][j])=!0
# Index of the largest value in predictions[i][j].
sm.append(np.argmax(predictions[i][j]))
smf.append(sm)
print(sm)
print(smf)
#print(len(sm))
# new_smile.append(sampled_word)
# return ''.join(new_smile)
#sentence_str = [index_to_word[x] for x in new_sentence[1:-1]]
#return new_sentence
I found the above-mentioned function in your code (train_RNN.py). I could train the model and save it, but now I would like to reload it and start generating new SMILES. generate_smile(model, val) doesn't work. Could you please tell me how you used a pre-trained model to generate new molecules?
Hi, the function generate_smile(model, val) in train_RNN.py is a test version that I used for testing my first implementation, so please don't use it. Can you try the following code to generate your molecules?
from subprocess import Popen, PIPE from math import * import random import numpy as np import random as pr from copy import deepcopy from types import IntType, ListType, TupleType, StringTypes import itertools import time import math import tensorflow as tf import argparse from load_model import loaded_model from keras.preprocessing import sequence from rdkit import Chem from rdkit.Chem import Draw from rdkit.Chem import Descriptors from rdkit.Chem import MolFromSmiles, MolToSmiles import sys from make_smile import zinc_data_with_bracket_original, zinc_processed_with_bracket
def chem_kn_simulation(model, state, val):
    """Sample one sequence of symbol codes from the RNN, seeded with ``state``.

    Args:
        model: Trained network. ``model.predict`` is called on a padded
            ``(1, 82)`` integer array and its result is indexed as
            ``predictions[0][position]``.
        state: Sequence of seed symbols; each one must occur in ``val``.
        val: Ordered vocabulary; a symbol's position is its integer code.

    Returns:
        A one-element list containing the list of integer codes (the seed
        codes followed by every sampled code).

    Raises:
        ValueError: If a seed symbol or the ``"\\n"`` terminator is not in
            ``val``.
    """
    max_len = 82
    # The vocabulary does not change while sampling, so look this up once.
    end_code = val.index("\n")
    get_int = [val.index(symbol) for symbol in state]

    x = np.reshape(get_int, (1, len(get_int)))
    x_pad = sequence.pad_sequences(x, maxlen=max_len, dtype='int32',
                                   padding='post', truncating='pre', value=0.)

    while get_int[-1] != end_code:
        predictions = model.predict(x_pad)
        # Distribution predicted at the position of the last real token.
        preds = np.asarray(predictions[0][len(get_int) - 1]).astype('float64')
        # Log / exp round trip with a divisor of 1.0, then renormalise.
        preds = np.log(preds) / 1.0
        preds = np.exp(preds) / np.sum(np.exp(preds))
        next_probas = np.random.multinomial(1, preds, 1)
        next_int = np.argmax(next_probas)
        get_int.append(next_int)

        x = np.reshape(get_int, (1, len(get_int)))
        x_pad = sequence.pad_sequences(x, maxlen=max_len, dtype='int32',
                                       padding='post', truncating='pre', value=0.)
        # Stop once the sequence no longer fits into the padded window.
        if len(get_int) > max_len:
            break

    all_posible = [get_int]
    # The original printed the misspelled name `all_possible` (NameError).
    print(all_posible)
    return all_posible
def predict_smile(all_posible, val):
    """Translate sequences of integer codes back into lists of symbols.

    Args:
        all_posible: Iterable of code sequences.
        val: Ordered vocabulary used to look each code up.

    Returns:
        One list of symbols per input sequence. The final code of every
        sequence is dropped and the first ``"&"`` symbol, if any, is removed.
    """
    new_compound = []
    for total_generated in all_posible:
        # The last code is skipped, exactly as the original index loop did.
        generate_smile = [val[code] for code in total_generated[:-1]]
        # Seeds such as ['C', 'O', 'C'] carry no "&"; an unconditional
        # remove() would raise ValueError for them.
        if "&" in generate_smile:
            generate_smile.remove("&")
        new_compound.append(generate_smile)
    return new_compound
def make_input_smile(generate_smile):
    """Join each list of symbols into one string.

    Args:
        generate_smile: Iterable of symbol lists.

    Returns:
        A list with one concatenated string per input list, in input order.
    """
    return [''.join(symbols) for symbols in generate_smile]
Example of using the above three functions to generate molecules
# Ordered symbol vocabulary: a symbol's position in this list is its integer
# code, so the order must not change. Eight symbols per row, so the code of an
# entry is row * 8 + column. The backslash entry must be written '\\'; the
# original '\' is not a valid string literal.
val = [
    '\n', '&', 'C', '(', ')', 'c', '1', '2',
    'o', '=', 'O', 'N', '3', 'F', '[C@@H]', 'n',
    '-', '#', 'S', 'Cl', '[O-]', '[C@H]', '[NH+]', '[C@]',
    's', 'Br', '/', '[nH]', '[NH3+]', '4', '[NH2+]', '[C@@]',
    '[N+]', '[nH+]', '\\', '[S@]', '5', '[N-]', '[n+]', '[S@@]',
    '[S-]', '6', '7', 'I', '[n-]', 'P', '[OH+]', '[NH-]',
    '[P@@H]', '[P@@]', '[PH2]', '[P@]', '[P+]', '[S+]', '[o+]', '[CH2-]',
    '[CH-]', '[SH+]', '[O+]', '[s+]', '[PH+]', '[PH]', '8', '[S@@+]',
]
# NOTE(review): `model` is not created in this snippet; it must already hold
# the trained network (presumably obtained through load_model - confirm).
all_posible = chem_kn_simulation(model, ['C', 'O', 'C'], val)
generate_smile = predict_smile(all_posible, val)
new_compound = make_input_smile(generate_smile)
print(new_compound)
`from subprocess import Popen, PIPE from math import * import random import numpy as np import random as pr from copy import deepcopy from types import IntType, ListType, TupleType, StringTypes import itertools import time import math import tensorflow as tf import argparse from load_model import loaded_model from keras.preprocessing import sequence from rdkit import Chem from rdkit.Chem import Draw from rdkit.Chem import Descriptors from rdkit.Chem import MolFromSmiles, MolToSmiles import sys from make_smile import zinc_data_with_bracket_original, zinc_processed_with_bracket
def chem_kn_simulation(model, state, val):
    """Sample one sequence of symbol codes from the RNN, seeded with ``state``.

    Args:
        model: Trained network. ``model.predict`` is called on a padded
            ``(1, 82)`` integer array and its result is indexed as
            ``predictions[0][position]``.
        state: Sequence of seed symbols; each one must occur in ``val``.
        val: Ordered vocabulary; a symbol's position is its integer code.

    Returns:
        A one-element list containing the list of integer codes (the seed
        codes followed by every sampled code).

    Raises:
        ValueError: If a seed symbol or the ``"\\n"`` terminator is not in
            ``val``.
    """
    max_len = 82
    # The vocabulary does not change while sampling, so look this up once.
    end_code = val.index("\n")
    get_int = [val.index(symbol) for symbol in state]

    x = np.reshape(get_int, (1, len(get_int)))
    x_pad = sequence.pad_sequences(x, maxlen=max_len, dtype='int32',
                                   padding='post', truncating='pre', value=0.)

    while get_int[-1] != end_code:
        predictions = model.predict(x_pad)
        # Distribution predicted at the position of the last real token.
        preds = np.asarray(predictions[0][len(get_int) - 1]).astype('float64')
        # Log / exp round trip with a divisor of 1.0, then renormalise.
        preds = np.log(preds) / 1.0
        preds = np.exp(preds) / np.sum(np.exp(preds))
        next_probas = np.random.multinomial(1, preds, 1)
        next_int = np.argmax(next_probas)
        get_int.append(next_int)

        x = np.reshape(get_int, (1, len(get_int)))
        x_pad = sequence.pad_sequences(x, maxlen=max_len, dtype='int32',
                                       padding='post', truncating='pre', value=0.)
        # Stop once the sequence no longer fits into the padded window.
        if len(get_int) > max_len:
            break

    all_posible = [get_int]
    # The original printed the misspelled name `all_possible` (NameError).
    print(all_posible)
    return all_posible
def predict_smile(all_posible, val):
    """Translate sequences of integer codes back into lists of symbols.

    Args:
        all_posible: Iterable of code sequences.
        val: Ordered vocabulary used to look each code up.

    Returns:
        One list of symbols per input sequence. The final code of every
        sequence is dropped and the first ``"&"`` symbol, if any, is removed.
    """
    new_compound = []
    for total_generated in all_posible:
        # The last code is skipped, exactly as the original index loop did.
        generate_smile = [val[code] for code in total_generated[:-1]]
        # Seeds such as ['C', 'O', 'C'] carry no "&"; an unconditional
        # remove() would raise ValueError for them.
        if "&" in generate_smile:
            generate_smile.remove("&")
        new_compound.append(generate_smile)
    return new_compound
def make_input_smile(generate_smile):
    """Join each list of symbols into one string.

    Args:
        generate_smile: Iterable of symbol lists.

    Returns:
        A list with one concatenated string per input list, in input order.
    """
    return [''.join(symbols) for symbols in generate_smile]
Example of using the above three functions to generate molecules
# Ordered symbol vocabulary: a symbol's position in this list is its integer
# code, so the order must not change. Eight symbols per row, so the code of an
# entry is row * 8 + column. The backslash entry must be written '\\'; the
# original '\' is not a valid string literal.
val = [
    '\n', '&', 'C', '(', ')', 'c', '1', '2',
    'o', '=', 'O', 'N', '3', 'F', '[C@@H]', 'n',
    '-', '#', 'S', 'Cl', '[O-]', '[C@H]', '[NH+]', '[C@]',
    's', 'Br', '/', '[nH]', '[NH3+]', '4', '[NH2+]', '[C@@]',
    '[N+]', '[nH+]', '\\', '[S@]', '5', '[N-]', '[n+]', '[S@@]',
    '[S-]', '6', '7', 'I', '[n-]', 'P', '[OH+]', '[NH-]',
    '[P@@H]', '[P@@]', '[PH2]', '[P@]', '[P+]', '[S+]', '[o+]', '[CH2-]',
    '[CH-]', '[SH+]', '[O+]', '[s+]', '[PH+]', '[PH]', '8', '[S@@+]',
]
all_posible=chem_kn_simulation(model,['C','O','C'],val) generate_smile=predict_smile(all_posible,val) new_compound=make_input_smile(generate_smile) print new_compound `
Save test.txt as test.py, and you can use it to generate molecules.
@yangxiufengsia Thanks for the great help. It WORKED! However, there were some minor mistakes in your code, so I modified it as follows to correct them:
import sys
import math
import random
import numpy as np
import random as pr
from rdkit import Chem
from make_smile import *
from keras.models import load_model
from keras.preprocessing import sequence
def chem_kn_simulation(model, state, val):
    """Sample one sequence of symbol codes from the RNN, seeded with ``state``.

    Args:
        model: Trained network. ``model.predict`` is called on a padded
            ``(1, 81)`` integer array and its result is indexed as
            ``predictions[0][position]``.
        state: Sequence of seed symbols; each one must occur in ``val``.
        val: Ordered vocabulary; a symbol's position is its integer code.

    Returns:
        A one-element list containing the list of integer codes (the seed
        codes followed by every sampled code).

    Raises:
        ValueError: If a seed symbol or the ``"\\n"`` terminator is not in
            ``val``.
    """
    max_len = 81
    # The vocabulary does not change while sampling, so look this up once.
    end_code = val.index("\n")
    get_int = [val.index(symbol) for symbol in state]

    x = np.reshape(get_int, (1, len(get_int)))
    x_pad = sequence.pad_sequences(x, maxlen=max_len, dtype='int32',
                                   padding='post', truncating='pre', value=0.0)

    while get_int[-1] != end_code:
        predictions = model.predict(x_pad)
        # Distribution predicted at the position of the last real token,
        # renormalised in float64 before it is handed to multinomial().
        preds = np.asarray(predictions[0][len(get_int) - 1]).astype('float64')
        preds = preds / np.sum(preds)
        next_probas = np.random.multinomial(1, preds, 1)
        next_int = np.argmax(next_probas)
        get_int.append(next_int)

        x = np.reshape(get_int, (1, len(get_int)))
        x_pad = sequence.pad_sequences(x, maxlen=max_len, dtype='int32',
                                       padding='post', truncating='pre', value=0.0)
        # Stop once the sequence no longer fits into the padded window.
        if len(get_int) > max_len:
            break

    return [get_int]
def predict_smile(all_posible, val):
    """Translate sequences of integer codes back into lists of symbols.

    Args:
        all_posible: Iterable of code sequences.
        val: Ordered vocabulary used to look each code up.

    Returns:
        One list of symbols per input sequence; the final code of every
        sequence is dropped, all other codes are kept.
    """
    # seq[:-1] skips the last code, as the original index loop did.
    return [[val[code] for code in seq[:-1]] for seq in all_posible]
def make_input_smile(generate_smile):
    """Join each list of symbols into one string.

    Args:
        generate_smile: Iterable of symbol lists.

    Returns:
        A list with one concatenated string per input list, in input order.
    """
    return [''.join(symbols) for symbols in generate_smile]
### Example of using the above three functions to generate molecules
# Ordered symbol vocabulary: a symbol's position in this list is its integer
# code, so the order must not change. Eight symbols per row, so the code of an
# entry is row * 8 + column.
val = [
    "\n", "&", "C", "(", ")", "c", "1", "2",
    "o", "=", "O", "N", "3", "F", "[C@@H]", "n",
    "-", "#", "S", "Cl", "[O-]", "[C@H]", "[NH+]", "[C@]",
    "s", "Br", "/", "[nH]", "[NH3+]", "4", "[NH2+]", "[C@@]",
    "[N+]", "[nH+]", "\\", "[S@]", "5", "[N-]", "[n+]", "[S@@]",
    "[S-]", "6", "7", "I", "[n-]", "P", "[OH+]", "[NH-]",
    "[P@@H]", "[P@@]", "[PH2]", "[P@]", "[P+]", "[S+]", "[o+]", "[CH2-]",
    "[CH-]", "[SH+]", "[O+]", "[s+]", "[PH+]", "[PH]", "8", "[S@@+]",
]
if __name__ == "__main__":
    # Command line: <script> MODEL_FILE SEED_SMILES
    if len(sys.argv) < 3:
        sys.exit("usage: %s MODEL_FILE SEED_SMILES" % sys.argv[0])
    smiles = sys.argv[2]

    _, all_smile = zinc_processed_with_bracket([smiles])
    # Drop the "&" and "\n" entries from the tokenised seed before sampling.
    # NOTE(review): presumably zinc_processed_with_bracket adds both markers;
    # remove() raises ValueError if either one is missing - confirm.
    all_smile[0].remove('&')
    all_smile[0].remove('\n')
    print(all_smile[0])

    model = load_model(sys.argv[1])
    # For a fixed seed, pass e.g. ['C', 'O', 'C'] instead of all_smile[0].
    all_posible = chem_kn_simulation(model, all_smile[0], val)
    generate_smile = predict_smile(all_posible, val)
    new_compound = make_input_smile(generate_smile)
    print(new_compound)
I just wonder whether val is fixed or can be generated:
# Ordered symbol vocabulary: a symbol's position in this list is its integer
# code, so the order must not change. Eight symbols per row, so the code of an
# entry is row * 8 + column.
val = [
    "\n", "&", "C", "(", ")", "c", "1", "2",
    "o", "=", "O", "N", "3", "F", "[C@@H]", "n",
    "-", "#", "S", "Cl", "[O-]", "[C@H]", "[NH+]", "[C@]",
    "s", "Br", "/", "[nH]", "[NH3+]", "4", "[NH2+]", "[C@@]",
    "[N+]", "[nH+]", "\\", "[S@]", "5", "[N-]", "[n+]", "[S@@]",
    "[S-]", "6", "7", "I", "[n-]", "P", "[OH+]", "[NH-]",
    "[P@@H]", "[P@@]", "[PH2]", "[P@]", "[P+]", "[S+]", "[o+]", "[CH2-]",
    "[CH-]", "[SH+]", "[O+]", "[s+]", "[PH+]", "[PH]", "8", "[S@@+]",
]
@ALPHAYA-Japan Good to know you can generate molecules now. val is obtained from the training dataset, and the symbols in val are used as the nodes of the search tree. Of course, you can use different symbols.