ChemTS icon indicating copy to clipboard operation
ChemTS copied to clipboard

How to Generate Molecules?

Open ALPHAYA-Japan opened this issue 6 years ago • 10 comments

Hi, after saving the model in train_RNN.py, I loaded the model using model_from_json("model.json") and model.load_weights("model.h5")

I called generate_smile(model, 'CCCC') and got the following error:

ValueError: substring not found

What's wrong with my generate_smile()?

ALPHAYA-Japan avatar Apr 27 '18 10:04 ALPHAYA-Japan

Hi, what is the generate_smile(model, 'CCCC') function? Can you paste your code here so that I can check what is wrong?

yangxiufengsia avatar Apr 27 '18 13:04 yangxiufengsia

def generate_smile(model,val):
    """Unfinished experimental sampler; as written it cannot produce a SMILES.

    ``val`` is searched with ``val.index`` for "C" and for the newline
    terminator, so a ``val`` without a newline (for example the string
    'CCCC') raises ValueError before the loop body ever runs.

    NOTE(review): ``X`` and ``np`` are not defined inside this function, the
    loop never appends to ``start_smile_index`` (so its condition cannot
    change), and there is no active ``return`` -- the function yields None.
    """
    new_smile = []
    # Seed index list: position of the first "C" in val.
    start_smile_index = [val.index("C")]
    print(start_smile_index)
    # Intended to stop once the last index is the newline terminator; nothing
    # below updates start_smile_index, so the test never becomes false.
    while not start_smile_index[-1] == val.index("\n"):
        predictions = model.predict(start_smile_index)
        ##next atom probability
        smf = []
        # NOTE(review): X is not a parameter or local -- presumably the
        # training input array from the surrounding script; confirm.
        for i in range (len(X)):
            sm = []
            for j in range(len(X[i])):
                #if np.argmax(predictions[i][j])=!0
                # Index of the highest-scoring entry at position (i, j).
                sm.append(np.argmax(predictions[i][j]))
            smf.append(sm)
        print(sm)
        print(smf)
        #print(len(sm))
        # new_smile.append(sampled_word)
    # return ''.join(new_smile)
    #sentence_str = [index_to_word[x] for x in new_sentence[1:-1]]
    #return new_sentence

ALPHAYA-Japan avatar Apr 29 '18 00:04 ALPHAYA-Japan

I found the above-mentioned function in your code (train_RNN.py). I was able to train the model and save it, but now I would like to reload it and start generating new SMILES. generate_smile(model, val) doesn't work. Could you please tell me how you used a pre-trained model to generate new molecules?

ALPHAYA-Japan avatar Apr 29 '18 03:04 ALPHAYA-Japan

Hi, the function generate_smile(model, val) in train_RNN.py is a test version that I used for testing my first implementation, so please don't use it. Can you try the following code to generate your molecules?

from subprocess import Popen, PIPE from math import * import random import numpy as np import random as pr from copy import deepcopy from types import IntType, ListType, TupleType, StringTypes import itertools import time import math import tensorflow as tf import argparse from load_model import loaded_model from keras.preprocessing import sequence from rdkit import Chem from rdkit.Chem import Draw from rdkit.Chem import Descriptors from rdkit.Chem import MolFromSmiles, MolToSmiles import sys from make_smile import zinc_data_with_bracket_original, zinc_processed_with_bracket

def chem_kn_simulation(model, state, val, max_len=82):
    """Sample one token-index sequence from the RNN, seeded with ``state``.

    Args:
        model: object whose ``predict`` accepts the padded index array.
            Assumes the result is indexable as ``predictions[0][t]`` and
            gives one probability per entry of ``val`` at position ``t``
            -- TODO confirm against the trained model.
        state: non-empty sequence of seed tokens, each present in ``val``.
        val: token vocabulary; a token's position is its integer id.
        max_len: width of the padded input window passed to the model.

    Returns:
        A one-element list holding the generated list of token ids. The
        list ends with the id of the newline terminator unless generation
        stopped because the sequence outgrew ``max_len``.

    Raises:
        ValueError: if a seed token or the newline terminator is not in
            ``val``.
        IndexError: if ``state`` is empty.
    """
    end_index = val.index("\n")
    get_int = [val.index(token) for token in state]

    # Stop on the terminator, or once the sequence no longer fits the window
    # (position len(get_int) - 1 must exist in the model output).
    while get_int[-1] != end_index and len(get_int) <= max_len:
        x = np.reshape(get_int, (1, len(get_int)))
        x_pad = sequence.pad_sequences(x, maxlen=max_len, dtype='int32',
                                       padding='post', truncating='pre',
                                       value=0.)
        predictions = model.predict(x_pad)
        # Distribution for the token that follows the last real position.
        preds = np.asarray(predictions[0][len(get_int) - 1]).astype('float64')
        # Temperature-1.0 rescaling; also renormalises in float64 so that
        # np.random.multinomial accepts the probabilities.
        preds = np.log(preds) / 1.0
        preds = np.exp(preds) / np.sum(np.exp(preds))
        next_probas = np.random.multinomial(1, preds, 1)
        get_int.append(np.argmax(next_probas))

    all_posible = [get_int]
    print(all_posible)
    return all_posible

def predict_smile(all_posible, val):
    """Translate generated index sequences back into token lists.

    Args:
        all_posible: iterable of index sequences; the final index of each
            sequence is dropped before decoding.
        val: token vocabulary indexed by token id.

    Returns:
        One list of tokens per input sequence, with the first "&" token
        removed when the sequence contains one.
    """
    new_compound = []
    for total_generated in all_posible:
        generate_smile = [val[index] for index in total_generated[:-1]]
        # Seeds such as ['C', 'O', 'C'] carry no "&"; an unconditional
        # remove() would raise ValueError for them.
        if "&" in generate_smile:
            generate_smile.remove("&")
        new_compound.append(generate_smile)
    return new_compound

def make_input_smile(generate_smile):
    """Join each list of tokens into a single string.

    Args:
        generate_smile: iterable of token lists.

    Returns:
        A list with one concatenated string per token list.
    """
    return [''.join(tokens) for tokens in generate_smile]

Example of using the above three functions to generate molecules

# Token vocabulary: a symbol's position in this list is the integer id used
# when encoding a seed (val.index) and decoding a generated sequence (val[i]).
val = ['\n', '&', 'C', '(', ')', 'c', '1', '2', 'o', '=', 'O', 'N', '3', 'F',
       '[C@@H]', 'n', '-', '#', 'S', 'Cl', '[O-]', '[C@H]', '[NH+]', '[C@]',
       's', 'Br', '/', '[nH]', '[NH3+]', '4', '[NH2+]', '[C@@]', '[N+]',
       '[nH+]', '\\', '[S@]', '5', '[N-]', '[n+]', '[S@@]', '[S-]', '6', '7',
       'I', '[n-]', 'P', '[OH+]', '[NH-]', '[P@@H]', '[P@@]', '[PH2]',
       '[P@]', '[P+]', '[S+]', '[o+]', '[CH2-]', '[CH-]', '[SH+]', '[O+]',
       '[s+]', '[PH+]', '[PH]', '8', '[S@@+]']

# Sample one sequence seeded with C-O-C, decode it and print the result.
all_posible = chem_kn_simulation(model, ['C', 'O', 'C'], val)
generate_smile = predict_smile(all_posible, val)
new_compound = make_input_smile(generate_smile)
print(new_compound)

yangxiufengsia avatar Apr 29 '18 04:04 yangxiufengsia

`from subprocess import Popen, PIPE from math import * import random import numpy as np import random as pr from copy import deepcopy from types import IntType, ListType, TupleType, StringTypes import itertools import time import math import tensorflow as tf import argparse from load_model import loaded_model from keras.preprocessing import sequence from rdkit import Chem from rdkit.Chem import Draw from rdkit.Chem import Descriptors from rdkit.Chem import MolFromSmiles, MolToSmiles import sys from make_smile import zinc_data_with_bracket_original, zinc_processed_with_bracket

def chem_kn_simulation(model, state, val, max_len=82):
    """Sample one token-index sequence from the RNN, seeded with ``state``.

    Args:
        model: object whose ``predict`` accepts the padded index array.
            Assumes the result is indexable as ``predictions[0][t]`` and
            gives one probability per entry of ``val`` at position ``t``
            -- TODO confirm against the trained model.
        state: non-empty sequence of seed tokens, each present in ``val``.
        val: token vocabulary; a token's position is its integer id.
        max_len: width of the padded input window passed to the model.

    Returns:
        A one-element list holding the generated list of token ids. The
        list ends with the id of the newline terminator unless generation
        stopped because the sequence outgrew ``max_len``.

    Raises:
        ValueError: if a seed token or the newline terminator is not in
            ``val``.
        IndexError: if ``state`` is empty.
    """
    end_index = val.index("\n")
    get_int = [val.index(token) for token in state]

    # Stop on the terminator, or once the sequence no longer fits the window
    # (position len(get_int) - 1 must exist in the model output).
    while get_int[-1] != end_index and len(get_int) <= max_len:
        x = np.reshape(get_int, (1, len(get_int)))
        x_pad = sequence.pad_sequences(x, maxlen=max_len, dtype='int32',
                                       padding='post', truncating='pre',
                                       value=0.)
        predictions = model.predict(x_pad)
        # Distribution for the token that follows the last real position.
        preds = np.asarray(predictions[0][len(get_int) - 1]).astype('float64')
        # Temperature-1.0 rescaling; also renormalises in float64 so that
        # np.random.multinomial accepts the probabilities.
        preds = np.log(preds) / 1.0
        preds = np.exp(preds) / np.sum(np.exp(preds))
        next_probas = np.random.multinomial(1, preds, 1)
        get_int.append(np.argmax(next_probas))

    all_posible = [get_int]
    print(all_posible)
    return all_posible

def predict_smile(all_posible, val):
    """Translate generated index sequences back into token lists.

    Args:
        all_posible: iterable of index sequences; the final index of each
            sequence is dropped before decoding.
        val: token vocabulary indexed by token id.

    Returns:
        One list of tokens per input sequence, with the first "&" token
        removed when the sequence contains one.
    """
    new_compound = []
    for total_generated in all_posible:
        generate_smile = [val[index] for index in total_generated[:-1]]
        # Seeds such as ['C', 'O', 'C'] carry no "&"; an unconditional
        # remove() would raise ValueError for them.
        if "&" in generate_smile:
            generate_smile.remove("&")
        new_compound.append(generate_smile)
    return new_compound

def make_input_smile(generate_smile):
    """Join each list of tokens into a single string.

    Args:
        generate_smile: iterable of token lists.

    Returns:
        A list with one concatenated string per token list.
    """
    return [''.join(tokens) for tokens in generate_smile]

Example of using the above three functions to generate molecules

# Token vocabulary: a symbol's position in this list is the integer id used
# when encoding a seed (val.index) and decoding a generated sequence (val[i]).
val = ['\n', '&', 'C', '(', ')', 'c', '1', '2', 'o', '=', 'O', 'N', '3', 'F',
       '[C@@H]', 'n', '-', '#', 'S', 'Cl', '[O-]', '[C@H]', '[NH+]', '[C@]',
       's', 'Br', '/', '[nH]', '[NH3+]', '4', '[NH2+]', '[C@@]', '[N+]',
       '[nH+]', '\\', '[S@]', '5', '[N-]', '[n+]', '[S@@]', '[S-]', '6', '7',
       'I', '[n-]', 'P', '[OH+]', '[NH-]', '[P@@H]', '[P@@]', '[PH2]',
       '[P@]', '[P+]', '[S+]', '[o+]', '[CH2-]', '[CH-]', '[SH+]', '[O+]',
       '[s+]', '[PH+]', '[PH]', '8', '[S@@+]']

# Sample one sequence seeded with C-O-C, decode it and print the result.
all_posible = chem_kn_simulation(model, ['C', 'O', 'C'], val)
generate_smile = predict_smile(all_posible, val)
new_compound = make_input_smile(generate_smile)
print(new_compound)

yangxiufengsia avatar Apr 29 '18 04:04 yangxiufengsia

test.txt

yangxiufengsia avatar Apr 29 '18 04:04 yangxiufengsia

Save test.txt as test.py, and you can use it to generate molecules.

yangxiufengsia avatar Apr 29 '18 04:04 yangxiufengsia

@yangxiufengsia Thanks for the great help. It WORKED! However, I modified your code as follows, because there were some minor mistakes that I tried to correct:

import sys
import math
import random
import numpy as np
import random as pr
from rdkit import Chem
from make_smile import *
from keras.models import load_model
from keras.preprocessing import sequence


def chem_kn_simulation(model, state, val, max_len=81):
    """Sample one token-index sequence from the RNN, seeded with ``state``.

    Args:
        model: object whose ``predict`` accepts the padded index array.
            Assumes the result is indexable as ``predictions[0][t]`` and
            gives one non-negative score per entry of ``val`` at position
            ``t`` -- TODO confirm against the trained model.
        state: non-empty sequence of seed tokens, each present in ``val``.
        val: token vocabulary; a token's position is its integer id.
        max_len: width of the padded input window passed to the model.

    Returns:
        A one-element list holding the generated list of token ids. The
        list ends with the id of the newline terminator unless generation
        stopped because the sequence outgrew ``max_len``.

    Raises:
        ValueError: if a seed token or the newline terminator is not in
            ``val``.
        IndexError: if ``state`` is empty.
    """
    end_index = val.index("\n")
    get_int = [val.index(token) for token in state]

    # Stop on the terminator, or once the sequence no longer fits the window
    # (position len(get_int) - 1 must exist in the model output).
    while get_int[-1] != end_index and len(get_int) <= max_len:
        x = np.reshape(get_int, (1, len(get_int)))
        x_pad = sequence.pad_sequences(x, maxlen=max_len, dtype='int32',
                                       padding='post', truncating='pre',
                                       value=0.0)
        predictions = model.predict(x_pad)
        # Scores for the token that follows the last real position.
        preds = np.asarray(predictions[0][len(get_int) - 1]).astype('float64')
        # Renormalise in float64 so np.random.multinomial accepts the vector.
        preds = preds / np.sum(preds)
        next_probas = np.random.multinomial(1, preds, 1)
        get_int.append(np.argmax(next_probas))

    return [get_int]


def predict_smile(all_posible, val):
    """Decode index sequences into token lists.

    The last index of every sequence is left out of the decoded result.

    Args:
        all_posible: list of index sequences.
        val: token vocabulary indexed by token id.

    Returns:
        A list containing one list of tokens per input sequence.
    """
    return [[val[token_id] for token_id in indices[:-1]]
            for indices in all_posible]


def make_input_smile(generate_smile):
    """Concatenate each token list into one string.

    Args:
        generate_smile: list of token lists.

    Returns:
        A list of strings, one per token list, in the same order.
    """
    return [''.join(tokens) for tokens in generate_smile]


### Example of using the above three functions to generate molecules
# Token vocabulary: a symbol's position in this list is its integer id.
# chem_kn_simulation encodes seed tokens with val.index(...) and
# predict_smile decodes generated ids with val[...]; '\n' (id 0) is the
# token that ends generation.
val = [ '\n', '&','C', '(', ')', 'c', '1', '2', 'o', '=', 'O', 'N', '3', 'F',
        '[C@@H]', 'n', '-', '#', 'S', 'Cl', '[O-]', '[C@H]', '[NH+]', '[C@]',
        's', 'Br', '/', '[nH]', '[NH3+]', '4', '[NH2+]', '[C@@]', '[N+]',
        '[nH+]', '\\', '[S@]', '5', '[N-]', '[n+]', '[S@@]', '[S-]', '6', '7',
        'I', '[n-]', 'P', '[OH+]', '[NH-]', '[P@@H]', '[P@@]', '[PH2]',
        '[P@]', '[P+]', '[S+]', '[o+]', '[CH2-]', '[CH-]', '[SH+]', '[O+]',
        '[s+]', '[PH+]', '[PH]', '8', '[S@@+]']

if __name__ == "__main__":
    # Usage: python <this script> MODEL_FILE SEED_SMILES
    if len(sys.argv) < 3:
        # Exit with a readable message instead of an IndexError traceback.
        sys.exit("usage: %s MODEL_FILE SEED_SMILES" % sys.argv[0])

    model_path, smiles = sys.argv[1], sys.argv[2]

    # Tokenise the seed SMILES, then drop the '&' and '\n' tokens from it.
    # NOTE(review): presumably zinc_processed_with_bracket adds both markers
    # to every SMILES -- remove() raises ValueError otherwise; verify in
    # make_smile.
    _, all_smile = zinc_processed_with_bracket([smiles])
    seed_tokens = all_smile[0]
    seed_tokens.remove('&')
    seed_tokens.remove('\n')
    print(seed_tokens)

    model = load_model(model_path)
    all_posible = chem_kn_simulation(model, seed_tokens, val)
    generate_smile = predict_smile(all_posible, val)
    new_compound = make_input_smile(generate_smile)

    print(new_compound)

ALPHAYA-Japan avatar Apr 30 '18 07:04 ALPHAYA-Japan

I just wonder if val is fixed or can be generated.

# Token vocabulary: a symbol's position in this list is its integer id
# (seed tokens are encoded with val.index(...), generated ids are decoded
# with val[...]).
val = [ '\n', '&','C', '(', ')', 'c', '1', '2', 'o', '=', 'O', 'N', '3', 'F',
        '[C@@H]', 'n', '-', '#', 'S', 'Cl', '[O-]', '[C@H]', '[NH+]', '[C@]',
        's', 'Br', '/', '[nH]', '[NH3+]', '4', '[NH2+]', '[C@@]', '[N+]',
        '[nH+]', '\\', '[S@]', '5', '[N-]', '[n+]', '[S@@]', '[S-]', '6', '7',
        'I', '[n-]', 'P', '[OH+]', '[NH-]', '[P@@H]', '[P@@]', '[PH2]',
        '[P@]', '[P+]', '[S+]', '[o+]', '[CH2-]', '[CH-]', '[SH+]', '[O+]',
        '[s+]', '[PH+]', '[PH]', '8', '[S@@+]']

ALPHAYA-Japan avatar Apr 30 '18 07:04 ALPHAYA-Japan

@ALPHAYA-Japan good to know you can generate molecules now. val is obtained from training dataset and the symbols in val are used as the nodes of the search tree. Of course, you can use different symbols.

yangxiufengsia avatar Apr 30 '18 08:04 yangxiufengsia