torchdrug icon indicating copy to clipboard operation
torchdrug copied to clipboard

load csv file

Open QUEST2179 opened this issue 1 year ago • 10 comments

Dear Developer,

try to mimic beta_lactamase.py to load csv file directly, but got the following error. could you please help? Thanks!

File "C:\Users\18482\work\torchdrug-master\torchdrug\tasks\property_prediction.py", line 66, in preprocess if not math.isnan(sample[task]): TypeError: must be real number, not str

I included beta_csv.py for your troubleshooting.

import os from torch.utils import data as torch_data from torchdrug import data, utils from torchdrug.core import Registry as R import pandas as pd from collections import defaultdict

@R.register("datasets.beta_csv") @utils.copy_args(data.ProteinDataset.load_sequence, ignore=("target_fields")) class beta_csv(data.ProteinDataset): """ Qualitative data of drugs approved by the FDA and those that have failed clinical trials for toxicity reasons.

Statistics:
    - #Molecule: 1,478
    - #Classification task: 2

Parameters:
    path (str): path to store the dataset
    verbose (int, optional): output verbose level
    **kwargs
"""

def __init__(self,  path, verbose=1, **kwargs):
    path = os.path.expanduser(path)
    if not os.path.exists(path):
        os.makedirs(path)
    self.path = path

    sequences = []
    num_samples = []
    targets = defaultdict(list)
    target_fields = ["scaled_effect1"]

    csv_files = ['protein-datasets/beta_lactamase/beta_lactamase_test.csv',
                'protein-datasets/beta_lactamase/beta_lactamase_train.csv',
                'protein-datasets/beta_lactamase/beta_lactamase_valid.csv',]
    for csv_file in csv_files:
        df = pd.read_csv(csv_file)
        print(df.head())
        sequences.extend(df['Sequences'].values)
        if target_fields:
            for field in target_fields:
                targets[field].extend(df['Sequences'].values)
        num_samples.append(df.shape[0])
    print(num_samples)
    self.load_sequence(sequences, targets, attributes=None, verbose=verbose, **kwargs)
    self.num_samples = num_samples

beta_lactamase_test.csv looks like this Unnamed: 0 Sequences Targets 0 0 MSIQHFRVALIPFFAAFCLPVFAHPETLVKVKDAEDQLAARVGYIE... 1.011182 1 1 MSIQHFRVALIPFFAAFCLPVFAHPETLVKVKDAEDQLGARVGYIE... 1.003127 2 2 MSIQHFRVALIPFFAAFCLPVFAHPETLVKVKDAEDQLGARVGYIE... -0.008031 3 3 MSIQHFRVALIPFFAAFCLPVFAHPETLVKVKDAEDQLGARVGYIE... 0.621368 4 4 MSIQHFRVALIPFFAAFCLPVFAHPETLVKVKDAEDQLGARVGYIE... 1.005303

QUEST2179 avatar Mar 27 '23 11:03 QUEST2179

Hi, this is because your targets are str and not transformed to float. One way to deal with this is to use the utils.literal_eval function for transformation as in data.MoleculeDataset. load_csv.

Oxer11 avatar Mar 27 '23 13:03 Oxer11

Thanks, I will try your suggestion.

QUEST2179 avatar Mar 27 '23 14:03 QUEST2179

Another error on save model.

with open("beta_cnn.json", "w") as fout:
    json.dump(solver.config_dict(), fout)

raise TypeError(f'Object of type {o.__class__.__name__} '

TypeError: Object of type TruncateProtein is not JSON serializable

solver.save("beta_cnn.pth") works though.

QUEST2179 avatar Mar 27 '23 15:03 QUEST2179

Hi, could you share more contexts, e.g., how you define your solver and dataset? Everything works well for me.

Oxer11 avatar Mar 27 '23 18:03 Oxer11

Just follow your BetaLactamase example on your webpage.

def testPropertyPrediction(): from torchdrug import models

model = models.ProteinCNN(input_dim=21,
                        hidden_dims=[1024, 1024],
                        kernel_size=5, padding=2, readout="max")

from torchdrug import transforms

truncate_transform = transforms.TruncateProtein(max_length=200, random=False)
protein_view_transform = transforms.ProteinView(view="residue")
transform = transforms.Compose([truncate_transform, protein_view_transform])

from torchdrug import datasets

dataset = datasets.BetaLactamase("protein-datasets/", atom_feature=None, bond_feature=None, residue_feature="default", transform=transform)
train_set, valid_set, test_set = dataset.split()
print("The label of first sample: ", dataset[0][dataset.target_fields[0]])
print("train samples: %d, valid samples: %d, test samples: %d" % (len(train_set), len(valid_set), len(test_set)))

from torchdrug import tasks

task = tasks.PropertyPrediction(model, task= ('scaled_effect1'), #dataset.tasks,
                                criterion="mse", metric=("mae", "rmse", "spearmanr"),
                                normalization=False, num_mlp_layer=2)
import torch
from torchdrug import core

optimizer = torch.optim.Adam(task.parameters(), lr=1e-4)
solver = core.Engine(task, train_set, valid_set, test_set, optimizer, gpus=[0], batch_size=64)
solver.train(num_epoch=10)
solver.evaluate("valid")

import json

with open("beta_cnn.json", "w") as fout:
    json.dump(solver.config_dict(), fout)
solver.save("beta_cnn.pth")

got output: mean absolute error [scaled_effect1]: 0.303814 root mean squared error [scaled_effect1]: 0.331703 spearmanr [scaled_effect1]: 0.442122

Name Version Build Channel

pytorch 1.13.1 py3.7_cuda11.6_cudnn8_0 pytorch pytorch-cuda 11.6 h867d48c_1 pytorch

QUEST2179 avatar Mar 27 '23 23:03 QUEST2179

Thanks for raising this issue. It seems to be a bug that the config_dict() function fails to deal with list arguments in transforms.Compose. This has been fixed in https://github.com/DeepGraphLearning/torchdrug/commit/b50884877f8e1185d7500cc9207cc7b3782fb028.

Oxer11 avatar Mar 28 '23 02:03 Oxer11

Sorry this fix doesn't work. I still get the same error.

QUEST2179 avatar Mar 28 '23 12:03 QUEST2179

It works for me. Maybe you need to clone the latest repo to fetch the commit and remember to install from the source code.

Oxer11 avatar Mar 28 '23 12:03 Oxer11

sorry I didn't pay much attention to the exact error message, use your latest repo, the error message has changed to the following

raise TypeError(f'Object of type {o.__class__.__name__} '

TypeError: Object of type range is not JSON serializable

I print out solver.config_dict(), it had 3 occurrences of range.

{'class': 'core.Engine', 'task': {'class': 'tasks.PropertyPrediction', 'model': {'class': 'models.ProteinConvolutionalNetwork', 'input_dim': 21, 'hidden_dims': [1024, 1024], 'kernel_size': 5, 'stride': 1, 'padding': 2, 'activation': 'relu', 'short_cut': False, 'concat_hidden': False, 'readout': 'max'}, 'task': 'scaled_effect1', 'criterion': 'mse', 'metric': ('mae', 'rmse', 'spearmanr'), 'num_mlp_layer': 2, 'normalization': False, 'num_class': None, 'mlp_batch_norm': False, 'mlp_dropout': 0, 'graph_construction_model': None, 'verbose': 0}, 'train_set': {'class': 'dataset.Subset', 'dataset': {'class': 'datasets.BetaLactamase', 'path': 'protein-datasets/', 'verbose': 1, 'atom_feature': None, 'bond_feature': None, 'residue_feature': 'default', 'transform': {'class': 'transforms.Compose', 'transforms': [{'class': 'transforms.TruncateProtein', 'max_length': 200, 'random': False, 'keys': 'graph'}, {'class': 'transforms.ProteinView', 'view': 'residue', 'keys': 'graph'}]}}, 'indices': range(0, 4158)}, 'valid_set': {'class': 'dataset.Subset', 'dataset': {'class': 'datasets.BetaLactamase', 'path': 'protein-datasets/', 'verbose': 1, 'atom_feature': None, 'bond_feature': None, 'residue_feature': 'default', 'transform': {'class': 'transforms.Compose', 'transforms': [{'class': 'transforms.TruncateProtein', 'max_length': 200, 'random': False, 'keys': 'graph'}, {'class': 'transforms.ProteinView', 'view': 'residue', 'keys': 'graph'}]}}, 'indices': range(4158, 4678)}, 'test_set': {'class': 'dataset.Subset', 'dataset': {'class': 'datasets.BetaLactamase', 'path': 'protein-datasets/', 'verbose': 1, 'atom_feature': None, 'bond_feature': None, 'residue_feature': 'default', 'transform': {'class': 'transforms.Compose', 'transforms': [{'class': 'transforms.TruncateProtein', 'max_length': 200, 'random': False, 'keys': 'graph'}, {'class': 'transforms.ProteinView', 'view': 'residue', 'keys': 'graph'}]}}, 'indices': range(4678, 5198)}, 'optimizer': {'class': 'optim.Adam', 'lr': 0.0001, 'betas': (0.9, 0.999), 'eps': 1e-08, 'weight_decay': 0, 'amsgrad': False, 'foreach': None, 'maximize': False, 'capturable': False, 'differentiable': False, 'fused': False}, 'scheduler': None, 'gpus': [0], 'batch_size': 64, 'gradient_interval': 1, 'num_worker': 0, 'logger': 'logging', 'log_interval': 100}

QUEST2179 avatar Mar 28 '23 14:03 QUEST2179

Yes, you're right. The problem is caused by range, while the behavior of config_dict() is correct. In this case, I would suggest to manually convert range into list when dumping as json.

Oxer11 avatar Mar 28 '23 18:03 Oxer11