esm icon indicating copy to clipboard operation
esm copied to clipboard

Code Previously provided

Open grpinto opened this issue 8 months ago • 0 comments

So, I was looking at the embeddings code provided previously:

from esm.models.esm3 import ESM3
from esm.sdk.api import ESMProtein, SamplingConfig
from esm.utils.constants.models import ESM3_OPEN_SMALL

# Reproduction script: encode one protein sequence and print the shape of its per-residue embedding.
client = ESM3.from_pretrained(ESM3_OPEN_SMALL, device="cpu")  # pretrained ESM3_OPEN_SMALL, device="cpu"

# Peptidase S1A, chymotrypsin family: https://www.ebi.ac.uk/interpro/structure/PDB/1utn/
protein = ESMProtein(
    sequence=(  # adjacent string literals are concatenated into a single sequence string
        "FIFLALLGAAVAFPVDDDDKIVGGYTCGANTVPYQVSLNSGYHFCGGSLINSQWVVSAAHCYKSGIQVRLGEDNINVVEG"
        "NEQFISASKSIVHPSYNSNTLNNDIMLIKLKSAASLNSRVASISLPTSCASAGTQCLISGWGNTKSSGTSYPDVLKCLKAP"
        "ILSDSSCKSAYPGQITSNMFCAGYLEGGKDSCQGDSGGPVVCSGKLQGIVSWGSGCAQKNKPGVYTKVCNYVSWIKQTIASN"
    )
)
protein_tensor = client.encode(protein)  # the reported TypeError is raised from inside this call

output = client.forward_and_sample(
    protein_tensor, SamplingConfig(return_per_residue_embeddings=True)  # ask for per-residue embeddings
)
print(output.per_residue_embedding.shape)

And I am getting this error:


TypeError                                 Traceback (most recent call last)
Cell In[15], line 16
      8 # Peptidase S1A, chymotrypsin family: https://www.ebi.ac.uk/interpro/structure/PDB/1utn/
      9 protein = ESMProtein(
     10     sequence=(
     11         "FIFLALLGAAVAFPVDDDDKIVGGYTCGANTVPYQVSLNSGYHFCGGSLINSQWVVSAAHCYKSGIQVRLGEDNINVVEG"
   (...)
     14     )
     15 )
---> 16 protein_tensor = client.encode(protein)
     18 output = client.forward_and_sample(
     19     protein_tensor, SamplingConfig(return_per_residue_embeddings=True)
     20 )
     21 print(output.per_residue_embedding.shape)

File ~/Library/Application Support/hatch/env/virtual/embpy/TdXAJ_kB/hatch-test.py3.12/lib/python3.12/site-packages/esm/models/esm3.py:433, in ESM3.encode(self, input)
    430 coordinates = None
    432 if input.sequence is not None:
--> 433     sequence_tokens = encoding.tokenize_sequence(
    434         input.sequence, self.tokenizers.sequence, add_special_tokens=True
    435     )
    436 if input.secondary_structure is not None:
    437     secondary_structure_tokens = encoding.tokenize_secondary_structure(
    438         input.secondary_structure,
    439         self.tokenizers.secondary_structure,
    440         add_special_tokens=True,
    441     )

File ~/Library/Application Support/hatch/env/virtual/embpy/TdXAJ_kB/hatch-test.py3.12/lib/python3.12/site-packages/esm/utils/encoding.py:53, in tokenize_sequence(sequence, sequence_tokenizer, add_special_tokens)
     48 def tokenize_sequence(
     49     sequence: str,
     50     sequence_tokenizer: EsmSequenceTokenizer,
     51     add_special_tokens: bool = True,
     52 ) -> torch.Tensor:
---> 53     sequence = sequence.replace(C.MASK_STR_SHORT, sequence_tokenizer.mask_token)
     54     sequence_tokens = sequence_tokenizer.encode(
     55         sequence, add_special_tokens=add_special_tokens
     56     )
     57     sequence_tokens = torch.tensor(sequence_tokens, dtype=torch.int64)

TypeError: replace() argument 2 must be str, not None

Can someone tell me how I can quickly fix this? Thanks.

grpinto avatar Apr 16 '25 09:04 grpinto