Code previously provided
I was trying the embeddings code provided previously:
# Minimal reproduction: request per-residue embeddings for one protein from ESM3.
from esm.models.esm3 import ESM3
from esm.sdk.api import ESMProtein, SamplingConfig
from esm.utils.constants.models import ESM3_OPEN_SMALL
# Load the ESM3_OPEN_SMALL checkpoint, placing the model on the CPU.
client = ESM3.from_pretrained(ESM3_OPEN_SMALL, device="cpu")
# Peptidase S1A, chymotrypsin family: https://www.ebi.ac.uk/interpro/structure/PDB/1utn/
protein = ESMProtein(
sequence=(
# Adjacent string literals are concatenated into a single sequence string.
"FIFLALLGAAVAFPVDDDDKIVGGYTCGANTVPYQVSLNSGYHFCGGSLINSQWVVSAAHCYKSGIQVRLGEDNINVVEG"
"NEQFISASKSIVHPSYNSNTLNNDIMLIKLKSAASLNSRVASISLPTSCASAGTQCLISGWGNTKSSGTSYPDVLKCLKAP"
"ILSDSSCKSAYPGQITSNMFCAGYLEGGKDSCQGDSGGPVVCSGKLQGIVSWGSGCAQKNKPGVYTKVCNYVSWIKQTIASN"
)
)
# This is the call that fails in the traceback below: encode() tokenizes the
# sequence, and tokenize_sequence() passes the tokenizer's mask_token (None here)
# to str.replace(), which raises the TypeError.
# NOTE(review): a None mask_token suggests the installed tokenizer dependency may
# not match what this esm release expects -- confirm the installed package versions.
protein_tensor = client.encode(protein)
# Ask the model to return per-residue embeddings along with the sampled output.
output = client.forward_and_sample(
protein_tensor, SamplingConfig(return_per_residue_embeddings=True)
)
print(output.per_residue_embedding.shape)
And I am getting this error:
TypeError Traceback (most recent call last) Cell In[15], line 16 8 # Peptidase S1A, chymotrypsin family: https://www.ebi.ac.uk/interpro/structure/PDB/1utn/ 9 protein = ESMProtein( 10 sequence=( 11 "FIFLALLGAAVAFPVDDDDKIVGGYTCGANTVPYQVSLNSGYHFCGGSLINSQWVVSAAHCYKSGIQVRLGEDNINVVEG" (...) 14 ) 15 ) ---> 16 protein_tensor = client.encode(protein) 18 output = client.forward_and_sample( 19 protein_tensor, SamplingConfig(return_per_residue_embeddings=True) 20 ) 21 print(output.per_residue_embedding.shape)
File ~/Library/Application Support/hatch/env/virtual/embpy/TdXAJ_kB/hatch-test.py3.12/lib/python3.12/site-packages/esm/models/esm3.py:433, in ESM3.encode(self, input) 430 coordinates = None 432 if input.sequence is not None: --> 433 sequence_tokens = encoding.tokenize_sequence( 434 input.sequence, self.tokenizers.sequence, add_special_tokens=True 435 ) 436 if input.secondary_structure is not None: 437 secondary_structure_tokens = encoding.tokenize_secondary_structure( 438 input.secondary_structure, 439 self.tokenizers.secondary_structure, 440 add_special_tokens=True, 441 )
File ~/Library/Application Support/hatch/env/virtual/embpy/TdXAJ_kB/hatch-test.py3.12/lib/python3.12/site-packages/esm/utils/encoding.py:53, in tokenize_sequence(sequence, sequence_tokenizer, add_special_tokens) 48 def tokenize_sequence( 49 sequence: str, 50 sequence_tokenizer: EsmSequenceTokenizer, 51 add_special_tokens: bool = True, 52 ) -> torch.Tensor: ---> 53 sequence = sequence.replace(C.MASK_STR_SHORT, sequence_tokenizer.mask_token) 54 sequence_tokens = sequence_tokenizer.encode( 55 sequence, add_special_tokens=add_special_tokens 56 ) 57 sequence_tokens = torch.tensor(sequence_tokens, dtype=torch.int64)
TypeError: replace() argument 2 must be str, not None
Can someone tell me how I can quickly fix this? Thanks.