esm
esm copied to clipboard
Sequence head not working as intended?
Hello,
ESMC obviously produces very good general protein representations, but it seems to fail the most basic reconstruction task: given an unmasked input sequence, the argmax of the sequence-head logits does not return that same sequence. Is this expected?
from esm.models.esmc import ESMC
from esm.sdk.api import ESMProtein, LogitsConfig
from tqdm.auto import tqdm
# Reconstruction check: feed each unmasked sequence through ESMC and measure
# how often the argmax of the sequence logits equals the input token.
client = ESMC.from_pretrained("esmc_300m").to("cuda")  # or "cpu"

# Per-sequence fraction of positions whose predicted token matches the input.
accs = []
for seq in tqdm(sequences):  # 1000 random sequences from Uniref50
    protein = ESMProtein(sequence=seq)
    protein_tensor = client.encode(protein)
    # Drop the first and last token positions
    # (presumably the BOS/EOS special tokens added by the tokenizer -- confirm).
    input_ids = protein_tensor.sequence[1:-1]
    if len(input_ids) == 0:
        # Nothing to score; skipping avoids a division by zero below.
        continue
    logits_output = client.logits(
        protein_tensor, LogitsConfig(sequence=True, return_embeddings=True)
    )
    # Index [0] selects the single batch entry; the slice aligns the
    # predictions with input_ids.
    preds = logits_output.logits.sequence.argmax(dim=-1)[0][1:-1]
    # Compare on CPU so both tensors are guaranteed to share a device.
    matching = (preds.cpu() == input_ids.cpu()).sum().item()
    accs.append(matching / len(input_ids))

# Mean per-sequence accuracy, as a percentage.
if accs:
    print(sum(accs) / len(accs) * 100)
else:
    print("No sequences were scored.")
Output (mean per-residue accuracy, in %): 35.51462866990015