ggml
ggml copied to clipboard
Add example for text-to-text transfer transformer (T5) inference
Work in progress
model: https://github.com/huggingface/transformers/blob/main/src/transformers/models/t5/modeling_t5.py
tokenizer: https://github.com/huggingface/transformers/blob/main/examples/flax/language-modeling/t5_tokenizer_model.py
gated-gelu: https://arxiv.org/pdf/2002.05202.pdf
google t5x: https://github.com/google-research/t5x / models: https://github.com/google-research/t5x/blob/main/docs/models.md
FLAN-T5 Small layers
T5ForConditionalGeneration(
(shared): Embedding(32128, 512)
(encoder): T5Stack(
(embed_tokens): Embedding(32128, 512)
(block): ModuleList(
(0): T5Block(
(layer): ModuleList(
(0): T5LayerSelfAttention(
(SelfAttention): T5Attention(
(q): Linear(in_features=512, out_features=384, bias=False)
(k): Linear(in_features=512, out_features=384, bias=False)
(v): Linear(in_features=512, out_features=384, bias=False)
(o): Linear(in_features=384, out_features=512, bias=False)
(relative_attention_bias): Embedding(32, 6)
)
(layer_norm): T5LayerNorm()
(dropout): Dropout(p=0.1, inplace=False)
)
(1): T5LayerFF(
(DenseReluDense): T5DenseGatedActDense(
(wi_0): Linear(in_features=512, out_features=1024, bias=False)
(wi_1): Linear(in_features=512, out_features=1024, bias=False)
(wo): Linear(in_features=1024, out_features=512, bias=False)
(dropout): Dropout(p=0.1, inplace=False)
(act): NewGELUActivation()
)
(layer_norm): T5LayerNorm()
(dropout): Dropout(p=0.1, inplace=False)
)
)
)
(1): T5Block(
(layer): ModuleList(
(0): T5LayerSelfAttention(
(SelfAttention): T5Attention(
(q): Linear(in_features=512, out_features=384, bias=False)
(k): Linear(in_features=512, out_features=384, bias=False)
(v): Linear(in_features=512, out_features=384, bias=False)
(o): Linear(in_features=384, out_features=512, bias=False)
)
(layer_norm): T5LayerNorm()
(dropout): Dropout(p=0.1, inplace=False)
)
(1): T5LayerFF(
(DenseReluDense): T5DenseGatedActDense(
(wi_0): Linear(in_features=512, out_features=1024, bias=False)
(wi_1): Linear(in_features=512, out_features=1024, bias=False)
(wo): Linear(in_features=1024, out_features=512, bias=False)
(dropout): Dropout(p=0.1, inplace=False)
(act): NewGELUActivation()
)
(layer_norm): T5LayerNorm()
(dropout): Dropout(p=0.1, inplace=False)
)
)
)
(2): T5Block(
(layer): ModuleList(
(0): T5LayerSelfAttention(
(SelfAttention): T5Attention(
(q): Linear(in_features=512, out_features=384, bias=False)
(k): Linear(in_features=512, out_features=384, bias=False)
(v): Linear(in_features=512, out_features=384, bias=False)
(o): Linear(in_features=384, out_features=512, bias=False)
)
(layer_norm): T5LayerNorm()
(dropout): Dropout(p=0.1, inplace=False)
)
(1): T5LayerFF(
(DenseReluDense): T5DenseGatedActDense(
(wi_0): Linear(in_features=512, out_features=1024, bias=False)
(wi_1): Linear(in_features=512, out_features=1024, bias=False)
(wo): Linear(in_features=1024, out_features=512, bias=False)
(dropout): Dropout(p=0.1, inplace=False)
(act): NewGELUActivation()
)
(layer_norm): T5LayerNorm()
(dropout): Dropout(p=0.1, inplace=False)
)
)
)
(3): T5Block(
(layer): ModuleList(
(0): T5LayerSelfAttention(
(SelfAttention): T5Attention(
(q): Linear(in_features=512, out_features=384, bias=False)
(k): Linear(in_features=512, out_features=384, bias=False)
(v): Linear(in_features=512, out_features=384, bias=False)
(o): Linear(in_features=384, out_features=512, bias=False)
)
(layer_norm): T5LayerNorm()
(dropout): Dropout(p=0.1, inplace=False)
)
(1): T5LayerFF(
(DenseReluDense): T5DenseGatedActDense(
(wi_0): Linear(in_features=512, out_features=1024, bias=False)
(wi_1): Linear(in_features=512, out_features=1024, bias=False)
(wo): Linear(in_features=1024, out_features=512, bias=False)
(dropout): Dropout(p=0.1, inplace=False)
(act): NewGELUActivation()
)
(layer_norm): T5LayerNorm()
(dropout): Dropout(p=0.1, inplace=False)
)
)
)
(4): T5Block(
(layer): ModuleList(
(0): T5LayerSelfAttention(
(SelfAttention): T5Attention(
(q): Linear(in_features=512, out_features=384, bias=False)
(k): Linear(in_features=512, out_features=384, bias=False)
(v): Linear(in_features=512, out_features=384, bias=False)
(o): Linear(in_features=384, out_features=512, bias=False)
)
(layer_norm): T5LayerNorm()
(dropout): Dropout(p=0.1, inplace=False)
)
(1): T5LayerFF(
(DenseReluDense): T5DenseGatedActDense(
(wi_0): Linear(in_features=512, out_features=1024, bias=False)
(wi_1): Linear(in_features=512, out_features=1024, bias=False)
(wo): Linear(in_features=1024, out_features=512, bias=False)
(dropout): Dropout(p=0.1, inplace=False)
(act): NewGELUActivation()
)
(layer_norm): T5LayerNorm()
(dropout): Dropout(p=0.1, inplace=False)
)
)
)
(5): T5Block(
(layer): ModuleList(
(0): T5LayerSelfAttention(
(SelfAttention): T5Attention(
(q): Linear(in_features=512, out_features=384, bias=False)
(k): Linear(in_features=512, out_features=384, bias=False)
(v): Linear(in_features=512, out_features=384, bias=False)
(o): Linear(in_features=384, out_features=512, bias=False)
)
(layer_norm): T5LayerNorm()
(dropout): Dropout(p=0.1, inplace=False)
)
(1): T5LayerFF(
(DenseReluDense): T5DenseGatedActDense(
(wi_0): Linear(in_features=512, out_features=1024, bias=False)
(wi_1): Linear(in_features=512, out_features=1024, bias=False)
(wo): Linear(in_features=1024, out_features=512, bias=False)
(dropout): Dropout(p=0.1, inplace=False)
(act): NewGELUActivation()
)
(layer_norm): T5LayerNorm()
(dropout): Dropout(p=0.1, inplace=False)
)
)
)
(6): T5Block(
(layer): ModuleList(
(0): T5LayerSelfAttention(
(SelfAttention): T5Attention(
(q): Linear(in_features=512, out_features=384, bias=False)
(k): Linear(in_features=512, out_features=384, bias=False)
(v): Linear(in_features=512, out_features=384, bias=False)
(o): Linear(in_features=384, out_features=512, bias=False)
)
(layer_norm): T5LayerNorm()
(dropout): Dropout(p=0.1, inplace=False)
)
(1): T5LayerFF(
(DenseReluDense): T5DenseGatedActDense(
(wi_0): Linear(in_features=512, out_features=1024, bias=False)
(wi_1): Linear(in_features=512, out_features=1024, bias=False)
(wo): Linear(in_features=1024, out_features=512, bias=False)
(dropout): Dropout(p=0.1, inplace=False)
(act): NewGELUActivation()
)
(layer_norm): T5LayerNorm()
(dropout): Dropout(p=0.1, inplace=False)
)
)
)
(7): T5Block(
(layer): ModuleList(
(0): T5LayerSelfAttention(
(SelfAttention): T5Attention(
(q): Linear(in_features=512, out_features=384, bias=False)
(k): Linear(in_features=512, out_features=384, bias=False)
(v): Linear(in_features=512, out_features=384, bias=False)
(o): Linear(in_features=384, out_features=512, bias=False)
)
(layer_norm): T5LayerNorm()
(dropout): Dropout(p=0.1, inplace=False)
)
(1): T5LayerFF(
(DenseReluDense): T5DenseGatedActDense(
(wi_0): Linear(in_features=512, out_features=1024, bias=False)
(wi_1): Linear(in_features=512, out_features=1024, bias=False)
(wo): Linear(in_features=1024, out_features=512, bias=False)
(dropout): Dropout(p=0.1, inplace=False)
(act): NewGELUActivation()
)
(layer_norm): T5LayerNorm()
(dropout): Dropout(p=0.1, inplace=False)
)
)
)
)
(final_layer_norm): T5LayerNorm()
(dropout): Dropout(p=0.1, inplace=False)
)
(decoder): T5Stack(
(embed_tokens): Embedding(32128, 512)
(block): ModuleList(
(0): T5Block(
(layer): ModuleList(
(0): T5LayerSelfAttention(
(SelfAttention): T5Attention(
(q): Linear(in_features=512, out_features=384, bias=False)
(k): Linear(in_features=512, out_features=384, bias=False)
(v): Linear(in_features=512, out_features=384, bias=False)
(o): Linear(in_features=384, out_features=512, bias=False)
(relative_attention_bias): Embedding(32, 6)
)
(layer_norm): T5LayerNorm()
(dropout): Dropout(p=0.1, inplace=False)
)
(1): T5LayerCrossAttention(
(EncDecAttention): T5Attention(
(q): Linear(in_features=512, out_features=384, bias=False)
(k): Linear(in_features=512, out_features=384, bias=False)
(v): Linear(in_features=512, out_features=384, bias=False)
(o): Linear(in_features=384, out_features=512, bias=False)
)
(layer_norm): T5LayerNorm()
(dropout): Dropout(p=0.1, inplace=False)
)
(2): T5LayerFF(
(DenseReluDense): T5DenseGatedActDense(
(wi_0): Linear(in_features=512, out_features=1024, bias=False)
(wi_1): Linear(in_features=512, out_features=1024, bias=False)
(wo): Linear(in_features=1024, out_features=512, bias=False)
(dropout): Dropout(p=0.1, inplace=False)
(act): NewGELUActivation()
)
(layer_norm): T5LayerNorm()
(dropout): Dropout(p=0.1, inplace=False)
)
)
)
(1): T5Block(
(layer): ModuleList(
(0): T5LayerSelfAttention(
(SelfAttention): T5Attention(
(q): Linear(in_features=512, out_features=384, bias=False)
(k): Linear(in_features=512, out_features=384, bias=False)
(v): Linear(in_features=512, out_features=384, bias=False)
(o): Linear(in_features=384, out_features=512, bias=False)
)
(layer_norm): T5LayerNorm()
(dropout): Dropout(p=0.1, inplace=False)
)
(1): T5LayerCrossAttention(
(EncDecAttention): T5Attention(
(q): Linear(in_features=512, out_features=384, bias=False)
(k): Linear(in_features=512, out_features=384, bias=False)
(v): Linear(in_features=512, out_features=384, bias=False)
(o): Linear(in_features=384, out_features=512, bias=False)
)
(layer_norm): T5LayerNorm()
(dropout): Dropout(p=0.1, inplace=False)
)
(2): T5LayerFF(
(DenseReluDense): T5DenseGatedActDense(
(wi_0): Linear(in_features=512, out_features=1024, bias=False)
(wi_1): Linear(in_features=512, out_features=1024, bias=False)
(wo): Linear(in_features=1024, out_features=512, bias=False)
(dropout): Dropout(p=0.1, inplace=False)
(act): NewGELUActivation()
)
(layer_norm): T5LayerNorm()
(dropout): Dropout(p=0.1, inplace=False)
)
)
)
(2): T5Block(
(layer): ModuleList(
(0): T5LayerSelfAttention(
(SelfAttention): T5Attention(
(q): Linear(in_features=512, out_features=384, bias=False)
(k): Linear(in_features=512, out_features=384, bias=False)
(v): Linear(in_features=512, out_features=384, bias=False)
(o): Linear(in_features=384, out_features=512, bias=False)
)
(layer_norm): T5LayerNorm()
(dropout): Dropout(p=0.1, inplace=False)
)
(1): T5LayerCrossAttention(
(EncDecAttention): T5Attention(
(q): Linear(in_features=512, out_features=384, bias=False)
(k): Linear(in_features=512, out_features=384, bias=False)
(v): Linear(in_features=512, out_features=384, bias=False)
(o): Linear(in_features=384, out_features=512, bias=False)
)
(layer_norm): T5LayerNorm()
(dropout): Dropout(p=0.1, inplace=False)
)
(2): T5LayerFF(
(DenseReluDense): T5DenseGatedActDense(
(wi_0): Linear(in_features=512, out_features=1024, bias=False)
(wi_1): Linear(in_features=512, out_features=1024, bias=False)
(wo): Linear(in_features=1024, out_features=512, bias=False)
(dropout): Dropout(p=0.1, inplace=False)
(act): NewGELUActivation()
)
(layer_norm): T5LayerNorm()
(dropout): Dropout(p=0.1, inplace=False)
)
)
)
(3): T5Block(
(layer): ModuleList(
(0): T5LayerSelfAttention(
(SelfAttention): T5Attention(
(q): Linear(in_features=512, out_features=384, bias=False)
(k): Linear(in_features=512, out_features=384, bias=False)
(v): Linear(in_features=512, out_features=384, bias=False)
(o): Linear(in_features=384, out_features=512, bias=False)
)
(layer_norm): T5LayerNorm()
(dropout): Dropout(p=0.1, inplace=False)
)
(1): T5LayerCrossAttention(
(EncDecAttention): T5Attention(
(q): Linear(in_features=512, out_features=384, bias=False)
(k): Linear(in_features=512, out_features=384, bias=False)
(v): Linear(in_features=512, out_features=384, bias=False)
(o): Linear(in_features=384, out_features=512, bias=False)
)
(layer_norm): T5LayerNorm()
(dropout): Dropout(p=0.1, inplace=False)
)
(2): T5LayerFF(
(DenseReluDense): T5DenseGatedActDense(
(wi_0): Linear(in_features=512, out_features=1024, bias=False)
(wi_1): Linear(in_features=512, out_features=1024, bias=False)
(wo): Linear(in_features=1024, out_features=512, bias=False)
(dropout): Dropout(p=0.1, inplace=False)
(act): NewGELUActivation()
)
(layer_norm): T5LayerNorm()
(dropout): Dropout(p=0.1, inplace=False)
)
)
)
(4): T5Block(
(layer): ModuleList(
(0): T5LayerSelfAttention(
(SelfAttention): T5Attention(
(q): Linear(in_features=512, out_features=384, bias=False)
(k): Linear(in_features=512, out_features=384, bias=False)
(v): Linear(in_features=512, out_features=384, bias=False)
(o): Linear(in_features=384, out_features=512, bias=False)
)
(layer_norm): T5LayerNorm()
(dropout): Dropout(p=0.1, inplace=False)
)
(1): T5LayerCrossAttention(
(EncDecAttention): T5Attention(
(q): Linear(in_features=512, out_features=384, bias=False)
(k): Linear(in_features=512, out_features=384, bias=False)
(v): Linear(in_features=512, out_features=384, bias=False)
(o): Linear(in_features=384, out_features=512, bias=False)
)
(layer_norm): T5LayerNorm()
(dropout): Dropout(p=0.1, inplace=False)
)
(2): T5LayerFF(
(DenseReluDense): T5DenseGatedActDense(
(wi_0): Linear(in_features=512, out_features=1024, bias=False)
(wi_1): Linear(in_features=512, out_features=1024, bias=False)
(wo): Linear(in_features=1024, out_features=512, bias=False)
(dropout): Dropout(p=0.1, inplace=False)
(act): NewGELUActivation()
)
(layer_norm): T5LayerNorm()
(dropout): Dropout(p=0.1, inplace=False)
)
)
)
(5): T5Block(
(layer): ModuleList(
(0): T5LayerSelfAttention(
(SelfAttention): T5Attention(
(q): Linear(in_features=512, out_features=384, bias=False)
(k): Linear(in_features=512, out_features=384, bias=False)
(v): Linear(in_features=512, out_features=384, bias=False)
(o): Linear(in_features=384, out_features=512, bias=False)
)
(layer_norm): T5LayerNorm()
(dropout): Dropout(p=0.1, inplace=False)
)
(1): T5LayerCrossAttention(
(EncDecAttention): T5Attention(
(q): Linear(in_features=512, out_features=384, bias=False)
(k): Linear(in_features=512, out_features=384, bias=False)
(v): Linear(in_features=512, out_features=384, bias=False)
(o): Linear(in_features=384, out_features=512, bias=False)
)
(layer_norm): T5LayerNorm()
(dropout): Dropout(p=0.1, inplace=False)
)
(2): T5LayerFF(
(DenseReluDense): T5DenseGatedActDense(
(wi_0): Linear(in_features=512, out_features=1024, bias=False)
(wi_1): Linear(in_features=512, out_features=1024, bias=False)
(wo): Linear(in_features=1024, out_features=512, bias=False)
(dropout): Dropout(p=0.1, inplace=False)
(act): NewGELUActivation()
)
(layer_norm): T5LayerNorm()
(dropout): Dropout(p=0.1, inplace=False)
)
)
)
(6): T5Block(
(layer): ModuleList(
(0): T5LayerSelfAttention(
(SelfAttention): T5Attention(
(q): Linear(in_features=512, out_features=384, bias=False)
(k): Linear(in_features=512, out_features=384, bias=False)
(v): Linear(in_features=512, out_features=384, bias=False)
(o): Linear(in_features=384, out_features=512, bias=False)
)
(layer_norm): T5LayerNorm()
(dropout): Dropout(p=0.1, inplace=False)
)
(1): T5LayerCrossAttention(
(EncDecAttention): T5Attention(
(q): Linear(in_features=512, out_features=384, bias=False)
(k): Linear(in_features=512, out_features=384, bias=False)
(v): Linear(in_features=512, out_features=384, bias=False)
(o): Linear(in_features=384, out_features=512, bias=False)
)
(layer_norm): T5LayerNorm()
(dropout): Dropout(p=0.1, inplace=False)
)
(2): T5LayerFF(
(DenseReluDense): T5DenseGatedActDense(
(wi_0): Linear(in_features=512, out_features=1024, bias=False)
(wi_1): Linear(in_features=512, out_features=1024, bias=False)
(wo): Linear(in_features=1024, out_features=512, bias=False)
(dropout): Dropout(p=0.1, inplace=False)
(act): NewGELUActivation()
)
(layer_norm): T5LayerNorm()
(dropout): Dropout(p=0.1, inplace=False)
)
)
)
(7): T5Block(
(layer): ModuleList(
(0): T5LayerSelfAttention(
(SelfAttention): T5Attention(
(q): Linear(in_features=512, out_features=384, bias=False)
(k): Linear(in_features=512, out_features=384, bias=False)
(v): Linear(in_features=512, out_features=384, bias=False)
(o): Linear(in_features=384, out_features=512, bias=False)
)
(layer_norm): T5LayerNorm()
(dropout): Dropout(p=0.1, inplace=False)
)
(1): T5LayerCrossAttention(
(EncDecAttention): T5Attention(
(q): Linear(in_features=512, out_features=384, bias=False)
(k): Linear(in_features=512, out_features=384, bias=False)
(v): Linear(in_features=512, out_features=384, bias=False)
(o): Linear(in_features=384, out_features=512, bias=False)
)
(layer_norm): T5LayerNorm()
(dropout): Dropout(p=0.1, inplace=False)
)
(2): T5LayerFF(
(DenseReluDense): T5DenseGatedActDense(
(wi_0): Linear(in_features=512, out_features=1024, bias=False)
(wi_1): Linear(in_features=512, out_features=1024, bias=False)
(wo): Linear(in_features=1024, out_features=512, bias=False)
(dropout): Dropout(p=0.1, inplace=False)
(act): NewGELUActivation()
)
(layer_norm): T5LayerNorm()
(dropout): Dropout(p=0.1, inplace=False)
)
)
)
)
(final_layer_norm): T5LayerNorm()
(dropout): Dropout(p=0.1, inplace=False)
)
(lm_head): Linear(in_features=512, out_features=32128, bias=False)
)
Where does this stand? Interested to give the LaMini-Flan-T5 models a go, as their paper shows promising results for T5 models, particularly at smaller model sizes.
This fell out of priority - feel free to give it a try
Does the draft implementation work?
So I tested the t5 branch, it does indeed load the model, but there's no code for actual inference right now. And honestly it looks pretty complicated to implement.
The upcoming Stable Diffusion version 3 uses, among other models, T5.
https://stability.ai/news/stable-diffusion-3-research-paper