unet and image_encoder in FluxPipeline
Dear Team, thank you so much for releasing the model. I am trying to integrate the FLUX model for a use case that requires the unet and image_encoder, but I find that the FluxPipeline has no such components. Could you please provide some guidance in this direction?
The FLUX.1 models are transformer-based, so they don't have a unet. While SD1.5, SDXL, and so on used a unet, SD3 and FLUX.1 use a transformer for denoising instead. There is also no image_encoder: conditioning comes from the CLIP and T5 text encoders listed in the pipeline components. With the Hugging Face diffusers FluxPipeline it's quite simple to see the model components; just print the pipeline:
from diffusers import FluxPipeline
pipe = FluxPipeline.from_pretrained("black-forest-labs/FLUX.1-schnell")
print(pipe)
print(pipe.transformer)
pipe
FluxPipeline {
"_class_name": "FluxPipeline",
"_diffusers_version": "0.30.3",
"_name_or_path": "black-forest-labs/FLUX.1-schnell",
"scheduler": [
"diffusers",
"FlowMatchEulerDiscreteScheduler"
],
"text_encoder": [
"transformers",
"CLIPTextModel"
],
"text_encoder_2": [
"transformers",
"T5EncoderModel"
],
"tokenizer": [
"transformers",
"CLIPTokenizer"
],
"tokenizer_2": [
"transformers",
"T5TokenizerFast"
],
"transformer": [
"diffusers",
"FluxTransformer2DModel"
],
"vae": [
"diffusers",
"AutoencoderKL"
]
}
pipe.transformer
FluxTransformer2DModel(
(pos_embed): EmbedND()
(time_text_embed): CombinedTimestepTextProjEmbeddings(
(time_proj): Timesteps()
(timestep_embedder): TimestepEmbedding(
(linear_1): Linear(in_features=256, out_features=3072, bias=True)
(act): SiLU()
(linear_2): Linear(in_features=3072, out_features=3072, bias=True)
)
(text_embedder): PixArtAlphaTextProjection(
(linear_1): Linear(in_features=768, out_features=3072, bias=True)
(act_1): SiLU()
(linear_2): Linear(in_features=3072, out_features=3072, bias=True)
)
)
(context_embedder): Linear(in_features=4096, out_features=3072, bias=True)
(x_embedder): Linear(in_features=64, out_features=3072, bias=True)
(transformer_blocks): ModuleList(
(0-18): 19 x FluxTransformerBlock(
(norm1): AdaLayerNormZero(
(silu): SiLU()
(linear): Linear(in_features=3072, out_features=18432, bias=True)
(norm): LayerNorm((3072,), eps=1e-06, elementwise_affine=False)
)
(norm1_context): AdaLayerNormZero(
(silu): SiLU()
(linear): Linear(in_features=3072, out_features=18432, bias=True)
(norm): LayerNorm((3072,), eps=1e-06, elementwise_affine=False)
)
(attn): Attention(
(norm_q): RMSNorm()
(norm_k): RMSNorm()
(to_q): Linear(in_features=3072, out_features=3072, bias=True)
(to_k): Linear(in_features=3072, out_features=3072, bias=True)
(to_v): Linear(in_features=3072, out_features=3072, bias=True)
(add_k_proj): Linear(in_features=3072, out_features=3072, bias=True)
(add_v_proj): Linear(in_features=3072, out_features=3072, bias=True)
(add_q_proj): Linear(in_features=3072, out_features=3072, bias=True)
(to_out): ModuleList(
(0): Linear(in_features=3072, out_features=3072, bias=True)
(1): Dropout(p=0.0, inplace=False)
)
(to_add_out): Linear(in_features=3072, out_features=3072, bias=True)
(norm_added_q): RMSNorm()
(norm_added_k): RMSNorm()
)
(norm2): LayerNorm((3072,), eps=1e-06, elementwise_affine=False)
(ff): FeedForward(
(net): ModuleList(
(0): GELU(
(proj): Linear(in_features=3072, out_features=12288, bias=True)
)
(1): Dropout(p=0.0, inplace=False)
(2): Linear(in_features=12288, out_features=3072, bias=True)
)
)
(norm2_context): LayerNorm((3072,), eps=1e-06, elementwise_affine=False)
(ff_context): FeedForward(
(net): ModuleList(
(0): GELU(
(proj): Linear(in_features=3072, out_features=12288, bias=True)
)
(1): Dropout(p=0.0, inplace=False)
(2): Linear(in_features=12288, out_features=3072, bias=True)
)
)
)
)
(single_transformer_blocks): ModuleList(
(0-37): 38 x FluxSingleTransformerBlock(
(norm): AdaLayerNormZeroSingle(
(silu): SiLU()
(linear): Linear(in_features=3072, out_features=9216, bias=True)
(norm): LayerNorm((3072,), eps=1e-06, elementwise_affine=False)
)
(proj_mlp): Linear(in_features=3072, out_features=12288, bias=True)
(act_mlp): GELU(approximate='tanh')
(proj_out): Linear(in_features=15360, out_features=3072, bias=True)
(attn): Attention(
(norm_q): RMSNorm()
(norm_k): RMSNorm()
(to_q): Linear(in_features=3072, out_features=3072, bias=True)
(to_k): Linear(in_features=3072, out_features=3072, bias=True)
(to_v): Linear(in_features=3072, out_features=3072, bias=True)
)
)
)
(norm_out): AdaLayerNormContinuous(
(silu): SiLU()
(linear): Linear(in_features=3072, out_features=6144, bias=True)
(norm): LayerNorm((3072,), eps=1e-06, elementwise_affine=False)
)
(proj_out): Linear(in_features=3072, out_features=64, bias=True)
)