Move IP Adapter Face ID to core
What does this PR do?
Fixes #7014 #6935
- [x] Switch to PEFT
- [x] Move to core
- [x] Add tests
@yiyixuxu @sayakpaul
Create face embeddings
import torch
import cv2
import numpy as np
from diffusers.utils import load_image
from diffusers import AutoPipelineForText2Image, AutoencoderKL, DDIMScheduler
from insightface.app import FaceAnalysis
image1 = load_image("https://huggingface.co/datasets/YiYiXu/testing-images/resolve/main/ai_face2.png")
image2 = load_image("https://huggingface.co/datasets/YiYiXu/testing-images/resolve/main/women_input.png")
ref_images = []
app = FaceAnalysis(name="buffalo_l", providers=['CUDAExecutionProvider', 'CPUExecutionProvider'])
app.prepare(ctx_id=0, det_size=(640, 640))
for im in [image1, image2]:
image = cv2.cvtColor(np.asarray(im), cv2.COLOR_BGR2RGB)
faces = app.get(image)
image = torch.from_numpy(faces[0].normed_embedding)
ref_images.append(image.unsqueeze(0))
ref_images = torch.cat(ref_images, dim=0)
IP Adapter Face ID (SD 1.5)
base_model_path ="SG161222/Realistic_Vision_V4.0_noVAE"
noise_scheduler = DDIMScheduler(
num_train_timesteps=1000,
beta_start=0.00085,
beta_end=0.012,
beta_schedule="scaled_linear",
clip_sample=False,
set_alpha_to_one=False,
steps_offset=1,
)
pipeline = AutoPipelineForText2Image.from_pretrained(
base_model_path,
torch_dtype=torch.float16,
scheduler=noise_scheduler
)
pipeline.to("cuda")
pipeline.load_ip_adapter("h94/IP-Adapter-FaceID",
subfolder=None,
weight_name="ip-adapter-faceid_sd15.bin",
image_encoder_folder=None)
pipeline.set_ip_adapter_scale(0.7)
pipeline.enable_model_cpu_offload()
generator = torch.Generator(device="cpu").manual_seed(42)
num_images=2
images = pipeline(
prompt="A photo of a girl wearing a black dress, holding red roses in hand, upper body, behind is the Eiffel Tower",
ip_adapter_image=ref_images,
negative_prompt="monochrome, lowres, bad anatomy, worst quality, low quality",
num_inference_steps=20, num_images_per_prompt=num_images, width=512, height=704,
generator=generator,
).images
IP Adapter Face ID XL (SDXL)
base_model_path ="SG161222/RealVisXL_V3.0"
pipeline = AutoPipelineForText2Image.from_pretrained(
base_model_path,
torch_dtype=torch.float16,
scheduler=noise_scheduler
)
pipeline.to("cuda")
pipeline.load_ip_adapter("h94/IP-Adapter-FaceID",
subfolder=None,
weight_name="ip-adapter-faceid_sdxl.bin",
image_encoder_folder=None)
pipeline.set_ip_adapter_scale(0.7)
pipeline.enable_model_cpu_offload()
generator = torch.Generator(device="cpu").manual_seed(42)
num_images=2
images = pipeline(
prompt="A photo of a girl wearing a black dress, holding red roses in hand, upper body, behind is the Eiffel Tower",
ip_adapter_image=ref_images, guidance_scale=7.5,
negative_prompt="monochrome, lowres, bad anatomy, worst quality, low quality",
num_inference_steps=30, num_images_per_prompt=2,
generator=generator
).images
I'm getting the error
table_diffusion_xl.py", line 497, in encode_image
AttributeError: 'NoneType' object has no attribute 'parameters'
when I try to use this.
I'm getting the error
table_diffusion_xl.py", line 497, in encode_image AttributeError: 'NoneType' object has no attribute 'parameters' when I try to use this.
you cannot use Face ID with SDXL, the current changes only affect the Stable Diffusion pipeline
@jfischoff you can use it now, I also updated the example code
@fabiorigano is this ready for a review?
@yiyixuxu I have to add some checks on the inputs, but I would appreciate your feedback. thanks :)
Since both Face ID adapter and Face ID XL don't use an image encoder, I tested the multi-adapter feature by separately extracting and then concatenating the image embeddings of Face ID XL and another IP Adapter, Plus Face SDXL.
I think that prepare_ip_adapter_image_embeds would become too specific for this use case if we have to support an ip_adapter_images containing both images and insightface embeddings.
Here is the code of the test:
# Create a SDXL pipeline
# ...
# Load sample images
image1 = load_image("https://huggingface.co/datasets/YiYiXu/testing-images/resolve/main/ai_face2.png")
image2 = load_image("https://huggingface.co/datasets/YiYiXu/testing-images/resolve/main/women_input.png")
# Extract Face features using insightface
ref_images = []
app = FaceAnalysis(name="buffalo_l", providers=['CUDAExecutionProvider', 'CPUExecutionProvider'])
app.prepare(ctx_id=0, det_size=(640, 640))
for im in [image1, image2]:
image = cv2.cvtColor(np.asarray(im), cv2.COLOR_BGR2RGB)
faces = app.get(image)
image = torch.from_numpy(faces[0].normed_embedding)
ref_images.append(image.unsqueeze(0))
ref_images = torch.cat(ref_images, dim=0)
# Load Face ID XL adapter into the pipeline
pipeline.load_ip_adapter("h94/IP-Adapter-FaceID",
subfolder=None,
weight_name="ip-adapter-faceid_sdxl.bin",
image_encoder_folder=None
)
# Generate Face ID image embeddings and save them locally
image_embeds = pipeline.prepare_ip_adapter_image_embeds(
ip_adapter_image=ref_images,
ip_adapter_image_embeds=None,
device="cuda",
num_images_per_prompt=1,
do_classifier_free_guidance=True,
)
torch.save(image_embeds, "faceid_xl.ipadpt")
# Unload ip adapter and lora
# ...
# Load Plus SDXL adapter into the pipeline
pipeline.load_ip_adapter("h94/IP-Adapter",
subfolder="sdxl_models",
weight_name="ip-adapter-plus-face_sdxl_vit-h.safetensors")
# Generate Plus SDXL image embeddings and save them locally
ip_images =[[image1, image2]]
image_embeds = pipeline.prepare_ip_adapter_image_embeds(
ip_adapter_image=ip_images,
ip_adapter_image_embeds=None,
device="cuda",
num_images_per_prompt=1,
do_classifier_free_guidance=True,
)
torch.save(image_embeds, "plus_face_xl.ipadpt")
# Unload the IP adapter
# ...
# Load both IP Adapters
pipeline.load_ip_adapter(["h94/IP-Adapter", "h94/IP-Adapter-FaceID"],
subfolder=["sdxl_models", None],
weight_name=["ip-adapter-plus-face_sdxl_vit-h.safetensors", "ip-adapter-faceid_sdxl.bin"]
)
pipeline.set_ip_adapter_scale([0.7]*2)
# Load image embeddings and run inference
generator = torch.Generator(device="cpu").manual_seed(42)
t1 = torch.load("plus_face_xl.ipadpt")
t2 = torch.load("faceid_xl.ipadpt")
t = [t1[0], t2[0]]
images = pipeline(
prompt="A photo of a girl wearing a black dress, holding red roses in hand, upper body, behind is the Eiffel Tower",
ip_adapter_image_embeds=t, guidance_scale=7.5,
negative_prompt="monochrome, lowres, bad anatomy, worst quality, low quality",
num_inference_steps=30, num_images_per_prompt=num_images, width=1024, height=1024,
generator=generator
).images
@fabiorigano
I think that prepare_ip_adapter_image_embeds would become too specific for this use case if we have to support an ip_adapter_images containing both images and insightface embeddings.
good news is that we do not want to support ip_adapter_image for face-id! :) we should make it clear in the docs
also, to make it easier to test, can you upload the ref_images embedding somewhere, maybe a hf dataset, so that we can just use that as input directly?
# Extract Face features using insightface
ref_images = []
app = FaceAnalysis(name="buffalo_l", providers=['CUDAExecutionProvider', 'CPUExecutionProvider'])
app.prepare(ctx_id=0, det_size=(640, 640))
for im in [image1, image2]:
image = cv2.cvtColor(np.asarray(im), cv2.COLOR_BGR2RGB)
faces = app.get(image)
image = torch.from_numpy(faces[0].normed_embedding)
ref_images.append(image.unsqueeze(0))
ref_images = torch.cat(ref_images, dim=0)
@fabiorigano
I tested the multi-adapter feature by separately extracting and then concatenating the image embeddings of Face ID XL and another IP Adapter, Plus Face SDXL
can you combine face-id with other ip-adapter models? I thought it required its own attention processor
@fabiorigano
I tested the multi-adapter feature by separately extracting and then concatenating the image embeddings of Face ID XL and another IP Adapter, Plus Face SDXL
can you combine face-id with other ip-adapter models? I thought it required its own attention processor
@yiyixuxu I used PEFT to load the LoRA weights, so we don't need additional attention processors :)
I uploaded some tensors here https://huggingface.co/datasets/fabiorigano/testing-images/tree/main
Some of my tests and the results (input image embeddings are computed from "https://huggingface.co/datasets/YiYiXu/testing-images/resolve/main/ai_face2.png"):
Face ID SD 1.5 only
pipeline.load_ip_adapter("h94/IP-Adapter-FaceID", subfolder=None, weight_name="ip-adapter-faceid_sd15.bin", image_encoder_folder=None)
pipeline.set_ip_adapter_scale(0.6)
image_embeds = load_pt("https://huggingface.co/datasets/fabiorigano/testing-images/resolve/main/ai_face2.ipadpt")
images = pipeline(
prompt="A photo of a girl wearing a black dress, holding red roses in hand, upper body, behind is the Eiffel Tower",
ip_adapter_image_embeds=image_embeds,
negative_prompt="monochrome, lowres, bad anatomy, worst quality, low quality",
num_inference_steps=20, num_images_per_prompt=1, width=512, height=704,
generator=torch.Generator(device="cpu").manual_seed(0)
).images
Output image
Plus Face SD 1.5 only
pipeline.load_ip_adapter("h94/IP-Adapter", subfolder="models", weight_name="ip-adapter-plus-face_sd15.bin")
pipeline.set_ip_adapter_scale(0.6)
image_embeds = load_pt("https://huggingface.co/datasets/fabiorigano/testing-images/resolve/main/clip_ai_face2.ipadpt")
images = pipeline(
prompt="A photo of a girl wearing a black dress, holding red roses in hand, upper body, behind is the Eiffel Tower",
ip_adapter_image_embeds=image_embeds,
negative_prompt="monochrome, lowres, bad anatomy, worst quality, low quality",
num_inference_steps=20, num_images_per_prompt=1, width=512, height=704,
generator=torch.Generator(device="cpu").manual_seed(0)
).images
Output image
Plus Face SD 1.5 + Face ID SD 1.5
pipeline.load_ip_adapter(["h94/IP-Adapter", "h94/IP-Adapter-FaceID"], subfolder=["models", None], weight_name=["ip-adapter-plus-face_sd15.safetensors", "ip-adapter-faceid_sd15.bin"])
pipeline.set_ip_adapter_scale([0.5, 0.5])
t1 = load_pt("https://huggingface.co/datasets/fabiorigano/testing-images/resolve/main/clip_ai_face2.ipadpt")
t2 = load_pt("https://huggingface.co/datasets/fabiorigano/testing-images/resolve/main/ai_face2.ipadpt")
image_embeds = [t1[0], t2[0]]
images = pipeline(
prompt="A photo of a girl wearing a black dress, holding red roses in hand, upper body, behind is the Eiffel Tower",
ip_adapter_image_embeds=image_embeds,
negative_prompt="monochrome, lowres, bad anatomy, worst quality, low quality",
num_inference_steps=20, num_images_per_prompt=1, width=512, height=704,
generator=torch.Generator(device="cpu").manual_seed(0)
).images
Output image
@yiyixuxu it is ready for review
The docs for this PR live here. All of your documentation changes will be reflected on that endpoint. The docs are available until 30 days after the last update.
The code and basic example in https://github.com/huggingface/diffusers/pull/7186#issue-2164851665 have been updated; the examples using tensors stored in the cloud (https://github.com/huggingface/diffusers/pull/7186#issuecomment-1987166212) still work.
@yiyixuxu sorry for taking so long to apply your suggestions
this update works with the following IP-Adapter models:
- ip-adapter-faceid_sd15.bin
- ip-adapter-faceid_sdxl.bin
- ip-adapter-faceid-portrait_sd15.bin (no LoRA)
all of them are trained on the insightface face embeddings.
I looked at the original implementation of IP-Adapter Face ID Plus: it requires both face images and face embeddings as inputs, but diffusers currently doesn't support both inputs simultaneously. The face embedding is used for face identity, while the face image is used to control the output structure of faces.
should we adapt diffusers to support both inputs?
@fabiorigano
can we support it without adding more arguments or code to the pipelines? e.g. ideally we can use a script to create ip_adapter_image_embeds and just pass it to the pipelines.
Finally added support for both Face ID Plus and Face ID Plus v2.
Prepare ID inputs (extract ID embeddings and crop image for CLIP)
ref_images_embeds = []
ip_adapter_images = []
for im in [image1, image2]: # Yiyi images, ai_face2 and woman_input
image = cv2.cvtColor(np.asarray(im), cv2.COLOR_BGR2RGB)
faces = app.get(image)
ip_adapter_images.append(face_align.norm_crop(image, landmark=faces[0].kps, image_size=224))
image = torch.from_numpy(faces[0].normed_embedding)
image_embeds = image.unsqueeze(0)
ref_images_embeds.append(image_embeds)
ref_images_embeds = torch.stack(ref_images_embeds, dim=0).unsqueeze(0)
neg_ref_images_embeds = torch.zeros_like(ref_images_embeds)
id_embeds = torch.cat([neg_ref_images_embeds, ref_images_embeds])
Face ID Plus
# Load model
pipeline.load_ip_adapter("h94/IP-Adapter-FaceID", subfolder=None, weight_name="ip-adapter-faceid-plus_sd15.bin")
pipeline.unet.encoder_hid_proj.image_projection_layers[0].shortcut = False (we enable "shortcut" just for Plus v2)
Face ID Plus v2
# Load model
pipeline.load_ip_adapter("h94/IP-Adapter-FaceID", subfolder=None, weight_name="ip-adapter-faceid-plusv2_sd15.bin")
pipeline.unet.encoder_hid_proj.image_projection_layers[0].shortcut = True (we can also change shortcut_scale)
Inference
# Set ip adapter scales
pipeline.set_ip_adapter_scale(0.7)
# Extract CLIP embeddings
clip_embeds = pipeline.prepare_ip_adapter_image_embeds([ip_adapter_images], None, torch.device("cuda"), num_images, True)[0]
# Set CLIP embeddings as class parameter
pipeline.unet.encoder_hid_proj.image_projection_layers[0].clip_embeds = clip_embeds.to(dtype=torch.float16)
images = pipeline(
prompt="A photo of a girl wearing a black dress, holding red roses in hand, upper body, behind is the Eiffel Tower",
ip_adapter_image_embeds=[id_embeds], guidance_scale=7.5,
negative_prompt="monochrome, lowres, bad anatomy, worst quality, low quality",
num_inference_steps=20, num_images_per_prompt=1, width=512, height=768,
generator=torch.Generator(device="cpu").manual_seed(42),
).images
Plus v1 output (input images are both ai_face2 and woman_input - the purpose is to mix faces)
Plus v2 output (input images are both ai_face2 and woman_input - the purpose is to mix faces)
Plus v1 output (input image is ai_face2)
Plus v2 output (input image is ai_face2)
@yiyixuxu @sayakpaul can you take a look when you have time? thank you
Also, do we need to add a check like so
https://github.com/huggingface/diffusers/blob/cf6e0407e051467b480830d3ed97d2873b5019d3/src/diffusers/loaders/lora.py#L108
when there's a call to use the IP Adapter Face ID weights?
I will add it
great work as always! thanks a lot :) @fabiorigano