Open-Sora-Plan
Open-Sora-Plan copied to clipboard
Image2Video: mask values are not as expected
func prepare_mask_masked_video
Assume the first frame is given as the condition. When the mask is then constructed at the latent level, the values along the latent frame dimension are incorrect:
demo_inpaint_prepare_mask_masked_video.py
import torch.nn.functional as F
import torch
from einops import rearrange
# DEBUG for https://github.com/PKU-YuanGroup/Open-Sora-Plan/blob/294993ca78bf65dec1c3b6fb25541432c545eda9/opensora/sample/pipeline_inpaint.py#L605
# line 592~607
#
# Stand-alone reproduction of the latent-level mask construction, with the
# final reshape corrected so that every latent step holds consecutive frames.
B = 1
T = 17
H = 64
W = 64
vae_factor = (4, 8, 8)  # (temporal, height, width) compression factors

# Pixel-level mask: 1 everywhere, 0 on the frames listed as conditions.
mask_pix = torch.ones([B, 1, T, H, W])
conditional_images_indices = [0]
mask_pix[:, :, conditional_images_indices] = 0

# Fold (b, c, t) into the batch axis so F.interpolate only resizes the spatial
# dims; equivalent to rearrange 'b c t h w -> (b c t) 1 h w' since c == 1.
mask_1 = mask_pix.reshape(B * T, 1, H, W)  # [17, 1, 64, 64]
latent_size = (H // vae_factor[1], W // vae_factor[2])
if T % 2 == 1:
    latent_size_t = (T - 1) // vae_factor[0] + 1
else:
    latent_size_t = T // vae_factor[0]
mask_2 = F.interpolate(mask_1, size=latent_size, mode='bilinear')  # [17, 1, 8, 8]
# Equivalent to rearrange '(b c t) 1 h w -> b c t h w' with c == 1.
mask_3 = mask_2.reshape(B, 1, T, *latent_size)  # [1, 1, 17, 8, 8]

# Replace the single first-frame mask with vae_factor[0] copies of it, giving
# T - 1 + vae_factor[0] frames in total. Presumably this is because the first
# frame gets a latent step of its own (note `(T - 1) // vae_factor[0] + 1`
# above) -- confirm against the VAE.
mask_first_frame = mask_3[:, :, 0:1].repeat(1, 1, vae_factor[0], 1, 1)  # [1, 1, 4, 8, 8]
assert (mask_first_frame == 0).all()
mask_4 = torch.cat([mask_first_frame, mask_3[:, :, 1:]], dim=2)  # [1, 1, 20, 8, 8]

# Split the frame axis as (latent step, frame within step), then move the
# within-step index to the channel axis. Viewing directly as
# (B, vae_factor[0], latent_size_t, h, w) would instead put frames
# 0, 5, 10, 15 into latent step 0, and the check below would fail.
# For odd T this requires (T - 1) to be a multiple of vae_factor[0].
mask_output = (
    mask_4.view(B, latent_size_t, vae_factor[0], *latent_size)
    .transpose(1, 2)
    .contiguous()
)  # [1, 4, 5, 8, 8]
assert (mask_output[:, :, 0] == 0).all()
@LinB203 Could you please explain this part of the code?