RuntimeError: indices should be either on cpu or on the same device as the indexed tensor (cpu)
Dear ModelScope team,
Thank you for sharing the Text-to-video-synthesis Model in Open Domain; I really like it.
I followed the exact instructions from the code example, but it reports an error at this line:
output_video_path = p(test_text,)[OutputKeys.OUTPUT_VIDEO]
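For context, my script is essentially the example from the model card; a minimal sketch of what I am running (the model ID 'damo/text-to-video-synthesis' and the prompt are copied from the card, so treat them as placeholders for my actual values):

from modelscope.pipelines import pipeline
from modelscope.outputs import OutputKeys

# build the text-to-video pipeline from the hub model
p = pipeline('text-to-video-synthesis', 'damo/text-to-video-synthesis')

# the pipeline expects a dict with a 'text' key holding the prompt
test_text = {'text': 'A panda eating bamboo on a rock.'}

# this is the call that raises the error below
output_video_path = p(test_text,)[OutputKeys.OUTPUT_VIDEO]
print('output_video_path:', output_video_path)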
The error message is:
2023-03-20 18:04:41,786 - modelscope - WARNING - task text-to-video-synthesis input definition is missing
WARNING:modelscope:task text-to-video-synthesis input definition is missing
╭─────────────────────────────── Traceback (most recent call last) ────────────────────────────────╮
│ in <module>:4 │
│ │
│ /opt/conda/lib/python3.8/site-packages/modelscope/pipelines/base.py:212 in __call__ │
│ │
│ 209 │ │ │ return self._process_iterator(input, *args, **kwargs) │
│ 210 │ │ │
│ 211 │ │ else: │
│ ❱ 212 │ │ │ output = self._process_single(input, *args, **kwargs) │
│ 213 │ │ return output │
│ 214 │ │
│ 215 │ def _sanitize_parameters(self, **pipeline_parameters): │
│ │
│ /opt/conda/lib/python3.8/site-packages/modelscope/pipelines/base.py:247 in _process_single │
│ │
│ 244 │ │ │ │ with torch.no_grad(): │
│ 245 │ │ │ │ │ if self._auto_collate: │
│ 246 │ │ │ │ │ │ out = self._collate_fn(out) │
│ ❱ 247 │ │ │ │ │ out = self.forward(out, **forward_params) │
│ 248 │ │ │ else: │
│ 249 │ │ │ │ out = self.forward(out, **forward_params) │
│ 250 │
│ │
│ /opt/conda/lib/python3.8/site-packages/modelscope/pipelines/multi_modal/text_to_video_synthesis_ │
│ pipeline.py:58 in forward │
│ │
│ 55 │ │
│ 56 │ def forward(self, input: Dict[str, Any], │
│ 57 │ │ │ │ **forward_params) -> Dict[str, Any]: │
│ ❱ 58 │ │ video = self.model(input) │
│ 59 │ │ return {'video': video} │
│ 60 │ │
│ 61 │ def postprocess(self, inputs: Dict[str, Any], │
│ │
│ /opt/conda/lib/python3.8/site-packages/modelscope/models/base/base_model.py:34 in __call__ │
│ │
│ 31 │ │ self._device_name = device_name │
│ 32 │ │
│ 33 │ def __call__(self, *args, **kwargs) -> Dict[str, Any]: │
│ ❱ 34 │ │ return self.postprocess(self.forward(*args, **kwargs)) │
│ 35 │ │
│ 36 │ @abstractmethod │
│ 37 │ def forward(self, *args, **kwargs) -> Dict[str, Any]: │
│ │
│ /opt/conda/lib/python3.8/site-packages/modelscope/models/multi_modal/video_synthesis/text_to_vid │
│ eo_synthesis_model.py:153 in forward │
│ │
│ 150 │ │ │ max_frames = self.config.model.model_args.max_frames │
│ 151 │ │ │ latent_h, latent_w = 32, 32 │
│ 152 │ │ │ with amp.autocast(enabled=True): │
│ ❱ 153 │ │ │ │ x0 = self.diffusion.ddim_sample_loop( │
│ 154 │ │ │ │ │ noise=torch.randn(num_sample, 4, max_frames, latent_h, │
│ 155 │ │ │ │ │ │ │ │ │ latent_w).to( │
│ 156 │ │ │ │ │ │ │ │ │ │ self.device), # shape: b c f h w │
│ │
│ /opt/conda/lib/python3.8/site-packages/torch/autograd/grad_mode.py:27 in decorate_context │
│ │
│ 24 │ │ @functools.wraps(func) │
│ 25 │ │ def decorate_context(*args, **kwargs): │
│ 26 │ │ │ with self.clone(): │
│ ❱ 27 │ │ │ │ return func(*args, **kwargs) │
│ 28 │ │ return cast(F, decorate_context) │
│ 29 │ │
│ 30 │ def _wrap_generator(self, func): │
│ │
│ /opt/conda/lib/python3.8/site-packages/modelscope/models/multi_modal/video_synthesis/diffusion.p │
│ y:219 in ddim_sample_loop │
│ │
│ 216 │ │ │ │ │ │ │ │ │ 0, self.num_timesteps - 1).flip(0) │
│ 217 │ │ for step in steps: │
│ 218 │ │ │ t = torch.full((b, ), step, dtype=torch.long, device=xt.device) │
│ ❱ 219 │ │ │ xt, _ = self.ddim_sample(xt, t, model, model_kwargs, clamp, │
│ 220 │ │ │ │ │ │ │ │ │ percentile, condition_fn, guide_scale, │
│ 221 │ │ │ │ │ │ │ │ │ ddim_timesteps, eta) │
│ 222 │ │ return xt │
│ │
│ /opt/conda/lib/python3.8/site-packages/torch/autograd/grad_mode.py:27 in decorate_context │
│ │
│ 24 │ │ @functools.wraps(func) │
│ 25 │ │ def decorate_context(*args, **kwargs): │
│ 26 │ │ │ with self.clone(): │
│ ❱ 27 │ │ │ │ return func(*args, **kwargs) │
│ 28 │ │ return cast(F, decorate_context) │
│ 29 │ │
│ 30 │ def _wrap_generator(self, func): │
│ │
│ /opt/conda/lib/python3.8/site-packages/modelscope/models/multi_modal/video_synthesis/diffusion.p │
│ y:168 in ddim_sample │
│ │
│ 165 │ │ stride = self.num_timesteps // ddim_timesteps │
│ 166 │ │ │
│ 167 │ │ # predict distribution of p(x_{t-1} | x_t) │
│ ❱ 168 │ │ _, _, _, x0 = self.p_mean_variance(xt, t, model, model_kwargs, clamp, │
│ 169 │ │ │ │ │ │ │ │ │ │ percentile, guide_scale) │
│ 170 │ │ if condition_fn is not None: │
│ 171 │ │ │ # x0 -> eps │
│ │
│ /opt/conda/lib/python3.8/site-packages/modelscope/models/multi_modal/video_synthesis/diffusion.p │
│ y:120 in p_mean_variance │
│ │
│ 117 │ │ │
│ 118 │ │ # compute variance │
│ 119 │ │ if self.var_type == 'fixed_small': │
│ ❱ 120 │ │ │ var = _i(self.posterior_variance, t, xt) │
│ 121 │ │ │ log_var = _i(self.posterior_log_variance_clipped, t, xt) │
│ 122 │ │ │
│ 123 │ │ # compute mean and x0 │
│ │
│ /opt/conda/lib/python3.8/site-packages/modelscope/models/multi_modal/video_synthesis/diffusion.p │
│ y:14 in _i │
│ │
│ 11 │ r"""Index tensor using t and format the output according to x. │
│ 12 │ """ │
│ 13 │ shape = (x.size(0), ) + (1, ) * (x.ndim - 1) │
│ ❱ 14 │ return tensor[t].view(shape).to(x) │
│ 15 │
│ 16 │
│ 17 def beta_schedule(schedule, │
╰──────────────────────────────────────────────────────────────────────────────────────────────────╯
RuntimeError: indices should be either on cpu or on the same device as the indexed tensor (cpu)
I am using a single A10G GPU. Could you tell me how to fix this problem?
Thank you for your help.
Best Wishes,
Zongze
Use modelscope v1.4.2, which has fixed this.
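If upgrading is not an option: the traceback points at _i in diffusion.py, where a CPU schedule buffer (self.posterior_variance) is indexed with the CUDA timestep tensor t. A local patch along the following lines should avoid the device mismatch; this is only a sketch and may not match the actual v1.4.2 change:

def _i(tensor, t, x):
    r"""Index tensor using t and format the output according to x."""
    shape = (x.size(0), ) + (1, ) * (x.ndim - 1)
    # move the index onto the indexed tensor's device before indexing,
    # then cast/move the result to match x, as before
    return tensor[t.to(tensor.device)].view(shape).to(x)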