chatterbox
chatterbox copied to clipboard
AttributeError: 'NoneType' object has no attribute 'cpu'
VC works, but TTS doesn't (code below). On Win 11, RTX 4090, I get this error:
Error: Python: Traceback (most recent call last):
File ".\python\Lib\site-packages\chatterbox\tts.py", line 243, in generate
speech_tokens = self.t3.inference(
^^^^^^^^^^^^^^^^^^
File ".\python\Lib\site-packages\torch\utils\_contextlib.py", line 116, in decorate_context
return func(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^
File ".\python\Lib\site-packages\chatterbox\models\t3\t3.py", line 318, in inference
output = self.patched_model(
^^^^^^^^^^^^^^^^^^^
File ".\python\Lib\site-packages\torch\nn\modules\module.py", line 1739, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File ".\python\Lib\site-packages\torch\nn\modules\module.py", line 1750, in _call_impl
return forward_call(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File ".\python\Lib\site-packages\torch\utils\_contextlib.py", line 116, in decorate_context
return func(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^
File ".\python\Lib\site-packages\chatterbox\models\t3\inference\t3_hf_backend.py", line 95, in forward
tfmr_out = self.model(
^^^^^^^^^^^
File ".\python\Lib\site-packages\torch\nn\modules\module.py", line 1739, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File ".\python\Lib\site-packages\torch\nn\modules\module.py", line 1750, in _call_impl
return forward_call(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File ".\python\Lib\site-packages\transformers\utils\generic.py", line 943, in wrapper
output = func(self, *args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^
File ".\python\Lib\site-packages\transformers\models\llama\modeling_llama.py", line 443, in forward
layer_outputs = decoder_layer(
^^^^^^^^^^^^^^
File ".\python\Lib\site-packages\transformers\modeling_layers.py", line 48, in __call__
return super().__call__(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File ".\python\Lib\site-packages\torch\nn\modules\module.py", line 1739, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File ".\python\Lib\site-packages\torch\nn\modules\module.py", line 1750, in _call_impl
return forward_call(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File ".\python\Lib\site-packages\transformers\models\llama\modeling_llama.py", line 294, in forward
hidden_states, self_attn_weights = self.self_attn(
^^^^^^^^^^^^^^^
File ".\python\Lib\site-packages\torch\nn\modules\module.py", line 1739, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File ".\python\Lib\site-packages\torch\nn\modules\module.py", line 1845, in _call_impl
return inner()
^^^^^^^
File ".\python\Lib\site-packages\torch\nn\modules\module.py", line 1806, in inner
hook_result = hook(self, args, result)
^^^^^^^^^^^^^^^^^^^^^^^^
File ".\python\Lib\site-packages\chatterbox\models\t3\inference\alignment_stream_analyzer.py", line 74, in attention_forward_hook
step_attention = output[1].cpu() # (B, 16, N, N)
^^^^^^^^^^^^^
AttributeError: 'NoneType' object has no attribute 'cpu'
Running this:
import torch
import torchaudio as ta
from chatterbox.tts import ChatterboxTTS

# Pick the best available backend: CUDA first, then Apple MPS, else CPU.
device = "cuda" if torch.cuda.is_available() else (
    "mps" if torch.backends.mps.is_available() else "cpu"
)
print(f"Using device: {device}")

model = ChatterboxTTS.from_pretrained(device=device)

text = "Ezreal and Jinx teamed up with Ahri, Yasuo, and Teemo to take down the enemy's Nexus in an epic late-game pentakill."
wav = model.generate(text)
ta.save("C:/Users/user/Downloads/test-1.wav", wav, model.sr)

# To synthesize with a different voice, pass a reference audio prompt.
AUDIO_PROMPT_PATH = "C:/Users/user/Downloads/voice_sample.wav"
wav = model.generate(text, audio_prompt_path=AUDIO_PROMPT_PATH)
ta.save("C:/Users/user/Downloads/test-2.wav", wav, model.sr)
In alignment_stream_analyzer.py
Changing this seems to fix it:
def attention_forward_hook(module, input, output):
    """
    Forward hook on `LlamaAttention`.

    Depending on the transformers version, the layer returns
    `(attn_output, attn_weights)` or `(attn_output, attn_weights, past_key_value)`;
    `attn_weights` may be None when attention outputs were not requested,
    so guard before calling `.cpu()` on it.
    """
    has_weights = (
        isinstance(output, tuple)
        and len(output) > 1
        and output[1] is not None
    )
    if has_weights:
        # attn_weights: (B, H, N, N) — move off-device before reducing.
        weights = output[1].cpu()
        self.last_aligned_attn = weights[0].mean(0)  # (N, N)
    else:
        self.last_aligned_attn = None
In `alignment_stream_analyzer.py`, changing this seems to fix it:
def attention_forward_hook(module, input, output):
    """
    See `LlamaAttention.forward`; newer transformers versions return
    `(attn_output, attn_weights)` (no `past_key_value` element), and
    `attn_weights` is None unless attention outputs were requested.
    """
    if not (isinstance(output, tuple) and len(output) > 1 and output[1] is not None):
        self.last_aligned_attn = None
        return
    step_attention = output[1].cpu()  # (B, H, N, N)
    self.last_aligned_attn = step_attention[0].mean(0)  # (N, N)
This solution worked for me. I applied it to both copies of alignment_stream_analyzer.py, which exist inside both Chatterbox and Chatterbox-TTS — I am a ComfyUI user.