whisper-diarization
whisper-diarization copied to clipboard
AttributeError: 'SoundFile' object has no attribute 'frames'
[NeMo I 2023-10-17 17:09:36 speaker_utils:93] Number of files to diarize: 1
[NeMo I 2023-10-17 17:09:36 clustering_diarizer:309] Split long audio file to avoid CUDA memory issue
splitting manifest: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:02<00:00, 2.26s/it]
[NeMo I 2023-10-17 17:09:39 vad_utils:107] The prepared manifest file exists. Overwriting!
[NeMo I 2023-10-17 17:09:39 classification_models:272] Perform streaming frame-level VAD
[NeMo I 2023-10-17 17:09:39 collections:301] Filtered duration for loading collection is 0.00 hours.
[NeMo I 2023-10-17 17:09:39 collections:302] Dataset loaded with 71 items, total duration of 0.98 hours.
[NeMo I 2023-10-17 17:09:39 collections:304] # 71 files loaded accounting to # 1 labels
vad: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 71/71 [00:10<00:00, 6.99it/s]
[NeMo I 2023-10-17 17:09:49 clustering_diarizer:250] Generating predictions with overlapping input segments
[NeMo I 2023-10-17 17:10:12 clustering_diarizer:262] Converting frame level prediction to speech/no-speech segment in start and end times format.
creating speech segments: 100%|████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:02<00:00, 2.67s/it]
╭─────────────────────────────── Traceback (most recent call last) ────────────────────────────────╮
│ /mnt/bigdisk/whisper-diarization/nemo_process.py:29 in <module> │
│ │
│ 26 │
│ 27 # Initialize NeMo MSDD diarization model │
│ 28 msdd_model = NeuralDiarizer(cfg=create_config(temp_path)).to(args.device) │
│ ❱ 29 msdd_model.diarize() │
│ 30 │
│ │
│ /mnt/bigdisk/miniconda3/lib/python3.10/site-packages/torch/utils/_contextlib.py:115 in │
│ decorate_context │
│ │
│ 112 │ @functools.wraps(func) │
│ 113 │ def decorate_context(*args, **kwargs): │
│ 114 │ │ with ctx_factory(): │
│ ❱ 115 │ │ │ return func(*args, **kwargs) │
│ 116 │ │
│ 117 │ return decorate_context │
│ 118 │
│ │
│ /mnt/bigdisk/miniconda3/lib/python3.10/site-packages/nemo/collections/asr/models/msdd_models.py: │
│ 1180 in diarize │
│ │
│ 1177 │ │ Note that the result of MSDD can include multiple speakers at the same time. The │
│ 1178 │ │ function that can generate overlapping timestamps. `self.run_overlap_aware_eval( │
│ 1179 │ │ """ │
│ ❱ 1180 │ │ self.clustering_embedding.prepare_cluster_embs_infer() │
│ 1181 │ │ self.msdd_model.pairwise_infer = True │
│ 1182 │ │ self.get_emb_clus_infer(self.clustering_embedding) │
│ 1183 │ │ preds_list, targets_list, signal_lengths_list = self.run_pairwise_diarization() │
│ │
│ /mnt/bigdisk/miniconda3/lib/python3.10/site-packages/nemo/collections/asr/models/msdd_models.py: │
│ 699 in prepare_cluster_embs_infer │
│ │
│ 696 │ │ Launch clustering diarizer to prepare embedding vectors and clustering results. │
│ 697 │ │ """ │
│ 698 │ │ self.max_num_speakers = self.cfg_diar_infer.diarizer.clustering.parameters.max_n │
│ ❱ 699 │ │ self.emb_sess_test_dict, self.emb_seq_test, self.clus_test_label_dict, _ = self. │
│ 700 │ │ │ self._cfg_msdd.test_ds.manifest_filepath, self._cfg_msdd.test_ds.emb_dir │
│ 701 │ │ ) │
│ 702 │
│ │
│ /mnt/bigdisk/miniconda3/lib/python3.10/site-packages/nemo/collections/asr/models/msdd_models.py: │
│ 866 in run_clustering_diarizer │
│ │
│ 863 │ │ │
│ 864 │ │ logging.info(f"Multiscale Weights: {self.clus_diar_model.multiscale_args_dict['m │
│ 865 │ │ logging.info(f"Clustering Parameters: {clustering_params_str}") │
│ ❱ 866 │ │ scores = self.clus_diar_model.diarize(batch_size=self.cfg_diar_infer.batch_size) │
│ 867 │ │ │
│ 868 │ │ # If RTTM (ground-truth diarization annotation) files do not exist, scores is No │
│ 869 │ │ if scores is not None: │
│ │
│ /mnt/bigdisk/miniconda3/lib/python3.10/site-packages/nemo/collections/asr/models/clustering_diar │
│ izer.py:437 in diarize │
│ │
│ 434 │ │ os.makedirs(out_rttm_dir, exist_ok=True) │
│ 435 │ │ │
│ 436 │ │ # Speech Activity Detection │
│ ❱ 437 │ │ self._perform_speech_activity_detection() │
│ 438 │ │ │
│ 439 │ │ # Segmentation │
│ 440 │ │ scales = self.multiscale_args_dict['scale_dict'].items() │
│ │
│ /mnt/bigdisk/miniconda3/lib/python3.10/site-packages/nemo/collections/asr/models/clustering_diar │
│ izer.py:325 in _perform_speech_activity_detection │
│ │
│ 322 │ │ │ │ ) │
│ 323 │ │ │ │
│ 324 │ │ │ self._setup_vad_test_data(manifest_vad_input) │
│ ❱ 325 │ │ │ self._run_vad(manifest_vad_input) │
│ 326 │ │ │
│ 327 │ │ elif self._diarizer_params.vad.external_vad_manifest is not None: │
│ 328 │ │ │ self._speaker_manifest_path = self._diarizer_params.vad.external_vad_manifes │
│ │
│ /mnt/bigdisk/miniconda3/lib/python3.10/site-packages/nemo/collections/asr/models/clustering_diar │
│ izer.py:281 in _run_vad │
│ │
│ 278 │ │ │ else: │
│ 279 │ │ │ │ logging.warning(f"no vad file found for {key} due to zero or negative du │
│ 280 │ │ │
│ ❱ 281 │ │ write_rttm2manifest(AUDIO_VAD_RTTM_MAP, self._vad_out_file) │
│ 282 │ │ self._speaker_manifest_path = self._vad_out_file │
│ 283 │ │
│ 284 │ def _run_segmentation(self, window: float, shift: float, scale_tag: str = ''): │
│ │
│ /mnt/bigdisk/miniconda3/lib/python3.10/site-packages/nemo/collections/asr/parts/utils/speaker_ut │
│ ils.py:858 in write_rttm2manifest │
│ │
│ 855 │ │ for uniq_id in AUDIO_RTTM_MAP: │
│ 856 │ │ │ rttm_file_path = AUDIO_RTTM_MAP[uniq_id]['rttm_filepath'] │
│ 857 │ │ │ rttm_lines = read_rttm_lines(rttm_file_path) │
│ ❱ 858 │ │ │ offset, duration = get_offset_and_duration(AUDIO_RTTM_MAP, uniq_id, decimals │
│ 859 │ │ │ vad_start_end_list_raw = [] │
│ 860 │ │ │ for line in rttm_lines: │
│ 861 │ │ │ │ start, dur = get_vad_out_from_rttm_line(line) │
│ │
│ /mnt/bigdisk/miniconda3/lib/python3.10/site-packages/nemo/collections/asr/parts/utils/speaker_ut │
│ ils.py:565 in get_offset_and_duration │
│ │
│ 562 │ │ offset = round(AUDIO_RTTM_MAP[uniq_id]['offset'], decimals) │
│ 563 │ else: │
│ 564 │ │ sound = sf.SoundFile(audio_path) │
│ ❱ 565 │ │ duration = sound.frames / sound.samplerate │
│ 566 │ │ offset = 0.0 │
│ 567 │ return offset, duration │
│ 568 │
│ │
│ /mnt/bigdisk/miniconda3/lib/python3.10/site-packages/soundfile.py:822 in __getattr__ │
│ │
│ 819 │ │ │ data = _snd.sf_get_string(self._file, _str_types[name]) │
│ 820 │ │ │ return _ffi.string(data).decode('utf-8', 'replace') if data else "" │
│ 821 │ │ else: │
│ ❱ 822 │ │ │ raise AttributeError( │
│ 823 │ │ │ │ "'SoundFile' object has no attribute {0!r}".format(name)) │
│ 824 │ │
│ 825 │ def __len__(self): │
╰──────────────────────────────────────────────────────────────────────────────────────────────────╯
AttributeError: 'SoundFile' object has no attribute 'frames'
Please upload the audio file using any method to reproduce the issue
https://drive.google.com/file/d/1fyxn2N2sfnP3ZEhU8_xO8NS9E9roG2Rz/view?usp=share_link
Please upload the audio file using any method to reproduce the issue
I've uploaded the file already, would you please take a look?