wav2letter
wav2letter copied to clipboard
what() : CUDNN_STATUS_NOT_SUPPORTED
trafficstars
Question
When I run build/train, the following error occurs: CUDNN_STATUS_NOT_SUPPORTED
Error log
I1021 09:29:31.667903 7461 W2lListFilesDataset.cpp:62] Total batches (i.e. iters): 2275
I1021 09:29:31.793767 7460 W2lListFilesDataset.cpp:141] 78752 files found.
I1021 09:29:31.794911 7460 Utils.cpp:102] Filtered 5968/78752 samples
I1021 09:29:31.795254 7460 W2lListFilesDataset.cpp:62] Total batches (i.e. iters): 2275
I1021 09:29:32.036325 7459 Train.cpp:813] Epoch 1 started!
terminate called after throwing an instance of 'std::runtime_error'
what(): CUDNN_STATUS_NOT_SUPPORTED
*** Aborted at 1603272576 (unix time) try "date -d @1603272576" if you are using GNU date ***
PC: @ 0x7f6bab0dcf47 gsignal
*** SIGABRT (@0x1d26) received by PID 7462 (TID 0x7f6beee28380) from PID 7462; stack trace: ***
@ 0x7f6be713e8a0 (unknown)
@ 0x7f6bab0dcf47 gsignal
@ 0x7f6bab0de8b1 abort
@ 0x7f6babad1957 (unknown)
@ 0x7f6babad7ae6 (unknown)
@ 0x7f6babad7b21 std::terminate()
@ 0x7f6babad7d54 __cxa_throw
@ 0x563503afc489 fl::TensorDescriptor::TensorDescriptor()
@ 0x563503afa7cc fl::conv2d()
@ 0x563503aa4bc1 fl::Conv2D::forward()
@ 0x563503ab89ee fl::UnaryModule::forward()
@ 0x563503aa2aba fl::Sequential::forward()
@ 0x5635037d14ac _ZZ4mainENKUlSt10shared_ptrIN2fl6ModuleEES_IN3w2l17SequenceCriterionEES_INS3_10W2lDatasetEES_INS0_19FirstOrderOptimizerEES9_ddblE3_clES2_S5_S7_S9_S9_ddbl
@ 0x5635037622ab main
@ 0x7f6bab0bfb97 __libc_start_main
@ 0x5635037c9eba _start
terminate called after throwing an instance of 'std::runtime_error'
what(): CUDNN_STATUS_NOT_SUPPORTED
*** Aborted at 1603272576 (unix time) try "date -d @1603272576" if you are using GNU date ***
PC: @ 0x7fd8ca441f47 gsignal
*** SIGABRT (@0x1d25) received by PID 7461 (TID 0x7fd90e18d380) from PID 7461; stack trace: ***
@ 0x7fd9064a38a0 (unknown)
@ 0x7fd8ca441f47 gsignal
@ 0x7fd8ca4438b1 abort
@ 0x7fd8cae36957 (unknown)
@ 0x7fd8cae3cae6 (unknown)
@ 0x7fd8cae3cb21 std::terminate()
@ 0x7fd8cae3cd54 __cxa_throw
@ 0x55ef87bf1489 fl::TensorDescriptor::TensorDescriptor()
@ 0x55ef87bef7cc fl::conv2d()
@ 0x55ef87b99bc1 fl::Conv2D::forward()
@ 0x55ef87bad9ee fl::UnaryModule::forward()
@ 0x55ef87b97aba fl::Sequential::forward()
@ 0x55ef878c64ac _ZZ4mainENKUlSt10shared_ptrIN2fl6ModuleEES_IN3w2l17SequenceCriterionEES_INS3_10W2lDatasetEES_INS0_19FirstOrderOptimizerEES9_ddblE3_clES2_S5_S7_S9_S9_ddbl
@ 0x55ef878572ab main
@ 0x7fd8ca424b97 __libc_start_main
@ 0x55ef878beeba _start
terminate called after throwing an instance of 'std::runtime_error'
what(): CUDNN_STATUS_NOT_SUPPORTED
*** Aborted at 1603272576 (unix time) try "date -d @1603272576" if you are using GNU date ***
PC: @ 0x7f9c2aa6af47 gsignal
*** SIGABRT (@0x1d23) received by PID 7459 (TID 0x7f9c6e7b6380) from PID 7459; stack trace: ***
@ 0x7f9c66acc8a0 (unknown)
@ 0x7f9c2aa6af47 gsignal
@ 0x7f9c2aa6c8b1 abort
@ 0x7f9c2b45f957 (unknown)
@ 0x7f9c2b465ae6 (unknown)
@ 0x7f9c2b465b21 std::terminate()
@ 0x7f9c2b465d54 __cxa_throw
@ 0x563d337c9489 fl::TensorDescriptor::TensorDescriptor()
@ 0x563d337c77cc fl::conv2d()
@ 0x563d33771bc1 fl::Conv2D::forward()
@ 0x563d337859ee fl::UnaryModule::forward()
@ 0x563d3376faba fl::Sequential::forward()
@ 0x563d3349e4ac _ZZ4mainENKUlSt10shared_ptrIN2fl6ModuleEES_IN3w2l17SequenceCriterionEES_INS3_10W2lDatasetEES_INS0_19FirstOrderOptimizerEES9_ddblE3_clES2_S5_S7_S9_S9_ddbl
@ 0x563d3342f2ab main
@ 0x7f9c2aa4db97 __libc_start_main
@ 0x563d33496eba _start
terminate called after throwing an instance of 'std::runtime_error'
what(): CUDNN_STATUS_NOT_SUPPORTED
*** Aborted at 1603272576 (unix time) try "date -d @1603272576" if you are using GNU date ***
PC: @ 0x7f687fd23f47 gsignal
*** SIGABRT (@0x1d24) received by PID 7460 (TID 0x7f68c3a6f380) from PID 7460; stack trace: ***
@ 0x7f68bbd858a0 (unknown)
@ 0x7f687fd23f47 gsignal
@ 0x7f687fd258b1 abort
@ 0x7f6880718957 (unknown)
@ 0x7f688071eae6 (unknown)
@ 0x7f688071eb21 std::terminate()
@ 0x7f688071ed54 __cxa_throw
@ 0x559c78bf0489 fl::TensorDescriptor::TensorDescriptor()
@ 0x559c78bee7cc fl::conv2d()
@ 0x559c78b98bc1 fl::Conv2D::forward()
@ 0x559c78bac9ee fl::UnaryModule::forward()
@ 0x559c78b96aba fl::Sequential::forward()
@ 0x559c788c54ac _ZZ4mainENKUlSt10shared_ptrIN2fl6ModuleEES_IN3w2l17SequenceCriterionEES_INS3_10W2lDatasetEES_INS0_19FirstOrderOptimizerEES9_ddblE3_clES2_S5_S7_S9_S9_ddbl
@ 0x559c788562ab main
@ 0x7f687fd06b97 __libc_start_main
@ 0x559c788bdeba _start
--------------------------------------------------------------------------
mpirun noticed that process rank 3 with PID 0 on node e0ebb0a63bf9 exited on signal 6 (Aborted).
--------------------------------------------------------------------------
Additional Context
This is additional information.
- cudnn & cuda
root@e0ebb0a63bf9:/home/dcshin/wav2letter/build# cat /usr/local/cuda/include/cudnn.h | grep CUDNN_MAJOR -A 2
cat: /usr/local/cuda/include/cudnn.h: No such file or directory
root@e0ebb0a63bf9:/home/dcshin/wav2letter/build# cat /usr/include/cudnn.h | grep CUDNN_MAJOR -A 2
#define CUDNN_MAJOR 7
#define CUDNN_MINOR 6
#define CUDNN_PATCHLEVEL 5
--
#define CUDNN_VERSION (CUDNN_MAJOR * 1000 + CUDNN_MINOR * 100 + CUDNN_PATCHLEVEL)
#include "driver_types.h"
root@e0ebb0a63bf9:/home/dcshin/wav2letter/build# nvcc --version
nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2018 NVIDIA Corporation
Built on Sat_Aug_25_21:08:01_CDT_2018
Cuda compilation tools, release 10.0, V10.0.130
root@e0ebb0a63bf9:/home/dcshin/wav2letter/build# which nvcc
/usr/local/cuda/bin/nvcc
cuDNN is available in PyTorch:
root@e04c9ef4ea64:/home/dcshin/wav2letter# python
Python 3.6.9 |Anaconda, Inc.| (default, Jul 30 2019, 19:07:31)
[GCC 7.3.0] on linux
Type "help", "copyright", "credits" or "license" for more information.
>>> import torch
>>> torch.backends.cudnn.is_available()
True
- log (make test)
#...
26/31 Test #26: inference_LinearTest ............. Passed 0.01 sec
Start 27: inference_LogMelFeatureTest
27/31 Test #27: inference_LogMelFeatureTest ...... Passed 0.05 sec
Start 28: inference_MemoryManagerTest
28/31 Test #28: inference_MemoryManagerTest ...... Passed 0.01 sec
Start 29: inference_ReluTest
29/31 Test #29: inference_ReluTest ............... Passed 0.01 sec
Start 30: inference_ResidualTest
30/31 Test #30: inference_ResidualTest ........... Passed 0.01 sec
Start 31: inference_TDSBlockTest
31/31 Test #31: inference_TDSBlockTest ........... Passed 0.01 sec
100% tests passed, 0 tests failed out of 31
Total Test time (real) = 69.11 sec
- log (cmake)
-- Checking for [mkl_gf_lp64 - mkl_gnu_thread - mkl_core - iomp5 - pthread - m]
-- Library mkl_gf_lp64: /opt/intel/mkl/lib/intel64/libmkl_gf_lp64.so
-- Library mkl_gnu_thread: /opt/intel/mkl/lib/intel64/libmkl_gnu_thread.so
-- Library mkl_core: /opt/intel/mkl/lib/intel64/libmkl_core.so
-- Library iomp5: /usr/lib/x86_64-linux-gnu/libiomp5.so
-- Library pthread: /usr/lib/x86_64-linux-gnu/libpthread.so
-- Library m: /usr/lib/x86_64-linux-gnu/libm.so
-- MKL library found
-- ArrayFire found (include: /usr/local/include, library: ArrayFire::afcuda)
-- Found glog (include: /usr/include, library: /usr/lib/x86_64-linux-gnu/libglog.so)
-- GLOG found
-- Found gflags (include: /usr/include, library: /usr/lib/x86_64-linux-gnu/libgflags.so)
-- GFLAGS found
-- OpenMP found
-- flashlight found (include: lib: flashlight::flashlight )
-- flashlight built in distributed mode.
-- flashlight built with contrib features.
-- CUDA found (library: /usr/local/cuda/lib64/libcudart_static.a;-pthread;dl;/usr/lib/x86_64-linux-gnu/librt.so include: /usr/local/cuda/include)
-- CUDA architecture flags: -gencodearch=compute_30,code=sm_30-gencodearch=compute_35,code=sm_35-gencodearch=compute_50,code=sm_50-gencodearch=compute_52,code=sm_52-gencodearch=compute_60,code=sm_60-gencodearch=compute_61,code=sm_61-gencodearch=compute_70,code=sm_70-gencodearch=compute_75,code=sm_75-gencodearch=compute_70,code=compute_70-gencodearch=compute_75,code=compute_75
-- CBLAS found (include: /opt/intel/mkl/include, library: /opt/intel/mkl/lib/intel64/libmkl_gf_lp64.so;/opt/intel/mkl/lib/intel64/libmkl_gnu_thread.so;/opt/intel/mkl/lib/intel64/libmkl_core.so;/usr/lib/x86_64-linux-gnu/libiomp5.so;/usr/lib/x86_64-linux-gnu/libpthread.so;/usr/lib/x86_64-linux-gnu/libm.so)
-- FFTW found
-- Looking for KenLM
-- Using kenlm library found in /root/kenlm/build/lib/libkenlm.a
-- Using kenlm utils library found in /root/kenlm/build/lib/libkenlm.a
-- kenlm lm/model.hh found in /root/kenlm/lm/model.hh
-- Found kenlm (include: /root/kenlm, library: /root/kenlm/build/lib/libkenlm.a;/root/kenlm/build/lib/libkenlm_util.a)
-- Adding warpctc:
-- warpctc: cuda found TRUE
-- warpctc: using CUDA 9.0 or above
-- warpctc: Building shared library with GPU support
-- Required SndFile dependency Ogg found.
-- Required SndFile dependency Vorbis found.
-- Required SndFile dependency VorbisEnc found.
-- Required SndFile dependency FLAC found.
-- Found libsndfile: (lib: /usr/local/lib/libsndfile.so include: /usr/local/include
-- libsndfile found.
-- Found gflags (include: /usr/include, library: /usr/lib/x86_64-linux-gnu/libgflags.so)
-- GFLAGS found
-- Looking for KenLM
-- Using kenlm library found in /root/kenlm/build/lib/libkenlm.a
-- Using kenlm utils library found in /root/kenlm/build/lib/libkenlm.a
-- kenlm lm/model.hh found in /root/kenlm/lm/model.hh
-- Found kenlm (include: /root/kenlm, library: /root/kenlm/build/lib/libkenlm.a;/root/kenlm/build/lib/libkenlm_util.a)
-- Examples: add executable interactive_streaming_asr_example
-- Examples: add executable simple_streaming_asr_example
-- Examples: add executable multithreaded_streaming_asr_example
-- Tests: add executable inference_Conv1dTest
-- Tests: add executable inference_IdentityTest
-- Tests: add executable inference_LayerNormTest
-- Tests: add executable inference_LinearTest
-- Tests: add executable inference_LogMelFeatureTest
-- Tests: add executable inference_MemoryManagerTest
-- Tests: add executable inference_ReluTest
-- Tests: add executable inference_ResidualTest
-- Tests: add executable inference_TDSBlockTest
-- Building recipes.
-- Configuring done
-- Generating done
-- Build files have been written to: /home/dcshin/wav2letter/build
- Wav2letter configuration
# data path
--datadir=
--train=/home/dcshin/news_data/lists_syllable/train-kor.lst, /home/dcshin/news_data/lists_add_syllable/train-kor.lst
--valid=/home/dcshin/news_data/lists_syllable/dev-kor.lst, /home/dcshin/news_data/lists_add_syllable/dev-kor.lst
--test=/home/dcshin/news_data/lists_syllable/test-kor.lst, /home/dcshin/news_data/lists_add_syllable/test-kor.lst
--lexicon=/home/dcshin/wav2letter/experiments/results/subword_finetuning_transformer/am/librispeech-train+dev-unigram-20000-nbest10.lexicon
--tokensdir=/home/dcshin/wav2letter/experiments/results/subword_finetuning_transformer/am
--tokens=librispeech-train-all-unigram-20000.tokens
--rundir=/home/dcshin/wav2letter
--runname=experiments/logs/subword_finetuning_transformer_seq2seq_squash
--archdir=/home/dcshin/wav2letter/experiments/arch
--arch=am_transformer_s2s_librivox.arch
--input=flac
# concurrency
--nthread=10
--enable_distributed=true
--world_size=4
# 데이터를 섞기
--noresample=true
--seed=2
# min audio duration
--minisz=1000
--minitz=3
# Additional info
# optimizer
--netoptim=adagrad
--critoptim=adagrad
--lr=0.03
--lrcrit=0.03
--lr_decay=200
--lr_decay_step=40
--adambeta1=0.95
--adambeta2=0.99
--momentum=0.0
--maxgradnorm=0.1
## learning strategy
--warmup=64000
--saug_start_update=64000
--pctteacherforcing=95
--sampletarget=0.01
## etc
--batchsize=8
--encoderdim=256
--target=ltr
--memstepsize=5000000
--onorm=target
--sqnorm=true
## decoder
--am_decoder_tr_dropout=0.1
--am_decoder_tr_layerdrop=0.1
--am_decoder_tr_layers=6
--criterion=transformer
--eostoken=true
--attention=keyvalue
--maxdecoderoutputlen=120
--attnWindow=softPretrain
--trainWithWindow=true
--pretrainWindow=3
--softwstd=4
# Seq2Seq에서 오버피팅을 막기 위한 방법, refer https://arxiv.org/pdf/1612.02695.pdf
--labelsmooth=0.05
## data group
# 데이터 그룹을 만드는 방법인듯 (오디오 길이 순서대로 정렬하는건가?), binning using audio length and spiral along reference length
--dataorder=output_spiral
--inputbinsize=25
## Feature
--filterbanks=80
## changed
--wordseparator=_
--usewordpiece=true
--pcttraineval=10
## reportiters
--reportiters=0
# please refer https://github.com/facebookresearch/wav2letter/issues/806
--rndv_filepath=
Could you first try running with a config file that contains no empty lines, and attach the full log you see on the screen?