triton
triton copied to clipboard
torch._dynamo.exc.BackendCompilerFailed
I don't know what is causing this error. Everything was normal earlier in the training, and then this problem suddenly occurred. Can you help me look into it? 2024-04-20 08:27:16.276530: Epoch 600 2024-04-20 08:27:16.276754: Current learning rate: 0.00438 Traceback (most recent call last): File "/opt/conda/bin/nnUNetv2_train", line 8, in sys.exit(run_training_entry()) File "/opt/conda/lib/python3.10/site-packages/nnunetv2/run/run_training.py", line 274, in run_training_entry run_training(args.dataset_name_or_id, args.configuration, args.fold, args.tr, args.p, args.pretrained_weights, File "/opt/conda/lib/python3.10/site-packages/nnunetv2/run/run_training.py", line 210, in run_training nnunet_trainer.run_training() File "/opt/conda/lib/python3.10/site-packages/nnunetv2/training/nnUNetTrainer/nnUNetTrainer.py", line 1295, in run_training train_outputs.append(self.train_step(next(self.dataloader_train))) File "/opt/conda/lib/python3.10/site-packages/nnunetv2/training/nnUNetTrainer/nnUNetTrainer.py", line 922, in train_step output = self.network(data) File "/opt/conda/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1518, in _wrapped_call_impl return self._call_impl(*args, **kwargs) File "/opt/conda/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1527, in _call_impl return forward_call(*args, **kwargs) File "/opt/conda/lib/python3.10/site-packages/torch/_dynamo/eval_frame.py", line 328, in _fn return fn(*args, **kwargs) File "/opt/conda/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1518, in _wrapped_call_impl return self._call_impl(*args, **kwargs) File "/opt/conda/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1527, in _call_impl return forward_call(*args, **kwargs) File "/opt/conda/lib/python3.10/site-packages/torch/_dynamo/eval_frame.py", line 490, in catch_errors return callback(frame, cache_entry, hooks, frame_state) File "/opt/conda/lib/python3.10/site-packages/torch/_dynamo/convert_frame.py", line 
641, in _convert_frame result = inner_convert(frame, cache_size, hooks, frame_state) File "/opt/conda/lib/python3.10/site-packages/torch/_dynamo/convert_frame.py", line 133, in _fn return fn(*args, **kwargs) File "/opt/conda/lib/python3.10/site-packages/torch/_dynamo/convert_frame.py", line 389, in _convert_frame_assert return _compile( File "/opt/conda/lib/python3.10/site-packages/torch/_dynamo/convert_frame.py", line 569, in _compile guarded_code = compile_inner(code, one_graph, hooks, transform) File "/opt/conda/lib/python3.10/site-packages/torch/_dynamo/utils.py", line 189, in time_wrapper r = func(*args, **kwargs) File "/opt/conda/lib/python3.10/site-packages/torch/_dynamo/convert_frame.py", line 491, in compile_inner out_code = transform_code_object(code, transform) File "/opt/conda/lib/python3.10/site-packages/torch/_dynamo/bytecode_transformation.py", line 1028, in transform_code_object transformations(instructions, code_options) File "/opt/conda/lib/python3.10/site-packages/torch/_dynamo/convert_frame.py", line 458, in transform tracer.run() File "/opt/conda/lib/python3.10/site-packages/torch/_dynamo/symbolic_convert.py", line 2069, in run super().run() File "/opt/conda/lib/python3.10/site-packages/torch/_dynamo/symbolic_convert.py", line 719, in run and self.step() File "/opt/conda/lib/python3.10/site-packages/torch/_dynamo/symbolic_convert.py", line 683, in step getattr(self, inst.opname)(inst) File "/opt/conda/lib/python3.10/site-packages/torch/_dynamo/symbolic_convert.py", line 2157, in RETURN_VALUE self.output.compile_subgraph( File "/opt/conda/lib/python3.10/site-packages/torch/_dynamo/output_graph.py", line 857, in compile_subgraph self.compile_and_call_fx_graph(tx, pass2.graph_output_vars(), root) File "/opt/conda/lib/python3.10/contextlib.py", line 79, in inner return func(*args, **kwds) File "/opt/conda/lib/python3.10/site-packages/torch/_dynamo/output_graph.py", line 957, in compile_and_call_fx_graph compiled_fn = self.call_user_compiler(gm) 
File "/opt/conda/lib/python3.10/site-packages/torch/_dynamo/utils.py", line 189, in time_wrapper r = func(*args, **kwargs) File "/opt/conda/lib/python3.10/site-packages/torch/_dynamo/output_graph.py", line 1024, in call_user_compiler raise BackendCompilerFailed(self.compiler_fn, e).with_traceback( File "/opt/conda/lib/python3.10/site-packages/torch/_dynamo/output_graph.py", line 1009, in call_user_compiler compiled_fn = compiler_fn(gm, self.example_inputs()) File "/opt/conda/lib/python3.10/site-packages/torch/_dynamo/repro/after_dynamo.py", line 117, in debug_wrapper compiled_gm = compiler_fn(gm, example_inputs) File "/opt/conda/lib/python3.10/site-packages/torch/__init__.py", line 1568, in __call__ return compile_fx(model, inputs, config_patches=self.config) File "/opt/conda/lib/python3.10/site-packages/torch/_inductor/compile_fx.py", line 1150, in compile_fx return aot_autograd( File "/opt/conda/lib/python3.10/site-packages/torch/_dynamo/backends/common.py", line 55, in compiler_fn cg = aot_module_simplified(gm, example_inputs, **kwargs) File "/opt/conda/lib/python3.10/site-packages/torch/_functorch/aot_autograd.py", line 3891, in aot_module_simplified compiled_fn = create_aot_dispatcher_function( File "/opt/conda/lib/python3.10/site-packages/torch/_dynamo/utils.py", line 189, in time_wrapper r = func(*args, **kwargs) File "/opt/conda/lib/python3.10/site-packages/torch/_functorch/aot_autograd.py", line 3429, in create_aot_dispatcher_function compiled_fn = compiler_fn(flat_fn, fake_flat_args, aot_config, fw_metadata=fw_metadata) File "/opt/conda/lib/python3.10/site-packages/torch/_functorch/aot_autograd.py", line 2212, in aot_wrapper_dedupe return compiler_fn(flat_fn, leaf_flat_args, aot_config, fw_metadata=fw_metadata) File "/opt/conda/lib/python3.10/site-packages/torch/_functorch/aot_autograd.py", line 2392, in aot_wrapper_synthetic_base return compiler_fn(flat_fn, flat_args, aot_config, fw_metadata=fw_metadata) File 
"/opt/conda/lib/python3.10/site-packages/torch/_functorch/aot_autograd.py", line 2917, in aot_dispatch_autograd compiled_fw_func = aot_config.fw_compiler( File "/opt/conda/lib/python3.10/site-packages/torch/_dynamo/utils.py", line 189, in time_wrapper r = func(*args, **kwargs) File "/opt/conda/lib/python3.10/site-packages/torch/_inductor/compile_fx.py", line 1092, in fw_compiler_base return inner_compile( File "/opt/conda/lib/python3.10/site-packages/torch/_dynamo/repro/after_aot.py", line 80, in debug_wrapper inner_compiled_fn = compiler_fn(gm, example_inputs) File "/opt/conda/lib/python3.10/site-packages/torch/_inductor/debug.py", line 228, in inner return fn(*args, **kwargs) File "/opt/conda/lib/python3.10/contextlib.py", line 79, in inner return func(*args, **kwds) File "/opt/conda/lib/python3.10/site-packages/torch/_inductor/compile_fx.py", line 54, in newFunction return old_func(*args, **kwargs) File "/opt/conda/lib/python3.10/site-packages/torch/_inductor/compile_fx.py", line 341, in compile_fx_inner compiled_graph: CompiledFxGraph = fx_codegen_and_compile( File "/opt/conda/lib/python3.10/site-packages/torch/_inductor/compile_fx.py", line 565, in fx_codegen_and_compile compiled_fn = graph.compile_to_fn() File "/opt/conda/lib/python3.10/site-packages/torch/_inductor/graph.py", line 970, in compile_to_fn return self.compile_to_module().call File "/opt/conda/lib/python3.10/site-packages/torch/_dynamo/utils.py", line 189, in time_wrapper r = func(*args, **kwargs) File "/opt/conda/lib/python3.10/site-packages/torch/_inductor/graph.py", line 938, in compile_to_module code, linemap = self.codegen() File "/opt/conda/lib/python3.10/site-packages/torch/_inductor/graph.py", line 913, in codegen self.scheduler = Scheduler(self.buffers) File "/opt/conda/lib/python3.10/site-packages/torch/_dynamo/utils.py", line 189, in time_wrapper r = func(*args, **kwargs) File "/opt/conda/lib/python3.10/site-packages/torch/_inductor/scheduler.py", line 971, in __init__ self.nodes = 
[self.create_scheduler_node(n) for n in nodes] File "/opt/conda/lib/python3.10/site-packages/torch/_inductor/scheduler.py", line 971, in self.nodes = [self.create_scheduler_node(n) for n in nodes] File "/opt/conda/lib/python3.10/site-packages/torch/_inductor/scheduler.py", line 1037, in create_scheduler_node group_fn = self.get_backend(node.get_device()).group_fn File "/opt/conda/lib/python3.10/site-packages/torch/_inductor/scheduler.py", line 1642, in get_backend self.backends[device] = self.create_backend(device) File "/opt/conda/lib/python3.10/site-packages/torch/_inductor/scheduler.py", line 1634, in create_backend raise RuntimeError( torch._dynamo.exc.BackendCompilerFailed: backend='inductor' raised: RuntimeError: Cannot find a working triton installation. More information on installing Triton can be found at https://github.com/openai/triton
Set TORCH_LOGS="+dynamo" and TORCHDYNAMO_VERBOSE=1 for more information
You can suppress this exception and fall back to eager by setting: import torch._dynamo torch._dynamo.config.suppress_errors = True
Exception in thread Thread-3 (results_loop): Traceback (most recent call last): File "/opt/conda/lib/python3.10/threading.py", line 1016, in _bootstrap_inner self.run() File "/opt/conda/lib/python3.10/threading.py", line 953, in run self._target(*self._args, **self._kwargs) File "/opt/conda/lib/python3.10/site-packages/batchgenerators/dataloading/nondet_multi_threaded_augmenter.py", line 125, in results_loop raise e File "/opt/conda/lib/python3.10/site-packages/batchgenerators/dataloading/nondet_multi_threaded_augmenter.py", line 103, in results_loop raise RuntimeError("One or more background workers are no longer alive. Exiting. Please check the " RuntimeError: One or more background workers are no longer alive. Exiting. Please check the print statements above for the actual error message Exception in thread Thread-2 (results_loop): Traceback (most recent call last): File "/opt/conda/lib/python3.10/threading.py", line 1016, in _bootstrap_inner self.run() File "/opt/conda/lib/python3.10/threading.py", line 953, in run self._target(*self._args, **self._kwargs) File "/opt/conda/lib/python3.10/site-packages/batchgenerators/dataloading/nondet_multi_threaded_augmenter.py", line 125, in results_loop raise e File "/opt/conda/lib/python3.10/site-packages/batchgenerators/dataloading/nondet_multi_threaded_augmenter.py", line 103, in results_loop raise RuntimeError("One or more background workers are no longer alive. Exiting. Please check the " RuntimeError: One or more background workers are no longer alive. Exiting. Please check the print statements above for the actual error message