computervision-recipes
computervision-recipes copied to clipboard
[BUG] Error in o16n with AzureML notebooks
Description
This is the error; it looks like it is related to the deployment of the ACI and AKS resources.
.FFF. [100%]
=================================== FAILURES ===================================
_____________________________ test_21_notebook_run _____________________________
classification_notebooks = {'00_webcam': '/home/vsts/work/1/s/classification/notebooks/00_webcam.ipynb', '01_training_introduction': '/home/vsts/...3_training_accuracy_vs_speed': '/home/vsts/work/1/s/classification/notebooks/03_training_accuracy_vs_speed.ipynb', ...}
subscription_id = '***'
resource_group = 'amlnotebookrg', workspace_name = 'amlnotebookws'
workspace_region = '***2'
@pytest.mark.azuremlnotebooks
def test_21_notebook_run(
classification_notebooks,
subscription_id,
resource_group,
workspace_name,
workspace_region,
):
notebook_path = classification_notebooks[
"21_deployment_on_azure_container_instances"
]
pm.execute_notebook(
notebook_path,
OUTPUT_NOTEBOOK,
parameters=dict(
PM_VERSION=pm.__version__,
subscription_id=subscription_id,
resource_group=resource_group,
workspace_name=workspace_name,
workspace_region=workspace_region,
),
> kernel_name=KERNEL_NAME,
)
tests/smoke/test_azureml_notebooks.py:58:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
/usr/share/miniconda/envs/cv/lib/python3.6/site-packages/papermill/execute.py:108: in execute_notebook
raise_for_execution_errors(nb, output_path)
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
nb = {'cells': [{'cell_type': 'code', 'metadata': {'inputHidden': True, 'hide_input': True}, 'execution_count': None, 'sour..._time': '2019-09-24T17:35:17.380577', 'duration': 1113.334717, 'exception': True}}, 'nbformat': 4, 'nbformat_minor': 2}
output_path = 'output.ipynb'
def raise_for_execution_errors(nb, output_path):
"""Assigned parameters into the appropriate place in the input notebook
Parameters
----------
nb : NotebookNode
Executable notebook object
output_path : str
Path to write executed notebook
"""
error = None
for cell in nb.cells:
if cell.get("outputs") is None:
continue
for output in cell.outputs:
if output.output_type == "error":
error = PapermillExecutionError(
exec_count=cell.execution_count,
source=cell.source,
ename=output.ename,
evalue=output.evalue,
traceback=output.traceback,
)
break
if error:
# Write notebook back out with the Error Message at the top of the Notebook.
error_msg = ERROR_MESSAGE_TEMPLATE % str(error.exec_count)
error_msg_cell = nbformat.v4.new_code_cell(
source="%%html\n" + error_msg,
outputs=[
nbformat.v4.new_output(output_type="display_data", data={"text/html": error_msg})
],
metadata={"inputHidden": True, "hide_input": True},
)
nb.cells = [error_msg_cell] + nb.cells
write_ipynb(nb, output_path)
> raise error
E papermill.exceptions.PapermillExecutionError:
E ---------------------------------------------------------------------------
E Exception encountered at "In [26]":
E ---------------------------------------------------------------------------
E WebserviceException Traceback (most recent call last)
E /usr/share/miniconda/envs/cv/lib/python3.6/site-packages/azureml/core/webservice/webservice.py in wait_for_deployment(self, show_output)
E 511 'Error:\n'
E --> 512 '{}'.format(self.state, logs_response, error_response), logger=module_logger)
E 513 print('{} service creation operation finished, operation "{}"'.format(self._webservice_type,
E
E WebserviceException: WebserviceException:
E Message: Service deployment polling reached non-successful terminal state, current service state: Unhealthy
E More information can be found using '.get_logs()'
E Error:
E {
E "code": "AciDeploymentFailed",
E "message": "Aci Deployment failed with exception: Your container application crashed. This may be caused by errors in your scoring file's init() function.\nPlease check the logs for your container instance: im-classif-websvc. From the AML SDK, you can run print(service.get_logs()) if you have service object to fetch the logs. \nYou can also try to run image amlnotebookw04a7b513.azurecr.io/image-classif-resnet18-f48:1 locally. Please refer to http://aka.ms/debugimage#service-launch-fails for more information.",
E "details": [
E {
E "code": "CrashLoopBackOff",
E "message": "Your container application crashed. This may be caused by errors in your scoring file's init() function.\nPlease check the logs for your container instance: im-classif-websvc. From the AML SDK, you can run print(service.get_logs()) if you have service object to fetch the logs. \nYou can also try to run image amlnotebookw04a7b513.azurecr.io/image-classif-resnet18-f48:1 locally. Please refer to http://aka.ms/debugimage#service-launch-fails for more information."
E }
E ]
E }
E InnerException None
E ErrorResponse
E {
E "error": {
E "message": "Service deployment polling reached non-successful terminal state, current service state: Unhealthy\nMore information can be found using '.get_logs()'\nError:\n{\n \"code\": \"AciDeploymentFailed\",\n \"message\": \"Aci Deployment failed with exception: Your container application crashed. This may be caused by errors in your scoring file's init() function.\\nPlease check the logs for your container instance: im-classif-websvc. From the AML SDK, you can run print(service.get_logs()) if you have service object to fetch the logs. \\nYou can also try to run image amlnotebookw04a7b513.azurecr.io/image-classif-resnet18-f48:1 locally. Please refer to http://aka.ms/debugimage#service-launch-fails for more information.\",\n \"details\": [\n {\n \"code\": \"CrashLoopBackOff\",\n \"message\": \"Your container application crashed. This may be caused by errors in your scoring file's init() function.\\nPlease check the logs for your container instance: im-classif-websvc. From the AML SDK, you can run print(service.get_logs()) if you have service object to fetch the logs. \\nYou can also try to run image amlnotebookw04a7b513.azurecr.io/image-classif-resnet18-f48:1 locally. Please refer to http://aka.ms/debugimage#service-launch-fails for more information.\"\n }\n ]\n}"
E }
E }
E
E During handling of the above exception, another exception occurred:
E
E WebserviceException Traceback (most recent call last)
E <ipython-input-26-21aec20dbbb2> in <module>
E 1 # Deploy the web service
E ----> 2 service.wait_for_deployment(show_output=True)
E
E /usr/share/miniconda/envs/cv/lib/python3.6/site-packages/azureml/core/webservice/webservice.py in wait_for_deployment(self, show_output)
E 519 'Current state is {}'.format(self.state), logger=module_logger)
E 520 else:
E --> 521 raise WebserviceException(e.message, logger=module_logger)
E 522
E 523 def _wait_for_operation_to_complete(self, show_output):
E
E WebserviceException: WebserviceException:
E Message: Service deployment polling reached non-successful terminal state, current service state: Unhealthy
E More information can be found using '.get_logs()'
E Error:
E {
E "code": "AciDeploymentFailed",
E "message": "Aci Deployment failed with exception: Your container application crashed. This may be caused by errors in your scoring file's init() function.\nPlease check the logs for your container instance: im-classif-websvc. From the AML SDK, you can run print(service.get_logs()) if you have service object to fetch the logs. \nYou can also try to run image amlnotebookw04a7b513.azurecr.io/image-classif-resnet18-f48:1 locally. Please refer to http://aka.ms/debugimage#service-launch-fails for more information.",
E "details": [
E {
E "code": "CrashLoopBackOff",
E "message": "Your container application crashed. This may be caused by errors in your scoring file's init() function.\nPlease check the logs for your container instance: im-classif-websvc. From the AML SDK, you can run print(service.get_logs()) if you have service object to fetch the logs. \nYou can also try to run image amlnotebookw04a7b513.azurecr.io/image-classif-resnet18-f48:1 locally. Please refer to http://aka.ms/debugimage#service-launch-fails for more information."
E }
E ]
E }
E InnerException None
E ErrorResponse
E {
E "error": {
E "message": "Service deployment polling reached non-successful terminal state, current service state: Unhealthy\nMore information can be found using '.get_logs()'\nError:\n{\n \"code\": \"AciDeploymentFailed\",\n \"message\": \"Aci Deployment failed with exception: Your container application crashed. This may be caused by errors in your scoring file's init() function.\\nPlease check the logs for your container instance: im-classif-websvc. From the AML SDK, you can run print(service.get_logs()) if you have service object to fetch the logs. \\nYou can also try to run image amlnotebookw04a7b513.azurecr.io/image-classif-resnet18-f48:1 locally. Please refer to http://aka.ms/debugimage#service-launch-fails for more information.\",\n \"details\": [\n {\n \"code\": \"CrashLoopBackOff\",\n \"message\": \"Your container application crashed. This may be caused by errors in your scoring file's init() function.\\nPlease check the logs for your container instance: im-classif-websvc. From the AML SDK, you can run print(service.get_logs()) if you have service object to fetch the logs. \\nYou can also try to run image amlnotebookw04a7b513.azurecr.io/image-classif-resnet18-f48:1 locally. Please refer to http://aka.ms/debugimage#service-launch-fails for more information.\"\n }\n ]\n}"
E }
E }
/usr/share/miniconda/envs/cv/lib/python3.6/site-packages/papermill/execute.py:192: PapermillExecutionError
----------------------------- Captured stderr call -----------------------------
Executing: 0%| | 0/65 [00:00<?, ?cell/s]
Executing: 2%|▏ | 1/65 [00:00<01:03, 1.01cell/s]
Executing: 5%|▍ | 3/65 [00:01<00:44, 1.40cell/s]
Executing: 8%|▊ | 5/65 [00:01<00:31, 1.92cell/s]
Executing: 9%|▉ | 6/65 [00:04<01:13, 1.24s/cell]
Executing: 12%|█▏ | 8/65 [00:05<01:00, 1.07s/cell]
Executing: 15%|█▌ | 10/65 [00:05<00:43, 1.27cell/s]
Executing: 18%|█▊ | 12/65 [00:05<00:30, 1.72cell/s]
Executing: 20%|██ | 13/65 [00:06<00:22, 2.26cell/s]
Executing: 23%|██▎ | 15/65 [00:07<00:23, 2.15cell/s]
Executing: 26%|██▌ | 17/65 [00:07<00:16, 2.87cell/s]
Executing: 28%|██▊ | 18/65 [00:13<01:34, 2.00s/cell]
Executing: 31%|███ | 20/65 [00:13<01:04, 1.43s/cell]
Executing: 32%|███▏ | 21/65 [00:15<01:06, 1.51s/cell]
Executing: 35%|███▌ | 23/65 [00:15<00:45, 1.08s/cell]
Executing: 37%|███▋ | 24/65 [00:16<00:45, 1.11s/cell]
Executing: 38%|███▊ | 25/65 [00:18<00:54, 1.37s/cell]
Executing: 42%|████▏ | 27/65 [00:18<00:37, 1.01cell/s]
Executing: 43%|████▎ | 28/65 [00:20<00:50, 1.37s/cell]
Executing: 45%|████▍ | 29/65 [00:21<00:38, 1.07s/cell]
Executing: 48%|████▊ | 31/65 [00:22<00:33, 1.01cell/s]
Executing: 51%|█████ | 33/65 [00:22<00:22, 1.39cell/s]
Executing: 52%|█████▏ | 34/65 [00:23<00:19, 1.61cell/s]
Executing: 54%|█████▍ | 35/65 [00:23<00:14, 2.11cell/s]
Executing: 57%|█████▋ | 37/65 [00:23<00:10, 2.76cell/s]
Executing: 58%|█████▊ | 38/65 [00:23<00:07, 3.41cell/s]
Executing: 62%|██████▏ | 40/65 [00:24<00:05, 4.18cell/s]
Executing: 65%|██████▍ | 42/65 [00:24<00:04, 5.02cell/s]
Executing: 66%|██████▌ | 43/65 [00:32<00:59, 2.70s/cell]
Executing: 68%|██████▊ | 44/65 [11:52<1:12:00, 205.75s/cell]
Executing: 69%|██████▉ | 45/65 [11:52<48:01, 144.08s/cell]
Executing: 71%|███████ | 46/65 [11:53<32:00, 101.08s/cell]
Executing: 72%|███████▏ | 47/65 [11:53<21:14, 70.80s/cell]
Executing: 74%|███████▍ | 48/65 [11:53<14:03, 49.63s/cell]
Executing: 75%|███████▌ | 49/65 [11:53<09:16, 34.79s/cell]
Executing: 77%|███████▋ | 50/65 [11:53<06:05, 24.40s/cell]
Executing: 78%|███████▊ | 51/65 [11:56<04:07, 17.70s/cell]
Executing: 80%|████████ | 52/65 [11:56<02:41, 12.44s/cell]
Executing: 82%|████████▏ | 53/65 [18:32<25:30, 127.58s/cell]
Executing: 82%|████████▏ | 53/65 [18:33<04:12, 21.01s/cell]
_____________________________ test_22_notebook_run _____________________________
classification_notebooks = {'00_webcam': '/home/vsts/work/1/s/classification/notebooks/00_webcam.ipynb', '01_training_introduction': '/home/vsts/...3_training_accuracy_vs_speed': '/home/vsts/work/1/s/classification/notebooks/03_training_accuracy_vs_speed.ipynb', ...}
subscription_id = '***'
resource_group = 'amlnotebookrg', workspace_name = 'amlnotebookws'
workspace_region = '***2'
@pytest.mark.azuremlnotebooks
def test_22_notebook_run(
classification_notebooks,
subscription_id,
resource_group,
workspace_name,
workspace_region,
):
notebook_path = classification_notebooks[
"22_deployment_on_azure_kubernetes_service"
]
pm.execute_notebook(
notebook_path,
OUTPUT_NOTEBOOK,
parameters=dict(
PM_VERSION=pm.__version__,
subscription_id=subscription_id,
resource_group=resource_group,
workspace_name=workspace_name,
workspace_region=workspace_region,
),
> kernel_name=KERNEL_NAME,
)
tests/smoke/test_azureml_notebooks.py:83:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
/usr/share/miniconda/envs/cv/lib/python3.6/site-packages/papermill/execute.py:108: in execute_notebook
raise_for_execution_errors(nb, output_path)
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
nb = {'cells': [{'cell_type': 'code', 'metadata': {'inputHidden': True, 'hide_input': True}, 'execution_count': None, 'sour..._time': '2019-09-24T17:58:40.389449', 'duration': 1402.445046, 'exception': True}}, 'nbformat': 4, 'nbformat_minor': 2}
output_path = 'output.ipynb'
def raise_for_execution_errors(nb, output_path):
"""Assigned parameters into the appropriate place in the input notebook
Parameters
----------
nb : NotebookNode
Executable notebook object
output_path : str
Path to write executed notebook
"""
error = None
for cell in nb.cells:
if cell.get("outputs") is None:
continue
for output in cell.outputs:
if output.output_type == "error":
error = PapermillExecutionError(
exec_count=cell.execution_count,
source=cell.source,
ename=output.ename,
evalue=output.evalue,
traceback=output.traceback,
)
break
if error:
# Write notebook back out with the Error Message at the top of the Notebook.
error_msg = ERROR_MESSAGE_TEMPLATE % str(error.exec_count)
error_msg_cell = nbformat.v4.new_code_cell(
source="%%html\n" + error_msg,
outputs=[
nbformat.v4.new_output(output_type="display_data", data={"text/html": error_msg})
],
metadata={"inputHidden": True, "hide_input": True},
)
nb.cells = [error_msg_cell] + nb.cells
write_ipynb(nb, output_path)
> raise error
E papermill.exceptions.PapermillExecutionError:
E ---------------------------------------------------------------------------
E Exception encountered at "In [12]":
E ---------------------------------------------------------------------------
E WebserviceException Traceback (most recent call last)
E /usr/share/miniconda/envs/cv/lib/python3.6/site-packages/azureml/core/webservice/webservice.py in wait_for_deployment(self, show_output)
E 511 'Error:\n'
E --> 512 '{}'.format(self.state, logs_response, error_response), logger=module_logger)
E 513 print('{} service creation operation finished, operation "{}"'.format(self._webservice_type,
E
E WebserviceException: WebserviceException:
E Message: Service deployment polling reached non-successful terminal state, current service state: Failed
E More information can be found using '.get_logs()'
E Error:
E {
E "code": "KubernetesDeploymentFailed",
E "statusCode": 400,
E "message": "Kubernetes Deployment failed",
E "details": [
E {
E "code": "CrashLoopBackOff",
E "message": "Your container application crashed. This may be caused by errors in your scoring file's init() function.\nPlease check the logs for your container instance: aks-cpu-image-classif-web-svc. From the AML SDK, you can run print(service.get_logs()) if you have service object to fetch the logs. \nYou can also try to run image amlnotebookw04a7b513.azurecr.io/image-classif-resnet18-f48:1 locally. Please refer to http://aka.ms/debugimage#service-launch-fails for more information."
E }
E ]
E }
E InnerException None
E ErrorResponse
E {
E "error": {
E "message": "Service deployment polling reached non-successful terminal state, current service state: Failed\nMore information can be found using '.get_logs()'\nError:\n{\n \"code\": \"KubernetesDeploymentFailed\",\n \"statusCode\": 400,\n \"message\": \"Kubernetes Deployment failed\",\n \"details\": [\n {\n \"code\": \"CrashLoopBackOff\",\n \"message\": \"Your container application crashed. This may be caused by errors in your scoring file's init() function.\\nPlease check the logs for your container instance: aks-cpu-image-classif-web-svc. From the AML SDK, you can run print(service.get_logs()) if you have service object to fetch the logs. \\nYou can also try to run image amlnotebookw04a7b513.azurecr.io/image-classif-resnet18-f48:1 locally. Please refer to http://aka.ms/debugimage#service-launch-fails for more information.\"\n }\n ]\n}"
E }
E }
E
E During handling of the above exception, another exception occurred:
E
E WebserviceException Traceback (most recent call last)
E <ipython-input-12-ea5338712650> in <module>
E 8 deployment_target = aks_target
E 9 )
E ---> 10 aks_service.wait_for_deployment(show_output = True)
E 11 print(f"The web service is {aks_service.state}")
E 12 else:
E
E /usr/share/miniconda/envs/cv/lib/python3.6/site-packages/azureml/core/webservice/webservice.py in wait_for_deployment(self, show_output)
E 519 'Current state is {}'.format(self.state), logger=module_logger)
E 520 else:
E --> 521 raise WebserviceException(e.message, logger=module_logger)
E 522
E 523 def _wait_for_operation_to_complete(self, show_output):
E
E WebserviceException: WebserviceException:
E Message: Service deployment polling reached non-successful terminal state, current service state: Failed
E More information can be found using '.get_logs()'
E Error:
E {
E "code": "KubernetesDeploymentFailed",
E "statusCode": 400,
E "message": "Kubernetes Deployment failed",
E "details": [
E {
E "code": "CrashLoopBackOff",
E "message": "Your container application crashed. This may be caused by errors in your scoring file's init() function.\nPlease check the logs for your container instance: aks-cpu-image-classif-web-svc. From the AML SDK, you can run print(service.get_logs()) if you have service object to fetch the logs. \nYou can also try to run image amlnotebookw04a7b513.azurecr.io/image-classif-resnet18-f48:1 locally. Please refer to http://aka.ms/debugimage#service-launch-fails for more information."
E }
E ]
E }
E InnerException None
E ErrorResponse
E {
E "error": {
E "message": "Service deployment polling reached non-successful terminal state, current service state: Failed\nMore information can be found using '.get_logs()'\nError:\n{\n \"code\": \"KubernetesDeploymentFailed\",\n \"statusCode\": 400,\n \"message\": \"Kubernetes Deployment failed\",\n \"details\": [\n {\n \"code\": \"CrashLoopBackOff\",\n \"message\": \"Your container application crashed. This may be caused by errors in your scoring file's init() function.\\nPlease check the logs for your container instance: aks-cpu-image-classif-web-svc. From the AML SDK, you can run print(service.get_logs()) if you have service object to fetch the logs. \\nYou can also try to run image amlnotebookw04a7b513.azurecr.io/image-classif-resnet18-f48:1 locally. Please refer to http://aka.ms/debugimage#service-launch-fails for more information.\"\n }\n ]\n}"
E }
E }
/usr/share/miniconda/envs/cv/lib/python3.6/site-packages/papermill/execute.py:192: PapermillExecutionError
FYI @PatrickBue @jiata, any idea what could be happening?
In which platform does it happen?
How do we replicate the issue?
Expected behavior (i.e. solution)
Other Comments
Link to error: https://dev.azure.com/best-practices/computervision/_build/results?buildId=10005