training-operator
training-operator copied to clipboard
Flaky MXNet e2e test
TRAINING_CLIENT.create_mxjob(mxjob, job_namespace)
logging.info(f"List of created {constants.MXJOB_KIND}s")
logging.info(TRAINING_CLIENT.list_mxjobs(job_namespace))
> verify_job_e2e(
TRAINING_CLIENT,
JOB_NAME,
job_namespace,
constants.MXJOB_KIND,
CONTAINER_NAME,
)
sdk/python/test/e2e/test_e2e_mxjob.py:152:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
sdk/python/test/e2e/utils.py:36: in verify_job_e2e
client.wait_for_job_conditions(name, namespace, job_kind, timeout=timeout)
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
full log
self = <kubeflow.training.api.training_client.TrainingClient object at 0x7fe7543da910>
name = 'mxjob-mnist-ci-test', namespace = 'default', job_kind = 'MXJob'
expected_conditions = {'Succeeded'}, timeout = 600, polling_interval = 15
callback = None, apiserver_timeout = 120
def wait_for_job_conditions(
self,
name: str,
namespace: str = utils.get_default_target_namespace(),
job_kind: str = constants.TFJOB_KIND,
expected_conditions: Set = {constants.JOB_CONDITION_SUCCEEDED},
timeout: int = 600,
polling_interval: int = 15,
callback: Callable = None,
apiserver_timeout: int = constants.DEFAULT_TIMEOUT,
):
"""Wait until Training Job reaches any of the specified conditions.
By default it waits for the Succeeded condition.
Args:
name: Name for the Job.
namespace: Namespace for the Job.
job_kind: Kind for the Training job to wait for conditions.
It should be one of these: `TFJob, PyTorchJob, MXJob, XGBoostJob, MPIJob, or PaddleJob`.
expected_conditions: Set of expected conditions. It must be subset of this:
`{"Created", "Running", "Restarting", "Succeeded", "Failed"}`
timeout: How many seconds to wait until Job reaches one of
the expected conditions.
polling_interval: The polling interval in seconds to get Job status.
callback: Optional callback function that is invoked after Job
status is polled. This function takes a single argument which
is current Job object.
apiserver_timeout: Optional, Kubernetes API server timeout in seconds
to execute the request.
Returns:
object: Training Job object of type `KubeflowOrgV1TFJob, KubeflowOrgV1PyTorchJob,
KubeflowOrgV1MXJob, KubeflowOrgV1XGBoostJob, KubeflowOrgV1MPIJob, or
KubeflowOrgV1PaddleJob` which is reached required condition.
Raises:
ValueError: Expected conditions are invalid or Job kind is invalid
TimeoutError: Timeout to get Job.
RuntimeError: Failed to get Job.
"""
if not expected_conditions.issubset(constants.JOB_CONDITIONS):
raise ValueError(
f"Expected conditions: {expected_conditions} must be subset of {constants.JOB_CONDITIONS}"
)
for _ in range(round(timeout / polling_interval)):
# We should get Job only once per cycle and check the statuses.
job = utils.get_job(
custom_api=self.custom_api,
api_client=self.api_client,
name=name,
namespace=namespace,
job_model=constants.JOB_KINDS[job_kind]["model"],
job_kind=job_kind,
job_plural=constants.JOB_KINDS[job_kind]["plural"],
timeout=apiserver_timeout,
)
conditions = self.get_job_conditions(
name, namespace, job_kind, job, timeout
)
if len(conditions) > 0:
status_logger(
name, conditions[-1].type, conditions[-1].last_transition_time,
)
# Execute callback function.
if callback:
callback(job)
# Raise an exception if Job is Failed and Failed is not expected condition.
if (
constants.JOB_CONDITION_FAILED not in conditions
and utils.has_condition(conditions, constants.JOB_CONDITION_FAILED)
):
> raise RuntimeError(
f"{job_kind} {namespace}/{name} is Failed. "
f"{job_kind} conditions: {job.status.conditions}"
)
E RuntimeError: MXJob default/mxjob-mnist-ci-test is Failed. MXJob conditions: [{'last_transition_time': datetime.datetime(2023, 5, 31, 12, 26, 48, tzinfo=tzlocal()),
E 'last_update_time': datetime.datetime(2023, 5, 31, 12, 26, 48, tzinfo=tzlocal()),
E 'message': 'MXJob mxjob-mnist-ci-test is created.',
E 'reason': 'MXJobCreated',
E 'status': 'True',
E 'type': 'Created'}, {'last_transition_time': datetime.datetime(2023, 5, 31, 12, 27, 2, tzinfo=tzlocal()),
E 'last_update_time': datetime.datetime(2023, 5, 31, 12, 27, 2, tzinfo=tzlocal()),
E 'message': 'MXJob mxjob-mnist-ci-test is running.',
E 'reason': 'MXJobRunning',
E 'status': 'False',
E 'type': 'Running'}, {'last_transition_time': datetime.datetime(2023, 5, 31, 12, 28, 1, tzinfo=tzlocal()),
E 'last_update_time': datetime.datetime(2023, 5, 31, 12, 28, 1, tzinfo=tzlocal()),
E 'message': 'mxjob mxjob-mnist-ci-test is failed because 1 Worker replica(s) '
E 'failed.',
E 'reason': 'MXJobFailed',
E 'status': 'True',
E 'type': 'Failed'}]
sdk/python/kubeflow/training/api/training_client.py:378: RuntimeError
/kind kind/e2e-test-failure
@tenzen-y: The label(s) kind/kind/e2e-test-failure cannot be applied, because the repository doesn't have them.
In response to this:
/kind kind/e2e-test-failure
Instructions for interacting with me using PR comments are available here. If you have questions or suggestions related to my behavior, please file an issue against the kubernetes/test-infra repository.
/kind e2e-test-failure
This issue has been automatically marked as stale because it has not had recent activity. It will be closed if no further activity occurs. Thank you for your contributions.
/remove-lifecycle stale
/remove-lifecycle stale
There is an incident with the GitHub webhook: https://www.githubstatus.com/
This issue has been automatically marked as stale because it has not had recent activity. It will be closed if no further activity occurs. Thank you for your contributions.
/lifecycle frozen
MXJob was removed from the training-operator. /close
@tenzen-y: Closing this issue.
In response to this:
MXJob was removed from the training-operator. /close
Instructions for interacting with me using PR comments are available here. If you have questions or suggestions related to my behavior, please file an issue against the kubernetes/test-infra repository.