Dragonfly2 icon indicating copy to clipboard operation
Dragonfly2 copied to clipboard

preheating the large image failed

Open zsksy123 opened this issue 2 years ago • 4 comments

Image name: uhub.service.ucloud.cn/openbayes_algopub/inference_llm:0.0.2. Preheating this large image failed. The job details are as follows:

{
  "id": 2,
  "created_at": "2024-01-03T08:39:12Z",
  "updated_at": "2024-01-03T08:44:41Z",
  "is_del": 0,
  "task_id": "group_48ad15e5-5712-4e22-a8ef-ec4a2697e824",
  "bio": "",
  "type": "preheat",
  "state": "FAILURE",
  "args": {
    "filter": "Expires&Signature",
    "headers": null,
    "password": "Signcl2013&&",
    "platform": "",
    "tag": "",
    "type": "image",
    "url": "https://uhub.service.ucloud.cn/v2/openbayesruntimes/pytorch/manifests/1.8.2-py38-cu111.87",
    "username": "[email protected]"
  },
  "result": {
    "CreatedAt": "2024-01-03T08:39:12.23073062Z",
    "GroupUUID": "group_48ad15e5-5712-4e22-a8ef-ec4a2697e824",
    "JobStates": [
      {
        "CreatedAt": "2024-01-03T08:39:12.223751052Z",
        "Error": "",
        "Results": [],
        "State": "SUCCESS",
        "TTL": 0,
        "TaskName": "preheat",
        "TaskUUID": "task_5cd1395a-b9d1-4498-9040-2ec07f0f7f3d"
      },
      {
        "CreatedAt": "2024-01-03T08:39:12.224361629Z",
        "Error": "",
        "Results": [],
        "State": "SUCCESS",
        "TTL": 0,
        "TaskName": "preheat",
        "TaskUUID": "task_58bc8c0d-39b2-476c-9d04-fad017b72580"
      },
      {
        "CreatedAt": "2024-01-03T08:39:12.224919897Z",
        "Error": "",
        "Results": [],
        "State": "SUCCESS",
        "TTL": 0,
        "TaskName": "preheat",
        "TaskUUID": "task_fe297a79-25fe-4a3a-aba8-5b0922228a9f"
      },
      {
        "CreatedAt": "2024-01-03T08:39:12.225476452Z",
        "Error": "",
        "Results": [],
        "State": "SUCCESS",
        "TTL": 0,
        "TaskName": "preheat",
        "TaskUUID": "task_2f36ff98-fac6-4830-8725-2497ef5deb74"
      },
      {
        "CreatedAt": "2024-01-03T08:39:12.226030041Z",
        "Error": "",
        "Results": [],
        "State": "SUCCESS",
        "TTL": 0,
        "TaskName": "preheat",
        "TaskUUID": "task_52ac4054-bda3-421b-8610-7de7626486c8"
      },
      {
        "CreatedAt": "2024-01-03T08:39:12.226605061Z",
        "Error": "",
        "Results": [],
        "State": "SUCCESS",
        "TTL": 0,
        "TaskName": "preheat",
        "TaskUUID": "task_e3f7f235-1e1b-4a43-b846-0008ff2118d8"
      },
      {
        "CreatedAt": "2024-01-03T08:39:12.227212191Z",
        "Error": "",
        "Results": null,
        "State": "STARTED",
        "TTL": 0,
        "TaskName": "preheat",
        "TaskUUID": "task_5557a0ca-075b-49f3-9774-ce162c9374f1"
      },
      {
        "CreatedAt": "2024-01-03T08:39:12.227804334Z",
        "Error": "",
        "Results": [],
        "State": "SUCCESS",
        "TTL": 0,
        "TaskName": "preheat",
        "TaskUUID": "task_f2a36e18-967d-40da-b517-aabc7fc89569"
      },
      {
        "CreatedAt": "2024-01-03T08:39:12.228397177Z",
        "Error": "",
        "Results": null,
        "State": "STARTED",
        "TTL": 0,
        "TaskName": "preheat",
        "TaskUUID": "task_5a108e43-70c2-40a8-b27c-bee704fc5e76"
      },
      {
        "CreatedAt": "2024-01-03T08:39:12.228908236Z",
        "Error": "",
        "Results": [],
        "State": "SUCCESS",
        "TTL": 0,
        "TaskName": "preheat",
        "TaskUUID": "task_d7527787-14c8-4f98-b476-fde4b59a6571"
      },
      {
        "CreatedAt": "2024-01-03T08:39:12.22931687Z",
        "Error": "",
        "Results": null,
        "State": "STARTED",
        "TTL": 0,
        "TaskName": "preheat",
        "TaskUUID": "task_66d99522-943b-4966-b9b7-1e97a4cba588"
      },
      {
        "CreatedAt": "2024-01-03T08:39:12.229646224Z",
        "Error": "",
        "Results": [],
        "State": "SUCCESS",
        "TTL": 0,
        "TaskName": "preheat",
        "TaskUUID": "task_1d3b0598-3377-4091-93a4-7d4871c9fc9a"
      },
      {
        "CreatedAt": "2024-01-03T08:39:12.229932206Z",
        "Error": "",
        "Results": [],
        "State": "SUCCESS",
        "TTL": 0,
        "TaskName": "preheat",
        "TaskUUID": "task_42351c5c-5c3d-473b-b66f-fc80fb08a498"
      },
      {
        "CreatedAt": "2024-01-03T08:39:12.230190035Z",
        "Error": "",
        "Results": [],
        "State": "SUCCESS",
        "TTL": 0,
        "TaskName": "preheat",
        "TaskUUID": "task_551d39e1-ed35-47ef-816f-aaad7e304dc2"
      },
      {
        "CreatedAt": "2024-01-03T08:39:12.230464886Z",
        "Error": "",
        "Results": null,
        "State": "STARTED",
        "TTL": 0,
        "TaskName": "preheat",
        "TaskUUID": "task_0a4ccc77-a007-48e7-8162-558e603157c6"
      },
      {
        "CreatedAt": "2024-01-03T08:39:12.23073062Z",
        "Error": "rpc error: code = Internal desc = seed task failed: peer task failed: 4000/unexpected EOF",
        "Results": null,
        "State": "FAILURE",
        "TTL": 0,
        "TaskName": "preheat",
        "TaskUUID": "task_6a0475bb-9b0c-41bf-9f9e-234b7f876b32"
      },
      {
        "CreatedAt": "2024-01-03T08:39:12.230986995Z",
        "Error": "",
        "Results": [],
        "State": "SUCCESS",
        "TTL": 0,
        "TaskName": "preheat",
        "TaskUUID": "task_018430f5-ff34-424a-9f14-f1ee94430010"
      },
      {
        "CreatedAt": "2024-01-03T08:39:12.231250765Z",
        "Error": "",
        "Results": [],
        "State": "SUCCESS",
        "TTL": 0,
        "TaskName": "preheat",
        "TaskUUID": "task_93d2c8f5-4325-46c4-8cd9-1e4687dfb264"
      },
      {
        "CreatedAt": "2024-01-03T08:39:12.231516108Z",
        "Error": "",
        "Results": [],
        "State": "SUCCESS",
        "TTL": 0,
        "TaskName": "preheat",
        "TaskUUID": "task_59049d63-0520-4d27-8f59-7859f8cac2bf"
      },
      {
        "CreatedAt": "2024-01-03T08:39:12.231793444Z",
        "Error": "",
        "Results": [],
        "State": "SUCCESS",
        "TTL": 0,
        "TaskName": "preheat",
        "TaskUUID": "task_8ade9127-1979-457e-bbf1-1d2dbefe80ff"
      },
      {
        "CreatedAt": "2024-01-03T08:39:12.232053527Z",
        "Error": "",
        "Results": [],
        "State": "SUCCESS",
        "TTL": 0,
        "TaskName": "preheat",
        "TaskUUID": "task_eb84e69d-fb8a-4220-8667-8320a1de6635"
      },
      {
        "CreatedAt": "2024-01-03T08:39:12.23230886Z",
        "Error": "",
        "Results": [],
        "State": "SUCCESS",
        "TTL": 0,
        "TaskName": "preheat",
        "TaskUUID": "task_afabfacd-57d0-4266-86e7-f722e4a90c18"
      },
      {
        "CreatedAt": "2024-01-03T08:39:12.232578281Z",
        "Error": "",
        "Results": [],
        "State": "SUCCESS",
        "TTL": 0,
        "TaskName": "preheat",
        "TaskUUID": "task_dcbebd6a-62d2-4699-a7ab-dd1fd03b4043"
      },
      {
        "CreatedAt": "2024-01-03T08:39:12.232823356Z",
        "Error": "",
        "Results": [],
        "State": "SUCCESS",
        "TTL": 0,
        "TaskName": "preheat",
        "TaskUUID": "task_279b90e7-bf08-4f5d-a66a-ed925265f1fc"
      },
      {
        "CreatedAt": "2024-01-03T08:39:12.233085663Z",
        "Error": "",
        "Results": [],
        "State": "SUCCESS",
        "TTL": 0,
        "TaskName": "preheat",
        "TaskUUID": "task_2f455382-d544-41a7-9f1b-eb14388c7e24"
      }
    ],
    "State": "FAILURE"
  },
  "user_id": 0,
  "user": {
    "id": 0,
    "created_at": "0001-01-01T00:00:00Z",
    "updated_at": "0001-01-01T00:00:00Z",
    "is_del": 0,
    "email": "",
    "name": "",
    "avatar": "",
    "phone": "",
    "state": "",
    "location": "",
    "bio": "",
    "configs": null
  },
  "seed_peer_clusters": [],
  "scheduler_clusters": [
    {
      "id": 1,
      "created_at": "2024-01-03T08:13:15Z",
      "updated_at": "2024-01-03T08:13:15Z",
      "is_del": 0,
      "name": "cluster-1",
      "bio": "",
      "config": {
        "candidate_parent_limit": 4,
        "filter_parent_limit": 40
      },
      "client_config": {
        "concurrent_piece_count": 4,
        "load_limit": 50
      },
      "scopes": {},
      "is_default": true,
      "seed_peer_clusters": null,
      "schedulers": null,
      "peers": null,
      "jobs": null
    }
  ]
}

zsksy123 avatar Jan 03 '24 08:01 zsksy123

The following are all the manager logs:

manager core.log core.log manager gin.log gin.log manager grpc.log grpc.log manager stderr.log stderr.log manager stdout.log stdout.log

The following are all the scheduler logs:

core.log gc.log grpc.log job.log stderr.log stdout.log

The following are all the seedPeer logs: core.log gin.log grpc.log stderr.log stdout.log

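I filtered the dfdaemon logs for errors and found none (caveat: the loop below passes the hard-coded pod name dragonfly-dfdaemon-2zb6g to `kl exec` instead of "$i", so only that one pod's core.log was actually searched on every iteration):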

for i in `kl get pod|grep dfdaemon|awk '{print $1}'`;do echo "$i error log";kl exec -it dragonfly-dfdaemon-2zb6g cat  /var/log/dragonfly/daemon/core.log|grep -i error;done
dragonfly-dfdaemon-2zb6g error log
kubectl exec [POD] [COMMAND] is DEPRECATED and will be removed in a future version. Use kubectl exec [POD] -- [COMMAND] instead.
Defaulted container "dfdaemon" out of: dfdaemon, wait-for-scheduler (init), mount-netns (init), update-docker-config (init)
dragonfly-dfdaemon-4c5tm error log
kubectl exec [POD] [COMMAND] is DEPRECATED and will be removed in a future version. Use kubectl exec [POD] -- [COMMAND] instead.
Defaulted container "dfdaemon" out of: dfdaemon, wait-for-scheduler (init), mount-netns (init), update-docker-config (init)
dragonfly-dfdaemon-59jpj error log
kubectl exec [POD] [COMMAND] is DEPRECATED and will be removed in a future version. Use kubectl exec [POD] -- [COMMAND] instead.
Defaulted container "dfdaemon" out of: dfdaemon, wait-for-scheduler (init), mount-netns (init), update-docker-config (init)
dragonfly-dfdaemon-86t88 error log
kubectl exec [POD] [COMMAND] is DEPRECATED and will be removed in a future version. Use kubectl exec [POD] -- [COMMAND] instead.
Defaulted container "dfdaemon" out of: dfdaemon, wait-for-scheduler (init), mount-netns (init), update-docker-config (init)
dragonfly-dfdaemon-b5wq2 error log
kubectl exec [POD] [COMMAND] is DEPRECATED and will be removed in a future version. Use kubectl exec [POD] -- [COMMAND] instead.
Defaulted container "dfdaemon" out of: dfdaemon, wait-for-scheduler (init), mount-netns (init), update-docker-config (init)
dragonfly-dfdaemon-bgc78 error log
kubectl exec [POD] [COMMAND] is DEPRECATED and will be removed in a future version. Use kubectl exec [POD] -- [COMMAND] instead.
Defaulted container "dfdaemon" out of: dfdaemon, wait-for-scheduler (init), mount-netns (init), update-docker-config (init)
dragonfly-dfdaemon-brw29 error log
kubectl exec [POD] [COMMAND] is DEPRECATED and will be removed in a future version. Use kubectl exec [POD] -- [COMMAND] instead.
Defaulted container "dfdaemon" out of: dfdaemon, wait-for-scheduler (init), mount-netns (init), update-docker-config (init)
dragonfly-dfdaemon-bx8fx error log
kubectl exec [POD] [COMMAND] is DEPRECATED and will be removed in a future version. Use kubectl exec [POD] -- [COMMAND] instead.
Defaulted container "dfdaemon" out of: dfdaemon, wait-for-scheduler (init), mount-netns (init), update-docker-config (init)
dragonfly-dfdaemon-c75fm error log
kubectl exec [POD] [COMMAND] is DEPRECATED and will be removed in a future version. Use kubectl exec [POD] -- [COMMAND] instead.
Defaulted container "dfdaemon" out of: dfdaemon, wait-for-scheduler (init), mount-netns (init), update-docker-config (init)
dragonfly-dfdaemon-cffmb error log
kubectl exec [POD] [COMMAND] is DEPRECATED and will be removed in a future version. Use kubectl exec [POD] -- [COMMAND] instead.
Defaulted container "dfdaemon" out of: dfdaemon, wait-for-scheduler (init), mount-netns (init), update-docker-config (init)
dragonfly-dfdaemon-cqvqv error log
kubectl exec [POD] [COMMAND] is DEPRECATED and will be removed in a future version. Use kubectl exec [POD] -- [COMMAND] instead.
Defaulted container "dfdaemon" out of: dfdaemon, wait-for-scheduler (init), mount-netns (init), update-docker-config (init)
dragonfly-dfdaemon-dzwvt error log
kubectl exec [POD] [COMMAND] is DEPRECATED and will be removed in a future version. Use kubectl exec [POD] -- [COMMAND] instead.
Defaulted container "dfdaemon" out of: dfdaemon, wait-for-scheduler (init), mount-netns (init), update-docker-config (init)
dragonfly-dfdaemon-flwkk error log
kubectl exec [POD] [COMMAND] is DEPRECATED and will be removed in a future version. Use kubectl exec [POD] -- [COMMAND] instead.
Defaulted container "dfdaemon" out of: dfdaemon, wait-for-scheduler (init), mount-netns (init), update-docker-config (init)
dragonfly-dfdaemon-fq4rf error log
kubectl exec [POD] [COMMAND] is DEPRECATED and will be removed in a future version. Use kubectl exec [POD] -- [COMMAND] instead.
Defaulted container "dfdaemon" out of: dfdaemon, wait-for-scheduler (init), mount-netns (init), update-docker-config (init)
dragonfly-dfdaemon-h5tg6 error log
kubectl exec [POD] [COMMAND] is DEPRECATED and will be removed in a future version. Use kubectl exec [POD] -- [COMMAND] instead.
Defaulted container "dfdaemon" out of: dfdaemon, wait-for-scheduler (init), mount-netns (init), update-docker-config (init)
dragonfly-dfdaemon-hj2mx error log
kubectl exec [POD] [COMMAND] is DEPRECATED and will be removed in a future version. Use kubectl exec [POD] -- [COMMAND] instead.
Defaulted container "dfdaemon" out of: dfdaemon, wait-for-scheduler (init), mount-netns (init), update-docker-config (init)
dragonfly-dfdaemon-hnxt8 error log
kubectl exec [POD] [COMMAND] is DEPRECATED and will be removed in a future version. Use kubectl exec [POD] -- [COMMAND] instead.
Defaulted container "dfdaemon" out of: dfdaemon, wait-for-scheduler (init), mount-netns (init), update-docker-config (init)
dragonfly-dfdaemon-k2trd error log
kubectl exec [POD] [COMMAND] is DEPRECATED and will be removed in a future version. Use kubectl exec [POD] -- [COMMAND] instead.
Defaulted container "dfdaemon" out of: dfdaemon, wait-for-scheduler (init), mount-netns (init), update-docker-config (init)
dragonfly-dfdaemon-kqxgt error log
kubectl exec [POD] [COMMAND] is DEPRECATED and will be removed in a future version. Use kubectl exec [POD] -- [COMMAND] instead.
Defaulted container "dfdaemon" out of: dfdaemon, wait-for-scheduler (init), mount-netns (init), update-docker-config (init)
dragonfly-dfdaemon-l4v9l error log
kubectl exec [POD] [COMMAND] is DEPRECATED and will be removed in a future version. Use kubectl exec [POD] -- [COMMAND] instead.
Defaulted container "dfdaemon" out of: dfdaemon, wait-for-scheduler (init), mount-netns (init), update-docker-config (init)
dragonfly-dfdaemon-ltpl9 error log
kubectl exec [POD] [COMMAND] is DEPRECATED and will be removed in a future version. Use kubectl exec [POD] -- [COMMAND] instead.
Defaulted container "dfdaemon" out of: dfdaemon, wait-for-scheduler (init), mount-netns (init), update-docker-config (init)
dragonfly-dfdaemon-mfvrj error log
kubectl exec [POD] [COMMAND] is DEPRECATED and will be removed in a future version. Use kubectl exec [POD] -- [COMMAND] instead.
Defaulted container "dfdaemon" out of: dfdaemon, wait-for-scheduler (init), mount-netns (init), update-docker-config (init)
dragonfly-dfdaemon-mlfph error log
kubectl exec [POD] [COMMAND] is DEPRECATED and will be removed in a future version. Use kubectl exec [POD] -- [COMMAND] instead.
Defaulted container "dfdaemon" out of: dfdaemon, wait-for-scheduler (init), mount-netns (init), update-docker-config (init)
dragonfly-dfdaemon-pf6ll error log
kubectl exec [POD] [COMMAND] is DEPRECATED and will be removed in a future version. Use kubectl exec [POD] -- [COMMAND] instead.
Defaulted container "dfdaemon" out of: dfdaemon, wait-for-scheduler (init), mount-netns (init), update-docker-config (init)
dragonfly-dfdaemon-plgt4 error log
kubectl exec [POD] [COMMAND] is DEPRECATED and will be removed in a future version. Use kubectl exec [POD] -- [COMMAND] instead.
Defaulted container "dfdaemon" out of: dfdaemon, wait-for-scheduler (init), mount-netns (init), update-docker-config (init)
dragonfly-dfdaemon-rxxb8 error log
kubectl exec [POD] [COMMAND] is DEPRECATED and will be removed in a future version. Use kubectl exec [POD] -- [COMMAND] instead.
Defaulted container "dfdaemon" out of: dfdaemon, wait-for-scheduler (init), mount-netns (init), update-docker-config (init)
dragonfly-dfdaemon-sswg2 error log
kubectl exec [POD] [COMMAND] is DEPRECATED and will be removed in a future version. Use kubectl exec [POD] -- [COMMAND] instead.
Defaulted container "dfdaemon" out of: dfdaemon, wait-for-scheduler (init), mount-netns (init), update-docker-config (init)
dragonfly-dfdaemon-wlg5c error log
kubectl exec [POD] [COMMAND] is DEPRECATED and will be removed in a future version. Use kubectl exec [POD] -- [COMMAND] instead.
Defaulted container "dfdaemon" out of: dfdaemon, wait-for-scheduler (init), mount-netns (init), update-docker-config (init)
dragonfly-dfdaemon-x6bs8 error log
kubectl exec [POD] [COMMAND] is DEPRECATED and will be removed in a future version. Use kubectl exec [POD] -- [COMMAND] instead.
Defaulted container "dfdaemon" out of: dfdaemon, wait-for-scheduler (init), mount-netns (init), update-docker-config (init)
dragonfly-dfdaemon-xds4g error log
kubectl exec [POD] [COMMAND] is DEPRECATED and will be removed in a future version. Use kubectl exec [POD] -- [COMMAND] instead.
Defaulted container "dfdaemon" out of: dfdaemon, wait-for-scheduler (init), mount-netns (init), update-docker-config (init)
dragonfly-dfdaemon-z6s8f error log
kubectl exec [POD] [COMMAND] is DEPRECATED and will be removed in a future version. Use kubectl exec [POD] -- [COMMAND] instead.
Defaulted container "dfdaemon" out of: dfdaemon, wait-for-scheduler (init), mount-netns (init), update-docker-config (init)
dragonfly-dfdaemon-zhb5z error log
kubectl exec [POD] [COMMAND] is DEPRECATED and will be removed in a future version. Use kubectl exec [POD] -- [COMMAND] instead.
Defaulted container "dfdaemon" out of: dfdaemon, wait-for-scheduler (init), mount-netns (init), update-docker-config (init)
dragonfly-dfdaemon-zlfct error log
kubectl exec [POD] [COMMAND] is DEPRECATED and will be removed in a future version. Use kubectl exec [POD] -- [COMMAND] instead.
Defaulted container "dfdaemon" out of: dfdaemon, wait-for-scheduler (init), mount-netns (init), update-docker-config (init)

zsksy123 avatar Jan 03 '24 09:01 zsksy123

@zsksy123 Seed Peer Logs:

The task has 97 pieces in total.

{"level":"debug","ts":"2024-01-03 08:38:50.319","caller":"storage/local_storage.go:234","msg":"update total pieces: 97","task":"1240b3604ff90029979f795456360e221597801bafb1541cc733f77a1031483a","peer":"10.96.23.232-1-200cdb76-9346-40cb-84cd-1c4c0512554b_Seed","component":"localTaskStore"}

The piece length is 15728640 bytes.

{"level":"debug","ts":"2024-01-03 08:39:08.721","caller":"storage/local_storage.go:182","msg":"wrote 15728640 bytes to file /var/lib/dragonfly/1240b3604ff90029979f795456360e221597801bafb1541cc733f77a1031483a/10.96.23.232-1-200cdb76-9346-40cb-84cd-1c4c0512554b_Seed/data, piece 1, start 15728640, length: 15728640","task":"1240b3604ff90029979f795456360e221597801bafb1541cc733f77a1031483a","peer":"10.96.23.232-1-200cdb76-9346-40cb-84cd-1c4c0512554b_Seed","component":"localTaskStore"}

While writing piece 68, after 7434240 bytes had been written, an unexpected EOF error was returned.

{"level":"error","ts":"2024-01-03 08:48:36.544","caller":"peer/piece_manager.go:292","msg":"put piece to storage failed, piece num: 68, wrote: 7434240, error: unexpected EOF","peer":"10.96.23.232-1-200cdb76-9346-40cb-84cd-1c4c0512554b_Seed","task":"1240b3604ff90029979f795456360e221597801bafb1541cc733f77a1031483a","component":"PeerTask","trace":"f1934cc823a9f6a835ed7d4deb7e5f78","stacktrace":"d7y.io/dragonfly/v2/client/daemon/peer.(*pieceManager).processPieceFromSource\n\t/go/src/d7y.io/dragonfly/v2/client/daemon/peer/piece_manager.go:292\nd7y.io/dragonfly/v2/client/daemon/peer.(*pieceManager).downloadKnownLengthSource\n\t/go/src/d7y.io/dragonfly/v2/client/daemon/peer/piece_manager.go:490\nd7y.io/dragonfly/v2/client/daemon/peer.(*pieceManager).DownloadSource\n\t/go/src/d7y.io/dragonfly/v2/client/daemon/peer/piece_manager.go:475\nd7y.io/dragonfly/v2/client/daemon/peer.(*peerTaskConductor).backSource\n\t/go/src/d7y.io/dragonfly/v2/client/daemon/peer/peertask_conductor.go:505\nd7y.io/dragonfly/v2/client/daemon/peer.(*peerTaskConductor).pullPieces\n\t/go/src/d7y.io/dragonfly/v2/client/daemon/peer/peertask_conductor.go:527"}

While piece 68 was being downloaded, the HTTP range request was interrupted, which caused the write to fail. Please check why the server connection was interrupted.

gaius-qi avatar Jan 03 '24 10:01 gaius-qi

@gaius-qi @jim3ma If you pull the image manually with the docker pull command, you will see that a 2.3G layer has to retry its download, although the pull eventually succeeds. If the connection to the uhub repository is lost, can we handle that situation in our own code?

pull-image-retry

zsksy123 avatar Jan 04 '24 03:01 zsksy123

@zsksy123 Can you add support for the feature? Thanks.

gaius-qi avatar Jan 04 '24 08:01 gaius-qi

Please use the latest version of the Rust client; see https://github.com/dragonflyoss/client.

gaius-qi avatar Apr 23 '25 10:04 gaius-qi

@gaius-qi I redeployed the latest version of Dragonfly and cleared all the data in the PVC. However, preheating the large image still failed.

Version Information

helm list
WARNING: Kubernetes configuration file is group-readable. This is insecure. Location: /Users/zld/kubeconfigs/bj.yaml
WARNING: Kubernetes configuration file is world-readable. This is insecure. Location: /Users/zld/kubeconfigs/bj.yaml
NAME     	NAMESPACE	REVISION	UPDATED                             	STATUS  	CHART           	APP VERSION
dragonfly	dragonfly	1       	2025-05-06 10:49:06.963761 +0800 CST	deployed	dragonfly-1.3.26	2.2.3-rc.0

The error log is as follows.

kl logs -f dragonfly-seed-client-0|grep -i error|head -10
Defaulted container "seed-client" out of: seed-client, wait-for-manager (init)
  2025-05-06T02:51:07.972130148+00:00  INFO  remove "/var/run/dragonfly/dfdaemon.sock" failed: No such file or directory (os error 2)
  2025-05-06T02:54:22.816472074+00:00 ERROR  copy "/var/lib/dragonfly/content/tasks/147/147989b4c7f1e0c91f0cc0bbb685e8045bd81e952e7e5ffac15b629c21e93955" failed: error decoding response body
  2025-05-06T02:54:22.816561435+00:00 ERROR  download piece finished: error decoding response body
  2025-05-06T02:54:23.322443562+00:00 ERROR  copy "/var/lib/dragonfly/content/tasks/a4e/a4e8d7583d09cfecccf23b8a6d48701182d6268f1ca692be47035c9384c07066" failed: error decoding response body
  2025-05-06T02:54:23.322444342+00:00 ERROR  copy "/var/lib/dragonfly/content/tasks/a4e/a4e8d7583d09cfecccf23b8a6d48701182d6268f1ca692be47035c9384c07066" failed: error decoding response body
  2025-05-06T02:54:23.322447512+00:00 ERROR  copy "/var/lib/dragonfly/content/tasks/a4e/a4e8d7583d09cfecccf23b8a6d48701182d6268f1ca692be47035c9384c07066" failed: error decoding response body
  2025-05-06T02:54:23.322472673+00:00 ERROR  copy "/var/lib/dragonfly/content/tasks/a4e/a4e8d7583d09cfecccf23b8a6d48701182d6268f1ca692be47035c9384c07066" failed: error decoding response body
  2025-05-06T02:54:23.322506093+00:00 ERROR  download piece finished: error decoding response body
  2025-05-06T02:54:23.322502873+00:00 ERROR  copy "/var/lib/dragonfly/content/tasks/a4e/a4e8d7583d09cfecccf23b8a6d48701182d6268f1ca692be47035c9384c07066" failed: error decoding response body
  2025-05-06T02:54:23.322552034+00:00 ERROR  copy "/var/lib/dragonfly/content/tasks/a4e/a4e8d7583d09cfecccf23b8a6d48701182d6268f1ca692be47035c9384c07066" failed: error decoding response body
......
  2025-05-06T02:58:23.360695162+00:00 ERROR  send DownloadPieceFinishedResponse for piece a4e8d7583d09cfecccf23b8a6d48701182d6268f1ca692be47035c9384c07066-260 failed: "Closed(..)"
  2025-05-06T02:58:23.374456842+00:00 ERROR  send DownloadPieceBackToSourceFinishedRequest for piece a4e8d7583d09cfecccf23b8a6d48701182d6268f1ca692be47035c9384c07066-261 failed: "Closed(..)"
  2025-05-06T02:58:23.443398947+00:00 ERROR  send DownloadPieceBackToSourceFinishedRequest for piece a4e8d7583d09cfecccf23b8a6d48701182d6268f1ca692be47035c9384c07066-260 failed: "Closed(..)"
  2025-05-06T02:58:23.674108981+00:00 ERROR  send DownloadPieceBackToSourceFinishedRequest for piece a4e8d7583d09cfecccf23b8a6d48701182d6268f1ca692be47035c9384c07066-202 failed: "Closed(..)"
  2025-05-06T02:58:23.728423126+00:00 ERROR  send DownloadPieceFinishedResponse for piece a4e8d7583d09cfecccf23b8a6d48701182d6268f1ca692be47035c9384c07066-202 failed: "Closed(..)"
  2025-05-06T02:58:23.850135267+00:00 ERROR  send DownloadPieceFinishedResponse for piece a4e8d7583d09cfecccf23b8a6d48701182d6268f1ca692be47035c9384c07066-257 failed: "Closed(..)"
  2025-05-06T02:58:23.898531282+00:00 ERROR  send DownloadPieceBackToSourceFinishedRequest for piece a4e8d7583d09cfecccf23b8a6d48701182d6268f1ca692be47035c9384c07066-257 failed: "Closed(..)"
  2025-05-06T02:58:25.347240438+00:00 ERROR  send DownloadPieceFinishedResponse for piece a4e8d7583d09cfecccf23b8a6d48701182d6268f1ca692be47035c9384c07066-240 failed: "Closed(..)"
  2025-05-06T02:58:25.427486415+00:00 ERROR  send DownloadPieceBackToSourceFinishedRequest for piece a4e8d7583d09cfecccf23b8a6d48701182d6268f1ca692be47035c9384c07066-240 failed: "Closed(..)"
  2025-05-06T02:58:25.701792348+00:00 ERROR  send DownloadPieceFinishedResponse for piece a4e8d7583d09cfecccf23b8a6d48701182d6268f1ca692be47035c9384c07066-263 failed: "Closed(..)"
  2025-05-06T02:58:25.747407881+00:00 ERROR  send DownloadPieceBackToSourceFinishedRequest for piece a4e8d7583d09cfecccf23b8a6d48701182d6268f1ca692be47035c9384c07066-263 failed: "Closed(..)"
  2025-05-06T02:58:25.821910581+00:00 ERROR  send DownloadPieceFinishedResponse for piece a4e8d7583d09cfecccf23b8a6d48701182d6268f1ca692be47035c9384c07066-239 failed: "Closed(..)"
  2025-05-06T02:58:25.907433590+00:00 ERROR  send DownloadPieceBackToSourceFinishedRequest for piece a4e8d7583d09cfecccf23b8a6d48701182d6268f1ca692be47035c9384c07066-239 failed: "Closed(..)"
  2025-05-06T02:58:25.949481891+00:00 ERROR  send DownloadPieceFinishedResponse for piece a4e8d7583d09cfecccf23b8a6d48701182d6268f1ca692be47035c9384c07066-262 failed: "Closed(..)"
  2025-05-06T02:58:25.963457064+00:00 ERROR  send DownloadPieceBackToSourceFinishedRequest for piece a4e8d7583d09cfecccf23b8a6d48701182d6268f1ca692be47035c9384c07066-262 failed: "Closed(..)"
  2025-05-06T02:58:26.758517438+00:00 ERROR  send DownloadPieceFinishedResponse for piece a4e8d7583d09cfecccf23b8a6d48701182d6268f1ca692be47035c9384c07066-241 failed: "Closed(..)"
  2025-05-06T02:58:26.790279379+00:00 ERROR  send DownloadPieceBackToSourceFinishedRequest for piece a4e8d7583d09cfecccf23b8a6d48701182d6268f1ca692be47035c9384c07066-242 failed: "Closed(..)"
  2025-05-06T02:58:26.795406938+00:00 ERROR  send DownloadPieceFinishedResponse for piece a4e8d7583d09cfecccf23b8a6d48701182d6268f1ca692be47035c9384c07066-242 failed: "Closed(..)"
  2025-05-06T02:58:26.821494243+00:00 ERROR  send DownloadPieceBackToSourceFinishedRequest for piece a4e8d7583d09cfecccf23b8a6d48701182d6268f1ca692be47035c9384c07066-241 failed: "Closed(..)"
  2025-05-06T02:58:27.273525911+00:00 ERROR  send DownloadPieceFinishedResponse for piece a4e8d7583d09cfecccf23b8a6d48701182d6268f1ca692be47035c9384c07066-266 failed: "Closed(..)"
  2025-05-06T02:58:27.315765585+00:00 ERROR  send DownloadPieceFinishedResponse for piece a4e8d7583d09cfecccf23b8a6d48701182d6268f1ca692be47035c9384c07066-264 failed: "Closed(..)"

However, after I adjusted the configuration, although there were still some errors, the preheating of the large image was successful.

seedClient:
  config:
    download:
      pieceTimeout: 300s

My question is: does an older version of the chart, such as 1.1.32, have a similar parameter? Or is the failure to preheat large images in the older version caused by a code issue rather than a configuration problem?

zsksy123 avatar May 06 '25 03:05 zsksy123