kuscia icon indicating copy to clipboard operation
kuscia copied to clipboard

Kuscia K8S RunK运行时运行SCQL任务失败

Open RotaercAH opened this issue 5 months ago • 21 comments

Issue Type

Running

Search for existing issues similar to yours

Yes

OS Platform and Distribution

CentOS Linux release 7.9.2009 (Core)

Kuscia Version

kuscia v0.10.0b0

Deployment

k8s

deployment Version

k8s v1.16.9

App Running type

scql

App Running version

0.8.1b1

Configuration file used to run kuscia.

# alice scql-image.yaml
# Kuscia AppImage for SCQL: declares the config templates (broker + engine)
# and the deploy templates (pod specs) Kuscia uses to launch SCQL components.
# Placeholders of the form {{.NAME}} are rendered by Kuscia at deploy time.
apiVersion: kuscia.secretflow/v1alpha1
kind: AppImage
metadata:
  name: scql
spec:
  # Config files mounted into the containers via configVolumeMounts below.
  # The template bodies are opaque block scalars written verbatim into the
  # container, so no comments can be added inside them without changing the
  # rendered file content.
  configTemplates:
    brokerConf: |-
      intra_server:
       protocol: http
       host: 0.0.0.0
       port: {{.KUSCIA_PORT_INTRA_NUMBER}}
      inter_server:
       port: {{.KUSCIA_PORT_INTER_NUMBER}}
       protocol: http
       cert_file: "{{.SERVER_CERT_FILE}}"
       key_file: "{{.SERVER_PRIVATE_KEY_FILE}}"
      inter_timeout: 30s
      party_code: {{.KUSCIA_DOMAIN_ID}}
      private_key_data: {{.KUSCIA_DOMAIN_KEY_DATA}}
      intra_host: "scql-broker-intra.{{.KUSCIA_DOMAIN_ID}}.svc:{{.KUSCIA_PORT_INTRA_NUMBER}}"
      discovery:
        type: kuscia
        kuscia:
          endpoint: kusciaapi:8083
          tls_mode: {{.KUSCIA_API_PROTOCOL}}
          cert: {{.CLIENT_CERT_FILE}}
          key: {{.CLIENT_PRIVATE_KEY_FILE}}
          cacert: {{.TRUSTED_CA_FILE}}
          token: {{.KUSCIA_API_TOKEN}}
      engine:
        timeout: 120s
        protocol: http
        content_type: application/json
        scheduler: kuscia
        kuscia_scheduler:
          endpoint: kusciaapi:8083
          tls_mode: {{.KUSCIA_API_PROTOCOL}}
          cert: {{.CLIENT_CERT_FILE}}
          key: {{.CLIENT_PRIVATE_KEY_FILE}}
          cacert: {{.TRUSTED_CA_FILE}}
          token: {{.KUSCIA_API_TOKEN}}
          keep_job_alive_for_debug: false
      storage:
        type: mysql
        conn_str: "kuscia:Kuscia.2024@tcp(10.2.1.4:3306)/kuscia_alice?charset=utf8mb4&parseTime=True&loc=Local&interpolateParams=true"
        max_idle_conns: 10
        max_open_conns: 100
        conn_max_idle_time: 2m
        conn_max_lifetime: 5m
    # engineConf is a gflags flag file consumed by scqlengine via --flagfile.
    # NOTE(review): --datasource_router is set twice inside it (first "embed",
    # then "kusciadatamesh"); gflags takes the last occurrence, so
    # "kusciadatamesh" is effective — confirm that is the intent and consider
    # removing the earlier "embed" line.
    # NOTE(review): brokerConf above embeds plaintext MySQL credentials in
    # conn_str — consider sourcing them from a secret rather than this file.
    engineConf: |-
      --enable_restricted_read_path=false
      --listen_port={{.KUSCIA_PORT_ENGINEPORT_NUMBER}}
      --enable_separate_link_port=true
      --link_port={{.KUSCIA_PORT_LINKPORT_NUMBER}}
      --enable_driver_authorization=false
      --datasource_router=embed
      #--embed_router_conf=xxx
      # NOTE: set --datasource_router to kusciadatamesh if use kuscia datamesh
      --datasource_router=kusciadatamesh
      --kuscia_datamesh_endpoint=datamesh:8071
      --kuscia_datamesh_client_cert_path={{.CLIENT_CERT_FILE}}
      --kuscia_datamesh_client_key_path={{.CLIENT_PRIVATE_KEY_FILE}}
      --kuscia_datamesh_cacert_path={{.TRUSTED_CA_FILE}}
      # party authentication flags
      --enable_self_auth=false
      --enable_peer_auth=false
      # https flags
      --server_enable_ssl=false
      --server_ssl_certificate=/home/admin/engine/conf/cert.pem
      --server_ssl_private_key=/home/admin/engine/conf/key.pem
      # set peer_engine_enable_ssl_as_client to true when peer SCQLEngine has https enabled
      --peer_engine_enable_ssl_as_client=false
      # set false when SCQLBroker IntraServer not enable https
      --driver_enable_ssl_as_client=false
      --peer_engine_protocol=http:proto
      --peer_engine_connection_type=pooled
      --peer_engine_load_balancer=rr
  # Pod templates Kuscia instantiates per role.
  deployTemplates:
    # broker: long-running service exposing an intra (Domain-scoped) and an
    # inter (Cluster-scoped) HTTP port; restarts on failure.
    - name: broker
      role: broker
      replicas: 1
      spec:
        affinity:
          podAntiAffinity:
            requiredDuringSchedulingIgnoredDuringExecution:
              - topologyKey: kubernetes.io/hostname
        containers:
          - command:
              - /home/admin/bin/broker
              - -config=./configs/config.yml
            configVolumeMounts:
              # brokerConf template rendered to the broker's config file.
              - mountPath: /work/configs/config.yml
                subPath: brokerConf
            name: broker
            ports:
              - name: intra
                port: 8080
                protocol: HTTP
                scope: Domain
              - name: inter
                port: 8081
                protocol: HTTP
                scope: Cluster
            readinessProbe:
              httpGet:
                path: /health
                port: intra
            workingDir: /work
        restartPolicy: Always
    # engine: per-job compute process scheduled by the broker; restartPolicy
    # Never, so it runs to completion (or failure) once per job.
    - name: engine
      role: engine
      replicas: 1
      spec:
        containers:
          - command:
              - /home/admin/bin/scqlengine
              - --flagfile=./conf/gflags.conf
            configVolumeMounts:
              # engineConf template rendered to the engine's gflags file.
              - mountPath: /work/conf/gflags.conf
                subPath: engineConf
            name: engine
            # NOTE(review): the scheduling failure reported in this issue can
            # occur when no node can satisfy these requests — verify cluster
            # capacity against cpu=1 / memory=2Gi.
            resources:
              requests:
                cpu: 1
                memory: 2Gi
            ports:
              - name: engineport
                port: 8003
                protocol: HTTP
                scope: Domain
              - name: linkport
                port: 8004
                protocol: HTTP
                scope: Cluster
            workingDir: /work
        restartPolicy: Never
  # Container image shared by both roles.
  image:
    name: "secretflow/scql"
    tag: "0.8.1b1"

What happened and what you expected to happen.

搭建好Kuscia环境后,psi测试任务成功运行,测试SCQL任务时,数据表相关接口成功执行,发布查询任务的intra/query接口报错,报错信息如下:
[root@kuscia-autonomy-alice-c798bfb4-4nsrn kuscia]# curl -X POST http://127.0.0.1:80/intra/query \
--header "host: scql-broker-intra.alice.svc" \
--header "kuscia-source: alice" \
-H "Content-Type: application/json" \
-d '{
    "project_id": "test",
    "query":"SELECT ta.credit_rank, COUNT(*) as cnt, AVG(ta.income) as avg_income, AVG(tb.order_amount) as avg_amount FROM ta INNER JOIN tb ON ta.ID = tb.ID WHERE ta.age >= 20 AND ta.age <= 30 AND tb.is_active=1 GROUP BY ta.credit_rank;"
}'
{"status":{"code":300,"message":"failed to schedule scqlengine for job 64397e60-6911-11ef-a09d-fa858fe5d678: failed to wait for kuscia job 64397e60-6911-11ef-a09d-fa858fe5d678-6384e2b2 tobe running","details":[]},"result":null}

由于任务发起失败,kubectl get kj -A 没有任务信息,/home/kuscia/var/stdout/ 目录下无pods文件夹
请问该如何排查任务执行失败的错误呢?

Kuscia log output.

#kusciaapi.log
2024-09-03 10:20:32.462 INFO interceptor/common.go:92 [GRPC] [GRPC /kuscia.proto.api.v1alpha1.kusciaapi.JobService/QueryJob] Duration: 8.053805ms, StatusCode: 0, ForwardHost: [], ContextType: [], Request: {"job_id":"0db5b866-699b-11ef-a09d-fa858fe5d678-6384e2b2"}, Response: {"data":{"custom_fields":{},"initiator":"alice","job_id":"0db5b866-699b-11ef-a09d-fa858fe5d678-6384e2b2","max_parallelism":1,"status":{"approve_status_list":[{"domain_id":"alice","state":"JobAccepted"}],"create_time":"2024-09-03T02:20:14Z","stage_status_list":[{"domain_id":"alice","state":"JobCreateStageSucceeded"}],"start_time":"2024-09-03T02:20:14Z","state":"Running","tasks":[{"task_id":"0db5b866-699b-11ef-a09d-fa858fe5d678-6384e2b2-12a359699023","state":"Pending","create_time":"2024-09-03T02:20:14Z","start_time":"2024-09-03T02:20:14Z","parties":[{"domain_id":"alice","state":"Pending","endpoints":[{"port_name":"engineport","scope":"Domain","endpoint":"svc-0db5b866-699b-11ef-a09d-fa858fe-engineport-2c040bc37b7b8411.alice.svc:26103"},{"port_name":"linkport","scope":"Cluster","endpoint":"svc-0db5b866-699b-11ef-a09d-fa858fe5d-linkport-40f1233b84e334ef.alice.svc"}]}],"alias":"Start-SCQLEngine"}]},"tasks":[{"app_image":"scql","parties":[{"domain_id":"alice","role":"engine"}],"alias":"Start-SCQLEngine","task_id":"
2024-09-03 10:20:32.471 INFO interceptor/common.go:92 [GRPC] [GRPC /kuscia.proto.api.v1alpha1.kusciaapi.JobService/DeleteJob] Duration: 8.294655ms, StatusCode: 0, ForwardHost: [], ContextType: [], Request: {"job_id":"0db5b866-699b-11ef-a09d-fa858fe5d678-6384e2b2"}, Response: {"data":{"job_id":"0db5b866-699b-11ef-a09d-fa858fe5d678-6384e2b2"},"status":{"code":0,"details":null,"message":"success"}}

RotaercAH avatar Sep 02 '24 10:09 RotaercAH