fission icon indicating copy to clipboard operation
fission copied to clipboard

Package auto retry rebuild

Open bigbird-0101 opened this issue 9 months ago • 1 comments

Question: When disk I/O in my environment is very high, package builds often fail. Context: I would like a failed package build to be rebuilt automatically. Specifically, I hope Fission can provide automatic build retries with a configurable time interval between attempts.

bigbird-0101 avatar May 10 '24 03:05 bigbird-0101

As a workaround, I use a shell script that retries the rebuild:

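Usage: bash rebuild.sh <namespace>   (run it with bash, not plain sh: the script uses bash-only features such as arrays and (( )) arithmetic)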
#!/bin/bash
# Retry configuration. Each value can be overridden from the environment;
# the defaults are the values the script has always used.

# Maximum number of attempts per package.
MAX_ATTEMPTS="${MAX_ATTEMPTS:-3}"
# Seconds to wait between attempts.
RETRY_DELAY="${RETRY_DELAY:-20}"
# Namespace to operate on (first command-line argument). An empty value would
# make every `-n $NAMESPACE` below swallow the following token, so fall back
# to "default" when no argument is given.
# NOTE(review): "default" is assumed to be the namespace the fission CLI uses
# when -n is omitted - confirm against the installed CLI version.
NAMESPACE="${1:-default}"

# Report whether a package's build status is "succeeded".
# Arguments: $1 - package name
# Outputs:   one line on stdout: "Checking status for <name>: <status>"
# Returns:   0 when the status is exactly "succeeded", non-zero otherwise
check_build_status() {
    local packageId="$1"
    # Placeholder status query; replace with a real API call if one is available.
    # The argument is quoted so the name always reaches get_status as one word.
    local status
    status=$(get_status "$packageId")
    echo "Checking status for $packageId: $status"
    [ "$status" = "succeeded" ]
}

# Print the build status of one package, as listed by `fission pkg list`.
# Globals:   NAMESPACE (read)
# Arguments: $1 - package name
# Outputs:   the second column of the row whose first column equals $1;
#            nothing when no row matches
get_status(){
    local packageId="$1"
    local status
    # Compare the name column exactly. A substring match would also pick up
    # packages whose names merely contain $1 (e.g. "node-foo" vs
    # "node-foo-bar") and produce a multi-line, unusable status.
    # Simplified lookup; a direct API query would be more robust.
    status=$(fission pkg list -n "$NAMESPACE" \
        | awk -v id="$packageId" '$1 == id && !seen++ { print $2 }')
    echo "$status"
}

# Rebuild a package and poll until its build succeeds or attempts run out.
# Globals:   MAX_ATTEMPTS (read), RETRY_DELAY (read), NAMESPACE (read)
# Arguments: $1 - package name
# Outputs:   progress messages on stdout
# Returns:   0 once check_build_status succeeds (trigger_post_request is then
#            called), 1 after MAX_ATTEMPTS unsuccessful attempts
rebuild_and_check() {
    local packageId="$1"
    local attempt=0
    local status
    while [ "$attempt" -lt "$MAX_ATTEMPTS" ]; do
        echo "Rebuilding attempt $((attempt+1)) for node: $packageId"
        status=$(get_status "$packageId")
        echo "current $status"
        # Only trigger a new build when the last one failed; for any other
        # status this iteration just polls again.
        # Placeholder rebuild command; replace with whatever is actually available.
        if [ "$status" = "failed" ]; then
            fission pkg rebuild --name "$packageId" -n "$NAMESPACE"
        fi
        if check_build_status "$packageId"; then
            echo "Rebuild for $packageId succeeded after $((attempt+1)) attempts."
            trigger_post_request "$packageId"
            return 0
        fi

        attempt=$((attempt+1))
        # Wait only when another attempt follows; sleeping after the final
        # attempt would just delay the failure report.
        if [ "$attempt" -lt "$MAX_ATTEMPTS" ]; then
            sleep "$RETRY_DELAY"
        fi
    done

    echo "Failed to rebuild $packageId after $MAX_ATTEMPTS attempts. Skipping to next."
    return 1
}

# Send a POST request to the function that belongs to a rebuilt package.
# Globals:   ROUTER_PORT (read, optional) - port to call; defaults to 31314
# Arguments: $1 - package name; a leading "node-" is stripped to obtain the
#                 function name used as the URL path
# Outputs:   "request <function>, response <body>" on stdout;
#            a diagnostic on stderr when no node address is found
# Returns:   0 after the request was attempted, 1 when no address was found
trigger_post_request() {
    local packageId="$1"
    local functionName="${packageId#node-}"
    local port="${ROUTER_PORT:-31314}"
    local url response
    # Take column 6 of the first node whose status starts with "Ready" and
    # whose role mentions "master". Anchoring the status keeps "NotReady"
    # nodes out, which a plain "Ready.*master" regex would let through.
    # NOTE(review): assumes the usual `-o wide` layout (STATUS in column 2,
    # ROLES in column 3, address in column 6) - confirm on the target cluster.
    url=$(kubectl get node -o wide \
        | awk '$2 ~ /^Ready/ && $3 ~ /master/ && !found++ { print $6 }')
    if [ -z "$url" ]; then
        echo "No Ready master node found; skipping request for $functionName" >&2
        return 1
    fi
    response=$(curl -sS -X POST -m 10 "http://$url:$port/$functionName")
    echo "request $functionName, response $response"
}

# Find every package whose build status is "failed" and retry each one.
# Globals:   NAMESPACE (read)
# Outputs:   whatever rebuild_and_check prints, plus "<name> build failed"
#            for each package that could not be rebuilt
get_failed_packages(){
	local failed_packages=()
	local packageId
	# Match the status column exactly so that a package whose *name* contains
	# "failed" is not picked up, and read line by line so names are never
	# word-split or glob-expanded.
	while IFS= read -r packageId; do
		if [ -n "$packageId" ]; then
			failed_packages+=("$packageId")
		fi
	done < <(fission pkg list -n "$NAMESPACE" | awk '$2 == "failed" { print $1 }')
	# Main processing loop.
	for packageId in "${failed_packages[@]}"; do
		if ! rebuild_and_check "$packageId"; then
			# Extra handling (e.g. recording the failed package) can go here.
			echo "$packageId build failed"
		fi
	done
}
# Entry point: process every failed package, then print a closing summary line.
get_failed_packages
printf '%s\n' "All nodes processed with retry logic applied."

bigbird-0101 avatar May 10 '24 07:05 bigbird-0101