shell - Kubernetes node节点缩容

2024-09-07

背景

节点缩容属于高危操作。
仅当自动扩容节点池中的某个节点上没有单 Pod 应用(即该应用的 Pod 全部集中在这一个节点上)时,才对该节点发起缩容动作。

#!/bin/bash
# Scale in the auto-scaling TKE node pools of one Kubernetes cluster.
#
# TKE's own scale-in mechanism is based on request utilisation and carries
# some risk, so this script decides node by node instead: a node is removed
# only when none of the workloads running on it is confined to a single node.
#
# Flow per node pool:
#   1. cordon every node of the pool
#   2. for each node: inspect its pods, and if nothing blocks the removal,
#      deregister it from consul, drain it and delete the instance from TKE
#   3. uncordon the pool again
#
# Scheduling: another job syncs the Tencent Cloud CVM data every 2 hours. This
# script must run at a different time, otherwise a host deregistered from
# consul here gets registered again by that job.
# Current cron: 15 01 * * * bash /opt/bin/k8s/delete-node.sh >> /tmp/delete-node.log
#
# `set -e` is deliberately not used: a failing step must never abort the run
# before the pool has been uncordoned. Critical steps are checked explicitly.

k8s_cluster="k8s-v6-prod"
tke_cluster="cls-gk7cm35n"

# Node pools to process (space separated). Earlier selections are kept as
# comments for reference; only the active assignment below is used.
#pools="np-hkdvfdnf"
#pools="np-lervgh1z np-ewqfo5of np-kipu41nj np-diqda5lh np-8gwy6vix np-jqnnta51 np-f1yoovy7" # auto-scaling node pools
#pools="np-lervgh1z np-ewqfo5of np-diqda5lh"
#pools="np-kipu41nj"
pools="np-lervgh1z np-ewqfo5of np-763k3qad np-kipu41nj np-hkdvfdnf np-jqnnta51"

# kubectl invocation shared by every call (binary + kubeconfig of the cluster).
kubectl_cmd=(/usr/local/bin/kubectl --kubeconfig "${HOME}/.kube/${k8s_cluster}")

# One row per node: <instance id label> TAB <hostname label>.
# The range is closed with {end}, consistent with the pod query below.
node_jsonpath='{range .items[*]}{.metadata.labels.cloud\.tencent\.com/node-instance-id}{"\t"}{.metadata.labels.kubernetes\.io/hostname}{"\n"}{end}'

# One row per pod: namespace TAB pod name TAB owner name TAB phase TAB host IP.
# Example row:
#   paas	down-gateway-kuaishou-singleisp-65cccd65df-q8xlg	down-gateway-kuaishou-singleisp-65cccd65df	Running	10.3.64.27
pod_jsonpath='{range .items[*]}{.metadata.namespace}{"\t"}{.metadata.name}{"\t"}{.metadata.ownerReferences[].name}{"\t"}{.status.phase}{"\t"}{.status.hostIP}{"\n"}{end}'

#######################################
# Print a message prefixed with the current date.
# Arguments: message words
# Outputs:   one line on stdout
#######################################
log() {
  printf '%s %s\n' "$(date)" "$*"
}

#######################################
# Filter pod rows down to the pods hosted on one node.
# The host IP column is compared as an exact string: a regex or substring
# match on 10.3.1.22 would also select other addresses that merely contain or
# end with it. Rows mentioning Succeeded, kube-system or monitoring are dropped.
# Arguments: $1 - host IP of the node
# Inputs:    pod rows (pod_jsonpath layout) on stdin
# Outputs:   matching rows on stdout
#######################################
pods_on_node() {
  awk -F'\t' -v ip="$1" '($5 "") == (ip "")' \
    | grep -Ev -e 'Succeeded|kube-system|monitoring'
}

#######################################
# Decide whether any pod on a node forbids removing that node.
# A pod blocks the removal when
#   - it has no owner (nothing would recreate it elsewhere),
#   - its owner name contains "down-sched-core",
#   - all pods of its owner sit on one single node, or
#   - the distribution of its owner's pods cannot be queried (fail closed).
# Arguments: $1 - pod rows (pod_jsonpath layout) of the node to inspect
# Returns:   0 if the node must be kept, 1 if nothing blocks the removal
#######################################
node_is_blocked() {
  local candidates=$1
  local namespace owner pod_table node_count

  [[ -n "${candidates}" ]] || return 1

  # Columns are cut with awk on literal tabs so that an empty owner column
  # stays empty instead of shifting the following columns to the left.
  while IFS=$'\t' read -r namespace owner; do
    if [[ -z "${owner}" || "${owner}" == *down-sched-core* ]]; then
      return 0
    fi

    if ! pod_table=$("${kubectl_cmd[@]}" -n "${namespace}" get pod -o wide </dev/null); then
      return 0
    fi

    # Count the distinct values of the third column from the end (the NODE
    # column of the `-o wide` table) over the pods whose row mentions the owner.
    node_count=$(printf '%s\n' "${pod_table}" \
      | grep -F -- "${owner}" \
      | awk '{print $(NF-2)}' \
      | sort -u \
      | wc -l)
    if [[ "${node_count}" -eq 1 ]]; then
      return 0
    fi
  done < <(printf '%s\n' "${candidates}" | awk -F'\t' '{print $1 "\t" $3}')

  return 1
}

#######################################
# Inspect one node and remove it when nothing blocks the removal.
# Arguments: $1 - CVM instance id of the node
#            $2 - hostname label of the node; used as host IP when filtering
#                 pods and as node name for `kubectl drain`
# Outputs:   progress and the node's pod list on stdout
# Returns:   0 if the node was deleted or deliberately kept,
#            1 if it was skipped because a step failed
#######################################
process_node() {
  local node_id=$1
  local node_ip=$2
  local all_pods reason candidates

  echo "${node_id}" "${node_ip}"

  # Never call the delete API with an empty instance id or node name.
  if [[ -z "${node_id}" || -z "${node_ip}" ]]; then
    log "节点信息不完整,跳过"
    return 1
  fi

  # Query the pods once; the same snapshot drives the decision and the log.
  # A failed query must not look like "no pods on this node".
  if ! all_pods=$("${kubectl_cmd[@]}" get pod -A -o=jsonpath="${pod_jsonpath}"); then
    log "获取pod列表失败,跳过节点 ${node_ip}"
    return 1
  fi

  # reason: pods reported in the log; candidates: pods that can block the
  # removal (rows containing "-t-" are additionally ignored).
  reason=$(printf '%s\n' "${all_pods}" | pods_on_node "${node_ip}")
  candidates=$(printf '%s\n' "${reason}" | grep -Fv -e '-t-')

  if node_is_blocked "${candidates}"; then
    log "不能删除${node_ip},当前pod列表如下:"
    echo "${reason}"
    return 0
  fi

  log "准备删除节点 ${node_ip},当前pod列表如下:"
  echo "${reason}"

  # Remove the host's monitoring from consul to avoid false alerts.
  /usr/local/bin/consul services deregister -http-addr=http://consul.paigod.work:8500 -id="${node_id}"

  # Evict the pods. If the drain fails, pods are still running on the node,
  # so the instance must not be terminated.
  if ! "${kubectl_cmd[@]}" drain "${node_ip}" --ignore-daemonsets; then
    log "节点 ${node_ip} 驱逐失败,跳过删除"
    return 1
  fi

  # Buffer time before the instance is terminated.
  sleep 400

  # Remove the node from Tencent Cloud TKE.
  if ! /usr/local/bin/tccli tke DeleteClusterInstances --cli-unfold-argument \
    --ClusterId "${tke_cluster}" --InstanceIds "${node_id}" \
    --InstanceDeleteMode terminate --ForceDelete False; then
    log "删除节点 ${node_ip} 失败"
    return 1
  fi
  log "已删除节点 ${node_ip}"
}

# ${pools} is intentionally unquoted: it is a space separated list.
for pool in ${pools}; do
  echo "${pool}"
  pool_selector="tke.cloud.tencent.com/nodepool-id=${pool}"

  # Put the auto-scaled nodes of the pool into maintenance state.
  "${kubectl_cmd[@]}" cordon -l "${pool_selector}"
  # Cronjobs may still be starting pods; leave a 30 second buffer.
  sleep 30

  # The node list is read on file descriptor 3 so that the commands run for a
  # node cannot consume the remaining rows through their stdin.
  while IFS=$'\t' read -r -u 3 node_id node_ip; do
    process_node "${node_id}" "${node_ip}"
    sleep 30
  done 3< <("${kubectl_cmd[@]}" get node -l "${pool_selector}" -o=jsonpath="${node_jsonpath}" | grep -E "ins-")

  # Leave maintenance state.
  "${kubectl_cmd[@]}" uncordon -l "${pool_selector}"
done

标题:shell - Kubernetes node节点缩容
地址:https://blog.njqhome.com:8443/articles/2023/05/23/1684809225264.html