shell - Kubernetes node节点缩容
2024-09-07
背景
节点缩容属于高危操作
当自动扩容的目标节点池上没有单pod应用,即发起缩容动作
#!/bin/bash
#
# Scale in the auto-scaling TKE node pools.
#
# For every node of every pool listed in ${pools} the script checks the pods
# running on that node. If none of them is a "single-pod application" (see
# node_is_blocked), the node is deregistered from consul, drained and then
# removed from the TKE cluster (instance terminated).
#
# TKE's built-in scale-in is driven by request utilisation, which carries
# some risk, hence this script.
# Another job syncs Tencent Cloud CVM data every 2 hours. This script must be
# scheduled so the two do not overlap, otherwise a host deregistered from
# consul here gets registered again.
# Current cron entry:
#   15 01 * * * bash /opt/bin/k8s/delete-node.sh >> /tmp/delete-node.log

k8s_cluster="k8s-v6-prod"
tke_cluster="cls-gk7cm35n"

# Earlier pool selections, kept for reference only (each one used to be
# overwritten by the assignment that followed it):
#   np-hkdvfdnf
#   np-lervgh1z np-ewqfo5of np-kipu41nj np-diqda5lh np-8gwy6vix np-jqnnta51 np-f1yoovy7  (auto-scaling pools)
#   np-lervgh1z np-ewqfo5of np-diqda5lh
pools="np-lervgh1z np-ewqfo5of np-763k3qad np-kipu41nj np-hkdvfdnf np-jqnnta51"
#pools="np-kipu41nj"

# kubectl invocation shared by every call below.
kubectl_cmd=(/usr/local/bin/kubectl --kubeconfig ~/.kube/"${k8s_cluster}")

# One tab-separated line per pod: namespace, pod name, owner name, phase, host IP.
# Sample: paas down-gateway-kuaishou-singleisp-65cccd65df-q8xlg down-gateway-kuaishou-singleisp-65cccd65df Running 10.3.64.27
pod_jsonpath='{range .items[*]}{.metadata.namespace}{"\t"}{.metadata.name}{"\t"}{.metadata.ownerReferences[].name}{"\t"}{.status.phase}{"\t"}{.status.hostIP}{"\n"}{end}'

# One tab-separated line per node: instance id, hostname label.
# Sample: ins-imvlkzqj 10.3.64.27
node_jsonpath='{range .items[*]}{.metadata.labels.cloud\.tencent\.com/node-instance-id}{"\t"}{.metadata.labels.kubernetes\.io/hostname}{"\n"}{end}'

#######################################
# Print a log line prefixed with the current date.
# Arguments: message words
# Outputs:   one line on stdout
#######################################
log() {
  # shellcheck disable=SC2046  # unquoted on purpose: keeps the original single-space date format
  echo $(date) "$@"
}

#######################################
# Decide whether a node must be kept because of the pods it hosts.
# Globals:   kubectl_cmd (read)
# Arguments: $1 - pod lines for the node, in pod_jsonpath format
# Returns:   0 if the node must NOT be deleted, 1 if it may be deleted
#######################################
node_is_blocked() {
  local pod_lines=$1
  local line rest namespace rs_name wide node_nums

  while IFS= read -r line; do
    [[ -n "${line}" ]] || continue
    # Lines containing "-t-" are ignored for the blocking decision
    # (same filter as before; they still show up in the logged pod list).
    [[ "${line}" == *-t-* ]] && continue

    # Split on tabs by hand: `read` with IFS=tab would collapse an empty
    # owner field and shift the remaining columns.
    namespace=${line%%$'\t'*}
    rest=${line#*$'\t'}
    rest=${rest#*$'\t'}
    rs_name=${rest%%$'\t'*}

    # A pod without an owner is by definition a single-pod application,
    # and `kubectl drain` (without --force) refuses to evict it anyway.
    [[ -n "${rs_name}" ]] || return 0

    [[ "${rs_name}" == *down-sched-core* ]] && return 0

    # Count the distinct nodes hosting pods of this owner. In
    # `get pod -o wide` output the NODE column is the third from the end.
    if ! wide=$("${kubectl_cmd[@]}" -n "${namespace}" get pod -o wide </dev/null); then
      # Fail closed: without the distribution we cannot prove it is safe.
      return 0
    fi
    node_nums=$(grep -F -- "${rs_name}" <<<"${wide}" \
      | awk '{print $(NF-2)}' | sort -u | wc -l)
    # All pods of this owner sit on a single node.
    [[ "${node_nums}" -eq 1 ]] && return 0
  done <<<"${pod_lines}"

  return 1
}

#######################################
# Evaluate one node and, if safe, deregister, drain and delete it.
# Globals:   kubectl_cmd, pod_jsonpath, tke_cluster (read)
# Arguments: $1 - CVM instance id (ins-...), $2 - node name / host IP
# Outputs:   progress and the node's pod list on stdout
#######################################
scale_in_node() {
  local node_id=$1
  local node_ip=$2
  local all_pods reason

  if ! all_pods=$("${kubectl_cmd[@]}" get pod -A -o=jsonpath="${pod_jsonpath}"); then
    # An empty list caused by a failed query must not look like "no pods".
    log "获取pod列表失败,跳过节点 ${node_ip}"
    return 1
  fi

  # Ignore Succeeded pods and the kube-system / monitoring namespaces.
  # The host IP must match exactly: compare the last tab-separated field
  # instead of a regex, so 10.3.1.22 never matches 110.3.1.22 or similar.
  reason=$(awk -F'\t' -v ip="${node_ip}" '$NF == ip' <<<"${all_pods}" \
    | grep -Ev "Succeeded|kube-system|monitoring")

  if node_is_blocked "${reason}"; then
    log "不能删除${node_ip},当前pod列表如下:"
    echo "${reason}"
    return 0
  fi

  log "准备删除节点 ${node_ip},当前pod列表如下:"
  echo "${reason}"

  # Remove the host's monitoring from consul to avoid false alerts.
  /usr/local/bin/consul services deregister \
    -http-addr=http://consul.paigod.work:8500 -id="${node_id}"

  # Evict the pods. Never terminate the instance if eviction failed.
  if ! "${kubectl_cmd[@]}" drain "${node_ip}" --ignore-daemonsets; then
    log "驱逐失败,跳过删除节点 ${node_ip}"
    return 1
  fi

  # Grace period before the instance goes away.
  sleep 400

  # Remove the node from Tencent Cloud TKE.
  if /usr/local/bin/tccli tke DeleteClusterInstances --cli-unfold-argument \
    --ClusterId "${tke_cluster}" --InstanceIds "${node_id}" \
    --InstanceDeleteMode terminate --ForceDelete False; then
    log "已删除节点 ${node_ip}"
  else
    log "删除节点失败 ${node_ip}"
    return 1
  fi
}

read -r -a pool_list <<<"${pools}"
for pool in "${pool_list[@]}"; do
  echo "${pool}"

  # Put every node of the auto-scaling pool into maintenance (unschedulable).
  "${kubectl_cmd[@]}" cordon -l "tke.cloud.tencent.com/nodepool-id=${pool}"

  # CronJobs may be about to start; leave a 30 second buffer.
  sleep 30

  # The node list is read on fd 3 so that commands run inside the loop
  # cannot consume it through stdin.
  while read -r -u 3 node_id node_ip _; do
    echo "${node_id}" "${node_ip}"
    scale_in_node "${node_id}" "${node_ip}"
    sleep 30
  done 3< <("${kubectl_cmd[@]}" get node \
    -l "tke.cloud.tencent.com/nodepool-id=${pool}" \
    -o=jsonpath="${node_jsonpath}" | grep -E "ins-")

  # Leave maintenance mode for the nodes that remain.
  "${kubectl_cmd[@]}" uncordon -l "tke.cloud.tencent.com/nodepool-id=${pool}"
done
标题:shell - Kubernetes node节点缩容
地址:https://blog.njqhome.com:8443/articles/2023/05/23/1684809225264.html