这是我编写的一个非常特殊的脚本,用于Kubernetes集群监视目的(特别是节点)。本质上,我需要确保NodeGroup中的节点都具有相同的标签和标签值,否则,在我们使用的Kubernetes版本中,节点将不会被均匀地缩小(群集自动缩放器希望值相同,以便平等对待节点)。
可以使用env var IN_CLUSTER来设置脚本是从本地机器(它可以读取kubectl配置)运行,还是作为集群中的容器运行(利用RBAC权限)。
我编写的脚本完成了我需要的工作-获取集群中节点的列表,遍历每个NodeGroup (有四个节点组--核心、通用、可观察性、pci)。我们将节点分组到相关的NodeGroup中。然后我们检查NodeGroup中的每个节点,并进行比较以确保标签匹配。
该脚本实现了用于检索节点列表的Kubernetes客户端。该脚本还实现了Kuberhealthy客户端,它只需向Kuberhealthy主程序报告检查结果(成功或失败)。
我不喜欢脚本中硬编码的四个NodeGroups,但我想不出如何使用存储为env的数组来实现我想要的结果。
脚本的目的是简单地从上到下运行,并且非常简单。我不确定if __name__ == '__main__'指令是否有意义,因为它永远不会作为模块导入。
from kubernetes import client, config
from kh_client import *
import os
# requires cluster role with permissions list, get nodes!
# needs refactoring, for time being have kept it as a 'top to bottom' script
def main():
if os.getenv('IN_CLUSTER') == "TRUE":
config.load_incluster_config()
else:
config.load_kube_config()
try:
api_instance = client.CoreV1Api()
node_list = api_instance.list_node()
except client.exceptions.ApiException:
print("401 Unauthorised. Please check you are authenticated for the target cluster / have set the IN_CLUSTER env var.")
exit(2)
node_group_core = []
node_group_general = []
node_group_observability = []
node_group_pci = []
# print("%s\t\t%s" % ("NAME", "LABELS"))
# this needs changing but difficult to do with an env_var
for node in node_list.items:
if node.metadata.labels.get('nodegroup-name') == "core":
node_group_core.append(node)
if node.metadata.labels.get('nodegroup-name') == "general":
node_group_general.append(node)
if node.metadata.labels.get('nodegroup-name') == "observability":
node_group_observability.append(node)
if node.metadata.labels.get('nodegroup-name') == "pci":
node_group_pci.append(node)
check_node_group_labels(node_group_core)
check_node_group_labels(node_group_general)
check_node_group_labels(node_group_observability)
check_node_group_labels(node_group_pci)
# everything has checked successfully, report success.
print("Reporting Success.")
try:
report_success()
except Exception as e:
print(f"Error when reporting success: {e}")
def check_node_group_labels(node_group):
# ignored labels taken from https://github.com/kubernetes/autoscaler/blob/3a69f118d95cd653bf101aecc0ea5e00bf7ba370/cluster-autoscaler/processors/nodegroupset/aws_nodegroups.go#L26
# this can be refactored
ignored_labels = [ "alpha.eksctl.io/instance-id",
"alpha.eksctl.io/nodegroup-name",
"eks.amazonaws.com/nodegroup",
"k8s.amazonaws.com/eniConfig",
"lifecycle",
# labels i've added
"topology.kubernetes.io/zone",
"kubernetes.io/hostname",
"failure-domain.beta.kubernetes.io/zone" ]
node_group_labels = []
for l in node_group[0].metadata.labels:
if l not in ignored_labels:
node_group_labels.append(l)
print(f"There are {len(node_group)} nodes in {node_group[0].metadata.labels.get('nodegroup-name')}")
for label in node_group_labels:
# compare against the 'benchmark' label, any difference means a mismatch as far as CAS is concerned
# print(label)
benchmark_label = node_group[0].metadata.labels.get(label)
# print("benchmark label: ", benchmark_label)
for node in node_group[1:]:
# print("node label", node.metadata.labels.get(label))
if node.metadata.labels.get(label) != benchmark_label:
print("Reporting Failure.")
try:
report_failure([f"Warning! label mismatch detected, for nodegroup and node {node.metadata.name}, benchmark value: {benchmark_label}, this node value: {node.metadata.labels.get(label)}"])
except Exception as e:
print(f"Error when reporting failure: {e}")
if __name__ == '__main__':
main()
```发布于 2021-07-21 08:21:26
关于:
我不喜欢脚本中硬编码的四个NodeGroups,但我想不出如何使用存储为env的数组来实现我想要的结果。
我们可以首先创建一个包含所有node_groups的字典,然后开始重构我们的代码:
NODE_GROUPS = {
'core': [],
'general': [],
'observability': [],
'pci': [],
}通过这样做,我们将删除main函数中的一些重复代码:
def main():
# ...
for node in node_list.items:
nodegroup_name = node.metadata.labels.get('nodegroup-name')
for node_group_name, nodes in NODE_GROUPS.items():
if nodegroup_name == node_group_name:
nodes.append(node)
for node_group in NODE_GROUPS.values():
check_node_group_labels(node_group)
# ...现在,将NODE_GROUPS存储到一个配置文件中并不是一个好主意,因为新的格式对我们没有帮助。如果您真的想去掉节点组,我建议您使用另一种方法。
我不确定
if __name__ == '__main__'指令是否有意义,因为它永远不会作为模块导入。
这个保护程序是样板代码,可以保护用户在无意中不小心调用脚本。在脚本中省略保护时,以下是一些常见问题:
import my_script_without_a_name_eq_main_guard)中导入没有保护的脚本,那么第二个脚本将触发第一个脚本在导入时运行,并使用第二个脚本的命令行参数。我想说,在代码中使用它通常是很好的实践,但不是强制性的。要阅读更多关于it的信息,检查这个答案
以下是我对整个脚本的风格:
import os
import sys
from kh_client import *
from kubernetes import client, config
NODE_GROUPS = {
'core': [],
'general': [],
'observability': [],
'pci': [],
}
IGNORED_LABELS = (
"alpha.eksctl.io/instance-id",
"alpha.eksctl.io/nodegroup-name",
"eks.amazonaws.com/nodegroup",
"k8s.amazonaws.com/eniConfig",
"lifecycle",
# custom labels
"topology.kubernetes.io/zone",
"kubernetes.io/hostname",
"failure-domain.beta.kubernetes.io/zone"
)
def load_config():
if os.getenv('IN_CLUSTER') == "TRUE":
config.load_incluster_config()
else:
config.load_kube_config()
def get_nodes():
try:
api_instance = client.CoreV1Api()
return api_instance.list_node()
except client.exceptions.ApiException:
print("401 Unauthorised. Please check you are authenticated "
"for the target cluster / have set the IN_CLUSTER env "
"var.")
sys.exit(2)
def get_group_labels(node_group):
return [
node_group_label for node_group_label in node_group[0].metadata.labels
if node_group_label not in IGNORED_LABELS
]
def check_node_group_labels(node_group):
node_group_labels = get_group_labels(node_group)
print(f"There are {len(node_group)} nodes in "
f"{node_group[0].metadata.labels.get('nodegroup-name')}")
for label in node_group_labels:
# compare against the 'benchmark' label, any difference means
# a mismatch as far as CAS is concerned
benchmark_label = node_group[0].metadata.labels.get(label)
for node in node_group[1:]:
label = node.metadata.labels.get('nodegroup-name')
if label != benchmark_label:
print("Reporting Failure.")
try:
report_failure([
f"Warning! label mismatch detected, for nodegroup and "
f"node {node.metadata.name}, benchmark value: {benchmark_label}, "
f"this node value: {label}"
])
except Exception as e:
print(f"Error when reporting failure: {e}")
def main():
load_config()
nodes = get_nodes()
for node in nodes.items:
nodegroup_name = node.metadata.labels.get('nodegroup-name')
for node_group_name, group_nodes in NODE_GROUPS.items():
if nodegroup_name == node_group_name:
group_nodes.append(node)
for node_group in NODE_GROUPS.values():
check_node_group_labels(node_group)
print("Reporting Success.")
try:
report_success()
except Exception as e:
print(f"Error when reporting success: {e}")
if __name__ == '__main__':
main()我还没有测试这一点,因为我没有任何kube集群,但是这里有一些我已经改进过的东西:
exit是交互式shell的助手-- sys.exit是用于程序中的。用第二个。作业:
https://codereview.stackexchange.com/questions/264230
复制相似问题