#!/bin/bash

# Copyright 2014 The Kubernetes Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Validates that the cluster is healthy.
# Error codes are:
# 0 - success
# 1 - fatal (cluster is unlikely to work)
# 2 - non-fatal (encountered some errors, but cluster should be working correctly)
#
# Environment read by this script:
#   NUM_NODES                              expected number of nodes (no default;
#                                          'nounset' aborts the script if it is unset)
#   ALLOWED_NOTREADY_NODES                 nodes allowed to stay not-ready (default 0)
#   CLUSTER_READY_ADDITIONAL_TIME_SECONDS  extra wait once enough nodes are ready (default 30)
#   KUBERNETES_PROVIDER, MULTIZONE         "gce" + "true" recomputes the node count via gcloud
#   PROJECT, NODE_INSTANCE_PREFIX, REGION  used only on that gcloud path
#   REGISTER_MASTER_KUBELET                "true" adds the master(s) to the expected count
#   KUBE_USE_EXISTING_MASTER               "true" silences the "too many nodes" warning

set -o errexit
set -o nounset
set -o pipefail

KUBE_ROOT="$(dirname "${BASH_SOURCE[0]}")/.."

if [[ -f "${KUBE_ROOT}/cluster/env.sh" ]]; then
  source "${KUBE_ROOT}/cluster/env.sh"
fi

# NOTE(review): the color_* variables and get-master-replicas-count used below
# are not defined in this file; presumably they come from these sourced helpers.
source "${KUBE_ROOT}/hack/lib/util.sh"
source "${KUBE_ROOT}/cluster/kube-util.sh"

# Run kubectl and retry upon failure.
# Arguments: passed through unchanged to ${KUBE_ROOT}/cluster/kubectl.sh
# Outputs:   kubectl's own output; retry/give-up notices go to stderr
# Returns:   0 as soon as one invocation succeeds, 1 after the third failure
function kubectl_retry() {
  local tries=3
  while ! "${KUBE_ROOT}/cluster/kubectl.sh" "$@"; do
    tries=$((tries - 1))
    if [[ ${tries} -le 0 ]]; then
      echo "('kubectl $*' failed, giving up)" >&2
      return 1
    fi
    echo "(kubectl failed, will retry ${tries} times)" >&2
    sleep 1
  done
}

ALLOWED_NOTREADY_NODES="${ALLOWED_NOTREADY_NODES:-0}"
CLUSTER_READY_ADDITIONAL_TIME_SECONDS="${CLUSTER_READY_ADDITIONAL_TIME_SECONDS:-30}"

EXPECTED_NUM_NODES="${NUM_NODES}"

if [[ "${KUBERNETES_PROVIDER:-}" == "gce" ]]; then
  echo "Validating gce cluster, MULTIZONE=${MULTIZONE:-}"
  # In multizone mode we need to add instances for all nodes in the region.
  if [[ "${MULTIZONE:-}" == "true" ]]; then
    # The zone list is computed inline (comma-joined, trailing comma stripped)
    # and spliced into the instance filter; the instances are counted by line.
    EXPECTED_NUM_NODES=$(gcloud -q compute instances list --project="${PROJECT}" --format='[no-heading]' \
      --filter="name ~ '${NODE_INSTANCE_PREFIX}.*' AND zone:($(gcloud -q compute zones list --project="${PROJECT}" --filter="region=${REGION}" --format='csv[no-heading](name)' | tr "\n" "," | sed "s/,$//"))" | wc -l)
    echo "Computing number of nodes, NODE_INSTANCE_PREFIX=${NODE_INSTANCE_PREFIX}, REGION=${REGION}, EXPECTED_NUM_NODES=${EXPECTED_NUM_NODES}"
  fi
fi

if [[ "${REGISTER_MASTER_KUBELET:-}" == "true" ]]; then
  if [[ "${KUBERNETES_PROVIDER:-}" == "gce" ]]; then
    NUM_MASTERS=$(get-master-replicas-count)
  else
    NUM_MASTERS=1
  fi
  EXPECTED_NUM_NODES=$((EXPECTED_NUM_NODES + NUM_MASTERS))
fi

REQUIRED_NUM_NODES=$((EXPECTED_NUM_NODES - ALLOWED_NOTREADY_NODES))

# Make several attempts to deal with slow cluster birth.
return_value=0
attempt=0
# Set the timeout to ~25minutes (100 x 15 second) to avoid timeouts for 1000-node clusters.
PAUSE_BETWEEN_ITERATIONS_SECONDS=15
MAX_ATTEMPTS=100
# Number of extra iterations covering CLUSTER_READY_ADDITIONAL_TIME_SECONDS, rounded up.
ADDITIONAL_ITERATIONS=$(((CLUSTER_READY_ADDITIONAL_TIME_SECONDS + PAUSE_BETWEEN_ITERATIONS_SECONDS - 1) / PAUSE_BETWEEN_ITERATIONS_SECONDS))

while true; do
  # Pause between iterations of this large outer loop.
  if [[ ${attempt} -gt 0 ]]; then
    sleep "${PAUSE_BETWEEN_ITERATIONS_SECONDS}"
  fi
  attempt=$((attempt + 1))

  # "kubectl get nodes" prints a header line followed by one line per node.
  #
  # Capture the output and gather 2 counts:
  # - Total number of nodes.
  # - Number of "ready" nodes (lines that do not mention "NotReady").
  #
  # Tolerate kubectl failures here because during cluster bootstrapping
  # for clusters where the master node is registered, the apiserver will become
  # available and then get restarted as the kubelet configures the docker bridge.
  #
  # The status of the kubectl_retry get nodes operation is captured in the res
  # variable in this way to prevent errexit from stopping the whole script.
  res=0
  node=$(kubectl_retry get nodes) || res=$?
  if [[ "${res}" -ne 0 ]]; then
    if [[ "${attempt}" -gt "${last_run:-$MAX_ATTEMPTS}" ]]; then
      echo -e "${color_red} Failed to get nodes.${color_norm}"
      exit 1
    else
      continue
    fi
  fi

  # Subtract one from each count to discount the header line.
  found=$(($(printf '%s\n' "${node}" | wc -l) - 1))
  ready=$(($(printf '%s\n' "${node}" | grep -vc "NotReady") - 1))

  if ((found == EXPECTED_NUM_NODES)) && ((ready == EXPECTED_NUM_NODES)); then
    break
  elif ((found > EXPECTED_NUM_NODES)); then
    if [[ "${KUBE_USE_EXISTING_MASTER:-}" != "true" ]]; then
      echo -e "${color_red}Found ${found} nodes, but expected ${EXPECTED_NUM_NODES}. Your cluster may not behave correctly.${color_norm}"
    fi
    break
  elif ((ready > EXPECTED_NUM_NODES)); then
    echo -e "${color_red}Found ${ready} ready nodes, but expected ${EXPECTED_NUM_NODES}. Your cluster may not behave correctly.${color_norm}"
    break
  else
    if [[ "${REQUIRED_NUM_NODES}" -le "${ready}" ]]; then
      echo -e "${color_green}Found ${REQUIRED_NUM_NODES} Nodes, allowing additional ${ADDITIONAL_ITERATIONS} iterations for other Nodes to join.${color_norm}"
      # last_run is fixed the first time enough nodes are ready; later passes keep it.
      last_run="${last_run:-$((attempt + ADDITIONAL_ITERATIONS - 1))}"
    fi
    if [[ "${attempt}" -gt "${last_run:-$MAX_ATTEMPTS}" ]]; then
      echo -e "${color_yellow}Detected ${ready} ready nodes, found ${found} nodes out of expected ${EXPECTED_NUM_NODES}. Your cluster may not be fully functional.${color_norm}"
      kubectl_retry get nodes
      if [[ "${REQUIRED_NUM_NODES}" -gt "${ready}" ]]; then
        exit 1
      else
        return_value=2
        break
      fi
    else
      echo -e "${color_yellow}Waiting for ${EXPECTED_NUM_NODES} ready nodes. ${ready} ready nodes, ${found} registered. Retrying.${color_norm}"
    fi
  fi
done
echo "Found ${found} node(s)."
kubectl_retry get nodes

attempt=0
while true; do
  # The "kubectl get componentstatuses -o template" call exports components
  # health information, one "<type>:<status>" line per component.
  #
  # Capture the output and gather 2 counts:
  # - Total number of componentstatuses.
  # - Number of "healthy" components.
  #
  # Each '|| true' keeps errexit from aborting: kubectl may still be failing,
  # and grep -c exits non-zero when the count is 0.
  cs_status=$(kubectl_retry get componentstatuses -o template --template='{{range .items}}{{with index .conditions 0}}{{.type}}:{{.status}}{{end}}{{"\n"}}{{end}}') || true
  componentstatuses=$(printf '%s\n' "${cs_status}" | grep -c 'Healthy:') || true
  healthy=$(printf '%s\n' "${cs_status}" | grep -c 'Healthy:True') || true

  if ((componentstatuses > healthy)) || ((componentstatuses == 0)); then
    if ((attempt < 5)); then
      echo -e "${color_yellow}Cluster not working yet.${color_norm}"
      attempt=$((attempt + 1))
      sleep 30
    else
      echo -e " ${color_yellow}Validate output:${color_norm}"
      # Best-effort diagnostic dump: do not let a kubectl failure hide the
      # final error message below (the script exits 1 either way).
      kubectl_retry get cs || true
      echo -e "${color_red}Validation returned one or more failed components. Cluster is probably broken.${color_norm}"
      exit 1
    fi
  else
    break
  fi
done

echo "Validate output:"
kubectl_retry get cs || true
if [[ "${return_value}" == "0" ]]; then
  echo -e "${color_green}Cluster validation succeeded${color_norm}"
else
  echo -e "${color_yellow}Cluster validation encountered some problems, but cluster should be in working order${color_norm}"
fi

exit "${return_value}"