
k8s 1.27.6 High-Availability Deployment

Uses the stacked etcd high-availability topology.

Reference documentation

(topology diagram: kubeadm-ha-topology-stacked-etcd.png)

  • Stacked topology: the etcd nodes are co-located with the control-plane nodes; that is, etcd is stacked on the kubeadm-managed control-plane nodes and runs as a component of the control plane.
  • Each control-plane node runs an instance of kube-apiserver, kube-scheduler, and kube-controller-manager. kube-apiserver is exposed to the worker nodes through a load balancer.
  • Each control-plane node creates a local etcd member, and that etcd member communicates only with that node's kube-apiserver (the apiserver is configured with 127.0.0.1:2379 only). The same applies to the local kube-controller-manager and kube-scheduler instances.

This topology couples the control plane and etcd members on the same nodes. It is simpler to set up than a cluster with external etcd, and simpler to manage for replication. However, a stacked cluster runs the risk of coupled failure: if one node goes down, both an etcd member and a control-plane instance are lost, and redundancy is compromised. You can mitigate this risk by adding more control-plane nodes, so you should run at least three stacked control-plane nodes for an HA cluster. This is the default topology in kubeadm: a local etcd member is created automatically on control-plane nodes when using kubeadm init and kubeadm join --control-plane.
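
Once the cluster is up, you can confirm that each control-plane node hosts a local etcd member. A minimal sketch, assuming etcdctl is available on a control-plane node and the standard kubeadm certificate paths:

# run on any control-plane node; paths are the kubeadm defaults
ETCDCTL_API=3 etcdctl \
  --endpoints=https://127.0.0.1:2379 \
  --cacert=/etc/kubernetes/pki/etcd/ca.crt \
  --cert=/etc/kubernetes/pki/etcd/server.crt \
  --key=/etc/kubernetes/pki/etcd/server.key \
  member list -w table
# expect one member per control-plane node (k8s01-03)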

keepalived and haproxy are a well-known, battle-tested HA + load-balancing combination

Install haproxy and keepalived
  1. Prepare 2 hosts to run haproxy and keepalived
    # https://kubernetes.io/zh-cn/docs/setup/production-environment/tools/kubeadm/high-availability/
    # https://git.k8s.io/kubeadm/docs/ha-considerations.md#options-for-software-load-balancing
    # yum -y install haproxy keepalived
    
  2. keepalived configuration - Because of how the virtual IP is implemented, all hosts negotiating the VIP must sit in the same IP subnet, i.e. the keepalived/haproxy hosts must share a subnet.
    • Use unicast, not multicast. In multicast mode keepalived sends everything to the multicast address 224.0.0.18, generating a lot of useless traffic as well as interference and conflicts, so switch it from multicast to unicast.

    Unicast mode reduces useless multicast traffic and network interference, and helps avoid virtual router ID conflicts when a LAN hosts many keepalived instances.

    If the network does not allow multicast (e.g. a switch policy) and you still need keepalived, enable unicast. In unicast mode, even another instance with the same virtual_router_id on the network has no effect. A quick way to verify the unicast advertisements is shown after the ha01 config below.

  • Split brain:

    When both servers hold the same virtual IP at the same time, you probably have split brain, which usually means the two machines cannot communicate. The typical cause is that the servers cannot probe each other's state (heartbeat requests go unanswered), so each concludes the other is down and claims the VIP.
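
    A quick check is whether the VIP is configured on more than one node at once. A minimal sketch, assuming passwordless ssh to both HA hosts and the addresses used in this document:

    #!/bin/sh
    # count how many HA nodes currently hold the VIP; more than one suggests split brain
    VIP=192.168.59.240
    count=0
    for h in 192.168.59.238 192.168.59.239; do
        if ssh "$h" ip -4 addr show eth0 | grep -q "$VIP"; then
            echo "VIP present on $h"
            count=$((count + 1))
        fi
    done
    [ "$count" -gt 1 ] && echo "WARNING: possible split brain"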

  • On ha01
    # cat /etc/keepalived/keepalived.conf
    ! Configuration File for keepalived
    
    global_defs {
       script_user root
       enable_script_security
       # use a different router_id on master and backup, for easier management and monitoring
       router_id ha01
    }
    vrrp_script check_haproxy {
        script "/etc/keepalived/check_apiserver.sh"
        interval 3
        # if the script fails or returns non-zero, this vrrp_instance's priority drops by 20;
        # the master's priority (110 - 20 = 90) then falls below the backup's, triggering failover
        weight -20
        fall 2
        rise 2
    }
    vrrp_instance VI_1 {
        state MASTER
        # no preemption: allow a lower-priority node to stay master even after a higher-priority
        # node comes back up (note: keepalived documents nopreempt as taking effect only when the
        # initial state is BACKUP)
        nopreempt
        interface eth0
        # same value on master and backup. virtual_router_id identifies a VRRP (Virtual Router
        # Redundancy Protocol) group; all nodes in the same group must use the same virtual_router_id.
        virtual_router_id 52
        # if the check script fails, this drops to 90
        priority 110
        # interval between VRRP advertisements; 2s here, tune to your network and needs
        advert_int 2
        authentication {
            auth_type PASS
            auth_pass 2222
        }
        # unicast mode, local (master) IP
        unicast_src_ip 192.168.59.238
        unicast_peer {
            # backup node IP
            192.168.59.239
        }
        virtual_ipaddress {
            # VIP
            192.168.59.240
        }

        track_script {
            check_haproxy
        }
    }
    
    # cat check_apiserver.sh
    #!/bin/sh

    errorExit() {
        echo "*** $*" 1>&2
        exit 1
    }

    # the apiserver must answer through the local haproxy frontend
    curl --silent --max-time 2 --insecure https://localhost:6443/ -o /dev/null || errorExit "Error GET https://localhost:6443/"
    # if this node holds the VIP, the apiserver must also answer on the VIP
    if ip addr | grep -q 192.168.59.240; then
        curl --silent --max-time 2 --insecure https://192.168.59.240:6443/ -o /dev/null || errorExit "Error GET https://192.168.59.240:6443/"
    fi
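
    To confirm the two nodes exchange unicast VRRP advertisements (IP protocol 112) rather than multicasting to 224.0.0.18, sniff the interface; a sketch, assuming eth0 as configured above:

    # on ha01; tcpdump's `vrrp` keyword is shorthand for `ip proto 112`
    tcpdump -nn -i eth0 vrrp
    # expect: 192.168.59.238 > 192.168.59.239: VRRPv2, Advertisement, vrid 52, prio 110, ...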
  • On ha02
    ! Configuration File for keepalived
    
    global_defs {
       script_user root
       enable_script_security
       # use a different router_id on master and backup, for easier management and monitoring
       router_id ha02
    }
    vrrp_script check_haproxy {
        script "/etc/keepalived/check_apiserver.sh"
        interval 3
        # if the script fails or returns non-zero, this vrrp_instance's priority drops by 20
        weight -20
        fall 2
        rise 2
    }
    vrrp_instance VI_1 {
        state BACKUP
        # no preemption: allow a lower-priority node to stay master even after a
        # higher-priority node comes back up
        nopreempt
        interface eth0
        # same value on master and backup (see the note in the ha01 config)
        virtual_router_id 52
        # if the check script fails, this drops to 80
        priority 100
        # interval between VRRP advertisements
        advert_int 2
        authentication {
            auth_type PASS
            auth_pass 2222
        }
        # unicast mode, local (backup) IP
        unicast_src_ip 192.168.59.239
        unicast_peer {
            # master node IP
            192.168.59.238
        }
        virtual_ipaddress {
            192.168.59.240
        }

        track_script {
            check_haproxy
        }
    }
    
    # cat check_apiserver.sh  (same script as on ha01)
    #!/bin/sh

    errorExit() {
        echo "*** $*" 1>&2
        exit 1
    }

    curl --silent --max-time 2 --insecure https://localhost:6443/ -o /dev/null || errorExit "Error GET https://localhost:6443/"
    if ip addr | grep -q 192.168.59.240; then
        curl --silent --max-time 2 --insecure https://192.168.59.240:6443/ -o /dev/null || errorExit "Error GET https://192.168.59.240:6443/"
    fi
    
  3. haproxy configuration - HAProxy is configured for simple stream-based (TCP) load balancing, which lets the API server instances behind it handle TLS themselves.
    - On ha01
    # cat /etc/haproxy18/haproxy.cfg 
    #---------------------------------------------------------------------
    # Example configuration for a possible web application.  See the
    # full configuration options online.
    #
    #   https://www.haproxy.org/download/1.8/doc/configuration.txt
    #
    #---------------------------------------------------------------------
    
    #---------------------------------------------------------------------
    # Global settings
    #---------------------------------------------------------------------
    global
        # to have these messages end up in /var/log/haproxy.log you will
        # need to:
        #
        # 1) configure syslog to accept network log events.  This is done
        #    by adding the '-r' option to the SYSLOGD_OPTIONS in
        #    /etc/sysconfig/syslog
        #
        # 2) configure local2 events to go to the /var/log/haproxy.log
        #   file. A line like the following can be added to
        #   /etc/sysconfig/syslog
        #
        #    local2.*                       /var/log/haproxy.log
        #
        log         127.0.0.1 local2
    
        chroot      /var/lib/haproxy18
        pidfile     /var/run/haproxy18.pid
        maxconn     5000
        user        haproxy
        group       haproxy
        daemon
    
        # turn on stats unix socket
        stats socket /var/lib/haproxy18/stats
    
    #---------------------------------------------------------------------
    # common defaults that all the 'listen' and 'backend' sections will
    # use if not designated in their block
    #---------------------------------------------------------------------
    defaults
        mode                    http
        log                     global
        option                  httplog
        option                  dontlognull # do not log responses with no content length
        # option http-server-close # close the backend connection after each HTTP request/response exchange instead of keeping it alive. Each request gets a clear end and backends do not drop idle connections, at the cost of more connection-setup overhead, especially under high concurrency.
        option forwardfor       except 127.0.0.0/8
        option                  redispatch # if a request was dispatched to a backend server that cannot handle it (failure, timeout, ...), retry it on another healthy server instead of returning an error to the client. This improves availability and fault tolerance, and is especially useful with session persistence (cookie or source-address stickiness).
        retries                 2  # max resend attempts before a server is marked unavailable; keep this small
        timeout http-request    5s # close the client connection if a complete HTTP request header is not received in time
        timeout queue           1m # how long a request may wait in the queue for a slot
        timeout connect         5s # timeout for connecting to a backend server; 2s or less detects unreachable backends faster
        timeout client          60s # overall client-side session timeout
        timeout server          60s # overall server-side session timeout; too small (e.g. 15s) can break `kubectl logs` with `error: unexpected EOF`
        timeout http-keep-alive 60s # HTTP keep-alive timeout
        timeout check           2s  # 2s or less makes health checks react faster
        maxconn                 6000 # maximum concurrent connections
    
    #---------------------------------------------------------------------
    # main frontend which proxys to the backends
    #---------------------------------------------------------------------
    #frontend main
    #    bind *:5000
    #    acl url_static       path_beg       -i /static /images /javascript /stylesheets
    #    acl url_static       path_end       -i .jpg .gif .png .css .js
    #
    #    use_backend static          if url_static
    #    default_backend             app
    
    frontend k8s-api
        bind 0.0.0.0:6443
        bind 127.0.0.1:6443
        mode tcp
        option tcplog
        tcp-request inspect-delay 5s
        default_backend k8s-api
    #---------------------------------------------------------------------
    # static backend for serving up images, stylesheets and such
    #---------------------------------------------------------------------
    #backend static
    #    balance     roundrobin
    #    server      static 127.0.0.1:4331 check
    
    #---------------------------------------------------------------------
    # round robin balancing between the various backends
    #---------------------------------------------------------------------
    backend k8s-api
        # note: with both httpchk and ssl-hello-chk declared, the one declared last
        # (ssl-hello-chk) is the check actually used; an HTTPS GET /healthz check
        # would additionally need check-ssl on the server lines
        option httpchk GET /healthz
        http-check expect status 200
        mode tcp
        option ssl-hello-chk
        option tcplog
        balance     roundrobin
        server  k8s01 192.168.59.241:6443 check
        server  k8s02 192.168.59.242:6443 check
        server  k8s03 192.168.59.243:6443 check
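
    Before (re)starting, it helps to syntax-check the config and, once haproxy is running, to query the stats socket; a sketch, assuming the paths above and that socat is installed:

    # validate the configuration file
    haproxy -c -f /etc/haproxy18/haproxy.cfg
    # list backend/server health over the stats socket (CSV fields: proxy, server, status)
    echo "show stat" | socat stdio /var/lib/haproxy18/stats | cut -d, -f1,2,18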
    
    - On ha02
    # cat /etc/haproxy18/haproxy.cfg 
    #---------------------------------------------------------------------
    # Example configuration for a possible web application.  See the
    # full configuration options online.
    #
    #   https://www.haproxy.org/download/1.8/doc/configuration.txt
    #
    #---------------------------------------------------------------------
    
    #---------------------------------------------------------------------
    # Global settings
    #---------------------------------------------------------------------
    global
        # to have these messages end up in /var/log/haproxy.log you will
        # need to:
        #
        # 1) configure syslog to accept network log events.  This is done
        #    by adding the '-r' option to the SYSLOGD_OPTIONS in
        #    /etc/sysconfig/syslog
        #
        # 2) configure local2 events to go to the /var/log/haproxy.log
        #   file. A line like the following can be added to
        #   /etc/sysconfig/syslog
        #
        #    local2.*                       /var/log/haproxy.log
        #
        log         127.0.0.1 local2
    
        chroot      /var/lib/haproxy18
        pidfile     /var/run/haproxy18.pid
        maxconn     5000
        user        haproxy
        group       haproxy
        daemon
    
        # turn on stats unix socket
        stats socket /var/lib/haproxy18/stats
    
    #---------------------------------------------------------------------
    # common defaults that all the 'listen' and 'backend' sections will
    # use if not designated in their block
    #---------------------------------------------------------------------
    defaults
        mode                    http
        log                     global
        option                  httplog
        option                  dontlognull
        option http-server-close
        option forwardfor       except 127.0.0.0/8
        option                  redispatch
        retries                 2
        timeout http-request    5s
        timeout queue           1m
        timeout connect         5s
        timeout client          60s
        timeout server          60s 
        timeout http-keep-alive 60s
        timeout check           2s
        maxconn                 6000
    
    #---------------------------------------------------------------------
    # main frontend which proxys to the backends
    #---------------------------------------------------------------------
    #frontend main
    #    bind *:5000
    #    acl url_static       path_beg       -i /static /images /javascript /stylesheets
    #    acl url_static       path_end       -i .jpg .gif .png .css .js
    #
    #    use_backend static          if url_static
    #    default_backend             app
    
    frontend k8s-api
        bind 0.0.0.0:6443
        bind 127.0.0.1:6443
        mode tcp
        option tcplog
        tcp-request inspect-delay 5s
        default_backend k8s-master
    #---------------------------------------------------------------------
    # static backend for serving up images, stylesheets and such
    #---------------------------------------------------------------------
    #backend static
    #    balance     roundrobin
    #    server      static 127.0.0.1:4331 check
    
    #---------------------------------------------------------------------
    # round robin balancing between the various backends
    #---------------------------------------------------------------------
    backend k8s-master
        mode tcp
        option tcplog
        option tcp-check
        balance     roundrobin
        server  k8s01 192.168.59.241:6443 check
        server  k8s02 192.168.59.242:6443 check
        server  k8s03 192.168.59.243:6443 check
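
    With both nodes up, a simple failover test is to stop haproxy on the VIP holder and watch the VIP move; a sketch, assuming the haproxy systemd unit is named haproxy:

    # on ha01 (current VIP holder): check_apiserver.sh now fails, priority drops by 20
    systemctl stop haproxy
    # on ha02, within a few advert_int intervals the VIP should appear:
    ip -4 addr show eth0 | grep 192.168.59.240
    # the apiserver should still answer through the VIP (once the cluster exists):
    curl -k https://192.168.59.240:6443/version
    # restore ha01 afterwards
    systemctl start haproxy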
    
Install docker, containerd, kubelet, kubeadm and kubectl on k8s01-04

Note: kubeadm here is a self-compiled build whose certificates are valid for 100 years; see 修改kubeadm调整证书有效期.md.

1. Check /etc/hosts on k8s01-04 and make sure it is configured as follows:

127.0.0.1   localhost localhost.localdomain localhost4 localhost4.localdomain4
# HA
192.168.59.238 ha01
192.168.59.239 ha02
192.168.59.240 test-kubevirt.demo.com

# public network
192.168.61.241 ceph01
192.168.61.242 ceph02
192.168.61.243 ceph03

# cluster network
10.168.61.241 ceph01-cl
10.168.61.242 ceph02-cl
10.168.61.243 ceph03-cl

# manage network 
# k8s eth0
192.168.59.241 k8s01
192.168.59.242 k8s02
192.168.59.243 k8s03
192.168.59.244 k8s04

# rgw
192.168.61.241 rgw.testgw images.demo.com dl.demo.com

# harbor
192.168.59.251 harbor registry.demo.com
2. Disable firewalld and SELinux
systemctl stop firewalld
systemctl disable firewalld
setenforce 0
sed -i "s/SELINUX=enforcing/SELINUX=disabled/g" /etc/selinux/config
3. Disable swap
swapoff -a
sed -i 's/.*swap.*/#&/' /etc/fstab
4. Configure bridged traffic (this step cannot be skipped, otherwise kubeadm init reports the error shown below) and kernel parameters
# forward IPv4 and let iptables see bridged traffic
cat <<EOF | sudo tee /etc/modules-load.d/k8s.conf
overlay
br_netfilter
EOF

chmod +x /etc/modules-load.d/k8s.conf
modprobe overlay
modprobe br_netfilter
# verify the br_netfilter module is loaded
lsmod | grep br_netfilter

# set the required sysctl params; these persist across reboots
cat <<EOF | tee /etc/sysctl.d/k8s.conf

net.ipv6.conf.all.disable_ipv6 = 1
net.ipv6.conf.default.disable_ipv6 = 1

net.bridge.bridge-nf-call-iptables  = 1
net.bridge.bridge-nf-call-ip6tables = 1
net.ipv4.ip_forward                 = 1
#1. local port range for outbound connections. Default: 32768 60999
#start and end should differ in parity; with 1024 65530 dmesg logs: ip_local_port_range: prefer different parity for start/end values.
net.ipv4.ip_local_port_range = 1024 65535

# improve network performance: probing for the best MTU automatically reduces
# fragmentation-related slowdowns and raises transfer efficiency
net.ipv4.tcp_mtu_probing = 1

# if dmesg shows "nf_conntrack: table full, dropping packet", raise the conntrack limit
# (default 2621440); do not set it too high, or you may see: nf_conntrack: falling back to vmalloc.
net.netfilter.nf_conntrack_max = 2621440
net.nf_conntrack_max = 2621440
# max number of memory-mapped areas per process; matters for apps that map a lot of memory
vm.max_map_count = 1048576
#2. if the error count in `netstat -s | grep "buffer errors"` keeps growing, tune the following
# net.ipv4.tcp_wmem default: 4096 16384 4194304
net.ipv4.tcp_wmem = 4096        16384   4194304
# net.ipv4.tcp_rmem default: 4096 87380 6291456
net.ipv4.tcp_rmem = 4096  87380  6291456
# net.ipv4.tcp_mem default: 381462 508616 762924
net.ipv4.tcp_mem = 381462  508616  762924
# net.core.rmem_default default: 212992
net.core.rmem_default = 8388608
# net.core.rmem_max default: 212992
net.core.rmem_max = 26214400
# net.core.wmem_max default: 212992
net.core.wmem_max = 26214400

# raise the file handle limits
fs.nr_open = 16777216
fs.file-max = 16777216

#3. if dmesg shows "arp_cache: neighbor table overflow", tune the following
# net.ipv4.neigh.default.gc_thresh1 default 128
net.ipv4.neigh.default.gc_thresh1 = 40960
# net.ipv4.neigh.default.gc_thresh2 default 512
net.ipv4.neigh.default.gc_thresh2 = 81920
# net.ipv4.neigh.default.gc_thresh3 default 1024
net.ipv4.neigh.default.gc_thresh3 = 102400

#4. packet loss from full connection queues: enlarge the half-open (SYN) and accept queues
# max length of the SYN backlog (default 1024); a larger queue holds more pending connections
net.ipv4.tcp_max_syn_backlog = 65535
# upper bound of the accept queue, i.e. how many connections the server accepts concurrently
net.core.somaxconn = 65535
# max receive queue length of a network device
net.core.netdev_max_backlog = 250000
#5. on old kernels (e.g. 3.10) tcp_tw_recycle enables fast TIME_WAIT recycling, but with client
#timestamps enabled (usually the default) it drops packets behind NAT, and even without NAT a
#little concurrency can fail PAWS checks and drop packets, so do not enable it in production.
#### TIME_WAIT
# default 0
# SYN cookie defence
net.ipv4.tcp_syncookies = 1
# reuse of TIME-WAIT sockets; 0 here, i.e. disabled
net.ipv4.tcp_tw_reuse = 0
# tcp_tw_recycle is not recommended (corrupts traffic); the parameter was removed in kernel 4.12
net.ipv4.tcp_tw_recycle = 0
# default 60
net.ipv4.tcp_fin_timeout = 30

#6. enable TCP Fast Open, skipping part of the TCP 3-way handshake; bit 1 set means TFO is
#supported as a client, bit 2 set as a server, so 3 (binary 11) enables full TFO support.
net.ipv4.tcp_fastopen = 3
net.ipv4.tcp_orphan_retries = 3
# default 0: if the accept queue is full at step 3 of the handshake, the server drops the client's
# ACK; 1: the server sends an RST to the client instead, rejecting the handshake and the connection.
# enable only if you are sure the daemon truly cannot keep up; it is visible to clients.
net.ipv4.tcp_abort_on_overflow = 1
EOF
# example: with br_netfilter missing, kubeadm init fails its preflight checks:
# kubeadm init --config kubeadm.yaml
[init] Using Kubernetes version: v1.26.3
[preflight] Running pre-flight checks
error execution phase preflight: [preflight] Some fatal errors occurred:
        [ERROR FileContent--proc-sys-net-bridge-bridge-nf-call-iptables]: /proc/sys/net/bridge/bridge-nf-call-iptables does not exist
[preflight] If you know what you are doing, you can make a check non-fatal with `--ignore-preflight-errors=...`
To see the stack trace of this error execute with --v=5 or higher

# bridge-nf-call-iptables=1 makes bridge devices pass L2-forwarded traffic through the iptables
# L3 rules (including conntrack); enabling it fixes same-node Service communication.
# apply the sysctl params without rebooting
sysctl -p  /etc/sysctl.d/k8s.conf
Note: plain `sysctl -p` only applies /etc/sysctl.conf by default; for a file under /etc/sysctl.d you must name the file explicitly, or the settings do not take effect. Alternatively run `sysctl --system`.
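
A quick spot-check that the settings took effect:

sysctl net.bridge.bridge-nf-call-iptables net.ipv4.ip_forward
# both should print "= 1"; the bridge key only exists after br_netfilter is loaded
cat /proc/sys/net/netfilter/nf_conntrack_max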

## adjust resource limits
cat > /etc/security/limits.d/k8s.conf <<EOF
# End of file
*       hard    nofile    655360
*       soft    nofile    655360
*       soft    core      655360
*       hard    core      655360
*       soft    nproc     unlimited
root    soft    nproc     unlimited
EOF
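
The limits apply to new sessions only; verify from a fresh login shell:

ulimit -n   # expect 655360
ulimit -u   # max user processes (nproc); expect unlimited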

  1. Configure time synchronization
    yum -y install chrony
    systemctl enable chronyd --now
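
    To confirm the clock is actually syncing, chrony can report its sources:

    chronyc sources -v        # '*' marks the source currently synced to
    timedatectl | grep -i synchronized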
    
  2. Install containerd

    https://github.com/containerd/containerd/blob/main/docs/getting-started.md

    https://github.com/containerd/containerd/releases
    Download cri-containerd-1.7.2-linux-amd64.tar.gz directly: it bundles the containerd, ctr, crictl and containerd-shim binaries plus the startup unit, so extracting it at / is all that is needed.
    # tar xf cri-containerd-1.7.2-linux-amd64.tar.gz -C /
    # note: the runc shipped in 1.7.0 may be unusable (error below); download it separately. 1.7.2 works fine
    # runc
    runc: symbol lookup error: runc: undefined symbol: seccomp_notify_respond
    
    Alternatively, download the pieces separately; e.g. containerd-1.7.2-linux-amd64.tar.gz contains only containerd and ctr
    wget https://github.com/containerd/containerd/releases/download/v1.7.2/containerd-1.7.2-linux-amd64.tar.gz
    tar -zxvf containerd-1.7.2-linux-amd64.tar.gz -C /usr/local/bin
    # download runc
    wget https://github.com/opencontainers/runc/releases/download/v1.1.4/runc.amd64
    install -m 755 runc.amd64 /usr/local/sbin/runc
    # download the cri-tools package, which includes the crictl command
    wget https://github.com/kubernetes-sigs/cri-tools/releases/download/v1.26.0/crictl-v1.26.0-linux-amd64.tar.gz
    tar -vzxf crictl-v1.26.0-linux-amd64.tar.gz
    mv crictl /usr/local/bin/
    
    cat >  /etc/crictl.yaml << EOF
    runtime-endpoint: unix:///var/run/containerd/containerd.sock
    image-endpoint: unix:///var/run/containerd/containerd.sock
    timeout: 10
    debug: false
    EOF
    
    # generate the containerd config file
    mkdir /etc/containerd
    containerd config default > /etc/containerd/config.toml
    # image/container storage path; the default is /var/lib/containerd
    root = "/var/lib/containerd"

    # raise the container log line limit
    max_container_log_line_size = 163840
    # extract device ownership info from the security context; kubevirt CDI depends on this,
    # and leaving it unset can cause permission errors
    device_ownership_from_security_context = true
    systemctl daemon-reload
    systemctl enable containerd --now
    
    Starting containerd may fail with the error below; the fix is to edit /etc/systemd/system/containerd.service and change LimitNOFILE=infinity to LimitNOFILE=655360
    ...
    May 24 21:51:10 master01 systemd[492323]: containerd.service: Failed to adjust resource limit RLIMIT_NOFILE: Operation not permitted
    May 24 21:51:10 master01 systemd[492323]: containerd.service: Failed at step LIMITS spawning /sbin/modprobe: Operation not permitted
    May 24 21:51:10 master01 systemd[492325]: containerd.service: Failed to adjust resource limit RLIMIT_NOFILE: Operation not permitted
    May 24 21:51:10 master01 systemd[492325]: containerd.service: Failed at step LIMITS spawning /usr/local/bin/containerd: Operation not permitted
    May 24 21:51:10 master01 systemd[1]: containerd.service: Main process exited, code=exited, status=205/LIMITS
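
    The fix described above can be applied in place; a sketch, assuming the unit file path from the bundle:

    sed -i 's/LimitNOFILE=infinity/LimitNOFILE=655360/' /etc/systemd/system/containerd.service
    systemctl daemon-reload
    systemctl restart containerd
    systemctl is-active containerd   # should print: active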
    

  3. Install the CNI plugins
    https://github.com/containernetworking/plugins/releases/download/v1.2.0/cni-plugins-linux-amd64-v1.2.0.tgz
    mkdir -p /opt/cni/bin
    tar -zxvf cni-plugins-linux-amd64-v1.2.0.tgz -C /opt/cni/bin
    # ls /opt/cni/bin/
    bandwidth  bridge  dhcp  dummy  firewall  host-device  host-local  ipvlan  loopback  macvlan  portmap  ptp  sbr  static  tuning  vlan  vrf
    
  4. Configure containerd registry mirrors

    ...
        [plugins."io.containerd.grpc.v1.cri".registry]
          config_path = ""
    
          [plugins."io.containerd.grpc.v1.cri".registry.auths]
    
          [plugins."io.containerd.grpc.v1.cri".registry.configs]
            [plugins."io.containerd.grpc.v1.cri".registry.configs."registry.demo.com".tls]
              insecure_skip_verify = true
              ca_file = "/etc/containerd/certs.d/registry.demo.com/ca.crt"
              cert_file = "/etc/containerd/certs.d/registry.demo.com/registry.demo.com.cert"
              key_file = "/etc/containerd/certs.d/registry.demo.com/registry.demo.com.key"
            [plugins."io.containerd.grpc.v1.cri".registry.configs."registry.demo.com".auth]
              username = "admin"
              password = "Harbor12345"
    
          [plugins."io.containerd.grpc.v1.cri".registry.headers]
          [plugins."io.containerd.grpc.v1.cri".registry.mirrors]
    
            [plugins."io.containerd.grpc.v1.cri".registry.mirrors."docker.io"]
              endpoint = ["https://vty0b0ux.mirror.aliyuncs.com"]
            [plugins."io.containerd.grpc.v1.cri".registry.mirrors."registry.demo.com"]
              endpoint = ["https://registry.demo.com"]
            [plugins."io.containerd.grpc.v1.cri".registry.mirrors."registry.k8s.io"]
              endpoint = ["https://registry.aliyuncs.com/google_containers"]
    
        [plugins."io.containerd.grpc.v1.cri".x509_key_pair_streaming]
          tls_cert_file = ""
          tls_key_file = ""
    
    # systemctl restart containerd
    

  5. Adjust the cgroup driver

    # change SystemdCgroup = false to SystemdCgroup = true
    sed -i 's/SystemdCgroup\ =\ false/SystemdCgroup\ =\ true/g' /etc/containerd/config.toml
    # change sandbox_image = "registry.k8s.io/pause:3.8" to sandbox_image = "registry.demo.com/google_containers/pause:3.9"
    # (sed -i writes in place and prints nothing, so verify with a separate grep)
    sed -i 's/sandbox_image\ =.*/sandbox_image\ =\ "registry.demo.com\/google_containers\/pause:3.9"/g' /etc/containerd/config.toml
    grep sandbox_image /etc/containerd/config.toml

    systemctl restart containerd
    # use `crictl info` to check that the config took effect
    
    # test pulling an image
    # crictl pull registry.demo.com/google_containers/pause:3.9
    # crictl images

    # pulling with ctr trips over the certificate, so pass credentials and -k explicitly
    # ctr images pull --user admin:Harbor12345 -k registry.demo.com/google_containers/pause:3.9
    

  6. Install ipvs and the k8s packages on k8s01-04

    yum install ipset ipvsadm libseccomp conntrack sysstat -y
    cat > /etc/modules-load.d/ipvs.modules << EOF
    #!/bin/bash
    modprobe -- ip_vs
    modprobe -- ip_vs_rr
    modprobe -- ip_vs_wrr
    modprobe -- ip_vs_sh
    modprobe -- nf_conntrack
    EOF
    # run the module-loading script
    chmod +x /etc/modules-load.d/ipvs.modules
    /bin/bash /etc/modules-load.d/ipvs.modules
    # check that the modules loaded
    lsmod | grep -e ip_vs -e nf_conntrack
    
    cat > /etc/yum.repos.d/kubernetes.repo <<EOF
    [kubernetes]
    name=Kubernetes
    baseurl=https://mirrors.aliyun.com/kubernetes/yum/repos/kubernetes-el7-x86_64
    enabled=1
    gpgcheck=0
    repo_gpgcheck=0
    gpgkey=https://mirrors.aliyun.com/kubernetes/yum/doc/yum-key.gpg https://mirrors.aliyun.com/kubernetes/yum/doc/rpm-package-key.gpg
    EOF
    
    yum -y install kubelet-1.27.6 kubectl-1.27.6 kubeadm-1.27.6
    ## replace kubeadm: overwrite the yum-installed binary with the self-compiled build

    # configure kubelet; hostname and IP differ per node. On multi-homed hosts this
    # distinction may be required, and when joining the cluster pass --node-name=xxxx
    cat >  /etc/sysconfig/kubelet << EOF
    # KUBELET_CGROUP_ARGS="--cgroup-driver=systemd"
    KUBELET_EXTRA_ARGS="--hostname-override=k8s01 --node-ip=192.168.59.241 --cgroup-driver=systemd --eviction-hard=nodefs.available<5Gi --eviction-hard=imagefs.available<5Gi"
    EOF

    ## only enable kubelet at boot; no need to start it, kubeadm init will bring it up
    # systemctl enable kubelet
    

# kubeadm.yaml 
apiServer:
  certSANs:
  - test-kubevirt.demo.com
  - 192.168.59.240
  - 192.168.59.241
  - 192.168.59.242
  - 192.168.59.243
  - 192.168.59.244
  extraArgs:
    authorization-mode: Node,RBAC
  timeoutForControlPlane: 4m0s
apiVersion: kubeadm.k8s.io/v1beta3
certificatesDir: /etc/kubernetes/pki
clusterName: kubernetes
# controlPlaneEndpoint must be specified
controlPlaneEndpoint: test-kubevirt.demo.com:6443
controllerManager: {}
dns: {}
etcd:
  local:
    dataDir: /var/lib/etcd
imageRepository: registry.demo.com/google_containers
kind: ClusterConfiguration
kubernetesVersion: v1.27.6
networking:
  dnsDomain: cluster.local
  podSubnet: 12.244.0.0/16
  serviceSubnet: 12.96.0.0/12
scheduler: {}
---
# set kubelet's cgroup driver to systemd
kind: KubeletConfiguration
apiVersion: kubelet.config.k8s.io/v1beta1
# since 1.22, cgroupDriver defaults to systemd
cgroupDriver: systemd
---
# configure ipvs; see the kube-proxy ConfigMap for reference
kind: KubeProxyConfiguration
apiVersion: kubeproxy.config.k8s.io/v1alpha1
mode: "ipvs"
ipvs:
  scheduler: "wrr"
- Initialize the cluster
# kubeadm init --config kubeadm.yaml --upload-certs
...
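
A few sanity checks after init (and after joining the remaining control-plane nodes), assuming kubectl is configured on the node:

kubectl get nodes -o wide         # expect all control-plane nodes Ready
kubectl get pods -n kube-system   # one etcd and one kube-apiserver pod per control-plane node
ipvsadm -Ln | head                # kube-proxy in ipvs mode should have populated virtual servers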

Caveat: if a node has multiple NICs and the IP it joins with does not match its hostname, set the matching IP alias in /etc/sysconfig/kubelet and pass --node-name= to kubeadm join.

# kubeadm join test-kubevirt.demo.com:6443 \
--token ejprj4.jtsfvaljsd0wrw1d \
--discovery-token-ca-cert-hash sha256:5a66792b930392a36ec7d03ca19a35a53cf92e89f8666e438582c52f8e5ee5a0 \
--node-name=k8s-05
If the token has expired, regenerate the join command with:
# kubeadm token create --print-join-command 

Command completion

yum -y install bash-completion
source /usr/share/bash-completion/bash_completion
source <(kubectl completion bash)
echo "source <(kubectl completion bash)" >> ~/.bashrc