Ubuntu安装 kubeadm 部署k8s 1.30
rockylinux 9.3详细安装drbd+keepalived
kubeadm 部署k8s 1.30
rockylinux 9.3详细安装drbd
operator开发 mysql一主多从
CRD | | | 定义出/创建出 | | ↓ CR,即resource type ----------------》受自定义的控制器watch监听并控制 | | | 定义出/创建出 | | ↓ 一条具体的resource 实现的功能: 1. 支持一主多从 采用GTID的自动备份 2. 支持主从的自动选举切换 3. 支持在线扩容 副本不足时会自动拉起 4. 支持就绪探针的检测 5. .........一、go环境准备wget https://golang.google.cn/dl/go1.22.5.linux-amd64.tar.gz tar zxvf go1.22.5.linux-amd64.tar.gz mv go /usr/local/ cat >> /etc/profile << 'EOF' export GOROOT=/usr/local/go export PATH=$PATH:$GOROOT/bin EOF source /etc/profile go version #查看是否生效 # 设置go代理 # 1、也可以用全球cdn加速 export GOPROXY=https://goproxy.cn,direct go env -w GOPROXY=https://goproxy.cn,direct二、安装kubebuilder框架# 1、下载最新版本的kubebuilder(下载慢的话你就手动下载然后上传) wget https://github.com/kubernetes-sigs/kubebuilder/releases/download/v4.1.1/kubebuilder_linux_amd64 mv kubebuilder_linux_amd64 kubebuilder && chmod +x kubebuilder && mv kubebuilder /usr/local/bin/ $ kubebuilder version三、初始化项目# 创建项目 mkdir -p /src/application-operator cd /src/application-operator go mod init application-operator kubebuilder init --domain=egonlin.com --owner egonlin # 创建api $ kubebuilder create api --group apps --version v1 --kind Application # 设定的kind的首字母必须大写 Create Resource [y/n] y Create Controller [y/n] y # --kind Application,指定你要创建的resource type的名字,注意首字母必须大写#项目地址直接拉取 https://gitee.com/axzys/mysqlcluster-operator/tree/slave/四、可以先在本地测试执行# 一、修改文件:文件utils.go #1、文件开头增加导入:"k8s.io/client-go/tools/clientcmd" 删除导入:"k8s.io/client-go/rest" #2、方法execCommandOnPod修改 config, err := clientcmd.BuildConfigFromFlags("", KubeConfigPath) // 打开注释 // config, err := rest.InClusterConfig() // 加上注释 # 二、mysqlcluster_controller.go修改 const ( ...... KubeConfigPath = "/root/.kube/config" // 打开注释 ...... 
) # 并且确保宿主机上存在/root/.kube/config # 测试yaml apiVersion: apps.egonlin.com/v1 kind: MysqlCluster metadata: name: mysqlcluster-sample labels: app.kubernetes.io/name: mysql-operator app.kubernetes.io/managed-by: kustomize spec: image: registry.cn-shanghai.aliyuncs.com/egon-k8s-test/mysql:5.7 replicas: 4 masterService: master-service slaveService: slave-service storage: storageClassName: "local-path" size: 1Gi resources: requests: cpu: "500m" memory: "1Gi" limits: cpu: "1" memory: "2Gi" livenessProbe: initialDelaySeconds: 30 timeoutSeconds: 5 tcpSocket: port: 3306 先执行make install 然后执行 make run 然后创建测试pod创建测试功能正常以后。可以把控制器放进k8s里面。五、以容器形式部署controller如果想要部署在k8s里面需要把上面修改的配置还原回去。# dockerfile文件中的FROM镜像无法拉取,要换成自己的 $ vi Dockerfile # FROM golang:1.22 AS builder FROM registry.cn-hangzhou.aliyuncs.com/egon-k8s-test/golang:1.22 AS builder #FROM gcr.io/distroless/static:nonroot FROM registry.cn-shanghai.aliyuncs.com/egon-k8s-test/static:nonroot #并且构建过程中需要执行go mod download,默认从国外源下载非常慢需要再该命令前设置好环境变量 # 在go mod download前设置好环境变量 ENV GOPROXY=https://mirrors.aliyun.com/goproxy/,direct RUN go mod download 然后构建 docker 镜像make docker-build IMG=mysql-operator-master:v0.01 #然后启动推上阿里云仓库# 使用 docker 镜像, 部署 controller 到 k8s 集群,会部署成一个deployment make deploy IMG=registry.cn-guangzhou.aliyuncs.com/xingcangku/bendi:v0.8#查询: 默认在system名称空间下 [root@master01 mysql-operator-master]# kubectl get namespace NAME STATUS AGE application-operator-system Active 3d default Active 23d kube-flannel Active 23d kube-node-lease Active 23d kube-public Active 23d kube-system Active 23d monitor Active 22d system Active 36s [root@master01 mysql-operator-master]# kubectl -n system get api/ cmd/ Dockerfile .git/ .golangci.yml go.sum internal/ PROJECT test/ bin/ config/ .dockerignore .gitignore go.mod hack/ Makefile README.md test.yaml [root@master01 mysql-operator-master]# kubectl -n system get deployments.apps NAME READY UP-TO-DATE AVAILABLE AGE controller-manager 1/1 1 1 52s [root@master01 mysql-operator-master]# kubectl -n 
controller-manager get pods No resources found in controller-manager namespace. [root@master01 mysql-operator-master]# kubectl delete -f ./config/samples/apps_v1_mysqlcluster.yaml Error from server (NotFound): error when deleting "./config/samples/apps_v1_mysqlcluster.yaml": mysqlclusters.apps.egonlin.com "mysqlcluster-sample" not found [root@master01 mysql-operator-master]# kubectl apply -f ./config/samples/apps_v1_mysqlcluster.yaml mysqlcluster.apps.egonlin.com/mysqlcluster-sample created [root@master01 mysql-operator-master]# kubectl -n controller-manager get pods No resources found in controller-manager namespace. [root@master01 mysql-operator-master]# kubectl get pods -n system NAME READY STATUS RESTARTS AGE controller-manager-5699b5b476-4ngwd 1/1 Running 0 3m3s# 如果发现pod没有起来可能是存储的问题。项目来面有个文件local-path-provisioner-0.0.29 进入然后再进入deploy这个文件 [root@master01 deploy]# kubectl apply -f local-path-storage.yaml namespace/local-path-storage created serviceaccount/local-path-provisioner-service-account created role.rbac.authorization.k8s.io/local-path-provisioner-role created clusterrole.rbac.authorization.k8s.io/local-path-provisioner-role created rolebinding.rbac.authorization.k8s.io/local-path-provisioner-bind created clusterrolebinding.rbac.authorization.k8s.io/local-path-provisioner-bind created deployment.apps/local-path-provisioner created storageclass.storage.k8s.io/local-path created configmap/local-path-config created [root@master01 deploy]# kubectl get pods NAME READY STATUS RESTARTS AGE axing-zzz-7d5cb7df74-4lbqn 1/1 Running 6 (31m ago) 16d mysql-01 1/1 Running 0 7m50s mysql-02 1/1 Running 0 40s mysql-03 0/1 ContainerCreating 0 30s [root@master01 deploy]# kubectl get pvc NAME STATUS VOLUME CAPACITY ACCESS MODES STORAGECLASS VOLUMEATTRIBUTESCLASS AGE mysql-01 Bound pvc-c4ffa04d-78bc-44e5-9948-8dd23e8197d4 1Gi RWO local-path <unset> 8m4s mysql-02 Bound pvc-9870b7dc-274f-48d9-ab9c-12fdad4ab267 1Gi RWO local-path <unset> 8m4s mysql-03 Bound 
pvc-517035dc-ec28-4733-8d8d-244cce025604 1Gi RWO local-path <unset> 8m4s [root@master01 mysql-operator-master]# kubectl get pod -n system 'NAME READY STATUS RESTARTS AGE controller-manager-5699b5b476-4ngwd 1/1 Running 0 103m [root@master01 mysql-operator-master]# [root@master01 mysql-operator-master]# kubectl -n system get deployments.apps NAME READY UP-TO-DATE AVAILABLE AGE controller-manager 1/1 1 1 103m # 可以看日志的情况 [root@master01 mysql-operator-master]# kubectl -n system logs -f controller-manager-5699b5b476-4ngwd正常最后是会一直更新日志{lamp/}最后问题总结# 启动operator的时候第三个pod无法拉起,一直pending,查看 [root@k8s-node-01 ~]# kubectl describe pod mysql-03 Events: Type Reason Age From Message ---- ------ ---- ---- ------- Warning FailedScheduling 11m (x3 over 17m) default-scheduler 0/3 nodes are available: 1 Insufficient cpu, 1 node(s) had untolerated taint {node.kubernetes.io/disk-pressure: }, 2 Insufficient memory. preemption: 0/3 nodes are available: 1 Preemption is not helpful for scheduling, 2 No preemption victims found for incoming pod. Warning FailedScheduling 89s (x2 over 6m30s) default-scheduler 0/3 nodes are available: 3 node(s) had untolerated taint {node.kubernetes.io/unreachable: }. preemption: 0/3 nodes are available: 3 Preemption is not helpful for scheduling. 
[root@k8s-node-01 ~]# # 报错是磁盘(disk)资源不足,因为我们用的存储卷是local-path-storage,所以会有卷亲和,mysql-03固定调度到卷所在的节点,卷所在的节点为k8s-node-01节点,通过查看也能分析出来 [root@k8s-node-01 ~]# kubectl get pods -o wide NAME READY STATUS RESTARTS AGE IP NODE NOMINATED NODE READINESS GATES mysql-01 1/1 Running 0 18m k8s-master-01 <none> <none> mysql-02 1/1 Running 0 18m k8s-node-02 <none> <none> mysql-03 0/1 Pending 0 18m <none> <none> <none> <none> # 于是去k8s-node-01节点上查看,发现磁盘空间确实占满了,如下先尝试把该节点的一些安装包,/tmp目录,yum缓存,/var/log都清理掉 kubelet的日志轮转也设置了 [root@k8s-node-01 ~]# cat /var/lib/kubelet/kubeadm-flags.env KUBELET_KUBEADM_ARGS="--container-runtime-endpoint=unix:///var/run/containerd/containerd.sock --pod-infra-container-image=registry.cn-hangzhou.aliyuncs.com/google_containers/pause:3.9 --container-log-max-files=2 --container-log-max-size='1Ki'" # 注意:--container-log-max-files=2必须大于1,不能小于或等于1,否则无法启动# go build缓存(/root/.cache)还是别清了,否则make run会花很久时间 # 并且把一些没有用的镜像也清理掉 docker system prune -a nerdctl system prune -a # 作用解释: system prune:这个命令用于清理 Docker 系统,删除不再使用的容器、镜像、网络等资源。 -a(--all):此选项会使命令删除所有未使用的镜像,而不仅仅是无标签的镜像。 运行 docker/nerdctl system prune -a 后,系统会问你是否确认要删除这些资源。确认后,Docker会清理掉停止的容器、未使用的镜像和网络,从而释放磁盘空间。发现空间得到了一定程度的释放查看已删除但仍被占用的文件 当一个文件被删除后,如果有进程仍然在使用它,那么这个文件所占用的空间并不会立即被释放。文件系统的空间使用会显示为已用,但 du 无法检测到这些被删除的文件。 检测被删除但仍然占用的文件 可以使用 lsof 来列出所有仍然被进程占用但已删除的文件。 lsof | grep deleted 如果发现某些文件已经被删除,但仍然被进程占用,可以通过重启相应的进程来释放这些文件占用的空间。 发现一堆这种文件查找该进程,发现就是一个裸启动的mysql进程,无用,可以kill杀掉kill -9 1100
1. mkdir mysqlcluster-operator #创建一个名为 mysqlcluster-operator 的目录。 2. cd mysqlcluster-operator #进入刚刚创建的 mysqlcluster-operator 目录。 3. git init #初始化一个新的 Git 仓库,将当前目录标记为一个 Git 仓库以便进行版本控制。 4. touch README.md #创建一个名为 README.md 的文件,通常用于项目的介绍或说明。 5. git add README.md #将 README.md 文件添加到 Git 的暂存区,准备提交到仓库。 6. git commit -m "first commit" #提交暂存区中的文件,并添加提交信息 "first commit"。 7. git remote add origin https://gitee.com/axzys/mysqlcluster-operator.git #将远程仓库的地址(在 Gitee 上)与当前的本地仓库关联,命名为 origin。 8. git push -u origin "master" #将本地仓库的 master 分支推送到远程仓库的 origin,并设置跟踪(-u 参数表示在未来自动与这个远程分支同步)。mkdir webhook cd webhook git init touch README.md git add README.md git commit -m "first commit" git remote add origin https://gitee.com/axzys/webhook.git git push -u origin "master"[root@master01 mysqlcluster-operator]# git push -u origin "master" Username for 'https://gitee.com': 13143087657 Password for 'https://13143087657@gitee.com': Counting objects: 3, done. Delta compression using up to 8 threads. Compressing objects: 100% (2/2), done. Writing objects: 100% (3/3), 1.64 KiB | 0 bytes/s, done. Total 3 (delta 0), reused 0 (delta 0) remote: Powered by GITEE.COM [1.1.5] remote: Set trace flag 7e4992ae To https://gitee.com/axzys/mysqlcluster-operator.git * [new branch] master -> master Counting objects:统计需要推送的对象数目。 Compressing objects:压缩要推送的对象。 Writing objects:将对象写入远程仓库。 remote: Powered by GITEE.COM:表示已经成功连接到 Gitee 并完成操作。 [new branch] master -> master:本地的 master 分支已经成功推送到远程仓库的 master 分支。git add . #添加修改 git commit -m "Your commit message" #提交修改 git push origin master #推送到远程仓库 #推送到其他分支 git checkout -b <branch_name> #创建并切换到新分支 git push origin <branch_name> #推送到新分支 #回滚到之前的提交并推送(如果需要重置远程仓库的状态) git reset --hard abc1234 #回滚到某个提交(假设提交ID为 abc1234) git push origin master --force #强制推送到远程仓库
一、 准备规划(1)先创建一个名称空间,后续日志相关组件都安装到该名称空间下 kubectl create ns logging(2)环境准备ElasticSearch 安装有最低安装要求,如果安装后 Pod 无法正常启动,请检查是否符合最低要求的配置,要求如下:建议:每台机器cpu调成4c内存>=4G我的是3台master节点,一个node节点,每台都可以参与调度,每台机器都是4G注意:如果使用vmware workstation做实验,你的虚拟机是开机状态,最多只能把虚拟机内存调大到3G,想要调的大一些需要关机后才能调的更大(3)部署规划ES集群有三种角色组成,详细描述与规划如下二、为ES准备持久化存储为了能够持久化Elasticsearch的数据,需要准备一个存储,此处我们使用NFS类型的 StorageClass ,如果你是线上环境建议使用 Local PV 或者 Ceph RBD(1) 安装nfs服务找一台机器192.168.110.101安装nfs服务端systemctl stop firewalld.service systemctl disable firewalld.service # 服务端软件安装: yum install -y nfs-utils rpcbind # 安装nfs-utils和rpcbind两个包 # 创建共享目录 mkdir -p /data/nfs chmod 755 /data/nfs # 配置共享目录 cat > /etc/exports <<EOF /data/nfs *(rw,sync,no_root_squash) EOF *:表示任何人都有权限连接,当然也可以是一个网段,一个 IP,也可以是域名 rw:读写的权限 sync:表示文件同时写入硬盘和内存 no_root_squash:当登录 NFS 主机使用共享目录的使用者是 root 时,保留其 root 权限,不会被映射为匿名使用者(与之相反的 root_squash 才会把 root 转换成匿名使用者,通常其 UID 与 GID 都会变成 nobody 身份) # 启动nfs服务 systemctl start rpcbind.service systemctl enable rpcbind systemctl status rpcbind systemctl start nfs systemctl enable nfs systemctl status nfs # 如下显示则ok $ rpcinfo -p|grep nfs 100003 3 tcp 2049 nfs 100003 4 tcp 2049 nfs 100227 3 tcp 2049 nfs_acl 100003 3 udp 2049 nfs 100227 3 udp 2049 nfs_acl客户端软件安装(在所有node节点安装)yum install -y nfs-utils你可以在客户端宿主机上验证下能不能用,能用的话我们一会再用到pod的pv上➜ showmount -e 192.168.110.101 Export list for 192.168.110.101: /data/nfs * ➜ mount -t nfs 192.168.110.101:/data/nfs /mnt #可千万别挂载到/opt下哈,/opt/cni/bin放着网络插件呢 ➜ touch /mnt/a.txt # 成功后,另外一台客户端上挂载查看验证(2) 搭建StorageClass+NFS官网:https://github.com/kubernetes-sigs/nfs-subdir-external-provisioner➜ helm repo add nfs-subdir-external-provisioner https://kubernetes-sigs.github.io/nfs-subdir-external-provisioner/ ➜ helm upgrade --install nfs-subdir-external-provisioner nfs-subdir-external-provisioner/nfs-subdir-external-provisioner --set nfs.server=192.168.110.101 --set nfs.path=/data/nfs --set storageClass.defaultClass=true -n kube-system # 镜像下载不了的话可以用自己制作的镜像 registry.cn-shanghai.aliyuncs.com/egon-k8s-test/nfs-subdir-external-provisioner:v4.0.2查看helm -n kube-system list查看nfs_provider的pod$ kubectl -n kube-system get pods |grep nfs 
nfs-subdir-external-provisioner-5c7dc6cd57-4mrmx 1/1 Running 0 18s查看sc(已经设置为默认的了)➜ kubectl -n kube-system get sc nfs-client NAME PROVISIONER RECLAIMPOLICY VOLUMEBINDINGMODE ALLOWVOLUMEEXPANSION nfs-client (default) cluster.local/nfs-subdir-external-provisioner Delete三、为ES准备证书文件由于 ElasticSearch 7.x 版本默认安装了 X-Pack 插件,需要我们配置一些安全证书文件。(1)生成证书文件# 运行容器生成证书,containerd下面用nerdctl $ mkdir -p /logging/elastic-certs $ nerdctl run --name elastic-certs \ -v /logging/elastic-certs:/app \ -it \ -w /app \ registry.cn-guangzhou.aliyuncs.com/xingcangku/oooooo:1.0 \ /bin/sh -c " elasticsearch-certutil ca --out /app/elastic-stack-ca.p12 --pass '' && \ elasticsearch-certutil cert --name security-master --dns security-master \ --ca /app/elastic-stack-ca.p12 --pass '' --ca-pass '' --out /app/elastic-certificates.p12 " # 删除容器 $ nerdctl rm -f elastic-certs # 将 PKCS12(Public Key Cryptography Standards #12) 文件中的证书和私钥提取出来,并保存为 PEM 格式的文件 #$ cd /logging/elastic-certs #$ cd elastic-certs && openssl pkcs12 -nodes -passin pass:'' -in elastic-certificates.p12 -out elastic-certificate.pem(2)添加证书到 Kubernetes# 添加证书 $ cd /logging/elastic-certs $ kubectl create secret -n logging generic elastic-certs --from-file=elastic-certificates.p12 # 设置集群用户名密码,用户名为elastic,密码为egon666 $ kubectl create secret -n logging generic elastic-auth --from-literal=username=elastic --from-literal=password=egon666四、安装ES集群首先添加 Elastic 的 Helm 仓库:$ helm repo add elastic https://helm.elastic.co $ helm repo updateElasticSearch 安装需要安装三次,分别安装 Master、Data、Client 节点,Master 节点负责集群间的管理工作;Data 节点负责存储数据;Client 节点负责代理 ElasticSearch Cluster 集群,负载均衡。首先使用 helm pull 拉取 Chart 并解压:$ helm pull elastic/elasticsearch --untar --version 7.17.3 $ cd elasticsearch在 Chart 目录下面创建用于 Master 节点安装配置的 values 文件:(默认自带的values.yaml不用管,我们不用它)# 创建一个新文件:values-master.yaml,内容如下 ## 设置集群名称 clusterName: 'elasticsearch' ## 设置节点名称 nodeGroup: 'master' ## 设置角色 roles: master: 'true' ingest: 'false' data: 'false' # ============ 镜像配置 ============ ## 指定镜像与镜像版本 image: 
'registry.cn-hangzhou.aliyuncs.com/egon-k8s-test/elasticsearch' # 可以用自己的镜像,你的镜像仓库必须是公开的 # image: 'elasticsearch' imageTag: '7.17.3' imagePullPolicy: 'IfNotPresent' ## 副本数 # replicas: 3 # 测试环境资源有限,所以设置为1 replicas: 1 # ============ 资源配置 ============ ## JVM 配置参数 esJavaOpts: '-Xmx1g -Xms1g' ## 部署资源配置(生产环境要设置大些) resources: requests: cpu: '2000m' memory: '2Gi' limits: cpu: '2000m' memory: '2Gi' ## 数据持久卷配置 persistence: enabled: true ## 存储数据大小配置 volumeClaimTemplate: storageClassName: nfs-client accessModes: ['ReadWriteOnce'] resources: requests: storage: 5Gi # ============ 安全配置 ============ ## 设置协议,可配置为 http、https protocol: http ## 证书挂载配置,这里我们挂入上面创建的证书 secretMounts: - name: elastic-certs secretName: elastic-certs path: /usr/share/elasticsearch/config/certs defaultMode: 0755 ## Elasticsearch 配置 esConfig: elasticsearch.yml: | xpack.security.enabled: true xpack.security.transport.ssl.enabled: true xpack.security.transport.ssl.verification_mode: certificate xpack.security.transport.ssl.keystore.path: /usr/share/elasticsearch/config/certs/elastic-certificates.p12 xpack.security.transport.ssl.truststore.path: /usr/share/elasticsearch/config/certs/elastic-certificates.p12 # xpack.security.http.ssl.enabled: true # xpack.security.http.ssl.truststore.path: /usr/share/elasticsearch/config/certs/elastic-certificates.p12 # xpack.security.http.ssl.keystore.path: /usr/share/elasticsearch/config/certs/elastic-certificates.p12 ## 环境变量配置,这里引入上面设置的用户名、密码 secret 文件 extraEnvs: - name: ELASTIC_USERNAME valueFrom: secretKeyRef: name: elastic-auth key: username - name: ELASTIC_PASSWORD valueFrom: secretKeyRef: name: elastic-auth key: password # ============ 调度配置 ============ ## 设置调度策略 ## - hard:只有当有足够的节点时 Pod 才会被调度,并且它们永远不会出现在同一个节点上 ## - soft:尽最大努力调度 antiAffinity: 'soft' # tolerations: # - operator: "Exists" # 容忍全部污点 # 创建新文件:values-client.yaml # ============ 设置集群名称 ============ ## 设置集群名称 clusterName: 'elasticsearch' ## 设置节点名称 nodeGroup: 'client' ## 设置角色 roles: master: 'false' ingest: 'false' 
data: 'false' # ============ 镜像配置 ============ ## 指定镜像与镜像版本 image: 'registry.cn-hangzhou.aliyuncs.com/egon-k8s-test/elasticsearch' # 可以用自己的镜像 # image: 'elasticsearch' imageTag: '7.17.3' ## 副本数 # 测试环境资源有限,所以设置为1吧 replicas: 1 # ============ 资源配置 ============ ## JVM 配置参数 esJavaOpts: '-Xmx1g -Xms1g' ## 部署资源配置(生产环境一定要设置大些) resources: requests: cpu: '1000m' memory: '2Gi' limits: cpu: '1000m' memory: '2Gi' ## 数据持久卷配置 persistence: enabled: false # ============ 安全配置 ============ ## 设置协议,可配置为 http、https protocol: http ## 证书挂载配置,这里我们挂入上面创建的证书 secretMounts: - name: elastic-certs secretName: elastic-certs path: /usr/share/elasticsearch/config/certs ## 自定义配置文件 elasticsearch.yml esConfig: elasticsearch.yml: | xpack.security.enabled: true xpack.security.transport.ssl.enabled: true xpack.security.transport.ssl.verification_mode: certificate xpack.security.transport.ssl.keystore.path: /usr/share/elasticsearch/config/certs/elastic-certificates.p12 xpack.security.transport.ssl.truststore.path: /usr/share/elasticsearch/config/certs/elastic-certificates.p12 # xpack.security.http.ssl.enabled: true # xpack.security.http.ssl.truststore.path: /usr/share/elasticsearch/config/certs/elastic-certificates.p12 # xpack.security.http.ssl.keystore.path: /usr/share/elasticsearch/config/certs/elastic-certificates.p12 ## 环境变量配置,引入上面设置的用户名、密码 secret 文件 extraEnvs: - name: ELASTIC_USERNAME valueFrom: secretKeyRef: name: elastic-auth key: username - name: ELASTIC_PASSWORD valueFrom: secretKeyRef: name: elastic-auth key: password # ============ Service 配置 ============ service: type: NodePort nodePort: '30200'我的集群资源不足,我将上述三个yaml副本都设置成了1现在用上面的 values 文件来安装:(切换到elasticsearch的chart目录下)[root@master01 ~]# cd /logging/elasticsearch/ [root@master01 /logging/elasticsearch]# ls Chart.yaml Makefile templates values-data.yaml values.yaml examples README.md values-client.yaml values-master.yaml # --------------->>>>>>>>>>> 注意install指定的release名字要不同哦 # helm install 你起的release名 你的chart包的路径 -f xxx.yaml --namespace yyy # 
如果是升级安装则用:helm upgrade --install 你起的release名 你的chart包的路径 -f values-master.yaml --namespace logging . cd elasticsearch/ # 安装 master 节点 helm install es-master ./ -f values-master.yaml --namespace logging # 安装 data 节点 helm install es-data ./ -f values-data.yaml --namespace logging # 安装 client 节点 helm install es-client ./ -f values-client.yaml --namespace logging # 升级操作 #$ helm upgrade --install es-master -f values-master.yaml --namespace logging . #$ helm upgrade --install es-data -f values-data.yaml --namespace logging . #$ helm upgrade --install es-client -f values-client.yaml --namespace logging .如果你的es-master设置的replicas副本数为3,那么在安装 Master 节点后 Pod 启动时候会抛出异 常,就绪探针探活失败, $ kubectl -n logging describe pod elasticsearch-master-0 Warning Unhealthy 61s kubelet Readiness probe failed: Waiting for elasticsearch cluster to become ready (request params: "wait_for_status=green&timeout=1s" ) Cluster is not yet ready (request params: "wait_for_status=green&timeout=1s" ) 这是个正常现象。在执行安装 Data 节点后 Master 节点 Pod 就会恢复正常。 [root@master01 /logging/elasticsearch]# kubectl -n logging get pods -w NAME READY STATUS RESTARTS AGE elasticsearch-master-0 1/1 Running 0 2m17s elasticsearch-master-1 1/1 Running 0 2m13s elasticsearch-master-2 1/1 Running 0 2m20s 此外,如果因为资源原因导致出现pending状态的pod,请根据describe信息扩容cpu或内存资源查看master(建议3副本,但是我的资源有限,我就启了一个副本)[root@master01 /logging/elasticsearch]# kubectl -n logging get pods -w NAME READY STATUS RESTARTS AGE elasticsearch-client-0 1/1 Running 0 7m17s elasticsearch-data-0 1/1 Running 0 7m21s elasticsearch-master-0 1/1 Running 0 9m16s一会访问es就用用该svc[root@master01 harbor]# kubectl -n logging get svc NAME TYPE CLUSTER-IP EXTERNAL-IP PORT(S) AGE elasticsearch-client NodePort <none> 9200:30200/TCP,9300:30969/TCP 23h五、安装Kibanahelm pull elastic/kibana --untar --version 7.17.3 # 下载并解压chart包 cd kibana创建用于安装 Kibana 的 values 文件:# 创建全新文件:values-prod.yaml # ============ 镜像配置 ============ ## 指定镜像与镜像版本 image: 'registry.cn-hangzhou.aliyuncs.com/egon-k8s-test/kibana' # image: 
'docker.elastic.co/kibana/kibana' imageTag: '7.17.3' imagePullPolicy: "IfNotPresent" # ============ ElasticSearch 配置 ============ ## 配置 ElasticSearch 地址,主要使用 es-client 的 SVC elasticsearchHosts: 'http://elasticsearch-client:9200' # ============ 环境变量配置 ============ ## 引入用户名、密码的 secret 文件 extraEnvs: - name: 'ELASTICSEARCH_USERNAME' valueFrom: secretKeyRef: name: elastic-auth key: username - name: 'ELASTICSEARCH_PASSWORD' valueFrom: secretKeyRef: name: elastic-auth key: password # ============ 资源配置 ============ resources: requests: cpu: '500m' memory: '1Gi' limits: cpu: '500m' memory: '1Gi' # ============ Kibana 参数配置 ============ ## 添加语言配置,设置 Kibana 为中文 kibanaConfig: kibana.yml: | i18n.locale: "zh-CN" server.publicBaseUrl: "" # 将地址改为你访问 Kibana 的地址,不能以 / 结尾 # ============ Service 配置 ============ service: type: NodePort nodePort: '30601'部署helm install kibana -f values-prod.yaml --namespace logging .部署完后查看[root@master01 /]# kubectl -n logging get pods -w NAME READY STATUS RESTARTS AGE elasticsearch-client-0 0/1 Running 0 2m3s elasticsearch-data-0 1/1 Running 0 4m35s elasticsearch-master-0 1/1 Running 0 4m45s elasticsearch-client-0 1/1 Running 0 2m20s上面我们安装 Kibana 的时候指定了 30601 的 NodePort 端口,所以我们可以从任意节点http://IP:30601 来访问 Kibana。我们可以看到会跳转到登录页面,让我们输入用户名、密码,这里我们输入上面配置的用户名elastic 、密码 egon666 进行登录。登录成功后进入如下所示的 Kibana 主页(点击自己浏览):六、安装Fluentd来作为日志收集工具(1)安装fluentd要想在k8s每个物理节点都能采集到数据,我们可以直接用 DaemonSet 控制器来部署 Fluentd 应用,确保在集群中的每个节点上始终运行一个 Fluentd 容器可以直接使用 Helm 来进行一键安装,为了能够了解更多实现细节,我们这里还是采用手动方法来进行安装。官网部署参考:https://docs.fluentd.org/container-deployment/kubernetes。
Harbor 是一个主流的镜像仓库系统,在 v1.6 版本以后的 harbor 中新增加了 helm charts 的管理功能,可以存储Chart文件。 其实在Harbor 2.8+版本中,Helm Chart支持已经转移到了OCI(Open Container Initiative)格式。这意味着你需要使用OCI形式来上传和管理你的Helm Chart(不需要像网上一样,去为harbor开启chart仓库支持)一、安装一个nfs存储,提供一个sc默认存储类# 1、安装 helm repo add nfs-subdir-external-provisioner https://kubernetes-sigs.github.io/nfs-subdir-external-provisioner/ helm upgrade --install nfs-subdir-external-provisioner nfs-subdir-external-provisioner/nfs-subdir-external-provisioner --set nfs.server= --set nfs.path=/data/nfs --set storageClass.defaultClass=true -n kube-system # 2、查看 helm -n kube-system list # 3、查看nfs_provider的pod kubectl -n kube-system get pods |grep nfs nfs-subdir-external-provisioner-797c875548-rt4dh 1/1 Running 2 (58m ago) 23h # 4、查看sc(已经设置为默认的了) kubectl -n kube-system get sc nfs-client NAME PROVISIONER RECLAIMPOLICY VOLUMEBINDINGMODE ALLOWVOLUMEEXPANSION AGE nfs-client (default) cluster.local/nfs-subdir-external-provisioner Delete Immediate true 23h 二、添加仓库地址helm repo add harbor https://helm.goharbor.io helm repo list三、下载Chart包到本地因为需要修改的参数比较多,在命令行直接helm install比较复杂,我就将Chart包下载到本地,再修改一些配置,这样比较直观,也比较符合实际工作中的业务环境。helm pull harbor/harbor # 下载Chart包 tar zxvf harbor-1.14.2.tgz # 解压包四、修改values.yamlexpose: # Set how to expose the service. Set the type as "ingress", "clusterIP", "nodePort" or "loadBalancer" # and fill the information in the corresponding section type: nodePort tls: # Enable TLS or not. # Delete the "ssl-redirect" annotations in "expose.ingress.annotations" when TLS is disabled and "expose.type" is "ingress" # Note: if the "expose.type" is "ingress" and TLS is disabled, # the port must be included in the command when pulling/pushing images. # Refer to https://github.com/goharbor/harbor/issues/5291 for details. enabled: false # The source of the tls certificate. 
Set as "auto", "secret" # or "none" and fill the information in the corresponding section # 1) auto: generate the tls certificate automatically # 2) secret: read the tls certificate from the specified secret. # The tls certificate can be generated manually or by cert manager # 3) none: configure no tls certificate for the ingress. If the default # tls certificate is configured in the ingress controller, choose this option certSource: auto auto: # The common name used to generate the certificate, it's necessary # when the type isn't "ingress" commonName: "" secret: # The name of secret which contains keys named: # "tls.crt" - the certificate # "tls.key" - the private key secretName: "" ingress: hosts: core: core.harbor.domain # set to the type of ingress controller if it has specific requirements. # leave as `default` for most ingress controllers. # set to `gce` if using the GCE ingress controller # set to `ncp` if using the NCP (NSX-T Container Plugin) ingress controller # set to `alb` if using the ALB ingress controller # set to `f5-bigip` if using the F5 BIG-IP ingress controller controller: default ## Allow .Capabilities.KubeVersion.Version to be overridden while creating ingress kubeVersionOverride: "" className: "" annotations: # note different ingress controllers may require a different ssl-redirect annotation # for Envoy, use ingress.kubernetes.io/force-ssl-redirect: "true" and remove the nginx lines below ingress.kubernetes.io/ssl-redirect: "true" ingress.kubernetes.io/proxy-body-size: "0" nginx.ingress.kubernetes.io/ssl-redirect: "true" nginx.ingress.kubernetes.io/proxy-body-size: "0" # ingress-specific labels labels: {} clusterIP: # The name of ClusterIP service name: harbor # The ip address of the ClusterIP service (leave empty for acquiring dynamic ip) staticClusterIP: "" ports: # The service port Harbor listens on when serving HTTP httpPort: 80 # The service port Harbor listens on when serving HTTPS httpsPort: 443 # Annotations on the ClusterIP service 
annotations: {} # ClusterIP-specific labels labels: {} nodePort: # The name of NodePort service name: harbor ports: http: # The service port Harbor listens on when serving HTTP port: 80 # The node port Harbor listens on when serving HTTP nodePort: 30002 https: # The service port Harbor listens on when serving HTTPS port: 443 # The node port Harbor listens on when serving HTTPS nodePort: 30003 # Annotations on the nodePort service annotations: {} # nodePort-specific labels labels: {} loadBalancer: # The name of LoadBalancer service name: harbor # Set the IP if the LoadBalancer supports assigning IP IP: "" ports: # The service port Harbor listens on when serving HTTP httpPort: 80 # The service port Harbor listens on when serving HTTPS httpsPort: 443 # Annotations on the loadBalancer service annotations: {} # loadBalancer-specific labels labels: {} sourceRanges: [] # The external URL for Harbor core service. It is used to # 1) populate the docker/helm commands showed on portal # 2) populate the token service URL returned to docker client # # Format: protocol://domain[:port]. Usually: # 1) if "expose.type" is "ingress", the "domain" should be # the value of "expose.ingress.hosts.core" # 2) if "expose.type" is "clusterIP", the "domain" should be # the value of "expose.clusterIP.name" # 3) if "expose.type" is "nodePort", the "domain" should be # the IP address of k8s node # # If Harbor is deployed behind the proxy, set it as the URL of proxy externalURL: # The persistence is enabled by default and a default StorageClass # is needed in the k8s cluster to provision volumes dynamically. # Specify another StorageClass in the "storageClass" or set "existingClaim" # if you already have existing persistent volumes to use # # For storing images and charts, you can also use "azure", "gcs", "s3", # "swift" or "oss". Set it in the "imageChartStorage" section persistence: enabled: true # Setting it to "keep" to avoid removing PVCs during a helm delete # operation. 
Leaving it empty will delete PVCs after the chart deleted # (this does not apply for PVCs that are created for internal database # and redis components, i.e. they are never deleted automatically) resourcePolicy: "keep" persistentVolumeClaim: registry: # Use the existing PVC which must be created manually before bound, # and specify the "subPath" if the PVC is shared with other components existingClaim: "" # Specify the "storageClass" used to provision the volume. Or the default # StorageClass will be used (the default). # Set it to "-" to disable dynamic provisioning storageClass: "nfs-client" subPath: "" accessMode: ReadWriteMany size: 5Gi annotations: {} jobservice: jobLog: existingClaim: "" storageClass: "nfs-client" subPath: "" accessMode: ReadWriteMany size: 1Gi annotations: {} # If external database is used, the following settings for database will # be ignored database: existingClaim: "" storageClass: "nfs-client" subPath: "" accessMode: ReadWriteMany size: 1Gi annotations: {} # If external Redis is used, the following settings for Redis will # be ignored redis: existingClaim: "" storageClass: "nfs-client" subPath: "" accessMode: ReadWriteMany size: 1Gi annotations: {} trivy: existingClaim: "" storageClass: "" subPath: "" accessMode: ReadWriteMany size: 5Gi annotations: {} # Define which storage backend is used for registry to store # images and charts. Refer to # https://github.com/distribution/distribution/blob/main/docs/content/about/configuration.md#storage # for the detail. imageChartStorage: # Specify whether to disable `redirect` for images and chart storage, for # backends which not supported it (such as using minio for `s3` storage type), please disable # it. To disable redirects, simply set `disableredirect` to `true` instead. # Refer to # https://github.com/distribution/distribution/blob/main/docs/configuration.md#redirect # for the detail. 
disableredirect: false # Specify the "caBundleSecretName" if the storage service uses a self-signed certificate. # The secret must contain keys named "ca.crt" which will be injected into the trust store # of registry's containers. # caBundleSecretName: # Specify the type of storage: "filesystem", "azure", "gcs", "s3", "swift", # "oss" and fill the information needed in the corresponding section. The type # must be "filesystem" if you want to use persistent volumes for registry type: filesystem filesystem: rootdirectory: /storage #maxthreads: 100 azure: accountname: accountname accountkey: base64encodedaccountkey container: containername #realm: core.windows.net # To use existing secret, the key must be AZURE_STORAGE_ACCESS_KEY existingSecret: "" gcs: bucket: bucketname # The base64 encoded json file which contains the key encodedkey: base64-encoded-json-key-file #rootdirectory: /gcs/object/name/prefix #chunksize: "5242880" # To use existing secret, the key must be GCS_KEY_DATA existingSecret: "" useWorkloadIdentity: false s3: # Set an existing secret for S3 accesskey and secretkey # keys in the secret should be REGISTRY_STORAGE_S3_ACCESSKEY and REGISTRY_STORAGE_S3_SECRETKEY for registry #existingSecret: "" region: us-west-1 bucket: bucketname #accesskey: awsaccesskey #secretkey: awssecretkey #regionendpoint: http://myobjects.local #encrypt: false #keyid: mykeyid #secure: true #skipverify: false #v4auth: true #chunksize: "5242880" #rootdirectory: /s3/object/name/prefix #storageclass: STANDARD #multipartcopychunksize: "33554432" #multipartcopymaxconcurrency: 100 #multipartcopythresholdsize: "33554432" swift: authurl: https://storage.myprovider.com/v3/auth username: username password: password container: containername # keys in existing secret must be REGISTRY_STORAGE_SWIFT_PASSWORD, REGISTRY_STORAGE_SWIFT_SECRETKEY, REGISTRY_STORAGE_SWIFT_ACCESSKEY existingSecret: "" #region: fr #tenant: tenantname #tenantid: tenantid #domain: domainname #domainid: domainid #trustid: 
trustid #insecureskipverify: false #chunksize: 5M #prefix: #secretkey: secretkey #accesskey: accesskey #authversion: 3 #endpointtype: public #tempurlcontainerkey: false #tempurlmethods: oss: accesskeyid: accesskeyid accesskeysecret: accesskeysecret region: regionname bucket: bucketname # key in existingSecret must be REGISTRY_STORAGE_OSS_ACCESSKEYSECRET existingSecret: "" #endpoint: endpoint #internal: false #encrypt: false #secure: true #chunksize: 10M #rootdirectory: rootdirectory # The initial password of Harbor admin. Change it from portal after launching Harbor # or give an existing secret for it # key in secret is given via (default to HARBOR_ADMIN_PASSWORD) # existingSecretAdminPassword: existingSecretAdminPasswordKey: HARBOR_ADMIN_PASSWORD harborAdminPassword: "Harbor12345" # The internal TLS used for harbor components secure communicating. In order to enable https # in each component tls cert files need to provided in advance. internalTLS: # If internal TLS enabled enabled: false # enable strong ssl ciphers (default: false) strong_ssl_ciphers: false # There are three ways to provide tls # 1) "auto" will generate cert automatically # 2) "manual" need provide cert file manually in following value # 3) "secret" internal certificates from secret certSource: "auto" # The content of trust ca, only available when `certSource` is "manual" trustCa: "" # core related cert configuration core: # secret name for core's tls certs secretName: "" # Content of core's TLS cert file, only available when `certSource` is "manual" crt: "" # Content of core's TLS key file, only available when `certSource` is "manual" key: "" # jobservice related cert configuration jobservice: # secret name for jobservice's tls certs secretName: "" # Content of jobservice's TLS key file, only available when `certSource` is "manual" crt: "" # Content of jobservice's TLS key file, only available when `certSource` is "manual" key: "" # registry related cert configuration registry: # secret name for 
registry's tls certs secretName: "" # Content of registry's TLS key file, only available when `certSource` is "manual" crt: "" # Content of registry's TLS key file, only available when `certSource` is "manual" key: "" # portal related cert configuration portal: # secret name for portal's tls certs secretName: "" # Content of portal's TLS key file, only available when `certSource` is "manual" crt: "" # Content of portal's TLS key file, only available when `certSource` is "manual" key: "" # trivy related cert configuration trivy: # secret name for trivy's tls certs secretName: "" # Content of trivy's TLS key file, only available when `certSource` is "manual" crt: "" # Content of trivy's TLS key file, only available when `certSource` is "manual" key: "" ipFamily: # ipv6Enabled set to true if ipv6 is enabled in cluster, currently it affected the nginx related component ipv6: enabled: true # ipv4Enabled set to true if ipv4 is enabled in cluster, currently it affected the nginx related component ipv4: enabled: true imagePullPolicy: IfNotPresent # Use this set to assign a list of default pullSecrets imagePullSecrets: # - name: docker-registry-secret # - name: internal-registry-secret # The update strategy for deployments with persistent volumes(jobservice, registry): "RollingUpdate" or "Recreate" # Set it as "Recreate" when "RWM" for volumes isn't supported updateStrategy: type: RollingUpdate # debug, info, warning, error or fatal logLevel: info # The name of the secret which contains key named "ca.crt". Setting this enables the # download link on portal to download the CA certificate when the certificate isn't # generated automatically caSecretName: "" # The secret key used for encryption. Must be a string of 16 chars. 
secretKey: "not-a-secure-key" # If using existingSecretSecretKey, the key must be secretKey existingSecretSecretKey: "" # The proxy settings for updating trivy vulnerabilities from the Internet and replicating # artifacts from/to the registries that cannot be reached directly proxy: httpProxy: httpsProxy: noProxy: 127.0.0.1,localhost,.local,.internal components: - core - jobservice - trivy # Run the migration job via helm hook enableMigrateHelmHook: false # The custom ca bundle secret, the secret must contain key named "ca.crt" # which will be injected into the trust store for core, jobservice, registry, trivy components # caBundleSecretName: "" ## UAA Authentication Options # If you're using UAA for authentication behind a self-signed # certificate you will need to provide the CA Cert. # Set uaaSecretName below to provide a pre-created secret that # contains a base64 encoded CA Certificate named `ca.crt`. # uaaSecretName: metrics: enabled: true core: path: /metrics port: 8001 registry: path: /metrics port: 8001 jobservice: path: /metrics port: 8001 exporter: path: /metrics port: 8001 ## Create prometheus serviceMonitor to scrape harbor metrics. ## This requires the monitoring.coreos.com/v1 CRD. Please see ## https://github.com/prometheus-operator/prometheus-operator/blob/main/Documentation/user-guides/getting-started.md ## serviceMonitor: enabled: false additionalLabels: {} # Scrape interval. If not set, the Prometheus default scrape interval is used. interval: "" # Metric relabel configs to apply to samples before ingestion. metricRelabelings: [] # - action: keep # regex: 'kube_(daemonset|deployment|pod|namespace|node|statefulset).+' # sourceLabels: [__name__] # Relabel configs to apply to samples before ingestion. 
relabelings: [] # - sourceLabels: [__meta_kubernetes_pod_node_name] # separator: ; # regex: ^(.*)$ # targetLabel: nodename # replacement: $1 # action: replace trace: enabled: false # trace provider: jaeger or otel # jaeger should be 1.26+ provider: jaeger # set sample_rate to 1 if you wanna sampling 100% of trace data; set 0.5 if you wanna sampling 50% of trace data, and so forth sample_rate: 1 # namespace used to differentiate different harbor services # namespace: # attributes is a key value dict contains user defined attributes used to initialize trace provider # attributes: # application: harbor jaeger: # jaeger supports two modes: # collector mode(uncomment endpoint and uncomment username, password if needed) # agent mode(uncomment agent_host and agent_port) endpoint: http://hostname:14268/api/traces # username: # password: # agent_host: hostname # export trace data by jaeger.thrift in compact mode # agent_port: 6831 otel: endpoint: hostname:4318 url_path: /v1/traces compression: false insecure: true # timeout is in seconds timeout: 10 # cache layer configurations # if this feature enabled, harbor will cache the resource # `project/project_metadata/repository/artifact/manifest` in the redis # which help to improve the performance of high concurrent pulling manifest. cache: # default is not enabled. enabled: false # default keep cache for one day. 
expireHours: 24 ## set Container Security Context to comply with PSP restricted policy if necessary ## each of the conatiner will apply the same security context ## containerSecurityContext:{} is initially an empty yaml that you could edit it on demand, we just filled with a common template for convenience containerSecurityContext: privileged: false allowPrivilegeEscalation: false seccompProfile: type: RuntimeDefault runAsNonRoot: true capabilities: drop: - ALL # If service exposed via "ingress", the Nginx will not be used nginx: image: repository: registry.cn-guangzhou.aliyuncs.com/xingcangku/nginx-photon tag: v2.11.1 # set the service account to be used, default if left empty serviceAccountName: "" # mount the service account token automountServiceAccountToken: false replicas: 1 revisionHistoryLimit: 10 # resources: # requests: # memory: 256Mi # cpu: 100m extraEnvVars: [] nodeSelector: {} tolerations: [] affinity: {} # Spread Pods across failure-domains like regions, availability zones or nodes topologySpreadConstraints: [] # - maxSkew: 1 # topologyKey: topology.kubernetes.io/zone # nodeTaintsPolicy: Honor # whenUnsatisfiable: DoNotSchedule ## Additional deployment annotations podAnnotations: {} ## Additional deployment labels podLabels: {} ## The priority class to run the pod as priorityClassName: portal: image: repository: registry.cn-guangzhou.aliyuncs.com/xingcangku/harbor-portal tag: v2.11.1 # set the service account to be used, default if left empty serviceAccountName: "" # mount the service account token automountServiceAccountToken: false replicas: 1 revisionHistoryLimit: 10 # resources: # requests: # memory: 256Mi # cpu: 100m extraEnvVars: [] nodeSelector: {} tolerations: [] affinity: {} # Spread Pods across failure-domains like regions, availability zones or nodes topologySpreadConstraints: [] # - maxSkew: 1 # topologyKey: topology.kubernetes.io/zone # nodeTaintsPolicy: Honor # whenUnsatisfiable: DoNotSchedule ## Additional deployment annotations 
podAnnotations: {} ## Additional deployment labels podLabels: {} ## Additional service annotations serviceAnnotations: {} ## The priority class to run the pod as priorityClassName: # containers to be run before the controller's container starts. initContainers: [] # Example: # # - name: wait # image: busybox # command: [ 'sh', '-c', "sleep 20" ] core: image: repository: registry.cn-guangzhou.aliyuncs.com/xingcangku/harbor-core tag: v2.11.1 # set the service account to be used, default if left empty serviceAccountName: "" # mount the service account token automountServiceAccountToken: false replicas: 1 revisionHistoryLimit: 10 ## Startup probe values startupProbe: enabled: true initialDelaySeconds: 10 # resources: # requests: # memory: 256Mi # cpu: 100m extraEnvVars: [] nodeSelector: {} tolerations: [] affinity: {} # Spread Pods across failure-domains like regions, availability zones or nodes topologySpreadConstraints: [] # - maxSkew: 1 # topologyKey: topology.kubernetes.io/zone # nodeTaintsPolicy: Honor # whenUnsatisfiable: DoNotSchedule ## Additional deployment annotations podAnnotations: {} ## Additional deployment labels podLabels: {} ## Additional service annotations serviceAnnotations: {} ## The priority class to run the pod as priorityClassName: # containers to be run before the controller's container starts. initContainers: [] # Example: # # - name: wait # image: busybox # command: [ 'sh', '-c', "sleep 20" ] ## User settings configuration json string configureUserSettings: # The provider for updating project quota(usage), there are 2 options, redis or db. # By default it is implemented by db but you can configure it to redis which # can improve the performance of high concurrent pushing to the same project, # and reduce the database connections spike and occupies. 
# Using redis will bring up some delay for quota usage updation for display, so only # suggest switch provider to redis if you were ran into the db connections spike around # the scenario of high concurrent pushing to same project, no improvment for other scenes. quotaUpdateProvider: db # Or redis # Secret is used when core server communicates with other components. # If a secret key is not specified, Helm will generate one. Alternatively set existingSecret to use an existing secret # Must be a string of 16 chars. secret: "" # Fill in the name of a kubernetes secret if you want to use your own # If using existingSecret, the key must be secret existingSecret: "" # Fill the name of a kubernetes secret if you want to use your own # TLS certificate and private key for token encryption/decryption. # The secret must contain keys named: # "tls.key" - the private key # "tls.crt" - the certificate secretName: "" # If not specifying a preexisting secret, a secret can be created from tokenKey and tokenCert and used instead. # If none of secretName, tokenKey, and tokenCert are specified, an ephemeral key and certificate will be autogenerated. # tokenKey and tokenCert must BOTH be set or BOTH unset. # The tokenKey value is formatted as a multiline string containing a PEM-encoded RSA key, indented one more than tokenKey on the following line. tokenKey: | # If tokenKey is set, the value of tokenCert must be set as a PEM-encoded certificate signed by tokenKey, and supplied as a multiline string, indented one more than tokenCert on the following line. tokenCert: | # The XSRF key. Will be generated automatically if it isn't specified xsrfKey: "" # If using existingSecret, the key is defined by core.existingXsrfSecretKey existingXsrfSecret: "" # If using existingSecret, the key existingXsrfSecretKey: CSRF_KEY # The time duration for async update artifact pull_time and repository # pull_count, the unit is second. Will be 10 seconds if it isn't set. # eg. 
artifactPullAsyncFlushDuration: 10 artifactPullAsyncFlushDuration: gdpr: deleteUser: false auditLogsCompliant: false jobservice: image: repository: goharbor/harbor-jobservice tag: v2.11.1 # set the service account to be used, default if left empty serviceAccountName: "" # mount the service account token automountServiceAccountToken: false replicas: 1 revisionHistoryLimit: 10 # resources: # requests: # memory: 256Mi # cpu: 100m extraEnvVars: [] nodeSelector: {} tolerations: [] affinity: {} # Spread Pods across failure-domains like regions, availability zones or nodes topologySpreadConstraints: # - maxSkew: 1 # topologyKey: topology.kubernetes.io/zone # nodeTaintsPolicy: Honor # whenUnsatisfiable: DoNotSchedule ## Additional deployment annotations podAnnotations: {} ## Additional deployment labels podLabels: {} ## The priority class to run the pod as priorityClassName: # containers to be run before the controller's container starts. initContainers: [] # Example: # # - name: wait # image: busybox # command: [ 'sh', '-c', "sleep 20" ] maxJobWorkers: 10 # The logger for jobs: "file", "database" or "stdout" jobLoggers: - file # - database # - stdout # The jobLogger sweeper duration (ignored if `jobLogger` is `stdout`) loggerSweeperDuration: 14 #days notification: webhook_job_max_retry: 3 webhook_job_http_client_timeout: 3 # in seconds reaper: # the max time to wait for a task to finish, if unfinished after max_update_hours, the task will be mark as error, but the task will continue to run, default value is 24 max_update_hours: 24 # the max time for execution in running state without new task created max_dangling_hours: 168 # Secret is used when job service communicates with other components. # If a secret key is not specified, Helm will generate one. # Must be a string of 16 chars. 
secret: "" # Use an existing secret resource existingSecret: "" # Key within the existing secret for the job service secret existingSecretKey: JOBSERVICE_SECRET registry: registry: image: repository: goharbor/registry-photon tag: v2.11.1 # resources: # requests: # memory: 256Mi # cpu: 100m extraEnvVars: [] controller: image: repository: registry.cn-guangzhou.aliyuncs.com/xingcangku/harbor-registryctl tag: v2.11.1 # resources: # requests: # memory: 256Mi # cpu: 100m extraEnvVars: [] # set the service account to be used, default if left empty serviceAccountName: "" # mount the service account token automountServiceAccountToken: false replicas: 1 revisionHistoryLimit: 10 nodeSelector: {} tolerations: [] affinity: {} # Spread Pods across failure-domains like regions, availability zones or nodes topologySpreadConstraints: [] # - maxSkew: 1 # topologyKey: topology.kubernetes.io/zone # nodeTaintsPolicy: Honor # whenUnsatisfiable: DoNotSchedule ## Additional deployment annotations podAnnotations: {} ## Additional deployment labels podLabels: {} ## The priority class to run the pod as priorityClassName: # containers to be run before the controller's container starts. initContainers: [] # Example: # # - name: wait # image: busybox # command: [ 'sh', '-c', "sleep 20" ] # Secret is used to secure the upload state from client # and registry storage backend. # See: https://github.com/distribution/distribution/blob/main/docs/configuration.md#http # If a secret key is not specified, Helm will generate one. # Must be a string of 16 chars. secret: "" # Use an existing secret resource existingSecret: "" # Key within the existing secret for the registry service secret existingSecretKey: REGISTRY_HTTP_SECRET # If true, the registry returns relative URLs in Location headers. The client is responsible for resolving the correct URL. 
relativeurls: false credentials: username: "harbor_registry_user" password: "harbor_registry_password" # If using existingSecret, the key must be REGISTRY_PASSWD and REGISTRY_HTPASSWD existingSecret: "" # Login and password in htpasswd string format. Excludes `registry.credentials.username` and `registry.credentials.password`. May come in handy when integrating with tools like argocd or flux. This allows the same line to be generated each time the template is rendered, instead of the `htpasswd` function from helm, which generates different lines each time because of the salt. # htpasswdString: $apr1$XLefHzeG$Xl4.s00sMSCCcMyJljSZb0 # example string htpasswdString: "" middleware: enabled: false type: cloudFront cloudFront: baseurl: example.cloudfront.net keypairid: KEYPAIRID duration: 3000s ipfilteredby: none # The secret key that should be present is CLOUDFRONT_KEY_DATA, which should be the encoded private key # that allows access to CloudFront privateKeySecret: "my-secret" # enable purge _upload directories upload_purging: enabled: true # remove files in _upload directories which exist for a period of time, default is one week. 
age: 168h # the interval of the purge operations interval: 24h dryrun: false trivy: # enabled the flag to enable Trivy scanner enabled: true image: # repository the repository for Trivy adapter image repository: registry.cn-guangzhou.aliyuncs.com/xingcangku/adapter-photon # tag the tag for Trivy adapter image tag: v2.11.1 # set the service account to be used, default if left empty serviceAccountName: "" # mount the service account token automountServiceAccountToken: false # replicas the number of Pod replicas replicas: 1 resources: requests: cpu: 200m memory: 512Mi limits: cpu: 1 memory: 1Gi extraEnvVars: [] nodeSelector: {} tolerations: [] affinity: {} # Spread Pods across failure-domains like regions, availability zones or nodes topologySpreadConstraints: [] # - maxSkew: 1 # topologyKey: topology.kubernetes.io/zone # nodeTaintsPolicy: Honor # whenUnsatisfiable: DoNotSchedule ## Additional deployment annotations podAnnotations: {} ## Additional deployment labels podLabels: {} ## The priority class to run the pod as priorityClassName: # containers to be run before the controller's container starts. initContainers: [] # Example: # # - name: wait # image: busybox # command: [ 'sh', '-c', "sleep 20" ] # debugMode the flag to enable Trivy debug mode with more verbose scanning log debugMode: false # vulnType a comma-separated list of vulnerability types. Possible values are `os` and `library`. vulnType: "os,library" # severity a comma-separated list of severities to be checked severity: "UNKNOWN,LOW,MEDIUM,HIGH,CRITICAL" # ignoreUnfixed the flag to display only fixed vulnerabilities ignoreUnfixed: false # insecure the flag to skip verifying registry certificate insecure: false # gitHubToken the GitHub access token to download Trivy DB # # Trivy DB contains vulnerability information from NVD, Red Hat, and many other upstream vulnerability databases. 
# It is downloaded by Trivy from the GitHub release page https://github.com/aquasecurity/trivy-db/releases and cached # in the local file system (`/home/scanner/.cache/trivy/db/trivy.db`). In addition, the database contains the update # timestamp so Trivy can detect whether it should download a newer version from the Internet or use the cached one. # Currently, the database is updated every 12 hours and published as a new release to GitHub. # # Anonymous downloads from GitHub are subject to the limit of 60 requests per hour. Normally such rate limit is enough # for production operations. If, for any reason, it's not enough, you could increase the rate limit to 5000 # requests per hour by specifying the GitHub access token. For more details on GitHub rate limiting please consult # https://developer.github.com/v3/#rate-limiting # # You can create a GitHub token by following the instructions in # https://help.github.com/en/github/authenticating-to-github/creating-a-personal-access-token-for-the-command-line gitHubToken: "" # skipUpdate the flag to disable Trivy DB downloads from GitHub # # You might want to set the value of this flag to `true` in test or CI/CD environments to avoid GitHub rate limiting issues. # If the value is set to `true` you have to manually download the `trivy.db` file and mount it in the # `/home/scanner/.cache/trivy/db/trivy.db` path. skipUpdate: false # skipJavaDBUpdate If the flag is enabled you have to manually download the `trivy-java.db` file and mount it in the # `/home/scanner/.cache/trivy/java-db/trivy-java.db` path # skipJavaDBUpdate: false # The offlineScan option prevents Trivy from sending API requests to identify dependencies. # # Scanning JAR files and pom.xml may require Internet access for better detection, but this option tries to avoid it. # For example, the offline mode will not try to resolve transitive dependencies in pom.xml when the dependency doesn't # exist in the local repositories. 
It means a number of detected vulnerabilities might be fewer in offline mode. # It would work if all the dependencies are in local. # This option doesn’t affect DB download. You need to specify skipUpdate as well as offlineScan in an air-gapped environment. offlineScan: false # Comma-separated list of what security issues to detect. Possible values are `vuln`, `config` and `secret`. Defaults to `vuln`. securityCheck: "vuln" # The duration to wait for scan completion timeout: 5m0s database: # if external database is used, set "type" to "external" # and fill the connection information in "external" section type: internal internal: image: repository: goharbor/harbor-db tag: v2.11.1 # set the service account to be used, default if left empty serviceAccountName: "" # mount the service account token automountServiceAccountToken: false # resources: # requests: # memory: 256Mi # cpu: 100m # The timeout used in livenessProbe; 1 to 5 seconds livenessProbe: timeoutSeconds: 1 # The timeout used in readinessProbe; 1 to 5 seconds readinessProbe: timeoutSeconds: 1 extraEnvVars: [] nodeSelector: {} tolerations: [] affinity: {} ## The priority class to run the pod as priorityClassName: # containers to be run before the controller's container starts. 
extrInitContainers: [] # Example: # # - name: wait # image: busybox # command: [ 'sh', '-c', "sleep 20" ] # The initial superuser password for internal database password: "changeit" # The size limit for Shared memory, pgSQL use it for shared_buffer # More details see: # https://github.com/goharbor/harbor/issues/15034 shmSizeLimit: 512Mi initContainer: migrator: {} # resources: # requests: # memory: 128Mi # cpu: 100m permissions: {} # resources: # requests: # memory: 128Mi # cpu: 100m external: host: "" port: "5432" username: "user" password: "password" coreDatabase: "registry" # if using existing secret, the key must be "password" existingSecret: "" # "disable" - No SSL # "require" - Always SSL (skip verification) # "verify-ca" - Always SSL (verify that the certificate presented by the # server was signed by a trusted CA) # "verify-full" - Always SSL (verify that the certification presented by the # server was signed by a trusted CA and the server host name matches the one # in the certificate) sslmode: "disable" # The maximum number of connections in the idle connection pool per pod (core+exporter). # If it <=0, no idle connections are retained. maxIdleConns: 100 # The maximum number of open connections to the database per pod (core+exporter). # If it <= 0, then there is no limit on the number of open connections. # Note: the default number of connections is 1024 for harbor's postgres. 
maxOpenConns: 900 ## Additional deployment annotations podAnnotations: {} ## Additional deployment labels podLabels: {} redis: # if external Redis is used, set "type" to "external" # and fill the connection information in "external" section type: internal internal: image: repository: goharbor/redis-photon tag: v2.11.1 # set the service account to be used, default if left empty serviceAccountName: "" # mount the service account token automountServiceAccountToken: false # resources: # requests: # memory: 256Mi # cpu: 100m extraEnvVars: [] nodeSelector: {} tolerations: [] affinity: {} ## The priority class to run the pod as priorityClassName: # containers to be run before the controller's container starts. initContainers: [] # Example: # # - name: wait # image: busybox # command: [ 'sh', '-c', "sleep 20" ] # # jobserviceDatabaseIndex defaults to "1" # # registryDatabaseIndex defaults to "2" # # trivyAdapterIndex defaults to "5" # # harborDatabaseIndex defaults to "0", but it can be configured to "6", this config is optional # # cacheLayerDatabaseIndex defaults to "0", but it can be configured to "7", this config is optional jobserviceDatabaseIndex: "1" registryDatabaseIndex: "2" trivyAdapterIndex: "5" # harborDatabaseIndex: "6" # cacheLayerDatabaseIndex: "7" external: # support redis, redis+sentinel # addr for redis: <host_redis>:<port_redis> # addr for redis+sentinel: <host_sentinel1>:<port_sentinel1>,<host_sentinel2>:<port_sentinel2>,<host_sentinel3>:<port_sentinel3> addr: "" # The name of the set of Redis instances to monitor, it must be set to support redis+sentinel sentinelMasterSet: "" # The "coreDatabaseIndex" must be "0" as the library Harbor # used doesn't support configuring it # harborDatabaseIndex defaults to "0", but it can be configured to "6", this config is optional # cacheLayerDatabaseIndex defaults to "0", but it can be configured to "7", this config is optional coreDatabaseIndex: "0" jobserviceDatabaseIndex: "1" registryDatabaseIndex: "2" 
trivyAdapterIndex: "5" # harborDatabaseIndex: "6" # cacheLayerDatabaseIndex: "7" # username field can be an empty string, and it will be authenticated against the default user username: "" password: "" # If using existingSecret, the key must be REDIS_PASSWORD existingSecret: "" ## Additional deployment annotations podAnnotations: {} ## Additional deployment labels podLabels: {} exporter: image: repository: goharbor/harbor-exporter tag: v2.11.1 serviceAccountName: "" # mount the service account token automountServiceAccountToken: false replicas: 1 revisionHistoryLimit: 10 # resources: # requests: # memory: 256Mi # cpu: 100m extraEnvVars: [] podAnnotations: {} ## Additional deployment labels podLabels: {} nodeSelector: {} tolerations: [] affinity: {} # Spread Pods across failure-domains like regions, availability zones or nodes topologySpreadConstraints: [] ## The priority class to run the pod as priorityClassName: # - maxSkew: 1 # topologyKey: topology.kubernetes.io/zone # nodeTaintsPolicy: Honor # whenUnsatisfiable: DoNotSchedule cacheDuration: 23 cacheCleanInterval: 14400 五、安装kubectl create namespace harbor helm install harbor . 
-n harbor # 将安装资源部署到harbor命名空间 # 注意 # 1、部署过程可能因为下载镜像慢导致redis尚未启动成功,其他pod会出现启动失败的现象,耐心等一会即可 # 2、如果下载速度过慢,可以自己制作镜像,或者下载镜像后上传到服务器导入 # nerdctl -n k8s.io load -i xxxxxxxxxxx.tar六、查看[root@master01 harbor]# kubectl -n harbor get pods -w NAME READY STATUS RESTARTS AGE harbor-core-586f48cb4c-4r7gz 0/1 Running 2 (66s ago) 3m21s harbor-database-0 1/1 Running 0 3m21s harbor-exporter-74ff648dfc-k6pb2 1/1 Running 2 (79s ago) 3m21s harbor-jobservice-864b5bc9b9-8wb26 0/1 CrashLoopBackOff 5 (6s ago) 3m21s harbor-nginx-6c5fc7c744-5m9lz 1/1 Running 0 3m21s harbor-portal-74484f87f5-lh8m6 1/1 Running 0 3m21s harbor-redis-0 1/1 Running 0 3m21s harbor-registry-b7f8d77d6-ltpw7 2/2 Running 0 3m21s harbor-trivy-0 1/1 Running 0 3m21s harbor-core-586f48cb4c-4r7gz 0/1 Running 2 (77s ago) 3m32s harbor-core-586f48cb4c-4r7gz 1/1 Running 2 (78s ago) 3m33s ^C[root@master01 harbor]# ^C [root@master01 harbor]# ^C [root@master01 harbor]# kubectl -n harbor delete pod harbor-jobservice-864b5bc9b9-8wb26 & [1] 103883 [root@master01 harbor]# pod "harbor-jobservice-864b5bc9b9-8wb26" deleted [1]+ 完成 kubectl -n harbor delete pod harbor-jobservice-864b5bc9b9-8wb26 [root@master01 harbor]# [root@master01 harbor]# kubectl -n harbor get pods -w NAME READY STATUS RESTARTS AGE harbor-core-586f48cb4c-4r7gz 1/1 Running 2 (2m13s ago) 4m28s harbor-database-0 1/1 Running 0 4m28s harbor-exporter-74ff648dfc-k6pb2 1/1 Running 2 (2m26s ago) 4m28s harbor-jobservice-864b5bc9b9-vkr6w 0/1 Running 0 6s harbor-nginx-6c5fc7c744-5m9lz 1/1 Running 0 4m28s harbor-portal-74484f87f5-lh8m6 1/1 Running 0 4m28s harbor-redis-0 1/1 Running 0 4m28s harbor-registry-b7f8d77d6-ltpw7 2/2 Running 0 4m28s harbor-trivy-0 1/1 Running 0 4m28s ^C[root@master01 harbor]# kubectl -n harbor get pods -w NAME READY STATUS RESTARTS AGE harbor-core-586f48cb4c-4r7gz 1/1 Running 2 (2m26s ago) 4m41s harbor-database-0 1/1 Running 0 4m41s harbor-exporter-74ff648dfc-k6pb2 1/1 Running 2 (2m39s ago) 4m41s harbor-jobservice-864b5bc9b9-vkr6w 0/1 Running 0 19s 
harbor-nginx-6c5fc7c744-5m9lz 1/1 Running 0 4m41s harbor-portal-74484f87f5-lh8m6 1/1 Running 0 4m41s harbor-redis-0 1/1 Running 0 4m41s harbor-registry-b7f8d77d6-ltpw7 2/2 Running 0 4m41s harbor-trivy-0 1/1 Running 0 4m41s harbor-jobservice-864b5bc9b9-vkr6w 1/1 Running 0 21s ^C[root@master01 harbor]# ^C [root@master01 harbor]# kubectl -n harbor get pods -w NAME READY STATUS RESTARTS AGE harbor-core-586f48cb4c-4r7gz 1/1 Running 2 (2m31s ago) 4m46s harbor-database-0 1/1 Running 0 4m46s harbor-exporter-74ff648dfc-k6pb2 1/1 Running 2 (2m44s ago) 4m46s harbor-jobservice-864b5bc9b9-vkr6w 1/1 Running 0 24s harbor-nginx-6c5fc7c744-5m9lz 1/1 Running 0 4m46s harbor-portal-74484f87f5-lh8m6 1/1 Running 0 4m46s harbor-redis-0 1/1 Running 0 4m46s harbor-registry-b7f8d77d6-ltpw7 2/2 Running 0 4m46s harbor-trivy-0 1/1 Running 0 4m46s 七、登录 http://<Harbor访问地址>(具体地址取决于 values.yaml 中 expose 与 externalURL 的配置),账号:admin 密码:Harbor12345
一、配置alertmanager 二进制包下载地址:https://github.com/prometheus/alertmanager/releases/ 官方文档: https://prometheus.io/docs/alerting/configuration/ 一个报警信息在生命周期内有下面 3 种状态: `pending`: 当某个监控指标触发了告警表达式的条件,但还没有持续足够长的时间,即没有超过 `for` 阈值设定的时间,这个告警状态被标记为 `pending` `firing`: 当某个监控指标触发了告警条件并且持续超过了设定的 `for` 时间,告警将由pending状态改成 `firing`。 Prometheus 在 firing 状态下将告警信息发送至 Alertmanager。 Alertmanager 应用路由规则,将通知发送给配置的接收器,例如邮件。 `inactive`: 当某个监控指标不再满足告警条件或者告警从未被触发时,这个告警状态被标记为 `inactive` 3种状态转换流程 初始状态:`inactive` - 内存使用率正常,告警处于 `inactive` 状态。 expr设置的条件首次满足: - 内存使用率首次超过 20%,告警状态变为 `pending`。 2分钟内情况: - 如果内存使用率在超过 20% 的状态下持续了2分钟或以上,告警状态从 `pending` 变为 `firing`。 - 如果内存使用率在2分钟内恢复正常,状态从 `pending` 变回 `inactive`。 解压以后直接kubectl apply -f alertmanager.yml [root@master01 ddd]# tar xf alertmanager-0.27.0.linux-amd64.tar.gz [root@master01 ddd]# ls alertmanager-0.27.0.linux-amd64 alertmanager-0.27.0.linux-amd64.tar.gz [root@master01 ddd]# cd alertmanager-0.27.0.linux-amd64/ [root@master01 alertmanager-0.27.0.linux-amd64]# ls alertmanager alertmanager.yml amtool LICENSE NOTICE [root@master01 alertmanager-0.27.0.linux-amd64]# vi alertmanager [root@master01 alertmanager-0.27.0.linux-amd64]# vi alertmanager.yml 配置文件 [root@master01 test]# cat altertmanager.yaml # alertmanager-config.yaml apiVersion: v1 kind: ConfigMap metadata: name: alert-config namespace: monitor data: template_email.tmpl: |- {{ define "email.html" }} {{- if gt (len .Alerts.Firing) 0 -}} @报警<br> {{- range .Alerts }} <strong>实例:</strong> {{ .Labels.instance }}<br> <strong>概述:</strong> {{ .Annotations.summary }}<br> <strong>详情:</strong> {{ .Annotations.description }}<br> <strong>时间:</strong> {{ (.StartsAt.Add 28800e9).Format "2006-01-02 15:04:05" }} <br> {{- end -}} {{- end }} {{- if gt (len .Alerts.Resolved) 0 -}} @恢复<br> {{- range .Alerts }} <strong>实例:</strong> {{ .Labels.instance }}<br> <strong>信息:</strong> {{ .Annotations.summary }}<br> <strong>恢复:</strong> {{ (.StartsAt.Add 28800e9).Format "2006-01-02 15:04:05" }} <br> {{- end -}} {{- end }} {{ end }} 
config.yml: |- templates: # 1、增加 templates 配置,指定模板文件 - '/etc/alertmanager/template_email.tmpl' inhibit_rules: - source_match: # prometheus配置文件中的报警规则1产生的所有报警信息都带着下面2个标签,第一个标签是promethus自动添加,第二个使我们自己加的 alertname: NodeMemoryUsage severity: critical target_match: severity: normal # prometheus配置文件中的报警规则2产生的所有报警信息都带着该标签 equal: - instance # instance是每条报警规则自带的标签,值为对应的节点名 # 一、全局配置 global: # (1)当alertmanager持续多长时间未接收到告警后标记告警状态为 resolved(解决了) resolve_timeout: 5m # (2)配置发邮件的邮箱 smtp_smarthost: 'smtp.163.com:25' smtp_from: '15555519627@163.com' smtp_auth_username: '15555519627@163.com' smtp_auth_password: 'PZJWYQLDCKQGTTKZ' # 填入你开启pop3时获得的码 smtp_hello: '163.com' smtp_require_tls: false # 二、设置报警的路由分发策略 route: # 定义用于告警分组的标签。当有多个告警消息有相同的 alertname 和 cluster 标签时,这些告警消息将会被聚合到同一个分组中 # 例如,接收到的报警信息里面有许多具有 cluster=XXX 和 alertname=YYY 这样的标签的报警信息将会批量被聚合到一个分组里面 group_by: ['alertname', 'cluster'] # 当一个新的报警分组被创建后,需要等待至少 group_wait 时间来初始化通知, # 这种方式可以确保您能有足够的时间为同一分组来获取/累积多个警报,然后一起触发这个报警信息。 group_wait: 30s # 短期聚合: group_interval 确保在短时间内,同一分组的多个告警将会合并/聚合到一起等待被发送,避免过于频繁的告警通知。 group_interval: 30s # 长期提醒: repeat_interval确保长时间未解决的告警不会被遗忘,Alertmanager每隔一段时间定期提醒相关人员,直到告警被解决。 repeat_interval: 120s # 实验环境想快速看下效果,可以缩小该时间,比如设置为120s # 上述两个参数的综合解释: #(1)当一个新的告警被触发时,会立即发送初次通知 #(2)然后开始一个 group_interval 窗口(例如 30 秒)。 # 在 group_interval 窗口内,任何新的同分组告警会被聚合到一起,但不会立即触发发送。 #(3)聚合窗口结束后, # 如果刚好抵达 repeat_interval 的时间点,聚合的告警会和原有未解决的告警一起发送通知。 # 如果没有抵达 repeat_interval 的时间点,则原有未解决的报警不会重复发送,直到到达下一个 repeat_interval 时间点。 # 这两个参数一起工作,确保短时间内的警报状态变化不会造成过多的重复通知,同时在长期未解决的情况下提供定期的提醒。 # 默认的receiver:如果一个报警没有被一个route匹配,则发送给默认的接收器,与下面receivers中定义的name呼应 receiver: default routes: # 子路由规则。子路由继承父路由的所有属性,可以进行覆盖和更具体的规则匹配。 - receiver: email # 匹配此子路由的告警将发送到的接收器,该名字也与下面的receivers中定义的name呼应 group_wait: 10s # 等待时间,可覆盖父路由的 group_by: ['instance'] # 根据instance做分组 match: # 告警标签匹配条件,只有匹配到特定条件的告警才会应用该子路由规则。 team: node # 只有拥有 team=node 标签的告警才会路由到 email 接收器。 continue: true #不设置这个只能匹配一条 - receiver: mywebhook # 匹配此子路由的告警将发送到的接收器,该名字也与下面的receivers中定义的name呼应 
group_wait: 10s # 等待时间,可覆盖父路由的 group_by: ['instance'] # 根据instance做分组 match: # 告警标签匹配条件,只有匹配到特定条件的告警才会应用该子路由规则。 team: node # 只有拥有 team=node 标签的告警才会路由到 mywebhook 接收器。 # 三、定义接收器,与上面的路由定义中引用的receiver相呼应 receivers: - name: 'default' # 默认接收器配置,未匹配任何特定路由规则的告警会发送到此接收器。 email_configs: - to: '7902731@qq.com' send_resolved: true # 当告警恢复时是否也发送通知。 - name: 'email' # 名为 email 的接收器配置,与之前定义的子路由相对应。 email_configs: - to: '15555519627@163.com' send_resolved: true html: '{{ template "email.html" . }}' #这个是对接webhook钉钉的 - name: 'mywebhook' # 名为 mywebhook 的接收器配置,与之前定义的子路由相对应,告警通过 webhook 转发到钉钉。 webhook_configs: - url: 'http://promoter:8080/dingtalk/webhook1/send' send_resolved: true # 当告警恢复时是否也发送通知。 --- # alertmanager-deploy.yaml apiVersion: apps/v1 kind: Deployment metadata: name: alertmanager namespace: monitor labels: app: alertmanager spec: selector: matchLabels: app: alertmanager template: metadata: labels: app: alertmanager spec: volumes: - name: alertcfg configMap: name: alert-config containers: - name: alertmanager # 版本去查看官网https://github.com/prometheus/alertmanager/releases/ # 1、官网镜像地址,需要你为containerd配置好镜像加速 #image: prom/alertmanager:v0.27.0 # 2、搞成了国内的地址 image: registry.cn-hangzhou.aliyuncs.com/egon-k8s-test/alertmanager:v0.27.0 imagePullPolicy: IfNotPresent args: - '--config.file=/etc/alertmanager/config.yml' ports: - containerPort: 9093 name: http volumeMounts: - mountPath: '/etc/alertmanager' name: alertcfg resources: requests: cpu: 100m memory: 256Mi limits: cpu: 100m memory: 256Mi --- # alertmanager-svc.yaml apiVersion: v1 kind: Service metadata: name: alertmanager namespace: monitor labels: app: alertmanager spec: selector: app: alertmanager type: NodePort ports: - name: web port: 9093 targetPort: http [root@master01 test]# kubectl -n monitor get svc NAME TYPE CLUSTER-IP EXTERNAL-IP PORT(S) AGE alertmanager NodePort <none> 9093:30610/TCP 107m grafana NodePort <none> 3000:30484/TCP 28h prometheus NodePort <none> 9090:31119/TCP 3d1h promoter ClusterIP <none> 8080/TCP 18h redis ClusterIP 
<none> 6379/TCP,9121/TCP 2d22h [root@master01 test]# kubectl -n monitor get pods NAME READY STATUS RESTARTS AGE alertmanager-56b46ff6b4-mvbb8 1/1 Running 0 125m grafana-86cfcd87fb-59gtb 1/1 Running 1 (3h25m ago) 28h node-exporter-6f4d4 1/1 Running 4 (3h25m ago) 2d21h node-exporter-swr5j 1/1 Running 4 (3h25m ago) 2d21h node-exporter-tf84v 1/1 Running 4 (3h25m ago) 2d21h node-exporter-z9svr 1/1 Running 4 (3h25m ago) 2d21h prometheus-7f8f87f55d-zbnsr 1/1 Running 1 (3h25m ago) 21h promoter-6f68cff456-wqmg9 1/1 Running 1 (3h25m ago) 18h redis-84bbc5df9b-rnm6q 2/2 Running 8 (3h25m ago) 2d22h 基于webhook对接钉钉报警 prometheus(报警规则)----》alertmanager组件-----------------------------》邮箱 prometheus(报警规则)----》alertmanager组件------》钉钉的webhook软件------》钉钉 {lamp/} 二、配置钉钉 1.下载钉钉 2.添加群聊(至少2个人才可以拉群) 3.在群里添加机器人得到API接口和密钥 测试是否可以正常使用 #python 3.8 import time import sys import hmac import hashlib import base64 import urllib.parse import requests timestamp = str(round(time.time() * 1000)) secret = 'SEC45045323ac8b379b88e04750c7954645edc54c4ffdedd717b82804c8684c0706' secret_enc = secret.encode('utf-8') string_to_sign = '{}\n{}'.format(timestamp, secret) string_to_sign_enc = string_to_sign.encode('utf-8') hmac_code = hmac.new(secret_enc, string_to_sign_enc, digestmod=hashlib.sha256).digest() sign = urllib.parse.quote_plus(base64.b64encode(hmac_code)) print(timestamp) print(sign) MESSAGE = sys.argv[1] webhook_url =f'https://oapi.dingtalk.com/robot/send?access_token=13ddb964c0108de8b56eb944c5e407d448cb2db02e3885c45585f8eb06779def&timestamp={timestamp}&sign={sign}' response = requests.post(webhook_url,headers={'Content-Type': 'application/json'},json={"msgtype": "text","text": {"content":f"'{MESSAGE}'"}}) print(response.text) print(response.status_code) pip3 install requests -i https://mirrors.aliyun.com/pypi/simple/ python3 webhook_test.py 测试 部署钉钉的webhook软件 wget https://github.com/timonwong/prometheus-webhook-dingtalk/releases/download/v2.1.0/prometheus-webhook-dingtalk-2.1.0.linux-amd64.tar.gz #解压出来里面的config.yml配置 
cat > /usr/local/prometheus-webhook-dingtalk/config.yml << "EOF" templates: - /etc/template.tmpl targets: webhook1: url: https://oapi.dingtalk.com/robot/send?access_token=3acdac2167b83e0b54f751c0cfcbb676b7828af183aca2e21428c489883ced8b # secret for signature secret: SEC67f8b6d15997deaf686ab0509b2dad943aca99d700131f88d010ef57e591aea0 message: # 哪个target需要引用模版,就增加这一小段配置,其中default.tmpl就是你一会要定义的模版 text: '{{ template "default.tmpl" . }}' # 可以添加其他的对接,主要用于对接到不同的群中的机器人 webhook_mention_all: url: https://oapi.dingtalk.com/robot/send?access_token=3acdac2167b83e0b54f751c0cfcbb676b7828af183aca2e21428c489883ced8b secret: SEC67f8b6d15997deaf686ab0509b2dad943aca99d700131f88d010ef57e591aea0 mention: all: true webhook_mention_users: url: https://oapi.dingtalk.com/robot/send?access_token=3acdac2167b83e0b54f751c0cfcbb676b7828af183aca2e21428c489883ced8b secret: SEC67f8b6d15997deaf686ab0509b2dad943aca99d700131f88d010ef57e591aea0 mention: mobiles: ['18611453110'] EOF可以做成系统服务cat > /lib/systemd/system/dingtalk.service << 'EOF' [Unit] Description=dingtalk Documentation=https://github.com/timonwong/prometheus-webhook-dingtalk/ After=network.target [Service] Restart=on-failure WorkingDirectory=/usr/local/prometheus-webhook-dingtalk ExecStart=/usr/local/prometheus-webhook-dingtalk/prometheus-webhook-dingtalk --web.listen-address= --config.file=/usr/local/prometheus-webhook-dingtalk/config.yml User=nobody [Install] WantedBy=multi-user.target EOF systemctl daemon-reload systemctl restart dingtalk systemctl status dingtalk配置alertmanager对接钉钉webhook[root@master01 test]# cat altertmanager.yaml # alertmanager-config.yaml apiVersion: v1 kind: ConfigMap metadata: name: alert-config namespace: monitor data: template_email.tmpl: |- {{ define "email.html" }} {{- if gt (len .Alerts.Firing) 0 -}} @报警<br> {{- range .Alerts }} <strong>实例:</strong> {{ .Labels.instance }}<br> <strong>概述:</strong> {{ .Annotations.summary }}<br> <strong>详情:</strong> {{ .Annotations.description }}<br> <strong>时间:</strong> {{ 
(.StartsAt.Add 28800e9).Format "2006-01-02 15:04:05" }} <br> {{- end -}} {{- end }} {{- if gt (len .Alerts.Resolved) 0 -}} @恢复<br> {{- range .Alerts }} <strong>实例:</strong> {{ .Labels.instance }}<br> <strong>信息:</strong> {{ .Annotations.summary }}<br> <strong>恢复:</strong> {{ (.StartsAt.Add 28800e9).Format "2006-01-02 15:04:05" }} <br> {{- end -}} {{- end }} {{ end }} config.yml: |- templates: # 1、增加 templates 配置,指定模板文件 - '/etc/alertmanager/template_email.tmpl' inhibit_rules: - source_match: # prometheus配置文件中的报警规则1产生的所有报警信息都带着下面2个标签,第一个标签是prometheus自动添加,第二个是我们自己加的 alertname: NodeMemoryUsage severity: critical target_match: severity: normal # prometheus配置文件中的报警规则2产生的所有报警信息都带着该标签 equal: - instance # instance是每条报警规则自带的标签,值为对应的节点名 # 一、全局配置 global: # (1)当alertmanager持续多长时间未接收到告警后标记告警状态为 resolved(解决了) resolve_timeout: 5m # (2)配置发邮件的邮箱 smtp_smarthost: 'smtp.163.com:25' smtp_from: '15555519627@163.com' smtp_auth_username: '15555519627@163.com' smtp_auth_password: 'PZJWYQLDCKQGTTKZ' # 填入你开启pop3时获得的授权码 smtp_hello: '163.com' smtp_require_tls: false # 二、设置报警的路由分发策略 route: # 定义用于告警分组的标签。当有多个告警消息有相同的 alertname 和 cluster 标签时,这些告警消息将会被聚合到同一个分组中 # 例如,接收到的报警信息里面有许多具有 cluster=XXX 和 alertname=YYY 这样的标签的报警信息将会批量被聚合到一个分组里面 group_by: ['alertname', 'cluster'] # 当一个新的报警分组被创建后,需要等待至少 group_wait 时间来初始化通知, # 这种方式可以确保您能有足够的时间为同一分组来获取/累积多个警报,然后一起触发这个报警信息。 group_wait: 30s # 短期聚合: group_interval 确保在短时间内,同一分组的多个告警将会合并/聚合到一起等待被发送,避免过于频繁的告警通知。 group_interval: 30s # 长期提醒: repeat_interval确保长时间未解决的告警不会被遗忘,Alertmanager每隔一段时间定期提醒相关人员,直到告警被解决。 repeat_interval: 120s # 实验环境想快速看下效果,可以缩小该时间,比如设置为120s # 上述两个参数的综合解释: #(1)当一个新的告警分组被创建时,会先等待 group_wait 时间,然后发送初次通知 #(2)然后开始一个 group_interval 窗口(例如 30 秒)。 # 在 group_interval 窗口内,任何新的同分组告警会被聚合到一起,但不会立即触发发送。 #(3)聚合窗口结束后, # 如果刚好抵达 repeat_interval 的时间点,聚合的告警会和原有未解决的告警一起发送通知。 # 如果没有抵达 repeat_interval 的时间点,则原有未解决的报警不会重复发送,直到到达下一个 repeat_interval 时间点。 # 这两个参数一起工作,确保短时间内的警报状态变化不会造成过多的重复通知,同时在长期未解决的情况下提供定期的提醒。 # 默认的receiver:如果一个报警没有被一个route匹配,则发送给默认的接收器,与下面receivers中定义的name呼应 receiver: default 
routes: # 子路由规则。子路由继承父路由的所有属性,可以进行覆盖和更具体的规则匹配。 - receiver: email # 匹配此子路由的告警将发送到的接收器,该名字也与下面的receivers中定义的name呼应 group_wait: 10s # 等待时间,可覆盖父路由的 group_by: ['instance'] # 根据instance做分组 match: # 告警标签匹配条件,只有匹配到特定条件的告警才会应用该子路由规则。 team: node # 只有拥有 team=node 标签的告警才会路由到 email 接收器。 continue: true # 不设置这个的话,告警匹配到第一条子路由后就不会继续匹配后面的子路由 - receiver: mywebhook # 匹配此子路由的告警将发送到的接收器,该名字也与下面的receivers中定义的name呼应 group_wait: 10s # 等待时间,可覆盖父路由的 group_by: ['instance'] # 根据instance做分组 match: # 告警标签匹配条件,只有匹配到特定条件的告警才会应用该子路由规则。 team: node # 只有拥有 team=node 标签的告警才会路由到 mywebhook 接收器。 # 三、定义接收器,与上面的路由定义中引用的receiver相呼应 receivers: - name: 'default' # 默认接收器配置,未匹配任何特定路由规则的告警会发送到此接收器。 email_configs: - to: '7902731@qq.com' send_resolved: true # 当告警恢复时是否也发送通知。 - name: 'email' # 名为 email 的接收器配置,与之前定义的子路由相对应。 email_configs: - to: '15555519627@163.com' send_resolved: true html: '{{ template "email.html" . }}' #这个是对接webhook钉钉的 - name: 'mywebhook' # 名为 mywebhook 的接收器配置,与之前定义的 mywebhook 子路由相对应。 webhook_configs: - url: 'http://promoter:8080/dingtalk/webhook1/send' send_resolved: true # 当告警恢复时是否也发送通知。 --- # alertmanager-deploy.yaml apiVersion: apps/v1 kind: Deployment metadata: name: alertmanager namespace: monitor labels: app: alertmanager spec: selector: matchLabels: app: alertmanager template: metadata: labels: app: alertmanager spec: volumes: - name: alertcfg configMap: name: alert-config containers: - name: alertmanager # 版本去查看官网https://github.com/prometheus/alertmanager/releases/ # 1、官网镜像地址,需要你为containerd配置好镜像加速 #image: prom/alertmanager:v0.27.0 # 2、搞成了国内的地址 image: registry.cn-hangzhou.aliyuncs.com/egon-k8s-test/alertmanager:v0.27.0 imagePullPolicy: IfNotPresent args: - '--config.file=/etc/alertmanager/config.yml' ports: - containerPort: 9093 name: http volumeMounts: - mountPath: '/etc/alertmanager' name: alertcfg resources: requests: cpu: 100m memory: 256Mi limits: cpu: 100m memory: 256Mi --- # alertmanager-svc.yaml apiVersion: v1 kind: Service metadata: name: alertmanager namespace: monitor labels: app: alertmanager spec: 
selector: app: alertmanager type: NodePort ports: - name: web port: 9093 targetPort: http 补充:报警图片 https://egonimages.oss-cn-beijing.aliyuncs.com/gaojing1.jpg https://egonimages.oss-cn-beijing.aliyuncs.com/gaojing2.jpg https://egonimages.oss-cn-beijing.aliyuncs.com/gaojing3.png https://egonimages.oss-cn-beijing.aliyuncs.com/gaojing4.jpg https://egonimages.oss-cn-beijing.aliyuncs.com/gaojing5.jpg https://egonimages.oss-cn-beijing.aliyuncs.com/gaojing6.png 定制内容(略) 自行研究吧:https://github.com/timonwong/prometheus-webhook-dingtalk/blob/main/template/default.tmpl
