一、Component Overview
# If metrics-server is already installed, uninstall it first, otherwise it conflicts with the prometheus-adapter deployed below (both register the v1beta1.metrics.k8s.io APIService); see the removal sketch after the component list
1. Metrics Server: the aggregator of resource usage data in a Kubernetes cluster; it feeds in-cluster consumers such as kubectl top, the HPA, and the scheduler. In this stack that role is taken over by prometheus-adapter, hence the conflict noted above.
2. Prometheus Operator: deploys and manages Prometheus, Alertmanager, and related monitoring components on Kubernetes through custom resources (the CRDs created during setup below).
3. Node Exporter: exposes key metrics about the state of each node (CPU, memory, disk, network, and so on).
4. kube-state-metrics: generates metrics from the state of Kubernetes resource objects (Deployments, Pods, Nodes, ...), on which many of the alerting rules are built.
5. Prometheus: scrapes metrics in pull mode over HTTP(S) from components such as the apiserver, scheduler, controller-manager, and kubelet.
6. Grafana: a platform for visualizing the collected metrics and building dashboards.
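As noted at the top, a leftover metrics-server clashes with the prometheus-adapter deployed below. A quick check-and-remove sketch, assuming metrics-server was installed from the upstream components.yaml manifest (adjust the delete step if it was installed another way):
# Is the conflicting APIService / Deployment present?
kubectl get apiservice v1beta1.metrics.k8s.io
kubectl -n kube-system get deployment metrics-server
# Remove it (assumption: installed from the official manifest)
kubectl delete -f https://github.com/kubernetes-sigs/metrics-server/releases/latest/download/components.yaml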
二、Installation and Deployment
Project address: https://github.com/prometheus-operator/kube-prometheus
三、Version Selection
See the official compatibility matrix: https://github.com/prometheus-operator/kube-prometheus?tab=readme-ov-file#compatibility. For example, for Kubernetes 1.30 the recommended kube-prometheus version is release-0.14; the cluster in this walkthrough runs 1.27, which is why release-0.13 is cloned below.
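To confirm which branch matches your cluster before cloning, check the API server version first:
# e.g. "Server Version: v1.27.x" -> release-0.13 per the compatibility matrix
kubectl version | grep -i server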
四、Clone the Project Locally
git clone -b release-0.13 https://github.com/prometheus-operator/kube-prometheus.git
五、Create the Resource Objects
# In mainland China, rewrite the image addresses in the manifests first; the default registries may be unreachable
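A sketch for finding and rewriting the image references; k8s.mirror.example.com below is a placeholder for whatever mirror registry you can actually reach:
# List every image the manifests reference
grep -rh 'image:' manifests/ | sort -u
# Rewrite one upstream registry to the mirror (repeat for quay.io etc. as needed)
sed -i 's#registry.k8s.io#k8s.mirror.example.com#g' manifests/*.yaml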
[root@master1 k8s-install]# kubectl create namespace monitoring
[root@master1 k8s-install]# cd kube-prometheus/
[root@master1 kube-prometheus]# kubectl apply --server-side -f manifests/setup
[root@master1 kube-prometheus]# kubectl wait \
--for condition=Established \
--all CustomResourceDefinition \
--namespace=monitoring
[root@master1 kube-prometheus]# kubectl apply -f manifests/
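Below is a sample run with full output. Note that CustomResourceDefinitions are cluster-scoped, so the --namespace flag on kubectl wait has no effect; the wait covers every CRD in the cluster, which is why unrelated cert-manager, Traefik, and OpenTelemetry CRDs also appear: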
root@k8s01:~/helm/prometheus/kube-prometheus# kubectl apply --server-side -f manifests/setup
customresourcedefinition.apiextensions.k8s.io/alertmanagerconfigs.monitoring.coreos.com serverside-applied
customresourcedefinition.apiextensions.k8s.io/alertmanagers.monitoring.coreos.com serverside-applied
customresourcedefinition.apiextensions.k8s.io/podmonitors.monitoring.coreos.com serverside-applied
customresourcedefinition.apiextensions.k8s.io/probes.monitoring.coreos.com serverside-applied
customresourcedefinition.apiextensions.k8s.io/prometheuses.monitoring.coreos.com serverside-applied
customresourcedefinition.apiextensions.k8s.io/prometheusagents.monitoring.coreos.com serverside-applied
customresourcedefinition.apiextensions.k8s.io/prometheusrules.monitoring.coreos.com serverside-applied
customresourcedefinition.apiextensions.k8s.io/scrapeconfigs.monitoring.coreos.com serverside-applied
customresourcedefinition.apiextensions.k8s.io/servicemonitors.monitoring.coreos.com serverside-applied
customresourcedefinition.apiextensions.k8s.io/thanosrulers.monitoring.coreos.com serverside-applied
namespace/monitoring serverside-applied
root@k8s01:~/helm/prometheus/kube-prometheus# kubectl wait \
> --for condition=Established \
> --all CustomResourceDefinitions \
> --namespace=monitoring
customresourcedefinition.apiextensions.k8s.io/alertmanagerconfigs.monitoring.coreos.com condition met
customresourcedefinition.apiextensions.k8s.io/alertmanagers.monitoring.coreos.com condition met
customresourcedefinition.apiextensions.k8s.io/certificaterequests.cert-manager.io condition met
customresourcedefinition.apiextensions.k8s.io/certificates.cert-manager.io condition met
customresourcedefinition.apiextensions.k8s.io/challenges.acme.cert-manager.io condition met
customresourcedefinition.apiextensions.k8s.io/clusterissuers.cert-manager.io condition met
customresourcedefinition.apiextensions.k8s.io/ingressroutes.traefik.io condition met
customresourcedefinition.apiextensions.k8s.io/ingressroutetcps.traefik.io condition met
customresourcedefinition.apiextensions.k8s.io/ingressrouteudps.traefik.io condition met
customresourcedefinition.apiextensions.k8s.io/instrumentations.opentelemetry.io condition met
customresourcedefinition.apiextensions.k8s.io/issuers.cert-manager.io condition met
customresourcedefinition.apiextensions.k8s.io/middlewares.traefik.io condition met
customresourcedefinition.apiextensions.k8s.io/middlewaretcps.traefik.io condition met
customresourcedefinition.apiextensions.k8s.io/opampbridges.opentelemetry.io condition met
customresourcedefinition.apiextensions.k8s.io/opentelemetrycollectors.opentelemetry.io condition met
customresourcedefinition.apiextensions.k8s.io/orders.acme.cert-manager.io condition met
customresourcedefinition.apiextensions.k8s.io/podmonitors.monitoring.coreos.com condition met
customresourcedefinition.apiextensions.k8s.io/policybindings.sts.min.io condition met
customresourcedefinition.apiextensions.k8s.io/probes.monitoring.coreos.com condition met
customresourcedefinition.apiextensions.k8s.io/prometheusagents.monitoring.coreos.com condition met
customresourcedefinition.apiextensions.k8s.io/prometheuses.monitoring.coreos.com condition met
customresourcedefinition.apiextensions.k8s.io/prometheusrules.monitoring.coreos.com condition met
customresourcedefinition.apiextensions.k8s.io/scrapeconfigs.monitoring.coreos.com condition met
customresourcedefinition.apiextensions.k8s.io/serverstransports.traefik.io condition met
customresourcedefinition.apiextensions.k8s.io/serverstransporttcps.traefik.io condition met
customresourcedefinition.apiextensions.k8s.io/servicemonitors.monitoring.coreos.com condition met
customresourcedefinition.apiextensions.k8s.io/targetallocators.opentelemetry.io condition met
customresourcedefinition.apiextensions.k8s.io/thanosrulers.monitoring.coreos.com condition met
customresourcedefinition.apiextensions.k8s.io/tlsoptions.traefik.io condition met
customresourcedefinition.apiextensions.k8s.io/tlsstores.traefik.io condition met
customresourcedefinition.apiextensions.k8s.io/traefikservices.traefik.io condition met
root@k8s01:~/helm/prometheus/kube-prometheus# kubectl apply -f manifests/
alertmanager.monitoring.coreos.com/main created
networkpolicy.networking.k8s.io/alertmanager-main created
poddisruptionbudget.policy/alertmanager-main created
prometheusrule.monitoring.coreos.com/alertmanager-main-rules created
secret/alertmanager-main created
service/alertmanager-main created
serviceaccount/alertmanager-main created
servicemonitor.monitoring.coreos.com/alertmanager-main created
clusterrole.rbac.authorization.k8s.io/blackbox-exporter created
clusterrolebinding.rbac.authorization.k8s.io/blackbox-exporter created
configmap/blackbox-exporter-configuration created
deployment.apps/blackbox-exporter created
networkpolicy.networking.k8s.io/blackbox-exporter created
service/blackbox-exporter created
serviceaccount/blackbox-exporter created
servicemonitor.monitoring.coreos.com/blackbox-exporter created
secret/grafana-config created
secret/grafana-datasources created
configmap/grafana-dashboard-alertmanager-overview created
configmap/grafana-dashboard-apiserver created
configmap/grafana-dashboard-cluster-total created
configmap/grafana-dashboard-controller-manager created
configmap/grafana-dashboard-grafana-overview created
configmap/grafana-dashboard-k8s-resources-cluster created
configmap/grafana-dashboard-k8s-resources-multicluster created
configmap/grafana-dashboard-k8s-resources-namespace created
configmap/grafana-dashboard-k8s-resources-node created
configmap/grafana-dashboard-k8s-resources-pod created
configmap/grafana-dashboard-k8s-resources-workload created
configmap/grafana-dashboard-k8s-resources-workloads-namespace created
configmap/grafana-dashboard-kubelet created
configmap/grafana-dashboard-namespace-by-pod created
configmap/grafana-dashboard-namespace-by-workload created
configmap/grafana-dashboard-node-cluster-rsrc-use created
configmap/grafana-dashboard-node-rsrc-use created
configmap/grafana-dashboard-nodes-aix created
configmap/grafana-dashboard-nodes-darwin created
configmap/grafana-dashboard-nodes created
configmap/grafana-dashboard-persistentvolumesusage created
configmap/grafana-dashboard-pod-total created
configmap/grafana-dashboard-prometheus-remote-write created
configmap/grafana-dashboard-prometheus created
configmap/grafana-dashboard-proxy created
configmap/grafana-dashboard-scheduler created
configmap/grafana-dashboard-workload-total created
configmap/grafana-dashboards created
deployment.apps/grafana created
networkpolicy.networking.k8s.io/grafana created
prometheusrule.monitoring.coreos.com/grafana-rules created
service/grafana created
serviceaccount/grafana created
servicemonitor.monitoring.coreos.com/grafana created
prometheusrule.monitoring.coreos.com/kube-prometheus-rules created
clusterrole.rbac.authorization.k8s.io/kube-state-metrics created
clusterrolebinding.rbac.authorization.k8s.io/kube-state-metrics created
deployment.apps/kube-state-metrics created
networkpolicy.networking.k8s.io/kube-state-metrics created
prometheusrule.monitoring.coreos.com/kube-state-metrics-rules created
service/kube-state-metrics created
serviceaccount/kube-state-metrics created
servicemonitor.monitoring.coreos.com/kube-state-metrics created
prometheusrule.monitoring.coreos.com/kubernetes-monitoring-rules created
servicemonitor.monitoring.coreos.com/kube-apiserver created
servicemonitor.monitoring.coreos.com/coredns created
servicemonitor.monitoring.coreos.com/kube-controller-manager created
servicemonitor.monitoring.coreos.com/kube-scheduler created
servicemonitor.monitoring.coreos.com/kubelet created
clusterrole.rbac.authorization.k8s.io/node-exporter created
clusterrolebinding.rbac.authorization.k8s.io/node-exporter created
daemonset.apps/node-exporter created
networkpolicy.networking.k8s.io/node-exporter created
prometheusrule.monitoring.coreos.com/node-exporter-rules created
service/node-exporter created
serviceaccount/node-exporter created
servicemonitor.monitoring.coreos.com/node-exporter created
clusterrole.rbac.authorization.k8s.io/prometheus-k8s created
clusterrolebinding.rbac.authorization.k8s.io/prometheus-k8s created
networkpolicy.networking.k8s.io/prometheus-k8s created
poddisruptionbudget.policy/prometheus-k8s created
prometheus.monitoring.coreos.com/k8s created
prometheusrule.monitoring.coreos.com/prometheus-k8s-prometheus-rules created
rolebinding.rbac.authorization.k8s.io/prometheus-k8s-config created
rolebinding.rbac.authorization.k8s.io/prometheus-k8s created
rolebinding.rbac.authorization.k8s.io/prometheus-k8s created
rolebinding.rbac.authorization.k8s.io/prometheus-k8s created
role.rbac.authorization.k8s.io/prometheus-k8s-config created
role.rbac.authorization.k8s.io/prometheus-k8s created
role.rbac.authorization.k8s.io/prometheus-k8s created
role.rbac.authorization.k8s.io/prometheus-k8s created
service/prometheus-k8s created
serviceaccount/prometheus-k8s created
servicemonitor.monitoring.coreos.com/prometheus-k8s created
apiservice.apiregistration.k8s.io/v1beta1.metrics.k8s.io created
clusterrole.rbac.authorization.k8s.io/prometheus-adapter created
clusterrole.rbac.authorization.k8s.io/system:aggregated-metrics-reader created
clusterrolebinding.rbac.authorization.k8s.io/prometheus-adapter created
clusterrolebinding.rbac.authorization.k8s.io/resource-metrics:system:auth-delegator created
clusterrole.rbac.authorization.k8s.io/resource-metrics-server-resources created
configmap/adapter-config created
deployment.apps/prometheus-adapter created
networkpolicy.networking.k8s.io/prometheus-adapter created
poddisruptionbudget.policy/prometheus-adapter created
rolebinding.rbac.authorization.k8s.io/resource-metrics-auth-reader created
service/prometheus-adapter created
serviceaccount/prometheus-adapter created
servicemonitor.monitoring.coreos.com/prometheus-adapter created
clusterrole.rbac.authorization.k8s.io/prometheus-operator created
clusterrolebinding.rbac.authorization.k8s.io/prometheus-operator created
deployment.apps/prometheus-operator created
networkpolicy.networking.k8s.io/prometheus-operator created
prometheusrule.monitoring.coreos.com/prometheus-operator-rules created
service/prometheus-operator created
serviceaccount/prometheus-operator created
servicemonitor.monitoring.coreos.com/prometheus-operator created
root@k8s01:~/helm/prometheus/kube-prometheus#
六、Verify the Deployment
# Check pod status
root@k8s03:~# kubectl get pod -n monitoring
NAME                                   READY   STATUS    RESTARTS   AGE
alertmanager-main-0                    2/2     Running   0          50m
alertmanager-main-1                    2/2     Running   0          50m
alertmanager-main-2                    2/2     Running   0          50m
blackbox-exporter-57bb665766-d9kwj     3/3     Running   0          50m
grafana-fdf8c48f-f6cck                 1/1     Running   0          50m
kube-state-metrics-5ffdd9685c-hg5hc    3/3     Running   0          50m
node-exporter-8l29v                    2/2     Running   0          31m
node-exporter-gdclz                    2/2     Running   0          28m
node-exporter-j5r76                    2/2     Running   0          50m
prometheus-adapter-7945bdf5d7-dh75k    1/1     Running   0          50m
prometheus-adapter-7945bdf5d7-nbp94    1/1     Running   0          50m
prometheus-k8s-0                       2/2     Running   0          50m
prometheus-k8s-1                       2/2     Running   0          50m
prometheus-operator-85c5ffc677-jk8c9   2/2     Running   0          50m
# Check node resource usage
root@k8s03:~# kubectl top node
NAME    CPU(cores)   CPU%   MEMORY(bytes)   MEMORY%
k8s01   3277m        40%    6500Mi          66%
k8s02   6872m        85%    4037Mi          36%
k8s03   362m         4%     6407Mi          65%
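kubectl top works here because prometheus-adapter, not metrics-server, is serving the resource metrics API. To confirm who owns the APIService:
# The SERVICE column should show monitoring/prometheus-adapter
kubectl get apiservice v1beta1.metrics.k8s.io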
七、Add Ingress Resources
# Example using ingress-nginx
[root@master1 manifests]# cat ingress.yaml
apiVersion: networking.k8s.io/v1
kind: Ingress
metadata:
  name: alertmanager
  namespace: monitoring
  annotations:
    nginx.ingress.kubernetes.io/rewrite-target: /
spec:
  ingressClassName: nginx
  rules:
  - host: alertmanager.local.com
    http:
      paths:
      - path: /
        pathType: Prefix
        backend:
          service:
            name: alertmanager-main
            port:
              number: 9093
---
apiVersion: networking.k8s.io/v1
kind: Ingress
metadata:
  name: grafana
  namespace: monitoring
  annotations:
    nginx.ingress.kubernetes.io/rewrite-target: /
spec:
  ingressClassName: nginx
  rules:
  - host: grafana.local.com
    http:
      paths:
      - path: /
        pathType: Prefix
        backend:
          service:
            name: grafana
            port:
              number: 3000
---
apiVersion: networking.k8s.io/v1
kind: Ingress
metadata:
  name: prometheus
  namespace: monitoring
  annotations:
    nginx.ingress.kubernetes.io/rewrite-target: /
spec:
  ingressClassName: nginx
  rules:
  - host: prometheus.local.com
    http:
      paths:
      - path: /
        pathType: Prefix
        backend:
          service:
            name: prometheus-k8s
            port:
              number: 9090
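Apply and verify; this assumes an ingress-nginx controller is already running in the cluster:
kubectl apply -f ingress.yaml
kubectl get ingress -n monitoring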
# Example using Traefik:
[root@master1 manifests]# cat ingress.yaml
apiVersion: traefik.io/v1alpha1
kind: IngressRoute
metadata:
  name: alertmanager
  namespace: monitoring
spec:
  entryPoints:
  - web
  routes:
  - match: Host(`alertmanager.local.com`)
    kind: Rule
    services:
    - name: alertmanager-main
      port: 9093
---
apiVersion: traefik.io/v1alpha1
kind: IngressRoute
metadata:
  name: grafana
  namespace: monitoring
spec:
  entryPoints:
  - web
  routes:
  - match: Host(`grafana.local.com`)
    kind: Rule
    services:
    - name: grafana
      port: 3000
---
apiVersion: traefik.io/v1alpha1
kind: IngressRoute
metadata:
  name: prometheus
  namespace: monitoring
spec:
  entryPoints:
  - web
  routes:
  - match: Host(`prometheus.local.com`)
    kind: Rule
    services:
    - name: prometheus-k8s
      port: 9090
[root@master1 manifests]# kubectl apply -f ingress.yaml
ingressroute.traefik.io/alertmanager created
ingressroute.traefik.io/grafana created
ingressroute.traefik.io/prometheus created
八、Verify Web Access
# Add hosts entries (on Windows: notepad $env:windir\System32\drivers\etc\hosts)
192.168.3.200 alertmanager.local.com
192.168.3.200 prometheus.local.com
192.168.3.200 grafana.local.com
Visit http://alertmanager.local.com:30080 (30080 being the ingress controller's NodePort) to see the currently firing alerts.
Visit http://prometheus.local.com:30080/targets and check that all targets are up.
Visit http://grafana.local.com:30080/login; the default username and password are admin/admin.
Check the data sources: a Prometheus data source has already been configured for us automatically.
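The routing can also be verified without a browser. A curl sketch, assuming the ingress controller is exposed on NodePort 30080 of 192.168.3.200 as in the hosts entries above:
# Expect HTTP 200 from Grafana's login page
curl -s -o /dev/null -w '%{http_code}\n' -H 'Host: grafana.local.com' http://192.168.3.200:30080/login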
九、Fix Abnormal Targets
On the targets page you will find two scrape jobs with no corresponding instances: kube-controller-manager and kube-scheduler. This is caused by the ServiceMonitor resource objects: they select Services in kube-system by label, and kubeadm does not create those Services by default.
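To see which labels the scrape job expects, inspect the ServiceMonitor that kube-prometheus created; its selector wants a kube-system Service labeled app.kubernetes.io/name: kube-controller-manager, which is exactly what the Service below provides:
kubectl -n monitoring get servicemonitor kube-controller-manager -o yaml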
# Create prometheus-kubeControllerManagerService.yaml with the following content, then apply it to create the resource
root@k8s01:~/helm/prometheus/kube-prometheus# cat prometheus-kubeControllerManagerService.yaml
apiVersion: v1
kind: Service
metadata:
  namespace: kube-system
  name: kube-controller-manager
  labels:
    app.kubernetes.io/name: kube-controller-manager
spec:
  selector:
    component: kube-controller-manager
  type: ClusterIP
  ports:
  - name: https-metrics
    port: 10257
    targetPort: 10257
    protocol: TCP
root@k8s01:~/helm/prometheus/kube-prometheus# kubectl apply -f prometheus-kubeControllerManagerService.yaml
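The kube-scheduler job needs an analogous Service. A minimal sketch, assuming the default kubeadm layout where the scheduler's static pod carries the label component: kube-scheduler and serves HTTPS metrics on port 10259:
apiVersion: v1
kind: Service
metadata:
  namespace: kube-system
  name: kube-scheduler
  labels:
    app.kubernetes.io/name: kube-scheduler
spec:
  selector:
    component: kube-scheduler
  type: ClusterIP
  ports:
  - name: https-metrics       # scraped over HTTPS by the ServiceMonitor
    port: 10259               # kube-scheduler's secure port on kubeadm clusters
    targetPort: 10259
    protocol: TCP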
If, as in the output below, kube-controller-manager is listening only on 127.0.0.1:10257, Prometheus cannot reach it and the static pod manifest needs to be changed: set --bind-address=0.0.0.0 in /etc/kubernetes/manifests/kube-controller-manager.yaml (the manifest shown further down already contains the corrected value).
root@k8s-01:~# sudo ss -lntp | grep 10257
LISTEN 0 4096 127.0.0.1:10257 0.0.0.0:* users:(("kube-controller",pid=168489,fd=3))
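The fix is to bind the process to all interfaces. A sketch, assuming the manifest still contains the kubeadm default --bind-address=127.0.0.1 (the kubelet watches this directory and restarts the static pod after the edit):
sudo sed -i 's/--bind-address=127.0.0.1/--bind-address=0.0.0.0/' \
  /etc/kubernetes/manifests/kube-controller-manager.yaml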
root@k8s-01:~# cat /etc/kubernetes/manifests/kube-controller-manager.yaml
apiVersion: v1
kind: Pod
metadata:
  creationTimestamp: null
  labels:
    component: kube-controller-manager
    tier: control-plane
  name: kube-controller-manager
  namespace: kube-system
spec:
  containers:
  - command:
    - kube-controller-manager
    - --allocate-node-cidrs=true
    - --authentication-kubeconfig=/etc/kubernetes/controller-manager.conf
    - --authorization-kubeconfig=/etc/kubernetes/controller-manager.conf
    - --bind-address=0.0.0.0
    - --client-ca-file=/etc/kubernetes/pki/ca.crt
    - --cluster-cidr=10.244.0.0/16
    - --cluster-name=kubernetes
    - --cluster-signing-cert-file=/etc/kubernetes/pki/ca.crt
    - --cluster-signing-key-file=/etc/kubernetes/pki/ca.key
    - --controllers=*,bootstrapsigner,tokencleaner
    - --kubeconfig=/etc/kubernetes/controller-manager.conf
    - --leader-elect=true
    - --requestheader-client-ca-file=/etc/kubernetes/pki/front-proxy-ca.crt
    - --root-ca-file=/etc/kubernetes/pki/ca.crt
    - --service-account-private-key-file=/etc/kubernetes/pki/sa.key
    - --service-cluster-ip-range=10.96.0.0/12
    - --use-service-account-credentials=true
    image: registry.cn-hangzhou.aliyuncs.com/google_containers/kube-controller-manager:v1.27.0
    imagePullPolicy: IfNotPresent
    livenessProbe:
      failureThreshold: 8
      httpGet:
        host: 127.0.0.1
        path: /healthz
        port: 10257
        scheme: HTTPS
      initialDelaySeconds: 10
      periodSeconds: 10
      timeoutSeconds: 15
    name: kube-controller-manager
    resources:
      requests:
        cpu: 200m
    startupProbe:
      failureThreshold: 24
      httpGet:
        host: 127.0.0.1
        path: /healthz
        port: 10257
        scheme: HTTPS
      initialDelaySeconds: 10
      periodSeconds: 10
      timeoutSeconds: 15
    volumeMounts:
    - mountPath: /etc/ssl/certs
      name: ca-certs
      readOnly: true
    - mountPath: /etc/ca-certificates
      name: etc-ca-certificates
      readOnly: true
    - mountPath: /etc/pki
      name: etc-pki
      readOnly: true
    - mountPath: /usr/libexec/kubernetes/kubelet-plugins/volume/exec
      name: flexvolume-dir
    - mountPath: /etc/kubernetes/pki
      name: k8s-certs
      readOnly: true
    - mountPath: /etc/kubernetes/controller-manager.conf
      name: kubeconfig
      readOnly: true
    - mountPath: /usr/local/share/ca-certificates
      name: usr-local-share-ca-certificates
      readOnly: true
    - mountPath: /usr/share/ca-certificates
      name: usr-share-ca-certificates
      readOnly: true
  hostNetwork: true
  priority: 2000001000
  priorityClassName: system-node-critical
  securityContext:
    seccompProfile:
      type: RuntimeDefault
  volumes:
  - hostPath:
      path: /etc/ssl/certs
      type: DirectoryOrCreate
    name: ca-certs
  - hostPath:
      path: /etc/ca-certificates
      type: DirectoryOrCreate
    name: etc-ca-certificates
  - hostPath:
      path: /etc/pki
      type: DirectoryOrCreate
    name: etc-pki
  - hostPath:
      path: /usr/libexec/kubernetes/kubelet-plugins/volume/exec
      type: DirectoryOrCreate
    name: flexvolume-dir
  - hostPath:
      path: /etc/kubernetes/pki
      type: DirectoryOrCreate
    name: k8s-certs
  - hostPath:
      path: /etc/kubernetes/controller-manager.conf
      type: FileOrCreate
    name: kubeconfig
  - hostPath:
      path: /usr/local/share/ca-certificates
      type: DirectoryOrCreate
    name: usr-local-share-ca-certificates
  - hostPath:
      path: /usr/share/ca-certificates
      type: DirectoryOrCreate
    name: usr-share-ca-certificates
status: {}
# Do not leave the backup copy in the static pod directory: the kubelet treats every file here as a pod manifest
root@k8s-01:~# cd /etc/kubernetes/manifests/
root@k8s-01:/etc/kubernetes/manifests# ls
etcd.yaml  kube-apiserver.yaml  kube-controller-manager.yaml  kube-controller-manager.yaml.bak  kube-scheduler.yaml
root@k8s-01:/etc/kubernetes/manifests# mv kube-controller-manager.yaml.bak /root
root@k8s-01:/etc/kubernetes/manifests# ls
etcd.yaml  kube-apiserver.yaml  kube-controller-manager.yaml  kube-scheduler.yaml
root@k8s-01:/etc/kubernetes/manifests# sudo ss -lntp | grep 10257
LISTEN 0 4096 *:10257 *:* users:(("kube-controller",pid=169372,fd=3))
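The listener is now on all interfaces, and the kube-controller-manager target should show up on the Prometheus targets page within a scrape interval or two. kube-scheduler needs the same treatment, since its kubeadm default is also 127.0.0.1 and its secure port is 10259:
sudo sed -i 's/--bind-address=127.0.0.1/--bind-address=0.0.0.0/' \
  /etc/kubernetes/manifests/kube-scheduler.yaml
sudo ss -lntp | grep 10259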