Once an application is in production, you need monitoring to make sure it keeps running healthily and stably, and you need it to raise alerts when something goes wrong. Installing and configuring a complete stack for metrics collection, storage, and alerting is a lot of hassle, and the stack itself then has to be maintained, which adds a fair amount of extra work.

This post records how to deploy Prometheus + Grafana on a Kubernetes cluster with a single Helm command; the install command and the full values file follow below.

helm-prometheus

# Namespace: monitoring
# Chart Name: prometheus-operator
# Chart Version: 5.10.4
# Application Version: 0.29.0

# helm upgrade --install prometheus-operator -f prometheus-operator-5.10.4.yaml stable/prometheus-operator --version 5.10.4 --namespace monitoring
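#
# Prerequisites before running the command above (a sketch, assuming Helm 2
# with Tiller already set up and the stable chart repo not yet added —
# adjust to your environment):
#
# helm repo add stable https://kubernetes-charts.storage.googleapis.com
# helm repo update
# kubectl create namespace monitoring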

---

additionalPrometheusRules: []
alertmanager:
alertmanagerSpec:
additionalPeers: []
affinity: {}
configMaps: []
containers: []
externalUrl: null
image:
repository: quay.io/prometheus/alertmanager
tag: v0.17.0
listenLocal: false
logLevel: info
nodeSelector: {}
paused: false
podAntiAffinity: ""
podAntiAffinityTopologyKey: kubernetes.io/hostname
podMetadata: {}
priorityClassName: ""
replicas: 1
resources: {}
retention: 120h
routePrefix: /
secrets: []
securityContext:
fsGroup: 2000
runAsNonRoot: true
runAsUser: 1000
storage: {}
tolerations: []
useExistingSecret: false
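# How the routing in the config below works: alerts are grouped by the "job"
# label and go to the default receiver "alerta". Both routes match
# severity=critical and set "continue: true", so a critical alert is delivered
# to alerta *and* rocketchat, while non-critical alerts only reach the default
# alerta receiver.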
config:
global:
resolve_timeout: 5m
receivers:
- name: alerta
webhook_configs:
- send_resolved: true
url: https://alerta.xxxx.org/api/webhooks/prometheus?api-key=xxxxxxx
- name: rocketchat
webhook_configs:
- send_resolved: false
url: https://chat.xxxx.org/hooks/xxxx/xxxx
route:
group_by:
- job
group_interval: 5m
group_wait: 30s
receiver: alerta
repeat_interval: 3h
routes:
- continue: true
match:
severity: critical
receiver: alerta
- continue: true
match:
severity: critical
receiver: rocketchat
enabled: true
ingress:
annotations: {}
enabled: true
hosts:
- alert.xxxx.org
labels: {}
paths: []
tls:
- hosts:
- alert.xxxx.org
secretName: xxxx-tls
podDisruptionBudget:
enabled: false
maxUnavailable: ""
minAvailable: 1
service:
annotations: {}
clusterIP: ""
externalIPs: []
labels: {}
loadBalancerIP: ""
loadBalancerSourceRanges: []
nodePort: 30903
type: ClusterIP
serviceAccount:
create: true
name: ""
serviceMonitor:
interval: ""
metricRelabelings: []
relabelings: []
selfMonitor: true
templateFiles: {}
commonLabels: {}
coreDns:
enabled: true
service:
port: 9153
selector:
k8s-app: kube-dns
targetPort: 9153
serviceMonitor:
interval: ""
metricRelabelings: []
relabelings: []
defaultRules:
annotations: {}
create: true
labels:
environment: aliyun
rules:
alertmanager: true
etcd: true
general: true
k8s: true
kubeApiserver: true
kubePrometheusNodeAlerting: true
kubePrometheusNodeRecording: true
kubeScheduler: true
kubernetesAbsent: true
kubernetesApps: true
kubernetesResources: true
kubernetesStorage: true
kubernetesSystem: true
node: true
prometheus: true
prometheusOperator: true
fullnameOverride: ""
global:
imagePullSecrets: []
rbac:
create: true
pspEnabled: true
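# Note on the Grafana section below: LDAP auth is enabled, so the Secret named
# in grafana.ldap.existingSecret ("grafana-ldap-toml") must already exist in
# the monitoring namespace. A minimal sketch of creating it (the grafana chart
# expects the file under the key "ldap-toml" — verify against your chart
# version):
#
# kubectl -n monitoring create secret generic grafana-ldap-toml \
#     --from-file=ldap-toml=./ldap.toml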
grafana:
additionalDataSources: []
adminPassword: xxxx
defaultDashboardsEnabled: true
enabled: true
extraConfigmapMounts: []
grafana.ini:
analytics:
check_for_updates: true
auth.ldap:
allow_sign_up: true
config_file: /etc/grafana/ldap.toml
enabled: true
grafana_net:
url: https://grafana.net
log:
mode: console
paths:
data: /var/lib/grafana/data
logs: /var/log/grafana
plugins: /var/lib/grafana/plugins
provisioning: /etc/grafana/provisioning
smtp:
enabled: true
from_address: grafana@xxxx.org
from_name: xxxx
host: xxxx:25
skip_verify: true
ingress:
annotations: null
enabled: true
hosts:
- grafana.xxxx.org
labels: {}
path: /
tls:
- hosts:
- grafana.xxxx.org
secretName: xxxx-tls
ldap:
existingSecret: grafana-ldap-toml
persistence:
accessModes:
- ReadWriteOnce
enabled: false
size: 20Gi
serviceMonitor:
interval: ""
metricRelabelings: []
relabelings: []
selfMonitor: true
sidecar:
dashboards:
enabled: true
label: grafana_dashboard
searchNamespace: ALL
datasources:
defaultDatasourceEnabled: true
enabled: true
label: grafana_datasource
searchNamespace: ALL
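# The sidecar configured above watches all namespaces for ConfigMaps labeled
# "grafana_dashboard" and loads them into Grafana automatically. A minimal
# sketch of such a ConfigMap (names and the dashboard JSON are placeholders):
#
# apiVersion: v1
# kind: ConfigMap
# metadata:
#   name: my-app-dashboard
#   labels:
#     grafana_dashboard: "1"
# data:
#   my-app.json: |
#     { "title": "My App", "panels": [] }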
kube-state-metrics:
podSecurityPolicy:
enabled: true
rbac:
create: true
kubeApiServer:
enabled: true
relabelings: []
serviceMonitor:
interval: ""
jobLabel: component
metricRelabelings: []
selector:
matchLabels:
component: apiserver
provider: kubernetes
tlsConfig:
insecureSkipVerify: false
serverName: kubernetes
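# The control-plane targets below (kube-controller-manager, kube-dns, etcd,
# kube-scheduler) are all disabled. On a managed cluster (this values file is
# labeled environment=aliyun) those components run on machines you cannot
# reach, so scraping them would only produce permanently-down targets and
# spurious alerts.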
kubeControllerManager:
enabled: false
endpoints: []
service:
port: 10252
selector:
component: kube-controller-manager
targetPort: 10252
serviceMonitor:
https: false
interval: ""
metricRelabelings: []
relabelings: []
kubeDns:
enabled: false
service:
selector:
k8s-app: kube-dns
serviceMonitor:
interval: ""
metricRelabelings: []
relabelings: []
kubeEtcd:
enabled: false
endpoints: []
service:
port: 2379
selector:
k8s-app: etcd
targetPort: 2379
serviceMonitor:
caFile: /etc/prometheus/secrets/etcd-client-cert/etcd-ca
certFile: /etc/prometheus/secrets/etcd-client-cert/etcd-client
insecureSkipVerify: false
interval: ""
keyFile: /etc/prometheus/secrets/etcd-client-cert/etcd-client-key
metricRelabelings: []
relabelings: []
scheme: https
serverName: ""
kubeScheduler:
enabled: false
endpoints: []
service:
port: 10251
selector:
component: kube-scheduler
targetPort: 10251
serviceMonitor:
https: false
interval: ""
metricRelabelings: []
relabelings: []
kubeStateMetrics:
enabled: true
serviceMonitor:
interval: ""
metricRelabelings: []
relabelings: []
kubelet:
enabled: true
namespace: kube-system
serviceMonitor:
cAdvisorMetricRelabelings: []
cAdvisorRelabelings: []
https: false
interval: ""
nameOverride: ""
nodeExporter:
enabled: true
jobLabel: jobLabel
serviceMonitor:
interval: ""
metricRelabelings: []
relabelings: []
prometheus:
additionalServiceMonitors: []
enabled: true
ingress:
annotations: {}
enabled: true
hosts:
- prometheus.xxxx.org
labels: {}
paths: []
tls:
- hosts:
- prometheus.xxxx.org
secretName: xxxx-tls
podDisruptionBudget:
enabled: false
maxUnavailable: ""
minAvailable: 1
prometheusSpec:
additionalAlertManagerConfigs: []
additionalAlertRelabelConfigs: []
additionalScrapeConfigs: []
additionalScrapeConfigsExternal: false
affinity: {}
alertingEndpoints: []
configMaps: []
containers: []
enableAdminAPI: false
evaluationInterval: ""
externalLabels:
environment: aliyun
externalUrl: ""
image:
repository: quay.io/prometheus/prometheus
tag: v2.9.1
listenLocal: false
logFormat: logfmt
logLevel: info
nodeSelector: {}
paused: false
podAntiAffinity: ""
podAntiAffinityTopologyKey: kubernetes.io/hostname
podMetadata: {}
priorityClassName: ""
query: {}
remoteRead: []
remoteWrite: []
replicas: 1
resources: {}
retention: 10d
routePrefix: /
ruleNamespaceSelector: {}
ruleSelector: {}
ruleSelectorNilUsesHelmValues: true
scrapeInterval: ""
secrets: []
securityContext:
fsGroup: 2000
runAsNonRoot: true
runAsUser: 1000
serviceMonitorNamespaceSelector: {}
serviceMonitorSelector: {}
serviceMonitorSelectorNilUsesHelmValues: true
storageSpec: {}
thanos: {}
tolerations: []
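# Two gotchas in the spec above. First, storageSpec is empty, so Prometheus
# keeps its 10d of data on ephemeral storage and loses it on pod restart; to
# persist it, set a volumeClaimTemplate, e.g. (storage class name is a
# placeholder):
#
#   storageSpec:
#     volumeClaimTemplate:
#       spec:
#         storageClassName: your-storage-class
#         resources:
#           requests:
#             storage: 50Gi
#
# Second, with serviceMonitorSelectorNilUsesHelmValues: true, Prometheus only
# picks up ServiceMonitors carrying this Helm release's label. A minimal
# sketch of one that would be discovered (app labels and port name are
# placeholders):
#
# apiVersion: monitoring.coreos.com/v1
# kind: ServiceMonitor
# metadata:
#   name: my-app
#   namespace: monitoring
#   labels:
#     release: prometheus-operator
# spec:
#   selector:
#     matchLabels:
#       app: my-app
#   endpoints:
#   - port: http-metrics
#     interval: 30s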
rbac:
roleNamespaces:
- kube-system
service:
annotations: {}
clusterIP: ""
externalIPs: []
labels: {}
loadBalancerIP: ""
loadBalancerSourceRanges: []
nodePort: 30090
sessionAffinity: ""
targetPort: 9090
type: ClusterIP
serviceAccount:
create: true
name: ""
serviceMonitor:
interval: ""
metricRelabelings: []
relabelings: []
selfMonitor: true
prometheus-node-exporter:
extraArgs:
- --collector.filesystem.ignored-mount-points=^/(dev|proc|sys|var/lib/docker/.+)($|/)
- --collector.filesystem.ignored-fs-types=^(autofs|binfmt_misc|cgroup|configfs|debugfs|devpts|devtmpfs|fusectl|hugetlbfs|mqueue|overlay|proc|procfs|pstore|rpc_pipefs|securityfs|sysfs|tracefs)$
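# The two flags above keep node-exporter's filesystem metrics meaningful: they
# exclude container-runtime mounts (/var/lib/docker/...) and virtual/pseudo
# filesystems (proc, sysfs, overlay, ...) so disk usage reflects real volumes.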
podLabels:
jobLabel: node-exporter
prometheusOperator:
affinity: {}
cleanupCustomResource: false
configmapReloadImage:
repository: quay.io/coreos/configmap-reload
tag: v0.0.1
crdApiGroup: monitoring.coreos.com
createCustomResource: true
enabled: true
hyperkubeImage:
pullPolicy: IfNotPresent
repository: k8s.gcr.io/hyperkube
tag: v1.12.1
image:
pullPolicy: IfNotPresent
repository: quay.io/coreos/prometheus-operator
tag: v0.29.0
kubeletService:
enabled: true
namespace: kube-system
nodeSelector: {}
podAnnotations: {}
podLabels: {}
prometheusConfigReloaderImage:
repository: quay.io/coreos/prometheus-config-reloader
tag: v0.29.0
resources: {}
securityContext:
runAsNonRoot: true
runAsUser: 65534
service:
additionalPorts: []
annotations: {}
clusterIP: ""
externalIPs: []
labels: {}
loadBalancerIP: ""
loadBalancerSourceRanges: []
nodePort: 30080
type: ClusterIP
serviceAccount:
create: true
name: ""
serviceMonitor:
interval: ""
metricRelabelings: []
relabelings: []
selfMonitor: true
tolerations: []
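After the helm upgrade --install command at the top finishes, check that everything came up and take a quick look at Prometheus itself (the service name below assumes the release name prometheus-operator used in this post):

kubectl --namespace monitoring get pods

kubectl --namespace monitoring port-forward svc/prometheus-operator-prometheus 9090

With the Ingress entries in the values file, Prometheus, Alertmanager, and Grafana are also reachable at prometheus.xxxx.org, alert.xxxx.org, and grafana.xxxx.org respectively.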