https://lightsail.aws.amazon.com 服务器
# Install Prometheus 2.37.1 under /opt and run it as a systemd service.
cd /opt
# Download the release tarball
wget https://github.com/prometheus/prometheus/releases/download/v2.37.1/prometheus-2.37.1.linux-amd64.tar.gz
# Verify the tarball against its SHA-256 checksum
# (sha256sum format: checksum, two spaces, file name)
echo "753f66437597cf52ada98c2f459aa8c03745475c249c9f2b40ac7b3919131ba6  prometheus-2.37.1.linux-amd64.tar.gz" | sha256sum --check
# Extract; the archive name must match the file downloaded above
tar -zxvf prometheus-2.37.1.linux-amd64.tar.gz
# Rename the extracted directory to the path used by the config and the unit file
mv prometheus-2.37.1.linux-amd64 prometheus-2.37
# Edit the main configuration file
vim /opt/prometheus-2.37/prometheus.yml
# Edit the alert rules file (BasicRules.yml); its location must match
# the rule_files entry in prometheus.yml
# Create the systemd service file
vim /usr/lib/systemd/system/prometheus.service
# Reload systemd so it picks up the new unit, then start and enable the service
systemctl daemon-reload
systemctl start prometheus
systemctl enable prometheus && systemctl is-enabled prometheus
systemctl status prometheus
此文件指定了报警(alerting)端口、报警规则配置文件(rule_files)位置,以及节点信息
/opt/prometheus-2.37/prometheus.yml
# Prometheus server configuration: global intervals, the Alertmanager
# endpoint, the alert rule files, and the scrape jobs.
global:
  scrape_interval: 15s
  evaluation_interval: 15s

# Alertmanager instance(s) that firing alerts are sent to.
alerting:
  alertmanagers:
    - static_configs:
        - targets: ['localhost:9093']

# NOTE(review): Prometheus itself is installed under /opt/prometheus-2.37;
# confirm that the rules file really lives at this separate path.
rule_files:
  - '/opt/prometheus/basicrules.yml'

scrape_configs:
  # Prometheus scraping its own metrics endpoint.
  - job_name: 'prometheus'
    scrape_interval: 5s
    static_configs:
      - targets: ['localhost:9090']

  # Targets discovered through the local Consul agent; an empty services
  # list means no filtering by service name.
  - job_name: 'consul_node'
    scrape_interval: 10s
    consul_sd_configs:
      - server: 'localhost:8500'
        services: []

  # Statically listed node_exporter endpoints.
  - job_name: 'Node'
    scrape_interval: 10s
    static_configs:
      - targets:
          - 'localhost:9100'
          - '8.2.23.56:9100'
          - '8.2.42.7:9100'
          - '8.2.74.15:9100'
          - '7.5.15.12:9100'
          - '8.2.2.11:9100'

  # The proxy jobs below set no scrape_interval and therefore use the
  # global 15s interval.
  - job_name: 'ChengDu-proxy'
    static_configs:
      - targets:
          - '7.8.2.75:9100'
          - '7.8.2.7:9100'
          - '7.9.6.9:9100'
          - '7.8.9.88:9100'

  - job_name: 'GuangZhou-proxy'
    static_configs:
      - targets:
          - '8.4.9.17:9100'
          - '8.4.5.15:9100'
          - '8.4.3.57:9100'
          - '8.4.7.13:9100'

  - job_name: 'QingDao-proxy'
    static_configs:
      - targets:
          - '8.1.9.2:9100'
          - '4.1.9.1:9100'
          - '4.1.4.2:9100'
          - '8.1.7.2:9100'
Prometheus 配置文件中 rule_files 配置项所指定的规则文件:
/opt/prometheus/basicrules.yml
# Alerting rules loaded by Prometheus through rule_files.
# Top-level list of rule groups.
groups:
  # A rule group; there may be several groups, and each group may hold
  # several rules (they show up on the Alerts page of the Prometheus UI,
  # port 9090).
  - name: basic_rule
    # Alerting rules of this group.
    rules:
      # Alert name.
      - alert: NodeCPUUsage
        # Alert condition: average idle CPU per instance below 10 %.
        expr: '(avg by (instance) (rate(node_cpu_seconds_total{mode="idle"}[2m])) * 100) < 10'
        # How long the condition must hold before the alert fires.
        for: 2m
        # Extra labels attached to the alert (key/value, e.g. the severity).
        labels:
          # Alert severity level.
          severity: critical
          service: pods
        annotations:
          # Detailed description.
          description: "{{$labels.instance}}: CPU usage is above 90% . Please check"
          # Short summary.
          summary: "{{$labels.instance}}: High CPU usage detected"

      # Available memory below 20 % of total memory.
      - alert: NodeMemoryUsage
        expr: '(((node_memory_MemAvailable_bytes)/(node_memory_MemTotal_bytes))*100) < 20'
        for: 2m
        labels:
          severity: critical
        annotations:
          description: "{{$labels.instance}}: Memory usage is above 80% ,Please check."
          summary: "{{$labels.instance}}: High memory usage detected"

      # Used space on the root filesystem above 85 %.
      - alert: NodeLowDataDisk
        expr: '((node_filesystem_size_bytes{mountpoint="/"} - node_filesystem_free_bytes{mountpoint="/"} ) / node_filesystem_size_bytes{mountpoint="/"} * 100) > 85'
        for: 2m
        labels:
          severity: critical
        annotations:
          description: "{{$labels.instance}}: Data disk usage is above 85%, Please check. "
          # Key was misspelled "summmary", so the summary annotation was missing.
          summary: "{{$labels.instance}}: Low data disk space"

      # Scrape target not up for 5 minutes.
      - alert: 节点下线
        expr: up != 1
        for: 5m
        # NOTE(review): this rule carries status/team labels but no severity
        # label, unlike the rules above; severity-based routing or inhibition
        # will not match it -- confirm this is intended.
        labels:
          status: High
          team: operations
        annotations:
          # Duration in the text now matches the "for: 5m" clause above.
          description: "{{ $labels.instance }} of job {{ $labels.job }} has been down for more than 5 minutes."
          summary: "Instance {{ $labels.instance }} 节点掉线"
# systemd unit for the Prometheus server installed in /opt/prometheus-2.37.
[Unit]
Description=prometheus server
Documentation=https://prometheus.io/docs/introduction/overview/
# Start only after the network is actually up; Wants= alone pulls the
# target in but does not order this unit after it.
Wants=network-online.target
After=network-online.target

[Service]
# NOTE(review): this user needs write access to the --storage.tsdb.path
# directory below -- confirm ownership of /opt/prometheus-2.37.
User=www
Type=simple
Restart=on-failure
WorkingDirectory=/opt/prometheus-2.37/
ExecStart=/opt/prometheus-2.37/prometheus \
    --config.file=/opt/prometheus-2.37/prometheus.yml \
    --storage.tsdb.path=/opt/prometheus-2.37/data
# Sends SIGHUP to the main process on "systemctl reload prometheus".
ExecReload=/bin/kill -HUP $MAINPID
RuntimeDirectory=prometheus
RuntimeDirectoryMode=0750
LimitNOFILE=100000
TimeoutStopSec=20

[Install]
WantedBy=multi-user.target
# Install node_exporter 1.6.0 and run it as a systemd service.
# Work in /data so the binary ends up at /data/node_exporter-1.6/node_exporter,
# the path used by ExecStart in node_exporter.service.
mkdir -p /data && cd /data
# Download the release tarball
wget https://github.com/prometheus/node_exporter/releases/download/v1.6.0/node_exporter-1.6.0.linux-amd64.tar.gz
# Extract and rename the directory
tar -zxvf node_exporter-1.6.0.linux-amd64.tar.gz
mv node_exporter-1.6.0.linux-amd64/ node_exporter-1.6/
# Create the systemd service file
touch /usr/lib/systemd/system/node_exporter.service
vim /usr/lib/systemd/system/node_exporter.service
# Reload systemd so it picks up the new unit, then start and enable the service
systemctl daemon-reload
systemctl start node_exporter
systemctl enable node_exporter && systemctl is-enabled node_exporter
systemctl status node_exporter
/usr/lib/systemd/system/node_exporter.service
# systemd unit for node_exporter, listening on port 9100.
[Unit]
Description=node_exporter
Documentation=https://prometheus.io/docs/introduction/overview/
# Wants= pulls the target in; After= is what actually delays the start
# until the network is online.
Wants=network-online.target
After=network-online.target

[Service]
User=www
Type=simple
ExecStart=/data/node_exporter-1.6/node_exporter --web.listen-address=:9100

[Install]
WantedBy=multi-user.target
# Alertmanager configuration: grouping/routing of alerts, the webhook
# receiver, and one inhibition rule.
global:
  # Resolve timeout.
  resolve_timeout: 5m

route:
  # Alerts are grouped by these labels.
  group_by: ['alertname']
  # Wait this long before sending the first notification for a new group,
  # so alerts of the same group arriving within 30s go out together.
  group_wait: 30s
  # Interval between notifications for a group; changes within the group
  # are merged into one notification sent after 2m.
  group_interval: 2m
  # Re-send the notification if the alert is still not fixed after this time.
  repeat_interval: 12h
  # Receiver used by this route.
  receiver: 'web.hook'

# Receivers define who is notified and through which channel.
receivers:
  # Receiver name, referenced by route.receiver above.
  - name: 'web.hook'
    # Webhook endpoints that notifications are posted to.
    webhook_configs:
      - url: 'http://127.0.0.1:18899/alertmanager'
      # - url: 'http://127.0.0.1:9087/alert/-676279312'

# Inhibition rules: while an alert matching source_match is firing, alerts
# matching target_match are suppressed.
inhibit_rules:
  # Source alerts: severity critical.
  - source_match:
      severity: 'critical'
    # Target alerts to suppress: severity warning.
    target_match:
      severity: 'warning'
    # Inhibition applies only when source and target alerts have equal
    # values for all of these labels.
    equal: ['alertname', 'dev', 'instance']
(((grafana 配置文件)))
http://13.xx.xx.xx:3000/login
grafana 配置文件
/etc/grafana