您的位置 首页 prometheus

node_rules.yml

groups:
- name: 主机状态-监控告警
  rules:
  - alert: 主机状态
    expr: up {job="kubernetes-nodes"} == 0
    for: 15s
    labels:
      status: 非常严重
    annotations:
      summary: "{{.instance}}:服务器宕机"
      description: "{{.instance}}:服务器延时超过15s"

  - alert: CPU使用情况
    expr: 100-(avg(irate(node_cpu_seconds_total{mode="idle"}[5m])) by(instance)* 100) > 60
    for: 1m
    labels:
      status: warning
    annotations:
      summary: "{{$labels.instance}}: High CPU Usage Detected"
      description: "{{$labels.instance}}: CPU usage is {{$value}}, above 60%"

  - alert: NodeFilesystemUsage
    expr: 100 - (node_filesystem_free_bytes{fstype=~"ext4|xfs"} / node_filesystem_size_bytes{fstype=~"ext4|xfs"} * 100) > 80
    for: 1m
    labels:
      severity: warning
    annotations:
      summary: "Instance {{ $labels.instance }} : {{ $labels.mountpoint }} 分区使用率过高"
      description: "{{ $labels.instance }}: {{ $labels.mountpoint }} 分区使用大于80% (当前值: {{ $value }})"

  - alert: 内存使用
    expr: (node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes * 100 > 80
    for: 1m
    labels:
      status: 严重告警
    annotations:
      summary: "{{ $labels.instance}} 内存使用率过高!"
      description: "{{ $labels.instance }} 内存使用大于80%(目前使用:{{ $value}}%)"


  - alert: IO性能
    expr: (avg(irate(node_disk_io_time_seconds_total[1m])) by(instance)* 100) > 60
    for: 1m
    labels:
      status: 严重告警
    annotations:
      summary: "{{$labels.instance}} 流入磁盘IO使用率过高!"
      description: "{{ $labels.instance }} 流入磁盘IO大于60%(目前使用:{{ $value }})"


  - alert: 网络
    expr: ((sum(rate (node_network_receive_bytes_total{device!~'tap.*|veth.*|br.*|docker.*|virbr*|lo*'}[5m])) by (instance)) / 100) > 102400
    for: 1m
    labels:
      status: 严重告警
    annotations:
      summary: "{{ $labels.instance}} 流入网络带宽过高!"
      description: "{{ $labels.instance }}流入网络带宽持续2分钟高于100M. RX带宽使用率{{ $value }}"
  - alert: TCP会话
    expr: node_netstat_Tcp_CurrEstab > 1000
    for: 1m
    labels:
      status: 严重告警
    annotations:
      summary: "{{ $labels.instance }} TCP_ESTABLISHED过高!"
      description: "{{ $labels.instance }} TCP_ESTABLISHED大于1000%(目前使用:{{ $value }}%)"


 

欢迎来撩 : 汇总all

白眉大叔

关于白眉大叔linux云计算: 白眉大叔

热门文章