Page MenuHomePhorge

No OneTemporary

Authored By
Unknown
Size
6 KB
Referenced Files
None
Subscribers
None
diff --git a/helm/files/cluster.rules.yml b/helm/files/cluster.rules.yml
index c0c7457..98ea79f 100644
--- a/helm/files/cluster.rules.yml
+++ b/helm/files/cluster.rules.yml
@@ -1,121 +1,130 @@
groups:
- name: cluster-warnings
rules:
- alert: UnschedulablePods
annotations:
description: There are unschedulable pods.
summary: There are unschedulable pods.
expr: sum(kube_pod_status_unschedulable) > 1
for: 5m
labels:
severity: critical
- alert: PodStuckInState
annotations:
description: Pods are stuck in a state.
summary: Pods are stuck in a state.
expr: max_over_time(kube_pod_container_status_waiting_reason[5m]) >= 1
for: 5m
labels:
severity: critical
- alert: StalledDeployment
annotations:
description: The deployment is stalled.
summary: The deployment is stalled.
expr: |
(
kube_deployment_spec_replicas > kube_deployment_status_replicas_available
) and (
changes(kube_deployment_status_replicas_updated[10m]) == 0
)
for: 5m
labels:
severity: critical
- alert: TooManyEvictedPods
expr: sum(kube_pod_status_reason{reason="Evicted"}) >= 1
labels:
severity: high
annotations:
message: 'Too many Failed Evicted Pods: {{ $value }}'
- name: kube-state-metrics
rules:
- alert: KubeStateMetricsListErrors
annotations:
description: kube-state-metrics is experiencing errors at an elevated rate in list operations. This is likely causing it to not be able to expose metrics about Kubernetes objects correctly or at all.
summary: kube-state-metrics is experiencing errors in list operations.
expr: |
(sum(rate(kube_state_metrics_list_total{job="kube-state-metrics",result="error"}[5m])) by (cluster)
/
sum(rate(kube_state_metrics_list_total{job="kube-state-metrics"}[5m])) by (cluster))
> 0.01
for: 15m
labels:
severity: critical
- alert: KubeStateMetricsWatchErrors
annotations:
description: kube-state-metrics is experiencing errors at an elevated rate in watch operations. This is likely causing it to not be able to expose metrics about Kubernetes objects correctly or at all.
summary: kube-state-metrics is experiencing errors in watch operations.
expr: |
(sum(rate(kube_state_metrics_watch_total{job="kube-state-metrics",result="error"}[5m])) by (cluster)
/
sum(rate(kube_state_metrics_watch_total{job="kube-state-metrics"}[5m])) by (cluster))
> 0.01
for: 15m
labels:
severity: critical
- alert: KubeStateMetricsShardingMismatch
annotations:
description: kube-state-metrics pods are running with different --total-shards configuration, some Kubernetes objects may be exposed multiple times or not exposed at all.
summary: kube-state-metrics sharding is misconfigured.
expr: |
stdvar (kube_state_metrics_total_shards{job="kube-state-metrics"}) by (cluster) != 0
for: 15m
labels:
severity: critical
- alert: KubeStateMetricsShardsMissing
annotations:
description: kube-state-metrics shards are missing, some Kubernetes objects are not being exposed.
summary: kube-state-metrics shards are missing.
expr: |
2^max(kube_state_metrics_total_shards{job="kube-state-metrics"}) by (cluster) - 1
-
sum( 2 ^ max by (cluster, shard_ordinal) (kube_state_metrics_shard_ordinal{job="kube-state-metrics"}) ) by (cluster)
!= 0
for: 15m
labels:
severity: critical
- name: vector-alerts
rules:
- alert: VectorHTTPClientErrors
annotations:
description: Vector instance {{ $labels.host }} sink {{ $labels.component_id }} is experiencing http client errors ({{ $labels.error_kind }}).
summary: Vector is experiencing http client errors.
expr: rate(vector_http_client_errors_total[5m]) > 0
labels:
severity: high
- name: resource-limits
rules:
- alert: RunningOutOfDiskSpace
annotations:
description: There is less than 10% space left on {{ $labels.mountpoint }} on {{ $labels.instance }}.
summary: There is less than 10% space left on a monitored host partition.
expr: host_filesystem_used_ratio{mountpoint=~"/host/root/data.*|/host/root/var/lib/rancher/k3s/.*|/host/root"} * 100 > 90
labels:
severity: critical
- alert: PVCRunningOutOfDiskSpace
annotations:
summary: There is less than 10% space left on PVC {{ $labels.persistentvolumeclaim }}.
expr: 100 / kubelet_volume_stats_capacity_bytes * kubelet_volume_stats_used_bytes > 90
labels:
severity: critical
- alert: PVCRunningOutOfInodes
annotations:
summary: There are fewer than 10000 inodes free on PVC {{ $labels.persistentvolumeclaim }}.
expr: kubelet_volume_stats_inodes_free < 10000
labels:
severity: critical
+
+ # https://stackoverflow.com/questions/65428558/what-is-the-difference-between-container-memory-working-set-bytes-and-contain
+ # We may want to be monitoring container_memory_rss instead of container_memory_working_set_bytes
+ # It seems rss is too low and the others are too high, so I'm not sure what we're supposed to monitor.
+ # For IMAP we're going to be using all possible in working_set_bytes because of caching.
+ # - container_memory_cache
- alert: Pod too close to memory resource limit
annotations:
summary: The pod {{ $labels.pod }} is at over 80% of the memory resource limit.
- expr: round(max by (pod,container)(max_over_time(container_memory_working_set_bytes{pod=~".*" }[5m]))/ on (pod,container) (max by (pod,container) (kube_pod_container_resource_limits)) * 100,0.01) > 80
+ # This is what we used to monitor, but which turned out to basically always report all available memory for imap and ldap
+ # expr: round(max by (pod,container)(max_over_time(container_memory_working_set_bytes{pod=~".*" }[5m]))/ on (pod,container) (max by (pod,container) (kube_pod_container_resource_limits)) * 100,0.01) > 80
+ # The following seems to be a better approximation when we compare to what ps reports
+ expr: round(max by (pod,container)(container_memory_usage_bytes{pod=~".*" } - container_memory_cache)/ on (pod,container) (max by (pod,container) (kube_pod_container_resource_limits)) * 100,0.01) > 80
labels:
severity: critical

File Metadata

Mime Type
text/x-diff
Expires
Sat, Apr 4, 7:13 AM (1 w, 4 d ago)
Storage Engine
blob
Storage Format
Raw Data
Storage Handle
18822964
Default Alt Text
(6 KB)

Event Timeline