Page MenuHomePhorge

No OneTemporary

Authored By
Unknown
Size
6 KB
Referenced Files
None
Subscribers
None
diff --git a/helm/files/cluster.rules.yml b/helm/files/cluster.rules.yml
index c0c7457..98ea79f 100644
--- a/helm/files/cluster.rules.yml
+++ b/helm/files/cluster.rules.yml
@@ -1,121 +1,130 @@
groups:
- name: cluster-warnings
rules:
- alert: UnschedulablePods
annotations:
description: There are unschedulable pods.
summary: There are unschedulable pods.
expr: sum(kube_pod_status_unschedulable) > 1
for: 5m
labels:
severity: critical
- alert: PodStuckInState
annotations:
description: Pods are stuck in a state.
summary: Pods are stuck in a state.
expr: max_over_time(kube_pod_container_status_waiting_reason[5m]) >= 1
for: 5m
labels:
severity: critical
- alert: StalledDeployment
annotations:
description: The deployment is stalled.
summary: The deployment is stalled.
expr: |
(
kube_deployment_spec_replicas > kube_deployment_status_replicas_available
) and (
changes(kube_deployment_status_replicas_updated[10m]) == 0
)
for: 5m
labels:
severity: critical
- alert: TooManyEvictedPods
expr: sum(kube_pod_status_reason{reason="Evicted"}) >= 1
labels:
severity: high
annotations:
message: 'Too many Failed Evicted Pods: {{ $value }}'
- name: kube-state-metrics
rules:
- alert: KubeStateMetricsListErrors
annotations:
description: kube-state-metrics is experiencing errors at an elevated rate in list operations. This is likely causing it to not be able to expose metrics about Kubernetes objects correctly or at all.
summary: kube-state-metrics is experiencing errors in list operations.
expr: |
(sum(rate(kube_state_metrics_list_total{job="kube-state-metrics",result="error"}[5m])) by (cluster)
/
sum(rate(kube_state_metrics_list_total{job="kube-state-metrics"}[5m])) by (cluster))
> 0.01
for: 15m
labels:
severity: critical
- alert: KubeStateMetricsWatchErrors
annotations:
description: kube-state-metrics is experiencing errors at an elevated rate in watch operations. This is likely causing it to not be able to expose metrics about Kubernetes objects correctly or at all.
summary: kube-state-metrics is experiencing errors in watch operations.
expr: |
(sum(rate(kube_state_metrics_watch_total{job="kube-state-metrics",result="error"}[5m])) by (cluster)
/
sum(rate(kube_state_metrics_watch_total{job="kube-state-metrics"}[5m])) by (cluster))
> 0.01
for: 15m
labels:
severity: critical
- alert: KubeStateMetricsShardingMismatch
annotations:
description: kube-state-metrics pods are running with different --total-shards configuration, some Kubernetes objects may be exposed multiple times or not exposed at all.
summary: kube-state-metrics sharding is misconfigured.
expr: |
stdvar (kube_state_metrics_total_shards{job="kube-state-metrics"}) by (cluster) != 0
for: 15m
labels:
severity: critical
- alert: KubeStateMetricsShardsMissing
annotations:
description: kube-state-metrics shards are missing, some Kubernetes objects are not being exposed.
summary: kube-state-metrics shards are missing.
expr: |
2^max(kube_state_metrics_total_shards{job="kube-state-metrics"}) by (cluster) - 1
-
sum( 2 ^ max by (cluster, shard_ordinal) (kube_state_metrics_shard_ordinal{job="kube-state-metrics"}) ) by (cluster)
!= 0
for: 15m
labels:
severity: critical
- name: vector-alerts
rules:
- alert: VectorHTTPClientErrors
annotations:
description: Vector instance {{ $labels.host }} sink {{ $labels.component_id }} is experiencing http client errors ({{ $labels.error_kind }}).
summary: Vector is experiencing http client errors.
expr: rate(vector_http_client_errors_total[5m]) > 0
labels:
severity: high
- name: resource-limits
rules:
- alert: RunningOutOfDiskSpace
annotations:
description: There is less than 10% space left on {{ $labels.mountpoint }} on {{ $labels.instance }}.
summary: There is less than 10% space left on a monitored host partition.
expr: host_filesystem_used_ratio{mountpoint=~"/host/root/data.*|/host/root/var/lib/rancher/k3s/.*|/host/root"} * 100 > 90
labels:
severity: critical
- alert: PVCRunningOutOfDiskSpace
annotations:
summary: There is less than 10% space left on PVC {{ $labels.persistentvolumeclaim }}.
expr: 100 / kubelet_volume_stats_capacity_bytes * kubelet_volume_stats_used_bytes > 90
labels:
severity: critical
- alert: PVCRunningOutOfInodes
annotations:
summary: There are fewer than 10000 inodes free on PVC {{ $labels.persistentvolumeclaim }}.
expr: kubelet_volume_stats_inodes_free < 10000
labels:
severity: critical
+
+ # https://stackoverflow.com/questions/65428558/what-is-the-difference-between-container-memory-working-set-bytes-and-contain
+ # We may want to be monitoring container_memory_rss instead of container_memory_working_set_bytes
+ # It seems rss is too low and the others are too high, so I'm not sure what we're supposed to monitor.
+ # For IMAP we're going to be using all possible in working_set_bytes because of caching.
+ # - container_memory_cache
- alert: Pod too close to memory resource limit
annotations:
summary: The pod {{ $labels.pod }} is at over 80% of the memory resource limit.
- expr: round(max by (pod,container)(max_over_time(container_memory_working_set_bytes{pod=~".*" }[5m]))/ on (pod,container) (max by (pod,container) (kube_pod_container_resource_limits)) * 100,0.01) > 80
+ # This is what we used to monitor, but which turned out to basically always report all available memory for imap and ldap
+ # expr: round(max by (pod,container)(max_over_time(container_memory_working_set_bytes{pod=~".*" }[5m]))/ on (pod,container) (max by (pod,container) (kube_pod_container_resource_limits)) * 100,0.01) > 80
+ # The following seems to be a better approximation when we compare to what ps reports
+ expr: round(max by (pod,container)(container_memory_usage_bytes{pod=~".*" } - container_memory_cache)/ on (pod,container) (max by (pod,container) (kube_pod_container_resource_limits)) * 100,0.01) > 80
labels:
severity: critical

File Metadata

Mime Type
text/x-diff
Expires
Sat, Apr 4, 7:13 AM (1 w, 4 d ago)
Storage Engine
blob
Storage Format
Raw Data
Storage Handle
18822964
Default Alt Text
(6 KB)

Event Timeline