Page MenuHomePhorge

No OneTemporary

Authored By
Unknown
Size
7 KB
Referenced Files
None
Subscribers
None
diff --git a/helm/files/web.rules.yml b/helm/files/web.rules.yml
index 627b7a9..cc88068 100644
--- a/helm/files/web.rules.yml
+++ b/helm/files/web.rules.yml
@@ -1,175 +1,175 @@
groups:
- name: certificate-checks
rules:
- alert: certificate_validity_28_days
expr: ((probe_ssl_earliest_cert_expiry - time ()) / 60 / 60 / 24) < 28
for: 1m
labels:
severity: warning
annotations:
summary: "28 day warning that a certificate is about to expire on {{ $labels.instance }} in {{ $value }} days"
- alert: certificate_validity_10_days
expr: ((probe_ssl_earliest_cert_expiry - time ()) / 60 / 60 / 24) < 10
for: 1m
labels:
severity: critical
annotations:
summary: "10 day warning that a certificate is about to expire on {{ $labels.instance }} in {{ $value }} days"
- name: infrastructure-checks
rules:
- alert: blackbox_failed_probe
expr: probe_success == 0
for: 1m
labels:
severity: info
annotations:
summary: "A blackbox probe failed on {{ $labels.instance }}. To debug: kubectl exec deployment/prometheus -- sh -c \"wget -O- 'http://localhost:9115/probe?target={{ $labels.instance }}&module={{ $labels.module }}&debug=true'\""
- alert: failed_kolab_infrastructure_check
expr: kolab_infrastructure_check == 0
for: 35m
labels:
severity: critical
annotations:
summary: "An infrastructure check failed. Instance: {{ $labels.instance }} Name: {{ $labels.testname }}"
- alert: outdated_kolab_infrastructure_check
expr: (time() - kolab_infrastructure_check_timestamp) > ( 60 * 60 * 2 )
for: 1m
labels:
severity: warning
annotations:
summary: "We haven't received an updated infrastructure status in 2h"
- alert: too_many_loki_errors
expr: loki_internal_log_messages_total{level="error"} > 10
for: 1m
labels:
severity: warning
annotations:
summary: "Too many loki errors, check if ingestion works or is overloaded"
- name: mx-checks
rules:
# - alert: kolab_outbound_mx_refused_threshold_10
# expr: kolab_outbound_mx_refused > 10
# for: 1m
# labels:
# severity: warning
# annotations:
# summary: "We have more than 10 refused emails ({{ $value }} at the moment) on {{ $labels.instance}} for today (this is a count of the daily logfile, it grows per day and is reset at midnight)."
- alert: outdated_kolab_mx_metrics
expr: (time() - kolab_mx_metrics_timestamp) > ( 60 * 60 * 2 )
for: 1m
labels:
severity: warning
annotations:
summary: "We haven't received an updated postfix status in 2h"
- alert: kolab_mx_queue_active_threshold
expr: kolab_mx_queue_length{queue="active"} > 300
for: 1m
labels:
severity: warning
annotations:
summary: "The mx {{ $labels.queue }} queue on {{ $labels.instance }} has {{ $value }} entries."
- alert: kolab_mx_queue_active_threshold_critical
expr: kolab_mx_queue_length{queue="active"} > 1000
for: 1m
labels:
severity: critical
annotations:
summary: "The mx {{ $labels.queue }} queue on {{ $labels.instance }} has {{ $value }} entries."
- alert: kolab_mx_queue_deferred_threshold
expr: kolab_mx_queue_length{queue="deferred"} > 300
for: 1m
labels:
severity: warning
annotations:
summary: "The mx {{ $labels.queue }} queue on {{ $labels.instance }} has {{ $value }} entries."
- alert: kolab_mx_queue_deferred_threshold_critical
expr: kolab_mx_queue_length{queue="deferred"} > 1000
for: 1m
labels:
severity: critical
annotations:
summary: "The mx {{ $labels.queue }} queue on {{ $labels.instance }} has {{ $value }} entries."
- name: kolab-checks
rules:
- alert: kolab_users_deleted_with_missing_cleanup
expr: kolab_users_deleted_with_missing_cleanup > 0
for: 30m
labels:
severity: warning
annotations:
summary: "{{ $value }} users that have been deleted require manual cleanup on {{ $labels.instance }} via ./artisan user:resync --deleted-only "
- alert: kolab_horizon_recent_failed_jobs
expr: rate(kolab_horizon_recent_failed_jobs[1h])*60*60 > 2
labels:
severity: warning
annotations:
summary: "{{ $value }} horizon jobs have failed in the past hour "
- name: log-rules
rules:
- alert: kolab_log_imap_error
- expr: count_over_time(kolab_log_imap_error_rate5m[1h]) > 0
+ expr: count_over_time(kolab_log_imap_error_rate1m[1h]) > 0
labels:
severity: warning
annotations:
summary: "Metric is over threshold"
- alert: kolab_log_postfix_rejection
- expr: count_over_time(kolab_log_postfix_rejection_rate5m[1h]) > 0
+ expr: count_over_time(kolab_log_postfix_rejection_rate1m[1h]) > 0
labels:
severity: warning
annotations:
summary: "Metric is over threshold"
- alert: kolab_log_postfix_temporary_error
- expr: count_over_time(kolab_log_postfix_temporary_error_rate5m[1h]) > 0
+ expr: count_over_time(kolab_log_postfix_temporary_error_rate1m[1h]) > 0
labels:
severity: warning
annotations:
summary: "Metric is over threshold"
- alert: kolab_log_error
- expr: count_over_time(kolab_log_error_rate5m[1h]) > 0
+ expr: count_over_time(kolab_log_error_rate1m[1h]) > 0
labels:
severity: warning
annotations:
summary: "{{ $labels.pod_name }} encountered {{$value}} errors in the last 60 minutes."
- alert: kolab_log_horizon_error
- expr: count_over_time(kolab_log_horizon_error_rate5m[1h]) > 0
+ expr: count_over_time(kolab_log_horizon_error_rate1m[1h]) > 0
labels:
severity: warning
annotations:
summary: "{{ $labels.pod_name }} encountered {{$value}} errors in the last 60 minutes."
- alert: kolab_log_roundcube_fatalerror
- expr: count_over_time(kolab_log_roundcube_fatalerror_rate5m[1h]) > 0
+ expr: count_over_time(kolab_log_roundcube_fatalerror_rate1m[1h]) > 0
labels:
severity: warning
annotations:
summary: "{{ $labels.pod_name }} encountered {{$value}} errors in the last 60 minutes."
- alert: kolab_log_roundcube_dberror
- expr: count_over_time(kolab_log_roundcube_dberror_rate5m[1h]) > 0
+ expr: count_over_time(kolab_log_roundcube_dberror_rate1m[1h]) > 0
labels:
severity: warning
annotations:
summary: "{{ $labels.pod_name }} encountered {{$value}} errors in the last 60 minutes."
- alert: kolab_log_roundcube_exception
- expr: count_over_time(kolab_log_roundcube_exception_rate5m[1h]) > 0
+ expr: count_over_time(kolab_log_roundcube_exception_rate1m[1h]) > 0
labels:
severity: warning
annotations:
summary: "{{ $labels.pod_name }} encountered {{$value}} errors in the last 60 minutes."
- alert: kolab_log_amavis_dns_lookup_errors
expr: count_over_time(kolab_log_amavis_dns_lookup_errors_rate1m[1h]) > 0
labels:
severity: warning
annotations:
summary: "{{ $labels.pod_name }} encountered {{$value}} errors in the last 60 minutes."

File Metadata

Mime Type
text/x-diff
Expires
Sat, Apr 4, 3:40 AM (1 d, 7 h)
Storage Engine
blob
Storage Format
Raw Data
Storage Handle
18715701
Default Alt Text
(7 KB)

Event Timeline