From 0ade92aef0658db021f4f52232925c99ff00824d Mon Sep 17 00:00:00 2001
From: Goutham Veeramachaneni <gouthamve+github@gmail.com>
Date: Mon, 19 Nov 2018 23:39:59 +0530
Subject: [PATCH] Initial Mixin (#25)

Signed-off-by: Goutham Veeramachaneni <gouthamve@gmail.com>
---
 .gitignore                      |   1 +
 mixin/alerts.libsonnet          | 120 +++++++++++++++++++++++
 mixin/dashboards.libsonnet      | 164 ++++++++++++++++++++++++++++++++
 mixin/jsonnetfile.json          |  14 +++
 mixin/jsonnetfile.lock.json     |  14 +++
 mixin/mixin.libsonnet           |   3 +
 mixin/recording_rules.libsonnet |  43 +++++++++
 7 files changed, 359 insertions(+)
 create mode 100644 mixin/alerts.libsonnet
 create mode 100644 mixin/dashboards.libsonnet
 create mode 100644 mixin/jsonnetfile.json
 create mode 100644 mixin/jsonnetfile.lock.json
 create mode 100644 mixin/mixin.libsonnet
 create mode 100644 mixin/recording_rules.libsonnet

diff --git a/.gitignore b/.gitignore
index 4ff530d8..ec955370 100644
--- a/.gitignore
+++ b/.gitignore
@@ -10,3 +10,4 @@ cmd/querier/querier
 cmd/promtail/promtail
 *.output
 /images/
+mixin/vendor/
diff --git a/mixin/alerts.libsonnet b/mixin/alerts.libsonnet
new file mode 100644
index 00000000..a43ac122
--- /dev/null
+++ b/mixin/alerts.libsonnet
@@ -0,0 +1,120 @@
+{
+  prometheusAlerts+:: {
+    groups+: [
+      {
+        name: 'logish_alerts',
+        rules: [
+          {
+            alert: 'LogishRequestErrors',
+            expr: |||
+              100 * sum(rate(logish_request_duration_seconds_count{status_code=~"5.."}[1m])) by (namespace, job, route)
+                /
+              sum(rate(logish_request_duration_seconds_count[1m])) by (namespace, job, route)
+                > 10
+            |||,
+            'for': '15m',
+            labels: {
+              severity: 'critical',
+            },
+            annotations: {
+              message: |||
+                {{ $labels.job }} {{ $labels.route }} is experiencing {{ printf "%.2f" $value }}% errors.
+              |||,
+            },
+          },
+          {
+            alert: 'LogishRequestLatency',
+            expr: |||
+              namespace_job_route:logish_request_duration_seconds:99quantile > 1
+            |||,
+            'for': '15m',
+            labels: {
+              severity: 'critical',
+            },
+            annotations: {
+              message: |||
+                {{ $labels.job }} {{ $labels.route }} is experiencing {{ printf "%.2f" $value }}s 99th percentile latency.
+              |||,
+            },
+          },
+        ],
+      },
+      {
+        name: 'logish_frontend_alerts',
+        rules: [
+          {
+            alert: 'FrontendRequestErrors',
+            expr: |||
+              100 * sum(rate(cortex_gw_request_duration_seconds_count{status_code=~"5.."}[1m])) by (namespace, job, route)
+                /
+              sum(rate(cortex_gw_request_duration_seconds_count[1m])) by (namespace, job, route)
+                > 10
+            |||,
+            'for': '15m',
+            labels: {
+              severity: 'critical',
+            },
+            annotations: {
+              message: |||
+                {{ $labels.job }} {{ $labels.route }} is experiencing {{ printf "%.2f" $value }}% errors.
+              |||,
+            },
+          },
+          {
+            alert: 'FrontendRequestLatency',
+            expr: |||
+              namespace_job_route:cortex_gw_request_duration_seconds:99quantile > 1
+            |||,
+            'for': '15m',
+            labels: {
+              severity: 'critical',
+            },
+            annotations: {
+              message: |||
+                {{ $labels.job }} {{ $labels.route }} is experiencing {{ printf "%.2f" $value }}s 99th percentile latency.
+              |||,
+            },
+          },
+        ],
+      },
+      {
+        name: 'promtail_alerts',
+        rules: [
+          {
+            alert: 'PromtailRequestsErrors',
+            expr: |||
+              100 * sum(rate(promtail_request_duration_seconds_count{status_code=~"5..|failed"}[1m])) by (namespace, job, route, instance)
+                /
+              sum(rate(promtail_request_duration_seconds_count[1m])) by (namespace, job, route, instance)
+                > 10
+            |||,
+            'for': '15m',
+            labels: {
+              severity: 'critical',
+            },
+            annotations: {
+              message: |||
+                {{ $labels.job }} {{ $labels.route }} is experiencing {{ printf "%.2f" $value }}% errors.
+              |||,
+            },
+          },
+          {
+            alert: 'PromtailRequestLatency',
+            expr: |||
+              job_status_code:promtail_request_duration_seconds:99quantile > 1
+            |||,
+            'for': '15m',
+            labels: {
+              severity: 'critical',
+            },
+            annotations: {
+              message: |||
+                {{ $labels.job }} {{ $labels.route }} is experiencing {{ printf "%.2f" $value }}s 99th percentile latency.
+              |||,
+            },
+          },
+        ],
+      },
+    ],
+  },
+}
\ No newline at end of file
diff --git a/mixin/dashboards.libsonnet b/mixin/dashboards.libsonnet
new file mode 100644
index 00000000..fe20a63e
--- /dev/null
+++ b/mixin/dashboards.libsonnet
@@ -0,0 +1,164 @@
+local g = import 'grafana-builder/grafana.libsonnet';
+
+{
+  dashboards+: {
+    'logish-writes.json':
+      g.dashboard('Logish / Writes')
+      .addTemplate('cluster', 'kube_pod_container_info{image=~".*logish.*"}', 'cluster')
+      .addTemplate('namespace', 'kube_pod_container_info{image=~".*logish.*"}', 'namespace')
+      .addRow(
+        g.row('Frontend (cortex_gw)')
+        .addPanel(
+          g.panel('QPS') +
+          g.qpsPanel('cortex_gw_request_duration_seconds_count{cluster="$cluster", job="$namespace/cortex-gw", route="cortex-write"}')
+        )
+        .addPanel(
+          g.panel('Latency') +
+          g.latencyRecordingRulePanel('cortex_gw_request_duration_seconds', [g.selector.eq('job', '$namespace/cortex-gw'), g.selector.eq('route', 'cortex-write')], extra_selectors=[g.selector.eq('cluster', '$cluster')])
+        )
+      )
+      .addRow(
+        g.row('Distributor')
+        .addPanel(
+          g.panel('QPS') +
+          g.qpsPanel('logish_request_duration_seconds_count{cluster="$cluster", job="$namespace/distributor", route="api_prom_push"}')
+        )
+        .addPanel(
+          g.panel('Latency') +
+          g.latencyRecordingRulePanel('logish_request_duration_seconds', [g.selector.eq('job', '$namespace/distributor'), g.selector.eq('route', 'api_prom_push')], extra_selectors=[g.selector.eq('cluster', '$cluster')])
+        )
+      )
+      .addRow(
+        g.row('Ingester')
+        .addPanel(
+          g.panel('QPS') +
+          g.qpsPanel('logish_request_duration_seconds_count{cluster="$cluster", job="$namespace/ingester",route="/logproto.Pusher/Push"}')
+        )
+        .addPanel(
+          g.panel('Latency') +
+          g.latencyRecordingRulePanel('logish_request_duration_seconds', [g.selector.eq('job', '$namespace/ingester'), g.selector.eq('route', '/logproto.Pusher/Push')], extra_selectors=[g.selector.eq('cluster', '$cluster')])
+        )
+      ),
+
+    'logish-reads.json':
+      g.dashboard('logish / Reads')
+      .addTemplate('cluster', 'kube_pod_container_info{image=~".*logish.*"}', 'cluster')
+      .addTemplate('namespace', 'kube_pod_container_info{image=~".*logish.*"}', 'namespace')
+      .addRow(
+        g.row('Frontend (cortex_gw)')
+        .addPanel(
+          g.panel('QPS') +
+          g.qpsPanel('cortex_gw_request_duration_seconds_count{cluster="$cluster", job="$namespace/cortex-gw", route="cortex-read"}')
+        )
+        .addPanel(
+          g.panel('Latency') +
+          g.latencyRecordingRulePanel('cortex_gw_request_duration_seconds', [g.selector.eq('job', '$namespace/cortex-gw'), g.selector.eq('route', 'cortex-read')], extra_selectors=[g.selector.eq('cluster', '$cluster')])
+        )
+      )
+      .addRow(
+        g.row('Querier')
+        .addPanel(
+          g.panel('QPS') +
+          g.qpsPanel('logish_request_duration_seconds_count{cluster="$cluster", job="$namespace/querier"}')
+        )
+        .addPanel(
+          g.panel('Latency') +
+          g.latencyRecordingRulePanel('logish_request_duration_seconds', [g.selector.eq('job', '$namespace/querier')], extra_selectors=[g.selector.eq('cluster', '$cluster')])
+        )
+      )
+      .addRow(
+        g.row('Ingester')
+        .addPanel(
+          g.panel('QPS') +
+          g.qpsPanel('logish_request_duration_seconds_count{cluster="$cluster", job="$namespace/ingester",route!~"/logproto.Pusher/Push|metrics|ready|traces"}')
+        )
+        .addPanel(
+          g.panel('Latency') +
+          g.latencyRecordingRulePanel('logish_request_duration_seconds', [g.selector.eq('job', '$namespace/ingester'), g.selector.nre('route', '/logproto.Pusher/Push|metrics|ready')], extra_selectors=[g.selector.eq('cluster', '$cluster')])
+        )
+      ),
+
+
+    'logish-chunks.json':
+      g.dashboard('Logish / Chunks')
+      .addTemplate('cluster', 'kube_pod_container_info{image=~".*logish.*"}', 'cluster')
+      .addTemplate('namespace', 'kube_pod_container_info{image=~".*logish.*"}', 'namespace')
+      .addRow(
+        g.row('Active Series / Chunks')
+        .addPanel(
+          g.panel('Series') +
+          g.queryPanel('sum(logish_ingester_memory_chunks{cluster="$cluster", job="$namespace/ingester"})', 'series'),
+        )
+        .addPanel(
+          g.panel('Chunks per series') +
+          g.queryPanel('sum(logish_ingester_memory_chunks{cluster="$cluster", job="$namespace/ingester"}) / sum(logish_ingester_memory_series{job="$namespace/ingester"})', 'chunks'),
+        )
+      )
+      .addRow(
+        g.row('Flush Stats')
+        .addPanel(
+          g.panel('Utilization') +
+          g.latencyPanel('logish_ingester_chunk_utilization', '{cluster="$cluster", job="$namespace/ingester"}', multiplier='1') +
+          { yaxes: g.yaxes('percentunit') },
+        )
+        .addPanel(
+          g.panel('Age') +
+          g.latencyPanel('logish_ingester_chunk_age_seconds', '{cluster="$cluster", job="$namespace/ingester"}'),
+        ),
+      )
+      .addRow(
+        g.row('Flush Stats')
+        .addPanel(
+          g.panel('Size') +
+          g.latencyPanel('logish_ingester_chunk_length', '{cluster="$cluster", job="$namespace/ingester"}', multiplier='1') +
+          { yaxes: g.yaxes('short') },
+        )
+        .addPanel(
+          g.panel('Entries') +
+          g.queryPanel('sum(rate(logish_chunk_store_index_entries_per_chunk_sum{cluster="$cluster", job="$namespace/ingester"}[5m])) / sum(rate(logish_chunk_store_index_entries_per_chunk_count{cluster="$cluster", job="$namespace/ingester"}[5m]))', 'entries'),
+        ),
+      )
+      .addRow(
+        g.row('Flush Stats')
+        .addPanel(
+          g.panel('Queue Length') +
+          g.queryPanel('logish_ingester_flush_queue_length{cluster="$cluster", job="$namespace/ingester"}', '{{instance}}'),
+        )
+        .addPanel(
+          g.panel('Flush Rate') +
+          g.qpsPanel('logish_ingester_chunk_age_seconds_count{cluster="$cluster", job="$namespace/ingester"}'),
+        ),
+      ),
+
+    'logish-frontend.json':
+      g.dashboard('Logish / Frontend')
+      .addTemplate('cluster', 'kube_pod_container_info{image=~".*logish.*"}', 'cluster')
+      .addTemplate('namespace', 'kube_pod_container_info{image=~".*logish.*"}', 'namespace')
+      .addRow(
+        g.row('logish Reqs (cortex_gw)')
+        .addPanel(
+          g.panel('QPS') +
+          g.qpsPanel('cortex_gw_request_duration_seconds_count{cluster="$cluster", job="$namespace/cortex-gw"}')
+        )
+        .addPanel(
+          g.panel('Latency') +
+          g.latencyRecordingRulePanel('cortex_gw_request_duration_seconds', [g.selector.eq('job', '$namespace/cortex-gw')], extra_selectors=[g.selector.eq('cluster', '$cluster')])
+        )
+      ),
+    'promtail.json':
+      g.dashboard('Logish / Promtail')
+      .addTemplate('cluster', 'kube_pod_container_info{image=~".*logish.*"}', 'cluster')
+      .addTemplate('namespace', 'kube_pod_container_info{image=~".*logish.*"}', 'namespace')
+      .addRow(
+        g.row('promtail Reqs')
+        .addPanel(
+          g.panel('QPS') +
+          g.qpsPanel('promtail_request_duration_seconds_count{cluster="$cluster", job="$namespace/promtail"}')
+        )
+        .addPanel(
+          g.panel('Latency') +
+          g.latencyRecordingRulePanel('promtail_request_duration_seconds', [g.selector.eq('job', '$namespace/promtail')], extra_selectors=[g.selector.eq('cluster', '$cluster')])
+        )
+      )
+  },
+}
\ No newline at end of file
diff --git a/mixin/jsonnetfile.json b/mixin/jsonnetfile.json
new file mode 100644
index 00000000..3a5d8ad2
--- /dev/null
+++ b/mixin/jsonnetfile.json
@@ -0,0 +1,14 @@
+{
+    "dependencies": [
+        {
+            "name": "grafana-builder",
+            "source": {
+                "git": {
+                    "remote": "https://github.com/kausalco/public",
+                    "subdir": "grafana-builder"
+                }
+            },
+            "version": "master"
+        }
+    ]
+}
\ No newline at end of file
diff --git a/mixin/jsonnetfile.lock.json b/mixin/jsonnetfile.lock.json
new file mode 100644
index 00000000..933c3204
--- /dev/null
+++ b/mixin/jsonnetfile.lock.json
@@ -0,0 +1,14 @@
+{
+    "dependencies": [
+        {
+            "name": "grafana-builder",
+            "source": {
+                "git": {
+                    "remote": "https://github.com/kausalco/public",
+                    "subdir": "grafana-builder"
+                }
+            },
+            "version": "cab274f882aae97ad6add33590a3b149e6f8eeac"
+        }
+    ]
+}
\ No newline at end of file
diff --git a/mixin/mixin.libsonnet b/mixin/mixin.libsonnet
new file mode 100644
index 00000000..a684acd6
--- /dev/null
+++ b/mixin/mixin.libsonnet
@@ -0,0 +1,3 @@
+(import 'dashboards.libsonnet') +
+(import 'alerts.libsonnet') +
+(import 'recording_rules.libsonnet')
\ No newline at end of file
diff --git a/mixin/recording_rules.libsonnet b/mixin/recording_rules.libsonnet
new file mode 100644
index 00000000..18404e2f
--- /dev/null
+++ b/mixin/recording_rules.libsonnet
@@ -0,0 +1,43 @@
+local histogramRules(metric, labels) =
+  local vars = {
+    metric: metric,
+    labels_underscore: std.join('_', labels),
+    labels_comma: std.join(', ', labels),
+  };
+  [
+    {
+      record: '%(labels_underscore)s:%(metric)s:99quantile' % vars,
+      expr: 'histogram_quantile(0.99, sum(rate(%(metric)s_bucket[5m])) by (le, %(labels_comma)s))' % vars,
+    },
+    {
+      record: '%(labels_underscore)s:%(metric)s:50quantile' % vars,
+      expr: 'histogram_quantile(0.50, sum(rate(%(metric)s_bucket[5m])) by (le, %(labels_comma)s))' % vars,
+    },
+    {
+      record: '%(labels_underscore)s:%(metric)s:avg' % vars,
+      expr: 'sum(rate(%(metric)s_sum[5m])) by (%(labels_comma)s) / sum(rate(%(metric)s_count[5m])) by (%(labels_comma)s)' % vars,
+    },
+  ];
+
+{
+  prometheus_rules+:: {
+    groups+: [{
+      name: 'logish_rules',
+      rules:
+        histogramRules('logish_request_duration_seconds', ['job']) +
+        histogramRules('logish_request_duration_seconds', ['job', 'route']) +
+        histogramRules('logish_request_duration_seconds', ['namespace', 'job', 'route']),
+    }, {
+      name: 'logish_frontend_rules',
+      rules:
+        histogramRules('cortex_gw_request_duration_seconds', ['job']) +
+        histogramRules('cortex_gw_request_duration_seconds', ['job', 'status_code']),
+    }, {
+      name: 'promtail_rules',
+      rules:
+        histogramRules('promtail_request_duration_seconds', ['job']) +
+        histogramRules('promtail_request_duration_seconds', ['job', 'status_code']),
+    }],
+  },
+}
\ No newline at end of file
--
GitLab
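
Usage note (not part of the patch above): a minimal sketch of how the mixin this patch adds could be rendered, assuming jsonnet and jsonnet-bundler (jb) are available and the commands are run from the mixin/ directory. The file name example.jsonnet is a hypothetical consumer introduced here for illustration only.

  // example.jsonnet -- hypothetical consumer of the mixin, not added by this patch.
  // Assumes grafana-builder has been vendored first with `jb install`
  // (driven by mixin/jsonnetfile.json), then rendered with:
  //   jsonnet -J vendor -S example.jsonnet > alerts.yaml
  local mixin = import 'mixin.libsonnet';

  // prometheusAlerts is declared as a hidden field (`+::`) in alerts.libsonnet,
  // so it is omitted from plain manifestation but still addressable by name;
  // emit it as a YAML document that Prometheus can load as a rule file.
  std.manifestYamlDoc(mixin.prometheusAlerts)

The same pattern applies to the other outputs: mixin.prometheus_rules for the recording rules, and the dashboards field for the per-file Grafana dashboard JSON.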