diff --git a/.gitignore b/.gitignore
index 4ff530d89f31f70c693bbb145537fe10636c7db2..ec955370bddc9b3c243e5c95d4fbfebe87d08810 100644
--- a/.gitignore
+++ b/.gitignore
@@ -10,3 +10,4 @@ cmd/querier/querier
 cmd/promtail/promtail
 *.output
 /images/
+mixin/vendor/
diff --git a/mixin/alerts.libsonnet b/mixin/alerts.libsonnet
new file mode 100644
index 0000000000000000000000000000000000000000..a43ac122415b073a26a64d558a567383c55fcb98
--- /dev/null
+++ b/mixin/alerts.libsonnet
@@ -0,0 +1,120 @@
+{
+  prometheusAlerts+:: {
+    groups+: [
+      {
+        name: 'logish_alerts',
+        rules: [
+          {
+            alert: 'LogishRequestErrors',
+            expr: |||
+              100 * sum(rate(logish_request_duration_seconds_count{status_code=~"5.."}[1m])) by (namespace, job, route)
+                /
+              sum(rate(logish_request_duration_seconds_count[1m])) by (namespace, job, route)
+                > 10
+            |||,
+            'for': '15m',
+            labels: {
+              severity: 'critical',
+            },
+            annotations: {
+              message: |||
+                {{ $labels.job }} {{ $labels.route }} is experiencing {{ printf "%.2f" $value }}% errors.
+              |||,
+            },
+          },
+          {
+            alert: 'LogishRequestLatency',
+            expr: |||
+              namespace_job_route:logish_request_duration_seconds:99quantile > 1
+            |||,
+            'for': '15m',
+            labels: {
+              severity: 'critical',
+            },
+            annotations: {
+              message: |||
+                {{ $labels.job }} {{ $labels.route }} is experiencing {{ printf "%.2f" $value }}s 99th percentile latency.
+              |||,
+            },
+          },
+        ],
+      },
+      {
+        name: 'logish_frontend_alerts',
+        rules: [
+          {
+            alert: 'FrontendRequestErrors',
+            expr: |||
+              100 * sum(rate(cortex_gw_request_duration_seconds_count{status_code=~"5.."}[1m])) by (namespace, job, route)
+                /
+              sum(rate(cortex_gw_request_duration_seconds_count[1m])) by (namespace, job, route)
+                > 10
+            |||,
+            'for': '15m',
+            labels: {
+              severity: 'critical',
+            },
+            annotations: {
+              message: |||
+                {{ $labels.job }} {{ $labels.route }} is experiencing {{ printf "%.2f" $value }}% errors.
+              |||,
+            },
+          },
+          {
+            alert: 'FrontendRequestLatency',
+            expr: |||
+              namespace_job_route:cortex_gw_request_duration_seconds:99quantile > 1
+            |||,
+            'for': '15m',
+            labels: {
+              severity: 'critical',
+            },
+            annotations: {
+              message: |||
+                {{ $labels.job }} {{ $labels.route }} is experiencing {{ printf "%.2f" $value }}s 99th percentile latency.
+              |||,
+            },
+          },
+        ],
+      },
+      {
+        name: 'promtail_alerts',
+        rules: [
+          {
+            alert: 'PromtailRequestsErrors',
+            expr: |||
+              100 * sum(rate(promtail_request_duration_seconds_count{status_code=~"5..|failed"}[1m])) by (namespace, job, route, instance)
+                /
+              sum(rate(promtail_request_duration_seconds_count[1m])) by (namespace, job, route, instance)
+                > 10
+            |||,
+            'for': '15m',
+            labels: {
+              severity: 'critical',
+            },
+            annotations: {
+              message: |||
+                {{ $labels.job }} {{ $labels.route }} is experiencing {{ printf "%.2f" $value }}% errors.
+              |||,
+            },
+          },
+          {
+            alert: 'PromtailRequestLatency',
+            expr: |||
+              job_status_code:promtail_request_duration_seconds:99quantile > 1
+            |||,
+            'for': '15m',
+            labels: {
+              severity: 'critical',
+            },
+            annotations: {
+              message: |||
+                {{ $labels.job }} is experiencing {{ printf "%.2f" $value }}s 99th percentile latency.
+              |||,
+            },
+          },
+        ],
+      },
+    ],
+  },
+}
\ No newline at end of file
diff --git a/mixin/dashboards.libsonnet b/mixin/dashboards.libsonnet
new file mode 100644
index 0000000000000000000000000000000000000000..fe20a63e9fcd43a47826c5ce8294def25b148eb7
--- /dev/null
+++ b/mixin/dashboards.libsonnet
@@ -0,0 +1,164 @@
+local g = import 'grafana-builder/grafana.libsonnet';
+
+{
+  dashboards+: {
+    'logish-writes.json':
+      g.dashboard('Logish / Writes')
+      .addTemplate('cluster', 'kube_pod_container_info{image=~".*logish.*"}', 'cluster')
+      .addTemplate('namespace', 'kube_pod_container_info{image=~".*logish.*"}', 'namespace')
+      .addRow(
+        g.row('Frontend (cortex_gw)')
+        .addPanel(
+          g.panel('QPS') +
+          g.qpsPanel('cortex_gw_request_duration_seconds_count{cluster="$cluster", job="$namespace/cortex-gw", route="cortex-write"}')
+        )
+        .addPanel(
+          g.panel('Latency') +
+          g.latencyRecordingRulePanel('cortex_gw_request_duration_seconds', [g.selector.eq('job', '$namespace/cortex-gw'), g.selector.eq('route', 'cortex-write')], extra_selectors=[g.selector.eq('cluster', '$cluster')])
+        )
+      )
+      .addRow(
+        g.row('Distributor')
+        .addPanel(
+          g.panel('QPS') +
+          g.qpsPanel('logish_request_duration_seconds_count{cluster="$cluster", job="$namespace/distributor", route="api_prom_push"}')
+        )
+        .addPanel(
+          g.panel('Latency') +
+          g.latencyRecordingRulePanel('logish_request_duration_seconds', [g.selector.eq('job', '$namespace/distributor'), g.selector.eq('route', 'api_prom_push')], extra_selectors=[g.selector.eq('cluster', '$cluster')])
+        )
+      )
+      .addRow(
+        g.row('Ingester')
+        .addPanel(
+          g.panel('QPS') +
+          g.qpsPanel('logish_request_duration_seconds_count{cluster="$cluster", job="$namespace/ingester",route="/logproto.Pusher/Push"}')
+        )
+        .addPanel(
+          g.panel('Latency') +
+          g.latencyRecordingRulePanel('logish_request_duration_seconds', [g.selector.eq('job', '$namespace/ingester'), g.selector.eq('route', '/logproto.Pusher/Push')], extra_selectors=[g.selector.eq('cluster', '$cluster')])
+        )
+      ),
+
+    'logish-reads.json':
+      g.dashboard('Logish / Reads')
+      .addTemplate('cluster', 'kube_pod_container_info{image=~".*logish.*"}', 'cluster')
+      .addTemplate('namespace', 'kube_pod_container_info{image=~".*logish.*"}', 'namespace')
+      .addRow(
+        g.row('Frontend (cortex_gw)')
+        .addPanel(
+          g.panel('QPS') +
+          g.qpsPanel('cortex_gw_request_duration_seconds_count{cluster="$cluster", job="$namespace/cortex-gw", route="cortex-read"}')
+        )
+        .addPanel(
+          g.panel('Latency') +
+          g.latencyRecordingRulePanel('cortex_gw_request_duration_seconds', [g.selector.eq('job', '$namespace/cortex-gw'), g.selector.eq('route', 'cortex-read')], extra_selectors=[g.selector.eq('cluster', '$cluster')])
+        )
+      )
+      .addRow(
+        g.row('Querier')
+        .addPanel(
+          g.panel('QPS') +
+          g.qpsPanel('logish_request_duration_seconds_count{cluster="$cluster", job="$namespace/querier"}')
+        )
+        .addPanel(
+          g.panel('Latency') +
+          g.latencyRecordingRulePanel('logish_request_duration_seconds', [g.selector.eq('job', '$namespace/querier')], extra_selectors=[g.selector.eq('cluster', '$cluster')])
+        )
+      )
+      .addRow(
+        g.row('Ingester')
+        .addPanel(
+          g.panel('QPS') +
+          g.qpsPanel('logish_request_duration_seconds_count{cluster="$cluster", job="$namespace/ingester",route!~"/logproto.Pusher/Push|metrics|ready|traces"}')
+        )
+        .addPanel(
+          g.panel('Latency') +
+          g.latencyRecordingRulePanel('logish_request_duration_seconds', [g.selector.eq('job', '$namespace/ingester'), g.selector.nre('route', '/logproto.Pusher/Push|metrics|ready')], extra_selectors=[g.selector.eq('cluster', '$cluster')])
+        )
+      ),
+
+
+    'logish-chunks.json':
+      g.dashboard('Logish / Chunks')
+      .addTemplate('cluster', 'kube_pod_container_info{image=~".*logish.*"}', 'cluster')
+      .addTemplate('namespace', 'kube_pod_container_info{image=~".*logish.*"}', 'namespace')
+      .addRow(
+        g.row('Active Series / Chunks')
+        .addPanel(
+          g.panel('Series') +
+          g.queryPanel('sum(logish_ingester_memory_series{cluster="$cluster", job="$namespace/ingester"})', 'series'),
+        )
+        .addPanel(
+          g.panel('Chunks per series') +
+          g.queryPanel('sum(logish_ingester_memory_chunks{cluster="$cluster", job="$namespace/ingester"}) / sum(logish_ingester_memory_series{cluster="$cluster", job="$namespace/ingester"})', 'chunks'),
+        )
+      )
+      .addRow(
+        g.row('Flush Stats')
+        .addPanel(
+          g.panel('Utilization') +
+          g.latencyPanel('logish_ingester_chunk_utilization', '{cluster="$cluster", job="$namespace/ingester"}', multiplier='1') +
+          { yaxes: g.yaxes('percentunit') },
+        )
+        .addPanel(
+          g.panel('Age') +
+          g.latencyPanel('logish_ingester_chunk_age_seconds', '{cluster="$cluster", job="$namespace/ingester"}'),
+        ),
+      )
+      .addRow(
+        g.row('Flush Stats')
+        .addPanel(
+          g.panel('Size') +
+          g.latencyPanel('logish_ingester_chunk_length', '{cluster="$cluster", job="$namespace/ingester"}', multiplier='1') +
+          { yaxes: g.yaxes('short') },
+        )
+        .addPanel(
+          g.panel('Entries') +
+          g.queryPanel('sum(rate(logish_chunk_store_index_entries_per_chunk_sum{cluster="$cluster", job="$namespace/ingester"}[5m])) / sum(rate(logish_chunk_store_index_entries_per_chunk_count{cluster="$cluster", job="$namespace/ingester"}[5m]))', 'entries'),
+        ),
+      )
+      .addRow(
+        g.row('Flush Stats')
+        .addPanel(
+          g.panel('Queue Length') +
+          g.queryPanel('logish_ingester_flush_queue_length{cluster="$cluster", job="$namespace/ingester"}', '{{instance}}'),
+        )
+        .addPanel(
+          g.panel('Flush Rate') +
+          g.qpsPanel('logish_ingester_chunk_age_seconds_count{cluster="$cluster", job="$namespace/ingester"}'),
+        ),
+      ),
+
+    'logish-frontend.json':
+      g.dashboard('Logish / Frontend')
+      .addTemplate('cluster', 'kube_pod_container_info{image=~".*logish.*"}', 'cluster')
+      .addTemplate('namespace', 'kube_pod_container_info{image=~".*logish.*"}', 'namespace')
+      .addRow(
+        g.row('logish Reqs (cortex_gw)')
+        .addPanel(
+          g.panel('QPS') +
+          g.qpsPanel('cortex_gw_request_duration_seconds_count{cluster="$cluster", job="$namespace/cortex-gw"}')
+        )
+        .addPanel(
+          g.panel('Latency') +
+          g.latencyRecordingRulePanel('cortex_gw_request_duration_seconds', [g.selector.eq('job', '$namespace/cortex-gw')], extra_selectors=[g.selector.eq('cluster', '$cluster')])
+        )
+      ),
+    'promtail.json':
+      g.dashboard('Logish / Promtail')
+      .addTemplate('cluster', 'kube_pod_container_info{image=~".*logish.*"}', 'cluster')
+      .addTemplate('namespace', 'kube_pod_container_info{image=~".*logish.*"}', 'namespace')
+      .addRow(
+        g.row('promtail Reqs')
+        .addPanel(
+          g.panel('QPS') +
+          g.qpsPanel('promtail_request_duration_seconds_count{cluster="$cluster", job="$namespace/promtail"}')
+        )
+        .addPanel(
+          g.panel('Latency') +
+          g.latencyRecordingRulePanel('promtail_request_duration_seconds', [g.selector.eq('job', '$namespace/promtail')], extra_selectors=[g.selector.eq('cluster', '$cluster')])
+        )
+      )
+  },
+}
\ No newline at end of file
diff --git a/mixin/jsonnetfile.json b/mixin/jsonnetfile.json
new file mode 100644
index 0000000000000000000000000000000000000000..3a5d8ad2223e4d7c20c35bac740d5af2535b8d0a
--- /dev/null
+++ b/mixin/jsonnetfile.json
@@ -0,0 +1,14 @@
+{
+  "dependencies": [
+    {
+      "name": "grafana-builder",
+      "source": {
+        "git": {
+          "remote": "https://github.com/kausalco/public",
+          "subdir": "grafana-builder"
+        }
+      },
+      "version": "master"
+    }
+  ]
+}
\ No newline at end of file
diff --git a/mixin/jsonnetfile.lock.json b/mixin/jsonnetfile.lock.json
new file mode 100644
index 0000000000000000000000000000000000000000..933c3204b739c5b47a6d6d0bdce61625dfe1ffe7
--- /dev/null
+++ b/mixin/jsonnetfile.lock.json
@@ -0,0 +1,14 @@
+{
+  "dependencies": [
+    {
+      "name": "grafana-builder",
+      "source": {
+        "git": {
+          "remote": "https://github.com/kausalco/public",
+          "subdir": "grafana-builder"
+        }
+      },
+      "version": "cab274f882aae97ad6add33590a3b149e6f8eeac"
+    }
+  ]
+}
\ No newline at end of file
diff --git a/mixin/mixin.libsonnet b/mixin/mixin.libsonnet
new file mode 100644
index 0000000000000000000000000000000000000000..a684acd630f0fb8ce58562f6469d1536ad1663da
--- /dev/null
+++ b/mixin/mixin.libsonnet
@@ -0,0 +1,3 @@
+(import 'dashboards.libsonnet') +
+(import 'alerts.libsonnet') +
+(import 'recording_rules.libsonnet')
\ No newline at end of file
diff --git a/mixin/recording_rules.libsonnet b/mixin/recording_rules.libsonnet
new file mode 100644
index 0000000000000000000000000000000000000000..18404e2fe97496c4126b8d58c5967ab8ecf4c25e
--- /dev/null
+++ b/mixin/recording_rules.libsonnet
@@ -0,0 +1,43 @@
+local histogramRules(metric, labels) =
+  local vars = {
+    metric: metric,
+    labels_underscore: std.join('_', labels),
+    labels_comma: std.join(', ', labels),
+  };
+  [
+    {
+      record: '%(labels_underscore)s:%(metric)s:99quantile' % vars,
+      expr: 'histogram_quantile(0.99, sum(rate(%(metric)s_bucket[5m])) by (le, %(labels_comma)s))' % vars,
+    },
+    {
+      record: '%(labels_underscore)s:%(metric)s:50quantile' % vars,
+      expr: 'histogram_quantile(0.50, sum(rate(%(metric)s_bucket[5m])) by (le, %(labels_comma)s))' % vars,
+    },
+    {
+      record: '%(labels_underscore)s:%(metric)s:avg' % vars,
+      expr: 'sum(rate(%(metric)s_sum[5m])) by (%(labels_comma)s) / sum(rate(%(metric)s_count[5m])) by (%(labels_comma)s)' % vars,
+    },
+  ];
+
+{
+  prometheus_rules+:: {
+    groups+: [{
+      name: 'logish_rules',
+      rules:
+        histogramRules('logish_request_duration_seconds', ['job']) +
+        histogramRules('logish_request_duration_seconds', ['job', 'route']) +
+        histogramRules('logish_request_duration_seconds', ['namespace', 'job', 'route']),
+    }, {
+      name: 'logish_frontend_rules',
+      rules:
+        histogramRules('cortex_gw_request_duration_seconds', ['job']) +
+        histogramRules('cortex_gw_request_duration_seconds', ['job', 'route']) +
+        histogramRules('cortex_gw_request_duration_seconds', ['namespace', 'job', 'route']),
+    }, {
+      name: 'promtail_rules',
+      rules:
+        histogramRules('promtail_request_duration_seconds', ['job']) +
+        histogramRules('promtail_request_duration_seconds', ['job', 'status_code']),
+    }],
+  },
+}
\ No newline at end of file