Commit 0ade92ae authored by Goutham Veeramachaneni, committed by Tom Wilkie

Initial Mixin (#25)

parent 8a1f4e19
.gitignore

@@ -10,3 +10,4 @@ cmd/querier/querier
 cmd/promtail/promtail
 *.output
 /images/
+mixin/vendor/

mixin/alerts.libsonnet

{
prometheusAlerts+:: {
groups+: [
{
name: 'logish_alerts',
rules: [
{
alert: 'LogishRequestErrors',
expr: |||
100 * sum(rate(logish_request_duration_seconds_count{status_code=~"5.."}[1m])) by (namespace, job, route)
/
sum(rate(logish_request_duration_seconds_count[1m])) by (namespace, job, route)
> 10
|||,
'for': '15m',
labels: {
severity: 'critical',
},
annotations: {
message: |||
{{ $labels.job }} {{ $labels.route }} is experiencing {{ printf "%.2f" $value }}% errors.
|||,
},
},
{
alert: 'LogishRequestLatency',
expr: |||
namespace_job_route:logish_request_duration_seconds:99quantile > 1
|||,
'for': '15m',
labels: {
severity: 'critical',
},
annotations: {
message: |||
{{ $labels.job }} {{ $labels.route }} is experiencing {{ printf "%.2f" $value }}s 99th percentile latency.
|||,
},
},
],
},
{
name: 'logish_frontend_alerts',
rules: [
{
alert: 'FrontendRequestErrors',
expr: |||
100 * sum(rate(cortex_gw_request_duration_seconds_count{status_code=~"5.."}[1m])) by (namespace, job, route)
/
sum(rate(cortex_gw_request_duration_seconds_count[1m])) by (namespace, job, route)
> 10
|||,
'for': '15m',
labels: {
severity: 'critical',
},
annotations: {
message: |||
{{ $labels.job }} {{ $labels.route }} is experiencing {{ printf "%.2f" $value }}% errors.
|||,
},
},
{
alert: 'FrontendRequestLatency',
expr: |||
namespace_job_route:cortex_gw_request_duration_seconds:99quantile > 1
|||,
'for': '15m',
labels: {
severity: 'critical',
},
annotations: {
message: |||
{{ $labels.job }} {{ $labels.route }} is experiencing {{ printf "%.2f" $value }}s 99th percentile latency.
|||,
},
},
],
},
{
name: 'promtail_alerts',
rules: [
{
alert: 'PromtailRequestsErrors',
expr: |||
100 * sum(rate(promtail_request_duration_seconds_count{status_code=~"5..|failed"}[1m])) by (namespace, job, route, instance)
/
sum(rate(promtail_request_duration_seconds_count[1m])) by (namespace, job, route, instance)
> 10
|||,
'for': '15m',
labels: {
severity: 'critical',
},
annotations: {
message: |||
{{ $labels.job }} {{ $labels.route }} is experiencing {{ printf "%.2f" $value }}% errors.
|||,
},
},
{
alert: 'PromtailRequestLatency',
expr: |||
job_status_code:promtail_request_duration_seconds:99quantile > 1
|||,
'for': '15m',
labels: {
severity: 'critical',
},
annotations: {
message: |||
{{ $labels.job }} is experiencing {{ printf "%.2f" $value }}s 99th percentile latency.
|||,
},
},
],
},
],
},
}
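
As a sketch of how these alert groups can be rendered for Prometheus (a hypothetical helper, not part of this commit; it assumes the mixin.libsonnet below and the jsonnet CLI flags -S for raw string output and -J vendor for the vendored grafana-builder dependency):

// render-alerts.jsonnet (hypothetical): emit the mixin's alert groups as a
// YAML document that a Prometheus rule_files entry can load.
local mixin = import 'mixin.libsonnet';

std.manifestYamlDoc(mixin.prometheusAlerts)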

mixin/dashboards.libsonnet

local g = import 'grafana-builder/grafana.libsonnet';
{
dashboards+: {
'logish-writes.json':
g.dashboard('Logish / Writes')
.addTemplate('cluster', 'kube_pod_container_info{image=~".*logish.*"}', 'cluster')
.addTemplate('namespace', 'kube_pod_container_info{image=~".*logish.*"}', 'namespace')
.addRow(
g.row('Frontend (cortex_gw)')
.addPanel(
g.panel('QPS') +
g.qpsPanel('cortex_gw_request_duration_seconds_count{cluster="$cluster", job="$namespace/cortex-gw", route="cortex-write"}')
)
.addPanel(
g.panel('Latency') +
g.latencyRecordingRulePanel('cortex_gw_request_duration_seconds', [g.selector.eq('job', '$namespace/cortex-gw'), g.selector.eq('route', 'cortex-write')], extra_selectors=[g.selector.eq('cluster', '$cluster')])
)
)
.addRow(
g.row('Distributor')
.addPanel(
g.panel('QPS') +
g.qpsPanel('logish_request_duration_seconds_count{cluster="$cluster", job="$namespace/distributor", route="api_prom_push"}')
)
.addPanel(
g.panel('Latency') +
g.latencyRecordingRulePanel('logish_request_duration_seconds', [g.selector.eq('job', '$namespace/distributor'), g.selector.eq('route', 'api_prom_push')], extra_selectors=[g.selector.eq('cluster', '$cluster')])
)
)
.addRow(
g.row('Ingester')
.addPanel(
g.panel('QPS') +
g.qpsPanel('logish_request_duration_seconds_count{cluster="$cluster", job="$namespace/ingester",route="/logproto.Pusher/Push"}')
)
.addPanel(
g.panel('Latency') +
g.latencyRecordingRulePanel('logish_request_duration_seconds', [g.selector.eq('job', '$namespace/ingester'), g.selector.eq('route', '/logproto.Pusher/Push')], extra_selectors=[g.selector.eq('cluster', '$cluster')])
)
),
'logish-reads.json':
g.dashboard('Logish / Reads')
.addTemplate('cluster', 'kube_pod_container_info{image=~".*logish.*"}', 'cluster')
.addTemplate('namespace', 'kube_pod_container_info{image=~".*logish.*"}', 'namespace')
.addRow(
g.row('Frontend (cortex_gw)')
.addPanel(
g.panel('QPS') +
g.qpsPanel('cortex_gw_request_duration_seconds_count{cluster="$cluster", job="$namespace/cortex-gw", route="cortex-read"}')
)
.addPanel(
g.panel('Latency') +
g.latencyRecordingRulePanel('cortex_gw_request_duration_seconds', [g.selector.eq('job', '$namespace/cortex-gw'), g.selector.eq('route', 'cortex-read')], extra_selectors=[g.selector.eq('cluster', '$cluster')])
)
)
.addRow(
g.row('Querier')
.addPanel(
g.panel('QPS') +
g.qpsPanel('logish_request_duration_seconds_count{cluster="$cluster", job="$namespace/querier"}')
)
.addPanel(
g.panel('Latency') +
g.latencyRecordingRulePanel('logish_request_duration_seconds', [g.selector.eq('job', '$namespace/querier')], extra_selectors=[g.selector.eq('cluster', '$cluster')])
)
)
.addRow(
g.row('Ingester')
.addPanel(
g.panel('QPS') +
g.qpsPanel('logish_request_duration_seconds_count{cluster="$cluster", job="$namespace/ingester",route!~"/logproto.Pusher/Push|metrics|ready|traces"}')
)
.addPanel(
g.panel('Latency') +
g.latencyRecordingRulePanel('logish_request_duration_seconds', [g.selector.eq('job', '$namespace/ingester'), g.selector.nre('route', '/logproto.Pusher/Push|metrics|ready')], extra_selectors=[g.selector.eq('cluster', '$cluster')])
)
),
'logish-chunks.json':
g.dashboard('Logish / Chunks')
.addTemplate('cluster', 'kube_pod_container_info{image=~".*logish.*"}', 'cluster')
.addTemplate('namespace', 'kube_pod_container_info{image=~".*logish.*"}', 'namespace')
.addRow(
g.row('Active Series / Chunks')
.addPanel(
g.panel('Series') +
g.queryPanel('sum(logish_ingester_memory_series{cluster="$cluster", job="$namespace/ingester"})', 'series'),
)
.addPanel(
g.panel('Chunks per series') +
g.queryPanel('sum(logish_ingester_memory_chunks{cluster="$cluster", job="$namespace/ingester"}) / sum(logish_ingester_memory_series{cluster="$cluster", job="$namespace/ingester"})', 'chunks'),
)
)
.addRow(
g.row('Flush Stats')
.addPanel(
g.panel('Utilization') +
g.latencyPanel('logish_ingester_chunk_utilization', '{cluster="$cluster", job="$namespace/ingester"}', multiplier='1') +
{ yaxes: g.yaxes('percentunit') },
)
.addPanel(
g.panel('Age') +
g.latencyPanel('logish_ingester_chunk_age_seconds', '{cluster="$cluster", job="$namespace/ingester"}'),
),
)
.addRow(
g.row('Flush Stats')
.addPanel(
g.panel('Size') +
g.latencyPanel('logish_ingester_chunk_length', '{cluster="$cluster", job="$namespace/ingester"}', multiplier='1') +
{ yaxes: g.yaxes('short') },
)
.addPanel(
g.panel('Entries') +
g.queryPanel('sum(rate(logish_chunk_store_index_entries_per_chunk_sum{cluster="$cluster", job="$namespace/ingester"}[5m])) / sum(rate(logish_chunk_store_index_entries_per_chunk_count{cluster="$cluster", job="$namespace/ingester"}[5m]))', 'entries'),
),
)
.addRow(
g.row('Flush Stats')
.addPanel(
g.panel('Queue Length') +
g.queryPanel('logish_ingester_flush_queue_length{cluster="$cluster", job="$namespace/ingester"}', '{{instance}}'),
)
.addPanel(
g.panel('Flush Rate') +
g.qpsPanel('logish_ingester_chunk_age_seconds_count{cluster="$cluster", job="$namespace/ingester"}'),
),
),
'logish-frontend.json':
g.dashboard('Logish / Frontend')
.addTemplate('cluster', 'kube_pod_container_info{image=~".*logish.*"}', 'cluster')
.addTemplate('namespace', 'kube_pod_container_info{image=~".*logish.*"}', 'namespace')
.addRow(
g.row('logish Reqs (cortex_gw)')
.addPanel(
g.panel('QPS') +
g.qpsPanel('cortex_gw_request_duration_seconds_count{cluster="$cluster", job="$namespace/cortex-gw"}')
)
.addPanel(
g.panel('Latency') +
g.latencyRecordingRulePanel('cortex_gw_request_duration_seconds', [g.selector.eq('job', '$namespace/cortex-gw')], extra_selectors=[g.selector.eq('cluster', '$cluster')])
)
),
'promtail.json':
g.dashboard('Logish / Promtail')
.addTemplate('cluster', 'kube_pod_container_info{image=~".*logish.*"}', 'cluster')
.addTemplate('namespace', 'kube_pod_container_info{image=~".*logish.*"}', 'namespace')
.addRow(
g.row('promtail Reqs')
.addPanel(
g.panel('QPS') +
g.qpsPanel('promtail_request_duration_seconds_count{cluster="$cluster", job="$namespace/promtail"}')
)
.addPanel(
g.panel('Latency') +
g.latencyRecordingRulePanel('promtail_request_duration_seconds', [g.selector.eq('job', '$namespace/promtail')], extra_selectors=[g.selector.eq('cluster', '$cluster')])
)
)
},
}
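
A sketch of how this dashboards map is typically turned into per-dashboard Grafana JSON files (a hypothetical helper, not part of this commit; it assumes jsonnet's multi-file output mode, e.g. jsonnet -J vendor -m <outdir>, which writes each top-level field to a file named after its key):

// render-dashboards.jsonnet (hypothetical): with -m, this produces
// logish-writes.json, logish-reads.json, logish-chunks.json,
// logish-frontend.json and promtail.json as standalone dashboards.
local mixin = import 'mixin.libsonnet';

mixin.dashboards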

mixin/jsonnetfile.json

{
"dependencies": [
{
"name": "grafana-builder",
"source": {
"git": {
"remote": "https://github.com/kausalco/public",
"subdir": "grafana-builder"
}
},
"version": "master"
}
]
}

mixin/jsonnetfile.lock.json

{
"dependencies": [
{
"name": "grafana-builder",
"source": {
"git": {
"remote": "https://github.com/kausalco/public",
"subdir": "grafana-builder"
}
},
"version": "cab274f882aae97ad6add33590a3b149e6f8eeac"
}
]
}

mixin/mixin.libsonnet

(import 'dashboards.libsonnet') +
(import 'alerts.libsonnet') +
(import 'recording_rules.libsonnet')
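
Because every field in the three libsonnet files is declared with +: or +::, a consumer can layer additional content on top of the mixin rather than forking it. A minimal sketch (the extra alert is purely illustrative, not part of this commit):

// extend.libsonnet (hypothetical): append one extra alert group to the mixin.
(import 'mixin.libsonnet') + {
  prometheusAlerts+:: {
    groups+: [{
      name: 'my_extra_alerts',
      rules: [{
        alert: 'LogishIngesterAbsent',
        expr: 'absent(up{job=~".+/ingester"})',
        'for': '5m',
        labels: { severity: 'critical' },
        annotations: { message: 'No ingester targets are being scraped.' },
      }],
    }],
  },
}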

mixin/recording_rules.libsonnet

local histogramRules(metric, labels) =
local vars = {
metric: metric,
labels_underscore: std.join('_', labels),
labels_comma: std.join(', ', labels),
};
[
{
record: '%(labels_underscore)s:%(metric)s:99quantile' % vars,
expr: 'histogram_quantile(0.99, sum(rate(%(metric)s_bucket[5m])) by (le, %(labels_comma)s))' % vars,
},
{
record: '%(labels_underscore)s:%(metric)s:50quantile' % vars,
expr: 'histogram_quantile(0.50, sum(rate(%(metric)s_bucket[5m])) by (le, %(labels_comma)s))' % vars,
},
{
record: '%(labels_underscore)s:%(metric)s:avg' % vars,
expr: 'sum(rate(%(metric)s_sum[5m])) by (%(labels_comma)s) / sum(rate(%(metric)s_count[5m])) by (%(labels_comma)s)' % vars,
},
];
{
prometheus_rules+:: {
groups+: [{
name: 'logish_rules',
rules:
histogramRules('logish_request_duration_seconds', ['job']) +
histogramRules('logish_request_duration_seconds', ['job', 'route']) +
histogramRules('logish_request_duration_seconds', ['namespace', 'job', 'route']),
}, {
name: 'logish_frontend_rules',
rules:
histogramRules('cortex_gw_request_duration_seconds', ['job']) +
histogramRules('cortex_gw_request_duration_seconds', ['job', 'route']) +
histogramRules('cortex_gw_request_duration_seconds', ['namespace', 'job', 'route']),
}, {
name: 'promtail_rules',
rules:
histogramRules('promtail_request_duration_seconds', ['job']) +
histogramRules('promtail_request_duration_seconds', ['job', 'status_code']),
}],
},
}
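
For reference, histogramRules('logish_request_duration_seconds', ['namespace', 'job', 'route']) above expands to the following three rules; the first one produces the series behind the LogishRequestLatency alert:

[
  {
    record: 'namespace_job_route:logish_request_duration_seconds:99quantile',
    expr: 'histogram_quantile(0.99, sum(rate(logish_request_duration_seconds_bucket[5m])) by (le, namespace, job, route))',
  },
  {
    record: 'namespace_job_route:logish_request_duration_seconds:50quantile',
    expr: 'histogram_quantile(0.50, sum(rate(logish_request_duration_seconds_bucket[5m])) by (le, namespace, job, route))',
  },
  {
    record: 'namespace_job_route:logish_request_duration_seconds:avg',
    expr: 'sum(rate(logish_request_duration_seconds_sum[5m])) by (namespace, job, route) / sum(rate(logish_request_duration_seconds_count[5m])) by (namespace, job, route)',
  },
]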