diff options
| author | Paul Buetow <paul@buetow.org> | 2025-12-28 12:21:46 +0200 |
|---|---|---|
| committer | Paul Buetow <paul@buetow.org> | 2025-12-28 12:21:46 +0200 |
| commit | 49086b43aeebfd3fdd06cd330cca8130d32e5202 (patch) | |
| tree | 5a8e0c3e22486b70c070b54025c395d8c070bab8 | |
| parent | 598bcd7f6ccf9e884ec1a29e8188947954bc064f (diff) | |
Add comprehensive ZFS monitoring for FreeBSD servers
Implemented complete ZFS monitoring solution including ARC cache statistics,
pool health/capacity metrics, dataset usage, and I/O throughput monitoring.
Changes:
- Add ZFS recording rules (9 calculated metrics for ARC hit rates, memory usage, etc.)
- Add comprehensive Grafana dashboard with 20 panels across 5 rows:
  * Pool Overview: capacity, health, size, free space, usage trends
  * I/O Throughput: read/write operations and bytes per second
  * Dataset Statistics: table showing all datasets with usage details
  * ARC Cache Statistics: hit rates, size, memory usage
  * ARC Breakdown: data vs metadata, MRU vs MFU with pie charts
- Update Justfile to deploy ZFS recording rules
- Add textfile collector script on FreeBSD servers (f0, f1, f2) for pool/dataset metrics
Metrics collected:
- Pool: size, allocated, free, capacity %, health status
- I/O: read/write operations and throughput (via zpool iostat)
- Dataset: used, available, referenced space per filesystem
- ARC: hit rate, size, memory usage, data/metadata breakdown
Fixes:
- Pool health panel properly displays ONLINE/DEGRADED/FAULTED status
- All stat panels have correct options configuration
🤖 Generated with [Claude Code](https://claude.com/claude-code)
Co-Authored-By: Claude Sonnet 4.5 <noreply@anthropic.com>
| mode | file | lines changed |
|---|---|---|
| -rw-r--r-- | f3s/prometheus/Justfile | 2 |
| -rw-r--r-- | f3s/prometheus/zfs-dashboards.yaml | 36 |
| -rw-r--r-- | f3s/prometheus/zfs-recording-rules.yaml | 106 |

3 files changed, 144 insertions, 0 deletions
diff --git a/f3s/prometheus/Justfile b/f3s/prometheus/Justfile index 686a6a1..1038650 100644 --- a/f3s/prometheus/Justfile +++ b/f3s/prometheus/Justfile @@ -4,6 +4,7 @@ install: helm install prometheus prometheus-community/kube-prometheus-stack --namespace monitoring -f persistence-values.yaml kubectl apply -f freebsd-recording-rules.yaml kubectl apply -f openbsd-recording-rules.yaml + kubectl apply -f zfs-recording-rules.yaml just -f grafana-ingress/Justfile install upgrade: @@ -11,6 +12,7 @@ upgrade: helm upgrade prometheus prometheus-community/kube-prometheus-stack --namespace monitoring -f persistence-values.yaml kubectl apply -f freebsd-recording-rules.yaml kubectl apply -f openbsd-recording-rules.yaml + kubectl apply -f zfs-recording-rules.yaml uninstall: just -f grafana-ingress/Justfile delete diff --git a/f3s/prometheus/zfs-dashboards.yaml b/f3s/prometheus/zfs-dashboards.yaml new file mode 100644 index 0000000..208a090 --- /dev/null +++ b/f3s/prometheus/zfs-dashboards.yaml @@ -0,0 +1,36 @@ +apiVersion: v1 +kind: ConfigMap +metadata: + name: freebsd-zfs-dashboards + namespace: monitoring + labels: + grafana_dashboard: '1' + app.kubernetes.io/instance: prometheus + app.kubernetes.io/managed-by: Helm + app.kubernetes.io/part-of: kube-prometheus-stack + release: prometheus +data: + freebsd-zfs.json: "{\n \"title\": \"FreeBSD ZFS\",\n \"uid\": \"freebsd-zfs\",\n \"timezone\": \"utc\",\n \"schemaVersion\": 38,\n \"refresh\": \"30s\",\n \"tags\": [\n \"zfs\",\n \"freebsd\",\n \"storage\"\n ],\n \"editable\": true,\n \"templating\": {\n \"list\": [\n {\n \"name\": \"datasource\",\n \"type\": \"datasource\",\n \"query\": \"prometheus\",\n \"hide\": 0\n },\n {\n \"name\": \"instance\",\n \"type\": \"query\",\n \"datasource\": {\n \"type\": \"prometheus\",\n \"uid\": \"${datasource}\"\n },\n \"query\": \"label_values(node_zfs_arcstats_size_bytes{os=\\\"freebsd\\\"}, instance)\",\n \"refresh\": 2,\n \"hide\": 0,\n \"label\": \"FreeBSD Server\"\n },\n {\n \"name\": 
\"pool\",\n \"type\": \"query\",\n \"datasource\": {\n \"type\": \"prometheus\",\n \"uid\": \"${datasource}\"\n },\n \ + \ \"query\": \"label_values(zfs_pool_size_bytes{instance=\\\"$instance\\\"}, pool)\",\n \"refresh\": 2,\n \"hide\": 0,\n \"label\": \"ZFS Pool\",\n \"includeAll\": true,\n \"multi\": false\n }\n ]\n },\n \"panels\": [\n {\n \"type\": \"row\",\n \"collapsed\": false,\n \"title\": \"Pool Overview\",\n \"gridPos\": {\n \"h\": 1,\n \"w\": 24,\n \"x\": 0,\n \"y\": 0\n }\n },\n {\n \"type\": \"gauge\",\n \"title\": \"Pool Capacity\",\n \"gridPos\": {\n \"h\": 8,\n \"w\": 6,\n \"x\": 0,\n \"y\": 1\n },\n \"datasource\": {\n \"type\": \"prometheus\",\n \"uid\": \"${datasource}\"\n },\n \"targets\": [\n {\n \"expr\": \"zfs_pool_capacity_percent{instance=\\\"$instance\\\",pool=~\\\"$pool\\\"}\",\n \"legendFormat\": \"{{pool}}\"\n }\n ],\n \"fieldConfig\": {\n \ + \ \"defaults\": {\n \"unit\": \"percent\",\n \"min\": 0,\n \"max\": 100,\n \"thresholds\": {\n \"mode\": \"absolute\",\n \"steps\": [\n {\n \"color\": \"green\",\n \"value\": 0\n },\n {\n \"color\": \"yellow\",\n \"value\": 70\n },\n {\n \"color\": \"red\",\n \"value\": 85\n }\n ]\n }\n }\n }\n },\n {\n \"type\": \"stat\",\n \"title\": \"Pool Health\",\n \"gridPos\": {\n \"h\": 4,\n \"w\": 6,\n \"x\": 6,\n \"y\": 1\n },\n \"datasource\": {\n \"type\": \"prometheus\",\n \"uid\": \"${datasource}\"\n },\n \"targets\": [\n {\n \"expr\": \"zfs_pool_health{instance=\\\"$instance\\\",pool=~\\\"$pool\\\"}\",\n \"legendFormat\": \"\ + {{pool}}\"\n }\n ],\n \"options\": {\n \"reduceOptions\": {\n \"values\": false,\n \"calcs\": [\n \"lastNotNull\"\n ]\n },\n \"orientation\": \"auto\",\n \"textMode\": \"value_and_name\",\n \"colorMode\": \"background\",\n \"graphMode\": \"none\",\n \"justifyMode\": \"auto\"\n },\n \"fieldConfig\": {\n \"defaults\": {\n \"unit\": \"short\",\n \"mappings\": [\n {\n \"options\": {\n \"0\": {\n \"text\": \"ONLINE\",\n \"color\": \"green\"\n }\n },\n \"type\": \"value\"\n },\n 
{\n \"options\": {\n \"1\": {\n \"text\": \"DEGRADED\",\n \"color\": \"yellow\"\n }\n },\n \"type\": \"value\"\n },\n {\n\ + \ \"options\": {\n \"2\": {\n \"text\": \"FAULTED\",\n \"color\": \"red\"\n }\n },\n \"type\": \"value\"\n }\n ],\n \"thresholds\": {\n \"mode\": \"absolute\",\n \"steps\": [\n {\n \"color\": \"green\",\n \"value\": null\n },\n {\n \"color\": \"yellow\",\n \"value\": 1\n },\n {\n \"color\": \"red\",\n \"value\": 2\n }\n ]\n }\n },\n \"overrides\": []\n }\n },\n {\n \"type\": \"stat\",\n \"title\": \"Total Pool Size\",\n \"gridPos\": {\n \"h\": 4,\n \"w\": 6,\n \"x\": 12,\n \"y\": 1\n },\n \"datasource\": {\n \"type\": \"prometheus\",\n \"uid\": \"${datasource}\"\ + \n },\n \"targets\": [\n {\n \"expr\": \"zfs_pool_size_bytes{instance=\\\"$instance\\\",pool=~\\\"$pool\\\"}\",\n \"legendFormat\": \"{{pool}}\"\n }\n ],\n \"options\": {\n \"reduceOptions\": {\n \"values\": false,\n \"calcs\": [\n \"lastNotNull\"\n ]\n },\n \"orientation\": \"auto\",\n \"textMode\": \"value_and_name\",\n \"colorMode\": \"value\",\n \"graphMode\": \"none\",\n \"justifyMode\": \"auto\"\n },\n \"fieldConfig\": {\n \"defaults\": {\n \"unit\": \"bytes\"\n },\n \"overrides\": []\n }\n },\n {\n \"type\": \"stat\",\n \"title\": \"Free Space\",\n \"gridPos\": {\n \"h\": 4,\n \"w\": 6,\n \"x\": 18,\n \"y\": 1\n },\n \"datasource\": {\n \"type\": \"prometheus\",\n \"uid\": \"${datasource}\"\n },\n \"targets\": [\n\ + \ {\n \"expr\": \"zfs_pool_free_bytes{instance=\\\"$instance\\\",pool=~\\\"$pool\\\"}\",\n \"legendFormat\": \"{{pool}}\"\n }\n ],\n \"options\": {\n \"reduceOptions\": {\n \"values\": false,\n \"calcs\": [\n \"lastNotNull\"\n ]\n },\n \"orientation\": \"auto\",\n \"textMode\": \"value_and_name\",\n \"colorMode\": \"value\",\n \"graphMode\": \"none\",\n \"justifyMode\": \"auto\"\n },\n \"fieldConfig\": {\n \"defaults\": {\n \"unit\": \"bytes\"\n },\n \"overrides\": []\n }\n },\n {\n \"type\": \"timeseries\",\n \"title\": \"Pool Space Usage Over Time\",\n 
\"gridPos\": {\n \"h\": 8,\n \"w\": 12,\n \"x\": 6,\n \"y\": 5\n },\n \"datasource\": {\n \"type\": \"prometheus\",\n \"uid\": \"${datasource}\"\n },\n \"targets\": [\n {\n\ + \ \"expr\": \"zfs_pool_allocated_bytes{instance=\\\"$instance\\\",pool=~\\\"$pool\\\"}\",\n \"legendFormat\": \"{{pool}} - Used\"\n },\n {\n \"expr\": \"zfs_pool_free_bytes{instance=\\\"$instance\\\",pool=~\\\"$pool\\\"}\",\n \"legendFormat\": \"{{pool}} - Free\"\n }\n ],\n \"fieldConfig\": {\n \"defaults\": {\n \"unit\": \"bytes\",\n \"custom\": {\n \"fillOpacity\": 10,\n \"showPoints\": \"never\",\n \"stacking\": {\n \"mode\": \"normal\"\n }\n }\n }\n }\n },\n {\n \"type\": \"timeseries\",\n \"title\": \"Pool Capacity Trend\",\n \"gridPos\": {\n \"h\": 8,\n \"w\": 6,\n \"x\": 18,\n \"y\": 5\n },\n \"datasource\": {\n \"type\": \"prometheus\",\n \"uid\": \"${datasource}\"\n },\n \"targets\": [\n {\n \"expr\": \"zfs_pool_capacity_percent{instance=\\\ + \"$instance\\\",pool=~\\\"$pool\\\"}\",\n \"legendFormat\": \"{{pool}}\"\n }\n ],\n \"fieldConfig\": {\n \"defaults\": {\n \"unit\": \"percent\",\n \"min\": 0,\n \"max\": 100,\n \"custom\": {\n \"fillOpacity\": 10,\n \"showPoints\": \"never\"\n }\n }\n }\n },\n {\n \"type\": \"row\",\n \"collapsed\": false,\n \"title\": \"I/O Throughput\",\n \"gridPos\": {\n \"h\": 1,\n \"w\": 24,\n \"x\": 0,\n \"y\": 13\n }\n },\n {\n \"type\": \"timeseries\",\n \"title\": \"Read Operations Rate\",\n \"gridPos\": {\n \"h\": 8,\n \"w\": 12,\n \"x\": 0,\n \"y\": 14\n },\n \"datasource\": {\n \"type\": \"prometheus\",\n \"uid\": \"${datasource}\"\n },\n \"targets\": [\n {\n \"expr\": \"rate(zfs_pool_read_operations_total{instance=\\\"$instance\\\ + \",pool=~\\\"$pool\\\"}[5m])\",\n \"legendFormat\": \"{{pool}}\"\n }\n ],\n \"fieldConfig\": {\n \"defaults\": {\n \"unit\": \"ops\",\n \"custom\": {\n \"fillOpacity\": 10,\n \"showPoints\": \"never\"\n }\n }\n }\n },\n {\n \"type\": \"timeseries\",\n \"title\": \"Write Operations Rate\",\n \"gridPos\": {\n \"h\": 8,\n 
\"w\": 12,\n \"x\": 12,\n \"y\": 14\n },\n \"datasource\": {\n \"type\": \"prometheus\",\n \"uid\": \"${datasource}\"\n },\n \"targets\": [\n {\n \"expr\": \"rate(zfs_pool_write_operations_total{instance=\\\"$instance\\\",pool=~\\\"$pool\\\"}[5m])\",\n \"legendFormat\": \"{{pool}}\"\n }\n ],\n \"fieldConfig\": {\n \"defaults\": {\n \"unit\": \"ops\",\n \"custom\": {\n \"fillOpacity\": 10,\n \"showPoints\": \"\ + never\"\n }\n }\n }\n },\n {\n \"type\": \"timeseries\",\n \"title\": \"Read Throughput\",\n \"gridPos\": {\n \"h\": 8,\n \"w\": 12,\n \"x\": 0,\n \"y\": 22\n },\n \"datasource\": {\n \"type\": \"prometheus\",\n \"uid\": \"${datasource}\"\n },\n \"targets\": [\n {\n \"expr\": \"rate(zfs_pool_read_bytes_total{instance=\\\"$instance\\\",pool=~\\\"$pool\\\"}[5m])\",\n \"legendFormat\": \"{{pool}}\"\n }\n ],\n \"fieldConfig\": {\n \"defaults\": {\n \"unit\": \"Bps\",\n \"custom\": {\n \"fillOpacity\": 10,\n \"showPoints\": \"never\"\n }\n }\n }\n },\n {\n \"type\": \"timeseries\",\n \"title\": \"Write Throughput\",\n \"gridPos\": {\n \"h\": 8,\n \"w\": 12,\n \"x\": 12,\n \"y\": 22\n },\n \"datasource\": {\n \"type\"\ + : \"prometheus\",\n \"uid\": \"${datasource}\"\n },\n \"targets\": [\n {\n \"expr\": \"rate(zfs_pool_write_bytes_total{instance=\\\"$instance\\\",pool=~\\\"$pool\\\"}[5m])\",\n \"legendFormat\": \"{{pool}}\"\n }\n ],\n \"fieldConfig\": {\n \"defaults\": {\n \"unit\": \"Bps\",\n \"custom\": {\n \"fillOpacity\": 10,\n \"showPoints\": \"never\"\n }\n }\n }\n },\n {\n \"type\": \"row\",\n \"collapsed\": false,\n \"title\": \"Dataset Statistics\",\n \"gridPos\": {\n \"h\": 1,\n \"w\": 24,\n \"x\": 0,\n \"y\": 30\n }\n },\n {\n \"type\": \"table\",\n \"title\": \"Datasets by Pool\",\n \"gridPos\": {\n \"h\": 10,\n \"w\": 24,\n \"x\": 0,\n \"y\": 31\n },\n \"datasource\": {\n \"type\": \"prometheus\",\n \"uid\": \"${datasource}\"\n \ + \ },\n \"targets\": [\n {\n \"expr\": \"zfs_dataset_used_bytes{instance=\\\"$instance\\\",pool=~\\\"$pool\\\"}\",\n 
\"format\": \"table\",\n \"instant\": true,\n \"refId\": \"A\"\n },\n {\n \"expr\": \"zfs_dataset_available_bytes{instance=\\\"$instance\\\",pool=~\\\"$pool\\\"}\",\n \"format\": \"table\",\n \"instant\": true,\n \"refId\": \"B\"\n },\n {\n \"expr\": \"zfs_dataset_referenced_bytes{instance=\\\"$instance\\\",pool=~\\\"$pool\\\"}\",\n \"format\": \"table\",\n \"instant\": true,\n \"refId\": \"C\"\n }\n ],\n \"transformations\": [\n {\n \"id\": \"merge\",\n \"options\": {}\n },\n {\n \"id\": \"organize\",\n \"options\": {\n \"renameByName\": {\n \"dataset\": \"Dataset\",\n \"pool\": \"Pool\",\n \"Value #A\"\ + : \"Used\",\n \"Value #B\": \"Available\",\n \"Value #C\": \"Referenced\"\n },\n \"excludeByName\": {\n \"__name__\": true,\n \"instance\": true,\n \"job\": true,\n \"os\": true,\n \"Time\": true\n }\n }\n }\n ],\n \"fieldConfig\": {\n \"overrides\": [\n {\n \"matcher\": {\n \"id\": \"byName\",\n \"options\": \"Used\"\n },\n \"properties\": [\n {\n \"id\": \"unit\",\n \"value\": \"bytes\"\n }\n ]\n },\n {\n \"matcher\": {\n \"id\": \"byName\",\n \"options\": \"Available\"\n },\n \"properties\": [\n {\n \"id\": \"unit\",\n \"value\": \"bytes\"\n }\n ]\n \ + \ },\n {\n \"matcher\": {\n \"id\": \"byName\",\n \"options\": \"Referenced\"\n },\n \"properties\": [\n {\n \"id\": \"unit\",\n \"value\": \"bytes\"\n }\n ]\n }\n ]\n }\n },\n {\n \"type\": \"row\",\n \"collapsed\": false,\n \"title\": \"ARC Cache Statistics\",\n \"gridPos\": {\n \"h\": 1,\n \"w\": 24,\n \"x\": 0,\n \"y\": 41\n }\n },\n {\n \"type\": \"gauge\",\n \"title\": \"ARC Hit Rate\",\n \"gridPos\": {\n \"h\": 8,\n \"w\": 6,\n \"x\": 0,\n \"y\": 42\n },\n \"datasource\": {\n \"type\": \"prometheus\",\n \"uid\": \"${datasource}\"\n },\n \"targets\": [\n {\n \"expr\": \"node_zfs_arc_hit_rate_percent{instance=\\\"$instance\\\"}\",\n \"legendFormat\": \"Hit\ + \ Rate\"\n }\n ],\n \"fieldConfig\": {\n \"defaults\": {\n \"unit\": \"percent\",\n \"min\": 0,\n \"max\": 100,\n \"thresholds\": {\n \"mode\": 
\"absolute\",\n \"steps\": [\n {\n \"color\": \"red\",\n \"value\": 0\n },\n {\n \"color\": \"yellow\",\n \"value\": 70\n },\n {\n \"color\": \"green\",\n \"value\": 90\n }\n ]\n }\n }\n }\n },\n {\n \"type\": \"timeseries\",\n \"title\": \"ARC Size (Current, Target, Max)\",\n \"gridPos\": {\n \"h\": 8,\n \"w\": 12,\n \"x\": 6,\n \"y\": 42\n },\n \"datasource\": {\n \"type\": \"prometheus\",\n \"uid\": \"${datasource}\"\n },\n \"targets\": [\n {\n \"expr\": \"node_zfs_arcstats_c_max_bytes{instance=\\\ + \"$instance\\\"}\",\n \"legendFormat\": \"Max Size\"\n },\n {\n \"expr\": \"node_zfs_arcstats_c_bytes{instance=\\\"$instance\\\"}\",\n \"legendFormat\": \"Target Size\"\n },\n {\n \"expr\": \"node_zfs_arcstats_size_bytes{instance=\\\"$instance\\\"}\",\n \"legendFormat\": \"Current Size\"\n }\n ],\n \"fieldConfig\": {\n \"defaults\": {\n \"unit\": \"bytes\",\n \"custom\": {\n \"fillOpacity\": 10,\n \"showPoints\": \"never\"\n }\n }\n }\n },\n {\n \"type\": \"gauge\",\n \"title\": \"ARC Memory Usage %\",\n \"gridPos\": {\n \"h\": 8,\n \"w\": 6,\n \"x\": 18,\n \"y\": 42\n },\n \"datasource\": {\n \"type\": \"prometheus\",\n \"uid\": \"${datasource}\"\n },\n \"targets\": [\n {\n \"expr\": \"node_zfs_arc_memory_usage_percent{instance=\\\ + \"$instance\\\"}\",\n \"legendFormat\": \"Memory Usage\"\n }\n ],\n \"fieldConfig\": {\n \"defaults\": {\n \"unit\": \"percent\",\n \"min\": 0,\n \"max\": 100,\n \"thresholds\": {\n \"mode\": \"absolute\",\n \"steps\": [\n {\n \"color\": \"green\",\n \"value\": 0\n },\n {\n \"color\": \"yellow\",\n \"value\": 80\n },\n {\n \"color\": \"red\",\n \"value\": 95\n }\n ]\n }\n }\n }\n },\n {\n \"type\": \"timeseries\",\n \"title\": \"ARC Hits vs Misses (rate)\",\n \"gridPos\": {\n \"h\": 8,\n \"w\": 12,\n \"x\": 0,\n \"y\": 50\n },\n \"datasource\": {\n \"type\": \"prometheus\",\n \"uid\": \"${datasource}\"\n },\n \"targets\"\ + : [\n {\n \"expr\": \"rate(node_zfs_arcstats_hits_total{instance=\\\"$instance\\\"}[5m])\",\n \"legendFormat\": 
\"Hits/sec\"\n },\n {\n \"expr\": \"rate(node_zfs_arcstats_misses_total{instance=\\\"$instance\\\"}[5m])\",\n \"legendFormat\": \"Misses/sec\"\n }\n ],\n \"fieldConfig\": {\n \"defaults\": {\n \"unit\": \"ops\",\n \"custom\": {\n \"fillOpacity\": 10,\n \"showPoints\": \"never\"\n }\n }\n }\n },\n {\n \"type\": \"timeseries\",\n \"title\": \"ARC Hit Rates by Type\",\n \"gridPos\": {\n \"h\": 8,\n \"w\": 12,\n \"x\": 12,\n \"y\": 50\n },\n \"datasource\": {\n \"type\": \"prometheus\",\n \"uid\": \"${datasource}\"\n },\n \"targets\": [\n {\n \"expr\": \"node_zfs_arc_demand_data_hit_rate_percent{instance=\\\"$instance\\\"}\",\n \"legendFormat\"\ + : \"Data Hit Rate\"\n },\n {\n \"expr\": \"node_zfs_arc_demand_metadata_hit_rate_percent{instance=\\\"$instance\\\"}\",\n \"legendFormat\": \"Metadata Hit Rate\"\n }\n ],\n \"fieldConfig\": {\n \"defaults\": {\n \"unit\": \"percent\",\n \"custom\": {\n \"fillOpacity\": 10,\n \"showPoints\": \"never\"\n }\n }\n }\n },\n {\n \"type\": \"row\",\n \"collapsed\": false,\n \"title\": \"ARC Breakdown\",\n \"gridPos\": {\n \"h\": 1,\n \"w\": 24,\n \"x\": 0,\n \"y\": 58\n }\n },\n {\n \"type\": \"timeseries\",\n \"title\": \"ARC Data vs Metadata\",\n \"gridPos\": {\n \"h\": 8,\n \"w\": 12,\n \"x\": 0,\n \"y\": 59\n },\n \"datasource\": {\n \"type\": \"prometheus\",\n \"uid\": \"${datasource}\"\n },\n \"targets\": [\n {\n \"\ + expr\": \"node_zfs_arcstats_data_bytes{instance=\\\"$instance\\\"}\",\n \"legendFormat\": \"Data\"\n },\n {\n \"expr\": \"node_zfs_arcstats_meta_bytes{instance=\\\"$instance\\\"}\",\n \"legendFormat\": \"Metadata\"\n }\n ],\n \"fieldConfig\": {\n \"defaults\": {\n \"unit\": \"bytes\",\n \"custom\": {\n \"fillOpacity\": 20,\n \"showPoints\": \"never\",\n \"stacking\": {\n \"mode\": \"normal\"\n }\n }\n }\n }\n },\n {\n \"type\": \"timeseries\",\n \"title\": \"MRU vs MFU Cache Size\",\n \"gridPos\": {\n \"h\": 8,\n \"w\": 12,\n \"x\": 12,\n \"y\": 59\n },\n \"datasource\": {\n \"type\": \"prometheus\",\n \"uid\": 
\"${datasource}\"\n },\n \"targets\": [\n {\n \"expr\": \"node_zfs_arcstats_mru_bytes{instance=\\\"$instance\\\"}\",\n\ + \ \"legendFormat\": \"MRU (Most Recently Used)\"\n },\n {\n \"expr\": \"node_zfs_arcstats_mfu_bytes{instance=\\\"$instance\\\"}\",\n \"legendFormat\": \"MFU (Most Frequently Used)\"\n }\n ],\n \"fieldConfig\": {\n \"defaults\": {\n \"unit\": \"bytes\",\n \"custom\": {\n \"fillOpacity\": 20,\n \"showPoints\": \"never\",\n \"stacking\": {\n \"mode\": \"normal\"\n }\n }\n }\n }\n },\n {\n \"type\": \"piechart\",\n \"title\": \"ARC Composition (Data vs Metadata)\",\n \"gridPos\": {\n \"h\": 8,\n \"w\": 12,\n \"x\": 0,\n \"y\": 67\n },\n \"datasource\": {\n \"type\": \"prometheus\",\n \"uid\": \"${datasource}\"\n },\n \"targets\": [\n {\n \"expr\": \"node_zfs_arc_data_percent{instance=\\\"$instance\\\"}\",\n \"legendFormat\"\ + : \"Data\"\n },\n {\n \"expr\": \"node_zfs_arc_metadata_percent{instance=\\\"$instance\\\"}\",\n \"legendFormat\": \"Metadata\"\n }\n ],\n \"fieldConfig\": {\n \"defaults\": {\n \"unit\": \"percent\"\n }\n },\n \"options\": {\n \"legend\": {\n \"displayMode\": \"table\",\n \"placement\": \"right\",\n \"showLegend\": true,\n \"values\": [\n \"value\"\n ]\n },\n \"pieType\": \"pie\"\n }\n },\n {\n \"type\": \"piechart\",\n \"title\": \"ARC Composition (MRU vs MFU)\",\n \"gridPos\": {\n \"h\": 8,\n \"w\": 12,\n \"x\": 12,\n \"y\": 67\n },\n \"datasource\": {\n \"type\": \"prometheus\",\n \"uid\": \"${datasource}\"\n },\n \"targets\": [\n {\n \"expr\": \"node_zfs_arc_mru_percent{instance=\\\"$instance\\\"}\",\n \"legendFormat\"\ + : \"MRU (Recently Used)\"\n },\n {\n \"expr\": \"node_zfs_arc_mfu_percent{instance=\\\"$instance\\\"}\",\n \"legendFormat\": \"MFU (Frequently Used)\"\n }\n ],\n \"fieldConfig\": {\n \"defaults\": {\n \"unit\": \"percent\"\n }\n },\n \"options\": {\n \"legend\": {\n \"displayMode\": \"table\",\n \"placement\": \"right\",\n \"showLegend\": true,\n \"values\": [\n \"value\"\n ]\n },\n \"pieType\": 
\"pie\"\n }\n }\n ],\n \"time\": {\n \"from\": \"now-6h\",\n \"to\": \"now\"\n }\n}" diff --git a/f3s/prometheus/zfs-recording-rules.yaml b/f3s/prometheus/zfs-recording-rules.yaml new file mode 100644 index 0000000..c445ea7 --- /dev/null +++ b/f3s/prometheus/zfs-recording-rules.yaml @@ -0,0 +1,106 @@ +apiVersion: monitoring.coreos.com/v1 +kind: PrometheusRule +metadata: + name: freebsd-zfs-rules + namespace: monitoring + labels: + release: prometheus +spec: + groups: + # FreeBSD ZFS ARC metrics - calculated values for easier dashboard consumption + - name: freebsd-zfs-arc + interval: 30s + rules: + # ARC Hit Rate (percentage) + # Rate of hits divided by total requests (hits + misses) + - record: node_zfs_arc_hit_rate_percent + expr: | + 100 * ( + rate(node_zfs_arcstats_hits_total{os="freebsd"}[5m]) / + (rate(node_zfs_arcstats_hits_total{os="freebsd"}[5m]) + + rate(node_zfs_arcstats_misses_total{os="freebsd"}[5m])) + ) + labels: + os: freebsd + + # ARC Memory Usage Percentage (current size vs maximum) + - record: node_zfs_arc_memory_usage_percent + expr: | + 100 * ( + node_zfs_arcstats_size_bytes{os="freebsd"} / + node_zfs_arcstats_c_max_bytes{os="freebsd"} + ) + labels: + os: freebsd + + # ARC Target vs Max Ratio (how close to maximum target is) + - record: node_zfs_arc_target_percent + expr: | + 100 * ( + node_zfs_arcstats_c_bytes{os="freebsd"} / + node_zfs_arcstats_c_max_bytes{os="freebsd"} + ) + labels: + os: freebsd + + # ARC Metadata Percentage (metadata vs total ARC size) + - record: node_zfs_arc_metadata_percent + expr: | + 100 * ( + node_zfs_arcstats_meta_bytes{os="freebsd"} / + node_zfs_arcstats_size_bytes{os="freebsd"} + ) + labels: + os: freebsd + + # ARC Data Percentage (data vs total ARC size) + - record: node_zfs_arc_data_percent + expr: | + 100 * ( + node_zfs_arcstats_data_bytes{os="freebsd"} / + node_zfs_arcstats_size_bytes{os="freebsd"} + ) + labels: + os: freebsd + + # MFU Percentage (Most Frequently Used vs total ARC) + - record: 
node_zfs_arc_mfu_percent + expr: | + 100 * ( + node_zfs_arcstats_mfu_bytes{os="freebsd"} / + node_zfs_arcstats_size_bytes{os="freebsd"} + ) + labels: + os: freebsd + + # MRU Percentage (Most Recently Used vs total ARC) + - record: node_zfs_arc_mru_percent + expr: | + 100 * ( + node_zfs_arcstats_mru_bytes{os="freebsd"} / + node_zfs_arcstats_size_bytes{os="freebsd"} + ) + labels: + os: freebsd + + # Demand Data Hit Rate (percentage) + - record: node_zfs_arc_demand_data_hit_rate_percent + expr: | + 100 * ( + rate(node_zfs_arcstats_demand_data_hits_total{os="freebsd"}[5m]) / + (rate(node_zfs_arcstats_demand_data_hits_total{os="freebsd"}[5m]) + + rate(node_zfs_arcstats_demand_data_misses_total{os="freebsd"}[5m])) + ) + labels: + os: freebsd + + # Demand Metadata Hit Rate (percentage) + - record: node_zfs_arc_demand_metadata_hit_rate_percent + expr: | + 100 * ( + rate(node_zfs_arcstats_demand_metadata_hits_total{os="freebsd"}[5m]) / + (rate(node_zfs_arcstats_demand_metadata_hits_total{os="freebsd"}[5m]) + + rate(node_zfs_arcstats_demand_metadata_misses_total{os="freebsd"}[5m])) + ) + labels: + os: freebsd |
