commit 63d09ca36469a68bd2e537b510fbd6a3d5041c5b Author: tseed Date: Wed Nov 30 12:05:03 2022 +0000 initial commit diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..0ce5019 --- /dev/null +++ b/.gitignore @@ -0,0 +1,2 @@ +venv/ +backup_ansible/ diff --git a/README.md b/README.md new file mode 100644 index 0000000..ed0e1ba --- /dev/null +++ b/README.md @@ -0,0 +1,49 @@ +# What is this? + +Side project to build a CI testbench to work on various Ansible roles to fit with employer's default HPC deployment. + +# Drivers + +* There is not a true representation of the stack suitable for CI +* Existing testbench is slow to re-provision and very manual +* There isn't enough hardware to test multiple stacks +* Corp hypervisors are unsuitable (possibly Python IPMI listener -> Proxmox/VMWare API would be ok but many ancillary virtual networks would be required that may change on a stack to stack basis) +* Updating Ansible in vi on customer systems is tedious + +# Goal + +The aim is to simulate baremetal node provision using XCAT -> iPXE/IPMI -> virtualBMC -> QEMU VMs, and continue to develop the Ansible roles that configure the various classes of node. + +# Components + +Use commodity hardware to act as hypervisors and model the storage and network components +* tested on 2 and 3 nodes, single nvme, single NIC + +Generate static or dynamic Ansible inventory natively via the XCAT API. +* working model +* requires networks to be pulled from XCAT + +Use a dynamic role model triggered by XCAT group membership. +* existing working model +* all Ansible variables imported under top level object ready for keypairDB integration +* various helper roles to deep merge dictionaries and lists for individual site/deployment customisations + + +Use VXLAN point to point between each hypervisor to simulate the various cluster networks. 
+* working model that will scale to many hypervisors + +Use hyperconverged Ceph to provide RBD for VM disk images, CephFS+Ganesha for NFS mounts hosting scheduler/HPC software +* latest Ceph is now nearly all yaml spec driven allowing automation, most existing Ansible is behind +* cluster build automation complete +* OSD + Pools complete +* RBD complete +* NFS outstanding + +Deploy XCAT container, seed with inventory of to-be provisioned VMs +* to complete + +Deploy virtualBMC +* working model + +Deploy QEMU with RBD disk +* to complete diff --git a/cluster/README.md b/cluster/README.md new file mode 100644 index 0000000..b774332 --- /dev/null +++ b/cluster/README.md @@ -0,0 +1,89 @@ + +# setup hypervisor hosts + +- AlmaLinux 8, minimal install +- LVM, root uses 30G, no home volume, all remaining disk provisioned by ceph +- 3 nodes - 192.168.140.41-43/24 +- user: ansible, has password-less sudo and ssh keys setup + +## network + +```sh +nmcli con add type ethernet ifname ens1 con-name ctlplane connection.autoconnect yes ip4 192.168.140.41/24 gw4 192.168.140.1 ipv4.dns 1.1.1.1,8.8.8.8 ipv4.dns-search local +nmcli con del ens1 && reboot +``` + +## ansible user + +```sh +groupadd -r -g 1001 ansible && useradd -r -u 1001 -g 1001 -m -s /bin/bash ansible ;\ +echo "%ansible ALL=(ALL) NOPASSWD: ALL" > /etc/sudoers.d/ansible ;\ +chmod 0440 /etc/sudoers.d/ansible ;\ +passwd ansible ;\ +hostnamectl set-hostname qemu01.local ;\ +hostnamectl set-hostname --transient qemu01.local ;\ +hostnamectl set-hostname --pretty qemu01 ;\ +hostnamectl + +ssh-copy-id -i ~/.ssh/id_rsa.pub ansible@192.168.140.41 +``` + +# setup python venv + +Setup a venv the easy way. 
+ +```sh +sudo apt-get update +sudo apt-get install python3-dev libffi-dev gcc libssl-dev +sudo apt install python3-venv +mkdir -p /home/tseed/ansible/venv +python3 -m venv /home/tseed/ansible/venv +source /home/tseed/ansible/venv/bin/activate +``` + +# setup ansible environment + +## install additional ansible galaxy collection +ansible-galaxy collection install community.general + +## record collections file for replicating this environment +nano -cw requirements.yml + +```sh +collections: +- name: community.general +``` + +## install requirements from file on new environment + +```sh +ansible-galaxy collection install -r requirements.yml +ansible-galaxy collection install community.general --upgrade + +dnf install sshpass / apt-get install sshpass +pip install jmespath +``` + +# run playbook + +## start venv + +```sh +source /home/tseed/ansible/venv/bin/activate +``` + +## run hypervisor build playbook + +This only builds hypervisors up to Ceph RBD, VM provisioning not complete + +```sh +ansible-playbook bootstrap_hypervisors.yml +``` + +## run dynamic roles from XCAT inventory for the various provisioned VMs + +Used in production stack to provision various node classes, there are no real roles in this repo - just framework stuff and ntp/os_packages + +```sh +ansible-playbook -l all site.yml +``` diff --git a/cluster/action_plugins/__pycache__/merge_vars.cpython-38.pyc b/cluster/action_plugins/__pycache__/merge_vars.cpython-38.pyc new file mode 100644 index 0000000..6650ce1 Binary files /dev/null and b/cluster/action_plugins/__pycache__/merge_vars.cpython-38.pyc differ diff --git a/cluster/action_plugins/merge_vars.py b/cluster/action_plugins/merge_vars.py new file mode 100644 index 0000000..a215e7b --- /dev/null +++ b/cluster/action_plugins/merge_vars.py @@ -0,0 +1 @@ +from ansible_merge_vars import ActionModule diff --git a/cluster/ansible.cfg b/cluster/ansible.cfg new file mode 100644 index 0000000..e1d25fb --- /dev/null +++ b/cluster/ansible.cfg @@ -0,0 
+1,11 @@ +[defaults] +inventory = ./hosts +remote_user = ansible +ask_pass = false +host_key_checking = False + +[privilege_escalation] +become = true +become_method = sudo +become_user = root +become_ask_pass = false \ No newline at end of file diff --git a/cluster/bootstrap_hypervisors.yml b/cluster/bootstrap_hypervisors.yml new file mode 100644 index 0000000..efe1fcb --- /dev/null +++ b/cluster/bootstrap_hypervisors.yml @@ -0,0 +1,252 @@ +--- +- name: populate inventory + hosts: localhost + user: ansible + # become: yes + gather_facts: false + + tasks: + + ######## wipe inventory to ensure this playbook only uses it own dynamically generated variables + + - name: refresh inventory + meta: refresh_inventory + + ######## load core group_vars + # + # load the following core environment files under vars['testbench'] + # - inventory/group_vars/cluster.yml + # - inventory/group_vars/networks.yml + + - name: load core environment configuration + block: + + - name: set runtime facts + ansible.builtin.set_fact: + _env_files: + - 'cluster.yml' + - 'hypervisor.yml' + - 'networks.yml' + _env_dir: "{{ ansible_inventory_sources[0] | dirname }}/group_vars" + config_namespace: "testbench" + + - name: include vars from core config files + ansible.builtin.include_vars: + file: "{{ env_path }}" + name: "env_import_{{ env_namespace }}" + loop: "{{ _env_files }}" + loop_control: + loop_var: entry + vars: + env_path: "{{ _env_dir }}/{{ entry }}" + env_namespace: "{{ entry.split('.yml')[0] }}" + + - name: append env vars to temp dict + ansible.builtin.set_fact: + _env_dict: "{{ _env_dict | default({}) | combine (env_import, recursive=True) }}" + loop: "{{ lookup('ansible.builtin.varnames', 'env_import_').split(',') }}" + loop_control: + loop_var: entry + vars: + env_import: "{{ vars[entry] }}" + + - name: copy dict of env vars under top level namespace, access @ vars[config_namespace] + ansible.builtin.set_fact: + { "{{ config_namespace }}": "{{ _env_dict }}" } + + # think i only need 
to include hypervisor.yml here - it looks nicer to only include a small set of vars then ref directly at top level not config_namespace + + ######## populate arp cache, find dhcp ip of hypervisor and add to inventory + + # uncomment if arp cache stale, this is slow so comment during dev + # - name: populate arp cache + # command: nmap -sn {{ range }} + # vars: + # dhcp_network: "{{ vars[config_namespace]['hypervisor']['nmcli_con_names']['primary'] }}" + # network: "{{ vars[config_namespace]['hypervisor']['cluster_networks'][dhcp_network]['network'] }}" + # netmask: "{{ vars[config_namespace]['hypervisor']['cluster_networks'][dhcp_network]['netmask'] }}" + # range: "{{ network }}/{{ (network + '/' + netmask) | ansible.utils.ipaddr('prefix') }}" + + # WSL2 specific method to get host arp cache + - name: get arp table + ansible.builtin.command: '/mnt/c/Windows/system32/arp.exe -a' + register: _arp_cache + + # windows arp.exe parse, write new mac_map with dhcp_ip + - name: find dhcp ip + ansible.builtin.set_fact: + _update_mac_map: "{{ _update_mac_map | default([]) + [new_record] }}" + loop: "{{ _arp_cache['stdout_lines'] }}" + loop_control: + loop_var: entry + vars: + check_record: "{{ entry | trim | regex_search('^[0-9]+') is not none }}" + format_record: "{{ entry | trim | regex_replace('\\s+', ',') | split(',') }}" + dhcp_ip: "{{ format_record[0] }}" + arp_mac: "{{ format_record[1] | regex_replace('-', ':') }}" + mac_map: "{{ vars[config_namespace]['hypervisor']['mac_map'] }}" + match_host: "{{ mac_map | selectattr('mac', '==', arp_mac) | map(attribute='host') }}" + match_ip: "{{ mac_map | selectattr('mac', '==', arp_mac) | map(attribute='ip') }}" + ipv6_link_local: "{{ 'fe80::0000:0000:0000:0000' | ansible.utils.slaac(arp_mac) }}" + nmcli_con: "{{ mac_map | selectattr('mac', '==', arp_mac) | map(attribute='nmcli_con') }}" + new_record: "{{ { 'host': match_host[0], 'mac': arp_mac, 'dhcp_ip': dhcp_ip, 'ip': match_ip[0], 'ipv6': ipv6_link_local, 'nmcli_con': 
nmcli_con[0] } }}" + when: + - check_record + - match_host | length >0 + + - name: fail with insufficient hosts matched, check mac_map + fail: + msg: "fewer than 2 hypervisors from mac_map were found in the arp cache, check mac_map" + # a when list is ANDed; default([]) also covers the fact never being set because no arp entry matched + when: _update_mac_map | default([]) | length < 2 + + # sort to ensure first host in mac_map gets the first vxlan ip, initially the arp cache dictates the order in which hosts are discovered + - name: sort mac_map + set_fact: + _sort_mac_map: "{{ _sort_mac_map | default([]) + mac_map_entry }}" + loop: "{{ vars[config_namespace]['hypervisor']['mac_map'] }}" + loop_control: + loop_var: entry + vars: + host: "{{ entry['host'] }}" + mac_map_entry: "{{ _update_mac_map | selectattr('host', '==', host) }}" + + - name: write global mac map + set_fact: + # mac_map: "{{ _update_mac_map }}" + mac_map: "{{ _sort_mac_map }}" + delegate_to: localhost + delegate_facts: true + + ######## update the in-memory inventory with the hypervisors + + - name: add hosts to in-memory inventory + ansible.builtin.add_host: > + name={{ host }} + groups={{ host_groups }} + ansible_ssh_host={{ ansible_ssh_host }} + ansible_ssh_common_args='-o "UserKnownHostsFile=/dev/null" -o "StrictHostKeyChecking=no"' + ansible_user={{ ansible_user }} + ansible_password={{ ansible_password }} + loop: "{{ hostvars['localhost']['mac_map'] }}" + loop_control: + loop_var: entry + vars: + host: "{{ entry['host'] }}" + # set host group membership, auto-create groups + host_groups: + - all + - hypervisor + - ceph + ansible_ssh_host: "{{ entry['dhcp_ip'] }}" + ansible_user: "{{ vars[config_namespace]['hypervisor']['ssh_user'] }}" + ansible_password: "{{ vars[config_namespace]['hypervisor']['ssh_password'] }}" + + ######## bootstrap hypervisors + +- name: run roles on hypervisors + hosts: hypervisor + gather_facts: yes + tasks: + + ######## load core group_vars + # + # load the following core environment files under vars['testbench'] + # - inventory/group_vars/cluster.yml + # - inventory/group_vars/networks.yml + + - name: load core environment 
configuration + block: + + # roles: + # hypervisor_network - setup interfaces + # hypervisor_vxlan - setup overlay networks - we also want to add ceph_public and ceph_cluster - we should do an overlay here + # hypervisor_ceph - great reference https://github.com/jcmdln/cephadm-playbook + # hypervisor_qemu - not written + # hypervisor_qemu_gui - not written, great qt5 web container for virt-manager that accepts qemu api endpoints over ssh as ENV vars + # + # need a role to replace nested dict items - needs to accept a dict as path maybe + + - name: set runtime facts + ansible.builtin.set_fact: + _run_roles: + # - hypervisor_network + # - ntp + # - os_packages + # - hypervisor_prep + # - hypervisor_vxlan + # - cephadm_prep + # - cephadm_bootstrap + - cephadm_services + _env_dir: "{{ ansible_inventory_sources[0] | dirname }}/group_vars" + _env_files: + - 'cluster.yml' + - 'hypervisor.yml' + - 'networks.yml' + config_namespace: "testbench" + + - name: include vars from core config files + ansible.builtin.include_vars: + file: "{{ env_path }}" + name: "env_import_{{ env_namespace }}" + loop: "{{ _env_files }}" + loop_control: + loop_var: entry + vars: + env_path: "{{ _env_dir }}/{{ entry }}" + env_namespace: "{{ entry.split('.yml')[0] }}" + + - name: append env vars to temp dict + ansible.builtin.set_fact: + _env_dict: "{{ _env_dict | default({}) | combine (env_import, recursive=True) }}" + loop: "{{ lookup('ansible.builtin.varnames', 'env_import_').split(',') }}" + loop_control: + loop_var: entry + vars: + env_import: "{{ vars[entry] }}" + + - name: copy dict of env vars under top level namespace, access @ vars[config_namespace] + ansible.builtin.set_fact: + { "{{ config_namespace }}": "{{ _env_dict }}" } + + ######## set some global variables used by roles for (vm) cluster node provisioning, if these roles are to be reused in the bootstrap of the hypervisors some static values will be required + + # this needs to loop over hypervisor.cluster_networks but exclude 
primary/external for vxlan creation// + + # - debug: + # msg: + # - "{{ groups }}" + # - "{{ ['all'] + hostvars[inventory_hostname]['group_names'] }}" + + # - fail: + # msg: + + - name: populate the active_role_groups variable, add ceph_cluster network for vxlan creation + ansible.builtin.set_fact: + # active_role_groups: ['all', 'hypervisor', 'ceph'] # this should be a copy of hostvars['groups'] with additional all group + active_role_groups: "{{ ['all'] + hostvars[inventory_hostname]['group_names'] }}" + _cluster_networks: "{{ vars[config_namespace] | combine( {'cluster_networks' :{'cephclus': { 'comment': comment, 'gateway': 'null', 'mtu': 'null', 'nameserver': 'null', 'netmask': netmask, 'network': network } } }, recursive=True) }}" + vars: + network: "{{ vars['hypervisor']['cluster_networks']['cephclus']['network'] }}" + netmask: "{{ vars['hypervisor']['cluster_networks']['cephclus']['netmask'] }}" + comment: "{{ vars['hypervisor']['cluster_networks']['cephclus']['comment'] }}" + + - ansible.builtin.set_fact: + { "{{ config_namespace }}": "{{ _cluster_networks }}" } + + ######## run roles against hypervisor hosts + + # - debug: + # msg: + # - "{{ hostvars[inventory_hostname] }}" + + # - fail: + # msg: + + + - ansible.builtin.include_role: + name: "{{ entry }}" + loop: "{{ _run_roles }}" + loop_control: + loop_var: entry + label: run {{ entry }} role on {{ inventory_hostname }} \ No newline at end of file diff --git a/cluster/example_xcat_inventory.yml b/cluster/example_xcat_inventory.yml new file mode 100644 index 0000000..c38f648 --- /dev/null +++ b/cluster/example_xcat_inventory.yml @@ -0,0 +1,207 @@ +all: + hosts: + compute001: + ansible_ssh_host: 172.22.10.1 + xcat_nics: + - device: ib0 + ip: 172.23.10.1 + network: infiniband + type: Infiniband + - device: ens18 + ip: 172.22.10.1 + network: cluster + type: Ethernet + ipmi_nic: + - device: ipmi + ip: 172.21.10.1 + network: ipmi + type: bmc + compute002: + ansible_ssh_host: 172.22.10.2 + xcat_nics: + - 
device: ens18 + ip: 172.22.10.2 + network: cluster + type: Ethernet + - device: ib0 + ip: 172.23.10.2 + network: infiniband + type: Infiniband + ipmi_nic: + - device: ipmi + ip: 172.21.10.2 + network: ipmi + type: bmc + gateway01: + ansible_ssh_host: 172.22.1.254 + xcat_nics: + - device: ens18 + ip: 172.22.1.254 + network: cluster + type: Ethernet + ipmi_nic: [] + hmem001: + ansible_ssh_host: 172.22.2.1 + xcat_nics: + - device: ens18 + ip: 172.22.2.1 + network: cluster + type: Ethernet + - device: ib0 + ip: 172.23.2.1 + network: infiniband + type: Infiniband + ipmi_nic: + - device: ipmi + ip: 172.21.2.1 + network: ipmi + type: bmc + mail01: + ansible_ssh_host: 172.22.1.230 + xcat_nics: + - device: ens18 + ip: 172.22.1.230 + network: cluster + type: Ethernet + ipmi_nic: + - device: ens19 + ip: 172.21.1.230 + network: ipmi + type: Ethernet + monitoring01: + ansible_ssh_host: 172.22.1.224 + xcat_nics: + - device: ens18 + ip: 172.22.1.224 + network: cluster + type: Ethernet + ipmi_nic: [] + nfs01: + ansible_ssh_host: 172.22.1.225 + xcat_nics: + - device: ens18 + ip: 172.22.1.225 + network: cluster + type: Ethernet + ipmi_nic: [] + repos01: + ansible_ssh_host: 172.22.1.223 + xcat_nics: + - device: ens18 + ip: 172.22.1.223 + network: cluster + type: Ethernet + ipmi_nic: [] + sl1: + ansible_ssh_host: 172.22.1.1 + xcat_nics: + - device: ib0 + ip: 172.23.1.1 + network: infiniband + type: Infiniband + ipmi_nic: + - device: ipmi + ip: 172.21.1.1 + network: ipmi + type: bmc + wlm01: + ansible_ssh_host: 172.22.1.221 + xcat_nics: + - device: ens18 + ip: 172.22.1.221 + network: cluster + type: Ethernet + ipmi_nic: [] + xcat01: + ansible_ssh_host: 172.22.1.220 + xcat_nics: + - device: ens18 + ip: 172.22.1.220 + network: cluster + type: Ethernet + ipmi_nic: + - device: ens19 + ip: 172.21.1.220 + network: ipmi + type: Ethernet +compute: + hosts: + compute001: + compute002: +slurm: + hosts: + compute001: + compute002: + hmem001: +ansible: + hosts: + compute001: + compute002: + 
hmem001: + mail01: + monitoring01: + nfs01: + wlm01: +gateway: + hosts: + gateway01: +vm: + hosts: + gateway01: + mail01: + monitoring01: + nfs01: + repos01: + sl1: + wlm01: + xcat01: +external: + hosts: + gateway01: + repos01: +hmem: + hosts: + hmem001: +smtp: + hosts: + mail01: +monitoring: + hosts: + monitoring01: +prometheus: + hosts: + monitoring01: +steel: + hosts: + monitoring01: + nfs01: +inet: + hosts: + monitoring01: +nfs: + hosts: + nfs01: +nfsserver: + hosts: + nfs01: +repos: + hosts: + repos01: +httpd: + hosts: + repos01: +login: + hosts: + sl1: +wlm: + hosts: + wlm01: +mgmt: + hosts: + xcat01: +xcat: + hosts: + xcat01: +ntp: + hosts: + xcat01: diff --git a/cluster/group_vars/cluster.yml b/cluster/group_vars/cluster.yml new file mode 100644 index 0000000..801e004 --- /dev/null +++ b/cluster/group_vars/cluster.yml @@ -0,0 +1,2 @@ +env: + cluster_domain: cluster.local \ No newline at end of file diff --git a/cluster/group_vars/firewalld.yml b/cluster/group_vars/firewalld.yml new file mode 100755 index 0000000..73a2f2c --- /dev/null +++ b/cluster/group_vars/firewalld.yml @@ -0,0 +1,46 @@ +firewalld: + enable: false + firewalld_services: + - name: ssh + short: "SSHooph again" + description: "SSH service" + port: + - port: 22 + protocol: tcp + zone: public + xcat_groups: + - compute + - all + - slurm + - ansible + - test + - test + xcat_networks: + - cluster + - infiniband + - test + - test1 + - name: named + short: "named" + description: "DNS Service" + port: + - port: 53 + protocol: tcp + - port: 953 + protocol: tcp + firewalld_ipsets: + fail2ban-ssh-ipv6: + short: fail2ban-ssh-ipv6 + description: fail2ban-ssh-ipv6 ipset + type: 'hash:ip' + options: + family: + - inet6 + maxelem: + - 65536 + timeout: + - 300 + hashsize: + - 1024 + targets: + - 2a01::1 \ No newline at end of file diff --git a/cluster/group_vars/hypervisor.yml b/cluster/group_vars/hypervisor.yml new file mode 100644 index 0000000..000ca8c --- /dev/null +++ 
b/cluster/group_vars/hypervisor.yml @@ -0,0 +1,167 @@ +hypervisor: + ssh_user: 'root' + ssh_password: 'Password0' + # connection: 'external' + # map of mac addresses to match to the primary/control-plane interface for bootstrap, this should be ordered with master host first + mac_map: + - host: 'qemu01' + mac: 'b8:97:5a:cf:d7:d3' + ip: '192.168.140.41' + nmcli_con: 'primary' + - host: 'qemu02' + mac: 'b8:97:5a:cf:da:c6' + ip: '192.168.140.42' + nmcli_con: 'primary' + - host: 'qemu03' + mac: 'b8:97:5a:cf:d8:bf' + ip: '192.168.140.43' + nmcli_con: 'primary' + # ceph disk + ceph_disk: /dev/nvme0n1 + # ceph dasboard admin user password + ceph_dash_admin_password: "Password0" + # nmcli connection interface names, device eth0 or nmcli interface names + nmcli_con_names: + primary: 'external' + ceph_public: 'storage' + ceph_cluster: 'cephclus' + ceph_rgw: 'storage' + # hypervisor specific networks to add to the cluster_networks dict imported from group_vars/networks.yml + cluster_networks: + external: + network: 192.168.140.0 + netmask: 255.255.255.0 + gateway: 192.168.140.1 + mtu: + nameserver: 1.1.1.1 + comment: ext + # cephpub: + # network: 172.26.0.0 + # netmask: 255.255.255.0 + # gateway: 172.26.0.1 + # mtu: + # nameserver: 1.1.1.1 + # comment: ext + cephclus: + network: 172.25.0.0 + netmask: 255.255.255.0 + gateway: + mtu: + nameserver: + comment: int + ceph_service_placement: + - host: 'qemu01' + labels: + - _admin + - mon + - osd + - mgr + - mds + - nfs + - rgw + - host: 'qemu02' + labels: + - _admin + - mon + - osd + - mgr + - mds + - host: 'qemu03' + labels: + - _admin + - mon + - osd + - mgr + - mds + # an nfs service uses an cephfs namespace or an rgw bucket, do not include an nfs service spec in this list + ceph_service_spec: + - service_type: alertmanager + service_name: alertmanager + placement: + count: 1 + - service_type: crash + service_name: crash + placement: + host_pattern: '*' + - service_type: grafana + service_name: grafana + placement: + count: 1 + 
- service_type: node-exporter + service_name: node-exporter + placement: + host_pattern: '*' + - service_type: prometheus + service_name: prometheus + placement: + count: 1 + - service_type: mon + service_name: mon + placement: + label: "mon" + - service_type: mgr + service_name: mgr + placement: + label: "mgr" + # multiple osd spec files on a per host basis can be included with adjusted placement configuration + - service_type: osd + service_id: osd_using_device_file + placement: + label: "osd" + spec: + data_devices: + paths: + - /dev/ceph/ceph_data + # db_devices: + # paths: + # - /dev/sdc + # wal_devices: + # paths: + # - /dev/sdd + - service_type: mds + service_id: cephfs + placement: + label: "mds" + # this rgw configuration provisions rgw instance with no realm and a zonegroup and zone named default and a data pool named .rgw.root + # there are 4 auto provisioned pools .rgw.root (pg32) / default.rgw.log (pg32) / default.rgw.control (pg32) / default.rgw.meta (pg8) + # a multisite configuration (specify realm/zonegroup/zone and specifc data pool) requires additional commands and multiple spec files + - service_type: rgw + service_id: object + placement: + label: "rgw" + count: 1 + spec: + ssl: false + rgw_frontend_port: 8080 + rgw_frontend_type: beast + - service_type: nfs + service_id: ganesha + placement: + label: "nfs" + spec: + port: 2049 + # add 'pg: ' entry if you dont want default allocation, pg autoscaling is enabled + ceph_pools: + - type: rbd + name: vms + # pg: 64 + - type: cephfs + name: cephfs.cluster_volume.data + cephfs_type: data + volume: cephfs_cluster_volume + - type: cephfs + name: cephfs.cluster_volume.meta + cephfs_type: meta + volume: cephfs_cluster_volume + - type: cephfs + name: cephfs.cluster_volume1.data + pg: 16 + cephfs_type: data + volume: cephfs_cluster_volume1 + - type: cephfs + name: cephfs.cluster_volume1.meta + pg: 16 + cephfs_type: meta + volume: cephfs_cluster_volume1 + + diff --git a/cluster/group_vars/networks.yml 
b/cluster/group_vars/networks.yml new file mode 100755 index 0000000..f697128 --- /dev/null +++ b/cluster/group_vars/networks.yml @@ -0,0 +1,37 @@ +# Generated using xcat2ansible_vars - DO NOT EDIT +cluster_networks: + campus: + network: 192.168.13.0 + netmask: 255.255.255.0 + gateway: 192.168.13.254 + mtu: + nameserver: + comment: ext + cluster: + network: 172.22.0.0 + netmask: 255.255.0.0 + gateway: 172.22.1.254 + mtu: 1500 + nameserver: 172.22.1.220 + comment: int + infiniband: + network: 172.23.0.0 + netmask: 255.255.0.0 + gateway: + mtu: + nameserver: + comment: int + ipmi: + network: 172.21.0.0 + netmask: 255.255.0.0 + gateway: + mtu: + nameserver: + comment: int + storage: + network: 172.24.0.0 + netmask: 255.255.0.0 + gateway: + mtu: + nameserver: + comment: int diff --git a/cluster/group_vars/ntp.yml b/cluster/group_vars/ntp.yml new file mode 100644 index 0000000..8500202 --- /dev/null +++ b/cluster/group_vars/ntp.yml @@ -0,0 +1,14 @@ +ntp: + external_hosts: + - 0.uk.pool.ntp.org + - time.cloudflare.com + - gbg1.ntp.se + - ntp1.hetzner.de + timezone: Europe/London1 +a: + b: "stuff" + c: "stuff" + d: + - "stuff" + - "stuff" + - stuff: "stuff1" diff --git a/cluster/group_vars/roles.yml b/cluster/group_vars/roles.yml new file mode 100755 index 0000000..5bbe5da --- /dev/null +++ b/cluster/group_vars/roles.yml @@ -0,0 +1,21 @@ +roles: + all: + - network + - repos + - yum + - os_packages + - ssh + - ntp + - users + - systemd + - rsyslog + - audittrail + - sysctl + - postfix + hypervisor: + - hypervisor_prep + - vxlan + - libvirt + - podman + ntpd: + - ntp diff --git a/cluster/host_vars.disabled/qemu01.yml b/cluster/host_vars.disabled/qemu01.yml new file mode 100644 index 0000000..7a30b75 --- /dev/null +++ b/cluster/host_vars.disabled/qemu01.yml @@ -0,0 +1,15 @@ +xcat_ip: 192.168.140.40 +xcat_groups: + - hypervisor + - all +xcat_nics: + - device: ib0 + ip: 172.23.10.1 + network: infiniband + type: Infiniband + carrier: ib0 + - device: ens18 + ip: 172.22.10.1 + 
network: cluster + type: Ethernet + carrier: ens18 \ No newline at end of file diff --git a/cluster/host_vars.disabled/qemu02.yml b/cluster/host_vars.disabled/qemu02.yml new file mode 100644 index 0000000..af13281 --- /dev/null +++ b/cluster/host_vars.disabled/qemu02.yml @@ -0,0 +1,15 @@ +xcat_ip: 192.168.140.41 +xcat_groups: + - hypervisor + - all +xcat_nics: + - device: ib0 + ip: 172.23.10.1 + network: infiniband + type: Infiniband + carrier: ib0 + - device: ens18 + ip: 172.22.10.1 + network: cluster + type: Ethernet + carrier: ens18 \ No newline at end of file diff --git a/cluster/host_vars.disabled/qemu03.yml b/cluster/host_vars.disabled/qemu03.yml new file mode 100644 index 0000000..7b288dc --- /dev/null +++ b/cluster/host_vars.disabled/qemu03.yml @@ -0,0 +1,15 @@ +xcat_ip: 192.168.140.42 +xcat_groups: + - hypervisor + - all +xcat_nics: + - device: ib0 + ip: 172.23.10.1 + network: infiniband + type: Infiniband + carrier: ib0 + - device: ens18 + ip: 172.22.10.1 + network: cluster + type: Ethernet + carrier: ens18 \ No newline at end of file diff --git a/cluster/hosts b/cluster/hosts new file mode 100644 index 0000000..e69de29 diff --git a/cluster/hosts.bak b/cluster/hosts.bak new file mode 100644 index 0000000..8c7c323 --- /dev/null +++ b/cluster/hosts.bak @@ -0,0 +1,17 @@ +[all] +qemu01 ansible_ssh_host=192.168.140.41 +qemu02 ansible_ssh_host=192.168.140.42 +qemu03 ansible_ssh_host=192.168.140.43 + +[hypervisor] +qemu01 +qemu02 +qemu03 + +[test2] +qemu01 +qemu02 +qemu03 + +[ntpd] +qemu01 \ No newline at end of file diff --git a/cluster/inventories/production/hosts b/cluster/inventories/production/hosts new file mode 100644 index 0000000..e69de29 diff --git a/cluster/inventories/staging/hosts b/cluster/inventories/staging/hosts new file mode 100644 index 0000000..e69de29 diff --git a/cluster/requirements.yml b/cluster/requirements.yml new file mode 100644 index 0000000..3ea5c60 --- /dev/null +++ b/cluster/requirements.yml @@ -0,0 +1,2 @@ +collections: +- name: 
community.general \ No newline at end of file diff --git a/cluster/rhel_phyton_venv.md b/cluster/rhel_phyton_venv.md new file mode 100644 index 0000000..220e564 --- /dev/null +++ b/cluster/rhel_phyton_venv.md @@ -0,0 +1,70 @@ +# install dependencies, get source, compile, install + +```sh +sudo dnf install gcc libffi-devel openssl-libs openssl-devel libuuid-devel +mkdir ~/python +cd ~/python +wget https://www.python.org/ftp/python/3.10.6/Python-3.10.6.tgz +tar -xvzf Python-3.10.6.tgz +mkdir ~/python/3.10.6 # running directory +cd ~/python/Python-3.10.6 # compile directory +./configure --prefix /opt/ocf_tseed/python/3.10.6 +make -j$(nproc) +#make clean # if you install more dependencies +make -n install +make install +ll /opt/ocf_tseed/python/3.10.6 +``` + +# create virtual environment with local python and activate + +```sh +# create venv +/opt/ocf_tseed/python/3.10.6/bin/python3 -m venv --prompt 3.10.6 ~/.venv +source ~/.venv/bin/activate +python --version # check not system version +pip install --upgrade pip +``` + +# update bashrc + +```sh + +vi ~/.bashrc + +# User specific aliases and functions +source $HOME/.venv/bin/activate +``` + +# manually start/stop venv + +## start +```sh +source $HOME/.venv/bin/activate +``` + +## exit venv +```sh +deactivate +``` + +# install pip packages + +The following pip packages are required for the playbook, netaddr is essential for ip filters and nmcli module, ansible-merge-vars is required for complex/deep variable overlay. 
+ +```sh +pip install netaddr ansible-merge-vars jmespath pip-autoremove +``` + +```sh +pip freeze > pip_requirements.txt + +vi requirements.txt + +ansible==6.2.0 +ansible-core==2.13.3 +netaddr==0.8.0 +ansible-merge-vars==5.0.0 + +python -m pip install -r pip_requirements.txt +``` \ No newline at end of file diff --git a/cluster/roles/autofs/.travis.yml b/cluster/roles/autofs/.travis.yml new file mode 100644 index 0000000..36bbf62 --- /dev/null +++ b/cluster/roles/autofs/.travis.yml @@ -0,0 +1,29 @@ +--- +language: python +python: "2.7" + +# Use the new container infrastructure +sudo: false + +# Install ansible +addons: + apt: + packages: + - python-pip + +install: + # Install ansible + - pip install ansible + + # Check ansible version + - ansible --version + + # Create ansible.cfg with correct roles_path + - printf '[defaults]\nroles_path=../' >ansible.cfg + +script: + # Basic role syntax check + - ansible-playbook tests/test.yml -i tests/inventory --syntax-check + +notifications: + webhooks: https://galaxy.ansible.com/api/v1/notifications/ \ No newline at end of file diff --git a/cluster/roles/autofs/README.md b/cluster/roles/autofs/README.md new file mode 100644 index 0000000..225dd44 --- /dev/null +++ b/cluster/roles/autofs/README.md @@ -0,0 +1,38 @@ +Role Name +========= + +A brief description of the role goes here. + +Requirements +------------ + +Any pre-requisites that may not be covered by Ansible itself or the role should be mentioned here. For instance, if the role uses the EC2 module, it may be a good idea to mention in this section that the boto package is required. + +Role Variables +-------------- + +A description of the settable variables for this role should go here, including any variables that are in defaults/main.yml, vars/main.yml, and any variables that can/should be set via parameters to the role. Any variables that are read from other roles and/or the global scope (ie. hostvars, group vars, etc.) should be mentioned here as well. 
+ +Dependencies +------------ + +A list of other roles hosted on Galaxy should go here, plus any details in regards to parameters that may need to be set for other roles, or variables that are used from other roles. + +Example Playbook +---------------- + +Including an example of how to use your role (for instance, with variables passed in as parameters) is always nice for users too: + + - hosts: servers + roles: + - { role: username.rolename, x: 42 } + +License +------- + +BSD + +Author Information +------------------ + +An optional section for the role authors to include contact information, or a website (HTML is not allowed). diff --git a/cluster/roles/autofs/defaults/main.yml b/cluster/roles/autofs/defaults/main.yml new file mode 100644 index 0000000..6e3d283 --- /dev/null +++ b/cluster/roles/autofs/defaults/main.yml @@ -0,0 +1,72 @@ +autofs: + ## + enabled: true + # is this really required? mapfile name is now using config_namespace leaving only topdir (used in template) which seems like it could be static? 
+ map_config: + topdir: "/-" + mapfile: steel + timeout: 300 + ## + nfs: + enabled: true + default_version: 3 + lustre: + enabled: false + gpfs: + enabled: false + beegfs: + enabled: false + exports: + - type: nfs + export: "/nfs/home" + # ansible inventory group or node + exporter: + - nfs01 + network: cluster + # ansible inventory group or node + consumer: + - login + - stateless + - compute + mount: /home + opts: "rw,async,no_root_squash" + - type: nfs + export: "/nfs/software" + exporter: + - nfs01 + network: cluster + consumer: + - stateless + - login + - wlm + - compute + mount: /opt/software + opts: "rw,async,no_root_squash" + - type: nfs + export: /nfs/slurm + exporter: + - wlm01 + network: cluster + consumer: + - slurm + - compute + mount: /opt/software + opts: "rw,async,no_root_squash" + # if you only intend to be a consumer you could omit the exporter field but would require export with proper protocol:// + # - type: nfs + # export: /nfs/slurm111 + # network: cluster + # consumer: + # - compute002 + # mount: /opt/software + # opts: "rw,async,no_root_squash" + # example lustre - type and consumer fields will be mandatory, other fields will be filesystem type or role specific + - type: lustre + export: "/lustre/data" + network: cluster + consumer: + - group1 + - group2 + - node1 + - node2 + mount: /home \ No newline at end of file diff --git a/cluster/roles/autofs/handlers/main.yml b/cluster/roles/autofs/handlers/main.yml new file mode 100644 index 0000000..d077413 --- /dev/null +++ b/cluster/roles/autofs/handlers/main.yml @@ -0,0 +1,7 @@ +--- +- name: Restart autofs + ansible.builtin.systemd: + name: autofs + state: restarted + enabled: true + listen: "Restart autofs" \ No newline at end of file diff --git a/cluster/roles/autofs/meta/main.yml b/cluster/roles/autofs/meta/main.yml new file mode 100644 index 0000000..c572acc --- /dev/null +++ b/cluster/roles/autofs/meta/main.yml @@ -0,0 +1,52 @@ +galaxy_info: + author: your name + description: your role 
description + company: your company (optional) + + # If the issue tracker for your role is not on github, uncomment the + # next line and provide a value + # issue_tracker_url: http://example.com/issue/tracker + + # Choose a valid license ID from https://spdx.org - some suggested licenses: + # - BSD-3-Clause (default) + # - MIT + # - GPL-2.0-or-later + # - GPL-3.0-only + # - Apache-2.0 + # - CC-BY-4.0 + license: license (GPL-2.0-or-later, MIT, etc) + + min_ansible_version: 2.1 + + # If this a Container Enabled role, provide the minimum Ansible Container version. + # min_ansible_container_version: + + # + # Provide a list of supported platforms, and for each platform a list of versions. + # If you don't wish to enumerate all versions for a particular platform, use 'all'. + # To view available platforms and versions (or releases), visit: + # https://galaxy.ansible.com/api/v1/platforms/ + # + # platforms: + # - name: Fedora + # versions: + # - all + # - 25 + # - name: SomePlatform + # versions: + # - all + # - 1.0 + # - 7 + # - 99.99 + + galaxy_tags: [] + # List tags for your role here, one per line. A tag is a keyword that describes + # and categorizes the role. Users find roles by searching for tags. Be sure to + # remove the '[]' above, if you add tags to this list. + # + # NOTE: A tag is limited to a single word comprised of alphanumeric characters. + # Maximum 20 tags per role. + +dependencies: [] + # List your role dependencies here, one per line. Be sure to remove the '[]' above, + # if you add dependencies to this list. diff --git a/cluster/roles/autofs/tasks/main.yml b/cluster/roles/autofs/tasks/main.yml new file mode 100644 index 0000000..ff749ba --- /dev/null +++ b/cluster/roles/autofs/tasks/main.yml @@ -0,0 +1,179 @@ +# Copyright 2022 OCF Ltd. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# -*- coding: utf-8 -*- +# vim: ft=yaml +--- + +######## parse exports list, filter on target host and tag as exporter/consumer, run role with filtered export list by type +# +# NOTE +# - mandatory fields in each list item in the exports dict are 'consumer' and 'type', all other fields are role specific +# - consumers list items can be ansible inventory groups or individual hosts +# - this logic will set a host that is both the exporter and the consumer to have the exporter tag +# it is not expected that a host will mount its own export +# however, if this is required - add another dict to the exports list with only a consumer list and explicitly no exporter list + +# multiple/group exporter entries logic will heavily rely on dynamic named client mount points - this will mean other roles that use mount points will have to do a lookup +# make this task only take exporter ansible_hostname[0] and no groups, to simplify rendering the autofs config +# +# - name: build exports list for target host, add tag for exporter or consumer action +# set_fact: +# _target_exports: "{{ _target_exports | default([]) + ([export_definition]) }}" +# loop: "{{ autofs['exports'] }}" +# loop_control: +# loop_var: entry +# vars: +# consumer_group_match: "{{ entry['consumer'] | intersect(active_role_groups) }}" +# consumer_host_match: "{{ entry['consumer'] | intersect(ansible_hostname) }}" +# exporter_group_match: "{{ entry['exporter'] | default ([]) | intersect(active_role_groups) }}" +# exporter_host_match: "{{ entry['exporter'] | default ([]) | intersect(ansible_hostname) }}" +# 
toggle_exporter_group: "{{ exporter_group_match | length >0 }}" +# toggle_exporter_host: "{{ exporter_host_match | length >0 }}" +# toggle_exporter: "{{ ((toggle_exporter_group + toggle_exporter_host) | int >0) | ternary('exporter', 'consumer') }}" +# export_definition: "{{ entry | default({}) | combine({ 'action': toggle_exporter }, recursive=True) }}" +# when: +# - consumer_group_match | length>0 or +# consumer_host_match | length>0 or +# exporter_group_match | length>0 or +# exporter_host_match | length >0 + +- name: build exports list for target host, add tag for exporter or consumer action + set_fact: + _target_exports: "{{ _target_exports | default([]) + ([export_definition]) }}" + loop: "{{ autofs['exports'] }}" + loop_control: + loop_var: entry + vars: + consumer_group_match: "{{ entry['consumer'] | intersect(active_role_groups) }}" + consumer_host_match: "{{ entry['consumer'] | intersect(ansible_hostname) }}" + exporter_host_match: "{{ entry['exporter'] | default ([]) | intersect(ansible_hostname) }}" + toggle_exporter_host: "{{ exporter_host_match | length >0 }}" + toggle_exporter: "{{ (toggle_exporter_host | int >0) | ternary('exporter', 'consumer') }}" + export_definition: "{{ entry | default({}) | combine({ 'action': toggle_exporter }, recursive=True) }}" + when: + - consumer_group_match | length>0 or + consumer_host_match | length>0 or + exporter_host_match | length >0 + +# - debug: +# msg: "{{ _target_exports }}" + +######## run role with filtered export list by type + +- name: Run NFS role + include_role: + name: nfs + vars: + exports: "{{ _target_exports | selectattr('type', '==', 'nfs' ) }}" + toggle_run: "{{ exports | length >0 }}" + when: + - autofs['nfs']['enabled'] | bool + - toggle_run + +- name: Run Lustre role + include_role: + name: lustre + vars: + exports: "{{ _target_exports | selectattr('type', '==', 'lustre' ) }}" + toggle_run: "{{ exports | length >0 }}" + when: + - autofs['lustre']['enabled'] | bool + - toggle_run + +- name: Run 
Spectrum Scale role + include_role: + name: gpfs + vars: + exports: "{{ _target_exports | selectattr('type', '==', 'gpfs' ) }}" + toggle_run: "{{ exports | length >0 }}" + when: + - autofs['gpfs']['enabled'] | bool + - toggle_run + +- name: Run BeeGFS role + include_role: + name: beegfs + vars: + exports: "{{ _target_exports | selectattr('type', '==', 'beegfs' ) }}" + toggle_run: "{{ exports | length >0 }}" + when: + - autofs['beegfs']['enabled'] | bool + - toggle_run + +######## configure autofs + +- name: Install and map autofs paths + block: + - name: Install autofs package + package: + name: autofs + state: latest + + - name: Configure autofs master + template: + dest: /etc/auto.master + src: templates/auto.master.j2 + mode: 0644 + trim_blocks: False + notify: Restart autofs + + - name: Configure autofs process + template: + dest: /etc/autofs.conf + src: templates/autofs.conf.j2 + mode: 0644 + trim_blocks: False + notify: Restart autofs + + # bring logic inboard from the jinja template using the exporter/consumer logic + # this is tailored for nfs (see vers param), likely there will be multiple replica tasks to account for different mount types, these should all add to the _map_list + - name: Build autofs mapping + ansible.builtin.set_fact: + _map_list: "{{ _map_list | default([]) + [autofs_entry] }}" + loop: "{{ _target_exports }}" + loop_control: + loop_var: entry + vars: + mount: "{{ entry['mount'] }}" + fstype: "{{ entry['type'] }}" + vers: "{{ autofs[fstype]['default_version'] }}" + exporter: "{{ entry['exporter'] | first }}" + export: "{{ entry['export'] }}" + autofs_entry: "{{ mount }} -fstype={{ fstype }},vers={{ vers }} {{ exporter }}:{{ export }}" + when: + - _target_exports | selectattr('action', '==', 'consumer' ) + + - name: Configure autofs mapping + template: + dest: /etc/autofs-{{ config_namespace }}.map + src: autofs.map.j2 + mode: 0644 + trim_blocks: False + notify: Restart autofs + + - name: AutoFS configured + ansible.builtin.set_fact: + 
autofs_configured: true + + when: + - autofs['enabled'] | bool + +# Refresh facts and services facts after these will have been configured by +# the autofs role + +- name: Refresh service facts + ansible.builtin.service_facts: + +- name: Refresh facts + ansible.builtin.setup: \ No newline at end of file diff --git a/cluster/roles/autofs/templates/auto.master.j2 b/cluster/roles/autofs/templates/auto.master.j2 new file mode 100644 index 0000000..81d7ba6 --- /dev/null +++ b/cluster/roles/autofs/templates/auto.master.j2 @@ -0,0 +1,9 @@ +# +# {{ ansible_managed }} +# +# Include /etc/auto.master.d/*.autofs +# The included files must conform to the format of this file. +# ++dir:/etc/auto.master.d ++auto.master +{{ autofs.map_config.topdir }} /etc/autofs-{{ config_namespace }}.map --timeout={{ autofs.timeout }} \ No newline at end of file diff --git a/cluster/roles/autofs/templates/autofs.conf.j2 b/cluster/roles/autofs/templates/autofs.conf.j2 new file mode 100644 index 0000000..1e1e521 --- /dev/null +++ b/cluster/roles/autofs/templates/autofs.conf.j2 @@ -0,0 +1,9 @@ +# +# {{ ansible_managed }} +# +[ autofs ] +timeout = {{ autofs.timeout }} +browse_mode = no +mount_nfs_default_protocol = {{ autofs.nfs.default_version }} +[ amd ] +dismount_interval = 300 diff --git a/cluster/roles/autofs/templates/autofs.map.j2 b/cluster/roles/autofs/templates/autofs.map.j2 new file mode 100644 index 0000000..deffbc3 --- /dev/null +++ b/cluster/roles/autofs/templates/autofs.map.j2 @@ -0,0 +1,7 @@ +# +# {{ ansible_managed }} +# +{%- for entry in _map_list %} +{{ entry }} +{%- endfor %} + diff --git a/cluster/roles/autofs/tests/inventory b/cluster/roles/autofs/tests/inventory new file mode 100644 index 0000000..878877b --- /dev/null +++ b/cluster/roles/autofs/tests/inventory @@ -0,0 +1,2 @@ +localhost + diff --git a/cluster/roles/autofs/tests/test.yml b/cluster/roles/autofs/tests/test.yml new file mode 100644 index 0000000..87930df --- /dev/null +++ b/cluster/roles/autofs/tests/test.yml @@ 
-0,0 +1,5 @@ +--- +- hosts: localhost + remote_user: root + roles: + - prometheus diff --git a/cluster/roles/autofs/vars/main.yml b/cluster/roles/autofs/vars/main.yml new file mode 100644 index 0000000..f01bf99 --- /dev/null +++ b/cluster/roles/autofs/vars/main.yml @@ -0,0 +1,2 @@ +--- +# vars file for template_role diff --git a/cluster/roles/cephadm_bootstrap/.travis.yml b/cluster/roles/cephadm_bootstrap/.travis.yml new file mode 100644 index 0000000..36bbf62 --- /dev/null +++ b/cluster/roles/cephadm_bootstrap/.travis.yml @@ -0,0 +1,29 @@ +--- +language: python +python: "2.7" + +# Use the new container infrastructure +sudo: false + +# Install ansible +addons: + apt: + packages: + - python-pip + +install: + # Install ansible + - pip install ansible + + # Check ansible version + - ansible --version + + # Create ansible.cfg with correct roles_path + - printf '[defaults]\nroles_path=../' >ansible.cfg + +script: + # Basic role syntax check + - ansible-playbook tests/test.yml -i tests/inventory --syntax-check + +notifications: + webhooks: https://galaxy.ansible.com/api/v1/notifications/ \ No newline at end of file diff --git a/cluster/roles/cephadm_bootstrap/README.md b/cluster/roles/cephadm_bootstrap/README.md new file mode 100644 index 0000000..225dd44 --- /dev/null +++ b/cluster/roles/cephadm_bootstrap/README.md @@ -0,0 +1,38 @@ +Role Name +========= + +A brief description of the role goes here. + +Requirements +------------ + +Any pre-requisites that may not be covered by Ansible itself or the role should be mentioned here. For instance, if the role uses the EC2 module, it may be a good idea to mention in this section that the boto package is required. + +Role Variables +-------------- + +A description of the settable variables for this role should go here, including any variables that are in defaults/main.yml, vars/main.yml, and any variables that can/should be set via parameters to the role. Any variables that are read from other roles and/or the global scope (ie. 
hostvars, group vars, etc.) should be mentioned here as well. + +Dependencies +------------ + +A list of other roles hosted on Galaxy should go here, plus any details in regards to parameters that may need to be set for other roles, or variables that are used from other roles. + +Example Playbook +---------------- + +Including an example of how to use your role (for instance, with variables passed in as parameters) is always nice for users too: + + - hosts: servers + roles: + - { role: username.rolename, x: 42 } + +License +------- + +BSD + +Author Information +------------------ + +An optional section for the role authors to include contact information, or a website (HTML is not allowed). diff --git a/cluster/roles/cephadm_bootstrap/defaults/main.yml b/cluster/roles/cephadm_bootstrap/defaults/main.yml new file mode 100644 index 0000000..2bec87e --- /dev/null +++ b/cluster/roles/cephadm_bootstrap/defaults/main.yml @@ -0,0 +1,2 @@ +--- +# defaults file for roles/role-template \ No newline at end of file diff --git a/cluster/roles/cephadm_bootstrap/handlers/main.yml b/cluster/roles/cephadm_bootstrap/handlers/main.yml new file mode 100644 index 0000000..2d28ec4 --- /dev/null +++ b/cluster/roles/cephadm_bootstrap/handlers/main.yml @@ -0,0 +1,2 @@ +--- +# handlers file for roles/role-template \ No newline at end of file diff --git a/cluster/roles/cephadm_bootstrap/meta/main.yml b/cluster/roles/cephadm_bootstrap/meta/main.yml new file mode 100644 index 0000000..227ad9c --- /dev/null +++ b/cluster/roles/cephadm_bootstrap/meta/main.yml @@ -0,0 +1,53 @@ +galaxy_info: + author: your name + description: your role description + company: your company (optional) + + # If the issue tracker for your role is not on github, uncomment the + # next line and provide a value + # issue_tracker_url: http://example.com/issue/tracker + + # Choose a valid license ID from https://spdx.org - some suggested licenses: + # - BSD-3-Clause (default) + # - MIT + # - GPL-2.0-or-later + # - 
GPL-3.0-only + # - Apache-2.0 + # - CC-BY-4.0 + license: license (GPL-2.0-or-later, MIT, etc) + + min_ansible_version: 2.9 + + # If this a Container Enabled role, provide the minimum Ansible Container version. + # min_ansible_container_version: + + # + # Provide a list of supported platforms, and for each platform a list of versions. + # If you don't wish to enumerate all versions for a particular platform, use 'all'. + # To view available platforms and versions (or releases), visit: + # https://galaxy.ansible.com/api/v1/platforms/ + # + # platforms: + # - name: Fedora + # versions: + # - all + # - 25 + # - name: SomePlatform + # versions: + # - all + # - 1.0 + # - 7 + # - 99.99 + + galaxy_tags: [] + # List tags for your role here, one per line. A tag is a keyword that describes + # and categorizes the role. Users find roles by searching for tags. Be sure to + # remove the '[]' above, if you add tags to this list. + # + # NOTE: A tag is limited to a single word comprised of alphanumeric characters. + # Maximum 20 tags per role. + +dependencies: [] + # List your role dependencies here, one per line. Be sure to remove the '[]' above, + # if you add dependencies to this list. 
+ \ No newline at end of file diff --git a/cluster/roles/cephadm_bootstrap/tasks/main.yml b/cluster/roles/cephadm_bootstrap/tasks/main.yml new file mode 100644 index 0000000..4a64a4d --- /dev/null +++ b/cluster/roles/cephadm_bootstrap/tasks/main.yml @@ -0,0 +1,172 @@ +--- +- name: bootstrap first ceph node + block: + + - name: create /etc/ceph directory + file: + path: /etc/ceph + state: directory + + - name: check if /etc/ceph/ceph.conf exists + stat: + path: /etc/ceph/ceph.conf + register: cephadm_check_ceph_conf + + - name: bootstrap ceph + ansible.builtin.command: + cmd: "cephadm bootstrap --mon-ip {{ mon_ip }} --cluster-network {{ cluster_network_range }}" + vars: + ipv: "ipv4" + mon_interface: "{{ vars['hypervisor']['nmcli_con_names']['ceph_public'] }}" + mon_ip: "{{ hostvars[inventory_hostname]['ansible_' + mon_interface][ipv].address }}" + cluster_network_name: "{{ vars['hypervisor']['nmcli_con_names']['ceph_cluster'] }}" + # network: "{{ vars['hypervisor']['cluster_networks'][cluster_network_name]['network'] }}" + # netmask: "{{ vars['hypervisor']['cluster_networks'][cluster_network_name]['netmask'] }}" + network: "{{ vars[config_namespace]['cluster_networks'][cluster_network_name]['network'] }}" + netmask: "{{ vars[config_namespace]['cluster_networks'][cluster_network_name]['netmask'] }}" + cluster_network_range: "{{ network }}/{{ (network + '/' + netmask) | ansible.utils.ipaddr('prefix') }}" + register: cephadm_bootstrap + when: not cephadm_check_ceph_conf.stat.exists + + - name: store SSH pubkey as a variable + ansible.builtin.command: + cmd: cat /etc/ceph/ceph.pub + changed_when: + - ceph_rsa_pub.rc is defined + - ceph_rsa_pub.rc > 0 + register: ceph_rsa_pub + + - name: authorize the SSH keypair on all hosts + authorized_key: + key: "{{ ceph_rsa_pub.stdout_lines[0] }}" + user: root + state: present + loop: "{{ groups['ceph'] }}" + loop_control: + loop_var: entry + delegate_to: "{{ entry }}" + + - debug: + msg: "ceph orch host set-addr {{ 
inventory_hostname }} {{ mon_ip }}" + vars: + ipv: "ipv4" + mon_interface: "{{ vars['hypervisor']['nmcli_con_names']['ceph_public'] }}" + mon_ip: "{{ hostvars[inventory_hostname]['ansible_' + mon_interface][ipv].address }}" + + - name: set host addr in orchestrator + ansible.builtin.command: + cmd: "ceph orch host set-addr {{ inventory_hostname }} {{ mon_ip }}" + changed_when: false + vars: + ipv: "ipv4" + mon_interface: "{{ vars['hypervisor']['nmcli_con_names']['ceph_public'] }}" + mon_ip: "{{ hostvars[inventory_hostname]['ansible_' + mon_interface][ipv].address }}" + + - name: add other ceph hosts + ansible.builtin.command: + cmd: "ceph orch host add {{ host }} {{ mon_ip }}" + changed_when: false + loop: "{{ groups['ceph'] | difference(inventory_hostname) }}" + loop_control: + loop_var: entry + vars: + host: "{{ entry }}" + ipv: "ipv4" + mon_interface: "{{ vars['hypervisor']['nmcli_con_names']['ceph_public'] }}" + mon_ip: "{{ hostvars[host]['ansible_' + mon_interface][ipv].address }}" + + vars: + target_host: "{{ groups['ceph'] | first }}" + when: + - target_host == inventory_hostname + - groups['ceph'] | length >0 + +# https://github.com/jcmdln/cephadm-playbook + + # "stderr_lines": [ + # "Verifying podman|docker is present...", + # "Verifying lvm2 is present...", + # "Verifying time synchronization is in place...", + # "Unit chronyd.service is enabled and running", + # "Repeating the final host check...", + # "podman (/usr/bin/podman) version 4.1.1 is present", + # "systemctl is present", + # "lvcreate is present", + # "Unit chronyd.service is enabled and running", + # "Host looks OK", + # "Cluster fsid: 00699884-38f2-11ed-9df2-b8975acfd7d3", + # "Verifying IP 172.24.0.11 port 3300 ...", + # "Verifying IP 172.24.0.11 port 6789 ...", + # "Mon IP `172.24.0.11` is in CIDR network `172.24.0.0/16`", + # "Pulling container image quay.io/ceph/ceph:v16...", + # "Ceph version: ceph version 16.2.10 (45fa1a083152e41a408d15505f594ec5f1b4fe17) pacific (stable)", + # 
"Extracting ceph user uid/gid from container image...", + # "Creating initial keys...", + # "Creating initial monmap...", + # "Creating mon...", + # "Waiting for mon to start...", + # "Waiting for mon...", + # "mon is available", + # "Assimilating anything we can from ceph.conf...", + # "Generating new minimal ceph.conf...", + # "Restarting the monitor...", + # "Setting mon public_network to 172.24.0.0/16", + # "Setting cluster_network to 172.25.0.0/24", + # "Wrote config to /etc/ceph/ceph.conf", + # "Wrote keyring to /etc/ceph/ceph.client.admin.keyring", + # "Creating mgr...", + # "Verifying port 9283 ...", + # "Waiting for mgr to start...", + # "Waiting for mgr...", + # "mgr not available, waiting (1/15)...", + # "mgr not available, waiting (2/15)...", + # "mgr not available, waiting (3/15)...", + # "mgr not available, waiting (4/15)...", + # "mgr not available, waiting (5/15)...", + # "mgr not available, waiting (6/15)...", + # "mgr not available, waiting (7/15)...", + # "mgr is available", + # "Enabling cephadm module...", + # "Waiting for the mgr to restart...", + # "Waiting for mgr epoch 5...", + # "mgr epoch 5 is available", + # "Setting orchestrator backend to cephadm...", + # "Generating ssh key...", + # "Wrote public SSH key to /etc/ceph/ceph.pub", + # "Adding key to root@localhost authorized_keys...", + # "Adding host qemu01...", + # "Deploying mon service with default placement...", + # "Deploying mgr service with default placement...", + # "Deploying crash service with default placement...", + # "Deploying prometheus service with default placement...", + # "Deploying grafana service with default placement...", + # "Deploying node-exporter service with default placement...", + # "Deploying alertmanager service with default placement...", + # "Enabling the dashboard module...", + # "Waiting for the mgr to restart...", + # "Waiting for mgr epoch 9...", + # "mgr epoch 9 is available", + # "Generating a dashboard self-signed certificate...", + # "Creating 
initial admin user...", + # "Fetching dashboard port number...", + # "Ceph Dashboard is now available at:", + # "", + # "\t URL: https://qemu01.cluster.local:8443/", + # "\t User: admin", + # "\tPassword: shm7es74de", + # "", + # "Enabling client.admin keyring and conf on hosts with \"admin\" label", + # "You can access the Ceph CLI with:", + # "", + # "\tsudo /usr/sbin/cephadm shell --fsid 00699884-38f2-11ed-9df2-b8975acfd7d3 -c /etc/ceph/ceph.conf -k /etc/ceph/ceph.client.admin.keyring", + # "", + # "Please consider enabling telemetry to help improve Ceph:", + # "", + # "\tceph telemetry on", + # "", + # "For more information see:", + # "", + # "\thttps://docs.ceph.com/docs/pacific/mgr/telemetry/", + # "", + # "Bootstrap complete." + # ], \ No newline at end of file diff --git a/cluster/roles/cephadm_bootstrap/tests/inventory b/cluster/roles/cephadm_bootstrap/tests/inventory new file mode 100644 index 0000000..878877b --- /dev/null +++ b/cluster/roles/cephadm_bootstrap/tests/inventory @@ -0,0 +1,2 @@ +localhost + diff --git a/cluster/roles/cephadm_bootstrap/tests/test.yml b/cluster/roles/cephadm_bootstrap/tests/test.yml new file mode 100644 index 0000000..c2fd2bd --- /dev/null +++ b/cluster/roles/cephadm_bootstrap/tests/test.yml @@ -0,0 +1,5 @@ +--- +- hosts: localhost + remote_user: root + roles: + - roles/role-template \ No newline at end of file diff --git a/cluster/roles/cephadm_bootstrap/vars/main.yml b/cluster/roles/cephadm_bootstrap/vars/main.yml new file mode 100644 index 0000000..f655be4 --- /dev/null +++ b/cluster/roles/cephadm_bootstrap/vars/main.yml @@ -0,0 +1,2 @@ +--- +# vars file for roles/role-template \ No newline at end of file diff --git a/cluster/roles/cephadm_prep/.travis.yml b/cluster/roles/cephadm_prep/.travis.yml new file mode 100644 index 0000000..36bbf62 --- /dev/null +++ b/cluster/roles/cephadm_prep/.travis.yml @@ -0,0 +1,29 @@ +--- +language: python +python: "2.7" + +# Use the new container infrastructure +sudo: false + +# Install 
ansible +addons: + apt: + packages: + - python-pip + +install: + # Install ansible + - pip install ansible + + # Check ansible version + - ansible --version + + # Create ansible.cfg with correct roles_path + - printf '[defaults]\nroles_path=../' >ansible.cfg + +script: + # Basic role syntax check + - ansible-playbook tests/test.yml -i tests/inventory --syntax-check + +notifications: + webhooks: https://galaxy.ansible.com/api/v1/notifications/ \ No newline at end of file diff --git a/cluster/roles/cephadm_prep/README.md b/cluster/roles/cephadm_prep/README.md new file mode 100644 index 0000000..225dd44 --- /dev/null +++ b/cluster/roles/cephadm_prep/README.md @@ -0,0 +1,38 @@ +Role Name +========= + +A brief description of the role goes here. + +Requirements +------------ + +Any pre-requisites that may not be covered by Ansible itself or the role should be mentioned here. For instance, if the role uses the EC2 module, it may be a good idea to mention in this section that the boto package is required. + +Role Variables +-------------- + +A description of the settable variables for this role should go here, including any variables that are in defaults/main.yml, vars/main.yml, and any variables that can/should be set via parameters to the role. Any variables that are read from other roles and/or the global scope (ie. hostvars, group vars, etc.) should be mentioned here as well. + +Dependencies +------------ + +A list of other roles hosted on Galaxy should go here, plus any details in regards to parameters that may need to be set for other roles, or variables that are used from other roles. 
+ +Example Playbook +---------------- + +Including an example of how to use your role (for instance, with variables passed in as parameters) is always nice for users too: + + - hosts: servers + roles: + - { role: username.rolename, x: 42 } + +License +------- + +BSD + +Author Information +------------------ + +An optional section for the role authors to include contact information, or a website (HTML is not allowed). diff --git a/cluster/roles/cephadm_prep/defaults/main.yml b/cluster/roles/cephadm_prep/defaults/main.yml new file mode 100644 index 0000000..deada58 --- /dev/null +++ b/cluster/roles/cephadm_prep/defaults/main.yml @@ -0,0 +1,10 @@ +--- +cephadm_packages: + cephadm_host_packages: + - podman + - epel-release + - ceph-common + - cephadm + + # - python3-pip + # - python3-virtualenv \ No newline at end of file diff --git a/cluster/roles/cephadm_prep/handlers/main.yml b/cluster/roles/cephadm_prep/handlers/main.yml new file mode 100644 index 0000000..2d28ec4 --- /dev/null +++ b/cluster/roles/cephadm_prep/handlers/main.yml @@ -0,0 +1,2 @@ +--- +# handlers file for roles/role-template \ No newline at end of file diff --git a/cluster/roles/cephadm_prep/meta/main.yml b/cluster/roles/cephadm_prep/meta/main.yml new file mode 100644 index 0000000..227ad9c --- /dev/null +++ b/cluster/roles/cephadm_prep/meta/main.yml @@ -0,0 +1,53 @@ +galaxy_info: + author: your name + description: your role description + company: your company (optional) + + # If the issue tracker for your role is not on github, uncomment the + # next line and provide a value + # issue_tracker_url: http://example.com/issue/tracker + + # Choose a valid license ID from https://spdx.org - some suggested licenses: + # - BSD-3-Clause (default) + # - MIT + # - GPL-2.0-or-later + # - GPL-3.0-only + # - Apache-2.0 + # - CC-BY-4.0 + license: license (GPL-2.0-or-later, MIT, etc) + + min_ansible_version: 2.9 + + # If this a Container Enabled role, provide the minimum Ansible Container version. 
+ # min_ansible_container_version: + + # + # Provide a list of supported platforms, and for each platform a list of versions. + # If you don't wish to enumerate all versions for a particular platform, use 'all'. + # To view available platforms and versions (or releases), visit: + # https://galaxy.ansible.com/api/v1/platforms/ + # + # platforms: + # - name: Fedora + # versions: + # - all + # - 25 + # - name: SomePlatform + # versions: + # - all + # - 1.0 + # - 7 + # - 99.99 + + galaxy_tags: [] + # List tags for your role here, one per line. A tag is a keyword that describes + # and categorizes the role. Users find roles by searching for tags. Be sure to + # remove the '[]' above, if you add tags to this list. + # + # NOTE: A tag is limited to a single word comprised of alphanumeric characters. + # Maximum 20 tags per role. + +dependencies: [] + # List your role dependencies here, one per line. Be sure to remove the '[]' above, + # if you add dependencies to this list. + \ No newline at end of file diff --git a/cluster/roles/cephadm_prep/tasks/main.yml b/cluster/roles/cephadm_prep/tasks/main.yml new file mode 100644 index 0000000..77631a9 --- /dev/null +++ b/cluster/roles/cephadm_prep/tasks/main.yml @@ -0,0 +1,108 @@ +--- +- name: download cephadm + ansible.builtin.get_url: + url: https://github.com/ceph/ceph/raw/quincy/src/cephadm/cephadm + dest: ~/ + mode: '0750' + +- name: install ceph repo + ansible.builtin.command: + cmd: "~/cephadm add-repo --release quincy" + +# use repo method instead +# - name: install cephadm +# ansible.builtin.command: +# cmd: "~/cephadm install" + +# ceph from rhel repos, not latest release +# - name: add ceph repository +# package: +# name: "centos-release-ceph-pacific" +# state: present + +- name: install ceph packages + package: + name: "{{ cephadm_packages['cephadm_host_packages'] }}" + state: present + +- name: create ssh keypair + openssh_keypair: + path: /tmp/cephadm_rsa + size: 4096 + owner: "{{ lookup('env', 'USER') }}" + 
delegate_to: localhost + run_once: true + +- name: store SSH pubkey as a variable + command: >- + cat /tmp/cephadm_rsa.pub + changed_when: + - cephadm_rsa_pub.rc is defined + - cephadm_rsa_pub.rc > 0 + delegate_to: localhost + register: cephadm_rsa_pub + run_once: true + +- name: create ~/.ssh + ansible.builtin.file: + path: ~/.ssh + state: directory + mode: '0700' + +- name: create ~/.ssh/authorized_keys + ansible.builtin.file: + path: ~/.ssh/authorized_keys + state: touch + mode: '0644' + +- name: copy SSH keypair to all hosts + copy: + src: /tmp/{{ entry }} + dest: "~/.ssh/{{ entry }}" + force: true + owner: root + group: root + mode: 0600 + loop: + - cephadm_rsa + - cephadm_rsa.pub + loop_control: + loop_var: entry + vars: + file: "{{ entry | regex_replace('cephadm_', 'id_') }}" + # become: yes + # become_user: root + # become_method: sudo + +- name: Authorize the SSH keypair on all hosts + authorized_key: + key: "{{ cephadm_rsa_pub.stdout_lines[0] }}" + user: root + state: present + +# - name: Authorize local SSH pub key on all hosts +# authorized_key: +# key: "{{ lookup('file', '~/.ssh/id_rsa.pub') }}" +# comment: "" +# user: root +# state: present + +- name: add ~/.ssh/config referencing all other ceph hosts + blockinfile: + block: | + {% for host in groups['ceph'] | difference(inventory_hostname) %} + Host {{ hostvars[host]['inventory_hostname'] }} + HostName {{ hostvars[host]['ansible_' + interface][ipv]['address'] }} + IdentityFile ~/.ssh/cephadm_rsa + PreferredAuthentications publickey + User root + StrictHostKeyChecking accept-new + {% if not loop.last %} + {% endif %} + {% endfor %} + create: true + dest: ~/.ssh/config + vars: + # use nmcli connection as the interface name to find the ip, this relies on the hypervisor_vxlan role creating bridge interfaces rather than physical interfaces such as eth0 + interface: "{{ vars['hypervisor']['nmcli_con_names']['ceph_public'] }}" + ipv: "ipv4" \ No newline at end of file diff --git 
a/cluster/roles/cephadm_prep/tests/inventory b/cluster/roles/cephadm_prep/tests/inventory new file mode 100644 index 0000000..878877b --- /dev/null +++ b/cluster/roles/cephadm_prep/tests/inventory @@ -0,0 +1,2 @@ +localhost + diff --git a/cluster/roles/cephadm_prep/tests/test.yml b/cluster/roles/cephadm_prep/tests/test.yml new file mode 100644 index 0000000..c2fd2bd --- /dev/null +++ b/cluster/roles/cephadm_prep/tests/test.yml @@ -0,0 +1,5 @@ +--- +- hosts: localhost + remote_user: root + roles: + - roles/role-template \ No newline at end of file diff --git a/cluster/roles/cephadm_prep/vars/main.yml b/cluster/roles/cephadm_prep/vars/main.yml new file mode 100644 index 0000000..f655be4 --- /dev/null +++ b/cluster/roles/cephadm_prep/vars/main.yml @@ -0,0 +1,2 @@ +--- +# vars file for roles/role-template \ No newline at end of file diff --git a/cluster/roles/cephadm_services/.travis.yml b/cluster/roles/cephadm_services/.travis.yml new file mode 100644 index 0000000..36bbf62 --- /dev/null +++ b/cluster/roles/cephadm_services/.travis.yml @@ -0,0 +1,29 @@ +--- +language: python +python: "2.7" + +# Use the new container infrastructure +sudo: false + +# Install ansible +addons: + apt: + packages: + - python-pip + +install: + # Install ansible + - pip install ansible + + # Check ansible version + - ansible --version + + # Create ansible.cfg with correct roles_path + - printf '[defaults]\nroles_path=../' >ansible.cfg + +script: + # Basic role syntax check + - ansible-playbook tests/test.yml -i tests/inventory --syntax-check + +notifications: + webhooks: https://galaxy.ansible.com/api/v1/notifications/ \ No newline at end of file diff --git a/cluster/roles/cephadm_services/README.md b/cluster/roles/cephadm_services/README.md new file mode 100644 index 0000000..225dd44 --- /dev/null +++ b/cluster/roles/cephadm_services/README.md @@ -0,0 +1,38 @@ +Role Name +========= + +A brief description of the role goes here. 
+ +Requirements +------------ + +Any pre-requisites that may not be covered by Ansible itself or the role should be mentioned here. For instance, if the role uses the EC2 module, it may be a good idea to mention in this section that the boto package is required. + +Role Variables +-------------- + +A description of the settable variables for this role should go here, including any variables that are in defaults/main.yml, vars/main.yml, and any variables that can/should be set via parameters to the role. Any variables that are read from other roles and/or the global scope (ie. hostvars, group vars, etc.) should be mentioned here as well. + +Dependencies +------------ + +A list of other roles hosted on Galaxy should go here, plus any details in regards to parameters that may need to be set for other roles, or variables that are used from other roles. + +Example Playbook +---------------- + +Including an example of how to use your role (for instance, with variables passed in as parameters) is always nice for users too: + + - hosts: servers + roles: + - { role: username.rolename, x: 42 } + +License +------- + +BSD + +Author Information +------------------ + +An optional section for the role authors to include contact information, or a website (HTML is not allowed). 
diff --git a/cluster/roles/cephadm_services/defaults/main.yml b/cluster/roles/cephadm_services/defaults/main.yml new file mode 100644 index 0000000..2bec87e --- /dev/null +++ b/cluster/roles/cephadm_services/defaults/main.yml @@ -0,0 +1,2 @@ +--- +# defaults file for roles/role-template \ No newline at end of file diff --git a/cluster/roles/cephadm_services/handlers/main.yml b/cluster/roles/cephadm_services/handlers/main.yml new file mode 100644 index 0000000..2d28ec4 --- /dev/null +++ b/cluster/roles/cephadm_services/handlers/main.yml @@ -0,0 +1,2 @@ +--- +# handlers file for roles/role-template \ No newline at end of file diff --git a/cluster/roles/cephadm_services/meta/main.yml b/cluster/roles/cephadm_services/meta/main.yml new file mode 100644 index 0000000..227ad9c --- /dev/null +++ b/cluster/roles/cephadm_services/meta/main.yml @@ -0,0 +1,53 @@ +galaxy_info: + author: your name + description: your role description + company: your company (optional) + + # If the issue tracker for your role is not on github, uncomment the + # next line and provide a value + # issue_tracker_url: http://example.com/issue/tracker + + # Choose a valid license ID from https://spdx.org - some suggested licenses: + # - BSD-3-Clause (default) + # - MIT + # - GPL-2.0-or-later + # - GPL-3.0-only + # - Apache-2.0 + # - CC-BY-4.0 + license: license (GPL-2.0-or-later, MIT, etc) + + min_ansible_version: 2.9 + + # If this a Container Enabled role, provide the minimum Ansible Container version. + # min_ansible_container_version: + + # + # Provide a list of supported platforms, and for each platform a list of versions. + # If you don't wish to enumerate all versions for a particular platform, use 'all'. 
+ # To view available platforms and versions (or releases), visit: + # https://galaxy.ansible.com/api/v1/platforms/ + # + # platforms: + # - name: Fedora + # versions: + # - all + # - 25 + # - name: SomePlatform + # versions: + # - all + # - 1.0 + # - 7 + # - 99.99 + + galaxy_tags: [] + # List tags for your role here, one per line. A tag is a keyword that describes + # and categorizes the role. Users find roles by searching for tags. Be sure to + # remove the '[]' above, if you add tags to this list. + # + # NOTE: A tag is limited to a single word comprised of alphanumeric characters. + # Maximum 20 tags per role. + +dependencies: [] + # List your role dependencies here, one per line. Be sure to remove the '[]' above, + # if you add dependencies to this list. + \ No newline at end of file diff --git a/cluster/roles/cephadm_services/tasks/main.yml b/cluster/roles/cephadm_services/tasks/main.yml new file mode 100644 index 0000000..f469731 --- /dev/null +++ b/cluster/roles/cephadm_services/tasks/main.yml @@ -0,0 +1,267 @@ +--- +######## runtime_facts +# - name: runtime facts +# ansible.builtin.set_fact: +# _tmp_service_location: "/root/ceph_service_definition" + +# set the networks +# ceph config set global public_network 192.168.101.0/24 +# ceph config set global cluster_network 192.168.101.0/24 + +# default service spec +# https://docs.ceph.com/en/latest/cephadm/services/#updating-service-specifications +# osd explained +# https://docs.ceph.com/en/latest/rados/configuration/bluestore-config-ref/ +# https://docs.ceph.com/en/latest/cephadm/services/osd/#drivegroups # great ref for spinning, ssd and nvme as data db and wal respectively +# ceph orch ls --export +# ceph orch apply -i myservice.yaml [--dry-run] # import excellent +# ceph orch redeploy grafana - this seems very important after changes +# + +######## configure ceph and provision ceph services + +- name: configure ceph and provision ceph services via the first ceph host + block: + + - name: set networks, PG 
autoscaling on, memory autotune on + ansible.builtin.command: + cmd: "{{ entry }}" + loop: + - "ceph config set global public_network {{ public_network_range }}" + - "ceph config set global cluster_network {{ cluster_network_range }}" + - "ceph config set global osd_pool_default_pg_autoscale_mode on" + - "ceph config set osd osd_memory_target_autotune true" + loop_control: + loop_var: entry + vars: + public_network_name: "{{ vars['hypervisor']['nmcli_con_names']['ceph_public'] }}" + public_network: "{{ vars[config_namespace]['cluster_networks'][public_network_name]['network'] }}" + public_netmask: "{{ vars[config_namespace]['cluster_networks'][public_network_name]['netmask'] }}" + public_network_range: "{{ public_network }}/{{ (public_network + '/' + public_netmask) | ansible.utils.ipaddr('prefix') }}" + cluster_network_name: "{{ vars['hypervisor']['nmcli_con_names']['ceph_cluster'] }}" + cluster_network: "{{ vars[config_namespace]['cluster_networks'][cluster_network_name]['network'] }}" + cluster_netmask: "{{ vars[config_namespace]['cluster_networks'][cluster_network_name]['netmask'] }}" + cluster_network_range: "{{ cluster_network }}/{{ (cluster_network + '/' + cluster_netmask) | ansible.utils.ipaddr('prefix') }}" + + - name: apply ceph service labels to hosts + ansible.builtin.command: + cmd: "ceph orch host label add {{ host }} {{ label }}" + with_subelements: + - "{{ hypervisor['ceph_service_placement'] }}" + - labels + loop_control: + loop_var: entry + vars: + host: "{{ entry[0]['host'] }}" + label: "{{ entry[1] }}" + + - name: create yaml ceph service definition fact + set_fact: + _ceph_service_definition: "{{ _ceph_service_definition | default() + '---\n' + content }}" + loop: "{{ hypervisor['ceph_service_spec'] }}" + loop_control: + loop_var: entry + vars: + content: "{{ entry | to_nice_yaml(indent=2,sort_keys=False) }}" + when: + - not entry['service_type'] == 'nfs' + + - name: create cephadm service spec file + copy: + content: "{{ 
_ceph_service_definition }}" + dest: "/root/ceph_service_spec.yml" + force: yes + validate: "ceph orch apply -i %s --dry-run" + register: _ceph_service_definition_check + ignore_errors: yes + + - name: stop ceph deployment where services file does not validate + meta: end_play + when: + - _ceph_service_definition_check['exit_status'] is defined and not _ceph_service_definition_check['exit_status'] == 0 + + - name: apply ceph service spec + ansible.builtin.command: + cmd: "ceph orch apply -i /root/ceph_service_spec.yml" + register: _apply_ceph_service_spec + + # - debug: + # msg: + # - "{{ _apply_ceph_service_spec }}" + + - name: wait for OSD provision + ansible.builtin.command: + cmd: "ceph orch ls -f json" + register: _ceph_service + until: osd_running == osd_count + retries: 12 + delay: 10 + vars: + osd_service: "{{ _ceph_service['stdout'] | from_json | selectattr('service_type', '==', 'osd') | first }}" + osd_count: "{{ osd_service['status']['size'] | int }}" + osd_running: "{{ osd_service['status']['running'] | int }}" + + - name: query ceph osds + ansible.builtin.command: + cmd: "ceph osd df -f json" + register: _ceph_osd_info + + - name: determine if too many placement groups are being requested + debug: + msg: + - "too many placement groups are being requested, consider adding more OSDs, provisioning less pools or setting placement groups manually" + - "available placement groups: {{ available_pg }}" + - "requested placement groups: {{ requested_pg }}" + vars: + pg_per_osd: 250 + pool_default_pg: 32 + device_health_metrics_pg: 1 + osd_count: "{{ (_ceph_osd_info['stdout'] | from_json)['nodes'] | length }}" + rgw_pg: "{{ hypervisor['ceph_service_spec'] | selectattr('service_type', '==', 'rgw') | length * 104 }}" + nfs_pg: "{{ hypervisor['ceph_service_spec'] | selectattr('service_type', '==', 'nfs') | length * 1 }}" + static_pg_allocation: "{{ hypervisor['ceph_pools'] | selectattr('pg', 'defined') | map(attribute='pg') | sum }}" + default_pg_allocation: "{{ 
hypervisor['ceph_pools'] | selectattr('pg', 'undefined') | length * pool_default_pg }}" + available_pg: "{{ (osd_count | int) * pg_per_osd - device_health_metrics_pg }}" + requested_pg: "{{ ((static_pg_allocation | int) + (default_pg_allocation | int) + (rgw_pg | int) + (nfs_pg | int) + device_health_metrics_pg) * (osd_count | int) }}" + too_many_pg: "{{ (requested_pg | int) > (available_pg | int) }}" # cast first: templated vars render as strings and would otherwise compare lexicographically + register: _too_many_pg + when: + - too_many_pg + + - name: stop ceph deployment where too many placement groups are being requested + meta: end_play + when: + - _too_many_pg is not skipped # a result that actually ran has no 'skipped' key, so use the test rather than indexing + + # this will skip pools that already exist + - name: create pools + ansible.builtin.command: + cmd: "ceph osd pool create {{ name }} {{ pg }}" + loop: "{{ hypervisor['ceph_pools'] }}" + loop_control: + loop_var: entry + vars: + name: "{{ entry['name'] }}" + pg: "{{ entry['pg'] | default() | int }}" + + # this will skip volumes that already exist + - name: create cephfs volumes + ansible.builtin.command: + cmd: "ceph fs new {{ cephfs_volume_name }} {{ cephfs_meta_pool }} {{ cephfs_data_pool }}" + loop: "{{ cephfs_volumes }}" + loop_control: + loop_var: entry + vars: + cephfs_volumes: "{{ hypervisor['ceph_pools'] | selectattr('type', '==', 'cephfs') | map(attribute='volume') | unique }}" + cephfs_volume_name: "{{ entry }}" + cephfs_data_pool: "{{ hypervisor['ceph_pools'] | selectattr('type', '==', 'cephfs') | selectattr('volume', '==', entry) | selectattr('cephfs_type', '==', 'data') | map(attribute='name') | first | default() }}" + cephfs_meta_pool: "{{ hypervisor['ceph_pools'] | selectattr('type', '==', 'cephfs') | selectattr('volume', '==', entry) | selectattr('cephfs_type', '==', 'meta') | map(attribute='name') | first | default() }}" + cephfs_mds_service_present: "{{ hypervisor['ceph_service_spec'] | selectattr('service_type', '==', 'mds') | length >0 }}" + when: + - cephfs_mds_service_present + - cephfs_data_pool | length >0 and cephfs_meta_pool | length >0 + + # rgw multisite config is required here 
+ + # if nfs service(s) exist in hypervisor['ceph_service_spec'], provision service first to ensure the ceph_service_spec can validate nfs service_type entry + # nfs config file requires an rgw pool or cephfs namespace(volume) + - name: provision ceph nfs + block: + + - name: deploy nfs service + ansible.builtin.command: + cmd: "ceph nfs cluster create {{ nfs_service }}" + loop: "{{ hypervisor['ceph_service_spec'] | selectattr('service_type', '==', 'nfs') }}" + loop_control: + loop_var: entry + vars: + nfs_service: "{{ entry['service_id'] }}" + + - name: create yaml ceph nfs service definition fact + set_fact: + _ceph_nfs_service_definition: "{{ _ceph_nfs_service_definition | default() + '---\n' + content }}" + loop: "{{ hypervisor['ceph_service_spec'] }}" + loop_control: + loop_var: entry + vars: + content: "{{ entry | to_nice_yaml(indent=2,sort_keys=False) }}" + when: + - entry['service_type'] == 'nfs' + + - name: create cephadm nfs service spec file + copy: + content: "{{ _ceph_nfs_service_definition }}" + dest: "/root/ceph_nfs_service_spec.yml" + force: yes + validate: "ceph orch apply -i %s --dry-run" + register: _ceph_nfs_service_definition_check + ignore_errors: yes + + - name: stop ceph deployment where nfs services file does not validate + meta: end_play + when: + - _ceph_nfs_service_definition_check['exit_status'] is defined and not _ceph_nfs_service_definition_check['exit_status'] == 0 + + - name: apply ceph nfs service spec + ansible.builtin.command: + cmd: "ceph orch apply -i /root/ceph_nfs_service_spec.yml" + register: _apply_ceph_service_spec + + # some kind of nfs config required here + # https://docs.ceph.com/en/quincy/mgr/nfs/ + # https://docs.ceph.com/en/latest/mgr/nfs/#mgr-nfs + # ceph nfs cluster config set -i + # https://github.com/nfs-ganesha/nfs-ganesha/blob/next/src/config_samples/ceph.conf + + vars: + nfs_service_present: "{{ hypervisor['ceph_service_spec'] | selectattr('service_type', '==', 'nfs') | length >0 }}" + when: + - 
nfs_service_present + +# ceph fs volume rm cephfs_cluster_volume1 --yes-i-really-mean-it +# ceph fs volume rm cephfs_cluster_volume --yes-i-really-mean-it +# ceph orch rm nfs.ganesha +# ceph orch rm rgw.object +# ceph orch rm mds.cephfs +# ceph orch ls +# ceph osd lspools +# ceph osd pool rm .nfs .nfs --yes-i-really-really-mean-it +# ceph osd pool rm .rgw.root .rgw.root --yes-i-really-really-mean-it +# ceph osd pool rm default.rgw.log default.rgw.log --yes-i-really-really-mean-it +# ceph osd pool rm default.rgw.control default.rgw.control --yes-i-really-really-mean-it +# ceph osd pool rm default.rgw.meta default.rgw.meta --yes-i-really-really-mean-it +# ceph osd pool rm vms --yes-i-really-really-mean-it + +# set dashboard password here +# echo Password0 > password.txt +# ceph dashboard ac-user-set-password admin -i password.txt +# rm -f password.txt + + - name: create dashboard password file + copy: + content: "{{ hypervisor['ceph_dash_admin_password'] }}" + dest: "/root/dashboard_admin_password.txt" + force: yes + + - name: set ceph dashboard admin password + ansible.builtin.command: + cmd: "ceph dashboard ac-user-set-password admin -i /root/dashboard_admin_password.txt" + + - name: remove dashboard password file + ansible.builtin.file: + path: "/root/dashboard_admin_password.txt" + state: absent + + vars: + target_host: "{{ groups['ceph'] | first }}" + when: + - groups['ceph'] | length >0 # checked first so target_host is never templated from an empty group + - target_host == inventory_hostname + +# you need to split rgw and nfs into specialized services as the logic is different +# - cephfs configuration - need to determine if there is service type mds in ceph_service_spec +# - nfs - check if mds is there - not going to use rgw +# - rgw - add multisite later + +# do pools - vms / cephfs data + meta cephfs.cluster_volume.data cephfs.cluster_volume.meta / +# do cephfs + nfs diff --git a/cluster/roles/cephadm_services/tests/inventory b/cluster/roles/cephadm_services/tests/inventory new file mode 100644 index 0000000..878877b --- /dev/null 
+++ b/cluster/roles/cephadm_services/tests/inventory @@ -0,0 +1,2 @@ +localhost + diff --git a/cluster/roles/cephadm_services/tests/test.yml b/cluster/roles/cephadm_services/tests/test.yml new file mode 100644 index 0000000..c2fd2bd --- /dev/null +++ b/cluster/roles/cephadm_services/tests/test.yml @@ -0,0 +1,5 @@ +--- +- hosts: localhost + remote_user: root + roles: + - roles/role-template \ No newline at end of file diff --git a/cluster/roles/cephadm_services/vars/main.yml b/cluster/roles/cephadm_services/vars/main.yml new file mode 100644 index 0000000..f655be4 --- /dev/null +++ b/cluster/roles/cephadm_services/vars/main.yml @@ -0,0 +1,2 @@ +--- +# vars file for roles/role-template \ No newline at end of file diff --git a/cluster/roles/firewalld/.travis.yml b/cluster/roles/firewalld/.travis.yml new file mode 100644 index 0000000..36bbf62 --- /dev/null +++ b/cluster/roles/firewalld/.travis.yml @@ -0,0 +1,29 @@ +--- +language: python +python: "2.7" + +# Use the new container infrastructure +sudo: false + +# Install ansible +addons: + apt: + packages: + - python-pip + +install: + # Install ansible + - pip install ansible + + # Check ansible version + - ansible --version + + # Create ansible.cfg with correct roles_path + - printf '[defaults]\nroles_path=../' >ansible.cfg + +script: + # Basic role syntax check + - ansible-playbook tests/test.yml -i tests/inventory --syntax-check + +notifications: + webhooks: https://galaxy.ansible.com/api/v1/notifications/ \ No newline at end of file diff --git a/cluster/roles/firewalld/README.md b/cluster/roles/firewalld/README.md new file mode 100644 index 0000000..7e22abd --- /dev/null +++ b/cluster/roles/firewalld/README.md @@ -0,0 +1,110 @@ +Role Name +========= + +This role configures firewalld. + +Requirements +------------ + +The role handles overlay configuration merged from inventory/group_vars/firewalld.yml. 
+To merge configurations there is a dependency on the merge_vars role to facilitate deep merging of dictionaries with nested lists; the merge_vars role depends on the 3rd party plugin ansible_merge_vars. + +Role Variables +-------------- + +This role accepts custom configuration from inventory/group_vars/firewalld.yml. +As the role creates dynamic firewall rulesets, read the comments in the following files to understand the behaviour. +- firewalld/defaults/main.yml +- inventory/group_vars/firewalld.yml (as listed below) + +An example of custom rulesets injected at inventory/group_vars/firewalld.yml follows: + +```yml +# This is an example to demonstrate +# - behaviour of the role +# - how to add overlay/merged custom configuration items to group_vars inventory/group_vars/firewalld.yml + +firewalld: + enable: true + +# create new ruleset +# - each xcat_network with a corresponding entry in inventory/networks.yml will have an ipset automatically generated +# - each service with an xcat_network entry will assign the service to a zone of that name, the zone accepts ingress from the corresponding ipset +# - xcat_groups will assign the ruleset to hosts in groups +# +# this ruleset applies inbound ftp to cluster and infiniband zones on hosts in groups all/compute/slurm/ansible + firewalld_services: + - name: ftp + short: "FTP" + description: "FTP service" + port: + - port: 21 + protocol: tcp + xcat_groups: + - compute + - all + - slurm + - ansible + xcat_networks: + - cluster + - infiniband + +# create new ruleset with a custom zone +# - the xcat_networks entry zabbix is not present in inventory/networks.yml, a new zone zabbix will be created +# - the zone requires an ipset named zabbix to add an ingress source + - name: zabbix + short: "Zabbix" + description: "Zabbix Ports" + port: + - port: 10050 + protocol: tcp + - port: 10051 + protocol: tcp + xcat_groups: + - all + xcat_networks: + - zabbix + +# create new ipset +# - this ipset is for the corresponding auto-generated 
zabbix zone required by the zabbix service(ruleset) + firewalld_ipsets: + zabbix: + short: zabbix + description: zabbix ipset + type: 'hash:ip' + targets: + - 172.22.1.220/32 + # - 172.22.1.0/24 + # - 10.0.10.0/16 + +# create new zone +# - this zone example has an embedded ruleset to allow ANY inbound from IP range, no service or ipset is required + firewalld_zones: + - name: mgt + short: "MGT" + description: "management host" + target: "ACCEPT" + source: + - address: 172.22.1.220/32 + + # network <-> network allow all rule + # - ipset cluster has a corresponding inventory/group_vars/network.yml entry and is thus auto generated and populated with source address range + # - ipsets can only be bound to a single zone, to use this format of rule, any service with a 'cluster' entry in 'xcat_networks:' list requires 'cluster' to be removed. + # + # - name: cluster2cluster + # short: "cluster2cluster" + # description: "allow ingress from cluster network" + # target: "ACCEPT" + # source: + # - ipset: cluster +``` + +License +------- + +BSD + +Author Information +------------------ + +An optional section for the role authors to include contact information, or a website (HTML is not allowed). diff --git a/cluster/roles/firewalld/defaults/main.yml b/cluster/roles/firewalld/defaults/main.yml new file mode 100644 index 0000000..af80744 --- /dev/null +++ b/cluster/roles/firewalld/defaults/main.yml @@ -0,0 +1,213 @@ +# Copyright 2022 OCF Ltd. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. +# -*- coding: utf-8 -*- +# vim: ft=yaml +--- + +firewalld: + ## Toggle firewalld as installed and started + enable: true + + ## INI entries to overide + firewalld_conf_file: /etc/firewalld/firewalld.conf + firewalld_conf: + DefaultZone: "public" + LogDenied: "off" + + ## Configure permanent firewalld services (xml config file) + firewalld_services: + - name: ssh + short: "SSH" + description: "SSH service" + port: + - port: 22 + protocol: tcp + xcat_groups: + - all + xcat_networks: + - campus + - cluster + - infiniband + - ipmi + - lustre + - name: dhcpd + short: "dhcp" + description: "DHCP Service" + port: + - port: 7911 + protocol: tcp + xcat_groups: + - compute + - all + xcat_networks: + - cluster + # # + # # Sample rulesets + # # + # - name: zabbix + # short: "Zabbix" + # description: "Zabbix Ports" + # port: + # - port: 10050 + # protocol: tcp + # - port: 10051 + # protocol: tcp + # xcat_groups: + # - compute + # - all + # - slurm + # - ansible + # xcat_networks: + # - cluster + # - infiniband + # - name: bacula + # short: "Bacula" + # description: "Bacula Client" + # port: + # - port: 9102 + # protocol: tcp + # xcat_groups: + # - compute + # - all + # - slurm + # - ansible + # xcat_networks: + # - cluster + # - infiniband + # - name: ftp + # short: "FTP" + # description: "FTP Client/Server" + # port: + # - port: 21 + # protocol: tcp + # xcat_groups: + # - compute + # - all + # - slurm + # - ansible + # xcat_networks: + # - cluster + # - infiniband + # - name: xCAT + # short: "xcatd" + # description: "xCAT Services" + # port: + # - port: 3001 + # protocol: tcp + # - port: 3002 + # protocol: tcp + # - port: 3003 + # protocol: tcp + # - port: 623 + # protocol: udp + # xcat_groups: + # - compute + # - all + # - slurm + # - ansible + # xcat_networks: + # - cluster + # - infiniband + # - name: rsyslogd + # short: "rsyslogd" + # description: "Rsyslog Service" + # 
port: + # - port: 514 + # protocol: tcp + # xcat_groups: + # - compute + # - all + # - slurm + # - ansible + # xcat_networks: + # - cluster + # - infiniband + # - name: named + # short: "named" + # description: "DNS Service" + # port: + # - port: 53 + # protocol: tcp + # - port: 953 + # protocol: tcp + # xcat_groups: + # - compute + # - all + # - slurm + # - ansible + # xcat_networks: + # - cluster + # - infiniband + + ## Configure permanent firewalld zones (xml config file) + firewalld_zones: + # + # network <-> network allow all rules (ipset cluster is auto generated from xcat_networks) + # ipsets can only be bound to a single zone, to use this format of rule, any service with a 'cluster' entry in 'xcat_networks:' list requires 'cluster' to be removed. + # + # - name: cluster2cluster + # short: "cluster2cluster" + # description: "allow ingress from cluster network" + # target: "ACCEPT" + # source: + # - ipset: cluster + # + # inbuilt safety rule + # + - name: public + short: "Public" + description: "For use in public areas. You do not trust the other computers on networks to not harm your computer. Only selected incoming connections are accepted." 
+ service: + - name: "ssh" + # + # accept any traffic from management hosts + # + # - name: mgt + # short: "MGT" + # description: "Trust my management hosts" + # target: "ACCEPT" + # source: + # - address: 172.22.1.220/32 + # - address: 172.22.1.221/32 + + ## Configure permanent firewalld ipsets (xml config file) + firewalld_ipsets: + fail2ban-ssh: + short: fail2ban-ssh + description: fail2ban-ssh ipset + type: 'hash:ip' + options: + maxelem: + - 65536 + timeout: + - 300 + hashsize: + - 1024 + targets: + - 10.0.0.1 + # fail2ban-ssh-ipv6: + # short: fail2ban-ssh-ipv6 + # description: fail2ban-ssh-ipv6 ipset + # type: 'hash:ip' + # options: + # family: + # - inet6 + # maxelem: + # - 65536 + # timeout: + # - 300 + # hashsize: + # - 1024 + # targets: + # - 2a01::1 diff --git a/cluster/roles/firewalld/firewalld.yml.example b/cluster/roles/firewalld/firewalld.yml.example new file mode 100644 index 0000000..67e7adf --- /dev/null +++ b/cluster/roles/firewalld/firewalld.yml.example @@ -0,0 +1,66 @@ +# This is an example to demonstrate +# - behaviour of the role +# - how to add overlay/merged custom configuration items to group_vars inventory/group_vars/firewalld.xml + +firewalld: + enable: true + +# create new ruleset +# - each xcat_network with a corresponding entry in inventroy/networks.yml will have an ipset automatically generated +# - each xcat_network entry will assign service to a zone of that name, the zone accepts ingress from the corresponding ipset +# - xcat_groups will assign the ruleset to hosts in groups +# +# this ruleset applies inbound ftp to cluster and infiniband zones on hosts in groups all/compute/slurm/ansible + firewalld_services: + - name: ftp + short: "FTP" + description: "FTP service" + port: + - port: 21 + protocol: tcp + xcat_groups: + - compute + - all + - slurm + - ansible + xcat_networks: + - cluster + - infiniband + +# create new ruleset with a custom zone +# - the xcat_networks entry zabbix is not present in inventroy/networks.yml, a new 
zone zabbix will be created +# - the zone requires an ipset named zabbix to add an ingress source + - name: zabbix + short: "Zabbix" + description: "Zabbix Ports" + port: + - port: 10050 + protocol: tcp + - port: 10051 + protocol: tcp + xcat_groups: + - all + xcat_networks: + - zabbix + +# create new ipset +# - this ipset is for the corresponding auto-generated zabbix zone required by the zabbix service(ruleset) + firewalld_ipsets: + zabbix: + short: zabbix + description: zabbix ipset + type: 'hash:ip' + targets: + - 172.22.1.220/32 + # - 172.22.1.0/24 + # - 10.0.10.0/16 + +# create new zone +# - this zone has an embedded ruleset to allow ANY inbound from IP range, no ipset is required + firewalld_zones: + - name: mgt + short: "MGT" + description: "management host" + target: "ACCEPT" + source: + - address: 172.22.1.220/32 \ No newline at end of file diff --git a/cluster/roles/firewalld/handlers/main.yml b/cluster/roles/firewalld/handlers/main.yml new file mode 100644 index 0000000..0258ced --- /dev/null +++ b/cluster/roles/firewalld/handlers/main.yml @@ -0,0 +1,23 @@ +# Copyright 2022 OCF Ltd. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# -*- coding: utf-8 -*- +# vim: ft=yaml +--- + +- name: reload/enable firewalld + ansible.builtin.systemd: + name: firewalld + state: reloaded + enabled: true + listen: "reload_firewalld" \ No newline at end of file diff --git a/cluster/roles/firewalld/meta/main.yml b/cluster/roles/firewalld/meta/main.yml new file mode 100644 index 0000000..c572acc --- /dev/null +++ b/cluster/roles/firewalld/meta/main.yml @@ -0,0 +1,52 @@ +galaxy_info: + author: your name + description: your role description + company: your company (optional) + + # If the issue tracker for your role is not on github, uncomment the + # next line and provide a value + # issue_tracker_url: http://example.com/issue/tracker + + # Choose a valid license ID from https://spdx.org - some suggested licenses: + # - BSD-3-Clause (default) + # - MIT + # - GPL-2.0-or-later + # - GPL-3.0-only + # - Apache-2.0 + # - CC-BY-4.0 + license: license (GPL-2.0-or-later, MIT, etc) + + min_ansible_version: 2.1 + + # If this a Container Enabled role, provide the minimum Ansible Container version. + # min_ansible_container_version: + + # + # Provide a list of supported platforms, and for each platform a list of versions. + # If you don't wish to enumerate all versions for a particular platform, use 'all'. + # To view available platforms and versions (or releases), visit: + # https://galaxy.ansible.com/api/v1/platforms/ + # + # platforms: + # - name: Fedora + # versions: + # - all + # - 25 + # - name: SomePlatform + # versions: + # - all + # - 1.0 + # - 7 + # - 99.99 + + galaxy_tags: [] + # List tags for your role here, one per line. A tag is a keyword that describes + # and categorizes the role. Users find roles by searching for tags. Be sure to + # remove the '[]' above, if you add tags to this list. + # + # NOTE: A tag is limited to a single word comprised of alphanumeric characters. + # Maximum 20 tags per role. + +dependencies: [] + # List your role dependencies here, one per line. 
Be sure to remove the '[]' above, + # if you add dependencies to this list. diff --git a/cluster/roles/firewalld/tasks/main.yml b/cluster/roles/firewalld/tasks/main.yml new file mode 100644 index 0000000..92d5cdb --- /dev/null +++ b/cluster/roles/firewalld/tasks/main.yml @@ -0,0 +1,493 @@ +# Copyright 2022 OCF Ltd. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# -*- coding: utf-8 -*- +# vim: ft=yaml +--- + +######## inherit custom variables from inventory/group_vars/firewalld.yml (merged over this role's defaults/main.yml) + +- name: merge custom vars + block: + + - name: set role variable sources + set_fact: + role_info: + role_defaults_file: "{{ role_path }}/defaults/main.yml" + role_override_file: "{{ ansible_inventory_sources[0] | dirname }}/group_vars/{{ role_name }}.yml" + vars_return: "placeholder" # sentinel tested below; presumably overwritten by the merge_vars role - TODO confirm + + - set_fact: + source_role: "{{ role_name }}" + + - name: run merge_vars role + include_role: + name: "merge_vars" + vars: + a_config_file: "{{ role_info['role_defaults_file'] }}" + b_config_file: "{{ role_info['role_override_file'] }}" + calling_role: "{{ source_role }}" + + - name: merge custom vars to vars[] + set_fact: + { "{{ entry }}": "{{ role_info['vars_return'][entry] }}" } + loop: "{{ role_info['vars_return'] | list }}" + loop_control: + loop_var: entry + when: + - not role_info['vars_return'] == 'placeholder' + + delegate_to: localhost + +######## setup packages + +- name: update package facts + ansible.builtin.package_facts: + manager: auto + strategy: all + when: 
ansible_facts['packages'] is not defined + +- name: install firewalld packages + block: + + - name: install firewalld + ansible.builtin.package: + name: + - firewalld + - ipset + - nftables + state: latest + + - name: Install python-firewall + package: + name: python-firewall + state: present + when: + - ansible_facts['os_family'] == 'RedHat' and ansible_facts['distribution_major_version'] == '7' + + - name: Install python3-firewall + package: + name: python3-firewall + state: present + when: + - ansible_facts['os_family'] == 'RedHat' and ansible_facts['distribution_major_version'] == '8' + + when: + - vars['firewalld']['enable'] | bool + - ansible_facts['packages']['firewalld'] is not defined or + ansible_facts['packages']['ipset'] is not defined or + ansible_facts['packages']['nftables'] is not defined + +- name: update service facts + ansible.builtin.service_facts: + +######## disable firewall + +- name: disable firewalld + ansible.builtin.systemd: + name: firewalld + enabled: no + state: stopped + when: + - ansible_facts['services']['firewalld.service'] is defined # only stop/disable a unit that exists on the host + - not vars['firewalld']['enable'] | bool + +######## render firewalld config file + +- name: update INI entries in firewalld config + ini_file: + path: "{{ firewalld['firewalld_conf_file'] }}" + no_extra_spaces: true + # write to root of document not under a section + section: null + option: "{{ entry.key }}" + value: "{{ entry.value }}" + loop: "{{ firewalld['firewalld_conf'] | dict2items }}" + loop_control: + loop_var: entry + notify: reload_firewalld + when: + - firewalld['enable'] | bool + +######## map services to zones and networks + +# map host 'xcat_groups' (hostvars[ansible_hostname]) to services 'xcat_groups' (vars['firewalld']['firewalld_services'] list item ['xcat_groups']) +# determine if the service (firewall rule) is applicable to the host + +- name: map services to zones + block: + + - name: find firewalld services to be applied to each xcat_groups that this host is a member of + 
set_fact: + target_services: "{{ target_services | default([]) + [service] }}" + when: xcat_group in hostvars[ansible_hostname]['xcat_groups'] + with_subelements: + - "{{ firewalld['firewalld_services'] }}" + - xcat_groups + - skip_missing: True + loop_control: + loop_var: entry + vars: + xcat_group: "{{ entry.1 }}" + service: "{{ entry.0 }}" + + # - debug: + # msg: + # - "{{ target_services }}" + + - name: remove duplicate service entries where host in multiple xcat_groups + set_fact: + target_services: "{{ target_services | unique }}" + + when: + - firewalld['enable'] | bool + +######## configure ipsets + +- name: configure ipsets + block: + + - name: list existing ipsets in /etc/firewalld/ipsets + find: + paths: "/etc/firewalld/ipsets/" + patterns: "*.xml" + recurse: no + file_type: file + register: ipsets_files_all + + - name: exclude ipsets managed by ansible + set_fact: + ipsets_files: "{{ ipsets_files | default([]) + [file_path] }}" + loop: "{{ ipsets_files_all['files'] }}" + loop_control: + loop_var: entry + vars: + file_path: "{{ entry['path'] }}" + file_name: "{{ entry['path'].split('/')[-1].split('.')[0] }}" + when: + - ipsets_files_all['files'] | length >0 + - file_name not in firewalld['firewalld_ipsets'] + - file_name not in vars['steel']['xcat_networks'] | list + + - name: disable ipsets not managed by ansible + copy: + remote_src: yes + src: "{{ file_path }}" + dest: "{{ new_file_path }}" + loop: "{{ ipsets_files }}" + loop_control: + loop_var: entry + vars: + file_path: "{{ entry }}" + new_file_path: "{{ entry.split('.')[0] }}.ansible_disabled" + register: ipsets_disabled + notify: reload_firewalld + when: + - ipsets_files is defined + - ipsets_files | length >0 + + - file: + path: "{{ file_path }}" + state: absent + loop: "{{ ipsets_files }}" + loop_control: + loop_var: entry + vars: + file_path: "{{ entry }}" + when: + - not ipsets_disabled['skipped'] | bool + + - name: generate ipsets from steel['xcat_networks'] + set_fact: + generated_ipsets: 
"{{ generated_ipsets | default({}) | combine({ 'firewalld_ipsets': { network_name: { 'short': network_name, 'description': description, 'type': 'hash:ip', 'targets': [network_cidr] } } }, recursive=True) }}" + # generated_ipsets: "{{ generated_ipsets | default({}) | combine({ 'firewalld_ipsets': { network_name: { 'short': network_name, 'description': description, 'type': 'hash:ip', 'options': { 'maclem': [65536], 'timeout': [300], 'hashsize': [1024] }, 'targets': [network_cidr] } } }, recursive=True) }}" # example with additional options + loop: "{{ steel['xcat_networks'] | dict2items }}" + loop_control: + loop_var: entry + vars: + network_name: "{{ entry.key }}" + network_range: "{{ entry.value['network'] }}" + network_mask: "{{ entry.value['netmask'] }}" + network_cidr: "{{ network_range }}/{{ (network_range + '/' + network_mask) | ansible.utils.ipaddr('prefix') }}" + description: "{{ network_name }} ipset" + + # required where we have provided custom ipsets + - name: merge generated generate ipsets + set_fact: + firewalld: "{{ firewalld | default({}) | combine( generated_ipsets, recursive=True) }}" + when: + - generated_ipsets is defined + + - name: render firewalld ipsets + template: + src: "{{ role_path }}/templates/ipset_template.xml.j2" + dest: /etc/firewalld/ipsets/{{ entry }}.xml + loop: "{{ firewalld['firewalld_ipsets'] | list }}" + loop_control: + loop_var: entry + vars: + short: "{{ firewalld['firewalld_ipsets'][entry]['short'] }}" + description: "{{ firewalld['firewalld_ipsets'][entry]['description'] }}" + type: "{{ firewalld['firewalld_ipsets'][entry]['type'] }}" + options: "{{ firewalld['firewalld_ipsets'][entry]['options'] }}" + targets: "{{ firewalld['firewalld_ipsets'][entry]['targets'] }}" + notify: reload_firewalld + when: + - firewalld['firewalld_ipsets'] is defined + + when: + - firewalld['enable'] | bool + +######## configure services + +- name: configure services + block: + + - name: list existing services in /etc/firewalld/services + find: 
+ paths: "/etc/firewalld/services/" + patterns: "*.xml" + recurse: no + file_type: file + register: services_files_all + + - name: exclude services managed by ansible + set_fact: + services_files: "{{ services_files | default([]) + [file_path] }}" + loop: "{{ services_files_all['files'] }}" + loop_control: + loop_var: entry + vars: + file_path: "{{ entry['path'] }}" + file_name: "{{ entry['path'].split('/')[-1].split('.')[0] }}" + when: + - services_files_all['files'] | length >0 + - file_name not in firewalld['firewalld_services'] | map(attribute='name') + + # - debug: + # msg: + # - "{{ services_files }}" + + - name: disable services not managed by ansible + copy: + remote_src: yes + src: "{{ file_path }}" + dest: "{{ new_file_path }}" + loop: "{{ services_files }}" + loop_control: + loop_var: entry + vars: + file_path: "{{ entry }}" + new_file_path: "{{ entry.split('.')[0] }}.ansible_disabled" + register: services_disabled + notify: reload_firewalld + when: + - services_files is defined + - services_files | length >0 + + # - debug: + # msg: + # - "{{ services_disabled }}" + + - file: + path: "{{ file_path }}" + state: absent + loop: "{{ services_files }}" + loop_control: + loop_var: entry + vars: + file_path: "{{ entry }}" + when: + - not services_disabled['skipped'] | bool + + - name: render firewalld services + template: + src: "{{ role_path }}/templates/service_template.xml.j2" + dest: /etc/firewalld/services/{{ name }}.xml + loop: "{{ target_services }}" + loop_control: + loop_var: entry + vars: + name: "{{ entry['name'] }}" + short: "{{ entry['short'] }}" + description: "{{ entry['description'] }}" + port: "{{ entry['port'] }}" + notify: reload_firewalld + when: + - firewalld['firewalld_services'] is defined + - firewalld['firewalld_services'] | length >0 + + when: + - firewalld['enable'] | bool + +######## configure zones + +- name: configure zones + block: + + # there are no preset zone names, zones are dynamically generated from top level source 
inventory/networks.yml + # to create a custom zone + # - a custom firewalld_services entry with an (arbitrary) xcat_networks list item will generate a new zone + # - a custom firewalld_ipsets entry named the same as the custom services entry will be required to control ingress + # + # - name: generate all zone names from xcat_networks entry in 'firewalld_merged['firewalld_services']' + - name: generate all zone names from xcat_networks entry in 'firewalld['firewalld_services']' + set_fact: + zone_list: "{{ zone_list | default([]) + zone }}" + loop: "{{ target_services }}" + loop_control: + loop_var: entry + vars: + zone: "{{ entry['xcat_networks'] }}" + + - name: filter on unique zones from services + set_fact: + zone_list: "{{ zone_list | unique }}" + + # this is the pivotal task in the playbook to ensure the zones dictionary is in the format accepted for the jinja2 loops in the zone_template.xml.j2 + # loop unique zones, match all services bound to the zone using xcat_networks, get a list of service names and format into a list of dicts each with the same key 'name:', render zones template in compatible format for jinja + # + - name: create zones dictionary + set_fact: + firewalld_zones: "{{ firewalld_zones | default([]) + ([{ 'name': zone_name, 'short': zone_name, 'description': zone_description, 'source': [{ 'ipset': zone_name }], 'service': service_trim }] ) }}" + # firewalld_zones: "{{ firewalld_zones | default([]) + ([{ 'name': zone_name, 'short': zone_name, 'description': zone_description, 'source': [{ 'ipset': zone_name }], 'service': [{ 'name': 'ssh' }, { 'name': 'ftp' }] }] ) }}" # format required + loop: "{{ zone_list }}" + loop_control: + loop_var: entry + vars: + zone_name: "{{ entry }}" + zone_description: "{{ entry }} zone" + # use mapping to return list of services + service: "{{ target_services | selectattr('xcat_networks', 'search', entry) | map(attribute='name') }}" + # + # inline jinja to create a list of dicts for the services used in this
zone + service_format: >- + {% set results = [] %} + {% for svc in service|default([]) %} + {% set sub_results = {} %} + {% set _ = sub_results.update({"name": svc}) %} + {% set _ = results.append(sub_results) %} + {% endfor -%} + {{results}} + # trim whitespaces to allow ansible to interpret as list item in the firewalld_zones dict + service_trim: "{{ service_format | trim }}" + + # - name: add pre-defined zones from firewalld_merged['firewalld_zones'] + - name: add pre-defined zones from firewalld['firewalld_zones'] + set_fact: + firewalld_zones: "{{ firewalld_zones | default([]) + [entry] }}" + loop: "{{ firewalld['firewalld_zones'] }}" + loop_control: + loop_var: entry + when: + - firewalld['firewalld_zones'] is defined + - firewalld['firewalld_zones'] | length >0 + + # - debug: + # msg: + # - "{{ firewalld_zones }}" + + - name: list existing zones in /etc/firewalld/zones + find: + paths: "/etc/firewalld/zones/" + patterns: "*.xml" + recurse: no + file_type: file + register: zones_files_all + + - name: exclude zones managed by ansible + set_fact: + zone_files: "{{ zone_files | default([]) + [file_path] }}" + loop: "{{ zones_files_all['files'] }}" + loop_control: + loop_var: entry + vars: + file_path: "{{ entry['path'] }}" + file_name: "{{ entry['path'].split('/')[-1].split('.')[0] }}" + when: + - zones_files_all['files'] | length >0 + - file_name not in firewalld_zones | map(attribute='name') + + # - debug: + # msg: + # - "{{ zone_files }}" + + - name: disable zones not managed by ansible + copy: + remote_src: yes + src: "{{ file_path }}" + dest: "{{ new_file_path }}" + loop: "{{ zone_files }}" + loop_control: + loop_var: entry + vars: + file_path: "{{ entry }}" + new_file_path: "{{ entry.split('.')[0] }}.ansible_disabled" + register: zones_disabled + notify: reload_firewalld + when: + - zone_files is defined + - zone_files | length >0 + + # - debug: + # msg: + # - "{{ zones_disabled }}" + + - file: + path: "{{ file_path }}" + state: absent + loop: "{{
zone_files }}" + loop_control: + loop_var: entry + vars: + file_path: "{{ entry }}" + when: + - not zones_disabled['skipped'] | bool + + - name: render firewalld zones + template: + src: "{{ role_path }}/templates/zone_template.xml.j2" + dest: /etc/firewalld/zones/{{ name }}.xml + loop: "{{ firewalld_zones | list }}" + loop_control: + loop_var: entry + vars: + name: "{{ entry['name'] }}" + short: "{{ entry['short'] }}" + description: "{{ entry['description'] }}" + service: "{{ entry['service'] }}" + ipset: "{{ entry['name'] }}" + notify: reload_firewalld + when: + - firewalld_zones is defined + - firewalld_zones | length >0 + + when: + - firewalld['enable'] | bool + +######## start firewalld +# +# handler starts/reloads/enables firewalld service + +# - name: Flush handlers +# meta: flush_handlers + +# - name: Start and enable firewalld +# ansible.builtin.systemd: +# name: firewalld.service +# state: restarted +# # daemon_reload: yes +# enabled: yes +# when: +# - ansible_facts['services']['firewalld.service'] is defined +# - firewalld['enable'] | bool \ No newline at end of file diff --git a/cluster/roles/firewalld/tasks/main.yml.working_reference b/cluster/roles/firewalld/tasks/main.yml.working_reference new file mode 100644 index 0000000..ee78ff7 --- /dev/null +++ b/cluster/roles/firewalld/tasks/main.yml.working_reference @@ -0,0 +1,670 @@ +# Copyright 2022 OCF Ltd. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# -*- coding: utf-8 -*- +# vim: ft=yaml +--- + +######## inherit custom variables from hostvars/firewalld.yml + +# this works - much tidier solution - don't have to load EVERYTHING under steel just overrides - will this work with deep dicts though?? put a control task in here to notify a user of ANY clash +# - name: merge steel['firewalld'] over role defaults +# set_fact: +# firewalld: "{{ firewalld | default({}) | combine( steel['firewalld'], recursive=True) }}" +# when: steel['firewalld'] is defined + + + + + +######## inherit custom variables from inventory/host_vars/firewalld.yml + +# deep merging of dicts for variable overlay is handled 'once' by a 3rd party ansible plugin +# the rest of the playbook explicitly does not use this plugin in case it is deprecated or functionally duplicated by ansible native in future +# - ansible_merge_vars + +- name: merge custom vars over inbuilt vars + block: + + - name: copy sets of 'to merge' vars to ansible_merge_vars compatible named variables + include_vars: + file: "{{ role_path }}/defaults/main.yml" + name: merge + +# merge precedence is controlled by alphabetical name of merge variables, variable declaration order seems to also influence behaviour in some circumstances +# use the order and format as below + - name: copy sets of 'to merge' vars to ansible_merge_vars compatible named variables + set_fact: + a_inbuilt_config__to_merge: "{{ merge['firewalld'] }}" + b_custom_config__to_merge: "{{ vars['steel']['firewalld'] }}" + + - name: merge custom vars over inbuilt vars + merge_vars: + suffix_to_merge: _config__to_merge + merged_var_name: firewalld_merged + expected_type: 'dict' + recursive_dict_merge: true + + when: + - steel['firewalld'] is defined + +- name: fallback behaviour - where no custom vars inventory/host_vars/firewalld.yml + set_fact: + firewalld_merged: "{{ firewalld }}" + when: + - steel['firewalld'] is not defined + +# - debug: +# msg: +# - "{{ firewalld_merged }}" + +# - fail: +# msg: + +######## setup
packages + +- name: update package facts + ansible.builtin.package_facts: + manager: auto + strategy: all + when: ansible_facts['packages'] is not defined + +- name: install firewalld packages + block: + + - name: install firewalld + ansible.builtin.package: + name: + - firewalld + - ipset + - nftables + state: latest + + - name: Install python-firewall + package: + name: python-firewall + state: present + when: + - ansible_facts['os_family'] == 'RedHat' and ansible_facts['distribution_major_version'] == '7' + + - name: Install python3-firewall + package: + name: python3-firewall + state: present + when: + - ansible_facts['os_family'] == 'RedHat' and ansible_facts['distribution_major_version'] == '8' + + when: + - vars['firewalld']['enable'] | bool + - ansible_facts['packages']['firewalld'] is not defined or + ansible_facts['packages']['ipset'] is not defined or + ansible_facts['packages']['nftables'] is not defined + +- name: update service facts + ansible.builtin.service_facts: + +######## disable firewall + +- name: disable firewalld + ansible.builtin.systemd: + name: firewalld + enabled: no + state: stopped + when: + - ansible_facts['services']['firewalld.service'] is not defined + # - not vars['firewalld']['enable'] | bool + - not vars['firewalld_merged']['enable'] | bool + +######## render firewalld config file + +- name: update INI entries in firewalld config + ini_file: + # path: "{{ firewalld.firewalld_conf_file }}" + path: "{{ firewalld_merged['firewalld_conf_file'] }}" + no_extra_spaces: true + # write to root of document not under a section + section: null + option: "{{ entry.key }}" + value: "{{ entry.value }}" + # loop: "{{ firewalld['firewalld_conf'] | dict2items }}" + loop: "{{ firewalld_merged['firewalld_conf'] | dict2items }}" + loop_control: + loop_var: entry + # notify: reload firewalld + when: + # - vars['firewalld']['enable'] | bool + - firewalld_merged['enable'] | bool + +######## map services to zones and networks + +# map host 'xcat_groups' 
(hostvars[ansible_hostname]) to services 'xcat_groups' (vars['firewalld']['firewalld_services'] list item ['xcat_groups']) +# determine if the service (firewall rule) is applicable to the host + +- name: map services to zones and networks + block: + + - name: find firewalld services to be applied to each xcat_groups that this host is a member of + set_fact: + target_services: "{{ target_services | default([]) + [service] }}" + # target_services: "{{ target_services | default([]) | combine({ 'firewalld_services': [firewalld_service] }, recursive=True) }}" + when: xcat_group in hostvars[ansible_hostname]['xcat_groups'] + with_subelements: + # - "{{ vars['firewalld']['firewalld_services'] }}" + - "{{ firewalld_merged['firewalld_services'] }}" + - xcat_groups + - skip_missing: True + loop_control: + loop_var: entry + vars: + xcat_group: "{{ entry.1 }}" + service: "{{ entry.0 }}" + + # - debug: + # msg: + # - "{{ target_services }}" + + - name: remove duplicate service entries where host in multiple xcat_groups + set_fact: + target_services: "{{ target_services | unique }}" + + - name: find all networks where the host has an interface (source = hostvars[ansible_hostname]['xcat_nics']) + set_fact: + nic_list: "{{ nic_list | default([]) + [xcat_nic] }}" + loop: "{{ hostvars[ansible_hostname]['xcat_nics'] | list }}" + loop_control: + loop_var: entry + vars: + xcat_nic: "{{ entry['network'] }}" + + # we need a double check task here - why? - what is in xcat is not necessarily what is configured on the host, i.e no ib0 adapter + # LAB has no ib0 - write something here to remove infiniband from nic_list + # + # - name: find all networks where the host has an interface (source = nmcli ), remove where adapter is not present on host + + + # UPDATE - prob dont need this! 
we can assign services to multiple zones + # + # account for the following condition: + # - where a service has multiple 'xcat_networks' entries (i.e cluster and infiniband) + # - and the host has network adapters in multiple 'xcat_networks' (i.e cluster and infiniband) + # we must ensure that the service is duplicated for each network + # - make unique services with the service name suffixed with _network (i.e ssh_cluster and ssh_infiniband) + # write role defensively even though in usual operation this may not occur + # + # - name: find firewalld services to be applied to each xcat_networks that this host is a member of + # set_fact: + # target_services_by_network: "{{ target_services_by_network | default([]) + [service] }}" + # when: network in nic_list + # with_subelements: + # - "{{ target_services }}" + # - xcat_networks + # - skip_missing: True + # loop_control: + # loop_var: entry + # vars: + # tmp_service: "{{ entry.0 }}" + # network: "{{ entry.1 }}" + # name: "{{ tmp_service['name'] }}_{{ network }}" + # description: "{{ tmp_service['description'] }} IPset {{ network }}" + # service: "{{ tmp_service | default({}) | combine({ 'name': name, 'short': name, 'description': description, 'network': network }) }}" + + # - debug: + # msg: + # - "{{ target_services_by_network }}" + + when: + # - vars['firewalld']['enable'] | bool + - firewalld_merged['enable'] | bool + # - not vars['firewalld']['enable'] | bool + +######## configure ipsets + +- name: configure ipsets + block: + + - name: list existing ipsets in /etc/firewalld/ipsets + find: + paths: "/etc/firewalld/ipsets/" + patterns: "*.xml" + recurse: no + file_type: file + register: ipsets_files_all + + - name: exclude ipsets managed by ansible + set_fact: + ipsets_files: "{{ ipsets_files | default([]) + [file_path] }}" + loop: "{{ ipsets_files_all['files'] }}" + loop_control: + loop_var: entry + vars: + file_path: "{{ entry['path'] }}" + file_name: "{{ entry['path'].split('/')[-1].split('.')[0] }}" + when: + - 
ipsets_files_all['files'] | length >0 + # - file_name not in firewalld['firewalld_ipsets'] + - file_name not in firewalld_merged['firewalld_ipsets'] + + # - debug: + # msg: + # - "{{ ipset_files }}" + + - name: disable ipsets not managed by ansible + copy: + remote_src: yes + src: "{{ file_path }}" + dest: "{{ new_file_path }}" + loop: "{{ ipsets_files }}" + loop_control: + loop_var: entry + vars: + file_path: "{{ entry }}" + new_file_path: "{{ entry.split('.')[0] }}.ansible_disabled" + register: ipsets_disabled + # notify: reload firewalld + when: + - ipsets_files is defined + - ipsets_files | length >0 + + - file: + path: "{{ file_path }}" + state: absent + loop: "{{ ipsets_files }}" + loop_control: + loop_var: entry + vars: + file_path: "{{ entry }}" + when: + - not ipsets_disabled['skipped'] | bool + + - name: generate ipsets from steel['xcat_networks'] + set_fact: + generated_ipsets: "{{ generated_ipsets | default({}) | combine({ 'firewalld_ipsets': { network_name: { 'short': network_name, 'description': description, 'type': 'hash:ip', 'targets': [network_cidr] } } }, recursive=True) }}" + # example with additional options + # generated_ipsets: "{{ generated_ipsets | default({}) | combine({ 'firewalld_ipsets': { network_name: { 'short': network_name, 'description': description, 'type': 'hash:ip', 'options': { 'maclem': [65536], 'timeout': [300], 'hashsize': [1024] }, 'targets': [network_cidr] } } }, recursive=True) }}" + loop: "{{ steel['xcat_networks'] | dict2items }}" + loop_control: + loop_var: entry + vars: + network_name: "{{ entry.key }}" + network_range: "{{ entry.value['network'] }}" + network_mask: "{{ entry.value['netmask'] }}" + network_cidr: "{{ network_range }}/{{ (network_range + '/' + network_mask) | ansible.utils.ipaddr('prefix') }}" + description: "{{ network_name }} ipset" + + # required where we have provided custom ipsets + - name: merge generated generate ipsets + set_fact: + # firewalld: "{{ firewalld | default({}) | combine( 
generated_ipsets, recursive=True) }}" + firewalld_merged: "{{ firewalld_merged | default({}) | combine( generated_ipsets, recursive=True) }}" + when: + - generated_ipsets is defined + + - name: render firewalld ipsets + template: + src: "{{ role_path }}/templates/ipset_template.xml.j2" + dest: /etc/firewalld/ipsets/{{ entry }}.xml + # loop: "{{ firewalld['firewalld_ipsets'] | list }}" + loop: "{{ firewalld_merged['firewalld_ipsets'] | list }}" + loop_control: + loop_var: entry + vars: + # short: "{{ firewalld['firewalld_ipsets'][entry]['short'] }}" + # description: "{{ firewalld['firewalld_ipsets'][entry]['description'] }}" + # type: "{{ firewalld['firewalld_ipsets'][entry]['type'] }}" + # options: "{{ firewalld['firewalld_ipsets'][entry]['options'] }}" + # targets: "{{ firewalld['firewalld_ipsets'][entry]['targets'] }}" + short: "{{ firewalld_merged['firewalld_ipsets'][entry]['short'] }}" + description: "{{ firewalld_merged['firewalld_ipsets'][entry]['description'] }}" + type: "{{ firewalld_merged['firewalld_ipsets'][entry]['type'] }}" + options: "{{ firewalld_merged['firewalld_ipsets'][entry]['options'] }}" + targets: "{{ firewalld_merged['firewalld_ipsets'][entry]['targets'] }}" + # notify: reload firewalld + when: + # - firewalld['firewalld_ipsets'] is defined + - firewalld_merged['firewalld_ipsets'] is defined + + when: + # - vars['firewalld']['enable'] | bool + - firewalld_merged['enable'] | bool + # - not vars['firewalld']['enable'] | bool + +######## configure services + +- name: configure services + block: + + - name: list existing services in /etc/firewalld/services + find: + paths: "/etc/firewalld/services/" + patterns: "*.xml" + recurse: no + file_type: file + register: services_files_all + + - name: exclude services managed by ansible + set_fact: + services_files: "{{ services_files | default([]) + [file_path] }}" + loop: "{{ services_files_all['files'] }}" + loop_control: + loop_var: entry + vars: + file_path: "{{ entry['path'] }}" + file_name: "{{ 
entry['path'].split('/')[-1].split('.')[0] }}" + when: + - services_files_all['files'] | length >0 + # - file_name not in firewalld['firewalld_services'] | map(attribute='name') + - file_name not in firewalld_merged['firewalld_services'] | map(attribute='name') + + # - debug: + # msg: + # - "{{ services_files }}" + + - name: disable services not managed by ansible + copy: + remote_src: yes + src: "{{ file_path }}" + dest: "{{ new_file_path }}" + loop: "{{ services_files }}" + loop_control: + loop_var: entry + vars: + file_path: "{{ entry }}" + new_file_path: "{{ entry.split('.')[0] }}.ansible_disabled" + register: services_disabled + # notify: reload firewalld + when: + - services_files is defined + - services_files | length >0 + + # - debug: + # msg: + # - "{{ services_disabled }}" + + - file: + path: "{{ file_path }}" + state: absent + loop: "{{ services_files }}" + loop_control: + loop_var: entry + vars: + file_path: "{{ entry }}" + when: + - not services_disabled['skipped'] | bool + + +# UPDATE - dont use this dict use a simpler one that hasnt had network added - pointless with the behaviour - logic is in zones +# the source of truth for services is actually "{{ target_services_by_network }}" - your loops above need to use this!! 
+# list of dicts so not changed the format - ok + # { + # "description": "DHCP Service", + # "name": "dhcpd_cluster", + # "network": "cluster", + # "port": [ + # { + # "port": 7911, + # "protocol": "tcp" + # } + # ], + # "short": "dhcp", + # "zone": "public" + # } +# NOW + # { + # "description": "TSEED service", + # "name": "tseed", + # "port": [ + # { + # "port": 22, + # "protocol": "tcp" + # } + # ], + # "short": "TSEED", + # "xcat_networks": [ + # "cluster", + # "infiniband", + # "tseed" + # ], + # "zone": "mgt" + # }, + + + - name: render firewalld services + template: + src: "{{ role_path }}/templates/service_template.xml.j2" + dest: /etc/firewalld/services/{{ name }}.xml + # debug: + # msg: + # - "name {{name}}" + # - "short {{short}}" + # - "description {{description}}" + # - "rules {{port}}" + # # - "ipset {{network}}" + # loop: "{{ target_services_by_network }}" + loop: "{{ target_services }}" + loop_control: + loop_var: entry + vars: + name: "{{ entry['name'] }}" + short: "{{ entry['short'] }}" + description: "{{ entry['description'] }}" + port: "{{ entry['port'] }}" + # notify: reload firewalld + when: + # - firewalld['firewalld_services'] is defined + # - firewalld['firewalld_services'] | length >0 + - firewalld_merged['firewalld_services'] is defined + - firewalld_merged['firewalld_services'] | length >0 + + when: + # - vars['firewalld']['enable'] | bool + - firewalld_merged['enable'] | bool + # - not vars['firewalld']['enable'] | bool + +######## configure zones + +- name: configure zones + block: + + # Update: rename firewalld_services xcat_networks to zones_or_xcat_networks - will have to put in a qualifying test to match an ipset though + + # there are no preset zone names, zones are dynamically generated from top level source inventory/networks.yml + # to create a custom zone + # - a custom firewalld_services entry with an (arbritrary) xcat_networks list item will generate a new zone + # - a custom firewalld_ipsets entry named the same as the 
custom services entry will be required to control ingress + # + - name: generate all zone names from xcat_networks entry in 'firewalld_merged['firewalld_services']' + set_fact: + # zone_list: "{{ zone_list | default([]) + [zone] }}" + zone_list: "{{ zone_list | default([]) + zone }}" + # loop: "{{ target_services_by_network }}" + loop: "{{ target_services }}" + loop_control: + loop_var: entry + vars: + # zone: "{{ entry['zone'] }}" + zone: "{{ entry['xcat_networks'] }}" + + - name: filter on unique zones from services + set_fact: + zone_list: "{{ zone_list | unique }}" + + # # loop unique zones, match all services bound to the zone using xcat_networks, get a list of service names + # - name: create zones dictionary + # set_fact: + # firewalld_zones: "{{ firewalld_zones | default([]) + ([{ 'name': zone_name, 'short': zone_name, 'description': zone_description, 'source': [{ 'ipset': zone_name }], 'service': service }] ) }}" + # # firewalld_zones: "{{ firewalld_zones | default([]) + ([{ 'name': zone_name, 'short': zone_name, 'description': zone_description, 'source': [{ 'ipset': zone_name }], 'service': [{ 'name': 'ssh' }] }] ) }}" # format we are looking for + # loop: "{{ zone_list }}" + # loop_control: + # loop_var: entry + # vars: + # zone_name: "{{ entry }}" + # zone_description: "{{ entry }} zone" + # service: "{{ target_services | selectattr('xcat_networks', 'search', entry) | map(attribute='name') }}" + + # - debug: + # msg: + # - "{{ target_services }}" + + # this is the pivotal task in the playbook to ensure the zones dictionary is in the format accepted for the jinja2 loops in the zone_template.xml.j2 + # this is voodoo, sneeze and its gone + # loop unique zones, match all services bound to the zone using xcat_networks, get a list of service names + # + - name: create zones dictionary + set_fact: + # firewalld_zones: "{{service_format1}}" + firewalld_zones: "{{ firewalld_zones | default([]) + ([{ 'name': zone_name, 'short': zone_name, 'description': 
zone_description, 'source': [{ 'ipset': zone_name }], 'service': service_trim }] ) }}" + # firewalld_zones: "{{ firewalld_zones | default([]) + ([{ 'name': zone_name, 'short': zone_name, 'description': zone_description, 'source': [{ 'ipset': zone_name }], 'service': [{ 'name': 'ssh' }] }] ) }}" # format we are looking for + # debug: + # msg: + # - "{{ service_trim }}" + loop: "{{ zone_list }}" + loop_control: + loop_var: entry + vars: + zone_name: "{{ entry }}" + zone_description: "{{ entry }} zone" + # use mapping to find list of services + service: "{{ target_services | selectattr('xcat_networks', 'search', entry) | map(attribute='name') }}" + # + # inline jinja to create a list of dicts for the services used in this zone + service_format: >- + {% set results = [] %} + {% for svc in service|default([]) %} + {% set sub_results = {} %} + {% set _ = sub_results.update({"name": svc}) %} + {% set _ = results.append(sub_results) %} + {% endfor -%} + {{results}} + # + # create a list of items that 'look' like one element dicts + # service_format: >- + # {% set results = [] %} + # {% for svc in service|default([]) %} + # {% set d = ({"name": svc}) %} + # {% set _ = results.append(d) %} + # {% endfor -%} + # {{ results }} + # + # trim whitespaces to allow ansible to interperet as list item in the firewalld_zones dict + service_trim: "{{ service_format | trim }}" + + # - debug: + # msg: + # - "{{ firewalld_zones }}" + + - name: list existing zones in /etc/firewalld/zones + find: + paths: "/etc/firewalld/zones/" + patterns: "*.xml" + recurse: no + file_type: file + register: zones_files_all + + - name: exclude zones managed by ansible + set_fact: + zone_files: "{{ zone_files | default([]) + [file_path] }}" + loop: "{{ zones_files_all['files'] }}" + loop_control: + loop_var: entry + vars: + file_path: "{{ entry['path'] }}" + file_name: "{{ entry['path'].split('/')[-1].split('.')[0] }}" + when: + - zones_files_all['files'] | length >0 + - file_name not in firewalld_zones | 
map(attribute='name') + + # - debug: + # msg: + # - "{{ zone_files }}" + + - name: disable zones not managed by ansible + copy: + remote_src: yes + src: "{{ file_path }}" + dest: "{{ new_file_path }}" + loop: "{{ zone_files }}" + loop_control: + loop_var: entry + vars: + file_path: "{{ entry }}" + new_file_path: "{{ entry.split('.')[0] }}.ansible_disabled" + register: zones_disabled + # notify: reload firewalld + when: + - zone_files is defined + - zone_files | length >0 + + # - debug: + # msg: + # - "{{ zones_disabled }}" + + - file: + path: "{{ file_path }}" + state: absent + loop: "{{ zone_files }}" + loop_control: + loop_var: entry + vars: + file_path: "{{ entry }}" + when: + - not zones_disabled['skipped'] | bool + + - name: render firewalld zones + template: + src: "{{ role_path }}/templates/zone_template.xml.j2" + dest: /etc/firewalld/zones/{{ name }}.xml + # debug: + # msg: + # # - "{{ name }}" + # # - "{{ short }}" + # # - "{{ description }}" + # # - "{{ service }}" + # # - "{{ ipset }}" + # - "{{ entry }}" + loop: "{{ firewalld_zones | list }}" + loop_control: + loop_var: entry + vars: + name: "{{ entry['name'] }}" + short: "{{ entry['short'] }}" + description: "{{ entry['description'] }}" + service: "{{ entry['service'] }}" + ipset: "{{ entry['name'] }}" + # notify: reload firewalld + when: + - firewalld_zones is defined + - firewalld_zones | length >0 + + when: + # - vars['firewalld']['enable'] | bool + - firewalld_merged['enable'] | bool + # - not vars['firewalld']['enable'] | bool + +######## start firewalld + +- name: Start and enable firewalld + service: + name: firewalld + state: started + enabled: true + when: + - ansible_facts['services']['firewalld.service'] is defined + # - vars['firewalld']['enable'] | bool + - firewalld_merged['enable'] | bool + +# - fail: +# msg: +# - "stop" + +# - name: Flush all handlers +# meta: flush_handlers \ No newline at end of file diff --git a/cluster/roles/firewalld/templates/firewalld.conf.j2 
b/cluster/roles/firewalld/templates/firewalld.conf.j2 new file mode 100644 index 0000000..387ffac --- /dev/null +++ b/cluster/roles/firewalld/templates/firewalld.conf.j2 @@ -0,0 +1,58 @@ +# firewalld config file + +# default zone +# The default zone used if an empty zone string is used. +# Default: public +DefaultZone=public + +# Minimal mark +# Marks up to this minimum are free for use for example in the direct +# interface. If more free marks are needed, increase the minimum +# Default: 100 +MinimalMark=100 + +# Clean up on exit +# If set to no or false the firewall configuration will not get cleaned up +# on exit or stop of firewalld +# Default: true +#CleanupOnExit=yes +CleanupOnExit=no + +# Lockdown +# If set to enabled, firewall changes with the D-Bus interface will be limited +# to applications that are listed in the lockdown whitelist. +# The lockdown whitelist file is lockdown-whitelist.xml +# Default: false +Lockdown=no + +# IPv6_rpfilter +# Performs a reverse path filter test on a packet for IPv6. If a reply to the +# packet would be sent via the same interface that the packet arrived on, the +# packet will match and be accepted, otherwise dropped. +# The rp_filter for IPv4 is controlled using sysctl. +# Default: true +IPv6_rpfilter=yes + +# IndividualCalls +# Do not use combined -restore calls, but individual calls. This increases the +# time that is needed to apply changes and to start the daemon, but is good for +# debugging. +# Default: false +IndividualCalls=no + +# LogDenied +# Add logging rules right before reject and drop rules in the INPUT, FORWARD +# and OUTPUT chains for the default rules and also final reject and drop rules +# in zones. Possible values are: all, unicast, broadcast, multicast and off. +# Default: off +LogDenied=off + +# AutomaticHelpers +# For the secure use of iptables and connection tracking helpers it is +# recommended to turn AutomaticHelpers off. 
But this might have side effects on +# other services using the netfilter helpers as the sysctl setting in +# /proc/sys/net/netfilter/nf_conntrack_helper will be changed. +# With the system setting, the default value set in the kernel or with sysctl +# will be used. Possible values are: true, no and system. +# Default: system +AutomaticHelpers=system diff --git a/cluster/roles/firewalld/templates/ipset_template.xml.j2 b/cluster/roles/firewalld/templates/ipset_template.xml.j2 new file mode 100644 index 0000000..46c2303 --- /dev/null +++ b/cluster/roles/firewalld/templates/ipset_template.xml.j2 @@ -0,0 +1,15 @@ + + +{% if short is defined %} + {{ short }} +{% endif %} +{% if description is defined %} + {{ description }} +{% endif %} +{% for name,value in (option|default({})).items() %} + diff --git a/cluster/roles/firewalld/templates/service_template.xml.j2 b/cluster/roles/firewalld/templates/service_template.xml.j2 new file mode 100644 index 0000000..1fa4c98 --- /dev/null +++ b/cluster/roles/firewalld/templates/service_template.xml.j2 @@ -0,0 +1,20 @@ + + +{% if short is defined %} + {{ short }} +{% endif %} +{% if description is defined %} + {{ description }} +{% endif %} +{% for tag in entry %} +{# Tags which can be used several times #} +{% if tag in ['port','protocol','source-port','module'] %} +{% for subtag in entry[tag] %} + <{{ tag }}{% for name,value in subtag.items() %} {{ name }}="{{ value }}"{% endfor %}/> +{% endfor %} +{# Tags which can be used once; tag is a key of entry, so attributes are read from the entry[tag] mapping #} +{% elif tag in ['destination'] %} + <{{ tag }}{% for name,value in entry[tag].items()|default({}) %} {{ name }}="{{ value }}"{% endfor %}/> +{% endif %} +{% endfor %} + diff --git a/cluster/roles/firewalld/templates/zone_template.xml.j2 b/cluster/roles/firewalld/templates/zone_template.xml.j2 new file mode 100644 index 0000000..92c3ef6 --- /dev/null +++ b/cluster/roles/firewalld/templates/zone_template.xml.j2 @@ -0,0 +1,35 @@ + + + {{ short|default(name)|upper }} +{% if description is defined %} + {{
description }} +{% endif %} +{% for tag in entry %} +{# Settings which can be used several times #} +{% if tag in ['interface','source','service','port','protocol','icmp-block','forward-port','source-port'] %} +{% for subtag in entry[tag] %} + <{{ tag }}{% for name,value in subtag.items() %} {{ name }}="{{ value }}"{% endfor %}/> +{% endfor %} +{# Settings which can be used once; entry is the zone dict supplied by the task's loop_var #} +{% elif tag in ['icmp-block-inversion','masquerade'] and entry[tag] == true %} + <{{ tag }}/> +{% endif %} +{% endfor %} +{# Begin rich rule; tag is a key of rule, so attributes are read from rule[tag] #} +{% for rule in entry.rule|default([]) %} + +{% for tag in rule %} +{% if tag in ['source','destination','service','port','icmp-block','icmp-type','masquerade','forward-port'] %} + <{{ tag }}{% for name,value in rule[tag].items()|default({}) %} {{ name }}="{{ value }}"{% endfor %}/> +{% elif tag in ['log','audit','accept','drop','mark','reject'] %} + <{{ tag }}{% for name,value in rule[tag].items() %} {{ name }}="{{ value }}"{% endfor %}> +{% endif %} +{% if tag.limit is defined %} + +{% endif %} + +{% endfor %} + +{# End rich rule #} +{% endfor %} + diff --git a/cluster/roles/firewalld/tests/inventory b/cluster/roles/firewalld/tests/inventory new file mode 100644 index 0000000..878877b --- /dev/null +++ b/cluster/roles/firewalld/tests/inventory @@ -0,0 +1,2 @@ +localhost + diff --git a/cluster/roles/firewalld/tests/test.yml b/cluster/roles/firewalld/tests/test.yml new file mode 100644 index 0000000..87930df --- /dev/null +++ b/cluster/roles/firewalld/tests/test.yml @@ -0,0 +1,5 @@ +--- +- hosts: localhost + remote_user: root + roles: + - prometheus diff --git a/cluster/roles/firewalld/vars/main.yml b/cluster/roles/firewalld/vars/main.yml new file mode 100644 index 0000000..f01bf99 --- /dev/null +++ b/cluster/roles/firewalld/vars/main.yml @@ -0,0 +1,2 @@ +--- +# vars file for template_role diff --git a/cluster/roles/hypervisor_network/.travis.yml b/cluster/roles/hypervisor_network/.travis.yml new file mode 100644 index 0000000..36bbf62 --- /dev/null +++
b/cluster/roles/hypervisor_network/.travis.yml @@ -0,0 +1,29 @@ +--- +language: python +python: "2.7" + +# Use the new container infrastructure +sudo: false + +# Install ansible +addons: + apt: + packages: + - python-pip + +install: + # Install ansible + - pip install ansible + + # Check ansible version + - ansible --version + + # Create ansible.cfg with correct roles_path + - printf '[defaults]\nroles_path=../' >ansible.cfg + +script: + # Basic role syntax check + - ansible-playbook tests/test.yml -i tests/inventory --syntax-check + +notifications: + webhooks: https://galaxy.ansible.com/api/v1/notifications/ \ No newline at end of file diff --git a/cluster/roles/hypervisor_network/README.md b/cluster/roles/hypervisor_network/README.md new file mode 100644 index 0000000..225dd44 --- /dev/null +++ b/cluster/roles/hypervisor_network/README.md @@ -0,0 +1,38 @@ +Role Name +========= + +A brief description of the role goes here. + +Requirements +------------ + +Any pre-requisites that may not be covered by Ansible itself or the role should be mentioned here. For instance, if the role uses the EC2 module, it may be a good idea to mention in this section that the boto package is required. + +Role Variables +-------------- + +A description of the settable variables for this role should go here, including any variables that are in defaults/main.yml, vars/main.yml, and any variables that can/should be set via parameters to the role. Any variables that are read from other roles and/or the global scope (ie. hostvars, group vars, etc.) should be mentioned here as well. + +Dependencies +------------ + +A list of other roles hosted on Galaxy should go here, plus any details in regards to parameters that may need to be set for other roles, or variables that are used from other roles. 
+ +Example Playbook +---------------- + +Including an example of how to use your role (for instance, with variables passed in as parameters) is always nice for users too: + + - hosts: servers + roles: + - { role: username.rolename, x: 42 } + +License +------- + +BSD + +Author Information +------------------ + +An optional section for the role authors to include contact information, or a website (HTML is not allowed). diff --git a/cluster/roles/hypervisor_network/defaults/main.yml b/cluster/roles/hypervisor_network/defaults/main.yml new file mode 100644 index 0000000..2bec87e --- /dev/null +++ b/cluster/roles/hypervisor_network/defaults/main.yml @@ -0,0 +1,2 @@ +--- +# defaults file for roles/role-template \ No newline at end of file diff --git a/cluster/roles/hypervisor_network/handlers/main.yml b/cluster/roles/hypervisor_network/handlers/main.yml new file mode 100644 index 0000000..2d28ec4 --- /dev/null +++ b/cluster/roles/hypervisor_network/handlers/main.yml @@ -0,0 +1,2 @@ +--- +# handlers file for roles/role-template \ No newline at end of file diff --git a/cluster/roles/hypervisor_network/meta/main.yml b/cluster/roles/hypervisor_network/meta/main.yml new file mode 100644 index 0000000..227ad9c --- /dev/null +++ b/cluster/roles/hypervisor_network/meta/main.yml @@ -0,0 +1,53 @@ +galaxy_info: + author: your name + description: your role description + company: your company (optional) + + # If the issue tracker for your role is not on github, uncomment the + # next line and provide a value + # issue_tracker_url: http://example.com/issue/tracker + + # Choose a valid license ID from https://spdx.org - some suggested licenses: + # - BSD-3-Clause (default) + # - MIT + # - GPL-2.0-or-later + # - GPL-3.0-only + # - Apache-2.0 + # - CC-BY-4.0 + license: license (GPL-2.0-or-later, MIT, etc) + + min_ansible_version: 2.9 + + # If this a Container Enabled role, provide the minimum Ansible Container version. 
+ # min_ansible_container_version: + + # + # Provide a list of supported platforms, and for each platform a list of versions. + # If you don't wish to enumerate all versions for a particular platform, use 'all'. + # To view available platforms and versions (or releases), visit: + # https://galaxy.ansible.com/api/v1/platforms/ + # + # platforms: + # - name: Fedora + # versions: + # - all + # - 25 + # - name: SomePlatform + # versions: + # - all + # - 1.0 + # - 7 + # - 99.99 + + galaxy_tags: [] + # List tags for your role here, one per line. A tag is a keyword that describes + # and categorizes the role. Users find roles by searching for tags. Be sure to + # remove the '[]' above, if you add tags to this list. + # + # NOTE: A tag is limited to a single word comprised of alphanumeric characters. + # Maximum 20 tags per role. + +dependencies: [] + # List your role dependencies here, one per line. Be sure to remove the '[]' above, + # if you add dependencies to this list. + \ No newline at end of file diff --git a/cluster/roles/hypervisor_network/tasks/main.yml b/cluster/roles/hypervisor_network/tasks/main.yml new file mode 100644 index 0000000..1c343f5 --- /dev/null +++ b/cluster/roles/hypervisor_network/tasks/main.yml @@ -0,0 +1,110 @@ +--- +- name: find active interfaces + ansible.builtin.command: + cmd: ip -j a + register: _interfaces + +- name: match interface to mac + set_fact: + _interface: "{{ interface }}" + vars: + host: "{{ inventory_hostname }}" + mac: "{{ (hostvars['localhost']['mac_map'] | selectattr('host', '==', host) | map(attribute='mac'))[0] }}" + query: "[?address=='{{ mac }}'].ifname" + interface: "{{ (_interfaces['stdout'] | from_json | json_query(query))[0] }}" + +- name: find nmcli connections + ansible.builtin.command: + cmd: nmcli --get-values device,con-uuid,connection device + register: _connections + +- name: match nmcli connection to interface + set_fact: + _connection_remove_uuid: "{{ nmcli_con_uuid }}" + _connection_remove_name: "{{ 
nmcli_con_name }}" + loop: "{{ _connections['stdout_lines'] }}" + loop_control: + loop_var: entry + vars: + interface: "{{ entry.split(':')[0] }}" + nmcli_con_uuid: "{{ entry.split(':')[1] }}" + nmcli_con_name: "{{ entry.split(':')[2] }}" + when: + - _interface == interface + +- name: update nmcli connection + block: + + - name: create primary nmcli connection + ansible.builtin.command: + cmd: "{{ entry }}" + loop: + - nmcli con add con-name "{{ conn_name }}" type ethernet ifname "{{ ifname }}" ipv4.method manual ipv4.address "{{ ip4 }}" ipv4.gateway "{{ gw4 }}" ipv4.dns "{{ dns4 }}" ipv6.method link-local ipv6.addr-gen-mode eui64 connection.autoconnect yes + - nmcli con mod "{{ _connection_remove_uuid }}" connection.autoconnect no + loop_control: + loop_var: entry + vars: + nmcli_con: "primary" + conn_name: "{{ vars[config_namespace]['hypervisor']['nmcli_con_names'][nmcli_con] }}" + ifname: "{{ _interface }}" + host: "{{ inventory_hostname }}" + ip: "{{ (hostvars['localhost']['mac_map'] | selectattr('host', '==', host) | selectattr('nmcli_con', '==', nmcli_con) | map(attribute='ip'))[0] }}" + network: "{{ vars[config_namespace]['hypervisor']['cluster_networks'][conn_name]['network'] }}" + netmask: "{{ vars[config_namespace]['hypervisor']['cluster_networks'][conn_name]['netmask'] }}" + ip4: "{{ ip }}/{{ (network + '/' + netmask) | ansible.utils.ipaddr('prefix') }}" + gw4: "{{ vars[config_namespace]['hypervisor']['cluster_networks'][conn_name]['gateway'] }}" + dns4: "{{ vars[config_namespace]['hypervisor']['cluster_networks'][conn_name]['nameserver'] }}" + register: provision_connection + + - name: set new connection live + ansible.builtin.command: + cmd: "nmcli con up {{ conn_uuid }}" + vars: + conn_uuid: "{{ provision_connection['results'][0]['stdout'].split('(')[1].split(')')[0] }}" + async: 1 + poll: 0 + + - name: add "{{ inventory_hostname }}" to in-memory inventory with static ip + # ansible.builtin.add_host: > + # name={{ host }} + # groups={{ ['all', 
'hypervisor'] }} + # ansible_ssh_host={{ ansible_ssh_host }} + # ansible_ssh_common_args='-o "UserKnownHostsFile=/dev/null" -o "StrictHostKeyChecking=no"' + # ansible_user={{ ansible_user }} + # ansible_password={{ ansible_password }} + ansible.builtin.add_host: > + name={{ host }} + groups={{ active_role_groups }} + ansible_ssh_host={{ ansible_ssh_host }} + ansible_ssh_common_args='-o "UserKnownHostsFile=/dev/null" -o "StrictHostKeyChecking=no"' + ansible_user={{ ansible_user }} + ansible_password={{ ansible_password }} + vars: + host: "{{ inventory_hostname }}" + ansible_ssh_host: "{{ (hostvars['localhost']['mac_map'] | selectattr('host', '==', host) | map(attribute='ip'))[0] }}" + ansible_user: "{{ vars[config_namespace]['hypervisor']['ssh_user'] }}" + ansible_password: "{{ vars[config_namespace]['hypervisor']['ssh_password'] }}" + + - name: remove old connection + ansible.builtin.command: + cmd: "nmcli con del {{ _connection_remove_uuid }}" + + - name: update facts to include new interface + setup: + gather_subset: + - all_ipv4_addresses + - all_ipv6_addresses + - default_ipv4 + - default_ipv6 + - interfaces + + vars: + host: "{{ inventory_hostname }}" + ip: "{{ (hostvars['localhost']['mac_map'] | selectattr('host', '==', host) | selectattr('nmcli_con', '==', nmcli_con) | map(attribute='ip'))[0] }}" + dhcp_ip: "{{ (hostvars['localhost']['mac_map'] | selectattr('host', '==', host) | selectattr('nmcli_con', '==', nmcli_con) | map(attribute='dhcp_ip'))[0] }}" + connection_remove: "{{ _connection_remove_name }}" + nmcli_con: "primary" + conn_name: "{{ vars[config_namespace]['hypervisor']['nmcli_con_names'][nmcli_con] }}" + when: + - not ip == dhcp_ip or + not connection_remove == conn_name \ No newline at end of file diff --git a/cluster/roles/hypervisor_network/tests/inventory b/cluster/roles/hypervisor_network/tests/inventory new file mode 100644 index 0000000..878877b --- /dev/null +++ b/cluster/roles/hypervisor_network/tests/inventory @@ -0,0 +1,2 @@ +localhost 
+ diff --git a/cluster/roles/hypervisor_network/tests/test.yml b/cluster/roles/hypervisor_network/tests/test.yml new file mode 100644 index 0000000..c2fd2bd --- /dev/null +++ b/cluster/roles/hypervisor_network/tests/test.yml @@ -0,0 +1,5 @@ +--- +- hosts: localhost + remote_user: root + roles: + - roles/role-template \ No newline at end of file diff --git a/cluster/roles/hypervisor_network/vars/main.yml b/cluster/roles/hypervisor_network/vars/main.yml new file mode 100644 index 0000000..f655be4 --- /dev/null +++ b/cluster/roles/hypervisor_network/vars/main.yml @@ -0,0 +1,2 @@ +--- +# vars file for roles/role-template \ No newline at end of file diff --git a/cluster/roles/hypervisor_prep/.travis.yml b/cluster/roles/hypervisor_prep/.travis.yml new file mode 100644 index 0000000..36bbf62 --- /dev/null +++ b/cluster/roles/hypervisor_prep/.travis.yml @@ -0,0 +1,29 @@ +--- +language: python +python: "2.7" + +# Use the new container infrastructure +sudo: false + +# Install ansible +addons: + apt: + packages: + - python-pip + +install: + # Install ansible + - pip install ansible + + # Check ansible version + - ansible --version + + # Create ansible.cfg with correct roles_path + - printf '[defaults]\nroles_path=../' >ansible.cfg + +script: + # Basic role syntax check + - ansible-playbook tests/test.yml -i tests/inventory --syntax-check + +notifications: + webhooks: https://galaxy.ansible.com/api/v1/notifications/ \ No newline at end of file diff --git a/cluster/roles/hypervisor_prep/README.md b/cluster/roles/hypervisor_prep/README.md new file mode 100644 index 0000000..225dd44 --- /dev/null +++ b/cluster/roles/hypervisor_prep/README.md @@ -0,0 +1,38 @@ +Role Name +========= + +A brief description of the role goes here. + +Requirements +------------ + +Any pre-requisites that may not be covered by Ansible itself or the role should be mentioned here. 
For instance, if the role uses the EC2 module, it may be a good idea to mention in this section that the boto package is required. + +Role Variables +-------------- + +A description of the settable variables for this role should go here, including any variables that are in defaults/main.yml, vars/main.yml, and any variables that can/should be set via parameters to the role. Any variables that are read from other roles and/or the global scope (ie. hostvars, group vars, etc.) should be mentioned here as well. + +Dependencies +------------ + +A list of other roles hosted on Galaxy should go here, plus any details in regards to parameters that may need to be set for other roles, or variables that are used from other roles. + +Example Playbook +---------------- + +Including an example of how to use your role (for instance, with variables passed in as parameters) is always nice for users too: + + - hosts: servers + roles: + - { role: username.rolename, x: 42 } + +License +------- + +BSD + +Author Information +------------------ + +An optional section for the role authors to include contact information, or a website (HTML is not allowed). 
diff --git a/cluster/roles/hypervisor_prep/defaults/main.yml b/cluster/roles/hypervisor_prep/defaults/main.yml new file mode 100644 index 0000000..755d7a7 --- /dev/null +++ b/cluster/roles/hypervisor_prep/defaults/main.yml @@ -0,0 +1,8 @@ +--- +hypervisor_prep: + container_dir: "/opt/containers" + compose_dir: "/compose" + bin_dir: "/bin" + data_dir: "/data" + etc_dir: "/etc" + docker_compose_url: "https://github.com/docker/compose/releases/download/v2.6.0/docker-compose-linux-x86_64" \ No newline at end of file diff --git a/cluster/roles/hypervisor_prep/handlers/main.yml b/cluster/roles/hypervisor_prep/handlers/main.yml new file mode 100644 index 0000000..2d28ec4 --- /dev/null +++ b/cluster/roles/hypervisor_prep/handlers/main.yml @@ -0,0 +1,2 @@ +--- +# handlers file for roles/role-template \ No newline at end of file diff --git a/cluster/roles/hypervisor_prep/meta/main.yml b/cluster/roles/hypervisor_prep/meta/main.yml new file mode 100644 index 0000000..227ad9c --- /dev/null +++ b/cluster/roles/hypervisor_prep/meta/main.yml @@ -0,0 +1,53 @@ +galaxy_info: + author: your name + description: your role description + company: your company (optional) + + # If the issue tracker for your role is not on github, uncomment the + # next line and provide a value + # issue_tracker_url: http://example.com/issue/tracker + + # Choose a valid license ID from https://spdx.org - some suggested licenses: + # - BSD-3-Clause (default) + # - MIT + # - GPL-2.0-or-later + # - GPL-3.0-only + # - Apache-2.0 + # - CC-BY-4.0 + license: license (GPL-2.0-or-later, MIT, etc) + + min_ansible_version: 2.9 + + # If this a Container Enabled role, provide the minimum Ansible Container version. + # min_ansible_container_version: + + # + # Provide a list of supported platforms, and for each platform a list of versions. + # If you don't wish to enumerate all versions for a particular platform, use 'all'. 
+ # To view available platforms and versions (or releases), visit: + # https://galaxy.ansible.com/api/v1/platforms/ + # + # platforms: + # - name: Fedora + # versions: + # - all + # - 25 + # - name: SomePlatform + # versions: + # - all + # - 1.0 + # - 7 + # - 99.99 + + galaxy_tags: [] + # List tags for your role here, one per line. A tag is a keyword that describes + # and categorizes the role. Users find roles by searching for tags. Be sure to + # remove the '[]' above, if you add tags to this list. + # + # NOTE: A tag is limited to a single word comprised of alphanumeric characters. + # Maximum 20 tags per role. + +dependencies: [] + # List your role dependencies here, one per line. Be sure to remove the '[]' above, + # if you add dependencies to this list. + \ No newline at end of file diff --git a/cluster/roles/hypervisor_prep/tasks/main.yml b/cluster/roles/hypervisor_prep/tasks/main.yml new file mode 100644 index 0000000..780bf2e --- /dev/null +++ b/cluster/roles/hypervisor_prep/tasks/main.yml @@ -0,0 +1,169 @@ +--- +######## runtime_facts +- name: runtime facts + ansible.builtin.set_fact: + _docker_compose_url: "{{ hypervisor_prep['docker_compose_url'] }}" + _container_dir: "{{ hypervisor_prep['container_dir'] }}" + _compose_directory: "{{ hypervisor_prep['container_dir'] }}{{ hypervisor_prep['compose_dir'] }}" + _bin_directory: "{{ hypervisor_prep['container_dir'] }}{{ hypervisor_prep['bin_dir'] }}" + _etc_directory: "{{ hypervisor_prep['container_dir'] }}{{ hypervisor_prep['etc_dir'] }}" + _data_directory: "{{ hypervisor_prep['container_dir'] }}{{ hypervisor_prep['data_dir'] }}" + +######## copy ssh pub key + +- name: Authorize local SSH pub key on all hosts + authorized_key: + key: "{{ lookup('file', '~/.ssh/id_rsa.pub') }}" + comment: "" + user: root + state: present + +######## set hostname + +- name: change hostname + hostname: + name: "{{ inventory_hostname }}" + +- name: add hostname to /etc/hosts ipv4 + lineinfile: + dest: /etc/hosts + regexp: 
'^127\.0\.0\.1[ \t]+localhost' + line: "127.0.0.1 {{ inventory_hostname }}.{{ vars[config_namespace]['env']['cluster_domain'] }} {{ inventory_hostname }} localhost" + state: present + +- name: add hostname to /etc/hosts ipv6 + lineinfile: + dest: /etc/hosts + regexp: '^\:\:1[ \t]+localhost' + line: "::1 {{ inventory_hostname }}.{{ vars[config_namespace]['env']['cluster_domain'] }} {{ inventory_hostname }} localhost localhost.localdomain localhost6 localhost6.localdomain6" + state: present + +- name: add cluster hosts to /etc/hosts + lineinfile: + path: /etc/hosts + regexp: ".*[ \t]+{{ host }}" + line: "{{ ip }} {{ host }}.{{ vars[config_namespace]['env']['cluster_domain'] }} {{ host }}" + state: present + loop: "{{ groups['hypervisor'] | list | difference([inventory_hostname]) }}" # difference() expects a list; wrap the hostname so this host is reliably excluded and its localhost lines are not rewritten + loop_control: + loop_var: entry + vars: + host: "{{ entry }}" + # ip: "{{ hostvars[host]['ansible_default_ipv4']['address'] }}" + ip: "{{ hypervisor['mac_map'] | selectattr('host', '==', entry) | map(attribute='ip') | first }}" + +######## change security + +- name: set SELinux to permissive mode, podman requires selinux + ansible.posix.selinux: + policy: targeted + state: permissive + +- name: disable firewalld + ansible.builtin.systemd: + state: stopped + enabled: no + name: firewalld + +######## install podman + +- name: install podman on first hypervisor or all ceph nodes + block: + + - name: update package facts + ansible.builtin.package_facts: + manager: auto + strategy: all + + - name: install podman + package: + name: "podman" + state: present + + # start/stop podman in the shell via service podman.socket, this will restart podman.service, podman.socket is used for docker-compose integration + - name: enable podman services + ansible.builtin.systemd: + name: podman.socket + enabled: yes + state: started + + - name: install docker-compose + ansible.builtin.get_url: + url : "{{ _docker_compose_url }}" + dest: /usr/local/bin/docker-compose + mode: 0750 + + - name: softlink podman
socket to docker socket + ansible.builtin.file: + src: /run/podman/podman.sock + dest: /var/run/docker.sock + owner: root + group: root + state: link + + - name: create container directories + ansible.builtin.file: + path: "{{ entry }}" + state: directory + owner: root + group: root + mode: 0755 + loop: + - "{{ _container_dir }}" + - "{{ _compose_directory }}" + - "{{ _bin_directory }}" + - "{{ _etc_directory }}" + - "{{ _data_directory }}" + loop_control: + loop_var: entry + + vars: + hypervisor_host: "{{ groups['hypervisor'] | first }}" + host: "{{ inventory_hostname }}" + when: + - hypervisor_host == host or + host in groups['ceph'] + +######## setup LVM for ceph + +- name: setup LVM for ceph + block: + + - name: read device information + community.general.parted: + device: "{{ hypervisor['ceph_disk'] }}" + unit: "MiB" + register: device_info + + - name: create new primary partition for LVM + community.general.parted: + device: "{{ hypervisor['ceph_disk'] }}" + number: "{{ partition_number }}" + unit: MiB + part_start: "{{ part_start | int + 1 }}MiB" + part_end: "100%" + flags: [ lvm ] + label: gpt + part_type: primary + state: present + vars: + last_partition: "{{ device_info['partitions'] | length - 1 }}" + part_start: "{{ device_info['partitions'][last_partition | int ]['end'] }}" + partition_number: "{{ device_info['partitions'] | length + 1 }}" + + - name: create volume group + community.general.lvg: + vg: ceph + # /dev/nvme0n1p4 + pvs: "{{ hypervisor['ceph_disk'] }}p{{ partition_number }}" + vars: + partition_number: "{{ device_info['partitions'] | length + 1 }}" + + - name: create logical volume + community.general.lvol: + vg: ceph + lv: ceph_data + size: 100%FREE + + when: + - not ansible_lvm['lvs']['ceph_data'] is defined \ No newline at end of file diff --git a/cluster/roles/hypervisor_prep/tests/inventory b/cluster/roles/hypervisor_prep/tests/inventory new file mode 100644 index 0000000..878877b --- /dev/null +++ 
b/cluster/roles/hypervisor_prep/tests/inventory @@ -0,0 +1,2 @@ +localhost + diff --git a/cluster/roles/hypervisor_prep/tests/test.yml b/cluster/roles/hypervisor_prep/tests/test.yml new file mode 100644 index 0000000..c2fd2bd --- /dev/null +++ b/cluster/roles/hypervisor_prep/tests/test.yml @@ -0,0 +1,5 @@ +--- +- hosts: localhost + remote_user: root + roles: + - roles/role-template \ No newline at end of file diff --git a/cluster/roles/hypervisor_prep/vars/main.yml b/cluster/roles/hypervisor_prep/vars/main.yml new file mode 100644 index 0000000..f655be4 --- /dev/null +++ b/cluster/roles/hypervisor_prep/vars/main.yml @@ -0,0 +1,2 @@ +--- +# vars file for roles/role-template \ No newline at end of file diff --git a/cluster/roles/hypervisor_vxlan/.travis.yml b/cluster/roles/hypervisor_vxlan/.travis.yml new file mode 100644 index 0000000..36bbf62 --- /dev/null +++ b/cluster/roles/hypervisor_vxlan/.travis.yml @@ -0,0 +1,29 @@ +--- +language: python +python: "2.7" + +# Use the new container infrastructure +sudo: false + +# Install ansible +addons: + apt: + packages: + - python-pip + +install: + # Install ansible + - pip install ansible + + # Check ansible version + - ansible --version + + # Create ansible.cfg with correct roles_path + - printf '[defaults]\nroles_path=../' >ansible.cfg + +script: + # Basic role syntax check + - ansible-playbook tests/test.yml -i tests/inventory --syntax-check + +notifications: + webhooks: https://galaxy.ansible.com/api/v1/notifications/ \ No newline at end of file diff --git a/cluster/roles/hypervisor_vxlan/README.md b/cluster/roles/hypervisor_vxlan/README.md new file mode 100644 index 0000000..225dd44 --- /dev/null +++ b/cluster/roles/hypervisor_vxlan/README.md @@ -0,0 +1,38 @@ +Role Name +========= + +A brief description of the role goes here. + +Requirements +------------ + +Any pre-requisites that may not be covered by Ansible itself or the role should be mentioned here. 
For instance, if the role uses the EC2 module, it may be a good idea to mention in this section that the boto package is required. + +Role Variables +-------------- + +A description of the settable variables for this role should go here, including any variables that are in defaults/main.yml, vars/main.yml, and any variables that can/should be set via parameters to the role. Any variables that are read from other roles and/or the global scope (ie. hostvars, group vars, etc.) should be mentioned here as well. + +Dependencies +------------ + +A list of other roles hosted on Galaxy should go here, plus any details in regards to parameters that may need to be set for other roles, or variables that are used from other roles. + +Example Playbook +---------------- + +Including an example of how to use your role (for instance, with variables passed in as parameters) is always nice for users too: + + - hosts: servers + roles: + - { role: username.rolename, x: 42 } + +License +------- + +BSD + +Author Information +------------------ + +An optional section for the role authors to include contact information, or a website (HTML is not allowed). 
diff --git a/cluster/roles/hypervisor_vxlan/defaults/main.yml b/cluster/roles/hypervisor_vxlan/defaults/main.yml new file mode 100644 index 0000000..2bec87e --- /dev/null +++ b/cluster/roles/hypervisor_vxlan/defaults/main.yml @@ -0,0 +1,2 @@ +--- +# defaults file for roles/role-template \ No newline at end of file diff --git a/cluster/roles/hypervisor_vxlan/handlers/main.yml b/cluster/roles/hypervisor_vxlan/handlers/main.yml new file mode 100644 index 0000000..2d28ec4 --- /dev/null +++ b/cluster/roles/hypervisor_vxlan/handlers/main.yml @@ -0,0 +1,2 @@ +--- +# handlers file for roles/role-template \ No newline at end of file diff --git a/cluster/roles/hypervisor_vxlan/meta/main.yml b/cluster/roles/hypervisor_vxlan/meta/main.yml new file mode 100644 index 0000000..227ad9c --- /dev/null +++ b/cluster/roles/hypervisor_vxlan/meta/main.yml @@ -0,0 +1,53 @@ +galaxy_info: + author: your name + description: your role description + company: your company (optional) + + # If the issue tracker for your role is not on github, uncomment the + # next line and provide a value + # issue_tracker_url: http://example.com/issue/tracker + + # Choose a valid license ID from https://spdx.org - some suggested licenses: + # - BSD-3-Clause (default) + # - MIT + # - GPL-2.0-or-later + # - GPL-3.0-only + # - Apache-2.0 + # - CC-BY-4.0 + license: license (GPL-2.0-or-later, MIT, etc) + + min_ansible_version: 2.9 + + # If this a Container Enabled role, provide the minimum Ansible Container version. + # min_ansible_container_version: + + # + # Provide a list of supported platforms, and for each platform a list of versions. + # If you don't wish to enumerate all versions for a particular platform, use 'all'. 
+ # To view available platforms and versions (or releases), visit: + # https://galaxy.ansible.com/api/v1/platforms/ + # + # platforms: + # - name: Fedora + # versions: + # - all + # - 25 + # - name: SomePlatform + # versions: + # - all + # - 1.0 + # - 7 + # - 99.99 + + galaxy_tags: [] + # List tags for your role here, one per line. A tag is a keyword that describes + # and categorizes the role. Users find roles by searching for tags. Be sure to + # remove the '[]' above, if you add tags to this list. + # + # NOTE: A tag is limited to a single word comprised of alphanumeric characters. + # Maximum 20 tags per role. + +dependencies: [] + # List your role dependencies here, one per line. Be sure to remove the '[]' above, + # if you add dependencies to this list. + \ No newline at end of file diff --git a/cluster/roles/hypervisor_vxlan/tasks/create_bridges.yml b/cluster/roles/hypervisor_vxlan/tasks/create_bridges.yml new file mode 100644 index 0000000..cd80ceb --- /dev/null +++ b/cluster/roles/hypervisor_vxlan/tasks/create_bridges.yml @@ -0,0 +1,160 @@ +--- +- name: load global port dict + set_fact: + _port_dict: "{{ hostvars['localhost']['_global_port_dict'] }}" + when: + - hostvars['localhost']['_global_port_dict'] is defined + +- name: set runtime facts + set_fact: + _vxlan_id: "{{ 1000 + ((ansible_loop['index0'] | int) * 1000 ) }}" + _cluster_network: "{{ cluster_network }}" + +- name: build global port dict + set_fact: + # ipv4 vxlan endpoints + #_port_dict: "{{ _port_dict | default({}) | combine({ network: { local_host: { remote_host: { 'name': port_name, 'vxlan_id': vxlan_id, 'remote_ip': remote_ip } } } }, recursive=True) | combine({ network: { remote_host: { local_host: { 'name': rev_port_name, 'vxlan_id': vxlan_id, 'remote_ip': local_ip } } } }, recursive=True) }}" + # ipv6 link-local vxlan endpoints + # _port_dict: "{{ _port_dict | default({}) | combine({ network: { local_host: { remote_host: { 'name': port_name, 'vxlan_id': vxlan_id, 'local_ip': local_ipv6, 
'remote_ip': remote_ipv6 } } } }, recursive=True) | combine({ network: { remote_host: { local_host: { 'name': rev_port_name, 'vxlan_id': vxlan_id, 'local_ip': remote_ipv6, 'remote_ip': local_ipv6 } } } }, recursive=True) }}" + _port_dict: "{{ _port_dict | default({}) | combine({ network: { local_host: { remote_host: { 'name': port_name, 'vxlan_id': vxlan_id, 'local_ip': local_ip, 'remote_ip': remote_ip } } } }, recursive=True) | combine({ network: { remote_host: { local_host: { 'name': rev_port_name, 'vxlan_id': vxlan_id, 'local_ip': remote_ip, 'remote_ip': local_ip } } } }, recursive=True) }}" + loop: "{{ hypervisor_list|product(hypervisor_list) }}" + loop_control: + loop_var: entry + extended: yes + vars: + hypervisor_list: "{{ hostvars[ansible_hostname]['groups']['hypervisor'] }}" + network: "{{ _cluster_network }}" + local_host: "{{ entry[0] }}" + local_ip: "{{ hostvars[local_host]['ansible_default_ipv4']['address'] }}" + remote_host: "{{ entry[1] }}" + remote_ip: "{{ hostvars[remote_host]['ansible_default_ipv4']['address'] }}" + port_name: "{{ network }}_{{ remote_host }}" + rev_port_name: "{{ network }}_{{ local_host }}" + vxlan_id: "{{ ((_vxlan_id | int) + (ansible_loop['index0'] | int)) }}" + # ipv6 link-local tunnel endpoints, relies upon nmcli ipv6.addr-gen-mode eui64 + local_mac: "{{ (vars[config_namespace]['hypervisor']['mac_map'] | selectattr('host', '==', local_host) | map(attribute='mac'))[0] }}" + local_ipv6: "{{ 'fe80::0000:0000:0000:0000' | ansible.utils.slaac(local_mac) }}" + remote_mac: "{{ (vars[config_namespace]['hypervisor']['mac_map'] | selectattr('host', '==', remote_host) | map(attribute='mac'))[0] }}" + remote_ipv6: "{{ 'fe80::0000:0000:0000:0000' | ansible.utils.slaac(remote_mac) }}" + when: + - not local_host == remote_host + - local_host == ansible_hostname + - not _port_dict[network][local_host][remote_host] is defined + +- name: set host ip last octet + set_fact: + _host_oct: "{{ _host_oct | default({}) | combine({entry: oct}, 
recursive=True) }}" + loop: "{{ hostvars[ansible_hostname]['groups']['hypervisor'] }}" + loop_control: + loop_var: entry + extended: yes + vars: + oct: "{{ 11 + (ansible_loop['index0'] | int) }}" + +# - debug: +# msg: +# - "{{ _cluster_network }}" +# - "{{ _port_dict }}" + +- name: setup bridge for vxlan + community.general.nmcli: + conn_name: "{{ _cluster_network }}" + # ifname: "{{ _cluster_network }}" + type: bridge + ip4: "{{ ip }}" + # method4: manual + # method6: disabled + state: present + # autoconnect: yes + vars: + network: "{{ vars[config_namespace]['cluster_networks'][_cluster_network]['network'] }}" + netmask: "{{ vars[config_namespace]['cluster_networks'][_cluster_network]['netmask'] }}" + prefix: "{{ (network + '/' + netmask) | ansible.utils.ipaddr('prefix') }}" + oct: "{{ _host_oct[ansible_hostname] }}" + ip: "{{ (network + '/' + prefix) | ansible.utils.ipaddr(oct) }}" + +- name: remove existing vxlan ports + ansible.builtin.command: + cmd: nmcli connection del "{{ port_name }}" + loop: "{{ _port_dict[_cluster_network][ansible_hostname] | list }}" + loop_control: + loop_var: entry + vars: + # vxlan port names are max 15 char + port_name_prefix: "{{ _port_dict[_cluster_network][ansible_hostname][entry]['name'].split('_')[0] }}" + port_name_suffix: "{{ _port_dict[_cluster_network][ansible_hostname][entry]['name'].split('_')[1] }}" + port_name_prefix_max_char: "{{ 15 - (port_name_suffix | length + 1) }}" + port_name: "{{ port_name_prefix[:(port_name_prefix_max_char | int)] }}_{{ port_name_suffix }}" + ignore_errors: True + +- name: setup vxlan port for bridge + ansible.builtin.command: + cmd: nmcli connection add type vxlan slave-type bridge con-name "{{ port_name }}" ifname "{{ port_name }}" id "{{ vxlan_id }}" local "{{ local_ip }}" remote "{{ remote_ip }}" master "{{ _cluster_network }}" + loop: "{{ _port_dict[_cluster_network][ansible_hostname] | list }}" + loop_control: + loop_var: entry + vars: + vxlan_id: "{{ 
_port_dict[_cluster_network][ansible_hostname][entry]['vxlan_id'] }}" + remote_ip: "{{ _port_dict[_cluster_network][ansible_hostname][entry]['remote_ip'] }}" + # local_ip: "{{ hostvars[ansible_hostname]['ansible_default_ipv4']['address'] }}" + local_ip: "{{ _port_dict[_cluster_network][ansible_hostname][entry]['local_ip'] }}" + # vxlan port names are max 15 char + port_name_prefix: "{{ _port_dict[_cluster_network][ansible_hostname][entry]['name'].split('_')[0] }}" + port_name_suffix: "{{ _port_dict[_cluster_network][ansible_hostname][entry]['name'].split('_')[1] }}" + port_name_prefix_max_char: "{{ 15 - (port_name_suffix | length + 1) }}" + port_name: "{{ port_name_prefix[:(port_name_prefix_max_char | int)] }}_{{ port_name_suffix }}" + +- name: write global port dict + set_fact: + _global_port_dict: "{{ _port_dict }}" + delegate_to: localhost + delegate_facts: true + +# - debug: +# msg: +# - "{{ hostvars['localhost']['_global_port_dict'] }}" + +# nmcli con del campus_qemu02 campus_qemu03 cluster_qemu02 cluster_qemu03 infiniba_qemu02 infiniba_qemu03 ipmi_qemu02 ipmi_qemu03 storage_qemu02 storage_qemu03 campus_qemu01 cluster_qemu01 infiniba_qemu01 ipmi_qemu01 storage_qemu01 +# nmcli con del br_cluster campus cluster infiniband ipmi storage cluster_qemu02 cluster_qemu03 campus_qemu02 campus_qemu03 infiniba_qemu02 infiniba_qemu03 ipmi_qemu02 ipmi_qemu03 storage_qemu03 storage_qemu02 campus_qemu01 cluster_qemu01 infiniba_qemu01 ipmi_qemu01 storage_qemu01 campus_br + +# working config +# +# qemu01 +# nmcli connection add type bridge con-name br0 ifname br0 ipv4.method disabled ipv6.method disabled +# +# nmcli connection add type bridge con-name br0 ifname br0 ipv4.method manual ipv4.addresses 192.168.180.20/24 ipv6.method disabled ;\ +# nmcli connection add type vxlan slave-type bridge con-name br0-vxlan10 ifname vxlan10 id 10 local 192.168.140.41 remote 192.168.140.42 master br0 ;\ +# nmcli connection add type vxlan slave-type bridge con-name br0-vxlan11 ifname vxlan11 
id 11 local 192.168.140.41 remote 192.168.140.43 master br0 ;\ +# nmcli con show +# +# nmcli connection add type bridge con-name campus ifname campus ipv4.method manual ipv4.addresses 192.168.180.20/24 ipv6.method disabled ;\ +# nmcli connection add type vxlan slave-type bridge con-name campus_qemu02 ifname campus_qemu02 id 10 local 192.168.140.41 remote 192.168.140.42 master campus ;\ +# nmcli connection add type vxlan slave-type bridge con-name campus_qemu03 ifname campus_qemu03 id 11 local 192.168.140.41 remote 192.168.140.43 master campus ;\ +# nmcli con show +# # qemu02 +# nmcli connection add type bridge con-name br0 ifname br0 ipv4.method disabled ipv6.method disabled +# +# nmcli connection add type bridge con-name br0 ifname br0 ipv4.method manual ipv4.addresses 192.168.180.21/24 ipv6.method disabled ;\ +# nmcli connection add type vxlan slave-type bridge con-name br0-vxlan10 ifname vxlan10 id 10 local 192.168.140.42 remote 192.168.140.41 master br0 ;\ +# nmcli connection add type vxlan slave-type bridge con-name br0-vxlan12 ifname vxlan12 id 12 local 192.168.140.42 remote 192.168.140.43 master br0 ;\ +# nmcli con show +# +# nmcli connection add type bridge con-name campus ifname campus ipv4.method manual ipv4.addresses 192.168.180.21/24 ipv6.method disabled ;\ +# nmcli connection add type vxlan slave-type bridge con-name campus_qemu01 ifname campus_qemu01 id 10 local 192.168.140.42 remote 192.168.140.41 master campus ;\ +# nmcli connection add type vxlan slave-type bridge con-name campus_qemu03 ifname campus_qemu03 id 12 local 192.168.140.42 remote 192.168.140.43 master campus ;\ +# nmcli con show +# # qemu03 +# nmcli connection add type bridge con-name br0 ifname br0 ipv4.method disabled ipv6.method disabled +# +# nmcli connection add type bridge con-name br0 ifname br0 ipv4.method manual ipv4.addresses 192.168.180.22/24 ipv6.method disabled ;\ +# nmcli connection add type vxlan slave-type bridge con-name br0-vxlan11 ifname vxlan11 id 11 local 
192.168.140.43 remote 192.168.140.41 master br0 ;\ +# nmcli connection add type vxlan slave-type bridge con-name br0-vxlan12 ifname vxlan12 id 12 local 192.168.140.43 remote 192.168.140.42 master br0 ;\ +# nmcli con show +# +# nmcli connection add type bridge con-name campus ifname campus ipv4.method manual ipv4.addresses 192.168.180.22/24 ipv6.method disabled ;\ +# nmcli connection add type vxlan slave-type bridge con-name campus_qemu01 ifname campus_qemu01 id 11 local 192.168.140.43 remote 192.168.140.41 master campus ;\ +# nmcli connection add type vxlan slave-type bridge con-name campus_qemu02 ifname campus_qemu02 id 12 local 192.168.140.43 remote 192.168.140.42 master campus ;\ +# nmcli con show +# +# nmcli con del br0 br0-vxlan11 br0-vxlan12 br0-vxlan10 bridge-slave-ens1 +# nmcli con del campus campus_qemu01 campus_qemu02 campus_qemu03 \ No newline at end of file diff --git a/cluster/roles/hypervisor_vxlan/tasks/main.yml b/cluster/roles/hypervisor_vxlan/tasks/main.yml new file mode 100644 index 0000000..70e66a2 --- /dev/null +++ b/cluster/roles/hypervisor_vxlan/tasks/main.yml @@ -0,0 +1,175 @@ +--- +# - name: install rdo repo +# ansible.builtin.package: +# name: "https://repos.fedorapeople.org/repos/openstack/openstack-yoga/rdo-release-yoga-1.el8.noarch.rpm" +# state: present +# disable_gpg_check: true + +# - name: update package facts +# ansible.builtin.package_facts: +# manager: auto +# strategy: all + +# - name: install openvswitch +# ansible.builtin.package: +# name: "openvswitch" +# state: present + +# - name: enable openvswitch service +# ansible.builtin.systemd: +# name: openvswitch +# enabled: yes +# state: started + +# nmcli con mod ctlplane ipv6.method link-local ipv6.addr-gen-mode eui64 +# nmcli con reload ens1 / systemctl restart NetworkManager +# nmcli con mod ctlplane ipv6.method link-local ipv6.addr-gen-mode stable-privacy +# this: nmcli dev mod ens1 ipv6.method link-local ipv6.addr-gen-mode eui64 +# what is the best way to setup our 
hypervisor? + +# - name: install needed network manager tools/libs +# package: +# name: +# - NetworkManager-libnm +# - nm-connection-editor +# state: present + +# - name: update package facts +# ansible.builtin.package_facts: +# manager: auto +# strategy: all + +- name: get facts for hypervisors + ansible.builtin.setup: + delegate_to: "{{ entry }}" + delegate_facts: true + loop: "{{ hostvars[ansible_hostname]['groups']['hypervisor'] }}" + loop_control: + loop_var: entry + when: + - not entry == inventory_hostname + +- name: create bridges and vxlan ports for each network + include_tasks: create_bridges.yml + loop: "{{ vars[config_namespace]['cluster_networks'] | list }}" + loop_control: + loop_var: cluster_network + extended: yes + + + + + + + + + + + +# # here here +# - name: setup bridge for vxlan +# community.general.nmcli: +# conn_name: "{{ _cluster_network }}_br" +# ifname: "{{ _cluster_network }}" +# type: "bridge" +# ip4: "{{ ip }}" +# state: present +# autoconnect: yes +# vars: +# network: "{{ vars[config_namespace]['cluster_networks'][_cluster_network]['network'] }}" +# netmask: "{{ vars[config_namespace]['cluster_networks'][_cluster_network]['netmask'] }}" +# prefix: "{{ (network + '/' + netmask) | ansible.utils.ipaddr('prefix') }}" +# oct: "{{ _host_oct[ansible_hostname] }}" +# ip: "{{ (network + '/' + prefix) | ansible.utils.ipaddr(oct) }}" +# # need cluster_br name up to 15 chars + +# # # here here +# - name: setup vxlan port for bridge +# community.general.nmcli: +# conn_name: "{{ port_name }}" +# # ifname: "{{ port_name }}" +# type: "vxlan" +# vxlan_id: "{{ _vxlan_id }}" +# vxlan_remote: "{{ remote_ip }}:8476" # can only use between 2 hosts, with 3 we need to get clever +# vxlan_local: "{{ hostvars[ansible_hostname]['ansible_default_ipv4']['address'] }}" +# # slave-type +# master: "{{ _cluster_network }}" # master bridge +# state: present +# autoconnect: yes +# loop: "{{ _port_dict[_cluster_network] | dict2items }}" +# loop_control: +# loop_var: 
entry +# vars: +# remote_ip: "{{ entry['value'] }}" +# port_name: "{{ entry['key'] }}" + + + + + + +# - fail: +# msg: + + +# create interfaces on vxlan interfaces - see if we can ping 11/12/13 +# https://www.sidorenko.io/post/2018/11/openstack-networking-open-vswitch-and-vxlan-introduction/ + +# - name: set host ip last octet (for use in each network) +# set_fact: +# _host_oct: "{{ _host_oct | default({}) | combine({entry: oct}, recursive=True) }}" +# loop: "{{ hostvars[ansible_hostname]['groups']['hypervisor'] }}" +# loop_control: +# loop_var: entry +# extended: yes +# vars: +# oct: "{{ 11 + (ansible_loop['index0'] | int) }}" + +# - name: setup network interfaces +# community.general.nmcli: +# conn_name: "{{ entry }}_slave" +# # ifname: "{{ xcat_nic.device }}" +# ifname: "{{ entry }}" +# type: "ethernet" +# hairpin: true +# # ip4: "{{ xcat_nic.ip }}/{{ (xcat_nic.ip + '/' + netmask) | ansible.utils.ipaddr('prefix') }}" +# ip4: "{{ ip }}" +# # gw4: "{{ vars['steel']['xcat_networks'][xcat_nic.network]['gateway'] }}" +# state: present +# autoconnect: yes +# # mtu: "{{ vars['steel']['xcat_networks'][xcat_nic.network]['mtu']|int }}" +# # debug: +# # msg: +# # - "{{ ip }}" +# loop: "{{ vars[config_namespace]['cluster_networks'] | list }}" +# loop_control: +# loop_var: entry +# extended: yes +# vars: +# network: "{{ vars[config_namespace]['cluster_networks'][entry]['network'] }}" +# netmask: "{{ vars[config_namespace]['cluster_networks'][entry]['netmask'] }}" +# prefix: "{{ (network + '/' + netmask) | ansible.utils.ipaddr('prefix') }}" +# # oct: "{{ 11 + (ansible_loop['index0'] | int) }}" +# oct: "{{ _host_oct[ansible_hostname] }}" +# ip: "{{ (network + '/' + prefix) | ansible.utils.ipaddr(oct) }}" + + + + +# https://enterprise-support.nvidia.com/s/article/howto-configure-vxlan-for-connectx-3-pro--linux-ovs-x +# https://blog.oddbit.com/post/2021-04-17-vm-ovs-vxlan/ +# https://docs.openvswitch.org/en/latest/faq/vxlan/ +# 
https://access.redhat.com/documentation/en-us/red_hat_enterprise_linux/8/html/configuring_and_managing_networking/assembly_using-a-vxlan-to-create-a-virtual-layer-2-domain-for-vms_configuring-and-managing-networking +# https://www.sidorenko.io/post/2018/11/openstack-networking-open-vswitch-and-vxlan-introduction/ + +# not vxlan - just vlan +#https://blog.christophersmart.com/2020/07/27/how-to-create-linux-bridges-and-open-vswitch-bridges-with-networkmanager/ + +# want +# ovs-vsctl add-port br-vx vx_qemu002 -- set interface vx_qemu002 type=vxlan options:remote_ip=192.168.140.106 options:key=1000 options:dst_port=4789 +# ovs-vsctl add-port br-vx vx_qemu003 -- set interface vx_qemu003 type=vxlan options:remote_ip=192.168.140.151 options:key=1000 options:dst_port=4789 + +# ovs-vsctl show +# ovs-vsctl br-get-external-id br-vx +# ovs-vsctl list-ports br-vx +# ovs-vsctl list-ifaces br-vx diff --git a/cluster/roles/hypervisor_vxlan/tests/inventory b/cluster/roles/hypervisor_vxlan/tests/inventory new file mode 100644 index 0000000..878877b --- /dev/null +++ b/cluster/roles/hypervisor_vxlan/tests/inventory @@ -0,0 +1,2 @@ +localhost + diff --git a/cluster/roles/hypervisor_vxlan/tests/test.yml b/cluster/roles/hypervisor_vxlan/tests/test.yml new file mode 100644 index 0000000..c2fd2bd --- /dev/null +++ b/cluster/roles/hypervisor_vxlan/tests/test.yml @@ -0,0 +1,5 @@ +--- +- hosts: localhost + remote_user: root + roles: + - roles/role-template \ No newline at end of file diff --git a/cluster/roles/hypervisor_vxlan/vars/main.yml b/cluster/roles/hypervisor_vxlan/vars/main.yml new file mode 100644 index 0000000..f655be4 --- /dev/null +++ b/cluster/roles/hypervisor_vxlan/vars/main.yml @@ -0,0 +1,2 @@ +--- +# vars file for roles/role-template \ No newline at end of file diff --git a/cluster/roles/merge_vars/.travis.yml b/cluster/roles/merge_vars/.travis.yml new file mode 100644 index 0000000..36bbf62 --- /dev/null +++ b/cluster/roles/merge_vars/.travis.yml @@ -0,0 +1,29 @@ +--- 
+language: python +python: "2.7" + +# Use the new container infrastructure +sudo: false + +# Install ansible +addons: + apt: + packages: + - python-pip + +install: + # Install ansible + - pip install ansible + + # Check ansible version + - ansible --version + + # Create ansible.cfg with correct roles_path + - printf '[defaults]\nroles_path=../' >ansible.cfg + +script: + # Basic role syntax check + - ansible-playbook tests/test.yml -i tests/inventory --syntax-check + +notifications: + webhooks: https://galaxy.ansible.com/api/v1/notifications/ \ No newline at end of file diff --git a/cluster/roles/merge_vars/README.md b/cluster/roles/merge_vars/README.md new file mode 100644 index 0000000..9dd9a97 --- /dev/null +++ b/cluster/roles/merge_vars/README.md @@ -0,0 +1,95 @@ +Role Info +========= + +A role called by another role inline to deep merge variables, this would typically merge custom variables sourced from a file in 'group_vars' with the role defaults or role vars files. +Dictionaries will be merged, nested lists will be appended to. +The role makes the assumption that the group_vars file is named the same as the role: role_name = ntp AND variables-file: = group_vars/ntp.yml + +If you only want to override variables without merge (sometimes necessary where you do not want nested lists to be appended), the ansible 'include_vars' module will suffice.
+ + ```yml +- include_vars: + file: "{{ ansible_inventory_sources[0] | dirname }}/group_vars/{{ role_name }}.yml" + name: "merge_{{ role_name }}" + +- name: merge custom vars to vars[] + set_fact: + { "{{ entry }}": "{{ 'merge_'role_name[entry] }}" } + loop: "{{ 'merge_'role_name | list }}" + loop_control: + loop_var: entry + +# - name: merge steel['firewalld'] over role defaults +# set_fact: +# firewalld: "{{ firewalld | default({}) | combine( steel['firewalld'], recursive=True) }}" +# when: steel['firewalld'] is defined +``` + +Role Variables +-------------- + +Accepts variables from calling role/task, returns dictionary `vars_return`. + +```yml +## set calling role variables +- name: set role variable sources + set_fact: + role_info: + role_defaults_file: "{{ role_path }}/defaults/main.yml" + role_override_file: "{{ ansible_inventory_sources[0] | dirname }}/group_vars/{{ role_name }}.yml" + vars_return: "placeholder" +``` + +Dependencies +------------ + +Dependency on the ansible-merge-vars plugin. + +> https://github.com/leapfrogonline/ansible-merge-vars + +Example Playbook +---------------- + +The top of your calling role/task should include the following block as the first item. +Change 'role_defaults_file' and 'role_override_file' file locations to merge variables within files. +Only 2 files may be specified. +Omit/replace `merge custom vars to vars[]` task to copy `role_info['vars_return']` to a location of your choosing, by default this task places variables at the root of vars[].
+ +```yml +- name: merge custom vars + block: + + - name: set role variable sources + set_fact: + role_info: + role_defaults_file: "{{ role_path }}/defaults/main.yml" + role_override_file: "{{ ansible_inventory_sources[0] | dirname }}/group_vars/{{ role_name }}.yml" + vars_return: "placeholder" + + - set_fact: + source_role: "{{ role_name }}" + + - name: run merge_vars role + include_role: + name: "merge_vars" + vars: + a_config_file: "{{ role_info['role_defaults_file'] }}" + b_config_file: "{{ role_info['role_override_file'] }}" + calling_role: "{{ source_role }}" + + - name: merge custom vars to vars[] + set_fact: + { "{{ entry }}": "{{ role_info['vars_return'][entry] }}" } + loop: "{{ role_info['vars_return'] | list }}" + loop_control: + loop_var: entry + when: + - not role_info['vars_return'] == 'placeholder' + + delegate_to: localhost +``` + +License +------- + +BSD diff --git a/cluster/roles/merge_vars/defaults/main.yml b/cluster/roles/merge_vars/defaults/main.yml new file mode 100644 index 0000000..2bec87e --- /dev/null +++ b/cluster/roles/merge_vars/defaults/main.yml @@ -0,0 +1,2 @@ +--- +# defaults file for roles/role-template \ No newline at end of file diff --git a/cluster/roles/merge_vars/handlers/main.yml b/cluster/roles/merge_vars/handlers/main.yml new file mode 100644 index 0000000..2d28ec4 --- /dev/null +++ b/cluster/roles/merge_vars/handlers/main.yml @@ -0,0 +1,2 @@ +--- +# handlers file for roles/role-template \ No newline at end of file diff --git a/cluster/roles/merge_vars/meta/main.yml b/cluster/roles/merge_vars/meta/main.yml new file mode 100644 index 0000000..227ad9c --- /dev/null +++ b/cluster/roles/merge_vars/meta/main.yml @@ -0,0 +1,53 @@ +galaxy_info: + author: your name + description: your role description + company: your company (optional) + + # If the issue tracker for your role is not on github, uncomment the + # next line and provide a value + # issue_tracker_url: http://example.com/issue/tracker + + # Choose a valid license ID from 
https://spdx.org - some suggested licenses: + # - BSD-3-Clause (default) + # - MIT + # - GPL-2.0-or-later + # - GPL-3.0-only + # - Apache-2.0 + # - CC-BY-4.0 + license: license (GPL-2.0-or-later, MIT, etc) + + min_ansible_version: 2.9 + + # If this a Container Enabled role, provide the minimum Ansible Container version. + # min_ansible_container_version: + + # + # Provide a list of supported platforms, and for each platform a list of versions. + # If you don't wish to enumerate all versions for a particular platform, use 'all'. + # To view available platforms and versions (or releases), visit: + # https://galaxy.ansible.com/api/v1/platforms/ + # + # platforms: + # - name: Fedora + # versions: + # - all + # - 25 + # - name: SomePlatform + # versions: + # - all + # - 1.0 + # - 7 + # - 99.99 + + galaxy_tags: [] + # List tags for your role here, one per line. A tag is a keyword that describes + # and categorizes the role. Users find roles by searching for tags. Be sure to + # remove the '[]' above, if you add tags to this list. + # + # NOTE: A tag is limited to a single word comprised of alphanumeric characters. + # Maximum 20 tags per role. + +dependencies: [] + # List your role dependencies here, one per line. Be sure to remove the '[]' above, + # if you add dependencies to this list. 
+ \ No newline at end of file diff --git a/cluster/roles/merge_vars/tasks/main.yml b/cluster/roles/merge_vars/tasks/main.yml new file mode 100644 index 0000000..3cbffaf --- /dev/null +++ b/cluster/roles/merge_vars/tasks/main.yml @@ -0,0 +1,61 @@ +--- +# variables passed to role +# a_config_file - role defaults file +# b_config_file - role override file +# calling_role - source role name + +- name: check for vars files + stat: + path: "{{ entry }}" + loop: "{{ [ a_config_file, b_config_file ] }}" + loop_control: + loop_var: entry + register: override_present + +- name: toggle on merge + set_fact: + files_present: True + +- name: toggle off merge + set_fact: + files_present: False + when: + - not override_present['results'][0]['stat']['exists'] | bool or + not override_present['results'][1]['stat']['exists'] | bool + +- name: report missing file(s) + debug: + msg: + - "file(s) present: {{ files_present }}" + - "{{ [ a_config_file, b_config_file ] }}" + when: + - not files_present | bool + +- name: include var files + include_vars: + file: "{{ entry }}" + name: "{{ include_name }}" + loop: "{{ [ a_config_file, b_config_file ] }}" + loop_control: + loop_var: entry + extended: true + vars: + include_name: "{{ ansible_loop['index0'] }}{{ calling_role }}_config__to_merge" + when: + - files_present | bool + +- name: merge custom vars + merge_vars: + suffix_to_merge: "{{ calling_role }}_config__to_merge" + merged_var_name: role_variables + expected_type: 'dict' + recursive_dict_merge: true + when: + - files_present | bool + +- name: return merged vars + set_fact: + role_info: + vars_return: "{{ role_variables }}" + when: + - files_present | bool \ No newline at end of file diff --git a/cluster/roles/merge_vars/tests/inventory b/cluster/roles/merge_vars/tests/inventory new file mode 100644 index 0000000..878877b --- /dev/null +++ b/cluster/roles/merge_vars/tests/inventory @@ -0,0 +1,2 @@ +localhost + diff --git a/cluster/roles/merge_vars/tests/test.yml 
b/cluster/roles/merge_vars/tests/test.yml new file mode 100644 index 0000000..c2fd2bd --- /dev/null +++ b/cluster/roles/merge_vars/tests/test.yml @@ -0,0 +1,5 @@ +--- +- hosts: localhost + remote_user: root + roles: + - roles/role-template \ No newline at end of file diff --git a/cluster/roles/merge_vars/vars/main.yml b/cluster/roles/merge_vars/vars/main.yml new file mode 100644 index 0000000..f655be4 --- /dev/null +++ b/cluster/roles/merge_vars/vars/main.yml @@ -0,0 +1,2 @@ +--- +# vars file for roles/role-template \ No newline at end of file diff --git a/cluster/roles/monitoring/.travis.yml b/cluster/roles/monitoring/.travis.yml new file mode 100644 index 0000000..36bbf62 --- /dev/null +++ b/cluster/roles/monitoring/.travis.yml @@ -0,0 +1,29 @@ +--- +language: python +python: "2.7" + +# Use the new container infrastructure +sudo: false + +# Install ansible +addons: + apt: + packages: + - python-pip + +install: + # Install ansible + - pip install ansible + + # Check ansible version + - ansible --version + + # Create ansible.cfg with correct roles_path + - printf '[defaults]\nroles_path=../' >ansible.cfg + +script: + # Basic role syntax check + - ansible-playbook tests/test.yml -i tests/inventory --syntax-check + +notifications: + webhooks: https://galaxy.ansible.com/api/v1/notifications/ \ No newline at end of file diff --git a/cluster/roles/monitoring/README.md b/cluster/roles/monitoring/README.md new file mode 100644 index 0000000..225dd44 --- /dev/null +++ b/cluster/roles/monitoring/README.md @@ -0,0 +1,38 @@ +Role Name +========= + +A brief description of the role goes here. + +Requirements +------------ + +Any pre-requisites that may not be covered by Ansible itself or the role should be mentioned here. For instance, if the role uses the EC2 module, it may be a good idea to mention in this section that the boto package is required. 
+ +Role Variables +-------------- + +A description of the settable variables for this role should go here, including any variables that are in defaults/main.yml, vars/main.yml, and any variables that can/should be set via parameters to the role. Any variables that are read from other roles and/or the global scope (ie. hostvars, group vars, etc.) should be mentioned here as well. + +Dependencies +------------ + +A list of other roles hosted on Galaxy should go here, plus any details in regards to parameters that may need to be set for other roles, or variables that are used from other roles. + +Example Playbook +---------------- + +Including an example of how to use your role (for instance, with variables passed in as parameters) is always nice for users too: + + - hosts: servers + roles: + - { role: username.rolename, x: 42 } + +License +------- + +BSD + +Author Information +------------------ + +An optional section for the role authors to include contact information, or a website (HTML is not allowed). 
diff --git a/cluster/roles/monitoring/RESOURCES.txt b/cluster/roles/monitoring/RESOURCES.txt new file mode 100644 index 0000000..d89ff0c --- /dev/null +++ b/cluster/roles/monitoring/RESOURCES.txt @@ -0,0 +1,66 @@ + +# problem with postfix + +1) doesn't populate the mail relay correctly - this needs to do a group check on item[0] or something so we can get mail01 as the actual relay for clients +2) sudo nano -cw /etc/postfix/header_check does an outbound replace - we don't want that for alertmanager as we may want to send out different classes of email + +# node exporter stats collection + +systemctl service file /usr/lib/systemd/system/node_exporter.service loads $OPTIONS +/etc/sysconfig/node_exporter contains $OPTIONS + +OPTIONS="--collector.textfile.directory /var/lib/node_exporter/textfile_collector" + +Chris used the following switches, and a custom/cut-down dashboard: + + --collector.cpu \ + --collector.diskstats \ + --collector.filesystem \ + --collector.loadavg \ + --collector.meminfo \ + --collector.filefd \ + --collector.netdev \ + --collector.stat \ + --collector.netstat \ + --collector.systemd \ + --collector.uname \ + --collector.vmstat \ + --collector.time \ + --collector.mdadm \ + --collector.tcpstat \ + --collector.hwmon \ + --collector.arp \ + --web.max-requests=40 \ + --web.listen-address=0.0.0.0:{{ node_exporter_port }} \ + --web.telemetry-path=/metrics + + The full list of node exporter collections is @ https://github.com/prometheus/node_exporter + we can disable all collectors and then enable only the ones we want + + --web.disable-exporter-metrics + arp WANT + bonding WANT + # boottime - provided in stat + cpu WANT + diskstats WANT + filesystem WANT + hardwaremon + infiniband WANT + loadavgs WANT + meminfo WANT + # netclass - lots of useful stuff but remains static really + netdev WANT + nfs WANT + nfsd WANT + nvme WANT + # os - fairly static do we need a million entries of rhel8 + powersupplyclass + sockstat + stat WANT + thermal_zone + time +
vmstat WANT + xfs WANT - we assume nearly everything will run xfs now + + 15/~80 collectors should chop out a lot of data + we dont know how to marry metric names to fields though? - this guy has a methodology - its nice too https://mac-blog.org.ua/node-exporter-metrics \ No newline at end of file diff --git a/cluster/roles/monitoring/defaults/main.yml b/cluster/roles/monitoring/defaults/main.yml new file mode 100644 index 0000000..7b1a6b9 --- /dev/null +++ b/cluster/roles/monitoring/defaults/main.yml @@ -0,0 +1,350 @@ +--- +monitoring: + # install # + monitoring_dir: "/opt/monitoring/containers" + compose_dir: "/compose" + bin_dir: "/bin" + data_dir: "/data" + etc_dir: "/etc" + docker_compose_url: "https://github.com/docker/compose/releases/download/v2.6.0/docker-compose-linux-x86_64" + # labels to add to every message, these are intended to be merged from group_vars/monitoring.yml with the merge_vars role on a per customer basis + external_labels: + # environment: "{{ ansible_fqdn | default(ansible_host) | default(inventory_hostname) }}" + customer: "ocf" + cluster: "ansible" + site: "sheffield" + environment: "development" + +grafana: + # install # + grafana_data_dir: "/grafana" + grafana_config_dir: "/grafana" + container_registry: "docker.io" + container_repository: "grafana/grafana" + container_tag: "9.1.3" + grafana_datasource_files: + - grafana/datasources/*.yml + grafana_dashboard_files: + # - grafana/dashboards/*.yml # this is used for the defunkt dashboard provisioning method + - grafana/dashboards/*.json + # api # + default_admin: "admin" + default_admin_password: "admin" + admin_user: "ocf" + admin_user_password: "6zrEtDXmeh9ZFQN7" + admin_user_name: "ocf admin" + admin_user_email: "support@ocf.co.uk" + overwrite_dashboards: false + +alertmanager: + # install # + alertmanager_config_dir: "/alertmanager" + alertmanager_db_dir: "/alertmanager" + container_registry: "quay.io" + container_repository: "prometheus/alertmanager" + container_tag: "v0.24.0" + + 
# config file # + alertmanager_resolve_timeout: 3m + alertmanager_web_listen_address: '0.0.0.0:9093' + + # SMTP default params + # alertmanager_smtp: {} + alertmanager_smtp: + from: 'prometheus_alertmanager' + smarthost: 'mail01:25' + require_tls: False + + # alertmanager_receivers: [] + alertmanager_receivers: + - name: 'email' + email_configs: + - send_resolved: True + to: 'tseed@ocf.co.uk' + from: 'noreply@ocf.co.uk' + headers: + subject: '{% raw %}{{ .Status | toUpper }} {{ .CommonLabels.severity | toUpper }}: {{ .CommonLabels.alertname }} [ Customer: {{ .CommonLabels.customer }}, Site: {{ .CommonLabels.site }}, Cluster: {{ .CommonLabels.cluster }} ] {% endraw %}' + # add another email config for a different mail relay, different 'to' field, different 'email body' format with OCF logo, as these are under the same receiver the same email will be sent to both + # - send_resolved: True + # to: 'tseed@ocf.co.uk' + # from: 'customer@ocf.co.uk' + # smarthost: 'mail01:25' + # require_tls: False + # html: '{% raw %}{{ template "custom_mail_html" . 
}}{% endraw %}' + # headers: + # subject: '{% raw %}{{ .Status | toUpper }} {{ .CommonLabels.severity | toUpper }}: {{ .CommonLabels.alertname }} [ Customer: {{ .CommonLabels.customer }}, Site: {{ .CommonLabels.site }}, Cluster: {{ .CommonLabels.cluster }} ] {% endraw %}' + # add another receiver this will be triggered by an additional alertmanager route + # - name: 'customer_specific_email' + # email_configs: + # - send_resolved: True + # to: 'tseed@ocf.co.uk' + # from: 'customer@ocf.co.uk' + # smarthost: 'mail01:25' + # require_tls: False + # headers: + # subject: '{% raw %}{{ .Status | toUpper }} {{ .CommonLabels.severity | toUpper }}: {{ .CommonLabels.alertname }} [ slurm queue backlog ] {% endraw %}' + + alertmanager_inhibit_rules: [] + # alertmanager_inhibit_rules: + # - equal: ['instance'] + # # for multiple alerts for same instance, if one alert has 'severity' label equal to 'critical' + # source_matchers: + # - severity = critical + # # and the other alert 'warning' + # target_matchers: + # - severity = warning + # # then mute alert with severity 'warning' to handle alert flooding, instance down will be a critical alert superceding other instance alerts + + # alertmanager_route: {} + alertmanager_route: + # the default receiver route + receiver: 'email' + group_wait: 30s + group_interval: 5m + repeat_interval: 4h + # use the following param values for more realtime testing + # group_wait: 10s + # group_interval: 10s + # repeat_interval: 4h + group_by: ['alertname'] + # add additional routes for other receivers, use labels to match (alertname, severity, any-label-you-tag-to-source) + # routes: + # - receiver: 'customer_specific_email' + # matchers: + # - data_source="slurm" + # continue: True + + alertmanager_template_files: + - alertmanager/templates/*.tmpl + +prometheus: + # install # + prometheus_db_dir: "/prometheus" + prometheus_config_dir: "/prometheus" + prometheus_binaries_url: 
"https://github.com/prometheus/prometheus/releases/download/placeholderA/prometheus-placeholderB.linux-amd64.tar.gz" + container_registry: "quay.io" + container_repository: "prometheus/prometheus" + container_tag: "v2.38.0" + + # site # + node_exporter_port: "9100" + slurm_exporter_port: "9200" + nvidia_exporter_port: "9300" + + # config file # + prometheus_global: + scrape_interval: 15s + scrape_timeout: 10s + evaluation_interval: 15s + + ## web config section + ### these settings are highly dependent on external dns configuration and cert creation - need to run a CA role that will use cfssl + prometheus_web_config: + tls_server_config: {} + http_server_config: {} + basic_auth_users: {} + + ## alertmanager section + ### leave as empty list items to omit from the config + # + # prometheus_alertmanager_config: [] + prometheus_alertmanager_config: + - timeout: 10s + scheme: http + path_prefix: / + static_configs: + - targets: ["10.5.0.3:9093"] + prometheus_alert_relabel_configs: [] + # prometheus_alert_relabel_configs: + # - action: labeldrop + # regex: replica + + ## metric storage section + ### remote read/write used for metric storage, leave as empty list items to omit from the config, electing to use inbuilt TSDB + # + prometheus_remote_write: [] + # prometheus_remote_write: + # - url: https://dev.kausal.co/prom/push + # basic_auth: + # password: FOO + prometheus_remote_read: [] + # prometheus_remote_read: + # - url: https://demo.cloudalchemy.org:9201/read + # basic_auth: + # password: FOO + + ## scrape config section + ### this dict is appended dynamically with 'file service discovery configurations' (file_sd_configs) built from the ansible inventory + ### each file_sd_configs job relates to a scrape target + ### https://prometheus.io/docs/prometheus/latest/configuration/configuration/#scrape_config + # + prometheus_scrape_configs: + # static target definition for prometheus itself + # + - job_name: "prometheus" + # metrics_path: /metrics # metrics_path defaults 
to '/metrics' + # scheme: http # scheme defaults to 'http' + static_configs: + - targets: + - "{{ ansible_fqdn | default(ansible_host) | default('localhost') }}:9090" + # if you define non ansible managed targets by file use this template + # you may add a counterpart entry in prometheus_targets: that will be rendered into a file (see below) + # OR you may put a file @ files/targets/node.yml - these are picked up using file masks in prometheus_static_targets_files: - see files/targets/README.md + # + # - job_name: "node" + # file_sd_configs: + # - files: + # - "{{ prometheus_config_dir }}/file_sd/node.yml" + + # scrape targets # + ## scrape targets are dynamically built from the ansible inventory, include entries here for any static file_sd_configs and ensure the prometheus_scrape_configs dictionary has a job entry to use the file + ## leave as empty dict to omit from the config + # + prometheus_targets: {} + # node: + # - targets: + # - "somehost:9100" + # labels: + # env: "dev" + # type: 'sometype' + + ## alternatively include static target files at files/targets/*.yml + prometheus_static_targets_files: + - prometheus/targets/*.yml + - prometheus/targets/*.json + + # alert rules # + ## alert rules read from static files at files/rules/*.rules + prometheus_alert_rules_files: + - prometheus/rules/*.rules + + ## alert rules that get rendered into /etc/prometheus/rules/ansible_managed.rules + prometheus_alert_rules: + # useful rule to test alert pipeline is working, disable for a live system + - alert: Watchdog + expr: vector(1) + # for: 10m + for: 1m + labels: + severity: warning + annotations: + description: "This is an alert meant to ensure that the entire alerting pipeline is functional.\nThis alert is always firing, therefore it should always be firing in Alertmanager\nand always fire against a receiver. There are integrations with various notification\nmechanisms that send a notification when this alert is not firing. 
For example the\n\"DeadMansSnitch\" integration in PagerDuty." + summary: 'Ensure entire alerting pipeline is functional' + - alert: InstanceDown + expr: 'up == 0' + for: 5m + labels: + severity: critical + annotations: + description: '{% raw %}{{ $labels.instance }} of job {{ $labels.job }} has been down for more than 5 minutes.{% endraw %}' + summary: '{% raw %}Instance {{ $labels.instance }} down{% endraw %}' + - alert: RebootRequired + expr: 'node_reboot_required > 0' + labels: + severity: warning + annotations: + description: '{% raw %}{{ $labels.instance }} requires a reboot.{% endraw %}' + summary: '{% raw %}Instance {{ $labels.instance }} - reboot required{% endraw %}' + - alert: NodeFilesystemSpaceFillingUp + annotations: + description: '{% raw %}Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available space left and is filling up.{% endraw %}' + summary: 'Filesystem is predicted to run out of space within the next 24 hours.' + expr: "(\n node_filesystem_avail_bytes{job=\"node\",fstype!=\"\"} / node_filesystem_size_bytes{job=\"node\",fstype!=\"\"} * 100 < 40\nand\n predict_linear(node_filesystem_avail_bytes{job=\"node\",fstype!=\"\"}[6h], 24*60*60) < 0\nand\n node_filesystem_readonly{job=\"node\",fstype!=\"\"} == 0\n)\n" + for: 1h + labels: + severity: warning + - alert: NodeFilesystemSpaceFillingUp + annotations: + description: '{% raw %}Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available space left and is filling up fast.{% endraw %}' + summary: 'Filesystem is predicted to run out of space within the next 4 hours.' 
+ expr: "(\n node_filesystem_avail_bytes{job=\"node\",fstype!=\"\"} / node_filesystem_size_bytes{job=\"node\",fstype!=\"\"} * 100 < 20\nand\n predict_linear(node_filesystem_avail_bytes{job=\"node\",fstype!=\"\"}[6h], 4*60*60) < 0\nand\n node_filesystem_readonly{job=\"node\",fstype!=\"\"} == 0\n)\n" + for: 1h + labels: + severity: critical + - alert: NodeFilesystemAlmostOutOfSpace + annotations: + description: '{% raw %}Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available space left.{% endraw %}' + summary: 'Filesystem has less than 5% space left.' + expr: "(\n node_filesystem_avail_bytes{job=\"node\",fstype!=\"\"} / node_filesystem_size_bytes{job=\"node\",fstype!=\"\"} * 100 < 5\nand\n node_filesystem_readonly{job=\"node\",fstype!=\"\"} == 0\n)\n" + for: 1h + labels: + severity: warning + - alert: NodeFilesystemAlmostOutOfSpace + annotations: + description: '{% raw %}Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available space left.{% endraw %}' + summary: 'Filesystem has less than 3% space left.' + expr: "(\n node_filesystem_avail_bytes{job=\"node\",fstype!=\"\"} / node_filesystem_size_bytes{job=\"node\",fstype!=\"\"} * 100 < 3\nand\n node_filesystem_readonly{job=\"node\",fstype!=\"\"} == 0\n)\n" + for: 1h + labels: + severity: critical + - alert: NodeFilesystemFilesFillingUp + annotations: + description: '{% raw %}Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available inodes left and is filling up.{% endraw %}' + summary: 'Filesystem is predicted to run out of inodes within the next 24 hours.' 
+ expr: "(\n node_filesystem_files_free{job=\"node\",fstype!=\"\"} / node_filesystem_files{job=\"node\",fstype!=\"\"} * 100 < 40\nand\n predict_linear(node_filesystem_files_free{job=\"node\",fstype!=\"\"}[6h], 24*60*60) < 0\nand\n node_filesystem_readonly{job=\"node\",fstype!=\"\"} == 0\n)\n" + for: 1h + labels: + severity: warning + - alert: NodeFilesystemFilesFillingUp + annotations: + description: '{% raw %}Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available inodes left and is filling up fast.{% endraw %}' + summary: 'Filesystem is predicted to run out of inodes within the next 4 hours.' + expr: "(\n node_filesystem_files_free{job=\"node\",fstype!=\"\"} / node_filesystem_files{job=\"node\",fstype!=\"\"} * 100 < 20\nand\n predict_linear(node_filesystem_files_free{job=\"node\",fstype!=\"\"}[6h], 4*60*60) < 0\nand\n node_filesystem_readonly{job=\"node\",fstype!=\"\"} == 0\n)\n" + for: 1h + labels: + severity: critical + - alert: NodeFilesystemAlmostOutOfFiles + annotations: + description: '{% raw %}Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available inodes left.{% endraw %}' + summary: 'Filesystem has less than 5% inodes left.' + expr: "(\n node_filesystem_files_free{job=\"node\",fstype!=\"\"} / node_filesystem_files{job=\"node\",fstype!=\"\"} * 100 < 5\nand\n node_filesystem_readonly{job=\"node\",fstype!=\"\"} == 0\n)\n" + for: 1h + labels: + severity: warning + - alert: NodeFilesystemAlmostOutOfFiles + annotations: + description: '{% raw %}Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available inodes left.{% endraw %}' + summary: 'Filesystem has less than 3% inodes left.' 
+ expr: "(\n node_filesystem_files_free{job=\"node\",fstype!=\"\"} / node_filesystem_files{job=\"node\",fstype!=\"\"} * 100 < 3\nand\n node_filesystem_readonly{job=\"node\",fstype!=\"\"} == 0\n)\n" + for: 1h + labels: + severity: critical + - alert: NodeNetworkReceiveErrs + annotations: + description: '{% raw %}{{ $labels.instance }} interface {{ $labels.device }} has encountered {{ printf "%.0f" $value }} receive errors in the last two minutes.{% endraw %}' + summary: 'Network interface is reporting many receive errors.' + expr: "increase(node_network_receive_errs_total[2m]) > 10\n" + for: 1h + labels: + severity: warning + - alert: NodeNetworkTransmitErrs + annotations: + description: '{% raw %}{{ $labels.instance }} interface {{ $labels.device }} has encountered {{ printf "%.0f" $value }} transmit errors in the last two minutes.{% endraw %}' + summary: 'Network interface is reporting many transmit errors.' + expr: "increase(node_network_transmit_errs_total[2m]) > 10\n" + for: 1h + labels: + severity: warning + - alert: NodeHighNumberConntrackEntriesUsed + annotations: + description: '{% raw %}{{ $value | humanizePercentage }} of conntrack entries are used{% endraw %}' + summary: 'Number of conntrack are getting close to the limit' + expr: "(node_nf_conntrack_entries / node_nf_conntrack_entries_limit) > 0.75\n" + labels: + severity: warning + - alert: NodeClockSkewDetected + annotations: + message: '{% raw %}Clock on {{ $labels.instance }} is out of sync by more than 300s. Ensure NTP is configured correctly on this host.{% endraw %}' + summary: 'Clock skew detected.' + expr: "(\n node_timex_offset_seconds > 0.05\nand\n deriv(node_timex_offset_seconds[5m]) >= 0\n)\nor\n(\n node_timex_offset_seconds < -0.05\nand\n deriv(node_timex_offset_seconds[5m]) <= 0\n)\n" + for: 10m + labels: + severity: warning + - alert: NodeClockNotSynchronising + annotations: + message: '{% raw %}Clock on {{ $labels.instance }} is not synchronising. 
Ensure NTP is configured on this host.{% endraw %}' + summary: 'Clock not synchronising.' + expr: "min_over_time(node_timex_sync_status[5m]) == 0\n" + for: 10m + labels: + severity: warning \ No newline at end of file diff --git a/cluster/roles/monitoring/files/alertmanager/templates/email.html.tmpl b/cluster/roles/monitoring/files/alertmanager/templates/email.html.tmpl new file mode 100644 index 0000000..ffc4812 --- /dev/null +++ b/cluster/roles/monitoring/files/alertmanager/templates/email.html.tmpl @@ -0,0 +1,412 @@ +{{ define "custom_mail_html" }} + + + + + + +{{ template "__subject" . }} + + + + + + + + + + + +
+
+ +

+ + {{ if gt (len .Alerts.Firing) 0 }} + + + + + +
+ {{ else }} + + {{ end }} + {{ .Alerts | len }} alert{{ if gt (len .Alerts) 1 }}s{{ end }} for {{ range .GroupLabels.SortedPairs }} + {{ .Name }}={{ .Value }} + {{ end }} +
+ + + + + {{ if gt (len .Alerts.Firing) 0 }} + + + + {{ end }} + {{ range .Alerts.Firing }} + + + + {{ end }} + + {{ if gt (len .Alerts.Resolved) 0 }} + {{ if gt (len .Alerts.Firing) 0 }} + + + + {{ end }} + + + + {{ end }} + {{ range .Alerts.Resolved }} + + + + {{ end }} +
+ View in {{ template "__alertmanager" . }} +
+ [{{ .Alerts.Firing | len }}] Firing +
+ Labels
+ {{ range .Labels.SortedPairs }}{{ .Name }} = {{ .Value }}
{{ end }} + {{ if gt (len .Annotations) 0 }}Annotations
{{ end }} + {{ range .Annotations.SortedPairs }}{{ .Name }} = {{ .Value }}
{{ end }} + Source
+
+
+
+
+
+ [{{ .Alerts.Resolved | len }}] Resolved +
+ Labels
+ {{ range .Labels.SortedPairs }}{{ .Name }} = {{ .Value }}
{{ end }} + {{ if gt (len .Annotations) 0 }}Annotations
{{ end }} + {{ range .Annotations.SortedPairs }}{{ .Name }} = {{ .Value }}
{{ end }} + Source
+
+
+ +
+
+ + + + +{{ end }} \ No newline at end of file diff --git a/cluster/roles/monitoring/files/grafana/dashboards/dashboards.yml.disabled b/cluster/roles/monitoring/files/grafana/dashboards/dashboards.yml.disabled new file mode 100644 index 0000000..a1473f9 --- /dev/null +++ b/cluster/roles/monitoring/files/grafana/dashboards/dashboards.yml.disabled @@ -0,0 +1,24 @@ +apiVersion: 1 + +providers: + # an unique provider name. Required + - name: 'Default' + # Org id. Default to 1 + orgId: 1 + # name of the dashboard folder. + folder: '/etc/grafana/provisioning/custom_dashboards' + # folder UID. will be automatically generated if not specified + folderUid: '' + # provider type. Default to 'file' + type: file + # disable dashboard deletion + disableDeletion: true + # how often Grafana will scan for changed dashboards + updateIntervalSeconds: 10 + # allow updating provisioned dashboards from the UI + allowUiUpdates: false + options: + # path to dashboard files on disk. Required when using the 'file' type + path: /etc/grafana/provisioning/dashboards + # use folder names from filesystem to create folders in Grafana + foldersFromFilesStructure: true \ No newline at end of file diff --git a/cluster/roles/monitoring/files/grafana/dashboards/node-exporter-full.json b/cluster/roles/monitoring/files/grafana/dashboards/node-exporter-full.json new file mode 100644 index 0000000..adc93cb --- /dev/null +++ b/cluster/roles/monitoring/files/grafana/dashboards/node-exporter-full.json @@ -0,0 +1,14116 @@ +{ + "__inputs": [ + { + "name": "DS_PROMETHEUS", + "label": "Prometheus", + "description": "", + "type": "datasource", + "pluginId": "prometheus", + "pluginName": "Prometheus" + } + ], + "__requires": [ + { + "type": "panel", + "id": "gauge", + "name": "Gauge", + "version": "" + }, + { + "type": "grafana", + "id": "grafana", + "name": "Grafana", + "version": "7.3.7" + }, + { + "type": "panel", + "id": "graph", + "name": "Graph", + "version": "" + }, + { + "type": "datasource", + "id": 
"prometheus", + "name": "Prometheus", + "version": "1.0.0" + }, + { + "type": "panel", + "id": "singlestat", + "name": "Singlestat", + "version": "" + } + ], + "annotations": { + "list": [ + { + "$$hashKey": "object:1058", + "builtIn": 1, + "datasource": "-- Grafana --", + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "name": "Annotations & Alerts", + "type": "dashboard" + } + ] + }, + "editable": true, + "gnetId": 1860, + "graphTooltip": 0, + "id": null, + "iteration": 1641803921158, + "links": [ + { + "icon": "external link", + "tags": [], + "title": "GitHub", + "type": "link", + "url": "https://github.com/rfmoz/grafana-dashboards" + }, + { + "icon": "external link", + "tags": [], + "title": "Grafana", + "type": "link", + "url": "https://grafana.com/grafana/dashboards/1860" + } + ], + "panels": [ + { + "collapsed": false, + "datasource": "${DS_PROMETHEUS}", + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 0 + }, + "id": 261, + "panels": [], + "repeat": null, + "title": "Quick CPU / Mem / Disk", + "type": "row" + }, + { + "cacheTimeout": null, + "datasource": "${DS_PROMETHEUS}", + "description": "Busy state of all CPU cores together", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "custom": {}, + "mappings": [ + { + "id": 0, + "op": "=", + "text": "N/A", + "type": 1, + "value": "null" + } + ], + "max": 100, + "min": 0, + "nullValueMode": "null", + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "rgba(50, 172, 45, 0.97)", + "value": null + }, + { + "color": "rgba(237, 129, 40, 0.89)", + "value": 85 + }, + { + "color": "rgba(245, 54, 54, 0.9)", + "value": 95 + } + ] + }, + "unit": "percent" + }, + "overrides": [] + }, + "gridPos": { + "h": 4, + "w": 3, + "x": 0, + "y": 1 + }, + "id": 20, + "links": [], + "options": { + "orientation": "horizontal", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showThresholdLabels": false, + 
"showThresholdMarkers": true + }, + "pluginVersion": "7.3.7", + "targets": [ + { + "expr": "(((count(count(node_cpu_seconds_total{instance=\"$node\",job=\"$job\"}) by (cpu))) - avg(sum by (mode)(rate(node_cpu_seconds_total{mode='idle',instance=\"$node\",job=\"$job\"}[$__rate_interval])))) * 100) / count(count(node_cpu_seconds_total{instance=\"$node\",job=\"$job\"}) by (cpu))", + "hide": false, + "intervalFactor": 1, + "legendFormat": "", + "refId": "A", + "step": 240 + } + ], + "title": "CPU Busy", + "type": "gauge" + }, + { + "cacheTimeout": null, + "datasource": "${DS_PROMETHEUS}", + "description": "Busy state of all CPU cores together (5 min average)", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "custom": {}, + "mappings": [ + { + "id": 0, + "op": "=", + "text": "N/A", + "type": 1, + "value": "null" + } + ], + "max": 100, + "min": 0, + "nullValueMode": "null", + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "rgba(50, 172, 45, 0.97)", + "value": null + }, + { + "color": "rgba(237, 129, 40, 0.89)", + "value": 85 + }, + { + "color": "rgba(245, 54, 54, 0.9)", + "value": 95 + } + ] + }, + "unit": "percent" + }, + "overrides": [] + }, + "gridPos": { + "h": 4, + "w": 3, + "x": 3, + "y": 1 + }, + "id": 155, + "links": [], + "options": { + "orientation": "horizontal", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showThresholdLabels": false, + "showThresholdMarkers": true + }, + "pluginVersion": "7.3.7", + "targets": [ + { + "expr": "avg(node_load5{instance=\"$node\",job=\"$job\"}) / count(count(node_cpu_seconds_total{instance=\"$node\",job=\"$job\"}) by (cpu)) * 100", + "format": "time_series", + "hide": false, + "intervalFactor": 1, + "refId": "A", + "step": 240 + } + ], + "title": "Sys Load (5m avg)", + "type": "gauge" + }, + { + "cacheTimeout": null, + "datasource": "${DS_PROMETHEUS}", + "description": "Busy state of all CPU cores together (15 min average)", + 
"fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "custom": {}, + "mappings": [ + { + "id": 0, + "op": "=", + "text": "N/A", + "type": 1, + "value": "null" + } + ], + "max": 100, + "min": 0, + "nullValueMode": "null", + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "rgba(50, 172, 45, 0.97)", + "value": null + }, + { + "color": "rgba(237, 129, 40, 0.89)", + "value": 85 + }, + { + "color": "rgba(245, 54, 54, 0.9)", + "value": 95 + } + ] + }, + "unit": "percent" + }, + "overrides": [] + }, + "gridPos": { + "h": 4, + "w": 3, + "x": 6, + "y": 1 + }, + "id": 19, + "links": [], + "options": { + "orientation": "horizontal", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showThresholdLabels": false, + "showThresholdMarkers": true + }, + "pluginVersion": "7.3.7", + "targets": [ + { + "expr": "avg(node_load15{instance=\"$node\",job=\"$job\"}) / count(count(node_cpu_seconds_total{instance=\"$node\",job=\"$job\"}) by (cpu)) * 100", + "hide": false, + "intervalFactor": 1, + "refId": "A", + "step": 240 + } + ], + "title": "Sys Load (15m avg)", + "type": "gauge" + }, + { + "cacheTimeout": null, + "datasource": "${DS_PROMETHEUS}", + "description": "Non available RAM memory", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "custom": {}, + "decimals": 0, + "mappings": [], + "max": 100, + "min": 0, + "nullValueMode": "null", + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "rgba(50, 172, 45, 0.97)", + "value": null + }, + { + "color": "rgba(237, 129, 40, 0.89)", + "value": 80 + }, + { + "color": "rgba(245, 54, 54, 0.9)", + "value": 90 + } + ] + }, + "unit": "percent" + }, + "overrides": [] + }, + "gridPos": { + "h": 4, + "w": 3, + "x": 9, + "y": 1 + }, + "hideTimeOverride": false, + "id": 16, + "links": [], + "options": { + "orientation": "horizontal", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + 
}, + "showThresholdLabels": false, + "showThresholdMarkers": true + }, + "pluginVersion": "7.3.7", + "targets": [ + { + "expr": "((node_memory_MemTotal_bytes{instance=\"$node\",job=\"$job\"} - node_memory_MemFree_bytes{instance=\"$node\",job=\"$job\"}) / (node_memory_MemTotal_bytes{instance=\"$node\",job=\"$job\"} )) * 100", + "format": "time_series", + "hide": true, + "intervalFactor": 1, + "refId": "A", + "step": 240 + }, + { + "expr": "100 - ((node_memory_MemAvailable_bytes{instance=\"$node\",job=\"$job\"} * 100) / node_memory_MemTotal_bytes{instance=\"$node\",job=\"$job\"})", + "format": "time_series", + "hide": false, + "intervalFactor": 1, + "refId": "B", + "step": 240 + } + ], + "title": "RAM Used", + "type": "gauge" + }, + { + "cacheTimeout": null, + "datasource": "${DS_PROMETHEUS}", + "description": "Used Swap", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "custom": {}, + "mappings": [ + { + "id": 0, + "op": "=", + "text": "N/A", + "type": 1, + "value": "null" + } + ], + "max": 100, + "min": 0, + "nullValueMode": "null", + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "rgba(50, 172, 45, 0.97)", + "value": null + }, + { + "color": "rgba(237, 129, 40, 0.89)", + "value": 10 + }, + { + "color": "rgba(245, 54, 54, 0.9)", + "value": 25 + } + ] + }, + "unit": "percent" + }, + "overrides": [] + }, + "gridPos": { + "h": 4, + "w": 3, + "x": 12, + "y": 1 + }, + "id": 21, + "links": [], + "options": { + "orientation": "horizontal", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showThresholdLabels": false, + "showThresholdMarkers": true + }, + "pluginVersion": "7.3.7", + "targets": [ + { + "expr": "((node_memory_SwapTotal_bytes{instance=\"$node\",job=\"$job\"} - node_memory_SwapFree_bytes{instance=\"$node\",job=\"$job\"}) / (node_memory_SwapTotal_bytes{instance=\"$node\",job=\"$job\"} )) * 100", + "intervalFactor": 1, + "refId": "A", + "step": 240 + } + ], + "title": 
"SWAP Used", + "type": "gauge" + }, + { + "cacheTimeout": null, + "datasource": "${DS_PROMETHEUS}", + "description": "Used Root FS", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "custom": {}, + "mappings": [ + { + "id": 0, + "op": "=", + "text": "N/A", + "type": 1, + "value": "null" + } + ], + "max": 100, + "min": 0, + "nullValueMode": "null", + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "rgba(50, 172, 45, 0.97)", + "value": null + }, + { + "color": "rgba(237, 129, 40, 0.89)", + "value": 80 + }, + { + "color": "rgba(245, 54, 54, 0.9)", + "value": 90 + } + ] + }, + "unit": "percent" + }, + "overrides": [] + }, + "gridPos": { + "h": 4, + "w": 3, + "x": 15, + "y": 1 + }, + "id": 154, + "links": [], + "options": { + "orientation": "horizontal", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showThresholdLabels": false, + "showThresholdMarkers": true + }, + "pluginVersion": "7.3.7", + "targets": [ + { + "expr": "100 - ((node_filesystem_avail_bytes{instance=\"$node\",job=\"$job\",mountpoint=\"/\",fstype!=\"rootfs\"} * 100) / node_filesystem_size_bytes{instance=\"$node\",job=\"$job\",mountpoint=\"/\",fstype!=\"rootfs\"})", + "format": "time_series", + "intervalFactor": 1, + "refId": "A", + "step": 240 + } + ], + "title": "Root FS Used", + "type": "gauge" + }, + { + "cacheTimeout": null, + "colorBackground": false, + "colorValue": false, + "colors": [ + "rgba(245, 54, 54, 0.9)", + "rgba(237, 129, 40, 0.89)", + "rgba(50, 172, 45, 0.97)" + ], + "datasource": "${DS_PROMETHEUS}", + "description": "Total number of CPU cores", + "fieldConfig": { + "defaults": { + "custom": {} + }, + "overrides": [] + }, + "format": "short", + "gauge": { + "maxValue": 100, + "minValue": 0, + "show": false, + "thresholdLabels": false, + "thresholdMarkers": true + }, + "gridPos": { + "h": 2, + "w": 2, + "x": 18, + "y": 1 + }, + "id": 14, + "interval": null, + "links": [], + "mappingType": 1, + 
"mappingTypes": [ + { + "name": "value to text", + "value": 1 + }, + { + "name": "range to text", + "value": 2 + } + ], + "maxDataPoints": 100, + "maxPerRow": 6, + "nullPointMode": "null", + "nullText": null, + "postfix": "", + "postfixFontSize": "50%", + "prefix": "", + "prefixFontSize": "50%", + "rangeMaps": [ + { + "from": "null", + "text": "N/A", + "to": "null" + } + ], + "sparkline": { + "fillColor": "rgba(31, 118, 189, 0.18)", + "full": false, + "lineColor": "rgb(31, 120, 193)", + "show": false + }, + "tableColumn": "", + "targets": [ + { + "expr": "count(count(node_cpu_seconds_total{instance=\"$node\",job=\"$job\"}) by (cpu))", + "interval": "", + "intervalFactor": 1, + "legendFormat": "", + "refId": "A", + "step": 240 + } + ], + "thresholds": "", + "title": "CPU Cores", + "type": "singlestat", + "valueFontSize": "50%", + "valueMaps": [ + { + "op": "=", + "text": "N/A", + "value": "null" + } + ], + "valueName": "current" + }, + { + "cacheTimeout": null, + "colorBackground": false, + "colorValue": false, + "colors": [ + "rgba(245, 54, 54, 0.9)", + "rgba(237, 129, 40, 0.89)", + "rgba(50, 172, 45, 0.97)" + ], + "datasource": "${DS_PROMETHEUS}", + "decimals": 1, + "description": "System uptime", + "fieldConfig": { + "defaults": { + "custom": {} + }, + "overrides": [] + }, + "format": "s", + "gauge": { + "maxValue": 100, + "minValue": 0, + "show": false, + "thresholdLabels": false, + "thresholdMarkers": true + }, + "gridPos": { + "h": 2, + "w": 4, + "x": 20, + "y": 1 + }, + "hideTimeOverride": true, + "id": 15, + "interval": null, + "links": [], + "mappingType": 1, + "mappingTypes": [ + { + "$$hashKey": "object:1094", + "name": "value to text", + "value": 1 + }, + { + "$$hashKey": "object:1095", + "name": "range to text", + "value": 2 + } + ], + "maxDataPoints": 100, + "nullPointMode": "null", + "nullText": null, + "postfix": "s", + "postfixFontSize": "50%", + "prefix": "", + "prefixFontSize": "50%", + "rangeMaps": [ + { + "from": "null", + "text": "N/A", + "to": 
"null" + } + ], + "sparkline": { + "fillColor": "rgba(31, 118, 189, 0.18)", + "full": false, + "lineColor": "rgb(31, 120, 193)", + "show": false + }, + "tableColumn": "", + "targets": [ + { + "expr": "node_time_seconds{instance=\"$node\",job=\"$job\"} - node_boot_time_seconds{instance=\"$node\",job=\"$job\"}", + "intervalFactor": 1, + "refId": "A", + "step": 240 + } + ], + "thresholds": "", + "title": "Uptime", + "type": "singlestat", + "valueFontSize": "50%", + "valueMaps": [ + { + "$$hashKey": "object:1097", + "op": "=", + "text": "N/A", + "value": "null" + } + ], + "valueName": "current" + }, + { + "cacheTimeout": null, + "colorBackground": false, + "colorValue": false, + "colors": [ + "rgba(50, 172, 45, 0.97)", + "rgba(237, 129, 40, 0.89)", + "rgba(245, 54, 54, 0.9)" + ], + "datasource": "${DS_PROMETHEUS}", + "decimals": 0, + "description": "Total RootFS", + "fieldConfig": { + "defaults": { + "custom": {} + }, + "overrides": [] + }, + "format": "bytes", + "gauge": { + "maxValue": 100, + "minValue": 0, + "show": false, + "thresholdLabels": false, + "thresholdMarkers": true + }, + "gridPos": { + "h": 2, + "w": 2, + "x": 18, + "y": 3 + }, + "id": 23, + "interval": null, + "links": [], + "mappingType": 1, + "mappingTypes": [ + { + "name": "value to text", + "value": 1 + }, + { + "name": "range to text", + "value": 2 + } + ], + "maxDataPoints": 100, + "maxPerRow": 6, + "nullPointMode": "null", + "nullText": null, + "postfix": "", + "postfixFontSize": "50%", + "prefix": "", + "prefixFontSize": "50%", + "rangeMaps": [ + { + "from": "null", + "text": "N/A", + "to": "null" + } + ], + "sparkline": { + "fillColor": "rgba(31, 118, 189, 0.18)", + "full": false, + "lineColor": "rgb(31, 120, 193)", + "show": false + }, + "tableColumn": "", + "targets": [ + { + "expr": "node_filesystem_size_bytes{instance=\"$node\",job=\"$job\",mountpoint=\"/\",fstype!=\"rootfs\"}", + "format": "time_series", + "hide": false, + "intervalFactor": 1, + "refId": "A", + "step": 240 + } + ], + 
"thresholds": "70,90", + "title": "RootFS Total", + "type": "singlestat", + "valueFontSize": "50%", + "valueMaps": [ + { + "op": "=", + "text": "N/A", + "value": "null" + } + ], + "valueName": "current" + }, + { + "cacheTimeout": null, + "colorBackground": false, + "colorValue": false, + "colors": [ + "rgba(245, 54, 54, 0.9)", + "rgba(237, 129, 40, 0.89)", + "rgba(50, 172, 45, 0.97)" + ], + "datasource": "${DS_PROMETHEUS}", + "decimals": 0, + "description": "Total RAM", + "fieldConfig": { + "defaults": { + "custom": {} + }, + "overrides": [] + }, + "format": "bytes", + "gauge": { + "maxValue": 100, + "minValue": 0, + "show": false, + "thresholdLabels": false, + "thresholdMarkers": true + }, + "gridPos": { + "h": 2, + "w": 2, + "x": 20, + "y": 3 + }, + "id": 75, + "interval": null, + "links": [], + "mappingType": 1, + "mappingTypes": [ + { + "name": "value to text", + "value": 1 + }, + { + "name": "range to text", + "value": 2 + } + ], + "maxDataPoints": 100, + "maxPerRow": 6, + "nullPointMode": "null", + "nullText": null, + "postfix": "", + "postfixFontSize": "70%", + "prefix": "", + "prefixFontSize": "50%", + "rangeMaps": [ + { + "from": "null", + "text": "N/A", + "to": "null" + } + ], + "sparkline": { + "fillColor": "rgba(31, 118, 189, 0.18)", + "full": false, + "lineColor": "rgb(31, 120, 193)", + "show": false + }, + "tableColumn": "", + "targets": [ + { + "expr": "node_memory_MemTotal_bytes{instance=\"$node\",job=\"$job\"}", + "intervalFactor": 1, + "refId": "A", + "step": 240 + } + ], + "thresholds": "", + "title": "RAM Total", + "type": "singlestat", + "valueFontSize": "50%", + "valueMaps": [ + { + "op": "=", + "text": "N/A", + "value": "null" + } + ], + "valueName": "current" + }, + { + "cacheTimeout": null, + "colorBackground": false, + "colorValue": false, + "colors": [ + "rgba(245, 54, 54, 0.9)", + "rgba(237, 129, 40, 0.89)", + "rgba(50, 172, 45, 0.97)" + ], + "datasource": "${DS_PROMETHEUS}", + "decimals": 0, + "description": "Total SWAP", + 
"fieldConfig": { + "defaults": { + "custom": {} + }, + "overrides": [] + }, + "format": "bytes", + "gauge": { + "maxValue": 100, + "minValue": 0, + "show": false, + "thresholdLabels": false, + "thresholdMarkers": true + }, + "gridPos": { + "h": 2, + "w": 2, + "x": 22, + "y": 3 + }, + "id": 18, + "interval": null, + "links": [], + "mappingType": 1, + "mappingTypes": [ + { + "name": "value to text", + "value": 1 + }, + { + "name": "range to text", + "value": 2 + } + ], + "maxDataPoints": 100, + "maxPerRow": 6, + "nullPointMode": "null", + "nullText": null, + "postfix": "", + "postfixFontSize": "70%", + "prefix": "", + "prefixFontSize": "50%", + "rangeMaps": [ + { + "from": "null", + "text": "N/A", + "to": "null" + } + ], + "sparkline": { + "fillColor": "rgba(31, 118, 189, 0.18)", + "full": false, + "lineColor": "rgb(31, 120, 193)", + "show": false + }, + "tableColumn": "", + "targets": [ + { + "expr": "node_memory_SwapTotal_bytes{instance=\"$node\",job=\"$job\"}", + "intervalFactor": 1, + "refId": "A", + "step": 240 + } + ], + "thresholds": "", + "title": "SWAP Total", + "type": "singlestat", + "valueFontSize": "50%", + "valueMaps": [ + { + "op": "=", + "text": "N/A", + "value": "null" + } + ], + "valueName": "current" + }, + { + "collapsed": false, + "datasource": "${DS_PROMETHEUS}", + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 5 + }, + "id": 263, + "panels": [], + "repeat": null, + "title": "Basic CPU / Mem / Net / Disk", + "type": "row" + }, + { + "aliasColors": { + "Busy": "#EAB839", + "Busy Iowait": "#890F02", + "Busy other": "#1F78C1", + "Idle": "#052B51", + "Idle - Waiting for something to happen": "#052B51", + "guest": "#9AC48A", + "idle": "#052B51", + "iowait": "#EAB839", + "irq": "#BF1B00", + "nice": "#C15C17", + "softirq": "#E24D42", + "steal": "#FCE2DE", + "system": "#508642", + "user": "#5195CE" + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_PROMETHEUS}", + "decimals": 2, + "description": "Basic CPU info", + 
"fieldConfig": { + "defaults": { + "custom": {}, + "links": [] + }, + "overrides": [] + }, + "fill": 4, + "fillGradient": 0, + "gridPos": { + "h": 7, + "w": 12, + "x": 0, + "y": 6 + }, + "hiddenSeries": false, + "id": 77, + "legend": { + "alignAsTable": false, + "avg": false, + "current": false, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "sideWidth": 250, + "sort": null, + "sortDesc": null, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [], + "maxPerRow": 6, + "nullPointMode": "null", + "options": { + "alertThreshold": true + }, + "percentage": true, + "pluginVersion": "7.3.7", + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "alias": "Busy Iowait", + "color": "#890F02" + }, + { + "alias": "Idle", + "color": "#7EB26D" + }, + { + "alias": "Busy System", + "color": "#EAB839" + }, + { + "alias": "Busy User", + "color": "#0A437C" + }, + { + "alias": "Busy Other", + "color": "#6D1F62" + } + ], + "spaceLength": 10, + "stack": true, + "steppedLine": false, + "targets": [ + { + "expr": "sum by (instance)(rate(node_cpu_seconds_total{mode=\"system\",instance=\"$node\",job=\"$job\"}[$__rate_interval])) * 100", + "format": "time_series", + "hide": false, + "intervalFactor": 1, + "legendFormat": "Busy System", + "refId": "A", + "step": 240 + }, + { + "expr": "sum by (instance)(rate(node_cpu_seconds_total{mode='user',instance=\"$node\",job=\"$job\"}[$__rate_interval])) * 100", + "format": "time_series", + "hide": false, + "intervalFactor": 1, + "legendFormat": "Busy User", + "refId": "B", + "step": 240 + }, + { + "expr": "sum by (instance)(rate(node_cpu_seconds_total{mode='iowait',instance=\"$node\",job=\"$job\"}[$__rate_interval])) * 100", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "Busy Iowait", + "refId": "C", + "step": 240 + }, + { + "expr": "sum by 
(instance)(rate(node_cpu_seconds_total{mode=~\".*irq\",instance=\"$node\",job=\"$job\"}[$__rate_interval])) * 100", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "Busy IRQs", + "refId": "D", + "step": 240 + }, + { + "expr": "sum (rate(node_cpu_seconds_total{mode!='idle',mode!='user',mode!='system',mode!='iowait',mode!='irq',mode!='softirq',instance=\"$node\",job=\"$job\"}[$__rate_interval])) * 100", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "Busy Other", + "refId": "E", + "step": 240 + }, + { + "expr": "sum by (mode)(rate(node_cpu_seconds_total{mode='idle',instance=\"$node\",job=\"$job\"}[$__rate_interval])) * 100", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "Idle", + "refId": "F", + "step": 240 + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "CPU Basic", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:123", + "format": "short", + "label": "", + "logBase": 1, + "max": "100", + "min": "0", + "show": true + }, + { + "$$hashKey": "object:124", + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": { + "Apps": "#629E51", + "Buffers": "#614D93", + "Cache": "#6D1F62", + "Cached": "#511749", + "Committed": "#508642", + "Free": "#0A437C", + "Hardware Corrupted - Amount of RAM that the kernel identified as corrupted / not working": "#CFFAFF", + "Inactive": "#584477", + "PageTables": "#0A50A1", + "Page_Tables": "#0A50A1", + "RAM_Free": "#E0F9D7", + "SWAP Used": "#BF1B00", + "Slab": "#806EB7", + "Slab_Cache": "#E0752D", + "Swap": "#BF1B00", + "Swap Used": "#BF1B00", + "Swap_Cache": "#C15C17", + "Swap_Free": "#2F575E", + 
"Unused": "#EAB839" + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_PROMETHEUS}", + "decimals": 2, + "description": "Basic memory usage", + "fieldConfig": { + "defaults": { + "custom": {}, + "links": [] + }, + "overrides": [] + }, + "fill": 4, + "fillGradient": 0, + "gridPos": { + "h": 7, + "w": 12, + "x": 12, + "y": 6 + }, + "hiddenSeries": false, + "id": 78, + "legend": { + "alignAsTable": false, + "avg": false, + "current": false, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "sideWidth": 350, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [], + "maxPerRow": 6, + "nullPointMode": "null", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "7.3.7", + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "alias": "RAM Total", + "color": "#E0F9D7", + "fill": 0, + "stack": false + }, + { + "alias": "RAM Cache + Buffer", + "color": "#052B51" + }, + { + "alias": "RAM Free", + "color": "#7EB26D" + }, + { + "alias": "Avaliable", + "color": "#DEDAF7", + "fill": 0, + "stack": false + } + ], + "spaceLength": 10, + "stack": true, + "steppedLine": false, + "targets": [ + { + "expr": "node_memory_MemTotal_bytes{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "hide": false, + "intervalFactor": 1, + "legendFormat": "RAM Total", + "refId": "A", + "step": 240 + }, + { + "expr": "node_memory_MemTotal_bytes{instance=\"$node\",job=\"$job\"} - node_memory_MemFree_bytes{instance=\"$node\",job=\"$job\"} - (node_memory_Cached_bytes{instance=\"$node\",job=\"$job\"} + node_memory_Buffers_bytes{instance=\"$node\",job=\"$job\"} + node_memory_SReclaimable_bytes{instance=\"$node\",job=\"$job\"})", + "format": "time_series", + "hide": false, + "intervalFactor": 1, + "legendFormat": "RAM Used", + "refId": "B", + "step": 240 + }, + { + "expr": "node_memory_Cached_bytes{instance=\"$node\",job=\"$job\"} + 
node_memory_Buffers_bytes{instance=\"$node\",job=\"$job\"} + node_memory_SReclaimable_bytes{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "RAM Cache + Buffer", + "refId": "C", + "step": 240 + }, + { + "expr": "node_memory_MemFree_bytes{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "RAM Free", + "refId": "D", + "step": 240 + }, + { + "expr": "(node_memory_SwapTotal_bytes{instance=\"$node\",job=\"$job\"} - node_memory_SwapFree_bytes{instance=\"$node\",job=\"$job\"})", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "SWAP Used", + "refId": "E", + "step": 240 + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Memory Basic", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "bytes", + "label": "", + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": { + "Recv_bytes_eth2": "#7EB26D", + "Recv_bytes_lo": "#0A50A1", + "Recv_drop_eth2": "#6ED0E0", + "Recv_drop_lo": "#E0F9D7", + "Recv_errs_eth2": "#BF1B00", + "Recv_errs_lo": "#CCA300", + "Trans_bytes_eth2": "#7EB26D", + "Trans_bytes_lo": "#0A50A1", + "Trans_drop_eth2": "#6ED0E0", + "Trans_drop_lo": "#E0F9D7", + "Trans_errs_eth2": "#BF1B00", + "Trans_errs_lo": "#CCA300", + "recv_bytes_lo": "#0A50A1", + "recv_drop_eth0": "#99440A", + "recv_drop_lo": "#967302", + "recv_errs_eth0": "#BF1B00", + "recv_errs_lo": "#890F02", + "trans_bytes_eth0": "#7EB26D", + "trans_bytes_lo": "#0A50A1", + "trans_drop_eth0": "#99440A", + "trans_drop_lo": "#967302", + "trans_errs_eth0": 
"#BF1B00", + "trans_errs_lo": "#890F02" + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_PROMETHEUS}", + "description": "Basic network info per interface", + "fieldConfig": { + "defaults": { + "custom": {}, + "links": [] + }, + "overrides": [] + }, + "fill": 4, + "fillGradient": 0, + "gridPos": { + "h": 7, + "w": 12, + "x": 0, + "y": 13 + }, + "hiddenSeries": false, + "id": 74, + "legend": { + "alignAsTable": false, + "avg": false, + "current": false, + "hideEmpty": false, + "hideZero": false, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "sort": "current", + "sortDesc": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "7.3.7", + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "alias": "/.*trans.*/", + "transform": "negative-Y" + } + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "rate(node_network_receive_bytes_total{instance=\"$node\",job=\"$job\"}[$__rate_interval])*8", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "recv {{device}}", + "refId": "A", + "step": 240 + }, + { + "expr": "rate(node_network_transmit_bytes_total{instance=\"$node\",job=\"$job\"}[$__rate_interval])*8", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "trans {{device}} ", + "refId": "B", + "step": 240 + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Network Traffic Basic", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "bps", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + 
"format": "pps", + "label": "", + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_PROMETHEUS}", + "decimals": 3, + "description": "Disk space used of all filesystems mounted", + "fieldConfig": { + "defaults": { + "custom": {}, + "links": [] + }, + "overrides": [] + }, + "fill": 4, + "fillGradient": 0, + "gridPos": { + "h": 7, + "w": 12, + "x": 12, + "y": 13 + }, + "height": "", + "hiddenSeries": false, + "id": 152, + "legend": { + "alignAsTable": false, + "avg": false, + "current": false, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "sort": "current", + "sortDesc": false, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [], + "maxPerRow": 6, + "nullPointMode": "null", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "7.3.7", + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "100 - ((node_filesystem_avail_bytes{instance=\"$node\",job=\"$job\",device!~'rootfs'} * 100) / node_filesystem_size_bytes{instance=\"$node\",job=\"$job\",device!~'rootfs'})", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "{{mountpoint}}", + "refId": "A", + "step": 240 + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Disk Space Used Basic", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "percent", + "label": null, + "logBase": 1, + "max": "100", + "min": "0", + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 
1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "collapsed": true, + "datasource": "${DS_PROMETHEUS}", + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 20 + }, + "id": 265, + "panels": [ + { + "aliasColors": { + "Idle - Waiting for something to happen": "#052B51", + "guest": "#9AC48A", + "idle": "#052B51", + "iowait": "#EAB839", + "irq": "#BF1B00", + "nice": "#C15C17", + "softirq": "#E24D42", + "steal": "#FCE2DE", + "system": "#508642", + "user": "#5195CE" + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_PROMETHEUS}", + "decimals": 2, + "description": "", + "fieldConfig": { + "defaults": { + "custom": {}, + "links": [] + }, + "overrides": [] + }, + "fill": 4, + "fillGradient": 0, + "gridPos": { + "h": 12, + "w": 12, + "x": 0, + "y": 3 + }, + "hiddenSeries": false, + "id": 3, + "legend": { + "alignAsTable": true, + "avg": true, + "current": true, + "max": true, + "min": true, + "rightSide": false, + "show": true, + "sideWidth": 250, + "sort": null, + "sortDesc": null, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [], + "maxPerRow": 6, + "nullPointMode": "null", + "options": { + "alertThreshold": true + }, + "percentage": true, + "pluginVersion": "7.3.7", + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "seriesOverrides": [], + "spaceLength": 10, + "stack": true, + "steppedLine": false, + "targets": [ + { + "expr": "sum by (mode)(rate(node_cpu_seconds_total{mode=\"system\",instance=\"$node\",job=\"$job\"}[$__rate_interval])) * 100", + "format": "time_series", + "interval": "10s", + "intervalFactor": 1, + "legendFormat": "System - Processes executing in kernel mode", + "refId": "A", + "step": 240 + }, + { + "expr": "sum by (mode)(rate(node_cpu_seconds_total{mode='user',instance=\"$node\",job=\"$job\"}[$__rate_interval])) * 100", + "format": "time_series", + "intervalFactor": 1, + 
"legendFormat": "User - Normal processes executing in user mode", + "refId": "B", + "step": 240 + }, + { + "expr": "sum by (mode)(rate(node_cpu_seconds_total{mode='nice',instance=\"$node\",job=\"$job\"}[$__rate_interval])) * 100", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "Nice - Niced processes executing in user mode", + "refId": "C", + "step": 240 + }, + { + "expr": "sum by (mode)(rate(node_cpu_seconds_total{mode='idle',instance=\"$node\",job=\"$job\"}[$__rate_interval])) * 100", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "Idle - Waiting for something to happen", + "refId": "D", + "step": 240 + }, + { + "expr": "sum by (mode)(rate(node_cpu_seconds_total{mode='iowait',instance=\"$node\",job=\"$job\"}[$__rate_interval])) * 100", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "Iowait - Waiting for I/O to complete", + "refId": "E", + "step": 240 + }, + { + "expr": "sum by (mode)(rate(node_cpu_seconds_total{mode='irq',instance=\"$node\",job=\"$job\"}[$__rate_interval])) * 100", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "Irq - Servicing interrupts", + "refId": "F", + "step": 240 + }, + { + "expr": "sum by (mode)(rate(node_cpu_seconds_total{mode='softirq',instance=\"$node\",job=\"$job\"}[$__rate_interval])) * 100", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "Softirq - Servicing softirqs", + "refId": "G", + "step": 240 + }, + { + "expr": "sum by (mode)(rate(node_cpu_seconds_total{mode='steal',instance=\"$node\",job=\"$job\"}[$__rate_interval])) * 100", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "Steal - Time spent in other operating systems when running in a virtualized environment", + "refId": "H", + "step": 240 + }, + { + "expr": "sum by (mode)(rate(node_cpu_seconds_total{mode='guest',instance=\"$node\",job=\"$job\"}[$__rate_interval])) * 100", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "Guest - 
Time spent running a virtual CPU for a guest operating system", + "refId": "I", + "step": 240 + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "CPU", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "short", + "label": "percentage", + "logBase": 1, + "max": "100", + "min": "0", + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": { + "Apps": "#629E51", + "Buffers": "#614D93", + "Cache": "#6D1F62", + "Cached": "#511749", + "Committed": "#508642", + "Free": "#0A437C", + "Hardware Corrupted - Amount of RAM that the kernel identified as corrupted / not working": "#CFFAFF", + "Inactive": "#584477", + "PageTables": "#0A50A1", + "Page_Tables": "#0A50A1", + "RAM_Free": "#E0F9D7", + "Slab": "#806EB7", + "Slab_Cache": "#E0752D", + "Swap": "#BF1B00", + "Swap - Swap memory usage": "#BF1B00", + "Swap_Cache": "#C15C17", + "Swap_Free": "#2F575E", + "Unused": "#EAB839", + "Unused - Free memory unassigned": "#052B51" + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_PROMETHEUS}", + "decimals": 2, + "description": "", + "fieldConfig": { + "defaults": { + "custom": {}, + "links": [] + }, + "overrides": [] + }, + "fill": 4, + "fillGradient": 0, + "gridPos": { + "h": 12, + "w": 12, + "x": 12, + "y": 3 + }, + "hiddenSeries": false, + "id": 24, + "legend": { + "alignAsTable": true, + "avg": true, + "current": true, + "max": true, + "min": true, + "rightSide": false, + "show": true, + "sideWidth": 350, + "sort": null, + "sortDesc": null, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [], + "maxPerRow": 6, + "nullPointMode": 
"null", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "7.3.7", + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "alias": "/.*Hardware Corrupted - *./", + "stack": false + } + ], + "spaceLength": 10, + "stack": true, + "steppedLine": false, + "targets": [ + { + "expr": "node_memory_MemTotal_bytes{instance=\"$node\",job=\"$job\"} - node_memory_MemFree_bytes{instance=\"$node\",job=\"$job\"} - node_memory_Buffers_bytes{instance=\"$node\",job=\"$job\"} - node_memory_Cached_bytes{instance=\"$node\",job=\"$job\"} - node_memory_Slab_bytes{instance=\"$node\",job=\"$job\"} - node_memory_PageTables_bytes{instance=\"$node\",job=\"$job\"} - node_memory_SwapCached_bytes{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "hide": false, + "intervalFactor": 1, + "legendFormat": "Apps - Memory used by user-space applications", + "refId": "A", + "step": 240 + }, + { + "expr": "node_memory_PageTables_bytes{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "hide": false, + "intervalFactor": 1, + "legendFormat": "PageTables - Memory used to map between virtual and physical memory addresses", + "refId": "B", + "step": 240 + }, + { + "expr": "node_memory_SwapCached_bytes{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "SwapCache - Memory that keeps track of pages that have been fetched from swap but not yet been modified", + "refId": "C", + "step": 240 + }, + { + "expr": "node_memory_Slab_bytes{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "hide": false, + "intervalFactor": 1, + "legendFormat": "Slab - Memory used by the kernel to cache data structures for its own use (caches like inode, dentry, etc)", + "refId": "D", + "step": 240 + }, + { + "expr": "node_memory_Cached_bytes{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "hide": false, + "intervalFactor": 1, + "legendFormat": "Cache - Parked 
file data (file content) cache", + "refId": "E", + "step": 240 + }, + { + "expr": "node_memory_Buffers_bytes{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "hide": false, + "intervalFactor": 1, + "legendFormat": "Buffers - Block device (e.g. harddisk) cache", + "refId": "F", + "step": 240 + }, + { + "expr": "node_memory_MemFree_bytes{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "hide": false, + "intervalFactor": 1, + "legendFormat": "Unused - Free memory unassigned", + "refId": "G", + "step": 240 + }, + { + "expr": "(node_memory_SwapTotal_bytes{instance=\"$node\",job=\"$job\"} - node_memory_SwapFree_bytes{instance=\"$node\",job=\"$job\"})", + "format": "time_series", + "hide": false, + "intervalFactor": 1, + "legendFormat": "Swap - Swap space used", + "refId": "H", + "step": 240 + }, + { + "expr": "node_memory_HardwareCorrupted_bytes{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "hide": false, + "intervalFactor": 1, + "legendFormat": "Hardware Corrupted - Amount of RAM that the kernel identified as corrupted / not working", + "refId": "I", + "step": 240 + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Memory Stack", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "bytes", + "label": "bytes", + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": { + "receive_packets_eth0": "#7EB26D", + "receive_packets_lo": "#E24D42", + "transmit_packets_eth0": "#7EB26D", + "transmit_packets_lo": "#E24D42" + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": 
"${DS_PROMETHEUS}", + "fieldConfig": { + "defaults": { + "custom": {}, + "links": [] + }, + "overrides": [] + }, + "fill": 4, + "fillGradient": 0, + "gridPos": { + "h": 12, + "w": 12, + "x": 0, + "y": 15 + }, + "hiddenSeries": false, + "id": 84, + "legend": { + "alignAsTable": true, + "avg": true, + "current": true, + "max": true, + "min": true, + "rightSide": false, + "show": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "7.3.7", + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "$$hashKey": "object:5871", + "alias": "/.*Trans.*/", + "transform": "negative-Y" + } + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "rate(node_network_receive_bytes_total{instance=\"$node\",job=\"$job\"}[$__rate_interval])*8", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "{{device}} - Receive", + "refId": "A", + "step": 240 + }, + { + "expr": "rate(node_network_transmit_bytes_total{instance=\"$node\",job=\"$job\"}[$__rate_interval])*8", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "{{device}} - Transmit", + "refId": "B", + "step": 240 + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Network Traffic", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:5884", + "format": "bps", + "label": "bits out (-) / in (+)", + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "$$hashKey": "object:5885", + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ], + "yaxis": { + "align": false, + 
"alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_PROMETHEUS}", + "decimals": 3, + "description": "", + "fieldConfig": { + "defaults": { + "custom": {}, + "links": [] + }, + "overrides": [] + }, + "fill": 4, + "fillGradient": 0, + "gridPos": { + "h": 12, + "w": 12, + "x": 12, + "y": 15 + }, + "height": "", + "hiddenSeries": false, + "id": 156, + "legend": { + "alignAsTable": true, + "avg": true, + "current": true, + "max": true, + "min": true, + "rightSide": false, + "show": true, + "sort": "current", + "sortDesc": false, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [], + "maxPerRow": 6, + "nullPointMode": "null", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "7.3.7", + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "node_filesystem_size_bytes{instance=\"$node\",job=\"$job\",device!~'rootfs'} - node_filesystem_avail_bytes{instance=\"$node\",job=\"$job\",device!~'rootfs'}", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "{{mountpoint}}", + "refId": "A", + "step": 240 + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Disk Space Used", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "bytes", + "label": "bytes", + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + 
"datasource": "${DS_PROMETHEUS}", + "description": "", + "fieldConfig": { + "defaults": { + "custom": {}, + "links": [] + }, + "overrides": [] + }, + "fill": 2, + "fillGradient": 0, + "gridPos": { + "h": 12, + "w": 12, + "x": 0, + "y": 27 + }, + "hiddenSeries": false, + "id": 229, + "legend": { + "alignAsTable": true, + "avg": true, + "current": true, + "hideZero": true, + "max": true, + "min": true, + "rightSide": false, + "show": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [], + "maxPerRow": 6, + "nullPointMode": "null", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "7.3.7", + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "alias": "/.*Read.*/", + "transform": "negative-Y" + }, + { + "alias": "/.*sda_.*/", + "color": "#7EB26D" + }, + { + "alias": "/.*sdb_.*/", + "color": "#EAB839" + }, + { + "alias": "/.*sdc_.*/", + "color": "#6ED0E0" + }, + { + "alias": "/.*sdd_.*/", + "color": "#EF843C" + }, + { + "alias": "/.*sde_.*/", + "color": "#E24D42" + }, + { + "alias": "/.*sda1.*/", + "color": "#584477" + }, + { + "alias": "/.*sda2_.*/", + "color": "#BA43A9" + }, + { + "alias": "/.*sda3_.*/", + "color": "#F4D598" + }, + { + "alias": "/.*sdb1.*/", + "color": "#0A50A1" + }, + { + "alias": "/.*sdb2.*/", + "color": "#BF1B00" + }, + { + "alias": "/.*sdb2.*/", + "color": "#BF1B00" + }, + { + "alias": "/.*sdb3.*/", + "color": "#E0752D" + }, + { + "alias": "/.*sdc1.*/", + "color": "#962D82" + }, + { + "alias": "/.*sdc2.*/", + "color": "#614D93" + }, + { + "alias": "/.*sdc3.*/", + "color": "#9AC48A" + }, + { + "alias": "/.*sdd1.*/", + "color": "#65C5DB" + }, + { + "alias": "/.*sdd2.*/", + "color": "#F9934E" + }, + { + "alias": "/.*sdd3.*/", + "color": "#EA6460" + }, + { + "alias": "/.*sde1.*/", + "color": "#E0F9D7" + }, + { + "alias": "/.*sdd2.*/", + "color": "#FCEACA" + }, + { + "alias": "/.*sde3.*/", + "color": "#F9E2D2" + } + ], + "spaceLength": 
10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "rate(node_disk_reads_completed_total{instance=\"$node\",job=\"$job\",device=~\"$diskdevices\"}[$__rate_interval])", + "intervalFactor": 4, + "legendFormat": "{{device}} - Reads completed", + "refId": "A", + "step": 240 + }, + { + "expr": "rate(node_disk_writes_completed_total{instance=\"$node\",job=\"$job\",device=~\"$diskdevices\"}[$__rate_interval])", + "intervalFactor": 1, + "legendFormat": "{{device}} - Writes completed", + "refId": "B", + "step": 240 + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Disk IOps", + "tooltip": { + "shared": false, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "iops", + "label": "IO read (-) / write (+)", + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": { + "io time": "#890F02" + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_PROMETHEUS}", + "decimals": 3, + "description": "", + "fieldConfig": { + "defaults": { + "custom": {}, + "links": [] + }, + "overrides": [] + }, + "fill": 4, + "fillGradient": 0, + "gridPos": { + "h": 12, + "w": 12, + "x": 12, + "y": 27 + }, + "hiddenSeries": false, + "id": 42, + "legend": { + "alignAsTable": true, + "avg": true, + "current": true, + "max": true, + "min": true, + "rightSide": false, + "show": true, + "sort": null, + "sortDesc": null, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [], + "maxPerRow": 6, + "nullPointMode": "null", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "7.3.7", + "pointradius": 
5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "alias": "/.*read*./", + "transform": "negative-Y" + }, + { + "alias": "/.*sda.*/", + "color": "#7EB26D" + }, + { + "alias": "/.*sdb.*/", + "color": "#EAB839" + }, + { + "alias": "/.*sdc.*/", + "color": "#6ED0E0" + }, + { + "alias": "/.*sdd.*/", + "color": "#EF843C" + }, + { + "alias": "/.*sde.*/", + "color": "#E24D42" + } + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "rate(node_disk_read_bytes_total{instance=\"$node\",job=\"$job\",device=~\"$diskdevices\"}[$__rate_interval])", + "format": "time_series", + "hide": false, + "intervalFactor": 1, + "legendFormat": "{{device}} - Successfully read bytes", + "refId": "A", + "step": 240 + }, + { + "expr": "rate(node_disk_written_bytes_total{instance=\"$node\",job=\"$job\",device=~\"$diskdevices\"}[$__rate_interval])", + "format": "time_series", + "hide": false, + "intervalFactor": 1, + "legendFormat": "{{device}} - Successfully written bytes", + "refId": "B", + "step": 240 + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "I/O Usage Read / Write", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": false, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:965", + "format": "Bps", + "label": "bytes read (-) / write (+)", + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "$$hashKey": "object:966", + "format": "ms", + "label": "", + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": { + "io time": "#890F02" + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_PROMETHEUS}", + "decimals": 3, + "description": "", + "fieldConfig": { + "defaults": { + "custom": {}, + "links": [] 
+ }, + "overrides": [] + }, + "fill": 4, + "fillGradient": 0, + "gridPos": { + "h": 12, + "w": 12, + "x": 0, + "y": 39 + }, + "hiddenSeries": false, + "id": 127, + "legend": { + "alignAsTable": true, + "avg": true, + "current": true, + "max": true, + "min": true, + "rightSide": false, + "show": true, + "sort": null, + "sortDesc": null, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [], + "maxPerRow": 6, + "nullPointMode": "null", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "7.3.7", + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "rate(node_disk_io_time_seconds_total{instance=\"$node\",job=\"$job\",device=~\"$diskdevices\"} [$__rate_interval])", + "format": "time_series", + "hide": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "{{device}}", + "refId": "A", + "step": 240 + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "I/O Utilization", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": false, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:1041", + "format": "percentunit", + "label": "%util", + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "$$hashKey": "object:1042", + "format": "s", + "label": "", + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + } + ], + "repeat": null, + "title": "CPU / Memory / Net / Disk", + "type": "row" + }, + { + "collapsed": true, + "datasource": "${DS_PROMETHEUS}", + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 21 + }, + "id": 266, + "panels": [ + { + "aliasColors": { + "Apps": "#629E51", + "Buffers": "#614D93", + 
"Cache": "#6D1F62", + "Cached": "#511749", + "Committed": "#508642", + "Free": "#0A437C", + "Hardware Corrupted - Amount of RAM that the kernel identified as corrupted / not working": "#CFFAFF", + "Inactive": "#584477", + "PageTables": "#0A50A1", + "Page_Tables": "#0A50A1", + "RAM_Free": "#E0F9D7", + "Slab": "#806EB7", + "Slab_Cache": "#E0752D", + "Swap": "#BF1B00", + "Swap_Cache": "#C15C17", + "Swap_Free": "#2F575E", + "Unused": "#EAB839" + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_PROMETHEUS}", + "decimals": 2, + "fill": 2, + "fillGradient": 0, + "gridPos": { + "h": 10, + "w": 12, + "x": 0, + "y": 70 + }, + "hiddenSeries": false, + "id": 136, + "legend": { + "alignAsTable": true, + "avg": true, + "current": true, + "max": true, + "min": true, + "rightSide": false, + "show": true, + "sideWidth": 350, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [], + "maxPerRow": 2, + "nullPointMode": "null", + "options": { + "dataLinks": [] + }, + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": true, + "steppedLine": false, + "targets": [ + { + "expr": "node_memory_Inactive_bytes{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "Inactive - Memory which has been less recently used. 
It is more eligible to be reclaimed for other purposes", + "refId": "A", + "step": 240 + }, + { + "expr": "node_memory_Active_bytes{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "Active - Memory that has been used more recently and usually not reclaimed unless absolutely necessary", + "refId": "B", + "step": 240 + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Memory Active / Inactive", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "cumulative" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "bytes", + "label": "bytes", + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": { + "Apps": "#629E51", + "Buffers": "#614D93", + "Cache": "#6D1F62", + "Cached": "#511749", + "Committed": "#508642", + "Free": "#0A437C", + "Hardware Corrupted - Amount of RAM that the kernel identified as corrupted / not working": "#CFFAFF", + "Inactive": "#584477", + "PageTables": "#0A50A1", + "Page_Tables": "#0A50A1", + "RAM_Free": "#E0F9D7", + "Slab": "#806EB7", + "Slab_Cache": "#E0752D", + "Swap": "#BF1B00", + "Swap_Cache": "#C15C17", + "Swap_Free": "#2F575E", + "Unused": "#EAB839" + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_PROMETHEUS}", + "decimals": 2, + "fill": 2, + "fillGradient": 0, + "gridPos": { + "h": 10, + "w": 12, + "x": 12, + "y": 70 + }, + "hiddenSeries": false, + "id": 135, + "legend": { + "alignAsTable": true, + "avg": true, + "current": true, + "max": true, + "min": true, + "rightSide": false, + "show": true, + "sideWidth": 350, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + 
"links": [], + "maxPerRow": 6, + "nullPointMode": "null", + "options": { + "dataLinks": [] + }, + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "alias": "/.*Committed_AS - *./" + }, + { + "alias": "/.*CommitLimit - *./", + "color": "#BF1B00", + "fill": 0 + } + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "node_memory_Committed_AS_bytes{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "Committed_AS - Amount of memory presently allocated on the system", + "refId": "A", + "step": 240 + }, + { + "expr": "node_memory_CommitLimit_bytes{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "CommitLimit - Amount of memory currently available to be allocated on the system", + "refId": "B", + "step": 240 + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Memory Committed", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "cumulative" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "bytes", + "label": "bytes", + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": { + "Apps": "#629E51", + "Buffers": "#614D93", + "Cache": "#6D1F62", + "Cached": "#511749", + "Committed": "#508642", + "Free": "#0A437C", + "Hardware Corrupted - Amount of RAM that the kernel identified as corrupted / not working": "#CFFAFF", + "Inactive": "#584477", + "PageTables": "#0A50A1", + "Page_Tables": "#0A50A1", + "RAM_Free": "#E0F9D7", + "Slab": "#806EB7", + "Slab_Cache": "#E0752D", + "Swap": "#BF1B00", + "Swap_Cache": 
"#C15C17", + "Swap_Free": "#2F575E", + "Unused": "#EAB839" + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_PROMETHEUS}", + "decimals": 2, + "fill": 2, + "fillGradient": 0, + "gridPos": { + "h": 10, + "w": 12, + "x": 0, + "y": 80 + }, + "hiddenSeries": false, + "id": 191, + "legend": { + "alignAsTable": true, + "avg": true, + "current": true, + "max": true, + "min": true, + "rightSide": false, + "show": true, + "sideWidth": 350, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [], + "maxPerRow": 6, + "nullPointMode": "null", + "options": { + "dataLinks": [] + }, + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": true, + "steppedLine": false, + "targets": [ + { + "expr": "node_memory_Inactive_file_bytes{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "hide": false, + "intervalFactor": 1, + "legendFormat": "Inactive_file - File-backed memory on inactive LRU list", + "refId": "A", + "step": 240 + }, + { + "expr": "node_memory_Inactive_anon_bytes{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "hide": false, + "intervalFactor": 1, + "legendFormat": "Inactive_anon - Anonymous and swap cache on inactive LRU list, including tmpfs (shmem)", + "refId": "B", + "step": 240 + }, + { + "expr": "node_memory_Active_file_bytes{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "hide": false, + "intervalFactor": 1, + "legendFormat": "Active_file - File-backed memory on active LRU list", + "refId": "C", + "step": 240 + }, + { + "expr": "node_memory_Active_anon_bytes{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "hide": false, + "intervalFactor": 1, + "legendFormat": "Active_anon - Anonymous and swap cache on active least-recently-used (LRU) list, including tmpfs", + "refId": "D", + "step": 240 + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], 
+ "timeShift": null, + "title": "Memory Active / Inactive Detail", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "cumulative" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "bytes", + "label": "bytes", + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "format": "bytes", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": { + "Active": "#99440A", + "Buffers": "#58140C", + "Cache": "#6D1F62", + "Cached": "#511749", + "Committed": "#508642", + "Dirty": "#6ED0E0", + "Free": "#B7DBAB", + "Inactive": "#EA6460", + "Mapped": "#052B51", + "PageTables": "#0A50A1", + "Page_Tables": "#0A50A1", + "Slab_Cache": "#EAB839", + "Swap": "#BF1B00", + "Swap_Cache": "#C15C17", + "Total": "#511749", + "Total RAM": "#052B51", + "Total RAM + Swap": "#052B51", + "Total Swap": "#614D93", + "VmallocUsed": "#EA6460" + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_PROMETHEUS}", + "decimals": 2, + "fill": 2, + "fillGradient": 0, + "gridPos": { + "h": 10, + "w": 12, + "x": 12, + "y": 80 + }, + "hiddenSeries": false, + "id": 130, + "legend": { + "alignAsTable": true, + "avg": true, + "current": true, + "max": true, + "min": true, + "rightSide": false, + "show": true, + "sideWidth": null, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [], + "maxPerRow": 2, + "nullPointMode": "null", + "options": { + "dataLinks": [] + }, + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "node_memory_Writeback_bytes{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "Writeback - Memory 
which is actively being written back to disk", + "refId": "A", + "step": 240 + }, + { + "expr": "node_memory_WritebackTmp_bytes{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "WritebackTmp - Memory used by FUSE for temporary writeback buffers", + "refId": "B", + "step": 240 + }, + { + "expr": "node_memory_Dirty_bytes{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "Dirty - Memory which is waiting to get written back to the disk", + "refId": "C", + "step": 240 + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Memory Writeback and Dirty", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "bytes", + "label": "bytes", + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": { + "Apps": "#629E51", + "Buffers": "#614D93", + "Cache": "#6D1F62", + "Cached": "#511749", + "Committed": "#508642", + "Free": "#0A437C", + "Hardware Corrupted - Amount of RAM that the kernel identified as corrupted / not working": "#CFFAFF", + "Inactive": "#584477", + "PageTables": "#0A50A1", + "Page_Tables": "#0A50A1", + "RAM_Free": "#E0F9D7", + "Slab": "#806EB7", + "Slab_Cache": "#E0752D", + "Swap": "#BF1B00", + "Swap_Cache": "#C15C17", + "Swap_Free": "#2F575E", + "Unused": "#EAB839" + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_PROMETHEUS}", + "decimals": 2, + "fill": 2, + "fillGradient": 0, + "gridPos": { + "h": 10, + "w": 12, + "x": 0, + "y": 90 + }, + "hiddenSeries": false, + "id": 138, + "legend": { + "alignAsTable": 
true, + "avg": true, + "current": true, + "max": true, + "min": true, + "rightSide": false, + "show": true, + "sideWidth": 350, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [], + "maxPerRow": 6, + "nullPointMode": "null", + "options": { + "dataLinks": [] + }, + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "$$hashKey": "object:4131", + "alias": "ShmemHugePages - Memory used by shared memory (shmem) and tmpfs allocated with huge pages", + "fill": 0 + }, + { + "$$hashKey": "object:4138", + "alias": "ShmemHugePages - Memory used by shared memory (shmem) and tmpfs allocated with huge pages", + "fill": 0 + } + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "node_memory_Mapped_bytes{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "Mapped - Used memory in mapped pages files which have been mmapped, such as libraries", + "refId": "A", + "step": 240 + }, + { + "expr": "node_memory_Shmem_bytes{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "Shmem - Used shared memory (shared between several processes, thus including RAM disks)", + "refId": "B", + "step": 240 + }, + { + "expr": "node_memory_ShmemHugePages_bytes{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "interval": "", + "intervalFactor": 1, + "legendFormat": "ShmemHugePages - Memory used by shared memory (shmem) and tmpfs allocated with huge pages", + "refId": "C", + "step": 240 + }, + { + "expr": "node_memory_ShmemPmdMapped_bytes{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "interval": "", + "intervalFactor": 1, + "legendFormat": "ShmemPmdMapped - Amount of shared (shmem/tmpfs) memory backed by huge pages", + "refId": "D", + "step": 240 + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": 
null, + "title": "Memory Shared and Mapped", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "cumulative" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:4106", + "format": "bytes", + "label": "bytes", + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "$$hashKey": "object:4107", + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": { + "Active": "#99440A", + "Buffers": "#58140C", + "Cache": "#6D1F62", + "Cached": "#511749", + "Committed": "#508642", + "Dirty": "#6ED0E0", + "Free": "#B7DBAB", + "Inactive": "#EA6460", + "Mapped": "#052B51", + "PageTables": "#0A50A1", + "Page_Tables": "#0A50A1", + "Slab_Cache": "#EAB839", + "Swap": "#BF1B00", + "Swap_Cache": "#C15C17", + "Total": "#511749", + "Total RAM": "#052B51", + "Total RAM + Swap": "#052B51", + "Total Swap": "#614D93", + "VmallocUsed": "#EA6460" + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_PROMETHEUS}", + "decimals": 2, + "fill": 2, + "fillGradient": 0, + "gridPos": { + "h": 10, + "w": 12, + "x": 12, + "y": 90 + }, + "hiddenSeries": false, + "id": 131, + "legend": { + "alignAsTable": true, + "avg": true, + "current": true, + "max": true, + "min": true, + "rightSide": false, + "show": true, + "sideWidth": null, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [], + "maxPerRow": 2, + "nullPointMode": "null", + "options": { + "dataLinks": [] + }, + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": true, + "steppedLine": false, + "targets": [ + { + "expr": "node_memory_SUnreclaim_bytes{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "intervalFactor": 1, + 
"legendFormat": "SUnreclaim - Part of Slab, that cannot be reclaimed on memory pressure", + "refId": "A", + "step": 240 + }, + { + "expr": "node_memory_SReclaimable_bytes{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "SReclaimable - Part of Slab, that might be reclaimed, such as caches", + "refId": "B", + "step": 240 + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Memory Slab", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "bytes", + "label": "bytes", + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": { + "Active": "#99440A", + "Buffers": "#58140C", + "Cache": "#6D1F62", + "Cached": "#511749", + "Committed": "#508642", + "Dirty": "#6ED0E0", + "Free": "#B7DBAB", + "Inactive": "#EA6460", + "Mapped": "#052B51", + "PageTables": "#0A50A1", + "Page_Tables": "#0A50A1", + "Slab_Cache": "#EAB839", + "Swap": "#BF1B00", + "Swap_Cache": "#C15C17", + "Total": "#511749", + "Total RAM": "#052B51", + "Total RAM + Swap": "#052B51", + "VmallocUsed": "#EA6460" + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_PROMETHEUS}", + "decimals": 2, + "fill": 2, + "fillGradient": 0, + "gridPos": { + "h": 10, + "w": 12, + "x": 0, + "y": 100 + }, + "hiddenSeries": false, + "id": 70, + "legend": { + "alignAsTable": true, + "avg": true, + "current": true, + "max": true, + "min": true, + "rightSide": false, + "show": true, + "sideWidth": null, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [], + "maxPerRow": 6, + "nullPointMode": 
"null", + "options": { + "dataLinks": [] + }, + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "node_memory_VmallocChunk_bytes{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "hide": false, + "intervalFactor": 1, + "legendFormat": "VmallocChunk - Largest contiguous block of vmalloc area which is free", + "refId": "A", + "step": 240 + }, + { + "expr": "node_memory_VmallocTotal_bytes{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "hide": false, + "intervalFactor": 1, + "legendFormat": "VmallocTotal - Total size of vmalloc memory area", + "refId": "B", + "step": 240 + }, + { + "expr": "node_memory_VmallocUsed_bytes{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "hide": false, + "intervalFactor": 1, + "legendFormat": "VmallocUsed - Amount of vmalloc area which is used", + "refId": "C", + "step": 240 + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Memory Vmalloc", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "bytes", + "label": "bytes", + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": { + "Apps": "#629E51", + "Buffers": "#614D93", + "Cache": "#6D1F62", + "Cached": "#511749", + "Committed": "#508642", + "Free": "#0A437C", + "Hardware Corrupted - Amount of RAM that the kernel identified as corrupted / not working": "#CFFAFF", + "Inactive": "#584477", + "PageTables": "#0A50A1", + "Page_Tables": "#0A50A1", + "RAM_Free": "#E0F9D7", + 
"Slab": "#806EB7", + "Slab_Cache": "#E0752D", + "Swap": "#BF1B00", + "Swap_Cache": "#C15C17", + "Swap_Free": "#2F575E", + "Unused": "#EAB839" + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_PROMETHEUS}", + "decimals": 2, + "fill": 2, + "fillGradient": 0, + "gridPos": { + "h": 10, + "w": 12, + "x": 12, + "y": 100 + }, + "hiddenSeries": false, + "id": 159, + "legend": { + "alignAsTable": true, + "avg": true, + "current": true, + "max": true, + "min": true, + "rightSide": false, + "show": true, + "sideWidth": 350, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [], + "maxPerRow": 6, + "nullPointMode": "null", + "options": { + "dataLinks": [] + }, + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "node_memory_Bounce_bytes{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "Bounce - Memory used for block device bounce buffers", + "refId": "A", + "step": 240 + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Memory Bounce", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "cumulative" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "bytes", + "label": "bytes", + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": { + "Active": "#99440A", + "Buffers": "#58140C", + "Cache": "#6D1F62", + "Cached": "#511749", + "Committed": "#508642", + "Dirty": "#6ED0E0", + "Free": "#B7DBAB", + "Inactive": "#EA6460", + "Mapped": "#052B51", + 
"PageTables": "#0A50A1", + "Page_Tables": "#0A50A1", + "Slab_Cache": "#EAB839", + "Swap": "#BF1B00", + "Swap_Cache": "#C15C17", + "Total": "#511749", + "Total RAM": "#052B51", + "Total RAM + Swap": "#052B51", + "VmallocUsed": "#EA6460" + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_PROMETHEUS}", + "decimals": 2, + "fill": 2, + "fillGradient": 0, + "gridPos": { + "h": 10, + "w": 12, + "x": 0, + "y": 110 + }, + "hiddenSeries": false, + "id": 129, + "legend": { + "alignAsTable": true, + "avg": true, + "current": true, + "max": true, + "min": true, + "rightSide": false, + "show": true, + "sideWidth": null, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [], + "maxPerRow": 6, + "nullPointMode": "null", + "options": { + "dataLinks": [] + }, + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "alias": "/.*Inactive *./", + "transform": "negative-Y" + } + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "node_memory_AnonHugePages_bytes{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "AnonHugePages - Memory in anonymous huge pages", + "refId": "A", + "step": 240 + }, + { + "expr": "node_memory_AnonPages_bytes{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "AnonPages - Memory in user pages not backed by files", + "refId": "B", + "step": 240 + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Memory Anonymous", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "bytes", + "label": "bytes", + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + 
"format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": { + "Apps": "#629E51", + "Buffers": "#614D93", + "Cache": "#6D1F62", + "Cached": "#511749", + "Committed": "#508642", + "Free": "#0A437C", + "Hardware Corrupted - Amount of RAM that the kernel identified as corrupted / not working": "#CFFAFF", + "Inactive": "#584477", + "PageTables": "#0A50A1", + "Page_Tables": "#0A50A1", + "RAM_Free": "#E0F9D7", + "Slab": "#806EB7", + "Slab_Cache": "#E0752D", + "Swap": "#BF1B00", + "Swap_Cache": "#C15C17", + "Swap_Free": "#2F575E", + "Unused": "#EAB839" + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_PROMETHEUS}", + "decimals": 2, + "fill": 2, + "fillGradient": 0, + "gridPos": { + "h": 10, + "w": 12, + "x": 12, + "y": 110 + }, + "hiddenSeries": false, + "id": 160, + "legend": { + "alignAsTable": true, + "avg": true, + "current": true, + "max": true, + "min": true, + "rightSide": false, + "show": true, + "sideWidth": 350, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [], + "maxPerRow": 2, + "nullPointMode": "null", + "options": { + "dataLinks": [] + }, + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "node_memory_KernelStack_bytes{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "KernelStack - Kernel memory stack. 
This is not reclaimable", + "refId": "A", + "step": 240 + }, + { + "expr": "node_memory_Percpu_bytes{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "interval": "", + "intervalFactor": 1, + "legendFormat": "PerCPU - Per CPU memory allocated dynamically by loadable modules", + "refId": "B", + "step": 240 + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Memory Kernel / CPU", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "cumulative" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "bytes", + "label": "bytes", + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": { + "Active": "#99440A", + "Buffers": "#58140C", + "Cache": "#6D1F62", + "Cached": "#511749", + "Committed": "#508642", + "Dirty": "#6ED0E0", + "Free": "#B7DBAB", + "Inactive": "#EA6460", + "Mapped": "#052B51", + "PageTables": "#0A50A1", + "Page_Tables": "#0A50A1", + "Slab_Cache": "#EAB839", + "Swap": "#BF1B00", + "Swap_Cache": "#C15C17", + "Total": "#511749", + "Total RAM": "#806EB7", + "Total RAM + Swap": "#806EB7", + "VmallocUsed": "#EA6460" + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_PROMETHEUS}", + "decimals": 2, + "fill": 2, + "fillGradient": 0, + "gridPos": { + "h": 10, + "w": 12, + "x": 0, + "y": 120 + }, + "hiddenSeries": false, + "id": 140, + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "max": true, + "min": true, + "rightSide": false, + "show": true, + "sideWidth": null, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [], + "maxPerRow": 6, + "nullPointMode": "null", + "options": { + "dataLinks": [] + }, + 
"percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "node_memory_HugePages_Free{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "HugePages_Free - Huge pages in the pool that are not yet allocated", + "refId": "A", + "step": 240 + }, + { + "expr": "node_memory_HugePages_Rsvd{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "HugePages_Rsvd - Huge pages for which a commitment to allocate from the pool has been made, but no allocation has yet been made", + "refId": "B", + "step": 240 + }, + { + "expr": "node_memory_HugePages_Surp{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "HugePages_Surp - Huge pages in the pool above the value in /proc/sys/vm/nr_hugepages", + "refId": "C", + "step": 240 + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Memory HugePages Counter", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "short", + "label": "pages", + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "format": "short", + "label": "", + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": { + "Active": "#99440A", + "Buffers": "#58140C", + "Cache": "#6D1F62", + "Cached": "#511749", + "Committed": "#508642", + "Dirty": "#6ED0E0", + "Free": "#B7DBAB", + "Inactive": "#EA6460", + "Mapped": "#052B51", + "PageTables": "#0A50A1", + "Page_Tables": "#0A50A1", + "Slab_Cache": "#EAB839", + "Swap": "#BF1B00", + "Swap_Cache": "#C15C17", + 
"Total": "#511749", + "Total RAM": "#806EB7", + "Total RAM + Swap": "#806EB7", + "VmallocUsed": "#EA6460" + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_PROMETHEUS}", + "decimals": 2, + "fill": 2, + "fillGradient": 0, + "gridPos": { + "h": 10, + "w": 12, + "x": 12, + "y": 120 + }, + "hiddenSeries": false, + "id": 71, + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "max": true, + "min": true, + "rightSide": false, + "show": true, + "sideWidth": null, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [], + "maxPerRow": 2, + "nullPointMode": "null", + "options": { + "dataLinks": [] + }, + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "node_memory_HugePages_Total{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "HugePages - Total size of the pool of huge pages", + "refId": "A", + "step": 240 + }, + { + "expr": "node_memory_Hugepagesize_bytes{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "Hugepagesize - Huge Page size", + "refId": "B", + "step": 240 + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Memory HugePages Size", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "bytes", + "label": "bytes", + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "format": "short", + "label": "", + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": { + "Active": "#99440A", + 
"Buffers": "#58140C", + "Cache": "#6D1F62", + "Cached": "#511749", + "Committed": "#508642", + "Dirty": "#6ED0E0", + "Free": "#B7DBAB", + "Inactive": "#EA6460", + "Mapped": "#052B51", + "PageTables": "#0A50A1", + "Page_Tables": "#0A50A1", + "Slab_Cache": "#EAB839", + "Swap": "#BF1B00", + "Swap_Cache": "#C15C17", + "Total": "#511749", + "Total RAM": "#052B51", + "Total RAM + Swap": "#052B51", + "VmallocUsed": "#EA6460" + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_PROMETHEUS}", + "decimals": 2, + "fill": 2, + "fillGradient": 0, + "gridPos": { + "h": 10, + "w": 12, + "x": 0, + "y": 130 + }, + "hiddenSeries": false, + "id": 128, + "legend": { + "alignAsTable": true, + "avg": true, + "current": false, + "hideEmpty": false, + "hideZero": false, + "max": true, + "min": true, + "rightSide": false, + "show": true, + "sideWidth": null, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [], + "maxPerRow": 6, + "nullPointMode": "null", + "options": { + "dataLinks": [] + }, + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "node_memory_DirectMap1G_bytes{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "DirectMap1G - Amount of pages mapped as this size", + "refId": "A", + "step": 240 + }, + { + "expr": "node_memory_DirectMap2M_bytes{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "interval": "", + "intervalFactor": 1, + "legendFormat": "DirectMap2M - Amount of pages mapped as this size", + "refId": "B", + "step": 240 + }, + { + "expr": "node_memory_DirectMap4k_bytes{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "interval": "", + "intervalFactor": 1, + "legendFormat": "DirectMap4K - Amount of pages mapped as this size", + "refId": "C", + "step": 240 + } + ], + "thresholds": 
[], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Memory DirectMap", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "bytes", + "label": "bytes", + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": { + "Apps": "#629E51", + "Buffers": "#614D93", + "Cache": "#6D1F62", + "Cached": "#511749", + "Committed": "#508642", + "Free": "#0A437C", + "Hardware Corrupted - Amount of RAM that the kernel identified as corrupted / not working": "#CFFAFF", + "Inactive": "#584477", + "PageTables": "#0A50A1", + "Page_Tables": "#0A50A1", + "RAM_Free": "#E0F9D7", + "Slab": "#806EB7", + "Slab_Cache": "#E0752D", + "Swap": "#BF1B00", + "Swap_Cache": "#C15C17", + "Swap_Free": "#2F575E", + "Unused": "#EAB839" + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_PROMETHEUS}", + "decimals": 2, + "fill": 2, + "fillGradient": 0, + "gridPos": { + "h": 10, + "w": 12, + "x": 12, + "y": 130 + }, + "hiddenSeries": false, + "id": 137, + "legend": { + "alignAsTable": true, + "avg": true, + "current": true, + "max": true, + "min": true, + "rightSide": false, + "show": true, + "sideWidth": 350, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [], + "maxPerRow": 6, + "nullPointMode": "null", + "options": { + "dataLinks": [] + }, + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "node_memory_Unevictable_bytes{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + 
"intervalFactor": 1, + "legendFormat": "Unevictable - Amount of unevictable memory that can't be swapped out for a variety of reasons", + "refId": "A", + "step": 240 + }, + { + "expr": "node_memory_Mlocked_bytes{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "MLocked - Size of pages locked to memory using the mlock() system call", + "refId": "B", + "step": 240 + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Memory Unevictable and MLocked", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "cumulative" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "bytes", + "label": "bytes", + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": { + "Active": "#99440A", + "Buffers": "#58140C", + "Cache": "#6D1F62", + "Cached": "#511749", + "Committed": "#508642", + "Dirty": "#6ED0E0", + "Free": "#B7DBAB", + "Inactive": "#EA6460", + "Mapped": "#052B51", + "PageTables": "#0A50A1", + "Page_Tables": "#0A50A1", + "Slab_Cache": "#EAB839", + "Swap": "#BF1B00", + "Swap_Cache": "#C15C17", + "Total": "#511749", + "Total RAM": "#052B51", + "Total RAM + Swap": "#052B51", + "Total Swap": "#614D93", + "VmallocUsed": "#EA6460" + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_PROMETHEUS}", + "decimals": 2, + "fill": 2, + "fillGradient": 0, + "gridPos": { + "h": 10, + "w": 12, + "x": 0, + "y": 140 + }, + "hiddenSeries": false, + "id": 132, + "legend": { + "alignAsTable": true, + "avg": true, + "current": true, + "max": true, + "min": true, + "rightSide": false, + "show": true, + "sideWidth": null, + "total": false, + "values": true + }, 
+ "lines": true, + "linewidth": 1, + "links": [], + "maxPerRow": 6, + "nullPointMode": "null", + "options": { + "dataLinks": [] + }, + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "node_memory_NFS_Unstable_bytes{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "NFS Unstable - Memory in NFS pages sent to the server, but not yet commited to the storage", + "refId": "A", + "step": 240 + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Memory NFS", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "bytes", + "label": "bytes", + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + } + ], + "repeat": null, + "title": "Memory Meminfo", + "type": "row" + }, + { + "collapsed": true, + "datasource": "${DS_PROMETHEUS}", + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 22 + }, + "id": 267, + "panels": [ + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_PROMETHEUS}", + "fill": 2, + "fillGradient": 0, + "gridPos": { + "h": 10, + "w": 12, + "x": 0, + "y": 23 + }, + "hiddenSeries": false, + "id": 176, + "legend": { + "alignAsTable": true, + "avg": true, + "current": true, + "max": true, + "min": true, + "rightSide": false, + "show": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [], + "maxPerRow": 6, + "nullPointMode": "null", + "options": { + "dataLinks": [] + }, + 
"percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "alias": "/.*out/", + "transform": "negative-Y" + } + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "rate(node_vmstat_pgpgin{instance=\"$node\",job=\"$job\"}[$__rate_interval])", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "Pagesin - Page in operations", + "refId": "A", + "step": 240 + }, + { + "expr": "rate(node_vmstat_pgpgout{instance=\"$node\",job=\"$job\"}[$__rate_interval])", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "Pagesout - Page out operations", + "refId": "B", + "step": 240 + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Memory Pages In / Out", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "short", + "label": "pages out (-) / in (+)", + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_PROMETHEUS}", + "fill": 2, + "fillGradient": 0, + "gridPos": { + "h": 10, + "w": 12, + "x": 12, + "y": 23 + }, + "hiddenSeries": false, + "id": 22, + "legend": { + "alignAsTable": true, + "avg": true, + "current": true, + "max": true, + "min": true, + "rightSide": false, + "show": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [], + "maxPerRow": 6, + "nullPointMode": "null", + "options": { + "dataLinks": [] + }, + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + 
"seriesOverrides": [ + { + "alias": "/.*out/", + "transform": "negative-Y" + } + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "rate(node_vmstat_pswpin{instance=\"$node\",job=\"$job\"}[$__rate_interval])", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "Pswpin - Pages swapped in", + "refId": "A", + "step": 240 + }, + { + "expr": "rate(node_vmstat_pswpout{instance=\"$node\",job=\"$job\"}[$__rate_interval])", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "Pswpout - Pages swapped out", + "refId": "B", + "step": 240 + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Memory Pages Swap In / Out", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "short", + "label": "pages out (-) / in (+)", + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": { + "Apps": "#629E51", + "Buffers": "#614D93", + "Cache": "#6D1F62", + "Cached": "#511749", + "Committed": "#508642", + "Free": "#0A437C", + "Hardware Corrupted - Amount of RAM that the kernel identified as corrupted / not working": "#CFFAFF", + "Inactive": "#584477", + "PageTables": "#0A50A1", + "Page_Tables": "#0A50A1", + "RAM_Free": "#E0F9D7", + "Slab": "#806EB7", + "Slab_Cache": "#E0752D", + "Swap": "#BF1B00", + "Swap_Cache": "#C15C17", + "Swap_Free": "#2F575E", + "Unused": "#EAB839" + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_PROMETHEUS}", + "decimals": 2, + "fill": 2, + "fillGradient": 0, + "gridPos": { + "h": 10, + "w": 12, + "x": 0, + "y": 33 + }, + "hiddenSeries": 
false, + "id": 175, + "legend": { + "alignAsTable": true, + "avg": true, + "current": true, + "max": true, + "min": true, + "rightSide": false, + "show": true, + "sideWidth": 350, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [], + "maxPerRow": 6, + "nullPointMode": "null", + "options": { + "dataLinks": [] + }, + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "$$hashKey": "object:6118", + "alias": "Pgfault - Page major and minor fault operations", + "fill": 0, + "stack": false + } + ], + "spaceLength": 10, + "stack": true, + "steppedLine": false, + "targets": [ + { + "expr": "rate(node_vmstat_pgfault{instance=\"$node\",job=\"$job\"}[$__rate_interval])", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "Pgfault - Page major and minor fault operations", + "refId": "A", + "step": 240 + }, + { + "expr": "rate(node_vmstat_pgmajfault{instance=\"$node\",job=\"$job\"}[$__rate_interval])", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "Pgmajfault - Major page fault operations", + "refId": "B", + "step": 240 + }, + { + "expr": "rate(node_vmstat_pgfault{instance=\"$node\",job=\"$job\"}[$__rate_interval]) - rate(node_vmstat_pgmajfault{instance=\"$node\",job=\"$job\"}[$__rate_interval])", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "Pgminfault - Minor page fault operations", + "refId": "C", + "step": 240 + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Memory Page Faults", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "cumulative" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:6133", + "format": "short", + "label": "faults", + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "$$hashKey": "object:6134", + 
"format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": { + "Active": "#99440A", + "Buffers": "#58140C", + "Cache": "#6D1F62", + "Cached": "#511749", + "Committed": "#508642", + "Dirty": "#6ED0E0", + "Free": "#B7DBAB", + "Inactive": "#EA6460", + "Mapped": "#052B51", + "PageTables": "#0A50A1", + "Page_Tables": "#0A50A1", + "Slab_Cache": "#EAB839", + "Swap": "#BF1B00", + "Swap_Cache": "#C15C17", + "Total": "#511749", + "Total RAM": "#052B51", + "Total RAM + Swap": "#052B51", + "Total Swap": "#614D93", + "VmallocUsed": "#EA6460" + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_PROMETHEUS}", + "decimals": 2, + "fill": 2, + "fillGradient": 0, + "gridPos": { + "h": 10, + "w": 12, + "x": 12, + "y": 33 + }, + "hiddenSeries": false, + "id": 307, + "legend": { + "alignAsTable": true, + "avg": true, + "current": true, + "max": true, + "min": true, + "rightSide": false, + "show": true, + "sideWidth": null, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [], + "maxPerRow": 6, + "nullPointMode": "null", + "options": { + "dataLinks": [] + }, + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "rate(node_vmstat_oom_kill{instance=\"$node\",job=\"$job\"}[$__rate_interval])", + "format": "time_series", + "interval": "", + "intervalFactor": 1, + "legendFormat": "oom killer invocations ", + "refId": "A", + "step": 240 + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "OOM Killer", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + 
"yaxes": [ + { + "$$hashKey": "object:5373", + "format": "short", + "label": "counter", + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "$$hashKey": "object:5374", + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + } + ], + "repeat": null, + "title": "Memory Vmstat", + "type": "row" + }, + { + "collapsed": true, + "datasource": "${DS_PROMETHEUS}", + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 23 + }, + "id": 293, + "panels": [ + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_PROMETHEUS}", + "description": "", + "fill": 2, + "fillGradient": 0, + "gridPos": { + "h": 10, + "w": 12, + "x": 0, + "y": 24 + }, + "hiddenSeries": false, + "id": 260, + "legend": { + "alignAsTable": true, + "avg": true, + "current": true, + "max": true, + "min": true, + "show": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "options": { + "dataLinks": [] + }, + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "alias": "/.*Variation*./", + "color": "#890F02" + } + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "node_timex_estimated_error_seconds{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "hide": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "Estimated error in seconds", + "refId": "A", + "step": 240 + }, + { + "expr": "node_timex_offset_seconds{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "hide": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "Time offset in between local system and reference clock", + "refId": "B", + "step": 240 + }, + { + "expr": "node_timex_maxerror_seconds{instance=\"$node\",job=\"$job\"}", + "format": 
"time_series", + "hide": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "Maximum error in seconds", + "refId": "C", + "step": 240 + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Time Syncronized Drift", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "s", + "label": "seconds", + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": "counter", + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_PROMETHEUS}", + "description": "", + "fill": 2, + "fillGradient": 0, + "gridPos": { + "h": 10, + "w": 12, + "x": 12, + "y": 24 + }, + "hiddenSeries": false, + "id": 291, + "legend": { + "alignAsTable": true, + "avg": true, + "current": true, + "max": true, + "min": true, + "show": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "options": { + "dataLinks": [] + }, + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "node_timex_loop_time_constant{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "interval": "", + "intervalFactor": 1, + "legendFormat": "Phase-locked loop time adjust", + "refId": "A", + "step": 240 + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Time PLL Adjust", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + 
"mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "short", + "label": "counter", + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_PROMETHEUS}", + "description": "", + "fill": 2, + "fillGradient": 0, + "gridPos": { + "h": 10, + "w": 12, + "x": 0, + "y": 34 + }, + "hiddenSeries": false, + "id": 168, + "legend": { + "alignAsTable": true, + "avg": true, + "current": true, + "max": true, + "min": true, + "show": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "options": { + "dataLinks": [] + }, + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "alias": "/.*Variation*./", + "color": "#890F02" + } + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "node_timex_sync_status{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "interval": "", + "intervalFactor": 1, + "legendFormat": "Is clock synchronized to a reliable server (1 = yes, 0 = no)", + "refId": "A", + "step": 240 + }, + { + "expr": "node_timex_frequency_adjustment_ratio{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "interval": "", + "intervalFactor": 1, + "legendFormat": "Local clock frequency adjustment", + "refId": "B", + "step": 240 + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Time Syncronized Status", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + 
"yaxes": [ + { + "format": "short", + "label": "counter", + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_PROMETHEUS}", + "description": "", + "fill": 2, + "fillGradient": 0, + "gridPos": { + "h": 10, + "w": 12, + "x": 12, + "y": 34 + }, + "hiddenSeries": false, + "id": 294, + "legend": { + "alignAsTable": true, + "avg": true, + "current": true, + "max": true, + "min": true, + "show": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "options": { + "dataLinks": [] + }, + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "node_timex_tick_seconds{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "interval": "", + "intervalFactor": 1, + "legendFormat": "Seconds between clock ticks", + "refId": "A", + "step": 240 + }, + { + "expr": "node_timex_tai_offset_seconds{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "interval": "", + "intervalFactor": 1, + "legendFormat": "International Atomic Time (TAI) offset", + "refId": "B", + "step": 240 + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Time Misc", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "s", + "label": "seconds", + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": 
null, + "min": null, + "show": false + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + } + ], + "title": "System Timesync", + "type": "row" + }, + { + "collapsed": true, + "datasource": "${DS_PROMETHEUS}", + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 24 + }, + "id": 312, + "panels": [ + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_PROMETHEUS}", + "fill": 2, + "fillGradient": 0, + "gridPos": { + "h": 10, + "w": 12, + "x": 0, + "y": 7 + }, + "hiddenSeries": false, + "id": 62, + "legend": { + "alignAsTable": true, + "avg": true, + "current": true, + "max": true, + "min": true, + "show": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [], + "maxPerRow": 6, + "nullPointMode": "null", + "options": { + "dataLinks": [] + }, + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "node_procs_blocked{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "Processes blocked waiting for I/O to complete", + "refId": "A", + "step": 240 + }, + { + "expr": "node_procs_running{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "Processes in runnable state", + "refId": "B", + "step": 240 + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Processes Status", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:6500", + "format": "short", + "label": "counter", + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "$$hashKey": "object:6501", + "format": "short", + 
"label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_PROMETHEUS}", + "fill": 2, + "fillGradient": 0, + "gridPos": { + "h": 10, + "w": 12, + "x": 12, + "y": 7 + }, + "hiddenSeries": false, + "id": 315, + "legend": { + "alignAsTable": true, + "avg": true, + "current": true, + "max": true, + "min": true, + "show": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [], + "maxPerRow": 6, + "nullPointMode": "null", + "options": { + "dataLinks": [] + }, + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": true, + "steppedLine": false, + "targets": [ + { + "expr": "node_processes_state{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "interval": "", + "intervalFactor": 1, + "legendFormat": "{{ state }}", + "refId": "A", + "step": 240 + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Processes State", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:6500", + "format": "short", + "label": "counter", + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "$$hashKey": "object:6501", + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_PROMETHEUS}", + "fill": 2, + "fillGradient": 0, + "gridPos": { + "h": 10, + "w": 12, + "x": 0, + "y": 17 + }, + "hiddenSeries": false, + 
"id": 148, + "legend": { + "alignAsTable": true, + "avg": true, + "current": true, + "max": true, + "min": true, + "show": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [], + "maxPerRow": 6, + "nullPointMode": "null", + "options": { + "dataLinks": [] + }, + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "rate(node_forks_total{instance=\"$node\",job=\"$job\"}[$__rate_interval])", + "format": "time_series", + "hide": false, + "intervalFactor": 1, + "legendFormat": "Processes forks second", + "refId": "A", + "step": 240 + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Processes Forks", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:6640", + "format": "short", + "label": "forks / sec", + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "$$hashKey": "object:6641", + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_PROMETHEUS}", + "fill": 2, + "fillGradient": 0, + "gridPos": { + "h": 10, + "w": 12, + "x": 12, + "y": 17 + }, + "hiddenSeries": false, + "id": 149, + "legend": { + "alignAsTable": true, + "avg": true, + "current": true, + "max": true, + "min": true, + "show": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "options": { + "dataLinks": [] + }, + "percentage": false, + "pointradius": 5, + "points": false, + 
"renderer": "flot", + "seriesOverrides": [ + { + "alias": "/.*Max.*/", + "fill": 0 + } + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "rate(process_virtual_memory_bytes{instance=\"$node\",job=\"$job\"}[$__rate_interval])", + "hide": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "Processes virtual memory size in bytes", + "refId": "A", + "step": 240 + }, + { + "expr": "process_resident_memory_max_bytes{instance=\"$node\",job=\"$job\"}", + "hide": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "Maximum amount of virtual memory available in bytes", + "refId": "B", + "step": 240 + }, + { + "expr": "rate(process_virtual_memory_bytes{instance=\"$node\",job=\"$job\"}[$__rate_interval])", + "hide": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "Processes virtual memory size in bytes", + "refId": "C", + "step": 240 + }, + { + "expr": "rate(process_virtual_memory_max_bytes{instance=\"$node\",job=\"$job\"}[$__rate_interval])", + "hide": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "Maximum amount of virtual memory available in bytes", + "refId": "D", + "step": 240 + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Processes Memory", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "decbytes", + "label": "bytes", + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_PROMETHEUS}", + "fill": 2, + "fillGradient": 0, + "gridPos": { + "h": 10, 
+ "w": 12, + "x": 0, + "y": 27 + }, + "hiddenSeries": false, + "id": 313, + "legend": { + "alignAsTable": true, + "avg": true, + "current": true, + "max": true, + "min": true, + "show": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [], + "maxPerRow": 6, + "nullPointMode": "null", + "options": { + "dataLinks": [] + }, + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "$$hashKey": "object:709", + "alias": "PIDs limit", + "color": "#F2495C", + "fill": 0 + } + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "node_processes_pids{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "interval": "", + "intervalFactor": 1, + "legendFormat": "Number of PIDs", + "refId": "A", + "step": 240 + }, + { + "expr": "node_processes_max_processes{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "interval": "", + "intervalFactor": 1, + "legendFormat": "PIDs limit", + "refId": "B", + "step": 240 + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "PIDs Number and Limit", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:6500", + "format": "short", + "label": "counter", + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "$$hashKey": "object:6501", + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_PROMETHEUS}", + "fill": 2, + "fillGradient": 0, + "gridPos": { + "h": 10, + "w": 12, + "x": 12, + "y": 27 + }, + "hiddenSeries": false, + 
"id": 305, + "legend": { + "alignAsTable": true, + "avg": true, + "current": true, + "max": true, + "min": true, + "show": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [], + "maxPerRow": 6, + "nullPointMode": "null", + "options": { + "dataLinks": [] + }, + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "$$hashKey": "object:4963", + "alias": "/.*waiting.*/", + "transform": "negative-Y" + } + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "rate(node_schedstat_running_seconds_total{instance=\"$node\",job=\"$job\"}[$__rate_interval])", + "format": "time_series", + "interval": "", + "intervalFactor": 1, + "legendFormat": "CPU {{ cpu }} - seconds spent running a process", + "refId": "A", + "step": 240 + }, + { + "expr": "rate(node_schedstat_waiting_seconds_total{instance=\"$node\",job=\"$job\"}[$__rate_interval])", + "format": "time_series", + "interval": "", + "intervalFactor": 1, + "legendFormat": "CPU {{ cpu }} - seconds spent by processing waiting for this CPU", + "refId": "B", + "step": 240 + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Process schedule stats Running / Waiting", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:4860", + "format": "s", + "label": "seconds", + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "$$hashKey": "object:4861", + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_PROMETHEUS}", + "fill": 2, + 
"fillGradient": 0, + "gridPos": { + "h": 10, + "w": 12, + "x": 12, + "y": 37 + }, + "hiddenSeries": false, + "id": 314, + "legend": { + "alignAsTable": true, + "avg": true, + "current": true, + "max": true, + "min": true, + "show": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [], + "maxPerRow": 6, + "nullPointMode": "null", + "options": { + "dataLinks": [] + }, + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "$$hashKey": "object:709", + "alias": "Threads limit", + "color": "#F2495C", + "fill": 0 + } + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "node_processes_threads{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "interval": "", + "intervalFactor": 1, + "legendFormat": "Allocated threads", + "refId": "A", + "step": 240 + }, + { + "expr": "node_processes_max_threads{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "interval": "", + "intervalFactor": 1, + "legendFormat": "Threads limit", + "refId": "B", + "step": 240 + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Threads Number and Limit", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:6500", + "format": "short", + "label": "counter", + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "$$hashKey": "object:6501", + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + } + ], + "title": "System Processes", + "type": "row" + }, + { + "collapsed": true, + "datasource": "${DS_PROMETHEUS}", + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 25 + }, + 
"id": 269, + "panels": [ + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_PROMETHEUS}", + "fill": 2, + "fillGradient": 0, + "gridPos": { + "h": 10, + "w": 12, + "x": 0, + "y": 8 + }, + "hiddenSeries": false, + "id": 8, + "legend": { + "alignAsTable": true, + "avg": true, + "current": true, + "max": true, + "min": true, + "show": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [], + "maxPerRow": 6, + "nullPointMode": "null", + "options": { + "dataLinks": [] + }, + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "rate(node_context_switches_total{instance=\"$node\",job=\"$job\"}[$__rate_interval])", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "Context switches", + "refId": "A", + "step": 240 + }, + { + "expr": "rate(node_intr_total{instance=\"$node\",job=\"$job\"}[$__rate_interval])", + "format": "time_series", + "hide": false, + "intervalFactor": 1, + "legendFormat": "Interrupts", + "refId": "B", + "step": 240 + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Context Switches / Interrupts", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "short", + "label": "counter", + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_PROMETHEUS}", + "fill": 2, + "fillGradient": 0, 
+ "gridPos": { + "h": 10, + "w": 12, + "x": 12, + "y": 8 + }, + "hiddenSeries": false, + "id": 7, + "legend": { + "alignAsTable": true, + "avg": true, + "current": true, + "max": true, + "min": true, + "show": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [], + "maxPerRow": 6, + "nullPointMode": "null", + "options": { + "dataLinks": [] + }, + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "node_load1{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "intervalFactor": 4, + "legendFormat": "Load 1m", + "refId": "A", + "step": 240 + }, + { + "expr": "node_load5{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "intervalFactor": 4, + "legendFormat": "Load 5m", + "refId": "B", + "step": 240 + }, + { + "expr": "node_load15{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "intervalFactor": 4, + "legendFormat": "Load 15m", + "refId": "C", + "step": 240 + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "System Load", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:6261", + "format": "short", + "label": "counter", + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "$$hashKey": "object:6262", + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_PROMETHEUS}", + "fill": 2, + "fillGradient": 0, + "gridPos": { + "h": 10, + "w": 12, + "x": 0, + "y": 
18 + }, + "hiddenSeries": false, + "id": 259, + "legend": { + "alignAsTable": true, + "avg": true, + "current": true, + "max": true, + "min": true, + "show": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "options": { + "dataLinks": [] + }, + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "alias": "/.*Critical*./", + "color": "#E24D42", + "fill": 0 + }, + { + "alias": "/.*Max*./", + "color": "#EF843C", + "fill": 0 + } + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "rate(node_interrupts_total{instance=\"$node\",job=\"$job\"}[$__rate_interval])", + "format": "time_series", + "interval": "", + "intervalFactor": 1, + "legendFormat": "{{ type }} - {{ info }}", + "refId": "A", + "step": 240 + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Interrupts Detail", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "short", + "label": "counter", + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_PROMETHEUS}", + "fill": 2, + "fillGradient": 0, + "gridPos": { + "h": 10, + "w": 12, + "x": 12, + "y": 18 + }, + "hiddenSeries": false, + "id": 306, + "legend": { + "alignAsTable": true, + "avg": true, + "current": true, + "max": true, + "min": true, + "show": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [], + "maxPerRow": 6, + 
"nullPointMode": "null", + "options": { + "dataLinks": [] + }, + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "rate(node_schedstat_timeslices_total{instance=\"$node\",job=\"$job\"}[$__rate_interval])", + "format": "time_series", + "interval": "", + "intervalFactor": 1, + "legendFormat": "CPU {{ cpu }}", + "refId": "A", + "step": 240 + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Schedule timeslices executed by each cpu", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:4860", + "format": "short", + "label": "counter", + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "$$hashKey": "object:4861", + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_PROMETHEUS}", + "fill": 2, + "fillGradient": 0, + "gridPos": { + "h": 10, + "w": 12, + "x": 0, + "y": 28 + }, + "hiddenSeries": false, + "id": 151, + "legend": { + "alignAsTable": true, + "avg": true, + "current": true, + "max": true, + "min": true, + "show": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [], + "maxPerRow": 6, + "nullPointMode": "null", + "options": { + "dataLinks": [] + }, + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": 
"node_entropy_available_bits{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "Entropy available to random number generators", + "refId": "A", + "step": 240 + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Entropy", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:6568", + "format": "short", + "label": "counter", + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "$$hashKey": "object:6569", + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_PROMETHEUS}", + "fill": 2, + "fillGradient": 0, + "gridPos": { + "h": 10, + "w": 12, + "x": 12, + "y": 28 + }, + "hiddenSeries": false, + "id": 308, + "legend": { + "alignAsTable": true, + "avg": true, + "current": true, + "max": true, + "min": true, + "show": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [], + "maxPerRow": 6, + "nullPointMode": "null", + "options": { + "dataLinks": [] + }, + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "rate(process_cpu_seconds_total{instance=\"$node\",job=\"$job\"}[$__rate_interval])", + "format": "time_series", + "interval": "", + "intervalFactor": 1, + "legendFormat": "Time spent", + "refId": "A", + "step": 240 + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "CPU time spent in user and system contexts", + 
"tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:4860", + "format": "s", + "label": "seconds", + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "$$hashKey": "object:4861", + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_PROMETHEUS}", + "fill": 2, + "fillGradient": 0, + "gridPos": { + "h": 10, + "w": 12, + "x": 0, + "y": 38 + }, + "hiddenSeries": false, + "id": 64, + "legend": { + "alignAsTable": true, + "avg": true, + "current": true, + "max": true, + "min": true, + "show": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "options": { + "dataLinks": [] + }, + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "$$hashKey": "object:6323", + "alias": "/.*Max*./", + "color": "#890F02", + "fill": 0 + } + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "process_max_fds{instance=\"$node\",job=\"$job\"}", + "interval": "", + "intervalFactor": 1, + "legendFormat": "Maximum open file descriptors", + "refId": "A", + "step": 240 + }, + { + "expr": "process_open_fds{instance=\"$node\",job=\"$job\"}", + "interval": "", + "intervalFactor": 1, + "legendFormat": "Open file descriptors", + "refId": "B", + "step": 240 + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "File Descriptors", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + 
"mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:6338", + "format": "short", + "label": "counter", + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "$$hashKey": "object:6339", + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + } + ], + "repeat": null, + "title": "System Misc", + "type": "row" + }, + { + "collapsed": true, + "datasource": "${DS_PROMETHEUS}", + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 26 + }, + "id": 304, + "panels": [ + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_PROMETHEUS}", + "fill": 2, + "fillGradient": 0, + "gridPos": { + "h": 10, + "w": 12, + "x": 0, + "y": 26 + }, + "hiddenSeries": false, + "id": 158, + "legend": { + "alignAsTable": true, + "avg": true, + "current": true, + "max": true, + "min": true, + "show": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "options": { + "dataLinks": [] + }, + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "$$hashKey": "object:6726", + "alias": "/.*Critical*./", + "color": "#E24D42", + "fill": 0 + }, + { + "$$hashKey": "object:6727", + "alias": "/.*Max*./", + "color": "#EF843C", + "fill": 0 + } + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "node_hwmon_temp_celsius{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "interval": "", + "intervalFactor": 1, + "legendFormat": "{{ chip }} {{ sensor }} temp", + "refId": "A", + "step": 240 + }, + { + "expr": "node_hwmon_temp_crit_alarm_celsius{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "hide": true, + "interval": "", + "intervalFactor": 1, + "legendFormat": "{{ chip }} {{ sensor }} 
Critical Alarm", + "refId": "B", + "step": 240 + }, + { + "expr": "node_hwmon_temp_crit_celsius{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "interval": "", + "intervalFactor": 1, + "legendFormat": "{{ chip }} {{ sensor }} Critical", + "refId": "C", + "step": 240 + }, + { + "expr": "node_hwmon_temp_crit_hyst_celsius{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "hide": true, + "interval": "", + "intervalFactor": 1, + "legendFormat": "{{ chip }} {{ sensor }} Critical Historical", + "refId": "D", + "step": 240 + }, + { + "expr": "node_hwmon_temp_max_celsius{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "hide": true, + "interval": "", + "intervalFactor": 1, + "legendFormat": "{{ chip }} {{ sensor }} Max", + "refId": "E", + "step": 240 + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Hardware temperature monitor", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:6750", + "format": "celsius", + "label": "temperature", + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "$$hashKey": "object:6751", + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_PROMETHEUS}", + "fill": 2, + "fillGradient": 0, + "gridPos": { + "h": 10, + "w": 12, + "x": 12, + "y": 26 + }, + "hiddenSeries": false, + "id": 300, + "legend": { + "alignAsTable": true, + "avg": true, + "current": true, + "max": true, + "min": true, + "show": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "options": { 
+ "dataLinks": [] + }, + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "$$hashKey": "object:1655", + "alias": "/.*Max*./", + "color": "#EF843C", + "fill": 0 + } + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "node_cooling_device_cur_state{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "hide": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "Current {{ name }} in {{ type }}", + "refId": "A", + "step": 240 + }, + { + "expr": "node_cooling_device_max_state{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "interval": "", + "intervalFactor": 1, + "legendFormat": "Max {{ name }} in {{ type }}", + "refId": "B", + "step": 240 + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Throttle cooling device", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:1678", + "format": "short", + "label": "counter", + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "$$hashKey": "object:1679", + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_PROMETHEUS}", + "fill": 2, + "fillGradient": 0, + "gridPos": { + "h": 10, + "w": 12, + "x": 0, + "y": 36 + }, + "hiddenSeries": false, + "id": 302, + "legend": { + "alignAsTable": true, + "avg": true, + "current": true, + "max": true, + "min": true, + "show": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "options": { + "dataLinks": [] 
+ }, + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "node_power_supply_online{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "hide": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "{{ power_supply }} online", + "refId": "A", + "step": 240 + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Power supply", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:1678", + "format": "short", + "label": "counter", + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "$$hashKey": "object:1679", + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + } + ], + "title": "Hardware Misc", + "type": "row" + }, + { + "collapsed": true, + "datasource": "${DS_PROMETHEUS}", + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 27 + }, + "id": 296, + "panels": [ + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_PROMETHEUS}", + "fill": 2, + "fillGradient": 0, + "gridPos": { + "h": 10, + "w": 12, + "x": 0, + "y": 10 + }, + "hiddenSeries": false, + "id": 297, + "legend": { + "alignAsTable": true, + "avg": true, + "current": true, + "max": true, + "min": true, + "show": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "options": { + "dataLinks": [] + }, + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": 
false, + "targets": [ + { + "expr": "rate(node_systemd_socket_accepted_connections_total{instance=\"$node\",job=\"$job\"}[$__rate_interval])", + "format": "time_series", + "interval": "", + "intervalFactor": 1, + "legendFormat": "{{ name }} Connections", + "refId": "A", + "step": 240 + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Systemd Sockets", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "short", + "label": "counter", + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_PROMETHEUS}", + "fill": 2, + "fillGradient": 0, + "gridPos": { + "h": 10, + "w": 12, + "x": 12, + "y": 10 + }, + "hiddenSeries": false, + "id": 298, + "legend": { + "alignAsTable": true, + "avg": true, + "current": true, + "max": true, + "min": true, + "show": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "options": { + "dataLinks": [] + }, + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "alias": "Failed", + "color": "#F2495C" + }, + { + "alias": "Inactive", + "color": "#FF9830" + }, + { + "alias": "Active", + "color": "#73BF69" + }, + { + "alias": "Deactivating", + "color": "#FFCB7D" + }, + { + "alias": "Activating", + "color": "#C8F2C2" + } + ], + "spaceLength": 10, + "stack": true, + "steppedLine": false, + "targets": [ + { + "expr": "node_systemd_units{instance=\"$node\",job=\"$job\",state=\"activating\"}", + "format": 
"time_series", + "interval": "", + "intervalFactor": 1, + "legendFormat": "Activating", + "refId": "A", + "step": 240 + }, + { + "expr": "node_systemd_units{instance=\"$node\",job=\"$job\",state=\"active\"}", + "format": "time_series", + "interval": "", + "intervalFactor": 1, + "legendFormat": "Active", + "refId": "B", + "step": 240 + }, + { + "expr": "node_systemd_units{instance=\"$node\",job=\"$job\",state=\"deactivating\"}", + "format": "time_series", + "interval": "", + "intervalFactor": 1, + "legendFormat": "Deactivating", + "refId": "C", + "step": 240 + }, + { + "expr": "node_systemd_units{instance=\"$node\",job=\"$job\",state=\"failed\"}", + "format": "time_series", + "interval": "", + "intervalFactor": 1, + "legendFormat": "Failed", + "refId": "D", + "step": 240 + }, + { + "expr": "node_systemd_units{instance=\"$node\",job=\"$job\",state=\"inactive\"}", + "format": "time_series", + "interval": "", + "intervalFactor": 1, + "legendFormat": "Inactive", + "refId": "E", + "step": 240 + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Systemd Units State", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "short", + "label": "counter", + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + } + ], + "title": "Systemd", + "type": "row" + }, + { + "collapsed": true, + "datasource": "${DS_PROMETHEUS}", + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 28 + }, + "id": 270, + "panels": [ + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_PROMETHEUS}", + "description": "The number (after merges) of I/O 
requests completed per second for the device", + "fieldConfig": { + "defaults": { + "custom": {}, + "links": [] + }, + "overrides": [] + }, + "fill": 2, + "fillGradient": 0, + "gridPos": { + "h": 10, + "w": 12, + "x": 0, + "y": 11 + }, + "hiddenSeries": false, + "id": 9, + "legend": { + "alignAsTable": true, + "avg": true, + "current": true, + "hideZero": true, + "max": true, + "min": true, + "rightSide": false, + "show": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [], + "maxPerRow": 6, + "nullPointMode": "null", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "7.3.7", + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "seriesOverrides": [ + { + "$$hashKey": "object:2033", + "alias": "/.*Read.*/", + "transform": "negative-Y" + }, + { + "$$hashKey": "object:2034", + "alias": "/.*sda_.*/", + "color": "#7EB26D" + }, + { + "$$hashKey": "object:2035", + "alias": "/.*sdb_.*/", + "color": "#EAB839" + }, + { + "$$hashKey": "object:2036", + "alias": "/.*sdc_.*/", + "color": "#6ED0E0" + }, + { + "$$hashKey": "object:2037", + "alias": "/.*sdd_.*/", + "color": "#EF843C" + }, + { + "$$hashKey": "object:2038", + "alias": "/.*sde_.*/", + "color": "#E24D42" + }, + { + "$$hashKey": "object:2039", + "alias": "/.*sda1.*/", + "color": "#584477" + }, + { + "$$hashKey": "object:2040", + "alias": "/.*sda2_.*/", + "color": "#BA43A9" + }, + { + "$$hashKey": "object:2041", + "alias": "/.*sda3_.*/", + "color": "#F4D598" + }, + { + "$$hashKey": "object:2042", + "alias": "/.*sdb1.*/", + "color": "#0A50A1" + }, + { + "$$hashKey": "object:2043", + "alias": "/.*sdb2.*/", + "color": "#BF1B00" + }, + { + "$$hashKey": "object:2044", + "alias": "/.*sdb3.*/", + "color": "#E0752D" + }, + { + "$$hashKey": "object:2045", + "alias": "/.*sdc1.*/", + "color": "#962D82" + }, + { + "$$hashKey": "object:2046", + "alias": "/.*sdc2.*/", + "color": "#614D93" + }, + { + "$$hashKey": "object:2047", + 
"alias": "/.*sdc3.*/", + "color": "#9AC48A" + }, + { + "$$hashKey": "object:2048", + "alias": "/.*sdd1.*/", + "color": "#65C5DB" + }, + { + "$$hashKey": "object:2049", + "alias": "/.*sdd2.*/", + "color": "#F9934E" + }, + { + "$$hashKey": "object:2050", + "alias": "/.*sdd3.*/", + "color": "#EA6460" + }, + { + "$$hashKey": "object:2051", + "alias": "/.*sde1.*/", + "color": "#E0F9D7" + }, + { + "$$hashKey": "object:2052", + "alias": "/.*sdd2.*/", + "color": "#FCEACA" + }, + { + "$$hashKey": "object:2053", + "alias": "/.*sde3.*/", + "color": "#F9E2D2" + } + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "rate(node_disk_reads_completed_total{instance=\"$node\",job=\"$job\"}[$__rate_interval])", + "intervalFactor": 4, + "legendFormat": "{{device}} - Reads completed", + "refId": "A", + "step": 240 + }, + { + "expr": "rate(node_disk_writes_completed_total{instance=\"$node\",job=\"$job\"}[$__rate_interval])", + "intervalFactor": 1, + "legendFormat": "{{device}} - Writes completed", + "refId": "B", + "step": 240 + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Disk IOps Completed", + "tooltip": { + "shared": false, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:2186", + "format": "iops", + "label": "IO read (-) / write (+)", + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "$$hashKey": "object:2187", + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_PROMETHEUS}", + "description": "The number of bytes read from or written to the device per second", + "fieldConfig": { + 
"defaults": { + "custom": {}, + "links": [] + }, + "overrides": [] + }, + "fill": 2, + "fillGradient": 0, + "gridPos": { + "h": 10, + "w": 12, + "x": 12, + "y": 11 + }, + "hiddenSeries": false, + "id": 33, + "legend": { + "alignAsTable": true, + "avg": true, + "current": true, + "hideZero": true, + "max": true, + "min": true, + "rightSide": false, + "show": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [], + "maxPerRow": 6, + "nullPointMode": "null", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "7.3.7", + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "alias": "/.*Read.*/", + "transform": "negative-Y" + }, + { + "alias": "/.*sda_.*/", + "color": "#7EB26D" + }, + { + "alias": "/.*sdb_.*/", + "color": "#EAB839" + }, + { + "alias": "/.*sdc_.*/", + "color": "#6ED0E0" + }, + { + "alias": "/.*sdd_.*/", + "color": "#EF843C" + }, + { + "alias": "/.*sde_.*/", + "color": "#E24D42" + }, + { + "alias": "/.*sda1.*/", + "color": "#584477" + }, + { + "alias": "/.*sda2_.*/", + "color": "#BA43A9" + }, + { + "alias": "/.*sda3_.*/", + "color": "#F4D598" + }, + { + "alias": "/.*sdb1.*/", + "color": "#0A50A1" + }, + { + "alias": "/.*sdb2.*/", + "color": "#BF1B00" + }, + { + "alias": "/.*sdb3.*/", + "color": "#E0752D" + }, + { + "alias": "/.*sdc1.*/", + "color": "#962D82" + }, + { + "alias": "/.*sdc2.*/", + "color": "#614D93" + }, + { + "alias": "/.*sdc3.*/", + "color": "#9AC48A" + }, + { + "alias": "/.*sdd1.*/", + "color": "#65C5DB" + }, + { + "alias": "/.*sdd2.*/", + "color": "#F9934E" + }, + { + "alias": "/.*sdd3.*/", + "color": "#EA6460" + }, + { + "alias": "/.*sde1.*/", + "color": "#E0F9D7" + }, + { + "alias": "/.*sdd2.*/", + "color": "#FCEACA" + }, + { + "alias": "/.*sde3.*/", + "color": "#F9E2D2" + } + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": 
"rate(node_disk_read_bytes_total{instance=\"$node\",job=\"$job\"}[$__rate_interval])", + "format": "time_series", + "intervalFactor": 4, + "legendFormat": "{{device}} - Read bytes", + "refId": "A", + "step": 240 + }, + { + "expr": "rate(node_disk_written_bytes_total{instance=\"$node\",job=\"$job\"}[$__rate_interval])", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "{{device}} - Written bytes", + "refId": "B", + "step": 240 + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Disk R/W Data", + "tooltip": { + "shared": false, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:369", + "format": "Bps", + "label": "bytes read (-) / write (+)", + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "$$hashKey": "object:370", + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_PROMETHEUS}", + "description": "The average time for requests issued to the device to be served. 
This includes the time spent by the requests in queue and the time spent servicing them.", + "fieldConfig": { + "defaults": { + "custom": {}, + "links": [] + }, + "overrides": [] + }, + "fill": 3, + "fillGradient": 0, + "gridPos": { + "h": 10, + "w": 12, + "x": 0, + "y": 21 + }, + "hiddenSeries": false, + "id": 37, + "legend": { + "alignAsTable": true, + "avg": true, + "current": true, + "hideZero": true, + "max": true, + "min": true, + "rightSide": false, + "show": true, + "sort": "current", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [], + "maxPerRow": 6, + "nullPointMode": "null", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "7.3.7", + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "alias": "/.*Read.*/", + "transform": "negative-Y" + }, + { + "alias": "/.*sda_.*/", + "color": "#7EB26D" + }, + { + "alias": "/.*sdb_.*/", + "color": "#EAB839" + }, + { + "alias": "/.*sdc_.*/", + "color": "#6ED0E0" + }, + { + "alias": "/.*sdd_.*/", + "color": "#EF843C" + }, + { + "alias": "/.*sde_.*/", + "color": "#E24D42" + }, + { + "alias": "/.*sda1.*/", + "color": "#584477" + }, + { + "alias": "/.*sda2_.*/", + "color": "#BA43A9" + }, + { + "alias": "/.*sda3_.*/", + "color": "#F4D598" + }, + { + "alias": "/.*sdb1.*/", + "color": "#0A50A1" + }, + { + "alias": "/.*sdb2.*/", + "color": "#BF1B00" + }, + { + "alias": "/.*sdb3.*/", + "color": "#E0752D" + }, + { + "alias": "/.*sdc1.*/", + "color": "#962D82" + }, + { + "alias": "/.*sdc2.*/", + "color": "#614D93" + }, + { + "alias": "/.*sdc3.*/", + "color": "#9AC48A" + }, + { + "alias": "/.*sdd1.*/", + "color": "#65C5DB" + }, + { + "alias": "/.*sdd2.*/", + "color": "#F9934E" + }, + { + "alias": "/.*sdd3.*/", + "color": "#EA6460" + }, + { + "alias": "/.*sde1.*/", + "color": "#E0F9D7" + }, + { + "alias": "/.*sdd2.*/", + "color": "#FCEACA" + }, + { + "alias": "/.*sde3.*/", + "color": "#F9E2D2" + } + 
], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "rate(node_disk_read_time_seconds_total{instance=\"$node\",job=\"$job\"}[$__rate_interval]) / rate(node_disk_reads_completed_total{instance=\"$node\",job=\"$job\"}[$__rate_interval])", + "hide": false, + "interval": "", + "intervalFactor": 4, + "legendFormat": "{{device}} - Read wait time avg", + "refId": "A", + "step": 240 + }, + { + "expr": "rate(node_disk_write_time_seconds_total{instance=\"$node\",job=\"$job\"}[$__rate_interval]) / rate(node_disk_writes_completed_total{instance=\"$node\",job=\"$job\"}[$__rate_interval])", + "hide": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "{{device}} - Write wait time avg", + "refId": "B", + "step": 240 + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Disk Average Wait Time", + "tooltip": { + "shared": false, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:441", + "format": "s", + "label": "time. 
read (-) / write (+)", + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "$$hashKey": "object:442", + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_PROMETHEUS}", + "description": "The average queue length of the requests that were issued to the device", + "fieldConfig": { + "defaults": { + "custom": {}, + "links": [] + }, + "overrides": [] + }, + "fill": 2, + "fillGradient": 0, + "gridPos": { + "h": 10, + "w": 12, + "x": 12, + "y": 21 + }, + "hiddenSeries": false, + "id": 35, + "legend": { + "alignAsTable": true, + "avg": true, + "current": true, + "hideZero": true, + "max": true, + "min": true, + "rightSide": false, + "show": true, + "sort": "current", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [], + "maxPerRow": 6, + "nullPointMode": "null", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "7.3.7", + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "alias": "/.*sda_.*/", + "color": "#7EB26D" + }, + { + "alias": "/.*sdb_.*/", + "color": "#EAB839" + }, + { + "alias": "/.*sdc_.*/", + "color": "#6ED0E0" + }, + { + "alias": "/.*sdd_.*/", + "color": "#EF843C" + }, + { + "alias": "/.*sde_.*/", + "color": "#E24D42" + }, + { + "alias": "/.*sda1.*/", + "color": "#584477" + }, + { + "alias": "/.*sda2_.*/", + "color": "#BA43A9" + }, + { + "alias": "/.*sda3_.*/", + "color": "#F4D598" + }, + { + "alias": "/.*sdb1.*/", + "color": "#0A50A1" + }, + { + "alias": "/.*sdb2.*/", + "color": "#BF1B00" + }, + { + "alias": "/.*sdb3.*/", + "color": "#E0752D" + }, + { + "alias": "/.*sdc1.*/", + "color": "#962D82" + }, + { + "alias": "/.*sdc2.*/", + "color": "#614D93" + }, + { + "alias": "/.*sdc3.*/", + 
"color": "#9AC48A" + }, + { + "alias": "/.*sdd1.*/", + "color": "#65C5DB" + }, + { + "alias": "/.*sdd2.*/", + "color": "#F9934E" + }, + { + "alias": "/.*sdd3.*/", + "color": "#EA6460" + }, + { + "alias": "/.*sde1.*/", + "color": "#E0F9D7" + }, + { + "alias": "/.*sdd2.*/", + "color": "#FCEACA" + }, + { + "alias": "/.*sde3.*/", + "color": "#F9E2D2" + } + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "rate(node_disk_io_time_weighted_seconds_total{instance=\"$node\",job=\"$job\"}[$__rate_interval])", + "interval": "", + "intervalFactor": 4, + "legendFormat": "{{device}}", + "refId": "A", + "step": 240 + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Average Queue Size", + "tooltip": { + "shared": false, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:513", + "format": "none", + "label": "aqu-sz", + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "$$hashKey": "object:514", + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_PROMETHEUS}", + "description": "The number of read and write requests merged per second that were queued to the device", + "fieldConfig": { + "defaults": { + "custom": {}, + "links": [] + }, + "overrides": [] + }, + "fill": 2, + "fillGradient": 0, + "gridPos": { + "h": 10, + "w": 12, + "x": 0, + "y": 31 + }, + "hiddenSeries": false, + "id": 133, + "legend": { + "alignAsTable": true, + "avg": true, + "current": true, + "hideZero": true, + "max": true, + "min": true, + "rightSide": false, + "show": true, + "sort": "current", + "sortDesc": true, + "total": 
false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [], + "maxPerRow": 6, + "nullPointMode": "null", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "7.3.7", + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "alias": "/.*Read.*/", + "transform": "negative-Y" + }, + { + "alias": "/.*sda_.*/", + "color": "#7EB26D" + }, + { + "alias": "/.*sdb_.*/", + "color": "#EAB839" + }, + { + "alias": "/.*sdc_.*/", + "color": "#6ED0E0" + }, + { + "alias": "/.*sdd_.*/", + "color": "#EF843C" + }, + { + "alias": "/.*sde_.*/", + "color": "#E24D42" + }, + { + "alias": "/.*sda1.*/", + "color": "#584477" + }, + { + "alias": "/.*sda2_.*/", + "color": "#BA43A9" + }, + { + "alias": "/.*sda3_.*/", + "color": "#F4D598" + }, + { + "alias": "/.*sdb1.*/", + "color": "#0A50A1" + }, + { + "alias": "/.*sdb2.*/", + "color": "#BF1B00" + }, + { + "alias": "/.*sdb3.*/", + "color": "#E0752D" + }, + { + "alias": "/.*sdc1.*/", + "color": "#962D82" + }, + { + "alias": "/.*sdc2.*/", + "color": "#614D93" + }, + { + "alias": "/.*sdc3.*/", + "color": "#9AC48A" + }, + { + "alias": "/.*sdd1.*/", + "color": "#65C5DB" + }, + { + "alias": "/.*sdd2.*/", + "color": "#F9934E" + }, + { + "alias": "/.*sdd3.*/", + "color": "#EA6460" + }, + { + "alias": "/.*sde1.*/", + "color": "#E0F9D7" + }, + { + "alias": "/.*sdd2.*/", + "color": "#FCEACA" + }, + { + "alias": "/.*sde3.*/", + "color": "#F9E2D2" + } + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "rate(node_disk_reads_merged_total{instance=\"$node\",job=\"$job\"}[$__rate_interval])", + "intervalFactor": 1, + "legendFormat": "{{device}} - Read merged", + "refId": "A", + "step": 240 + }, + { + "expr": "rate(node_disk_writes_merged_total{instance=\"$node\",job=\"$job\"}[$__rate_interval])", + "intervalFactor": 1, + "legendFormat": "{{device}} - Write merged", + "refId": "B", + "step": 240 + } + ], + "thresholds": [], + 
"timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Disk R/W Merged", + "tooltip": { + "shared": false, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:585", + "format": "iops", + "label": "I/Os", + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "$$hashKey": "object:586", + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_PROMETHEUS}", + "description": "Percentage of elapsed time during which I/O requests were issued to the device (bandwidth utilization for the device). Device saturation occurs when this value is close to 100% for devices serving requests serially. But for devices serving requests in parallel, such as RAID arrays and modern SSDs, this number does not reflect their performance limits.", + "fieldConfig": { + "defaults": { + "custom": {}, + "links": [] + }, + "overrides": [] + }, + "fill": 3, + "fillGradient": 0, + "gridPos": { + "h": 10, + "w": 12, + "x": 12, + "y": 31 + }, + "hiddenSeries": false, + "id": 36, + "legend": { + "alignAsTable": true, + "avg": true, + "current": true, + "hideZero": true, + "max": true, + "min": true, + "rightSide": false, + "show": true, + "sort": "current", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [], + "maxPerRow": 6, + "nullPointMode": "null", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "7.3.7", + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "alias": "/.*sda_.*/", + "color": "#7EB26D" + }, + { + "alias": "/.*sdb_.*/", + "color": "#EAB839" + }, + { + 
"alias": "/.*sdc_.*/", + "color": "#6ED0E0" + }, + { + "alias": "/.*sdd_.*/", + "color": "#EF843C" + }, + { + "alias": "/.*sde_.*/", + "color": "#E24D42" + }, + { + "alias": "/.*sda1.*/", + "color": "#584477" + }, + { + "alias": "/.*sda2_.*/", + "color": "#BA43A9" + }, + { + "alias": "/.*sda3_.*/", + "color": "#F4D598" + }, + { + "alias": "/.*sdb1.*/", + "color": "#0A50A1" + }, + { + "alias": "/.*sdb2.*/", + "color": "#BF1B00" + }, + { + "alias": "/.*sdb3.*/", + "color": "#E0752D" + }, + { + "alias": "/.*sdc1.*/", + "color": "#962D82" + }, + { + "alias": "/.*sdc2.*/", + "color": "#614D93" + }, + { + "alias": "/.*sdc3.*/", + "color": "#9AC48A" + }, + { + "alias": "/.*sdd1.*/", + "color": "#65C5DB" + }, + { + "alias": "/.*sdd2.*/", + "color": "#F9934E" + }, + { + "alias": "/.*sdd3.*/", + "color": "#EA6460" + }, + { + "alias": "/.*sde1.*/", + "color": "#E0F9D7" + }, + { + "alias": "/.*sdd2.*/", + "color": "#FCEACA" + }, + { + "alias": "/.*sde3.*/", + "color": "#F9E2D2" + } + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "rate(node_disk_io_time_seconds_total{instance=\"$node\",job=\"$job\"}[$__rate_interval])", + "interval": "", + "intervalFactor": 4, + "legendFormat": "{{device}} - IO", + "refId": "A", + "step": 240 + }, + { + "expr": "rate(node_disk_discard_time_seconds_total{instance=\"$node\",job=\"$job\"}[$__rate_interval])", + "interval": "", + "intervalFactor": 4, + "legendFormat": "{{device}} - discard", + "refId": "B", + "step": 240 + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Time Spent Doing I/Os", + "tooltip": { + "shared": false, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:657", + "format": "percentunit", + "label": "%util", + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + 
{ + "$$hashKey": "object:658", + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_PROMETHEUS}", + "description": "The number of outstanding requests at the instant the sample was taken. Incremented as requests are given to appropriate struct request_queue and decremented as they finish.", + "fieldConfig": { + "defaults": { + "custom": {}, + "links": [] + }, + "overrides": [] + }, + "fill": 2, + "fillGradient": 0, + "gridPos": { + "h": 10, + "w": 12, + "x": 0, + "y": 41 + }, + "hiddenSeries": false, + "id": 34, + "legend": { + "alignAsTable": true, + "avg": true, + "current": true, + "hideZero": true, + "max": true, + "min": true, + "rightSide": false, + "show": true, + "sort": "current", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [], + "maxPerRow": 6, + "nullPointMode": "null", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "7.3.7", + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "alias": "/.*sda_.*/", + "color": "#7EB26D" + }, + { + "alias": "/.*sdb_.*/", + "color": "#EAB839" + }, + { + "alias": "/.*sdc_.*/", + "color": "#6ED0E0" + }, + { + "alias": "/.*sdd_.*/", + "color": "#EF843C" + }, + { + "alias": "/.*sde_.*/", + "color": "#E24D42" + }, + { + "alias": "/.*sda1.*/", + "color": "#584477" + }, + { + "alias": "/.*sda2_.*/", + "color": "#BA43A9" + }, + { + "alias": "/.*sda3_.*/", + "color": "#F4D598" + }, + { + "alias": "/.*sdb1.*/", + "color": "#0A50A1" + }, + { + "alias": "/.*sdb2.*/", + "color": "#BF1B00" + }, + { + "alias": "/.*sdb3.*/", + "color": "#E0752D" + }, + { + "alias": "/.*sdc1.*/", + "color": "#962D82" + }, + { + "alias": "/.*sdc2.*/", + "color": "#614D93" + }, + { + "alias": "/.*sdc3.*/", 
+ "color": "#9AC48A" + }, + { + "alias": "/.*sdd1.*/", + "color": "#65C5DB" + }, + { + "alias": "/.*sdd2.*/", + "color": "#F9934E" + }, + { + "alias": "/.*sdd3.*/", + "color": "#EA6460" + }, + { + "alias": "/.*sde1.*/", + "color": "#E0F9D7" + }, + { + "alias": "/.*sde2.*/", + "color": "#FCEACA" + }, + { + "alias": "/.*sde3.*/", + "color": "#F9E2D2" + } + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "node_disk_io_now{instance=\"$node\",job=\"$job\"}", + "interval": "", + "intervalFactor": 4, + "legendFormat": "{{device}} - IO now", + "refId": "A", + "step": 240 + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Instantaneous Queue Size", + "tooltip": { + "shared": false, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:729", + "format": "none", + "label": "Outstanding req.", + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "$$hashKey": "object:730", + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_PROMETHEUS}", + "description": "", + "fieldConfig": { + "defaults": { + "custom": {}, + "links": [] + }, + "overrides": [] + }, + "fill": 2, + "fillGradient": 0, + "gridPos": { + "h": 10, + "w": 12, + "x": 12, + "y": 41 + }, + "hiddenSeries": false, + "id": 301, + "legend": { + "alignAsTable": true, + "avg": true, + "current": true, + "hideZero": true, + "max": true, + "min": true, + "rightSide": false, + "show": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [], + "maxPerRow": 6, + "nullPointMode": "null as zero", + "options": { +
"alertThreshold": true + }, + "percentage": false, + "pluginVersion": "7.3.7", + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "$$hashKey": "object:2034", + "alias": "/.*sda_.*/", + "color": "#7EB26D" + }, + { + "$$hashKey": "object:2035", + "alias": "/.*sdb_.*/", + "color": "#EAB839" + }, + { + "$$hashKey": "object:2036", + "alias": "/.*sdc_.*/", + "color": "#6ED0E0" + }, + { + "$$hashKey": "object:2037", + "alias": "/.*sdd_.*/", + "color": "#EF843C" + }, + { + "$$hashKey": "object:2038", + "alias": "/.*sde_.*/", + "color": "#E24D42" + }, + { + "$$hashKey": "object:2039", + "alias": "/.*sda1.*/", + "color": "#584477" + }, + { + "$$hashKey": "object:2040", + "alias": "/.*sda2_.*/", + "color": "#BA43A9" + }, + { + "$$hashKey": "object:2041", + "alias": "/.*sda3_.*/", + "color": "#F4D598" + }, + { + "$$hashKey": "object:2042", + "alias": "/.*sdb1.*/", + "color": "#0A50A1" + }, + { + "$$hashKey": "object:2043", + "alias": "/.*sdb2.*/", + "color": "#BF1B00" + }, + { + "$$hashKey": "object:2044", + "alias": "/.*sdb3.*/", + "color": "#E0752D" + }, + { + "$$hashKey": "object:2045", + "alias": "/.*sdc1.*/", + "color": "#962D82" + }, + { + "$$hashKey": "object:2046", + "alias": "/.*sdc2.*/", + "color": "#614D93" + }, + { + "$$hashKey": "object:2047", + "alias": "/.*sdc3.*/", + "color": "#9AC48A" + }, + { + "$$hashKey": "object:2048", + "alias": "/.*sdd1.*/", + "color": "#65C5DB" + }, + { + "$$hashKey": "object:2049", + "alias": "/.*sdd2.*/", + "color": "#F9934E" + }, + { + "$$hashKey": "object:2050", + "alias": "/.*sdd3.*/", + "color": "#EA6460" + }, + { + "$$hashKey": "object:2051", + "alias": "/.*sde1.*/", + "color": "#E0F9D7" + }, + { + "$$hashKey": "object:2052", + "alias": "/.*sdd2.*/", + "color": "#FCEACA" + }, + { + "$$hashKey": "object:2053", + "alias": "/.*sde3.*/", + "color": "#F9E2D2" + } + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": 
"rate(node_disk_discards_completed_total{instance=\"$node\",job=\"$job\"}[$__rate_interval])", + "interval": "", + "intervalFactor": 4, + "legendFormat": "{{device}} - Discards completed", + "refId": "A", + "step": 240 + }, + { + "expr": "rate(node_disk_discards_merged_total{instance=\"$node\",job=\"$job\"}[$__rate_interval])", + "interval": "", + "intervalFactor": 1, + "legendFormat": "{{device}} - Discards merged", + "refId": "B", + "step": 240 + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Disk IOps Discards completed / merged", + "tooltip": { + "shared": false, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:2186", + "format": "iops", + "label": "IOs", + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "$$hashKey": "object:2187", + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + } + ], + "repeat": null, + "title": "Storage Disk", + "type": "row" + }, + { + "collapsed": true, + "datasource": "${DS_PROMETHEUS}", + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 29 + }, + "id": 271, + "panels": [ + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_PROMETHEUS}", + "decimals": 3, + "description": "", + "fieldConfig": { + "defaults": { + "custom": {}, + "links": [] + }, + "overrides": [] + }, + "fill": 2, + "fillGradient": 0, + "gridPos": { + "h": 10, + "w": 12, + "x": 0, + "y": 12 + }, + "hiddenSeries": false, + "id": 43, + "legend": { + "alignAsTable": true, + "avg": true, + "current": true, + "max": true, + "min": true, + "rightSide": false, + "show": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [], + "maxPerRow": 6, 
+ "nullPointMode": "null", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "7.3.7", + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "node_filesystem_avail_bytes{instance=\"$node\",job=\"$job\",device!~'rootfs'}", + "format": "time_series", + "hide": false, + "intervalFactor": 1, + "legendFormat": "{{mountpoint}} - Available", + "metric": "", + "refId": "A", + "step": 240 + }, + { + "expr": "node_filesystem_free_bytes{instance=\"$node\",job=\"$job\",device!~'rootfs'}", + "format": "time_series", + "hide": true, + "intervalFactor": 1, + "legendFormat": "{{mountpoint}} - Free", + "refId": "B", + "step": 240 + }, + { + "expr": "node_filesystem_size_bytes{instance=\"$node\",job=\"$job\",device!~'rootfs'}", + "format": "time_series", + "hide": true, + "intervalFactor": 1, + "legendFormat": "{{mountpoint}} - Size", + "refId": "C", + "step": 240 + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Filesystem space available", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:3826", + "format": "bytes", + "label": "bytes", + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "$$hashKey": "object:3827", + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_PROMETHEUS}", + "description": "", + "fieldConfig": { + "defaults": { + "custom": {}, + "links": [] + }, + "overrides": [] + }, + "fill": 2, + "fillGradient": 0, + "gridPos": { + "h": 10, 
+ "w": 12, + "x": 12, + "y": 12 + }, + "hiddenSeries": false, + "id": 41, + "legend": { + "alignAsTable": true, + "avg": true, + "current": true, + "hideZero": true, + "max": true, + "min": true, + "rightSide": false, + "show": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "7.3.7", + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "node_filesystem_files_free{instance=\"$node\",job=\"$job\",device!~'rootfs'}", + "format": "time_series", + "hide": false, + "intervalFactor": 1, + "legendFormat": "{{mountpoint}} - Free file nodes", + "refId": "A", + "step": 240 + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "File Nodes Free", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:3894", + "format": "short", + "label": "file nodes", + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "$$hashKey": "object:3895", + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_PROMETHEUS}", + "description": "", + "fieldConfig": { + "defaults": { + "custom": {}, + "links": [] + }, + "overrides": [] + }, + "fill": 2, + "fillGradient": 0, + "gridPos": { + "h": 10, + "w": 12, + "x": 0, + "y": 22 + }, + "hiddenSeries": false, + "id": 28, + "legend": { + "alignAsTable": true, + "avg": true, + "current": true, + "max": true, + "min": 
true, + "show": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [], + "maxPerRow": 6, + "nullPointMode": "null", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "7.3.7", + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "node_filefd_maximum{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "intervalFactor": 4, + "legendFormat": "Max open files", + "refId": "A", + "step": 240 + }, + { + "expr": "node_filefd_allocated{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "Open files", + "refId": "B", + "step": 240 + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "File Descriptor", + "tooltip": { + "shared": false, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "short", + "label": "files", + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_PROMETHEUS}", + "description": "", + "fieldConfig": { + "defaults": { + "custom": {}, + "links": [] + }, + "overrides": [] + }, + "fill": 2, + "fillGradient": 0, + "gridPos": { + "h": 10, + "w": 12, + "x": 12, + "y": 22 + }, + "hiddenSeries": false, + "id": 219, + "legend": { + "alignAsTable": true, + "avg": true, + "current": true, + "hideZero": true, + "max": true, + "min": true, + "rightSide": false, + "show": true, + "total": false, + "values": true + }, + "lines": true, + 
"linewidth": 1, + "links": [], + "nullPointMode": "null", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "7.3.7", + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "node_filesystem_files{instance=\"$node\",job=\"$job\",device!~'rootfs'}", + "format": "time_series", + "hide": false, + "intervalFactor": 1, + "legendFormat": "{{mountpoint}} - File nodes total", + "refId": "A", + "step": 240 + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "File Nodes Size", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "short", + "label": "file Nodes", + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": { + "/ ReadOnly": "#890F02" + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_PROMETHEUS}", + "decimals": null, + "description": "", + "fieldConfig": { + "defaults": { + "custom": {}, + "links": [] + }, + "overrides": [] + }, + "fill": 2, + "fillGradient": 0, + "gridPos": { + "h": 10, + "w": 12, + "x": 0, + "y": 32 + }, + "hiddenSeries": false, + "id": 44, + "legend": { + "alignAsTable": true, + "avg": true, + "current": true, + "hideEmpty": true, + "hideZero": true, + "max": true, + "min": true, + "rightSide": false, + "show": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [], + "maxPerRow": 6, + "nullPointMode": "null", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": 
"7.3.7", + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": true, + "steppedLine": false, + "targets": [ + { + "expr": "node_filesystem_readonly{instance=\"$node\",job=\"$job\",device!~'rootfs'}", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "{{mountpoint}} - ReadOnly", + "refId": "A", + "step": 240 + }, + { + "expr": "node_filesystem_device_error{instance=\"$node\",job=\"$job\",device!~'rootfs',fstype!~'tmpfs'}", + "format": "time_series", + "interval": "", + "intervalFactor": 1, + "legendFormat": "{{mountpoint}} - Device error", + "refId": "B", + "step": 240 + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Filesystem in ReadOnly / Error", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:3670", + "format": "short", + "label": "counter", + "logBase": 1, + "max": "1", + "min": "0", + "show": true + }, + { + "$$hashKey": "object:3671", + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + } + ], + "repeat": null, + "title": "Storage Filesystem", + "type": "row" + }, + { + "collapsed": true, + "datasource": "${DS_PROMETHEUS}", + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 30 + }, + "id": 272, + "panels": [ + { + "aliasColors": { + "receive_packets_eth0": "#7EB26D", + "receive_packets_lo": "#E24D42", + "transmit_packets_eth0": "#7EB26D", + "transmit_packets_lo": "#E24D42" + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_PROMETHEUS}", + "fill": 2, + "fillGradient": 0, + "gridPos": { + "h": 10, + "w": 12, + "x": 0, + "y": 30 + }, + "hiddenSeries": false, + "id": 60, + "legend": { + 
"alignAsTable": true, + "avg": true, + "current": true, + "max": true, + "min": true, + "rightSide": false, + "show": true, + "sideWidth": 300, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "options": { + "dataLinks": [] + }, + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "alias": "/.*Trans.*/", + "transform": "negative-Y" + } + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "rate(node_network_receive_packets_total{instance=\"$node\",job=\"$job\"}[$__rate_interval])", + "format": "time_series", + "interval": "", + "intervalFactor": 1, + "legendFormat": "{{device}} - Receive", + "refId": "A", + "step": 240 + }, + { + "expr": "rate(node_network_transmit_packets_total{instance=\"$node\",job=\"$job\"}[$__rate_interval])", + "format": "time_series", + "interval": "", + "intervalFactor": 1, + "legendFormat": "{{device}} - Transmit", + "refId": "B", + "step": 240 + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Network Traffic by Packets", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "pps", + "label": "packets out (-) / in (+)", + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_PROMETHEUS}", + "fill": 2, + "fillGradient": 0, + "gridPos": { + "h": 10, + "w": 12, + "x": 12, + "y": 30 + }, + "hiddenSeries": false, + "id": 142, + "legend": { + "alignAsTable": true, + 
"avg": true, + "current": true, + "hideEmpty": false, + "hideZero": false, + "max": true, + "min": true, + "rightSide": false, + "show": true, + "sideWidth": 300, + "sort": "current", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "options": { + "dataLinks": [] + }, + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "alias": "/.*Trans.*/", + "transform": "negative-Y" + } + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "rate(node_network_receive_errs_total{instance=\"$node\",job=\"$job\"}[$__rate_interval])", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "{{device}} - Receive errors", + "refId": "A", + "step": 240 + }, + { + "expr": "rate(node_network_transmit_errs_total{instance=\"$node\",job=\"$job\"}[$__rate_interval])", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "{{device}} - Rransmit errors", + "refId": "B", + "step": 240 + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Network Traffic Errors", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "pps", + "label": "packets out (-) / in (+)", + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_PROMETHEUS}", + "fill": 2, + "fillGradient": 0, + "gridPos": { + "h": 10, + "w": 12, + "x": 0, + "y": 40 + }, + "hiddenSeries": false, + "id": 143, + "legend": { + 
"alignAsTable": true, + "avg": true, + "current": true, + "hideEmpty": false, + "hideZero": false, + "max": true, + "min": true, + "rightSide": false, + "show": true, + "sideWidth": 300, + "sort": "current", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "options": { + "dataLinks": [] + }, + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "alias": "/.*Trans.*/", + "transform": "negative-Y" + } + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "rate(node_network_receive_drop_total{instance=\"$node\",job=\"$job\"}[$__rate_interval])", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "{{device}} - Receive drop", + "refId": "A", + "step": 240 + }, + { + "expr": "rate(node_network_transmit_drop_total{instance=\"$node\",job=\"$job\"}[$__rate_interval])", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "{{device}} - Transmit drop", + "refId": "B", + "step": 240 + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Network Traffic Drop", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "pps", + "label": "packets out (-) / in (+)", + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_PROMETHEUS}", + "fill": 2, + "fillGradient": 0, + "gridPos": { + "h": 10, + "w": 12, + "x": 12, + "y": 40 + }, + "hiddenSeries": false, + "id": 
141, + "legend": { + "alignAsTable": true, + "avg": true, + "current": true, + "hideEmpty": false, + "hideZero": false, + "max": true, + "min": true, + "rightSide": false, + "show": true, + "sideWidth": 300, + "sort": "current", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "options": { + "dataLinks": [] + }, + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "alias": "/.*Trans.*/", + "transform": "negative-Y" + } + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "rate(node_network_receive_compressed_total{instance=\"$node\",job=\"$job\"}[$__rate_interval])", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "{{device}} - Receive compressed", + "refId": "A", + "step": 240 + }, + { + "expr": "rate(node_network_transmit_compressed_total{instance=\"$node\",job=\"$job\"}[$__rate_interval])", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "{{device}} - Transmit compressed", + "refId": "B", + "step": 240 + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Network Traffic Compressed", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "pps", + "label": "packets out (-) / in (+)", + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_PROMETHEUS}", + "fill": 2, + "fillGradient": 0, + "gridPos": { + "h": 10, + "w": 12, + "x": 0, 
+ "y": 50 + }, + "hiddenSeries": false, + "id": 146, + "legend": { + "alignAsTable": true, + "avg": true, + "current": true, + "hideEmpty": false, + "hideZero": false, + "max": true, + "min": true, + "rightSide": false, + "show": true, + "sideWidth": 300, + "sort": "current", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "options": { + "dataLinks": [] + }, + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "alias": "/.*Trans.*/", + "transform": "negative-Y" + } + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "rate(node_network_receive_multicast_total{instance=\"$node\",job=\"$job\"}[$__rate_interval])", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "{{device}} - Receive multicast", + "refId": "A", + "step": 240 + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Network Traffic Multicast", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "pps", + "label": "packets out (-) / in (+)", + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_PROMETHEUS}", + "fill": 2, + "fillGradient": 0, + "gridPos": { + "h": 10, + "w": 12, + "x": 12, + "y": 50 + }, + "hiddenSeries": false, + "id": 144, + "legend": { + "alignAsTable": true, + "avg": true, + "current": true, + "hideEmpty": false, + "hideZero": false, + "max": true, + "min": true, + 
"rightSide": false, + "show": true, + "sideWidth": 300, + "sort": "current", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "options": { + "dataLinks": [] + }, + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "alias": "/.*Trans.*/", + "transform": "negative-Y" + } + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "rate(node_network_receive_fifo_total{instance=\"$node\",job=\"$job\"}[$__rate_interval])", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "{{device}} - Receive fifo", + "refId": "A", + "step": 240 + }, + { + "expr": "rate(node_network_transmit_fifo_total{instance=\"$node\",job=\"$job\"}[$__rate_interval])", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "{{device}} - Transmit fifo", + "refId": "B", + "step": 240 + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Network Traffic Fifo", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "pps", + "label": "packets out (-) / in (+)", + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_PROMETHEUS}", + "fill": 2, + "fillGradient": 0, + "gridPos": { + "h": 10, + "w": 12, + "x": 0, + "y": 60 + }, + "hiddenSeries": false, + "id": 145, + "legend": { + "alignAsTable": true, + "avg": true, + "current": true, + "hideEmpty": false, + "hideZero": false, + "max": true, 
+ "min": true, + "rightSide": false, + "show": true, + "sideWidth": 300, + "sort": "current", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "options": { + "dataLinks": [] + }, + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "$$hashKey": "object:576", + "alias": "/.*Trans.*/", + "transform": "negative-Y" + } + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "rate(node_network_receive_frame_total{instance=\"$node\",job=\"$job\"}[$__rate_interval])", + "format": "time_series", + "hide": false, + "intervalFactor": 1, + "legendFormat": "{{device}} - Receive frame", + "refId": "A", + "step": 240 + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Network Traffic Frame", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:589", + "format": "pps", + "label": "packets out (-) / in (+)", + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "$$hashKey": "object:590", + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_PROMETHEUS}", + "fill": 2, + "fillGradient": 0, + "gridPos": { + "h": 10, + "w": 12, + "x": 12, + "y": 60 + }, + "hiddenSeries": false, + "id": 231, + "legend": { + "alignAsTable": true, + "avg": true, + "current": true, + "hideEmpty": false, + "hideZero": false, + "max": true, + "min": true, + "rightSide": false, + "show": true, + "sideWidth": 300, + "sort": "current", + "sortDesc": true, 
+ "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "options": { + "dataLinks": [] + }, + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "rate(node_network_transmit_carrier_total{instance=\"$node\",job=\"$job\"}[$__rate_interval])", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "{{device}} - Statistic transmit_carrier", + "refId": "A", + "step": 240 + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Network Traffic Carrier", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "short", + "label": "counter", + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_PROMETHEUS}", + "fill": 2, + "fillGradient": 0, + "gridPos": { + "h": 10, + "w": 12, + "x": 0, + "y": 70 + }, + "hiddenSeries": false, + "id": 232, + "legend": { + "alignAsTable": true, + "avg": true, + "current": true, + "hideEmpty": false, + "hideZero": false, + "max": true, + "min": true, + "rightSide": false, + "show": true, + "sideWidth": 300, + "sort": "current", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "options": { + "dataLinks": [] + }, + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "alias": 
"/.*Trans.*/", + "transform": "negative-Y" + } + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "rate(node_network_transmit_colls_total{instance=\"$node\",job=\"$job\"}[$__rate_interval])", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "{{device}} - Transmit colls", + "refId": "A", + "step": 240 + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Network Traffic Colls", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "short", + "label": "counter", + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_PROMETHEUS}", + "fill": 2, + "fillGradient": 0, + "gridPos": { + "h": 10, + "w": 12, + "x": 12, + "y": 70 + }, + "hiddenSeries": false, + "id": 61, + "legend": { + "alignAsTable": true, + "avg": true, + "current": true, + "max": true, + "min": true, + "rightSide": false, + "show": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "options": { + "dataLinks": [] + }, + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "$$hashKey": "object:663", + "alias": "NF conntrack limit", + "color": "#890F02", + "fill": 0 + } + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "node_nf_conntrack_entries{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "NF conntrack 
entries", + "refId": "A", + "step": 240 + }, + { + "expr": "node_nf_conntrack_entries_limit{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "NF conntrack limit", + "refId": "B", + "step": 240 + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "NF Contrack", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:678", + "format": "short", + "label": "entries", + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "$$hashKey": "object:679", + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_PROMETHEUS}", + "fill": 2, + "fillGradient": 0, + "gridPos": { + "h": 10, + "w": 12, + "x": 0, + "y": 80 + }, + "hiddenSeries": false, + "id": 230, + "legend": { + "alignAsTable": true, + "avg": true, + "current": true, + "max": true, + "min": true, + "rightSide": false, + "show": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "options": { + "dataLinks": [] + }, + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "node_arp_entries{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "{{ device }} - ARP entries", + "refId": "A", + "step": 240 + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "ARP Entries", + "tooltip": { + "shared": true, 
+ "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "short", + "label": "Entries", + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_PROMETHEUS}", + "fill": 2, + "fillGradient": 0, + "gridPos": { + "h": 10, + "w": 12, + "x": 12, + "y": 80 + }, + "hiddenSeries": false, + "id": 288, + "legend": { + "alignAsTable": true, + "avg": true, + "current": true, + "max": true, + "min": true, + "rightSide": false, + "show": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "options": { + "dataLinks": [] + }, + "percentage": false, + "pointradius": 1, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "node_network_mtu_bytes{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "{{ device }} - Bytes", + "refId": "A", + "step": 240 + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "MTU", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "decimals": 0, + "format": "bytes", + "label": "bytes", + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ], + "yaxis": { + "align": false, + 
"alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_PROMETHEUS}", + "fill": 2, + "fillGradient": 0, + "gridPos": { + "h": 10, + "w": 12, + "x": 0, + "y": 90 + }, + "hiddenSeries": false, + "id": 280, + "legend": { + "alignAsTable": true, + "avg": true, + "current": true, + "max": true, + "min": true, + "rightSide": false, + "show": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "options": { + "dataLinks": [] + }, + "percentage": false, + "pointradius": 1, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "node_network_speed_bytes{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "{{ device }} - Speed", + "refId": "A", + "step": 240 + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Speed", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "decimals": 0, + "format": "bytes", + "label": "bytes", + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_PROMETHEUS}", + "fill": 2, + "fillGradient": 0, + "gridPos": { + "h": 10, + "w": 12, + "x": 12, + "y": 90 + }, + "hiddenSeries": false, + "id": 289, + "legend": { + "alignAsTable": true, + "avg": true, + "current": true, + "max": true, + "min": true, + "rightSide": false, + "show": true, + "total": false, + 
"values": true + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "options": { + "dataLinks": [] + }, + "percentage": false, + "pointradius": 1, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "node_network_transmit_queue_length{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "{{ device }} - Interface transmit queue length", + "refId": "A", + "step": 240 + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Queue Length", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "decimals": 0, + "format": "none", + "label": "packets", + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_PROMETHEUS}", + "fill": 2, + "fillGradient": 0, + "gridPos": { + "h": 10, + "w": 12, + "x": 0, + "y": 100 + }, + "hiddenSeries": false, + "id": 290, + "legend": { + "alignAsTable": true, + "avg": true, + "current": true, + "hideEmpty": false, + "hideZero": false, + "max": true, + "min": true, + "rightSide": false, + "show": true, + "sideWidth": 300, + "sort": "current", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "options": { + "dataLinks": [] + }, + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "$$hashKey": "object:232", + "alias": 
"/.*Dropped.*/", + "transform": "negative-Y" + } + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "rate(node_softnet_processed_total{instance=\"$node\",job=\"$job\"}[$__rate_interval])", + "format": "time_series", + "interval": "", + "intervalFactor": 1, + "legendFormat": "CPU {{cpu}} - Processed", + "refId": "A", + "step": 240 + }, + { + "expr": "rate(node_softnet_dropped_total{instance=\"$node\",job=\"$job\"}[$__rate_interval])", + "format": "time_series", + "interval": "", + "intervalFactor": 1, + "legendFormat": "CPU {{cpu}} - Dropped", + "refId": "B", + "step": 240 + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Softnet Packets", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:207", + "format": "short", + "label": "packetes drop (-) / process (+)", + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "$$hashKey": "object:208", + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_PROMETHEUS}", + "fill": 2, + "fillGradient": 0, + "gridPos": { + "h": 10, + "w": 12, + "x": 12, + "y": 100 + }, + "hiddenSeries": false, + "id": 310, + "legend": { + "alignAsTable": true, + "avg": true, + "current": true, + "hideEmpty": false, + "hideZero": false, + "max": true, + "min": true, + "rightSide": false, + "show": true, + "sideWidth": 300, + "sort": "current", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "options": { + "dataLinks": [] + }, + "percentage": 
false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "rate(node_softnet_times_squeezed_total{instance=\"$node\",job=\"$job\"}[$__rate_interval])", + "format": "time_series", + "interval": "", + "intervalFactor": 1, + "legendFormat": "CPU {{cpu}} - Squeezed", + "refId": "A", + "step": 240 + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Softnet Out of Quota", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:207", + "format": "short", + "label": "counter", + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "$$hashKey": "object:208", + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_PROMETHEUS}", + "fill": 2, + "fillGradient": 0, + "gridPos": { + "h": 10, + "w": 12, + "x": 0, + "y": 110 + }, + "hiddenSeries": false, + "id": 309, + "legend": { + "alignAsTable": true, + "avg": true, + "current": true, + "hideEmpty": false, + "hideZero": false, + "max": true, + "min": true, + "rightSide": false, + "show": true, + "sideWidth": 300, + "sort": "current", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "options": { + "dataLinks": [] + }, + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": 
"node_network_up{operstate=\"up\",instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "{{interface}} - Operational state UP", + "refId": "A", + "step": 240 + }, + { + "expr": "node_network_carrier{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "instant": false, + "legendFormat": "{{device}} - Physical link state", + "refId": "B" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Network Operational Status", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "short", + "label": "counter", + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + } + ], + "repeat": null, + "title": "Network Traffic", + "type": "row" + }, + { + "collapsed": true, + "datasource": "${DS_PROMETHEUS}", + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 31 + }, + "id": 273, + "panels": [ + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_PROMETHEUS}", + "fieldConfig": { + "defaults": { + "custom": {}, + "links": [] + }, + "overrides": [] + }, + "fill": 2, + "fillGradient": 0, + "gridPos": { + "h": 10, + "w": 12, + "x": 0, + "y": 32 + }, + "hiddenSeries": false, + "id": 63, + "legend": { + "alignAsTable": true, + "avg": true, + "current": true, + "hideEmpty": false, + "hideZero": false, + "max": true, + "min": true, + "rightSide": false, + "show": true, + "sideWidth": 300, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "options": { + "alertThreshold": true + }, + "percentage": false, + 
"pluginVersion": "7.3.7", + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "node_sockstat_TCP_alloc{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "interval": "", + "intervalFactor": 1, + "legendFormat": "TCP_alloc - Allocated sockets", + "refId": "A", + "step": 240 + }, + { + "expr": "node_sockstat_TCP_inuse{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "interval": "", + "intervalFactor": 1, + "legendFormat": "TCP_inuse - Tcp sockets currently in use", + "refId": "B", + "step": 240 + }, + { + "expr": "node_sockstat_TCP_mem{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "hide": true, + "interval": "", + "intervalFactor": 1, + "legendFormat": "TCP_mem - Used memory for tcp", + "refId": "C", + "step": 240 + }, + { + "expr": "node_sockstat_TCP_orphan{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "interval": "", + "intervalFactor": 1, + "legendFormat": "TCP_orphan - Orphan sockets", + "refId": "D", + "step": 240 + }, + { + "expr": "node_sockstat_TCP_tw{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "interval": "", + "intervalFactor": 1, + "legendFormat": "TCP_tw - Sockets wating close", + "refId": "E", + "step": 240 + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Sockstat TCP", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "short", + "label": "counter", + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": 
false, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_PROMETHEUS}", + "fieldConfig": { + "defaults": { + "custom": {}, + "links": [] + }, + "overrides": [] + }, + "fill": 2, + "fillGradient": 0, + "gridPos": { + "h": 10, + "w": 12, + "x": 12, + "y": 32 + }, + "hiddenSeries": false, + "id": 124, + "legend": { + "alignAsTable": true, + "avg": true, + "current": true, + "hideEmpty": false, + "hideZero": false, + "max": true, + "min": true, + "rightSide": false, + "show": true, + "sideWidth": 300, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "7.3.7", + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "node_sockstat_UDPLITE_inuse{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "interval": "", + "intervalFactor": 1, + "legendFormat": "UDPLITE_inuse - Udplite sockets currently in use", + "refId": "A", + "step": 240 + }, + { + "expr": "node_sockstat_UDP_inuse{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "interval": "", + "intervalFactor": 1, + "legendFormat": "UDP_inuse - Udp sockets currently in use", + "refId": "B", + "step": 240 + }, + { + "expr": "node_sockstat_UDP_mem{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "interval": "", + "intervalFactor": 1, + "legendFormat": "UDP_mem - Used memory for udp", + "refId": "C", + "step": 240 + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Sockstat UDP", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "short", + "label": "counter", + "logBase": 1, + 
"max": null, + "min": "0", + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_PROMETHEUS}", + "fieldConfig": { + "defaults": { + "custom": {}, + "links": [] + }, + "overrides": [] + }, + "fill": 2, + "fillGradient": 0, + "gridPos": { + "h": 10, + "w": 12, + "x": 0, + "y": 42 + }, + "hiddenSeries": false, + "id": 125, + "legend": { + "alignAsTable": true, + "avg": true, + "current": true, + "hideEmpty": false, + "hideZero": false, + "max": true, + "min": true, + "rightSide": false, + "show": true, + "sideWidth": 300, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "7.3.7", + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "node_sockstat_FRAG_inuse{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "interval": "", + "intervalFactor": 1, + "legendFormat": "FRAG_inuse - Frag sockets currently in use", + "refId": "A", + "step": 240 + }, + { + "expr": "node_sockstat_RAW_inuse{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "interval": "", + "intervalFactor": 1, + "legendFormat": "RAW_inuse - Raw sockets currently in use", + "refId": "C", + "step": 240 + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Sockstat FRAG / RAW", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:1572", + 
"format": "short", + "label": "counter", + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "$$hashKey": "object:1573", + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_PROMETHEUS}", + "fieldConfig": { + "defaults": { + "custom": {}, + "links": [] + }, + "overrides": [] + }, + "fill": 2, + "fillGradient": 0, + "gridPos": { + "h": 10, + "w": 12, + "x": 12, + "y": 42 + }, + "hiddenSeries": false, + "id": 220, + "legend": { + "alignAsTable": true, + "avg": true, + "current": true, + "hideEmpty": false, + "hideZero": false, + "max": true, + "min": true, + "rightSide": false, + "show": true, + "sideWidth": 300, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "7.3.7", + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "node_sockstat_TCP_mem_bytes{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "interval": "", + "intervalFactor": 1, + "legendFormat": "mem_bytes - TCP sockets in that state", + "refId": "A", + "step": 240 + }, + { + "expr": "node_sockstat_UDP_mem_bytes{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "interval": "", + "intervalFactor": 1, + "legendFormat": "mem_bytes - UDP sockets in that state", + "refId": "B", + "step": 240 + }, + { + "expr": "node_sockstat_FRAG_memory{instance=\"$node\",job=\"$job\"}", + "interval": "", + "intervalFactor": 1, + "legendFormat": "FRAG_memory - Used memory for frag", + "refId": "C" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + 
"title": "Sockstat Memory Size", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "bytes", + "label": "bytes", + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_PROMETHEUS}", + "fieldConfig": { + "defaults": { + "custom": {}, + "links": [] + }, + "overrides": [] + }, + "fill": 2, + "fillGradient": 0, + "gridPos": { + "h": 10, + "w": 12, + "x": 0, + "y": 52 + }, + "hiddenSeries": false, + "id": 126, + "legend": { + "alignAsTable": true, + "avg": true, + "current": true, + "hideEmpty": false, + "hideZero": false, + "max": true, + "min": true, + "rightSide": false, + "show": true, + "sideWidth": 300, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "7.3.7", + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "node_sockstat_sockets_used{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "interval": "", + "intervalFactor": 1, + "legendFormat": "Sockets_used - Sockets currently in use", + "refId": "A", + "step": 240 + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Sockstat Used", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + 
"values": [] + }, + "yaxes": [ + { + "format": "short", + "label": "sockets", + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + } + ], + "repeat": null, + "title": "Network Sockstat", + "type": "row" + }, + { + "collapsed": true, + "datasource": "${DS_PROMETHEUS}", + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 32 + }, + "id": 274, + "panels": [ + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_PROMETHEUS}", + "fieldConfig": { + "defaults": { + "custom": {}, + "links": [] + }, + "overrides": [] + }, + "fill": 2, + "fillGradient": 0, + "gridPos": { + "h": 10, + "w": 12, + "x": 0, + "y": 33 + }, + "height": "", + "hiddenSeries": false, + "id": 221, + "legend": { + "alignAsTable": true, + "avg": true, + "current": true, + "hideEmpty": false, + "hideZero": false, + "max": true, + "min": true, + "rightSide": false, + "show": true, + "sideWidth": 300, + "sort": "current", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [], + "maxPerRow": 12, + "nullPointMode": "null", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "7.3.7", + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "$$hashKey": "object:1876", + "alias": "/.*Out.*/", + "transform": "negative-Y" + } + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "rate(node_netstat_IpExt_InOctets{instance=\"$node\",job=\"$job\"}[$__rate_interval])", + "format": "time_series", + "interval": "", + "intervalFactor": 1, + "legendFormat": "InOctets - Received octets", + "refId": "A", + "step": 240 + }, + { + "expr": "rate(node_netstat_IpExt_OutOctets{instance=\"$node\",job=\"$job\"}[$__rate_interval])", + 
"format": "time_series", + "intervalFactor": 1, + "legendFormat": "OutOctets - Sent octets", + "refId": "B", + "step": 240 + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Netstat IP In / Out Octets", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:1889", + "format": "short", + "label": "octects out (-) / in (+)", + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "$$hashKey": "object:1890", + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_PROMETHEUS}", + "fieldConfig": { + "defaults": { + "custom": {}, + "links": [] + }, + "overrides": [] + }, + "fill": 2, + "fillGradient": 0, + "gridPos": { + "h": 10, + "w": 12, + "x": 12, + "y": 33 + }, + "height": "", + "hiddenSeries": false, + "id": 81, + "legend": { + "alignAsTable": true, + "avg": true, + "current": true, + "hideEmpty": false, + "hideZero": false, + "max": true, + "min": true, + "rightSide": false, + "show": true, + "sideWidth": 300, + "sort": "current", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "7.3.7", + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "rate(node_netstat_Ip_Forwarding{instance=\"$node\",job=\"$job\"}[$__rate_interval])", + "format": "time_series", + "interval": "", + "intervalFactor": 1, + 
"legendFormat": "Forwarding - IP forwarding", + "refId": "A", + "step": 240 + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Netstat IP Forwarding", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:1957", + "format": "short", + "label": "datagrams", + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "$$hashKey": "object:1958", + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_PROMETHEUS}", + "decimals": null, + "fieldConfig": { + "defaults": { + "custom": {}, + "links": [] + }, + "overrides": [] + }, + "fill": 2, + "fillGradient": 0, + "gridPos": { + "h": 10, + "w": 12, + "x": 0, + "y": 43 + }, + "height": "", + "hiddenSeries": false, + "id": 115, + "legend": { + "alignAsTable": true, + "avg": true, + "current": true, + "hideZero": false, + "max": true, + "min": true, + "rightSide": false, + "show": true, + "sort": "current", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [], + "maxPerRow": 12, + "nullPointMode": "null", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "7.3.7", + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "alias": "/.*Out.*/", + "transform": "negative-Y" + } + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "rate(node_netstat_Icmp_InMsgs{instance=\"$node\",job=\"$job\"}[$__rate_interval])", + "format": "time_series", + "interval": "", + "intervalFactor": 1, + "legendFormat": "InMsgs 
- Messages which the entity received. Note that this counter includes all those counted by icmpInErrors", + "refId": "A", + "step": 240 + }, + { + "expr": "rate(node_netstat_Icmp_OutMsgs{instance=\"$node\",job=\"$job\"}[$__rate_interval])", + "format": "time_series", + "interval": "", + "intervalFactor": 1, + "legendFormat": "OutMsgs - Messages which this entity attempted to send. Note that this counter includes all those counted by icmpOutErrors", + "refId": "B", + "step": 240 + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "ICMP In / Out", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "short", + "label": "messages out (-) / in (+)", + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_PROMETHEUS}", + "decimals": null, + "fieldConfig": { + "defaults": { + "custom": {}, + "links": [] + }, + "overrides": [] + }, + "fill": 2, + "fillGradient": 0, + "gridPos": { + "h": 10, + "w": 12, + "x": 12, + "y": 43 + }, + "height": "", + "hiddenSeries": false, + "id": 50, + "legend": { + "alignAsTable": true, + "avg": true, + "current": true, + "hideZero": false, + "max": true, + "min": true, + "rightSide": false, + "show": true, + "sort": "current", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [], + "maxPerRow": 12, + "nullPointMode": "null", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "7.3.7", + "pointradius": 5, + "points": false, + "renderer": "flot", + 
"seriesOverrides": [ + { + "alias": "/.*Out.*/", + "transform": "negative-Y" + } + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "rate(node_netstat_Icmp_InErrors{instance=\"$node\",job=\"$job\"}[$__rate_interval])", + "format": "time_series", + "interval": "", + "intervalFactor": 1, + "legendFormat": "InErrors - Messages which the entity received but determined as having ICMP-specific errors (bad ICMP checksums, bad length, etc.)", + "refId": "A", + "step": 240 + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "ICMP Errors", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "short", + "label": "messages out (-) / in (+)", + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_PROMETHEUS}", + "decimals": null, + "fieldConfig": { + "defaults": { + "custom": {}, + "links": [] + }, + "overrides": [] + }, + "fill": 2, + "fillGradient": 0, + "gridPos": { + "h": 10, + "w": 12, + "x": 0, + "y": 53 + }, + "height": "", + "hiddenSeries": false, + "id": 55, + "legend": { + "alignAsTable": true, + "avg": true, + "current": true, + "hideZero": false, + "max": true, + "min": true, + "rightSide": false, + "show": true, + "sort": "current", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [], + "maxPerRow": 12, + "nullPointMode": "null", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "7.3.7", + "pointradius": 5, + "points": 
false, + "renderer": "flot", + "seriesOverrides": [ + { + "alias": "/.*Out.*/", + "transform": "negative-Y" + }, + { + "alias": "/.*Snd.*/", + "transform": "negative-Y" + } + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "rate(node_netstat_Udp_InDatagrams{instance=\"$node\",job=\"$job\"}[$__rate_interval])", + "format": "time_series", + "interval": "", + "intervalFactor": 1, + "legendFormat": "InDatagrams - Datagrams received", + "refId": "A", + "step": 240 + }, + { + "expr": "rate(node_netstat_Udp_OutDatagrams{instance=\"$node\",job=\"$job\"}[$__rate_interval])", + "format": "time_series", + "interval": "", + "intervalFactor": 1, + "legendFormat": "OutDatagrams - Datagrams sent", + "refId": "B", + "step": 240 + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "UDP In / Out", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "short", + "label": "datagrams out (-) / in (+)", + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_PROMETHEUS}", + "fieldConfig": { + "defaults": { + "custom": {}, + "links": [] + }, + "overrides": [] + }, + "fill": 2, + "fillGradient": 0, + "gridPos": { + "h": 10, + "w": 12, + "x": 12, + "y": 53 + }, + "height": "", + "hiddenSeries": false, + "id": 109, + "legend": { + "alignAsTable": true, + "avg": true, + "current": true, + "hideZero": false, + "max": true, + "min": true, + "rightSide": false, + "show": true, + "sort": "current", + "sortDesc": true, + "total": false, + "values": 
true + }, + "lines": true, + "linewidth": 1, + "links": [], + "maxPerRow": 12, + "nullPointMode": "null", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "7.3.7", + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "rate(node_netstat_Udp_InErrors{instance=\"$node\",job=\"$job\"}[$__rate_interval])", + "format": "time_series", + "interval": "", + "intervalFactor": 1, + "legendFormat": "InErrors - UDP Datagrams that could not be delivered to an application", + "refId": "A", + "step": 240 + }, + { + "expr": "rate(node_netstat_Udp_NoPorts{instance=\"$node\",job=\"$job\"}[$__rate_interval])", + "format": "time_series", + "interval": "", + "intervalFactor": 1, + "legendFormat": "NoPorts - UDP Datagrams received on a port with no listener", + "refId": "B", + "step": 240 + }, + { + "expr": "rate(node_netstat_UdpLite_InErrors{instance=\"$node\",job=\"$job\"}[$__rate_interval])", + "interval": "", + "legendFormat": "InErrors Lite - UDPLite Datagrams that could not be delivered to an application", + "refId": "C" + }, + { + "expr": "rate(node_netstat_Udp_RcvbufErrors{instance=\"$node\",job=\"$job\"}[$__rate_interval])", + "format": "time_series", + "interval": "", + "intervalFactor": 1, + "legendFormat": "RcvbufErrors - UDP buffer errors received", + "refId": "D", + "step": 240 + }, + { + "expr": "rate(node_netstat_Udp_SndbufErrors{instance=\"$node\",job=\"$job\"}[$__rate_interval])", + "format": "time_series", + "interval": "", + "intervalFactor": 1, + "legendFormat": "SndbufErrors - UDP buffer errors sent", + "refId": "E", + "step": 240 + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "UDP Errors", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null,
+ "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:4232", + "format": "short", + "label": "datagrams", + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "$$hashKey": "object:4233", + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_PROMETHEUS}", + "decimals": null, + "fieldConfig": { + "defaults": { + "custom": {}, + "links": [] + }, + "overrides": [] + }, + "fill": 2, + "fillGradient": 0, + "gridPos": { + "h": 10, + "w": 12, + "x": 0, + "y": 63 + }, + "height": "", + "hiddenSeries": false, + "id": 299, + "legend": { + "alignAsTable": true, + "avg": true, + "current": true, + "hideZero": false, + "max": true, + "min": true, + "rightSide": false, + "show": true, + "sort": "current", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [], + "maxPerRow": 12, + "nullPointMode": "null", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "7.3.7", + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "alias": "/.*Out.*/", + "transform": "negative-Y" + }, + { + "alias": "/.*Snd.*/", + "transform": "negative-Y" + } + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "rate(node_netstat_Tcp_InSegs{instance=\"$node\",job=\"$job\"}[$__rate_interval])", + "format": "time_series", + "instant": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "InSegs - Segments received, including those received in error. 
This count includes segments received on currently established connections", + "refId": "A", + "step": 240 + }, + { + "expr": "rate(node_netstat_Tcp_OutSegs{instance=\"$node\",job=\"$job\"}[$__rate_interval])", + "format": "time_series", + "interval": "", + "intervalFactor": 1, + "legendFormat": "OutSegs - Segments sent, including those on current connections but excluding those containing only retransmitted octets", + "refId": "B", + "step": 240 + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "TCP In / Out", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "short", + "label": "datagrams out (-) / in (+)", + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_PROMETHEUS}", + "description": "", + "fieldConfig": { + "defaults": { + "custom": {}, + "links": [] + }, + "overrides": [] + }, + "fill": 2, + "fillGradient": 0, + "gridPos": { + "h": 10, + "w": 12, + "x": 12, + "y": 63 + }, + "height": "", + "hiddenSeries": false, + "id": 104, + "legend": { + "alignAsTable": true, + "avg": true, + "current": true, + "hideEmpty": false, + "hideZero": false, + "max": true, + "min": true, + "rightSide": false, + "show": true, + "sort": "current", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [], + "maxPerRow": 12, + "nullPointMode": "null", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "7.3.7", + "pointradius": 5, + "points": false, + "renderer": "flot", + 
"seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "rate(node_netstat_TcpExt_ListenOverflows{instance=\"$node\",job=\"$job\"}[$__rate_interval])", + "format": "time_series", + "hide": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "ListenOverflows - Times the listen queue of a socket overflowed", + "refId": "A", + "step": 240 + }, + { + "expr": "rate(node_netstat_TcpExt_ListenDrops{instance=\"$node\",job=\"$job\"}[$__rate_interval])", + "format": "time_series", + "hide": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "ListenDrops - SYNs to LISTEN sockets ignored", + "refId": "B", + "step": 240 + }, + { + "expr": "rate(node_netstat_TcpExt_TCPSynRetrans{instance=\"$node\",job=\"$job\"}[$__rate_interval])", + "format": "time_series", + "interval": "", + "intervalFactor": 1, + "legendFormat": "TCPSynRetrans - SYN-SYN/ACK retransmits to break down retransmissions in SYN, fast/timeout retransmits", + "refId": "C", + "step": 240 + }, + { + "expr": "rate(node_netstat_Tcp_RetransSegs{instance=\"$node\",job=\"$job\"}[$__rate_interval])", + "interval": "", + "legendFormat": "RetransSegs - Segments retransmitted - that is, the number of TCP segments transmitted containing one or more previously transmitted octets", + "refId": "D" + }, + { + "expr": "rate(node_netstat_Tcp_InErrs{instance=\"$node\",job=\"$job\"}[$__rate_interval])", + "interval": "", + "legendFormat": "InErrs - Segments received in error (e.g., bad TCP checksums)", + "refId": "E" + }, + { + "expr": "rate(node_netstat_Tcp_OutRsts{instance=\"$node\",job=\"$job\"}[$__rate_interval])", + "interval": "", + "legendFormat": "OutRsts - Segments sent with RST flag", + "refId": "F" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "TCP Errors", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": 
null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "short", + "label": "counter", + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_PROMETHEUS}", + "fieldConfig": { + "defaults": { + "custom": {}, + "links": [] + }, + "overrides": [] + }, + "fill": 2, + "fillGradient": 0, + "gridPos": { + "h": 10, + "w": 12, + "x": 0, + "y": 73 + }, + "height": "", + "hiddenSeries": false, + "id": 85, + "legend": { + "alignAsTable": true, + "avg": true, + "current": true, + "hideZero": false, + "max": true, + "min": true, + "rightSide": false, + "show": true, + "sortDesc": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [], + "maxPerRow": 12, + "nullPointMode": "null", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "7.3.7", + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "$$hashKey": "object:454", + "alias": "/.*MaxConn *./", + "color": "#890F02", + "fill": 0 + } + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "node_netstat_Tcp_CurrEstab{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "hide": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "CurrEstab - TCP connections for which the current state is either ESTABLISHED or CLOSE-WAIT", + "refId": "A", + "step": 240 + }, + { + "expr": "node_netstat_Tcp_MaxConn{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "hide": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "MaxConn - Limit on the total number of TCP connections the entity can support (Dynamic is
\"-1\")", + "refId": "B", + "step": 240 + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "TCP Connections", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:469", + "format": "short", + "label": "connections", + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "$$hashKey": "object:470", + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_PROMETHEUS}", + "description": "", + "fieldConfig": { + "defaults": { + "custom": {}, + "links": [] + }, + "overrides": [] + }, + "fill": 2, + "fillGradient": 0, + "gridPos": { + "h": 10, + "w": 12, + "x": 12, + "y": 73 + }, + "height": "", + "hiddenSeries": false, + "id": 91, + "legend": { + "alignAsTable": true, + "avg": true, + "current": true, + "hideEmpty": false, + "hideZero": false, + "max": true, + "min": true, + "rightSide": false, + "show": true, + "sort": "current", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [], + "maxPerRow": 12, + "nullPointMode": "null", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "7.3.7", + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "alias": "/.*Sent.*/", + "transform": "negative-Y" + } + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "rate(node_netstat_TcpExt_SyncookiesFailed{instance=\"$node\",job=\"$job\"}[$__rate_interval])", + "format": "time_series", + "hide": false, + "interval": "", + "intervalFactor": 1, + 
"legendFormat": "SyncookiesFailed - Invalid SYN cookies received", + "refId": "A", + "step": 240 + }, + { + "expr": "rate(node_netstat_TcpExt_SyncookiesRecv{instance=\"$node\",job=\"$job\"}[$__rate_interval])", + "format": "time_series", + "hide": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "SyncookiesRecv - SYN cookies received", + "refId": "B", + "step": 240 + }, + { + "expr": "rate(node_netstat_TcpExt_SyncookiesSent{instance=\"$node\",job=\"$job\"}[$__rate_interval])", + "format": "time_series", + "hide": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "SyncookiesSent - SYN cookies sent", + "refId": "C", + "step": 240 + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "TCP SynCookie", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "short", + "label": "counter out (-) / in (+)", + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_PROMETHEUS}", + "fieldConfig": { + "defaults": { + "custom": {}, + "links": [] + }, + "overrides": [] + }, + "fill": 2, + "fillGradient": 0, + "gridPos": { + "h": 10, + "w": 12, + "x": 0, + "y": 83 + }, + "height": "", + "hiddenSeries": false, + "id": 82, + "legend": { + "alignAsTable": true, + "avg": true, + "current": true, + "hideZero": false, + "max": true, + "min": true, + "rightSide": false, + "show": true, + "sortDesc": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [], + "maxPerRow": 12, + "nullPointMode": "null", + "options": { + 
"alertThreshold": true + }, + "percentage": false, + "pluginVersion": "7.3.7", + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "rate(node_netstat_Tcp_ActiveOpens{instance=\"$node\",job=\"$job\"}[$__rate_interval])", + "format": "time_series", + "interval": "", + "intervalFactor": 1, + "legendFormat": "ActiveOpens - TCP connections that have made a direct transition to the SYN-SENT state from the CLOSED state", + "refId": "A", + "step": 240 + }, + { + "expr": "rate(node_netstat_Tcp_PassiveOpens{instance=\"$node\",job=\"$job\"}[$__rate_interval])", + "format": "time_series", + "interval": "", + "intervalFactor": 1, + "legendFormat": "PassiveOpens - TCP connections that have made a direct transition to the SYN-RCVD state from the LISTEN state", + "refId": "B", + "step": 240 + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "TCP Direct Transition", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "short", + "label": "connections", + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + } + ], + "repeat": null, + "title": "Network Netstat", + "type": "row" + }, + { + "collapsed": true, + "datasource": "${DS_PROMETHEUS}", + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 33 + }, + "id": 279, + "panels": [ + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_PROMETHEUS}", + "description": "", + "fill": 2, + "fillGradient": 0, + "gridPos": { + "h": 10, + "w": 12, + "x": 0, + "y": 54 + }, 
+ "hiddenSeries": false, + "id": 40, + "legend": { + "alignAsTable": true, + "avg": true, + "current": true, + "max": true, + "min": true, + "show": true, + "sort": "current", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "options": { + "dataLinks": [] + }, + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": true, + "steppedLine": false, + "targets": [ + { + "expr": "node_scrape_collector_duration_seconds{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "hide": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "{{collector}} - Scrape duration", + "refId": "A", + "step": 240 + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Node Exporter Scrape Time", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "s", + "label": "seconds", + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_PROMETHEUS}", + "description": "", + "fill": 2, + "fillGradient": 0, + "gridPos": { + "h": 10, + "w": 12, + "x": 12, + "y": 54 + }, + "hiddenSeries": false, + "id": 157, + "legend": { + "alignAsTable": true, + "avg": true, + "current": true, + "max": true, + "min": true, + "show": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "options": { + "dataLinks": [] + }, + "percentage": false, + 
"pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "$$hashKey": "object:1969", + "alias": "/.*error.*/", + "color": "#F2495C", + "transform": "negative-Y" + } + ], + "spaceLength": 10, + "stack": true, + "steppedLine": false, + "targets": [ + { + "expr": "node_scrape_collector_success{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "hide": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "{{collector}} - Scrape success", + "refId": "A", + "step": 240 + }, + { + "expr": "node_textfile_scrape_error{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "hide": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "{{collector}} - Scrape textfile error (1 = true)", + "refId": "B", + "step": 240 + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Node Exporter Scrape", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:1484", + "format": "short", + "label": "counter", + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "$$hashKey": "object:1485", + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + } + ], + "repeat": null, + "title": "Node Exporter", + "type": "row" + } + ], + "refresh": "1m", + "schemaVersion": 26, + "style": "dark", + "tags": [ + "linux" + ], + "templating": { + "list": [ + { + "current": { + "selected": false, + "text": "default", + "value": "default" + }, + "error": null, + "hide": 0, + "includeAll": false, + "label": "datasource", + "multi": false, + "name": "DS_PROMETHEUS", + "options": [], + "query": "prometheus", + "refresh": 1, + "regex": "", + "skipUrlSync": false, + "type": 
"datasource" + }, + { + "allValue": null, + "current": {}, + "datasource": "${DS_PROMETHEUS}", + "definition": "", + "error": null, + "hide": 0, + "includeAll": false, + "label": "Job", + "multi": false, + "name": "job", + "options": [], + "query": "label_values(node_uname_info, job)", + "refresh": 1, + "regex": "", + "skipUrlSync": false, + "sort": 1, + "tagValuesQuery": "", + "tags": [], + "tagsQuery": "", + "type": "query", + "useTags": false + }, + { + "allValue": null, + "current": {}, + "datasource": "${DS_PROMETHEUS}", + "definition": "label_values(node_uname_info{job=\"$job\"}, instance)", + "error": null, + "hide": 0, + "includeAll": false, + "label": "Host:", + "multi": false, + "name": "node", + "options": [], + "query": "label_values(node_uname_info{job=\"$job\"}, instance)", + "refresh": 1, + "regex": "", + "skipUrlSync": false, + "sort": 1, + "tagValuesQuery": "", + "tags": [], + "tagsQuery": "", + "type": "query", + "useTags": false + }, + { + "allValue": null, + "current": { + "selected": false, + "text": "[a-z]+|nvme[0-9]+n[0-9]+|mmcblk[0-9]+", + "value": "[a-z]+|nvme[0-9]+n[0-9]+|mmcblk[0-9]+" + }, + "error": null, + "hide": 2, + "includeAll": false, + "label": null, + "multi": false, + "name": "diskdevices", + "options": [ + { + "selected": true, + "text": "[a-z]+|nvme[0-9]+n[0-9]+|mmcblk[0-9]+", + "value": "[a-z]+|nvme[0-9]+n[0-9]+|mmcblk[0-9]+" + } + ], + "query": "[a-z]+|nvme[0-9]+n[0-9]+|mmcblk[0-9]+", + "skipUrlSync": false, + "type": "custom" + } + ] + }, + "time": { + "from": "now-24h", + "to": "now" + }, + "timepicker": { + "refresh_intervals": [ + "5s", + "10s", + "30s", + "1m", + "5m", + "15m", + "30m", + "1h", + "2h", + "1d" + ], + "time_options": [ + "5m", + "15m", + "1h", + "6h", + "12h", + "24h", + "2d", + "7d", + "30d" + ] + }, + "timezone": "browser", + "title": "Node Exporter Full", + "uid": "rYdddlPWk", + "version": 67 +} \ No newline at end of file diff --git 
a/cluster/roles/monitoring/files/grafana/dashboards/nvidia-exporter.json b/cluster/roles/monitoring/files/grafana/dashboards/nvidia-exporter.json new file mode 100644 index 0000000..e78a31e --- /dev/null +++ b/cluster/roles/monitoring/files/grafana/dashboards/nvidia-exporter.json @@ -0,0 +1,2077 @@ +{ + "__inputs": [ + { + "name": "DS_PROMETHEUS", + "label": "Prometheus", + "description": "", + "type": "datasource", + "pluginId": "prometheus", + "pluginName": "Prometheus" + } + ], + "__requires": [ + { + "type": "panel", + "id": "bargauge", + "name": "Bar gauge", + "version": "" + }, + { + "type": "panel", + "id": "gauge", + "name": "Gauge", + "version": "" + }, + { + "type": "grafana", + "id": "grafana", + "name": "Grafana", + "version": "8.0.3" + }, + { + "type": "panel", + "id": "graph", + "name": "Graph (old)", + "version": "" + }, + { + "type": "datasource", + "id": "prometheus", + "name": "Prometheus", + "version": "1.0.0" + }, + { + "type": "panel", + "id": "stat", + "name": "Stat", + "version": "" + } + ], + "annotations": { + "list": [ + { + "builtIn": 1, + "datasource": "-- Grafana --", + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "name": "Annotations & Alerts", + "type": "dashboard" + } + ] + }, + "description": "Nvidia GPU Metrics based on the prometheus metrics from github.com/utkuozdemir/nvidia_gpu_exporter", + "editable": true, + "gnetId": 14574, + "graphTooltip": 0, + "id": null, + "iteration": 1625771691879, + "links": [], + "panels": [ + { + "datasource": "prometheus", + "description": "The official product name of the GPU. This is an alphanumeric string. 
For all products.", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "decimals": 2, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "none" + }, + "overrides": [] + }, + "gridPos": { + "h": 3, + "w": 4, + "x": 0, + "y": 0 + }, + "id": 23, + "options": { + "colorMode": "value", + "graphMode": "none", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "last" + ], + "fields": "", + "values": false + }, + "text": {}, + "textMode": "name" + }, + "pluginVersion": "8.0.3", + "targets": [ + { + "exemplar": true, + "expr": "nvidia_smi_gpu_info{uuid=\"$gpu\"}", + "instant": true, + "interval": "", + "legendFormat": "{{name}}", + "refId": "A" + } + ], + "timeFrom": null, + "timeShift": null, + "title": "Name", + "type": "stat" + }, + { + "datasource": "prometheus", + "description": "The current performance state for the GPU. States range from P0 (maximum performance) to P12 (minimum performance).", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "decimals": 0, + "mappings": [ + { + "options": { + "": { + "text": "" + } + }, + "type": "value" + } + ], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "prefix:P" + }, + "overrides": [] + }, + "gridPos": { + "h": 3, + "w": 2, + "x": 4, + "y": 0 + }, + "id": 22, + "options": { + "colorMode": "value", + "graphMode": "none", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "last" + ], + "fields": "", + "values": false + }, + "text": {}, + "textMode": "value" + }, + "pluginVersion": "8.0.3", + "targets": [ + { + "exemplar": true, + "expr": "nvidia_smi_pstate{uuid=\"$gpu\"}", + "interval": "", + "legendFormat": "", + "refId": "A" + } + ], + "timeFrom": null, + "timeShift": null, + "title": "P-State", + "type": "stat" + }, + { + "datasource": "prometheus", + 
"description": "Percent of time over the past sample period during which one or more kernels was executing on the GPU.\nThe sample period may be between 1 second and 1/6 second depending on the product.", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "max": 1, + "min": 0, + "thresholds": { + "mode": "percentage", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "#EAB839", + "value": 70 + }, + { + "color": "red", + "value": 90 + } + ] + }, + "unit": "percentunit" + }, + "overrides": [] + }, + "gridPos": { + "h": 5, + "w": 3, + "x": 6, + "y": 0 + }, + "id": 6, + "options": { + "reduceOptions": { + "calcs": [ + "last" + ], + "fields": "", + "values": false + }, + "showThresholdLabels": false, + "showThresholdMarkers": true, + "text": {} + }, + "pluginVersion": "8.0.3", + "targets": [ + { + "exemplar": true, + "expr": "nvidia_smi_utilization_gpu_ratio{uuid=\"$gpu\"}", + "interval": "", + "legendFormat": "{{uuid}}", + "refId": "A" + } + ], + "title": "GPU Utilization %", + "transformations": [], + "type": "gauge" + }, + { + "datasource": "prometheus", + "description": "The last measured power draw for the entire board, in watts. Only available if power management is supported. 
This reading is accurate to within +/- 5 watts / The software power limit in watts.", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "max": 1, + "min": 0, + "thresholds": { + "mode": "percentage", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "#EAB839", + "value": 70 + }, + { + "color": "red", + "value": 90 + } + ] + }, + "unit": "percentunit" + }, + "overrides": [] + }, + "gridPos": { + "h": 5, + "w": 3, + "x": 9, + "y": 0 + }, + "id": 21, + "options": { + "reduceOptions": { + "calcs": [ + "last" + ], + "fields": "", + "values": false + }, + "showThresholdLabels": false, + "showThresholdMarkers": true, + "text": {} + }, + "pluginVersion": "8.0.3", + "targets": [ + { + "exemplar": true, + "expr": "nvidia_smi_power_draw_watts{uuid=\"$gpu\"} / nvidia_smi_power_default_limit_watts{uuid=\"$gpu\"}", + "interval": "", + "legendFormat": "", + "refId": "A" + } + ], + "title": "Power Draw %", + "type": "gauge" + }, + { + "datasource": "prometheus", + "description": "The fan speed value is the percent of the product's maximum noise tolerance fan speed that the device's fan is currently intended to run at. This value may exceed 100% in certain cases. Note: The reported speed is the intended fan speed. If the fan is physically blocked and unable to spin, this output will not match the actual fan speed. 
Many parts do not report fan speeds because they rely on cooling via fans in the surrounding enclosure.\n", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "max": 1, + "min": 0, + "thresholds": { + "mode": "percentage", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "#EAB839", + "value": 70 + }, + { + "color": "red", + "value": 90 + } + ] + }, + "unit": "percentunit" + }, + "overrides": [] + }, + "gridPos": { + "h": 5, + "w": 3, + "x": 12, + "y": 0 + }, + "id": 4, + "options": { + "reduceOptions": { + "calcs": [ + "last" + ], + "fields": "", + "values": false + }, + "showThresholdLabels": false, + "showThresholdMarkers": true, + "text": {} + }, + "pluginVersion": "8.0.3", + "targets": [ + { + "exemplar": true, + "expr": "nvidia_smi_fan_speed_ratio{uuid=\"$gpu\"}", + "interval": "", + "legendFormat": "", + "refId": "A" + } + ], + "title": "Fan Speed %", + "type": "gauge" + }, + { + "datasource": "prometheus", + "description": "Core GPU temperature. 
in degrees C.", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "max": 100, + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "#EAB839", + "value": 70 + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "celsius" + }, + "overrides": [] + }, + "gridPos": { + "h": 5, + "w": 3, + "x": 15, + "y": 0 + }, + "id": 16, + "options": { + "reduceOptions": { + "calcs": [ + "last" + ], + "fields": "", + "values": false + }, + "showThresholdLabels": false, + "showThresholdMarkers": true, + "text": {} + }, + "pluginVersion": "8.0.3", + "targets": [ + { + "exemplar": true, + "expr": "nvidia_smi_temperature_gpu{uuid=\"$gpu\"}", + "interval": "", + "legendFormat": "{{uuid}}", + "refId": "A" + } + ], + "timeFrom": null, + "timeShift": null, + "title": "Temperature", + "type": "gauge" + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "prometheus", + "description": "Percent of time over the past sample period during which global (device) memory was being read or written.\nThe sample period may be between 1 second and 1/6 second depending on the product.", + "fieldConfig": { + "defaults": { + "unit": "percentunit" + }, + "overrides": [] + }, + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 5, + "w": 6, + "x": 18, + "y": 0 + }, + "hiddenSeries": false, + "id": 11, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": false, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "nullPointMode": "null", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "8.0.3", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "exemplar": true, + "expr": 
"nvidia_smi_utilization_memory_ratio{uuid=\"$gpu\"}", + "interval": "", + "legendFormat": "{{uuid}}", + "refId": "A" + } + ], + "thresholds": [ + { + "$$hashKey": "object:1370", + "colorMode": "warning", + "fill": true, + "line": true, + "op": "gt", + "value": 0.7, + "yaxis": "left" + }, + { + "$$hashKey": "object:1376", + "colorMode": "critical", + "fill": true, + "line": true, + "op": "gt", + "value": 0.9, + "yaxis": "left" + } + ], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Memory Utilization %", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:1352", + "format": "percentunit", + "label": "", + "logBase": 1, + "max": "1", + "min": "0", + "show": true + }, + { + "$$hashKey": "object:1353", + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "datasource": "prometheus", + "description": "The version of the installed NVIDIA display driver. 
This is an alphanumeric string.", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "decimals": 2, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "none" + }, + "overrides": [] + }, + "gridPos": { + "h": 2, + "w": 3, + "x": 0, + "y": 3 + }, + "id": 14, + "options": { + "colorMode": "value", + "graphMode": "none", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "last" + ], + "fields": "", + "values": false + }, + "text": {}, + "textMode": "name" + }, + "pluginVersion": "8.0.3", + "targets": [ + { + "exemplar": true, + "expr": "nvidia_smi_gpu_info{uuid=\"$gpu\"}", + "instant": true, + "interval": "", + "legendFormat": "{{driver_version}}", + "refId": "A" + } + ], + "timeFrom": null, + "timeShift": null, + "title": "Driver Version", + "type": "stat" + }, + { + "datasource": "prometheus", + "description": "The BIOS of the GPU board.", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "decimals": 2, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "none" + }, + "overrides": [] + }, + "gridPos": { + "h": 2, + "w": 3, + "x": 3, + "y": 3 + }, + "id": 34, + "options": { + "colorMode": "value", + "graphMode": "none", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "last" + ], + "fields": "", + "values": false + }, + "text": {}, + "textMode": "name" + }, + "pluginVersion": "8.0.3", + "targets": [ + { + "exemplar": true, + "expr": "nvidia_smi_gpu_info{uuid=\"$gpu\"}", + "instant": true, + "interval": "", + "legendFormat": "{{vbios_version}}", + "refId": "A" + } + ], + "timeFrom": null, + "timeShift": null, + "title": "Vbios Version", + "type": "stat" + }, + { + "cacheTimeout": null, + "datasource": null, + "description": "Information about factors that are reducing the frequency 
of clocks. If all throttle reasons are returned as \"Not Active\" it means that clocks are running as high as possible.", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "decimals": 0, + "mappings": [ + { + "options": { + "0": { + "text": "Not Active" + }, + "1": { + "text": "Active" + } + }, + "type": "value" + } + ], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "none" + }, + "overrides": [] + }, + "gridPos": { + "h": 5, + "w": 6, + "x": 0, + "y": 5 + }, + "id": 32, + "interval": null, + "links": [], + "options": { + "displayMode": "gradient", + "orientation": "horizontal", + "reduceOptions": { + "calcs": [ + "last" + ], + "fields": "", + "values": false + }, + "showUnfilled": true, + "text": {} + }, + "pluginVersion": "8.0.3", + "targets": [ + { + "exemplar": true, + "expr": "nvidia_smi_clocks_throttle_reasons_gpu_idle{uuid=\"$gpu\"}", + "instant": false, + "interval": "", + "legendFormat": "Idle", + "refId": "A" + }, + { + "exemplar": true, + "expr": "nvidia_smi_clocks_throttle_reasons_hw_thermal_slowdown{uuid=\"$gpu\"}", + "hide": false, + "interval": "", + "legendFormat": "HW Thermal Slowdown", + "refId": "B" + }, + { + "exemplar": true, + "expr": "nvidia_smi_clocks_throttle_reasons_sw_power_cap{uuid=\"$gpu\"}", + "hide": false, + "interval": "", + "legendFormat": "SW Power Cap", + "refId": "C" + }, + { + "exemplar": true, + "expr": "nvidia_smi_clocks_throttle_reasons_applications_clocks_setting{uuid=\"$gpu\"}", + "hide": false, + "interval": "", + "legendFormat": "App Clocks Setting", + "refId": "D" + }, + { + "exemplar": true, + "expr": "nvidia_smi_clocks_throttle_reasons_hw_power_brake_slowdown{uuid=\"$gpu\"}", + "hide": false, + "interval": "", + "legendFormat": "HW Power Brake", + "refId": "E" + }, + { + "exemplar": true, + "expr": "nvidia_smi_clocks_throttle_reasons_sw_thermal_slowdown{uuid=\"$gpu\"}", + "hide": false, + "interval": "", + "legendFormat": 
"SW Thermal Slowdown", + "refId": "F" + }, + { + "exemplar": true, + "expr": "nvidia_smi_clocks_throttle_reasons_sync_boost{uuid=\"$gpu\"}", + "hide": false, + "interval": "", + "legendFormat": "Sync Boost", + "refId": "G" + } + ], + "title": "Throttle Reasons", + "type": "bargauge" + }, + { + "datasource": "prometheus", + "description": "Current frequency of graphics (shader) clock\n/\nMaximum frequency of graphics (shader) clock.\n", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "max": 1, + "min": 0, + "thresholds": { + "mode": "percentage", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "#EAB839", + "value": 70 + }, + { + "color": "red", + "value": 90 + } + ] + }, + "unit": "percentunit" + }, + "overrides": [] + }, + "gridPos": { + "h": 5, + "w": 3, + "x": 6, + "y": 5 + }, + "id": 20, + "options": { + "reduceOptions": { + "calcs": [ + "last" + ], + "fields": "", + "values": false + }, + "showThresholdLabels": false, + "showThresholdMarkers": true, + "text": {} + }, + "pluginVersion": "8.0.3", + "targets": [ + { + "exemplar": true, + "expr": "nvidia_smi_clocks_current_graphics_clock_hz{uuid=\"$gpu\"} / nvidia_smi_clocks_max_graphics_clock_hz{uuid=\"$gpu\"}", + "interval": "", + "legendFormat": "", + "refId": "A" + } + ], + "title": "GPU Clock Speed %", + "type": "gauge" + }, + { + "datasource": "prometheus", + "description": "Current frequency of memory clock / Maximum frequency of memory clock", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "max": 1, + "min": 0, + "thresholds": { + "mode": "percentage", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "#EAB839", + "value": 70 + }, + { + "color": "red", + "value": 90 + } + ] + }, + "unit": "percentunit" + }, + "overrides": [] + }, + "gridPos": { + "h": 5, + "w": 3, + "x": 9, + "y": 5 + }, + "id": 33, + "options": { + "reduceOptions": { + "calcs": [ + "last" + ], + 
"fields": "", + "values": false + }, + "showThresholdLabels": false, + "showThresholdMarkers": true, + "text": {} + }, + "pluginVersion": "8.0.3", + "targets": [ + { + "exemplar": true, + "expr": "nvidia_smi_clocks_current_memory_clock_hz{uuid=\"$gpu\"} / nvidia_smi_clocks_max_memory_clock_hz{uuid=\"$gpu\"}", + "interval": "", + "legendFormat": "", + "refId": "A" + } + ], + "title": "Memory Clock Speed %", + "type": "gauge" + }, + { + "datasource": "prometheus", + "description": "Total memory allocated by active contexts / Total installed GPU memory.", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "max": 1, + "min": 0, + "thresholds": { + "mode": "percentage", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "#EAB839", + "value": 70 + }, + { + "color": "red", + "value": 90 + } + ] + }, + "unit": "percentunit" + }, + "overrides": [] + }, + "gridPos": { + "h": 5, + "w": 3, + "x": 12, + "y": 5 + }, + "id": 25, + "options": { + "reduceOptions": { + "calcs": [ + "last" + ], + "fields": "", + "values": false + }, + "showThresholdLabels": false, + "showThresholdMarkers": true, + "text": {} + }, + "pluginVersion": "8.0.3", + "targets": [ + { + "exemplar": true, + "expr": "nvidia_smi_memory_used_bytes{uuid=\"$gpu\"} / nvidia_smi_memory_total_bytes{uuid=\"$gpu\"}", + "interval": "", + "legendFormat": "", + "refId": "A" + } + ], + "title": "Memory Allocation %", + "type": "gauge" + }, + { + "datasource": "prometheus", + "description": "Percent of time over the past sample period during which global (device) memory was being read or written.\nThe sample period may be between 1 second and 1/6 second depending on the product.", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "max": 1, + "min": 0, + "thresholds": { + "mode": "percentage", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "#EAB839", + "value": 70 + }, + { + "color": 
"red", + "value": 90 + } + ] + }, + "unit": "percentunit" + }, + "overrides": [] + }, + "gridPos": { + "h": 5, + "w": 3, + "x": 15, + "y": 5 + }, + "id": 7, + "options": { + "reduceOptions": { + "calcs": [ + "last" + ], + "fields": "", + "values": false + }, + "showThresholdLabels": false, + "showThresholdMarkers": true, + "text": {} + }, + "pluginVersion": "8.0.3", + "targets": [ + { + "exemplar": true, + "expr": "nvidia_smi_utilization_memory_ratio{uuid=\"$gpu\"}", + "interval": "", + "legendFormat": "", + "refId": "A" + } + ], + "title": "Memory Utilization %", + "type": "gauge" + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "prometheus", + "description": "Percent of time over the past sample period during which one or more kernels was executing on the GPU.\nThe sample period may be between 1 second and 1/6 second depending on the product.", + "fieldConfig": { + "defaults": { + "unit": "percentunit" + }, + "overrides": [] + }, + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 5, + "w": 6, + "x": 18, + "y": 5 + }, + "hiddenSeries": false, + "id": 10, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": false, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "nullPointMode": "null", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "8.0.3", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "exemplar": true, + "expr": "nvidia_smi_utilization_gpu_ratio{uuid=\"$gpu\"}", + "interval": "", + "legendFormat": "", + "refId": "A" + } + ], + "thresholds": [ + { + "$$hashKey": "object:1370", + "colorMode": "warning", + "fill": true, + "line": true, + "op": "gt", + "value": 0.7, + "yaxis": "left" + }, + { + "$$hashKey": "object:1376", + "colorMode": "critical", + "fill": true, + "line": true, + 
"op": "gt", + "value": 0.9, + "yaxis": "left" + } + ], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "GPU Utilization %", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:1352", + "format": "percentunit", + "label": "", + "logBase": 1, + "max": "1", + "min": "0", + "show": true + }, + { + "$$hashKey": "object:1353", + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "prometheus", + "description": "Total memory allocated by active contexts.", + "fieldConfig": { + "defaults": { + "unit": "decbytes" + }, + "overrides": [] + }, + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 5, + "w": 6, + "x": 0, + "y": 10 + }, + "hiddenSeries": false, + "id": 17, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": false, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "nullPointMode": "null", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "8.0.3", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "exemplar": true, + "expr": "nvidia_smi_memory_used_bytes{uuid=\"$gpu\"}", + "interval": "", + "legendFormat": "{{uuid}}", + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Memory Allocation", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": 
true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:1352", + "format": "decbytes", + "label": "", + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "$$hashKey": "object:1353", + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "prometheus", + "description": "Core GPU temperature. in degrees C.", + "fieldConfig": { + "defaults": { + "unit": "celsius" + }, + "overrides": [] + }, + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 5, + "w": 6, + "x": 6, + "y": 10 + }, + "hiddenSeries": false, + "id": 15, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": false, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "nullPointMode": "null", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "8.0.3", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "exemplar": true, + "expr": "nvidia_smi_temperature_gpu{uuid=\"$gpu\"}", + "interval": "", + "legendFormat": "{{uuid}}", + "refId": "A" + } + ], + "thresholds": [ + { + "$$hashKey": "object:1805", + "colorMode": "warning", + "fill": true, + "line": true, + "op": "gt", + "value": 70, + "yaxis": "left" + }, + { + "$$hashKey": "object:1811", + "colorMode": "critical", + "fill": true, + "line": true, + "op": "gt", + "value": 80, + "yaxis": "left" + } + ], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Temperature", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + 
"$$hashKey": "object:1761", + "format": "celsius", + "label": "", + "logBase": 1, + "max": "100", + "min": "0", + "show": true + }, + { + "$$hashKey": "object:1762", + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "prometheus", + "description": "The last measured power draw for the entire board, in watts. Only available if power management is supported. This reading is accurate to within +/- 5 watts", + "fieldConfig": { + "defaults": { + "unit": "watt" + }, + "overrides": [] + }, + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 5, + "w": 6, + "x": 12, + "y": 10 + }, + "hiddenSeries": false, + "id": 8, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": false, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "nullPointMode": "null", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "8.0.3", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "exemplar": true, + "expr": "nvidia_smi_power_draw_watts{uuid=\"$gpu\"}", + "interval": "", + "legendFormat": "{{uuid}}", + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Power Draw", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:658", + "format": "watt", + "label": null, + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "$$hashKey": "object:659", + "format": "short", + "label": null, + "logBase": 1, + "max": 
null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "prometheus", + "description": "The fan speed value is the percent of the product's maximum noise tolerance fan speed that the device's fan is currently intended to run at. This value may exceed 100% in certain cases. Note: The reported speed is the intended fan speed. If the fan is physically blocked and unable to spin, this output will not match the actual fan speed. Many parts do not report fan speeds because they rely on cooling via fans in the surrounding enclosure.\n", + "fieldConfig": { + "defaults": { + "unit": "percentunit" + }, + "overrides": [] + }, + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 5, + "w": 6, + "x": 18, + "y": 10 + }, + "hiddenSeries": false, + "id": 9, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": false, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "nullPointMode": "null", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "8.0.3", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "exemplar": true, + "expr": "nvidia_smi_fan_speed_ratio{uuid=\"$gpu\"}", + "interval": "", + "legendFormat": "{{uuid}}", + "refId": "A" + } + ], + "thresholds": [ + { + "$$hashKey": "object:1168", + "colorMode": "critical", + "fill": true, + "line": true, + "op": "gt", + "value": 0.9, + "yaxis": "left" + }, + { + "$$hashKey": "object:1174", + "colorMode": "warning", + "fill": true, + "line": true, + "op": "gt", + "value": 0.7, + "yaxis": "left" + } + ], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Fan Speed %", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + 
"type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:1275", + "format": "percentunit", + "label": null, + "logBase": 1, + "max": "1", + "min": "0", + "show": true + }, + { + "$$hashKey": "object:1276", + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "prometheus", + "description": "Current frequency of graphics (shader) clock.", + "fieldConfig": { + "defaults": { + "unit": "hertz" + }, + "overrides": [] + }, + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 5, + "w": 6, + "x": 0, + "y": 15 + }, + "hiddenSeries": false, + "id": 12, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": false, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "nullPointMode": "null", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "8.0.3", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "exemplar": true, + "expr": "nvidia_smi_clocks_current_graphics_clock_hz{uuid=\"$gpu\"}", + "format": "time_series", + "interval": "", + "legendFormat": "{{uuid}}", + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Graphics Clock Speed", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:1642", + "format": "hertz", + "label": null, + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + 
"$$hashKey": "object:1643", + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "prometheus", + "description": "Current frequency of video encoder/decoder clock.", + "fieldConfig": { + "defaults": { + "unit": "hertz" + }, + "overrides": [] + }, + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 5, + "w": 6, + "x": 6, + "y": 15 + }, + "hiddenSeries": false, + "id": 19, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": false, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "nullPointMode": "null", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "8.0.3", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "exemplar": true, + "expr": "nvidia_smi_clocks_current_video_clock_hz{uuid=\"$gpu\"}", + "format": "time_series", + "interval": "", + "legendFormat": "{{uuid}}", + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Video Clock Speed", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:1642", + "format": "hertz", + "label": null, + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "$$hashKey": "object:1643", + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + 
"datasource": "prometheus", + "description": "Current frequency of SM (Streaming Multiprocessor) clock.", + "fieldConfig": { + "defaults": { + "unit": "hertz" + }, + "overrides": [] + }, + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 5, + "w": 6, + "x": 12, + "y": 15 + }, + "hiddenSeries": false, + "id": 24, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": false, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "nullPointMode": "null", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "8.0.3", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "exemplar": true, + "expr": "nvidia_smi_clocks_current_sm_clock_hz{uuid=\"$gpu\"}", + "format": "time_series", + "interval": "", + "legendFormat": "{{uuid}}", + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "SM Clock Speed", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:1642", + "format": "hertz", + "label": null, + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "$$hashKey": "object:1643", + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "prometheus", + "description": "Current frequency of memory clock.", + "fieldConfig": { + "defaults": { + "unit": "hertz" + }, + "overrides": [] + }, + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 5, + "w": 6, + "x": 18, + "y": 15 + }, + "hiddenSeries": 
false, + "id": 18, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": false, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "nullPointMode": "null", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "8.0.3", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "exemplar": true, + "expr": "nvidia_smi_clocks_current_memory_clock_hz{uuid=\"$gpu\"}", + "format": "time_series", + "interval": "", + "legendFormat": "{{uuid}}", + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Memory Clock Speed", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:1642", + "format": "hertz", + "label": null, + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "$$hashKey": "object:1643", + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + } + ], + "refresh": "10s", + "schemaVersion": 30, + "style": "dark", + "tags": [ + "nvidia", + "nvidia-smi", + "nvidia_gpu_exporter", + "prometheus" + ], + "templating": { + "list": [ + { + "allValue": null, + "current": {}, + "datasource": "prometheus", + "definition": "label_values(nvidia_smi_index, uuid)", + "description": null, + "error": null, + "hide": 0, + "includeAll": false, + "label": "GPU", + "multi": false, + "name": "gpu", + "options": [], + "query": { + "query": "label_values(nvidia_smi_index, uuid)", + "refId": "StandardVariableQuery" + }, + "refresh": 1, + "regex": "", + "skipUrlSync": false, + "sort": 1, + "tagValuesQuery": "", + 
"tagsQuery": "", + "type": "query", + "useTags": false + } + ] + }, + "time": { + "from": "now-30m", + "to": "now" + }, + "timepicker": {}, + "timezone": "", + "title": "Nvidia GPU Metrics", + "uid": "vlvPlrgnk", + "version": 6 + } \ No newline at end of file diff --git a/cluster/roles/monitoring/files/grafana/dashboards/slurm-exporter-nodes.json.disabled b/cluster/roles/monitoring/files/grafana/dashboards/slurm-exporter-nodes.json.disabled new file mode 100644 index 0000000..00b3440 --- /dev/null +++ b/cluster/roles/monitoring/files/grafana/dashboards/slurm-exporter-nodes.json.disabled @@ -0,0 +1,553 @@ +{ + "__inputs": [ + { + "name": "DS_PROMETHEUS", + "label": "Prometheus", + "description": "", + "type": "datasource", + "pluginId": "prometheus", + "pluginName": "Prometheus" + } + ], + "__requires": [ + { + "type": "grafana", + "id": "grafana", + "name": "Grafana", + "version": "7.0.3" + }, + { + "type": "panel", + "id": "graph", + "name": "Graph", + "version": "" + }, + { + "type": "datasource", + "id": "prometheus", + "name": "Prometheus", + "version": "1.0.0" + } + ], + "annotations": { + "list": [ + { + "builtIn": 1, + "datasource": "-- Grafana --", + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "name": "Annotations & Alerts", + "type": "dashboard" + } + ] + }, + "editable": true, + "gnetId": null, + "graphTooltip": 0, + "id": null, + "iteration": 1593456204933, + "links": [], + "panels": [ + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "prometheus", + "decimals": 0, + "fieldConfig": { + "defaults": { + "custom": {} + }, + "overrides": [] + }, + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 10, + "w": 24, + "x": 0, + "y": 0 + }, + "hiddenSeries": false, + "id": 2, + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "hideEmpty": false, + "hideZero": false, + "max": false, + "min": false, + "rightSide": true, + "show": true, + "total": false, + 
"values": true + }, + "lines": true, + "linewidth": 1, + "nullPointMode": "null as zero", + "options": { + "dataLinks": [] + }, + "percentage": false, + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": true, + "steppedLine": false, + "targets": [ + { + "expr": "sum(slurm_node_nodes{cluster=\"$cluster\",nodes=~\"$nodeset\"}) by (state)", + "legendFormat": "{{state}}", + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Node count", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "decimals": 0, + "format": "short", + "label": "", + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "prometheus", + "decimals": null, + "fieldConfig": { + "defaults": { + "custom": {} + }, + "overrides": [] + }, + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 10, + "w": 24, + "x": 0, + "y": 10 + }, + "hiddenSeries": false, + "id": 3, + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "hideEmpty": false, + "hideZero": false, + "max": false, + "min": false, + "rightSide": true, + "show": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "nullPointMode": "null as zero", + "options": { + "dataLinks": [] + }, + "percentage": false, + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "alias": "loadavg", + "fill": 0, + "linewidth": 2, + "stack": false + } + ], + "spaceLength": 10, + "stack": true, + "steppedLine": 
false, + "targets": [ + { + "expr": "sum(slurm_node_cpus{cluster=\"$cluster\",nodes=~\"$nodeset\"}) by (state)", + "legendFormat": "{{state}}", + "refId": "A" + }, + { + "expr": "sum(slurm_node_load{cluster=\"$cluster\",nodes=~\"$nodeset\"})", + "legendFormat": "loadavg", + "refId": "B" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "CPU count (cores)", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "decimals": 0, + "format": "short", + "label": "", + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "prometheus", + "decimals": 0, + "fieldConfig": { + "defaults": { + "custom": {} + }, + "overrides": [] + }, + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 9, + "w": 24, + "x": 0, + "y": 20 + }, + "hiddenSeries": false, + "id": 4, + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "hideEmpty": true, + "hideZero": true, + "max": false, + "min": false, + "rightSide": true, + "show": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "nullPointMode": "null as zero", + "options": { + "dataLinks": [] + }, + "percentage": false, + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "alias": "used", + "fill": 0, + "linewidth": 2, + "stack": false + } + ], + "spaceLength": 10, + "stack": true, + "steppedLine": false, + "targets": [ + { + "expr": "sum(slurm_node_bytes{cluster=\"$cluster\",nodes=~\"$nodeset\"}) by (state)", + "legendFormat": "{{state}}", + "refId": "A" + }, + { 
+ "expr": "sum(slurm_node_used_bytes{cluster=\"$cluster\",nodes=~\"$nodeset\"})", + "legendFormat": "used", + "refId": "B" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Memory", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "decimals": null, + "format": "bytes", + "label": "", + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "prometheus", + "decimals": 0, + "fieldConfig": { + "defaults": { + "custom": {} + }, + "overrides": [] + }, + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 10, + "w": 24, + "x": 0, + "y": 29 + }, + "hiddenSeries": false, + "id": 5, + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "hideEmpty": false, + "hideZero": false, + "max": false, + "min": false, + "rightSide": true, + "show": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "nullPointMode": "null as zero", + "options": { + "dataLinks": [] + }, + "percentage": false, + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "alias": "utilization", + "fill": 0, + "linewidth": 2, + "stack": false + } + ], + "spaceLength": 10, + "stack": true, + "steppedLine": false, + "targets": [ + { + "expr": "sum(slurm_node_gpus{cluster=\"$cluster\",nodes=~\"$nodeset\"}) by (state)", + "legendFormat": "{{state}}", + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "GPUs", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": 
"individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "decimals": 0, + "format": "short", + "label": "", + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + } + ], + "refresh": false, + "schemaVersion": 25, + "style": "dark", + "tags": [], + "templating": { + "list": [ + { + "allValue": null, + "current": {}, + "datasource": "prometheus", + "definition": "label_values(slurm_stats_server_threads, cluster)", + "hide": 0, + "includeAll": false, + "label": null, + "multi": false, + "name": "cluster", + "options": [], + "query": "label_values(slurm_stats_server_threads, cluster)", + "refresh": 1, + "regex": "", + "skipUrlSync": false, + "sort": 0, + "tagValuesQuery": "", + "tags": [], + "tagsQuery": "", + "type": "query", + "useTags": false + }, + { + "allValue": null, + "current": {}, + "datasource": "prometheus", + "definition": "label_values(slurm_node_nodes,nodes)", + "hide": 0, + "includeAll": true, + "label": null, + "multi": false, + "name": "nodeset", + "options": [], + "query": "label_values(slurm_node_nodes,nodes)", + "refresh": 1, + "regex": "", + "skipUrlSync": false, + "sort": 1, + "tagValuesQuery": "", + "tags": [], + "tagsQuery": "", + "type": "query", + "useTags": false + } + ] + }, + "time": { + "from": "now-24h", + "to": "now" + }, + "timepicker": { + "refresh_intervals": [ + "10s", + "30s", + "1m", + "5m", + "15m", + "30m", + "1h", + "2h", + "1d" + ] + }, + "timezone": "", + "title": "nodes", + "uid": "n8o02ZMGz", + "version": 2 +} \ No newline at end of file diff --git a/cluster/roles/monitoring/files/grafana/dashboards/slurm-exporter-queued.json.disabled b/cluster/roles/monitoring/files/grafana/dashboards/slurm-exporter-queued.json.disabled new file mode 100644 index 
0000000..0795fb7 --- /dev/null +++ b/cluster/roles/monitoring/files/grafana/dashboards/slurm-exporter-queued.json.disabled @@ -0,0 +1,887 @@ +{ + "__inputs": [ + { + "name": "DS_PROMETHEUS", + "label": "prometheus", + "description": "", + "type": "datasource", + "pluginId": "prometheus", + "pluginName": "Prometheus" + } + ], + "__requires": [ + { + "type": "grafana", + "id": "grafana", + "name": "Grafana", + "version": "7.0.3" + }, + { + "type": "panel", + "id": "graph", + "name": "Graph", + "version": "" + }, + { + "type": "datasource", + "id": "prometheus", + "name": "Prometheus", + "version": "1.0.0" + }, + { + "type": "panel", + "id": "table-old", + "name": "Table (old)", + "version": "" + } + ], + "annotations": { + "list": [ + { + "builtIn": 1, + "datasource": "-- Grafana --", + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "name": "Annotations & Alerts", + "type": "dashboard" + } + ] + }, + "editable": true, + "gnetId": null, + "graphTooltip": 0, + "id": null, + "iteration": 1593456245590, + "links": [], + "panels": [ + { + "columns": [], + "datasource": "${DS_PROMETHEUS}", + "description": "click on zoom buttons to filter", + "fieldConfig": { + "defaults": { + "custom": {} + }, + "overrides": [] + }, + "fontSize": "100%", + "gridPos": { + "h": 48, + "w": 4, + "x": 0, + "y": 0 + }, + "id": 12, + "pageSize": null, + "scroll": true, + "showHeader": true, + "sort": { + "col": 0, + "desc": true + }, + "styles": [ + { + "alias": "Time", + "align": "auto", + "dateFormat": "YYYY-MM-DD HH:mm:ss", + "pattern": "Time", + "type": "hidden" + }, + { + "alias": "jobs", + "align": "auto", + "colorMode": "value", + "colors": [ + "#37872D", + "#E0B400", + "#C4162A" + ], + "decimals": 0, + "pattern": "Value", + "thresholds": [ + "5", + "20" + ], + "type": "number", + "unit": "short" + } + ], + "targets": [ + { + "expr": "sum(slurm_job_count{state=\"pending\"}) by (account,partition,user)", + "format": "table", + "instant": true, + "legendFormat": 
"jobs", + "refId": "A" + } + ], + "timeFrom": null, + "timeShift": null, + "title": "Job counts", + "transform": "table", + "type": "table-old" + }, + { + "aliasColors": { + "cca": "red", + "cca pend": "dark-red", + "ccb": "green", + "ccb pend": "dark-green", + "ccm": "orange", + "ccq": "purple", + "ccq pend": "dark-purple", + "other": "yellow", + "scc": "blue", + "scc pend": "dark-blue" + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_PROMETHEUS}", + "decimals": 0, + "fieldConfig": { + "defaults": { + "custom": {} + }, + "overrides": [] + }, + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 8, + "w": 20, + "x": 4, + "y": 0 + }, + "hiddenSeries": false, + "id": 2, + "interval": "", + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "hideEmpty": true, + "hideZero": true, + "max": false, + "min": false, + "rightSide": true, + "show": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "nullPointMode": "null as zero", + "options": { + "dataLinks": [] + }, + "percentage": false, + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": true, + "steppedLine": false, + "targets": [ + { + "expr": "sum(slurm_job_count{state=\"pending\"}) by ($grouping)", + "legendFormat": "{{$grouping}}", + "refId": "B" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Job count", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "decimals": 0, + "format": "short", + "label": "", + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { 
+ "aliasColors": { + "cca": "red", + "cca pend": "dark-red", + "ccb": "green", + "ccb pend": "dark-green", + "ccm": "orange", + "ccq": "purple", + "ccq pend": "dark-purple", + "other": "yellow", + "scc": "blue", + "scc pend": "dark-blue" + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_PROMETHEUS}", + "decimals": 0, + "fieldConfig": { + "defaults": { + "custom": {} + }, + "overrides": [] + }, + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 8, + "w": 20, + "x": 4, + "y": 8 + }, + "hiddenSeries": false, + "id": 7, + "interval": "", + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "hideEmpty": true, + "hideZero": true, + "max": false, + "min": false, + "rightSide": true, + "show": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "nullPointMode": "null as zero", + "options": { + "dataLinks": [] + }, + "percentage": false, + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": true, + "steppedLine": false, + "targets": [ + { + "expr": "sum(slurm_job_nodes{state=\"pending\"}) by ($grouping)", + "legendFormat": "{{$grouping}}", + "refId": "B" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Node count", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "decimals": 0, + "format": "short", + "label": "", + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": { + "cca": "red", + "cca loadavg": "light-red", + "cca pend": "dark-red", + "ccb": "green", + "ccb loadavg": "light-green", + "ccb 
pend": "dark-green", + "ccm": "orange", + "ccm loadavg": "light-orange", + "ccq": "purple", + "ccq loadavg": "light-purple", + "ccq pend": "dark-purple", + "other": "yellow", + "other loadavg": "light-yellow", + "scc": "blue", + "scc loadavg": "light-blue", + "scc pend": "dark-blue" + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_PROMETHEUS}", + "decimals": 0, + "fieldConfig": { + "defaults": { + "custom": {} + }, + "overrides": [] + }, + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 8, + "w": 20, + "x": 4, + "y": 16 + }, + "hiddenSeries": false, + "id": 8, + "interval": "", + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "hideEmpty": true, + "hideZero": true, + "max": false, + "min": false, + "rightSide": true, + "show": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "nullPointMode": "null as zero", + "options": { + "dataLinks": [] + }, + "percentage": false, + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": true, + "steppedLine": false, + "targets": [ + { + "expr": "sum(slurm_job_cpus{state=\"pending\"}) by ($grouping)", + "legendFormat": "{{$grouping}}", + "refId": "B" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "CPU count (cores)", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "decimals": 0, + "format": "short", + "label": "", + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": { + "cca": "red", + "cca loadavg": "light-red", + "cca pend": "dark-red", + 
"ccb": "green", + "ccb loadavg": "light-green", + "ccb pend": "dark-green", + "ccm": "orange", + "ccm loadavg": "light-orange", + "ccq": "purple", + "ccq loadavg": "light-purple", + "ccq pend": "dark-purple", + "other": "yellow", + "other loadavg": "light-yellow", + "scc": "blue", + "scc loadavg": "light-blue", + "scc pend": "dark-blue" + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_PROMETHEUS}", + "decimals": null, + "fieldConfig": { + "defaults": { + "custom": {} + }, + "overrides": [] + }, + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 8, + "w": 20, + "x": 4, + "y": 24 + }, + "hiddenSeries": false, + "id": 9, + "interval": "", + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "hideEmpty": true, + "hideZero": true, + "max": false, + "min": false, + "rightSide": true, + "show": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "nullPointMode": "null as zero", + "options": { + "dataLinks": [] + }, + "percentage": false, + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": true, + "steppedLine": false, + "targets": [ + { + "expr": "sum(slurm_job_bytes{state=\"pending\"}) by ($grouping)", + "legendFormat": "{{$grouping}}", + "refId": "B" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Memory", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "decimals": null, + "format": "bytes", + "label": "", + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": { + "cca": "red", + "cca 
loadavg": "light-red", + "cca pend": "dark-red", + "ccb": "green", + "ccb loadavg": "light-green", + "ccb pend": "dark-green", + "ccm": "orange", + "ccm loadavg": "light-orange", + "ccq": "purple", + "ccq loadavg": "light-purple", + "ccq pend": "dark-purple", + "other": "yellow", + "other loadavg": "light-yellow", + "scc": "blue", + "scc loadavg": "light-blue", + "scc pend": "dark-blue" + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_PROMETHEUS}", + "decimals": 0, + "fieldConfig": { + "defaults": { + "custom": {} + }, + "overrides": [] + }, + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 8, + "w": 20, + "x": 4, + "y": 32 + }, + "id": 10, + "interval": "", + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "hideEmpty": true, + "hideZero": true, + "max": false, + "min": false, + "rightSide": true, + "show": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "nullPointMode": "null as zero", + "options": { + "dataLinks": [] + }, + "percentage": false, + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": true, + "steppedLine": false, + "targets": [ + { + "expr": "sum(slurm_job_gpus{state=\"pending\"}) by ($grouping)", + "legendFormat": "{{$grouping}}", + "refId": "B" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "GPUs", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "decimals": 0, + "format": "short", + "label": "", + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": { + "cca": 
"red", + "cca loadavg": "light-red", + "cca pend": "dark-red", + "ccb": "green", + "ccb loadavg": "light-green", + "ccb pend": "dark-green", + "ccm": "orange", + "ccm loadavg": "light-orange", + "ccq": "purple", + "ccq loadavg": "light-purple", + "ccq pend": "dark-purple", + "other": "yellow", + "other loadavg": "light-yellow", + "scc": "blue", + "scc loadavg": "light-blue", + "scc pend": "dark-blue" + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_PROMETHEUS}", + "decimals": 0, + "fieldConfig": { + "defaults": { + "custom": {} + }, + "overrides": [] + }, + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 8, + "w": 20, + "x": 4, + "y": 40 + }, + "id": 13, + "interval": "", + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "hideEmpty": true, + "hideZero": true, + "max": false, + "min": false, + "rightSide": true, + "show": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "nullPointMode": "null as zero", + "options": { + "dataLinks": [] + }, + "percentage": false, + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": true, + "steppedLine": false, + "targets": [ + { + "expr": "sum(slurm_job_seconds{state=\"pending\"}) by ($grouping)", + "legendFormat": "{{$grouping}}", + "refId": "B" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Total queue wait time", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "decimals": 0, + "format": "s", + "label": "", + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + } + ], 
+ "schemaVersion": 25, + "style": "dark", + "tags": [], + "templating": { + "list": [ + { + "datasource": "prometheus", + "filters": [ + { + "key": "cluster", + "operator": "=", + "value": "iron" + } + ], + "hide": 0, + "label": "", + "name": "Filters", + "skipUrlSync": false, + "type": "adhoc" + }, + { + "allValue": null, + "current": { + "tags": [], + "text": "account", + "value": "account" + }, + "hide": 0, + "includeAll": false, + "label": null, + "multi": false, + "name": "grouping", + "options": [ + { + "selected": true, + "text": "account", + "value": "account" + }, + { + "selected": false, + "text": "partition", + "value": "partition" + }, + { + "selected": false, + "text": "user", + "value": "user" + } + ], + "query": "account,partition,user", + "skipUrlSync": false, + "type": "custom" + } + ] + }, + "time": { + "from": "now-24h", + "to": "now" + }, + "timepicker": { + "refresh_intervals": [ + "5m", + "15m", + "30m", + "1h", + "2h", + "1d" + ] + }, + "timezone": "", + "title": "queued", + "uid": "u5sJ2WMMz", + "version": 1 +} \ No newline at end of file diff --git a/cluster/roles/monitoring/files/grafana/dashboards/slurm-exporter-running.json.disabled b/cluster/roles/monitoring/files/grafana/dashboards/slurm-exporter-running.json.disabled new file mode 100644 index 0000000..c425cac --- /dev/null +++ b/cluster/roles/monitoring/files/grafana/dashboards/slurm-exporter-running.json.disabled @@ -0,0 +1,1127 @@ +{ + "__inputs": [ + { + "name": "DS_PROMETHEUS", + "label": "prometheus", + "description": "", + "type": "datasource", + "pluginId": "prometheus", + "pluginName": "Prometheus" + } + ], + "__requires": [ + { + "type": "grafana", + "id": "grafana", + "name": "Grafana", + "version": "7.0.3" + }, + { + "type": "panel", + "id": "graph", + "name": "Graph", + "version": "" + }, + { + "type": "datasource", + "id": "prometheus", + "name": "Prometheus", + "version": "1.0.0" + }, + { + "type": "panel", + "id": "table-old", + "name": "Table (old)", + "version": "" + 
} + ], + "annotations": { + "list": [ + { + "builtIn": 1, + "datasource": "-- Grafana --", + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "name": "Annotations & Alerts", + "type": "dashboard" + } + ] + }, + "editable": true, + "gnetId": null, + "graphTooltip": 0, + "id": null, + "iteration": 1593456121824, + "links": [], + "panels": [ + { + "columns": [], + "datasource": "${DS_PROMETHEUS}", + "description": "click on zoom buttons to filter", + "fieldConfig": { + "defaults": { + "custom": {} + }, + "overrides": [] + }, + "fontSize": "100%", + "gridPos": { + "h": 56, + "w": 5, + "x": 0, + "y": 0 + }, + "id": 12, + "interval": "", + "pageSize": null, + "scroll": true, + "showHeader": true, + "sort": { + "col": 5, + "desc": true + }, + "styles": [ + { + "alias": "Time", + "align": "auto", + "dateFormat": "YYYY-MM-DD HH:mm:ss", + "pattern": "Time", + "type": "hidden" + }, + { + "alias": "jobs", + "align": "auto", + "colorMode": "value", + "colors": [ + "#C8F2C2", + "#FFF899", + "#FFA6B0" + ], + "decimals": 0, + "pattern": "Value #A", + "thresholds": [ + "10", + "50" + ], + "type": "number", + "unit": "short" + }, + { + "alias": "avg load", + "align": "auto", + "colorMode": "value", + "colors": [ + "#C0D8FF", + "#C8F2C2", + "#FFF899" + ], + "dateFormat": "YYYY-MM-DD HH:mm:ss", + "decimals": 2, + "mappingType": 1, + "pattern": "Value #B", + "thresholds": [ + "0.5", + "2" + ], + "type": "number", + "unit": "percentunit" + }, + { + "alias": "used mem", + "align": "auto", + "colorMode": null, + "colors": [ + "#C8F2C2", + "#FFF899", + "#FFA6B0" + ], + "dateFormat": "YYYY-MM-DD HH:mm:ss", + "decimals": 2, + "mappingType": 1, + "pattern": "Value #C", + "thresholds": [ + "" + ], + "type": "number", + "unit": "bytes" + }, + { + "alias": "nodes", + "align": "auto", + "colorMode": "value", + "colors": [ + "#C8F2C2", + "#FFF899", + "#FFA6B0" + ], + "dateFormat": "YYYY-MM-DD HH:mm:ss", + "decimals": 0, + "mappingType": 1, + "pattern": "Value #D", + 
"thresholds": [ + "10", + "50" + ], + "type": "number", + "unit": "short" + } + ], + "targets": [ + { + "expr": "sum(slurm_job_count{state=\"running\"}) by (account,user,partition)", + "format": "table", + "instant": true, + "legendFormat": "jobs", + "refId": "A" + }, + { + "expr": "sum(slurm_job_nodes{state=\"running\"}) by (account,user,partition)", + "format": "table", + "instant": true, + "legendFormat": "nodes", + "refId": "D" + }, + { + "expr": "(sum(slurm_job_load) by (account,user,partition))/(sum(slurm_job_cpus{state=\"running\"}) by (account,user,partition))", + "format": "table", + "instant": true, + "legendFormat": "load", + "refId": "B" + }, + { + "expr": "sum(slurm_job_used_bytes) by (account,user,partition)", + "format": "table", + "instant": true, + "legendFormat": "memory", + "refId": "C" + } + ], + "timeFrom": null, + "timeShift": null, + "title": "Jobs", + "transform": "table", + "type": "table-old" + }, + { + "aliasColors": { + "cca": "red", + "cca pend": "dark-red", + "ccb": "green", + "ccb pend": "dark-green", + "ccm": "orange", + "ccq": "purple", + "ccq pend": "dark-purple", + "other": "yellow", + "scc": "blue", + "scc pend": "dark-blue" + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_PROMETHEUS}", + "decimals": 0, + "fieldConfig": { + "defaults": { + "custom": {} + }, + "overrides": [] + }, + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 8, + "w": 19, + "x": 5, + "y": 0 + }, + "hiddenSeries": false, + "id": 2, + "interval": "", + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "hideEmpty": true, + "hideZero": true, + "max": false, + "min": false, + "rightSide": true, + "show": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "nullPointMode": "null as zero", + "options": { + "dataLinks": [] + }, + "percentage": false, + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": true, + 
"steppedLine": false, + "targets": [ + { + "expr": "sum(slurm_job_count{state=\"running\"}) by ($grouping)", + "legendFormat": "{{$grouping}}", + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Job count", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "decimals": 0, + "format": "short", + "label": "", + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": { + "cca": "red", + "cca pend": "dark-red", + "ccb": "green", + "ccb pend": "dark-green", + "ccm": "orange", + "ccq": "purple", + "ccq pend": "dark-purple", + "other": "yellow", + "scc": "blue", + "scc pend": "dark-blue" + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_PROMETHEUS}", + "decimals": 0, + "fieldConfig": { + "defaults": { + "custom": {} + }, + "overrides": [] + }, + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 8, + "w": 19, + "x": 5, + "y": 8 + }, + "hiddenSeries": false, + "id": 7, + "interval": "", + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "hideEmpty": true, + "hideZero": true, + "max": false, + "min": false, + "rightSide": true, + "show": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "nullPointMode": "null as zero", + "options": { + "dataLinks": [] + }, + "percentage": false, + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": true, + "steppedLine": false, + "targets": [ + { + "expr": "sum(slurm_job_nodes{state=\"running\"}) by ($grouping)", + "legendFormat": "{{$grouping}}", + 
"refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Node count", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "decimals": 0, + "format": "short", + "label": "", + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": { + "cca": "red", + "cca loadavg": "light-red", + "cca pend": "dark-red", + "ccb": "green", + "ccb loadavg": "light-green", + "ccb pend": "dark-green", + "ccm": "orange", + "ccm loadavg": "light-orange", + "ccq": "purple", + "ccq loadavg": "light-purple", + "ccq pend": "dark-purple", + "other": "yellow", + "other loadavg": "light-yellow", + "scc": "blue", + "scc loadavg": "light-blue", + "scc pend": "dark-blue" + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_PROMETHEUS}", + "decimals": null, + "fieldConfig": { + "defaults": { + "custom": {} + }, + "overrides": [] + }, + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 8, + "w": 19, + "x": 5, + "y": 16 + }, + "hiddenSeries": false, + "id": 8, + "interval": "", + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "hideEmpty": true, + "hideZero": true, + "max": false, + "min": false, + "rightSide": true, + "show": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "nullPointMode": "null as zero", + "options": { + "dataLinks": [] + }, + "percentage": false, + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "alias": "/ loadavg$/", + "fill": 0, + "linewidth": 2, + "stack": false + } + ], + "spaceLength": 10, + "stack": true, + "steppedLine": 
false, + "targets": [ + { + "expr": "sum(slurm_job_cpus{state=\"running\"}) by ($grouping)", + "legendFormat": "{{$grouping}}", + "refId": "A" + }, + { + "expr": "sum(slurm_job_load) by ($grouping)", + "legendFormat": "{{$grouping}} loadavg", + "refId": "C" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "CPU count (cores)", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "decimals": null, + "format": "short", + "label": "", + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "decimals": null, + "format": "percentunit", + "label": null, + "logBase": 1, + "max": "2", + "min": "0", + "show": false + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": { + "cca": "red", + "cca loadavg": "light-red", + "cca pend": "dark-red", + "ccb": "green", + "ccb loadavg": "light-green", + "ccb pend": "dark-green", + "ccm": "orange", + "ccm loadavg": "light-orange", + "ccq": "purple", + "ccq loadavg": "light-purple", + "ccq pend": "dark-purple", + "other": "yellow", + "other loadavg": "light-yellow", + "scc": "blue", + "scc loadavg": "light-blue", + "scc pend": "dark-blue" + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_PROMETHEUS}", + "decimals": null, + "fieldConfig": { + "defaults": { + "custom": {} + }, + "overrides": [] + }, + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 8, + "w": 19, + "x": 5, + "y": 24 + }, + "hiddenSeries": false, + "id": 14, + "interval": "", + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "hideEmpty": true, + "hideZero": true, + "max": false, + "min": false, + "rightSide": true, + "show": true, + "sort": "current", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, 
+ "nullPointMode": "null as zero", + "options": { + "dataLinks": [] + }, + "percentage": false, + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "(sum(slurm_job_load) by ($grouping))/(sum(slurm_job_cpus{state=\"running\"}) by ($grouping))", + "legendFormat": "{{$grouping}}", + "refId": "C" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Average CPU load (efficiency)", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "decimals": null, + "format": "percentunit", + "label": "", + "logBase": 1, + "max": "2", + "min": "0", + "show": true + }, + { + "decimals": null, + "format": "percentunit", + "label": null, + "logBase": 1, + "max": "2", + "min": "0", + "show": false + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": { + "cca": "red", + "cca loadavg": "light-red", + "cca pend": "dark-red", + "ccb": "green", + "ccb loadavg": "light-green", + "ccb pend": "dark-green", + "ccm": "orange", + "ccm loadavg": "light-orange", + "ccq": "purple", + "ccq loadavg": "light-purple", + "ccq pend": "dark-purple", + "other": "yellow", + "other loadavg": "light-yellow", + "scc": "blue", + "scc loadavg": "light-blue", + "scc pend": "dark-blue" + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_PROMETHEUS}", + "decimals": 0, + "fieldConfig": { + "defaults": { + "custom": {} + }, + "overrides": [] + }, + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 8, + "w": 19, + "x": 5, + "y": 32 + }, + "hiddenSeries": false, + "id": 13, + "interval": "", + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "hideEmpty": true, + "hideZero": true, + "max": 
false, + "min": false, + "rightSide": true, + "show": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "nullPointMode": "null as zero", + "options": { + "dataLinks": [] + }, + "percentage": false, + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": true, + "steppedLine": false, + "targets": [ + { + "expr": "sum(slurm_job_seconds{state=\"running\"}*slurm_job_cpus{state=\"running\"}) by ($grouping)", + "legendFormat": "{{$grouping}}", + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Cumulative CPU time", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "decimals": 0, + "format": "s", + "label": "", + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": { + "cca": "red", + "cca loadavg": "light-red", + "cca pend": "dark-red", + "cca used": "super-light-red", + "ccb": "green", + "ccb loadavg": "light-green", + "ccb pend": "dark-green", + "ccb used": "super-light-green", + "ccm": "orange", + "ccm loadavg": "light-orange", + "ccm used": "super-light-orange", + "ccq": "purple", + "ccq loadavg": "light-purple", + "ccq pend": "dark-purple", + "ccq used": "super-light-purple", + "other": "yellow", + "other loadavg": "light-yellow", + "other used": "super-light-yellow", + "scc": "blue", + "scc loadavg": "light-blue", + "scc pend": "dark-blue", + "scc used": "super-light-blue" + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_PROMETHEUS}", + "decimals": null, + "fieldConfig": { + "defaults": { + "custom": {} 
+ }, + "overrides": [] + }, + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 8, + "w": 19, + "x": 5, + "y": 40 + }, + "hiddenSeries": false, + "id": 9, + "interval": "", + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "hideEmpty": true, + "hideZero": true, + "max": false, + "min": false, + "rightSide": true, + "show": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "nullPointMode": "null as zero", + "options": { + "dataLinks": [] + }, + "percentage": false, + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "alias": "/ used$/", + "fill": 0, + "linewidth": 2, + "stack": "B" + } + ], + "spaceLength": 10, + "stack": true, + "steppedLine": false, + "targets": [ + { + "expr": "sum(slurm_job_bytes{state=\"running\"}) by ($grouping)", + "legendFormat": "{{$grouping}}", + "refId": "A" + }, + { + "expr": "sum(slurm_job_used_bytes) by ($grouping)", + "legendFormat": "{{$grouping}} used", + "refId": "C" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Memory", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "decimals": null, + "format": "bytes", + "label": "", + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": { + "cca": "red", + "cca loadavg": "light-red", + "cca pend": "dark-red", + "ccb": "green", + "ccb loadavg": "light-green", + "ccb pend": "dark-green", + "ccm": "orange", + "ccm loadavg": "light-orange", + "ccq": "purple", + "ccq loadavg": "light-purple", + "ccq pend": "dark-purple", + "other": "yellow", + "other loadavg": 
"light-yellow", + "scc": "blue", + "scc loadavg": "light-blue", + "scc pend": "dark-blue" + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_PROMETHEUS}", + "decimals": 0, + "fieldConfig": { + "defaults": { + "custom": {} + }, + "overrides": [] + }, + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 8, + "w": 19, + "x": 5, + "y": 48 + }, + "hiddenSeries": false, + "id": 10, + "interval": "", + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "hideEmpty": true, + "hideZero": true, + "max": false, + "min": false, + "rightSide": true, + "show": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "nullPointMode": "null as zero", + "options": { + "dataLinks": [] + }, + "percentage": false, + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": true, + "steppedLine": false, + "targets": [ + { + "expr": "sum(slurm_job_gpus{state=\"running\"}) by ($grouping)", + "legendFormat": "{{$grouping}}", + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "GPUs", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "decimals": 0, + "format": "short", + "label": "", + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + } + ], + "refresh": false, + "schemaVersion": 25, + "style": "dark", + "tags": [], + "templating": { + "list": [ + { + "datasource": "prometheus", + "filters": [ + { + "key": "cluster", + "operator": "=", + "value": "iron" + } + ], + "hide": 0, + "label": "", + "name": "Filters", + "skipUrlSync": false, + 
"type": "adhoc" + }, + { + "allValue": null, + "current": { + "tags": [], + "text": "account", + "value": "account" + }, + "hide": 0, + "includeAll": false, + "label": null, + "multi": false, + "name": "grouping", + "options": [ + { + "selected": false, + "text": "nodes", + "value": "nodes" + }, + { + "selected": true, + "text": "account", + "value": "account" + }, + { + "selected": false, + "text": "partition", + "value": "partition" + }, + { + "selected": false, + "text": "user", + "value": "user" + } + ], + "query": "nodes,account,partition,user", + "skipUrlSync": false, + "type": "custom" + } + ] + }, + "time": { + "from": "now-24h", + "to": "now" + }, + "timepicker": { + "refresh_intervals": [ + "5m", + "15m", + "30m", + "1h", + "2h", + "1d" + ] + }, + "timezone": "", + "title": "running", + "uid": "HL3p2ZGMz", + "version": 2 +} \ No newline at end of file diff --git a/cluster/roles/monitoring/files/grafana/dashboards/slurm-exporter.json b/cluster/roles/monitoring/files/grafana/dashboards/slurm-exporter.json new file mode 100644 index 0000000..2bff1d6 --- /dev/null +++ b/cluster/roles/monitoring/files/grafana/dashboards/slurm-exporter.json @@ -0,0 +1,956 @@ +{ + "__inputs": [ + { + "name": "DS_PROMETHEUS", + "label": "prometheus", + "description": "", + "type": "datasource", + "pluginId": "prometheus", + "pluginName": "Prometheus" + } + ], + "__requires": [ + { + "type": "grafana", + "id": "grafana", + "name": "Grafana", + "version": "4.6.2" + }, + { + "type": "panel", + "id": "graph", + "name": "Graph", + "version": "" + }, + { + "type": "datasource", + "id": "prometheus", + "name": "Prometheus", + "version": "1.0.0" + }, + { + "type": "panel", + "id": "singlestat", + "name": "Singlestat", + "version": "" + } + ], + "annotations": { + "list": [ + { + "builtIn": 1, + "datasource": "-- Grafana --", + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "name": "Annotations & Alerts", + "type": "dashboard" + } + ] + }, + "editable": true, + 
"gnetId": 4323, + "graphTooltip": 0, + "hideControls": false, + "id": null, + "links": [], + "refresh": "30s", + "rows": [ + { + "collapse": false, + "height": 283, + "panels": [ + { + "aliasColors": { + "Total Nodes": "#052b51" + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_PROMETHEUS}", + "fill": 1, + "id": 1, + "legend": { + "alignAsTable": true, + "avg": true, + "current": true, + "max": true, + "min": true, + "show": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 2, + "links": [], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "span": 6, + "stack": true, + "steppedLine": false, + "targets": [ + { + "expr": "slurm_nodes_alloc", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "Allocated Nodes", + "refId": "A" + }, + { + "expr": "slurm_nodes_comp", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "Completing Nodes", + "refId": "B" + }, + { + "expr": "slurm_nodes_idle", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "Idle Nodes", + "refId": "C" + }, + { + "expr": "slurm_nodes_alloc + slurm_nodes_down + slurm_nodes_drain + slurm_nodes_idle + slurm_nodes_mix", + "format": "time_series", + "instant": false, + "intervalFactor": 2, + "legendFormat": "Total Nodes", + "refId": "D" + } + ], + "thresholds": [], + "timeFrom": null, + "timeShift": null, + "title": "Nodes", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + }, + { + "aliasColors": { + "Down Nodes": 
"#e24d42", + "Nodes in *fail* state": "#6d1f62" + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_PROMETHEUS}", + "fill": 1, + "id": 5, + "legend": { + "alignAsTable": true, + "avg": true, + "current": true, + "max": true, + "min": true, + "show": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "span": 6, + "stack": true, + "steppedLine": false, + "targets": [ + { + "expr": "slurm_nodes_down", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "Down Nodes", + "refId": "A" + }, + { + "expr": "slurm_nodes_drain", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "Draining Nodes", + "refId": "B" + }, + { + "expr": "slurm_nodes_err", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "Nodes in *error* state", + "refId": "C" + }, + { + "expr": "slurm_nodes_fail", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "Nodes in *fail* state", + "refId": "D" + } + ], + "thresholds": [], + "timeFrom": null, + "timeShift": null, + "title": "Fail/Down/Drain/Err Nodes", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": true, + "title": "Cluster Nodes", + "titleSize": "h6" + }, + { + "collapse": false, + "height": 358, + "panels": [ + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, 
+ "datasource": "${DS_PROMETHEUS}", + "fill": 1, + "id": 2, + "legend": { + "alignAsTable": true, + "avg": true, + "current": true, + "max": true, + "min": true, + "rightSide": false, + "show": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "span": 6, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "slurm_queue_completing", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "Completing Jobs", + "refId": "A" + }, + { + "expr": "slurm_queue_running", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "Running Jobs", + "refId": "B" + }, + { + "expr": "slurm_queue_pending", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "Pending Jobs", + "refId": "C" + }, + { + "expr": "slurm_queue_completed", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "Completed Jobs", + "refId": "D" + } + ], + "thresholds": [], + "timeFrom": null, + "timeShift": null, + "title": "RUNNING/COMPL/PEND Jobs", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "transparent": false, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + }, + { + "aliasColors": { + "Timed out Jobs": "#890f02" + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_PROMETHEUS}", + "fill": 1, + "id": 6, + "legend": { + "alignAsTable": true, + "avg": true, + "current": true, + "hideZero": false, + "max": true, + "min": true, + "show": true, + "total": false, + "values": 
true + }, + "lines": true, + "linewidth": 2, + "links": [], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "span": 6, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "slurm_queue_timeout", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "Timed out Jobs", + "refId": "A" + }, + { + "expr": "slurm_queue_failed", + "format": "time_series", + "instant": false, + "intervalFactor": 2, + "legendFormat": "Failed Jobs", + "refId": "B" + }, + { + "expr": "slurm_queue_node_fail", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "Failed jobs (due to NodeFail)", + "refId": "C" + }, + { + "expr": "slurm_queue_suspended", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "Suspended Jobs", + "refId": "D" + }, + { + "expr": "slurm_queue_cancelled", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "Cancelled Jobs", + "refId": "E" + }, + { + "expr": "slurm_queue_preempted", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "Preempted Jobs", + "refId": "F" + } + ], + "thresholds": [], + "timeFrom": null, + "timeShift": null, + "title": "FAIL/SUSP/CANC/PREEMPT/TIMEDOUT Jobs", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": true, + "title": "SLURM Jobs", + "titleSize": "h6" + }, + { + "collapse": false, + "height": 151, + "panels": [ + { + "cacheTimeout": null, + "colorBackground": false, + 
"colorValue": false, + "colors": [ + "#299c46", + "rgba(237, 129, 40, 0.89)", + "#d44a3a" + ], + "datasource": "${DS_PROMETHEUS}", + "description": "The number of current active slurmctld threads.", + "format": "none", + "gauge": { + "maxValue": 100, + "minValue": 0, + "show": false, + "thresholdLabels": false, + "thresholdMarkers": true + }, + "id": 7, + "interval": null, + "links": [], + "mappingType": 1, + "mappingTypes": [ + { + "name": "value to text", + "value": 1 + }, + { + "name": "range to text", + "value": 2 + } + ], + "maxDataPoints": 100, + "nullPointMode": "connected", + "nullText": null, + "postfix": "", + "postfixFontSize": "50%", + "prefix": "", + "prefixFontSize": "50%", + "rangeMaps": [ + { + "from": "null", + "text": "N/A", + "to": "null" + } + ], + "span": 4, + "sparkline": { + "fillColor": "rgba(31, 118, 189, 0.18)", + "full": false, + "lineColor": "rgb(31, 120, 193)", + "show": false + }, + "tableColumn": "", + "targets": [ + { + "expr": "slurm_scheduler_threads", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "Slurm Scheduler Threads", + "refId": "A" + } + ], + "thresholds": "", + "title": "Slurm Scheduler Threads", + "type": "singlestat", + "valueFontSize": "80%", + "valueMaps": [ + { + "op": "=", + "text": "N/A", + "value": "null" + } + ], + "valueName": "current" + }, + { + "cacheTimeout": null, + "colorBackground": false, + "colorValue": false, + "colors": [ + "#299c46", + "rgba(237, 129, 40, 0.89)", + "#d44a3a" + ], + "datasource": "${DS_PROMETHEUS}", + "description": "The agent mechanism helps to control communication between the Slrum daemons and the controller for a best effort.", + "format": "none", + "gauge": { + "maxValue": 100, + "minValue": 0, + "show": false, + "thresholdLabels": false, + "thresholdMarkers": true + }, + "id": 8, + "interval": null, + "links": [], + "mappingType": 1, + "mappingTypes": [ + { + "name": "value to text", + "value": 1 + }, + { + "name": "range to text", + "value": 2 + } + ], + 
"maxDataPoints": 100, + "nullPointMode": "connected", + "nullText": null, + "postfix": "", + "postfixFontSize": "50%", + "prefix": "", + "prefixFontSize": "50%", + "rangeMaps": [ + { + "from": "null", + "text": "N/A", + "to": "null" + } + ], + "span": 4, + "sparkline": { + "fillColor": "rgba(31, 118, 189, 0.18)", + "full": false, + "lineColor": "rgb(31, 120, 193)", + "show": false + }, + "tableColumn": "", + "targets": [ + { + "expr": "slurm_scheduler_queue_size", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "Agent Queue Size", + "refId": "A" + } + ], + "thresholds": "", + "title": "Agent Queue Size", + "type": "singlestat", + "valueFontSize": "80%", + "valueMaps": [ + { + "op": "=", + "text": "N/A", + "value": "null" + } + ], + "valueName": "current" + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_PROMETHEUS}", + "fill": 1, + "id": 9, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 2, + "links": [], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "span": 4, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "slurm_scheduler_backfill_depth_mean", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "Mean of processed jobs during backfilling scheduling cycles", + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeShift": null, + "title": "Scheduler Backfill Depth Mean", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": 
"short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": true, + "title": "SLURM Scheduler Details", + "titleSize": "h6" + }, + { + "collapse": false, + "height": 250, + "panels": [ + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_PROMETHEUS}", + "fill": 1, + "id": 4, + "legend": { + "alignAsTable": true, + "avg": true, + "current": false, + "max": false, + "min": true, + "show": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "span": 6, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "slurm_scheduler_last_cycle", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "Scheduler Last Cycle Time", + "refId": "A" + }, + { + "expr": "slurm_scheduler_mean_cycle", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "Scheduler Mean Cycle Time", + "refId": "B" + } + ], + "thresholds": [], + "timeFrom": null, + "timeShift": null, + "title": "Scheduler Cycles", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "µs", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_PROMETHEUS}", + "fill": 1, + "id": 3, + "legend": { + "alignAsTable": true, + "avg": true, + "current": false, + "max": false, + "min": true, + "show": 
true, + "total": true, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "span": 6, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "slurm_scheduler_backfill_last_cycle", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "Scheduler Backfill Last Cycle", + "refId": "A" + }, + { + "expr": "slurm_scheduler_backfill_mean_cycle", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "Scheduler Backfill Mean Cycle", + "refId": "B" + } + ], + "thresholds": [], + "timeFrom": null, + "timeShift": null, + "title": "Backfill Scheduler Cycles", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "µs", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": true, + "title": "SLURM Scheduler Cycles", + "titleSize": "h6" + } + ], + "schemaVersion": 14, + "style": "dark", + "tags": [], + "templating": { + "list": [] + }, + "time": { + "from": "now-6h", + "to": "now" + }, + "timepicker": { + "refresh_intervals": [ + "5s", + "10s", + "30s", + "1m", + "5m", + "15m", + "30m", + "1h", + "2h", + "1d" + ], + "time_options": [ + "5m", + "15m", + "1h", + "6h", + "12h", + "24h", + "2d", + "7d", + "30d" + ] + }, + "timezone": "", + "title": "SLURM Dashboard", + "version": 14, + "description": "This dashboard can be used to visualize the status of the SLURM workload manager." 
+ } \ No newline at end of file diff --git a/cluster/roles/monitoring/files/grafana/datasources/datasource.yml b/cluster/roles/monitoring/files/grafana/datasources/datasource.yml new file mode 100644 index 0000000..10ba421 --- /dev/null +++ b/cluster/roles/monitoring/files/grafana/datasources/datasource.yml @@ -0,0 +1,12 @@ +apiVersion: 1 + +datasources: + - name: Prometheus + uid: prometheus + type: prometheus + access: proxy + orgId: 1 + url: http://10.5.0.2:9090 + basicAuth: false + isDefault: true + editable: true \ No newline at end of file diff --git a/cluster/roles/monitoring/files/nvidia_exporter/nvidia_gpu_exporter b/cluster/roles/monitoring/files/nvidia_exporter/nvidia_gpu_exporter new file mode 100755 index 0000000..ea0cc3a Binary files /dev/null and b/cluster/roles/monitoring/files/nvidia_exporter/nvidia_gpu_exporter differ diff --git a/cluster/roles/monitoring/files/prometheus/rules/README.md b/cluster/roles/monitoring/files/prometheus/rules/README.md new file mode 100644 index 0000000..061ddfc --- /dev/null +++ b/cluster/roles/monitoring/files/prometheus/rules/README.md @@ -0,0 +1,8 @@ +# Add custom alerting rule files here + +Rule files must meet the following conditions: +- files must have extension `.rules` +- multiple files are accepted, multiple rules may exist in a single file +- files must be valid yaml or json +- rules must conform to the specified syntax https://prometheus.io/docs/prometheus/latest/configuration/recording_rules/ +- rules must pass validation `promtool check rules` \ No newline at end of file diff --git a/cluster/roles/monitoring/files/prometheus/targets/README.md b/cluster/roles/monitoring/files/prometheus/targets/README.md new file mode 100644 index 0000000..b2c8566 --- /dev/null +++ b/cluster/roles/monitoring/files/prometheus/targets/README.md @@ -0,0 +1,6 @@ +# Add custom scrape target files here + +Target files must meet the following conditions: +- files must have extension `.yml` +- multiple files are accepted, 
multiple targets may exist in a single file +- files must be valid yaml \ No newline at end of file diff --git a/cluster/roles/monitoring/files/slurm_exporter/prometheus-slurm-exporter b/cluster/roles/monitoring/files/slurm_exporter/prometheus-slurm-exporter new file mode 100755 index 0000000..ef6a3c4 Binary files /dev/null and b/cluster/roles/monitoring/files/slurm_exporter/prometheus-slurm-exporter differ diff --git a/cluster/roles/monitoring/files/test.prometheus.yml b/cluster/roles/monitoring/files/test.prometheus.yml new file mode 100644 index 0000000..dd5601d --- /dev/null +++ b/cluster/roles/monitoring/files/test.prometheus.yml @@ -0,0 +1,25 @@ +#jinja2: trim_blocks: True, lstrip_blocks: True +# http://prometheus.io/docs/operating/configuration/ + +global: + scrape_interval: 15s + evaluation_interval: 15s + +alerting: + alertmanagers: + - static_configs: + - targets: + # - alertmanager:9093 + +rule_files: + # - "first_rules.yml" + # - "second_rules.yml" + +scrape_configs: + - job_name: "prometheus" + # metrics_path: defaults to '/metrics' + # scheme: defaults to 'http'. 
+ static_configs: + - targets: ["localhost:9090"] + + \ No newline at end of file diff --git a/cluster/roles/monitoring/handlers/main.yml b/cluster/roles/monitoring/handlers/main.yml new file mode 100644 index 0000000..1c775be --- /dev/null +++ b/cluster/roles/monitoring/handlers/main.yml @@ -0,0 +1,18 @@ +--- +- name: restart monitoring + ansible.builtin.systemd: + name: monitoring-stack + state: restarted + listen: "restart monitoring" + +- name: restart slurm_exporter + ansible.builtin.systemd: + name: prometheus-slurm-exporter + state: restarted + listen: "restart slurm_exporter" + +- name: restart nvidia_exporter + ansible.builtin.systemd: + name: nvidia_gpu_exporter + state: restarted + listen: "restart nvidia_exporter" \ No newline at end of file diff --git a/cluster/roles/monitoring/meta/main.yml b/cluster/roles/monitoring/meta/main.yml new file mode 100644 index 0000000..c572acc --- /dev/null +++ b/cluster/roles/monitoring/meta/main.yml @@ -0,0 +1,52 @@ +galaxy_info: + author: your name + description: your role description + company: your company (optional) + + # If the issue tracker for your role is not on github, uncomment the + # next line and provide a value + # issue_tracker_url: http://example.com/issue/tracker + + # Choose a valid license ID from https://spdx.org - some suggested licenses: + # - BSD-3-Clause (default) + # - MIT + # - GPL-2.0-or-later + # - GPL-3.0-only + # - Apache-2.0 + # - CC-BY-4.0 + license: license (GPL-2.0-or-later, MIT, etc) + + min_ansible_version: 2.1 + + # If this a Container Enabled role, provide the minimum Ansible Container version. + # min_ansible_container_version: + + # + # Provide a list of supported platforms, and for each platform a list of versions. + # If you don't wish to enumerate all versions for a particular platform, use 'all'. 
+ # To view available platforms and versions (or releases), visit: + # https://galaxy.ansible.com/api/v1/platforms/ + # + # platforms: + # - name: Fedora + # versions: + # - all + # - 25 + # - name: SomePlatform + # versions: + # - all + # - 1.0 + # - 7 + # - 99.99 + + galaxy_tags: [] + # List tags for your role here, one per line. A tag is a keyword that describes + # and categorizes the role. Users find roles by searching for tags. Be sure to + # remove the '[]' above, if you add tags to this list. + # + # NOTE: A tag is limited to a single word comprised of alphanumeric characters. + # Maximum 20 tags per role. + +dependencies: [] + # List your role dependencies here, one per line. Be sure to remove the '[]' above, + # if you add dependencies to this list. diff --git a/cluster/roles/monitoring/tasks/grafana_configure.yml b/cluster/roles/monitoring/tasks/grafana_configure.yml new file mode 100644 index 0000000..74fff42 --- /dev/null +++ b/cluster/roles/monitoring/tasks/grafana_configure.yml @@ -0,0 +1,44 @@ +--- +######## runtime_facts + +- name: runtime facts + ansible.builtin.set_fact: + _datasources_directory: "{{ monitoring['monitoring_dir'] }}{{ monitoring['etc_dir'] }}{{ grafana['grafana_config_dir'] }}/provisioning/datasources" + _dashboard_directory: "{{ monitoring['monitoring_dir'] }}{{ monitoring['etc_dir'] }}{{ grafana['grafana_config_dir'] }}/provisioning/dashboards" + +######## grafana config files, datasources and dashboard templates + +# load data sources and dashboards from the local filesystem https://grafana.com/docs/grafana/latest/administration/provisioning/#datasources?utm_source=grafana_ds_list +# node_exporter full dashboard https://raw.githubusercontent.com/rfmoz/grafana-dashboards/master/prometheus/node-exporter-full.json +# slurm_exporter dashboards https://github.com/flatironinstitute/slurm-prometheus-exporter/tree/main/grafana + +- name: configure datasources + copy: + src: "{{ entry }}" + dest: "{{ _datasources_directory }}" + force: 
true + owner: grafana + group: root + mode: 0640 + with_fileglob: "{{ grafana['grafana_datasource_files'] }}" + loop_control: + loop_var: entry + notify: + - restart monitoring + +# provisioning method for dashboards is inflexible, many drawbacks +# using the API the end user can update/save their dashboards without fear of overwrite +# +# - name: configure dashboards +# copy: +# src: "{{ entry }}" +# dest: "{{ _dashboard_directory }}" +# force: true +# owner: grafana +# group: root +# mode: 0640 +# with_fileglob: "{{ grafana['grafana_dashboard_files'] }}" +# loop_control: +# loop_var: entry +# notify: +# - restart monitoring \ No newline at end of file diff --git a/cluster/roles/monitoring/tasks/grafana_install.yml b/cluster/roles/monitoring/tasks/grafana_install.yml new file mode 100644 index 0000000..3086dc1 --- /dev/null +++ b/cluster/roles/monitoring/tasks/grafana_install.yml @@ -0,0 +1,71 @@ +--- +######## runtime_facts + +- name: runtime facts + ansible.builtin.set_fact: + _grafana_image: "{{ grafana['container_registry'] }}/{{ grafana['container_repository'] }}:{{ grafana['container_tag'] }}" + _grafana_data_directory: "{{ monitoring['monitoring_dir'] }}{{ monitoring['data_dir'] }}{{ grafana['grafana_data_dir'] }}" + _dashboard_directory: "{{ monitoring['monitoring_dir'] }}{{ monitoring['etc_dir'] }}{{ grafana['grafana_config_dir'] }}/provisioning/dashboards" + _datasources_directory: "{{ monitoring['monitoring_dir'] }}{{ monitoring['etc_dir'] }}{{ grafana['grafana_config_dir'] }}/provisioning/datasources" + _plugins_directory: "{{ monitoring['monitoring_dir'] }}{{ monitoring['etc_dir'] }}{{ grafana['grafana_config_dir'] }}/provisioning/plugins" + _notifiers_directory: "{{ monitoring['monitoring_dir'] }}{{ monitoring['etc_dir'] }}{{ grafana['grafana_config_dir'] }}/provisioning/notifiers" + _alerting_directory: "{{ monitoring['monitoring_dir'] }}{{ monitoring['etc_dir'] }}{{ grafana['grafana_config_dir'] }}/provisioning/alerting" + +######## user and 
directories + +# dont need local group +# +# - name: create grafana system group +# ansible.builtin.group: +# name: grafana +# system: true +# gid: 472 +# state: present + +# grafana container spawns services as non root grafana user, bind mount files uid will be uid:472 +# either bind mount to container /etc/passwd with a modified passwd file with container-user:grafana uid matching os-user:grafana uid (may need to run container with --user grafana) +# or make the os-user:grafana have uid 472 to match the container /etc/passwd +# prometheus does not have this issue as the container daemons run as root so the container can be run with uid specified -u 992:988 to ensure container created files match os uid:gid on bind mounts +- name: create grafana system user + ansible.builtin.user: + name: grafana + shell: "/usr/sbin/nologin" + # group: grafana + group: nobody + createhome: false + system: true + uid: 472 + home: "{{ _grafana_data_directory }}" + +- name: create grafana data directories + ansible.builtin.file: + path: "{{ entry }}" + state: directory + owner: grafana + group: root + mode: 0750 + loop: + - "{{ _grafana_data_directory }}" + loop_control: + loop_var: entry + +- name: create grafana configuration directories + ansible.builtin.file: + path: "{{ entry }}" + state: directory + owner: grafana + group: root + mode: 0750 + loop: + - "{{ _dashboard_directory }}" + - "{{ _datasources_directory }}" + - "{{ _plugins_directory }}" + - "{{ _notifiers_directory }}" + - "{{ _alerting_directory }}" + loop_control: + loop_var: entry + +- name: pull grafana image + containers.podman.podman_image: + name: "{{ _grafana_image }}" + diff --git a/cluster/roles/monitoring/tasks/grafana_post_configure.yml b/cluster/roles/monitoring/tasks/grafana_post_configure.yml new file mode 100644 index 0000000..a4e4fac --- /dev/null +++ b/cluster/roles/monitoring/tasks/grafana_post_configure.yml @@ -0,0 +1,122 @@ +--- +######## runtime_facts + +- name: runtime facts + 
ansible.builtin.set_fact: + _default_admin: "{{ grafana['default_admin'] }}" + _default_admin_password: "{{ grafana['default_admin_password'] }}" + _admin_user: "{{ grafana['admin_user'] }}" + _admin_user_password: "{{ grafana['admin_user_password'] }}" + _admin_user_name: "{{ grafana['admin_user_name'] }}" + _admin_user_email: "{{ grafana['admin_user_email'] }}" + _overwrite_dashboards: "{{ grafana['overwrite_dashboards'] }}" + +######## grafana post configure, user accounts and dashboards, requires grafana to be running + +- name: wait for grafana to be available + uri: + url: "http://{{ inventory_hostname }}.{{ vars[config_namespace]['ocfenv']['cluster_domain'] }}:3000/api/health" + method: GET + status_code: 200 + return_content: no + register: _health_request + until: _health_request['status'] == 200 + retries: 12 + delay: 5 + +- name: check admin user already has access + uri: + url: "http://{{ inventory_hostname }}.{{ vars[config_namespace]['ocfenv']['cluster_domain'] }}:3000/api/users/lookup?loginOrEmail={{ _admin_user }}" + method: GET + user: "{{ _admin_user }}" + password: "{{ _admin_user_password }}" + force_basic_auth: yes + status_code: 200 + register: _admin_user_request + ignore_errors: true + +# this is a silent service account so cli commands are not required to change the db user record in the event of a lockout +- name: create admin service account where not present + uri: + url: "http://{{ inventory_hostname }}.{{ vars[config_namespace]['ocfenv']['cluster_domain'] }}:3000/api/admin/users" + method: POST + user: "{{ _default_admin }}" + password: "{{ _default_admin_password }}" + force_basic_auth: yes + headers: + Content-Type: application/json + body: '{"name":"{{ _admin_user_name }}", "email":"{{ _admin_user_email }}", "login":"{{ _admin_user }}", "password":"{{ _admin_user_password }}", "OrgId":1}' + body_format: json + status_code: 200 + register: _post_user_request + ignore_errors: true + when: + - not _admin_user_request['status'] == 200 + +- name: update 
admin service account permissions + uri: + url: "http://{{ inventory_hostname }}.{{ vars[config_namespace]['ocfenv']['cluster_domain'] }}:3000/api/admin/users/{{ id }}/permissions" + method: PUT + user: "{{ _default_admin }}" + password: "{{ _default_admin_password }}" + force_basic_auth: yes + headers: + Content-Type: application/json + body: '{"isGrafanaAdmin": true}' + body_format: json + status_code: 200 + vars: + id: "{{ _post_user_request['json']['id'] }}" + register: _post_permissions_request + ignore_errors: true + when: + - not _post_user_request['skipped'] is defined + +- name: service account '{{ _admin_user }}' and default admin account '{{ _default_admin }}' cannot use the API, reset '{{ _default_admin }}' user password to 'admin' and remove the '{{ _admin_user }}' service account, then re-run + fail: + msg: + - "service account {{ _admin_user }} and default admin account {{ _default_admin }} cannot use the API, reset {{ _default_admin }} user password to 'admin' and remove the {{ _admin_user }} service account, then re-run" + when: + - not _post_user_request['skipped'] is defined and not _post_user_request['status'] == 200 + +# dashboard request body https://grafana.com/docs/grafana/v9.0/developers/http_api/dashboard/ +# +# { +# "dashboard": { +# "id": null, +# "uid": null, +# "title": "Production Overview", +# "tags": [ "templated" ], +# "timezone": "browser", +# "schemaVersion": 16, +# "version": 0, +# "refresh": "25s" +# }, +# "folderId": 0, +# "folderUid": "l3KqBxCMz", +# "message": "Made changes to xyz", +# "overwrite": false +# } + +# merge request body with content of each dashboard under key 'dashboard' +- name: upload dashboards + uri: + url: "http://{{ inventory_hostname }}.{{ vars[config_namespace]['ocfenv']['cluster_domain'] }}:3000/api/dashboards/db" + method: POST + user: "{{ _default_admin }}" + password: "{{ _default_admin_password }}" + force_basic_auth: yes + headers: + Content-Type: application/json + body: "{{ json_body }}" + 
body_format: json + with_fileglob: "{{ grafana['grafana_dashboard_files'] }}" + loop_control: + loop_var: entry + vars: + overwrite: "{{ _overwrite_dashboards }}" + json_request: '{ "message":"upload dashboard", "overwrite":false }' + json_dashboard_content: "{{ (lookup('file', entry) | from_json) }}" + json_body: "{{ json_request | from_json | combine({ 'dashboard': json_dashboard_content, 'overwrite': overwrite }, recursive=True) }}" + register: _post_dashboard_request + ignore_errors: true \ No newline at end of file diff --git a/cluster/roles/monitoring/tasks/main.yml b/cluster/roles/monitoring/tasks/main.yml new file mode 100644 index 0000000..8f645ab --- /dev/null +++ b/cluster/roles/monitoring/tasks/main.yml @@ -0,0 +1,82 @@ +--- +# - debug: +# msg: "{{ active_role_groups }}" + +# - fail: +# msg: + +- name: install node_exporter + ansible.builtin.include_tasks: + file: node_exporter_install.yml + when: "'all' in active_role_groups" + + # may want a 'monitoring_client' entry in ansible inventory (xcat db thus steel.yml) and roles.yml? - when: "'monitoring_client' in active_role_groups" + # depends on how this role is supposed to run + # + # client+server dual purpose roles are a not so clear model for support staff where entries would follow: + # + # # roles.yml + # + # roles: + # all: + # - monitoring + # monitoring: + # - monitoring + # + # # inventory (steel.yml) + # + # [all] + # compute001 + # ... 
+ # mail01 + # + # [monitoring] + # monitoring01 + +- name: install slurm_exporter + ansible.builtin.include_tasks: + file: slurm_exporter_install.yml + when: "'wlm' in active_role_groups" + +- name: install nvidia_exporter + ansible.builtin.include_tasks: + file: nvidia_exporter_install.yml + when: "'gpu' in active_role_groups" + +# stack listens on: (there is no rproxy or ssl yet) +# grafana http://172.22.1.224:3000/login (initial login- admin:admin) +# prometheus http://172.22.1.224:9090 +# alertmanager http://172.22.1.224:9093 + +- name: monitoring server setup + block: + + - name: install podman / docker-compose + ansible.builtin.include_tasks: + file: podman_install.yml + + - name: install prometheus server + ansible.builtin.include_tasks: + file: prometheus_install.yml + + - name: configure prometheus server + ansible.builtin.include_tasks: + file: prometheus_configure.yml + + - name: install grafana server + ansible.builtin.include_tasks: + file: grafana_install.yml + + - name: configure grafana server + ansible.builtin.include_tasks: + file: grafana_configure.yml + + - name: render docker-compose file, start monitoring stack + ansible.builtin.include_tasks: + file: monitoring_stack_up.yml + + - name: post configure grafana server + ansible.builtin.include_tasks: + file: grafana_post_configure.yml + + when: "'monitoring' in active_role_groups" \ No newline at end of file diff --git a/cluster/roles/monitoring/tasks/monitoring_stack_up.yml b/cluster/roles/monitoring/tasks/monitoring_stack_up.yml new file mode 100644 index 0000000..3136656 --- /dev/null +++ b/cluster/roles/monitoring/tasks/monitoring_stack_up.yml @@ -0,0 +1,55 @@ +--- +######## runtime_facts, used to render docker-compose.yml + +- name: runtime facts + ansible.builtin.set_fact: + _compose_directory: "{{ monitoring['monitoring_dir'] }}{{ monitoring['compose_dir'] }}" + _prometheus_image: "{{ prometheus['container_registry'] }}/{{ prometheus['container_repository'] }}:{{ 
prometheus['container_tag'] }}" + _prometheus_config_directory: "{{ monitoring['monitoring_dir'] }}{{ monitoring['etc_dir'] }}{{ prometheus['prometheus_config_dir'] }}" + _prometheus_db_directory: "{{ monitoring['monitoring_dir'] }}{{ monitoring['data_dir'] }}{{ prometheus['prometheus_db_dir'] }}" + _alertmanager_image: "{{ alertmanager['container_registry'] }}/{{ alertmanager['container_repository'] }}:{{ alertmanager['container_tag'] }}" + _alertmanager_config_directory: "{{ monitoring['monitoring_dir'] }}{{ monitoring['etc_dir'] }}{{ alertmanager['alertmanager_config_dir'] }}" + _alertmanager_db_directory: "{{ monitoring['monitoring_dir'] }}{{ monitoring['data_dir'] }}{{ alertmanager['alertmanager_db_dir'] }}" + _grafana_image: "{{ grafana['container_registry'] }}/{{ grafana['container_repository'] }}:{{ grafana['container_tag'] }}" + _grafana_data_directory: "{{ monitoring['monitoring_dir'] }}{{ monitoring['data_dir'] }}{{ grafana['grafana_data_dir'] }}" + _grafana_config_directory: "{{ monitoring['monitoring_dir'] }}{{ monitoring['etc_dir'] }}{{ grafana['grafana_config_dir'] }}/provisioning" + _grafana_init_user: "{{ grafana['default_admin'] }}" + _grafana_init_user_pass: "{{ grafana['default_admin_password'] }}" + +######## collect variables and render full stack compose file + +- name: find prometheus uid:gid + ansible.builtin.getent: + database: passwd + key: prometheus + +- name: get prometheus uid:gid + ansible.builtin.set_fact: + prometheus_uid: "{{ ansible_facts['getent_passwd']['prometheus'][1] }}" + prometheus_gid: "{{ ansible_facts['getent_passwd']['prometheus'][2] }}" + +- name: render full stack docker-compose.yml + ansible.builtin.template: + src: "docker-compose.yml.j2" + dest: "{{ _compose_directory }}/docker-compose.yml" + owner: root + group: root + mode: 0640 + +- name: write systemd unit file + ansible.builtin.template: + src: monitoring-stack.service.j2 + dest: "/lib/systemd/system/monitoring-stack.service" + owner: root + group: root + 
mode: 0655 + +- name: enable/start monitoring service + ansible.builtin.systemd: + name: monitoring-stack + state: started + enabled: true + daemon_reload: true + +- name: Flush handlers + meta: flush_handlers \ No newline at end of file diff --git a/cluster/roles/monitoring/tasks/node_exporter_install.yml b/cluster/roles/monitoring/tasks/node_exporter_install.yml new file mode 100644 index 0000000..a5787d4 --- /dev/null +++ b/cluster/roles/monitoring/tasks/node_exporter_install.yml @@ -0,0 +1,33 @@ +--- +######## runtime_facts + +- name: runtime facts + ansible.builtin.set_fact: + _node_exporter_repo: "https://copr.fedorainfracloud.org/coprs/ibotty/prometheus-exporters/repo/epel-8/ibotty-prometheus-exporters-epel-8.repo" + _repo_file: "/etc/yum.repos.d/_copr_ibotty-prometheus-exporters.repo" + +######## setup node exporter repo, update package cache + +- name: download prometheus exporters repo + ansible.builtin.get_url: + url: "{{ _node_exporter_repo }}" + dest: "{{ _repo_file }}" + mode: '0655' + +- name: update package facts + ansible.builtin.package_facts: + manager: auto + strategy: all + +######## install node exporter and enable service + +- name: install node exporter + package: + name: node_exporter + state: present + +- name: enable node exporter services + ansible.builtin.systemd: + name: node_exporter.service + enabled: yes + state: started diff --git a/cluster/roles/monitoring/tasks/nvidia_exporter_install.yml b/cluster/roles/monitoring/tasks/nvidia_exporter_install.yml new file mode 100644 index 0000000..a2c813d --- /dev/null +++ b/cluster/roles/monitoring/tasks/nvidia_exporter_install.yml @@ -0,0 +1,34 @@ +--- +# https://github.com/utkuozdemir/nvidia_gpu_exporter +# https://github.com/utkuozdemir/nvidia_gpu_exporter/blob/master/INSTALL.md + +- name: copy nvidia_gpu_exporter + copy: + src: "nvidia_exporter/nvidia_gpu_exporter" + dest: "/usr/bin/nvidia_gpu_exporter" + force: true + owner: root + group: root + mode: 0650 + notify: + - restart 
nvidia_exporter + +- name: write systemd unit file + ansible.builtin.template: + src: nvidia_gpu_exporter.service.j2 + dest: "/lib/systemd/system/nvidia_gpu_exporter.service" + owner: root + group: root + mode: 0655 + notify: + - restart nvidia_exporter + +- name: enable/start nvidia_gpu_exporter service + ansible.builtin.systemd: + name: nvidia_gpu_exporter + state: started + enabled: true + daemon_reload: true + +- name: Flush handlers + meta: flush_handlers \ No newline at end of file diff --git a/cluster/roles/monitoring/tasks/podman_install.yml b/cluster/roles/monitoring/tasks/podman_install.yml new file mode 100644 index 0000000..6637730 --- /dev/null +++ b/cluster/roles/monitoring/tasks/podman_install.yml @@ -0,0 +1,62 @@ +--- +######## runtime_facts + +- name: runtime facts + ansible.builtin.set_fact: + _monitoring_directory: "{{ monitoring['monitoring_dir'] }}" + _compose_directory: "{{ monitoring['monitoring_dir'] }}{{ monitoring['compose_dir'] }}" + _bin_directory: "{{ monitoring['monitoring_dir'] }}{{ monitoring['bin_dir'] }}" + _etc_directory: "{{ monitoring['monitoring_dir'] }}{{ monitoring['etc_dir'] }}" + _data_directory: "{{ monitoring['monitoring_dir'] }}{{ monitoring['data_dir'] }}" + +######## install podman + +- name: update package facts + ansible.builtin.package_facts: + manager: auto + strategy: all + +- name: install podman + package: + name: "podman" + state: present + +# start/stop podman in the shell via service podman.socket, this will restart podman +# podman.socket is used for docker-compose integration, nobody has time for podman command parameters in systemd +- name: enable podman services + ansible.builtin.systemd: + name: podman.socket + enabled: yes + state: started + +- name: install docker-compose + ansible.builtin.get_url: + url : "{{ monitoring['docker_compose_url'] }}" + dest: /usr/local/bin/docker-compose + mode: 'u+x,g+x' + +- name: softlink podman socket to docker socket + ansible.builtin.file: + src: 
/run/podman/podman.sock + dest: /var/run/docker.sock + owner: root + group: root + state: link + +######## create container directories + +- name: create monitoring directories + ansible.builtin.file: + path: "{{ entry }}" + state: directory + owner: root + group: root + mode: 0755 + loop: + - "{{ _monitoring_directory }}" + - "{{ _compose_directory }}" + - "{{ _bin_directory }}" + - "{{ _etc_directory }}" + - "{{ _data_directory }}" + loop_control: + loop_var: entry \ No newline at end of file diff --git a/cluster/roles/monitoring/tasks/prometheus_configure.yml b/cluster/roles/monitoring/tasks/prometheus_configure.yml new file mode 100644 index 0000000..2391786 --- /dev/null +++ b/cluster/roles/monitoring/tasks/prometheus_configure.yml @@ -0,0 +1,290 @@ +--- +######## runtime_facts + +- name: runtime facts + ansible.builtin.set_fact: + suffix_domain: ".{{ vars[config_namespace]['ocfenv']['cluster_domain'] }}" + node_exporter_port: "{{ prometheus['node_exporter_port'] }}" + slurm_exporter_port: "{{ prometheus['slurm_exporter_port'] }}" + nvidia_exporter_port: "{{ prometheus['nvidia_exporter_port'] }}" + _static_targets_directory: "{{ monitoring['monitoring_dir'] }}{{ monitoring['etc_dir'] }}{{ prometheus['prometheus_config_dir'] }}/file_sd" + _rules_directory: "{{ monitoring['monitoring_dir'] }}{{ monitoring['etc_dir'] }}{{ prometheus['prometheus_config_dir'] }}/rules" + _bin_directory: "{{ monitoring['monitoring_dir'] }}{{ monitoring['bin_dir'] }}" + _config_directory: "{{ monitoring['monitoring_dir'] }}{{ monitoring['etc_dir'] }}{{ prometheus['prometheus_config_dir'] }}" + _alertmanager_config_directory: "{{ monitoring['monitoring_dir'] }}{{ monitoring['etc_dir'] }}{{ alertmanager['alertmanager_config_dir'] }}" + _alertmanager_templates_directory: "{{ monitoring['monitoring_dir'] }}{{ monitoring['etc_dir'] }}{{ alertmanager['alertmanager_config_dir'] }}/templates" + # container paths, these paths are relative to the bind mounts in the compose file, they are used 
to render the 'in container' prometheus config file + _bind_mount_static_targets_directory: "{{ monitoring['etc_dir'] }}{{ prometheus['prometheus_config_dir'] }}/file_sd" + _bind_mount_rules_directory: "{{ monitoring['etc_dir'] }}{{ prometheus['prometheus_config_dir'] }}/rules" + _bind_mount_alertmanager_config_directory: "{{ monitoring['etc_dir'] }}{{ alertmanager['alertmanager_config_dir'] }}" # think email templates + +######## generate prometheus node_exporter scrape targets from ansible inventory groups +# hosts are in multiple groups, logic here ensures node_exporter host labels/tags are in a primary group only +# for example: +# if a node is in both compute and gpu groups the node will have a node exporter tag of compute only +# other exporter clients such as gpu/slurm/whatever will still be installed and prometheus datasources created with those data sources having appropriate labels (this is just node_exporter tags which will be used in dashboard filtering) +# the ordered list 'prometheus_ansible_targets' can be changed such that 'gpu' precedes 'compute', any node in the inventory group 'gpu' (and 'compute') will have a prometheus node_exporter datasource tag of 'gpu' rather than 'compute' + +- name: inventory groups used to determine target hosts to collect node exporter metrics + ansible.builtin.set_fact: + # prometheus_ansible_targets: "{{ ['compute', 'hmem', 'gpu', 'login', 'wlm', 'xcat', 'nfs', 'repos', 'gateway', 'smtp'] }}" + prometheus_ansible_targets: + - compute + - hmem + - gpu + - login + - wlm + - xcat + - nfs + - repos + - gateway + - smtp + + # the order of the prometheus_ansible_targets list dictates which group the host will be in (after further sorting) where the host is in multiple groups + # this primary group will be used to add a tag to the prometheus target +- name: select hosts by ansible group membership + set_fact: + intersect_prometheus_targets: "{{ intersect_prometheus_targets | default([]) + ([ { 'target': entry, 'hosts': 
match_host } ]) }}" + loop: "{{ prometheus_ansible_targets }}" + loop_control: + loop_var: entry + ignore_errors: yes + vars: + inventory_hosts: "{{ hostvars | list }}" + match_host: "{{ groups[entry] | intersect(inventory_hosts) }}" + when: + - groups[entry] is defined + - groups[entry] | length >0 + + # this task relies on the ordered list passed from the previous task, a node that may be found in compute and hmem groups will be indexed as list item[0] and item[1] respectively +- name: remove hosts duplicated in multiple groups, first group membership takes precedence, subsequent group membership is removed + set_fact: + prometheus_targets_sorted: "{{ prometheus_targets_sorted | default({}) | combine({ target_host: target_groups[0] }, recursive=True) }}" + with_subelements: + - "{{ intersect_prometheus_targets }}" + - hosts + loop_control: + loop_var: entry + vars: + target_group: "{{ entry.0['target'] }}" + target_host: "{{ entry.1 }}" + target_groups: "{{ intersect_prometheus_targets | selectattr('hosts', 'search', target_host) | map(attribute='target') }}" + + # note: additional labels are added to the node_exporter target definitions to assist in source identification +- name: generate prometheus node_exporter targets + set_fact: + gen_prometheus_targets: "{{ gen_prometheus_targets | default({}) | combine({ target_group_name: [ { 'targets': target_hosts, 'labels': { 'data_source': 'node_exporter', 'node_type': target_group } } ] }, recursive=True) }}" + loop: "{{ prometheus_ansible_targets }}" + loop_control: + loop_var: entry + vars: + target_group_name: "node_exporter_{{ entry }}" + target_group: "{{ entry }}" + node_exporter_port_format: ":{{ node_exporter_port }}" + # target_hosts: "{{ prometheus_targets_sorted | dict2items | selectattr('value', '==', target_group) | map(attribute='key') | map('regex_replace', '$', suffix_domain) | map('regex_replace', '$', node_exporter_port_format) }}" + # use short name in grafana vs fqdn + target_hosts: "{{
prometheus_targets_sorted | dict2items | selectattr('value', '==', target_group) | map(attribute='key') | map('regex_replace', '$', node_exporter_port_format) }}" + when: + - target_hosts | length >0 + +######## generate prometheus slurm_exporter scrape targets + +- name: generate prometheus slurm_exporter targets + set_fact: + gen_prometheus_targets: "{{ gen_prometheus_targets | default({}) | combine({ 'slurm_exporter': [ { 'targets': target_hosts, 'labels': { 'data_source': 'slurm_exporter', 'node_type': 'wlm' } } ] }, recursive=True) }}" + vars: + slurm_exporter_port_format: ":{{ slurm_exporter_port }}" + # target_hosts: "{{ groups['wlm'] | map('regex_replace', '$', suffix_domain) | map('regex_replace', '$', slurm_exporter_port_format) }}" + # use short name in grafana vs fqdn + target_hosts: "{{ groups['wlm'] | map('regex_replace', '$', slurm_exporter_port_format) }}" + when: + - groups['wlm'] is defined + - groups['wlm'] | length >0 + +######## generate prometheus nvidia_exporter scrape targets + +- name: generate prometheus nvidia_exporter targets + set_fact: + gen_prometheus_targets: "{{ gen_prometheus_targets | default({}) | combine({ 'gpu_exporter': [ { 'targets': target_hosts, 'labels': { 'data_source': 'nvidia_exporter', 'node_type': 'gpu' } } ] }, recursive=True) }}" + vars: + nvidia_exporter_port_format: ":{{ nvidia_exporter_port }}" + # target_hosts: "{{ groups['gpu'] | map('regex_replace', '$', suffix_domain) | map('regex_replace', '$', nvidia_exporter_port_format) }}" + # use short name in grafana vs fqdn + target_hosts: "{{ groups['gpu'] | map('regex_replace', '$', nvidia_exporter_port_format) }}" + when: + - groups['gpu'] is defined + - groups['gpu'] | length >0 + +######## merge statically defined targets found in defaults/main.yml to scrape targets + +- name: combine defaults/main.yml prometheus_targets with ansible generated (node_exporter) prometheus_ansible_targets + set_fact: + gen_prometheus_targets: "{{ gen_prometheus_targets | default({}) 
| combine( prometheus['prometheus_targets'] ) }}" + +######## generate prometheus node_exporter scrape jobs from the generated scrape targets (each scrape job entry will reference the 'file_sd_configs' scrape target files) + +- name: get list of prometheus target groups + set_fact: + prometheus_target_groups: "{{ prometheus_target_groups | default([]) + [value] }}" + loop: "{{ prometheus_targets_sorted | dict2items }}" + loop_control: + loop_var: entry + vars: + value: "{{ entry['value'] }}" + +- set_fact: + prometheus_target_groups: "{{ prometheus_target_groups | unique }}" + +- name: generate scrape config job items + set_fact: + gen_prometheus_scrape_configs: "{{ gen_prometheus_scrape_configs | default([]) + ([ { 'job_name': prometheus_job, 'file_sd_configs': [{ 'files': [ file_path ] }] } ]) }}" + loop: "{{ prometheus_target_groups }}" + loop_control: + loop_var: entry + vars: + # prometheus_job: "{{ entry }}" + prometheus_job: "node_exporter_{{ entry }}" + # file_path: "{{ _static_targets_directory }}/{{ prometheus_job }}.yml" + file_path: "{{ _bind_mount_static_targets_directory }}/{{ prometheus_job }}.yml" + +######## generate prometheus slurm_exporter scrape job + +- name: generate slurm scrape config job items + set_fact: + gen_prometheus_scrape_configs: "{{ gen_prometheus_scrape_configs | default([]) + ([ { 'job_name': prometheus_job, 'scrape_interval': '30s', 'scrape_timeout': '30s', 'file_sd_configs': [{ 'files': [ file_path ] }] } ]) }}" + vars: + prometheus_job: "slurm_exporter" + # file_path: "{{ _static_targets_directory }}/{{ prometheus_job }}.yml" + file_path: "{{ _bind_mount_static_targets_directory }}/{{ prometheus_job }}.yml" + +######## generate prometheus nvidia_exporter scrape job + +- name: generate nvidia scrape config job items + set_fact: + gen_prometheus_scrape_configs: "{{ gen_prometheus_scrape_configs | default([]) + ([ { 'job_name': prometheus_job, 'scrape_interval': '10s', 'scrape_timeout': '10s', 'file_sd_configs': [{ 'files': [
file_path ] }] } ]) }}" + vars: + prometheus_job: "gpu_exporter" + # file_path: "{{ _static_targets_directory }}/{{ prometheus_job }}.yml" + file_path: "{{ _bind_mount_static_targets_directory }}/{{ prometheus_job }}.yml" + +######## merge statically defined scrape jobs found in defaults/main.yml to scrape jobs + +- name: combine defaults/main.yml prometheus_scrape_configs with ansible generated gen_prometheus_scrape_configs + set_fact: + gen_prometheus_scrape_configs: "{{ gen_prometheus_scrape_configs | default([]) + prometheus['prometheus_scrape_configs'] }}" + +######## render prometheus node_exporter scrape targets + +- name: configure prometheus static targets + copy: + content: | + #jinja2: lstrip_blocks: True + {{ entry.value | to_nice_yaml(indent=2,sort_keys=False) }} + dest: "{{ _static_targets_directory }}/{{ entry.key }}.yml" + force: true + owner: root + group: prometheus + mode: 0640 + loop: "{{ gen_prometheus_targets | dict2items }}" + loop_control: + loop_var: entry + notify: + - restart monitoring + when: + - gen_prometheus_targets | length >0 + +# if there are files in files/prometheus/targets copy these to the monitoring host +- name: copy prometheus custom static targets + copy: + src: "{{ entry }}" + dest: "{{ _static_targets_directory }}" + force: true + owner: root + group: prometheus + mode: 0640 + with_fileglob: "{{ prometheus['prometheus_static_targets_files'] }}" + loop_control: + loop_var: entry + notify: + - restart monitoring + +######## render prometheus alert rules + +# FIX - rules file has nextline chars to clean, this only impacts visibility + +- name: alerting rules file + template: + src: "alert.rules.j2" + dest: "{{ _rules_directory }}/ansible_managed.rules" + owner: root + group: prometheus + mode: 0640 + validate: "{{ _bin_directory }}/promtool check rules %s" + notify: + - restart monitoring + when: + - prometheus['prometheus_alert_rules'] | length >0 + +# if there are files in files/prometheus/rules copy these to the 
monitoring host +- name: copy custom alerting rule files + copy: + src: "{{ item }}" + dest: "{{ _rules_directory }}" + owner: root + group: prometheus + mode: 0640 + validate: "{{ _bin_directory }}/promtool check rules %s" + with_fileglob: "{{ prometheus['prometheus_alert_rules_files'] }}" + notify: + - restart monitoring + +######## render prometheus config files + +- name: configure prometheus + template: + src: "prometheus.yml.j2" + dest: "{{ _config_directory }}/prometheus.yml" + force: true + owner: root + group: prometheus + mode: 0640 + validate: "{{ _bin_directory }}/promtool check config %s" + notify: + - restart monitoring + +- name: configure prometheus web + copy: + content: "{{ prometheus['prometheus_web_config'] | to_nice_yaml(indent=2,sort_keys=False) }}" + dest: "{{ _config_directory }}/web.yml" + force: true + owner: root + group: prometheus + mode: 0640 + notify: + - restart monitoring + +######## render alertmanager config file + +- name: copy alertmanager config + template: + force: true + src: "alertmanager.yml.j2" + dest: "{{ _alertmanager_config_directory }}/alertmanager.yml" + owner: root + group: prometheus + mode: 0640 + # validate: "{{ _alertmanager_binary_install_dir }}/amtool check-config %s" # we need to get this setup + notify: + - restart monitoring + +- name: copy alertmanager template files + copy: + src: "{{ item }}" + dest: "{{ _alertmanager_templates_directory }}" + force: true + owner: root + group: prometheus + mode: 0640 + with_fileglob: "{{ alertmanager['alertmanager_template_files'] }}" + notify: + - restart monitoring \ No newline at end of file diff --git a/cluster/roles/monitoring/tasks/prometheus_install.yml b/cluster/roles/monitoring/tasks/prometheus_install.yml new file mode 100644 index 0000000..abeb7e9 --- /dev/null +++ b/cluster/roles/monitoring/tasks/prometheus_install.yml @@ -0,0 +1,176 @@ +--- +######## runtime_facts + +- name: runtime facts + ansible.builtin.set_fact: + _podman_image: "{{ 
prometheus['container_registry'] }}/{{ prometheus['container_repository'] }}:{{ prometheus['container_tag'] }}" + _home_directory: "{{ monitoring['monitoring_dir'] }}{{ monitoring['data_dir'] }}{{ prometheus['prometheus_db_dir'] }}" + _bin_directory: "{{ monitoring['monitoring_dir'] }}{{ monitoring['bin_dir'] }}" + _db_directory: "{{ monitoring['monitoring_dir'] }}{{ monitoring['data_dir'] }}{{ prometheus['prometheus_db_dir'] }}" + _config_directory: "{{ monitoring['monitoring_dir'] }}{{ monitoring['etc_dir'] }}{{ prometheus['prometheus_config_dir'] }}" + _rules_directory: "{{ monitoring['monitoring_dir'] }}{{ monitoring['etc_dir'] }}{{ prometheus['prometheus_config_dir'] }}/rules" + _static_targets_directory: "{{ monitoring['monitoring_dir'] }}{{ monitoring['etc_dir'] }}{{ prometheus['prometheus_config_dir'] }}/file_sd" + _compose_directory: "{{ monitoring['monitoring_dir'] }}{{ monitoring['compose_dir'] }}" + _alertmanager_config_directory: "{{ monitoring['monitoring_dir'] }}{{ monitoring['etc_dir'] }}{{ alertmanager['alertmanager_config_dir'] }}" + _alertmanager_templates_directory: "{{ monitoring['monitoring_dir'] }}{{ monitoring['etc_dir'] }}{{ alertmanager['alertmanager_config_dir'] }}/templates" + _alertmanager_db_directory: "{{ monitoring['monitoring_dir'] }}{{ monitoring['data_dir'] }}{{ alertmanager['alertmanager_db_dir'] }}" + +######## user and directories + +- name: create prometheus system group + ansible.builtin.group: + name: prometheus + system: true + state: present + +- name: create prometheus system user + ansible.builtin.user: + name: prometheus + shell: "/usr/sbin/nologin" + group: prometheus + createhome: false + system: true + home: "{{ _home_directory }}" + +- name: create prometheus data directory + ansible.builtin.file: + path: "{{ entry }}" + state: directory + owner: prometheus + group: prometheus + mode: 0750 + loop: + - "{{ _db_directory }}" + loop_control: + loop_var: entry + +- name: create prometheus configuration directories + 
ansible.builtin.file: + path: "{{ entry }}" + state: directory + owner: root + group: prometheus + mode: 0750 + loop: + - "{{ _config_directory }}" + - "{{ _rules_directory }}" + - "{{ _static_targets_directory }}" + - "{{ _alertmanager_config_directory }}" + - "{{ _alertmanager_templates_directory }}" + - "{{ _alertmanager_db_directory }}" + loop_control: + loop_var: entry + +######## extract promtool for prometheus config validation + +- name: runtime facts + ansible.builtin.set_fact: + _placeholderA: "{{ prometheus['container_tag'] }}" + _placeholderB: "{{ prometheus['container_tag'] | regex_replace('^v', '') }}" + +- name: runtime facts + ansible.builtin.set_fact: + _prometheus_binary_pkg: "{{ _bin_directory }}/{{ (prometheus['prometheus_binaries_url'] | regex_replace('placeholderA', _placeholderA) | regex_replace('placeholderB', _placeholderB) | urlsplit('path')).split('/')[-1] }}" + _prometheus_binary_extract: "{{ _bin_directory }}/{{ (prometheus['prometheus_binaries_url'] | regex_replace('placeholderA', _placeholderA) | regex_replace('placeholderB', _placeholderB) | urlsplit('path')).split('/')[-1].split('.tar.gz')[0] }}" + _prometheus_binary_url: "{{ prometheus['prometheus_binaries_url'] | regex_replace('placeholderA', _placeholderA) | regex_replace('placeholderB', _placeholderB) }}" + +- name: check for prometheus binaries package + ansible.builtin.stat: + path: "{{ _prometheus_binary_pkg }}" + register: check_prometheus_binaries + +- name: download prometheus binaries package + ansible.builtin.get_url: + url: "{{ _prometheus_binary_url }}" + dest: "{{ _bin_directory }}" + mode: 'u+x,g+x' + when: + - not check_prometheus_binaries['stat']['exists'] | bool + +- name: extract promtool + ansible.builtin.unarchive: + src: "{{ _prometheus_binary_pkg }}" + dest: "{{ _bin_directory }}" + remote_src: yes + +- name: move promtool to bin + copy: + src: "{{ _prometheus_binary_extract }}/promtool" + dest: "{{ _bin_directory }}" + owner: root + group: prometheus + mode: 
0650 + remote_src: yes + +- name: tidy up + ansible.builtin.file: + path: "{{ _prometheus_binary_extract }}" + state: absent + +######## setup podman and test docker-compose + +- name: pull prometheus image + containers.podman.podman_image: + name: "{{ _podman_image }}" + +- name: find prometheus uid:gid + ansible.builtin.getent: + database: passwd + key: prometheus + +- name: get prometheus uid:gid + ansible.builtin.set_fact: + prometheus_uid: "{{ ansible_facts['getent_passwd']['prometheus'][1] }}" + prometheus_gid: "{{ ansible_facts['getent_passwd']['prometheus'][2] }}" + +- name: check for prometheus config + ansible.builtin.stat: + path: "{{ _config_directory }}/prometheus.yml" + register: check_prometheus_config + +- name: render test docker-compose.yml + ansible.builtin.template: + src: "prometheus.docker-compose.yml.j2" + dest: "{{ _compose_directory }}/prometheus.docker-compose.yml" + owner: root + group: root + mode: 0640 + +# if this test passes you know the host is setup to work with podman and docker-compose +- name: first-run prometheus test, also first test of docker compose with podman + block: + + - name: first-run install simple prometheus.yml for docker-compose test + copy: + src: "test.prometheus.yml" + dest: "{{ _config_directory }}/prometheus.yml" + owner: root + group: prometheus + mode: 0640 + validate: "{{ _bin_directory }}/promtool check config %s" + + - name: first-run docker-compose test + ansible.builtin.command: + cmd: /usr/local/bin/docker-compose -p prometheus -f prometheus.docker-compose.yml up -d + args: + chdir: "{{ _compose_directory }}" + + - name: wait for docker-compose running state + ansible.builtin.command: + cmd: /usr/local/bin/docker-compose ls --format json + register: compose_pass + until: test + retries: 5 + delay: 5 + vars: + test: "{{ compose_pass['stdout'] | from_json | community.general.json_query(query) | regex_search('running*', ignorecase=True) is not none }}" + query: "[?Name == 'prometheus' ].Status" + + - 
name: first-run stop docker-compose test + ansible.builtin.command: + cmd: /usr/local/bin/docker-compose -p prometheus -f prometheus.docker-compose.yml down + args: + chdir: "{{ _compose_directory }}" + + when: + - not check_prometheus_config['stat']['exists'] | bool \ No newline at end of file diff --git a/cluster/roles/monitoring/tasks/slurm_exporter_install.yml b/cluster/roles/monitoring/tasks/slurm_exporter_install.yml new file mode 100644 index 0000000..73201a1 --- /dev/null +++ b/cluster/roles/monitoring/tasks/slurm_exporter_install.yml @@ -0,0 +1,61 @@ +--- +# https://github.com/vpenso/prometheus-slurm-exporter +# https://github.com/vpenso/prometheus-slurm-exporter/blob/master/DEVELOPMENT.md +# the prometheus-slurm-exporter bin was compiled on a rocky8 box with slurm 22.05.3 +# going forward i suspect the bin will want to be compiled for different versions of slurm with this task matching the lmod version with the systemd unit file reference +# the bin is only 12m so its likely better to have a bunch of pre-compiled versions pulled from repo01 +# the lmod lookup method is likely wrong too, without knowledge of the stack/lmod, the following method to ensure systemd finds the slurm bins seems rudimentary + +- name: find slurm module version + ansible.builtin.find: + # paths: /opt/slurm/modules/slurm + paths: "/opt/slurm/modules/el{{ ansible_distribution_major_version }}/slurm" + recurse: no + file_type: file + use_regex: yes + patterns: ['^[0-9]'] + register: slurm_version + +- name: fail when slurm lmod module not found + fail: + msg: "slurm not present" + when: + - slurm_version['matched'] == 0 + +- name: get slurm bin path + set_fact: + _slurm_path: "{{ slurm_path }}" + vars: + slurm_directory: "{{ slurm_version['files'][0]['path'].split('/')[-1] }}" + slurm_path: "/opt/slurm/{{ slurm_directory }}/el{{ ansible_distribution_major_version }}/bin" + +- name: copy prometheus-slurm-exporter + copy: + src: "slurm_exporter/prometheus-slurm-exporter" + dest: 
"/usr/bin/prometheus-slurm-exporter" + force: true + owner: root + group: root + mode: 0650 + notify: + - restart slurm_exporter + +- name: write systemd unit file + ansible.builtin.template: + src: prometheus-slurm-exporter.service.j2 + dest: "/lib/systemd/system/prometheus-slurm-exporter.service" + owner: root + group: root + mode: 0655 + notify: + - restart slurm_exporter + +- name: enable/start prometheus-slurm-exporter service + ansible.builtin.systemd: + name: prometheus-slurm-exporter + state: started + enabled: true + daemon_reload: true + +- name: Flush handlers + meta: flush_handlers \ No newline at end of file diff --git a/cluster/roles/monitoring/templates/alert.rules.j2 b/cluster/roles/monitoring/templates/alert.rules.j2 new file mode 100644 index 0000000..6139e30 --- /dev/null +++ b/cluster/roles/monitoring/templates/alert.rules.j2 @@ -0,0 +1,6 @@ +{{ ansible_managed | comment }} + +groups: +- name: ansible managed alert rules + rules: + {{ prometheus['prometheus_alert_rules'] | to_nice_yaml(indent=2,sort_keys=False) | indent(2,False) }} diff --git a/cluster/roles/monitoring/templates/alertmanager.yml.j2 b/cluster/roles/monitoring/templates/alertmanager.yml.j2 new file mode 100644 index 0000000..1bbdedf --- /dev/null +++ b/cluster/roles/monitoring/templates/alertmanager.yml.j2 @@ -0,0 +1,19 @@ +{{ ansible_managed | comment }} + +global: + resolve_timeout: {{ alertmanager.alertmanager_resolve_timeout | quote}} +{% for key, value in alertmanager.alertmanager_smtp.items() %} + smtp_{{ key }}: {{ value | quote }} +{% endfor %} +templates: +- '{{ _bind_mount_alertmanager_config_directory }}/templates/*.tmpl' +{% if alertmanager.alertmanager_receivers | length %} +receivers: +{{ alertmanager.alertmanager_receivers | to_nice_yaml(indent=2) }} +{% endif %} +{% if alertmanager.alertmanager_inhibit_rules | length %} +inhibit_rules: +{{ alertmanager.alertmanager_inhibit_rules | to_nice_yaml(indent=2) }} +{% endif %} +route: + {{ alertmanager.alertmanager_route | 
to_nice_yaml(indent=2) | indent(2, False) }} \ No newline at end of file diff --git a/cluster/roles/monitoring/templates/docker-compose.yml.j2 b/cluster/roles/monitoring/templates/docker-compose.yml.j2 new file mode 100644 index 0000000..1ddcbf3 --- /dev/null +++ b/cluster/roles/monitoring/templates/docker-compose.yml.j2 @@ -0,0 +1,75 @@ +version: '3.9' + +networks: + monitor-net: + driver: bridge + ipam: + config: + - subnet: 10.5.0.0/24 + gateway: 10.5.0.1 + +services: + prometheus: + user: {{ prometheus_uid }}:{{ prometheus_gid }} + image: {{ _prometheus_image }} + container_name: prometheus + restart: unless-stopped + volumes: + - type: bind + source: {{ _prometheus_config_directory }} + target: /etc/prometheus + - type: bind + source: {{ _prometheus_db_directory }} + target: /data + command: + - '--config.file=/etc/prometheus/prometheus.yml' + - '--storage.tsdb.path=/data' + - '--web.console.libraries=/usr/share/prometheus/console_libraries' + - '--web.console.templates=/usr/share/prometheus/consoles' + ports: + - 9090:9090 + networks: + monitor-net: + ipv4_address: 10.5.0.2 + + alertmanager: + user: {{ prometheus_uid }}:{{ prometheus_gid }} + image: {{ _alertmanager_image }} + container_name: alertmanager + restart: unless-stopped + volumes: + - type: bind + source: {{ _alertmanager_config_directory }} + target: /etc/alertmanager + - type: bind + source: {{ _alertmanager_db_directory }} + target: /alertmanager + command: + - '--config.file=/etc/alertmanager/alertmanager.yml' + - '--storage.path=/alertmanager' + ports: + - 9093:9093 + networks: + monitor-net: + ipv4_address: 10.5.0.3 + + grafana: + image: {{ _grafana_image }} + container_name: grafana + restart: unless-stopped + volumes: + - type: bind + source: {{ _grafana_config_directory }} + target: /etc/grafana/provisioning + - type: bind + source: {{ _grafana_data_directory }} + target: /var/lib/grafana + environment: + - GF_SECURITY_ADMIN_USER={{ _grafana_init_user }} + - GF_SECURITY_ADMIN_PASSWORD={{ 
_grafana_init_user_pass }} + - GF_USERS_ALLOW_SIGN_UP=false + ports: + - 3000:3000 + networks: + monitor-net: + ipv4_address: 10.5.0.4 \ No newline at end of file diff --git a/cluster/roles/monitoring/templates/monitoring-stack.service.j2 b/cluster/roles/monitoring/templates/monitoring-stack.service.j2 new file mode 100644 index 0000000..6d2a699 --- /dev/null +++ b/cluster/roles/monitoring/templates/monitoring-stack.service.j2 @@ -0,0 +1,14 @@ +[Unit] +Description=prometheus service +Requires=podman.socket +After=podman.socket + +[Service] +Type=oneshot +RemainAfterExit=true +WorkingDirectory={{ _compose_directory }} +ExecStart=/usr/local/bin/docker-compose up -d --remove-orphans +ExecStop=/usr/local/bin/docker-compose down + +[Install] +WantedBy=multi-user.target diff --git a/cluster/roles/monitoring/templates/nvidia_gpu_exporter.service.j2 b/cluster/roles/monitoring/templates/nvidia_gpu_exporter.service.j2 new file mode 100644 index 0000000..c1a2c82 --- /dev/null +++ b/cluster/roles/monitoring/templates/nvidia_gpu_exporter.service.j2 @@ -0,0 +1,13 @@ +[Unit] +Description=Prometheus NVIDIA GPU Exporter +After=network-online.target + +[Service] +ExecStart=/usr/bin/nvidia_gpu_exporter --web.listen-address=0.0.0.0:{{ prometheus['nvidia_exporter_port'] }} +SyslogIdentifier=nvidia_gpu_exporter +Restart=always +RestartSec=15 + +[Install] +WantedBy=multi-user.target + diff --git a/cluster/roles/monitoring/templates/prometheus-slurm-exporter.service.j2 b/cluster/roles/monitoring/templates/prometheus-slurm-exporter.service.j2 new file mode 100644 index 0000000..affc9cc --- /dev/null +++ b/cluster/roles/monitoring/templates/prometheus-slurm-exporter.service.j2 @@ -0,0 +1,12 @@ +[Unit] +Description=Prometheus SLURM Exporter + +[Service] +Environment="PATH={{ _slurm_path }}:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin" +ExecStart=/usr/bin/prometheus-slurm-exporter --listen-address=0.0.0.0:{{ prometheus['slurm_exporter_port'] }} +Restart=always +RestartSec=15 + 
+[Install] +WantedBy=multi-user.target + diff --git a/cluster/roles/monitoring/templates/prometheus.docker-compose.yml.j2 b/cluster/roles/monitoring/templates/prometheus.docker-compose.yml.j2 new file mode 100644 index 0000000..4d1d71c --- /dev/null +++ b/cluster/roles/monitoring/templates/prometheus.docker-compose.yml.j2 @@ -0,0 +1,21 @@ +version: '3.9' +services: + prometheus: + user: {{ prometheus_uid }}:{{ prometheus_gid }} + image: {{ _podman_image }} + container_name: prometheus + restart: unless-stopped + volumes: + - type: bind + source: {{ _config_directory }} + target: /etc/prometheus + - type: bind + source: {{ _db_directory }} + target: /data + command: + - '--config.file=/etc/prometheus/prometheus.yml' + - '--storage.tsdb.path=/data' + - '--web.console.libraries=/usr/share/prometheus/console_libraries' + - '--web.console.templates=/usr/share/prometheus/consoles' + ports: + - 9090:9090 \ No newline at end of file diff --git a/cluster/roles/monitoring/templates/prometheus.yml.j2 b/cluster/roles/monitoring/templates/prometheus.yml.j2 new file mode 100644 index 0000000..88e782d --- /dev/null +++ b/cluster/roles/monitoring/templates/prometheus.yml.j2 @@ -0,0 +1,34 @@ +#jinja2: trim_blocks: True, lstrip_blocks: True +{{ ansible_managed | comment }} +# http://prometheus.io/docs/operating/configuration/ + +global: + {{ prometheus['prometheus_global'] | to_nice_yaml(indent=2,sort_keys=False) | indent(2, False) }} + external_labels: + {{ monitoring['external_labels'] | to_nice_yaml(indent=2,sort_keys=False) | indent(4, False) }} + +{% if prometheus['prometheus_remote_write'] != [] %} +remote_write: + {{ prometheus['prometheus_remote_write'] | to_nice_yaml(indent=2,sort_keys=False) | indent(2, False) }} +{% endif %} + +{% if prometheus['prometheus_remote_read'] != [] %} +remote_read: + {{ prometheus['prometheus_remote_read'] | to_nice_yaml(indent=2,sort_keys=False) | indent(2, False) }} +{% endif %} + +rule_files: + - {{ _bind_mount_rules_directory }}/*.rules + 
+{% if prometheus['prometheus_alertmanager_config'] | length > 0 %} +alerting: + alertmanagers: + {{ prometheus['prometheus_alertmanager_config'] | to_nice_yaml(indent=2,sort_keys=False) | indent(2,False) }} + {% if prometheus['prometheus_alert_relabel_configs'] | length > 0 %} + alert_relabel_configs: + {{ prometheus['prometheus_alert_relabel_configs'] | to_nice_yaml(indent=2,sort_keys=False) | indent(2,False) }} + {% endif %} +{% endif %} + +scrape_configs: + {{ gen_prometheus_scrape_configs | to_nice_yaml(indent=2,sort_keys=False) | indent(2,False) }} diff --git a/cluster/roles/monitoring/tests/inventory b/cluster/roles/monitoring/tests/inventory new file mode 100644 index 0000000..878877b --- /dev/null +++ b/cluster/roles/monitoring/tests/inventory @@ -0,0 +1,2 @@ +localhost + diff --git a/cluster/roles/monitoring/tests/test.yml b/cluster/roles/monitoring/tests/test.yml new file mode 100644 index 0000000..87930df --- /dev/null +++ b/cluster/roles/monitoring/tests/test.yml @@ -0,0 +1,5 @@ +--- +- hosts: localhost + remote_user: root + roles: + - prometheus diff --git a/cluster/roles/monitoring/vars/main.yml b/cluster/roles/monitoring/vars/main.yml new file mode 100644 index 0000000..f01bf99 --- /dev/null +++ b/cluster/roles/monitoring/vars/main.yml @@ -0,0 +1,2 @@ +--- +# vars file for template_role diff --git a/cluster/roles/nfs/.travis.yml b/cluster/roles/nfs/.travis.yml new file mode 100644 index 0000000..36bbf62 --- /dev/null +++ b/cluster/roles/nfs/.travis.yml @@ -0,0 +1,29 @@ +--- +language: python +python: "2.7" + +# Use the new container infrastructure +sudo: false + +# Install ansible +addons: + apt: + packages: + - python-pip + +install: + # Install ansible + - pip install ansible + + # Check ansible version + - ansible --version + + # Create ansible.cfg with correct roles_path + - printf '[defaults]\nroles_path=../' >ansible.cfg + +script: + # Basic role syntax check + - ansible-playbook tests/test.yml -i tests/inventory --syntax-check + 
+notifications: + webhooks: https://galaxy.ansible.com/api/v1/notifications/ \ No newline at end of file diff --git a/cluster/roles/nfs/README.md b/cluster/roles/nfs/README.md new file mode 100644 index 0000000..225dd44 --- /dev/null +++ b/cluster/roles/nfs/README.md @@ -0,0 +1,38 @@ +Role Name +========= + +A brief description of the role goes here. + +Requirements +------------ + +Any pre-requisites that may not be covered by Ansible itself or the role should be mentioned here. For instance, if the role uses the EC2 module, it may be a good idea to mention in this section that the boto package is required. + +Role Variables +-------------- + +A description of the settable variables for this role should go here, including any variables that are in defaults/main.yml, vars/main.yml, and any variables that can/should be set via parameters to the role. Any variables that are read from other roles and/or the global scope (ie. hostvars, group vars, etc.) should be mentioned here as well. + +Dependencies +------------ + +A list of other roles hosted on Galaxy should go here, plus any details in regards to parameters that may need to be set for other roles, or variables that are used from other roles. + +Example Playbook +---------------- + +Including an example of how to use your role (for instance, with variables passed in as parameters) is always nice for users too: + + - hosts: servers + roles: + - { role: username.rolename, x: 42 } + +License +------- + +BSD + +Author Information +------------------ + +An optional section for the role authors to include contact information, or a website (HTML is not allowed). diff --git a/cluster/roles/nfs/defaults/main.yml b/cluster/roles/nfs/defaults/main.yml new file mode 100644 index 0000000..e62e5f9 --- /dev/null +++ b/cluster/roles/nfs/defaults/main.yml @@ -0,0 +1,49 @@ +# Copyright 2022 OCF Ltd. All Rights Reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# -*- coding: utf-8 -*- +# vim: ft=yaml +--- + +# nfs: +# pkg: nfs-utils +# service: nfs +# exports: +# nfs01: +# /nfs/home: +# opts: "rw,async,no_root_squash" +# /nfs/software: +# opts: "rw,async,no_root_squash" +# wlm01: +# /opt/slurm: +# opts: "rw,async,no_root_squash" + +# nfs: +# pkg: nfs-utils +# service: nfs +# exports: +# - path: "/nfs/home" +# opts: "rw,async,no_root_squash" +# network: cluster +# hosts: +# - nfs01 +# - path: "/nfs/software" +# opts: "rw,async,no_root_squash" +# network: cluster +# hosts: +# - nfs01 +# - path: /opt/slurm +# opts: "rw,async,no_root_squash" +# network: cluster +# hosts: +# - wlm01 diff --git a/cluster/roles/nfs/handlers/main.yml b/cluster/roles/nfs/handlers/main.yml new file mode 100644 index 0000000..f22a420 --- /dev/null +++ b/cluster/roles/nfs/handlers/main.yml @@ -0,0 +1,12 @@ +--- +- name: Restart NFS server + ansible.builtin.systemd: + name: nfs-server + state: restarted + enabled: true + listen: "Restart NFS server" + +- name: Export all NFS directories + ansible.builtin.shell: + cmd: exportfs -a + listen: "Export all NFS directories" \ No newline at end of file diff --git a/cluster/roles/nfs/meta/main.yml b/cluster/roles/nfs/meta/main.yml new file mode 100644 index 0000000..c572acc --- /dev/null +++ b/cluster/roles/nfs/meta/main.yml @@ -0,0 +1,52 @@ +galaxy_info: + author: your name + description: your role description + company: your company (optional) + + # If the issue 
tracker for your role is not on github, uncomment the + # next line and provide a value + # issue_tracker_url: http://example.com/issue/tracker + + # Choose a valid license ID from https://spdx.org - some suggested licenses: + # - BSD-3-Clause (default) + # - MIT + # - GPL-2.0-or-later + # - GPL-3.0-only + # - Apache-2.0 + # - CC-BY-4.0 + license: license (GPL-2.0-or-later, MIT, etc) + + min_ansible_version: 2.1 + + # If this a Container Enabled role, provide the minimum Ansible Container version. + # min_ansible_container_version: + + # + # Provide a list of supported platforms, and for each platform a list of versions. + # If you don't wish to enumerate all versions for a particular platform, use 'all'. + # To view available platforms and versions (or releases), visit: + # https://galaxy.ansible.com/api/v1/platforms/ + # + # platforms: + # - name: Fedora + # versions: + # - all + # - 25 + # - name: SomePlatform + # versions: + # - all + # - 1.0 + # - 7 + # - 99.99 + + galaxy_tags: [] + # List tags for your role here, one per line. A tag is a keyword that describes + # and categorizes the role. Users find roles by searching for tags. Be sure to + # remove the '[]' above, if you add tags to this list. + # + # NOTE: A tag is limited to a single word comprised of alphanumeric characters. + # Maximum 20 tags per role. + +dependencies: [] + # List your role dependencies here, one per line. Be sure to remove the '[]' above, + # if you add dependencies to this list. diff --git a/cluster/roles/nfs/tasks/main.yml b/cluster/roles/nfs/tasks/main.yml new file mode 100644 index 0000000..fe08006 --- /dev/null +++ b/cluster/roles/nfs/tasks/main.yml @@ -0,0 +1,36 @@ +# Copyright 2022 OCF Ltd. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# -*- coding: utf-8 -*- +# vim: ft=yaml +--- + +# - debug: +# msg: "{{ exports }}" + +- name: Install NFS packages + ansible.builtin.package: + name: "nfs-utils" + state: latest + +- name: Enable rpcbind and start + ansible.builtin.service: + name: rpcbind + state: started + enabled: true + +- name: Run NFS server tasks + ansible.builtin.include_tasks: + file: server.yml + when: + - "'nfs' in group_names" # group_names is the current host's own group list; hostvars is keyed by inventory name, not ansible_hostname \ No newline at end of file diff --git a/cluster/roles/nfs/tasks/server.yml b/cluster/roles/nfs/tasks/server.yml new file mode 100644 index 0000000..fed730f --- /dev/null +++ b/cluster/roles/nfs/tasks/server.yml @@ -0,0 +1,83 @@ +# Copyright 2022 OCF Ltd. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License.
+# -*- coding: utf-8 -*- +# vim: ft=yaml +--- +######## runtime vars + +- name: get exports to be served + set_fact: + _serve_exports: "{{ serve_exports }}" + vars: + serve_exports: "{{ exports | selectattr('action', '==', 'exporter' ) }}" + +######## run disk_management to ensure mountpoints in place + +# If you do not have a second VM disk to put LVM onto use this to test +# - name: Configure NFS exports directories +# file: +# path: "{{ path }}" +# state: directory +# mode: '0755' +# loop: "{{ _serve_exports }}" +# loop_control: +# loop_var: entry +# vars: +# path: "{{ entry['export'] }}" + +- name: Configure NFS storage + include_role: + name: disk_management + +######## configure nfsd + +- name: Configure NFS exports + ansible.builtin.template: + src: templates/exports.j2 + dest: "{{ file_name }}" + owner: root + group: root + mode: 0644 + # trim_blocks: False + loop: "{{ _serve_exports }}" + loop_control: + loop_var: entry + vars: + path: "{{ entry['export'] }}" + name: "{{ entry['export'].split('/')[-1] }}" + file_name: "/etc/exports.d/ansible_{{ name }}.exports" + opts: "{{ entry['opts'] }}" + network_name: "{{ entry['network'] }}" + network_range: "{{ vars[config_namespace]['xcat_networks'][network_name]['network'] }}" + network_mask: "{{ vars[config_namespace]['xcat_networks'][network_name]['netmask'] }}" + network_cidr: "{{ network_range }}/{{ (network_range + '/' + network_mask) | ansible.utils.ipaddr('prefix') }}" + notify: + - Export all NFS directories + +- name: Configure NFS server + ansible.builtin.template: + src: templates/nfs.conf.j2 + dest: /etc/nfs.conf + owner: root + group: root + mode: 0644 + trim_blocks: false + notify: + - Restart NFS server + +- name: Enable NFS server and start + ansible.builtin.service: + name: nfs-server + state: started + enabled: true \ No newline at end of file diff --git a/cluster/roles/nfs/templates/exports.j2 b/cluster/roles/nfs/templates/exports.j2 new file mode 100644 index 0000000..3e7325c --- /dev/null +++ 
b/cluster/roles/nfs/templates/exports.j2 @@ -0,0 +1,4 @@ +# +# {{ ansible_managed }} +# +{{ path }} {{ network_cidr }}({{ opts }}) diff --git a/cluster/roles/nfs/templates/nfs.conf.j2 b/cluster/roles/nfs/templates/nfs.conf.j2 new file mode 100644 index 0000000..3b0572d --- /dev/null +++ b/cluster/roles/nfs/templates/nfs.conf.j2 @@ -0,0 +1,79 @@ +# +# {{ ansible_managed }} +# +[general] +# pipefs-directory=/var/lib/nfs/rpc_pipefs +# +[exportfs] +# debug=0 +# +[gssd] +# verbosity=0 +# rpc-verbosity=0 +# use-memcache=0 +# use-machine-creds=1 +use-gss-proxy=1 +# avoid-dns=1 +# limit-to-legacy-enctypes=0 +# context-timeout=0 +# rpc-timeout=5 +# keytab-file=/etc/krb5.keytab +# cred-cache-directory= +# preferred-realm= +# +[lockd] +# port=0 +# udp-port=0 +# +[mountd] +# debug=0 +manage-gids=y +# descriptors=0 +# port=0 +# threads=1 +# reverse-lookup=n +# state-directory-path=/var/lib/nfs +# ha-callout= +# +[nfsdcld] +# debug=0 +# storagedir=/var/lib/nfs/nfsdcld +# +[nfsdcltrack] +# debug=0 +# storagedir=/var/lib/nfs/nfsdcltrack +# +[nfsd] +# debug=0 +threads={{ ansible_facts['processor_nproc'] }} +# host= +# port=0 +# grace-time=90 +# lease-time=90 +# tcp=y +# vers2=n +# vers3=y +# vers4=y +# vers4.0=y +# vers4.1=y +# vers4.2=y +# rdma=n +# rdma-port=20049 +# +[statd] +# debug=0 +# port=0 +# outgoing-port=0 +# name= +# state-directory-path=/var/lib/nfs/statd +# ha-callout= +# no-notify=0 +# +[sm-notify] +# debug=0 +# force=0 +# retry-time=900 +# outgoing-port= +# outgoing-addr= +# lift-grace=y +# diff --git a/cluster/roles/nfs/tests/inventory b/cluster/roles/nfs/tests/inventory new file mode 100644 index 0000000..878877b --- /dev/null +++ b/cluster/roles/nfs/tests/inventory @@ -0,0 +1,2 @@ +localhost + diff --git a/cluster/roles/nfs/tests/test.yml b/cluster/roles/nfs/tests/test.yml new file mode 100644 index 0000000..87930df --- /dev/null +++ b/cluster/roles/nfs/tests/test.yml @@ -0,0 +1,5 @@ +--- +- hosts: localhost + remote_user: root + roles: + - nfs # role under test; must match this role's directory name so the syntax check exercises it diff --git
a/cluster/roles/nfs/vars/main.yml b/cluster/roles/nfs/vars/main.yml new file mode 100644 index 0000000..f01bf99 --- /dev/null +++ b/cluster/roles/nfs/vars/main.yml @@ -0,0 +1,2 @@ +--- +# vars file for template_role diff --git a/cluster/roles/ntp/.travis.yml b/cluster/roles/ntp/.travis.yml new file mode 100755 index 0000000..36bbf62 --- /dev/null +++ b/cluster/roles/ntp/.travis.yml @@ -0,0 +1,29 @@ +--- +language: python +python: "2.7" + +# Use the new container infrastructure +sudo: false + +# Install ansible +addons: + apt: + packages: + - python-pip + +install: + # Install ansible + - pip install ansible + + # Check ansible version + - ansible --version + + # Create ansible.cfg with correct roles_path + - printf '[defaults]\nroles_path=../' >ansible.cfg + +script: + # Basic role syntax check + - ansible-playbook tests/test.yml -i tests/inventory --syntax-check + +notifications: + webhooks: https://galaxy.ansible.com/api/v1/notifications/ \ No newline at end of file diff --git a/cluster/roles/ntp/README.md b/cluster/roles/ntp/README.md new file mode 100755 index 0000000..225dd44 --- /dev/null +++ b/cluster/roles/ntp/README.md @@ -0,0 +1,38 @@ +Role Name +========= + +A brief description of the role goes here. + +Requirements +------------ + +Any pre-requisites that may not be covered by Ansible itself or the role should be mentioned here. For instance, if the role uses the EC2 module, it may be a good idea to mention in this section that the boto package is required. + +Role Variables +-------------- + +A description of the settable variables for this role should go here, including any variables that are in defaults/main.yml, vars/main.yml, and any variables that can/should be set via parameters to the role. Any variables that are read from other roles and/or the global scope (ie. hostvars, group vars, etc.) should be mentioned here as well. 
+ +Dependencies +------------ + +A list of other roles hosted on Galaxy should go here, plus any details in regards to parameters that may need to be set for other roles, or variables that are used from other roles. + +Example Playbook +---------------- + +Including an example of how to use your role (for instance, with variables passed in as parameters) is always nice for users too: + + - hosts: servers + roles: + - { role: username.rolename, x: 42 } + +License +------- + +BSD + +Author Information +------------------ + +An optional section for the role authors to include contact information, or a website (HTML is not allowed). diff --git a/cluster/roles/ntp/defaults/main.yml b/cluster/roles/ntp/defaults/main.yml new file mode 100755 index 0000000..f113264 --- /dev/null +++ b/cluster/roles/ntp/defaults/main.yml @@ -0,0 +1,11 @@ +--- +ntp: + external_time_sources: + - "pool 0.uk.pool.ntp.org iburst prefer" + - "pool 1.uk.pool.ntp.org" + - "pool 2.uk.pool.ntp.org" + - "pool 3.uk.pool.ntp.org" + allow_network: + - cluster + - ipmi + timezone: Europe/London diff --git a/cluster/roles/ntp/handlers/main.yml b/cluster/roles/ntp/handlers/main.yml new file mode 100755 index 0000000..bda36bf --- /dev/null +++ b/cluster/roles/ntp/handlers/main.yml @@ -0,0 +1,6 @@ +--- +- name: restart chronyd systemd service + ansible.builtin.systemd: + name: chronyd + state: restarted + listen: restart_chronyd diff --git a/cluster/roles/ntp/meta/main.yml b/cluster/roles/ntp/meta/main.yml new file mode 100755 index 0000000..c572acc --- /dev/null +++ b/cluster/roles/ntp/meta/main.yml @@ -0,0 +1,52 @@ +galaxy_info: + author: your name + description: your role description + company: your company (optional) + + # If the issue tracker for your role is not on github, uncomment the + # next line and provide a value + # issue_tracker_url: http://example.com/issue/tracker + + # Choose a valid license ID from https://spdx.org - some suggested licenses: + # - BSD-3-Clause (default) + # - MIT + # - 
GPL-2.0-or-later + # - GPL-3.0-only + # - Apache-2.0 + # - CC-BY-4.0 + license: license (GPL-2.0-or-later, MIT, etc) + + min_ansible_version: 2.1 + + # If this a Container Enabled role, provide the minimum Ansible Container version. + # min_ansible_container_version: + + # + # Provide a list of supported platforms, and for each platform a list of versions. + # If you don't wish to enumerate all versions for a particular platform, use 'all'. + # To view available platforms and versions (or releases), visit: + # https://galaxy.ansible.com/api/v1/platforms/ + # + # platforms: + # - name: Fedora + # versions: + # - all + # - 25 + # - name: SomePlatform + # versions: + # - all + # - 1.0 + # - 7 + # - 99.99 + + galaxy_tags: [] + # List tags for your role here, one per line. A tag is a keyword that describes + # and categorizes the role. Users find roles by searching for tags. Be sure to + # remove the '[]' above, if you add tags to this list. + # + # NOTE: A tag is limited to a single word comprised of alphanumeric characters. + # Maximum 20 tags per role. + +dependencies: [] + # List your role dependencies here, one per line. Be sure to remove the '[]' above, + # if you add dependencies to this list. 
diff --git a/cluster/roles/ntp/tasks/main.yml b/cluster/roles/ntp/tasks/main.yml new file mode 100755 index 0000000..8380983 --- /dev/null +++ b/cluster/roles/ntp/tasks/main.yml @@ -0,0 +1,117 @@ +--- + +# - name: merge custom vars +# block: + +# - name: set role variable sources +# set_fact: +# role_info: +# role_defaults_file: "{{ role_path }}/defaults/main.yml" +# role_override_file: "{{ ansible_inventory_sources[0] | dirname }}/group_vars/{{ role_name }}.yml" +# vars_return: "placeholder" + +# - set_fact: +# source_role: "{{ role_name }}" + +# - name: run merge_vars role +# include_role: +# name: "merge_vars" +# vars: +# a_config_file: "{{ role_info['role_defaults_file'] }}" +# b_config_file: "{{ role_info['role_override_file'] }}" +# calling_role: "{{ source_role }}" + +# - name: merge custom vars to vars[] +# set_fact: +# { "{{ entry }}": "{{ role_info['vars_return'][entry] }}" } +# loop: "{{ role_info['vars_return'] | list }}" +# loop_control: +# loop_var: entry +# when: +# - not role_info['vars_return'] == 'placeholder' + +# - debug: +# msg: +# - "{{ vars['ntp'] }}" +# - "{{ vars['a'] }}" + +# delegate_to: localhost + +- name: get facts for localhost + ansible.builtin.setup: + delegate_to: localhost + delegate_facts: true + +- name: test for clock skew + set_fact: + _clock_skew: True + when: + - (((ansible_date_time['epoch_int'] | int) - (hostvars['localhost']['ansible_date_time']['epoch_int'] | int)) | abs) >86400 # use this host's own fact; hostvars is keyed by inventory name, not ansible_hostname + +# manually set date on host where it differs from the localhost by more than 1 day, host must be able to validate ssl certs to download ntp packages +# test with: +# - date --set="2 year ago" +# - date --set="2 year" +- name: set host time to localhost time + ansible.builtin.command: date --set '@{{ hostvars['localhost']['ansible_date_time']['epoch_int'] }}' + when: + - _clock_skew is defined + +- name: install ntp packages + ansible.builtin.package: + name: + - tzdata + - chrony + state: latest + +- name: update
package facts + ansible.builtin.package_facts: + manager: auto + strategy: all + +- name: set timezone + community.general.timezone: + name: "{{ ntp['timezone'] | default('Europe/London') }}" # ntp.timezone is declared in defaults/main.yml; fall back to the previous fixed value + when: + - "'tzdata' in ansible_facts['packages']" + +- name: set facts to render config as ntp client + set_fact: + _enable_ntp_servers: "{{ ['pool 0.europe.pool.ntp.org iburst prefer', 'pool 1.europe.pool.ntp.org', 'pool 2.europe.pool.ntp.org', 'pool 3.europe.pool.ntp.org'] }}" + +- name: set facts to render config as ntp client with private ntp sources + set_fact: + _enable_ntp_servers: "{{ vars['groups']['ntpd'] | map('regex_replace', '$', suffix_domain) | map('regex_replace', '$', ' iburst prefer') | map('regex_replace', '^', 'server ') }}" + vars: + suffix_domain: ".{{ vars[config_namespace]['env']['cluster_domain'] }}" + when: + - vars['groups']['ntpd'] is defined + +- name: set facts to render config as ntp server + set_fact: + _enable_ntp_server: true + _external_time_sources: "{{ ntp['external_time_sources'] }}" + _allow_network: "{{ _allow_network | default([]) + [cidr_range] }}" + loop: "{{ ntp['allow_network'] }}" + loop_control: + loop_var: entry + vars: + cidr_range: "{{ vars[config_namespace]['cluster_networks'][entry]['network'] }}/{{ (vars[config_namespace]['cluster_networks'][entry]['network'] + '/' + vars[config_namespace]['cluster_networks'][entry]['netmask']) | ansible.utils.ipaddr('prefix') }}" + when: + # - "'ntp_server' in hostvars[ansible_hostname]['group_names']" + - "'ntpd' in active_role_groups" + +- name: configure chrony.conf + ansible.builtin.template: + src: templates/chrony.conf.j2 + dest: /etc/chrony.conf + owner: root + group: root + mode: 0644 + notify: restart_chronyd + +- name: start chronyd service + ansible.builtin.service: + name: chronyd + state: started + enabled: true \ No newline at end of file diff --git a/cluster/roles/ntp/templates/chrony.conf.j2 b/cluster/roles/ntp/templates/chrony.conf.j2 new file mode 100644 index 0000000..c2da320 --- /dev/null
+++ b/cluster/roles/ntp/templates/chrony.conf.j2 @@ -0,0 +1,55 @@ +# +# {{ ansible_managed }} +# +# Use public servers from the pool.ntp.org project. +# Please consider joining the pool (http://www.pool.ntp.org/join.html). +{% if _enable_ntp_server is defined %} +{% for server in _external_time_sources %} +{{ server }} +{% endfor %} +{% endif %} +{% if not _enable_ntp_server is defined %} +{% for server in _enable_ntp_servers %} +{{ server }} +{% endfor %} +{% endif %} + +# Record the rate at which the system clock gains/losses time. +driftfile /var/lib/chrony/drift + +# Allow the system clock to be stepped in the first three updates +# if its offset is larger than 1 second. +makestep 1.0 3 + +# Enable kernel synchronization of the real-time clock (RTC). +rtcsync + +# Enable hardware timestamping on all interfaces that support it. +#hwtimestamp * + +# Increase the minimum number of selectable sources required to adjust +# the system clock. +#minsources 2 + +{% if _enable_ntp_server is defined %} +# Allow NTP client access from local network +{% for network in _allow_network %} +allow {{ network }} +{% endfor %} +{% endif %} + +# Serve time even if not synchronized to a time source. +local stratum 8 + +# Specify file containing keys for NTP authentication. +keyfile /etc/chrony.keys + +# Get TAI-UTC offset and leap seconds from the system tz database. +leapsectz right/UTC + +# Specify directory for log files. +logdir /var/log/chrony + +# Select which information is logged. +#log measurements statistics tracking +manual diff --git a/cluster/roles/ntp/templates/chrony.conf.j2.old b/cluster/roles/ntp/templates/chrony.conf.j2.old new file mode 100755 index 0000000..28a3193 --- /dev/null +++ b/cluster/roles/ntp/templates/chrony.conf.j2.old @@ -0,0 +1,49 @@ +# +# {{ ansible_managed }} +# +# Use public servers from the pool.ntp.org project. +# Please consider joining the pool (http://www.pool.ntp.org/join.html). 
+#pool 2.centos.pool.ntp.org iburst +{% for ntpserv in ntp.external_hosts %} +server {{ ntpserv }} +{% endfor %} + +# Record the rate at which the system clock gains/losses time. +driftfile /var/lib/chrony/drift + +# Allow the system clock to be stepped in the first three updates +# if its offset is larger than 1 second. +makestep 1.0 3 + +# Enable kernel synchronization of the real-time clock (RTC). +rtcsync + +# Enable hardware timestamping on all interfaces that support it. +#hwtimestamp * + +# Increase the minimum number of selectable sources required to adjust +# the system clock. +#minsources 2 + +{% if 'ntp' in hostvars[ansible_hostname]['xcat_groups'] %} +# Allow NTP client access from local network +{% for net in ntp.allow_from %} +allow {{ net }} +{% endfor %} +{% endif %} + +# Serve time even if not synchronized to a time source. +local stratum 8 + +# Specify file containing keys for NTP authentication. +keyfile /etc/chrony.keys + +# Get TAI-UTC offset and leap seconds from the system tz database. +leapsectz right/UTC + +# Specify directory for log files. +logdir /var/log/chrony + +# Select which information is logged. 
+#log measurements statistics tracking +manual diff --git a/cluster/roles/ntp/tests/inventory b/cluster/roles/ntp/tests/inventory new file mode 100755 index 0000000..878877b --- /dev/null +++ b/cluster/roles/ntp/tests/inventory @@ -0,0 +1,2 @@ +localhost + diff --git a/cluster/roles/ntp/tests/test.yml b/cluster/roles/ntp/tests/test.yml new file mode 100755 index 0000000..87930df --- /dev/null +++ b/cluster/roles/ntp/tests/test.yml @@ -0,0 +1,5 @@ +--- +- hosts: localhost + remote_user: root + roles: + - ntp # role under test; must match this role's directory name so the syntax check exercises it diff --git a/cluster/roles/ntp/vars/main.yml b/cluster/roles/ntp/vars/main.yml new file mode 100755 index 0000000..f01bf99 --- /dev/null +++ b/cluster/roles/ntp/vars/main.yml @@ -0,0 +1,2 @@ +--- +# vars file for template_role diff --git a/cluster/roles/os_packages/.travis.yml b/cluster/roles/os_packages/.travis.yml new file mode 100755 index 0000000..36bbf62 --- /dev/null +++ b/cluster/roles/os_packages/.travis.yml @@ -0,0 +1,29 @@ +--- +language: python +python: "2.7" + +# Use the new container infrastructure +sudo: false + +# Install ansible +addons: + apt: + packages: + - python-pip + +install: + # Install ansible + - pip install ansible + + # Check ansible version + - ansible --version + + # Create ansible.cfg with correct roles_path + - printf '[defaults]\nroles_path=../' >ansible.cfg + +script: + # Basic role syntax check + - ansible-playbook tests/test.yml -i tests/inventory --syntax-check + +notifications: + webhooks: https://galaxy.ansible.com/api/v1/notifications/ \ No newline at end of file diff --git a/cluster/roles/os_packages/README.md b/cluster/roles/os_packages/README.md new file mode 100755 index 0000000..225dd44 --- /dev/null +++ b/cluster/roles/os_packages/README.md @@ -0,0 +1,38 @@ +Role Name +========= + +A brief description of the role goes here. + +Requirements +------------ + +Any pre-requisites that may not be covered by Ansible itself or the role should be mentioned here.
For instance, if the role uses the EC2 module, it may be a good idea to mention in this section that the boto package is required. + +Role Variables +-------------- + +A description of the settable variables for this role should go here, including any variables that are in defaults/main.yml, vars/main.yml, and any variables that can/should be set via parameters to the role. Any variables that are read from other roles and/or the global scope (ie. hostvars, group vars, etc.) should be mentioned here as well. + +Dependencies +------------ + +A list of other roles hosted on Galaxy should go here, plus any details in regards to parameters that may need to be set for other roles, or variables that are used from other roles. + +Example Playbook +---------------- + +Including an example of how to use your role (for instance, with variables passed in as parameters) is always nice for users too: + + - hosts: servers + roles: + - { role: username.rolename, x: 42 } + +License +------- + +BSD + +Author Information +------------------ + +An optional section for the role authors to include contact information, or a website (HTML is not allowed). 
diff --git a/cluster/roles/os_packages/defaults/main.yml b/cluster/roles/os_packages/defaults/main.yml new file mode 100755 index 0000000..99e3c36 --- /dev/null +++ b/cluster/roles/os_packages/defaults/main.yml @@ -0,0 +1,28 @@ +--- +os_packages: + all: + - "@core" + - ca-certificates + - unzip + - bzip2 + - dos2unix + - bind-utils + - net-tools + - vim + - nano + - tree + - telnet + - traceroute + - nmap + - mlocate + - tmux + - wget + - curl + - mlocate + hypervisor: + - lvm2 + - libvirt + - lxc + - podman + - ipmitool + - python39 \ No newline at end of file diff --git a/cluster/roles/os_packages/handlers/main.yml b/cluster/roles/os_packages/handlers/main.yml new file mode 100755 index 0000000..a7881cd --- /dev/null +++ b/cluster/roles/os_packages/handlers/main.yml @@ -0,0 +1,2 @@ +--- +# handlers file for os_packages diff --git a/cluster/roles/os_packages/meta/main.yml b/cluster/roles/os_packages/meta/main.yml new file mode 100755 index 0000000..c572acc --- /dev/null +++ b/cluster/roles/os_packages/meta/main.yml @@ -0,0 +1,52 @@ +galaxy_info: + author: your name + description: your role description + company: your company (optional) + + # If the issue tracker for your role is not on github, uncomment the + # next line and provide a value + # issue_tracker_url: http://example.com/issue/tracker + + # Choose a valid license ID from https://spdx.org - some suggested licenses: + # - BSD-3-Clause (default) + # - MIT + # - GPL-2.0-or-later + # - GPL-3.0-only + # - Apache-2.0 + # - CC-BY-4.0 + license: license (GPL-2.0-or-later, MIT, etc) + + min_ansible_version: 2.1 + + # If this a Container Enabled role, provide the minimum Ansible Container version. + # min_ansible_container_version: + + # + # Provide a list of supported platforms, and for each platform a list of versions. + # If you don't wish to enumerate all versions for a particular platform, use 'all'. 
+ # To view available platforms and versions (or releases), visit: + # https://galaxy.ansible.com/api/v1/platforms/ + # + # platforms: + # - name: Fedora + # versions: + # - all + # - 25 + # - name: SomePlatform + # versions: + # - all + # - 1.0 + # - 7 + # - 99.99 + + galaxy_tags: [] + # List tags for your role here, one per line. A tag is a keyword that describes + # and categorizes the role. Users find roles by searching for tags. Be sure to + # remove the '[]' above, if you add tags to this list. + # + # NOTE: A tag is limited to a single word comprised of alphanumeric characters. + # Maximum 20 tags per role. + +dependencies: [] + # List your role dependencies here, one per line. Be sure to remove the '[]' above, + # if you add dependencies to this list. diff --git a/cluster/roles/os_packages/tasks/main.yml b/cluster/roles/os_packages/tasks/main.yml new file mode 100755 index 0000000..1726bb8 --- /dev/null +++ b/cluster/roles/os_packages/tasks/main.yml @@ -0,0 +1,36 @@ +--- +- name: upgrade all packages + yum: + name: '*' + state: latest + +- name: install epel repo + ansible.builtin.package: + name: "epel-release" + state: latest + +- name: update package facts + ansible.builtin.package_facts: + manager: auto + strategy: all + +- name: build package list + set_fact: + _target_packages: "{{ _target_packages | default([]) + pkglist }}" + loop: "{{ active_role_groups }}" + loop_control: + loop_var: role + ignore_errors: true + vars: + pkglist: "{{ os_packages[role] }}" + when: + - os_packages[role] is defined + +- name: install packages + ansible.builtin.package: + name: "{{ install_pkg }}" + state: latest + ignore_errors: yes + loop: "{{ _target_packages }}" + loop_control: + loop_var: install_pkg \ No newline at end of file diff --git a/cluster/roles/os_packages/tests/inventory b/cluster/roles/os_packages/tests/inventory new file mode 100755 index 0000000..878877b --- /dev/null +++ b/cluster/roles/os_packages/tests/inventory @@ -0,0 +1,2 @@ +localhost + diff 
--git a/cluster/roles/os_packages/tests/test.yml b/cluster/roles/os_packages/tests/test.yml new file mode 100755 index 0000000..46ae16e --- /dev/null +++ b/cluster/roles/os_packages/tests/test.yml @@ -0,0 +1,5 @@ +--- +- hosts: localhost + remote_user: root + roles: + - os_packages # role under test; must match this role's directory name so the syntax check exercises it diff --git a/cluster/roles/os_packages/vars/main.yml b/cluster/roles/os_packages/vars/main.yml new file mode 100755 index 0000000..10320fb --- /dev/null +++ b/cluster/roles/os_packages/vars/main.yml @@ -0,0 +1,2 @@ +--- +# vars file for os_packages diff --git a/cluster/roles/role-template/.travis.yml b/cluster/roles/role-template/.travis.yml new file mode 100644 index 0000000..36bbf62 --- /dev/null +++ b/cluster/roles/role-template/.travis.yml @@ -0,0 +1,29 @@ +--- +language: python +python: "2.7" + +# Use the new container infrastructure +sudo: false + +# Install ansible +addons: + apt: + packages: + - python-pip + +install: + # Install ansible + - pip install ansible + + # Check ansible version + - ansible --version + + # Create ansible.cfg with correct roles_path + - printf '[defaults]\nroles_path=../' >ansible.cfg + +script: + # Basic role syntax check + - ansible-playbook tests/test.yml -i tests/inventory --syntax-check + +notifications: + webhooks: https://galaxy.ansible.com/api/v1/notifications/ \ No newline at end of file diff --git a/cluster/roles/role-template/README.md b/cluster/roles/role-template/README.md new file mode 100644 index 0000000..225dd44 --- /dev/null +++ b/cluster/roles/role-template/README.md @@ -0,0 +1,38 @@ +Role Name +========= + +A brief description of the role goes here. + +Requirements +------------ + +Any pre-requisites that may not be covered by Ansible itself or the role should be mentioned here. For instance, if the role uses the EC2 module, it may be a good idea to mention in this section that the boto package is required.
+ +Role Variables +-------------- + +A description of the settable variables for this role should go here, including any variables that are in defaults/main.yml, vars/main.yml, and any variables that can/should be set via parameters to the role. Any variables that are read from other roles and/or the global scope (ie. hostvars, group vars, etc.) should be mentioned here as well. + +Dependencies +------------ + +A list of other roles hosted on Galaxy should go here, plus any details in regards to parameters that may need to be set for other roles, or variables that are used from other roles. + +Example Playbook +---------------- + +Including an example of how to use your role (for instance, with variables passed in as parameters) is always nice for users too: + + - hosts: servers + roles: + - { role: username.rolename, x: 42 } + +License +------- + +BSD + +Author Information +------------------ + +An optional section for the role authors to include contact information, or a website (HTML is not allowed). 
diff --git a/cluster/roles/role-template/defaults/main.yml b/cluster/roles/role-template/defaults/main.yml new file mode 100644 index 0000000..2bec87e --- /dev/null +++ b/cluster/roles/role-template/defaults/main.yml @@ -0,0 +1,2 @@ +--- +# defaults file for roles/role-template \ No newline at end of file diff --git a/cluster/roles/role-template/handlers/main.yml b/cluster/roles/role-template/handlers/main.yml new file mode 100644 index 0000000..2d28ec4 --- /dev/null +++ b/cluster/roles/role-template/handlers/main.yml @@ -0,0 +1,2 @@ +--- +# handlers file for roles/role-template \ No newline at end of file diff --git a/cluster/roles/role-template/meta/main.yml b/cluster/roles/role-template/meta/main.yml new file mode 100644 index 0000000..227ad9c --- /dev/null +++ b/cluster/roles/role-template/meta/main.yml @@ -0,0 +1,53 @@ +galaxy_info: + author: your name + description: your role description + company: your company (optional) + + # If the issue tracker for your role is not on github, uncomment the + # next line and provide a value + # issue_tracker_url: http://example.com/issue/tracker + + # Choose a valid license ID from https://spdx.org - some suggested licenses: + # - BSD-3-Clause (default) + # - MIT + # - GPL-2.0-or-later + # - GPL-3.0-only + # - Apache-2.0 + # - CC-BY-4.0 + license: license (GPL-2.0-or-later, MIT, etc) + + min_ansible_version: 2.9 + + # If this a Container Enabled role, provide the minimum Ansible Container version. + # min_ansible_container_version: + + # + # Provide a list of supported platforms, and for each platform a list of versions. + # If you don't wish to enumerate all versions for a particular platform, use 'all'. 
+ # To view available platforms and versions (or releases), visit: + # https://galaxy.ansible.com/api/v1/platforms/ + # + # platforms: + # - name: Fedora + # versions: + # - all + # - 25 + # - name: SomePlatform + # versions: + # - all + # - 1.0 + # - 7 + # - 99.99 + + galaxy_tags: [] + # List tags for your role here, one per line. A tag is a keyword that describes + # and categorizes the role. Users find roles by searching for tags. Be sure to + # remove the '[]' above, if you add tags to this list. + # + # NOTE: A tag is limited to a single word comprised of alphanumeric characters. + # Maximum 20 tags per role. + +dependencies: [] + # List your role dependencies here, one per line. Be sure to remove the '[]' above, + # if you add dependencies to this list. + \ No newline at end of file diff --git a/cluster/roles/role-template/tasks/main.yml b/cluster/roles/role-template/tasks/main.yml new file mode 100644 index 0000000..117be7b --- /dev/null +++ b/cluster/roles/role-template/tasks/main.yml @@ -0,0 +1,2 @@ +--- +# tasks file for roles/role-template \ No newline at end of file diff --git a/cluster/roles/role-template/tests/inventory b/cluster/roles/role-template/tests/inventory new file mode 100644 index 0000000..878877b --- /dev/null +++ b/cluster/roles/role-template/tests/inventory @@ -0,0 +1,2 @@ +localhost + diff --git a/cluster/roles/role-template/tests/test.yml b/cluster/roles/role-template/tests/test.yml new file mode 100644 index 0000000..c2fd2bd --- /dev/null +++ b/cluster/roles/role-template/tests/test.yml @@ -0,0 +1,5 @@ +--- +- hosts: localhost + remote_user: root + roles: + - roles/role-template \ No newline at end of file diff --git a/cluster/roles/role-template/vars/main.yml b/cluster/roles/role-template/vars/main.yml new file mode 100644 index 0000000..f655be4 --- /dev/null +++ b/cluster/roles/role-template/vars/main.yml @@ -0,0 +1,2 @@ +--- +# vars file for roles/role-template \ No newline at end of file diff --git 
a/cluster/roles/variable_interchange/.travis.yml b/cluster/roles/variable_interchange/.travis.yml new file mode 100755 index 0000000..36bbf62 --- /dev/null +++ b/cluster/roles/variable_interchange/.travis.yml @@ -0,0 +1,29 @@ +--- +language: python +python: "2.7" + +# Use the new container infrastructure +sudo: false + +# Install ansible +addons: + apt: + packages: + - python-pip + +install: + # Install ansible + - pip install ansible + + # Check ansible version + - ansible --version + + # Create ansible.cfg with correct roles_path + - printf '[defaults]\nroles_path=../' >ansible.cfg + +script: + # Basic role syntax check + - ansible-playbook tests/test.yml -i tests/inventory --syntax-check + +notifications: + webhooks: https://galaxy.ansible.com/api/v1/notifications/ \ No newline at end of file diff --git a/cluster/roles/variable_interchange/README.md b/cluster/roles/variable_interchange/README.md new file mode 100755 index 0000000..225dd44 --- /dev/null +++ b/cluster/roles/variable_interchange/README.md @@ -0,0 +1,38 @@ +Role Name +========= + +A brief description of the role goes here. + +Requirements +------------ + +Any pre-requisites that may not be covered by Ansible itself or the role should be mentioned here. For instance, if the role uses the EC2 module, it may be a good idea to mention in this section that the boto package is required. + +Role Variables +-------------- + +A description of the settable variables for this role should go here, including any variables that are in defaults/main.yml, vars/main.yml, and any variables that can/should be set via parameters to the role. Any variables that are read from other roles and/or the global scope (ie. hostvars, group vars, etc.) should be mentioned here as well. + +Dependencies +------------ + +A list of other roles hosted on Galaxy should go here, plus any details in regards to parameters that may need to be set for other roles, or variables that are used from other roles. 
+ +Example Playbook +---------------- + +Including an example of how to use your role (for instance, with variables passed in as parameters) is always nice for users too: + + - hosts: servers + roles: + - { role: username.rolename, x: 42 } + +License +------- + +BSD + +Author Information +------------------ + +An optional section for the role authors to include contact information, or a website (HTML is not allowed). diff --git a/cluster/roles/variable_interchange/defaults/main.yml b/cluster/roles/variable_interchange/defaults/main.yml new file mode 100755 index 0000000..d5f2337 --- /dev/null +++ b/cluster/roles/variable_interchange/defaults/main.yml @@ -0,0 +1,2 @@ +--- +# defaults file for slurm diff --git a/cluster/roles/variable_interchange/handlers/main.yml b/cluster/roles/variable_interchange/handlers/main.yml new file mode 100755 index 0000000..7748c8b --- /dev/null +++ b/cluster/roles/variable_interchange/handlers/main.yml @@ -0,0 +1,2 @@ +--- +# handlers file for slurm diff --git a/cluster/roles/variable_interchange/meta/main.yml b/cluster/roles/variable_interchange/meta/main.yml new file mode 100755 index 0000000..c572acc --- /dev/null +++ b/cluster/roles/variable_interchange/meta/main.yml @@ -0,0 +1,52 @@ +galaxy_info: + author: your name + description: your role description + company: your company (optional) + + # If the issue tracker for your role is not on github, uncomment the + # next line and provide a value + # issue_tracker_url: http://example.com/issue/tracker + + # Choose a valid license ID from https://spdx.org - some suggested licenses: + # - BSD-3-Clause (default) + # - MIT + # - GPL-2.0-or-later + # - GPL-3.0-only + # - Apache-2.0 + # - CC-BY-4.0 + license: license (GPL-2.0-or-later, MIT, etc) + + min_ansible_version: 2.1 + + # If this a Container Enabled role, provide the minimum Ansible Container version. + # min_ansible_container_version: + + # + # Provide a list of supported platforms, and for each platform a list of versions. 
+ # If you don't wish to enumerate all versions for a particular platform, use 'all'. + # To view available platforms and versions (or releases), visit: + # https://galaxy.ansible.com/api/v1/platforms/ + # + # platforms: + # - name: Fedora + # versions: + # - all + # - 25 + # - name: SomePlatform + # versions: + # - all + # - 1.0 + # - 7 + # - 99.99 + + galaxy_tags: [] + # List tags for your role here, one per line. A tag is a keyword that describes + # and categorizes the role. Users find roles by searching for tags. Be sure to + # remove the '[]' above, if you add tags to this list. + # + # NOTE: A tag is limited to a single word comprised of alphanumeric characters. + # Maximum 20 tags per role. + +dependencies: [] + # List your role dependencies here, one per line. Be sure to remove the '[]' above, + # if you add dependencies to this list. diff --git a/cluster/roles/variable_interchange/tasks/main.yml b/cluster/roles/variable_interchange/tasks/main.yml new file mode 100755 index 0000000..1078fd0 --- /dev/null +++ b/cluster/roles/variable_interchange/tasks/main.yml @@ -0,0 +1,36 @@ +--- +- name: variable interchange + set_fact: + # set fact at top level vars + example_var: "example123" + # append to steel dictionary + steel: "{{ steel | combine({'example_var1': 'example456'}) }}" + +- name: variable interchange + set_fact: + # append to steel dictionary into same keyspace, must use recursive=True to not overwrite dict + steel: "{{ steel | default({}) | combine({'ocfenv': {'example_var2': 'example789'}}, recursive=True) }}" + +- name: variable interchange + set_fact: + # overwrite existing keypair object, must use recursive=True to not overwrite dict + # use instance variable + # you cannot unset variables in ansible + steel: "{{ steel | default({}) | combine({'ocfenv': {'example_var2': new_entry}}, recursive=True) }}" + vars: + new_entry: '789example' + +# show hostvars +# show dummy host + +- name: failed task + debug: + msg: "{{ new_entry }}" + ignore_errors: 
True + register: debug_result + +- name: failed task output + debug: + msg: + - "{{ debug_result['failed'] }}" + - "{{ debug_result.failed }}" \ No newline at end of file diff --git a/cluster/roles/variable_interchange/tests/inventory b/cluster/roles/variable_interchange/tests/inventory new file mode 100755 index 0000000..878877b --- /dev/null +++ b/cluster/roles/variable_interchange/tests/inventory @@ -0,0 +1,2 @@ +localhost + diff --git a/cluster/roles/variable_interchange/tests/test.yml b/cluster/roles/variable_interchange/tests/test.yml new file mode 100755 index 0000000..e60bd71 --- /dev/null +++ b/cluster/roles/variable_interchange/tests/test.yml @@ -0,0 +1,5 @@ +--- +- hosts: localhost + remote_user: root + roles: + - slurm diff --git a/cluster/roles/variable_interchange/vars/main.yml b/cluster/roles/variable_interchange/vars/main.yml new file mode 100755 index 0000000..937af3a --- /dev/null +++ b/cluster/roles/variable_interchange/vars/main.yml @@ -0,0 +1,2 @@ +--- +# vars file for slurm diff --git a/cluster/site.yml b/cluster/site.yml new file mode 100644 index 0000000..16c0e66 --- /dev/null +++ b/cluster/site.yml @@ -0,0 +1,170 @@ +--- +- name: entrypoint + hosts: all + user: ansible + become: yes + gather_facts: true + vars: + test_run_roles: + # - network + # - repos + # - yum + # - ssh + # - users + # - systemd + # - rsyslog + # - audittrail + # - nhc + # - prometheus + # - sysctl + # - limits + # START + # - ntp + # - os_packages + - hypervisor_prep + # - vxlan + # - firewalld + + tasks: + + ######## load core group_vars + # + # load the following core environment files under vars['steel'] + # - inventory/group_vars/cluster.yml + # - inventory/group_vars/roles.yml + # - inventory/group_vars/networks.yml + + - name: load core environment configuration + block: + + - name: set env import variables + ansible.builtin.set_fact: + env_files: + - 'cluster.yml' + - 'roles.yml' + - 'networks.yml' + env_dir: "{{ ansible_inventory_sources[0] | dirname 
}}/group_vars" + config_namespace: "steel" + + - name: include vars from core config files + ansible.builtin.include_vars: + file: "{{ env_path }}" + name: "env_import_{{ env_namespace }}" + loop: "{{ env_files }}" + loop_control: + loop_var: entry + vars: + env_path: "{{ env_dir }}/{{ entry }}" + env_namespace: "{{ entry.split('.yml')[0] }}" + + - name: append env vars to temp dict + ansible.builtin.set_fact: + env_dict: "{{ env_dict | default({}) | combine (env_import, recursive=True) }}" + loop: "{{ lookup('ansible.builtin.varnames', 'env_import_').split(',') }}" + loop_control: + loop_var: entry + vars: + env_import: "{{ vars[entry] }}" + + - name: copy dict of env vars under top level namespace + set_fact: + { "{{ config_namespace }}": "{{ env_dict }}" } + + ######## generate list of roles to run against host + + - name: list all hostvars groups where the host is a member + ansible.builtin.set_fact: + _hostvars_groups: "{{ hostvars[ansible_hostname]['group_names'] | default([]) + (['all']) }}" + + - name: intersect hostvars groups with role groups, uses role groups to dictate role runtime order + ansible.builtin.set_fact: + active_role_groups: "{{ role_groups | intersect(group) }}" + loop: + - "{{ _hostvars_groups }}" + loop_control: + loop_var: entry + vars: + group: "{{ entry }}" + role_groups: "{{ vars[config_namespace]['roles'] | list }}" + + - name: list roles to run against "{{ ansible_hostname }}" + ansible.builtin.set_fact: + _run_roles: "{{ _run_roles | default([]) + vars[config_namespace]['roles'][entry] }}" + loop: "{{ active_role_groups }}" + loop_control: + loop_var: entry + + # this is required for roles to be dual purpose (client/server) + # - roles.yml may have role 'ntp' under groups 'all' and again under 'ntpd' + # - the 'ntp' role will be run only once for the 'all' group but will select a client/server setup based on ansible inventory membership in the 'ntpd' group + - name: select only unique roles, preserve runtime order + 
ansible.builtin.set_fact: + _run_roles: "{{ _run_roles | unique }}" + + - name: roles to run on "{{ ansible_hostname }}" + ansible.builtin.debug: + msg: + - run roles "{{ _run_roles }}" + + # global variables we can use for evaluations, we use these to determine role/task execution: + # - "{{ active_role_groups }}" # role groups that host is a member of, whitelisted/intersected from vars[config_namespace]['roles'] / group_vars/roles.yml + # - "{{ _run_roles }}" # all roles to run on host, role runtime execution is ordered from vars[config_namespace]['roles'] / group_vars/roles.yml + # + # example usage of active_role_groups: + # + # - a task can check if host is in the monitoring group + # when: "'monitoring' in active_role_groups" # this method includes all groups the host is a member of and also the 'all' group, crucially it only includes groups whitelisted in vars[config_namespace]['roles'] + # when: "'monitoring' in hostvars[ansible_hostname]['group_names']" # this is an ansible native test but excludes the 'all' group + # when: inventory_hostname in groups['monitoring'] # this is an ansible native method, probably the easiest ansible native method, notice inventory_hostname usage where external dns or hosts file entries are not available + # + # - ansible.builtin.debug: + # msg: + # - "{{ groups['all'] }}" + # - "{{ hostvars[ansible_hostname]['group_names'] }}" + # - "{{ active_role_groups }}" + # - "{{ _run_roles }}" + + ######## load vars from each target role + + # try to code around this method to make code more modular/portable: + # - strategies such as pull other host facts with delegate_facts where possible, chain roles where applicable - run roles on all hosts with skip tasks where unavoidable + # + # if this task is truly necessary extend its use further to fit future design patterns: + # + # info.yml should load variables into config_namespace and also contain merge OR overwrite functions to source from group_vars/.yml + # each role main.yml would run the 
info.yml as the first task also, to be a catch_all function for any future portability (it could always check if vars[config_namespace] was already populated) + # role tasks should all source variables from vars[config_namespace] + + # - name: "Load variables {{ ansible_hostname }}" + # ansible.builtin.include_role: + # name: "{{ role_var }}" + # public: true + # tasks_from: info.yml + # loop: "{{ _run_roles }}" + # loop_control: + # loop_var: role_var + # label: Setup {{ role_var }} variables on {{ ansible_hostname }} + + # - name: Output vars structure + # ansible.builtin.debug: + # msg: "{{ vars[role_var] }}" + # when: vars[role_var] is defined + # loop: "{{ _run_roles }}" + # loop_control: + # loop_var: role_var + # tags: never + + ######## run roles against hosts + + - name: override run_roles for testing + set_fact: + _run_roles: "{{ test_run_roles }}" + when: test_run_roles is defined + + - name: run roles on "{{ ansible_hostname }}" + ansible.builtin.include_role: + name: "{{ entry }}" + loop: "{{ _run_roles }}" + loop_control: + loop_var: entry + label: run {{ entry }} role on {{ ansible_hostname }} \ No newline at end of file diff --git a/cluster/xcat_dynamic_inventory.yml b/cluster/xcat_dynamic_inventory.yml new file mode 100644 index 0000000..b8d21cb --- /dev/null +++ b/cluster/xcat_dynamic_inventory.yml @@ -0,0 +1,302 @@ +--- +- name: build inventory + hosts: localhost + # user: ansible + vars: + ## default inventory file specified in the ansible.cfg + # _inventory_file_name: "{{ ansible_inventory_sources[0] }}" + # + ## custom inventory file + _inventory_file_name: "{{ ansible_inventory_sources[0].split('/')[:-1] | join('/') }}/xcat-inventory.yml" + _overwrite_inventory_file: False + # + ## enable dynamic inventory, disables writing inventory file + _dynamic_inventory: True + ## xcat API connection + _xcat_api: + xcat_user: "ansible" + xcat_password: "0cf5t33lAcc355" + xcat_http_scheme: "http" + xcat_api_endpoint: "127.0.0.1" + xcat_api_endpoint_port: 
"80" + + # - This playbook can be called as the first task in the runner/site task to act as a purely dynamic inventory sourced from xcat, use the _dynamic_inventory variable to enable + # - it will populate an in-memory inventory from xcat, meaning other inventory files do not need to be written + # - when present, existing inventory file or group_vars/ + host_vars/ inventory sources will also be used as per ansibles inventory sourcing precedence (this will merge hostvars/groups, so watch out) + + tasks: + +###### getting the xcat web service replying to API calls without ssl, purely for dev + +# sudo nano -cw /etc/httpd/conf.d/xcat-ws.conf + +# ORIGIONAL + +# ScriptAlias /xcatrhevh /opt/xcat/ws/xcatrhevh.cgi +# ScriptAlias /xcatws /opt/xcat/ws/xcatws.cgi +# LoadModule rewrite_module /usr/lib64/apache2-prefork/mod_rewrite.so +# RewriteEngine On +# RewriteCond %{SERVER_PORT} 80 +# RewriteCond %{HTTPS} !=on +# RewriteRule ^/?xcatws/(.*) https://%{SERVER_NAME}/xcatws/$1 [R,L] +# RewriteRule ^/?xcatwsv2/(.*) https://%{SERVER_NAME}/xcatwsv2/$1 [R,L] + +# +# Require all granted +# + +# DISABLED SSL AND CHANGE REDIRECT TO CGI SCRIPT @ HTTP + +# ScriptAlias /xcatrhevh /opt/xcat/ws/xcatrhevh.cgi +# ScriptAlias /xcatws /opt/xcat/ws/xcatws.cgi +# LoadModule rewrite_module /usr/lib64/apache2-prefork/mod_rewrite.so +# RewriteEngine On +# RewriteCond %{SERVER_PORT} 80 +# RewriteCond %{HTTPS} !=off +# RewriteRule ^/?xcatws/(.*) http://%{SERVER_NAME}/xcatws/$1 [R,L] +# RewriteRule ^/?xcatwsv2/(.*) http://%{SERVER_NAME}/xcatwsv2/$1 [R,L] + +# +# Require all granted +# + +###### finding xcat service account password + +# [root@xcat01(ansible-service) ~]# /opt/xcat/bin/gettab key=xcat passwd.username passwd.password +# passwd.username: ansible +# passwd.password: 0cf5t33lAcc355 + +###### testing curl + +# curl -X GET 'http://127.0.0.1/xcatws/nodes?userName=ansible&userPW=0cf5t33lAcc355' + +###### query the xcat API + + - name: set runtime facts + set_fact: + _xcat_api_token: "{{ 
_xcat_api['xcat_http_scheme'] }}://{{ _xcat_api['xcat_api_endpoint'] }}:{{ _xcat_api['xcat_api_endpoint_port'] }}/xcatws/tokens" + _xcat_api_nodes: "{{ _xcat_api['xcat_http_scheme'] }}://{{ _xcat_api['xcat_api_endpoint'] }}:{{ _xcat_api['xcat_api_endpoint_port'] }}/xcatws/nodes" + + - name: get API auth cookie + uri: + url: "{{ _xcat_api_token }}?pretty=1" + validate_certs: no + method: POST + headers: + Content-Type: application/json + body: '{"userName":"{{ xcat_user }}","userPW":"{{ xcat_password }}"}' + body_format: json + status_code: 201 + vars: + xcat_user: "{{ _xcat_api['xcat_user'] }}" + xcat_password: "{{ _xcat_api['xcat_password'] }}" + register: request + + - name: set API token + set_fact: + _xcat_api_token: "{{ request['json']['token']['id'] }}" + + - name: get nodes list + uri: + url: "{{ _xcat_api_nodes }}" + validate_certs: no + method: GET + headers: + X-Auth-Token: "{{ _xcat_api_token }}" + status_code: 200 + register: request + + - name: set node list + set_fact: + _node_list: "{{ request['json'] }}" + + - name: get nodes attributes + uri: + url: "{{ _xcat_api_nodes }}/{{ entry }}?pretty=1" + validate_certs: no + method: GET + headers: + X-Auth-Token: "{{ _xcat_api_token }}" + status_code: 200 + loop: "{{ _node_list }}" + loop_control: + loop_var: entry + register: request + + # - debug: + # msg: + # - "{{ request }}" + # - "{{ request['results'][1] }}" + # - "{{ _node_list }}" + +###### sort the API request into inventory ingestable format + + - name: create list of hosts and interfaces + set_fact: + _host_interfaces: "{{ _host_interfaces | default([]) +[ {'host': host, 'interfaces': interfaces} ] }}" + loop: "{{ request['results'] }}" + loop_control: + loop_var: entry + vars: + host: "{{ entry['entry'] }}" + host_record: "{{ entry['json'] }}" + interfaces: "{{ host_record[host] | dict2items | selectattr('key', 'search', '^nicips.' 
) | map(attribute='key') | map('split', '.') | map('last') }}" + when: + - interfaces | length >0 + + - name: build inventory artefacts + set_fact: + _xcat_nics: "{{ _xcat_nics | default([]) +[{ 'host': host, 'intspec': [{ 'device': interface, 'ip': nicip, 'network': nicnetwork, 'type': nictype }] }] }}" + _host_groups: "{{ _host_groups | default([]) +[{ 'host': host, 'groups': remove_all_group }] }}" + # build a dict to the ansible yaml inventory spec, include placeholder fields for xcat_nics/ipmi_nic, these fields will be populated with selectattr based on host name from xcat_nics dict + _all_host_group: "{{ _all_host_group | default({}) | combine({ 'all': { 'hosts': { host: { 'ansible_ssh_host': primary_ip, 'xcat_nics': 'placeholder', 'ipmi_nic': 'placeholder' } } } }, recursive=True) }}" + # unused - groups are created from host entries + # _xcat_groups: "{{ _xcat_groups | default([]) +[remove_all_group] }}" + with_subelements: + - "{{ _host_interfaces }}" + - "interfaces" + loop_control: + loop_var: entry + vars: + host: "{{ entry.0['host'] }}" + interface: "{{ entry.1 }}" + request_host_record: "{{ request['results'] | selectattr('entry', '==', host) | map(attribute='json') }}" + nicips_key: "nicips.{{ interface }}" + nicnetworks_key: "nicnetworks.{{ interface }}" + nictype_key: "nictypes.{{ interface }}" + nicip: "{{ request_host_record[0][host][nicips_key] | default('incomplete_record') }}" + nicnetwork: "{{ request_host_record[0][host][nicnetworks_key] | default('incomplete_record') }}" + nictype: "{{ request_host_record[0][host][nictype_key] | default('incomplete_record') }}" + host_groups: "{{ request_host_record[0][host]['groups'].split(',') | default([]) }}" + # remove 'all' group if added by xcat, this is a special group that will be used to store inventory hostvars and merged into the inventory dict + remove_all_group: "{{ host_groups | difference(['all']) }}" + primary_ip: "{{ request_host_record[0][host]['ip'] }}" + when: + - not nicip == 
'incomplete_record' + - not nicnetwork == 'incomplete_record' + - not nictype == 'incomplete_record' + - host_groups | length >0 + + # unused - groups are automatically created from host entries + # - name: set runtime facts + # set_fact: + # _xcat_groups: "{{ _xcat_groups | flatten | unique | sort }}" + + - name: build 'all' group inventory dict + set_fact: + _all_host_group: "{{ _all_host_group | default({}) | combine({ 'all': { 'hosts': { host: all_host_group_entry } } }, recursive=True) }}" + loop: "{{ _all_host_group['all']['hosts'] | list }}" + loop_control: + loop_var: entry + vars: + xcat_nics: "{{ _xcat_nics | selectattr('host', '==', entry) | map(attribute='intspec') | flatten | rejectattr('network', '==', 'ipmi') }}" + ipmi_nic: "{{ _xcat_nics | selectattr('host', '==', entry) | map(attribute='intspec') | flatten | selectattr('network', '==', 'ipmi') }}" + host: "{{ entry }}" + all_host_group_entry: "{{ _all_host_group['all']['hosts'][entry] | combine({'xcat_nics': xcat_nics, 'ipmi_nic': ipmi_nic }, recursive=True) }}" + + - name: append groups to inventory dict + set_fact: + _other_host_groups: "{{ _other_host_groups | default({}) | combine({ group: { 'hosts': { host: none } } }, recursive=True) }}" + with_subelements: + - "{{ _host_groups }}" + - "groups" + loop_control: + loop_var: entry + vars: + host: "{{ entry.0['host'] }}" + group: "{{ entry.1 }}" + + - name: combine _all_host_group with _other_host_groups, inventory complete + set_fact: + _all_host_group: "{{ _all_host_group | default({}) | combine(_other_host_groups, recursive=True) }}" + +###### write the inventory dict to flat file + + - name: write flat file inventory + block: + + - name: find current user uid facts + ansible.builtin.getent: + database: passwd + key: "{{ current_user }}" + vars: + current_user: "{{ lookup('env', 'USER') }}" + + - name: get current uid:gid + ansible.builtin.set_fact: + _current_uid: "{{ ansible_facts['getent_passwd'][current_user][1] }}" + _current_gid: "{{ 
ansible_facts['getent_passwd'][current_user][2] }}" + vars: + current_user: "{{ lookup('env', 'USER') }}" + + - name: find current user primary group gid facts + getent: + database: group + key: "{{ _current_gid }}" + + - name: get user/group names + set_fact: + _current_user: "{{ current_user }}" + _current_group: "{{ primary_group[0] }}" + vars: + current_user: "{{ lookup('env', 'USER') }}" + primary_group: "{{ ansible_facts['getent_group'] | dict2items | map(attribute='key') }}" + + - name: check for existing inventory file + ansible.builtin.stat: + path: "{{ _inventory_file_name }}" + register: _inventory_present + + # note: the regex removal of 'null' (sourced from _other_host_groups {host: none}) tidies the inventory file to look like the ansible spec examples although it is invalid yaml + # ansible doesn't mind the 'null' host value in an inventory file but doesn't like strings + - name: write to file + copy: + content: | + #jinja2: lstrip_blocks: True + {{ _all_host_group | to_nice_yaml(indent=2,sort_keys=False) | regex_replace('null', '') }} + dest: "{{ _inventory_file_name }}" + force: true + owner: "{{ _current_user }}" + group: "{{ _current_group }}" + mode: 0640 + when: + - not _inventory_present['stat']['exists'] | bool or _overwrite_inventory_file | bool + + # test inventory + # ansible -i output.yml -a "uname -a" + + when: + - not _dynamic_inventory + +###### write the networks dict to flat file + + # this same logic should be used to get all unique networks - these will be used in an api query + # need to also get some other info from xcat such as the cluster name? 
+ +###### add hosts to in-memory inventory + + - name: add hosts to in-memory inventory + add_host: > + name={{ host }} + groups={{ host_groups }} + ansible_ssh_host={{ ansible_ssh_host }} + xcat_nics={{ xcat_nics }} + ipmi_nic={{ ipmi_nic }} + dynamic_inventory=True + loop: "{{ _all_host_group['all']['hosts'] | list }}" + loop_control: + loop_var: entry + vars: + host: "{{ entry }}" + ansible_ssh_host: "{{ _all_host_group['all']['hosts'][entry]['ansible_ssh_host'] }}" + xcat_nics: "{{ _all_host_group['all']['hosts'][entry]['xcat_nics'] }}" + ipmi_nic: "{{ _all_host_group['all']['hosts'][entry]['ipmi_nic'] }}" + host_groups: "{{ _host_groups | selectattr('host', '==', entry) | map(attribute='groups') | flatten | unique }}" + when: + - _dynamic_inventory + + # - debug: + # msg: + # - "{{ hostvars['compute001'] }}" + # - "{{ groups }}" + # - "{{ hostvars['compute001']['dynamic_inventory'] }}"