slurm_qos: validate inputs and reconcile QoS, accounts and user associations

Adds up-front assertions for the QoS, account and user lists, fails early when
several clusters exist and slurm_cluster_name is unset, and replaces the
per-partition GrpTRES handling with per-partition QOS reconciliation.

Notes on the task logic:
- The validation loops fall back to an empty list, matching the
  "| default([])" guards on the blocks that consume the same variables.
- "Configure Slurm user attributes" only runs for users that define a comment,
  because that is the only attribute the command can set.
- Inside subelements() loops each item is a [user, partition] pair, so user
  fields are read from item.0 and partition fields from item.1.
- Existing partition associations are compared by whole name, not by substring.

diff --git a/slurm_qos/tasks/main.yml b/slurm_qos/tasks/main.yml
index a909786..4ea8a28 100644
--- a/slurm_qos/tasks/main.yml
+++ b/slurm_qos/tasks/main.yml
@@ -1,25 +1,50 @@
 ---
-- name: Ensure Slurm QoS exist
+- name: Validate QoS settings
+  ansible.builtin.assert:
+    that:
+      - item.name is defined
+    fail_msg: "QoS entry missing 'name': {{ item }}"
+  loop: "{{ slurm_qos_settings | default([]) }}"
+  when: slurm_qos_assert_vars | bool
+
+- name: Validate account settings
+  ansible.builtin.assert:
+    that:
+      - item.name is defined
+    fail_msg: "Account entry missing 'name': {{ item }}"
+  loop: "{{ slurm_qos_accounts | default([]) }}"
+  when: slurm_qos_assert_vars | bool
+
+- name: Validate user settings
+  ansible.builtin.assert:
+    that:
+      - item.username is defined
+    fail_msg: "User entry missing 'username': {{ item }}"
+  loop: "{{ slurm_qos_users | default([]) }}"
+  when: slurm_qos_assert_vars | bool
+
+- name: Ensure Slurm QoS configured
   when: slurm_qos_settings | default([]) | length > 0
   block:
-    - name: Check Slurm QoS list
+    - name: Check existing Slurm QoS
       ansible.builtin.command: "sacctmgr -Pn show qos format=name"
       register: qos_list
       changed_when: false
       failed_when: false
 
-    - name: Add Slurm QoS (if missing)
+    - name: Add missing Slurm QoS
       ansible.builtin.command: "sacctmgr -i add qos {{ item.name }}"
       loop: "{{ slurm_qos_settings }}"
       loop_control:
         label: "{{ item.name }}"
       when: qos_list.stdout is not search(item.name)
-      changed_when: true
+      register: qos_add
+      changed_when: qos_add.rc == 0
 
-    - name: Modify Slurm QoS (if already exists)
+    - name: Configure Slurm QoS
       ansible.builtin.command: >
         sacctmgr -i modify qos where name={{ item.name }} set
-        {% if item.max_tres is defined and item.max_tres is not none %}MaxTRES={{ item.max_tres }}{% endif %}
+        {% if item.max_tres_per_user is defined and item.max_tres_per_user is not none %}MaxTRESPU={{ item.max_tres_per_user }}{% endif %}
         {% if item.max_submit_jobs is defined and item.max_submit_jobs is not none %}MaxSubmitJobs={{ item.max_submit_jobs }}{% endif %}
         {% if item.max_jobs_per_user is defined and item.max_jobs_per_user is not none %}MaxJobsPerUser={{ item.max_jobs_per_user }}{% endif %}
         {% if item.grace_time is defined and item.grace_time is not none %}GraceTime={{ item.grace_time }}{% endif %}
@@ -27,8 +52,9 @@
       loop: "{{ slurm_qos_settings }}"
       loop_control:
         label: "{{ item.name }}"
-      when: qos_list.stdout is search(item.name)
-      changed_when: true
+      register: qos_modify
+      changed_when: qos_modify.rc == 0
+      failed_when: qos_modify.rc != 0 and "Nothing modified" not in qos_modify.stdout
 
 - name: Ensure Slurm accounts exist
   when: slurm_qos_accounts | default([]) | length > 0
@@ -44,6 +70,11 @@
         slurm_cluster_name: "{{ cluster_list.stdout_lines[0] }}"
       when: cluster_list.stdout_lines | length == 1
 
+    - name: Fail if multiple clusters detected without explicit config
+      ansible.builtin.fail:
+        msg: "Multiple clusters found. Set 'slurm_cluster_name' explicitly."
+      when: cluster_list.stdout_lines | length > 1 and slurm_cluster_name is not defined
+
     - name: Check if Slurm account exists
       ansible.builtin.command: "sacctmgr -Pn list account name={{ item.name }} format=account"
       register: account_check
@@ -66,7 +97,7 @@
       when: item.stdout is not search(item.item.name)
       changed_when: true
 
-- name: Ensure Slurm users exist with partition limits
+- name: Ensure Slurm users and partitions configured
   when: slurm_qos_users | default([]) | length > 0
   block:
     - name: Check existing Slurm users
@@ -75,95 +106,106 @@
       changed_when: false
       failed_when: false
 
-    - name: Add Slurm user if not already present
+    - name: Add new Slurm users if missing with default preempt partition
       ansible.builtin.command: >
         sacctmgr -i add user name={{ item.username }}
         cluster={{ slurm_cluster_name }}
         account={{ item.sponsor | default('orcd') }}
-        {% if item.partitions is defined and item.partitions | length > 0 %}
-        partition={{ item.partitions | map(attribute='name') | join(',') }}
-        {% else %}
-        partition=debug,preempt
-        {% endif %}
-        {% if item.qos_list is defined and item.qos_list | length > 0 %}
-        QOS={{ item.qos_list | join(',') }}
-        {% else %}
-        QOS=debug_qos,preempt_qos
-        {% endif %}
-        {% if item.qos_default is defined %}
-        DefaultQOS={{ item.qos_default }}
-        {% else %}
-        DefaultQOS=preempt_qos
-        {% endif %}
-        {% if itm.comment is defined %}Comment="{{ item.comment }}"{% endif %}
+        partition=preempt
+        QOS=preempt_qos
+        DefaultQOS={{ item.qos_default | default('preempt_qos') }}
+        {% if item.comment is defined %}Comment="{{ item.comment }}"{% endif %}
       loop: "{{ slurm_qos_users }}"
       loop_control:
         label: "{{ item.username }}"
       when: user_list.stdout is not search(item.username)
-      changed_when: true
+      register: user_add
+      changed_when: user_add.rc == 0
 
-    - name: Check existing partition associations
-      ansible.builtin.command: |
-        sacctmgr -Pn show assoc where user={{ item.username }} cluster={{ slurm_cluster_name }} account={{ item.sponsor | default('orcd') }} format=Partition,GrpTRES,MaxNodes
-      register: assoc_check
+    - name: Configure Slurm user attributes
+      ansible.builtin.command: >
+        sacctmgr -i modify user where name={{ item.username }}
+        cluster={{ slurm_cluster_name }}
+        account={{ item.sponsor | default('orcd') }} set
+        {% if item.comment is defined %}Comment="{{ item.comment }}"{% endif %}
       loop: "{{ slurm_qos_users }}"
       loop_control:
         label: "{{ item.username }}"
-      when: user_list.stdout is search(item.username)
+      # Comment is the only attribute set here; skip users without one so "set" is never empty.
+      when: item.comment is defined
+      register: user_modify
+      changed_when: user_modify.rc == 0
+      failed_when: user_modify.rc != 0 and "Nothing modified" not in user_modify.stdout
+
+    - name: Check existing partition associations for users
+      ansible.builtin.command: >
+        sacctmgr -Pn show assoc where user={{ item.username }}
+        cluster={{ slurm_cluster_name }}
+        account={{ item.sponsor | default('orcd') }}
+        format=partition
+      loop: "{{ slurm_qos_users }}"
+      loop_control:
+        label: "{{ item.username }}"
+      register: assoc_check
       changed_when: false
       failed_when: false
 
-    - name: Ensure partition associations exist for existing users
+    - name: Add missing partition associations for users
       ansible.builtin.command: >
         sacctmgr -i add user name={{ item.0.username }}
         cluster={{ slurm_cluster_name }}
         account={{ item.0.sponsor | default('orcd') }}
         partition={{ item.1.name }}
-        {% if item.1.name == 'admin' %}
-        QOS=admin_qos DefaultQOS=admin_qos
+        {% if item.1.qos is defined %}
+        QOS={{ item.1.qos }}
+        DefaultQOS={{ item.1.qos }}
         {% else %}
-        QOS={{ item.0.qos_list | join(',') }} DefaultQOS={{ item.0.qos_default | default('preempt_qos') }}
+        QOS={{ slurm_partition_qos_defaults[item.1.name] | default(item.0.qos_default | default('preempt_qos')) }}
+        DefaultQOS={{ slurm_partition_qos_defaults[item.1.name] | default(item.0.qos_default | default('preempt_qos')) }}
         {% endif %}
-      loop: "{{ slurm_qos_users | subelements('partitions') }}"
+      loop: "{{ slurm_qos_users | subelements('partitions', skip_missing=True) }}"
       loop_control:
         label: "{{ item.0.username }} - {{ item.1.name }}"
-      when:
-        - user_list.stdout is search(item.0.username)
-        - assoc_check.results | selectattr('item.username', 'equalto', item.0.username) | map(attribute='stdout_lines') | flatten | join(',') is not search(item.1.name)
-      changed_when: true
+      # Compare whole partition names so an existing "gpu_long" does not hide a missing "gpu".
+      when: >
+        item.1.name not in (assoc_check.results | selectattr('item.username', 'equalto', item.0.username) | map(attribute='stdout_lines') | flatten)
+      register: assoc_add
+      changed_when: assoc_add.rc == 0
 
-    - name: Debug partition-specific limits
-      ansible.builtin.debug:
-        msg: >
-          For {{ item.0.username }} - {{ item.1.name }}:
-          grptres={{ grptres | default('unset') }},
-          max_nodes={{ max_nodes | default('unset') }}
-      vars:
-        partition_defaults: "{{ slurm_qos_partition_defaults.partitions[item.1.name] }}"
-        grptres: "{{ item.1.grptres | default(partition_defaults.grptres) }}"
-        max_nodes: "{{ item.1.max_nodes | default(partition_defaults.max_nodes) }}"
-      loop: "{{ slurm_qos_users | subelements('partitions') }}"
-      loop_control:
-        label: "{{ item.0.username }} - {{ item.1.name }}"
-      when: user_list.stdout is search(item.0.username)
-
-    - name: Update partition-specific limits (GrpTres only)
+    - name: Ensure partition associations for users (defined partitions)
+      # Each loop item is a [user, partition] pair, so user fields are read from item.0.
       ansible.builtin.command: >
         sacctmgr -i modify user where name={{ item.0.username }}
         cluster={{ slurm_cluster_name }}
         account={{ item.0.sponsor | default('orcd') }}
-        partition={{ item.1.name }}
-        set
-        {% if grptres is defined and grptres is not none %} GrpTRES={{ grptres }}{% endif %}
-      vars:
-        partition_defaults: "{{ slurm_qos_partition_defaults.partitions[item.1.name] }}"
-        grptres: "{{ item.1.grptres | default(partition_defaults.grptres) }}"
-        max_nodes: "{{ item.1.max_nodes | default(partition_defaults.max_nodes) }}"
-      loop: "{{ slurm_qos_users | subelements('partitions') }}"
+        partition={{ item.1.name }} set
+        {% if item.1.qos is defined %}
+        QOS={{ item.1.qos }}
+        DefaultQOS={{ item.1.qos }}
+        {% else %}
+        QOS={{ slurm_partition_qos_defaults[item.1.name] | default(item.0.qos_default | default('preempt_qos')) }}
+        DefaultQOS={{ slurm_partition_qos_defaults[item.1.name] | default(item.0.qos_default | default('preempt_qos')) }}
+        {% endif %}
+      loop: "{{ slurm_qos_users | subelements('partitions', skip_missing=True) }}"
       loop_control:
         label: "{{ item.0.username }} - {{ item.1.name }}"
-      when:
-        - user_list.stdout is search(item.0.username)
-        - grptres is defined and grptres is not none
-      changed_when: true
-      failed_when: false
+      register: assoc_modify
+      changed_when: assoc_modify.rc == 0
+      failed_when: assoc_modify.rc != 0 and "Nothing modified" not in assoc_modify.stdout
+
+    - name: Ensure default preempt partition association
+      ansible.builtin.command: >
+        sacctmgr -i modify user where
+        name={{ item.username }}
+        cluster={{ slurm_cluster_name }}
+        account={{ item.sponsor | default('orcd') }}
+        partition=preempt set
+        QOS=preempt_qos
+        DefaultQOS={{ item.qos_default | default('preempt_qos') }}
+      loop: "{{ slurm_qos_users }}"
+      loop_control:
+        label: "{{ item.username }}"
+      when: item.partitions is not defined or item.partitions | length == 0
+      register: default_assoc_modify
+      changed_when: default_assoc_modify.rc == 0
+      failed_when: default_assoc_modify.rc != 0 and "Nothing modified" not in default_assoc_modify.stdout