slurm-web,也称为 slurm-wlm-web,是为 Slurm 工作负载管理器提供的一个 Web 界面。Slurm 是一个用于管理大型和小型 Linux 集群的开源、容错且高度可扩展的集群管理和作业调度系统。slurm-web 的主要功能是提供一个方便用户监控和管理 Slurm 集群的途径,而无需直接使用命令行工具。
Slurm-web 在 Slurm 的基础上提供了一个 Web 界面,具有直观的图形视图、清晰的数据洞察和先进的可视化功能,可用于跟踪作业和监控组织内 HPC 超级计算机的状态。Slurm-web 要求有一个可正常运行、并已启用 slurmdbd(版本 >= 23.02)的 Slurm HPC 集群。
Slurm-web的地址:https://github.com/rackslab/Slurm-web,前端采用vue,后端为Python的Flask。
安装步骤

- 安装 slurmrestd
```sh
$ sudo apt install slurmrestd
$ sudo systemctl enable slurmrestd
$ sudo systemctl start slurmrestd
```
列出可用的 API 版本
```sh
# sinfo --version
slurm-wlm 23.11.4
root@andrew-HP:~# slurmrestd -d list -u slurm
Possible data_parser plugins:
data_parser/v0.0.39
data_parser/v0.0.40
```
- 使用 unix 套接字测试 API
```sh
$ curl --unix-socket /run/slurmrestd/slurmrestd.socket http://slurm/slurm/v0.0.40/diag
{
 "statistics": {
 "parts_packed": 1,
 "req_time": {
 "set": true,
 "infinite": false,
 "number": 1736735146
 },
 "req_time_start": {
 "set": true,
 "infinite": false,
 "number": 1736728764
 },
 "server_thread_count": 2,
 "agent_queue_size": 0,
 "agent_count": 0,
 "agent_thread_count": 0,
 "dbd_agent_queue_size": 0,
 "gettimeofday_latency": 15,
 "schedule_cycle_max": 44,
 "schedule_cycle_last": 43,
 "schedule_cycle_total": 107,
 "schedule_cycle_mean": 14,
 "schedule_cycle_mean_depth": 0,
 "schedule_cycle_per_minute": 1,
 "schedule_queue_length": 0,
 "schedule_exit": {
 "end_job_queue": 107,
 "default_queue_depth": 0,
 "max_job_start": 0,
 "max_rpc_cnt": 0,
 "max_sched_time": 0,
 "licenses": 0
 },
 "jobs_submitted": 0,
 "jobs_started": 0,
 "jobs_completed": 0,
 "jobs_canceled": 0,
 "jobs_failed": 0,
 "jobs_pending": 0,
 "jobs_running": 0,
 "job_states_ts": {
 "set": true,
 "infinite": false,
 "number": 1736735127
 },
 "bf_backfilled_jobs": 0,
 "bf_last_backfilled_jobs": 0,
 "bf_backfilled_het_jobs": 0,
 "bf_cycle_counter": 0,
 "bf_cycle_mean": 0,
 "bf_depth_mean": 0,
 "bf_depth_mean_try": 0,
 "bf_cycle_sum": 0,
 "bf_cycle_last": 0,
 "bf_last_depth": 0,
 "bf_last_depth_try": 0,
 "bf_depth_sum": 0,
 "bf_depth_try_sum": 0,
 "bf_queue_len": 0,
 "bf_queue_len_mean": 0,
 "bf_queue_len_sum": 0,
 "bf_table_size": 0,
 "bf_table_size_mean": 0,
 "bf_when_last_cycle": {
 "set": true,
 "infinite": false,
 "number": 0
 },
 "bf_active": false,
 "bf_exit": {
 "end_job_queue": 0,
 "bf_max_job_start": 0,
 "bf_max_job_test": 0,
 "bf_max_time": 0,
 "bf_node_space_size": 0,
 "state_changed": 0
 },
 "rpcs_by_message_type": [
 {
 "message_type": "MESSAGE_NODE_REGISTRATION_STATUS",
 "type_id": 1002,
 "count": 4,
 "average_time": 117,
 "total_time": 470
 },
 {
 "message_type": "ACCOUNTING_REGISTER_CTLD",
 "type_id": 10003,
 "count": 1,
 "average_time": 93108,
 "total_time": 93108
 },
 {
 "message_type": "REQUEST_PING",
 "type_id": 1008,
 "count": 297,
 "average_time": 91,
 "total_time": 27259
 },
 {
 "message_type": "REQUEST_JOB_INFO",
 "type_id": 2003,
 "count": 297,
 "average_time": 61,
 "total_time": 18125
 },
 {
 "message_type": "REQUEST_NODE_INFO",
 "type_id": 2007,
 "count": 299,
 "average_time": 72,
 "total_time": 21815
 },
 {
 "message_type": "REQUEST_PARTITION_INFO",
 "type_id": 2009,
 "count": 299,
 "average_time": 48,
 "total_time": 14621
 }
 ],
 "rpcs_by_user": [
 {
 "user": "root",
 "user_id": 0,
 "count": 4,
 "average_time": 117,
 "total_time": 470
 },
 {
 "user": "slurm",
 "user_id": 1052,
 "count": 1193,
 "average_time": 146,
 "total_time": 174928
 }
 ]
 },
 "meta": {
 "plugin": {
 "type": "openapi/slurmctld",
 "name": "Slurm OpenAPI slurmctld",
 "data_parser": "data_parser/v0.0.40",
 "accounting_storage": "accounting_storage/slurmdbd"
 },
 "client": {
 "source": "/run/slurmrestd/slurmrestd.socket->socket:[78242] (fd 8)",
 "user": "root",
 "group": "root"
 },
 "command": [],
 "slurm": {
 "version": {
 "major": "23",
 "micro": "4",
 "minor": "11"
 },
 "release": "23.11.4",
 "cluster": "cluster"
 }
 },
 "errors": [],
 "warnings": []
}
```
参考资料
- 软件测试精品书籍文档下载持续更新 https://github.com/china-testing/python-testing-examples 请点赞,谢谢!
- 本文涉及的python测试开发库 谢谢点赞! https://github.com/china-testing/python_cn_resouce
- python精品书籍下载 https://github.com/china-testing/python_cn_resouce/blob/main/python_good_books.md
- Linux精品书籍下载 https://www.cnblogs.com/testing-/p/17438558.html
- 官网:https://slurm-web.com/
- https://medium.com/@satishdotpatel/setup-slurm-web-for-slurm-hpc-clusters-13a9873094a1
安装 slurm-web
- 下载软件包的密钥
```sh
 $ curl -sS https://pkgs.rackslab.io/keyring.asc | gpg --dearmor | tee /usr/share/keyrings/rackslab.gpg > /dev/null
 $ vi /etc/apt/sources.list.d/rackslab.sources
 Types: deb
 URIs: https://pkgs.rackslab.io/deb
 Suites: ubuntu24.04
 Components: main
 Architectures: amd64
 Signed-By: /usr/share/keyrings/rackslab.gpg

 $ sudo apt update
 $ sudo apt install slurm-web-agent slurm-web-gateway
 ```
- JWT 密钥
slurm-web 使用 JWT 令牌在组件之间进行身份验证。
```sh
$ /usr/libexec/slurm-web/slurm-web-gen-jwt-key
```
- RacksDB
Slurm-web 使用 RacksDB 生成数据中心机架与计算节点的图形表示。
```sh
$ apt install racksdb
$ cp -r /usr/share/doc/python3-racksdb/examples/db/* /var/lib/racksdb/
$ racksdb datacenters
```
- Slurm-web 配置文件
```sh
 $ vi /etc/slurm-web/agent.ini
 [service]
 cluster=cluster
 interface=localhost
 port=5012

 $ vi /etc/slurm-web/gateway.ini
 [service]
 interface=0.0.0.0
 port=5011

 [ui]
 host=http://172.16.37.34:5011

 [agents]
 url=http://localhost:5012

 $ sudo systemctl restart slurm-web-agent.service
 $ sudo systemctl restart slurm-web-gateway.service
 ```
从浏览器访问 Web UI:http://172.16.37.34:5011(即 gateway.ini 中 [ui] 部分配置的 host 地址)
Slurm-web 仅支持 LDAP 身份验证(默认身份验证已禁用)

选择集群
群集和正在运行的作业概览
作业状态
作业详情
节点状态
服务质量(默认正常)
文章整理自互联网,只做测试使用。发布者:Lomu,转载请注明出处:https://www.it1024doc.com/6506.html
 
                
