Getting started with AMD (ROCm Kernel)
=====================================================

Last updated: 07/06/2025.

Author: Yusheng Su

Setup
-----

If you run verl on AMD GPUs (e.g., MI300) with the ROCm platform, the previous quickstart guide does not apply. You need to build a Docker image by following the steps below, and set the environment variable ``RAY_EXPERIMENTAL_NOSET_ROCR_VISIBLE_DEVICES`` or ``RAY_EXPERIMENTAL_NOSET_HIP_VISIBLE_DEVICES`` when launching Ray for verl's RLHF (Reinforcement Learning from Human Feedback) training.

docker/Dockerfile.rocm
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

.. code-block:: bash

    FROM rlsys/rocm-6.3.4-patch:rocm6.3.4-numa-patch_ubuntu-22.04

    SHELL ["/bin/bash", "-ceuxo", "pipefail"]

    ENV MAX_JOBS=512
    ENV PATH="/usr/local/python3.12/bin:$PATH"

    RUN ln -sf /usr/bin/python3.12 /usr/bin/python && \
        ln -sf /usr/bin/pip3.12 /usr/bin/pip

    ############################################
    RUN apt-get update
    RUN apt-get install -y pkg-config liblzma-dev
    ############################################

    ###########################################
    ##########Install TransformerEngine########
    ###########################################
    WORKDIR /workspace/
    # transformer-engine install
    # https://github.com/ROCm/TransformerEngine
    RUN rm -rf TransformerEngine
    RUN git clone --recursive https://github.com/ROCm/TransformerEngine.git
    WORKDIR /workspace/TransformerEngine
    RUN git checkout 236178e5
    # git checkout bb061ade
    # git checkout 864405c
    ENV NVTE_FRAMEWORK=pytorch
    ENV NVTE_ROCM_ARCH=gfx942
    ENV NVTE_USE_HIPBLASLT=1
    ENV NVTE_USE_ROCM=1
    # export CMAKE_PREFIX_PATH="/opt/rocm:/opt/rocm/hip:/usr/local:/usr:${CMAKE_PREFIX_PATH:-}"
    ENV CMAKE_PREFIX_PATH="/opt/rocm:/opt/rocm/hip:/usr/local:/usr"
    RUN MAX_JOBS=${MAX_JOBS} pip install . -vvv
    WORKDIR /workspace/
    ###########################################

    ####################################################################################
    ################Install vllm - sglang requires the vllm 0.6.7 dependency###########
    ####################################################################################
    #### Requires vllm 0.6.7 - checkout 113274a0
    WORKDIR /workspace/
    RUN rm -rf vllm
    RUN pip uninstall -y vllm
    # Refer to here (down-grade vllm to 0.6.3): https://docs.vllm.ai/en/v0.6.3/getting_started/amd-installation.html
    RUN git clone https://github.com/ROCm/vllm.git
    # git clone https://github.com/vllm-project/vllm.git
    WORKDIR /workspace/vllm
    RUN git checkout 113274a0
    ENV PYTORCH_ROCM_ARCH="gfx90a;gfx942"
    #ENV MAX_JOBS=512
    ENV MAX_JOBS=${MAX_JOBS}
    RUN pip install "boto3>=1.26.0"
    RUN pip install setuptools_scm
    # setuptools_scm will add src into the package; you can delete the repo afterwards.
    RUN python3 setup.py install
    WORKDIR /workspace/
    ####################################################################################

    ###########################################
    ############For hack docker################
    ###########################################
    RUN pip install setuptools==75.8.0
    ###########################################

    ###########################################
    ############Build sglang###################
    ###########################################
    # Set environment variables
    ENV BASE_DIR=/sgl-workspace
    ENV BUILD_TYPE=all
    ENV SGL_REPO=https://github.com/sgl-project/sglang
    ENV SGL_BRANCH=v0.4.6.post5
    ENV TRITON_REPO=https://github.com/ROCm/triton.git
    ENV TRITON_COMMIT=improve_fa_decode_3.0.0
    ENV AITER_REPO=https://github.com/ROCm/aiter.git
    ENV AITER_COMMIT=v0.1.2
    # v0.1.2 version - commit id: 9d11f47
    # ENV AITER_COMMIT=9d11f47
    ENV HIP_FORCE_DEV_KERNARG=1
    ENV HSA_NO_SCRATCH_RECLAIM=1
    ENV SGLANG_SET_CPU_AFFINITY=1
    ENV SGLANG_ALLOW_OVERWRITE_LONGER_CONTEXT_LEN=1
    ENV NCCL_MIN_NCHANNELS=112
    ENV MOE_PADDING=1
    ENV VLLM_FP8_PADDING=1
    ENV VLLM_FP8_ACT_PADDING=1
    ENV VLLM_FP8_WEIGHT_PADDING=1
    ENV VLLM_FP8_REDUCE_CONV=1
    ENV TORCHINDUCTOR_MAX_AUTOTUNE=1
    ENV TORCHINDUCTOR_MAX_AUTOTUNE_POINTWISE=1
    ENV HIPCC_COMPILE_FLAGS_APPEND="--offload-arch=gfx942"
    ENV AMDGPU_TARGETS=gfx942
    ENV ROCM_ARCH=gfx942
    ENV PYTORCH_ROCM_ARCH="gfx90a;gfx942"

    # Switch to the working directory
    WORKDIR /sgl-workspace

    # Clean and create the directory
    RUN rm -rf /sgl-workspace && mkdir -p /sgl-workspace

    # Clone and build sglang
    RUN git clone ${SGL_REPO} \
        && cd sglang \
        && git checkout ${SGL_BRANCH} || echo "Using default branch" \
        && cd sgl-kernel \
        && rm -f pyproject.toml \
        && mv pyproject_rocm.toml pyproject.toml \
        && python setup_rocm.py install \
        && cd .. \
        && if [ "$BUILD_TYPE" = "srt" ]; then \
               python -m pip --no-cache-dir install -e "python[srt_hip]"; \
           else \
               python -m pip --no-cache-dir install -e "python[all_hip]"; \
           fi \
        && cd /sgl-workspace \
        && cp -r /sgl-workspace/sglang /sglang \
        && python -m pip cache purge

    # Install common Python packages
    RUN pip install IPython orjson python-multipart torchao pybind11

    # Rebuild Triton
    RUN pip uninstall -y triton || true \
        && git clone ${TRITON_REPO} \
        && cd triton \
        && git checkout ${TRITON_COMMIT} \
        && cd python \
        && python3 setup.py install \
        && cd /sgl-workspace

    # ENV HIPCC_COMPILE_FLAGS_APPEND="--offload-arch=gfx942 --amdgpu-lower-module-lds-strategy=1"
    # ENV HIPCC_COMPILE_FLAGS_APPEND="--offload-arch=gfx942"

    # Build aiter (version: tag v0.1.2, commit 9d11f47)
    RUN pip uninstall -y aiter || true
    RUN git clone ${AITER_REPO} \
        && cd aiter \
        && git checkout ${AITER_COMMIT} \
        && git submodule sync \
        && git submodule update --init --recursive \
        && PREBUILD_KERNELS=1 GPU_ARCHS=gfx942 python3 setup.py install \
        && cd /sgl-workspace

    # Copy MI300X config
    RUN find /sgl-workspace/sglang/python/sglang/srt/layers/quantization/configs/ \
             /sgl-workspace/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/ \
             -type f -name '*MI300X*' | \
        xargs -I {} sh -c 'vf_config=$(echo "$1" | sed "s/MI300X/MI300X_VF/"); cp "$1" "$vf_config"' -- {}

    RUN echo "Environment setup complete."
    WORKDIR /workspace/
    ###########################################

    ###########################################
    ###############vllm v0.8.5#################
    ###########################################
    WORKDIR /workspace/
    ENV VLLM_TARGET_DEVICE=rocm
    ENV ROCM_PATH=/opt/rocm
    ENV SETUPTOOLS_SCM_PRETEND_VERSION=0.8.5.dev
    # Find the repo path in: DockerFile/Dockerfile.rocm_yang
    # RUN git clone https://github.com/RLFoundation/vllm-patch.git
    RUN pip uninstall -y vllm || true
    RUN rm -rf vllm-patch
    RUN git clone https://github.com/RLFoundation/vllm-patch.git \
        && cd vllm-patch \
        && git checkout v0.8.5-sleep-numa \
        && rm -rf build/ dist/ *.egg-info \
        && ln -sf /opt/rocm/lib/libamdhip64.so /usr/lib/libamdhip64.so \
        && SETUPTOOLS_SCM_PRETEND_VERSION=0.8.5.dev PYTORCH_ROCM_ARCH="gfx90a;gfx942" MAX_JOBS=${MAX_JOBS} python3 setup.py install
    # RUN SETUPTOOLS_SCM_PRETEND_VERSION=0.8.5.dev PYTORCH_ROCM_ARCH="gfx90a;gfx942" MAX_JOBS=${MAX_JOBS} python3 setup.py develop
    WORKDIR /workspace/
    ###########################################

    #########################################
    ####Install megatron-core################
    #########################################
    RUN pip uninstall -y megatron-core && \
        git clone https://github.com/yushengsu-thu/Megatron-LM-amd_version.git && \
        cd Megatron-LM-amd_version && \
        pip install -vvv -e . && \
        cd /workspace/
    #########################################

    #######################################
    ################apex###################
    #######################################
    WORKDIR /workspace/
    RUN pip uninstall -y apex && \
        git clone https://github.com/ROCm/apex.git && \
        cd apex && \
        python setup.py install && \
        cd /workspace/
    #######################################

    ################################################################################
    ###########################Add torch_memory_saver###############################
    ################################################################################
    # Set environment variables
    ENV HIPCC_COMPILE_FLAGS_APPEND="--amdgpu-target=gfx90a;gfx942 -D__HIP_PLATFORM_AMD__"
    ENV CFLAGS="-D__HIP_PLATFORM_AMD__"
    ENV CXXFLAGS="-D__HIP_PLATFORM_AMD__"
    RUN pip install "git+https://github.com/YangWang92/torch_memory_saver_numa.git@numa"
    ################################################################################

    ########################################
    ######Install ray#######################
    ########################################
    # Needs this patch: https://github.com/ray-project/ray/pull/53531/files
    RUN pip uninstall -y ray
    RUN pip install "ray[data,train,tune,serve]>=2.47.0"
    ########################################

    ##########################################
    #######Install other dependencies#########
    ##########################################
    RUN pip install "tensordict==0.6.2" --no-deps && \
        pip install accelerate \
            codetiming \
            datasets \
            dill \
            hydra-core \
            liger-kernel \
            numpy \
            pandas \
            peft \
            "pyarrow>=15.0.0" \
            pylatexenc \
            torchdata \
            wandb \
            orjson \
            pybind11
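
    # Optional sanity check (an addition, not part of the original recipe):
    # uncomment to fail the build early if the ROCm torch stack cannot be imported.
    # RUN python3 -c "import torch; print(torch.__version__, torch.version.hip)"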
    WORKDIR /workspace/
    RUN git clone https://github.com/volcengine/verl.git && \
        cd verl && \
        pip install -e .
    ##########################################

    WORKDIR /workspace/

    CMD ["/usr/bin/bash"]

Build the image:
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

.. code-block:: bash

    docker build -f docker/Dockerfile.rocm -t verl-rocm .

Pull the image:
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

Note: instead of building locally, you can pull a pre-built Docker image from this DockerHub account: `RLSys Foundation <https://hub.docker.com/u/yushengsuthu>`_

.. code-block:: bash

    docker pull rlsys/verl:verl-0.4.1_ubuntu-22.04_rocm6.3.4-numa-patch_vllm0.8.5_sglang0.4.6.post4
    docker tag rlsys/verl:verl-0.4.1_ubuntu-22.04_rocm6.3.4-numa-patch_vllm0.8.5_sglang0.4.6.post4 verl-rocm:latest

Run the container
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

.. code-block:: bash

    docker run --rm -it \
      --device /dev/dri \
      --device /dev/kfd \
      -p 8265:8265 \
      --group-add video \
      --cap-add SYS_PTRACE \
      --security-opt seccomp=unconfined \
      --privileged \
      -v $HOME/.ssh:/root/.ssh \
      -v $HOME:$HOME \
      --shm-size 128G \
      -w $PWD \
      verl-rocm \
      /bin/bash

Optional: run as a non-root user
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

If you do not want to run in root mode and want to run as your own user, add ``-e HOST_UID=$(id -u)`` and ``-e HOST_GID=$(id -g)`` to the docker launch command above.

Examples
--------

Because of the special torch setup for AMD (ROCm):

1. If your ``ray>=2.45.0`` (the default), you need to set ``RAY_EXPERIMENTAL_NOSET_HIP_VISIBLE_DEVICES`` when launching Ray for verl's RLHF training, and apply this `patch <https://github.com/ray-project/ray/pull/53531/files>`_.
2. If your ``ray<2.45.0``, you need to set ``RAY_EXPERIMENTAL_NOSET_ROCR_VISIBLE_DEVICES`` when launching Ray for verl's RLHF training.

The inference engine ``$ENGINE`` can be ``vllm`` or ``sglang``; the examples below default to ``vllm``. A short sanity check after the PPO example shows how to confirm that Ray registers the GPUs.

PPO
~~~

.. code-block:: bash

    YOUR_PROJECT_NAME=r1-verl-ppo-upstream
    YOUR_RUN_NAME=r1-training_ppo-upstream
    # export HYDRA_FULL_ERROR=1
    export HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7

    # [ray] < 2.45.0
    #export RAY_EXPERIMENTAL_NOSET_ROCR_VISIBLE_DEVICES=1
    # [ray] >= 2.45.0
    export RAY_EXPERIMENTAL_NOSET_HIP_VISIBLE_DEVICES=1
    # Patch with https://github.com/ray-project/ray/pull/52794

    GPUS_PER_NODE=8
    MODEL_PATH=Qwen/Qwen2.5-0.5B-Instruct

    python3 examples/data_preprocess/gsm8k.py --local_save_dir data/gsm8k
    python3 -c "import transformers; transformers.pipeline('text-generation', model='$MODEL_PATH')"

    ENGINE=vllm #sglang

    PYTHONUNBUFFERED=1 python3 -m verl.trainer.main_ppo \
        data.train_files=data/gsm8k/train.parquet \
        data.val_files=data/gsm8k/test.parquet \
        data.train_batch_size=256 \
        data.val_batch_size=1312 \
        data.max_prompt_length=512 \
        data.max_response_length=256 \
        actor_rollout_ref.model.path=$MODEL_PATH \
        actor_rollout_ref.actor.optim.lr=1e-6 \
        actor_rollout_ref.actor.ppo_mini_batch_size=64 \
        actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=4 \
        actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=8 \
        actor_rollout_ref.rollout.tensor_model_parallel_size=1 \
        actor_rollout_ref.rollout.name=$ENGINE \
        actor_rollout_ref.rollout.gpu_memory_utilization=0.8 \
        actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=4 \
        critic.optim.lr=1e-5 \
        critic.model.path=$MODEL_PATH \
        critic.ppo_micro_batch_size_per_gpu=4 \
        algorithm.kl_ctrl.kl_coef=0.001 \
        trainer.logger=console \
        trainer.project_name=$YOUR_PROJECT_NAME \
        trainer.experiment_name=$YOUR_RUN_NAME \
        trainer.val_before_train=False \
        trainer.n_gpus_per_node=$GPUS_PER_NODE \
        trainer.nnodes=1 \
        trainer.save_freq=10 \
        trainer.test_freq=10 \
        trainer.total_epochs=15 #2>&1 | tee verl_demo.log
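
Before launching PPO or GRPO, it can help to confirm that Ray actually registers the AMD GPUs under the chosen ``RAY_EXPERIMENTAL_NOSET_*`` setting. The following is a minimal sketch (not part of the original recipe); it assumes ``ray`` and the ROCm build of ``torch`` from the image above are importable:

.. code-block:: bash

    # Run inside the container before launching training.
    export RAY_EXPERIMENTAL_NOSET_HIP_VISIBLE_DEVICES=1   # [ray] >= 2.45.0

    # ROCm builds of PyTorch report HIP devices through the CUDA API.
    python3 -c "import torch; print(torch.version.hip, torch.cuda.device_count())"

    # 'GPU' in Ray's resource dict should match the number of visible devices.
    python3 -c "import ray; ray.init(); print(ray.cluster_resources().get('GPU', 0)); ray.shutdown()"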

GRPO
~~~~

.. code-block:: bash

    YOUR_PROJECT_NAME=r1-verl-grpo-upstream
    YOUR_RUN_NAME=r1-training_grpo-upstream
    # export HYDRA_FULL_ERROR=1
    # export FSDP_VERBOSE=1
    #export HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7

    # [ray] < 2.45.0
    #export RAY_EXPERIMENTAL_NOSET_ROCR_VISIBLE_DEVICES=1
    # [ray] >= 2.45.0
    export RAY_EXPERIMENTAL_NOSET_HIP_VISIBLE_DEVICES=1
    # Patch with https://github.com/ray-project/ray/pull/52794

    GPUS_PER_NODE=8
    MODEL_PATH=Qwen/Qwen2.5-0.5B-Instruct
    # MODEL_PATH=Qwen/Qwen2-7B-Instruct

    python3 examples/data_preprocess/gsm8k.py --local_save_dir data/gsm8k
    python3 -c "import transformers; transformers.pipeline('text-generation', model='$MODEL_PATH')"

    ENGINE=vllm #sglang

    python3 -m verl.trainer.main_ppo \
        algorithm.adv_estimator=grpo \
        data.train_files=data/gsm8k/train.parquet \
        data.val_files=data/gsm8k/test.parquet \
        data.train_batch_size=1024 \
        data.val_batch_size=1312 \
        data.max_prompt_length=512 \
        data.max_response_length=1024 \
        actor_rollout_ref.model.path=$MODEL_PATH \
        actor_rollout_ref.actor.optim.lr=1e-6 \
        actor_rollout_ref.model.use_remove_padding=True \
        actor_rollout_ref.actor.ppo_mini_batch_size=256 \
        actor_rollout_ref.actor.use_dynamic_bsz=True \
        actor_rollout_ref.actor.ppo_max_token_len_per_gpu=24000 \
        actor_rollout_ref.actor.use_kl_loss=True \
        actor_rollout_ref.actor.kl_loss_coef=0.001 \
        actor_rollout_ref.actor.kl_loss_type=low_var_kl \
        actor_rollout_ref.model.enable_gradient_checkpointing=False \
        actor_rollout_ref.actor.fsdp_config.param_offload=False \
        actor_rollout_ref.actor.fsdp_config.optimizer_offload=False \
        actor_rollout_ref.rollout.tensor_model_parallel_size=2 \
        actor_rollout_ref.rollout.name=$ENGINE \
        actor_rollout_ref.rollout.gpu_memory_utilization=0.8 \
        actor_rollout_ref.rollout.n=5 \
        actor_rollout_ref.ref.fsdp_config.param_offload=False \
        algorithm.kl_ctrl.kl_coef=0.001 \
        trainer.critic_warmup=0 \
        trainer.logger=console \
        trainer.project_name=$YOUR_PROJECT_NAME \
        trainer.experiment_name=$YOUR_RUN_NAME \
        trainer.n_gpus_per_node=$GPUS_PER_NODE \
        trainer.val_before_train=False \
        trainer.nnodes=1 \
        trainer.save_freq=-1 \
        trainer.test_freq=10 \
        trainer.total_epochs=15

Multi-node training: Slurm with Docker/Podman containers
---------------------------------------------------------------------------------------

If you want to run multi-node training with Slurm, you can use the following script.

.. note::

    1. You need ``podman`` or ``docker`` for the script below. We will release an apptainer script later.
    2. If you want to use ``podman``, simply replace ``docker`` with ``podman`` in the script.

The script covers the following steps; the Ray bootstrap at its core is sketched right after this list:

1. Slurm configuration
2. Environment setup
3. Docker/Podman container setup
4. Ray cluster initialization
5. Data preprocessing
6. Model setup
7. Training launch
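
Step 4 (Ray cluster initialization) is the part that most often needs adapting. Stripped of the container plumbing, it reduces to roughly the following sketch; ``$head_node_ip``, the port, and the CPU/GPU counts are placeholders that the full script below fills in from Slurm:

.. code-block:: bash

    # Sketch only -- see slurm_script.sh below for the docker exec wrapping,
    # environment variables, and error handling.
    port=6379

    # On the first Slurm node: start the Ray head.
    ray start --head --node-ip-address="$head_node_ip" --port=$port \
        --num-cpus "$SLURM_CPUS_PER_TASK" --num-gpus "$SLURM_GPUS_PER_NODE" --block &

    # On every other node: join the head.
    ray start --address "$head_node_ip:$port" \
        --num-cpus "$SLURM_CPUS_PER_TASK" --num-gpus "$SLURM_GPUS_PER_NODE" --block &

    # The training command is then launched once, against the head node.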

slurm_script.sh
~~~~~~~~~~~~~~~~~~~~

.. code-block:: bash

    #!/bin/bash

    #SBATCH --job-name=verl-ray-on-slurm
    #SBATCH --nodes=2
    #SBATCH --ntasks-per-node=2
    #SBATCH --mem=200G
    #SBATCH --time=30-00:00:00
    #SBATCH --gpus-per-node=8
    #SBATCH --cpus-per-task=28
    #SBATCH --output=../verl_log/slurm-%j.out
    #SBATCH --error=../verl_log/slurm-%j.err
    #SBATCH --nodelist=gpu-[0,1]

    # load necessary modules
    ### Run this setup
    # [Cluster]: Use docker
    # docker pull docker.io/rocm/vllm:rocm6.2_mi300_ubuntu20.04_py3.9_vllm_0.6.4

    ##########################################################################
    ###The following settings should be adapted per project and cluster######
    ##########################################################################
    ### Project
    CONTAINER_NAME="multinode_verl_training"
    IMG="verl.rocm"
    DOCKERFILE="docker/Dockerfile.rocm"
    # echo $PWD
    verl_workdir="${HOME}/projects/verl_upstream"
    export TRANSFORMERS_CACHE="${HOME}/.cache/huggingface"
    export HF_HOME=$TRANSFORMERS_CACHE

    ### Cluster network settings
    export NCCL_DEBUG=TRACE
    export GPU_MAX_HW_QUEUES=2
    export TORCH_NCCL_HIGH_PRIORITY=1
    export NCCL_CHECKS_DISABLE=1
    # export NCCL_IB_HCA=rdma0,rdma1,rdma2,rdma3,rdma4,rdma5,rdma6,rdma7
    export NCCL_IB_HCA=mlx5_0,mlx5_1,mlx5_2,mlx5_3,mlx5_4,mlx5_5,mlx5_8,mlx5_9
    export NCCL_IB_GID_INDEX=3
    export NCCL_CROSS_NIC=0
    export CUDA_DEVICE_MAX_CONNECTIONS=1
    export NCCL_PROTO=Simple
    export RCCL_MSCCL_ENABLE=0
    export TOKENIZERS_PARALLELISM=false
    export HSA_NO_SCRATCH_RECLAIM=1
    ##########################################################################

    ## Assign the GPUs to use
    export HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7

    ### For ROCm and the training script
    # [ray] < 2.45.0
    #export RAY_EXPERIMENTAL_NOSET_ROCR_VISIBLE_DEVICES=1
    # [ray] >= 2.45.0
    export RAY_EXPERIMENTAL_NOSET_HIP_VISIBLE_DEVICES=1
    # Patch with https://github.com/ray-project/ray/pull/52794

    # Build and launch the Docker container
    srun bash -c "
        # Exit on any error
        set -e

        # Clean up dangling (untagged) images
        docker image prune -f

        # Pull the image first
        docker pull rlsys/verl:verl-0.4.1_ubuntu-22.04_rocm6.3.4-numa-patch_vllm0.8.5_sglang0.4.6.post4

        if ! docker images --format \"{{.Repository}}:{{.Tag}}\" | grep -q \"${IMG}\"; then
            echo \"Building ${IMG} image...\"
            docker build -f \"${DOCKERFILE}\" -t \"${IMG}\" .
        else
            echo \"${IMG} image already exists, skipping build\"
        fi

        # Remove the old container if it exists
        docker rm \"${CONTAINER_NAME}\" 2>/dev/null || true

        # Check the network devices
        ibdev2netdev

        # Launch the docker
        docker run --rm -d \
            -e HYDRA_FULL_ERROR=1 \
            -e RAY_EXPERIMENTAL_NOSET_ROCR_VISIBLE_DEVICES=1 \
            -e RAY_EXPERIMENTAL_NOSET_HIP_VISIBLE_DEVICES=1 \
            -e NCCL_DEBUG=${NCCL_DEBUG} \
            -e GPU_MAX_HW_QUEUES=${GPU_MAX_HW_QUEUES} \
            -e TORCH_NCCL_HIGH_PRIORITY=${TORCH_NCCL_HIGH_PRIORITY} \
            -e NCCL_CHECKS_DISABLE=${NCCL_CHECKS_DISABLE} \
            -e NCCL_IB_HCA=${NCCL_IB_HCA} \
            -e NCCL_IB_GID_INDEX=${NCCL_IB_GID_INDEX} \
            -e NCCL_CROSS_NIC=${NCCL_CROSS_NIC} \
            -e CUDA_DEVICE_MAX_CONNECTIONS=${CUDA_DEVICE_MAX_CONNECTIONS} \
            -e NCCL_PROTO=${NCCL_PROTO} \
            -e RCCL_MSCCL_ENABLE=${RCCL_MSCCL_ENABLE} \
            -e TOKENIZERS_PARALLELISM=${TOKENIZERS_PARALLELISM} \
            -e HSA_NO_SCRATCH_RECLAIM=${HSA_NO_SCRATCH_RECLAIM} \
            -e TRANSFORMERS_CACHE=${TRANSFORMERS_CACHE} \
            -e HF_HOME=${HF_HOME} \
            --network host \
            --device /dev/dri \
            --device /dev/kfd \
            --device /dev/infiniband \
            --group-add video \
            --cap-add SYS_PTRACE \
            --security-opt seccomp=unconfined \
            --privileged \
            -v \${HOME}:\${HOME} \
            -v \${HOME}/.ssh:/root/.ssh \
            -w \"${verl_workdir}\" \
            --shm-size 128G \
            --name \"${CONTAINER_NAME}\" \
            \"${IMG}\" \
            tail -f /dev/null

        echo \"Container setup completed\"
    "

    # (Optional): If you do not want to run in root mode and want to assign yourself
    # as the user, add `-e HOST_UID=$(id -u)` and `-e HOST_GID=$(id -g)` to the
    # docker launch command above.

    ### Launch Ray on the nodes before training
    # Get the node names
    nodes_array=($(scontrol show hostnames "$SLURM_JOB_NODELIST" | tr '\n' ' '))

    head_node=${nodes_array[0]}
    head_node_ip=$(srun --nodes=1 --ntasks=1 -w "$head_node" hostname --ip-address)

    # If we detect a space character in the head node IP, we'll
    # convert it to an IPv4 address. This step is optional.
    if [[ "$head_node_ip" == *" "* ]]; then
        IFS=' ' read -ra ADDR <<<"$head_node_ip"
        if [[ ${#ADDR[0]} -gt 16 ]]; then
            head_node_ip=${ADDR[1]}
        else
            head_node_ip=${ADDR[0]}
        fi
        echo "IPV6 address detected. We split the IPV4 address as $head_node_ip"
    fi

    port=6379
    ip_head=$head_node_ip:$port
    export ip_head
    echo "IP Head: $ip_head"

    # Make sure we set environment variables before Ray initialization
    # Print out all env variables
    printenv

    echo "Starting HEAD at $head_node"
    srun --nodes=1 --ntasks=1 -w "$head_node" \
        docker exec "${CONTAINER_NAME}" \
        ray start --head --node-ip-address="$head_node_ip" --port=$port \
        --dashboard-port=8266 \
        --num-cpus "${SLURM_CPUS_PER_TASK}" --num-gpus "${SLURM_GPUS_PER_NODE}" --block &

    # Optional, though may be useful in certain versions of Ray < 1.0.
    sleep 10

    # Number of nodes other than the head node
    worker_num=$((SLURM_JOB_NUM_NODES - 1))

    for ((i = 1; i <= worker_num; i++)); do
        node_i=${nodes_array[$i]}
        echo "Debug: Starting worker on node_i = ${node_i}"
        if [ -z "$node_i" ]; then
            echo "Error: Empty node name for worker $i"
            continue
        fi
        echo "Starting WORKER $i at $node_i"
        srun --nodes=1 --ntasks=1 -w "$node_i" \
            docker exec "${CONTAINER_NAME}" \
            ray start --address "$ip_head" --num-cpus "${SLURM_CPUS_PER_TASK}" --num-gpus "${SLURM_GPUS_PER_NODE}" --block &
        sleep 5
    done

    # Ray initialization test (check whether the above execution produced errors)
    echo "Testing Ray initialization in the slurm nodes..."
    docker exec "${CONTAINER_NAME}" python3 -c '
    import ray
    try:
        ray.init(address="auto")
        print("\n=== Ray Cluster Status ===")
        print(f"Number of nodes: {len(ray.nodes())}")
        for node in ray.nodes():
            print("Node: {}, Status: {}".format(node["NodeManagerHostname"], node["Alive"]))
        ray.shutdown()
        print("Ray initialization successful!")
    except Exception as e:
        print(f"Ray initialization failed: {str(e)}")
    '
    echo "=== Ray test completed ==="

    # Run data preprocessing
    echo "Starting data preprocessing..."
    docker exec "${CONTAINER_NAME}" \
        python3 "examples/data_preprocess/gsm8k.py" "--local_save_dir" "../data/gsm8k"

    echo "Starting data preprocessing..."
    docker exec "${CONTAINER_NAME}" \
        python3 "examples/data_preprocess/math_dataset.py" "--local_dir" "../data/math"

    train_files="../data/gsm8k/train.parquet"
    val_files="../data/gsm8k/test.parquet"

    # Download and test the model
    echo "Loading model..."
    MODEL_PATH="Qwen/Qwen2-7B-Instruct"
    # MODEL_PATH="Qwen/Qwen2.5-0.5B-Instruct"  # smaller alternative
    docker exec "${CONTAINER_NAME}" \
        python3 -c "import transformers; transformers.pipeline('text-generation', model='${MODEL_PATH}')"

    echo "== Data and model loading Done =="
    echo "Start to train..."

    PYTHONUNBUFFERED=1 srun --overlap --nodes=${SLURM_NNODES} --ntasks=1 -w "$head_node" \
        docker exec "${CONTAINER_NAME}" \
        python3 -m verl.trainer.main_ppo \
        data.train_files=$train_files \
        data.val_files=$val_files \
        data.train_batch_size=1024 \
        data.max_prompt_length=1024 \
        data.max_response_length=1024 \
        actor_rollout_ref.model.path=$MODEL_PATH \
        actor_rollout_ref.actor.optim.lr=1e-6 \
        actor_rollout_ref.model.use_remove_padding=True \
        actor_rollout_ref.actor.ppo_mini_batch_size=256 \
        actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=8 \
        actor_rollout_ref.model.enable_gradient_checkpointing=True \
        actor_rollout_ref.actor.fsdp_config.param_offload=False \
        actor_rollout_ref.actor.fsdp_config.optimizer_offload=False \
        actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=16 \
        actor_rollout_ref.rollout.tensor_model_parallel_size=2 \
        actor_rollout_ref.rollout.name=vllm \
        actor_rollout_ref.rollout.gpu_memory_utilization=0.9 \
        actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=16 \
        actor_rollout_ref.ref.fsdp_config.param_offload=True \
        critic.optim.lr=1e-5 \
        critic.model.use_remove_padding=True \
        critic.model.path=$MODEL_PATH \
        critic.model.enable_gradient_checkpointing=False \
        critic.ppo_micro_batch_size_per_gpu=8 \
        critic.model.fsdp_config.param_offload=False \
        critic.model.fsdp_config.optimizer_offload=False \
        algorithm.kl_ctrl.kl_coef=0.0001 \
        trainer.critic_warmup=0 \
        trainer.logger='["console","wandb"]' \
        trainer.project_name='verl_example' \
        trainer.experiment_name='Qwen2.5-32B-Instruct_function_rm' \
        trainer.n_gpus_per_node=${SLURM_GPUS_PER_NODE} \
        trainer.val_before_train=False \
        trainer.nnodes=${SLURM_NNODES} \
        trainer.save_freq=-1 \
        trainer.test_freq=10 \
        trainer.total_epochs=15

Run slurm_script.sh
~~~~~~~~~~~~~~~~~~~~

Simply ``sbatch`` your slurm_script.sh:

.. code-block:: bash

    sbatch slurm_script.sh
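
Once the job is queued, standard Slurm commands are enough to follow it. A short sketch (``<job_id>`` is whatever ``sbatch`` prints; the log path matches the ``#SBATCH --output`` setting in the script above):

.. code-block:: bash

    # Check that the job is running and on which nodes.
    squeue -u $USER

    # Follow the training log; %j in the #SBATCH directives expands to the job id.
    tail -f ../verl_log/slurm-<job_id>.out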