Skip to content

Commit

Permalink
feat: add image for ray
Browse files Browse the repository at this point in the history
  • Loading branch information
bincherry committed Oct 9, 2024
1 parent 528ebb9 commit 57a568b
Show file tree
Hide file tree
Showing 13 changed files with 677 additions and 40 deletions.
61 changes: 40 additions & 21 deletions common/online-files/start-h100.sh
Original file line number Diff line number Diff line change
@@ -1,6 +1,5 @@
#!/bin/bash


# 配置文件和路径
LOG_FILE="/tmp/boot.log"

Expand All @@ -20,7 +19,7 @@ function init_jupyter() {
echo '{"theme": "dark"}' > /root/.jupyter/lab/user-settings/@jupyterlab/terminal-extension/plugin.jupyterlab-settings
echo '{"locale": "zh_CN"}' > /root/.jupyter/lab/user-settings/@jupyterlab/translation-extension/plugin.jupyterlab-settings

cat <<EOF > /root/.jupyter/jupyter_config.py
cat << EOF > /root/.jupyter/jupyter_config.py
c = get_config()
c.ServerApp.ip = '0.0.0.0'
Expand Down Expand Up @@ -50,7 +49,7 @@ EOF
function init_supervisor() {
mkdir -p /init/supervisor

cat <<EOF > /init/supervisor/supervisor.ini
cat << EOF > /init/supervisor/supervisor.ini
[supervisord]
nodaemon=true
logfile=/tmp/supervisord.log
Expand Down Expand Up @@ -79,14 +78,40 @@ autorestart=true
stderr_logfile=/tmp/tensorboard.err.log
stdout_logfile=/tmp/tensorboard.out.log
EOF

if code-server --version > /dev/null 2>&1; then
cat << EOF >> /init/supervisor/supervisor.ini
[program:code-server]
command=/usr/bin/code-server --bind-addr 0.0.0.0:8889 --disable-telemetry --disable-update-check --disable-workspace-trust --disable-getting-started-override --auth none /root
environment=PASSWORD=%(ENV_CODESERVER_PASSWORD)s
autostart=true
autorestart=true
stderr_logfile=/tmp/code-server.err.log
EOF
fi

if ray --version > /dev/null 2>&1; then
cat << EOF >> /init/supervisor/supervisor.ini
[program:ray]
command=bash -c 'if [ -n "\$KUBERAY_GEN_RAY_START_CMD" ]; then bash -lc "ulimit -n 65536; \$KUBERAY_GEN_RAY_START_CMD"; else bash -c "ulimit -n 65536; ray start --head --block --port=6379"; fi'
autostart=true
autorestart=true
stderr_logfile=/tmp/ray.err.log
EOF
fi

cat << EOF >> /init/supervisor/supervisor.ini
[include]
files=/etc/supervisord/supervisor-other.ini
EOF
}

# 初始化 MOTD
function init_motd() {
cat <<EOF > /etc/matrixdc-motd
cat << EOF > /etc/matrixdc-motd
#!/bin/bash
printf "+----------------------------------------------------------------------------------------------------------------+\n"
Expand Down Expand Up @@ -168,7 +193,7 @@ function init_shutdown() {
rm /usr/sbin/shutdown
fi

cat <<EOF > /usr/bin/shutdown
cat << EOF > /usr/bin/shutdown
#!/bin/bash
rm -rf /root/.local/share/Trash
ps -ef | grep supervisord | grep -v grep | awk '{print \$2}' | xargs kill
Expand All @@ -178,7 +203,7 @@ EOF

# 初始化 Conda 源
function init_conda_source() {
cat <<EOF > /root/.condarc
cat << EOF > /root/.condarc
channels:
- https://mirrors.tuna.tsinghua.edu.cn/anaconda/pkgs/main/
- https://mirrors.tuna.tsinghua.edu.cn/anaconda/pkgs/free/
Expand All @@ -200,7 +225,7 @@ function init_pip_source() {
fi

# 更新 pip 源
cat <<EOF > /etc/pip.conf
cat << EOF > /etc/pip.conf
[global]
trusted-host = pypi.tsinghua.mirrors.com
index-url = https://pypi.tsinghua.mirrors.com/simple
Expand All @@ -222,7 +247,7 @@ function init_apt_source() {
fi
fi
# 写入新的 apt 源配置
cat <<EOF > /etc/apt/sources.list
cat << EOF > /etc/apt/sources.list
deb http://apt.tsinghua.mirrors.com/ubuntu/ jammy main restricted universe multiverse
deb http://apt.tsinghua.mirrors.com/ubuntu/ jammy-updates main restricted universe multiverse
deb http://apt.tsinghua.mirrors.com/ubuntu/ jammy-backports main restricted universe multiverse
Expand All @@ -232,7 +257,6 @@ EOF
# apt update
}


# 初始化 SSH 配置,延长空闲断连时间到30分钟
function init_ssh_config() {
config_lines=(
Expand All @@ -249,7 +273,6 @@ function init_ssh_config() {
done
}


# 函数: 写入日志
function log_info() {
echo "$1" >> "$LOG_FILE"
Expand Down Expand Up @@ -279,10 +302,9 @@ function set_ssh_password() {
log_info "passwd set finished"
else
log_info "Error: /sync/root-passwd file not found."
fi
fi
}


# 函数: 创建 TensorBoard 日志目录
function create_tensorboard_dir() {
mkdir -p "/root/tensorboard-logs"
Expand All @@ -297,8 +319,8 @@ function start_supervisord() {
cp -f /init/bin/* /bin/
if [ -f "/bin/supervisord" ]; then
log_info "supervisord bin set finished"
else
log_info "supervisord bin not found"
else
log_info "supervisord bin not found"
fi
else
log_info "/bin/supervisord文件存在"
Expand All @@ -308,8 +330,6 @@ function start_supervisord() {
/bin/supervisord -c /init/supervisor/supervisor.ini
}



# 主程序
function main() {
initialize_environment
Expand All @@ -331,13 +351,12 @@ function main() {

touch "$flag_file"

else
else
echo "Ignore..."
fi
fi

start_supervisord
start_supervisord
}


# 执行主程序
main
main
58 changes: 39 additions & 19 deletions common/online-files/start.sh
Original file line number Diff line number Diff line change
@@ -1,10 +1,8 @@
#!/bin/bash


# 配置文件和路径
LOG_FILE="/tmp/boot.log"


# 函数: 捕获异常
function try_catch() {
"$@"
Expand All @@ -21,7 +19,7 @@ function init_jupyter() {
echo '{"theme": "dark"}' > /root/.jupyter/lab/user-settings/@jupyterlab/terminal-extension/plugin.jupyterlab-settings
echo '{"locale": "zh_CN"}' > /root/.jupyter/lab/user-settings/@jupyterlab/translation-extension/plugin.jupyterlab-settings

cat <<EOF > /root/.jupyter/jupyter_config.py
cat << EOF > /root/.jupyter/jupyter_config.py
c = get_config()
c.ServerApp.ip = '0.0.0.0'
Expand Down Expand Up @@ -51,7 +49,7 @@ EOF
function init_supervisor() {
mkdir -p /init/supervisor

cat <<EOF > /init/supervisor/supervisor.ini
cat << EOF > /init/supervisor/supervisor.ini
[supervisord]
nodaemon=true
logfile=/tmp/supervisord.log
Expand Down Expand Up @@ -80,14 +78,40 @@ autorestart=true
stderr_logfile=/tmp/tensorboard.err.log
stdout_logfile=/tmp/tensorboard.out.log
EOF

if code-server --version > /dev/null 2>&1; then
cat << EOF >> /init/supervisor/supervisor.ini
[program:code-server]
command=/usr/bin/code-server --bind-addr 0.0.0.0:8889 --disable-telemetry --disable-update-check --disable-workspace-trust --disable-getting-started-override --auth none /root
environment=PASSWORD=%(ENV_CODESERVER_PASSWORD)s
autostart=true
autorestart=true
stderr_logfile=/tmp/code-server.err.log
EOF
fi

if ray --version > /dev/null 2>&1; then
cat << EOF >> /init/supervisor/supervisor.ini
[program:ray]
command=bash -c 'if [ -n "\$KUBERAY_GEN_RAY_START_CMD" ]; then bash -lc "ulimit -n 65536; \$KUBERAY_GEN_RAY_START_CMD"; else bash -c "ulimit -n 65536; ray start --head --block --port=6379"; fi'
autostart=true
autorestart=true
stderr_logfile=/tmp/ray.err.log
EOF
fi

cat << EOF >> /init/supervisor/supervisor.ini
[include]
files=/etc/supervisord/supervisor-other.ini
EOF
}

# 初始化 MOTD
function init_motd() {
cat <<EOF > /etc/matrixdc-motd
cat << EOF > /etc/matrixdc-motd
#!/bin/bash
printf "+----------------------------------------------------------------------------------------------------------------+\n"
Expand Down Expand Up @@ -169,7 +193,7 @@ function init_shutdown() {
rm /usr/sbin/shutdown
fi

cat <<EOF > /usr/bin/shutdown
cat << EOF > /usr/bin/shutdown
#!/bin/bash
rm -rf /root/.local/share/Trash
ps -ef | grep supervisord | grep -v grep | awk '{print \$2}' | xargs kill
Expand All @@ -179,7 +203,7 @@ EOF

# 初始化 Conda 源
function init_conda_source() {
cat <<EOF > /root/.condarc
cat << EOF > /root/.condarc
channels:
- https://mirrors.tuna.tsinghua.edu.cn/anaconda/pkgs/main/
- https://mirrors.tuna.tsinghua.edu.cn/anaconda/pkgs/free/
Expand All @@ -191,7 +215,7 @@ EOF

# 初始化 pip 源
function init_pip_source() {
cat <<EOF > /etc/pip.conf
cat << EOF > /etc/pip.conf
[global]
index-url = https://pypi.tuna.tsinghua.edu.cn/simple
trusted-host = pypi.tuna.tsinghua.edu.cn
Expand Down Expand Up @@ -239,7 +263,6 @@ function init_ssh_config() {
done
}


# 函数: 写入日志
function log_info() {
echo "$1" >> "$LOG_FILE"
Expand Down Expand Up @@ -269,10 +292,9 @@ function set_ssh_password() {
log_info "passwd set finished"
else
log_info "Error: /sync/root-passwd file not found."
fi
fi
}


# 函数: 创建 TensorBoard 日志目录
function create_tensorboard_dir() {
mkdir -p "/root/tensorboard-logs"
Expand All @@ -287,8 +309,8 @@ function start_supervisord() {
cp -f /init/bin/* /bin/
if [ -f "/bin/supervisord" ]; then
log_info "supervisord bin set finished"
else
log_info "supervisord bin not found"
else
log_info "supervisord bin not found"
fi
else
log_info "/bin/supervisord文件存在"
Expand All @@ -298,7 +320,6 @@ function start_supervisord() {
/bin/supervisord -c /init/supervisor/supervisor.ini
}


# 主程序
function main() {
initialize_environment
Expand All @@ -320,13 +341,12 @@ function main() {

touch "$flag_file"

else
else
echo "Ignore..."
fi
fi

start_supervisord
start_supervisord
}


# 执行主程序
main
main
25 changes: 25 additions & 0 deletions ray/serve_config_examples/Dockerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
# Example image for a Ray Serve text-ML application: the rayproject/ray
# 2.35.0-py310-cpu base image plus torch, transformers, a clone of the
# t5-small model repository and the text_ml.py application file.
FROM rayproject/ray:2.35.0-py310-cpu

# The package installation steps below need root privileges.
USER root

# git and git-lfs are required for the model clone further down.
# The apt lists are removed in the same layer so they do not bloat the image.
RUN apt-get update && \
    apt-get install -y --no-install-recommends git git-lfs && \
    apt-get autoremove -y && \
    apt-get clean && \
    rm -rf /var/lib/apt/lists/*

# build-essential is only present while pip runs and is purged again in the
# same layer. Both pip installs are constrained (-c) by
# /home/ray/requirements_compiled.txt (presumably shipped with the base
# image — confirm) so the resolved versions stay compatible with it.
RUN apt-get update && \
    apt-get install -y --no-install-recommends build-essential && \
    pip install torch --no-cache-dir -c /home/ray/requirements_compiled.txt && \
    pip install transformers --no-cache-dir -c /home/ray/requirements_compiled.txt && \
    apt-get purge -y build-essential && \
    apt-get autoremove -y && \
    apt-get clean && \
    rm -rf /var/lib/apt/lists/*

# Drop back to the unprivileged user for everything that ends up in the
# runtime environment.
USER ray

# Register the Git LFS filters for the ray user before cloning; without them
# any LFS-tracked files in the repository are checked out as small pointer
# stubs instead of their real content.
# NOTE(review): the clone lands in the base image's working directory,
# presumably /home/ray — confirm.
RUN git lfs install && \
    git clone https://hf-mirror.com/google-t5/t5-small

# COPY rather than ADD: this is a plain local file, so ADD's extra behaviour
# (remote URLs, automatic archive extraction) is not wanted.
COPY --chown=ray:users text_ml.py /home/ray/text_ml.py

WORKDIR /home/ray
Loading

0 comments on commit 57a568b

Please sign in to comment.