# Source: https://cmjava.ltd:8090/upload/2024/05/nvdia-auto_install.sh
#!/bin/sh
# ---------------------------------------------------------------------------
# Working directories (created up front) and the shared install log.
# ---------------------------------------------------------------------------
AUTO_INSTALL="/root/auto_install"
CUDA_DIR="${AUTO_INSTALL}/cuda"
mkdir -p "${AUTO_INSTALL}" "${CUDA_DIR}"
log="${AUTO_INSTALL}/auto_install.log"

# ---------------------------------------------------------------------------
# Patterns matched against `ps -ef` output to detect which step is running.
# ---------------------------------------------------------------------------
PROCESS_NAME="$0"
DRIVER_PROCESS_NAME="${AUTO_INSTALL}/NVIDIA-Linux-x86_64"
CUDA_PROCESS_NAME="${AUTO_INSTALL}/cuda"
DOWNLOAD_DRIVER="NVIDIA-Linux"
RAPIDS_PROCESS_NAME="${AUTO_INSTALL}/rapids"
AIACC_TRAIN_PROCESS_NAME="${AUTO_INSTALL}/aiacc_train"
AIACC_INFERENCE_PROCESS_NAME="${AUTO_INSTALL}/aiacc_inference"
CUDNN_PROCESS_NAME="${AUTO_INSTALL}/cudnn"
NCCL_PROCESS_NAME="${AUTO_INSTALL}/nccl"
DOWNLOAD_PROCESS_NAME="wget"
RDMA_PROCESS_NAME_0="aiacc-scc-rdma.sh 0"
RDMA_PROCESS_NAME_1="aiacc-scc-rdma.sh 1"
ERDMA_PROCESS_NAME="auto_install_erdma.sh"

# ---------------------------------------------------------------------------
# Marker strings searched for in the install log.
# ---------------------------------------------------------------------------
SUCCESS_STR="ALL INSTALL OK"
DOWNLOAD_SUCCESS_STR="Download OK"
DRIVER_FAIL_STR="Driver INSTALL FAIL"
CUDA_FAIL_STR="CUDA INSTALL FAIL"
CUDNN_FAIL_STR="CUDNN INSTALL FAIL"
NCCL_FAIL_STR="NCCL INSTALL FAIL"
AIACC_TRAIN_FAIL_STR="AIACC-Training ENV INSTALL FAIL"
AIACC_INFERENCE_FAIL_STR="AIACC-Inference ENV INSTALL FAIL"
RAPIDS_FAIL_STR="RAPIDS INSTALL FAIL"
DOWNLOAD_FAIL_STR="Download FAIL"
RDMA_NIC_DRIVER_FAIL_STR="Mellanox NIC Driver INSTALL FAIL"
RDMA_PEER_MEM_FAIL_STR="nvidia_peermem module INSTALL FAIL"
ERDMA_FAIL_STR="ERDMA INSTALL FAIL"

# ---------------------------------------------------------------------------
# Instance type as reported by the metadata endpoint; the family is the
# second dot-separated field of that value.
# ---------------------------------------------------------------------------
INSTANCE_TYPE=$(curl http://100.100.100.200/latest/meta-data/instance/instance-type)
INSTANCE_FAMILY=$(echo ${INSTANCE_TYPE} | awk -F '.' '{print $2}')

# Banner text printed by check_install_process before it starts polling.
install_notes="The script automatically downloads and installs a NVIDIA GPU driver and CUDA, CUDNN library if you choose GPU auto install. if you choose install RDMA or ERDMA, RDMA or ERDMA software will install.
if you choose install perseus, perseus environment will install as well.
1. The installation takes 15 to 20 minutes, depending on the intranet bandwidth and the quantity of vCPU cores of the instance. Please do not operate the GPU or install any GPU-related software until the GPU driver is installed successfully.
2. After the GPU is installed successfully, the instance will restarts automatically."
#######################################
# Draw a text progress bar while an installer step is still running.
#
# The step is considered running for as long as `ps -ef` shows a process
# whose command line matches the pattern associated with the component.
# The bar advances one column per poll, is capped at 90% while the process
# is alive, and jumps to 100% once the process has gone.
#
# Globals:   *_PROCESS_NAME patterns (read); b, t, i, ProcessName, pid_num,
#            str (written - the script does not use `local`)
# Arguments: $1 - component: NVIDIA | cuda | cudnn | nccl | rapids |
#                 aiacc-train | aiacc-inference | "scc-rdma 0" |
#                 "scc-rdma 1" | erdma
# Outputs:   progress bar on stdout, diagnostics on stderr
# Returns:   0 when the watched process has finished,
#            1 when the component is unknown or has no process pattern
#######################################
check_install()
{
    b=''
    case "$1" in
        NVIDIA)
            ProcessName=${DRIVER_PROCESS_NAME}
            t=2
            ;;
        cuda)
            ProcessName=${CUDA_PROCESS_NAME}
            t=3
            ;;
        cudnn)
            ProcessName=${CUDNN_PROCESS_NAME}
            t=0.5
            ;;
        nccl)
            ProcessName=${NCCL_PROCESS_NAME}
            t=0.5
            ;;
        rapids)
            ProcessName=${RAPIDS_PROCESS_NAME}
            t=1
            ;;
        aiacc-train)
            ProcessName=${AIACC_TRAIN_PROCESS_NAME}
            t=1
            ;;
        aiacc-inference)
            ProcessName=${AIACC_INFERENCE_PROCESS_NAME}
            t=1
            ;;
        "scc-rdma 0")
            ProcessName="${RDMA_PROCESS_NAME_0}"
            t=2
            ;;
        "scc-rdma 1")
            ProcessName="${RDMA_PROCESS_NAME_1}"
            t=1
            ;;
        erdma)
            ProcessName="${ERDMA_PROCESS_NAME}"
            t=2
            ;;
        *)
            # Without this guard an unknown component would poll with an
            # empty (or stale) pattern: `grep ""` matches every process and
            # `sleep ""` fails, so the loop below would spin forever.
            echo "check_install: unknown component '$1'" >&2
            return 1
            ;;
    esac
    if [ -z "${ProcessName}" ]; then
        echo "check_install: no process pattern defined for '$1'" >&2
        return 1
    fi

    i=0
    while true
    do
        pid_num=$(ps -ef | grep -- "${ProcessName}" | grep -v grep | wc -l)
        if [ "$pid_num" -eq 0 ]; then
            # Process gone: print a full 100-column bar and stop polling.
            str=$(printf "%-100s" "#")
            b=$(echo "$str" | sed 's/ /#/g')
            printf "| %-100s | %d%% \r\n" "$b" "100";
            break
        fi
        # Advance the bar, but never beyond 90% while the step is running.
        i=$((i + 1))
        if [ "$i" -ge 90 ]; then
            i=90
        fi
        # Pad "#" to i columns, then turn the padding into '#' characters.
        str=$(printf "%-${i}s" "#")
        b=$(echo "$str" | sed 's/ /#/g')
        printf "| %-100s | %d%% \r" "$b" "$i";
        sleep "$t"
    done
    echo
    return 0
}
#######################################
# Show download progress for a file being fetched by wget.
#
# While a wget process whose command line contains NAME is alive, the last
# two lines of the install log are parsed for the transferred size (first
# field) and the percentage (number in front of the first '%'), and a bar of
# that width is drawn. When no such process remains, the on-disk size of
# ${AUTO_INSTALL}/NAME* is printed with a full bar.
#
# Globals:   AUTO_INSTALL, log (read); name, i, b, filesize, percent,
#            pid_num, line, str (written)
# Arguments: $1 - substring identifying the download (also the file prefix)
# Outputs:   progress bar on stdout
# Returns:   0
#######################################
check_download()
{
    name=$1
    i=0
    b=''
    filesize=0
    percent=0
    sleep 0.5
    while true
    do
        pid_num=$(ps -ef | grep wget | grep -- "${name}" | grep -v grep | wc -l)
        if [ "$pid_num" -eq 0 ]; then
            # The prefix may match several entries (e.g. an archive and the
            # directory it was unpacked into); sum them so the size stays a
            # single number instead of a multi-line string.
            filesize=$(du -sk "${AUTO_INSTALL}/${name}"* | awk '{ total += $1 } END { print total + 0 }')
            str=$(printf "%-100s" "#")
            b=$(echo "$str" | sed 's/ /#/g')
            printf "%-8s| %-100s | %d%% \r\n" "${filesize}K" "$b" "100";
            break
        fi
        line=$(tail -2 "${log}")
        # $line is left unquoted on purpose: the two log lines are joined
        # into one record before awk picks out the fields.
        filesize=$(echo $line | awk -F ' ' '{print $1}')
        percent=$(echo $line | awk -F '%' '{print $1}' | awk -F ' ' '{print $NF}')
        # Only redraw when the parsed percentage is a number; the numeric
        # test fails (silenced) on anything else.
        if [ "$percent" -ge 0 ] 2>/dev/null; then
            str=$(printf "%-${percent}s" "#")
            b=$(echo "$str" | sed 's/ /#/g')
            printf "%-8s| %-100s | %d%% \r" "${filesize}" "$b" "$percent";
        fi
        # Always pause between polls. Previously an unparsable log tail
        # skipped the sleep and the loop re-ran ps/grep/tail flat out.
        sleep 0.5
    done
    return 0
}
#######################################
# Inspect the install log for the outcome of one installation step.
#
# With "NVIDIA" the overall result is checked: success requires the
# SUCCESS_STR marker to be present and no "INSTALL_ERROR" line. For any
# other component the log is searched for that component's failure marker.
#
# Globals:   log, SUCCESS_STR, *_FAIL_STR (read); succstr, str2, failstr
#            (written)
# Arguments: $1 - NVIDIA | DRIVER | CUDA | CUDNN | NCCL | AIACC-Inference |
#                 AIACC-Train | RAPIDS | "scc-rdma 0" | "scc-rdma 1" | ERDMA
# Outputs:   result messages on stdout, usage diagnostics on stderr
# Returns:   0 when no failure was found, 1 on failure or unknown component.
#            Exits the script with status 1 when the log file is missing.
#######################################
check_install_log()
{
    if [ ! -f "$log" ]; then
        echo "NVIDIA install log $log not exist! Install may fail!"
        echo
        exit 1
    fi

    if [ "$1" = "NVIDIA" ]; then
        succstr=$(grep -- "${SUCCESS_STR}" "$log")
        str2=$(grep "INSTALL_ERROR" "$log")
        if [ -n "${succstr}" ] && [ -z "${str2}" ]; then
            echo "${succstr} !!"
            echo
            return 0
        fi
        echo "Install may have some INSTALL_ERROR, please check log $log !"
        return 1
    fi

    # Reset first: variables are global here, so a value left over from a
    # previous call must not be reused for an unrecognised component.
    failstr=''
    case "$1" in
        DRIVER)          failstr=${DRIVER_FAIL_STR} ;;
        CUDA)            failstr=${CUDA_FAIL_STR} ;;
        CUDNN)           failstr=${CUDNN_FAIL_STR} ;;
        NCCL)            failstr=${NCCL_FAIL_STR} ;;
        AIACC-Inference) failstr=${AIACC_INFERENCE_FAIL_STR} ;;
        AIACC-Train)     failstr=${AIACC_TRAIN_FAIL_STR} ;;
        RAPIDS)          failstr=${RAPIDS_FAIL_STR} ;;
        "scc-rdma 0")    failstr=${RDMA_NIC_DRIVER_FAIL_STR} ;;
        "scc-rdma 1")    failstr=${RDMA_PEER_MEM_FAIL_STR} ;;
        ERDMA)           failstr=${ERDMA_FAIL_STR} ;;
    esac
    if [ -z "${failstr}" ]; then
        # An empty pattern would match every log line and report a bogus
        # " ! please check install log" failure.
        echo "check_install_log: unknown component '$1'" >&2
        return 1
    fi

    if grep -q -- "${failstr}" "$log"; then
        echo
        echo "${failstr} ! please check install log ${log} !"
        return 1
    fi
    return 0
}
# check_install_process DRIVER CUDA CUDNN AIACC_TRAIN AIACC_INFERENCE RDMA ERDMA
#
# Interactive monitor for a running installation. Prints a banner built from
# its arguments plus $install_notes, then polls `ps -ef` once per second:
#   - when no process matching ${PROCESS_NAME} (excluding "check" command
#     lines) is left, the overall log result is reported via
#     check_install_log "NVIDIA" and the function returns 0;
#   - otherwise each known download / install step is probed in turn and,
#     if its process is found, a progress bar is shown (check_download or
#     check_install) followed by a log check for the install steps.
#
# Arguments are only used in the printed messages:
#   $1 driver version (the literal "NULL" selects the short banner),
#   $2 CUDA version, $3 cuDNN version, $4 AIACC-Training flag,
#   $5 AIACC-Inference flag, $6 RDMA flag, $7 eRDMA flag.
# The return values of the check_* helpers called in the loop are not
# inspected here.
check_install_process()
{
# Banner: short form when no driver version was given.
if [ "$1" = "NULL" ]; then
echo "CHECKING AUTO INSTALL,INSTALL RDMA=${6}, PLEASE WAIT ......"
else
echo "CHECKING AUTO INSTALL, DRIVER_VERSION=${1} CUDA_VERSION=${2} CUDNN_VERSION=${3} INSTALL AIACC-Training=${4} INSTALL AIACC-Inference=${5} , INSTALL RDMA=${6}, INSTALL eRDMA=${7} PLEASE WAIT ......"
fi
echo "$install_notes"
echo
while true
do
# Is the installer itself still running? Command lines containing "check"
# (such as this monitor's own invocation) are filtered out.
pid_num=$(ps -ef | grep ${PROCESS_NAME} |grep -v grep | grep -v check | wc -l)
if [ $pid_num -eq 0 ];then
# Installer finished: report the overall result and stop monitoring.
check_install_log "NVIDIA"
return 0
else
# --- download phases: a wget process whose command line names the item ---
pid_num=$(ps -ef | grep ${DOWNLOAD_PROCESS_NAME} |grep driver | grep -v rdma |grep -v grep | wc -l)
if [ $pid_num -gt 0 ];then
echo "Driver-${1} downloading, it takes 30 seconds or more. Remaining installation time 15 to 20 minutes!"
check_download ${DOWNLOAD_DRIVER}
fi
# CUDA download: other items whose URLs also contain "cuda" are excluded.
pid_num=$(ps -ef | grep ${DOWNLOAD_PROCESS_NAME} |grep cuda |grep -v nccl |grep -v rapids |grep -v miniconda |grep -v Tensor |grep -v grep | wc -l)
if [ $pid_num -gt 0 ];then
echo "CUDA-${2} downloading, it takes 3 minutes or more. Remaining installation time 14 - 19 minutes!"
# Keep showing progress until no CUDA wget remains; a fresh check_download
# is started for every wget that is still found after the previous one.
while true
do
check_download "cuda"
sleep 1
pid_num=$(ps -ef | grep ${DOWNLOAD_PROCESS_NAME} |grep cuda |grep -v nccl |grep -v rapids |grep -v miniconda |grep -v Tensor | grep -v grep | wc -l)
if [ $pid_num -eq 0 ];then
break
fi
done
fi
pid_num=$(ps -ef | grep ${DOWNLOAD_PROCESS_NAME} |grep cudnn |grep -v Tensor |grep -v grep | wc -l)
if [ $pid_num -gt 0 ];then
echo "cuDNN-${3} downloading, it tasks 1 minutes or more. Remaining installation time 12 - 16 minutes!"
check_download "cudnn"
fi
# RAPIDS package download
pid_num=$(ps -ef | grep ${DOWNLOAD_PROCESS_NAME} |grep rapids |grep -v grep | wc -l)
if [ $pid_num -gt 0 ];then
echo "RAPIDS downloading, it tasks 3 minutes or more. Remaining installation time 4 - 6 minutes!"
check_download "rapids"
fi
# AIACC-Training package download
pid_num=$(ps -ef | grep ${DOWNLOAD_PROCESS_NAME} | grep aiacc-train |grep -v grep | wc -l)
if [ $pid_num -gt 0 ];then
echo "AIACC-Trainging downloading, it tasks 3 minutes or more. Remaining installation time 4 - 6 minutes!"
check_download "aiacc_train"
fi
# AIACC-Inference package download (TensorRT downloads are excluded)
pid_num=$(ps -ef | grep ${DOWNLOAD_PROCESS_NAME} | grep aiacc-inference | grep -v Tensor | grep -v grep | wc -l)
if [ $pid_num -gt 0 ];then
echo "AIACC-Inference downloading, it tasks 3 minutes or more. Remaining installation time 4 - 6 minutes!"
check_download "aiacc_inference"
fi
# --- install phases: a process matching the component's *_PROCESS_NAME ---
pid_num=$(ps -ef | grep "${DRIVER_PROCESS_NAME}" |grep -v grep | wc -l)
if [ $pid_num -gt 0 ];then
echo
echo "Driver-${1} installing, it tasks 1 to 3 minutes. Remaining installation time 11 to 15 minutes!"
check_install "NVIDIA"
check_install_log "DRIVER"
fi
pid_num=$(ps -ef | grep "${CUDA_PROCESS_NAME}" |grep -v nccl |grep -v grep | wc -l)
if [ $pid_num -gt 0 ];then
echo "CUDA-${2} installing, it tasks 2 to 5 minutes. Remaining installation time 9 to 12 minutes!"
check_install "cuda"
check_install_log "CUDA"
fi
pid_num=$(ps -ef | grep ${CUDNN_PROCESS_NAME} |grep -v grep | wc -l)
if [ $pid_num -gt 0 ];then
echo "cuDNN-${3} installing, it takes about 10 seconds. Remaining installation time 6 to 9 minutes!"
check_install "cudnn"
check_install_log "CUDNN"
fi
pid_num=$(ps -ef | grep ${NCCL_PROCESS_NAME} |grep -v grep | wc -l)
if [ $pid_num -gt 0 ];then
echo "NCCL installing, it taskes about 10 seconds. "
check_install "nccl"
check_install_log "NCCL"
fi
pid_num=$(ps -ef | grep ${RAPIDS_PROCESS_NAME} |grep -v grep | wc -l)
if [ $pid_num -gt 0 ];then
echo "RAPIDS installing, it taskes about 60 seconds. Installation will be successful soon, please wait......"
check_install "rapids"
check_install_log "RAPIDS"
fi
# The AIACC steps are recognised by a tar process working on their directory.
pid_num=$(ps -ef | grep ${AIACC_TRAIN_PROCESS_NAME} |grep tar |grep -v grep | wc -l)
if [ $pid_num -gt 0 ];then
echo "AIACC-Training installing, it taskes about 60 seconds. Please wait......"
check_install "aiacc-train"
check_install_log "AIACC-Train"
fi
pid_num=$(ps -ef | grep ${AIACC_INFERENCE_PROCESS_NAME} |grep tar | grep -v Tensor| grep -v grep | wc -l)
if [ $pid_num -gt 0 ];then
echo "AIACC-Inference installing, it taskes about 60 seconds. Please wait......"
check_install "aiacc-inference"
check_install_log "AIACC-Inference"
fi
# --- RDMA / eRDMA helper scripts ---
pid_num=$(ps -ef | grep "${RDMA_PROCESS_NAME_0}" | grep -v grep | wc -l)
if [ $pid_num -gt 0 ];then
echo "Installing Mellanox NIC Driver, NIC Libs and Network configure, it takes about 120 seconds, please wait......"
check_install "scc-rdma 0"
check_install_log "scc-rdma 0"
fi
pid_num=$(ps -ef | grep "${RDMA_PROCESS_NAME_1}" | grep -v grep | wc -l)
if [ $pid_num -gt 0 ];then
echo "Installing nv_peer_mem/nvidia_peermem module, it takes about 30 seconds, please wait......"
check_install "scc-rdma 1"
check_install_log "scc-rdma 1"
fi
pid_num=$(ps -ef | grep "${ERDMA_PROCESS_NAME}" | grep -v grep | wc -l)
if [ $pid_num -gt 0 ];then
echo "Installing erdma driver, it takes about 180 seconds, please wait......"
check_install "erdma"
check_install_log "ERDMA"
fi
fi
# Poll interval between two full passes over all steps.
sleep 1
done
}
# Append an [ecs-driver] yum repository definition to
# /etc/yum.repos.d/nvidia.repo and refresh the yum metadata.
#
# The repository base URL is the first line returned by the metadata
# endpoint's source-address entry; ${version} selects the release directory.
# "$basearch" is written literally so that yum substitutes it.
# (The matching [ecs-cuda] repository is intentionally not written.)
# yum output is appended to $log.
create_nvidia_repo_centos()
{
    baseurl_centos=$(curl http://100.100.100.200/latest/meta-data/source-address | head -1)
    driverurl="${baseurl_centos}/opsx/ecs/linux/rpm/driver/${version}/\$basearch/"
    cat >> /etc/yum.repos.d/nvidia.repo <<EOF
[ecs-driver]
name=ecs driver - \$basearch
baseurl=${driverurl}
enabled=1
gpgcheck=0
EOF
    yum clean all >> "$log" 2>&1
    yum makecache >> "$log" 2>&1
}
# Replace the CentOS 8 yum repositories with the vault / EPEL archive ones.
#
# Existing *.repo files are renamed to *.repo.bak, the two replacement repo
# files are downloaded, and their mirror host names are rewritten:
#   - vault repo: mirrors.cloud.aliyuncs.com and mirrors.aliyun.com are
#     swapped (url_tmp is the temporary placeholder used for the swap);
#   - EPEL repo: mirrors.aliyun.com becomes mirrors.cloud.aliyuncs.com.
create_repo_centos8()
{
    rename '.repo' '.repo.bak' /etc/yum.repos.d/*.repo

    wget -O /etc/yum.repos.d/Centos-vault-8.5.2111.repo \
        https://mirrors.aliyun.com/repo/Centos-vault-8.5.2111.repo
    wget -O /etc/yum.repos.d/epel-archive-8.repo \
        https://mirrors.aliyun.com/repo/epel-archive-8.repo

    # The three substitutions run in order on every line, which is the same
    # as three successive passes over the file.
    sed -i \
        -e 's/mirrors.cloud.aliyuncs.com/url_tmp/g' \
        -e 's/mirrors.aliyun.com/mirrors.cloud.aliyuncs.com/g' \
        -e 's/url_tmp/mirrors.aliyun.com/g' \
        /etc/yum.repos.d/Centos-vault-8.5.2111.repo

    sed -i 's/mirrors.aliyun.com/mirrors.cloud.aliyuncs.com/g' /etc/yum.repos.d/epel-archive-8.repo
}
# Keep the nouveau kernel module from being used (CentOS).
#
# Writes /etc/modprobe.d/blacklist-nouveau.conf if it does not exist yet.
# If nouveau is currently listed by lsmod it is removed and the initramfs
# is regenerated with dracut.
disable_nouveau_centos()
{
    if [ ! -f /etc/modprobe.d/blacklist-nouveau.conf ]; then
        printf '%s\n' \
            "blacklist nouveau" \
            "options nouveau modeset=0" \
            > /etc/modprobe.d/blacklist-nouveau.conf
    fi

    content=$(lsmod | grep nouveau)
    if [ -z "$content" ]; then
        # Module not loaded: nothing to unload or rebuild.
        return 0
    fi

    rmmod nouveau
    echo "***exec \"dracut --force\" to regenerate the kernel initramfs"
    dracut --force
}
# Keep the nouveau kernel module from being used (Alibaba Cloud Linux).
#
# Writes /etc/modprobe.d/blacklist-nouv.conf if it does not exist yet.
# If nouveau is currently listed by lsmod it is removed and the initramfs
# is regenerated with dracut.
disable_nouveau_alinux()
{
    if [ ! -f /etc/modprobe.d/blacklist-nouv.conf ]; then
        {
            echo "blacklist nouveau"
            echo "options nouveau modeset=0"
        } > /etc/modprobe.d/blacklist-nouv.conf
    fi

    # Module not loaded: nothing to unload or rebuild.
    lsmod | grep -q nouveau || return 0

    rmmod nouveau
    echo "***exec \"dracut --force\" to regenerate the kernel initramfs"
    dracut --force
}
# Keep the nouveau kernel module from being used (Ubuntu).
#
# Writes /etc/modprobe.d/blacklist-nouveau.conf (nouveau and lbm-nouveau)
# if it does not exist yet. If nouveau is currently listed by lsmod it is
# removed and the initramfs is regenerated with update-initramfs.
disable_nouveau_ubuntu()
{
    if [ ! -f /etc/modprobe.d/blacklist-nouveau.conf ]; then
        printf '%s\n' \
            "blacklist nouveau" \
            "blacklist lbm-nouveau" \
            "options nouveau modeset=0" \
            > /etc/modprobe.d/blacklist-nouveau.conf
    fi

    content=$(lsmod | grep nouveau)
    if [ -z "$content" ]; then
        # Module not loaded: nothing to unload or rebuild.
        return 0
    fi

    rmmod nouveau
    echo "***exec \"update-initramfs -u\" to regenerate the kernel initramfs"
    update-initramfs -u
}
# Make sure kernel-devel for the running kernel is installed (CentOS).
#
# Returns 0 when a matching kernel-devel package is already listed by rpm
# or yum installs it successfully, 1 when the yum install fails.
install_kernel_centos()
{
    kernel_version=$(uname -r)
    kernel_devel_num=$(rpm -qa | grep kernel-devel | grep -c "$kernel_version")

    # A matching package is already present.
    if [ "$kernel_devel_num" -gt 0 ]; then
        return 0
    fi

    echo "******exec \"yum install -y kernel-devel-$kernel_version\""
    if ! yum install -y "kernel-devel-$kernel_version"; then
        echo "INSTALL_ERROR: install kernel-devel fail!!!"
        return 1
    fi
    return 0
}
# Make sure kernel-devel for the running kernel is installed
# (Alibaba Cloud Linux).
#
# Returns 0 when a matching kernel-devel package is already listed by rpm
# or yum installs it successfully, 1 when the yum install fails.
install_kernel_alinux()
{
    kernel_version=$(uname -r)

    # A matching package is already present.
    if rpm -qa | grep kernel-devel | grep -q "$kernel_version"; then
        return 0
    fi

    echo "******exec \"yum install -y kernel-devel-$kernel_version\""
    yum install -y "kernel-devel-$kernel_version" || {
        echo "INSTALL_ERROR: install kernel-devel fail!!!"
        return 1
    }
    return 0
}
# Make sure kernel-default-devel is installed (SLES).
#
# The package version requested is the first two dash-separated fields of
# `uname -r`. Nothing is installed when rpm already lists any
# kernel-default-devel package.
# Returns 1 when the zypper install fails, 0 otherwise.
install_kernel_sles()
{
    kernel_version=$(uname -r | awk -F'-' '{print $1"-"$2}')
    kernel_devel_num=$(rpm -qa | grep -c kernel-default-devel)

    # Some kernel-default-devel package is already present.
    if [ "$kernel_devel_num" -gt 0 ]; then
        return 0
    fi

    echo "***exec \"zypper install -y kernel-default-devel=$kernel_version\""
    if ! zypper install -y "kernel-default-devel=$kernel_version"; then
        echo "INSTALL_ERROR: install kernel-default-devel fail!!!"
        return 1
    fi
    return 0
}
# Make sure linux-headers for the running kernel is installed (Ubuntu).
#
# Nothing is installed when dpkg already lists a matching linux-headers
# package. Returns 1 when the apt-get install fails, 0 otherwise.
install_kernel_ubuntu()
{
    kernel_version=$(uname -r)
    linux_headers_num=$(dpkg --list | grep linux-headers | grep -c "$kernel_version")

    # Matching headers are already present.
    if [ "$linux_headers_num" -gt 0 ]; then
        return 0
    fi

    echo "***exec \"apt-get install -y --allow-unauthenticated linux-headers-$kernel_version\""
    if ! apt-get install -y --allow-unauthenticated "linux-headers-$kernel_version"; then
        echo "INSTALL_ERROR: install linux-headers fail!!!"
        return 1
    fi
    return 0
}
#######################################
# Make sure linux-headers for the running kernel is installed (Debian).
#
# Kernel 4.19.0-17-amd64 is handled specially: a header bundle is downloaded
# from ${download_url} and its four .deb packages are installed with dpkg in
# a fixed order. Any other kernel gets linux-headers-<version> through
# apt-get unless dpkg already lists it.
#
# Globals:   AUTO_INSTALL, download_url (read); kernel_version,
#            linux_headers_num (written)
# Side effects: runs `apt-get update`; on the special path the working
#            directory is left inside ${AUTO_INSTALL}/linux-header.
# Returns:   0 on success, 1 when any installation step fails
#######################################
install_kernel_debian()
{
    apt-get update
    kernel_version=$(uname -r)
    if [ "${kernel_version}" = "4.19.0-17-amd64" ]; then
        # Every step is chained: previously only the status of the last
        # dpkg call was examined, so a failed download, unpack or earlier
        # package install went unnoticed.
        cd "${AUTO_INSTALL}" \
            && wget -t 100 --timeout=10 "${download_url}/rdma/sccgn7ex/Debian10u10/linux-header.tar.gz" \
            && tar zxf linux-header.tar.gz \
            && cd linux-header \
            && dpkg -i linux-kbuild-4.19_4.19.194-3_amd64.deb \
            && dpkg -i linux-compiler-gcc-8-x86_4.19.194-3_amd64.deb \
            && dpkg -i linux-headers-4.19.0-17-common_4.19.194-3_all.deb \
            && dpkg -i linux-headers-4.19.0-17-amd64_4.19.194-3_amd64.deb
        if [ $? -ne 0 ]; then
            echo "INSTALL_ERROR: install linux-headers fail!!!"
            return 1
        fi
    else
        linux_headers_num=$(dpkg --list | grep linux-headers | grep -- "$kernel_version" | wc -l)
        if [ "$linux_headers_num" -eq 0 ]; then
            echo "***exec \"apt-get install -y linux-headers-$kernel_version\""
            if ! apt-get install -y "linux-headers-$kernel_version"; then
                echo "INSTALL_ERROR: install linux-headers fail!!!"
                return 1
            fi
        fi
    fi
    return 0
}
#######################################
# Download the driver installer, the CUDA installer files and the cuDNN
# archive.
#
# The driver goes to ${AUTO_INSTALL}; the CUDA files named in the directory
# listing at ${download_url}/nvidia/cuda/${cuda_version}/ go to ${CUDA_DIR};
# the cuDNN archive goes to ${AUTO_INSTALL}. On CentOS 6 with a CUDA
# release other than 8.0/9.0/9.2/10.0 the "rhel6" sub-directory is used;
# otherwise entries containing "ubuntu" are dropped from the listing.
#
# Globals:   AUTO_INSTALL, CUDA_DIR, download_url, driver_file, os, version,
#            cuda_version, cuda_big_version, cudnn_file,
#            DOWNLOAD_SUCCESS_STR (read); rc, cuda_url, cuda_exclude_ubuntu,
#            cudafilelist, cudafile, cuda_major, cudnn_dir (written)
# Returns:   0 when everything was downloaded, 1 on the first failure
#######################################
download()
{
    cd "${AUTO_INSTALL}"
    wget -t 100 --timeout=10 "${download_url}/nvidia/driver/${driver_file}"
    # Capture the status immediately: inside the `if` body, $? would be the
    # result of the `[ ... ]` test (always 0), not of wget.
    rc=$?
    if [ $rc -ne 0 ]; then
        echo "INSTALL_ERROR: Download driver fail!!! return: $rc"
        return 1
    fi

    # Pick the directory that holds the CUDA installer files.
    cuda_url="${download_url}/nvidia/cuda/${cuda_version}"
    cuda_exclude_ubuntu=1
    if [ "$os" = "centos" ] && [ "$version" = "6" ]; then
        case "${cuda_big_version}" in
            8.0|9.0|9.2|10.0)
                ;;
            *)
                # cuda10.1 cuda10.2 , cuda11 not support rhel6
                cuda_url="${cuda_url}/rhel6"
                cuda_exclude_ubuntu=0
                ;;
        esac
    fi

    # Scrape the file names (link texts starting with "cuda") from the
    # directory listing.
    curl "${cuda_url}/" > ./tmp
    if [ "${cuda_exclude_ubuntu}" -eq 1 ]; then
        echo "${cuda_url}/"
        cudafilelist=$(perl -n -e'/>(cuda[^<]*)</ && print "$1 \n"' < ./tmp | grep -v ubuntu)
    else
        cudafilelist=$(perl -n -e'/>(cuda[^<]*)</ && print "$1 \n"' < ./tmp)
    fi
    if [ -z "$cudafilelist" ]; then
        echo "INSTALL_ERROR: Download CUDA fail!!! get cuda-${cuda_version} filename fail!!"
        return 1
    fi

    cd "${CUDA_DIR}"
    # Unquoted on purpose: prints the list on one line / iterates per name.
    echo $cudafilelist
    for cudafile in $cudafilelist
    do
        sleep 1
        wget -t 100 --timeout=10 "${cuda_url}/${cudafile}"
        rc=$?
        if [ $rc -ne 0 ]; then
            echo "INSTALL_ERROR: Download CUDA fail!!! wget $cudafile fail! return: $rc"
            return 1
        fi
    done
    chmod +x "${CUDA_DIR}"/*

    cd "${AUTO_INSTALL}"
    # Select the cuDNN directory from the numeric CUDA major version. A
    # string comparison against "12" is wrong here: "9.0" sorts after "12",
    # which sent CUDA 8.x/9.x to the 12.x directory.
    # NOTE(review): the top-level code that derives ${cudnn_file} uses the
    # same string comparison and should be checked as well.
    cuda_major=${cuda_big_version%%.*}
    if [ "${cuda_major}" -ge 12 ] 2>/dev/null; then
        cudnn_dir="12.x"
    elif [ "$cuda_big_version" = "11.7" ] || [ "$cuda_big_version" = "11.8" ]; then
        cudnn_dir="11.x"
    else
        cudnn_dir="${cuda_big_version}"
    fi
    wget -t 100 --timeout=10 "${download_url}/nvidia/cudnn/${cudnn_dir}/${cudnn_file}"
    rc=$?
    if [ $rc -ne 0 ]; then
        echo "INSTALL_ERROR: Download cuDNN fail!!! return :$rc"
        return 1
    fi
    chmod +x "${AUTO_INSTALL}"/*
    echo "$DOWNLOAD_SUCCESS_STR !"
    return 0
}
# Run the downloaded driver installer (${AUTO_INSTALL}/${driver_file}) in
# silent mode.
# Returns 0 and prints an OK line on success; prints an INSTALL_ERROR line
# and returns 1 when the installer exits non-zero.
install_driver()
{
    "${AUTO_INSTALL}/${driver_file}" --silent || {
        echo "INSTALL_ERROR: driver install fail!!!"
        return 1
    }
    echo "DRIVER $driver_version install OK !"
    return 0
}
# Install the CUDA toolkit and any patch installers found in ${CUDA_DIR}.
#
# The largest file whose name contains "cuda" and ${cuda_version} is taken
# as the main installer; every other such file is run afterwards as a patch.
# Returns 1 (after printing an INSTALL_ERROR line) when no installer is
# found or any installer exits non-zero, 0 otherwise.
install_cuda()
{
    cd ${CUDA_DIR}

    # `ls -S` sorts by size, largest first.
    cuda_file=$(ls -S | grep cuda | grep $cuda_version | head -1)
    echo "cuda file: "$cuda_file
    [ -n "$cuda_file" ] || {
        echo "INSTALL_ERROR: cuda file is null, cuda install fail!!!"
        return 1
    }

    if ! sh ${CUDA_DIR}/$cuda_file --silent --toolkit --samples --samplespath=/root; then
        echo "INSTALL_ERROR: cuda install fail!!!"
        return 1
    fi

    # Everything else matching the version is treated as a patch installer.
    cuda_patchfile=$(ls | grep cuda | grep $cuda_version | grep -v ${cuda_file})
    for cuda_patch in $cuda_patchfile
    do
        echo "install cuda patch file: "$cuda_patch
        sh ${CUDA_DIR}/${cuda_patch} --silent --installdir=/usr/local/cuda --accept-eula || {
            echo "INSTALL_ERROR: cuda patch install fail!!!"
            return 1
        }
    done

    echo "CUDA $cuda_version install OK !"
    return 0
}
#######################################
# Unpack the cuDNN archive and copy its headers and libraries into
# /usr/local/cuda.
#
# Archives older than cuDNN 8.4 are expected to unpack to cuda/include and
# cuda/lib64; newer ones to cudnn-*-archive/include and
# cudnn-*-archive/lib. The version is compared numerically on its major and
# minor fields; a version that cannot be parsed as numbers is treated as
# the newer layout.
#
# Globals:   AUTO_INSTALL, cudnn_file, cudnn_version (read); cudnn_major,
#            cudnn_minor, cudnn_legacy_layout, cudnn_rc (written)
# Returns:   0 on success, 1 when unpacking, copying or chmod fails
#######################################
install_cudnn()
{
    # -p: a re-run must not complain about the existing directory.
    mkdir -p "${AUTO_INSTALL}/cudnn"

    # Previously only the final chmod was checked, so a failed unpack or
    # copy could still be reported as "install OK".
    if ! tar xvf "${AUTO_INSTALL}/${cudnn_file}" -C "${AUTO_INSTALL}/cudnn"; then
        echo "INSTALL_ERROR: CUDNN INSTALL FAIL !!!"
        return 1
    fi

    # Split "major.minor[.rest]" so that e.g. 8.10 compares above 8.4
    # (a plain string comparison would order it below).
    cudnn_major=${cudnn_version%%.*}
    case "${cudnn_version}" in
        *.*)
            cudnn_minor=${cudnn_version#*.}
            cudnn_minor=${cudnn_minor%%.*}
            ;;
        *)
            cudnn_minor=0
            ;;
    esac
    cudnn_legacy_layout=0
    if [ "${cudnn_major}" -lt 8 ] 2>/dev/null; then
        cudnn_legacy_layout=1
    elif [ "${cudnn_major}" -eq 8 ] 2>/dev/null && [ "${cudnn_minor}" -lt 4 ] 2>/dev/null; then
        cudnn_legacy_layout=1
    fi

    if [ "${cudnn_legacy_layout}" -eq 1 ]; then
        cp "${AUTO_INSTALL}"/cudnn/cuda/include/* /usr/local/cuda/include \
            && cp -P "${AUTO_INSTALL}"/cudnn/cuda/lib64/* /usr/local/cuda/lib64
    else
        cp "${AUTO_INSTALL}"/cudnn/cudnn-*-archive/include/cudnn*.h /usr/local/cuda/include \
            && cp -P "${AUTO_INSTALL}"/cudnn/cudnn-*-archive/lib/libcudnn* /usr/local/cuda/lib64
    fi
    cudnn_rc=$?
    if [ "${cudnn_rc}" -eq 0 ]; then
        chmod a+r /usr/local/cuda/include/cudnn*.h /usr/local/cuda/lib64/libcudnn*
        cudnn_rc=$?
    fi
    if [ "${cudnn_rc}" -ne 0 ]; then
        echo "INSTALL_ERROR: CUDNN INSTALL FAIL !!!"
        return 1
    fi
    echo "CUDNN $cudnn_version install OK !"
    return 0
}
#######################################
# Download and install the newest NCCL build offered for the CUDA release.
#
# The directory listing at ${download_url}/nvidia/nccl/${cuda_big_version}/
# is scraped for "nccl_<version>-1+cuda..." entries, the highest version
# (version sort) is downloaded, unpacked in ${AUTO_INSTALL} and copied to
# /usr/local/nccl.
#
# Globals:   AUTO_INSTALL, download_url, cuda_big_version (read); br, cr,
#            nccl_version, nccl_dir, nccl_file (written)
# Returns:   0 on success, 1 when no version is found or the download,
#            unpack or copy fails
#######################################
install_nccl()
{
    cd "${AUTO_INSTALL}"
    #download nccl
    curl "${download_url}/nvidia/nccl/${cuda_big_version}/" > ./tmp
    br=$(perl -n -e'/>nccl_(.*)-1\+cuda.*/ && print "$1 \n"' < ./tmp)
    cr=$(echo "${br}" | sort -rV | head -n1)
    nccl_version=$(echo ${cr} | awk -F ' ' '{print $1}')
    echo "max nccl version:$nccl_version"
    # Stop here when the listing yielded nothing instead of requesting a
    # malformed file name (same early exit the other installers use).
    if [ -z "${nccl_version}" ]; then
        echo "INSTALL_ERROR: NCCL INSTALL FAIL !!!"
        return 1
    fi

    nccl_dir="nccl_${nccl_version}-1+cuda${cuda_big_version}_x86_64"
    nccl_file="${nccl_dir}.txz"
    echo $nccl_file
    # A failed download must not fall through to unpacking a stale archive.
    if ! wget -t 100 --timeout=10 "${download_url}/nvidia/nccl/${cuda_big_version}/${nccl_file}"; then
        echo "INSTALL_ERROR: NCCL INSTALL FAIL !!!"
        return 1
    fi
    chmod +x "$nccl_file"
    tar xf "${AUTO_INSTALL}/${nccl_file}" && cp -r "${nccl_dir}" /usr/local/nccl
    if [ $? -ne 0 ]; then
        echo "INSTALL_ERROR: NCCL INSTALL FAIL !!!"
        return 1
    fi
    echo "NCCL $nccl_version install OK !"
    return 0
}
# Install and start nvidia-fabricmanager for the installed driver version.
#
# Steps:
#   1. Turn MIG mode off on every GPU that nvidia-smi reports as "Enabled".
#   2. Download the package matching ${driver_version} for the current
#      ${os}/${version} from the cloud mirror and install it with the
#      distribution's package tool (on Ubuntu the package is also put on
#      hold with apt-mark).
#   3. Enable and start the systemd service.
# Returns 1 when the service fails to start, 0 otherwise; download and
# package-install results are not checked.
install_fabricmanager()
{
    cd ${AUTO_INSTALL}

    # Comma-separated PCI bus ids of the GPUs with MIG enabled.
    mig_gpu_id=$(nvidia-smi --query-gpu=pci.bus_id,mig.mode.current --format=csv,noheader | grep Enabled | awk -F ',' '{print $1}' | xargs |sed 's/ /,/g')
    if [ -z "${mig_gpu_id}" ]; then
        echo "All Mig disabled."
    else
        echo "Closing Enabled Mig: ${mig_gpu_id}"
        nvidia-smi -i ${mig_gpu_id} -mig 0
    fi

    url_base="http://mirrors.cloud.aliyuncs.com/nvidia-cuda"
    case "$os" in
        ubuntu)
            driver_version_main=$(echo $driver_version | awk -F '.' '{print $1}')
            pkg="nvidia-fabricmanager-${driver_version_main}_${driver_version}-1_amd64.deb"
            case "$version" in
                16.04) wget -t 100 --timeout=10 ${url_base}/ubuntu1604/x86_64/${pkg} ;;
                18.04) wget -t 100 --timeout=10 ${url_base}/ubuntu1804/x86_64/${pkg} ;;
                20.04) wget -t 100 --timeout=10 ${url_base}/ubuntu2004/x86_64/${pkg} ;;
                22.04) wget -t 100 --timeout=10 ${url_base}/ubuntu2204/x86_64/${pkg} ;;
            esac
            dpkg -i ${pkg}
            #disable nvidia-fabricmanager update
            pkg=$(dpkg --list |grep nvidia-fabricmanager | awk -F ' ' '{print $2}')
            echo "nvidia-fabricmanager pkg: $pkg , apt-mark hold it!"
            apt-mark hold $pkg
            ;;
        debian)
            # e.g. nvidia-fabricmanager-470_470.82.01-1_amd64.deb
            driver_version_main=$(echo $driver_version | awk -F '.' '{print $1}')
            pkg="nvidia-fabricmanager-${driver_version_main}_${driver_version}-1_amd64.deb"
            case "$version" in
                10) wget -t 100 --timeout=10 ${url_base}/debian10/x86_64/${pkg} ;;
            esac
            dpkg -i ${pkg}
            ;;
        centos|alinux)
            case "${os}:${version}" in
                centos:8|alinux:3)
                    pkg="nvidia-fabric-manager-${driver_version}-1.x86_64.rpm"
                    wget -t 100 --timeout=10 ${url_base}/rhel8/x86_64/${pkg}
                    yum install -y ${pkg}
                    ;;
                centos:7|alinux:2)
                    pkg="nvidia-fabric-manager-${driver_version}-1.x86_64.rpm"
                    wget -t 100 --timeout=10 ${url_base}/rhel7/x86_64/${pkg}
                    rpm -ivh ${pkg}
                    ;;
            esac
            ;;
    esac

    systemctl enable nvidia-fabricmanager
    systemctl start nvidia-fabricmanager || {
        echo "INSTALL_ERROR: nvidia-fabricmanager start FAIL !!!"
        return 1
    }
    systemctl status nvidia-fabricmanager
    return 0
}
# Install the nvidia-persistenced init scripts shipped with the driver
# documentation (run as user root).
#
# bzip2 is installed through yum first on CentOS / Alibaba Cloud Linux.
# The function's status is that of the final `cd ... && sh install.sh`.
enable_pm()
{
    case "$os" in
        centos|alinux)
            yum install bzip2 -y
            ;;
    esac

    cd /usr/share/doc/NVIDIA_GLX-1.0/sample*
    bunzip2 nvidia-persistenced-init.tar.bz2
    tar xvf nvidia-persistenced-init.tar
    cd nvidia-persistenced-init \
        && sh install.sh -u root
}
# Append the CUDA PATH and LD_LIBRARY_PATH export lines to ${env_file}.
# "$PATH" and "$LD_LIBRARY_PATH" are written literally so they expand when
# the file is sourced. Each call appends both lines again.
set_env()
{
    env_path="/usr/local/cuda/bin:"
    env_library="/usr/local/cuda/lib64:"
    env1="export PATH=${env_path}\$PATH"
    env2="export LD_LIBRARY_PATH=${env_library}\$LD_LIBRARY_PATH"
    printf '%s\n' "$env1" "$env2" >> "${env_file}"
}
# Install the runtime dependencies of AIACC-Training.
#
# On Ubuntu and on CentOS 7 the newest OpenMPI package found in the
# ${download_url}/aiacc_1211/ listing is downloaded and installed, mpirun is
# replaced by a wrapper that calls the real binary (renamed mpirun.real)
# with --allow-run-as-root, an MCA parameter is appended to
# /root/.openmpi/mca-params.conf, and the distribution packages listed
# below are installed. Other systems only get the final OK message.
# Returns 1 when the OpenMPI package fails to install, 0 otherwise.
install_dependencies()
{
    cd ${AUTO_INSTALL}
    curl ${download_url}/aiacc_1211/ > ./tmp

    if [ "$os" = "ubuntu" ]; then
        # Newest openmpi .deb in the listing (version sort, highest first).
        openmpi_version=$(perl -n -e'/>openmpi_(.*)_amd64.deb/ && print "$1 \n"' < ./tmp \
            | sort -rV | head -n1 | awk '{print $1}')
        openmpi_file=openmpi_${openmpi_version}_amd64.deb
        wget -t 100 --timeout=10 ${download_url}/aiacc_1211/${openmpi_file}
        if ! dpkg -i ${openmpi_file}; then
            echo "INSTALL_ERROR: Openmpi INSTALL FAIL !!!"
            return 1
        fi

        # Wrap mpirun so that it may be run as root.
        mv /usr/local/bin/mpirun /usr/local/bin/mpirun.real
        cat > /usr/local/bin/mpirun <<'EOF'
#!/bin/bash
mpirun.real --allow-run-as-root "$@"
EOF
        chmod a+x /usr/local/bin/mpirun

        mkdir -p /root/.openmpi
        echo "hwloc_base_binding_policy=none" >> /root/.openmpi/mca-params.conf

        apt-get update
        apt-get install -y curl openssh-client openssh-server
    elif [ "$os" = "centos" ] && [ "$version" = "7" ]; then
        yum clean all
        yum -y install epel-release
        yum -y install perl openssh-clients openssh-server openblas-devel

        # Newest openmpi .rpm in the listing (version sort, highest first).
        openmpi_version=$(perl -n -e'/>openmpi-(.*).el7.x86_64.rpm/ && print "$1 \n"' < ./tmp \
            | sort -rV | head -n1 | awk '{print $1}')
        openmpi_file=openmpi-${openmpi_version}.el7.x86_64.rpm
        wget -t 100 --timeout=10 ${download_url}/aiacc_1211/${openmpi_file}
        if ! rpm -Uivh ${openmpi_file}; then
            echo "INSTALL_ERROR: Openmpi INSTALL FAIL !!!"
            return 1
        fi

        # Wrap mpirun so that it may be run as root.
        mv /usr/bin/mpirun /usr/bin/mpirun.real
        cat > /usr/bin/mpirun <<'EOF'
#!/bin/bash
mpirun.real --allow-run-as-root "$@"
EOF
        chmod a+x /usr/bin/mpirun

        mkdir -p /root/.openmpi
        echo "hwloc_base_binding_policy=none" >> /root/.openmpi/mca-params.conf
    fi

    echo "AIACC-Training install_dependencies OK !"
    rm -f ./tmp
    return 0
}
#######################################
# Download the newest AIACC base environment archive for the CUDA release
# and unpack it into /root.
#
# The archive name is scraped from the directory listing at
# ${download_url}/aiacc_1211/cuda${cuda_big_version}/ (entries named
# miniconda*.tgz that contain "aiacc-base", highest version first).
#
# Globals:   AUTO_INSTALL, download_url, cuda_big_version (read);
#            aiacc_base_file, rc (written)
# Returns:   0 on success, 1 when the name lookup, download or unpack fails
#######################################
install_aiacc_base()
{
    cd "${AUTO_INSTALL}"
    #download the latest base pkg
    curl "${download_url}/aiacc_1211/cuda${cuda_big_version}/" > ./tmp
    # e.g. miniconda-cuda10.0-aiacc-base.tgz
    # The trailing awk strips the blank that the perl one-liner appends, so
    # the name can be used safely inside quotes below.
    aiacc_base_file=$(perl -n -e'/>(miniconda[^"]*.tgz)</ && print "$1 \n"' < ./tmp \
        | grep "aiacc-base" | sort -rV | head -1 | awk '{print $1}')
    echo "aiacc_base_file=${aiacc_base_file}"
    if [ -z "${aiacc_base_file}" ]; then
        echo "INSTALL_ERROR: AIACC INSTALL FAIL! Get AIACC-base package fail !!!"
        return 1
    fi

    wget -t 100 --timeout=10 "${download_url}/aiacc_1211/cuda${cuda_big_version}/${aiacc_base_file}"
    # Save the status right away; inside the `if` body $? is the result of
    # the `[ ... ]` test and would always be reported as 0.
    rc=$?
    if [ $rc -ne 0 ]; then
        echo "INSTALL_ERROR: AIACC-base INSTALL FAIL! Download AIACC-base env package fail!!! return :$rc"
        return 1
    fi

    chmod +x "${AUTO_INSTALL}"/*
    tar zxvf "${AUTO_INSTALL}/${aiacc_base_file}" -C /root
    rc=$?
    if [ $rc -ne 0 ]; then
        echo "INSTALL_ERROR: AIACC-base INSTALL FAIL! INSTALL AIACC-base env package fail!!! return :$rc"
        return 1
    fi
    echo "AIACC-base install OK !"
    return 0
}
#######################################
# Download and unpack the newest AIACC-Training environment for the CUDA
# release, install its dependencies and fetch the demo archive.
#
# The archive name is scraped from the directory listing at
# ${download_url}/aiacc_1211/cuda${cuda_big_version}/ (entries named
# miniconda-cuda*.tgz that contain "aiacc-train", highest version first),
# stored under ${AUTO_INSTALL}/aiacc_train and unpacked into /root.
# A failed demo download only produces an INSTALL_WARNING.
#
# Globals:   AUTO_INSTALL, download_url, cuda_big_version (read);
#            aiacc_train_file, rc (written)
# Calls:     install_dependencies
# Returns:   0 on success, 1 when the name lookup, download, unpack or
#            dependency installation fails
#######################################
install_aiacc_train()
{
    # -p: a re-run must not complain about the existing directory.
    mkdir -p "${AUTO_INSTALL}/aiacc_train"
    cd "${AUTO_INSTALL}"
    #download the latest perseus pkg
    curl "${download_url}/aiacc_1211/cuda${cuda_big_version}/" > ./tmp
    # The trailing awk strips the blank that the perl one-liner appends, so
    # the name can be used safely inside quotes below.
    aiacc_train_file=$(perl -n -e'/>(miniconda-cuda[^"]*.tgz)</ && print "$1 \n"' < ./tmp \
        | grep "aiacc-train" | sort -rV | head -1 | awk '{print $1}')
    echo "aiacc_train_file=${aiacc_train_file}"
    if [ -z "${aiacc_train_file}" ]; then
        echo "INSTALL_ERROR: AIACC-Training INSTALL FAIL! Get AIACC-Training package fail !!!"
        return 1
    fi

    wget -t 100 --timeout=10 -P "${AUTO_INSTALL}/aiacc_train" \
        "${download_url}/aiacc_1211/cuda${cuda_big_version}/${aiacc_train_file}"
    # Save each status right away; inside the `if` body $? is the result of
    # the `[ ... ]` test and would always be reported as 0.
    rc=$?
    if [ $rc -ne 0 ]; then
        echo "INSTALL_ERROR: AIACC-Training INSTALL FAIL! Download AIACC-Training env package fail!!! return :$rc"
        return 1
    fi

    chmod +x "${AUTO_INSTALL}"/aiacc_train/*
    echo "tar zxvf ${AUTO_INSTALL}/aiacc_train/${aiacc_train_file} -C /root"
    tar zxvf "${AUTO_INSTALL}/aiacc_train/${aiacc_train_file}" -C /root
    rc=$?
    if [ $rc -ne 0 ]; then
        echo "INSTALL_ERROR: AIACC-Training INSTALL FAIL! INSTALL AIACC-Training env package fail!!! return :$rc"
        return 1
    fi
    echo "AIACC-Training unpack OK !"

    install_dependencies
    rc=$?
    if [ $rc -ne 0 ]; then
        echo "INSTALL_ERROR: AIACC-Training INSTALL FAIL! INSTALL dependencies fail!!! return :$rc"
        return 1
    fi
    rm -f ./tmp
    echo "AIACC-Training ENV INSTALL OK !"

    # The demo archive is optional: a failed download is only a warning.
    cd /root
    if ! wget -t 100 --timeout=10 "${download_url}/aiacc_1211/ali-perseus-demos.tgz"; then
        echo "INSTALL_WARNING: AIACC-Training download demo fail!!! "
    fi
    return 0
}
#######################################
# Download and unpack the newest AIACC-Inference environment for the CUDA
# release, then the matching TensorRT archives and the demo archive.
#
# The environment archive name is scraped from the directory listing at
# ${download_url}/aiacc_1211/cuda${cuda_big_version}/ (entries named
# miniconda-cuda*.tgz that contain "aiacc-inference", highest version
# first) and unpacked into /root. TensorRT archives are taken from
# ${download_url}/nvidia/tensorrt/cuda${cuda_big_version}/, filtered by
# "${os}-${os_version}" (case-insensitive; OS version 8 is mapped to 7) and
# "cudnn${cudnn_big_version}", and unpacked into /usr/local.
# TensorRT and demo problems only produce INSTALL_WARNING lines.
#
# Globals:   AUTO_INSTALL, download_url, cuda_big_version, version, os,
#            cudnn_big_version (read); aiacc_inference_file, os_version,
#            trt_file_list, trt_file, rc (written)
# Returns:   0 on success, 1 when the environment archive cannot be found,
#            downloaded or unpacked
#######################################
install_aiacc_inference()
{
    # -p: a re-run must not complain about the existing directory.
    mkdir -p "${AUTO_INSTALL}/aiacc_inference"
    cd "${AUTO_INSTALL}"
    #download the latest aiacc_inference pkg
    curl "${download_url}/aiacc_1211/cuda${cuda_big_version}/" > ./tmp
    # e.g. miniconda-cuda10.0-aiacc-inference-1.0.2.tgz
    # The trailing awk strips the blank that the perl one-liner appends, so
    # the name can be used safely inside quotes below.
    aiacc_inference_file=$(perl -n -e'/>(miniconda-cuda[^"]*.tgz)</ && print "$1 \n"' < ./tmp \
        | grep "aiacc-inference" | sort -rV | head -1 | awk '{print $1}')
    echo "aiacc_inference_file=${aiacc_inference_file}"
    if [ -z "${aiacc_inference_file}" ]; then
        echo "INSTALL_ERROR: AIACC-Inference INSTALL FAIL! Get AIACC-Inference package fail !!!"
        return 1
    fi

    wget -t 100 --timeout=10 -P "${AUTO_INSTALL}/aiacc_inference" \
        "${download_url}/aiacc_1211/cuda${cuda_big_version}/${aiacc_inference_file}"
    # Save each status right away; inside the `if` body $? is the result of
    # the `[ ... ]` test and would always be reported as 0.
    rc=$?
    if [ $rc -ne 0 ]; then
        echo "INSTALL_ERROR: AIACC-Inference INSTALL FAIL! Download AIACC-Inference env package fail!!! return :$rc"
        return 1
    fi

    chmod +x "${AUTO_INSTALL}"/aiacc_inference/*
    echo "tar zxvf ${AUTO_INSTALL}/aiacc_inference/${aiacc_inference_file} -C /root"
    tar zxvf "${AUTO_INSTALL}/aiacc_inference/${aiacc_inference_file}" -C /root
    rc=$?
    if [ $rc -ne 0 ]; then
        echo "INSTALL_ERROR: AIACC-Inference INSTALL FAIL! INSTALL AIACC-Inference env package fail!!! return :$rc"
        return 1
    fi
    rm -f ./tmp

    #download trt
    # OS version 8 uses the archives published for version 7.
    os_version=${version}
    if [ "${os_version}" = "8" ]; then
        os_version="7"
    fi
    curl "${download_url}/nvidia/tensorrt/cuda${cuda_big_version}/" > ./tmp
    # e.g. TensorRT-7.0.0.11.Ubuntu-16.04.x86_64-gnu.cuda-10.0.cudnn7.6.tar.gz
    trt_file_list=$(perl -n -e'/(TensorRT-[^"]*.gz)">/ && print "$1 \n"' < ./tmp \
        | grep -i "${os}-${os_version}" | grep "cudnn${cudnn_big_version}")
    echo "trt file list: ${trt_file_list}"
    if [ -z "$trt_file_list" ]; then
        echo "INSTALL_WARNING:AIACC-Inference INSTALL FAIL! Download TensorRT fail!!! get TensorRT filename fail!!"
    else
        # Unquoted on purpose: one iteration per file name.
        for trt_file in $trt_file_list
        do
            sleep 1
            if ! wget -t 100 --timeout=10 -P "${AUTO_INSTALL}/aiacc_inference" \
                "${download_url}/nvidia/tensorrt/cuda${cuda_big_version}/${trt_file}"; then
                echo "INSTALL_WARNING: AIACC-Inference INSTALL FAIL! Download TensorRT file fail!!! "
            else
                chmod +x "${AUTO_INSTALL}"/aiacc_inference/*
                tar zxvf "${AUTO_INSTALL}/aiacc_inference/${trt_file}" -C /usr/local
                rc=$?
                if [ $rc -ne 0 ]; then
                    echo "INSTALL_WARNING: AIACC-Inference TensorRT INSTALL FAIL! return :$rc"
                fi
            fi
        done
    fi
    # NOTE(review): this line is printed even when a TensorRT warning was
    # emitted above; kept unchanged because the log text may be consumed
    # elsewhere - confirm before making it conditional.
    echo "AIACC-Inference TensorRT install OK !"
    rm -f ./tmp

    # Demo archive (aiacc_inference_demo.tgz) is optional.
    cd /root
    if ! wget -t 100 --timeout=10 "${download_url}/aiacc_1211/aiacc_inference_demo.tgz"; then
        echo "INSTALL_WARNING: AIACC-Inference download demo fail!!! "
    fi
    echo "AIACC-Inference install OK !"
    return 0
}
#######################################
# Download the newest RAPIDS bundle for the CUDA release, unpack it into
# /root and append its environment snippet to /root/.bashrc.
#
# The bundle name (rapids<ver>_miniconda<ver>_cuda<ver>_py<ver>.tar.gz) is
# rebuilt from the version fields scraped from the directory listing at
# ${download_url}/rapids/cuda${cuda_big_version}/, highest version first.
#
# Globals:   AUTO_INSTALL, download_url, cuda_big_version (read);
#            rapids_env_file, br, cr, rapids_version, miniconda_version,
#            py_version, rapids_file, listing_rc, rc (written)
# Returns:   0 on success, 1 when the name lookup, a download, or the
#            unpack/append step fails
#######################################
install_rapids()
{
    cd "${AUTO_INSTALL}"
    rapids_env_file="env_add_to_bashrc.log"
    #download the latest rapids pkg
    curl "${download_url}/rapids/cuda${cuda_big_version}/" > ./tmp
    # Status of the listing request, reported if no package name is found.
    # (The old message printed $? after a `[ -z ... ]` test, i.e. always 0.)
    listing_rc=$?
    br=$(perl -n -e'/>rapids(.*)_miniconda(.*)_cuda.*_py(.*).tar.gz/ && print "$1 $2 $3\n"' < ./tmp)
    cr=$(echo "${br}" | sort -rV | head -n1)
    rapids_version=$(echo ${cr} | awk -F ' ' '{print $1}')
    miniconda_version=$(echo ${cr} | awk -F ' ' '{print $2}')
    py_version=$(echo ${cr} | awk -F ' ' '{print $3}')
    if [ -z "${rapids_version}" ] || [ -z "${miniconda_version}" ] || [ -z "${py_version}" ]; then
        echo "INSTALL_ERROR: RAPIDS INSTALL FAIL! get rapids package name fail!!! return :${listing_rc}"
        return 1
    fi

    rapids_file="rapids${rapids_version}_miniconda${miniconda_version}_cuda${cuda_big_version}_py${py_version}.tar.gz"
    wget -t 100 --timeout=10 "${download_url}/rapids/cuda${cuda_big_version}/${rapids_file}"
    # Save each status right away; inside the `if` body $? is the result of
    # the `[ ... ]` test and would always be reported as 0.
    rc=$?
    if [ $rc -ne 0 ]; then
        echo "INSTALL_ERROR: RAPIDS INSTALL FAIL! Download rapids package fail!!! return :$rc"
        return 1
    fi
    wget -t 100 --timeout=10 "${download_url}/rapids/cuda${cuda_big_version}/${rapids_env_file}"
    rc=$?
    if [ $rc -ne 0 ]; then
        echo "INSTALL_ERROR: RAPIDS INSTALL FAIL!Download rapids package fail!!! return :$rc"
        return 1
    fi

    chmod +x "${AUTO_INSTALL}"/*
    tar zxvf "${AUTO_INSTALL}/${rapids_file}" -C /root \
        && cat "${AUTO_INSTALL}/${rapids_env_file}" >> /root/.bashrc
    rc=$?
    if [ $rc -ne 0 ]; then
        echo "INSTALL_ERROR: RAPIDS INSTALL FAIL! Install rapids package fail!!! return :$rc"
        return 1
    fi
    echo "RAPIDS INSTALL OK !"
    return 0
}
# Work out which distribution is running (os) and which of root's shell
# start-up files the installer edits (profile_file, env_file).
# When the distribution is not one of those listed, the two file variables are
# left untouched.
if [ ! -f "/etc/os-release" ]; then
    # No os-release file: only CentOS is recognised, via /etc/issue.
    issue=$(cat /etc/issue | grep CentOS)
    if [ -n "$issue" ]; then
        os="centos"
        env_file="/root/.bashrc"
        profile_file="/root/.bash_profile"
    fi
else
    # Source os-release in a subshell so only ID leaks out.
    os=$(. /etc/os-release;echo $ID)
    case "$os" in
        ubuntu|debian)
            profile_file="/root/.profile"
            env_file="/root/.bashrc"
            ;;
        centos|alinux)
            profile_file="/root/.bash_profile"
            env_file="/root/.bashrc"
            ;;
        sles)
            # SLES uses .bash_profile for both purposes.
            env_file="/root/.bash_profile"
            profile_file="/root/.bash_profile"
            ;;
    esac
fi
# Print the cuDNN archive file name for a CUDA "major.minor" release ($1) and a
# cuDNN version ($2).
#   CUDA major >= 12  -> cudnn-linux-x86_64-<cudnn>_cuda12-archive.tar.xz
#   CUDA 11.7 / 11.8  -> cudnn-linux-x86_64-<cudnn>_cuda11-archive.tar.xz
#   anything else     -> cudnn-<cuda>-linux-x64-v<cudnn>.tgz
# The major version is compared numerically. The previous string comparison
# ("$cuda_big_version" \> "12") sorted "9.0" after "12" and so picked the
# cuda12 archive for CUDA 9.x. A non-numeric major falls through to the last
# form.
cudnn_archive_name()
{
    case "${1%%.*}" in
        ''|*[!0-9]*) _cuda_major=0 ;;
        *) _cuda_major=${1%%.*} ;;
    esac
    if [ "${_cuda_major}" -ge 12 ]; then
        echo "cudnn-linux-x86_64-${2}_cuda12-archive.tar.xz"
    elif [ "$1" = "11.7" ] || [ "$1" = "11.8" ]; then
        echo "cudnn-linux-x86_64-${2}_cuda11-archive.tar.xz"
    else
        echo "cudnn-${1}-linux-x64-v${2}.tgz"
    fi
}

# Entry-point argument handling.
#   check <driver> <cuda> <cudnn> <train> <inference> <rdma> <erdma> <rapids>
#       report installation progress, drop the auto_install line(s) from
#       $profile_file and exit.
#   <driver> <cuda> <cudnn> <train> <inference> <rdma> <erdma> <rapids>
#       record the requested versions/flags and derive the file names used by
#       the rest of the script.
if [ "$1" = "check" ]; then
    check_install_process $2 $3 $4 $5 $6 $7 $8
    # Quoted so that an empty profile_file makes sed fail instead of waiting
    # on stdin.
    sed -i '/auto_install/d' "$profile_file"
    exit 0
else
    driver_version="$1"
    cuda_version="$2"
    cudnn_version="$3"
    is_install_aiacc_train="$4"
    is_install_aiacc_inference="$5"
    is_install_rdma="$6"
    # Note the order: eRDMA is the 7th argument, RAPIDS the 8th.
    is_install_erdma="$7"
    is_install_rapids="$8"
    echo "begin to install, driver: $driver_version, cuda: $cuda_version, cudnn: $cudnn_version " >> "$log" 2>&1
    driver_file="NVIDIA-Linux-x86_64-${driver_version}.run"
    # Keep only "major.minor" of the CUDA version.
    cuda_big_version=$(echo "$cuda_version" | awk -F'.' '{print $1"."$2}')
    cudnn_file=$(cudnn_archive_name "$cuda_big_version" "$cudnn_version")
    cudnn_big_version=$(echo "$cudnn_version" | awk -F'.' '{print $1}')
    # Append the matching "check" invocation to $profile_file (and echo it);
    # presumably so progress is shown at root's next login - confirm against
    # check_install_process.
    echo "sh ${PROCESS_NAME} check $driver_version $cuda_version $cudnn_version ${is_install_aiacc_train} ${is_install_aiacc_inference} ${is_install_rdma} ${is_install_erdma} ${is_install_rapids}" | tee -a "$profile_file"
fi
# Per-distribution preparation: disable nouveau, make sure build prerequisites
# exist and determine the distribution version. Unsupported systems abort here.
echo "os:$os" >> $log 2>&1
ubuntu_version=""
case "$os" in
    ubuntu)
        disable_nouveau_ubuntu >> $log 2>&1
        apt-get update
        version=$(. /etc/os-release;echo $VERSION_ID)
        # Map the release number to the identifier stored in ubuntu_version.
        case "$version" in
            16.04) ubuntu_version="ubuntu1604" ;;
            18.04) ubuntu_version="ubuntu1804" ;;
            20.04) ubuntu_version="ubuntu2004" ;;
            22.04) ubuntu_version="ubuntu2204" ;;
            *)
                echo "ERROR: Ubuntu version $version is not supported!" >> $log 2>&1
                exit 1
                ;;
        esac
        # Stop the apt-daily timers and services.
        echo "stop daily update service" >> $log 2>&1
        for apt_unit in apt-daily.timer apt-daily.service apt-daily-upgrade.timer apt-daily-upgrade.service; do
            systemctl stop ${apt_unit}
        done
        ;;
    centos)
        disable_nouveau_centos >> $log 2>&1
        [ -f "/usr/bin/gcc" ] || yum install -y gcc
        if [ ! -f "/etc/os-release" ]; then
            # Without os-release the version comes from lsb_release, which is
            # installed first if it is missing.
            if [ ! -f "/usr/bin/lsb_release" ]; then
                pkgname=$(yum provides /usr/bin/lsb_release |grep centos|grep x86_64 |head -1 |awk -F: '{print $1}')
                if [ -z "$pkgname" ]; then
                    echo "INSTALL_ERROR: /usr/bin/lsb_release pkg not exists!" >> $log 2>&1
                    exit 1
                fi
                yum install -y $pkgname >> $log 2>&1
            fi
            str=$(lsb_release -r | awk -F'[:.]' '{print $2}')
            version=$(echo $str | sed 's/ //g')
        else
            version=$(. /etc/os-release;echo $VERSION_ID)
        fi
        create_nvidia_repo_centos
        ;;
    alinux)
        disable_nouveau_alinux >> $log 2>&1
        [ -f "/usr/bin/gcc" ] || yum install -y gcc
        # Only the major part of VERSION_ID is kept.
        version=$(cat /etc/os-release | grep "VERSION_ID=" | awk -F '=' '{print $2}' | sed 's/\"//g' | cut -d. -f1)
        # Alibaba Cloud Linux 2 needs the elfutils-libelf-devel package
        # installed before the driver, otherwise the driver install reports an
        # error.
        if [ "$version" = "2" ]; then
            echo "install elfutils-libelf-devel" >> $log 2>&1
            yum install elfutils-libelf-devel -y
        fi
        ;;
    debian)
        version=$(. /etc/os-release;echo $VERSION_ID)
        echo "os is Debian, version:${version}" >> $log 2>&1
        apt-get update
        ;;
    *)
        echo "INSTALL_ERROR: Invalid OS!! INSTALL FAIL!" >> $log 2>&1
        exit 1
        ;;
esac
# Ask the metadata endpoint for the package source address (first line only)
# and derive the binary download root from it.
baseurl=$(curl http://100.100.100.200/latest/meta-data/source-address | head -1)
download_url="${baseurl}/opsx/ecs/linux/binary"

# Dispatch to the distribution-specific install_kernel_<os> function; abort the
# whole installation if it fails.
if ! install_kernel_${os} >> $log 2>&1; then
    echo "INSTALL_ERROR: kernel-devel install fail!!!" >> $log 2>&1
    exit 1
fi
# Start of the timed download phase (read back once downloading finishes).
begin_download=$(date '+%s')

# SCC RDMA, stage 0: only on sccgn7ex instances when RDMA was requested.
if [ "$INSTANCE_FAMILY" = "sccgn7ex" ] && [ "$is_install_rdma" = "TRUE" ]; then
    echo "$os install scc 0 begin ... " >> $log 2>&1
    cd ${AUTO_INSTALL}
    wget ${download_url}/rdma/aiacc-scc-rdma.sh
    if ! sh aiacc-scc-rdma.sh 0 $os; then
        echo "INSTALL_ERROR: $INSTANCE_FAMILY $RDMA_NIC_DRIVER_FAIL_STR ! " >> $log 2>&1
        exit 1
    fi
    echo "install scc end ... " >> $log 2>&1

    # No GPU driver requested: RDMA was the only job, so report success and
    # reboot after a one-minute delay.
    if [ "$driver_version" = "NULL" ]; then
        echo "only install RDMA. exit. " >> $log 2>&1
        echo ${SUCCESS_STR} >> $log 2>&1
        sleep 60
        reboot
        exit 0
    fi
fi
# eRDMA: fetch and run the eRDMA installer, then either finish (no GPU driver
# requested) or set up the nvidia-peermem service for the driver installed
# later.
if [ "$is_install_erdma" = "TRUE" ]; then
    echo "$os install ofed and erdma begin ... " >> $log 2>&1
    mkdir ${AUTO_INSTALL}/erdma -p
    cd ${AUTO_INSTALL}
    wget ${download_url}/erdma/auto_install_erdma.sh
    if ! bash -x auto_install_erdma.sh ${AUTO_INSTALL}/erdma >> $log 2>&1; then
        echo "INSTALL_ERROR: ${INSTANCE_FAMILY} ${ERDMA_FAIL_STR}!" >> $log 2>&1
        exit 1
    fi

    if [ "$driver_version" = "NULL" ]; then
        # eRDMA was the only job: report success and reboot after a minute.
        echo "only install eRDMA. exit. " >> $log 2>&1
        echo ${SUCCESS_STR} >> $log 2>&1
        sleep 60
        reboot
        exit 0
    fi

    # A driver will be installed as well: download the nvidia-peermem helper
    # and its systemd unit, then enable the unit so it starts on boot.
    wget -P /sbin/ ${download_url}/erdma/nvidia-peermem
    wget -P /etc/systemd/system/ ${download_url}/erdma/nvidia-peermem.service
    systemctl daemon-reload
    if ! systemctl enable nvidia-peermem; then
        echo "INSTALL_ERROR: ${INSTANCE_FAMILY} ${ERDMA_FAIL_STR}!" >> $log 2>&1
        exit 1
    fi
fi
# CentOS 8: install elfutils-libelf-devel directly (the message notes there is
# no nvidia source for this release).
if [ "$os" = "centos" ] && [ "$version" = "8" ]; then
    echo "no nvidia source, install elfutils-libelf-devel"
    yum install elfutils-libelf-devel -y
fi

# Only logs the situation: the early exit is disabled, so execution continues
# with the download step even when driver_version is NULL.
if [ "$driver_version" = "NULL" ]; then
    echo "Driver version is NULL, do not install. exit. " >> $log 2>&1
fi
# Main GPU stack pipeline: download, driver, (fabric manager), CUDA, cuDNN.
# Every step logs to $log and any failure aborts the script with status 1.
download >> $log 2>&1 || exit 1
end_download=$(date '+%s')
time_download=$((end_download-begin_download))
echo "NVIDIA download OK! Using time $time_download s !!" >> $log 2>&1

# Start of the timed install phase.
begin=$(date '+%s')
install_driver >> $log 2>&1 || exit 1
enable_pm >> $log 2>&1

# Fabric manager is installed only on these instance families.
case "$INSTANCE_FAMILY" in
    ebmgn7|ebmgn7e|sccgn7ex|ebmgn7ex|ebmgn7vx|ebmgn7v)
        install_fabricmanager >> $log 2>&1 || exit 1
        ;;
esac
echo "NVIDIA install driver OK!!!" >> $log 2>&1

install_cuda >> $log 2>&1 || exit 1
echo "NVIDIA install cuda OK!!" >> $log 2>&1

install_cudnn >> $log 2>&1 || exit 1
echo "NVIDIA install cudnn OK!!!" >> $log 2>&1

# NCCL installation (install_nccl) is currently disabled.

set_env

# Remove the downloaded installers and scratch files.
cd ${AUTO_INSTALL}
rm -f ${AUTO_INSTALL}/tmp ${AUTO_INSTALL}/NVIDIA* ${AUTO_INSTALL}/nvidia*
rm -rf ${AUTO_INSTALL}/cuda ${AUTO_INSTALL}/cudnn*
# Print the banner sentence naming the AIACC components that were installed.
#   $1 - AIACC-Training flag, $2 - AIACC-Inference flag ("TRUE" means installed)
# A component counts as installed only when its flag is exactly "TRUE", the
# same test that gates the installation below. Previously a flag that was
# neither "TRUE" nor "FALSE" (for example empty) produced the "Training and
# Inference" sentence although only one component had been installed.
# Prints nothing when neither flag is "TRUE".
aiacc_installed_notice()
{
    if [ "$1" = "TRUE" ] && [ "$2" = "TRUE" ]; then
        echo "###### AIACC-Training and AIACC-Inference has installed on your machine! "
    elif [ "$1" = "TRUE" ]; then
        echo "###### AIACC-Training has installed on your machine! "
    elif [ "$2" = "TRUE" ]; then
        echo "###### AIACC-Inference has installed on your machine! "
    fi
}

# The shared miniconda base is needed by either AIACC component.
if [ "${is_install_aiacc_train}" = "TRUE" ] || [ "${is_install_aiacc_inference}" = "TRUE" ]; then
    install_aiacc_base >> "$log" 2>&1 || exit 1
    echo "AIACC miniconda base install OK!!!" >> "$log" 2>&1
fi

if [ "${is_install_aiacc_train}" = "TRUE" ]; then
    install_aiacc_train >> "$log" 2>&1 || exit 1
    # NOTE(review): "Traing" is misspelled; left unchanged in case something
    # matches this log line.
    echo "AIACC-Traing ENV install OK!!!" >> "$log" 2>&1
fi

if [ "${is_install_aiacc_inference}" = "TRUE" ]; then
    install_aiacc_inference >> "$log" 2>&1 || exit 1
    echo "AIACC-Inference ENV install OK!!!" >> "$log" 2>&1
fi

# Append echo commands to $profile_file that print usage hints for the AIACC
# conda environments.
if [ "${is_install_aiacc_train}" = "TRUE" ] || [ "${is_install_aiacc_inference}" = "TRUE" ]; then
    echo "echo " >> "${profile_file}"
    echo "echo \"$(aiacc_installed_notice "${is_install_aiacc_train}" "${is_install_aiacc_inference}")\" " >> "${profile_file}"
    echo "echo -e \"###### Please execute [ \033[31m . /root/miniconda/etc/profile.d/conda.sh \033[0m ] to init AIACC miniconda. \" " >> "${profile_file}"
    echo "echo -e \"###### You can execute [ \033[31m conda env list \033[0m ] to check the AIACC miniconda envs. \" " >> "${profile_file}"
    echo "echo -e \"###### Please activate env with [ \033[31m conda activate AIACC environments name \033[0m ] eg: 'conda activate aiacct_tf1.15_tr1.4.0_mx1.5.0_cu10.0_py36', 'conda activate aiaccix_1.2.0a0' \" " >> "${profile_file}"
    echo "echo " >> "${profile_file}"
fi
# Optional RAPIDS installation; a failure aborts the script with status 1.
if [ "${is_install_rapids}" = "TRUE" ]; then
    install_rapids >> $log 2>&1 || exit 1
    echo "RAPIDS install OK!!!" >> $log 2>&1
fi
# SCC RDMA, stage 1: run the RDMA helper script from ${AUTO_INSTALL} again with
# stage argument 1. A failure is reported as a peer-memory module failure.
if [ "$INSTANCE_FAMILY" = "sccgn7ex" ] && [ "$is_install_rdma" = "TRUE" ]; then
    echo "$os install scc 1 begin ... " >> $log 2>&1
    cd ${AUTO_INSTALL}
    if ! sh aiacc-scc-rdma.sh 1 $os; then
        echo "INSTALL_ERROR: $INSTANCE_FAMILY $RDMA_PEER_MEM_FAIL_STR ! " >> $log 2>&1
        exit 1
    fi
    echo "install scc end ... " >> $log 2>&1
fi
# Report how long the install phase took, on stdout and in the log.
end=$(date '+%s')
time_install=$((end-begin))
echo "Install using time $time_install !"
echo "Install using time $time_install !" >> $log 2>&1

# Record the loaded nvidia modules and the nvidia-smi output in the log.
lsmod |grep nvidia >> $log 2>&1
nvidia-smi >> $log 2>&1

# Disable GSP firmware for drivers that compare (as strings) greater than
# "510", except on the instance families listed below.
if [ "$driver_version" \> "510" ]; then
    case "$INSTANCE_FAMILY" in
        gn5|gn5i|ebmgn5i|ebmgn5|ebmgn6v|ebmgn6e|gn6v|gn6e)
            echo "$INSTANCE_FAMILY not support GSP" >> $log 2>&1
            ;;
        *)
            echo options nvidia NVreg_EnableGpuFirmware=0 > /etc/modprobe.d/nvidia-gsp.conf
            ;;
    esac
fi

# Write the success marker, refresh the linker cache and reboot after a
# one-minute delay.
echo ${SUCCESS_STR} >> $log 2>&1
ldconfig
echo "reboot......" >> $log 2>&1
sleep 60
reboot
# 评论区 (comment section) - page footer from the site hosting this script, not part of the installer.