标签搜索

目 录CONTENT

文章目录

Linux下安装NVIDIA驱动、CUDA、cuDNN

陈铭
2024-05-30 / 0 评论 / 0 点赞 / 12 阅读 / 8,621 字 / 正在检测是否收录...

https://cmjava.ltd:8090/upload/2024/05/nvdia-auto_install.sh

#!/bin/sh
#
# Shared configuration for the GPU auto-install helper: working directories,
# process-name patterns matched against `ps` output by the progress monitors,
# marker strings searched for in the install log, and instance metadata.

# --- working directories and log file ---------------------------------------
AUTO_INSTALL="/root/auto_install"
CUDA_DIR="${AUTO_INSTALL}/cuda"

mkdir -p "${AUTO_INSTALL}" "${CUDA_DIR}"
log="${AUTO_INSTALL}/auto_install.log"
PROCESS_NAME="$0"

# --- patterns used to detect running download / install steps ---------------
DOWNLOAD_PROCESS_NAME="wget"
DOWNLOAD_DRIVER="NVIDIA-Linux"

DRIVER_PROCESS_NAME="${AUTO_INSTALL}/NVIDIA-Linux-x86_64"
CUDA_PROCESS_NAME="${AUTO_INSTALL}/cuda"
CUDNN_PROCESS_NAME="${AUTO_INSTALL}/cudnn"
NCCL_PROCESS_NAME="${AUTO_INSTALL}/nccl"
RAPIDS_PROCESS_NAME="${AUTO_INSTALL}/rapids"
AIACC_TRAIN_PROCESS_NAME="${AUTO_INSTALL}/aiacc_train"
AIACC_INFERENCE_PROCESS_NAME="${AUTO_INSTALL}/aiacc_inference"
RDMA_PROCESS_NAME_0="aiacc-scc-rdma.sh 0"
RDMA_PROCESS_NAME_1="aiacc-scc-rdma.sh 1"
ERDMA_PROCESS_NAME="auto_install_erdma.sh"

# --- success markers written to / searched in the install log ---------------
SUCCESS_STR="ALL INSTALL OK"
DOWNLOAD_SUCCESS_STR="Download OK"

# --- failure markers, one per component -------------------------------------
DOWNLOAD_FAIL_STR="Download FAIL"
DRIVER_FAIL_STR="Driver INSTALL FAIL"
CUDA_FAIL_STR="CUDA INSTALL FAIL"
CUDNN_FAIL_STR="CUDNN INSTALL FAIL"
NCCL_FAIL_STR="NCCL INSTALL FAIL"
RAPIDS_FAIL_STR="RAPIDS INSTALL FAIL"
AIACC_TRAIN_FAIL_STR="AIACC-Training ENV INSTALL FAIL"
AIACC_INFERENCE_FAIL_STR="AIACC-Inference ENV INSTALL FAIL"
RDMA_NIC_DRIVER_FAIL_STR="Mellanox NIC Driver INSTALL FAIL"
RDMA_PEER_MEM_FAIL_STR="nvidia_peermem module INSTALL FAIL"
ERDMA_FAIL_STR="ERDMA INSTALL FAIL"

# --- instance metadata -------------------------------------------------------
# The instance family is the second dot-separated field of the instance type.
INSTANCE_TYPE=$(curl http://100.100.100.200/latest/meta-data/instance/instance-type)
INSTANCE_FAMILY=$(echo ${INSTANCE_TYPE} | awk -F '.' '{print $2}')

# Notice printed by check_install_process before it starts polling.
install_notes="The script automatically downloads and installs a NVIDIA GPU driver and CUDA, CUDNN library if you choose GPU auto install. if you choose install RDMA or ERDMA, RDMA or ERDMA software will install.
if you choose install perseus, perseus environment will install as well.
1. The installation takes 15 to 20 minutes, depending on the intranet bandwidth and the quantity of vCPU cores of the instance. Please do not operate the GPU or install any GPU-related software until the GPU driver is installed successfully.
2. After the GPU is installed successfully, the instance will restarts automatically."


# Draw a console progress bar for as long as an installer process is running.
#
# Arguments:
#   $1 - component key: NVIDIA, cuda, cudnn, nccl, rapids, aiacc-train,
#        aiacc-inference, "scc-rdma 0", "scc-rdma 1" or erdma.
# Globals read:    the *_PROCESS_NAME patterns.
# Globals written: ProcessName, t, i, b, pid_num.
# Outputs: the bar is redrawn on one line (ending in \r) every $t seconds; it
#          advances one percent per tick, is capped at 90% and jumps to 100%
#          when no process matching the pattern remains.
# Returns: 0 when the process is gone; 1 for an unknown component key or an
#          empty process pattern (an empty pattern would match every `ps`
#          line and the loop would never finish).
check_install()
{
    b=''
    case "$1" in
        NVIDIA)          ProcessName=${DRIVER_PROCESS_NAME};          t=2 ;;
        cuda)            ProcessName=${CUDA_PROCESS_NAME};            t=3 ;;
        cudnn)           ProcessName=${CUDNN_PROCESS_NAME};           t=0.5 ;;
        nccl)            ProcessName=${NCCL_PROCESS_NAME};            t=0.5 ;;
        rapids)          ProcessName=${RAPIDS_PROCESS_NAME};          t=1 ;;
        aiacc-train)     ProcessName=${AIACC_TRAIN_PROCESS_NAME};     t=1 ;;
        aiacc-inference) ProcessName=${AIACC_INFERENCE_PROCESS_NAME}; t=1 ;;
        "scc-rdma 0")    ProcessName="${RDMA_PROCESS_NAME_0}";        t=2 ;;
        "scc-rdma 1")    ProcessName="${RDMA_PROCESS_NAME_1}";        t=1 ;;
        erdma)           ProcessName="${ERDMA_PROCESS_NAME}";         t=2 ;;
        *)
            echo "check_install: unknown component '$1'" >&2
            return 1
            ;;
    esac
    if [ -z "${ProcessName}" ]; then
        echo "check_install: empty process pattern for '$1'" >&2
        return 1
    fi

    i=0
    while true
    do
        pid_num=$(ps -ef | grep "${ProcessName}" | grep -v grep | wc -l)
        if [ "$pid_num" -eq 0 ]; then
            # Installer finished: draw the full bar and stop polling.
            b=$(printf "%-100s" "#" | sed 's/ /#/g')
            printf "| %-100s | %d%% \r\n" "$b" "100"
            break
        fi
        # Real progress is unknown, so tick forward and hold at 90%.
        i=$((i + 1))
        if [ "$i" -ge 90 ]; then
            i=90
        fi
        b=$(printf "%-${i}s" "#" | sed 's/ /#/g')
        printf "| %-100s | %d%% \r" "$b" "$i"
        sleep "$t"
    done
    echo
    return 0
}

# Show download progress for a file that a background wget is fetching.
#
# Arguments:
#   $1 - name fragment; matched against the wget command line in `ps` output
#        and used as the prefix of the downloaded path under ${AUTO_INSTALL}.
# Globals read:    AUTO_INSTALL, log.
# Globals written: name, i, b, filesize, percent, line, pid_num.
# Outputs: one progress line (ending in \r) per poll, built from the size and
#          percentage found in the last two lines of ${log}; a final 100% line
#          with the on-disk size in KiB once no matching wget is left.
# Returns: 0.
check_download()
{
    name=$1
    i=0
    b=''
    filesize=0
    percent=0

    sleep 0.5
    while true
    do
        pid_num=$(ps -ef | grep wget | grep "${name}" | grep -v grep | wc -l)
        if [ "$pid_num" -eq 0 ]; then
            # Sum the sizes so that several matching paths still give one number.
            filesize=$(du -sk "${AUTO_INSTALL}/${name}"* | awk '{s += $1} END {print s + 0}')
            b=$(printf "%-100s" "#" | sed 's/ /#/g')
            printf "%-8s| %-100s | %d%% \r\n" "${filesize}K" "$b" "100"
            break
        fi
        line=$(tail -2 "${log}")
        filesize=$(echo $line | awk -F ' ' '{print $1}')
        percent=$(echo $line | awk -F '%' '{print $1}' | awk -F ' ' '{print $NF}')
        # Only draw when the log tail really carries a numeric percentage.
        if [ "$percent" -ge 0 ] 2>/dev/null; then
            b=$(printf "%-${percent}s" "#" | sed 's/ /#/g')
            printf "%-8s| %-100s | %d%% \r" "${filesize}" "$b" "$percent"
        fi
        # Always pause between polls; skipping the pause when the percentage
        # is unreadable would spin the loop at full speed.
        sleep 0.5
    done
    return 0
}

# Inspect the install log for the outcome of one stage.
#
# Arguments:
#   $1 - "NVIDIA" for the overall verdict, or a stage key (DRIVER, CUDA,
#        CUDNN, NCCL, AIACC-Inference, AIACC-Train, RAPIDS, "scc-rdma 0",
#        "scc-rdma 1", ERDMA) whose failure marker should be searched for.
# Globals read:    log, SUCCESS_STR and the *_FAIL_STR markers.
# Globals written: succstr, str2, failstr, str1.
# Returns: 0 when the stage looks fine, 1 when a problem is found.
#          Exits the shell with status 1 when the log file does not exist.
check_install_log()
{
    if ! [ -f "$log" ]; then
        echo "NVIDIA install log $log not exist! Install may fail!"
        echo
        exit 1
    fi

    case "$1" in
        NVIDIA)
            # Overall verdict: success marker present and no INSTALL_ERROR.
            succstr=$(grep "${SUCCESS_STR}" "$log")
            str2=$(grep "INSTALL_ERROR" "$log")
            if [ -z "${succstr}" ] || [ -n "${str2}" ]; then
                echo "Install may have some INSTALL_ERROR, please check log $log !"
                return 1
            fi
            echo "${succstr} !!"
            echo
            return 0
            ;;
        DRIVER)          failstr=${DRIVER_FAIL_STR} ;;
        CUDA)            failstr=${CUDA_FAIL_STR} ;;
        CUDNN)           failstr=${CUDNN_FAIL_STR} ;;
        NCCL)            failstr=${NCCL_FAIL_STR} ;;
        AIACC-Inference) failstr=${AIACC_INFERENCE_FAIL_STR} ;;
        AIACC-Train)     failstr=${AIACC_TRAIN_FAIL_STR} ;;
        RAPIDS)          failstr=${RAPIDS_FAIL_STR} ;;
        "scc-rdma 0")    failstr=${RDMA_NIC_DRIVER_FAIL_STR} ;;
        "scc-rdma 1")    failstr=${RDMA_PEER_MEM_FAIL_STR} ;;
        ERDMA)           failstr=${ERDMA_FAIL_STR} ;;
    esac

    # Per-stage verdict: any line containing the failure marker is a failure.
    str1=$(grep "${failstr}" "$log")
    if [ -n "${str1}" ]; then
        echo
        echo "${failstr} ! please check install log ${log} !"
        return 1
    fi
}

# Watch a running auto-install and report its progress stage by stage.
#
# Arguments (used for the banner and the progress messages):
#   $1 - driver version, or the literal "NULL" to print the short banner
#   $2 - CUDA version
#   $3 - cuDNN version
#   $4 - AIACC-Training install flag
#   $5 - AIACC-Inference install flag
#   $6 - RDMA install flag
#   $7 - eRDMA install flag
# Globals read: install_notes, PROCESS_NAME, DOWNLOAD_PROCESS_NAME,
#   DOWNLOAD_DRIVER and the *_PROCESS_NAME patterns.
# Returns: 0 once no process matching PROCESS_NAME is left; the status of the
#   final check_install_log "NVIDIA" call is not propagated to the caller.
check_install_process()
{
    if [ "$1" = "NULL" ]; then
        echo "CHECKING AUTO INSTALL,INSTALL RDMA=${6}, PLEASE WAIT ......"
    else
        echo "CHECKING AUTO INSTALL, DRIVER_VERSION=${1} CUDA_VERSION=${2} CUDNN_VERSION=${3} INSTALL AIACC-Training=${4} INSTALL AIACC-Inference=${5} , INSTALL RDMA=${6}, INSTALL eRDMA=${7} PLEASE WAIT ......"
    fi
    echo "$install_notes"
    echo

    # Poll once per second until the installer script itself has exited.
    while true
    do
        # Count processes whose ps line matches PROCESS_NAME; lines containing
        # "grep" or "check" are filtered out of the count.
        pid_num=$(ps -ef | grep ${PROCESS_NAME} |grep -v grep | grep -v check | wc -l)
        if [ $pid_num -eq 0 ];then
            # Installer finished: print the overall verdict from the log.
            check_install_log "NVIDIA"
            return 0
        else
            # ---- download stages: a wget whose command line names the item ----
            pid_num=$(ps -ef | grep ${DOWNLOAD_PROCESS_NAME} |grep driver | grep -v rdma |grep -v grep | wc -l)
            if [ $pid_num -gt 0 ];then
                echo "Driver-${1} downloading, it takes 30 seconds or more. Remaining installation time 15 to 20 minutes!"
                check_download ${DOWNLOAD_DRIVER}
            fi

            # CUDA download; nccl/rapids/miniconda/Tensor downloads are excluded
            # because their command lines also contain "cuda".
            pid_num=$(ps -ef | grep ${DOWNLOAD_PROCESS_NAME} |grep cuda |grep -v nccl |grep -v rapids |grep -v miniconda |grep -v Tensor |grep -v grep | wc -l)
            if [ $pid_num -gt 0 ];then
                echo "CUDA-${2} downloading, it takes 3 minutes or more. Remaining installation time 14 - 19 minutes!"
                # Keep calling check_download until no CUDA wget is left
                # (re-checked one second after each call returns).
                while true
                do
                    check_download "cuda"
                    sleep 1
                    pid_num=$(ps -ef | grep ${DOWNLOAD_PROCESS_NAME} |grep cuda |grep -v nccl |grep -v rapids |grep -v miniconda |grep -v Tensor | grep -v grep | wc -l)
                    if [ $pid_num -eq 0 ];then
                        break
                    fi
                done
            fi

            pid_num=$(ps -ef | grep ${DOWNLOAD_PROCESS_NAME} |grep cudnn |grep -v Tensor |grep -v grep | wc -l)
            if [ $pid_num -gt 0 ];then
                echo "cuDNN-${3} downloading, it tasks 1 minutes or more. Remaining installation time 12 - 16 minutes!"
                check_download "cudnn"
            fi

            # RAPIDS package download
            pid_num=$(ps -ef | grep ${DOWNLOAD_PROCESS_NAME} |grep rapids |grep -v grep | wc -l)
            if [ $pid_num -gt 0 ];then
                echo "RAPIDS downloading, it tasks 3 minutes or more. Remaining installation time 4 - 6 minutes!"
                check_download "rapids"
            fi

            # AIACC-Training package download
            pid_num=$(ps -ef | grep ${DOWNLOAD_PROCESS_NAME} | grep aiacc-train |grep -v grep | wc -l)
            if [ $pid_num -gt 0 ];then
                echo "AIACC-Trainging downloading, it tasks 3 minutes or more. Remaining installation time 4 - 6 minutes!"
                check_download "aiacc_train"
            fi

            # AIACC-Inference package download
            pid_num=$(ps -ef | grep ${DOWNLOAD_PROCESS_NAME} | grep aiacc-inference | grep -v Tensor | grep -v grep | wc -l)
            if [ $pid_num -gt 0 ];then
                echo "AIACC-Inference downloading, it tasks 3 minutes or more. Remaining installation time 4 - 6 minutes!"
                check_download "aiacc_inference"
            fi


            # ---- install stages: progress bar, then the stage's log check ----
            pid_num=$(ps -ef | grep "${DRIVER_PROCESS_NAME}" |grep -v grep | wc -l)
            if [ $pid_num -gt 0 ];then
                echo
                echo "Driver-${1} installing, it tasks 1 to 3 minutes. Remaining installation time 11 to 15 minutes!"
                check_install "NVIDIA"
                check_install_log "DRIVER"
            fi
            pid_num=$(ps -ef | grep "${CUDA_PROCESS_NAME}" |grep -v nccl |grep -v grep | wc -l)
            if [ $pid_num -gt 0 ];then
                echo "CUDA-${2} installing, it tasks 2 to 5 minutes. Remaining installation time 9 to 12 minutes!"
                check_install "cuda"
                check_install_log "CUDA"
            fi
            pid_num=$(ps -ef | grep ${CUDNN_PROCESS_NAME} |grep -v grep | wc -l)
            if [ $pid_num -gt 0 ];then
                echo "cuDNN-${3} installing, it takes about 10 seconds. Remaining installation time 6 to 9 minutes!"
                check_install "cudnn"
                check_install_log "CUDNN"
            fi
            pid_num=$(ps -ef | grep ${NCCL_PROCESS_NAME} |grep -v grep | wc -l)
            if [ $pid_num -gt 0 ];then
                echo "NCCL installing, it taskes about 10 seconds. "
                check_install "nccl"
                check_install_log "NCCL"
            fi

            pid_num=$(ps -ef | grep ${RAPIDS_PROCESS_NAME} |grep -v grep | wc -l)
            if [ $pid_num -gt 0 ];then
                echo "RAPIDS installing, it taskes about 60 seconds. Installation will be successful soon, please wait......"
                check_install "rapids"
                check_install_log "RAPIDS"
            fi

            # The AIACC stages are detected only when the matching ps line
            # also contains "tar".
            pid_num=$(ps -ef | grep ${AIACC_TRAIN_PROCESS_NAME} |grep tar |grep -v grep | wc -l)
            if [ $pid_num -gt 0 ];then
                echo "AIACC-Training installing, it taskes about 60 seconds. Please wait......"
                check_install "aiacc-train"
                check_install_log "AIACC-Train"
            fi

            pid_num=$(ps -ef | grep ${AIACC_INFERENCE_PROCESS_NAME} |grep tar | grep -v Tensor| grep -v grep | wc -l)
            if [ $pid_num -gt 0 ];then
                echo "AIACC-Inference installing, it taskes about 60 seconds. Please wait......"
                check_install "aiacc-inference"
                check_install_log "AIACC-Inference"
            fi
            # RDMA helper script invoked with argument 0
            pid_num=$(ps -ef | grep "${RDMA_PROCESS_NAME_0}" | grep -v grep | wc -l)
            if [ $pid_num -gt 0 ];then
                echo "Installing Mellanox NIC Driver, NIC Libs and Network configure, it takes about 120 seconds, please wait......"
                check_install "scc-rdma 0"
                check_install_log "scc-rdma 0"

            fi

            # RDMA helper script invoked with argument 1
	        pid_num=$(ps -ef | grep "${RDMA_PROCESS_NAME_1}" | grep -v grep | wc -l)
            if [ $pid_num -gt 0 ];then
                echo "Installing nv_peer_mem/nvidia_peermem module, it takes about 30 seconds, please wait......"
                check_install "scc-rdma 1"
                check_install_log "scc-rdma 1"

            fi

            # eRDMA installer script
            pid_num=$(ps -ef | grep "${ERDMA_PROCESS_NAME}" | grep -v grep | wc -l)
            if [ $pid_num -gt 0 ];then
                echo "Installing erdma driver, it takes about 180 seconds, please wait......"
                check_install "erdma"
                check_install_log "ERDMA"

            fi

        fi
        sleep 1
    done
}

# Append an [ecs-driver] yum repository to /etc/yum.repos.d/nvidia.repo and
# rebuild the yum cache.
#
# Globals read:    version, log.
# Globals written: baseurl_centos, driverurl.
# The base URL is the first line returned by the metadata "source-address"
# endpoint. "$basearch" is written literally into the repo file.
# A matching [ecs-cuda] section is intentionally not generated.
create_nvidia_repo_centos()
{
    baseurl_centos=$(curl http://100.100.100.200/latest/meta-data/source-address | head -1)
    driverurl="${baseurl_centos}/opsx/ecs/linux/rpm/driver/${version}/\$basearch/"

    cat >> /etc/yum.repos.d/nvidia.repo <<EOF
[ecs-driver]
name=ecs driver - \$basearch
baseurl=$driverurl
enabled=1
gpgcheck=0
EOF

    yum clean all >> $log 2>&1
    yum makecache >> $log 2>&1
}


# Switch a CentOS 8 host to the vault / EPEL-archive repositories.
#
# Existing *.repo files are renamed to *.repo.bak, the two replacement repo
# files are downloaded from mirrors.aliyun.com, and then the mirror host names
# inside them are rewritten: in the vault file the two host names are swapped
# (via the temporary token "url_tmp"); in the EPEL file mirrors.aliyun.com is
# replaced by mirrors.cloud.aliyuncs.com.
create_repo_centos8()
{
    rename '.repo' '.repo.bak' /etc/yum.repos.d/*.repo

    for _repo_name in Centos-vault-8.5.2111.repo epel-archive-8.repo
    do
        wget "https://mirrors.aliyun.com/repo/${_repo_name}" -O "/etc/yum.repos.d/${_repo_name}"
    done

    sed -i \
        -e 's/mirrors.cloud.aliyuncs.com/url_tmp/g' \
        -e 's/mirrors.aliyun.com/mirrors.cloud.aliyuncs.com/g' \
        -e 's/url_tmp/mirrors.aliyun.com/g' \
        /etc/yum.repos.d/Centos-vault-8.5.2111.repo
    sed -i 's/mirrors.aliyun.com/mirrors.cloud.aliyuncs.com/g' /etc/yum.repos.d/epel-archive-8.repo
}
# Blacklist the nouveau kernel module (CentOS flavour).
#
# Creates /etc/modprobe.d/blacklist-nouveau.conf when it does not exist yet.
# If nouveau is currently listed by lsmod, it is removed with rmmod and the
# initramfs is regenerated with "dracut --force".
disable_nouveau_centos()
{
    if [ ! -f /etc/modprobe.d/blacklist-nouveau.conf ]; then
        printf '%s\n' \
            "blacklist nouveau" \
            "options nouveau modeset=0" \
            > /etc/modprobe.d/blacklist-nouveau.conf
    fi

    if lsmod | grep -q nouveau; then
        rmmod nouveau
        echo "***exec \"dracut --force\" to regenerate the kernel initramfs"
        dracut --force
    fi
}

# Blacklist the nouveau kernel module (Alibaba Cloud Linux flavour).
#
# Creates /etc/modprobe.d/blacklist-nouv.conf when it does not exist yet.
# If the lsmod output mentions nouveau, the module is removed with rmmod and
# the initramfs is regenerated with "dracut --force".
disable_nouveau_alinux()
{
    if ! [ -f /etc/modprobe.d/blacklist-nouv.conf ]; then
        cat > /etc/modprobe.d/blacklist-nouv.conf <<'EOF'
blacklist nouveau
options nouveau modeset=0
EOF
    fi

    case "$(lsmod)" in
        *nouveau*)
            rmmod nouveau
            echo "***exec \"dracut --force\" to regenerate the kernel initramfs"
            dracut --force
            ;;
    esac
}

# Blacklist the nouveau kernel module (Ubuntu flavour).
#
# Creates /etc/modprobe.d/blacklist-nouveau.conf (which also blacklists
# lbm-nouveau) when it does not exist yet. If nouveau is currently listed by
# lsmod, it is removed with rmmod and the initramfs is regenerated with
# "update-initramfs -u".
disable_nouveau_ubuntu()
{
    if [ ! -f /etc/modprobe.d/blacklist-nouveau.conf ]; then
        cat > /etc/modprobe.d/blacklist-nouveau.conf <<'EOF'
blacklist nouveau
blacklist lbm-nouveau
options nouveau modeset=0
EOF
    fi

    if lsmod | grep -q nouveau; then
        rmmod nouveau
        echo "***exec \"update-initramfs -u\" to regenerate the kernel initramfs"
        update-initramfs -u
    fi
}
# Make sure kernel-devel for the running kernel is installed (CentOS).
#
# Globals written: kernel_version.
# Returns: 0 when the package is already present or was installed,
#          1 when "yum install" fails.
install_kernel_centos()
{
    kernel_version=$(uname -r)

    # Already installed for this exact kernel release: nothing to do.
    if rpm -qa | grep kernel-devel | grep -q "$kernel_version"; then
        return 0
    fi

    echo "******exec \"yum install -y kernel-devel-$kernel_version\""
    if ! yum install -y kernel-devel-$kernel_version; then
        echo "INSTALL_ERROR: install kernel-devel fail!!!"
        return 1
    fi
    return 0
}
# Make sure kernel-devel for the running kernel is installed (Alibaba Cloud
# Linux).
#
# Globals written: kernel_version.
# Returns: 0 when the package is already present or was installed,
#          1 when "yum install" fails.
install_kernel_alinux()
{
    kernel_version=$(uname -r)
    _alinux_devel_count=$(rpm -qa | grep kernel-devel | grep -c "$kernel_version")

    if [ "${_alinux_devel_count}" -eq 0 ]; then
        echo "******exec \"yum install -y kernel-devel-$kernel_version\""
        yum install -y kernel-devel-$kernel_version || {
            echo "INSTALL_ERROR: install kernel-devel fail!!!"
            return 1
        }
    fi
    return 0
}
# Make sure kernel-default-devel is installed (SLES).
#
# The requested package version is built from the first two dash-separated
# fields of "uname -r". Any already-installed kernel-default-devel package,
# whatever its version, counts as present.
#
# Globals written: kernel_version.
# Returns: 0 when a package is present or was installed, 1 when zypper fails.
install_kernel_sles()
{
    kernel_version=$(uname -r | awk -F'-' '{print $1"-"$2}')

    if rpm -qa | grep -q kernel-default-devel; then
        return 0
    fi

    echo "***exec \"zypper install -y kernel-default-devel=$kernel_version\""
    if ! zypper install -y kernel-default-devel=$kernel_version; then
        echo "INSTALL_ERROR: install kernel-default-devel fail!!!"
        return 1
    fi
}
# Make sure linux-headers for the running kernel is installed (Ubuntu).
#
# Globals written: kernel_version.
# Returns: 0 when the headers are already present or were installed,
#          1 when "apt-get install" fails.
install_kernel_ubuntu()
{
    kernel_version=$(uname -r)

    if dpkg --list | grep linux-headers | grep -q "$kernel_version"; then
        return 0
    fi

    echo "***exec \"apt-get install -y --allow-unauthenticated linux-headers-$kernel_version\""
    apt-get install -y --allow-unauthenticated linux-headers-$kernel_version || {
        echo "INSTALL_ERROR: install linux-headers fail!!!"
        return 1
    }
}


# Make sure linux-headers for the running kernel is installed (Debian).
#
# For kernel 4.19.0-17-amd64 the headers are fetched as a tarball from
# ${download_url} and installed from four .deb files, in dependency order.
# For any other kernel the headers are installed with apt-get unless dpkg
# already lists them.
#
# Globals read:    AUTO_INSTALL, download_url.
# Globals written: kernel_version.
# Side effects:    on the tarball path the working directory is left at
#                  ${AUTO_INSTALL}/linux-header.
# Returns: 0 on success; 1 as soon as any step fails (entering the work
#          directory, download, extraction, or any single package install).
install_kernel_debian()
{
    apt-get update
    kernel_version=$(uname -r)

    if [ "${kernel_version}" = "4.19.0-17-amd64" ]; then
        # Every step is checked: a broken download or extraction must not be
        # hidden by the exit status of a later command.
        cd "${AUTO_INSTALL}" \
            && wget -t 100 --timeout=10 "${download_url}/rdma/sccgn7ex/Debian10u10/linux-header.tar.gz" \
            && tar zxf linux-header.tar.gz \
            && cd linux-header
        if [ $? -ne 0 ]; then
            echo "INSTALL_ERROR: install linux-headers fail!!!"
            return 1
        fi

        for _debian_header_deb in \
            linux-kbuild-4.19_4.19.194-3_amd64.deb \
            linux-compiler-gcc-8-x86_4.19.194-3_amd64.deb \
            linux-headers-4.19.0-17-common_4.19.194-3_all.deb \
            linux-headers-4.19.0-17-amd64_4.19.194-3_amd64.deb
        do
            if ! dpkg -i "${_debian_header_deb}"; then
                echo "INSTALL_ERROR: install linux-headers fail!!!"
                return 1
            fi
        done
    else
        if ! dpkg --list | grep linux-headers | grep -q "$kernel_version"; then
            echo "***exec \"apt-get install -y  linux-headers-$kernel_version\""
            if ! apt-get install -y "linux-headers-$kernel_version"; then
                echo "INSTALL_ERROR: install linux-headers fail!!!"
                return 1
            fi
        fi
    fi
    return 0
}


# Download the driver installer, every CUDA file listed on the mirror for the
# requested CUDA version, and the cuDNN archive.
#
# Globals read:    AUTO_INSTALL, CUDA_DIR, download_url, driver_file, os,
#                  version, cuda_version, cuda_big_version, cudnn_file,
#                  DOWNLOAD_SUCCESS_STR.
# Globals written: cudafilelist, cudafile.
# Files:           the driver and cuDNN archive land in ${AUTO_INSTALL}, the
#                  CUDA files in ${CUDA_DIR}; the directory listing is saved
#                  as ${AUTO_INSTALL}/tmp.
# Returns: 0 on success, 1 on the first failed step. Error lines carry the
#          real exit status of the failing command.
download()
{
    cd "${AUTO_INSTALL}" || {
        echo "INSTALL_ERROR: Download driver fail!!! cannot cd to ${AUTO_INSTALL}"
        return 1
    }

    wget -t 100 --timeout=10 "${download_url}/nvidia/driver/${driver_file}"
    # Capture the status right away: once "[ ... ]" has run, $? is the status
    # of the test, not of wget.
    _dl_rc=$?
    if [ "${_dl_rc}" -ne 0 ]; then
        echo "INSTALL_ERROR: Download driver fail!!! return: ${_dl_rc}"
        return 1
    fi

    # On CentOS 6, CUDA releases other than 8.0/9.0/9.2/10.0 are fetched from
    # a "rhel6" sub-directory of the mirror.
    _dl_cuda_url="${download_url}/nvidia/cuda/${cuda_version}"
    _dl_rhel6=0
    if [ "$os" = "centos" ] && [ "$version" = "6" ]; then
        case "${cuda_big_version}" in
            8.0|9.0|9.2|10.0) ;;
            *) _dl_rhel6=1 ;;
        esac
    fi

    if [ "${_dl_rhel6}" -eq 1 ]; then
        _dl_cuda_url="${_dl_cuda_url}/rhel6"
        curl "${_dl_cuda_url}/" > ./tmp
        cudafilelist=$(perl -n -e'/>(cuda[^<]*)</ && print "$1 \n"' < ./tmp)
    else
        curl "${_dl_cuda_url}/" > ./tmp
        echo "${_dl_cuda_url}/"
        cudafilelist=$(perl -n -e'/>(cuda[^<]*)</ && print "$1 \n"' < ./tmp | grep -v ubuntu)
    fi

    if [ -z "$cudafilelist" ]; then
        echo "INSTALL_ERROR: Download CUDA fail!!! get cuda-${cuda_version} filename fail!!"
        return 1
    fi

    cd "${CUDA_DIR}" || {
        echo "INSTALL_ERROR: Download CUDA fail!!! cannot cd to ${CUDA_DIR}"
        return 1
    }
    echo $cudafilelist
    # The list is intentionally unquoted: one whitespace-separated file name
    # per word.
    for cudafile in $cudafilelist
    do
        sleep 1
        wget -t 100 --timeout=10 "${_dl_cuda_url}/${cudafile}"
        _dl_rc=$?
        if [ "${_dl_rc}" -ne 0 ]; then
            echo "INSTALL_ERROR: Download CUDA fail!!! wget $cudafile fail! return: ${_dl_rc}"
            return 1
        fi
    done
    chmod +x "${CUDA_DIR}"/*

    cd "${AUTO_INSTALL}" || {
        echo "INSTALL_ERROR: Download cuDNN fail!!! cannot cd to ${AUTO_INSTALL}"
        return 1
    }

    # Pick the cuDNN directory from the numeric CUDA major version. A string
    # comparison against "12" would also be true for "8.0" or "9.2".
    _dl_cuda_major=${cuda_big_version%%.*}
    case "${_dl_cuda_major}" in
        ''|*[!0-9]*) _dl_cuda_major=0 ;;
    esac
    if [ "${_dl_cuda_major}" -ge 12 ]; then
        _dl_cudnn_dir="12.x"
    elif [ "$cuda_big_version" = "11.7" ] || [ "$cuda_big_version" = "11.8" ]; then
        _dl_cudnn_dir="11.x"
    else
        _dl_cudnn_dir="${cuda_big_version}"
    fi

    wget -t 100 --timeout=10 "${download_url}/nvidia/cudnn/${_dl_cudnn_dir}/${cudnn_file}"
    _dl_rc=$?
    if [ "${_dl_rc}" -ne 0 ]; then
        echo "INSTALL_ERROR: Download cuDNN fail!!! return :${_dl_rc}"
        return 1
    fi

    chmod +x "${AUTO_INSTALL}"/*
    echo "$DOWNLOAD_SUCCESS_STR !"
    return 0
}

# Run the downloaded driver installer in silent mode.
#
# Globals read: AUTO_INSTALL, driver_file, driver_version.
# Returns: 0 when the installer exits successfully, 1 otherwise.
install_driver()
{
    if "${AUTO_INSTALL}/${driver_file}" --silent; then
        echo "DRIVER $driver_version install OK !"
        return 0
    fi

    echo "INSTALL_ERROR: driver install fail!!!"
    return 1
}

# Install the CUDA toolkit and then every patch file found next to it.
#
# The main installer is the largest file in ${CUDA_DIR} whose name contains
# "cuda" and ${cuda_version}; every other such file is treated as a patch.
#
# Globals read:    CUDA_DIR, cuda_version.
# Globals written: cuda_file, cuda_patchfile, cuda_patch.
# Returns: 0 on success; 1 when no installer is found or any run fails.
install_cuda()
{
    cd ${CUDA_DIR}

    # "ls -S" sorts by size, largest first.
    cuda_file=$(ls -S | grep cuda | grep $cuda_version | head -n 1)
    printf '%s\n' "cuda file: ${cuda_file}"
    [ -n "$cuda_file" ] || {
        echo "INSTALL_ERROR: cuda file is null, cuda install fail!!!"
        return 1
    }

    if ! sh ${CUDA_DIR}/$cuda_file --silent --toolkit --samples --samplespath=/root; then
        echo "INSTALL_ERROR: cuda install fail!!!"
        return 1
    fi

    cuda_patchfile=$(ls | grep cuda | grep $cuda_version | grep -v ${cuda_file})
    for cuda_patch in $cuda_patchfile
    do
        printf '%s\n' "install cuda patch file: ${cuda_patch}"
        if ! sh ${CUDA_DIR}/${cuda_patch} --silent --installdir=/usr/local/cuda --accept-eula; then
            echo "INSTALL_ERROR: cuda patch install fail!!!"
            return 1
        fi
    done

    echo "CUDA $cuda_version install OK !"
    return 0
}

# Unpack the cuDNN archive and copy its headers and libraries into
# /usr/local/cuda.
#
# Archives older than cuDNN 8.4 are read from a "cuda/{include,lib64}" tree;
# from 8.4 on they are read from a "cudnn-*-archive/{include,lib}" tree.
# The version is compared numerically (major, then minor), so e.g. 8.10 is
# handled as newer than 8.4. A version that is not "<number>.<number>..."
# falls back to the plain string comparison against "8.4".
#
# Globals read: AUTO_INSTALL, cudnn_file, cudnn_version.
# Returns: 0 on success; 1 when extraction, any copy, or the final chmod
#          fails.
install_cudnn()
{
    # -p: a re-run must not fail just because the directory already exists.
    mkdir -p "${AUTO_INSTALL}/cudnn"
    if ! tar xvf "${AUTO_INSTALL}/${cudnn_file}" -C "${AUTO_INSTALL}/cudnn"; then
        echo "INSTALL_ERROR: CUDNN INSTALL FAIL !!!"
        return 1
    fi

    # Split "major.minor[.rest]" and check that both parts are numeric.
    _cudnn_major=${cudnn_version%%.*}
    _cudnn_minor=${cudnn_version#*.}
    _cudnn_minor=${_cudnn_minor%%.*}
    _cudnn_numeric=1
    case "${cudnn_version}" in
        *.*) ;;
        *) _cudnn_numeric=0 ;;
    esac
    for _cudnn_part in "${_cudnn_major}" "${_cudnn_minor}"
    do
        case "${_cudnn_part}" in
            ''|*[!0-9]*) _cudnn_numeric=0 ;;
        esac
    done

    _cudnn_legacy=0
    if [ "${_cudnn_numeric}" -eq 1 ]; then
        if [ "${_cudnn_major}" -lt 8 ] \
            || { [ "${_cudnn_major}" -eq 8 ] && [ "${_cudnn_minor}" -lt 4 ]; }; then
            _cudnn_legacy=1
        fi
    elif [ "$cudnn_version" \< "8.4" ]; then
        _cudnn_legacy=1
    fi

    _cudnn_rc=0
    if [ "${_cudnn_legacy}" -eq 1 ]; then
        cp "${AUTO_INSTALL}"/cudnn/cuda/include/* /usr/local/cuda/include || _cudnn_rc=1
        cp -P "${AUTO_INSTALL}"/cudnn/cuda/lib64/* /usr/local/cuda/lib64 || _cudnn_rc=1
    else
        cp "${AUTO_INSTALL}"/cudnn/cudnn-*-archive/include/cudnn*.h /usr/local/cuda/include || _cudnn_rc=1
        cp -P "${AUTO_INSTALL}"/cudnn/cudnn-*-archive/lib/libcudnn* /usr/local/cuda/lib64 || _cudnn_rc=1
    fi

    chmod a+r /usr/local/cuda/include/cudnn*.h /usr/local/cuda/lib64/libcudnn* || _cudnn_rc=1

    if [ "${_cudnn_rc}" -ne 0 ]; then
        echo "INSTALL_ERROR: CUDNN INSTALL FAIL !!!"
        return 1
    fi
    echo "CUDNN $cudnn_version install OK !"
    return 0
}

# Download the newest NCCL build published for the current CUDA version and
# copy it to /usr/local/nccl.
#
# The version is taken from the mirror's directory listing: all
# "nccl_<version>-1+cuda..." entries are version-sorted and the highest wins.
#
# Globals read:    AUTO_INSTALL, download_url, cuda_big_version.
# Globals written: nccl_version, nccl_dir, nccl_file.
# Returns: 0 on success, 1 when extraction or the copy fails.
install_nccl()
{
    cd ${AUTO_INSTALL}

    # Fetch the listing and pick the highest version in it.
    curl ${download_url}/nvidia/nccl/${cuda_big_version}/ > ./tmp
    nccl_version=$(perl -n -e'/>nccl_(.*)-1\+cuda.*/ && print "$1 \n"' < ./tmp \
        | sort -rV | head -n1 | awk -F ' ' '{print $1}')
    echo "max nccl version:$nccl_version"

    nccl_dir="nccl_${nccl_version}-1+cuda${cuda_big_version}_x86_64"
    nccl_file="${nccl_dir}.txz"
    echo $nccl_file

    wget -t 100 --timeout=10 ${download_url}/nvidia/nccl/${cuda_big_version}/${nccl_file}
    chmod +x $nccl_file

    if tar xf ${AUTO_INSTALL}/${nccl_file} && cp -r ${nccl_dir} /usr/local/nccl; then
        echo "NCCL $nccl_version install OK !"
        return 0
    fi

    echo "INSTALL_ERROR: NCCL INSTALL FAIL !!!"
    return 1
}

# Install and start nvidia-fabricmanager for the installed driver version.
#
# Steps:
#   1. Turn MIG mode off on every GPU that nvidia-smi reports as "Enabled".
#   2. Download the package matching ${driver_version} for the current
#      distribution and install it (dpkg / yum / rpm).
#   3. On Ubuntu, put the installed package on hold with apt-mark.
#   4. Enable and start the nvidia-fabricmanager service.
#
# Globals read:    AUTO_INSTALL, os, version, driver_version.
# Globals written: mig_gpu_id, url_base, driver_version_main, pkg.
# Returns: 0 on success, 1 when the service cannot be started.
install_fabricmanager()
{
    cd ${AUTO_INSTALL}

    # Comma-separated PCI bus ids of all GPUs with MIG currently enabled.
    mig_gpu_id=$(nvidia-smi --query-gpu=pci.bus_id,mig.mode.current --format=csv,noheader \
        | grep Enabled | awk -F ',' '{print $1}' | xargs | sed 's/ /,/g')
    if [ -z "${mig_gpu_id}" ]; then
        echo "All Mig disabled."
    else
        echo "Closing Enabled Mig: ${mig_gpu_id}"
        nvidia-smi -i ${mig_gpu_id} -mig 0
    fi

    url_base="http://mirrors.cloud.aliyuncs.com/nvidia-cuda"
    case "${os}/${version}" in
        ubuntu/*)
            driver_version_main=${driver_version%%.*}
            pkg="nvidia-fabricmanager-${driver_version_main}_${driver_version}-1_amd64.deb"
            case "${version}" in
                16.04) wget -t 100 --timeout=10 ${url_base}/ubuntu1604/x86_64/${pkg} ;;
                18.04) wget -t 100 --timeout=10 ${url_base}/ubuntu1804/x86_64/${pkg} ;;
                20.04) wget -t 100 --timeout=10 ${url_base}/ubuntu2004/x86_64/${pkg} ;;
                22.04) wget -t 100 --timeout=10 ${url_base}/ubuntu2204/x86_64/${pkg} ;;
            esac
            dpkg -i ${pkg}
            # Keep the package from being upgraded: hold whatever dpkg now
            # lists under the nvidia-fabricmanager name.
            pkg=$(dpkg --list | grep nvidia-fabricmanager | awk -F ' ' '{print $2}')
            echo "nvidia-fabricmanager pkg: $pkg ,  apt-mark hold it!"
            apt-mark hold $pkg
            ;;
        debian/*)
            # e.g. nvidia-fabricmanager-470_470.82.01-1_amd64.deb
            driver_version_main=${driver_version%%.*}
            pkg="nvidia-fabricmanager-${driver_version_main}_${driver_version}-1_amd64.deb"
            case "${version}" in
                10) wget -t 100 --timeout=10 ${url_base}/debian10/x86_64/${pkg} ;;
            esac
            dpkg -i ${pkg}
            ;;
        centos/8|alinux/3)
            pkg="nvidia-fabric-manager-${driver_version}-1.x86_64.rpm"
            wget -t 100 --timeout=10 ${url_base}/rhel8/x86_64/${pkg}
            yum install -y ${pkg}
            ;;
        centos/7|alinux/2)
            pkg="nvidia-fabric-manager-${driver_version}-1.x86_64.rpm"
            wget -t 100 --timeout=10 ${url_base}/rhel7/x86_64/${pkg}
            rpm -ivh ${pkg}
            ;;
    esac

    systemctl enable nvidia-fabricmanager
    if ! systemctl start nvidia-fabricmanager; then
        echo "INSTALL_ERROR: nvidia-fabricmanager start FAIL !!!"
        return 1
    fi
    systemctl status nvidia-fabricmanager

    return 0
}

# Unpack and run the nvidia-persistenced init installer that ships in the
# driver's documentation directory.
#
# On centos and alinux, bzip2 is installed first via yum. The archive is
# decompressed and extracted in /usr/share/doc/NVIDIA_GLX-1.0/sample*, then
# "install.sh -u root" is run from the extracted directory.
#
# Globals read: os.
# Returns: the status of the final "cd ... && sh install.sh" step.
enable_pm()
{
    case "$os" in
        centos|alinux)
            yum install bzip2 -y
            ;;
    esac

    cd /usr/share/doc/NVIDIA_GLX-1.0/sample*
    bunzip2 nvidia-persistenced-init.tar.bz2
    tar xvf nvidia-persistenced-init.tar
    cd nvidia-persistenced-init && sh install.sh -u root
}

# Append the CUDA PATH and LD_LIBRARY_PATH exports to ${env_file}.
#
# "$PATH" and "$LD_LIBRARY_PATH" are written literally so that they are
# expanded when ${env_file} is sourced, not now.
# (A versioned cuda-${cuda_big_version} / nccl library path is not used.)
#
# Globals read:    env_file.
# Globals written: env_path, env_library, env1, env2.
set_env()
{
    env_path="/usr/local/cuda/bin:"
    env_library="/usr/local/cuda/lib64:"

    env1="export PATH=${env_path}\$PATH"
    env2="export LD_LIBRARY_PATH=${env_library}\$LD_LIBRARY_PATH"

    printf '%s\n' "$env1" "$env2" >> ${env_file}
}

# Replace <bindir>/mpirun with a wrapper that always passes
# --allow-run-as-root to the real binary (moved to mpirun.real), and append
# "hwloc_base_binding_policy=none" to root's Open MPI MCA parameter file.
#
# Arguments:
#   $1 - directory that contains the mpirun binary.
_aiacc_wrap_mpirun()
{
    mv "$1/mpirun" "$1/mpirun.real"
    echo '#!/bin/bash' > "$1/mpirun"
    echo 'mpirun.real --allow-run-as-root "$@"' >> "$1/mpirun"
    chmod a+x "$1/mpirun"

    mkdir -p /root/.openmpi
    echo "hwloc_base_binding_policy=none" >> /root/.openmpi/mca-params.conf
}

# Install the AIACC-Training runtime dependencies (Open MPI plus SSH tools).
#
# Only Ubuntu and CentOS 7 are handled; on any other system the function
# just downloads the listing, reports success and removes the listing.
# The Open MPI package is the highest version found in the mirror listing.
#
# Globals read:    AUTO_INSTALL, download_url, os, version.
# Globals written: openmpi_version, openmpi_file.
# Returns: 0 on success, 1 when the Open MPI package fails to install.
install_dependencies()
{
    cd ${AUTO_INSTALL}

    curl ${download_url}/aiacc_1211/ > ./tmp
    case "${os}/${version}" in
        ubuntu/*)
            # Newest openmpi .deb in the listing.
            openmpi_version=$(perl -n -e'/>openmpi_(.*)_amd64.deb/ && print "$1 \n"' < ./tmp \
                | sort -rV | head -n1 | awk -F ' ' '{print $1}')
            openmpi_file=openmpi_${openmpi_version}_amd64.deb

            wget -t 100 --timeout=10 ${download_url}/aiacc_1211/${openmpi_file}
            if ! dpkg -i ${openmpi_file}; then
                echo "INSTALL_ERROR: Openmpi INSTALL FAIL !!!"
                return 1
            fi

            _aiacc_wrap_mpirun /usr/local/bin

            apt-get update
            apt-get install -y curl openssh-client openssh-server
            ;;
        centos/7)
            # perl is installed here, before it is used to parse the listing.
            yum clean all
            yum -y install epel-release
            yum -y install perl openssh-clients openssh-server openblas-devel

            # Newest openmpi .rpm in the listing.
            openmpi_version=$(perl -n -e'/>openmpi-(.*).el7.x86_64.rpm/ && print "$1 \n"' < ./tmp \
                | sort -rV | head -n1 | awk -F ' ' '{print $1}')
            openmpi_file=openmpi-${openmpi_version}.el7.x86_64.rpm

            wget -t 100 --timeout=10 ${download_url}/aiacc_1211/${openmpi_file}
            if ! rpm -Uivh ${openmpi_file}; then
                echo "INSTALL_ERROR: Openmpi INSTALL FAIL !!!"
                return 1
            fi

            _aiacc_wrap_mpirun /usr/bin
            ;;
    esac

    echo "AIACC-Training install_dependencies OK !"
    rm -f ./tmp
    return 0
}

# Download the newest AIACC base (miniconda) package for the current CUDA
# version and unpack it into /root.
#
# The package name is the highest-versioned "miniconda*aiacc-base*.tgz" entry
# in the mirror listing (e.g. miniconda-cuda10.0-aiacc-base.tgz).
#
# Globals read:    AUTO_INSTALL, download_url, cuda_big_version.
# Globals written: aiacc_base_file.
# Returns: 0 on success; 1 when no package is listed, or the download or the
#          extraction fails. Error lines carry the real exit status of the
#          failing command.
install_aiacc_base()
{
    cd ${AUTO_INSTALL}

    # Download the listing and pick the latest base package.
    curl ${download_url}/aiacc_1211/cuda${cuda_big_version}/ > ./tmp
    # The trailing awk strips the space that the perl one-liner appends to
    # every name, so the value can be used safely inside quotes.
    aiacc_base_file=$(perl -n -e'/>(miniconda[^"]*.tgz)</ && print "$1 \n"' < ./tmp \
        | grep "aiacc-base" | sort -rV | head -1 | awk '{print $1}')
    echo "aiacc_base_file=${aiacc_base_file}"
    if [ -z "${aiacc_base_file}" ]; then
        echo "INSTALL_ERROR: AIACC INSTALL FAIL! Get AIACC-base  package fail !!!"
        return 1
    fi

    wget -t 100 --timeout=10 "${download_url}/aiacc_1211/cuda${cuda_big_version}/${aiacc_base_file}"
    # Save the status before "[ ... ]" overwrites $?.
    _aiacc_base_rc=$?
    if [ "${_aiacc_base_rc}" -ne 0 ]; then
        echo "INSTALL_ERROR: AIACC-base INSTALL FAIL! Download AIACC-base env package fail!!! return :${_aiacc_base_rc}"
        return 1
    fi

    chmod +x "${AUTO_INSTALL}"/*
    tar zxvf "${AUTO_INSTALL}/${aiacc_base_file}" -C /root
    _aiacc_base_rc=$?
    if [ "${_aiacc_base_rc}" -ne 0 ]; then
        echo "INSTALL_ERROR: AIACC-base INSTALL FAIL! INSTALL AIACC-base env package fail!!! return :${_aiacc_base_rc}"
        return 1
    fi

    echo "AIACC-base install OK !"
    # The downloaded archive is deliberately kept in ${AUTO_INSTALL}.

    return 0
}




# Download and unpack the newest AIACC-Training environment for the current
# CUDA version, install its dependencies, and fetch the demo archive.
#
# The package name is the highest-versioned "miniconda-cuda*aiacc-train*.tgz"
# entry in the mirror listing. The archive is stored in
# ${AUTO_INSTALL}/aiacc_train and extracted into /root. A failed demo
# download only produces a warning.
#
# Globals read:    AUTO_INSTALL, download_url, cuda_big_version.
# Globals written: aiacc_train_file.
# Calls:           install_dependencies.
# Returns: 0 on success; 1 when no package is listed, or the download, the
#          extraction or install_dependencies fails. Error lines carry the
#          real exit status of the failing step.
install_aiacc_train()
{
    # -p: a re-run must not fail just because the directory already exists.
    mkdir -p "${AUTO_INSTALL}/aiacc_train"
    cd ${AUTO_INSTALL}

    # Download the listing and pick the latest training package.
    curl ${download_url}/aiacc_1211/cuda${cuda_big_version}/ > ./tmp
    # The trailing awk strips the space that the perl one-liner appends to
    # every name, so the value can be used safely inside quotes.
    aiacc_train_file=$(perl -n -e'/>(miniconda-cuda[^"]*.tgz)</ && print "$1 \n"' < ./tmp \
        | grep "aiacc-train" | sort -rV | head -1 | awk '{print $1}')
    echo "aiacc_train_file=${aiacc_train_file}"
    if [ -z "${aiacc_train_file}" ]; then
        echo "INSTALL_ERROR: AIACC-Training INSTALL FAIL! Get AIACC-Training  package fail !!!"
        return 1
    fi

    wget -t 100 --timeout=10 -P "${AUTO_INSTALL}/aiacc_train" "${download_url}/aiacc_1211/cuda${cuda_big_version}/${aiacc_train_file}"
    # Save the status before "[ ... ]" overwrites $?.
    _aiacc_train_rc=$?
    if [ "${_aiacc_train_rc}" -ne 0 ]; then
        echo "INSTALL_ERROR: AIACC-Training INSTALL FAIL! Download AIACC-Training env package fail!!! return :${_aiacc_train_rc}"
        return 1
    fi

    chmod +x "${AUTO_INSTALL}"/aiacc_train/*
    echo "tar zxvf ${AUTO_INSTALL}/aiacc_train/${aiacc_train_file} -C /root"
    tar zxvf "${AUTO_INSTALL}/aiacc_train/${aiacc_train_file}" -C /root
    _aiacc_train_rc=$?
    if [ "${_aiacc_train_rc}" -ne 0 ]; then
        echo "INSTALL_ERROR: AIACC-Training INSTALL FAIL! INSTALL AIACC-Training env package fail!!! return :${_aiacc_train_rc}"
        return 1
    fi

    echo "AIACC-Training unpack OK !"

    install_dependencies
    _aiacc_train_rc=$?
    if [ "${_aiacc_train_rc}" -ne 0 ]; then
        echo "INSTALL_ERROR: AIACC-Training INSTALL FAIL! INSTALL dependencies fail!!! return :${_aiacc_train_rc}"
        return 1
    fi
    rm -f ./tmp
    echo "AIACC-Training ENV INSTALL OK !"

    # The demos are optional: a failed download is reported but not fatal.
    cd /root
    if ! wget -t 100 --timeout=10 "${download_url}/aiacc_1211/ali-perseus-demos.tgz"; then
        echo "INSTALL_WARNING: AIACC-Training download demo fail!!! "
    fi

    # The downloaded archive is deliberately kept in ${AUTO_INSTALL}/aiacc_train.

    return 0
}


#######################################
# Download and unpack the newest AIACC-Inference miniconda environment for the
# selected CUDA version, then try to install the matching TensorRT archives
# and fetch the demo archive into /root.
# Globals:   AUTO_INSTALL, download_url, cuda_big_version, cudnn_big_version,
#            os, version (read)
#            aiacc_inference_file, os_version, trt_file_list, trt_file (written)
# Outputs:   progress and INSTALL_ERROR / INSTALL_WARNING messages on stdout
# Returns:   0 on success, 1 when the AIACC-Inference package itself cannot be
#            found, downloaded or unpacked. TensorRT and demo problems only
#            produce warnings.
#######################################
install_aiacc_inference()
{
    # Exit status of the last critical command. It has to be saved right away:
    # "$?" read after "[ $? -ne 0 ]" is the status of "[" itself, i.e. always 0.
    local rc
    # Set to 1 as soon as any TensorRT step emits a warning.
    local trt_failed=0

    mkdir -p "${AUTO_INSTALL}/aiacc_inference"
    cd "${AUTO_INSTALL}" || {
        echo "INSTALL_ERROR: AIACC-Inference INSTALL FAIL! cd ${AUTO_INSTALL} fail !!!"
        return 1
    }

    #download the latest aiacc_inference pkg
    #curl ${download_url}/aiacc/cuda${cuda_big_version}/ > ./tmp
    curl "${download_url}/aiacc_1211/cuda${cuda_big_version}/" > ./tmp
    #miniconda-cuda10.0-aiacc-inference-1.0.2.tgz

    # Keep the highest-versioned inference package from the listing. The name
    # is printed without trailing blank so it can be used inside quoted paths.
    aiacc_inference_file=$(perl -n -e'/>(miniconda-cuda[^"]*\.tgz)</ && print "$1\n"' < ./tmp | grep "aiacc-inference" | sort -rV | head -1)
    echo "aiacc_inference_file=${aiacc_inference_file}"
    if [ -z "${aiacc_inference_file}" ]; then
        echo "INSTALL_ERROR: AIACC-Inference INSTALL FAIL! Get AIACC-Inference  package fail !!!"
        return 1
    fi

    #wget -t 100 --timeout=10  -P ${AUTO_INSTALL}/aiacc_inference ${download_url}/aiacc/cuda${cuda_big_version}/${aiacc_inference_file}
    wget -t 100 --timeout=10 -P "${AUTO_INSTALL}/aiacc_inference" "${download_url}/aiacc_1211/cuda${cuda_big_version}/${aiacc_inference_file}"
    rc=$?
    if [ "${rc}" -ne 0 ]; then
        echo "INSTALL_ERROR: AIACC-Inference INSTALL FAIL! Download AIACC-Inference env package fail!!! return :${rc}"
        return 1
    fi

    chmod +x "${AUTO_INSTALL}"/aiacc_inference/*
    echo "tar zxvf ${AUTO_INSTALL}/aiacc_inference/${aiacc_inference_file} -C /root"
    tar zxvf "${AUTO_INSTALL}/aiacc_inference/${aiacc_inference_file}" -C /root
    rc=$?
    if [ "${rc}" -ne 0 ]; then
        echo "INSTALL_ERROR: AIACC-Inference INSTALL FAIL! INSTALL AIACC-Inference env package fail!!! return :${rc}"
        return 1
    fi
    rm -f ./tmp

    #download trt
    # OS version 8 is looked up under the version 7 TensorRT file names.
    os_version=${version}
    if [ "${os_version}" = "8" ];then
        os_version="7"
    fi
    curl "${download_url}/nvidia/tensorrt/cuda${cuda_big_version}/" > ./tmp
    #TensorRT-7.0.0.11.Ubuntu-16.04.x86_64-gnu.cuda-10.0.cudnn7.6.tar.gz
    trt_file_list=$(perl -n -e'/(TensorRT-[^"]*.gz)">/ && print "$1 \n"' < ./tmp | grep -i "${os}-${os_version}" | grep "cudnn${cudnn_big_version}" )
    echo "trt file list: ${trt_file_list}"

    if [ -z "$trt_file_list" ]; then
        echo "INSTALL_WARNING:AIACC-Inference INSTALL FAIL! Download TensorRT fail!!! get TensorRT filename fail!!"
        trt_failed=1
    else
        # trt_file_list is a whitespace-separated list; the unquoted expansion
        # is intentional so that each file name becomes one loop item.
        for trt_file in $trt_file_list
        do
            sleep 1
            wget -t 100 --timeout=10 -P "${AUTO_INSTALL}/aiacc_inference" "${download_url}/nvidia/tensorrt/cuda${cuda_big_version}/${trt_file}"
            if [ $? -ne 0 ]; then
                echo "INSTALL_WARNING: AIACC-Inference INSTALL FAIL! Download TensorRT file fail!!! "
                trt_failed=1
            else
                chmod +x "${AUTO_INSTALL}"/aiacc_inference/*
                tar zxvf "${AUTO_INSTALL}/aiacc_inference/${trt_file}" -C /usr/local
                rc=$?
                if [ "${rc}" -ne 0 ]; then
                    echo "INSTALL_WARNING: AIACC-Inference TensorRT INSTALL FAIL!  return :${rc}"
                    trt_failed=1
                fi
            fi
        done
    fi


    # Only claim success when no TensorRT warning was emitted above.
    if [ "${trt_failed}" -eq 0 ]; then
        echo "AIACC-Inference TensorRT install OK !"
    fi
    rm -f ./tmp

    #aiacc_inference_demo.tgz
    # The demo archive is optional: a failed download is reported but does not
    # fail the installation.
    cd /root
    wget -t 100 --timeout=10 "${download_url}/aiacc_1211/aiacc_inference_demo.tgz"
    if [ $? -ne 0 ]; then
        echo "INSTALL_WARNING: AIACC-Inference download demo fail!!! "
    fi


    echo "AIACC-Inference install OK !"


    #rm -rf ${AUTO_INSTALL}/aiacc_inference

    return 0
}

#######################################
# Download the newest RAPIDS archive for the selected CUDA version, unpack it
# into /root and append the published environment snippet to /root/.bashrc.
# Globals:   AUTO_INSTALL, download_url, cuda_big_version (read)
#            rapids_env_file, br, cr, rapids_version, miniconda_version,
#            py_version, rapids_file (written)
# Outputs:   progress and INSTALL_ERROR messages on stdout
# Returns:   0 on success, 1 on any failure
#######################################
install_rapids()
{
    # Exit status of the last critical command. It has to be saved right away:
    # "$?" read after "[ $? -ne 0 ]" is the status of "[" itself, i.e. always 0.
    local rc

    cd "${AUTO_INSTALL}" || {
        echo "INSTALL_ERROR: RAPIDS INSTALL FAIL! cd ${AUTO_INSTALL} fail!!!"
        return 1
    }
    #rapids_file="rapids0.8_py3.6_cuda${cuda_big_version}.tar.gz"
    rapids_env_file="env_add_to_bashrc.log"

    #download the latest rapids pkg
    curl "${download_url}/rapids/cuda${cuda_big_version}/" > ./tmp
    # Each matching listing entry yields one "rapids miniconda python" version
    # triple; the version-sorted maximum is the package to install.
    br=$(perl -n -e'/>rapids(.*)_miniconda(.*)_cuda.*_py(.*).tar.gz/ && print "$1 $2 $3\n"' < ./tmp)
    cr=$(echo "${br}" | sort -rV | head -n1)

    rapids_version=$(echo ${cr} | awk -F ' ' '{print $1}')
    miniconda_version=$(echo ${cr} | awk -F ' ' '{print $2}')
    py_version=$(echo ${cr} | awk -F ' ' '{print $3}')
    # All three components are required to rebuild the archive name.
    if [ -z "${rapids_version}" ] || [ -z "${miniconda_version}" ] || [ -z "${py_version}" ]; then
        echo "INSTALL_ERROR: RAPIDS INSTALL FAIL! get rapids package name fail!!!"
        return 1
    fi
    rapids_file="rapids${rapids_version}_miniconda${miniconda_version}_cuda${cuda_big_version}_py${py_version}.tar.gz"

    wget -t 100 --timeout=10 "${download_url}/rapids/cuda${cuda_big_version}/${rapids_file}"
    rc=$?
    if [ "${rc}" -ne 0 ]; then
        echo "INSTALL_ERROR: RAPIDS INSTALL FAIL! Download rapids package fail!!! return :${rc}"
        return 1
    fi

    wget -t 100 --timeout=10 "${download_url}/rapids/cuda${cuda_big_version}/${rapids_env_file}"
    rc=$?
    if [ "${rc}" -ne 0 ]; then
        echo "INSTALL_ERROR: RAPIDS INSTALL FAIL!Download rapids package fail!!! return :${rc}"
        return 1
    fi

    chmod +x "${AUTO_INSTALL}"/*
    tar zxvf "${AUTO_INSTALL}/${rapids_file}" -C /root && cat "${AUTO_INSTALL}/${rapids_env_file}" >> /root/.bashrc
    rc=$?
    #cat ${AUTO_INSTALL}/${rapids_env_file} >> /root/.bashrc && source .bashrc
    if [ "${rc}" -ne 0 ]; then
        echo "INSTALL_ERROR: RAPIDS INSTALL FAIL! Install rapids package fail!!! return :${rc}"
        return 1
    fi
    echo "RAPIDS INSTALL OK !"

    return 0
}

# Detect the distribution id ($os) and pick root's login profile
# ($profile_file) and environment file ($env_file) for it. Distributions that
# are not listed leave both file variables untouched.
if [ ! -f "/etc/os-release" ]; then
    # No os-release file: only CentOS is recognised, through /etc/issue.
    issue=$(cat /etc/issue | grep CentOS)
    if [ -n "$issue" ];then
        os="centos"
        env_file="/root/.bashrc"
        profile_file="/root/.bash_profile"
    fi
else
    #os=$(cat /etc/os-release |grep "^ID="|awk -F '=' '{print $2}'|sed 's/\"//g')
    os=$(. /etc/os-release;echo $ID)
    case "$os" in
        ubuntu|debian)
            profile_file="/root/.profile"
            env_file="/root/.bashrc"
            ;;
        centos|alinux)
            profile_file="/root/.bash_profile"
            env_file="/root/.bashrc"
            ;;
        sles)
            env_file="/root/.bash_profile"
            profile_file="/root/.bash_profile"
            ;;
    esac
fi


#######################################
# Print the cuDNN archive file name that belongs to a CUDA release.
# Arguments: $1 - CUDA version as "major.minor"
#            $2 - cuDNN version
# Outputs:   the archive file name on stdout
#######################################
cudnn_archive_name()
{
    # The major version is compared numerically. A plain string comparison
    # against "12" would also be true for "9.0" or "9.2" and pick the CUDA 12
    # archive for those old releases.
    case "${1%%.*}" in
        ''|*[!0-9]*)
            # Not a number: fall through to the name checks below.
            ;;
        *)
            if [ "${1%%.*}" -ge 12 ]; then
                echo "cudnn-linux-x86_64-${2}_cuda12-archive.tar.xz"
                return 0
            fi
            ;;
    esac

    if [ "$1" = "11.7" ] || [ "$1" = "11.8" ]; then
        echo "cudnn-linux-x86_64-${2}_cuda11-archive.tar.xz"
    else
        echo "cudnn-${1}-linux-x64-v${2}.tgz"
    fi
}

# Two invocation modes:
#   <script> check <args...>   report installation progress, then remove the
#                              auto_install line from the login profile
#   <script> <driver> <cuda> <cudnn> <aiacc_train> <aiacc_inference> <rdma>
#            <erdma> <rapids>  run the installation
if [ "$1" = "check" ];then
    check_install_process $2 $3 $4 $5 $6 $7 $8
    # profile_file stays empty on an unrecognised distribution; sed -i would
    # then be called without any input file.
    if [ -n "$profile_file" ]; then
        sed -i '/auto_install/d' "$profile_file"
    fi
    exit 0
else
    driver_version="$1"
    cuda_version="$2"
    cudnn_version="$3"
    is_install_aiacc_train="$4"
    is_install_aiacc_inference="$5"
    is_install_rdma="$6"
    # Argument 7 is the eRDMA switch and argument 8 the RAPIDS switch; the
    # "check" command line written to the profile below uses the same order.
    is_install_erdma="$7"
    is_install_rapids="$8"

    echo "begin to install, driver: $driver_version, cuda: $cuda_version, cudnn: $cudnn_version " >> $log 2>&1
    driver_file="NVIDIA-Linux-x86_64-"${driver_version}".run"
    cuda_big_version=$(echo $cuda_version | awk -F'.' '{print $1"."$2}')

    cudnn_file=$(cudnn_archive_name "${cuda_big_version}" "${cudnn_version}")

    cudnn_big_version=$(echo $cudnn_version | awk -F'.' '{print $1}')

    # Register the progress check in root's login profile (and echo it).
    echo "sh ${PROCESS_NAME} check $driver_version $cuda_version $cudnn_version ${is_install_aiacc_train} ${is_install_aiacc_inference} ${is_install_rdma} ${is_install_erdma} ${is_install_rapids}" | tee -a $profile_file
    #echo "sh ${PROCESS_NAME} check $driver_version $cuda_version $cudnn_version ${is_install_perseus}" | tee -a $profile_file
fi
echo "os:$os" >> $log 2>&1
ubuntu_version=""

# Per-distribution preparation: run the distribution's disable_nouveau_*
# helper where one is called, make sure build prerequisites are present and
# determine $version. Any other distribution aborts the installation.
case "$os" in
    ubuntu)
        disable_nouveau_ubuntu >> $log 2>&1
        apt-get update

        #    version=$(cat /etc/os-release |grep "VERSION_ID=" | awk -F '=' '{print $2}'|sed 's/\"//g')
        version=$(. /etc/os-release;echo $VERSION_ID)
        case "$version" in
            16.04)
                ubuntu_version="ubuntu1604"
                ;;
            18.04)
                ubuntu_version="ubuntu1804"
                ;;
            20.04)
                ubuntu_version="ubuntu2004"
                ;;
            22.04)
                ubuntu_version="ubuntu2204"
                ;;
            *)
                echo "ERROR: Ubuntu version $version is not supported!" >> $log 2>&1
                exit 1
                ;;
        esac

        echo "stop daily update service" >> $log 2>&1  #stop apt-daily
        systemctl stop apt-daily.timer
        systemctl stop apt-daily.service
        systemctl stop apt-daily-upgrade.timer
        systemctl stop apt-daily-upgrade.service
        ;;

    centos)
        disable_nouveau_centos >> $log 2>&1

        [ -f "/usr/bin/gcc" ] || yum install -y gcc

        if [ ! -f "/etc/os-release" ]; then
            # Without os-release the version comes from lsb_release, which is
            # installed first when it is missing.
            if [ ! -f "/usr/bin/lsb_release" ]; then
                pkgname=$(yum provides /usr/bin/lsb_release |grep centos|grep x86_64 |head -1 |awk -F: '{print $1}')
                if [ -z "$pkgname" ]; then
                    echo "INSTALL_ERROR: /usr/bin/lsb_release pkg not exists!" >> $log 2>&1
                    exit 1
                fi
                yum install -y $pkgname >> $log 2>&1
            fi
            str=$(lsb_release -r | awk -F'[:.]' '{print $2}')
            version=$(echo $str | sed 's/ //g')
        else
            #version=$(cat /etc/os-release |grep "VERSION_ID=" | awk -F '=' '{print $2}'|sed 's/\"//g')
            version=$(. /etc/os-release;echo $VERSION_ID)
        fi

        create_nvidia_repo_centos
        ;;

    alinux)
        disable_nouveau_alinux >> $log 2>&1

        [ -f "/usr/bin/gcc" ] || yum install -y gcc

        # Only the major part of VERSION_ID is kept.
        version=$(grep "VERSION_ID=" /etc/os-release | awk -F '=' '{print $2}' | sed 's/\"//g' | cut -d. -f1)
        ###change log
        # Aliyun Linux 2: the elfutils-libelf-devel package has to be installed
        # before the driver, otherwise the driver installation reports an error.
        if [ "$version" = "2" ]; then
            echo "install elfutils-libelf-devel"  >> $log 2>&1
            yum install elfutils-libelf-devel -y
        fi
        ;;

    debian)
        version=$(. /etc/os-release;echo $VERSION_ID)
        echo "os is Debian, version:${version}" >> $log 2>&1
        apt-get update
        ;;

    *)
        echo "INSTALL_ERROR: Invalid OS!! INSTALL FAIL!" >> $log 2>&1
        exit 1
        ;;
esac


# Ask the instance metadata service for the package mirror address; every
# later download is built on top of ${download_url}.
baseurl=$(curl http://100.100.100.200/latest/meta-data/source-address | head -1)
# An empty answer would silently turn every download URL into
# "/opsx/ecs/linux/binary/...", so stop here with an explicit error instead.
if [ -z "${baseurl}" ]; then
    echo "INSTALL_ERROR: get source-address from metadata server fail!!!" >> $log 2>&1
    exit 1
fi
download_url="${baseurl}/opsx/ecs/linux/binary"


# The helper name is derived from the distribution id, e.g. install_kernel_centos.
install_kernel_${os} >> $log 2>&1
if [ $? -ne 0 ]; then
    echo "INSTALL_ERROR: kernel-devel install fail!!!" >> $log 2>&1
    exit 1
fi

# Start of the download timer (seconds since the epoch).
begin_download=$(date '+%s')


# RDMA stage 0, only on sccgn7ex instances when RDMA was requested: fetch
# aiacc-scc-rdma.sh and run it with mode 0.
if [ "$INSTANCE_FAMILY" = "sccgn7ex" ] && [ "$is_install_rdma" = "TRUE" ]; then
    #install scc begin
    echo "$os install scc 0 begin ... " >> $log 2>&1
    cd ${AUTO_INSTALL}
    # The download and the run are kept as two separate commands.
    wget ${download_url}/rdma/aiacc-scc-rdma.sh
    if ! sh aiacc-scc-rdma.sh 0 $os; then
        echo "INSTALL_ERROR: $INSTANCE_FAMILY $RDMA_NIC_DRIVER_FAIL_STR ! " >> $log 2>&1
        exit 1
    fi

    echo "install scc end ... " >> $log 2>&1
    #install scc end

    # No driver requested: RDMA was the whole job, so report success and reboot.
    if [ "$driver_version" = "NULL" ]; then
        echo "only install RDMA. exit. " >> $log 2>&1
        echo  ${SUCCESS_STR} >> $log 2>&1
        sleep 60
        reboot
        exit 0
    fi
fi

# eRDMA: fetch and run auto_install_erdma.sh. When a GPU driver is installed
# as well, the nvidia-peermem script and systemd unit are downloaded and the
# unit is enabled for boot.
if [ "$is_install_erdma" = "TRUE" ]; then
    echo "$os install ofed and erdma begin ... " >> $log 2>&1
    mkdir ${AUTO_INSTALL}/erdma -p
    cd ${AUTO_INSTALL}
    wget ${download_url}/erdma/auto_install_erdma.sh
    if ! bash -x auto_install_erdma.sh ${AUTO_INSTALL}/erdma >> $log 2>&1; then
        echo "INSTALL_ERROR: ${INSTANCE_FAMILY} ${ERDMA_FAIL_STR}!" >> $log 2>&1
        exit 1
    fi

    if [ "$driver_version" != "NULL" ]; then
        # download nvidia-peermem and nvidia-peermem.service
        # install driver, install nvidia-peermem
        wget -P /sbin/ ${download_url}/erdma/nvidia-peermem
        wget -P /etc/systemd/system/ ${download_url}/erdma/nvidia-peermem.service
        systemctl daemon-reload
        # enable nvidia-peermem ONBOOT
        if ! systemctl enable nvidia-peermem; then
            echo "INSTALL_ERROR: ${INSTANCE_FAMILY} ${ERDMA_FAIL_STR}!" >> $log 2>&1
            exit 1
        fi
    else
        # No driver requested: eRDMA was the whole job, report success and reboot.
        echo "only install eRDMA. exit. " >> $log 2>&1
        echo  ${SUCCESS_STR} >> $log 2>&1
        sleep 60
        reboot
        exit 0
    fi

fi


# CentOS 8 gets elfutils-libelf-devel installed before the driver.
if [ "$os" = "centos" ] && [ "$version" = "8" ]; then
    echo "no nvidia source, install elfutils-libelf-devel"

    #create_repo_centos8

    yum install elfutils-libelf-devel -y
fi

# NOTE(review): the message says "exit", but the exit below is commented out,
# so the script carries on with the download and installation steps.
if [ "$driver_version" = "NULL" ]; then
    echo "Driver version is NULL, do not install. exit. " >> $log 2>&1
    #exit 0
fi

# Fetch the packages (download is defined elsewhere in this file) and log how
# long it took since begin_download.
download >> $log 2>&1 || exit 1
end_download=$(date '+%s')
time_download=$((end_download-begin_download))
echo "NVIDIA download OK! Using time $time_download s !!" >> $log 2>&1

# Start of the installation timer (seconds since the epoch).
begin=$(date '+%s')

# Driver first; every mandatory step aborts the script with status 1 on failure.
install_driver >> $log 2>&1 || exit 1

enable_pm >> $log 2>&1

# Fabric Manager is installed only on the instance families listed here.
case "$INSTANCE_FAMILY" in
    ebmgn7|ebmgn7e|sccgn7ex|ebmgn7ex|ebmgn7vx|ebmgn7v)
        install_fabricmanager >> $log 2>&1 || exit 1
        ;;
esac

echo "NVIDIA install driver OK!!!" >> $log 2>&1

install_cuda >> $log 2>&1 || exit 1
echo "NVIDIA install cuda OK!!"  >> $log 2>&1

install_cudnn >> $log 2>&1 || exit 1
echo "NVIDIA install cudnn OK!!!" >> $log 2>&1

# The NCCL step is disabled:
#install_nccl >> $log 2>&1
#if [ $? -ne 0 ]; then
#    exit 1
#fi


# Set up the environment (set_env is defined elsewhere in this file), then
# remove the downloaded installers and the scratch listing file.
set_env
cd ${AUTO_INSTALL}
rm -f ${AUTO_INSTALL}/tmp
rm -rf ${AUTO_INSTALL}/cuda
rm -f ${AUTO_INSTALL}/NVIDIA*
rm -f ${AUTO_INSTALL}/nvidia*
rm -rf ${AUTO_INSTALL}/cudnn*


#######################################
# Print the login banner headline naming the installed AIACC products.
# Arguments: $1 - "TRUE" when AIACC-Training was installed
#            $2 - "TRUE" when AIACC-Inference was installed
# Outputs:   the headline on stdout
#######################################
aiacc_banner_text()
{
    # Anything other than "TRUE" (FALSE, NULL, empty, ...) counts as "not
    # installed". Testing for the literal "FALSE" would announce both products
    # whenever the other switch holds some different non-TRUE value.
    if [ "$1" = "TRUE" ] && [ "$2" != "TRUE" ]; then
        echo "###### AIACC-Training  has installed on your machine! "
    elif [ "$1" != "TRUE" ] && [ "$2" = "TRUE" ]; then
        echo "###### AIACC-Inference  has installed on your machine! "
    else
        echo "###### AIACC-Training and AIACC-Inference has installed on your machine! "
    fi
}

# The shared miniconda base is needed by both AIACC products.
if [ "${is_install_aiacc_train}" = "TRUE" -o "${is_install_aiacc_inference}" = "TRUE" ]; then
    install_aiacc_base >> $log 2>&1
    if [ $? -ne 0 ]; then
        exit 1
    fi
    echo "AIACC miniconda base install OK!!!" >> $log 2>&1
fi



if [ "${is_install_aiacc_train}" = "TRUE" ]; then
    install_aiacc_train >> $log 2>&1
    if [ $? -ne 0 ]; then
        exit 1
    fi
    echo "AIACC-Traing ENV install OK!!!" >> $log 2>&1
fi

if [ "${is_install_aiacc_inference}" = "TRUE" ]; then
    install_aiacc_inference >> $log 2>&1
    if [ $? -ne 0 ]; then
        exit 1
    fi
    echo "AIACC-Inference ENV install OK!!!" >> $log 2>&1
fi

# Append a banner to root's login profile that names the installed products
# and shows how to activate the AIACC conda environments.
if [ "${is_install_aiacc_train}" = "TRUE" -o "${is_install_aiacc_inference}" = "TRUE" ]; then
    echo "echo " >> ${profile_file}
    echo "echo \"$(aiacc_banner_text "${is_install_aiacc_train}" "${is_install_aiacc_inference}")\" " >> ${profile_file}

    echo "echo -e \"###### Please execute [ \033[31m . /root/miniconda/etc/profile.d/conda.sh \033[0m ] to init AIACC miniconda. \" " >> ${profile_file}
    echo "echo -e \"###### You can execute [ \033[31m conda env list \033[0m ] to check the AIACC miniconda envs. \" " >> ${profile_file}
    echo "echo -e \"###### Please activate env with [ \033[31m conda activate AIACC environments name \033[0m ] eg: 'conda activate aiacct_tf1.15_tr1.4.0_mx1.5.0_cu10.0_py36', 'conda activate aiaccix_1.2.0a0' \" " >> ${profile_file}
    echo "echo " >> ${profile_file}
fi


# Optional RAPIDS installation.
if [ "${is_install_rapids}" = "TRUE" ]; then
    install_rapids >> $log 2>&1 || exit 1
    echo "RAPIDS install OK!!!" >> $log 2>&1
fi


# RDMA stage 1, only on sccgn7ex instances when RDMA was requested: run the
# aiacc-scc-rdma.sh script from ${AUTO_INSTALL} with mode 1.
if [ "$INSTANCE_FAMILY" = "sccgn7ex" ] && [ "$is_install_rdma" = "TRUE" ]; then
    #install scc begin
    echo "$os install scc 1 begin ... " >> $log 2>&1
    cd ${AUTO_INSTALL}
    if ! sh aiacc-scc-rdma.sh 1 $os; then
        echo "INSTALL_ERROR: $INSTANCE_FAMILY $RDMA_PEER_MEM_FAIL_STR ! " >> $log 2>&1
        exit 1
    fi

    echo "install scc end ... " >> $log 2>&1
    #install scc end
fi


# Report the elapsed installation time on stdout and in the log.
end=$(date '+%s')
time_install=$((end-begin))
echo "Install using time $time_install !"
echo "Install using time $time_install !" >> $log 2>&1

# Record the loaded nvidia kernel modules and the nvidia-smi output.
lsmod |grep nvidia >> $log 2>&1
nvidia-smi >> $log 2>&1

#disable GSP
# NOTE(review): "\>" is a string comparison, so every driver_version that
# sorts after "510" takes this branch, non-numeric values included. Confirm
# that this is intended.
if [ "$driver_version" \> "510" ] ;then
    case "$INSTANCE_FAMILY" in
        gn5|gn5i|ebmgn5i|ebmgn5|ebmgn6v|ebmgn6e|gn6v|gn6e)
            echo "$INSTANCE_FAMILY not support GSP" >> $log 2>&1
            ;;
        *)
            echo options nvidia NVreg_EnableGpuFirmware=0 > /etc/modprobe.d/nvidia-gsp.conf
            ;;
    esac
fi

# Write the success marker to the log, refresh the linker cache and reboot
# after a one-minute pause.
echo  ${SUCCESS_STR} >> $log 2>&1
ldconfig
echo "reboot......" >> $log 2>&1
sleep 60
reboot
0

评论区