Docker - 开启gpu

2022-08-30

查看显卡

# List PCI devices and keep only the lines that mention NVIDIA.
lspci | grep -F 'NVIDIA'

资料

nvidia-docker

cuda下载

cudnn下载

启用Docker虚拟机GPU,加速深度学习

测试gpu效果

镜像源

接入日志服务

# Install the Alibaba Cloud Logtail agent for the cn-hangzhou region.
# The steps are chained with && so that a failed download never leads to
# chmod-ing and executing an empty or partial logtail.sh.
wget http://logtail-release-cn-hangzhou.oss-cn-hangzhou-internal.aliyuncs.com/linux64/logtail.sh -O logtail.sh \
  && chmod 755 logtail.sh \
  && ./logtail.sh install cn-hangzhou
# Write the user-defined identifier for this machine.
echo zhejiang-haigang > /etc/ilogtail/user_defined_id
# NOTE(review): "xxxx" looks like a placeholder (presumably the cloud
# account ID) - replace it with the real value before running.
touch /etc/ilogtail/users/xxxx
# Restart the agent after changing its identity files.
/etc/init.d/ilogtaild restart

升级内核

# Register the ELRepo repository (EL7 release package).
yum install -y https://www.elrepo.org/elrepo-release-7.el7.elrepo.noarch.rpm
# Install the mainline kernel and its development package from the
# elrepo-kernel repo, which is enabled only for this one command.
yum --enablerepo=elrepo-kernel install -y kernel-ml kernel-ml-devel
# Select GRUB menu entry 0 as the default boot entry, then reboot.
# NOTE(review): assumes the freshly installed kernel is entry 0 - confirm.
grub2-set-default 0
reboot

升级gcc?

# Install the devtoolset-8 compilers from Software Collections.
yum install -y centos-release-scl
# The glob is quoted so that yum, not the shell, expands it; unquoted, a
# matching file name in the current directory would replace the pattern.
yum install -y 'devtoolset-8-gcc*'
# Starts a new bash with devtoolset-8 enabled; when typed interactively,
# the commands below then run inside that nested shell.
scl enable  devtoolset-8 bash
# Keep the stock compilers under a versioned name and point /usr/bin at the
# devtoolset-8 binaries so they are used outside the scl shell as well.
mv /usr/bin/gcc /usr/bin/gcc-4.8.5
ln -s /opt/rh/devtoolset-8/root/bin/gcc /usr/bin/gcc
mv /usr/bin/g++ /usr/bin/g++-4.8.5
ln -s /opt/rh/devtoolset-8/root/bin/g++ /usr/bin/g++

安装cuda?

# Runfile chosen on the CUDA download page:
# https://developer.nvidia.com/cuda-downloads?target_os=Linux&target_arch=x86_64&Distribution=CentOS&target_version=7&target_type=runfile_local
cuda_runfile=cuda_11.5.0_495.29.05_linux.run
# Download the local installer into /usr/local/src and run it as root.
cd /usr/local/src
wget "https://developer.download.nvidia.com/compute/cuda/11.5.0/local_installers/${cuda_runfile}"
sudo sh "${cuda_runfile}"

安装docker

# Install Docker from the RPM bundle published for this version.
VERSION=19.03.5
URL="http://aliacs-k8s-cn-hangzhou.oss-cn-hangzhou.aliyuncs.com/public/pkg/docker/docker-${VERSION}.tar.gz"
# -f: fail on HTTP errors instead of saving the error page as the tarball.
# -sS: no progress meter, but still print errors (the original "-ss" only
#      repeated --silent and hid them).
curl -fsSL "$URL" -o "/tmp/docker-${VERSION}.tar.gz"
cd /tmp
tar -xf "docker-${VERSION}.tar.gz"
# The archive unpacks to pkg/docker/<version>/rpm.
cd "/tmp/pkg/docker/${VERSION}/rpm"
# Glob the packages directly rather than parsing the output of ls.
yum localinstall -y ./*.rpm
# Start Docker at boot and (re)start it now.
systemctl enable docker && systemctl restart docker

docker支持GPU

# Settle the distribution string used in NVIDIA's repo URLs BEFORE any
# download, so both .repo files are fetched for the same distribution.
distribution=$(. /etc/os-release; echo "${ID}${VERSION_ID}")
# Override for non-standard operating systems (e.g. Aliyun Linux 2).
# Delete this line to use the value detected from /etc/os-release instead.
distribution="centos7"
curl -s -L "https://nvidia.github.io/nvidia-docker/${distribution}/nvidia-docker.repo" \
  | sudo tee /etc/yum.repos.d/nvidia-docker.repo
curl -s -L "https://nvidia.github.io/nvidia-container-runtime/${distribution}/nvidia-container-runtime.repo" \
  | sudo tee /etc/yum.repos.d/nvidia-container-runtime.repo

yum clean expire-cache
yum install -y nvidia-docker2 nvidia-container-runtime
# Insert "default-runtime": "nvidia" before the line matching "runtimes" in
# daemon.json. Guarded with grep so that re-running these steps does not add
# the key a second time.
# NOTE(review): assumes daemon.json already contains a "runtimes" line
# (presumably written by the nvidia-docker2 package) - confirm.
grep -q '"default-runtime"' /etc/docker/daemon.json \
  || sed  -i '/runtimes/i\\    "default-runtime": "nvidia",' /etc/docker/daemon.json
systemctl restart docker

# Enable persistence mode on the GPU driver.
nvidia-smi --persistence-mode=1

# Verify that the GPU is usable from a container: run nvidia-smi inside a
# throw-away CUDA base image with all GPUs made visible.
docker run --rm --env NVIDIA_VISIBLE_DEVICES=all \
  ccr.ccs.tencentyun.com/njq-apps/nvidia-cuda:11.0-base nvidia-smi