Cause of the error
The following error is raised while running training:
RuntimeError: DataLoader worker (pid 6209) is killed by signal: Bus error.
It is possible that dataloader's workers are out of shared memory.
Please try to raise your shared memory limit.
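PyTorch DataLoader workers pass tensors between processes through shared memory mounted at /dev/shm, and Docker gives a container only 64 MB of /dev/shm by default, which several workers exhaust quickly. A quick way to confirm the limit from inside the container (a minimal check, nothing beyond standard df assumed):
// Check the shared-memory limit inside the container; the Docker default shows as 64M
df -h /dev/shm
Setting num_workers=0 in the DataLoader sidesteps the problem at the cost of single-process loading; the durable fix is to enlarge the container's shm, as in the steps below.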
Solution:
- Stop Docker
// Stop the Docker service; note that the docker ps / docker inspect lookups in the next step need a running daemon, so do them first
systemctl stop docker
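Stopping the daemon also stops its running containers unless live-restore is enabled, so plan for the interruption. To confirm it is down:
// Prints "inactive" once the daemon has stopped
systemctl is-active docker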
- Find the ID of the container to modify
gpu@gpu-workstation:~$ docker ps -a
CONTAINER ID IMAGE COMMAND CREATED STATUS PORTS NAMES
71050781f79d kaggle "--shm-size 8G" 5 minutes ago Created 6006/tcp, 0.0.0.0:8890->8888/tcp, :::8890->8888/tcp kaggle
485f0e25b37c lab_vm "jupyter notebook --…" 3 weeks ago Up About a minute 6006/tcp, 0.0.0.0:8889->8888/tcp, :::8889->8888/tcp lab_vm
209fad8c7a0d ufoym/deepo:latest "jupyter notebook --…" 3 weeks ago Up 4 hours 6006/tcp, 0.0.0.0:8888->8888/tcp, :::8888->8888/tcp recursing_golick
8dc30befdfa8 portainer/portainer "/portainer" 3 weeks ago Up 4 hours 0.0.0.0:8000->9000/tcp, :::8000->9000/tcp portainerUI
gpu@gpu-workstation:~$ docker inspect 485f0e25b37c | grep Id
"Id": "485f0e25b37cbe2bc09312356e2ad23a63ae237f6ea74cf084e298a0f3fd5f55",
- Locate the hostconfig.json file (on this machine the Docker data root is /home/docker; the stock default is /var/lib/docker)
gpu@gpu-workstation:/$ cd /home/docker
gpu@gpu-workstation:/home/docker$ ls
buildkit containers image network overlay2 plugins runtimes swarm tmp trust volumes
gpu@gpu-workstation:/home/docker$ cd containers
gpu@gpu-workstation:/home/docker/containers$ ls
209fad8c7a0db43a836c5a8e6835cfa4938be23f43884e1f34459d6d45d3970a 71050781f79d36864b2b7bb42dc2b60b17c5dc8eedce7426e96ad8ee1d4c546b
485f0e25b37cbe2bc09312356e2ad23a63ae237f6ea74cf084e298a0f3fd5f55 8dc30befdfa8c6b5606f3fa3f3d430af2b657817569c3ff0b2c888714b9ed719
gpu@gpu-workstation:/home/docker/containers$ cd 485f0e25b37cbe2bc09312356e2ad23a63ae237f6ea74cf084e298a0f3fd5f55
gpu@gpu-workstation:/home/docker/containers/485f0e25b37cbe2bc09312356e2ad23a63ae237f6ea74cf084e298a0f3fd5f55$ ls
485f0e25b37cbe2bc09312356e2ad23a63ae237f6ea74cf084e298a0f3fd5f55-json.log checkpoints config.v2.json hostconfig.json hostname hosts mounts resolv.conf resolv.conf.hash
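With the daemon stopped, the current value can be read straight from the file before editing it:
// The Docker default 64 MB appears as "ShmSize":67108864
grep -o '"ShmSize":[0-9]*' hostconfig.json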
- Change ShmSize to the desired size in bytes, here 8259460864 (about 7.7 GiB):
"ShmSize":8259460864
gpu@gpu-workstation:/home/docker/containers/485f0e25b37cbe2bc09312356e2ad23a63ae237f6ea74cf084e298a0f3fd5f55$ vim hostconfig.json
{"Binds":["dl-vol:/home/common-dir"],"ContainerIDFile":"","LogConfig":{"Type":"json-file","Config":{"max-file":"1","max-size":"50m"}},"NetworkMode":"default","PortBindings":{"8888/tcp":[{"HostIp":"","HostPort":"8889"}]},"RestartPolicy":{"Name":"no","MaximumRetryCount":0},"AutoRemove":false,"VolumeDriver":"","VolumesFrom":null,"CapAdd":null,"CapDrop":null,"CgroupnsMode":"host","Dns":[],"DnsOptions":[],"DnsSearch":[],"ExtraHosts":null,"GroupAdd":null,"IpcMode":"private","Cgroup":"","Links":null,"OomScoreAdj":0,"PidMode":"","Privileged":false,"PublishAllPorts":false,"ReadonlyRootfs":false,"SecurityOpt":null,"UTSMode":"","UsernsMode":"","ShmSize":8259460864,"Runtime":"nvidia","ConsoleSize":[0,0],"Isolation":"","CpuShares":0,"Memory":0,"NanoCpus":0,"CgroupParent":"","BlkioWeight":0,"BlkioWeightDevice":[],"BlkioDeviceReadBps":null,"BlkioDeviceWriteBps":null,"BlkioDeviceReadIOps":null,"BlkioDeviceWriteIOps":null,"CpuPeriod":0,"CpuQuota":0,"CpuRealtimePeriod":0,"CpuRealtimeRuntime":0,"CpusetCpus":"","CpusetMems":"","Devices":[],"DeviceCgroupRules":null,"DeviceRequests":null,"KernelMemory":0,"KernelMemoryTCP":0,"MemoryReservation":0,"MemorySwap":0,"MemorySwappiness":null,"OomKillDisable":false,"PidsLimit":null,"Ulimits":null,"CpuCount":0,"CpuPercent":0,"IOMaximumIOps":0,"IOMaximumBandwidth":0,"MaskedPaths":["/proc/asound","/proc/acpi","/proc/kcore","/proc/keys","/proc/latency_stats","/proc/timer_list","/proc/timer_stats","/proc/sched_debug","/proc/scsi","/sys/firmware"],"ReadonlyPaths":["/proc/bus","/proc/fs","/proc/irq","/proc/sys","/proc/sysrq-trigger"]}
- Reboot the server, then start the containers and enter the container to confirm.
docker ps -aq | xargs -I {} docker start {}
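If you restarted only the Docker service instead of rebooting, start the daemon before the containers, then read the value back:
// Start the daemon, then confirm the new limit was picked up (should print 8259460864)
systemctl start docker
docker inspect -f '{{.HostConfig.ShmSize}}' 485f0e25b37c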
root@485f0e25b37c:/# df
Filesystem 1K-blocks Used Available Use% Mounted on
overlay 959862832 37967920 873066752 5% /
tmpfs 65536 0 65536 0% /dev
tmpfs 24633980 0 24633980 0% /sys/fs/cgroup
shm 8065880 0 8065880 0% /dev/shm
/dev/sda2 959862832 37967920 873066752 5% /home/common-dir
tmpfs 24633980 12 24633968 1% /proc/driver/nvidia
udev 24588824 0 24588824 0% /dev/nvidia0
tmpfs 24633980 0 24633980 0% /proc/asound
tmpfs 24633980 0 24633980 0% /proc/acpi
tmpfs 24633980 0 24633980 0% /proc/scsi
tmpfs 24633980 0 24633980 0% /sys/firmware
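The shm line now reports 8065880 1K-blocks, matching the configured ShmSize of 8259460864 bytes (about 7.7 GiB), so the change took effect and the DataLoader workers have room to share tensors.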