You signed in with another tab or window. Reload to refresh your session.You signed out in another tab or window. Reload to refresh your session.You switched accounts on another tab or window. Reload to refresh your session.Dismiss alert
# Use one of the GPU based XDL images as the parent image.
FROM registry.cn-hangzhou.aliyuncs.com/xdl/xdl:ubuntu-gpu-mxnet1.3
# Alternative parent image:
#FROM registry.cn-hangzhou.aliyuncs.com/xdl/xdl:ubuntu-gpu-tf1.12

# Remove Nvidia related packages from the parent image to avoid conflict with
# the host Nvidia driver's libraries and tools.
# The pattern is quoted so the shell hands it to apt-get verbatim instead of
# expanding it against files in the build working directory; apt-get (not apt)
# is used because apt's CLI is not meant for non-interactive scripts.
RUN apt-get remove -y 'nvidia-*'

# Each ENV is a separate instruction on purpose: CUDA_PKG_VERSION references
# CUDA_VERSION, which is only visible to later instructions.
ENV CUDA_VERSION=9.0.176
ENV CUDA_PKG_VERSION="9-0=${CUDA_VERSION}-1"

# nvidia-docker 1.0
LABEL com.nvidia.volumes.needed="nvidia_driver"
LABEL com.nvidia.cuda.version="${CUDA_VERSION}"

# Register the driver library directories with the dynamic linker.
# Note the space after `echo`: without it the shell looks for a command
# literally named `echo/usr/local/nvidia/lib` and this step fails.
RUN echo "/usr/local/nvidia/lib" >> /etc/ld.so.conf.d/nvidia.conf && \
    echo "/usr/local/nvidia/lib64" >> /etc/ld.so.conf.d/nvidia.conf

ENV PATH=/usr/local/nvidia/bin:/usr/local/cuda/bin:${PATH}
ENV LD_LIBRARY_PATH=/usr/local/nvidia/lib:/usr/local/nvidia/lib64

# nvidia-container-runtime
# Add these environment variables, so nvidia-docker will map the driver
# libraries and tools from the host to the container.
# Refer to: https://devblogs.nvidia.com/gpu-containers-runtime
ENV NVIDIA_VISIBLE_DEVICES=all
ENV NVIDIA_DRIVER_CAPABILITIES=compute,utility
ENV NVIDIA_REQUIRE_CUDA="cuda>=9.0"
>>>>>>>>>>>> 0 >>>>>>>>>>>
cuda alloc cuda alloc save checkpoint at global_step[2], ckpt version[ckpt-...................2]
2022-12-20 08:42:26,839 INFO sasl.SaslDataTransferClient: SASL encryption trust check: localHostTrusted = false, remoteHostTrusted = false
......
hdfsOpenFile(hdfs://localhost:9000//user/root/tdm-gpu/tdm_mock_train/checkpoint/checkpoints): FileSystem#open((Lorg/apache/hadoop/fs/Path;I)Lorg/apache/hadoop/fs/FSDataInputStream;) error:RemoteException: File does not exist: /user/root/tdm-gpu/tdm_mock_train/checkpoint/checkpoints at org.apache.hadoop.hdfs.server.namenode.INodeFile.valueOf(INodeFile.java:86) at org.apache.hadoop.hdfs.server.namenode.INodeFile.valueOf(INodeFile.java:76) at org.apache.hadoop.hdfs.server.namenode.FSDirStatAndListingOp.getBlockLocations(FSDirStatAndListingOp.java:153) at org.apache.hadoop.hdfs.server.namenode.FSNamesystem.getBlockLocations(FSNamesystem.java:1946) at org.apache.hadoop.hdfs.server.namenode.NameNodeRpcServer.getBlockLocations(NameNodeRpcServer.java:755) at org.apache.hadoop.hdfs.protocolPB.ClientNamenodeProtocolServerSideTranslatorPB.getBlockLocations(ClientNamenodeProtocolServerSideTranslatorPB.java:433) at org.apache.hadoop.hdfs.protocol.proto.ClientNamenodeProtocolProtos$ClientNamenodeProtocol$2.callBlockingMethod(ClientNamenodeProtocolProtos.java) at org.apache.hadoop.ipc.ProtobufRpcEngine$Server$ProtoBufRpcInvoker.call(ProtobufRpcEngine.java:527) at org.apache.hadoop.ipc.RPC$Server.call(RPC.java:1036) at org.apache.hadoop.ipc.Server$RpcCall.run(Server.java:1000) at org.apache.hadoop.ipc.Server$RpcCall.run(Server.java:928) at java.security.AccessController.doPrivileged(Native Method) at javax.security.auth.Subject.doAs(Subject.java:422) at org.apache.hadoop.security.UserGroupInformation.doAs(UserGroupInformation.java:1729) at org.apache.hadoop.ipc.Server$Handler.run(Server.java:2916)java.io.FileNotFoundException: File does not exist: /user/root/tdm-gpu/tdm_mock_train/checkpoint/checkpoints at org.apache.hadoop.hdfs.server.namenode.INodeFile.valueOf(INodeFile.java:86) at org.apache.hadoop.hdfs.server.namenode.INodeFile.valueOf(INodeFile.java:76) at org.apache.hadoop.hdfs.server.namenode.FSDirStatAndListingOp.getBlockLocations(FSDirStatAndListingOp.java:153) at 
org.apache.hadoop.hdfs.server.namenode.FSNamesystem.getBlockLocations(FSNamesystem.java:1946) at org.apache.hadoop.hdfs.server.namenode.NameNodeRpcServer.getBlockLocations(NameNodeRpcServer.java:755) at org.apache.hadoop.hdfs.protocolPB.ClientNamenodeProtocolServerSideTranslatorPB.getBlockLocations(ClientNamenodeProtocolServerSideTranslatorPB.java:433) at org.apache.hadoop.hdfs.protocol.proto.ClientNamenodeProtocolProtos$ClientNamenodeProtocol$2.callBlockingMethod(ClientNamenodeProtocolProtos.java) at org.apache.hadoop.ipc.ProtobufRpcEngine$Server$ProtoBufRpcInvoker.call(ProtobufRpcEngine.java:527) at org.apache.hadoop.ipc.RPC$Server.call(RPC.java:1036) at org.apache.hadoop.ipc.Server$RpcCall.run(Server.java:1000) at org.apache.hadoop.ipc.Server$RpcCall.run(Server.java:928) at java.security.AccessController.doPrivileged(Native Method) at javax.security.auth.Subject.doAs(Subject.java:422) at org.apache.hadoop.security.UserGroupInformation.doAs(UserGroupInformation.java:1729) at org.apache.hadoop.ipc.Server$Handler.run(Server.java:2916) at sun.reflect.NativeConstructorAccessorImpl.newInstance0(Native Method) at sun.reflect.NativeConstructorAccessorImpl.newInstance(NativeConstructorAccessorImpl.java:62) at sun.reflect.DelegatingConstructorAccessorImpl.newInstance(DelegatingConstructorAccessorImpl.java:45) at java.lang.reflect.Constructor.newInstance(Constructor.java:423) at org.apache.hadoop.ipc.RemoteException.instantiateException(RemoteException.java:121) at org.apache.hadoop.ipc.RemoteException.unwrapRemoteException(RemoteException.java:88) at org.apache.hadoop.hdfs.DFSClient.callGetBlockLocations(DFSClient.java:865) at org.apache.hadoop.hdfs.DFSClient.getLocatedBlocks(DFSClient.java:852) at org.apache.hadoop.hdfs.DFSClient.getLocatedBlocks(DFSClient.java:841) at org.apache.hadoop.hdfs.DFSClient.open(DFSClient.java:1005) at org.apache.hadoop.hdfs.DistributedFileSystem$4.doCall(DistributedFileSystem.java:320) at 
org.apache.hadoop.hdfs.DistributedFileSystem$4.doCall(DistributedFileSystem.java:316) at org.apache.hadoop.fs.FileSystemLinkResolver.resolve(FileSystemLinkResolver.java:81) at org.apache.hadoop.hdfs.DistributedFileSystem.open(DistributedFileSystem.java:328)Caused by: org.apache.hadoop.ipc.RemoteException(java.io.FileNotFoundException): File does not exist: /user/root/tdm-gpu/tdm_mock_train/checkpoint/checkpoints at org.apache.hadoop.hdfs.server.namenode.INodeFile.valueOf(INodeFile.java:86) at org.apache.hadoop.hdfs.server.namenode.INodeFile.valueOf(INodeFile.java:76) at org.apache.hadoop.hdfs.server.namenode.FSDirStatAndListingOp.getBlockLocations(FSDirStatAndListingOp.java:153) at org.apache.hadoop.hdfs.server.namenode.FSNamesystem.getBlockLocations(FSNamesystem.java:1946) at org.apache.hadoop.hdfs.server.namenode.NameNodeRpcServer.getBlockLocations(NameNodeRpcServer.java:755) at org.apache.hadoop.hdfs.protocolPB.ClientNamenodeProtocolServerSideTranslatorPB.getBlockLocations(ClientNamenodeProtocolServerSideTranslatorPB.java:433) at org.apache.hadoop.hdfs.protocol.proto.ClientNamenodeProtocolProtos$ClientNamenodeProtocol$2.callBlockingMethod(ClientNamenodeProtocolProtos.java) at org.apache.hadoop.ipc.ProtobufRpcEngine$Server$ProtoBufRpcInvoker.call(ProtobufRpcEngine.java:527) at org.apache.hadoop.ipc.RPC$Server.call(RPC.java:1036) at org.apache.hadoop.ipc.Server$RpcCall.run(Server.java:1000) at org.apache.hadoop.ipc.Server$RpcCall.run(Server.java:928) at java.security.AccessController.doPrivileged(Native Method) at javax.security.auth.Subject.doAs(Subject.java:422) at org.apache.hadoop.security.UserGroupInformation.doAs(UserGroupInformation.java:1729) at org.apache.hadoop.ipc.Server$Handler.run(Server.java:2916) at org.apache.hadoop.ipc.Client.getRpcResponse(Client.java:1545) at org.apache.hadoop.ipc.Client.call(Client.java:1491) at org.apache.hadoop.ipc.Client.call(Client.java:1388) at 
org.apache.hadoop.ipc.ProtobufRpcEngine$Invoker.invoke(ProtobufRpcEngine.java:233) at org.apache.hadoop.ipc.ProtobufRpcEngine$Invoker.invoke(ProtobufRpcEngine.java:118) at com.sun.proxy.$Proxy9.getBlockLocations(Unknown Source) at org.apache.hadoop.hdfs.protocolPB.ClientNamenodeProtocolTranslatorPB.getBlockLocations(ClientNamenodeProtocolTranslatorPB.java:321) at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method) at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62) at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43) at java.lang.reflect.Method.invoke(Method.java:498) at org.apache.hadoop.io.retry.RetryInvocationHandler.invokeMethod(RetryInvocationHandler.java:422) at org.apache.hadoop.io.retry.RetryInvocationHandler$Call.invokeMethod(RetryInvocationHandler.java:165) at org.apache.hadoop.io.retry.RetryInvocationHandler$Call.invoke(RetryInvocationHandler.java:157) at org.apache.hadoop.io.retry.RetryInvocationHandler$Call.invokeOnce(RetryInvocationHandler.java:95) at org.apache.hadoop.io.retry.RetryInvocationHandler.invoke(RetryInvocationHandler.java:359) at com.sun.proxy.$Proxy10.getBlockLocations(Unknown Source) at org.apache.hadoop.hdfs.DFSClient.callGetBlockLocations(DFSClient.java:863) ... 
7 more
2022-12-20 08:42:27,456 INFO sasl.SaslDataTransferClient: SASL encryption trust check: localHostTrusted = false, remoteHostTrusted = false
result is None, finished success.
start put item_emb
2022-12-20 08:42:27,513 INFO sasl.SaslDataTransferClient: SASL encryption trust check: localHostTrusted = false, remoteHostTrusted = false
GetLatestCheckpoint [hdfs://localhost:9000//user/root/tdm-gpu/tdm_mock_train/checkpoint//ckpt-...................2]
Start convert [item_emb], part[0]
finish convert [item_emb], part[0], id_cnt[0], duration[15ms]
finish convert ckpt[hdfs://localhost:9000//user/root/tdm-gpu/tdm_mock_train/checkpoint/], duration[72ms]
rm -f data/item_emb
hadoop fs -get hdfs://localhost:9000//user/root/tdm-gpu/emb_converted//item_emb data/item_emb
sed -i 's/..//' data/item_emb
hadoop fs -put -f data/item_emb hdfs://localhost:9000//user/root/tdm-gpu/emb_converted/
finish put item_emb
Exception AttributeError: "'NoneType' object has no attribute 'NNSymbolFree'" in <object repr() failed> ignored
......
The text was updated successfully, but these errors were encountered:
问题描述
我根据 TDM 训练示例执行命令,在执行 `train.py` 文件时,系统显示了许多异常。虽然最后打印了 `finish put item_emb` 并正常退出,但我查看 `$EMB_HDFSPATH` 下生成的 `item_emb` 文件,发现该文件为空。执行时的异常信息见本文所附日志。
运行环境
显卡:NVIDIA T4 GPU
系统:TensorFlow 2.8.0 + Miniconda + OpenCV 4 + Python 3.9.12 + CUDA11.2.2 + cuDNN8.2.1 + GPU驱动版本 460.106.00
环境配置说明
最开始按照 TDM 官方 Wiki,使用官方 GPU 镜像进行实验,在执行“单机试验小数据集”的训练部分时报错:`Cuda Stream Create Error Error Code: Aborted (core dumped)`。Wiki 上有人说需要宿主机包含 nvidia-driver,但我的服务器确实装有显卡驱动。后来看到一个 Issue(“官方GPU镜像存在问题,镜像内的nvidia-driver与宿主机的nvidia-driver冲突 #116”),作者提交的修改后的 `Dockerfile_for_gpu` 获得了官方肯定,于是我据此修改镜像。但是作者的 Dockerfile 并没有跑成功,于是我按照其说明添加环境变量,并删除镜像内的 nvidia-driver。随后进入修改后的容器,重新按照步骤执行,在执行 `train.py` 文件时,出现了这一问题。修改后的文件(`Dockerfile_driver_remove`)如下所示。通过如下命令,生成容器。
详细报错信息
The text was updated successfully, but these errors were encountered: