Hello @mlauber71 - would you happen to have a similar metanode for environment propagation for GPU execution of Keras? My CPU Keras execution works just fine (tested using the Image_Classification_MNIST_Solution workflow). When I switch to GPU Keras execution, the analysis hangs during the first second of GPU usage.
The Python packages that KNIME installed for the Keras GPU environment are listed below:
cmd: C:\Users..\miniforge3\Scripts\conda-env-script.py create --file C:\Users..\Desktop\knime_4.7.7\plugins\org.knime.python2.envconfigs_4.7.0.v202211181342\envconfigs\windows\py36_knime_dl_gpu.yml --name py3_knime_kerasv2_gpu --json
conda version: 23.3.1
+anaconda/win-64::hdf5-1.8.18-vc14h7a021fe_0
+anaconda/win-64::icu-58.2-vc14hc45fdbb_0
+anaconda/win-64::jpeg-9b-vc14h4d7706e_1
+anaconda/win-64::pixman-0.34.0-vc14h00fde18_1
+anaconda/win-64::qt-5.9.7-vc14h73c81de_0
+anaconda/win-64::yaml-0.1.7-vc14h4cb57cf_1
+anaconda/win-64::zlib-1.2.11-vc14h1cdd9ab_1
+conda-forge/win-64::arrow-cpp-5.0.0-py36he9238d2_8_cpu
+conda-forge/win-64::aws-c-cal-0.5.11-he19cf47_0
+conda-forge/win-64::aws-c-event-stream-0.2.7-h70e1b0c_13
+conda-forge/win-64::aws-c-io-0.10.5-h2fe331c_0
+conda-forge/win-64::aws-checksums-0.1.11-h1e232aa_7
+conda-forge/win-64::aws-sdk-cpp-1.8.186-hb0612c5_3
+conda-forge/win-64::grpc-cpp-1.40.0-h2431d41_1
+conda-forge/win-64::jpype1-0.6.3-py36h79cbd7a_1001
+conda-forge/win-64::libprotobuf-3.18.0-h7755175_1
+conda-forge/win-64::protobuf-3.18.0-py36he2d232f_0
+conda-forge/win-64::pyarrow-5.0.0-py36h6720e24_8_cpu
+conda-forge/win-64::python_abi-3.6-2_cp36m
+conda-forge/win-64::re2-2021.09.01-h0e60522_0
+defaults/noarch::absl-py-0.15.0-pyhd3eb1b0_0
+defaults/noarch::attrs-21.4.0-pyhd3eb1b0_0
+defaults/noarch::backcall-0.2.0-pyhd3eb1b0_0
+defaults/noarch::colorama-0.4.4-pyhd3eb1b0_0
+defaults/noarch::cycler-0.11.0-pyhd3eb1b0_0
+defaults/noarch::dataclasses-0.8-pyh4f3eec9_6
+defaults/noarch::decorator-5.1.1-pyhd3eb1b0_0
+defaults/noarch::gast-0.5.3-pyhd3eb1b0_0
+defaults/noarch::importlib_metadata-4.8.1-hd3eb1b0_0
+defaults/noarch::ipython_genutils-0.2.0-pyhd3eb1b0_1
+defaults/noarch::jsonschema-3.2.0-pyhd3eb1b0_2
+defaults/noarch::keras-applications-1.0.8-py_1
+defaults/noarch::keras-preprocessing-1.1.2-pyhd3eb1b0_0
+defaults/noarch::parquet-cpp-1.5.1-h34088ae_4
+defaults/noarch::parso-0.8.3-pyhd3eb1b0_0
+defaults/noarch::pickleshare-0.7.5-pyhd3eb1b0_1003
+defaults/noarch::prompt_toolkit-2.0.10-py_0
+defaults/noarch::py4j-0.10.9.2-pyhd3eb1b0_0
+defaults/noarch::pygments-2.11.2-pyhd3eb1b0_0
+defaults/noarch::pyparsing-3.0.4-pyhd3eb1b0_0
+defaults/noarch::pytz-2021.3-pyhd3eb1b0_0
+defaults/noarch::six-1.16.0-pyhd3eb1b0_1
+defaults/noarch::typing_extensions-4.1.1-pyh06a4308_0
+defaults/noarch::wcwidth-0.2.5-pyhd3eb1b0_0
+defaults/noarch::werkzeug-2.0.3-pyhd3eb1b0_0
+defaults/noarch::wheel-0.37.1-pyhd3eb1b0_0
+defaults/noarch::zipp-3.6.0-pyhd3eb1b0_0
+defaults/win-64::_tflow_select-2.1.0-gpu
+defaults/win-64::abseil-cpp-20210324.2-hd77b12b_0
+defaults/win-64::astor-0.8.1-py36haa95532_0
+defaults/win-64::aws-c-common-0.6.2-h2bbff1b_0
+defaults/win-64::blas-1.0-mkl
+defaults/win-64::bzip2-1.0.8-he774522_0
+defaults/win-64::c-ares-1.19.1-h2bbff1b_0
+defaults/win-64::ca-certificates-2023.08.22-haa95532_0
+defaults/win-64::cairo-1.14.12-hf171d8a_3
+defaults/win-64::certifi-2021.5.30-py36haa95532_0
+defaults/win-64::coverage-5.5-py36h2bbff1b_2
+defaults/win-64::cudatoolkit-9.0-1
+defaults/win-64::cudnn-7.6.5-cuda9.0_0
+defaults/win-64::cython-0.29.24-py36hd77b12b_0
+defaults/win-64::freetype-2.10.4-hd328e21_0
+defaults/win-64::gflags-2.2.2-ha925a31_0
+defaults/win-64::glog-0.5.0-hd77b12b_0
+defaults/win-64::grpcio-1.36.1-py36hc60d5dd_1
+defaults/win-64::h5py-2.8.0-py36hf7173ca_0
+defaults/win-64::icc_rt-2022.1.0-h6049295_2
+defaults/win-64::importlib-metadata-4.8.1-py36haa95532_0
+defaults/win-64::intel-openmp-2023.1.0-h59b6b97_46319
+defaults/win-64::ipython-7.1.1-py36h39e3cac_0
+defaults/win-64::jedi-0.13.3-py36_0
+defaults/win-64::jupyter_core-4.8.1-py36haa95532_0
+defaults/win-64::keras-2.2.4-0
+defaults/win-64::keras-base-2.2.4-py36_0
+defaults/win-64::kiwisolver-1.3.1-py36hd77b12b_0
+defaults/win-64::lerc-3.0-hd77b12b_0
+defaults/win-64::libbrotlicommon-1.0.9-h2bbff1b_7
+defaults/win-64::libbrotlidec-1.0.9-h2bbff1b_7
+defaults/win-64::libbrotlienc-1.0.9-h2bbff1b_7
+defaults/win-64::libcurl-7.82.0-h86230a5_0
+defaults/win-64::libdeflate-1.17-h2bbff1b_0
+defaults/win-64::libpng-1.6.37-h2a8f88b_0
+defaults/win-64::libssh2-1.10.0-hcd4344a_2
+defaults/win-64::libthrift-0.15.0-he1d8c1a_0
+defaults/win-64::libtiff-4.5.0-h8a3f274_0
+defaults/win-64::libutf8proc-2.6.1-h2bbff1b_1
+defaults/win-64::lz4-c-1.9.3-h2bbff1b_1
+defaults/win-64::markdown-3.3.4-py36haa95532_0
+defaults/win-64::matplotlib-3.0.3-py36hc8f65d3_0
+defaults/win-64::mkl-2020.2-256
+defaults/win-64::mkl-service-2.3.0-py36h196d8e1_0
+defaults/win-64::mkl_fft-1.2.0-py36h45dec08_0
+defaults/win-64::mkl_random-1.1.1-py36h47e9c7a_0
+defaults/win-64::nbformat-4.4.0-py36_0
+defaults/win-64::numpy-1.16.1-py36h19fb1c0_1
+defaults/win-64::numpy-base-1.16.1-py36hc3f5095_1
+defaults/win-64::olefile-0.46-py36_0
+defaults/win-64::openssl-1.1.1v-h2bbff1b_0
+defaults/win-64::pandas-0.23.4-py36h830ac7b_0
+defaults/win-64::pillow-5.3.0-py36hdc69c19_0
+defaults/win-64::pip-21.2.2-py36haa95532_0
+defaults/win-64::pyqt-5.9.2-py36h6538335_2
+defaults/win-64::pyrsistent-0.17.3-py36he774522_0
+defaults/win-64::python-3.6.13-h3758d61_0
+defaults/win-64::python-dateutil-2.7.5-py36_0
+defaults/win-64::pywin32-228-py36hbaba5e8_1
+defaults/win-64::pyyaml-5.3.1-py36he774522_0
+defaults/win-64::scipy-1.1.0-py36h29ff71c_2
+defaults/win-64::setuptools-58.0.4-py36haa95532_0
+defaults/win-64::sip-4.19.8-py36h6538335_0
+defaults/win-64::snappy-1.1.9-h6c2663c_0
+defaults/win-64::sqlite-3.41.2-h2bbff1b_0
+defaults/win-64::tensorboard-1.12.2-py36h33f27b4_0
+defaults/win-64::tensorflow-1.12.0-gpu_py36ha5f9131_0
+defaults/win-64::tensorflow-base-1.12.0-gpu_py36h6e53903_0
+defaults/win-64::tensorflow-gpu-1.12.0-h0d30ee6_0
+defaults/win-64::termcolor-1.1.0-py36haa95532_1
+defaults/win-64::tk-8.6.12-h2bbff1b_0
+defaults/win-64::tornado-6.1-py36h2bbff1b_0
+defaults/win-64::traitlets-4.3.3-py36haa95532_0
+defaults/win-64::vc-14.2-h21ff451_1
+defaults/win-64::vs2015_runtime-14.27.29016-h5e58377_2
+defaults/win-64::wincertstore-0.2-py36h7fe50ca_0
+defaults/win-64::xz-5.4.2-h8cc25b3_0
+defaults/win-64::zstd-1.5.0-h19a0ad4_1
update specs: ['numpy=1.16.1', 'h5py=2.8', 'ipython=7.1', "pyarrow[version='>=5.0.0']", 'matplotlib=3.0', 'pillow=5.3', 'pandas=0.23', 'python=3.6', 'jedi=0.13', 'tensorflow-gpu=1.12.0', 'keras=2.2.4', 'pip', 'jpype1=0.6.3', 'cairo=1.14', 'python-dateutil=2.7', 'py4j', 'scipy=1.1', 'nbformat=4.4']
The error I am seeing in the log is:
2023-09-10 18:22:05,558 : ERROR : KNIME-Worker-46-Keras Network Learner 3:96 : : DLKerasLearnerNodeModel : Keras Network Learner : 3:96 : Blas GEMM launch failed : a.shape=(200, 100), b.shape=(100, 10), m=200, n=10, k=100
[[{{node output_1/MatMul}} = MatMul[T=DT_FLOAT, _class=["loc:@training/Adadelta/gradients/output_1/MatMul_grad/MatMul"], transpose_a=false, transpose_b=false, _device="/job:localhost/replica:0/task:0/device:GPU:0"](dense_1/Relu, output_1/kernel/read)]]
[[{{node loss/mul/_97}} = _Recv[client_terminated=false, recv_device="/job:localhost/replica:0/task:0/device:CPU:0", send_device="/job:localhost/replica:0/task:0/device:GPU:0", send_device_incarnation=1, tensor_name="edge_605_loss/mul", tensor_type=DT_FLOAT, _device="/job:localhost/replica:0/task:0/device:CPU:0"]()]]
2023-09-10 18:22:05,558 : ERROR : KNIME-Worker-46-Keras Network Learner 3:96 : : Node : Keras Network Learner : 3:96 : Execute failed: An error occured during training of the Keras deep learning network. See log for details.
java.lang.RuntimeException: An error occured during training of the Keras deep learning network. See log for details.
at org.knime.dl.keras.base.nodes.learner.DLKerasLearnerNodeModel.handleGeneralException(DLKerasLearnerNodeModel.java:751)
at org.knime.dl.keras.base.nodes.learner.DLKerasLearnerNodeModel.executeInternal(DLKerasLearnerNodeModel.java:721)
at org.knime.dl.keras.base.nodes.learner.DLKerasLearnerNodeModel.execute(DLKerasLearnerNodeModel.java:320)
at org.knime.core.node.NodeModel.executeModel(NodeModel.java:549)
at org.knime.core.node.Node.invokeFullyNodeModelExecute(Node.java:1267)
at org.knime.core.node.Node.execute(Node.java:1041)
at org.knime.core.node.workflow.NativeNodeContainer.performExecuteNode(NativeNodeContainer.java:595)
at org.knime.core.node.exec.LocalNodeExecutionJob.mainExecute(LocalNodeExecutionJob.java:98)
at org.knime.core.node.workflow.NodeExecutionJob.internalRun(NodeExecutionJob.java:201)
at org.knime.core.node.workflow.NodeExecutionJob.run(NodeExecutionJob.java:117)
at org.knime.core.util.ThreadUtils$RunnableWithContextImpl.runWithContext(ThreadUtils.java:367)
at org.knime.core.util.ThreadUtils$RunnableWithContext.run(ThreadUtils.java:221)
at java.base/java.util.concurrent.Executors$RunnableAdapter.call(Unknown Source)
at java.base/java.util.concurrent.FutureTask.run(Unknown Source)
at org.knime.core.util.ThreadPool$MyFuture.run(ThreadPool.java:123)
at org.knime.core.util.ThreadPool$Worker.run(ThreadPool.java:246)
Caused by: org.knime.python2.kernel.PythonIOException: Blas GEMM launch failed : a.shape=(200, 100), b.shape=(100, 10), m=200, n=10, k=100
[[{{node output_1/MatMul}} = MatMul[T=DT_FLOAT, _class=["loc:@training/Adadelta/gradients/output_1/MatMul_grad/MatMul"], transpose_a=false, transpose_b=false, _device="/job:localhost/replica:0/task:0/device:GPU:0"](dense_1/Relu, output_1/kernel/read)]]
[[{{node loss/mul/_97}} = _Recv[client_terminated=false, recv_device="/job:localhost/replica:0/task:0/device:CPU:0", send_device="/job:localhost/replica:0/task:0/device:GPU:0", send_device_incarnation=1, tensor_name="edge_605_loss/mul", tensor_type=DT_FLOAT, _device="/job:localhost/replica:0/task:0/device:CPU:0"]()]]
at org.knime.python2.kernel.messaging.AbstractTaskHandler.handleFailureMessage(AbstractTaskHandler.java:146)
at org.knime.python2.kernel.messaging.AbstractTaskHandler.handle(AbstractTaskHandler.java:92)
at org.knime.dl.python.core.DLPythonAbstractCommands$DLTrainingTask.runInternal(DLPythonAbstractCommands.java:931)
at org.knime.core.util.ThreadUtils$CallableWithContextImpl.callWithContext(ThreadUtils.java:383)
at org.knime.core.util.ThreadUtils$CallableWithContext.call(ThreadUtils.java:269)
at java.base/java.util.concurrent.FutureTask.run(Unknown Source)
at java.base/java.util.concurrent.Executors$RunnableAdapter.call(Unknown Source)
at java.base/java.util.concurrent.FutureTask.run(Unknown Source)
at java.base/java.util.concurrent.ThreadPoolExecutor.runWorker(Unknown Source)
at java.base/java.util.concurrent.ThreadPoolExecutor$Worker.run(Unknown Source)
at java.base/java.lang.Thread.run(Unknown Source)