你好,我试图用一些CuDNNLSTM层构建一个tensorflow神经网络,但它无法调用cuda求解器。由于在删除CuDNNLSTM层时没有出现错误,因此我认为它与LSTM层有关。
class ActorNet:
def __init__(self, N_STATES, N_ACTIONS, MAX_STEP, BATCH_SIZE, lr_a):
self.N_STATES = N_STATES
self.N_ACTIONS = N_ACTIONS
self.MAX_STEP = MAX_STEP
self.BATCH_SIZE = BATCH_SIZE
self.lr_a = lr_a
self.g = tf.Graph()
self.cwd = os.getcwd()
self.path = os.path.join(self.cwd, 'simple') + "\\model.ckpt"
with self.g.as_default():
self.sess = tf.InteractiveSession()
self.a_input_states = tf.placeholder("float", [self.MAX_STEP, None, self.N_ACTIONS + self.N_STATES + 1],
name='input_placeholder')
self.a_input_states_tp = tf.transpose(self.a_input_states, [1,0,2])
self.a_grad_from_critic = tf.placeholder("float", [1, None, self.N_ACTIONS], name='input_placeholder')
with tf.variable_scope('actor'):
self.model = models.Sequential()
self.model.add(layers.BatchNormalization())
self.model.add(layers.CuDNNLSTM(500, return_sequences=True))
self.model.add(layers.BatchNormalization())
self.model.add(layers.CuDNNLSTM(500, return_sequences=True))
self.model.add(layers.BatchNormalization())
self.model.add(layers.CuDNNLSTM(500, return_sequences=True))
self.model.add(layers.BatchNormalization())
self.model.add(layers.CuDNNLSTM(500, return_sequences=True))
self.model.add(layers.BatchNormalization())
self.model.add(layers.CuDNNLSTM(500, return_sequences=True))
self.model.add(layers.BatchNormalization())
self.model.add(layers.Dense(self.N_ACTIONS, activation='tanh'))
self.lstm_output = self.model(self.a_input_states_tp)
self.lstm_output_tp = tf.transpose(self.lstm_output, [1, 0, 2])
self.params = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope='actor')
self.gradient = tf.gradients(self.lstm_output_tp[-1:,:,:], self.params, -self.a_grad_from_critic / self.BATCH_SIZE)
self.gradient_a = tf.gradients(self.lstm_output_tp[-1:,:,:], self.a_input_states)
self.opt = tf.train.AdamOptimizer(self.lr_a)
self.optimizer = self.opt.apply_gradients(zip(self.gradient, self.params))
print("Initialized Actor Network...")
self.t_a_input_states = tf.placeholder("float", [self.MAX_STEP, None, self.N_ACTIONS + self.N_STATES + 1],
name='input_placeholder')
self.t_a_input_states_tp = tf.transpose(self.t_a_input_states, [1,0,2])
self.t_a_grad_from_critic = tf.placeholder("float", [self.MAX_STEP, None, self.N_ACTIONS],
name='input_placeholder')
with tf.variable_scope('target_actor'):
self.t_model = models.Sequential()
self.t_model.add(layers.BatchNormalization())
self.t_model.add(layers.CuDNNLSTM(500, return_sequences=True))
self.t_model.add(layers.BatchNormalization())
self.t_model.add(layers.CuDNNLSTM(500, return_sequences=True))
self.t_model.add(layers.BatchNormalization())
self.t_model.add(layers.CuDNNLSTM(500, return_sequences=True))
self.t_model.add(layers.BatchNormalization())
self.t_model.add(layers.CuDNNLSTM(500, return_sequences=True))
self.t_model.add(layers.BatchNormalization())
self.t_model.add(layers.CuDNNLSTM(500, return_sequences=True))
self.t_model.add(layers.BatchNormalization())
self.t_model.add(layers.Dense(self.N_ACTIONS, activation='tanh'))
self.t_lstm_output = self.t_model(self.t_a_input_states_tp)
self.t_lstm_output_tp = tf.transpose(self.t_lstm_output, [1, 0, 2])
self.t_params = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope='target_actor')
print("Initialized Target Actor Network...")
self.sess.run(tf.global_variables_initializer())
self.sess.run([t_param.assign(self.params[ii]) for ii, t_param in enumerate(self.t_params)])
self.update_target_actor_op = [t_param.assign(TAU * self.params[ii] + (1-TAU) * t_param) for ii, t_param in enumerate(self.t_params)]
self.saver = tf.train.Saver(max_to_keep=200)例如,当我调用它时,
ActorNet(10, 3, 30, 20, 0.001)然后抛出错误。
InternalError (see above for traceback): tensorflow/core/kernels/cuda_solvers.cc:468: cuSolverDN call failed with status =7
[[Node: actor/sequential/cu_dnnlstm/recurrent_kernel/Initializer/Qr = Qr[T=DT_FLOAT, _class=["loc:@actor/sequential/cu_dnnlstm/recurrent_kernel"], full_matrices=false, _device="/job:localhost/replica:0/task:0/device:GPU:0"](actor/sequential/cu_dnnlstm/recurrent_kernel/Initializer/random_normal)]]错误发生在
self.sess.run(tf.global_variables_initializer())移除LSTM层时未出现错误。我使用的是Tensorflow 1.9.0、Cuda 10.1和RTX 2080。我能做些什么来解决这个问题?我尝试为tf会话设置一个随机种子,但它不起作用。
发布于 2020-11-25 18:34:01
我刚刚通过更新tensorflow-gpu 1.9.0到1.11.0解决了这个问题,不知道它是由1.9.0引起的吗?Tensorflow 1.9.0和1.11.0都应该使用Cuda 9.0 btw。
https://stackoverflow.com/questions/60309894
复制相似问题