
How can this TensorFlow model converge on the CPU but not on the GPU?

Stack Overflow user
Asked on 2016-09-05 05:05:56
1 answer · 871 views · 0 followers · 0 votes

We have run into a strange problem: our relatively simple model converges on the CPU, but not on a server with a GPU. No changes are made to the code between the two runs, and the code contains no explicit conditionals that would alter the workflow on different architectures.

What could be the cause? How can this TensorFlow model converge on the CPU but not on the GPU? If the code is too long to read through, we would still appreciate general speculation and hints.
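One quick sanity check that is often suggested (a sketch, not part of the original code below): pin the whole graph to the CPU on the GPU server, or hide the GPUs entirely. If the model then converges, the difference really is device-side.

# Sketch: force CPU execution on a GPU machine (TF 0.x-era API, as in the question).
import os
os.environ['CUDA_VISIBLE_DEVICES'] = ''  # hide all GPUs; must be set before the session is created

import tensorflow as tf

with tf.device('/cpu:0'):                # or pin ops explicitly to the CPU
    total = tf.reduce_sum(tf.constant([1.0, 2.0, 3.0]))

# log_device_placement prints which device each op actually landed on
sess = tf.Session(config=tf.ConfigProto(log_device_placement=True))
print(sess.run(total))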

Code language: python
#!/usr/bin/python
from __future__ import print_function
import tensorflow as tf
import os
import numpy as np
import input_data # copy from tensorflow/examples/tutorials/mnist/input_data.py
# wget https://raw.githubusercontent.com/tensorflow/tensorflow/master/tensorflow/examples/tutorials/mnist/input_data.py if needed

mnist = input_data.read_data_sets("/tmp/data/", one_hot=True)

force_gpu = False
debug = True # histogram_summary ...
# _cpu='/cpu:0'
default_learning_rate=0.001
tensorboard_logs = '/tmp/tensorboard-logs/'


# $(sleep 5; open http://0.0.0.0:6006) & tensorboard --debug --logdir=/tmp/tensorboard-logs/

class net():

    def __init__(self,model,data,name=0,learning_rate=default_learning_rate,batch_size=64):
        self.session=sess=session=tf.Session()
        self.model=model
        self.data=data # assigned to self.x=net.input via train
        self.batch_size=batch_size
        self.layers=[]
        self.last_width=self.input_width(data)
        self.learning_rate=learning_rate

        self.generate_model(model)

    def generate_model(self,model, name=''):
        if not model: return self
        with tf.name_scope('state'):
            self.keep_prob = tf.placeholder(tf.float32)  # 1 for testing! else 1 - dropout
            self.train_phase = tf.placeholder(tf.bool, name='train_phase')
            self.global_step = tf.Variable(0)  # don't set, feed, or increment global_step; tensorflow will do it automatically
        with tf.name_scope('data'):
            n_input=28*28
            n_classes=10
            self.x = x = self.input  = tf.placeholder(tf.float32, [None, n_input])
            self.last_layer=x
            self.y = y = self.target = tf.placeholder(tf.float32, [None, n_classes])
            if not force_gpu: tf.image_summary("mnist", tf.reshape(self.x, [-1, 28, 28, 1], "mnist_images"))
        with tf.name_scope('model'):
            model(self)
        if(self.last_width!=n_classes): self.classifier()  # 10 classes auto


    def input_width(self,data):
        return 28*28


    def add(self, layer):
        self.layers.append(layer)
        self.last_layer = layer
        self.last_shape = layer.get_shape()

    def reshape(self,shape):
        self.last_layer = tf.reshape(self.last_layer,shape)
        self.last_shape = shape
        self.last_width = shape[-1]

    def batchnorm(self):
        from tensorflow.contrib.layers.python.layers import batch_norm as batch_norm
        with tf.name_scope('batchnorm') as scope:
            input = self.last_layer
            train_op=batch_norm(input, is_training=True, center=False, updates_collections=None, scope=scope)
            test_op=batch_norm(input, is_training=False, updates_collections=None, center=False,scope=scope, reuse=True)
            self.add(tf.cond(self.train_phase,lambda:train_op,lambda:test_op))

    # Fully connected layer
    def dense(self, hidden=1024, depth=1, act=tf.nn.tanh, dropout=False, parent=-1): #
        if parent==-1: parent=self.last_layer
        shape = self.last_layer.get_shape()
        if shape and len(shape)>2:
            self.last_width= int(shape[1]*shape[2]*shape[3])
            print("reshapeing ",shape,"to",self.last_width)
            parent = tf.reshape(parent, [-1, self.last_width])

        width = hidden
        while depth>0:
            with tf.name_scope('Dense_{:d}'.format(hidden)) as scope:
                print("Dense ", self.last_width, width)
                nr = len(self.layers)
                # if self.last_width == width:
                #   M = closest_unitary(np.random.rand(self.last_width, width) / (self.last_width + width))
                #   weights = tf.Variable(m, name="weights_dense_" + str(nr))
                # else:
                weights = tf.Variable(tf.random_uniform([self.last_width, width], minval=-1. / width, maxval=1. / width), name="weights_dense")
                bias = tf.Variable(tf.random_uniform([width],minval=-1./width,maxval=1./width), name="bias_dense")
                dense1 = tf.matmul(parent, weights, name='dense_'+str(nr))+ bias
                tf.histogram_summary('dense_'+str(nr),dense1)
                tf.histogram_summary('weights_'+str(nr),weights)
                tf.histogram_summary('bias_'+str(nr),bias)
                tf.histogram_summary('dense_'+str(nr)+'/sparsity', tf.nn.zero_fraction(dense1))
                tf.histogram_summary('weights_'+str(nr)+'/sparsity', tf.nn.zero_fraction(weights))
                if act: dense1 = act(dense1)
                # if norm: dense1 = self.norm(dense1,lsize=1) # SHAPE!
                if dropout: dense1 = tf.nn.dropout(dense1, self.keep_prob)
                self.layers.append(dense1)
                self.last_layer = parent = dense1
                self.last_width = width
                depth=depth-1
                self.last_shape=[-1,width] # dense

    # Convolution Layer
    def conv(self,shape,act=tf.nn.relu,pool=True,dropout=False,norm=True,name=None): # True why dropout bad in tensorflow??
        with tf.name_scope('conv'):
            print("input  shape ",self.last_shape)
            print("conv   shape ",shape)
            width=shape[-1]
            filters=tf.Variable(tf.random_normal(shape))
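            # NOTE: tf.random_normal defaults to mean=0.0, stddev=1.0; conv filters
            # initialized at this scale can destabilize training (compare the much
            # smaller uniform init commented out below).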
            # filters = tf.Variable(tf.random_uniform(shape, minval=-1. / width, maxval=1. / width), name="filters")
            _bias=tf.Variable(tf.random_normal([shape[-1]]))

            # # conv1 = conv2d('conv', _X, _weights, _bias)
            conv1=tf.nn.bias_add(tf.nn.conv2d(self.last_layer,filter=filters, strides=[1, 1, 1, 1], padding='SAME'), _bias)
            if debug: tf.histogram_summary('conv_' + str(len(self.layers)), conv1)
            if act: conv1=act(conv1)
            if pool: conv1 = tf.nn.max_pool(conv1, ksize=[1, 2, 2, 1], strides=[1, 2, 2, 1], padding='SAME')
            if norm: conv1 = tf.nn.lrn(conv1, depth_radius=4, bias=1.0, alpha=0.001 / 9.0, beta=0.75)
            if debug: tf.histogram_summary('norm_' + str(len(self.layers)), conv1)
            if dropout: conv1 = tf.nn.dropout(conv1,self.keep_prob)
            print("output shape ",conv1.get_shape())
            self.add(conv1)

    def classifier(self,classes=10):  # Define loss and optimizer
        with tf.name_scope('prediction'):# prediction
            if self.last_width!=classes:
                # print("Automatically adding dense prediction")
                self.dense(hidden=classes, act= False, dropout = False)
            # cross_entropy = -tf.reduce_sum(y_*y)
        with tf.name_scope('classifier'):
            y_=self.target
            manual=False # True
            if classes>100:
                print("using sampled_softmax_loss")
                y=prediction=self.last_layer
                self.cost = tf.reduce_mean(tf.nn.sampled_softmax_loss(y, y_))  # for big vocab
            elif manual:
                # prediction = y =self.last_layer=tf.nn.softmax(self.last_layer)
                # self.cost = cross_entropy = -tf.reduce_sum(y_ * tf.log(y+ 1e-10)) # against NaN!
                prediction = y = tf.nn.log_softmax(self.last_layer)
                self.cost = cross_entropy = -tf.reduce_sum(y_ * y)
            else:
                y = prediction = self.last_layer
                self.cost = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(y, y_)) # prediction, target

            # if not gpu:
            tf.scalar_summary('cost', self.cost)
            # self.cost = tf.Print(self.cost , [self.cost ], "debug cost : ")
            learning_scheme=self.learning_rate
            # learning_scheme=tf.train.exponential_decay(self.learning_rate, self.global_step, decay_steps, decay_size)
            self.optimizer = tf.train.AdamOptimizer(learning_scheme).minimize(self.cost)

            # Evaluate model
            correct_pred = tf.equal(tf.argmax(prediction, 1), tf.argmax(self.target, 1))
            self.accuracy = tf.reduce_mean(tf.cast(correct_pred, tf.float32))
            if not force_gpu: tf.scalar_summary('accuracy', self.accuracy)
            # Launch the graph

    def next_batch(self,batch_size=10):
        return self.data.train.next_batch(batch_size)

    def train(self,steps=-1,dropout=None,display_step=10,test_step=200): #epochs=-1,
        steps = 9999999 if steps==-1 else steps
        session=self.session
        # with tf.device(_cpu):

        # import tensorflow.contrib.layers as layers
        # t = tf.verify_tensor_all_finite(t, msg)
        tf.add_check_numerics_ops()
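        # NOTE: tf.add_check_numerics_ops() returns a grouped check op; since the
        # return value is discarded here, the NaN/Inf checks are never executed.
        # To use them: check_op = tf.add_check_numerics_ops(), then include
        # check_op in session.run(...) alongside the train step.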
        self.summaries = tf.merge_all_summaries()
        self.summary_writer = tf.train.SummaryWriter(tensorboard_logs, session.graph) #
        if not dropout:dropout=1. # keep all
        x=self.x
        y=self.y
        keep_prob=self.keep_prob
        session.run([tf.initialize_all_variables()])
        step = 0 # show first
        while step < steps:
            # print("step %d \r" % step)# end=' ')
            batch_xs, batch_ys = self.next_batch(self.batch_size)

            # tf.train.shuffle_batch_join(example_list, batch_size, capacity=min_queue_size + batch_size * 16, min_queue_size)
            # Fit training using batch data
            feed_dict = {x: batch_xs, y: batch_ys, keep_prob: dropout, self.train_phase: True}
            loss,_= session.run([self.cost,self.optimizer], feed_dict=feed_dict)
            if step % test_step == 0: self.test(step)
            if step % display_step == 0:
                # Calculate batch accuracy, loss
                feed = {x: batch_xs, y: batch_ys, keep_prob: 1., self.train_phase: False}
                acc , summary = session.run([self.accuracy,self.summaries], feed_dict=feed)
                # self.summary_writer.add_summary(summary, step) # only test summaries for smoother curve
                print("\rStep {:d} Loss= {:.6f} Accuracy= {:.3f}".format(step,loss,acc),end=' ')
                if str(loss)=="nan": return print("\nLoss gradiant explosion, exiting!!!") #restore!
            step += 1
        print("\nOptimization Finished!")
        self.test(step,number=10000) # final test

    def inputs(self,data):
        self.inputs, self.labels = load_data()#...)  # unused; load_data is not defined in this snippet

    def test(self,step,number=400):#256
        session=sess=self.session
        run_metadata = tf.RunMetadata()
        run_options = tf.RunOptions(trace_level=tf.RunOptions.FULL_TRACE)
        # Calculate accuracy for 256 mnist test images
        test_labels = self.data.test.labels[:number]
        test_images = self.data.test.images[:number]
        feed_dict = {self.x: test_images, self.y: test_labels, self.keep_prob: 1., self.train_phase:False}
        accuracy,summary= self.session.run([self.accuracy, self.summaries], feed_dict=feed_dict)
        # accuracy,summary = session.run([self.accuracy, self.summaries], feed_dict, run_options, run_metadata)
        print('\t'*3+"Test Accuracy:",accuracy)
        # self.summary_writer.add_run_metadata(run_metadata, 'step #%03d' % step)
        self.summary_writer.add_summary(summary,global_step=step)


def dense(net): # best with lr ~0.001
    # type: (layer.net) -> None
    # net.batchnorm() # start lower, else no effect
    # net.dense(400,act=None)#  # ~95% we can do better:
    net.dense(400, act=tf.nn.tanh)# 0.996 YAY  only 0.985 on full set, Step 5000 flat
    return # 0.957% without any model!!

def alex(net):
    # type: (layer.net) -> None
    print("Building Alex-net")
    net.reshape(shape=[-1, 28, 28, 1])  # Reshape input pictures
    # net.batchnorm()
    net.conv([3, 3, 1, 64])
    net.conv([3, 3, 64, 128])
    net.conv([3, 3, 128, 256])
    net.dense(1024,act=tf.nn.relu)
    net.dense(1024,act=tf.nn.relu)


# net=layer.net(dense,data=mnist, learning_rate=0.01 )#,'mnist' baseline
_net=net(alex,data=mnist, learning_rate=0.001)#,'mnist'
_net.train(50000,dropout=0.6,display_step=1,test_step=10)
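
For reproducibility across the two machines, it can also help to fix the random seeds so both runs start from identical weights; a minimal sketch (not in the original code), using the graph-level seed:

# Sketch: fix seeds before building the graph so CPU and GPU runs start
# from the same initial weights (narrows the comparison to the arithmetic).
import numpy as np
import tensorflow as tf

np.random.seed(0)      # numpy-side randomness (e.g. any manual shuffling)
tf.set_random_seed(0)  # graph-level seed for tf.random_uniform / tf.random_normal initializers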

1 Answer

Stack Overflow user

Posted on 2016-09-07 07:12:07

In general, floating-point computation can be a little nondeterministic when adding up many numbers (and some GPUs are buggy). Have you tried re-tuning the hyperparameters (a different learning rate, etc.) to get around the problem?
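To make the nondeterminism point concrete: floating-point addition is not associative, so summing the same numbers in a different order (sequentially on a CPU versus a parallel tree reduction on a GPU) can produce slightly different float32 results. A small illustration (not part of the answer):

import numpy as np

x = np.random.RandomState(0).randn(1000000).astype(np.float32)

# The same values summed in different orders may disagree in the last bits.
print(np.float32(sum(x)))  # naive left-to-right accumulation
print(x.sum())             # numpy's pairwise summation
print(x[::-1].sum())       # same pairwise scheme, reversed order

Over millions of updates such tiny differences can push a marginally stable training run off course, which is why the answer's suggestion of re-tuning (for example, a smaller learning rate than the 0.001 used above) is a sensible first step.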

Votes: 0
Original page content provided by Stack Overflow.
Original link:

https://stackoverflow.com/questions/39321500
