
How can this TensorFlow model converge on the CPU but not on the GPU?

Stack Overflow user
Asked on 2016-09-05 05:05:56
1 answer · 871 views · 0 followers · 0 votes

We have run into a strange problem: our relatively simple model converges on the CPU, but not on a server with a GPU. No changes are made to the code between the two runs, and the code contains no explicit conditionals that would alter the workflow on different architectures.

What could be the cause? How can this TensorFlow model converge on the CPU but not on the GPU? If the code is too long to read through, we would still appreciate general speculation and hints.
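One quick sanity check that is often suggested (a sketch, not part of the original code below): pin the whole graph to the CPU on the GPU server, or hide the GPUs entirely. If the model then converges, the difference really is device-side.

# Sketch: force CPU execution on a GPU machine (TF 0.x-era API, as in the question).
import os
os.environ['CUDA_VISIBLE_DEVICES'] = ''  # hide all GPUs; must be set before the session is created

import tensorflow as tf

with tf.device('/cpu:0'):                # or pin ops explicitly to the CPU
    total = tf.reduce_sum(tf.constant([1.0, 2.0, 3.0]))

# log_device_placement prints which device each op actually landed on
sess = tf.Session(config=tf.ConfigProto(log_device_placement=True))
print(sess.run(total))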

Code language: python
#!/usr/bin/python
from __future__ import print_function
import tensorflow as tf
import os
import numpy as np
import input_data # copy from tensorflow/examples/tutorials/mnist/input_data.py
# wget https://raw.githubusercontent.com/tensorflow/tensorflow/master/tensorflow/examples/tutorials/mnist/input_data.py if needed

mnist = input_data.read_data_sets("/tmp/data/", one_hot=True)

force_gpu = False
debug = True # histogram_summary ...
# _cpu='/cpu:0'
default_learning_rate=0.001
tensorboard_logs = '/tmp/tensorboard-logs/'


# $(sleep 5; open http://0.0.0.0:6006) & tensorboard --debug --logdir=/tmp/tensorboard-logs/

class net():

    def __init__(self,model,data,name=0,learning_rate=default_learning_rate,batch_size=64):
        self.session=sess=session=tf.Session()
        self.model=model
        self.data=data # assigned to self.x=net.input via train
        self.batch_size=batch_size
        self.layers=[]
        self.last_width=self.input_width(data)
        self.learning_rate=learning_rate

        self.generate_model(model)

    def generate_model(self,model, name=''):
        if not model: return self
        with tf.name_scope('state'):
            self.keep_prob = tf.placeholder(tf.float32)  # 1 for testing! else 1 - dropout
            self.train_phase = tf.placeholder(tf.bool, name='train_phase')
            self.global_step = tf.Variable(0)  # don't set, feed, or increment global_step; tensorflow will do it automatically
        with tf.name_scope('data'):
            n_input=28*28
            n_classes=10
            self.x = x = self.input  = tf.placeholder(tf.float32, [None, n_input])
            self.last_layer=x
            self.y = y = self.target = tf.placeholder(tf.float32, [None, n_classes])
            if not force_gpu: tf.image_summary("mnist", tf.reshape(self.x, [-1, 28, 28, 1], "mnist_images"))
        with tf.name_scope('model'):
            model(self)
        if(self.last_width!=n_classes): self.classifier()  # 10 classes auto


    def input_width(self,data):
        return 28*28


    def add(self, layer):
        self.layers.append(layer)
        self.last_layer = layer
        self.last_shape = layer.get_shape()

    def reshape(self,shape):
        self.last_layer = tf.reshape(self.last_layer,shape)
        self.last_shape = shape
        self.last_width = shape[-1]

    def batchnorm(self):
        from tensorflow.contrib.layers.python.layers import batch_norm as batch_norm
        with tf.name_scope('batchnorm') as scope:
            input = self.last_layer
            train_op=batch_norm(input, is_training=True, center=False, updates_collections=None, scope=scope)
            test_op=batch_norm(input, is_training=False, updates_collections=None, center=False,scope=scope, reuse=True)
            self.add(tf.cond(self.train_phase,lambda:train_op,lambda:test_op))

    # Fully connected layer
    def dense(self, hidden=1024, depth=1, act=tf.nn.tanh, dropout=False, parent=-1): #
        if parent==-1: parent=self.last_layer
        shape = self.last_layer.get_shape()
        if shape and len(shape)>2:
            self.last_width= int(shape[1]*shape[2]*shape[3])
            print("reshapeing ",shape,"to",self.last_width)
            parent = tf.reshape(parent, [-1, self.last_width])

        width = hidden
        while depth>0:
            with tf.name_scope('Dense_{:d}'.format(hidden)) as scope:
                print("Dense ", self.last_width, width)
                nr = len(self.layers)
                # if self.last_width == width:
                #   M = closest_unitary(np.random.rand(self.last_width, width) / (self.last_width + width))
                #   weights = tf.Variable(m, name="weights_dense_" + str(nr))
                # else:
                weights = tf.Variable(tf.random_uniform([self.last_width, width], minval=-1. / width, maxval=1. / width), name="weights_dense")
                bias = tf.Variable(tf.random_uniform([width],minval=-1./width,maxval=1./width), name="bias_dense")
                dense1 = tf.matmul(parent, weights, name='dense_'+str(nr))+ bias
                tf.histogram_summary('dense_'+str(nr),dense1)
                tf.histogram_summary('weights_'+str(nr),weights)
                tf.histogram_summary('bias_'+str(nr),bias)
                tf.histogram_summary('dense_'+str(nr)+'/sparsity', tf.nn.zero_fraction(dense1))
                tf.histogram_summary('weights_'+str(nr)+'/sparsity', tf.nn.zero_fraction(weights))
                if act: dense1 = act(dense1)
                # if norm: dense1 = self.norm(dense1,lsize=1) # SHAPE!
                if dropout: dense1 = tf.nn.dropout(dense1, self.keep_prob)
                self.layers.append(dense1)
                self.last_layer = parent = dense1
                self.last_width = width
                depth=depth-1
                self.last_shape=[-1,width] # dense

    # Convolution Layer
    def conv(self,shape,act=tf.nn.relu,pool=True,dropout=False,norm=True,name=None): # True why dropout bad in tensorflow??
        with tf.name_scope('conv'):
            print("input  shape ",self.last_shape)
            print("conv   shape ",shape)
            width=shape[-1]
            filters=tf.Variable(tf.random_normal(shape))
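            # NOTE: tf.random_normal defaults to mean=0.0, stddev=1.0; conv filters
            # initialized at this scale can destabilize training (compare the much
            # smaller uniform init commented out below).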
            # filters = tf.Variable(tf.random_uniform(shape, minval=-1. / width, maxval=1. / width), name="filters")
            _bias=tf.Variable(tf.random_normal([shape[-1]]))

            # # conv1 = conv2d('conv', _X, _weights, _bias)
            conv1=tf.nn.bias_add(tf.nn.conv2d(self.last_layer,filter=filters, strides=[1, 1, 1, 1], padding='SAME'), _bias)
            if debug: tf.histogram_summary('conv_' + str(len(self.layers)), conv1)
            if act: conv1=act(conv1)
            if pool: conv1 = tf.nn.max_pool(conv1, ksize=[1, 2, 2, 1], strides=[1, 2, 2, 1], padding='SAME')
            if norm: conv1 = tf.nn.lrn(conv1, depth_radius=4, bias=1.0, alpha=0.001 / 9.0, beta=0.75)
            if debug: tf.histogram_summary('norm_' + str(len(self.layers)), conv1)
            if dropout: conv1 = tf.nn.dropout(conv1,self.keep_prob)
            print("output shape ",conv1.get_shape())
            self.add(conv1)

    def classifier(self,classes=10):  # Define loss and optimizer
        with tf.name_scope('prediction'):# prediction
            if self.last_width!=classes:
                # print("Automatically adding dense prediction")
                self.dense(hidden=classes, act= False, dropout = False)
            # cross_entropy = -tf.reduce_sum(y_*y)
        with tf.name_scope('classifier'):
            y_=self.target
            manual=False # True
            if classes>100:
                print("using sampled_softmax_loss")
                y=prediction=self.last_layer
                self.cost = tf.reduce_mean(tf.nn.sampled_softmax_loss(y, y_))  # for big vocab
            elif manual:
                # prediction = y =self.last_layer=tf.nn.softmax(self.last_layer)
                # self.cost = cross_entropy = -tf.reduce_sum(y_ * tf.log(y+ 1e-10)) # against NaN!
                prediction = y = tf.nn.log_softmax(self.last_layer)
                self.cost = cross_entropy = -tf.reduce_sum(y_ * y)
            else:
                y = prediction = self.last_layer
                self.cost = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(y, y_)) # prediction, target

            # if not gpu:
            tf.scalar_summary('cost', self.cost)
            # self.cost = tf.Print(self.cost , [self.cost ], "debug cost : ")
            learning_scheme=self.learning_rate
            # learning_scheme=tf.train.exponential_decay(self.learning_rate, self.global_step, decay_steps, decay_size)
            self.optimizer = tf.train.AdamOptimizer(learning_scheme).minimize(self.cost)

            # Evaluate model
            correct_pred = tf.equal(tf.argmax(prediction, 1), tf.argmax(self.target, 1))
            self.accuracy = tf.reduce_mean(tf.cast(correct_pred, tf.float32))
            if not force_gpu: tf.scalar_summary('accuracy', self.accuracy)
            # Launch the graph

    def next_batch(self,batch_size=10):
        return self.data.train.next_batch(batch_size)

    def train(self,steps=-1,dropout=None,display_step=10,test_step=200): #epochs=-1,
        steps = 9999999 if steps==-1 else steps
        session=self.session
        # with tf.device(_cpu):

        # import tensorflow.contrib.layers as layers
        # t = tf.verify_tensor_all_finite(t, msg)
        tf.add_check_numerics_ops()
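        # NOTE: tf.add_check_numerics_ops() returns a grouped check op; since the
        # return value is discarded here, the NaN/Inf checks are never executed.
        # To use them: check_op = tf.add_check_numerics_ops(), then include
        # check_op in session.run(...) alongside the train step.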
        self.summaries = tf.merge_all_summaries()
        self.summary_writer = tf.train.SummaryWriter(tensorboard_logs, session.graph) #
        if not dropout:dropout=1. # keep all
        x=self.x
        y=self.y
        keep_prob=self.keep_prob
        session.run([tf.initialize_all_variables()])
        step = 0 # show first
        while step < steps:
            # print("step %d \r" % step)# end=' ')
            batch_xs, batch_ys = self.next_batch(self.batch_size)

            # tf.train.shuffle_batch_join(example_list, batch_size, capacity=min_queue_size + batch_size * 16, min_queue_size)
            # Fit training using batch data
            feed_dict = {x: batch_xs, y: batch_ys, keep_prob: dropout, self.train_phase: True}
            loss,_= session.run([self.cost,self.optimizer], feed_dict=feed_dict)
            if step % test_step == 0: self.test(step)
            if step % display_step == 0:
                # Calculate batch accuracy, loss
                feed = {x: batch_xs, y: batch_ys, keep_prob: 1., self.train_phase: False}
                acc , summary = session.run([self.accuracy,self.summaries], feed_dict=feed)
                # self.summary_writer.add_summary(summary, step) # only test summaries for smoother curve
                print("\rStep {:d} Loss= {:.6f} Accuracy= {:.3f}".format(step,loss,acc),end=' ')
                if str(loss)=="nan": return print("\nLoss gradiant explosion, exiting!!!") #restore!
            step += 1
        print("\nOptimization Finished!")
        self.test(step,number=10000) # final test

    def inputs(self,data):
        self.inputs, self.labels = load_data()#...)  # unused; load_data is not defined in this snippet

    def test(self,step,number=400):#256
        session=sess=self.session
        run_metadata = tf.RunMetadata()
        run_options = tf.RunOptions(trace_level=tf.RunOptions.FULL_TRACE)
        # Calculate accuracy for 256 mnist test images
        test_labels = self.data.test.labels[:number]
        test_images = self.data.test.images[:number]
        feed_dict = {self.x: test_images, self.y: test_labels, self.keep_prob: 1., self.train_phase:False}
        accuracy,summary= self.session.run([self.accuracy, self.summaries], feed_dict=feed_dict)
        # accuracy,summary = session.run([self.accuracy, self.summaries], feed_dict, run_options, run_metadata)
        print('\t'*3+"Test Accuracy:",accuracy)
        # self.summary_writer.add_run_metadata(run_metadata, 'step #%03d' % step)
        self.summary_writer.add_summary(summary,global_step=step)


def dense(net): # best with lr ~0.001
    # type: (layer.net) -> None
    # net.batchnorm() # start lower, else no effect
    # net.dense(400,act=None)#  # ~95% we can do better:
    net.dense(400, act=tf.nn.tanh)# 0.996 YAY  only 0.985 on full set, Step 5000 flat
    return # 0.957% without any model!!

def alex(net):
    # type: (layer.net) -> None
    print("Building Alex-net")
    net.reshape(shape=[-1, 28, 28, 1])  # Reshape input pictures
    # net.batchnorm()
    net.conv([3, 3, 1, 64])
    net.conv([3, 3, 64, 128])
    net.conv([3, 3, 128, 256])
    net.dense(1024,act=tf.nn.relu)
    net.dense(1024,act=tf.nn.relu)


# net=layer.net(dense,data=mnist, learning_rate=0.01 )#,'mnist' baseline
_net=net(alex,data=mnist, learning_rate=0.001)#,'mnist'
_net.train(50000,dropout=0.6,display_step=1,test_step=10)
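
For reproducibility across the two machines, it can also help to fix the random seeds so both runs start from identical weights; a minimal sketch (not in the original code), using the graph-level seed:

# Sketch: fix seeds before building the graph so CPU and GPU runs start
# from the same initial weights (narrows the comparison to the arithmetic).
import numpy as np
import tensorflow as tf

np.random.seed(0)      # numpy-side randomness (e.g. any manual shuffling)
tf.set_random_seed(0)  # graph-level seed for tf.random_uniform / tf.random_normal initializers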

1 Answer

Stack Overflow user

Posted on 2016-09-07 07:12:07

In general, floating-point computation can be a little nondeterministic when adding up many numbers (and some GPUs are buggy). Have you tried re-tuning the hyperparameters (a different learning rate, etc.) to get around the problem?
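To make the nondeterminism point concrete: floating-point addition is not associative, so summing the same numbers in a different order (sequentially on a CPU versus a parallel tree reduction on a GPU) can produce slightly different float32 results. A small illustration (not part of the answer):

import numpy as np

x = np.random.RandomState(0).randn(1000000).astype(np.float32)

# The same values summed in different orders may disagree in the last bits.
print(np.float32(sum(x)))  # naive left-to-right accumulation
print(x.sum())             # numpy's pairwise summation
print(x[::-1].sum())       # same pairwise scheme, reversed order

Over millions of updates such tiny differences can push a marginally stable training run off course, which is why the answer's suggestion of re-tuning (for example, a smaller learning rate than the 0.001 used above) is a sensible first step.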

Votes: 0
Original page content provided by Stack Overflow.
Original link:

https://stackoverflow.com/questions/39321500
