请帮助我理解为什么当训练测试准确率为0.97时,预测不能正确工作。
是来自数据,还是应该改变网络?
输入数据为32500 (5个手势,6500个图像) RGB图像,像素为640x480。
已加载并调整大小的图像IMG_WIDTH = 100,IMG_HEIGHT = 77。下面是加载、调整图像大小并返回np.array的函数。
def load_data(data_dir):
"""
Load image data from directory `data_dir`.
Assume `data_dir` has one directory named after each category, numbered
0 through NUM_CATEGORIES - 1. Inside each category directory will be some
number of image files.
Return tuple `(images, labels)`. `images` should be a list of all
of the images in the data directory, where each image is formatted as a
numpy ndarray with dimensions IMG_WIDTH x IMG_HEIGHT x 3. `labels` should
be a list of integer labels, representing the categories for each of the
corresponding `images`.
"""
images = []
labels = []
for dir in range(0, NUM_CATEGORIES):
# get path for each gesture
d = os.path.join(data_dir, f"{str(dir)}")
# os.listdir(d) return the list of all names of images in that folder
for image_path in os.listdir(d):
# get the full path of specific image
full_path = os.path.join(data_dir, f"{str(dir)}", image_path)
# Returns an image that is loaded from the specified file
image = cv2.imread(full_path)
# get dimension for each image
dim = (100, 77)
# resized the image
image_resized = cv2.resize(image, dim, interpolation = cv2.INTER_AREA)
# add image and their directory name to images and labels list
images.append(image_resized)
labels.append(dir)
return images, labels这是我的模型。
def get_model():
"""
Returns a compiled convolutional neural network model. Assume that the
`input_shape` of the first layer is `(IMG_WIDTH=100, IMG_HEIGHT=77, 3)`.
The output layer should have `NUM_GESTURE = 5` units, one for each category.
"""
# Create a convolutional neural network
model = tf.keras.models.Sequential(
[
# Convolutional layer. Learn 32 filters using a 3x3 kernel
tf.keras.layers.Conv2D(
32, (5, 5), activation='relu', input_shape=(IMG_WIDTH, IMG_HEIGHT, 3)
),
# Max-pooling layer, using 2x2 pool size
tf.keras.layers.MaxPool2D(pool_size=(2, 2)),
tf.keras.layers.Conv2D(
64, (3, 3), activation='relu', input_shape=(IMG_WIDTH, IMG_HEIGHT, 3)
),
# Max-pooling layer, using 2x2 pool size
tf.keras.layers.MaxPool2D(pool_size=(2, 2)),
tf.keras.layers.Conv2D(
64, (3, 3), activation='relu', input_shape=((IMG_WIDTH), (IMG_HEIGHT), 3)
),
tf.keras.layers.MaxPool2D(pool_size=(2, 2)),
tf.keras.layers.Conv2D(
128, (3, 3), activation='relu', input_shape=((IMG_WIDTH), (IMG_HEIGHT), 3)
),
tf.keras.layers.MaxPool2D(pool_size=(2, 2)),
tf.keras.layers.Flatten(),
# Add a hidden layer with dropout
tf.keras.layers.Dense(128, activation='relu'),
tf.keras.layers.Dropout(0.3),
# Add an output layer with output units for all 5 gestures
tf.keras.layers.Dense(5, activation='softmax')
])
# Train neural network
model.compile(
optimizer='adam',
loss="categorical_crossentropy",
metrics=["accuracy"]
)
return model
labels = tf.keras.utils.to_categorical(labels)
x_train, x_test, y_train, y_test = train_test_split(
np.array(images), np.array(labels), test_size=0.4)
model = get_model()
model.fit(x_train, y_train, batch_size=64, epochs=10)
model.evaluate(x_test, y_test, verbose=2)结果是0.97。fit result
从视频中,我保存了图像,并希望实时预测手势。
GESTURE = {0:"ok", 1:"down", 2:"up", 3:"palm", 4:"l"}
video = cv2.VideoCapture(0)
while True:
# Capture the video frame
ret, img = video.read()
# Display the resulting frame
# to flip the video with 180 degree
image = cv2.flip(img, 1)
# save image for prediction
image = cv2.imwrite('Frame'+str(0)+'.jpg', image)
image_addr = "Frame0.jpg"
image = cv2.imread(image_addr)
dim = (100,77)
image = tf.keras.preprocessing.image.load_img(image_addr, target_size=dim)
# Converts a PIL Image instance to a Numpy array. Return a 3D Numpy array.
input_arr = tf.keras.preprocessing.image.img_to_array(image)
# Convert single image to a batch.
input_arr = np.array([input_arr])
input_arr = input_arr.astype('float32')/255
# Generates output predictions for the input samples. Return Numpy array(s) of predictions.
predictions = model.predict(input_arr)
# Return the index_array of the maximum values along an axis.
pre_class = np.argmax(predictions, axis=-1)
# for writing in the video
text = GESTURE[pre_class[0]]
font = cv2.FONT_HERSHEY_SIMPLEX
image = cv2.flip(img, 1)
cv2.putText(image,
text,
(50, 50),
font, 2,
(0, 0, 0),
2,
cv2.LINE_4)
cv2.imshow('video', image)
# the 'q' button is set as the
# quitting button you may use any
# desired button of your choice
k = cv2.waitKey(1)
if k == ord('q'):
break
video.release()
cv2.destroyAllWindows()发布于 2021-07-05 02:49:53
我不是专家,但通常情况下,当你在训练数据和测试数据"The result is 0.97“上表现良好,但在新的最终用户数据上表现不佳时,这是因为存在数据不匹配(尽管它可能过拟合)。
正如在中一样,您训练和测试的数据是如此不同(像素值、像素的概率分布,或者模型明显的不可见的差异),以至于模型无法对其进行泛化,性能很差。
使用您在生产/最终产品中使用的相同数据作为测试集是一种良好的实践。Andrew Ng使用这个数据集拆分(如果你有足够的数据,这是适用的):
从训练数据中:
从最终产品数据:
你可以查看这篇文章,了解更多关于原因的信息:https://cs230.stanford.edu/blog/split/
https://stackoverflow.com/questions/68225312
复制相似问题