我用一些数据训练了一个LSTM模型。当我使用训练中使用的相同的数据来评估训练模型的性能时,我得到了不同的结果。我所使用的评估指标是准确率、精确率、召回率和F1评分。我用的是PyTorch。
我的测试代码test_model.py
import numpy as np
import json
from train_utils import *
from dataset_utils import *
from models import *
from metrics import *
import torch
import torch.nn as nn
import torch.utils.tensorboard as tb
from torch.utils.data import Dataset, DataLoader
def test(encoded_seq, y_label, model_path, model_class, config):
    """Evaluate a trained att_DNA checkpoint on (encoded_seq, y_label).

    Rebuilds the model from the hyper-parameters in ``config``, loads the
    weights from ``model_path``, runs one full-batch forward pass and prints
    the loss and classification metrics.

    Args:
        encoded_seq: numpy array of encoded sequences; shape[1] is used as
            ``n_bins`` (presumably (N, bins, ...) — TODO confirm).
        y_label: labels aligned with ``encoded_seq``.
        model_path: path to a checkpoint containing ``'state_dict'``.
        model_class: unused here (kept for interface compatibility).
        config: dict with ``MODEL`` and ``TRAINER`` hyper-parameter sections.
    """
    device = torch.device('cuda:6' if torch.cuda.is_available() else 'cpu')
    testset = SequenceDataset(encoded_seq, y_label)
    # Single batch covering the whole set, so metrics are computed globally.
    test_dataloader = torch.utils.data.DataLoader(testset, batch_size=len(testset))
    checkpoint = torch.load(model_path, map_location=torch.device(device))
    args = {'n_nts': config['MODEL']['embedding_dim'],
            'n_bins': encoded_seq.shape[1],
            'bin_rnn_size': config['MODEL']['hidden_dim'],
            'num_layers': config['MODEL']['hidden_layers'],
            'dropout': config['TRAINER']['dropout'],
            'bidirectional': config['MODEL']['bidirectional']}
    model = att_DNA(args, 2)
    model.load_state_dict(checkpoint['state_dict'])
    model.to(device)
    loss_fn = nn.CrossEntropyLoss()
    # Fix: DataLoader iterators no longer expose the Python-2 ``.next()``
    # method; use the builtin ``next()`` instead.
    data, labels = next(iter(test_dataloader))
    model.eval()  # disable dropout so evaluation is deterministic
    with torch.no_grad():  # inference only — no gradients needed
        raw_out = model(data.to(device))[0]
        loss = loss_fn(raw_out, labels.long().to(device))
    print("Loss: ", loss)
    m = Metrics()  # m.metrics initialised to zeros
    metrics, predictions = m.get_metrics(raw_out.detach().clone().cpu(), labels)
    print('Metrics: ', metrics)
    model = None  # Clear model
# (post continues:) These metrics are computed in metrics.py as follows:
class Metrics():
    """Computes precision, recall, F1 and accuracy from raw model logits
    using scikit-learn's ``metrics`` module.
    """

    def __init__(self):
        # Last-computed metrics; all zeros until get_metrics() is called.
        self.metrics = {'prec': 0, 'recall': 0, 'f1': 0, 'acc': 0}

    @staticmethod
    def pred_from_raw(raw):
        """Return hard class predictions (numpy int array) from (N, C) logits.

        softmax is monotonic, so argmax over raw logits would be equivalent;
        kept for parity with the training-side code.
        """
        return torch.argmax(torch.softmax(raw, dim=1), dim=1).cpu().numpy()

    @staticmethod
    def metrics_classification(raw, y_true, avg):
        """Return (f1, precision, recall) for logits ``raw`` vs ``y_true``.

        Fixes vs. the original: ``pred`` is actually computed here (it was an
        undefined name), and sklearn's (y_true, y_pred) positional order is
        respected — passing predictions first swaps precision and recall.
        """
        pred = Metrics.pred_from_raw(raw)
        y = y_true.numpy()
        f1 = metrics.f1_score(y, pred, average=avg, zero_division=0)
        prec = metrics.precision_score(y, pred, average=avg, zero_division=0)
        rec = metrics.recall_score(y, pred, average=avg, zero_division=0)
        return f1, prec, rec

    def get_metrics(self, raw, y_true, avg=None):
        """Compute, cache and return ({'prec','recall','f1','acc'}, predictions)."""
        print("Calculating metrics")
        f1, prec, recall = self.metrics_classification(raw, y_true, avg)
        pred = self.pred_from_raw(raw)
        # accuracy_score is symmetric, but keep the canonical order anyway.
        acc = metrics.accuracy_score(y_true.numpy(), pred)
        self.metrics = {'prec': prec, 'recall': recall, 'f1': f1, 'acc': acc}
        return self.metrics, pred
# (post continues:) The data is loaded in the following way:
class SequenceDataset(Dataset):
    """Dataset wrapping encoded sequences and their labels as float32 tensors."""

    def __init__(self, data, labels):
        # numpy array -> float32 tensor; label sequence -> float32 tensor.
        self.data = torch.from_numpy(data).float()
        self.labels = torch.tensor(labels).float()

    def __len__(self):
        # Number of sequences in the set.
        return self.data.shape[0]

    def __getitem__(self, idx):
        # Return data (seq_len, batch, input_dim), label for index
        sample = self.data[idx]
        target = self.labels[idx]
        return sample, target
# (post continues:) My training metrics look like this:
Epoch: 4 ------ TRAIN ------
train loss: 0.5386, {'prec': 0.7369, 'recall': 0.7262, 'f1': 0.7002, 'acc': 0.7664}
Epoch: 4 ------ VAL ------
val loss: 0.5143, {'prec': 0.7692, 'recall': 0.7653, 'f1': 0.7386, 'acc': 0.7942}而我使用训练好的模型(通过脚本 test_model.py)在训练数据上得到的测试指标却是:
Metrics: {'prec': array([0.5017, 0.4878]), 'recall': array([0.4949, 0.4946]),
'f1': array([0.4983, 0.4912]), 'acc': 0.4948}为什么即使在相同的数据上进行训练和评估,指标也会有差异呢?
发布于 2022-02-18 09:13:18
您可以共享完整的源代码吗?或者您可以在调用def test(encoded_seq, y_label, model_path, model_class, config)函数时进一步检查您的训练集、验证集和测试集数据。
https://stackoverflow.com/questions/71169230
复制相似问题