我使用SWA方法在pytorch中训练模型。
SWA:https://pytorch.org/blog/stochastic-weight-averaging-in-pytorch/
我的火车代码的丢失立即跳到了南。
模型的损失上升到nan
损失产出如下。
1.损失:张量(4.8463,设备=‘cuda:0’,grad_fn=)
2.损失:张量(118317.8516,设备=‘cuda:0’,grad_fn=)
3.损失:张量(5.7568e+22,device='cuda:0',grad_fn=)
4.损失:张量(nan,设备=‘cuda:0’,grad_fn=)
如果没有SWA方法,损失是不会上升的。使用SWA方法的列车模型代码中是否存在问题?
我很感谢你的建议,谢谢。
#batch_size
batch_size=5
#DataLoader
train_dataloader=torch.utils.data.DataLoader(train_dataset,batch_size=batch_size,shuffle=True)
val_dataloader=torch.utils.data.DataLoader(val_dataset,batch_size=batch_size,shuffle=False)
#dict
dataloaders_dict={"train":train_dataloader,"val":val_dataloader}
#example outputs
#train :torch.Size([5, 25, 32, 32])
#target : torch.Size([5])
def train_model_withSWA(net,dataloaders_dict,criterion,optimizer,num_epochs):
loss_list=[]
acc_list=[]
#validation list
val_loss_list=[]
val_acc_list=[]
for epoch in tqdm(range(num_epochs)):
print("Epoch{}/{}".format(epoch+1,num_epochs))
print("--------------------------")
for phase in ["train","val"]:
if phase=="train":
net.train()
else:
net.eval()
epoch_loss=0.0
epoch_corrects=0
#if (epoch==0) and (phase=="train"):
continue
for inputs,labels in dataloaders_dict[phase]:
#optimizerを初期化:
optimizer.zero_grad()
with torch.set_grad_enabled(phase=="train"):
inputs=inputs.to(device)
labels=labels.to(device)
outputs=net(inputs)
loss=criterion(outputs,labels)
print("loss:",loss)
_,preds=torch.max(outputs,1)
if phase == "train":
loss.backward()
optimizer.step()
epoch_loss += loss.item()*inputs.size(0)
epoch_corrects +=torch.sum(preds==labels.data)
#for swa
optimizer.swap_swa_sgd()
epoch_loss=epoch_loss/len(dataloaders_dict[phase].dataset)
epoch_acc=epoch_corrects.double()/len(dataloaders_dict[phase].dataset)
print("{} Loss:{:.4f} Acc:{:.4f}".format(phase,epoch_loss,epoch_acc))
if phase=="train":
loss_list.append(epoch_loss.detach().numpy())
acc_list.append(epoch_acc.detach().numpy())
else:
val_loss_list.append(epoch_loss.detach().numpy())
val_acc_list.append(epoch_acc.detach().numpy())
from torchcontrib.optim import SWA
# ignore warning
import warnings
warnings.filterwarnings('ignore') # set to ignore
#criterion
criterion = nn.CrossEntropyLoss()
net=net.to(device)
base_opt = torch.optim.SGD(net.parameters(), lr=0.1)
optimizer = SWA(base_opt, swa_start=10, swa_freq=5, swa_lr=0.05)
### train with SWA
train_model_withSWA(net=net,dataloaders_dict=dataloaders_dict,
criterion=criterion,
optimizer=optimizer,
num_epochs=num_epochs)
#model' loss jump up to nan....
#loss: tensor(4.8463, device='cuda:0', grad_fn=<NllLossBackward>)
#loss: tensor(118317.8516, device='cuda:0', grad_fn=<NllLossBackward>)
#loss: tensor(5.7568e+22, device='cuda:0', grad_fn=<NllLossBackward>)
#loss: tensor(nan, device='cuda:0', grad_fn=<NllLossBackward>)
# In additional.
I use Self Attention and Positional Encoder code
class Self_Attention(nn.Module):
""" Self-Attention Layer"""
def __init__(self, in_dim):
super(Self_Attention, self).__init__()
#pointwise convolution
self.query_conv = nn.Conv2d(
in_channels=in_dim, out_channels=in_dim, kernel_size=1)
self.key_conv = nn.Conv2d(
in_channels=in_dim, out_channels=in_dim, kernel_size=1)
self.value_conv = nn.Conv2d(
in_channels=in_dim, out_channels=in_dim, kernel_size=1)
# softmax
self.softmax = nn.Softmax(dim=-2)
#output = x +gamma*o
# first:gamma=0
self.gamma = nn.Parameter(torch.zeros(1))
def forward(self, x):
x=x.to(device)
X = x
#B,C',W,H→B,C',N
proj_query = self.query_conv(X).view(
X.shape[0], -1, X.shape[2]*X.shape[3]) # size:B,C',N
proj_query = proj_query.permute(0, 2, 1) # transpose
proj_key = self.key_conv(X).view(
X.shape[0], -1, X.shape[2]*X.shape[3]) # size:B,C',N
# bmm
S = torch.bmm(proj_query, proj_key)
#
attention_map_T = self.softmax(S)
attention_map = attention_map_T.permute(0, 2, 1)
# Self-Attention Map
proj_value = self.value_conv(X).view(
X.shape[0], -1, X.shape[2]*X.shape[3]) # size:B,C,N
o = torch.bmm(proj_value, attention_map.permute(
0, 2, 1))
# Self-Attention Map
o = o.view(X.shape[0], X.shape[1], X.shape[2], X.shape[3])
out = x+self.gamma*o
#print("gamma:",self.gamma)
return out, attention_map
class PositionalEncoder(nn.Module):
def __init__(self, d_model=300, max_seq_len=256):
super(PositionalEncoder,self).__init__()
self.d_model = d_model
pe = torch.zeros(max_seq_len, d_model)
# GPU
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
pe = pe.to(device)
for pos in range(max_seq_len):
for i in range(0, d_model, 2):
pe[pos, i] = math.sin(pos / (10000 ** ((2 * i)/d_model)))
pe[pos, i + 1] = math.cos(pos /
(10000 ** ((2 * (i + 1))/d_model)))
#
self.pe = pe.unsqueeze(0)
#
self.pe.requires_grad = False
def forward(self, x):
x=x.to(device)
ret = math.sqrt(self.d_model)*x + self.pe
return ret发布于 2019-11-15 16:49:31
如果没有在data_preprocessing中进行标准化,则损失不会跳到nan。
这可能是由我的预处理错误引起的。我不理解SWA方法中的归一化和损失跳跃的关系...我为我的错误感到抱歉。
https://stackoverflow.com/questions/58777329
复制相似问题