Multiclass sequence classification with fastai and Hugging Face

Stack Overflow user
Asked on 2021-08-25 18:41:52
1 answer · 210 views · 0 followers · 0 votes

I want to implement DistilBERT with fastai and Hugging Face for a multiclass sequence classification problem. I found a useful tutorial that gives a good example of how to do this for binary classification. The code is below:

# !pip install torch==1.9.0
# !pip install torchtext==0.10
# !pip install transformers==4.7
# !pip install fastai==2.4

from fastai.text.all import *
from sklearn.model_selection import train_test_split
import pandas as pd
import glob
from transformers import AutoTokenizer, AutoModelForSequenceClassification


hf_tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
hf_model = AutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased")

"""
train_df and valid_df look like this:

      label text
4240    5   whoa interesting.
13      7   you could you could we just
4639    4   you set the goal,
28      1   because ive already agreed to that
66      8   oh hey freshman thats you gona need
"""

print(list(train_df.label.value_counts().index))
"""
[4, 1, 5, 6, 7, 0, 2, 3, 8]
"""

class HF_Dataset(torch.utils.data.Dataset):
    def __init__(self, df, hf_tokenizer):
        self.df = df
        self.hf_tokenizer = hf_tokenizer
        
        self.label_map = {
            0:0,
            1:0,
            2:0,
            3:0,
            4:1,
            5:1,
            6:1,
            7:1,
            8:1
        }
        
    def __len__(self):
        return len(self.df)

    def decode(self, token_ids):
        # Decode a batch of token-id sequences back into whitespace-joined text.
        return ' '.join([self.hf_tokenizer.decode(x) for x in token_ids])
    
    def decode_to_original(self, token_ids):
        return self.hf_tokenizer.decode(token_ids.squeeze())

    def __getitem__(self, index):
        label, text = self.df.iloc[index]
        label = self.label_map[label]
        label = torch.tensor(label)

        tokenizer_output = self.hf_tokenizer(text, return_tensors="pt", padding='max_length', truncation=True, max_length=512)
        
        tokenizer_output['input_ids'].squeeze_()
        tokenizer_output['attention_mask'].squeeze_()
        
        return tokenizer_output, label
        

train_dataset = HF_Dataset(train_df, hf_tokenizer)
valid_dataset = HF_Dataset(valid_df, hf_tokenizer)

train_dl = DataLoader(train_dataset, bs=16, shuffle=True)
valid_dl = DataLoader(valid_dataset, bs=16)
dls = DataLoaders(train_dl, valid_dl)


class HF_Model(nn.Module):
  
    def __init__(self, hf_model):
        super().__init__()
        
        self.hf_model = hf_model
        
    def forward(self, tokenizer_outputs):
        
        model_output = self.hf_model(**tokenizer_outputs)
        
        return model_output.logits
        
model = HF_Model(hf_model)
# Manually popping the model onto the gpu since the data is in a dictionary format
# (doesn't automatically place model + data on gpu otherwise)
learn = Learner(dls, model.cuda(), loss_func=nn.CrossEntropyLoss(), metrics=[accuracy])
learn.fit_one_cycle(3, 1e-4)

This works fine. However, to make it work I had mapped my multiclass labels down to two labels. I actually have 9 classes. I tried adjusting the label mapping scheme in the HF_Dataset() class to match my actual labels, like this:

class HF_Dataset(torch.utils.data.Dataset):
    def __init__(self, df, hf_tokenizer):
        self.df = df
        self.hf_tokenizer = hf_tokenizer
        
        self.label_map = {
            0:0,
            1:1,
            2:2,
            3:3,
            4:4,
            5:5,
            6:6,
            7:7,
            8:8
        }
        
    def __len__(self):
        return len(self.df)

    def decode(self, token_ids):
        # Decode a batch of token-id sequences back into whitespace-joined text.
        return ' '.join([self.hf_tokenizer.decode(x) for x in token_ids])
    
    def decode_to_original(self, token_ids):
        return self.hf_tokenizer.decode(token_ids.squeeze())

    def __getitem__(self, index):
        label, text = self.df.iloc[index]
        label = self.label_map[label]
        label = torch.tensor(label)

        tokenizer_output = self.hf_tokenizer(text, return_tensors="pt", padding='max_length', truncation=True, max_length=512)
        
        tokenizer_output['input_ids'].squeeze_()
        tokenizer_output['attention_mask'].squeeze_()
        
        return tokenizer_output, label

Every line works until learn.fit_one_cycle.

Here is the full stack trace for that line:

 0.00% [0/3 00:00<00:00]
epoch   train_loss  valid_loss  accuracy    time

 0.00% [0/519 00:00<00:00]
---------------------------------------------------------------------------
IndexError                                Traceback (most recent call last)
<ipython-input-21-0ec2ff9e12e1> in <module>
----> 1 learn.fit_one_cycle(3, 1e-4)

~/anaconda3/envs/pytorch_p36/lib/python3.6/site-packages/fastai/callback/schedule.py in fit_one_cycle(self, n_epoch, lr_max, div, div_final, pct_start, wd, moms, cbs, reset_opt)
    111     scheds = {'lr': combined_cos(pct_start, lr_max/div, lr_max, lr_max/div_final),
    112               'mom': combined_cos(pct_start, *(self.moms if moms is None else moms))}
--> 113     self.fit(n_epoch, cbs=ParamScheduler(scheds)+L(cbs), reset_opt=reset_opt, wd=wd)
    114 
    115 # Cell

~/anaconda3/envs/pytorch_p36/lib/python3.6/site-packages/fastai/learner.py in fit(self, n_epoch, lr, wd, cbs, reset_opt)
    219             self.opt.set_hypers(lr=self.lr if lr is None else lr)
    220             self.n_epoch = n_epoch
--> 221             self._with_events(self._do_fit, 'fit', CancelFitException, self._end_cleanup)
    222 
    223     def _end_cleanup(self): self.dl,self.xb,self.yb,self.pred,self.loss = None,(None,),(None,),None,None

~/anaconda3/envs/pytorch_p36/lib/python3.6/site-packages/fastai/learner.py in _with_events(self, f, event_type, ex, final)
    161 
    162     def _with_events(self, f, event_type, ex, final=noop):
--> 163         try: self(f'before_{event_type}');  f()
    164         except ex: self(f'after_cancel_{event_type}')
    165         self(f'after_{event_type}');  final()

~/anaconda3/envs/pytorch_p36/lib/python3.6/site-packages/fastai/learner.py in _do_fit(self)
    210         for epoch in range(self.n_epoch):
    211             self.epoch=epoch
--> 212             self._with_events(self._do_epoch, 'epoch', CancelEpochException)
    213 
    214     def fit(self, n_epoch, lr=None, wd=None, cbs=None, reset_opt=False):

~/anaconda3/envs/pytorch_p36/lib/python3.6/site-packages/fastai/learner.py in _with_events(self, f, event_type, ex, final)
    161 
    162     def _with_events(self, f, event_type, ex, final=noop):
--> 163         try: self(f'before_{event_type}');  f()
    164         except ex: self(f'after_cancel_{event_type}')
    165         self(f'after_{event_type}');  final()

~/anaconda3/envs/pytorch_p36/lib/python3.6/site-packages/fastai/learner.py in _do_epoch(self)
    204 
    205     def _do_epoch(self):
--> 206         self._do_epoch_train()
    207         self._do_epoch_validate()
    208 

~/anaconda3/envs/pytorch_p36/lib/python3.6/site-packages/fastai/learner.py in _do_epoch_train(self)
    196     def _do_epoch_train(self):
    197         self.dl = self.dls.train
--> 198         self._with_events(self.all_batches, 'train', CancelTrainException)
    199 
    200     def _do_epoch_validate(self, ds_idx=1, dl=None):

~/anaconda3/envs/pytorch_p36/lib/python3.6/site-packages/fastai/learner.py in _with_events(self, f, event_type, ex, final)
    161 
    162     def _with_events(self, f, event_type, ex, final=noop):
--> 163         try: self(f'before_{event_type}');  f()
    164         except ex: self(f'after_cancel_{event_type}')
    165         self(f'after_{event_type}');  final()

~/anaconda3/envs/pytorch_p36/lib/python3.6/site-packages/fastai/learner.py in all_batches(self)
    167     def all_batches(self):
    168         self.n_iter = len(self.dl)
--> 169         for o in enumerate(self.dl): self.one_batch(*o)
    170 
    171     def _do_one_batch(self):

~/anaconda3/envs/pytorch_p36/lib/python3.6/site-packages/fastai/learner.py in one_batch(self, i, b)
    192         b = self._set_device(b)
    193         self._split(b)
--> 194         self._with_events(self._do_one_batch, 'batch', CancelBatchException)
    195 
    196     def _do_epoch_train(self):

~/anaconda3/envs/pytorch_p36/lib/python3.6/site-packages/fastai/learner.py in _with_events(self, f, event_type, ex, final)
    161 
    162     def _with_events(self, f, event_type, ex, final=noop):
--> 163         try: self(f'before_{event_type}');  f()
    164         except ex: self(f'after_cancel_{event_type}')
    165         self(f'after_{event_type}');  final()

~/anaconda3/envs/pytorch_p36/lib/python3.6/site-packages/fastai/learner.py in _do_one_batch(self)
    173         self('after_pred')
    174         if len(self.yb):
--> 175             self.loss_grad = self.loss_func(self.pred, *self.yb)
    176             self.loss = self.loss_grad.clone()
    177         self('after_loss')

~/anaconda3/envs/pytorch_p36/lib/python3.6/site-packages/torch/nn/modules/module.py in _call_impl(self, *input, **kwargs)
   1049         if not (self._backward_hooks or self._forward_hooks or self._forward_pre_hooks or _global_backward_hooks
   1050                 or _global_forward_hooks or _global_forward_pre_hooks):
-> 1051             return forward_call(*input, **kwargs)
   1052         # Do not call functions when jit is used
   1053         full_backward_hooks, non_full_backward_hooks = [], []

~/anaconda3/envs/pytorch_p36/lib/python3.6/site-packages/torch/nn/modules/loss.py in forward(self, input, target)
   1119     def forward(self, input: Tensor, target: Tensor) -> Tensor:
   1120         return F.cross_entropy(input, target, weight=self.weight,
-> 1121                                ignore_index=self.ignore_index, reduction=self.reduction)
   1122 
   1123 

~/anaconda3/envs/pytorch_p36/lib/python3.6/site-packages/torch/nn/functional.py in cross_entropy(input, target, weight, size_average, ignore_index, reduce, reduction)
   2822     if size_average is not None or reduce is not None:
   2823         reduction = _Reduction.legacy_get_string(size_average, reduce)
-> 2824     return torch._C._nn.cross_entropy_loss(input, target, weight, _Reduction.get_enum(reduction), ignore_index)
   2825 
   2826 

IndexError: Target 6 is out of bounds.

This seems like it should be a simple fix. Do I need to adjust something in the model architecture so that it accepts 9 labels? Or do I need to one-hot encode my labels? If so, is there a pre-built solution for that?


1 Answer

Stack Overflow user

Accepted answer

Posted on 2021-08-25 19:09:25

You need to specify num_labels=9 when loading the model:

hf_model = AutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=9)

The default value is 2, which suits the first use case, but breaks as soon as you try to use more classes.
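
To see concretely why it breaks, here is a minimal sketch (with made-up logits, not the data above) of what happens inside the loss function. It also shows that nn.CrossEntropyLoss takes plain integer class indices, so no one-hot encoding is needed:

import torch
import torch.nn as nn

loss_fn = nn.CrossEntropyLoss()
targets = torch.tensor([0, 1, 6, 3])  # integer class indices; class 6 exists in the 9-class data

# With the default num_labels=2 the model emits logits of shape (batch, 2):
logits_2 = torch.randn(4, 2)
# loss_fn(logits_2, targets)  # raises IndexError: Target 6 is out of bounds.

# With num_labels=9 the logits have shape (batch, 9) and the same targets work:
logits_9 = torch.randn(4, 9)
print(loss_fn(logits_9, targets))  # prints a valid scalar loss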

Note that the library explicitly warns that the classification head (which produces the .logits you are interested in) is randomly initialized:

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.weight', 'pre_classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
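
Because of that, a quick sanity check can be useful before retraining. This sketch (assuming the hf_tokenizer and the re-loaded hf_model from above) confirms the new head emits 9 logits per example:

import torch

sample = hf_tokenizer("whoa interesting.", return_tensors="pt")
with torch.no_grad():
    out = hf_model(**sample)
print(out.logits.shape)  # torch.Size([1, 9])

The randomly initialized head is also why the existing learn.fit_one_cycle(3, 1e-4) call is still needed: it is exactly the downstream training the warning refers to.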

1 vote
Original question: https://stackoverflow.com/questions/68928299