目前,我能够使用下面的配置文件来训练语义角色标注模型。此配置文件基于AllenNLP提供的配置文件,既适用于默认的bert-base-uncased模型,也适用于GroNLP/bert-base-dutch-cased。
{
"dataset_reader": {
"type": "srl_custom",
"bert_model_name": "GroNLP/bert-base-dutch-cased"
},
"data_loader": {
"batch_sampler": {
"type": "bucket",
"batch_size": 32
}
},
"train_data_path": "./data/SRL/SONAR_1_SRL/MANUAL500/",
"validation_data_path": "./data/SRL/SONAR_1_SRL/MANUAL500/",
"model": {
"type": "srl_bert",
"embedding_dropout": 0.1,
"bert_model": "GroNLP/bert-base-dutch-cased"
},
"trainer": {
"optimizer": {
"type": "huggingface_adamw",
"lr": 5e-5,
"correct_bias": false,
"weight_decay": 0.01,
"parameter_groups": [
[
[
"bias",
"LayerNorm.bias",
"LayerNorm.weight",
"layer_norm.weight"
],
{
"weight_decay": 0.0
}
]
]
},
"learning_rate_scheduler": {
"type": "slanted_triangular"
},
"checkpointer": {
"keep_most_recent_by_count": 2
},
"grad_norm": 1.0,
"num_epochs": 3,
"validation_metric": "+f1-measure-overall"
}
}将bert_model_name和bert_model参数的值从GroNLP/bert-base-dutch-cased换成roberta-base并不能开箱即用,因为SrlReader只支持BertTokenizer,而不支持RobertaTokenizer。因此,我将配置文件更改为:
{
"dataset_reader": {
"type": "srl_custom",
"token_indexers": {
"tokens": {
"type": "pretrained_transformer",
"model_name": "roberta-base"
}
}
},
"data_loader": {
"batch_sampler": {
"type": "bucket",
"batch_size": 32
}
},
"train_data_path": "./data/SRL/SONAR_1_SRL/MANUAL500/",
"validation_data_path": "./data/SRL/SONAR_1_SRL/MANUAL500/",
"model": {
"type": "srl_bert",
"embedding_dropout": 0.1,
"bert_model": "roberta-base"
},
"trainer": {
"optimizer": {
"type": "huggingface_adamw",
"lr": 5e-5,
"correct_bias": false,
"weight_decay": 0.01,
"parameter_groups": [
[
[
"bias",
"LayerNorm.bias",
"LayerNorm.weight",
"layer_norm.weight"
],
{
"weight_decay": 0.0
}
]
]
},
"learning_rate_scheduler": {
"type": "slanted_triangular"
},
"checkpointer": {
"keep_most_recent_by_count": 2
},
"grad_norm": 1.0,
"num_epochs": 15,
"validation_metric": "+f1-measure-overall"
}
}然而,这仍然不起作用。我收到以下错误:
2022-02-22 16:19:34,122 - INFO - allennlp.training.gradient_descent_trainer - Training
0%| | 0/1546 [00:00<?, ?it/s]2022-02-22 16:19:34,142 - INFO - allennlp.data.samplers.bucket_batch_sampler - No sorting keys given; trying to guess a good one
2022-02-22 16:19:34,142 - INFO - allennlp.data.samplers.bucket_batch_sampler - Using ['tokens'] as the sorting keys
0%| | 0/1546 [00:00<?, ?it/s]
2022-02-22 16:19:34,526 - CRITICAL - root - Uncaught exception
Traceback (most recent call last):
File "C:\Program Files\Python39\lib\runpy.py", line 197, in _run_module_as_main
return _run_code(code, main_globals, None,
File "C:\Program Files\Python39\lib\runpy.py", line 87, in _run_code
exec(code, run_globals)
File "C:\Users\denbe\AppData\Roaming\Python\Python39\Scripts\allennlp.exe\__main__.py", line 7, in <module>
sys.exit(run())
File "C:\Users\denbe\AppData\Roaming\Python\Python39\site-packages\allennlp\__main__.py", line 39, in run
main(prog="allennlp")
File "C:\Users\denbe\AppData\Roaming\Python\Python39\site-packages\allennlp\commands\__init__.py", line 119, in main
args.func(args)
File "C:\Users\denbe\AppData\Roaming\Python\Python39\site-packages\allennlp\commands\train.py", line 111, in train_model_from_args
train_model_from_file(
File "C:\Users\denbe\AppData\Roaming\Python\Python39\site-packages\allennlp\commands\train.py", line 177, in train_model_from_file
return train_model(
File "C:\Users\denbe\AppData\Roaming\Python\Python39\site-packages\allennlp\commands\train.py", line 258, in train_model
model = _train_worker(
File "C:\Users\denbe\AppData\Roaming\Python\Python39\site-packages\allennlp\commands\train.py", line 508, in _train_worker
metrics = train_loop.run()
File "C:\Users\denbe\AppData\Roaming\Python\Python39\site-packages\allennlp\commands\train.py", line 581, in run
return self.trainer.train()
File "C:\Users\denbe\AppData\Roaming\Python\Python39\site-packages\allennlp\training\gradient_descent_trainer.py", line 771, in train
metrics, epoch = self._try_train()
File "C:\Users\denbe\AppData\Roaming\Python\Python39\site-packages\allennlp\training\gradient_descent_trainer.py", line 793, in _try_train
train_metrics = self._train_epoch(epoch)
File "C:\Users\denbe\AppData\Roaming\Python\Python39\site-packages\allennlp\training\gradient_descent_trainer.py", line 510, in _train_epoch
batch_outputs = self.batch_outputs(batch, for_training=True)
File "C:\Users\denbe\AppData\Roaming\Python\Python39\site-packages\allennlp\training\gradient_descent_trainer.py", line 403, in batch_outputs
output_dict = self._pytorch_model(**batch)
File "C:\Users\denbe\AppData\Roaming\Python\Python39\site-packages\torch\nn\modules\module.py", line 889, in _call_impl
result = self.forward(*input, **kwargs)
File "C:\Users\denbe\AppData\Roaming\Python\Python39\site-packages\allennlp_models\structured_prediction\models\srl_bert.py", line 141, in forward
bert_embeddings, _ = self.bert_model(
File "C:\Users\denbe\AppData\Roaming\Python\Python39\site-packages\torch\nn\modules\module.py", line 889, in _call_impl
result = self.forward(*input, **kwargs)
File "C:\Users\denbe\AppData\Roaming\Python\Python39\site-packages\transformers\models\bert\modeling_bert.py", line 989, in forward
embedding_output = self.embeddings(
File "C:\Users\denbe\AppData\Roaming\Python\Python39\site-packages\torch\nn\modules\module.py", line 889, in _call_impl
result = self.forward(*input, **kwargs)
File "C:\Users\denbe\AppData\Roaming\Python\Python39\site-packages\transformers\models\bert\modeling_bert.py", line 215, in forward
token_type_embeddings = self.token_type_embeddings(token_type_ids)
File "C:\Users\denbe\AppData\Roaming\Python\Python39\site-packages\torch\nn\modules\module.py", line 889, in _call_impl
result = self.forward(*input, **kwargs)
File "C:\Users\denbe\AppData\Roaming\Python\Python39\site-packages\torch\nn\modules\sparse.py", line 156, in forward
return F.embedding(
File "C:\Users\denbe\AppData\Roaming\Python\Python39\site-packages\torch\nn\functional.py", line 1916, in embedding
return torch.embedding(weight, input, padding_idx, scale_grad_by_freq, sparse)
IndexError: index out of range in self我不完全明白出了什么问题,也找不到任何文档说明如何更改配置文件以加载“自定义”的BERT/RoBERTa模型(即此处未提及的模型)。我正在运行默认的allennlp train config.jsonnet命令来开始训练。不过,allennlp train config.jsonnet --dry-run不会产生错误。
提前感谢!蒂杰斯
编辑:我现在已将"srl_bert"替换为自定义的"srl_roberta"类,以使用RobertaModel。然而,这仍然会产生相同的错误。
EDIT2:我现在按照Dirk的建议使用AutoTokenizer。看起来,要让SrlReader类支持基于RoBERTa的模型,还需要进行更多的更改,例如将BERT的WordPiece分词器替换为RoBERTa的BPE分词器。是否有一种简单的方法来改造SrlReader类,还是从零开始编写一个新的RobertaSrlReader更好?
我继承了SrlReader类,并将这一行更改为:
self.bert_tokenizer = AutoTokenizer.from_pretrained(bert_model_name)它会产生以下错误,因为RoBERTa的分词方式与BERT不同:
File "C:\Users\denbe\AppData\Roaming\Python\Python39\site-packages\allennlp_models\structured_prediction\dataset_readers\srl.py", line 255, in text_to_instance
wordpieces, offsets, start_offsets = self._wordpiece_tokenize_input(
File "C:\Users\denbe\AppData\Roaming\Python\Python39\site-packages\allennlp_models\structured_prediction\dataset_readers\srl.py", line 196, in _wordpiece_tokenize_input
word_pieces = self.bert_tokenizer.wordpiece_tokenizer.tokenize(token)
AttributeError: 'RobertaTokenizerFast' object has no attribute 'wordpiece_tokenizer'发布于 2022-02-24 02:14:23
解决这一问题的最简单方法是修补SrlReader,使其使用PretrainedTransformerTokenizer (来自AllenNLP)或AutoTokenizer (来自Huggingface)而不是BertTokenizer。SrlReader是一个旧类,是针对Huggingface的旧版本编写的,因此升级起来并不容易。
如果您想向AllenNLP项目提交一个Pull Request,我很乐意帮助您将其合并到AllenNLP中!
https://stackoverflow.com/questions/71223907
复制相似问题