注意:这个问题以不同的方式开始,但我删除了所有以前(现在不必要的)信息。
我有一个CsvDataset,它由一个标签(浮点数)和一个文本(字符串)组成。我想要转换每一行,这样我就可以将其输入到预先训练的Bert模型中。不幸的是,我无法通过 .map 函数完成这一转换。
# Collect all TSV shards matching the pattern in the working directory.
files = glob.glob("example*.tsv")
# Build a dataset of (label: float32, text: string) rows, reading only
# columns 3 and 4 of each tab-separated file (skipping the header row).
d = tf.data.experimental.CsvDataset(files,
[tf.float32, tf.string],
select_cols=[3,4],
field_delim="\t",
header=True)
# NOTE(review): this is the broken call from the question.  Tout is declared
# as [tf.float32, tf.string], but _decode_record returns a tf.train.Example
# object, which tf.py_func cannot convert to a tensor -- this is what raises
# the "Unsupported object type Example" error quoted below.
parsed_dataset = d.map(lambda label, text: tf.py_func(_decode_record, [label, text], [tf.float32, tf.string]))
def _decode_record(label, text):
"""Decodes a row to a TensorFlow example.

NOTE(review): broken as posted -- tf.py_func can only return values of
true TF dtypes (tensors / ndarrays / bytes), not a tf.train.Example
object, hence the UnimplementedError quoted at the end of this block.
The working version (_convert, below) serializes the Example to a
string instead.
"""
# Map each known label value to its zero-based index.
label_list = [1, 2, 3, 4, 5]
label_map = {}
# NOTE(review): the loop variable shadows the `label` parameter, so the
# `label_map[label]` lookup further down always sees the *last* entry of
# label_list, not the row's actual label.  Fixed in _convert below.
for (i, label) in enumerate(label_list):
label_map[label] = i
tokens_a = tokenizer.tokenize(text)
# Account for [CLS] and [SEP] with "- 2"
if len(tokens_a) > max_seq_length - 2:
tokens_a = tokens_a[0: (max_seq_length - 2)]
# Single-sentence input: every position belongs to segment 0.
tokens = []
segment_ids = []
tokens.append("[CLS]")
segment_ids.append(0)
for token in tokens_a:
tokens.append(token)
segment_ids.append(0)
tokens.append("[SEP]")
segment_ids.append(0)
input_ids = tokenizer.convert_tokens_to_ids(tokens)
# The mask has 1 for real tokens and 0 for padding tokens. Only real
# tokens are attended to.
input_mask = [1] * len(input_ids)
# Zero-pad up to the sequence length.
while len(input_ids) < max_seq_length:
input_ids.append(0)
input_mask.append(0)
segment_ids.append(0)
assert len(input_ids) == max_seq_length
assert len(input_mask) == max_seq_length
assert len(segment_ids) == max_seq_length
label_id = label_map[label]
features = collections.OrderedDict()
features["input_ids"] = create_int_feature(input_ids)
features["input_mask"] = create_int_feature(input_mask)
features["segment_ids"] = create_int_feature(segment_ids)
features["label_ids"] = create_int_feature([label_id])
features["is_real_example"] = create_int_feature(
[int(True)])
tf_example = tf.train.Example(features=tf.train.Features(feature=features))
# NOTE(review): returning the Example object itself is the bug.
return tf_example这与:tensorflow.python.framework.errors_impl.UnimplementedError: Unsupported object type Example [[{{node PyFunc}}]] [Op:IteratorGetNextSync]中断
发布于 2019-05-23 23:24:36
我找到了解决这个问题的办法。下面的代码可以完成这项工作。我的问题是我误解了tf.py_func的Tout参数
def _convert(label, text):
    """Decodes one CSV row into a serialized tf.train.Example string.

    Runs inside tf.py_func, so `label` arrives as a numpy float32 and
    `text` as a numpy bytes/str object.  tf.py_func cannot return a
    tf.train.Example object (only true TF dtypes), so the Example is
    serialized to a single string and parsed again later by
    _decode_record.

    Args:
        label: numeric class label; expected to be one of 1..5.
        text: raw text of the row, tokenized with the module-level
            `tokenizer`.  (Assumes `tokenizer`, `max_seq_length` and
            `create_int_feature` are defined at module level.)

    Returns:
        bytes: the serialized tf.train.Example for this row.
    """
    # Map each known label value to its zero-based index.  A comprehension
    # avoids the original bug where the loop variable shadowed (and
    # clobbered) the `label` parameter, making label_id always 4.
    label_map = {lbl: i for i, lbl in enumerate([1, 2, 3, 4, 5])}
    tokens_a = tokenizer.tokenize(text)
    # Truncate so there is room for the [CLS] and [SEP] markers.
    if len(tokens_a) > max_seq_length - 2:
        tokens_a = tokens_a[0:(max_seq_length - 2)]
    # Single-sentence input: every position belongs to segment 0.
    tokens = ["[CLS]"] + tokens_a + ["[SEP]"]
    segment_ids = [0] * len(tokens)
    input_ids = tokenizer.convert_tokens_to_ids(tokens)
    # The mask has 1 for real tokens and 0 for padding tokens. Only real
    # tokens are attended to.
    input_mask = [1] * len(input_ids)
    # Zero-pad all three parallel lists up to the fixed sequence length.
    pad_len = max_seq_length - len(input_ids)
    input_ids.extend([0] * pad_len)
    input_mask.extend([0] * pad_len)
    segment_ids.extend([0] * pad_len)
    assert len(input_ids) == max_seq_length
    assert len(input_mask) == max_seq_length
    assert len(segment_ids) == max_seq_length
    # int() normalizes the numpy float32 coming out of tf.py_func so the
    # dict lookup uses the same key type the map was built with.
    label_id = label_map[int(label)]
    features = collections.OrderedDict()
    features["input_ids"] = create_int_feature(input_ids)
    features["input_mask"] = create_int_feature(input_mask)
    features["segment_ids"] = create_int_feature(segment_ids)
    features["label_ids"] = create_int_feature([label_id])
    features["is_real_example"] = create_int_feature([int(True)])
    tf_example = tf.train.Example(features=tf.train.Features(feature=features))
    # We cannot return the Example here because tf.py_func only accepts
    # true TF datatypes -- return the serialized proto instead.
    return tf_example.SerializeToString()
# Parsing schema for the serialized Examples produced by _convert: three
# fixed-length int64 vectors of 128 tokens, plus two scalar int64 fields.
name_to_features = {
    name: tf.FixedLenFeature([size], tf.int64)
    for name, size in (
        ("input_ids", 128),
        ("input_mask", 128),
        ("segment_ids", 128),
        ("label_ids", 1),
        ("is_real_example", 1),
    )
}
def _decode_record(record):
    """Parses one serialized tf.train.Example back into a feature dict.

    Args:
        record: scalar string tensor holding a serialized tf.train.Example
            (as produced by _convert).

    Returns:
        dict mapping each feature name from the module-level
        `name_to_features` schema to an int32 tensor.
    """
    example = tf.parse_single_example(record, name_to_features)
    # tf.Example only supports tf.int64, but the TPU only supports tf.int32.
    # So cast all int64 to int32.  Iterate over a snapshot of the keys so
    # the dict can be mutated inside the loop.
    for name in list(example.keys()):
        t = example[name]
        if t.dtype == tf.int64:
            example[name] = tf.to_int32(t)
    # (Removed leftover debug print of the example dict.)
    return example
# Stage 1: py_func produces one serialized Example string per row.  Note
# Tout is a single tf.string here -- this is the parameter the question's
# version got wrong.
parsed_dataset = d.map(lambda label, text: tf.py_func(_convert, [label, text], tf.string))
# Stage 2: graph-native parse of the serialized Examples back into tensors.
parsed_dataset = parsed_dataset.map(_decode_record)请注意,此解决方案使用tf.py_func,因此不适用于GPU或TPU等加速器
https://stackoverflow.com/questions/56227420
复制相似问题