使用弹性搜索版本6.8.0
hive> select * from provider1;
OK
{"id","k11",}
{"id","k12",}
{"id","k13",}
{"id","k14",}
{"id":"K1","name":"Ravi","salary":500}
{"id":"K2","name":"Ravi","salary":500}
{"id":"K3","name":"Ravi","salary":500}
{"id":"K4","name":"Ravi","salary":500}
{"id":"K5","name":"Ravi","salary":500}
{"id":"K6","name":"Ravi","salary":"sdfgg"}
{"id":"K7","name":"Ravi","salary":"sdf"}
{"id":"k8"}
{"id":"K9","name":"r1","salary":522}
{"id":"k10","name":"r2","salary":53}
Time taken: 0.179 seconds, Fetched: 14 row(s)ADD JAR /home/smrafi/elasticsearch-hadoop-6.8.0/dist/elasticsearch-hadoop-6.8.0.jar;
CREATE external TABLE hive_es_with_handler( data STRING)
STORED BY 'org.elasticsearch.hadoop.hive.EsStorageHandler'
TBLPROPERTIES(
'es.resource' = 'test_eshadoop/healthCareProvider',
'es.nodes' = 'vpc-pid-pre-prod-es-cluster-b7thvqfj3tp45arxl34gge3yyi.us-east-2.es.amazonaws.com',
'es.input.json' = 'yes',
'es.index.auto.create' = 'true',
'es.write.operation'='upsert',
'es.nodes.wan.only' = 'true',
'es.port' = '443',
'es.net.ssl'='true',
'es.batch.size.entries'='1',
'es.mapping.id' ='id',
'es.batch.write.retry.count'='-1',
'es.batch.write.retry.wait'='60s',
'es.write.rest.error.handlers' = 'es, ignoreBadRecords',
'es.write.data.error.handlers' = 'customLog',
'es.write.data.error.handler.customLog' = 'com.verisys.elshandler.CustomLogOnError',
'es.write.rest.error.handler.es.client.resource'="error_es_index/error",
'es.write.rest.error.handler.es.return.default'='HANDLED',
'es.write.rest.error.handler.log.logger.name' = 'BulkErrors',
'es.write.data.error.handler.log.logger.name' = 'SerializationErrors',
'es.write.rest.error.handler.ignoreBadRecords' = 'com.verisys.elshandler.IgnoreBadRecordHandler',
'es.write.rest.error.handler.es.return.error'='HANDLED');
insert into hive_es_with_handler10 select * from provider1;下面是异常跟踪,它失败了,抱怨error.handler索引不存在。
Caused by: org.elasticsearch.hadoop.serialization.EsHadoopSerializationException: org.codehaus.jackson.JsonParseException: Unexpected character (',' (code 44)): was expecting a colon to separate field name and value at [Source: [B@1e3f0aea; line: 1, column: 7]
at org.elasticsearch.hadoop.serialization.json.JacksonJsonParser.nextToken(JacksonJsonParser.java:95)
at org.elasticsearch.hadoop.serialization.ParsingUtils.doFind(ParsingUtils.java:168)
at org.elasticsearch.hadoop.serialization.ParsingUtils.values(ParsingUtils.java:151)
at org.elasticsearch.hadoop.serialization.field.JsonFieldExtractors.process(JsonFieldExtractors.java:213)
at org.elasticsearch.hadoop.serialization.bulk.JsonTemplatedBulk.preProcess(JsonTemplatedBulk.java:64)
at org.elasticsearch.hadoop.serialization.bulk.TemplatedBulk.write(TemplatedBulk.java:54)
at org.elasticsearch.hadoop.hive.EsSerDe.serialize(EsSerDe.java:171)
at org.apache.hadoop.hive.ql.exec.FileSinkOperator.process(FileSinkOperator.java:725)
at org.apache.hadoop.hive.ql.exec.Operator.forward(Operator.java:897)
at org.apache.hadoop.hive.ql.exec.SelectOperator.process(SelectOperator.java:95)
at org.apache.hadoop.hive.ql.exec.Operator.forward(Operator.java:897)
at org.apache.hadoop.hive.ql.exec.TableScanOperator.process(TableScanOperator.java:130)
at org.apache.hadoop.hive.ql.exec.MapOperator$MapOpCtx.forward(MapOperator.java:148)
at org.apache.hadoop.hive.ql.exec.MapOperator.process(MapOperator.java:550)
... 9 more
Caused by: org.codehaus.jackson.JsonParseException: Unexpected character (',' (code 44)): was expecting a colon to separate field name and value at [Source: [B@1e3f0aea; line: 1, column: 7]
at org.codehaus.jackson.JsonParser._constructError(JsonParser.java:1433)
at org.codehaus.jackson.impl.JsonParserMinimalBase._reportError(JsonParserMinimalBase.java:521)
at org.codehaus.jackson.impl.JsonParserMinimalBase._reportUnexpectedChar(JsonParserMinimalBase.java:442)
at org.codehaus.jackson.impl.Utf8StreamParser.nextToken(Utf8StreamParser.java:500)
at org.elasticsearch.hadoop.serialization.json.JacksonJsonParser.nextToken(JacksonJsonParser.java:93) ... 22 more我试着使用自定义的SerializationErrorHandler,但是它没有用,而且Handler没有进入上下文,它完全停止了作业,而不是在默认情况下(作为常量处理)继续寻找好的记录。
发布于 2022-11-01 20:36:08
似乎你有无效的JSON
在文档中提到,这不是由Hive处理的。
序列化错误处理程序还不能用于Hive。Apache的使用Hive的SerDe结构在发送到输出格式之前将数据转换为批量条目。SerDe对象没有在对象结束其生命周期时调用的清理方法。因此,我们不支持Hive中的序列化错误处理程序,因为它们不能在作业执行结束时关闭。
https://stackoverflow.com/questions/74277595
复制相似问题