Fixing a Kafka HDFS Sink Connector Error When Integrating with Hive

Kafka Connect runs in distributed mode on three machines: 192.168.1.204, 192.168.1.100, and 192.168.1.200. Port 18083 is used for submitting connectors to the Kafka Connect worker processes (the default port 8083 was already taken).
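
For reference, the non-default REST port is set in the distributed worker configuration. A minimal sketch, assuming the workers are started from etc/kafka/connect-distributed.properties under the Confluent installation (the file path and every value here other than rest.port are assumptions, not taken from the actual cluster):

# hypothetical excerpt of the distributed worker config
bootstrap.servers=192.168.1.204:9092,192.168.1.100:9092,192.168.1.200:9092
group.id=connect-cluster
# REST API on 18083 instead of the default 8083
rest.port=18083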

When using the Confluent HDFS Sink Connector to load data from Kafka into HDFS and then into a Hive table, the connector was configured as follows:

{
        "name":"dev_hdfs-sink",
        "config":{
                "connector.class":"io.confluent.connect.hdfs.HdfsSinkConnector",
                "tasks.max":"1",
                "topics":"user_option5",
                "hdfs.url":"hdfs://192.168.1.204:14000",
                "flush.size":"3",
                "hive.integration":true,
                "hive.database":"test_db",
                "hive.metastore.uris":"thrift://192.168.1.204:9083",
                "schema.compatibility":"BACKWARD"
        }
}

Running a select against the Hive table produced the following error:

12: jdbc:hive2://localhost:10000> select * from user_option5;
19:41:29.901 [main] DEBUG org.apache.thrift.transport.TSaslTransport - writing data length: 133
19:41:29.958 [main] DEBUG org.apache.thrift.transport.TSaslTransport - CLIENT: reading data length: 4594
Error: Error while compiling statement: FAILED: RuntimeException MetaException(message:org.apache.hadoop.hive.serde2.avro.AvroSerdeException Schema for table must be of type RECORD. Received type: STRING) (state=42000,code=40000)

Here is the CREATE TABLE statement of the Hive table that the connector created automatically:

12: jdbc:hive2://localhost:10000> show create table user_option5;
+--------------------------------------------------------------------+--+
|                           createtab_stmt                           |
+--------------------------------------------------------------------+--+
| CREATE EXTERNAL TABLE `user_option5`(                              |
| )                                                                  |
| PARTITIONED BY (                                                   |
|   `partition` string COMMENT '')                                   |
| ROW FORMAT SERDE                                                   |
|   'org.apache.hadoop.hive.serde2.avro.AvroSerDe'                   |
| STORED AS INPUTFORMAT                                              |
|   'org.apache.hadoop.hive.ql.io.avro.AvroContainerInputFormat'     |
| OUTPUTFORMAT                                                       |
|   'org.apache.hadoop.hive.ql.io.avro.AvroContainerOutputFormat'    |
| LOCATION                                                           |
|   'hdfs://192.168.1.204:14000/topics/user_option5'                 |
| TBLPROPERTIES (                                                    |
|   'avro.schema.literal'='{"type":"string","connect.version":1}',   |
|   'transient_lastDdlTime'='1533469246')                            |
+--------------------------------------------------------------------+--+

As you can see, the type in avro.schema.literal is string, when it should be record.

The error occurred because the KafkaProducer did not attach any schema information to the messages it sent. Specifying the schema in the producer code fixes the problem:

package com.superid.kafka.producer;

import io.confluent.kafka.serializers.KafkaAvroSerializerConfig;
import org.apache.avro.Schema;
import org.apache.avro.generic.GenericData;
import org.apache.avro.generic.GenericRecord;
import org.apache.kafka.clients.producer.KafkaProducer;
import org.apache.kafka.clients.producer.ProducerConfig;
import org.apache.kafka.clients.producer.ProducerRecord;

import java.util.HashMap;
import java.util.Map;
import java.util.Properties;
import java.util.stream.IntStream;

/**
 * @author 
 * @create: 2018-08-01 17:55
 */
public class SimuKafkaProducer {

    public static void main(String[] args) {
        Properties props = new Properties();

        props.put(ProducerConfig.BOOTSTRAP_SERVERS_CONFIG, "192.168.1.204:9092,192.168.1.100:9092,192.168.1.200:9092");
        props.put("acks", "all");
        props.put("retries", Integer.MAX_VALUE);
        props.put("batch.size", 16384);
        props.put("linger.ms", 1);
        props.put("buffer.memory", 33554432);
        props.put(ProducerConfig.KEY_SERIALIZER_CLASS_CONFIG, io.confluent.kafka.serializers.KafkaAvroSerializer.class);
        props.put(ProducerConfig.VALUE_SERIALIZER_CLASS_CONFIG, io.confluent.kafka.serializers.KafkaAvroSerializer.class);

        // Schema Registry location.
        props.put(KafkaAvroSerializerConfig.SCHEMA_REGISTRY_URL_CONFIG,
                "http://192.168.1.204:18081,http://192.168.1.100:18081,http://192.168.1.200:18081");

        KafkaProducer<Object, Object> producer = new KafkaProducer<>(props);
        // The message schema has to be spelled out explicitly
        String userOptionSchema = "{\"type\":\"record\",\"name\":\"user_option8\"," +
                "\"fields\":[" +
                "{\"name\":\"allianceId\",\"type\":\"long\"}," +
                "{\"name\":\"affairId\",\"type\":\"long\"},"+
                "{\"name\":\"userId\",\"type\":\"int\"},"+
                "{\"name\":\"opType\",\"type\":\"string\"},"+
                "{\"name\":\"beOperatedRoleId\",\"type\":\"string\"},"+
                "{\"name\":\"attrs\",\"type\":{\"type\": \"map\", \"values\":\"string\"}}"+
                "]}";

        Schema.Parser parser = new Schema.Parser();
        Schema schema = parser.parse(userOptionSchema);


        try {
            IntStream.range(1, 100).forEach(index -> {
                Map<String, Object> map = new HashMap<>();
                map.put("name", "pilaf");
                map.put("age", index + 20+"");

                GenericRecord avroRecord = new GenericData.Record(schema);
                avroRecord.put("allianceId", 11L);
                avroRecord.put("affairId",23L);
                avroRecord.put("userId",12);
                avroRecord.put("opType","c");
                avroRecord.put("beOperatedRoleId","str111");
                avroRecord.put("attrs",map);
                // The ProducerRecord that is sent must wrap the avroRecord
                producer.send(new ProducerRecord<>("user_option8", index, avroRecord));

            });
        } catch (Exception e) {
            // may need to do something with it
            e.printStackTrace();
        } finally {
            producer.flush();
            producer.close();
        }

    }
}
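
Once the producer has run, the schema it registered can also be inspected through the Schema Registry REST API. A quick check, assuming the default subject naming strategy (<topic>-value):

curl http://192.168.1.204:18081/subjects/user_option8-value/versions/latest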

Delete the old connector by running the following from a Linux shell:

curl -X DELETE localhost:18083/connectors/dev_hdfs-sink
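
To confirm that the connector is really gone, the worker's REST API can list the connectors that remain:

curl localhost:18083/connectors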

Reconfigure the connector (the configuration is stored in the file dev_hdfs-sink.json; note that it now points at a different topic):

{
        "name":"dev_hdfs-sink",
        "config":{
                "connector.class":"io.confluent.connect.hdfs.HdfsSinkConnector",
                "tasks.max":"1",
                "topics":"user_option8",
                "hdfs.url":"hdfs://192.168.1.204:14000",
                "flush.size":"3",
                "hive.integration":true,
                "hive.database":"test_db",
                "hive.metastore.uris":"thrift://192.168.1.204:9083",
                "schema.compatibility":"BACKWARD"
        }
}

Then create the connector again:

cd  /home/tidb/confluent-4.1.1/etc/kafka-connect-hdfs
curl -X POST -H "Content-Type: application/json" --data @dev_hdfs-sink.json http://192.168.1.204:18083/connectors
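
Right after submitting it, the connector and task state can be checked as well (optional, but handy when a task fails silently):

curl http://192.168.1.204:18083/connectors/dev_hdfs-sink/status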

After the connector is started, run the producer program to write some data into the Kafka topic user_option8. The messages are automatically written to HDFS and loaded into the Hive table (the table is created automatically; no manual intervention is needed).
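
The files can also be verified directly on HDFS. By default the HDFS connector writes under /topics/<topic>/partition=<kafka partition>/, which matches the partition column seen in the Hive table; a check along these lines (the exact directory layout depends on the partitioner configuration):

hdfs dfs -ls hdfs://192.168.1.204:14000/topics/user_option8/partition=0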

Now look again at the CREATE TABLE statement of the Hive table that the connector created automatically (show create table user_option8):

+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+--+
|                                                                                                                                                                                createtab_stmt                                                                                                                                                                                 |
+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+--+
| CREATE EXTERNAL TABLE `user_option8`(                                                                                                                                                                                                                                                                                                                                         |
|   `allianceid` bigint COMMENT '',                                                                                                                                                                                                                                                                                                                                             |
|   `affairid` bigint COMMENT '',                                                                                                                                                                                                                                                                                                                                               |
|   `userid` int COMMENT '',                                                                                                                                                                                                                                                                                                                                                    |
|   `optype` string COMMENT '',                                                                                                                                                                                                                                                                                                                                                 |
|   `beoperatedroleid` string COMMENT '',                                                                                                                                                                                                                                                                                                                                       |
|   `attrs` map<string,string> COMMENT '')                                                                                                                                                                                                                                                                                                                                      |
| PARTITIONED BY (                                                                                                                                                                                                                                                                                                                                                              |
|   `partition` string COMMENT '')                                                                                                                                                                                                                                                                                                                                              |
| ROW FORMAT SERDE                                                                                                                                                                                                                                                                                                                                                              |
|   'org.apache.hadoop.hive.serde2.avro.AvroSerDe'                                                                                                                                                                                                                                                                                                                              |
| STORED AS INPUTFORMAT                                                                                                                                                                                                                                                                                                                                                         |
|   'org.apache.hadoop.hive.ql.io.avro.AvroContainerInputFormat'                                                                                                                                                                                                                                                                                                                |
| OUTPUTFORMAT                                                                                                                                                                                                                                                                                                                                                                  |
|   'org.apache.hadoop.hive.ql.io.avro.AvroContainerOutputFormat'                                                                                                                                                                                                                                                                                                               |
| LOCATION                                                                                                                                                                                                                                                                                                                                                                      |
|   'hdfs://192.168.1.204:14000/topics/user_option8'                                                                                                                                                                                                                                                                                                                            |
| TBLPROPERTIES (                                                                                                                                                                                                                                                                                                                                                               |
|   'avro.schema.literal'='{"type":"record","name":"user_option8","fields":[{"name":"allianceId","type":"long"},{"name":"affairId","type":"long"},{"name":"userId","type":"int"},{"name":"opType","type":"string"},{"name":"beOperatedRoleId","type":"string"},{"name":"attrs","type":{"type":"map","values":"string"}}],"connect.version":1,"connect.name":"user_option8"}',   |
|   'transient_lastDdlTime'='1533525123')                                                                                                                                                                                                                                                                                                                                       |
+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+--+

As you can see, the avro.schema.literal in the CREATE TABLE statement now contains the full schema.

Querying the Hive table again now returns data:

+--------------------------+------------------------+----------------------+----------------------+--------------------------------+--------------------------------+-------------------------+--+
| user_option8.allianceid  | user_option8.affairid  | user_option8.userid  | user_option8.optype  | user_option8.beoperatedroleid  |       user_option8.attrs       | user_option8.partition  |
+--------------------------+------------------------+----------------------+----------------------+--------------------------------+--------------------------------+-------------------------+--+
| 11                       | 23                     | 12                   | c                    | str111                         | {"name":"pilaf","age":"22"}    | 0                       |
| 11                       | 23                     | 12                   | c                    | str111                         | {"name":"pilaf","age":"23"}    | 0                       |
| 11                       | 23                     | 12                   | c                    | str111                         | {"name":"pilaf","age":"24"}    | 0                       |
| 11                       | 23                     | 12                   | c                    | str111                         | {"name":"pilaf","age":"25"}    | 0                       |
| 11                       | 23                     | 12                   | c                    | str111                         | {"name":"pilaf","age":"26"}    | 0                       |
| 11                       | 23                     | 12                   | c                    | str111                         | {"name":"pilaf","age":"27"}    | 0                       |
+--------------------------+------------------------+----------------------+----------------------+--------------------------------+--------------------------------+-------------------------+--+

The schema string above is long and unpleasant to write by hand, so to make it easier to build I wrote a simple wrapper:

package com.superid.entity;

import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

/**
 * @author dufeng
 * @create: 2018-08-06 13:55
 */
public class MySchema {
    /**
     * Schema type to use for Hive tables
     */
    public static final String RECORD_TYPE = "record";

    /**
     * The schema type; record is what is normally used
     */
    private String type;
    /**
     * The schema name
     */
    private String name;
    /**
     * The fields of the schema
     */
    private List<Map<String, String>> fields = new ArrayList<>();

    /**
     * No public constructor; instances are created through the Builder
     */
    private MySchema(){

    }


    private MySchema(Builder builder) {
        type = builder.type;
        name = builder.name;
        fields = builder.fields;
    }


    public static final class Builder {
        private String type;
        private String name;
        private List<Map<String, String>> fields = new ArrayList<>();

        public Builder() {
        }

        public Builder type(String val) {
            type = val;
            return this;
        }

        public Builder name(String val) {
            name = val;
            return this;
        }

        public Builder fields(List<Map<String, String>> fieldMap) {
            fields = fieldMap;
            return this;
        }

        public Builder field(String name,String type) {
            type = type.toLowerCase();

            Map<String,String> aFieldMap = new HashMap<>();
            aFieldMap.put("name",name);
            aFieldMap.put("type",type);
            fields.add(aFieldMap);

            return this;
        }

        public Builder field(Map<String, String> map) {
            fields.add(map);
            return this;
        }

        public MySchema build() {
            return new MySchema(this);
        }
    }


    @Override
    public String toString() {
        StringBuilder fieldStr = new StringBuilder("[");
        for(Map<String,String> map:fields){
            fieldStr.append("{");
            fieldStr.append("\"name\":").append("\"").append(map.get("name")).append("\",");
            if(map.get("type").startsWith("{")){
                fieldStr.append("\"type\":").append(map.get("type"));
            }else {
                fieldStr.append("\"type\":").append("\"").append(map.get("type")).append("\"");
            }

            fieldStr.append("}");
            fieldStr.append(",");
        }

        // remove the trailing comma
        fieldStr.deleteCharAt(fieldStr.length()-1);

        fieldStr.append("]");

        return "{" +
                "\"type\":\"" + type + '\"' +
                ",\"name\":\"" + name + '\"' +
                ",\"fields\":" + fieldStr.toString() +
                '}';
    }
}

With it, the schema string can be built with the following code, keeping the clutter in business code to a minimum:

        String userOptionSchema = new MySchema.Builder()
                .type(MySchema.RECORD_TYPE)
                .name("user_option8")
                .field("allianceId","long")
                .field("affairId","long")
                .field("userId","int")
                .field("opType","string")
                .field("beOperatedRoleId","string")
                .field("attrs","{\"type\": \"map\", \"values\":\"string\"}")
                .build()
                .toString();

        Schema.Parser parser = new Schema.Parser();
        Schema schema = parser.parse(userOptionSchema);
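
As a quick sanity check (illustrative; not part of the original code), the parsed schema can be verified to be a record, which is exactly what the Hive AvroSerDe complained about in the original error:

        // Hive's AvroSerDe requires a RECORD schema, so fail fast if the
        // builder produced anything else (hypothetical check, not in the post).
        if (schema.getType() != Schema.Type.RECORD) {
            throw new IllegalStateException("Generated schema is not a record: " + schema.getType());
        }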