Fixing a Kafka HDFS Sink Connector Error When Integrating with Hive

Kafka Connect runs in distributed mode on three machines: 192.168.1.204, 192.168.1.100, and 192.168.1.200. Port 18083 is used for submitting connectors to the Kafka Connect worker processes (the default port 8083 was already taken).
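
For reference, the non-default REST port is set in the distributed worker configuration. A minimal sketch, assuming the workers are started from etc/kafka/connect-distributed.properties under the Confluent installation (the file path and every value here other than rest.port are assumptions, not taken from the actual cluster):

# hypothetical excerpt of the distributed worker config
bootstrap.servers=192.168.1.204:9092,192.168.1.100:9092,192.168.1.200:9092
group.id=connect-cluster
# REST API on 18083 instead of the default 8083
rest.port=18083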

When using the Confluent HDFS Sink Connector to load data from Kafka into HDFS and then into a Hive table, the connector was configured as follows:

{
        "name":"dev_hdfs-sink",
        "config":{
                "connector.class":"io.confluent.connect.hdfs.HdfsSinkConnector",
                "tasks.max":"1",
                "topics":"user_option5",
                "hdfs.url":"hdfs://192.168.1.204:14000",
                "flush.size":"3",
                "hive.integration":true,
                "hive.database":"test_db",
                "hive.metastore.uris":"thrift://192.168.1.204:9083",
                "schema.compatibility":"BACKWARD"
        }
}

Running a select against the Hive table produced the following error:

12: jdbc:hive2://localhost:10000> select * from user_option5;
19:41:29.901 [main] DEBUG org.apache.thrift.transport.TSaslTransport - writing data length: 133
19:41:29.958 [main] DEBUG org.apache.thrift.transport.TSaslTransport - CLIENT: reading data length: 4594
Error: Error while compiling statement: FAILED: RuntimeException MetaException(message:org.apache.hadoop.hive.serde2.avro.AvroSerdeException Schema for table must be of type RECORD. Received type: STRING) (state=42000,code=40000)

Here is the CREATE TABLE statement of the Hive table that the connector created automatically:

12: jdbc:hive2://localhost:10000> show create table user_option5;
+--------------------------------------------------------------------+--+
|                           createtab_stmt                           |
+--------------------------------------------------------------------+--+
| CREATE EXTERNAL TABLE `user_option5`(                              |
| )                                                                  |
| PARTITIONED BY (                                                   |
|   `partition` string COMMENT '')                                   |
| ROW FORMAT SERDE                                                   |
|   'org.apache.hadoop.hive.serde2.avro.AvroSerDe'                   |
| STORED AS INPUTFORMAT                                              |
|   'org.apache.hadoop.hive.ql.io.avro.AvroContainerInputFormat'     |
| OUTPUTFORMAT                                                       |
|   'org.apache.hadoop.hive.ql.io.avro.AvroContainerOutputFormat'    |
| LOCATION                                                           |
|   'hdfs://192.168.1.204:14000/topics/user_option5'                 |
| TBLPROPERTIES (                                                    |
|   'avro.schema.literal'='{"type":"string","connect.version":1}',   |
|   'transient_lastDdlTime'='1533469246')                            |
+--------------------------------------------------------------------+--+

As you can see, the type in avro.schema.literal is string, when it should be record.

The error occurred because the KafkaProducer did not attach any schema information to the messages it sent. Specifying the schema in the producer code fixes the problem:

package com.superid.kafka.producer;

import io.confluent.kafka.serializers.KafkaAvroSerializerConfig;
import org.apache.avro.Schema;
import org.apache.avro.generic.GenericData;
import org.apache.avro.generic.GenericRecord;
import org.apache.kafka.clients.producer.KafkaProducer;
import org.apache.kafka.clients.producer.ProducerConfig;
import org.apache.kafka.clients.producer.ProducerRecord;

import java.util.HashMap;
import java.util.Map;
import java.util.Properties;
import java.util.stream.IntStream;

/**
 * @author 
 * @create: 2018-08-01 17:55
 */
public class SimuKafkaProducer {

    public static void main(String[] args) {
        Properties props = new Properties();

        props.put(ProducerConfig.BOOTSTRAP_SERVERS_CONFIG, "192.168.1.204:9092,192.168.1.100:9092,192.168.1.200:9092");
        props.put("acks", "all");
        props.put("retries", Integer.MAX_VALUE);
        props.put("batch.size", 16384);
        props.put("linger.ms", 1);
        props.put("buffer.memory", 33554432);
        props.put(ProducerConfig.KEY_SERIALIZER_CLASS_CONFIG, io.confluent.kafka.serializers.KafkaAvroSerializer.class);
        props.put(ProducerConfig.VALUE_SERIALIZER_CLASS_CONFIG, io.confluent.kafka.serializers.KafkaAvroSerializer.class);

        // Schema Registry location.
        props.put(KafkaAvroSerializerConfig.SCHEMA_REGISTRY_URL_CONFIG,
                "http://192.168.1.204:18081,http://192.168.1.100:18081,http://192.168.1.200:18081");

        KafkaProducer<Object, Object> producer = new KafkaProducer<>(props);
        // The message schema has to be spelled out explicitly
        String userOptionSchema = "{\"type\":\"record\",\"name\":\"user_option8\"," +
                "\"fields\":[" +
                "{\"name\":\"allianceId\",\"type\":\"long\"}," +
                "{\"name\":\"affairId\",\"type\":\"long\"},"+
                "{\"name\":\"userId\",\"type\":\"int\"},"+
                "{\"name\":\"opType\",\"type\":\"string\"},"+
                "{\"name\":\"beOperatedRoleId\",\"type\":\"string\"},"+
                "{\"name\":\"attrs\",\"type\":{\"type\": \"map\", \"values\":\"string\"}}"+
                "]}";

        Schema.Parser parser = new Schema.Parser();
        Schema schema = parser.parse(userOptionSchema);


        try {
            IntStream.range(1, 100).forEach(index -> {
                Map<String, Object> map = new HashMap<>();
                map.put("name", "pilaf");
                map.put("age", index + 20+"");

                GenericRecord avroRecord = new GenericData.Record(schema);
                avroRecord.put("allianceId", 11L);
                avroRecord.put("affairId",23L);
                avroRecord.put("userId",12);
                avroRecord.put("opType","c");
                avroRecord.put("beOperatedRoleId","str111");
                avroRecord.put("attrs",map);
                // The ProducerRecord that is sent must wrap the avroRecord
                producer.send(new ProducerRecord<>("user_option8", index, avroRecord));

            });
        } catch (Exception e) {
            // may need to do something with it
            e.printStackTrace();
        } finally {
            producer.flush();
            producer.close();
        }

    }
}
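
Once the producer has run, the schema it registered can also be inspected through the Schema Registry REST API. A quick check, assuming the default subject naming strategy (<topic>-value):

curl http://192.168.1.204:18081/subjects/user_option8-value/versions/latest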

Delete the old connector by running the following from a Linux shell:

curl -X DELETE localhost:18083/connectors/dev_hdfs-sink
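
To confirm that the connector is really gone, the worker's REST API can list the connectors that remain:

curl localhost:18083/connectors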

Reconfigure the connector (the configuration is stored in the file dev_hdfs-sink.json; note that it now points at a different topic):

{
        "name":"dev_hdfs-sink",
        "config":{
                "connector.class":"io.confluent.connect.hdfs.HdfsSinkConnector",
                "tasks.max":"1",
                "topics":"user_option8",
                "hdfs.url":"hdfs://192.168.1.204:14000",
                "flush.size":"3",
                "hive.integration":true,
                "hive.database":"test_db",
                "hive.metastore.uris":"thrift://192.168.1.204:9083",
                "schema.compatibility":"BACKWARD"
        }
}

Then create the connector again:

cd  /home/tidb/confluent-4.1.1/etc/kafka-connect-hdfs
curl -X POST -H "Content-Type: application/json" --data @dev_hdfs-sink.json http://192.168.1.204:18083/connectors
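
Right after submitting it, the connector and task state can be checked as well (optional, but handy when a task fails silently):

curl http://192.168.1.204:18083/connectors/dev_hdfs-sink/status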

After the connector is started, run the producer program to write some data into the Kafka topic user_option8. The messages are automatically written to HDFS and loaded into the Hive table (the table is created automatically; no manual intervention is needed).
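
The files can also be verified directly on HDFS. By default the HDFS connector writes under /topics/<topic>/partition=<kafka partition>/, which matches the partition column seen in the Hive table; a check along these lines (the exact directory layout depends on the partitioner configuration):

hdfs dfs -ls hdfs://192.168.1.204:14000/topics/user_option8/partition=0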

Now look again at the CREATE TABLE statement of the Hive table that the connector created automatically (show create table user_option8):

+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+--+
|                                                                                                                                                                                createtab_stmt                                                                                                                                                                                 |
+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+--+
| CREATE EXTERNAL TABLE `user_option8`(                                                                                                                                                                                                                                                                                                                                         |
|   `allianceid` bigint COMMENT '',                                                                                                                                                                                                                                                                                                                                             |
|   `affairid` bigint COMMENT '',                                                                                                                                                                                                                                                                                                                                               |
|   `userid` int COMMENT '',                                                                                                                                                                                                                                                                                                                                                    |
|   `optype` string COMMENT '',                                                                                                                                                                                                                                                                                                                                                 |
|   `beoperatedroleid` string COMMENT '',                                                                                                                                                                                                                                                                                                                                       |
|   `attrs` map<string,string> COMMENT '')                                                                                                                                                                                                                                                                                                                                      |
| PARTITIONED BY (                                                                                                                                                                                                                                                                                                                                                              |
|   `partition` string COMMENT '')                                                                                                                                                                                                                                                                                                                                              |
| ROW FORMAT SERDE                                                                                                                                                                                                                                                                                                                                                              |
|   'org.apache.hadoop.hive.serde2.avro.AvroSerDe'                                                                                                                                                                                                                                                                                                                              |
| STORED AS INPUTFORMAT                                                                                                                                                                                                                                                                                                                                                         |
|   'org.apache.hadoop.hive.ql.io.avro.AvroContainerInputFormat'                                                                                                                                                                                                                                                                                                                |
| OUTPUTFORMAT                                                                                                                                                                                                                                                                                                                                                                  |
|   'org.apache.hadoop.hive.ql.io.avro.AvroContainerOutputFormat'                                                                                                                                                                                                                                                                                                               |
| LOCATION                                                                                                                                                                                                                                                                                                                                                                      |
|   'hdfs://192.168.1.204:14000/topics/user_option8'                                                                                                                                                                                                                                                                                                                            |
| TBLPROPERTIES (                                                                                                                                                                                                                                                                                                                                                               |
|   'avro.schema.literal'='{"type":"record","name":"user_option8","fields":[{"name":"allianceId","type":"long"},{"name":"affairId","type":"long"},{"name":"userId","type":"int"},{"name":"opType","type":"string"},{"name":"beOperatedRoleId","type":"string"},{"name":"attrs","type":{"type":"map","values":"string"}}],"connect.version":1,"connect.name":"user_option8"}',   |
|   'transient_lastDdlTime'='1533525123')                                                                                                                                                                                                                                                                                                                                       |
+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+--+

As you can see, the avro.schema.literal in the CREATE TABLE statement now contains the full schema.

Querying the Hive table again now returns data:

+--------------------------+------------------------+----------------------+----------------------+--------------------------------+--------------------------------+-------------------------+--+
| user_option8.allianceid  | user_option8.affairid  | user_option8.userid  | user_option8.optype  | user_option8.beoperatedroleid  |       user_option8.attrs       | user_option8.partition  |
+--------------------------+------------------------+----------------------+----------------------+--------------------------------+--------------------------------+-------------------------+--+
| 11                       | 23                     | 12                   | c                    | str111                         | {"name":"pilaf","age":"22"}    | 0                       |
| 11                       | 23                     | 12                   | c                    | str111                         | {"name":"pilaf","age":"23"}    | 0                       |
| 11                       | 23                     | 12                   | c                    | str111                         | {"name":"pilaf","age":"24"}    | 0                       |
| 11                       | 23                     | 12                   | c                    | str111                         | {"name":"pilaf","age":"25"}    | 0                       |
| 11                       | 23                     | 12                   | c                    | str111                         | {"name":"pilaf","age":"26"}    | 0                       |
| 11                       | 23                     | 12                   | c                    | str111                         | {"name":"pilaf","age":"27"}    | 0                       |
+--------------------------+------------------------+----------------------+----------------------+--------------------------------+--------------------------------+-------------------------+--+

The schema string above is long and unpleasant to write by hand, so to make it easier to build I wrote a simple wrapper:

package com.superid.entity;

import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

/**
 * @author dufeng
 * @create: 2018-08-06 13:55
 */
public class MySchema {
    /**
     * Schema type to use for Hive tables
     */
    public static final String RECORD_TYPE = "record";

    /**
     * The schema type; record is what is normally used
     */
    private String type;
    /**
     * The schema name
     */
    private String name;
    /**
     * The fields of the schema
     */
    private List<Map<String, String>> fields = new ArrayList<>();

    /**
     * No public constructor; instances are created through the Builder
     */
    private MySchema(){

    }


    private MySchema(Builder builder) {
        type = builder.type;
        name = builder.name;
        fields = builder.fields;
    }


    public static final class Builder {
        private String type;
        private String name;
        private List<Map<String, String>> fields = new ArrayList<>();

        public Builder() {
        }

        public Builder type(String val) {
            type = val;
            return this;
        }

        public Builder name(String val) {
            name = val;
            return this;
        }

        public Builder fields(List<Map<String, String>> fieldMap) {
            fields = fieldMap;
            return this;
        }

        public Builder field(String name,String type) {
            type = type.toLowerCase();

            Map<String,String> aFieldMap = new HashMap<>();
            aFieldMap.put("name",name);
            aFieldMap.put("type",type);
            fields.add(aFieldMap);

            return this;
        }

        public Builder field(Map<String, String> map) {
            fields.add(map);
            return this;
        }

        public MySchema build() {
            return new MySchema(this);
        }
    }


    @Override
    public String toString() {
        StringBuilder fieldStr = new StringBuilder("[");
        for(Map<String,String> map:fields){
            fieldStr.append("{");
            fieldStr.append("\"name\":").append("\"").append(map.get("name")).append("\",");
            if(map.get("type").startsWith("{")){
                fieldStr.append("\"type\":").append(map.get("type"));
            }else {
                fieldStr.append("\"type\":").append("\"").append(map.get("type")).append("\"");
            }

            fieldStr.append("}");
            fieldStr.append(",");
        }

        // remove the trailing comma
        fieldStr.deleteCharAt(fieldStr.length()-1);

        fieldStr.append("]");

        return "{" +
                "\"type\":\"" + type + '\"' +
                ",\"name\":\"" + name + '\"' +
                ",\"fields\":" + fieldStr.toString() +
                '}';
    }
}

With it, the schema string can be built with the following code, keeping the clutter in business code to a minimum:

        String userOptionSchema = new MySchema.Builder()
                .type(MySchema.RECORD_TYPE)
                .name("user_option8")
                .field("allianceId","long")
                .field("affairId","long")
                .field("userId","int")
                .field("opType","string")
                .field("beOperatedRoleId","string")
                .field("attrs","{\"type\": \"map\", \"values\":\"string\"}")
                .build()
                .toString();

        Schema.Parser parser = new Schema.Parser();
        Schema schema = parser.parse(userOptionSchema);
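
As a quick sanity check (illustrative; not part of the original code), the parsed schema can be verified to be a record, which is exactly what the Hive AvroSerDe complained about in the original error:

        // Hive's AvroSerDe requires a RECORD schema, so fail fast if the
        // builder produced anything else (hypothetical check, not in the post).
        if (schema.getType() != Schema.Type.RECORD) {
            throw new IllegalStateException("Generated schema is not a record: " + schema.getType());
        }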