streaming讀取kafka數據再保存到es7.7.0代碼及踩過的坑

代碼



import org.apache.kafka.clients.consumer.ConsumerConfig
import org.apache.kafka.common.serialization.StringDeserializer
import org.apache.spark.sql.SparkSession
import org.apache.spark.streaming.{Seconds, StreamingContext}
import org.apache.spark.streaming.dstream.DStream
import org.apache.spark.streaming.kafka010.{ConsumerStrategies, KafkaUtils, LocationStrategies}
import org.elasticsearch.spark.rdd.EsSpark



object StreamingWriteES {

  /**
   * Entry point: consumes string messages from Kafka via Spark Streaming
   * (direct stream, 1-second batches) and bulk-writes them to
   * Elasticsearch 7.7.0 as JSON documents via EsSpark.saveJsonToEs.
   */
  def main(args: Array[String]): Unit = {
    // Offset checkpoint path (currently disabled). NOTE(review): with no
    // checkpoint and no manual commitAsync, offset handling falls back to
    // the Kafka consumer's auto-commit, so records can be lost or replayed
    // on failure — confirm the delivery guarantee this job needs.
    // val checkpointPath = "D:\\hadoop\\checkpoint\\kafka-direct"

    val session = SparkSession.builder()
      //.master("local[*]")
      .appName("kafkaStreamingWriteES")
      .config("xpack.security.user", "elastic:elastic123") // x-pack security credentials (user:password)
      .config("es.net.http.auth.user", "elastic") // ES username
      .config("es.net.http.auth.pass", "elastic123") // ES password
      .getOrCreate()
    session.sparkContext.setLogLevel("WARN")

    // Micro-batch interval of 1 second.
    val ssc = new StreamingContext(session.sparkContext, Seconds(1))
    //  ssc.checkpoint(checkpointPath)

    val bootstrapServers = "xxxx:9092,xxxx:9092,xxxx:9092"
    val groupId = "test-consumer-group3"
    val topicName = "ecar-photo-gps"
    val maxPoll = 1000

    val kafkaParams = Map(
      ConsumerConfig.BOOTSTRAP_SERVERS_CONFIG -> bootstrapServers,
      ConsumerConfig.GROUP_ID_CONFIG -> groupId,
      ConsumerConfig.MAX_POLL_RECORDS_CONFIG -> maxPoll.toString,
      ConsumerConfig.KEY_DESERIALIZER_CLASS_CONFIG -> classOf[StringDeserializer],
      ConsumerConfig.VALUE_DESERIALIZER_CLASS_CONFIG -> classOf[StringDeserializer]
    )

    // FIX: use PreferConsistent instead of PreferBrokers. The Spark + Kafka
    // 0.10 integration guide recommends PreferConsistent (even partition
    // distribution across executors); PreferBrokers is only appropriate when
    // executors run on the same hosts as the Kafka brokers.
    val kafkaTopicDS = KafkaUtils.createDirectStream(ssc, LocationStrategies.PreferConsistent,
      ConsumerStrategies.Subscribe[String, String](Set(topicName), kafkaParams))

    // NOTE(review): splitting each record on spaces only works if a single
    // JSON document never contains a space; otherwise the fragments handed to
    // saveJsonToEs are invalid JSON and ES will reject them — confirm the
    // actual message format on this topic.
    val dStream: DStream[String] = kafkaTopicDS.map(_.value)
      .flatMap(_.split(" "))

    // elasticsearch-hadoop connector settings.
    val esConf = Map(
      // "es.nodes.wan.only" -> "true", // enable when ES is only reachable via a public/WAN address
      "es.nodes" -> "xxx", // single-node ES host
      "es.port" -> "9200",
      "es.resource" -> "test/ecar_data" // target index/type
    )

    // Write each micro-batch; records are expected to already be JSON strings.
    dStream.foreachRDD { rdd =>
      EsSpark.saveJsonToEs(rdd, esConf)
    }

    ssc.start()
    ssc.awaitTermination()

  }
}

maven

    <dependencies>
        <!-- Spark Streaming core dependency -->
        <dependency>
            <groupId>org.apache.spark</groupId>
            <artifactId>spark-streaming_2.11</artifactId>
            <version>2.1.1</version>
            <scope>provided</scope>
        </dependency>

        <!-- Spark Streaming + Kafka 0.10 integration -->
        <dependency>
            <groupId>org.apache.spark</groupId>
            <artifactId>spark-streaming-kafka-0-10_2.11</artifactId>
            <version>2.1.1</version>
            <scope>provided</scope>
        </dependency>

        <dependency>
            <groupId>org.apache.spark</groupId>
            <artifactId>spark-core_2.11</artifactId>
            <version>2.1.1</version>
            <scope>provided</scope>
        </dependency>

        <dependency>
            <groupId>org.apache.spark</groupId>
            <artifactId>spark-sql_2.11</artifactId>
            <version>2.1.1</version>
            <scope>provided</scope>
        </dependency>

        <!-- https://mvnrepository.com/artifact/org.apache.httpcomponents/httpclient -->
        <!-- NOTE(review): httpclient 4.4.1 and httpmime 4.3.6 below come from
             different HttpComponents releases; aligning the versions would be
             safer — confirm against the elasticsearch-hadoop requirements. -->
        <dependency>
            <groupId>org.apache.httpcomponents</groupId>
            <artifactId>httpclient</artifactId>
            <version>4.4.1</version>
        </dependency>

        <!-- Added to fix NoClassDefFoundError for
             org/apache/commons/httpclient/protocol/ProtocolSocketFactory
             when running the packaged jar (legacy commons-httpclient 3.x). -->
        <dependency>
            <groupId>commons-httpclient</groupId>
            <artifactId>commons-httpclient</artifactId>
            <version>3.1</version>
        </dependency>

        <dependency>
            <groupId>org.apache.httpcomponents</groupId>
            <artifactId>httpmime</artifactId>
            <version>4.3.6</version>
        </dependency>

        <dependency>
            <groupId>org.scala-lang</groupId>
            <artifactId>scala-library</artifactId>
            <version>2.11.8</version>
            <scope>provided</scope>
        </dependency>

        <dependency>
            <groupId>org.elasticsearch</groupId>
            <artifactId>elasticsearch</artifactId>
            <version>7.7.0</version>
            <scope>provided</scope>
        </dependency>

        <!-- elasticsearch-hadoop provides EsSpark used by the job.
             NOTE(review): "provided" scope means this jar must already be on
             the cluster classpath at runtime (e.g. via spark-submit \-\-jars);
             otherwise it must be bundled — confirm the deployment setup. -->
        <dependency>
            <groupId>org.elasticsearch</groupId>
            <artifactId>elasticsearch-hadoop</artifactId>
            <version>7.7.0</version>
            <scope>provided</scope>
        </dependency>

        <dependency>
            <groupId>org.elasticsearch.client</groupId>
            <artifactId>transport</artifactId>
            <version>7.7.0</version>
            <scope>provided</scope>
        </dependency>

        <dependency>
            <groupId>com.alibaba</groupId>
            <artifactId>fastjson</artifactId>
            <version>1.2.56</version>
        </dependency>

        <dependency>
            <groupId>org.apache.commons</groupId>
            <artifactId>commons-pool2</artifactId>
            <version>2.4.2</version>
        </dependency>

        <!-- Newly added -->
        <dependency>
            <groupId>org.postgresql</groupId>
            <artifactId>postgresql</artifactId>
            <version>42.1.4</version>
        </dependency>
        <dependency>
            <groupId>org.apache.commons</groupId>
            <artifactId>commons-configuration2</artifactId>
            <version>2.2</version>
        </dependency>
        <!-- Logging stack: log4j 1.2 with slf4j bindings -->
        <dependency>
            <groupId>log4j</groupId>
            <artifactId>log4j</artifactId>
            <version>1.2.17</version>
        </dependency>
        <dependency>
            <groupId>org.slf4j</groupId>
            <artifactId>jcl-over-slf4j</artifactId>
            <version>1.7.22</version>
        </dependency>
        <dependency>
            <groupId>org.slf4j</groupId>
            <artifactId>slf4j-api</artifactId>
            <version>1.7.22</version>
        </dependency>
        <dependency>
            <groupId>org.slf4j</groupId>
            <artifactId>slf4j-log4j12</artifactId>
            <version>1.7.22</version>
        </dependency>

        <!-- End -->
    </dependencies>

錯誤1:

Exception in thread "streaming-job-executor-0" java.lang.NoClassDefFoundError: org/apache/commons/httpclient/protocol/ProtocolSocketFactory

解決辦法:

缺少的 ProtocolSocketFactory 類屬於舊版 commons-httpclient(3.x)套件,在 pom 中補上 commons-httpclient:commons-httpclient:3.1 依賴即可(見上方 pom);相關依賴座標可到 https://mvnrepository.com/ 查找。

錯誤2:

報什麼elasticsearch-spark-20_2.11依賴和elasticsearch-hadoop依賴同時存在

解決辦法:

只留elasticsearch-hadoop依賴就可以了

錯誤3:

{
  "error" : "Content-Type header [application/x-www-form-urlencoded] is not supported",
  "status" : 406
}

解決辦法:

此原因是公司安裝ES時開啓了x-pack安全機制,進行嚴格的內容類型檢查,嚴格檢查內容類型也可以作爲防止跨站點請求僞造攻擊的一層保護。

 .config("xpack.security.user","elastic:elastic123")//x-pack認證安全機制的賬號密碼

 

使用命令行進行讀寫的時候不用設置,如下:指定一個json文件傳輸到es中

curl -H "Content-Type: application/json"  -XPOST localhost:9200/people/shakespeare/_bulk?pretty  --data-binary @test.json
(注意:--data-binary 後的檔名必須加 @ 前綴,curl 才會讀取檔案內容;不加 @ 只會送出字面字串 "test.json"。)

 

 

 

發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
相關文章