代碼
import org.apache.kafka.clients.consumer.ConsumerConfig
import org.apache.kafka.common.serialization.StringDeserializer
import org.apache.spark.sql.SparkSession
import org.apache.spark.streaming.{Seconds, StreamingContext}
import org.apache.spark.streaming.dstream.DStream
import org.apache.spark.streaming.kafka010.{ConsumerStrategies, KafkaUtils, LocationStrategies}
import org.elasticsearch.spark.rdd.EsSpark
object StreamingWriteES {

  /**
   * Entry point: consumes string messages from a Kafka topic with Spark
   * Streaming (direct stream, kafka-0-10 connector) and writes each record
   * to Elasticsearch as JSON via `EsSpark.saveJsonToEs`.
   */
  def main(args: Array[String]): Unit = {
    // Checkpoint path for offset recovery (currently disabled).
    // val checkpointPath = "D:\\hadoop\\checkpoint\\kafka-direct"
    val session = SparkSession.builder()
      //.master("local[*]")
      .appName("kafkaStreamingWriteES")
      .config("xpack.security.user", "elastic:elastic123") // x-pack security credentials (user:password)
      .config("es.net.http.auth.user", "elastic")          // ES username
      .config("es.net.http.auth.pass", "elastic123")       // ES password
      .getOrCreate()
    session.sparkContext.setLogLevel("WARN")

    val ssc = new StreamingContext(session.sparkContext, Seconds(1))
    // ssc.checkpoint(checkpointPath)

    val bootstrapServers = "xxxx:9092,xxxx:9092,xxxx:9092"
    val groupId = "test-consumer-group3"
    val topicName = "ecar-photo-gps"
    val maxPoll = 1000

    // Explicit Map[String, Object] matches the type ConsumerStrategies.Subscribe expects.
    val kafkaParams = Map[String, Object](
      ConsumerConfig.BOOTSTRAP_SERVERS_CONFIG -> bootstrapServers,
      ConsumerConfig.GROUP_ID_CONFIG -> groupId,
      ConsumerConfig.MAX_POLL_RECORDS_CONFIG -> maxPoll.toString,
      ConsumerConfig.KEY_DESERIALIZER_CLASS_CONFIG -> classOf[StringDeserializer],
      ConsumerConfig.VALUE_DESERIALIZER_CLASS_CONFIG -> classOf[StringDeserializer]
    )

    // FIX: use PreferConsistent instead of PreferBrokers. PreferBrokers is only
    // correct when Spark executors run on the same hosts as the Kafka brokers;
    // PreferConsistent distributes partitions evenly across available executors
    // (see the Spark Streaming + Kafka 0.10 integration guide).
    val kafkaTopicDS = KafkaUtils.createDirectStream(
      ssc,
      LocationStrategies.PreferConsistent,
      ConsumerStrategies.Subscribe[String, String](Set(topicName), kafkaParams)
    )

    // NOTE(review): splitting on spaces will corrupt any JSON payload that
    // contains spaces — confirm the topic's messages are space-free JSON docs.
    val dStream: DStream[String] = kafkaTopicDS.map(_.value)
      .flatMap(_.split(" "))

    val esConf = Map(
      // "es.nodes.wan.only" -> "true",
      "es.nodes" -> "xxx",               // single-node ES host
      "es.port" -> "9200",
      "es.resource" -> "test/ecar_data"  // target ES index/type
    )

    dStream.foreachRDD { rdd =>
      // Skip empty micro-batches so we don't open ES connections for no data.
      if (!rdd.isEmpty()) {
        EsSpark.saveJsonToEs(rdd, esConf)
      }
    }

    ssc.start()
    ssc.awaitTermination()
  }
}
Maven 依賴 (pom.xml):
<dependencies>
    <!-- Spark Streaming dependency -->
    <dependency>
        <groupId>org.apache.spark</groupId>
        <artifactId>spark-streaming_2.11</artifactId>
        <version>2.1.1</version>
        <scope>provided</scope>
    </dependency>
    <!-- Spark Streaming <-> Kafka (0.10 client) integration dependency -->
    <dependency>
        <groupId>org.apache.spark</groupId>
        <artifactId>spark-streaming-kafka-0-10_2.11</artifactId>
        <version>2.1.1</version>
        <scope>provided</scope>
    </dependency>
    <dependency>
        <groupId>org.apache.spark</groupId>
        <artifactId>spark-core_2.11</artifactId>
        <version>2.1.1</version>
        <scope>provided</scope>
    </dependency>
    <dependency>
        <groupId>org.apache.spark</groupId>
        <artifactId>spark-sql_2.11</artifactId>
        <version>2.1.1</version>
        <scope>provided</scope>
    </dependency>
    <!-- https://mvnrepository.com/artifact/org.apache.httpcomponents/httpclient -->
    <dependency>
        <groupId>org.apache.httpcomponents</groupId>
        <artifactId>httpclient</artifactId>
        <version>4.4.1</version>
    </dependency>
    <!-- Required at runtime when running from an assembled jar; without it the
         job fails with NoClassDefFoundError: org/apache/commons/httpclient/... -->
    <dependency>
        <groupId>commons-httpclient</groupId>
        <artifactId>commons-httpclient</artifactId>
        <version>3.1</version>
    </dependency>
    <dependency>
        <groupId>org.apache.httpcomponents</groupId>
        <artifactId>httpmime</artifactId>
        <version>4.3.6</version>
    </dependency>
    <dependency>
        <groupId>org.scala-lang</groupId>
        <artifactId>scala-library</artifactId>
        <version>2.11.8</version>
        <scope>provided</scope>
    </dependency>
    <dependency>
        <groupId>org.elasticsearch</groupId>
        <artifactId>elasticsearch</artifactId>
        <version>7.7.0</version>
        <scope>provided</scope>
    </dependency>
    <!-- NOTE: keep only elasticsearch-hadoop; adding elasticsearch-spark-20
         alongside it causes a duplicate-artifact conflict (see error 2 below). -->
    <dependency>
        <groupId>org.elasticsearch</groupId>
        <artifactId>elasticsearch-hadoop</artifactId>
        <version>7.7.0</version>
        <scope>provided</scope>
    </dependency>
    <dependency>
        <groupId>org.elasticsearch.client</groupId>
        <artifactId>transport</artifactId>
        <version>7.7.0</version>
        <scope>provided</scope>
    </dependency>
    <dependency>
        <groupId>com.alibaba</groupId>
        <artifactId>fastjson</artifactId>
        <version>1.2.56</version>
    </dependency>
    <dependency>
        <groupId>org.apache.commons</groupId>
        <artifactId>commons-pool2</artifactId>
        <version>2.4.2</version>
    </dependency>
    <!-- Newly added -->
    <dependency>
        <groupId>org.postgresql</groupId>
        <artifactId>postgresql</artifactId>
        <version>42.1.4</version>
    </dependency>
    <dependency>
        <groupId>org.apache.commons</groupId>
        <artifactId>commons-configuration2</artifactId>
        <version>2.2</version>
    </dependency>
    <dependency>
        <groupId>log4j</groupId>
        <artifactId>log4j</artifactId>
        <version>1.2.17</version>
    </dependency>
    <dependency>
        <groupId>org.slf4j</groupId>
        <artifactId>jcl-over-slf4j</artifactId>
        <version>1.7.22</version>
    </dependency>
    <dependency>
        <groupId>org.slf4j</groupId>
        <artifactId>slf4j-api</artifactId>
        <version>1.7.22</version>
    </dependency>
    <dependency>
        <groupId>org.slf4j</groupId>
        <artifactId>slf4j-log4j12</artifactId>
        <version>1.7.22</version>
    </dependency>
    <!-- end of dependencies -->
</dependencies>
錯誤1:
Exception in thread "streaming-job-executor-0" java.lang.NoClassDefFoundError: org/apache/commons/httpclient/protocol/ProtocolSocketFactory
解決辦法:
去這個網址裏面找相對應的依賴:https://mvnrepository.com/
錯誤2:
報什麼elasticsearch-spark-20_2.11依賴和elasticsearch-hadoop依賴同時存在
解決辦法:
只留elasticsearch-hadoop依賴就可以了
錯誤3:
{
"error" : "Content-Type header [application/x-www-form-urlencoded] is not supported",
"status" : 406
}
解決辦法:
此原因是公司安裝ES時開啓了x-pack安全機制,進行嚴格的內容類型檢查,嚴格檢查內容類型也可以作爲防止跨站點請求僞造攻擊的一層保護。
.config("xpack.security.user","elastic:elastic123")//x-pack認證安全機制的賬號密碼
使用命令行進行讀寫的時候不用設置,如下:指定一個json文件傳輸到es中
curl -H "Content-Type: application/json" -XPOST localhost:9200/people/shakespeare/_bulk?pretty --data-binary @test.json