代码
import org.apache.kafka.clients.consumer.ConsumerConfig
import org.apache.kafka.common.serialization.StringDeserializer
import org.apache.spark.sql.SparkSession
import org.apache.spark.streaming.{Seconds, StreamingContext}
import org.apache.spark.streaming.dstream.DStream
import org.apache.spark.streaming.kafka010.{ConsumerStrategies, KafkaUtils, LocationStrategies}
import org.elasticsearch.spark.rdd.EsSpark
object StreamingWriteES {

  /**
   * Entry point: consumes string messages from a Kafka topic with a direct
   * stream (1-second micro-batches) and bulk-writes them to Elasticsearch
   * as JSON documents via elasticsearch-hadoop's EsSpark.
   */
  def main(args: Array[String]): Unit = {
    // Offset save path (checkpointing is currently disabled).
    // val checkpointPath = "D:\\hadoop\\checkpoint\\kafka-direct"
    val session = SparkSession.builder()
      //.master("local[*]")
      .appName("kafkaStreamingWriteES")
      .config("xpack.security.user","elastic:elastic123") // x-pack security credentials (user:password)
      .config("es.net.http.auth.user", "elastic")          // ES username
      .config("es.net.http.auth.pass", "elastic123")       // ES password
      .getOrCreate()
    session.sparkContext.setLogLevel("WARN")

    val ssc = new StreamingContext(session.sparkContext, Seconds(1))
    // ssc.checkpoint(checkpointPath)

    val bootstrapServers = "xxxx:9092,xxxx:9092,xxxx:9092"
    val groupId = "test-consumer-group3"
    val topicName = "ecar-photo-gps" //
    val maxPoll = 1000

    val kafkaParams = Map(
      ConsumerConfig.BOOTSTRAP_SERVERS_CONFIG -> bootstrapServers,
      ConsumerConfig.GROUP_ID_CONFIG -> groupId,
      ConsumerConfig.MAX_POLL_RECORDS_CONFIG -> maxPoll.toString,
      ConsumerConfig.KEY_DESERIALIZER_CLASS_CONFIG -> classOf[StringDeserializer],
      ConsumerConfig.VALUE_DESERIALIZER_CLASS_CONFIG -> classOf[StringDeserializer]
    )

    // FIX: use PreferConsistent instead of PreferBrokers. PreferBrokers is only
    // correct when the Spark executors run on the same hosts as the Kafka
    // brokers; PreferConsistent distributes partitions evenly across executors
    // and is the recommended default.
    val kafkaTopicDS = KafkaUtils.createDirectStream(ssc, LocationStrategies.PreferConsistent,
      ConsumerStrategies.Subscribe[String, String](Set(topicName), kafkaParams))

    val dStream: DStream[String] = kafkaTopicDS.map(_.value)
      .flatMap(_.split(" ")) // NOTE(review): assumes each space-separated token is a standalone JSON doc — confirm against the producer's message format

    val esConf = Map(
      // "es.nodes.wan.only" -> "true",
      "es.nodes" -> "xxx", // single-node ES host
      "es.port" -> "9200",
      "es.resource" -> "test/ecar_data" // ES index/type
    )

    dStream.foreachRDD { rdd =>
      // FIX: skip empty micro-batches so we don't open ES connections
      // every second when no data arrived.
      if (!rdd.isEmpty()) {
        EsSpark.saveJsonToEs(rdd, esConf)
      }
    }

    ssc.start()
    ssc.awaitTermination()
  }
}
maven
<dependencies>
<!-- Spark Streaming dependency -->
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-streaming_2.11</artifactId>
<version>2.1.1</version>
<scope>provided</scope>
</dependency>
<!-- Spark Streaming <-> Kafka integration dependency -->
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-streaming-kafka-0-10_2.11</artifactId>
<version>2.1.1</version>
<scope>provided</scope>
</dependency>
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-core_2.11</artifactId>
<version>2.1.1</version>
<scope>provided</scope>
</dependency>
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-sql_2.11</artifactId>
<version>2.1.1</version>
<scope>provided</scope>
</dependency>
<!-- https://mvnrepository.com/artifact/org.apache.httpcomponents/httpclient -->
<dependency>
<groupId>org.apache.httpcomponents</groupId>
<artifactId>httpclient</artifactId>
<version>4.4.1</version>
</dependency>
<!-- Added because running the assembled jar failed with NoClassDefFoundError
     for org/apache/commons/httpclient/... (see error 1 below); including the
     legacy commons-httpclient artifact resolves it. -->
<dependency>
<groupId>commons-httpclient</groupId>
<artifactId>commons-httpclient</artifactId>
<version>3.1</version>
</dependency>
<dependency>
<groupId>org.apache.httpcomponents</groupId>
<artifactId>httpmime</artifactId>
<version>4.3.6</version>
</dependency>
<dependency>
<groupId>org.scala-lang</groupId>
<artifactId>scala-library</artifactId>
<version>2.11.8</version>
<scope>provided</scope>
</dependency>
<dependency>
<groupId>org.elasticsearch</groupId>
<artifactId>elasticsearch</artifactId>
<version>7.7.0</version>
<scope>provided</scope>
</dependency>
<!-- NOTE: do not add elasticsearch-spark-20_2.11 alongside this artifact;
     the two conflict when both are on the classpath (see error 2 below) —
     keep only elasticsearch-hadoop. -->
<dependency>
<groupId>org.elasticsearch</groupId>
<artifactId>elasticsearch-hadoop</artifactId>
<version>7.7.0</version>
<scope>provided</scope>
</dependency>
<dependency>
<groupId>org.elasticsearch.client</groupId>
<artifactId>transport</artifactId>
<version>7.7.0</version>
<scope>provided</scope>
</dependency>
<dependency>
<groupId>com.alibaba</groupId>
<artifactId>fastjson</artifactId>
<version>1.2.56</version>
</dependency>
<dependency>
<groupId>org.apache.commons</groupId>
<artifactId>commons-pool2</artifactId>
<version>2.4.2</version>
</dependency>
<!-- Newly added -->
<dependency>
<groupId>org.postgresql</groupId>
<artifactId>postgresql</artifactId>
<version>42.1.4</version>
</dependency>
<dependency>
<groupId>org.apache.commons</groupId>
<artifactId>commons-configuration2</artifactId>
<version>2.2</version>
</dependency>
<dependency>
<groupId>log4j</groupId>
<artifactId>log4j</artifactId>
<version>1.2.17</version>
</dependency>
<dependency>
<groupId>org.slf4j</groupId>
<artifactId>jcl-over-slf4j</artifactId>
<version>1.7.22</version>
</dependency>
<dependency>
<groupId>org.slf4j</groupId>
<artifactId>slf4j-api</artifactId>
<version>1.7.22</version>
</dependency>
<dependency>
<groupId>org.slf4j</groupId>
<artifactId>slf4j-log4j12</artifactId>
<version>1.7.22</version>
</dependency>
<!-- End -->
</dependencies>
错误1:
Exception in thread "streaming-job-executor-0" java.lang.NoClassDefFoundError: org/apache/commons/httpclient/protocol/ProtocolSocketFactory
解决办法:
去这个网址里面找相对应的依赖:https://mvnrepository.com/
错误2:
报什么elasticsearch-spark-20_2.11依赖和elasticsearch-hadoop依赖同时存在
解决办法:
只留elasticsearch-hadoop依赖就可以了
错误3:
{
"error" : "Content-Type header [application/x-www-form-urlencoded] is not supported",
"status" : 406
}
解决办法:
此原因是公司安装ES时开启了x-pack安全机制,进行严格的内容类型检查,严格检查内容类型也可以作为防止跨站点请求伪造攻击的一层保护。
.config("xpack.security.user","elastic:elastic123")//x-pack认证安全机制的账号密码
使用命令行进行读写的时候不用设置,如下:指定一个json文件传输到es中
curl -H "Content-Type: application/json" -XPOST localhost:9200/people/shakespeare/_bulk?pretty --data-binary @test.json