1. Prerequisites
Environment: Spark 2.3.1 + MongoDB 4.2.3 + Scala 2.11.8 + JDK 1.8.0 + Hadoop 2.7.3
Task: read JSON data with Spark, then write it into MongoDB.
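The code below relies on the official MongoDB Spark Connector. A minimal sketch of the build dependency, assuming sbt as the build tool (the connector's 2.3.x line matches Spark 2.3.1 and Scala 2.11; the project file itself is hypothetical):

// build.sbt (hypothetical; versions chosen to match the environment above)
scalaVersion := "2.11.8"

libraryDependencies ++= Seq(
  "org.apache.spark" %% "spark-sql" % "2.3.1" % "provided",
  "org.mongodb.spark" %% "mongo-spark-connector" % "2.3.1"
)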
2. Input Data
The input file contains the following JSON:
{
  "name": "BeJson",
  "url": "http://www.bejson.com",
  "page": 88,
  "isNonProfit": true,
  "address": {
    "street": "科技园路.",
    "city": "江苏苏州",
    "country": "中国"
  },
  "links": [
    {
      "name": "Google",
      "url": "http://www.google.com"
    },
    {
      "name": "Baidu",
      "url": "http://www.baidu.com"
    },
    {
      "name": "SoSo",
      "url": "http://www.SoSo.com"
    }
  ]
}
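Note that this sample is a single pretty-printed object spanning many lines, while spark.read.json by default expects one JSON object per line (JSON Lines), so the reader needs the multiLine option (available since Spark 2.2). As a sketch, the schema Spark infers from the object above would look like this:

val df = spark.read.option("multiLine", value = true).json("your_data.json")
df.printSchema()
// root
//  |-- address: struct (nullable = true)
//  |    |-- city: string (nullable = true)
//  |    |-- country: string (nullable = true)
//  |    |-- street: string (nullable = true)
//  |-- isNonProfit: boolean (nullable = true)
//  |-- links: array (nullable = true)
//  |    |-- element: struct (containsNull = true)
//  |    |    |-- name: string (nullable = true)
//  |    |    |-- url: string (nullable = true)
//  |-- name: string (nullable = true)
//  |-- page: long (nullable = true)
//  |-- url: string (nullable = true)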
3. Code
package cn.secrank.fdp.sinan.spark.apps
import com.mongodb.spark.MongoSpark
import org.apache.spark.sql.{SaveMode, SparkSession}
import org.slf4j.LoggerFactory
object WriteJsonToMongoProcessing {
  private lazy val logger = LoggerFactory.getLogger(this.getClass)

  def main(args: Array[String]): Unit = {
    logger.warn(s"@@@@@@ Number of input arguments: [${args.length}]")
    // Every setting is hardcoded below, so this job takes no arguments.
    if (args.length > 0) {
      System.err.println("@@@@@@ Unexpected arguments supplied; this job takes none.")
      System.exit(1)
    }
    val database = "your_database"
    val collection = "your_collection"
    val host = "127.0.0.1"
    val outputUri = s"mongodb://$host/"
    val inputPath = "your_data.json"
    val spark = SparkSession
      .builder()
      .master("local[2]")
      .config("spark.mongodb.output.uri", outputUri)
      .config("spark.mongodb.output.database", database)
      .config("spark.mongodb.output.collection", collection)
      // Cap how many documents the connector sends per bulk write.
      .config("spark.mongodb.output.maxBatchSize", 1024)
      .appName("MongoSparkConnectorIntro")
      .getOrCreate()
    // The sample input is one pretty-printed object spanning several lines,
    // so enable multiLine; by default spark.read.json expects JSON Lines
    // (one object per line). Drop the option for JSON Lines input.
    val df = spark
      .read
      .option("multiLine", value = true)
      .json(inputPath)

    // Hand the configured writer to the connector; SaveMode.Overwrite
    // replaces the target collection's contents.
    MongoSpark.save(
      df
        .write
        .mode(SaveMode.Overwrite)
    )
    spark.stop()
  }
}
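With the class above packaged into a jar, the job can be submitted roughly as follows. This is only a sketch: the jar name is hypothetical, and --packages fetches the connector at submit time (assuming it is not already bundled into the jar):

$ spark-submit \
    --class cn.secrank.fdp.sinan.spark.apps.WriteJsonToMongoProcessing \
    --master local[2] \
    --packages org.mongodb.spark:mongo-spark-connector_2.11:2.3.1 \
    write-json-to-mongo.jar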
4. Verifying the Result
$ mongo
> use your_database
> db.getCollection('your_collection').find({}).count()
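Alternatively, the count can be checked from Spark itself by reading the collection back through the connector. A minimal sketch, assuming the same host, database, and collection as the writer job above:

import com.mongodb.spark.MongoSpark
import org.apache.spark.sql.SparkSession

val spark = SparkSession
  .builder()
  .master("local[2]")
  .config("spark.mongodb.input.uri", "mongodb://127.0.0.1/")
  .config("spark.mongodb.input.database", "your_database")
  .config("spark.mongodb.input.collection", "your_collection")
  .appName("MongoReadBackCheck")
  .getOrCreate()

// Load the collection as a DataFrame and count its documents.
val readBack = MongoSpark.load(spark)
println(s"Documents in collection: ${readBack.count()}")
spark.stop()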
5. References
- Output configuration options: https://docs.mongodb.com/spark-connector/master/configuration/#spark-output-conf
- Dataset operations: https://docs.mongodb.com/spark-connector/master/scala/datasets-and-sql/