1、準備
環境:Spark 2.3.1 + Mongo 4.2.3 + Scala 2.11.8 + JDK 1.8.0 + Hadoop 2.7.3
業務:將 json 數據 使用 spark 讀出來,然後寫入 mongo
2、輸入數據
json 格式
{
"name": "BeJson",
"url": "http://www.bejson.com",
"page": 88,
"isNonProfit": true,
"address": {
"street": "科技園路.",
"city": "江蘇蘇州",
"country": "中國"
},
"links": [
{
"name": "Google",
"url": "http://www.google.com"
},
{
"name": "Baidu",
"url": "http://www.baidu.com"
},
{
"name": "SoSo",
"url": "http://www.SoSo.com"
}
]
}
3、代碼
package cn.secrank.fdp.sinan.spark.apps
import com.mongodb.spark.MongoSpark
import org.apache.spark.sql.{SaveMode, SparkSession}
import org.slf4j.LoggerFactory
object WriteJsonToMongoProcessing {

  // Fixed: the original referenced an undefined identifier `v`
  // (`LoggerFactory.getLogger(v.getClass)`), which does not compile.
  private lazy val logger = LoggerFactory.getLogger(this.getClass)

  /**
   * Reads a JSON file with Spark and writes its rows into a MongoDB
   * collection via the MongoDB Spark Connector.
   *
   * Expects no command-line arguments; any argument is treated as a
   * usage error and aborts the program.
   *
   * @param args command-line arguments (must be empty)
   */
  def main(args: Array[String]): Unit = {
    logger.warn(s"@@@@@@ 輸入的參數長度是:[${args.length}]")
    if (args.length > 0) {
      System.err.println("@@@@@@ 參數輸入有誤,請重新輸入!")
      // Fixed: exit with a non-zero status on the error path (was 0,
      // which signals success to the caller / scheduler).
      System.exit(1)
    }

    // Placeholders — replace with your actual database/collection/host.
    val database = "your_database"
    val collection = "your_collection"
    val host = "127.0.0.1"
    val outPutUri = s"mongodb://$host/"
    // NOTE(review): likely a placeholder typo for "your_data.json".
    val inputPath = "you_data.json"

    // `spark.mongodb.output.*` keys configure the connector's write path;
    // maxBatchSize caps the number of documents per bulk-write batch.
    val spark = SparkSession
      .builder()
      .master("local[2]")
      .config("spark.mongodb.output.uri", outPutUri)
      .config("spark.mongodb.output.database", database)
      .config("spark.mongodb.output.collection", collection)
      .config("spark.mongodb.output.maxBatchSize", 1024)
      .appName("MongoSparkConnectorIntro")
      .getOrCreate()

    // Spark infers the schema from the JSON file.
    val df = spark
      .read
      .json(inputPath)

    // Overwrite replaces the target collection's contents on each run.
    MongoSpark.save(
      df
        .write
        .mode(SaveMode.Overwrite)
    )

    spark.stop()
  }
}
4、查詢
$ mongo
> use your_database
> db.getCollection('your_collection').find({}).count()
5、參考文檔
- option:https://docs.mongodb.com/spark-connector/master/configuration/#spark-output-conf
- dataset操作:https://docs.mongodb.com/spark-connector/master/scala/datasets-and-sql/