Spark batch writes to Redis

Recently at work, while consolidating data, I needed to push more than 1 billion records into Redis. The ops team set up a Redis cluster with 100 masters and 100 replicas (8 GB per node). Since the source data lives in Hive tables, I decided to do the initial bulk load with Spark.
I. Dependency JARs

compile group: 'com.redislabs', name: 'spark-redis', version: '2.3.0'
compile group: 'redis.clients', name: 'jedis', version: '2.9.0'
compile group: 'org.apache.commons', name: 'commons-pool2', version: '2.0'

I manage dependencies with Gradle; if you use Maven, you can look up the same coordinates on the Maven repository site.
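
For Maven users, the equivalent coordinates should look roughly like the sketch below (translated directly from the Gradle lines above; double-check the artifact metadata in the repository):

<dependency>
    <groupId>com.redislabs</groupId>
    <artifactId>spark-redis</artifactId>
    <version>2.3.0</version>
</dependency>
<dependency>
    <groupId>redis.clients</groupId>
    <artifactId>jedis</artifactId>
    <version>2.9.0</version>
</dependency>
<dependency>
    <groupId>org.apache.commons</groupId>
    <artifactId>commons-pool2</artifactId>
    <version>2.0</version>
</dependency>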

II. Test cases
1. Methods

sc.toRedisKV() stores plain key/value strings.
Implementation:
def toRedisKV(kvs: RDD[(String, String)], ttl: Int = 0)
               (implicit redisConfig: RedisConfig = new RedisConfig(new RedisEndpoint(sc.getConf))) {
    kvs.foreachPartition(partition => setKVs(partition, ttl, redisConfig))
  }
 /**
    * @param arr k/vs which should be saved in the target host
    *            save all the k/vs to the target host
    * @param ttl time to live
    */
  def setKVs(arr: Iterator[(String, String)], ttl: Int, redisConfig: RedisConfig) {
    arr.map(kv => (redisConfig.getHost(kv._1), kv)).toArray.groupBy(_._1).
      mapValues(a => a.map(p => p._2)).foreach {
      x => {
        val conn = x._1.endpoint.connect()
        val pipeline = conn.pipelined
        if (ttl <= 0) {
          x._2.foreach(x => pipeline.set(x._1, x._2))
        }
        else {
          x._2.foreach(x => pipeline.setex(x._1, ttl, x._2))
        }
        pipeline.sync
        conn.close
      }
    }
  }
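
As the default argument in the signature above suggests, when no RedisConfig is passed explicitly the connector builds one from the spark.redis.* settings in the SparkConf. A minimal usage sketch (the app name, host, sample keys, and TTL are just illustrative):

import com.redislabs.provider.redis._
import org.apache.spark.{SparkConf, SparkContext}

// spark.redis.* settings feed the implicit RedisConfig used by toRedisKV
val conf = new SparkConf().setAppName("redis-kv-demo")
  .set("spark.redis.host", "localhost")
  .set("spark.redis.port", "6379")
val sc = new SparkContext(conf)

// Each tuple is written with SET (or SETEX when ttl > 0), pipelined per target node
val kvRdd = sc.parallelize(Seq(("user:1", "v1"), ("user:2", "v2")))
sc.toRedisKV(kvRdd, ttl = 3600)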


sc.toRedisHASH() stores a hash (a map of field/value pairs).
/**
    * @param kvs      Pair RDD of K/V
    * @param hashName target hash's name which hold all the kvs
    * @param ttl time to live
    */
  def toRedisHASH(kvs: RDD[(String, String)], hashName: String, ttl: Int = 0)
                 (implicit redisConfig: RedisConfig = new RedisConfig(new RedisEndpoint(sc.getConf))) {
    kvs.foreachPartition(partition => setHash(hashName, partition, ttl, redisConfig))
  }
 /**
    * @param hashName
    * @param arr k/vs which should be saved in the target host
    *            save all the k/vs to hashName (a hash) on the target host
    * @param ttl time to live
    */
  def setHash(hashName: String, arr: Iterator[(String, String)], ttl: Int, redisConfig: RedisConfig) {
    val conn = redisConfig.connectionForKey(hashName)
    val pipeline = conn.pipelined
    arr.foreach(x => pipeline.hset(hashName, x._1, x._2))
    if (ttl > 0) pipeline.expire(hashName, ttl)
    pipeline.sync
    conn.close
  }
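
A corresponding sketch for toRedisHASH (the hash name and fields are illustrative). Note from connectionForKey(hashName) above that the entire hash is written to the single node that owns hashName, so one very large hash does not spread across the cluster:

// All fields go into the single hash "user:profile:1001"; the TTL applies to the whole key
val hashRdd = sc.parallelize(Seq(("age", "30"), ("city", "Beijing")))
sc.toRedisHASH(hashRdd, "user:profile:1001", ttl = 86400)
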
// To be covered later:
sc.toRedisFixedLIST()
sc.toRedisLIST()
sc.toRedisSET()
sc.toRedisZSET()

2. Test

// spark-redis implicit (adds toRedisKV to SparkContext)
import com.redislabs.provider.redis._
import org.apache.spark.sql.functions.to_timestamp
import com.alibaba.fastjson.JSONObject // assumption: fastjson, which provides toJSONString
import spark.implicits._

val activeRiskTableName = "hm_service_risk.test_active_risk_base_db"
val today = DateUtil.format(DateUtil.getDayBegin, "yyyyMMdd")
val dataFrame = spark.read.table(activeRiskTableName).
  filter($"stat_date".equalTo(today)).
  select("id", "update_time", "risk").
  withColumn("update_time", to_timestamp($"update_time", "yyyy-MM-dd HH:mm:ss"))

val redisHost = "localhost"
val redisPort: Int = 6379
val redisAuth: String = ""
val prefix = "active_risk:" // hypothetical key prefix; use your own namespace
val redisDataRdd = dataFrame.map(row => {
  val id = row.getAs[String]("id")
  // update_time was cast to a timestamp above, so read it back as a Timestamp
  val updateTime = row.getAs[java.sql.Timestamp]("update_time")
  val risk = row.getAs[Int]("risk")
  val redisKey = prefix + AESUtils.aesEncrypt(id)
  val json = new JSONObject()
  json.put("update_time", updateTime.getTime) // store as epoch millis
  json.put("risk", risk)
  (redisKey, json.toJSONString)
}).rdd

val redisConfig = new RedisConfig(RedisEndpoint(redisHost, redisPort, redisAuth))
sc.toRedisKV(redisDataRdd)(redisConfig)
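
To spot-check the load, spark-redis also exposes read methods such as sc.fromRedisKV, which accepts a key pattern; a sketch assuming the prefix-based key layout used above:

// Sample a handful of the freshly written keys back from the cluster
val sample = sc.fromRedisKV(prefix + "*")(redisConfig).take(5)
sample.foreach { case (k, v) => println(s"$k -> $v") }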

This was a one-off task, so I haven't studied it in depth; I'll dig deeper and add more when I have time.
