作用:transform 操作可以對 DStream 中每個 batch 的 RDD 應用任意的 RDD-to-RDD 函數,從而可以和其它的 RDD 進行 join 操作。
Scala版本:
package cn.spark.study.streaming
import org.apache.spark.SparkConf
import org.apache.spark.streaming.StreamingContext
import org.apache.spark.streaming.Seconds
/**
 * Demonstrates `DStream.transform`: every batch RDD of a socket text stream is
 * left-outer-joined against a static filter RDD, and records whose key is
 * flagged `true` in that filter RDD are removed from the stream.
 *
 * Expected input: one record per line in the form `<id> <key>` (separated by a
 * single space), e.g. `id1 key1`.
 */
object TransformTest {

  /**
   * Builds the streaming pipeline, starts it and blocks until termination.
   *
   * @param args command-line arguments (unused)
   */
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf()
      .setMaster("local[2]")
      .setAppName("Transform")
    val ssc = new StreamingContext(conf, Seconds(2))

    // Keys listed here with `true` are filtered out of the stream.
    val filterArray = Array(("key1", true))
    // Static RDD (5 partitions) that every batch is joined against.
    val filterRDD = ssc.sparkContext.parallelize(filterArray, 5)

    // Socket text stream used as the data source.
    val logDStream = ssc.socketTextStream("test", 8897)

    // Key every line by its second field so it can be joined with filterRDD.
    // Lines without a second field are dropped here; indexing them blindly
    // would throw ArrayIndexOutOfBoundsException and fail the batch.
    val tmpLogDStream = logDStream
      .map(clickLog => (clickLog.split(" "), clickLog))
      .filter { case (fields, _) => fields.length > 1 }
      .map { case (fields, clickLog) => (fields(1), clickLog) }

    val validAdsClickLogDStream = tmpLogDStream.transform { clickLogRDD =>
      clickLogRDD
        // Left outer join keeps every log line; the right side is Some(flag)
        // only for keys that are present in filterRDD.
        .leftOuterJoin(filterRDD)
        // Keep a record unless its key is flagged `true` in filterRDD.
        .filter { case (_, (_, flagged)) => !flagged.getOrElse(false) }
        // Strip the join artefacts and return the original log line.
        .map { case (_, (clickLog, _)) => clickLog }
    }

    // Print the surviving records of each batch.
    validAdsClickLogDStream.print()

    ssc.start()
    ssc.awaitTermination()
  }
}
Java版本:
package cn.spark.study.streaming;
import java.util.ArrayList;
import java.util.List;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.function.Function;
import org.apache.spark.api.java.function.PairFunction;
import org.apache.spark.streaming.Durations;
import org.apache.spark.streaming.api.java.JavaDStream;
import org.apache.spark.streaming.api.java.JavaPairDStream;
import org.apache.spark.streaming.api.java.JavaReceiverInputDStream;
import org.apache.spark.streaming.api.java.JavaStreamingContext;
import com.google.common.base.Optional;
import scala.Tuple2;
public class TransformBlacklist {
@SuppressWarnings("deprecation")
public static void main(String[] args) {
SparkConf conf = new SparkConf()
.setMaster("local[2]")
.setAppName("TransformBlacklist");
JavaStreamingContext jssc = new JavaStreamingContext(conf, Durations.seconds(5));
// 先做一份模擬的黑名單RDD
List<Tuple2<String, Boolean>> filterList = new ArrayList<Tuple2<String, Boolean>>();
filterList.add(new Tuple2<String, Boolean>("tom", true));
final JavaPairRDD<String, Boolean> filterRDD = jssc.sc().parallelizePairs(blacklist);
JavaReceiverInputDStream<String> clickLogDStream = jssc.socketTextStream("test", 8897);
JavaPairDStream<String, String> tmpClickLogDStream = clickLogDStream.mapToPair(
new PairFunction<String, String, String>() {
private static final long serialVersionUID = 1L;
@Override
public Tuple2<String, String> call(String tmpClickLog)
throws Exception {
return new Tuple2<String, String>(
tmpClickLog.split(" ")[1], tmpClickLog); // 轉換爲key 輸入數據-->的形式,爲後續join做準備
}
});
// 執行transform操作了,將每個batch的RDD,filterRDD進行join、filter、map等操作,實時進行過濾
JavaDStream<String> validClickLogDStream = tmpClickLogDStream.transform(
new Function<JavaPairRDD<String,String>, JavaRDD<String>>() {
private static final long serialVersionUID = 1L;
@Override
public JavaRDD<String> call(JavaPairRDD<String, String> clickLogRDD)
throws Exception {
JavaPairRDD<String, Tuple2<String, Optional<Boolean>>> joinedRDD =
clickLogRDD.leftOuterJoin(filterRDD);
JavaPairRDD<String, Tuple2<String, Optional<Boolean>>> resRDD =
joinedRDD.filter(
new Function<Tuple2<String,
Tuple2<String,Optional<Boolean>>>, Boolean>() {
private static final long serialVersionUID = 1L;
@Override
public Boolean call(
Tuple2<String,
Tuple2<String, Optional<Boolean>>> tuple)
throws Exception {
if(tuple._2._2().isPresent() &&
tuple._2._2.get()) {
return false;
}
return true;
}
});
// 此時,filteredRDD中,就只剩下沒有被黑名單過濾的用戶點擊了
// 進行map操作,轉換成我們想要的格式
JavaRDD<String> resClickLogRDD = resRDD.map(
new Function<Tuple2<String,Tuple2<String,Optional<Boolean>>>, String>() {
private static final long serialVersionUID = 1L;
@Override
public String call(
Tuple2<String, Tuple2<String, Optional<Boolean>>> tuple)
throws Exception {
return tuple._2._1;
}
});
return validResClickLogRDD;
}
});
validClickLogDStream.print();
jssc.start();
jssc.awaitTermination();
jssc.close();
}
}