package com.scala.test;

import org.apache.log4j.Level;
import org.apache.log4j.Logger;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.function.Function;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SparkSession;

public class SparkDataSetTest {

    public static void main(String[] args) {
        // Silence Spark's verbose INFO logging.
        Logger.getLogger("org").setLevel(Level.ERROR);

        SparkSession spark = SparkSession.builder()
                .master("local[*]")
                .appName("Spark")
                .getOrCreate();

        // Read the source file line by line and map each CSV line to a Student bean.
        JavaRDD<String> source = spark.read()
                .textFile("E:\\scala\\workspace\\test_files\\student.txt")
                .javaRDD();

        JavaRDD<Student> rowRDD = source.map(new Function<String, Student>() {
            @Override
            public Student call(String line) throws Exception {
                String[] parts = line.split(",");
                Student stu = new Student();
                stu.setId(parts[0]);
                stu.setName(parts[1]);
                stu.setAge(Integer.valueOf(parts[2]));
                System.out.println(stu);
                return stu;
            }
        });

        // Build a DataFrame from the bean RDD; the schema is derived from
        // Student via reflection.
        Dataset<Row> df = spark.createDataFrame(rowRDD, Student.class);
        df.select("id", "name", "age").show();

        // Convert back to an RDD of generic Rows and read fields by name.
        JavaRDD<Row> rowRDD2 = spark.createDataFrame(rowRDD, Student.class).toJavaRDD();
        rowRDD2.foreach(k -> System.out.println("age:" + k.getAs("age").toString()));

        // Keep only rows whose id compares lexicographically >= "2".
        JavaRDD<Row> res = rowRDD2.filter(k -> k.getAs("id").toString().compareTo("2") >= 0);
        System.out.println("after filter:" + res.count());

        System.out.println("df2");
        // createDataFrame(rdd, Student.class) expects an RDD of Student beans,
        // not of Rows, so this would fail at runtime and stays commented out:
        // Dataset<Row> df2 = spark.createDataFrame(rowRDD2, Student.class);
        // df2.select("id", "name", "age").show();

        spark.stop();
    }
}
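The example depends on a Student JavaBean that is not shown. A minimal sketch is below, assuming the field names implied by the setters used above; Spark's bean reflection needs the public getters/setters, and the bean should be Serializable since it is shipped to executors.

package com.scala.test;

import java.io.Serializable;

// Minimal Student bean assumed by the example above (not from the original
// source): createDataFrame(rowRDD, Student.class) derives the columns
// age/id/name from these accessors.
public class Student implements Serializable {

    private String id;
    private String name;
    private Integer age;

    public String getId() { return id; }
    public void setId(String id) { this.id = id; }

    public String getName() { return name; }
    public void setName(String name) { this.name = name; }

    public Integer getAge() { return age; }
    public void setAge(Integer age) { this.age = age; }

    @Override
    public String toString() {
        return "Student{id=" + id + ", name=" + name + ", age=" + age + "}";
    }
}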
student.txt
1,2,3
2,3,4
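With this two-line input, and assuming the Student bean sketched above, the select(...).show() step should print roughly the following (column order follows the select, not the bean's alphabetical order); the filter then drops the id=1 row, since "1".compareTo("2") < 0, so the count is 1:

+---+----+---+
| id|name|age|
+---+----+---+
|  1|   2|  3|
|  2|   3|  4|
+---+----+---+
after filter:1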