1、普通文本文件
sc.textFile("./dir/*.txt")
如果传递目录,则将目录下的所有文件读取作为RDD。文件路径支持通配符。
但是这样对于大量的小文件读取效率并不高,应该使用wholeTextFiles
def wholeTextFiles(path: String, minPartitions: Int = defaultMinPartitions): RDD[(String, String)]
返回值RDD[(String, String)],其中Key是文件的完整路径,Value是文件的内容。
2、JDBC
Spark支持通过Java JDBC访问关系型数据库。需要使用JdbcRDD
代码演示
import java.sql.{Connection, DriverManager, PreparedStatement} //ps.executeBatch() conn.close() } } |
3、 HadoopAPI
https://blog.csdn.net/leen0304/article/details/78854530
Spark的整个生态系统与Hadoop是完全兼容的,所以对于Hadoop所支持的文件类型或者数据库类型,Spark也同样支持。
HadoopRDD、newAPIHadoopRDD、saveAsHadoopFile、saveAsNewAPIHadoopFile 是底层API
其他的API接口都是为了方便最终的Spark程序开发者而设置的,是上述底层接口的高效实现版本。
4、SequenceFile文件
SequenceFile文件是Hadoop用来存储二进制形式的key-value对而设计的一种平面文件(Flat File)。
https://blog.csdn.net/bitcarmanlee/article/details/78111289
读sc.sequenceFile[ keyClass, valueClass](path)
写RDD.saveAsSequenceFile(path)
要求键和值能够自动转为Writable类型。
5、对象文件
对象文件是将对象序列化后保存的文件
读sc.objectFile[T](path) //因为是序列化所以要指定元素类型T,例如sc.objectFile[(Int, String)](path)
写RDD.saveAsObjectFile(path)
6、HBase
由于 org.apache.hadoop.hbase.mapreduce.TableInputFormat 类的实现,Spark 可以通过Hadoop输入格式访问HBase。
这个输入格式会返回键值对数据,
其中键的类型为org.apache.hadoop.hbase.io.ImmutableBytesWritable,
而值的类型为org.apache.hadoop.hbase.client.Result。
https://github.com/teeyog/blog/issues/22
7、扩展阅读
import org.apache.hadoop.io.{LongWritable, Text}
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat
import org.apache.spark.rdd.RDD
import org.apache.spark.{SparkConf, SparkContext}

/**
 * Walk-through of several Spark RDD data sources, run against a local master:
 *
 *  1. the new Hadoop API (`saveAsNewAPIHadoopFile` / `newAPIHadoopFile`) on HDFS,
 *  2. many small text files via `wholeTextFiles` (followed by a word count),
 *  3. SequenceFile round trip,
 *  4. object-file round trip.
 *
 * Every section writes to a fixed path and then reads the data back and prints it.
 */
object DataSourceTest {

  /**
   * Runs the four demos in order and always stops the SparkContext afterwards.
   *
   * @param args command-line arguments (unused)
   */
  def main(args: Array[String]): Unit = {
    // Set before the SparkContext is created: Hadoop is expected to resolve the
    // login user only once, so setting it after context start-up would likely be
    // ignored. NOTE(review): confirm against the Hadoop version in use.
    System.setProperty("HADOOP_USER_NAME", "root")

    val config = new SparkConf().setAppName("DataSourceTest").setMaster("local[*]")
    val sc = new SparkContext(config)
    try {
      sc.setLogLevel("WARN")
      hadoopApiDemo(sc)
      smallFilesDemo(sc)
      sequenceFileDemo(sc)
      objectFileDemo(sc)
    } finally {
      // Release the context even when one of the demos throws.
      sc.stop()
    }
  }

  /** 1. Writes three pairs to HDFS with the new Hadoop output API and prints the lines read back. */
  private def hadoopApiDemo(sc: SparkContext): Unit = {
    println("HadoopAPI")
    // NOTE(review): the RDD holds (Int, String) pairs while the declared key/value
    // classes are LongWritable/Text; this presumably relies on the text output
    // format stringifying each record -- confirm.
    val dataRDD = sc.parallelize(Array((1, "hadoop"), (2, "hive"), (3, "spark")))
    dataRDD.saveAsNewAPIHadoopFile(
      "hdfs://node01:8020/spark_hadoop/",
      classOf[LongWritable],
      classOf[Text],
      classOf[TextOutputFormat[LongWritable, Text]])

    val inputRDD: RDD[(LongWritable, Text)] = sc.newAPIHadoopFile(
      "hdfs://node01:8020/spark_hadoop/*",
      classOf[TextInputFormat],
      classOf[LongWritable],
      classOf[Text],
      conf = sc.hadoopConfiguration)
    // Only the value (the line text) is printed; the key is dropped.
    inputRDD.map(_._2.toString).foreach(println)
  }

  /** 2. Reads a directory of small files as (path, content) pairs and prints a word count. */
  private def smallFilesDemo(sc: SparkContext): Unit = {
    println("读取小文件")
    val filesRDD: RDD[(String, String)] =
      sc.wholeTextFiles("D:\\data\\spark\\files", minPartitions = 3)
    // Accept both CRLF and LF line endings; splitting on "\r\n" alone would leave
    // LF-terminated files as a single "line".
    val linesRDD: RDD[String] = filesRDD.flatMap(_._2.split("\\r?\\n"))
    val wordsRDD: RDD[String] = linesRDD.flatMap(_.split(" "))
    wordsRDD.map((_, 1)).reduceByKey(_ + _).collect().foreach(println)
  }

  /** 3. Saves (Int, String) pairs as a SequenceFile, reads them back and prints them. */
  private def sequenceFileDemo(sc: SparkContext): Unit = {
    println("SequenceFile")
    val pairs: RDD[(Int, String)] =
      sc.parallelize(List((2, "aa"), (3, "bb"), (4, "cc"), (5, "dd"), (6, "ee")))
    pairs.saveAsSequenceFile("D:\\data\\spark\\SequenceFile")

    val restored: RDD[(Int, String)] =
      sc.sequenceFile[Int, String]("D:\\data\\spark\\SequenceFile\\*")
    restored.collect().foreach(println)
  }

  /** 4. Saves (Int, String) pairs as an object file, reads them back and prints them. */
  private def objectFileDemo(sc: SparkContext): Unit = {
    println("ObjectFile")
    val pairs = sc.parallelize(List((2, "aa"), (3, "bb"), (4, "cc"), (5, "dd"), (6, "ee")))
    pairs.saveAsObjectFile("D:\\data\\spark\\ObjectFile")

    // The element type must be given explicitly when reading the objects back.
    val restored = sc.objectFile[(Int, String)]("D:\\data\\spark\\ObjectFile\\*")
    restored.collect().foreach(println)
  }
}
import org.apache.hadoop.hbase.client.{HBaseAdmin, Put, Result}
import org.apache.hadoop.hbase.io.ImmutableBytesWritable
import org.apache.hadoop.hbase.{HBaseConfiguration, HColumnDescriptor, HTableDescriptor, TableName}
import org.apache.hadoop.hbase.mapred.TableOutputFormat
import org.apache.hadoop.hbase.mapreduce.TableInputFormat
import org.apache.hadoop.hbase.util.Bytes
import org.apache.hadoop.mapred.JobConf
import org.apache.spark.rdd.RDD
import org.apache.spark.{SparkConf, SparkContext}

/**
 * HBase round trip from Spark: (re)creates the `fruit` table with a single
 * `info` column family, writes three rows through the old-API
 * `TableOutputFormat`, then reads the table back through `TableInputFormat`
 * and prints the row count and every row.
 */
object DataSourceTest2 {

  // Table and column names shared by the write and read halves of the demo.
  private val FruitTableName = "fruit"
  private val InfoFamily = "info"
  private val NameQualifier = "name"
  private val PriceQualifier = "price"

  /**
   * Runs the demo; the HBase admin handle and the SparkContext are released
   * even when a step fails.
   *
   * @param args command-line arguments (unused)
   */
  def main(args: Array[String]): Unit = {
    val config = new SparkConf().setAppName("DataSourceTest").setMaster("local[*]")
    val sc = new SparkContext(config)
    try {
      sc.setLogLevel("WARN")

      val conf = HBaseConfiguration.create()
      conf.set("hbase.zookeeper.quorum", "node01:2181,node02:2181,node03:2181")

      val fruitTable = TableName.valueOf(FruitTableName)
      val tableDescr = new HTableDescriptor(fruitTable)
      // Bytes.toBytes is used everywhere (instead of String.getBytes) so that
      // the encoding does not depend on the platform default charset.
      tableDescr.addFamily(new HColumnDescriptor(Bytes.toBytes(InfoFamily)))

      // Start from an empty table: drop any existing one, then create it.
      val admin = new HBaseAdmin(conf)
      try {
        if (admin.tableExists(fruitTable)) {
          admin.disableTable(fruitTable)
          admin.deleteTable(fruitTable)
        }
        admin.createTable(tableDescr)
      } finally {
        // The admin handle was previously never closed.
        admin.close()
      }

      // Turns a (rowKey, name, price) triple into the (key, Put) pair expected
      // by TableOutputFormat; the row key is carried inside the Put itself.
      def convert(triple: (String, String, String)) = {
        val (rowKey, name, price) = triple
        val put = new Put(Bytes.toBytes(rowKey))
        put.addImmutable(Bytes.toBytes(InfoFamily), Bytes.toBytes(NameQualifier), Bytes.toBytes(name))
        put.addImmutable(Bytes.toBytes(InfoFamily), Bytes.toBytes(PriceQualifier), Bytes.toBytes(price))
        (new ImmutableBytesWritable, put)
      }

      val dataRDD: RDD[(String, String, String)] =
        sc.parallelize(List(("1", "apple", "11"), ("2", "banana", "12"), ("3", "pear", "13")))
      val targetRDD: RDD[(ImmutableBytesWritable, Put)] = dataRDD.map(convert)

      val jobConf = new JobConf(conf)
      jobConf.setOutputFormat(classOf[TableOutputFormat])
      jobConf.set(TableOutputFormat.OUTPUT_TABLE, FruitTableName)

      // Write the rows.
      targetRDD.saveAsHadoopDataset(jobConf)
      println("写入数据成功")

      // Read the rows back.
      conf.set(TableInputFormat.INPUT_TABLE, FruitTableName)
      val hbaseRDD: RDD[(ImmutableBytesWritable, Result)] = sc.newAPIHadoopRDD(
        conf,
        classOf[TableInputFormat],
        classOf[ImmutableBytesWritable],
        classOf[Result])

      val count: Long = hbaseRDD.count()
      println("hBaseRDD RDD Count:" + count)

      hbaseRDD.foreach { case (_, result) =>
        val key = Bytes.toString(result.getRow)
        val name = Bytes.toString(
          result.getValue(Bytes.toBytes(InfoFamily), Bytes.toBytes(NameQualifier)))
        val price = Bytes.toString(
          result.getValue(Bytes.toBytes(InfoFamily), Bytes.toBytes(PriceQualifier)))
        // The value comes from the "price" column, so it is labelled as such
        // (it was previously printed under the label "Color").
        println(s"Row key:$key Name:$name Price:$price")
      }
    } finally {
      // Release the context even when a step above throws.
      sc.stop()
    }
  }
}