測試過程:
1.向虛擬機mini1的7777端口發送一個個的單詞信息
2.Streaming程序接收7777端口的數據,並做處理。
3.將DStream[(String, Int)]轉化成RDD寫入到mysql
程序如下:
package spark.SparkStreaming.file
import java.sql.DriverManager
import org.apache.spark.SparkContext
import org.apache.spark.sql.SparkSession
import org.apache.spark.streaming.{Seconds, StreamingContext}
object streaming2Mysql {
  def main(args: Array[String]): Unit = {
    // Build the SparkSession (local mode for testing).
    val spark: SparkSession = SparkSession.builder()
      .appName(streaming2Mysql.getClass.getSimpleName)
      .master("local[*]")
      .getOrCreate()
    val sc: SparkContext = spark.sparkContext
    // 2-second micro-batches.
    val ssc: StreamingContext = new StreamingContext(sc, Seconds(2))
    // updateStateByKey requires a checkpoint directory for state recovery.
    ssc.checkpoint("file:///C:\\ck")
    // Word-count pipeline: split socket lines into words and keep a
    // cumulative per-word total across batches, then persist each partition.
    ssc.socketTextStream("mini1", 7777)
      .flatMap(_.split("\\s+"))
      .filter(_.nonEmpty)
      .map((_, 1))
      .updateStateByKey((nowBatch: Seq[Int], historyResult: Option[Int]) =>
        Some(nowBatch.sum + historyResult.getOrElse(0)))
      .foreachRDD(rdd => {
        if (!rdd.isEmpty()) {
          rdd.foreachPartition(itr => {
            if (itr.nonEmpty) {
              itr.foreach { case (word, cnt) =>
                // NOTE: one connection per record — inefficient; the pooled
                // version later in this file is the optimized variant.
                save2DB(word, cnt)
              }
            }
          })
        }
      })
    // Start the streaming application (side-effecting call keeps its parens).
    ssc.start()
    // Block until the streaming context is stopped (mandatory).
    ssc.awaitTermination()
  }

  /**
   * Upsert a single (word, count) pair into MySQL: update the row if the
   * word already exists, otherwise insert it.
   *
   * All JDBC resources are released in finally blocks so nothing leaks even
   * when a statement throws (the original leaked the SELECT statement when
   * `ps` was reassigned, and leaked everything on exception).
   *
   * @param word the word (lookup key in tb_words)
   * @param cnt  the cumulative count to store
   */
  def save2DB(word: String, cnt: Int): Unit = {
    // Force-load the JDBC driver class.
    classOf[com.mysql.jdbc.Driver]
    val conn = DriverManager.getConnection("jdbc:mysql://localhost:3306/spark-study?useUnicode=true&characterEncoding=utf-8", "root", "root")
    try {
      // Probe whether the word already exists; close statement + result set promptly.
      val probe = conn.prepareStatement("select word from tb_words where word=?")
      val isExist =
        try {
          probe.setString(1, word)
          val rs = probe.executeQuery()
          try rs.next() finally rs.close()
        } finally {
          probe.close()
        }
      // Both statements take (cnt, word) in positions (1, 2).
      val sql =
        if (isExist) "update tb_words set cnt=? where word=?"
        else "insert into tb_words(cnt,word) values(?,?)"
      val ps = conn.prepareStatement(sql)
      try {
        ps.setInt(1, cnt)
        ps.setString(2, word)
        ps.executeUpdate()
      } finally {
        ps.close()
      }
    } finally {
      conn.close()
    }
  }
}
以上方式,每條數據都會與mysql數據庫建立一次連接,影響性能。
下面使用連接池的方式和優化sql語句 對上述過程進行優化。
DBCPUtils:DBCP連接池工具類(也可以使用其他的連接池 druid,c3p0等)
DBUtilsWayDemo:測試類
IWordDao:dao層接口
WordDaoImpl:dao層接口實現類
WordBean:數據實體類
連接池配置參數
DBCPUtils
import org.apache.commons.dbcp.BasicDataSourceFactory;
import javax.sql.DataSource;
import java.sql.Connection;
import java.sql.SQLException;
import java.util.Properties;
/**
 * DBCP connection-pool utility. The pool is built once, from
 * dbcp.properties on the classpath, when the class is first loaded.
 */
public class DBCPUtils {
// Shared pool instance, initialized in the static block below.
private static DataSource pool;
static {
// try-with-resources closes the properties stream (the original leaked it),
// and a missing file is reported explicitly instead of surfacing as an NPE.
try (java.io.InputStream in =
DBCPUtils.class.getClassLoader().getResourceAsStream("dbcp.properties")) {
if (in == null) {
throw new RuntimeException("dbcp.properties not found on classpath");
}
Properties properties = new Properties();
properties.load(in);
pool = BasicDataSourceFactory.createDataSource(properties);
} catch (Exception e) {
e.printStackTrace();
throw new RuntimeException("連接池獲取失敗");
}
}
/**
 * Borrow a connection from the pool.
 *
 * @return a pooled {@link Connection}; caller must close it to return it
 * @throws SQLException if the pool cannot supply a connection
 */
public static Connection getConnection() throws SQLException {
return pool.getConnection();
}
/**
 * Expose the pool itself (e.g. for DbUtils' QueryRunner).
 *
 * @return the shared {@link DataSource}
 */
public static DataSource getConnectionPool() {
return pool;
}
}
DBUtilsWayDemo
import java.util
import org.apache.spark.SparkContext
import org.apache.spark.sql.SparkSession
import org.apache.spark.streaming.{Seconds, StreamingContext}
import spark.streaming2Mysql.{IWordDao, WordBean, WordDaoImpl}
object DBUtilsWayDemo {
  def main(args: Array[String]): Unit = {
    // Build the SparkSession (local mode for testing).
    val spark: SparkSession = SparkSession.builder()
      .appName(this.getClass.getSimpleName)
      .master("local[*]")
      .getOrCreate()
    val sc: SparkContext = spark.sparkContext
    // 2-second micro-batches.
    val ssc: StreamingContext = new StreamingContext(sc, Seconds(2))
    // updateStateByKey requires a checkpoint directory for state recovery.
    ssc.checkpoint("file:///C:\\Users\\luoyunfan\\Desktop\\spark")
    // Same word-count pipeline as before, but each partition is written with
    // ONE pooled connection and ONE batched SQL statement instead of a
    // connection per record.
    ssc.socketTextStream("mini1", 7777)
      .flatMap(_.split("\\s+"))
      .filter(_.nonEmpty)
      .map((_, 1))
      .updateStateByKey((nowBatch: Seq[Int], historyResult: Option[Int]) =>
        Some(nowBatch.sum + historyResult.getOrElse(0)))
      .foreachRDD(rdd => {
        if (!rdd.isEmpty()) {
          rdd.foreachPartition(itr => {
            if (itr.nonEmpty) {
              // DAO is created per partition so it lives on the executor.
              val dao: IWordDao = new WordDaoImpl
              // java.util.List because the DAO's interface is Java.
              val container = new util.ArrayList[WordBean]()
              itr.foreach { case (word, cnt) =>
                container.add(new WordBean(word, cnt))
              }
              // Flush the whole partition in one batch.
              dao.batchDealWith2(container)
            }
          })
        }
      })
    // Start the streaming application (side-effecting call keeps its parens).
    ssc.start()
    // Block until the streaming context is stopped (mandatory).
    ssc.awaitTermination()
  }
}
IWordDao
package spark.streaming2Mysql;
import java.util.List;
/**
 * DAO contract for persisting word counts.
 */
public interface IWordDao {
/**
 * Batch upsert, optimized to use a single efficient SQL statement
 * instead of one round-trip per record.
 *
 * @param beans the (word, count) records to persist
 */
void batchDealWith2(List<WordBean> beans);
}
WordDaoImpl
import org.apache.commons.dbutils.QueryRunner;
import org.apache.commons.dbutils.handlers.BeanHandler;
import java.sql.SQLException;
import java.util.LinkedList;
import java.util.List;
/**
 * DbUtils-based implementation of {@link IWordDao} backed by the shared
 * DBCP connection pool.
 */
public class WordDaoImpl implements IWordDao {
// QueryRunner bound to the pool; it borrows/returns a connection per call.
private QueryRunner qr = new QueryRunner(DBCPUtils.getConnectionPool());
/**
 * Batch upsert: insert each word, or on a duplicate key update its count,
 * in a single batched round-trip.
 *
 * <p>Requires a unique/primary key on tb_words.word for ON DUPLICATE KEY
 * to take effect.
 *
 * @param beans records to persist; a null or empty list is a no-op
 */
public void batchDealWith2(List<WordBean> beans) {
if (beans == null || beans.isEmpty()) {
return; // nothing to write — avoid borrowing a connection for nothing
}
String sql = "insert into tb_words(word,cnt) values(?,?) on duplicate key update cnt=?";
Object[][] params = new Object[beans.size()][];
for (int i = 0; i < params.length; i++) {
WordBean bean = beans.get(i);
// cnt appears twice: once for the insert, once for the update branch.
params[i] = new Object[]{bean.getWord(), bean.getCnt(), bean.getCnt()};
}
try {
qr.batch(sql, params);
} catch (SQLException e) {
// Propagate instead of printStackTrace(): swallowing here silently
// loses the whole batch; failing the Spark task lets it be retried.
throw new RuntimeException("batch upsert of word counts failed", e);
}
}
}
WordBean
import lombok.Data;
import lombok.NoArgsConstructor;
/**
 * Value object pairing a word with its cumulative count.
 * Lombok generates the getters/setters/equals/hashCode (@Data) and a
 * no-arg constructor (@NoArgsConstructor).
 */
@NoArgsConstructor
@Data
public class WordBean {
// the word itself (lookup key in tb_words)
private String word;
// cumulative occurrence count for this word
private int cnt;
/**
 * @param word the word
 * @param cnt  its cumulative count
 */
public WordBean(String word, int cnt) {
this.word = word;
this.cnt = cnt;
}
}
pom中加入以下依賴:
commons-dbutils(批處理用)和 WordBean 中用到的 lombok(不用 lombok 也可以);注意 DBCPUtils 還依賴 commons-dbcp 連接池(下方未列出,請一併加入,例如 commons-dbcp:commons-dbcp:1.4)。
<dependency>
<groupId>commons-dbutils</groupId>
<artifactId>commons-dbutils</artifactId>
<version>1.7</version>
</dependency>
<!-- 使用註解的方式自動生成getter/setter訪問器 -->
<dependency>
<groupId>org.projectlombok</groupId>
<artifactId>lombok</artifactId>
<version>1.16.20</version>
</dependency>
測試:
1、先打開mini1的7777端口準備發送數據
2、運行DBUtilsWayDemo
3、查看數據庫數據
4、向端口發送幾條數據
5、查看數據庫信息