用過MongoDB的小夥伴或多或少都會遇到MongoDB自帶group的限制問題,即group不支持大於2萬的結果集!
我曾在任職公司做過某網站小區均價數據抓取,採用的是Groovy技術。數據抓取下來後存放在MongoDB中。數據抓取完成後都是需要經歷數據清洗處理的,其中有一項數據去重的時候就遇到了MongoDB自帶group的限制問題,即group不支持大於2萬的結果集。幾番折騰後來採用MongoDB mapReduce來處理。
下面是數據去重的部分核心代碼,已做脫敏處理!
package com.script.thirdPriceCrawl
import com.script.util.MongoUtils
import com.mongodb.BasicDBObject
import com.mongodb.DB
import com.mongodb.DBCollection
import com.mongodb.DBObject
import com.mongodb.MapReduceOutput
import org.apache.commons.lang.StringUtils
import java.util.concurrent.ExecutorService
import java.util.concurrent.Executors
import java.util.concurrent.TimeUnit
/**
* 網絡爬蟲:
* 某網站-小區均價數據抓取<br>
* http://www.xxx.com/
*
* 解析完成後通過MapReduce統計/清理重複數據工具類<br>
* 在清洗數據腳本執行之前調用
*
* @author 小輝哥/小輝GE
* 2019年9月5日 21:22:12
*
*/
class ThirdPartPriceDataRepeatBatchUtils {

    // Names of the MapReduce output collections for the two group-by passes.
    def static final THIRD_PARTY_PRICE_DATAREPEAT_TEMP = "third_party_price_repeat_data_temp"
    def static final THIRD_PARTY_PRICE_DATAPRICEREPEAT_TEMP = "third_party_price_repeat_dataprice_temp"

    // Worker pool; created lazily per run (see initThreadPool) so idle threads
    // do not hold memory just because the class was loaded.
    def static ExecutorService fixedThreadPoolQuery

    // Separator used when composing the composite dedup key string.
    def static final CONNECT_CHAR = "#####"

    /**
     * (Re)creates the worker pool for each run, so that pool threads are not
     * kept alive merely because the class has been loaded.
     */
    def static initThreadPool(){
        fixedThreadPoolQuery = Executors.newFixedThreadPool(50)
    }

    /**
     * Finds duplicated crawled rows for the given month and dispatches each
     * duplicated group to a worker for price-level inspection.
     *
     * Group key: s_date + source + city + region + name.
     * A full run took ~277 minutes in testing; budget about five hours.
     * Must run after parsing completes and before the cleaning script starts.
     *
     * @param sdate          month filter value matched against the s_date field
     * @param thirdPartyColl source collection holding the crawled price rows
     */
    def static findThirdPriceDataRepeatBatch(sdate, DBCollection thirdPartyColl){
        try {
            println("findThirdPriceDataRepeatBatch 處理重複數據開始")
            long start = System.currentTimeMillis()
            // Restrict the MapReduce to the requested month.
            DBObject query = new BasicDBObject().append("s_date", sdate)
            String mapfun =
                    "function(){" +
                        "emit({s_date:this.s_date, source:this.source, city:this.city, region:this.region, name:this.name}, 1);" +
                    "};"
            // BUGFIX: MongoDB may re-invoke reduce with already-reduced partial
            // values ("re-reduce"), so counting with values.length undercounts.
            // The reduce function must sum the emitted/partial counts instead.
            String reducefun =
                    "function(key, values){" +
                        "return Array.sum(values);" +
                    "};"
            MapReduceOutput mapReduce = thirdPartyColl.mapReduce(mapfun, reducefun, THIRD_PARTY_PRICE_DATAREPEAT_TEMP, query)
            if(mapReduce!=null && mapReduce.results().size()>0){
                // Create the pool only once we know the MapReduce succeeded,
                // so a failed MapReduce never leaves an unused pool behind.
                initThreadPool()
                mapReduce.results().each { DBObject o ->
                    try{
                        // Only groups that occur more than once need checking.
                        if(o.value > 1) {
                            fixedThreadPoolQuery.execute(
                                new Runnable() {
                                    void run() {
                                        try {
                                            println("調用findThirdPriceDataRepeatByPriceBatch, 傳入的obj對象爲: "+o.toString())
                                            findThirdPriceDataRepeatByPriceBatch((int)(o._id.s_date), o._id.source, o._id.city, o._id.region, o._id.name, thirdPartyColl)
                                        } catch (Exception e) {
                                            println "調用findThirdPriceDataRepeatByPriceBatch -> DBObject對象爲"+o.toString()+"發生異常, 異常信息爲:" + e.getMessage()
                                        }
                                    }
                                }
                            )
                        }
                    } catch (Exception e) {
                        println("findThirdPriceDataRepeat --> mapReduce.results().each { DBObject o -> DBObject對象爲"+o.toString()+", 時發生異常, 異常信息爲" + e.getLocalizedMessage())
                    }
                }
                fixedThreadPoolQuery.shutdown()
                fixedThreadPoolQuery.awaitTermination(2, TimeUnit.DAYS)
                long end = System.currentTimeMillis()
                println "findThirdPriceDataRepeatBatch 處理重複數據完成,耗時:" + (end - start)
            }
        } catch (InterruptedException ie) {
            // BUGFIX: do not swallow interruption — restore the interrupt flag
            // so callers (or the container) can observe the cancellation.
            Thread.currentThread().interrupt()
            println("findThirdPriceDataRepeatBatch 等待線程池結束時被中斷, 異常信息爲" + ie.getLocalizedMessage())
        } catch (Exception e) {
            println("findThirdPriceDataRepeatBatch(sdate, DBCollection thirdPartyColl){發生異常, 異常信息爲" + e.getLocalizedMessage())
        }
    }

    /**
     * Second-level check for one duplicated group: re-groups by
     * s_date + source + city + region + name + avg_price. If the same
     * composite key (without price) shows up for more than one distinct
     * price, the group holds conflicting rows and is deleted.
     *
     * Runs concurrently on up to 50 pool threads.
     *
     * @param sdate  month value of the group being inspected
     * @param source crawl source identifier
     * @param city   city name
     * @param region region/district name
     * @param name   residential-community name
     * @param thirdPartyColl source collection holding the crawled price rows
     */
    def static findThirdPriceDataRepeatByPriceBatch(sdate, source, city, region, name, DBCollection thirdPartyColl){
        // After grouping by price, a group is "duplicated" not by its count but
        // by the same sdate+source+city+region+name key appearing under more
        // than one distinct avg_price in the MapReduce results.
        def dataRepeatMap = [:]
        MapReduceOutput mapReduce = null
        try{
            DBObject query = new BasicDBObject().append("s_date", sdate)
                    .append("source",source).append("city", city).append("region", region).append("name", name)
            String mapfun =
                    "function(){" +
                        "emit({s_date:this.s_date, source:this.source, city:this.city, region:this.region, name:this.name, avg_price:this.avg_price}, 1);" +
                    "};"
            // BUGFIX: sum instead of values.length — see findThirdPriceDataRepeatBatch.
            String reducefun =
                    "function(key, values){" +
                        "return Array.sum(values);" +
                    "};"
            // BUGFIX: this method runs on 50 threads at once; writing every
            // MapReduce result into the same shared temp collection lets
            // concurrent runs clobber each other's output. Use a per-thread
            // output collection name and drop it when done.
            def outColl = THIRD_PARTY_PRICE_DATAPRICEREPEAT_TEMP + "_" + Thread.currentThread().getId()
            mapReduce = thirdPartyColl.mapReduce(mapfun, reducefun, outColl, query)
            if(mapReduce!=null && mapReduce.results().size()>0){
                mapReduce.results().each { DBObject o ->
                    try{
                        def sd = (int) o._id.s_date
                        def keys = sd + CONNECT_CHAR + o._id.source + CONNECT_CHAR + o._id.city + CONNECT_CHAR + o._id.region + CONNECT_CHAR + o._id.name
                        if (!dataRepeatMap.containsKey(keys)) {
                            println("dataRepeatMap 中不含有 keys:" + keys + "加入dataRepeatMap")
                            dataRepeatMap.put(keys, keys)
                        } else {
                            // Same key under a second distinct price: conflicting data.
                            println("dataRepeatMap 中含有 keys:" + keys + "執行刪除方法, 對象爲: " + o.toString())
                            deleteThirdPriceDataBatch((int) (o._id.s_date), o._id.source, o._id.city, o._id.region, o._id.name, thirdPartyColl)
                        }
                    } catch (Exception e) {
                        println("調用deleteThirdPriceDataBatch DBObject對象爲"+o.toString()+", 時發生異常, 異常信息爲" + e.getLocalizedMessage())
                    }
                }
            }
        }catch (Exception e) {
            println("findThirdPriceDataRepeatByPriceBatch(sdate, source, city, region, name, DBCollection thirdPartyColl){發生異常, 異常信息爲" + e.getLocalizedMessage())
        } finally {
            // Clean up this thread's private temp output collection.
            mapReduce?.drop()
        }
    }

    /**
     * Deletes every document matching the composite group key.
     *
     * NOTE(review): this removes ALL rows of the group (every price variant),
     * not "all but one" — groups with conflicting prices lose all their data.
     * That appears intentional for this cleaning pass, but confirm.
     *
     * @param sdate  month value of the group
     * @param source crawl source identifier
     * @param city   city name
     * @param region region/district name
     * @param name   residential-community name
     * @param thirdPartyColl collection to delete from
     */
    def static deleteThirdPriceDataBatch(sdate, source, city, region, name, DBCollection thirdPartyColl){
        DBObject obj = new BasicDBObject()
        obj.put("s_date", sdate)
        obj.put("source", source)
        obj.put("city", city)
        obj.put("region", region)
        obj.put("name", name)
        thirdPartyColl.remove(obj)
    }
}
輸出結果在這裏就不展示了!!!
以上代碼僅供參考,如有不當之處,歡迎指出!!!
更多幹貨,歡迎大家關注和聯繫我。期待和大家一起更好的交流、探討技術!!!