Spark SQL解析複雜、嵌套、加密json大文件數據思路(java)


文件gz壓縮格式20G,解壓後200G左右,每行都是一個json字符串,大概500w行。每行json格式簡化後如下:

{
	"tel": "15311111111111",
	"mac": "33:D3:B3:43:83:03",
	"name": "zhongxing",
	"type1": "140102",
	"timestamp": "1572735114",
	"event": {
		"ET": "SGW_QUALITY_REPORT",
		"ED": "eyJ4eHhzdGF0dXMiOlt7IlRpbWUiOiIyMDE5LTExLTAzIDA2OjE2OjAwIiwicnVubmluZ1RpbWUiOjI0MDE4Nywic3RhdHVzMDEiOjIsInN0YXR1czAyIjo0MX1dLCJ4eHhpbmZvIjpbeyJUaW1lIjoiMjAxOS0xMS0wMyAwNjoxNjowMCIsInh4eERldiI6W3sieHh4RGV2MDEiOjEsInh4eERldjAyIjoieHh4LTAwMDAwIiwieHh4RGV2MDMiOiJ6ekctMSIsInh4eERldjA0Ijoi6Zi/6YeMIiwieHh4RGV2MDUiOiIyMTU0ODc4Nzg3IiwieHh4RGV2MDYiOjQsInh4eERldjA3Ijo0LCJ4eHhEZXYwOCI6MCwieHh4RGV2MDkiOiJXUEEvT1NUIiwieHh4RGV2MTAiOiJhLHQsYyIsInh4eERldjExIjoiNDAtYml0IiwieHh4RGV2MTIiOiJBbGwifSx7Inh4eERldjAxIjowLCJ4eHhEZXYwMiI6Ikd1ZXN0LXRlc3QiLCJ4eHhEZXYwMyI6Inp6Ry0yIiwieHh4RGV2MDQiOiJBUFAiLCJ4eHhEZXYwNSI6Ijc3Nzc3NzciLCJ4eHhEZXYwNiI6NCwieHh4RGV2MDciOjQsInh4eERldjA4IjowLCJ4eHhEZXYwOSI6IldQQS9PU1QiLCJ4eHhEZXYxMCI6ImEsdCxjIiwieHh4RGV2MTEiOiI0MC1iaXQiLCJ4eHhEZXYxMiI6Ik9TeXN0ZW0ifV19XSwieHh4cnh0eCI6W3siVGltZSI6IjIwMTktMTEtMDMgMDQ6NTU6MDgiLCJ4eHhMaXN0IjpbeyJ4eHhMaXN0MDEiOiIyNTY0NTg3OTQ1IiwieHh4TGlzdDAyIjoicGhvbmUiLCJ4eHhMaXN0MDMiOiI1RyIsInh4eExpc3QwNCI6IjAwQ0RONTY1QkdGIiwieHh4TGlzdDA1IjoiMTkyLjE2OC4wLjEyNCIsInh4eExpc3QwNiI6Inh455qE5omL5py6IiwieHh4TGlzdDA3IjoiaVBob25lIDZzIiwieHh4TGlzdDA4Ijoi6Iu55p6cIiwieHh4TGlzdDA5IjoieHh4UGhvbmUiLCJ4eHhMaXN0MTAiOjE1MjAwMDAsInh4eExpc3QxMSI6Mjc0MDAwMDAwfSx7Inh4eExpc3QwMSI6IjI1Njg3OTQxMTExIiwieHh4TGlzdDAyIjoicGhvbmUiLCJ4eHhMaXN0MDMiOiI1RyIsInh4eExpc3QwNCI6IjlDQkRGNDU2RkciLCJ4eHhMaXN0MDUiOiIxOTIuMTY4LjAuMTI1IiwieHh4TGlzdDA2IjoieHjnmoTmiYvmnLoiLCJ4eHhMaXN0MDciOiLljY7kuLpHOemdkuaYpeeJiCIsInh4eExpc3QwOCI6IuWNjuS4uiIsInh4eExpc3QwOSI6Inh4eF9Zb3V0aCIsInh4eExpc3QxMCI6MTU0NjAwMCwieHh4TGlzdDExIjoxMDI1ODAwMDB9LHsieHh4TGlzdDAxIjoiMTg5Nzk0NTYxMiIsInh4eExpc3QwMiI6IlNWQyIsInh4eExpc3QwMyI6IjEiLCJ4eHhMaXN0MDQiOiIyNEhEQlYyMzU2QkMiLCJ4eHhMaXN0MDUiOiIyNTUuMjU1LjI1NS4yNTUiLCJ4eHhMaXN0MDYiOiJ4eOeahOacuumhtuebkiIsInh4eExpc3QwNyI6Iue9keaYk+S6keebkiIsInh4eExpc3QwOCI6Iue9keaYkyIsInh4eExpc3QwOSI6ImFuZHJvaWQteHh4eCIsInh4eExpc3QxMCI6NTg5NzEwMDAwLCJ4eHhMaXN0MTEiOjE1Njg3OTAwMDAwfV19LHsiVGltZSI6IjIwMTktMTEtMDMgMDU6MDA6MDkiLCJ4eHhMaXN0IjpbeyJ4eHhMaXN0MDEiOiI1Njg5NzQ1NjY2NCIsInh4eExpc3QwMiI6InBob25lIiwieHh4TGlzdDAzIjoiM0ciLCJ4eHhMaXN0MDQiOiIwMDBDREI0ODJCQyIsInh4eExpc3QwNSI6IjE5Mi4xNjguMC4xMjYiLCJ4eHhMaXN0MDYiOiJ4eOeahOaJi+acuiIsInh4eExpc3QwNyI6ImlQaG9uZSA2cyIsInh4eExpc3QwOCI6IuiLueaenCIsInh4eExpc3QwOSI6Inh4eFBob25lIiwieHh4TGlzdDEwIjo1Njg5NzQwMCwieHh4TGlzdDExIjoyODg1NjEwMDAwfV19XX0=",
		"MAC": "33DA37488333",
		"CJ": "zhongxing",
		"PC": "ZX HNC(3-2)",
		"Version": "1.0"
	}
}

其中 ,ED字段爲加密字段,解密後還是一段JSON:

{
	"xxxstatus": [{
		"Time": "2019-11-03 06:16:00",
		"runningTime": 240187,
		"status01": 2,
		"status02": 41
	}],
	"xxxinfo": [{
		"Time": "2019-11-03 06:16:00",
		"xxxDev": [{
			"xxxDev01": 1,
			"xxxDev02": "xxx-00000",
			"xxxDev03": "zzG-1",
			"xxxDev04": "阿里",
			"xxxDev05": "2154878787",
			"xxxDev06": 4,
			"xxxDev07": 4,
			"xxxDev08": 0,
			"xxxDev09": "WPA/OST",
			"xxxDev10": "a,t,c",
			"xxxDev11": "40-bit",
			"xxxDev12": "All"
		}, {
			"xxxDev01": 0,
			"xxxDev02": "Guest-test",
			"xxxDev03": "zzG-2",
			"xxxDev04": "APP",
			"xxxDev05": "7777777",
			"xxxDev06": 4,
			"xxxDev07": 4,
			"xxxDev08": 0,
			"xxxDev09": "WPA/OST",
			"xxxDev10": "a,t,c",
			"xxxDev11": "40-bit",
			"xxxDev12": "OSystem"
		}]
	}],
	"xxxrxtx": [{
		"Time": "2019-11-03 04:55:08",
		"xxxList": [{
			"xxxList01": "2564587945",
			"xxxList02": "phone",
			"xxxList03": "5G",
			"xxxList04": "00CDN565BGF",
			"xxxList05": "192.168.0.124",
			"xxxList06": "xx的手機",
			"xxxList07": "iPhone 6s",
			"xxxList08": "蘋果",
			"xxxList09": "xxxPhone",
			"xxxList10": 1520000,
			"xxxList11": 274000000
		}, {
			"xxxList01": "25687941111",
			"xxxList02": "phone",
			"xxxList03": "5G",
			"xxxList04": "9CBDF456FG",
			"xxxList05": "192.168.0.125",
			"xxxList06": "xx的手機",
			"xxxList07": "華爲G9青春版",
			"xxxList08": "華爲",
			"xxxList09": "xxx_Youth",
			"xxxList10": 1546000,
			"xxxList11": 102580000
		}, {
			"xxxList01": "1897945612",
			"xxxList02": "SVC",
			"xxxList03": "1",
			"xxxList04": "24HDBV2356BC",
			"xxxList05": "255.255.255.255",
			"xxxList06": "xx的機頂盒",
			"xxxList07": "網易雲盒",
			"xxxList08": "網易",
			"xxxList09": "android-xxxx",
			"xxxList10": 589710000,
			"xxxList11": 15687900000
		}]
	}, {
		"Time": "2019-11-03 05:00:09",
		"xxxList": [{
			"xxxList01": "56897456664",
			"xxxList02": "phone",
			"xxxList03": "3G",
			"xxxList04": "000CDB482BC",
			"xxxList05": "192.168.0.126",
			"xxxList06": "xx的手機",
			"xxxList07": "iPhone 6s",
			"xxxList08": "蘋果",
			"xxxList09": "xxxPhone",
			"xxxList10": 56897400,
			"xxxList11": 2885610000
		}]
	}]
}

我們要做的就是,將xxxstatus、xxxinfo、xxxrxtx加上外面json字段解析成3張表。

		//讀取gz文件數據 
		DataFrame json_df = sc.read().json("D:\\work\\test.txt.gz");
		json_df.printSchema();

首先要解密event的ED字段以及獲取需要的最外層mac、tel字段。ED字段使用的是base64加密,而spark sql自帶的內置函數unbase64便可解析。spark sql自帶很多有用的內置函數,具體參考官方文檔:http://spark.apache.org/docs/latest/api/sql/index.html

		//註冊爲臨時表 
		json_df.registerTempTable("tmp_json_df");
		//解密ED字段 
		DataFrame unbase64_json_df = sc.sql("select mac,tel,unbase64(event.ED) as ED from tmp_json_df");
		unbase64_json_df.show();

ED字段解密後是二進制,需要轉換爲字符串,使用內置函數 decode(bin, charset)

		//解密ED字段 
		DataFrame unbase64_json_df = sc.sql("select mac,tel,decode(unbase64(event.ED), 'UTF-8') as ED from tmp_json_df");
		unbase64_json_df.show();
		unbase64_json_df.printSchema();

 上面結果可以看出雖然ED字段解析出來了,但是ED的數據類型是string,並不能用ED.xxxstatus的方法獲取到裏面的字段。

		//獲取ED內部字段 
		DataFrame ed_df = sc.sql("select mac,tel,ED.xxxstatus,ED.xxxinfo,ED.xxxrxtx from tmp_json_df");
		ed_df.show();

需要將ED字段的String類型轉化爲struct類型,先將數據集轉化爲json格式分析下

		//轉爲爲json並保存 
		unbase64_json_df.toJSON().saveAsTextFile("D:\\work\\test");
{
	"mac": "33:D3:B3:43:83:03",
	"tel": "15378451236",
	"ED": "{\"xxxstatus\":[{\"Time\":\"2019-11-03 06:16:00\",\"runningTime\":240187,\"status01\":2,\"status02\":41}],\"xxxinfo\":[{\"Time\":\"2019-11-03 06:16:00\",\"xxxDev\":[{\"xxxDev01\":1,\"xxxDev02\":\"xxx-00000\",\"xxxDev03\":\"zzG-1\",\"xxxDev04\":\"阿里\",\"xxxDev05\":\"2154878787\",\"xxxDev06\":4,\"xxxDev07\":4,\"xxxDev08\":0,\"xxxDev09\":\"WPA/OST\",\"xxxDev10\":\"a,t,c\",\"xxxDev11\":\"40-bit\",\"xxxDev12\":\"All\"},{\"xxxDev01\":0,\"xxxDev02\":\"Guest-test\",\"xxxDev03\":\"zzG-2\",\"xxxDev04\":\"APP\",\"xxxDev05\":\"7777777\",\"xxxDev06\":4,\"xxxDev07\":4,\"xxxDev08\":0,\"xxxDev09\":\"WPA/OST\",\"xxxDev10\":\"a,t,c\",\"xxxDev11\":\"40-bit\",\"xxxDev12\":\"OSystem\"}]}],\"xxxrxtx\":[{\"Time\":\"2019-11-03 04:55:08\",\"xxxList\":[{\"xxxList01\":\"2564587945\",\"xxxList02\":\"phone\",\"xxxList03\":\"5G\",\"xxxList04\":\"00CDN565BGF\",\"xxxList05\":\"192.168.0.124\",\"xxxList06\":\"xx的手機\",\"xxxList07\":\"iPhone 6s\",\"xxxList08\":\"蘋果\",\"xxxList09\":\"xxxPhone\",\"xxxList10\":1520000,\"xxxList11\":274000000},{\"xxxList01\":\"25687941111\",\"xxxList02\":\"phone\",\"xxxList03\":\"5G\",\"xxxList04\":\"9CBDF456FG\",\"xxxList05\":\"192.168.0.125\",\"xxxList06\":\"xx的手機\",\"xxxList07\":\"華爲G9青春版\",\"xxxList08\":\"華爲\",\"xxxList09\":\"xxx_Youth\",\"xxxList10\":1546000,\"xxxList11\":102580000},{\"xxxList01\":\"1897945612\",\"xxxList02\":\"SVC\",\"xxxList03\":\"1\",\"xxxList04\":\"24HDBV2356BC\",\"xxxList05\":\"255.255.255.255\",\"xxxList06\":\"xx的機頂盒\",\"xxxList07\":\"網易雲盒\",\"xxxList08\":\"網易\",\"xxxList09\":\"android-xxxx\",\"xxxList10\":589710000,\"xxxList11\":15687900000}]},{\"Time\":\"2019-11-03 05:00:09\",\"xxxList\":[{\"xxxList01\":\"56897456664\",\"xxxList02\":\"phone\",\"xxxList03\":\"3G\",\"xxxList04\":\"000CDB482BC\",\"xxxList05\":\"192.168.0.126\",\"xxxList06\":\"xx的手機\",\"xxxList07\":\"iPhone 6s\",\"xxxList08\":\"蘋果\",\"xxxList09\":\"xxxPhone\",\"xxxList10\":56897400,\"xxxList11\":2885610000}]}]}"
}

可以看出,如果將ED字段的大括號外的雙引號去掉,再把大括號內部的轉義字符“\”去掉,再通過sqlcontext讀取下,應該就可以直接取到內部值了。

		//規範化ED字段json字符串,將字符串轉化爲json格式  
		JavaRDD<String> unbase64_json_rdd = unbase64_json_df.toJSON().toJavaRDD().map(new Function<String, String>() {

			@Override
			public String call(String st) throws Exception {
				return st.replaceAll("\\\\\"", "\"").replaceFirst(":\"\\{", ":{").replaceFirst("\"} *$", "}");
			}
		});
		unbase64_json_rdd.saveAsTextFile("D:\\work\\test02");
{
	"mac": "33:D3:B3:43:83:03",
	"tel": "15378451236",
	"ED": {
		"xxxstatus": [{
			"Time": "2019-11-03 06:16:00",
			"runningTime": 240187,
			"status01": 2,
			"status02": 41
		}],
		"xxxinfo": [{
			"Time": "2019-11-03 06:16:00",
			"xxxDev": [{
				"xxxDev01": 1,
				"xxxDev02": "xxx-00000",
				"xxxDev03": "zzG-1",
				"xxxDev04": "阿里",
				"xxxDev05": "2154878787",
				"xxxDev06": 4,
				"xxxDev07": 4,
				"xxxDev08": 0,
				"xxxDev09": "WPA/OST",
				"xxxDev10": "a,t,c",
				"xxxDev11": "40-bit",
				"xxxDev12": "All"
			}, {
				"xxxDev01": 0,
				"xxxDev02": "Guest-test",
				"xxxDev03": "zzG-2",
				"xxxDev04": "APP",
				"xxxDev05": "7777777",
				"xxxDev06": 4,
				"xxxDev07": 4,
				"xxxDev08": 0,
				"xxxDev09": "WPA/OST",
				"xxxDev10": "a,t,c",
				"xxxDev11": "40-bit",
				"xxxDev12": "OSystem"
			}]
		}],
		"xxxrxtx": [{
			"Time": "2019-11-03 04:55:08",
			"xxxList": [{
				"xxxList01": "2564587945",
				"xxxList02": "phone",
				"xxxList03": "5G",
				"xxxList04": "00CDN565BGF",
				"xxxList05": "192.168.0.124",
				"xxxList06": "xx的手機",
				"xxxList07": "iPhone 6s",
				"xxxList08": "蘋果",
				"xxxList09": "xxxPhone",
				"xxxList10": 1520000,
				"xxxList11": 274000000
			}, {
				"xxxList01": "25687941111",
				"xxxList02": "phone",
				"xxxList03": "5G",
				"xxxList04": "9CBDF456FG",
				"xxxList05": "192.168.0.125",
				"xxxList06": "xx的手機",
				"xxxList07": "華爲G9青春版",
				"xxxList08": "華爲",
				"xxxList09": "xxx_Youth",
				"xxxList10": 1546000,
				"xxxList11": 102580000
			}, {
				"xxxList01": "1897945612",
				"xxxList02": "SVC",
				"xxxList03": "1",
				"xxxList04": "24HDBV2356BC",
				"xxxList05": "255.255.255.255",
				"xxxList06": "xx的機頂盒",
				"xxxList07": "網易雲盒",
				"xxxList08": "網易",
				"xxxList09": "android-xxxx",
				"xxxList10": 589710000,
				"xxxList11": 15687900000
			}]
		}, {
			"Time": "2019-11-03 05:00:09",
			"xxxList": [{
				"xxxList01": "56897456664",
				"xxxList02": "phone",
				"xxxList03": "3G",
				"xxxList04": "000CDB482BC",
				"xxxList05": "192.168.0.126",
				"xxxList06": "xx的手機",
				"xxxList07": "iPhone 6s",
				"xxxList08": "蘋果",
				"xxxList09": "xxxPhone",
				"xxxList10": 56897400,
				"xxxList11": 2885610000
			}]
		}]
	}
}

再次通過sqlcontext讀取解析json

		//再次讀取解析規範化後的json 
		DataFrame second_json_df = sc.read().json(unbase64_json_rdd);
		second_json_df.show();
		second_json_df.printSchema();


接下來就簡單了,直接展開讀取需要的數據即可。

		//生成xxxstatus表數據 由於xxxstatus字段是數組需要先展開 
		second_json_df.registerTempTable("tmp_second_json_df");
		DataFrame xxxstatus_ex_df = sc.sql("select mac,tel,explode(ED.xxxstatus) as xxxstatus from tmp_second_json_df");
		xxxstatus_ex_df.show();

由於xxxstatus數組裏只有一個元素,展開後也就一條記錄

		//生成xxxstatus表數據 runningTime status01 status02 都是long類型需要轉化爲String類型
		xxxstatus_ex_df.registerTempTable("tmp_xxxstatus_ex");
		DataFrame xxxstatus_info_df = sc.sql("select mac,tel,xxxstatus.Time,cast(xxxstatus.runningTime as string) runningTime,cast(xxxstatus.status01 as string) status01,cast(xxxstatus.status02 as string) status02 from tmp_xxxstatus_ex");
		xxxstatus_info_df.show();

將xxxstatus數據生成自定的分隔符分隔的文件

		//字段以|作爲分割符 
		JavaRDD<String> xxxstatus_info_rdd = xxxstatus_info_df.toJavaRDD().map(new Function<Row, String>() {

			@Override
			public String call(Row row) throws Exception {
				StringBuilder stb = new StringBuilder();
				for (int i = 0; i < row.length(); i++) {
					stb.append(row.getString(i) + "|");
				}
				return stb.toString();
			}
		});
		//生成一個文件 
		xxxstatus_info_rdd.repartition(1).saveAsTextFile("D:\\work\\test03");
33:D3:B3:43:83:03|15378451236|2019-11-03 06:16:00|240187|2|41|

生成xxxinfo表數據

		//生成xxxinfo數據 
		DataFrame xxxinfo_ex_df = sc.sql("select mac,tel,explode(ED.xxxinfo) as xxxinfo from tmp_second_json_df");
		xxxinfo_ex_df.registerTempTable("tmp_xxxinfo_ex");
		//展開xxxDev數組 
		DataFrame xxxinfo_info_df = sc.sql("select mac,tel,xxxinfo.Time,explode(xxxinfo.xxxDev) xxxDev from tmp_xxxinfo_ex");
		xxxinfo_info_df.registerTempTable("tmp_xxxinfo_info");
		////生成xxxinfo表數據 xxxDev01 xxxDev06 xxxDev07 xxxDev08 都是long類型需要轉化爲String類型
		DataFrame xxxinfo_xxxDev_df = sc.sql("select  mac,tel,Time,cast(xxxDev.xxxDev01 as string) xxxDev01,xxxDev.xxxDev02,xxxDev.xxxDev03,xxxDev.xxxDev04,xxxDev.xxxDev05,cast(xxxDev.xxxDev06 as string) xxxDev06,cast(xxxDev.xxxDev07 as string) xxxDev07,cast(xxxDev.xxxDev08 as string) xxxDev08,xxxDev.xxxDev09,xxxDev.xxxDev10,xxxDev.xxxDev11,xxxDev.xxxDev12 from tmp_xxxinfo_info");
		xxxinfo_xxxDev_df.show();

 

 +-----------------+-----------+-------------------+--------+----------+--------+--------+----------+--------+--------+--------+--------+--------+--------+--------+
|              mac|        tel|               Time|xxxDev01|  xxxDev02|xxxDev03|xxxDev04|  xxxDev05|xxxDev06|xxxDev07|xxxDev08|xxxDev09|xxxDev10|xxxDev11|xxxDev12|
+-----------------+-----------+-------------------+--------+----------+--------+--------+----------+--------+--------+--------+--------+--------+--------+--------+
|33:D3:B3:43:83:03|15378451236|2019-11-03 06:16:00|       1| xxx-00000|   zzG-1|      阿里|2154878787|       4|       4|       0| WPA/OST|   a,t,c|  40-bit|     All|
|33:D3:B3:43:83:03|15378451236|2019-11-03 06:16:00|       0|Guest-test|   zzG-2|     APP|   7777777|       4|       4|       0| WPA/OST|   a,t,c|  40-bit| OSystem|
+-----------------+-----------+-------------------+--------+----------+--------+--------+----------+--------+--------+--------+--------+--------+--------+--------+

生成xxxrxtx數據 

		//生成xxxrxtx數據 
		DataFrame xxxrxtx_ex_df = sc.sql("select mac,tel,explode(ED.xxxrxtx) as xxxrxtx from tmp_second_json_df");
		xxxrxtx_ex_df.registerTempTable("tmp_xxxrxtx_ex");
		//展開xxxList數組 
		DataFrame xxxrxtx_info_df = sc.sql("select mac,tel,xxxrxtx.Time,explode(xxxrxtx.xxxList) xxxList from tmp_xxxrxtx_ex");
		xxxrxtx_info_df.registerTempTable("tmp_xxxrxtx_info");
		////生成xxxrxtx表數據 xxxList10 xxxList11 都是long類型需要轉化爲String類型
		DataFrame xxxrxtx_xxxList_df = sc.sql("select  mac,tel,Time,xxxList.xxxList01,xxxList.xxxList02,xxxList.xxxList03,xxxList.xxxList04,xxxList.xxxList05,xxxList.xxxList06,xxxList.xxxList07,xxxList.xxxList08,xxxList.xxxList09,cast(xxxList.xxxList10 as string) xxxList10,cast(xxxList.xxxList11 as string) xxxList11 from tmp_xxxrxtx_info");
		xxxrxtx_xxxList_df.show();
 +-----------------+-----------+-------------------+-----------+---------+---------+------------+---------------+---------+---------+---------+------------+---------+-----------+
|              mac|        tel|               Time|  xxxList01|xxxList02|xxxList03|   xxxList04|      xxxList05|xxxList06|xxxList07|xxxList08|   xxxList09|xxxList10|  xxxList11|
+-----------------+-----------+-------------------+-----------+---------+---------+------------+---------------+---------+---------+---------+------------+---------+-----------+
|33:D3:B3:43:83:03|15378451236|2019-11-03 04:55:08| 2564587945|    phone|       5G| 00CDN565BGF|  192.168.0.124|    xx的手機|iPhone 6s|       蘋果|    xxxPhone|  1520000|  274000000|
|33:D3:B3:43:83:03|15378451236|2019-11-03 04:55:08|25687941111|    phone|       5G|  9CBDF456FG|  192.168.0.125|    xx的手機|  華爲G9青春版|       華爲|   xxx_Youth|  1546000|  102580000|
|33:D3:B3:43:83:03|15378451236|2019-11-03 04:55:08| 1897945612|      SVC|        1|24HDBV2356BC|255.255.255.255|   xx的機頂盒|     網易雲盒|       網易|android-xxxx|589710000|15687900000|
|33:D3:B3:43:83:03|15378451236|2019-11-03 05:00:09|56897456664|    phone|       3G| 000CDB482BC|  192.168.0.126|    xx的手機|iPhone 6s|       蘋果|    xxxPhone| 56897400| 2885610000|
+-----------------+-----------+-------------------+-----------+---------+---------+------------+---------------+---------+---------+---------+------------+---------+-----------+

關於sparksql解析json大文件的一些建議:

1、一個20G的gz文件,由於gz文件不可分隔的特性導致只能啓動一個task,解析速度非常慢,建議分成多個gz文件。或者將單個gz大文件解壓後再解析。解壓後文件大概200G,生成10個數據表,速度大概要2個多小時

2、json解析時由於json文件的規範變動活着不規範,有些需要的字段,json文件中並沒有,這導致執行sql的時候會報錯,可以先根據數據的shcema判斷下有沒有所需字段,如果沒有可以將字段置空,防止報錯

發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
相關文章