薪酬與工作滿意度大調查:數據科學家還是21世紀最性感的職業嗎?

{"type":"doc","content":[{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","marks":[{"type":"italic"},{"type":"strong"}],"text":"本文最初發佈於Medium網站,經原作者授權由InfoQ中文站翻譯並分享。"}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null}},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","text":"我們身處一個數據無處不在的時代。過去幾年來,勞動力市場上與數據相關的崗位一直是熱門。數據科學家、數據分析師和數據工程師是與數據相關的三大職業類別。如果你有興趣進入這個領域,或者已經身在其中,那麼你應該瞭解一下數據領域的行業就業現狀。當你踏上數據行業的職業生涯,相關崗位工作要求、薪水和滿意度等信息可以幫助你做足準備。"}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null}},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","text":"在這篇文章中,我會以StackOverflow年度開發人員調查爲基礎來探索一些有趣的問題。文章包含一些數據分析和建模內容,分爲三大部分:"}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null}},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","marks":[{"type":"strong"}],"text":"第1部分:工資"}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null}},{"type":"bulletedlist","content":[{"type":"listitem","attrs":{"listStyle":null},"content":[{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","text":"數據行業中哪些崗位的薪水最高?"}]}]},{"type":"listitem","attrs":{"listStyle":null},"content":[{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","text":"不同國家之間的工資對比;"}]}]},{"type":"listitem","attrs":{"listStyle":null},"content":[{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","text":"工資水平與工作年限的對應關係;"}]}]},{"type":"listitem","attrs":{"listStyle":null},"content":[{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","text":"不同性別之間的工資差異;"}]}]},{"type":"listitem","attrs":{"listStyle":nul
l},"content":[{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","text":"工資vs工作滿意度。"}]}]}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null}},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","marks":[{"type":"strong"}],"text":"第2部分:數據行業的就業市場變化,對比2020年和2019年的數據"}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null}},{"type":"bulletedlist","content":[{"type":"listitem","attrs":{"listStyle":null},"content":[{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","text":"數據行業崗位數量的變化;"}]}]},{"type":"listitem","attrs":{"listStyle":null},"content":[{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","text":"工資變化;"}]}]},{"type":"listitem","attrs":{"listStyle":null},"content":[{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","text":"工作滿意度的變化。"}]}]}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null}},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","marks":[{"type":"strong"}],"text":"第3部分:工作滿意度"}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null}},{"type":"bulletedlist","content":[{"type":"listitem","attrs":{"listStyle":null},"content":[{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","text":"使用XGBoost的多類分類來預測工作滿意度;"}]}]},{"type":"listitem","attrs":{"listStyle":null},"content":[{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","text":"通過建模獲得的洞察。"}]}]}]},{"type":"heading","attrs":{"align":null,"level":2},"content":[{"type":"text","text":"本文所用數據的說明:"}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"con
tent":[{"type":"text","text":"在呈現分析結果之前,有必要解釋一些重要的數據處理步驟。讀者可以先跳過本節,在後文遇到疑問時再回來細讀。"}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null}},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","text":"這份調查的問題主要針對具有一般開發人員背景的受訪者,包括軟件開發和數據分析領域。2020年的調查數據包含64,461份回覆。我利用了“DevType”問題並對數據進行了預處理,因此只會分析身處數據相關崗位的受訪者。問卷中的問題如下。"}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null}},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","text":"以下哪一項符合你的情況?請選擇所有適用的選項。"}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null}},{"type":"bulletedlist","content":[{"type":"listitem","attrs":{"listStyle":null},"content":[{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","text":"學術研究員"}]}]},{"type":"listitem","attrs":{"listStyle":null},"content":[{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","marks":[{"type":"strong"}],"text":"數據或業務分析師(DA)"}]}]},{"type":"listitem","attrs":{"listStyle":null},"content":[{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","marks":[{"type":"strong"}],"text":"數據科學家或機器學習專家(DS)"}]}]},{"type":"listitem","attrs":{"listStyle":null},"content":[{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","text":"數據庫管理員"}]}]},{"type":"listitem","attrs":{"listStyle":null},"content":[{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","text":"設計師"}]}]},{"type":"listitem","attrs":{"listStyle":null},"content":[{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","text":"開發人員,後端"}]}]},{"type":"listitem","attrs":{"listStyle":null},"content":[{"type":"paragraph","attrs":{"indent":0,"
number":0,"align":null,"origin":null},"content":[{"type":"text","text":"開發人員,桌面或企業應用程序"}]}]},{"type":"listitem","attrs":{"listStyle":null},"content":[{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","text":"開發人員,嵌入式應用程序或設備"}]}]},{"type":"listitem","attrs":{"listStyle":null},"content":[{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","text":"開發人員,前端"}]}]},{"type":"listitem","attrs":{"listStyle":null},"content":[{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","text":"開發人員,全棧"}]}]},{"type":"listitem","attrs":{"listStyle":null},"content":[{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","text":"開發人員,遊戲或圖形開發人員,移動設備"}]}]},{"type":"listitem","attrs":{"listStyle":null},"content":[{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","text":"開發人員、QA或測試人員"}]}]},{"type":"listitem","attrs":{"listStyle":null},"content":[{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","text":"DevOps專家"}]}]},{"type":"listitem","attrs":{"listStyle":null},"content":[{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","text":"教育家"}]}]},{"type":"listitem","attrs":{"listStyle":null},"content":[{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","marks":[{"type":"strong"}],"text":"工程師,數據(DE)"}]}]},{"type":"listitem","attrs":{"listStyle":null},"content":[{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","text":"工程師,現場可靠性工程經理"}]}]},{"type":"listitem","attrs":{"listStyle":null},"content":[{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","text":"市場營銷或銷售專業產品經理"}]}]},{"
type":"listitem","attrs":{"listStyle":null},"content":[{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","text":"科學家"}]}]},{"type":"listitem","attrs":{"listStyle":null},"content":[{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","text":"高級管理(CSuite、VP等)學生"}]}]},{"type":"listitem","attrs":{"listStyle":null},"content":[{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","text":"系統管理員"}]}]},{"type":"listitem","attrs":{"listStyle":null},"content":[{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","text":"其他"}]}]}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null}},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","text":"請注意,受訪者可以選擇多個DevType,甚至可以選擇多個數據崗位(即DA、DS、DE)。我只保留了與數據崗位關聯的數據條目,並按不同類型的崗位做了分類。因此,兩個新創建的實例可能來自具有多個數據崗位的同一個數據實例。"}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null}},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","text":"爲了讓分析結果更加一致可靠,我還過濾了“就業狀態”,只保留當前有工作的受訪者。經過這些數據準備工作,我們篩選出了11,186個數據崗位實例。2019年調查數據也做了同樣的數據準備,創建了17,370個實例,它們會與第2部分中的2020年數據進行比較。"}]},{"type":"heading","attrs":{"align":null,"level":2},"content":[{"type":"text","text":"第1部分:工資情況"}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","text":"我們先來看看這三種數據職業的分佈情況。在這裏我們可以看到它們的數字非常接近,DA排名第一,然後是DS、DE(圖1a)。就平均工資(美元)而言,我們可以看到DS和DE幾乎相同,而DA的工資略低(圖1b)。"}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null}},{"type":"image","attrs":{"src":"https:\/\/static001.geekbang.org\/resource\/image\/16\/61\/16b6ebbba9a50ac4c49a5c4bf4479361.png","alt":null,"title":null,"style":[{"key":"width","value":"75%"},{"key":"bordertype","value":"none"}],"href":
null,"fromPaste":true,"pastePass":true}},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null}},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":"center","origin":null},"content":[{"type":"text","text":"圖1(a):2020年調查中的數據職業分佈。(b):數據職業的薪水。"}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null}},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","text":"接着我們來看不同國家(受訪者最多的前9個國家)的工資水平,我們可以在圖2中看到一些有趣的細節:"}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null}},{"type":"bulletedlist","content":[{"type":"listitem","attrs":{"listStyle":null},"content":[{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","text":"美國的數據行業工作收入最高。美國的平均工資遠高於其他國家,甚至高於西方其他發達國家(即德國、英國、加拿大)。"}]}]},{"type":"listitem","attrs":{"listStyle":null},"content":[{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","text":"與DA相比,DS(和DE)不一定是收入較高的工作。事實上,我們看到在加拿大和法國,DA的薪水比DS高很多。"},{"type":"text","marks":[{"type":"strong"}],"text":"這可能會讓你想起一句話:數據科學家是21世紀最性感的工作。它可能不像你想象的那麼性感,至少從收入調查數據來看不那麼理想。"},{"type":"text","text":"然而,StackOverflow調查數據並不一定準確反映了現實世界的總體情況。"}]}]}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null}},{"type":"image","attrs":{"src":"https:\/\/static001.geekbang.org\/resource\/image\/26\/9f\/264bc131e837802bdc2d5cf23821d09f.png","alt":null,"title":null,"style":[{"key":"width","value":"75%"},{"key":"bordertype","value":"none"}],"href":null,"fromPaste":true,"pastePass":true}},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null}},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":"center","origin":null},"content":[{"type":"text","text":"圖2:不同國家數據行業工作的薪水。"}]},{"type":"paragraph","attrs
":{"indent":0,"number":0,"align":null,"origin":null}},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","text":"與薪水相關的一個重要因素是工作年限。在調查數據中,我們用“專業編程年數”來評價工作經驗,將受訪者按“專業編程年數”分爲四組。在圖3中,從“0-3年”到“6-13年”,我們可以看到工資逐漸緩慢增加,而“13+年”組的工資有大幅上漲。"}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null}},{"type":"image","attrs":{"src":"https:\/\/static001.geekbang.org\/resource\/image\/63\/4e\/635cd6356e06bc68472a3404c4343b4e.png","alt":null,"title":null,"style":[{"key":"width","value":"75%"},{"key":"bordertype","value":"none"}],"href":null,"fromPaste":true,"pastePass":true}},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null}},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":"center","origin":null},"content":[{"type":"text","text":"圖3:各個“專業編程年數”組中數據崗位的薪水。這樣的分組是爲了讓各組中的樣本數儘量相當。"}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null}},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","text":"另一個有趣且重要的觀察角度是該領域的性別分佈。開發人員的性別差異是一個受長期關注的問題。我們在這個行業中也看到了這個問題(圖4)。"}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null}},{"type":"image","attrs":{"src":"https:\/\/static001.geekbang.org\/resource\/image\/f4\/ee\/f431dd39c3507fe615a7475d3528f5ee.png","alt":null,"title":null,"style":[{"key":"width","value":"75%"},{"key":"bordertype","value":"none"}],"href":null,"fromPaste":true,"pastePass":true}},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null}},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":"center","origin":null},"content":[{"type":"text","text":"圖4:數據行業工作中的性別分佈。爲了方便做圖,所有非二元性別都被分到了“其他”組中。"}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null}},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","text":"在工資方面,我們可以看到男性和女性之間沒有實質性差異(圖5a),但女性的工資波動範圍更大(高方差,圖5b)
。"}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null}},{"type":"image","attrs":{"src":"https:\/\/static001.geekbang.org\/resource\/image\/65\/36\/65969f655c0yy06e120941bfa2799636.png","alt":null,"title":null,"style":[{"key":"width","value":"75%"},{"key":"bordertype","value":"none"}],"href":null,"fromPaste":true,"pastePass":true}},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null}},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":"center","origin":null},"content":[{"type":"text","text":"圖5:按性別劃分的數據行業工作薪資。(a)條形圖。(b)小提琴圖。"}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null}},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","text":"最後,讓我們看看工作滿意度與薪水之間的關係。正如你所料,工作滿意度並不總是與薪水呈正相關,這也是圖6a中的數據顯示的結果。這也可能表明,有一羣數據工程師對自己的工作非常不滿意,但他們的薪水卻很高(見圖6b最右側的綠色高條)。當然,還有許多因素會影響我們在現實世界中的工作滿意度。在第3部分中,我將應用機器學習建模來預測工作滿意度並尋找更多見解。"}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null}},{"type":"image","attrs":{"src":"https:\/\/static001.geekbang.org\/resource\/image\/76\/35\/7648c16dc1a2361260e88bd036ca9d35.png","alt":null,"title":null,"style":[{"key":"width","value":"75%"},{"key":"bordertype","value":"none"}],"href":null,"fromPaste":true,"pastePass":true}},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null}},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":"center","origin":null},"content":[{"type":"text","text":"圖6:薪水vs.工作滿意度。"}]},{"type":"heading","attrs":{"align":null,"level":2},"content":[{"type":"text","text":"第2部分:數據行業工作的變化,對比2020年和2019年的數據"}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","text":"在這一部分中,我們來看看數據領域的一些趨勢。由於我們的分析基於StackOverflow的調查數據,而他們2018年的調查形式與2019年和2020年有較大差異,因此我們只使用2019年、2020年的數據來做對比。"}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null}},{"type":"paragraph","attrs":{"ind
ent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","text":"我們首先注意到,2020年從原始數據中篩選出的有效調查回覆總數爲53,159,也就是說15.69%的受訪者從事與數據相關的工作。相比之下,2019年的調查數據有77,420份有效調查回覆,16.71%的回覆者從事數據相關工作。"}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null}},{"type":"image","attrs":{"src":"https:\/\/static001.geekbang.org\/resource\/image\/52\/ec\/52237deb8200bcae169156c463960aec.png","alt":null,"title":null,"style":[{"key":"width","value":"75%"},{"key":"bordertype","value":"none"}],"href":null,"fromPaste":true,"pastePass":true}},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null}},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":"center","origin":null},"content":[{"type":"text","text":"圖7:數據崗位和分佈的數量,2019年對比2020年。請注意,由於我們創建數據實例的方式,這裏的總數大於調查回覆的數量。"}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null}},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","text":"首先,數據顯示這三個崗位之間的分佈幾乎沒有變化(圖7)。2019年的統計數字更大,因爲2019年的調查有更多的回覆,這並不一定意味着2020年與數據相關的工作數量有所減少。"}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null}},{"type":"image","attrs":{"src":"https:\/\/static001.geekbang.org\/resource\/image\/16\/21\/16ca7a2385c3c0bde61ee7d68dfc7b21.png","alt":null,"title":null,"style":[{"key":"width","value":"75%"},{"key":"bordertype","value":"none"}],"href":null,"fromPaste":true,"pastePass":true}},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null}},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":"center","origin":null},"content":[{"type":"text","text":"圖8:2019年和2020年之間的工資比較。"}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null}},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","text":"談到薪水的變化,也許這是我遇到的最令人驚訝的結果。"},{"type":"text","marks":[{"type":"strong"}],"text":"我們可以看到所有三個數據崗位的薪水都減少了,總體平均減少了約16,000美元(圖8)。"}]
},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null}},{"type":"image","attrs":{"src":"https:\/\/static001.geekbang.org\/resource\/image\/df\/33\/df873d045289ffb23fa6c46389e1be33.png","alt":null,"title":null,"style":[{"key":"width","value":"75%"},{"key":"bordertype","value":"none"}],"href":null,"fromPaste":true,"pastePass":true}},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null}},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":"center","origin":null},"content":[{"type":"text","text":"圖9:2019年和2020年的工資對比,僅美國數據。"}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null}},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","text":"爲了進一步驗證這一變化,我只使用美國的數據做了圖,並且觀察到了類似的工資下降情況(圖9)。這次我們看到DS的薪水只是略有下降,而DA和DE的下降幅度更大。如果能使用其他數據源來交叉驗證這種下降趨勢是否屬實,會很有趣的。如果事實的確如此,那麼原因是什麼?這又會產生什麼影響?附帶說明一下,我首先認爲工資的這種下降趨勢可能是由於Covid-19以及市場低迷造成的。但2020年的調查數據實際上是在2020年2月收集的,那時疫情尚未產生影響。"}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null}},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","text":"最後,我們來看看工作滿意度,這是幫助深入瞭解就業市場的另一個重要因素。問卷中關於工作滿意度的問題如下:"}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null}},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","marks":[{"type":"strong"}],"text":"你對目前的工作滿意嗎?(如果你從事多項工作,請回答你花費最多時間的工作。)"}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null}},{"type":"bulletedlist","content":[{"type":"listitem","attrs":{"listStyle":null},"content":[{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","text":"非常不滿意Very 
dissatisfied(-2)"}]}]},{"type":"listitem","attrs":{"listStyle":null},"content":[{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","text":"有點不滿意Slightly dissatisfied(-1)"}]}]},{"type":"listitem","attrs":{"listStyle":null},"content":[{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","text":"一般水平Neither satisfied nor dissatisfied(0)"}]}]},{"type":"listitem","attrs":{"listStyle":null},"content":[{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","text":"略滿意Slightly satisfied(1)"}]}]},{"type":"listitem","attrs":{"listStyle":null},"content":[{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","text":"非常滿意Very satisfied(2)"}]}]}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null}},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","text":"我將不同等級的滿意度轉換爲範圍從-2到2的值。平均分數如圖10所示。我們可以觀察到,在2019年和2020年,DS的滿意度得分最高,其次是DE和DA。此外,從2019年到2020年,所有三個數據崗位的滿意度都有所下降,這與工資的變化趨勢是一致的。"}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null}},{"type":"image","attrs":{"src":"https:\/\/static001.geekbang.org\/resource\/image\/bc\/1a\/bca17959cc56927a5d94419c070c171a.png","alt":null,"title":null,"style":[{"key":"width","value":"75%"},{"key":"bordertype","value":"none"}],"href":null,"fromPaste":true,"pastePass":true}},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null}},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":"center","origin":null},"content":[{"type":"text","text":"圖10:2019年和2020年工作滿意度對比。"}]},{"type":"heading","attrs":{"align":null,"level":2},"content":[{"type":"text","text":"第3部分:工作滿意度預測"}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","text":"在本文的最後一部分,我想構建機器學習模型來預測工作滿意度。正如我們
之前看到的,工作滿意度問題有五個答案選項。因此這裏的預測將是一個多分類問題。"}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null}},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","text":"爲避免不同年份的數據分佈不一致,這裏僅使用2020年調查數據進行建模。另外,請注意數據是不平衡的。“非常滿意”和“略滿意”佔總數據的60%以上,“非常不滿意”不到10%。"}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null}},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","text":"一些重要的數據處理步驟包括:1)數據清洗;2)缺失數據插補;3)分類數據編碼;4)特徵選擇\/工程。這裏我會跳過這些技術細節,你可以參考"},{"type":"link","attrs":{"href":"https:\/\/github.com\/Chancylin\/StackOverflow_Survey?fileGuid=JTgLxc0LSrQdpCO9","title":"","type":null},"content":[{"type":"text","text":"GitHub存儲庫"}]},{"type":"text","text":"中的筆記本和代碼以瞭解更多細節。對於建模,我使用XGBoost算法和oneVsRest方法來解決這個多分類問題。"}]},{"type":"heading","attrs":{"align":null,"level":3},"content":[{"type":"text","text":"3.1:探索性數據分析(EDA)"}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","text":"在展示建模結果之前,我將展示一些EDA作爲數據科學項目的例程,讓你快速瞭解工作滿意度。如"},{"type":"text","marks":[{"type":"strong"}],"text":"第1部分"},{"type":"text","text":"所示,工資和工作滿意度不一定相互關聯(圖6)。那麼其他可能影響工作滿意度的重要因素都有哪些呢?"}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null}},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","text":"如圖11所示,公司規模和工作滿意度之間肯定存在某種模式,但我們很難得出一般性結論。合理的猜測是,在小公司中,“非常滿意”的比例往往高於“略滿意”。"}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null}},{"type":"image","attrs":{"src":"https:\/\/static001.geekbang.org\/resource\/image\/d4\/78\/d43bf7731881ba4d6a3b4fe857f73978.png","alt":null,"title":null,"style":[{"key":"width","value":"75%"},{"key":"bordertype","value":"none"}],"href":null,"fromPaste":true,"pastePass":true}},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null}},{"type":"paragraph","attrs":{"in
dent":0,"number":0,"align":"center","origin":null},"content":[{"type":"text","text":"圖11:不同規模公司的工作滿意度分佈。"}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null}},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","text":"加班是另一個與一般意義上的工作滿意度相關的因素。然而這種關聯就算存在也是模糊的(圖12)。的確,在從不加班的組中,“非常滿意”的比例並非最大。"}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null}},{"type":"image","attrs":{"src":"https:\/\/static001.geekbang.org\/resource\/image\/5e\/27\/5e300a6d7yybe8db40bc847a5355c927.png","alt":null,"title":null,"style":[{"key":"width","value":"75%"},{"key":"bordertype","value":"none"}],"href":null,"fromPaste":true,"pastePass":true}},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null}},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":"center","origin":null},"content":[{"type":"text","text":"圖12:不同加班時間組之間的工作滿意度分佈。"}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null}},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","text":"最後讓我們檢查一下不同國家的工作滿意度分佈。一個觀察結果很顯眼:在印度和巴西等發展中國家,“非常滿意”的比例相對較低。儘管如此,我們不能就此斷定這是數據領域獨有的特殊現象。相反,無論哪個領域,發展中國家“非常滿意”的比例都可能較低。"}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null}},{"type":"image","attrs":{"src":"https:\/\/static001.geekbang.org\/resource\/image\/a7\/08\/a760998f9793f0f4478c69f3d52f4408.png","alt":null,"title":null,"style":[{"key":"width","value":"75%"},{"key":"bordertype","value":"none"}],"href":null,"fromPaste":true,"pastePass":true}},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null}},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":"center","origin":null},"content":[{"type":"text","text":"圖13:不同國家的工作滿意度分佈。請注意,爲簡單起見,我們沒有在建模步驟中使用國家(Country)作爲特徵。國家是一個高基數特徵,應該使用更高級的編碼技術,而不是簡單的單熱編碼(one-hot-encoding)。"}]},{"type":"heading","attrs":{"align":null,"level":3},"content"
:[{"type":"text","text":"3.2:使用XGBoost處理多分類問題"}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","text":"這裏我們來關注模型性能,所以檢查混淆矩陣。我們的模型有幾個觀察結果:"}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null}},{"type":"bulletedlist","content":[{"type":"listitem","attrs":{"listStyle":null},"content":[{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","text":"模型存在過擬合問題,儘管在建模步驟中已經應用了多種技術來解決過擬合問題;"}]}]},{"type":"listitem","attrs":{"listStyle":null},"content":[{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","text":"該模型可以在合理的水平上正確預測次要類別,儘管將實例預測爲“非常滿意”和“略滿意”會出很多錯誤。"}]}]},{"type":"listitem","attrs":{"listStyle":null},"content":[{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","text":"該模型被“非常滿意”和“略滿意”混淆(圖14,右)。"}]}]}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null}},{"type":"image","attrs":{"src":"https:\/\/static001.geekbang.org\/resource\/image\/6a\/62\/6a2f0e8d44aaf849df50d76755e23762.png","alt":null,"title":null,"style":[{"key":"width","value":"75%"},{"key":"bordertype","value":"none"}],"href":null,"fromPaste":true,"pastePass":true}},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null}},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":"center","origin":null},"content":[{"type":"text","text":"圖14:混淆矩陣的模型性能。左:訓練數據的混淆矩陣。右:測試數據的混淆矩陣。"}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null}},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","text":"當然,模型還有改進的餘地,但與使用樸素貝葉斯的基線模型(這裏沒展示)相比,模型性能相當不錯。此外,數據集本身可能也存在挑戰,因此無論如何都難以學習一般模式(即過擬合問題)。"}]},{"type":"heading","attrs":{"align":null,"level":3},"content":[{"type":"text","text":"3.3:模型可解釋性和見解"}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align
":null,"origin":null},"content":[{"type":"text","text":"爲了理解模型並獲得更多見解,我們來檢查在樹構建過程中計算的頂級特徵(圖16)。"}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null}},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","text":"使用單熱編碼給解釋帶來了一些不便,因爲原來的特徵現在被分解成多個二值特徵。兩個最重要的特徵是關於“NEWOnboardGood”的:“NEWOnboardGood_1”表示對“你認爲貴公司的入職流程是否良好”的回答爲“是”,“NEWOnboardGood_2”表示“否”。很明顯,良好的入職流程通常與高工作滿意度相關。"}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null}},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","text":"我們還看到“UndergradMajor”是“NEWOnboardGood”之後的另一個重要特徵。事實上,模型從數據中學到的是:"}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null}},{"type":"bulletedlist","content":[{"type":"listitem","attrs":{"listStyle":null},"content":[{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","text":"如果受訪者是自然科學(如生物、化學、物理等)本科專業,則更有可能表示“非常滿意”;"}]}]},{"type":"listitem","attrs":{"listStyle":null},"content":[{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","text":"對於擁有另一工程學科(如土木、電氣、機械等)本科專業的受訪者來說結果類似。"}]}]}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null}},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","text":"“Bash\/Shell\/PowerShell”特徵很有趣,使用“Bash\/Shell\/PowerShell”作爲其編程語言之一的受訪者比不使用它們的受訪者更有可能“非常滿意”。至少這是模型從數據中學到的結果。"}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null}},{"type":"image","attrs":{"src":"https:\/\/static001.geekbang.org\/resource\/image\/c2\/90\/c2185d4eba05c5daa8cdf3f837d96890.png","alt":null,"title":null,"style":[{"key":"width","value":"75%"},{"key":"bordertype","value":"none"}],"href":null,"fromPaste":true,"pastePass":true}},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origi
n":null}},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":"center","origin":null},"content":[{"type":"text","text":"圖16:XGBoost樹構建過程中通過增益計算出的特徵重要程度。"}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null}},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","text":"我們可以使用"},{"type":"link","attrs":{"href":"https:\/\/github.com\/slundberg\/shap?fileGuid=JTgLxc0LSrQdpCO9","title":"","type":null},"content":[{"type":"text","text":"SHAP"}]},{"type":"text","text":"(一種非常通用的模型可解釋性工具)來更好地理解模型行爲。我們鼓勵你在筆記本中探索更多見解。在這裏,我將向你展示基於SHAP值的頂級特徵,並展示兩個關於Salary(薪資)和Company Size(公司規模)的依賴圖示例。"}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null}},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","text":"首先,我們將看到SHAP值給出的頂級特徵(圖17)與節點分裂期間根據信息增益計算的特徵不同(圖16)。SHAP值方法被認爲能更一致地評估特徵重要性(請參閱SHAP論文以供參考)。在這裏,排名靠前的特徵是“Salary”“NEWOnboardGood_1”“Age”“YearsCode”“OrgSize”,在我看來更符合常識。"}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null}},{"type":"image","attrs":{"src":"https:\/\/static001.geekbang.org\/resource\/image\/c5\/a7\/c559a9b549c51b61c040539bcd4617a7.png","alt":null,"title":null,"style":[{"key":"width","value":"75%"},{"key":"bordertype","value":"none"}],"href":null,"fromPaste":true,"pastePass":true}},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null}},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":"center","origin":null},"content":[{"type":"text","text":"圖17:來自SHAP值的主要特徵,與圖16中來自信息增益的特徵對比。從0級到4級:“一般水平”“有點不滿意”“略滿意”“非常不滿意”“非常滿意”。"}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null}},{"type":"image","attrs":{"src":"https:\/\/static001.geekbang.org\/resource\/image\/68\/a3\/68813bfde4e5a567150773d8a37870a3.png","alt":null,"title":null,"style":[{"key":"width","value":"75%"},{"key":"bordertype","value":"none"}],"href":null,"fromPaste":true,"
pastePass":true}},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null}},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":"center","origin":null},"content":[{"type":"text","text":"圖18:薪水的SHAP依賴圖。"}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null}},{"type":"image","attrs":{"src":"https:\/\/static001.geekbang.org\/resource\/image\/81\/a1\/816635bc0b1070d08f29e0yyec6603a1.png","alt":null,"title":null,"style":[{"key":"width","value":"75%"},{"key":"bordertype","value":"none"}],"href":null,"fromPaste":true,"pastePass":true}},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null}},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":"center","origin":null},"content":[{"type":"text","text":"圖19:公司規模的SHAP依賴圖。OrgSize的映射是:{1: ‘Just me — I am a freelancer, sole proprietor, etc.’, 2: ‘2 to 9 employees’, 3: ‘10 to 19 employees’, 4: ‘20 to 99 employees’, 5: ‘100 to 499 employees’, 6: ‘500 to 999 employees’, 7: ‘1,000 to 4,999 employees’, 8: ‘5,000 to 9,999 employees’, 9: ‘10,000 or more employees’, -1: 
‘Missing’}"}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null}},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","text":"圖18中的SHAP依賴圖告訴我們如何使用薪水來預測實例是否“非常滿意”。我們可以看到,總體而言,更高的薪水意味着實例更有可能“非常滿意”,正如增長趨勢所表明的那樣。"}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null}},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","text":"對於公司規模,較小的規模傾向於對“非常滿意”的預測結果做出積極貢獻(圖19)。"}]},{"type":"heading","attrs":{"align":null,"level":2},"content":[{"type":"text","text":"結論"}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","text":"在這篇文章中,我們使用StackOverflow調查數據進行了一些探索性分析,並對數據行業就業市場有了一些瞭解。"}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null}},{"type":"numberedlist","attrs":{"start":1,"normalizeStart":1},"content":[{"type":"listitem","attrs":{"listStyle":null},"content":[{"type":"paragraph","attrs":{"indent":0,"number":1,"align":null,"origin":null},"content":[{"type":"text","text":"我們分析了不同數據崗位的薪資分佈,同時也考慮了其他因素,如國家、工作年限、性別。"}]}]},{"type":"listitem","attrs":{"listStyle":null},"content":[{"type":"paragraph","attrs":{"indent":0,"number":2,"align":null,"origin":null},"content":[{"type":"text","text":"我們將2019年的調查數據與2020年的數據進行了比較。令人驚訝的是,數據相關工作的薪水和工作滿意度都下降了。"}]}]},{"type":"listitem","attrs":{"listStyle":null},"content":[{"type":"paragraph","attrs":{"indent":0,"number":3,"align":null,"origin":null},"content":[{"type":"text","text":"最後,我們構建了一個XGBoost多分類模型,通過檢查特徵重要性和依賴關係來預測工作滿意度,並獲得了對工作滿意度的一些洞察。"}]}]}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null}},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","text":"這篇文章中的分析和建模更多是出於實踐目的,而沒有進一步調查就無法得出可靠的結論。最重要的是,StackOverflow調查數據本身就可能存在偏差,不一定代表真實世界的羣體情況。我希望你讀完這篇文章能得到一些樂趣。要查看更多技術細節,請訪問"},{"type":"link","attrs":{"href":"https:\/\/gi
thub.com\/Chancylin\/StackOverflow_Survey?fileGuid=JTgLxc0LSrQdpCO9","title":"","type":null},"content":[{"type":"text","text":"GitHub存儲庫"}]},{"type":"text","text":"。"}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null}},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","marks":[{"type":"strong"}],"text":"原文鏈接:"}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null}},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"link","attrs":{"href":"https:\/\/towardsdatascience.com\/salary-satisfaction-trend-of-data-jobs-f47bdf72afa3?fileGuid=JTgLxc0LSrQdpCO9","title":"","type":null},"content":[{"type":"text","text":"https:\/\/towardsdatascience.com\/salary-satisfaction-trend-of-data-jobs-f47bdf72afa3"}]}]}]}
發表評論
所有評論
還沒有人評論,想成為第一個評論的人嗎?請在上方評論欄輸入內容並點擊發布。
相關文章