23個優秀的機器學習訓練公共數據集

{"type":"doc","content":[{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","marks":[{"type":"italic"},{"type":"color","attrs":{"color":"#000000","name":"user"}},{"type":"strong"}],"text":"本文最初發佈於rubikscode.com網站,經原作者授權由InfoQ中文站翻譯並分享。"}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null}},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","marks":[{"type":"color","attrs":{"color":"#494949","name":"user"}}],"text":"Iris數據集的那些示例你是不是已經用膩了呢?不要誤會我的意思,Iris數據集作爲入門用途來說是很不錯的,但其實網絡上還有很多有趣的公共數據集可以用來練習機器學習和深度學習。在這篇文章中,我會分享23個優秀的公共數據集,除了介紹數據集和數據示例外,我還會介紹這些數據集各自可以解決哪些問題。"}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","marks":[{"type":"color","attrs":{"color":"#494949","name":"user"}}],"text":" "}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","marks":[{"type":"color","attrs":{"color":"#494949","name":"user"}}],"text":"以下是這23個公共數據集:"}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","marks":[{"type":"color","attrs":{"color":"#494949","name":"user"}}],"text":" 
"}]},{"type":"numberedlist","attrs":{"start":null,"normalizeStart":1},"content":[{"type":"listitem","attrs":{"listStyle":null},"content":[{"type":"paragraph","attrs":{"indent":0,"number":1,"align":null,"origin":null},"content":[{"type":"text","marks":[{"type":"color","attrs":{"color":"#494949","name":"user"}}],"text":"帕爾默企鵝數據集"}]}]},{"type":"listitem","attrs":{"listStyle":null},"content":[{"type":"paragraph","attrs":{"indent":0,"number":2,"align":null,"origin":null},"content":[{"type":"text","marks":[{"type":"color","attrs":{"color":"#494949","name":"user"}}],"text":"共享單車需求數據集"}]}]},{"type":"listitem","attrs":{"listStyle":null},"content":[{"type":"paragraph","attrs":{"indent":0,"number":3,"align":null,"origin":null},"content":[{"type":"text","marks":[{"type":"color","attrs":{"color":"#494949","name":"user"}}],"text":"葡萄酒分類數據集"}]}]},{"type":"listitem","attrs":{"listStyle":null},"content":[{"type":"paragraph","attrs":{"indent":0,"number":4,"align":null,"origin":null},"content":[{"type":"text","marks":[{"type":"color","attrs":{"color":"#494949","name":"user"}}],"text":"波士頓住房數據集"}]}]},{"type":"listitem","attrs":{"listStyle":null},"content":[{"type":"paragraph","attrs":{"indent":0,"number":5,"align":null,"origin":null},"content":[{"type":"text","marks":[{"type":"color","attrs":{"color":"#494949","name":"user"}}],"text":"電離層數據集"}]}]},{"type":"listitem","attrs":{"listStyle":null},"content":[{"type":"paragraph","attrs":{"indent":0,"number":6,"align":null,"origin":null},"content":[{"type":"text","marks":[{"type":"color","attrs":{"color":"#494949","name":"user"}}],"text":"Fashion 
MNIST數據集"}]}]},{"type":"listitem","attrs":{"listStyle":null},"content":[{"type":"paragraph","attrs":{"indent":0,"number":7,"align":null,"origin":null},"content":[{"type":"text","marks":[{"type":"color","attrs":{"color":"#494949","name":"user"}}],"text":"貓與狗數據集"}]}]},{"type":"listitem","attrs":{"listStyle":null},"content":[{"type":"paragraph","attrs":{"indent":0,"number":8,"align":null,"origin":null},"content":[{"type":"text","marks":[{"type":"color","attrs":{"color":"#494949","name":"user"}}],"text":"威斯康星州乳腺癌(診斷)數據集"}]}]},{"type":"listitem","attrs":{"listStyle":null},"content":[{"type":"paragraph","attrs":{"indent":0,"number":9,"align":null,"origin":null},"content":[{"type":"text","marks":[{"type":"color","attrs":{"color":"#494949","name":"user"}}],"text":"Twitter情緒分析和Sentiment140數據集"}]}]},{"type":"listitem","attrs":{"listStyle":null},"content":[{"type":"paragraph","attrs":{"indent":0,"number":10,"align":null,"origin":null},"content":[{"type":"text","marks":[{"type":"color","attrs":{"color":"#494949","name":"user"}}],"text":"BBC新聞數據集"}]}]},{"type":"listitem","attrs":{"listStyle":null},"content":[{"type":"paragraph","attrs":{"indent":0,"number":11,"align":null,"origin":null},"content":[{"type":"text","marks":[{"type":"color","attrs":{"color":"#494949","name":"user"}}],"text":"垃圾短信分類器數據集"}]}]},{"type":"listitem","attrs":{"listStyle":null},"content":[{"type":"paragraph","attrs":{"indent":0,"number":12,"align":null,"origin":null},"content":[{"type":"text","marks":[{"type":"color","attrs":{"color":"#494949","name":"user"}}],"text":"CelebA數據集"}]}]},{"type":"listitem","attrs":{"listStyle":null},"content":[{"type":"paragraph","attrs":{"indent":0,"number":13,"align":null,"origin":null},"content":[{"type":"text","marks":[{"type":"color","attrs":{"color":"#494949","name":"user"}}],"text":"YouTube-8M數據集"}]}]},{"type":"listitem","attrs":{"listStyle":null},"content":[{"type":"paragraph","attrs":{"indent":0,"number":14,"align":null,"origin":null},"content":[{"type":"text","marks":
[{"type":"color","attrs":{"color":"#494949","name":"user"}}],"text":"亞馬遜評論數據集"}]}]},{"type":"listitem","attrs":{"listStyle":null},"content":[{"type":"paragraph","attrs":{"indent":0,"number":15,"align":null,"origin":null},"content":[{"type":"text","marks":[{"type":"color","attrs":{"color":"#494949","name":"user"}}],"text":"紙幣驗證數據集"}]}]},{"type":"listitem","attrs":{"listStyle":null},"content":[{"type":"paragraph","attrs":{"indent":0,"number":16,"align":null,"origin":null},"content":[{"type":"text","marks":[{"type":"color","attrs":{"color":"#494949","name":"user"}}],"text":"LabelMe數據集"}]}]},{"type":"listitem","attrs":{"listStyle":null},"content":[{"type":"paragraph","attrs":{"indent":0,"number":17,"align":null,"origin":null},"content":[{"type":"text","marks":[{"type":"color","attrs":{"color":"#494949","name":"user"}}],"text":"聲納數據集"}]}]},{"type":"listitem","attrs":{"listStyle":null},"content":[{"type":"paragraph","attrs":{"indent":0,"number":18,"align":null,"origin":null},"content":[{"type":"text","marks":[{"type":"color","attrs":{"color":"#494949","name":"user"}}],"text":"皮馬印第安人糖尿病數據集"}]}]},{"type":"listitem","attrs":{"listStyle":null},"content":[{"type":"paragraph","attrs":{"indent":0,"number":19,"align":null,"origin":null},"content":[{"type":"text","marks":[{"type":"color","attrs":{"color":"#494949","name":"user"}}],"text":"小麥種子數據集"}]}]},{"type":"listitem","attrs":{"listStyle":null},"content":[{"type":"paragraph","attrs":{"indent":0,"number":20,"align":null,"origin":null},"content":[{"type":"text","marks":[{"type":"color","attrs":{"color":"#494949","name":"user"}}],"text":"Jeopardy!數據集"}]}]},{"type":"listitem","attrs":{"listStyle":null},"content":[{"type":"paragraph","attrs":{"indent":0,"number":21,"align":null,"origin":null},"content":[{"type":"text","marks":[{"type":"color","attrs":{"color":"#494949","name":"user"}}],"text":"鮑魚數據集"}]}]},{"type":"listitem","attrs":{"listStyle":null},"content":[{"type":"paragraph","attrs":{"indent":0,"number":22,"align":null,"origin
":null},"content":[{"type":"text","marks":[{"type":"color","attrs":{"color":"#494949","name":"user"}}],"text":"假新聞檢測數據集"}]}]},{"type":"listitem","attrs":{"listStyle":null},"content":[{"type":"paragraph","attrs":{"indent":0,"number":23,"align":null,"origin":null},"content":[{"type":"text","marks":[{"type":"color","attrs":{"color":"#494949","name":"user"}}],"text":"ImageNet數據集"}]}]}]},{"type":"heading","attrs":{"align":null,"level":2},"content":[{"type":"text","marks":[{"type":"color","attrs":{"color":"#494949","name":"user"}}],"text":"1.帕爾默企鵝數據集"}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null}},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","marks":[{"type":"color","attrs":{"color":"#494949","name":"user"}}],"text":"這是迄今爲止我最喜歡的數據集。我在最近寫的書裏的大多數示例都來自於它。簡單來說,如果你在Iris數據集上做實驗做膩了就可以嘗試一下這一個。它由Kristen Gorman博士和南極洲LTER的帕爾默科考站共同創建。該數據集本質上是由兩個數據集組成的,每個數據集包含344只企鵝的數據。"}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null}},{"type":"image","attrs":{"src":"https:\/\/static001.infoq.cn\/resource\/image\/44\/02\/44bd8b0015d37463f48abbd8b9812b02.jpg","alt":null,"title":"","style":[{"key":"width","value":"75%"},{"key":"bordertype","value":"none"}],"href":"","fromPaste":false,"pastePass":false}},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null}},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","marks":[{"type":"color","attrs":{"color":"#494949","name":"user"}}],"text":"就像Iris一樣,這個數據集裏有來自帕爾默羣島3個島嶼的3種不同種類的企鵝,分別是Adelie、Chinstrap和Gentoo。或許“Gentoo”聽起來很耳熟,那是因爲Gentoo Linux就是以它命名的!此外,這些數據集包含每個物種的culmen維度。這裏culmen是鳥喙的上脊。在簡化的企鵝數據中,culmen長度和深度被重命名爲變量culmen_length_mm和culmen_depth_mm。"}]},{"type":"heading","attrs":{"align":null,"level":3},"content":[{"type":"text","text":"1.1 
數據集樣本"}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null}},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","marks":[{"type":"color","attrs":{"color":"#494949","name":"user"}}],"text":"我們加載數據,看看它是什麼樣的:"}]},{"type":"codeblock","attrs":{"lang":null},"content":[{"type":"text","text":"data = pd.read_csv(f\".\\\\Datasets\\\\penguins_size.csv\")\ndata.head()"}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null}},{"type":"image","attrs":{"src":"https:\/\/static001.infoq.cn\/resource\/image\/66\/16\/666c5a0b80e4d1c864dc248dd581c216.jpg","alt":null,"title":"","style":[{"key":"width","value":"75%"},{"key":"bordertype","value":"none"}],"href":"","fromPaste":false,"pastePass":false}},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null}},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","marks":[{"type":"color","attrs":{"color":"#494949","name":"user"}}],"text":"我們使用Pandas庫來做數據可視化,並且加載的是一個更簡單的數據集。"}]},{"type":"heading","attrs":{"align":null,"level":3},"content":[{"type":"text","text":"1.2 這個公共數據集適合解決什麼問題?"}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null}},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","marks":[{"type":"color","attrs":{"color":"#494949","name":"user"}}],"text":"它是練習解決分類和聚類問題的好幫手。在這裏,你可以嘗試各種分類算法,如決策樹、隨機森林、SVM,或把它用於聚類問題並練習使用無監督學習。"}]},{"type":"heading","attrs":{"align":null,"level":3},"content":[{"type":"text","text":"1.3 
有用的鏈接"}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","marks":[{"type":"color","attrs":{"color":"#494949","name":"user"}}],"text":"在以下鏈接中可以獲得有關PalmerPenguins數據集的更多信息:"}]},{"type":"bulletedlist","content":[{"type":"listitem","attrs":{"listStyle":null},"content":[{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"link","attrs":{"href":"https:\/\/allisonhorst.github.io\/palmerpenguins\/articles\/intro.html","title":null,"type":null},"content":[{"type":"text","text":"介紹"}],"marks":[{"type":"color","attrs":{"color":"#494949","name":"user"}}]}]}]},{"type":"listitem","attrs":{"listStyle":null},"content":[{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"link","attrs":{"href":"https:\/\/github.com\/allisonhorst\/palmerpenguins","title":null,"type":null},"content":[{"type":"text","text":"GitHub"}],"marks":[{"type":"color","attrs":{"color":"#494949","name":"user"}}]}]}]},{"type":"listitem","attrs":{"listStyle":null},"content":[{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"link","attrs":{"href":"https:\/\/www.kaggle.com\/parulpandey\/palmer-archipelago-antarctica-penguin-data","title":null,"type":null},"content":[{"type":"text","text":"Kaggle"}],"marks":[{"type":"color","attrs":{"color":"#494949","name":"user"}}]}]}]}]},{"type":"heading","attrs":{"align":null,"level":2},"content":[{"type":"text","marks":[{"type":"color","attrs":{"color":"#494949","name":"user"}}],"text":"2.共享單車需求數據集"}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null}},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","marks":[{"type":"color","attrs":{"color":"#494949","name":"user"}}],"text":"這個數據集非常有趣。它對於初學者來說有點複雜,但也正因如此,它很適合拿來做練習。它包含了華盛頓特區“首都自行車共享計劃”中自行車租賃需求的數據,自行車共享和租賃系統通常是很好的信息來源。這個數據集包含了有關騎行持續時間、出發地點、到達地點和經過時間的信息,
還包含了每一天每小時的天氣信息。"}]},{"type":"image","attrs":{"src":"https:\/\/static001.infoq.cn\/resource\/image\/3a\/77\/3abe346f35f1c67ba5b3785da71b3477.jpg","alt":null,"title":"","style":[{"key":"width","value":"75%"},{"key":"bordertype","value":"none"}],"href":"","fromPaste":false,"pastePass":false}},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null}},{"type":"heading","attrs":{"align":null,"level":3},"content":[{"type":"text","text":"2.1 數據集樣本"}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null}},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","marks":[{"type":"color","attrs":{"color":"#494949","name":"user"}}],"text":"我們加載數據,看看它是什麼樣的。首先,我們使用數據集的每小時數據來執行操作:"}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null}},{"type":"codeblock","attrs":{"lang":null},"content":[{"type":"text","text":"data = pd.read_csv(f\".\\\\Datasets\\\\hour.csv\")\ndata.head()"}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null}},{"type":"image","attrs":{"src":"https:\/\/static001.infoq.cn\/resource\/image\/89\/1d\/897db3c82a0689f0092a8ef354388e1d.jpg","alt":null,"title":"","style":[{"key":"width","value":"75%"},{"key":"bordertype","value":"none"}],"href":"","fromPaste":false,"pastePass":false}},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null}},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","marks":[{"type":"color","attrs":{"color":"#494949","name":"user"}}],"text":"每日數據是下面的樣子:"}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null}},{"type":"codeblock","attrs":{"lang":null},"content":[{"type":"text","text":"data = 
pd.read_csv(f\".\\\\Datasets\\\\day.csv\")\ndata.head()"}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null}},{"type":"image","attrs":{"src":"https:\/\/static001.infoq.cn\/resource\/image\/f6\/2a\/f6a1b0c601139df317202f36f85e642a.jpg","alt":null,"title":"","style":[{"key":"width","value":"75%"},{"key":"bordertype","value":"none"}],"href":"","fromPaste":false,"pastePass":false}},{"type":"heading","attrs":{"align":null,"level":3},"content":[{"type":"text","text":"2.2 這個公共數據集適合解決什麼問題?"}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null}},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","marks":[{"type":"color","attrs":{"color":"#494949","name":"user"}}],"text":"由於該數據集包含的信息種類繁多,因此非常適合練習解決迴歸問題。你可以嘗試對其使用多元線性迴歸,或使用神經網絡。"}]},{"type":"heading","attrs":{"align":null,"level":3},"content":[{"type":"text","text":"2.3 有用的鏈接"}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null}},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","marks":[{"type":"color","attrs":{"color":"#494949","name":"user"}}],"text":"在以下鏈接中可以獲得關於該數據集的更多信息:"}]},{"type":"bulletedlist","content":[{"type":"listitem","attrs":{"listStyle":null},"content":[{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"link","attrs":{"href":"https:\/\/archive.ics.uci.edu\/ml\/datasets\/bike+sharing+dataset","title":null,"type":null},"content":[{"type":"text","text":"UCI"}],"marks":[{"type":"color","attrs":{"color":"#494949","name":"user"}}]}]}]},{"type":"listitem","attrs":{"listStyle":null},"content":[{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"link","attrs":{"href":"https:\/\/www.kaggle.com\/c\/bike-sharing-demand","title":null,"type":null},"content":[{"type":"text","text":"Kaggle"}],"marks":[{"type":"color","attrs":{"color":"#49494
9","name":"user"}}]}]}]}]},{"type":"heading","attrs":{"align":null,"level":2},"content":[{"type":"text","marks":[{"type":"color","attrs":{"color":"#494949","name":"user"}}],"text":"3.葡萄酒分類數據集"}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null}},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","marks":[{"type":"color","attrs":{"color":"#494949","name":"user"}}],"text":"這是一個經典之作。如果你喜歡葡萄樹或計劃成爲侍酒師,肯定會更中意它的。該數據集由兩個數據集組成。兩者都包含來自葡萄牙Vinho Verde地區的葡萄酒的化學指標,一種用於紅葡萄酒,另一種用於白葡萄酒。由於隱私限制,數據集裏沒有關於葡萄種類、葡萄酒品牌、葡萄酒售價的數據,但有關於葡萄酒質量的信息。"}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null}},{"type":"image","attrs":{"src":"https:\/\/static001.infoq.cn\/resource\/image\/a6\/c1\/a64d291d6cffc61121bfc9a92fc60fc1.jpg","alt":null,"title":"","style":[{"key":"width","value":"75%"},{"key":"bordertype","value":"none"}],"href":"","fromPaste":false,"pastePass":false}},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null}},{"type":"heading","attrs":{"align":null,"level":3},"content":[{"type":"text","text":"3.1 數據集樣本"}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null}},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","marks":[{"type":"color","attrs":{"color":"#494949","name":"user"}}],"text":"我們加載數據,看看它是什麼樣的:"}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null}},{"type":"codeblock","attrs":{"lang":null},"content":[{"type":"text","text":"data = 
pd.read_csv(f\".\\\\Datasets\\\\winequality-white.csv\")\ndata.head()"}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null}},{"type":"image","attrs":{"src":"https:\/\/static001.infoq.cn\/resource\/image\/1f\/f8\/1ff0d380ce1f9a85d186ee5afc2608f8.jpg","alt":null,"title":"","style":[{"key":"width","value":"75%"},{"key":"bordertype","value":"none"}],"href":"","fromPaste":false,"pastePass":false}},{"type":"heading","attrs":{"align":null,"level":3},"content":[{"type":"text","text":"3.2 這個公共數據集適合解決什麼問題?"}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null}},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","marks":[{"type":"color","attrs":{"color":"#494949","name":"user"}}],"text":"這是一個多類分類問題,但也可以被定義爲迴歸問題。它的分類數據是不均衡的(例如,正常葡萄酒的數量比優質或差的葡萄酒多得多),很適合針對不均衡數據集的分類練習。除此之外,數據集中所有特徵並不都是相關的,因此也可以拿來練習特徵工程和特徵選擇。"}]},{"type":"heading","attrs":{"align":null,"level":3},"content":[{"type":"text","text":"3.3 
有用的鏈接"}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null}},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","marks":[{"type":"color","attrs":{"color":"#494949","name":"user"}}],"text":"以下鏈接中可以找到關於這個數據集的更多信息:"}]},{"type":"bulletedlist","content":[{"type":"listitem","attrs":{"listStyle":null},"content":[{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"link","attrs":{"href":"https:\/\/www.vinhoverde.pt\/en\/about-vinho-verde","title":null,"type":null},"content":[{"type":"text","text":"介紹"}],"marks":[{"type":"color","attrs":{"color":"#494949","name":"user"}}]}]}]},{"type":"listitem","attrs":{"listStyle":null},"content":[{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"link","attrs":{"href":"https:\/\/archive.ics.uci.edu\/ml\/datasets\/Wine+Quality","title":null,"type":null},"content":[{"type":"text","text":"UCI"}],"marks":[{"type":"color","attrs":{"color":"#494949","name":"user"}}]}]}]}]},{"type":"heading","attrs":{"align":null,"level":2},"content":[{"type":"text","marks":[{"type":"color","attrs":{"color":"#494949","name":"user"}}],"text":"4.波士頓住房數據集"}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null}},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","marks":[{"type":"color","attrs":{"color":"#494949","name":"user"}}],"text":"雖然我說過會盡量不推薦其他人都推薦的那種數據集,但這個數據集實在太經典了。許多教程、示例和"},{"type":"link","attrs":{"href":"https:\/\/rubikscode.net\/ultimate-guide-to-machine-learning-with-python\/","title":null,"type":null},"content":[{"type":"text","text":"書籍"}],"marks":[{"type":"color","attrs":{"color":"#494949","name":"user"}}]},{"type":"text","marks":[{"type":"color","attrs":{"color":"#494949","name":"user"}}],"text":"都使用過它。這個數據集由14個特徵組成,包含美國人口普查局收集的關於馬薩諸塞州波士頓地區住房的信息。這是一個只有506個樣本的小數據集。"}]},{"type":"heading","attrs":{
"align":null,"level":3},"content":[{"type":"text","text":"4.1 數據集樣本"}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null}},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","marks":[{"type":"color","attrs":{"color":"#494949","name":"user"}}],"text":"我們加載數據,看看它是什麼樣的:"}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null}},{"type":"codeblock","attrs":{"lang":null},"content":[{"type":"text","text":"data = pd.read_csv(f\".\\\\Datasets\\\\boston_housing.csv\")\ndata.head()"}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null}},{"type":"image","attrs":{"src":"https:\/\/static001.infoq.cn\/resource\/image\/49\/ae\/496d2cd410ea8abfd4d704fac1762cae.jpg","alt":null,"title":"","style":[{"key":"width","value":"75%"},{"key":"bordertype","value":"none"}],"href":"","fromPaste":false,"pastePass":false}},{"type":"heading","attrs":{"align":null,"level":3},"content":[{"type":"text","text":"4.2 這個公共數據集適合解決什麼問題?"}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null}},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","marks":[{"type":"color","attrs":{"color":"#494949","name":"user"}}],"text":"該數據集非常適合練習迴歸任務。請注意,因爲這是一個小數據集,你可能會得到樂觀的結果。"}]},{"type":"heading","attrs":{"align":null,"level":3},"content":[{"type":"text","text":"4.3 
有用的鏈接"}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null}},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","marks":[{"type":"color","attrs":{"color":"#494949","name":"user"}}],"text":"從以下鏈接中可以找到關於這個數據集的更多信息:"}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null}},{"type":"bulletedlist","content":[{"type":"listitem","attrs":{"listStyle":null},"content":[{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"link","attrs":{"href":"https:\/\/www.cs.toronto.edu\/~delve\/data\/boston\/bostonDetail.html","title":null,"type":null},"content":[{"type":"text","text":"介紹"}],"marks":[{"type":"color","attrs":{"color":"#494949","name":"user"}}]}]}]},{"type":"listitem","attrs":{"listStyle":null},"content":[{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"link","attrs":{"href":"https:\/\/www.kaggle.com\/c\/boston-housing","title":null,"type":null},"content":[{"type":"text","text":"Kaggle"}],"marks":[{"type":"color","attrs":{"color":"#494949","name":"user"}}]}]}]}]},{"type":"heading","attrs":{"align":null,"level":2},"content":[{"type":"text","marks":[{"type":"color","attrs":{"color":"#494949","name":"user"}}],"text":"5.電離層數據集"}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null}},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","marks":[{"type":"color","attrs":{"color":"#494949","name":"user"}}],"text":"這也是一個經典數據集。它實際上起源於1989年,但它確實很有趣。該數據集包含由拉布拉多鵝灣的雷達系統收集的數據。該系統由16個高頻天線的相控陣列組成,旨在檢測電離層中的自由電子。一般來說,電離層有兩種類型的結構:“好”和“壞”。這些雷達會檢測這些結構並傳遞信號。數據集中有34個自變量和1個因變量,總共有351個觀測值。"}]},{"type":"heading","attrs":{"align":null,"level":3},"content":[{"type":"text","text":"5.1 
數據集樣本"}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null}},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","marks":[{"type":"color","attrs":{"color":"#494949","name":"user"}}],"text":"我們加載數據,看看它是什麼樣的:"}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null}},{"type":"codeblock","attrs":{"lang":null},"content":[{"type":"text","text":"data = pd.read_csv(f\".\\\\Datasets\\\\ionsphere.csv\")\ndata.head()"}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null}},{"type":"image","attrs":{"src":"https:\/\/static001.infoq.cn\/resource\/image\/1f\/f8\/1ff0d380ce1f9a85d186ee5afc2608f8.jpg","alt":null,"title":"","style":[{"key":"width","value":"75%"},{"key":"bordertype","value":"none"}],"href":"","fromPaste":false,"pastePass":false}},{"type":"heading","attrs":{"align":null,"level":3},"content":[{"type":"text","text":"5.2 這個公共數據集適合解決什麼問題?"}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null}},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","marks":[{"type":"color","attrs":{"color":"#494949","name":"user"}}],"text":"這顯然是一個二元(2類)分類問題。有趣的是,這是一個不均衡的數據集,所以你也可以用它來練習不均衡數據的處理。在這個數據集上實現高精度也非易事,基線性能在64%左右,而最高精度在94%左右。"}]},{"type":"heading","attrs":{"align":null,"level":3},"content":[{"type":"text","text":"5.3 
有用的鏈接"}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null}},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","marks":[{"type":"color","attrs":{"color":"#494949","name":"user"}}],"text":"從以下鏈接中可以找到關於這個數據集的更多信息:"}]},{"type":"bulletedlist","content":[{"type":"listitem","attrs":{"listStyle":null},"content":[{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"link","attrs":{"href":"https:\/\/archive.ics.uci.edu\/ml\/datasets\/Ionosphere","title":null,"type":null},"content":[{"type":"text","text":"UCI"}],"marks":[{"type":"color","attrs":{"color":"#494949","name":"user"}}]}]}]}]},{"type":"heading","attrs":{"align":null,"level":2},"content":[{"type":"text","marks":[{"type":"color","attrs":{"color":"#494949","name":"user"}}],"text":"6.Fashion MNIST數據集"}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null}},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","marks":[{"type":"color","attrs":{"color":"#494949","name":"user"}}],"text":"MNIST數據集是用於練習圖像分類和圖像識別的著名數據集,然而它有點被濫用了。如果你想要一個簡單的數據集來練習圖像分類,你可以試試Fashion MNIST。它曾被《機器學習終極指南》拿來做圖像分類示例。"}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","marks":[{"type":"color","attrs":{"color":"#494949","name":"user"}}],"text":" "}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","marks":[{"type":"color","attrs":{"color":"#494949","name":"user"}}],"text":"本質上,這個數據集是MNIST數據集的變體,它與MNIST數據集具有相同的結構,也就是說它有一個60,000個樣本的訓練集和一個10,000個服裝圖像的測試集。所有圖像都經過尺寸歸一化和居中。圖像的大小也固定爲28×28,這樣預處理的圖像數據被減到了最小水平。它也可作爲某些框架(如TensorFlow或PyTorch)的一部分使用。"}]},{"type":"heading","attrs":{"align":null,"level":3},"content":[{"type":"text","text":"6.1 
數據集樣本"}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null}},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","marks":[{"type":"color","attrs":{"color":"#494949","name":"user"}}],"text":"我們加載數據,看看它是什麼樣的:"}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null}},{"type":"image","attrs":{"src":"https:\/\/static001.infoq.cn\/resource\/image\/80\/1d\/802ee4aab73399b486d8dc59e368fb1d.jpg","alt":null,"title":"","style":[{"key":"width","value":"75%"},{"key":"bordertype","value":"none"}],"href":"","fromPaste":false,"pastePass":false}},{"type":"heading","attrs":{"align":null,"level":3},"content":[{"type":"text","text":"6.2 這個公共數據集適合解決什麼問題?"}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null}},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","marks":[{"type":"color","attrs":{"color":"#494949","name":"user"}}],"text":"它最適合圖像分類和圖像生成任務。你可以使用簡單的卷積神經網絡(CNN)來做嘗試,或者用它訓練生成對抗網絡(GAN)來生成圖像。"}]},{"type":"heading","attrs":{"align":null,"level":3},"content":[{"type":"text","text":"6.3 
有用的鏈接"}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null}},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","marks":[{"type":"color","attrs":{"color":"#494949","name":"user"}}],"text":"從以下鏈接中可以找到關於這個數據集的更多信息:"}]},{"type":"bulletedlist","content":[{"type":"listitem","attrs":{"listStyle":null},"content":[{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"link","attrs":{"href":"https:\/\/github.com\/zalandoresearch\/fashion-mnist","title":null,"type":null},"content":[{"type":"text","text":"GitHub"}],"marks":[{"type":"color","attrs":{"color":"#494949","name":"user"}}]}]}]},{"type":"listitem","attrs":{"listStyle":null},"content":[{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"link","attrs":{"href":"https:\/\/www.kaggle.com\/zalando-research\/fashionmnist","title":null,"type":null},"content":[{"type":"text","text":"Kaggle"}],"marks":[{"type":"color","attrs":{"color":"#494949","name":"user"}}]}]}]}]},{"type":"heading","attrs":{"align":null,"level":2},"content":[{"type":"text","marks":[{"type":"color","attrs":{"color":"#494949","name":"user"}}],"text":"7.貓與狗數據集"}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null}},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","marks":[{"type":"color","attrs":{"color":"#494949","name":"user"}}],"text":"這是一個包含貓狗圖像的數據集。這個數據集包含23,262張貓和狗的圖像,用於二值圖像分類。在主文件夾中,你會找到兩個文件夾train1和test。"}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","marks":[{"type":"color","attrs":{"color":"#494949","name":"user"}}],"text":" 
"}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","marks":[{"type":"color","attrs":{"color":"#494949","name":"user"}}],"text":"train1文件夾包含訓練圖像,而test文件夾包含測試圖像。請注意,圖像名稱以cat或dog開頭。這些名稱本質上是我們的標籤,這意味着我們將使用這些名稱定義目標。"}]},{"type":"heading","attrs":{"align":null,"level":3},"content":[{"type":"text","text":"7.1 數據集樣本"}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null}},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","marks":[{"type":"color","attrs":{"color":"#494949","name":"user"}}],"text":"我們加載數據,看看它是什麼樣的:"}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null}},{"type":"image","attrs":{"src":"https:\/\/static001.infoq.cn\/resource\/image\/a2\/86\/a272e313afdf6efb23b63ab4e5932d86.jpg","alt":null,"title":"","style":[{"key":"width","value":"75%"},{"key":"bordertype","value":"none"}],"href":"","fromPaste":false,"pastePass":false}},{"type":"heading","attrs":{"align":null,"level":3},"content":[{"type":"text","text":"7.2 這個公共數據集適合解決什麼問題?"}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null}},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","marks":[{"type":"color","attrs":{"color":"#494949","name":"user"}}],"text":"這個數據集有兩重目標。首先,它可用於練習圖像分類以及對象檢測。其次,你可以在這裏面找到無窮無盡的可愛圖片。"}]},{"type":"heading","attrs":{"align":null,"level":3},"content":[{"type":"text","text":"7.3 
有用的鏈接"}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null}},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","marks":[{"type":"color","attrs":{"color":"#494949","name":"user"}}],"text":"以下鏈接中可以找到關於這個數據集的更多信息:"}]},{"type":"bulletedlist","content":[{"type":"listitem","attrs":{"listStyle":null},"content":[{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"link","attrs":{"href":"https:\/\/www.microsoft.com\/en-us\/download\/details.aspx?id=54765","title":null,"type":null},"content":[{"type":"text","text":"介紹"}],"marks":[{"type":"color","attrs":{"color":"#494949","name":"user"}}]}]}]},{"type":"listitem","attrs":{"listStyle":null},"content":[{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"link","attrs":{"href":"https:\/\/www.kaggle.com\/c\/dogs-vs-cats","title":null,"type":null},"content":[{"type":"text","text":"Kaggle"}],"marks":[{"type":"color","attrs":{"color":"#494949","name":"user"}}]}]}]}]},{"type":"heading","attrs":{"align":null,"level":2},"content":[{"type":"text","marks":[{"type":"color","attrs":{"color":"#494949","name":"user"}}],"text":"8.威斯康星州乳腺癌(診斷)數據集"}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null}},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","marks":[{"type":"color","attrs":{"color":"#494949","name":"user"}}],"text":"機器學習和深度學習技術在醫療保健領域中的應用正在穩步增長。如果你想練習並瞭解使用此類數據的效果,這個數據集是一個不錯的選擇。在該數據集中,數據是通過處理乳房腫塊的細針穿刺(FNA)的數字化圖像提取出來的。該數據集中的每個特徵都描述了上述數字化圖像中發現的細胞核的特徵。"}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null}},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","marks":[{"type":"color","attrs":{"color":"#494949","name":"user"}}],"text":"該數據集由569個樣本組成,其中包括357個良性樣本和212個惡性樣本。這個數據集中有三類特徵,其中實值特徵最有趣。它們是從數字化圖像中計算出來
的,包含有關面積、細胞半徑、紋理等信息。"}]},{"type":"heading","attrs":{"align":null,"level":3},"content":[{"type":"text","text":"8.1 數據集樣本"}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null}},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","marks":[{"type":"color","attrs":{"color":"#494949","name":"user"}}],"text":"我們加載數據,看看它是什麼樣的:"}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null}},{"type":"codeblock","attrs":{"lang":null},"content":[{"type":"text","text":"data = pd.read_csv(f\".\\\\Datasets\\\\breast-cancer-wisconsin.csv\")\ndata.head()"}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null}},{"type":"image","attrs":{"src":"https:\/\/static001.infoq.cn\/resource\/image\/52\/42\/52cc9690308983bebdfe02c8bd1dac42.jpg","alt":null,"title":"","style":[{"key":"width","value":"75%"},{"key":"bordertype","value":"none"}],"href":"","fromPaste":false,"pastePass":false}},{"type":"heading","attrs":{"align":null,"level":3},"content":[{"type":"text","text":"8.2 這個公共數據集適合解決什麼問題?"}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null}},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","marks":[{"type":"color","attrs":{"color":"#494949","name":"user"}}],"text":"這個醫療保健數據集適合練習分類和隨機森林、SVM等算法。"}]},{"type":"heading","attrs":{"align":null,"level":3},"content":[{"type":"text","text":"8.3 
有用的鏈接"}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null}},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","marks":[{"type":"color","attrs":{"color":"#494949","name":"user"}}],"text":"從以下鏈接中可以找到關於這個數據集的更多信息:"}]},{"type":"bulletedlist","content":[{"type":"listitem","attrs":{"listStyle":null},"content":[{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"link","attrs":{"href":"https:\/\/www.kaggle.com\/uciml\/breast-cancer-wisconsin-data","title":null,"type":null},"content":[{"type":"text","text":"Kaggle"}],"marks":[{"type":"color","attrs":{"color":"#494949","name":"user"}}]}]}]},{"type":"listitem","attrs":{"listStyle":null},"content":[{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"link","attrs":{"href":"https:\/\/archive.ics.uci.edu\/ml\/datasets\/Breast+Cancer+Wisconsin+(Diagnostic)","title":null,"type":null},"content":[{"type":"text","text":"UCI"}],"marks":[{"type":"color","attrs":{"color":"#494949","name":"user"}}]}]}]}]},{"type":"heading","attrs":{"align":null,"level":2},"content":[{"type":"text","marks":[{"type":"color","attrs":{"color":"#494949","name":"user"}}],"text":"9.Twitter情緒分析和Sentiment140數據集"}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null}},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","marks":[{"type":"color","attrs":{"color":"#494949","name":"user"}}],"text":"在過去幾年中,情緒分析成爲了一種監控和了解客戶反饋的重要工具。這種對消息和響應所攜帶的潛在情緒基調的檢測過程是完全自動化的,這意味着企業可以更好更快地瞭解客戶的需求並提供更好的產品和服務。"}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null}},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","marks":[{"type":"color","attrs":{"color":"#494949","name":"user"}}],"text":"這一過程是通過應用各種NLP(自然語言處理)技術來完成的。這些數據集可以幫助你練習此類技術,實際上非常適合該領域的初學者。Sen
timent140包含了使用Twitter API提取的1,600,000條推文。它們的結構略有不同。"}]},{"type":"heading","attrs":{"align":null,"level":3},"content":[{"type":"text","text":"9.1 數據集樣本"}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null}},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","marks":[{"type":"color","attrs":{"color":"#494949","name":"user"}}],"text":"我們加載數據,看看它是什麼樣的:"}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null}},{"type":"codeblock","attrs":{"lang":null},"content":[{"type":"text","text":"data = pd.read_csv(f\".\\\\Datasets\\\\training.1600000.processed.noemoticon.csv\")\ndata.head()"}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null}},{"type":"image","attrs":{"src":"https:\/\/static001.infoq.cn\/resource\/image\/03\/cb\/035cb72c1555fdaa2b55a539c055c6cb.jpg","alt":null,"title":"","style":[{"key":"width","value":"75%"},{"key":"bordertype","value":"none"}],"href":"","fromPaste":false,"pastePass":false}},{"type":"heading","attrs":{"align":null,"level":3},"content":[{"type":"text","text":"9.2 這個公共數據集適合解決什麼問題?"}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null}},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","marks":[{"type":"color","attrs":{"color":"#494949","name":"user"}}],"text":"如前所述,這是一個用於情緒分析的數據集。情緒分析是最常見的文本分類工具。該過程會分析文本片段以確定其中包含的情緒是積極的、消極的還是中性的。瞭解品牌和產品引發的社會情緒是現代企業必不可少的工具之一。"}]},{"type":"heading","attrs":{"align":null,"level":3},"content":[{"type":"text","text":"9.3 
有用的鏈接"}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null}},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","marks":[{"type":"color","attrs":{"color":"#494949","name":"user"}}],"text":"從以下鏈接中可以找到關於這個數據集的更多信息:"}]},{"type":"bulletedlist","content":[{"type":"listitem","attrs":{"listStyle":null},"content":[{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"link","attrs":{"href":"https:\/\/www.kaggle.com\/c\/twitter-sentiment-analysis2","title":null,"type":null},"content":[{"type":"text","text":"Kaggle"}],"marks":[{"type":"color","attrs":{"color":"#494949","name":"user"}}]}]}]},{"type":"listitem","attrs":{"listStyle":null},"content":[{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"link","attrs":{"href":"https:\/\/www.kaggle.com\/kazanova\/sentiment140","title":null,"type":null},"content":[{"type":"text","text":"Kaggle"}],"marks":[{"type":"color","attrs":{"color":"#494949","name":"user"}}]}]}]}]},{"type":"heading","attrs":{"align":null,"level":2},"content":[{"type":"text","marks":[{"type":"color","attrs":{"color":"#494949","name":"user"}}],"text":"10.BBC新聞數據集"}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null}},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","marks":[{"type":"color","attrs":{"color":"#494949","name":"user"}}],"text":"我們再來看這個類別中另一個有趣的文本數據集。該數據集來自BBC新聞。它由2225篇文章組成,每篇文章都有標籤。所有文章分成5個類別:科技、商業、政治、娛樂和體育。這個數據集沒有失衡,每個類別中的文章數量都是差不多的。"}]},{"type":"heading","attrs":{"align":null,"level":3},"content":[{"type":"text","text":"10.1 
數據集樣本"}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null}},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","marks":[{"type":"color","attrs":{"color":"#494949","name":"user"}}],"text":"我們加載數據,看看它是什麼樣的:"}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null}},{"type":"codeblock","attrs":{"lang":null},"content":[{"type":"text","text":"data = pd.read_csv(f\".\\\\Datasets\\\\BBC News Train.csv\")\ndata.head()"}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null}},{"type":"image","attrs":{"src":"https:\/\/static001.infoq.cn\/resource\/image\/d4\/a9\/d4296073f9ea37fe2be1bd962658e4a9.jpg","alt":null,"title":"","style":[{"key":"width","value":"75%"},{"key":"bordertype","value":"none"}],"href":"","fromPaste":false,"pastePass":false}},{"type":"heading","attrs":{"align":null,"level":3},"content":[{"type":"text","text":"10.2 這個公共數據集適合解決什麼問題?"}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null}},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","marks":[{"type":"color","attrs":{"color":"#494949","name":"user"}}],"text":"自然,這個數據集最適合用於文本分類練習。你也可以更進一步,練習分析每篇文章的情緒。總的來說,它適用於各種NLP任務和實踐。"}]},{"type":"heading","attrs":{"align":null,"level":3},"content":[{"type":"text","text":"10.3 
有用的鏈接"}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null}},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","marks":[{"type":"color","attrs":{"color":"#494949","name":"user"}}],"text":"從以下鏈接中可以找到關於這個數據集的更多信息:"}]},{"type":"bulletedlist","content":[{"type":"listitem","attrs":{"listStyle":null},"content":[{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"link","attrs":{"href":"https:\/\/www.kaggle.com\/c\/learn-ai-bbc","title":null,"type":null},"content":[{"type":"text","text":"Kaggle"}],"marks":[{"type":"color","attrs":{"color":"#494949","name":"user"}}]}]}]}]},{"type":"heading","attrs":{"align":null,"level":2},"content":[{"type":"text","marks":[{"type":"color","attrs":{"color":"#494949","name":"user"}}],"text":"11.垃圾短信分類器數據集"}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null}},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","marks":[{"type":"color","attrs":{"color":"#494949","name":"user"}}],"text":"垃圾消息檢測是互聯網中最早投入實踐的機器學習任務之一。這種任務也屬於NLP和文本分類工作。所以,如果你想練習解決這類問題,Spam SMS數據集是一個不錯的選擇。它在實踐中用得非常多,非常適合初學者。"}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null}},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","marks":[{"type":"color","attrs":{"color":"#494949","name":"user"}}],"text":"這個數據集最棒的一點是,它是從互聯網的多個來源構建的。例如,它從Grumbletext網站上提取了425條垃圾短信,從新加坡國立大學的NUS SMS Corpus(NSC)隨機選擇了3,375條短信,還有450條短信來自Caroline Tag的博士論文等。數據集本身由兩列組成:標籤(ham或spam)和原始文本。"}]},{"type":"heading","attrs":{"align":null,"level":3},"content":[{"type":"text","text":"11.1 
數據集樣本"}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null}},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","marks":[{"type":"color","attrs":{"color":"#494949","name":"user"}}],"text":"我們加載數據,看看它是什麼樣的:"}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null}},{"type":"codeblock","attrs":{"lang":null},"content":[{"type":"text","text":"ham What you doing?how are you?\nham Ok lar... Joking wif u oni...\nham dun say so early hor... U c already then say...\nham MY NO. IN LUTON 0125698789 RING ME IF UR AROUND! H*\nham Siva is in hostel aha:-.\nham Cos i was out shopping wif darren jus now n i called him 2 ask wat present he wan lor. Then he started guessing who i was wif n he finally guessed darren lor.\nspam FreeMsg: Txt: CALL to No: 86888 & claim your reward of 3 hours talk time to use from your phone now! ubscribe6GBP\/ mnth inc 3hrs 16 stop?txtStop\nspam Sunshine Quiz! Win a super Sony DVD recorder if you canname the capital of Australia? Text MQUIZ to 82277. B\nspam URGENT! Your Mobile No 07808726822 was awarded a L2,000 Bonus Caller Prize on 02\/09\/03! This is our 2nd attempt to contact YOU! 
Call 0871-872-9758 BOX95QU"}]},{"type":"heading","attrs":{"align":null,"level":3},"content":[{"type":"text","text":"11.2 這個公共數據集適合解決什麼問題?"}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null}},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","marks":[{"type":"color","attrs":{"color":"#494949","name":"user"}}],"text":"顧名思義,該數據集最適合用於垃圾短信檢測和文本分類。它也經常用在工作面試中,所以大家最好練習一下。"}]},{"type":"heading","attrs":{"align":null,"level":3},"content":[{"type":"text","text":"11.3 有用的鏈接"}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null}},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","marks":[{"type":"color","attrs":{"color":"#494949","name":"user"}}],"text":"從以下鏈接中可以找到關於這個數據集的更多信息:"}]},{"type":"bulletedlist","content":[{"type":"listitem","attrs":{"listStyle":null},"content":[{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"link","attrs":{"href":"https:\/\/archive.ics.uci.edu\/ml\/datasets\/sms+spam+collection","title":null,"type":null},"content":[{"type":"text","text":"UCI"}],"marks":[{"type":"color","attrs":{"color":"#494949","name":"user"}}]}]}]},{"type":"listitem","attrs":{"listStyle":null},"content":[{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"link","attrs":{"href":"https:\/\/www.kaggle.com\/uciml\/sms-spam-collection-dataset","title":null,"type":null},"content":[{"type":"text","text":"Kaggle"}],"marks":[{"type":"color","attrs":{"color":"#494949","name":"user"}}]}]}]}]},{"type":"heading","attrs":{"align":null,"level":2},"content":[{"type":"text","marks":[{"type":"color","attrs":{"color":"#494949","name":"user"}}],"text":"12.CelebA數據集"}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null}},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"typ
e":"text","marks":[{"type":"color","attrs":{"color":"#494949","name":"user"}}],"text":"如果你想研究人臉檢測解決方案、構建自己的人臉生成器或創建深度人臉僞造模型,那麼這個數據集就是你的最佳選擇。該數據集擁有超過20萬張名人圖像,每張圖像有40個屬性註釋,爲你的研究項目提供了一個很好的起點。此外,它還涵蓋了主要的姿勢和背景類別。"}]},{"type":"heading","attrs":{"align":null,"level":3},"content":[{"type":"text","text":"12.1 數據集樣本"}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null}},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","marks":[{"type":"color","attrs":{"color":"#494949","name":"user"}}],"text":"我們加載數據,看看它是什麼樣的:"}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null}},{"type":"image","attrs":{"src":"https:\/\/static001.infoq.cn\/resource\/image\/76\/ae\/76d0fd90c99a97fe4f6ed05f3bf29dae.jpg","alt":null,"title":"","style":[{"key":"width","value":"75%"},{"key":"bordertype","value":"none"}],"href":"","fromPaste":false,"pastePass":false}},{"type":"heading","attrs":{"align":null,"level":3},"content":[{"type":"text","text":"12.2 這個公共數據集適合解決什麼問題?"}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null}},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","marks":[{"type":"color","attrs":{"color":"#494949","name":"user"}}],"text":"我們可以用這個數據集解決多種問題。比如,我們可以解決各種人臉識別和計算機視覺問題,它可用來使用不同的生成算法生成圖像。此外,你可以使用它來開發新穎的深度人臉僞造模型或深度僞造檢測模型。"}]},{"type":"heading","attrs":{"align":null,"level":3},"content":[{"type":"text","text":"12.3 
有用的鏈接"}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null}},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","marks":[{"type":"color","attrs":{"color":"#494949","name":"user"}}],"text":"從以下鏈接中可以找到關於這個數據集的更多信息:"}]},{"type":"bulletedlist","content":[{"type":"listitem","attrs":{"listStyle":null},"content":[{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"link","attrs":{"href":"http:\/\/mmlab.ie.cuhk.edu.hk\/projects\/CelebA.html","title":null,"type":null},"content":[{"type":"text","text":"介紹"}],"marks":[{"type":"color","attrs":{"color":"#494949","name":"user"}}]}]}]}]},{"type":"heading","attrs":{"align":null,"level":2},"content":[{"type":"text","marks":[{"type":"color","attrs":{"color":"#494949","name":"user"}}],"text":"13.YouTube-8M數據集"}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null}},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","marks":[{"type":"color","attrs":{"color":"#494949","name":"user"}}],"text":"這是最大的多標籤視頻分類數據集。它來自谷歌,擁有800萬個帶有註釋和ID的YouTube分類視頻。這些視頻的註釋由YouTube視頻註釋系統使用4800個視覺實體的詞彙表創建。該詞彙表也可供下載。"}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","marks":[{"type":"color","attrs":{"color":"#494949","name":"user"}}],"text":" "}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","marks":[{"type":"color","attrs":{"color":"#494949","name":"user"}}],"text":"請注意,此數據集以TensorFlow Record文件的形式提供。除此之外,你還可以使用這個數據集的擴展——YouTube-8M Segments數據集。它包含了人工驗證的分段註釋。"}]},{"type":"heading","attrs":{"align":null,"level":3},"content":[{"type":"text","text":"13.1 
數據集樣本"}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null}},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","marks":[{"type":"color","attrs":{"color":"#494949","name":"user"}}],"text":"你可以使用以下命令下載它們:"}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null}},{"type":"codeblock","attrs":{"lang":null},"content":[{"type":"text","text":"mkdir -p ~\/yt8m\/2\/frame\/train\ncd ~\/yt8m\/2\/frame\/train\ncurl data.yt8m.org\/download.py | partition=2\/frame\/train mirror=us python"}]},{"type":"heading","attrs":{"align":null,"level":3},"content":[{"type":"text","text":"13.2 這個公共數據集適合解決什麼問題?"}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null}},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","marks":[{"type":"color","attrs":{"color":"#494949","name":"user"}}],"text":"你可以使用這個數據集執行多種操作。比如可以使用它跟進谷歌的競賽,並開發準確分配視頻級標籤的分類算法。你還可以用它來創建視頻分類模型,也可以用它練習所謂的時間概念定位,也就是找到並分享特定的視頻瞬間。"}]},{"type":"heading","attrs":{"align":null,"level":3},"content":[{"type":"text","text":"13.3 
有用的鏈接"}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null}},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","marks":[{"type":"color","attrs":{"color":"#494949","name":"user"}}],"text":"從以下鏈接中可以找到關於這個數據集的更多信息:"}]},{"type":"bulletedlist","content":[{"type":"listitem","attrs":{"listStyle":null},"content":[{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"link","attrs":{"href":"https:\/\/arxiv.org\/abs\/1609.08675","title":null,"type":null},"content":[{"type":"text","text":"介紹"}],"marks":[{"type":"color","attrs":{"color":"#494949","name":"user"}}]}]}]},{"type":"listitem","attrs":{"listStyle":null},"content":[{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"link","attrs":{"href":"http:\/\/research.google.com\/youtube8m\/","title":null,"type":null},"content":[{"type":"text","text":"下載"}],"marks":[{"type":"color","attrs":{"color":"#494949","name":"user"}}]}]}]}]},{"type":"heading","attrs":{"align":null,"level":2},"content":[{"type":"text","marks":[{"type":"color","attrs":{"color":"#494949","name":"user"}}],"text":"14.亞馬遜評論數據集"}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null}},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","marks":[{"type":"color","attrs":{"color":"#494949","name":"user"}}],"text":"情緒分析是最常見的文本分類工具。這個過程會分析文本片段以確定情緒傾向是積極的、消極的還是中性的。在監控在線會話時瞭解你的品牌、產品或服務引發的社會情緒是現代商業活動的基本工具之一,而情緒分析是實現這一目標的第一步。該數據集包含了來自亞馬遜的產品評論和元數據,包括1996年5月至2018年10月的2.331億條評論。"}]},{"type":"heading","attrs":{"align":null,"level":3},"content":[{"type":"text","text":"14.1 
這個公共數據集適合解決什麼問題?"}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null}},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","marks":[{"type":"color","attrs":{"color":"#494949","name":"user"}}],"text":"這個數據集可以爲任何產品創建情緒分析的入門模型,你可以使用它來快速創建可用於生產的模型。"}]},{"type":"heading","attrs":{"align":null,"level":3},"content":[{"type":"text","text":"14.2 有用的鏈接"}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null}},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","marks":[{"type":"color","attrs":{"color":"#494949","name":"user"}}],"text":"從以下鏈接中可以找到關於這個數據集的更多信息:"}]},{"type":"bulletedlist","content":[{"type":"listitem","attrs":{"listStyle":null},"content":[{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"link","attrs":{"href":"https:\/\/jmcauley.ucsd.edu\/data\/amazon\/","title":null,"type":null},"content":[{"type":"text","text":"介紹和下載"}],"marks":[{"type":"color","attrs":{"color":"#494949","name":"user"}}]}]}]}]},{"type":"heading","attrs":{"align":null,"level":2},"content":[{"type":"text","marks":[{"type":"color","attrs":{"color":"#494949","name":"user"}}],"text":"15.紙幣驗證數據集"}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null}},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","marks":[{"type":"color","attrs":{"color":"#494949","name":"user"}}],"text":"這是一個有趣的數據集。你可以使用它來創建可以檢測真鈔和僞造鈔票的解決方案。該數據集包含了從數字化圖像中提取的許多指標。數據集的圖像是使用通常用於印刷檢查的工業相機創建的,圖像尺寸爲400x400像素。這是一個乾淨的數據集,包含1372個示例且沒有缺失值。"}]},{"type":"heading","attrs":{"align":null,"level":3},"content":[{"type":"text","text":"15.1 
數據集樣本"}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null}},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","marks":[{"type":"color","attrs":{"color":"#494949","name":"user"}}],"text":"我們加載數據,看看它是什麼樣的:"}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null}},{"type":"codeblock","attrs":{"lang":null},"content":[{"type":"text","text":"data = pd.read_csv(f\".\\\\Datasets\\\\data_banknote_authentication.csv\")\ndata.head()"}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null}},{"type":"image","attrs":{"src":"https:\/\/static001.infoq.cn\/resource\/image\/9e\/3c\/9e7c13b9d39a678yy1ec3295a2f76d3c.jpg","alt":null,"title":"","style":[{"key":"width","value":"75%"},{"key":"bordertype","value":"none"}],"href":"","fromPaste":false,"pastePass":false}},{"type":"heading","attrs":{"align":null,"level":3},"content":[{"type":"text","text":"15.2 這個公共數據集適合解決什麼問題?"}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null}},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","marks":[{"type":"color","attrs":{"color":"#494949","name":"user"}}],"text":"它是練習二元分類和應用各種算法的絕佳數據集。此外,你可以修改它並將其用於聚類,並提出將通過無監督學習對這些數據進行聚類的算法。"}]},{"type":"heading","attrs":{"align":null,"level":3},"content":[{"type":"text","text":"15.3 
有用的鏈接"}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null}},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","marks":[{"type":"color","attrs":{"color":"#494949","name":"user"}}],"text":"從以下鏈接中可以找到關於這個數據集的更多信息:"}]},{"type":"bulletedlist","content":[{"type":"listitem","attrs":{"listStyle":null},"content":[{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"link","attrs":{"href":"https:\/\/archive.ics.uci.edu\/ml\/datasets\/banknote+authentication#","title":null,"type":null},"content":[{"type":"text","text":"UCI"}],"marks":[{"type":"color","attrs":{"color":"#494949","name":"user"}}]}]}]},{"type":"listitem","attrs":{"listStyle":null},"content":[{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"link","attrs":{"href":"https:\/\/www.kaggle.com\/ritesaluja\/bank-note-authentication-uci-data","title":null,"type":null},"content":[{"type":"text","text":"Kaggle"}],"marks":[{"type":"color","attrs":{"color":"#494949","name":"user"}}]}]}]}]},{"type":"heading","attrs":{"align":null,"level":2},"content":[{"type":"text","marks":[{"type":"color","attrs":{"color":"#494949","name":"user"}}],"text":"16.LabelMe數據集"}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null}},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","marks":[{"type":"color","attrs":{"color":"#494949","name":"user"}}],"text":"LabelMe是另一個計算機視覺數據集。LabelMe是一個帶有真實標籤的大型圖像數據庫,用於物體檢測和識別。它的註釋來自兩個不同的來源,其中就有LabelMe在線註釋工具。"}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","marks":[{"type":"color","attrs":{"color":"#494949","name":"user"}}],"text":" 
"}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","marks":[{"type":"color","attrs":{"color":"#494949","name":"user"}}],"text":"簡而言之,有兩種方法可以利用這個數據集。你可以通過LabelMe Matlab工具箱下載所有圖像,也可以通過LabelMe Matlab工具箱在線使用圖像。"}]},{"type":"heading","attrs":{"align":null,"level":3},"content":[{"type":"text","text":"16.1 數據集樣本"}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null}},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","marks":[{"type":"color","attrs":{"color":"#494949","name":"user"}}],"text":"標記好的數據如下所示:"}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null}},{"type":"image","attrs":{"src":"https:\/\/static001.infoq.cn\/resource\/image\/04\/14\/046e448869cfebe74ede49a49df86a14.jpg","alt":null,"title":"","style":[{"key":"width","value":"75%"},{"key":"bordertype","value":"none"}],"href":"","fromPaste":false,"pastePass":false}},{"type":"heading","attrs":{"align":null,"level":3},"content":[{"type":"text","text":"16.2 這個公共數據集適合解決什麼問題?"}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null}},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","marks":[{"type":"color","attrs":{"color":"#494949","name":"user"}}],"text":"它是用於對象檢測和對象識別解決方案的絕佳數據集。"}]},{"type":"heading","attrs":{"align":null,"level":3},"content":[{"type":"text","text":"16.3 
有用的鏈接"}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null}},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","marks":[{"type":"color","attrs":{"color":"#494949","name":"user"}}],"text":"從以下鏈接中可以找到關於這個數據集的更多信息:"}]},{"type":"bulletedlist","content":[{"type":"listitem","attrs":{"listStyle":null},"content":[{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"link","attrs":{"href":"http:\/\/labelme.csail.mit.edu\/Release3.0\/index.php","title":null,"type":null},"content":[{"type":"text","text":"介紹和下載"}],"marks":[{"type":"color","attrs":{"color":"#494949","name":"user"}}]}]}]}]},{"type":"heading","attrs":{"align":null,"level":2},"content":[{"type":"text","marks":[{"type":"color","attrs":{"color":"#494949","name":"user"}}],"text":"17.聲納數據集"}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null}},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","marks":[{"type":"color","attrs":{"color":"#494949","name":"user"}}],"text":"如果你對地質學感興趣,會發現這個數據集非常有趣。它是利用聲納信號製成的,由兩部分組成。第一部分名爲“sonar.mines”,包含111個模式,這些模式是使用在不同角度和不同條件下從金屬圓柱體反射的聲納信號製成的。"}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","marks":[{"type":"color","attrs":{"color":"#494949","name":"user"}}],"text":" "}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","marks":[{"type":"color","attrs":{"color":"#494949","name":"user"}}],"text":"第二部分名爲“sonar.rocks”,由97個模式組成,同樣是通過反射聲納信號製成,但這次反射的是岩石上的信號。它是一個不均衡數據集,包含208個示例、60個輸入特徵和一個輸出特徵。"}]},{"type":"heading","attrs":{"align":null,"level":3},"content":[{"type":"text","text":"17.1 
數據集樣本"}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null}},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","marks":[{"type":"color","attrs":{"color":"#494949","name":"user"}}],"text":"我們加載數據,看看它是什麼樣的:"}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null}},{"type":"codeblock","attrs":{"lang":null},"content":[{"type":"text","text":"data = pd.read_csv(f\".\\\\Datasets\\\\sonar.csv\")\ndata.head()"}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null}},{"type":"image","attrs":{"src":"https:\/\/static001.infoq.cn\/resource\/image\/2c\/0a\/2cc792e6caf04eca9eccb45d1972510a.jpg","alt":null,"title":"","style":[{"key":"width","value":"75%"},{"key":"bordertype","value":"none"}],"href":"","fromPaste":false,"pastePass":false}},{"type":"heading","attrs":{"align":null,"level":3},"content":[{"type":"text","text":"17.2 這個公共數據集適合解決什麼問題?"}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null}},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","marks":[{"type":"color","attrs":{"color":"#494949","name":"user"}}],"text":"該數據集非常適合練習二元分類。它的製作目標是檢測輸入是地雷還是岩石,這是一個有趣的問題,因爲最高的輸出結果達到了88%的準確率。"}]},{"type":"heading","attrs":{"align":null,"level":3},"content":[{"type":"text","text":"17.3 
有用的鏈接"}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null}},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","marks":[{"type":"color","attrs":{"color":"#494949","name":"user"}}],"text":"從以下鏈接中可以找到關於這個數據集的更多信息:"}]},{"type":"bulletedlist","content":[{"type":"listitem","attrs":{"listStyle":null},"content":[{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"link","attrs":{"href":"https:\/\/www.is.umk.pl\/projects\/datasets.html#Sonar","title":null,"type":null},"content":[{"type":"text","text":"介紹"}],"marks":[{"type":"color","attrs":{"color":"#494949","name":"user"}}]}]}]},{"type":"listitem","attrs":{"listStyle":null},"content":[{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"link","attrs":{"href":"https:\/\/archive.ics.uci.edu\/ml\/datasets\/Connectionist+Bench+(Sonar,+Mines+vs.+Rocks)","title":null,"type":null},"content":[{"type":"text","text":"UCI"}],"marks":[{"type":"color","attrs":{"color":"#494949","name":"user"}}]}]}]}]},{"type":"heading","attrs":{"align":null,"level":2},"content":[{"type":"text","marks":[{"type":"color","attrs":{"color":"#494949","name":"user"}}],"text":"18.皮馬印第安人糖尿病數據集"}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null}},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","marks":[{"type":"color","attrs":{"color":"#494949","name":"user"}}],"text":"這是另一個用於分類練習的醫療保健數據集。它來自美國國家糖尿病、消化和腎臟疾病研究所,其目的是根據某些診斷指標來預測患者是否患有糖尿病。"}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","marks":[{"type":"color","attrs":{"color":"#494949","name":"user"}}],"text":" 
"}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","marks":[{"type":"color","attrs":{"color":"#494949","name":"user"}}],"text":"該數據集包含768個觀測值,具有8個輸入特徵和1個輸出特徵。它不是一個均衡的數據集,並且假設缺失值被替換爲0。"}]},{"type":"heading","attrs":{"align":null,"level":3},"content":[{"type":"text","text":"18.1 數據集樣本"}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null}},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","marks":[{"type":"color","attrs":{"color":"#494949","name":"user"}}],"text":"我們加載數據,看看它是什麼樣的:"}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null}},{"type":"codeblock","attrs":{"lang":null},"content":[{"type":"text","text":"data = pd.read_csv(f\".\\\\Datasets\\\\pima-indians-dataset.csv\")\ndata.head()"}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null}},{"type":"image","attrs":{"src":"https:\/\/static001.infoq.cn\/resource\/image\/a2\/01\/a260f243c90dc3e46bf2d66f48bf2e01.jpg","alt":null,"title":"","style":[{"key":"width","value":"75%"},{"key":"bordertype","value":"none"}],"href":"","fromPaste":false,"pastePass":false}},{"type":"heading","attrs":{"align":null,"level":3},"content":[{"type":"text","text":"18.2 這個公共數據集適合解決什麼問題?"}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null}},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","marks":[{"type":"color","attrs":{"color":"#494949","name":"user"}}],"text":"它是另一個適合練習二元分類的數據集。"}]},{"type":"heading","attrs":{"align":null,"level":3},"content":[{"type":"text","text":"18.3 
有用的鏈接"}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null}},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","marks":[{"type":"color","attrs":{"color":"#494949","name":"user"}}],"text":"從以下鏈接中可以找到關於這個數據集的更多信息:"}]},{"type":"bulletedlist","content":[{"type":"listitem","attrs":{"listStyle":null},"content":[{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"link","attrs":{"href":"https:\/\/raw.githubusercontent.com\/jbrownlee\/Datasets\/master\/pima-indians-diabetes.names","title":null,"type":null},"content":[{"type":"text","text":"介紹"}],"marks":[{"type":"color","attrs":{"color":"#494949","name":"user"}}]}]}]},{"type":"listitem","attrs":{"listStyle":null},"content":[{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"link","attrs":{"href":"https:\/\/www.kaggle.com\/uciml\/pima-indians-diabetes-database","title":null,"type":null},"content":[{"type":"text","text":"Kaggle"}],"marks":[{"type":"color","attrs":{"color":"#494949","name":"user"}}]}]}]}]},{"type":"heading","attrs":{"align":null,"level":2},"content":[{"type":"text","marks":[{"type":"color","attrs":{"color":"#494949","name":"user"}}],"text":"19.小麥種子數據集"}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null}},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","marks":[{"type":"color","attrs":{"color":"#494949","name":"user"}}],"text":"這個數據集非常有趣和簡單。它特別適合初學者,可以代替Iris數據集。該數據集包含屬於三種不同小麥品種的種子信息:Kama、Rosa和Canadian。它是一個均衡的數據集,每個類別有70個實例。種子內部內核結構的測量值是使用軟X射線技術檢測的。"}]},{"type":"heading","attrs":{"align":null,"level":3},"content":[{"type":"text","text":"19.1 
數據集樣本"}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null}},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","marks":[{"type":"color","attrs":{"color":"#494949","name":"user"}}],"text":"我們加載數據,看看它是什麼樣的:"}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null}},{"type":"codeblock","attrs":{"lang":null},"content":[{"type":"text","text":"data = pd.read_csv(f\".\\\\Datasets\\\\seeds_dataset.csv\")\ndata.head()"}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null}},{"type":"image","attrs":{"src":"https:\/\/static001.infoq.cn\/resource\/image\/f4\/4e\/f4fb05cc0d02661bfc20782yy1a3824e.jpg","alt":null,"title":"","style":[{"key":"width","value":"75%"},{"key":"bordertype","value":"none"}],"href":"","fromPaste":false,"pastePass":false}},{"type":"heading","attrs":{"align":null,"level":3},"content":[{"type":"text","text":"19.2 這個公共數據集適合解決什麼問題?"}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null}},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","marks":[{"type":"color","attrs":{"color":"#494949","name":"user"}}],"text":"這個數據集有利於提升分類技能。"}]},{"type":"heading","attrs":{"align":null,"level":3},"content":[{"type":"text","text":"19.3 
有用的鏈接"}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null}},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","marks":[{"type":"color","attrs":{"color":"#494949","name":"user"}}],"text":"從以下鏈接中可以找到關於這個數據集的更多信息:"}]},{"type":"bulletedlist","content":[{"type":"listitem","attrs":{"listStyle":null},"content":[{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"link","attrs":{"href":"https:\/\/archive.ics.uci.edu\/ml\/datasets\/seeds","title":null,"type":null},"content":[{"type":"text","text":"UCI"}],"marks":[{"type":"color","attrs":{"color":"#494949","name":"user"}}]}]}]},{"type":"listitem","attrs":{"listStyle":null},"content":[{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"link","attrs":{"href":"https:\/\/www.kaggle.com\/jmcaro\/wheat-seedsuci","title":null,"type":null},"content":[{"type":"text","text":"Kaggle"}],"marks":[{"type":"color","attrs":{"color":"#494949","name":"user"}}]}]}]}]},{"type":"heading","attrs":{"align":null,"level":2},"content":[{"type":"text","marks":[{"type":"color","attrs":{"color":"#494949","name":"user"}}],"text":"20.Jeopardy!問題數據集"}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null}},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","marks":[{"type":"color","attrs":{"color":"#494949","name":"user"}}],"text":"這個數據集很不錯,包含216,930個Jeopardy問題、答案和其他數據。它是可用於你NLP項目的絕佳數據集。除了問題和答案,該數據集還包含有關問題類別和價值的信息。"}]},{"type":"heading","attrs":{"align":null,"level":3},"content":[{"type":"text","text":"20.1 
數據集樣本"}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null}},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","marks":[{"type":"color","attrs":{"color":"#494949","name":"user"}}],"text":"我們加載數據,看看它是什麼樣的:"}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null}},{"type":"codeblock","attrs":{"lang":null},"content":[{"type":"text","text":"data = pd.read_csv(f\".\\\\Datasets\\\\joepardy.csv\")\ndata.head()"}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null}},{"type":"image","attrs":{"src":"https:\/\/static001.infoq.cn\/resource\/image\/0c\/16\/0c5e94e5f3f275f40ab2c437462ba216.jpg","alt":null,"title":"","style":[{"key":"width","value":"75%"},{"key":"bordertype","value":"none"}],"href":"","fromPaste":false,"pastePass":false}},{"type":"heading","attrs":{"align":null,"level":3},"content":[{"type":"text","text":"20.2 這個公共數據集適合解決什麼問題?"}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null}},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","marks":[{"type":"color","attrs":{"color":"#494949","name":"user"}}],"text":"這是一個豐富的數據集,可用於多種用途。你可以運行分類算法並預測問題的類別或問題的價值。不過你可以用它做的最酷的事情可能是用它來訓練BERT模型。"}]},{"type":"heading","attrs":{"align":null,"level":3},"content":[{"type":"text","text":"20.3 
有用的鏈接"}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null}},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","marks":[{"type":"color","attrs":{"color":"#494949","name":"user"}}],"text":"從以下鏈接中可以找到關於這個數據集的更多信息:"}]},{"type":"bulletedlist","content":[{"type":"listitem","attrs":{"listStyle":null},"content":[{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"link","attrs":{"href":"https:\/\/www.kaggle.com\/tunguz\/200000-jeopardy-questions","title":null,"type":null},"content":[{"type":"text","text":"Kaggle"}],"marks":[{"type":"color","attrs":{"color":"#494949","name":"user"}}]}]}]}]},{"type":"heading","attrs":{"align":null,"level":2},"content":[{"type":"text","marks":[{"type":"color","attrs":{"color":"#494949","name":"user"}}],"text":"21.鮑魚數據集"}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null}},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","marks":[{"type":"color","attrs":{"color":"#494949","name":"user"}}],"text":"從本質上講這是一個多分類問題,然而,這個數據集也可以被視爲一個迴歸問題。它的目標是使用提供的指標來預測鮑魚的年齡。這個數據集不均衡,4,177個實例有8個輸入變量和1個輸出變量。"}]},{"type":"heading","attrs":{"align":null,"level":3},"content":[{"type":"text","text":"21.1 數據集樣本"}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null}},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","marks":[{"type":"color","attrs":{"color":"#494949","name":"user"}}],"text":"我們加載數據,看看它是什麼樣的:"}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null}},{"type":"codeblock","attrs":{"lang":null},"content":[{"type":"text","text":"data = 
pd.read_csv(f\".\\\\Datasets\\\\abalone.csv\")\ndata.head()"}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null}},{"type":"image","attrs":{"src":"https:\/\/static001.infoq.cn\/resource\/image\/6y\/ed\/6yye8106a232838b9cc60b284687efed.jpg","alt":null,"title":"","style":[{"key":"width","value":"75%"},{"key":"bordertype","value":"none"}],"href":"","fromPaste":false,"pastePass":false}},{"type":"heading","attrs":{"align":null,"level":3},"content":[{"type":"text","text":"21.2 這個公共數據集適合解決什麼問題?"}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null}},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","marks":[{"type":"color","attrs":{"color":"#494949","name":"user"}}],"text":"該數據集可以同時構建爲迴歸和分類任務。這是一個很好的機會,可以使用多元線性迴歸、SVM、隨機森林等算法,或者構建一個可以解決這個問題的神經網絡。"}]},{"type":"heading","attrs":{"align":null,"level":3},"content":[{"type":"text","text":"21.3 有用的鏈接"}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null}},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","marks":[{"type":"color","attrs":{"color":"#494949","name":"user"}}],"text":"從以下鏈接中可以找到關於這個數據集的更多信息:"}]},{"type":"bulletedlist","content":[{"type":"listitem","attrs":{"listStyle":null},"content":[{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"link","attrs":{"href":"https:\/\/archive.ics.uci.edu\/ml\/datasets\/abalone","title":null,"type":null},"content":[{"type":"text","text":"UCI"}],"marks":[{"type":"color","attrs":{"color":"#494949","name":"user"}}]}]}]},{"type":"listitem","attrs":{"listStyle":null},"content":[{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"link","attrs":{"href":"https:\/\/www.kaggle.com\/rodolfomendes\/abalone-dataset","title":null,"type":null},"content":[{"type":"text","text":"Kaggle"}],"marks":[{"type":"color","attr
s":{"color":"#494949","name":"user"}}]}]}]}]},{"type":"heading","attrs":{"align":null,"level":2},"content":[{"type":"text","marks":[{"type":"color","attrs":{"color":"#494949","name":"user"}}],"text":"22.假新聞數據集"}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null}},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","marks":[{"type":"color","attrs":{"color":"#494949","name":"user"}}],"text":"我們生活在一個狂野的時代。假新聞、深度造假和其他類型的欺騙技術都成了我們日常生活的一部分,無論我們喜歡與否。這個數據集提供了另一個非常適合練習的NLP任務。它包含標記過的真實和虛假新聞,以及它們的文本和作者。"}]},{"type":"heading","attrs":{"align":null,"level":3},"content":[{"type":"text","text":"22.1 數據集樣本"}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null}},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","marks":[{"type":"color","attrs":{"color":"#494949","name":"user"}}],"text":"我們加載數據,看看它是什麼樣的:"}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null}},{"type":"codeblock","attrs":{"lang":null},"content":[{"type":"text","text":"data = pd.read_csv(f\".\\\\Datasets\\\\fake_news\\\\train.csv\")\ndata.head()"}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null}},{"type":"image","attrs":{"src":"https:\/\/static001.infoq.cn\/resource\/image\/54\/d1\/54ceyya6cce777691073cc4118yy31d1.jpg","alt":null,"title":"","style":[{"key":"width","value":"75%"},{"key":"bordertype","value":"none"}],"href":"","fromPaste":false,"pastePass":false}},{"type":"heading","attrs":{"align":null,"level":3},"content":[{"type":"text","text":"22.2 
這個公共數據集適合解決什麼問題?"}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null}},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","marks":[{"type":"color","attrs":{"color":"#494949","name":"user"}}],"text":"這是另一個NLP文本分類任務。"}]},{"type":"heading","attrs":{"align":null,"level":3},"content":[{"type":"text","text":"22.3 有用的鏈接"}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null}},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","marks":[{"type":"color","attrs":{"color":"#494949","name":"user"}}],"text":"從以下鏈接中可以找到關於這個數據集的更多信息:"}]},{"type":"bulletedlist","content":[{"type":"listitem","attrs":{"listStyle":null},"content":[{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"link","attrs":{"href":"https:\/\/www.kaggle.com\/c\/fake-news\/overview","title":null,"type":null},"content":[{"type":"text","text":"Kaggle"}],"marks":[{"type":"color","attrs":{"color":"#494949","name":"user"}}]}]}]}]},{"type":"heading","attrs":{"align":null,"level":2},"content":[{"type":"text","marks":[{"type":"color","attrs":{"color":"#494949","name":"user"}}],"text":"23.ImageNet數據集"}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null}},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","marks":[{"type":"color","attrs":{"color":"#494949","name":"user"}}],"text":"最後這個數據集是計算機視覺數據集中的王者——ImageNet。該數據集是用來衡量所有新的深度學習和計算機視覺技術創新的基準。沒有它,深度學習的世界就不會變成今天這樣的狀態。ImageNet是一個按照WordNet層次結構組織的大型圖像數據庫。這意味着每個實體都用一組稱爲synset(同義詞集)的詞和短語來描述。每個同義詞集分配了大約1000個圖像。基本上,層次結構的每個節點都由成百上千的圖像描述。"}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null}},{"type":"image","attrs":{"src":"https:\/\/static001.infoq.cn\/resource\/image\/71\/ea\/71993b95b3f4c0540e8d7c7bc89b59ea.jpg","alt":null,"title":"","style":[{"key":"width","value":"75%
"},{"key":"bordertype","value":"none"}],"href":"","fromPaste":false,"pastePass":false}},{"type":"heading","attrs":{"align":null,"level":3},"content":[{"type":"text","text":"23.1 這個公共數據集適合解決什麼問題?"}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null}},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","marks":[{"type":"color","attrs":{"color":"#494949","name":"user"}}],"text":"它是學術和研究界的標準數據集。它的主要任務是圖像分類,但你也可以將其用於各種任務。"}]},{"type":"heading","attrs":{"align":null,"level":3},"content":[{"type":"text","text":"23.2 有用的鏈接"}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null}},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","marks":[{"type":"color","attrs":{"color":"#494949","name":"user"}}],"text":"從以下鏈接中可以找到關於這個數據集的更多信息:"}]},{"type":"bulletedlist","content":[{"type":"listitem","attrs":{"listStyle":null},"content":[{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"link","attrs":{"href":"https:\/\/image-net.org\/","title":null,"type":null},"content":[{"type":"text","text":"官方網站"}],"marks":[{"type":"color","attrs":{"color":"#494949","name":"user"}}]}]}]}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null}},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","marks":[{"type":"color","attrs":{"color":"#494949","name":"user"}}],"text":"在本文中,我們探索了23個非常適合機器學習應用實踐的數據集。感謝你的閱讀!"}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null}},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","marks":[{"type":"strong"}],"text":"作者介紹"}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null}},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text",
"text":"Nikola M. Zivkovic "},{"type":"text","marks":[{"type":"color","attrs":{"color":"#494949","name":"user"}}],"text":"是下列書籍的作者:《"},{"type":"link","attrs":{"href":"https:\/\/rubikscode.net\/ultimate-guide-to-machine-learning-with-python\/","title":null,"type":null},"content":[{"type":"text","text":"機器學習終極指南"}],"marks":[{"type":"color","attrs":{"color":"#494949","name":"user"}}]},{"type":"text","marks":[{"type":"color","attrs":{"color":"#494949","name":"user"}}],"text":"》和《"},{"type":"link","attrs":{"href":"https:\/\/rubikscode.net\/deep-learning-for-programmers\/","title":null,"type":null},"content":[{"type":"text","text":"面向程序員的深度學習"}],"marks":[{"type":"color","attrs":{"color":"#494949","name":"user"}}]},{"type":"text","marks":[{"type":"color","attrs":{"color":"#494949","name":"user"}}],"text":"》。他喜歡分享知識,還是一位經驗豐富的演講者。他曾在許多聚會、會議上發表演講,並在諾維薩德大學擔任客座講師。"}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","marks":[{"type":"color","attrs":{"color":"#494949","name":"user"}}],"text":" "}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","marks":[{"type":"color","attrs":{"color":"#494949","name":"user"}},{"type":"strong"}],"text":"原文鏈接:"},{"type":"link","attrs":{"href":"https:\/\/rubikscode.net\/2021\/07\/19\/top-23-best-public-datasets-for-practicing-machine-learning","title":null,"type":null},"content":[{"type":"text","text":"https:\/\/rubikscode.net\/2021\/07\/19\/top-23-best-public-datasets-for-practicing-machine-learning"}]}]}]}
發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼?請在上方評論欄輸入並且點擊發布。
相關文章