數據分析從零開始實戰,Pandas讀寫TSV/JSON數據

{"type":"doc","content":[{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","text":"這是我參與11月更文挑戰的第17天。","attrs":{}}]},{"type":"heading","attrs":{"align":null,"level":3},"content":[{"type":"text","text":"一、寫在前面","attrs":{}}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","text":"本系列學習筆記參考書籍:  ","attrs":{}},{"type":"text","marks":[{"type":"strong","attrs":{}}],"text":"《數據分析實戰》托馬茲·卓巴斯","attrs":{}},{"type":"text","text":",會將自己學習本書的筆記分享給大家,同樣開成一個系列『數據分析從零開始實戰』。","attrs":{}}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null}},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","text":"上一篇文章中帶大家瞭解了數據分析基礎,配置好了數據分析的基本環境,以及利用","attrs":{}},{"type":"codeinline","content":[{"type":"text","text":"pandas","attrs":{}}],"attrs":{}},{"type":"text","text":"模塊讀寫csv文件,在本文開頭,我也補充了csv與tsv的基本介紹與區別,意在更好的讓大家理解相關知識點,本文將帶大家繼續學習文件讀取。","attrs":{}}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null}},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","text":"點擊查看第一篇文章:","attrs":{}},{"type":"link","attrs":{"href":"https://xie.infoq.cn/article/50e694b27fb9562229a210e57","title":"","type":null},"content":[{"type":"text","text":"# 數據分析從零開始實戰,Pandas讀寫CSV數據","attrs":{}}]}]},{"type":"heading","attrs":{"align":null,"level":3},"content":[{"type":"text","text":"二、上節補充","attrs":{}}]},{"type":"heading","attrs":{"align":null,"level":5},"content":[{"type":"text","text":"CSV","attrs":{}}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","text":"逗號分隔值(Comma-Separated 
Values,CSV,有時也稱爲字符分隔值,因爲分隔字符也可以不是逗號),其文件以純文本形式存儲表格數據(數字和文本)。","attrs":{}}]},{"type":"heading","attrs":{"align":null,"level":5},"content":[{"type":"text","text":"TSV","attrs":{}}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","text":"TSV 是Tab-separated values的縮寫,即製表符分隔值。Python的csv模塊準確的講應該叫做dsv模塊,因爲它實際上是支持範式的分隔符分隔值文件(DSV,delimiter-separated values)的。","attrs":{}}]},{"type":"codeblock","attrs":{"lang":null},"content":[{"type":"text","text":"delimiter參數值默認爲半角逗號,即默認將被處理文件視爲CSV。\n當`delimiter='\\t'`時,被處理文件就是TSV。\n複製代碼","attrs":{}}]},{"type":"heading","attrs":{"align":null,"level":3},"content":[{"type":"text","text":"三、基本知識概要","attrs":{}}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","text":"1.利用pandas讀寫tsv文件","attrs":{}}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","text":"2.利用pandas讀寫json文件","attrs":{}}]},{"type":"heading","attrs":{"align":null,"level":3},"content":[{"type":"text","text":"四、開始動手動腦","attrs":{}}]},{"type":"heading","attrs":{"align":null,"level":5},"content":[{"type":"text","text":"1.利用pandas讀寫tsv文件","attrs":{}}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","text":"在文章開頭我已經說明了csv與tsv的差別,相信部分看過第一篇文章的讀者應該知道怎麼處理tsv文件了。","attrs":{}}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","text":"csv與tsv只是內容的分隔符不一樣,前者是","attrs":{}},{"type":"codeinline","content":[{"type":"text","text":",","attrs":{}}],"attrs":{}},{"type":"text","text":",後者是","attrs":{}},{"type":"codeinline","content":[{"type":"text","text":"\\t","attrs":{}}],"attrs":{}},{"type":"text","text":",python讀取這兩類文件都使用","attrs":{}},{"type":"codeinline","content":[{"type":"text","text":"csv","attrs":{}}],"attrs":{}},{"type":"text","text":"模塊,也可以直接利用","attrs":{}},{"type":"codeinline","content":[{"type":"text","text":"
pandas","attrs":{}}],"attrs":{}},{"type":"text","text":",這裏我們講利用pandas讀取方式,使用的函數","attrs":{}},{"type":"codeinline","content":[{"type":"text","text":"read_csv()","attrs":{}}],"attrs":{}},{"type":"text","text":"與","attrs":{}},{"type":"codeinline","content":[{"type":"text","text":"to_csv()","attrs":{}}],"attrs":{}},{"type":"text","text":"在上一篇 文章中有詳細介紹,這裏我直接上案例代碼。","attrs":{}}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","text":"(1) 讀取tsv文件代碼","attrs":{}}]},{"type":"codeblock","attrs":{"lang":null},"content":[{"type":"text","text":"import pandas as pd\nimport os\n\n# 獲取當前文件父目錄路徑\nfather_path = os.getcwd()\n\n# 原始數據文件路徑\nrpath_tsv = father_path+r'\\data01\\city_station.tsv'\n# 讀取數據\ntsv_read = pd.read_csv(rpath_tsv, sep=\"\\t\")\n# 顯示數據前10條\nprint(tsv_read.head(10))\n複製代碼","attrs":{}}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","text":"運行結果","attrs":{}}]},{"type":"codeblock","attrs":{"lang":null},"content":[{"type":"text","text":" 站點名 代號\n0 北京北 VAP\n1 北京東 BOP\n2 北京 BJP\n3 北京南 VNP\n4 北京西 BXP\n複製代碼","attrs":{}}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","text":"(2) 寫tsv文件代碼","attrs":{}}]},{"type":"codeblock","attrs":{"lang":null},"content":[{"type":"text","text":"import pandas as pd\nimport os\n\n# 獲取當前文件父目錄路徑\nfather_path = os.getcwd()\n\n# 保存數據文件路徑\npath_tsv = father_path+r'\\data01\\temp_city.tsv'\n\ndata = {\"站點名\": [\"北京北\", \"北京東\", \"北京\", \"北京南\", \"北京西\"],\n \"代號\": [\"VAP\", \"BOP\", \"BJP\", \"VNP\", \"BXP\"]}\ndf = pd.DataFrame(data)\ndf.to_csv(path_tsv, sep=\"\\t\", 
index=False)\n複製代碼","attrs":{}}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","text":"運行結果","attrs":{}}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null}},{"type":"image","attrs":{"src":"https://static001.geekbang.org/infoq/dd/ddf6cbef432a2d29483a99f420e24c25.webp","alt":"在這裏插入圖片描述","title":null,"style":[{"key":"width","value":"75%"},{"key":"bordertype","value":"none"}],"href":null,"fromPaste":true,"pastePass":true}},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null}},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","text":"(3)號外加餐","attrs":{}}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","text":"利用csv模塊也可以直接讀取csv和tsv文件","attrs":{}}]},{"type":"codeblock","attrs":{"lang":null},"content":[{"type":"text","text":"csv.reader(csvfile, dialect='excel', **fmtparams)\ncsv.writer(csvfile, dialect='excel', 
**fmtparams)\n複製代碼","attrs":{}}]},{"type":"bulletedlist","content":[{"type":"listitem","attrs":{"listStyle":null},"content":[{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","text":"csvfile,必須是支持迭代(Iterator)的對象,可以是文件(file)對象或者列表(list)對象;如果是文件對象,Python 3中應以文本模式打開並指定newline=''(Python 2中才需要加\"b\"標誌參數)。","attrs":{}}]}]},{"type":"listitem","attrs":{"listStyle":null},"content":[{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","text":"dialect,編碼風格,默認爲excel的風格,也就是用逗號(,)分隔,dialect方式也支持自定義","attrs":{}}]}]},{"type":"listitem","attrs":{"listStyle":null},"content":[{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","text":"fmtparams,格式化參數,用來覆蓋之前dialect對象指定的編碼風格。","attrs":{}}]}]}],"attrs":{}},{"type":"heading","attrs":{"align":null,"level":5},"content":[{"type":"text","text":"2.利用pandas讀寫json文件","attrs":{}}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","text":"(1)利用pandas讀取json文件","attrs":{}}]},{"type":"codeblock","attrs":{"lang":null},"content":[{"type":"text","text":"import pandas as pd\nimport os\n\n# 獲取當前文件父目錄路徑\nfather_path = os.getcwd()\n# 原始數據文件路徑\nrpath_json = father_path+r'\\data01\\realEstate_trans.json'\njson_read = pd.read_json(rpath_json)\n\n# 
輸出頭10行記錄\nprint(json_read.head(10))\n複製代碼","attrs":{}}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","text":"運行結果","attrs":{}}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null}},{"type":"image","attrs":{"src":"https://static001.geekbang.org/infoq/9e/9ea295228007f9b3529f25642a5a023a.webp","alt":"在這裏插入圖片描述","title":null,"style":[{"key":"width","value":"75%"},{"key":"bordertype","value":"none"}],"href":null,"fromPaste":true,"pastePass":true}},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null}},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","text":"函數解析","attrs":{}}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","text":"read_json(path_or_buf,orient,encoding,numpy)","attrs":{}}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","marks":[{"type":"strong","attrs":{}}],"text":"常見參數解析:","attrs":{}}]},{"type":"bulletedlist","content":[{"type":"listitem","attrs":{"listStyle":null},"content":[{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","text":"path_or_buf:字符串,表示文件路徑;","attrs":{}}]}]},{"type":"listitem","attrs":{"listStyle":null},"content":[{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","text":"orient:指示預期的JSON字符串格式。可以to_json()使用相應的方向值生成兼容的JSON字符串。一組可能的方向是:","attrs":{}}]}]}],"attrs":{}},{"type":"codeblock","attrs":{"lang":null},"content":[{"type":"text","text":"'split' : dict like {index -> [index], columns -> [columns], data -> [values]}\n'records' : list like [{column -> value}, ... 
, {column -> value}]\n'index' : dict like {index -> {column -> value}}\n'columns' : dict like {column -> {index -> value}}\n'values' : just the values array\n複製代碼","attrs":{}}]},{"type":"bulletedlist","content":[{"type":"listitem","attrs":{"listStyle":null},"content":[{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","text":"encoding:字符串,默認爲'utf-8';","attrs":{}}]}]},{"type":"listitem","attrs":{"listStyle":null},"content":[{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","text":"numpy:布爾值,默認爲False,直接解碼爲numpy數組。僅支持數字數據,但支持非數字列和索引標籤。另請注意,如果numpy = True,則每個術語的JSON順序必須相同。","attrs":{}}]}]}],"attrs":{}},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","text":"(2)利用pandas寫入json文件","attrs":{}}]},{"type":"codeblock","attrs":{"lang":null},"content":[{"type":"text","text":"import pandas as pd\nimport os\n\n# 獲取當前文件父目錄路徑\nfather_path = os.getcwd()\n# 存儲數據文件路徑\nwpath_json = father_path+r'\\data01\\temp_trans.json'\ndata = [{\"city\": \"SACRAMENTO\", \"longitude\": -121.434879, \"street\": \"3526 HIGH ST\", \"sq__ft\": 836, \"latitude\": 38.631913, \"sale_date\": \"Wed May 21 00:00:00 EDT 2008\", \"zip\": 95838, \"beds\": 2, \"type\": \"Residential\", \"state\": \"CA\", \"baths\": 1, \"price\": 59222}, {\"city\": \"SACRAMENTO\", \"longitude\": -121.431028, \"street\": \"51 OMAHA CT\", \"sq__ft\": 1167, \"latitude\": 38.478902, \"sale_date\": \"Wed May 21 00:00:00 EDT 2008\", \"zip\": 95823, \"beds\": 3, \"type\": \"Residential\", \"state\": \"CA\", \"baths\": 1, \"price\": 68212}, {\"city\": \"SACRAMENTO\", \"longitude\": -121.443839, \"street\": \"2796 BRANCH ST\", \"sq__ft\": 796, \"latitude\": 38.618305, \"sale_date\": \"Wed May 21 00:00:00 EDT 2008\", \"zip\": 95815, \"beds\": 2, \"type\": \"Residential\", \"state\": \"CA\", \"baths\": 1, \"price\": 68880}]\ndf = 
pd.DataFrame(data)\ndf.to_json(wpath_json)\n複製代碼","attrs":{}}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","text":"運行結果","attrs":{}}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null}},{"type":"image","attrs":{"src":"https://static001.geekbang.org/infoq/e0/e0372be6b8cb650e49de9fc1c25badee.webp","alt":"","title":null,"style":[{"key":"width","value":"75%"},{"key":"bordertype","value":"none"}],"href":null,"fromPaste":true,"pastePass":true}},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null}},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","text":"函數解析","attrs":{}}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","text":"to_json(path_or_buf,orient,encoding,index)","attrs":{}}]},{"type":"bulletedlist","content":[{"type":"listitem","attrs":{"listStyle":null},"content":[{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","text":"前三個參數和read_json()裏的一樣","attrs":{}}]}]},{"type":"listitem","attrs":{"listStyle":null},"content":[{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","text":"index:False則選擇不寫入索引,默認爲True。","attrs":{}}]}]}],"attrs":{}},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null}},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","text":"【注】利用json模塊的load()與dump()方法(處理字符串時用loads()與dumps())也可以實現json文件的讀寫。","attrs":{}}]},{"type":"heading","attrs":{"align":null,"level":3},"content":[{"type":"text","text":"五、送你的話","attrs":{}}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","text":"我始終覺得,要想學好一門語言,底層是最主要的,所以不要覺得入門的這些基本東西太簡單,","attrs":{}},{"type":"text","marks":[{"type":"strong","attrs":{}}],"text":"學好基礎,才能成大牛","attrs"
:{}},{"type":"text","text":"。","attrs":{}}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null}},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","text":"堅持 and 努力 : 終有所獲。","attrs":{}}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","text":"思想很複雜,","attrs":{}}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","text":"實現很有趣,","attrs":{}}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","text":"只要不放棄,","attrs":{}}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","text":"終有成名日。","attrs":{}}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","text":"—《老表打油詩》","attrs":{}}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null}},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","text":"下期見,我是愛貓愛技術的老表,如果覺得本文對你學習有所幫助,歡迎點贊、評論、關注我!","attrs":{}}]}]}
發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
相關文章