数据分析从零开始实战,Pandas读写TSV/Json数据

{"type":"doc","content":[{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","text":"这是我参与11月更文挑战的第17天。","attrs":{}}]},{"type":"heading","attrs":{"align":null,"level":3},"content":[{"type":"text","text":"一、写在前面","attrs":{}}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","text":"本系列学习笔记参考书籍:  ","attrs":{}},{"type":"text","marks":[{"type":"strong","attrs":{}}],"text":"《数据分析实战》托马兹·卓巴斯","attrs":{}},{"type":"text","text":",会将自己学习本书的笔记分享给大家,同样开成一个系列『数据分析从零开始实战』。","attrs":{}}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null}},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","text":"上一篇文章中带大家了解了数据分析基础,配置好了数据分析的基本环境,以及利用","attrs":{}},{"type":"codeinline","content":[{"type":"text","text":"pandas","attrs":{}}],"attrs":{}},{"type":"text","text":"模块读写csv文件,在本文开头,我也补充了csv与tsv的基本介绍与区别,意在更好的让大家理解相关知识点,本文将带大家继续学习文件读取。","attrs":{}}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null}},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","text":"点击查看第一篇文章:","attrs":{}},{"type":"link","attrs":{"href":"https://xie.infoq.cn/article/50e694b27fb9562229a210e57","title":"","type":null},"content":[{"type":"text","text":"# 数据分析从零开始实战,Pandas读写CSV数据","attrs":{}}]}]},{"type":"heading","attrs":{"align":null,"level":3},"content":[{"type":"text","text":"二、上节补充","attrs":{}}]},{"type":"heading","attrs":{"align":null,"level":5},"content":[{"type":"text","text":"CSV","attrs":{}}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","text":"逗号分隔值(Comma-Separated 
Values,CSV,有时也称为字符分隔值,因为分隔字符也可以不是逗号),其文件以纯文本形式存储表格数据(数字和文本)。","attrs":{}}]},{"type":"heading","attrs":{"align":null,"level":5},"content":[{"type":"text","text":"TSV","attrs":{}}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","text":"TSV 是Tab-separated values的缩写,即制表符分隔值。Python的csv模块准确的讲应该叫做dsv模块,因为它实际上是支持范式的分隔符分隔值文件(DSV,delimiter-separated values)的。","attrs":{}}]},{"type":"codeblock","attrs":{"lang":null},"content":[{"type":"text","text":"delimiter参数值默认为半角逗号,即默认将被处理文件视为CSV。\n当`delimiter='\\t'`时,被处理文件就是TSV。\n复制代码","attrs":{}}]},{"type":"heading","attrs":{"align":null,"level":3},"content":[{"type":"text","text":"三、基本知识概要","attrs":{}}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","text":"1.利用pandas读写tsv文件","attrs":{}}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","text":"2.利用pandas读写json文件","attrs":{}}]},{"type":"heading","attrs":{"align":null,"level":3},"content":[{"type":"text","text":"四、开始动手动脑","attrs":{}}]},{"type":"heading","attrs":{"align":null,"level":5},"content":[{"type":"text","text":"1.利用pandas读写tsv文件","attrs":{}}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","text":"在文章开头我已经说明了csv与tsv的差别,相信部分看过第一篇文章的读者应该知道怎么处理tsv文件了。","attrs":{}}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","text":"csv与tsv只是内容的分隔符不一样,前者是","attrs":{}},{"type":"codeinline","content":[{"type":"text","text":",","attrs":{}}],"attrs":{}},{"type":"text","text":",后者是","attrs":{}},{"type":"codeinline","content":[{"type":"text","text":"\\t","attrs":{}}],"attrs":{}},{"type":"text","text":",python读取这两类文件都使用","attrs":{}},{"type":"codeinline","content":[{"type":"text","text":"csv","attrs":{}}],"attrs":{}},{"type":"text","text":"模块,也可以直接利用","attrs":{}},{"type":"codeinline","content":[{"type":"text","text":"
pandas","attrs":{}}],"attrs":{}},{"type":"text","text":",这里我们讲利用pandas读取方式,使用的函数","attrs":{}},{"type":"codeinline","content":[{"type":"text","text":"read_csv()","attrs":{}}],"attrs":{}},{"type":"text","text":"与","attrs":{}},{"type":"codeinline","content":[{"type":"text","text":"to_csv()","attrs":{}}],"attrs":{}},{"type":"text","text":"在上一篇 文章中有详细介绍,这里我直接上案例代码。","attrs":{}}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","text":"(1) 读取tsv文件代码","attrs":{}}]},{"type":"codeblock","attrs":{"lang":null},"content":[{"type":"text","text":"import pandas as pd\nimport os\n\n# 获取当前文件父目录路径\nfather_path = os.getcwd()\n\n# 原始数据文件路径\nrpath_tsv = father_path+r'\\data01\\city_station.tsv'\n# 读取数据\ntsv_read = pd.read_csv(rpath_tsv, sep=\"\\t\")\n# 显示数据前10条\nprint(tsv_read.head(10))\n复制代码","attrs":{}}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","text":"运行结果","attrs":{}}]},{"type":"codeblock","attrs":{"lang":null},"content":[{"type":"text","text":" 站点名 代号\n0 北京北 VAP\n1 北京东 BOP\n2 北京 BJP\n3 北京南 VNP\n4 北京西 BXP\n复制代码","attrs":{}}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","text":"(2) 写tsv文件代码","attrs":{}}]},{"type":"codeblock","attrs":{"lang":null},"content":[{"type":"text","text":"import pandas as pd\nimport os\n\n# 获取当前文件父目录路径\nfather_path = os.getcwd()\n\n# 保存数据文件路径\npath_tsv = father_path+r'\\data01\\temp_city.tsv'\n\ndata = {\"站点名\": [\"北京北\", \"北京东\", \"北京\", \"北京南\", \"北京西\"],\n \"代号\": [\"VAP\", \"BOP\", \"BJP\", \"VNP\", \"BXP\"]}\ndf = pd.DataFrame(data)\ndf.to_csv(path_tsv, sep=\"\\t\", 
index=False)\n复制代码","attrs":{}}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","text":"运行结果","attrs":{}}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null}},{"type":"image","attrs":{"src":"https://static001.geekbang.org/infoq/dd/ddf6cbef432a2d29483a99f420e24c25.webp","alt":"在这里插入图片描述","title":null,"style":[{"key":"width","value":"75%"},{"key":"bordertype","value":"none"}],"href":null,"fromPaste":true,"pastePass":true}},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null}},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","text":"(3)号外加餐","attrs":{}}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","text":"利用csv模块也可以直接读取csv和tsv文件","attrs":{}}]},{"type":"codeblock","attrs":{"lang":null},"content":[{"type":"text","text":"csv.reader(csvfile, dialect='excel', **fmtparams)\ncsv.writer(csvfile, dialect='excel', 
**fmtparams)\n复制代码","attrs":{}}]},{"type":"bulletedlist","content":[{"type":"listitem","attrs":{"listStyle":null},"content":[{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","text":"csvfile,必须是支持迭代(Iterator)的对象,可以是文件(file)对象或者列表(list)对象,如果是文件对象,打开时需要加\"b\"标志参数(这是Python 2的做法;Python 3中应以文本模式打开并指定newline='')。","attrs":{}}]}]},{"type":"listitem","attrs":{"listStyle":null},"content":[{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","text":"dialect,编码风格,默认为excel的风格,也就是用逗号(,)分隔,dialect方式也支持自定义","attrs":{}}]}]},{"type":"listitem","attrs":{"listStyle":null},"content":[{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","text":"fmtparams,格式化参数,用来覆盖之前dialect对象指定的编码风格。","attrs":{}}]}]}],"attrs":{}},{"type":"heading","attrs":{"align":null,"level":5},"content":[{"type":"text","text":"2.利用pandas读写json文件","attrs":{}}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","text":"(1)利用pandas读取json文件","attrs":{}}]},{"type":"codeblock","attrs":{"lang":null},"content":[{"type":"text","text":"import pandas as pd\nimport os\n\n# 获取当前文件父目录路径\nfather_path = os.getcwd()\n# 原始数据文件路径\nrpath_json = father_path+r'\\data01\\realEstate_trans.json'\njson_read = pd.read_json(rpath_json)\n\n# 
输出头10行记录\nprint(json_read.head(10))\n复制代码","attrs":{}}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","text":"运行结果","attrs":{}}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null}},{"type":"image","attrs":{"src":"https://static001.geekbang.org/infoq/9e/9ea295228007f9b3529f25642a5a023a.webp","alt":"在这里插入图片描述","title":null,"style":[{"key":"width","value":"75%"},{"key":"bordertype","value":"none"}],"href":null,"fromPaste":true,"pastePass":true}},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null}},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","text":"函数解析","attrs":{}}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","text":"read_json(path_or_buf,orient,encoding,numpy)","attrs":{}}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","marks":[{"type":"strong","attrs":{}}],"text":"常见参数解析:","attrs":{}}]},{"type":"bulletedlist","content":[{"type":"listitem","attrs":{"listStyle":null},"content":[{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","text":"path_or_buf:字符串,表示文件路径;","attrs":{}}]}]},{"type":"listitem","attrs":{"listStyle":null},"content":[{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","text":"orient:指示预期的JSON字符串格式。使用to_json()并指定相应的orient值,可以生成兼容的JSON字符串。可选的取值有:","attrs":{}}]}]}],"attrs":{}},{"type":"codeblock","attrs":{"lang":null},"content":[{"type":"text","text":"'split' : dict like {index -> [index], columns -> [columns], data -> [values]}\n'records' : list like [{column -> value}, ... 
, {column -> value}]\n'index' : dict like {index -> {column -> value}}\n'columns' : dict like {column -> {index -> value}}\n'values' : just the values array\n复制代码","attrs":{}}]},{"type":"bulletedlist","content":[{"type":"listitem","attrs":{"listStyle":null},"content":[{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","text":"encoding:字符串,默认为'utf-8';","attrs":{}}]}]},{"type":"listitem","attrs":{"listStyle":null},"content":[{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","text":"numpy:布尔值,默认为False,直接解码为numpy数组。仅支持数字数据,但支持非数字列和索引标签。另请注意,如果numpy = True,则每个术语的JSON顺序必须相同。","attrs":{}}]}]}],"attrs":{}},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","text":"(2)利用pandas写入json文件","attrs":{}}]},{"type":"codeblock","attrs":{"lang":null},"content":[{"type":"text","text":"import pandas as pd\nimport os\n\n# 获取当前文件父目录路径\nfather_path = os.getcwd()\n# 存储数据文件路径\nwpath_json = father_path+r'\\data01\\temp_trans.json'\ndata = [{\"city\": \"SACRAMENTO\", \"longitude\": -121.434879, \"street\": \"3526 HIGH ST\", \"sq__ft\": 836, \"latitude\": 38.631913, \"sale_date\": \"Wed May 21 00:00:00 EDT 2008\", \"zip\": 95838, \"beds\": 2, \"type\": \"Residential\", \"state\": \"CA\", \"baths\": 1, \"price\": 59222}, {\"city\": \"SACRAMENTO\", \"longitude\": -121.431028, \"street\": \"51 OMAHA CT\", \"sq__ft\": 1167, \"latitude\": 38.478902, \"sale_date\": \"Wed May 21 00:00:00 EDT 2008\", \"zip\": 95823, \"beds\": 3, \"type\": \"Residential\", \"state\": \"CA\", \"baths\": 1, \"price\": 68212}, {\"city\": \"SACRAMENTO\", \"longitude\": -121.443839, \"street\": \"2796 BRANCH ST\", \"sq__ft\": 796, \"latitude\": 38.618305, \"sale_date\": \"Wed May 21 00:00:00 EDT 2008\", \"zip\": 95815, \"beds\": 2, \"type\": \"Residential\", \"state\": \"CA\", \"baths\": 1, \"price\": 68880}]\ndf = 
pd.DataFrame(data)\ndf.to_json(wpath_json)\n复制代码","attrs":{}}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","text":"运行结果","attrs":{}}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null}},{"type":"image","attrs":{"src":"https://static001.geekbang.org/infoq/e0/e0372be6b8cb650e49de9fc1c25badee.webp","alt":"","title":null,"style":[{"key":"width","value":"75%"},{"key":"bordertype","value":"none"}],"href":null,"fromPaste":true,"pastePass":true}},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null}},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","text":"函数解析","attrs":{}}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","text":"to_json(path_or_buf,orient,encoding,index)","attrs":{}}]},{"type":"bulletedlist","content":[{"type":"listitem","attrs":{"listStyle":null},"content":[{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","text":"前三个参数和read_json()里的一样","attrs":{}}]}]},{"type":"listitem","attrs":{"listStyle":null},"content":[{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","text":"index:False则选择不写入索引,默认为True。","attrs":{}}]}]}],"attrs":{}},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null}},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","text":"【注】利用json模块的loads()与dumps()方法也可以实现json文件的读写。","attrs":{}}]},{"type":"heading","attrs":{"align":null,"level":3},"content":[{"type":"text","text":"五、送你的话","attrs":{}}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","text":"我始终觉得,要想学好一门语言,底层是最主要的,所以不要觉得入门的这些基本东西太简单,","attrs":{}},{"type":"text","marks":[{"type":"strong","attrs":{}}],"text":"学好基础,才能成大牛","attrs"
:{}},{"type":"text","text":"。","attrs":{}}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null}},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","text":"坚持 and 努力 : 终有所获。","attrs":{}}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","text":"思想很复杂,","attrs":{}}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","text":"实现很有趣,","attrs":{}}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","text":"只要不放弃,","attrs":{}}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","text":"终有成名日。","attrs":{}}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","text":"—《老表打油诗》","attrs":{}}]},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null}},{"type":"paragraph","attrs":{"indent":0,"number":0,"align":null,"origin":null},"content":[{"type":"text","text":"下期见,我是爱猫爱技术的老表,如果觉得本文对你学习有所帮助,欢迎点赞、评论、关注我!","attrs":{}}]}]}
發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
相關文章