Python可視化數據學習

使用matplotlib製作圖表

製作簡單的折線圖

import matplotlib.pyplot as plt

# Data series: the inputs and their squares.
input_values = [1, 2, 3, 4, 5]
squares = [value ** 2 for value in input_values]

plt.plot(input_values, squares, linewidth=5)

# Chart title and axis labels.
plt.title("Square Number", fontsize=24)
plt.xlabel("Value", fontsize=14)
plt.ylabel("Square of Value", fontsize=14)

# Font size of the tick labels on both axes.
plt.tick_params(axis='both', labelsize=14)

plt.show()

使用scatter()繪製散點圖

import matplotlib.pyplot as plt


# Plot the squares of 1..100 as small red dots.
x_values = list(range(1, 101))
y_values = [x * x for x in x_values]
plt.scatter(
    x_values,
    y_values,
    c='red',
    edgecolor='none',  # no outline around the markers
    s=4,
)

# Chart title and axis labels.
plt.title("Square Number", fontsize=24)
plt.xlabel("Value", fontsize=14)
plt.ylabel("Square of Value", fontsize=14)

# Font size of the major tick labels on both axes.
plt.tick_params(axis='both', which='major', labelsize=14)

# Visible range of the axes, given as [xmin, xmax, ymin, ymax].
plt.axis([0, 110, 0, 12100])
plt.show()

可以使用RGB顏色模式自定義顏色。要指定自定義顏色，可傳遞參數c ，並將其設置爲一個元組，其中包含三個0~1之間的小數值，它們分別表示紅色、綠色和藍色分量。值越接近0，指定的顏色越深，值越接近1，指定的顏色越淺。

plt.scatter(x_values, y_values, c=(0, 0, 0.8), edgecolor='none', s=40)

顏色映射（colormap）是一系列顏色，它們從起始顏色漸變到結束顏色。在可視化中，顏色映射用於突出數據的規律

plt.scatter(x_values, y_values, c=y_values, cmap=plt.cm.Blues,edgecolor='none', s=40)

參數c 設置成了一個 y 值列表，並使用參數cmap 告訴pyplot 使用哪個顏色映射。這些代碼將 y 值較小的點顯示爲淺藍色，並將 y 值較大的點顯示爲深藍色。

要讓程序自動將圖表保存到文件中，可將對plt.show() 的調用替換爲對plt.savefig() 的調用：

plt.savefig('squares_plot.png', bbox_inches='tight')

隨機漫步

每次行走都完全是隨機的，沒有明確的方向，結果是由一系列隨機決策決定的

random_walk.py

from random import choice

class RandomWalk():
    """Generate the points of a two-dimensional random walk.

    Attributes:
        num_points: Number of points the finished walk contains, counting
            the starting point.
        x_values: X coordinates of the walk; the first entry is 0.
        y_values: Y coordinates of the walk; the first entry is 0.
    """

    def __init__(self, num_points=5000):
        """Store the target length and start the walk at the origin."""
        self.num_points = num_points

        # Every walk starts at (0, 0).
        self.x_values = [0]
        self.y_values = [0]

    def fill_walk(self):
        """Append random points until the walk holds ``num_points`` points."""
        while len(self.x_values) < self.num_points:
            # Pick an independent signed step for each axis.
            x_step = self.get_step()
            y_step = self.get_step()

            # Reject moves that go nowhere.
            if x_step == 0 and y_step == 0:
                continue

            # The next point is the last point shifted by the two steps.
            self.x_values.append(self.x_values[-1] + x_step)
            self.y_values.append(self.y_values[-1] + y_step)

    def get_step(self):
        """Return a signed step along one axis, between -4 and 4.

        A direction (1 or -1) and a distance (0 to 4) are chosen
        independently and multiplied. The intermediate values are kept
        local so that computing a step leaves no scratch attributes on
        the instance.
        """
        direction = choice([1, -1])
        distance = choice([0, 1, 2, 3, 4])
        return direction * distance

rw_visual.py

import matplotlib.pyplot as plt
from random_walk import RandomWalk
# Keep simulating random walks for as long as the program is active.
while True:
    # Build one walk and draw all of its points.
    rw = RandomWalk()
    rw.fill_walk()
    plt.scatter(rw.x_values, rw.y_values, s=5)
    plt.show()

    # Only the exact answer 'n' stops the loop.
    keep_running = input("Make another walk?(y/n):")
    if keep_running == 'n':
        break

給點着色、繪製起點終點

我們將使用顏色映射來指出漫步中各點的先後順序，並刪除每個點的黑色輪廓，讓它們的顏色更明顯。爲根據漫步中各點的先後順序進行着色，我們傳遞參數c ，並將其設置爲一個列表，其中包含各點的先後順序。由於這些點是按順序繪製的，因此給參數c指定的列表只需依次包含各點的序號：代碼中的list(range(rw.num_points)) 生成0~4999這5000個數字。

import matplotlib.pyplot as plt
from random_walk import RandomWalk

while True:
    # Build one walk and draw all of its points.
    rw = RandomWalk()
    rw.fill_walk()

    # Colour each point by its index in the walk, without marker outlines.
    point_numbers = list(range(rw.num_points))
    plt.scatter(rw.x_values, rw.y_values, c=point_numbers,
                cmap=plt.cm.Blues, edgecolors='none', s=5)

    # Emphasise the start point (green) and the end point (red).
    plt.scatter(0, 0, c='green', edgecolors='none', s=100)
    plt.scatter(rw.x_values[-1], rw.y_values[-1], c='red',
                edgecolors='none', s=100)
    plt.show()

    # Only the exact answer 'n' stops the loop.
    keep_running = input("Make another walk?(y/n):")
    if keep_running == 'n':
        break

隱藏座標軸

	--snip--
        plt.scatter(rw.x_values[-1],rw.y_values[-1],c='red',edgecolors='none',s=100)
	# Hide both axes. Fetch the current Axes once with gca(): on recent
	# matplotlib, plt.axes() with no arguments adds a new Axes instead of
	# returning the one that was just drawn on.
	current_axes = plt.gca()
	current_axes.get_xaxis().set_visible(False)
	current_axes.get_yaxis().set_visible(False)
	
	plt.show()
        --snip--

增加點數

rw = RandomWalk(50000)

圖表適合屏幕大小時，更能有效地將數據中的規律呈現出來。爲讓繪圖窗口更適合屏幕大小：

# 設置繪圖窗口的尺寸
    plt.figure(figsize=(10, 6))

函數figure() 用於指定圖表的寬度、高度、分辨率和背景色。你需要給形參figsize 指定一個元組，向matplotlib指出繪圖窗口的尺寸，單位爲英寸。

matplotlib按默認的屏幕分辨率（舊版本爲80像素/英寸，自2.0版起爲100像素/英寸）來換算繪圖窗口的大小，如果上述代碼指定的圖表尺寸不合適，可根據需要調整其中的數字。如果你知道自己的系統的分辨率，可使用形參dpi 向figure() 傳遞該分辨率，以有效地利用可用的屏幕空間，如下所示：

plt.figure(dpi=128, figsize=(10, 6))

使用Pygal模擬擲骰子

要了解使用Pygal可創建什麼樣的圖表，請查看圖表類型畫廊：訪問http://www.pygal.org/ ，單擊Documentation，再單擊Chart types。每個示例都包含源代碼，讓你知道這些圖表是如何生成的。

擲一個骰子

die.py

from random import randint

class Die():
    """A single die with a configurable number of sides."""

    def __init__(self, num_sides=6):
        """Create the die; it has six sides unless told otherwise."""
        self.num_sides = num_sides

    def roll(self):
        """Return a random integer from 1 through ``num_sides`` inclusive."""
        highest_face = self.num_sides
        return randint(1, highest_face)

die_visual.py

from die import Die
import pygal
# Create a six-sided die.
die = Die()

# Roll it 1000 times and keep every outcome.
results = [die.roll() for _ in range(1000)]

# Count how often each face came up.
faces = list(range(1, die.num_sides + 1))
frequencies = [results.count(face) for face in faces]

# Visualise the counts as a bar chart.
hist = pygal.Bar()
hist.title = "Result of rolling one D6 1000 times."
hist.x_labels = faces  # one label per face
hist.x_title = "Result"
hist.y_title = "Frequency of Result"

# Add the data series and render the chart to an SVG file.
hist.add('D6', frequencies)
hist.render_to_file('die_visual.svg')

擲兩個骰子

from die import Die
import pygal
# Create two six-sided dice.
die_1 = Die()
die_2 = Die()

# Roll both dice 1000 times and keep the sum of each throw.
results = [die_1.roll() + die_2.roll() for _ in range(1000)]

# Count how often each possible sum (2 .. max_result) came up.
max_result = die_1.num_sides + die_2.num_sides
possible_sums = list(range(2, max_result + 1))
frequencies = [results.count(total) for total in possible_sums]

# Visualise the counts as a bar chart.
hist = pygal.Bar()
hist.title = "Result of rolling two D6 1000 times."
hist.x_labels = possible_sums
hist.x_title = "Result"
hist.y_title = "Frequency of Result"

# Add the data series and render the chart to an SVG file.
hist.add('D6+D6', frequencies)
hist.render_to_file('dice_visual.svg')

處理csv文件

要在文本文件中存儲數據，最簡單的方式是將數據作爲一系列以逗號分隔的值（CSV）寫入文件。這樣的文件稱爲CSV文件。csv 模塊包含在Python標準庫中，可用於分析CSV文件中的數據行。

import csv

filename = 'sitka_weather_07-2014.csv'
with open(filename) as csv_file:
    reader = csv.reader(csv_file)
    # The first call to next() returns the first row of the file,
    # which holds the column headers.
    header_row = next(reader)

    print(header_row)

調用csv.reader() ，並將前面存儲的文件對象作爲實參傳遞給它，從而創建一個與該文件相關聯的閱讀器（reader ）對象。next() 是Python的內置函數（並非由模塊csv 提供），調用它並將閱讀器對象傳遞給它時，它將返回文件中的下一行；這裏是第一次調用，因此得到的是文件頭所在的第一行。

爲讓文件頭數據更容易理解，將列表中的每個文件頭及其位置打印出來：

for index, column_header in enumerate(header_row):
          print(index, column_header)

對列表調用了enumerate() 來獲取每個元素的索引及其值。（請注意，我們刪除了代碼行print(header_row)）。得知日期和最高氣溫分別存儲在第0列和第1列。

提取並讀取數據

	highs = [int(row[1]) for row in reader] #將字符串轉換爲數字，方便matplotlib讀取
	print(highs)

提取最高氣溫並添加到列表中

繪製氣溫圖表

  import csv

  from matplotlib import pyplot as plt

  # 從文件中獲取最高氣溫
  --snip--
# Plot the daily high temperatures.
fig = plt.figure(dpi=128, figsize=(10, 6))
plt.plot(highs, c='red')

# Format the chart: title, axis labels and tick-label size.
plt.title("Daily high temperatures, July 2014", fontsize=24)
plt.xlabel('', fontsize=16)
plt.ylabel("Temperature (F)", fontsize=16)
plt.tick_params(axis='both', which='major', labelsize=16)

plt.show()

模塊datetime

讀取日期數據時，獲得的是一個字符串，因此我們需要想辦法將字符串'2014-7-1' 轉換爲一個表示相應日期的對象。爲創建一個表示2014年7月1日的對象，可使用模塊datetime 中的方法strptime()。在終端會話中看看strptime() 的工作原理：

>>> from datetime import datetime
>>> first_date = datetime.strptime('2014-7-1', '%Y-%m-%d')

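下表是模塊datetime中設置日期和時間格式的實參：

%A　星期的名稱，如Monday
%B　月份名，如January
%m　用數字表示的月份（01~12）
%d　用數字表示月份中的一天（01~31）
%Y　四位的年份，如2015
%y　兩位的年份，如15
%H　24小時制的小時數（00~23）
%I　12小時制的小時數（01~12）
%p　am或pm
%M　分鐘數（00~59）
%S　秒數（00~61）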

繪製一年時間的天氣圖

import csv
from matplotlib import pyplot as plt
from datetime import datetime
import matplotlib.dates as mdates
# Read the dates and the daily high temperatures from the CSV file.
filename = 'sitka_weather_2014.csv'
with open(filename) as csv_file:
    reader = csv.reader(csv_file)
    header_row = next(reader)  # skip the header row

    dates, highs = [], []
    for row in reader:
        # Column 0 holds the date, column 1 the high temperature.
        dates.append(datetime.strptime(row[0], "%Y-%m-%d"))
        highs.append(int(row[1]))

# Plot the highs against their dates.
fig = plt.figure(dpi=128, figsize=(10, 6))
plt.plot(dates, highs, c='red')

# Format the chart.
plt.title("Daily high temperatures - 2014", fontsize=24)
plt.xlabel("", fontsize=16)

# One major tick per month on the x axis, labelled as month/year.
x_axis = plt.gca().xaxis
x_axis.set_major_formatter(mdates.DateFormatter('%m/%Y'))
x_axis.set_major_locator(mdates.MonthLocator())
fig.autofmt_xdate()  # slant the date labels

plt.ylabel("Temperature(F)", fontsize=16)
plt.tick_params(axis='both', which='major', labelsize=10)

plt.show()

問題1：橫座標刻度太少，自定義配置橫座標

plt.gca().xaxis.set_major_formatter(mdates.DateFormatter('%m/%Y'))
plt.gca().xaxis.set_major_locator(mdates.MonthLocator())

問題2：列表解析法分別讀取文件兩行內容，有一行是空的

# The csv reader is a one-shot iterator: this comprehension consumes every
# remaining row, so nothing is left for the comprehension below.
highs = [int(row[1]) for row in reader]
# NOTE: "%Y-%M-%D" is not the intended format either (%M means minutes and
# %D is not a strptime directive; the dates use "%Y-%m-%d"). Nothing is
# raised here only because the exhausted reader yields no rows.
dates = [datetime.strptime(row[0],"%Y-%M-%D") for row in reader]
print(len(highs))
print(len(dates))

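第二行讀取的內容是空的，原因是：csv.reader() 返回的閱讀器是一個只能遍歷一次的迭代器，第一個列表解析已經把剩餘的行全部讀完，第二個列表解析再遍歷時已沒有數據，所以dates 是空列表。解決辦法是在同一個for 循環中同時提取日期和氣溫（如前面的完整代碼），或先用rows = list(reader) 把所有行存入列表，再分別對rows 進行列表解析。另外，日期格式應寫作"%Y-%m-%d"（%M 表示分鐘，%D 不是有效的格式指令）。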

添加最低氣溫並給圖表區域着色

在其中再添加最低氣溫數據，使其更有用。從數據文件中提取最低氣溫，並將它們添加到圖表。通過着色來呈現每天的氣溫範圍。爲此，我們將使用方法fill_between() ，它接受一個 x 值系列和兩個 y 值系列，並填充兩個 y值系列之間的空間：

--snip--
#從文件中獲取最高氣溫
filename = 'sitka_weather_2014.csv'
with open(filename) as f:
	reader = csv.reader(f)
	header_row = next(reader)
	dates, highs, lows = [], [], []
	for row in reader:
                --snip--
		low = int(row[3])
		lows.append(low)
#根據數據繪製圖形
fig=plt.figure(dpi=128,figsize=(10,6))
plt.plot(dates,highs,c='red',alpha=0.5)
plt.plot(dates,lows,c='blue',alpha=0.5)
plt.fill_between(dates,highs,lows,facecolor='blue',alpha=0.1)

#設置圖形格式
plt.title("Daily high and low temperatures - 2014",fontsize=24)
--snip--

實參alpha 指定顏色的透明度。Alpha 值爲0表示完全透明，1（默認設置）表示完全不透明。通過將alpha 設置爲0.5，可讓紅色和藍色折線的顏色看起來更淺。

向fill_between() 傳遞了一個 x 值系列：列表dates ，還傳遞了兩個 y 值系列：highs 和lows 。實參facecolor 指定了填充區域的顏色，我們還將alpha 設置成了較小的值0.1，讓填充區域將兩個數據系列連接起來的同時不分散觀察者的注意力。

問題3：橫軸座標如何從y軸開始

plt.xlim(dates[0],dates[-1])

錯誤檢查

我們應該能夠使用有關任何地方的天氣數據來運行highs_lows.py中的代碼，但有些氣象站會偶爾出現故障，未能收集部分或全部其應該收集的數據。缺失數據可能會引發異常，如果不妥善地處理，還可能導致程序崩潰。

加利福尼亞死亡谷的氣溫圖

文件death_valley_2014.csv沒有記錄2014年2月16日的數據，表示最高溫度的字符串爲空。爲解決這種問題，我們在從CSV文件中讀取值時執行錯誤檢查代碼，對分析數據集時可能出現的異常進行處理，如下所示：

--snip--
	for row in reader:
		# Convert all three fields first; an empty temperature string makes
		# int() raise ValueError, and the whole row is then skipped.
		try:
			current_date = datetime.strptime(row[0], "%Y-%m-%d")
			high = int(row[1])
			low = int(row[3])
		except ValueError:
			# Catch only the conversion error so unrelated failures still
			# surface. Assumes the date itself parsed and only a
			# temperature was missing.
			print(current_date,'missing data')
		else:
			# Every field converted, so keep the row.
			dates.append(current_date)
			highs.append(high)
			lows.append(low)
--snip--

從中提取日期、最高氣溫和最低氣溫。只要缺失其中一項數據，Python就會引發ValueError 異常，打印一條錯誤消息，指出缺失數據的日期。打印錯誤消息後，循環將接着處理下一行。如果獲取特定日期的所有數據時沒有發生錯誤，將運行else 代碼塊，並將數據附加到相應列表的末尾

練習1：比較錫特卡和死亡谷的氣溫：在有關錫特卡和死亡谷的圖表中，氣溫刻度反映了數據範圍的不同。爲準確地比較錫特卡和死亡谷的氣溫範圍，需要在y 軸上使用相同的刻度。爲此，請修改y 軸設置，對錫特卡和死亡谷的氣溫範圍進行直接比較（你也可以對任何兩個地方的氣溫範圍進行比較）。你還可以嘗試在一個圖表中呈現這兩個數據集。

import csv
from datetime import datetime
from matplotlib import pyplot as plt

def get_weather_data(filename, dates, highs, lows):
    """Append dates and high/low temperatures from a weather CSV file.

    The first row is skipped as a header. In every other row, column 0 is
    parsed as a "%Y-%m-%d" date and columns 1 and 3 as the integer high
    and low temperature. Rows that cannot be converted are reported on
    standard output and skipped, so the three lists always stay the same
    length.

    Args:
        filename: Path of the CSV file to read.
        dates: List that receives one ``datetime`` per usable row.
        highs: List that receives the matching high temperatures.
        lows: List that receives the matching low temperatures.
    """
    # newline='' is what the csv module documents for files given to it.
    with open(filename, newline='') as f:
        reader = csv.reader(f)
        next(reader, None)  # skip the header; tolerate an empty file

        for row in reader:
            if not row:
                # Blank lines come back as empty lists; nothing to read.
                continue

            # Parse the date on its own so that a bad date is reported
            # with its own text rather than with the previous row's date
            # (or an unbound name on the first row).
            try:
                current_date = datetime.strptime(row[0], "%Y-%m-%d")
            except ValueError:
                print(row[0], "invalid date")
                continue

            try:
                high = int(row[1])
                low = int(row[3])
            except ValueError:
                print(current_date, "missing data")
                continue

            dates.append(current_date)
            highs.append(high)
            lows.append(low)

# Sitka: load the data, then draw it with the stronger colours.
dates, highs, lows = [], [], []
get_weather_data('sitka_weather_2014.csv', dates, highs, lows)

fig = plt.figure(dpi=128, figsize=(10, 6))
plt.plot(dates, highs, c='red', alpha=0.6)
plt.plot(dates, lows, c='blue', alpha=0.6)
plt.fill_between(dates, highs, lows, facecolor='blue', alpha=0.15)

# Death Valley: fresh lists, drawn fainter on the same figure.
dates, highs, lows = [], [], []
get_weather_data('death_valley_2014.csv', dates, highs, lows)

plt.plot(dates, highs, c='red', alpha=0.3)
plt.plot(dates, lows, c='blue', alpha=0.3)
plt.fill_between(dates, highs, lows, facecolor='blue', alpha=0.05)

# Two-line title naming both locations.
title = (
    "Daily high and low temperatures - 2014"
    "\nSitka , AK and Death Valley, CA"
)
plt.title(title, fontsize=20)

# Axis labels and tick formatting.
plt.xlabel('', fontsize=16)
fig.autofmt_xdate()
plt.ylabel("Temperature(F)", fontsize=16)
plt.tick_params(axis='both', which='major', labelsize=16)

# Fixed y range; the x range spans the most recently loaded dates.
plt.ylim(10, 120)
plt.xlim(dates[0], dates[-1])
plt.show()

製作世界人口地圖：JSON格式

獲取數據值

population_data.json文件中的數據是一個列表，其中每個元素都是一個包含四個鍵的字典：國家名、國別碼、年份以及表示人口數量的值。文件中的每個鍵和值都是字符串。爲處理這些人口數據，我們需要將表示人口數量的字符串轉換爲數字值，爲此我們使用函數int() ，但是由於原始數據的格式常常不統一，因此經常會出現錯誤。Python不能直接將包含小數點的字符串 '1127437398.85751' 轉換爲整數（這個小數值可能是人口數據缺失時通過插值得到的）。爲消除這種錯誤，先將字符串轉換爲浮點數，再將浮點數轉換爲整數：

import json
# Load the data into a list.
filename = 'population_data.json'
with open(filename) as json_file:
    pop_data = json.load(json_file)

# Print the 2010 population of every country.
for pop_dict in pop_data:
    if pop_dict['Year'] != '2010':
        continue
    country_name = pop_dict['Country Name']
    # A string such as '1127437398.85751' cannot be passed to int()
    # directly, so convert it to float first.
    population = int(float(pop_dict['Value']))
    print(country_name + ": " + str(population))

獲取兩個字母的國別碼

Pygal中的地圖製作工具要求數據爲特定的格式：用國別碼表示國家，用數字表示人口數量。處理地理政治數據時，經常需要用到幾個標準化國別碼集。population_data.json中包含的是三個字母的國別碼，但Pygal使用兩個字母的國別碼。我們需要想辦法根據國家名獲取兩個字母的國別碼。Pygal使用的國別碼存儲在模塊i18n （internationalization的縮寫）中。字典COUNTRIES 包含的鍵和值分別爲兩個字母的國別碼和國家名。要查看這些國別碼，可從模塊i18n中導入這個字典，並打印其鍵和值：

from pygal.i18n import COUNTRIES

# Print every two-letter code with its country name, ordered by code.
for country_code in sorted(COUNTRIES):
    print(country_code, COUNTRIES[country_code])

問題1：執行程序時，報錯

ModuleNotFoundError: No module named 'pygal.i18n'

經查找是由於pygal.i18n 已經不存在了，現在已經更改成了 pygal_maps_world，需要單獨通過pip下載

pip install pygal_maps_world

然而pip下載報錯，只能從下載tar壓縮包解壓後進入解壓目錄使用python setup.py install命令安裝，安裝成功之後，修改爲

from pygal_maps_world.i18n import COUNTRIES

就可以正常調用國別碼了

製作世界地圖

from pygal_maps_world.maps import World

# Build a world map that highlights the Americas in three groups.
wm = World()
wm.title = 'North, Central, and South America'

# Each series is a label plus a list of two-letter country codes.
wm.add('North America', ['ca', 'mx', 'us'])
wm.add('Central America', ['bz', 'cr', 'gt', 'hn', 'ni', 'pa', 'sv'])
wm.add('South America', ['ar', 'bo', 'br', 'cl', 'co', 'ec', 'gf',
                         'gy', 'pe', 'py', 'sr', 'uy', 've'])

wm.render_to_file('americas.svg')

問題2：執行程序時報錯

import pygal
wm = pygal.Worldmap()

AttributeError: module 'pygal' has no attribute 'Worldmap'

對於繪製世界地圖的指令，也就是我遇到的第二個報錯，語句相應地更改模塊名稱：

from pygal_maps_world.maps import World
wm = World()

繪製完整的世界人口地圖

import json
from country_codes import get_country_code
from pygal_maps_world.maps import World
#將數據加載到一個列表中
--snip--

# Build a dictionary mapping country codes to 2010 populations.
cc_populations = {}
for pop_dict in pop_data:
    if pop_dict['Year'] != '2010':
        continue
    country_name = pop_dict['Country Name']
    # A string such as '1127437398.85751' cannot be passed to int()
    # directly, so convert it to float first.
    population = int(float(pop_dict['Value']))
    code = get_country_code(country_name)
    # Keep only the countries for which a code was returned.
    if code:
        cc_populations[code] = population

# Draw the populations on a world map and render it to an SVG file.
wm = World()
wm.title = 'World Population in 2010,by Country'
wm.add('2010', cc_populations)
wm.render_to_file('world_populations.svg')

創建了一個空字典，用於以Pygal要求的格式存儲國別碼和人口數量。如果返回了國別碼，就將國別碼和人口數量分別作爲鍵和值填充字典cc_populations。創建了一個World 實例，並設置其title 屬性。我們調用了add() ，並向它傳遞由國別碼和人口數量組成的字典。

根據人口數量將國家分組

根據人口數量將國家分成三組——少於1000萬的（85個）、介於1000萬和10億之間的（69個）以及超過10億的（2個）：

--snip--
# Split the countries into three groups by population size.
cc_pops_1, cc_pops_2, cc_pops_3 = {}, {}, {}
for cc, pop in cc_populations.items():
    # Choose the target group, then store the entry once.
    if pop < 10000000:
        group = cc_pops_1
    elif pop < 1000000000:
        group = cc_pops_2
    else:
        group = cc_pops_3
    group[cc] = pop

# Show how many countries ended up in each group.
print(len(cc_pops_1), len(cc_pops_2), len(cc_pops_3))

# One map series per group, rendered to an SVG file.
wm = World()
wm.title = 'World Population in 2010,by Country'
wm.add('0-10m', cc_pops_1)
wm.add('10m-1bn', cc_pops_2)
wm.add('>1bn', cc_pops_3)
wm.render_to_file('world_populations.svg')

使用Pygal設置世界地圖的樣式

這裏仍讓Pygal使用一種基色，但我們將自行指定該基色，並讓三個分組的顏色差別更大：

import json
from country_codes import get_country_code
from pygal_maps_world.maps import World
from pygal.style import RotateStyle
--snip--
#根據人口數量將所有的國家分成三組
cc_pops_1,cc_pops_2,cc_pops_3 ={},{},{}
for cc,pop in cc_populations.items():
	if pop < 10000000:
		--snip--            

wm_style = RotateStyle('#336699')
wm = World(style = wm_style)
wm.title = 'World Population in 2010,by Country'
--snip--

十六進制格式的RGB顏色是一個以井號（#）打頭的字符串，後面跟着6個字符，其中前兩個字符表示紅色分量，接下來的兩個表示綠色分量，最後兩個表示藍色分量。每個分量的取值範圍爲00（沒有相應的顏色）~FF（包含最多的相應顏色）。#336699混合了少量的紅色（33）、多一些的綠色（66）和更多一些的藍色（99），它爲RotateStyle 提供了一種淡藍色基色。