python爬取NBA球員信息
學習了一段時間python,在此寫一個簡單的爬蟲案例,爬取的球員信息,使用HDFS存儲並利用Hive進行簡單的分析。
過程:
- 登錄百度百科NBA球隊頁面,查看HTML源碼
- 編寫python程序,抓取對應url的html content
- 解析html內容,獲取每個球隊url鏈接,進行輪詢
- 解析html層次結構獲取球員列表數據
- 將球員信息存放至數組對象中
- 編寫數據入庫的方法,將爬取的球員數據寫入至mysql數據庫nbaplayers
- 使用"Sqoop"將"nbaplayers"表中的數據導入至Hadoop集羣的Hive庫
- 利用Hive對球員信息進行簡單的分析
關鍵代碼
- 獲取網頁內容
if __name__ == '__main__':
html = getHtmlContent('http://baike.baidu.com/item/NBA')
teams = crawl(html,type='team')
save_data2mysql(teams)
def getHtmlContent(url): #根據type類型判斷使用不同的爬取方式
header = {
'Accept': 'application/json, text/plain, */*',
'Accept-Encoding': 'gzip, deflate, sdch',
'Accept-Language': 'zh-CN,zh;q=0.8',
'Connection': 'keep-alive',
'Content-Type': 'application/json; charset=utf-8',
'User-Agent':'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.71 Safari/537.36'
}
timeout = random.choice(range(80, 180))
while True:
try:
rep = requests.get(url,headers = header,timeout = timeout)
rep.encoding = 'utf-8'
break
except socket.timeout as e:
print( '3:', e)
time.sleep(random.choice(range(8,15)))
except socket.error as e:
print( '4:', e)
time.sleep(random.choice(range(20, 60)))
return rep.text
保存數據至MySQL
def save_data2mysql(data): #保存數據至數據庫或hdfs conn = pymysql.connect(user='root', passwd='hadoop', host='192.168.100.20', db='hadoop', charset="utf8")#注意編碼需要與mysql一致 print('獲取連接') cur = conn.cursor() for teams in data: for player in teams: print(player) team = player['team'] name=player['name'] area=player['area'] age=player['age'] high=player['high'] weight=player['weight'] birthday=player['birthday'] #拼裝sql values = team+"','"+name+"','"+area+"','"+age+"','"+high+"','"+weight+"','"+birthday sql = "insert into nbaplayers (team,name,area,age,high,weight,birthday) values ('"+values+"')" print(sql) cur.execute(sql) #關閉連接 conn.commit() print('關閉連接') cur.close() conn.close()
獲取球隊信息
def getTeamData(html): print('獲取球隊信息') bs = BeautifulSoup(html,'html.parser') teams = {} body = bs.body table = body.find('table',class_='table-view log-set-param') aTags = table.find_all('a') for a in aTags: teams[a.string]=[a['href']] return teams
獲取球員數據
def getPlayersData(html,team): print('獲取球員信息') bs = BeautifulSoup(html,'html.parser') body = bs.body divs = body.find('div',class_='main-content') tableSize=0; i=0; while tableSize<12: table = divs.find_all(attrs={"log-set-param": "table_view"})[i] trs = table.find_all('tr') tableSize = len(trs) i+=1 players=[] for tr in trs: #如果爲第一行表頭則跳過 if trs.index(tr)==0: continue player={} tds = tr.find_all('td') try: name = parserDes(tds[1]) area = parserDes(tds[2]) high = parserDes(tds[3]) weight = parserDes(tds[4]) age = parserDes(tds[5]) birthday = parserDes(tds[6]) except: print(tds) continue if name!='球員' : player['team']=team player['name']=name player['area']=area player['high']=high.replace('米','') player['weight']=weight.replace('公斤','') player['age']=age.replace('歲','') player['birthday']=birthday players.append(player) return players
- Sqoop連接測試
#通過sqoop將mysql數據導入至hdfs中
#sqoop 使用
#檢查連接
./sqoop list-databases --connect jdbc:mysql://192.168.100.20:3306/ --username root --password hadoop
- 導入數據至Hive
#創建與mysql一樣結構的表
sqoop create-hive-table --connect jdbc:mysql://192.168.100.20:3306/hadoop --table nbaplayers --username root --password hadoop --hive-table nbaplayers
#創建表 並導入數據
sqoop import --connect jdbc:mysql://192.168.100.20:3306/hadoop --username root --password hadoop --table nbaplayers --hive-import -m 1
-flag