一、爬取湛江天氣預報信息
url:http://www.weather.com.cn/weather/10128100101A.shtml
1.思路分析:
①先爬取網頁所有數據,div/ul/li,獲得7天天氣預報的所有信息;
②li下的所有數據進行提取數據;
③保存文件。 --文件操作、json模塊。
import requests
import lxml.html
import json
def parse_url(url, header):
    """Fetch *url* and return the page body decoded as UTF-8.

    Args:
        url: address of the page to download.
        header: dict of HTTP request headers (User-Agent etc.).

    Returns:
        str: the decoded HTML text.

    Raises:
        requests.HTTPError: if the server answers with an error status.
        requests.Timeout: if the server does not respond within 10 s.
    """
    # A timeout keeps the script from hanging forever on a dead server.
    response = requests.get(url, headers=header, timeout=10)
    # Fail loudly on 4xx/5xx instead of parsing an error page downstream.
    response.raise_for_status()
    # response.text guesses the charset and produced mojibake on this site,
    # so decode the raw bytes explicitly instead.
    return response.content.decode("utf-8")
def get_weather_datas(html_content):
    """Extract the 7-day forecast entries from the weather page HTML.

    Args:
        html_content: full HTML text of the weather.com.cn forecast page.

    Returns:
        list[dict]: one dict per day with keys "date", "天氣",
        "最低溫度" and "最高溫度" (keys kept as-is so existing output
        files stay compatible).
    """
    def _first(nodes, default=""):
        # XPath always returns a list; guard against a missing node instead
        # of crashing with IndexError (e.g. the high-temp <span> disappears
        # from the current day's <li> after the evening site update).
        return nodes[0] if nodes else default

    metree = lxml.html.etree
    # Build an XPath-capable tree; HTMLParser auto-repairs broken markup.
    parser = metree.HTML(html_content, metree.HTMLParser())
    # One <li> per forecast day inside the "7 day" container div.
    li_list = parser.xpath("//div[@class='c7d']/ul[@class='t clearfix']/li")

    data = []
    for ele in li_list:
        item = {}
        item["date"] = _first(ele.xpath("./h1/text()"))
        item["天氣"] = _first(ele.xpath("./p[@class='wea']/text()"))
        item["最低溫度"] = _first(ele.xpath("./p[@class='tem']/i/text()"))
        item["最高溫度"] = _first(ele.xpath("./p[@class='tem']/span/text()"))
        data.append(item)
    return data
def save_weather_file(datas):
    """Serialize *datas* to ./file/weather.json (UTF-8, pretty-printed).

    Args:
        datas: list of per-day forecast dicts.
    """
    import os  # local import keeps the module-level import block untouched
    # The original crashed with FileNotFoundError when ./file was missing.
    os.makedirs("./file", exist_ok=True)
    # ensure_ascii=False keeps the Chinese text human-readable in the file.
    json_strs = json.dumps(datas, ensure_ascii=False, indent=2)
    with open("./file/weather.json", "w", encoding="utf-8") as files:
        files.write(json_strs)
    print("數據保存成功!")
def main():
    """Entry point: download, parse and persist the 7-day forecast."""
    http_url = "http://www.weather.com.cn/weather/10128100101A.shtml"
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.131 Safari/537.36"
    }
    # Step 1: fetch the raw HTML of the forecast page.
    html_data = parse_url(http_url, headers)
    # Step 2: pull the per-day records out of the div/ul/li structure.
    weather_datas = get_weather_datas(html_data)
    # Step 3: dump the records to a JSON file.
    save_weather_file(weather_datas)


# Standard script entry guard, conventionally placed at the end of the file.
if __name__ == '__main__':
    main()
2.爬取的數據:
二 、爬取美女網站信息
url:http://www.xiaohuar.com/list-1-1.html
1.思路分析
①獲得所有美女圖片的div列表;
②在當前美女的div中獲得所有信息;
③保存數據。
import requests
import lxml.html
import json
def parse_url(url, header):
    """Fetch *url* and return the page body decoded as GBK.

    Args:
        url: address of the page to download.
        header: dict of HTTP request headers (User-Agent etc.).

    Returns:
        str: the decoded HTML text.

    Raises:
        requests.HTTPError: if the server answers with an error status.
        requests.Timeout: if the server does not respond within 10 s.
    """
    # A timeout keeps the script from hanging forever on a dead server.
    response = requests.get(url, headers=header, timeout=10)
    # Fail loudly on 4xx/5xx instead of parsing an error page downstream.
    response.raise_for_status()
    # This site serves GBK-encoded pages; decode the raw bytes explicitly
    # rather than trusting response.text's charset guess.
    return response.content.decode("gbk")
def get_xiaohua_datas(html_content):
    """Extract the per-entry records from the listing page HTML.

    Args:
        html_content: full HTML text of the xiaohuar.com list page.

    Returns:
        list[dict]: one dict per entry with keys "title", "name",
        "school" and "like_count".
    """
    def _first(nodes, default=""):
        # XPath always returns a list; guard against a missing node instead
        # of crashing with IndexError when an entry lacks some field.
        return nodes[0] if nodes else default

    metree = lxml.html.etree
    # Build an XPath-capable tree; HTMLParser auto-repairs broken markup.
    parser = metree.HTML(html_content, metree.HTMLParser())
    # One child div per entry inside the infinite-scroll container.
    div_list = parser.xpath("//div[@class='item_list infinite_scroll']/div")

    data = []
    for ele in div_list:
        item = {}
        item["title"] = _first(
            ele.xpath("./div[@class='item_t']/div[@class='img']/a/img/@alt"))
        item["name"] = _first(
            ele.xpath("./div[@class='item_t']/div[@class='img']/span/text()"))
        item["school"] = _first(
            ele.xpath("./div[@class='item_t']/div[@class='img']"
                      "/div[@class='btns']/a/text()"))
        item["like_count"] = _first(
            ele.xpath("./div[@class='item_b clearfix']"
                      "/div[@class='items_likes fl']/em/text()"))
        data.append(item)
    return data
def save_xiaohua_file(datas):
    """Serialize *datas* to ./file/xiaohua.json (UTF-8, pretty-printed).

    Args:
        datas: list of per-entry dicts.
    """
    import os  # local import keeps the module-level import block untouched
    # The original crashed with FileNotFoundError when ./file was missing.
    os.makedirs("./file", exist_ok=True)
    # ensure_ascii=False keeps the Chinese text human-readable in the file.
    json_strs = json.dumps(datas, ensure_ascii=False, indent=2)
    with open("./file/xiaohua.json", "w", encoding="utf-8") as files:
        files.write(json_strs)
    print("數據保存成功!")
def main():
    """Entry point: download, parse and persist the listing page."""
    xiaohua_url = "http://www.xiaohuar.com/list-1-1.html"
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.131 Safari/537.36"
    }
    # Step 1: fetch the raw HTML of the list page.
    html_data = parse_url(xiaohua_url, headers)
    # Step 2: pull one record per entry div.
    xiaohua_datas = get_xiaohua_datas(html_data)
    # Step 3: dump the records to a JSON file.
    save_xiaohua_file(xiaohua_datas)


if __name__ == '__main__':
    main()
2.爬取的數據
[
{
"title": "大連國際舞蹈學校校花王鈺萌",
"name": "王鈺萌",
"school": "大連國際舞蹈學校",
"like_count": "159"
},
{
"title": "南昌大學校花曾陽",
"name": "曾陽",
"school": "南昌大學",
"like_count": "220"
},
{
"title": "中國民航大學校花張金玉",
"name": "張金玉",
"school": "中國民航大學",
"like_count": "109"
},
{
"title": "天津財經大學校花卓婭祺",
"name": "卓婭祺",
"school": "天津財經大學",
"like_count": "361"
},
{
"title": "新疆農業大學校花麥合麗婭",
"name": "麥合麗婭",
"school": "新疆農業大學",
"like_count": "53"
},
{
"title": "成都職業技術學院校花楊萍",
"name": "楊萍",
"school": "成都職業技術學院",
"like_count": "108"
},
{
"title": "東北師範大學校花尹思凝",
"name": "尹思凝",
"school": "東北師範大學",
"like_count": "109"
},
{
"title": "北京理工大學珠海學院校花韋若愚",
"name": "韋若愚",
"school": "北京理工大學珠海學院",
"like_count": "122"
},
{
"title": "廈門理工學院校花袁慧",
"name": "袁慧",
"school": "廈門理工學院",
"like_count": "78"
},
{
"title": "湖北藝術學院校花王媛茜",
"name": "王媛茜",
"school": "湖北藝術學院",
"like_count": "96"
},
{
"title": "文光中心校花陳裏佳",
"name": "陳裏佳",
"school": "文光中心",
"like_count": "48"
},
{
"title": "大連外國語大學校花高夢馨",
"name": "高夢馨",
"school": "大連外國語大學",
"like_count": "115"
},
{
"title": "舟山技師學院校花宋世傑",
"name": "宋世傑",
"school": "舟山技師學院",
"like_count": "99"
},
{
"title": "上海財經大學校花徐逸岑",
"name": "徐逸岑",
"school": "上海財經大學",
"like_count": "123"
},
{
"title": "武漢大學校花丁婷婷",
"name": "丁婷婷",
"school": "武漢大學",
"like_count": "121"
},
{
"title": "行健學院校花徐豔琛",
"name": "徐豔琛",
"school": "行健學院",
"like_count": "149"
},
{
"title": "上海交通大學校花唐雨喬",
"name": "唐雨喬",
"school": "上海交通大學",
"like_count": "105"
},
{
"title": "溫州大學校花湯以斯貼",
"name": "湯以斯貼",
"school": "溫州大學",
"like_count": "289"
},
{
"title": "華東大學校花趙夢潔",
"name": "趙夢潔",
"school": "華東大學",
"like_count": "604"
},
{
"title": "鄞州職業高級中學校花翁川美",
"name": "翁川美",
"school": "鄞州職業高級中學",
"like_count": "109"
},
{
"title": "中央戲劇學院校花劉垚昕",
"name": "劉垚昕",
"school": "中央戲劇學院",
"like_count": "585"
},
{
"title": "星源初中校花廖炯炅",
"name": "廖炯炅",
"school": "星源初中",
"like_count": "99"
},
{
"title": "廣州華夏職業學院校花鄧杏琳",
"name": "鄧杏琳",
"school": "廣州華夏職業學院",
"like_count": "97"
},
{
"title": "芷江師範校花滕之雅",
"name": "滕之雅",
"school": "芷江師範",
"like_count": "208"
},
{
"title": "鐵嶺師範校花施玉",
"name": "施玉",
"school": "鐵嶺師範",
"like_count": "186"
}
]
三、總結
爬取數據的基本步驟(以案例二爲例)
1.寫入對應的url
2.請求頭,解析url地址,獲取所有網頁數據信息:
headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.131 Safari/537.36"}
html_data = parse_url(xiaohua_url,headers)
3.獲取數據後可以獲得所有美女圖片的div列表
xiaohua_datas = get_xiaohua_datas(html_data)
4.根據需要的內容來保存數據
save_xiaohua_file(xiaohua_datas)