Python: Scraping Weather Forecast + Beauty Site Data

I. Scraping the Zhanjiang weather forecast

 url:http://www.weather.com.cn/weather/10128100101A.shtml

1. Approach:

① Fetch the whole page and walk down the div/ul/li structure to get all seven days of forecast data;

② Extract the individual fields from each li element;

③ Save to a file, using basic file I/O and the json module.

import requests
import lxml.html
import json

def parse_url(url, header):
    """Fetch the url and return the full page content as text."""
    response = requests.get(url, headers=header)
    # return response.text  -- produces mojibake for this page

    # Take the raw bytes from response.content and decode them explicitly instead.
    return response.content.decode("utf-8")


def get_weather_datas(html_content):
    """Pull the weather information out of the ul > li tags in the page HTML."""

    metree = lxml.html.etree

    # 1. Build the parse tree (constructs an XPath-capable object and
    #    automatically repairs the HTML while parsing).
    parser = metree.HTML(html_content, metree.HTMLParser())

    # 2. Use XPath to select all the li tags.
    li_list = parser.xpath("//div[@class='c7d']/ul[@class='t clearfix']/li")
    # print(li_list)
    # print(len(li_list))  # 7

    # 3. Extract the fields from each li tag.
    # Start with an empty list.
    data = []
    # Loop over the li elements.
    for ele in li_list:
        # One dict per day.
        item = {}
        # Keep drilling down with XPath.
        item["date"] = ele.xpath("./h1/text()")[0]
        item["weather"] = ele.xpath("./p[@class='wea']/text()")[0]
        # item["weather"] = ele.xpath("./p[@class='wea']/@title")[0]  # could also read the title attribute
        item["low_temp"] = ele.xpath("./p[@class='tem']/i/text()")[0]
        item["high_temp"] = ele.xpath("./p[@class='tem']/span/text()")[0]
        # print(item["high_temp"])
        data.append(item)
        # print(data)
    return data


def save_weather_file(datas):
    """Save the scraped data to a file."""

    # Serialize the list to a JSON string and write it out.
    json_strs = json.dumps(datas, ensure_ascii=False, indent=2)
    # print(json_strs)
    # print(type(json_strs))
    with open("./file/weather.json", "w", encoding="utf-8") as files:
        files.write(json_strs)
    print("Data saved successfully!")


def main():
    # Step ①: div/ul/li, get the full 7-day forecast  -- requests, lxml, XPath  -> list
    # Target URL and request headers.
    http_url = "http://www.weather.com.cn/weather/10128100101A.shtml"
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.131 Safari/537.36"
    }
    # Fetch and decode the page.
    html_data = parse_url(http_url, headers)
    # print(html_data)

    # Step ②: extract the fields from each of the 7 li entries with XPath  -> list of dicts
    weather_datas = get_weather_datas(html_data)
    # print(weather_datas)

    # Step ③: save to a file  -- file I/O plus the json module
    save_weather_file(weather_datas)


# Program entry point, conventionally placed at the end of the script.
if __name__ == '__main__':
    main()
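Decoding with a hard-coded "utf-8" works here because weather.com.cn serves this page as UTF-8. As a hedged alternative (not part of the original script; the name parse_url_safe is made up for illustration), the fetch step could also fail loudly on HTTP errors and let requests detect the charset:

import requests

def parse_url_safe(url, header):
    """Illustrative variant: fail on HTTP errors and auto-detect the charset."""
    response = requests.get(url, headers=header, timeout=10)
    response.raise_for_status()  # raise an exception on 4xx/5xx responses
    response.encoding = response.apparent_encoding  # let requests guess the charset
    return response.text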

2. Scraped data:

(The scraped results were shown as screenshots in the original post.)
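Going by the keys assembled in get_weather_datas, each of the seven records saved to weather.json has roughly the following shape (placeholder values only, not actual scraped output):

[
  {
    "date": "<day label>",
    "weather": "<condition>",
    "low_temp": "<low temperature>",
    "high_temp": "<high temperature>"
  }
]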

II. Scraping data from a beauty (xiaohua) site

   url:http://www.xiaohuar.com/list-1-1.html

1. Approach

   ① Get the list of divs, one per photo entry on the page;

   ② Extract every field from the current entry's div;

   ③ Save the data.

import requests
import lxml.html
import json

def parse_url(url, header):
    """Fetch the url and return the full page content as text."""

    response = requests.get(url, headers=header)
    # This site is served as GBK, so decode the raw bytes accordingly.
    return response.content.decode("gbk")


def get_xiaohua_datas(html_content):
    """Extract the entries from the page HTML."""

    metree = lxml.html.etree
    # Build the parse tree.
    parser = metree.HTML(html_content, metree.HTMLParser())
    # Step ①: get the list of divs, one per entry on the page.
    div_list = parser.xpath("//div[@class='item_list infinite_scroll']/div")
    # Start with an empty list.
    data = []
    # print(div_list)
    # print(len(div_list))  # 25
    for ele in div_list:
        item = {}  # one dict per entry
        # Step ②: keep drilling down with XPath to get each field.
        item["title"] = ele.xpath("./div[@class='item_t']/div[@class='img']/a/img/@alt")[0]
        item["name"] = ele.xpath("./div[@class='item_t']/div[@class='img']/span/text()")[0]
        item["school"] = ele.xpath("./div[@class='item_t']/div[@class='img']/div[@class='btns']/a/text()")[0]
        item["like_count"] = ele.xpath("./div[@class='item_b clearfix']/div[@class='items_likes fl']/em/text()")[0]
        # print(item["like_count"])
        data.append(item)
    # print(data)
    return data


def save_xiaohua_file(datas):
    """Save the scraped data to a file."""

    json_strs = json.dumps(datas, ensure_ascii=False, indent=2)
    with open("./file/xiaohua.json", "w", encoding="utf-8") as files:
        files.write(json_strs)
    print("Data saved successfully!")


def main():
    xiaohua_url = "http://www.xiaohuar.com/list-1-1.html"
    headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.131 Safari/537.36"}
    # Fetch and decode the page.
    html_data = parse_url(xiaohua_url, headers)
    # print(html_data)

    # Steps ① and ②: get the div list and extract the fields from each entry.
    xiaohua_datas = get_xiaohua_datas(html_data)

    # Step ③: save the data.
    save_xiaohua_file(xiaohua_datas)


if __name__ == '__main__':
    main()
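Every field above is read with [0] on the XPath result, which raises an IndexError as soon as the page layout changes and a query matches nothing. A small helper (hypothetical, not part of the original script) makes the extraction more forgiving:

def first_or_default(element, xpath_expr, default=""):
    """Return the first XPath match on element, or default if nothing matched."""
    results = element.xpath(xpath_expr)
    return results[0] if results else default

# Usage inside the loop, for example:
# item["name"] = first_or_default(ele, "./div[@class='item_t']/div[@class='img']/span/text()")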

2. Scraped data (contents of xiaohua.json)

[
  {
    "title": "大連國際舞蹈學校校花王鈺萌",
    "name": "王鈺萌",
    "school": "大連國際舞蹈學校",
    "like_count": "159"
  },
  {
    "title": "南昌大學校花曾陽",
    "name": "曾陽",
    "school": "南昌大學",
    "like_count": "220"
  },
  {
    "title": "中國民航大學校花張金玉",
    "name": "張金玉",
    "school": "中國民航大學",
    "like_count": "109"
  },
  {
    "title": "天津財經大學校花卓婭祺",
    "name": "卓婭祺",
    "school": "天津財經大學",
    "like_count": "361"
  },
  {
    "title": "新疆農業大學校花麥合麗婭",
    "name": "麥合麗婭",
    "school": "新疆農業大學",
    "like_count": "53"
  },
  {
    "title": "成都職業技術學院校花楊萍",
    "name": "楊萍",
    "school": "成都職業技術學院",
    "like_count": "108"
  },
  {
    "title": "東北師範大學校花尹思凝",
    "name": "尹思凝",
    "school": "東北師範大學",
    "like_count": "109"
  },
  {
    "title": "北京理工大學珠海學院校花韋若愚",
    "name": "韋若愚",
    "school": "北京理工大學珠海學院",
    "like_count": "122"
  },
  {
    "title": "廈門理工學院校花袁慧",
    "name": "袁慧",
    "school": "廈門理工學院",
    "like_count": "78"
  },
  {
    "title": "湖北藝術學院校花王媛茜",
    "name": "王媛茜",
    "school": "湖北藝術學院",
    "like_count": "96"
  },
  {
    "title": "文光中心校花陳裏佳",
    "name": "陳裏佳",
    "school": "文光中心",
    "like_count": "48"
  },
  {
    "title": "大連外國語大學校花高夢馨",
    "name": "高夢馨",
    "school": "大連外國語大學",
    "like_count": "115"
  },
  {
    "title": "舟山技師學院校花宋世傑",
    "name": "宋世傑",
    "school": "舟山技師學院",
    "like_count": "99"
  },
  {
    "title": "上海財經大學校花徐逸岑",
    "name": "徐逸岑",
    "school": "上海財經大學",
    "like_count": "123"
  },
  {
    "title": "武漢大學校花丁婷婷",
    "name": "丁婷婷",
    "school": "武漢大學",
    "like_count": "121"
  },
  {
    "title": "行健學院校花徐豔琛",
    "name": "徐豔琛",
    "school": "行健學院",
    "like_count": "149"
  },
  {
    "title": "上海交通大學校花唐雨喬",
    "name": "唐雨喬",
    "school": "上海交通大學",
    "like_count": "105"
  },
  {
    "title": "溫州大學校花湯以斯貼",
    "name": "湯以斯貼",
    "school": "溫州大學",
    "like_count": "289"
  },
  {
    "title": "華東大學校花趙夢潔",
    "name": "趙夢潔",
    "school": "華東大學",
    "like_count": "604"
  },
  {
    "title": "鄞州職業高級中學校花翁川美",
    "name": "翁川美",
    "school": "鄞州職業高級中學",
    "like_count": "109"
  },
  {
    "title": "中央戲劇學院校花劉垚昕",
    "name": "劉垚昕",
    "school": "中央戲劇學院",
    "like_count": "585"
  },
  {
    "title": "星源初中校花廖炯炅",
    "name": "廖炯炅",
    "school": "星源初中",
    "like_count": "99"
  },
  {
    "title": "廣州華夏職業學院校花鄧杏琳",
    "name": "鄧杏琳",
    "school": "廣州華夏職業學院",
    "like_count": "97"
  },
  {
    "title": "芷江師範校花滕之雅",
    "name": "滕之雅",
    "school": "芷江師範",
    "like_count": "208"
  },
  {
    "title": "鐵嶺師範校花施玉",
    "name": "施玉",
    "school": "鐵嶺師範",
    "like_count": "186"
  }
]

III. Summary

The basic steps for scraping data (using the second example):

1. Specify the target url.

2. Set the request headers, then fetch and decode the page:

headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.131 Safari/537.36"}

html_data = parse_url(xiaohua_url, headers)

3. Parse the page, select the div list of entries, and extract the wanted fields:

xiaohua_datas = get_xiaohua_datas(html_data)

4. Save whatever fields you need:

save_xiaohua_file(xiaohua_datas)
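
Condensed into a single reusable function, the four steps look roughly like this (a sketch using the same requests + lxml + json stack; the function name scrape and its parameters are illustrative, not part of the original code):

import json
import requests
import lxml.html

def scrape(url, xpath_expr, out_path, encoding="utf-8"):
    """Fetch a page, select nodes with XPath, and dump their text to a JSON file."""
    headers = {"User-Agent": "Mozilla/5.0"}
    html = requests.get(url, headers=headers).content.decode(encoding)
    parser = lxml.html.etree.HTML(html, lxml.html.etree.HTMLParser())
    nodes = parser.xpath(xpath_expr)
    # Flatten each selected node to its concatenated text content.
    data = [node.xpath("string(.)").strip() for node in nodes]
    with open(out_path, "w", encoding="utf-8") as f:
        f.write(json.dumps(data, ensure_ascii=False, indent=2))
    return data

# For the second example this could be called as:
# scrape("http://www.xiaohuar.com/list-1-1.html",
#        "//div[@class='item_list infinite_scroll']/div",
#        "./file/xiaohua_text.json", encoding="gbk")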

 
