python:爬取天气预报+美女网站数据信息

一、爬取湛江天气预报信息

 url:http://www.weather.com.cn/weather/10128100101A.shtml

1.思路分析:

①先爬取网页所有数据,div/ul/li,获得7天天气预报的所有信息;

②li下的所有数据进行提取数据;

③保存文件。 --文件操作、json模块。

import requests
import lxml.html
import json

def parse_url(url, header):
    """Fetch the page at *url* and return its HTML as a str.

    Args:
        url: address of the page to request.
        header: dict of HTTP request headers (User-Agent etc.).

    Returns:
        The response body decoded as UTF-8.
    """
    # A timeout keeps the script from hanging forever on a dead server.
    response = requests.get(url, headers=header, timeout=10)
    # response.text guesses the charset and produced mojibake on this site,
    # so decode the raw bytes explicitly as UTF-8 instead.
    return response.content.decode("utf-8")


def get_weather_datas(html_content):
    """Extract the 7-day forecast from the weather page HTML.

    Args:
        html_content: full HTML of the weather page as a str.

    Returns:
        list[dict]: one dict per day with keys "date", "天气",
        "最低温度", "最高温度" (missing fields become "").
    """
    metree = lxml.html.etree
    # Build an XPath-capable tree; HTMLParser auto-repairs sloppy markup.
    parser = metree.HTML(html_content, metree.HTMLParser())
    # One <li> per forecast day (7 in total on this page).
    li_list = parser.xpath("//div[@class='c7d']/ul[@class='t clearfix']/li")

    def _first(node, path, default=""):
        """Return the first XPath match under *node*, or *default* if none."""
        found = node.xpath(path)
        return found[0] if found else default

    data = []
    for ele in li_list:
        # NOTE(review): the original indexed xpath(...)[0] directly, which
        # raises IndexError when a node is absent (e.g. the evening update
        # drops the first day's high-temperature <span>); fall back to "".
        item = {
            "date": _first(ele, "./h1/text()"),
            "天气": _first(ele, "./p[@class='wea']/text()"),
            "最低温度": _first(ele, "./p[@class='tem']/i/text()"),
            "最高温度": _first(ele, "./p[@class='tem']/span/text()"),
        }
        data.append(item)
    return data

def save_weather_file(datas):
    """Serialize *datas* to ./file/weather.json as pretty-printed JSON.

    Args:
        datas: list of per-day forecast dicts.
    """
    import os  # local import so this fix is self-contained

    # Create the target directory first; open() alone cannot do that and
    # would raise FileNotFoundError on a fresh checkout.
    os.makedirs("./file", exist_ok=True)
    # ensure_ascii=False keeps the Chinese keys/values human-readable.
    json_strs = json.dumps(datas, ensure_ascii=False, indent=2)
    with open("./file/weather.json", "w", encoding="utf-8") as files:
        files.write(json_strs)
    print("数据保存成功!")

def main():
    """Drive the weather scrape: download, parse, persist."""
    # Target page plus a browser-like User-Agent so the site serves us.
    http_url = "http://www.weather.com.cn/weather/10128100101A.shtml"
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.131 Safari/537.36",
    }

    # Step 1: download the raw page HTML (requests + UTF-8 decode).
    html_data = parse_url(http_url, headers)

    # Step 2: pull the 7-day forecast out of the div/ul/li structure (XPath).
    weather_datas = get_weather_datas(html_data)

    # Step 3: write the result to ./file/weather.json (json module).
    save_weather_file(weather_datas)


""""python程序入口,通常写在程序末尾处"""
if __name__ == '__main__':
    main()

2.爬取的数据:

  

 


二 、爬取美女网站信息

   url:http://www.xiaohuar.com/list-1-1.html

1.思路分析

   ①获得所有美女图片的div列表;

   ②在当前美女的div中获得所有信息;

   ③保存数据。

import requests
import lxml.html
import json

def parse_url(url, header):
    """Fetch the page at *url* and return its HTML as a str.

    Args:
        url: address of the page to request.
        header: dict of HTTP request headers (User-Agent etc.).

    Returns:
        The response body decoded from GBK (this site serves GBK, not UTF-8).
    """
    # A timeout keeps the scraper from hanging on an unresponsive host.
    response = requests.get(url, headers=header, timeout=10)
    return response.content.decode("gbk")

def get_xiaohua_datas(html_content):
    """Extract one record per photo card from the listing page HTML.

    Args:
        html_content: full HTML of the listing page as a str.

    Returns:
        list[dict]: one dict per card with keys "title", "name",
        "school", "like_count" (missing fields become "").
    """
    metree = lxml.html.etree
    # Build an XPath-capable tree; HTMLParser auto-repairs sloppy markup.
    parser = metree.HTML(html_content, metree.HTMLParser())
    # One <div> per photo card (25 on a full listing page).
    div_list = parser.xpath("//div[@class='item_list infinite_scroll']/div")

    def _first(node, path, default=""):
        """Return the first XPath match under *node*, or *default* if none."""
        found = node.xpath(path)
        return found[0] if found else default

    data = []
    for ele in div_list:
        # NOTE(review): the original indexed xpath(...)[0] directly, which
        # raises IndexError as soon as one card lacks a field; fall back
        # to "" so a single malformed card cannot abort the whole scrape.
        item = {
            "title": _first(ele, "./div[@class='item_t']/div[@class='img']/a/img/@alt"),
            "name": _first(ele, "./div[@class='item_t']/div[@class='img']/span/text()"),
            "school": _first(ele, "./div[@class='item_t']/div[@class='img']/div[@class='btns']/a/text()"),
            "like_count": _first(ele, "./div[@class='item_b clearfix']/div[@class='items_likes fl']/em/text()"),
        }
        data.append(item)
    return data

def save_xiaohua_file(datas):
    """Serialize *datas* to ./file/xiaohua.json as pretty-printed JSON.

    Args:
        datas: list of per-card record dicts.
    """
    import os  # local import so this fix is self-contained

    # Create the target directory first; open() alone cannot do that and
    # would raise FileNotFoundError on a fresh checkout.
    os.makedirs("./file", exist_ok=True)
    # ensure_ascii=False keeps the Chinese values human-readable.
    json_strs = json.dumps(datas, ensure_ascii=False, indent=2)
    with open("./file/xiaohua.json", "w", encoding="utf-8") as files:
        files.write(json_strs)
    print("数据保存成功!")

def main():
    """Drive the listing scrape: download, parse, persist."""
    # Target page plus a browser-like User-Agent so the site serves us.
    xiaohua_url = "http://www.xiaohuar.com/list-1-1.html"
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.131 Safari/537.36",
    }

    # Step 1: download the raw page HTML (GBK-decoded).
    html_data = parse_url(xiaohua_url, headers)

    # Step 2: extract one record per photo-card <div> (XPath).
    xiaohua_datas = get_xiaohua_datas(html_data)

    # Step 3: write the records to ./file/xiaohua.json.
    save_xiaohua_file(xiaohua_datas)


# Script entry point: run the scrape only when executed directly.
if __name__ == '__main__':
    main()

2.爬取的数据

[
  {
    "title": "大连国际舞蹈学校校花王钰萌",
    "name": "王钰萌",
    "school": "大连国际舞蹈学校",
    "like_count": "159"
  },
  {
    "title": "南昌大学校花曾阳",
    "name": "曾阳",
    "school": "南昌大学",
    "like_count": "220"
  },
  {
    "title": "中国民航大学校花张金玉",
    "name": "张金玉",
    "school": "中国民航大学",
    "like_count": "109"
  },
  {
    "title": "天津财经大学校花卓娅祺",
    "name": "卓娅祺",
    "school": "天津财经大学",
    "like_count": "361"
  },
  {
    "title": "新疆农业大学校花麦合丽娅",
    "name": "麦合丽娅",
    "school": "新疆农业大学",
    "like_count": "53"
  },
  {
    "title": "成都职业技术学院校花杨萍",
    "name": "杨萍",
    "school": "成都职业技术学院",
    "like_count": "108"
  },
  {
    "title": "东北师范大学校花尹思凝",
    "name": "尹思凝",
    "school": "东北师范大学",
    "like_count": "109"
  },
  {
    "title": "北京理工大学珠海学院校花韦若愚",
    "name": "韦若愚",
    "school": "北京理工大学珠海学院",
    "like_count": "122"
  },
  {
    "title": "厦门理工学院校花袁慧",
    "name": "袁慧",
    "school": "厦门理工学院",
    "like_count": "78"
  },
  {
    "title": "湖北艺术学院校花王媛茜",
    "name": "王媛茜",
    "school": "湖北艺术学院",
    "like_count": "96"
  },
  {
    "title": "文光中心校花陈里佳",
    "name": "陈里佳",
    "school": "文光中心",
    "like_count": "48"
  },
  {
    "title": "大连外国语大学校花高梦馨",
    "name": "高梦馨",
    "school": "大连外国语大学",
    "like_count": "115"
  },
  {
    "title": "舟山技师学院校花宋世杰",
    "name": "宋世杰",
    "school": "舟山技师学院",
    "like_count": "99"
  },
  {
    "title": "上海财经大学校花徐逸岑",
    "name": "徐逸岑",
    "school": "上海财经大学",
    "like_count": "123"
  },
  {
    "title": "武汉大学校花丁婷婷",
    "name": "丁婷婷",
    "school": "武汉大学",
    "like_count": "121"
  },
  {
    "title": "行健学院校花徐艳琛",
    "name": "徐艳琛",
    "school": "行健学院",
    "like_count": "149"
  },
  {
    "title": "上海交通大学校花唐雨乔",
    "name": "唐雨乔",
    "school": "上海交通大学",
    "like_count": "105"
  },
  {
    "title": "温州大学校花汤以斯贴",
    "name": "汤以斯贴",
    "school": "温州大学",
    "like_count": "289"
  },
  {
    "title": "华东大学校花赵梦洁",
    "name": "赵梦洁",
    "school": "华东大学",
    "like_count": "604"
  },
  {
    "title": "鄞州职业高级中学校花翁川美",
    "name": "翁川美",
    "school": "鄞州职业高级中学",
    "like_count": "109"
  },
  {
    "title": "中央戏剧学院校花刘垚昕",
    "name": "刘垚昕",
    "school": "中央戏剧学院",
    "like_count": "585"
  },
  {
    "title": "星源初中校花廖炯炅",
    "name": "廖炯炅",
    "school": "星源初中",
    "like_count": "99"
  },
  {
    "title": "广州华夏职业学院校花邓杏琳",
    "name": "邓杏琳",
    "school": "广州华夏职业学院",
    "like_count": "97"
  },
  {
    "title": "芷江师范校花滕之雅",
    "name": "滕之雅",
    "school": "芷江师范",
    "like_count": "208"
  },
  {
    "title": "铁岭师范校花施玉",
    "name": "施玉",
    "school": "铁岭师范",
    "like_count": "186"
  }
]

三、总结

爬取数据的基本步骤(以案例二为例)

1.写入对应的url

2.请求头,解析url地址,获取所有网页数据信息:

headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.131 Safari/537.36"} 

html_data = parse_url(xiaohua_url,headers) 

3.获取数据后可以获得所有美女图片的div列表

xiaohua_datas = get_xiaohua_datas(html_data) 

4.根据需要的内容来保存数据

 save_xiaohua_file(xiaohua_datas)

 

發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
相關文章