Scraping Ziroom Housing Listings with Python

Preface

The text and images in this article come from the internet and are for learning and exchange purposes only; they have no commercial use, and copyright belongs to the original author. If there is any problem, please contact us promptly so we can handle it.

Author: Star_Zhao

Knowledge points used in this scrape of Ziroom housing listings:

  • GET requests with requests

  • Parsing HTML with lxml

  • XPath

  • Storage in MongoDB

Main Text

Analyzing the target site

  • url: http://hz.ziroom.com/z/nl/z3.html?p=2 — the p parameter controls pagination (see the sketch after this list)

  • GET request
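
Before writing the crawler, it helps to confirm the pagination pattern by hand. A minimal sketch that just builds the paged URLs (it uses the z2.html listing variant to match the code in the later steps, and the page count 3 is an arbitrary example):

# The p query parameter selects the page of the listing.
base_url = "http://hz.ziroom.com/z/nl/z2.html?p={}"
for page in range(1, 4):  # first 3 pages, purely as an illustration
    print(base_url.format(page))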

Fetching the source of a single page

# -*- coding: utf-8 -*-
import requests
import time
from requests.exceptions import RequestException

def get_one_page(page):
    '''Fetch the source of a single page'''
    try:
        url = "http://hz.ziroom.com/z/nl/z2.html?p=" + str(page)
        headers = {
            'Referer': 'http://hz.ziroom.com/',
            'Upgrade-Insecure-Requests': '1',
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36'
        }
        res = requests.get(url, headers=headers)
        if res.status_code == 200:
            return res.text
        return None
    except RequestException:
        return None

def main():
    page = 1
    print(get_one_page(page))

if __name__ == '__main__':
    main()
    time.sleep(1)

Parsing the source of a single page

  • Parse the HTML document; goal: test the XPath expressions

Save the fetched source to "result.html" in the current folder, then extract the relevant content from it with XPath. Of course, you can also use one of the online XPath tools.
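
The saving step itself is not shown in the original; a minimal sketch of what it might look like, reusing get_one_page from the previous step:

# Save the fetched source to result.html so the XPath expressions
# can be tested against a local copy of the page.
html = get_one_page(1)
with open("result.html", "w", encoding="utf-8") as f:
    f.write(html)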

from lxml import etree

# Parse the saved HTML document
html = etree.parse("./result.html", etree.HTMLParser())
results = html.xpath('//ul[@id="houseList"]/li')
for result in results[1:]:
    # Drop the first 5 characters of the title (a fixed prefix)
    title = result.xpath("./div/h3/a/text()")[0][5:] if len(result.xpath("./div/h3/a/text()")[0]) > 5 else ""
    location = result.xpath("./div/h4/a/text()")[0].replace("[", "").replace("]", '')
    area = " ".join(result.xpath("./div/div/p[1]/span/text()")).replace(" ", "", 1)  # join the list items with " ", then drop the leading space
    nearby = result.xpath("./div/div/p[2]/span/text()")[0]
    print(title)
    print(location)
    print(area)
    print(nearby)

Parsing the source code

from lxml import etree

def parse_one_page(sourcehtml):
    '''Parse the source of a single page'''
    contentTree = etree.HTML(sourcehtml)   # parse the source code
    results = contentTree.xpath('//ul[@id="houseList"]/li')  # extract the target nodes with XPath
    for result in results[1:]:
        title = result.xpath("./div/h3/a/text()")[0][5:] if len(result.xpath("./div/h3/a/text()")[0]) > 5 else ""
        location = result.xpath("./div/h4/a/text()")[0].replace("[", "").replace("]", '')
        area = " ".join(result.xpath("./div/div/p[1]/span/text()")).replace(" ", "", 1)  # join the list items with " ", then drop the leading space
        nearby = result.xpath("./div/div/p[2]/span/text()")[0]
        yield {
            "title": title,
            "location": location,
            "area": area,
            "nearby": nearby
        }

def main():
    page = 1
    html = get_one_page(page)
    print(type(html))
    for item in parse_one_page(html):
        print(item)

if __name__ == '__main__':
    main()
    time.sleep(1)

Fetching multiple pages

def parse_one_page(sourcehtml):
    '''Parse the source of a single page'''
    contentTree = etree.HTML(sourcehtml)   # parse the source code
    results = contentTree.xpath('//ul[@id="houseList"]/li')  # extract the target nodes with XPath
    for result in results[1:]:
        title = result.xpath("./div/h3/a/text()")[0][5:] if len(result.xpath("./div/h3/a/text()")[0]) > 5 else ""
        location = result.xpath("./div/h4/a/text()")[0].replace("[", "").replace("]", '')
        area = " ".join(result.xpath("./div/div/p[1]/span/text()")).replace(" ", "", 1)  # join the list items with " ", then drop the leading space
        # nearby = result.xpath("./div/div/p[2]/span/text()")[0].strip() needs a guard here; rewritten as the line below
        nearby = result.xpath("./div/div/p[2]/span/text()")[0].strip() if len(result.xpath("./div/div/p[2]/span/text()")) > 0 else ""
        yield {
            "title": title,
            "location": location,
            "area": area,
            "nearby": nearby
        }

def get_pages():
    """Get the total number of pages"""
    page = 1
    html = get_one_page(page)
    contentTree = etree.HTML(html)
    # The pager text looks like "共N页" ("N pages in total"); strip() removes the surrounding characters, leaving N
    pages = int(contentTree.xpath('//div[@class="pages"]/span[2]/text()')[0].strip("共页"))
    return pages

def main():
    pages = get_pages()
    print(pages)
    for page in range(1, pages + 1):
        html = get_one_page(page)
        for item in parse_one_page(html):
            print(item)

if __name__ == '__main__':
    main()
    time.sleep(1)
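
The strip("共页") call in get_pages is easy to misread: str.strip takes a set of characters, not a substring, and removes any of them from both ends. A quick check of how the pager text "共12页" becomes a number:

# str.strip removes any of the given characters from both ends,
# so "共12页" ("12 pages in total") becomes "12".
print("共12页".strip("共页"))        # -> 12
print(int("共12页".strip("共页")))   # -> 12, usable as an integer page count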

Storing the data in MongoDB

Make sure the MongoDB service is already running, otherwise storage is bound to fail.
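
If you want to verify the server is reachable before launching the crawler, a minimal sketch using pymongo's ping command (the localhost connection and the 2-second timeout are assumptions matching the local setup used in this article):

import pymongo

# Connect to the local MongoDB instance and issue a ping;
# this raises an exception if the server is not reachable.
client = pymongo.MongoClient(host="localhost", serverSelectionTimeoutMS=2000)
client.admin.command("ping")
print("MongoDB is up")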

def save_to_mongodb(result):
    """Store a record in MongoDB"""
    # Create the connection object, i.e. connect to the local instance
    client = pymongo.MongoClient(host="localhost")
    # Select the database, here iroomz
    db = client.iroomz
    # Select the collection name, here roominfo
    db_table = db.roominfo
    try:
        # Store the record in the database
        # (insert_one replaces the insert method deprecated in PyMongo 3)
        if db_table.insert_one(result):
            print("---stored in the database successfully---", result)
    except Exception:
        print("---failed to store in the database---", result)

Complete code

# -*- coding: utf-8 -*-

import requests
import time
import pymongo
from lxml import etree
from requests.exceptions import RequestException

def get_one_page(page):
    '''Fetch the source of a single page'''
    try:
        url = "http://hz.ziroom.com/z/nl/z2.html?p=" + str(page)
        headers = {
            'Referer': 'http://hz.ziroom.com/',
            'Upgrade-Insecure-Requests': '1',
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36'
        }
        res = requests.get(url, headers=headers)
        if res.status_code == 200:
            return res.text
        return None
    except RequestException:
        return None

def parse_one_page(sourcehtml):
    '''Parse the source of a single page'''
    contentTree = etree.HTML(sourcehtml)   # parse the source code
    results = contentTree.xpath('//ul[@id="houseList"]/li')  # extract the target nodes with XPath
    for result in results[1:]:
        title = result.xpath("./div/h3/a/text()")[0][5:] if len(result.xpath("./div/h3/a/text()")[0]) > 5 else ""
        location = result.xpath("./div/h4/a/text()")[0].replace("[", "").replace("]", '')
        area = " ".join(result.xpath("./div/div/p[1]/span/text()")).replace(" ", "", 1)  # join the list items with " ", then drop the leading space
        # the guard below avoids an IndexError when the XPath matches nothing
        nearby = result.xpath("./div/div/p[2]/span/text()")[0].strip() if len(result.xpath("./div/div/p[2]/span/text()")) > 0 else ""
        data = {
            "title": title,
            "location": location,
            "area": area,
            "nearby": nearby
        }
        save_to_mongodb(data)

def get_pages():
    """Get the total number of pages"""
    page = 1
    html = get_one_page(page)
    contentTree = etree.HTML(html)
    # The pager text looks like "共N页"; strip() removes the surrounding characters, leaving N
    pages = int(contentTree.xpath('//div[@class="pages"]/span[2]/text()')[0].strip("共页"))
    return pages

def save_to_mongodb(result):
    """Store a record in MongoDB"""
    # Create the connection object, i.e. connect to the local instance
    client = pymongo.MongoClient(host="localhost")
    # Select the database, here iroomz
    db = client.iroomz
    # Select the collection name, here roominfo
    db_table = db.roominfo
    try:
        # Store the record in the database
        if db_table.insert_one(result):
            print("---stored in the database successfully---", result)
    except Exception:
        print("---failed to store in the database---", result)

def main():
    pages = get_pages()
    print(pages)
    for page in range(1, pages + 1):
        html = get_one_page(page)
        parse_one_page(html)

if __name__ == '__main__':
    main()
    time.sleep(1)
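
One small caveat: the time.sleep(1) after main() runs only once, after everything has finished, so it does not actually throttle the requests. If the goal is to be polite to the server, a common variant (my own suggestion, not part of the original) pauses between page fetches instead:

def main():
    pages = get_pages()
    print(pages)
    for page in range(1, pages + 1):
        html = get_one_page(page)
        parse_one_page(html)
        time.sleep(1)  # pause between pages to avoid hammering the server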

Final result

[figure: screenshot of the final result]

Summary

Notes on using XPath in step three

title = result.xpath("./div/h3/a/text()")
The dot '.' here must not be forgotten: it denotes the current node. Without it, '/' would select from the root node instead.
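
To make the difference concrete, a small sketch (the HTML snippet is made up for illustration):

from lxml import etree

html = etree.HTML("""
<ul id="houseList">
  <li><div><h3><a>room A</a></h3></div></li>
  <li><div><h3><a>room B</a></h3></div></li>
</ul>""")

li = html.xpath('//ul[@id="houseList"]/li')[0]
print(li.xpath("./div/h3/a/text()"))   # ['room A'] - relative to this <li>
print(li.xpath("//div/h3/a/text()"))   # ['room A', 'room B'] - '//' searches the whole document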

In step four, while fetching multiple pages, an index-out-of-range error appeared

nearby = result.xpath("./div/div/p[2]/span/text()")[0].strip()

IndexError: list index out of range

There are two possible causes of this error:

  1. [index] the index exceeds the range of the list

  2. [index] the list being indexed is empty

Since the index used for nearby is 0, the first case is ruled out, so the list here must be empty, i.e. the XPath matched nothing for that row; adding an if check solves it

nearby = result.xpath("./div/div/p[2]/span/text()")[0].strip()
# after rewriting:
nearby = result.xpath("./div/div/p[2]/span/text()")[0].strip() if len(result.xpath("./div/div/p[2]/span/text()")) > 0 else ""
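
Since the same guard pattern would apply to every field extracted in the parser, a hypothetical helper (my own addition, not from the original) can keep the expressions readable:

def first_or_default(items, default=""):
    """Return the first element of a list, stripped, or a default if the list is empty."""
    return items[0].strip() if items else default

# usage inside the loop:
# nearby = first_or_default(result.xpath("./div/div/p[2]/span/text()"))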

The above is mainly a summary of what I learned during this crawling exercise. If anything is wrong, please point it out, thank you!
