爬貼吧——(1)先爬一下貼吧的帖子列表

import json
from urllib.parse import quote

import requests
from lxml import etree

class TiebaSpider(object):
    """Fetch one Baidu Tieba list page and append its links to a text file.

    The pipeline is: build the URL for the forum, download the page,
    extract the text and ``href`` of the anchors found in the matched
    ``div`` elements, and append the result to ``<tieba_name>.txt``.
    """

    def __init__(self, tieba_name):
        """Store the forum name and prepare the request URL and headers.

        Args:
            tieba_name: Forum name. It is used as the ``kw`` query value
                and as the stem of the output file name.
        """
        self.tieba_name = tieba_name
        # Percent-encode the name so that characters such as "&", "#" or
        # spaces cannot break the query string.
        self.url = "http://tieba.baidu.com/mo/q----,sz@320_240-1-3---/m?kw=" + quote(tieba_name, safe="") + "&lp=7202"
        # A mobile browser User-Agent is sent with every request.
        self.headers = {"User-Agent": "Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Mobile Safari/537.36"}

    def parse_url(self, url, timeout=10):
        """Download ``url`` and return the response body as text.

        Args:
            url: Address to request.
            timeout: Seconds to wait for the server before giving up, so a
                stalled connection cannot hang the spider forever.

        Returns:
            The response body decoded with ``bytes.decode()`` (UTF-8).

        Raises:
            requests.RequestException: On connection errors, timeouts, or
                a 4xx/5xx status code.
        """
        response = requests.get(url, headers=self.headers, timeout=timeout)
        # Fail loudly instead of parsing an error page as if it were a list.
        response.raise_for_status()
        return response.content.decode()

    def get_content_list(self, html_str):
        """Extract link titles and hrefs from the page markup.

        Args:
            html_str: HTML source of the list page.

        Returns:
            A list of dicts with the keys ``"title"`` and ``"href"``. Each
            value is the list of XPath matches for that ``div``, or
            ``None`` when there is no match.
        """
        html = etree.HTML(html_str)
        # NOTE(review): contains(@class,'i') matches every class value that
        # contains the letter "i" - confirm this is narrow enough for the
        # target page.
        div_list = html.xpath("//div[contains(@class,'i')]")
        href_list = []
        for div in div_list:
            # Evaluate each XPath once; an empty result is stored as None.
            titles = div.xpath("./a/text()")
            hrefs = div.xpath("./a/@href")
            href_list.append({
                "title": titles or None,
                "href": hrefs or None,
            })
        return href_list

    def save_content(self, href_list):
        """Append the extracted items to ``<tieba_name>.txt``.

        Each item is written as one JSON document per line. The file is
        opened in append mode with UTF-8 encoding so non-ASCII titles are
        stored readably regardless of the platform's default encoding.

        Args:
            href_list: Items as returned by ``get_content_list``.
        """
        file_name = self.tieba_name + ".txt"
        with open(file_name, "a", encoding="utf-8") as f:
            # file.write() only accepts str, so every item is serialized
            # first; ensure_ascii=False keeps Chinese text human-readable.
            f.writelines(
                json.dumps(item, ensure_ascii=False) + "\n"
                for item in href_list
            )

    def run(self):
        """Run the whole pipeline: fetch, parse, save."""
        # 1. The start URL was built in __init__.
        # 2. Send the request and get the page content.
        html_str = self.parse_url(self.url)
        # 3. Parse the content.
        href_list = self.get_content_list(html_str)
        # 4. Save the content.
        self.save_content(href_list)

if __name__ == "__main__":
    # Script entry point: build a spider for one forum and run it.
    tieba_spider = TiebaSpider(tieba_name="做頭髮")
    tieba_spider.run()

更新中…

發表評論
所有評論
還沒有人評論，想成為第一個評論的人麼？請在上方評論欄輸入並且點擊發布。
相關文章