Python web scraper

I've only been learning Python for a little while, and I've just reached the chapter on web scraping. The teacher said we should write things down so we don't forget them later, so here is my first blog post. The script below crawls second-hand housing listings for Xi'an from fang.com and writes each listing's neighborhood, area, floor, orientation, unit price, total price, and district to a CSV file.
The code is ugly, please bear with me.

import requests
from bs4 import BeautifulSoup
import csv
import os
class GetWeb:

    def __init__(self, url, path):
        self.url = url
        self.path = path
        self.urllist = []

    def downResponse(self):  # fetch the landing page at self.url
        headers = {
            "Cookie": "sf_source=; s=; showAdxian=1; city=xian; global_cookie=274p85ahuvfgc793ht9o7e0zl13jn7bnaez; logGuid=bcfdd12f-f512-4084-bc48-42c8abc3dfe5; Integrateactivity=notincludemc; token=a84e9318cc984845a3fde631d80d540d; Captcha=477369645A665872443673526645776C2F59424E6B6346636942396375797673707457784157507635534A636A784D547944324C6148724A6258375552324E7145714B6A63444E6E6C68383D; sfut=65E8C2B28829B269A1015F9EEE58CFE78C9EA1F38E99E641999C8AA688FA52130AE5F588586636A2EFA84A3E31AAEB86A705C9E858289D0049C77A9EE67FC762830FFA9E700E3A33262D8ECA956C188FB6A8253F4FFB68F9; new_loginid=99460018; login_username=fang7487850776; unique_cookie=U_274p85ahuvfgc793ht9o7e0zl13jn7bnaez*10",
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36"
        }
        response = requests.get(self.url, headers=headers)
        response.encoding = "gbk"  # fang.com serves gbk-encoded pages
        return response.text

    def downResponse_1(self, url):  # same as downResponse, but fetches an arbitrary url
        headers = {
            "Cookie": "sf_source=; s=; showAdxian=1; city=xian; global_cookie=274p85ahuvfgc793ht9o7e0zl13jn7bnaez; logGuid=bcfdd12f-f512-4084-bc48-42c8abc3dfe5; Integrateactivity=notincludemc; token=a84e9318cc984845a3fde631d80d540d; Captcha=477369645A665872443673526645776C2F59424E6B6346636942396375797673707457784157507635534A636A784D547944324C6148724A6258375552324E7145714B6A63444E6E6C68383D; sfut=65E8C2B28829B269A1015F9EEE58CFE78C9EA1F38E99E641999C8AA688FA52130AE5F588586636A2EFA84A3E31AAEB86A705C9E858289D0049C77A9EE67FC762830FFA9E700E3A33262D8ECA956C188FB6A8253F4FFB68F9; new_loginid=99460018; login_username=fang7487850776; unique_cookie=U_274p85ahuvfgc793ht9o7e0zl13jn7bnaez*10",
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36"
        }
        response = requests.get(url, headers=headers)
        response.encoding = "gbk"
        return response.text
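    # The two download methods above are identical apart from where the URL comes
    # from. A minimal merged sketch (hypothetical, not called by anything below):
    # one method with a timeout and a status check, so a failed request raises
    # instead of silently returning an error page.
    def fetch(self, url=None):
        headers = {"User-Agent": "Mozilla/5.0"}  # login cookie omitted in this sketch
        response = requests.get(url or self.url, headers=headers, timeout=10)
        response.raise_for_status()
        response.encoding = "gbk"
        return response.text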

    def parseResponse(self):  # collect every listing box on the landing page
        response = self.downResponse()
        soup = BeautifulSoup(response, "lxml")
        listul = soup.find_all("dl", attrs={"class": "clearfix"})
        return listul

    def parseResponse_1(self, url):  # collect every listing box on a sub-page
        response = self.downResponse_1(url)
        soup = BeautifulSoup(response, "lxml")
        listul = soup.find_all("dl", attrs={"class": "clearfix"})
        return listul

    def createCsv(self):  # make sure the parent directory and the csv file exist
        try:
            os.makedirs(os.path.abspath(self.path + os.path.sep + ".."))
        except OSError:
            pass  # directory already exists
        fd = os.open(self.path, os.O_CREAT | os.O_RDWR)
        os.close(fd)
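    # Note: on Python 3 the try/except above can be written in one line as
    # os.makedirs(os.path.dirname(self.path), exist_ok=True).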

    def change(self, oldstr):  # strip the first and last character, e.g. "|48.5㎡" -> "48.5"
        newstr = oldstr[1:-1]
        return newstr

    def changefool(self, oldfool):  # strip the leading "|" separator, e.g. "|中層(共30層)" -> "中層(共30層)"
        newfool = oldfool[1:]
        return newfool

    def getUl(self, url, quyu):  # write every listing on one page to the csv
        result = self.parseResponse_1(url)
        self.createCsv()
        for res in result:
            spacelist = []
            try:
                huxing = res.find("p", attrs={"class": "tel_shop"}).get_text().split()
                address = res.find("p", attrs={"class": "add_shop"}).find("a").get_text().split()[0]  # neighborhood name
                price = res.find("dd", attrs={"class": "price_right"}).find("span", attrs={"class": "red"}).get_text()
                meanprice = res.find("dd", attrs={"class": "price_right"}).find_all("span")[1].get_text()
                spacelist.append(address)
                newmianji = self.change(huxing[1])
                spacelist.append(float(newmianji))  # area, e.g. 48.5 (float, since int() fails on "48.5")
                newfool = self.changefool(huxing[2])
                spacelist.append(newfool)  # floor, e.g. "中層(共30層)"
                newright = self.changefool(huxing[3])
                spacelist.append(newright)  # orientation, e.g. "東南向"
                newmeanprice = meanprice[:-3]  # drop the trailing "元/㎡"
                spacelist.append(int(newmeanprice))  # unit price, e.g. 17391
                newprice = price[:-1]  # drop the trailing "萬"
                spacelist.append(float(newprice))  # total price, e.g. 50
                spacelist.append(quyu)  # district
                with open(self.path, "a", newline="") as fd_csv:
                    csv.writer(fd_csv, dialect="excel").writerow(spacelist)
            except (AttributeError, IndexError, ValueError):
                pass  # skip listing rows that do not match the expected layout
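    # The try-block in getUl is repeated almost verbatim in otherulr() and
    # chengxiurl() below. A sketch of a shared row parser (hypothetical name,
    # assuming every page uses the same markup); each caller could then do
    # "row = self.parseRow(res)" and write the row whenever it is not None.
    def parseRow(self, res):
        try:
            huxing = res.find("p", attrs={"class": "tel_shop"}).get_text().split()
            address = res.find("p", attrs={"class": "add_shop"}).find("a").get_text().split()[0]
            price_box = res.find("dd", attrs={"class": "price_right"})
            price = price_box.find("span", attrs={"class": "red"}).get_text()
            meanprice = price_box.find_all("span")[1].get_text()
            return [address,
                    float(self.change(huxing[1])),  # area
                    self.changefool(huxing[2]),     # floor
                    self.changefool(huxing[3]),     # orientation
                    int(meanprice[:-3]),            # unit price
                    float(price[:-1])]              # total price
        except (AttributeError, IndexError, ValueError):
            return None  # row does not match the expected layout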


    def createCSVrol(self):  # write the csv header row
        self.createCsv()
        with open(self.path, "a", newline="") as fd_csv:
            # seven columns, matching the seven values written per listing
            csv.writer(fd_csv, dialect="excel").writerow(
                ["小區", "面積(㎡)", "樓層", "朝向", "單價(元/㎡)", "總價(萬)", "區域"])
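    # Note: open(self.path, "a", newline="") uses the platform default text
    # encoding, which handles Chinese on a Chinese-locale Windows machine but
    # may not elsewhere. One option (an assumption, not tested here) is
    # open(self.path, "a", newline="", encoding="utf-8-sig"), which Excel
    # also recognizes.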

    def otherulr(self, quyu):  # build page URLs 2-100 for the main listing and scrape each one
        for i in range(2, 101):
            newulr = self.url + "house/i3%d/" % i
            result = self.parseResponse_1(newulr)
            for res in result:
                spacelist = []
                try:
                    huxing = res.find("p", attrs={"class": "tel_shop"}).get_text().split()
                    address = res.find("p", attrs={"class": "add_shop"}).find("a").get_text().split()[0]  # neighborhood name
                    price = res.find("dd", attrs={"class": "price_right"}).find("span", attrs={"class": "red"}).get_text()
                    meanprice = res.find("dd", attrs={"class": "price_right"}).find_all("span")[1].get_text()
                    spacelist.append(address)
                    newmianji = self.change(huxing[1])
                    spacelist.append(float(newmianji))  # area
                    newfool = self.changefool(huxing[2])
                    spacelist.append(newfool)  # floor
                    newright = self.changefool(huxing[3])
                    spacelist.append(newright)  # orientation
                    newmeanprice = meanprice[:-3]
                    spacelist.append(int(newmeanprice))  # unit price
                    newprice = price[:-1]
                    spacelist.append(float(newprice))  # total price
                    spacelist.append(quyu)  # district, passed in as a parameter
                    with open(self.path, "a", newline="") as fd_csv:
                        csv.writer(fd_csv, dialect="excel").writerow(spacelist)
                except (AttributeError, IndexError, ValueError):
                    pass  # skip listing rows that do not match the expected layout
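    # Example of the page URL built in otherulr, assuming self.url is
    # "http://xian.esf.fang.com/" as in the __main__ block at the bottom:
    #   i = 2  ->  http://xian.esf.fang.com/house/i32/
    #   i = 3  ->  http://xian.esf.fang.com/house/i33/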

    def chengxiurl(self):  # crawl every district filter on the landing page
        response = self.downResponse()
        soup = BeautifulSoup(response, "lxml")
        listul = soup.find_all("ul", attrs={"class": "clearfix choose_screen floatl"})
        for li in listul:
            res = li.find_all("a")
            for text in res:
                result_1 = text.get_text()  # district name, e.g. "城北"
                url = text.get("href")
                print(result_1)  # progress output: which district is being crawled
                newurl = self.url + str(url)
                response = self.downResponse_1(newurl)
                soup_page = BeautifulSoup(response, "lxml")
                pager = soup_page.find_all("div", attrs={"id": "list_D10_15"})
                for fangxiang in pager:
                    end = fangxiang.find_all("a")[-1].get("href")  # href of the "last page" link
                    endlist = str(end).split("/")
                    new_url = endlist[1]  # district path segment
                    endmowei = endlist[-2].split("i3")[1]  # last page number
                    for i in range(2, int(endmowei) + 1):  # +1 so the last page is included
                        new_url_url = self.url + new_url + "/i3%s" % i
                        result = self.parseResponse_1(new_url_url)
                        for res_1 in result:
                            spacelist = []
                            try:
                                huxing = res_1.find("p", attrs={"class": "tel_shop"}).get_text().split()
                                address = res_1.find("p", attrs={"class": "add_shop"}).find("a").get_text().split()[0]  # neighborhood name
                                price = res_1.find("dd", attrs={"class": "price_right"}).find("span", attrs={"class": "red"}).get_text()
                                meanprice = res_1.find("dd", attrs={"class": "price_right"}).find_all("span")[1].get_text()
                                spacelist.append(address)
                                newmianji = self.change(huxing[1])
                                spacelist.append(float(newmianji))  # area
                                newfool = self.changefool(huxing[2])
                                spacelist.append(newfool)  # floor
                                newright = self.changefool(huxing[3])
                                spacelist.append(newright)  # orientation
                                newmeanprice = meanprice[:-3]
                                spacelist.append(int(newmeanprice))  # unit price
                                newprice = price[:-1]
                                spacelist.append(float(newprice))  # total price
                                spacelist.append(result_1)  # district name
                                with open(self.path, "a", newline="") as fd_csv:
                                    csv.writer(fd_csv, dialect="excel").writerow(spacelist)
                            except (AttributeError, IndexError, ValueError):
                                pass  # skip listing rows that do not match the expected layout
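    # Worked example of the pagination parsing in chengxiurl, assuming the
    # "last page" href looks like "/house-a01/i3100/" (the exact path is an
    # assumption):
    #   str(end).split("/")  ->  ["", "house-a01", "i3100", ""]
    #   new_url              ->  "house-a01"
    #   endmowei             ->  "100"   (from "i3100".split("i3")[1])
    # so the inner loop requests /house-a01/i32 through /house-a01/i3100.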



    def star(self):  # entry point: write the header row, then crawl every district
        self.createCSVrol()
        self.chengxiurl()


if __name__ == "__main__":
    url = "http://xian.esf.fang.com/"
    path = "C:\\Users\\zq\\Desktop\\xian.csv"
    p = GetWeb(url, path)
    p.star()
    # to scrape only the main listing for one district instead: p.otherulr("城北")
