# 剛學Python沒多久,看到爬蟲這一節,聽老師說要有些東西來記錄的,避免日後遺忘,奉上第一篇博文。
# 代碼寫的難看。
import requests
from bs4 import BeautifulSoup
import csv
import os
class GetWeb:
    """Scraper for fang.com second-hand housing listings.

    Fetches listing pages under *url* and appends one CSV row per
    listing (address, area, floor, orientation, unit price, total
    price, district) to the file at *path*.
    """

    def __init__(self, url, path):
        # Base URL of the listing site and destination CSV file path.
        self.url = url
        self.path = path
        # Collected sub-page URLs (not populated by the current methods).
        self.urllist = []
def downResponse(self):
    """Fetch ``self.url`` and return the decoded HTML text.

    Bug fix: the original body referenced a bare name ``url`` that does
    not exist in this scope (NameError); it clearly meant ``self.url``.
    The request itself (headers, GBK decoding) was a verbatim copy of
    :meth:`downResponse_1`, so this now delegates instead of duplicating
    the hard-coded Cookie/User-Agent headers.
    """
    return self.downResponse_1(self.url)
def downResponse_1(self, url):
    """Fetch *url* and return its body decoded as GBK text.

    Sends a fixed logged-in session Cookie plus a desktop Chrome
    User-Agent so the site serves the full listing pages.
    """
    request_headers = {
        "Cookie": "sf_source=; s=; showAdxian=1; city=xian; global_cookie=274p85ahuvfgc793ht9o7e0zl13jn7bnaez; logGuid=bcfdd12f-f512-4084-bc48-42c8abc3dfe5; Integrateactivity=notincludemc; token=a84e9318cc984845a3fde631d80d540d; Captcha=477369645A665872443673526645776C2F59424E6B6346636942396375797673707457784157507635534A636A784D547944324C6148724A6258375552324E7145714B6A63444E6E6C68383D; sfut=65E8C2B28829B269A1015F9EEE58CFE78C9EA1F38E99E641999C8AA688FA52130AE5F588586636A2EFA84A3E31AAEB86A705C9E858289D0049C77A9EE67FC762830FFA9E700E3A33262D8ECA956C188FB6A8253F4FFB68F9; new_loginid=99460018; login_username=fang7487850776; unique_cookie=U_274p85ahuvfgc793ht9o7e0zl13jn7bnaez*10",
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36",
    }
    page = requests.get(url, headers=request_headers)
    # The site serves GBK-encoded pages; decode accordingly.
    page.encoding = "gbk"
    return page.text
def parseResponse(self):
    """Fetch the page at ``self.url`` and return every listing box
    (all ``<dl class="clearfix">`` elements)."""
    html = self.downResponse()
    document = BeautifulSoup(html, "lxml")
    return document.find_all("dl", attrs={"class": "clearfix"})
def parseResponse_1(self, url):
    """Fetch *url* and return every listing box on that sub-page
    (all ``<dl class="clearfix">`` elements).

    Bug fix: the progress print logged ``self.url`` (the site root)
    for every sub-page; it now logs the URL actually being fetched.
    """
    print(url)  # progress log, one line per page fetched
    response = self.downResponse_1(url)
    soup = BeautifulSoup(response, "lxml")
    return soup.find_all("dl", attrs={"class": "clearfix"})
def createCsv(self):
try:
os.makedirs(os.path.abspath(self.path + os.path.sep +".."))
except:
pass
fd = os.open(self.path, os.O_CREAT | os.O_RDWR)
os.close(fd)
def change(self, oldstr):
    """Return *oldstr* with its first and last character removed
    (strips the delimiters around a value, e.g. an area figure)."""
    return oldstr[1:-1]
def changefool(self, oldfool):
    """Return *oldfool* without its leading character
    (drops the '|' prefix before floor/orientation text)."""
    return oldfool[1:]
def getUl(self, url, quyu):
    """Scrape one listing page and append its rows to the CSV.

    Each row is [address, area, floor, orientation, unit price, total
    price, district].  Listings with incomplete markup or unparsable
    numbers are skipped.

    :param url: listing page URL to scrape
    :param quyu: district label appended as the last column

    Fixes relative to the original: the CSV handle was reopened once
    per row and never closed (leak); the bare ``except`` is narrowed to
    the failures the parsing can actually raise.
    """
    result = self.parseResponse_1(url)
    self.createCsv()
    # Open once for the whole page and close deterministically.
    with open(self.path, "a", newline="") as fd_csv:
        csv_write = csv.writer(fd_csv, dialect="excel")
        for res in result:
            try:
                huxing = res.find("p", attrs={"class": "tel_shop"}).get_text().split()
                address = res.find("p", attrs={"class": "add_shop"}).find("a").get_text().split()[0]
                price_box = res.find("dd", attrs={"class": "price_right"})
                price = price_box.find("span", attrs={"class": "red"}).get_text()
                meanprice = price_box.find_all("span")[1].get_text()
                row = [
                    address,
                    int(self.change(huxing[1])),   # area, delimiters stripped
                    self.changefool(huxing[2]),    # floor, e.g. "中層(共30層)"
                    self.changefool(huxing[3]),    # orientation, e.g. "東南向"
                    int(meanprice[:-3]),           # unit price, strips trailing "元/㎡"
                    int(price[:-1]),               # total price, strips trailing "萬"
                    quyu,
                ]
            except (AttributeError, IndexError, ValueError):
                continue  # malformed listing markup or bad number: skip row
            csv_write.writerow(row)
def createCSVrol(self):
    """Ensure the CSV exists and append the header row.

    Fix: the original leaked the file handle; ``with`` closes it.
    NOTE(review): the data rows written by the scraping methods carry a
    trailing district column, so this 6-label header is one column
    short of the 7-field rows -- confirm the intended layout.
    """
    self.createCsv()
    with open(self.path, "a", newline="") as fd_csv:
        csv_write = csv.writer(fd_csv, dialect="excel")
        csv_write.writerow(["小區", "面積/㎡", "樓層", "朝向", "單價/萬", "總價/萬"])
def otherulr(self, quyu=""):
    """Scrape listing pages 2-100 under ``self.url`` and append every
    parsable listing to the CSV.

    :param quyu: district label for the last column (default ``""``).

    Bug fixes relative to the original:
      * ``quyu`` was an undefined name, so the bare ``except`` silently
        discarded *every* row; it is now a backward-compatible keyword
        parameter.
      * the CSV handle was reopened per row and never closed.
      * a fetch-and-parse of ``self.url`` whose result was never used
        has been removed.
    """
    with open(self.path, "a", newline="") as fd_csv:
        csv_write = csv.writer(fd_csv, dialect="excel")
        for page in range(2, 101):
            page_url = self.url + "house/i3%d/" % page
            for res in self.parseResponse_1(page_url):
                try:
                    huxing = res.find("p", attrs={"class": "tel_shop"}).get_text().split()
                    address = res.find("p", attrs={"class": "add_shop"}).find("a").get_text().split()[0]
                    price_box = res.find("dd", attrs={"class": "price_right"})
                    price = price_box.find("span", attrs={"class": "red"}).get_text()
                    meanprice = price_box.find_all("span")[1].get_text()
                    row = [
                        address,
                        int(self.change(huxing[1])),   # area
                        self.changefool(huxing[2]),    # floor
                        self.changefool(huxing[3]),    # orientation
                        int(meanprice[:-3]),           # unit price, strips "元/㎡"
                        int(price[:-1]),               # total price, strips "萬"
                        quyu,
                    ]
                except (AttributeError, IndexError, ValueError):
                    continue  # malformed listing: skip
                csv_write.writerow(row)
def _row_from_listing(self, res, quyu):
    """Parse one ``<dl>`` listing box into a CSV row list, or return
    ``None`` when the markup is incomplete or a number fails to parse."""
    try:
        huxing = res.find("p", attrs={"class": "tel_shop"}).get_text().split()
        address = res.find("p", attrs={"class": "add_shop"}).find("a").get_text().split()[0]
        price_box = res.find("dd", attrs={"class": "price_right"})
        price = price_box.find("span", attrs={"class": "red"}).get_text()
        meanprice = price_box.find_all("span")[1].get_text()
        return [
            address,
            int(self.change(huxing[1])),   # area
            self.changefool(huxing[2]),    # floor
            self.changefool(huxing[3]),    # orientation
            int(meanprice[:-3]),           # unit price, strips "元/㎡"
            int(price[:-1]),               # total price, strips "萬"
            quyu,
        ]
    except (AttributeError, IndexError, ValueError):
        return None  # malformed listing markup or bad number

def chengxiurl(self):
    """Walk every district filter link on the front page and scrape its
    listing pages, appending one CSV row per listing (district label as
    the last column).

    Fixes relative to the original: CSV handles are closed via ``with``,
    the bare ``except`` is narrowed, and the triplicated row-parsing
    logic is factored into :meth:`_row_from_listing`.
    """
    front = BeautifulSoup(self.downResponse(), "lxml")
    for ul in front.find_all("ul", attrs={"class": "clearfix choose_screen floatl"}):
        for link in ul.find_all("a"):
            district = link.get_text()
            print(district)  # progress log, kept from the original
            district_url = self.url + str(link.get("href"))
            district_soup = BeautifulSoup(self.downResponse_1(district_url), "lxml")
            for pager in district_soup.find_all("div", attrs={"id": "list_D10_15"}):
                # The last pager link looks like "/<section>/i3<last>/".
                end = pager.find_all("a")[-1].get("href")
                endlist = str(end).split("/")
                section = endlist[1]
                last_page = endlist[-2].split("i3")[1]
                # NOTE(review): range(2, last_page) skips page 1 and the
                # final page, exactly as the original did -- confirm.
                for page in range(2, int(last_page)):
                    page_url = self.url + section + "/i3%s" % page
                    listings = self.parseResponse_1(page_url)
                    with open(self.path, "a", newline="") as fd_csv:
                        writer = csv.writer(fd_csv, dialect="excel")
                        for res in listings:
                            row = self._row_from_listing(res, district)
                            if row is not None:
                                writer.writerow(row)
def star(self):
    """Entry point: scrape the Xi'an site into a hard-coded desktop CSV.

    NOTE(review): this ignores ``self.url``/``self.path`` and builds a
    fresh ``GetWeb`` with hard-coded values, exactly as the original
    did.  The unused local ``quyu = "城北"`` has been removed.
    """
    url = "http://xian.esf.fang.com/"
    path = "C:\\Users\\zq\\Desktop\\xian.csv"
    scraper = GetWeb(url, path)
    scraper.createCSVrol()  # write the header row first
    scraper.chengxiurl()    # then scrape every district
if __name__ == "__main__":
    # Bug fix: the original passed module-level names ``url``/``path``
    # that were never defined at top level (NameError).  star() builds
    # its own GetWeb with these same hard-coded values anyway.
    GetWeb("http://xian.esf.fang.com/", "C:\\Users\\zq\\Desktop\\xian.csv").star()