import requests,chardet
from lxml import etree
import re
import pymysql
class MysqlHelper(object):
    """Thin wrapper around a pymysql connection for executing write statements.

    Connection settings default to the original hard-coded values but can be
    overridden, so the helper is reusable against other hosts/databases
    without changing existing callers.
    """

    def __init__(self, host='127.0.0.1', user='root', password='123456',
                 port=3306, database='py101', charset='utf8'):
        # connect() may raise before self.db/self.cursor exist; __del__ guards
        # against that partially-initialized state.
        self.db = pymysql.connect(host=host, user=user, password=password,
                                  port=port, database=database, charset=charset)
        self.cursor = self.db.cursor()

    def mysql_do(self, sql, args=None):
        """Execute a single INSERT/UPDATE/DELETE statement and commit.

        ``args``, when given, is forwarded to ``cursor.execute`` for a
        parameterized query — preferred over string-formatted SQL to avoid
        injection. ``args=None`` preserves the original one-argument behavior.
        """
        self.cursor.execute(sql, args)
        self.db.commit()

    def __del__(self):
        # Best-effort cleanup: the attributes may be missing if __init__
        # failed, and close() itself can fail during interpreter shutdown.
        try:
            self.cursor.close()
            self.db.close()
        except Exception:
            pass
def b(particulars_url, mysql_):
    """Fetch one dytt8 detail page, extract title + download link, store them.

    particulars_url: absolute URL of a movie detail page.
    mysql_: a MysqlHelper instance used to persist the extracted row.

    Logs and returns without inserting when the expected title/link markup
    is not present (layout change, error page), instead of crashing.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.84 Safari/537.36',
    }
    response1 = requests.get(particulars_url, headers=headers, timeout=10)
    # The site serves GB-encoded pages; gbk is a superset of gb2312 and
    # avoids mangled characters on the occasional extended code point.
    response1.encoding = 'gbk'
    html = response1.text

    parsed = _parse_detail(html)
    if parsed is None:
        print('skip (no title/link found):', particulars_url)
        return
    name, html_href = parsed
    print(name)
    print(html_href)

    # repr() quotes and escapes the values; a parameterized query via
    # mysql_do(sql, args) would be safer still against SQL injection.
    sql = 'insert into py333(name,html_href)values({},{})'.format(repr(name), repr(html_href))
    print(sql)
    mysql_.mysql_do(sql)


def _parse_detail(html):
    """Extract ``(title, download_href)`` from a detail page's HTML.

    Returns None when either piece is missing so the caller can skip the page.
    """
    html_ele = etree.HTML(html)
    names = html_ele.xpath('//div[@class="title_all"]/h1/font/text()')
    # The download anchor wraps a <strong>; grab its href attribute.
    match = re.search(r'<a href="(.+)"><strong>', html)
    if not names or match is None:
        return None
    return names[0], match.group(1)
def a():
    """Crawl pages 1-3 of the dytt8 latest-movies index and process every
    detail-page link found via b()."""
    # One DB connection for the whole crawl instead of a new MysqlHelper
    # (and MySQL connection) per list page.
    mysql_ = MysqlHelper()
    headers = {
        # NOTE: the original sent If-Modified-Since / If-None-Match values
        # copied from a browser session; those make the server reply
        # "304 Not Modified" with an empty body, which breaks parsing.
        # Only the User-Agent is needed.
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.84 Safari/537.36',
    }
    for i in range(1, 4):
        list_url = 'http://www.dytt8.net/html/gndy/dyzz/list_23_{}.html'.format(i)
        response = requests.get(list_url, headers=headers, timeout=10)
        html_ele = etree.HTML(response.text)
        # One <table> per movie row in the listing.
        li_list = html_ele.xpath('//div[@class="co_content8"]/ul/td[1]/table')
        for li_ele in li_list:
            xq = li_ele.xpath('./tr[2]/td[2]/b/a/@href')
            if not xq:
                continue  # row without a detail link — skip instead of crashing
            # Some rows carry two anchors; the last one is the real detail link
            # (equivalent to the original "pop the first when there are two").
            particulars_url = 'http://www.dytt8.net' + xq[-1]
            print(particulars_url)
            b(particulars_url, mysql_)
# Script entry point: run the crawl over list pages 1-3 when executed directly.
if __name__ == '__main__':
    a()
# --- Blog-post trailer accidentally pasted along with the source (not code);
# --- kept as comments so the file remains valid Python.
# 爬蟲--電影天堂  (Crawler — Movie Heaven)
# 發表評論 / 所有評論  (Post a comment / All comments)
# 還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
# (No comments yet — enter one above and click publish.)