import sqlite3
import bs4
import requests
class MyException(Exception):
    """Error raised by this script when the proxy page request fails.

    The constructor is inherited from ``Exception`` unchanged, so the class
    can still be raised with no arguments and may optionally carry a message
    (``MyException('reason')``).
    """
def get_ip():
    """Scrape the proxy listing page and return one formatted string per row.

    Requests ``http://www.xicidaili.com/nn/`` and reads every ``<tr>`` found
    under the element with id ``ip_list``. Rows that contain no ``<td>`` cells
    are skipped, as are rows with too few cells to hold the wanted columns.

    Returns:
        list[str]: one entry per usable row. Each entry consists of five
        values, each wrapped in double quotes and joined by commas, in the
        order ip, port, type, survival_time, verification_time.

    Raises:
        MyException: if the HTTP response status is not OK.
        requests.exceptions.RequestException: if the request itself fails,
            including when the timeout expires.
    """
    url = 'http://www.xicidaili.com/nn/'
    headers = {
        'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36',
        # NOTE(review): presumably left empty so the server does not send a
        # compressed body -- confirm before changing.
        'Accept-Encoding': '',
    }
    # Without a timeout requests can wait on an unresponsive server forever.
    res = requests.get(url, headers=headers, timeout=10)
    if not res.ok:
        raise MyException()
    text = res.content.decode('utf-8')
    soup = bs4.BeautifulSoup(text, 'lxml')
    elems = soup.select('#ip_list tr')

    # The wanted <td> cells are 1, 2, 5, 8 and 9, which hold respectively
    # ip, port, type, survival_time and verification_time.
    wanted_cells = (1, 2, 5, 8, 9)
    min_cells = max(wanted_cells) + 1

    info_list = []
    for elem in elems:
        td_list = elem.find_all('td')
        # Skip rows without data cells and malformed rows that are too short
        # to index safely.
        if len(td_list) < min_cells:
            continue
        td_info = ['"' + td_list[i].getText().strip() + '"' for i in wanted_cells]
        info_list.append(','.join(td_info))
    return info_list
def write_info(info_list):
    """Replace the contents of table ``ip`` in ``ip.db`` with ``info_list``.

    The database file ``ip.db`` is opened (and created if missing) in the
    current working directory. The table is created when it does not exist
    yet, all existing rows are deleted, and one row is inserted per entry of
    ``info_list`` with ids starting at 1. Values are stored exactly as
    received; no unit normalization is applied.

    Args:
        info_list: sequence of strings, each holding five double-quoted
            values separated by commas, in the order ip, port, type,
            survival_time, verification_time.

    Returns:
        None. Progress and failures are reported with ``print``; an entry
        that cannot be inserted is reported and skipped.

    Raises:
        sqlite3.Error: if the existing rows cannot be deleted or the
            transaction cannot be committed.
    """
    # Local import so this function brings its own dependency into scope.
    import csv

    conn = sqlite3.connect('ip.db')
    try:
        # NOTE: the column name 'vertification_time' is misspelled but is
        # part of the on-disk schema, so it is kept as is.
        create_comm = '''create table if not exists ip (
            id integer primary key not null,
            ip text,
            port text,
            type text,
            survival_time text,
            vertification_time text
        );'''
        try:
            conn.execute(create_comm)
        except sqlite3.Error:
            print('can not create table ip...')
        conn.execute('delete from ip')

        # Placeholders keep scraped text out of the SQL statement itself, so
        # quotes in the data cannot break or alter the query.
        insert_comm = 'insert into ip values(?, ?, ?, ?, ?, ?);'
        for row_id, line in enumerate(info_list, start=1):
            try:
                # Split the '"a","b",...' entry back into its raw values.
                fields = next(csv.reader([line]), [])
                conn.execute(insert_comm, [row_id] + fields)
            except (sqlite3.Error, csv.Error):
                # Best effort: report the bad entry and keep going. A wrong
                # number of values surfaces here as a sqlite3 error.
                print('insert fail %s' % row_id)
        conn.commit()
    finally:
        # Always release the connection, even when a statement fails.
        conn.close()
    print('write finish...')
def main():
    """Run the whole job: pass the result of get_ip() to write_info()."""
    write_info(get_ip())
# Script entry point: call main() only when this file is executed directly,
# not when it is imported as a module.
if __name__ == '__main__':
    main()
# python3爬取代理信息並寫入sqlite3數據庫
# 發表評論
# 所有評論
# 還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.