#!/usr/bin/python
# -*- encoding: utf-8 -*-
import requests,re
from time import time,sleep
from sys import argv
from threading import Thread
def geturl(content, num=100):
    """Collect Baidu search-result URLs for *content* and write them to a file.

    content: the search keyword.
    num: approximate number of results to collect (Baidu serves 10 per page,
         so num // 10 worker threads are spawned, one per page).

    Side effects: creates an output file named from the script path plus a
    timestamp, and binds the module-global file handle ``f`` that the
    writefile() workers append to.
    """
    url = "https://www.baidu.com/s?"  # Baidu search endpoint
    headers = {
        'User-Agent': r'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.75 Safari/537.36'
    }  # browser User-Agent so Baidu serves normal result pages
    # Output file name: script path minus a 9-char suffix plus a timestamp.
    # NOTE(review): the [:-9] slice assumes a specific script filename length — confirm.
    name = argv[0][:-9] + str(time()) + '.txt'
    global f
    f = open(name, 'a', encoding='utf-8')  # shared handle appended to by worker threads
    print("正在採集url,並寫入文件...")
    try:
        # One thread per result page; 'pn' offset is page_index * 10.
        threads = [
            Thread(target=writefile, args=(url, n * 10, headers, content))
            for n in range(num // 10)
        ]
        for t in threads:
            t.start()
            sleep(1)  # stagger requests to avoid hammering Baidu
        for t in threads:
            t.join()
    finally:
        f.close()  # close even if starting/joining a thread raises
    print("採集完成,輸出文件位置:" + name)
def writefile(url, page, headers, content):
    """Fetch one Baidu result page and append the resolved result URLs to ``f``.

    url: Baidu search endpoint ("https://www.baidu.com/s?").
    page: 'pn' result offset (0, 10, 20, ... — 10 results per page).
    headers: HTTP headers carrying the browser User-Agent.
    content: the search keyword.

    Writes to the module-global file handle ``f`` opened by geturl().
    """
    params = {"wd": content, "ie": "UTF-8", "pn": page, "oq": content, "rsv_pq": "e5f0d50b0004fbc8"}  # query parameters
    r = requests.get(url, params=params, headers=headers)
    # Each search hit is wrapped in an <a data-click="{...}">...</a> element.
    anchors = re.findall(r'<a\s+data-click="{\s+\'F\':.*\s+.*\s+.*\s+.*\s+.*\s+.*\s+.*\s+[\s\S]*?</a>', r.text)
    for anchor in anchors:
        links = re.findall(r'http://www\.baidu\.com/link[^"]*', anchor)
        if not links:
            # Anchor without a redirect link — skip explicitly instead of
            # relying on a bare except to swallow the IndexError.
            continue
        try:
            # Follow Baidu's redirect to recover the real target URL.
            original = requests.get(links[0], headers=headers, timeout=2.5)
            f.write(original.url + '\n')
        except requests.RequestException:
            # Narrow except: skip only network/timeout failures for this
            # link; programming errors are no longer silently hidden.
            pass
if __name__ == '__main__':
    # CLI entry point: <keyword> <approx-result-count>.
    # Fail with a usage message instead of a raw IndexError/ValueError
    # when the arguments are missing.
    if len(argv) < 3:
        raise SystemExit("usage: %s <keyword> <num-results>" % argv[0])
    geturl(argv[1], int(argv[2]))
# Python multithreaded Baidu URL collector.
# (The lines below are comment-section boilerplate scraped along with the
# source blog post; kept as comments so the file remains valid Python.)
# Post a comment / All comments
# No comments yet — want to be the first? Enter your comment above and click publish.