#!/usr/bin/env python3
#coding=utf-8
#
#data =2012-12-1
#version=0.1.0
#
import os
import sys
from urllib import parse
urljoin = parse.urljoin
urlsplit= parse.urlsplit
urlquote= parse.quote
from bs4 import BeautifulSoup as htmlparser
import httplib2
# Shared httplib2 HTTP client used for every page fetch; 30-second timeout.
http = httplib2.Http(timeout = 30)
# Browser-like User-Agent sent with every request (some sites reject
# obvious bot clients).
headers = {
'user-agent'
:
'Opera/9.80 (Windows NT 6.1; WOW64; Edition IBIS) Presto/2.12.388 Version/12.11',
}
# The triple-quoted text below is a stray module-level string expression
# (a no-op), kept verbatim. It says: "configuration file name; a mistake
# may prevent the program from starting".
'''
配置文件名稱,錯誤可能導致無法啓動程序
'''
config = 'configure.conf'
# NOTE(review): 'global' at module scope is a no-op statement; kept as-is.
global baseurl
baseurl = 'http://www.hao123.se/665/'
# Current working directory: all output files are created here.
work_dir = os.getcwd()
# URLs already visited, and the file they are flushed to.
visited_url = set()
visited_url_file = 'visited_url.txt'
abs_visited_url_file = os.path.join(work_dir, visited_url_file)
# Off-site (external) link URLs.
foreign_url = set()
foreign_url_file = 'foreign_url.txt'
abs_foreign_url_file = os.path.join(work_dir, foreign_url_file)
# URLs whose fetch or handling failed.
error_url = set()
error_url_file = 'error_url.txt'
abs_error_url_file = os.path.join(work_dir, error_url_file)
# Data URLs (taken from 'src' attributes) — the crawler's main output.
data_url = set()
data_url_file = 'data_url.txt'
abs_data_url_file = os.path.join(work_dir, data_url_file)
# Frontier: URLs discovered but not yet processed.
unhandled_url = set()
def write_to_file(data = 1, foreign = 0, visited = 0, error = 1):
    """Append the collected URL sets to their corresponding text files.

    Each flag argument selects one set (1 = write, 0 = skip).
    ``data_url`` and ``error_url`` are reset to empty sets after a
    successful write (as in the original code); ``foreign_url`` and
    ``visited_url`` are deliberately NOT cleared, because ``visited_url``
    is still needed for duplicate detection during the crawl.

    Raises:
        RuntimeError: wrapping any I/O error from the underlying write.
    """
    # Only the sets that are rebound need a 'global' declaration.
    global data_url
    global error_url
    if data == 1:
        _append_urls(abs_data_url_file, data_url,
                     'write to data_url.txt is OK!')
        data_url = set()
    if foreign == 1:
        _append_urls(abs_foreign_url_file, foreign_url,
                     'Write to foreign_url.txt is OK!')
    if visited == 1:
        _append_urls(abs_visited_url_file, visited_url,
                     "Write to visited_url.txt is OK!")
    if error == 1:
        _append_urls(abs_error_url_file, error_url,
                     'Write to error_url.txt is OK')
        error_url = set()

def _append_urls(path, urls, ok_message):
    # Helper: append one URL per line to *path*, then report success.
    # Any failure is wrapped in RuntimeError, matching the original
    # per-set try/except blocks this replaces.
    try:
        with open(path, 'a+') as fp:
            for url in urls:
                fp.write(url + '\n')
        print(ok_message)
    except Exception as excp:
        raise RuntimeError(excp)
def decode(data):
    """Normalize fetched page content to ``str``.

    A ``str`` argument is returned unchanged (it is already decoded text).
    A ``bytes`` argument is decoded by trying common Chinese-web encodings
    in order of likelihood.

    Raises:
        RuntimeError: if no candidate encoding can decode ``data``.
        TypeError: if ``data`` is neither ``str`` nor ``bytes``.
    """
    if isinstance(data, str):
        # BUG FIX: the original returned data.encode('utf-8') here —
        # i.e. *bytes* out of a function whose job is to produce text.
        return data
    if isinstance(data, bytes):
        for code in ('utf-8', 'gbk', 'gb2312', 'big5'):
            try:
                return data.decode(code)
            except (UnicodeDecodeError, LookupError):
                pass
        raise RuntimeError('decode failed!')
    raise TypeError('Argument must be str or bytes')
def is_visited(url):
    """Return True when *url* has already been seen by the crawler."""
    # Membership test against the module-level visited set; no 'global'
    # declaration is needed for a read-only lookup.
    return url in visited_url
def get_content(url):
    """Fetch *url* and return the raw response body, or None on failure.

    Failed URLs (transport errors or non-200 status) are recorded in the
    module-level ``error_url`` set and the error is printed — this
    function never propagates an exception to its caller.
    """
    try:
        resp, content = http.request(url, headers=headers)
    except Exception as excp:
        # Best-effort crawl: record and report, don't abort.
        error_url.add(url)
        print(excp)
        return None
    if resp.status == 200:
        return content
    # Non-200 response. The original raised RuntimeError only to catch it
    # itself; record the failure directly with the same printed message.
    error_url.add(url)
    print('get content error!')
    return None
def is_foreign(baseurl, url):
    """Return True when *url* points at a different host than *baseurl*."""
    base_host = urlsplit(baseurl).netloc
    target_host = urlsplit(url).netloc
    return base_host != target_host
def parse_content(url):
    """Fetch *url*, harvest its links, and sort them into the URL sets.

    - ``<a href>`` links: off-site targets go to ``foreign_url``; same-site
      targets are added to the crawl frontier ``unhandled_url`` only when
      one URL is a prefix of the other (stays inside the start subtree).
    - Any other tag carrying a ``src`` attribute (images, media, ...) is
      recorded in ``data_url`` and marked visited.
    - ``<script>`` tags are skipped entirely.

    Returns None; all results flow through the module-level sets.

    Raises:
        TypeError: if the fetched content is neither bytes, str, nor None.
    """
    # get_content() handles its own errors and returns None on failure,
    # so the original try/except RuntimeError around it was dead code.
    content = get_content(url)
    if content is None:
        return
    if isinstance(content, bytes):
        try:
            content = decode(content)
        except RuntimeError as excp:
            # Best-effort: report and keep the raw bytes; the HTML parser
            # can still work with undecoded input.
            print(excp)
    elif not isinstance(content, str):
        raise TypeError('content type is error!')
    visited_url.add(url)
    html_handle = htmlparser(content)
    for tag in html_handle.findAll(True):
        tag_name = tag.name.lower()
        if tag_name == 'script':
            continue
        if tag_name == 'a':
            try:
                tag_url = urljoin(url, tag['href'])
            except KeyError:
                continue  # anchor without an href attribute
            if is_visited(tag_url):
                continue
            if is_foreign(url, tag_url):
                foreign_url.add(tag_url)
            elif tag_url.startswith(url) or url.startswith(tag_url):
                # Crawl only within the current URL subtree.
                unhandled_url.add(tag_url)
        elif 'src' in tag.attrs:
            tag_url = urljoin(url, tag['src'])
            if not is_visited(tag_url):
                data_url.add(tag_url)
                visited_url.add(tag_url)
def main():
    """Drive the crawl: seed the frontier with ``baseurl`` and pop URLs
    until none remain, flushing collected URLs to disk every 20 pages
    (pausing to ask the operator whether to continue)."""
    unhandled_url.add(baseurl)
    since_flush = 0
    total = 0
    while True:
        total += 1
        try:
            url = unhandled_url.pop()
        except KeyError:
            # Frontier exhausted — the crawl is complete.
            break
        print('%d Now handling:%s' % (total, url))
        try:
            parse_content(url)
        except RuntimeError as excp:
            print(excp)
        since_flush += 1
        if since_flush >= 20:
            try:
                write_to_file()
            except RuntimeError as excp:
                print(excp)
            if input('Continue (Y/N) ?').lower() != 'y':
                sys.exit(0)
            since_flush = 0
    # Final flush: write all four URL sets.
    write_to_file(1, 1, 1, 1)
    print("All is OK!\nExiting program....")
# Script entry point: start crawling from ``baseurl`` when run directly.
if __name__ == '__main__':
    main()
# A simple spider (一個簡單的spider)
# NOTE(review): the lines below are blog-page residue accidentally pasted
# after the code (comment-form boilerplate). They are commented out here
# because as bare text they made the module a syntax error.
# 發表評論
# 所有評論
# 還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.