A simple spider

#!/usr/bin/env python3
# coding=utf-8
#
# date    = 2012-12-01
# version = 0.1.0
#

import os
import sys

from urllib.parse import urljoin, urlsplit
from urllib.parse import quote as urlquote  # imported but not used below yet

from bs4 import BeautifulSoup as htmlparser

import httplib2
http = httplib2.Http(timeout=30)

# Pretend to be a desktop browser so sites are less likely to block the crawler.
headers = {
	'user-agent': 'Opera/9.80 (Windows NT 6.1; WOW64; Edition IBIS) Presto/2.12.388 Version/12.11',
}


# Configuration file name; a wrong name may prevent the program from starting.
# (Not actually read anywhere below yet.)
config = 'configure.conf'

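# Seed URL the crawl starts from; swap in whatever site you want to crawl.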
baseurl = 'http://www.hao123.se/665/'

# Current working directory
work_dir = os.getcwd()

# URLs already visited, and the file the visited set is flushed to
visited_url = set()
visited_url_file = 'visited_url.txt'
abs_visited_url_file = os.path.join(work_dir, visited_url_file)

# External URLs (links pointing to a different host)
foreign_url = set()
foreign_url_file = 'foreign_url.txt'
abs_foreign_url_file = os.path.join(work_dir, foreign_url_file)

# URLs that raised errors while being fetched
error_url = set()
error_url_file = 'error_url.txt'
abs_error_url_file = os.path.join(work_dir, error_url_file)

# Data URLs, the main output: URLs that point at page resources
data_url = set()
data_url_file = 'data_url.txt'
abs_data_url_file = os.path.join(work_dir, data_url_file)

# Frontier: URLs discovered but not yet handled
unhandled_url = set()


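# Flush the collected URL sets to their files. Each flag selects one set; the
# data and error sets are cleared after a flush so later calls do not append
# duplicates, while visited_url stays in memory because is_visited() needs the
# full history.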
def write_to_file(data=True, foreign=False, visited=False, error=True):
	# data_url and error_url are rebound below, so they need a global
	# declaration; the other sets and the file paths are only read.
	global data_url
	global error_url

	if data:
		try:
			with open(abs_data_url_file, 'a+') as fp:
				for url in data_url:
					fp.write(url + '\n')
			data_url = set()
			print('Write to data_url.txt is OK!')
		except Exception as excp:
			raise RuntimeError(excp)

	if foreign:
		try:
			with open(abs_foreign_url_file, 'a+') as fp:
				for url in foreign_url:
					fp.write(url + '\n')
			print('Write to foreign_url.txt is OK!')
		except Exception as excp:
			raise RuntimeError(excp)

	if visited:
		try:
			with open(abs_visited_url_file, 'a+') as fp:
				for url in visited_url:
					fp.write(url + '\n')
			print('Write to visited_url.txt is OK!')
		except Exception as excp:
			raise RuntimeError(excp)

	if error:
		try:
			with open(abs_error_url_file, 'a+') as fp:
				for url in error_url:
					fp.write(url + '\n')
			error_url = set()
			print('Write to error_url.txt is OK!')
		except Exception as excp:
			raise RuntimeError(excp)



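# Best-effort decoding of a fetched page. Pages may be encoded as UTF-8, GBK,
# GB2312 or Big5 (common for Chinese sites), so try each codec in turn.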
def decode(data):
	if isinstance(data, str):
		# Already decoded; return as-is.
		return data
	elif isinstance(data, bytes):
		codes = ['utf-8', 'gbk', 'gb2312', 'big5']
		for code in codes:
			try:
				return data.decode(code)
			except UnicodeDecodeError:
				pass
		raise RuntimeError('decode failed!')
	else:
		raise TypeError('Argument must be str or bytes')


def is_visited(url):
	return url in visited_url

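# Fetch a URL with httplib2; on any failure (network error or non-200 status)
# the URL is recorded in error_url and None is returned.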
def get_content(url):
	try:
		resp, content = http.request(url, headers=headers)
		if resp.status == 200:
			return content
		else:
			# Caught by the except below, so a bad status is logged
			# the same way as a network error.
			raise RuntimeError('get content error: status %d' % resp.status)
	except Exception as excp:
		error_url.add(url)
		print(excp)
		return None

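# A link is "foreign" when its host differs from the host of the page it was found on.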
def is_foreign(baseurl, url):
	return urlsplit(baseurl).netloc != urlsplit(url).netloc
		
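# Fetch one page, decode it, parse it with BeautifulSoup, and classify every
# link found: <a href> links are either queued for crawling or logged as
# foreign; <script> tags are skipped; any other tag carrying a src attribute
# (images, frames, etc.) is recorded as a data URL.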
def parse_content(url):
	content = get_content(url)
	if content is None:
		return
	if isinstance(content, bytes):
		try:
			content = decode(content)
		except RuntimeError as excp:
			print(excp)
			error_url.add(url)
			return
	elif not isinstance(content, str):
		raise TypeError('content type is error!')

	visited_url.add(url)

	html_handle = htmlparser(content, 'html.parser')

	for tag in html_handle.find_all(True):
		if tag.name.lower() == 'script':
			continue
		elif tag.name.lower() == 'a':
			try:
				tag_url = urljoin(url, tag['href'])
			except KeyError:
				continue  # <a> without an href

			if is_visited(tag_url):
				continue
			elif is_foreign(url, tag_url):
				foreign_url.add(tag_url)
			elif tag_url.startswith(url) or url.startswith(tag_url):
				# Only follow links that share a path prefix with the
				# current page, keeping the crawl inside the seed's subtree.
				unhandled_url.add(tag_url)
		elif 'src' in tag.attrs:
			tag_url = urljoin(url, tag['src'])
			if not is_visited(tag_url):
				data_url.add(tag_url)
				visited_url.add(tag_url)

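# Main crawl loop: pop URLs from the frontier until it is empty, parse each
# page, and flush the data/error sets to disk every 20 pages so a crash does
# not lose everything.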
def main():
	unhandled_url.add(baseurl)
	count = 0
	all_count = 0
	while True:
		all_count += 1
		try:
			url = unhandled_url.pop()
		except KeyError:
			# Frontier is empty: the crawl is finished.
			break
		print('%d Now handling: %s' % (all_count, url))

		try:
			parse_content(url)
		except RuntimeError as excp:
			print(excp)
		count += 1
		if count >= 20:
			try:
				write_to_file()
			except RuntimeError as excp:
				print(excp)
				if input('Continue (Y/N)? ').lower() != 'y':
					sys.exit(0)
			count = 0

	# Final flush of all four sets.
	write_to_file(True, True, True, True)
	print('All is OK!\nExiting program....')

if __name__ == '__main__':
	main()
	
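To try the spider, you need Python 3 plus the two third-party packages it imports, beautifulsoup4 and httplib2 (pip install beautifulsoup4 httplib2). Save the listing as, say, spider.py (the post does not name the file), point baseurl at the site you want to crawl, and run python3 spider.py; the visited, foreign, error, and data URL lists are appended to the four *_url.txt files in the current working directory.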
