XPath爬取百度搜索結果

服務端使用 web.py + nginx 搭建

主要涉及到lxml中的xpath模塊解析html格式數據

以及各種編碼問題(例如抓取到的頁面需先按 UTF-8 解碼,再交給 lxml 解析)

部分XPath實現:

def parse_baidu(self, body):
		"""Extract search results from a Baidu result page.

		Args:
			body: Raw HTML of the result page as a UTF-8 encoded byte string.

		Returns:
			A list of dicts with the keys 'order', 'link', 'title' and
			'desc', one per result block that carries usable ``data-log``
			metadata. Result blocks whose metadata is missing or malformed
			are skipped. Returns None if the page as a whole cannot be
			parsed.
		"""
		print("parse_baidu ===>>")

		elements = []
		try:
			# The bytes must be decoded as UTF-8 before lxml sees them.
			# NOTE(review): lower() is applied to the whole document, so the
			# titles, descriptions and the URLs read from data-log are
			# lowercased as well -- confirm that this is acceptable.
			page = etree.HTML(body.lower().decode('utf-8'))

			tags = page.xpath(u"//div[@class='result c-result'] | //div[@class='result c-result c-clk-recommend']")
			for tag in tags:
				# .get() returns None instead of raising when the attribute
				# is absent, so one odd block cannot abort the whole page.
				data_log_attrib = tag.get('data-log')
				if not data_log_attrib:
					continue

				try:
					# The attribute uses single quotes; swap them so that
					# json can parse it.
					data_log = json.loads(data_log_attrib.replace('\'', '"'))
					node = {
						'order': data_log['order'],
						'link': data_log['mu'],
					}
				except (ValueError, KeyError, TypeError) as e:
					print("parse_baidu skipped result: {}".format(e))
					continue

				# Title and description are the text content of the first
				# node matched by the respective relative path.
				for key, path in (('title', u"./div/a/h3"), ('desc', u"./div/div")):
					found = tag.xpath(path)
					if found:
						node[key] = found[0].xpath("string(.)").encode('utf-8')
					else:
						node[key] = "Unknown"

				elements.append(node)
			print("parse_baidu <<===")
			return elements

		except Exception as e:
			print("parse_baidu failed {}".format(e))
		print("parse_baidu <<=== end")
		return None
def query_baidu(self, keyword, timeout=10):
		"""Search Baidu for ``keyword`` and return the parsed results.

		Args:
			keyword: Search term; it is encoded as UTF-8 and URL-quoted
				before being placed in the query string.
			timeout: Seconds to wait for the HTTP request before giving up.

		Returns:
			Whatever ``parse_baidu`` returns for the downloaded page, or
			None if the request or the parsing raised an exception.
		"""
		# Imported locally because the module's import block is not part of
		# this snippet.
		import contextlib

		print("query_baidu ===>>")

		try:
			_keyword = urllib.quote(keyword.encode('utf-8'))
			# NOTE(review): apart from "word", every query parameter below is
			# a fixed value (rsv_t, rsv_pq, oq, ...); "oq" in particular is a
			# hard-coded percent-encoded string that does not depend on
			# ``keyword``. Confirm that Baidu still accepts them.
			url_query = "https://m.baidu.com/from=1014284b/s?word=" + _keyword + "&sa=tb&ts=6902153&t_kt=0&ie=utf-8&rsv_t=9e926S4zLuzG32Q2kkM5Tu%252Bc%252B4TbHKAg9WiWPQfnflJUbt8%252BiCpIrXI%252FyApB%252FeM&rsv_pq=17372243552828969370&ss=111&rsv_sug4=14106&inputT=12708&oq=%E4%B9%A0%E8%BF%91%E5%B9%B3"

			request = urllib2.Request(url_query)
			# closing() guarantees the connection is released even when
			# read() raises.
			with contextlib.closing(urllib2.urlopen(request, timeout=timeout)) as response:
				body = response.read()

			# Extract the result items (order, link, title, description).
			return self.parse_baidu(body)

		except Exception as e:
			print("Query Baidu Error: {}".format(e))

		return None

	def query_google(self, keyword):
		"""Placeholder for a Google search backend.

		The keyword is ignored and a fixed notice string is returned.
		"""
		notice = "Not implemented!"
		return notice

	def GET(self):
		"""Handle a GET request: search for the requested keyword and render it.

		The ``keyword`` request parameter (default "demo_keyword") is passed
		to ``query_baidu``; the results are rendered through the
		``template_index`` template together with the keyword and the
		client's "ip:port" string.

		Returns:
			The rendered template, or a "GET Error: ..." string if any step
			raised an exception.
		"""
		try:
			form = web.input(keyword = "demo_keyword")
			search_term = form.keyword
			client_endpoint = "{}:{}".format(web.ctx.ip, web.ctx.env["REMOTE_PORT"])
			arguments = {
				'keyword': search_term,
				'extern_ip': client_endpoint,
			}
			results = self.query_baidu(search_term)
			return self.render.template_index(arguments, results)
		except Exception as err:
			return "GET Error: {}".format(err)

	def POST(self):
		"""Handle a POST request; this is a no-op that returns None."""
		return None



html模版頁面部分實現:

<!--
  Result list. When "elements" is truthy, each entry is rendered with its
  'order' as a heading, its 'link'/'title' as an anchor and its 'desc' as a
  list item; otherwise a fallback heading is shown.
  NOTE(review): the <p> just before the closing div is never closed, and the
  <li> items have no enclosing list element; confirm whether that is intended.
-->
<div id="result"> 
		$if elements:
			$for e in elements:
			<h3>$e['order']</h3>
				$if e['link'] and e['title']:
				<li><a href="$e['link']">$e['title']</a></li>
				$if e['desc']:
					<li>$e['desc']</li>
		$else:
			<h3>Not get values!</h3>
<p>
</div>


發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
相關文章