webpy + nginx框架
主要涉及到lxml中的xpath模塊解析html格式數據
各種編碼問題
部分XPath實現:
def parse_baidu(self, body):
    """Extract search results from a Baidu mobile result page.

    Args:
        body: The raw HTML of the result page, either UTF-8 encoded bytes
            or an already-decoded text string.

    Returns:
        A list of dicts, one per usable result ``<div>``, with the keys
        ``order`` and ``link`` (taken from the element's ``data-log``
        attribute) and ``title`` and ``desc`` (UTF-8 encoded text, or
        ``"Unknown"`` when the expected child element is missing).
        Returns ``None`` if the page as a whole cannot be parsed.
    """
    print("parse_baidu ===>>")
    elements = []
    try:
        # Decode only when we were handed bytes, and never lower-case the
        # markup: lower-casing would corrupt case-sensitive URLs and the
        # title/description text. Undecodable bytes are replaced so that
        # one bad byte does not discard the whole page.
        if isinstance(body, bytes):
            html = body.decode('utf-8', 'replace')
        else:
            html = body
        page = etree.HTML(html)
        tags = page.xpath(u"//div[@class='result c-result'] | //div[@class='result c-result c-clk-recommend']")
        for tag in tags:
            # get link
            raw_log = tag.attrib.get('data-log')
            if not raw_log:
                continue
            # Try the attribute as-is first so valid JSON that contains an
            # apostrophe survives; fall back to swapping single quotes for
            # double quotes. A malformed entry skips only this result.
            try:
                data_log = json.loads(raw_log)
            except ValueError:
                try:
                    data_log = json.loads(raw_log.replace('\'', '"'))
                except ValueError:
                    continue
            if not isinstance(data_log, dict):
                continue
            if 'order' not in data_log or 'mu' not in data_log:
                continue
            node = {}
            node['order'] = data_log['order']
            node['link'] = data_log['mu']
            # get title
            title_tags = tag.xpath(u"./div/a/h3")
            if title_tags:
                node['title'] = title_tags[0].xpath("string(.)").encode('utf-8')
            else:
                node['title'] = "Unknown"
            # get description
            desc_tags = tag.xpath(u"./div/div")
            if desc_tags:
                node['desc'] = desc_tags[0].xpath("string(.)").encode("utf-8")
            else:
                node['desc'] = "Unknown"
            elements.append(node)
        print("parse_baidu <<===")
        return elements
    except Exception as e:
        # Page-level failure (e.g. the document could not be parsed at all).
        print("parse_baidu failed {}".format(e))
        print("parse_baidu <<=== end")
        return None
def query_baidu(self, keyword):
    """Query Baidu mobile search for a keyword and parse the result page.

    Args:
        keyword: The search term, as text or as UTF-8 encoded bytes.

    Returns:
        Whatever ``self.parse_baidu`` returns for the downloaded page, or
        ``None`` if the request or the parsing raised an exception.
    """
    print("query_baidu ===>>")
    try:
        # Encode only text input; calling .encode() on a byte string that
        # holds non-ASCII data would fail on the implicit ASCII decode.
        if isinstance(keyword, bytes):
            raw_keyword = keyword
        else:
            raw_keyword = keyword.encode('utf-8')
        _keyword = urllib.quote(raw_keyword)
        # NOTE(review): the trailing "oq=..." parameter is a fixed value
        # left in the URL regardless of the keyword - confirm it is wanted.
        url_query = "https://m.baidu.com/from=1014284b/s?word=" + _keyword + "&sa=tb&ts=6902153&t_kt=0&ie=utf-8&rsv_t=9e926S4zLuzG32Q2kkM5Tu%252Bc%252B4TbHKAg9WiWPQfnflJUbt8%252BiCpIrXI%252FyApB%252FeM&rsv_pq=17372243552828969370&ss=111&rsv_sug4=14106&inputT=12708&oq=%E4%B9%A0%E8%BF%91%E5%B9%B3"
        request = urllib2.Request(url_query)
        # Bound the wait so a stalled upstream cannot hang the handler.
        response = urllib2.urlopen(request, timeout=10)
        try:
            body = response.read()
        finally:
            # Always release the connection, even if the read fails.
            response.close()
        # Extract the result items (links, titles, descriptions).
        return self.parse_baidu(body)
    except Exception as e:
        print("Query Baidu Error: {}".format(e))
        return None
def query_google(self, keyword):
    """Placeholder for a Google search backend.

    The ``keyword`` argument is ignored; a fixed notice string is returned.
    """
    notice = "Not implemented!"
    return notice
def GET(self):
    """Handle a GET request: run the Baidu query and render the index page.

    Reads the ``keyword`` request input (passing ``"demo_keyword"`` to
    ``web.input`` as its default), queries Baidu with it and renders
    ``template_index`` with the request arguments and the parsed results.

    Returns:
        The rendered template, or a ``"GET Error: ..."`` string if anything
        in the handler raised.
    """
    response = None
    arguments = {}
    try:
        key_word = web.input(keyword = "demo_keyword")
        arguments['keyword'] = key_word.keyword
        # REMOTE_PORT is not guaranteed to be present in the environment;
        # a missing display-only value must not fail the whole request.
        remote_port = web.ctx.env.get("REMOTE_PORT", "unknown")
        arguments['extern_ip'] = "{}:{}".format(web.ctx.ip, remote_port)
        elements = self.query_baidu(key_word.keyword)
        response = self.render.template_index(arguments, elements)
    except Exception as e:
        response = "GET Error: {}".format(e)
    return response
def POST(self):
    """Handle POST requests; currently does nothing and returns None."""
    return None
html模版頁面部分實現:
<div id="result">
$# Render one heading plus a list of link/description items per result.
$if elements:
    $for e in elements:
        <h3>$e['order']</h3>
        <ul>
        $if e['link'] and e['title']:
            <li><a href="$e['link']">$e['title']</a></li>
        $if e['desc']:
            <li>$e['desc']</li>
        </ul>
$else:
    $# Shown when the query returned nothing (empty list or None).
    <h3>Not get values!</h3>
<p></p>
</div>