用 BeautifulSoup 解析含有漢字的網頁時，要注意編碼與解碼問題。下面的程式碼獲取大街網網頁中 class='jobInfo' 的 div 標籤的數據內容：
from bs4 import BeautifulSoup
import urllib2
# Fetch a dajie.com job page and print the contents of the first <div> whose
# class attribute is exactly ['jobInfo'], encoded as GB2312 byte strings.
#
# NOTE(review): the original paste had lost its indentation. The nesting below
# is reconstructed; in particular the final `break` is assumed to stop the
# search after the first matching div has been printed -- confirm.
c = urllib2.urlopen('http://job.dajie.com/7262fae6-a1aa-4674-9efa-3baf697faa46.html')
try:
    soup = BeautifulSoup(c.read())
finally:
    # Always release the HTTP connection, even if reading or parsing fails.
    c.close()

for div in soup.find_all('div'):
    # The class attribute is compared as a list, so a div carrying any
    # additional class names is deliberately not matched.
    if div.get('class') != ['jobInfo']:
        continue

    print('find it')
    for node in div.contents:
        # Encode as GB2312 here, not UTF-8 (presumably to match the encoding
        # of the output console -- confirm). Encode once and reuse the result.
        encoded = node.encode('GB2312')
        # Skip line-break tags and bare newlines; print everything else.
        if encoded not in ('<br/>', '\n'):
            print(encoded)
    break