練習3

簡單小爬蟲


#!/usr/bin/env python

#coding:utf-8

import urllib2

import bs4

# Simple crawler: download the 163.com front page, collect the anchors inside
# list items that look like article links, print each one, then print how
# many were found.
url = 'http://www.163.com'

# Close the response explicitly so the connection is released even if
# read() raises (urllib2 responses are not context managers on Python 2).
response = urllib2.urlopen(url)
try:
    content = response.read()
finally:
    response.close()

# The page bytes are decoded as GBK before being handed to the parser.
content = content.decode('gbk')

# NOTE(review): no parser is named here, so bs4 picks whichever one is
# installed; consider pinning one explicitly for reproducible results.
soup = bs4.BeautifulSoup(content)

# Only anchors that sit under an <li> and actually carry an href attribute.
links = soup.select('li a[href]')

# Keep links whose href contains both '.html' and '163.com' and whose
# visible text is longer than 3 characters.
result = [
    link for link in links
    if '.html' in link.attrs['href']
    and '163.com' in link.attrs['href']
    and len(link.text) > 3
]

# Single-argument print(...) behaves the same as the print statement on
# Python 2 and is also valid Python 3 syntax.
for link in result:
    print('%s %s' % (link.attrs['href'], link.text))

# Fix: the count must be substituted with the % operator; the original used
# a comma, which printed the literal "[%s]" followed by the number.
print('共有新聞[%s]條' % len(result))


發表評論
所有評論
還沒有人評論，想成為第一個評論的人麼？請在上方評論欄輸入並且點擊發布。
相關文章