歡迎使用CSDN-markdown編輯器

代碼塊

代碼塊語法遵循標準markdown代碼,例如:

@requires_authorization

# Presumably the build/install steps for the bs4 package imported below:
#   setup.py build
#   setup.py install
from bs4 import BeautifulSoup
bs = BeautifulSoup  # short alias for the BeautifulSoup constructor

# Sample HTML fragments; they are concatenated into one document string below.
doc = [
    '<html><head><title>Page title</title></head>',
    '<body><p id="firstpara" align="center">This is paragraph <b>one</b>.',
    '<p id="secondpara" align="blah">This is paragraph <b>two</b>.',
    '</html>'
]
# Parse the joined fragments into a BeautifulSoup tree.
soup = bs(''.join(doc))



#---
import re
import urllib

def getHtml(url, proxies=None):
    """Download *url* through an HTTP proxy and return the response body.

    Args:
        url: Address of the page to fetch.
        proxies: Optional mapping of scheme to proxy URL, passed straight to
            ``urllib.urlopen``.  When omitted, the original hard-coded proxy
            ``{'http': 'http://192.168.1.2:3128'}`` is used.

    Returns:
        The data returned by ``read()`` on the opened URL.
    """
    if proxies is None:
        proxies = {'http': 'http://192.168.1.2:3128'}
    page = urllib.urlopen(url, proxies=proxies)
    try:
        return page.read()
    finally:
        # Release the connection even when read() raises.
        page.close()


def getImg(html):
    """Return every ``.jpg`` URL found in *html*.

    A URL is picked up when it appears as ``src="<url>.jpg"`` immediately
    followed by a space and the text ``pic_ext``.  The result is a list of
    the captured URL strings in document order (empty when nothing matches).
    """
    pattern = re.compile(r'src="(.+?\.jpg)" pic_ext')
    return pattern.findall(html)


def imgDownload(imglist):
    x = 0
    for imgurl in imglist:
        urllib.urlretrieve(imgurl, '%s.jpg' % x)
        x+=1
        print '第', x, '張圖片下載完成'


# Fetch the hard-coded Baidu Tieba thread page.
html = getHtml('http://tieba.baidu.com/p/2460150866')

# Print the list of .jpg URLs that getImg extracts from that page.
print getImg(html)

#---
def getItemNum(url):
    """Fetch a seller's listing page and extract the total item count text.

    Args:
        url: Any item-list page belonging to the seller.

    Returns:
        A list holding the text inside each ``<span class="rcnt">`` element
        found on the page (empty when there is none).  The caller is
        expected to convert the first entry to a number.
    """
    raw = getHtml(url)
    # Non-greedy capture: stop at the first closing </span> so that other
    # spans on the same line are not swallowed into the match.
    p = re.compile(r'<span class="rcnt"\s*>(.*?)</span>')
    return p.findall(raw)

import datetime  # required by the timing below; not imported in the visible code

# Time a single getItemNum call and convert its first result to an integer.
# NOTE(review): `url` is not defined in the visible code; it must be set
# before this snippet runs.
starttime = datetime.datetime.now()
tmp = getItemNum(url)
# Remove commas (presumably thousands separators such as "1,234") before int().
itemNum = int(tmp[0].replace(',', ''))
endtime = datetime.datetime.now()
# .seconds is the whole-seconds component of the elapsed timedelta.
print (endtime-starttime).seconds


#===
# NOTE(review): `url` is not defined in the visible code; it must be set
# before this snippet runs.  Rebinds the module-level `html` and `soup`.
html = getHtml(url)
soup = bs(html)
# Look up the element with id 'descItemNumber' and read its .string;
# the value is neither stored nor printed here.
soup.find(id='descItemNumber').string

#----
# Read every line of fds.txt; each entry keeps its trailing "\n".
# The with-block closes the file as soon as the lines have been read.
with open('fds.txt', 'r') as fid:
    lines = fid.readlines()

def getAddress(itemID):
    try:
        tmpUrl = 'http://www.ebay.com/itm/' + itemID
        html = getHtml(tmpUrl)
        soup = bs(html)
        xx = soup.select('div.iti-eu-bld-gry ')
        addr = bs(''.join(xx[0]))
        return addr
    except Exception, ex:
        print ex
        print '沒有找到地址'
        return None
發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
相關文章