python

用 Python 写的基础下载程序,可以下载多页。

 
#conding:utf-8
import urllib2
import time
page=1        #初始化下载页面为第一页
url = ['']*350   设置url的存储
while page<8:   
        buf = urllib2.urlopen('http://blog.sina.com.cn/s/articlelist_1191258123_0_'+str(page)+'.html').read()    #打开url并读取内容
        i = 0           
        title = buf.find(r'<a title=')   #从title开始查找
        href = buf.find(r'href=',title)
        html = buf.find(r'.html',href)
        while title !=-1 and href !=-1 and html !=-1 and i<50: 设置一页面的url数和判断title href都存在
                url[i] = buf[href+6:html+5] 使url正常
                print url[i]
                title = buf.find(r'<a title=',html)
                href = buf.find(r'href=',title)
                html = buf.find(r'.html',href)
                i = i+1
        else:
                print page,"find end "
        page = page+1
else:
        print 'all down '
j = 0
while j<350:          下载url
        biaoti = ['']*350
        content = urllib2.urlopen(url[j]).read()
        titname = content.find(r'SG_txta') 读取标题
        end = content.find(r'</h',titname)
        biaoti[j] = content[titname+9:end]
        print biaoti[j]
             
        open(r'hanhan/'+url[j][-26:],'w+').write(content) 保存内容以url的最后26位为名称和后缀
        print 'downing ',url[j]
        j=j+1
        time.sleep(4)
else:
        print 'down fished'
          
發表評論
所有評論
還沒有人評論。想成為第一個評論的人嗎?請在上方評論欄輸入並點擊發布。
相關文章