Scraping product information from an e-commerce site by category with Python

Every site needs its own parsing logic. The shop scraped below is https://www.asos.de/damen, a German e-commerce site.

1. Fetch the home page https://www.asos.de/damen and collect all category links from the navigation bar.

2. Iterate over the category links and scrape every product under each one, including the products on paginated result pages.

3. Save all products of a category to an Excel sheet, and record the category link as already scraped.

4. The crawler finishes.

Topics covered:

1. Fetching pages with requests

2. Parsing HTML with BeautifulSoup, mainly via CSS selectors (https://blog.csdn.net/lzz781699880/article/details/81209038); see the short demo after this list

3. Regular expressions

4. xlwt and basic file I/O
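
As a warm-up for the CSS-selector style used throughout the script, here is a minimal, self-contained sketch; the HTML snippet and class names are invented for the demo, not taken from asos.de:

from bs4 import BeautifulSoup

html = '''
<ul class="nav">
  <li><a href="/damen/kleider">Kleider</a></li>
  <li><a href="/damen/schuhe">Schuhe</a></li>
</ul>
'''
soup = BeautifulSoup(html, 'html.parser')
# '.nav > li > a' selects every <a> directly inside an <li> inside class "nav"
for a in soup.select('.nav > li > a'):
    print(a.text, a.attrs['href'])  # Kleider /damen/kleider, Schuhe /damen/schuhe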

import requests, random, os, xlwt, math, time, re
from bs4 import BeautifulSoup



# Fetch a page and return it parsed as a BeautifulSoup object
def get_static_html(site_url):
    print('Loading page', site_url)
    headers_list = [
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.79 Safari/537.36',
        'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:34.0) Gecko/20100101 Firefox/34.0 ',
        'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/534.57.2 (KHTML, like Gecko) Version/5.1.7 Safari/534.57.2',
        'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36'
    ]
    headers = {
        'user-agent': random.choice(headers_list),  # rotate user agents to look less bot-like
        'Connection': 'keep-alive'
    }
    try:
        resp = requests.get(site_url, headers=headers)
    except Exception as inst:
        print(inst)
        # retry without TLS verification and silence the resulting warning
        requests.packages.urllib3.disable_warnings()
        resp = requests.get(site_url, headers=headers, verify=False)
    soup = BeautifulSoup(resp.text, 'html.parser')
    return soup



# Save raw HTML to disk
def download_html(content, html_path):
    folder = os.path.dirname(html_path)
    if folder and not os.path.exists(folder):  # create the parent folder if it is missing
        os.makedirs(folder)
    print('download html file path is:', '{}.html'.format(html_path))
    try:
        with open('{}.html'.format(html_path), 'w+', encoding="utf-8") as f:
            f.write(content)
    except Exception as e:
        print(e)



# Export the scraped rows to an Excel sheet
def exportTask(heads, task_done, path, filename):
    if not os.path.exists(path):
        os.makedirs(path)
    task_xls = xlwt.Workbook(encoding='utf-8')
    task_sheet1 = task_xls.add_sheet('sheet1')
    # header row, centered
    header_align = xlwt.Alignment()
    header_align.horz = xlwt.Alignment.HORZ_CENTER
    header_style = xlwt.XFStyle()
    header_style.alignment = header_align
    for i in range(len(heads)):
        task_sheet1.col(i).width = 12000
        task_sheet1.write(0, i, heads[i], header_style)
    # data rows: each entry of task_done is a dict keyed by the names in heads
    for i in range(len(task_done)):
        for j in range(len(heads)):
            task_sheet1.write(i + 1, j, task_done[i][heads[j]])
    filename = "{0}.xls".format(filename.replace(':', '-'))  # ':' is not allowed in Windows filenames
    print(os.path.join(path, filename))
    task_xls.save(os.path.join(path, filename))
    return filename
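
A quick usage sketch (the rows here are made-up sample data, not scraped values), showing the shape exportTask expects: each row is a dict whose keys match the names in heads.

heads = ['cate', 'desc', 'price']
rows = [
    {'cate': 'Kleider', 'desc': 'Sommerkleid', 'price': '24,99 €'},
    {'cate': 'Kleider', 'desc': 'Abendkleid', 'price': '59,99 €'},
]
exportTask(heads, rows, 'C:\\Users\\SHEIN\\Desktop\\asos\\', 'demo')  # writes demo.xls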



# Get the total number of result pages for a category
def getTotalPageNums(url):
    soup = get_static_html(url)
    page_msg_tag = soup.select('._2sxPqJf')
    if len(page_msg_tag) == 0:  # no pagination message on the page
        return 1, ''
    page_msg = page_msg_tag[0].text
    cate = soup.select('._2wckrGM')[0].text
    # the fifth word of the message is the total item count, e.g. '1.234'
    total_num = page_msg.split(' ')[4]
    # 72 products per page; '.' is the German thousands separator
    page_num = math.ceil(int(total_num.replace('.', '')) / 72)
    print(page_num)
    return page_num, cate
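
The page-count arithmetic assumes 72 products per page and German number formatting. For a hypothetical message reporting 1.234 items:

total_num = '1.234'  # hypothetical item count taken from the page text
print(math.ceil(int(total_num.replace('.', '')) / 72))  # ceil(1234 / 72) = 18 pages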



# Extract the product info from one page of a category listing
def getInfoFromSoup(url):
    soup = get_static_html(url)
    cate = soup.select('._2wckrGM')[0].text
    info_list = []
    for tag in soup.select('._3x-5VWa'):
        img_tag = tag.select("._1FN5N-P > img")[0]
        desc_tag = tag.select('._10-bVn6 > div > p')[0]
        info = {'cate_url': url, 'cate': cate}
        info['desc'] = desc_tag.text
        if len(tag.select('.JW3hTZk')) == 0:
            price_tag = tag.select('._342BXW_')[0]  # regular price
            price_msg = price_tag.text
        else:
            price_tag = tag.select('.JW3hTZk')[0]  # discounted price
            if '"' in price_tag.text:
                price_msg = price_tag.text.split('"')[1]
            else:
                price_msg = price_tag.text
        info['price'] = price_msg
        # e.g. '24,99 €' -> 24.99 (German decimal comma)
        price_arr = price_msg.split(' €')
        try:
            info['price_num'] = float(price_arr[0].replace(',', '.'))
        except ValueError:
            info['price_num'] = 0
        info['product link'] = tag.attrs['href']
        # the product id sits between 'prd/' (or 'grd/') and the query string
        pattern = re.compile(r'(?:prd|grd)/([0-9]+)\?')
        info_id = pattern.findall(tag.attrs['href'])
        info['product_id'] = info_id[0] if info_id else ''
        if 'src' in img_tag.attrs.keys():
            info['img_url'] = img_tag.attrs['src']
        else:
            info['img_url'] = ''
        info_list.append(info)
    return info_list
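
Note the fix to the product-id regex: the original [prd|grd] is a character class (any single one of the letters p, r, d, g or '|'), not the intended prd-or-grd alternation. A quick check with a hypothetical href (real asos.de links may differ):

href = '/de/marke/prd/12345678?clr=schwarz'  # hypothetical product link
print(re.findall(r'(?:prd|grd)/([0-9]+)\?', href))  # ['12345678']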



# Collect the category links from the home page's navigation bar
def getCateUrl():
    url = 'https://www.asos.de/damen'
    soup = get_static_html(url)
    nav_tag = soup.select('._3kg3G5e')[0]
    cate_list = []
    for a_tag in nav_tag.select('a'):
        cate_list.append(a_tag.attrs['href'])
    return cate_list



# Scrape every page of one category and save the results to an Excel file
def downloadExcelByCate(cate_url, path, num):
    pagenum, cate = getTotalPageNums(cate_url)
    info_list = []
    for i in range(1, pagenum + 1):
        # the category URLs already carry a query string, so pages are appended with '&page='
        url_page = '{0}&page={1}'.format(cate_url, i)
        info_list += getInfoFromSoup(url_page)
        time.sleep(5)  # be polite: pause between page requests
    if info_list:
        heads = ['cate_url', 'cate', 'desc', 'price', 'price_num', 'product_id', 'product link', 'img_url']
        filename = '{0}-{1}'.format(num, cate)
        exportTask(heads, info_list, path, filename)
    # record the category as done so an interrupted run can resume where it left off
    try:
        with open(path + 'record.txt', 'a+', encoding="utf-8") as f:
            f.write(cate_url + '\n')
    except Exception as e:
        print(e)



# Read back the category links that were already scraped in earlier runs
def getDoneUrl(path):
    done_url = []
    if not os.path.exists(path + 'record.txt'):  # nothing recorded yet on the first run
        return done_url
    with open(path + 'record.txt', 'r', encoding="utf-8") as f:
        for url in f.readlines():
            done_url.append(url.rstrip('\n'))
    print(done_url)
    return done_url


save_path = 'C:\\Users\\SHEIN\\Desktop\\asos\\'
if __name__ == '__main__':
    cate_url_list = getCateUrl()
    done_url = getDoneUrl(save_path)
    for i in range(len(cate_url_list)):
        if cate_url_list[i] not in done_url:  # skip categories already done in a previous run
            downloadExcelByCate(cate_url_list[i], save_path, i + 1)
    

 
