python3 scrapy框架crawl模板爬取京東產品並寫入mysql

CrawlSpider 的 crawl 模板會自動分析頁面中的所有鏈接,並爬取符合規則的鏈接數據(詳見官方文檔)。
這裏寫圖片描述
其中價格、好評率需要用瀏覽器抓包分析出真實的接口地址。本文所用的基礎技術包括:SQL 語句、正則表達式(re)、XPath 表達式、基本的網絡知識和 Python 基礎。

jd.py

# -*- coding: utf-8 -*-
import scrapy
import urllib.request
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
import re
from jingdong.items import JingdongItem


class JdSpider(CrawlSpider):
    """Crawl jd.com, following every link, and scrape product pages.

    Price and good-rate ratio are not embedded in the product HTML, so
    they are fetched from JD's separate JSON(P) endpoints (addresses
    discovered by capturing browser traffic).
    """
    name = 'jd'
    allowed_domains = ['jd.com']
    start_urls = ['http://jd.com/']

    rules = (
        # No allow-restriction: follow every link and let the callback
        # decide whether the page is a product page.
        Rule(LinkExtractor(allow=''), callback='parse_item', follow=True),
    )

    # Product URLs look like http://item.jd.com/<numeric sku>.html.
    # Dots are escaped so '.' cannot match arbitrary characters, and the
    # sku is restricted to digits (the original '(.*?).html' also matched
    # unrelated URLs).
    _PRODUCT_URL = re.compile(r'item\.jd\.com/(\d+)\.html')

    @staticmethod
    def _fetch_first(url, pattern):
        """Download *url* and return the first regex match, or None.

        Guards against an empty/blocked response, which previously made
        `findall(...)[0]` raise IndexError.
        NOTE(review): urllib blocks Scrapy's async engine; for real
        crawls these should be issued as scrapy.Request instead.
        """
        body = urllib.request.urlopen(url).read().decode('utf-8', 'ignore')
        found = re.findall(pattern, body)
        return found[0] if found else None

    def parse_item(self, response):
        """Populate a JingdongItem from a product page.

        Returns None for non-product pages so Scrapy drops them instead
        of emitting an empty item.
        """
        match = self._PRODUCT_URL.search(response.url)
        if match is None:
            # Not a product page.
            return None

        product_id = match.group(1)
        i = JingdongItem()

        # Price endpoint (JSONP); the price is the `"p"` field.
        price_link = ('http://p.3.cn/prices/mgets?callback=jQuery6325563'
                      '&type=1&area=1_72_2799_0&pdtk='
                      '&pduid=1509845912914927705768&pdpin=&pin=null&pdbp=0'
                      '&skuIds=J_%s&ext=11000000&source=item-pc' % product_id)
        i['price'] = self._fetch_first(price_link, 'p":"(.*?)"')

        # Comment-summary endpoint; `goodRateShow` is the good-review ratio
        # (the original comment wrongly said this regex matched the price).
        goodRate_link = ('http://sclub.jd.com/comment/productPageComments.action'
                         '?callback=fetchJSON_comment98vv244&productId=%s'
                         '&score=0&sortType=5&page=0&pageSize=10'
                         '&isShadowSku=0&fold=1' % product_id)
        i['goodRate'] = self._fetch_first(goodRate_link, '{"goodRateShow":(.*?),')

        i['title'] = response.xpath('//title/text()').extract()              # product title
        i['store'] = response.xpath('//div[@class="name"]/a/text()').extract()  # shop name
        i['link'] = response.xpath('//link[@rel="canonical"]/@href').extract()  # canonical URL
        return i



#價格地址
#http://p.3.cn/prices/mgets?callback=jQuery6325563&type=1&area=1_72_2799_0&pdtk=&pduid=1509845912914927705768&pdpin=&pin=null&pdbp=0&skuIds=J_5560552&ext=11000000&source=item-pc
#好評率地址
#http://sclub.jd.com/comment/productPageComments.action?callback=fetchJSON_comment98vv244&productId=5560552&score=0&sortType=5&page=0&pageSize=10&isShadowSku=0&fold=1
#簡介xpath
#/html/body/div[7]/div/div[2]/div[1]/text()
#店鋪xpath
#//div[@class="name"]/a/text()
#商品鏈接
#//link[@rel="canonical"]/@href

嘗試運行
這裏寫圖片描述

pipelines.py

# -*- coding: utf-8 -*-
import mysql.connector
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html


class JingdongPipeline(object):
    """Persist scraped JD items into the `jingdong` table of MySQL db `python`."""

    @staticmethod
    def _as_text(value):
        """Flatten a scraped field to a plain string.

        title/store/link come from `.extract()` and are lists; joining them
        avoids the TypeError the original raised when concatenating
        list + str. price/goodRate may be None if their endpoint failed.
        """
        if isinstance(value, (list, tuple)):
            return ''.join(value)
        return '' if value is None else str(value)

    def process_item(self, item, spider):
        """Insert one item into MySQL and pass it along the pipeline chain.

        Uses a parameterized query instead of string concatenation
        (prevents SQL injection and quoting errors when a value contains
        a quote), and closes the connection in `finally` so it is
        released even if the insert fails.
        """
        db = mysql.connector.connect(host='localhost',
                                     user='root',
                                     passwd='123456',
                                     db='python')  # connect to the database
        try:
            cur = db.cursor()  # database cursor
            row = (self._as_text(item['title']),
                   self._as_text(item['link']),
                   self._as_text(item['price']),
                   self._as_text(item['goodRate']),
                   self._as_text(item['store']))
            # Placeholders let the driver do the quoting/escaping.
            cur.execute(
                "insert into jingdong VALUES (%s, %s, %s, %s, %s)", row)
            db.commit()  # without commit the rows are not visible in the DB
            cur.close()
        finally:
            db.close()
        # Return the item so any later pipelines still receive it.
        return item
發佈了71 篇原創文章 · 獲贊 47 · 訪問量 20萬+
發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
相關文章