Python: crawling torrent links through a proxy pool, saving to MongoDB and JSON

1. Spider dmoz_spider.py

#!/usr/bin/env python
# -*- coding:utf-8 -*-
# Author:donghui

import scrapy
import re
from urllib.parse import quote
from tutorial.items import DmozItem

class DmozSpider(scrapy.Spider):
    name = "dmoz"
    allowed_domains = ["dmoz.org"]
    start_urls = [
        "http://www.btkuai.org/word/" + quote("風光") + "_{}.html".format(n) for n in range(1,10)
    ]

    def savefile(self,filename,var):
        f = open("tutorial/res/"+filename+".csv","w+")  # the path must match the project layout
        f.write(var)
        f.close()
        #print("saved")

    def parse(self, response):
        url_head = 'http://www.btkuai.org'
        #filename = response.url.split("/")[-2]

        selector = response.xpath('//div[@id="container"]/div/ul/li/div[@class="T1"]')
        for sel in selector:
            title = sel.xpath('a/text()').extract()[0]
            link = url_head +(sel.xpath('a/@href').extract()[0])
            if re.findall(r'([a-zA-Z]+://[^\s]*html$)',link,re.S):
                #print(title, link)
                #self.savefile(filename, title + "," + link)
                item = DmozItem()
                item['title'] = title
                item['link'] = link
                yield item
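
For reference, the list comprehension in start_urls simply percent-encodes the keyword with urllib.parse.quote and appends page numbers 1 through 9. A quick standalone check (no Scrapy needed):

from urllib.parse import quote

# range(1, 10) yields pages 1 through 9
urls = ["http://www.btkuai.org/word/" + quote("風光") + "_{}.html".format(n) for n in range(1, 10)]
print(urls[0])  # http://www.btkuai.org/word/%E9%A2%A8%E5%85%89_1.html

With the rest of the project in place, the spider is started from the project root with: scrapy crawl dmoz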

2. Items items.py

import scrapy


class DmozItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    title = scrapy.Field()
    link = scrapy.Field()

3. Proxy IP middleware middlewares.py

#!/usr/bin/env python
# -*- coding:utf-8 -*-
# Author:donghui

# Random choice and JSON parsing
import random,json
# IPPOOL and IPPoolMode are defined in settings.py; import them directly
# (IPPoolMode is not all-uppercase, so Scrapy's settings object does not pick it up)
from .settings import IPPOOL, IPPoolMode
# The built-in HttpProxyMiddleware (scrapy.contrib was removed in newer Scrapy; use the current module path)
from scrapy.downloadermiddlewares.httpproxy import HttpProxyMiddleware

class IPPOOlS(HttpProxyMiddleware):
    # Load the list of validated proxies once, at class definition time
    # (the with statement already closes the file)
    with open("../EffectiveIp.json", 'r') as handler:
        ips = json.load(handler)
    def __init__(self, ip=''):
        self.ip = ip

    def process_request(self, request, spider):
        # IPPoolMode: 0 = pick a proxy from IPPOOL in settings.py, 1 = pick one from EffectiveIp.json
        if IPPoolMode == 0:
            thisip = random.choice(IPPOOL)
            print("Proxy IP %s" % thisip["http"])
            request.meta["proxy"] = "http://" + thisip["http"]
        elif IPPoolMode == 1:
            thisip = random.choice(IPPOOlS.ips)
            print("Proxy IP %s" % thisip["http"])
            request.meta["proxy"] = "http://" + thisip["http"]
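
The middleware reads ../EffectiveIp.json relative to the directory Scrapy is launched from. That file is not shown in this post; judging from thisip["http"], it is expected to hold a list of {"http": "ip:port"} dicts, the same shape as IPPOOL in settings.py. A minimal snippet to produce such a file (the addresses below are just placeholders copied from IPPOOL):

import json

# Hypothetical sample of EffectiveIp.json: a JSON array of {"http": "ip:port"} entries
sample_ips = [
    {"http": "125.32.250.240:8060"},
    {"http": "114.55.0.166:8090"},
]
with open("EffectiveIp.json", "w") as f:
    json.dump(sample_ips, f)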

4. User-Agent middleware uamid.py

#!/usr/bin/env python
# -*- coding:utf-8 -*-
# Author:donghui

# Random choice
import random
# UPPOOL is defined in settings.py
from .settings import UPPOOL
# The built-in UserAgentMiddleware (scrapy.contrib was removed in newer Scrapy; use the current module path)
from scrapy.downloadermiddlewares.useragent import UserAgentMiddleware

class Uamid(UserAgentMiddleware):
    # Keep the user_agent argument; the base class expects it and omitting it tends to raise errors
    def __init__(self, user_agent=''):
        self.user_agent = user_agent

    # Request handling
    def process_request(self, request, spider):
        # Pick a random User-Agent from the pool
        thisua = random.choice(UPPOOL)
        print("Current User-Agent: " + thisua)
        request.headers.setdefault('User-Agent', thisua)

5. settings.py

# -*- coding: utf-8 -*-

# Scrapy settings for tutorial project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
#     http://doc.scrapy.org/en/latest/topics/settings.html
#     http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
#     http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html

BOT_NAME = 'tutorial'

SPIDER_MODULES = ['tutorial.spiders']
NEWSPIDER_MODULE = 'tutorial.spiders'


# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'tutorial (+http://www.btkuai.org)'
USER_AGENT = 'Mozilla/5.0 (Windows; U; Windows NT 5.1) Gecko/20070803 Firefox/1.5.0.12'

# Obey robots.txt rules
ROBOTSTXT_OBEY = True

# MongoDB configuration
MONGO_HOST = "127.0.0.1"  # host IP
MONGO_PORT = 27017  # port
MONGO_DB = "btKuai"  # database name
MONGO_COLL = "fengguang"  # collection name

# Proxy source: 0 = use IPPOOL from this file, 1 = use the EffectiveIp.json file
IPPoolMode=1

# Proxy IP pool
IPPOOL = [
    {"http": "125.32.250.240:8060"},
    {"http": "183.159.93.165:61234"},
    {"http": "119.49.33.238:8060"},
    {"http": "119.187.120.118:8060"},
    {"http": "120.25.203.182:7777"},
    {"http": "121.17.18.219:8060"},
    {"http": "123.8.41.163:8060"},
    {"http": "119.41.236.180:8010"},
    {"http": "121.17.18.218:8060"},
    {"http": "114.55.0.166:8090"},
    {"http": "118.122.105.99:9000"},
    {"http": "45.115.39.139:7777"}
]
# User-Agent pool
UPPOOL = [
    "Mozilla/5.0 (Windows NT 10.0; WOW64; rv:52.0) Gecko/20100101 Firefox/52.0",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.115 Safari/537.36",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.79 Safari/537.36 Edge/14.14393"
]

# Disable local cookies
COOKIES_ENABLED = False

# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32

# Configure a delay for requests for the same website (default: 0)
# See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
#DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16

# Disable cookies (enabled by default)
#COOKIES_ENABLED = False

# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False

# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
#   'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
#   'Accept-Language': 'en',
#}

# Enable or disable spider middlewares
# See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
#    'tutorial.middlewares.MyCustomSpiderMiddleware': 543,
#}

# Enable or disable downloader middlewares
# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
DOWNLOADER_MIDDLEWARES = {
   #'tutorial.middlewares.MyCustomDownloaderMiddleware': 543,
    'scrapy.downloadermiddlewares.httpproxy.HttpProxyMiddleware': 123,
    'tutorial.middlewares.IPPOOlS': 125,
    'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware': 2,
    'tutorial.uamid.Uamid': 1
}

# Enable or disable extensions
# See http://scrapy.readthedocs.org/en/latest/topics/extensions.html
#EXTENSIONS = {
#    'scrapy.extensions.telnet.TelnetConsole': None,
#}

# Configure item pipelines
# See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
   #'tutorial.pipelines.SomePipeline': 300,
    'tutorial.pipelines.BtKuaiMongo': 300,
    'tutorial.pipelines.JsonWritePipline': 300
}

# Enable and configure the AutoThrottle extension (disabled by default)
# See http://doc.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False

# Enable and configure HTTP caching (disabled by default)
# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
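
ITEM_PIPELINES above enables tutorial.pipelines.BtKuaiMongo and tutorial.pipelines.JsonWritePipline, but pipelines.py itself is not included in this post. A minimal sketch of what it might look like, assuming pymongo and the MONGO_* settings defined above; the output file name items.json and the exact field handling are assumptions, not the original code:

# -*- coding: utf-8 -*-
# Sketch of tutorial/pipelines.py (the original file is not shown in this post)
import json
import pymongo


class BtKuaiMongo(object):
    """Write each item into the MongoDB collection configured in settings.py."""

    def open_spider(self, spider):
        s = spider.settings
        self.client = pymongo.MongoClient(s.get('MONGO_HOST'), s.get('MONGO_PORT'))
        self.coll = self.client[s.get('MONGO_DB')][s.get('MONGO_COLL')]

    def close_spider(self, spider):
        self.client.close()

    def process_item(self, item, spider):
        self.coll.insert_one(dict(item))
        return item


class JsonWritePipline(object):
    """Append each item as one JSON line to a local file (file name is an assumption)."""

    def open_spider(self, spider):
        self.file = open('items.json', 'w', encoding='utf-8')

    def close_spider(self, spider):
        self.file.close()

    def process_item(self, item, spider):
        self.file.write(json.dumps(dict(item), ensure_ascii=False) + "\n")
        return item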