利用casperjs+phantomjs獲取網站文檔

利用casperjs+phantomjs獲取網站文檔

利用無界面瀏覽器,結合js腳本獲取網站的文檔鏈接,並下載文檔,簡單易行。本文從phantomjs 的改造開始,給出了一個遍歷網站鏈接,並下載格式爲 pdf、zip、rar 的文件的實例。

1、改造phantomjs

利用phantomjs獲取文件需要對其進行改造,具體改造如下:

• 首先進入vs工具的命令行模式:vs tool cmd

• 然後運行 editbin /LARGEADDRESSAWARE phantomjs.exe 即可。

這樣保證大的文件可以下載。

 

2、利用casper 獲取文件示例

直接利用download即可下載。

// Minimal CasperJS example: open the site's home page, then download one
// known document URL into the current working directory.

// Both modules are loaded as in the original snippet; neither is used below.
var utils = require('utils');
var fs = require('fs');

var casper = require('casper').create({
    // The original passed the strings "true" and "false" here. "true" only
    // worked because any non-empty string is truthy, and "false" is not one
    // of the documented log levels ("debug", "info", "warning", "error").
    // "debug" keeps every message visible.
    // NOTE(review): pick a quieter level such as "info" if that is too noisy.
    verbose: true,
    logLevel: "debug",
    pageSettings: {
        // Web security is switched off for the page.
        // NOTE(review): presumably so the document can be fetched regardless
        // of the origin of the page that is currently open - confirm.
        webSecurityEnabled: false
    }
});

// NOTE(review): CasperJS documents `timeout` as a create() option; confirm
// that assigning it on the instance has the intended effect.
casper.timeout = 50000;

var url = 'http://www.moh.gov.cn/ewebeditor/uploadfile/2017/06/20170605095215136.pdf';

// Load the home page first, then save the document next to the script's
// working directory under its original file name.
casper.start('http://www.moh.gov.cn', function () {
    this.download(url, '20170605095215136.pdf');
});

casper.run(function () {
    this.echo('Done.').exit();
});

 

 

3、完整例子

3.1爬蟲設置

(1) 首先下載相關的程序和系統,下載的網址如下:

http://phantomjs.org/download.html

http://casperjs.org/

(2) 下載並解壓後,設置環境變量,使其指向解壓目錄和解壓後的bin目錄即可。

3.2批處理命令

啓動爬蟲的批處理命令:

@echo off

rem Launcher for the crawler: runs crawel.js with the start page given below.
rem Switch the console to code page 65001 (UTF-8).
chcp 65001

rem "cd ." changes to the current directory, so the working directory stays as it is.
cd .

rem Run crawel.js under casperjs, passing the start URL as its first argument.
casperjs crawel.js http://www.moh.gov.cn/zhuz/s9493/wsbz.shtml

3.3 相關代碼

url.js代碼:

/*

 * An URI datatype.  Based upon examples in RFC3986.

 *

 * TODO %-escaping

 * TODO split apart authority

 * TODO split apart query_string (on demand, anyway)

 *

 * @(#) $Id$

 */

 

// Constructor for the URI object: split a string into scheme, authority,
// path, query and fragment. A component that is absent or empty is stored
// as null. A falsy argument is treated as the empty string.
function URI(str) {
    // Based on the regex in RFC2396 Appendix B.
    var pattern = /^(?:([^:\/?\#]+):)?(?:\/\/([^\/?\#]*))?([^?\#]*)(?:\?([^\#]*))?(?:\#(.*))?/;
    // Component names, in the order of the pattern's capture groups.
    var names = ["scheme", "authority", "path", "query", "fragment"];
    var pieces = (str ? str : "").match(pattern);
    for (var i = 0; i < names.length; i += 1) {
        this[names[i]] = pieces[i + 1] || null;
    }
}

 

// Serialise the URI back into a string. Only components that are set
// (truthy) are emitted, each with its delimiter: "scheme:", "//authority",
// "path", "?query", "#fragment".
URI.prototype.toString = function () {
    return "" +
        (this.scheme ? this.scheme + ":" : "") +
        (this.authority ? "//" + this.authority : "") +
        (this.path ? this.path : "") +
        (this.query ? "?" + this.query : "") +
        (this.fragment ? "#" + this.fragment : "");
};

 

// Introduce a new scope to define some private helper functions.

(function () {

    // RFC3986 §5.2.3 (Merge Paths)

    function merge(base, rel_path) {

        var dirname = /^(.*)\//;

        if (base.authority && !base.path) {

            return "/" + rel_path;

        }

        else {

            return base.path.match(dirname)[0] + rel_path;

        }

    }

 

    // Match two path segments, where the second is ".." and the first must

    // not be "..".

    var DoubleDot = /\/((?!\.\.\/)[^\/]*)\/\.\.\//;

 

    function remove_dot_segments(path) {

        if (!path) return "";

        // Remove any single dots

        var newpath = path.replace(/\/\.\//g, '/');

        // Remove any trailing single dots.

        newpath = newpath.replace(/\/\.$/, '/');

        // Remove any double dots and the path previous.  NB: We can't use

        // the "g", modifier because we are changing the string that we're

        // matching over.

        while (newpath.match(DoubleDot)) {

            newpath = newpath.replace(DoubleDot, '/');

        }

        // Remove any trailing double dots.

        newpath = newpath.replace(/\/([^\/]*)\/\.\.$/, '/');

        // If there are any remaining double dot bits, then they're wrong

        // and must be nuked.  Again, we can't use the g modifier.

        while (newpath.match(/\/\.\.\//)) {

            newpath = newpath.replace(/\/\.\.\//, '/');

        }

        return newpath;

    }

 

    // RFC3986 §5.2.2. Transform References;

    URI.prototype.resolve = function (base) {

        var target = new URI();

        if (this.scheme) {

            target.scheme    = this.scheme;

            target.authority = this.authority;

            target.path      = remove_dot_segments(this.path);

            target.query     = this.query;

        }

        else {

            if (this.authority) {

                target.authority = this.authority;

                target.path      = remove_dot_segments(this.path);

                target.query     = this.query;

            }

            else {

                // XXX Original spec says "if defined and empty";

                if (!this.path) {

                    target.path = base.path;

                    if (this.query) {

                        target.query = this.query;

                    }

                    else {

                        target.query = base.query;

                    }

                }

                else {

                    if (this.path.charAt(0) === '/') {

                        target.path = remove_dot_segments(this.path);

                    } else {

                        target.path = merge(base, this.path);

                        target.path = remove_dot_segments(target.path);

                    }

                    target.query = this.query;

                }

                target.authority = base.authority;

            }

            target.scheme = base.scheme;

        }

 

        target.fragment = this.fragment;

 

        return target;

    };

})();

 

crawel.js文件如下

// ---- Modules ---------------------------------------------------------------
var fs = require('fs');
var utils = require('utils');
var f = utils.format;

// ---- Casper instance -------------------------------------------------------
// Images and plugins are not loaded.
var casper = require("casper").create({
  pageSettings: {
    loadImages: false,
    loadPlugins: false,
    clearMemoryCaches: true
  }
});
// NOTE(review): CasperJS documents `timeout` as a create() option; confirm
// that assigning it on the instance has the intended effect.
casper.timeout = 500000;

// ---- Command line ----------------------------------------------------------
// First positional argument: the URL the crawl starts from.
var url = casper.cli.get(0);
// --max-depth coerced to an integer; 100000 when absent or zero. Despite its
// name, check() compares it with the number of links processed so far.
var upTo = ~~casper.cli.get('max-depth') || 100000;

// ---- Crawl state -----------------------------------------------------------
var baseUrl = url;      // prefix an absolute link must start with to be kept
var links = [url];      // queue of URLs, consumed by index
var checked = [];       // URLs that have already been processed
var currentLink = 0;    // index into `links` of the next URL to process

// Resolve `url` against `base` and return the resulting URL as a string.
function absPath(url, base) {
  var reference = new URI(url);
  var baseUri = new URI(base);
  return reference.resolve(baseUri).toString();
}

 

// Clean links
//
// Turn the raw href values scraped from a page into the list of absolute
// URLs that still need to be visited.
//   urls - array of href strings; a missing value (null/undefined) is
//          treated as "no links" instead of raising a TypeError.
//   base - URL of the page the hrefs were found on; relative hrefs are
//          resolved against it.
// Returns a new array; the input is not modified.
function cleanLinks(urls, base) {
  if (!urls) {
    return [];
  }
  // Prefixes that mark a link as not crawlable: in-page anchors, other
  // schemes, and absolute http(s) URLs (those are kept only when they start
  // with baseUrl). "mailto:" and "tel:" links were previously let through
  // and ended up in the crawl queue.
  var skipPattern = /^(#|ftp|javascript|http|mailto:|tel:)/;

  return utils.unique(urls).filter(function(url) {
    return url.indexOf(baseUrl) === 0 || !skipPattern.test(url);
  }).map(function(url) {
    return absPath(url, base);
  }).filter(function(url) {
    // Drop anything that has already been processed.
    return checked.indexOf(url) === -1;
  });
}

 

// Tell whether `link` contains one of the document/archive extensions that
// are saved to disk instead of being opened as a page.
function isDownloadLink(link) {
  var extensions = [".pdf", ".zip", ".rar", ".doc"];
  return extensions.some(function (ext) {
    return link.indexOf(ext) !== -1;
  });
}

// Opens the page, perform tests and fetch next links
//
// Must be called with a Casper instance as `this`. Queues three steps:
//   1. download `link` (document/archive) or open it (page), and record it
//      in `checked`;
//   2. report the current HTTP status;
//   3. collect the links of the current page and append the unvisited ones
//      to `links`.
function crawl(link) {
  this.start().then(function() {
    this.echo(link, 'COMMENT');
    // The original condition had a misplaced parenthesis,
    //   link.indexOf(".rar" || link.indexOf(".doc") != -1) != -1
    // which never tested ".doc"; such links were opened instead of saved.
    if (isDownloadLink(link)) {
      // Save the file under the last segment of the URL's path.
      var query = new URI(link).resolve(new URI(link));
      var filename = query.path;
      filename = filename.substr(filename.lastIndexOf("/") + 1);
      this.echo("download " + filename);
      this.download(link, filename);
    } else {
      this.open(link);
    }
    checked.push(link);
  });

  this.then(function() {
    if (this.currentHTTPStatus === 404) {
      this.warn(link + ' is missing (HTTP 404)');
    } else if (this.currentHTTPStatus === 500) {
      this.warn(link + ' is broken (HTTP 500)');
    } else {
      this.echo(link + f(' is okay (HTTP %s)', this.currentHTTPStatus));
    }
  });

  this.then(function() {
    var newLinks = searchLinks.call(this);
    // Queue the new links, leaving out everything already processed.
    links = links.concat(newLinks).filter(function(url) {
      return checked.indexOf(url) === -1;
    });
    this.echo(newLinks.length + " new links found on " + link);
  });
}

 

// Read the href attribute of every <a href> element on the current page and
// hand the values to cleanLinks(), using the current URL as the base.
// Must be called with a Casper instance as `this`.
function searchLinks() {
  var hrefs = this.evaluate(function _fetchInternalLinks() {
    // Runs inside the page, so it can only use names available there.
    var anchors = __utils__.findAll('a[href]');
    return Array.prototype.map.call(anchors, function(anchor) {
      return anchor.getAttribute('href');
    });
  });
  return cleanLinks(hrefs, this.getCurrentUrl());
}

 

// Run-loop callback: process the next queued link, or finish.
// Stops when the queue has no entry at `currentLink` or once `upTo` links
// have been processed; otherwise it schedules crawl() for the next link and
// re-runs itself when that pass completes.
// Must be called with a Casper instance as `this`.
function check() {
  var finished = !links[currentLink] || currentLink >= upTo;
  if (finished) {
    this.echo("All done, " + checked.length + " links checked.");
    this.exit();
    return;
  }

  var nextLink = links[currentLink];
  crawl.call(this, nextLink);
  currentLink += 1;
  this.run(check);
}

// Abort when no start URL was given on the command line.
if (!url) {
  casper.warn('No url passed, aborting.').exit();
}

// Bootstrap: load url.js to obtain the URI constructor, publish it as
// window.URI, then hand control to check(), which drives the crawl.
casper.start(function () {
  // No work is done in the initial step.
}).then(function() {
    this.echo("Starting");
    var file = fs.open("./url.js", 'r');
    var content;
    try {
        content = file.read();
    } finally {
        // The original never closed the stream; release the handle even
        // when reading fails.
        file.close();
    }
    // NOTE(review): new Function() executes the text of url.js as code. That
    // is acceptable only while url.js is a trusted local file; never point
    // this at downloaded content.
    var scriptCode = content + '; return URI;';
    window.URI = new Function(scriptCode)();
    if (typeof window.URI === "function") {
        this.echo('URI.js loaded');
    } else {
        this.warn('Could not setup URI.js').exit();
    }
}).run(check);

 

發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
相關文章