利用casperjs+phantomjs獲取網站文檔

利用無界面瀏覽器，結合js腳本獲取網站的文檔鏈接，並下載文檔，簡單易行。本文從phantomjs 的改造開始，給出了一個遍歷網站鏈接，並下載文件格式爲pdf、zip和rar的文件的實例。

1、改造phantomjs

利用phantomjs獲取文件需要對其進行改造，具體改造如下：

- 首先進入 Visual Studio 工具的命令行模式（VS Tools 命令提示符）：

- 然後運行 editbin /LARGEADDRESSAWARE phantomjs.exe 即可。

這樣保證大的文件可以下載。

2、利用casper 獲取文件示例

直接利用download即可下載。

// Minimal CasperJS example: open the site's home page, then download a
// single PDF into the current working directory.
var utils = require('utils'),
    fs = require('fs');

var casper = require('casper').create({
    // `verbose` is a boolean switch; the original passed the string "true",
    // which only worked because any non-empty string is truthy.
    verbose: true,
    // The original passed "false", which is not a CasperJS log level name.
    // NOTE(review): "debug" (log everything) is presumed to match what the
    // unknown level effectively did - confirm the intended verbosity.
    logLevel: "debug",
    pageSettings: {
        // Turn off PhantomJS web security for this page.
        webSecurityEnabled: false
    }
});

// NOTE(review): CasperJS documents its timeout as the `timeout` option
// (casper.options.timeout); confirm whether this plain property is read.
casper.timeout = 50000;

var url = 'http://www.moh.gov.cn/ewebeditor/uploadfile/2017/06/20170605095215136.pdf';

casper.start('http://www.moh.gov.cn', function () {
    // Save the document under its own file name (the last path segment of
    // the URL) instead of repeating that name as a second literal.
    this.download(url, url.substr(url.lastIndexOf('/') + 1));
});

casper.run(function () {
    this.echo('Done.').exit();
});

3、完整例子

3.1爬蟲設置

(1) 首先下載相關的程序和系統，下載的網址如下：

http://phantomjs.org/download.html

http://casperjs.org/

(2) 下載並解壓，然後設置環境變量，使其指向解壓目錄和解壓後的 bin 目錄即可。

3.2批處理命令

啓動爬蟲的批處理命令：

@echo off

rem Switch the console code page to 65001 (UTF-8).
chcp 65001

rem "cd ." stays in the current directory; crawel.js is looked up from here.
cd .

rem Start the crawler; the URL is passed as the first positional argument.
casperjs crawel.js http://www.moh.gov.cn/zhuz/s9493/wsbz.shtml

3.3 相關代碼

url.js代碼：

/*
 * A URI datatype. Based upon examples in RFC3986.
 *
 * TODO %-escaping
 * TODO split apart authority
 * TODO split apart query_string (on demand, anyway)
 *
 * @(#) $Id$
 */

/**
 * Constructor for the URI object. Splits a URI reference into the
 * components scheme, authority, path, query and fragment.
 *
 * A component that is absent from the input, or that matched as an empty
 * string, is stored as null.
 *
 * @param {string} [str] URI reference to parse; a falsy value is treated as "".
 */
function URI(str) {
    // Based on the regex in RFC2396 Appendix B.
    var parser = /^(?:([^:\/?\#]+):)?(?:\/\/([^\/?\#]*))?([^?\#]*)(?:\?([^\#]*))?(?:\#(.*))?/;
    var pieces = (str ? str : "").match(parser);

    // Capture groups 1..5 line up with these property names, in order.
    var names = ["scheme", "authority", "path", "query", "fragment"];
    for (var i = 0; i < names.length; i++) {
        this[names[i]] = pieces[i + 1] || null;
    }
}

/**
 * Restore the URI to its string form by joining the components that are
 * set, each with its delimiter: "scheme:", "//authority", path, "?query",
 * "#fragment". Components that are null or empty are left out.
 *
 * @returns {string} the reassembled URI reference
 */
URI.prototype.toString = function () {
    var out = [];

    if (this.scheme) {
        out.push(this.scheme, ":");
    }
    if (this.authority) {
        out.push("//", this.authority);
    }
    if (this.path) {
        out.push(this.path);
    }
    if (this.query) {
        out.push("?", this.query);
    }
    if (this.fragment) {
        out.push("#", this.fragment);
    }

    return out.join("");
};

// Introduce a new scope to define some private helper functions.

(function () {

// RFC3986 §5.2.3 (Merge Paths)
/**
 * Merge a relative path with the path of a base URI.
 *
 * - If the base has an authority and no path, the result is "/" + rel_path.
 * - Otherwise the result is the base path up to and including its
 *   right-most "/", followed by rel_path. A base path without any "/"
 *   (or no base path at all) contributes nothing, so rel_path is returned
 *   unchanged instead of throwing.
 *
 * @param {URI} base      base URI (reads .authority and .path)
 * @param {string} rel_path relative path to append
 * @returns {string} the merged, not yet normalised, path
 */
function merge(base, rel_path) {
    if (base.authority && !base.path) {
        return "/" + rel_path;
    }

    var basePath = base.path || "";
    // lastIndexOf() yields -1 when there is no "/", which makes the
    // substring empty - exactly the "exclude the entire base path" case.
    return basePath.substring(0, basePath.lastIndexOf("/") + 1) + rel_path;
}

// Matches "/<segment>/../" where <segment> is anything except "..", i.e. a
// real path segment followed by a step back up.
var DoubleDot = /\/((?!\.\.\/)[^\/]*)\/\.\.\//;

/**
 * Normalise a path by removing "." and ".." segments.
 *
 * @param {string} path path to clean; a falsy value yields ""
 * @returns {string} the path without dot segments
 */
function remove_dot_segments(path) {
    if (!path) {
        return "";
    }

    // Drop "/./" in the middle and a "/." at the very end.
    var cleaned = path.replace(/\/\.\//g, '/').replace(/\/\.$/, '/');

    // Collapse "segment/../" pairs one at a time. The "g" modifier cannot
    // be used because each replacement changes the string being matched.
    while (DoubleDot.test(cleaned)) {
        cleaned = cleaned.replace(DoubleDot, '/');
    }

    // Collapse a trailing "segment/..".
    cleaned = cleaned.replace(/\/([^\/]*)\/\.\.$/, '/');

    // Whatever "/../" is still left has no segment to cancel out and is
    // simply dropped, again one occurrence per pass.
    var strayDoubleDot = /\/\.\.\//;
    while (strayDoubleDot.test(cleaned)) {
        cleaned = cleaned.replace(strayDoubleDot, '/');
    }

    return cleaned;
}

// RFC3986 §5.2.2. Transform References;
/**
 * Resolve this URI reference against a base URI and return the result as a
 * new URI; neither this object nor the base is modified.
 *
 * - A reference with its own scheme is only normalised.
 * - A reference with an authority inherits just the scheme of the base.
 * - A reference without a path inherits the base path (and the base query,
 *   unless it has a query of its own).
 * - A path starting with "/" replaces the base path; any other path is
 *   merged with the base path.
 * The fragment always comes from the reference.
 *
 * The published listing had lost the closing brace of the "no path" branch,
 * so it did not parse; the nesting is restored here.
 *
 * @param {URI} base absolute URI to resolve against
 * @returns {URI} the resolved target URI
 */
URI.prototype.resolve = function (base) {
    var target = new URI();

    if (this.scheme) {
        target.scheme = this.scheme;
        target.authority = this.authority;
        target.path = remove_dot_segments(this.path);
        target.query = this.query;
    }
    else {
        if (this.authority) {
            target.authority = this.authority;
            target.path = remove_dot_segments(this.path);
            target.query = this.query;
        }
        else {
            // XXX Original spec says "if defined and empty"…;
            if (!this.path) {
                target.path = base.path;
                if (this.query) {
                    target.query = this.query;
                }
                else {
                    target.query = base.query;
                }
            }
            else {
                if (this.path.charAt(0) === '/') {
                    target.path = remove_dot_segments(this.path);
                } else {
                    target.path = merge(base, this.path);
                    target.path = remove_dot_segments(target.path);
                }
                target.query = this.query;
            }
            target.authority = base.authority;
        }
        target.scheme = base.scheme;
    }

    target.fragment = this.fragment;
    return target;
};

})();

crawel.js文件如下

// --- Modules -------------------------------------------------------------
var fs = require('fs');
var utils = require('utils');
var f = utils.format;

// --- Casper instance -----------------------------------------------------
// Images and plugins are not loaded.
// NOTE(review): clearMemoryCaches is passed through as a page setting;
// confirm that PhantomJS actually recognises it.
var casper = require("casper").create({
    pageSettings: {
        loadImages: false,
        loadPlugins: false,
        clearMemoryCaches: true
    }
});

// NOTE(review): set as a plain property on the instance; confirm whether
// CasperJS reads it (its documented setting is the `timeout` option).
casper.timeout = 500000;

// --- Command line --------------------------------------------------------
// First positional argument: the URL where crawling starts.
var url = casper.cli.get(0);
// --max-depth caps how far `currentLink` may advance; ~~ coerces the option
// to an integer, and a missing or zero value falls back to 100000.
var upTo = ~~casper.cli.get('max-depth') || 100000;

// --- Crawl state ---------------------------------------------------------
var baseUrl = url;      // prefix that absolute links must start with
var links = [url];      // queue of links to visit
var checked = [];       // links that have already been visited
var currentLink = 0;    // index into `links` of the next link to visit

/**
 * Turn a (possibly relative) link into an absolute URL string.
 *
 * @param {string} url  link as found in the page
 * @param {string} base URL the link is relative to
 * @returns {string} `url` resolved against `base`
 */
function absPath(url, base) {
    var reference = new URI(url);
    var resolved = reference.resolve(new URI(base));
    return resolved.toString();
}

/**
 * Clean links: reduce the raw href values of a page to the absolute URLs
 * that still have to be visited.
 *
 * Steps:
 *  1. drop duplicate hrefs;
 *  2. keep hrefs that start with `baseUrl`, plus relative hrefs - anything
 *     that is only a "#fragment" or carries a scheme ("http:", "ftp:",
 *     "javascript:", "mailto:", ...) is discarded. The scheme test needs
 *     the ":" so a relative link like "http-notes.html" is not dropped;
 *  3. resolve each href against `base`;
 *  4. drop URLs already in `checked`, and duplicates that only show up
 *     after resolution (e.g. "a.html" and "./a.html").
 *
 * @param {?Array.<string>} urls raw href values; null/undefined counts as
 *                               an empty list (page evaluation can yield
 *                               no result)
 * @param {string} base          URL of the page the hrefs came from
 * @returns {Array.<string>}     absolute URLs not visited yet, in page order
 */
function cleanLinks(urls, base) {
    // "#..." or "<scheme>:..." - built once instead of once per link.
    var fragmentOrScheme = /^(#|[a-z][a-z0-9+.\-]*:)/i;
    var resolvedSeen = {};

    return utils.unique(urls || []).filter(function (url) {
        return url.indexOf(baseUrl) === 0 || !fragmentOrScheme.test(url);
    }).map(function (url) {
        return absPath(url, base);
    }).filter(function (url) {
        if (Object.prototype.hasOwnProperty.call(resolvedSeen, url)) {
            return false;
        }
        resolvedSeen[url] = true;
        return checked.indexOf(url) === -1;
    });
}

// Substrings that mark a link as a document to download rather than a page
// to open.
var DOWNLOAD_EXTENSIONS = [".pdf", ".zip", ".rar", ".doc"];

// True when `link` contains one of the downloadable extensions.
function isDownloadLink(link) {
    return DOWNLOAD_EXTENSIONS.some(function (ext) {
        return link.indexOf(ext) !== -1;
    });
}

/**
 * Queue the steps that process one link. Must be called with the casper
 * instance as `this`.
 *
 * Step 1 downloads the link (document extensions) or opens it (everything
 *        else) and records it in `checked`.
 * Step 2 reports the current HTTP status.
 * Step 3 collects the links of the current page and appends the unseen
 *        ones to the `links` queue.
 *
 * Fixes over the published listing:
 *  - the ".doc" test sat inside the argument of the ".rar" indexOf() call
 *    (`indexOf(".rar" || ...)`), so it was never evaluated;
 *  - the queue was rebuilt without the already-checked links, which shifted
 *    the remaining entries down while `currentLink` kept counting up, so
 *    queued links were skipped. The queue is now append-only, and a link is
 *    added only if it is neither queued nor checked.
 *
 * @param {string} link absolute URL to process
 */
function crawl(link) {
    this.start().then(function () {
        this.echo(link, 'COMMENT');
        if (isDownloadLink(link)) {
            // Resolving the link against itself normalises its path; the
            // text after the last "/" becomes the local file name.
            var query = new URI(link).resolve(new URI(link));
            var filename = query.path;
            filename = filename.substr(filename.lastIndexOf("/") + 1);
            this.echo("download " + filename);
            this.download(link, filename);
        } else {
            this.open(link);
        }
        checked.push(link);
    });

    this.then(function () {
        // NOTE(review): after a download no page was opened in this run, so
        // this status may belong to an earlier page - confirm.
        if (this.currentHTTPStatus === 404) {
            this.warn(link + ' is missing (HTTP 404)');
        } else if (this.currentHTTPStatus === 500) {
            this.warn(link + ' is broken (HTTP 500)');
        } else {
            this.echo(link + f(' is okay (HTTP %s)', this.currentHTTPStatus));
        }
    });

    this.then(function () {
        var newLinks = searchLinks.call(this);
        // Append in place: existing entries must keep their index because
        // check() walks the queue by position.
        newLinks.forEach(function (url) {
            if (links.indexOf(url) === -1 && checked.indexOf(url) === -1) {
                links.push(url);
            }
        });
        this.echo(newLinks.length + " new links found on " + link);
    });
}

/**
 * Read the href attribute of every <a href> element on the current page
 * and pass the list through cleanLinks(), using the current page URL as
 * the base. Must be called with the casper instance as `this`.
 *
 * @returns {Array.<string>} whatever cleanLinks() returns for the page
 */
function searchLinks() {
    // Runs inside the page, where __utils__ is available.
    var hrefs = this.evaluate(function _fetchInternalLinks() {
        var anchors = __utils__.findAll('a[href]');
        return Array.prototype.map.call(anchors, function (anchor) {
            return anchor.getAttribute('href');
        });
    });

    return cleanLinks(hrefs, this.getCurrentUrl());
}

/**
 * Drive the crawl. As long as there is a link at position `currentLink`
 * and that position is below `upTo`, queue the link via crawl(), advance
 * the position and run the queued steps with check() as the completion
 * callback. Otherwise print a summary and exit. Must be called with the
 * casper instance as `this`.
 *
 * The published listing had lost this function's closing brace, which made
 * the rest of the script part of its body; the brace is restored here.
 */
function check() {
    if (links[currentLink] && currentLink < upTo) {
        crawl.call(this, links[currentLink]);
        currentLink++;
        this.run(check);
    } else {
        this.echo("All done, " + checked.length + " links checked.");
        this.exit();
    }
}

// Entry point: refuse to run without a start URL, load the URI helper from
// url.js, then hand control to check().
if (!url) {
    casper.warn('No url passed, aborting.').exit();
}

casper.start(function () {
}).then(function () {
    this.echo("Starting");

    // Read url.js from the current working directory. The stream is closed
    // in `finally` so the handle is released even when reading fails.
    var file = fs.open("./url.js", 'r');
    var content;
    try {
        content = file.read();
    } finally {
        file.close();
    }

    // NOTE(review): this executes the file's text through the Function
    // constructor (eval-like). Acceptable only because url.js is a local,
    // trusted file; never point this at downloaded content.
    var scriptCode = content + '; return URI;';
    window.URI = new Function(scriptCode)();

    if (typeof window.URI === "function") {
        this.echo('URI.js loaded');
    } else {
        this.warn('Could not setup URI.js').exit();
    }
}).run(check);

利用casperjs+phantomjs獲取網站文檔

利用casperjs+phantomjs獲取網站文檔

1、改造phantomjs

2、利用casper 獲取文件示例

3、完整例子

3.1爬蟲設置

3.2批處理命令

3.3 相關代碼

【簡寫Mybatis-02】註冊機的實現以及SqlSession處理

手繪二維碼

.NET藉助虛擬網卡實現一個簡單異地組網工具

dndTree.js的樹視圖動態增加節點

利用casperjs+phantomjs獲取網站文檔

開啓智能安全搜索之旅：密文全文搜索技術的研究開發和應用

雲時代，我們的信息我們做主

搜索人生之——遺憾

Mac下配置sublime實現LaTeX

https://yachay.unat.edu.pe/blog/index.php?comment_area=format_blog&comment_component=blog&comment_co

linux以太網驅動總結