利用casperjs+phantomjs獲取網站文檔
利用無界面瀏覽器,結合js腳本獲取網站的文檔鏈接,並下載文檔,簡單易行。本文從phantomjs 的改造開始,給出了一個遍歷網站鏈接,並下載文件格式爲pdf、zip和rar的文件的實例。
1、改造phantomjs
利用phantomjs獲取文件需要對其進行改造,具體改造如下:
- 首先進入 Visual Studio 工具的命令行模式(VS Tools 命令提示符);
- 然後運行 editbin /LARGEADDRESSAWARE phantomjs.exe 即可。
這樣保證大的文件可以下載。
2、利用casper 獲取文件示例
直接利用download即可下載。
// Minimal CasperJS script: open the site's home page, then download one
// known PDF into the current working directory under a fixed file name.
var utils = require('utils'),
    fs = require('fs');
var casper = require('casper').create({
    // `verbose` is an on/off switch; the original passed the string "true",
    // which only worked because a non-empty string is truthy.
    verbose: true,
    // The original value "false" is not one of CasperJS's log levels
    // (debug, info, warning, error). "debug" keeps every message visible.
    // NOTE(review): pick a quieter level if full debug output is not wanted.
    logLevel: "debug",
    pageSettings: {
        // Disable web security so the download is not blocked as cross-origin.
        webSecurityEnabled: false
    }
});
// NOTE(review): CasperJS documents its timeouts under `casper.options`
// (e.g. `options.timeout`); confirm this top-level property is actually read.
casper.timeout = 50000;
var url = 'http://www.moh.gov.cn/ewebeditor/uploadfile/2017/06/20170605095215136.pdf';
casper.start('http://www.moh.gov.cn', function () {
    // Runs once the start page has loaded: save `url` to the given file name.
    this.download(url, '20170605095215136.pdf');
});
casper.run(function () {
    this.echo('Done.').exit();
});
3、完整例子
3.1爬蟲設置
(1) 首先下載相關的程序和系統,下載的網址如下:
http://phantomjs.org/download.html
http://casperjs.org/
(2) 下載後解壓,設置環境變量,使其指向解壓目錄和解壓後的bin目錄即可。
3.2批處理命令
啓動爬蟲的批處理命令:
@echo off
rem Switch the console code page to 65001 (UTF-8).
chcp 65001
rem "cd ." changes to the current directory, i.e. the working directory stays as it is.
cd .
rem Run crawel.js with CasperJS, passing the start URL as the first argument.
casperjs crawel.js http://www.moh.gov.cn/zhuz/s9493/wsbz.shtml
3.3 相關代碼
url.js代碼:
/*
* An URI datatype. Based upon examples in RFC3986.
*
* TODO %-escaping
* TODO split apart authority
* TODO split apart query_string (on demand, anyway)
*
* @(#) $Id$
*/
// URI constructor. Splits a URI reference string into its five components
// (scheme, authority, path, query, fragment) and stores each one on the new
// object. A missing OR empty component is stored as null. A falsy `str`
// (undefined, null, "") is parsed as the empty string.
function URI(str) {
    var source = str ? str : "";
    // Based on the regex in RFC2396 Appendix B.
    var parser = /^(?:([^:\/?\#]+):)?(?:\/\/([^\/?\#]*))?([^?\#]*)(?:\?([^\#]*))?(?:\#(.*))?/;
    var captured = source.match(parser);
    // Capture group i + 1 holds the component named at index i.
    var componentNames = ["scheme", "authority", "path", "query", "fragment"];
    for (var i = 0; i < componentNames.length; i++) {
        this[componentNames[i]] = captured[i + 1] || null;
    }
}
// Reassemble the components into a string. Components that are null (or
// otherwise falsy) are skipped together with their delimiter, so an empty
// authority such as the one in "file:///x" is not reproduced.
URI.prototype.toString = function () {
    var pieces = [];
    if (this.scheme) {
        pieces.push(this.scheme, ":");
    }
    if (this.authority) {
        pieces.push("//", this.authority);
    }
    if (this.path) {
        pieces.push(this.path);
    }
    if (this.query) {
        pieces.push("?", this.query);
    }
    if (this.fragment) {
        pieces.push("#", this.fragment);
    }
    return pieces.join("");
};
// Introduce a new scope to define some private helper functions.
(function () {
// RFC3986 §5.2.3 (Merge Paths)
// Combine a relative path with the path of a base URI.
//   base     - object with `authority` and `path` properties (e.g. a URI).
//   rel_path - the relative reference's path.
// Returns the merged path, before any dot-segment removal.
function merge(base, rel_path) {
    // A base with an authority but no path merges as if its path were "/".
    if (base.authority && !base.path) {
        return "/" + rel_path;
    }
    // Keep everything up to and including the right-most "/" of the base
    // path. When the base path is missing or has no "/", the whole base path
    // is dropped (lastIndexOf gives -1, so the kept prefix is empty) instead
    // of throwing on a failed regex match.
    var basePath = base.path || "";
    var lastSlash = basePath.lastIndexOf("/");
    return basePath.substr(0, lastSlash + 1) + rel_path;
}
// RFC3986 §5.2.4 (Remove Dot Segments)
// Resolve "." and ".." segments in `path` and return the cleaned path.
// A falsy path (null, undefined, "") yields "".
//
// This follows the RFC's input-buffer/output-buffer algorithm rather than a
// series of regex replacements, so that consecutive dot segments
// ("/a/././b"), a bare "/.." and leading "./" or "../" are all handled.
function remove_dot_segments(path) {
    if (!path) return "";

    function startsWith(text, prefix) {
        return text.substr(0, prefix.length) === prefix;
    }

    var input = path;
    var output = [];
    while (input.length > 0) {
        if (startsWith(input, "../")) {
            // A. Leading "../" has nothing to climb out of: drop it.
            input = input.substr(3);
        } else if (startsWith(input, "./")) {
            // A. Leading "./" is a no-op: drop it.
            input = input.substr(2);
        } else if (startsWith(input, "/./")) {
            // B. "/./" collapses to "/".
            input = input.substr(2);
        } else if (input === "/.") {
            // B. Trailing "/." collapses to "/".
            input = "/";
        } else if (startsWith(input, "/../")) {
            // C. "/../" collapses to "/" and removes the previous segment.
            input = input.substr(3);
            output.pop();
        } else if (input === "/..") {
            // C. Trailing "/.." does the same, leaving a trailing "/".
            input = "/";
            output.pop();
        } else if (input === "." || input === "..") {
            // D. A lone dot segment disappears.
            input = "";
        } else {
            // E. Move the first segment (with its leading "/", if any) to
            // the output. Searching from index 1 skips that leading "/".
            var nextSlash = input.indexOf("/", 1);
            var segment = nextSlash === -1 ? input : input.substr(0, nextSlash);
            output.push(segment);
            input = input.substr(segment.length);
        }
    }
    return output.join("");
}
// RFC3986 §5.2.2 (Transform References).
// Resolve this URI, treated as a reference, against `base` (another URI)
// and return a new URI; neither `this` nor `base` is modified.
// The fragment always comes from the reference.
URI.prototype.resolve = function (base) {
    var resolved = new URI();
    resolved.fragment = this.fragment;

    // A reference with its own scheme or authority keeps its own authority,
    // path and query; only a missing scheme is taken from the base.
    if (this.scheme || this.authority) {
        resolved.scheme = this.scheme ? this.scheme : base.scheme;
        resolved.authority = this.authority;
        resolved.path = remove_dot_segments(this.path);
        resolved.query = this.query;
        return resolved;
    }

    // From here on the reference has neither scheme nor authority.
    resolved.scheme = base.scheme;
    resolved.authority = base.authority;

    // XXX Original spec says "if defined and empty"; here a null path is
    // what an empty path parses to.
    if (!this.path) {
        resolved.path = base.path;
        resolved.query = this.query ? this.query : base.query;
        return resolved;
    }

    // An absolute path replaces the base path; a relative one is merged
    // with it first. Dot segments are removed either way.
    var isAbsolutePath = this.path.charAt(0) === '/';
    var rawPath = isAbsolutePath ? this.path : merge(base, this.path);
    resolved.path = remove_dot_segments(rawPath);
    resolved.query = this.query;
    return resolved;
};
})();
crawel.js文件如下
// ---- Modules ----
var fs = require('fs');
var utils = require('utils');
var f = utils.format;

// ---- Casper instance ----
// Images and plugins are not loaded.
// NOTE(review): `clearMemoryCaches` is passed as a page setting; confirm the
// runtime actually honours it.
var casper = require("casper").create({
    pageSettings: {
        loadImages: false,
        loadPlugins: false,
        clearMemoryCaches: true
    }
});
// NOTE(review): CasperJS documents its timeouts under `casper.options`;
// confirm this top-level property is actually read.
casper.timeout = 500000;

// ---- Command line ----
// First positional argument: the start URL. It also serves as the prefix
// that absolute links must match (see cleanLinks).
var url = casper.cli.get(0);
var baseUrl = url;
// --max-depth=N caps how far the `currentLink` index may advance;
// `~~` coerces the option to an integer and anything falsy falls back to 100000.
var upTo = ~~casper.cli.get('max-depth') || 100000;

// ---- Shared crawl state ----
var links = [url];      // queue of links to visit
var checked = [];       // links that have already been visited
var currentLink = 0;    // index into `links` of the next link to visit
// Resolve `url` (possibly relative) against `base` and return the result
// as a string.
function absPath(url, base) {
    var reference = new URI(url);
    var baseUri = new URI(base);
    var resolved = reference.resolve(baseUri);
    return resolved.toString();
}
// Clean links.
// Turn the raw href values collected from a page into the list of absolute
// URLs that still have to be visited.
//   urls - array of href strings; a null/undefined value is treated as empty.
//   base - URL of the page the hrefs came from, used to resolve relative ones.
// Returns a new array of absolute URL strings not yet present in `checked`.
function cleanLinks(urls, base) {
    // Matches fragment-only hrefs ("#top") and hrefs carrying an explicit
    // scheme ("http:", "https:", "ftp:", "javascript:", "mailto:", ...).
    // The colon is required, so a relative link whose file name merely
    // starts with "http" or "ftp" (e.g. "httpd.html") is no longer dropped,
    // and non-crawlable schemes such as "mailto:" no longer slip through.
    var fragmentOrScheme = /^(#|[a-z][a-z0-9+.\-]*:)/i;
    return utils.unique(urls || []).filter(function (url) {
        // Keep absolute links that start with the start URL, plus every
        // scheme-less (relative) link.
        return url.indexOf(baseUrl) === 0 || !fragmentOrScheme.test(url);
    }).map(function (url) {
        return absPath(url, base);
    }).filter(function (url) {
        return checked.indexOf(url) === -1;
    });
}
// Visit one link and queue the links found afterwards.
// Must be called with a Casper instance as `this` (crawl.call(casper, link)).
// It schedules three steps:
//   1. download the link if it looks like a document/archive, otherwise
//      open it as a page; either way record it in `checked`;
//   2. report the HTTP status Casper currently holds;
//   3. collect links from the current page and append the unseen ones to
//      the global `links` queue.
function crawl(link) {
    // A link containing any of these substrings is downloaded, not opened.
    var downloadMarkers = [".pdf", ".zip", ".rar", ".doc"];

    // True when `candidate` contains one of the markers. The original test
    // had a misplaced parenthesis, so ".doc" was never actually checked.
    function isDownloadLink(candidate) {
        for (var i = 0; i < downloadMarkers.length; i++) {
            if (candidate.indexOf(downloadMarkers[i]) !== -1) {
                return true;
            }
        }
        return false;
    }

    // Last path segment of `candidate`, or "" when the URL has no path or
    // the path ends in "/" (previously a missing path threw a TypeError).
    function fileNameOf(candidate) {
        var path = new URI(candidate).resolve(new URI(candidate)).path || "";
        return path.substr(path.lastIndexOf("/") + 1);
    }

    this.start().then(function() {
        this.echo(link, 'COMMENT');
        var filename = isDownloadLink(link) ? fileNameOf(link) : "";
        if (filename) {
            this.echo("download " + filename);
            this.download(link, filename);
        } else {
            // Ordinary pages, and download-looking links for which no file
            // name can be derived, are opened as pages.
            this.open(link);
        }
        checked.push(link);
    });
    this.then(function() {
        if (this.currentHTTPStatus === 404) {
            this.warn(link + ' is missing (HTTP 404)');
        } else if (this.currentHTTPStatus === 500) {
            this.warn(link + ' is broken (HTTP 500)');
        } else {
            this.echo(link + f(' is okay (HTTP %s)', this.currentHTTPStatus));
        }
    });
    this.then(function() {
        var newLinks = searchLinks.call(this);
        // Append only links that are neither queued nor visited, and never
        // remove entries: `links` is walked by index (`currentLink`), so
        // shrinking it would shift later entries under the index and skip
        // them, while blind concatenation would queue duplicates.
        newLinks.forEach(function(url) {
            if (links.indexOf(url) === -1 && checked.indexOf(url) === -1) {
                links.push(url);
            }
        });
        this.echo(newLinks.length + " new links found on " + link);
    });
}
// Collect the href attribute of every <a href> element on the current page
// and return the list after it has been passed through cleanLinks, using
// the current page URL as the base for relative links.
// Must be called with a Casper instance as `this`.
function searchLinks() {
    // The callback is handed to this.evaluate(); `__utils__` is not defined
    // in this file and is expected to exist where the callback runs.
    var hrefs = this.evaluate(function _fetchInternalLinks() {
        var anchors = __utils__.findAll('a[href]');
        return Array.prototype.map.call(anchors, function(anchor) {
            return anchor.getAttribute('href');
        });
    });
    var pageUrl = this.getCurrentUrl();
    return cleanLinks(hrefs, pageUrl);
}
// Run-loop callback. While there is a link at index `currentLink` and that
// index is still below `upTo`, crawl that link, advance the index and
// re-run with this function as the completion callback. Otherwise report
// the number of checked links and exit.
// Must be called with a Casper instance as `this`.
function check() {
    var nextLink = links[currentLink];
    var exhausted = !nextLink || !(currentLink < upTo);
    if (exhausted) {
        this.echo("All done, " + checked.length + " links checked.");
        this.exit();
        return;
    }
    crawl.call(this, nextLink);
    currentLink++;
    this.run(check);
}
// Entry point: validate the command line, load the URI helper from url.js,
// then hand control to check(), which drives the crawl.
if (!url) {
    casper.warn('No url passed, aborting.').exit();
}
casper.start(function () {
}).then(function() {
    this.echo("Starting");
    // Read url.js from the current working directory. The handle is closed
    // in `finally` so it is released even if the read throws (the original
    // never closed it).
    var file = fs.open("./url.js", 'r');
    var content;
    try {
        content = file.read();
    } finally {
        file.close();
    }
    // url.js is plain script text, not a module: wrap it in a function body
    // that returns its URI constructor and publish that on `window`.
    // NOTE(review): `new Function` executes the file's contents as code;
    // only point this at a trusted local url.js.
    var scriptCode = content + '; return URI;';
    window.URI = new Function(scriptCode)();
    if (typeof window.URI === "function") {
        this.echo('URI.js loaded');
    } else {
        this.warn('Could not setup URI.js').exit();
    }
}).run(check);