Building Your Own DSL, Part 1: A Simple Crawler

Please credit the source when reposting: http://fuliang.iteye.com/blog/1122008

I often need to crawl content from the web and turn it into a corpus for classification, so I want a flexible crawling and extraction tool, a small DSL of my own, so that each time only a few lines of code are needed to get the content I want. For example, I would like the following few lines to be enough to fetch the posts from my blog:


crawler = Crawler.new
1.upto(10) do |pn|
  urls = []
  # collect the post URLs from listing page pn
  crawler.fetch "http://fuliang.iteye.com/?page=#{pn}" do |page|
    page.css("div.blog_title > h3 > a").each do |node|
      urls << "http://fuliang.iteye.com#{node.attributes['href']}"
    end
  end

  # fetch each post and extract title/content as one record per entry
  urls.each do |url|
    crawler.fetch url do |page|
      page.xpath(:title => '//*[@id="main"]/div/div[2]/h3/a', :content => '//*[@id="blog_content"]').each do |entry|
        printf("%s\t%s\n", entry[:title].text.gsub(/\s+/, ""), entry[:content].text.gsub(/\s+/, ""))
      end
    end
  end
  break # only crawl the first listing page
end


We first create a Crawler object. Following the pagination pattern of my blog's listing pages, the URL of page pn is http://fuliang.iteye.com/?page=#{pn}; of course the rules for building the list of listing-page URLs may be more complex, and we then iterate over them. The crawler has a single fetch method, which fetches a page and hands it to the block for processing. Inside the block the page can be queried directly with xpath or css to get the content we want to extract, and we can also extract one record at a time: just pass the xpath or css method a hash mapping field names to xpath/css expressions, and it returns a hash for each matching record.
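
For example, the hash form returns an array of record hashes whose values are the matched Nokogiri nodes; here is a sketch of the return shape (inferred from the Page implementation shown later):

page.xpath(:title => '//*[@id="main"]/div/div[2]/h3/a', :content => '//*[@id="blog_content"]')
# => [{:title => <title node>, :content => <content node>}, ...]
# each record's text is then available as entry[:title].text and entry[:content].text
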
Following the description above, we first write a simple Crawler. To avoid getting blocked, it rotates through several proxies:

class Crawler
  def initialize
    # a pool of proxies to rotate through, to avoid getting blocked
    @proxies = 1.upto(6).collect{|index| "http://l-crwl#{index}:1080"}
  end

  def fetch(url)
    yield Page.new( Nokogiri::HTML(open(url, fetch_options)) )
  end

  private
  def rand_proxy
    # pick one of the six proxies at random
    @proxies[(rand * 6).to_i]
  end

  def fetch_options
    user_agent = "Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.8.1.2) Gecko/20061201 Firefox/2.0.0.2 (Ubuntu-feisty)"

    fetch_options = {
      "User-Agent" => user_agent,
      :proxy => rand_proxy # open-uri takes the proxy as the symbol option :proxy
    }
  end
end

Next we define the Page class. It dynamically defines css and xpath methods that simply delegate to Nokogiri's own css and xpath and let it do the work; we just collect the extracted results and we are done:

class Page
  def initialize(html)
    @html = html
  end

  class_eval do
    [:css, :xpath].each do |extract_by|
      define_method extract_by do |arg, &block|
        if arg.is_a? String then
          if block.nil? then
            @html.send(extract_by, arg)
          else
            block.call(@html.send(extract_by, arg))
          end
        elsif arg.is_a? Hash then
          # evaluate each field's expression, then zip the per-field node lists
          # into one hash per record
          extract_raw = arg.collect{|key, value| [key, @html.send(extract_by, value)]}
          data = extract_raw.collect do |key, vals|
            ([key] * vals.size).zip(vals)
          end
          result = data[0].zip(*data[1..-1]).collect{|e| Hash[*e.flatten]}
          if block.nil? then
            result
          else
            block.call(result)
          end
        else
          raise ArgumentError.new('Argument must be a String or a Hash')
        end
      end
    end
  end
end
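
The only tricky part is the line that assembles the record hashes. Here is a minimal sketch of what it does, using made-up values in place of the node lists Nokogiri would return:

# suppose :title matched two nodes ("t1", "t2") and :content matched two ("c1", "c2")
data = [[[:title, "t1"], [:title, "t2"]],
        [[:content, "c1"], [:content, "c2"]]]
rows = data[0].zip(*data[1..-1])
# => [[[:title, "t1"], [:content, "c1"]], [[:title, "t2"], [:content, "c2"]]]
rows.collect{|e| Hash[*e.flatten]}
# => [{:title => "t1", :content => "c1"}, {:title => "t2", :content => "c2"}]

Note that the matches are paired up positionally, so this assumes every field's expression matches the same number of nodes in the same order.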


The complete code:

#!/usr/bin/env ruby

require 'rubygems'
require 'nokogiri'
require 'open-uri'

class Crawler
  def initialize
    # a pool of proxies to rotate through, to avoid getting blocked
    @proxies = 1.upto(6).collect{|index| "http://l-crwl#{index}:1080"}
  end

  def fetch(url)
    yield Page.new( Nokogiri::HTML(open(url, fetch_options)) )
  end

  private
  def rand_proxy
    # pick one of the six proxies at random
    @proxies[(rand * 6).to_i]
  end

  def fetch_options
    user_agent = "Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.8.1.2) Gecko/20061201 Firefox/2.0.0.2 (Ubuntu-feisty)"

    fetch_options = {
      "User-Agent" => user_agent,
      :proxy => rand_proxy # open-uri takes the proxy as the symbol option :proxy
    }
  end
end

class Page
  def initialize(html)
    @html = html
  end

  class_eval do
    [:css, :xpath].each do |extract_by|
      define_method extract_by do |arg, &block|
        if arg.is_a? String then
          if block.nil? then
            @html.send(extract_by, arg)
          else
            block.call(@html.send(extract_by, arg))
          end
        elsif arg.is_a? Hash then
          # evaluate each field's expression, then zip the per-field node lists
          # into one hash per record
          extract_raw = arg.collect{|key, value| [key, @html.send(extract_by, value)]}
          data = extract_raw.collect do |key, vals|
            ([key] * vals.size).zip(vals)
          end
          result = data[0].zip(*data[1..-1]).collect{|e| Hash[*e.flatten]}
          if block.nil? then
            result
          else
            block.call(result)
          end
        else
          raise ArgumentError.new('Argument must be a String or a Hash')
        end
      end
    end
  end
end

crawler = Crawler.new
1.upto(10) do |pn|
  urls = []
  # collect the post URLs from listing page pn
  crawler.fetch "http://fuliang.iteye.com/?page=#{pn}" do |page|
    page.css("div.blog_title > h3 > a").each do |node|
      urls << "http://fuliang.iteye.com#{node.attributes['href']}"
    end
  end

  # fetch each post and extract title/content as one record per entry
  urls.each do |url|
    crawler.fetch url do |page|
      page.xpath(:title => '//*[@id="main"]/div/div[2]/h3/a', :content => '//*[@id="blog_content"]').each do |entry|
        printf("%s\t%s\n", entry[:title].text.gsub(/\s+/, ""), entry[:content].text.gsub(/\s+/, ""))
      end
    end
  end
  break # only crawl the first listing page
end