Building Your Own DSL, Part 1: A Simple Crawler

Please credit the source when reposting: http://fuliang.iteye.com/blog/1122008

I often need to crawl content from the web and turn it into a corpus for classification, so I want a flexible crawling and extraction tool, a small DSL of my own, so that each time only a few lines of code are needed to get the content I want. For example, I would like the following few lines to be enough to fetch the posts from my blog:


crawler = Crawler.new
1.upto(10) do |pn|
  urls = []
  # collect the post URLs from listing page pn
  crawler.fetch "http://fuliang.iteye.com/?page=#{pn}" do |page|
    page.css("div.blog_title > h3 > a").each do |node|
      urls << "http://fuliang.iteye.com#{node.attributes['href']}"
    end
  end

  # fetch each post and extract title/content as one record per entry
  urls.each do |url|
    crawler.fetch url do |page|
      page.xpath(:title => '//*[@id="main"]/div/div[2]/h3/a', :content => '//*[@id="blog_content"]').each do |entry|
        printf("%s\t%s\n", entry[:title].text.gsub(/\s+/, ""), entry[:content].text.gsub(/\s+/, ""))
      end
    end
  end
  break # only crawl the first listing page
end


We first create a Crawler object. Following the pagination pattern of my blog's listing pages, the URL of page pn is http://fuliang.iteye.com/?page=#{pn}; of course the rules for building the list of listing-page URLs may be more complex, and we then iterate over them. The crawler has a single fetch method, which fetches a page and hands it to the block for processing. Inside the block the page can be queried directly with xpath or css to get the content we want to extract, and we can also extract one record at a time: just pass the xpath or css method a hash mapping field names to xpath/css expressions, and it returns a hash for each matching record.
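
For example, the hash form returns an array of record hashes whose values are the matched Nokogiri nodes; here is a sketch of the return shape (inferred from the Page implementation shown later):

page.xpath(:title => '//*[@id="main"]/div/div[2]/h3/a', :content => '//*[@id="blog_content"]')
# => [{:title => <title node>, :content => <content node>}, ...]
# each record's text is then available as entry[:title].text and entry[:content].text
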
Following the description above, we first write a simple Crawler. To avoid getting blocked, it rotates through several proxies:

class Crawler
  def initialize
    # a pool of proxies to rotate through, to avoid getting blocked
    @proxies = 1.upto(6).collect{|index| "http://l-crwl#{index}:1080"}
  end

  def fetch(url)
    yield Page.new( Nokogiri::HTML(open(url, fetch_options)) )
  end

  private
  def rand_proxy
    # pick one of the six proxies at random
    @proxies[(rand * 6).to_i]
  end

  def fetch_options
    user_agent = "Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.8.1.2) Gecko/20061201 Firefox/2.0.0.2 (Ubuntu-feisty)"

    fetch_options = {
      "User-Agent" => user_agent,
      :proxy => rand_proxy # open-uri takes the proxy as the symbol option :proxy
    }
  end
end

Next we define the Page class. It dynamically defines css and xpath methods that simply delegate to Nokogiri's own css and xpath and let it do the work; we just collect the extracted results and we are done:

class Page
  def initialize(html)
    @html = html
  end

  class_eval do
    [:css, :xpath].each do |extract_by|
      define_method extract_by do |arg, &block|
        if arg.is_a? String then
          if block.nil? then
            @html.send(extract_by, arg)
          else
            block.call(@html.send(extract_by, arg))
          end
        elsif arg.is_a? Hash then
          # evaluate each field's expression, then zip the per-field node lists
          # into one hash per record
          extract_raw = arg.collect{|key, value| [key, @html.send(extract_by, value)]}
          data = extract_raw.collect do |key, vals|
            ([key] * vals.size).zip(vals)
          end
          result = data[0].zip(*data[1..-1]).collect{|e| Hash[*e.flatten]}
          if block.nil? then
            result
          else
            block.call(result)
          end
        else
          raise ArgumentError.new('Argument must be a String or a Hash')
        end
      end
    end
  end
end
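
The only tricky part is the line that assembles the record hashes. Here is a minimal sketch of what it does, using made-up values in place of the node lists Nokogiri would return:

# suppose :title matched two nodes ("t1", "t2") and :content matched two ("c1", "c2")
data = [[[:title, "t1"], [:title, "t2"]],
        [[:content, "c1"], [:content, "c2"]]]
rows = data[0].zip(*data[1..-1])
# => [[[:title, "t1"], [:content, "c1"]], [[:title, "t2"], [:content, "c2"]]]
rows.collect{|e| Hash[*e.flatten]}
# => [{:title => "t1", :content => "c1"}, {:title => "t2", :content => "c2"}]

Note that the matches are paired up positionally, so this assumes every field's expression matches the same number of nodes in the same order.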


The complete code:

#!/usr/bin/env ruby

require 'rubygems'
require 'nokogiri'
require 'open-uri'

class Crawler
  def initialize
    # a pool of proxies to rotate through, to avoid getting blocked
    @proxies = 1.upto(6).collect{|index| "http://l-crwl#{index}:1080"}
  end

  def fetch(url)
    yield Page.new( Nokogiri::HTML(open(url, fetch_options)) )
  end

  private
  def rand_proxy
    # pick one of the six proxies at random
    @proxies[(rand * 6).to_i]
  end

  def fetch_options
    user_agent = "Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.8.1.2) Gecko/20061201 Firefox/2.0.0.2 (Ubuntu-feisty)"

    fetch_options = {
      "User-Agent" => user_agent,
      :proxy => rand_proxy # open-uri takes the proxy as the symbol option :proxy
    }
  end
end

class Page
  def initialize(html)
    @html = html
  end

  class_eval do
    [:css, :xpath].each do |extract_by|
      define_method extract_by do |arg, &block|
        if arg.is_a? String then
          if block.nil? then
            @html.send(extract_by, arg)
          else
            block.call(@html.send(extract_by, arg))
          end
        elsif arg.is_a? Hash then
          # evaluate each field's expression, then zip the per-field node lists
          # into one hash per record
          extract_raw = arg.collect{|key, value| [key, @html.send(extract_by, value)]}
          data = extract_raw.collect do |key, vals|
            ([key] * vals.size).zip(vals)
          end
          result = data[0].zip(*data[1..-1]).collect{|e| Hash[*e.flatten]}
          if block.nil? then
            result
          else
            block.call(result)
          end
        else
          raise ArgumentError.new('Argument must be a String or a Hash')
        end
      end
    end
  end
end

crawler = Crawler.new
1.upto(10) do |pn|
  urls = []
  # collect the post URLs from listing page pn
  crawler.fetch "http://fuliang.iteye.com/?page=#{pn}" do |page|
    page.css("div.blog_title > h3 > a").each do |node|
      urls << "http://fuliang.iteye.com#{node.attributes['href']}"
    end
  end

  # fetch each post and extract title/content as one record per entry
  urls.each do |url|
    crawler.fetch url do |page|
      page.xpath(:title => '//*[@id="main"]/div/div[2]/h3/a', :content => '//*[@id="blog_content"]').each do |entry|
        printf("%s\t%s\n", entry[:title].text.gsub(/\s+/, ""), entry[:content].text.gsub(/\s+/, ""))
      end
    end
  end
  break # only crawl the first listing page
end