Hub Page Identification Based on Deep Learning

1. Introduction

    In web crawling, automatic hub identification is an important research topic: automatically identified hubs can be added to a crawler's seed list, which increases both the volume and the coverage of the crawled data. Several hub-identification algorithms based on traditional machine learning already exist, but they require considerable feature-engineering experience to build a stable and reliable model. Moreover, because web page structures vary enormously, extracting stable and reliable features is quite difficult.

    Deep-learning-based methods, by contrast, are free from the constraints of feature engineering: they extract features from the samples automatically, and with their strong learning and representation capacity they promise to be far more effective than traditional methods at hub identification.

 

2. Training Sample Generation

    A web page is semi-structured data; to generate training data, the key step is converting the semi-structured page into a vector. If the nodes in the body of a page are organized recursively into a tree, the result has the following structure.

    [figure: the node tree of a page body]

    Today's page sources almost all follow the div+css pattern. Looking closely, the divs are generally organized top-to-bottom or left-to-right, closely matching the order in which the human eye scans a page. The sample-generation algorithm introduced below processes each node of the page in this visual order and finally converts the page into a one-dimensional vector.
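
    To make this ordering concrete, here is a minimal sketch (the sample HTML is purely illustrative) that walks a page body depth-first with lxml; for div+css layouts, document order coincides with the top-to-bottom, left-to-right visual order described above.

from lxml import etree

html = etree.HTML("""
<html><body>
  <div><h1>Title</h1><p>First paragraph</p></div>
  <div><a href="/list/1.html">link</a><p>Second paragraph</p></div>
</body></html>""")

def dfs(node, depth=0):
    # visit a node, then recurse into its children in document order
    print("  " * depth + node.tag)
    for child in node.getchildren():
        dfs(child, depth + 1)

dfs(html.find("body"))    # body, div, h1, p, div, a, p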


    The PageToVec algorithm

    輸入: url, html, domain

    輸出: one-dim vector

  1.     Build the node tree with an open-source library such as lxml.
  2.     Traverse depth-first, handling each node recursively: for each node type, append a distinct mark at the tail of the vector.
  3.     Vector mapping: map the vector onto one containing only the numbers -1, 0 and 1.
  4.     Visual enhancement: for visually prominent nodes such as p, h1 and video, expand the corresponding vector positions according to fixed rules.
  5.     Vector normalization: sample-truncate vectors that exceed the target length and pad short ones at the head or tail.
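
    To illustrate steps 2 through 4, a body containing a heading, three in-site links and a 64-character paragraph would pass through roughly the following intermediate forms (the lengths are illustrative):

# step 2: marks appended during the depth-first walk
raw = ["h-12", "-1", "-1", "-1", "p-64", "0"]
# steps 3-4: map to {-1, 0, 1}, then expand visually prominent nodes:
# an h mark becomes ten 1s, a 64-character paragraph 64/32 = 2 lines -> two 1s
expanded = [1] * 10 + [-1, -1, -1] + [1, 1] + [0]
# step 5: pad with zeros (or sample-truncate) up to the fixed size, here 512
vector = expanded + [0] * (512 - len(expanded))
assert len(vector) == 512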

    Below is page_to_vec.py, the implementation of the PageToVec algorithm.

#!/usr/bin/env python
# -*- coding: UTF-8 -*-

import time, datetime
from time import sleep
import sys
import os
import math
import urllib
import random
import logging
import copy
from lxml import etree
from facility import *
from ac_automation import *
reload(sys)
sys.setdefaultencoding('utf-8')

class PageToVec(object):
    def __init__(self):
        super(PageToVec, self).__init__()
        self.domain = ""
        self.host = ""
        self.url = ""
        # upper bound on url length
        self.min_url_len = 256
        self.vector = []
        # max vector len
        self.vsize = 512
        self.h_tags = ['h1', 'h2']
        # tags skipped during recursion
        self.skip_tags = ["footer","noscript","nav","input","select","option"]
        self.kw_dict = {}
        # Aho-Corasick automaton for keyword matching
        self.ac_automation = ac_automation()
        self.ac_automation.parse("./key_word.txt")
        # minimum paragraph length
        self.min_para_len = 16
        # average number of characters per displayed line of a paragraph
        self.ave_char_num_per_line = 32
        # minimum heading length
        self.min_h_len = 5
        # number of p tags
        self.p_num = 0
        # number of a tags
        self.a_num = 0
        # minimum anchor text length
        self.min_anchor_len = 5

    def erase(self):
        self.vector = []
        self.p_num = 0
        self.a_num = 0
        self.domain = ""
        self.host = ""

    def parse(self, url, content, domain):
        try:
            host = url.split('/')[2]
            self.url = url
            self.host = host
            self.domain = domain
            self.clearNotes(content)
            self.clearStyle(content)
            # clearScripts produces the "#\n" lines filtered out below
            self.clearScripts(content)
            lst = []
            for line in content:
                if line == "*\n" or line == "#\n" or line == "~\n":
                    continue
                lst.append(line)
            if not lst or len(lst) == 0:
                return []
            html = etree.HTML("".join(lst))
            if html is None:
                return []
            tree = etree.ElementTree(html)
            node_list = tree.xpath('/html/body')
            if node_list is not None and len(node_list) > 0:
                self.fill(node_list[0])
                self.normalize()
                return self.vector
            else:
                self.vector = []
                return self.vector
        except Exception as e:
            logger.info("error: %s" % str(e))
            self.erase()
            return self.vector

    """
    valid paragraph length > 16
    """
    def is_valid_para(self, para):
        if para == None or para == "":
            return False
        ln = len(para)
        if ln >= self.min_para_len:
            return True
        else:
            return False

    """
    fill vector by iterate node tree
    param: tree root
    param: recuresive depth
    """
    def fill(self, root):
        for node in root.getchildren():
            if self.skip(node):
                continue
            childs = node.getchildren()
            # a tag process
            if node.tag == "a":
                self.tag_a(node)
                continue
            # h tag process
            if node.tag in self.h_tags:
                h = self.tag_h(node)
                if h and len(h) >= self.min_h_len:
                    self.vector.append("h-" + str(len(h)))
                else:
                    self.vector.append("0")
                continue
            # img tag process (src may be absent, so guard against None)
            if node.tag == "img" and ".jpg" in (node.get("src") or ""):
                self.tag_img(node)
                continue
            if node.tag == "script":
                self.tag_script(node)
                continue
            if node.tag == "iframe":
                self.tag_iframe(node)
                continue
            if node.tag == "video":
                self.tag_video(node)
                continue
            if node.tag == "embed":
                self.tag_embed(node)
                continue
            if node.tag == "audio":
                self.tag_audio(node)
                continue
            # br tag
            if node.tag == "br":
                self.vector.append("p-0")
                continue
            # paragraph process
            if node.tag == "p" or (not childs or len(childs) == 0):
                level = 0
                para = self.tag_p(node, level)
                if self.is_valid_para(para):
                    self.vector.append("p-" + str(len(para)))
                else:
                    self.vector.append("0")
                continue
            self.fill(node)

    """
    normalize vector so that all of the element bounds in [0,1,-1]
    h: 1
    p: 1
    a: -1
    others: 0
    """
    def normalize(self):
        ln = len(self.vector)
        self.p_num = 0
        self.a_num = 0
        if ln <= 0:
            return
        self.vector[ln - 1] = 0
        # phase one: map
        for i in range(ln-2, -1, -1):
            c = self.vector[i][0]
            if c == '0':
                self.vector[i] = 0
                continue
            if c == '-':
                self.a_num += 1
                self.vector[i] = -1
                continue
            if c == 'h':
                self.vector[i] = 'h'
                continue
            if c == 'i':
                self.vector[i] = 'i'
                continue
            if c == 'm':
                self.vector[i] = 'm'
                continue
            if c == 'p':
                self.p_num += 1
                # weight a paragraph by its number of displayed lines
                self.vector[i] = int(self.vector[i].split('-')[1]) // self.ave_char_num_per_line
            else:
                self.vector[i] = 0
        # phase two: expand
        container = []
        for e in self.vector:
            if e == 'h':
                container.extend([1] * 10)
            elif e == 'i':
                container.extend([1] * 5)
            elif e == 'm':
                container.extend([1] * 10)
            elif e <= 0:
                container.append(e)
            else:
                container.extend([1] * e)
        self.vector = container
        ln = len(self.vector)
        if ln > self.vsize:
            self.zoom()
            return
        if ln < self.vsize:
            self.vector.extend([0] * (self.vsize - ln))

    def zoom(self):
        self.zoom_internal(5, 0)
        if len(self.vector) == self.vsize:
            return
        self.zoom_internal(5, -1)
        if len(self.vector) > self.vsize:
            self.truncate()

    def zoom_internal(self, loop_num = 3, fig = 0):
        loop = 0
        while loop < loop_num:
            n = len(self.vector)
            df = n - self.vsize
            i = j = 0
            v = []
            while True:
                while i < n and self.vector[i] != fig:
                    i += 1
                v.extend(self.vector[j : i])
                if i >= n:
                    break
                j = i
                while i < n and self.vector[i] == fig:
                    i += 1
                if i - j > 1:
                    ln = 0
                    if i - j - df >= 1:
                        ln = i - j - df
                        df = 0
                    else:
                        r1 = int(random.uniform(1, i-j-1))
                        r2 = int(random.uniform(1, i-j-1))
                        r3 = int(random.uniform(1, i-j-1))
                        ln = min(min(r1,r2), r3)
                        df = df - (i - j - ln)
                    j = i
                    v.extend([fig] * ln)
                if df == 0:
                    v.extend(self.vector[i:])
                    break
                i += 1
            self.vector = v
            if df == 0:
                break
            loop += 1

    def truncate(self):
        j = len(self.vector) - self.vsize
        hzn = tzn = 0
        p = 0
        q = -1
        c = 0
        while c < j:
            hzn += int(self.vector[p] == 0)
            tzn += int(self.vector[q] == 0)
            c += 1
            p += 1
            q -= 1
        if hzn > tzn:
            self.vector = self.vector[j:]
        else:
            self.vector = self.vector[0 : self.vsize]

    """
    skip redundant nodes, such as attributes containing nav, ad, footer, etc
    root: current node
    """
    def skip(self, root):
        if root.tag in self.skip_tags:
            return True
        d = root.items()
        for item in d:
            if item[0] != "id" and item[0] != "class":
                continue
            if "video" in item[1] or "embed" in item[1] or "audio" in item[1]:
                continue
            if self.ac_automation.hit_key_words(item[1]):
                return True
        return False

    """
    filter key words by AC-Automation
    text: filted text
    """
    def hit_key_word(self, text):
        if not text or not text.strip():
            return False
        b1 = False
        b2 = False
        if len(text.replace("None","").strip()) < self.min_para_len:
            b1 = True
        else:
            return False
        if self.ac_automation.hit_key_words(text.strip().encode("utf-8")):
            b2 = True
        return b1 and b2

    """
    process a tag
    node: current node
    """
    def tag_a(self, node):
        href = node.get("href")
        if not href:
            self.vector.append('0')
            return
        # maybe advertisement
        if len(href) > self.min_url_len:
            return
        t = node
        while True:
            if t.text and t.text.strip():
                break
            c = t.getchildren()
            if not c or len(c) == 0:
                break
            t = c[0]
        # hit key word
        if self.hit_key_word(t.text):
            self.vector.append('0')
            return
        # anchor too short
        if t.text and len(t.text) < self.min_anchor_len:
            self.vector.append('0')
            return
        # such as /a/b.html, ../a.html, etc
        if not href.startswith("http") and "/" in href:
            self.vector.append('-1')
            return
        if self.domain and self.domain in href:
            self.vector.append('-1')
        else:
            self.vector.append('0')

    '''
    process p node and all of its sub nodes recursively
    param: current node
    param: recursion depth
    '''
    def tag_p(self, node, level):
        text = ""
        if node.text:
            text += node.text
        if node.tail:
            text += node.tail
        # get current text of node
        if text and not self.hit_key_word(text):
            text = text.strip()
        else:
            text = ""
        # get all text of node's subnodes recursively
        childs = node.getchildren()
        if childs and len(childs) > 0:
            level += 1
        for c in childs:
            if c.tag == 'a':
                self.tag_a(c)
                continue
            if c.tag in self.h_tags:
                self.tag_h(c)
                continue
            if c.tag == "br" and text and len(text) >= self.min_para_len:
                self.vector.append("p-0")
                continue
            if c.tag == "iframe":
                self.tag_iframe(c)
                continue
            if c.tag == "img" and level <= 2 and ".jpg" in (c.get("src") or ""):
                self.tag_img(c)
                continue
            text += self.tag_p(c, level)
        return text.replace("None", "")

    def tag_img(self, node):
        p = node.getparent()
        if p is None:
            return
        if p.tag != "p" and p.tag != "span" and p.tag != "center":
            return
        h = node.get("height")
        w = node.get("width")
        if (h and int(h) < 64) or (w and int(w) < 64):
            return
        self.vector.append("img")

    '''
    process h node and all sub nodes of node
    param node: current h node
    '''
    def tag_h(self, node):
        text = ""
        if node.text:
            text += node.text
        if node.tail:
            text += node.tail
        # get current text of node
        if text and not self.hit_key_word(text) and len(text.strip()) >= self.min_h_len:
            text = text.strip()
        else:
            text = ""
        # get all text of node's subnodes recursively
        childs = node.getchildren()
        for c in childs:
            if c.tag == 'a':
                self.tag_a(c)
                continue
            text += self.tag_h(c)
        return text.replace("None", "")

    '''
        detect media hiding in script
    '''
    def tag_script(self, node):
        src = node.get("src")
        if src and src.find(".mp4") != -1:
            return
        if src and src.find(".mp3") != -1:
            return
        score = 0
        text = node.text
        if not text:
            return
        token = ""
        if text.find(".mp4") != -1:
            score += 4
            token += ".mp4_"
        if text.find(".mp3") != -1:
            score += 4
            token += ".mp3_"
        if text.find("<video") != -1:
            score += 3
            token += "<video_"
        if text.find("Player") != -1 or text.find("player") != -1:
            score += 2
            token += "Player_"
        if text.find("width") != -1 or text.find("height") != -1:
            score += 1
            token += "wh_"
        if text.find("Width") != -1 or text.find("Height") != -1:
            score += 1
            token += "wh_"
        if text.find("video") != -1 or text.find("Video") != -1:
            score += 1
            token += "video_"
        if score >= 5:
            self.vector.append("media")

    def tag_iframe(self, node):
        keys = node.keys()
        if "src" in keys and "width" in keys and "height" in keys:
            src = node.get("src")
            if src.find(".mp4") != -1:
                return
            if src.find(".mp3") != -1:
                return
            if src.find("video") != -1 or src.find("Video") != -1:
                return
            if src.find("player") != -1 or src.find("Player") != -1:
                return
        childs = node.getchildren()
        for child in childs:
            if child.tag == "script":
                self.tag_script(child)
            elif child.tag == "video":
                self.tag_video(child)
            elif child.tag == "audio":
                self.tag_audio(child)

    def tag_video(self, node):
        self.vector.append("media")

    def tag_embed(self, node):
        self.vector.append("media")

    def tag_audio(self, node):
        self.vector.append("media")

    def clearScripts(self, content):
        n = len(content)
        i = 0
        while i < n:
            if not content[i].strip().startswith("<script"):
                i += 1
                continue
            while i < n and not content[i].strip().endswith("</script>"):
                content[i] = "#\n"
                i += 1
            if i < n:
                content[i] = "#\n"
                i += 1

    def clearStyle(self, content):
        n = len(content)
        i = 0
        while i < n:
            si = content[i].find("<style")
            if si == -1:
                i += 1
                continue
            ei = -1
            enjambment = False
            while i < n:
                ei = content[i].find("</style>")
                if ei == -1:
                    enjambment = True
                    content[i] = "~\n"
                    i += 1
                    continue
                if not enjambment:
                    splitor = content[i][si:ei+8]
                    content[i] = "".join(content[i].split(splitor))
                else:
                    content[i] = content[i][ei+8:]
                break

    def clearNotes(self, content):
        n = len(content)
        i = 0
        while i < n:
            si = content[i].find("<!--")
            if si == -1:
                i += 1
                continue
            ei = -1
            enjambment = False
            while i < n:
                ei = content[i].find("-->")
                if ei == -1:
                    enjambment = True
                    content[i] = "*\n"
                    i += 1
                    continue
                if not enjambment:
                    splitor = content[i][si:ei+3]
                    content[i] = "".join(content[i].split(splitor))
                else:
                    content[i] = content[i][ei+3:]
                break

if __name__ == '__main__':
    page_to_vec = PageToVec()

    Below is the AC automaton, ac_automation.py, used to accelerate multi-pattern string matching.

#!/usr/bin/env python
# -*- coding: UTF-8 -*-

import time, datetime
from time import sleep
import sys
import os
import random
import logging
from facility import *
reload(sys)
sys.setdefaultencoding('utf-8')

class node(object):
    def __init__(self):
        self.next = {}
        self.fail = None
        self.isWord = False
        self.word = ""

class ac_automation(object):
    def __init__(self):
        self.root = node()

    # add a word into the trie
    def addword(self, word):
        temp_root = self.root
        for char in word:
            if char not in temp_root.next:
                temp_root.next[char] = node()
            temp_root = temp_root.next[char]
        temp_root.isWord = True
        temp_root.word = word

    # build fail pointers via BFS
    def make_fail(self):
        temp_que = []
        temp_que.append(self.root)
        while len(temp_que) != 0:
            temp = temp_que.pop(0)
            p = None
            for key, value in temp.next.items():
                if temp == self.root:
                    temp.next[key].fail = self.root
                else:
                    p = temp.fail
                    while p is not None:
                        if key in p.next:
                            # the fail node is the state sharing the longest proper suffix
                            temp.next[key].fail = p.next[key]
                            break
                        p = p.fail
                    if p is None:
                        temp.next[key].fail = self.root
                temp_que.append(temp.next[key])

    # search content for key words, returning all matches
    def search(self, content):
        p = self.root
        result = []
        currentposition = 0

        while currentposition < len(content):
            word = content[currentposition]
            # follow fail pointers until a transition exists or we reach the root
            while word not in p.next and p != self.root:
                p = p.fail

            if word in p.next:
                p = p.next[word]
            else:
                p = self.root

            if p.isWord:
                result.append(p.word)
                p = self.root
            currentposition += 1
        return result

    # load the key word dictionary and build the fail pointers
    def parse(self, path):
        with open(path, 'r') as f:
            for keyword in f.readlines():
                self.addword(keyword.strip())
        # search() relies on fail pointers, so build them right after loading
        self.make_fail()

    # check whether the text hits any key word
    def hit_key_words(self, text):
        """
        :param text: text to be checked
        :return: True if any key word occurs in text
        """
        result = list(set(self.search(text)))
        return len(result) > 0

if __name__ == '__main__':
    ah = ac_automation()
    path='./kw.txt'
    ah.parse(path)
    print(ah.hit_key_words(".help"))

    A few key points:

  •   Tags such as footer and input are ignored and skipped; they contribute little to hub identification;
  •   For p tags, visual enhancement is based on the number of displayed lines inside the tag: a three-line paragraph, for instance, yields three consecutive 1s at the corresponding vector positions;
  •   Video player windows and h1 tags are enhanced with ten consecutive 1s by default;
  •   Truncation randomly samples runs of consecutive 0s inside the vector a limited number of times and removes them, which diversifies the samples better than plain head or tail truncation
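
    Putting it together, here is a minimal end-to-end sketch (the file and URL names are illustrative; key_word.txt and the facility module must be present as in the listing above):

from page_to_vec import PageToVec

ptv = PageToVec()
with open("sample_page.html") as f:
    content = f.readlines()            # parse() expects the page as a list of lines
vec = ptv.parse("http://news.example.com/index.html", content, "example.com")
print(len(vec))                        # 512 on success, 0 on failure
ptv.erase()                            # reset internal state before the next page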

 

3. Model Training

    The model is a fully connected neural network with a cross-entropy loss. A single hidden layer of 100 units is used; the output layer activation is Sigmoid and the other layers use ReLU. The default learning rate is 0.1 and the number of epochs is 100; learning-rate decay and L2 regularization can be switched on optionally. About 300,000 training samples were collected, of which 50,000 were randomly sampled for the initial training run, yielding a classification accuracy of about 0.95 with a positive-to-negative sampling ratio of 1:4 and a test set of 5,000 samples. Since the classes are imbalanced, this accuracy figure may not be entirely reliable.
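
    As a minimal sketch, a run matching the hyper-parameters described above (512-dimensional input, one hidden layer of 100 units, learning rate 0.1, 100 epochs) would look like this with the Forward_neural_netWork class listed below; note that main() in the listing uses a larger configuration (200 hidden units, 150 epochs).

train_data, test_data = load_data()
fnn = Forward_neural_netWork([512, 100, 1])    # input / hidden / output sizes
fnn.SGD(train_data, 100, 500, 0.1, test_data,
        monitor_test_accuracy=True)            # print test accuracy each epoch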

    The source code borrows the framework from the book Neural Networks and Deep Learning and extends and modifies it.

# -*- coding: utf-8 -*-
"""
Created on Thu Jan  4 09:23:40 2018
@author: [email protected]
"""

import os
import sys
import math
import copy
import random
import json
import numpy as np

def equal(a, b):
    return abs(a-b) < 0.000001

def less(a, b):
    return (a - b) < -0.01

class ReluCell(object):
    @staticmethod
    def activate(z):
        ''' z is a 1-dim ndarray, such as [-1.0, 1.0, 0.0], for element which
        is smaller than 0, reset to 0, others keep unchanged '''
        z[z < 0] = 0
        return z

    @staticmethod
    def prime(z):
        ''' Derivative of relu function, 0 when z <= 0, otherwise 1 '''
        zt = np.where(z <= 0, 0, z)
        zt[zt > 0] = 1
        return zt

class SigmodCell(object):
    @staticmethod
    def activate(z):
        return 1.0 / (1.0 + np.exp(-z))

    @staticmethod
    def sigmod(z):
        """The sigmoid function."""
        return 1.0 / (1.0 + np.exp(-z))

    @staticmethod
    def prime(z):
        """ Derivative of the sigmoid function. """
        return SigmodCell.sigmod(z) * (1.0 - SigmodCell.sigmod(z))


class CrossEntropyCost(object):
    @staticmethod
    def fn(a, y):
        """ Return the cost associated with an output ``a`` and desired
        output ``y``.  Note that np.nan_to_num is used to ensure
        numerical stability. In particular, if both ``a`` and
        ``y`` have a 1.0 in the same slot, then the expression
        (1-y)*np.log(1-a) returns NaN. The np.nan_to_num
        ensures that that is converted to the correct value (0.0). """
        return np.sum(-y*np.log(a+0.000001) - (1.0-y)*np.log(1.0-a+0.000001))

    @staticmethod
    def delta(z, a, y):
        """ Return the error delta from the output layer. Note that the
        parameter ``z`` is not used by the method.  It is included in the
        method's parameters in order to make the interface consistent with
        the delta method for other cost classes. """
        return (a-y)

    @staticmethod
    def derivation(a_s, y_s):
        """ Derivative of the cross entropy cost, where a_s, y_s are the
        activation values of the output layer and the labels of the
        training data respectively; when a*(1.0-a) is zero, (a-y) /
        (a*(1.0-a)) is taken as 1.0 following L'Hopital's rule. """
        o = []
        for a, y in np.nditer([a_s, y_s]):
            if equal(float(a), 1.0) or equal(float(a), 0.0):
                o.append(1.0)
            else:
                # parentheses matter: the denominator is a*(1.0-a)
                o.append((a-y) / (a*(1.0-a)))
        return np.array(o, dtype='float')


class QuadraticCost(object):
    @staticmethod
    def fn(a, y):
        ''' compute square of 2-norm of a-y '''
        return 0.5 * np.linalg.norm(a-y)**2

    @staticmethod
    def delta(z, a, y):
        """ Return the error delta from the output layer. """
        return (a-y) * SigmodCell.prime(z)

    @staticmethod
    def derivation(a_s, y_s):
        """ Derivative of square error cost, where a_s, y_s represent
        avtivation value of output layer and labels of training data respectivly """
        return (a_s - y_s)


class Forward_neural_netWork(object):
    def __init__(self, sizes, hide_cell=ReluCell, output_cell=SigmodCell, cost=CrossEntropyCost):
        # layer num of Network
        self.num_layers = len(sizes)

        # cell type of hide and output layer
        self.hide_cell = hide_cell
        self.output_cell = output_cell

        # unit num in every layer
        self.sizes = sizes

        # learning rate and decay setting
        # formulation: eta = eta * eta_decay ^ (current_epoch / eta_decay_step)
        # such as eta = 0.1 * 0.9 ^ (40 / 10) 
        self.eta = 0.1
        self.eta_decay = 0.95
        self.eta_decay_step = 30

        # L2 regularization
        self.l2_open = False
        self.l2_lamda = 0.1

        # mini batch size
        self.batch_size = 100

        # cost function
        self.cost = cost

        # init the weights and bias layer by layer
        self.init()

    def init(self):
        self.biases = []
        self.weights = []
        self.biases = [np.random.randn(y, 1) for y in self.sizes[1:]]
        for x,y in zip(self.sizes[:-1], self.sizes[1:]):
            ''' avoid variance too big to decay learning rate prematurely '''
            self.weights.append(np.random.randn(y,x) / np.sqrt(x))

    def feedforward(self, a):
        """ Return the output of the network when "a" is the input. """
        for b, w in zip(self.biases[:-1], self.weights[:-1]):
            a = (self.hide_cell).activate(np.dot(w, a) + b)
        # output layer
        z = np.dot(self.weights[-1], a) + self.biases[-1]
        return (self.output_cell).activate(z)

    def SGD(self, training_data, epochs, mini_batch_size, eta,
            test_data=None,
            monitor_test_cost=False,
            monitor_test_accuracy=False,
            monitor_train_cost=False,
            monitor_train_accuracy=False):
        """ Train the neural network using mini-batch stochastic
        gradient descent. The other non-optional parameters are
        self-explanatory. If "test_data" is provided then the
        network will be evaluated against the test data after each
        epoch, and partial progress printed out. This is useful for
        tracking progress, but slows things down substantially. """

        self.batch_size = mini_batch_size
        self.eta = eta

        n_test = 0
        if test_data: n_test = len(test_data)

        # cost and accuracy in test data
        test_cost, test_accuracy = [], []
        # cost and accuracy in train data
        train_cost, train_accuracy = [], []

        n = len(training_data)
        for j in range(epochs):
            np.random.shuffle(training_data)

            mini_batches = []
            # generate mini batches
            for k in range(0, n, self.batch_size):
                batch = training_data[k : k+self.batch_size]
                mini_batches.append(batch)

            for mini_batch in mini_batches:
                self.update_mini_batch(mini_batch, n)

            # adjust learning rate; integer division keeps the step-wise schedule
            self.eta = self.eta * self.eta_decay ** (j // self.eta_decay_step)

            # visualize
            if monitor_test_cost and test_data:
                cost = self.total_cost(test_data)
                print("Epoch {0} test cost: {1}".format(j, cost))
            if monitor_test_accuracy and test_data:
                acc = self.evaluate(test_data)
                print("Epoch {0} test accuracy: {1}".format(j, float(acc / n_test)))
            if monitor_train_cost:
                cost = self.total_cost(training_data, test=False)
                print("Epoch {0} train cost: {1}".format(j, cost))
            if monitor_train_accuracy:
                acc = self.evaluate(training_data, test=False)
                print("Epoch {0} train accuracy: {1}".format(j, float(acc / n)))

    def update_mini_batch(self, mini_batch, n):
        """ Update the network's weights and biases by applying
        gradient descent using backpropagation to a single mini batch.
        The "mini_batch" is a list of sample, and "n" is
        size of training data. """

        ''' matrix-based update mode: gradients come from backprop_v2 '''
        nabla_b, nabla_w = self.backprop_v2(mini_batch)

        if self.l2_open:
            ws = []
            bs = []
            for w, nw in zip(self.weights, nabla_w):
                e = w - (self.eta/self.batch_size)*nw - (self.l2_lamda*self.eta/n)*w
                ws.append(e)
            for b, nb in zip(self.biases, nabla_b):
                e = b - (self.eta/self.batch_size)*nb
                bs.append(e)
            self.weights = ws
            self.biases = bs
        else:
            self.weights = [w-(self.eta/self.batch_size)*nw for w, nw in zip(self.weights, nabla_w)]
            self.biases = [b-(self.eta/self.batch_size)*nb for b, nb in zip(self.biases, nabla_b)]

    def backprop_v2(self, mini_batch):
        """ matrix-based impletation of backprop().
        Return a tuple "(nabla_b, nabla_w)" representing the
        gradient for the cost function C. "nabla_b" and
        "nabla_w" are layer-by-layer lists of numpy arrays, similar
        to "self.biases" and "self.weights". """

        nabla_b = [np.zeros(b.shape) for b in self.biases]
        nabla_w = [np.zeros(w.shape) for w in self.weights]

        # x = [x1,x2,...,xn].transpose()
        a = np.array(mini_batch, dtype='float').transpose()
        x = a[:-1]
        # reshape(1, -1) also handles a final mini batch smaller than batch_size
        y = a[-1].reshape(1, -1)

        """ feedforward """
        # list to store all the activations, layer by layer
        at = x
        ats = [x]

        # list to store all the z vectors, layer by layer
        zs = []
        for b, w in zip(self.biases[:-1], self.weights[:-1]):
            # hide layer
            # Auto-broadcast in numpy
            z = np.dot(w,at) + b
            zs.append(z)
            at = (self.hide_cell).activate(z)
            ats.append(at)
        # output layer
        z = np.dot(self.weights[-1], ats[-1]) + self.biases[-1]
        zs.append(z)
        at = (self.output_cell).activate(z)
        ats.append(at)

        """ backward """
        dt = (self.cost).derivation(ats[-1], y) * (self.output_cell).prime(zs[-1])
        nabla_b[-1] = np.sum(dt, axis=1, keepdims=True)
        nabla_w[-1] = np.dot(dt, ats[-2].transpose())
        for l in range(2, self.num_layers):
            dt = np.dot(self.weights[-l+1].transpose(), dt) * (self.hide_cell).prime(zs[-l])
            nabla_b[-l] = np.sum(dt, axis=1, keepdims=True)
            nabla_w[-l] = np.dot(dt, ats[-l-1].transpose())
        return (nabla_b, nabla_w)

    def evaluate(self, data, test=True):
        """ Return the number of test inputs for which the neural
        network outputs the correct result. """
        res = []
        count = 0
        if test:
            for x,y in data:
                a = self.feedforward(np.array(x,dtype='float').reshape(len(x),1))
                res.append((a,y))
        else:
            for x in data:
                a = self.feedforward(np.array(x[:-1],dtype='float').reshape(len(x[:-1]),1))
                res.append((a, x[-1]))
        for e in res:
            distance = abs(float(e[0]) - e[1])
            if less(distance, 0.5):
                count += 1
        return count

    def total_cost(self, data, test=True):
        """ Return the total cost for the data set ``data``. The flag
        ``test`` should be set to False if the data set is the
        training data (the usual case), and to True if the data set is
        the validation or test data. See comments on the similar (but
        reversed) convention for the ``accuracy`` method, above. """
        cost = 0.0
        res = []
        if test:
            for x,y in data:
                a = self.feedforward(np.array(x,dtype='float').reshape(len(x),1))
                res.append((a, y))
        else:
            for x in data:
                a = self.feedforward(np.array(x[:-1],dtype='float').reshape(len(x[:-1]),1))
                res.append((a, x[-1]))
        for e in res:
            cost += self.cost.fn(e[0], e[1])
        return cost

    def save_model(self, fn):
        """ save parameters wrapped by json format, to file """
        model = {}
        model["sizes"] = self.sizes
        model["biases"] = [b.tolist() for b in self.biases]
        model["weights"] = [w.tolist() for w in self.weights]
        with open(fn, "w") as f:
            json.dump(model, f)

    def load_model(self, fn):
        """ load parameters from file """
        with open(fn, "r") as f:
            data = json.load(f)
            fnn = Forward_neural_netWork(data["sizes"])
            fnn.biases = [np.array(b) for b in data["biases"]]
            fnn.weights = [np.array(w) for w in data["weights"]]
            return fnn

def load_data(test=True, sampling_density=0.1):
    """ load train data and labels, and optionally split off test data;
    samples are lists of floats and the default sampling ratio is 0.1 """
    train_data = []
    with open("../data/pos.txt", "r") as f:
        lines = f.readlines()
        for line in lines:
            if not line.strip():
                continue
            lst = line.strip().split(',')
            if "" in lst:
                continue
            sample = map(int, lst)
            train_data.append(list(map(float, sample)))
    with open("../data/neg.txt", "r") as f:
        lines = f.readlines()
        for line in lines:
            if not line.strip():
                continue
            lst = line.strip().split(',')
            if "" in lst:
                continue
            sample = map(int, lst)
            train_data.append(list(map(float, sample)))
    # shuffle data
    random.shuffle(train_data)
    if not test:
        return (train_data, None)
    # sampling test data
    test_data = []
    s = set()
    while float(len(test_data)) < sampling_density * float(len(train_data)):
        i = np.random.randint(0, len(train_data))
        if i not in s:
            test_data.append((train_data[i][:-1], train_data[i][-1]))
            s.add(i)
    # remove test data from train data
    for e in test_data:
        item = copy.deepcopy(e[0])
        item.append(e[1])
        train_data.remove(item)
    return (train_data, test_data)

def main():
    train_data, test_data = load_data()
    fnn = Forward_neural_netWork([512, 200, 1])
    fnn.SGD(train_data, 150, 500, 0.5, test_data, False, True, False, False)

if __name__ == '__main__':
    main()
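
    After training, the parameters can be persisted and restored with the JSON helpers above; a minimal sketch, with an illustrative file name:

fnn.save_model("hub_model.json")
fnn = fnn.load_model("hub_model.json")    # load_model returns a fresh network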

 

4. Notes on Model Building and Training

   1. The model parameters are all floating-point numbers, so precision has to be handled explicitly in comparisons, as the less and equal functions do;

   2. The input of the log function must never be 0, which can be avoided by explicitly adding a very small constant;

   3. Division by zero in derivatives can be avoided via limit rules; for example, derivation in CrossEntropyCost applies L'Hopital's rule;

   4. More hidden layers generally mean a more complex model, and experiments show that the more complex the model, the smaller the initial learning rate should be;

   5. eta_decay_step should not be set too small; 20 or 30 usually works well, as the sketch below illustrates.
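
    A quick numeric check of point 5, assuming the update rule used in SGD() above (eta is multiplied by eta_decay ** (epoch // eta_decay_step) every epoch):

# compare the learning rate after 100 epochs for a small and a large decay step
for step in (5, 30):
    eta = 0.1
    for epoch in range(100):
        eta *= 0.95 ** (epoch // step)
    # step=5 collapses eta to ~7e-23, while step=30 leaves it around 2e-4
    print(step, eta)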
