一、引言
在web爬取領域,hub自動識別是一個比較重要的研究內容,通過自動識別的hub,可以增加爬蟲的種子源列表,進而提高數據的抓取量和覆蓋度。業界已有一些基於傳統機器學習方法的hub識別算法,但傳統識別算法需要有豐富的特徵工程經驗才能建立一個比較穩定而可靠的模型。此外,web頁面的結構千變萬化,提取穩定而可靠的特徵是相當困難的。
相反,基於深度學習的方法可以擺脫對特徵工程的束縛,能夠自動提取樣本中的特徵,外加其強大的學習和表示能力,在Hub識別方面一定會比傳統方法有效得多。
二、訓練樣本生成
web頁面是一種半結構化的數據,爲了生成訓練數據,關鍵的一步便是如何將半結構化的web頁面轉換爲一個向量。如果將一個web頁面的body體中的節點遞歸地組織爲一棵樹,它會有如下的結構。
如今的網頁源碼基本都是div+css的搭配模式。同時,如果你仔細觀察會發現,div的組織一般是從上到下或者從左到右,非常遵循我們人眼的視覺感知順序。下面要引入的樣本生成算法將按照這種視覺感知順序處理網頁中的每個節點,並最終將其轉換爲一個一維向量。
PageToVec算法
輸入: url, html, domain
輸出: one-dim vector
- 利用lxml等開源類庫構建節點樹.
- 採用深度優先遍歷,遞歸地對每個節點做處理: 針對不同節點,在向量尾部填充不同的標識.
- 向量映射,將向量映射爲僅包含-1,0,1三個數字的向量.
- 視覺增強,對p、h1、video等有視覺衝擊感的節點,在對應的向量位置上按照指定規則進行擴充.
- 向量正規化,對於超出長度的向量做採樣截斷,對於長度不足的向量採用首部或尾部填充.
下面是算法PageToVec中的page_to_vec.py文件。
# usr/bin/env python
# -*- coding: UTF-8 -*-
import time, datetime
from time import sleep
import sys
import os
import math
import urllib
import random
import logging
import copy
from lxml import etree
from facility import *
from ac_automation import *
reload(sys)
sys.setdefaultencoding('utf-8')
class PageToVec(object):
    """Convert a semi-structured HTML page into a one-dimensional vector.

    The nodes of the <body> tree are visited depth-first, in the visual
    reading order (top-to-bottom / left-to-right of div+css layouts).
    Each node type appends a string marker to self.vector; normalize()
    then maps the markers to numbers, applies visual expansion, and pads
    or shrinks the result to exactly self.vsize elements of {-1, 0, 1}.
    """
    def __init__(self):
        super(PageToVec, self).__init__()
        self.domain = ""
        self.host = ""
        self.url = ""
        # upper bound on href length: longer links are treated as ads
        # (the name says "min" but it is used as a maximum)
        self.min_url_len = 256
        self.vector = []
        # max vector len
        self.vsize = 512
        self.h_tags = ['h1', 'h2']
        # tags skipped entirely during traversal
        self.skip_tags = ["footer", "noscript", "nav", "input", "select", "option"]
        self.kw_dict = {}
        # Aho-Corasick automaton used to filter blacklist keywords
        self.ac_automation = ac_automation()
        self.ac_automation.parse("./key_word.txt")
        # minimum paragraph length (chars)
        self.min_para_len = 16
        # average characters per rendered line of a paragraph
        self.ave_char_num_per_line = 32
        # minimum heading length (chars)
        self.min_h_len = 5
        # number of paragraph markers seen by normalize()
        self.p_num = 0
        # number of anchor markers seen by normalize()
        self.a_num = 0
        # minimum anchor text length (chars)
        self.min_anchor_len = 5

    def erase(self):
        """Reset per-page state after a failed parse."""
        self.vector = []
        self.p_num = 0
        self.a_num = 0
        self.domain = ""
        self.host = ""

    def parse(self, url, content, domain):
        """Parse one page into its vector.

        url: page url (host extracted from it).
        content: list of html source lines; mutated in place by the
                 comment/style cleaners.
        domain: site domain used to classify anchors as in-domain.
        Returns the page vector, or [] on failure.
        """
        try:
            host = url.split('/')[2]
            self.url = url
            self.host = host
            self.domain = domain
            self.clearNotes(content)
            self.clearStyle(content)
            lst = []
            for line in content:
                # placeholder lines written by clearNotes/clearStyle
                if line == "*\n" or line == "#\n" or line == "~\n":
                    continue
                lst.append(line)
            if not lst:
                return []
            html = etree.HTML("".join(lst))
            if html is None:
                return []
            tree = etree.ElementTree(html)
            node_list = tree.xpath('/html/body')
            if node_list is not None and len(node_list) > 0:
                self.fill(node_list[0])
                self.normalize()
                return self.vector
            else:
                self.vector = []
                return self.vector
        except Exception as e:
            # NOTE(review): logger is expected to come from facility's
            # star import -- confirm it is defined there.
            logger.info("error: %s" % str(e))
            self.erase()
            return self.vector

    def is_valid_para(self, para):
        """A paragraph is valid when it has at least min_para_len chars."""
        if para is None or para == "":
            return False
        return len(para) >= self.min_para_len

    def fill(self, root):
        """Depth-first traversal appending one marker per visible node.

        Markers: "-1"/"0" for anchors, "h-<len>" for headings,
        "img"/"media" for visual nodes, "p-<len>" for paragraphs.
        """
        for node in root.getchildren():
            if self.skip(node):
                continue
            childs = node.getchildren()
            # a tag process
            if node.tag == "a":
                self.tag_a(node)
                continue
            # h tag process
            if node.tag in self.h_tags:
                h = self.tag_h(node)
                if h and len(h) >= self.min_h_len:
                    self.vector.append("h-" + str(len(h)))
                else:
                    self.vector.append("0")
                continue
            # img tag process; src may be missing, node.get returns None
            if node.tag == "img" and ".jpg" in (node.get("src") or ""):
                self.tag_img(node)
                continue
            if node.tag == "script":
                self.tag_script(node)
                continue
            if node.tag == "iframe":
                self.tag_iframe(node)
                continue
            if node.tag == "video":
                self.tag_video(node)
                continue
            if node.tag == "embed":
                self.tag_embed(node)
                continue
            if node.tag == "audio":
                self.tag_audio(node)
                continue
            # br tag: empty paragraph marker
            if node.tag == "br":
                self.vector.append("p-0")
                continue
            # paragraph, or any childless leaf node
            if node.tag == "p" or not childs:
                level = 0
                para = self.tag_p(node, level)
                if self.is_valid_para(para):
                    self.vector.append("p-" + str(len(para)))
                else:
                    self.vector.append("0")
                continue
            self.fill(node)

    def normalize(self):
        """Map markers to numbers, expand visually, fit to vsize.

        h: 10 ones, img: 5 ones, media: 10 ones,
        p: one 1 per estimated rendered line, a: -1, others: 0.
        """
        ln = len(self.vector)
        self.p_num = 0
        self.a_num = 0
        if ln <= 0:
            return
        # the last element never counts as content
        self.vector[ln - 1] = 0
        # phase one: map markers to numbers (walk backwards)
        for i in range(ln - 2, -1, -1):
            c = self.vector[i][0]
            if c == '0':
                self.vector[i] = 0
                continue
            if c == '-':
                # "-1" marker emitted for in-domain anchors
                self.a_num += 1
                self.vector[i] = -1
                continue
            if c == 'h':
                self.vector[i] = 'h'
                continue
            if c == 'i':
                self.vector[i] = 'i'
                continue
            if c == 'm':
                self.vector[i] = 'm'
                continue
            # remaining markers are paragraphs: "p-<char count>";
            # estimate how many rendered lines the paragraph occupies
            line_num = int(self.vector[i].split('-')[1]) // self.ave_char_num_per_line
            if c == 'p':
                self.p_num += 1
                self.vector[i] = line_num
            else:
                self.vector[i] = 0
        # phase two: visual expansion
        container = []
        for e in self.vector:
            if e == 'h':
                # heading: 10 consecutive ones
                container.extend([1] * 10)
            elif e == 'i':
                # image: 5 consecutive ones
                container.extend([1] * 5)
            elif e == 'm':
                # media window: 10 consecutive ones
                container.extend([1] * 10)
            elif e <= 0:
                container.append(e)
            else:
                # paragraph: one 1 per estimated line
                container.extend([1] * e)
        self.vector = container
        ln = len(self.vector)
        if ln > self.vsize:
            self.zoom()
            return
        if ln < self.vsize:
            self.vector.extend([0] * (self.vsize - ln))

    def zoom(self):
        """Shrink an oversized vector to vsize.

        First randomly subsample interior runs of zeros, then runs of
        -1, and finally hard-truncate whichever end has more zeros.
        """
        self.zoom_internal(5, 0)
        if len(self.vector) == self.vsize:
            return
        self.zoom_internal(5, -1)
        if len(self.vector) > self.vsize:
            self.truncate()

    def zoom_internal(self, loop_num=3, fig=0):
        """Shorten runs of the value `fig` until the surplus df is gone.

        Each pass scans the vector; a run of `fig` longer than 1 is
        replaced by a shorter run, the cut length chosen either exactly
        (if the run can absorb the whole surplus) or randomly.
        """
        loop = 0
        while loop < loop_num:
            n = len(self.vector)
            # surplus to remove this pass
            df = n - self.vsize
            i = j = 0
            v = []
            while True:
                # copy the non-fig stretch
                while i < n and self.vector[i] != fig:
                    i += 1
                v.extend(self.vector[j:i])
                if i >= n:
                    break
                j = i
                # measure the run of fig values
                while i < n and self.vector[i] == fig:
                    i += 1
                if i - j > 1:
                    ln = 0
                    if i - j - df >= 1:
                        # the run absorbs the whole surplus
                        ln = i - j - df
                        df = 0
                    else:
                        # keep a random short prefix (min of 3 draws
                        # biases toward aggressive shrinking)
                        r1 = int(random.uniform(1, i - j - 1))
                        r2 = int(random.uniform(1, i - j - 1))
                        r3 = int(random.uniform(1, i - j - 1))
                        ln = min(min(r1, r2), r3)
                        df = df - (i - j - ln)
                    j = i
                    v.extend([fig] * ln)
                    if df == 0:
                        # done: copy the untouched remainder
                        v.extend(self.vector[i:])
                        break
                i += 1
            self.vector = v
            if df == 0:
                break
            loop += 1

    def truncate(self):
        """Cut the surplus from the end that holds more zeros."""
        j = len(self.vector) - self.vsize
        # zero counts inside the head/tail windows of size j
        hzn = tzn = 0
        p = 0
        q = -1
        c = 0
        while c < j:
            hzn += int(self.vector[p] == 0)
            tzn += int(self.vector[q] == 0)
            c += 1
            p += 1
            q -= 1
        if hzn > tzn:
            self.vector = self.vector[j:]
        else:
            self.vector = self.vector[0: self.vsize]

    def skip(self, root):
        """True if the node should be ignored (nav, ad, footer, etc.)."""
        if root.tag in self.skip_tags:
            return True
        for name, value in root.items():
            if name != "id" and name != "class":
                continue
            # keep potential media containers even if they hit keywords
            if "video" in value or "embed" in value or "audio" in value:
                continue
            if self.ac_automation.hit_key_words(value):
                return True
        return False

    def hit_key_word(self, text):
        """True when text is short AND contains a blacklist keyword."""
        if not text or not text.strip():
            return False
        b1 = False
        b2 = False
        if len(text.replace("None", "").strip()) < self.min_para_len:
            b1 = True
        else:
            # long texts are real content, never filtered
            return False
        if self.ac_automation.hit_key_words(text.strip().encode("utf-8")):
            b2 = True
        return b1 and b2

    def tag_a(self, node):
        """Append "-1" for an in-domain/relative link, else "0"."""
        href = node.get("href")
        if not href:
            self.vector.append('0')
            return
        # very long hrefs are likely advertisements: no marker at all
        if len(href) > self.min_url_len:
            return
        # descend to the first descendant carrying anchor text
        t = node
        while True:
            if t.text and t.text.strip():
                break
            c = t.getchildren()
            if not c:
                break
            t = c[0]
        # anchor text hits the keyword blacklist
        if self.hit_key_word(t.text):
            self.vector.append('0')
            return
        # anchor text too short to be a real link title
        if t.text and len(t.text) < self.min_anchor_len:
            self.vector.append('0')
            return
        # relative links such as /a/b.html, ../a.html
        if not href.startswith("http") and "/" in href:
            self.vector.append('-1')
            return
        if self.domain and self.domain in href:
            self.vector.append('-1')
        else:
            self.vector.append('0')

    def tag_p(self, node, level):
        """Collect paragraph text of node and its subtree, recursively.

        level tracks recursion depth (images only count near the top).
        Side effect: anchors/headings/media in the subtree still emit
        their own markers.
        """
        text = ""
        if node.text:
            text += node.text
        if node.tail:
            text += node.tail
        # keep current node's text unless it hits the blacklist
        if text and not self.hit_key_word(text):
            text = text.strip()
        else:
            text = ""
        childs = node.getchildren()
        if childs and len(childs) > 0:
            level += 1
            for c in childs:
                if c.tag == 'a':
                    self.tag_a(c)
                    continue
                if c.tag in self.h_tags:
                    self.tag_h(c)
                    continue
                if c.tag == "br" and text and len(text) >= self.min_para_len:
                    self.vector.append("p-0")
                    continue
                if c.tag == "iframe":
                    self.tag_iframe(c)
                    continue
                # guard: src attribute may be absent
                if c.tag == "img" and level <= 2 and ".jpg" in (c.get("src") or ""):
                    self.tag_img(c)
                    continue
                text += self.tag_p(c, level)
        return text.replace("None", "")

    def tag_img(self, node):
        """Append an "img" marker for large in-paragraph images."""
        p = node.getparent()
        if not p:
            return
        if p.tag != "p" and p.tag != "span" and p.tag != "center":
            return
        h = node.get("height")
        w = node.get("width")
        # ignore small images (icons, bullets)
        if (h and int(h) < 64) or (w and int(w) < 64):
            return
        self.vector.append("img")

    def tag_h(self, node):
        """Collect heading text of node and its subtree, recursively."""
        text = ""
        if node.text:
            text += node.text
        if node.tail:
            text += node.tail
        # keep the text only if long enough and not blacklisted
        if text and not self.hit_key_word(text) and len(text.strip()) >= self.min_h_len:
            text = text.strip()
        else:
            text = ""
        for c in node.getchildren():
            if c.tag == 'a':
                self.tag_a(c)
                continue
            text += self.tag_h(c)
        return text.replace("None", "")

    def tag_script(self, node):
        """Detect media players hidden inside inline <script> code.

        Scores textual hints; >= 5 points appends a "media" marker.
        """
        src = node.get("src")
        # NOTE(review): external scripts that point at media files are
        # skipped without emitting a marker -- confirm this is intended.
        if src and src.find(".mp4") != -1:
            return
        if src and src.find(".mp3") != -1:
            return
        score = 0
        text = node.text
        if not text:
            return
        # token records which hints fired (kept for debugging)
        token = ""
        if text.find(".mp4") != -1:
            score += 4
            token += ".mp4_"
        if text.find(".mp3") != -1:
            score += 4
            token += ".mp3_"
        if text.find("<video") != -1:
            score += 3
            token += "<video_"
        if text.find("Player") != -1 or text.find("player") != -1:
            score += 2
            token += "Player_"
        if text.find("width") != -1 or text.find("height") != -1:
            score += 1
            token += "wh_"
        if text.find("Width") != -1 or text.find("Height") != -1:
            score += 1
            token += "wh_"
        if text.find("video") != -1 or text.find("Video") != -1:
            score += 1
            token += "video_"
        if score >= 5:
            self.vector.append("media")

    def tag_iframe(self, node):
        """Handle <iframe>: media-src checks, then scan its children."""
        keys = node.keys()
        if "src" in keys and "width" in keys and "height" in keys:
            src = node.get("src")
            # NOTE(review): iframes whose src looks like media return
            # WITHOUT a marker -- this looks inverted; confirm intent.
            if src.find(".mp4") != -1:
                return
            if src.find(".mp3") != -1:
                return
            if src.find("video") != -1 or src.find("Video") != -1:
                return
            if src.find("player") != -1 or src.find("Player") != -1:
                return
        for child in node.getchildren():
            if child.tag == "script":
                self.tag_script(child)
            elif child.tag == "video":
                self.tag_video(child)
            elif child.tag == "audio":
                self.tag_audio(child)

    def tag_video(self, node):
        """A <video> node always counts as media."""
        self.vector.append("media")

    def tag_embed(self, node):
        """An <embed> node always counts as media."""
        self.vector.append("media")

    def tag_audio(self, node):
        """An <audio> node always counts as media."""
        self.vector.append("media")

    def clearScripts(self, content):
        """Blank out <script>...</script> lines with "#\\n" placeholders."""
        n = len(content)
        i = 0
        while i < n:
            if not content[i].strip().startswith("<script"):
                i += 1
                continue
            while i < n and not content[i].strip().endswith("</script>"):
                content[i] = "#\n"
                i += 1
            if i < n:
                content[i] = "#\n"
                i += 1

    def clearStyle(self, content):
        """Remove <style>...</style> spans; spanned lines become "~\\n"."""
        n = len(content)
        i = 0
        while i < n:
            si = content[i].find("<style")
            if si == -1:
                i += 1
                continue
            ei = -1
            # enjambment: the style block spans multiple lines
            enjambment = False
            while i < n:
                ei = content[i].find("</style>")
                if ei == -1:
                    enjambment = True
                    content[i] = "~\n"
                    i += 1
                    continue
                if not enjambment:
                    # single-line style: cut the exact substring
                    splitor = content[i][si:ei + 8]
                    content[i] = "".join(content[i].split(splitor))
                else:
                    # keep what follows the closing tag
                    content[i] = content[i][ei + 8:]
                break

    def clearNotes(self, content):
        """Remove <!-- ... --> comments; spanned lines become "*\\n"."""
        n = len(content)
        i = 0
        while i < n:
            si = content[i].find("<!--")
            if si == -1:
                i += 1
                continue
            ei = -1
            # enjambment: the comment spans multiple lines
            enjambment = False
            while i < n:
                ei = content[i].find("-->")
                if ei == -1:
                    enjambment = True
                    content[i] = "*\n"
                    i += 1
                    continue
                if not enjambment:
                    splitor = content[i][si:ei + 3]
                    content[i] = "".join(content[i].split(splitor))
                else:
                    content[i] = content[i][ei + 3:]
                break
if __name__ == '__main__':
    # Smoke-construct the converter (loads ./key_word.txt).
    page_to_vec = PageToVec()
下面是AC自動機的實現文件ac_automation.py,用於加速多模式串的匹配。
# usr/bin/env python
# -*- coding: UTF-8 -*-
import time, datetime
from time import sleep
import sys
import os
import random
import logging
from facility import *
reload(sys)
sys.setdefaultencoding('utf-8')
class node(object):
    """Trie node: child map, failure link, and terminal-word payload."""
    def __init__(self):
        self.next = {}
        self.fail = None
        self.isWord = False
        self.word = ""


class ac_automation(object):
    """Aho-Corasick automaton for multi-pattern keyword matching."""
    def __init__(self):
        self.root = node()

    def addword(self, word):
        """Insert one keyword into the trie."""
        cur = self.root
        for char in word:
            if char not in cur.next:
                cur.next[char] = node()
            cur = cur.next[char]
        cur.isWord = True
        cur.word = word

    def make_fail(self):
        """Build failure links breadth-first.

        Fixes: the original called the non-existent dict.item() and set
        the child's fail link to p.fail instead of p.next[key].
        """
        queue = [self.root]
        while queue:
            cur = queue.pop(0)
            for key, child in cur.next.items():
                if cur is self.root:
                    child.fail = self.root
                else:
                    p = cur.fail
                    while p is not None:
                        if key in p.next:
                            # longest proper suffix that is also a prefix
                            child.fail = p.next[key]
                            break
                        p = p.fail
                    if p is None:
                        child.fail = self.root
                queue.append(child)

    def search(self, content):
        """Scan content and return the list of keywords found.

        After a match the cursor resets to root (overlapping matches
        inside an already-matched word are intentionally not reported).
        Fix: the original condition `word in p.next == False` was a
        chained comparison that never followed failure links, so
        shifted matches (e.g. "ab" in "aab") were missed.
        """
        p = self.root
        result = []
        for ch in content:
            # follow failure links until ch can be consumed
            while ch not in p.next and p is not self.root:
                # a missing fail link (make_fail not called) acts as root
                p = p.fail if p.fail is not None else self.root
            if ch in p.next:
                p = p.next[ch]
            else:
                p = self.root
            if p.isWord:
                result.append(p.word)
                p = self.root
        return result

    def parse(self, path):
        """Load keywords (one per line) and build the failure links."""
        with open(path, 'r') as f:
            for keyword in f.readlines():
                self.addword(keyword.strip())
        self.make_fail()

    def hit_key_words(self, text):
        """Return True when text contains at least one keyword."""
        return len(self.search(text)) > 0
if __name__ == '__main__':
    # Quick manual check against a local keyword file.
    ah = ac_automation()
    path = './kw.txt'
    ah.parse(path)
    print(ah.hit_key_words(".help"))
一些關鍵點說明如下
- 對於如footer,input等標籤,我們選擇忽略並跳過處理,這些標籤對hub識別沒有太大作用;
- 對於p標籤,採用p標籤內部有多少行進行視覺增強,比如有三行,則對應向量的位置上會有連續的三個1出現;
- 對於視頻播放窗口和h1標籤,默認用連續10個1來增強視覺體驗;
- 截斷向量採用的對向量內部的0值進行隨機連續採樣有限次,並將這些值刪除,相比首尾截斷更有利於樣本的多樣化
三、模型訓練
模型採用了全連接神經網絡。損失爲交叉熵損失;使用單個隱層100個隱單元來訓練網絡,輸出層激活採用Sigmoid,其它層採用ReLU;學習率默認是0.1,epoch是100, 可以選擇性地使用學習率衰減和L2正則化。採集了大約30萬訓練數據,僅隨機採樣了5萬條訓練數據進行初期訓練,得到了大約0.95的分類精度,其中採樣的正負樣本比例是1:4,測試數據規模是5000。由於這裏面存在樣本不平衡問題,分類精度可能不是特別可靠。
源碼是借鑑了 Neural Networks and Deep Learning 一書中的代碼框架,並在此基礎上進行了擴充和修改。
# -*- coding: utf-8 -*-
"""
Created on Thu Jan 4 09:23:40 2018
@author: [email protected]
"""
import os
import sys
import math
import copy
import random
import json
import numpy as np
def equal(a, b):
    """Return True when a and b differ by less than 1e-6 (float tolerance)."""
    diff = a - b
    return -0.000001 < diff < 0.000001
def less(a, b):
    """Return True when a is smaller than b by more than 0.01."""
    return b - a > 0.01
class ReluCell(object):
    """Rectified-linear activation used in hidden layers."""

    @staticmethod
    def activate(z):
        """Element-wise max(z, 0).

        Fix: the original zeroed z in place, silently mutating the
        caller's array (backprop had already stored that array in zs);
        a new array is now returned and the input is left untouched.
        The relu derivative is unaffected since relu preserves signs.
        """
        return np.maximum(z, 0)

    @staticmethod
    def prime(z):
        """Derivative of relu: 1.0 where z > 0, else 0.0."""
        return np.where(z > 0, 1.0, 0.0)
class SigmodCell(object):
    """Logistic sigmoid activation used in the output layer."""

    @staticmethod
    def activate(z):
        """Element-wise sigmoid of z."""
        return 1.0 / (np.exp(-z) + 1.0)

    @staticmethod
    def sigmod(z):
        """The sigmoid function (alias of activate, kept for callers)."""
        return 1.0 / (np.exp(-z) + 1.0)

    @staticmethod
    def prime(z):
        """Derivative of the sigmoid: s(z) * (1 - s(z))."""
        s = SigmodCell.sigmod(z)
        return s * (1.0 - s)
class CrossEntropyCost(object):
    """Cross-entropy cost and its derivatives."""

    @staticmethod
    def fn(a, y):
        """Total cross-entropy between output ``a`` and target ``y``.

        A small epsilon keeps log() away from zero so that a == y == 1.0
        (or 0.0) stays numerically stable.
        """
        return np.sum(-y * np.log(a + 0.000001) - (1.0 - y) * np.log(1.0 - a + 0.000001))

    @staticmethod
    def delta(z, a, y):
        """Output-layer error; ``z`` is unused, kept for interface parity
        with the other cost classes."""
        return (a - y)

    @staticmethod
    def derivation(a_s, y_s):
        """Element-wise dC/da = (a - y) / (a * (1 - a)).

        When a is (numerically) 0 or 1 the quotient is replaced by the
        sentinel 1.0, the original code's convention for the degenerate
        limit. Fix: the original computed ``(a - y) / a * (1.0 - a)``,
        which by precedence divides by ``a`` only and then multiplies
        by ``(1 - a)`` -- the denominator must be ``a * (1 - a)``.
        """
        out = []
        eps = 0.000001
        for a, y in np.nditer([a_s, y_s]):
            av = float(a)
            if abs(av - 1.0) < eps or abs(av) < eps:
                out.append(1.0)
            else:
                out.append((av - float(y)) / (av * (1.0 - av)))
        return np.array(out, dtype='float')
class QuadraticCost(object):
    """Squared-error cost and its derivatives."""

    @staticmethod
    def fn(a, y):
        """0.5 * squared 2-norm of (a - y)."""
        return 0.5 * np.linalg.norm(a - y) ** 2

    @staticmethod
    def delta(z, a, y):
        """Output-layer error for a sigmoid output.

        Fix: the original called the undefined name ``sigmod_prime(z)``
        (a NameError at runtime); the sigmoid derivative is
        ``SigmodCell.prime``.
        """
        return (a - y) * SigmodCell.prime(z)

    @staticmethod
    def derivation(a_s, y_s):
        """Element-wise dC/da = (a - y)."""
        return (a_s - y_s)
class Forward_neural_netWork(object):
    """Fully-connected feed-forward network trained with mini-batch SGD.

    Hidden layers use ``hide_cell`` (ReLU by default), the output layer
    uses ``output_cell`` (sigmoid by default), and gradients come from
    the ``cost`` class (cross-entropy by default).
    """

    def __init__(self, sizes, hide_cell=ReluCell, output_cell=SigmodCell, cost=CrossEntropyCost):
        # number of layers in the network
        self.num_layers = len(sizes)
        # activation types for hidden and output layers
        self.hide_cell = hide_cell
        self.output_cell = output_cell
        # unit count per layer
        self.sizes = sizes
        # learning-rate schedule:
        # eta = eta * eta_decay ** (current_epoch // eta_decay_step)
        # e.g. eta = 0.1 * 0.9 ** (40 // 10)
        self.eta = 0.1
        self.eta_decay = 0.95
        self.eta_decay_step = 30
        # L2 regularization switch and strength
        self.l2_open = False
        self.l2_lamda = 0.1
        # mini batch size
        self.batch_size = 100
        # cost function class
        self.cost = cost
        # initialize weights and biases layer by layer
        self.init()

    def init(self):
        """Biases ~ N(0,1); weights ~ N(0,1)/sqrt(fan_in).

        Scaling by 1/sqrt(fan_in) avoids variances so big that the
        learning rate effectively decays prematurely.
        """
        self.biases = [np.random.randn(y, 1) for y in self.sizes[1:]]
        self.weights = []
        for x, y in zip(self.sizes[:-1], self.sizes[1:]):
            self.weights.append(np.random.randn(y, x) / np.sqrt(x))

    def feedforward(self, a):
        """Return the network output for input column vector ``a``."""
        for b, w in zip(self.biases[:-1], self.weights[:-1]):
            a = (self.hide_cell).activate(np.dot(w, a) + b)
        # output layer uses its own activation
        z = np.dot(self.weights[-1], a) + self.biases[-1]
        return (self.output_cell).activate(z)

    def SGD(self, training_data, epochs, mini_batch_size, eta,
            test_data=None,
            monitor_test_cost=False,
            monitor_test_accuracy=False,
            monitor_train_cost=False,
            monitor_train_accuracy=False):
        """Train with mini-batch stochastic gradient descent.

        If ``test_data`` is given, the network can be evaluated after
        each epoch (useful for tracking progress, but slow).
        """
        self.batch_size = mini_batch_size
        self.eta = eta
        n_test = 0
        if test_data:
            n_test = len(test_data)
        n = len(training_data)
        for j in range(epochs):
            np.random.shuffle(training_data)
            # split the shuffled data into mini batches
            mini_batches = []
            for k in range(0, n, self.batch_size):
                mini_batches.append(training_data[k: k + self.batch_size])
            for mini_batch in mini_batches:
                self.update_mini_batch(mini_batch, n)
            # step decay of the learning rate (integer steps per epoch
            # group; ``//`` keeps Python 2 floor semantics explicit)
            self.eta = self.eta * self.eta_decay ** (j // self.eta_decay_step)
            # optional per-epoch monitoring
            if monitor_test_cost and test_data:
                cost = self.total_cost(test_data)
                print("Epoch {0} test cost: {1}".format(j, cost))
            if monitor_test_accuracy and test_data:
                acc = self.evaluate(test_data)
                # fix: float(acc / n_test) floors to 0 under Python 2;
                # convert before dividing
                print("Epoch {0} test accuracy: {1}".format(j, float(acc) / n_test))
            if monitor_train_cost:
                cost = self.total_cost(training_data, test=False)
                print("Epoch {0} train cost: {1}".format(j, cost))
            if monitor_train_accuracy:
                acc = self.evaluate(training_data, test=False)
                print("Epoch {0} train accuracy: {1}".format(j, float(acc) / n))

    def update_mini_batch(self, mini_batch, n):
        """Apply one gradient step from ``mini_batch``.

        ``n`` is the size of the whole training set (used only by the
        L2 weight-decay term).
        """
        nabla_b, nabla_w = self.backprop_v2(mini_batch)
        if self.l2_open:
            # gradient step plus L2 weight decay (lamda*eta/n) * w
            self.weights = [w - (self.eta / self.batch_size) * nw - (self.l2_lamda * self.eta / n) * w
                            for w, nw in zip(self.weights, nabla_w)]
            self.biases = [b - (self.eta / self.batch_size) * nb
                           for b, nb in zip(self.biases, nabla_b)]
        else:
            self.weights = [w - (self.eta / self.batch_size) * nw
                            for w, nw in zip(self.weights, nabla_w)]
            self.biases = [b - (self.eta / self.batch_size) * nb
                           for b, nb in zip(self.biases, nabla_b)]

    def backprop_v2(self, mini_batch):
        """Matrix-based backpropagation over a whole mini batch.

        Returns ``(nabla_b, nabla_w)``, layer-by-layer gradient lists
        shaped like ``self.biases`` and ``self.weights``.
        """
        nabla_b = [np.zeros(b.shape) for b in self.biases]
        nabla_w = [np.zeros(w.shape) for w in self.weights]
        # samples become columns; the last row of ``a`` is the label
        a = np.array(mini_batch, dtype='float').transpose()
        x = a[:-1]
        # fix: reshape(1, -1) so a smaller final batch does not crash
        # (the original used self.batch_size unconditionally)
        y = a[-1].reshape(1, -1)
        # feedforward: keep activations and pre-activations per layer
        at = x
        ats = [x]
        zs = []
        for b, w in zip(self.biases[:-1], self.weights[:-1]):
            # numpy broadcasting adds b to every column
            z = np.dot(w, at) + b
            zs.append(z)
            at = (self.hide_cell).activate(z)
            ats.append(at)
        # output layer
        z = np.dot(self.weights[-1], ats[-1]) + self.biases[-1]
        zs.append(z)
        ats.append((self.output_cell).activate(z))
        # backward pass: output-layer delta, then walk back
        dt = (self.cost).derivation(ats[-1], y) * (self.output_cell).prime(zs[-1])
        nabla_b[-1] = np.sum(dt, axis=1, keepdims=True)
        nabla_w[-1] = np.dot(dt, ats[-2].transpose())
        for l in range(2, self.num_layers):
            dt = np.dot(self.weights[-l + 1].transpose(), dt) * (self.hide_cell).prime(zs[-l])
            nabla_b[-l] = np.sum(dt, axis=1, keepdims=True)
            nabla_w[-l] = np.dot(dt, ats[-l - 1].transpose())
        return (nabla_b, nabla_w)

    def evaluate(self, data, test=True):
        """Count inputs whose output is within 0.5 of the label.

        ``test=True``: data is a list of (features, label) pairs;
        ``test=False``: data rows carry the label as the last element.
        """
        res = []
        count = 0
        if test:
            for x, y in data:
                a = self.feedforward(np.array(x, dtype='float').reshape(len(x), 1))
                res.append((a, y))
        else:
            for x in data:
                a = self.feedforward(np.array(x[:-1], dtype='float').reshape(len(x[:-1]), 1))
                res.append((a, x[-1]))
        for e in res:
            distance = abs(float(e[0]) - e[1])
            if less(distance, 0.5):
                count += 1
        return count

    def total_cost(self, data, test=True):
        """Total cost over ``data``; ``test`` selects the data layout
        (see ``evaluate``)."""
        cost = 0.0
        res = []
        if test:
            for x, y in data:
                a = self.feedforward(np.array(x, dtype='float').reshape(len(x), 1))
                res.append((a, y))
        else:
            for x in data:
                a = self.feedforward(np.array(x[:-1], dtype='float').reshape(len(x[:-1]), 1))
                res.append((a, x[-1]))
        for e in res:
            cost += self.cost.fn(e[0], e[1])
        return cost

    def save_model(self, fn):
        """Save sizes/biases/weights as JSON to file ``fn``."""
        model = {}
        model["sizes"] = self.sizes
        model["biases"] = [b.tolist() for b in self.biases]
        model["weights"] = [w.tolist() for w in self.weights]
        with open(fn, "w") as f:
            json.dump(model, f)

    def load_model(self, fn):
        """Load a saved model from ``fn`` and return a new network."""
        with open(fn, "r") as f:
            data = json.load(f)
        fnn = Forward_neural_netWork(data["sizes"])
        fnn.biases = [np.array(b) for b in data["biases"]]
        fnn.weights = [np.array(w) for w in data["weights"]]
        return fnn
def load_data(test=True, sampling_density=0.1):
    """Load positive and negative samples and optionally split a test set.

    Each line of ../data/pos.txt and ../data/neg.txt is a comma-separated
    integer vector whose last element is the label. Returns
    (train_data, test_data); test_data is None when ``test`` is False.
    ``sampling_density`` is the fraction of rows moved into the test set.
    """
    train_data = []
    for path in ("../data/pos.txt", "../data/neg.txt"):
        with open(path, "r") as f:
            for raw in f.readlines():
                stripped = raw.strip()
                if not stripped:
                    continue
                tokens = stripped.split(',')
                # skip malformed rows with empty fields
                if "" in tokens:
                    continue
                train_data.append([float(int(tok)) for tok in tokens])
    # shuffle before any sampling
    random.shuffle(train_data)
    if not test:
        return (train_data, None)
    # draw distinct random rows for the test set
    test_data = []
    chosen = set()
    while float(len(test_data)) < sampling_density * float(len(train_data)):
        idx = np.random.randint(0, len(train_data))
        if idx not in chosen:
            test_data.append((train_data[idx][:-1], train_data[idx][-1]))
            chosen.add(idx)
    # remove the sampled rows from the training set
    for features, label in test_data:
        row = copy.deepcopy(features)
        row.append(label)
        train_data.remove(row)
    return (train_data, test_data)
def main():
    """Train a 512-200-1 network on the loaded page vectors."""
    train_data, test_data = load_data()
    network = Forward_neural_netWork([512, 200, 1])
    network.SGD(train_data, 150, 500, 0.5, test_data, False, True, False, False)
if __name__ == '__main__':
    main()
四、模型搭建和訓練經驗
1、模型的參數均是浮點數,所以在計算時需要考慮精度問題,如less和equal函數;
2、log函數的輸入不能爲0,這可以通過顯式添加一個很小的實數來規避該問題;
3、分母爲0的求導問題可以通過極限法則來規避,如CrossEntropyCost中的derivation用到了洛必達法則;
4、隱層數目越多,模型一般越複雜,實驗過程中發現越是複雜的模型學習率初始應該越小;
5、eta_decay_step的值設置不宜太小,一般設爲20或30較好;