過採樣劃分數據集
import numpy as np
import csv
import random
def normalise_data(data):
    """
    Standardise every row of a 2-D data set to zero mean and unit standard deviation
    :param data: 2-D array-like of numeric values, one sample per row
    :return: np array with the same shape as data, each row standardised on its own
    """
    data = np.asarray(data, dtype=float)
    # keepdims keeps the statistics as column vectors so they broadcast along each row
    row_mean = np.mean(data, axis=1, keepdims=True)
    row_std = np.std(data, axis=1, keepdims=True)
    # a constant row has zero spread; divide by 1 instead so it maps to zeros, not NaN
    row_std[row_std == 0] = 1.0
    # parentheses matter: centre first, then scale
    data_norm = (data - row_mean) / row_std
    return data_norm
def get_all_data(filename, dims):
    """
    Read the whole csv file and return its normalised features together with the labels
    :param filename: path and filename of csv file
    :param dims: number of feature columns to take, starting at column 1
    :return: (features, labels) - features passed through normalise_data, labels as list of ints
    """
    feature_rows, label_column = [], []
    with open(filename, 'rt') as handle:
        reader = csv.reader(handle, delimiter=',', quoting=csv.QUOTE_NONNUMERIC)
        # drop the header line before the reader starts consuming rows
        handle.readline()
        for record in reader:
            # column 0 is skipped; the last column holds the class label
            feature_rows.append(record[1:dims+1])
            label_column.append(int(record[-1]))
    return normalise_data(feature_rows), label_column
def get_class_entries(filename, data_class):
    """
    Collects all rows of the csv file whose last column equals the requested class
    :param filename: path of the csv file (first line is treated as header and skipped)
    :param data_class: class to extract, must be 0 or 1
    :return: list of complete rows belonging to the requested class
    :raises ValueError: if data_class is neither 0 nor 1
    """
    # TODO: change function to work with batches so we don't have all data in memory.
    if data_class not in (0, 1):
        raise ValueError('Feature class must be 0 or 1.')
    with open(filename, 'rt') as handle:
        reader = csv.reader(handle, delimiter=',', quoting=csv.QUOTE_NONNUMERIC)
        # skip the header line
        handle.readline()
        matching_rows = [record for record in reader if int(record[-1]) == data_class]
    return matching_rows
def get_train_test_data(positive_class, negative_class, balanced=True, ratio=0.8, dims=5):
    """
    Generates training and test data from positive and negative classes, either balanced or unbalanced.
    All positive rows are always used; the number of negative rows depends on `balanced`.
    :param positive_class: list of rows of the positive class (label in the last column)
    :param negative_class: list of rows of the negative class (label in the last column)
    :param balanced: True - draw as many negative rows as there are positive rows;
                     False - use every negative row.
                     For highly unbalanced data sets this drastically affects size of training data we can use.
                     The negative sample size is capped at the number of negative rows available.
    :param ratio: fraction of the shuffled rows that goes into the training set
    :param dims: number of feature columns to use, starting at column 1
    :return: train_data, train_labels, test_data, test_labels as np arrays
    """
    num_elements_pos = len(positive_class)
    num_elements_neg = len(negative_class)
    num_samples_pos = num_elements_pos
    if balanced:
        # `balanced` acts as a multiplier (True == 1); random.sample needs an int
        # that does not exceed the population size
        num_samples_neg = min(int(balanced * num_elements_pos), num_elements_neg)
    else:
        # unbalanced: keep the full negative class
        num_samples_neg = num_elements_neg
    rand_pos = random.sample(range(num_elements_pos), num_samples_pos)
    rand_neg = random.sample(range(num_elements_neg), num_samples_neg)
    # select the random elements from our classes and concatenate them to one list and shuffle
    data = [positive_class[i] for i in rand_pos] + [negative_class[i] for i in rand_neg]
    random.shuffle(data)
    split_at = int(ratio * len(data))
    train_rows = np.asarray(data[:split_at])
    test_rows = np.asarray(data[split_at:])
    # builtin float replaces the np.float alias, which no longer exists in current NumPy
    train_labels = train_rows[:, -1].astype(float)
    test_labels = test_rows[:, -1].astype(float)
    # column 0 is skipped, the following `dims` columns are the features
    train_data = train_rows[:, 1:dims+1].astype(float)
    test_data = test_rows[:, 1:dims+1].astype(float)
    train_data = normalise_data(train_data)
    test_data = normalise_data(test_data)
    return train_data, train_labels, test_data, test_labels
```
## NN
```python
import tensorflow as tf
def weight_variable(shape):
    """
    Builds a weight variable drawn from a truncated normal distribution (stddev 0.1, float32)
    :param shape: shape of weight
    :return: tf.Variable holding the initial weights
    """
    return tf.Variable(tf.truncated_normal(shape, stddev=0.1, dtype=tf.float32))
def bias_variable(shape):
    """
    Builds a bias variable with every entry set to the constant 0.1 (float32)
    :param shape: shape of bias
    :return: tf.Variable holding the initial biases
    """
    return tf.Variable(tf.constant(0.1, shape=shape, dtype=tf.float32))
def calc_num_weights(in_size, out_size, num_hidden_layers, num_neurons_hiddenlayer):
    """
    Counts the trainable parameters (weights and biases) of the network built by mlp
    :param in_size: dimension of the input vector
    :param out_size: dimension of the prediction
    :param num_hidden_layers: number of hidden layers following the input layer
    :param num_neurons_hiddenlayer: neurons per hidden layer
    :return: total number of weights and biases
    """
    # input layer: [in_size, neurons] weights plus one bias per neuron
    input_params = (in_size + 1) * num_neurons_hiddenlayer
    # each hidden layer: [neurons, neurons] weights plus one bias per neuron
    hidden_params = num_hidden_layers * (num_neurons_hiddenlayer + 1) * num_neurons_hiddenlayer
    # output layer: [neurons, out_size] weights plus one bias per output unit
    output_params = (num_neurons_hiddenlayer + 1) * out_size
    number_weights = input_params + hidden_params + output_params
    return number_weights
def mlp(input_variable, output_size, num_hidden_layers, num_neurons_hiddenlayer):
    """
    Multi-layer perceptron, vanilla feed forward network.
    Builds an input layer, num_hidden_layers hidden layers (all ReLU) and a linear output layer,
    attaching summaries to every weight, bias and ReLU activation on the way.
    :param input_variable: input data tensor, its second dimension is taken as the input size
    :param output_size: dimension of prediction
    :param num_hidden_layers: number of hidden layers
    :param num_neurons_hiddenlayer: neurons per hidden layer
    :return: prediction tensor (output layer has no activation applied)
    """
    input_size = input_variable.get_shape()[1].value
    parameter_count = calc_num_weights(input_size, output_size, num_hidden_layers, num_neurons_hiddenlayer)
    print("Created Neural Network with {} parameters.".format(parameter_count))

    with tf.name_scope('input_layer'):
        with tf.name_scope('weights'):
            w_in = weight_variable([input_size, num_neurons_hiddenlayer])
            variable_summaries(w_in)
        with tf.name_scope('biases'):
            b_in = bias_variable([num_neurons_hiddenlayer])
            variable_summaries(b_in)
        with tf.name_scope('Wx_plus_b'):
            activation = tf.nn.relu(tf.matmul(input_variable, w_in) + b_in)
            tf.summary.histogram('activation', activation)

    # hidden layers are numbered from 1; each one consumes the previous activation
    for index in range(1, num_hidden_layers + 1):
        with tf.name_scope('hidden_layer_{}'.format(index)):
            with tf.name_scope('weights'):
                w_hidden = weight_variable([num_neurons_hiddenlayer, num_neurons_hiddenlayer])
                variable_summaries(w_hidden)
            with tf.name_scope('biases'):
                b_hidden = bias_variable([num_neurons_hiddenlayer])
                variable_summaries(b_hidden)
            with tf.name_scope('Wx_plus_b'):
                activation = tf.nn.relu(tf.matmul(activation, w_hidden) + b_hidden)
                tf.summary.histogram('activation', activation)

    with tf.name_scope('output_layer'):
        with tf.name_scope('weights'):
            w_out = weight_variable([num_neurons_hiddenlayer, output_size])
            variable_summaries(w_out)
        with tf.name_scope('biases'):
            b_out = bias_variable([output_size])
            variable_summaries(b_out)
        with tf.name_scope('Wx_plus_b'):
            # no ReLU here: the raw affine output is returned
            pred = tf.matmul(activation, w_out) + b_out
    return pred
def variable_summaries(var):
    """
    Attaches mean, stddev, max and min scalar summaries plus a histogram summary to a tensor.
    :param var: tensor to summarise
    """
    with tf.name_scope('summaries'):
        average = tf.reduce_mean(var)
        tf.summary.scalar('mean', average)
        with tf.name_scope('stddev'):
            # root of the mean squared deviation from the mean
            squared_deviation = tf.square(var - average)
            spread = tf.sqrt(tf.reduce_mean(squared_deviation))
        tf.summary.scalar('stddev', spread)
        largest = tf.reduce_max(var)
        tf.summary.scalar('max', largest)
        smallest = tf.reduce_min(var)
        tf.summary.scalar('min', smallest)
        tf.summary.histogram('histogram', var)
```
## main
```python
#!/usr/bin/env python
import tensorflow as tf
from sklearn import metrics
import tools.utils as tut
import tools.nn as tnn
# Training script: builds the MLP from tools.nn, trains it on a train/test split
# produced by tools.utils and reports metrics on the complete data set.

SUMMARY_DIR = './logs'       # root directory for the summary event files
FILENAME = 'creditcard.csv'  # csv file holding the data set
PCA_DIMS = 28                # number of feature columns read from the file
NEURONS_HL = 10              # neurons per hidden layer
NUM_HL = 2                   # number of hidden layers
LEARNING_RATE = 0.001        # learning rate handed to the Adam optimizer
INPUT_SIZE = PCA_DIMS        # network input dimension
OUTPUT_SIZE = 2              # one output unit per class (one hot depth)
TRAIN_TIME = 10000           # number of training iterations

data_positive = tut.get_class_entries(FILENAME, 1)
data_negative = tut.get_class_entries(FILENAME, 0)
features, labels = tut.get_all_data(FILENAME, PCA_DIMS)
training_data, training_labels, testing_data, testing_labels = tut.get_train_test_data(
    data_positive, data_negative, dims=PCA_DIMS)

# transform our label vectors to one hot vectors
training_labels = tf.one_hot(training_labels, OUTPUT_SIZE)
testing_labels = tf.one_hot(testing_labels, OUTPUT_SIZE)
labels = tf.one_hot(labels, OUTPUT_SIZE)

x = tf.placeholder(dtype=tf.float32, shape=[None, INPUT_SIZE])
y = tf.placeholder(dtype=tf.int32, shape=[None, OUTPUT_SIZE])

# output of our neural network
prediction = tnn.mlp(x, OUTPUT_SIZE, NUM_HL, NEURONS_HL)

with tf.name_scope('xentropy'):
    pre_loss = tf.nn.softmax_cross_entropy_with_logits(labels=y, logits=prediction)
    with tf.name_scope('total'):
        loss = tf.reduce_mean(pre_loss)
tf.summary.scalar('xentropy', loss)

with tf.name_scope('train'):
    train_step = tf.train.AdamOptimizer(LEARNING_RATE).minimize(loss)

with tf.name_scope('accuracy'):
    with tf.name_scope('correct_prediction'):
        correct_prediction = tf.equal(tf.argmax(prediction, 1), tf.argmax(y, 1))
    with tf.name_scope('accuracy'):
        accuracy = tf.reduce_mean(tf.cast(correct_prediction, "float"))
tf.summary.scalar('accuracy', accuracy)

with tf.Session() as session:
    merged = tf.summary.merge_all()
    train_writer = tf.summary.FileWriter(SUMMARY_DIR + '/train', session.graph)
    try:
        session.run(tf.global_variables_initializer())

        # convert one hot training labels and test labels from tensor objects to lists
        testing_labels, training_labels, labels = session.run([testing_labels, training_labels, labels])

        for i in range(TRAIN_TIME):
            if i % 500 == 0:
                # every 500 steps: record summaries and accuracy on the test split
                summary, acc = session.run([merged, accuracy], feed_dict={x: testing_data, y: testing_labels})
                train_writer.add_summary(summary, i)
                print("step %d, accuracy: %f" % (i, acc))
            # one optimisation step on the complete training split
            session.run([train_step], feed_dict={x: training_data, y: training_labels})
    finally:
        # close the writer so buffered summary events reach disk even if training fails
        train_writer.close()

    # Let's test our model
    true_label = tf.argmax(y, 1)
    pred_label = tf.argmax(prediction, 1)
    # evaluation runs on the complete data set returned by get_all_data
    tl, pl = session.run([true_label, pred_label], feed_dict={x: features, y: labels})
    report = metrics.classification_report(tl, pl, digits=4)
    # NOTE(review): AUC is computed from hard class predictions, not from scores - confirm this is intended
    auc_val = metrics.roc_auc_score(tl, pl)
    print("AUC Score: {}".format(auc_val))
    print("Report: {}".format(report))