# Handling an Imbalanced Dataset with a Neural Network

Original source: https://github.com/ajesipow/NN_credit_fraud/

## Balancing the classes and splitting the dataset (tools/utils.py)

```python
import numpy as np
import csv
import random


def normalise_data(data):
    # Standardise each sample (row) to zero mean and unit variance.
    data = np.asarray(data, dtype=float)
    data_norm = (data - np.vstack(np.mean(data, axis=1))) / np.vstack(np.std(data, axis=1))
    return data_norm


def get_all_data(filename, dims):
    """
    Extract data from csv file
    :param filename: path and filename of csv file
    :param dims: number of features to extract
    :return: features, labels in file
    """

    with open(filename, 'rt') as csv_file:
        f = csv.reader(csv_file, delimiter=',', quoting=csv.QUOTE_NONNUMERIC)
        csv_file.readline()  # skip the header row
        features = []
        labels = []
        for row in f:
            features.append(row[1:dims+1])
            labels.append(int(row[-1]))
        features = normalise_data(features)
    return features, labels


def get_class_entries(filename, data_class):
    """
    Extracts positive or negative classes from file
    :param filename: filename for creditfraud file
    :param data_class: must be 0 or 1
    :return: rows belonging to the specified class
    """
    # TODO: change function to work with batches so we don't have all data in memory.
    if data_class not in (0, 1):
        raise ValueError('data_class must be 0 or 1.')
    with open(filename, 'rt') as csv_file:
        open_file = csv.reader(csv_file, delimiter=',', quoting=csv.QUOTE_NONNUMERIC)
        csv_file.readline()  # skip the header row
        extracted_class = []
        for row in open_file:
            if int(row[-1]) == data_class:
                extracted_class.append(row)
    return extracted_class


def get_train_test_data(positive_class, negative_class, balanced=True, ratio=0.8, dims=5):
    """
    Generates training and test data from the positive and negative classes, either balanced or with the
    original class ratio.
    :param positive_class: rows belonging to the positive (minority) class
    :param negative_class: rows belonging to the negative (majority) class
    :param balanced: if True, the classes are balanced by undersampling the majority class;
                    for highly imbalanced data sets this drastically reduces the amount of usable training data
    :param ratio: fraction of the data used for training (the rest becomes test data)
    :param dims: number of PCA dimensions of the data to use
    :return: training and test data as np arrays
    """
    num_elements_pos = len(positive_class)
    num_elements_neg = len(negative_class)

    # If balanced, undersample the majority (negative) class down to the size of the minority class;
    # otherwise keep the original class ratio.
    if not balanced:
        balanced = num_elements_neg / num_elements_pos

    num_samples_pos = num_elements_pos
    num_samples_neg = int(balanced * num_elements_pos)

    rand_pos = random.sample(range(num_elements_pos), num_samples_pos)
    rand_neg = random.sample(range(num_elements_neg), num_samples_neg)

    # select the random elements from our classes and concatenate them to one list and shuffle
    data = [positive_class[i] for i in rand_pos] + [negative_class[i] for i in rand_neg]
    random.shuffle(data)

    train_data = data[0:int(ratio * len(data))]
    test_data = data[int(ratio * len(data)):]

    # Labels are class indices (0/1) and must be integers for tf.one_hot later on.
    train_labels = np.asarray(train_data)[:, -1].astype(int)
    test_labels = np.asarray(test_data)[:, -1].astype(int)

    train_data = np.asarray(train_data)[:, 1:dims+1].astype(float)
    test_data = np.asarray(test_data)[:, 1:dims+1].astype(float)

    train_data = normalise_data(train_data)
    test_data = normalise_data(test_data)

    return train_data, train_labels, test_data, test_labels
```
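
A minimal usage sketch of the helpers above (in the repo they live in `tools/utils.py` and are imported by the main script below; the file name matches the one used there):

```python
# Sketch only: assumes the Kaggle creditcard.csv file referenced in the main script below.
FILENAME = 'creditcard.csv'
PCA_DIMS = 28

fraud_rows = get_class_entries(FILENAME, 1)     # minority class (fraud)
genuine_rows = get_class_entries(FILENAME, 0)   # majority class (genuine)

# balanced=True undersamples the majority class down to the size of the minority class
train_x, train_y, test_x, test_y = get_train_test_data(
    fraud_rows, genuine_rows, balanced=True, ratio=0.8, dims=PCA_DIMS)

print(train_x.shape, test_x.shape)
```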

## NN (tools/nn.py)

```python
import tensorflow as tf


def weight_variable(shape):
    """
    Creates a new weight variable and initialises it
    :param shape: shape of weight
    :return: initialised weight variable
    """
    initial = tf.truncated_normal(shape, stddev=0.1, dtype=tf.float32)
    return tf.Variable(initial)


def bias_variable(shape):
    """
    Creates a new bias variable and initialises it
    :param shape: shape of bias
    :return: initialised bias variable
    """
    initial = tf.constant(0.1, shape=shape, dtype=tf.float32)
    return tf.Variable(initial)


def calc_num_weights(in_size, out_size, num_hidden_layers, num_neurons_hiddenlayer):
    """
    Calculates the total number of trainable parameters (weights and biases) of the network.
    """
    number_weights = (in_size + 1) * num_neurons_hiddenlayer +\
                     num_hidden_layers * (num_neurons_hiddenlayer + 1) * num_neurons_hiddenlayer +\
                     (num_neurons_hiddenlayer + 1) * out_size

    return number_weights


def mlp(input_variable, output_size, num_hidden_layers, num_neurons_hiddenlayer):
    """
    Multi-layer perceptron, vanilla feed forward network
    :param input_variable: input data vector
    :param output_size: dimension of prediction
    :param num_hidden_layers: number of hidden-to-hidden layers (on top of the first input-to-hidden layer)
    :param num_neurons_hiddenlayer: neurons per hidden layer
    :return: prediction vector
    """

    weights = []
    biases = []
    layer = []
    input_size = input_variable.get_shape()[1].value

    number_of_parameters = calc_num_weights(input_size, output_size, num_hidden_layers, num_neurons_hiddenlayer)
    print("Created Neural Network with {} parameters.".format(number_of_parameters))

    with tf.name_scope('input_layer'):
        with tf.name_scope('weights'):
            weights.append(weight_variable([input_size, num_neurons_hiddenlayer]))
            variable_summaries(weights[0])
        with tf.name_scope('biases'):
            biases.append(bias_variable([num_neurons_hiddenlayer]))
            variable_summaries(biases[0])
        with tf.name_scope('Wx_plus_b'):
            layer.append(tf.nn.relu(tf.matmul(input_variable, weights[0]) + biases[0]))
        tf.summary.histogram('activation', layer[0])

    for l in range(1, num_hidden_layers + 1):
        with tf.name_scope('hidden_layer_{}'.format(l)):
            with tf.name_scope('weights'):
                weights.append(weight_variable([num_neurons_hiddenlayer, num_neurons_hiddenlayer]))
                variable_summaries(weights[l])
            with tf.name_scope('biases'):
                biases.append(bias_variable([num_neurons_hiddenlayer]))
                variable_summaries(biases[l])
            with tf.name_scope('Wx_plus_b'):
                layer.append(tf.nn.relu(tf.matmul(layer[l-1], weights[l]) + biases[l]))
            tf.summary.histogram('activation', layer[l])

    with tf.name_scope('output_layer'):
        with tf.name_scope('weights'):
            weights.append(weight_variable([num_neurons_hiddenlayer, output_size]))
            variable_summaries(weights[num_hidden_layers + 1])
        with tf.name_scope('biases'):
            biases.append(bias_variable([output_size]))
            variable_summaries(biases[num_hidden_layers + 1])
        with tf.name_scope('Wx_plus_b'):
            pred = tf.matmul(layer[num_hidden_layers], weights[num_hidden_layers + 1]) + biases[num_hidden_layers + 1]

    return pred


def variable_summaries(var):
    """
    Attaching summaries to a variable.
    """
    with tf.name_scope('summaries'):
        mean = tf.reduce_mean(var)
        tf.summary.scalar('mean', mean)
        with tf.name_scope('stddev'):
            stddev = tf.sqrt(tf.reduce_mean(tf.square(var - mean)))
        tf.summary.scalar('stddev', stddev)
        tf.summary.scalar('max', tf.reduce_max(var))
        tf.summary.scalar('min', tf.reduce_min(var))
        tf.summary.histogram('histogram', var)
```
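
As a quick sanity check of `calc_num_weights`, here is the parameter count for the hyperparameters used in the main script below (28 input features, 2 hidden-to-hidden layers of 10 neurons each, 2 output classes):

```python
# (28+1)*10 + 2*(10+1)*10 + (10+1)*2 = 290 + 220 + 22
print(calc_num_weights(28, 2, 2, 10))  # -> 532
```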

## main

```python
#!/usr/bin/env python

import tensorflow as tf
from sklearn import metrics

import tools.utils as tut
import tools.nn as tnn


SUMMARY_DIR = './logs'
FILENAME = 'creditcard.csv'

PCA_DIMS = 28
NEURONS_HL = 10
NUM_HL = 2
LEARNING_RATE = 0.001
INPUT_SIZE = PCA_DIMS
OUTPUT_SIZE = 2
TRAIN_TIME = 10000

data_positive = tut.get_class_entries(FILENAME, 1)
data_negative = tut.get_class_entries(FILENAME, 0)

features, labels = tut.get_all_data(FILENAME, PCA_DIMS)

training_data, training_labels, testing_data, testing_labels = tut.get_train_test_data(
    data_positive, data_negative, dims=PCA_DIMS)

# transform our label vectors to one hot vectors
training_labels = tf.one_hot(training_labels, OUTPUT_SIZE)
testing_labels = tf.one_hot(testing_labels, OUTPUT_SIZE)
labels = tf.one_hot(labels, OUTPUT_SIZE)

x = tf.placeholder(dtype=tf.float32, shape=[None, INPUT_SIZE])
y = tf.placeholder(dtype=tf.int32, shape=[None, OUTPUT_SIZE])

# output of our neural network
prediction = tnn.mlp(x, OUTPUT_SIZE, NUM_HL, NEURONS_HL)

with tf.name_scope('xentropy'):
    pre_loss = tf.nn.softmax_cross_entropy_with_logits(labels=y, logits=prediction)
    with tf.name_scope('total'):
        loss = tf.reduce_mean(pre_loss)
tf.summary.scalar('xentropy', loss)

with tf.name_scope('train'):
    train_step = tf.train.AdamOptimizer(LEARNING_RATE).minimize(loss)

with tf.name_scope('accuracy'):
    with tf.name_scope('correct_prediction'):
        correct_prediction = tf.equal(tf.argmax(prediction, 1), tf.argmax(y, 1))
    with tf.name_scope('accuracy'):
        accuracy = tf.reduce_mean(tf.cast(correct_prediction, "float"))
    tf.summary.scalar('accuracy', accuracy)

with tf.Session() as session:
    merged = tf.summary.merge_all()
    train_writer = tf.summary.FileWriter(SUMMARY_DIR + '/train', session.graph)
    session.run(tf.global_variables_initializer())

    # evaluate the one-hot label tensors to numpy arrays so they can be fed to the placeholders
    testing_labels, training_labels, labels = session.run([testing_labels, training_labels, labels])

    for i in range(TRAIN_TIME):
        if i % 500 == 0:
            summary, acc = session.run([merged, accuracy], feed_dict={x: testing_data, y: testing_labels})
            train_writer.add_summary(summary, i)
            print("step %d, accuracy: %f" % (i, acc))
        session.run([train_step], feed_dict={x: training_data, y: training_labels})

    # Let's test our model
    true_label = tf.argmax(y, 1)
    pred_label = tf.argmax(prediction, 1)

    tl, pl = session.run([true_label, pred_label], feed_dict={x: features, y: labels})

    report = metrics.classification_report(tl, pl, digits=4)
    auc_val = metrics.roc_auc_score(tl, pl)

    print("AUC Score: {}".format(auc_val))
    print("Report: {}".format(report))
```
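
Training summaries are written to `./logs`; assuming TensorFlow 1.x with TensorBoard installed, they can be inspected with `tensorboard --logdir ./logs`.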