以決策樹爲基函數的提升方法稱爲提升樹(boosting tree)。提升樹採用前向分步算法,根據每輪迭代的殘差值,學習得到一個迴歸樹,用加法模型得到提升樹的結果。迴歸問題採用平方誤差損失函數,分類問題用指數損失函數。
指數損失通過最小化指數損失來逐步學習多個輸出爲1和−1的二值基函數的線性組合。對離羣點、噪聲非常敏感,常用在AdaBoost算法中。指數損失詳解
HIGGS 數據集包含有 11 million 個樣本,具有 28 個特徵,用於分類問題,來區分產生希格斯玻色子的信號過程和不產生希格斯玻色子的後臺過程。
下載數據
解壓下載的數據,用pandas read_csv().as_matrix(),然後用numpy轉成壓縮文件備用。
# Written against tf.__version__ == 1.9.0 (uses tf.gfile / tf.logging / tf.contrib).
# Base URL of the UCI repository hosting the HIGGS dataset.
URL_ROOT = "https://archive.ics.uci.edu/ml/machine-learning-databases/00280"
# Gzipped CSV as distributed by UCI; each row is label + 28 float features.
INPUT_FILE = "HIGGS.csv.gz"
NPZ_FILE = "HIGGS.csv.gz.npz" # numpy compressed file to contain "data" array.
def _download_higgs_data_and_save_npz(data_dir):
  """Downloads the HIGGS csv and stores it as a numpy compressed file.

  Args:
    data_dir: A string, directory into which the processed .npz is written.

  Raises:
    ValueError: If data_dir already contains the processed data file.
  """
  input_url = os.path.join(URL_ROOT, INPUT_FILE)
  np_filename = os.path.join(data_dir, NPZ_FILE)
  if tf.gfile.Exists(np_filename):
    raise ValueError("data_dir already has the processed data file: {}".format(
        np_filename))
  if not tf.gfile.Exists(data_dir):
    tf.gfile.MkDir(data_dir)
  # 2.8 GB to download.
  # Initialize so the finally clause is safe even when urlretrieve itself
  # fails (otherwise the NameError would mask the original download error).
  temp_filename = None
  try:
    tf.logging.info("Data downloading...")
    temp_filename, _ = urllib.request.urlretrieve(input_url)
    # Reading and parsing 11 million csv lines takes 2~3 minutes.
    tf.logging.info("Data processing... taking multiple minutes...")
    with gzip.open(temp_filename, "rb") as csv_file:
      data = pd.read_csv(
          csv_file,
          dtype=np.float32,
          names=["c%02d" % i for i in range(29)]  # label + 28 features.
      ).values  # .as_matrix() is deprecated/removed in modern pandas.
  finally:
    if temp_filename:
      tf.gfile.Remove(temp_filename)
  # Writing to temporary location then copy to the data_dir (0.8 GB).
  with tempfile.NamedTemporaryFile() as f:
    np.savez_compressed(f, data=data)
    # Flush buffered bytes before gfile.Copy reads the file by name;
    # the with-block also guarantees the temp file is cleaned up.
    f.flush()
    tf.gfile.Copy(f.name, np_filename)
  tf.logging.info("Data saved to: {}".format(np_filename))
讀取數據:
# NOTE(review): duplicates the constant used by the download step — keep the
# two definitions in sync so both resolve to the same cached file.
NPZ_FILE = "HIGGS.csv.gz.npz" # numpy compressed file containing "data" array
def read_higgs_data(data_dir, train_start, train_count, eval_start, eval_count):
  """Reads higgs data from the cached .npz and returns train and eval slices.

  Args:
    data_dir: A string, the directory of higgs dataset.
    train_start: An integer, the start index of train examples within the data.
    train_count: An integer, the number of train examples within the data.
    eval_start: An integer, the start index of eval examples within the data.
    eval_count: An integer, the number of eval examples within the data.

  Returns:
    Numpy array of train data and eval data.

  Raises:
    RuntimeError: If the processed data file is not found in data_dir.
  """
  npz_filename = os.path.join(data_dir, NPZ_FILE)
  # Use tf.logging like the rest of this module (was an inconsistent print).
  tf.logging.info("read data from %s...", npz_filename)
  try:
    # gfile allows numpy to read data from network data sources as well.
    with tf.gfile.Open(npz_filename, "rb") as npz_file:
      with np.load(npz_file) as npz:
        data = npz["data"]
  except tf.errors.NotFoundError as e:
    raise RuntimeError(
        "Error loading data; use data_download.py to prepare the data.\n{}: {}"
        .format(type(e).__name__, e))
  return (data[train_start:train_start+train_count],
          data[eval_start:eval_start+eval_count])
數據預處理
用numpy arrays格式的數據製作train_input_fn:
def make_inputs_from_np_arrays(features_np, label_np):
  """Makes and returns input_fn and feature_columns from numpy arrays.

  The generated input_fn returns a tf.data.Dataset of a feature dictionary and
  a label, and the returned feature columns are
  tf.feature_column.BucketizedColumn built from per-feature percentile
  boundaries.

  Note, for in-memory training, tf.data.Dataset should contain the whole data
  as a single tensor. Don't use batch.

  Args:
    features_np: A numpy ndarray (shape=[batch_size, num_features]) for
      float32 features.
    label_np: A numpy ndarray (shape=[batch_size, 1]) for labels.

  Returns:
    input_fn: A function returning a Dataset of feature dict and label.
    feature_names: A list of feature names.
    feature_column: A list of tf.feature_column.BucketizedColumn.
  """
  num_features = features_np.shape[1]
  # One single-column array per feature, split off the original matrix.
  feature_arrays = np.split(features_np, num_features, axis=1)
  # 1-based feature names.
  feature_names = ["feature_%02d" % (idx + 1) for idx in range(num_features)]

  def get_bucket_boundaries(feature):
    """Returns bucket boundaries for feature by percentiles."""
    return np.unique(np.percentile(feature, range(0, 100))).tolist()

  # For every feature build a numeric source column, then discretize it into
  # a one-hot bucketized column using that feature's percentile boundaries.
  bucketized_columns = []
  for name, column_values in zip(feature_names, feature_arrays):
    source_column = tf.feature_column.numeric_column(
        name, dtype=tf.float32,
        # Although higgs data have no missing values, in general, default
        # could be set as 0 or some reasonable value for missing values.
        default_value=0.0)
    bucketized_columns.append(
        tf.feature_column.bucketized_column(
            source_column,
            boundaries=get_bucket_boundaries(column_values)))

  # Make an input_fn that extracts source features.
  def input_fn():
    """Returns a Dataset zipping the feature-dict tensor with the label."""
    feature_dict = dict(
        zip(feature_names,
            (tf.constant(arr) for arr in feature_arrays)))
    return tf.data.Dataset.zip((tf.data.Dataset.from_tensors(feature_dict),
                                tf.data.Dataset.from_tensors(label_np),))

  return input_fn, feature_names, bucketized_columns
類似的,製作測試集的 input_fn:
def make_eval_inputs_from_np_arrays(features_np, label_np):
  """Makes an eval input_fn that streams the data as batches of 1000."""
  n_cols = features_np.shape[1]
  column_arrays = np.split(features_np, n_cols, axis=1)
  # 1-based feature names.
  feature_names = ["feature_%02d" % (j + 1) for j in range(n_cols)]

  def input_fn():
    feature_dict = {
        name: tf.constant(column_arrays[j])
        for j, name in enumerate(feature_names)
    }
    features_ds = tf.data.Dataset.from_tensor_slices(feature_dict)
    labels_ds = tf.data.Dataset.from_tensor_slices(label_np)
    return tf.data.Dataset.zip((features_ds, labels_ds,)).batch(1000)

  return input_fn
訓練過程:
默認情況下,1100萬個樣本中前100萬個會被用於訓練,最後100萬個會用於評估。可以通過標誌 --train_start, --train_count, --eval_start, --eval_count 等選擇訓練/評估數據作爲索引範圍。
def train_boosted_trees(flags_obj):
  """Train boosted_trees estimator on HIGGS data.

  Args:
    flags_obj: An object containing parsed flag values.
  """
  # Clean up model_dir first so every run trains from scratch.
  if tf.gfile.Exists(flags_obj.model_dir):
    tf.gfile.DeleteRecursively(flags_obj.model_dir)
  # Load data, splitting train/eval by the flag-specified index ranges.
  tf.logging.info("## Data loading...")
  train_data, eval_data = read_higgs_data(
      flags_obj.data_dir, flags_obj.train_start, flags_obj.train_count,
      flags_obj.eval_start, flags_obj.eval_count)
  tf.logging.info("## Data loaded; train: {}{}, eval: {}{}".format(
      train_data.dtype, train_data.shape, eval_data.dtype, eval_data.shape))
  # Build the train input_fn (a Dataset of feature dict + label), the
  # bucketized feature_columns, and the feature names.
  # Data consists of one label column followed by 28 feature columns.
  train_input_fn, feature_names, feature_columns = make_inputs_from_np_arrays(
      features_np=train_data[:, 1:], label_np=train_data[:, 0:1])
  # Build the eval input_fn (streams the eval slice in batches).
  eval_input_fn = make_eval_inputs_from_np_arrays(
      features_np=eval_data[:, 1:], label_np=eval_data[:, 0:1])
  tf.logging.info("## Features prepared. Training starts...")
  # Create benchmark logger to log info about the training and metric values
  run_params = {
      "model_name": "boosted_trees",
      "dataset_name": "higgs",
      "train_start": flags_obj.train_start,
      "train_count": flags_obj.train_count,
      "eval_start": flags_obj.eval_start,
      "eval_count": flags_obj.eval_count,
      "n_trees": flags_obj.n_trees,
      "max_depth": flags_obj.max_depth,
  }
  tf.logging.info("run params:\n %s", run_params)
  # Though BoostedTreesClassifier is under tf.estimator, faster in-memory
  # training is yet provided as a contrib library.
  # train_input_fn provides the training data with labels; feature_columns
  # are the bucketized columns; n_trees is the number of boosted trees and
  # max_depth is the maximum depth of each tree.
  classifier = tf.contrib.estimator.boosted_trees_classifier_train_in_memory(
      train_input_fn,
      feature_columns,
      model_dir=flags_obj.model_dir or None,
      n_trees=flags_obj.n_trees,
      max_depth=flags_obj.max_depth,
      learning_rate=flags_obj.learning_rate)
  # Evaluate on the held-out eval slice.
  eval_results = classifier.evaluate(eval_input_fn)
  # Benchmark the evaluation results
  # benchmark_logger.log_evaluation_result(eval_results)
  tf.logging.info("Benchmark metric: %s", eval_results)
模型評估
當使用默認參數訓練時,最終的精度將在74%左右,並且在eval集上的損失大約爲0.516。
由於指定了n_trees=100,max_depth=6,所以global_step=600就自動結束了。
日誌輸出:
Benchmark metric: {'accuracy': 0.738377, 'accuracy_baseline': 0.529618, 'auc': 0.8194437, 'auc_precision_recall': 0.83378166, 'average_loss': 0.5168768, 'label/mean': 0.529618, 'loss': 0.51687634, 'precision': 0.74909055, 'prediction/mean': 0.52907485, 'recall': 0.76087105, 'global_step': 600}
導出模型:
# NOTE(review): this snippet is excerpted from the end of train_boosted_trees —
# it relies on flags_obj, classifier, feature_names and
# _make_csv_serving_input_receiver_fn being in scope there; it is not a
# standalone top-level statement.
if flags_obj.export_dir is not None:
  # Export a SavedModel whose serving signature accepts csv-formatted rows.
  classifier.export_savedmodel(
      flags_obj.export_dir,
      _make_csv_serving_input_receiver_fn(
          column_names=feature_names,
          # columns are all floats.
          column_defaults=[[0.0]] * len(feature_names)))
代碼來自:https://github.com/tensorflow/models/tree/r1.9.0/official/boosted_trees
使用模型進行預測,需要用到 saved_model_cli 。