TensorFlow 2: creating and reading CSV files

import matplotlib as mpl  # plotting library
import matplotlib.pyplot as plt
# the next line enables inline plotting in the notebook
%matplotlib inline
import numpy as np
import sklearn  # machine learning library
import pandas as pd  # data-handling library
import os
import sys
import time
import tensorflow as tf

from tensorflow import keras  # use the Keras bundled with TensorFlow
# import keras  # to use standalone Keras instead

print(tf.__version__)
print(sys.version_info)
for module in mpl, np, sklearn, pd, tf, keras:
    print(module.__name__, module.__version__)
# load the California housing dataset bundled with sklearn
from sklearn.datasets import fetch_california_housing

housing = fetch_california_housing()
# print(housing.DESCR)  # dataset description
print(housing.data.shape)    # the features, i.e. x
print(housing.target.shape)  # the labels, i.e. y



(20640, 8)
(20640,)
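For a quick first look at the data, the features and target can be wrapped in a DataFrame. A minimal sketch using the pandas import above; the DataFrame is only for inspection and is not used in the later pipeline:

df = pd.DataFrame(housing.data, columns=housing.feature_names)  # 8 feature columns
df["HouseValue"] = housing.target                               # append the target as an extra column
print(df.head())                                                # first five rows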
# use sklearn's helper for splitting data into training and test sets
from sklearn.model_selection import train_test_split

# train_test_split defaults to test_size=0.25, i.e. a 3:1 split; pass test_size to change the ratio
# the total row count multiplied by test_size gives the size of the test (or validation) set
# first split the full dataset into train_all and test
x_train_all, x_test, y_train_all, y_test = train_test_split(housing.data, housing.target, random_state=7)
# then split train_all into training and validation sets
x_train, x_valid, y_train, y_valid = train_test_split(x_train_all, y_train_all, random_state=11)

print(x_train_all.shape,y_train_all.shape)
print(x_test.shape, y_test.shape)
print(x_train.shape, y_train.shape)
print(x_valid.shape, y_valid.shape)



(15480, 8) (15480,)
(5160, 8) (5160,)
(11610, 8) (11610,)
(3870, 8) (3870,)
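If a different split ratio is needed, test_size can be passed explicitly. A minimal sketch; the 0.2 below is only an illustrative value:

x_tr, x_te, y_tr, y_te = train_test_split(
    housing.data, housing.target, test_size=0.2, random_state=7)  # 20% of the rows go to the test set
print(x_tr.shape, x_te.shape)  # (16512, 8) (4128, 8)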
# standardize the training data
# x = (x - u) / std, where u is the mean and std is the standard deviation
from sklearn.preprocessing import StandardScaler  # StandardScaler performs the standardization

scaler = StandardScaler()  # create a scaler object
x_train_scaler = scaler.fit_transform(x_train)  # x_train is already 2-D, so no astype/reshape is needed
x_valid_scaler = scaler.transform(x_valid)
x_test_scaler  = scaler.transform(x_test)
print(housing.feature_names)



['MedInc', 'HouseAge', 'AveRooms', 'AveBedrms', 'Population', 'AveOccup', 'Latitude', 'Longitude']
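The formula x = (x - u) / std can be checked by hand against StandardScaler. A minimal sketch using the fitted scaler's mean_ and scale_ attributes:

manual = (x_train - scaler.mean_) / scaler.scale_  # scale_ is the per-feature standard deviation
print(np.allclose(manual, x_train_scaler))         # True: matches fit_transform's result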
import shutil

output_dir = "generate_csv"
if os.path.exists(output_dir):
    shutil.rmtree(output_dir)  # remove the directory if it already exists so every run starts fresh
os.mkdir(output_dir)

def save_to_csv(output_dir, data, name_prefix, header=None, n_parts=10):
    path_format = os.path.join(output_dir, "{}_{:02d}.csv")  # template for the generated file names
    filenames = []

    # enumerate() pairs each element of an iterable with its index, which is convenient in for loops
    # file_idx is the index of each of the n_parts files; row_indices are the row indices assigned to that file
    for file_idx, row_indices in enumerate(np.array_split(np.arange(len(data)), n_parts)):
        part_csv = path_format.format(name_prefix, file_idx)  # fill the template with the prefix and file index
        filenames.append(part_csv)  # collect the generated file names
        with open(part_csv, "wt", encoding="utf-8") as f:  # open the file and write its contents
            if header is not None:
                f.write(header + "\n")
            for row_index in row_indices:  # iterate over the row indices assigned to this file
                f.write(",".join([repr(col) for col in data[row_index]]))  # write every column of the row, comma-separated
                f.write('\n')
    return filenames

# save_to_csv takes a single data argument, so merge (x, y) into one array
train_data = np.c_[x_train_scaler, y_train]  # np.c_ concatenates arrays column-wise
valid_data = np.c_[x_valid_scaler, y_valid]
test_data  = np.c_[x_test_scaler , y_test ]

# build the header, i.e. the name of each column
header_cols = housing.feature_names + ["HouseValue"]
print(header_cols)
header_str = ",".join(header_cols)

# the training set has 11610 rows; n_parts is the number of files to split it into
train_filenames = save_to_csv(output_dir, train_data, "train", header_str, n_parts=20)

valid_filenames = save_to_csv(output_dir, valid_data, "valid", header_str, n_parts=10)

test_filenames  = save_to_csv(output_dir, test_data, "test", header_str, n_parts=10)




['MedInc', 'HouseAge', 'AveRooms', 'AveBedrms', 'Population', 'AveOccup', 'Latitude', 'Longitude', 'HouseValue']
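A minimal sketch of the two NumPy helpers used above: np.array_split divides an index range into nearly equal chunks (this is how rows are assigned to files), and np.c_ stacks arrays column-wise:

for idx, rows in enumerate(np.array_split(np.arange(10), 3)):
    print(idx, rows)                          # 0 [0 1 2 3], 1 [4 5 6], 2 [7 8 9]
print(np.c_[np.ones((2, 2)), np.zeros(2)])    # shape (2, 3): two columns of ones plus one column of zeros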
import pprint

print("train_files:")
pprint.pprint(train_filenames)

print("valid_files:")
pprint.pprint(valid_filenames)

print("test_files:")
pprint.pprint(test_filenames)



train_files:
['generate_csv/train_00.csv',
 'generate_csv/train_01.csv',
 'generate_csv/train_02.csv',
 'generate_csv/train_03.csv',
 'generate_csv/train_04.csv',
 'generate_csv/train_05.csv',
 'generate_csv/train_06.csv',
 'generate_csv/train_07.csv',
 'generate_csv/train_08.csv',
 'generate_csv/train_09.csv',
 'generate_csv/train_10.csv',
 'generate_csv/train_11.csv',
 'generate_csv/train_12.csv',
 'generate_csv/train_13.csv',
 'generate_csv/train_14.csv',
 'generate_csv/train_15.csv',
 'generate_csv/train_16.csv',
 'generate_csv/train_17.csv',
 'generate_csv/train_18.csv',
 'generate_csv/train_19.csv']
valid_files:
['generate_csv/valid_00.csv',
 'generate_csv/valid_01.csv',
 'generate_csv/valid_02.csv',
 'generate_csv/valid_03.csv',
 'generate_csv/valid_04.csv',
 'generate_csv/valid_05.csv',
 'generate_csv/valid_06.csv',
 'generate_csv/valid_07.csv',
 'generate_csv/valid_08.csv',
 'generate_csv/valid_09.csv']
test_files:
['generate_csv/test_00.csv',
 'generate_csv/test_01.csv',
 'generate_csv/test_02.csv',
 'generate_csv/test_03.csv',
 'generate_csv/test_04.csv',
 'generate_csv/test_05.csv',
 'generate_csv/test_06.csv',
 'generate_csv/test_07.csv',
 'generate_csv/test_08.csv',
 'generate_csv/test_09.csv']
### Read the CSV files and turn them into a dataset
# 1. filenames -> dataset
# 2. read each file -> dataset -> datasets -> merge
# 3. parse the csv lines

# first build a dataset of the file names
filename_dataset = tf.data.Dataset.list_files(train_filenames)

for item in filename_dataset:
    print(item)

    
# read every file and interleave them into one dataset
n_readers = 5
dataset = filename_dataset.interleave(
    lambda filename: tf.data.TextLineDataset(filename).skip(1),  # skip(1) skips the header row
    cycle_length = n_readers  # number of files read concurrently and interleaved
)

for line in dataset.take(15):  # take(15) reads only the first 15 lines
    print(line.numpy())




tf.Tensor(b'generate_csv/train_12.csv', shape=(), dtype=string)
tf.Tensor(b'generate_csv/train_16.csv', shape=(), dtype=string)
tf.Tensor(b'generate_csv/train_04.csv', shape=(), dtype=string)
tf.Tensor(b'generate_csv/train_00.csv', shape=(), dtype=string)
tf.Tensor(b'generate_csv/train_14.csv', shape=(), dtype=string)
tf.Tensor(b'generate_csv/train_11.csv', shape=(), dtype=string)
tf.Tensor(b'generate_csv/train_01.csv', shape=(), dtype=string)
tf.Tensor(b'generate_csv/train_10.csv', shape=(), dtype=string)
tf.Tensor(b'generate_csv/train_19.csv', shape=(), dtype=string)
tf.Tensor(b'generate_csv/train_05.csv', shape=(), dtype=string)
tf.Tensor(b'generate_csv/train_09.csv', shape=(), dtype=string)
tf.Tensor(b'generate_csv/train_15.csv', shape=(), dtype=string)
tf.Tensor(b'generate_csv/train_17.csv', shape=(), dtype=string)
tf.Tensor(b'generate_csv/train_18.csv', shape=(), dtype=string)
tf.Tensor(b'generate_csv/train_03.csv', shape=(), dtype=string)
tf.Tensor(b'generate_csv/train_06.csv', shape=(), dtype=string)
tf.Tensor(b'generate_csv/train_07.csv', shape=(), dtype=string)
tf.Tensor(b'generate_csv/train_02.csv', shape=(), dtype=string)
tf.Tensor(b'generate_csv/train_13.csv', shape=(), dtype=string)
tf.Tensor(b'generate_csv/train_08.csv', shape=(), dtype=string)
b'0.4853051504718848,-0.8492418886278699,-0.06530126513877861,-0.023379656040017353,1.4974350551260218,-0.07790657783453239,-0.9023632702857819,0.7814514907892068,2.956'
b'0.42408210084996534,0.9129633171802288,-0.04437481876046234,-0.15297213746739335,-0.24727627804141977,-0.10539166599677323,0.8612674255663844,-1.3357789003702432,3.955'
b'0.04971034572063198,-0.8492418886278699,-0.06214699417830008,0.17878747064657746,-0.8025354230744277,0.0005066066922077538,0.6466457006743215,-1.1060793768010604,2.286'
b'0.04326300977263167,-1.0895425985107923,-0.38878716774583305,-0.10789864528874438,-0.6818663605100649,-0.0723871014747467,-0.8883662012710817,0.8213992340186296,1.426'
b'-0.09719300311107498,-1.249743071766074,0.36232962250170797,0.026906080250728295,1.033811814747154,0.045881586971778555,1.3418334617377423,-1.6353869745909178,1.832'
b'-0.7432054083470616,0.9129633171802288,-0.644320243857189,-0.1479096959813185,0.7398510909061499,0.11427691039226903,-0.7950524078397521,0.6815821327156534,1.438'
b'0.4369234889778008,-1.9706452014148417,-0.1664210569911193,0.05486205164394496,-0.8379195842775115,-0.1323988058685803,-0.9956770637171147,0.941242463706905,1.73'
b'-1.453851024367546,1.874166156711919,-1.1315714708271856,0.3611276016530489,-0.3978857847006997,-0.03273859332533962,-0.7390641317809511,0.646627857389904,1.875'
b'-0.7543417158936074,-0.9293421252555106,-0.9212720434835953,0.1242806741969112,-0.5983960315181748,-0.18494335623235414,-0.8183808561975836,0.8513600414406984,1.717'
b'1.0534699704183814,-0.1283397589791022,0.13509497508586193,-0.2852867771449356,-0.37066719915986596,-0.017744041396267323,0.7586222527919203,-1.1510205879341566,2.674'
b'-0.8246762898717912,-0.04823952235146133,-0.3448658166118309,-0.08477587145199328,0.5012348243315076,-0.034699996532417135,0.5300034588851571,-0.08741192445075467,0.717'
b'-0.49303811681102094,-1.5701440182766375,-0.6933897788607161,0.16277645579446545,0.3279431630548662,-0.08806528786307917,-0.86503775291325,0.6366409215825501,2.033'
b'-1.1179501498535522,0.3522616607867429,-0.17415480367337632,0.1029357335256435,-0.24364713330264193,-0.06195252491676357,1.9063819119972951,-1.1210597805120879,0.603'
b'0.18702261628258646,-0.20843999560674303,0.005869659830725365,-0.2645340092721605,-0.011381870020860852,-0.015878889894211247,0.05876880205693385,0.17224840654049697,0.84'
b'-0.8698076415077927,-0.44874070548966555,0.9621267572121975,3.9409717092762584,-0.9740125119816802,-0.09383082108319943,-0.6690787867074531,1.6752822455475638,0.425'
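A minimal sketch of how interleave cycles through several inner datasets; here small synthetic ranges stand in for the csv files:

demo = tf.data.Dataset.from_tensor_slices(tf.constant([1000, 2000, 3000], dtype=tf.int64))
demo = demo.interleave(
    lambda v: tf.data.Dataset.range(3).map(lambda i: v + i),  # one small inner dataset per element
    cycle_length=3)
print([int(e) for e in demo])  # round-robin: [1000, 2000, 3000, 1001, 2001, 3001, 1002, 2002, 3002]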
# parse a CSV record string; record_defaults gives the type (and default value) of each field
# tf.io.decode_csv(str, record_defaults)

sample_str      = '1,2,3,4,5'  # the string to parse
record_defaults = [tf.constant(0, dtype=tf.int32)] * 5
parsed_fields = tf.io.decode_csv(sample_str, record_defaults)
print(parsed_fields)

sample_str2      = '1,2,3,4,5'  # parse the same string, but with mixed default types
record_defaults2 = [tf.constant(0, dtype=tf.int32),
                    0,
                    np.nan,
                    "hello",
                    tf.constant([])]
parsed_fields2 = tf.io.decode_csv(sample_str2, record_defaults2)
print(parsed_fields2)

# pass a string whose fields are all empty
try:
    parsed_fields2 = tf.io.decode_csv(',,,,', record_defaults2)
except tf.errors.InvalidArgumentError as ex:
    print(ex)

# pass a string with more fields than record_defaults
try:
    parsed_fields2 = tf.io.decode_csv('1,2,3,4,5,6,7', record_defaults2)
except tf.errors.InvalidArgumentError as ex:
    print(ex)    



[<tf.Tensor: id=96, shape=(), dtype=int32, numpy=1>, <tf.Tensor: id=97, shape=(), dtype=int32, numpy=2>, <tf.Tensor: id=98, shape=(), dtype=int32, numpy=3>, <tf.Tensor: id=99, shape=(), dtype=int32, numpy=4>, <tf.Tensor: id=100, shape=(), dtype=int32, numpy=5>]
[<tf.Tensor: id=107, shape=(), dtype=int32, numpy=1>, <tf.Tensor: id=108, shape=(), dtype=int32, numpy=2>, <tf.Tensor: id=109, shape=(), dtype=float32, numpy=3.0>, <tf.Tensor: id=110, shape=(), dtype=string, numpy=b'4'>, <tf.Tensor: id=111, shape=(), dtype=float32, numpy=5.0>]
Field 4 is required but missing in record 0! [Op:DecodeCSV]
Expect 5 fields but have 7 in record 0 [Op:DecodeCSV]
def parse_csv_line(line, n_fields=9):
    defs = [tf.constant(np.nan)] * n_fields
    parse_fields = tf.io.decode_csv(line, record_defaults=defs)
    x = tf.stack(parse_fields[:-1])  # the first 8 fields are the features x
    y = tf.stack(parse_fields[-1:])  # the last field is the label y
    return x, y
parse_csv_line('-0.9974222662636643,1.2333642636907922,-0.7577192870888144,-0.011109251557751528,-0.23003784053222506,0.05487422342718872,-0.757726890467217,0.7065494722340417,1.739',n_fields=9)




(<tf.Tensor: id=131, shape=(8,), dtype=float32, numpy=
 array([-0.9974223 ,  1.2333642 , -0.7577193 , -0.01110925, -0.23003784,
         0.05487422, -0.7577269 ,  0.70654947], dtype=float32)>,
 <tf.Tensor: id=132, shape=(1,), dtype=float32, numpy=array([1.739], dtype=float32)>)
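tf.stack is what turns the list of scalar tensors returned by decode_csv into a single vector, which is why x above has shape (8,). A minimal sketch:

scalars = [tf.constant(1.0), tf.constant(2.0), tf.constant(3.0)]
print(tf.stack(scalars))  # tf.Tensor([1. 2. 3.], shape=(3,), dtype=float32)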
### Read the CSV files and turn them into a dataset (wrapped into a reusable function)
# 1. filenames -> dataset
# 2. read each file -> dataset -> datasets -> merge
# 3. parse the csv lines

def csv_reader_dataset(filenames, n_readers=5, 
                       batch_size=32, n_parse_threads=5, 
                       shuffle_buffer_size=10000):
    
    dataset = tf.data.Dataset.list_files(filenames)
    dataset = dataset.repeat()  # repeat() controls how many passes over the data; no argument means repeat indefinitely
    dataset = dataset.interleave(
        lambda filename: tf.data.TextLineDataset(filename).skip(1),
        cycle_length = n_readers
    )
    dataset = dataset.shuffle(shuffle_buffer_size)  # shuffle the data; a larger buffer gives a more thorough shuffle
    dataset = dataset.map(parse_csv_line, num_parallel_calls=n_parse_threads)
    dataset = dataset.batch(batch_size)  # group batch_size examples into each batch
    return dataset
    
train_set = csv_reader_dataset(train_filenames, batch_size=3)
for x_batch, y_batch in train_set.take(2):
    print("x:")
    pprint.pprint(x_batch)
    print("y:")
    pprint.pprint(y_batch)



x:
<tf.Tensor: id=216, shape=(3, 8), dtype=float32, numpy=
array([[-6.6722274e-01, -4.8239522e-02,  3.4529406e-01,  5.3826684e-01,
         1.8521839e+00, -6.1125383e-02, -8.4170932e-01,  1.5204847e+00],
       [ 4.9710345e-02, -8.4924191e-01, -6.2146995e-02,  1.7878747e-01,
        -8.0253541e-01,  5.0660671e-04,  6.4664572e-01, -1.1060793e+00],
       [ 8.1150836e-01, -4.8239522e-02,  5.1873392e-01, -2.9386396e-02,
        -3.4064025e-02, -5.0815947e-02, -7.1573567e-01,  9.1627514e-01]],
      dtype=float32)>
y:
<tf.Tensor: id=217, shape=(3, 1), dtype=float32, numpy=
array([[1.59 ],
       [2.286],
       [2.147]], dtype=float32)>
x:
<tf.Tensor: id=218, shape=(3, 8), dtype=float32, numpy=
array([[-0.82195884,  1.8741661 ,  0.1821235 , -0.03170019, -0.6011179 ,
        -0.14337493,  1.0852206 , -0.8613995 ],
       [-1.0775077 , -0.4487407 , -0.5680568 , -0.14269263, -0.09666677,
         0.12326469, -0.31448638, -0.4818959 ],
       [ 1.5180511 , -0.52884096,  0.81024706, -0.1921417 ,  0.44135395,
         0.02733506, -0.81838083,  0.8563535 ]], dtype=float32)>
y:
<tf.Tensor: id=219, shape=(3, 1), dtype=float32, numpy=
array([[1.054],
       [0.978],
       [2.898]], dtype=float32)>
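As an optional refinement that the pipeline above does not include, a prefetch step lets tf.data prepare the next batch while the current one is being consumed. A minimal sketch applied to the returned dataset:

train_set_prefetched = csv_reader_dataset(train_filenames, batch_size=3)
train_set_prefetched = train_set_prefetched.prefetch(tf.data.experimental.AUTOTUNE)  # overlap preprocessing with consumption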
batch_size = 32
train_set = csv_reader_dataset(train_filenames, batch_size = batch_size)
valid_set = csv_reader_dataset(valid_filenames, batch_size = batch_size)
test_set = csv_reader_dataset(test_filenames,   batch_size = batch_size)
model = keras.models.Sequential([
    keras.layers.Dense(30, activation="relu",input_shape=[8]),
    keras.layers.Dense(1),
])
model.compile(loss="mean_squared_error", optimizer="adam")


callbacks = [
    keras.callbacks.EarlyStopping(patience=5,min_delta=1e-3)
]

history=model.fit(train_set,
                  validation_data = valid_set,
                  steps_per_epoch = 11160 // batch_size,
                  validation_steps = 3870 // batch_size,
                  epochs = 100,
                  callbacks=callbacks)



Train for 348 steps, validate for 120 steps
Epoch 1/100
348/348 [==============================] - 1s 3ms/step - loss: 1.8485 - val_loss: 0.7731
Epoch 2/100
348/348 [==============================] - 1s 2ms/step - loss: 0.6238 - val_loss: 0.5818
Epoch 3/100
348/348 [==============================] - 1s 2ms/step - loss: 0.4906 - val_loss: 0.4821
Epoch 4/100
348/348 [==============================] - 1s 2ms/step - loss: 0.4322 - val_loss: 0.4568
Epoch 5/100
348/348 [==============================] - 1s 2ms/step - loss: 0.3987 - val_loss: 0.4285
Epoch 6/100
348/348 [==============================] - 1s 2ms/step - loss: 0.3987 - val_loss: 0.4097
...
Epoch 26/100
348/348 [==============================] - 1s 2ms/step - loss: 0.3319 - val_loss: 0.3419
Epoch 27/100
348/348 [==============================] - 1s 2ms/step - loss: 0.3237 - val_loss: 0.3422
Epoch 28/100
348/348 [==============================] - 1s 2ms/step - loss: 0.3317 - val_loss: 0.3421
Epoch 29/100
348/348 [==============================] - 1s 2ms/step - loss: 0.3433 - val_loss: 0.3408
model.evaluate(test_set, steps=5160 // batch_size)



161/161 [==============================] - 0s 844us/step - loss: 0.3417
0.3416683908945285
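To see actual predictions rather than just the loss, the trained model can be run on a test batch. A minimal sketch:

for x_batch, y_batch in test_set.take(1):                   # one batch from the test dataset
    preds = model.predict(x_batch)                          # predicted house values, shape (batch_size, 1)
    print(preds[:3].ravel(), y_batch[:3].numpy().ravel())   # predicted vs. actual for the first three rows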

 
