Study Notes on Quora Competition Code

1. Using seaborn color palettes

pal = sns.color_palette()
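
A minimal sketch of putting the palette to use (hypothetical data; sns.color_palette() returns a list of RGB tuples):

import seaborn as sns
import matplotlib.pyplot as plt

pal = sns.color_palette()
plt.bar(range(3), [1, 2, 3], color=pal[:3])  # first three default colors
plt.show()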

2. Printing file sizes

import os

print('# File sizes')
for f in os.listdir('../input'):
    if 'zip' not in f:
        # ljust() left-justifies the string, padding with spaces up to the given
        # width; if the width is less than the string's length, the original
        # string is returned unchanged.
        print(f.ljust(30) + str(round(os.path.getsize('../input/' + f) / 1000000, 2)) + 'MB')

3. Combining two Pandas columns into one Series

qids = pd.Series(df_train['qid1'].tolist() + df_train['qid2'].tolist())
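
An equivalent form, assuming df_train has qid1/qid2 columns, uses pd.concat and avoids the intermediate Python lists:

import pandas as pd

qids = pd.concat([df_train['qid1'], df_train['qid2']], ignore_index=True)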

4. Scaling the y-axis in matplotlib

plt.yscale('log', nonposy='clip')  # Matplotlib >= 3.3 renamed nonposy to nonpositive
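
A minimal sketch with hypothetical data, showing why a log y-axis helps with heavy-tailed counts:

import numpy as np
import matplotlib.pyplot as plt

counts = np.random.zipf(2.0, 1000)  # heavy-tailed sample
plt.hist(counts, bins=50)
plt.yscale('log')
plt.show()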

5. Checking whether a string is numeric or uppercase

isupper()
isdigit() 
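
These are standard Python str methods; for example:

'ABC'.isupper()    # True
'abc'.isupper()    # False
'2017'.isdigit()   # True
'20.17'.isdigit()  # False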

6. Printing GridSearchCV results in order

results = cv.cv_results_
for rank, mean, std, params in zip(results['rank_test_score'],
                                   results['mean_test_score'],
                                   results['std_test_score'],
                                   results['params']):
    print("{0}. Mean validation neg log loss: {1:.3f} (std: {2:.3f}) - {3}".format(
        rank, mean, std, params))
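
A minimal sketch of the GridSearchCV object assumed above (the grid and data names are illustrative):

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV

param_grid = {'C': [1e-6, 1e-4, 1.0]}
cv = GridSearchCV(LogisticRegression(), param_grid,
                  scoring='neg_log_loss', cv=5)
cv.fit(X_train, y_train)  # X_train / y_train assumed to exist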

7. Plotting ROC curves
With A = true positives, B = false positives, and C = false negatives:
Precision is A/(A+B); in plain terms, "of your positive predictions, what fraction are correct."
Recall is A/(A+C); in plain terms, "of the actual positives, what fraction your predictions cover."
False Positive Rate, FPR (x-axis): samples predicted positive but actually negative, as a fraction of all actual negatives.
True Positive Rate, TPR (y-axis): samples predicted positive and actually positive, as a fraction of all actual positives.
The AUC is itself a probability: if you randomly pick one positive sample and one negative sample, the AUC is the probability that the classifier's score ranks the positive above the negative. The larger the AUC, the more likely the classifier ranks positives ahead of negatives, and hence the better it separates the two classes.

import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_curve, auc

colors = ['r', 'g', 'b', 'y', 'k', 'c', 'm', 'brown', 'r']
lw = 1
Cs = [1e-6, 1e-4, 1e0]

plt.figure(figsize=(12,8))
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve for different classifiers')

plt.plot([0, 1], [0, 1], color='navy', lw=lw, linestyle='--')

labels = []
for idx, C in enumerate(Cs):
    clf = LogisticRegression(C=C)
    clf.fit(X_train, y_train)
    print("C: {}, parameters {} and intercept {}".format(C, clf.coef_, clf.intercept_))
    fpr, tpr, _ = roc_curve(y_test, clf.predict_proba(X_test)[:,1])
    roc_auc = auc(fpr, tpr)
    plt.plot(fpr, tpr, lw=lw, color=colors[idx])
    labels.append("C: {}, AUC = {}".format(C, np.round(roc_auc, 4)))

plt.legend(['random AUC = 0.5'] + labels)
plt.show()

8. Setting up file paths

BASE_DIR = '../input/'
EMBEDDING_FILE = BASE_DIR + 'GoogleNews-vectors-negative300.bin'
TRAIN_DATA_FILE = BASE_DIR + 'train.csv'
TEST_DATA_FILE = BASE_DIR + 'test.csv'
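
An equivalent, more portable form uses os.path.join instead of string concatenation:

import os

EMBEDDING_FILE = os.path.join(BASE_DIR, 'GoogleNews-vectors-negative300.bin')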

9. Loading pretrained word vectors

from gensim.models import KeyedVectors
print('Indexing word vectors')

word2vec = KeyedVectors.load_word2vec_format(EMBEDDING_FILE, binary=True)
# gensim >= 4.0 removed .vocab; use len(word2vec.key_to_index) there instead
print('Found %s word vectors of word2vec' % len(word2vec.vocab))

10. Removing stopwords

from nltk.corpus import stopwords

# Fragment from inside text_to_wordlist (see the sketch after item 12);
# 'text' is a list of tokens at this point.
if remove_stopwords:
    stops = set(stopwords.words("english"))
    text = [w for w in text if w not in stops]

11. Joining words into a string

text = " ".join(text)

12. Stemming

from nltk.stem import SnowballStemmer

if stem_words:
    text = text.split()
    stemmer = SnowballStemmer('english')
    stemmed_words = [stemmer.stem(word) for word in text]
    text = " ".join(stemmed_words)

13. Reading a CSV file for NLP

import codecs
import csv

test_texts_1 = []
test_texts_2 = []
test_ids = []
with codecs.open(TEST_DATA_FILE, encoding='utf-8') as f:
    reader = csv.reader(f, delimiter=',')
    header = next(reader)
    for values in reader:
        test_texts_1.append(text_to_wordlist(values[1]))
        test_texts_2.append(text_to_wordlist(values[2]))
        test_ids.append(values[0])
print('Found %s texts in test.csv' % len(test_texts_1))

14. Tokenization in Keras

from keras.preprocessing.text import Tokenizer

tokenizer = Tokenizer(num_words=MAX_NB_WORDS)
tokenizer.fit_on_texts(texts_1 + texts_2 + test_texts_1 + test_texts_2)

sequences_1 = tokenizer.texts_to_sequences(texts_1)
sequences_2 = tokenizer.texts_to_sequences(texts_2)
word_index = tokenizer.word_index  # word -> integer index mapping, used in item 16

15. Padding sequences to a fixed length

from keras.preprocessing.sequence import pad_sequences

test_data_1 = pad_sequences(test_sequences_1, maxlen=MAX_SEQUENCE_LENGTH)

16. Preparing the embedding matrix

print('Preparing embedding matrix')

# +1 because Tokenizer indices start at 1; row 0 stays all-zero
nb_words = min(MAX_NB_WORDS, len(word_index)) + 1

embedding_matrix = np.zeros((nb_words, EMBEDDING_DIM))
for word, i in word_index.items():
    if i >= nb_words:  # word_index keeps all words, not just the top MAX_NB_WORDS
        continue
    if word in word2vec.vocab:  # gensim >= 4.0: `word in word2vec.key_to_index`
        embedding_matrix[i] = word2vec.word_vec(word)
print('Null word embeddings: %d' % np.sum(np.sum(embedding_matrix, axis=1) == 0))
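
For context, a sketch of how this matrix is typically consumed downstream, as a frozen Keras Embedding layer (the layer arguments here are assumed from the usual pattern, not quoted from the kernel):

from keras.layers import Embedding

embedding_layer = Embedding(nb_words, EMBEDDING_DIM,
                            weights=[embedding_matrix],
                            input_length=MAX_SEQUENCE_LENGTH,
                            trainable=False)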

17. Sampling train/validation data

# Random permutation, then split the indices into train / validation
perm = np.random.permutation(len(data_1))
idx_train = perm[:int(len(data_1)*(1-VALIDATION_SPLIT))]
idx_val = perm[int(len(data_1)*(1-VALIDATION_SPLIT)):]

# Double the training data by also including each question pair in swapped
# order (similarity is symmetric), so the labels are repeated twice.
data_1_train = np.vstack((data_1[idx_train], data_2[idx_train]))
data_2_train = np.vstack((data_2[idx_train], data_1[idx_train]))
labels_train = np.concatenate((labels[idx_train], labels[idx_train]))
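
The validation arrays follow the same pattern (a sketch continuing the code above):

data_1_val = np.vstack((data_1[idx_val], data_2[idx_val]))
data_2_val = np.vstack((data_2[idx_val], data_1[idx_val]))
labels_val = np.concatenate((labels[idx_val], labels[idx_val]))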