1.sns調色板使用
# Grab seaborn's default qualitative colour palette for use in later plots.
pal = sns.color_palette()
2.打印文件大小
# Print the size (in MB, 2 decimals) of every non-zip file in ../input.
# As pasted, the loop/if bodies had lost their indentation (SyntaxError);
# structure restored here.
print('# File sizes')
for f in os.listdir('../input'):
    if 'zip' not in f:
        # ljust(30) pads the file name with spaces to column 30 so the
        # sizes line up in one column.
        print(f.ljust(30) + str(round(os.path.getsize('../input/' + f) / 1000000, 2)) + 'MB')
3.Pandas 中兩列合併成一個Series的方法
# Merge two DataFrame columns into one long Series: all question ids from
# both sides of each Quora pair, concatenated end to end.
qids = pd.Series(df_train['qid1'].tolist() + df_train['qid2'].tolist())
4.matplotlib中y軸的縮放
# Log-scale the y axis, clipping non-positive values to a tiny positive
# number. The old keyword 'nonposy' was deprecated in Matplotlib 3.3 and
# removed in 3.5; 'nonpositive' is the current name.
plt.yscale('log', nonpositive='clip')
5.判斷是否是數字或者大寫
# str.isupper(): True if all cased characters in the string are uppercase.
isupper()
# str.isdigit(): True if all characters in the string are digits.
isdigit()
6.按順序輸出GridSearchCV結果
# Print GridSearchCV results one line per candidate, in the original
# candidate order: rank, mean/std validation score, and the parameter dict.
# As pasted, the loop body had lost its indentation (SyntaxError); restored,
# and the index-arithmetic loop replaced with a zip over the parallel arrays.
results = cv.cv_results_
for rank, mean_score, std_score, params in zip(results['rank_test_score'],
                                               results['mean_test_score'],
                                               results['std_test_score'],
                                               results['params']):
    print("{0}. Mean validation neg log loss: {1:.3f} (std: {2:.3f}) - {3}".format(
        rank,
        mean_score,
        std_score,
        params
    ))
7.繪製ROC曲線
精確率(Precision)就是A/(A+B)(A=真正例TP,B=假正例FP),大白話就是“你預測爲正的裏面有多少是對的”
召回率(Recall)就是A/(A+C)(C=假負例FN),大白話就是“正例裏你的預測覆蓋的比例”
False Positive Rate FPR(橫軸):預測爲正例,但實際爲負例,這些佔全部真正負例的比例
True Positive Rate TPR(縱軸):預測爲正例,實際也爲正例,這些佔全部真正正例的比例。
首先AUC值是一個概率值,當你隨機挑選一個正樣本以及負樣本,當前的分類算法根據計算得到的Score值將這個正樣本排在負樣本前面的概率就是AUC值,AUC值越大,當前分類算法越有可能將正樣本排在負樣本前面,從而能夠更好地分類。
# Plot ROC curves for logistic-regression classifiers trained with several
# values of C (inverse regularisation strength), plus the diagonal chance
# line (AUC = 0.5). As pasted, the for-loop body had lost its indentation
# (SyntaxError); structure restored here.
colors = ['r', 'g', 'b', 'y', 'k', 'c', 'm', 'brown', 'r']
lw = 1
Cs = [1e-6, 1e-4, 1e0]

plt.figure(figsize=(12, 8))
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve for different classifiers')
# Chance line: a random classifier lies on this diagonal.
plt.plot([0, 1], [0, 1], color='navy', lw=lw, linestyle='--')

labels = []
for idx, C in enumerate(Cs):
    clf = LogisticRegression(C=C)
    clf.fit(X_train, y_train)
    print("C: {}, parameters {} and intercept {}".format(C, clf.coef_, clf.intercept_))
    # Score with the positive-class probability, not the hard 0/1 prediction.
    fpr, tpr, _ = roc_curve(y_test, clf.predict_proba(X_test)[:, 1])
    roc_auc = auc(fpr, tpr)
    plt.plot(fpr, tpr, lw=lw, color=colors[idx])
    labels.append("C: {}, AUC = {}".format(C, np.round(roc_auc, 4)))
plt.legend(['random AUC = 0.5'] + labels)
8.文件目錄的設置
# Input data locations: the pre-trained Google News word2vec vectors and
# the Quora train/test CSVs, all located under ../input/.
BASE_DIR = '../input/'
EMBEDDING_FILE = '{}GoogleNews-vectors-negative300.bin'.format(BASE_DIR)
TRAIN_DATA_FILE = '{}train.csv'.format(BASE_DIR)
TEST_DATA_FILE = '{}test.csv'.format(BASE_DIR)
9.載入已訓練的詞向量
# Load the pre-trained Google News word2vec embeddings (binary format).
from gensim.models import KeyedVectors
print('Indexing word vectors')
word2vec = KeyedVectors.load_word2vec_format(EMBEDDING_FILE, \
binary=True)
# NOTE(review): `.vocab` is the gensim 3.x API; gensim >= 4.0 replaced it
# with `.key_to_index` -- confirm the installed gensim version.
print('Found %s word vectors of word2vec' % len(word2vec.vocab))
10.去除停用詞
# Optionally drop English stop words from the token list. As pasted, the
# if-body had lost its indentation (SyntaxError); restored here.
if remove_stopwords:
    # A set makes each membership test O(1) instead of O(n) on a list.
    stops = set(stopwords.words("english"))
    text = [w for w in text if w not in stops]
11.字符連接成字符串
# Join the token list back into a single space-separated string.
text = " ".join(text)
12.提取詞幹
# Optionally reduce every token to its stem (e.g. "running" -> "run") with
# the Snowball stemmer. As pasted, the if-body had lost its indentation
# (SyntaxError); restored here.
if stem_words:
    text = text.split()
    stemmer = SnowballStemmer('english')
    stemmed_words = [stemmer.stem(word) for word in text]
    text = " ".join(stemmed_words)
13.讀取NLP的csv文件
# Read the test CSV: column 0 is the row id, columns 1 and 2 are the two
# questions of each pair, each normalised through text_to_wordlist().
# As pasted, the with/for bodies had lost their indentation (SyntaxError);
# structure restored here.
test_texts_1 = []
test_texts_2 = []
test_ids = []
with codecs.open(TEST_DATA_FILE, encoding='utf-8') as f:
    reader = csv.reader(f, delimiter=',')
    header = next(reader)  # skip the header row
    for values in reader:
        test_texts_1.append(text_to_wordlist(values[1]))
        test_texts_2.append(text_to_wordlist(values[2]))
        test_ids.append(values[0])
print('Found %s texts in test.csv' % len(test_texts_1))
13.Keras中的分詞處理
# Fit ONE shared Keras tokenizer over both train and test questions so every
# text maps words to the same integer ids (vocabulary capped at MAX_NB_WORDS),
# then convert the training texts to integer sequences.
tokenizer = Tokenizer(num_words=MAX_NB_WORDS)
tokenizer.fit_on_texts(texts_1 + texts_2 + test_texts_1 + test_texts_2)
sequences_1 = tokenizer.texts_to_sequences(texts_1)
sequences_2 = tokenizer.texts_to_sequences(texts_2)
14.填充序列長度
# Pad (or truncate) every integer sequence to exactly MAX_SEQUENCE_LENGTH.
test_data_1 = pad_sequences(test_sequences_1, maxlen=MAX_SEQUENCE_LENGTH)
15.詞向量預處理
# Build the embedding matrix: row i holds the word2vec vector of the word
# with tokenizer index i; words absent from word2vec stay all-zero.
# Fixes two issues with the pasted snippet: the for/if bodies had lost
# their indentation (SyntaxError), and there was no bound check on i --
# Keras word_index contains EVERY seen word, so an index >= nb_words would
# raise IndexError whenever the corpus vocabulary exceeds MAX_NB_WORDS.
print('Preparing embedding matrix')
nb_words = min(MAX_NB_WORDS, len(word_index)) + 1  # +1: index 0 is reserved for padding
embedding_matrix = np.zeros((nb_words, EMBEDDING_DIM))
for word, i in word_index.items():
    if i >= nb_words:
        continue  # keep only the top-MAX_NB_WORDS vocabulary
    # NOTE(review): .vocab / .word_vec are the gensim 3.x API; gensim >= 4
    # uses .key_to_index and direct indexing -- confirm installed version.
    if word in word2vec.vocab:
        embedding_matrix[i] = word2vec.word_vec(word)
print('Null word embeddings: %d' % np.sum(np.sum(embedding_matrix, axis=1) == 0))
16.train,valid數據採樣提取
# Shuffle the pair indices and carve off the final VALIDATION_SPLIT fraction
# as the validation set. The training data is then doubled with the question
# order swapped -- (q1, q2) and (q2, q1) -- so the model sees each pair in
# both directions; labels are duplicated to match.
perm = np.random.permutation(len(data_1))
n_train = int(len(data_1) * (1 - VALIDATION_SPLIT))
idx_train = perm[:n_train]
idx_val = perm[n_train:]

data_1_train = np.vstack((data_1[idx_train], data_2[idx_train]))
data_2_train = np.vstack((data_2[idx_train], data_1[idx_train]))
labels_train = np.concatenate((labels[idx_train], labels[idx_train]))
-
17.