import nltk
import ssl
# Work around SSL certificate verification failures when NLTK fetches its
# data: if this Python exposes ssl._create_unverified_context, install it as
# the factory for default HTTPS contexts.
# NOTE(security): this disables certificate verification for HTTPS requests
# that rely on the default context in this process. Prefer fixing the local
# certificate store where possible.
try:
    _create_unverified_https_context = ssl._create_unverified_context
except AttributeError:
    # This Python has no ssl._create_unverified_context; leave the default
    # HTTPS context untouched.
    pass
else:
    ssl._create_default_https_context = _create_unverified_https_context

# Called without arguments, so no specific package is requested here.
nltk.download()
# Then enter the packages you need in the NLTK downloader.
# A complete text classification workflow
# Using the NLTK library, we can do a lot of text preprocessing.
import nltk
from nltk.tokenize import word_tokenize

# Split a sentence into word tokens.
tokens = word_tokenize("The quick brown fox jumps over the lazy dog")
nltk.download('stopwords')
print(tokens)

from nltk.corpus import stopwords

# Drop English stop words. The membership test is case-sensitive, so a token
# is removed only when it matches a stop-word entry exactly.
stop_words = set(stopwords.words('english'))
tokens = [w for w in tokens if w not in stop_words]
print(tokens)

# NLTK provides several stemmer interfaces, such as the Porter, Lancaster and
# Snowball stemmers.
from nltk.stem.porter import PorterStemmer

porter = PorterStemmer()
# Reduce every remaining token to its stem.
stems = [porter.stem(t) for t in tokens]
print(stems)
# Convert the dataset from files into a pandas DataFrame.
import pandas as pd
import os

folder = 'aclImdb'
labels = {'pos': 1, 'neg': 0}

# Collect one [text, label] row per review file and build the DataFrame once
# at the end. DataFrame.append was removed in pandas 2.0, and it copied the
# whole frame on every call, which made the loop quadratic.
rows = []
for split_name in ('test', 'train'):
    for label_name in ('pos', 'neg'):
        path = os.path.join(folder, split_name, label_name)
        for filename in os.listdir(path):
            with open(os.path.join(path, filename), 'r', encoding='utf-8') as infile:
                txt = infile.read()
            rows.append([txt, labels[label_name]])

# Columns: the raw review text and its 0/1 sentiment label.
df = pd.DataFrame(rows, columns=['review', 'sentiment'])
# Get the frequency distribution of the tokens across all reviews.
import nltk
from nltk.tokenize import word_tokenize

# Join every review into one string and split it into word tokens.
reviews = df.review.str.cat(sep=' ')
tokens = word_tokenize(reviews)
vocabulary = set(tokens)
print(len(vocabulary))
frequency_dist = nltk.FreqDist(tokens)

# The 50 most frequent tokens. A bare expression displays nothing outside an
# interactive session, so keep the result and print it explicitly.
top_words = sorted(frequency_dist, key=frequency_dist.__getitem__, reverse=True)[0:50]
print(top_words)

# Get rid of stop words.
from nltk.corpus import stopwords

stop_words = set(stopwords.words('english'))
tokens = [w for w in tokens if w not in stop_words]

# Recount after filtering so that later consumers of frequency_dist (the word
# cloud below) reflect the stop-word removal instead of the stale counts.
frequency_dist = nltk.FreqDist(tokens)
# Draw a word cloud from the token frequencies.
from wordcloud import WordCloud
import matplotlib.pyplot as plt

# Keep the call chained on a single line: a trailing "." followed by a line
# break is a syntax error.
wordcloud = WordCloud().generate_from_frequencies(frequency_dist)
plt.imshow(wordcloud)
plt.axis("off")  # hide the axes around the image
plt.show()
# Build the classifier.
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score
from sklearn.naive_bayes import MultinomialNB

# Rows labelled up to and including 24999 form the training set; rows from
# 25000 onwards form the test set (.loc slices by label, both ends inclusive).
train_rows = df.loc[:24999]
test_rows = df.loc[25000:]
X_train = train_rows['review'].values
y_train = train_rows['sentiment'].values
X_test = test_rows['review'].values
y_test = test_rows['sentiment'].values

# Fit the TF-IDF vectorizer on the training reviews only, then apply the
# fitted vectorizer to the test reviews.
vectorizer = TfidfVectorizer()
train_vectors = vectorizer.fit_transform(X_train)
test_vectors = vectorizer.transform(X_test)
print(train_vectors.shape, test_vectors.shape)

# Train a multinomial naive Bayes model and report its accuracy on the test
# set.
clf = MultinomialNB()
clf.fit(train_vectors, y_train)
predicted = clf.predict(test_vectors)
print(accuracy_score(y_test, predicted))
# Replace missing values with the mean:
import numpy as np
from sklearn.impute import SimpleImputer

# SimpleImputer supersedes sklearn.preprocessing.Imputer, which was removed in
# scikit-learn 0.22. It always imputes column-wise, so there is no axis
# argument, and missing values are identified with np.nan rather than 'NaN'.
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
imputer = imputer.fit(X[:, 1:3])  # fit on all rows of columns 1 and 2
X[:, 1:3] = imputer.transform(X[:, 1:3])
# Code:
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Wed Jul 25 18:52:15 2018
@author: krunal
"""
# Importing Libraries
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
# Importing Dataset
dataset = pd.read_csv('patientData.csv')
# Features: every column except the last one.
feature_columns = dataset.iloc[:, :-1]
X = feature_columns.values
# Target: the column at position 3.
target_column = dataset.iloc[:, 3]
Y = target_column.values
# Handling Missing Data
from sklearn.impute import SimpleImputer

# SimpleImputer supersedes sklearn.preprocessing.Imputer, which was removed in
# scikit-learn 0.22. It always imputes column-wise, so there is no axis
# argument, and missing values are identified with np.nan rather than 'NaN'.
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
# Fit on columns 1 and 2, then fill their missing entries with the column mean.
imputer = imputer.fit(X[:, 1:3])
X[:, 1:3] = imputer.transform(X[:, 1:3])
# Encode Categorical Data
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

# Map the categories in column 0 to integer codes.
labelencoder_X = LabelEncoder()
X[:, 0] = labelencoder_X.fit_transform(X[:, 0])

# OneHotEncoder's categorical_features argument was removed in scikit-learn
# 0.22; ColumnTransformer now selects the column to encode.
# remainder='passthrough' keeps the other columns after the one-hot columns,
# and sparse_threshold=0 forces a dense result (replacing .toarray()).
# NOTE: ColumnTransformer fits a clone, so the fitted encoder lives in
# column_transformer.named_transformers_['onehot'], not in onehotencoder.
onehotencoder = OneHotEncoder()
column_transformer = ColumnTransformer(
    [('onehot', onehotencoder, [0])],
    remainder='passthrough',
    sparse_threshold=0,
)
# Cast to float so the result is a numeric array rather than an object array.
X = np.asarray(column_transformer.fit_transform(X), dtype=float)

# Encode the target labels as integers.
labelencoder_Y = LabelEncoder()
Y = labelencoder_Y.fit_transform(Y)
# Split the data between the Training Data and Test Data
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Hold out 20% of the rows for testing; random_state fixes the shuffle.
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=0.2, random_state=0
)

# Feature Scaling
# Fit the scaler on the training features only, then apply the same fitted
# scaler to both the training and the test features.
sc_X = StandardScaler()
sc_X.fit(X_train)
X_train = sc_X.transform(X_train)
X_test = sc_X.transform(X_test)
# Source: https://appdividend.com/2018/07/23/prepare-dataset-for-machine-learning-in-python/