# Sentiment analysis (情感分析)
# Sentiment analysis: Naive Bayes over the NLTK movie_reviews corpus using
# bag-of-words (word-presence) features.
import nltk.corpus as nc
import nltk.classify as cf
import nltk.classify.util as cu

# Build (feature dict, label) pairs for the positive reviews.
pdata = []
fileids = nc.movie_reviews.fileids("pos")
for fileid in fileids:
    feature = {}
    words = nc.movie_reviews.words(fileid)
    for word in words:
        feature[word] = True
    pdata.append((feature, "POSITIVE"))

# Same for the negative reviews.
ndata = []
nfileids = nc.movie_reviews.fileids("neg")
for fileid in nfileids:
    feature = {}
    words = nc.movie_reviews.words(fileid)
    for word in words:
        feature[word] = True
    ndata.append((feature, "NEGATIVE"))  # fixed typo: was "NEGETIVE"

# 80/20 train/test split, preserving the class balance.
pnum, nnum = int(0.8 * len(pdata)), int(0.8 * len(ndata))
train_data = pdata[:pnum] + ndata[:nnum]
test_data = pdata[pnum:] + ndata[nnum:]

model = cf.NaiveBayesClassifier.train(train_data)
ac = cu.accuracy(model, test_data)
print("%.2f%%" % round(ac * 100, 2))

tops = model.most_informative_features()  # most informative words
for top in tops:
    print(top[0])

reviews = [
    "It is a amazing movie.",
    "This is a dull movie. I would never recommend it to anyone",
    "The cinematography is pretty great in this movie.",
    "This direction was terrible and the story was all over the place."
]
sents, probs = [], []
for review in reviews:
    feature = {}
    words = review.split()
    for word in words:
        feature[word] = True
    pcls = model.prob_classify(feature)
    sent = pcls.max()
    prob = pcls.prob(sent)
    sents.append(sent)
    probs.append(prob)
# Bug fix: the loop variable was named `probs`, which shadowed the list and
# left `prob` stuck at the last value of the previous loop, so every line
# printed the same probability.
for review, sent, prob in zip(reviews, sents, probs):
    print(review, "->", sent, "%.2f%%" % round(prob * 100, 2))
# Gender identification from name suffixes (性別識別)
# Gender identification: classify first names as male/female from their last
# 1..5 letters, keeping the suffix length that scores best on held-out data.
import random
import numpy as np
import nltk.corpus as nc
import nltk.classify as cf

male_names = nc.names.words("male.txt")
female_names = nc.names.words("female.txt")

models, acs = [], []
for n_letter in range(1, 6):
    # One feature per name: its lower-cased trailing n_letter letters.
    data = [({"feature": name[-n_letter:].lower()}, "male")
            for name in male_names]
    data += [({"feature": name[-n_letter:].lower()}, "female")
             for name in female_names]
    random.seed(7)  # fixed seed so the shuffle (and split) is reproducible
    random.shuffle(data)
    half = int(len(data) / 2)
    train_data, test_data = data[:half], data[half:]
    model = cf.NaiveBayesClassifier.train(train_data)
    acs.append(cf.accuracy(model, test_data))
    models.append(model)

# Index of the best-scoring model; suffix length is index + 1.
best_index = np.array(acs).argmax()
best_letter = best_index + 1

names = ["Leonardo","Amy","Sam","Tom","Katherine","Tayior",
         "Susanne","Watermelon","Alpaca","Paris","Python","JAVA"]
print(names)
genders = [models[best_index].classify({"feature": name[-best_letter:].lower()})
           for name in names]
print(genders)
# Text classification (文本分類)
# Text classification: multinomial Naive Bayes over TF-IDF features of five
# 20newsgroups categories.
import sklearn.datasets as sd
import sklearn.feature_extraction.text as ft
import sklearn.naive_bayes as nb

# Newsgroup name -> display label (typos fixed: MOTORCYCLES, SPACE).
cld = {"misc.forsale": "SALES",
       "rec.motorcycles": "MOTORCYCLES",
       "rec.sport.baseball": "BASEBALL",
       "sci.crypt": "CRYPTOGRAPHY",
       "sci.space": "SPACE"}

train = sd.fetch_20newsgroups(subset="train",
                              categories=cld.keys(), random_state=7,
                              shuffle=True)
train_data = train.data
train_y = train.target
categories = train.target_names

# Bag-of-words counts, then TF-IDF weighting.
cv = ft.CountVectorizer()
train_fmt = cv.fit_transform(train_data)
tf = ft.TfidfTransformer()
train_x = tf.fit_transform(train_fmt)

model = nb.MultinomialNB()
model.fit(train_x, train_y)

test_data = ["The curveballs of right handed pitches tend to curver to the left",
             "Caesar cipher is an ancient from encryption",
             "This two-wheeler is really good on slippery roads"]
# Reuse the fitted vectorizer/transformer (transform only, no refitting).
test_fmt = cv.transform(test_data)
test_x = tf.transform(test_fmt)
pred_y = model.predict(test_x)
for sentence, index in zip(test_data, pred_y):
    print(sentence, "->", cld[categories[index]])
# 3D scatter plot (3D圖)
# 3D scatter plot of random points, colored by their coordinates and sized
# by their distance from the origin.
import numpy as np
import matplotlib.pyplot as mp
from mpl_toolkits.mplot3d import axes3d

n_samples = 300
x = np.random.rand(n_samples)
y = np.random.rand(n_samples)
z = np.random.rand(n_samples)

# mp.gca(projection="3d") was deprecated in Matplotlib 3.4 and removed in
# 3.7; create the 3D axes explicitly instead.
ax = mp.figure().add_subplot(projection="3d")
mp.title('Scatter 3D', fontsize=20)
ax.set_xlabel('x', fontsize=14)
ax.set_ylabel('y', fontsize=14)
ax.set_zlabel('z', fontsize=14)
mp.tick_params(labelsize=10)
ax.scatter(x, y, z, c=np.array([x, y, z]).T,
           s=100 * np.linalg.norm((x, y, z), axis=0))
mp.show()
# Training: HMM-based image object recognition (訓練)
import os
import sys
import warnings
import numpy as np
import cv2 as cv
import hmmlearn.hmm as hl
def show_image(title, image):
    """Display *image* in an OpenCV window named *title*."""
    cv.imshow(title, image)
def search_objects(directory):
    """Recursively collect .jpg paths under *directory*, grouped by label.

    The label of each image is the name of its immediate parent folder.
    Returns a dict mapping label -> list of file paths. Raises IOError
    when *directory* does not exist.
    """
    if not os.path.isdir(directory):
        # Typo fixed: the message said "director".
        raise IOError(
            "The directory '" + directory + "' doesn't exist!")
    objects = {}
    for curdir, subdirs, files in os.walk(directory):
        for jpeg in (file for file in files if file.endswith('.jpg')):
            path = os.path.join(curdir, jpeg)
            # The parent folder name is the class label.
            label = path.split(os.path.sep)[-2]
            objects.setdefault(label, []).append(path)
    return objects
def read_image(filename):
    """Load *filename* with OpenCV and return it as a BGR array."""
    return cv.imread(filename)
def resize_image(image, size):
    """Scale *image* uniformly so its shorter side becomes *size* pixels."""
    height, width = image.shape[:2]
    factor = size / min(height, width)
    return cv.resize(image, None, fx=factor, fy=factor)
def calc_features(image):
    """Detect STAR keypoints on *image* and return their SIFT descriptors."""
    detector = cv.xfeatures2d.StarDetector_create()
    keypoints = detector.detect(image)
    extractor = cv.xfeatures2d.SIFT_create()
    # Descriptors are computed on the grayscale image at the STAR keypoints.
    gray = cv.cvtColor(image, cv.COLOR_BGR2GRAY)
    _, desc = extractor.compute(gray, keypoints)
    return desc
def read_data(directory):
    """Read every labeled image under *directory*.

    Returns (x, y, z): per-label stacked SIFT descriptor matrices, the
    label names, and the original (unresized) images grouped per label.
    """
    objects = search_objects(directory)
    x, y, z = [], [], []
    for label, filenames in objects.items():
        z.append([])
        descs = np.array([])
        for filename in filenames:
            print(filename, '->', label)
            image = read_image(filename)
            z[-1].append(image)  # keep the full-size image for display
            desc = calc_features(resize_image(image, 200))
            # Stack this file's descriptors under the running matrix.
            descs = desc if len(descs) == 0 else np.append(descs, desc, axis=0)
        x.append(descs)
        y.append(label)
    return x, y, z
def train_models(x, y):
    """Fit one 4-state diagonal-covariance Gaussian HMM per label.

    Returns a dict mapping each label to its fitted model.
    """
    models = {}
    for descs, label in zip(x, y):
        hmm = hl.GaussianHMM(n_components=4, covariance_type='diag',
                             n_iter=1000)
        models[label] = hmm.fit(descs)
    return models
def pred_models(models, x):
    """Label each descriptor matrix with the model that scores it highest.

    On ties the first model (in dict order) wins.
    """
    labels = []
    for descs in x:
        best_label, best_score = None, None
        for label, model in models.items():
            score = model.score(descs)
            # Strict comparison keeps the earlier model on equal scores.
            if best_score is None or score > best_score:
                best_label, best_score = label, score
        labels.append(best_label)
    return labels
def show_labels(labels, pred_labels, images):
    """Open a window per image, titled with its true vs. predicted label."""
    count = 0
    for label, pred_label, row in zip(labels, pred_labels, images):
        for image in row:
            count += 1
            marker = '==' if label == pred_label else '!='
            show_image('{}: {} {} {}'.format(count, label, marker, pred_label),
                       image)
def main(argc, argv, envp):
    """Train per-label HMMs on the training images and display test results."""
    warnings.filterwarnings(
        'ignore', category=DeprecationWarning)
    np.seterr(all='ignore')
    # os.path.join keeps the paths portable — the originals hard-coded the
    # Windows '\\' separator, which breaks on POSIX.
    train_x, train_y, train_z = read_data(os.path.join('objects', 'training'))
    test_x, test_y, test_z = read_data(os.path.join('objects', 'testing'))
    models = train_models(train_x, train_y)
    pred_test_y = pred_models(models, test_x)
    show_labels(test_y, pred_test_y, test_z)
    cv.waitKey()
    return 0

if __name__ == '__main__':
    sys.exit(main(len(sys.argv), sys.argv, os.environ))
# Feature descriptor visualization (特徵)
import os
import sys
import platform
import cv2 as cv
import numpy as np
import matplotlib.pyplot as mp
import mpl_toolkits.axes_grid1 as mg
def resize_image(image):
    """Return *image* enlarged to twice its width and height."""
    return cv.resize(image, None, fx=2, fy=2)
def read_image(filename):
    """Load *filename* from disk as a BGR image."""
    return cv.imread(filename)
def show_image(name, image):
    """Display *image* in a window called *name*."""
    cv.imshow(name, image)
def draw_desc(desc):
    """Render the descriptor matrix *desc* as a heat map with a colorbar."""
    heat = mp.matshow(desc, cmap="jet")
    mp.gcf().set_facecolor(np.ones(3) * 240 / 255)
    mp.title("Desc Calc", fontsize=20)
    mp.xlabel("X", fontsize=14)
    mp.ylabel("Y", fontsize=14)
    ax = mp.gca()
    # Major ticks every 8 cells; minor ticks use the locator default step.
    ax.xaxis.set_major_locator(mp.MultipleLocator(8))
    ax.xaxis.set_minor_locator(mp.MultipleLocator())
    ax.yaxis.set_major_locator(mp.MultipleLocator(8))
    ax.yaxis.set_minor_locator(mp.MultipleLocator())
    mp.tick_params(which="both", top=True, right=True,
                   labelsize=10, labeltop=False,
                   labelbottom=True)
    # Carve a slim axes off the right edge to host the colorbar.
    divider = mg.make_axes_locatable(ax)
    cax = divider.append_axes("right", "3%", pad="3%")
    bar = mp.colorbar(heat, cax=cax)
    bar.set_label('DESC', fontsize=14)
    mp.show()
def calc_image(image):
    """Detect STAR keypoints on *image* and return their SIFT descriptors."""
    detector = cv.xfeatures2d.StarDetector_create()
    keypoints = detector.detect(image)
    extractor = cv.xfeatures2d.SIFT_create()
    # Descriptors are computed on the grayscale image at the STAR keypoints.
    gray = cv.cvtColor(image, cv.COLOR_BGR2GRAY)
    _, desc = extractor.compute(gray, keypoints)
    return desc
def main(argc, argv, envp):
    """Show an image, compute its STAR/SIFT descriptors and plot them.

    Parameter names fixed: the function is called as
    main(len(sys.argv), sys.argv, os.environ), but the original signature
    named the first parameter `argv` and the second `argc` — swapped.
    """
    image = read_image("F406752152C6E98DE1611516FC243D77.jpg")
    image = resize_image(image)
    show_image("Circle", image)
    desc = calc_image(image)
    draw_desc(desc)
    cv.waitKey()
    return 0

if __name__ == "__main__":
    sys.exit(main(len(sys.argv), sys.argv, os.environ))
# Stemming and lemmatization (詞幹提取)
# Stem and lemmatize a fixed word list with four different NLTK tools.
import nltk.stem.porter as pt
import nltk.stem.lancaster as lc
import nltk.stem.snowball as sb
import nltk.stem as ns

words = ["tables", "probably", "wolves", "playing", "is",
         "dog", "the", "beaches", "grounded", "envision"]

# Porter: gentle and fast; stems are not guaranteed to be real words.
stemmer = pt.PorterStemmer()
for word in words:
    print(stemmer.stem(word))
print("-" * 72)

# Lancaster: the most aggressive of the three stemmers.
stemmer = lc.LancasterStemmer()
for word in words:
    print(stemmer.stem(word))
print("-" * 72)

# Snowball ("Porter2"): a middle ground between the two above.
stemmer = sb.SnowballStemmer("english")
for word in words:
    print(stemmer.stem(word))

# WordNet lemmatization: maps each word to its dictionary noun form.
lemmatizer = ns.WordNetLemmatizer()
for word in words:
    print(lemmatizer.lemmatize(word, "n"))
print("-" * 72)
# Text chunking (詞塊劃分): split the first 310 Brown-corpus words into
# groups of five and print them in aligned columns.
import nltk.corpus as nc

doc = " ".join(nc.brown.words()[:310])
print(doc)
words = doc.split()
print(words)

chunks = []
for word in words:
    # Open a new chunk at the start and whenever the current one is full.
    if len(chunks) == 0 or len(chunks[-1]) == 5:
        chunks.append([])
    chunks[-1].append(word)

for chunk in chunks:
    for word in chunk:
        print("{:15}".format(word), end="")
    print()
# Term-frequency matrix (詞頻矩陣)
# Sentence tokenization (first step toward a term-frequency matrix).
import nltk.tokenize as tk
import sklearn.feature_extraction as ft

doc = 'The brown dog is running. The black dog in the black room. Running in the is forbidden'
print(doc)
print('-' * 72)
sentences = tk.sent_tokenize(doc)
print(sentences)
# SIFT image feature extraction (SIFT圖像特徵提取)
# Detect SIFT keypoints and draw them (with size and orientation) on top of
# the source image.
import cv2 as cv
import numpy as np

image = cv.imread("F406752152C6E98DE1611516FC243D77.jpg")
gray = cv.cvtColor(image, cv.COLOR_BGR2GRAY)
cv.imshow("Gray", gray)

detector = cv.xfeatures2d.SIFT_create()
keypoints = detector.detect(gray)
# The rich flag draws circles scaled by keypoint size plus orientation lines.
cv.drawKeypoints(gray, keypoints, image,
                 flags=cv.DRAW_MATCHES_FLAGS_DRAW_RICH_KEYPOINTS)
cv.imshow("Img", image)
cv.waitKey()
# STAR feature detection (STAR特徵提取)
# Detect STAR keypoints and overlay them on the source image.
import numpy as np
import cv2 as cv

image = cv.imread('table.jpg')
cv.imshow('Original', image)
gray = cv.cvtColor(image, cv.COLOR_BGR2GRAY)
cv.imshow('Gray', gray)

detector = cv.xfeatures2d.StarDetector_create()
keypoints = detector.detect(gray)
cv.drawKeypoints(image, keypoints, image,
                 flags=cv.DRAW_MATCHES_FLAGS_DRAW_RICH_KEYPOINTS)
cv.imshow('Star Keypoints', image)
cv.waitKey()
# Pearson correlation, used as a recommendation score (皮爾遜係數/推薦係數)
import os
import sys
import json
import numpy as np
def calc_ps(ratings, user1, user2):
    """Pearson correlation between two users over their common movies.

    Returns 0 when the users share no movies or either rating set has
    zero variance (correlation undefined).
    """
    common = {movie for movie in ratings[user1] if movie in ratings[user2]}
    n = len(common)
    if n == 0:
        return 0
    x = np.array([ratings[user1][movie] for movie in common])
    y = np.array([ratings[user2][movie] for movie in common])
    # Corrected sums of squares / cross-products.
    sxx = (x ** 2).sum() - x.sum() ** 2 / n
    syy = (y ** 2).sum() - y.sum() ** 2 / n
    sxy = (x * y).sum() - x.sum() * y.sum() / n
    if sxx * syy == 0:
        return 0
    return sxy / np.sqrt(sxx * syy)
def read_data(filename):
    """Load the user -> movie -> score ratings mapping from a JSON file."""
    with open(filename, 'r') as f:
        return json.load(f)
def eval_ps(ratings):
    """Build the full user-by-user Pearson-score matrix.

    Returns (users, psmat) as numpy arrays, rows/columns in dict order.
    """
    users = list(ratings.keys())
    psmat = [[calc_ps(ratings, u1, u2) for u2 in users] for u1 in users]
    return np.array(users), np.array(psmat)
def find_similars(users, psmat, user, n_similars=None):
    """Return the other users ranked by Pearson score against *user*.

    *n_similars* caps the result length (None keeps everyone else).
    Returns (similar_users, similar_scores) in descending-score order.
    """
    user_index = np.arange(len(users))[users == user][0]
    # Highest score first; drop the user's own entry before truncating.
    ranked = psmat[user_index].argsort()[::-1]
    ranked = ranked[ranked != user_index][:n_similars]
    return users[ranked], psmat[user_index][ranked]
def calc_reco(ratings, user):
    """Recommend movies for *user*, weighted by similar users' ratings.

    Only positively-correlated users contribute. Movies the user already
    rated with a non-zero score are excluded. Returns an array of movie
    names, best-ranked first.
    """
    users, psmat = eval_ps(ratings)
    similar_users, similar_scores = find_similars(users, psmat, user)
    # Keep only users with a strictly positive correlation.
    positive = similar_scores > 0
    similar_users = similar_users[positive]
    similar_scores = similar_scores[positive]
    score_sums, weight_sums = {}, {}
    for i, similar_user in enumerate(similar_users):
        for movie, score in ratings[similar_user].items():
            # Skip movies this user has already (meaningfully) rated.
            if movie in ratings[user] and ratings[user][movie] != 0:
                continue
            score_sums[movie] = score_sums.get(movie, 0) + score * similar_scores[i]
            weight_sums[movie] = weight_sums.get(movie, 0) + similar_scores[i]
    # Similarity-weighted average rating per candidate movie.
    movie_ranks = {movie: total / weight_sums[movie]
                   for movie, total in score_sums.items()}
    order = np.array(list(movie_ranks.values())).argsort()[::-1]
    return np.array(list(movie_ranks.keys()))[order]
def main(argc, argv, envp):
    """Print a recommendation list for every user in ratings.json."""
    ratings = read_data('ratings.json')
    for user in ratings:
        print('{}: {}'.format(user, calc_reco(ratings, user)))
    return 0

if __name__ == '__main__':
    sys.exit(main(len(sys.argv), sys.argv, os.environ))
# Pipeline tuning: feature selection + classifier (特殊優化)
# Pipeline tuning: univariate feature selection feeding a random forest,
# scored by cross-validation, then the decision surface plotted over the
# two selected features.
import numpy as np
import sklearn.datasets as sd
import sklearn.feature_selection as fs
import sklearn.ensemble as se
import sklearn.pipeline as sp
import sklearn.model_selection as ms
import matplotlib.pyplot as mp

# sd.samples_generator was removed in scikit-learn 0.24; the generator now
# lives directly on sklearn.datasets.
x, y = sd.make_classification(
    n_informative=4, n_features=20, n_redundant=0,
    random_state=5)

# f_classif is the ANOVA F-test for *classification* targets; f_regression
# (originally used here) treats the class labels as a continuous target.
skb = fs.SelectKBest(fs.f_classif, k=5)
rfc = se.RandomForestClassifier(n_estimators=25,
                                max_depth=4)
model = sp.Pipeline([
    ('selector', skb), ('classifier', rfc)])
print(ms.cross_val_score(model, x, y, cv=10,
                         scoring='f1_weighted').mean())

# Retune the whole pipeline: 2 features, smaller forest.
model.set_params(selector__k=2,
                 classifier__n_estimators=10)
print(ms.cross_val_score(model, x, y, cv=10,
                         scoring='f1_weighted').mean())

model.fit(x, y)
# Keep only the two columns the selector chose, then refit on them so the
# model can be evaluated on a 2-D grid.
selected_mask = model.named_steps['selector'].get_support()
selected_indices = np.arange(x.shape[1])[selected_mask]
x = x[:, selected_indices]
model.fit(x, y)

# Dense grid over the 2-D feature space for the decision background.
l, r, h = x[:, 0].min() - 1, x[:, 0].max() + 1, 0.005
b, t, v = x[:, 1].min() - 1, x[:, 1].max() + 1, 0.005
grid_x = np.meshgrid(np.arange(l, r, h), np.arange(b, t, v))
flat_x = np.c_[grid_x[0].ravel(), grid_x[1].ravel()]
flat_y = model.predict(flat_x)
grid_y = flat_y.reshape(grid_x[0].shape)

mp.gcf().set_facecolor(np.ones(3) * 240 / 255)
mp.title('Pipeline', fontsize=20)
mp.xlabel('x', fontsize=14)
mp.ylabel('y', fontsize=14)
mp.tick_params(labelsize=10)
mp.pcolormesh(grid_x[0], grid_x[1], grid_y, cmap='Dark2')
mp.xlim(grid_x[0].min(), grid_x[0].max())
mp.ylim(grid_x[1].min(), grid_x[1].max())
mp.scatter(x[:, 0], x[:, 1], c=y, cmap='cool', s=80)
mp.show()