Natural Language and Image Processing

Sentiment Analysis

import nltk.corpus as nc
import nltk.classify as cf
import nltk.classify.util as cu


pdata = []
fileids = nc.movie_reviews.fileids("pos")
for fileid in fileids:
  feature = {}
  words = nc.movie_reviews.words(fileid)
  for word in words:
    feature[word] = True
  pdata.append((feature,"POSITIVE"))
ndata = []
nfileids = nc.movie_reviews.fileids("neg")
for fileid in nfileids:
  feature = {}
  words = nc.movie_reviews.words(fileid)
  for word in words:
    feature[word] = True
  ndata.append((feature,"NEGETIVE"))
pnum,nnum = int(0.8 * len(pdata)), int(0.8 * len(ndata))
train_data = pdata[:pnum] + ndata[:nnum]
test_data = pdata[pnum:] + ndata[nnum:]
model = cf.NaiveBayesClassifier.train(train_data)
ac = cu.accuracy(model,test_data)
print("%.2f%%"%round(ac*100,2))

tops = model.most_informative_features()  # most informative words (keywords)
for top in tops:
  print(top[0])

reviews = [
"It is a amazing movie.",
"This is a dull movie. I would never recommend it to anyone",
"The cinematography is pretty great in this movie.",
"This direction was  terrible and the story was all over the place."
]
sents, probs = [], []
for review in reviews:
  feature = {}
  words = review.split()
  for word in words:
     feature[word] = True
  pcls = model.prob_classify(feature)
  sent = pcls.max()
  prob = pcls.prob(sent)
  sents.append(sent)
  probs.append(prob)
for review, sent, prob in zip(reviews, sents, probs):
  print(review, "->", sent, "%.2f%%" % round(prob * 100, 2))
  

Gender Identification

import random
import numpy as np
import nltk.corpus as nc
import nltk.classify as cf

male_names = nc.names.words("male.txt")
female_names = nc.names.words("female.txt")

models, acs = [],[]
for n_letter in range(1,6):
  data = []
  for male_name in male_names:
    feature= {"feature":male_name[-n_letter:].lower()}
    data.append((feature,"male"))
  for female_name in female_names:
    feature = {"feature":female_name[-n_letter:].lower()}
    data.append((feature,"female"))
  random.seed(7)
  random.shuffle(data)
  train_data = data[:int(len(data)/2)]
  test_data = data[int(len(data)/2):]
  model = cf.NaiveBayesClassifier.train(train_data)
  ac = cf.accuracy(model,test_data)
  acs.append(ac)
  models.append(model)
best_index = np.array(acs).argmax()
best_letter = best_index + 1 
names = ["Leonardo","Amy","Sam","Tom","Katherine","Tayior",
         "Susanne","Watermelon","Alpaca","Paris","Python","JAVA"]
print(names)
genders = []
for name in names:
  feature = {"feature":name[-best_letter:].lower()}
  gender = models[best_index].classify(feature)
  genders.append(gender)
print(genders)
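It is also worth printing the held-out accuracy for each candidate suffix length before trusting the argmax. A small diagnostic sketch over the acs list built above:

for n_letter, ac in enumerate(acs, start=1):
  print("suffix length {}: {:.2f}% accuracy".format(n_letter, ac * 100))
print("best suffix length:", best_letter)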

Text Classification

import sklearn.datasets as sd
import sklearn.feature_extraction.text as ft
import sklearn.naive_bayes as nb

cld = {"misc.forsale":"SALES","rec.motorcycles":"MOTORCYLES",
       "rec.sport.baseball":"BASEBALL",
       "sci.crypt":"CRYPTOGRAPHY",
       "sci.space":"SPAVE"}
train = sd.fetch_20newsgroups(subset="train",
                                   categories=cld.keys(),random_state=7,
                                   shuffle = True)
train_data = train.data

train_y = train.target
categories = train.target_names
cv = ft.CountVectorizer()
train_fmt = cv.fit_transform(train_data)
tf = ft.TfidfTransformer()
train_x = tf.fit_transform(train_fmt)
model = nb.MultinomialNB()
model.fit(train_x,train_y)
test_data = ["The curveballs of right handed pitches tend to curver to the left",
             "Caesar cipher is an ancient from encryption",
             "This two-wheeler is really good on slippery roads"]
test_fmt = cv.transform(test_data)
test_x = tf.transform(test_fmt)
pred_y = model.predict(test_x)
for sentence, index in zip(test_data,pred_y):
    print(sentence,"->",cld[categories[index]])

3D Plot

import numpy as np
import matplotlib.pyplot as mp
from mpl_toolkits.mplot3d import axes3d

n_samples = 300

x = np.random.rand(n_samples)
y = np.random.rand(n_samples)
z = np.random.rand(n_samples)

ax = mp.gca(projection = "3d")

mp.title('Scatter 3D', fontsize=20)
ax.set_xlabel('x', fontsize=14)
ax.set_ylabel('y', fontsize=14)
ax.set_zlabel('z', fontsize=14)
mp.tick_params(labelsize=10)

ax.scatter(x, y, z, c=np.array([x, y, z]).T,
           s=100 * np.linalg.norm((x, y, z), axis=0))
mp.show()

Training (HMM Object Recognition)

import os
import sys
import warnings
import numpy as np
import cv2 as cv
import hmmlearn.hmm as hl


def show_image(title, image):
    cv.imshow(title, image)


def search_objects(directory):
    if not os.path.isdir(directory):
        raise IOError(
            "The directory '" + directory + "' doesn't exist!")
    objects = {}
    for curdir, subdirs, files in os.walk(directory):
        for jpeg in (file for file in files if file.endswith('.jpg')):
            path = os.path.join(curdir, jpeg)
            label = path.split(os.path.sep)[-2]
            if label not in objects:
                objects[label] = []
            objects[label].append(path)
    return objects


def read_image(filename):
    image = cv.imread(filename)
    return image


def resize_image(image, size):
    h, w = image.shape[:2]
    scale = size / min(h, w)
    image = cv.resize(image, None, fx=scale, fy=scale)
    return image


def calc_features(image):
    star = cv.xfeatures2d.StarDetector_create()
    keypoints = star.detect(image)
    sift = cv.xfeatures2d.SIFT_create()
    gray = cv.cvtColor(image, cv.COLOR_BGR2GRAY)
    keypoints, desc = sift.compute(gray, keypoints)
    return desc


def read_data(directory):
    objects = search_objects(directory)
    x, y, z = [], [], []
    for label, filenames in objects.items():
        z.append([])
        descs = np.array([])
        for filename in filenames:
            print(filename, '->', label)
            image = read_image(filename)
            z[-1].append(image)
            image = resize_image(image, 200)
            desc = calc_features(image)
            descs = desc if len(descs) == 0 else np.append(descs, desc, axis=0)
        x.append(descs)
        y.append(label)
    return x, y, z


def train_models(x, y):
    # Fit one GaussianHMM per label on that label's stacked SIFT descriptors;
    # at prediction time the label whose model scores highest wins.
    models = {}
    for descs, label in zip(x, y):
        model = hl.GaussianHMM(n_components=4, covariance_type='diag',
                               n_iter=1000)
        models[label] = model.fit(descs)
    return models


def pred_models(models, x):
    y = []
    for descs in x:
        best_score, best_label = None, None
        for label, model in models.items():
            score = model.score(descs)
            if best_score is None or score > best_score:
                best_score, best_label = score, label
        y.append(best_label)
    return y


def show_labels(labels, pred_labels, images):
    i = 0
    for label, pred_label, row in zip(labels, pred_labels, images):
        for image in row:
            i += 1
            show_image('{}: {} {} {}'.format(
                i, label, '==' if label == pred_label else '!=',
                pred_label), image)


def main(argc, argv, envp):
    warnings.filterwarnings(
        'ignore', category=DeprecationWarning)
    np.seterr(all='ignore')
    train_x, train_y, train_z = read_data(os.path.join('objects', 'training'))
    test_x, test_y, test_z = read_data(os.path.join('objects', 'testing'))
    models = train_models(train_x, train_y)
    pred_test_y = pred_models(models, test_x)
    show_labels(test_y, pred_test_y, test_z)
    cv.waitKey()
    return 0


if __name__ == '__main__':
    sys.exit(main(len(sys.argv), sys.argv, os.environ))

Feature Descriptors

import os
import sys
import cv2 as cv
import numpy as np
import matplotlib.pyplot as mp
import mpl_toolkits.axes_grid1 as mg

def resize_image(image):
    image = cv.resize(image, None, fx=2, fy=2)
    return image


def read_image(filename):
    image = cv.imread(filename)
    return image


def show_image(name, image):
    cv.imshow(name, image)


def draw_desc(desc):
    ma = mp.matshow(desc, cmap="jet")
    mp.gcf().set_facecolor(np.ones(3) * 240 / 255)
    mp.title("Desc Calc", fontsize=20)
    mp.xlabel("X", fontsize=14)
    mp.ylabel("Y", fontsize=14)
    ax = mp.gca()
    ax.xaxis.set_major_locator(mp.MultipleLocator(8))
    ax.xaxis.set_minor_locator(mp.MultipleLocator())
    ax.yaxis.set_major_locator(mp.MultipleLocator(8))
    ax.yaxis.set_minor_locator(mp.MultipleLocator())
    mp.tick_params(which="both", top=True, right=True,
                   labelsize=10, labeltop=False,
                   labelbottom=True)
    dv = mg.make_axes_locatable(ax)
    ca = dv.append_axes("right", "3%", pad="3%")
    cb = mp.colorbar(ma, cax=ca)
    cb.set_label("DESC", fontsize=14)
    mp.show()


def calc_image(image):
    star = cv.xfeatures2d.StarDetector_create()
    keypoints = star.detect(image)
    sift = cv.xfeatures2d.SIFT_create()
    gray = cv.cvtColor(image, cv.COLOR_BGR2GRAY)
    keypoints, desc = sift.compute(gray, keypoints)
    return desc

def main(argc, argv, envp):
    image = read_image("F406752152C6E98DE1611516FC243D77.jpg")
    image = resize_image(image)
    show_image("Circle",image)
    desc = calc_image(image)
    draw_desc(desc)
    cv.waitKey()
    return 0

if __name__ == "__main__":
    sys.exit(main(len(sys.argv), sys.argv, os.environ))


Stemming

import nltk.stem.porter as pt
import nltk.stem.lancaster as lc
import nltk.stem.snowball as sb
import nltk.stem as ns

words = ["tables", "probably", "wolves", "playing", "is",
         "dog", "the", "beaches", "grounded", "envision"]
# Porter: lenient, simple, and fast; does not guarantee grammatically valid stems
stemmer = pt.PorterStemmer()
for word in words:
    stem = stemmer.stem(word)
    print(stem)
print("-" * 72)
# Lancaster: strict and slower; the most aggressive of the three (may over-stem)
stemmer = lc.LancasterStemmer()
for word in words:
    stem = stemmer.stem(word)
    print(stem)
print("-" * 72)
# Snowball: a middle ground between Porter and Lancaster
stemmer = sb.SnowballStemmer("english")
for word in words:
    stem = stemmer.stem(word)
    print(stem)
print("-" * 72)
# Lemmatization: maps each word to its dictionary form (here treated as nouns)
lemmatizer = ns.WordNetLemmatizer()
for word in words:
    lemma = lemmatizer.lemmatize(word, "n")
    print(lemma)
print("-"*72)
# Chunking: split the text into fixed-size word chunks
import nltk.corpus as nc
doc = " ".join(nc.brown.words()[:310])
print(doc)
words = doc.split()
print(words)
chunks = []
for word in words:
    if len(chunks) == 0 or len(chunks[-1]) == 5:
        chunks.append([])
    chunks[-1].append(word)
for chunk in chunks:
    for word in chunk:
        print("{:15}".format(word),end = "")
    print()

Term Frequency Matrix

import nltk.tokenize as tk
import sklearn.feature_extraction as ft

doc = 'The brown dog is running. The black dog is in the black room. Running in the room is forbidden'
print(doc)
print('-' * 72)
sentences = tk.sent_tokenize(doc)
print(sentences)
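The heading promises a term frequency matrix, but the snippet stops at sentence tokenization. A minimal sketch of the missing step using the already-imported module (one row per sentence, one column per vocabulary term; on sklearn versions before 1.0, use get_feature_names() instead):

cv = ft.text.CountVectorizer()
tfmat = cv.fit_transform(sentences).toarray()
print(cv.get_feature_names_out())  # vocabulary, in column order
print(tfmat)                       # the term frequency matrix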

SIFT Image Feature Extraction

import cv2 as cv
import numpy as np

image = cv.imread("F406752152C6E98DE1611516FC243D77.jpg")
gray = cv.cvtColor(image, cv.COLOR_BGR2GRAY)
cv.imshow("Gray",gray)
detector = cv.xfeatures2d.SIFT_create()
keypoints = detector.detect(gray)

cv.drawKeypoints(gray, keypoints,image, flags=cv.DRAW_MATCHES_FLAGS_DRAW_RICH_KEYPOINTS)
cv.imshow("Img",image)
cv.waitKey()
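If the descriptor matrix itself is needed rather than just the keypoint visualization, SIFT also exposes detectAndCompute, which returns keypoints and descriptors in one call. A minimal sketch:

keypoints, desc = detector.detectAndCompute(gray, None)
print(desc.shape)  # (number of keypoints, 128); SIFT descriptors are 128-dimensional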

STAR Feature Extraction

import cv2 as cv
image = cv.imread('table.jpg')
cv.imshow('Original', image)
gray = cv.cvtColor(image, cv.COLOR_BGR2GRAY)
cv.imshow('Gray', gray)
detector = cv.xfeatures2d.StarDetector_create()
keypoints = detector.detect(gray)
cv.drawKeypoints(image, keypoints, image,
                 flags=cv.DRAW_MATCHES_FLAGS_DRAW_RICH_KEYPOINTS)
cv.imshow('Star Keypoints', image)
cv.waitKey()

Pearson Score (Recommendation Similarity)

import os
import sys
import json
import numpy as np


def calc_ps(ratings, user1, user2):
    movies = set()
    for movie in ratings[user1]:
        if movie in ratings[user2]:
            movies.add(movie)
    n = len(movies)
    if n == 0:
        return 0
    x = np.array([ratings[user1][movie] for movie in movies])
    y = np.array([ratings[user2][movie] for movie in movies])
    sx = x.sum()
    sy = y.sum()
    xx = (x**2).sum()
    yy = (y**2).sum()
    xy = (x * y).sum()
    sxx = xx - sx**2 / n
    syy = yy - sy**2 / n
    sxy = xy - sx * sy / n
    if sxx * syy == 0:
        return 0
    pearson_score = sxy / np.sqrt(sxx * syy)
    return pearson_score
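
# Sanity check (an assumption for illustration, not part of the original
# script): the closed-form sums above should agree with NumPy's built-in
# correlation, up to floating-point error, whenever the common ratings
# have non-zero variance.
def calc_ps_numpy(ratings, user1, user2):
    movies = [m for m in ratings[user1] if m in ratings[user2]]
    if len(movies) < 2:
        return 0
    x = np.array([ratings[user1][m] for m in movies])
    y = np.array([ratings[user2][m] for m in movies])
    if x.std() == 0 or y.std() == 0:
        return 0
    return np.corrcoef(x, y)[0, 1]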


def read_data(filename):
    with open(filename, 'r') as f:
        ratings = json.loads(f.read())
    return ratings


def eval_ps(ratings):
    users, psmat = list(ratings.keys()), []
    for user1 in users:
        psrow = []
        for user2 in users:
            psrow.append(calc_ps(ratings, user1, user2))
        psmat.append(psrow)
    users = np.array(users)
    psmat = np.array(psmat)
    return users, psmat


def find_similars(users, psmat, user, n_similars=None):
    user_index = np.arange(len(users))[users == user][0]
    sorted_indices = psmat[user_index].argsort()[::-1]
    similar_indices = sorted_indices[
        sorted_indices != user_index][:n_similars]
    similar_users = users[similar_indices]
    similar_scores = psmat[user_index][similar_indices]
    return similar_users, similar_scores


def calc_reco(ratings, user):
    users, psmat = eval_ps(ratings)
    similar_users, similar_scores = find_similars(
        users, psmat, user)
    positive_mask = similar_scores > 0
    similar_users = similar_users[positive_mask]
    similar_scores = similar_scores[positive_mask]
    score_sums, weight_sums = {}, {}
    for i, similar_user in enumerate(similar_users):
        for movie, score in ratings[similar_user].items():
            if movie not in ratings[user].keys() or \
                    ratings[user][movie] == 0:
                if movie not in score_sums.keys():
                    score_sums[movie] = 0
                score_sums[movie] += score * similar_scores[i]
                if movie not in weight_sums.keys():
                    weight_sums[movie] = 0
                weight_sums[movie] += similar_scores[i]
    movie_ranks = {movie: score_sum / weight_sums[movie]
                   for movie, score_sum in score_sums.items()}
    sorted_indices = np.array(
        list(movie_ranks.values())).argsort()[::-1]
    reco = np.array(
        list(movie_ranks.keys()))[sorted_indices]
    return reco


def main(argc, argv, envp):
    ratings = read_data('ratings.json')
    for user in ratings.keys():
        reco = calc_reco(ratings, user)
        print('{}: {}'.format(user, reco))
    return 0


if __name__ == '__main__':
    sys.exit(main(len(sys.argv), sys.argv, os.environ))

Pipeline Optimization (Feature Selection)

import numpy as np
import sklearn.datasets as sd
import sklearn.feature_selection as fs
import sklearn.ensemble as se
import sklearn.pipeline as sp
import sklearn.model_selection as ms
import matplotlib.pyplot as mp


x, y = sd.make_classification(
    n_informative=4, n_features=20, n_redundant=0,
    random_state=5)
skb = fs.SelectKBest(fs.f_classif, k=5)  # f_classif suits a classification target
rfc = se.RandomForestClassifier(n_estimators=25,
                                max_depth=4)
model = sp.Pipeline([
    ('selector', skb), ('classifier', rfc)])
print(ms.cross_val_score(model, x, y, cv=10,
                         scoring='f1_weighted').mean())
model.set_params(selector__k=2,
                 classifier__n_estimators=10)
print(ms.cross_val_score(model, x, y, cv=10,
                         scoring='f1_weighted').mean())
model.fit(x, y)
selected_mask = model.named_steps['selector'].get_support()
selected_indices = np.arange(x.shape[1])[selected_mask]
x = x[:, selected_indices]
model.fit(x, y)
l, r, h = x[:, 0].min() - 1, x[:, 0].max() + 1, 0.005
b, t, v = x[:, 1].min() - 1, x[:, 1].max() + 1, 0.005
grid_x = np.meshgrid(np.arange(l, r, h), np.arange(b, t, v))
flat_x = np.c_[grid_x[0].ravel(), grid_x[1].ravel()]
flat_y = model.predict(flat_x)
grid_y = flat_y.reshape(grid_x[0].shape)
mp.gcf().set_facecolor(np.ones(3) * 240 / 255)
mp.title('Pipeline', fontsize=20)
mp.xlabel('x', fontsize=14)
mp.ylabel('y', fontsize=14)
mp.tick_params(labelsize=10)
mp.pcolormesh(grid_x[0], grid_x[1], grid_y, cmap='Dark2')
mp.xlim(grid_x[0].min(), grid_x[0].max())
mp.ylim(grid_x[1].min(), grid_x[1].max())
mp.scatter(x[:, 0], x[:, 1], c=y, cmap='cool', s=80)
mp.show()
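Rather than probing parameter combinations by hand with set_params, a grid search over the pipeline's named parameters is the systematic route. A sketch using the same modules; the grid is illustrative, and it runs on freshly generated 20-feature data because x was sliced down to 2 selected columns above:

x_full, y_full = sd.make_classification(
    n_informative=4, n_features=20, n_redundant=0, random_state=5)
params = {"selector__k": [2, 5, 10],
          "classifier__n_estimators": [10, 25, 50]}
searcher = ms.GridSearchCV(model, params, cv=10, scoring="f1_weighted")
searcher.fit(x_full, y_full)
print(searcher.best_params_)
print(searcher.best_score_)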
