Imports
import csv
import numpy as np
import numpy.matlib
import pandas as pd
import scipy.sparse.linalg as ll
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
from sklearn import preprocessing
from scipy import stats
import seaborn as sns
df = pd.read_csv('data/n90pol.csv')
data = df.to_numpy()
y = data[:, 2]          # political orientation (1-5 scale)
data = data[:, :2]      # amygdala and acc volumes
m, n = data.shape
amygdala = data[:, 0]
acc = data[:, 1]
Bins are set to 7 to show the higher density of values in the left portion of the histogram.
_ = plt.hist(amygdala, bins=7)
plt.title("Histogram of Amygdala")
plt.show()
Bandwidth is set to 0.3 to show the smaller density peak at the left of the distribution.
ax = df['amygdala'].plot.kde(bw_method=0.3)
Bins are set to 9 to show the higher density of values in the right portion of the histogram.
_ = plt.hist(acc, bins=9)
plt.title("Histogram of Acc")
plt.show()
Bandwidth is set to 0.3 to show the smaller density peak at the right of the distribution.
ax = df['acc'].plot.kde(bw_method=0.3)
The numbers of bins for amygdala and acc are 7 and 9, respectively, because those choices reveal the main peak around (-0.01, -0.01), as well as the smaller amygdala peak near -0.05 and the smaller acc peak near 0.05.
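As a quick robustness check (a sketch of my own, not part of the original write-up), one could compare a few candidate bin counts side by side before settling on 7 and 9; the candidates 5, 7, and 11 below are illustrative choices only.
# Sketch: compare several bin counts for amygdala in one figure.
fig, axes = plt.subplots(1, 3, figsize=(12, 3))
for ax_i, b in zip(axes, [5, 7, 11]):
    ax_i.hist(amygdala, bins=b)
    ax_i.set_title("amygdala, bins={}".format(b))
plt.tight_layout()
plt.show()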
# adapted from: https://www.geeksforgeeks.org/plot-2-d-histogram-in-python-using-matplotlib/
x1 = data[:,0]
x2 = data[:,1]
# Creating bins
x1_min = np.min(x1)
x1_max = np.max(x1)
x2_min = np.min(x2)
x2_max = np.max(x2)
x1_bins = np.linspace(x1_min, x1_max, 7)   # 7 bin edges for amygdala
x2_bins = np.linspace(x2_min, x2_max, 9)   # 9 bin edges for acc
fig, ax = plt.subplots(figsize=(10, 7))
# Creating plot
plt.hist2d(x1, x2, bins=[x1_bins, x2_bins])
plt.title("2-D Histogram of (Amygdala, Acc)")
ax.set_xlabel('Amygdala')
ax.set_ylabel('Acc')
plt.colorbar()
# show plot
plt.tight_layout()
plt.show()
Based on the 2-dimensional kernel density estimate below, amygdala and acc do appear to be independent. Although the KDE shows bimodal peaks around 0 for both variables, the values of amygdala and acc do not follow any clear relationship with one another.
# adapted from : https://docs.scipy.org/doc/scipy/reference/
# generated/scipy.stats.gaussian_kde.html
x_1, x_2 = np.mgrid[x1_min:x1_max:100j, x2_min:x2_max:100j]
positions = np.vstack([x_1.ravel(), x_2.ravel()])
values = np.vstack([x1, x2])
kernel = stats.gaussian_kde(values, bw_method=0.45)
Z = np.reshape(kernel(positions).T, x_1.shape)
fig, ax = plt.subplots()
ax.imshow(np.rot90(Z), cmap=plt.cm.gist_earth_r,
          extent=[x1_min, x1_max, x2_min, x2_max])
ax.plot(x1, x2, 'k.', markersize=2)
ax.set_xlim([x1_min, x1_max])
ax.set_ylim([x2_min, x2_max])
plt.show()
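As a rough numeric complement to the visual check (a sketch of my own, not a formal independence test), the Pearson correlation between amygdala and acc can be computed; a correlation near zero is consistent with, though not proof of, independence.
# Sketch: linear association between amygdala and acc. Independence implies zero
# correlation, so a small, non-significant r is consistent with the claim above.
r, p_value = stats.pearsonr(amygdala, acc)
print("Pearson r = {:.3f}, p-value = {:.3f}".format(r, p_value))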
Estimated conditional distribution of amygdala, conditioned on political orientation
sns.kdeplot(data=df, x="amygdala", hue="orientation", bw_method=0.4)
Estimated conditional distribution of acc, conditioned on political orientation
sns.kdeplot(data=df, x="acc", hue="orientation", bw_method=0.4)
Political orientation is ranked from 1 (very conservative) to 5 (very liberal). In the conditional distributions above, more conservative orientations are associated with smaller acc volumes and larger amygdala volumes, whereas liberal orientations have amygdala and acc volumes closer to the center of each overall distribution. A kernel bandwidth of 0.4 was effective for viewing these associations.
Conditional sample means for amygdala and acc
df.groupby('orientation').mean().transpose()
| orientation | 2 | 3 | 4 | 5 |
|---|---|---|---|---|
| amygdala | 0.019062 | 0.000588 | -0.004720 | -0.005692 |
| acc | -0.014769 | 0.001671 | 0.001310 | 0.008142 |
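For context on how much weight to put on these differences, a sketch of my own (not part of the original analysis) adds the group sizes and spreads alongside the conditional means.
# Sketch: group counts, means and standard deviations per orientation, to gauge
# how reliable the differences in the conditional means above are.
df.groupby('orientation')[['amygdala', 'acc']].agg(['count', 'mean', 'std'])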
# adapted from: https://seaborn.pydata.org/generated/seaborn.kdeplot.html#seaborn.kdeplot
sns.kdeplot(data=df, x="amygdala", y="acc", hue="orientation", bw_method=0.7)
Using a kernel bandwidth of 0.7, I was able to limit the noise in the joint distribution and view the overall associations between orientation and amygdala/acc volumes. The joint distribution above confirms my earlier conclusions from the conditional distributions of acc and amygdala volume by orientation. The lighter-colored contours (those associated with conservative orientations) sit at larger amygdala volumes and smaller acc volumes than the contours for more liberal orientations. As before, the contours associated with liberal orientations are located more centrally in terms of both acc and amygdala volume.
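A numeric complement to the joint conditional KDE (again a sketch added here, not part of the original analysis) is the within-group correlation between amygdala and acc for each orientation.
# Sketch: per-orientation correlation between amygdala and acc.
df.groupby('orientation').apply(lambda g: g['amygdala'].corr(g['acc']))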
Import data (1990 examples, each stored as a column containing a 28x28 pixel image)
from scipy.stats import multivariate_normal as mvn
import scipy as sp
import scipy.io
from sklearn import preprocessing
from sklearn.cluster import KMeans
%matplotlib inline
mat = scipy.io.loadmat('data/data.mat')
label = scipy.io.loadmat('data/label.mat')
x = mat['data'].T.copy()       # transpose to shape (1990, 784)
y = label['trueLabel'].copy()  # shape (1, 1990)
Data is already scaled to the range (0, 1)
print(np.min(x), np.max(x))
0.0 1.0
(From lecture slides.)
# following cells are adapted from ISYE 6740 demo code
m, n = x.shape
C = np.matmul(x.T, x)/m
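A brief aside (my note, not from the demo code): C above is the uncentered second-moment matrix X^T X / m; a conventional covariance would subtract the per-pixel mean first. A minimal sketch of the centered variant, using hypothetical variables that are not used in the pipeline below:
# Sketch only (hypothetical variables, not used below): the centered covariance.
x_centered = x - x.mean(axis=0)
C_centered = x_centered.T @ x_centered / m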
PCA: project the data onto the top 4 principal directions
# pca the data
d = 4 # reduced dimension
V,_,_ = np.linalg.svd(C)
V = V[:, :d]
# project the data to the top 4 principal directions
x_proj = np.dot(x,V)
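As a sanity check (a sketch assuming sklearn is acceptable here), PCA from sklearn.decomposition should give a similar top-4 subspace, up to sign flips and the mean-centering that sklearn applies but the code above does not.
# Sketch: cross-check the projection with sklearn's PCA (which centers the data,
# unlike the SVD of the uncentered second-moment matrix above).
from sklearn.decomposition import PCA
pca = PCA(n_components=d)
x_proj_sk = pca.fit_transform(x)
print(pca.explained_variance_ratio_)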
# EM-GMM for digit data
# number of digit classes
K = 2
# random seed
seed = 42
# initialize prior
# np.random.seed(seed)
pi = np.random.random(K)
pi = pi/np.sum(pi)
# initial mean and covariance
# np.random.seed(seed)
mu = np.random.randn(K,4)
mu_old = mu.copy()
sigma = []
for ii in range(K):
    # to ensure the covariance is PSD
    # np.random.seed(seed)
    dummy = np.random.randn(4, 4)
    sigma.append(dummy @ dummy.T + np.eye(4))
# initialize the posterior
tau = np.full((m, K), fill_value=0.)
# initialize the likelihood
llh = np.full((m, K), fill_value=0.)
maxIter= 100
tol = 1e-2
plt.ion()
log_like = []
iterations = []
for ii in range(maxIter):
    # E-step: compute (unnormalized) responsibilities and per-point likelihoods
    for kk in range(K):
        tau[:, kk] = pi[kk] * mvn.pdf(x_proj, mu[kk], sigma[kk])
        llh[:, kk] = pi[kk] * mvn.pdf(x_proj, mu[kk], sigma[kk])
    # normalize tau
    sum_tau = np.sum(tau, axis=1)
    sum_tau.shape = (m, 1)
    tau = np.divide(tau, np.tile(sum_tau, (1, K)))
    # M-step
    for kk in range(K):
        # update prior
        pi[kk] = np.sum(tau[:, kk]) / m
        # update component mean
        mu[kk] = x_proj.T @ tau[:, kk] / np.sum(tau[:, kk], axis=0)
        # update covariance matrix
        dummy = x_proj - np.tile(mu[kk], (m, 1))  # X - mu
        sigma[kk] = dummy.T @ np.diag(tau[:, kk]) @ dummy / np.sum(tau[:, kk], axis=0)
    # print('-----iteration---', ii)
    # print('Log-Likelihood (sum):', np.sum(np.log(np.sum(llh, axis=1))))
    log_like.append(np.sum(np.log(np.sum(llh, axis=1))))
    iterations.append(ii)
    # stop when the component means stop moving
    if np.linalg.norm(mu - mu_old) < tol:
        print('training converged')
        break
    mu_old = mu.copy()
    if ii == maxIter - 1:
        print('max iteration reached')
        break
plt.plot(iterations, log_like)
plt.title("Log Likelihood vs Number of Iterations")
plt.ylabel('Log Likelihood (sum)')
plt.xlabel('Iterations')
plt.show()
training converged
Estimated mean of each Gaussian component
mu
array([[-6.86628478,  1.98684347,  0.06967933, -0.47731838],
       [-6.99692044, -2.19471701, -0.08265557,  0.31252929]])
Estimated component weights (pi)
pi
array([0.51227004, 0.48772996])
Estimated covariance matrix of each component
sigma
[array([[ 3.18358838, -1.32203456, -0.32780244, -0.29635426],
        [-1.32203456,  1.69363236,  0.34336186,  0.39520337],
        [-0.32780244,  0.34336186,  5.87158552,  1.93867412],
        [-0.29635426,  0.39520337,  1.93867412,  2.31983833]]),
 array([[ 2.97073272,  0.36093295,  0.26847198, -0.94429828],
        [ 0.36093295,  1.5566962 , -0.68745127,  1.26689877],
        [ 0.26847198, -0.68745127,  1.78470381, -1.97545337],
        [-0.94429828,  1.26689877, -1.97545337,  4.77066693]])]
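As a cross-check (a sketch, not part of the original assignment code), sklearn's GaussianMixture fit on the same 4-D projection should recover weights, means, and covariances close to the hand-coded EM above, up to the ordering of the components.
# Sketch: compare against sklearn's EM implementation on the same projected data.
from sklearn.mixture import GaussianMixture
gm = GaussianMixture(n_components=K, covariance_type='full', random_state=0).fit(x_proj)
print(gm.weights_)
print(gm.means_)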
Average image for the first Gaussian component
# adapted from:
# https://stackoverflow.com/questions/2659312/
# how-do-i-convert-a-numpy-array-to-and-display-an-image
# project the component mean back to pixel space and add back the mean of x
Gaussian_1 = (V.dot(mu[0]) + np.mean(x)).reshape((28, 28))
from matplotlib import pyplot as plt
plt.imshow(Gaussian_1.T, interpolation='nearest')
plt.show()
First covariance matrix as a heatmap
sns.set_theme()
ax = sns.heatmap(sigma[0])
Average image for the second Gaussian component
Gaussian_2 = (V.dot(mu[1]) + np.mean(x)).reshape((28, 28))
from matplotlib import pyplot as plt
plt.imshow(Gaussian_2.T, interpolation='nearest')
plt.show()
Second covariance matrix as a heatmap
ax = sns.heatmap(sigma[1])
# infer labels from tau: component 1 is mapped to digit 2 and component 0 to digit 6
GMM_labels = np.where(np.argmax(tau, axis=1) == 1, 2, 6)
GMM_labels
array([2, 2, 2, ..., 6, 6, 6])
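The component-to-digit mapping above is hard-coded; a small sketch (my addition, using the hypothetical variable GMM_labels_auto, not used below) could try both mappings and keep the one with the lower error.
# Sketch: pick the component-to-digit assignment with the lower error, instead of
# hard-coding it. GMM_labels_auto is a hypothetical variable, not used below.
true_labels = y[0]
candidates = [np.where(np.argmax(tau, axis=1) == 1, 2, 6),
              np.where(np.argmax(tau, axis=1) == 1, 6, 2)]
GMM_labels_auto = min(candidates, key=lambda lab: np.mean(lab != true_labels))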
# true labels, flattened to a vector; confirm the shape
y[0].shape
(1990,)
# adapted from: https://stackoverflow.com/questions/20402109/
# calculating-percentage-error-by-comparing-two-arrays/20402224
ind_2 = np.where(GMM_labels == 2)
# mis-classification rate among points labeled '2' by GMM
misclassification_rate = np.mean(GMM_labels[ind_2] != y[0][ind_2])
print("Mis-classification rate for '2' = {}%".format(round(misclassification_rate*100,3)))
Mis-classification rate for '2' = 0.923%
ind_6 = np.where(GMM_labels == 6)
# mis-classification rate among points labeled '6' by GMM
misclassification_rate = np.mean(GMM_labels[ind_6] != y[0][ind_6])
print("Mis-classification rate for '6' = {}%".format(round(misclassification_rate*100,3)))
Mis-classification rate for '6' = 6.502%
# run k-means clustering
kmeans = KMeans(n_clusters=2, random_state=0).fit(x)
kmeans_labels = np.where(kmeans.labels_ == 1, 2, 6)
kmeans_labels
array([2, 2, 2, ..., 2, 6, 6])
# adapted from: https://stackoverflow.com/questions/20402109/
# calculating-percentage-error-by-comparing-two-arrays/20402224
ind_2_km = np.where(kmeans_labels == 2)
# mis-classification rate among points labeled '2' by K-means
misclassification_rate = np.mean(kmeans_labels[ind_2_km] != y[0][ind_2_km])
print("Mis-classification rate for '2' = {}%".format(round(misclassification_rate*100,3)))
Mis-classification rate for '2' = 6.513%
# adapted from: https://stackoverflow.com/questions/20402109/
# calculating-percentage-error-by-comparing-two-arrays/20402224
ind_6_km = np.where(kmeans_labels == 6)
# mis-classification rate among points labeled '6' by K-means
misclassification_rate = np.mean(kmeans_labels[ind_6_km] != y[0][ind_6_km])
print("Mis-classification rate for '6' = {}%".format(round(misclassification_rate*100,3)))
Mis-classification rate for '6' = 5.92%
misclassification_rate = np.mean(GMM_labels != y[0])
print("Overall mis-classification rate for GMM = {}%".format(round(misclassification_rate*100,3)))
misclassification_rate = np.mean(kmeans_labels != y[0])
print("Overall mis-classification rate for K-means = {}%".format(round(misclassification_rate*100,3)))
Overall mis-classification rate for GMM = 3.769%
Overall mis-classification rate for K-means = 6.231%
When classifying the digit "2", GMM had a mis-classification rate of 0.9%, whereas K-means had a rate of 6.5%, so GMM performed better on "2".
When classifying the digit "6", GMM had a mis-classification rate of 6.5%, whereas K-means had a rate of 5.9%, so K-means performed slightly better on "6".
Overall, however, GMM outperforms K-means, with a total mis-classification rate of 3.8% versus 6.2%.
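A confusion-matrix view (a sketch added here, not part of the original write-up) summarizes the same comparison in one place.
# Sketch: confusion matrices (rows = true digit, columns = cluster label).
print(pd.crosstab(y[0], GMM_labels, rownames=['true'], colnames=['GMM']))
print(pd.crosstab(y[0], kmeans_labels, rownames=['true'], colnames=['K-means']))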