Imports
import csv
import numpy as np
import numpy.matlib
import pandas as pd
import scipy.sparse.linalg as ll
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
from sklearn import preprocessing
from scipy import stats
import seaborn as sns
df = pd.read_csv('data/n90pol.csv')
data = df.to_numpy()
y = data[:, 2]          # political orientation (1-5 scale)
data = data[:, :2]      # amygdala and acc volumes
m, n = data.shape
amygdala = data[:, 0]
acc = data[:, 1]
Bins are set to 7 to show the higher density of values in the left portion of the histogram.
_ = plt.hist(amygdala, bins=7)
plt.title("Histogram of Amygdala")
plt.show()
Bandwidth is set to 0.3 to show the smaller density peak at the left of the distribution.
ax = df['amygdala'].plot.kde(bw_method=0.3)
Bins are set to 9 to show the higher density of values in the right portion of the histogram.
_ = plt.hist(acc, bins=9)
plt.title("Histogram of Acc")
plt.show()
Bandwidth is set to 0.3 to show the smaller density peak at the right of the distribution.
ax = df['acc'].plot.kde(bw_method=0.3)
The numbers of bins for amygdala and acc are 7 and 9, respectively, because those choices reveal the main peak around (-0.01, -0.01), as well as the smaller amygdala peak near -0.05 and the smaller acc peak near 0.05.
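As a quick robustness check (a sketch of my own, not part of the original write-up), one could compare a few candidate bin counts side by side before settling on 7 and 9; the candidates 5, 7, and 11 below are illustrative choices only.
# Sketch: compare several bin counts for amygdala in one figure.
fig, axes = plt.subplots(1, 3, figsize=(12, 3))
for ax_i, b in zip(axes, [5, 7, 11]):
    ax_i.hist(amygdala, bins=b)
    ax_i.set_title("amygdala, bins={}".format(b))
plt.tight_layout()
plt.show()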
# adapted from: https://www.geeksforgeeks.org/plot-2-d-histogram-in-python-using-matplotlib/
x1 = data[:,0]
x2 = data[:,1]
# Creating bins
x1_min = np.min(x1)
x1_max = np.max(x1)
x2_min = np.min(x2)
x2_max = np.max(x2)
x1_bins = np.linspace(x1_min, x1_max, 7)   # 7 bin edges for amygdala
x2_bins = np.linspace(x2_min, x2_max, 9)   # 9 bin edges for acc
fig, ax = plt.subplots(figsize=(10, 7))
# Creating plot
plt.hist2d(x1, x2, bins=[x1_bins, x2_bins])
plt.title("2-D Histogram of (Amygdala, Acc)")
ax.set_xlabel('Amygdala')
ax.set_ylabel('Acc')
plt.colorbar()
# show plot
plt.tight_layout()
plt.show()
Based on the 2-dimensional kernel density estimate below, amygdala and acc do appear to be independent. Although the KDE shows bimodal peaks around 0 for both variables, the values of amygdala and acc do not follow any clear relationship with one another.
# adapted from : https://docs.scipy.org/doc/scipy/reference/
# generated/scipy.stats.gaussian_kde.html
x_1, x_2 = np.mgrid[x1_min:x1_max:100j, x2_min:x2_max:100j]
positions = np.vstack([x_1.ravel(), x_2.ravel()])
values = np.vstack([x1, x2])
kernel = stats.gaussian_kde(values, bw_method=0.45)
Z = np.reshape(kernel(positions).T, x_1.shape)
fig, ax = plt.subplots()
ax.imshow(np.rot90(Z), cmap=plt.cm.gist_earth_r,
          extent=[x1_min, x1_max, x2_min, x2_max])
ax.plot(x1, x2, 'k.', markersize=2)
ax.set_xlim([x1_min, x1_max])
ax.set_ylim([x2_min, x2_max])
plt.show()
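As a rough numeric complement to the visual check (a sketch of my own, not a formal independence test), the Pearson correlation between amygdala and acc can be computed; a correlation near zero is consistent with, though not proof of, independence.
# Sketch: linear association between amygdala and acc. Independence implies zero
# correlation, so a small, non-significant r is consistent with the claim above.
r, p_value = stats.pearsonr(amygdala, acc)
print("Pearson r = {:.3f}, p-value = {:.3f}".format(r, p_value))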
Estimated conditional distribution of amygdala, conditioned on political orientation
sns.kdeplot(data=df, x="amygdala", hue="orientation", bw_method=0.4)
Estimated conditional distribution of acc, conditioned on political orientation
sns.kdeplot(data=df, x="acc", hue="orientation", bw_method=0.4)
Political orientation is ranked from 1 (very conservative) to 5 (very liberal). In the conditional distributions above, more conservative orientations are associated with smaller acc volumes and larger amygdala volumes, whereas liberal orientations have amygdala and acc volumes closer to the center of each overall distribution. A kernel bandwidth of 0.4 was effective for viewing these associations.
Conditional sample means for amygdala and acc
df.groupby('orientation').mean().transpose()
| orientation | 2 | 3 | 4 | 5 |
|---|---|---|---|---|
| amygdala | 0.019062 | 0.000588 | -0.004720 | -0.005692 |
| acc | -0.014769 | 0.001671 | 0.001310 | 0.008142 |
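For context on how much weight to put on these differences, a sketch of my own (not part of the original analysis) adds the group sizes and spreads alongside the conditional means.
# Sketch: group counts, means and standard deviations per orientation, to gauge
# how reliable the differences in the conditional means above are.
df.groupby('orientation')[['amygdala', 'acc']].agg(['count', 'mean', 'std'])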
# adapted from: https://seaborn.pydata.org/generated/seaborn.kdeplot.html#seaborn.kdeplot
sns.kdeplot(data=df, x="amygdala", y="acc", hue="orientation", bw_method=0.7)
Using a kernel bandwidth of 0.7, I was able to limit the noise in the joint distribution and view the overall associations between orientation and amygdala/acc volumes. The joint distribution above confirms my earlier conclusions from the conditional distributions of acc and amygdala volume by orientation. The lighter-colored contours (those associated with conservative orientations) sit at larger amygdala volumes and smaller acc volumes than the contours for more liberal orientations. As before, the contours associated with liberal orientations are located more centrally in terms of both acc and amygdala volume.
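A numeric complement to the joint conditional KDE (again a sketch added here, not part of the original analysis) is the within-group correlation between amygdala and acc for each orientation.
# Sketch: per-orientation correlation between amygdala and acc.
df.groupby('orientation').apply(lambda g: g['amygdala'].corr(g['acc']))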
Import data (1990 examples, each stored as a column containing a 28x28 pixel image)
from scipy.stats import multivariate_normal as mvn
import scipy as sp
import scipy.io
from sklearn import preprocessing
from sklearn.cluster import KMeans
%matplotlib inline
mat = scipy.io.loadmat('data/data.mat')
label = scipy.io.loadmat('data/label.mat')
x = mat['data'].T.copy()       # transpose to shape (1990, 784)
y = label['trueLabel'].copy()  # shape (1, 1990)
Data is already scaled to the range (0, 1)
print(np.min(x), np.max(x))
0.0 1.0
(From lecture slides.)
# following cells are adapted from ISYE 6740 demo code
m, n = x.shape
C = np.matmul(x.T, x)/m
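A brief aside (my note, not from the demo code): C above is the uncentered second-moment matrix X^T X / m; a conventional covariance would subtract the per-pixel mean first. A minimal sketch of the centered variant, using hypothetical variables that are not used in the pipeline below:
# Sketch only (hypothetical variables, not used below): the centered covariance.
x_centered = x - x.mean(axis=0)
C_centered = x_centered.T @ x_centered / m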
PCA: project the data onto the top 4 principal directions
# pca the data
d = 4 # reduced dimension
V,_,_ = np.linalg.svd(C)
V = V[:, :d]
# project the data to the top 4 principal directions
x_proj = np.dot(x,V)
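As a sanity check (a sketch assuming sklearn is acceptable here), PCA from sklearn.decomposition should give a similar top-4 subspace, up to sign flips and the mean-centering that sklearn applies but the code above does not.
# Sketch: cross-check the projection with sklearn's PCA (which centers the data,
# unlike the SVD of the uncentered second-moment matrix above).
from sklearn.decomposition import PCA
pca = PCA(n_components=d)
x_proj_sk = pca.fit_transform(x)
print(pca.explained_variance_ratio_)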
# EM-GMM for digit data
# number of digit classes
K = 2
# random seed
seed = 42
# initialize prior
# np.random.seed(seed)
pi = np.random.random(K)
pi = pi/np.sum(pi)
# initial mean and covariance
# np.random.seed(seed)
mu = np.random.randn(K,4)
mu_old = mu.copy()
sigma = []
for ii in range(K):
    # to ensure the covariance is PSD
    # np.random.seed(seed)
    dummy = np.random.randn(4, 4)
    sigma.append(dummy @ dummy.T + np.eye(4))
# initialize the posterior
tau = np.full((m, K), fill_value=0.)
# initialize the likelihood
llh = np.full((m, K), fill_value=0.)
maxIter= 100
tol = 1e-2
plt.ion()
log_like = []
iterations = []
for ii in range(maxIter):
    # E-step: compute (unnormalized) responsibilities and per-point likelihoods
    for kk in range(K):
        tau[:, kk] = pi[kk] * mvn.pdf(x_proj, mu[kk], sigma[kk])
        llh[:, kk] = pi[kk] * mvn.pdf(x_proj, mu[kk], sigma[kk])
    # normalize tau
    sum_tau = np.sum(tau, axis=1)
    sum_tau.shape = (m, 1)
    tau = np.divide(tau, np.tile(sum_tau, (1, K)))
    # M-step
    for kk in range(K):
        # update prior
        pi[kk] = np.sum(tau[:, kk]) / m
        # update component mean
        mu[kk] = x_proj.T @ tau[:, kk] / np.sum(tau[:, kk], axis=0)
        # update covariance matrix
        dummy = x_proj - np.tile(mu[kk], (m, 1))  # X - mu
        sigma[kk] = dummy.T @ np.diag(tau[:, kk]) @ dummy / np.sum(tau[:, kk], axis=0)
    # print('-----iteration---', ii)
    # print('Log-Likelihood (sum):', np.sum(np.log(np.sum(llh, axis=1))))
    log_like.append(np.sum(np.log(np.sum(llh, axis=1))))
    iterations.append(ii)
    # stop when the component means stop moving
    if np.linalg.norm(mu - mu_old) < tol:
        print('training converged')
        break
    mu_old = mu.copy()
    if ii == maxIter - 1:
        print('max iteration reached')
        break
plt.plot(iterations, log_like)
plt.title("Log Likelihood vs Number of Iterations")
plt.ylabel('Log Likelihood (sum)')
plt.xlabel('Iterations')
plt.show()
training converged
Estimated mean of each Gaussian component
mu
array([[-6.86628478,  1.98684347,  0.06967933, -0.47731838],
       [-6.99692044, -2.19471701, -0.08265557,  0.31252929]])
Estimated component weights (pi)
pi
array([0.51227004, 0.48772996])
Estimated covariance matrix of each component
sigma
[array([[ 3.18358838, -1.32203456, -0.32780244, -0.29635426],
        [-1.32203456,  1.69363236,  0.34336186,  0.39520337],
        [-0.32780244,  0.34336186,  5.87158552,  1.93867412],
        [-0.29635426,  0.39520337,  1.93867412,  2.31983833]]),
 array([[ 2.97073272,  0.36093295,  0.26847198, -0.94429828],
        [ 0.36093295,  1.5566962 , -0.68745127,  1.26689877],
        [ 0.26847198, -0.68745127,  1.78470381, -1.97545337],
        [-0.94429828,  1.26689877, -1.97545337,  4.77066693]])]
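As a cross-check (a sketch, not part of the original assignment code), sklearn's GaussianMixture fit on the same 4-D projection should recover weights, means, and covariances close to the hand-coded EM above, up to the ordering of the components.
# Sketch: compare against sklearn's EM implementation on the same projected data.
from sklearn.mixture import GaussianMixture
gm = GaussianMixture(n_components=K, covariance_type='full', random_state=0).fit(x_proj)
print(gm.weights_)
print(gm.means_)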
Average image for the first Gaussian component
# adapted from:
# https://stackoverflow.com/questions/2659312/
# how-do-i-convert-a-numpy-array-to-and-display-an-image
# project the component mean back to pixel space and add back the mean of x
Gaussian_1 = (V.dot(mu[0]) + np.mean(x)).reshape((28, 28))
from matplotlib import pyplot as plt
plt.imshow(Gaussian_1.T, interpolation='nearest')
plt.show()
First covariance matrix as a heatmap
sns.set_theme()
ax = sns.heatmap(sigma[0])
Average image for the second Gaussian component
Gaussian_2 = (V.dot(mu[1]) + np.mean(x)).reshape((28, 28))
from matplotlib import pyplot as plt
plt.imshow(Gaussian_2.T, interpolation='nearest')
plt.show()
Second covariance matrix as a heatmap
ax = sns.heatmap(sigma[1])
# infer labels from tau: component 1 is mapped to digit 2 and component 0 to digit 6
GMM_labels = np.where(np.argmax(tau, axis=1) == 1, 2, 6)
GMM_labels
array([2, 2, 2, ..., 6, 6, 6])
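The component-to-digit mapping above is hard-coded; a small sketch (my addition, using the hypothetical variable GMM_labels_auto, not used below) could try both mappings and keep the one with the lower error.
# Sketch: pick the component-to-digit assignment with the lower error, instead of
# hard-coding it. GMM_labels_auto is a hypothetical variable, not used below.
true_labels = y[0]
candidates = [np.where(np.argmax(tau, axis=1) == 1, 2, 6),
              np.where(np.argmax(tau, axis=1) == 1, 6, 2)]
GMM_labels_auto = min(candidates, key=lambda lab: np.mean(lab != true_labels))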
# true labels, flattened to a vector; confirm the shape
y[0].shape
(1990,)
# adapted from: https://stackoverflow.com/questions/20402109/
# calculating-percentage-error-by-comparing-two-arrays/20402224
ind_2 = np.where(GMM_labels == 2)
# mis-classification rate among points labeled '2' by GMM
misclassification_rate = np.mean(GMM_labels[ind_2] != y[0][ind_2])
print("Mis-classification rate for '2' = {}%".format(round(misclassification_rate*100,3)))
Mis-classification rate for '2' = 0.923%
ind_6 = np.where(GMM_labels == 6)
# mis-classification rate among points labeled '6' by GMM
misclassification_rate = np.mean(GMM_labels[ind_6] != y[0][ind_6])
print("Mis-classification rate for '6' = {}%".format(round(misclassification_rate*100,3)))
Mis-classification rate for '6' = 6.502%
# run k-means clustering
kmeans = KMeans(n_clusters=2, random_state=0).fit(x)
kmeans_labels = np.where(kmeans.labels_ == 1, 2, 6)
kmeans_labels
array([2, 2, 2, ..., 2, 6, 6])
# adapted from: https://stackoverflow.com/questions/20402109/
# calculating-percentage-error-by-comparing-two-arrays/20402224
ind_2_km = np.where(kmeans_labels == 2)
# mis-classification rate among points labeled '2' by K-means
misclassification_rate = np.mean(kmeans_labels[ind_2_km] != y[0][ind_2_km])
print("Mis-classification rate for '2' = {}%".format(round(misclassification_rate*100,3)))
Mis-classification rate for '2' = 6.513%
# adapted from: https://stackoverflow.com/questions/20402109/
# calculating-percentage-error-by-comparing-two-arrays/20402224
ind_6_km = np.where(kmeans_labels == 6)
# mis-classification rate among points labeled '6' by K-means
misclassification_rate = np.mean(kmeans_labels[ind_6_km] != y[0][ind_6_km])
print("Mis-classification rate for '6' = {}%".format(round(misclassification_rate*100,3)))
Mis-classification rate for '6' = 5.92%
misclassification_rate = np.mean(GMM_labels != y[0])
print("Overall mis-classification rate for GMM = {}%".format(round(misclassification_rate*100,3)))
misclassification_rate = np.mean(kmeans_labels != y[0])
print("Overall mis-classification rate for K-means = {}%".format(round(misclassification_rate*100,3)))
Overall mis-classification rate for GMM = 3.769%
Overall mis-classification rate for K-means = 6.231%
When classifying the digit "2", GMM had a mis-classification rate of 0.9%, whereas K-means had a rate of 6.5%, so GMM performed better on "2".
When classifying the digit "6", GMM had a mis-classification rate of 6.5%, whereas K-means had a rate of 5.9%, so K-means performed slightly better on "6".
Overall, however, GMM outperforms K-means, with a total mis-classification rate of 3.8% versus 6.2%.
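A confusion-matrix view (a sketch added here, not part of the original write-up) summarizes the same comparison in one place.
# Sketch: confusion matrices (rows = true digit, columns = cluster label).
print(pd.crosstab(y[0], GMM_labels, rownames=['true'], colnames=['GMM']))
print(pd.crosstab(y[0], kmeans_labels, rownames=['true'], colnames=['K-means']))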