Importing libraries and loading a .bmp image as a NumPy array
import numpy as np
from PIL import Image
import time
import random
import matplotlib.pyplot as plt
from scipy.sparse import csc_matrix, find
from itertools import combinations
import copy
from skimage import io
# import .bmp image as a numpy array
#img = np.array(Image.open('data/football.bmp'))
img = np.array(Image.open('data/GeorgiaTech.bmp'))
Reshaping the array from (400, 400, 3) to (160000, 3) in order to represent the image as one RGB pixel per row
Referenced: https://stackoverflow.com/questions/10439104/reading-bmp-files-in-python
# image shape is (400, 400, 3)
print(img.shape)
# reshape to 1 row per pixel
pixels = img.reshape(img.shape[0]*img.shape[1], 3)
# data shape is now (160000, 3)
print(pixels.shape)
(400, 400, 3) (160000, 3)
n = pixels.shape[1]
k = 2
c = np.zeros((k, n))
# randomly initialize each cluster center from a sampled pixel (values clipped to a max of 245)
for i in range(k):
    c[i, 0] = np.clip(pixels[random.randint(0, pixels.shape[0] - 1), 0], 0, 245)
    c[i, 1] = np.clip(pixels[random.randint(0, pixels.shape[0] - 1), 1], 0, 245)
    c[i, 2] = np.clip(pixels[random.randint(0, pixels.shape[0] - 1), 2], 0, 245)
c
array([[ 64., 43., 93.], [ 70., 153., 92.]])
c_updated = copy.deepcopy(c) + 10
labels = np.zeros(pixels.shape[0])
start_time = time.time()
# iterate until the centroids stop moving significantly (Frobenius norm of the change <= 10)
while np.linalg.norm(c - c_updated, ord='fro') > 10:
    c = c_updated.copy()
    # updating label assignments: each pixel goes to its nearest centroid
    for i in range(pixels.shape[0]):
        dist = np.zeros(c.shape[0])
        for j in range(c.shape[0]):
            dist[j] = np.linalg.norm(pixels[i] - c[j])
        labels[i] = np.argmin(dist)
    # adjust cluster centers to the mean of their assigned pixels
    # (nan_to_num guards against empty clusters, whose mean is NaN)
    for i in range(c.shape[0]):
        subset = pixels[np.where(labels == i)]
        c_updated[i] = (int(np.nan_to_num(subset[:, 0].mean())),
                        int(np.nan_to_num(subset[:, 1].mean())),
                        int(np.nan_to_num(subset[:, 2].mean())))
    #print(c[0])
    print(c_updated[0])
print("--- %s seconds ---" % (time.time() - start_time))
#print(c)
#print(labels[50:100])
#print(pixels[np.where(labels == 0)])
[77. 60. 41.] [62. 56. 44.] [60. 56. 45.] --- 9.789340734481812 seconds ---
h, w = img.shape[0], img.shape[1]
labels = np.reshape(labels, (h, w))
output = np.ones((h, w, 3), dtype = np.uint8)
output[:,:,:] = c[labels.astype(int),:]
io.imshow(output)
<matplotlib.image.AxesImage at 0x20772a2cb88>
The k_means_compress function below takes 2 inputs (an image array and k, the number of desired clusters) and returns 2 outputs (each pixel's assigned cluster label and the values of the cluster centroids).
The k_means_compress function has 3 main steps:
1) Randomly initialize cluster centers
2) Repeat two sub-steps until the centroid locations stop changing significantly: 1) assign each pixel to its nearest cluster center, and 2) move each cluster center to the average pixel value of the pixels assigned to it
3) Display the compressed image using a number of distinct pixel values equal to the number of cluster centers
Note: I used the image array as the input, rather than the pixels array, in order to retain the dimensions of the original image for outputting a compressed image. This can easily be switched back to inputs=(pixels, k) by performing the pixels reshape outside of the k_means_compress function, but I thought it was helpful to display the compressed image.
def k_means_compress(image, k):
    pixels = image.reshape(image.shape[0]*image.shape[1], 3)
    n = pixels.shape[1]
    c = np.zeros((k, n))
    # randomly initialize each cluster center from a sampled pixel (values clipped to a max of 245)
    for i in range(k):
        c[i, 0] = np.clip(pixels[random.randint(0, pixels.shape[0] - 1), 0], 0, 245)
        c[i, 1] = np.clip(pixels[random.randint(0, pixels.shape[0] - 1), 1], 0, 245)
        c[i, 2] = np.clip(pixels[random.randint(0, pixels.shape[0] - 1), 2], 0, 245)
    c_updated = copy.deepcopy(c) + 10
    labels = np.zeros(pixels.shape[0])
    start_time = time.time()
    iterations = 0
    # from kmeans demo code
    while np.linalg.norm(c - c_updated, ord='fro') > 10:
        c = c_updated.copy()
        # updating label assignments
        for i in range(pixels.shape[0]):
            dist = np.zeros(c.shape[0])
            for j in range(c.shape[0]):
                dist[j] = np.linalg.norm(pixels[i] - c[j])
            labels[i] = np.argmin(dist)
        # adjust cluster centers (nan_to_num guards against empty clusters)
        for i in range(c.shape[0]):
            subset = pixels[np.where(labels == i)]
            c_updated[i] = (int(np.nan_to_num(subset[:, 0].mean())),
                            int(np.nan_to_num(subset[:, 1].mean())),
                            int(np.nan_to_num(subset[:, 2].mean())))
        iterations += 1
    print("--- %s seconds ---" % (time.time() - start_time))
    print("--- %s iterations ---" % (iterations))
    # rebuild and display the compressed image from the final centroids
    h, w = image.shape[0], image.shape[1]
    cluster_assignment = labels.copy()
    labels = np.reshape(labels, (h, w))
    output = np.ones((h, w, 3), dtype=np.uint8)
    output[:, :, :] = c[labels.astype(int), :]
    io.imshow(output)
    return cluster_assignment, c_updated
Below are the runtimes and iterations for k-means clustering convergence for the football.bmp image compression:
Compressed images are displayed in order below, along with 1) each pixel's class and 2) centroid locations.
# import .bmp image as a numpy array
img_football = np.array(Image.open('data/football.bmp'))
k_means_compress(img_football, 2)
--- 9.3910231590271 seconds --- --- 2 iterations ---
(array([1., 1., 1., ..., 1., 1., 1.]), array([[192., 185., 175.], [ 77., 78., 68.]]))
k_means_compress(img_football, 4)
--- 132.76101970672607 seconds --- --- 10 iterations ---
(array([1., 1., 2., ..., 1., 1., 1.]), array([[211., 214., 215.], [121., 115., 91.], [ 36., 42., 46.], [177., 162., 144.]]))
k_means_compress(img_football, 8)
--- 138.3072111606598 seconds --- --- 9 iterations ---
(array([5., 5., 5., ..., 7., 7., 7.]), array([[214., 219., 223.], [110., 23., 20.], [186., 174., 157.], [165., 119., 94.], [ 27., 28., 25.], [ 82., 80., 61.], [ 24., 63., 116.], [115., 132., 105.]]))
k_means_compress(img_football, 16)
--- 343.7740070819855 seconds --- --- 11 iterations ---
(array([0., 0., 0., ..., 3., 3., 3.]), array([[ 85., 87., 68.], [ 92., 106., 121.], [191., 188., 181.], [117., 139., 87.], [165., 108., 84.], [ 15., 36., 67.], [130., 143., 155.], [ 42., 38., 24.], [217., 223., 226.], [131., 97., 77.], [ 64., 62., 45.], [ 14., 14., 15.], [ 22., 66., 123.], [170., 142., 114.], [139., 18., 23.], [191., 168., 141.]]))
Below are the runtimes and iterations for k-means clustering convergence for the GeorgiaTech.bmp image compression:
Compressed images are displayed in order below, along with 1) each pixel's class and 2) centroid locations.
# import .bmp image as a numpy array
img_gatech = np.array(Image.open('data/GeorgiaTech.bmp'))
k_means_compress(img_gatech, 2)
--- 11.551669120788574 seconds --- --- 4 iterations ---
(array([1., 1., 1., ..., 0., 0., 0.]), array([[ 60., 56., 46.], [172., 161., 153.]]))
k_means_compress(img_gatech, 4)
--- 20.681691884994507 seconds --- --- 4 iterations ---
(array([1., 1., 1., ..., 2., 2., 2.]), array([[ 99., 97., 84.], [166., 175., 184.], [ 42., 40., 34.], [207., 144., 92.]]))
k_means_compress(img_gatech, 8)
--- 64.97340726852417 seconds --- --- 7 iterations ---
(array([0., 0., 0., ..., 3., 3., 3.]), array([[231., 211., 197.], [100., 92., 89.], [ 46., 60., 66.], [ 39., 32., 24.], [143., 160., 177.], [209., 145., 94.], [ 83., 128., 155.], [119., 92., 36.]]))
k_means_compress(img_gatech, 16)
C:\Users\emag3\Anaconda3\lib\site-packages\ipykernel_launcher.py:43: RuntimeWarning: Mean of empty slice. C:\Users\emag3\Anaconda3\lib\site-packages\ipykernel_launcher.py:44: RuntimeWarning: Mean of empty slice. C:\Users\emag3\Anaconda3\lib\site-packages\ipykernel_launcher.py:45: RuntimeWarning: Mean of empty slice.
--- 154.70340085029602 seconds --- --- 9 iterations ---
(array([11., 11., 11., ..., 7., 7., 7.]), array([[ 33., 29., 26.], [208., 156., 109.], [ 22., 20., 17.], [140., 160., 179.], [144., 118., 100.], [ 12., 10., 10.], [140., 106., 41.], [ 56., 43., 29.], [209., 184., 170.], [ 88., 87., 88.], [ 91., 75., 31.], [251., 244., 236.], [ 39., 60., 72.], [ 4., 2., 3.], [235., 118., 44.], [ 86., 132., 160.]]))
Below are the runtimes and iterations for k-means clustering convergence for the XING_B24.bmp image compression:
Compressed images are displayed in order below, along with 1) each pixel's class and 2) centroid locations.
# import .bmp image as a numpy array
img_flower = np.array(Image.open('data/XING_B24.bmp'))
#img_flower = img_flower.reshape(img_flower.shape[0]*img_flower.shape[1], 3)
k_means_compress(img_flower, 2)
--- 2.8300390243530273 seconds --- --- 4 iterations ---
(array([0., 0., 0., ..., 0., 0., 0.]), array([[124., 75., 74.], [200., 183., 171.]]))
k_means_compress(img_flower, 4)
--- 9.271649837493896 seconds --- --- 7 iterations ---
(array([2., 2., 2., ..., 2., 2., 2.]), array([[230., 52., 99.], [226., 210., 167.], [ 73., 88., 63.], [121., 191., 190.]]))
k_means_compress(img_flower, 8)
--- 14.494359493255615 seconds --- --- 6 iterations ---
(array([7., 7., 7., ..., 2., 2., 2.]), array([[237., 38., 93.], [112., 136., 98.], [ 45., 53., 40.], [153., 50., 67.], [222., 216., 170.], [102., 215., 243.], [245., 86., 126.], [ 69., 99., 65.]]))
k_means_compress(img_flower, 16)
C:\Users\emag3\Anaconda3\lib\site-packages\ipykernel_launcher.py:43: RuntimeWarning: Mean of empty slice. C:\Users\emag3\Anaconda3\lib\site-packages\ipykernel_launcher.py:44: RuntimeWarning: Mean of empty slice. C:\Users\emag3\Anaconda3\lib\site-packages\ipykernel_launcher.py:45: RuntimeWarning: Mean of empty slice.
--- 28.054656505584717 seconds --- --- 6 iterations ---
(array([15., 15., 9., ..., 0., 0., 0.]), array([[ 43., 55., 43.], [214., 36., 80.], [246., 252., 150.], [248., 89., 129.], [105., 50., 51.], [ 31., 32., 21.], [144., 164., 129.], [212., 194., 191.], [102., 216., 244.], [ 80., 112., 76.], [156., 43., 65.], [170., 88., 94.], [106., 134., 97.], [250., 42., 102.], [187., 161., 106.], [ 59., 88., 58.]]))
The k_means_compress_poor function below is an exact replication of the k_means_compress function above, but with a poor centroid initialization strategy.
The poor initialization strategy caps each initial centroid value at 100 (instead of the 245 used in k_means_compress). This leads to more localized initial cluster centers and makes the algorithm more likely to converge to a poor local optimum when minimizing the L2-norm objective function.
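For reference (standard notation, not taken from the assignment), the L2-norm objective being minimized is the usual k-means distortion, where $x_i$ is a pixel, $c_j$ is a centroid, and $\pi(i)$ is the cluster assigned to pixel $i$:

$$\min_{\pi,\, c}\ \sum_{i=1}^{m} \left\lVert x_i - c_{\pi(i)} \right\rVert_2^2$$

A cramped initialization makes it easier for the alternating assign/update steps to settle into a poor local minimum of this objective.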
def k_means_compress_poor(image, k):
    pixels = image.reshape(image.shape[0]*image.shape[1], 3)
    n = pixels.shape[1]
    c = np.zeros((k, n))
    # poor initialization: cap each sampled centroid value at 100 instead of 245
    for i in range(k):
        c[i, 0] = np.clip(pixels[random.randint(0, pixels.shape[0] - 1), 0], 0, 100)
        c[i, 1] = np.clip(pixels[random.randint(0, pixels.shape[0] - 1), 1], 0, 100)
        c[i, 2] = np.clip(pixels[random.randint(0, pixels.shape[0] - 1), 2], 0, 100)
    c_updated = copy.deepcopy(c) + 10
    labels = np.zeros(pixels.shape[0])
    start_time = time.time()
    iterations = 0
    # from kmeans demo code
    while np.linalg.norm(c - c_updated, ord='fro') > 10:
        c = c_updated.copy()
        # updating label assignments
        for i in range(pixels.shape[0]):
            dist = np.zeros(c.shape[0])
            for j in range(c.shape[0]):
                dist[j] = np.linalg.norm(pixels[i] - c[j])
            labels[i] = np.argmin(dist)
        # adjust cluster centers (nan_to_num guards against empty clusters)
        for i in range(c.shape[0]):
            subset = pixels[np.where(labels == i)]
            c_updated[i] = (int(np.nan_to_num(subset[:, 0].mean())),
                            int(np.nan_to_num(subset[:, 1].mean())),
                            int(np.nan_to_num(subset[:, 2].mean())))
        iterations += 1
    print("--- %s seconds ---" % (time.time() - start_time))
    print("--- %s iterations ---" % (iterations))
    h, w = image.shape[0], image.shape[1]
    cluster_assignment = labels.copy()
    labels = np.reshape(labels, (h, w))
    output = np.ones((h, w, 3), dtype=np.uint8)
    output[:, :, :] = c[labels.astype(int), :]
    io.imshow(output)
    return cluster_assignment, c_updated
After multiple runs using the sample image and k=4, random cluster initialization is clearly superior to the poor initialization strategy.
Metrics from each initialization strategy, along with the accompanying compressed images, are below.
Although both methods needed the same number of iterations, the run using random initialization consistently required slightly more runtime. This is likely because the poor initialization converges to a nearby local optimum before the random initialization can reach better-separated (and more useful) centroids. As seen in the images below, the image produced via random initialization has more distinct color groupings, indicating a more effective spread of initial cluster centers.
# random initialization
k_means_compress(img_flower, 4)
--- 7.513904809951782 seconds --- --- 6 iterations ---
(array([0., 0., 0., ..., 0., 0., 0.]), array([[ 73., 88., 63.], [230., 52., 98.], [102., 216., 244.], [209., 198., 158.]]))
# poor initialization
k_means_compress_poor(img_flower, 4)
--- 7.271868467330933 seconds --- --- 6 iterations ---
(array([0., 0., 0., ..., 0., 0., 0.]), array([[ 57., 73., 51.], [109., 116., 87.], [234., 52., 100.], [190., 211., 186.]]))
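One way to make the comparison above quantitative (not part of the original runs) is to evaluate the L2 objective on the assignments returned by each function; a sketch, assuming k_means_compress and k_means_compress_poor are defined as above and img_flower is already loaded:
def kmeans_objective(pixels, labels, centroids):
    # total squared distance from each pixel to its assigned centroid (lower is better)
    return np.sum((pixels - centroids[labels.astype(int)]) ** 2)

pixels_flower = img_flower.reshape(-1, 3).astype(float)
labels_rand, c_rand = k_means_compress(img_flower, 4)
labels_poor, c_poor = k_means_compress_poor(img_flower, 4)
print('random init objective:', kmeans_objective(pixels_flower, labels_rand, c_rand))
print('poor init objective:  ', kmeans_objective(pixels_flower, labels_poor, c_poor))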
The k_means_compress_manhattan function below is an exact replication of the k_means_compress function above, but it uses Manhattan (L1) distance as the distance metric and moves each centroid to the median of its cluster instead of the mean.
Comments: Both the L2-norm and L1-norm methods produced compressed images of similar appearance. However, the L1-norm method generally required more iterations and a longer runtime to converge, and this effect became more pronounced as the number of clusters increased.
The increase in runtime and iterations when moving from Euclidean to Manhattan distance is largely a result of using the same convergence threshold for both methods. Realistically, the clustering using Manhattan distance could use a higher convergence threshold (and therefore take less runtime) while remaining about as effective as the Euclidean-distance version.
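As a side note on runtime: most of the cost in these implementations comes from the per-pixel Python loops rather than from the distance metric itself. The assignment step could be vectorized with NumPy broadcasting; a sketch (not used in the runs below), assuming pixels has shape (num_pixels, 3) and c has shape (k, 3):
# pairwise Manhattan (L1) distances between every pixel and every centroid
dist = np.abs(pixels[:, None, :] - c[None, :, :]).sum(axis=2)   # shape (num_pixels, k)
labels = dist.argmin(axis=1)                                     # nearest centroid per pixel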
def k_means_compress_manhattan(image, k):
    pixels = image.reshape(image.shape[0]*image.shape[1], 3)
    n = pixels.shape[1]
    c = np.zeros((k, n))
    for i in range(k):
        c[i, 0] = np.clip(pixels[random.randint(0, pixels.shape[0] - 1), 0], 0, 245)
        c[i, 1] = np.clip(pixels[random.randint(0, pixels.shape[0] - 1), 1], 0, 245)
        c[i, 2] = np.clip(pixels[random.randint(0, pixels.shape[0] - 1), 2], 0, 245)
    c_updated = copy.deepcopy(c) + 10
    labels = np.zeros(pixels.shape[0])
    start_time = time.time()
    iterations = 0
    # convergence check now uses Manhattan (L1) distance between old and new centroids
    while np.sum(np.abs(c - c_updated)) > 10:
        c = c_updated.copy()
        # updating label assignments
        for i in range(pixels.shape[0]):
            dist = np.zeros(c.shape[0])
            for j in range(c.shape[0]):
                # now using Manhattan distance
                dist[j] = np.sum(np.abs(pixels[i] - c[j]))
            labels[i] = np.argmin(dist)
        # adjust cluster centers (now using the median of each cluster)
        for i in range(c.shape[0]):
            subset = pixels[np.where(labels == i)]
            c_updated[i] = (int(np.nan_to_num(np.median(subset[:, 0]))),
                            int(np.nan_to_num(np.median(subset[:, 1]))),
                            int(np.nan_to_num(np.median(subset[:, 2]))))
        iterations += 1
    print("--- %s seconds ---" % (time.time() - start_time))
    print("--- %s iterations ---" % (iterations))
    h, w = image.shape[0], image.shape[1]
    cluster_assignment = labels.copy()
    labels = np.reshape(labels, (h, w))
    output = np.ones((h, w, 3), dtype=np.uint8)
    output[:, :, :] = c[labels.astype(int), :]
    io.imshow(output)
    return cluster_assignment, c_updated
Below are the runtimes and iterations for k-means clustering convergence for the football.bmp image compression (using Manhattan distance):
Compressed images are displayed in order below, along with 1) each pixel's class and 2) centroid locations.
k_means_compress_manhattan(img_football, 2)
--- 22.76741075515747 seconds --- --- 5 iterations ---
(array([0., 0., 0., ..., 0., 0., 0.]), array([[ 77., 80., 73.], [194., 184., 173.]]))
k_means_compress_manhattan(img_football, 4)
--- 38.06762194633484 seconds --- --- 5 iterations ---
(array([1., 1., 1., ..., 1., 1., 1.]), array([[183., 165., 149.], [116., 112., 87.], [216., 219., 221.], [ 31., 41., 35.]]))
k_means_compress_manhattan(img_football, 8)
C:\Users\emag3\Anaconda3\lib\site-packages\numpy\core\fromnumeric.py:3373: RuntimeWarning: Mean of empty slice. out=out, **kwargs)
--- 192.64198303222656 seconds --- --- 14 iterations ---
(array([1., 1., 1., ..., 3., 3., 3.]), array([[160., 106., 84.], [ 69., 68., 51.], [ 20., 22., 20.], [114., 130., 93.], [ 25., 64., 116.], [183., 159., 137.], [191., 184., 181.], [220., 223., 224.]]))
k_means_compress_manhattan(img_football, 16)
--- 552.2890989780426 seconds --- --- 19 iterations ---
(array([ 7., 7., 7., ..., 11., 11., 11.]), array([[ 55., 52., 37.], [100., 99., 76.], [ 33., 33., 24.], [183., 159., 140.], [ 57., 6., 7.], [119., 129., 139.], [191., 185., 183.], [ 74., 75., 57.], [ 38., 74., 132.], [ 16., 19., 20.], [ 6., 8., 8.], [120., 142., 89.], [217., 199., 129.], [220., 224., 225.], [ 10., 48., 92.], [162., 109., 86.]]))
Below are the runtimes and iterations for k-means clustering convergence for the GeorgiaTech.bmp image compression (using Manhattan distance):
Compressed images are displayed in order below, along with 1) each pixel's class and 2) centroid locations.
k_means_compress_manhattan(img_gatech, 2)
--- 9.383878231048584 seconds --- --- 3 iterations ---
(array([0., 0., 0., ..., 1., 1., 1.]), array([[165., 158., 165.], [ 51., 51., 33.]]))
k_means_compress_manhattan(img_gatech, 4)
--- 22.123305082321167 seconds --- --- 4 iterations ---
(array([2., 2., 2., ..., 1., 1., 1.]), array([[136., 157., 175.], [ 39., 34., 27.], [222., 173., 140.], [ 96., 91., 78.]]))
k_means_compress_manhattan(img_gatech, 8)
--- 218.72900366783142 seconds --- --- 21 iterations ---
(array([2., 2., 2., ..., 5., 4., 4.]), array([[136., 159., 181.], [ 90., 123., 143.], [255., 255., 255.], [204., 180., 172.], [ 35., 29., 24.], [ 62., 64., 63.], [116., 95., 59.], [213., 146., 96.]]))
k_means_compress_manhattan(img_gatech, 16)
--- 374.9333162307739 seconds --- --- 20 iterations ---
(array([0., 0., 0., ..., 5., 2., 2.]), array([[255., 255., 255.], [152., 121., 99.], [ 52., 41., 29.], [ 30., 25., 22.], [ 37., 52., 61.], [ 85., 71., 32.], [136., 161., 183.], [231., 200., 182.], [219., 168., 126.], [179., 169., 172.], [ 60., 73., 84.], [ 88., 133., 160.], [130., 101., 37.], [206., 149., 96.], [101., 93., 90.], [245., 118., 42.]]))
Below are the runtimes and iterations for k-means clustering convergence for the XING_B24.bmp image compression (using Manhattan distance):
Compressed images are displayed in order below, along with 1) each pixel's class and 2) centroid locations.
k_means_compress_manhattan(img_flower, 2)
--- 4.951285123825073 seconds --- --- 7 iterations ---
C:\Users\emag3\Anaconda3\lib\site-packages\skimage\io\_plugins\matplotlib_plugin.py:75: UserWarning: Low image data range; displaying image with stretched contrast. warn("Low image data range; displaying image with "
(array([0., 0., 0., ..., 0., 0., 0.]), array([[ 74., 95., 66.], [244., 63., 110.]]))
k_means_compress_manhattan(img_flower, 4)
--- 7.715327024459839 seconds --- --- 6 iterations ---
(array([1., 1., 1., ..., 1., 1., 1.]), array([[243., 49., 98.], [ 69., 92., 63.], [211., 192., 147.], [102., 216., 244.]]))
k_means_compress_manhattan(img_flower, 8)
--- 18.755235195159912 seconds --- --- 8 iterations ---
(array([5., 5., 5., ..., 4., 4., 4.]), array([[ 86., 116., 81.], [128., 146., 108.], [205., 41., 80.], [247., 206., 162.], [ 43., 51., 42.], [ 62., 90., 61.], [255., 58., 111.], [102., 216., 244.]]))
k_means_compress_manhattan(img_flower, 16)
--- 30.149405479431152 seconds --- --- 7 iterations ---
(array([ 2., 15., 2., ..., 12., 12., 12.]), array([[255., 42., 102.], [112., 139., 101.], [ 69., 103., 68.], [ 87., 118., 83.], [247., 253., 147.], [160., 163., 130.], [ 36., 41., 34.], [208., 63., 94.], [102., 216., 244.], [202., 195., 190.], [145., 49., 65.], [220., 26., 77.], [ 45., 57., 46.], [ 58., 73., 58.], [255., 82., 128.], [ 55., 89., 53.]]))
import numpy as np
import time
import matplotlib.pyplot as plt
from scipy.sparse import csc_matrix, find
from itertools import combinations
import copy
Create adjacency matrix using the simple graph provided:
A = np.matrix([[0, 1, 1, 0, 0],
[1, 0, 1, 0, 0],
[1, 1, 0, 0, 0],
[0, 0, 0, 0, 1],
[0, 0, 0, 1, 0]])
A
matrix([[0, 1, 1, 0, 0], [1, 0, 1, 0, 0], [1, 1, 0, 0, 0], [0, 0, 0, 0, 1], [0, 0, 0, 1, 0]])
Generate the degree matrix from the adjacency matrix: each diagonal entry is the degree (number of adjacent nodes) of the corresponding node in the 5-node graph.
D = np.matrix([[2, 0, 0, 0, 0],
[0, 2, 0, 0, 0],
[0, 0, 2, 0, 0],
[0, 0, 0, 1, 0],
[0, 0, 0, 0, 1]])
D
matrix([[2, 0, 0, 0, 0], [0, 2, 0, 0, 0], [0, 0, 2, 0, 0], [0, 0, 0, 1, 0], [0, 0, 0, 0, 1]])
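Equivalently, the degree matrix can be computed directly from the adjacency matrix by summing each row; a quick consistency check, assuming A and D from the cells above:
D_check = np.diag(np.asarray(A.sum(axis=1)).flatten())
print(np.array_equal(D_check, D))   # True: matches the hand-built degree matrix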
L = D - A
L
matrix([[ 2, -1, -1, 0, 0], [-1, 2, -1, 0, 0], [-1, -1, 2, 0, 0], [ 0, 0, 0, 1, -1], [ 0, 0, 0, -1, 1]])
from scipy.linalg import eig
eig_val, eig_vec = eig(L)
print('eigenvalues: ')
print(eig_val)
print('eigenvectors: ')
print(eig_vec)
eigenvalues: 
[ 3.00000000e+00+0.j -1.11022302e-16+0.j  3.00000000e+00+0.j
  2.00000000e+00+0.j  0.00000000e+00+0.j]
eigenvectors: 
[[ 0.81649658 -0.57735027  0.30959441  0.          0.        ]
 [-0.40824829 -0.57735027 -0.80910101  0.          0.        ]
 [-0.40824829 -0.57735027  0.49950661  0.          0.        ]
 [ 0.          0.          0.          0.70710678  0.70710678]
 [ 0.          0.          0.         -0.70710678  0.70710678]]
Eigenvectors associated with the zero eigenvalue:
print('zero eigenvalue: ')
print(eig_val[1])
print('eigenvector: ')
print(eig_vec[:, 1])   # eigenvectors are the columns of eig_vec
print('\nzero eigenvalue: ')
print(eig_val[4])
print('eigenvector: ')
print(eig_vec[:, 4])
zero eigenvalue: 
(-1.1102230246251565e-16+0j)
eigenvector: 
[-0.57735027 -0.57735027 -0.57735027  0.          0.        ]

zero eigenvalue: 
0j
eigenvector: 
[0.         0.         0.         0.70710678 0.70710678]
The multiplicity of the eigenvalue 0 is 2, corresponding to the 2 connected components in the graph.
These 2 eigenvectors encode the cluster assignments. The first eigenvector has nonzero values in positions 0, 1, and 2, corresponding to the cluster of nodes 1, 2, and 3 in the graph. The second eigenvector has nonzero values in positions 3 and 4, corresponding to the cluster of nodes 4 and 5 in the graph.
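A short sketch making that reading explicit, assuming eig_val and eig_vec from the cells above are still in scope: count the (numerically) zero eigenvalues, then read each component's membership off the nonzero entries of the corresponding eigenvector columns.
zero_idx = np.where(np.isclose(eig_val.real, 0))[0]
print('multiplicity of eigenvalue 0:', len(zero_idx))            # 2 connected components
for idx in zero_idx:
    component = np.where(~np.isclose(eig_vec[:, idx].real, 0))[0]
    print('component (0-indexed nodes):', component)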
#code below is adapted from the provided demo code
import os
import numpy as np
from os.path import abspath, exists
from scipy import sparse
import scipy
from sklearn.cluster import KMeans
from matplotlib import pyplot as plt
def output_file(a, idx2name, c_idx):
    dirpath = os.getcwd()
    node_file = dirpath + '//nodes.csv'
    edge_file = dirpath + '//edges.csv'
    with open(edge_file, 'w') as fid:
        fid.write('Source\tTarget\n')
        for i in range(len(a)):
            fid.write(f'{a[i,0]}\t{a[i,1]}\n')
    with open(node_file, 'w') as fid:
        fid.write('Id\tLabel\tColor\n')
        for i in range(len(idx2name)):
            fid.write(f'{i}\t{idx2name[i]}\t{c_idx[i]}\n')
def read_team_name():
    # read inverse_teams.txt file
    f_path = abspath("inverse_teams.txt")
    idx2name = []
    if exists(f_path):
        with open(f_path) as fid:
            for line in fid.readlines():
                name = line.split("\t", 1)[1]
                idx2name.append(name[:-1])
    return idx2name
def import_graph():
    # read the graph from 'play_graph.txt'
    f_path = abspath("play_graph.txt")
    if exists(f_path):
        with open(f_path) as graph_file:
            lines = [line.split() for line in graph_file]
        return np.array(lines).astype(int)
# load the graph
a = import_graph()
n = 321
i = a[:, 0]-1
j = a[:, 1]-1
v = np.ones((a.shape[0], 1)).flatten()
A = sparse.coo_matrix((v, (i, j)), shape=(n, n))
A = (A + np.transpose(A))/2
A = sparse.csc_matrix.todense(A) # ## convert to dense matrix
D = np.diag(1/np.sqrt(np.sum(A, axis=1)).A1) # taking -1/2 power of diagonal matrix
L = D @ A @ D # normalized Laplacian, therefore want to find the largest eigenvalues
L = np.array(L) # ## convert to array
# v, x = np.linalg.eig(L)
# x = x[:, 0:k].real
# eigendecomposition
v, x = np.linalg.eig(L)
idx_sorted = np.argsort(v)  # indices of the eigenvalues sorted ascending
#v[idx_sorted].real[::-1] # eigenvalues sorted from largest to smallest
The eigendecomposition of the normalized Laplacian matrix is above. Below are the plotted eigenvalues (ranked largest to smallest).
Based on the plot, I would choose approximately 17 clusters. With the normalized Laplacian's eigendecomposition one wants to select the k largest eigenvalues, and after the first 17 or so sorted eigenvalues there is a large drop in value (from ~0.6 to ~0.4). I would treat this drop as a natural cutoff for the "large" eigenvalues.
plt.plot(v[idx_sorted].real[::-1])
[<matplotlib.lines.Line2D at 0x203d2d72f88>]
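As a sanity check on that visual reading, the drop between consecutive sorted eigenvalues (the eigengap) can be computed directly; a sketch, assuming v and idx_sorted from the cell above:
eigs_desc = v[idx_sorted].real[::-1]   # eigenvalues sorted largest to smallest
gaps = -np.diff(eigs_desc[:30])        # size of the drop after each of the top eigenvalues
print('largest drop occurs after eigenvalue #', np.argmax(gaps) + 1)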
The cells below display the largest and smallest cluster sizes for k = 5, 7, 10, and 17 (my approximation)
# eigendecomposition
v, x = np.linalg.eig(L)
idx_sorted = np.argsort(v)  # indices of the eigenvalues sorted ascending
#v[idx_sorted].real[::-1] # eigenvalues sorted from largest to smallest
# spectral clustering
k = 5
x = x[:, idx_sorted[-k:]] # select the k largest eigenvectors
x = x/np.repeat(np.sqrt(np.sum(x*x, axis=1).reshape(-1, 1)), k, axis=1)
# # scatter
# plt.scatter(x[:, 0], x[:, 1])
# plt.show()
# convert x to real numbers
x = x.real
# k-means with k=5
kmeans = KMeans(n_clusters=k).fit(x)
c_idx = kmeans.labels_
c_idx
print('Largest cluster size: ', np.where(c_idx==np.bincount(c_idx).argmax())[0].shape[0])
print('Smallest cluster size: ', np.where(c_idx==np.bincount(c_idx).argmin())[0].shape[0])
Largest cluster size: 106 Smallest cluster size: 36
# eigendecomposition
v, x = np.linalg.eig(L)
idx_sorted = np.argsort(v)  # indices of the eigenvalues sorted ascending
#v[idx_sorted].real[::-1] # eigenvalues sorted from largest to smallest
# spectral clustering
k = 7
x = x[:, idx_sorted[-k:]] # select the k largest eigenvectors
x = x/np.repeat(np.sqrt(np.sum(x*x, axis=1).reshape(-1, 1)), k, axis=1)
# # scatter
# plt.scatter(x[:, 0], x[:, 1])
# plt.show()
# convert x to real numbers
x = x.real
# k-means with k=7
kmeans = KMeans(n_clusters=k).fit(x)
c_idx = kmeans.labels_
c_idx
print('Largest cluster size: ', np.where(c_idx==np.bincount(c_idx).argmax())[0].shape[0])
print('Smallest cluster size: ', np.where(c_idx==np.bincount(c_idx).argmin())[0].shape[0])
Largest cluster size: 94 Smallest cluster size: 18
# eigendecomposition
v, x = np.linalg.eig(L)
idx_sorted = np.argsort(v)  # indices of the eigenvalues sorted ascending
#v[idx_sorted].real[::-1] # eigenvalues sorted from largest to smallest
# spectral clustering
k = 10
x = x[:, idx_sorted[-k:]] # select the k largest eigenvectors
x = x/np.repeat(np.sqrt(np.sum(x*x, axis=1).reshape(-1, 1)), k, axis=1)
# # scatter
# plt.scatter(x[:, 0], x[:, 1])
# plt.show()
# convert x to real numbers
x = x.real
# k-means with k=10
kmeans = KMeans(n_clusters=k).fit(x)
c_idx = kmeans.labels_
c_idx
print('Largest cluster size: ', np.where(c_idx==np.bincount(c_idx).argmax())[0].shape[0])
print('Smallest cluster size: ', np.where(c_idx==np.bincount(c_idx).argmin())[0].shape[0])
Largest cluster size: 63 Smallest cluster size: 18
# eigendecomposition
v, x = np.linalg.eig(L)
idx_sorted = np.argsort(v)  # indices of the eigenvalues sorted ascending
#v[idx_sorted].real[::-1] # eigenvalues sorted from largest to smallest
# spectral clustering
k = 17
x = x[:, idx_sorted[-k:]] # select the k largest eigenvectors
x = x/np.repeat(np.sqrt(np.sum(x*x, axis=1).reshape(-1, 1)), k, axis=1)
# # scatter
# plt.scatter(x[:, 0], x[:, 1])
# plt.show()
# convert x to real numbers
x = x.real
# k-means with k=17
kmeans = KMeans(n_clusters=k).fit(x)
c_idx = kmeans.labels_
c_idx
print('Largest cluster size: ', np.where(c_idx==np.bincount(c_idx).argmax())[0].shape[0])
print('Smallest cluster size: ', np.where(c_idx==np.bincount(c_idx).argmin())[0].shape[0])
Largest cluster size: 28 Smallest cluster size: 10
Cluster results for k = 10, listing the teams that belong to each cluster
# eigendecomposition
v, x = np.linalg.eig(L)
idx_sorted = np.argsort(v)  # indices of the eigenvalues sorted ascending
#v[idx_sorted].real[::-1] # eigenvalues sorted from largest to smallest
# spectral clustering
k = 10
x = x[:, idx_sorted[-k:]] # select the k largest eigenvectors
x = x/np.repeat(np.sqrt(np.sum(x*x, axis=1).reshape(-1, 1)), k, axis=1)
# # scatter
# plt.scatter(x[:, 0], x[:, 1])
# plt.show()
# convert x to real numbers
x = x.real
# k-means with k=10
kmeans = KMeans(n_clusters=k).fit(x)
c_idx = kmeans.labels_
# show cluster
idx2name = read_team_name()
for i in range(k):
    print(f'Cluster {i+1}\n***************')
    idx = [index for index, t in enumerate(c_idx) if t == i]
    for index in idx:
        print(idx2name[index])
    print('\n')
# output file
output_file(a, idx2name, c_idx)
print('Largest cluster size: ', np.where(c_idx==np.bincount(c_idx).argmax())[0].shape[0])
print('Smallest cluster size: ', np.where(c_idx==np.bincount(c_idx).argmin())[0].shape[0])
Cluster 1 *************** Virginia-Lynchburg Alcorn St. Faith U. Mississippi Valley St. Southern Ark.-Pine Bluff Jackson St. Alabama St. Grambling Alabama A&M Prairie View A&M Texas Southern Tuskegee Concordia Central Methodist Stillman College Louisiana College Central State (Ohio) Cluster 2 *************** Georgia State Texas A&M South Carolina Wake Forest La.-Monroe Ole Miss New Mexico St. Vanderbilt Syracuse Virginia Pittsburgh Navy Troy UAB Appalachian St. Kentucky Georgia Tech Ga. Southern N. Carolina St. Boston College Missouri Alabama Virginia Tech Arkansas Auburn Clemson Georgia Duke North Carolina UL Lafayette Arkansas St. Texas State Southern Miss Mississippi St. Florida St. LSU Tennessee Miami (Fla.) Louisville South Alabama Florida Idaho Cluster 3 *************** E. Illinois Missouri St. E. Kentucky SE Missouri St. W. Illinois Kentucky Chr. Tenn. Tech S. Illinois Union College (KY) Murray St. Jacksonville St. N. Iowa UT Martin N. Dakota St. Youngstown St. Indiana St. South Dakota St. Austin Peay Edward Waters Tennessee St. South Dakota William Penn Illinois St. Cumberland (TN) West Alabama Wisconsin-Oshkosh Kentucky Wesleyan Cluster 4 *************** Presbyterian Chattanooga Charlotte Reinhardt Mercer Point U Charleston So. Wofford Monmouth VMI Coastal Carolina The Citadel Liberty Samford Warner W. Carolina Gardner-Webb Furman Johnson C. Smith Brevard College Newberry Bluefield State Catawba N. Greenville Ave Maria Virginia-Wise Wesley Cluster 5 *************** Stony Brook Howard Villanova Delaware Delaware St. Hampton James Madison William & Mary S.C. State Benedict Elon Morgan St. Towson Richmond Norfolk St. Maine Bethune-Cook. Savannah St. Florida A&M NC Central NC A&T Elizabeth City State Bowie State Morehouse Fort Valley State American International Miles Chowan Cluster 6 *************** Sam Houston St. E. Washington Abil Chr. Northwestern St. Idaho State Cal-Poly North Dakota Weber St. Nicholls St. So. Utah UC-Davis Portland St. Montana Montana St. Sacramento St. Incarnate Word N. Arizona Cent. Arkansas S.F. Austin Lamar SE Louisiana Montana-Western Texas College McNeese St. Central Washington McMurry University Houston Baptist Fort Lewis Black Hills State N. Colorado Western Oregon Chadron State New Mexico Highlands Henderson State Texas A&M-Commerce CSU-Pueblo Menlo Mississippi College Arkansas Tech Cluster 7 *************** Campbell Missouri Baptist Valparaiso Taylor Morehead St. Marist Stetson Grand View U. Drake College of Faith Davidson Jacksonville Dayton Pikeville College Butler Wittenberg Florida Tech Truman State St. Joseph's (IN) Western New Mexico San Diego Birmingham-Southern William Jewell Limestone College Cluster 8 *************** Utah Boise St. San Jose St. Washington St. Arizona St. BYU Colorado St. Colorado UNLV Arizona UCLA Air Force Nevada California Notre Dame Stanford Oregon St. Wyoming San Diego St. Fresno St. USC New Mexico Oregon Washington Hawaii Utah St. Cluster 9 *************** Minnesota Northern Illinois C. Michigan Akron Tulane Tulsa Rutgers Temple Connecticut Michigan St. Bowling Green W. Kentucky Texas-San Antonio Houston Penn St. UCF W. Michigan Purdue Iowa Ohio St. Michigan Iowa St. Illinois Indiana Ball St. Massachusetts Northwestern FAU Nebraska Old Dominion Marshall Miami (Ohio) Maryland Rice Buffalo West Virginia E. Michigan Ohio Kent St. TCU Memphis Louisiana Tech Oklahoma FIU South Florida M. Tenn. St. Texas Tech Toledo Kansas St. UTEP North Texas Texas Oklahoma St. 
East Carolina Wisconsin SMU Baylor Army Kansas Cincinnati Cluster 10 *************** Bryant Robert Morris Wagner Georgetown Colgate Duquesne Bucknell Holy Cross Albany Central Conn. St. Sacred Heart St. Francis Fordham New Hampshire Lehigh Clarion Merrimack Lafayette Rhode Island Assumption Harvard Yale Brown Columbia Cornell Penn Princeton Dartmouth West Liberty State Alderson Broaddus Largest cluster size: 60 Smallest cluster size: 18
Results differ slightly between runs of k-means clustering with k=10 because of how the clusters are initialized. The default initialization strategy, 'k-means++', speeds up convergence compared to purely random initialization, but it is still randomized, so the resulting clusters change slightly from run to run.
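If reproducible clusters are desired, scikit-learn's KMeans accepts a random_state argument that fixes the (k-means++) initialization; a minimal sketch, reusing the x matrix from the k=10 cell above:
kmeans = KMeans(n_clusters=10, random_state=0).fit(x)
c_idx = kmeans.labels_   # identical labels every time this cell is re-run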
Across all of my k=10 runs, "Georgia Tech", "Georgia State", and "Georgia" (UGA) have always landed in the same cluster. Despite the changing cluster initialization mentioned above, these three teams must be interconnected enough that the vast majority of initializations still group them together.
When repeatedly running k-means with k=10, the sizes of the largest and smallest clusters changed slightly with every run. I found this interesting because it is a direct reminder that k-means is a heuristic: although it is very useful, it will not return the optimal solution on every run.