Importing libraries and loading a .bmp image as a NumPy array
import numpy as np
from PIL import Image
import time
import random
import matplotlib.pyplot as plt
from scipy.sparse import csc_matrix, find
from itertools import combinations
import copy
from skimage import io
# import .bmp image as a numpy array
#img = np.array(Image.open('data/football.bmp'))
img = np.array(Image.open('data/GeorgiaTech.bmp'))
Reshaping the array from (400, 400, 3) to (160000, 3) in order to represent the image as one RGB pixel per row
Referenced: https://stackoverflow.com/questions/10439104/reading-bmp-files-in-python
# image shape is (400, 400, 3)
print(img.shape)
# reshape to 1 row per pixel
pixels = img.reshape(img.shape[0]*img.shape[1], 3)
# data shape is now (160000, 3)
print(pixels.shape)
(400, 400, 3) (160000, 3)
n = pixels.shape[1]
k = 2
c = np.zeros((k, n))
# randomly initialize each cluster center from a sampled pixel (values clipped to a max of 245)
for i in range(k):
    c[i, 0] = np.clip(pixels[random.randint(0, pixels.shape[0] - 1), 0], 0, 245)
    c[i, 1] = np.clip(pixels[random.randint(0, pixels.shape[0] - 1), 1], 0, 245)
    c[i, 2] = np.clip(pixels[random.randint(0, pixels.shape[0] - 1), 2], 0, 245)
c
array([[ 64., 43., 93.], [ 70., 153., 92.]])
c_updated = copy.deepcopy(c) + 10
labels = np.zeros(pixels.shape[0])
start_time = time.time()
# iterate until the centroids stop moving significantly (Frobenius norm of the change <= 10)
while np.linalg.norm(c - c_updated, ord='fro') > 10:
    c = c_updated.copy()
    # updating label assignments: each pixel goes to its nearest centroid
    for i in range(pixels.shape[0]):
        dist = np.zeros(c.shape[0])
        for j in range(c.shape[0]):
            dist[j] = np.linalg.norm(pixels[i] - c[j])
        labels[i] = np.argmin(dist)
    # adjust cluster centers to the mean of their assigned pixels
    # (nan_to_num guards against empty clusters, whose mean is NaN)
    for i in range(c.shape[0]):
        subset = pixels[np.where(labels == i)]
        c_updated[i] = (int(np.nan_to_num(subset[:, 0].mean())),
                        int(np.nan_to_num(subset[:, 1].mean())),
                        int(np.nan_to_num(subset[:, 2].mean())))
    #print(c[0])
    print(c_updated[0])
print("--- %s seconds ---" % (time.time() - start_time))
#print(c)
#print(labels[50:100])
#print(pixels[np.where(labels == 0)])
[77. 60. 41.] [62. 56. 44.] [60. 56. 45.] --- 9.789340734481812 seconds ---
h, w = img.shape[0], img.shape[1]
labels = np.reshape(labels, (h, w))
output = np.ones((h, w, 3), dtype = np.uint8)
output[:,:,:] = c[labels.astype(int),:]
io.imshow(output)
<matplotlib.image.AxesImage at 0x20772a2cb88>
The k_means_compress function below takes 2 inputs (an image array and k, the number of desired clusters) and returns 2 outputs (each pixel's assigned cluster label and the values of the cluster centroids).
The k_means_compress function has 3 main steps:
1) Randomly initialize cluster centers
2) Repeat two sub-steps until the centroid locations stop changing significantly: 1) assign each pixel to its nearest cluster center, and 2) move each cluster center to the average pixel value of the pixels assigned to it
3) Display the compressed image using a number of distinct pixel values equal to the number of cluster centers
Note: I used the image array as the input, rather than the pixels array, in order to retain the dimensions of the original image for outputting a compressed image. This can easily be switched back to inputs=(pixels, k) by performing the pixels reshape outside of the k_means_compress function, but I thought it was helpful to display the compressed image.
def k_means_compress(image, k):
    pixels = image.reshape(image.shape[0]*image.shape[1], 3)
    n = pixels.shape[1]
    c = np.zeros((k, n))
    # randomly initialize each cluster center from a sampled pixel (values clipped to a max of 245)
    for i in range(k):
        c[i, 0] = np.clip(pixels[random.randint(0, pixels.shape[0] - 1), 0], 0, 245)
        c[i, 1] = np.clip(pixels[random.randint(0, pixels.shape[0] - 1), 1], 0, 245)
        c[i, 2] = np.clip(pixels[random.randint(0, pixels.shape[0] - 1), 2], 0, 245)
    c_updated = copy.deepcopy(c) + 10
    labels = np.zeros(pixels.shape[0])
    start_time = time.time()
    iterations = 0
    # from kmeans demo code
    while np.linalg.norm(c - c_updated, ord='fro') > 10:
        c = c_updated.copy()
        # updating label assignments
        for i in range(pixels.shape[0]):
            dist = np.zeros(c.shape[0])
            for j in range(c.shape[0]):
                dist[j] = np.linalg.norm(pixels[i] - c[j])
            labels[i] = np.argmin(dist)
        # adjust cluster centers (nan_to_num guards against empty clusters)
        for i in range(c.shape[0]):
            subset = pixels[np.where(labels == i)]
            c_updated[i] = (int(np.nan_to_num(subset[:, 0].mean())),
                            int(np.nan_to_num(subset[:, 1].mean())),
                            int(np.nan_to_num(subset[:, 2].mean())))
        iterations += 1
    print("--- %s seconds ---" % (time.time() - start_time))
    print("--- %s iterations ---" % (iterations))
    # rebuild and display the compressed image from the final centroids
    h, w = image.shape[0], image.shape[1]
    cluster_assignment = labels.copy()
    labels = np.reshape(labels, (h, w))
    output = np.ones((h, w, 3), dtype=np.uint8)
    output[:, :, :] = c[labels.astype(int), :]
    io.imshow(output)
    return cluster_assignment, c_updated
Below are the runtimes and iterations for k-means clustering convergence for the football.bmp image compression:
Compressed images are displayed in order below, along with 1) each pixel's class and 2) centroid locations.
# import .bmp image as a numpy array
img_football = np.array(Image.open('data/football.bmp'))
k_means_compress(img_football, 2)
--- 9.3910231590271 seconds --- --- 2 iterations ---
(array([1., 1., 1., ..., 1., 1., 1.]), array([[192., 185., 175.], [ 77., 78., 68.]]))
k_means_compress(img_football, 4)
--- 132.76101970672607 seconds --- --- 10 iterations ---
(array([1., 1., 2., ..., 1., 1., 1.]), array([[211., 214., 215.], [121., 115., 91.], [ 36., 42., 46.], [177., 162., 144.]]))
k_means_compress(img_football, 8)
--- 138.3072111606598 seconds --- --- 9 iterations ---
(array([5., 5., 5., ..., 7., 7., 7.]), array([[214., 219., 223.], [110., 23., 20.], [186., 174., 157.], [165., 119., 94.], [ 27., 28., 25.], [ 82., 80., 61.], [ 24., 63., 116.], [115., 132., 105.]]))
k_means_compress(img_football, 16)
--- 343.7740070819855 seconds --- --- 11 iterations ---
(array([0., 0., 0., ..., 3., 3., 3.]), array([[ 85., 87., 68.], [ 92., 106., 121.], [191., 188., 181.], [117., 139., 87.], [165., 108., 84.], [ 15., 36., 67.], [130., 143., 155.], [ 42., 38., 24.], [217., 223., 226.], [131., 97., 77.], [ 64., 62., 45.], [ 14., 14., 15.], [ 22., 66., 123.], [170., 142., 114.], [139., 18., 23.], [191., 168., 141.]]))
Below are the runtimes and iterations for k-means clustering convergence for the GeorgiaTech.bmp image compression:
Compressed images are displayed in order below, along with 1) each pixel's class and 2) centroid locations.
# import .bmp image as a numpy array
img_gatech = np.array(Image.open('data/GeorgiaTech.bmp'))
k_means_compress(img_gatech, 2)
--- 11.551669120788574 seconds --- --- 4 iterations ---
(array([1., 1., 1., ..., 0., 0., 0.]), array([[ 60., 56., 46.], [172., 161., 153.]]))
k_means_compress(img_gatech, 4)
--- 20.681691884994507 seconds --- --- 4 iterations ---
(array([1., 1., 1., ..., 2., 2., 2.]), array([[ 99., 97., 84.], [166., 175., 184.], [ 42., 40., 34.], [207., 144., 92.]]))
k_means_compress(img_gatech, 8)
--- 64.97340726852417 seconds --- --- 7 iterations ---
(array([0., 0., 0., ..., 3., 3., 3.]), array([[231., 211., 197.], [100., 92., 89.], [ 46., 60., 66.], [ 39., 32., 24.], [143., 160., 177.], [209., 145., 94.], [ 83., 128., 155.], [119., 92., 36.]]))
k_means_compress(img_gatech, 16)
C:\Users\emag3\Anaconda3\lib\site-packages\ipykernel_launcher.py:43: RuntimeWarning: Mean of empty slice. C:\Users\emag3\Anaconda3\lib\site-packages\ipykernel_launcher.py:44: RuntimeWarning: Mean of empty slice. C:\Users\emag3\Anaconda3\lib\site-packages\ipykernel_launcher.py:45: RuntimeWarning: Mean of empty slice.
--- 154.70340085029602 seconds --- --- 9 iterations ---
(array([11., 11., 11., ..., 7., 7., 7.]), array([[ 33., 29., 26.], [208., 156., 109.], [ 22., 20., 17.], [140., 160., 179.], [144., 118., 100.], [ 12., 10., 10.], [140., 106., 41.], [ 56., 43., 29.], [209., 184., 170.], [ 88., 87., 88.], [ 91., 75., 31.], [251., 244., 236.], [ 39., 60., 72.], [ 4., 2., 3.], [235., 118., 44.], [ 86., 132., 160.]]))
Below are the runtimes and iterations for k-means clustering convergence for the XING_B24.bmp image compression:
Compressed images are displayed in order below, along with 1) each pixel's class and 2) centroid locations.
# import .bmp image as a numpy array
img_flower = np.array(Image.open('data/XING_B24.bmp'))
#img_flower = img_flower.reshape(img_flower.shape[0]*img_flower.shape[1], 3)
k_means_compress(img_flower, 2)
--- 2.8300390243530273 seconds --- --- 4 iterations ---
(array([0., 0., 0., ..., 0., 0., 0.]), array([[124., 75., 74.], [200., 183., 171.]]))
k_means_compress(img_flower, 4)
--- 9.271649837493896 seconds --- --- 7 iterations ---
(array([2., 2., 2., ..., 2., 2., 2.]), array([[230., 52., 99.], [226., 210., 167.], [ 73., 88., 63.], [121., 191., 190.]]))
k_means_compress(img_flower, 8)
--- 14.494359493255615 seconds --- --- 6 iterations ---
(array([7., 7., 7., ..., 2., 2., 2.]), array([[237., 38., 93.], [112., 136., 98.], [ 45., 53., 40.], [153., 50., 67.], [222., 216., 170.], [102., 215., 243.], [245., 86., 126.], [ 69., 99., 65.]]))
k_means_compress(img_flower, 16)
C:\Users\emag3\Anaconda3\lib\site-packages\ipykernel_launcher.py:43: RuntimeWarning: Mean of empty slice. C:\Users\emag3\Anaconda3\lib\site-packages\ipykernel_launcher.py:44: RuntimeWarning: Mean of empty slice. C:\Users\emag3\Anaconda3\lib\site-packages\ipykernel_launcher.py:45: RuntimeWarning: Mean of empty slice.
--- 28.054656505584717 seconds --- --- 6 iterations ---
(array([15., 15., 9., ..., 0., 0., 0.]), array([[ 43., 55., 43.], [214., 36., 80.], [246., 252., 150.], [248., 89., 129.], [105., 50., 51.], [ 31., 32., 21.], [144., 164., 129.], [212., 194., 191.], [102., 216., 244.], [ 80., 112., 76.], [156., 43., 65.], [170., 88., 94.], [106., 134., 97.], [250., 42., 102.], [187., 161., 106.], [ 59., 88., 58.]]))
The k_means_compress_poor function below is an exact replication of the k_means_compress function above, but with a poor centroid initialization strategy.
The poor initialization strategy caps each initial centroid value at 100 (instead of the 245 used in k_means_compress). This leads to more localized initial cluster centers and makes the algorithm more likely to converge to a poor local optimum when minimizing the L2-norm objective function.
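For reference (standard notation, not taken from the assignment), the L2-norm objective being minimized is the usual k-means distortion, where $x_i$ is a pixel, $c_j$ is a centroid, and $\pi(i)$ is the cluster assigned to pixel $i$:

$$\min_{\pi,\, c}\ \sum_{i=1}^{m} \left\lVert x_i - c_{\pi(i)} \right\rVert_2^2$$

A cramped initialization makes it easier for the alternating assign/update steps to settle into a poor local minimum of this objective.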
def k_means_compress_poor(image, k):
    pixels = image.reshape(image.shape[0]*image.shape[1], 3)
    n = pixels.shape[1]
    c = np.zeros((k, n))
    # poor initialization: cap each sampled centroid value at 100 instead of 245
    for i in range(k):
        c[i, 0] = np.clip(pixels[random.randint(0, pixels.shape[0] - 1), 0], 0, 100)
        c[i, 1] = np.clip(pixels[random.randint(0, pixels.shape[0] - 1), 1], 0, 100)
        c[i, 2] = np.clip(pixels[random.randint(0, pixels.shape[0] - 1), 2], 0, 100)
    c_updated = copy.deepcopy(c) + 10
    labels = np.zeros(pixels.shape[0])
    start_time = time.time()
    iterations = 0
    # from kmeans demo code
    while np.linalg.norm(c - c_updated, ord='fro') > 10:
        c = c_updated.copy()
        # updating label assignments
        for i in range(pixels.shape[0]):
            dist = np.zeros(c.shape[0])
            for j in range(c.shape[0]):
                dist[j] = np.linalg.norm(pixels[i] - c[j])
            labels[i] = np.argmin(dist)
        # adjust cluster centers (nan_to_num guards against empty clusters)
        for i in range(c.shape[0]):
            subset = pixels[np.where(labels == i)]
            c_updated[i] = (int(np.nan_to_num(subset[:, 0].mean())),
                            int(np.nan_to_num(subset[:, 1].mean())),
                            int(np.nan_to_num(subset[:, 2].mean())))
        iterations += 1
    print("--- %s seconds ---" % (time.time() - start_time))
    print("--- %s iterations ---" % (iterations))
    h, w = image.shape[0], image.shape[1]
    cluster_assignment = labels.copy()
    labels = np.reshape(labels, (h, w))
    output = np.ones((h, w, 3), dtype=np.uint8)
    output[:, :, :] = c[labels.astype(int), :]
    io.imshow(output)
    return cluster_assignment, c_updated
After multiple runs using the sample image and k=4, random cluster initialization is clearly superior to the poor initialization strategy.
Metrics from each initialization strategy, along with the accompanying compressed images, are below.
Although both methods needed the same number of iterations, the run using random initialization consistently required slightly more runtime. This is likely because the poor initialization converges to a nearby local optimum before the random initialization can reach better-separated (and more useful) centroids. As seen in the images below, the image produced via random initialization has more distinct color groupings, indicating a more effective spread of initial cluster centers.
# random initialization
k_means_compress(img_flower, 4)
--- 7.513904809951782 seconds --- --- 6 iterations ---
(array([0., 0., 0., ..., 0., 0., 0.]), array([[ 73., 88., 63.], [230., 52., 98.], [102., 216., 244.], [209., 198., 158.]]))
# poor initialization
k_means_compress_poor(img_flower, 4)
--- 7.271868467330933 seconds --- --- 6 iterations ---
(array([0., 0., 0., ..., 0., 0., 0.]), array([[ 57., 73., 51.], [109., 116., 87.], [234., 52., 100.], [190., 211., 186.]]))
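One way to make the comparison above quantitative (not part of the original runs) is to evaluate the L2 objective on the assignments returned by each function; a sketch, assuming k_means_compress and k_means_compress_poor are defined as above and img_flower is already loaded:
def kmeans_objective(pixels, labels, centroids):
    # total squared distance from each pixel to its assigned centroid (lower is better)
    return np.sum((pixels - centroids[labels.astype(int)]) ** 2)

pixels_flower = img_flower.reshape(-1, 3).astype(float)
labels_rand, c_rand = k_means_compress(img_flower, 4)
labels_poor, c_poor = k_means_compress_poor(img_flower, 4)
print('random init objective:', kmeans_objective(pixels_flower, labels_rand, c_rand))
print('poor init objective:  ', kmeans_objective(pixels_flower, labels_poor, c_poor))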
The k_means_compress_manhattan function below is an exact replication of the k_means_compress function above, but it uses Manhattan (L1) distance as the distance metric and moves each centroid to the median of its cluster instead of the mean.
Comments: Both the L2-norm and L1-norm methods produced compressed images of similar appearance. However, the L1-norm method generally required more iterations and a longer runtime to converge, and this effect became more pronounced as the number of clusters increased.
The increase in runtime and iterations when moving from Euclidean to Manhattan distance is largely a result of using the same convergence threshold for both methods. Realistically, the clustering using Manhattan distance could use a higher convergence threshold (and therefore take less runtime) while remaining about as effective as the Euclidean-distance version.
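As a side note on runtime: most of the cost in these implementations comes from the per-pixel Python loops rather than from the distance metric itself. The assignment step could be vectorized with NumPy broadcasting; a sketch (not used in the runs below), assuming pixels has shape (num_pixels, 3) and c has shape (k, 3):
# pairwise Manhattan (L1) distances between every pixel and every centroid
dist = np.abs(pixels[:, None, :] - c[None, :, :]).sum(axis=2)   # shape (num_pixels, k)
labels = dist.argmin(axis=1)                                     # nearest centroid per pixel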
def k_means_compress_manhattan(image, k):
    pixels = image.reshape(image.shape[0]*image.shape[1], 3)
    n = pixels.shape[1]
    c = np.zeros((k, n))
    for i in range(k):
        c[i, 0] = np.clip(pixels[random.randint(0, pixels.shape[0] - 1), 0], 0, 245)
        c[i, 1] = np.clip(pixels[random.randint(0, pixels.shape[0] - 1), 1], 0, 245)
        c[i, 2] = np.clip(pixels[random.randint(0, pixels.shape[0] - 1), 2], 0, 245)
    c_updated = copy.deepcopy(c) + 10
    labels = np.zeros(pixels.shape[0])
    start_time = time.time()
    iterations = 0
    # convergence check now uses Manhattan (L1) distance between old and new centroids
    while np.sum(np.abs(c - c_updated)) > 10:
        c = c_updated.copy()
        # updating label assignments
        for i in range(pixels.shape[0]):
            dist = np.zeros(c.shape[0])
            for j in range(c.shape[0]):
                # now using Manhattan distance
                dist[j] = np.sum(np.abs(pixels[i] - c[j]))
            labels[i] = np.argmin(dist)
        # adjust cluster centers (now using the median of each cluster)
        for i in range(c.shape[0]):
            subset = pixels[np.where(labels == i)]
            c_updated[i] = (int(np.nan_to_num(np.median(subset[:, 0]))),
                            int(np.nan_to_num(np.median(subset[:, 1]))),
                            int(np.nan_to_num(np.median(subset[:, 2]))))
        iterations += 1
    print("--- %s seconds ---" % (time.time() - start_time))
    print("--- %s iterations ---" % (iterations))
    h, w = image.shape[0], image.shape[1]
    cluster_assignment = labels.copy()
    labels = np.reshape(labels, (h, w))
    output = np.ones((h, w, 3), dtype=np.uint8)
    output[:, :, :] = c[labels.astype(int), :]
    io.imshow(output)
    return cluster_assignment, c_updated
Below are the runtimes and iterations for k-means clustering convergence for the football.bmp image compression (using Manhattan distance):
Compressed images are displayed in order below, along with 1) each pixel's class and 2) centroid locations.
k_means_compress_manhattan(img_football, 2)
--- 22.76741075515747 seconds --- --- 5 iterations ---
(array([0., 0., 0., ..., 0., 0., 0.]), array([[ 77., 80., 73.], [194., 184., 173.]]))
k_means_compress_manhattan(img_football, 4)
--- 38.06762194633484 seconds --- --- 5 iterations ---
(array([1., 1., 1., ..., 1., 1., 1.]), array([[183., 165., 149.], [116., 112., 87.], [216., 219., 221.], [ 31., 41., 35.]]))
k_means_compress_manhattan(img_football, 8)
C:\Users\emag3\Anaconda3\lib\site-packages\numpy\core\fromnumeric.py:3373: RuntimeWarning: Mean of empty slice. out=out, **kwargs)
--- 192.64198303222656 seconds --- --- 14 iterations ---
(array([1., 1., 1., ..., 3., 3., 3.]), array([[160., 106., 84.], [ 69., 68., 51.], [ 20., 22., 20.], [114., 130., 93.], [ 25., 64., 116.], [183., 159., 137.], [191., 184., 181.], [220., 223., 224.]]))
k_means_compress_manhattan(img_football, 16)
--- 552.2890989780426 seconds --- --- 19 iterations ---
(array([ 7., 7., 7., ..., 11., 11., 11.]), array([[ 55., 52., 37.], [100., 99., 76.], [ 33., 33., 24.], [183., 159., 140.], [ 57., 6., 7.], [119., 129., 139.], [191., 185., 183.], [ 74., 75., 57.], [ 38., 74., 132.], [ 16., 19., 20.], [ 6., 8., 8.], [120., 142., 89.], [217., 199., 129.], [220., 224., 225.], [ 10., 48., 92.], [162., 109., 86.]]))
Below are the runtimes and iterations for k-means clustering convergence for the GeorgiaTech.bmp image compression (using Manhattan distance):
Compressed images are displayed in order below, along with 1) each pixel's class and 2) centroid locations.
k_means_compress_manhattan(img_gatech, 2)
--- 9.383878231048584 seconds --- --- 3 iterations ---
(array([0., 0., 0., ..., 1., 1., 1.]), array([[165., 158., 165.], [ 51., 51., 33.]]))
k_means_compress_manhattan(img_gatech, 4)
--- 22.123305082321167 seconds --- --- 4 iterations ---
(array([2., 2., 2., ..., 1., 1., 1.]), array([[136., 157., 175.], [ 39., 34., 27.], [222., 173., 140.], [ 96., 91., 78.]]))
k_means_compress_manhattan(img_gatech, 8)
--- 218.72900366783142 seconds --- --- 21 iterations ---
(array([2., 2., 2., ..., 5., 4., 4.]), array([[136., 159., 181.], [ 90., 123., 143.], [255., 255., 255.], [204., 180., 172.], [ 35., 29., 24.], [ 62., 64., 63.], [116., 95., 59.], [213., 146., 96.]]))
k_means_compress_manhattan(img_gatech, 16)
--- 374.9333162307739 seconds --- --- 20 iterations ---
(array([0., 0., 0., ..., 5., 2., 2.]), array([[255., 255., 255.], [152., 121., 99.], [ 52., 41., 29.], [ 30., 25., 22.], [ 37., 52., 61.], [ 85., 71., 32.], [136., 161., 183.], [231., 200., 182.], [219., 168., 126.], [179., 169., 172.], [ 60., 73., 84.], [ 88., 133., 160.], [130., 101., 37.], [206., 149., 96.], [101., 93., 90.], [245., 118., 42.]]))
Below are the runtimes and iterations for k-means clustering convergence for the XING_B24.bmp image compression (using Manhattan distance):
Compressed images are displayed in order below, along with 1) each pixel's class and 2) centroid locations.
k_means_compress_manhattan(img_flower, 2)
--- 4.951285123825073 seconds --- --- 7 iterations ---
C:\Users\emag3\Anaconda3\lib\site-packages\skimage\io\_plugins\matplotlib_plugin.py:75: UserWarning: Low image data range; displaying image with stretched contrast. warn("Low image data range; displaying image with "
(array([0., 0., 0., ..., 0., 0., 0.]), array([[ 74., 95., 66.], [244., 63., 110.]]))
k_means_compress_manhattan(img_flower, 4)
--- 7.715327024459839 seconds --- --- 6 iterations ---
(array([1., 1., 1., ..., 1., 1., 1.]), array([[243., 49., 98.], [ 69., 92., 63.], [211., 192., 147.], [102., 216., 244.]]))
k_means_compress_manhattan(img_flower, 8)
--- 18.755235195159912 seconds --- --- 8 iterations ---
(array([5., 5., 5., ..., 4., 4., 4.]), array([[ 86., 116., 81.], [128., 146., 108.], [205., 41., 80.], [247., 206., 162.], [ 43., 51., 42.], [ 62., 90., 61.], [255., 58., 111.], [102., 216., 244.]]))
k_means_compress_manhattan(img_flower, 16)
--- 30.149405479431152 seconds --- --- 7 iterations ---
(array([ 2., 15., 2., ..., 12., 12., 12.]), array([[255., 42., 102.], [112., 139., 101.], [ 69., 103., 68.], [ 87., 118., 83.], [247., 253., 147.], [160., 163., 130.], [ 36., 41., 34.], [208., 63., 94.], [102., 216., 244.], [202., 195., 190.], [145., 49., 65.], [220., 26., 77.], [ 45., 57., 46.], [ 58., 73., 58.], [255., 82., 128.], [ 55., 89., 53.]]))
import numpy as np
import time
import matplotlib.pyplot as plt
from scipy.sparse import csc_matrix, find
from itertools import combinations
import copy
Create adjacency matrix using the simple graph provided:
A = np.matrix([[0, 1, 1, 0, 0],
[1, 0, 1, 0, 0],
[1, 1, 0, 0, 0],
[0, 0, 0, 0, 1],
[0, 0, 0, 1, 0]])
A
matrix([[0, 1, 1, 0, 0], [1, 0, 1, 0, 0], [1, 1, 0, 0, 0], [0, 0, 0, 0, 1], [0, 0, 0, 1, 0]])
Generate the degree matrix from the adjacency matrix: each diagonal entry is the degree (number of adjacent nodes) of the corresponding node in the 5-node graph.
D = np.matrix([[2, 0, 0, 0, 0],
[0, 2, 0, 0, 0],
[0, 0, 2, 0, 0],
[0, 0, 0, 1, 0],
[0, 0, 0, 0, 1]])
D
matrix([[2, 0, 0, 0, 0], [0, 2, 0, 0, 0], [0, 0, 2, 0, 0], [0, 0, 0, 1, 0], [0, 0, 0, 0, 1]])
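Equivalently, the degree matrix can be computed directly from the adjacency matrix by summing each row; a quick consistency check, assuming A and D from the cells above:
D_check = np.diag(np.asarray(A.sum(axis=1)).flatten())
print(np.array_equal(D_check, D))   # True: matches the hand-built degree matrix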
L = D - A
L
matrix([[ 2, -1, -1, 0, 0], [-1, 2, -1, 0, 0], [-1, -1, 2, 0, 0], [ 0, 0, 0, 1, -1], [ 0, 0, 0, -1, 1]])
from scipy.linalg import eig
eig_val, eig_vec = eig(L)
print('eigenvalues: ')
print(eig_val)
print('eigenvectors: ')
print(eig_vec)
eigenvalues: 
[ 3.00000000e+00+0.j -1.11022302e-16+0.j  3.00000000e+00+0.j
  2.00000000e+00+0.j  0.00000000e+00+0.j]
eigenvectors: 
[[ 0.81649658 -0.57735027  0.30959441  0.          0.        ]
 [-0.40824829 -0.57735027 -0.80910101  0.          0.        ]
 [-0.40824829 -0.57735027  0.49950661  0.          0.        ]
 [ 0.          0.          0.          0.70710678  0.70710678]
 [ 0.          0.          0.         -0.70710678  0.70710678]]
Eigenvectors associated with the zero eigenvalue:
print('zero eigenvalue: ')
print(eig_val[1])
print('eigenvector: ')
print(eig_vec[:, 1])   # eigenvectors are the columns of eig_vec
print('\nzero eigenvalue: ')
print(eig_val[4])
print('eigenvector: ')
print(eig_vec[:, 4])
zero eigenvalue: 
(-1.1102230246251565e-16+0j)
eigenvector: 
[-0.57735027 -0.57735027 -0.57735027  0.          0.        ]

zero eigenvalue: 
0j
eigenvector: 
[0.         0.         0.         0.70710678 0.70710678]
The multiplicity of the eigenvalue 0 is 2, corresponding to the 2 connected components in the graph.
These 2 eigenvectors encode the cluster assignments. The first eigenvector has nonzero values in positions 0, 1, and 2, corresponding to the cluster of nodes 1, 2, and 3 in the graph. The second eigenvector has nonzero values in positions 3 and 4, corresponding to the cluster of nodes 4 and 5 in the graph.
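A short sketch making that reading explicit, assuming eig_val and eig_vec from the cells above are still in scope: count the (numerically) zero eigenvalues, then read each component's membership off the nonzero entries of the corresponding eigenvector columns.
zero_idx = np.where(np.isclose(eig_val.real, 0))[0]
print('multiplicity of eigenvalue 0:', len(zero_idx))            # 2 connected components
for idx in zero_idx:
    component = np.where(~np.isclose(eig_vec[:, idx].real, 0))[0]
    print('component (0-indexed nodes):', component)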
#code below is adapted from the provided demo code
import os
import numpy as np
from os.path import abspath, exists
from scipy import sparse
import scipy
from sklearn.cluster import KMeans
from matplotlib import pyplot as plt
def output_file(a, idx2name, c_idx):
    dirpath = os.getcwd()
    node_file = dirpath + '//nodes.csv'
    edge_file = dirpath + '//edges.csv'
    with open(edge_file, 'w') as fid:
        fid.write('Source\tTarget\n')
        for i in range(len(a)):
            fid.write(f'{a[i,0]}\t{a[i,1]}\n')
    with open(node_file, 'w') as fid:
        fid.write('Id\tLabel\tColor\n')
        for i in range(len(idx2name)):
            fid.write(f'{i}\t{idx2name[i]}\t{c_idx[i]}\n')
def read_team_name():
    # read inverse_teams.txt file
    f_path = abspath("inverse_teams.txt")
    idx2name = []
    if exists(f_path):
        with open(f_path) as fid:
            for line in fid.readlines():
                name = line.split("\t", 1)[1]
                idx2name.append(name[:-1])
    return idx2name
def import_graph():
    # read the graph from 'play_graph.txt'
    f_path = abspath("play_graph.txt")
    if exists(f_path):
        with open(f_path) as graph_file:
            lines = [line.split() for line in graph_file]
        return np.array(lines).astype(int)
# load the graph
a = import_graph()
n = 321
i = a[:, 0]-1
j = a[:, 1]-1
v = np.ones((a.shape[0], 1)).flatten()
A = sparse.coo_matrix((v, (i, j)), shape=(n, n))
A = (A + np.transpose(A))/2
A = sparse.csc_matrix.todense(A) # ## convert to dense matrix
D = np.diag(1/np.sqrt(np.sum(A, axis=1)).A1) # taking -1/2 power of diagonal matrix
L = D @ A @ D # normalized Laplacian, therefore want to find the largest eigenvalues
L = np.array(L) # ## convert to array
# v, x = np.linalg.eig(L)
# x = x[:, 0:k].real
# eigendecomposition
v, x = np.linalg.eig(L)
idx_sorted = np.argsort(v)  # indices of the eigenvalues sorted ascending
#v[idx_sorted].real[::-1] # eigenvalues sorted from largest to smallest
The eigendecomposition of the normalized Laplacian matrix is above. Below are the plotted eigenvalues (ranked largest to smallest).
Based on the plot, I would choose approximately 17 clusters. With the normalized Laplacian's eigendecomposition one wants to select the k largest eigenvalues, and after the first 17 or so sorted eigenvalues there is a large drop in value (from ~0.6 to ~0.4). I would treat this drop as a natural cutoff for the "large" eigenvalues.
plt.plot(v[idx_sorted].real[::-1])
[<matplotlib.lines.Line2D at 0x203d2d72f88>]
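As a sanity check on that visual reading, the drop between consecutive sorted eigenvalues (the eigengap) can be computed directly; a sketch, assuming v and idx_sorted from the cell above:
eigs_desc = v[idx_sorted].real[::-1]   # eigenvalues sorted largest to smallest
gaps = -np.diff(eigs_desc[:30])        # size of the drop after each of the top eigenvalues
print('largest drop occurs after eigenvalue #', np.argmax(gaps) + 1)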
The cells below display the largest and smallest cluster sizes for k = 5, 7, 10, and 17 (my approximation)
# eigendecomposition
v, x = np.linalg.eig(L)
idx_sorted = np.argsort(v)  # indices of the eigenvalues sorted ascending
#v[idx_sorted].real[::-1] # eigenvalues sorted from largest to smallest
# spectral clustering
k = 5
x = x[:, idx_sorted[-k:]] # select the k largest eigenvectors
x = x/np.repeat(np.sqrt(np.sum(x*x, axis=1).reshape(-1, 1)), k, axis=1)
# # scatter
# plt.scatter(x[:, 0], x[:, 1])
# plt.show()
# convert x to real numbers
x = x.real
# k-means with k=5
kmeans = KMeans(n_clusters=k).fit(x)
c_idx = kmeans.labels_
c_idx
print('Largest cluster size: ', np.where(c_idx==np.bincount(c_idx).argmax())[0].shape[0])
print('Smallest cluster size: ', np.where(c_idx==np.bincount(c_idx).argmin())[0].shape[0])
Largest cluster size: 106 Smallest cluster size: 36
# eigendecomposition
v, x = np.linalg.eig(L)
idx_sorted = np.argsort(v)  # indices of the eigenvalues sorted ascending
#v[idx_sorted].real[::-1] # eigenvalues sorted from largest to smallest
# spectral clustering
k = 7
x = x[:, idx_sorted[-k:]] # select the k largest eigenvectors
x = x/np.repeat(np.sqrt(np.sum(x*x, axis=1).reshape(-1, 1)), k, axis=1)
# # scatter
# plt.scatter(x[:, 0], x[:, 1])
# plt.show()
# convert x to real numbers
x = x.real
# k-means with k=7
kmeans = KMeans(n_clusters=k).fit(x)
c_idx = kmeans.labels_
c_idx
print('Largest cluster size: ', np.where(c_idx==np.bincount(c_idx).argmax())[0].shape[0])
print('Smallest cluster size: ', np.where(c_idx==np.bincount(c_idx).argmin())[0].shape[0])
Largest cluster size: 94 Smallest cluster size: 18
# eigendecomposition
v, x = np.linalg.eig(L)
idx_sorted = np.argsort(v)  # indices of the eigenvalues sorted ascending
#v[idx_sorted].real[::-1] # eigenvalues sorted from largest to smallest
# spectral clustering
k = 10
x = x[:, idx_sorted[-k:]] # select the k largest eigenvectors
x = x/np.repeat(np.sqrt(np.sum(x*x, axis=1).reshape(-1, 1)), k, axis=1)
# # scatter
# plt.scatter(x[:, 0], x[:, 1])
# plt.show()
# convert x to real numbers
x = x.real
# k-means with k=10
kmeans = KMeans(n_clusters=k).fit(x)
c_idx = kmeans.labels_
c_idx
print('Largest cluster size: ', np.where(c_idx==np.bincount(c_idx).argmax())[0].shape[0])
print('Smallest cluster size: ', np.where(c_idx==np.bincount(c_idx).argmin())[0].shape[0])
Largest cluster size: 63 Smallest cluster size: 18
# eigendecomposition
v, x = np.linalg.eig(L)
idx_sorted = np.argsort(v)  # indices of the eigenvalues sorted ascending
#v[idx_sorted].real[::-1] # eigenvalues sorted from largest to smallest
# spectral clustering
k = 17
x = x[:, idx_sorted[-k:]] # select the k largest eigenvectors
x = x/np.repeat(np.sqrt(np.sum(x*x, axis=1).reshape(-1, 1)), k, axis=1)
# # scatter
# plt.scatter(x[:, 0], x[:, 1])
# plt.show()
# convert x to real numbers
x = x.real
# k-means with k=17
kmeans = KMeans(n_clusters=k).fit(x)
c_idx = kmeans.labels_
c_idx
print('Largest cluster size: ', np.where(c_idx==np.bincount(c_idx).argmax())[0].shape[0])
print('Smallest cluster size: ', np.where(c_idx==np.bincount(c_idx).argmin())[0].shape[0])
Largest cluster size: 28 Smallest cluster size: 10
Cluster results for k = 10, listing the teams that belong to each cluster
# eigendecomposition
v, x = np.linalg.eig(L)
idx_sorted = np.argsort(v)  # indices of the eigenvalues sorted ascending
#v[idx_sorted].real[::-1] # eigenvalues sorted from largest to smallest
# spectral clustering
k = 10
x = x[:, idx_sorted[-k:]] # select the k largest eigenvectors
x = x/np.repeat(np.sqrt(np.sum(x*x, axis=1).reshape(-1, 1)), k, axis=1)
# # scatter
# plt.scatter(x[:, 0], x[:, 1])
# plt.show()
# convert x to real numbers
x = x.real
# k-means with k=10
kmeans = KMeans(n_clusters=k).fit(x)
c_idx = kmeans.labels_
# show cluster
idx2name = read_team_name()
for i in range(k):
    print(f'Cluster {i+1}\n***************')
    idx = [index for index, t in enumerate(c_idx) if t == i]
    for index in idx:
        print(idx2name[index])
    print('\n')
# output file
output_file(a, idx2name, c_idx)
print('Largest cluster size: ', np.where(c_idx==np.bincount(c_idx).argmax())[0].shape[0])
print('Smallest cluster size: ', np.where(c_idx==np.bincount(c_idx).argmin())[0].shape[0])
Cluster 1 *************** Virginia-Lynchburg Alcorn St. Faith U. Mississippi Valley St. Southern Ark.-Pine Bluff Jackson St. Alabama St. Grambling Alabama A&M Prairie View A&M Texas Southern Tuskegee Concordia Central Methodist Stillman College Louisiana College Central State (Ohio) Cluster 2 *************** Georgia State Texas A&M South Carolina Wake Forest La.-Monroe Ole Miss New Mexico St. Vanderbilt Syracuse Virginia Pittsburgh Navy Troy UAB Appalachian St. Kentucky Georgia Tech Ga. Southern N. Carolina St. Boston College Missouri Alabama Virginia Tech Arkansas Auburn Clemson Georgia Duke North Carolina UL Lafayette Arkansas St. Texas State Southern Miss Mississippi St. Florida St. LSU Tennessee Miami (Fla.) Louisville South Alabama Florida Idaho Cluster 3 *************** E. Illinois Missouri St. E. Kentucky SE Missouri St. W. Illinois Kentucky Chr. Tenn. Tech S. Illinois Union College (KY) Murray St. Jacksonville St. N. Iowa UT Martin N. Dakota St. Youngstown St. Indiana St. South Dakota St. Austin Peay Edward Waters Tennessee St. South Dakota William Penn Illinois St. Cumberland (TN) West Alabama Wisconsin-Oshkosh Kentucky Wesleyan Cluster 4 *************** Presbyterian Chattanooga Charlotte Reinhardt Mercer Point U Charleston So. Wofford Monmouth VMI Coastal Carolina The Citadel Liberty Samford Warner W. Carolina Gardner-Webb Furman Johnson C. Smith Brevard College Newberry Bluefield State Catawba N. Greenville Ave Maria Virginia-Wise Wesley Cluster 5 *************** Stony Brook Howard Villanova Delaware Delaware St. Hampton James Madison William & Mary S.C. State Benedict Elon Morgan St. Towson Richmond Norfolk St. Maine Bethune-Cook. Savannah St. Florida A&M NC Central NC A&T Elizabeth City State Bowie State Morehouse Fort Valley State American International Miles Chowan Cluster 6 *************** Sam Houston St. E. Washington Abil Chr. Northwestern St. Idaho State Cal-Poly North Dakota Weber St. Nicholls St. So. Utah UC-Davis Portland St. Montana Montana St. Sacramento St. Incarnate Word N. Arizona Cent. Arkansas S.F. Austin Lamar SE Louisiana Montana-Western Texas College McNeese St. Central Washington McMurry University Houston Baptist Fort Lewis Black Hills State N. Colorado Western Oregon Chadron State New Mexico Highlands Henderson State Texas A&M-Commerce CSU-Pueblo Menlo Mississippi College Arkansas Tech Cluster 7 *************** Campbell Missouri Baptist Valparaiso Taylor Morehead St. Marist Stetson Grand View U. Drake College of Faith Davidson Jacksonville Dayton Pikeville College Butler Wittenberg Florida Tech Truman State St. Joseph's (IN) Western New Mexico San Diego Birmingham-Southern William Jewell Limestone College Cluster 8 *************** Utah Boise St. San Jose St. Washington St. Arizona St. BYU Colorado St. Colorado UNLV Arizona UCLA Air Force Nevada California Notre Dame Stanford Oregon St. Wyoming San Diego St. Fresno St. USC New Mexico Oregon Washington Hawaii Utah St. Cluster 9 *************** Minnesota Northern Illinois C. Michigan Akron Tulane Tulsa Rutgers Temple Connecticut Michigan St. Bowling Green W. Kentucky Texas-San Antonio Houston Penn St. UCF W. Michigan Purdue Iowa Ohio St. Michigan Iowa St. Illinois Indiana Ball St. Massachusetts Northwestern FAU Nebraska Old Dominion Marshall Miami (Ohio) Maryland Rice Buffalo West Virginia E. Michigan Ohio Kent St. TCU Memphis Louisiana Tech Oklahoma FIU South Florida M. Tenn. St. Texas Tech Toledo Kansas St. UTEP North Texas Texas Oklahoma St. 
East Carolina Wisconsin SMU Baylor Army Kansas Cincinnati Cluster 10 *************** Bryant Robert Morris Wagner Georgetown Colgate Duquesne Bucknell Holy Cross Albany Central Conn. St. Sacred Heart St. Francis Fordham New Hampshire Lehigh Clarion Merrimack Lafayette Rhode Island Assumption Harvard Yale Brown Columbia Cornell Penn Princeton Dartmouth West Liberty State Alderson Broaddus Largest cluster size: 60 Smallest cluster size: 18
Results differ slightly between runs of k-means clustering with k=10 because of how the clusters are initialized. The default initialization strategy, 'k-means++', speeds up convergence compared to purely random initialization, but it is still randomized, so the resulting clusters change slightly from run to run.
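If reproducible clusters are desired, scikit-learn's KMeans accepts a random_state argument that fixes the (k-means++) initialization; a minimal sketch, reusing the x matrix from the k=10 cell above:
kmeans = KMeans(n_clusters=10, random_state=0).fit(x)
c_idx = kmeans.labels_   # identical labels every time this cell is re-run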
Across all of my k=10 runs, "Georgia Tech", "Georgia State", and "Georgia" (UGA) have always landed in the same cluster. Despite the changing cluster initialization mentioned above, these three teams must be interconnected enough that the vast majority of initializations still group them together.
When repeatedly running k-means with k=10, the sizes of the largest and smallest clusters changed slightly with every run. I found this interesting because it is a direct reminder that k-means is a heuristic: although it is very useful, it will not return the optimal solution on every run.