PCA¶

Principal Components Analysis (PCA). El objetivo principal de PCA es reducir la dimensionalidad de los datos. ¿Qué es la dimensionalidad y cómo lo hace?

Sea $Y$ una representación de los datos en una dimensión menor ($k < d$) y $X^*$ la reconstrucción de los datos a partir de $Y$. El objetivo es que, al reconstruir los datos en el espacio original, se pierda la menor cantidad de información posible.

Los datos se deben centrar antes de realizar cualquier proyección.

$Y_{n,k} = (X_{n,d}-\mu_d) . W_{d,k}$

$X^*_{n,d} = Y_{n,k}.W^{-1}_{k,d} + \mu_d$

Como las columnas de $W_{d,k}$ son ortonormales, su inversa (por la izquierda) es la transpuesta: $W^{-1}_{k,d} = W^T_{k,d}$.

$\min_W \, (X-X^*)^2$, sujeto a $W^T.W=I$

$=((X-\mu) - Y_{n,k}.W^{-1}_{k,d} )^2$

$=((X-\mu) - (X_{n,d}-\mu)W_{d,k}W^{-1}_{k,d})^2$

Sin embargo, minimizar el error de reconstrucción es equivalente a maximizar la varianza de los datos proyectados:

$S = W^T \Sigma W$

donde $\Sigma$ es la matriz de dispersión de los datos centrados: $\Sigma=\sum_{i=1}^{N}{(x_i-\mu)(x_i-\mu)^T}$

$(X-X^*)^2 = \sum_{i=1}^D{ W_i^T \Sigma W_i} - \sum_{i=1}^K{ W_i^T \Sigma W_i} = \sum_{i=K+1}^D{ W_i^T \Sigma W_i}$

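Para reducir el error, se eligen como primeras $K$ componentes las direcciones de mayor varianza, es decir, los eigenvectores con los eigenvalores más grandes. Los eigenvalores $\lambda$ de una matriz $A$ se obtienen resolviendo: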

$\det(A-\lambda I)=0$

$m = [[1 2 4] [3 5 6] [1 0 2]]$

import numpy as np
import os
import pandas as pd
import matplotlib.pyplot as plt
import tarfile
import zipfile
from scipy.io import loadmat
from urllib.request import urlretrieve
from os.path import isfile, isdir
import seaborn as sns; sns.set()
%matplotlib inline

# Reproducible 2-D toy data: 200 standard-normal points pushed through a
# random 2x2 mixing matrix, so the two features end up correlated.
rng = np.random.RandomState(1)
mixing = rng.rand(2, 2)       # drawn first, same order as the RNG stream expects
latent = rng.randn(2, 200)
X = np.dot(mixing, latent).T  # one row per sample, one column per feature
plt.scatter(X[:, 0], X[:, 1])
plt.axis('equal');

# Column-wise mean of the data (one value per feature).
m = X.mean(axis=0)
m

array([ 0.03351168, -0.00408072])

# Compute the mean, center the data and build the scatter matrix
# S = X_center^T . X_center (features x features).
m = X.mean(axis=0)
X_center = X - m
X_center.mean(axis=0)  # column means after centering; the value is discarded
S = X_center.T.dot(X_center)
S.shape

(2, 2)

# Scatter matrix of the centered data, displayed as the cell result.
S = X_center.T.dot(X_center)
S

array([[135.75334359,  45.9560144 ],
       [ 45.9560144 ,  19.6675263 ]])

# Eigenvalues and eigenvectors of the scatter matrix S.
eigen_pair = np.linalg.eig(S)
eig_val_sc, eig_vec_sc = eigen_pair
(eig_val_sc, eig_vec_sc)

(array([151.74376868,   3.67710121]), array([[ 0.94446029, -0.32862557],
        [ 0.32862557,  0.94446029]]))

$W_{2,2}$

$W_{1,2} * X_{2,200}= X^*_{1,200}$

# Project the centered data onto the first principal axis.
# np.linalg.eig returns the eigenvectors as COLUMNS (eig_vec_sc[:, i] pairs
# with eig_val_sc[i]) and does not sort them, so pick the column that belongs
# to the largest eigenvalue instead of slicing a row.
top = np.argmax(eig_val_sc)
X_star = np.dot(X_center, eig_vec_sc[:, top].reshape(2, 1))
# All-zero ordinate so the 1-D projection is drawn along a horizontal line.
Y = X_star * 0
plt.scatter(X_star, Y, alpha=0.2)

<matplotlib.collections.PathCollection at 0x1a19aa4ba8>

from sklearn.decomposition import PCA
# Fit a 2-component PCA on the toy data; fit() returns the estimator itself,
# so construction and fitting are chained.
pca = PCA(n_components=2).fit(X)
def draw_vector(v0, v1, ax=None):
    """Draw a red arrow between two points using an empty-text annotation.

    Args:
        v0: Point passed as the annotation's text position (arrow tail).
        v1: Point passed as the annotated position (arrow head).
        ax: Axes-like object to draw on; when falsy, ``plt.gca()`` is used.
    """
    if not ax:
        ax = plt.gca()
    arrow_style = {
        'arrowstyle': '->',
        'linewidth': 2,
        'shrinkA': 0,
        'shrinkB': 0,
        'color': 'r',
    }
    ax.annotate('', v1, v0, arrowprops=arrow_style)

# Scatter the data and overlay one arrow per principal component, starting at
# the data mean and scaled to 3 * sqrt(explained variance).
plt.scatter(X[:, 0], X[:, 1], alpha=0.2)
origin = pca.mean_
for variance, direction in zip(pca.explained_variance_, pca.components_):
    offset = direction * 3 * np.sqrt(variance)
    draw_vector(origin, origin + offset)
plt.axis('equal');

EigenFaces¶

Vamos a utilizar las famosas eigenfaces para mostrar cómo PCA puede ayudarnos en la práctica.

def download_files():
    """Download the faces archive if it is missing, then extract it.

    The archive is stored as ``data/faces.zip`` and extracted into
    ``data/faces``. The download is skipped when the zip file already
    exists; the extraction runs on every call.
    """
    data_dir = "data"
    path_tar = os.path.join(data_dir, 'faces.zip')
    dest_path = os.path.join(data_dir, "faces")
    # urlretrieve does not create missing parent directories, so make sure
    # the target folder exists before downloading into it.
    os.makedirs(data_dir, exist_ok=True)
    if not os.path.isfile(path_tar):
        urlretrieve(
            'http://courses.media.mit.edu/2002fall/mas622j/proj/faces/rawdata.zip',
            path_tar)
    # The with-block closes the archive on exit; no explicit close() needed.
    with zipfile.ZipFile(path_tar) as archive:
        archive.extractall(dest_path)


download_files()

from scipy.io import loadmat
import glob
import random
import matplotlib.image as img
#Install pillow
from PIL import Image

# Load a random ~20% sample of the raw face files. Each file is read as raw
# bytes and decoded as a 128x128 8-bit grayscale image, then flattened into
# one row of X.
d_name = 'data/faces/rawdata'
sample = 0.2
X = []
for nfile in os.listdir(d_name):
    if random.random() <= sample:
        # Context manager so the file handle is closed right after reading.
        with open(os.path.join(d_name, nfile), "rb") as raw_file:
            bytes_read = raw_file.read()
        # Named face_image (not img) so the `matplotlib.image as img` module
        # alias imported above is not overwritten.
        face_image = Image.frombytes('L', (128, 128), bytes_read)
        X.append(np.array(face_image).flatten())

# Stack the sampled images into a 2-D array: one row per image.
X = np.array(X)

def plot_1_images(data, label="Image 1", ax=None):
    """Show one flattened 128x128 image with a title.

    Args:
        data: Array that can be reshaped to (128, 128).
        label: Title placed above the image.
        ax: Axes to draw on. When None, a new figure with a single axes is
            created through ``plt.subplots``.

    Returns:
        Tuple ``(fig, ax)``; ``fig`` is None when ``ax`` was supplied by the
        caller.
    """
    fig = None
    if ax is None:
        fig, ax = plt.subplots(1, 1, constrained_layout=True)
    picture = data.reshape(128, 128)
    ax.imshow(picture)
    ax.set_title(label=label)
    return fig, ax

def plot_3_images(data, ix_1, ix_2, ix_3):
    """Show three flattened 128x128 images side by side.

    Args:
        data: Indexable collection whose items can be reshaped to (128, 128).
        ix_1: Index of the image drawn in the left panel.
        ix_2: Index of the image drawn in the middle panel.
        ix_3: Index of the image drawn in the right panel.
    """
    fig, ax = plt.subplots(1, 3, constrained_layout=True)
    # Title templates are kept exactly as they were (the third one carries a
    # trailing space).
    panels = (
        (ix_1, "Image %s"),
        (ix_2, "Image %s"),
        (ix_3, "Image %s "),
    )
    for axis, (index, title) in zip(ax, panels):
        # Each panel shows the image that its title names; previously the
        # third panel re-drew data[ix_2] under the ix_3 title.
        axis.imshow(data[index].reshape(128, 128))
        axis.set_title(label=title % index)
    plt.show()


plot_3_images(X, 1, 2, 3)

Paso 1, Quitar la media de los datos.¶

# Step 1: subtract the mean face (per-pixel average over all sampled images)
# from every image, then preview three centered images.
X_mean = np.mean(X, axis=0)
X_center = X - X_mean
plot_3_images(X_center, 1, 2, 3)

# Display the mean face, i.e. the per-pixel average that was subtracted above.
plot_1_images(X_mean)

(<Figure size 432x288 with 1 Axes>,
 <matplotlib.axes._subplots.AxesSubplot at 0x1a1b342748>)

Paso 2, Calcular Matriz de Covarianza¶

Calcular eigenvalues y eigen vectors

from sklearn.decomposition import PCA
# Fit a 200-component PCA on the centered faces.
# X_reduced has one row per image and one column per component.
pca = PCA(n_components=200)
X_reduced = pca.fit_transform(X_center)

# Variance captured by each component, and the components themselves
# (one eigenface per row).
eigen_values, eigen_faces = pca.explained_variance_, pca.components_

eigen_values

array([6.69756608e+06, 2.27029898e+06, 1.90846502e+06, 1.00629367e+06,
       9.00214222e+05, 8.06186833e+05, 6.52082094e+05, 4.29096924e+05,
       3.87416090e+05, 3.46315963e+05, 2.60651725e+05, 2.32220254e+05,
       2.15345959e+05, 2.01089566e+05, 1.77150011e+05, 1.66470804e+05,
       1.59197944e+05, 1.49317886e+05, 1.36830532e+05, 1.29717559e+05,
       1.29272264e+05, 1.24409617e+05, 1.17774719e+05, 1.11709776e+05,
       1.00936149e+05, 9.66836892e+04, 9.38523678e+04, 8.97641120e+04,
       8.87982451e+04, 7.83729385e+04, 7.52638970e+04, 7.31565679e+04,
       7.03860150e+04, 6.74633428e+04, 6.72851273e+04, 6.41436571e+04,
       5.98833749e+04, 5.91398167e+04, 5.62203994e+04, 5.55326551e+04,
       5.28859537e+04, 5.27369580e+04, 4.99150851e+04, 4.86550420e+04,
       4.75975618e+04, 4.63950316e+04, 4.53632277e+04, 4.36589919e+04,
       4.28712775e+04, 4.10088060e+04, 4.03182428e+04, 3.96281321e+04,
       3.83351875e+04, 3.69900781e+04, 3.64231705e+04, 3.60009991e+04,
       3.51420250e+04, 3.44276266e+04, 3.40348158e+04, 3.14697879e+04,
       3.09554004e+04, 3.03841849e+04, 2.97119019e+04, 2.88281347e+04,
       2.80021131e+04, 2.74973813e+04, 2.72882240e+04, 2.67182509e+04,
       2.60679885e+04, 2.54484333e+04, 2.50111098e+04, 2.46469393e+04,
       2.43051511e+04, 2.36610052e+04, 2.31409392e+04, 2.29975557e+04,
       2.19530324e+04, 2.17968896e+04, 2.12257127e+04, 2.10845467e+04,
       2.07772410e+04, 2.01157118e+04, 2.00911105e+04, 1.97695281e+04,
       1.95983872e+04, 1.92752728e+04, 1.89704818e+04, 1.85442291e+04,
       1.83905948e+04, 1.76822297e+04, 1.74658513e+04, 1.72002129e+04,
       1.69046314e+04, 1.66869523e+04, 1.63277297e+04, 1.61291998e+04,
       1.60486640e+04, 1.57021028e+04, 1.54014757e+04, 1.51389239e+04,
       1.49280646e+04, 1.48369920e+04, 1.43338524e+04, 1.42691870e+04,
       1.39709149e+04, 1.37706675e+04, 1.36639862e+04, 1.35420020e+04,
       1.33170278e+04, 1.32492416e+04, 1.29626851e+04, 1.28101768e+04,
       1.26817061e+04, 1.24386205e+04, 1.23034874e+04, 1.21138064e+04,
       1.18948393e+04, 1.17435494e+04, 1.15350368e+04, 1.14844182e+04,
       1.13980409e+04, 1.13342025e+04, 1.11676791e+04, 1.09026328e+04,
       1.08037170e+04, 1.06795845e+04, 1.04499944e+04, 1.04312999e+04,
       1.02170729e+04, 1.01663659e+04, 1.00329574e+04, 9.95834073e+03,
       9.86046874e+03, 9.70248881e+03, 9.55463452e+03, 9.48913871e+03,
       9.41609383e+03, 9.31829690e+03, 9.14686560e+03, 9.04846950e+03,
       8.91213433e+03, 8.77191350e+03, 8.70161485e+03, 8.65917596e+03,
       8.60648026e+03, 8.48379074e+03, 8.42043656e+03, 8.22256628e+03,
       8.09954472e+03, 8.04766023e+03, 8.00308183e+03, 7.87910182e+03,
       7.78498188e+03, 7.70599462e+03, 7.58762123e+03, 7.49555846e+03,
       7.46631267e+03, 7.33378489e+03, 7.24990607e+03, 7.17496085e+03,
       7.04577155e+03, 6.99593264e+03, 6.95002547e+03, 6.88182860e+03,
       6.82277348e+03, 6.77373971e+03, 6.69828875e+03, 6.61723986e+03,
       6.54108142e+03, 6.46536675e+03, 6.45316141e+03, 6.33365257e+03,
       6.28909140e+03, 6.20217389e+03, 6.11365064e+03, 6.06092863e+03,
       6.00578353e+03, 5.87012355e+03, 5.77721397e+03, 5.66944209e+03,
       5.65191099e+03, 5.59150154e+03, 5.52143533e+03, 5.49892317e+03,
       5.47011404e+03, 5.43461724e+03, 5.30631395e+03, 5.26614170e+03,
       5.21607621e+03, 5.19076202e+03, 5.08393922e+03, 5.03823233e+03,
       4.92260125e+03, 4.86125062e+03, 4.79200793e+03, 4.77931401e+03,
       4.74861596e+03, 4.68981652e+03, 4.61331267e+03, 4.57276919e+03])

# The first components carry most of the information: plot every eigenvalue
# relative to the largest one (index 0), so the curve starts at 1 and decays.
plt.plot(range(len(eigen_values)), eigen_values / eigen_values[0])

[<matplotlib.lines.Line2D at 0x1a1bb9c390>]

# Display the original (uncentered) image at index 1; the same index is used
# for the projection and reconstruction in the following cells.
plot_1_images(X[1])

(<Figure size 432x288 with 1 Axes>,
 <matplotlib.axes._subplots.AxesSubplot at 0x1a1ba20438>)

Si pintamos las dimensiones reducidas, no tienen sentido visual, porque cada elemento representa una mezcla de los píxeles de la foto original.

# Draw the PCA coordinates of image 1 as a single-row image.
fig, ax = plt.subplots(1, 1, constrained_layout=True)
reduced_row = X_reduced[1:2]  # row 1 kept 2-D: shape (1, n_components)
ax.imshow(reduced_row)
ax.set_title(label="Image 1")
plt.show()

Paso 4 Reconstruir la imagen¶

# Report the shapes involved, then map image 1 back to pixel space by
# multiplying its PCA coordinates with the eigenfaces.
print('Componentes', X_reduced.shape)
print('Eigenvectores', eigen_faces.shape)
coords_1 = X_reduced[1:2]  # row 1 kept 2-D: shape (1, n_components)
face_1 = np.dot(coords_1, eigen_faces)
print(face_1.sum(axis=0))

Componentes (808, 200)
Eigenvectores (200, 16384)
[1.18592879e-11 3.79390332e-11 2.42834828e-02 ... 0.00000000e+00
 0.00000000e+00 0.00000000e+00]

eigen_faces[0].shape  # value is discarded (not the last expression of the cell)
# Same projection-to-pixels product as before; only its shape is displayed.
face_c1 = np.dot(X_reduced[1:2], eigen_faces)
face_c1.shape

(1, 16384)

# Add the mean face back to the reconstructed (centered) image and show it.
reconstructed_1 = X_mean + face_1.sum(axis=0)
plot_1_images(reconstructed_1)

(<Figure size 432x288 with 1 Axes>,
 <matplotlib.axes._subplots.AxesSubplot at 0x1a1baedf60>)

Vemos que la reconstrucción de las imágenes funcionó bien utilizando 200 eigenvectores.

def _reconstruct_face_1(n_components):
    """Rebuild image 1 from its first ``n_components`` PCA coordinates.

    Multiplies the leading coordinates of row 1 of ``X_reduced`` by the
    matching eigenfaces and adds the mean face back.
    """
    coords = X_reduced[1:2, :n_components]  # shape (1, n_components)
    return X_mean + np.dot(coords, eigen_faces[:n_components, :]).sum(axis=0)


face_10 = _reconstruct_face_1(10)
face_50 = _reconstruct_face_1(50)
face_100 = _reconstruct_face_1(100)
face_200 = _reconstruct_face_1(200)

fig, ax = plt.subplots(1, 4, figsize=(15, 15))
# Each panel shows the reconstruction its label names. The first two panels
# previously referenced an undefined `face_20`.
plot_1_images(face_10, ax=ax[0], label="Con 10 eigenfaces")
plot_1_images(face_50, ax=ax[1], label="Con 50 eigenfaces")
plot_1_images(face_100, ax=ax[2], label="Con 100 eigenfaces")
plot_1_images(face_200, ax=ax[3], label="Con 200 eigenfaces")

(None, <matplotlib.axes._subplots.AxesSubplot at 0x1a1b5861d0>)

Como se puede ver en la imagen anterior, las características más importantes son las formas de la cara; después, las cejas y los ojos; y por último, los detalles como el cabello, la boca y el color de piel.

Distancia utilizando PCA¶

Después de demostrar que 200 eigenvectores contienen la información necesaria para reconstruir la imagen con calidad, utilizaremos esas 200 dimensiones para encontrar las fotos más similares.

# Brute-force search for the two most similar faces, using the L1 distance
# between their PCA coordinates.
# Only the first 200 images are compared (as before); the bound is capped by
# the number of available rows so a small random sample cannot raise
# IndexError.
n_candidates = min(200, X_reduced.shape[0])
min_dis = float("inf")
ix_1 = 0
ix_2 = 0
for i in range(n_candidates):
    # Exclude black images: heuristic kept from the original, which skips rows
    # whose PCA coordinates do not sum to a positive value. The check depends
    # only on i, so it is done once per outer iteration.
    if not np.sum(X_reduced[i]) > 0:
        continue
    for j in range(i + 1, n_candidates):
        dis_ij = np.abs(X_reduced[i] - X_reduced[j]).sum()
        if dis_ij < min_dis:
            min_dis = dis_ij
            ix_1 = i
            ix_2 = j
print("Imagenes similares son {0} y {1}".format(ix_1,ix_2))

Imagenes similares son 108 y 162

# Show the first image of the closest pair found by the search above.
plot_1_images(X[ix_1])

(<Figure size 432x288 with 1 Axes>,
 <matplotlib.axes._subplots.AxesSubplot at 0x1a1b7fa588>)

# Show the second image of the closest pair found by the search above.
plot_1_images(X[ix_2])

(<Figure size 432x288 with 1 Axes>,
 <matplotlib.axes._subplots.AxesSubplot at 0x1a1bc55fd0>)