In-Depth Exploration of Principal Component Analysis (PCA)¶

Principal Component Analysis (PCA) is a linear dimensionality-reduction technique that's essential in modern data science - both for exploratory analysis and as a preprocessing step in machine learning. Think of PCA as a way to compress data intelligently: instead of blindly throwing away columns, PCA looks at the patterns and structures in the data to decide which combinations of features matter the most. Having too many features leads to:

  1. Overfitting – the model learns noise instead of patterns.
  2. Slower computation – more features take more time to process.
  3. Redundancy – many features are similar or correlated.
  4. Harder visualization – you can't visualize more than 3 dimensions.

PCA reduces this complexity by finding new dimensions (called principal components) that summarize the data more efficiently. Here’s the key idea:

  • Transform data from its original high-dimensional space into a new set of orthogonal axes - the principal components (PCs) - where each axis captures progressively less variance.
  • Compress datasets by selecting only the top-k components, thereby approximating the original data with minimal information loss.

Why this matters:

  • Curse of Dimensionality: High-dimensional data suffers from sparsity and noise, slowing down models and increasing overfitting risk.
  • Efficiency: Reduces computational cost—fewer features mean faster training and inference.
  • Visualization: Projects data into 2D/3D for human understanding.
  • Noise reduction: Later components often represent noisy variance that can be dropped.

PCA thus enables you to derive a compressed representation that retains the core structure of the data without needing explicit labels or domain knowledge. PCA is used on numerical data, and ideally on continuous variables. It doesn't work well on categorical variables directly. It’s especially useful when:

  • Features are highly correlated (e.g., income and spending).
  • You want to visualize high-dimensional data in 2D or 3D.
  • You need to clean up noisy data.
  • You're building a model and want to remove redundancy.

How PCA Works: Step-by-Step¶

1. Data Preparation: Standardization¶

PCA is sensitive to scale. When features differ in scale (e.g., kilograms vs millimeters), those with large variance dominate. To avoid this:

X_standardized = (X - mean) / std

All features end up with mean zero and unit variance.
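For illustration, here is a minimal NumPy sketch of this step. The data values are made up for the example; scikit-learn's StandardScaler, used later in this notebook, performs the same computation:

```python
import numpy as np

# Toy data: 5 samples, 3 features on very different scales (hypothetical values)
X = np.array([[170.0, 65.0, 30000.0],
              [160.0, 55.0, 42000.0],
              [180.0, 80.0, 39000.0],
              [175.0, 72.0, 51000.0],
              [165.0, 60.0, 36000.0]])

# Standardize: subtract each column's mean and divide by its standard deviation
X_standardized = (X - X.mean(axis=0)) / X.std(axis=0)

print(X_standardized.mean(axis=0).round(10))  # ~0 for every feature
print(X_standardized.std(axis=0))             # 1 for every feature
```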

2. Covariance Matrix¶

When we apply PCA, we're looking to understand how features in our dataset vary together. The covariance matrix is a square matrix that captures these pairwise relationships between variables - i.e., whether two variables increase or decrease together (positive covariance), whether one increases while the other decreases (negative covariance), or whether they have no linear relationship (zero covariance). Think of the covariance matrix as a blueprint of the feature relationships. PCA uses this blueprint to find the most informative directions (principal components). Suppose you have a dataset with N features (dimensions) and m observations. Let’s denote:

  • $X$ as the standardized data matrix of shape $m \times N$ (i.e., each feature has mean = 0 and standard deviation = 1).
  • The covariance matrix $C$ is calculated using the formula:

$$ \boxed{C = \frac{1}{m - 1} X^\top X} $$

Where:

  • $X^\top$ is the transpose of the standardized data matrix,
  • The result $C$ is a square matrix of shape $N \times N$,
  • Each element $c_{ij}$ of the matrix is the covariance between the i-th and j-th features.

The covariance matrix for an 8-feature dataset would be an 8×8 matrix, like this:

$$ C = \begin{bmatrix} \text{Var}(x_1) & \text{Cov}(x_1, x_2) & \dots & \text{Cov}(x_1, x_8) \\ \text{Cov}(x_2, x_1) & \text{Var}(x_2) & \dots & \text{Cov}(x_2, x_8) \\ \vdots & \vdots & \ddots & \vdots \\ \text{Cov}(x_8, x_1) & \text{Cov}(x_8, x_2) & \dots & \text{Var}(x_8) \end{bmatrix} $$

The covariance matrix captures how features in a dataset vary both individually and in relation to each other.

  • Main diagonal ($\text{Var}(x_i)$) shows the variance of each variable - how much it varies by itself.

  • Off-diagonal entries ($\text{Cov}(x_i, x_j)$) show the covariance between different pairs - how much they vary together.

  • The matrix is symmetric, meaning:

    $$ \text{Cov}(x_i, x_j) = \text{Cov}(x_j, x_i) $$

Key Properties:¶
  • The matrix is symmetric: $\text{Cov}(x_i, x_j) = \text{Cov}(x_j, x_i)$
  • If $\text{Cov}(x_i, x_j) > 0$: Features tend to increase together.
  • If $\text{Cov}(x_i, x_j) < 0$: One increases while the other decreases.
  • If $\text{Cov}(x_i, x_j) = 0$: Features are uncorrelated.

It’s essential in dimensionality reduction techniques like PCA, where understanding feature relationships helps identify principal components.
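A small sketch of how the formula above translates to code, using toy random data so the block is self-contained; `np.cov` is used only to confirm the hand-written version:

```python
import numpy as np

rng = np.random.default_rng(0)
X = rng.normal(size=(100, 4))                      # toy data: m = 100 samples, N = 4 features
X_std = (X - X.mean(axis=0)) / X.std(axis=0)       # standardize

m = X_std.shape[0]
C = X_std.T @ X_std / (m - 1)                      # covariance matrix, shape (N, N)

# np.cov expects variables in rows by default, so pass rowvar=False for (samples x features)
print(np.allclose(C, np.cov(X_std, rowvar=False)))  # True
print(np.diag(C))                                   # per-feature variances (~1 after standardization)
```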

3. Eigen Decomposition (or SVD)¶

What we do next is:

  • Compute the covariance matrix of the standardized data.
  • Calculate eigenvalues and eigenvectors of this matrix.
    • Eigenvectors (w): These represent the new axes or directions in which the data is most spread out. Each eigenvector shows a direction in the original feature space.
    • Eigenvalues (λ): Each eigenvalue corresponds to an eigenvector and tells us how much variance (or spread) the data has in that direction.

Eigenvectors are mutually perpendicular (orthogonal) unit vectors that form a new coordinate system for the data. This new system aligns with the directions of maximum variance. When we transform the data onto these axes, we effectively rotate the dataset so that the largest patterns in the data are captured in the first few dimensions.

Rather than directly computing eigenvalues and eigenvectors of the covariance matrix, we can alternatively use Singular Value Decomposition (SVD) on the original data matrix. SVD decomposes the data into three matrices and is numerically more stable, especially when working with large or ill-conditioned datasets. Both methods lead to the same principal components but through different paths.
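To make the two paths concrete, here is a minimal NumPy sketch on toy random data showing that eigen-decomposition of the covariance matrix and SVD of the centered data matrix agree (up to the sign of each eigenvector, which is arbitrary):

```python
import numpy as np

rng = np.random.default_rng(0)
X = rng.normal(size=(100, 4))            # toy data
X = X - X.mean(axis=0)                   # mean-center

# Path 1: eigen-decomposition of the covariance matrix
C = X.T @ X / (X.shape[0] - 1)
eigvals, eigvecs = np.linalg.eigh(C)                 # eigh: symmetric matrices, ascending order
eigvals, eigvecs = eigvals[::-1], eigvecs[:, ::-1]   # sort descending

# Path 2: SVD of the (centered) data matrix
U, S, Vt = np.linalg.svd(X, full_matrices=False)
svd_eigvals = S**2 / (X.shape[0] - 1)    # singular values relate to eigenvalues

print(np.allclose(eigvals, svd_eigvals))           # True
print(np.allclose(np.abs(eigvecs), np.abs(Vt.T)))  # True, up to sign
```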

4. Sort and Select Components¶

Order the principal components based on descending eigenvalues, since higher eigenvalues indicate directions with more variance captured from the data.

  • Explained variance ratio: This tells us what proportion of the total variance is explained by each principal component. It is calculated as: $$ \text{Explained Variance Ratio}_i = \frac{\lambda_i}{\sum_{j=1}^{N} \lambda_j} $$ where $\lambda_i$ is the eigenvalue for the $i^\text{th}$ component.

  • Cumulative variance: This is the running total of explained variance ratios across components. It helps determine how many components are sufficient to capture most of the data's structure. A common guideline is to retain enough components to explain between 80% and 95% of the total variance.

To aid in this decision, visual tools like the scree plot are used. A scree plot displays the eigenvalues in descending order. The "elbow point" on this plot marks the point where adding more components yields diminishing returns in explained variance. Components beyond this point contribute little and can often be discarded without significant loss of information.

By selecting components up to this elbow or until a target cumulative variance is reached, we reduce dimensionality while preserving the essence of the data.
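For example, a small sketch with hypothetical eigenvalues, picking the smallest number of components that reaches a 90% cumulative-variance target:

```python
import numpy as np

# Hypothetical eigenvalues, already sorted in descending order
eigvals = np.array([4.2, 2.1, 1.0, 0.5, 0.2])

explained_ratio = eigvals / eigvals.sum()   # explained variance ratio per component
cumulative = np.cumsum(explained_ratio)     # running total

# Smallest k whose cumulative explained variance reaches 90%
k = int(np.argmax(cumulative >= 0.90)) + 1

print(explained_ratio)  # ratios sum to 1
print(cumulative)       # increases toward 1
print(k)                # 3 components reach the 90% mark here
```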

5. Project Data into Lower Dimension¶

Form the projection matrix $W$ using the top-$k$ eigenvectors (those corresponding to the largest $k$ eigenvalues). Each eigenvector is a column in $W$:

$$ W = [w_1\ w_2\ \dots\ w_k] $$

Then project the original data matrix $X$ (assumed to be mean-centered) onto this lower-dimensional space:

$$ T = XW $$

Here, each row in $T$ is the reduced $k$-dimensional representation of the corresponding data point from $X$. This transformation preserves the directions with the most variance and discards less informative dimensions.
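A compact sketch of the projection step on toy data, including the approximate reconstruction back into the original space:

```python
import numpy as np

rng = np.random.default_rng(1)
X = rng.normal(size=(50, 5))
X = X - X.mean(axis=0)                       # mean-center the data

C = X.T @ X / (X.shape[0] - 1)               # covariance matrix
eigvals, eigvecs = np.linalg.eigh(C)
order = np.argsort(eigvals)[::-1]            # indices sorted by descending eigenvalue

k = 2
W = eigvecs[:, order[:k]]                    # projection matrix: top-k eigenvectors as columns
T = X @ W                                    # reduced k-dimensional representation

X_approx = T @ W.T                           # approximate reconstruction back in 5D
print(T.shape, X_approx.shape)               # (50, 2) (50, 5)
```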


Intuition behind Minimizing Projection Error¶

PCA can be interpreted as minimizing the total squared reconstruction error - the sum of squared distances between the original data points and their projections onto a lower-dimensional subspace.

For the case of $k = 1$, PCA is equivalent to solving the following optimization problem:

$$ \max_{\mathbf{w}_1} \ \mathbf{w}_1^T S \mathbf{w}_1 $$

Subject to:

$$ \|\mathbf{w}_1\| = 1 $$

Here, $S$ is the covariance matrix of the centered data, and $\mathbf{w}_1$ is the direction (a unit vector) that maximizes the variance of the projected data. The solution to this problem is the eigenvector corresponding to the largest eigenvalue of $S$, which ensures:

  • Maximum variance is retained along $\mathbf{w}_1$
  • The projected data points lie as close as possible (in squared distance) to the one-dimensional subspace spanned by $\mathbf{w}_1$

For higher dimensions ($k > 1$), PCA continues this process by finding successive eigenvectors $\mathbf{w}_2, \mathbf{w}_3, \dots, \mathbf{w}_k$, each of which:

  • Is orthogonal to the previously selected vectors
  • Captures the next highest remaining variance in the data

This guarantees that each new principal component adds independent information and contributes to capturing the underlying structure of the data.
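As a quick numerical sanity check on toy data: the projected variance along the top eigenvector is never smaller than the projected variance along any other unit direction.

```python
import numpy as np

rng = np.random.default_rng(5)
# Toy 2D data with correlated features
X = rng.multivariate_normal(mean=[0, 0], cov=[[3.0, 1.5], [1.5, 1.0]], size=500)
X = X - X.mean(axis=0)
S = X.T @ X / (X.shape[0] - 1)               # covariance matrix

eigvals, eigvecs = np.linalg.eigh(S)
w1 = eigvecs[:, -1]                          # eigenvector of the largest eigenvalue

# Compare projected variance along w1 with a random unit direction
w_rand = rng.normal(size=2)
w_rand /= np.linalg.norm(w_rand)
print(w1 @ S @ w1)                           # maximum achievable projected variance
print(w_rand @ S @ w_rand)                   # always <= the value above
```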

Describing it with an example:

PCA tries to find new axes (directions) that capture the most spread (variance) in the data. It does this by:

  1. Finding the direction (vector) along which the data varies the most.
  2. Projecting the data onto that direction.
  3. Then, finding the next direction that is orthogonal (at 90°) to the first and captures the next most variance.
  4. And so on...

Each of these directions is called a principal component, and they help us represent the data using fewer dimensions without losing much information.

Suppose we have 2D data:

Height (cm)    Weight (kg)
160            60
165            65
170            68
175            72
180            75

If you plot this data, you’ll notice that as height increases, weight also increases - they are correlated.

PCA will find:

  • The first principal component (PC1) — a line that goes diagonally through the data (in the direction of height-weight increase). This line captures the most variance.
  • The second principal component (PC2) — a line perpendicular to PC1, capturing the remaining minor variations.

If we want to reduce from 2D to 1D (k = 1), PCA will project all the data points onto PC1. This reduces dimensionality but keeps most of the information (variance).

PCA finds a direction $\mathbf{w}_1$ (first principal component) such that when you project the data onto this line, the variance is maximized:

$$ \max_{\mathbf{w}_1} \ \mathbf{w}_1^T S \mathbf{w}_1 \quad \text{subject to} \quad \|\mathbf{w}_1\| = 1 $$

The solution is the eigenvector of the largest eigenvalue of the covariance matrix $S$.

By projecting the data onto this line, you are minimizing the squared distance from each original point to its projection — which means you're keeping the projection as close to the original as possible, in fewer dimensions.
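Here is the same toy example as a minimal scikit-learn sketch; the exact numbers printed are illustrative, but PC1 should capture nearly all of the variance:

```python
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

# The height/weight table from above
data = np.array([[160, 60],
                 [165, 65],
                 [170, 68],
                 [175, 72],
                 [180, 75]], dtype=float)

data_std = StandardScaler().fit_transform(data)

pca = PCA(n_components=1)                 # reduce 2D -> 1D along PC1
projected = pca.fit_transform(data_std)

print(pca.explained_variance_ratio_)      # PC1 captures nearly all of the variance here
print(projected.ravel())                  # each point's coordinate along PC1
```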


Linear Combinations of Original Features¶

Each principal component (PC) is a weighted combination of the original features. Suppose we have three features: $X, Y, Z$. Then the first principal component (PC1) can be written as:

$$ \text{PC}_1 = aX + bY + cZ $$

Here, $a, b, c$ are the weights that determine how much each feature contributes to PC1. These weights come from the eigenvector corresponding to the largest eigenvalue of the covariance matrix.

  • The first PC captures the maximum combined variation across all features.
  • The second PC is orthogonal (uncorrelated) to the first and captures the next highest variance left in the data.
  • This process continues for as many components as you choose.

When we plot the data using just the first two principal components (PC1 and PC2), patterns in the data often become clearer. For example:

  • Clusters of similar data points may become visible.
  • Class boundaries may separate better.
  • Outliers may stand out more.

This makes PCA useful not just for dimensionality reduction but also as a preprocessing step for tasks like classification, clustering, or anomaly detection.
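A short sketch of the "weighted combination" idea with three made-up features X, Y, Z, where X and Y are deliberately correlated:

```python
import numpy as np
from sklearn.decomposition import PCA

rng = np.random.default_rng(2)
# Three toy features; X and Y are strongly correlated, Z is independent
X = rng.normal(size=200)
Y = 0.8 * X + 0.2 * rng.normal(size=200)
Z = rng.normal(size=200)
data = np.column_stack([X, Y, Z])

pca = PCA(n_components=2).fit(data)

a, b, c = pca.components_[0]                    # PC1 weights: PC1 = a*X + b*Y + c*Z
print(f"PC1 = {a:.2f}*X + {b:.2f}*Y + {c:.2f}*Z")
print(pca.components_[0] @ pca.components_[1])  # ~0: PC1 and PC2 are orthogonal
```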


Step-by-Step Process of Applying PCA in Code¶

Step 1: Handle Missing Values (NaNs)¶

Before performing PCA, the dataset must be cleaned. If it contains missing values (NaN), PCA cannot work correctly because it requires a complete matrix of numerical data.

  • Remove rows/columns with missing values if they are few and not crucial.

  • Impute missing values using methods like:

    • Mean/median imputation
    • K-nearest neighbors (KNN)
    • Forward/backward fill (for time series)

PCA is based on variance and correlations between variables. Missing values distort these relationships.
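For instance, a minimal sketch of the two options (the frame and its column names are hypothetical):

```python
import numpy as np
import pandas as pd
from sklearn.impute import SimpleImputer

# Hypothetical frame with a few missing values
df = pd.DataFrame({"age": [25, np.nan, 40, 33],
                   "income": [50000, 62000, np.nan, 58000]})

# Option 1: drop incomplete rows (fine when only a few rows are affected)
df_dropped = df.dropna()

# Option 2: fill missing values with each column's median
imputer = SimpleImputer(strategy="median")
df_imputed = pd.DataFrame(imputer.fit_transform(df), columns=df.columns)
print(df_imputed)
```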

Step 2: Handle Categorical Variables¶

PCA works only on numerical data. If your dataset has categorical columns like Gender, City, or Product Type, these need to be converted into numbers using one-hot encoding or a similar technique.

  • Use one-hot encoding to convert each category into its own binary (0/1) column.

    • E.g., Gender: Male/Female → Gender_Male, Gender_Female

Categorical labels carry no notion of distance or variance. PCA needs numerical input that reflects quantity or magnitude.
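A small pandas sketch of one-hot encoding (the frame and its column names are hypothetical):

```python
import pandas as pd

# Hypothetical frame with one categorical column
df = pd.DataFrame({"Gender": ["Male", "Female", "Female", "Male"],
                   "Spending": [200, 350, 120, 90]})

# One-hot encode: each category becomes its own binary (0/1) column
df_encoded = pd.get_dummies(df, columns=["Gender"], dtype=int)
print(df_encoded)   # columns: Spending, Gender_Female, Gender_Male
```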

Step 3: Standardize the Data¶

After converting everything into numeric format, you must standardize the dataset. This means scaling the data so each feature has mean = 0 and standard deviation = 1.

  • Use StandardScaler or similar tool.
  • It transforms each column so that the values are centered around 0 and have equal spread (unit variance).

PCA is sensitive to scale. A feature with large values (e.g., income in dollars) can dominate others (e.g., rating scores from 1–5) if not standardized.

Step 4: Apply PCA¶

Once the data is clean and standardized, PCA is applied. Suppose your data now has 8 numeric columns. PCA will create 8 new features (principal components). Each of these new features is a linear combination of the original 8 columns. They are arranged in order of how much variance they capture. PC1 captures the most variation in the data. PC2 captures the next most variation, and so on.

You now have 8 principal components, but you won’t use all of them.

Step 5: Understand the Importance of Each Principal Component¶

Once PCA is done, you'll want to decide how many components to keep. You can do this by looking at the explained_variance_ratio_ attribute from your PCA object. It tells you how much of the total variation is explained by each component.

Example output:

Component    Explained Variance Ratio
PC1          0.42
PC2          0.22
PC3          0.11
PC4          0.07
PC5          0.06
PC6          0.05
PC7          0.04
PC8          0.03

The values decrease as you move to later components. This is by design - PCA ranks components from most to least important.

Step 6: Visualize the Variance Explained¶

Bar Chart of Individual Variance¶

You can draw a bar chart showing how much each individual component contributes to explaining the data. This helps you understand:

  • Which component captures the most variance
  • When the usefulness starts tapering off

Cumulative Variance Plot¶

This is the most useful graph in PCA. Plot a cumulative sum of explained variance. For example:

Components    Cumulative Variance
PC1           0.42
PC2           0.64
PC3           0.75
PC4           0.82

This tells you how many components you need to retain to capture a meaningful amount of total information. A common threshold is 80%.

Step 7: Choose Number of Components¶

From the cumulative variance plot above, we see that:

  • PC1 to PC4 together capture 82% of the total variation.
  • That means you can keep just the first 4 components and drop the rest.

Now, your 8-column dataset is transformed into a 4-column dataset, with minimal loss of information.

Step 8: Transform and Use the New Data¶

Use the PCA object to transform the original standardized dataset into the new principal component space.

Your new dataset now looks like:

PC1     PC2     PC3     PC4
1.2     -0.9    0.3     2.1
0.5     -0.1    0.7     1.4
...     ...     ...     ...

This dataset can now be:

  • Used for visualization (PC1 vs PC2)
  • Fed into machine learning models
  • Used for clustering or anomaly detection
  • Stored or processed more efficiently

Practical Choices in PCA¶

How Many PCs to Choose?¶

There's no one-size-fits-all answer. Common criteria include:

  • Variance threshold: Keep PCs until cumulative explained variance hits a threshold (e.g., 90%); scikit-learn can apply this directly, as sketched below.
  • Eigenvalue cutoff (Kaiser rule): Retain PCs with eigenvalue ≥ 1, assuming unit-variance standardization.
  • Scree plot elbow: Choose components before the curve flattens.
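On the variance-threshold point: passing a float between 0 and 1 as n_components asks scikit-learn's PCA to keep just enough components to reach that cumulative explained-variance level. A minimal sketch on toy data:

```python
import numpy as np
from sklearn.decomposition import PCA

rng = np.random.default_rng(3)
X = rng.normal(size=(300, 10))        # toy data with 10 features

# A float in (0, 1) keeps just enough components to reach that
# cumulative explained-variance threshold
pca = PCA(n_components=0.90)
X_reduced = pca.fit_transform(X)

print(pca.n_components_)                       # number of components actually kept
print(pca.explained_variance_ratio_.cumsum())  # cumulative variance of the kept components
```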

Assumptions & Limitations¶

PCA has a few important assumptions and limitations that must be understood when applying it to real-world data:

  • Linearity: PCA works by finding directions in the data that linearly explain the most variance. This means it assumes that the important structure in your data lies along straight-line relationships between variables. If your data has complex, curved patterns (like concentric circles or spirals), PCA won’t capture that structure effectively. In such cases, methods like kernel PCA or t-SNE are better suited because they can handle nonlinear relationships.

  • Centering: Before applying PCA, you must subtract the mean from each feature, so that each feature has a mean of zero. This is called mean-centering. If you skip this step, PCA will not work correctly because the first principal component might point in the direction of the mean rather than the direction of maximum variance. Centering ensures that the principal components reflect the actual structure and spread of the data.

  • Scale dependency: PCA is sensitive to the relative scale of features. For example, if one feature is in centimeters and another is in kilometers, the larger-scale feature will dominate the principal components. To avoid this, features should be standardized - so that each has zero mean and unit variance. This makes sure that all features contribute equally when calculating variance and correlations.

Variants of PCA¶

  • Sparse PCA: This variation of PCA introduces sparsity into the principal components, meaning that each component is a combination of only a few original features. This makes the components easier to interpret, especially when dealing with high-dimensional data, such as in genetics or text analysis. It does so by adding a regularization term that pushes some weights in the eigenvectors to zero.

  • Kernel PCA: Kernel PCA extends traditional PCA to capture nonlinear relationships in the data. It uses the kernel trick to implicitly map the data into a higher-dimensional space where linear separation is possible. Then it performs PCA in that new space. This is useful for datasets where the structure lies along curved or nonlinear manifolds, such as image, speech, or complex biological data.

  • Incremental PCA: This is a memory-efficient version of PCA designed to work with large datasets or data that comes in chunks, such as streaming data. Instead of computing the decomposition on the full dataset all at once, it updates the principal components incrementally as new data arrives. This allows PCA to scale to large volumes of data that cannot fit entirely in memory.
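All three variants are available in scikit-learn. A brief sketch of how they are invoked, on toy random data and with illustrative parameter values:

```python
import numpy as np
from sklearn.decomposition import SparsePCA, KernelPCA, IncrementalPCA

rng = np.random.default_rng(4)
X = rng.normal(size=(500, 8))            # toy data

# Sparse PCA: the regularization strength alpha pushes some loadings to exactly zero
sparse = SparsePCA(n_components=3, alpha=1.0, random_state=0).fit(X)

# Kernel PCA: an RBF kernel lets the components follow nonlinear structure
kernel = KernelPCA(n_components=3, kernel="rbf").fit(X)

# Incremental PCA: fit in mini-batches, useful when the data does not fit in memory
inc = IncrementalPCA(n_components=3)
for batch in np.array_split(X, 5):
    inc.partial_fit(batch)

print(sparse.components_.shape)          # (3, 8); many entries will be exactly 0
print(inc.explained_variance_ratio_)
```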

Coding in Python¶

In [1]:
# Importing necessary libraries

import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()
In [2]:
# Loading dataset

data = pd.read_csv('California_Real_Estate.csv', sep=';')
df_real_estate = data.copy()  # Keeping a copy of original dataset
df_real_estate.head()
Out[2]:
   Building  Year of sale  Month of sale  Type of property  Property #  Area (ft.)   Price  Status
0         1        2005.0           11.0                 0        30.0         743  246173       1
1         1        2005.0           10.0                 0        29.0         756  246332       1
2         2        2007.0            7.0                 0         2.0         587  209281       1
3         2        2007.0           12.0                 0        31.0        1605  452667       1
4         1        2004.0           11.0                 0        49.0        1375  467083       1
In [3]:
# Checking for null

df_real_estate.isna().sum()
Out[3]:
Building             0
Year of sale        72
Month of sale       72
Type of property     0
Property #          60
Area (ft.)           0
Price                0
Status               0
dtype: int64
In [5]:
df_real_estate['Status'].value_counts()
Out[5]:
Status
1    195
0     72
Name: count, dtype: int64
In [4]:
df_real_estate[df_real_estate['Status'] == 1].isna().sum()
Out[4]:
Building            0
Year of sale        0
Month of sale       0
Type of property    0
Property #          0
Area (ft.)          0
Price               0
Status              0
dtype: int64
In [6]:
# Let's discard the rows with null values for now - since rows with Status 0 contain the nulls, we keep only rows with Status 1

df_real_estate_nonull = df_real_estate[df_real_estate['Status'] == 1]
df_real_estate_nonull.info()
<class 'pandas.core.frame.DataFrame'>
Index: 195 entries, 0 to 195
Data columns (total 8 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   Building          195 non-null    int64  
 1   Year of sale      195 non-null    float64
 2   Month of sale     195 non-null    float64
 3   Type of property  195 non-null    int64  
 4   Property #        195 non-null    float64
 5   Area (ft.)        195 non-null    int64  
 6   Price             195 non-null    int64  
 7   Status            195 non-null    int64  
dtypes: float64(3), int64(5)
memory usage: 13.7 KB
In [7]:
df_real_estate_nonull.describe()
Out[7]:
         Building  Year of sale  Month of sale  Type of property  Property #   Area (ft.)          Price  Status
count  195.000000    195.000000     195.000000        195.000000  195.000000   195.000000     195.000000   195.0
mean     2.564103   2006.333333       7.271795          0.035897   27.328205   899.835897  269434.564103     1.0
std      1.243434      1.072941       3.378674          0.186513   14.830832   261.484883   80129.054039     0.0
min      1.000000   2004.000000       1.000000          0.000000    1.000000   411.000000  117564.000000     1.0
25%      2.000000   2006.000000       4.000000          0.000000   15.000000   740.000000  212393.500000     1.0
50%      2.000000   2007.000000       8.000000          0.000000   27.000000   785.000000  243053.000000     1.0
75%      3.000000   2007.000000      10.000000          0.000000   39.000000  1075.500000  308550.500000     1.0
max      5.000000   2010.000000      12.000000          1.000000   59.000000  1943.000000  529317.000000     1.0
In [8]:
# Looking at above, we can see that we need to Standardize our data

scaler = StandardScaler()
df_re_nonull_std = scaler.fit_transform(df_real_estate_nonull)
In [10]:
# Now we can apply PCA - first without specifying n_components, since we want to find the required number of components

pca = PCA()
pca.fit_transform(df_re_nonull_std)
Out[10]:
array([[-0.60734881, -1.31926442,  0.57906783, ...,  0.25542486,
        -0.2337479 ,  0.        ],
       [-0.58105774, -1.34289556,  0.32392824, ...,  0.18273758,
        -0.199292  ,  0.        ],
       [-1.8398805 ,  0.1606641 , -0.29157809, ...,  0.71379278,
        -0.2905317 ,  0.        ],
       ...,
       [-1.03970034, -1.18485044, -0.6530342 , ...,  0.00559872,
        -0.19757116,  0.        ],
       [-0.76196533, -1.19268984, -0.64690188, ...,  0.00744995,
         0.28604396,  0.        ],
       [-0.82936359, -1.09937408, -0.7236547 , ...,  0.01692015,
        -0.20697981,  0.        ]])
In [11]:
pca.explained_variance_ratio_
Out[11]:
array([0.30724715, 0.28117078, 0.15166089, 0.12851171, 0.09233419,
       0.03075947, 0.00831583, 0.        ])
In [12]:
# Let's visualize the variance ratio explained by each component

plt.figure(figsize = (11,6))
components = ['Component 1','Component 2','Component 3','Component 4','Component 5','Component 6','Component 7','Component 8']
var_exp = pca.explained_variance_ratio_
plt.bar(components, var_exp)
plt.title('Explained variance by principal components')
plt.xlabel('Principal components')
plt.ylabel('Explained variance ratio')
plt.show()
[Figure: bar chart of explained variance by principal components]
In [13]:
# Now we will see the cumulative variance

plt.figure(figsize=(10,6))
plt.plot(range(1,9),pca.explained_variance_ratio_.cumsum(),marker='o', linestyle='--')
plt.title('Explained variance by components')
plt.xlabel('Number of components')
plt.ylabel('Cumulative explained variance')
plt.show()
[Figure: cumulative explained variance vs. number of components]
In [14]:
# From above, we can see that having 4 components is enough to explain more than 80% of variance
# Re-initialize PCA

pca = PCA(n_components=4)
pca.fit(df_re_nonull_std)
Out[14]:
PCA(n_components=4)
In [15]:
pca.components_
Out[15]:
array([[-0.01347829,  0.06534594, -0.06689355, -0.15099905,  0.39492744,
         0.63949973,  0.63509464,  0.        ],
       [ 0.61576753,  0.62980998,  0.02208123, -0.4114363 ,  0.15693164,
        -0.12506318, -0.11888648,  0.        ],
       [ 0.2675697 , -0.06682334,  0.90927776,  0.22605443, -0.12937694,
         0.11386435,  0.1278709 ,  0.        ],
       [-0.26715037, -0.33315704,  0.3274807 , -0.55432795,  0.56219835,
        -0.20000856, -0.21689532, -0.        ]])
In [16]:
# let's create dataframe showing how much each original feature contributes to each of the first four principal components

df_pca_comp = pd.DataFrame(data=pca.components_,
                           columns=df_real_estate.columns.values,
                           index=['Component 1','Component 2','Component 3','Component 4'])
df_pca_comp
Out[16]:
             Building  Year of sale  Month of sale  Type of property  Property #  Area (ft.)     Price  Status
Component 1 -0.013478      0.065346      -0.066894         -0.150999    0.394927    0.639500  0.635095     0.0
Component 2  0.615768      0.629810       0.022081         -0.411436    0.156932   -0.125063 -0.118886     0.0
Component 3  0.267570     -0.066823       0.909278          0.226054   -0.129377    0.113864  0.127871     0.0
Component 4 -0.267150     -0.333157       0.327481         -0.554328    0.562198   -0.200009 -0.216895    -0.0
In [17]:
# Let's visualize above in heatmap

sns.heatmap(df_pca_comp,
           vmin=-1,
           vmax=1,
           cmap='RdBu',
           annot=True)

plt.yticks([0,1,2,3],
          ['Component 1','Component 2','Component 3','Component 4'],
          rotation=45,
          fontsize=9)

plt.show()
[Figure: heatmap of feature loadings for Components 1–4]

PC1 is highly correlated with area and price, which have the highest loadings in that row. Thus, the first component represents the property characteristics that are most important for sales.

PC2 is highly correlated with building and year of sale, and has a moderately strong negative correlation with type of property. Thus, we can say that component 2 reflects building characteristics.

PC3 is very highly correlated with month of sale, which suggests it captures seasonality in sales.

PC4 has its strongest negative loading on type of property and its strongest positive loading on property number.

In [19]:
df_re_nonull_std_pca = pca.transform(df_re_nonull_std)
In [27]:
# Let's view original data and PCA projected data

plt.figure(figsize=(12, 5))

# Plot original standardized data (first two features)
plt.subplot(1, 2, 1)
plt.scatter(df_re_nonull_std[:, 0], df_re_nonull_std[:, 1], alpha=0.6)
plt.title("Original Data (first 2 features)")
plt.xlabel("Feature 1")
plt.ylabel("Feature 2")

# Plot PCA-projected data
plt.subplot(1, 2, 2)
plt.scatter(df_re_nonull_std_pca[:, 0], df_re_nonull_std_pca[:, 1], alpha=0.6, color='orange')
plt.title("PCA Projection (2D)")
plt.xlabel("PC1")
plt.ylabel("PC2")

plt.tight_layout()
plt.show()
[Figure: original data (first two features) vs. PCA projection (PC1 vs PC2)]
In [28]:
# Using simple KMeans to illustrate one way to use the PCA-transformed data

from sklearn.cluster import KMeans

kmeans = KMeans(n_clusters=4, random_state=42)
clusters = kmeans.fit_predict(df_re_nonull_std_pca)
In [29]:
plt.figure(figsize=(14, 6))

# PC1 vs PC2
plt.subplot(2, 2, 1)
plt.scatter(df_re_nonull_std_pca[:, 0], df_re_nonull_std_pca[:, 1], c=clusters, cmap='Set1', alpha=0.7)
plt.xlabel("PC1")
plt.ylabel("PC2")
plt.title("PC1 vs PC2")

# PC1 vs PC3
plt.subplot(2, 2, 2)
plt.scatter(df_re_nonull_std_pca[:, 0], df_re_nonull_std_pca[:, 2], c=clusters, cmap='Set1', alpha=0.7)
plt.xlabel("PC1")
plt.ylabel("PC3")
plt.title("PC1 vs PC3")

# PC1 vs PC4
plt.subplot(2, 2, 3)
plt.scatter(df_re_nonull_std_pca[:, 0], df_re_nonull_std_pca[:, 3], c=clusters, cmap='Set1', alpha=0.7)
plt.xlabel("PC1")
plt.ylabel("PC4")
plt.title("PC1 vs PC4")

# PC3 vs PC4
plt.subplot(2, 2, 4)
plt.scatter(df_re_nonull_std_pca[:, 2], df_re_nonull_std_pca[:, 3], c=clusters, cmap='Set1', alpha=0.7)
plt.xlabel("PC3")
plt.ylabel("PC4")
plt.title("PC3 vs PC4")

plt.tight_layout()
plt.show()
[Figure: pairwise scatter plots of PC1–PC4 colored by K-means cluster]

The graph shows pairwise scatter plots of the first four principal components (PC1 to PC4) after applying PCA to the dataset, with colors indicating the cluster assignments from K-means clustering.

  • PC1 is clearly the most important component: It creates visible separation across almost all plots, especially in PC1 vs PC2 and PC1 vs PC3. Most of the cluster separation happens along PC1's axis.
  • Clusters are visually separable: The red, brown, and purple clusters form distinct regions, suggesting meaningful grouping in the reduced PCA space.
  • PC3 and PC4 capture subtler variance: These components show some spread but contribute less to the separation. PC3 vs PC4 shows overlap, meaning they add less discriminative power.
  • Outliers: A few gray points lie far from the main clusters, possibly indicating noise or outliers that don't fit into any cluster well.