What are Outliers?¶
Outliers are data points that differ significantly from other observations in a dataset. These deviations may occur due to variability in the data, measurement error, or rare occurrences.
Statistically, outliers distort summary statistics (such as the mean, variance, and correlation) and can severely degrade model performance, particularly for models sensitive to distances or distributional assumptions.
Causes of Outliers¶
Data Entry Errors - Manual input mistakes or system glitches can create unrealistic values. Example: Typing 10000 instead of 1000, or misplacing a decimal (e.g., 1.2 vs. 12).
Measurement Errors - Sensors or instruments may malfunction or lose calibration. Example: A faulty temperature sensor recording 300°C in a normal room.
Data Processing Errors - Errors during merging, encoding, or scaling may introduce unexpected values. Example: Duplicated rows or improper imputation.
Sampling Issues - Non-representative or biased sampling may include rare but valid extremes. Example: Surveying only high-income individuals in a consumer study.
Natural Variation - Some outliers are genuine and reflect real-world extremes. Example: Olympic athletes or billionaires.
Environmental or Contextual Events - Temporary external factors can cause anomalies. Example: Sudden spikes in web traffic during a product launch.
Fraud or Adversarial Behavior - Intentional manipulation of data may produce abnormal patterns. Example: Credit card fraud, fake reviews.
Multivariate Anomalies - A value may be normal by itself but unusual in combination with others. Example: A 10-year-old earning $100,000/year.
Concept Drift or Source Changes - Data collected from different systems, times, or populations may not align. Example: Combining datasets from different regions or years.
Understanding the cause helps decide whether to remove, transform, or retain outliers, ensuring better model accuracy and interpretability.
Consequences of Not Treating Outliers¶
- Model Bias: Predictive models may generalize poorly.
- Incorrect Coefficients: Particularly in regression.
- Misleading Insights: In analytics, outliers might mask or exaggerate trends.
- Clustering Failures: K-means will cluster poorly due to centroid shifts.
- Poor Feature Scaling: Standardization and normalization become ineffective.
Implications of Outliers in Machine Learning Models¶
Outliers can skew the data distribution, bias parameter estimation, and reduce model accuracy. Different models react differently:
Model-wise Explanation¶
| Model | Affected? | Explanation |
| --- | --- | --- |
| Linear Regression | Yes | Linear regression minimizes squared errors: $\min \sum (y_i - \hat{y}_i)^2$. Outliers have large residuals, leading to disproportionately high squared errors. |
| Logistic Regression | Yes | Logistic models use log-odds: $\log \left( \frac{p}{1-p} \right) = \beta_0 + \beta_1 x$. Outliers can mislead the estimation of coefficients. |
| Decision Tree | No | Trees split data by feature thresholds using criteria like Gini or Entropy, unaffected by extreme values. |
| Random Forest | No | Ensemble of decision trees, hence robust to outliers. |
| Gradient Boosting | No | Also based on trees; outliers have little effect unless overfitting occurs. |
| K-Nearest Neighbors (KNN) | Yes | KNN uses distance metrics (e.g., Euclidean): $d = \sqrt{\sum(x_i - x_j)^2}$. Outliers drastically affect neighbor computation. |
| Support Vector Machines (SVM) | Yes | SVM tries to maximize the margin. Outliers may change margin placement and support vectors. |
| Naive Bayes | Yes | Based on probability density estimation; outliers shift the mean and variance in Gaussian NB. |
| K-Means Clustering | Yes | Uses centroid-based distance minimization. Outliers can pull centroids away: $\min \sum \|x_i - \mu_k\|^2$. |
| Hierarchical Clustering | Depends | Sensitive if using single linkage; less so with complete linkage or Ward's method. |
| Non-Negative Matrix Factorization | Yes | Matrix factorization assumes parts-based reconstruction. Outliers disturb convergence due to reconstruction error. |
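To make this contrast concrete, here is a minimal sketch (synthetic data and arbitrary parameter choices, purely for illustration) that fits a linear regression and a decision tree on the same points, with and without a single injected extreme target value; the tree's prediction barely moves, while the linear fit shifts noticeably.

import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor

rng = np.random.default_rng(42)
X = np.linspace(0, 10, 50).reshape(-1, 1)
y = 3 * X.ravel() + rng.normal(0, 1, 50)          # clean, roughly linear data

X_out = np.vstack([X, [[10.0]]])                  # duplicate x at the edge...
y_out = np.append(y, 500.0)                       # ...paired with an extreme target value

for name, model in [("LinearRegression", LinearRegression()),
                    ("DecisionTree", DecisionTreeRegressor(max_depth=3, random_state=0))]:
    clean_pred = model.fit(X, y).predict([[9.0]])[0]
    dirty_pred = model.fit(X_out, y_out).predict([[9.0]])[0]
    print(f"{name}: prediction at x=9 -> clean {clean_pred:.1f}, with outlier {dirty_pred:.1f}")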
Outlier Detection Techniques¶
1. Box Plot¶
A box plot is a standardized way of displaying the distribution of data based on a five-number summary: minimum, first quartile (Q1), median (Q2), third quartile (Q3), and maximum.
Theoretical Foundation:¶
Key Elements:
- Q1 (25th percentile): Lower quartile
- Q3 (75th percentile): Upper quartile
- IQR: $Q3 - Q1$
- Lower Fence: $Q1 - 1.5 \times IQR$ (Lower Bound)
- Upper Fence: $Q3 + 1.5 \times IQR$ (Upper Bound)
The Interquartile Range (IQR) is the range between the 75th percentile (Q3) and the 25th percentile (Q1) of the data:
$$ \text{IQR} = Q3 - Q1 $$
Using the IQR, we define the "whiskers" of the box plot. Any data point outside these bounds or fences is considered an outlier.
Why It Works: IQR is robust and not affected by extreme values, unlike mean/standard deviation.
Interpretation:¶
- Box plots are particularly useful for univariate detection.
- They allow visual comparison across different groups.
- Outliers are shown as individual points beyond the whiskers.
It helps visualize:
- Central tendency
- Spread
- Skewness
- Potential outliers (left/right extremes)
Example:¶
Imagine a dataset of house prices in a city. Most prices range between ₹30 lakhs and ₹1 crore. A few properties priced at ₹20 crores would appear as outliers in a box plot.
2. Violin Plot¶
A violin plot is an advanced version of the box plot that combines the box plot with a kernel density plot.
Theoretical Foundation:¶
The violin plot shows the probability density of the data at different values, revealing the shape of the distribution.
It includes:
- A central box plot (showing quartiles and median)
- A mirrored KDE (kernel density estimate) that looks like a violin

Together these show the distribution shape, central value, and outliers:
- Peaks indicate concentration; tails indicate sparsity.
- Unlike box plots, violin plots reveal bimodal or skewed patterns.
Useful for comparing distributions and spotting outliers within multimodal data.
Usefulness in Outlier Detection:¶
- Highlights multi-modal distributions (multiple peaks)
- Indicates density and spread
- Helps identify not just the extremities but also skewness, clumping, and low-density regions, where outliers may be present
Example:¶
In biological data such as blood pressure readings, a violin plot might show dense clusters at typical values but reveal outliers as long tails on either side.
3. Z-Score Method¶
The Z-score measures how many standard deviations a data point is from the mean.
Mathematical Formula:¶
$$ Z = \frac{X - \mu}{\sigma} $$
Where:
- $X$ is a data point
- $\mu$ is the mean of the dataset
- $\sigma$ is the standard deviation
Interpretation:¶
- If $|Z| > 3$, typically treated as an outlier (assuming normality).
- The threshold can vary depending on the domain (e.g., 2.5, 3, 4).
Assumptions:¶
- The data should follow a normal (Gaussian) distribution.
- Z-score is sensitive to skewed data, so it may not perform well with non-normal data.
- Z-score is influenced by mean and standard deviation, hence sensitive to other outliers.
Practical Scenario:¶
In credit score analysis, a Z-score of +4 for an individual’s debt-to-income ratio might indicate abnormal behavior compared to the population average.
4. Percentile Method¶
A non-parametric method that uses percentiles to trim extremes; it generalizes the IQR method by allowing custom percentiles as the thresholds.
Theoretical Description:¶
- Rather than relying strictly on Q1 and Q3, one may choose other percentiles such as 5th and 95th.
- Data outside these percentile boundaries are considered outliers.
Formula (generalized):¶
$$ \text{Outlier bounds} = [P_{\text{low}}, P_{\text{high}}] $$
When to Use:¶
- Works well when the distribution is non-normal or skewed.
- Offers flexibility in defining what constitutes an outlier.
Example:¶
In sales data, you may want to remove the bottom 1% and top 1% values, which might correspond to one-off errors or rare, non-representative sales.
5. Isolation Forest¶
An ensemble-based algorithm designed specifically for unsupervised anomaly detection.
Theory Behind It:¶
- The algorithm works by randomly selecting a feature and then randomly selecting a split value between the maximum and minimum values.
- The process continues recursively until the data point is isolated.
- Outliers are isolated faster, meaning they have shorter path lengths in the tree.
Mathematical Insight: Expected path length $E(h(x))$ is shorter for anomalies. Outlier score:
$$ s(x, n) = 2^{-\frac{E(h(x))}{c(n)}} $$
Where $c(n)$ is the average path length of an unsuccessful search in a binary search tree, used to normalize $h(x)$.
Score:¶
Each data point is assigned an anomaly score. Points with a high anomaly score are considered outliers.
Advantages:¶
- Scalable to large datasets
- Works well with high-dimensional data
- No assumptions about the distribution of data
Example:¶
Used for fraud detection in transactional data, where outliers are unusual spending patterns.
6. DBSCAN (Density-Based Spatial Clustering of Applications with Noise)¶
A clustering algorithm that groups together densely packed points and labels points in low-density regions as outliers.
Theory:¶
Key Concepts:
- ε (epsilon): Radius for neighborhood
- MinPts: Minimum number of points to form a dense region
- Points not belonging to any cluster are labeled as noise, i.e., outliers.
Best for spatial data or nonlinear clusters.
DBSCAN classifies points as:
- Core points: have at least `min_samples` points within an `eps` radius
- Border points: within `eps` of a core point but don't satisfy `min_samples`
- Noise points: not within `eps` of any core point

Noise points are considered outliers.
Use Cases:¶
Ideal for geospatial, market basket, or spatial clustering problems where density defines normality.
Limitation:¶
Sensitive to `eps` and `min_samples`. Performance can degrade if parameter tuning is not done carefully.
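As a concrete illustration, here is a minimal sketch on synthetic 2-D data (the `eps` and `min_samples` values are arbitrary choices for this toy example) that treats DBSCAN's noise label (-1) as the outlier flag.

import numpy as np
from sklearn.cluster import DBSCAN

rng = np.random.default_rng(0)
cluster = rng.normal(loc=0.0, scale=0.5, size=(200, 2))          # one dense cluster
stragglers = np.array([[5.0, 5.0], [-4.0, 6.0], [6.0, -5.0]])    # isolated points
X = np.vstack([cluster, stragglers])

# eps and min_samples are the two parameters discussed above
labels = DBSCAN(eps=0.5, min_samples=5).fit_predict(X)
outliers = X[labels == -1]
print(f"{len(outliers)} points labeled as noise (outliers)")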
Outlier Treatment Methods¶
1. Removal¶
The simplest treatment is to remove outliers from the dataset.
Justification:¶
- Appropriate when outliers are due to errors, noise, or irrelevance.
- Especially helpful in linear models where extreme values have disproportionate impact.
However, not advised for small datasets or if outliers carry signal (e.g., fraud detection).
Caveat:¶
- Can lead to information loss, especially if outliers represent rare but important phenomena.
- Always assess the context before dropping data.
Example:¶
A medical device records a heart rate of 750 bpm. Clearly, it's a measurement error and can be dropped.
2. Winsorization¶
Winsorization is a transformation technique that caps extreme values at specified percentile thresholds, limiting the effect of possibly spurious outliers.
How It Works:¶
- Replace the smallest and largest values with values at a specified percentile.
- For example, cap values below the 5th percentile and above the 95th percentile.
Effect:¶
- Preserves the dataset size (unlike removal).
- Reduces skewness and the influence of extreme values while keeping the overall shape of the distribution intact.
Example:¶
In survey income data, values above the 99th percentile can be winsorized to avoid income skew dominating mean-based analysis.
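For reference, SciPy also ships a ready-made routine; the snippet below is a small sketch on a made-up income array, capping the lowest and highest 10% of values (the array and limits are illustrative, and a hand-rolled version appears in the coding section later).

import numpy as np
from scipy.stats.mstats import winsorize

income = np.array([28, 31, 35, 36, 40, 42, 45, 48, 52, 900])   # 900 is an extreme value
capped = winsorize(income, limits=[0.1, 0.1])   # cap the bottom and top 10% at the nearest retained value
print(capped)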
Additional Outlier Detection Techniques¶
1. Mahalanobis Distance¶
A multivariate distance metric that considers correlation between variables.
Formula:¶
$$ D_M(x) = \sqrt{(x - \mu)^T \Sigma^{-1} (x - \mu)} $$
Where:
- $x$ is the vector of values
- $\mu$ is the mean vector
- $\Sigma$ is the covariance matrix
Points with $D_M^2 > \chi^2_{p,\,\alpha}$ (the chi-squared critical value with $p$ degrees of freedom at significance level $\alpha$) are flagged as multivariate outliers.
Use:¶
Useful in multivariate outlier detection where individual variables may not appear abnormal but combinations are. Best for detecting outliers in multivariate Gaussian data.
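A minimal sketch of the formula above on synthetic correlated data (the sample size, covariance, and the 97.5% chi-squared cutoff are illustrative assumptions):

import numpy as np
from scipy.stats import chi2

rng = np.random.default_rng(1)
X = rng.multivariate_normal(mean=[0, 0], cov=[[1, 0.8], [0.8, 1]], size=300)
X = np.vstack([X, [[3.0, -3.0]]])          # each value is plausible alone, unusual in combination

mu = X.mean(axis=0)
cov_inv = np.linalg.inv(np.cov(X, rowvar=False))
diff = X - mu
d_squared = np.einsum('ij,jk,ik->i', diff, cov_inv, diff)   # squared Mahalanobis distances

threshold = chi2.ppf(0.975, df=X.shape[1])   # chi-squared critical value, p = 2 features
print(np.where(d_squared > threshold)[0])    # indices flagged as multivariate outliers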
2. Robust Mahalanobis Distance¶
This improves on traditional Mahalanobis by using robust estimators for the mean and covariance.
Why Needed:¶
- Classical Mahalanobis is sensitive to outliers (ironically).
- Robust methods like Minimum Covariance Determinant (MCD) ensure more stable results.
Prevents distortion by existing outliers while computing $\mu$ and $\Sigma$.
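A hedged sketch of the same idea using scikit-learn's `MinCovDet` (MCD) estimator; the synthetic data and the chi-squared cutoff mirror the previous example.

import numpy as np
from scipy.stats import chi2
from sklearn.covariance import MinCovDet

rng = np.random.default_rng(2)
X = rng.multivariate_normal(mean=[0, 0], cov=[[1, 0.8], [0.8, 1]], size=300)
X = np.vstack([X, [[3.0, -3.0]], [[4.0, -4.0]]])   # a few contaminating points

mcd = MinCovDet(random_state=0).fit(X)     # robust location and covariance via MCD
robust_d_squared = mcd.mahalanobis(X)      # squared distances based on the robust estimates

threshold = chi2.ppf(0.975, df=X.shape[1])
print(np.where(robust_d_squared > threshold)[0])   # flagged multivariate outliers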
3. Algorithm-Based Detection¶
1. K-Means Clustering¶
Outliers are identified based on their distance from cluster centroids.
Method:
- After clustering, calculate distance of each point to its cluster centroid.
- Points with exceptionally high distances are labeled outliers.
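The sketch below applies this recipe with scikit-learn on synthetic data; the two-cluster setup and the 99th-percentile distance cutoff are illustrative choices, not fixed rules.

import numpy as np
from sklearn.cluster import KMeans

rng = np.random.default_rng(3)
X = np.vstack([rng.normal(0, 0.5, size=(150, 2)),
               rng.normal(5, 0.5, size=(150, 2)),
               [[10.0, -10.0]]])                    # one far-away point

km = KMeans(n_clusters=2, n_init=10, random_state=0).fit(X)
# Distance of each point to the centroid of its assigned cluster
dist = np.linalg.norm(X - km.cluster_centers_[km.labels_], axis=1)

cutoff = np.quantile(dist, 0.99)           # flag the top 1% largest distances
print(np.where(dist > cutoff)[0])          # candidate outliers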
2. Hierarchical Clustering¶
In dendrograms, outliers may:
- Appear as singleton branches.
- Be merged last into clusters.
These can be visually detected or cut off based on distance thresholds.
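A small sketch with SciPy's hierarchical clustering utilities; the Ward linkage, the cut height, and the synthetic data are assumptions chosen only to make a singleton branch visible.

import numpy as np
from scipy.cluster.hierarchy import linkage, fcluster

rng = np.random.default_rng(4)
X = np.vstack([rng.normal(0, 0.5, size=(100, 2)), [[8.0, 8.0]]])   # one isolated point

Z = linkage(X, method='ward')                        # build the dendrogram
labels = fcluster(Z, t=5.0, criterion='distance')    # cut it at an illustrative height

# Clusters containing a single point are candidate outliers
unique, counts = np.unique(labels, return_counts=True)
singletons = unique[counts == 1]
print(np.where(np.isin(labels, singletons))[0])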
Robust Models (Outlier-Insensitive)¶
- Decision Trees: Split by condition, not affected by magnitude.
- Random Forests: Ensemble reduces variance.
- Gradient Boosting: Also tree-based; splits depend on feature thresholds rather than magnitudes.
- Ensemble Methods: Aggregation lowers sensitivity.
Relationship Between Loss Functions and Outliers¶
What is a Loss Function?¶
A loss function quantifies the difference between the predicted value ($\hat{y}$) and the actual/true value ($y$). It plays a critical role in training supervised machine learning models by guiding the optimization process (e.g., via gradient descent) to minimize prediction errors.
Outliers, being extreme data points, produce larger errors. Since most loss functions are sensitive to the magnitude of these errors, they can distort model training, especially when the loss function gives higher weight to large deviations. The sensitivity varies depending on the mathematical formulation of the loss function.
Common Loss Functions and Their Sensitivity to Outliers¶
Mean Squared Error (MSE)¶
$$ \text{MSE} = \frac{1}{n} \sum_{i=1}^{n} (y_i - \hat{y}_i)^2 $$
- Behavior: Squaring error magnifies the effect of large residuals.
- Sensitivity: Very sensitive to outliers.
- Use case: When outliers are minimal and data is normally distributed.
Mean Absolute Error (MAE)¶
$$ \text{MAE} = \frac{1}{n} \sum_{i=1}^{n} |y_i - \hat{y}_i| $$
- Behavior: Penalizes all deviations linearly.
- Sensitivity: Less sensitive to outliers than MSE.
- Use case: Preferred when outliers exist, and equal penalty is desired.
Huber Loss¶
$$ \text{Huber}(y, \hat{y}) = \begin{cases} \frac{1}{2}(y - \hat{y})^2 & \text{for } |y - \hat{y}| \leq \delta \\ \delta \cdot (|y - \hat{y}| - \frac{1}{2}\delta) & \text{otherwise} \end{cases} $$
- Behavior: Quadratic for small errors, linear for large ones.
- Sensitivity: Robust, combines benefits of MSE and MAE.
- Use case: Useful when you want to tolerate small errors but reduce the influence of large outliers.
Quantile Loss (Pinball Loss)¶
$$ L(y, \hat{y}) = \begin{cases} \tau (y - \hat{y}) & \text{if } y \geq \hat{y} \\ (1 - \tau)(\hat{y} - y) & \text{if } y < \hat{y} \end{cases} $$
- Behavior: Focuses on the distribution’s percentiles.
- Sensitivity: Robust to outliers depending on the quantile.
- Use case: When modeling asymmetric or extreme behavior (e.g., forecasting high-risk events).
Log-Cosh Loss¶
$$ L(y, \hat{y}) = \sum \log(\cosh(\hat{y} - y)) $$
- Behavior: Similar to MSE for small errors but less affected by large errors.
- Sensitivity: Lower than MSE; smooth and differentiable everywhere.
- Use case: A smooth alternative to Huber for robust regression.
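To make these sensitivity differences tangible, here is a small sketch that evaluates each loss, written as a direct NumPy translation of the formulas above (with $\delta = 1$ for Huber), on a typical residual and an outlier-sized one.

import numpy as np

def mse(r):
    return r ** 2

def mae(r):
    return abs(r)

def huber(r, delta=1.0):
    # Quadratic inside the delta band, linear outside
    return 0.5 * r ** 2 if abs(r) <= delta else delta * (abs(r) - 0.5 * delta)

def log_cosh(r):
    return float(np.log(np.cosh(r)))

for r in (1.0, 10.0):
    print(f"residual {r:>5}: MSE={mse(r):7.2f}  MAE={mae(r):5.2f}  "
          f"Huber={huber(r):5.2f}  log-cosh={log_cosh(r):5.2f}")

For a residual of 10, the squared penalty is 100, while MAE, Huber, and log-cosh all stay near 10, which is why a single large outlier can dominate MSE-driven training but has a bounded influence on the robust losses.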
Why This Matters¶
When outliers are not addressed, highly sensitive loss functions (like MSE) will lead to models that overfit the outliers and perform poorly on general data. This results in:
- Skewed model coefficients (especially in linear models and distance-based models),
- Reduced generalization ability,
- Poor performance on unseen or clean data.
Let's do some Coding¶
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import scipy.stats
import warnings
warnings.filterwarnings('ignore')
df = pd.read_csv("clv_data.csv")
Outlier Detection - Box Plot
sns.boxplot(df['purchases'])
def extract_outliers_from_boxplot(array):
    # First and third quartiles
    iqr_q1 = np.quantile(array, 0.25)
    iqr_q3 = np.quantile(array, 0.75)
    # Interquartile range
    iqr = iqr_q3 - iqr_q1
    # Upper and lower whiskers (fences)
    upper_bound = iqr_q3 + (1.5 * iqr)
    lower_bound = iqr_q1 - (1.5 * iqr)
    # Points beyond the fences are flagged as outliers
    outliers = array[(array <= lower_bound) | (array >= upper_bound)]
    return outliers
print('Outliers within the box plot are :')
extract_outliers_from_boxplot(df['purchases'])
Outliers within the box plot are :
47 5 104 5 142 5 301 5 323 5 485 6 486 5 1026 5 1104 6 1112 5 1120 6 1125 5 1374 5 1504 5 1623 5 1669 6 1670 5 1809 6 1818 5 1836 5 1870 5 2180 5 2463 6 2548 5 2572 5 2605 5 2717 5 2901 5 3032 6 3080 5 3105 5 3162 5 3170 5 3291 5 3298 5 3321 5 3361 5 3380 5 3410 5 3566 5 3603 6 3631 6 3835 5 3848 5 4003 6 4141 5 4334 5 4346 5 4545 5 4597 5 4611 5 4620 5 4662 5 4691 5 4728 5 4751 5 4761 5 4895 5 4958 5 Name: purchases, dtype: int64
Outlier Detection - Violin Plot
plt.violinplot(df['purchases'])
plt.show()
Outlier Detection - Percentile and Z-Score
purchases = df['purchases']
def percentile_outliers(array, lower_bound_perc, upper_bound_perc):
    # Percentile fences computed on the series passed in (not a hard-coded column)
    upper_bound = np.percentile(array, upper_bound_perc)
    lower_bound = np.percentile(array, lower_bound_perc)
    outliers = array[(array <= lower_bound) | (array >= upper_bound)]
    return outliers
def z_score_outliers(array, z_score_lower, z_score_upper):
    # Standardize the series and flag points beyond the supplied z-score thresholds
    z_scores = scipy.stats.zscore(array)
    outliers = (z_scores < z_score_lower) | (z_scores > z_score_upper)
    return array[outliers]
outliers = percentile_outliers(df['purchases'],
upper_bound_perc = 99,
lower_bound_perc = 1)
z_score_outliers(df['purchases'],
z_score_lower = -1.96,
z_score_upper = 1.96)[:10]
28 4 47 5 51 4 67 4 74 4 96 4 104 5 117 4 142 5 147 4 Name: purchases, dtype: int64
Outlier Detection - Isolation Forest
from sklearn.ensemble import IsolationForest
features = ['age','income','days_on_platform','purchases']
## We'll do a simple drop null for now
df = df.dropna()
## Create a training-test set
X = df[features]
X_train = X[:4000]
X_test = X[4000:]
## Fit Model
clf = IsolationForest(n_estimators=50, max_samples=100)
clf.fit(X_train)
## Get Scores
df['scores'] = clf.decision_function(X)
df['anomaly'] = clf.predict(X)
## Get Anomalies
outliers=df.loc[df['anomaly']==-1]
outliers[:10]
index | Unnamed: 0 | id | age | gender | income | days_on_platform | city | purchases | scores | anomaly |
---|---|---|---|---|---|---|---|---|---|---|
9 | 9 | 9 | 49.0 | Female | 76842 | 19.0 | Tokyo | 2 | -0.028500 | -1 |
15 | 15 | 15 | 31.0 | Female | 226249 | 20.0 | Miami | 0 | -0.041933 | -1 |
17 | 17 | 17 | 27.0 | Female | 177582 | 2.0 | London | 0 | -0.025880 | -1 |
18 | 18 | 18 | 10.0 | Female | 260 | 32.0 | San Francisco | 0 | -0.055640 | -1 |
23 | 23 | 23 | 10.0 | Female | 108804 | 5.0 | Tokyo | 2 | -0.018705 | -1 |
25 | 25 | 25 | 46.0 | Female | 112992 | 9.0 | London | 3 | -0.054900 | -1 |
40 | 40 | 40 | 31.0 | Male | 138533 | 20.0 | New York City | 3 | -0.032015 | -1 |
44 | 44 | 44 | 36.0 | Male | 1062 | 28.0 | Tokyo | 2 | -0.006741 | -1 |
47 | 47 | 47 | 34.0 | Male | 9866 | 33.0 | London | 5 | -0.116212 | -1 |
50 | 50 | 50 | 36.0 | Male | 255965 | 22.0 | Tokyo | 1 | -0.065819 | -1 |
Outlier Treatment - Removal method
def z_score_removal(df, column, lower_z_score, upper_z_score):
    # Compute z-scores for the chosen column and drop rows outside the thresholds
    z_scores = scipy.stats.zscore(df[column])
    outliers = (z_scores > upper_z_score) | (z_scores < lower_z_score)
    return df[~outliers]
def percentile_removal(df, column, lower_bound_perc, upper_bound_perc):
    # Compute percentile fences on the chosen column and drop rows outside them
    col_df = df[column]
    upper_bound = np.percentile(col_df, upper_bound_perc)
    lower_bound = np.percentile(col_df, lower_bound_perc)
    outliers = (col_df > upper_bound) | (col_df < lower_bound)
    return df[~outliers]
filtered_df = z_score_removal(df, 'purchases', -1.96, 1.96)
percentile_removal(df, 'purchases', lower_bound_perc = 1, upper_bound_perc = 99)[:10]
index | Unnamed: 0 | id | age | gender | income | days_on_platform | city | purchases | scores | anomaly |
---|---|---|---|---|---|---|---|---|---|---|
3 | 3 | 3 | 29.0 | Male | 43791 | 28.0 | London | 2 | 0.034956 | 1 |
4 | 4 | 4 | 18.0 | Female | 132181 | 26.0 | London | 2 | 0.002514 | 1 |
9 | 9 | 9 | 49.0 | Female | 76842 | 19.0 | Tokyo | 2 | -0.028500 | -1 |
23 | 23 | 23 | 10.0 | Female | 108804 | 5.0 | Tokyo | 2 | -0.018705 | -1 |
25 | 25 | 25 | 46.0 | Female | 112992 | 9.0 | London | 3 | -0.054900 | -1 |
29 | 29 | 29 | 43.0 | Male | 70598 | 15.0 | London | 2 | 0.021325 | 1 |
38 | 38 | 38 | 27.0 | Female | 19003 | 25.0 | San Francisco | 2 | 0.009190 | 1 |
40 | 40 | 40 | 31.0 | Male | 138533 | 20.0 | New York City | 3 | -0.032015 | -1 |
44 | 44 | 44 | 36.0 | Male | 1062 | 28.0 | Tokyo | 2 | -0.006741 | -1 |
47 | 47 | 47 | 34.0 | Male | 9866 | 33.0 | London | 5 | -0.116212 | -1 |
Outlier Treatment - Winsorize
def winsorize(df, column, upper, lower):
    # Percentile caps for the chosen column (note: modifies df in place)
    perc_upper = np.percentile(df[column], upper)
    perc_lower = np.percentile(df[column], lower)
    # Cap values above the upper percentile
    df[column] = np.where(df[column] >= perc_upper,
                          perc_upper,
                          df[column])
    # Floor values below the lower percentile
    df[column] = np.where(df[column] <= perc_lower,
                          perc_lower,
                          df[column])
    return df
winsorize(df, 'purchases', 97.5, 0.025)[:10]
index | Unnamed: 0 | id | age | gender | income | days_on_platform | city | purchases | scores | anomaly |
---|---|---|---|---|---|---|---|---|---|---|
2 | 2 | 2 | 24.0 | Male | 104723 | 34.0 | London | 1.0 | 0.036205 | 1 |
3 | 3 | 3 | 29.0 | Male | 43791 | 28.0 | London | 2.0 | 0.034956 | 1 |
4 | 4 | 4 | 18.0 | Female | 132181 | 26.0 | London | 2.0 | 0.002514 | 1 |
5 | 5 | 5 | 23.0 | Male | 12315 | 14.0 | New York City | 0.0 | 0.030462 | 1 |
8 | 8 | 8 | 46.0 | Male | 129157 | 23.0 | New York City | 0.0 | 0.030737 | 1 |
9 | 9 | 9 | 49.0 | Female | 76842 | 19.0 | Tokyo | 2.0 | -0.028500 | -1 |
12 | 12 | 12 | 12.0 | Male | 130521 | 12.0 | London | 1.0 | 0.022177 | 1 |
15 | 15 | 15 | 31.0 | Female | 226249 | 20.0 | Miami | 0.0 | -0.041933 | -1 |
16 | 16 | 16 | 19.0 | Female | 51434 | 18.0 | New York City | 0.0 | 0.049889 | 1 |
17 | 17 | 17 | 27.0 | Female | 177582 | 2.0 | London | 0.0 | -0.025880 | -1 |