Socio-economic Factors for Geographic Clustering¶


Context¶


The study of socio-economic factors is foundational to understanding and shaping the future of societies, and it is therefore of great interest to government and non-government institutions alike. While GDP is one of the most commonly cited measures in popular economic discourse, it is not the only indicator of the growth and state of an economy. This case study takes a deep dive into one such dataset, which contains various socio-economic attributes for countries around the world.


Objective¶


To identify whether there exist clusters of countries that are more similar to each other than to the rest, in terms of certain socio-economic factors.


Data Dictionary¶


The data has the following attributes:

  • country: Name of the country
  • child_mort: Deaths of children under 5 years of age per 1000 live births
  • exports: Exports of goods and services, given as a % of GDP per capita
  • health: Total spending on health, given as a % of GDP
  • imports: Value of imports, given as a % of GDP per capita
  • income: Net income per person
  • inflation: Inflation rate (%)
  • life_expec: Average life expectancy in years
  • total_fer: Fertility rate, i.e., the average number of children per woman in the country
  • gdpp: GDP per capita

We will not use GDP per capita ('gdpp') for clustering. Instead, we will try to understand how the other factors vary with GDP across the groups we obtain.

Importing the libraries and overview of the dataset¶

Note: Please make sure you have installed the scikit-learn-extra library (imported as sklearn_extra) before running the cell below. If you have not installed the library, please run the following command to install it:

!pip install scikit-learn-extra

In [5]:
import pandas as pd

import numpy as np

import matplotlib.pyplot as plt

import seaborn as sns

# To scale the data using z-score 
from sklearn.preprocessing import StandardScaler

!pip install scikit-learn
!pip install scikit-learn-extra
import sklearn
import sklearn_extra


# Importing clustering algorithms
from sklearn.cluster import KMeans

from sklearn.mixture import GaussianMixture

from sklearn_extra.cluster import KMedoids

from sklearn.cluster import AgglomerativeClustering  # Hierarchical clustering

from sklearn.cluster import DBSCAN

# Silhouette score
from sklearn.metrics import silhouette_score

import warnings
warnings.filterwarnings("ignore")
Requirement already satisfied: scikit-learn in /Users/rija/opt/anaconda3/lib/python3.9/site-packages (1.0.2)
Requirement already satisfied: joblib>=0.11 in /Users/rija/opt/anaconda3/lib/python3.9/site-packages (from scikit-learn) (1.1.0)
Requirement already satisfied: numpy>=1.14.6 in /Users/rija/opt/anaconda3/lib/python3.9/site-packages (from scikit-learn) (1.21.5)
Requirement already satisfied: scipy>=1.1.0 in /Users/rija/opt/anaconda3/lib/python3.9/site-packages (from scikit-learn) (1.7.3)
Requirement already satisfied: threadpoolctl>=2.0.0 in /Users/rija/opt/anaconda3/lib/python3.9/site-packages (from scikit-learn) (2.2.0)
Collecting scikit-learn-extra
  Downloading scikit_learn_extra-0.2.0-cp39-cp39-macosx_10_9_x86_64.whl (389 kB)
     |████████████████████████████████| 389 kB 4.2 MB/s eta 0:00:01
Requirement already satisfied: scipy>=0.19.1 in /Users/rija/opt/anaconda3/lib/python3.9/site-packages (from scikit-learn-extra) (1.7.3)
Requirement already satisfied: scikit-learn>=0.23.0 in /Users/rija/opt/anaconda3/lib/python3.9/site-packages (from scikit-learn-extra) (1.0.2)
Requirement already satisfied: numpy>=1.13.3 in /Users/rija/opt/anaconda3/lib/python3.9/site-packages (from scikit-learn-extra) (1.21.5)
Requirement already satisfied: threadpoolctl>=2.0.0 in /Users/rija/opt/anaconda3/lib/python3.9/site-packages (from scikit-learn>=0.23.0->scikit-learn-extra) (2.2.0)
Requirement already satisfied: joblib>=0.11 in /Users/rija/opt/anaconda3/lib/python3.9/site-packages (from scikit-learn>=0.23.0->scikit-learn-extra) (1.1.0)
Installing collected packages: scikit-learn-extra
Successfully installed scikit-learn-extra-0.2.0

Loading the data¶

In [7]:
data = pd.read_csv("Country-data.csv")

data.head()
Out[7]:
country child_mort exports health imports income inflation life_expec total_fer gdpp
0 Afghanistan 90.2 10.0 7.58 44.9 1610 9.44 56.2 5.82 553
1 Albania 16.6 28.0 6.55 48.6 9930 4.49 76.3 1.65 4090
2 Algeria 27.3 38.4 4.17 31.4 12900 16.10 76.5 2.89 4460
3 Angola 119.0 62.3 2.85 42.9 5900 22.40 60.1 6.16 3530
4 Antigua and Barbuda 10.3 45.5 6.03 58.9 19100 1.44 76.8 2.13 12200

Checking the info of the data¶

In [8]:
data.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 167 entries, 0 to 166
Data columns (total 10 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   country     167 non-null    object 
 1   child_mort  167 non-null    float64
 2   exports     167 non-null    float64
 3   health      167 non-null    float64
 4   imports     167 non-null    float64
 5   income      167 non-null    int64  
 6   inflation   167 non-null    float64
 7   life_expec  167 non-null    float64
 8   total_fer   167 non-null    float64
 9   gdpp        167 non-null    int64  
dtypes: float64(7), int64(2), object(1)
memory usage: 13.2+ KB

Observations:

  • There are 167 observations and 10 columns in the data.
  • All columns have 167 non-null values, i.e., there are no missing values.
  • All the columns except the country name are numerical.
  • Everything looks good, so let's move ahead and check for duplicates.

Check duplicate entries¶

In [9]:
data[data.duplicated()]
Out[9]:
country child_mort exports health imports income inflation life_expec total_fer gdpp
  • There are no duplicate rows in the data. That's good.

Exploratory Data Analysis¶

Summary Statistics¶

In [10]:
data.describe().T
Out[10]:
count mean std min 25% 50% 75% max
child_mort 167.0 38.270060 40.328931 2.6000 8.250 19.30 62.10 208.00
exports 167.0 41.108976 27.412010 0.1090 23.800 35.00 51.35 200.00
health 167.0 6.815689 2.746837 1.8100 4.920 6.32 8.60 17.90
imports 167.0 46.890215 24.209589 0.0659 30.200 43.30 58.75 174.00
income 167.0 17144.688623 19278.067698 609.0000 3355.000 9960.00 22800.00 125000.00
inflation 167.0 7.781832 10.570704 -4.2100 1.810 5.39 10.75 104.00
life_expec 167.0 70.555689 8.893172 32.1000 65.300 73.10 76.80 82.80
total_fer 167.0 2.947964 1.513848 1.1500 1.795 2.41 3.88 7.49
gdpp 167.0 12964.155689 18328.704809 231.0000 1330.000 4660.00 14050.00 105000.00

Observations:

  • The child mortality rate has a wide range, from 2.6 to 208 deaths per 1000 live births. The average child mortality rate is approximately 38 deaths per 1000 live births.
  • Similarly, exports and imports have a wide range of values. The maximum values for exports and imports are 200% and 174% of GDP respectively. This can happen when the value of a country's trade flows exceeds the size of its domestic economy.
  • Total spending on health is much lower than exports and imports for the majority of countries. The average spending on health is approximately 6.8% of GDP.
  • The average life expectancy is approximately 70 years, but the minimum value is just 32 years.
  • Other variables like gdpp, inflation, and income also show high variability, which is expected as they can differ greatly between countries.
  • Overall, % expenditure on health and average life expectancy have a smaller standard deviation, reflecting less variability across countries. All other variables show a very high spread across countries. These high-spread variables are the ones that might help us identify clusters, if they exist (a scale-free check using the coefficient of variation is sketched below).
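To make the spread comparison scale-free, we can look at the coefficient of variation (standard deviation divided by mean) for each numeric column. Below is a minimal sketch; cv is just an illustrative variable name.

In [ ]:
# Coefficient of variation (std/mean) gives a scale-free measure of spread,
# making variability comparable across columns measured in very different units
numeric_cols = data.columns[1:]  # all columns except 'country'

cv = (data[numeric_cols].std() / data[numeric_cols].mean()).sort_values()

cv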

Let's check the distribution and outliers for each column in the data¶

In [11]:
for col in data.columns[1:]:
    print(col)
    
    print('Skew :', round(data[col].skew(), 2))  # Skew of a normal distribution is 0; a positive value
                                                  # means a longer right tail, a negative value a longer left tail
    
    plt.figure(figsize = (15, 4))
    
    plt.subplot(1, 2, 1)
    
    data[col].hist(bins = 10, grid = False)
    
    plt.ylabel('count')
    
    plt.subplot(1, 2, 2)
    
    sns.boxplot(x = data[col])
    
    plt.show()
child_mort
Skew : 1.45
exports
Skew : 2.45
health
Skew : 0.71
imports
Skew : 1.91
income
Skew : 2.23
inflation
Skew : 5.15
life_expec
Skew : -0.97
total_fer
Skew : 0.97
gdpp
Skew : 2.22

Observations:

  • As observed earlier, most of the variables have skewed distributions.
  • The distribution for the % expenditure on health is relatively less skewed with fewer outliers.
  • Life expectancy is the only variable that is skewed to the left, meaning most countries have already achieved a relatively high life expectancy.
  • The distribution for all other variables is highly skewed to the right. All these variables have some outliers to the right end.

Let's check the correlation among the variables¶

In [12]:
plt.figure(figsize  = (10, 10))

sns.heatmap(data.corr(), annot = True, cmap = "YlGnBu")

plt.show()

Observations:

  • There is a strong positive correlation between gdpp and income. This makes sense.
  • The life expectancy is positively correlated with gdpp. This indicates that people live longer in richer countries.
  • There is a strong negative correlation between life expectancy and child mortality. This is understandable.
  • The child mortality is also seen to have a strong positive correlation with the fertility rate.

Scaling the data¶

  • Clustering algorithms are distance-based algorithms, and all distance-based algorithms are affected by the scale of the variables. Therefore, we will scale the data before applying clustering.
  • We will drop the 'country' variable because it is unique to each observation and would not add value to clustering.
  • We will also drop the 'gdpp' variable for now, because we want to see whether we can identify clusters of countries without relying on GDP, and check later whether these clusters correspond to distinct average GDP levels.
In [13]:
data_new = data.drop(columns = ["country", "gdpp"])
In [14]:
# Scaling the data and storing the output as a new DataFrame

scaler = StandardScaler()

data_scaled = pd.DataFrame(scaler.fit_transform(data_new), columns = data_new.columns)

data_scaled.head()
Out[14]:
child_mort exports health imports income inflation life_expec total_fer
0 1.291532 -1.138280 0.279088 -0.082455 -0.808245 0.157336 -1.619092 1.902882
1 -0.538949 -0.479658 -0.097016 0.070837 -0.375369 -0.312347 0.647866 -0.859973
2 -0.272833 -0.099122 -0.966073 -0.641762 -0.220844 0.789274 0.670423 -0.038404
3 2.007808 0.775381 -1.448071 -0.165315 -0.585043 1.387054 -1.179234 2.128151
4 -0.695634 0.160668 -0.286894 0.497568 0.101732 -0.601749 0.704258 -0.541946
In [15]:
# Creating copy of the data to store labels from each algorithm
data_scaled_copy = data_scaled.copy(deep = True)

K-Means Clustering¶

In [16]:
# Empty dictionary to store the SSE for each value of K
sse = {} 

# Iterate for a range of Ks and fit the scaled data to the algorithm. 
# Use inertia attribute from the clustering object and store the inertia value for that K 
for k in range(1, 10):
    kmeans = KMeans(n_clusters = k, random_state = 1).fit(data_scaled)
    
    sse[k] = kmeans.inertia_  # inertia_ is the SSE for this value of K

# Elbow plot
plt.figure()

plt.plot(list(sse.keys()), list(sse.values()), 'bx-')

plt.xlabel("Number of cluster")

plt.ylabel("SSE")

plt.show()

Observations:

  • We can see from the plot that the SSE decreases steadily from 2 to 8 clusters and there doesn't seem to be a clear 'elbow'. We could reasonably choose any number of clusters from 2 to 8.
  • So, let's look at another method to get a 'second opinion'. Let's create a plot with Silhouette scores to see how it varies with K.
In [17]:
# Empty dictionary to store the Silhouette score for each value of K
sc = {} 

# Iterate for a range of Ks and fit the scaled data to the algorithm. Store the Silhouette score for that K 
for k in range(2, 10):
    kmeans = KMeans(n_clusters = k, random_state = 1).fit(data_scaled)
    
    labels = kmeans.predict(data_scaled)
    
    sc[k] = silhouette_score(data_scaled, labels)

# Silhouette score plot
plt.figure()

plt.plot(list(sc.keys()), list(sc.values()), 'bx-')

plt.xlabel("Number of cluster")

plt.ylabel("Silhouette Score")

plt.show()

Observation:

  • We observe from the plot that the silhouette score is the highest for K=3. Let's first understand these 3 clusters.
In [18]:
kmeans = KMeans(n_clusters = 3, random_state = 1)  # random_state fixes the initialization so the label counts below are reproducible

kmeans.fit(data_scaled)

# Adding predicted labels to the original data and the scaled data 
data_scaled_copy['KMeans_Labels'] = kmeans.predict(data_scaled)

data['KMeans_Labels'] = kmeans.predict(data_scaled)
In [20]:
data['KMeans_Labels'].value_counts()
Out[20]:
0    112
1     52
2      3
Name: KMeans_Labels, dtype: int64

Observation:

  • This looks like a very skewed clustering, with only three observations in one cluster and more than a hundred in another. Let's check out the profiles of these clusters.
In [21]:
# Calculating the mean and the median of the original data for each label
mean = data.groupby('KMeans_Labels').mean()

median = data.groupby('KMeans_Labels').median()

df_kmeans = pd.concat([mean, median], axis = 0)

df_kmeans.index = ['group_0 Mean', 'group_1 Mean', 'group_2 Mean', 'group_0 Median', 'group_1 Median', 'group_2 Median']

df_kmeans.T
Out[21]:
group_0 Mean group_1 Mean group_2 Mean group_0 Median group_1 Median group_2 Median
child_mort 15.703571 88.844231 4.133333 12.450 85.65 2.80
exports 43.487500 28.203827 176.000000 39.600 23.30 175.00
health 7.069464 6.270385 6.793333 6.790 5.48 7.77
imports 46.143750 42.164729 156.666667 44.900 39.75 154.00
income 22069.285714 3832.750000 64033.333333 16250.000 1960.00 72100.00
inflation 6.042920 11.833750 2.468000 3.810 8.95 3.62
life_expec 75.215179 59.892308 81.433333 75.800 60.45 81.30
total_fer 2.119821 4.822115 1.380000 1.995 5.00 1.36
gdpp 16937.535714 1832.884615 57566.666667 8580.000 932.00 46600.00

Observations:

  • It looks like Cluster 2 belongs to high income countries which also have high gdpp.
  • Cluster 1 seems to consist of low income countries, with a low mean gdpp as well.
  • The remaining countries are in Cluster 0 which also happens to be the biggest cluster. Since the number of developing countries is larger than the group of highly developed countries, this intuitively makes sense.

Let us now visualize the summary statistics of these clusters below.

In [22]:
cols_visualise = ['child_mort', 'exports', 'health', 'imports', 'income', 'inflation', 'life_expec', 'total_fer', 'gdpp']

for col in cols_visualise:
    sns.boxplot(x = 'KMeans_Labels', y = col, data = data)
    plt.show()

Cluster Profiles:

  • Cluster 2 has only 3 observations. As observed from the scatter plots and the boxplots, this group consists of outlier high income countries with the highest percentages of imports and exports relative to GDP (these three countries are listed in the sketch after this list).
  • Cluster 1 seems to have countries with less desirable values for many indicators. These countries seem to have the highest inflation rates, the lowest GDP per capita, the lowest exports as well as imports - all signaling a very poor economic situation. These countries also have the highest child mortalities, the highest fertility rates, and the lowest life expectancies. These characteristics are traits of underdeveloped or developing countries. These countries also seem to have a trade deficit, i.e., more imports than exports, and as a consequence, may be more reliant on borrowing and lines of credit to finance their economy.
  • Cluster 0 is the largest cluster with traits of countries that fall in the middle of the development spectrum. These countries have a comparatively better state of affairs than the countries in cluster 1. However, this cluster has a large range of values, indicating that it is a mix of many different types of countries. Ideally, we do not want a cluster to be like this as the fundamental idea behind clustering is to 'group similar things' and this cluster seems to have a lot of 'dissimilarity' within it.
  • Overall, this clustering solution does give us good insights into potential clusters of similar countries but is not very useful as it is impacted by outlier countries resulting in one very small cluster and two very big clusters. We should try other algorithms to see if we can do better.
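To see exactly which countries form the tiny K-Means cluster mentioned above, a quick lookup like the minimal sketch below can be used; it relies only on the KMeans_Labels column added earlier.

In [ ]:
# Listing the three countries assigned to K-Means cluster 2, along with the
# trade- and income-related columns that make them stand out
data.loc[data['KMeans_Labels'] == 2, ['country', 'exports', 'imports', 'income', 'gdpp']]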

But before that, let's validate if these clusters relate well with the GDP of the country.

In [23]:
cols_visualise = ['child_mort', 'exports', 'health', 'imports', 'income', 'inflation', 'life_expec', 'total_fer', 'gdpp']

for col in cols_visualise:
    sns.scatterplot(x = col, y = 'gdpp', data = data, hue = 'KMeans_Labels', palette = 'Dark2')
    
    plt.show()

Observations:

  • Countries with higher fertility rates tend to have lower net income per person and lower GDP per capita.
  • Child mortality also appears to be negatively correlated with GDP per capita. The high child mortality in such countries could be due to several factors, such as high poverty, lower net income per person, and a relative lack of health facilities, among others.

Let's try another algorithm.

K-Medoids Clustering¶

In [24]:
kmedo = KMedoids(n_clusters = 3, random_state = 1)

kmedo.fit(data_scaled)

data_scaled_copy['kmedoLabels'] = kmedo.predict(data_scaled)

data['kmedoLabels'] = kmedo.predict(data_scaled)
In [25]:
data.kmedoLabels.value_counts()
Out[25]:
1    75
2    58
0    34
Name: kmedoLabels, dtype: int64
In [26]:
# Calculating the mean and the median of the original data for each label
original_features = ['child_mort', 'exports', 'health', 'imports', 'income', 'inflation', 'life_expec', 'total_fer', 'gdpp']

mean = data.groupby('kmedoLabels').mean()

median = data.groupby('kmedoLabels').median()  # the median is less sensitive to outliers than the mean

df_kmedoids = pd.concat([mean, median], axis = 0)

df_kmedoids.index = ['group_0 Mean', 'group_1 Mean', 'group_2 Mean', 'group_0 Median', 'group_1 Median', 'group_2 Median']

df_kmedoids[original_features].T
Out[26]:
group_0 Mean group_1 Mean group_2 Mean group_0 Median group_1 Median group_2 Median
child_mort 7.085294 17.853333 82.951724 4.700 15.70 79.600
exports 36.938235 52.989333 28.191362 31.600 48.30 23.800
health 9.413235 6.202133 6.086379 9.585 6.00 5.275
imports 33.452941 57.694667 40.795964 30.750 55.10 36.800
income 38094.705882 17592.800000 4284.189655 36550.000 13500.00 2145.000
inflation 4.020676 5.511573 12.922328 1.825 3.82 9.225
life_expec 79.614706 73.870667 60.958621 80.200 74.10 60.800
total_fer 1.842353 2.179600 4.589655 1.875 2.13 4.875
gdpp 35925.588235 10989.813333 2057.034483 38700.000 6230.00 994.000

Observations:

  • It looks like Cluster 0 belongs to high income countries, Cluster 2 has poorer countries with low incomes, and the remaining countries are in Cluster 1, which happens to be the biggest cluster as well.
In [27]:
for col in cols_visualise:
    sns.boxplot(x = 'kmedoLabels', y = col, data = data)
    
    plt.show()

Cluster Profiles:

  • Cluster 2 countries have the highest average child mortality rate, trade deficit, and inflation rate, and the lowest average GDP per capita and net income per person. However, the large range of values for different variables implies that cluster 2 contains a variety of countries, from underdeveloped to developing ones.
  • Cluster 1 shows traits of developing countries with comparatively higher GDP, net income per person and significantly lower child mortality rate as compared to cluster 2. The cluster consists of some outliers but majorly it consists of countries with low to medium GDP, with a comparatively higher percentage of imports and exports vs GDP.
  • Cluster 0 shows traits of highly developed countries with a low child mortality rate and a higher net income per person, life expectancy, and GDP. These countries have the highest average expenditure on health as a percentage of GDP.

Observations:

  • The number of observations for each cluster from K-Medoids is more evenly distributed in comparison to K-Means clustering.
  • This is because the clusters from K-Medoids are less affected by outliers in the data. As we observe, the three outlier countries from K-Means (in terms of imports and exports) are now included in cluster 1 and do not form a separate cluster as in K-Means (see the cross-tabulation sketched after this list).
  • Unlike in K-Means, the cluster for developed countries is much bigger but still retains the overall characteristics of developed countries, as reflected in the higher values for income per person, life expectancy, and especially in health expenditure as a percentage of GDP.
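As a quick check on how the two label sets relate, the minimal sketch below cross-tabulates the K-Means and K-Medoids labels; the three outlier countries from K-Means cluster 2 should appear inside one of the K-Medoids clusters.

In [ ]:
# Cross-tabulating K-Means labels (rows) against K-Medoids labels (columns)
pd.crosstab(data['KMeans_Labels'], data['kmedoLabels'])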

Now, let's see what we get with Gaussian Mixture Model.

Gaussian Mixture Model¶

In [28]:
gmm = GaussianMixture(n_components = 3, random_state = 1)

gmm.fit(data_scaled)

data_scaled_copy['GmmLabels'] = gmm.predict(data_scaled)

data['GmmLabels'] = gmm.predict(data_scaled)
In [29]:
data.GmmLabels.value_counts()
Out[29]:
0    67
2    62
1    38
Name: GmmLabels, dtype: int64
In [30]:
# Calculating the mean and the median of the original data for each label
original_features = ['child_mort', 'exports', 'health', 'imports', 'income', 'inflation', 'life_expec', 'total_fer', 'gdpp']

mean = data.groupby('GmmLabels').mean()

median = data.groupby('GmmLabels').median()

df_gmm = pd.concat([mean, median], axis = 0)

df_gmm.index = ['group_0 Mean', 'group_1 Mean', 'group_2 Mean', 'group_0 Median', 'group_1 Median', 'group_2 Median']

df_gmm[original_features].T
Out[30]:
group_0 Mean group_1 Mean group_2 Mean group_0 Median group_1 Median group_2 Median
child_mort 76.459701 5.544737 17.058065 66.80 4.250 16.550
exports 30.704463 60.047368 40.745161 25.00 50.900 35.400
health 6.140000 8.444737 6.547419 5.30 9.405 6.440
imports 43.886058 51.736842 47.166129 42.90 40.950 48.250
income 4578.850746 45431.578947 13386.774194 2520.00 40900.000 12800.000
inflation 11.796642 3.538553 6.043968 8.43 1.395 4.465
life_expec 62.070149 79.707895 74.116129 61.80 80.200 74.150
total_fer 4.404776 1.791579 2.082419 4.60 1.850 2.040
gdpp 2235.880597 40707.105263 7553.870968 1170.00 39750.000 6240.000

Observation:

  • Cluster 1 belongs to high income countries, Cluster 0 belongs to lower income countries, and the rest of the countries are in Cluster 2.
In [31]:
cols_visualise = ['child_mort', 'exports', 'health', 'imports', 'income', 'inflation', 'life_expec', 'total_fer', 'gdpp']

for col in cols_visualise:
    sns.boxplot(x = 'GmmLabels', y = col, data = data)
    
    plt.show()

Cluster Profiles:

  • This clustering solution looks very similar to the one created using K-Medoids, with one cluster of 'high income' countries, one of 'low income' countries, and one of 'all the others'. But on closer inspection, we can identify some important differences in this solution from GMM.

  • Cluster 1 seems to be of 'developed' countries but this time the median values for all the key indicators have all improved in comparison to the same cluster obtained from K-Medoids, with a higher GDP per capita, higher income, higher exports and imports and marginally higher life expectancy. At the same time, it has lower inflation rates, lower child mortality rates, and lower fertility as well. Overall, we can say that this cluster has become more 'pure' in comparison to the one from K-Medoids.

  • Cluster 0 seems to be of 'underdeveloped' countries, but this time the median values for all the key indicators have improved in comparison to the corresponding K-Medoids cluster. For example, it has higher GDP per capita, higher income per person, higher exports and imports, and slightly better health expenditure and life expectancy. That means this cluster of 'underdeveloped' countries has become less 'pure'.

  • Both of the above points give an idea of what might have happened to the third cluster, i.e., Cluster 2. It was a mix of 'underdeveloped' and 'developing' countries and continues to be so, but it has gained some countries on the richer end of the spectrum, while some countries on the 'underdeveloped' end have moved to Cluster 0.

Overall, this is a slightly more evenly distributed clustering solution than K-Medoids.
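To make the comparison above easier to scan, here is a minimal sketch that places the per-cluster medians from K-Medoids and GMM side by side. It assumes the df_kmedoids and df_gmm DataFrames built in the earlier cells; comparison is just an illustrative name.

In [ ]:
# Side-by-side view of the per-cluster medians from K-Medoids and GMM
medians_kmedoids = df_kmedoids.loc[['group_0 Median', 'group_1 Median', 'group_2 Median'], original_features].T

medians_gmm = df_gmm.loc[['group_0 Median', 'group_1 Median', 'group_2 Median'], original_features].T

comparison = pd.concat([medians_kmedoids.add_prefix('KMedoids '), medians_gmm.add_prefix('GMM ')], axis = 1)

comparison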

Hierarchical Clustering¶

  • Let's try to create clusters using Agglomerative Hierarchical clustering.
  • Here, we decide the number of clusters using a dendrogram, a tree-like diagram that records the sequence of merges or splits.
In [32]:
from scipy.cluster.hierarchy import dendrogram, linkage
In [33]:
# The List of all linkage methods to check
methods = ['single',
           'average', 
           'complete']

# Create a subplot image
fig, axs = plt.subplots(len(methods), 1, figsize = (20, 15))

# Enumerate through the list of all methods above, get linkage and plot dendrogram
for i, method in enumerate(methods):
    Z = linkage(data_scaled, metric = 'euclidean', method = method)
    
    dendrogram(Z, ax = axs[i]);
    
    axs[i].set_title(f'Dendrogram ({method.capitalize()} Linkage)')
    
    axs[i].set_ylabel('Distance')

Observations:

  • We can see that the complete linkage gives better separated clusters. Clusters are considered better separated when the vertical distance between their merges is larger.
  • Now, we can set a threshold distance and draw a horizontal line. The number of clusters will be the number of vertical lines intersected by the line drawn at that threshold.
  • The branches of the dendrogram are cut at a level where there is a lot of 'space' to cut them, that is, where the jump between two consecutive merge levels is large.
  • Here, we can choose to cut at a distance of ~9, since the gap between consecutive merge levels is largest there.
In [40]:
plt.figure(figsize = (20, 7))  

plt.title("Dendrograms")  

dend = dendrogram(linkage(data_scaled, method = 'complete'))

plt.axhline(y = 9, color = 'r', linestyle = '--')
Out[40]:
<matplotlib.lines.Line2D at 0x7f79c81caf40>

Observations:

  • We can see that if we draw a horizontal line at a threshold distance of ~9, it cuts 4 vertical lines, i.e., we get 4 different clusters.
  • Let's fit the algorithm using 4 as the number of clusters (a quick programmatic check of this cut is sketched after this list).
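Before fitting, we can verify the 4-cluster cut programmatically. The minimal sketch below recomputes the complete-linkage matrix and cuts the tree at a distance of ~9 with scipy's fcluster; Z_complete and flat_labels are illustrative names.

In [ ]:
# Cutting the complete-linkage dendrogram at a distance threshold of 9
from scipy.cluster.hierarchy import fcluster

Z_complete = linkage(data_scaled, metric = 'euclidean', method = 'complete')

flat_labels = fcluster(Z_complete, t = 9, criterion = 'distance')

pd.Series(flat_labels).value_counts()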
In [35]:
# Clustering with 4 clusters
hierarchical = AgglomerativeClustering(n_clusters = 4, affinity = 'euclidean', linkage = 'complete')

hierarchical.fit(data_scaled)
Out[35]:
AgglomerativeClustering(linkage='complete', n_clusters=4)
In [36]:
data_scaled_copy['HCLabels'] = hierarchical.labels_

data['HCLabels'] = hierarchical.labels_
In [37]:
data.HCLabels.value_counts()
Out[37]:
0    103
1     60
2      3
3      1
Name: HCLabels, dtype: int64

Observations:

  • The count of observations in the resulting 4 clusters is unevenly distributed.
  • We have two clusters with only 3 countries and 1 country, respectively. Let's check the countries in these clusters.
In [38]:
# Checking 3 countries in cluster 2
data[data.HCLabels == 2]
Out[38]:
country child_mort exports health imports income inflation life_expec total_fer gdpp KMeans_Labels kmedoLabels GmmLabels HCLabels
91 Luxembourg 2.8 175.0 7.77 142.0 91700 3.620 81.3 1.63 105000 2 1 1 2
98 Malta 6.8 153.0 8.65 154.0 28300 3.830 80.3 1.36 21100 2 1 1 2
133 Singapore 2.8 200.0 3.96 174.0 72100 -0.046 82.7 1.15 46600 2 1 1 2

Observations:

  • Similar to K-Means, we got a separate cluster for 3 small countries with the highest values for imports and exports - Luxembourg, Malta, Singapore.
In [39]:
# Checking 1 country in cluster 3
data[data.HCLabels == 3]
Out[39]:
country child_mort exports health imports income inflation life_expec total_fer gdpp KMeans_Labels kmedoLabels GmmLabels HCLabels
113 Nigeria 130.0 25.3 5.07 17.4 5150 104.0 60.5 5.84 2330 1 2 0 3

Observations:

  • Cluster 3 consists of just one country - Nigeria.
  • Nigeria has an inflation rate of 104%, which is the highest in this dataset. This likely made its distance from the other observations large enough that it did not merge with any other cluster (its standardized values are shown in the sketch below).
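To see how extreme Nigeria is on the standardized scale, the minimal sketch below looks up its row in the scaled data; the boolean mask from data aligns with data_scaled because both share the same row index.

In [ ]:
# Nigeria's z-scored feature values; its scaled inflation is several
# standard deviations above the mean, which explains the singleton cluster
data_scaled[data['country'] == 'Nigeria'].T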
In [36]:
# Calculating the mean and the median of the original data for each label
original_features = ['child_mort', 'exports', 'health', 'imports', 'income', 'inflation', 'life_expec', 'total_fer', 'gdpp']

mean = data.groupby('HCLabels').mean()

median = data.groupby('HCLabels').median()

df_hierachical = pd.concat([mean, median], axis = 0)

df_hierachical.index = ['group_0 Mean', 'group_1 Mean', 'group_2 Mean', 'group_3 Mean', 'group_0 Median', 'group_1 Median', 'group_2 Median', 'group_3 Median']

df_hierachical[original_features].T
Out[36]:
group_0 Mean group_1 Mean group_2 Mean group_3 Mean group_0 Median group_1 Median group_2 Median group_3 Median
child_mort 16.678641 75.513333 4.133333 130.00 10.80 73.300 2.80 130.00
exports 42.532806 32.183667 176.000000 25.30 38.70 28.900 175.00 25.30
health 7.013883 6.505667 6.793333 5.07 6.91 5.685 7.77 5.07
imports 42.438504 49.535000 156.666667 17.40 38.40 47.650 154.00 17.40
income 23425.533981 4218.050000 64033.333333 5150.00 17800.00 2500.000 72100.00 5150.00
inflation 6.723262 8.261100 2.468000 104.00 4.49 5.860 3.62 104.00
life_expec 75.471845 61.740000 81.433333 60.50 76.10 61.300 81.30 60.50
total_fer 2.074660 4.477333 1.380000 5.84 1.93 4.710 1.36 5.84
gdpp 18053.689320 2174.233333 57566.666667 2330.00 10700.00 1185.000 46600.00 2330.00

Observations:

  • It looks like Cluster 2 has only 3 countries with high income and high gdpp, Cluster 1 has low income and low gdpp countries, and the rest of the countries are in cluster 0 except for one country which is in cluster 3.

Let's try to visualize the boxplots of different attributes for each cluster to see if we can spot some more granular patterns.

In [37]:
cols_visualise = ['child_mort', 'exports', 'health', 'imports', 'income', 'inflation', 'life_expec', 'total_fer', 'gdpp']

for col in cols_visualise:
    sns.boxplot(x = 'HCLabels', y = col, data = data)
    plt.show()
    

Observations:

  • The results from hierarchical clustering are difficult to distinguish and comment on, especially because one cluster contains 103 of the 167 countries.

Let's try the DBSCAN algorithm.

DBSCAN¶

In [41]:
dbs = DBSCAN(eps = 1)

data_scaled_copy['DBSLabels'] = dbs.fit_predict(data_scaled)

data['DBSLabels'] = dbs.fit_predict(data_scaled)
In [42]:
data['DBSLabels'].value_counts()
Out[42]:
-1    90
 0    55
 1    17
 2     5
Name: DBSLabels, dtype: int64
In [43]:
# Calculating the mean and the median of the original data for each label
original_features = ['child_mort', 'exports', 'health', 'imports', 'income', 'inflation', 'life_expec', 'total_fer', 'gdpp']

mean = data.groupby('DBSLabels').mean()

median = data.groupby('DBSLabels').median()

df_hierachical = pd.concat([mean, median], axis = 0)

df_hierachical.index = ['group_-1 Mean', 'group_0 Mean', 'group_1 Mean', 'group_2 Mean', 'group_-1 Median', 'group_0 Median', 'group_1 Median', 'group_2 Median']

df_hierachical[original_features].T
Out[43]:
group_-1 Mean group_0 Mean group_1 Mean group_2 Mean group_-1 Median group_0 Median group_1 Median group_2 Median
child_mort 54.907778 17.130909 4.147059 87.340 50.900 15.70 4.100 90.20
exports 42.922211 41.525455 35.194118 24.000 36.100 37.00 29.900 22.80
health 6.254556 6.709455 10.294706 6.256 5.275 6.55 10.100 6.01
imports 48.265177 49.510909 33.982353 37.200 42.400 51.30 31.000 34.90
income 16254.611111 13433.090909 38382.352941 1785.600 5170.000 11200.00 38800.000 1610.00
inflation 11.155856 4.015527 1.309118 10.486 8.605 3.53 0.873 9.44
life_expec 67.202222 74.203636 81.076471 55.020 67.700 74.50 81.300 54.50
total_fer 3.578222 2.067455 1.708235 5.504 3.250 1.92 1.630 5.43
gdpp 10940.611111 8043.018182 43200.000000 718.600 2775.000 6250.00 41900.000 553.00

Observations:

  • DBSCAN returns 3 clusters plus a group of noise points (label -1). The countries in the 3 clusters have profiles similar to the results seen with the other clustering algorithms - high income, low income, and moderately developed countries.
  • The profile of the noise group (label -1) is less clear. This group shows a large difference between the mean values and the median values of various attributes, implying the presence of outliers.

Let's visualize the box plots to comment further on these clusters

In [44]:
for col in cols_visualise:
    sns.boxplot(x = 'DBSLabels', y = col, data = data)
    
    plt.show()

Observations

  • We can see that while the three clusters (0, 1, and 2) are noticeably more compact across all attributes, cluster -1 consists of extreme outliers on at least one attribute.
  • Therefore, it does not add much value to our cluster analysis. We can explore it further to understand which types of countries it consists of (a quick lookup is sketched below).
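As a starting point for that exploration, the minimal sketch below lists some of the countries DBSCAN flags as noise (label -1), sorted by GDP per capita; noise_points is just an illustrative name.

In [ ]:
# Countries marked as noise by DBSCAN, sorted by GDP per capita
noise_points = data.loc[data['DBSLabels'] == -1, ['country', 'child_mort', 'income', 'inflation', 'gdpp']]

noise_points.sort_values('gdpp').head(10)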

Conclusion¶

The choice of clustering algorithm here will depend on the context and use case. But purely on the basis of what good clustering looks like, i.e., reasonably sized and well-separated groups, one can propose K-Medoids, as its extreme clusters are more distinct from each other.
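As a rough quantitative check of that comparison, the minimal sketch below computes the silhouette score for each stored label set on the scaled data; DBSCAN is skipped because its noise label (-1) would distort the score.

In [ ]:
# Comparing silhouette scores for the label sets stored in data_scaled_copy
for col in ['KMeans_Labels', 'kmedoLabels', 'GmmLabels', 'HCLabels']:
    score = silhouette_score(data_scaled, data_scaled_copy[col])
    print(f"{col}: {score:.3f}")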