import pandas as pd

import numpy as np

import matplotlib.pyplot as plt

import seaborn as sns

# To scale the data using z-score 
from sklearn.preprocessing import StandardScaler

# Importing PCA and t-SNE
from sklearn.decomposition import PCA

from sklearn.manifold import TSNE


# Loading the data
data_air_pol = pd.read_csv("AirPollution.csv")


data_air_pol.head()


data_air_pol.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 403 entries, 0 to 402
Data columns (total 27 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   SrNo               403 non-null    int64  
 1   Date               403 non-null    object 
 2   NO                 401 non-null    float64
 3   CO                 402 non-null    float64
 4   NO2                401 non-null    float64
 5   O3                 397 non-null    float64
 6   SO2                399 non-null    float64
 7   PM2.5              401 non-null    float64
 8   Benzene            402 non-null    float64
 9   Toulene            402 non-null    float64
 10  P_Xylene           372 non-null    float64
 11  NOx                401 non-null    float64
 12  PM10               401 non-null    float64
 13  WindDirection      402 non-null    float64
 14  NH3                401 non-null    float64
 15  RH                 402 non-null    float64
 16  Temp               401 non-null    float64
 17  WindSpeed          402 non-null    float64
 18  VerticalWindSpeed  401 non-null    float64
 19  Solar              401 non-null    float64
 20  BarPressure        401 non-null    float64
 21  Weather            403 non-null    object 
 22  PD_PM2.5           393 non-null    float64
 23  PD_PM10            392 non-null    float64
 24  PD_NO2             391 non-null    float64
 25  PD_SO2             390 non-null    float64
 26  PD_CO              392 non-null    float64
dtypes: float64(24), int64(1), object(2)
memory usage: 85.1+ KB


data_air_pol.isnull().sum()

SrNo                  0
Date                  0
NO                    2
CO                    1
NO2                   2
O3                    6
SO2                   4
PM2.5                 2
Benzene               1
Toulene               1
P_Xylene             31
NOx                   2
PM10                  2
WindDirection         1
NH3                   2
RH                    1
Temp                  2
WindSpeed             1
VerticalWindSpeed     2
Solar                 2
BarPressure           2
Weather               0
PD_PM2.5             10
PD_PM10              11
PD_NO2               12
PD_SO2               13
PD_CO                11
dtype: int64


# Dropping the columns - SrNo and Date
data_air_pol.drop(columns = ["SrNo", "Date"], inplace = True)


data_air_pol.describe().T


result = data_air_pol.select_dtypes(include = 'number')

cont_cols = list(result)

for col in cont_cols:

    print(col)
    
    print('Skew :',round(data_air_pol[col].skew(),2))
    
    plt.figure(figsize = (15, 4))
    
    plt.subplot(1, 2, 1) #index of location of subplot
    
    data_air_pol[col].hist(bins = 10, grid = False)
    
    plt.ylabel('count')
    
    plt.subplot(1, 2, 2)
    
    sns.boxplot(x = data_air_pol[col])
    
    plt.show()

NO
Skew : 1.65

CO
Skew : 4.16

NO2
Skew : 0.35

O3
Skew : 0.53

SO2
Skew : 6.87

PM2.5
Skew : 1.45

Benzene
Skew : 0.71

Toulene
Skew : 0.61

P_Xylene
Skew : 1.55

NOx
Skew : 1.32


plt.figure(figsize = (18, 12))

sns.heatmap(data_air_pol.corr(), annot = True, fmt = '0.2f')

plt.show()


# Imputing missing values with mode for the Weather column and with median for all other columns as majority of variables are skewed
for col in data_air_pol.columns:
    
    if col == "Weather":
        data_air_pol[col].fillna(value = data_air_pol[col].mode()[0], inplace = True)
    
    else:
        data_air_pol[col].fillna(value = data_air_pol[col].median(), inplace = True)


# Creating dummy variables for Weather column
data_air_pol = pd.get_dummies(data_air_pol, drop_first = True)


scaler = StandardScaler()
data_air_pol_scaled = scaler.fit_transform(data_air_pol)


data_air_pol_scaled = pd.DataFrame(data_air_pol_scaled, columns = data_air_pol.columns)


# Defining the number of principal components to generate
n = data_air_pol_scaled.shape[1]

# Finding principal components for the data
pca1 = PCA(n_components = n, random_state = 1)
data_air_pol_pca = pd.DataFrame(pca1.fit_transform(data_air_pol_scaled))

# The percentage of variance explained by each principal component
exp_var1 = pca1.explained_variance_ratio_


# Visualize the explained variance by individual components
plt.figure(figsize = (10, 10))

plt.plot(range(1, 29), pca1.explained_variance_ratio_.cumsum(), marker = 'o', linestyle = '--')

plt.title("Explained Variances by Components")

plt.xlabel("Number of Components")

plt.ylabel("Cumulative Explained Variance")

plt.show()


# Find the least number of components that can explain more than 70% variance (standard)
sum = 0

for ix, i in enumerate(exp_var1):
    
    sum = sum + i
    
    if(sum>0.70):
        print("Number of PCs that explain at least 70% variance: ", ix + 1)
        break

Number of PCs that explain at least 70% variance:  5


# Making a new dataframe with first 5 principal components as columns and original features as indices
cols = ['PC1', 'PC2', 'PC3', 'PC4', 'PC5']

pc1 = pd.DataFrame(np.round(pca1.components_.T[:, 0:5], 2), index = data_air_pol_scaled.columns, columns = cols)
pc1


for i in range(10, 50, 5):# finding perplexity
    tsne = TSNE(n_components = 2, random_state = 1, perplexity = i)
    
    data_air_pol_tsne = tsne.fit_transform(data_air_pol_scaled)
    
    data_air_pol_tsne = pd.DataFrame(data_air_pol_tsne)
    
    data_air_pol_tsne.columns = ['X1', 'X2']
    
    plt.figure(figsize = (7,7))
    
    sns.scatterplot(x = 'X1', y = 'X2', data = data_air_pol_tsne)
    
    plt.title("perplexity = {}".format(i))

/Users/rija/opt/anaconda3/lib/python3.9/site-packages/sklearn/manifold/_t_sne.py:780: FutureWarning: The default initialization in TSNE will change from 'random' to 'pca' in 1.2.
  warnings.warn(
/Users/rija/opt/anaconda3/lib/python3.9/site-packages/sklearn/manifold/_t_sne.py:790: FutureWarning: The default learning rate in TSNE will change from 200.0 to 'auto' in 1.2.
  warnings.warn(
/Users/rija/opt/anaconda3/lib/python3.9/site-packages/sklearn/manifold/_t_sne.py:780: FutureWarning: The default initialization in TSNE will change from 'random' to 'pca' in 1.2.
  warnings.warn(
/Users/rija/opt/anaconda3/lib/python3.9/site-packages/sklearn/manifold/_t_sne.py:790: FutureWarning: The default learning rate in TSNE will change from 200.0 to 'auto' in 1.2.
  warnings.warn(
/Users/rija/opt/anaconda3/lib/python3.9/site-packages/sklearn/manifold/_t_sne.py:780: FutureWarning: The default initialization in TSNE will change from 'random' to 'pca' in 1.2.
  warnings.warn(
/Users/rija/opt/anaconda3/lib/python3.9/site-packages/sklearn/manifold/_t_sne.py:790: FutureWarning: The default learning rate in TSNE will change from 200.0 to 'auto' in 1.2.
  warnings.warn(
/Users/rija/opt/anaconda3/lib/python3.9/site-packages/sklearn/manifold/_t_sne.py:780: FutureWarning: The default initialization in TSNE will change from 'random' to 'pca' in 1.2.
  warnings.warn(
/Users/rija/opt/anaconda3/lib/python3.9/site-packages/sklearn/manifold/_t_sne.py:790: FutureWarning: The default learning rate in TSNE will change from 200.0 to 'auto' in 1.2.
  warnings.warn(
/Users/rija/opt/anaconda3/lib/python3.9/site-packages/sklearn/manifold/_t_sne.py:780: FutureWarning: The default initialization in TSNE will change from 'random' to 'pca' in 1.2.
  warnings.warn(
/Users/rija/opt/anaconda3/lib/python3.9/site-packages/sklearn/manifold/_t_sne.py:790: FutureWarning: The default learning rate in TSNE will change from 200.0 to 'auto' in 1.2.
  warnings.warn(
/Users/rija/opt/anaconda3/lib/python3.9/site-packages/sklearn/manifold/_t_sne.py:780: FutureWarning: The default initialization in TSNE will change from 'random' to 'pca' in 1.2.
  warnings.warn(
/Users/rija/opt/anaconda3/lib/python3.9/site-packages/sklearn/manifold/_t_sne.py:790: FutureWarning: The default learning rate in TSNE will change from 200.0 to 'auto' in 1.2.
  warnings.warn(
/Users/rija/opt/anaconda3/lib/python3.9/site-packages/sklearn/manifold/_t_sne.py:780: FutureWarning: The default initialization in TSNE will change from 'random' to 'pca' in 1.2.
  warnings.warn(
/Users/rija/opt/anaconda3/lib/python3.9/site-packages/sklearn/manifold/_t_sne.py:790: FutureWarning: The default learning rate in TSNE will change from 200.0 to 'auto' in 1.2.
  warnings.warn(
/Users/rija/opt/anaconda3/lib/python3.9/site-packages/sklearn/manifold/_t_sne.py:780: FutureWarning: The default initialization in TSNE will change from 'random' to 'pca' in 1.2.
  warnings.warn(
/Users/rija/opt/anaconda3/lib/python3.9/site-packages/sklearn/manifold/_t_sne.py:790: FutureWarning: The default learning rate in TSNE will change from 200.0 to 'auto' in 1.2.
  warnings.warn(


# Fitting t-SNE with number of components equal to 2
tsne = TSNE(n_components = 2, random_state = 1, perplexity = 35)

data_air_pol_tsne = tsne.fit_transform(data_air_pol_scaled)

# Converting the embeddings to a dataframe
data_air_pol_tsne = pd.DataFrame(data_air_pol_tsne, columns = ["X1", "X2"])

# Scatter plot for two components
plt.figure(figsize = (7, 7))

sns.scatterplot(x = 'X1', y = 'X2', data = data_air_pol_tsne)

plt.show()

/Users/rija/opt/anaconda3/lib/python3.9/site-packages/sklearn/manifold/_t_sne.py:780: FutureWarning: The default initialization in TSNE will change from 'random' to 'pca' in 1.2.
  warnings.warn(
/Users/rija/opt/anaconda3/lib/python3.9/site-packages/sklearn/manifold/_t_sne.py:790: FutureWarning: The default learning rate in TSNE will change from 200.0 to 'auto' in 1.2.
  warnings.warn(


# Let's assign points to 4 different groups
def grouping(x):
    first_component = x['X1']
    
    second_component = x['X2']
    
    if second_component > 12:
        return 'group_1'
    
    elif (second_component < -10) and (first_component > -10): 
        return 'group_2'
    
    elif (second_component < 12) and (first_component > -9):
        return 'group_3'
    
    else: 
        return 'group_4'


data_air_pol_tsne['groups'] = data_air_pol_tsne.apply(grouping, axis = 1)


# Scatter plot for two components with hue
plt.figure(figsize = (7, 7))

sns.scatterplot(x = 'X1', y = 'X2', data = data_air_pol_tsne, hue = 'groups')

<AxesSubplot:xlabel='X1', ylabel='X2'>


all_col = data_air_pol_scaled.columns[:-4].tolist()

plt.figure(figsize = (30, 50))

for i, variable in enumerate(all_col):
    plt.subplot(8, 3, i + 1)
    
    sns.boxplot(y=data_air_pol_scaled[variable], x=data_air_pol_tsne['groups'])
    
    plt.tight_layout()
    
    plt.title(variable)

plt.show()

	SrNo	Date	NO	CO	NO2	O3	SO2	PM2.5	Benzene	Toulene	...	WindSpeed	VerticalWindSpeed	Solar	BarPressure	Weather	PD_PM2.5	PD_PM10	PD_NO2	PD_SO2	PD_CO
0	1	04-04-2015	7.22	1.77	47.94	51.07	16.88	48.99	2.53	9.65	...	1.22	0.08	162.18	732.25	Summer	NaN	NaN	NaN	NaN	NaN
1	2	05-04-2015	6.99	0.22	45.27	19.26	16.71	60.20	3.19	11.10	...	0.62	-0.04	99.37	734.05	Summer	48.99	82.85	47.94	16.88	1.77
2	3	09-04-2015	7.60	0.50	59.86	94.29	13.11	46.93	2.29	8.61	...	1.55	-0.17	146.94	728.08	Summer	NaN	NaN	NaN	NaN	NaN
3	4	10-04-2015	7.57	0.77	63.56	66.91	16.19	112.95	3.92	10.76	...	1.18	-0.15	150.07	730.47	Summer	46.93	171.36	59.86	13.11	0.50
4	5	11-04-2015	8.34	0.48	61.99	69.48	20.28	104.87	5.19	15.95	...	0.88	0.15	137.01	730.62	Summer	112.95	232.22	63.56	16.19	0.77

	count	mean	std	min	25%	50%	75%	max
NO	401.0	81.323292	97.204368	1.12	11.3800	39.220	118.0300	480.90
CO	402.0	2.210000	1.601589	0.22	1.2025	1.780	2.7375	19.90
NO2	401.0	74.433791	25.862319	25.27	54.5000	73.680	91.4300	149.01
O3	397.0	56.298917	29.690436	4.98	30.1200	53.210	76.8500	159.70
SO2	399.0	25.988496	25.273204	0.00	9.5150	21.680	37.1450	371.75
PM2.5	401.0	130.160673	84.196954	18.75	68.1100	108.240	166.1600	550.23
Benzene	402.0	6.525945	3.983177	0.16	3.2350	5.910	8.9400	17.51
Toulene	402.0	17.192488	10.410131	0.00	9.6950	16.240	22.9725	48.41
P_Xylene	372.0	8.128844	6.733961	0.30	3.3825	6.060	10.5700	32.35
NOx	401.0	194.681222	141.121196	26.49	87.6700	152.310	264.3800	743.70
PM10	401.0	265.254389	134.702840	36.63	163.6600	249.670	353.2700	705.70
WindDirection	402.0	202.851915	31.287309	126.69	180.4025	209.415	228.8675	258.06
NH3	401.0	39.630998	20.754359	1.00	25.9000	34.700	50.7200	144.82
RH	402.0	52.146990	15.878966	15.19	41.5750	53.870	63.8525	86.82
Temp	401.0	24.425686	7.419778	6.56	18.8200	26.570	29.8600	41.05
WindSpeed	402.0	1.238035	0.556081	0.30	0.8525	1.160	1.5200	3.45
VerticalWindSpeed	401.0	0.188853	0.785720	-2.87	-0.1700	-0.050	0.4000	3.56
Solar	401.0	112.954813	37.465048	3.31	84.9000	121.850	141.0300	345.68
BarPressure	401.0	732.560698	1.799439	721.67	731.6900	733.380	733.7600	736.08
PD_PM2.5	393.0	130.505445	84.786348	18.75	68.0500	108.240	170.9900	550.23
PD_PM10	392.0	265.622551	135.526761	36.63	163.3800	250.105	357.9550	705.70
PD_NO2	391.0	74.319003	25.996784	25.27	54.0300	73.520	91.3700	149.01
PD_SO2	390.0	25.964949	25.417022	0.00	9.4125	21.605	37.1975	371.75
PD_CO	392.0	2.200765	1.602474	0.37	1.2075	1.775	2.6675	19.90

	PC1	PC2	PC3	PC4	PC5
NO	0.25	-0.05	0.18	0.14	-0.13
CO	0.21	0.04	0.18	-0.00	-0.03
NO2	0.19	-0.22	0.18	0.06	0.24
O3	0.02	-0.38	-0.02	0.18	0.08
SO2	0.12	-0.19	-0.20	0.28	-0.11
PM2.5	0.26	-0.06	-0.10	-0.18	-0.14
Benzene	0.27	0.09	0.15	0.01	-0.01
Toulene	0.25	0.10	0.27	0.08	0.01
P_Xylene	0.25	0.07	0.22	0.03	-0.10
NOx	0.24	0.01	0.26	0.15	-0.12
PM10	0.23	-0.17	-0.10	-0.16	-0.20
WindDirection	0.09	-0.06	0.03	0.13	-0.56
NH3	0.24	0.04	-0.12	-0.08	0.11
RH	0.10	0.46	-0.02	0.01	0.18
Temp	-0.21	-0.17	0.30	-0.06	-0.20
WindSpeed	-0.20	-0.04	-0.27	-0.07	-0.07
VerticalWindSpeed	-0.03	-0.22	0.28	-0.20	0.31
Solar	-0.18	-0.22	0.14	0.11	-0.27
BarPressure	0.13	-0.01	-0.29	0.27	-0.06
PD_PM2.5	0.24	-0.05	-0.18	-0.24	-0.03
PD_PM10	0.22	-0.15	-0.19	-0.24	-0.04
PD_NO2	0.18	-0.23	0.08	-0.02	0.28
PD_SO2	0.13	-0.17	-0.16	0.24	-0.08
PD_CO	0.19	0.05	0.06	-0.13	0.07
Weather_Monsoon	-0.10	0.36	0.15	0.08	-0.24
Weather_Spring	0.02	-0.04	-0.19	0.53	0.29
Weather_Summer	-0.13	-0.33	0.02	-0.28	-0.05
Weather_Winter	0.17	0.12	-0.33	-0.25	-0.09

Air Pollution Data Exploration¶

Context¶

Objective¶

Dataset¶

Importing libraries and overview of the dataset¶

Data Preprocessing and Exploratory Data Analysis¶

Summary Statistics¶

Data Preprocessing¶

Scaling the data¶

Principal Component Analysis¶

t-SNE¶