Today, large shopping centers such as big malls and marts record data related to the sales of their items or products as an important step toward predicting sales and anticipating future demand, which helps with inventory management. Understanding what role certain properties of an item play and how they affect its sales is imperative for any retail business.
The Data Scientists at BigMart have collected 2013 sales data for 1559 products across 10 stores in different cities. Also, certain attributes of each product and store have been defined. Using this data, BigMart is trying to understand the properties of products and stores which play a key role in increasing sales.
The objective is to build a predictive model that can estimate the sales of each product at a particular store, and then provide actionable recommendations to the BigMart sales team about the properties of products and stores that play a key role in increasing sales.
Item_Identifier : Unique product ID
Item_Weight : Weight of the product
Item_Fat_Content : Whether the product is low fat or not
Item_Visibility : The % of the total display area of all products in a store allocated to the particular product
Item_Type : The category to which the product belongs
Item_MRP : Maximum Retail Price (list price) of the product
Outlet_Identifier : Unique store ID
Outlet_Establishment_Year : The year in which the store was established
Outlet_Size : The size of the store in terms of ground area covered
Outlet_Location_Type : The type of city in which the store is located
Outlet_Type : Whether the outlet is just a grocery store or some sort of supermarket
Item_Outlet_Sales : Sales of the product in the particular store. This is the outcome variable to be predicted.
We have two datasets - train (8,523 rows) and test (5,681 rows). The training dataset has both the input and output variables. We need to predict the sales for the test dataset.
Please note that the data may have missing values as some stores might not report all the data due to technical glitches. Hence, it will be required to treat them accordingly.
# Importing libraries for data manipulation
import numpy as np
import pandas as pd
# Importing libraries for data visualization
import seaborn as sns
import matplotlib.pyplot as plt
# Importing libraries for building linear regression model
import statsmodels.api as sm
from statsmodels.stats.outliers_influence import variance_inflation_factor
# Importing libraries for scaling the data
from sklearn.preprocessing import MinMaxScaler
# To ignore warnings
import warnings
warnings.filterwarnings("ignore")
# Loading both train and test datasets
train_df = pd.read_csv('Train.csv')
test_df = pd.read_csv('Test.csv')
# Checking the first 5 rows of the dataset
train_df.head()
Item_Identifier | Item_Weight | Item_Fat_Content | Item_Visibility | Item_Type | Item_MRP | Outlet_Identifier | Outlet_Establishment_Year | Outlet_Size | Outlet_Location_Type | Outlet_Type | Item_Outlet_Sales | |
---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | FDA15 | 9.30 | Low Fat | 0.016047 | Dairy | 249.8092 | OUT049 | 1999 | Medium | Tier 1 | Supermarket Type1 | 3735.1380 |
1 | DRC01 | 5.92 | Regular | 0.019278 | Soft Drinks | 48.2692 | OUT018 | 2009 | Medium | Tier 3 | Supermarket Type2 | 443.4228 |
2 | FDN15 | 17.50 | Low Fat | 0.016760 | Meat | 141.6180 | OUT049 | 1999 | Medium | Tier 1 | Supermarket Type1 | 2097.2700 |
3 | FDX07 | 19.20 | Regular | 0.000000 | Fruits and Vegetables | 182.0950 | OUT010 | 1998 | NaN | Tier 3 | Grocery Store | 732.3800 |
4 | NCD19 | 8.93 | Low Fat | 0.000000 | Household | 53.8614 | OUT013 | 1987 | High | Tier 3 | Supermarket Type1 | 994.7052 |
Observations:
# Dropping the identifier columns, as they are IDs rather than predictive features
train_df = train_df.drop(['Item_Identifier', 'Outlet_Identifier'], axis = 1)
test_df = test_df.drop(['Item_Identifier', 'Outlet_Identifier'], axis = 1)
Now, let's find out some more information about the training dataset, i.e., the total number of observations in the dataset, columns and their data types, etc.
train_df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 8523 entries, 0 to 8522 Data columns (total 10 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Item_Weight 7060 non-null float64 1 Item_Fat_Content 8523 non-null object 2 Item_Visibility 8523 non-null float64 3 Item_Type 8523 non-null object 4 Item_MRP 8523 non-null float64 5 Outlet_Establishment_Year 8523 non-null int64 6 Outlet_Size 6113 non-null object 7 Outlet_Location_Type 8523 non-null object 8 Outlet_Type 8523 non-null object 9 Item_Outlet_Sales 8523 non-null float64 dtypes: float64(4), int64(1), object(5) memory usage: 666.0+ KB
Observations:
As we have already seen, two columns have missing values in the dataset. Let's check the percentage of missing values using the below code.
(train_df.isnull().sum() / train_df.shape[0])*100
Item_Weight 17.165317 Item_Fat_Content 0.000000 Item_Visibility 0.000000 Item_Type 0.000000 Item_MRP 0.000000 Outlet_Establishment_Year 0.000000 Outlet_Size 28.276428 Outlet_Location_Type 0.000000 Outlet_Type 0.000000 Item_Outlet_Sales 0.000000 dtype: float64
The percentage of missing values for the columns Item_Weight and Outlet_Size is ~17% and ~28% respectively. Now, let's see next how to treat these missing values.
Now that we understand the business problem and have loaded the datasets, the next step is to build a better understanding of the data: the distribution of each variable, the relationships that exist between variables, and so on. If there are data anomalies such as missing values or outliers, we also need to decide how to treat them to prepare the dataset for building the predictive model.
Let us now start exploring the dataset by performing univariate analysis, i.e., analyzing/visualizing one variable at a time. Data visualization is an important skill, and we need to decide which charts to plot to understand the data best: the appropriate chart depends on whether a variable is categorical or numerical, and on the relationship between variables we want to show.
Let's start with analyzing the categorical variables present in the data. There are five categorical variables in this dataset and we will create univariate bar charts for each of them to check their distribution.
Below we are creating 4 subplots to plot four out of the five categorical variables in a single frame.
fig, axes = plt.subplots(2, 2, figsize = (18, 10))
fig.suptitle('Bar plot for all categorical variables in the dataset')
sns.countplot(ax = axes[0, 0], x = 'Item_Fat_Content', data = train_df, color = 'blue',
order = train_df['Item_Fat_Content'].value_counts().index);
sns.countplot(ax = axes[0, 1], x = 'Outlet_Size', data = train_df, color = 'blue',
order = train_df['Outlet_Size'].value_counts().index);
sns.countplot(ax = axes[1, 0], x = 'Outlet_Location_Type', data = train_df, color = 'blue',
order = train_df['Outlet_Location_Type'].value_counts().index);
sns.countplot(ax = axes[1, 1], x = 'Outlet_Type', data = train_df, color = 'blue',
order = train_df['Outlet_Type'].value_counts().index);
Observations:
Below we are analyzing the categorical variable Item_Type.
fig = plt.figure(figsize = (18, 6))
sns.countplot(x = 'Item_Type', data = train_df, color = 'blue', order = train_df['Item_Type'].value_counts().index);
plt.xticks(rotation = 45);
Observation:
Before we move ahead with the univariate analysis for the numerical variables, let's first fix the data issues that we have found for the column Item_Fat_Content.
In the code below, we replace the categories low fat and LF with Low Fat using a lambda function, and we also replace the category reg with Regular.
train_df['Item_Fat_Content'] = train_df['Item_Fat_Content'].apply(lambda x: 'Low Fat' if x == 'low fat' or x == 'LF' else x)
train_df['Item_Fat_Content'] = train_df['Item_Fat_Content'].apply(lambda x: 'Regular' if x == 'reg' else x)
Whatever data preparation steps we apply to the training data must also be applied to the test data. So, below we perform the same transformations on the test dataset.
test_df['Item_Fat_Content'] = test_df['Item_Fat_Content'].apply(lambda x: 'Low Fat' if x == 'low fat' or x == 'LF' else x)
test_df['Item_Fat_Content'] = test_df['Item_Fat_Content'].apply(lambda x: 'Regular' if x == 'reg' else x)
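As a side note, the same cleanup can be written more compactly with pandas' replace method, which maps several raw labels at once. Below is a minimal, equivalent sketch (not used above), assuming the same train_df and test_df DataFrames.
# Equivalent cleanup using a single mapping with pandas' replace
fat_content_map = {'low fat': 'Low Fat', 'LF': 'Low Fat', 'reg': 'Regular'}
for df in (train_df, test_df):
    df['Item_Fat_Content'] = df['Item_Fat_Content'].replace(fat_content_map)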
Below, we analyze all the numerical variables present in the data. Since we want to visualize the distribution of one numerical variable at a time, a histogram is a natural choice.
fig, axes = plt.subplots(1, 3, figsize = (20, 6))
fig.suptitle('Histogram for all numerical variables in the dataset')
sns.histplot(x = 'Item_Weight', data = train_df, kde = True, ax = axes[0]);
sns.histplot(x = 'Item_Visibility', data = train_df, kde = True, ax = axes[1]);
sns.histplot(x = 'Item_MRP', data = train_df, kde = True, ax = axes[2]);
Now, let's move ahead with bivariate analysis to understand how variables are related to each other and if there is a strong relationship between dependent and independent variables present in the training dataset.
In the plot below, we analyze the variables Outlet_Establishment_Year and Item_Outlet_Sales. Since Outlet_Establishment_Year represents a time component, a line plot is a suitable chart for this relationship.
fig = plt.figure(figsize = (18, 6))
sns.lineplot(x = 'Outlet_Establishment_Year', y = 'Item_Outlet_Sales', data = train_df, ci = None, estimator = 'mean');
Observations:
Next, we look for linear correlations between the variables. This will help us know which numerical variables are correlated with the target variable. It also lets us detect multicollinearity, i.e., which pairs of independent variables are correlated with each other.
fig = plt.figure(figsize = (18, 6))
sns.heatmap(train_df.corr(), annot = True);
plt.xticks(rotation = 45);
Observations:
Next, we are creating the bivariate scatter plots to check relationships between the pair of independent and dependent variables.
fig, axes = plt.subplots(1, 3, figsize = (20, 6))
fig.suptitle('Bi-variate scatterplot for all numerical variables with the dependent variable')
sns.scatterplot(x = 'Item_Weight', y = 'Item_Outlet_Sales', data = train_df, ax = axes[0]);
sns.scatterplot(x = 'Item_Visibility', y = 'Item_Outlet_Sales', data = train_df, ax = axes[1]);
sns.scatterplot(x = 'Item_MRP', y = 'Item_Outlet_Sales', data = train_df, ax = axes[2]);
Observations:
Data Description:
Observations from EDA:
Here, we impute the missing values for the variable Item_Weight. There are many ways to impute missing values: by the mean or median, or using more advanced imputation algorithms such as KNN. Here, however, we first look for relationships between Item_Weight and the other variables in the dataset to guide the imputation.
Also, after imputing the missing values, the overall distribution of the variable should not change significantly.
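For reference, the simplest baseline would be to impute Item_Weight with a single summary statistic such as the training median. We do not use this approach here, but a minimal sketch of it (applied to a copy of the data) would look as follows.
# Baseline alternative (not used here): impute Item_Weight with the training median
median_weight = train_df['Item_Weight'].median()
train_df_median_imputed = train_df.copy()
train_df_median_imputed['Item_Weight'] = train_df_median_imputed['Item_Weight'].fillna(median_weight)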
fig = plt.figure(figsize = (18, 3))
sns.heatmap(train_df.pivot_table(index = 'Item_Fat_Content', columns = 'Item_Type', values = 'Item_Weight'), annot = True);
plt.xticks(rotation = 45);
Observation:
fig = plt.figure(figsize = (18, 3))
sns.heatmap(train_df.pivot_table(index = 'Outlet_Type', columns = 'Outlet_Location_Type', values = 'Item_Weight'), annot = True);
plt.xticks(rotation = 45);
Observation:
fig = plt.figure(figsize = (18, 3))
sns.heatmap(train_df.pivot_table(index = 'Item_Fat_Content', columns = 'Outlet_Size', values = 'Item_Weight'), annot = True);
plt.xticks(rotation = 45);
Observation:
We will impute the missing values using a uniform distribution with parameters a=10 and b=14, as shown below:
item_weight_indices_to_be_updated = train_df[train_df['Item_Weight'].isnull()].index
train_df.loc[item_weight_indices_to_be_updated, 'Item_Weight'] = np.random.uniform(10, 14,
len(item_weight_indices_to_be_updated))
Performing the same transformation on the test dataset.
item_weight_indices_to_be_updated = test_df[test_df['Item_Weight'].isnull()].index
test_df.loc[item_weight_indices_to_be_updated, 'Item_Weight'] = np.random.uniform(10, 14,
len(item_weight_indices_to_be_updated))
Next, we will be imputing missing values for the column Outlet_Size. Below, we are creating two different datasets - one where we have non-null values for the column Outlet_Size and in the other dataset, all the values of the column Outlet_Size are missing. We then check the distribution of the other variables in the dataset where Outlet_Size is missing to identify if there is any pattern present or not.
outlet_size_data = train_df[train_df['Outlet_Size'].notnull()]
outlet_size_missing_data = train_df[train_df['Outlet_Size'].isnull()]
fig, axes = plt.subplots(1, 3, figsize = (18, 6))
fig.suptitle('Bar plot for all categorical variables in the dataset where the variable Outlet_Size is missing')
sns.countplot(ax = axes[0], x = 'Outlet_Type', data = outlet_size_missing_data, color = 'blue',
order = outlet_size_missing_data['Outlet_Type'].value_counts().index);
sns.countplot(ax = axes[1], x = 'Outlet_Location_Type', data = outlet_size_missing_data, color = 'blue',
order = outlet_size_missing_data['Outlet_Location_Type'].value_counts().index);
sns.countplot(ax = axes[2], x = 'Item_Fat_Content', data = outlet_size_missing_data, color = 'blue',
order = outlet_size_missing_data['Item_Fat_Content'].value_counts().index);
Observation:
Now, we are creating a cross-tab of all the above categorical variables against the column Outlet_Size, where we want to impute the missing values.
fig = plt.figure(figsize = (18, 3))
sns.heatmap(pd.crosstab(index = outlet_size_data['Outlet_Type'], columns = outlet_size_data['Outlet_Size']), annot = True, fmt = 'g')
plt.xticks(rotation = 45);
Observations:
fig = plt.figure(figsize = (18, 3))
sns.heatmap(pd.crosstab(index = outlet_size_data['Outlet_Location_Type'], columns = outlet_size_data['Outlet_Size']), annot = True, fmt = 'g')
plt.xticks(rotation = 45);
Observation:
fig = plt.figure(figsize = (18, 3))
sns.heatmap(pd.crosstab(index = outlet_size_data['Item_Fat_Content'], columns = outlet_size_data['Outlet_Size']), annot = True, fmt = 'g')
plt.xticks(rotation = 45);
Observation:
Now, we will use the patterns we have from the variables Outlet_Type and Outlet_Location_Type to impute the missing values for the column Outlet_Size.
Below, we are identifying the indices in the DataFrame where Outlet_Size is null/missing and Outlet_Type is Grocery Store, so that we can replace those missing values with the value Small based on the pattern we have identified in above visualizations. Similarly, we are also identifying the indices in the DataFrame where Outlet_Size is null/missing and Outlet_Location_Type is Tier 2, to impute those missing values.
grocery_store_indices = train_df[train_df['Outlet_Size'].isnull()].query(" Outlet_Type == 'Grocery Store' ").index
tier_2_indices = train_df[train_df['Outlet_Size'].isnull()].query(" Outlet_Location_Type == 'Tier 2' ").index
Now, we are updating those indices for the column Outlet_Size with the value Small.
train_df.loc[grocery_store_indices, 'Outlet_Size'] = 'Small'
train_df.loc[tier_2_indices, 'Outlet_Size'] = 'Small'
Performing the same transformations on the test dataset.
grocery_store_indices = test_df[test_df['Outlet_Size'].isnull()].query(" Outlet_Type == 'Grocery Store' ").index
tier_2_indices = test_df[test_df['Outlet_Size'].isnull()].query(" Outlet_Location_Type == 'Tier 2' ").index
test_df.loc[grocery_store_indices, 'Outlet_Size'] = 'Small'
test_df.loc[tier_2_indices, 'Outlet_Size'] = 'Small'
After we have imputed the missing values, let's check again if we still have missing values in both train and test datasets.
train_df.isnull().sum()
Item_Weight 0 Item_Fat_Content 0 Item_Visibility 0 Item_Type 0 Item_MRP 0 Outlet_Establishment_Year 0 Outlet_Size 0 Outlet_Location_Type 0 Outlet_Type 0 Item_Outlet_Sales 0 dtype: int64
test_df.isnull().sum()
Item_Weight 0 Item_Fat_Content 0 Item_Visibility 0 Item_Type 0 Item_MRP 0 Outlet_Establishment_Year 0 Outlet_Size 0 Outlet_Location_Type 0 Outlet_Type 0 dtype: int64
There are no missing values in the datasets.
Now that we have completed the data understanding and data preparation steps, and before starting with the modeling task, we note that some potentially useful features are not present in the dataset but can be created from the existing columns; such derived features may have predictive power for sales. This step of creating new features from existing ones is known as Feature Engineering. We start with a hypothesis: as a store gets older, its sales increase. How do we define old? We know the establishment year, and this data was collected in 2013, so the age of a store can be found by subtracting the establishment year from 2013. This is what we do in the code below.
We are creating a new feature Outlet_Age which indicates how old the outlet is.
train_df['Outlet_Age'] = 2013 - train_df['Outlet_Establishment_Year']
test_df['Outlet_Age'] = 2013 - test_df['Outlet_Establishment_Year']
fig = plt.figure(figsize = (18, 6))
sns.boxplot(x = 'Outlet_Age', y = 'Item_Outlet_Sales', data = train_df);
Observations:
Now that we have analyzed all the variables in the dataset, we are ready to start building the model. We have observed that not all the independent variables may be important for predicting the outcome variable. To begin with, we will use all of them and then, based on the model summary, decide which variables to remove. Model building is an iterative task.
# We are removing the outcome variable from the feature set
# Also removing the variable Outlet_Establishment_Year, as we have created a new variable Outlet_Age
train_features = train_df.drop(['Item_Outlet_Sales', 'Outlet_Establishment_Year'], axis = 1)
# And then we are extracting the outcome variable separately
train_target = train_df['Item_Outlet_Sales']
Whenever we have categorical variables as independent variables, we need to create a one-hot encoded representation of them (also known as dummy variables). The code below creates dummy variables and drops the first category of each variable, which becomes the reference category. The reference category helps in interpreting the linear regression coefficients, as we will see later.
# Creating dummy variables for the categorical variables
train_features = pd.get_dummies(train_features, drop_first = True)
train_features.head()
Item_Weight | Item_Visibility | Item_MRP | Outlet_Age | Item_Fat_Content_Regular | Item_Type_Breads | Item_Type_Breakfast | Item_Type_Canned | Item_Type_Dairy | Item_Type_Frozen Foods | ... | Item_Type_Snack Foods | Item_Type_Soft Drinks | Item_Type_Starchy Foods | Outlet_Size_Medium | Outlet_Size_Small | Outlet_Location_Type_Tier 2 | Outlet_Location_Type_Tier 3 | Outlet_Type_Supermarket Type1 | Outlet_Type_Supermarket Type2 | Outlet_Type_Supermarket Type3 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 9.30 | 0.016047 | 249.8092 | 14 | 0 | 0 | 0 | 0 | 1 | 0 | ... | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 |
1 | 5.92 | 0.019278 | 48.2692 | 4 | 1 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 1 | 0 | 1 | 0 | 0 | 1 | 0 | 1 | 0 |
2 | 17.50 | 0.016760 | 141.6180 | 14 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 |
3 | 19.20 | 0.000000 | 182.0950 | 15 | 1 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 0 |
4 | 8.93 | 0.000000 | 53.8614 | 26 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 1 | 0 | 0 |
5 rows × 27 columns
Observations:
Below, we scale the numerical variables in the dataset so that they have the same range. Without scaling, variables with larger ranges would dominate those with smaller ranges and make the coefficients hard to compare. There are many ways to scale data. Here, we use MinMaxScaler because the dataset contains both categorical (dummy) and numerical variables, and we do not want to change the 0/1 dummy encodings we have already created. For more information on different ways of scaling, refer to section 6.3.1 of the scikit-learn user guide on preprocessing.
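For each column, MinMaxScaler applies the transformation below, mapping the observed minimum to 0 and the maximum to 1. Since the dummy variables already take only the values 0 and 1, this transformation leaves them unchanged.
$x_{scaled} = \dfrac{x - x_{min}}{x_{max} - x_{min}}$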
# Creating an instance of the MinMaxScaler
scaler = MinMaxScaler()
# Applying fit_transform on the training features data
train_features_scaled = scaler.fit_transform(train_features)
# The above scaler returns the data in array format, below we are converting it back to pandas DataFrame
train_features_scaled = pd.DataFrame(train_features_scaled, index = train_features.index, columns = train_features.columns)
train_features_scaled.head()
Item_Weight | Item_Visibility | Item_MRP | Outlet_Age | Item_Fat_Content_Regular | Item_Type_Breads | Item_Type_Breakfast | Item_Type_Canned | Item_Type_Dairy | Item_Type_Frozen Foods | ... | Item_Type_Snack Foods | Item_Type_Soft Drinks | Item_Type_Starchy Foods | Outlet_Size_Medium | Outlet_Size_Small | Outlet_Location_Type_Tier 2 | Outlet_Location_Type_Tier 3 | Outlet_Type_Supermarket Type1 | Outlet_Type_Supermarket Type2 | Outlet_Type_Supermarket Type3 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0.282525 | 0.048866 | 0.927507 | 0.416667 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 |
1 | 0.081274 | 0.058705 | 0.072068 | 0.000000 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 1.0 | 0.0 | 1.0 | 0.0 | 0.0 | 1.0 | 0.0 | 1.0 | 0.0 |
2 | 0.770765 | 0.051037 | 0.468288 | 0.416667 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 |
3 | 0.871986 | 0.000000 | 0.640093 | 0.458333 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 |
4 | 0.260494 | 0.000000 | 0.095805 | 0.916667 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 1.0 | 0.0 | 0.0 |
5 rows × 27 columns
Now, as the dataset is ready, we are set to build the model using the statsmodels package.
# Adding the intercept term
train_features_scaled = sm.add_constant(train_features_scaled)
# Calling the OLS algorithm on the train features and the target variable
ols_model_0 = sm.OLS(train_target, train_features_scaled)
# Fitting the Model
ols_res_0 = ols_model_0.fit()
print(ols_res_0.summary())
OLS Regression Results ============================================================================== Dep. Variable: Item_Outlet_Sales R-squared: 0.563 Model: OLS Adj. R-squared: 0.562 Method: Least Squares F-statistic: 405.8 Date: Sat, 06 Aug 2022 Prob (F-statistic): 0.00 Time: 16:25:01 Log-Likelihood: -71993. No. Observations: 8523 AIC: 1.440e+05 Df Residuals: 8495 BIC: 1.442e+05 Df Model: 27 Covariance Type: nonrobust =================================================================================================== coef std err t P>|t| [0.025 0.975] --------------------------------------------------------------------------------------------------- const 192.2250 508.450 0.378 0.705 -804.460 1188.910 Item_Weight -18.6789 48.666 -0.384 0.701 -114.075 76.718 Item_Visibility -99.6205 81.718 -1.219 0.223 -259.807 60.566 Item_MRP 3668.9239 46.594 78.742 0.000 3577.588 3760.259 Outlet_Age -746.0040 250.310 -2.980 0.003 -1236.672 -255.336 Item_Fat_Content_Regular 40.1790 28.244 1.423 0.155 -15.187 95.545 Item_Type_Breads 5.2058 84.085 0.062 0.951 -159.622 170.033 Item_Type_Breakfast 7.5314 116.664 0.065 0.949 -221.159 236.222 Item_Type_Canned 26.8954 62.797 0.428 0.668 -96.202 149.993 Item_Type_Dairy -39.8361 62.264 -0.640 0.522 -161.889 82.216 Item_Type_Frozen Foods -27.2245 58.897 -0.462 0.644 -142.678 88.229 Item_Type_Fruits and Vegetables 30.0232 54.984 0.546 0.585 -77.760 137.806 Item_Type_Hard Drinks -2.1841 90.229 -0.024 0.981 -179.055 174.687 Item_Type_Health and Hygiene -11.3017 68.044 -0.166 0.868 -144.685 122.082 Item_Type_Household -38.1290 59.952 -0.636 0.525 -155.650 79.392 Item_Type_Meat 0.8572 70.685 0.012 0.990 -137.702 139.416 Item_Type_Others -21.8458 98.673 -0.221 0.825 -215.269 171.578 Item_Type_Seafood 186.2079 148.080 1.257 0.209 -104.065 476.481 Item_Type_Snack Foods -10.0531 55.274 -0.182 0.856 -118.405 98.298 Item_Type_Soft Drinks -27.7325 70.204 -0.395 0.693 -165.350 109.885 Item_Type_Starchy Foods 23.9142 103.091 0.232 0.817 -178.170 225.999 Outlet_Size_Medium -728.1544 274.677 -2.651 0.008 -1266.588 -189.721 Outlet_Size_Small -762.6946 254.941 -2.992 0.003 -1262.441 -262.948 Outlet_Location_Type_Tier 2 -168.8740 87.637 -1.927 0.054 -340.665 2.917 Outlet_Location_Type_Tier 3 -421.2031 152.008 -2.771 0.006 -719.176 -123.230 Outlet_Type_Supermarket Type1 1516.2963 140.022 10.829 0.000 1241.819 1790.774 Outlet_Type_Supermarket Type2 1252.0527 123.873 10.108 0.000 1009.232 1494.874 Outlet_Type_Supermarket Type3 3724.0002 176.037 21.155 0.000 3378.925 4069.076 ============================================================================== Omnibus: 961.289 Durbin-Watson: 2.004 Prob(Omnibus): 0.000 Jarque-Bera (JB): 2293.830 Skew: 0.667 Prob(JB): 0.00 Kurtosis: 5.163 Cond. No. 105. ============================================================================== Notes: [1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
Observations:
Interpreting the Regression Results:
Adj. R-squared: It reflects the fit of the model, adjusted for the number of predictors.
coef: It represents the change in the output Y due to a one-unit change in the corresponding independent variable (everything else held constant).
P>|t|: It is the p-value.
For each independent feature, there is a null hypothesis and an alternate hypothesis:
Ho : The independent feature is not significant.
Ha : The independent feature is significant.
A p-value of less than 0.05 is considered statistically significant at a 95% confidence level.
To understand in detail how p-values can help to identify statistically significant variables to predict the sales, we need to understand the hypothesis testing framework.
In the following example, we set up these hypotheses for the independent variable Outlet_Size and the dependent variable Item_Outlet_Sales to identify whether there is any relationship between them.
From the above model summary, if the p-value is less than the significance level of 0.05, then we will reject the null hypothesis in favor of the alternate hypothesis. In other words, we have enough statistical evidence that there is some relationship between the variables Outlet_Size and Item_Outlet_Sales.
Based on this, if we look at the model summary above, we can see that only some of the variables, or some categories of the categorical variables, have a p-value lower than 0.05.
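As a quick check, the p-values can also be pulled directly from the fitted results object. The short sketch below (supplementary to the summary above) lists the terms whose p-values are below 0.05.
# Listing the terms whose p-values are below the 0.05 significance level
significant_vars = ols_res_0.pvalues[ols_res_0.pvalues < 0.05]
print(significant_vars)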
Multicollinearity occurs when predictor variables in a regression model are correlated. This correlation is a problem because predictor variables should be independent. If the correlation between independent variables is high, it can cause problems when we fit the model and interpret the results. When we have multicollinearity in the linear model, the coefficients that the model suggests are unreliable.
There are different ways of detecting (or testing for) multicollinearity. One such way is the Variance Inflation Factor.
Variance Inflation Factor (VIF): VIF measures the inflation in the variances of the regression parameter estimates due to collinearities that exist among the predictors. It is a measure of how much the variance of the estimated regression coefficient βk is "inflated" by the existence of correlation among the predictor variables in the model.
General rule of thumb: If VIF is 1, then there is no correlation between the kth predictor and the remaining predictor variables, and hence the variance of β̂k is not inflated at all. If VIF exceeds 5, there is moderate multicollinearity, and if it reaches or exceeds 10, there are signs of high multicollinearity.
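Concretely, the VIF for the k-th predictor is computed from the R-squared ($R_k^2$) obtained by regressing that predictor on all the remaining predictors:
$VIF_k = \dfrac{1}{1 - R_k^2}$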
vif_series = pd.Series(
[variance_inflation_factor(train_features_scaled.values, i) for i in range(train_features_scaled.shape[1])],
index = train_features_scaled.columns,
dtype = float)
print("VIF Scores: \n\n{}\n".format(vif_series))
VIF Scores: const 1726.930249 Item_Weight 1.020436 Item_Visibility 1.101135 Item_MRP 1.013143 Outlet_Age 50.920787 Item_Fat_Content_Regular 1.216616 Item_Type_Breads 1.349952 Item_Type_Breakfast 1.158277 Item_Type_Canned 1.853154 Item_Type_Dairy 1.906438 Item_Type_Frozen Foods 2.093556 Item_Type_Fruits and Vegetables 2.497309 Item_Type_Hard Drinks 1.331226 Item_Type_Health and Hygiene 1.771880 Item_Type_Household 2.289816 Item_Type_Meat 1.581290 Item_Type_Others 1.264078 Item_Type_Seafood 1.091655 Item_Type_Snack Foods 2.468958 Item_Type_Soft Drinks 1.629245 Item_Type_Starchy Foods 1.211395 Outlet_Size_Medium 111.035924 Outlet_Size_Small 106.822011 Outlet_Location_Type_Tier 2 11.286485 Outlet_Location_Type_Tier 3 36.822583 Outlet_Type_Supermarket Type1 29.622481 Outlet_Type_Supermarket Type2 9.945411 Outlet_Type_Supermarket Type3 20.218127 dtype: float64
Outlet_Age has a high VIF score (it is the only non-dummy variable with a high VIF). Hence, we drop Outlet_Age and rebuild the model.
train_features_scaled_new = train_features_scaled.drop("Outlet_Age", axis = 1)
vif_series = pd.Series(
[variance_inflation_factor(train_features_scaled_new.values, i) for i in range(train_features_scaled_new.shape[1])],
index = train_features_scaled_new.columns,
dtype = float)
print("VIF Scores: \n\n{}\n".format(vif_series))
VIF Scores: const 121.393000 Item_Weight 1.020365 Item_Visibility 1.101115 Item_MRP 1.013102 Item_Fat_Content_Regular 1.216573 Item_Type_Breads 1.349651 Item_Type_Breakfast 1.158268 Item_Type_Canned 1.853093 Item_Type_Dairy 1.906435 Item_Type_Frozen Foods 2.093311 Item_Type_Fruits and Vegetables 2.497137 Item_Type_Hard Drinks 1.331145 Item_Type_Health and Hygiene 1.771845 Item_Type_Household 2.289797 Item_Type_Meat 1.581277 Item_Type_Others 1.264023 Item_Type_Seafood 1.091493 Item_Type_Snack Foods 2.468907 Item_Type_Soft Drinks 1.629240 Item_Type_Starchy Foods 1.211392 Outlet_Size_Medium 10.994155 Outlet_Size_Small 12.274728 Outlet_Location_Type_Tier 2 2.691632 Outlet_Location_Type_Tier 3 7.538596 Outlet_Type_Supermarket Type1 5.953573 Outlet_Type_Supermarket Type2 4.230171 Outlet_Type_Supermarket Type3 4.263294 dtype: float64
Now, let's build the model and observe the p-values.
ols_model_2 = sm.OLS(train_target, train_features_scaled_new)
ols_res_2 = ols_model_2.fit()
print(ols_res_2.summary())
OLS Regression Results ============================================================================== Dep. Variable: Item_Outlet_Sales R-squared: 0.563 Model: OLS Adj. R-squared: 0.561 Method: Least Squares F-statistic: 420.6 Date: Thu, 14 Apr 2022 Prob (F-statistic): 0.00 Time: 17:11:00 Log-Likelihood: -71997. No. Observations: 8523 AIC: 1.440e+05 Df Residuals: 8496 BIC: 1.442e+05 Df Model: 26 Covariance Type: nonrobust =================================================================================================== coef std err t P>|t| [0.025 0.975] --------------------------------------------------------------------------------------------------- const -1270.9837 134.780 -9.430 0.000 -1535.186 -1006.782 Item_Weight -12.3498 48.666 -0.254 0.800 -107.747 83.047 Item_Visibility -98.4424 81.754 -1.204 0.229 -258.700 61.815 Item_MRP 3667.9397 46.614 78.688 0.000 3576.565 3759.314 Item_Fat_Content_Regular 40.7470 28.256 1.442 0.149 -14.643 96.137 Item_Type_Breads 1.7078 84.114 0.020 0.984 -163.176 166.591 Item_Type_Breakfast 8.4265 116.720 0.072 0.942 -220.373 237.226 Item_Type_Canned 25.8750 62.825 0.412 0.680 -97.278 149.028 Item_Type_Dairy -39.9257 62.291 -0.641 0.522 -162.031 82.179 Item_Type_Frozen Foods -25.4585 58.923 -0.432 0.666 -140.962 90.045 Item_Type_Fruits and Vegetables 28.4420 55.010 0.517 0.605 -79.391 136.275 Item_Type_Hard Drinks -4.0005 90.266 -0.044 0.965 -180.945 172.944 Item_Type_Health and Hygiene -10.5661 68.079 -0.155 0.877 -144.017 122.884 Item_Type_Household -38.9075 59.979 -0.649 0.517 -156.481 78.666 Item_Type_Meat 1.3535 70.719 0.019 0.985 -137.274 139.981 Item_Type_Others -24.1736 98.715 -0.245 0.807 -217.679 169.332 Item_Type_Seafood 180.8347 148.139 1.221 0.222 -109.554 471.224 Item_Type_Snack Foods -10.9284 55.305 -0.198 0.843 -119.339 97.482 Item_Type_Soft Drinks -27.2034 70.234 -0.387 0.699 -164.879 110.472 Item_Type_Starchy Foods 24.0722 103.143 0.233 0.815 -178.113 226.257 Outlet_Size_Medium 48.7150 86.480 0.563 0.573 -120.806 218.236 Outlet_Size_Small -48.0264 86.468 -0.555 0.579 -217.525 121.472 Outlet_Location_Type_Tier 2 59.0752 42.817 1.380 0.168 -24.858 143.008 Outlet_Location_Type_Tier 3 -17.3905 68.822 -0.253 0.801 -152.298 117.517 Outlet_Type_Supermarket Type1 1889.1598 62.814 30.075 0.000 1766.029 2012.290 Outlet_Type_Supermarket Type2 1531.9612 80.825 18.954 0.000 1373.524 1690.398 Outlet_Type_Supermarket Type3 3258.2514 80.874 40.288 0.000 3099.720 3416.783 ============================================================================== Omnibus: 962.884 Durbin-Watson: 2.004 Prob(Omnibus): 0.000 Jarque-Bera (JB): 2300.313 Skew: 0.668 Prob(JB): 0.00 Kurtosis: 5.166 Cond. No. 28.3 ============================================================================== Notes: [1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
It is generally not a good practice to rely on VIF values for dummy variables, as they tend to be correlated with the other categories of the same variable and therefore often show high VIFs. So, we rebuilt the model and examined the p-values instead. We can see that all the categories of the Item_Type column have p-values higher than 0.05, so we can drop the Item_Type dummy columns.
train_features_scaled_new2 = train_features_scaled_new.drop(['Item_Type_Breads',
'Item_Type_Breakfast',
'Item_Type_Canned',
'Item_Type_Dairy',
'Item_Type_Frozen Foods',
'Item_Type_Fruits and Vegetables',
'Item_Type_Hard Drinks',
'Item_Type_Health and Hygiene',
'Item_Type_Household',
'Item_Type_Meat',
'Item_Type_Others',
'Item_Type_Seafood',
'Item_Type_Snack Foods',
'Item_Type_Soft Drinks',
'Item_Type_Starchy Foods'], axis = 1)
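As a side note, the same drop can be written more compactly by selecting every dummy column whose name starts with Item_Type_. The sketch below is equivalent to the explicit list above.
# Equivalent, more compact way to drop all Item_Type dummy columns
item_type_cols = [col for col in train_features_scaled_new.columns if col.startswith('Item_Type_')]
train_features_scaled_new2 = train_features_scaled_new.drop(item_type_cols, axis = 1)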
vif_series = pd.Series(
[variance_inflation_factor(train_features_scaled_new2.values, i) for i in range(train_features_scaled_new2.shape[1])],
index = train_features_scaled_new2.columns,
dtype = float)
print("VIF Scores: \n\n{}\n".format(vif_series))
VIF Scores: const 109.452185 Item_Weight 1.007189 Item_Visibility 1.093214 Item_MRP 1.000729 Item_Fat_Content_Regular 1.003145 Outlet_Size_Medium 10.984463 Outlet_Size_Small 12.266452 Outlet_Location_Type_Tier 2 2.689564 Outlet_Location_Type_Tier 3 7.529063 Outlet_Type_Supermarket Type1 5.945893 Outlet_Type_Supermarket Type2 4.226082 Outlet_Type_Supermarket Type3 4.260733 dtype: float64
ols_model_3 = sm.OLS(train_target, train_features_scaled_new2)
ols_res_3 = ols_model_3.fit()
print(ols_res_3.summary())
OLS Regression Results ============================================================================== Dep. Variable: Item_Outlet_Sales R-squared: 0.563 Model: OLS Adj. R-squared: 0.562 Method: Least Squares F-statistic: 994.9 Date: Sat, 06 Aug 2022 Prob (F-statistic): 0.00 Time: 16:33:59 Log-Likelihood: -72000. No. Observations: 8523 AIC: 1.440e+05 Df Residuals: 8511 BIC: 1.441e+05 Df Model: 11 Covariance Type: nonrobust ================================================================================================= coef std err t P>|t| [0.025 0.975] ------------------------------------------------------------------------------------------------- const -1277.0790 127.990 -9.978 0.000 -1527.970 -1026.188 Item_Weight -19.4482 48.343 -0.402 0.687 -114.213 75.317 Item_Visibility -94.6918 81.414 -1.163 0.245 -254.284 64.900 Item_MRP 3666.7944 46.302 79.192 0.000 3576.030 3757.559 Item_Fat_Content_Regular 52.0902 25.644 2.031 0.042 1.821 102.359 Outlet_Size_Medium 48.7242 86.384 0.564 0.573 -120.609 218.057 Outlet_Size_Small -49.2185 86.382 -0.570 0.569 -218.547 120.110 Outlet_Location_Type_Tier 2 60.3934 42.776 1.412 0.158 -23.459 144.245 Outlet_Location_Type_Tier 3 -17.8502 68.728 -0.260 0.795 -152.573 116.873 Outlet_Type_Supermarket Type1 1888.6502 62.726 30.110 0.000 1765.692 2011.608 Outlet_Type_Supermarket Type2 1532.3453 80.740 18.979 0.000 1374.076 1690.614 Outlet_Type_Supermarket Type3 3258.5172 80.803 40.327 0.000 3100.124 3416.911 ============================================================================== Omnibus: 961.630 Durbin-Watson: 2.003 Prob(Omnibus): 0.000 Jarque-Bera (JB): 2292.565 Skew: 0.668 Prob(JB): 0.00 Kurtosis: 5.161 Cond. No. 25.6 ============================================================================== Notes: [1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
Now, from the p-values, we can see that we can remove the Item_Weight column, as its p-value (0.687) is far above 0.05, i.e., it is statistically insignificant.
train_features_scaled_new3 = train_features_scaled_new2.drop("Item_Weight", axis = 1)
vif_series = pd.Series(
[variance_inflation_factor(train_features_scaled_new3.values, i) for i in range(train_features_scaled_new3.shape[1])],
index = train_features_scaled_new3.columns,
dtype = float)
print("VIF Scores: \n\n{}\n".format(vif_series))
VIF Scores: const 106.905754 Item_Visibility 1.093065 Item_MRP 1.000167 Item_Fat_Content_Regular 1.002641 Outlet_Size_Medium 10.977215 Outlet_Size_Small 12.259457 Outlet_Location_Type_Tier 2 2.689448 Outlet_Location_Type_Tier 3 7.519408 Outlet_Type_Supermarket Type1 5.938545 Outlet_Type_Supermarket Type2 4.225983 Outlet_Type_Supermarket Type3 4.255095 dtype: float64
Let's build the model again.
ols_model_4 = sm.OLS(train_target, train_features_scaled_new3)
ols_res_4 = ols_model_4.fit()
print(ols_res_4.summary())
OLS Regression Results ============================================================================== Dep. Variable: Item_Outlet_Sales R-squared: 0.563 Model: OLS Adj. R-squared: 0.562 Method: Least Squares F-statistic: 1095. Date: Sat, 06 Aug 2022 Prob (F-statistic): 0.00 Time: 16:34:16 Log-Likelihood: -72000. No. Observations: 8523 AIC: 1.440e+05 Df Residuals: 8512 BIC: 1.441e+05 Df Model: 10 Covariance Type: nonrobust ================================================================================================= coef std err t P>|t| [0.025 0.975] ------------------------------------------------------------------------------------------------- const -1284.9327 126.486 -10.159 0.000 -1532.875 -1036.990 Item_Visibility -94.3088 81.405 -1.159 0.247 -253.882 65.264 Item_MRP 3666.3534 46.287 79.209 0.000 3575.619 3757.088 Item_Fat_Content_Regular 52.3215 25.637 2.041 0.041 2.068 102.575 Outlet_Size_Medium 47.8315 86.351 0.554 0.580 -121.437 217.100 Outlet_Size_Small -50.0483 86.353 -0.580 0.562 -219.321 119.224 Outlet_Location_Type_Tier 2 60.5062 42.773 1.415 0.157 -23.340 144.352 Outlet_Location_Type_Tier 3 -18.8403 68.680 -0.274 0.784 -153.470 115.790 Outlet_Type_Supermarket Type1 1887.7630 62.684 30.116 0.000 1764.887 2010.639 Outlet_Type_Supermarket Type2 1532.5026 80.735 18.982 0.000 1374.243 1690.762 Outlet_Type_Supermarket Type3 3259.6997 80.746 40.370 0.000 3101.419 3417.981 ============================================================================== Omnibus: 961.830 Durbin-Watson: 2.003 Prob(Omnibus): 0.000 Jarque-Bera (JB): 2292.698 Skew: 0.668 Prob(JB): 0.00 Kurtosis: 5.161 Cond. No. 24.5 ============================================================================== Notes: [1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
From the above p-values, both categories of the column Outlet_Location_Type have p-values higher than 0.05, i.e., they are statistically insignificant. So, we can remove the Outlet_Location_Type dummy columns.
train_features_scaled_new4 = train_features_scaled_new3.drop(["Outlet_Location_Type_Tier 2", "Outlet_Location_Type_Tier 3"], axis = 1)
vif_series = pd.Series(
[variance_inflation_factor(train_features_scaled_new4.values, i) for i in range(train_features_scaled_new4.shape[1])],
index = train_features_scaled_new4.columns,
dtype = float)
print("VIF Scores: \n\n{}\n".format(vif_series))
VIF Scores: const 27.169808 Item_Visibility 1.092348 Item_MRP 1.000143 Item_Fat_Content_Regular 1.002605 Outlet_Size_Medium 4.034011 Outlet_Size_Small 2.814532 Outlet_Type_Supermarket Type1 2.478493 Outlet_Type_Supermarket Type2 2.842880 Outlet_Type_Supermarket Type3 2.863427 dtype: float64
ols_model_5 = sm.OLS(train_target, train_features_scaled_new4)
ols_res_5 = ols_model_5.fit()
print(ols_res_5.summary())
OLS Regression Results ============================================================================== Dep. Variable: Item_Outlet_Sales R-squared: 0.562 Model: OLS Adj. R-squared: 0.562 Method: Least Squares F-statistic: 1368. Date: Sat, 06 Aug 2022 Prob (F-statistic): 0.00 Time: 16:35:24 Log-Likelihood: -72001. No. Observations: 8523 AIC: 1.440e+05 Df Residuals: 8514 BIC: 1.441e+05 Df Model: 8 Covariance Type: nonrobust ================================================================================================= coef std err t P>|t| [0.025 0.975] ------------------------------------------------------------------------------------------------- const -1358.8933 63.766 -21.311 0.000 -1483.889 -1233.897 Item_Visibility -93.3375 81.378 -1.147 0.251 -252.859 66.184 Item_MRP 3666.0517 46.287 79.203 0.000 3575.318 3756.785 Item_Fat_Content_Regular 52.1432 25.636 2.034 0.042 1.890 102.396 Outlet_Size_Medium 66.6696 52.347 1.274 0.203 -35.943 169.282 Outlet_Size_Small 14.1489 41.376 0.342 0.732 -66.958 95.255 Outlet_Type_Supermarket Type1 1942.9093 40.496 47.978 0.000 1863.527 2022.291 Outlet_Type_Supermarket Type2 1568.8091 66.218 23.692 0.000 1439.005 1698.613 Outlet_Type_Supermarket Type3 3296.0104 66.238 49.760 0.000 3166.167 3425.854 ============================================================================== Omnibus: 961.728 Durbin-Watson: 2.002 Prob(Omnibus): 0.000 Jarque-Bera (JB): 2296.342 Skew: 0.667 Prob(JB): 0.00 Kurtosis: 5.164 Cond. No. 13.3 ============================================================================== Notes: [1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
From the above p-values, both categories of the column Outlet_Size have p-values higher than 0.05. So, we can remove the Outlet_Size dummy columns.
train_features_scaled_new5 = train_features_scaled_new4.drop(["Outlet_Size_Small", "Outlet_Size_Medium"], axis=1)
vif_series = pd.Series(
[variance_inflation_factor(train_features_scaled_new5.values, i) for i in range(train_features_scaled_new5.shape[1])],
index = train_features_scaled_new5.columns,
dtype = float)
print("VIF Scores: \n\n{}\n".format(vif_series))
VIF Scores: const 15.813401 Item_Visibility 1.092314 Item_MRP 1.000114 Item_Fat_Content_Regular 1.002581 Outlet_Type_Supermarket Type1 2.306663 Outlet_Type_Supermarket Type2 1.731428 Outlet_Type_Supermarket Type3 1.744734 dtype: float64
ols_model_6 = sm.OLS(train_target, train_features_scaled_new5)
ols_res_6 = ols_model_6.fit()
print(ols_res_6.summary())
OLS Regression Results ============================================================================== Dep. Variable: Item_Outlet_Sales R-squared: 0.562 Model: OLS Adj. R-squared: 0.562 Method: Least Squares F-statistic: 1824. Date: Sat, 06 Aug 2022 Prob (F-statistic): 0.00 Time: 16:36:39 Log-Likelihood: -72002. No. Observations: 8523 AIC: 1.440e+05 Df Residuals: 8516 BIC: 1.441e+05 Df Model: 6 Covariance Type: nonrobust ================================================================================================= coef std err t P>|t| [0.025 0.975] ------------------------------------------------------------------------------------------------- const -1344.7104 48.647 -27.642 0.000 -1440.070 -1249.351 Item_Visibility -93.1427 81.377 -1.145 0.252 -252.661 66.376 Item_MRP 3665.7108 46.286 79.197 0.000 3574.979 3756.443 Item_Fat_Content_Regular 52.3194 25.636 2.041 0.041 2.067 102.572 Outlet_Type_Supermarket Type1 1949.3298 39.067 49.897 0.000 1872.749 2025.911 Outlet_Type_Supermarket Type2 1621.3566 51.677 31.375 0.000 1520.057 1722.657 Outlet_Type_Supermarket Type3 3348.5571 51.705 64.763 0.000 3247.203 3449.911 ============================================================================== Omnibus: 960.314 Durbin-Watson: 2.002 Prob(Omnibus): 0.000 Jarque-Bera (JB): 2290.252 Skew: 0.667 Prob(JB): 0.00 Kurtosis: 5.161 Cond. No. 10.7 ============================================================================== Notes: [1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
Finally, let's drop Item_Visibility as well, since its p-value (0.252) is also above 0.05.
train_features_scaled_new6 = train_features_scaled_new5.drop("Item_Visibility", axis = 1)
vif_series = pd.Series(
[variance_inflation_factor(train_features_scaled_new6.values, i) for i in range(train_features_scaled_new6.shape[1])],
index = train_features_scaled_new6.columns,
dtype = float)
print("VIF Scores: \n\n{}\n".format(vif_series))
VIF Scores: const 11.452496 Item_MRP 1.000114 Item_Fat_Content_Regular 1.000048 Outlet_Type_Supermarket Type1 2.125686 Outlet_Type_Supermarket Type2 1.654765 Outlet_Type_Supermarket Type3 1.658941 dtype: float64
ols_model_7 = sm.OLS(train_target, train_features_scaled_new6)
ols_res_7 = ols_model_7.fit()
print(ols_res_7.summary())
OLS Regression Results ============================================================================== Dep. Variable: Item_Outlet_Sales R-squared: 0.562 Model: OLS Adj. R-squared: 0.562 Method: Least Squares F-statistic: 2188. Date: Sat, 06 Aug 2022 Prob (F-statistic): 0.00 Time: 16:37:04 Log-Likelihood: -72003. No. Observations: 8523 AIC: 1.440e+05 Df Residuals: 8517 BIC: 1.441e+05 Df Model: 5 Covariance Type: nonrobust ================================================================================================= coef std err t P>|t| [0.025 0.975] ------------------------------------------------------------------------------------------------- const -1373.9504 41.400 -33.187 0.000 -1455.104 -1292.796 Item_MRP 3665.7374 46.287 79.196 0.000 3575.004 3756.471 Item_Fat_Content_Regular 50.8447 25.604 1.986 0.047 0.655 101.035 Outlet_Type_Supermarket Type1 1961.8549 37.504 52.311 0.000 1888.338 2035.371 Outlet_Type_Supermarket Type2 1633.8029 50.521 32.339 0.000 1534.769 1732.837 Outlet_Type_Supermarket Type3 3361.6803 50.418 66.676 0.000 3262.848 3460.513 ============================================================================== Omnibus: 962.316 Durbin-Watson: 2.002 Prob(Omnibus): 0.000 Jarque-Bera (JB): 2299.500 Skew: 0.668 Prob(JB): 0.00 Kurtosis: 5.166 Cond. No. 8.43 ============================================================================== Notes: [1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
Observations:
Let's check the assumptions of the linear regression model.
In this step, we check whether the key assumptions hold for the model: the mean of the residuals is close to zero, the residuals are normally distributed, the predictors have a linear relationship with the target, and the residuals are homoscedastic. If any of these is violated, we will rebuild the model after fixing the issue.
# Residuals
residual = ols_res_7.resid
residual.mean()
1.404618687386992e-12
What is the test?
Error terms/Residuals should be normally distributed.
If the error terms are not normally distributed, the confidence intervals of the estimated coefficients may become too wide or too narrow. Once the confidence intervals become unstable, it is difficult to draw reliable conclusions from the least-squares estimates.
What does non-normality indicate?
How to check the normality?
We can plot the histogram of the residuals and check the distribution visually.
It can also be checked via a QQ plot: residuals that follow a normal distribution will fall approximately along a straight line, otherwise they will not.
Another test to check for normality is the Shapiro-Wilk test. (A quick sketch of both of these checks is shown after the histogram below.)
What if the residuals are not normal?
# Plot histogram of residuals
sns.histplot(residual, kde = True)
We can see that the error terms are approximately normally distributed. The assumption of normality is satisfied.
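The QQ plot and Shapiro-Wilk test mentioned above can be run as shown in the sketch below; this is supplementary to the histogram and assumes scipy is available.
# Supplementary normality checks (assumes scipy is installed)
from scipy import stats
# QQ plot: points lying close to the 45-degree line indicate approximately normal residuals
sm.qqplot(residual, line = '45', fit = True)
plt.show()
# Shapiro-Wilk test: a p-value below 0.05 would suggest non-normal residuals
# (for very large samples the reported p-value is approximate)
shapiro_stat, shapiro_p = stats.shapiro(residual)
print(shapiro_stat, shapiro_p)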
The linearity assumption states that the predictor variables must have a linear relationship with the dependent variable.
To test this assumption, we plot the residuals against the fitted values and check that the residuals do not form a strong pattern; they should be randomly and uniformly scattered around zero.
# Predicted values
fitted = ols_res_7.fittedvalues
sns.residplot(x = fitted, y = residual, color = "lightblue")
plt.xlabel("Fitted Values")
plt.ylabel("Residual")
plt.title("Residual PLOT")
plt.show()
Observations:
# Log transformation on the target variable to reduce skewness and improve the residual pattern
train_target_log = np.log(train_target)
# Fitting new model with the transformed target variable
ols_model_7 = sm.OLS(train_target_log, train_features_scaled_new6)
ols_res_7 = ols_model_7.fit()
# Predicted values
fitted = ols_res_7.fittedvalues
residual1 = ols_res_7.resid
sns.residplot(x = fitted, y = residual1, color = "lightblue")
plt.xlabel("Fitted Values")
plt.ylabel("Residual")
plt.title("Residual PLOT")
plt.show()
Observations:
print(ols_res_7.summary())
OLS Regression Results ============================================================================== Dep. Variable: Item_Outlet_Sales R-squared: 0.720 Model: OLS Adj. R-squared: 0.720 Method: Least Squares F-statistic: 4375. Date: Sat, 06 Aug 2022 Prob (F-statistic): 0.00 Time: 16:41:11 Log-Likelihood: -6816.7 No. Observations: 8523 AIC: 1.365e+04 Df Residuals: 8517 BIC: 1.369e+04 Df Model: 5 Covariance Type: nonrobust ================================================================================================= coef std err t P>|t| [0.025 0.975] ------------------------------------------------------------------------------------------------- const 4.6356 0.020 234.794 0.000 4.597 4.674 Item_MRP 1.9555 0.022 88.588 0.000 1.912 1.999 Item_Fat_Content_Regular 0.0158 0.012 1.293 0.196 -0.008 0.040 Outlet_Type_Supermarket Type1 1.9550 0.018 109.305 0.000 1.920 1.990 Outlet_Type_Supermarket Type2 1.7737 0.024 73.618 0.000 1.726 1.821 Outlet_Type_Supermarket Type3 2.4837 0.024 103.297 0.000 2.437 2.531 ============================================================================== Omnibus: 829.137 Durbin-Watson: 2.007 Prob(Omnibus): 0.000 Jarque-Bera (JB): 1164.564 Skew: -0.775 Prob(JB): 1.31e-253 Kurtosis: 3.937 Cond. No. 8.43 ============================================================================== Notes: [1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
Let's check the final assumption of homoscedasticity.
Homoscedasticity - If the variance of the residuals is symmetrically distributed across the regression line, the data is said to be homoscedastic.
Heteroscedasticity - If the variance of the residuals is unequal across the regression line, the data is said to be heteroscedastic. In this case, the residuals typically form a funnel shape or some other non-symmetrical pattern.
We will use the Goldfeld–Quandt test to check for homoscedasticity.
Null hypothesis : Residuals are homoscedastic
Alternate hypothesis : Residuals are heteroscedastic
import statsmodels.stats.api as sms
from statsmodels.compat import lzip
name = ["F statistic", "p-value"]
test = sms.het_goldfeldquandt(train_target_log, train_features_scaled_new6)
lzip(name, test)
[('F statistic', 0.9395156175145154), ('p-value', 0.9790604597916552)]
Since the p-value (0.979) is much greater than 0.05, we fail to reject the null hypothesis, i.e., the residuals are homoscedastic. With this, we have verified all the assumptions of the linear regression model. The final equation of the model is as follows:
$\log(\text{Item\_Outlet\_Sales}) = 4.6356 + 1.9555 \times \text{Item\_MRP} + 0.0158 \times \text{Item\_Fat\_Content\_Regular} + 1.9550 \times \text{Outlet\_Type\_Supermarket Type1} + 1.7737 \times \text{Outlet\_Type\_Supermarket Type2} + 2.4837 \times \text{Outlet\_Type\_Supermarket Type3}$
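As an illustration of how this equation is used, the sketch below plugs in hypothetical values (a scaled Item_MRP of 0.5 for a low-fat item sold in a Supermarket Type1 outlet) and converts the predicted log sales back to the original scale; the chosen input values are for illustration only.
# Hypothetical example: scaled Item_MRP = 0.5, Low Fat item, Supermarket Type1 outlet
log_sales = (4.6356
             + 1.9555 * 0.5   # Item_MRP (on the scaled 0-1 range)
             + 0.0158 * 0     # Item_Fat_Content_Regular (0 means Low Fat)
             + 1.9550 * 1     # Outlet_Type_Supermarket Type1
             + 1.7737 * 0     # Outlet_Type_Supermarket Type2
             + 2.4837 * 0)    # Outlet_Type_Supermarket Type3
predicted_sales = np.exp(log_sales)  # back to the original scale, roughly 1936
print(log_sales, predicted_sales)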
Now, let's make the final test predictions.
without_const = train_features_scaled.iloc[:, 1:]
test_features = pd.get_dummies(test_df, drop_first = True)
test_features = test_features[list(without_const)]
# Applying transform on the test data
test_features_scaled = scaler.transform(test_features)
test_features_scaled = pd.DataFrame(test_features_scaled, columns = without_const.columns)
test_features_scaled = sm.add_constant(test_features_scaled)
test_features_scaled = test_features_scaled.drop(["Item_Weight", "Item_Visibility", "Item_Type_Breads", "Item_Type_Breakfast", "Item_Type_Canned", "Item_Type_Dairy","Item_Type_Frozen Foods","Item_Type_Fruits and Vegetables", "Item_Type_Hard Drinks", "Item_Type_Health and Hygiene", "Item_Type_Household", "Item_Type_Meat", "Item_Type_Others", "Item_Type_Seafood", "Item_Type_Snack Foods", "Item_Type_Soft Drinks", "Item_Type_Starchy Foods", "Outlet_Size_Medium", "Outlet_Size_Small", "Outlet_Location_Type_Tier 2", "Outlet_Location_Type_Tier 3", 'Outlet_Age'], axis = 1)
test_features_scaled.head()
const | Item_MRP | Item_Fat_Content_Regular | Outlet_Type_Supermarket Type1 | Outlet_Type_Supermarket Type2 | Outlet_Type_Supermarket Type3 | |
---|---|---|---|---|---|---|
0 | 1.0 | 0.325012 | 0.0 | 1.0 | 0.0 | 0.0 |
1 | 1.0 | 0.237819 | 1.0 | 1.0 | 0.0 | 0.0 |
2 | 1.0 | 0.893316 | 0.0 | 0.0 | 0.0 | 0.0 |
3 | 1.0 | 0.525233 | 0.0 | 1.0 | 0.0 | 0.0 |
4 | 1.0 | 0.861381 | 1.0 | 0.0 | 0.0 | 1.0 |
The R-squared metric indicates how good our model is relative to a baseline model that has no independent variables. Here, the model explains ~72% of the variance in the (log-transformed) target compared to that baseline.
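For reference, R-squared compares the model's residual sum of squares with the total sum of squares of the target:
$R^2 = 1 - \dfrac{\sum_{i}(y_i - \hat{y}_i)^2}{\sum_{i}(y_i - \bar{y})^2}$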
print(ols_res_7.rsquared)
0.71975057509795
Mean Squared Error (MSE) measures the average of the squares of the errors, i.e., the average squared difference between the predicted values and the actual values.
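In symbols, with $n$ observations, actual values $y_i$, and predictions $\hat{y}_i$:
$MSE = \dfrac{1}{n}\sum_{i=1}^{n}(y_i - \hat{y}_i)^2$
Note that statsmodels' mse_resid divides by the residual degrees of freedom rather than $n$, which gives an almost identical value for a dataset of this size.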
print(ols_res_7.mse_resid)
0.2900908064681798
Root Mean Squared Error (RMSE) is simply the square root of the MSE. Taking the square root brings the metric back to the same units as the target variable.
print(np.sqrt(ols_res_7.mse_resid))
0.5386007858035298
Below, we check the cross-validation scores to identify whether the model we have built is underfitted, overfitted, or a good fit.
# Fitting linear model
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_val_score
linearregression = LinearRegression()
cv_Score11 = cross_val_score(linearregression, train_features_scaled_new6, train_target_log, cv = 10)
cv_Score12 = cross_val_score(linearregression, train_features_scaled_new6, train_target_log, cv = 10,
scoring = 'neg_mean_squared_error')
print("RSquared: %0.3f (+/- %0.3f)" % (cv_Score11.mean(), cv_Score11.std()*2))
print("Mean Squared Error: %0.3f (+/- %0.3f)" % (-1*cv_Score12.mean(), cv_Score12.std()*2))
RSquared: 0.718 (+/- 0.049) Mean Squared Error: 0.290 (+/- 0.030)
Observations:
It seems that our model is a good fit: the cross-validation scores are very close to the training performance, so it generalizes well.
Since the model we have developed is a linear model, it cannot capture non-linear patterns in the data. We may want to build more advanced regression models that capture such non-linearities and improve on this model further, but that is beyond the scope of this case study.
Once our model is built and validated, we can now use this to predict the sales in our test data as shown below:
# These test predictions will be on a log scale
test_predictions = ols_res_7.predict(test_features_scaled)
# We are converting the log scale predictions to its original scale
test_predictions_inverse_transformed = np.exp(test_predictions)
test_predictions_inverse_transformed
0 1374.895044 1 1177.811420 2 591.395560 3 2033.800973 4 6765.005719 ... 5676 1843.793296 5677 1937.766915 5678 1504.855760 5679 3388.112463 5680 1106.508932 Length: 5681, dtype: float64
Point to remember: The output of this model is on a log scale. So, after making predictions, we need to transform the values from the log scale back to the original scale by applying the inverse of the log transformation, i.e., exponentiation.
fig, ax = plt.subplots(1, 2, figsize = (24, 12))
sns.histplot(test_predictions, ax = ax[0]);
sns.histplot(test_predictions_inverse_transformed, ax = ax[1]);
Lastly below is the model equation:
$\log(\text{Item\_Outlet\_Sales}) = 4.6356 + 1.9555 \times \text{Item\_MRP} + 0.0158 \times \text{Item\_Fat\_Content\_Regular} + 1.9550 \times \text{Outlet\_Type\_Supermarket Type1} + 1.7737 \times \text{Outlet\_Type\_Supermarket Type2} + 2.4837 \times \text{Outlet\_Type\_Supermarket Type3}$
From the above equation, we can interpret that a one-unit increase in the scaled Item_MRP (i.e., moving from the lowest to the highest MRP in the data) increases the log of Item_Outlet_Sales by 1.9555 units, everything else held constant. So, if we want to increase sales, we may want to place higher-MRP items in high-visibility areas.
On average, keeping other factors constant, the coefficient for Supermarket Type 3 (2.4837) is about 1.4 times that of Supermarket Type 2 (1.7737) and about 1.27 times that of Supermarket Type 1 (1.9550). In other words, relative to a grocery store, Type 3 outlets see the largest lift in log sales.
After interpreting this linear regression equation, it is clear that Supermarket Type 3 outlets generate higher sales than the other outlet types. So, we want to maintain or improve sales in these stores, and for the remaining ones we may want to devise strategies to improve sales, for example, providing better customer service, better training for store staff, and more visibility for high-MRP items.