Uber Technologies, Inc. is an American multinational transportation network company based in San Francisco, with operations in approximately 72 countries and 10,500 cities. In the fourth quarter of 2021, Uber had 118 million monthly active users worldwide and generated an average of 19 million trips per day.
Ridesharing is a highly volatile market: demand fluctuates wildly with time, place, weather, local events, and more. The key to success in this business is detecting patterns in these fluctuations and catering to the demand at any given time.
As a newly hired Data Scientist in Uber's New York office, you have been given the task of extracting insights from data that will help the business better understand the demand profile and take appropriate actions to drive better outcomes. Your goal is to identify insights that are potentially actionable, i.e., that the business can act on.
The objective is to extract actionable insights about demand patterns across various factors.
The data contains hourly pickup counts along with weather and location information. The column names suggest the following fields: pickup_dt (timestamp of the hour), borough (NYC borough), pickups (number of pickups in that hour), spd (wind speed), vsb (visibility), temp (temperature, °F), dewp (dew point, °F), slp (sea-level pressure), pcp01/pcp06/pcp24 (precipitation over the last 1/6/24 hours), sd (snow depth), and hday (holiday flag, Y/N).
# Library to suppress warnings
import warnings
warnings.filterwarnings('ignore')
# Libraries to help with reading and manipulating data
import pandas as pd
import numpy as np
# Libraries to help with data visualization
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
# Library to extract datetime features
import datetime as dt
data = pd.read_csv('Uber.csv')
# Copying data to another variable to avoid any changes to the original data
df = data.copy()
# Looking at head (the first 5 observations)
df.head()
|   | pickup_dt | borough | pickups | spd | vsb | temp | dewp | slp | pcp01 | pcp06 | pcp24 | sd | hday |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2015-01-01 01:00:00 | Bronx | 152 | 5.0 | 10.0 | 30.0 | 7.0 | 1023.5 | 0.0 | 0.0 | 0.0 | 0.0 | Y |
| 1 | 2015-01-01 01:00:00 | Brooklyn | 1519 | 5.0 | 10.0 | 30.0 | 7.0 | 1023.5 | 0.0 | 0.0 | 0.0 | 0.0 | Y |
| 2 | 2015-01-01 01:00:00 | EWR | 0 | 5.0 | 10.0 | 30.0 | 7.0 | 1023.5 | 0.0 | 0.0 | 0.0 | 0.0 | Y |
| 3 | 2015-01-01 01:00:00 | Manhattan | 5258 | 5.0 | 10.0 | 30.0 | 7.0 | 1023.5 | 0.0 | 0.0 | 0.0 | 0.0 | Y |
| 4 | 2015-01-01 01:00:00 | Queens | 405 | 5.0 | 10.0 | 30.0 | 7.0 | 1023.5 | 0.0 | 0.0 | 0.0 | 0.0 | Y |
Observations:
- The data starts on 2015-01-01, with one row per borough per hour, so several rows share the same pickup timestamp.
- The weather readings (spd, vsb, temp, etc.) are identical across boroughs for a given hour, suggesting city-level measurements.
- January 1 is flagged as a holiday (hday = Y).
# Looking at tail (the last 5 observations)
df.tail()
|   | pickup_dt | borough | pickups | spd | vsb | temp | dewp | slp | pcp01 | pcp06 | pcp24 | sd | hday |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 29096 | 2015-06-30 23:00:00 | EWR | 0 | 7.0 | 10.0 | 75.0 | 65.0 | 1011.8 | 0.0 | 0.0 | 0.0 | 0.0 | N |
| 29097 | 2015-06-30 23:00:00 | Manhattan | 3828 | 7.0 | 10.0 | 75.0 | 65.0 | 1011.8 | 0.0 | 0.0 | 0.0 | 0.0 | N |
| 29098 | 2015-06-30 23:00:00 | Queens | 580 | 7.0 | 10.0 | 75.0 | 65.0 | 1011.8 | 0.0 | 0.0 | 0.0 | 0.0 | N |
| 29099 | 2015-06-30 23:00:00 | Staten Island | 0 | 7.0 | 10.0 | 75.0 | 65.0 | 1011.8 | 0.0 | 0.0 | 0.0 | 0.0 | N |
| 29100 | 2015-06-30 23:00:00 | NaN | 3 | 7.0 | 10.0 | 75.0 | 65.0 | 1011.8 | 0.0 | 0.0 | 0.0 | 0.0 | N |
Observations:
- The data ends on 2015-06-30, so the dataset covers the first six months of 2015.
- The last row has a missing (NaN) borough, indicating missing values in that column.
- EWR shows 0 pickups in this hour, hinting at very low demand there.
df.shape
(29101, 13)
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 29101 entries, 0 to 29100
Data columns (total 13 columns):
 #   Column     Non-Null Count  Dtype
---  ------     --------------  -----
 0   pickup_dt  29101 non-null  object
 1   borough    26058 non-null  object
 2   pickups    29101 non-null  int64
 3   spd        29101 non-null  float64
 4   vsb        29101 non-null  float64
 5   temp       29101 non-null  float64
 6   dewp       29101 non-null  float64
 7   slp        29101 non-null  float64
 8   pcp01      29101 non-null  float64
 9   pcp06      29101 non-null  float64
 10  pcp24      29101 non-null  float64
 11  sd         29101 non-null  float64
 12  hday       29101 non-null  object
dtypes: float64(9), int64(1), object(3)
memory usage: 2.9+ MB
Observations:
- The dataset has 29,101 rows and 13 columns.
- borough has only 26,058 non-null values, i.e., 3,043 missing entries.
- pickup_dt is stored as object and should be converted to datetime so we can extract date parts from it.
df.describe().T
|   | count | mean | std | min | 25% | 50% | 75% | max |
|---|---|---|---|---|---|---|---|---|
| pickups | 29101.0 | 490.215903 | 995.649536 | 0.0 | 1.0 | 54.0 | 449.000000 | 7883.00 |
| spd | 29101.0 | 5.984924 | 3.699007 | 0.0 | 3.0 | 6.0 | 8.000000 | 21.00 |
| vsb | 29101.0 | 8.818125 | 2.442897 | 0.0 | 9.1 | 10.0 | 10.000000 | 10.00 |
| temp | 29101.0 | 47.669042 | 19.814969 | 2.0 | 32.0 | 46.0 | 64.500000 | 89.00 |
| dewp | 29101.0 | 30.823065 | 21.283444 | -16.0 | 14.0 | 30.0 | 50.000000 | 73.00 |
| slp | 29101.0 | 1017.817938 | 7.768796 | 991.4 | 1012.5 | 1018.2 | 1022.900000 | 1043.40 |
| pcp01 | 29101.0 | 0.003830 | 0.018933 | 0.0 | 0.0 | 0.0 | 0.000000 | 0.28 |
| pcp06 | 29101.0 | 0.026129 | 0.093125 | 0.0 | 0.0 | 0.0 | 0.000000 | 1.24 |
| pcp24 | 29101.0 | 0.090464 | 0.219402 | 0.0 | 0.0 | 0.0 | 0.050000 | 2.10 |
| sd | 29101.0 | 2.529169 | 4.520325 | 0.0 | 0.0 | 0.0 | 2.958333 | 19.00 |
By default, the describe() function shows the summary of numeric variables only. Let's check the summary of non-numeric variables.
df.describe(exclude = 'number').T
|   | count | unique | top | freq |
|---|---|---|---|---|
| pickup_dt | 29101 | 4343 | 2015-01-01 01:00:00 | 7 |
| borough | 26058 | 6 | Bronx | 4343 |
| hday | 29101 | 2 | N | 27980 |
Observations:
- There are 4,343 unique timestamps, and the most frequent one appears 7 times: one row for each of the 6 named boroughs plus the missing category.
- borough has 6 unique values but only 26,058 non-null entries.
- Only 1,121 of the 29,101 rows fall on holidays (hday = Y).
Let's check the count of each unique category in each of the categorical variables.
# Making a list of all categorical variables
cat_col = ['borough', 'hday']
# Printing the count of each unique value in each column
for column in cat_col:
    print(df[column].value_counts())
    print('-' * 50)
Bronx            4343
Brooklyn         4343
EWR              4343
Manhattan        4343
Queens           4343
Staten Island    4343
Name: borough, dtype: int64
--------------------------------------------------
N    27980
Y     1121
Name: hday, dtype: int64
--------------------------------------------------
# Converting pickup_dt datatype to datetime
df.pickup_dt = pd.to_datetime(df.pickup_dt)
# Extracting date parts from pickup_dt
df['start_year'] = df.pickup_dt.dt.year
df['start_month'] = df.pickup_dt.dt.month_name()
df['start_hour'] = df.pickup_dt.dt.hour
df['start_day'] = df.pickup_dt.dt.day
df['week_day'] = df.pickup_dt.dt.day_name()
# Removing pickup_dt column as it will not be required for further analysis
df.drop('pickup_dt', axis = 1, inplace = True)
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 29101 entries, 0 to 29100
Data columns (total 17 columns):
 #   Column       Non-Null Count  Dtype
---  ------       --------------  -----
 0   borough      26058 non-null  object
 1   pickups      29101 non-null  int64
 2   spd          29101 non-null  float64
 3   vsb          29101 non-null  float64
 4   temp         29101 non-null  float64
 5   dewp         29101 non-null  float64
 6   slp          29101 non-null  float64
 7   pcp01        29101 non-null  float64
 8   pcp06        29101 non-null  float64
 9   pcp24        29101 non-null  float64
 10  sd           29101 non-null  float64
 11  hday         29101 non-null  object
 12  start_year   29101 non-null  int64
 13  start_month  29101 non-null  object
 14  start_hour   29101 non-null  int64
 15  start_day    29101 non-null  int64
 16  week_day     29101 non-null  object
dtypes: float64(9), int64(4), object(4)
memory usage: 3.8+ MB
# Checking missing values
df.isna().sum()
borough        3043
pickups           0
spd               0
vsb               0
temp              0
dewp              0
slp               0
pcp01             0
pcp06             0
pcp24             0
sd                0
hday              0
start_year        0
start_month       0
start_hour        0
start_day         0
week_day          0
dtype: int64
# Checking the missing values further
df.borough.value_counts(normalize = True, dropna = False)
Bronx            0.149239
Brooklyn         0.149239
EWR              0.149239
Manhattan        0.149239
Queens           0.149239
Staten Island    0.149239
NaN              0.104567
Name: borough, dtype: float64
# Replacing NaN with Unknown
df['borough'].fillna('Unknown', inplace = True)
df.borough.value_counts()
Bronx            4343
Brooklyn         4343
EWR              4343
Manhattan        4343
Queens           4343
Staten Island    4343
Unknown          3043
Name: borough, dtype: int64
df.isnull().sum()
borough        0
pickups        0
spd            0
vsb            0
temp           0
dewp           0
slp            0
pcp01          0
pcp06          0
pcp24          0
sd             0
hday           0
start_year     0
start_month    0
start_hour     0
start_day      0
week_day       0
dtype: int64
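Before moving on, it's worth a quick sanity check on the rows we just relabelled. A minimal sketch (using only columns created above) to see whether the Unknown rows look like a genuinely low-volume group:
# Sanity check (illustrative): profile the relabelled 'Unknown' rows
unknown = df[df['borough'] == 'Unknown']
print(unknown['pickups'].describe())          # Are these genuinely low-pickup records?
print(unknown['start_month'].value_counts())  # Are they spread across the whole period?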
Let us explore the numerical variables first.
# While doing a univariate analysis of numerical variables, we want to study their central tendency and dispersion
# Let us write a function that will help us create a boxplot and histogram for any numerical variable
# This function takes the numerical variable as the input and returns the boxplots and histograms for that variable
# This would help us write faster and cleaner code
def histogram_boxplot(feature, figsize = (15, 10), bins = None):
    """Boxplot and histogram combined
    feature: 1-d feature array
    figsize: size of fig (default (15, 10))
    bins: number of bins (default None / auto)
    """
    f2, (ax_box2, ax_hist2) = plt.subplots(nrows = 2,  # Number of rows of the subplot grid
                                           sharex = True,  # The X-axis will be shared among the subplots
                                           gridspec_kw = {"height_ratios": (.25, .75)},
                                           figsize = figsize
                                           )
    # Creating the subplots
    # Boxplot will be created and the mean value of the column will be indicated with a marker
    sns.boxplot(feature, ax = ax_box2, showmeans = True, color = 'red')
    # For the histogram
    if bins:
        sns.distplot(feature, kde = False, ax = ax_hist2, bins = bins)
    else:
        sns.distplot(feature, kde = False, ax = ax_hist2)
    ax_hist2.axvline(np.mean(feature), color = 'g', linestyle = '--')  # Add mean to the histogram
    ax_hist2.axvline(np.median(feature), color = 'black', linestyle = '-')  # Add median to the histogram
histogram_boxplot(df.pickups)
Observations:
- Pickups are heavily right-skewed: the mean (≈ 490) is far above the median (54), and the maximum is 7,883.
- A large share of hours have very few pickups (the 25th percentile is 1), with many high outliers.
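Given the heavy right skew, a log1p transform can make the distribution easier to inspect. A quick sketch reusing the helper above:
# Re-plot pickups on a log1p scale to spread out the long right tail
histogram_boxplot(np.log1p(df.pickups))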
histogram_boxplot(df.vsb)
Observations:
- Visibility is left-skewed: the median and 75th percentile both sit at the maximum of 10, so most hours have clear visibility.
histogram_boxplot(df.temp)
Observations:
- Temperature ranges from 2°F to 89°F with a mean of about 48°F, consistent with data spanning winter through early summer.
histogram_boxplot(df.dewp)
Observations:
- Dew point ranges from -16°F to 73°F with a median of 30°F and, like temperature, is fairly spread out.
histogram_boxplot(df.slp)
Observations:
- Sea-level pressure is tightly concentrated around its mean of about 1018, with a small standard deviation (≈ 7.8).
histogram_boxplot(df.sd)
Observations:
- Snow depth is zero for most hours (the median is 0) and heavily right-skewed, with a maximum of 19 inches.
Now, let's explore the categorical variables.
# Function to create barplots that indicate the percentage for each category
def bar_perc(data, z):
    total = len(data[z])  # Length of the column
    plt.figure(figsize = (15, 5))
    # plt.xticks(rotation = 45)
    ax = sns.countplot(data[z], palette = 'Paired')
    for p in ax.patches:
        percentage = '{:.1f}%'.format(100 * p.get_height() / total)  # Percentage of each class
        x = p.get_x() + p.get_width() / 2 - 0.05  # x-coordinate for the annotation
        y = p.get_y() + p.get_height()  # y-coordinate for the annotation (top of the bar)
        ax.annotate(percentage, (x, y), size = 12)  # Annotate the percentage
    plt.show()  # Display the plot
bar_perc(df, 'hday')
Observation:
- Only about 3.9% of the records fall on holidays; the remaining 96.1% are non-holiday hours.
bar_perc(df, 'borough')
Observation:
- Each of the six boroughs accounts for about 14.9% of the records, with the remaining 10.5% labelled Unknown.
Let's plot multivariate charts between variables to understand their interaction with each other.
# Check for correlation among numerical variables
num_var = ['pickups', 'spd', 'vsb', 'temp', 'dewp', 'slp', 'pcp01', 'pcp06', 'pcp24', 'sd']
corr = df[num_var].corr()
# Plot the heatmap
plt.figure(figsize = (14, 10))
sns.heatmap(corr, annot = True, cmap = 'coolwarm',
fmt = ".1f",
xticklabels = corr.columns,
yticklabels = corr.columns)
<AxesSubplot:>
Observations:
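Rather than reading pairs off the heatmap, the strongest correlations can also be listed programmatically. A small sketch built on the corr matrix computed above:
# Keep each pair once (upper triangle), then rank by absolute correlation
pairs = corr.where(np.triu(np.ones(corr.shape, dtype = bool), k = 1)).stack()
print(pairs.reindex(pairs.abs().sort_values(ascending = False).index).head(5))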
sns.pairplot(df[num_var], corner = True)
plt.show()
Observations:
# The data is sorted chronologically, so unique() returns the months in order of appearance (January-June)
cats = df.start_month.unique().tolist()
df.start_month = pd.Categorical(df.start_month, ordered = True, categories = cats)
plt.figure(figsize = (20, 7))
sns.lineplot(x = "start_month", y = "pickups", data = df, ci = 0, color = "RED", estimator = 'sum')
plt.ylabel('Total pickups')
plt.xlabel('Month')
plt.show()
Observations:
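To read exact figures behind the line plot, the same aggregation can be computed directly. A minimal sketch:
# Exact monthly totals underlying the line plot
print(df.groupby('start_month')['pickups'].sum())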
plt.figure(figsize = (20, 7))
sns.lineplot(x = "start_day", y = "pickups", estimator = 'sum', ci = 0, data = df, color = "RED")
plt.ylabel('Total pickups')
plt.xlabel('Day of Month')
plt.show()
Observations:
# Let's drop February and visualize again to check whether the pattern holds
df_not_feb = df[df['start_month'] != 'February']
plt.figure(figsize = (20, 7))
sns.lineplot(x = "start_day", y = "pickups", estimator = 'sum', ci = 0, data = df_not_feb, color = "RED")
plt.ylabel('Total pickups')
plt.xlabel('Day of Month')
plt.show()
Observations:
cats = ['Monday', 'Tuesday', 'Wednesday','Thursday', 'Friday', 'Saturday', 'Sunday']
df.week_day = pd.Categorical(df.week_day, ordered = True, categories = cats)
plt.figure(figsize = (20, 7))
sns.lineplot(x = "week_day", y = "pickups", ci = 0, data = df, color = "RED")
plt.ylabel('Mean pickups')
plt.xlabel('Day of Week')
plt.show()
Observations:
plt.figure(figsize = (20, 10))
sns.boxplot(df['borough'], df['pickups'])
plt.ylabel('pickups')
plt.xlabel('Borough')
plt.show()
Observations:
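Because hourly pickups span 0 to roughly 7,900, Manhattan compresses the other boroughs' boxes toward zero. A log1p-scaled version of the same plot (a quick sketch) makes the low-volume boroughs readable:
# Same boxplot on a log1p scale so low-volume boroughs are visible
plt.figure(figsize = (20, 10))
sns.boxplot(x = df['borough'], y = np.log1p(df['pickups']))
plt.ylabel('log1p(pickups)')
plt.xlabel('Borough')
plt.show()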
df.groupby('hday')['pickups'].mean()
hday
N    492.339957
Y    437.199822
Name: pickups, dtype: float64
# Check if the trend is similar across boroughs
df.groupby(by = ['borough','hday'])['pickups'].mean()
borough hday
Bronx N 50.771073
Y 48.065868
Brooklyn N 534.727969
Y 527.011976
EWR N 0.023467
Y 0.041916
Manhattan N 2401.302921
Y 2035.928144
Queens N 308.899904
Y 320.730539
Staten Island N 1.606082
Y 1.497006
Unknown N 2.057456
Y 2.050420
Name: pickups, dtype: float64
Observations:
- On average, pickups are lower on holidays (≈ 437) than on non-holidays (≈ 492).
- The drop is driven mainly by Manhattan (≈ 2,401 vs. ≈ 2,036); Queens and EWR actually average slightly more pickups on holidays.
- The remaining boroughs show little difference between holidays and non-holidays.
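The same comparison is easier to read as a percentage change per borough. A short sketch built on the groupby above:
# Mean pickups on holidays vs. non-holidays, as a percentage change
hol = df.groupby(['borough', 'hday'])['pickups'].mean().unstack()
hol['pct_change'] = 100 * (hol['Y'] - hol['N']) / hol['N']
print(hol.round(1))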
plt.figure(figsize = (20, 7))
sns.lineplot(x = "start_hour", y = "pickups", ci = 0, data = df, hue = 'borough')
plt.ylabel('Pickups')
plt.xlabel('Hour of the day')
plt.show()
Observations:
plt.figure(figsize = (20, 7))
sns.lineplot(x = df.start_hour, y = np.log1p(df.pickups), estimator ='sum', ci = 0, hue = df.borough)
plt.ylabel('Total log1p(pickups)')
plt.xlabel('Hour of the day')
plt.legend(bbox_to_anchor = (1, 1))
plt.show()
Observations:
df_man = df[df.borough == 'Manhattan']
df_hm = df_man.pivot_table(index = 'start_hour', columns = 'week_day', values = 'pickups')  # pivot_table aggregates with the mean by default
# Draw a heatmap
plt.figure(figsize = (20, 10)) # To resize the plot
sns.heatmap(df_hm, fmt = "d", cmap = 'coolwarm', linewidths = .5, vmin = 0)
plt.show()
Observations:
df_br = df[df.borough == 'Brooklyn']
df_hm = df_br.pivot_table(index = 'start_hour', columns = 'week_day', values = 'pickups')
# Draw a heatmap
plt.figure(figsize = (20, 10)) # To resize the plot
sns.heatmap(df_hm, fmt = "d", cmap = 'coolwarm', linewidths = .5, vmin = 0)
plt.show()
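Since the same heatmap is drawn once per borough, a small helper avoids the duplication. A sketch reusing the column names above (the function name is ours, not from the original analysis):
# Helper (illustrative): heatmap of mean hourly pickups by weekday for any borough
def borough_heatmap(frame, borough):
    pivot = frame[frame.borough == borough].pivot_table(index = 'start_hour',
                                                        columns = 'week_day',
                                                        values = 'pickups')  # mean pickups per cell
    plt.figure(figsize = (20, 10))
    sns.heatmap(pivot, cmap = 'coolwarm', linewidths = .5, vmin = 0)
    plt.title(borough)
    plt.show()

# For example: borough_heatmap(df, 'Queens')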
We analyzed a dataset of nearly 30,000 hourly records of Uber pickups across the New York City boroughs, spanning every day of the first six months of 2015. The main feature of interest is the number of pickups. From both an environmental and a business perspective, it is inefficient to have cars roaming one area while demand sits in another, or to flood the streets during low-demand periods while falling short during peak hours. We therefore examined which factors affect pickups and the nature of their effect.
We have been able to conclude that:
- Demand is highly concentrated: Manhattan averages roughly 2,400 pickups per hour, far ahead of Brooklyn (≈ 535) and Queens (≈ 310), while EWR and Staten Island see almost none.
- Pickups are heavily right-skewed, with many low-demand hours and a long tail of very busy ones, so supply planning should focus on peak periods.
- Holidays lower average demand overall, driven almost entirely by Manhattan; Queens and EWR are essentially unaffected.
- Demand follows hourly and weekly cycles (see the hourly line plots and heatmaps above), which can guide where and when to position drivers.