Movie Recommendation System Part 1¶


Context¶


Online streaming platforms like Netflix have vast movie catalogs. If we can build a recommendation system that suggests relevant movies to users based on their historical interactions, we can improve customer satisfaction and, in turn, the platform's revenue. The techniques we will learn here are not limited to movies; they can be applied to any type of item for which you want to build a recommendation system.


Objective¶


In this case study, we will build various recommendation systems:

  • Knowledge/Rank based recommendation system
  • Similarity-Based Collaborative filtering
  • Matrix Factorization Based Collaborative Filtering

To demonstrate the above techniques, we are going to use the ratings dataset.


Dataset¶


The ratings dataset contains the following attributes:

  • userId
  • movieId
  • rating
  • timestamp

Sometimes, the installation of the surprise library, which is used to build recommendation systems, runs into issues in local Jupyter environments. To avoid such issues, it is advisable to use Google Colab for this case study.

Let's start by mounting the Google drive on Colab.

In [60]:
from google.colab import drive
drive.mount('/content/drive')
Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).

Importing the necessary libraries and overview of the dataset¶

In [61]:
import warnings                                 # Used to ignore the warnings given as output of the code
warnings.filterwarnings('ignore')

import numpy as np                              # Core Python libraries for numeric and dataframe computations
import pandas as pd

import matplotlib.pyplot as plt                 # Basic library for data visualization
import seaborn as sns                           # Slightly advanced library for data visualization

from collections import defaultdict             # A dictionary subclass that does not raise a KeyError for missing keys

from sklearn.metrics import mean_squared_error  # A performance metric from sklearn

Loading the data¶

In [62]:
# Importing the "ratings.csv" dataset
rating = pd.read_csv('/content/drive/MyDrive/ratings.csv')

Let's check the info of the data.

In [63]:
# Info of the dataframe
rating.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100836 entries, 0 to 100835
Data columns (total 4 columns):
 #   Column     Non-Null Count   Dtype  
---  ------     --------------   -----  
 0   userId     100836 non-null  int64  
 1   movieId    100836 non-null  int64  
 2   rating     100836 non-null  float64
 3   timestamp  100836 non-null  int64  
dtypes: float64(1), int64(3)
memory usage: 3.1 MB
  • In the ratings data, there are 100,836 rows and 4 columns.
  • All the columns are of numeric data type.
  • The timestamp column is stored as int64, i.e., as a raw Unix timestamp rather than a DateTime. We could convert it to DateTime format (as sketched below), but we don't need the timestamp for our analysis. Hence, we can drop this column.
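For reference, if the timestamp were needed, the conversion is a one-liner. A minimal sketch, assuming the values are Unix epoch seconds:

# Hypothetical conversion, shown for reference only (we drop the column instead)
rating['timestamp'] = pd.to_datetime(rating['timestamp'], unit = 's')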
In [64]:
# drop() is a method used to remove the desired columns/rows from a dataframe
rating = rating.drop(['timestamp'], axis = 1)

Now, let us see the top five records of the rating data.

In [65]:
# head() is a method used to display the first n records of a dataframe, by default n=5.
rating.head()
Out[65]:
   userId  movieId  rating
0       1        1     4.0
1       1        3     4.0
2       1        6     4.0
3       1       47     5.0
4       1       50     5.0

Exploring the dataset¶

Let's explore the dataset and answer some basic data-related questions:

Q1. What is the total number of unique users?¶

In [66]:
# Finding number of unique users by using nunique method
rating['userId'].nunique()
Out[66]:
610
  • There are 610 unique users in the "rating" dataset.

Q2. What is the total number of unique movies?¶

In [67]:
# Finding the number of unique movies
rating['movieId'].nunique()
Out[67]:
9724
  • There are 9,724 unique movies in the dataset.
  • Given the number of unique users and movies, there could be up to 610 * 9724 = 5,931,640 ratings in the dataset. But we only have 100,836 ratings, i.e., not every user has rated every movie, which is quite understandable. This sparsity creates the opportunity to build a recommendation system that recommends movies to users which they have not interacted with; a quick density calculation is sketched below.
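A minimal sketch to quantify this sparsity directly from the dataframe:

# Fraction of all possible user-movie pairs that actually have a rating
n_users = rating['userId'].nunique()
n_movies = rating['movieId'].nunique()
density = len(rating) / (n_users * n_movies)
print(round(density, 4))   # ~0.017, i.e., only ~1.7% of the possible pairs are rated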

Q3. Is there any movie that has been interacted with more than once by the same user?¶

In [68]:
# Finding the frequency of movies rated by each user
rating.groupby(['userId', 'movieId']).count()
Out[68]:
                 rating
userId movieId
1      1              1
       3              1
       6              1
       47             1
       50             1
...                 ...
610    166534         1
       168248         1
       168250         1
       168252         1
       170875         1

[100836 rows x 1 columns]

In [69]:
# Finding the sum of ratings count by user-movie pair
rating.groupby(['userId', 'movieId']).count()['rating'].sum()
Out[69]:
100836
  • The sum equals the total number of observations, which implies that there is at most one interaction for each user-movie pair.
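An equivalent and more direct check is to count duplicated user-movie pairs; a minimal sketch:

# Number of user-movie pairs that occur more than once (0 means every pair is unique)
rating.duplicated(subset = ['userId', 'movieId']).sum()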

Q4. Which one is the most interacted movie in the dataset?¶

In [70]:
# Counting the number of people who have watched a certain movie
rating['movieId'].value_counts()
Out[70]:
356       329
318       317
296       307
593       279
2571      278
         ... 
86279       1
86922       1
5962        1
87660       1
163981      1
Name: movieId, Length: 9724, dtype: int64
  • The movie with movieId 356 has been rated by the most users, with 329 interactions.
  • Still, there is room for 610 - 329 = 281 more interactions, as we have 610 unique users in our dataset. For those 281 remaining users, we can build a recommendation system to predict who is most likely to interact with this movie.

Out of these 329 interactions, we also need to consider the distribution of ratings to check whether this movie is mostly liked or mostly disliked.

In [71]:
# Plotting distributions of ratings for 329 interactions with movieid 356

# Let us fix the size of the figure
plt.figure(figsize = (7, 7))

rating[rating['movieId'] == 356]['rating'].value_counts().plot(kind = 'bar')

# This gives a label to the variable on the x-axis
plt.xlabel('Rating')

# This gives a label to the variable on the y-axis
plt.ylabel('Count')

# This displays the plot
plt.show()
  • We can see that this movie has been liked by the majority of users, as the count of ratings 5 and 4 is higher than the count of lower ratings.
  • There can be movies with very high interactions but the count of ratings 1 and 2 may be much higher than the count for ratings 4 or 5 which would imply that the movie is disliked by the majority of the users.

Q5. Which user interacted the most with any movie in the dataset?¶

In [72]:
# Counting the number of movies each user has watched
rating['userId'].value_counts()
Out[72]:
414    2698
599    2478
474    2108
448    1864
274    1346
       ... 
442      20
569      20
320      20
576      20
53       20
Name: userId, Length: 610, dtype: int64
  • The user with userId 414 has interacted with the largest number of movies: 2,698 interactions.
  • Still, there is room for 9724 - 2698 = 7026 more interactions, as we have 9,724 unique movies in our dataset. For those 7,026 remaining movies, we can build a recommendation system to predict which movies this user is most likely to watch.

Q6. What is the distribution of the user-movie interactions in this dataset?¶

In [73]:
# Finding user-movie interactions distribution
count_interactions = rating.groupby('userId').count()['movieId']
count_interactions
Out[73]:
userId
1       232
2        29
3        39
4       216
5        44
       ... 
606    1115
607     187
608     831
609      37
610    1302
Name: movieId, Length: 610, dtype: int64
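The raw counts above are easier to judge as a distribution. A minimal plotting sketch, assuming a seaborn version that provides histplot (0.11+):

# Plotting the distribution of the number of interactions per user
plt.figure(figsize = (7, 7))
sns.histplot(count_interactions)
plt.xlabel('Number of interactions per user')
plt.ylabel('Number of users')
plt.show()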

Rank Based Recommendation System¶

Rank-based recommendation systems provide recommendations based on the most popular items. This kind of recommendation system is useful when we face the cold start problem. Cold start refers to the situation where a new user enters the system and the machine cannot recommend movies to them because no historical interactions are available in the dataset. In those cases, we can use a rank-based recommendation system to recommend movies to the new user.

To build the rank-based recommendation system, we take the average of all the ratings provided to each movie and then rank them based on their average rating.

In [74]:
# Calculate average ratings for each movie
average_rating = rating.groupby('movieId').mean()['rating']

# Calculate the count of ratings for each movie
count_rating = rating.groupby('movieId').count()['rating']

# Making a dataframe with the count and average of ratings
final_rating = pd.DataFrame({'avg_rating': average_rating, 'rating_count': count_rating})
In [75]:
# First 5 records of the final_rating dataset
final_rating.head()
Out[75]:
         avg_rating  rating_count
movieId
1          3.920930           215
2          3.431818           110
3          3.259615            52
4          2.357143             7
5          3.071429            49

Now, let's create a function to find the top n movies for a recommendation based on the average ratings of movies. We can also add a threshold for a minimum number of interactions for a movie to be considered for recommendation.

In [76]:
# Function that returns the top n movies among those with more than min_interaction interactions
def top_n_movies(data, n, min_interaction = 100):
    
    # Finding movies with interactions greater than the minimum number of interactions
    recommendations = data[data['rating_count'] > min_interaction]
    
    # Sorting values with respect to the average rating
    recommendations = recommendations.sort_values(by = 'avg_rating', ascending = False)
    
    return recommendations.index[:n]

We can use this function with different n's and minimum interactions to get movies to be recommended.

Recommending top 5 movies with 50 minimum interactions based on popularity¶

In [77]:
list(top_n_movies(final_rating, 5, 50))
Out[77]:
[318, 858, 2959, 1276, 750]

Recommending top 5 movies with 100 minimum interactions based on popularity¶

In [78]:
list(top_n_movies(final_rating, 5, 100))
Out[78]:
[318, 858, 2959, 1221, 48516]

Recommending top 5 movies with 200 minimum interactions based on popularity¶

In [79]:
list(top_n_movies(final_rating, 5, 200))
Out[79]:
[318, 2959, 50, 260, 527]

Now that we have seen how to apply the Rank-Based Recommendation System, let's apply the Collaborative Filtering Based Recommendation Systems.

Collaborative Filtering Based Recommendation System¶

[Image: user-movie interactions matrix illustrating collaborative filtering]

In the above interactions matrix, out of users B and C, which user is most likely to interact with the movie, "The Terminal"?

In this type of recommendation system, we do not need any information about the user or item. We only need user-item interaction data to build a collaborative recommendation system. For example:

  1. Ratings provided by users, e.g., ratings of books on Goodreads, movie ratings on IMDb, etc.
  2. Likes of users on different Facebook posts, likes on YouTube videos, etc.
  3. Use/purchase of a product by users, e.g., buying different items on e-commerce sites.
  4. Reading of articles by readers on various blogs.

Types of Collaborative Filtering¶

  • Similarity/Neighborhood-based
    • User-User similarity-based
    • Item-Item similarity-based
  • Model-based

Building user-user similarity/neighborhood based Collaborative Filtering.¶

[Image: user-user similarity-based collaborative filtering illustration]

Building a baseline user-user similarity based recommendation system¶

  • Below, we build a similarity-based recommendation system using cosine similarity, with KNN used to find the users that are the nearest neighbors of a given user.
  • We will be using a new library, surprise, to build the remaining models. Let's first import the necessary classes and functions from this library.
  • Please use the following code to install the surprise library. You only need to do this once, when running the code for the first time.

!pip install surprise

In [80]:
# Installing the surprise library
!pip install surprise
Requirement already satisfied: surprise in /usr/local/lib/python3.7/dist-packages (0.1)
Requirement already satisfied: scikit-surprise in /usr/local/lib/python3.7/dist-packages (from surprise) (1.1.1)
Requirement already satisfied: numpy>=1.11.2 in /usr/local/lib/python3.7/dist-packages (from scikit-surprise->surprise) (1.21.6)
Requirement already satisfied: scipy>=1.0.0 in /usr/local/lib/python3.7/dist-packages (from scikit-surprise->surprise) (1.4.1)
Requirement already satisfied: six>=1.10.0 in /usr/local/lib/python3.7/dist-packages (from scikit-surprise->surprise) (1.15.0)
Requirement already satisfied: joblib>=0.11 in /usr/local/lib/python3.7/dist-packages (from scikit-surprise->surprise) (1.1.0)
In [81]:
# To compute the accuracy of models
from surprise import accuracy

# Class to parse a file containing ratings, data should be in structure - user; item; rating
from surprise.reader import Reader

# Class for loading datasets
from surprise.dataset import Dataset

# For tuning model hyperparameters
from surprise.model_selection import GridSearchCV

# For splitting the rating data in train and test datasets
from surprise.model_selection import train_test_split

# For implementing similarity-based recommendation system
from surprise.prediction_algorithms.knns import KNNBasic

# For implementing matrix factorization based recommendation system
from surprise.prediction_algorithms.matrix_factorization import SVD

# for implementing K-Fold cross-validation
from surprise.model_selection import KFold

# For implementing clustering-based recommendation system
from surprise import CoClustering

Before building the recommendation systems, let's go over some basic terminologies we are going to use:

Relevant item: An item (a movie in this case) whose actual rating is higher than the threshold rating (here 3.5) is relevant, and an item whose actual rating is lower than the threshold rating is a non-relevant item.

Recommended item: An item whose predicted rating is higher than the threshold (here 3.5) is a recommended item, and an item whose predicted rating is lower than the threshold rating is a non-recommended item, i.e., it will not be recommended to the user.

False Negative (FN): It is the frequency of relevant items that are not recommended to the user. If the relevant items are not recommended to the user, then the user might not buy the product/item. This would result in the loss of opportunity for the service provider, which they would like to minimize.

False Positive (FP): It is the frequency of recommended items that are actually not relevant. In this case, the recommendation system is not doing a good job of finding and recommending the relevant items to the user. This would result in loss of resources for the service provider, which they would also like to minimize.

Recall: It is the fraction of actually relevant items that are recommended to the user, i.e., if 6 out of 10 relevant products are recommended, then recall is 0.60. The higher the recall, the better the model. It is one of the standard metrics for assessing classification models.

Precision: It is the fraction of recommended items that are actually relevant, i.e., if 6 out of 10 recommended items are found relevant by the user, then precision is 0.60. The higher the precision, the better the model. It is another standard metric for assessing classification models.

When building a recommendation system, it is customary to evaluate how many of the recommendations are relevant and how many of the relevant items are recommended. Below are some of the most widely used performance metrics for assessing recommendation systems.

Precision@k, Recall@ k, and F1-score@k¶

Precision@k - It is the fraction of recommended items that are relevant in top k predictions. The value of k is the number of recommendations to be provided to the user. One can choose a variable number of recommendations to be given to a unique user.

Recall@k - It is the fraction of relevant items that are recommended to the user in top k predictions.

F1-score@k - It is the harmonic mean of Precision@k and Recall@k. When precision@k and recall@k both seem to be important, it is useful to use this metric because it is representative of both of them.
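In formula form:

$$F_1@k = \frac{2 \cdot Precision@k \cdot Recall@k}{Precision@k + Recall@k}$$

For example, with Precision@10 = 0.75 and Recall@10 = 0.54, F1-score@10 = (2 * 0.75 * 0.54) / (0.75 + 0.54) ≈ 0.63.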

Some useful functions¶

  • The function below takes a trained recommendation model as input and gives the precision@k, recall@k, and F1-score@k for that model.
  • To compute precision and recall, the top k predictions are taken into consideration for each user.
  • We then use the precision and recall to compute the F1-score.
In [82]:
def precision_recall_at_k(model, k = 10, threshold = 3.5):
    """Return precision and recall at k metrics for each user"""

    # First map the predictions to each user
    user_est_true = defaultdict(list)
    
    # Making predictions on the test data
    predictions = model.test(testset)
    
    for uid, _, true_r, est, _ in predictions:
        user_est_true[uid].append((est, true_r))

    precisions = dict()
    recalls = dict()
    for uid, user_ratings in user_est_true.items():

        # Sort user ratings by estimated value
        user_ratings.sort(key = lambda x: x[0], reverse = True)

        # Number of relevant items
        n_rel = sum((true_r >= threshold) for (_, true_r) in user_ratings)

        # Number of recommended items in top k
        n_rec_k = sum((est >= threshold) for (est, _) in user_ratings[:k])

        # Number of relevant and recommended items in top k
        n_rel_and_rec_k = sum(((true_r >= threshold) and (est >= threshold))
                              for (est, true_r) in user_ratings[:k])

        # Precision@K: Proportion of recommended items that are relevant
        # When n_rec_k is 0, Precision is undefined. Therefore, we are setting Precision to 0 when n_rec_k is 0

        precisions[uid] = n_rel_and_rec_k / n_rec_k if n_rec_k != 0 else 0

        # Recall@K: Proportion of relevant items that are recommended
        # When n_rel is 0, Recall is undefined. Therefore, we are setting Recall to 0 when n_rel is 0

        recalls[uid] = n_rel_and_rec_k / n_rel if n_rel != 0 else 0
    
    # Mean of all the per-user precisions is calculated
    precision = round((sum(prec for prec in precisions.values()) / len(precisions)), 3)
    
    # Mean of all the per-user recalls is calculated
    recall = round((sum(rec for rec in recalls.values()) / len(recalls)), 3)
    
    accuracy.rmse(predictions)
    
    print('Precision: ', precision) # Command to print the overall precision
    
    print('Recall: ', recall) # Command to print the overall recall
    
    print('F_1 score: ', round((2*precision*recall)/(precision+recall), 3)) # Formula to compute the F-1 score
  • To compute precision and recall, a threshold of 3.5 and a k value of 10 are used for the recommended and relevant ratings.
  • In the present case, both precision and recall need to be optimized, as the service provider would like to minimize both of the losses discussed above. Hence, the appropriate performance measure is the F_1 score.

Below we are converting the rating dataset, which is a pandas dataframe, into a different format called surprise.dataset.DatasetAutoFolds. This is required by the surprise library. To do this, we will be using the classes Reader and Dataset.

In [83]:
# Instantiating Reader scale with expected rating scale
reader = Reader(rating_scale = (0, 5))

# Loading the rating dataset
data = Dataset.load_from_df(rating[['userId', 'movieId', 'rating']], reader)

# Splitting the data into train and test datasets
trainset, testset = train_test_split(data, test_size = 0.2, random_state = 42)
  • Now, we are ready to build the first baseline similarity-based recommendation system using cosine similarity.
  • KNNBasic is a basic collaborative filtering algorithm provided by the surprise package. It is used to find the desired similar items among a given set of items.
In [84]:
# Declaring the similarity options
sim_options = {'name': 'cosine',
               'user_based': True}

# KNN algorithm is used to find desired similar items
sim_user_user = KNNBasic(sim_options = sim_options, verbose = False, random_state = 1)

# Train the algorithm on the trainset, and predict ratings for the test set
sim_user_user.fit(trainset)

# Let us compute precision@k, recall@k, and F_1 score with k = 10
precision_recall_at_k(sim_user_user)
RMSE: 0.9823
Precision:  0.757
Recall:  0.542
F_1 score:  0.632
  • We have calculated RMSE to check how far the overall predicted ratings are from the actual ratings.
  • Intuition of Recall: We are getting a recall of ~0.54, which means out of all the relevant movies 54% are recommended.
  • Intuition of Precision: We are getting a precision of ~ 0.76, which means out of all the recommended movies 76% are relevant.
  • Here, the F_1 score of the baseline model is ~0.63. It indicates that, for the most part, recommended movies were relevant and relevant movies were recommended. We will try to improve this later by tuning different hyperparameters of this algorithm using GridSearchCV.

Now, let's predict the rating for the user with userId = 4 and the movie with movieId = 10 as shown below. Here, the user has already interacted or watched the movie with movieId 10.

In [85]:
# Predicting rating for a sample user with an interacted movie
sim_user_user.predict(4, 10, r_ui = 4, verbose = True)
user: 4          item: 10         r_ui = 4.00   est = 3.41   {'actual_k': 40, 'was_impossible': False}
Out[85]:
Prediction(uid=4, iid=10, r_ui=4, est=3.4133289774831344, details={'actual_k': 40, 'was_impossible': False})
  • The above output shows that, for this user-item pair, the rating predicted by the user-user similarity-based baseline model is not far from the actual rating.
  • The output also contains "actual_k": the number of neighbors actually used to compute this prediction. It is capped at the model's k hyperparameter, whose default value is 40.

Below is the list of users who have seen the movie with movieId 3.

In [86]:
rating[rating.movieId == 3].userId.unique()
Out[86]:
array([  1,   6,  19,  32,  42,  43,  44,  51,  58,  64,  68,  91, 100,
       102, 116, 117, 150, 151, 169, 179, 217, 226, 240, 269, 270, 288,
       289, 294, 302, 307, 308, 321, 330, 337, 368, 410, 414, 448, 456,
       470, 477, 480, 492, 501, 544, 552, 555, 588, 590, 594, 599, 608])
  • As the above list does not contain userId 4, we can say that the user with userId 4 has not seen the movie with movieId 3.

Below, we are predicting the rating for the same userId = 4 but for a movie with which this user has not interacted yet, i.e., movieId = 3.

In [87]:
# Predicting rating for a sample user with a non interacted movie
sim_user_user.predict(4, 3, verbose = True)
user: 4          item: 3          r_ui = None   est = 3.26   {'actual_k': 40, 'was_impossible': False}
Out[87]:
Prediction(uid=4, iid=3, r_ui=None, est=3.260929007645626, details={'actual_k': 40, 'was_impossible': False})

Above, we have predicted the rating for this user-item pair based on this user-user similarity-based baseline model.

Improving similarity-based recommendation system by tuning its hyperparameters¶

Below, we will be tuning hyperparameters for the KNNBasic algorithms. Let's try to understand some of the hyperparameters of this algorithm:

  • k (int): The (max) number of neighbors to take into account for aggregation. The default value of k is 40.
  • min_k (int): The minimum number of neighbors to take into account for aggregation. If there are not enough neighbors, the prediction is set to the global mean of all ratings. The default value is 1.
  • sim_options (dict): A dictionary of options for the similarity measure. There are four similarity measures available in surprise:
    • cosine
    • msd (default)
    • pearson
    • pearson_baseline

Note: GridSearchCV does not accept the metrics recall@k, precision@k, or F1 Score@k. As a result, we'll tune the model using RMSE.

In [88]:
# Setting up parameter grid to tune the hyperparameters
param_grid = {'k': [30, 40, 50], 'min_k': [3, 6, 9],
              'sim_options': {'name': ['msd', 'cosine'],
                              'user_based': [True]}
              }

# Performing 3-Fold cross-validation to tune the hyperparameters
gs = GridSearchCV(KNNBasic, param_grid, measures = ['rmse'], cv = 3, n_jobs = -1)

# Fitting the model on data
gs.fit(data)

# Printing the best RMSE score
print(gs.best_score['rmse'])

# Printing the combination of parameters that gave the best RMSE score
print(gs.best_params['rmse'])
0.947994992166605
{'k': 30, 'min_k': 3, 'sim_options': {'name': 'msd', 'user_based': True}}

Once the grid search is complete, we can get the optimal values for each of those hyperparameters as shown above.

Now, let's build the final model by using the optimal values of the hyperparameters, which we received by using the grid search cross-validation.

In [89]:
# Using the optimal similarity measure for user-user collaborative filtering
sim_options = {'name': 'msd',
               'user_based': True}

# Creating an instance of KNNBasic with optimal hyperparameter values
sim_user_user_optimized = KNNBasic(sim_options = sim_options, k = 30, min_k = 3, random_state = 1, verbose = False)

# Training the algorithm on the trainset
sim_user_user_optimized.fit(trainset)

# Let us compute precision@k, recall@k, and F_1 score with k = 10
precision_recall_at_k(sim_user_user_optimized)
RMSE: 0.9467
Precision:  0.762
Recall:  0.554
F_1 score:  0.642
  • We can see from the above that, after tuning the hyperparameters, the F_1 score of the tuned model has increased slightly compared to the baseline model. Along with this, the RMSE has gone down compared to the model before hyperparameter tuning. Hence, we can say that the model performance has improved slightly after hyperparameter tuning.

Let us now predict the rating for the user with userId = 4 and the movie with movieId = 10 with the optimized model as shown below.

In [90]:
sim_user_user_optimized.predict(4, 10, r_ui = 4, verbose = True)
user: 4          item: 10         r_ui = 4.00   est = 3.50   {'actual_k': 30, 'was_impossible': False}
Out[90]:
Prediction(uid=4, iid=10, r_ui=4, est=3.497691535784751, details={'actual_k': 30, 'was_impossible': False})
  • If we compare the predicted rating with the actual rating, we can say that the prediction from the tuned model is closer to the actual rating than the prediction from the baseline model.

Below, we are predicting the rating for the same userId = 4 but for a movie with which this user has not interacted before, i.e., movieId = 3, by using the optimized model as shown below.

In [91]:
sim_user_user_optimized.predict(4, 3, verbose = True)
user: 4          item: 3          r_ui = None   est = 3.45   {'actual_k': 30, 'was_impossible': False}
Out[91]:
Prediction(uid=4, iid=3, r_ui=None, est=3.4530529132024763, details={'actual_k': 30, 'was_impossible': False})

Identifying similar users to a given user (nearest neighbors)¶

We can also find the users most similar to a given user, i.e., its nearest neighbors, based on this KNNBasic algorithm. Below, we are finding the 5 most similar users to user 4 based on the msd similarity measure (see the note on surprise's inner vs. raw ids after the output).

In [92]:
sim_user_user_optimized.get_neighbors(4, k = 5)
Out[92]:
[89, 90, 91, 181, 230]
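One caveat: in the surprise library, get_neighbors works with the trainset's inner ids rather than the raw ids from the dataframe, and the ids it returns are inner ids as well. A minimal sketch of the explicit conversion using the Trainset's mapping methods:

# Convert the raw userId to the trainset's inner id before querying neighbors
inner_uid = trainset.to_inner_uid(4)
neighbor_inner_ids = sim_user_user_optimized.get_neighbors(inner_uid, k = 5)

# Map the neighbors' inner ids back to raw userIds
[trainset.to_raw_uid(inner_id) for inner_id in neighbor_inner_ids]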

Implementing the recommendation algorithm based on optimized KNNBasic model¶

Below we will be implementing a function where the input parameters are:

  • data: The rating dataset.
  • user_id: The user id for which we want the recommendations.
  • top_n: The number of movies we want to recommend.
  • algo: The algorithm we want to use for predicting the ratings.
  • The output of the function is a list of the top_n movies (with their predicted ratings) recommended for the given user_id by the given algorithm.
In [93]:
def get_recommendations(data, user_id, top_n, algo):
    
    # Creating an empty list to store the recommended movie ids
    recommendations = []
    
    # Creating an user item interactions matrix
    user_item_interactions_matrix = data.pivot(index = 'userId', columns = 'movieId', values = 'rating')
    
    # Extracting those movie IDs which the user ID has not interacted yet
    non_interacted_movies = user_item_interactions_matrix.loc[user_id][user_item_interactions_matrix.loc[user_id].isnull()].index.tolist()
    
    # Looping through each of the movie IDs which user ID has not interacted yet
    for item_id in non_interacted_movies:
        
        # Predicting the ratings for those non interacted movie IDs by this user
        est = algo.predict(user_id, item_id).est
        
        # Appending the predicted ratings
        recommendations.append((item_id, est))

    # Sorting the predicted ratings in descending order
    recommendations.sort(key = lambda x: x[1], reverse = True)

    # Returning top n highest predicted rating movies for this user
    return recommendations[:top_n]

Predicted top 5 movies for userId = 4 using the similarity-based recommendation system.¶

In [94]:
# Making top 5 recommendations for userId 4 using the similarity-based recommendation system
recommendations = get_recommendations(rating, 4, 5, sim_user_user_optimized)
In [95]:
# Building the dataframe for above recommendations with columns "movieId" and "predicted_ratings"
pd.DataFrame(recommendations, columns = ['movieId', 'predicted_ratings'])
Out[95]:
   movieId  predicted_ratings
0     3404           5.000000
1     7121           5.000000
2     6460           4.844207
3   115122           4.813285
4     1178           4.808807

Correcting the Ratings and Ranking the above movies¶

When comparing two movies, the predicted rating alone does not fully describe how likely a user is to enjoy a movie; the number of users who have rated that movie also matters. Generally, the higher the "rating_count" of a movie, the more reliable its rating is: a rating of 4 based on only 3 interactions is much weaker evidence of quality than a rating of 3 based on 50 interactions. A common empirical correction is therefore to penalize each predicted rating by a quantity inversely proportional to the square root of the movie's rating_count; this is what we compute below as the "corrected_ratings".
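Concretely, the correction applied in the function below is:

$$corrected\_rating = predicted\_rating - \frac{1}{\sqrt{rating\_count}}$$

For example, a movie with a predicted rating of 5.0 and a rating_count of 6 gets a corrected rating of 5.0 - 1/√6 ≈ 4.59, which matches the first row of the ranked output further below.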

In [96]:
def ranking_movies(recommendations, final_rating):

    # Sort the recommended movies based on their ratings count
    ranked_movies = final_rating.loc[[items[0] for items in recommendations]].sort_values('rating_count', ascending = False)[['rating_count']].reset_index()

    # Merge with the recommended movies to get predicted ratings
    ranked_movies = ranked_movies.merge(pd.DataFrame(recommendations, columns = ['movieId', 'predicted_ratings']), on = 'movieId', how = 'inner')

    # Compute corrected ratings by penalizing movies with few ratings
    ranked_movies['corrected_ratings'] = ranked_movies['predicted_ratings'] - 1 / np.sqrt(ranked_movies['rating_count'])

    # Sort the movies based on corrected ratings
    ranked_movies = ranked_movies.sort_values('corrected_ratings', ascending = False)

    return ranked_movies

Note: In the above-corrected rating formula, we can add the quantity 1/np.sqrt(n) instead of subtracting it to get more optimistic predictions. But here we are subtracting this quantity, as there are some movies with ratings of 5 and we can't have a rating more than 5 for a movie.

In [97]:
# Applying the ranking movies function and sorting it based on corrected ratings
ranking_movies(recommendations, final_rating)
Out[97]:
   movieId  rating_count  predicted_ratings  corrected_ratings
1     3404             6           5.000000           4.591752
0     1178            12           4.808807           4.520132
3     7121             4           5.000000           4.500000
2     6460             5           4.844207           4.396993
4   115122             3           4.813285           4.235934

Item-Item Similarity-Based Collaborative Filtering Recommendation Systems¶

We have seen user-user similarity-based collaborative filtering. Now, let us look into similarity-based collaborative filtering where similarity is computed between items.

In [98]:
# Declaring the similarity options
sim_options = {'name': 'cosine',
               'user_based': False}

# The KNN algorithm is used to find desired similar items
sim_item_item = KNNBasic(sim_options = sim_options, random_state = 1, verbose = False)

# Train the algorithm on the trainset, and predict ratings for the testset
sim_item_item.fit(trainset)

# Let us compute precision@k, recall@k, and f_1 score with k = 10
precision_recall_at_k(sim_item_item)
RMSE: 0.9800
Precision:  0.609
Recall:  0.464
F_1 score:  0.527
  • Here, F_1 score of the baseline model is ~0.53. We will try to improve this later by tuning different hyperparameters of this algorithm using GridSearchCV.

Let's now predict the rating for the user with userId = 4 and the movie with movieId = 10 as shown below. Here, the user has already interacted or watched the movie with movieId 10.

In [99]:
# Predicting rating for a sample user with an interacted movie
sim_item_item.predict(4, 10, r_ui = 4, verbose = True)
user: 4          item: 10         r_ui = 4.00   est = 3.63   {'actual_k': 40, 'was_impossible': False}
Out[99]:
Prediction(uid=4, iid=10, r_ui=4, est=3.6257369831511945, details={'actual_k': 40, 'was_impossible': False})
  • The above output shows that the actual rating for this user-item pair is 4, and the predicted rating is close to that.

Below, we are predicting the rating for the same userId = 4 but for a movie with which this user has not interacted yet, i.e., movieId = 3.

In [100]:
# Predicting rating for a sample user with a non interacted movie
sim_item_item.predict(4, 3, verbose = True)
user: 4          item: 3          r_ui = None   est = 3.67   {'actual_k': 40, 'was_impossible': False}
Out[100]:
Prediction(uid=4, iid=3, r_ui=None, est=3.6748659322681623, details={'actual_k': 40, 'was_impossible': False})

Improving similarity-based recommendation system by tuning its hyperparameters¶

Below, we will be tuning hyperparameters of the KNNBasic algorithm.

In [101]:
# Setting up parameter grid to tune the hyperparameters
param_grid = {'k': [10, 20, 30], 'min_k': [3, 6, 9],
              'sim_options': {'name': ['msd', 'cosine'],
                              'user_based': [False]}
              }

# Performing 3-Fold cross validation to tune the hyperparameters
gs = GridSearchCV(KNNBasic, param_grid, measures = ['rmse'], cv = 3, n_jobs = -1)

# Fitting the model on the data
gs.fit(data)

# Print the best RMSE score
print(gs.best_score['rmse'])

# Print the combination of parameters that gave the best RMSE score
print(gs.best_params['rmse'])
0.9168710668985779
{'k': 30, 'min_k': 6, 'sim_options': {'name': 'msd', 'user_based': False}}

Once the grid search is complete, we can get the optimal values for each of those hyperparameters as shown above.

Now, let's build the model using the optimal values of the hyperparameters, which we received using the grid search cross-validation.

In [102]:
# Using the optimal similarity measure for item-item based collaborative filtering
sim_options = {'name': 'msd',
               'user_based': False}

# Creating an instance of KNNBasic with optimal hyperparameter values
sim_item_item_optimized = KNNBasic(sim_options = sim_options, k = 30, min_k = 6, random_state = 1, verbose = False)

# Training the algorithm on the trainset
sim_item_item_optimized.fit(trainset)

# Let us compute precision@k, recall@k, and f_1 score with k = 10
precision_recall_at_k(sim_item_item_optimized)
RMSE: 0.9160
Precision:  0.678
Recall:  0.499
F_1 score:  0.575
  • We can observe that after tuning hyperparameters, F_1 score of the model is better than the baseline model. Along with this, the RMSE of the model has gone down in comparison to the model with default hyperparameters. Hence, we can say that the model performance has improved after hyperparameter tuning.

Let's now predict the rating for the user with userId = 4 and the movie with movieId = 10 using the optimized model as shown below.

In [103]:
sim_item_item_optimized.predict(4, 10, r_ui = 4, verbose = True)
user: 4          item: 10         r_ui = 4.00   est = 3.26   {'actual_k': 30, 'was_impossible': False}
Out[103]:
Prediction(uid=4, iid=10, r_ui=4, est=3.2569148418342952, details={'actual_k': 30, 'was_impossible': False})
  • We observe that the predicted rating by the baseline model was closer to the actual rating in comparison to that by the optimized model. This might be because, although the overall score has increased, the prediction for this particular user is not that good.

Below, we are predicting the rating for the same userId = 4 but for a movie with which this user has not interacted before, i.e., movieId = 3, using the optimized model.

In [104]:
sim_item_item_optimized.predict(4, 3, verbose = True)
user: 4          item: 3          r_ui = None   est = 3.57   {'actual_k': 30, 'was_impossible': False}
Out[104]:
Prediction(uid=4, iid=3, r_ui=None, est=3.574650247164053, details={'actual_k': 30, 'was_impossible': False})

Identifying similar items to a given item (nearest neighbors)¶

We can also find the items most similar to a given item, i.e., its nearest neighbors, based on this KNNBasic algorithm. Below, we are finding the 5 most similar items to item 4 based on the msd similarity measure (again, get_neighbors works with inner ids, as noted earlier).

In [105]:
sim_item_item_optimized.get_neighbors(4, k = 5)
Out[105]:
[45, 73, 148, 155, 180]

Predicting top 5 movies for userId = 4 using the similarity-based recommendation system.¶

In [106]:
# Making top 5 recommendations for userId 4 using the similarity-based recommendation system
recommendations = get_recommendations(rating, 4, 5, sim_item_item_optimized)
In [107]:
# Building the dataframe for above recommendations with columns "movieId" and "predicted_ratings"
pd.DataFrame(recommendations, columns = ['movieId', 'predicted_ratings'])
Out[107]:
   movieId  predicted_ratings
0     5706           4.771028
1   176579           4.748016
2    25959           4.744049
3     2149           4.730439
4    56176           4.724374
In [108]:
# Applying the "ranking_movies" function and sorting it based on corrected ratings
ranking_movies(recommendations, final_rating)
Out[108]:
   movieId  rating_count  predicted_ratings  corrected_ratings
0     2149             4           4.730439           4.230439
1    56176             3           4.724374           4.147023
2     5706             1           4.771028           3.771028
3   176579             1           4.748016           3.748016
4    25959             1           4.744049           3.744049

Now, as we have seen similarity-based collaborative filtering algorithms, let us now get into the model-based collaborative filtering algorithm.

Model Based Collaborative Filtering - Matrix Factorization¶

Model-based collaborative filtering is a personalized recommendation approach in which recommendations are based on the past behavior of the user, without depending on any additional information about the user or the items. We use latent features to find recommendations for each user.

Latent Features: These are features that are not present in the original data but are inferred from it. They are given names only after they are discovered. For example:

[Image: a set of example movies]

Now, if we notice the above movies closely:

[Image: the same movies grouped by genre]

Here, Action, Romance, Suspense, and Comedy are latent features of the corresponding movies. Similarly, we can compute the latent features for users as shown below:

[Image: latent features computed for users]

Singular Value Decomposition (SVD)¶

SVD is used to compute the latent features from the user-item interaction matrix that we learned about earlier. However, classical SVD does not work when there are missing values in the user-item interaction matrix.

First, we need to convert the below movie-rating dataset:

[Image: movie-rating dataset]

into a user-item interaction matrix as shown below:

[Image: user-item interaction matrix]

We have already done this while computing cosine similarities.
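As a reminder, that conversion is a one-line pivot in pandas (users as rows, movies as columns, NaN where a rating is missing), the same operation used inside get_recommendations above:

# Building the user-item interaction matrix from the ratings dataframe
user_item_matrix = rating.pivot(index = 'userId', columns = 'movieId', values = 'rating')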

SVD decomposes the above matrix into three separate matrices:

  • U matrix
  • Sigma matrix
  • V transpose matrix

U-matrix¶

[Image: U matrix]

U-matrix is a matrix with dimension m x k, where:

  • m is the number of users
  • k is the number of latent features

Sigma-matrix¶

[Image: Sigma matrix]

Sigma-matrix is a matrix with dimension k x k, where:

  • k is the number of latent features
  • Each diagonal entry is a singular value of the original interaction matrix

V-transpose matrix¶

[Image: V-transpose matrix]

V-transpose matrix is a matrix with dimension k x n, where:

  • k is the number of latent features
  • n is the number of items
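To make these shapes concrete, here is a minimal numpy sketch on a small, fully observed toy matrix. This is classical SVD for illustration only; surprise's SVD instead learns the factors by gradient descent on the observed ratings, precisely because the real interaction matrix has missing values:

# Classical SVD on a dense toy interaction matrix (4 users x 3 items)
import numpy as np

R = np.array([[5.0, 3.0, 1.0],
              [4.0, 3.0, 1.0],
              [1.0, 1.0, 5.0],
              [1.0, 2.0, 4.0]])

U, sigma, Vt = np.linalg.svd(R, full_matrices = False)
print(U.shape, sigma.shape, Vt.shape)   # (4, 3) (3,) (3, 3) -> m x k, k singular values, k x n

# Keep only the top 2 latent features and reconstruct an approximation of R
k = 2
R_approx = U[:, :k] @ np.diag(sigma[:k]) @ Vt[:k, :]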

Building a baseline matrix factorization recommendation system¶

In [109]:
# Using SVD with matrix factorization
svd = SVD(random_state = 1)

# Training the algorithm on the training dataset
svd.fit(trainset)

# Let us compute precision@k, recall@k, and f_1 score with k = 10
precision_recall_at_k(svd)
RMSE: 0.8797
Precision:  0.738
Recall:  0.507
F_1 score:  0.601
  • We can observe that the F_1 score of the matrix factorization model on the test set is lower than the F_1 scores of both the baseline and the optimized user-user similarity-based recommendation systems.

Let's now predict the rating for the user with userId = 4 and the movie with movieId = 10 as shown below. Here, the user has already rated the movie.

In [110]:
# Making prediction for userId 4 and movieId 10
svd.predict(4, 10, r_ui = 4, verbose = True)
user: 4          item: 10         r_ui = 4.00   est = 3.33   {'was_impossible': False}
Out[110]:
Prediction(uid=4, iid=10, r_ui=4, est=3.333359479354037, details={'was_impossible': False})
  • The actual rating for this user-item pair is 4, while the predicted rating is noticeably lower; the model seems to have under-estimated the rating. We will try to fix this by tuning the hyperparameters of the model using GridSearchCV.

Below, we are predicting the rating for the same userId = 4 but for a movie with which this user has not interacted before, i.e., movieId = 3.

In [111]:
# Making prediction for userid 4 and movieId 3
svd.predict(4, 3, verbose = True)
user: 4          item: 3          r_ui = None   est = 2.94   {'was_impossible': False}
Out[111]:
Prediction(uid=4, iid=3, r_ui=None, est=2.9386110726567756, details={'was_impossible': False})

Improving matrix factorization based recommendation system by tuning its hyperparameters¶

In SVD, rating is predicted as:

$$\hat{r}_{u i}=\mu+b_{u}+b_{i}+q_{i}^{T} p_{u}$$

If user $u$ is unknown, then the bias $b_{u}$ and the factors $p_{u}$ are assumed to be zero. The same applies for item $i$ with $b_{i}$ and $q_{i}$.

To estimate all the unknown, we minimize the following regularized squared error:

$$\sum_{r_{u i} \in R_{\text {train }}}\left(r_{u i}-\hat{r}_{u i}\right)^{2}+\lambda\left(b_{i}^{2}+b_{u}^{2}+\left\|q_{i}\right\|^{2}+\left\|p_{u}\right\|^{2}\right)$$

The minimization is performed by a very straightforward stochastic gradient descent:

$$\begin{aligned} b_{u} & \leftarrow b_{u}+\gamma\left(e_{u i}-\lambda b_{u}\right) \\ b_{i} & \leftarrow b_{i}+\gamma\left(e_{u i}-\lambda b_{i}\right) \\ p_{u} & \leftarrow p_{u}+\gamma\left(e_{u i} \cdot q_{i}-\lambda p_{u}\right) \\ q_{i} & \leftarrow q_{i}+\gamma\left(e_{u i} \cdot p_{u}-\lambda q_{i}\right) \end{aligned}$$
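A minimal sketch of a single SGD step for one observed rating, directly following the update rules above (all values are toy numbers chosen for illustration; this is not surprise's internal code):

# One SGD update for a single observed rating r_ui
import numpy as np

mu, gamma, lam = 3.5, 0.005, 0.02    # global mean, learning rate gamma, regularization lambda
b_u, b_i = 0.1, -0.05                # user and item biases (toy values)
p_u = np.array([0.1, 0.3])           # user latent factors (k = 2)
q_i = np.array([0.2, -0.1])          # item latent factors (k = 2)

r_ui = 4.0                                     # observed rating
e_ui = r_ui - (mu + b_u + b_i + q_i @ p_u)     # prediction error

# Simultaneous updates, using the old values of p_u and q_i on the right-hand side
b_u, b_i = b_u + gamma * (e_ui - lam * b_u), b_i + gamma * (e_ui - lam * b_i)
p_u, q_i = p_u + gamma * (e_ui * q_i - lam * p_u), q_i + gamma * (e_ui * p_u - lam * q_i)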

There are many hyperparameters to tune in this algorithm; you can find the full list of hyperparameters in the surprise documentation.

Below we will be tuning only three hyperparameters:

  • n_epochs: The number of iterations of the SGD procedure.
  • lr_all: The learning rate for all the parameters.
  • reg_all: The regularization term for all the parameters.
In [112]:
# Set the parameter space to do hyperparameter tuning
param_grid = {'n_epochs': [10, 20, 30], 'lr_all': [0.001, 0.005, 0.01],
              'reg_all': [0.2, 0.4, 0.6]}

# Performing 3-Fold gridsearch cross-validation
gs = GridSearchCV(SVD, param_grid, measures = ['rmse'], cv = 3, n_jobs = -1)

# Fitting the model on the data
gs.fit(data)

# Print the best RMSE score
print(gs.best_score['rmse'])

# Print the combination of parameters that gave the best RMSE score
print(gs.best_params['rmse'])
0.8717294809446704
{'n_epochs': 30, 'lr_all': 0.01, 'reg_all': 0.2}

Once the grid search is complete, we can get the optimal values for each of those hyperparameters as shown above.

Now, we will build the model using the optimal values of hyperparameters that we received from the grid search cross-validation.

In [113]:
# Building the optimized SVD model using optimal hyperparameters search
svd_optimized = SVD(n_epochs = 30, lr_all = 0.01, reg_all = 0.2, random_state = 1)

# Training the algorithm on the train set
svd_optimized = svd_optimized.fit(trainset)

# Let us compute precision@k, recall@k, and f_1 score with k = 10
precision_recall_at_k(svd_optimized)
RMSE: 0.8752
Precision:  0.731
Recall:  0.511
F_1 score:  0.602
  • We can observe that after tuning hyperparameters, the model performance has not improved by much.

Let's now predict the rating for the user with userId = 4 and the movie with movieId = 10 with the optimized model as shown below.

In [114]:
# Using svd_algo_optimized model to recommend for userId 4 and movieId 10
svd_optimized.predict(4, 10, r_ui = 4, verbose = True)
user: 4          item: 10         r_ui = 4.00   est = 3.39   {'was_impossible': False}
Out[114]:
Prediction(uid=4, iid=10, r_ui=4, est=3.3892642624049993, details={'was_impossible': False})
In [115]:
# Using svd_algo_optimized model to recommend for userId 4 and movieId 3 with unknown baseline rating
svd_optimized.predict(4, 3, verbose = True)
user: 4          item: 3          r_ui = None   est = 3.20   {'was_impossible': False}
Out[115]:
Prediction(uid=4, iid=3, r_ui=None, est=3.20286300753659, details={'was_impossible': False})

Now, let's recommend the movies using the optimized svd model.

In [116]:
# Getting top 5 recommendations for userId 4 using "svd_algo_optimized" algorithm
svd_recommendations = get_recommendations(rating, 4, 5, svd_optimized)
In [117]:
# Ranking movies based on above recommendations
ranking_movies(svd_recommendations, final_rating)
Out[117]:
   movieId  rating_count  predicted_ratings  corrected_ratings
0     1178            12           4.446400           4.157725
1   177593             8           4.380428           4.026875
2   106642             7           4.379596           4.001631
3     3266             6           4.332485           3.924236
4     7121             4           4.342665           3.842665

Conclusion¶

In this case study, we built recommendation systems using four different algorithms. They are as follows:

  • Rank-based using averages
  • User-User similarity-based collaborative filtering
  • Item-Item similarity-based collaborative filtering
  • Model-based collaborative filtering (matrix factorization)
  • The surprise library was used to demonstrate the "user-user similarity-based collaborative filtering," "item-item similarity-based collaborative filtering," and "model-based collaborative filtering (matrix factorization)" algorithms. For these algorithms, grid search cross-validation was used to find the optimal hyperparameters for the data, and the corresponding predictions were made using those optimal hyperparameters.
  • For the performance evaluation of these models, precision@k and recall@k are used. From these two metrics, the F_1 score is calculated for each working model.
  • Overall, the user-user similarity-based recommendation system has given the best performance in terms of the F_1 score.
  • We can try to further improve the performance of these models with more extensive hyperparameter tuning.
  • We can also combine different recommendation techniques to build hybrid recommendation systems.