Movie Recommendation System Part 2¶


Context¶


Online streaming platforms like Netflix maintain large movie repositories. If we can build a recommendation system that suggests movies to users based on their historical interactions with movies, this would improve customer satisfaction, and increased customer satisfaction in turn increases the company's revenue. The techniques we will learn here are not limited to movies; they can be applied to any item for which a recommendation system can be built.


Objective¶


Using the datasets described below, we will build two different types of recommendation systems:

  • Clustering-based recommendation system.
  • Content-based recommendation system.

Dataset¶


We will use the following three datasets for this case study:

  • ratings dataset - This dataset contains the following attributes:

    • userId
    • movieId
    • rating
    • timestamp
  • movies dataset - This dataset contains the following attributes:

    • movieId
    • title
    • genres
  • tags dataset - This dataset contains the following attributes:

    • userId
    • movieId
    • tag
    • timestamp

Sometimes, the installation of the surprise library, which is used to build recommendation systems, faces issues in Jupyter. To avoid any issues, it is advised to use Google Colab for this case study.

Let's start by mounting Google Drive on Colab.

In [50]:
# Mounting the drive
from google.colab import drive
drive.mount('/content/drive')
Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).

Importing the necessary libraries and overview of the dataset¶

In [51]:
import warnings                                 # Used to ignore the warning given as output of the code
warnings.filterwarnings('ignore')

import numpy as np                              # Basic libraries of python for numeric and dataframe computations
import pandas as pd

import matplotlib.pyplot as plt                 # Basic library for data visualization
import seaborn as sns                           # Slightly advanced library for data visualization

from collections import defaultdict             # A dictionary that does not raise a key error

from sklearn.metrics.pairwise import cosine_similarity # To find the similarity between two vectors

from sklearn.metrics import mean_squared_error  # A performance metric in sklearn

Loading the data¶

In [52]:
# Loading the movies dataset
movies = pd.read_csv('/content/drive/MyDrive/movies.csv')

# Let us see the first five records of the dataset
movies.head()
Out[52]:
movieId title genres
0 1 Toy Story (1995) Adventure|Animation|Children|Comedy|Fantasy
1 2 Jumanji (1995) Adventure|Children|Fantasy
2 3 Grumpier Old Men (1995) Comedy|Romance
3 4 Waiting to Exhale (1995) Comedy|Drama|Romance
4 5 Father of the Bride Part II (1995) Comedy
In [53]:
# Shape of the DataFrame
movies.shape
Out[53]:
(9742, 3)
In [54]:
# Loading the ratings dataset
ratings = pd.read_csv('/content/drive/MyDrive/ratings.csv')
In [55]:
# Shape of the ratings dataset
ratings.shape
Out[55]:
(100836, 4)

Let's merge both the datasets to get the title and rating of each movie in a single DataFrame.

In [56]:
# Merging datasets on movieId 
ratings_with_title = pd.merge(ratings, movies[['movieId', 'title']], on = 'movieId', how = 'inner')

# See the first five records of the dataset
ratings_with_title.head()
Out[56]:
userId movieId rating timestamp title
0 1 1 4.0 964982703 Toy Story (1995)
1 5 1 4.0 847434962 Toy Story (1995)
2 7 1 4.5 1106635946 Toy Story (1995)
3 15 1 2.5 1510577970 Toy Story (1995)
4 17 1 4.5 1305696483 Toy Story (1995)

Let's check the info of the data.

In [57]:
# Checking info of the merged dataset
ratings_with_title.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 100836 entries, 0 to 100835
Data columns (total 5 columns):
 #   Column     Non-Null Count   Dtype  
---  ------     --------------   -----  
 0   userId     100836 non-null  int64  
 1   movieId    100836 non-null  int64  
 2   rating     100836 non-null  float64
 3   timestamp  100836 non-null  int64  
 4   title      100836 non-null  object 
dtypes: float64(1), int64(3), object(1)
memory usage: 4.6+ MB
  • There are 100,836 observations and 5 columns in the data.
  • All the columns are numeric except the title column, which is of object data type.
  • The timestamp column is of int64 data type. We could convert it to DateTime format, but we don't need the timestamp for our analysis. Hence, we can drop this column.
In [58]:
# Dropping the timestamp column
rating = ratings_with_title.drop(['timestamp'], axis = 1)
In [59]:
# Calculating average ratings
average_rating = rating.groupby('movieId').mean()['rating']

# Calculating the count of ratings
count_rating = rating.groupby('movieId').count()['rating']

# Making a dataframe with the count and average of ratings
final_rating = pd.DataFrame({'avg_rating': average_rating, 'rating_count': count_rating})
In [60]:
# See the first five records of the final_rating dataset
final_rating.head()
Out[60]:
avg_rating rating_count
movieId
1 3.920930 215
2 3.431818 110
3 3.259615 52
4 2.357143 7
5 3.071429 49

Exploring the dataset¶

Let's explore the dataset and answer some basic data-related questions.

Question 1: How many unique users are present in the above dataset?¶

In [61]:
# Find the number of unique users
ratings_with_title['userId'].nunique()
Out[61]:
610
  • There are 610 unique users in the dataset.

Question 2: What is the total number of unique movies?¶

In [62]:
# Find the number of unique movies
ratings_with_title['title'].nunique()
Out[62]:
9719
  • There are 9719 unique movies in the dataset.

To demonstrate the clustering-based recommendation system, we are going to use the surprise library in this case study.

  • Please use the following code to install the surprise library. You only need to do this once, when running the code for the first time.

!pip install surprise

In [63]:
# Installing the surprise package
!pip install surprise
Requirement already satisfied: surprise in /usr/local/lib/python3.7/dist-packages (0.1)
Requirement already satisfied: scikit-surprise in /usr/local/lib/python3.7/dist-packages (from surprise) (1.1.1)
Requirement already satisfied: joblib>=0.11 in /usr/local/lib/python3.7/dist-packages (from scikit-surprise->surprise) (1.1.0)
Requirement already satisfied: numpy>=1.11.2 in /usr/local/lib/python3.7/dist-packages (from scikit-surprise->surprise) (1.21.6)
Requirement already satisfied: six>=1.10.0 in /usr/local/lib/python3.7/dist-packages (from scikit-surprise->surprise) (1.15.0)
Requirement already satisfied: scipy>=1.0.0 in /usr/local/lib/python3.7/dist-packages (from scikit-surprise->surprise) (1.4.1)
In [64]:
# To compute the accuracy of models
from surprise import accuracy

# Class to parse a file containing ratings, data should be in structure - user; item; rating
from surprise.reader import Reader

# Class for loading datasets
from surprise.dataset import Dataset

# For tuning model hyperparameters
from surprise.model_selection import GridSearchCV

# For splitting the rating data in train and test dataset
from surprise.model_selection import train_test_split

# For implementing a clustering-based recommendation system
from surprise import CoClustering

Before building the recommendation systems, let's go over some basic terminology we are going to use:

Relevant item: An item (a movie in this case) whose actual rating is higher than the threshold rating (here 3.5) is relevant, and an item whose actual rating is lower than the threshold rating is a non-relevant item.

Recommended item: An item whose predicted rating is higher than the threshold (here 3.5) is a recommended item, and an item whose predicted rating is lower than the threshold rating is a non-recommended item, i.e., it will not be recommended to the user.

False Negative (FN): It is the frequency of relevant items that are not recommended to the user. If the relevant items are not recommended to the user, then the user might not buy the product/item. This would result in the loss of opportunity for the service provider, which they would like to minimize.

False Positive (FP): It is the frequency of recommended items that are actually not relevant. In this case, the recommendation system is not doing a good job of finding and recommending the relevant items to the user. This would result in loss of resources for the service provider, which they would also like to minimize.

Recall: It is the fraction of actually relevant items that are recommended to the user, i.e., if out of 10 relevant products, 6 are recommended to the user, then recall is 0.60. The higher the value of recall, the better the model. It is one of the standard metrics for assessing the performance of classification models.

Precision: It is the fraction of recommended items that are actually relevant, i.e., if out of 10 recommended items, 6 are found relevant by the user, then precision is 0.60. The higher the value of precision, the better the model. It is another standard metric for assessing the performance of classification models.

When building a recommendation system, it is customary to assess how many of its recommendations are actually relevant, and vice versa. Below are some of the most commonly used performance metrics for recommendation systems.

Precision@k, Recall@ k, and F1-score@k¶

Precision@k - It is the fraction of recommended items that are relevant in top k predictions. The value of k is the number of recommendations to be provided to the user. One can choose a variable number of recommendations to be given to a unique user.

Recall@k - It is the fraction of relevant items that are recommended to the user in top k predictions.

F1-score@k - It is the harmonic mean of Precision@k and Recall@k. When precision@k and recall@k both seem to be important, it is useful to use this metric because it is representative of both of them.
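As a toy illustration of these metrics (hypothetical predicted and actual ratings for a single user, threshold = 3.5, k = 3):

```python
# Hypothetical (predicted_rating, actual_rating) pairs for one user's items
ratings = [(4.5, 5.0), (4.0, 3.0), (3.8, 4.0), (3.0, 4.5), (2.5, 2.0)]

k = 3
threshold = 3.5

# Sort by predicted rating (descending) and take the top k predictions
top_k = sorted(ratings, key=lambda x: x[0], reverse=True)[:k]

# Relevant: actual rating >= threshold; recommended: predicted rating >= threshold
n_rel = sum(actual >= threshold for _, actual in ratings)
n_rec_k = sum(pred >= threshold for pred, _ in top_k)
n_rel_and_rec_k = sum(pred >= threshold and actual >= threshold
                      for pred, actual in top_k)

precision_at_k = n_rel_and_rec_k / n_rec_k if n_rec_k else 0   # 2/3
recall_at_k = n_rel_and_rec_k / n_rel if n_rel else 0          # 2/3
f1_at_k = (2 * precision_at_k * recall_at_k / (precision_at_k + recall_at_k)
           if (precision_at_k + recall_at_k) else 0)

print(round(precision_at_k, 3), round(recall_at_k, 3), round(f1_at_k, 3))
```

Here 2 of the 3 top-k recommendations are actually relevant, and 2 of the 3 relevant items appear in the top k, so precision@3 = recall@3 = F1-score@3 ≈ 0.667.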

Some useful functions¶

  • The function below takes the recommendation model as input and gives the precision@k, recall@k, and F1-score@k for that model.
  • To compute precision and recall, top k predictions are taken under consideration for each user.
  • We will use the precision and recall to compute the F1-score.
In [65]:
def precision_recall_at_k(model, k = 10, threshold = 3.5):
    """Return precision@k and recall@k metrics for each user"""

    # First map the predictions to each user
    user_est_true = defaultdict(list)
    
    # Making predictions on the test data (testset is taken from the global scope)
    predictions = model.test(testset)
    
    for uid, _, true_r, est, _ in predictions:
        user_est_true[uid].append((est, true_r))

    precisions = dict()
    recalls = dict()
    for uid, user_ratings in user_est_true.items():

        # Sort user ratings by estimated value
        user_ratings.sort(key = lambda x: x[0], reverse = True)

        # Number of relevant items
        n_rel = sum((true_r >= threshold) for (_, true_r) in user_ratings)

        # Number of recommended items in top k
        n_rec_k = sum((est >= threshold) for (est, _) in user_ratings[ : k])

        # Number of relevant and recommended items in top k
        n_rel_and_rec_k = sum(((true_r >= threshold) and (est >= threshold))
                              for (est, true_r) in user_ratings[ : k])

        # Precision@K: Proportion of recommended items that are relevant
        # When n_rec_k is 0, Precision is undefined. We here set Precision to 0 when n_rec_k is 0

        precisions[uid] = n_rel_and_rec_k / n_rec_k if n_rec_k != 0 else 0

        # Recall@K: Proportion of relevant items that are recommended
        # When n_rel is 0, Recall is undefined. We here set Recall to 0 when n_rel is 0

        recalls[uid] = n_rel_and_rec_k / n_rel if n_rel != 0 else 0
    
    # Mean of the per-user precisions
    precision = round((sum(prec for prec in precisions.values()) / len(precisions)), 3)

    # Mean of the per-user recalls
    recall = round((sum(rec for rec in recalls.values()) / len(recalls)), 3)
    
    accuracy.rmse(predictions)

    # Printing the overall precision
    print('Precision: ', precision)
    
    # Printing the overall recall
    print('Recall: ', recall)
    
    # Computing the F_1 score (harmonic mean of precision and recall),
    # guarding against division by zero when both are 0
    if (precision + recall) != 0:
        print('F_1 score: ', round((2 * precision * recall) / (precision + recall), 3))
    else:
        print('F_1 score: ', 0)

Below, we are loading the rating data, currently a pandas DataFrame, into surprise's own format (surprise.dataset.DatasetAutoFolds), which the library requires. To do this, we will be using the Reader and Dataset classes.

In [66]:
# Instantiating Reader scale with expected rating scale
reader = Reader(rating_scale = (0, 5))

# Loading the rating dataset
data = Dataset.load_from_df(rating[['userId', 'movieId', 'rating']], reader)

# Splitting the data into train and test dataset
trainset, testset = train_test_split(data, test_size = 0.2, random_state = 42)

Cluster-Based Recommendation System¶

In clustering-based recommendation systems, we explore the similarities and differences in people's tastes in movies based on how they rate different movies. We cluster similar users together and recommend movies to a user based on ratings from other users in the same cluster.

  • Co-clustering is a set of techniques in cluster analysis. Given some matrix A, we want to cluster the rows and the columns of A simultaneously; this is a common task for user-item matrices.

  • As it clusters both the rows and the columns simultaneously, it is also called bi-clustering. To understand the working of the algorithm, let A be an m x n matrix. The goal is to generate co-clusters: subsets of rows that exhibit similar behavior across subsets of columns, and vice versa.

  • Co-clustering learns two map functions simultaneously: one from rows to row-cluster indexes and one from columns to column-cluster indexes. This differs from other clustering techniques, where we would first cluster the rows and then the columns.
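A minimal sketch of the resulting prediction rule on toy data (hypothetical ratings, with cluster assignments fixed by hand for illustration; the actual algorithm learns these assignments iteratively, and surprise's CoClustering follows essentially this formula):

```python
# Prediction rule sketch:
#   r_hat(u, i) = avg(co-cluster) + (user mean - user-cluster mean)
#               + (item mean - item-cluster mean)

ratings = {  # (user, item) -> rating, toy data
    ('u1', 'i1'): 4.0, ('u1', 'i2'): 5.0,
    ('u2', 'i1'): 3.0, ('u2', 'i2'): 4.0,
    ('u3', 'i3'): 2.0,
}
user_cluster = {'u1': 0, 'u2': 0, 'u3': 1}   # hand-fixed assignments
item_cluster = {'i1': 0, 'i2': 0, 'i3': 1}

def mean(vals):
    vals = list(vals)
    return sum(vals) / len(vals)

def predict(u, i):
    uc, ic = user_cluster[u], item_cluster[i]
    # Average rating inside the (user cluster, item cluster) co-cluster
    cocluster_avg = mean(r for (uu, ii), r in ratings.items()
                         if user_cluster[uu] == uc and item_cluster[ii] == ic)
    # User/item means and their respective cluster means
    user_avg = mean(r for (uu, _), r in ratings.items() if uu == u)
    user_cluster_avg = mean(r for (uu, _), r in ratings.items()
                            if user_cluster[uu] == uc)
    item_avg = mean(r for (_, ii), r in ratings.items() if ii == i)
    item_cluster_avg = mean(r for (_, ii), r in ratings.items()
                            if item_cluster[ii] == ic)
    return cocluster_avg + (user_avg - user_cluster_avg) + (item_avg - item_cluster_avg)

print(predict('u1', 'i1'))  # 4.0
```

For the pair (u1, i1), the co-cluster average is 4.0, u1 rates 0.5 above its user cluster, and i1 is rated 0.5 below its item cluster, so the prediction comes out to 4.0.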

In [67]:
# Using CoClustering algorithm
clust_baseline = CoClustering(random_state = 1)

# Training the algorithm on the train set
clust_baseline.fit(trainset)

# Let us compute precision@k, recall@k, and F_1 score with k = 10
precision_recall_at_k(clust_baseline)
RMSE: 0.9490
Precision:  0.717
Recall:  0.502
F_1 score:  0.591
  • We have calculated the RMSE to check how far the overall predicted ratings are from the actual ratings.
  • The F_1 score of the baseline model is ~0.59. It indicates that most of the recommended movies were relevant, and most of the relevant movies were recommended. We will try to improve this later by tuning different hyperparameters of this algorithm using GridSearchCV.

Now, let's predict the rating for the user with userId = 4 and the movie with movieId = 10 as shown below. Here, the user has already interacted with (watched) the movie with movieId 10.

In [68]:
# Making prediction for userId 4 and movieId 10
clust_baseline.predict(4, 10, r_ui = 4, verbose = True)
user: 4          item: 10         r_ui = 4.00   est = 3.68   {'was_impossible': False}
Out[68]:
Prediction(uid=4, iid=10, r_ui=4, est=3.6757402992691386, details={'was_impossible': False})
  • The actual rating for this user-item pair is 4, and the rating predicted by the Co-clustering model is 3.68, which is close to the actual rating. The model has slightly underestimated the rating. We will try to fix this later by tuning the hyperparameters of the model using GridSearchCV.

Below, we are predicting the rating for the same userId = 4 but for a movie with which this user has not interacted before, i.e., movieId = 3, as shown below.

In [69]:
# Making prediction for userId 4 and movieId 3
clust_baseline.predict(4, 3, verbose = True)
user: 4          item: 3          r_ui = None   est = 3.26   {'was_impossible': False}
Out[69]:
Prediction(uid=4, iid=3, r_ui=None, est=3.258169827544438, details={'was_impossible': False})

Improving clustering-based recommendation system by tuning its hyperparameters¶

Below, we will be tuning hyperparameters for the CoClustering algorithm. Let's try to understand the different hyperparameters of this algorithm.

  • n_cltr_u (int) – Number of user clusters. The default value is 3.
  • n_cltr_i (int) – Number of item clusters. The default value is 3.
  • n_epochs (int) – Number of iterations of the optimization loop. The default value is 20.
  • random_state (int, RandomState instance from NumPy, or None) – Determines the RNG that will be used for initialization. If int, random_state will be used as a seed for a new RNG. This is useful to get the same initialization over multiple calls to fit(). If RandomState instance, this same instance is used as RNG. If None, the current RNG from NumPy is used. The default value is None.
  • verbose (bool) – If True, the current epoch will be printed. The default value is False.
In [70]:
# Set the parameter space to tune
param_grid = {'n_cltr_u': [3, 4, 5, 6], 'n_cltr_i': [3, 4, 5, 6], 'n_epochs': [30, 40, 50]}

# Performing 3-Fold gridsearch cross-validation
gs = GridSearchCV(CoClustering, param_grid, measures = ['rmse'], cv = 3, n_jobs = -1)

# Fitting data
gs.fit(data)

# Printing the best RMSE score
print(gs.best_score['rmse'])

# Printing the combination of parameters that gives the best RMSE score
print(gs.best_params['rmse'])
0.9570102979396152
{'n_cltr_u': 3, 'n_cltr_i': 3, 'n_epochs': 30}

Once the grid search is complete, we can get the optimal values for each of those hyperparameters as shown above.

We will build the final model using the tuned hyperparameter values obtained from the grid search cross-validation above.

In [71]:
# Using tuned Coclustering algorithm
clust_tuned = CoClustering(n_cltr_u = 3, n_cltr_i = 3, n_epochs = 30, random_state = 1)

# Training the algorithm on the train set
clust_tuned.fit(trainset)

# Let us compute precision@k, recall@k, and F_1 score with k = 10
precision_recall_at_k(clust_tuned)
RMSE: 0.9499
Precision:  0.715
Recall:  0.5
F_1 score:  0.588
  • We can see that the F_1 score for tuned co-clustering model is slightly lower than the F_1 score for the baseline Co-clustering model.

Let's now predict the rating for the user with userId = 4 and for the movie with movieId = 10 as shown below. Here, the user has already rated the movie.

In [72]:
# Using co-clustering_optimized model to recommend for userId 4 and movieId 10
clust_tuned.predict(4, 10, r_ui = 4, verbose = True)
user: 4          item: 10         r_ui = 4.00   est = 3.65   {'was_impossible': False}
Out[72]:
Prediction(uid=4, iid=10, r_ui=4, est=3.6549811878747214, details={'was_impossible': False})

Below, we are predicting the rating for the same userId = 4 but for a movie with which this user has not interacted before, i.e., movieId = 3, as shown below.

In [73]:
# Using Co-clustering based optimized model
clust_tuned.predict(4, 3, verbose = True)
user: 4          item: 3          r_ui = None   est = 3.23   {'was_impossible': False}
Out[73]:
Prediction(uid=4, iid=3, r_ui=None, est=3.2318175022354345, details={'was_impossible': False})

Implementing the recommendation algorithm based on the optimized CoClustering model¶

Below we will be implementing a function whose input parameters are:

  • data: A rating dataset
  • user_id: The user id for which we want recommendations
  • top_n: The number of movies we want to recommend
  • algo: The algorithm we want to use for predicting the ratings

The function returns the top_n items recommended for the given user id based on the given algorithm.
In [74]:
def get_recommendations(data, user_id, top_n, algo):
    
    # Creating an empty list to store the recommended movie IDs
    recommendations = []
    
    # Creating an user-item interactions matrix 
    user_item_interactions_matrix = data.pivot(index = 'userId', columns = 'movieId', values = 'rating')
    
    # Extracting those movie IDs which the userId has not interacted yet
    non_interacted_movies = user_item_interactions_matrix.loc[user_id][user_item_interactions_matrix.loc[user_id].isnull()].index.tolist()
    
    # Looping through each of the movie IDs which userId has not interacted yet
    for item_id in non_interacted_movies:
        
        # Predicting the ratings for those non interacted movie IDs by this user
        est = algo.predict(user_id, item_id).est
        
        # Appending the predicted ratings
        recommendations.append((item_id, est))

    # Sorting the predicted ratings in descending order
    recommendations.sort(key = lambda x: x[1], reverse = True)

    # Returning the top n highest predicted rating movies for this user
    return recommendations[:top_n]
In [75]:
# Getting top 5 recommendations for userId 4 using Co-clustering based optimized algorithm
clustering_recommendations = get_recommendations(rating, 4, 5, clust_tuned)

Correcting the Ratings and Ranking the above movies¶

While comparing two movies, the predicted rating alone does not capture how much a user is likely to enjoy a movie; the number of users who have rated that movie also matters, because an average based on only a few ratings is less reliable. Due to this, we calculate a "corrected_rating" for each movie by subtracting a penalty that is inversely proportional to the square root of the movie's rating_count, so that movies rated by only a handful of users are ranked more conservatively.
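As a quick numeric sketch of this correction (hypothetical predicted ratings and rating counts):

```python
import math

def corrected_rating(predicted_rating, rating_count):
    # Penalize movies rated by few users: the smaller the rating_count,
    # the larger the 1 / sqrt(n) penalty
    return predicted_rating - 1 / math.sqrt(rating_count)

# A movie predicted at 4.0 but rated by only 3 users...
low_support = corrected_rating(4.0, 3)      # ≈ 3.42
# ...now ranks below a movie predicted at 3.9 but rated by 400 users
high_support = corrected_rating(3.9, 400)   # = 3.85
print(round(low_support, 2), round(high_support, 2))
```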

In [76]:
def ranking_movies(recommendations, final_rating):
  
    # Sort the movies based on ratings count
    ranked_movies = final_rating.loc[[items[0] for items in recommendations]].sort_values('rating_count', ascending = False)[['rating_count']].reset_index()

    # Merge with the recommended movies to get predicted ratings
    ranked_movies = ranked_movies.merge(pd.DataFrame(recommendations, columns = ['movieId', 'predicted_ratings']), on = 'movieId', how = 'inner')

    # Rank the movies based on corrected ratings
    ranked_movies['corrected_ratings'] = ranked_movies['predicted_ratings'] - 1 / np.sqrt(ranked_movies['rating_count'])

    # Sort the movies based on corrected ratings
    ranked_movies = ranked_movies.sort_values('corrected_ratings', ascending = False)

    return ranked_movies

Note: In the corrected-rating formula above, we could add the quantity 1 / np.sqrt(n) instead of subtracting it to get more optimistic predictions. But here we subtract it because some movies have a rating of 5, and a movie's rating cannot exceed 5.

In [77]:
# Ranking movies based on the above recommendations
ranking_movies(clustering_recommendations, final_rating)
Out[77]:
movieId rating_count predicted_ratings corrected_ratings
0 304 3 5 4.422650
1 53 2 5 4.292893
2 99 2 5 4.292893
3 238 2 5 4.292893
4 148 1 5 4.000000

Let us now move to the final recommendation algorithm: the content-based recommendation system.

Content-Based Recommendation System¶

In a content-based recommendation system, we will use a text feature to find similar movies.

Text data generally contains punctuation, stopwords, and non-ASCII characters, which make it very noisy. So, we will first need to pre-process the text, and then we will generate features from it to compute similarities between the texts.

Let's load the tags dataset.

In [78]:
# Importing the tags data
tags = pd.read_csv('/content/drive/MyDrive/tags.csv')
tags.head()
Out[78]:
userId movieId tag timestamp
0 2 60756 funny 1445714994
1 2 60756 Highly quotable 1445714996
2 2 60756 will ferrell 1445714992
3 2 89774 Boxing story 1445715207
4 2 89774 MMA 1445715200

In this dataset, we don't have any movie reviews or plot summaries, so we will combine the title and genres columns from the other datasets with the tag column from the tags dataset to create a text feature. We will then apply the TF-IDF technique to extract features from this text, which we will later use to find similar movies.

In [79]:
# Merging all the three datasets on movieId
ratings_with_title = pd.merge(ratings, movies[['movieId', 'title', 'genres']], on = 'movieId' )

final_ratings = pd.merge(ratings_with_title, tags[['movieId', 'tag']], on = 'movieId' )

# Let us see the dataset
final_ratings
Out[79]:
userId movieId rating timestamp title genres tag
0 1 1 4.0 964982703 Toy Story (1995) Adventure|Animation|Children|Comedy|Fantasy pixar
1 1 1 4.0 964982703 Toy Story (1995) Adventure|Animation|Children|Comedy|Fantasy pixar
2 1 1 4.0 964982703 Toy Story (1995) Adventure|Animation|Children|Comedy|Fantasy fun
3 5 1 4.0 847434962 Toy Story (1995) Adventure|Animation|Children|Comedy|Fantasy pixar
4 5 1 4.0 847434962 Toy Story (1995) Adventure|Animation|Children|Comedy|Fantasy pixar
... ... ... ... ... ... ... ...
233208 599 176419 3.5 1516604655 Mother! (2017) Drama|Horror|Mystery|Thriller uncomfortable
233209 599 176419 3.5 1516604655 Mother! (2017) Drama|Horror|Mystery|Thriller unsettling
233210 594 7023 4.5 1108972356 Wedding Banquet, The (Xi yan) (1993) Comedy|Drama|Romance In Netflix queue
233211 606 6107 4.0 1171324428 Night of the Shooting Stars (Notte di San Lore... Drama|War World War II
233212 606 6516 3.5 1171755910 Anastasia (1956) Drama In Netflix queue

233213 rows × 7 columns

  • We can observe that multiple genres are separated by the | character, which we need to replace with a space.
  • We will combine the three columns title, genres, and tag.
In [80]:
# Replacing | character with space in genres column
final_ratings['genres'] = final_ratings['genres'].apply(lambda x: " ".join(x.split('|')))
In [81]:
# Combining title, genres, and tag columns
final_ratings['text'] = final_ratings['title'] + ' ' + final_ratings['genres'] + ' ' + final_ratings['tag']

final_ratings.head()
Out[81]:
userId movieId rating timestamp title genres tag text
0 1 1 4.0 964982703 Toy Story (1995) Adventure Animation Children Comedy Fantasy pixar Toy Story (1995) Adventure Animation Children ...
1 1 1 4.0 964982703 Toy Story (1995) Adventure Animation Children Comedy Fantasy pixar Toy Story (1995) Adventure Animation Children ...
2 1 1 4.0 964982703 Toy Story (1995) Adventure Animation Children Comedy Fantasy fun Toy Story (1995) Adventure Animation Children ...
3 5 1 4.0 847434962 Toy Story (1995) Adventure Animation Children Comedy Fantasy pixar Toy Story (1995) Adventure Animation Children ...
4 5 1 4.0 847434962 Toy Story (1995) Adventure Animation Children Comedy Fantasy pixar Toy Story (1995) Adventure Animation Children ...

Now, we will keep only the userId, movieId, rating, title, and text columns. We will drop the duplicate titles from the data and make the title column the index of the DataFrame.

In [82]:
# Create the final_ratings dataset with specified columns
final_ratings = final_ratings[['userId', 'movieId', 'rating', 'title', 'text']]

# Let us drop the duplicate records
final_ratings = final_ratings.drop_duplicates(subset = ['title'])

# Set the index
final_ratings = final_ratings.set_index('title')

# See the first five records of the dataset
final_ratings.head()
Out[82]:
userId movieId rating text
title
Toy Story (1995) 1 1 4.0 Toy Story (1995) Adventure Animation Children ...
Grumpier Old Men (1995) 1 3 4.0 Grumpier Old Men (1995) Comedy Romance moldy
Seven (a.k.a. Se7en) (1995) 1 47 5.0 Seven (a.k.a. Se7en) (1995) Mystery Thriller m...
Usual Suspects, The (1995) 1 50 5.0 Usual Suspects, The (1995) Crime Mystery Thril...
Bottle Rocket (1996) 1 101 5.0 Bottle Rocket (1996) Adventure Comedy Crime Ro...
In [83]:
# Let us see the shape of final_ratings data
final_ratings.shape
Out[83]:
(1554, 4)

Now, let's process the text data and create features to find the similarity between movies.

Loading libraries to handle the text data¶

In [84]:
# Importing nltk (natural language toolkit library)
import nltk

# Downloading the punkt tokenizer models
nltk.download('punkt')

# Downloading stopwords
nltk.download('stopwords')

# Downloading wordnet
nltk.download('wordnet') 
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
Out[84]:
True
In [85]:
# Importing the regular expression library
import re

# Word_tokenize is used to do tokenization
from nltk import word_tokenize

# Importing the lemmatizer
from nltk.stem import WordNetLemmatizer

# Importing the stopwords
from nltk.corpus import stopwords

# Tfidf vectorizer used to create the computational vectors
from sklearn.feature_extraction.text import TfidfVectorizer

We will create a function to pre-process the text data. Before that, let's see some terminology.

  • Stopwords: A stopword is a commonly used word (such as “the”, “a”, “an”, or “in”) that carries little information in the text and can be ignored.
  • Lemmatization: Lemmatization in linguistics is the process of grouping together the inflected forms of a word so they can be analyzed as a single item. For example, runs, running, and ran are all forms of the word run; therefore, run is the lemma of all these words.
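A minimal sketch of both steps (using a toy stopword set and a toy lemma dictionary purely for illustration; the notebook uses NLTK's English stopword list and WordNetLemmatizer instead):

```python
# Toy stopword set and lemma dictionary, for illustration only
stop_words = {'the', 'a', 'an', 'in', 'of', 'is', 'and'}
lemma_map = {'runs': 'run', 'running': 'run', 'ran': 'run', 'movies': 'movie'}

def preprocess(text):
    tokens = text.lower().split()
    tokens = [t for t in tokens if t not in stop_words]   # drop stopwords
    return [lemma_map.get(t, t) for t in tokens]          # map words to lemmas

print(preprocess('The running of the movies'))  # ['run', 'movie']
```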
In [86]:
# Create the tokenize function
def tokenize(text):
    
    # Converting to lowercase and removing non-alphabetic characters
    text = re.sub(r"[^a-zA-Z]", " ", text.lower())
    
    # Extracting each word in the text
    tokens = word_tokenize(text)
    
    # Removing stopwords
    words = [word for word in tokens if word not in stopwords.words("english")]
    
    # Lemmatize the words
    text_lems = [WordNetLemmatizer().lemmatize(lem).strip() for lem in words]

    return text_lems

Feature Extraction¶

Below are some of the ways to extract features from texts:

  • Bag of words
  • TF-IDF
  • One hot encoding
  • Word vectors


Here, we will be using TF-IDF as a feature extraction technique.
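To see what TF-IDF actually computes, here is a hand-coded sketch on three tiny, hypothetical movie texts (using the smoothed IDF formula, idf = ln((1 + n) / (1 + df)) + 1, followed by L2 row normalization, which matches the default behaviour of sklearn's TfidfVectorizer):

```python
import math

docs = [
    'toy story adventure comedy',
    'jumanji adventure fantasy',
    'grumpier comedy romance',
]
tokenized = [d.split() for d in docs]
vocab = sorted({w for doc in tokenized for w in doc})
n_docs = len(docs)

def df(term):
    # Document frequency: number of documents containing the term
    return sum(term in doc for doc in tokenized)

def tfidf_row(doc):
    # Raw term count times smoothed IDF, then L2-normalized
    weights = [doc.count(t) * (math.log((1 + n_docs) / (1 + df(t))) + 1)
               for t in vocab]
    norm = math.sqrt(sum(w * w for w in weights))
    return [w / norm for w in weights]

matrix = [tfidf_row(doc) for doc in tokenized]
```

Because 'adventure' appears in two of the three documents, its weight in the first row is lower than that of document-specific terms like 'toy' and 'story'; rare terms discriminate between documents, so TF-IDF up-weights them.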

In [87]:
# Creating the TF-IDF object
tfidf = TfidfVectorizer(tokenizer = tokenize)

movie_tfidf = tfidf.fit_transform(final_ratings['text'].values).toarray()
In [88]:
# Making the DataFrame of movie_tfidf data
pd.DataFrame(movie_tfidf)
Out[88]:
0 1 2 3 4 5 6 7 8 9 ... 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782
0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
1 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
2 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
3 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
4 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
1549 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
1550 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
1551 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
1552 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
1553 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0

1554 rows × 2783 columns

We have extracted features from the text data. Now, we can find similarities between movies using these features. We will use cosine similarity to calculate the similarity.
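Cosine similarity is just the cosine of the angle between two feature vectors: 1 means the vectors point in the same direction (same terms in the same proportions), 0 means no overlap at all. A hand-coded sketch on hypothetical 3-term vectors:

```python
import math

def cosine(a, b):
    # Dot product divided by the product of the vector norms
    dot = sum(x * y for x, y in zip(a, b))
    norm_a = math.sqrt(sum(x * x for x in a))
    norm_b = math.sqrt(sum(x * x for x in b))
    return dot / (norm_a * norm_b)

movie_a = [0.8, 0.6, 0.0]   # shares terms with movie_b
movie_b = [0.6, 0.8, 0.0]
movie_c = [0.0, 0.0, 1.0]   # no overlapping terms with movie_a

print(round(cosine(movie_a, movie_b), 3))  # 0.96
print(cosine(movie_a, movie_c))            # 0.0
```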

In [89]:
# Calculating the cosine similarity
similar_movies = cosine_similarity(movie_tfidf, movie_tfidf)

# Let us see the above array
similar_movies
Out[89]:
array([[1.        , 0.02268393, 0.        , ..., 0.02022472, 0.        ,
        0.        ],
       [0.02268393, 1.        , 0.        , ..., 0.04779055, 0.        ,
        0.        ],
       [0.        , 0.        , 1.        , ..., 0.        , 0.        ,
        0.        ],
       ...,
       [0.02022472, 0.04779055, 0.        , ..., 1.        , 0.00719396,
        0.19617374],
       [0.        , 0.        , 0.        , ..., 0.00719396, 1.        ,
        0.01217017],
       [0.        , 0.        , 0.        , ..., 0.19617374, 0.01217017,
        1.        ]])

Finally, let's create a function to find the most similar movies to recommend for a given movie.

In [90]:
# Function that takes in movie title as input and returns the top 10 recommended movies
def recommendations(title, similar_movies):
    
    recommended_movies = []
    
    indices = pd.Series(final_ratings.index)
    
    # Getting the index of the movie that matches the title
    idx = indices[indices == title].index[0]

    # Creating a Series with the similarity scores in descending order
    score_series = pd.Series(similar_movies[idx]).sort_values(ascending = False)

    # Getting the indices of 10 most similar movies
    top_10_indexes = list(score_series.iloc[1 : 11].index)
    print(top_10_indexes)
    
    # Populating the list with the titles of the best 10 matching movies
    for i in top_10_indexes:
        recommended_movies.append(list(final_ratings.index)[i])
        
    return recommended_movies
In [91]:
recommendations('Usual Suspects, The (1995)', similar_movies)
[71, 1186, 124, 551, 569, 77, 719, 766, 123, 658]
Out[91]:
['Game, The (1997)',
 'Andalusian Dog, An (Chien andalou, Un) (1929)',
 'Town, The (2010)',
 'Now You See Me (2013)',
 'Charade (1963)',
 'Negotiator, The (1998)',
 'Following (1998)',
 '21 Grams (2003)',
 'Inception (2010)',
 'Insomnia (2002)']
  • The movie belongs to the Crime, Mystery, and Thriller genres, and the majority of our recommendations lie in one or more of these genres, which suggests that the resulting recommendation system is working well.

Conclusion¶

  • In this case study, we built recommendation systems using two different algorithms. They are as follows:

    • Clustering-based recommendation system
    • Content-based recommendation system
  • To demonstrate the clustering-based recommendation system, the surprise library has been used. Grid search cross-validation was applied to find the best working model, and the corresponding predictions were made with it.
  • To evaluate the performance of these models, precision@k and recall@k were used in this case study. Using these two metrics, the F_1 score was calculated for each working model.
  • We can try to further improve the performance of these models using hyperparameter tuning.