%%HTML
<script src="require.js"></script>
from IPython.display import display, HTML, clear_output
import ipywidgets as widgets
HTML('''<script src="https://cdnjs.cloudflare.com/ajax/libs/jquery/2.0.3/jquery.min.js"></script><script>
code_show=true;
function code_toggle() {
if (code_show){
$('div.jp-CodeCell > div.jp-Cell-inputWrapper').hide();
} else {
$('div.jp-CodeCell > div.jp-Cell-inputWrapper').show();
}
code_show = !code_show
}
$( document ).ready(code_toggle);</script><form action="javascript:code_toggle()"><input type="submit" value="Click here to toggle on/off the raw code."></form>
''')
In an era of abundant streaming services, finding movies similar to one just watched poses a challenge. Relying solely on genre for recommendations may disappoint users, as it overlooks nuanced, less obvious features and can suggest irrelevant films that merely share a genre. Using all of a movie's features as a query instead, which is the underpinning of an Information Retrieval System for movies, a type of movie recommender system, addresses the lack of nuance in purely genre-based recommenders. Accordingly, our study explores which characteristics, features, and parameters make an Information Retrieval System built on a movie dataset effective.
The team processed a dataset comprising 128 movies scraped from Rotten Tomatoes, employing techniques such as one-hot encoding, TF-IDF normalization, and min-max normalization for data cleaning. Within this dataset, Drama emerged as the most frequent genre, and English was the most commonly used language. The movie list was purposely limited to make evaluating the effectiveness of the Information Retrieval systems feasible.
Two Information Retrieval systems were then created: one with "optimized" vectorizer settings (including stemming, word normalization, and maximum and minimum document frequency requirements) and one with default vectorizer settings. These were further divided by distance measure: Euclidean, Manhattan, and Cosine.
In evaluating the variants, the optimal k parameter was determined using a Precision-Recall vs. k graph, which informed the Average Precision, Average Recall, and Average F-1 score at k. Additional metrics included Mean Average Precision, R-Precision, and the Eleven-Point Precision-Recall Curve. Key findings indicated that (1) the default vectorizer setting is optimal, (2) cosine distance is the preferred measure, (3) larger datasets lead to higher metrics, and (4) a k of 5 is roughly the optimal number of results. Moreover, analysis of the R-Precision of individual queries suggested that increasing the number of test queries would balance coincidentally well-representative queries against less-representative ones, removing possible skew in the metrics, and that adding data points raises the metrics by providing more chances of capturing salient features of the data.
Recommendations for future studies include (1) increasing the size of the dataset, (2) creating hypotheses for each change in setting, (3) increasing the number of test queries, (4) improving the gold standard by recruiting external "movie watchers", and (5) comparing cosine distance with distance measures not studied in this report. These would allow a more extensive exploration of the interplay among the characteristics of the Information Retrieval System, its dataset, and the entertainment domain, through which the optimal settings for, and ultimately a better, movie recommender system can be found.
Import Libraries
The following libraries and functions were imported:
- Pandas for managing dataframes and tabular data
- Requests for acquiring access to the relevant Rotten Tomatoes webpages
- Beautiful Soup 4 for scraping the Rotten Tomatoes webpages
- Regular Expression for utilizing patterns with the HTML script during scraping
- OS for managing downloaded webpages from Rotten Tomatoes
- SQLite3 for accessing and querying created databases
- sklearn's CountVectorizer for vectorizing document columns
- nltk's EnglishStemmer for stemming the words in the documents
- sklearn's TfidfTransformer for normalizing the token frequencies into TF-IDF
- matplotlib's pyplot for plotting some of the relevant visualizations in the paper
- Seaborn for plotting the other relevant visualizations in the paper
- sklearn's MinMaxScaler for scaling the continuous variables
- Warnings for suppressing warnings to keep the notebook clean
- scipy's distance module for measuring distance metrics
- scipy's trapz for calculating the area under the curve
- Numpy for faster manipulation of matrices
- IPython for displaying purposes
- ipywidgets for creating interactive widgets
import pandas as pd
import bs4
import requests
import re
import os
import sqlite3
from sklearn.feature_extraction.text import CountVectorizer
from nltk.stem.snowball import EnglishStemmer
from sklearn.feature_extraction.text import TfidfTransformer
from matplotlib import pyplot as plt
from sklearn.preprocessing import MinMaxScaler
import numpy as np
from scipy.spatial.distance import euclidean, cityblock, cosine
from scipy.integrate import trapz
import warnings
import seaborn as sns
Define Functions
The following functions were defined:
- get_info for getting the information for each movie, coming from an HTML file
- get_all_movie_info for creating a dataframe for the movies' details
- one_hot_encode for one-hot encoding all categorical columns
- movie_sql for creating and updating a database from the collected and processed data
- load_sql for converting the table in the database to a dataframe
- sort_idf for acquiring the idf of each token and collecting additional potential stopwords
- preprocessor for preprocessing the documents
- bag_of_words for converting documents into bag-of-words
- normalize_tfidf for normalizing bag-of-word vectors to TFIDF
- nearest_k for returning the results of the IR
- kappa for calculating the kappa statistic and presenting the relevance matrix
- pk for calculating the Precision @ k
- a_pk for calculating the Average (Precision @ K)
- rk for calculating the Recall @ k
- a_rk for calculating the Average (Recall @ k)
- apk for calculating the Average Precision @ k
- mapk for calculating the Mean Average Precision @ k
- mf_onek for calculating the Mean F-1 Score @ k
- rprec for calculating the individual R-Precisions
- ave_rprec for calculating the Average R-Precision
- ave_prg for plotting the 11-point Precision/Recall graph
- pretty_print for printing dataframes more cleanly
- recommend for recommending movie titles based on a title
- search_k_title for listing all relevant movies based on k and title
def get_info(html_file):
"""Get the information for each movie, which comes from an html file.
Parameters
-------
html_file : str
Name of html file
Returns
-------
mydict : dict
Dictionary which contains all features of a movie
"""
with open(html_file) as f:
html_content = f.read()
soup = bs4.BeautifulSoup(html_content)
title = soup.head.title.text.replace(' - Rotten Tomatoes', '')
synopsis = soup.find('p', {'data-qa': 'movie-info-synopsis'}).text
genre_raw = soup.find('span', {'class': 'genre'}).text
genre = genre_raw.replace('\n', '').replace(' ', '').split(',')
runtime_raw = soup.find('b', string='Runtime:').find_parent() \
.find('span').find('time').text.strip()
match = re.match(r'(\d+)h (\d+)m', runtime_raw)
runtime = int(match.group(1)) * 60 + int(match.group(2)) if match else 0
language = soup.find('b', string='Original Language:') \
.find_parent().find('span').text
top_critics = soup.find_all(
'review-speech-balloon-deprecated',
{'istopcritic': 'true'}
)
critic = '\n\n'.join(tc.get('reviewquote', '') for tc in top_critics)
mydict = {
'title': title,
'synopsis': synopsis,
'genre': genre,
'runtime': runtime,
'language': language,
'critic': critic
}
return mydict
def get_all_movie_info():
"""Create a dataframe for the movies' details.
Returns
-------
    df : df
        Data Frame containing all the movies' details
"""
movie_list = []
folder_path = "movies"
for filename in os.listdir(folder_path):
file_path = os.path.join(folder_path, filename)
if os.path.isfile(file_path):
movie_list.append(get_info(file_path))
print('done with ', file_path)
return pd.DataFrame(movie_list)
def one_hot_encode(movies):
"""For categorical columns, such as genre and language, one-hot-encode
them. And return the new dataframe with one-hot-encoded categorical
variables.
Parameters
-------
movies : df
Data Frame containing all the movies and their features
Returns
-------
movies_encoded : df
Data Frame where the categorical features from the original Data Frame
are one-hot encoded
"""
genre_dummies = pd.get_dummies(
movies['genre'].apply(pd.Series).stack()).groupby(level=0).sum()
genre_dummies = genre_dummies.add_prefix('g_')
language_dummies = pd.get_dummies(
movies['language'], dtype='int').add_prefix('lang_')
movies_encoded = pd.concat([movies, genre_dummies, language_dummies],
axis=1)
movies_encoded.drop(['genre', 'language'], axis=1, inplace=True)
movies_encoded.columns = movies_encoded.columns.str.lower()
return movies_encoded
def movie_sql(df):
"""Create a table from a Data Frame which will be placed inside a
SQLite3 database
Parameters
-------
df : df
Data Frame containing all the movies and their features
"""
db_file = 'rotten_tomatoes.db'
conn = sqlite3.connect(db_file)
df.to_sql('movie_encoded', conn, index=False, if_exists='replace',)
conn.close()
def load_sql(db='rotten_tomatoes.db'):
"""Return the Data Frame from the table in the SQLite3 database
Parameters
-------
db : str, optional
Name of the SQLite3 database
Returns
-------
df : df
Data Frame which was loaded from the SQLite3 database
"""
conn = sqlite3.connect(db)
cursor = conn.cursor()
df = pd.read_sql("SELECT * FROM movie_encoded", conn)
cursor.close()
conn.close()
return df
def sort_idf(df, cv):
    """Calculate the idf of each token, insert the values into a Data Frame,
    sort them in ascending order, and return the Data Frame.
Parameters
-------
df : df
Data Frame containing all the movies and their features
cv : CountVectorizer
Fitted CountVectorizer containing the feature names
Returns
-------
df_idf : df
Data Frame containing the sorted idfs
"""
tfidf_transformer = TfidfTransformer(smooth_idf=True, use_idf=True)
tfidf_transformer.fit(df)
df_idf = pd.DataFrame(tfidf_transformer.idf_,
index=cv.get_feature_names_out(),
columns=["idf_weights"])
df_idf = df_idf.sort_values(by='idf_weights')
return df_idf
def preprocessor(text):
    """Return a preprocessed document which is stemmed, lowercased, and
    stripped of special characters, and where certain words are normalized
    to a common token `_connector_`.
Parameters
-------
text : str
Document
Returns
-------
str
Preprocessed document
"""
english_stemmer = EnglishStemmer()
text = text.lower()
# remove special chars
text = re.sub(r"\W", " ", text)
# normalize certain words
text = re.sub(r"\s+(in|the|all|for|and|on)\s+", " _connector_ ", text)
# stem words
    words = re.split(r"\s+", text)
stemmed_words = [english_stemmer.stem(word=word) for word in words]
return ' '.join(stemmed_words)
def bag_of_words(srs, mode, stop_extend=None):
"""Return the vectorized document, whose preprocessing settings differ
based on the mode and for the "optimal" preprocessing, whose stop words
may be extended by `stop_extend`.
Parameters
-------
srs : Series
Series containing the document
mode : str
Default or `Optimal`
stop_extend : list, optional
List of extra stop words
Returns
-------
vectorizer : CountVectorizer
Fitted CountVectorizer
bowmatrix : array-like
Bag-of-word matrix
"""
with open('minimal-stop.txt', 'r') as stop:
lst_temp = stop.readlines()
lst_stop = [i.replace('\n', '') for i in lst_temp]
if stop_extend is not None:
lst_stop.extend(stop_extend)
if mode == 'default':
vectorizer = CountVectorizer()
bowmatrix = vectorizer.fit_transform(srs)
return vectorizer, bowmatrix
else:
vectorizer = CountVectorizer(preprocessor=preprocessor,
stop_words=lst_stop,
ngram_range=(1, 2),
max_df=0.9,
min_df=0.1,)
bowmatrix = vectorizer.fit_transform(srs)
return vectorizer, bowmatrix
def normalize_tfidf(df):
"""Normalize the frequencies of each token column by its TF-IDF.
Parameters
-------
df : df
Data Frame containing the bag-of-word matrix
Returns
-------
tfidf_transformer : TfidfTransformer
Fitted transformer for TF-IDF
df_idf : df
Data Frame containing the TF-IDF normalized matrix
"""
tfidf_transformer = TfidfTransformer(smooth_idf=True, use_idf=True)
df_idf = tfidf_transformer.fit_transform(df)
return tfidf_transformer, df_idf
def nearest_k(query, objects, k, dist):
"""Return the indices to objects most similar to query
Parameters
----------
query : ndarray
query object represented in the same form vector representation as the
objects
objects : ndarray
vector-represented objects in the database; rows correspond to
objects, columns correspond to features
k : int
number of most similar objects to return
dist : function
accepts two ndarrays as parameters then returns their distance
Returns
-------
most_similar : ndarray
Indices to the most similar objects in the database
"""
return np.argsort([dist(query, obj) for obj in objects], kind="stable")[:k]
def kappa(df1, df2, name1, name2):
"""Compute the kappa statistic for a given pair of judges and return both
the kappa statistic and relevance matrix
Parameters
----------
df1 : df
Data Frame containing the first judge's votes
df2 : df
Data Frame containing the second judge's votes
name1 : str
Name of first judge
name2 : str
Name of second judge
Returns
-------
kappa : float
Kappa statistic
rel_mat : df
Relevance matrix
"""
yes_yes = 0
yes_no = 0
no_yes = 0
no_no = 0
df1 = df1.notnull().astype('int')
df2 = df2.notnull().astype('int')
for (index1, row1), (index2, row2) in zip(df1.iterrows(), df2.iterrows()):
for column in df1.columns:
if row1[column] == 1 and row1[column] == row2[column]:
yes_yes += 1
elif row1[column] > row2[column]:
yes_no += 1
elif row1[column] < row2[column]:
no_yes += 1
elif row1[column] == 0 and row1[column] == row2[column]:
no_no += 1
yes_tup1 = (name1, 'Yes')
yes_tup2 = (name2, 'Yes')
no_tup1 = (name1, 'No')
no_tup2 = (name2, 'No')
total_tup1 = (name1, 'Total')
total_tup2 = (name2, 'Total')
index = pd.MultiIndex.from_tuples([yes_tup1, no_tup1, total_tup1])
cols = pd.MultiIndex.from_tuples([yes_tup2, no_tup2, total_tup2])
rel_mat = pd.DataFrame([[yes_yes, yes_no, yes_yes + yes_no],
[no_yes, no_no, no_yes + no_no],
[yes_yes + no_yes, yes_no + no_no, (
yes_yes + yes_no + no_yes + no_no)]],
columns=cols,
index=index)
# Getting kappa statistic
total = yes_yes + no_no + yes_no + no_yes
p_a = (yes_yes + no_no) / total
p_nonrelevant = (yes_no + no_no + no_yes + no_no) / (total * 2)
p_relevant = (yes_yes + yes_no + yes_yes + no_yes) / (total *2)
p_e = (p_nonrelevant ** 2) + (p_relevant ** 2)
kappa = (p_a - p_e) / (1 - p_e)
return kappa, rel_mat
def pk(df, y_test, y_pred, k=5):
"""Compute precision @ k for an input boolean dataframe
Parameters
----------
df : df
Data Frame containing boolean columns y_text and y_pred
y_test : str
Name of column containing actual relevance in binary where 0 is
irrelevant and 1 is relevant
y_pred : str
Name of column containing ones since the returned results are
essentially considered as relevant by the system
k : int, optional
Integer number of items to consider
Returns
-------
float
Number of precision value for k items
"""
# extract the k rows
dfK = df.head(k)
# compute number of recommended items @ k
denominator = dfK[y_pred].sum()
# compute number of recommended items that are relevant @ k
numerator = dfK[y_test].sum()
# return result
if denominator > 0:
return numerator/denominator
else:
return None
def rk(df, y_test, y_pred, k=5):
"""Compute recall @ k for an input boolean dataframe
Parameters
----------
df : df
Data Frame containing boolean columns y_text and y_pred
y_test : str
Name of column containing actual relevance in binary where 0 is
irrelevant and 1 is relevant
y_pred : str
Name of column containing ones since the returned results are
essentially considered as relevant by the system
k : int, optional
Integer number of items to consider
Returns
-------
float
Number of recall value for k items
"""
# extract the k rows
dfK = df.head(k)
# compute number of all relevant items
denominator = df[y_test].sum()
# compute number of recommended items that are relevant @ k
numerator = dfK[y_test].sum()
# return result
if denominator > 0:
return numerator/denominator
else:
return None
def a_pk(lst_pk):
"""Compute average (precision @ k)
Parameters
----------
lst_pk : list
List of precision values @ k
Returns
-------
float
Average (Precision @ k)
"""
return np.mean(lst_pk)
def a_rk(lst_rk):
"""Compute average (recall @ k)
Parameters
----------
    lst_rk : list
List of recall values @ k
Returns
-------
float
Average (Recall @ k)
"""
return np.mean(lst_rk)
def apk(actual, predicted, k=10):
"""
    Compute the average precision at k between two lists of items.
Parameters
----------
actual : list
A list of elements that are to be predicted (order doesn't matter)
predicted : list
A list of predicted elements (order does matter)
k : int, optional
The maximum number of predicted elements
Returns
-------
score : double
The average precision at k over the input lists
"""
if not actual:
return 0.0
if len(predicted) > k:
predicted = predicted[:k]
score = 0.0
num_hits = 0.0
for i, p in enumerate(predicted):
# first condition checks whether it is valid prediction
# second condition checks if prediction is not repeated
if p in actual and p not in predicted[:i]:
num_hits += 1.0
score += num_hits / (i+1.0)
return score / min(len(actual), k)
def mapk(actual, predicted, k=10):
"""
    Compute the mean average precision at k between two lists of lists
    of items.
Parameters
----------
actual : list
A list of lists of elements that are to be predicted (order doesn't
matter in the lists)
predicted : list
A list of lists of predicted elements (order matters in the lists)
k : int, optional
The maximum number of predicted elements
Returns
-------
score : double
The mean average precision at k over the input lists
"""
return np.mean([apk(a, p, k) for a, p in zip(actual, predicted)])
def mf_onek(mean_prec, mean_rec, beta=0.5):
"""Accept the mean precision, mean recall, and beta,
and return the mean F-measure.
Parameters
----------
mean_prec : float
Mean Precision
mean_rec : float
Mean Recall
    beta : float, optional
        Beta for the F-measure; a beta of 1 corresponds to the F-1 score
Returns
-------
F : float
Mean F-measure based on the parameters
"""
first_factor = 1 + beta**2
second_factor = (mean_prec * mean_rec) / ((beta**2 * mean_prec) + mean_rec)
F = first_factor * second_factor
return F
def rprec(actual, predicted):
"""Compute the R-Precision of a query.
Parameters
----------
actual : array-like
List of actual values
predicted : array-like
List of predicted values
Returns
-------
float
R-Precision of a query
"""
rel = len(actual)
predicted = predicted[:rel]
sum_rel = 0
for predict in predicted:
if predict in actual:
sum_rel += 1
return sum_rel / rel
def ave_rprec(actual, predicted):
"""Compute the average R-Precision of several queries.
Parameters
----------
actual : array-like
List of list of actual values (per query)
predicted : array-like
List of list of predicted values (per query)
Returns
-------
    float
        Average R-Precision over the queries
"""
return np.mean([rprec(a, p) for a, p in zip(actual, predicted)])
def ave_prg(lst_query, objects, dist, actual, lst_all_labels):
    """Plot the averaged eleven-point Precision/Recall curve.
    Parameters
    ----------
    lst_query : array-like
        queries whose similar objects are to be found
    objects : ndarray
        database of objects to search in
    dist : function
        function that returns the distance of two input `ndarray`s
    actual : int
        class label considered relevant
    lst_all_labels : array-like
        list of label arrays, one per query, covering each object in the
        database
Returns
-------
matplotlib.Axes
rendered PR curve
"""
lst_recalls = []
lst_precisions = []
for query, all_labels in zip(lst_query, lst_all_labels):
all_labels = np.asarray(all_labels)
results = nearest_k(query, objects, len(all_labels), dist)
rs = (all_labels[results] == actual).cumsum()
N = (all_labels == actual).sum()
precisions = rs / np.arange(1, len(rs) + 1)
recalls = rs / N
recalls = [0] + recalls.tolist()
precisions = [1] + precisions.tolist()
lst_recalls.append(recalls)
lst_precisions.append(precisions)
ys = []
x = np.linspace(0, 1, 11, endpoint=True)
for recall, precision in zip(lst_recalls, lst_precisions):
y = np.interp(x, recall, precision)
ys.append(y)
y = np.mean(ys, axis=0)
fig, ax = plt.subplots()
ax.set_xlim(0, 1)
ax.set_ylim(0, 1)
ax.set_xlabel("recall")
ax.set_ylabel("precision")
ax.set_title("Averaged Eleven-Point Precision/Recall Curve")
ax.plot(x, y, "--r")
ax.text(
0.65,
0.8,
"AUC={:0.2f}".format(trapz(y, x)),
fontsize=12,
)
return ax
def pretty_print(df):
"""Pretty print the dataframe
Parameters
----------
df : df
Data Frame
Returns
----------
Display Object
Pretty printed Data Frame
"""
return display(HTML(df.to_html().replace("\\n", "<br>")))
def recommend(name, k=5):
"""Print the Data Frame containing the k `relevant` movies.
Parameters
----------
name : str
Movie whose similar or `relevant` movies would be searched for
k : int, optional
Number of movies to return
"""
query_d = movies_default[movies_default["title_old"] == name].iloc[0]
query_d = query_d.drop("title_old")
results = nearest_k(query_d, movies_default.iloc[:, 1:].to_numpy(),
int(k), cosine)
pretty_print(
pd.DataFrame(movies["title"][results])
.reset_index()
.rename(columns={"title": "Movie"})
.drop("index", axis=1)
)
def search_k_title():
"""Display the drop-down menu widget and button widget
"""
lst_reco_k = list(range(1, 129))
reco_k = widgets.Dropdown(
options=lst_reco_k,
value=5,
description='k',
disabled=False,
)
reco_movie = widgets.Dropdown(
options=movies['title'].tolist(),
value='Afire',
description='Movie Title',
disabled=False,
)
display(reco_k)
display(reco_movie)
button = widgets.Button(description="Search", button_style='success')
output = widgets.Output()
display(button, output)
def on_button_clicked(b):
"""Run an event given a click input of the button
Parameters
----------
b : Button
Button
"""
with output:
clear_output(wait=True)
display(HTML(f'<p><b>Table 27.</b></p>'
f'<p><i>{reco_k.value} movies similar to '
f'{reco_movie.value}</i></p>'))
recommend(reco_movie.value, reco_k.value)
button.on_click(on_button_clicked)
Movies are a staple of modern entertainment. From still images and slideshows to modern cinema and video streaming services, the quality, breadth, and quantity of movies have been increasing at an exponential rate. Now, we are bombarded with so many movie choices that it is difficult to choose what to watch, or even to know what is available. Movie recommenders are a necessary part of the movie provider business, such as Netflix or Disney+, letting users find the movies they want to see and spend more time on the platform.
This project takes a simplified approach to the same problem that all these providers have: if we have an idea of what kind of movie someone likes, what are the best other movies in our catalog that we can offer them?
Movie recommendation systems have revolutionized the way people discover and consume movies. According to Anish Gulati (2023), recommender systems can be classified into content-based and collaborative filtering. Content-based filtering uses the similarities among movies to recommend new movies, while collaborative filtering uses users' behavior and ratings to recommend new movies.
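As a toy illustration of the content-based idea (this is not the notebook's own pipeline; the titles and one-hot feature vectors below are made up for the sketch), movies represented as feature vectors can be ranked by cosine similarity to a query movie:

```python
import numpy as np

def cosine_similarity(a, b):
    """Cosine similarity between two feature vectors."""
    return float(np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b)))

# Hypothetical one-hot genre features: [drama, comedy, action]
features = {
    "Movie A": np.array([1, 0, 1]),
    "Movie B": np.array([1, 0, 1]),
    "Movie C": np.array([0, 1, 0]),
}

# Rank catalog titles by similarity to a query movie, highest first
query = features["Movie A"]
ranked = sorted(features,
                key=lambda t: -cosine_similarity(query, features[t]))
```

Here "Movie B" shares every feature with the query and ranks ahead of "Movie C", which shares none; the IR systems built later in this notebook apply the same idea with richer feature vectors.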
The dynamic nature of the entertainment industry, coupled with the vast and ever-growing library of movies, necessitates continuous innovation in recommendation techniques. According to Popflick, approximately 700 movies on average are released yearly. While content-based and collaborative-filtering methods have proven effective, the exploration of alternative approaches is crucial to further enhance the precision and personalization of movie recommendations.
In this context, our study is motivated by the recognition that Information Retrieval (IR) offers a promising avenue for improving movie recommendation systems. Information Retrieval, a field known for efficiently retrieving relevant information from large datasets, has demonstrated success in various domains. However, its application in the realm of movie recommendations remains an area of exploration, and its potential benefits are yet to be fully realized.
Data Source and Description
Our primary data source for this study is Rotten Tomatoes, a reputable online platform for movie and TV show reviews. We extracted details for our analysis from Rotten Tomatoes' list of Top 2023 Movies. This included scraping information such as movie synopsis, critic reviews, and genres.
In this study, we focused on a curated dataset comprising 128 movies. These selections were drawn from the Top 2023 Movies list available on Rotten Tomatoes' editorial page (https://editorial.rottentomatoes.com/guide/best-movies-of-2023/). For each movie, we collected pertinent details by scraping information from its respective Rotten Tomatoes page, exemplified by: https://www.rottentomatoes.com/m/the_first_slam_dunk.
To acquire all the details of the movies, the function `get_all_movie_info` is used.
%%capture
movies = get_all_movie_info();
The following features, or columns, were gathered for the preliminary dataframe:
Table 1.
Columns of Preliminary DataFrame
No. | Column | Description | Data Type |
---|---|---|---|
1. | Title | title of the movie | string |
2. | Synopsis | synopsis of the movie, which would be converted to bag-of-words | string |
3. | Runtime | runtime of the movie, in minutes | int |
4. | Critic | the concatenated review for the movie of the different Top Critics of Rotten Tomatoes, which would be converted to bag-of-words | string |
5. | Genre | genre/s of the movie, which would soon be one-hot encoded with the prefix ‘g_’ (e.g., g_drama, g_comedy) | string |
6. | Language | language of the film, which would soon be one-hot encoded with the prefix ‘lang_’ (e.g., lang_english, lang_french) | string |
Data Assumptions and Limitations
For the purposes of the movie recommender system, the dataset only includes the 128 top-ranking movies of 2023 listed by Rotten Tomatoes in their article. Hence, the similar movies returned by the system would only be movies from 2023, and it can be assumed that the movie genres are not equally distributed. Some genres may therefore be underrepresented in the dataset, which may cause some queries to return few relevant results.
Still, the decision to limit our dataset to 128 movies was made to ensure practicality in measuring the effectiveness of our Information Retrieval (IR) system. This carefully selected subset allows for a focused and in-depth analysis, enhancing the precision and feasibility of our evaluation.
Database Creation
As preliminary processing, the multi-class categorical features of the dataframe `movies`, namely `genre` and `language`, are one-hot encoded using the function `one_hot_encode`.
movies = one_hot_encode(movies)
The collected data from each HTML file are then inserted into the database `rotten_tomatoes.db` as a table named `movie_encoded`. The team used the following code to do this:
# Convert the dataframe into a table inside a SQL database
movie_sql(movies)
Data Cleaning
First, the table `movie_encoded` from `rotten_tomatoes.db` is loaded using the function `load_sql`. A preview of the table is shown below.
%%capture
movies = load_sql()
Table 2.
Preview of the DataFrame
movies.head(3)
title | synopsis | runtime | critic | g_action | g_adventure | g_animation | g_anime | g_biography | g_comedy | ... | lang_french (canada) | lang_french (france) | lang_german | lang_italian | lang_japanese | lang_korean | lang_norwegian | lang_romanian | lang_spanish | lang_urdu | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | Afire | \n While vacationing by the... | 103 | The script, also by director Christian Petzold... | 0 | 0 | 0 | 0 | 0 | 1 | ... | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
1 | A Thousand and One | \n A THOUSAND AND ONE follo... | 117 | The director maps the contours of a destabilis... | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
2 | A Million Miles Away | \n Inspired by the real-lif... | 121 | A Million Miles Away doesn’t exactly zoom past... | 0 | 0 | 0 | 0 | 1 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
3 rows × 45 columns
Second, given that the columns `synopsis` and `critic` are essentially documents whose words need to be parsed one by one, they are converted into bag-of-words format using the function `bag_of_words`, which preprocesses them before vectorizing. Two sets of dataframes are acquired from this process, as the team plans to compare the default settings against the "optimal" settings for the vectorizer.
Third, the IDF of each token in the "optimal" dataframe is acquired and ranked using the function `sort_idf`, and the tokens with IDF between 1 and 2 (exclusive) are removed. Using the function `bag_of_words` with the optimal settings again, the removed tokens are included in the stop words. The two dataframes (default and optimal) are then normalized to their TF-IDF values and concatenated back to the main dataframe.
Fourth, given that the runtime values are far larger than those of the other columns, whose ranges are only from zero to one, it is best to use MinMaxScaler, as it likewise limits the column's range to zero to one. However, since MinMaxScaler is very sensitive to outliers, the dataset is first checked for outliers.
movies.boxplot('runtime', color='green',
flierprops=dict(markeredgecolor='red',
markerfacecolor='red'))
plt.title('Box-Plot of Runtime Column')
plt.ylabel('runtime')
plt.show()
Hence, given that there are only 4 outliers out of 128 movies, the team pushed through with using the MinMaxScaler on the `runtime` column.
Data Exploration
In the dynamic and diverse world of cinema, Exploratory Data Analysis (EDA) serves as a crucial first step in understanding the underlying patterns and trends that shape the movie industry. Our analytical journey in 2023 begins with a thorough examination of the Rotten Tomatoes database, where each movie serves as a unique narrative entity contributing to the vast cinematic landscape. The EDA process in this context encompasses several key steps, each providing vital insights into different aspects of the movie data.
Runtime Distribution Analysis: We delve into the exploration of movie runtimes, a critical aspect that often influences audience preferences and perceptions. This involves plotting the distribution of runtimes, providing insights into common lengths and deviations.
Category Distribution Exploration: The analysis of movie categories reveals the diversity in thematic content and narrative styles. This step includes a detailed look at the frequency and prevalence of various movie categories within our dataset.
Genre Distribution Analysis: We analyze the distribution of movie genres, an aspect that reflects the creative and artistic choices in film-making. This analysis not only highlights popular genres but also uncovers less explored ones, revealing the breadth and depth of cinematic expression in 2023.
Language Distribution Assessment: Movies are often defined by their language, an element that carries cultural and regional nuances. We examine the distribution of movie languages, offering a glimpse into the linguistic diversity present in the 2023 movie lineup.
Through these EDA steps, we aim to build a comprehensive understanding of the movies released in 2023. This analysis not only aids in appreciating the current trends and preferences in the film industry but also sets the stage for more advanced analytical tasks, such as similarity assessments and predictive modeling, in our quest to explore the cinematic universe of 2023.
Runtime Distribution
warnings.filterwarnings("ignore", category=FutureWarning)
df = movies
bin_ranges = [0, 30, 60, 90, 120, 150, 180, 210, 240, 270, 300]
bin_labels = ['0-30', '31-60', '61-90', '91-120', '121-150',
'151-180', '181-210', '211-240', '241-270', '271-300']
df['runtime_bins'] = pd.cut(
df['runtime'], bins=bin_ranges, labels=bin_labels, right=False)
runtime_counts = df['runtime_bins'].value_counts().sort_index()
top_colors = ['#B74325' if 1 < i < 5 else '#FAE3A6'
for i in range(len(runtime_counts))]
plt.figure(figsize=(10, 6))
sns.barplot(x=runtime_counts.index, y=runtime_counts.values,
palette=top_colors)
plt.xticks(rotation=45)
plt.xlabel('Runtime (minutes)')
plt.ylabel('Number of Movies')
plt.title('Distribution of Movie Runtimes')
plt.show()
The analysis of the runtime distribution highlights a clear inclination towards feature-length films of approximately 1.5 to 2 hours, with a limited but notable acceptance of longer films up to about 2.5 hours. Extremely short and very long films appear to be outliers, indicating their specialized roles within the broader landscape of the film industry.
Dominance of Standard Feature-Length Films: A significant portion of the movies, 86 of the 128, falls within the 91-120 minutes runtime range. This trend underscores a strong preference for standard feature-length films, which typically align with the audience's attention span and narrative development needs.
Niche Presence of Short and Extended Films: The dataset reveals a minimal presence of extremely short (0-30 minutes, with only 2 films) and notably long films (151-180 minutes with 4 films, and 181-210 minutes with 1 film). This suggests that films deviating significantly from the standard duration are less common, catering to niche audiences or specific cinematic purposes.
Moderate Acceptance of Longer Narratives: Films within the 121-150 minutes range, totaling 19, exhibit a moderate level of prevalence. This could reflect the industry's inclination towards more elaborate storytelling, potentially in genres that demand detailed plot development.
Absence of Ultra-Long Films: The dataset shows no films exceeding 210 minutes, indicating the rarity of ultra-long cinematic experiences in mainstream cinema. Such lengths are likely reserved for unique or specialized film genres.
Clear Demarcation Between Film Categories: The complete absence of films in the 31-60 minutes range points to a distinct separation between short films and feature-length films. This gap aligns with established industry norms and audience expectations regarding the categorization of film lengths.
Movie Category Analysis
warnings.filterwarnings("ignore", category=FutureWarning)
cat = movies.loc[:, 'g_action':'g_western']
cat.columns = [re.sub(r'g_', '', i) for i in cat.columns]
lst_categories = []
for index, row in cat.iterrows():
category = []
for column in cat.columns:
if row[column] == 1:
category.append(column)
categories = ', '.join(category)
lst_categories.append(categories)
cat['categories'] = lst_categories
category_counts = cat['categories'].value_counts()
top_colors = ['#B74325' if i < 5 else '#FAE3A6'
for i in range(len(category_counts))]
plt.figure(figsize=(10, 15))
sns.barplot(y=category_counts.index, x=category_counts.values,
palette=top_colors, orient='h')
plt.xticks(rotation=0, ha='right')
plt.xlabel('Number of Movies')
plt.ylabel('Categories')
plt.title('Distribution of Movie Categories (Top 10)')
plt.tight_layout()
plt.show()
The category distribution in the dataset demonstrates a rich tapestry of cinematic genres, with a strong inclination towards drama, hybrid genres, and suspenseful narratives. The presence of diverse genre combinations suggests an industry that is continually evolving and experimenting, aiming to cater to a wide array of audience preferences and tastes.
Predominance of Drama: The single most prevalent category is 'drama,' with 24 films, highlighting its enduring appeal. Drama, known for its depth and emotional complexity, remains a cornerstone genre in the film industry.
Hybrid Genres Gaining Traction: There is a notable presence of hybrid genres, such as 'drama, mystery&thriller' and 'comedy, drama', each with 9 films. This trend signifies a growing interest in films that blend elements from multiple genres, offering more nuanced and multifaceted cinematic experiences.
Horror and Mystery Fusion: The combination of 'horror' with 'mystery&thriller' appears frequently, with 7 films falling into this combined category. Additionally, there are 4 films that blend 'horror, mystery&thriller, sci-fi', suggesting a strong audience interest in suspenseful and speculative narratives.
Diverse Combinations with Comedy: 'Comedy' appears in various combinations, reflecting its versatility as a genre. It is seen paired with 'drama', 'horror', and even 'sci-fi', indicating a tendency to infuse humor into various narrative contexts.
LGBTQ+ Representation: Films categorized under 'drama, lgbtq+, romance' and similar combinations, though fewer in number, show the inclusion of LGBTQ+ themes in mainstream cinema. This represents a progressive move towards more diverse and representative storytelling.
Action and Mystery: 'Action, mystery&thriller' is another popular combination, with 2 films. This blend suggests a continued audience interest in high-adrenaline, suspense-filled narratives.
Unique One-Off Combinations: Several categories appear only once, such as 'biography, drama, history, music' and 'action, war'. These unique combinations indicate experimentation and niche filmmaking within the industry.
Animation and Family Films: There is also a presence of 'animation' and 'kids&family' genres, sometimes combined with 'comedy' and 'adventure', highlighting the industry's focus on family-friendly entertainment.
Movie Genre Breakdown
genres = movies.loc[:, 'g_action':'g_western']
genres.columns = [re.sub(r'g_', '', i) for i in genres.columns]
genres = genres.sum(axis=0)
genres = genres.sort_values(ascending=False)
top_colors = ['#B74325' if i < 5 else '#FAE3A6'
for i in range(len(genres))]
plt.figure(figsize=(10, 6))
sns.barplot(y=genres.index, x=genres.values,
palette=top_colors, orient="h")
plt.xticks(rotation=0)
plt.xlabel("Number of Movies")
plt.ylabel("Genre")
plt.title("Distribution of Movie Genres")
plt.show()
The genre distribution in the dataset, where each genre is treated as a unique category without combinations, reflects a diverse cinematic palette, with drama and comedy at the forefront, followed by a strong interest in mystery, thriller, and horror. The presence of a wide array of other genres, though less frequent, underscores the industry's capacity to cater to varied and specific audience tastes and preferences.
Dominance of Drama: With 80 films categorized under 'g_drama', this genre emerges as the most predominant. Drama's preeminence underscores its universal appeal and versatility in storytelling, capable of encompassing a wide range of human experiences and emotions.
Strong Presence of Comedy: 'g_comedy', with 45 films, holds a significant position, reflecting the audience's consistent preference for humor and light-hearted entertainment. Comedy's ability to appeal to a broad audience base makes it a perennial favorite.
Popularity of Mystery and Thriller: The 'g_mystery&thriller' genre, with 39 films, highlights the audience's inclination towards suspense, intrigue, and excitement. This genre's popularity suggests a strong demand for narratives that provide a sense of suspense and engagement.
Horror as a Staple Genre: With 21 films, 'g_horror' maintains its status as a staple genre. The enduring appeal of horror speaks to the audience's fascination with fear, the supernatural, and the unknown.
Romance Continues to Charm: 'g_romance', represented by 16 films, continues to charm audiences, showcasing the timeless appeal of love stories and romantic narratives.
Action and Sci-Fi: 'g_action' and 'g_sci-fi' genres, with 13 and 11 films respectively, reflect the audience's interest in high-energy, visually impactful, and often technologically driven stories.
Adventure and Diversity: 'g_adventure' and 'g_lgbtq+', each with 10 films, indicate a desire for escapism and exploration, as well as a growing recognition of diverse narratives within the mainstream.
Fantasy and Beyond: 'g_fantasy', with 8 films, along with other genres like 'g_biography', 'g_history', and 'g_animation', each with a handful of films, suggest niche but dedicated audience interests in imaginative worlds, real-life stories, historical narratives, and animated features.
Lesser-Represented Genres: Genres like 'g_crime', 'g_music', 'g_kids&family', and 'g_anime' appear less frequently, indicating more specialized audience segments. Similarly, 'g_holiday', 'g_short', 'g_sports', 'g_musical', 'g_war', and 'g_western', each with 1 or 2 films, represent highly niche categories within the cinematic landscape.
Movie Language Diversity
lang = movies.loc[:, 'lang_arabic':'lang_urdu']
lang.columns = [re.sub(r'lang_', '', i) for i in lang.columns]
lang = lang.sum(axis=0)
lang = lang.sort_values(ascending=False)
top_colors = ['#B74325' if i < 5 else '#FAE3A6'
for i in range(len(lang))]
plt.figure(figsize=(10, 6))
sns.barplot(y=lang.index, x=lang.values,
palette=top_colors, orient="h")
plt.xticks(rotation=0)
plt.xlabel("Number of Movies")
plt.ylabel("Languages")
plt.title("Distribution of Movie Languages")
plt.show()
The language distribution analysis reveals a strong skew towards English, underscoring its predominant role in the global movie industry. However, the presence of a variety of other languages highlights the rich diversity and multicultural nature of contemporary cinema. This linguistic variety not only represents different cultural perspectives but also showcases the expansive reach and inclusive nature of the film industry, catering to a global audience with varied linguistic backgrounds.
Overwhelming Dominance of English: With 89 films, 'English' is the overwhelmingly dominant language, underscoring the global influence and reach of English-language cinema. This prevalence reflects both the large English-speaking audience worldwide and the significant output of major film industries like Hollywood.
Significant Presence of French Cinema: 'French (France)' ranks second with 13 films, indicating a strong representation of French cinema. This points to the cultural richness and global appeal of French-language films, known for their artistic and narrative depth.
Diversity in European and Asian Languages: Languages like 'Spanish', 'Japanese', 'Italian', 'Arabic', and 'Danish', each with a few films, highlight the diversity of the dataset. These languages represent significant national cinemas that contribute richly to the global tapestry of film.
English Variants and Regional Films: The presence of 'English (United Kingdom)' with 3 films and other regional variants like 'French (Canada)' with 1 film, illustrates the linguistic nuances and regional diversity within major languages.
Representation of Lesser-Known Languages: Languages such as 'Korean', 'Catalan', 'Finnish', 'Romanian', 'Urdu', 'Filipino', 'Norwegian', and 'German', each with 1 film, though less prevalent, are indicative of the wide range of global cinema. Their inclusion reflects the industry's breadth and the increasing recognition of diverse linguistic narratives.
Methodology Overview
In creating the Information Retrieval System, the following were done:
Table 3.
Methodology Framework
Step No. | Step | Description |
---|---|---|
1. | Data Extraction | The team scraped data from the Rotten Tomatoes website and compiled these data into a dataframe, which was then saved into a database. |
2. | Data Preprocessing | The team preprocessed the data through one-hot encoding, stemming, common word-normalization, lowercasing of each character, and other relevant preprocessing techniques. |
3. | Information Retrieval Creation | The team tested several distance measures for the system, where the distance measure which would provide the highest IR metric would be used as the final measure. |
4. | Evaluation of IR System | The team evaluated the created IR systems using average precision @ k, average recall @ k, mean average precision, and the area under the Averaged Eleven-Point PR curve, which would then be discussed afterwards. |
Data Extraction
With web scraping via Requests and BeautifulSoup, consolidated in the function get_all_movie_info, the team was able to extract movie details from the Rotten Tomatoes movie pages.
As said beforehand, the categorical columns, genre and language, were one-hot encoded. This preprocessing was done for easier conversion to an SQLite table. In the end, the team had all the details for each movie, detailed in Table 1.
The collected data were placed in a dataframe and then turned into an SQLite database for exploration and processing through the function movie_sql.
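Since get_all_movie_info and movie_sql are the team's own helpers, the dataframe-to-SQLite step can only be sketched; the miniature dataframe and column names below are hypothetical, and an in-memory database stands in for rotten_tomatoes.db:

```python
import sqlite3
import pandas as pd

# Hypothetical miniature of the scraped data; the real columns follow Table 1
movies = pd.DataFrame({
    'title': ['Afire', 'A Thousand and One'],
    'runtime': [103, 117],
    'genre': ['comedy, drama', 'drama'],
})

# One-hot encode the categorical genre column, as described in the text
encoded = movies.join(
    movies['genre'].str.get_dummies(sep=', ').add_prefix('g_'))

# Persist the encoded dataframe as an SQLite table and read it back
conn = sqlite3.connect(':memory:')  # the study wrote to rotten_tomatoes.db
encoded.drop(columns='genre').to_sql('movies_encoded', conn, index=False)
loaded = pd.read_sql('SELECT * FROM movies_encoded', conn)
print(loaded.columns.tolist())
```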
Data Preprocessing
As said beforehand, the table movies_encoded from rotten_tomatoes.db is loaded using the function load_sql. A preview is shown in Table 2.
%%capture
movies = load_sql()
In pre-processing the data, given that the columns synopsis and critic are essentially documents, or "collections" of strings, these columns would be converted into bags-of-words through the function bag_of_words. In doing so, a fitted vectorizer would also be made available. To test the difference in performance between default parameters and "optimally" set parameters, the team made two dataframes named movie_default and movie_optimal.
In setting the parameters for movie_optimal, a preprocessor function named preprocesser was defined, which lowercases all documents, removes special characters, normalizes certain common words such as in, the, and the like, and stems the words through the English Snowball stemmer. The rationale for stemming is to allow the Information Retrieval System to "recognize" words which differ in spelling yet are similar in meaning; these include conjugated forms such as explored and exploring. Furthermore, an ngram_range of (1, 2) was set in order to capture meaningful two-word phrases which may help in finding similar movies or differentiating them from one another.
Moreover, common stop words acquired from Ganesan (n.d.), such as another and certain, were used as stop words. This removes noise from the system and avoids associating dissimilar movies merely because they share many of these stop words. Lastly, to ensure that extremely rare and extremely common words are removed, max_df was set to 0.9 and min_df to 0.1, removing words present in more than 90% or fewer than 10% of the documents.
warnings.filterwarnings('ignore')
# Movies (Default Mode)
vectorizer_default_synopsis, movie_default_synopsis = bag_of_words(
movies['synopsis'], mode='default')
movie_default_synopsis = pd.DataFrame.sparse.from_spmatrix(
movie_default_synopsis,
columns=vectorizer_default_synopsis.get_feature_names_out(),
)
vectorizer_default_critic, movie_default_critic = bag_of_words(
movies['critic'], mode='default')
movie_default_critic = pd.DataFrame.sparse.from_spmatrix(
movie_default_critic,
columns=vectorizer_default_critic.get_feature_names_out(),
)
# Movies (Optimal Mode)
vectorizer_optimal_synopsis, movie_optimal_synopsis = bag_of_words(
movies['synopsis'], mode='optimal')
movie_optimal_synopsis = pd.DataFrame.sparse.from_spmatrix(
movie_optimal_synopsis,
columns=vectorizer_optimal_synopsis.get_feature_names_out(),
)
vectorizer_optimal_critic, movie_optimal_critic = bag_of_words(
movies['critic'], mode='optimal')
movie_optimal_critic = pd.DataFrame.sparse.from_spmatrix(
movie_optimal_critic,
columns=vectorizer_optimal_critic.get_feature_names_out(),
)
The following is a preview of each table.
Table 4.
Movies Synopsis (Default Settings)
movie_default_synopsis.head(3)
11 | 12 | 12th | 15 | 16 | 17 | 18th | 1945 | 1960s | 1976 | ... | younger | youngest | your | yourself | youssef | youth | zadi | zakhar | zem | zero | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
3 rows × 3291 columns
Table 5.
Movies Critic (Default Settings)
movie_default_critic.head(3)
00s | 10 | 104 | 10s | 16mm | 18th | 1970 | 1980s | 1992 | 1998 | ... | zaniness | zany | zee | zero | zion | zip | zlotowski | zone | zoom | zootropolis | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 |
3 rows × 4692 columns
Table 6.
Movies Synopsis (Optimal Settings)
movie_optimal_synopsis.head(3)
_connector_ world | are | back | be | becom | begin | best | can | discov | famili | ... | them | they | time | two | when | who | world | year | year old | young | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | ... | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 1 |
1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 1 | 1 | 0 |
2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
3 rows × 48 columns
Table 7.
Movies Critic (Optimal Settings)
movie_optimal_critic.head(3)
_connector_ _connector_ | _connector_ end | _connector_ film | _connector_ it | _connector_ most | _connector_ movi | _connector_ spanish | _connector_ way | action | actor | ... | which | while | who | will | work | world | writer | year | you | your | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | ... | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 |
1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
3 rows × 156 columns
To further refine the optimal tables (Tables 6 and 7), the IDFs of the vectorized critic and synopsis would be calculated, and the tokens with the lowest IDFs would be removed, as these tokens serve as "domain-specific" common words, the domain in this study being films.
# Synopsis Optimal
vectorizer_optimal_synopsis, movie_optimal_synopsis = bag_of_words(
movies['synopsis'], mode='optimal')
idf_movie_synopsis = sort_idf(movie_optimal_synopsis,
vectorizer_optimal_synopsis)
# Critic Optimal
vectorizer_optimal_critic, movie_optimal_critic = bag_of_words(
movies['critic'], mode='optimal')
idf_movie_critic = sort_idf(movie_optimal_critic,
vectorizer_optimal_critic)
The following tables show the top 5 words with the lowest IDFs. The "domain-specific" common words within this top-5 list whose IDFs fall in the range from 1 to 2 (exclusive) would then be removed, as they serve as noise and do not help in differentiating the movies.
Table 8.
Movies_Optimal Synopsis IDFs
idf_movie_synopsis.head(5)
idf_weights | |
---|---|
is | 1.655120 |
when | 1.889520 |
who | 2.276293 |
she | 2.276293 |
life | 2.394077 |
Table 9.
Movies_Optimal Critic IDFs
idf_movie_critic.head(5)
idf_weights | |
---|---|
film | 1.244692 |
movi | 1.625706 |
be | 1.732678 |
what | 1.834461 |
more | 1.834461 |
Having seen that some tokens have very low IDFs, both columns synopsis and critic would be converted to bags-of-words again using the same "optimal" parameters, but preprocessed with the new "domain-specific" stop words alongside the original set of stop words.
# Optimal Synopsis
vectorizer_optimal_synopsis, movie_optimal_synopsis = bag_of_words(
movies['synopsis'],
mode='optimal',
stop_extend=list(idf_movie_synopsis.head(2).index)
)
movie_optimal_synopsis = pd.DataFrame.sparse.from_spmatrix(
movie_optimal_synopsis,
columns=vectorizer_optimal_synopsis.get_feature_names_out(),
)
# Optimal Critic
vectorizer_optimal_critic, movie_optimal_critic = bag_of_words(
movies['critic'],
mode='optimal',
stop_extend=list(idf_movie_critic.head(5).index)
)
movie_optimal_critic = pd.DataFrame.sparse.from_spmatrix(
movie_optimal_critic,
columns=vectorizer_optimal_critic.get_feature_names_out(),
)
Table 10.
Movies Synopsis (Optimal Settings; Additional Stop)
movie_optimal_synopsis.head(3)
_connector_ world | are | back | be | becom | begin | best | can | discov | famili | ... | take | them | they | time | two | who | world | year | year old | young | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 1 |
1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 1 | 1 | 0 |
2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
3 rows × 46 columns
Table 11.
Movies Critic (Optimal Settings; Additional Stop)
movie_optimal_critic.head(3)
_connector_ _connector_ | _connector_ end | _connector_ is | _connector_ it | _connector_ most | _connector_ spanish | _connector_ way | action | actor | also | ... | which | while | who | will | work | world | writer | year | you | your | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 1 | ... | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 |
1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
2 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
3 rows × 148 columns
The four tables (default and optimal) would then be normalized into TFIDF values. This would be done through the function normalize_tfidf.
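normalize_tfidf is the team's own function; the equivalent reweighting can be sketched with scikit-learn's TfidfTransformer, which multiplies raw counts by IDF and L2-normalizes each document row:

```python
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer

# Hypothetical mini-corpus standing in for the synopsis/critic columns
docs = ['a haunted house story',
        'a haunted castle story',
        'a quiet family road trip']

counts = CountVectorizer().fit_transform(docs)

# Reweight counts by IDF and L2-normalize each row (the sklearn default)
tfidf = TfidfTransformer().fit_transform(counts)

# After normalization every document vector has unit Euclidean length
for i in range(tfidf.shape[0]):
    print(float(tfidf[i].multiply(tfidf[i]).sum()))
```

The unit-length rows are what make cosine similarity between documents reduce to a plain dot product later on.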
# Movies (Default Mode)
vectorizer_default_synopsis, movie_default_synopsis = bag_of_words(
movies['synopsis'], mode='default')
vectorizer_default_critic, movie_default_critic = bag_of_words(
movies['critic'], mode='default')
vectorizer_def_sys_tfidf, movie_default_synopsis = normalize_tfidf(
movie_default_synopsis)
vectorizer_def_crit_tfidf, movie_default_critic = normalize_tfidf(
movie_default_critic)
movie_default_synopsis = pd.DataFrame.sparse.from_spmatrix(
movie_default_synopsis,
columns=vectorizer_default_synopsis.get_feature_names_out(),
)
movie_default_critic = pd.DataFrame.sparse.from_spmatrix(
movie_default_critic,
columns=vectorizer_default_critic.get_feature_names_out(),
)
# Movies (Optimal Mode)
vectorizer_optimal_synopsis, movie_optimal_synopsis = bag_of_words(
movies['synopsis'],
mode='optimal',
stop_extend=list(idf_movie_synopsis.head(2).index)
)
vectorizer_optimal_critic, movie_optimal_critic = bag_of_words(
movies['critic'],
mode='optimal',
stop_extend=list(idf_movie_critic.head(5).index)
)
vectorizer_opt_sys_tfidf, movie_optimal_synopsis = normalize_tfidf(
movie_optimal_synopsis)
vectorizer_opt_crit_tfidf, movie_optimal_critic = normalize_tfidf(
movie_optimal_critic)
movie_optimal_synopsis = pd.DataFrame.sparse.from_spmatrix(
movie_optimal_synopsis,
columns=vectorizer_optimal_synopsis.get_feature_names_out(),
)
movie_optimal_critic = pd.DataFrame.sparse.from_spmatrix(
movie_optimal_critic,
columns=vectorizer_optimal_critic.get_feature_names_out(),
)
The four tables are previewed below:
Table 12.
Movies Synopsis (Default Settings; TFIDF Normalized)
movie_default_synopsis.head(5)
11 | 12 | 12th | 15 | 16 | 17 | 18th | 1945 | 1960s | 1976 | ... | younger | youngest | your | yourself | youssef | youth | zadi | zakhar | zem | zero | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
1 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
2 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
3 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
4 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
5 rows × 3291 columns
Table 13.
Movies Critic (Default Settings; TFIDF Normalized)
movie_default_critic.head(5)
00s | 10 | 104 | 10s | 16mm | 18th | 1970 | 1980s | 1992 | 1998 | ... | zaniness | zany | zee | zero | zion | zip | zlotowski | zone | zoom | zootropolis | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.000000 | 0.0 |
1 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.000000 | 0.0 |
2 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.127834 | 0.0 |
3 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.000000 | 0.0 |
4 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.000000 | 0.0 |
5 rows × 4692 columns
Table 14.
Movies Synopsis (Optimal Settings; TFIDF Normalized)
movie_optimal_synopsis.head(5)
_connector_ world | are | back | be | becom | begin | best | can | discov | famili | ... | take | them | they | time | two | who | world | year | year old | young | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0.0 | 0.301699 | 0.000000 | 0.000000 | 0.0 | 0.000000 | 0.0 | 0.0 | 0.0 | 0.301699 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.000000 | 0.0 | 0.289036 | 0.000000 | 0.306247 |
1 | 0.0 | 0.000000 | 0.000000 | 0.000000 | 0.0 | 0.000000 | 0.0 | 0.0 | 0.0 | 0.000000 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.243383 | 0.0 | 0.266502 | 0.323606 | 0.000000 |
2 | 0.0 | 0.000000 | 0.000000 | 0.000000 | 0.0 | 0.000000 | 0.0 | 0.0 | 0.0 | 0.340635 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.000000 | 0.0 | 0.000000 | 0.000000 | 0.000000 |
3 | 0.0 | 0.000000 | 0.245681 | 0.440677 | 0.0 | 0.240856 | 0.0 | 0.0 | 0.0 | 0.000000 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.181147 | 0.0 | 0.000000 | 0.000000 | 0.210165 |
4 | 0.0 | 0.000000 | 0.000000 | 0.000000 | 0.0 | 0.000000 | 0.0 | 0.0 | 0.0 | 0.000000 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.000000 | 0.0 | 0.000000 | 0.000000 | 0.000000 |
5 rows × 46 columns
Table 15.
Movies Critic (Optimal Settings; TFIDF Normalized)
movie_optimal_critic.head(5)
_connector_ _connector_ | _connector_ end | _connector_ is | _connector_ it | _connector_ most | _connector_ spanish | _connector_ way | action | actor | also | ... | which | while | who | will | work | world | writer | year | you | your | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0.0 | 0.0 | 0.000000 | 0.0 | 0.000000 | 0.189491 | 0.0 | 0.0 | 0.197687 | 0.167216 | ... | 0.000000 | 0.175795 | 0.000000 | 0.0 | 0.146946 | 0.0 | 0.0 | 0.0 | 0.000000 | 0.0 |
1 | 0.0 | 0.0 | 0.000000 | 0.0 | 0.000000 | 0.000000 | 0.0 | 0.0 | 0.000000 | 0.000000 | ... | 0.000000 | 0.000000 | 0.151074 | 0.0 | 0.000000 | 0.0 | 0.0 | 0.0 | 0.000000 | 0.0 |
2 | 0.0 | 0.0 | 0.190172 | 0.0 | 0.000000 | 0.000000 | 0.0 | 0.0 | 0.000000 | 0.000000 | ... | 0.000000 | 0.000000 | 0.000000 | 0.0 | 0.000000 | 0.0 | 0.0 | 0.0 | 0.000000 | 0.0 |
3 | 0.0 | 0.0 | 0.000000 | 0.0 | 0.000000 | 0.000000 | 0.0 | 0.0 | 0.000000 | 0.000000 | ... | 0.000000 | 0.000000 | 0.145567 | 0.0 | 0.000000 | 0.0 | 0.0 | 0.0 | 0.000000 | 0.0 |
4 | 0.0 | 0.0 | 0.000000 | 0.0 | 0.161545 | 0.154848 | 0.0 | 0.0 | 0.161545 | 0.000000 | ... | 0.132463 | 0.000000 | 0.125019 | 0.0 | 0.000000 | 0.0 | 0.0 | 0.0 | 0.095729 | 0.0 |
5 rows × 148 columns
These default and optimal dataframes are then concatenated separately to the main dataframe movies, resulting in two dataframes, movies_default and movies_optimal. As the next step is the normalization of the runtime column, and given that runtime is a common word in the film domain (a token named runtime indeed appears in the bag-of-words dataframes), the original runtime column is renamed runtime_old. Likewise, since title would be used and the bag-of-words step produced a duplicate column of that name, the title column is renamed title_old.
concat_movies = movies.rename(columns={'runtime': 'runtime_old',
'title': 'title_old'})
# Movies (Default)
movies_default = concat_movies.drop(['critic', 'synopsis'], axis=1)
movies_default = pd.concat([movies_default, movie_default_synopsis,
movie_default_critic], axis=1)
# Movies (Optimal)
movies_optimal = concat_movies.drop(['critic', 'synopsis'], axis=1)
movies_optimal = pd.concat([movies_optimal, movie_optimal_synopsis,
movie_optimal_critic], axis=1)
Lastly, the runtime column (now runtime_old) in both dataframes is scaled with MinMaxScaler so that its values fall in the range from zero to one, matching the range of the one-hot encoded and TFIDF-normalized columns.
scaler_default = MinMaxScaler()
scaler_optimal = MinMaxScaler()
# Movies (Default)
runtime_default = scaler_default.fit_transform(
movies_default[['runtime_old']])
runtime_default = pd.DataFrame(runtime_default, columns=['runtime_old'])
movies_default.drop('runtime_old', axis=1, inplace=True)
movies_default = pd.concat([movies_default, runtime_default], axis=1)
# Movies (Optimal)
runtime_optimal = scaler_optimal.fit_transform(
movies_optimal[['runtime_old']])
runtime_optimal = pd.DataFrame(runtime_optimal, columns=['runtime_old'])
movies_optimal.drop('runtime_old', axis=1, inplace=True)
movies_optimal = pd.concat([movies_optimal, runtime_optimal], axis=1)
The final tables which would be used in the Information Retrieval part are presented below:
Table 16.
Movies Default (Final)
movies_default.head(3)
title_old | g_action | g_adventure | g_animation | g_anime | g_biography | g_comedy | g_crime | g_drama | g_fantasy | ... | zany | zee | zero | zion | zip | zlotowski | zone | zoom | zootropolis | runtime_old | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | Afire | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 1 | 0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.000000 | 0.0 | 0.572222 |
1 | A Thousand and One | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.000000 | 0.0 | 0.650000 |
2 | A Million Miles Away | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.127834 | 0.0 | 0.672222 |
3 rows × 8026 columns
Table 17.
Movies Optimal (Final)
movies_optimal.head(3)
title_old | g_action | g_adventure | g_animation | g_anime | g_biography | g_comedy | g_crime | g_drama | g_fantasy | ... | while | who | will | work | world | writer | year | you | your | runtime_old | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | Afire | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 1 | 0 | ... | 0.175795 | 0.000000 | 0.0 | 0.146946 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.572222 |
1 | A Thousand and One | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | ... | 0.000000 | 0.151074 | 0.0 | 0.000000 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.650000 |
2 | A Million Miles Away | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | ... | 0.000000 | 0.000000 | 0.0 | 0.000000 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.672222 |
3 rows × 237 columns
Information Retrieval Creation
For the information retrieval system, the following distance metrics would be used:
- Euclidean Distance (l2 norm)
- Manhattan Distance (l1 norm)
- Cosine Distance
In order to get all the distance matrices for both dataframes, the function nearest_k
would be used. The results from each call would then be used for the evaluation part.
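The actual nearest_k function is defined elsewhere in the notebook; a minimal sketch of what such a helper might look like, given how it is called below, is shown here (the implementation and the stand-in euclidean function are illustrative assumptions, not the notebook's own code, which uses SciPy's distance functions):

```python
import numpy as np

def euclidean(u, v):
    """Euclidean (l2) distance; a stand-in for scipy.spatial.distance.euclidean."""
    return np.linalg.norm(np.asarray(u, dtype=float) - np.asarray(v, dtype=float))

def nearest_k(query, objects, k, dist):
    """Return the indices of the k objects nearest to the query
    under the given distance function (smaller distance = nearer)."""
    distances = [dist(query, obj) for obj in objects]
    return np.argsort(distances)[:k].tolist()

# Toy usage: the point closest to the query comes first
data = np.array([[0.0, 0.0], [1.0, 1.0], [5.0, 5.0]])
print(nearest_k(np.array([0.1, 0.1]), data, 2, euclidean))  # [0, 1]
```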
Evaluation of IR System
In evaluating the IR System, the following genres from Rotten Tomatoes and their corresponding movies were used as the queries. This approximates querying each type of movie when evaluating the performance of the IR system.
Table 18.
Genres and Corresponding Movies (Evaluation)
No. | Genre | Movie |
---|---|---|
1. | Drama | Oppenheimer |
2. | Comedy | Sitting in Bars with Cake |
3. | Mystery and Thriller | M3GAN |
4. | Horror | The Blackening |
5. | Romance | Red, White & Royal Blue |
6. | Action | John Wick: Chapter 4 |
7. | Sci-Fi | Biosphere |
8. | Adventure | Guardians of the Galaxy Vol. 3 |
9. | LGBTQA+ | Shortcomings |
10. | Fantasy | Suzume |
As a preface for the validation data or gold standard, the team assumed the roles of domain experts in validating whether a film is relevant to a given film or not. Relevant films were determined by comparing the genre, summary, and trailer of one film against those of the other, where similar genres and storylines were deemed relevant to each other. Moreover, instead of having only one team member validate the relevance of a film to another, a second team member performed their own validation. In cases of conflict between the judgments of these two members, a third team member served as arbiter in deciding whether the said film is indeed relevant or not. This validation data or gold standard, which is opened as a dataframe, is cleaned minimally: null values are turned into zeroes; twos (indicating relevance) and hundreds (indicating exact similarity to the film in question) are changed to ones; and the dtype is changed to int. Moreover, given a difference in the order of the titles, the titles were sorted in accordance with the order in the original movies
dataframe.
Table 19.
Preview of Validation Data
validation_data = pd.read_csv('validation_data.csv')
validation_data.fillna(0, inplace=True)
validation_data_temp = validation_data.iloc[:, 1:].astype(int)
validation_data = pd.concat([validation_data.iloc[:, 0],
validation_data_temp],
axis=1)
validation_data = validation_data.map(
lambda x: 1 if x == 2 or x == 100 else x)
movies.set_index('title', inplace=True)
validation_data.set_index('title', inplace=True)
validation_data = validation_data.reindex(movies.index)
movies.reset_index(inplace=True)
validation_data.reset_index(inplace=True)
validation_data.head(3)
title | Oppenheimer | Sitting in Bars with Cake | M3GAN | The Blackening | Red, White & Royal Blue | John Wick: Chapter 4 | Biosphere | Guardians of the Galaxy Vol. 3 | Shortcomings | Suzume | |
---|---|---|---|---|---|---|---|---|---|---|---|
0 | Afire | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 |
1 | A Thousand and One | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
2 | A Million Miles Away | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
The kappa statistic measures how much the judges agree on relevance: whether they agree all of the time, most of the time, only by chance, or "worse than random" (possibly hinting at a negative bias). It would be calculated for all pairs of the three judges to provide a tight rationale for why the created gold standard should be used (Cambridge UP, 2008).
The method for acquiring the kappa statistic is as follows (Cambridge UP, 2008):
First, $P(A)$, the observed proportion of the times the judges agreed, is calculated by:
$$P(A) = P(Yes \space and \space Yes) + P(No \space and \space No)$$
Second, the pooled marginals are acquired as follows:
$$P(nonrelevant) = \frac{Total \space No \space Votes}{Total \space Votes}$$
$$P(relevant) = \frac{Total \space Yes \space Votes}{Total \space Votes}$$
Using these pooled marginals, $P(E)$, the probability that the judges agreed by chance, is calculated as:
$$P(E) = P(nonrelevant)^{2} + P(relevant)^{2}$$
Lastly, the kappa statistic is calculated using these values:
$$\kappa = \frac{(P(A) - P(E))}{(1- P(E))}$$
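The steps above can be sketched in code. This is a minimal illustration under the assumption that each judge's relevance labels are flattened into binary arrays (the notebook's actual kappa function additionally handles dataframe cleaning and builds the relevance matrix):

```python
import numpy as np

def kappa_statistic(judge_a, judge_b):
    """Kappa statistic from two binary (0/1) relevance label arrays."""
    a = np.asarray(judge_a)
    b = np.asarray(judge_b)
    n = len(a)
    # P(A): observed agreement (Yes-Yes plus No-No)
    p_a = np.mean(a == b)
    # Pooled marginals over all 2n votes
    p_rel = (a.sum() + b.sum()) / (2 * n)
    p_nonrel = 1 - p_rel
    # P(E): probability of agreement by chance
    p_e = p_rel ** 2 + p_nonrel ** 2
    return (p_a - p_e) / (1 - p_e)

# Reproduce Table 20: 151 Yes-Yes, 27 Yes-No, 45 No-Yes, 1185 No-No
a = np.array([1] * 151 + [1] * 27 + [0] * 45 + [0] * 1185)
b = np.array([1] * 151 + [0] * 27 + [1] * 45 + [0] * 1185)
print(round(kappa_statistic(a, b), 2))  # 0.78
```

This reproduces the Judge 1 vs. Judge 2 value reported in Table 23.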
Shown below are the relevance matrices for each pair of judges, along with a table containing all their kappa statistics and the average pairwise kappa statistic. These are created from the dataframes read from the csv file of each judge (judge_1.csv, judge_2.csv, and validation_data.csv) using the function kappa, which, alongside the calculation, changes null values to zeroes and non-null values to ones and removes the title column from all dataframes. The average pairwise kappa statistic is then calculated and evaluated to show whether the gold standard is reliable or not.
Table 20.
Relevance Matrix (Judge 1 vs. Judge 2)
judge_1 = pd.read_csv('judge_1.csv')
judge_2 = pd.read_csv('judge_2.csv')
judge_3 = pd.read_csv('validation_data.csv')
kappa_12, rel_mat12 = kappa(judge_1, judge_2, 'Judge 1', 'Judge 2')
rel_mat12
Judge 2 | ||||
---|---|---|---|---|
Yes | No | Total | ||
Judge 1 | Yes | 151 | 27 | 178 |
No | 45 | 1185 | 1230 | |
Total | 196 | 1212 | 1408 |
Table 21.
Relevance Matrix (Judge 2 vs. Judge 3)
kappa_23, rel_mat23 = kappa(judge_2, judge_3, 'Judge 2', 'Judge 3')
rel_mat23
Judge 3 | ||||
---|---|---|---|---|
Yes | No | Total | ||
Judge 2 | Yes | 166 | 30 | 196 |
No | 11 | 1201 | 1212 | |
Total | 177 | 1231 | 1408 |
Table 22.
Relevance Matrix (Judge 1 vs. Judge 3)
kappa_13, rel_mat13 = kappa(judge_1, judge_3, 'Judge 1', 'Judge 3')
rel_mat13
Judge 3 | ||||
---|---|---|---|---|
Yes | No | Total | ||
Judge 1 | Yes | 162 | 16 | 178 |
No | 15 | 1215 | 1230 | |
Total | 177 | 1231 | 1408 |
Table 23.
Kappa Statistics
pd.DataFrame(
{
"Judge 1 vs. 2": "{:0.2f}".format(kappa_12),
"Judge 2 vs. 3": "{:0.2f}".format(kappa_23),
"Judge 1 vs. 3": "{:0.2f}".format(kappa_13),
"Average Pairwise": "{:0.2f}".format(np.mean([kappa_12, kappa_23,
kappa_13])),
},
index=["Kappa Statistics"],
)
Judge 1 vs. 2 | Judge 2 vs. 3 | Judge 1 vs. 3 | Average Pairwise | |
---|---|---|---|---|
Kappa Statistics | 0.78 | 0.87 | 0.90 | 0.85 |
Cambridge UP (2008) states that values above 0.8 are taken as "good agreement", values between 0.67 and 0.8 as "fair agreement", and values below 0.67 as rendering the created standard a dubious basis for evaluation. Given that the calculated average pairwise kappa statistic falls within "good agreement", the created gold standard is thus reliable. As a side note, despite Judge 1 and Judge 2 only reaching "fair agreement" between their own sets of evaluations, this relatively lower yet sufficient kappa statistic was raised by the arbitrations of the third judge, turning the overall agreement into a "good agreement".
Moving on, in evaluating the two variants of the Information Retrieval System, several metrics would be used. To guide the parameters (i.e., k) for some metrics, a Precision-Recall vs. k graph would first be plotted with the help of functions such as pk
and rk
, and using the insights from this graph, the parameters would then be set. The important evaluation metrics are:
Table 24.
Evaluation Metrics
No. | Evaluation Metric | Description |
---|---|---|
1. | Average (Precision @ k) |
In this metric, at an arbitrarily set k (chosen based on the Precision-Recall vs. k graph), the precision is calculated per query and then averaged. This is a good metric for this type of IR system given that most users of a recommender system would rather see a few movies to watch (e.g. 5) than several. Moreover, given the relatively small dataset, returning far more results than the supposed number of relevant movies would distort the performance of the IR system to appear worse than it should be. |
2. | Average (Recall @ k) |
In this metric, at an arbitrarily set k (chosen based on the Precision-Recall vs. k graph), the recall is calculated per query and then averaged. |
3. | Average (F-1 Score @ k and beta = 0.5) |
After getting both Average (Precision @ k) and Average (Recall @ k), the F-score is calculated from these values, with more emphasis on precision over recall. Precision is emphasized (beta = 0.5) because avoiding false positives is more important than avoiding false negatives: a user of this recommendation system would not feel satisfied if the returned "relevant" results are in fact irrelevant. Compared to a scenario where the same user merely misses relevant results, being recommended an irrelevant movie has more consequences for user satisfaction than missing an opportunity to be recommended a relevant one. |
4. | Mean Average Precision (MAP) | For every query, this metric considers results up to the final relevant result (the kth result) and computes the average precision for that query. Compared to Average (Precision @ k), which only considers the precision at exactly k, this metric includes in the average every k with a corresponding relevant result, while ks with corresponding irrelevant results are disregarded. The arithmetic mean of all these averages is then calculated. Being a robust metric that provides equal weight to each query, it serves as a more reliable measure of precision than Average (Precision @ k). |
5. | Average R-Precision |
This metric takes into account how many truly relevant movies (denoted by |Rel|) there are for a given movie query, and the precision for each query is simply the Precision @ |Rel|. These are averaged, and the result can serve as both the average precision and the average recall of the IR system. As a side note, the unaveraged measure is also retrieved for each genre to gather insights on the IR system and its possible relationships to the distribution and characteristics of its dataset. |
6. | Averaged 11-point Precision/Recall Curve | The interpolated precision (the highest attainable precision) at recall levels ranging from 0 to 1 (in increments of 0.1) is calculated for each query, and the averaged interpolated precisions are then plotted, where the goal is to highlight potential increases in precision given an increase in the result set (Cambridge UP, 2009). This also consolidates all query results into one Precision/Recall Curve. |
For each variant and corresponding distance measure, all of these evaluation metrics would be calculated through functions such as pk
and a_pk
for the Average (Precision @ k), rk
and a_rk
for the Average (Recall @ k), mapk
for the Mean Average Precision (MAP), mf_onek
for the Average (F-1 Score @ k), rprec
and ave_rprec
for the (unaveraged) R-Precisions per genre and the Average R-Precision respectively, and ave_prg
for acquiring the interpolated values in plotting the Averaged 11-point Precision/Recall Curve.
To further expound on each evaluation metric, these are their formulas:
1. Average (Precision @ k) $$P_{ave, k} = \frac{\displaystyle{\sum^{N}_{i=1}}\frac{TP_{k, i}}{(TP+FP)_{k, i}}}{\textit{N}}$$
where $TP_{k,i}$ is equal to the number of true positives at k for query $i$, $(TP+FP)_{k,i}$ is equal to the number of true and false positives at k for query $i$, and $N$ is equal to the number of queries.
2. Average (Recall @ k) $$R_{ave, k} = \frac{\displaystyle{\sum^{N}_{i=1}}\frac{TP_{k, i}}{(TP+FN)_{k, i}}}{\textit{N}}$$
where $TP_{k,i}$ is equal to the number of true positives at k for query $i$, $(TP+FN)_{k,i}$ is equal to the number of true positives and false negatives at k for query $i$, and $N$ is equal to the number of queries.
3. Average (F-1 Score @ k) $$F_{\beta, ave, k} = (1 + \beta^2) \cdot \frac{P_{ave, k} \cdot R_{ave, k}}{(\beta^2 \cdot P_{ave, k}) + R_{ave, k}}$$
where $P_{ave, k}$ is the average (precision @ k), $R_{ave, k}$ is the average (recall @ k), and $\beta$ is the ratio of recall importance to precision importance.
4. Mean Average Precision $$MAP = \frac{1}{N} \displaystyle{\sum^{N}_{j=1}} \frac{1}{Q_j} \displaystyle{\sum^{Q_{j}}_{i=1}} P(doc_{i})$$
where $Q_j$ is the number of relevant documents for query j, $N$ is the number of queries, and $P(doc_{i})$ is the precision at ith relevant document (University of Cambridge Computer Laboratory).
5. Average R-Precision $$(R-Precision)_{ave} = \frac{1}{N} \displaystyle{\sum^{N}_{i=1}}\frac{r_{i}}{|Rel|_{i}}$$
where $r_i$ is the returned relevant documents for query $i$ while $|Rel|_i$ is the number of relevant documents in the corpus for query $i$. For the part where each query is evaluated, the $\frac{1}{N}$ is removed in order to acquire the R-Precision for each query (or genre) (Cambridge UP, 2009).
6. Interpolated Precision for Averaged 11-point Precision/Recall Curve $$\rho_{interp}(r) = \underset{r'\ge r}{max}\space \rho(r')$$
where $\rho_{interp}(r)$ is the interpolated precision at a certain recall level $r$, acquired for each of the eleven points by taking the highest precision among recall levels $r'$ greater than or equal to the current $r$ (Cambridge UP, 2009).
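Since helpers such as pk, rk, rprec, and the interpolation routine are defined elsewhere in the notebook, minimal sketches of the core computations behind these formulas are shown below (the function names and signatures here are illustrative assumptions, not the notebook's actual helpers):

```python
def precision_at_k(relevance, k):
    """Fraction of the top-k results that are relevant (0/1 labels)."""
    return sum(relevance[:k]) / k

def recall_at_k(relevance, k, n_relevant):
    """Fraction of all relevant items retrieved within the top k."""
    return sum(relevance[:k]) / n_relevant

def f_beta(precision, recall, beta=0.5):
    """F-score; beta < 1 weights precision more heavily than recall."""
    if precision == 0 and recall == 0:
        return 0.0
    return (1 + beta**2) * precision * recall / (beta**2 * precision + recall)

def average_precision(actual, predicted):
    """Mean of the precisions at the rank of each relevant result."""
    hits, score = 0, 0.0
    for rank, doc in enumerate(predicted, start=1):
        if doc in actual:
            hits += 1
            score += hits / rank
    return score / len(actual)

def r_precision(actual, predicted):
    """Precision at |Rel|, the number of truly relevant documents."""
    r = len(actual)
    return len(set(predicted[:r]) & set(actual)) / r

def interpolated_precision(precisions, recalls, r):
    """Highest precision at any recall level r' >= r."""
    candidates = [p for p, rec in zip(precisions, recalls) if rec >= r]
    return max(candidates) if candidates else 0.0

# Toy ranking where documents 0, 2, and 4 are the relevant ones
rel = [1, 0, 1, 0, 1]
print(precision_at_k(rel, 5))                                    # 0.6
print(recall_at_k(rel, 5, 3))                                    # 1.0
print(round(average_precision([0, 2, 4], [0, 1, 2, 3, 4]), 2))   # 0.76
print(round(r_precision([0, 2, 4], [0, 1, 2, 3, 4]), 2))         # 0.67
```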
# Preparing dictionaries
l2_dict = {'Default': {}, 'Optimal': {}}
l1_dict = {'Default': {}, 'Optimal': {}}
cos_dict = {'Default': {}, 'Optimal': {}}
for column in validation_data.columns[1:]:
# Euclidean Distance
query_d = movies_default[movies_default['title_old'] == column].iloc[0]
query_d = query_d.drop('title_old')
query_o = movies_optimal[movies_optimal['title_old'] == column].iloc[0]
query_o = query_o.drop('title_old')
l2_default = nearest_k(query_d,
movies_default.iloc[:, 1:].to_numpy(),
len(movies_default),
euclidean)
l2_optimal = nearest_k(query_o,
movies_optimal.iloc[:, 1:].to_numpy(),
len(movies_optimal),
euclidean)
l2_dict['Default'][column] = l2_default
l2_dict['Optimal'][column] = l2_optimal
# Manhattan Distance
l1_default = nearest_k(query_d,
movies_default.iloc[:, 1:].to_numpy(),
len(movies_default),
cityblock)
l1_optimal = nearest_k(query_o,
movies_optimal.iloc[:, 1:].to_numpy(),
len(movies_optimal),
cityblock)
l1_dict['Default'][column] = l1_default
l1_dict['Optimal'][column] = l1_optimal
# Cosine Distance
cos_default = nearest_k(query_d,
movies_default.iloc[:, 1:].to_numpy(),
len(movies_default),
cosine)
cos_optimal = nearest_k(query_o,
movies_optimal.iloc[:, 1:].to_numpy(),
len(movies_optimal),
cosine)
cos_dict['Default'][column] = cos_default
cos_dict['Optimal'][column] = cos_optimal
# Getting Average Precision @ k and Average Recall @ k
# Default
# Euclidean
dict_perf = {
"Euclidean": {
"Default": {"Precision": [], "Recall": []},
"Optimal": {"Precision": [], "Recall": []},
},
"Manhattan": {
"Default": {"Precision": [], "Recall": []},
"Optimal": {"Precision": [], "Recall": []},
},
"Cosine": {
"Default": {"Precision": [], "Recall": []},
"Optimal": {"Precision": [], "Recall": []},
},
}
for movie, results in l2_dict["Default"].items():
gold_standard = validation_data[movie][results]
recommended = np.ones(128)
df = pd.DataFrame({"true": gold_standard, "results": recommended})
df["results"] = df["results"].astype(int)
dict_perf["Euclidean"]["Default"]["Precision"].append(pk(
df, "true", "results", k=5))
dict_perf["Euclidean"]["Default"]["Recall"].append(
rk(df, "true", "results", k=5))
# Manhattan
for movie, results in l1_dict["Default"].items():
gold_standard = validation_data[movie][results]
recommended = np.ones(128)
df = pd.DataFrame({"true": gold_standard, "results": recommended})
df["results"] = df["results"].astype(int)
dict_perf["Manhattan"]["Default"]["Precision"].append(pk(
df, "true", "results", k=5))
dict_perf["Manhattan"]["Default"]["Recall"].append(
rk(df, "true", "results", k=5))
# Cosine
for movie, results in cos_dict["Default"].items():
gold_standard = validation_data[movie][results]
recommended = np.ones(128)
df = pd.DataFrame({"true": gold_standard, "results": recommended})
df["results"] = df["results"].astype(int)
dict_perf["Cosine"]["Default"]["Precision"].append(
pk(df, "true", "results", k=5))
dict_perf["Cosine"]["Default"]["Recall"].append(
rk(df, "true", "results", k=5))
# Optimal
# Euclidean
for movie, results in l2_dict["Optimal"].items():
gold_standard = validation_data[movie][results]
recommended = np.ones(128)
df = pd.DataFrame({"true": gold_standard, "results": recommended})
df["results"] = df["results"].astype(int)
dict_perf["Euclidean"]["Optimal"]["Precision"].append(
pk(df, "true", "results", k=5))
dict_perf["Euclidean"]["Optimal"]["Recall"].append(
rk(df, "true", "results", k=5))
# Manhattan
for movie, results in l1_dict["Optimal"].items():
gold_standard = validation_data[movie][results]
recommended = np.ones(128)
df = pd.DataFrame({"true": gold_standard, "results": recommended})
df["results"] = df["results"].astype(int)
dict_perf["Manhattan"]["Optimal"]["Precision"].append(pk(
df, "true", "results", k=5))
dict_perf["Manhattan"]["Optimal"]["Recall"].append(
rk(df, "true", "results", k=5))
# Cosine
for movie, results in cos_dict["Optimal"].items():
gold_standard = validation_data[movie][results]
recommended = np.ones(128)
df = pd.DataFrame({"true": gold_standard, "results": recommended})
df["results"] = df["results"].astype(int)
dict_perf["Cosine"]["Optimal"]["Precision"].append(
pk(df, "true", "results", k=5))
dict_perf["Cosine"]["Optimal"]["Recall"].append(
rk(df, "true", "results", k=5))
# Getting the Average (Precision @ 5) and (Average Recall @ 5)
# Getting the Average (F-1 Score @ 5)
# Default (Euclidean)
default_euc_pave = a_pk(dict_perf["Euclidean"]["Default"]["Precision"])
default_euc_rave = a_rk(dict_perf["Euclidean"]["Default"]["Recall"])
default_euc_fone = mf_onek(default_euc_pave, default_euc_rave, beta=0.5)
# Default (Manhattan)
default_man_pave = a_pk(dict_perf["Manhattan"]["Default"]["Precision"])
default_man_rave = a_rk(dict_perf["Manhattan"]["Default"]["Recall"])
default_man_fone = mf_onek(default_man_pave, default_man_rave, beta=0.5)
# Default (Cosine)
default_cos_pave = a_pk(dict_perf["Cosine"]["Default"]["Precision"])
default_cos_rave = a_rk(dict_perf["Cosine"]["Default"]["Recall"])
default_cos_fone = mf_onek(default_cos_pave, default_cos_rave, beta=0.5)
# Optimal (Euclidean)
optimal_euc_pave = a_pk(dict_perf["Euclidean"]["Optimal"]["Precision"])
optimal_euc_rave = a_rk(dict_perf["Euclidean"]["Optimal"]["Recall"])
optimal_euc_fone = mf_onek(optimal_euc_pave, optimal_euc_rave, beta=0.5)
# Optimal (Manhattan)
optimal_man_pave = a_pk(dict_perf["Manhattan"]["Optimal"]["Precision"])
optimal_man_rave = a_rk(dict_perf["Manhattan"]["Optimal"]["Recall"])
optimal_man_fone = mf_onek(optimal_man_pave, optimal_man_rave, beta=0.5)
# Optimal (Cosine)
optimal_cos_pave = a_pk(dict_perf["Cosine"]["Optimal"]["Precision"])
optimal_cos_rave = a_rk(dict_perf["Cosine"]["Optimal"]["Recall"])
optimal_cos_fone = mf_onek(optimal_cos_pave, optimal_cos_rave, beta=0.5)
# Mean Average Precision
l2_map = {
"Default": {"Actual": [], "Predicted": []},
"Optimal": {"Actual": [], "Predicted": []},
}
l1_map = {
"Default": {"Actual": [], "Predicted": []},
"Optimal": {"Actual": [], "Predicted": []},
}
cos_map = {
"Default": {"Actual": [], "Predicted": []},
"Optimal": {"Actual": [], "Predicted": []},
}
# Default
# Euclidean
for movie, results in l2_dict["Default"].items():
actual = validation_data[validation_data[movie] == 1].index.tolist()
predict = results
l2_map["Default"]["Actual"].append(actual)
l2_map["Default"]["Predicted"].append(predict)
# Manhattan
for movie, results in l1_dict["Default"].items():
actual = validation_data[validation_data[movie] == 1].index.tolist()
predict = results
l1_map["Default"]["Actual"].append(actual)
l1_map["Default"]["Predicted"].append(predict)
# Cosine
for movie, results in cos_dict["Default"].items():
actual = validation_data[validation_data[movie] == 1].index.tolist()
predict = results
cos_map["Default"]["Actual"].append(actual)
cos_map["Default"]["Predicted"].append(predict)
# Optimal
# Euclidean
for movie, results in l2_dict["Optimal"].items():
actual = validation_data[validation_data[movie] == 1].index.tolist()
predict = results
l2_map["Optimal"]["Actual"].append(actual)
l2_map["Optimal"]["Predicted"].append(predict)
# Manhattan
for movie, results in l1_dict["Optimal"].items():
actual = validation_data[validation_data[movie] == 1].index.tolist()
predict = results
l1_map["Optimal"]["Actual"].append(actual)
l1_map["Optimal"]["Predicted"].append(predict)
# Cosine
for movie, results in cos_dict["Optimal"].items():
actual = validation_data[validation_data[movie] == 1].index.tolist()
predict = results
cos_map["Optimal"]["Actual"].append(actual)
cos_map["Optimal"]["Predicted"].append(predict)
# Mean Average Precision
# Default Euclidean
default_euc_map = mapk(
l2_map["Default"]["Actual"],
l2_map["Default"]["Predicted"],
len(l2_map["Default"]["Predicted"]),
)
# Default Manhattan
default_man_map = mapk(
l1_map["Default"]["Actual"],
l1_map["Default"]["Predicted"],
len(l1_map["Default"]["Predicted"]),
)
# Default Cosine
default_cos_map = mapk(
cos_map["Default"]["Actual"],
cos_map["Default"]["Predicted"],
len(cos_map["Default"]["Predicted"]),
)
# Optimal Euclidean
optimal_euc_map = mapk(
l2_map["Optimal"]["Actual"],
l2_map["Optimal"]["Predicted"],
len(l2_map["Optimal"]["Predicted"]),
)
# Optimal Manhattan
optimal_man_map = mapk(
l1_map["Optimal"]["Actual"],
l1_map["Optimal"]["Predicted"],
len(l1_map["Optimal"]["Predicted"]),
)
# Optimal Cosine
optimal_cos_map = mapk(
cos_map["Optimal"]["Actual"],
cos_map["Optimal"]["Predicted"],
len(cos_map["Optimal"]["Predicted"]),
)
# R-Precision
# Default Euclidean
default_euc_rp = ave_rprec(
l2_map["Default"]["Actual"],
l2_map["Default"]["Predicted"],
)
# Default Manhattan
default_man_rp = ave_rprec(
l1_map["Default"]["Actual"],
l1_map["Default"]["Predicted"],
)
# Default Cosine
default_cos_rp = ave_rprec(
cos_map["Default"]["Actual"],
cos_map["Default"]["Predicted"],
)
# Optimal Euclidean
optimal_euc_rp = ave_rprec(
l2_map["Optimal"]["Actual"],
l2_map["Optimal"]["Predicted"],
)
# Optimal Manhattan
optimal_man_rp = ave_rprec(
l1_map["Optimal"]["Actual"],
l1_map["Optimal"]["Predicted"],
)
# Optimal Cosine
optimal_cos_rp = ave_rprec(
cos_map["Optimal"]["Actual"],
cos_map["Optimal"]["Predicted"],
)
# Eleven-Point Precision/Recall Curve
# Preparing list for queries
queries = {'Default': [], 'Optimal': []}
gold_standards = []
for column in validation_data.columns[1:]:
query_d = movies_default[movies_default['title_old'] == column].iloc[0]
query_d = query_d.drop('title_old')
query_o = movies_optimal[movies_optimal['title_old'] == column].iloc[0]
query_o = query_o.drop('title_old')
queries['Default'].append(query_d)
queries['Optimal'].append(query_o)
standard = validation_data[column]
gold_standards.append(standard)
# Preparing main database
data_default = movies_default.iloc[:, 1:].to_numpy()
data_optimal = movies_optimal.iloc[:, 1:].to_numpy()
# R-Precision per Genre
dict_genre = {
"Euclidean": {
"Default": [],
"Optimal": [],
},
"Manhattan": {
"Default": [],
"Optimal": [],
},
"Cosine": {
"Default": [],
"Optimal": [],
},
}
for a, p in zip(l2_map['Default']['Actual'], l2_map['Default']['Predicted']):
dict_genre['Euclidean']['Default'].append(rprec(a, p))
for a, p in zip(l1_map['Default']['Actual'], l1_map['Default']['Predicted']):
dict_genre['Manhattan']['Default'].append(rprec(a, p))
for a, p in zip(cos_map['Default']['Actual'], cos_map['Default']['Predicted']):
dict_genre['Cosine']['Default'].append(rprec(a, p))
for a, p in zip(l2_map['Optimal']['Actual'], l2_map['Optimal']['Predicted']):
dict_genre['Euclidean']['Optimal'].append(rprec(a, p))
for a, p in zip(l1_map['Optimal']['Actual'], l1_map['Optimal']['Predicted']):
dict_genre['Manhattan']['Optimal'].append(rprec(a, p))
for a, p in zip(cos_map['Optimal']['Actual'], cos_map['Optimal']['Predicted']):
dict_genre['Cosine']['Optimal'].append(rprec(a, p))
# Getting Average Precision @ k and Average Recall @ k
# Default
# Euclidean
dict_k_check = {
"Euclidean": {
"Default": {"Precision": {}, "Recall": {}},
"Optimal": {"Precision": {}, "Recall": {}},
},
"Manhattan": {
"Default": {"Precision": {}, "Recall": {}},
"Optimal": {"Precision": {}, "Recall": {}},
},
"Cosine": {
"Default": {"Precision": {}, "Recall": {}},
"Optimal": {"Precision": {}, "Recall": {}},
},
}
for i in range(1, 129):
dict_k_check["Euclidean"]["Default"]["Precision"][i] = []
dict_k_check["Euclidean"]["Default"]["Recall"][i] = []
for movie, results in l2_dict["Default"].items():
gold_standard = validation_data[movie][results]
recommended = np.ones(128)
df = pd.DataFrame({"true": gold_standard, "results": recommended})
df["results"] = df["results"].astype(int)
dict_k_check["Euclidean"]["Default"]["Precision"][i].append(pk(
df, "true", "results", k=i))
dict_k_check["Euclidean"]["Default"]["Recall"][i].append(
rk(df, "true", "results", k=i))
# Manhattan
for i in range(1, 129):
dict_k_check["Manhattan"]["Default"]["Precision"][i] = []
dict_k_check["Manhattan"]["Default"]["Recall"][i] = []
for movie, results in l1_dict["Default"].items():
gold_standard = validation_data[movie][results]
recommended = np.ones(128)
df = pd.DataFrame({"true": gold_standard, "results": recommended})
df["results"] = df["results"].astype(int)
dict_k_check["Manhattan"]["Default"]["Precision"][i].append(pk(
df, "true", "results", k=i))
dict_k_check["Manhattan"]["Default"]["Recall"][i].append(
rk(df, "true", "results", k=i))
# Cosine
for i in range(1, 129):
dict_k_check["Cosine"]["Default"]["Precision"][i] = []
dict_k_check["Cosine"]["Default"]["Recall"][i] = []
for movie, results in cos_dict["Default"].items():
gold_standard = validation_data[movie][results]
recommended = np.ones(128)
df = pd.DataFrame({"true": gold_standard, "results": recommended})
df["results"] = df["results"].astype(int)
dict_k_check["Cosine"]["Default"]["Precision"][i].append(
pk(df, "true", "results", k=i))
dict_k_check["Cosine"]["Default"]["Recall"][i].append(
rk(df, "true", "results", k=i))
# Optimal
# Euclidean
for i in range(1, 129):
dict_k_check["Euclidean"]["Optimal"]["Precision"][i] = []
dict_k_check["Euclidean"]["Optimal"]["Recall"][i] = []
for movie, results in l2_dict["Optimal"].items():
gold_standard = validation_data[movie][results]
recommended = np.ones(128)
df = pd.DataFrame({"true": gold_standard, "results": recommended})
df["results"] = df["results"].astype(int)
dict_k_check["Euclidean"]["Optimal"]["Precision"][i].append(
pk(df, "true", "results", k=i))
dict_k_check["Euclidean"]["Optimal"]["Recall"][i].append(
rk(df, "true", "results", k=i))
# Manhattan
for i in range(1, 129):
dict_k_check["Manhattan"]["Optimal"]["Precision"][i] = []
dict_k_check["Manhattan"]["Optimal"]["Recall"][i] = []
for movie, results in l1_dict["Optimal"].items():
gold_standard = validation_data[movie][results]
recommended = np.ones(128)
df = pd.DataFrame({"true": gold_standard, "results": recommended})
df["results"] = df["results"].astype(int)
dict_k_check["Manhattan"]["Optimal"]["Precision"][i].append(pk(
df, "true", "results", k=i))
dict_k_check["Manhattan"]["Optimal"]["Recall"][i].append(
rk(df, "true", "results", k=i))
# Cosine
for i in range(1, 129):
dict_k_check["Cosine"]["Optimal"]["Precision"][i] = []
dict_k_check["Cosine"]["Optimal"]["Recall"][i] = []
for movie, results in cos_dict["Optimal"].items():
gold_standard = validation_data[movie][results]
recommended = np.ones(128)
df = pd.DataFrame({"true": gold_standard, "results": recommended})
df["results"] = df["results"].astype(int)
dict_k_check["Cosine"]["Optimal"]["Precision"][i].append(
pk(df, "true", "results", k=i))
dict_k_check["Cosine"]["Optimal"]["Recall"][i].append(
rk(df, "true", "results", k=i))
xs_k = np.arange(1, 129, 1)
ys_pk = list()
ys_rk = list()
for i in range(1, 129):
arr_pk = np.asarray(dict_k_check["Euclidean"]["Default"]["Precision"][i])
arr_rk = np.asarray(dict_k_check["Euclidean"]["Default"]["Recall"][i])
y_pk = np.mean(arr_pk)
y_rk = np.mean(arr_rk)
ys_pk.append(y_pk)
ys_rk.append(y_rk)
ys_pk2 = list()
ys_rk2 = list()
for i in range(1, 129):
arr_pk = np.asarray(dict_k_check["Manhattan"]["Default"]["Precision"][i])
arr_rk = np.asarray(dict_k_check["Manhattan"]["Default"]["Recall"][i])
y_pk = np.mean(arr_pk)
y_rk = np.mean(arr_rk)
ys_pk2.append(y_pk)
ys_rk2.append(y_rk)
ys_pk3 = list()
ys_rk3 = list()
for i in range(1, 129):
arr_pk = np.asarray(dict_k_check["Cosine"]["Default"]["Precision"][i])
arr_rk = np.asarray(dict_k_check["Cosine"]["Default"]["Recall"][i])
y_pk = np.mean(arr_pk)
y_rk = np.mean(arr_rk)
ys_pk3.append(y_pk)
ys_rk3.append(y_rk)
ys_pk4 = list()
ys_rk4 = list()
for i in range(1, 129):
arr_pk = np.asarray(dict_k_check["Euclidean"]["Optimal"]["Precision"][i])
arr_rk = np.asarray(dict_k_check["Euclidean"]["Optimal"]["Recall"][i])
y_pk = np.mean(arr_pk)
y_rk = np.mean(arr_rk)
ys_pk4.append(y_pk)
ys_rk4.append(y_rk)
ys_pk5 = list()
ys_rk5 = list()
for i in range(1, 129):
arr_pk = np.asarray(dict_k_check["Manhattan"]["Optimal"]["Precision"][i])
arr_rk = np.asarray(dict_k_check["Manhattan"]["Optimal"]["Recall"][i])
y_pk = np.mean(arr_pk)
y_rk = np.mean(arr_rk)
ys_pk5.append(y_pk)
ys_rk5.append(y_rk)
ys_pk6 = list()
ys_rk6 = list()
for i in range(1, 129):
arr_pk = np.asarray(dict_k_check["Cosine"]["Optimal"]["Precision"][i])
arr_rk = np.asarray(dict_k_check["Cosine"]["Optimal"]["Recall"][i])
y_pk = np.mean(arr_pk)
y_rk = np.mean(arr_rk)
ys_pk6.append(y_pk)
ys_rk6.append(y_rk)
fig, ax = plt.subplots()
ax.set_xlabel("k")
ax.set_ylabel("Precision/Recall")
ax.set_title("Precision/Recall Measures @ k")
ax.plot(xs_k, ys_pk, "--r")
ax.plot(xs_k, ys_rk, "--g")
ax.plot(xs_k, ys_pk2, "--r")
ax.plot(xs_k, ys_rk2, "--g")
ax.plot(xs_k, ys_pk3, "--r")
ax.plot(xs_k, ys_rk3, "--g")
ax.plot(xs_k, ys_pk4, "--r")
ax.plot(xs_k, ys_rk4, "--g")
ax.plot(xs_k, ys_pk5, "--r")
ax.plot(xs_k, ys_rk5, "--g")
ax.plot(xs_k, ys_pk6, "--r")
ax.plot(xs_k, ys_rk6, "--g")
ax.legend(['Precisions @ k', 'Recalls @ k'])
plt.show()
Based on this graph, around k = 5 would be the best choice, where there is an acceptable and balanced trade-off between precision and recall. Moreover, the graph shows great differences among the recall measures of the IR variants, while the differences in their precision measures are small or minimal. The next table highlights and explores these numerical differences further.
Table 25.
General Evaluation Metrics Table
# Main Table for Metrics
df_final = pd.DataFrame(
columns=[
"Vectorizer Settings",
"Distance Measure",
"Average\n(Precision @ 5)",
"Average\n(Recall @ 5)",
"Average\n(F-score @ 5)",
"Mean Average Precision (MAP)",
"R-Precision",
"AUC-PR (Averaged Eleven-Point)",
]
)
df_final.loc[0] = [
"Default",
"Euclidean (L2)",
"{:0.2f}".format(default_euc_pave),
"{:0.2f}".format(default_euc_rave),
"{:0.2f}".format(default_euc_fone),
"{:0.2f}".format(default_euc_map),
"{:0.2f}".format(default_euc_rp),
0.54,
]
df_final.loc[1] = [
'"Optimal"',
"Euclidean (L2)",
"{:0.2f}".format(optimal_euc_pave),
"{:0.2f}".format(optimal_euc_rave),
"{:0.2f}".format(optimal_euc_fone),
"{:0.2f}".format(optimal_euc_map),
"{:0.2f}".format(optimal_euc_rp),
0.52,
]
df_final.loc[2] = [
"Default",
"Manhattan (L1)",
"{:0.2f}".format(default_man_pave),
"{:0.2f}".format(default_man_rave),
"{:0.2f}".format(default_man_fone),
"{:0.2f}".format(default_man_map),
"{:0.2f}".format(default_man_rp),
0.43,
]
df_final.loc[3] = [
'"Optimal"',
"Manhattan (L1)",
"{:0.2f}".format(optimal_man_pave),
"{:0.2f}".format(optimal_man_rave),
"{:0.2f}".format(optimal_man_fone),
"{:0.2f}".format(optimal_man_map),
"{:0.2f}".format(optimal_man_rp),
0.44,
]
df_final.loc[4] = [
"Default",
"Cosine",
"{:0.2f}".format(default_cos_pave),
"{:0.2f}".format(default_cos_rave),
"{:0.2f}".format(default_cos_fone),
"{:0.2f}".format(default_cos_map),
"{:0.2f}".format(default_cos_rp),
0.58,
]
df_final.loc[5] = [
'"Optimal"',
"Cosine",
"{:0.2f}".format(optimal_cos_pave),
"{:0.2f}".format(optimal_cos_rave),
"{:0.2f}".format(optimal_cos_fone),
"{:0.2f}".format(optimal_cos_map),
"{:0.2f}".format(optimal_cos_rp),
0.57,
]
pretty_print(df_final)
| | Vectorizer Settings | Distance Measure | Average (Precision @ 5) | Average (Recall @ 5) | Average (F-score @ 5) | Mean Average Precision (MAP) | R-Precision | AUC-PR (Averaged Eleven-Point) |
|---|---|---|---|---|---|---|---|---|
| 0 | Default | Euclidean (L2) | 0.50 | 0.54 | 0.51 | 0.53 | 0.56 | 0.54 |
| 1 | "Optimal" | Euclidean (L2) | 0.46 | 0.50 | 0.47 | 0.52 | 0.51 | 0.52 |
| 2 | Default | Manhattan (L1) | 0.36 | 0.38 | 0.36 | 0.40 | 0.40 | 0.43 |
| 3 | "Optimal" | Manhattan (L1) | 0.40 | 0.42 | 0.40 | 0.43 | 0.44 | 0.44 |
| 4 | Default | Cosine | 0.56 | 0.59 | 0.57 | 0.57 | 0.59 | 0.58 |
| 5 | "Optimal" | Cosine | 0.54 | 0.57 | 0.55 | 0.57 | 0.59 | 0.57 |
As seen above, comparing the metrics between Default and "Optimal" settings, the "optimal" IR mostly underperforms relative to the IR with default vectorizer settings. This may hint that a dataset with relatively few data points does not require as much tuning as a more extensive dataset would. Moreover, cosine distance outperforms all the other distance measures, making it the best distance measure for this dataset.
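One plausible reason cosine wins on TF-IDF text vectors is that it compares term *profiles* while ignoring vector magnitude (e.g., how long a synopsis is), whereas Euclidean and Manhattan distances are sensitive to magnitude. A minimal sketch with hypothetical TF-IDF vectors:

```python
import numpy as np

def cosine_dist(a, b):
    """1 - cosine similarity: depends only on the angle between vectors."""
    return 1.0 - np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b))

def euclidean_dist(a, b):
    """Straight-line distance: grows with differences in magnitude."""
    return np.linalg.norm(a - b)

# Hypothetical TF-IDF vectors over three terms.
doc_a = np.array([1.0, 2.0, 0.0])
doc_b = np.array([2.0, 4.0, 0.0])   # same term profile as doc_a, twice the weight
doc_c = np.array([0.0, 1.0, 2.0])   # genuinely different term profile

# Cosine treats doc_a and doc_b as identical (distance 0);
# Euclidean still separates them because of the magnitude gap.
```

Here `cosine_dist(doc_a, doc_b)` is 0 while `euclidean_dist(doc_a, doc_b)` is about 2.24, and only `doc_c` looks far away under cosine; this sensitivity to document length rather than content is one candidate explanation for the weaker Euclidean and Manhattan rows in the table above.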
Interpreting each evaluation metric,
- Comparing the MAP with the Average (Precision @ 5): since MAP is known to fully represent all queries in a single measure (Cambridge UP, 2009), the selected k, based on the previous figure, approaches the best threshold for the IR, where precision and recall are well balanced.
- Moreover, for k = 5, a precision of around 0.40-0.50 means that 40 to 50% of the IR system's results are relevant, while the Average (Recall @ 5) shows that the model returns about 50% of the relevant documents. This may imply that, on average, the system retrieves most of the relevant documents for some queries but fails to do so for others. This could be linked to the unequal distribution of the data points, which is discussed in detail later.
- Lastly, R-Precision, which approximates the best recall and precision for each query, suggests that better accuracy would be easier to achieve if there were an ideally equal distribution of relevant movies for each movie (e.g., through their genre, language, and such).
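The ranked-retrieval metrics discussed above can be sketched from their standard definitions (Manning et al.). This is an illustrative implementation with toy queries, not the notebook's actual evaluation code:

```python
def average_precision(ranked, relevant):
    """Mean of the precision values at each rank where a relevant doc appears."""
    hits, total = 0, 0.0
    for rank, doc in enumerate(ranked, start=1):
        if doc in relevant:
            hits += 1
            total += hits / rank
    return total / len(relevant) if relevant else 0.0

def r_precision(ranked, relevant):
    """Precision at cutoff R, where R = number of relevant docs for the query."""
    R = len(relevant)
    return sum(1 for doc in ranked[:R] if doc in relevant) / R if R else 0.0

# Hypothetical queries: (ranked results, gold-standard relevant set).
queries = [
    (["a", "x", "b"], {"a", "b"}),  # relevant hits at ranks 1 and 3
    (["y", "c"], {"c"}),            # relevant hit at rank 2
]

# MAP is simply the mean of average precision over all queries.
map_score = sum(average_precision(r, g) for r, g in queries) / len(queries)
```

Because R-Precision cuts each ranking at that query's own number of relevant documents, it lets queries with very different relevant-set sizes be compared on an equal footing, which is why it is used per genre in the next table.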
Table 26.
R-Precision per Setting, Distance Measure, and Genre
# Main Table for Metrics
df_genre = pd.DataFrame(
columns=[
"Vectorizer Settings",
"Distance Measure",
"Drama",
"Comedy",
"Mystery and Thriller",
"Horror",
"Romance",
"Action",
"Sci-fi",
"Adventure",
"LGBTQA+",
"Fantasy",
]
)
lst_settings = ['Default', 'Optimal', 'Default', 'Optimal', 'Default',
'Optimal']
lst_distances = ['Euclidean', 'Euclidean', 'Manhattan', 'Manhattan',
'Cosine', 'Cosine']
for i in range(6):
    # First two entries label the row; the rest are the ten per-genre
    # R-Precision scores in column order.
    df_genre.loc[i] = [lst_settings[i], lst_distances[i]] + [
        "{:0.2f}".format(score)
        for score in dict_genre[lst_distances[i]][lst_settings[i]][:10]
    ]
df_genre
| | Vectorizer Settings | Distance Measure | Drama | Comedy | Mystery and Thriller | Horror | Romance | Action | Sci-fi | Adventure | LGBTQA+ | Fantasy |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | Default | Euclidean | 0.67 | 0.50 | 0.67 | 0.40 | 0.29 | 0.50 | 0.50 | 1.00 | 0.75 | 0.33 |
| 1 | Optimal | Euclidean | 0.67 | 0.50 | 0.67 | 0.40 | 0.29 | 0.50 | 0.50 | 0.50 | 0.75 | 0.33 |
| 2 | Default | Manhattan | 0.50 | 0.25 | 0.33 | 0.20 | 0.29 | 0.50 | 0.25 | 0.75 | 0.75 | 0.17 |
| 3 | Optimal | Manhattan | 0.67 | 0.25 | 0.67 | 0.40 | 0.29 | 0.67 | 0.50 | 0.25 | 0.17 | 0.17 |
| 4 | Default | Cosine | 0.83 | 0.50 | 0.67 | 0.40 | 0.29 | 0.50 | 0.50 | 1.00 | 0.75 | 0.50 |
| 5 | Optimal | Cosine | 0.83 | 0.50 | 0.67 | 0.40 | 0.29 | 0.67 | 0.50 | 0.75 | 0.75 | 0.50 |
In Table 26, each genre is represented by one movie, and R-Precision was used because it accounts for how many truly relevant documents exist for a given query; it can therefore be used to compare queries with one another, since their respective ks are adjusted on a case-to-case basis. With regard to the table, the large population of drama movies may have contributed to drama's average-to-high precision. However, adventure and LGBTQA+ contradict this: they have only average representation in the dataset (based on their frequencies) yet return most to all of their relevant documents at a k equal to the number of relevant documents. This may signify that the chosen movie for each of these genres is a good representative of all movies in that genre; moreover, the synopses and critiques within these genres may use the same "vernacular" in writing, which may make it easier to find similar, relevant movies. If representativeness is the only reason, then obtaining a more impartial evaluation of the IR would require acquiring more "users" or queries to feed the system, so as to dampen the effect of coincidentally highly representative queries.
Hence, while a high number of data points, from which more relevant features may be derived, may help raise accuracy, having relatively few data points for a given characteristic does not immediately lead to lower accuracy for that characteristic. The outcome may still depend on the features derived from the data points and on the "coincidental" representativeness of the chosen query.
# Eleven-Point Precision/Recall Curve
# Default Euclidean
ave_prg(queries['Default'], data_default, euclidean, 1, gold_standards)
plt.show()
Seen above is the usual trend for PR curves, where precision tends to go down as recall goes up, and vice versa. Notably, the IR system with these settings can immediately return relevant movies, but it tends to underperform as more results are returned. The point (0.5, 0.5) serves as the best trade-off between the two metrics; however, if precision is more desired than recall, lowering the recall (or k) may help further the goal of higher precision.
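The eleven-point interpolated averaging behind these curves and the AUC-PR column (Zhang & Zhang, 2009) can be sketched as follows. This is a minimal illustration with made-up precision/recall points, not the notebook's `ave_prg` implementation:

```python
import numpy as np

def eleven_point_interpolated(precisions, recalls):
    """Interpolated precision at recall levels 0.0, 0.1, ..., 1.0.

    At each level, take the maximum precision observed at any recall
    greater than or equal to that level (standard interpolation rule).
    """
    precisions = np.asarray(precisions)
    recalls = np.asarray(recalls)
    levels = np.linspace(0.0, 1.0, 11)
    interp = []
    for level in levels:
        mask = recalls >= level
        interp.append(precisions[mask].max() if mask.any() else 0.0)
    return levels, np.array(interp)

# Hypothetical PR points from one ranked list with 3 relevant documents,
# hit at ranks 1, 3, and 5.
prec = [1.0, 0.5, 0.667, 0.5, 0.6]
rec  = [1 / 3, 1 / 3, 2 / 3, 2 / 3, 1.0]
levels, interp = eleven_point_interpolated(prec, rec)
auc_11pt = interp.mean()  # averaged eleven-point AUC-PR
```

Averaging such interpolated curves over all queries, and then averaging the eleven interpolated precisions, yields a single AUC-PR-style summary per IR variant like the ones reported in Table 25.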
# Default Manhattan
ave_prg(queries['Default'], data_default, cityblock, 1, gold_standards)
plt.show()
The trend is similar; however, using the Manhattan distance makes the IR system underperform, as evidenced by the lower area under the curve.
# Default Cosine
ave_prg(queries['Default'], data_default, cosine, 1, gold_standards)
plt.show()
As seen in the figure above, using the cosine distance provides a relatively high area under the curve. Unlike the PR curve for default Euclidean, this suggests that the recall (or k) could be raised higher while still providing a relatively high amount of precision.
# Optimal Euclidean
ave_prg(queries['Optimal'], data_optimal, euclidean, 1, gold_standards)
plt.show()
There are no unusual observations for this figure, except that "optimizing" the vectorizer settings led to a lower area under the curve, and thus less leverage for lowering the recall to increase the precision.
# Optimal Manhattan
ave_prg(queries['Optimal'], data_optimal, cityblock, 1, gold_standards)
plt.show()
There are no unusual observations for this figure, except that "optimizing" the vectorizer settings actually led to a minuscule increase in the area under the curve for the Manhattan distance.
# Optimal Cosine
ave_prg(queries['Optimal'], data_optimal, cosine, 1, gold_standards)
plt.show()
Similarly, there are no notable observations for this graph, except that "optimizing" the vectorizer led to a minuscule decrease in the area under the curve and to a more apparent "kink" around recall = 0.5, which may signify a slight increase in precision even as recall increases. This is significant because it can serve as an argument for increasing the would-be k by a small margin, given that precision can still rise even with a slight increase in recall.
Hence, the best Information Retrieval System is the Default Cosine at k = 5, given its relatively high performance compared to the other variants. To see this system in action, go to the notebook (if you are viewing the HTML version of the report), run all the cells, select the number of results (k) and the movie title to search similar movies for from the dropdown menus, and click Search to acquire a list of results deemed relevant by the system.
search_k_title()
In conclusion,
- The best Information Retrieval System is the Default Cosine at k = 5. The facts regarding this are expounded below.
- Changing the vectorizer settings is most appropriate when dealing with highly extensive and large datasets, as compared to relatively few data points. As exemplified in this laboratory experiment, accuracies were mostly lower when the "optimizations" were added and mostly higher when the default settings were used.
- Returning results where recall is around 0.5 and k is around 5 (as advised by the Precision-Recall vs. k graph) provides the best trade-off between precision and recall, with both around 0.5. This is feasible for a small-scale recommendation system, such as the sections of movie review websites that state, "If you like this movie, we recommend these movies too".
- For movie recommendation systems, systems which make use of textual data, and IR systems which make use of similar data (size, theme, features, etc.), using cosine distance is recommended, as it provided the highest accuracy out of all the distance measures.
- Using highly extensive and larger datasets may lead to higher accuracies, as it increases the chances of an equal distribution among data points, of acquiring well-representative, relevant, and useful features, and of having more relevant data points for a given query.
- Lastly, coincidentally selecting a highly representative query or exemplar could produce highly skewed accuracy scores; it is therefore recommended to acquire more "users" or queries for evaluating the system, so as to obtain "truer" or more genuine evaluation metrics.
Some recommendations for further improvement of this project and if this project would be replicated are:
- Create hypotheses for each change in setting, so as to ascertain which setting improves the performance of the IR system, instead of lumping all changes together and comparing them against the control.
- Increase the size of the dataset to accommodate other movies and to further refine the IR system's ability to recognize similar or relevant movies, through the introduction of more representative features and data points.
- Increase the number of queries to avoid skewing the evaluation metrics and to reduce the chance of acquiring mostly representative or mostly unrepresentative queries.
- Moreover, hiring or outsourcing a number of "movie watchers" would help further refine the gold standard and better address the subjective question of which movies are relevant.
- Compare other distance measures against cosine distance as the point of comparison, to further study whether cosine distance is indeed the best metric for text-based information retrieval, movie recommendation systems, or datasets similar to the one used in this study.
We express our heartfelt gratitude to our parents, family, and friends for their unwavering support throughout our data science journey. Additionally, we extend our sincere thanks to Professor Alis and all our esteemed professors for their continuous inspiration, fostering our curiosity in the field of data science, and encouraging us to always seek new knowledge.
A special note of appreciation goes to Professor Alis for his dedicated guidance. We are thankful for the thought-provoking problem sets and exercises he provides, which have significantly contributed to our practical understanding and skills in the realm of data science. His commitment to our academic development has been instrumental, and we are grateful for the invaluable learning experiences he has facilitated.
[1] Cambridge UP. (2008). Assessing Relevance. Retrieved from https://nlp.stanford.edu/IR-book/html/htmledition/assessing-relevance-1.html
[2] Cambridge UP. (2009). Introduction to Information Retrieval - Chapter 8: Evaluation in Information Retrieval. Retrieved from https://nlp.stanford.edu/IR-book/pdf/08eval.pdf
[3] Ganesan, K. (n.d.). Tips for Constructing Custom Stop Word Lists. Retrieved from https://kavita-ganesan.com/tips-for-constructing-custom-stop-word-lists/
[4] Ganesan, K. (n.d.). How to Use CountVectorizer. Retrieved from https://kavita-ganesan.com/how-to-use-countvectorizer/
[5] Inside Learning Machines. (n.d.). Precision@K and Recall@K. Retrieved from https://insidelearningmachines.com/precisionk_and_recallk/
[6] Manning, C. D., Raghavan, P., & Schütze, H. (n.d.). Evaluation of Ranked Retrieval Results. Retrieved from https://nlp.stanford.edu/IR-book/html/htmledition/evaluation-of-ranked-retrieval-results-1.html
[7] Nandeshwar. (2022). Mean Average Precision (MAP) - K Metric Explained + Code. Retrieved from https://www.kaggle.com/code/nandeshwar/mean-average-precision-map-k-metric-explained-code
[8] Northeastern University Khoury College of Computer Sciences. (2015). Information Retrieval User Study: Evaluation (Lecture Notes). Retrieved from https://www.khoury.northeastern.edu/home/vip/teach/IRcourse/5_eval_userstudy/lecture_notes/IR_V6.pdf
[9] PopFlick. (n.d.). Film Industry Statistics. Retrieved from https://www.popflick.com/film-industry-statistics
[10] PracticeProbs. (n.d.). Precision and Recall. Retrieved from https://www.practiceprobs.com/problemsets/metrics/precision-and-recall/#average-precision-at-k-apk
[11] Towards Data Science. (2023). Evaluation Metrics for Recommendation Systems: An Overview. Retrieved from https://towardsdatascience.com/evaluation-metrics-for-recommendation-systems-an-overview-71290690ecba
[12] University of Cambridge Computer Laboratory. (n.d.). Lecture 5: Evaluation (Information Retrieval). Retrieved from https://www.cl.cam.ac.uk/teaching/1415/InfoRtrv/lecture5.pdf
[13] Zhang, E., & Zhang, Y. (2009). Eleven Point Precision-recall Curve. In L. Liu & M. T. Özsu (Eds.), Encyclopedia of Database Systems. Springer, Boston, MA. https://doi.org/10.1007/978-0-387-39940-9_481