Source code for catalyst.contrib.datasets.movielens

import itertools
import os

import numpy as np
import pandas as pd
import scipy.sparse as sp

import torch
from torch.utils.data import Dataset

from catalyst.contrib.datasets.misc import download_and_extract_archive


[docs]class MovieLens(Dataset):
    """
    MovieLens data sets were collected by the GroupLens Research Project
    at the University of Minnesota.

    This data set consists of:
    * 100,000 ratings (1-5) from 943 users on 1682 movies.
    * Each user has rated at least 20 movies.
    * Simple demographic info for the users
    (age, gender, occupation, zip)

    The data was collected through the MovieLens web site
    (movielens.umn.edu) during the seven-month period from September 19th,
    1997 through April 22nd, 1998. This data has been cleaned up - users
    who had less than 20 ratings or did not have complete demographic
    information were removed from this data set. Detailed descriptions of
    the data file can be found at the end of this file.

    Neither the University of Minnesota nor any of the researchers
    involved can guarantee the correctness of the data, its suitability
    for any particular purpose, or the validity of results based on the
    use of the data set.  The data set may be used for any research
    purposes under the following conditions:
    * The user may not state or imply any endorsement from the
    University of Minnesota or the GroupLens Research Group.
    * The user must acknowledge the use of the data set in
    publications resulting from the use of the data set
    (see below for citation information).
    * The user may not redistribute the data without separate
    permission.
    * The user may not use this information for any commercial or
    revenue-bearing purposes without first obtaining permission
    from a faculty member of the GroupLens Research Project at the
    University of Minnesota.

    If you have any further questions or comments, please contact GroupLens
    <grouplens-info@cs.umn.edu>.
    http://files.grouplens.org/datasets/movielens/ml-100k-README.txt

    .. note::
        catalyst[ml] required for this dataset.
    """

    resources = (
        "http://files.grouplens.org/datasets/movielens/ml-100k.zip",
        "0e33842e24a9c977be4e0107933c0723",
    )
    filename = "ml-100k.zip"
    training_file = "training.pt"
    test_file = "test.pt"

[docs]    def __init__(self, root, train=True, download=False, min_rating=0.0):
        """
        Args:
            root (string): Root directory of dataset where
                ``MovieLens/processed/training.pt``
                and  ``MovieLens/processed/test.pt`` exist.
            train (bool, optional): If True, creates dataset from
                ``training.pt``, otherwise from ``test.pt``.
            download (bool, optional): If true, downloads the dataset from
                the internet and puts it in root directory. If dataset
                is already downloaded, it is not downloaded again.
            min_rating (float, optional): Minimum rating to include in
                the interaction matrix

        Raises:
            RuntimeError: If ``download is False`` and the dataset not found.
        """
        if isinstance(root, torch._six.string_classes):
            root = os.path.expanduser(root)

        self.root = root
        self.train = train
        self.min_rating = min_rating

        if download:
            self._download()

        self._fetch_movies()

        if not self._check_exists():
            raise RuntimeError("Dataset not found. Set `download=True`")

        if self.train:
            data_file = self.training_file
        else:
            data_file = self.test_file

        self.data = torch.load(os.path.join(self.processed_folder, data_file))

    def __getitem__(self, user_index):
        """Get item.

        Args:
            user_index (int): User index [0, 942]

        Returns:
            tensor: (items) item's ranking for the user with index user_index
        """
        return self.data[user_index]

    def __len__(self):
        """The length of the loader"""
        return self.dimensions[0]

    @property
    def raw_folder(self):
        """Create raw folder for data download

        Returns:
            raw_path (path): raw folder path
        """
        return os.path.join(self.root, self.__class__.__name__, "raw")

    @property
    def processed_folder(self):
        """Create the folder for the processed files

        Returns:
            raw_path (path): processed folder path
        """
        return os.path.join(self.root, self.__class__.__name__, "processed")

    def _check_exists(self):
        """Check if the path for tarining and testing data
        exists in processed folder.

        Returns:
            raw_path (path): processed folder path
        """
        return os.path.exists(
            os.path.join(self.processed_folder, self.training_file)
        ) and os.path.exists(os.path.join(self.processed_folder, self.test_file))

    def _download(self):
        """Download and extract files/"""
        if self._check_exists():
            return

        os.makedirs(self.raw_folder, exist_ok=True)
        os.makedirs(self.processed_folder, exist_ok=True)
        url = self.resources[0]
        md5 = self.resources[1]

        download_and_extract_archive(
            url=url,
            download_root=self.raw_folder,
            filename=self.filename,
            md5=md5,
            remove_finished=True,
        )

    def _read_raw_movielens_data(self):
        """Return the raw lines of the train and test files."""
        path = self.raw_folder

        with open(path + "/ml-100k/ua.base") as datafile:
            ua_base = datafile.read().split("\n")

        with open(path + "/ml-100k/ua.test") as datafile:
            ua_test = datafile.read().split("\n")

        with open(path + "/ml-100k/u.item", encoding="ISO-8859-1") as datafile:
            u_item = datafile.read().split("\n")

        with open(path + "/ml-100k/u.genre") as datafile:
            u_genre = datafile.read().split("\n")

        return (ua_base, ua_test, u_item, u_genre)

    def _build_interaction_matrix(self, rows, cols, data):
        """Builds interaction matrix.

        Args:
            rows (int): rows of the oevrall dataset
            cols (int): columns of the overall dataset
            data (generator object): generator of
            the data object

        Returns:
            interaction_matrix (torch.sparse.Float):
            sparse user2item interaction matrix
        """
        mat = sp.lil_matrix((rows, cols), dtype=np.int32)

        for uid, iid, rating, _ in data:
            if rating >= self.min_rating:
                mat[uid, iid] = rating
        coo = mat.tocoo()
        values = coo.data
        indices = np.vstack((coo.row, coo.col))

        i = torch.LongTensor(indices)
        v = torch.FloatTensor(values)
        shape = coo.shape

        interaction_matrix = torch.sparse.FloatTensor(i, v, torch.Size(shape)).to_dense()
        return interaction_matrix

    def _parse(self, data):
        """Parses the raw data. Substract one to shift to zero based indexing

        Args:
            data: raw data of the dataset

        Yields:
            Generator iterator for parsed data
        """
        for line in data:

            if not line:
                continue

            uid, iid, rating, timestamp = [int(x) for x in line.split("\t")]
            yield uid - 1, iid - 1, rating, timestamp

    def _get_dimensions(self, train_data, test_data):
        """Gets the dimensions of the raw dataset

        Args:
            train_data: (uid, iid, rating, timestamp)
                Genrator for training data
            test_data: (uid, iid, rating, timestamp)
                Genrator for testing data

        Returns:
            The total dimension of the dataset
        """
        uids = set()
        iids = set()

        for uid, iid, _, _ in itertools.chain(train_data, test_data):
            uids.add(uid)
            iids.add(iid)

        rows = max(uids) + 1
        cols = max(iids) + 1

        self.dimensions = (rows, cols)

        return rows, cols

    def _fetch_movies(self):
        """
        Fetch data and save in the pytorch format
            1. Read the train/test data from raw archive
            2. Parse train data
            3. Parse test data
            4. Save in the .pt with torch.save
        """
        data = self._read_raw_movielens_data()
        train_raw = data[0]
        test_raw = data[1]

        train_parsed = self._parse(train_raw)
        test_parsed = self._parse(test_raw)

        num_users, num_items = self._get_dimensions(train_parsed, test_parsed)

        train = self._build_interaction_matrix(
            num_users, num_items, self._parse(train_raw)
        )
        test = self._build_interaction_matrix(
            num_users, num_items, self._parse(test_raw)
        )
        assert train.shape == test.shape

        with open(os.path.join(self.processed_folder, self.training_file), "wb") as f:
            torch.save(train, f)

        with open(os.path.join(self.processed_folder, self.test_file), "wb") as f:
            torch.save(test, f)


class MovieLens20M(Dataset):
    """
    MovieLens data sets (ml-20m) were collected by
    the GroupLens Research Project at the University of Minnesota.

    This data set consists of:
    * 20,000,263 ratings (1-5)
    and 465,564 tag applications from 138,493 users on 27,278 movies.
    * Each user has rated at least 20 movies.
    * Simple demographic info for the users
    (age, gender, occupation, zip)

    Users were selected at random for inclusion.
    All selected users had rated at least 20 movies.
    No demographic information is included.
    Each user is represented by an id, and no other information is provided.

    More details about the contents and use of all these files follows.
    This and other GroupLens data sets are publicly available for download
    at http://grouplens.org/datasets/.

    The data was collected through the MovieLens web site.
    (movielens.umn.edu)  between January 09, 1995 and March 31, 2015.
    This dataset was generated on October 17, 2016.

    Neither the University of Minnesota nor any of the researchers involved
    can guarantee the correctness of the data, its suitability
    for any particular purpose, or the validity of
    results based on the use of the data set.

    The data set may be used for any research purposes
    under the following conditions:

    The user may not state or imply any endorsement
    from the University of Minnesota or the GroupLens Research Group.

    The user must acknowledge the use of the data set in
    publications resulting from the use of the data set
    (see below for citation information).

    The user may not redistribute the data without separate permission.

    The user may not use this information for any
    commercial or revenue-bearing purposes
    without first obtaining permission from a faculty member
    of the GroupLens Research Project at the University of Minnesota.

    The executable software scripts are provided "as is"
    without warranty of any kind, either expressed or implied, including,
    but not limited to, the implied warranties of merchantability
    and fitness for a particular purpose.

    The entire risk as to the quality and performance of them is with you.
    Should the program prove defective,
    you assume the cost of all necessary servicing, repair or correction.

    In no event shall the University of Minnesota,
    its affiliates or employees be liable to you for any damages
    arising out of the use or inability to use these programs (including
    but not limited to loss of data or data being rendered inaccurate).

    The data are contained in six files:
    1. genome-scores.csv
    2. genome-tags.csv
    3. links.csv
    4. movies.csv
    5. ratings.csv
    6. tags.csv

    Ratings Data File Structure (ratings.csv)
    All ratings are contained in the file ratings.csv.
    Each line of this file after the header row represents
    one rating of one movie by one user,and has the following format:

    1. userId,
    2. movieId,
    3. rating,
    4. timestamp

    Tags Data File Structure (tags.csv)

    1. userId,
    2. movieId,
    3. tag,
    4. timestamp

    Movies Data File Structure (movies.csv)

    1. movieId,
    2. title,
    3. genres

    Movie titles are entered manually or
    imported from https://www.themoviedb.org/, and include the year
    of release in parentheses.
    Errors and inconsistencies may exist in these titles.

    Links Data File Structure (links.csv)

    1. movieId,
    2. imdbId,
    3. tmdbId

    Tag Genome (genome-scores.csv and genome-tags.csv)

    1. movieId,
    2. tagId,
    3. relevance


    If you have any further questions or comments, please contact GroupLens
    <grouplens-info@cs.umn.edu>.
    https://files.grouplens.org/datasets/movielens/ml-20m-README.html
    """

    resources = (
        "https://files.grouplens.org/datasets/movielens/ml-20m.zip",
        " cd245b17a1ae2cc31bb14903e1204af3",
    )
    filename = "ml-20m.zip"
    training_file = "training.pt"
    test_file = "test.pt"

    def __init__(
        self,
        root,
        train=True,
        download=False,
        min_rating=0.0,
        min_items_per_user=1.0,
        min_users_per_item=2.0,
        test_prop=0.2,
        split="users",
        sample=False,
        n_rows=1000,
    ):
        """
        Args:
            root (string): Root directory of dataset where
                ``MovieLens/processed/training.pt``
                and  ``MovieLens/processed/test.pt`` exist.
            train (bool, optional): If True, creates dataset from
                ``training.pt``, otherwise from ``test.pt``.
            download (bool, optional): If true, downloads the dataset from
                the internet and puts it in root directory. If dataset
                is already downloaded, it is not downloaded again.
            min_rating (float, optional): Minimum rating to include in
                the interaction matrix
            min_items_per_user (float, optional):
                Minimum number of items per user
                to include in the interaction matrix
            min_users_per_item (float, optional):
                Minimum rating to users per itemrs
                to include in the interaction matrix
            test_prop (float, optional): train-test split
            split (string, optional): the splittage method.
                `users` – split by users
                `ts` - split by timestamp
            sample (bool, optional):
                If true, then use the sample of the dataset.
                If true the `n_rows` shold be provide
            n_rows (int, optional): number of rows to retrieve.
                Availbale only with `sample = True`

        Raises:
            RuntimeError: If ``download = False`` and the dataset not found.
            RuntimeError: If torch version < `1.7.0`"
        """
        if isinstance(root, torch._six.string_classes):
            root = os.path.expanduser(root)

        self.root = root
        self.train = train
        self.min_rating = min_rating
        self.min_items_per_user = min_items_per_user
        self.min_users_per_item = min_users_per_item
        self.test_prop = test_prop
        self.nrows = n_rows
        self.sample = sample
        self.split = split

        if download:
            self._download()

        self._fetch_movies(split_by=split)

        if not self._check_exists():
            raise RuntimeError("Dataset not found. Set `download=True`")

        if self.train:
            data_file = self.training_file
        else:
            data_file = self.test_file

        self.data = torch.load(os.path.join(self.processed_folder, data_file))

    def __getitem__(self, user_index):
        """Get item.

        Args:
            user_index (int): User index

        Returns:
            tensor: (items) item's ranking for the user with index user_index
        """
        return self.data[user_index]

    def __len__(self):
        """The length of the loader"""
        return self.dimensions[0]

    @property
    def raw_folder(self):
        """Create raw folder for data download

        Returns:
            raw_path (path): raw folder path
        """
        return os.path.join(self.root, self.__class__.__name__, "raw")

    @property
    def processed_folder(self):
        """Create the folder for the processed files

        Returns:
            raw_path (path): processed folder path
        """
        return os.path.join(self.root, self.__class__.__name__, "processed")

    def _check_exists(self):
        """Check if the path for tarining and testing data exists in
        processed folder.

        Returns:
            raw_path (path): processed folder path
        """
        return os.path.exists(
            os.path.join(self.processed_folder, self.training_file)
        ) and os.path.exists(os.path.join(self.processed_folder, self.test_file))

    def _download(self):
        """Download and extract files"""
        if self._check_exists():
            return

        os.makedirs(self.raw_folder, exist_ok=True)
        os.makedirs(self.processed_folder, exist_ok=True)
        url = self.resources[0]

        download_and_extract_archive(
            url=url,
            download_root=self.raw_folder,
            filename=self.filename,
            remove_finished=True,
        )

    def _read_raw_movielens_data(self):
        """Read the csv files with pandas.

        Returns:
            (movies, ratings, genome_scores, genome_tags, tags):
            (pd.DataFrame, pd.DataFrame, pd.DataFrame,
            pd.DataFrame, pd.DataFrame)
        """
        path = self.raw_folder

        if self.sample:
            movies = pd.read_csv(path + "/ml-20m/movies.csv", nrows=self.nrows)
            ratings = pd.read_csv(path + "/ml-20m/ratings.csv", nrows=self.nrows)
            genome_scores = pd.read_csv(
                path + "/ml-20m/genome-scores.csv", nrows=self.nrows
            )
            genome_tags = pd.read_csv(path + "/ml-20m/genome-tags.csv", nrows=self.nrows)
            tags = pd.read_csv(path + "/ml-20m/tags.csv", nrows=self.nrows)
        else:
            movies = pd.read_csv(path + "/ml-20m/movies.csv")
            ratings = pd.read_csv(path + "/ml-20m/ratings.csv")
            genome_scores = pd.read_csv(path + "/ml-20m/genome-scores.csv")
            genome_tags = pd.read_csv(path + "/ml-20m/genome-tags.csv")
            tags = pd.read_csv(path + "/ml-20m/tags.csv")

        return (movies, ratings, genome_scores, genome_tags, tags)

    def _build_interaction_matrix(self, ratings):
        """Builds interaction matrix.

        Args:
            ratings (pd.Dataframe): pandas DataFrame of the following format
                    userId	movieId	rating
                    20	1	924	     3.5
                    19	1	919	     3.5
                    86	1	2683	 3.5
                    61	1	1584	 3.5
                    23	1	1079	 4.0

        Returns:
            interaction_matrix (torch.sparse.Float):
            sparse user2item interaction matrix
        """
        csr_matrix = sp.coo_matrix(
            (
                ratings["rating"].astype(np.float32),
                (ratings["movieId"], ratings["userId"]),
            )
        )

        interaction_matrix = torch.sparse.LongTensor(
            torch.LongTensor([csr_matrix.row.tolist(), csr_matrix.col.tolist()]),
            torch.LongTensor(csr_matrix.data.astype(np.int32)),
        )

        return interaction_matrix

    def _parse(
        self,
        ratings,
        rating_cut=True,
        user_per_item_cut=True,
        item_per_user_cut=True,
        ts_cut=False,
    ):
        """Parses and pre-process the raw data.
        Substract one to shift to zero based indexing
        To-do add timestamp cut

        Args:
            ratings (pd.Dataframe): pandas DataFrame of the following format
                userId	movieId	rating	timestamp
                20	1	924	     3.5	1094785598
                19	1	919	     3.5	1094785621
                86	1	2683	 3.5	1094785650
                61	1	1584	 3.5	1094785656
                23	1	1079	 4.0	1094785665
            rating_cut (bool, optional):
                If true, filter datafreame on the `min_rating` value
            user_per_item_cut (bool, optional):
                If true, filter datafreame on the `min_users_per_item` value
            item_per_user_cut (bool, optional):
                If true, filter datafreame on the `min_items_per_user` value
            ts_cut (bool, optional):
                If true, filter datafreame on the `min_ts` value [TO-DO]

        Returns:
            ratings (pd.Dataframe): filtered `ratings` pandas DataFrame
            users_activity (pd.DataFrame):
                Number of items each user interacted with
            items_activity (pd.DataFrame):
                Number of users interacted with each item.
        """
        if rating_cut:
            ratings = ratings[ratings["rating"] >= self.min_rating].sort_values(
                ["userId", "timestamp"]
            )

        movie_id = "movieId"
        user_cnt_df = (
            ratings[[movie_id]]
            .groupby(movie_id, as_index=False)
            .size()
            .rename(columns={"size": "user_cnt"})
        )
        user_id = "userId"
        item_cnt_df = (
            ratings[[user_id]]
            .groupby(user_id, as_index=False)
            .size()
            .rename(columns={"size": "item_cnt"})
        )

        user_not_filtered = True
        item_not_filtered = True

        while user_not_filtered or item_not_filtered:
            ratings = ratings[
                ratings[movie_id].isin(
                    user_cnt_df.index[user_cnt_df["user_cnt"] >= self.min_users_per_item]
                )
            ]
            ratings = ratings[
                ratings[user_id].isin(
                    item_cnt_df.index[item_cnt_df["item_cnt"] >= self.min_items_per_user]
                )
            ]

            user_cnt_df = (
                ratings[[movie_id]]
                .groupby(movie_id, as_index=False)
                .size()
                .rename(columns={"size": "user_cnt"})
            )
            item_cnt_df = (
                ratings[[user_id]]
                .groupby(user_id, as_index=False)
                .size()
                .rename(columns={"size": "item_cnt"})
            )

            user_not_filtered = (user_cnt_df["user_cnt"] < self.min_users_per_item).any()
            item_not_filtered = (item_cnt_df["item_cnt"] < self.min_items_per_user).any()

        users_activity = (
            ratings[["userId"]]
            .groupby("userId", as_index=False)
            .size()
            .rename(columns={"size": "user_cnt"})
        )
        items_activity = (
            ratings[["movieId"]]
            .groupby("movieId", as_index=False)
            .size()
            .rename(columns={"size": "item_cnt"})
        )
        return ratings, users_activity, items_activity

    def _split_by_users(self, ratings, users_activity):
        """Split the ratings DataFrame into train and test
        Randomly shuffle users and split

        Args:
            ratings (pd.Dataframe): pandas DataFrame of the following format
                userId	movieId	rating	timestamp
                20	1	924	     3.5	1094785598
                19	1	919	     3.5	1094785621
                86	1	2683	 3.5	1094785650
                61	1	1584	 3.5	1094785656
                23	1	1079	 4.0	1094785665
            users_activity (pd.DataFrame):
                Number of items each user interacted with

        Returns:
            train_events (pd.Dataframe): pandas DataFrame for training data
            test_events (pd.Dataframe): pandas DataFrame for training data
        """
        idx_perm = np.random.permutation(users_activity.index.size)
        unique_uid = users_activity.index[idx_perm]
        n_users = unique_uid.size

        test_users = unique_uid[: int(n_users * self.test_prop)]
        train_users = unique_uid[int(n_users * self.test_prop) :]

        train_events = ratings.loc[ratings["userId"].isin(train_users)]
        test_events = ratings.loc[ratings["userId"].isin(test_users)]

        return (train_events, test_events)

    def _split_by_time(self, ratings):
        """Split the ratings DataFrame into train and test by timestamp
        Ratings[timestamp] extreme values used for the filtering interval

        Args:
            ratings (pd.Dataframe): pandas DataFrame of the following format
                userId	movieId	rating	timestamp
                20	1	924	     3.5	1094785598
                19	1	919	     3.5	1094785621
                86	1	2683	 3.5	1094785650
                61	1	1584	 3.5	1094785656
                23	1	1079	 4.0	1094785665

        Returns:
            train_events (pd.Dataframe): pandas DataFrame for training data
            test_events (pd.Dataframe): pandas DataFrame for training data
        """
        ts = ratings["timestamp"].sort_values()
        ts_max = ts.max()
        ts_min = ts.min()
        ts_split = ts_min + (ts_max - ts_min) * self.test_prop

        train_events = ratings[ratings["timestamp"] > ts_split]
        test_events = ratings[ratings["timestamp"] <= ts_split]

        return (train_events, test_events)

    def _fetch_movies(self, split_by="users"):
        """
        Fetch data and save in the pytorch format
            1. Read the MovieLens20 data from raw archive
            2. Parse the rating dataset
            3. Split dataset into train and test
            4. Build user-item matrix interaction
            5. Save in the .pt with torch.save

        Args:
            split_by (string, optional): the splittage method.
                `users` – split by users
                `ts` - split by timestamp

        Raises:
            ValueError: If `split_by` argument is not equal `users` or `ts`
        """
        raw_data = self._read_raw_movielens_data()

        ratings = raw_data[1]

        # TO-DO: add error handling
        ratings, users_activity, items_activity = self._parse(ratings)
        self.users_activity = users_activity
        self.items_activity = items_activity

        if split_by == "users":
            train_raw, test_raw = self._split_by_users(ratings, users_activity)
        if split_by == "ts":
            train_raw, test_raw = self._split_by_time(ratings)
        if split_by != "users" and split_by != "ts":
            raise ValueError("Only splitting by users and ts supported")

        train = self._build_interaction_matrix(train_raw)
        test = self._build_interaction_matrix(test_raw)

        with open(os.path.join(self.processed_folder, self.training_file), "wb") as f:
            torch.save(train, f)

        with open(os.path.join(self.processed_folder, self.test_file), "wb") as f:
            torch.save(test, f)


__all__ = ["MovieLens", "MovieLens20M"]