Source code for cornac.eval_methods.ratio_split

# -*- coding: utf-8 -*-

"""
@author: Quoc-Tuan Truong <tuantq.vnu@gmail.com>
"""

from ..utils.common import safe_indexing
from math import ceil
from .base_method import BaseMethod
from ..experiment.result import Result
import numpy as np


class RatioSplit(BaseMethod):
    """Train-Test Split Evaluation Method.

    The data is partitioned once (lazily, on the first call to
    :meth:`split` or :meth:`evaluate`) into consecutive train, validation
    and test portions of the (optionally shuffled) rating indices.

    Parameters
    ----------
    data: ..., required
        The input data in the form of triplets (user, item, rating).

    fmt: str, optional, default: "UIR"
        The format of input data:

        - UIR: (user, item, rating) triplet data
        - UIRT: (user, item , rating, timestamp) quadruplet data

    test_size: float, optional, default: 0.2
        The proportion of the test set. A value below 1 is a proportion of
        the number of ratings; a value of 1 or more is treated as the
        absolute size of the test set.

    val_size: float, optional, default: 0.0
        The proportion of the validation set. A value below 1 is a
        proportion of the number of ratings; a value of 1 or more is
        treated as the absolute size of the validation set.

    rating_threshold: float, optional, default: 1.
        The minimum value that is considered to be a good rating used for
        ranking, e.g, if the ratings are in {1, ..., 5}, then
        rating_threshold = 4.

    shuffle: bool, optional, default: True
        Shuffle the data before splitting.

    seed: int, optional, default: None
        Random seed. When given, it is passed to ``np.random.seed`` right
        before shuffling.

    exclude_unknowns: bool, optional, default: False
        Ignore unknown users and items (cold-start) during evaluation and
        testing.

    verbose: bool, optional, default: False
        Output running log.
    """

    def __init__(self, data, fmt='UIR', test_size=0.2, val_size=0.0,
                 rating_threshold=1.0, shuffle=True, seed=None,
                 exclude_unknowns=False, verbose=False, **kwargs):
        BaseMethod.__init__(self, data=data, fmt=fmt,
                            rating_threshold=rating_threshold,
                            exclude_unknowns=exclude_unknowns,
                            verbose=verbose, **kwargs)
        self._shuffle = shuffle
        self._seed = seed
        # Sizes are resolved to absolute counts up front so that invalid
        # configurations fail at construction time rather than at split time.
        self._train_size, self._val_size, self._test_size = \
            self.validate_size(val_size, test_size, len(self._data))
        # The split itself is deferred until it is first needed.
        self._split_ran = False

    @staticmethod
    def validate_size(val_size, test_size, num_ratings):
        """Resolve validation/test sizes into absolute counts.

        Parameters
        ----------
        val_size: float or None
            Proportion (< 1) or absolute size (>= 1) of the validation set.
            ``None`` is treated as 0.

        test_size: float or None
            Proportion (< 1) or absolute size (>= 1) of the test set.
            ``None`` is treated as 0.

        num_ratings: int
            Total number of ratings available for splitting.

        Returns
        -------
        tuple of int
            ``(train_size, val_size, test_size)``.

        Raises
        ------
        ValueError
            If a size is negative, is not smaller than ``num_ratings``, or
            if the validation and test sizes together leave no training data.
        """
        if val_size is None:
            val_size = 0.0
        elif val_size < 0:
            raise ValueError('val_size={} should be greater than zero'.format(val_size))
        elif val_size >= num_ratings:
            raise ValueError(
                'val_size={} should be less than the number of ratings {}'.format(val_size, num_ratings))

        if test_size is None:
            test_size = 0.0
        elif test_size < 0:
            raise ValueError('test_size={} should be greater than zero'.format(test_size))
        elif test_size >= num_ratings:
            raise ValueError(
                'test_size={} should be less than the number of ratings {}'.format(test_size, num_ratings))

        # Values below 1 are proportions; convert them to (rounded-up) counts.
        if val_size < 1:
            val_size = ceil(val_size * num_ratings)
        if test_size < 1:
            test_size = ceil(test_size * num_ratings)

        if val_size + test_size >= num_ratings:
            raise ValueError(
                'The sum of val_size and test_size ({}) should be smaller than the number of ratings {}'.format(
                    val_size + test_size, num_ratings))

        train_size = num_ratings - (val_size + test_size)

        return int(train_size), int(val_size), int(test_size)

    def split(self):
        """Partition the data into train/validation/test sets and build them.

        The split is performed at most once; subsequent calls return
        immediately. The training set is taken from the front of the index
        array, the test set from the back, and the validation set is
        whatever lies in between.
        """
        if self._split_ran:
            return

        if self.verbose:
            print("Splitting the data")

        data_idx = np.arange(len(self._data))

        if self._shuffle:
            if self._seed is not None:
                # NOTE: this seeds NumPy's global random state.
                np.random.seed(self._seed)
            data_idx = np.random.permutation(data_idx)

        # Use an explicit, non-negative boundary for the test portion.
        # Slicing with ``-self._test_size`` breaks when the test size is 0
        # (``-0 == 0``): the "test" slice would cover the whole dataset and
        # the validation slice would always be empty.
        test_start = len(data_idx) - self._test_size

        train_idx = data_idx[:self._train_size]
        val_idx = data_idx[self._train_size:test_start]
        test_idx = data_idx[test_start:]

        train_data = safe_indexing(self._data, train_idx)
        test_data = safe_indexing(self._data, test_idx)
        # The validation set is optional; pass None when it is empty.
        val_data = None
        if len(val_idx) > 0:
            val_data = safe_indexing(self._data, val_idx)

        self.build(train_data=train_data, test_data=test_data, val_data=val_data)
        self._split_ran = True

        if self.verbose:
            print('Total users = {}'.format(self.total_users))
            print('Total items = {}'.format(self.total_items))

    def evaluate(self, model, metrics, user_based):
        """Ensure the data has been split, then delegate to ``BaseMethod.evaluate``.

        Parameters
        ----------
        model, metrics, user_based
            Forwarded unchanged to ``BaseMethod.evaluate``.

        Returns
        -------
        Whatever ``BaseMethod.evaluate`` returns.
        """
        self.split()
        return BaseMethod.evaluate(self, model, metrics, user_based)