# -*- coding: utf-8 -*-
"""
@author: Quoc-Tuan Truong <tuantq.vnu@gmail.com>
"""
from ..utils.common import safe_indexing
from math import ceil
from .base_method import BaseMethod
from ..experiment.result import Result
import numpy as np
class RatioSplit(BaseMethod):
    """Train-Test Split Evaluation Method.

    The data is partitioned into consecutive train / validation / test
    chunks (after an optional shuffle) and handed to ``self.build``.

    Parameters
    ----------
    data: ..., required
        The input data in the form of triplets (user, item, rating).

    fmt: str, optional, default: "UIR"
        The format of input data:
            - UIR: (user, item, rating) triplet data
            - UIRT: (user, item, rating, timestamp) quadruplet data

    test_size: float, optional, default: 0.2
        The proportion of the test set if < 1,
        otherwise it is treated as the size (number of ratings) of the test set.

    val_size: float, optional, default: 0.0
        The proportion of the validation set if < 1,
        otherwise it is treated as the size (number of ratings) of the validation set.

    rating_threshold: float, optional, default: 1.
        The minimum value that is considered to be a good rating used for ranking,
        e.g, if the ratings are in {1, ..., 5}, then rating_threshold = 4.

    shuffle: bool, optional, default: True
        Shuffle the data before splitting.

    seed: int, optional, default: None
        Random seed. When given, it seeds NumPy's global random state right
        before the shuffle.

    exclude_unknowns: bool, optional, default: False
        Ignore unknown users and items (cold-start) during evaluation and testing

    verbose: bool, optional, default: False
        Output running log
    """

    def __init__(self, data, fmt='UIR', test_size=0.2, val_size=0.0, rating_threshold=1.0, shuffle=True,
                 seed=None, exclude_unknowns=False, verbose=False, **kwargs):
        BaseMethod.__init__(self, data=data, fmt=fmt, rating_threshold=rating_threshold,
                            exclude_unknowns=exclude_unknowns, verbose=verbose, **kwargs)
        self._shuffle = shuffle
        self._seed = seed
        # Resolve proportions into absolute counts once, up front, so that
        # invalid sizes fail at construction time rather than at split time.
        self._train_size, self._val_size, self._test_size = self.validate_size(val_size, test_size, len(self._data))
        # The split is lazy: it runs on the first call to split()/evaluate().
        self._split_ran = False

    @staticmethod
    def validate_size(val_size, test_size, num_ratings):
        """Validate the requested split sizes and convert them to counts.

        Parameters
        ----------
        val_size: float or None
            Validation proportion (if < 1) or absolute size (if >= 1).
            ``None`` is treated as 0.

        test_size: float or None
            Test proportion (if < 1) or absolute size (if >= 1).
            ``None`` is treated as 0.

        num_ratings: int
            Total number of ratings available for splitting.

        Returns
        -------
        (train_size, val_size, test_size): tuple of int
            Absolute sizes that always sum up to ``num_ratings``.

        Raises
        ------
        ValueError
            If a size is negative, is not smaller than ``num_ratings``, or if
            the validation and test sets together leave no training data.
        """
        if val_size is None:
            val_size = 0.0
        elif val_size < 0:
            raise ValueError('val_size={} should be greater than zero'.format(val_size))
        elif val_size >= num_ratings:
            raise ValueError(
                'val_size={} should be less than the number of ratings {}'.format(val_size, num_ratings))

        if test_size is None:
            test_size = 0.0
        elif test_size < 0:
            raise ValueError('test_size={} should be greater than zero'.format(test_size))
        elif test_size >= num_ratings:
            raise ValueError(
                'test_size={} should be less than the number of ratings {}'.format(test_size, num_ratings))

        # Values below 1 are proportions of the whole data set.
        if val_size < 1:
            val_size = ceil(val_size * num_ratings)
        if test_size < 1:
            test_size = ceil(test_size * num_ratings)

        # Truncate to whole counts *before* deriving train_size; truncating
        # the three values independently afterwards could drop a rating
        # (e.g. test_size=2.5) so that the sizes no longer sum to num_ratings.
        val_size = int(val_size)
        test_size = int(test_size)

        if val_size + test_size >= num_ratings:
            raise ValueError(
                'The sum of val_size and test_size ({}) should be smaller than the number of ratings {}'.format(
                    val_size + test_size, num_ratings))

        train_size = num_ratings - (val_size + test_size)

        return int(train_size), val_size, test_size

    def split(self):
        """Partition the data into train / validation / test sets and build them.

        The split is performed at most once; subsequent calls return
        immediately. The validation set is passed to ``build`` as ``None``
        when it is empty.
        """
        if self._split_ran:
            return

        if self.verbose:
            print("Splitting the data")

        data_idx = np.arange(len(self._data))

        if self._shuffle:
            if self._seed is not None:
                np.random.seed(self._seed)
            data_idx = np.random.permutation(data_idx)

        # Slice with explicit forward offsets. Negative offsets such as
        # data_idx[-test_size:] break when test_size == 0, because -0 == 0
        # selects the *entire* array as the test set and empties the
        # validation slice.
        val_start = self._train_size
        test_start = val_start + self._val_size
        test_end = test_start + self._test_size

        train_idx = data_idx[:val_start]
        val_idx = data_idx[val_start:test_start]
        test_idx = data_idx[test_start:test_end]

        train_data = safe_indexing(self._data, train_idx)
        test_data = safe_indexing(self._data, test_idx)
        val_data = None
        if len(val_idx) > 0:
            val_data = safe_indexing(self._data, val_idx)

        self.build(train_data=train_data, test_data=test_data, val_data=val_data)
        self._split_ran = True

        if self.verbose:
            print('Total users = {}'.format(self.total_users))
            print('Total items = {}'.format(self.total_items))

    def evaluate(self, model, metrics, user_based):
        """Ensure the data has been split, then delegate to ``BaseMethod.evaluate``.

        Parameters
        ----------
        model, metrics, user_based:
            Forwarded unchanged to ``BaseMethod.evaluate``.

        Returns
        -------
        Whatever ``BaseMethod.evaluate`` returns.
        """
        self.split()
        return BaseMethod.evaluate(self, model, metrics, user_based)