Source code for pimkl.data

"""Split data into training and test."""
import numpy as np


def _get_learning_data_indices(samples, labels=None, max_per_class=30):
    """In case labels are None only one class named 'class' is considered."""
    permuted = np.random.permutation(range(samples))
    if labels is None:
        labels = np.repeat('class', samples)
    counts = {label: 0 for label in set(labels)}
    train = []
    test = []
    for index in permuted:
        label = labels[index]
        if counts[label] < max_per_class:
            counts[label] += 1
            train.append(index)
        else:
            test.append(index)
    return train, test


def get_learning_data_indices_fraction(X, fraction=0.5):
    """Randomly split sample indices into training and test by fraction.

    The number of samples is read from ``X.shape[0]``, or from the first
    value of ``X`` when ``X`` is a dict of arrays. A random
    ``floor(fraction * number_of_samples)`` indices form the training
    set and all remaining indices form the test set. The two sets are
    disjoint and together cover every sample exactly once.

    Args:
        X: object exposing ``shape``, or a non-empty dict of such objects
            (presumably all with the same number of rows).
        fraction: share of samples assigned to training, in ``[0, 1]``.

    Returns:
        A ``(train, test)`` tuple of index lists. ``train`` is in random
        order; ``test`` is sorted in ascending order.

    Raises:
        ValueError: if ``fraction`` is outside ``[0, 1]`` or ``X`` is an
            empty dict.
    """
    if not 0 <= fraction <= 1:
        raise ValueError(
            'fraction must be in [0, 1], got {!r}'.format(fraction)
        )
    if isinstance(X, dict):
        if not X:
            raise ValueError('X must contain at least one data type')
        number_of_samples = next(iter(X.values())).shape[0]
    else:
        number_of_samples = X.shape[0]
    number_of_training_samples = int(np.floor(fraction * number_of_samples))
    # Slicing a permutation samples WITHOUT replacement, so no index can
    # appear twice in train and train/test always partition the samples.
    shuffled = np.random.permutation(number_of_samples)
    train = list(shuffled[:number_of_training_samples])
    test = list(np.sort(shuffled[number_of_training_samples:]))
    return train, test


def get_learning_data_in_dict_mode(
    X, labels=None, data_types=None, max_per_class=30
):
    """Return training and test splits for multiple data types.

    One random split of the sample indices is drawn, with at most
    ``max_per_class`` training samples per class, and the same split is
    applied to every selected data type.

    Args:
        X: dict mapping a data type to its data. Values must expose
            ``shape`` and accept a list of integer indices (presumably
            numpy arrays with samples on the first axis).
        labels: optional per-sample class labels. Lists and tuples are
            converted to a numpy array so they can be indexed by the
            split; other types are used as given.
        data_types: keys of ``X`` to split. Defaults to all keys.
        max_per_class: maximum number of training samples per class.

    Returns:
        ``(X_train, X_test)`` when ``labels`` is None, otherwise
        ``(X_train, y_train, X_test, y_test)``. ``X_train`` and
        ``X_test`` are dicts keyed by data type.
    """
    if data_types is None:
        data_types = list(X.keys())
    if isinstance(labels, (list, tuple)):
        # Plain sequences cannot be indexed with a list of indices.
        labels = np.asarray(labels)
    number_of_samples = X[data_types[0]].shape[0]

    train, test = _get_learning_data_indices(
        number_of_samples, labels, max_per_class=max_per_class
    )
    X_train = {data_type: X[data_type][train] for data_type in data_types}
    X_test = {data_type: X[data_type][test] for data_type in data_types}
    if labels is None:
        return X_train, X_test
    return X_train, labels[train], X_test, labels[test]


def get_learning_data_in_dict_mode_fraction(
    X, labels=None, data_types=None, fraction=0.5
):
    """Return training and test splits for multiple data types by fraction.

    One random split of the sample indices is drawn with
    ``get_learning_data_indices_fraction`` and the same split is applied
    to every selected data type.

    Args:
        X: dict mapping a data type to its data. Values must expose
            ``shape`` and accept a list of integer indices (presumably
            numpy arrays with samples on the first axis).
        labels: optional per-sample labels. Lists and tuples are
            converted to a numpy array so they can be indexed by the
            split; other types are used as given.
        data_types: keys of ``X`` to split. Defaults to all keys.
        fraction: share of samples assigned to the training split.

    Returns:
        ``(X_train, X_test)`` when ``labels`` is None, otherwise
        ``(X_train, y_train, X_test, y_test)``. ``X_train`` and
        ``X_test`` are dicts keyed by data type.
    """
    if data_types is None:
        data_types = list(X.keys())
    if isinstance(labels, (list, tuple)):
        # Plain sequences cannot be indexed with a list of indices.
        labels = np.asarray(labels)

    train, test = get_learning_data_indices_fraction(X, fraction=fraction)
    X_train = {data_type: X[data_type][train] for data_type in data_types}
    X_test = {data_type: X[data_type][test] for data_type in data_types}
    if labels is None:
        return X_train, X_test
    return X_train, labels[train], X_test, labels[test]


def get_learning_data(X, labels=None, max_per_class=30):
    """Return training and test splits for a single data type.

    The sample indices are split at random, with at most
    ``max_per_class`` training samples per class; all other samples go
    to the test split.

    Args:
        X: data exposing ``shape`` and accepting a list of integer
            indices (presumably a numpy array with samples on the first
            axis).
        labels: optional per-sample class labels. Lists and tuples are
            converted to a numpy array so they can be indexed by the
            split; other types are used as given.
        max_per_class: maximum number of training samples per class.

    Returns:
        ``(X_train, X_test)`` when ``labels`` is None, otherwise
        ``(X_train, y_train, X_test, y_test)``.
    """
    if isinstance(labels, (list, tuple)):
        # Plain sequences cannot be indexed with a list of indices.
        labels = np.asarray(labels)
    train, test = _get_learning_data_indices(
        X.shape[0], labels, max_per_class=max_per_class
    )
    if labels is None:
        return X[train], X[test]
    return X[train], labels[train], X[test], labels[test]