"""Split data into training and test."""
import numpy as np
def _get_learning_data_indices(samples, labels=None, max_per_class=30):
    """Randomly split sample indices into train and test with a per-class cap.

    Indices ``0 .. samples - 1`` are visited in a random order. Each index
    goes to the training list until its label has already contributed
    ``max_per_class`` training indices; every later index carrying that
    label goes to the test list.

    Parameters
    ----------
    samples
        Number of samples; indices are drawn from ``range(samples)``.
    labels
        Per-sample labels, indexable by sample index and with hashable
        elements. When ``None``, every sample is assigned the single
        label ``'class'``.
    max_per_class
        Maximum number of training indices collected for each label.

    Returns
    -------
    tuple
        ``(train, test)``, two lists of indices in visiting order.
    """
    shuffled = np.random.permutation(range(samples))
    if labels is None:
        # Without labels, all samples share one class and hence one cap.
        labels = np.repeat('class', samples)
    taken = dict.fromkeys(set(labels), 0)
    train, test = [], []
    for idx in shuffled:
        key = labels[idx]
        if taken[key] >= max_per_class:
            # The quota for this label is exhausted.
            test.append(idx)
            continue
        taken[key] += 1
        train.append(idx)
    return train, test
def get_learning_data_indices_fraction(X, fraction=0.5):
    """Return random train/test index lists split by a fraction.

    ``floor(fraction * number_of_samples)`` distinct indices are drawn at
    random for training; all remaining indices form the test list, so the
    two lists are disjoint and together cover every sample exactly once.

    Parameters
    ----------
    X
        Either an object exposing ``.shape`` (the first dimension is taken
        as the number of samples) or a dict of such objects. For a dict
        the sample count is read from its first entry only; the other
        entries are not checked.
    fraction
        Fraction of the samples assigned to training, within ``[0, 1]``.

    Returns
    -------
    tuple
        ``(train, test)``: ``train`` is a list of distinct indices in
        random order, ``test`` lists the remaining indices in ascending
        order.

    Raises
    ------
    ValueError
        If ``fraction`` is outside ``[0, 1]``.
    """
    if not 0 <= fraction <= 1:
        raise ValueError(
            'fraction must be within [0, 1], got {}'.format(fraction)
        )
    if isinstance(X, dict):
        # Any entry can serve as reference for the number of samples.
        keys = list(X.keys())
        number_of_samples = X[keys[0]].shape[0]
    else:
        number_of_samples = X.shape[0]
    sample_indices = np.arange(number_of_samples)
    number_of_training_samples = int(np.floor(fraction * number_of_samples))
    # Sampling must be without replacement: np.random.choice defaults to
    # replace=True, which would repeat training indices and inflate the
    # test set with the samples that were never drawn.
    train = list(
        np.random.choice(
            sample_indices, number_of_training_samples, replace=False
        )
    )
    train_lookup = set(train)
    test = [index for index in sample_indices if index not in train_lookup]
    return train, test
def get_learning_data_in_dict_mode(
    X, labels=None, data_types=None, max_per_class=30
):
    """Split several data types into train and test with a per-class cap.

    A single index split, obtained from ``_get_learning_data_indices``, is
    applied to every selected data type, so all of them are divided along
    the same samples.

    Parameters
    ----------
    X
        Dict mapping a data type name to an object exposing ``.shape`` and
        supporting indexing with a list of indices (presumably NumPy
        arrays).
    labels
        Optional per-sample labels, indexed with the same index lists.
    data_types
        Keys of ``X`` to split; all keys of ``X`` when ``None``. The first
        one determines the number of samples.
    max_per_class
        Maximum number of training samples per label.

    Returns
    -------
    tuple
        ``(X_train, X_test)`` when ``labels`` is ``None``, otherwise
        ``(X_train, y_train, X_test, y_test)``; ``X_train`` and ``X_test``
        are dicts keyed by data type.
    """
    selected_types = list(X.keys()) if data_types is None else data_types
    sample_count = X[selected_types[0]].shape[0]
    train_idx, test_idx = _get_learning_data_indices(
        sample_count, labels, max_per_class=max_per_class
    )
    X_train = {name: X[name][train_idx] for name in selected_types}
    X_test = {name: X[name][test_idx] for name in selected_types}
    if labels is not None:
        return X_train, labels[train_idx], X_test, labels[test_idx]
    return X_train, X_test
def get_learning_data_in_dict_mode_fraction(
    X, labels=None, data_types=None, fraction=0.5
):
    """Split several data types into train and test using a fraction.

    A single index split, obtained from
    ``get_learning_data_indices_fraction``, is applied to every selected
    data type, so all of them are divided along the same samples.

    Parameters
    ----------
    X
        Dict mapping a data type name to an object supporting indexing
        with a list of indices (presumably NumPy arrays). The whole dict
        is handed to ``get_learning_data_indices_fraction`` to obtain the
        index split.
    labels
        Optional per-sample labels, indexed with the same index lists.
    data_types
        Keys of ``X`` to split; all keys of ``X`` when ``None``.
    fraction
        Fraction forwarded to ``get_learning_data_indices_fraction``.

    Returns
    -------
    tuple
        ``(X_train, X_test)`` when ``labels`` is ``None``, otherwise
        ``(X_train, y_train, X_test, y_test)``; ``X_train`` and ``X_test``
        are dicts keyed by data type.
    """
    selected_types = list(X.keys()) if data_types is None else data_types
    train_idx, test_idx = get_learning_data_indices_fraction(
        X, fraction=fraction
    )
    X_train = {name: X[name][train_idx] for name in selected_types}
    X_test = {name: X[name][test_idx] for name in selected_types}
    if labels is not None:
        return X_train, labels[train_idx], X_test, labels[test_idx]
    return X_train, X_test
def get_learning_data(X, labels=None, max_per_class=30):
    """Split a single data type into train and test with a per-class cap.

    Parameters
    ----------
    X
        Object exposing ``.shape`` (the first dimension is taken as the
        number of samples) and supporting indexing with a list of indices
        (presumably a NumPy array).
    labels
        Optional per-sample labels, indexed with the same index lists.
    max_per_class
        Maximum number of training samples per label, forwarded to
        ``_get_learning_data_indices``.

    Returns
    -------
    tuple
        ``(X_train, X_test)`` when ``labels`` is ``None``, otherwise
        ``(X_train, y_train, X_test, y_test)``.
    """
    train_idx, test_idx = _get_learning_data_indices(
        X.shape[0], labels, max_per_class
    )
    X_train, X_test = X[train_idx], X[test_idx]
    if labels is None:
        return X_train, X_test
    return X_train, labels[train_idx], X_test, labels[test_idx]