Source code for pimkl.cli.analyse

import glob
import logging
import os
import sys
import traceback
from functools import partial, reduce
from multiprocessing import Pool, cpu_count

import click
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.decomposition import KernelPCA
from tqdm import tqdm

from pimkl.analysis import significant_pathways
from pimkl.inducers import read_inducer
from pimkl.models import PIMKL
from pimkl.run import fold_generator, run_model
from pimkl.utils.preprocessing import Standardizer

# plotting setup
sns.set_palette(sns.color_palette('colorblind'))
sns.set_style('white')
sns.set_context('talk')
significant_color = sns.color_palette('colorblind')[2]
other_color = sns.color_palette('colorblind')[0]


[docs]def read_preprocessed( data_names, network_name, gene_sets_name, preprocess_dir ): inducers_filenames = {} inducers_names = {} inducers = {} data = {} for data_name in data_names: # data data[data_name] = pd.read_csv( os.path.join( preprocess_dir, '{}_{}_{}_{}.csv'.format( 'data', data_name, network_name, gene_sets_name ) ), index_col=0 ) # inducers inducers_filenames[data_name] = glob.glob( os.path.join( preprocess_dir, '{}_{}_{}_*.csv.gz'.format( 'inducer', data_name, network_name # * matches gene set (inducer) name which can contain "_" ) ) ) inducers_names[data_name] = [ '_'.join(os.path.basename(filename).split('.')[0].split('_')[4:]) for filename in inducers_filenames[data_name] ] assert len(inducers_names[data_name] ) == len(set(inducers_names[data_name])) inducers[data_name] = [ read_inducer(filename, size=data[data_name].shape[1]) for filename in inducers_filenames[data_name] ] inducers_extended_names = [ '{}-{}'.format(data_name, name) for data_name, inducer_names in inducers_names.items() for name in inducer_names ] return data, inducers, inducers_extended_names
[docs]def analyse( data_names, network_name, gene_sets_name, preprocess_dir, output_dir, class_label_file, model_name='EasyMKL', lam=.2, k=5, number_of_folds=2, max_per_class=20, seed=0, max_processes=cpu_count() ): # reproducible results np.random.seed(seed) # model parameters regularization_factor = False kernel_normalization = True estimator_parameters = { 'trace_normalization': kernel_normalization, 'regularization_factor': regularization_factor, 'lam': lam } if model_name == 'UMKLKNN': mkl_parameters = {'trace_normalization': kernel_normalization, 'k': k} elif model_name == 'AverageMKL': mkl_parameters = {'trace_normalization': kernel_normalization} else: # EasyMKL mkl_parameters = estimator_parameters # prepare: read data and inducers data, inducers, inducers_extended_names = read_preprocessed( data_names, network_name, gene_sets_name, preprocess_dir ) # prepare: classification labels class_labels = pd.read_csv( class_label_file, index_col=0, header=None, squeeze=True ) class_labels = class_labels[~pd.isna(class_labels)] # match samples in data and labels measurement_data_samples = sorted( list( reduce( lambda a, b: a & b, (set(data[data_name].index) for data_name in data_names) ) ) ) samples = sorted( list(set(measurement_data_samples) & set(class_labels.index)) ) labels = ( class_labels[samples].values if model_name == 'EasyMKL' else None # TODO keep series, check model later ) for data_name in data_names: data[data_name] = data[data_name].loc[samples].values # no more pandas labels # learn support vector and kernel weights for different data splits all_trace_factors = {} all_aucs = {} all_weights = {} # parallel processes = max_processes if max_processes < number_of_folds else number_of_folds # noqa if (processes == 1) or ( logging.root.level <= logging.DEBUG ): # serial, allows easier debugging for fold_parameters in tqdm( fold_generator(number_of_folds, data, labels, max_per_class) ): try: aucs, weights, trace_factors = run_model( inducers=inducers, induction_name="induce_linear_kernel", mkl_name=model_name, estimator_name="EasyMKL", mkl_parameters=estimator_parameters, estimator_parameters=estimator_parameters, induction_parameters={}, inducers_extended_names=inducers_extended_names, fold_parameters=fold_parameters ) all_trace_factors[fold_parameters['fold']] = trace_factors all_aucs[fold_parameters['fold']] = aucs if isinstance(weights, list): for label, weights_per_label in weights: all_weights[(fold_parameters['fold'], label) ] = weights_per_label else: all_weights[fold_parameters['fold']] = weights except TypeError: # run returned None logging.debug( 'fold {} not appended'.format(fold_parameters['fold']) ) traceback.print_exc() else: # parallel run_fold = partial( run_model, inducers, 'induce_linear_kernel', model_name, 'EasyMKL', mkl_parameters, estimator_parameters, {}, inducers_extended_names ) logging.debug('fold runs start') with Pool(processes=processes) as pool: runner = pool.imap( run_fold, fold_generator(number_of_folds, data, labels, max_per_class) ) logging.debug('lazy iterator created') results = list(runner) logging.debug('fold runs done') results = filter(lambda x: x is not None, results) # a generator for i, (aucs, weights, trace_factors) in enumerate(results): all_trace_factors[i] = trace_factors all_aucs[i] = aucs if isinstance(weights, list): for label, weights_per_label in weights: all_weights[(i, label)] = weights_per_label else: all_weights[i] = weights # preparing output aucs_df = pd.DataFrame(all_aucs).T weights_df = pd.DataFrame(all_weights).T trace_factors_df = pd.DataFrame(all_trace_factors).T output_filename_part = '{}_{}_{}_cv={}_mc={}_{}'.format( '-'.join(data_names), network_name, gene_sets_name, number_of_folds, max_per_class, model_name ) print(aucs_df) # write to file aucs_df.to_csv('{}/auc_{}.csv'.format(output_dir, output_filename_part)) weights_df.to_csv( '{}/weights_{}.csv'.format(output_dir, output_filename_part) ) trace_factors_df.to_csv( '{}/tracefactors_{}.csv'.format(output_dir, output_filename_part) ) # visualize weights inducers_ordering = weights_df.median().sort_values(ascending=False).index plt.close() colors = [ significant_color if is_significant else other_color for is_significant in significant_pathways(weights_df[inducers_ordering]) ] sns.boxplot(data=weights_df[inducers_ordering], palette=colors) plt.xlabel('Pathway') plt.ylabel('Weight') _ = plt.xticks(rotation=90, fontsize=8) plt.axhline(y=1. / weights_df.shape[1], lw=1., ls='--', c='black') plt.savefig( '{}/weights_{}.pdf'.format(output_dir, output_filename_part), bbox_inches='tight' ) plt.close() # visualize auc aucs_df_box = aucs_df.melt(value_name='AUC') sns.boxplot(data=aucs_df_box, y='AUC', x='variable') plt.savefig( '{}/aucs_{}.pdf'.format(output_dir, output_filename_part), bbox_inches='tight' ) plt.close() print("Files *_{}.* written to disk".format(output_filename_part)) return output_filename_part
[docs]def kpca( data_names, network_name, gene_sets_name, preprocess_dir, output_dir, class_label_file, weights_csv_file, fold, ): # model parameters kernel_normalization = True # prepare: read data and inducers data, inducers, inducers_extended_names = read_preprocessed( data_names, network_name, gene_sets_name, preprocess_dir ) # prepare: classification labels # if class_label_file: class_labels = pd.read_csv( class_label_file, index_col=0, header=None, squeeze=True ) class_labels = class_labels[~pd.isna(class_labels)] # match samples in data and labels measurement_data_samples = sorted( list( reduce( lambda a, b: a & b, (set(data[data_name].index) for data_name in data_names) ) ) ) samples = sorted( list(set(measurement_data_samples) & set(class_labels.index)) ) labels = (class_labels[samples].values) for data_name in data_names: data[data_name] = data[data_name].loc[samples].values # read learned weight to compute final kernel on all data weights = pd.read_csv(weights_csv_file, index_col=0) kpca_basename = os.path.basename(weights_csv_file).split('.')[0] if fold == -1: weights = weights.median() fold_name = 'median' else: weights = weights.loc[fold, :] fold_name = 'fold{}'.format(fold) # kernel pca model_trained_weights = PIMKL( inducers=inducers, induction='induce_linear_kernel', mkl='WeightedAverageMKL', mkl_parameters={ 'trace_normalization': kernel_normalization, 'kernels_weights': weights } ) model_trained_weights.fit(data) optimal_kernel = model_trained_weights.predict(data) kernel_pca = KernelPCA(kernel='precomputed' ).fit(Standardizer().apply(optimal_kernel)) transformed_data = kernel_pca.transform(optimal_kernel) # # not really meaningful for KernelPCA # explained_variance = np.var(transformed_data, axis=0) # explained_variance_ratio = explained_variance / explained_variance.sum() # plt.plot(np.cumsum(explained_variance_ratio)) # plt.xlabel('KernelPCA components') # plt.ylabel('Explained Variance Ratio') # plt.ylim((explained_variance_ratio[0], 1.0)) # plt.savefig( # '{}/kernel_pca_explained_variance_{}_{}.pdf'.format( # output_dir, fold_name, kpca_basename # ), # bbox_inches='tight' # ) components = 3 kpca_columns = list( map(lambda index: 'KernelPC{}'.format(index + 1), range(components)) ) kernel_pca_signature = pd.DataFrame( transformed_data[:, :components], index=samples, columns=kpca_columns ) kernel_pca_signature['class'] = labels plt.clf() sns.pairplot( kernel_pca_signature, kind='scatter', hue='class', vars=kpca_columns, plot_kws=dict(s=10, edgecolor='darkgrey', linewidth=1) ) plt.legend() plt.savefig( '{}/kernel_pca_signature_{}_{}_{}.pdf'.format( output_dir, components, fold_name, kpca_basename ), bbox_inches='tight' )
@click.group() def main(): pass @click.command(short_help='train and test many folds') @click.option('-nd', '--data_name', 'data_names', required=True, multiple=True) @click.argument('network_name', required=True) @click.argument('gene_sets_name', required=True) @click.argument( 'preprocess_dir', required=True, type=click.Path(exists=True, file_okay=False) ) @click.argument( 'output_dir', required=True, type=click.Path(exists=True, file_okay=False, writable=True) ) @click.argument( 'class_label_file', required=True, type=click.Path(exists=True, file_okay=True) ) @click.option( '--model_name', default='EasyMKL', type=click.Choice(['EasyMKL', 'UMKLKNN', 'AverageMKL']) ) @click.argument('lam', default=0.2) @click.argument('k', default=5) @click.argument('number_of_folds', default=10) @click.argument('max_per_class', default=20) @click.argument('seed', default=0) @click.argument('max_processes', default=1) def run_performance_analysis( data_names, network_name, gene_sets_name, preprocess_dir, output_dir, class_label_file, model_name, lam, k, number_of_folds, max_per_class, seed, max_processes ): """ Run classifications using pathway induced multiple kernel learning on preprocessed data and inducers on a number of train/test splits and analyse the resulting classification performance and learned pathway weights. The `class_label_file` should be readable with `pd.read_csv( class_label_file, index_col=0, header=None, squeeze=True)` """ output_filename_core = analyse( data_names, network_name, gene_sets_name, preprocess_dir, output_dir, class_label_file, model_name, lam, k, number_of_folds, max_per_class, seed, max_processes ) return 0 @click.command(short_help='KernelPCA with given pathway weights') @click.option('-nd', '--data_name', 'data_names', required=True, multiple=True) @click.argument('network_name', required=True) @click.argument('gene_sets_name', required=True) @click.argument( 'preprocess_dir', required=True, type=click.Path(exists=True, file_okay=False) ) @click.argument( 'output_dir', required=True, type=click.Path(exists=True, file_okay=False, writable=True) ) @click.argument( 'class_label_file', required=True, type=click.Path(exists=True, file_okay=True) ) @click.argument( 'weights_csv_file', required=True, type=click.Path(exists=True, file_okay=True) ) @click.argument('fold', default=-1) def run_kpca( data_names, network_name, gene_sets_name, preprocess_dir, output_dir, class_label_file, weights_csv_file, fold, ): """ Following pathway weight computation during performance analysis, perform KernelPCA on a final kernel defined by weights of either a given fold or by default the median pathway weight. """ kpca( data_names, network_name, gene_sets_name, preprocess_dir, output_dir, class_label_file, weights_csv_file, fold, ) return 0 main.add_command(run_performance_analysis) main.add_command(run_kpca) if __name__ == "__main__": sys.exit(main()) # pragma: no cover