Source code for pimkl.analysis

import sys
import io
import click
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from scipy.special import betainc
from scipy.stats import wilcoxon
from statsmodels.stats.multitest import fdrcorrection, multipletests

SIGNIFICANT_COLOR = sns.color_palette('colorblind')[2]
OTHER_COLOR = sns.color_palette('colorblind')[0]


[docs]def significant_pathways(df, alpha=1e-3): """significance test vs the average weight""" avg = 1. / df.shape[1] ordering = df.columns wilcoxon_pvalues = pd.Series( { ws: wilcoxon(df[ws] - avg)[1] if df[ws].median() > avg else 1. for ws in df } ) wilcoxon_pvalues = wilcoxon_pvalues[ordering] # Multiple Testing Correction reject_H0, pvals_corrected = fdrcorrection(wilcoxon_pvalues, alpha=alpha) return pd.Series(reject_H0, index=ordering)
[docs]def significant_correlations(rho, sample_length): triu_indices = np.triu_indices(rho.shape[0], 1) rhof = rho.values[triu_indices] dof = sample_length - 2 ts = rhof * rhof * (dof / (1 - rhof * rhof + sys.float_info.epsilon)) p_values = betainc(0.5 * dof, 0.5, dof / (dof + ts)) significants, p_values_corrected, _, _ = multipletests( p_values, alpha=5e-2, method='fdr_bh' ) p_vals = np.zeros_like(rho) p_vals[triu_indices] = p_values_corrected p_vals += p_vals.T sig_mask = np.zeros_like(rho, dtype=np.bool) sig_mask[triu_indices] = significants sig_mask += (sig_mask.T + np.eye(rho.shape[0], dtype=np.bool)) return sig_mask, p_vals
[docs]def plot_aucs_to_buffer(df, save=False): """ plot AUC for multiindexed pandas.DataFrame where df.columns.names = ['data', 'kind'] """ buffer = None aucs_df_box = df.melt(value_name='AUC') if 'class' in df.columns.levels[1]: sns.boxplot(data=aucs_df_box, y='AUC', x='data') else: sns.boxplot( data=aucs_df_box, y='AUC', x='kind', hue='data' ) # hue_order=['', ''] plt.xlabel('') # plt.xticks(rotation=90) plt.ylim((0.5, df.values.max())) if save: buffer = io.BytesIO() plt.savefig(buffer, format='pdf', bbox_inches='tight') buffer.seek(0) else: plt.show() plt.close() return buffer
[docs]def plot_weights_significant_correlations_to_buffer( weights_df, correlation_type, save=False ): """ plot heatmap showing value of correlation if significant between different molecular signatures where weights_df.index.names is ['fold', 'class'] """ buffers_dict = {} correlation_cmap = sns.diverging_palette( 240, 10, center='dark', as_cmap=True ) # Blue-Grey-Red # correlation all folds rho = weights_df.swaplevel('fold', 'class').sort_index().T.corr( method=correlation_type ) sample_length = weights_df.shape[1] sig_mask = pd.DataFrame( significant_correlations(rho, sample_length)[0], dtype=np.uint8 ) # symmetric mask = np.zeros_like(rho, dtype=np.bool) mask[np.triu_indices(rho.shape[0], 1)] = True extended_mask = ~(sig_mask.astype(np.bool)).values | mask ticks = ['' for i in range(weights_df.shape[0])] ticks[0:weights_df.shape[0]:len(weights_df.index.levels[0])] = list( weights_df.index.levels[1] ) f = sns.heatmap( rho, mask=extended_mask, vmin=-1, vmax=1, xticklabels=ticks, yticklabels=ticks, cmap=correlation_cmap ).get_figure() if save: buffers_dict['correlation_folds_{}'.format(correlation_type) ] = io.BytesIO() f.savefig( buffers_dict['correlation_folds_{}'.format(correlation_type)], bbox_inches='tight' ) buffers_dict['correlation_folds_{}'.format(correlation_type)].seek(0) else: plt.show() plt.close() # correlation of class summary median_classes_df = weights_df.swaplevel('fold', 'class').sort_index().median( axis=0, level='class' ).T median_classes_df['all folds'] = weights_df.median( axis=0 ).T # add median over folds over all classes rho_ = median_classes_df.corr(method=correlation_type) sig_mask_ = pd.DataFrame( significant_correlations(rho_, sample_length)[0], dtype=np.uint8, index=rho_.index, columns=rho_.columns ) # symmetric mask_ = np.zeros_like(rho_, dtype=np.bool) mask_[np.triu_indices(rho_.shape[0], 1)] = True extended_mask_ = ~(sig_mask_.astype(np.bool)).values | mask_ f = sns.heatmap( rho_, mask=extended_mask_, vmin=-1, vmax=1, cmap=correlation_cmap ).get_figure() if save: buffers_dict['correlation_phenotypes_{}'.format(correlation_type) ] = io.BytesIO() f.savefig( buffers_dict['correlation_phenotypes_{}'.format(correlation_type)], bbox_inches='tight' ) buffers_dict['correlation_phenotypes_{}'.format(correlation_type) ].seek(0) else: plt.show() plt.close() return buffers_dict
[docs]def plot_weights_to_buffer(weights_df, save=False, plot_correlations=False): """ plot molecular signature over many folds in pathway wise boxes. For non-binary problems each 1 versus Rest is ploted. weights_df.index.names should be ['fold'] or ['fold', 'class']) """ buffers_dict = {} if 'class' not in weights_df.index.names: # _beautify_columns(weights_df) inducers_ordering = weights_df.median().sort_values( ascending=False ).index cols = [ SIGNIFICANT_COLOR if sig else OTHER_COLOR for sig in significant_pathways(weights_df[inducers_ordering]) ] avg = 1. / weights_df.shape[1] # plt.figure(figsize=(8,3)) plt.figure(figsize=(18, 6)) sns.boxplot(data=weights_df[inducers_ordering], palette=cols) plt.axhline(y=avg, lw=1., ls='--', c='black') # plt.title( # 'Pathways weights for {}\n'.format(phenotype_label) + # '(fold={}, training samples per class={})'.format(cv, mc) # ) plt.xlabel('Pathway') plt.ylabel('Weight') plt.ylim((0., weights_df.values.max())) # _ = plt.xticks([]) _ = plt.xticks(rotation=90) if save: buffers_dict['weights_analysis'] = io.BytesIO() plt.savefig( buffers_dict['weights_analysis'], format='pdf', # '{}/weights_analysis_{}.pdf'.format(directory_name, suffix), bbox_inches='tight' ) buffers_dict['weights_analysis'].seek(0) else: plt.show() plt.close() else: # multi-index in case of EasyMKL with nonbinary labels significant_masks = {} # _beautify_columns(weights_df) w_dfs = { w: weights_df.xs(w, level='class') for w in weights_df.index.levels[1] } for w, w_df in w_dfs.items(): inducers_ordering = w_df.median().sort_values( ascending=False ).index significant_masks[w] = significant_pathways( w_df[inducers_ordering] ) cols = [ SIGNIFICANT_COLOR if sig else OTHER_COLOR for sig in significant_masks[w] ] avg = 1. / w_df.shape[1] plt.figure(figsize=(18, 6)) sns.boxplot(data=w_df[inducers_ordering], palette=cols) plt.axhline(y=avg, lw=1., ls='--', c='black') # plt.title( # 'Pathways weights for {} {} vs rest \n'. # format(phenotype_label, w) + # '(fold={}, training samples per class={})'.format(cv, mc) # ) plt.xlabel('Pathway') plt.ylabel('Weight') plt.ylim((0., w_df.values.max())) # _ = plt.xticks([]) _ = plt.xticks(rotation=90) if save: buffers_dict['weights_analysis_{}vR'.format(w)] = io.BytesIO() plt.savefig( buffers_dict['weights_analysis_{}vR'.format(w)], format='pdf', bbox_inches='tight' ) buffers_dict['weights_analysis_{}vR'.format(w)].seek(0) else: plt.show() plt.close() # 1vRest significance comparison significant_overall = pd.DataFrame(significant_masks).T # order = ( # sig count # significant_overall.T # ).T.sum().sort_values().index[::-1] order = pd.concat([df for df in w_dfs.values()], axis=0).median().sort_values(ascending=False).index fig, ax = plt.subplots() # plt.figure(figsize=(18,6)) sns.heatmap( significant_overall[order], square=True, cmap=[OTHER_COLOR, SIGNIFICANT_COLOR], linewidths=.5, cbar=False, ax=ax ) ax.set(aspect=5) plt.yticks(rotation=0) fontsize = 4 if w_df.shape[1] > 50 else 7 plt.xticks(np.array(range(len(order))) + 0.5, order, fontsize=fontsize) plt.tight_layout() if save: buffers_dict['significant_pathways'] = io.BytesIO() plt.savefig( buffers_dict['significant_pathways'], format='pdf', bbox_inches='tight' ) buffers_dict['significant_pathways'].seek(0) else: plt.show() plt.close() if plot_correlations: # correlation between 1vR weights more_buffers_dict = {} for correlation_type in ['pearson']: # , 'spearman' more_buffers_dict = { **more_buffers_dict, **plot_weights_significant_correlations_to_buffer( weights_df=weights_df, correlation_type=correlation_type, save=save ) } return {**buffers_dict, **more_buffers_dict} return buffers_dict