Source code for pimkl.analysis

import sys
import io
import click
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from scipy.special import betainc
from scipy.stats import wilcoxon
from statsmodels.stats.multitest import fdrcorrection, multipletests

SIGNIFICANT_COLOR = sns.color_palette('colorblind')[2]
OTHER_COLOR = sns.color_palette('colorblind')[0]


[docs]def significant_pathways(df, alpha=1e-3):
    """significance test vs the average weight"""
    avg = 1. / df.shape[1]
    ordering = df.columns
    wilcoxon_pvalues = pd.Series(
        {
            ws: wilcoxon(df[ws] - avg)[1] if df[ws].median() > avg else 1.
            for ws in df
        }
    )
    wilcoxon_pvalues = wilcoxon_pvalues[ordering]
    # Multiple Testing Correction
    reject_H0, pvals_corrected = fdrcorrection(wilcoxon_pvalues, alpha=alpha)
    return pd.Series(reject_H0, index=ordering)


[docs]def significant_correlations(rho, sample_length):
    triu_indices = np.triu_indices(rho.shape[0], 1)
    rhof = rho.values[triu_indices]
    dof = sample_length - 2
    ts = rhof * rhof * (dof / (1 - rhof * rhof + sys.float_info.epsilon))
    p_values = betainc(0.5 * dof, 0.5, dof / (dof + ts))
    significants, p_values_corrected, _, _ = multipletests(
        p_values, alpha=5e-2, method='fdr_bh'
    )
    p_vals = np.zeros_like(rho)
    p_vals[triu_indices] = p_values_corrected
    p_vals += p_vals.T
    sig_mask = np.zeros_like(rho, dtype=np.bool)
    sig_mask[triu_indices] = significants
    sig_mask += (sig_mask.T + np.eye(rho.shape[0], dtype=np.bool))
    return sig_mask, p_vals


[docs]def plot_aucs_to_buffer(df, save=False):
    """
    plot AUC for multiindexed pandas.DataFrame where
    df.columns.names = ['data', 'kind']
    """
    buffer = None
    aucs_df_box = df.melt(value_name='AUC')
    if 'class' in df.columns.levels[1]:
        sns.boxplot(data=aucs_df_box, y='AUC', x='data')
    else:
        sns.boxplot(
            data=aucs_df_box, y='AUC', x='kind', hue='data'
        )  # hue_order=['', '']
    plt.xlabel('')
    # plt.xticks(rotation=90)
    plt.ylim((0.5, df.values.max()))
    if save:
        buffer = io.BytesIO()
        plt.savefig(buffer, format='pdf', bbox_inches='tight')
        buffer.seek(0)
    else:
        plt.show()
    plt.close()
    return buffer


[docs]def plot_weights_significant_correlations_to_buffer(
    weights_df, correlation_type, save=False
):
    """
    plot heatmap showing value of correlation if significant between different
    molecular signatures where weights_df.index.names is ['fold', 'class']
    """
    buffers_dict = {}
    correlation_cmap = sns.diverging_palette(
        240, 10, center='dark', as_cmap=True
    )  # Blue-Grey-Red
    # correlation all folds
    rho = weights_df.swaplevel('fold', 'class').sort_index().T.corr(
        method=correlation_type
    )
    sample_length = weights_df.shape[1]
    sig_mask = pd.DataFrame(
        significant_correlations(rho, sample_length)[0], dtype=np.uint8
    )
    # symmetric
    mask = np.zeros_like(rho, dtype=np.bool)
    mask[np.triu_indices(rho.shape[0], 1)] = True
    extended_mask = ~(sig_mask.astype(np.bool)).values | mask
    ticks = ['' for i in range(weights_df.shape[0])]
    ticks[0:weights_df.shape[0]:len(weights_df.index.levels[0])] = list(
        weights_df.index.levels[1]
    )
    f = sns.heatmap(
        rho,
        mask=extended_mask,
        vmin=-1,
        vmax=1,
        xticklabels=ticks,
        yticklabels=ticks,
        cmap=correlation_cmap
    ).get_figure()
    if save:
        buffers_dict['correlation_folds_{}'.format(correlation_type)
                     ] = io.BytesIO()
        f.savefig(
            buffers_dict['correlation_folds_{}'.format(correlation_type)],
            bbox_inches='tight'
        )
        buffers_dict['correlation_folds_{}'.format(correlation_type)].seek(0)
    else:
        plt.show()
    plt.close()

    # correlation of class summary
    median_classes_df = weights_df.swaplevel('fold',
                                             'class').sort_index().median(
                                                 axis=0, level='class'
                                             ).T
    median_classes_df['all folds'] = weights_df.median(
        axis=0
    ).T  # add median over folds over all classes
    rho_ = median_classes_df.corr(method=correlation_type)
    sig_mask_ = pd.DataFrame(
        significant_correlations(rho_, sample_length)[0],
        dtype=np.uint8,
        index=rho_.index,
        columns=rho_.columns
    )
    # symmetric
    mask_ = np.zeros_like(rho_, dtype=np.bool)
    mask_[np.triu_indices(rho_.shape[0], 1)] = True
    extended_mask_ = ~(sig_mask_.astype(np.bool)).values | mask_
    f = sns.heatmap(
        rho_, mask=extended_mask_, vmin=-1, vmax=1, cmap=correlation_cmap
    ).get_figure()
    if save:
        buffers_dict['correlation_phenotypes_{}'.format(correlation_type)
                     ] = io.BytesIO()
        f.savefig(
            buffers_dict['correlation_phenotypes_{}'.format(correlation_type)],
            bbox_inches='tight'
        )
        buffers_dict['correlation_phenotypes_{}'.format(correlation_type)
                     ].seek(0)
    else:
        plt.show()
    plt.close()
    return buffers_dict


[docs]def plot_weights_to_buffer(weights_df, save=False, plot_correlations=False):
    """
    plot molecular signature over many folds in pathway wise boxes.
    For non-binary problems each 1 versus Rest is ploted.
    weights_df.index.names should be ['fold'] or ['fold', 'class'])
    """
    buffers_dict = {}
    if 'class' not in weights_df.index.names:
        # _beautify_columns(weights_df)
        inducers_ordering = weights_df.median().sort_values(
            ascending=False
        ).index
        cols = [
            SIGNIFICANT_COLOR if sig else OTHER_COLOR
            for sig in significant_pathways(weights_df[inducers_ordering])
        ]
        avg = 1. / weights_df.shape[1]

        # plt.figure(figsize=(8,3))
        plt.figure(figsize=(18, 6))
        sns.boxplot(data=weights_df[inducers_ordering], palette=cols)
        plt.axhline(y=avg, lw=1., ls='--', c='black')
        # plt.title(
        #     'Pathways weights for {}\n'.format(phenotype_label) +
        #     '(fold={}, training samples per class={})'.format(cv, mc)
        # )
        plt.xlabel('Pathway')
        plt.ylabel('Weight')
        plt.ylim((0., weights_df.values.max()))
        # _ = plt.xticks([])
        _ = plt.xticks(rotation=90)
        if save:
            buffers_dict['weights_analysis'] = io.BytesIO()
            plt.savefig(
                buffers_dict['weights_analysis'],
                format='pdf',
                # '{}/weights_analysis_{}.pdf'.format(directory_name, suffix),
                bbox_inches='tight'
            )
            buffers_dict['weights_analysis'].seek(0)
        else:
            plt.show()
        plt.close()
    else:  # multi-index in case of EasyMKL with nonbinary labels
        significant_masks = {}
        # _beautify_columns(weights_df)
        w_dfs = {
            w: weights_df.xs(w, level='class')
            for w in weights_df.index.levels[1]
        }
        for w, w_df in w_dfs.items():
            inducers_ordering = w_df.median().sort_values(
                ascending=False
            ).index
            significant_masks[w] = significant_pathways(
                w_df[inducers_ordering]
            )
            cols = [
                SIGNIFICANT_COLOR if sig else OTHER_COLOR
                for sig in significant_masks[w]
            ]
            avg = 1. / w_df.shape[1]

            plt.figure(figsize=(18, 6))
            sns.boxplot(data=w_df[inducers_ordering], palette=cols)
            plt.axhline(y=avg, lw=1., ls='--', c='black')
            # plt.title(
            #     'Pathways weights for {} {} vs rest \n'.
            #     format(phenotype_label, w) +
            #     '(fold={}, training samples per class={})'.format(cv, mc)
            # )
            plt.xlabel('Pathway')
            plt.ylabel('Weight')
            plt.ylim((0., w_df.values.max()))
            # _ = plt.xticks([])
            _ = plt.xticks(rotation=90)
            if save:
                buffers_dict['weights_analysis_{}vR'.format(w)] = io.BytesIO()
                plt.savefig(
                    buffers_dict['weights_analysis_{}vR'.format(w)],
                    format='pdf',
                    bbox_inches='tight'
                )
                buffers_dict['weights_analysis_{}vR'.format(w)].seek(0)
            else:
                plt.show()
            plt.close()
        # 1vRest significance comparison
        significant_overall = pd.DataFrame(significant_masks).T
        # order = (  # sig count
        #     significant_overall.T
        # ).T.sum().sort_values().index[::-1]
        order = pd.concat([df for df in w_dfs.values()],
                          axis=0).median().sort_values(ascending=False).index

        fig, ax = plt.subplots()
        # plt.figure(figsize=(18,6))
        sns.heatmap(
            significant_overall[order],
            square=True,
            cmap=[OTHER_COLOR, SIGNIFICANT_COLOR],
            linewidths=.5,
            cbar=False,
            ax=ax
        )
        ax.set(aspect=5)
        plt.yticks(rotation=0)
        fontsize = 4 if w_df.shape[1] > 50 else 7
        plt.xticks(np.array(range(len(order))) + 0.5, order, fontsize=fontsize)
        plt.tight_layout()
        if save:
            buffers_dict['significant_pathways'] = io.BytesIO()
            plt.savefig(
                buffers_dict['significant_pathways'],
                format='pdf',
                bbox_inches='tight'
            )
            buffers_dict['significant_pathways'].seek(0)
        else:
            plt.show()
        plt.close()

        if plot_correlations:
            # correlation between 1vR weights
            more_buffers_dict = {}
            for correlation_type in ['pearson']:  # , 'spearman'
                more_buffers_dict = {
                    **more_buffers_dict,
                    **plot_weights_significant_correlations_to_buffer(
                        weights_df=weights_df,
                        correlation_type=correlation_type,
                        save=save
                    )
                }
            return {**buffers_dict, **more_buffers_dict}
    return buffers_dict