# Connect to the IPython.parallel cluster (legacy pre-4.0 API; later ipyparallel).
from IPython.parallel import Client
cli = Client()
print("Total workers: {}".format(len(cli.ids)))
# (captured notebook output of the cell above:)
Total workers: 12
# dview: direct view over all engines; lbview: load-balanced task scheduler.
dview = cli[:]
lbview = cli.load_balanced_view()
%%px --local
# Imports executed on every engine AND in the local kernel (--local).
from collections import Counter, namedtuple
import itertools
from pathlib import Path
import pickle
from IPython.html import widgets # Widget definitions
from IPython.display import display # Used to display widgets in the notebook
from PIL import Image
import numpy as np
import pandas as pd
from sklearn import svm
# NOTE(review): sklearn.cross_validation is the legacy module (removed in
# scikit-learn 0.20); model_selection is its modern replacement.
from sklearn import cross_validation
# Resolve the notebook dir and shared src/ tree as POSIX strings, then push
# them to every engine so the next %%px cell can extend sys.path with them.
current_dir = Path('.').resolve().as_posix()
src_dir = Path('../../src').resolve().as_posix()
dview['current_dir'] = current_dir
dview['src_dir'] = src_dir
%%px --local
# Make the project code importable on each engine and locally.
import sys
sys.path.append(current_dir)
sys.path.append(src_dir)
from exp_utils import (
    Connection, Dataset, SingleLabelDataset,
    PathConnection, DatasetConnection,
    TrainTestConnection, TrainTestDatsetConnection,
    Pred, get_mismatch_pred, make_mismatch_df, confmat_to_df, read_patches_from_batch
)
%%px --local
# Load the (Train, Test) label-mapping spreadsheets into a paired connection.
with pd.ExcelFile(
    str(Path(current_dir).parent / 'exp06_both' / 'label_mapping_modified.xlsx')
) as xlsx:
    label_stat_df = TrainTestConnection(xlsx.parse('Train'), xlsx.parse('Test'))
# Labels excluded when building the file-suffix -> label mapping.
IGNORE_LABELS = [-1, 6, 7]


def make_label_map(df):
    """Map each file suffix in *df* to its label, skipping ignored labels.

    Expects *df* to expose ``file_suffix`` and ``label`` columns.
    """
    mapping = {}
    for suffix, label in zip(df.file_suffix, df.label):
        if label not in IGNORE_LABELS:
            mapping[suffix] = label
    return mapping
# Apply the mapping builder to both the Train and Test sheets.
anno_label_map = label_stat_df.apply(make_label_map)
# Chinese display name for each label id.
LABEL_NAME_MAP = {
    1: '腺癌', 2: '正常', 3: '粘液癌', 4: '锯齿状癌',
    5: '乳头状癌', 6: '绒毛状腺瘤', 7: '锯齿状腺瘤', 8: '绸带粉刺状癌',
}
# English abbreviation for each label id (used for output folder names).
LABEL_NAME_MAP_EN= {
    1: 'AC', 2: 'N', 3: 'MC', 4: 'SC',
    5: 'PC', 6: 'VA', 7: 'SA', 8: 'CCTA'
}
%%px --local
# Dataset objects for the three splits; paths resolve against the sibling
# experiment directories.
train = Dataset(
    dir_name='exp04_run', img_desc_pth=str(Path(current_dir).parent / 'exp04_run/remote_train_list.txt'),
    anno_label_map=anno_label_map.train
)
test = Dataset(
    dir_name='exp05_run', img_desc_pth=str(Path(current_dir).parent / 'exp05_run/remote_test_list.txt'),
    anno_label_map=anno_label_map.test
)
# Extra normal images; every image carries label 2 ('N' in LABEL_NAME_MAP_EN).
addnorm = SingleLabelDataset(
    label=2,
    dir_name='exp08_addnorm', img_desc_pth=str(Path(current_dir).parent /'exp08_addnorm/normal_list_0.txt')
)
all_datasets = TrainTestDatsetConnection(train=train, test=test, addnorm=addnorm)
# Per-image features: concatenate components f[0] and f[2] of each feature
# row (presumably the avg-pool and softmax3 features, per the variable name —
# TODO confirm), yielding 8192-dim vectors (see captured shape output below).
feat_df_avg_softmax3 = all_datasets.used_features.apply(
    lambda feat_mat: pd.DataFrame(np.r_[f[0], f[2]] for f in feat_mat))
feat_df_avg_softmax3.shape
# (captured notebook output of the line above:)
Connection(train=(273, 8192), test=(120, 8192), addnorm=(300, 8192))
# Stack every split into one feature matrix / label vector for the SVM.
full_X = np.concatenate(list(feat_df_avg_softmax3))
full_Y = np.concatenate(list(all_datasets.used_labels))
# Single stratified 50/50 split (legacy sklearn.cross_validation API).
ss_splitter = cross_validation.StratifiedShuffleSplit(
    full_Y, n_iter=1, test_size=0.5, random_state=9527
)
train_ix, test_ix = next(iter(ss_splitter))
train_X, train_Y = full_X[train_ix], full_Y[train_ix]
test_X, test_Y = full_X[test_ix], full_Y[test_ix]
# One-vs-rest linear SVM; coef_ holds one 8192-dim weight row per class.
classifier = svm.LinearSVC(C=0.5, random_state=5566)
classifier.fit(train_X, train_Y)
weight_mat = classifier.coef_
weight_mat.shape
# (captured notebook output: 6 classes x 8192 features)
(6, 8192)
# Class abbreviations in the same order as the classifier's weight rows.
LABEL_NAME = [LABEL_NAME_MAP_EN[cls] for cls in classifier.classes_]
%%px --local
# Plotting stack; Agg backend because the engines run headless.
import matplotlib as mpl
mpl.use("Agg")
from matplotlib import pyplot as plt
from matplotlib import gridspec
from matplotlib import font_manager
from matplotlib.colors import Normalize
from matplotlib.ticker import MaxNLocator
from mpl_toolkits.axes_grid1 import make_axes_locatable
%%px --local
class MidpointNormalize(Normalize):
    """Colour normalisation that pins *midpoint* to the centre (0.5) of the map.

    Values are piecewise-linearly interpolated from [vmin, midpoint, vmax]
    onto [0, 0.5, 1]; NaN results are masked so the colormap treats them as
    "bad" (e.g. transparent) pixels.
    """

    def __init__(self, vmin=None, vmax=None, midpoint=None, clip=False):
        self.midpoint = midpoint
        Normalize.__init__(self, vmin, vmax, clip)

    def __call__(self, value, clip=None):
        # Edge cases (masked inputs, etc.) are deliberately not handled here.
        anchors = [self.vmin, self.midpoint, self.vmax]
        targets = [0, 0.5, 1]
        interpolated = np.interp(value, anchors, targets)
        # Mask NaNs (cells never covered by any patch) out of the heatmap.
        return np.ma.masked_array(interpolated, mask=np.isnan(interpolated))
# Figure output root on the network share; raw strings keep the backslashes.
out_root = Path(
    r"\\msralab\ProjectData\eHealth\v-lianwa\CRC_pathology_result\exp10_refinefig"
    r"\heatmap_multilabel_separate_jet"
)
# mkdir(..., exist_ok=True) replaces the racy "check exists then create"
# pattern (a concurrent engine could create the dir between the two calls).
out_root.mkdir(parents=True, exist_ok=True)
out_root_test = out_root / 'test'
out_root_train = out_root / 'train'
out_root_addnorm = out_root / 'addnorm'
# One directory per split, each holding one sub-folder per class label.
for out_dir in [out_root_test, out_root_train, out_root_addnorm]:
    out_dir.mkdir(exist_ok=True)
    for class_label in LABEL_NAME:
        (out_dir / class_label).mkdir(parents=True, exist_ok=True)
%%px --local
def read_batch(p):
    """Unpickle and return the image-id -> batch mapping stored at path *p*."""
    with p.open('rb') as fh:
        return pickle.load(fh)
# Pre-load the per-image batch maps for train/test; addnorm has no pickled
# map and is filled in by the next cell.
img_batch_map = TrainTestConnection(
    train=read_batch(all_datasets.train.img_batch_pickle),
    test=read_batch(all_datasets.test.img_batch_pickle),
    addnorm=None
)
### img_batch_map for addnorm
# Rebuild the map from raw batch files: sort data_batch_* by numeric suffix,
# then group consecutive suffixes by suffix // 1000 (presumably one group per
# image id — TODO confirm against the batch-generation code).
batch_dir_addnorm = all_datasets.addnorm.result_root / 'batch'
batch_list_addnorm = sorted(
    (p for p in batch_dir_addnorm.iterdir() if p.name.startswith('data_batch')),
    key=lambda p: int(p.name.rsplit('_', 1)[-1])
)
# groupby requires the pre-sort above (same key) to form contiguous groups.
img_batch_map.addnorm = dict(
    (k, list(v))
    for k, v in itertools.groupby(
        batch_list_addnorm,
        lambda p: int(p.name.rsplit('_', 1)[-1]) // 1000
    ))
# Broadcast the output dirs, label maps and SVM weights to every engine;
# wait() blocks until all engines have received the payload.
ar = dview.push(dict(
    out_root_test=out_root_test, out_root_train=out_root_train, out_root_addnorm=out_root_addnorm,
    LABEL_NAME=LABEL_NAME, LABEL_NAME_MAP=LABEL_NAME_MAP, LABEL_NAME_MAP_EN=LABEL_NAME_MAP_EN,
    weight_mat=weight_mat
))
ar.wait()
%%px --local
def read_patches_from_batch_addnorm(img_id, img_batch_map):
    """Collect patch pixel data and fc2 features for one addnorm image.

    Reads every batch file listed under *img_id* in *img_batch_map* and keeps
    only entries whose batch label equals 0 (presumably the marker for usable
    patches — TODO confirm). Returns two parallel lists (img_data, feat_vec).
    """
    img_data, feat_vec = [], []
    for batch_path in img_batch_map[img_id]:
        with batch_path.open('rb') as fh:
            # latin1 lets Python 3 unpickle the Python 2-era batch files
            raw = pickle.load(fh, encoding='latin1')
        for data_vec, lab in zip(raw['data'], raw['labels']):
            if lab == 0:
                img_data.append(data_vec)
        for fc2_vec, lab in zip(raw['fc2'], raw['labels']):
            if lab == 0:
                feat_vec.append(fc2_vec)
    return img_data, feat_vec
%%px --local
def plot_class_confidence_separate(
    img_id, img_patches, img_pth, label_name, dist_mat, fig_out_p
):
    """Save the instance image overlaid with the confidence heatmap for one class.

    Parameters
    ----------
    img_id : image identifier (only used by the caller for naming).
    img_patches : patch records; ``patch.window.to_margin()`` yields the pixel
        margins of each patch in the original image.
    img_pth : path object of the instance image file.
    label_name : class whose confidence row of *dist_mat* is drawn.
    dist_mat : per-class confidence rows, aligned with LABEL_NAME.
    fig_out_p : output figure path; returned unchanged.
    """
    # open instance image
    instance_img = Image.open(img_pth.open('rb'))
    orig_size = instance_img.size
    # resize (NOTE: ANTIALIAS == LANCZOS; the old alias is removed in Pillow 10)
    instance_img.thumbnail((4096, 4096), Image.ANTIALIAS)
    current_size = instance_img.size
    scale_x = current_size[0] / orig_size[0]
    scale_y = current_size[1] / orig_size[1]
    ## Draw heatmap
    fig = plt.figure(None, (8, 8), frameon=False)
    ax = fig.add_subplot(1, 1, 1)
    # accumulate confidence per pixel, then average by patch coverage count
    confid_mat = np.zeros(current_size, np.float32)
    confid_count = np.zeros(current_size, int)  # np.int was removed in NumPy 1.24
    confid_vec = next(
        confid_vec for class_label, confid_vec in zip(LABEL_NAME, dist_mat)
        if class_label == label_name
    )
    for confid, patch in zip(confid_vec, img_patches):
        margin = patch.window.to_margin()
        # slice bounds must be ints: float indices raise in modern NumPy
        slice_x = slice(int(round(margin.x1 * scale_x)), int(round(margin.x2 * scale_x)))
        slice_y = slice(int(round(margin.y1 * scale_y)), int(round(margin.y2 * scale_y)))
        confid_mat[slice_x, slice_y] += confid
        confid_count[slice_x, slice_y] += 1
    # uncovered cells become NaN -> masked away by MidpointNormalize
    confid_count = np.where(confid_count == 0, np.nan, confid_count)
    confid_mat /= confid_count
    # plot heatmap over the image (transposed: confid_mat is indexed x-then-y)
    norm = MidpointNormalize(midpoint=0)
    cmap = plt.cm.jet  # plt.cm.RdYlBu_r
    ax.imshow(instance_img, interpolation='nearest', aspect='equal')
    im_confid = ax.imshow(
        confid_mat.T, interpolation='nearest', norm=norm, origin='upper',
        alpha=0.4, cmap=cmap, aspect='equal'
    )
    # turn off borders
    ax.set_axis_off()
    plt.axis('off')
    plt.tight_layout(pad=0, h_pad=0, w_pad=0)
    fig.savefig(str(fig_out_p), transparent=True, bbox_inches='tight', pad_inches=0, dpi=600)
    plt.close(fig)
    return fig_out_p
%%px --local
def plot_class_confidence(
    img_id, img_patches, img_pth, label_name, dist_mat, fig_out_p
):
    """Save a two-panel figure: original image (left), confidence overlay (right).

    Same inputs as plot_class_confidence_separate; only the row of *dist_mat*
    matching *label_name* is drawn. Returns *fig_out_p* unchanged.
    """
    # open instance image
    instance_img = Image.open(img_pth.open('rb'))
    orig_size = instance_img.size
    # resize in two steps (ANTIALIAS == LANCZOS; alias removed in Pillow 10)
    instance_img.thumbnail((4096, 4096), Image.ANTIALIAS)
    instance_img.thumbnail((2048, 2048), Image.ANTIALIAS)
    current_size = instance_img.size
    scale_x = current_size[0] / orig_size[0]
    scale_y = current_size[1] / orig_size[1]
    # Begin drawing
    fig = plt.figure(None, (16, 8))
    gs_all = gridspec.GridSpec(1, 2)
    # left panel: the plain image
    ax_orig = plt.subplot(gs_all[:2, 0])
    ax_orig.imshow(instance_img)
    ax_orig.axis('off')
    ax1 = plt.subplot(gs_all[0, 1])
    for class_label, confid_vec in zip(LABEL_NAME, dist_mat):
        if class_label != label_name:
            continue
        # accumulate confidence per pixel, then average by coverage count
        confid_mat = np.zeros(current_size)
        confid_count = np.zeros(current_size, int)  # np.int removed in NumPy 1.24
        for confid, patch in zip(confid_vec, img_patches):
            margin = patch.window.to_margin()
            # integer slice bounds (float indices raise in modern NumPy)
            slice_x = slice(int(round(margin.x1 * scale_x)), int(round(margin.x2 * scale_x)))
            slice_y = slice(int(round(margin.y1 * scale_y)), int(round(margin.y2 * scale_y)))
            confid_mat[slice_x, slice_y] += confid
            confid_count[slice_x, slice_y] += 1
        # uncovered cells -> NaN so MidpointNormalize masks them out
        confid_count = np.where(confid_count == 0, np.nan, confid_count)
        confid_mat /= confid_count
        # plot orig image under the semi-transparent heatmap
        ax1.imshow(instance_img)
        norm = MidpointNormalize(midpoint=0)
        im_confid = ax1.imshow(confid_mat.T, norm=norm, alpha=0.6, origin='upper', cmap=plt.cm.RdYlBu_r)
        ax1.axis('off')
    # save figure
    fig.savefig(str(fig_out_p), transparent=True, bbox_inches='tight', pad_inches=0.1, dpi=300)
    plt.close(fig)
    return fig_out_p
@lbview.parallel(block=False)
def plot_test_class_confidence(img_id):
    """Render and save the confidence heatmap for one test image."""
    img_data, img_patch_feats = read_patches_from_batch(img_id, img_batch_map.test)
    # NOTE(review): tiling duplicates each feature vector side by side so its
    # width matches weight_mat's 8192 columns — confirm this is consistent
    # with how the training features (np.r_[f[0], f[2]]) were built.
    img_patch_feat_mat = np.tile(img_patch_feats, [1, 2])
    dist_mat = weight_mat.dot(img_patch_feat_mat.T)  # compute the distance (confidence) value
    # read patch records, dropping patches that fall outside the image margin
    patch_pickle_p = Path(all_datasets.test.result_root, 'patch', str(img_id), 'patch_records.pickle3')
    with patch_pickle_p.open('rb') as f:
        img_patches = [p for p in pickle.load(f) if not any(p.window.outmargin)]
    img_pth = all_datasets.test.img_paths[img_id]
    label_name = LABEL_NAME_MAP_EN[all_datasets.test.labels[img_id]]
    fig_out_p = Path(out_root_test, label_name, "Image_{:d}.png".format(img_id))
    return plot_class_confidence_separate(
        img_id, img_patches, img_pth, label_name, dist_mat, fig_out_p
    )
@lbview.parallel(block=False)
def plot_train_class_confidence(img_id):
    """Render and save the confidence heatmap for one training image."""
    img_data, img_patch_feats = read_patches_from_batch(img_id, img_batch_map.train)
    # NOTE(review): tiling duplicates each feature vector so its width matches
    # weight_mat's 8192 columns — confirm against the training feature layout.
    img_patch_feat_mat = np.tile(img_patch_feats, [1, 2])
    dist_mat = weight_mat.dot(img_patch_feat_mat.T)  # compute the distance (confidence) value
    # read patch records, dropping patches that fall outside the image margin
    patch_pickle_p = Path(all_datasets.train.result_root, 'patch', str(img_id), 'patch_records.pickle3')
    with patch_pickle_p.open('rb') as f:
        img_patches = [p for p in pickle.load(f) if not any(p.window.outmargin)]
    img_pth = all_datasets.train.img_paths[img_id]
    label_name = LABEL_NAME_MAP_EN[all_datasets.train.labels[img_id]]
    fig_out_p = Path(out_root_train, label_name, "Image_{:d}.png".format(img_id))
    return plot_class_confidence_separate(
        img_id, img_patches, img_pth, label_name, dist_mat, fig_out_p
    )
@lbview.parallel(block=False)
def plot_addnorm_class_confidence(img_id):
    """Render and save the confidence heatmap for one addnorm (normal) image."""
    img_data, img_patch_feats = read_patches_from_batch_addnorm(img_id, img_batch_map.addnorm)
    # NOTE(review): tiling duplicates each feature vector so its width matches
    # weight_mat's 8192 columns — confirm against the training feature layout.
    img_patch_feat_mat = np.tile(img_patch_feats, [1, 2])
    dist_mat = weight_mat.dot(img_patch_feat_mat.T)  # compute the distance (confidence) value
    # read patch records, dropping patches that fall outside the image margin
    patch_pickle_p = Path(all_datasets.addnorm.result_root, 'patch', str(img_id), 'patch_records.pickle3')
    with patch_pickle_p.open('rb') as f:
        img_patches = [p for p in pickle.load(f) if not any(p.window.outmargin)]
    img_pth = all_datasets.addnorm.img_paths[img_id]
    # every addnorm image is labelled 2 ('N' in LABEL_NAME_MAP_EN)
    label_name = LABEL_NAME_MAP_EN[2]
    fig_out_p = Path(out_root_addnorm, label_name, "Image_{:d}.png".format(img_id))
    return plot_class_confidence_separate(
        img_id, img_patches, img_pth, label_name, dist_mat, fig_out_p
    )
# from IPython import display as ipydisplay
%%time
# Smoke test: render one figure per split locally; .func is the original,
# undecorated function behind each @lbview.parallel wrapper.
fig_p = PathConnection(train=None, test=None, addnorm=None)
fig_p.train = plot_train_class_confidence.func(13)
fig_p.addnorm = plot_addnorm_class_confidence.func(10)
fig_p.test = plot_test_class_confidence.func(15)
WARNING:py.warnings:C:\Miniconda\envs\crc34\lib\site-packages\PIL\Image.py:2192: DecompressionBombWarning: Image size (205051824 pixels) exceeds limit of 89478485 pixels, could be decompression bomb DOS attack. DecompressionBombWarning)
Wall time: 3min 12s
# ipydisplay.Image(str(fig_p.train))
def save_async_meta(async_result, out_pth):
    """Persist the scheduling metadata of *async_result* as a pickled DataFrame.

    Only the engine/timing columns useful for profiling task dispatch are kept.
    """
    keep_cols = ['engine_id', 'engine_uuid', 'msg_id', 'started', 'completed', 'received']
    meta = pd.DataFrame(async_result.metadata)[keep_cols]
    meta.to_pickle(str(out_pth))
# Fan the figure jobs out over the load-balanced view, one split at a time,
# and archive each run's scheduling metadata next to the figures.
ar = plot_train_class_confidence.map(all_datasets.train.used_img_ids)
ar.wait_interactive()
# (captured notebook output:)
273/273 tasks finished after 1104 s done
save_async_meta(ar, out_root / 'train_ar_meta.pickle3')
ar = plot_test_class_confidence.map(all_datasets.test.used_img_ids)
ar.wait_interactive()
# (captured notebook output:)
120/120 tasks finished after 1250 s done
save_async_meta(ar, out_root / 'test_ar_meta.pickle3')
ar = plot_addnorm_class_confidence.map(all_datasets.addnorm.used_img_ids)
ar.wait_interactive()
# (captured notebook output:)
300/300 tasks finished after 1835 s done
save_async_meta(ar, out_root / 'addnorm_ar_meta.pickle3')