#!/usr/bin/env python # coding: utf-8 # # Computing a PCA of tile-level CHARM features # This notebook exemplifies working with the computed CHARM features. # We will load several Wells of time resolved tile-level features, # compute their PCA, and show that single cell biologically relevant # information are indeed present in unsupervised features computed # on tiles without segmentation. # **Some cells take a very long time to run # (querying the rows from the table in particular)** # ### Dependencies # * [Matplotlib](http://matplotlib.org/) # * [NumPy](http://www.numpy.org/) # * [Pandas](http://pandas.pydata.org/) # * [Scikit-learn](http://scikit-learn.org/) # # The cell below will install dependencies if you choose to run the notebook in [Google Colab](https://colab.research.google.com/notebooks/intro.ipynb#recent=true). # In[ ]: get_ipython().run_line_magic('pip', 'install idr-py') # In[1]: from IPython import get_ipython import numpy as np import matplotlib.pyplot as plt from pandas import DataFrame from pandas import concat import omero from matplotlib import gridspec import sklearn.neighbors as nn import sklearn.decomposition as dec from sklearn.preprocessing import scale from idr import connection get_ipython().run_line_magic('matplotlib', 'inline') plt.rcParams['image.cmap'] = 'gray' # ### Method definitions # In[2]: def getBulkAnnotationAsDf(screenID, conn): """ Download the annotation file from a screen as a Pandas DataFrame """ sc = conn.getObject('Screen', screenID) for ann in sc.listAnnotations(): if isinstance(ann, omero.gateway.FileAnnotationWrapper): if (ann.getFile().getName() == 'bulk_annotations'): if (ann.getFile().getSize() > 147625090): print("that's a big file...") return None ofId = ann.getFile().getId() break original_file = omero.model.OriginalFileI(ofId, False) table = conn.c.sf.sharedResources().openTable(original_file) try: rowCount = table.getNumberOfRows() column_names = [col.name for col in table.getHeaders()] black_list = [] column_indices = [] for column_name in column_names: if column_name in black_list: continue column_indices.append(column_names.index(column_name)) table_data = table.slice(column_indices, None) finally: table.close() data = [] for index in range(rowCount): row_values = [column.values[index] for column in table_data.columns] data.append(row_values) dfAnn = DataFrame(data) dfAnn.columns = column_names return dfAnn # In[22]: def goneFishing(iid, x, y, t, df, nbrs, conn): """ Find and display nearest neighbours of a given tile """ qry = df[(df.x == x) & (df.y == y) & (df.ImageID == iid) & (df.t == t)].iloc[:, 12:] dfq = df[df.ImageID != iid] w = dfq.w.iloc[0] h = dfq.h.iloc[0] hook = getCondensationSubStack(iid, x, y, w, h, t, t + 1, conn) distances, indices = nbrs.kneighbors(qry) nnn = len(indices[0]) tiles = np.zeros((h, w, nnn)) for ind, ii in zip(indices[0], range(nnn)): iidcur = dfq.ImageID.iloc[ind] x = dfq.x.iloc[ind] y = dfq.y.iloc[ind] t = int(dfq.t.iloc[ind]) tiles[:, :, ii] = getCondensationSubStack(iidcur, x, y, w, h, t, t + 1, conn) d, r = divmod(nnn, 4) plt.figure(figsize=(12, 30)) gs = gridspec.GridSpec(2, 1, height_ratios=[1, 3]) ax0 = plt.subplot(gs[0, 0]) ax0.set_title('Original frame') ax0.imshow(hook) imc = buildComposite(tiles, d + (1 & r), 4, smpl=1) ax1 = plt.subplot(gs[1, 0]) ax1.set_title('Nearest neighbours') ax1.imshow(imc) # In[4]: def getCondensationSubStack(imId, x, y, w, h, ti, tf, conn, chan=0): # Extract a substack of an image as a numpy array im = conn.getObject("Image", imId) pix = im.getPrimaryPixels() z = 0 c = chan tile = (x, y, w, h) zctList = [(z, c, it, tile) for it in range(ti, tf)] planes = pix.getTiles(zctList) st = [] for plane in planes: st.append(plane) st = np.asarray(st) if tf > ti + 1: st = np.rollaxis(st, 0, 3) else: st = np.squeeze(st) return st # In[5]: def buildComposite(st, n, m, smpl=None): # nxm shots from st in a grid, as an image nr = st.shape[0] nc = st.shape[1] if smpl is None: smpl = st.shape[2] // (n * m) res = np.zeros((nr * n, nc * m)) for i in range(n): for j in range(m): try: v = st[:, :, (i * m + j) * smpl] res[i * nr: i * nr + nr, j * nc: j * nc + nc] = v except Exception: break return res # In[26]: def outOneStack(imId, x, y, df, con, alg='PCA', chan=0, title='First 5 PCA components of a stack'): # Plot PCA components along with tiles from the stack df = df[(df.x == x) & (df.y == y) & (df.ImageID == imId)].sort_values('t', ascending=True) plt.figure(figsize=(12, 30)) gs = gridspec.GridSpec(2, 1, height_ratios=[1, 2]) ax0 = plt.subplot(gs[0, 0]) ax0.set_title(title) ax0.set_xlabel('Time (frame)') ax0.set_ylabel('PCA components (a.u.)') if alg == 'PCA': ax0.plot(df.PCA_0.values) ax0.plot(df.PCA_1.values, 'r') ax0.plot(df.PCA_2.values, 'g') ax0.plot(df.PCA_3.values, 'y') ax0.plot(df.PCA_4.values, 'm') ax0.plot(df.PCA_5.values, 'c') elif alg == 'DR': ax0.plot(df.DR_0.values) ax0.plot(df.DR_1.values, 'r') ax0.plot(df.DR_2.values, 'g') ax0.plot(df.DR_3.values, 'y') ax0.plot(df.DR_4.values, 'm') x, y, w, h = x, y, df.w.min(), df.h.min() ti, tf = int(df.t.min()), int(df.t.max()) st = getCondensationSubStack(imId, x, y, w, h, ti, tf, conn, chan) imc = buildComposite(st, 5, 5) ax1 = plt.subplot(gs[1, 0]) ax1.imshow(imc) ax1.set_title('Mosaic from the corresponding stack') return st # In[7]: def data2df(data): # convert an OMERO.table data structure to a dataframe, # unpacking features array nrow = len(data.columns[0].values) nfeat = 0 for i in range(len(data.columns)): try: s = len(data.columns[i].values[0]) except Exception: s = 1 nfeat = nfeat + s featsAll = np.zeros((nrow, nfeat)) nfeat = 0 featNames = [] for i in range(len(data.columns)): try: s = len(data.columns[i].values[0]) for k in range(len(data.columns[i].values)): v = np.array(data.columns[i].values[k]) featsAll[k, nfeat: nfeat + s] = v for k in range(s): featNames.append(data.columns[i].name+str(k)) except Exception: s = 1 featsAll[:, nfeat] = np.array(data.columns[i].values) featNames.append(data.columns[i].name) nfeat = nfeat + s dfFeat = DataFrame(featsAll) dfFeat.columns = featNames return dfFeat # In[8]: def rows2features(qr, concatenateC=True): # Extract selected rows from an OMERO.table maxr = 10000 # else memory errors dfFeatPhen = DataFrame() k = -1 # in case there is less that maxr rows for k in range(len(qr) // maxr): data = featTable.readCoordinates(qr[k * maxr: (k + 1) * maxr]) data = data2df(data) dfFeatPhen = concat([dfFeatPhen, data]) print(str(k * maxr) + ' to ' + str((k + 1) * maxr)) if len(qr) % maxr != 0: data = featTable.readCoordinates(qr[(k + 1) * maxr:]) data = data2df(data) dfFeatPhen = concat([dfFeatPhen, data]) print('from ' + str((k + 1) * maxr)) dfFeatPhen = dfFeatPhen.reset_index(drop=True) if concatenateC: featNames = dfFeatPhen.columns[0: 12] for i in [0, 1, 2]: s = str(i) + '_' + dfFeatPhen.columns[12:] featNames = featNames.append(s) nf = dfFeatPhen.shape[1] - 12 dfFeat = np.zeros((dfFeatPhen.shape[0] // 3, 12 + 3 * nf)) v = dfFeatPhen.groupby(['ImageID', u'x', u'y']) for k, (name, grp) in enumerate(v): dfFeat[k, 0: 12] = grp.iloc[0, 0: 12].values grp.sort_values('c', inplace=True) for i, (nr, r) in enumerate(grp.iterrows()): value = r.iloc[12:].values dfFeat[k, (12 + i * nf): (12 + (i + 1) * nf)] = value dfFeatPhen = DataFrame(dfFeat) dfFeatPhen.columns = featNames return dfFeatPhen # ## Loading Data # In[9]: conn = connection('idr.openmicroscopy.org') # In[10]: scId = 102 # Heriche et al., DNA condentation screen dfAnn = getBulkAnnotationAsDf(scId, conn) # In[11]: # Retrieving the original file corresponding to the feature table sc = conn.getObject('Screen', scId) # Printing names of all files attached to the screen for ann in sc.listAnnotations(): if isinstance(ann, omero.gateway.FileAnnotationWrapper): print(ann.getFile().getName()) # Getting the fileAnnotation corresponding to the features # !!! not the full file name = 'idr0002-heriche-condensation-screenA-plate-all.h5' for ann in sc.listAnnotations(): if isinstance(ann, omero.gateway.FileAnnotationWrapper): if (ann.getFile().getName() == name): ofId = ann.getFile().getId() break original_file = omero.model.OriginalFileI(ofId, False) # In[12]: # Cherry picking a few wells plateCP = ['plate1_1_013', 'plate2_2_007', 'plate2_2_007', 'plate3_4_034'] wellCP = [5, 53, 63, 24] w = [dfAnn[(dfAnn['Plate Name'] == plateCP[i]) & (dfAnn['Well Number'] == str(wellCP[i]))]['Well'] for i in range(4)] w = concat(w) print(w) # In[13]: # Finding the rows of the table we are interested in featTable = conn.c.sf.sharedResources().openTable(original_file) try: featRowCount = featTable.getNumberOfRows() print('number of tiles with computed features:' + str(featRowCount)) where = "(WellID==" + str(w.values[0]) + ") | (WellID==" + str(w.values[1]) + ") | (WellID==" + str(w.values[2]) + ") | (WellID==" + str(w.values[3]) + ")" qr = featTable.getWhereList(where, variables={}, start=0, stop=featRowCount, step=0) # downloading them as a dataframe maxr = 10000 # else memory errors df = DataFrame() k = -1 # in case there is less that maxr rows for k in range(len(qr) // maxr): data = featTable.readCoordinates(qr[k * maxr: (k + 1) * maxr]) data = data2df(data) df = concat([df, data]) print(str(k * maxr) + ' to ' + str((k + 1) * maxr)) if len(qr) % maxr != 0: data = featTable.readCoordinates(qr[(k + 1) * maxr:]) data = data2df(data) df = concat([df, data]) print('from ' + str((k + 1) * maxr)) df = df[df.c == 0] # first channel is the one with chromosome information df = df.reset_index(drop=True) finally: featTable.close() # ## computing PCA # In[14]: # PCA of 4 wells df = df[df.c == 0] dat = df.iloc[:, 12:] # keeping only the features which are not too discrete # nbv = [len(dat[x].unique()) for x in df.iloc[:, 12:]] # dat = dat.iloc[:, np.array(nbv) > 10] dat = scale(dat) pca = dec.PCA(n_components=250, svd_solver='randomized') # pca = dec.RandomizedPCA(250) # pca = dec.SparsePCA(250) # pca = dec.KernelPCA(n_components=10, kernel='rbf') pca.fit(dat) datPCA = pca.transform(dat) datPCA = DataFrame(datPCA) datPCA.columns = ['PCA_' + str(i) for i in range(datPCA.shape[1])] df = df.reset_index(drop=True) df = concat((df.iloc[:, 0: 12], datPCA), axis=1) # ## Visualization # In[27]: # Visualization of one stack. # a Division is happening around t = 220, and seems to have # a clear signature, even with only 5 components x, y = 504, 384 imId = 179719 st = outOneStack(imId, x, y, df, conn, alg='PCA', chan=0) # In[28]: # Looking for nearest neighbours of a given frame iid, x, y, t = 179719, 504, 384, 220 # division at 220 dfq = df[df.ImageID != iid] # excluding the image the query frame comes from nbrs = nn.NearestNeighbors(n_neighbors=12, algorithm='ball_tree').fit(dfq.iloc[:, 12:]) goneFishing(iid, x, y, t, df, nbrs, conn) # ### Disconnect when done loading data # In[ ]: conn.close() # ### License (BSD 2-Clause)ΒΆ # # Copyright (C) 2016-2021 University of Dundee. All Rights Reserved. # # Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: # # Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.