#!/usr/bin/env python
# coding: utf-8

# # Data pre-processing
# 156,049 mouse nuclei from the developing brain and spinal cord of p2 or p11
# mice were profiled by
# [SPLiT-seq](https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE110823),
# where 26,894 genes were detected.
#
# This script reads the original MATLAB file, writes the per-cell cluster
# assignment to an RDS file, and writes the expression matrix to a loom file.

# In[1]:

import numpy as np
import scipy.io
import pandas as pd
import loompy
import rpy2.robjects as robjects
from rpy2.robjects import pandas2ri

# Handle to R's saveRDS so pandas objects can be written as .rds files.
saveRDS = robjects.r["saveRDS"]
# Enable the pandas <-> R conversion used when saveRDS is called below.
pandas2ri.activate()

# In[2]:

data_path = "E:/DISC/reproducibility/data/BRAIN_SPLiT/original_data/GSM3017261_150000_CNS_nuclei.mat"
data = scipy.io.loadmat(data_path)

# Transposed so the matrix orientation matches the attributes passed to
# loompy below (row attribute "Gene", column attributes "CellID"/"SampleID").
# NOTE(review): presumably "DGE" is stored as cells x genes - confirm.
gene_bc_sparse = data["DGE"].transpose()

# Text fields are stripped of surrounding spaces before use.
gene_name = pd.Series(data['genes']).str.strip(' ').values
sample_type = pd.Series(data['sample_type']).str.strip(' ').values

# Use the builtin ``str`` here: the ``np.str`` alias was deprecated in
# NumPy 1.20 and removed in 1.24, where it raises AttributeError.
barcode_str = data["barcodes"].squeeze().astype(str)

# Cell identifiers have the form "Cell_<barcode>".
cell_id = pd.Series(np.repeat("Cell", barcode_str.size)).str.cat(barcode_str, sep='_').values

# Save the cluster assignment of every cell, indexed by cell identifier.
cell_type = pd.Series(
    pd.Series(data['cluster_assignment']).str.strip(' ').values,
    index=cell_id,
)
saveRDS(cell_type, "E:/DISC/reproducibility/data/BRAIN_SPLiT/cell_type.rds")

# Write the expression matrix together with gene and cell annotations.
output_path = "E:/DISC/reproducibility/data/BRAIN_SPLiT/raw.loom"
row_attrs = {"Gene": gene_name}
col_attrs = {"CellID": cell_id, "SampleID": sample_type}
loompy.create(output_path, gene_bc_sparse, row_attrs, col_attrs)
print(output_path)
print(gene_bc_sparse.shape)

# Reference:
#
# 1. Rosenberg, A. B. et al. Single-cell profiling of the developing mouse brain and spinal cord with split-pool barcoding. Science 360, 176-182 (2018).