#!/usr/bin/env python
# coding: utf-8

# # Data pre-processing
# 156,049 mouse nuclei from the developing brain and spinal cord of mice at
# postnatal day 2 (P2) or 11 (P11) were profiled by
# [SPLiT-seq](https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE110823),
# in which 26,894 genes were detected.

# In[1]:


# Array handling, MATLAB-file loading and loom output.
import loompy
import numpy as np
import pandas as pd
import scipy.io

# rpy2 gives access to the embedded R session used to write .rds files.
import rpy2.robjects as robjects
from rpy2.robjects import pandas2ri

# Enable rpy2's pandas conversion; a pandas Series is handed to saveRDS
# further down in this script.
pandas2ri.activate()

# Handle to R's `saveRDS` function, looked up by name in the R session.
saveRDS = robjects.r["saveRDS"]


# In[2]:


# Convert the SPLiT-seq MATLAB export into (1) an .rds vector of per-cell
# cluster assignments and (2) a loom file holding the count matrix together
# with gene names, cell IDs and sample labels.
data_path = "E:/DISC/reproducibility/data/BRAIN_SPLiT/original_data/GSM3017261_150000_CNS_nuclei.mat"
data = scipy.io.loadmat(data_path)

# Transposed so that rows pair with `gene_name` and columns with `cell_id`,
# which are attached below as the loom row/column attributes.
gene_bc_sparse = data["DGE"].transpose()

# The text fields are stripped of surrounding spaces before use.
gene_name = pd.Series(data['genes']).str.strip(' ').values
sample_type = pd.Series(data['sample_type']).str.strip(' ').values
cluster_assignment = pd.Series(data['cluster_assignment']).str.strip(' ').values

# Builtin `str` replaces the `np.str` alias, which was deprecated in
# NumPy 1.20 and removed in 1.24 (it was only ever an alias for `str`).
barcode_str = data["barcodes"].squeeze().astype(str)
# Cell IDs have the form "Cell_<barcode>".
cell_id = ("Cell_" + pd.Series(barcode_str)).values

# Save the cluster assignment of every cell, indexed by cell ID, for use in R.
cell_type_path = "E:/DISC/reproducibility/data/BRAIN_SPLiT/cell_type.rds"
cell_type = pd.Series(cluster_assignment, index=cell_id)
saveRDS(cell_type, cell_type_path)

# Write the count matrix and its annotations to a loom file.
output_path = "E:/DISC/reproducibility/data/BRAIN_SPLiT/raw.loom"
row_attrs = {"Gene": gene_name}
col_attrs = {"CellID": cell_id, "SampleID": sample_type}
loompy.create(output_path, gene_bc_sparse, row_attrs, col_attrs)
print(output_path)
print(gene_bc_sparse.shape)


# Reference: 
# 
# 1. Rosenberg, A. B. et al. Single-cell profiling of the developing mouse brain and spinal cord with split-pool barcoding. Science 360, 176-182 (2018).