Updated on Jan 2018
## Download
cat DR_cDNA.fa DR_ncRNA.fa > DR_all_RNA.fa
## Create Salmon index (k=31)
salmon index -t DR_all_RNA.fa -i ~/DR_all_RNA_Jun28_k31_salmon_index --type quasi -k 31
## FastQC
fastqc -o . -f fastq --extract SRR3465546.fastq.gz -t 8
## rCorrector
perl ~/install/Rcorrector-master/run_rcorrector.pl -t 10 -s ./SRR3465546.fastq.gz
## Filter
python ~/FilterUncorrectabledSEfastq.py -i SRR3465546.cor.fq.gz -o filtered
## fastp
fastp -i filtered_SRR3465546.cor.fq -o filtered_SRR3465546_fastp.cor.fq \
-q 5 -c -p -w 10 \
-j filtered_SRR3465546_fastp.json -h filtered_SRR3465546_fastp.html -R "filtered_SRR3465546 report"
## FastQC [27909863.bc]
fastqc -o . -f fastq --extract filtered_SRR3465546_fastp.cor.fq -t 10
## Mapping (k=31) ---> 71.3872% reads mapped
salmon quant -i ~/DR_all_RNA_Jun28_k31_salmon_index -l A \
-r filtered_SRR3465546_fastp.cor.fq \
-o ~/filtered_SRR3465546_fastp_DR_all_RNA_Jun28_k31_salmon_quant
## FastQC
fastqc -o . -f fastq --extract SRR3465547.fastq.gz -t 8
## rCorrector
perl ~/install/Rcorrector-master/run_rcorrector.pl -t 10 -s ./SRR3465547.fastq.gz
## Filter
python ~/FilterUncorrectabledSEfastq.py -i SRR3465547.cor.fq.gz -o filtered
## fastp
fastp -i filtered_SRR3465547.cor.fq -o filtered_SRR3465547_fastp.cor.fq \
-q 5 -c -p -w 10 \
-j filtered_SRR3465547_fastp.json -h filtered_SRR3465547_fastp.html -R "filtered_SRR3465547 report"
## FastQC
fastqc -o . -f fastq --extract filtered_SRR3465547_fastp.cor.fq -t 10
## Mapping (k=31) index ---> 71.0672%% reads mapped [27922630.bc]
salmon quant -i ~/DR_all_RNA_Jun28_k31_salmon_index -l A \
-r filtered_SRR3465547_fastp.cor.fq \
-o ~/filtered_SRR3465547_fastp_DR_all_RNA_Jun28_k31_salmon_quant
## FastQC
fastqc -o . -f fastq --extract SRR3465548.fastq.gz -t 8
## rCorrector
perl ~/install/Rcorrector-master/run_rcorrector.pl -t 10 -s ./SRR3465548.fastq.gz
## Filter
python ~/FilterUncorrectabledSEfastq.py -i SRR3465548.cor.fq.gz -o filtered
## fastp
fastp -i filtered_SRR3465548.cor.fq -o filtered_SRR3465548_fastp.cor.fq \
-q 5 -c -p -w 10 \
-j filtered_SRR3465548_fastp.json -h filtered_SRR3465548_fastp.html -R "filtered_SRR3465548 report"
## FastQC
fastqc -o . -f fastq --extract filtered_SRR3465548_fastp.cor.fq -t 10
## Mapping (k=31) ---> 71.4919% reads mapped
salmon quant -i ~/DR_all_RNA_Jun28_k31_salmon_index -l A \
-r filtered_SRR3465548_fastp.cor.fq \
-o ~/filtered_SRR3465548_fastp_DR_all_RNA_Jun28_k31_salmon_quant
## rCorrector [27908342.bc]
perl ~/install/Rcorrector-master/run_rcorrector.pl -t 10 -s ./SRR3465549.fastq.gz
## Filter [27908407.bc]
python ~/FilterUncorrectabledSEfastq.py -i SRR3465549.cor.fq.gz -o filtered
## fastp [27909875.bc]
fastp -i filtered_SRR3465549.cor.fq -o filtered_SRR3465549_fastp.cor.fq \
-q 5 -c -p -w 10 \
-j filtered_SRR3465549_fastp.json -h filtered_SRR3465549_fastp.html -R "filtered_SRR3465549 report"
## FastQC [27909876.bc]
fastqc -o . -f fastq --extract filtered_SRR3465549_fastp.cor.fq -t 10
## Mapping (k=31) ---> 71.5107% reads mapped [27922634.bc]
salmon quant -i ~/DR_all_RNA_Jun28_k31_salmon_index -l A \
-r filtered_SRR3465549_fastp.cor.fq \
-o ~/filtered_SRR3465549_fastp_DR_all_RNA_Jun28_k31_salmon_quant
multiqc .
## Import libraries
import pandas as pd
import os
os.chdir("/home/zhanglab1/ndong/Lymnaea_CNS_transcriptome_files/7_Interspecies_comparison/7b_Zebrafish")
## Define function
def extract_non0(salmon_output_filename, library_ID):
with open(salmon_output_filename, "r") as infile:
lib = pd.read_csv(infile, sep='\t')
lib_non0 = lib.loc[lib["TPM"]>0]
lib_non0_counts = lib_non0[["Name", "TPM"]]
return(lib_non0_counts)
## Analyze each library
lib6 = extract_non0("./filtered_SRR3465546_fastp_DR_all_RNA_Jun28_k31_salmon_quant/quant.sf", "SRR3465546")
lib7 = extract_non0("./filtered_SRR3465547_fastp_DR_all_RNA_Jun28_k31_salmon_quant/quant.sf", "SRR3465547")
lib8 = extract_non0("./filtered_SRR3465548_fastp_DR_all_RNA_Jun28_k31_salmon_quant/quant.sf", "SRR3465548")
lib9 = extract_non0("./filtered_SRR3465549_fastp_DR_all_RNA_Jun28_k31_salmon_quant/quant.sf", "SRR3465549")
%get lib6 --from Python3
%get lib7 --from Python3
head(lib6, 10)
head(lib7, 10)
Name | TPM | |
---|---|---|
3 | ENSDART00000170923.2 | 2.479800 |
4 | ENSDART00000171190.2 | 7.667080 |
5 | ENSDART00000165811.2 | 3.880930 |
7 | ENSDART00000007487.9 | 17.090300 |
8 | ENSDART00000162972.3 | 0.141226 |
9 | ENSDART00000171570.2 | 5.170290 |
10 | ENSDART00000168177.2 | 7.088380 |
12 | ENSDART00000162709.3 | 9.806990 |
14 | ENSDART00000168926.2 | 13.296900 |
15 | ENSDART00000169180.2 | 8.571490 |
Name | TPM | |
---|---|---|
3 | ENSDART00000170923.2 | 1.229110 |
4 | ENSDART00000171190.2 | 5.517730 |
5 | ENSDART00000165811.2 | 3.949240 |
7 | ENSDART00000007487.9 | 16.860100 |
8 | ENSDART00000162972.3 | 0.665929 |
9 | ENSDART00000171570.2 | 5.979240 |
10 | ENSDART00000168177.2 | 6.115120 |
12 | ENSDART00000162709.3 | 9.458940 |
14 | ENSDART00000168926.2 | 12.616400 |
15 | ENSDART00000169180.2 | 8.660670 |
def sorted_merged_table(df1, df2, df3, df4,
lib_name1, lib_name2, lib_name3, lib_name4):
non0_2 = df1.merge(df2, on='Name') # Create lookup merged table of all libraries
non0_3 = non0_2.merge(df3, on='Name')
non0_4 = non0_3.merge(df4, on='Name')
non0_4['Median'] = non0_4.median(axis=1) # Calculate median read count of each transcript across all libraries
non0_4_median_sorted = non0_4.sort_values(by="Median", ascending=False) # Sort transcripts by median read count
non0_4_median_sorted.columns = ("Name", lib_name1, lib_name2, lib_name3, lib_name4, "Median") # Rename columns
return non0_4_median_sorted
DR_sorted_merged = sorted_merged_table(lib6, lib7, lib8, lib9,
"SRR3465546", "SRR3465547", "SRR3465548", "SRR3465549")
%get DR_sorted_merged --from Python3
head(DR_sorted_merged,10)
nrow(DR_sorted_merged)
Name | SRR3465546 | SRR3465547 | SRR3465548 | SRR3465549 | Median | |
---|---|---|---|---|---|---|
34737 | ENSDART00000093611.3 | 121405.00 | 132392.00 | 132560.00 | 135275.00 | 132476.00 |
35022 | ENSDART00000117474.3 | 58401.40 | 65014.10 | 67116.20 | 63151.30 | 64082.70 |
33311 | ENSDART00000182970.1 | 19842.10 | 9807.03 | 21568.10 | 15676.90 | 17759.50 |
34736 | ENSDART00000093609.3 | 11807.40 | 13954.20 | 14022.30 | 13290.50 | 13622.35 |
34739 | ENSDART00000093613.3 | 10803.00 | 12767.50 | 12902.30 | 13264.50 | 12834.90 |
34735 | ENSDART00000093606.3 | 11273.20 | 13201.90 | 12731.30 | 12627.60 | 12679.45 |
34738 | ENSDART00000093612.3 | 9459.21 | 11129.50 | 10755.40 | 11220.60 | 10942.45 |
35138 | ENSDART00000174022.2 | 10769.90 | 14598.50 | 10601.30 | 8607.61 | 10685.60 |
35030 | ENSDART00000116823.3 | 6529.78 | 8440.01 | 7806.37 | 6778.69 | 7292.53 |
35019 | ENSDART00000116869.3 | 7975.68 | 5231.02 | 7202.87 | 6323.97 | 6763.42 |
DR_median_sorted_IDs = DR_sorted_merged["Name"]
DR_median_sorted_IDs.shape
(36362,)
%get DR_median_sorted_IDs --from Python3
head(DR_median_sorted_IDs, 10)
## Load Drosophila Biomart dataset
library(biomaRt)
DR_ensembl = useMart("ensembl", dataset="drerio_gene_ensembl")
## Create Biomart query
DR_biomart_output <- getBM(attributes = c('ensembl_transcript_id_version', 'ensembl_gene_id', 'external_gene_name',
'description', 'transcript_biotype', 'gene_biotype'),
filters = 'ensembl_transcript_id_version',
values = DR_median_sorted_IDs,
mart = DR_ensembl)
## Preview results
dim(DR_biomart_output)
head(DR_biomart_output, 10)
Batch submitting query [>------------------------------] 3% eta: 1m Batch submitting query [>------------------------------] 4% eta: 1m Batch submitting query [=>-----------------------------] 5% eta: 1m Batch submitting query [=>-----------------------------] 7% eta: 1m Batch submitting query [==>----------------------------] 8% eta: 1m Batch submitting query [==>----------------------------] 10% eta: 1m Batch submitting query [==>----------------------------] 11% eta: 1m Batch submitting query [===>---------------------------] 12% eta: 1m Batch submitting query [===>---------------------------] 14% eta: 1m Batch submitting query [====>--------------------------] 15% eta: 1m Batch submitting query [====>--------------------------] 16% eta: 1m Batch submitting query [=====>-------------------------] 18% eta: 1m Batch submitting query [=====>-------------------------] 19% eta: 1m Batch submitting query [=====>-------------------------] 21% eta: 1m Batch submitting query [======>------------------------] 22% eta: 1m Batch submitting query [======>------------------------] 23% eta: 1m Batch submitting query [=======>-----------------------] 25% eta: 1m Batch submitting query [=======>-----------------------] 26% eta: 1m Batch submitting query [=======>-----------------------] 27% eta: 1m Batch submitting query [========>----------------------] 29% eta: 1m Batch submitting query [========>----------------------] 30% eta: 1m Batch submitting query [=========>---------------------] 32% eta: 1m Batch submitting query [=========>---------------------] 33% eta: 1m Batch submitting query [==========>--------------------] 34% eta: 49s Batch submitting query [==========>--------------------] 36% eta: 50s Batch submitting query [==========>--------------------] 37% eta: 49s Batch submitting query [===========>-------------------] 38% eta: 47s Batch submitting query [===========>-------------------] 40% eta: 46s Batch submitting query [============>------------------] 41% eta: 45s Batch submitting query [============>------------------] 42% eta: 44s Batch submitting query [=============>-----------------] 44% eta: 43s Batch submitting query [=============>-----------------] 45% eta: 42s Batch submitting query [=============>-----------------] 47% eta: 41s Batch submitting query [==============>----------------] 48% eta: 40s Batch submitting query [==============>----------------] 49% eta: 38s Batch submitting query [===============>---------------] 51% eta: 37s Batch submitting query [===============>---------------] 52% eta: 36s Batch submitting query [================>--------------] 53% eta: 35s Batch submitting query [================>--------------] 55% eta: 34s Batch submitting query [================>--------------] 56% eta: 33s Batch submitting query [=================>-------------] 58% eta: 32s Batch submitting query [=================>-------------] 59% eta: 32s Batch submitting query [==================>------------] 60% eta: 31s Batch submitting query [==================>------------] 62% eta: 30s Batch submitting query [===================>-----------] 63% eta: 28s Batch submitting query [===================>-----------] 64% eta: 27s Batch submitting query [===================>-----------] 66% eta: 27s Batch submitting query [====================>----------] 67% eta: 25s Batch submitting query [====================>----------] 68% eta: 24s Batch submitting query [=====================>---------] 70% eta: 23s Batch submitting query [=====================>---------] 71% eta: 22s Batch submitting query [======================>--------] 73% eta: 21s Batch submitting query [======================>--------] 74% eta: 20s Batch submitting query [======================>--------] 75% eta: 19s Batch submitting query [=======================>-------] 77% eta: 18s Batch submitting query [=======================>-------] 78% eta: 17s Batch submitting query [========================>------] 79% eta: 16s Batch submitting query [========================>------] 81% eta: 15s Batch submitting query [========================>------] 82% eta: 13s Batch submitting query [=========================>-----] 84% eta: 12s Batch submitting query [=========================>-----] 85% eta: 11s Batch submitting query [==========================>----] 86% eta: 10s Batch submitting query [==========================>----] 88% eta: 9s Batch submitting query [===========================>---] 89% eta: 8s Batch submitting query [===========================>---] 90% eta: 7s Batch submitting query [===========================>---] 92% eta: 6s Batch submitting query [============================>--] 93% eta: 5s Batch submitting query [============================>--] 95% eta: 4s Batch submitting query [=============================>-] 96% eta: 3s Batch submitting query [=============================>-] 97% eta: 2s Batch submitting query [==============================>] 99% eta: 1s Batch submitting query [===============================] 100% eta: 0s
ensembl_transcript_id_version | ensembl_gene_id | external_gene_name | description | transcript_biotype | gene_biotype |
---|---|---|---|---|---|
ENSDART00000002691.9 | ENSDARG00000008407 | tspan7b | tetraspanin 7b [Source:ZFIN;Acc:ZDB-GENE-040927-5] | protein_coding | protein_coding |
ENSDART00000003001.8 | ENSDARG00000006316 | rpl23a | ribosomal protein L23a [Source:ZFIN;Acc:ZDB-GENE-030131-7479] | protein_coding | protein_coding |
ENSDART00000003008.8 | ENSDARG00000027419 | gad1b | glutamate decarboxylase 1b [Source:ZFIN;Acc:ZDB-GENE-030909-3] | protein_coding | protein_coding |
ENSDART00000003042.6 | ENSDARG00000020708 | mdkb | midkine b [Source:ZFIN;Acc:ZDB-GENE-010131-6] | protein_coding | protein_coding |
ENSDART00000003825.7 | ENSDARG00000018997 | cplx2l | complexin 2, like [Source:ZFIN;Acc:ZDB-GENE-040718-160] | protein_coding | protein_coding |
ENSDART00000004109.7 | ENSDARG00000009553 | gng3 | guanine nucleotide binding protein (G protein), gamma 3 [Source:ZFIN;Acc:ZDB-GENE-010705-1] | protein_coding | protein_coding |
ENSDART00000004692.9 | ENSDARG00000003795 | idh2 | isocitrate dehydrogenase 2 (NADP+), mitochondrial [Source:ZFIN;Acc:ZDB-GENE-031118-95] | protein_coding | protein_coding |
ENSDART00000005191.8 | ENSDARG00000011146 | uqcrb | ubiquinol-cytochrome c reductase binding protein [Source:ZFIN;Acc:ZDB-GENE-050522-542] | protein_coding | protein_coding |
ENSDART00000005879.8 | ENSDARG00000001788 | atp5po | ATP synthase peripheral stalk subunit OSCP [Source:ZFIN;Acc:ZDB-GENE-050522-147] | protein_coding | protein_coding |
ENSDART00000006132.8 | ENSDARG00000021124 | cfl1 | cofilin 1 [Source:ZFIN;Acc:ZDB-GENE-030131-215] | protein_coding | protein_coding |
%get DR_biomart_output --from R
## Check dataframe dimension after import
print("Dataframe dimensions:", DR_biomart_output.shape, "\n")
## Change transcript ID column header to match TPM>0 read counts table for merging
DR_biomart_output_df = DR_biomart_output.rename(columns = {'ensembl_transcript_id_version':'Name'})
Dataframe dimensions: (36361, 6)
## Merge Biomart output & read counts table sorted by median read count
DR_sorted_counts_BM = DR_sorted_merged.merge(DR_biomart_output_df, on='Name')
## Extract entries of protein-coding transcripts
DR_PC_transcripts = DR_sorted_counts_BM.loc[DR_sorted_counts_BM["transcript_biotype"].str.contains("protein_coding")]
## Check dataframe dimensions
print(DR_PC_transcripts.shape)
(32082, 11)
%get DR_PC_transcripts --from Python3
## Preview
dim(DR_PC_transcripts)
head(DR_PC_transcripts, 20)
## Extract to CSV
write.csv(head(DR_PC_transcripts, 20), file = "DR_top_20_PC_transcripts.csv")
Name | SRR3465546 | SRR3465547 | SRR3465548 | SRR3465549 | Median | ensembl_gene_id | external_gene_name | description | transcript_biotype | gene_biotype | |
---|---|---|---|---|---|---|---|---|---|---|---|
0 | ENSDART00000093611.3 | 121405.00 | 132392.00 | 132560.00 | 135275.00 | 132476.000 | ENSDARG00000063910 | mt-atp8 | ATP synthase 8, mitochondrial [Source:ZFIN;Acc:ZDB-GENE-011205-19] | protein_coding | protein_coding |
2 | ENSDART00000182970.1 | 19842.10 | 9807.03 | 21568.10 | 15676.90 | 17759.500 | ENSDARG00000111458 | wu:fi09b08 | wu:fi09b08 [Source:ZFIN;Acc:ZDB-GENE-030131-5630] | protein_coding | protein_coding |
3 | ENSDART00000093609.3 | 11807.40 | 13954.20 | 14022.30 | 13290.50 | 13622.350 | ENSDARG00000063908 | mt-co2 | cytochrome c oxidase II, mitochondrial [Source:ZFIN;Acc:ZDB-GENE-011205-15] | protein_coding | protein_coding |
4 | ENSDART00000093613.3 | 10803.00 | 12767.50 | 12902.30 | 13264.50 | 12834.900 | ENSDARG00000063912 | mt-co3 | cytochrome c oxidase III, mitochondrial [Source:ZFIN;Acc:ZDB-GENE-011205-16] | protein_coding | protein_coding |
5 | ENSDART00000093606.3 | 11273.20 | 13201.90 | 12731.30 | 12627.60 | 12679.450 | ENSDARG00000063905 | mt-co1 | cytochrome c oxidase I, mitochondrial [Source:ZFIN;Acc:ZDB-GENE-011205-14] | protein_coding | protein_coding |
6 | ENSDART00000093612.3 | 9459.21 | 11129.50 | 10755.40 | 11220.60 | 10942.450 | ENSDARG00000063911 | mt-atp6 | ATP synthase 6, mitochondrial [Source:ZFIN;Acc:ZDB-GENE-011205-18] | protein_coding | protein_coding |
10 | ENSDART00000171617.2 | 5566.77 | 8967.55 | 5215.21 | 7021.53 | 6294.150 | ENSDARG00000103498 | epd | ependymin [Source:ZFIN;Acc:ZDB-GENE-980526-111] | protein_coding | protein_coding |
11 | ENSDART00000188084.1 | 5578.48 | 6410.95 | 6359.70 | 5774.42 | 6067.060 | ENSDARG00000117167 | rpl39 | ribosomal protein L39 [Source:ZFIN;Acc:ZDB-GENE-040625-51] | protein_coding | protein_coding |
12 | ENSDART00000093617.3 | 4871.62 | 6375.34 | 5814.17 | 5918.74 | 5866.455 | ENSDARG00000063916 | mt-nd4l | NADH dehydrogenase 4L, mitochondrial [Source:ZFIN;Acc:ZDB-GENE-011205-11] | protein_coding | protein_coding |
13 | ENSDART00000181887.1 | 5265.68 | 5232.80 | 5966.44 | 5993.42 | 5616.060 | ENSDARG00000115319 | mtbl | metallothionein-B-like [Source:ZFIN;Acc:ZDB-GENE-110414-3] | protein_coding | protein_coding |
15 | ENSDART00000093625.3 | 4082.53 | 4731.02 | 4862.35 | 4979.03 | 4796.685 | ENSDARG00000063924 | mt-cyb | cytochrome b, mitochondrial [Source:ZFIN;Acc:ZDB-GENE-011205-17] | protein_coding | protein_coding |
19 | ENSDART00000182119.1 | 4289.66 | 4498.40 | 4416.94 | 4565.07 | 4457.670 | ENSDARG00000112656 | rpl36a | ribosomal protein L36A [Source:ZFIN;Acc:ZDB-GENE-020423-1] | protein_coding | protein_coding |
22 | ENSDART00000149639.2 | 4097.25 | 3771.98 | 4444.10 | 3604.46 | 3934.615 | ENSDARG00000036186 | mbpa | myelin basic protein a [Source:ZFIN;Acc:ZDB-GENE-030128-2] | protein_coding | protein_coding |
24 | ENSDART00000052556.8 | 3106.96 | 3046.13 | 3425.84 | 2607.85 | 3076.545 | ENSDARG00000036186 | mbpa | myelin basic protein a [Source:ZFIN;Acc:ZDB-GENE-030128-2] | protein_coding | protein_coding |
25 | ENSDART00000093615.3 | 2732.44 | 3048.63 | 2987.04 | 3038.94 | 3012.990 | ENSDARG00000063914 | mt-nd3 | NADH dehydrogenase 3, mitochondrial [Source:ZFIN;Acc:ZDB-GENE-011205-9] | protein_coding | protein_coding |
26 | ENSDART00000182410.1 | 2893.27 | 2790.55 | 2946.34 | 3451.55 | 2919.805 | ENSDARG00000116304 | ndufa4 | NADH:ubiquinone oxidoreductase subunit A4 [Source:ZFIN;Acc:ZDB-GENE-040426-1962] | protein_coding | protein_coding |
28 | ENSDART00000093623.3 | 2203.58 | 2532.95 | 2440.17 | 2446.22 | 2443.195 | ENSDARG00000063922 | mt-nd6 | NADH dehydrogenase 6, mitochondrial [Source:ZFIN;Acc:ZDB-GENE-011205-13] | protein_coding | protein_coding |
29 | ENSDART00000186782.1 | 2557.15 | 2037.68 | 2265.55 | 2603.08 | 2411.350 | ENSDARG00000113583 | mt2 | metallothionein 2 [Source:ZFIN;Acc:ZDB-GENE-030131-4174] | protein_coding | protein_coding |
30 | ENSDART00000191912.1 | 2323.54 | 2424.89 | 2257.54 | 2486.85 | 2374.215 | ENSDARG00000116220 | BX005436.1 | protein_coding | protein_coding | |
31 | ENSDART00000186881.1 | 2329.81 | 2438.60 | 2360.62 | 1852.06 | 2345.215 | ENSDARG00000114294 | BX511120.1 | protein_coding | protein_coding |
DR_PC_transcripts_ID <- DR_PC_transcripts$Name
## Create getBM() query for obtaining peptide sequences
DR_BM_peptide_seqs <- getSequence(id = DR_PC_transcripts_ID,
type = 'ensembl_transcript_id_version',
seqType = 'peptide',
mart = DR_ensembl)
## Export to FASTA
exportFASTA(DR_BM_peptide_seqs, file='./DR_non0_pep_Nov11.fasta')
Batch submitting query [>------------------------------] 3% eta: 5m Batch submitting query [>------------------------------] 5% eta: 6m Batch submitting query [=>-----------------------------] 6% eta: 6m Batch submitting query [=>-----------------------------] 8% eta: 6m Batch submitting query [==>----------------------------] 9% eta: 6m Batch submitting query [==>----------------------------] 11% eta: 6m Batch submitting query [===>---------------------------] 12% eta: 6m Batch submitting query [===>---------------------------] 14% eta: 5m Batch submitting query [====>--------------------------] 15% eta: 5m Batch submitting query [====>--------------------------] 17% eta: 5m Batch submitting query [=====>-------------------------] 18% eta: 5m Batch submitting query [=====>-------------------------] 20% eta: 5m Batch submitting query [======>------------------------] 22% eta: 5m Batch submitting query [======>------------------------] 23% eta: 5m Batch submitting query [=======>-----------------------] 25% eta: 5m Batch submitting query [=======>-----------------------] 26% eta: 5m Batch submitting query [========>----------------------] 28% eta: 5m Batch submitting query [========>----------------------] 29% eta: 4m Batch submitting query [=========>---------------------] 31% eta: 4m Batch submitting query [=========>---------------------] 32% eta: 4m Batch submitting query [=========>---------------------] 34% eta: 4m Batch submitting query [==========>--------------------] 35% eta: 4m Batch submitting query [==========>--------------------] 37% eta: 4m Batch submitting query [===========>-------------------] 38% eta: 4m Batch submitting query [===========>-------------------] 40% eta: 4m Batch submitting query [============>------------------] 42% eta: 4m Batch submitting query [============>------------------] 43% eta: 4m Batch submitting query [=============>-----------------] 45% eta: 4m Batch submitting query [=============>-----------------] 46% eta: 3m Batch submitting query [==============>----------------] 48% eta: 3m Batch submitting query [==============>----------------] 49% eta: 3m Batch submitting query [===============>---------------] 51% eta: 3m Batch submitting query [===============>---------------] 52% eta: 3m Batch submitting query [================>--------------] 54% eta: 3m Batch submitting query [================>--------------] 55% eta: 3m Batch submitting query [=================>-------------] 57% eta: 3m Batch submitting query [=================>-------------] 58% eta: 3m Batch submitting query [==================>------------] 60% eta: 3m Batch submitting query [==================>------------] 62% eta: 2m Batch submitting query [===================>-----------] 63% eta: 2m Batch submitting query [===================>-----------] 65% eta: 2m Batch submitting query [====================>----------] 66% eta: 2m Batch submitting query [====================>----------] 68% eta: 2m Batch submitting query [====================>----------] 69% eta: 2m Batch submitting query [=====================>---------] 71% eta: 2m Batch submitting query [=====================>---------] 72% eta: 2m Batch submitting query [======================>--------] 74% eta: 2m Batch submitting query [======================>--------] 75% eta: 2m Batch submitting query [=======================>-------] 77% eta: 1m Batch submitting query [=======================>-------] 78% eta: 1m Batch submitting query [========================>------] 80% eta: 1m Batch submitting query [========================>------] 82% eta: 1m Batch submitting query [=========================>-----] 83% eta: 1m Batch submitting query [=========================>-----] 85% eta: 1m Batch submitting query [==========================>----] 86% eta: 1m Batch submitting query [==========================>----] 88% eta: 46s Batch submitting query [===========================>---] 89% eta: 40s Batch submitting query [===========================>---] 91% eta: 34s Batch submitting query [============================>--] 92% eta: 28s Batch submitting query [============================>--] 94% eta: 23s Batch submitting query [=============================>-] 95% eta: 17s Batch submitting query [=============================>-] 97% eta: 11s Batch submitting query [==============================>] 98% eta: 6s Batch submitting query [===============================] 100% eta: 0s
##[28617541.bc]
rpsblast -query ~/Lymnaea_CNS_transcriptome_files/7_Interspecies_comparison/7b_Zebrafish/DR_non0_pep_Nov11.fasta -db Kog \
-out ~/DR_CNS_pep_Nov12_KOG.txt -evalue 1E-5 \
-outfmt "6 qseqid sseqid stitle pident length mismatch gapopen qlen qstart qend slen sstart send evalue bitscore qcovhsp qcovs" \
-max_hsps 1 -max_target_seqs 1
## Replace "[" and "]" with "#" for later import as dataframe
cp ~/DR_CNS_pep_Nov12_KOG.txt .
sed -i 's/\[/#/g' DR_CNS_pep_Nov12_KOG.txt
sed -i 's/\]./#/g' DR_CNS_pep_Nov12_KOG.txt
import pandas as pd
import os
os.chdir("/home/zhanglab1/ndong/Lymnaea_CNS_transcriptome_files/7_Interspecies_comparison/7b_Zebrafish")
DR_KOG = pd.read_csv("DR_CNS_pep_Nov12_KOG.txt", sep='#', header=None, engine="python")
type(DR_KOG)
pandas.core.frame.DataFrame
%get DR_KOG --from Python3
head(DR_KOG, 10)
0 | 1 | 2 | |
---|---|---|---|
0 | ENSDART00000014723.7 gnl|CDD|230178 KOG2239, KOG2239, KOG2239, Transcription factor containing NAC and TS-N domains | Transcription | 81.019 216 33 3 216 1 215 209 1 209 3.22e-94 272 99 99 |
1 | ENSDART00000010777.7 gnl|CDD|229436 KOG1495, KOG1495, KOG1495, Lactate dehydrogenase | Energy production and conversion | 68.769 333 103 1 335 1 333 332 1 332 0.0 553 99 99 |
2 | ENSDART00000004692.9 gnl|CDD|229467 KOG1526, KOG1526, KOG1526, NADP-dependent isocitrate dehydrogenase | Energy production and conversion | 73.039 204 52 1 226 22 225 383 180 380 1.90e-153 428 90 90 |
3 | ENSDART00000016181.11 gnl|CDD|230870 KOG2931, KOG2931, KOG2931, Differentiation-related gene 1 protein (NDR1 protein), related proteins | Function unknown | 59.218 179 60 2 363 149 320 222 38 210 3.28e-82 247 47 47 |
4 | ENSDART00000013690.9 gnl|CDD|231387 KOG3449, KOG3449, KOG3449, 60S acidic ribosomal protein P2 | Translation, ribosomal structure and biogenesis | 65.789 114 37 1 115 1 114 112 1 112 1.23e-30 103 99 99 |
5 | ENSDART00000006948.6 gnl|CDD|230248 KOG2309, KOG2309, KOG2309, 60s ribosomal protein L2/L8 | Translation, ribosomal structure and biogenesis | 82.677 254 38 4 258 1 254 248 1 248 1.69e-151 420 98 98 |
6 | ENSDART00000002595.7 gnl|CDD|229671 KOG1732, KOG1732, KOG1732, 60S ribosomal protein L21 | Translation, ribosomal structure and biogenesis | 83.125 160 27 0 161 1 160 160 1 160 2.68e-85 245 99 99 |
7 | ENSDART00000015629.9 gnl|CDD|229242 KOG1300, KOG1300, KOG1300, Vesicle trafficking protein Sec1 | Intracellular trafficking, secretion, and vesicular transport | 59.391 591 232 8 592 4 589 593 1 588 0.0 841 99 99 |
8 | ENSDART00000002691.9 gnl|CDD|231813 KOG3882, KOG3882, KOG3882, Tetraspanin family integral membrane protein | General function prediction only | 34.728 239 147 3 250 11 248 237 5 235 1.51e-60 188 95 95 |
9 | ENSDART00000006132.8 gnl|CDD|229674 KOG1735, KOG1735, KOG1735, Actin depolymerizing factor | Cytoskeleton | 49.367 158 58 5 166 1 152 146 1 142 1.08e-53 164 92 92 |
## Count the number of occurrences of each category
KOGs= ["RNA processing and modification", "Chromatin structure and dynamics", "Energy production and conversion", "Cell cycle control",
"Amino acid transport and metabolism", "Nucleotide transport and metabolism", "Carbohydrate transport and metabolism", "Coenzyme transport and metabolism",
"Lipid transport and metabolism", "Translation, ribosomal structure and biogenesis", "Transcription", "Replication, recombination and repair",
"Cell wall/membrane/envelope biogenesis", "Cell motility", "Posttranslational modification", "Inorganic ion transport and metabolism",
"Secondary metabolites", "General function prediction only", "Function unknown", "Signal transduction mechanisms", "Intracellular trafficking",
"Defense mechanisms", "Extracellular structures", "Nuclear structure", "Cytoskeleton"]
data = []
for KOG in KOGs:
print(KOG, DR_KOG[1].str.contains(KOG).sum())
data.append([KOG, DR_KOG[1].str.contains(KOG).sum()])
df = pd.DataFrame(data)
df.columns = ["KOG", "Count"]
df["DR_Percentage"] = df["Count"]/df["Count"].sum()*100
print(df)
df[["KOG", "DR_Percentage"]].to_csv("DR_KOG_summary.txt", sep="\t", index=None)
RNA processing and modification 882 Chromatin structure and dynamics 425 Energy production and conversion 567 Cell cycle control 687 Amino acid transport and metabolism 605 Nucleotide transport and metabolism 286 Carbohydrate transport and metabolism 676 Coenzyme transport and metabolism 165 Lipid transport and metabolism 731 Translation, ribosomal structure and biogenesis 691 Transcription 3095 Replication, recombination and repair 438 Cell wall/membrane/envelope biogenesis 278 Cell motility 64 Posttranslational modification 2036 Inorganic ion transport and metabolism 809 Secondary metabolites 291 General function prediction only 3365 Function unknown 1940 Signal transduction mechanisms 6221 Intracellular trafficking 1613 Defense mechanisms 228 Extracellular structures 608 Nuclear structure 172 Cytoskeleton 1429 KOG Count DR_Percentage 0 RNA processing and modification 882 3.116388 1 Chromatin structure and dynamics 425 1.501661 2 Energy production and conversion 567 2.003392 3 Cell cycle control 687 2.427390 4 Amino acid transport and metabolism 605 2.137658 5 Nucleotide transport and metabolism 286 1.010529 6 Carbohydrate transport and metabolism 676 2.388524 7 Coenzyme transport and metabolism 165 0.582998 8 Lipid transport and metabolism 731 2.582856 9 Translation, ribosomal structure and biogenesis 691 2.441524 10 Transcription 3095 10.935623 11 Replication, recombination and repair 438 1.547594 12 Cell wall/membrane/envelope biogenesis 278 0.982263 13 Cell motility 64 0.226132 14 Posttranslational modification 2036 7.193838 15 Inorganic ion transport and metabolism 809 2.858455 16 Secondary metabolites 291 1.028196 17 General function prediction only 3365 11.889619 18 Function unknown 1940 6.854639 19 Signal transduction mechanisms 6221 21.980779 20 Intracellular trafficking 1613 5.699244 21 Defense mechanisms 228 0.805597 22 Extracellular structures 608 2.148258 23 Nuclear structure 172 0.607731 24 Cytoskeleton 1429 5.049113
SRR3465546_non0_Jun30.txt
---> 44,699 IDs
SRR3465547_non0_Jun30.txt
---> 44,324 IDs
SRR3465548_non0_Jun30.txt
---> 43,915 IDs
SRR3465549_non0_Jun30.txt
---> 44,017 IDs
DR_6789_non0_Jun30.txt
---> 36,362 IDs
## Extract transcripts with TPM>0 in all 4 libraries
> grep -Fwf SRR3465546_non0_Jun30.txt SRR3465547_non0_Jun30.txt > DR_67_non0_Jun30.txt
> grep -Fwf SRR3465548_non0_Jun30.txt DR_67_non0_Jun30.txt > DR_678_non0_Jun30.txt
> grep -Fwf SRR3465549_non0_Jun30.txt DR_678_non0_Jun30.txt > DR_6789_non0_Jun30.txt
wc -l DR_6789_non0_Jun30.txt
echo ""
head -n 10 DR_6789_non0_Jun30.txt
36362 DR_6789_non0_Jun30.txt ENSDART00000093611.3 ENSDART00000117474.3 ENSDART00000174022.2 ENSDART00000093609.3 ENSDART00000093606.3 ENSDART00000093613.3 ENSDART00000093612.3 ENSDART00000182970.1 ENSDART00000171617.2 ENSDART00000116823.3
rpsblast -query ~/CNS-transcriptomes/DR/DR_CNS_pep_Jun30.fa -db Kog \
-out ~/CNS-transcriptomes/DR/DR_CNS_pep_Jun30_KOG.txt -evalue 1E-5 \
-outfmt "6 qseqid sseqid stitle pident length mismatch gapopen qlen qstart qend slen sstart send evalue bitscore qcovhsp qcovs" \
-max_hsps 1 -max_target_seqs 1
DR_CNS_pep_Jun30.fa
---> 32,394 sequences
## Download protein sequences
wget ftp://ftp.ensembl.org/pub/release-92/fasta/danio_rerio/pep/Danio_rerio.GRCz11.pep.all.fa.gz
gunzip Danio_rerio.GRCz11.pep.all.fa.gz
mv Danio_rerio.GRCz11.pep.all.fa DR_pep.fa
## Extract protein sequences of transcripts with TPM>0 in all 4 libraries [27923037.bc]
filterbyname.sh in=DR_pep.fa out=DR_CNS_pep_Jun30.fa names=DR_6789_non0_Jun30.txt include=t substring