Reference transcripts were obtained from Ensembl, which were imported from FlyBase release dmel_6.17 (FB2017_04).
## Combine coding & non-coding RNAs
cat DM_cDNA.fa DM_ncRNA.fa > DM_all_RNA.fa
## Create Salmon index (k=31)
salmon index -t DM_all_RNA.fa -i ~/DM_all_RNA_k31_Jun28_salmon_index --type quasi -k 31
## FastQC
fastqc -o . -f fastq --extract SRR3478195.fastq.gz -t 8
## rCorrector
perl ~/install/Rcorrector-master/run_rcorrector.pl -t 10 -s ./SRR3478195.fastq.gz
## Filter
python ~/FilterUncorrectabledSEfastq.py -i SRR3478195.cor.fq.gz -o filtered
## fastp
fastp -i filtered_SRR3478195.cor.fq -o filtered_SRR3478195_fastp.cor.fq \
-q 5 -c -p -w 10 -a GATCGGAAGAGCACACGTCTGAACTCCAGTCACCCGTCCCGATCTCGTAT \
-j filtered_SRR3478195_fastp.json -h filtered_SRR3478195_fastp.html \
-R "filtered_SRR3478195_fastp report"
## FastQC
fastqc -o . -f fastq --extract filtered_SRR3478195_fastp.cor.fq -t 6
## Mapping to k=31 index ---> 97.3689% reads mapped
salmon quant -i ~/DM_all_RNA_k31_Jun28_salmon_index -l A \
-r filtered_SRR3478195_fastp.cor.fq \
-o ~/filtered_SRR3478195_fastp_DM_all_RNA_k31_Jun28_salmon_quant
## FastQC
fastqc -o . -f fastq --extract SRR3478196.fastq.gz -t 8
## rCorrector
perl ~/install/Rcorrector-master/run_rcorrector.pl -t 10 -s ./SRR3478196.fastq.gz
## Filter
python ~/FilterUncorrectabledSEfastq.py -i SRR3478196.cor.fq.gz -o filtered
## fastp
fastp -i filtered_SRR3478196.cor.fq -o filtered_SRR3478196_fastp.cor.fq \
-q 5 -c -p -w 6 -A \
-j filtered_fastp_SRR3478196.json -h filtered_fastp_SRR3478196.html \
-R "filtered_fastp_SRR3478196 report"
## FastQC
fastqc -o . -f fastq --extract filtered_SRR3478196_fastp.cor.fq -t 6
## Mapping (k=31) ---> 97.1142% reads mapped
salmon quant -i ~/DM_all_RNA_k31_Jun28_salmon_index -l A \
-r filtered_SRR3478196_fastp.cor.fq \
-o ~/filtered_SRR3478196_fastp_DM_all_RNA_k31_Jun28_salmon_quant
## FastQC
fastqc -o . -f fastq --extract SRR3478197.fastq.gz -t 8
## rCorrector
perl ~/install/Rcorrector-master/run_rcorrector.pl -t 10 -s ./SRR3478197.fastq.gz
## Filter
python ~/FilterUncorrectabledSEfastq.py -i SRR3478197.cor.fq.gz -o filtered
## fastp
fastp -i filtered_SRR3478197.cor.fq -o filtered_SRR3478197_fastp.cor.fq \
-q 5 -c -p -w 6 -A \
-j filtered_fastp_SRR3478197.json -h filtered_fastp_SRR3478197.html \
-R "filtered_fastp_SRR3478197 report"
## FastQC
fastqc -o . -f fastq --extract filtered_SRR3478197_fastp.cor.fq -t 6
## Mapping (k=31) ---> 97.148% reads mapped
salmon quant -i ~/DM_all_RNA_k31_Jun28_salmon_index -l A \
-r filtered_SRR3478197_fastp.cor.fq \
-o ~/filtered_SRR3478197_fastp_DM_all_RNA_k31_Jun28_salmon_quant
## FastQC
fastqc -o . -f fastq --extract SRR3478217.fastq.gz -t 8
## rCorrector
perl ~/install/Rcorrector-master/run_rcorrector.pl -t 10 -s ./SRR3478217.fastq.gz
## Filter
python ~/FilterUncorrectabledSEfastq.py -i SRR3478217.cor.fq.gz -o filtered
## fastp
fastp -i filtered_SRR3478217.cor.fq -o filtered_SRR3478217_fastp.cor.fq \
-q 5 -c -p -w 6 \
-j filtered_fastp_SRR3478217.json -h filtered_fastp_SRR3478217.html \
-R "filtered_fastp_SRR3478217 report"
## FastQC
fastqc -o . -f fastq --extract filtered_SRR3478217_fastp.cor.fq -t 6
## Mapping (k=31) ---> 96.9379% reads mapped
salmon quant -i ~/DM_all_RNA_k31_Jun28_salmon_index -l A \
-r filtered_SRR3478217_fastp.cor.fq \
-o ~/filtered_SRR3478217_fastp_DM_all_RNA_k31_Jun28_salmon_quant
## FastQC
fastqc -o . -f fastq --extract SRR3478218.fastq.gz -t 8
## rCorrector
perl ~/install/Rcorrector-master/run_rcorrector.pl -t 10 -s ./SRR3478218.fastq.gz
## Filter
python ~/FilterUncorrectabledSEfastq.py -i SRR3478218.cor.fq.gz -o filtered
# fastp
fastp -i filtered_SRR3478218.cor.fq -o filtered_SRR3478218_fastp.cor.fq \
-q 5 -c -p -A -w 6 \
-j filtered_SRR3478218_fastp.json -h filtered_SRR3478218_fastp.html \
-R "filtered_SRR3478218_fastp report"
## FastQC
fastqc -o . -f fastq --extract filtered_SRR3478218_fastp.cor.fq -t 6
## Mapping (k=31) ---> 97.2746% reads mapped
salmon quant -i ~/DM_all_RNA_k31_Jun28_salmon_index -l A \
-r filtered_SRR3478218_fastp.cor.fq \
-o ~/filtered_SRR3478218_fastp_DM_all_RNA_k31_Jun28_salmon_quant
## FastQC
fastqc -o . -f fastq --extract SRR3478219.fastq.gz -t 8
## rCorrector
perl ~/install/Rcorrector-master/run_rcorrector.pl -t 10 -s ./SRR3478219.fastq.gz
## Filter
python ~/FilterUncorrectabledSEfastq.py -i SRR3478219.cor.fq.gz -o filtered
## fastp
fastp -i filtered_SRR3478219.cor.fq -o filtered_SRR3478219_fastp.cor.fq \
-q 5 -c -p -w 6 \
-j filtered_SRR3478219_fastp.json -h filtered_SRR3478219_fastp.html \
-R "filtered_SRR3478219_fastp report"
## FastQC
fastqc -o . -f fastq --extract filtered_SRR3478219_fastp.cor.fq -t 6
## Mapping (k=31) ---> 97.2507% reads mapped
salmon quant -i ~/DM_all_RNA_k31_Jun28_salmon_index -l A \
-r filtered_SRR3478219_fastp.cor.fq \
-o ~/filtered_SRR3478219_fastp_DM_all_RNA_k31_Jun28_salmon_quant
Named Jun29, but using data analyzed on Jun28 (k=31 mapping)
## MultiQC command to summarize all FastQC and Salmon output logs in the current folder
multiqc .
## Import libraries
import pandas as pd
import os
os.chdir("/home/zhanglab1/ndong/Lymnaea_CNS_transcriptome_files/7_Interspecies_comparison/7a_Drosophila")
## Define function
def extract_non0(salmon_output_filename, library_ID):
with open(salmon_output_filename, "r") as infile:
lib = pd.read_csv(infile, sep='\t')
lib_non0 = lib.loc[lib["TPM"]>0]
lib_non0_counts = lib_non0[["Name", "TPM"]]
print("There are", lib_non0_counts.shape[0], "transcripts with TPM>0 in the reads library", library_ID)
return(lib_non0_counts)
## Analyses
lib95 = extract_non0("./filtered_SRR3478195_fastp_DM_all_RNA_k31_Jun28_salmon_quant/quant.sf", "SRR3478195")
lib96 = extract_non0("./filtered_SRR3478196_fastp_DM_all_RNA_k31_Jun28_salmon_quant/quant.sf", "SRR3478196")
lib97 = extract_non0("./filtered_SRR3478197_fastp_DM_all_RNA_k31_Jun28_salmon_quant/quant.sf", "SRR3478197")
lib17 = extract_non0("./filtered_SRR3478217_fastp_DM_all_RNA_k31_Jun28_salmon_quant/quant.sf", "SRR3478217")
lib18 = extract_non0("./filtered_SRR3478218_fastp_DM_all_RNA_k31_Jun28_salmon_quant/quant.sf", "SRR3478218")
lib19 = extract_non0("./filtered_SRR3478219_fastp_DM_all_RNA_k31_Jun28_salmon_quant/quant.sf", "SRR3478219")
There are 22356 transcripts with TPM>0 in the reads library SRR3478195 There are 22889 transcripts with TPM>0 in the reads library SRR3478196 There are 21790 transcripts with TPM>0 in the reads library SRR3478197 There are 22697 transcripts with TPM>0 in the reads library SRR3478217 There are 19936 transcripts with TPM>0 in the reads library SRR3478218 There are 20369 transcripts with TPM>0 in the reads library SRR3478219
%get lib95 --from Python3
%get lib96 --from Python3
head(lib95, 10)
head(lib96, 10)
Name | TPM | |
---|---|---|
0 | FBtr0075502 | 0.813066 |
1 | FBtr0300738 | 0.102797 |
2 | FBtr0300739 | 24.321500 |
3 | FBtr0300737 | 5.016550 |
4 | FBtr0300736 | 0.416568 |
5 | FBtr0078628 | 0.059027 |
6 | FBtr0300740 | 0.015383 |
7 | FBtr0078627 | 2.365450 |
8 | FBtr0300741 | 0.558595 |
9 | FBtr0072762 | 18.382800 |
Name | TPM | |
---|---|---|
2 | FBtr0300739 | 20.8008000 |
4 | FBtr0300736 | 0.8957440 |
5 | FBtr0078628 | 0.0303317 |
7 | FBtr0078627 | 0.3727980 |
9 | FBtr0072762 | 16.6405000 |
10 | FBtr0346821 | 0.9219090 |
13 | FBtr0086122 | 2.2962300 |
15 | FBtr0081459 | 3.5324000 |
16 | FBtr0305321 | 4.4229800 |
17 | FBtr0083716 | 1.4543400 |
## Define function
def sorted_merged_table(df1, df2, df3, df4, df5, df6,
lib_name1, lib_name2, lib_name3, lib_name4, lib_name5, lib_name6):
non0_2 = df1.merge(df2, on='Name') # Create lookup merged table of all libraries
non0_3 = non0_2.merge(df3, on='Name')
non0_4 = non0_3.merge(df4, on='Name')
non0_5 = non0_4.merge(df5, on='Name')
non0_6 = non0_5.merge(df6, on='Name')
non0_6['Median'] = non0_6.median(axis=1) # Calculate median read count of each transcript across all libraries
non0_6_median_sorted = non0_6.sort_values(by="Median", ascending=False) # Sort transcripts by median read count
non0_6_median_sorted.columns = ("Name", lib_name1, lib_name2, lib_name3, # Rename columns
lib_name4, lib_name5, lib_name6, "Median")
return non0_6_median_sorted
## Create median-sorted lookup table
DM_sorted_merged = sorted_merged_table(lib95, lib96, lib97, lib17, lib18, lib19,
"SRR3478195", "SRR3478196", "SRR3478197", "SRR3478217", "SRR3478218", "SRR3478219")
print("Dataframe dimensions:", DM_sorted_merged.shape)
## Output non-0 transcript IDs to file
DM_sorted_merged["Name"].to_csv("./DM_567789_non0_Jun28.txt", index=None)
Dataframe dimensions: (15379, 8)
%get DM_sorted_merged --from Python3
head(DM_sorted_merged, 10)
Name | SRR3478195 | SRR3478196 | SRR3478197 | SRR3478217 | SRR3478218 | SRR3478219 | Median | |
---|---|---|---|---|---|---|---|---|
14866 | FBtr0100888 | 72407.20 | 81838.0 | 61424.00 | 109216.0 | 103100.0 | 91296.2 | 86567.10 |
15247 | FBtr0307364 | 67079.00 | 91910.5 | 71111.50 | 38351.2 | 33338.0 | 34928.0 | 52715.10 |
7880 | FBtr0100868 | 23731.20 | 30708.1 | 26425.30 | 32645.9 | 30841.3 | 28208.7 | 29458.40 |
8570 | FBtr0100861 | 26776.40 | 33453.8 | 28996.10 | 35745.4 | 29645.1 | 27438.0 | 29320.60 |
12070 | FBtr0082158 | 16639.90 | 24308.1 | 20152.30 | 14051.7 | 18095.5 | 18878.5 | 18487.00 |
6743 | FBtr0100863 | 14372.10 | 18408.6 | 14199.80 | 21887.3 | 18524.1 | 16917.7 | 17663.15 |
14915 | FBtr0346885 | 7351.36 | 20358.5 | 8762.40 | 10896.8 | 18878.2 | 23953.6 | 14887.50 |
6257 | FBtr0433502 | 11658.80 | 14917.7 | 14051.10 | 15637.7 | 14516.7 | 13962.5 | 14283.90 |
14800 | FBtr0346903 | 7757.37 | 16773.8 | 7570.16 | 10930.5 | 21163.4 | 22669.6 | 13852.15 |
11967 | FBtr0072185 | 12502.60 | 15227.6 | 12987.80 | 14468.9 | 12239.2 | 11509.1 | 12745.20 |
DM_median_sorted_IDs = DM_sorted_merged["Name"]
library(biomaRt)
DM_ensembl = useMart("ensembl", dataset="dmelanogaster_gene_ensembl")
%get DM_median_sorted_IDs --from Python3
## Create getBM() query for converting transcript IDs to gene IDs
DM_biomart_output <- getBM(attributes = c('flybase_transcript_id', 'ensembl_gene_id', 'external_gene_name', 'description',
'transcript_biotype', 'gene_biotype'),
filters = 'flybase_transcript_id',
values = DM_median_sorted_IDs,
mart = DM_ensembl)
## Preview results
head(DM_biomart_output, 20)
dim(DM_biomart_output)
Batch submitting query [=>-----------------------------] 6% eta: 14s Batch submitting query [==>----------------------------] 10% eta: 18s Batch submitting query [===>---------------------------] 13% eta: 23s Batch submitting query [====>--------------------------] 16% eta: 24s Batch submitting query [=====>-------------------------] 19% eta: 26s Batch submitting query [======>------------------------] 23% eta: 25s Batch submitting query [=======>-----------------------] 26% eta: 24s Batch submitting query [========>----------------------] 29% eta: 25s Batch submitting query [=========>---------------------] 32% eta: 23s Batch submitting query [==========>--------------------] 35% eta: 22s Batch submitting query [===========>-------------------] 39% eta: 21s Batch submitting query [============>------------------] 42% eta: 20s Batch submitting query [=============>-----------------] 45% eta: 20s Batch submitting query [==============>----------------] 48% eta: 19s Batch submitting query [===============>---------------] 52% eta: 18s Batch submitting query [================>--------------] 55% eta: 16s Batch submitting query [=================>-------------] 58% eta: 15s Batch submitting query [==================>------------] 61% eta: 14s Batch submitting query [===================>-----------] 65% eta: 13s Batch submitting query [====================>----------] 68% eta: 12s Batch submitting query [=====================>---------] 71% eta: 10s Batch submitting query [======================>--------] 74% eta: 9s Batch submitting query [=======================>-------] 77% eta: 8s Batch submitting query [========================>------] 81% eta: 7s Batch submitting query [=========================>-----] 84% eta: 6s Batch submitting query [==========================>----] 87% eta: 5s Batch submitting query [===========================>---] 90% eta: 4s Batch submitting query [============================>--] 94% eta: 2s Batch submitting query [=============================>-] 97% eta: 1s Batch submitting query [===============================] 100% eta: 0s
flybase_transcript_id | ensembl_gene_id | external_gene_name | description | transcript_biotype | gene_biotype |
---|---|---|---|---|---|
FBtr0070041 | FBgn0052230 | ND-MLRQ | NADH dehydrogenase (ubiquinone) MLRQ subunit [Source:FlyBase;Acc:FBgn0052230] | protein_coding | protein_coding |
FBtr0070135 | FBgn0040382 | CG5273 | protein_coding | protein_coding | |
FBtr0070148 | FBgn0015288 | RpL22 | Ribosomal protein L22 [Source:FlyBase;Acc:FBgn0015288] | protein_coding | protein_coding |
FBtr0070159 | FBgn0026879 | CG13364 | protein_coding | protein_coding | |
FBtr0070364 | FBgn0026088 | CG14818 | protein_coding | protein_coding | |
FBtr0070386 | FBgn0025839 | ND-B14.5A | NADH dehydrogenase (ubiquinone) B14.5 A subunit [Source:FlyBase;Acc:FBgn0025839] | protein_coding | protein_coding |
FBtr0070611 | FBgn0285910 | VhaAC39-1 | Vacuolar H[+] ATPase AC39 subunit 1 [Source:FlyBase;Acc:FBgn0285910] | protein_coding | protein_coding |
FBtr0070655 | FBgn0040907 | mRpL33 | mitochondrial ribosomal protein L33 [Source:FlyBase;Acc:FBgn0040907] | protein_coding | protein_coding |
FBtr0070800 | FBgn0029785 | RpL35 | Ribosomal protein L35 [Source:FlyBase;Acc:FBgn0029785] | protein_coding | protein_coding |
FBtr0070801 | FBgn0029785 | RpL35 | Ribosomal protein L35 [Source:FlyBase;Acc:FBgn0029785] | protein_coding | protein_coding |
FBtr0070814 | FBgn0029810 | CG12239 | protein_coding | protein_coding | |
FBtr0070907 | FBgn0261955 | kdn | knockdown [Source:FlyBase;Acc:FBgn0261955] | protein_coding | protein_coding |
FBtr0070933 | FBgn0086558 | Ubi-p5E | Ubiquitin-5E [Source:FlyBase;Acc:FBgn0086558] | protein_coding | protein_coding |
FBtr0071094 | FBgn0004403 | RpS14a | Ribosomal protein S14a [Source:FlyBase;Acc:FBgn0004403] | protein_coding | protein_coding |
FBtr0071123 | FBgn0029990 | CG2233 | protein_coding | protein_coding | |
FBtr0071135 | FBgn0261592 | RpS6 | Ribosomal protein S6 [Source:FlyBase;Acc:FBgn0261592] | protein_coding | protein_coding |
FBtr0071343 | FBgn0040931 | CG9034 | protein_coding | protein_coding | |
FBtr0071360 | FBgn0030136 | RpS28b | Ribosomal protein S28b [Source:FlyBase;Acc:FBgn0030136] | protein_coding | protein_coding |
FBtr0071389 | FBgn0030158 | CG9686 | protein_coding | protein_coding | |
FBtr0071393 | FBgn0026415 | Idgf4 | Imaginal disc growth factor 4 [Source:FlyBase;Acc:FBgn0026415] | protein_coding | protein_coding |
%get DM_biomart_output --from R
## Check dataframe dimension after import
print("Dataframe dimensions:", DM_biomart_output.shape, "\n")
## Change transcript ID column header to match TPM>0 read counts table for merging
DM_biomart_output_df = DM_biomart_output.rename(columns = {'flybase_transcript_id':'Name'})
Dataframe dimensions: (15379, 6)
## Merge Biomart output & read counts table sorted by median read count
DM_sorted_counts_BM = DM_sorted_merged.merge(DM_biomart_output_df, on='Name')
## Extract entries of protein-coding transcripts
DM_PC_transcripts = DM_sorted_counts_BM.loc[DM_sorted_counts_BM["transcript_biotype"].str.contains("protein_coding")]
## Check dataframe dimensions
print(DM_PC_transcripts.shape)
(14789, 13)
%get DM_sorted_counts_BM --from Python3
head(DM_sorted_counts_BM, 20)
Name | SRR3478195 | SRR3478196 | SRR3478197 | SRR3478217 | SRR3478218 | SRR3478219 | Median | ensembl_gene_id | external_gene_name | description | transcript_biotype | gene_biotype | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | FBtr0100888 | 7.24072e+04 | 8.18380e+04 | 6.14240e+04 | 109216.00 | 103100.00 | 91296.20 | 86567.100 | FBgn0013686 | mt:lrRNA | mitochondrial large ribosomal RNA [Source:FlyBase;Acc:FBgn0013686] | rRNA | rRNA |
1 | FBtr0307364 | 6.70790e+04 | 9.19105e+04 | 7.11115e+04 | 38351.20 | 33338.00 | 34928.00 | 52715.100 | FBgn0058469 | CR40469 | ncRNA | ncRNA | |
2 | FBtr0100868 | 2.37312e+04 | 3.07081e+04 | 2.64253e+04 | 32645.90 | 30841.30 | 28208.70 | 29458.400 | FBgn0013676 | mt:CoIII | mitochondrial Cytochrome c oxidase subunit III [Source:FlyBase;Acc:FBgn0013676] | protein_coding | protein_coding |
3 | FBtr0100861 | 2.67764e+04 | 3.34538e+04 | 2.89961e+04 | 35745.40 | 29645.10 | 27438.00 | 29320.600 | FBgn0013674 | mt:CoI | mitochondrial Cytochrome c oxidase subunit I [Source:FlyBase;Acc:FBgn0013674] | protein_coding | protein_coding |
4 | FBtr0082158 | 1.66399e+04 | 2.43081e+04 | 2.01523e+04 | 14051.70 | 18095.50 | 18878.50 | 18487.000 | FBgn0002868 | MtnA | Metallothionein A [Source:FlyBase;Acc:FBgn0002868] | protein_coding | protein_coding |
5 | FBtr0100863 | 1.43721e+04 | 1.84086e+04 | 1.41998e+04 | 21887.30 | 18524.10 | 16917.70 | 17663.150 | FBgn0013675 | mt:CoII | mitochondrial Cytochrome c oxidase subunit II [Source:FlyBase;Acc:FBgn0013675] | protein_coding | protein_coding |
6 | FBtr0346885 | 7.35136e+03 | 2.03585e+04 | 8.76240e+03 | 10896.80 | 18878.20 | 23953.60 | 14887.500 | FBgn0267504 | 28SrRNA:CR45844 | 28S ribosomal RNA:CR45844 [Source:FlyBase;Acc:FBgn0267504] | rRNA | rRNA |
7 | FBtr0433502 | 1.16588e+04 | 1.49177e+04 | 1.40511e+04 | 15637.70 | 14516.70 | 13962.50 | 14283.900 | FBgn0013678 | mt:Cyt-b | mitochondrial Cytochrome b [Source:FlyBase;Acc:FBgn0013678] | protein_coding | protein_coding |
8 | FBtr0346903 | 7.75737e+03 | 1.67738e+04 | 7.57016e+03 | 10930.50 | 21163.40 | 22669.60 | 13852.150 | FBgn0267520 | 28SrRNA-Psi:CR45860 | 28S ribosomal RNA pseudogene:CR45860 [Source:FlyBase;Acc:FBgn0267520] | pseudogene | pseudogene |
9 | FBtr0072185 | 1.25026e+04 | 1.52276e+04 | 1.29878e+04 | 14468.90 | 12239.20 | 11509.10 | 12745.200 | FBgn0023170 | RpL39 | Ribosomal protein L39 [Source:FlyBase;Acc:FBgn0023170] | protein_coding | protein_coding |
10 | FBtr0100231 | 1.22792e+04 | 1.64489e+04 | 1.14223e+04 | 14541.60 | 12367.90 | 11503.70 | 12323.550 | FBgn0066084 | RpL41 | Ribosomal protein L41 [Source:FlyBase;Acc:FBgn0066084] | protein_coding | protein_coding |
11 | FBtr0433498 | 1.02397e+04 | 1.27918e+04 | 1.13638e+04 | 15365.80 | 12053.50 | 11041.70 | 11708.650 | FBgn0013672 | mt:ATPase6 | mitochondrial ATPase subunit 6 [Source:FlyBase;Acc:FBgn0013672] | protein_coding | protein_coding |
12 | FBtr0305669 | 1.14361e+04 | 1.18531e+04 | 1.12036e+04 | 12985.30 | 10985.70 | 9710.47 | 11319.850 | FBgn0016726 | RpL29 | Ribosomal protein L29 [Source:FlyBase;Acc:FBgn0016726] | protein_coding | protein_coding |
13 | FBtr0346874 | 5.15395e-02 | 1.99499e-07 | 1.65381e-04 | 19199.50 | 28094.00 | 33610.30 | 9599.776 | FBgn0085802 | 18SrRNA:CR41548 | 18S ribosomal RNA:CR41548 [Source:FlyBase;Acc:FBgn0085802] | rRNA | rRNA |
14 | FBtr0346878 | 1.72844e+04 | 2.39737e+04 | 2.04463e+04 | 703.08 | 1571.66 | 1590.72 | 9437.560 | FBgn0267498 | 18SrRNA:CR45838 | 18S ribosomal RNA:CR45838 [Source:FlyBase;Acc:FBgn0267498] | rRNA | rRNA |
15 | FBtr0088816 | 1.08732e+04 | 7.07863e+03 | 1.20603e+04 | 10118.20 | 8431.46 | 7656.14 | 9274.830 | FBgn0033268 | Obp44a | Odorant-binding protein 44a [Source:FlyBase;Acc:FBgn0033268] | protein_coding | protein_coding |
16 | FBtr0346872 | 8.09745e+03 | 7.66093e+03 | 8.50554e+03 | 7512.93 | 12967.60 | 12817.40 | 8301.495 | FBgn0267511 | 28SrRNA-Psi:CR45851 | 28S ribosomal RNA pseudogene:CR45851 [Source:FlyBase;Acc:FBgn0267511] | pseudogene | pseudogene |
17 | FBtr0346876 | 3.77795e+03 | 7.52062e+03 | 4.48155e+03 | 4126.89 | 7462.42 | 8302.33 | 5971.985 | FBgn0267497 | 28SrRNA:CR45837 | 28S ribosomal RNA:CR45837 [Source:FlyBase;Acc:FBgn0267497] | rRNA | rRNA |
18 | FBtr0081920 | 5.27758e+03 | 6.44507e+03 | 5.53297e+03 | 4731.72 | 5239.29 | 4888.94 | 5258.435 | FBgn0040532 | CG8369 | protein_coding | protein_coding | |
19 | FBtr0345321 | 4.94450e+03 | 5.63542e+03 | 4.77491e+03 | 5218.73 | 4927.06 | 4362.67 | 4935.780 | FBgn0002579 | RpL36 | Ribosomal protein L36 [Source:FlyBase;Acc:FBgn0002579] | protein_coding | protein_coding |
%get DM_PC_transcripts --from Python3
## Preview the dataframe
head(DM_PC_transcripts, 20)
## Write to table
write.csv(head(DM_PC_transcripts, 20), file = "DM_top20_PC_transcripts.csv")
Name | SRR3478195 | SRR3478196 | SRR3478197 | SRR3478217 | SRR3478218 | SRR3478219 | Median | ensembl_gene_id | external_gene_name | description | transcript_biotype | gene_biotype | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|
2 | FBtr0100868 | 23731.20 | 30708.10 | 26425.30 | 32645.90 | 30841.30 | 28208.70 | 29458.400 | FBgn0013676 | mt:CoIII | mitochondrial Cytochrome c oxidase subunit III [Source:FlyBase;Acc:FBgn0013676] | protein_coding | protein_coding |
3 | FBtr0100861 | 26776.40 | 33453.80 | 28996.10 | 35745.40 | 29645.10 | 27438.00 | 29320.600 | FBgn0013674 | mt:CoI | mitochondrial Cytochrome c oxidase subunit I [Source:FlyBase;Acc:FBgn0013674] | protein_coding | protein_coding |
4 | FBtr0082158 | 16639.90 | 24308.10 | 20152.30 | 14051.70 | 18095.50 | 18878.50 | 18487.000 | FBgn0002868 | MtnA | Metallothionein A [Source:FlyBase;Acc:FBgn0002868] | protein_coding | protein_coding |
5 | FBtr0100863 | 14372.10 | 18408.60 | 14199.80 | 21887.30 | 18524.10 | 16917.70 | 17663.150 | FBgn0013675 | mt:CoII | mitochondrial Cytochrome c oxidase subunit II [Source:FlyBase;Acc:FBgn0013675] | protein_coding | protein_coding |
7 | FBtr0433502 | 11658.80 | 14917.70 | 14051.10 | 15637.70 | 14516.70 | 13962.50 | 14283.900 | FBgn0013678 | mt:Cyt-b | mitochondrial Cytochrome b [Source:FlyBase;Acc:FBgn0013678] | protein_coding | protein_coding |
9 | FBtr0072185 | 12502.60 | 15227.60 | 12987.80 | 14468.90 | 12239.20 | 11509.10 | 12745.200 | FBgn0023170 | RpL39 | Ribosomal protein L39 [Source:FlyBase;Acc:FBgn0023170] | protein_coding | protein_coding |
10 | FBtr0100231 | 12279.20 | 16448.90 | 11422.30 | 14541.60 | 12367.90 | 11503.70 | 12323.550 | FBgn0066084 | RpL41 | Ribosomal protein L41 [Source:FlyBase;Acc:FBgn0066084] | protein_coding | protein_coding |
11 | FBtr0433498 | 10239.70 | 12791.80 | 11363.80 | 15365.80 | 12053.50 | 11041.70 | 11708.650 | FBgn0013672 | mt:ATPase6 | mitochondrial ATPase subunit 6 [Source:FlyBase;Acc:FBgn0013672] | protein_coding | protein_coding |
12 | FBtr0305669 | 11436.10 | 11853.10 | 11203.60 | 12985.30 | 10985.70 | 9710.47 | 11319.850 | FBgn0016726 | RpL29 | Ribosomal protein L29 [Source:FlyBase;Acc:FBgn0016726] | protein_coding | protein_coding |
15 | FBtr0088816 | 10873.20 | 7078.63 | 12060.30 | 10118.20 | 8431.46 | 7656.14 | 9274.830 | FBgn0033268 | Obp44a | Odorant-binding protein 44a [Source:FlyBase;Acc:FBgn0033268] | protein_coding | protein_coding |
18 | FBtr0081920 | 5277.58 | 6445.07 | 5532.97 | 4731.72 | 5239.29 | 4888.94 | 5258.435 | FBgn0040532 | CG8369 | protein_coding | protein_coding | |
19 | FBtr0345321 | 4944.50 | 5635.42 | 4774.91 | 5218.73 | 4927.06 | 4362.67 | 4935.780 | FBgn0002579 | RpL36 | Ribosomal protein L36 [Source:FlyBase;Acc:FBgn0002579] | protein_coding | protein_coding |
20 | FBtr0100870 | 4460.08 | 5932.88 | 4120.85 | 5482.12 | 5074.41 | 4508.94 | 4791.675 | FBgn0013681 | mt:ND3 | mitochondrial NADH-ubiquinone oxidoreductase chain 3 [Source:FlyBase;Acc:FBgn0013681] | protein_coding | protein_coding |
21 | FBtr0083970 | 4418.53 | 3542.62 | 4442.73 | 4719.67 | 3858.70 | 3558.81 | 4138.615 | FBgn0038834 | RpS30 | Ribosomal protein S30 [Source:FlyBase;Acc:FBgn0038834] | protein_coding | protein_coding |
22 | FBtr0111120 | 3576.76 | 5052.31 | 4145.02 | 4539.37 | 3643.21 | 3369.58 | 3894.115 | FBgn0040007 | RpL38 | Ribosomal protein L38 [Source:FlyBase;Acc:FBgn0040007] | protein_coding | protein_coding |
23 | FBtr0433499 | 3004.90 | 4096.07 | 4034.34 | 3941.94 | 3679.44 | 3526.38 | 3810.690 | FBgn0013679 | mt:ND1 | mitochondrial NADH-ubiquinone oxidoreductase chain 1 [Source:FlyBase;Acc:FBgn0013679] | protein_coding | protein_coding |
27 | FBtr0071897 | 3474.22 | 4066.15 | 3227.59 | 3872.37 | 3135.34 | 2873.13 | 3350.905 | FBgn0010078 | RpL23 | Ribosomal protein L23 [Source:FlyBase;Acc:FBgn0010078] | protein_coding | protein_coding |
29 | FBtr0111132 | 3204.97 | 2952.79 | 2982.18 | 3438.57 | 3133.56 | 3050.26 | 3091.910 | FBgn0064225 | RpL5 | Ribosomal protein L5 [Source:FlyBase;Acc:FBgn0064225] | protein_coding | protein_coding |
30 | FBtr0082136 | 3226.80 | 3457.42 | 2830.81 | 3281.44 | 2926.15 | 2783.21 | 3076.475 | FBgn0261599 | RpS29 | Ribosomal protein S29 [Source:FlyBase;Acc:FBgn0261599] | protein_coding | protein_coding |
34 | FBtr0085961 | 2927.31 | 3363.11 | 2942.27 | 3084.86 | 2589.26 | 2358.32 | 2934.790 | FBgn0032987 | RpL21 | Ribosomal protein L21 [Source:FlyBase;Acc:FBgn0032987] | protein_coding | protein_coding |
DM_PC_transcripts_ID <- DM_PC_transcripts$Name
## Create getBM() query for obtaining peptide sequences
DM_BM_peptide_seqs <- getSequence(id = DM_PC_transcripts_ID,
type = 'flybase_transcript_id',
seqType = 'peptide',
mart = DM_ensembl)
## Export to FASTA
exportFASTA(DM_BM_peptide_seqs, file='./DM_non0_pep_Nov11.fasta')
Batch submitting query [=>-----------------------------] 7% eta: 35s Batch submitting query [==>----------------------------] 10% eta: 49s Batch submitting query [===>---------------------------] 13% eta: 1m Batch submitting query [====>--------------------------] 17% eta: 1m Batch submitting query [=====>-------------------------] 20% eta: 1m Batch submitting query [======>------------------------] 23% eta: 1m Batch submitting query [=======>-----------------------] 27% eta: 1m Batch submitting query [========>----------------------] 30% eta: 1m Batch submitting query [=========>---------------------] 33% eta: 1m Batch submitting query [==========>--------------------] 37% eta: 1m Batch submitting query [===========>-------------------] 40% eta: 1m Batch submitting query [============>------------------] 43% eta: 49s Batch submitting query [=============>-----------------] 47% eta: 47s Batch submitting query [===============>---------------] 50% eta: 45s Batch submitting query [================>--------------] 53% eta: 42s Batch submitting query [=================>-------------] 57% eta: 40s Batch submitting query [==================>------------] 60% eta: 37s Batch submitting query [===================>-----------] 63% eta: 34s Batch submitting query [====================>----------] 67% eta: 31s Batch submitting query [=====================>---------] 70% eta: 29s Batch submitting query [======================>--------] 73% eta: 26s Batch submitting query [=======================>-------] 77% eta: 23s Batch submitting query [========================>------] 80% eta: 20s Batch submitting query [=========================>-----] 83% eta: 16s Batch submitting query [==========================>----] 87% eta: 13s Batch submitting query [===========================>---] 90% eta: 10s Batch submitting query [============================>--] 93% eta: 7s Batch submitting query [=============================>-] 97% eta: 3s Batch submitting query [===============================] 100% eta: 0s
##[28617542.bc]
rpsblast -query ~/Lymnaea_CNS_transcriptome_files/7_Interspecies_comparison/7a_Drosophila/DM_non0_pep_Nov11.fasta -db Kog \
-out ~/Lymnaea_CNS_transcriptome_files/7_Interspecies_comparison/7a_Drosophila/DM_CNS_pep_Nov11_KOG.txt -evalue 1E-5 \
-outfmt "6 qseqid sseqid stitle pident length mismatch gapopen qlen qstart qend slen sstart send evalue bitscore qcovhsp qcovs" \
-max_hsps 1 -max_target_seqs 1
## Replace "[" and "]" with "#" for later import as dataframe
sed -i 's/\[/#/g' DM_CNS_pep_Nov11_KOG.txt
sed -i 's/\]./#/g' DM_CNS_pep_Nov11_KOG.txt
import pandas as pd
import os
os.chdir("/home/zhanglab1/ndong/Lymnaea_CNS_transcriptome_files/7_Interspecies_comparison/7a_Drosophila")
DM_KOG = pd.read_csv("DM_CNS_pep_Nov11_KOG.txt", sep='#', header=None, engine="python")
type(DM_KOG)
pandas.core.frame.DataFrame
%get DM_KOG --from Python3
head(DM_KOG, 10)
0 | 1 | 2 | |
---|---|---|---|
0 | FBtr0070135 gnl|CDD|229923 KOG1984, KOG1984, KOG1984, Vesicle coat complex COPII, subunit SFB3 | Intracellular trafficking, secretion, and vesicular transport | 24.057 212 146 5 469 79 279 1007 19 226 1.30e-08 54.8 43 43 |
1 | FBtr0070611 gnl|CDD|230896 KOG2957, KOG2957, KOG2957, Vacuolar H+-ATPase V0 sector, subunit d | Energy production and conversion | 78.000 350 76 1 350 1 350 350 2 350 0.0 642 100 100 |
2 | FBtr0070159 gnl|CDD|232691 KOG4766, KOG4766, KOG4766, Uncharacterized conserved protein | Function unknown | 75.926 54 13 0 64 1 54 64 1 54 9.64e-07 39.1 84 84 |
3 | FBtr0071498 gnl|CDD|229595 KOG1654, KOG1654, KOG1654, Microtubule-associated anchor protein involved in autophagy and membrane trafficking | Cytoskeleton | 66.379 116 39 0 121 1 116 116 1 116 4.27e-72 208 96 96 |
4 | FBtr0072188 gnl|CDD|229674 KOG1735, KOG1735, KOG1735, Actin depolymerizing factor | Cytoskeleton | 46.309 149 76 2 148 1 148 146 1 146 2.46e-63 188 100 100 |
5 | FBtr0070148 gnl|CDD|231372 KOG3434, KOG3434, KOG3434, 60S ribosomal protein L22 | Translation, ribosomal structure and biogenesis | 66.400 125 41 1 299 173 297 125 1 124 4.20e-44 144 42 42 |
6 | FBtr0071094 gnl|CDD|228356 KOG0407, KOG0407, KOG0407, 40S ribosomal protein S14 | Translation, ribosomal structure and biogenesis | 89.062 128 14 0 151 13 140 139 1 128 3.71e-82 235 85 85 |
7 | FBtr0071444 gnl|CDD|232489 KOG4561, KOG4561, KOG4561, Uncharacterized conserved protein, contains TBC domain | Signal transduction mechanisms, General function prediction only | 29.240 342 178 6 429 73 411 281 1 281 3.73e-91 275 79 79 |
8 | FBtr0070800 gnl|CDD|231374 KOG3436, KOG3436, KOG3436, 60S ribosomal protein L35 | Translation, ribosomal structure and biogenesis | 61.789 123 47 0 123 1 123 123 1 123 1.59e-29 101 100 100 |
9 | FBtr0072187 gnl|CDD|229526 KOG1585, KOG1585, KOG1585, Protein required for fusion of vesicles in vesicular transport, gamma-SNAP | Intracellular trafficking, secretion, and vesicular transport | 38.558 319 168 4 302 1 302 308 1 308 1.91e-101 297 100 100 |
## Count the number of occurrences of each category
KOGs= ["RNA processing and modification", "Chromatin structure and dynamics", "Energy production and conversion", "Cell cycle control",
"Amino acid transport and metabolism", "Nucleotide transport and metabolism", "Carbohydrate transport and metabolism", "Coenzyme transport and metabolism",
"Lipid transport and metabolism", "Translation, ribosomal structure and biogenesis", "Transcription", "Replication, recombination and repair",
"Cell wall/membrane/envelope biogenesis", "Cell motility", "Posttranslational modification", "Inorganic ion transport and metabolism",
"Secondary metabolites", "General function prediction only", "Function unknown", "Signal transduction mechanisms", "Intracellular trafficking",
"Defense mechanisms", "Extracellular structures", "Nuclear structure", "Cytoskeleton"]
data = []
for KOG in KOGs:
print(KOG, DM_KOG[1].str.contains(KOG).sum())
data.append([KOG, DM_KOG[1].str.contains(KOG).sum()])
df = pd.DataFrame(data)
df.columns = ["KOG", "Count"]
df["DM_Percentage"] = df["Count"]/df["Count"].sum()*100
print(df)
df[["KOG", "DM_Percentage"]].to_csv("DM_KOG_summary.txt", sep="\t", index=None)
RNA processing and modification 518 Chromatin structure and dynamics 229 Energy production and conversion 448 Cell cycle control 292 Amino acid transport and metabolism 435 Nucleotide transport and metabolism 148 Carbohydrate transport and metabolism 416 Coenzyme transport and metabolism 113 Lipid transport and metabolism 467 Translation, ribosomal structure and biogenesis 529 Transcription 1085 Replication, recombination and repair 231 Cell wall/membrane/envelope biogenesis 143 Cell motility 24 Posttranslational modification 1037 Inorganic ion transport and metabolism 384 Secondary metabolites 195 General function prediction only 1742 Function unknown 973 Signal transduction mechanisms 2239 Intracellular trafficking 702 Defense mechanisms 90 Extracellular structures 163 Nuclear structure 52 Cytoskeleton 532 KOG Count DM_Percentage 0 RNA processing and modification 518 3.928111 1 Chromatin structure and dynamics 229 1.736559 2 Energy production and conversion 448 3.397285 3 Cell cycle control 292 2.214302 4 Amino acid transport and metabolism 435 3.298703 5 Nucleotide transport and metabolism 148 1.122317 6 Carbohydrate transport and metabolism 416 3.154622 7 Coenzyme transport and metabolism 113 0.856905 8 Lipid transport and metabolism 467 3.541366 9 Translation, ribosomal structure and biogenesis 529 4.011527 10 Transcription 1085 8.227800 11 Replication, recombination and repair 231 1.751725 12 Cell wall/membrane/envelope biogenesis 143 1.084401 13 Cell motility 24 0.181997 14 Posttranslational modification 1037 7.863805 15 Inorganic ion transport and metabolism 384 2.911959 16 Secondary metabolites 195 1.478729 17 General function prediction only 1742 13.209980 18 Function unknown 973 7.378479 19 Signal transduction mechanisms 2239 16.978843 20 Intracellular trafficking 702 5.323425 21 Defense mechanisms 90 0.682490 22 Extracellular structures 163 1.236066 23 Nuclear structure 52 0.394328 24 Cytoskeleton 532 4.034276
DM_CNS_pep_Jun28.fa
---> 14,789 sequencesfilterbyname.sh \
in=DM_pep.fa \
out=DM_CNS_pep_Jun28.fa \
names=DM_567789_non0_Jun28.txt \
include=t substring
rpsblast -query ~/CNS-transcriptomes/DM/DM_CNS_pep_Jun28.fa -db Kog \
-out ~/CNS-transcriptomes/DM/DM_CNS_pep_Jun28_KOG.txt -evalue 1E-5 \
-outfmt "6 qseqid sseqid stitle pident length mismatch gapopen qlen qstart qend slen sstart send evalue bitscore qcovhsp qcovs" \
-max_hsps 1 -max_target_seqs 1