Reference transcripts (JGI v9.1).
## Download reference transcripts from JGI v9.1 release
wget ftp://ftp.xenbase.org/pub/Genomics/Sequences/xtropMRNA.fasta
wget ftp://ftp.xenbase.org/pub/Genomics/Sequences/xtropProtein.fasta
## Create k=31 Salmon index (salmon v0.11.4)
./salmon index -t ~/Lymnaea_CNS_transcriptome_files/7_Interspecies_comparison/7d_Xenopus/xtropMRNA.fasta \
-i ~/XT_9.1_transcripts_Nov12_k31_salmon_index --type quasi -k 31
## FastQC
fastqc -o . -f fastq --extract SRR5412261.fastq.gz SRR5412262.fastq.gz SRR5412263.fastq.gz SRR5412264.fastq.gz -t 10
## rCorrector
perl ~/install/Rcorrector-master/run_rcorrector.pl -t 10 -s ./SRR5412261.fastq.gz
## Filter
python ~/FilterUncorrectabledSEfastq.py -i SRR5412261.cor.fq.gz -o filtered
## fastp
fastp -i filtered_SRR5412261.cor.fq -o filtered_SRR5412261_fastp.cor.fq -q 5 -c -p -w 10 \
-h filtered_SRR5412261_fastp.html -j filtered_SRR5412261_fastp.json -R "filtered_SRR5412261_noribo report"
## FastQC
fastqc -o . -f fastq --extract filtered_SRR5412261_fastp.cor.fq -t 10
## Mapping --> 64.7378% reads mapped
~/install/salmon-latest_linux_x86_64/bin/salmon quant -i ~/XT_9.1_transcripts_Nov12_k31_salmon_index -l A \
-r ./filtered_SRR5412261_fastp.cor.fq \
-o ./filtered_SRR5412261_fastp_XT_9.1_transcripts_Nov12_salmon_quant --validateMappings
## FastQC
fastqc -o . -f fastq --extract SRR5412261.fastq.gz SRR5412262.fastq.gz SRR5412263.fastq.gz SRR5412264.fastq.gz -t 10
## rCorrector
perl ~/install/Rcorrector-master/run_rcorrector.pl -t 10 -s ./SRR5412262.fastq.gz
## Filter
python ~/FilterUncorrectabledSEfastq.py -i SRR5412262.cor.fq.gz -o filtered
## fastp
fastp -i filtered_SRR5412262.cor.fq -o filtered_SRR5412262_fastp.cor.fq -q 5 -c -p -w 6 \
-h filtered_SRR5412262_fastp.html -j filtered_SRR5412262_fastp.json -R "filtered_SRR5412262_fastp report"
## FastQC
fastqc -o . -f fastq --extract filtered_SRR5412262_fastp.cor.fq -t 6
## Mapping --> 65.077% reads mapped
~/install/salmon-latest_linux_x86_64/bin/salmon quant -i ~/XT_9.1_transcripts_Nov12_k31_salmon_index -l A \
-r ./filtered_SRR5412262_fastp.cor.fq \
-o ./filtered_SRR5412262_fastp_XT_9.1_transcripts_Nov12_salmon_quant --validateMappings
## FastQC
fastqc -o . -f fastq --extract SRR5412261.fastq.gz SRR5412262.fastq.gz SRR5412263.fastq.gz SRR5412264.fastq.gz -t 10
## rCorrector
perl ~/install/Rcorrector-master/run_rcorrector.pl -t 10 -s ./SRR5412263.fastq.gz
## Filter
python ~/FilterUncorrectabledSEfastq.py -i SRR5412263.cor.fq.gz -o filtered
## fastp
fastp -i filtered_SRR5412263.cor.fq -o filtered_SRR5412263_fastp.cor.fq -q 5 -c -p -w 6 \
-h filtered_SRR5412263_fastp.html -j filtered_SRR5412263_fastp.json -R "filtered_SRR5412263_fastp report"
## FastQC
fastqc -o . -f fastq --extract filtered_SRR5412263_fastp.cor.fq -t 6
## Mapping --> 63.7469% reads mapped
~/install/salmon-latest_linux_x86_64/bin/salmon quant -i ~/XT_9.1_transcripts_Nov12_k31_salmon_index -l A \
-r ./filtered_SRR5412263_fastp.cor.fq \
-o ./filtered_SRR5412263_fastp_XT_9.1_transcripts_Nov12_salmon_quant --validateMappings
## FastQC
fastqc -o . -f fastq --extract SRR5412261.fastq.gz SRR5412262.fastq.gz SRR5412263.fastq.gz SRR5412264.fastq.gz -t 10
## rCorrector
perl ~/install/Rcorrector-master/run_rcorrector.pl -t 10 -s ./SRR5412264.fastq.gz
## Filter
python ~/FilterUncorrectabledSEfastq.py -i SRR5412264.cor.fq.gz -o filtered
## fastp
fastp -i filtered_SRR5412264.cor.fq -o filtered_SRR5412264_fastp.cor.fq -q 5 -c -p -w 6 \
-h filtered_SRR5412264_fastp.html -j filtered_SRR5412264_fastp.json -R "filtered_SRR5412264_fastp report"
## FastQC
fastqc -o . -f fastq --extract filtered_SRR5412264_fastp.cor.fq -t 6
## Mapping --> 64.046% reads mapped
~/install/salmon-latest_linux_x86_64/bin/salmon quant -i ~/XT_9.1_transcripts_Nov12_k31_salmon_index -l A \
-r ./filtered_SRR5412264_fastp.cor.fq \
-o ./filtered_SRR5412264_fastp_XT_9.1_transcripts_Nov12_salmon_quant --validateMappings
## MultiQC command to summarize all FastQC and Salmon output logs in the current folder
multiqc .
## Import libraries
import pandas as pd
import os
os.chdir("/home/zhanglab1/ndong/Lymnaea_CNS_transcriptome_files/7_Interspecies_comparison/7d_Xenopus")
## Define function
def extract_non0(salmon_output_filename, library_ID):
with open(salmon_output_filename, "r") as infile:
lib = pd.read_csv(infile, sep='\t')
lib_non0 = lib.loc[lib["TPM"]>0]
lib_non0_counts = lib_non0[["Name", "TPM"]]
print("There are", lib_non0_counts.shape[0], "transcripts with TPM>0 in the reads library", library_ID)
return(lib_non0_counts)
## Analyses
lib61 = extract_non0("./filtered_SRR5412261_fastp_XT_9.1_transcripts_Nov12_salmon_quant/quant.sf", "SRR5412261")
lib62 = extract_non0("./filtered_SRR5412262_fastp_XT_9.1_transcripts_Nov12_salmon_quant/quant.sf", "SRR5412262")
lib63 = extract_non0("./filtered_SRR5412263_fastp_XT_9.1_transcripts_Nov12_salmon_quant/quant.sf", "SRR5412263")
lib64 = extract_non0("./filtered_SRR5412264_fastp_XT_9.1_transcripts_Nov12_salmon_quant/quant.sf", "SRR5412264")
There are 23723 transcripts with TPM>0 in the reads library SRR5412261 There are 24088 transcripts with TPM>0 in the reads library SRR5412262 There are 23934 transcripts with TPM>0 in the reads library SRR5412263 There are 23566 transcripts with TPM>0 in the reads library SRR5412264
%get lib61 --from Python3
%get lib62 --from Python3
head(lib61, 10)
head(lib62, 10)
Name | TPM | |
---|---|---|
0 | gi|106635719|gb|DQ523561| | 7.478246 |
1 | gi|106880494|ref|NM_001015715| | 0.465555 |
2 | gi|106880496|ref|NM_001006893| | 14.363708 |
3 | gi|106880498|ref|NM_001011503| | 15.264303 |
4 | gi|106880500|ref|NM_001016036| | 14.218328 |
6 | gi|109150090|gb|BC117669| | 6.490515 |
8 | gi|110164836|gb|DQ658157| | 1.253106 |
9 | gi|110164838|gb|DQ658158| | 0.386988 |
10 | gi|110164840|gb|DQ658159| | 0.227837 |
11 | gi|110164842|gb|DQ658160| | 0.095692 |
Name | TPM | |
---|---|---|
0 | gi|106635719|gb|DQ523561| | 6.086433 |
1 | gi|106880494|ref|NM_001015715| | 0.437127 |
2 | gi|106880496|ref|NM_001006893| | 21.246139 |
3 | gi|106880498|ref|NM_001011503| | 5.762818 |
4 | gi|106880500|ref|NM_001016036| | 25.038422 |
5 | gi|108742161|gb|BC117668| | 0.527462 |
6 | gi|109150090|gb|BC117669| | 3.100326 |
8 | gi|110164836|gb|DQ658157| | 0.248110 |
9 | gi|110164838|gb|DQ658158| | 0.134089 |
10 | gi|110164840|gb|DQ658159| | 1.026275 |
## Define function
def sorted_merged_table(df1, df2, df3, df4,
lib_name1, lib_name2, lib_name3, lib_name4):
non0_2 = df1.merge(df2, on='Name') # Create lookup merged table of all libraries
non0_3 = non0_2.merge(df3, on='Name')
non0_4 = non0_3.merge(df4, on='Name')
non0_4['Median'] = non0_4.median(axis=1) # Calculate median read count of each transcript across all libraries
non0_4_median_sorted = non0_4.sort_values(by="Median", ascending=False) # Sort transcripts by median read count
non0_4_median_sorted.columns = ("Name", lib_name1, lib_name2, lib_name3, # Rename columns
lib_name4, "Median")
return non0_4_median_sorted
## Create median-sorted lookup table
XT_sorted_merged = sorted_merged_table(lib61, lib62, lib63, lib64,
"SRR5412261", "SRR5412262", "SRR5412263", "SRR5412264")
print("Dataframe dimensions:", XT_sorted_merged.shape)
## Write to file
XT_sorted_merged['Name'].to_csv("XT_non0_Nov12.txt", header=None, index=None)
Dataframe dimensions: (21073, 6)
XT_non0_transcript_IDs = pd.read_csv("XT_non0_Nov12.txt", sep="|", header=None)
XT_non0_transcript_IDs.columns = ["GI", "GI#", "GB", "GB#", "Name"]
## Generate GI for protein sequences
XT_non0_transcript_IDs["Prot GI"] = XT_non0_transcript_IDs["GI#"]+1
print(XT_non0_transcript_IDs.head(10))
XT_non0_transcript_IDs["Prot GI"].to_csv("XT_non0_trans_prot_IDs.txt", header=None, index=None)
GI GI# GB GB# Name Prot GI 0 gi 56972345 gb BC088071 NaN 56972346 1 gi 163937686 gb BC156029 NaN 163937687 2 gi 165970565 gb BC158455 NaN 165970566 3 gi 301632351 gb XM_002945204 NaN 301632352 4 gi 77624253 gb CR942543 NaN 77624254 5 gi 54110673 gb CR848492 NaN 54110674 6 gi 56789757 gb BC088547 NaN 56789758 7 gi 301632303 gb XM_002945183 NaN 301632304 8 gi 161612117 gb BC156001 NaN 161612118 9 gi 165971171 gb BC158446 NaN 165971172
sed 's/gi|//' xtropProtein.fasta > xtropProtein_gi_simplified.fasta
sed 's/gb|//' xtropProtein_gi_simplified.fasta > xtropProtein_gi_gb_simplified.fasta
sed 's/|.*//' xtropProtein_gi_gb_simplified.fasta > xtropProtein_simplified_ID.fasta
faSomeRecords xtropProtein_simplified_ID.fasta XT_non0_trans_prot_IDs.txt XT_non0_pep_Nov13.fa
grep -c ">" XT_non0_pep_Nov13.fa
16378
%get XT_sorted_merged --from Python3
top_20 = head(XT_sorted_merged, 20)
write.table(top_20$Name, file = "top_20_IDs.txt", row.names=FALSE, col.names=FALSE, quote=FALSE)
faSomeRecords xtropMRNA.fasta top_20_IDs.txt top_20_trans.fa
## Nr blast [28725537.bc]
submitjob 10 -c 8 -m 15 diamond blastx -d ~/nr_diamond_May16.dmnd \
-q top_20_trans.fa -o ./top_20_trans_nr.txt --max-target-seqs 1 --max-hsps 1 \
-f 6 qseqid qtitle sseqid stitle pident length mismatch gapopen qlen qstart qend slen sstart send evalue bitscore qcovhsp \
--more-sensitive --evalue 1E-5 --unal 1 -p 8
28725537.bc.ccbr.utoronto.ca
%get top_20 --from R
top_20_nr = pd.read_csv("top_20_trans_nr.txt", header = None, sep="\t")
top_20_nr.columns = ["Name", "qtitle", "sseqid", "stitle", "pident", "length", "mismatch",
"gapopen", "qlen", "qstart", "qend", "slen", "sstart", "send", "evalue", "bitscore", "qcovhsp"]
merged_top_20 = top_20.merge(top_20_nr, on="Name")
merged_top_20.to_csv("XT_top_20.csv")
%get merged_top_20 --from Python3
merged_top_20
Name | SRR5412261 | SRR5412262 | SRR5412263 | SRR5412264 | Median | qtitle | sseqid | stitle | pident | ⋯ | gapopen | qlen | qstart | qend | slen | sstart | send | evalue | bitscore | qcovhsp | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | gi|56972345|gb|BC088071| | 25467.714 | 17156.042 | 16762.691 | 19802.133 | 18479.087 | gi|56972345|gb|BC088071| Xenopus tropicalis cDNA clone IMAGE:7024642 | YP_203373.1 | YP_203373.1 cytochrome c oxidase subunit II (mitochondrion) [Xenopus tropicalis] | 92.8 | ⋯ | 0 | 725 | 1 | 666 | 229 | 8 | 229 | 9.1e-109 | 402.5 | 91.9 |
1 | gi|163937686|gb|BC156029| | 26156.461 | 15694.749 | 16590.780 | 17778.503 | 17184.641 | gi|163937686|gb|BC156029| Xenopus tropicalis cDNA clone IMAGE:6983060 | YP_203376.1 | YP_203376.1 cytochrome c oxidase subunit III (mitochondrion) [Xenopus tropicalis] | 93.3 | ⋯ | 0 | 799 | 2 | 766 | 261 | 7 | 261 | 3.9e-129 | 470.3 | 95.7 |
2 | gi|165970565|gb|BC158455| | 24266.289 | 17383.264 | 15214.021 | 16868.088 | 17125.676 | gi|165970565|gb|BC158455| Xenopus tropicalis cDNA clone IMAGE:6991509 | YP_004564340.1 | YP_004564340.1 cytochrome c oxidase subunit I [Rhinophrynus dorsalis] | 90.8 | ⋯ | 0 | 1650 | 3 | 1541 | 517 | 5 | 517 | 4.0e-261 | 909.8 | 93.3 |
3 | gi|301632351|gb|XM_002945204| | 16794.286 | 11962.705 | 21050.749 | 7328.453 | 14378.496 | gi|301632351|gb|XM_002945204| PREDICTED: Xenopus (Silurana) tropicalis 40S ribosomal protein S12-like (LOC100488821), mRNA | XP_002191710.2 | XP_002191710.2 PREDICTED: 40S ribosomal protein S12 [Taeniopygia guttata] | 86.3 | ⋯ | 0 | 191 | 36 | 188 | 201 | 63 | 113 | 1.3e-14 | 87.8 | 80.1 |
4 | gi|77624253|gb|CR942543| | 11952.081 | 13408.285 | 7262.156 | 5704.968 | 9607.118 | gi|77624253|gb|CR942543| Xenopus tropicalis finished cDNA, clone TEgg054l20 | SGW64293.1 | SGW64293.1 Uncharacterised protein [Chlamydia abortus] | 71.1 | ⋯ | 0 | 650 | 307 | 441 | 88 | 2 | 46 | 2.4e-07 | 65.5 | 20.8 |
5 | gi|54110673|gb|CR848492| | 10739.777 | 7117.134 | 6818.687 | 7714.787 | 7415.961 | gi|54110673|gb|CR848492| Xenopus tropicalis finished cDNA, clone TEgg105j19 | YP_203382.1 | YP_203382.1 cytochrome b (mitochondrion) [Xenopus tropicalis] | 96.9 | ⋯ | 0 | 389 | 2 | 385 | 380 | 252 | 379 | 2.5e-60 | 240.7 | 98.7 |
6 | gi|56789757|gb|BC088547| | 11062.100 | 7011.873 | 7449.709 | 6321.545 | 7230.791 | gi|56789757|gb|BC088547| Xenopus tropicalis hypothetical LOC496839, mRNA (cDNA clone MGC:97708 IMAGE:6985717), complete cds | YP_203375.1 | YP_203375.1 ATP synthase F0 subunit 6 (mitochondrion) [Xenopus tropicalis] | 94.2 | ⋯ | 0 | 880 | 130 | 747 | 227 | 1 | 206 | 1.3e-72 | 282.7 | 70.2 |
7 | gi|301632303|gb|XM_002945183| | 6822.635 | 10961.591 | 6286.849 | 7081.767 | 6952.201 | gi|301632303|gb|XM_002945183| PREDICTED: Xenopus (Silurana) tropicalis 60S ribosomal protein L11-like (LOC100497981), partial mRNA | XP_012812446.1 | XP_012812446.1 PREDICTED: 60S ribosomal protein L11 isoform X1 [Xenopus tropicalis] | 100.0 | ⋯ | 0 | 191 | 1 | 138 | 167 | 122 | 167 | 3.3e-18 | 99.8 | 72.3 |
8 | gi|161612117|gb|BC156001| | 6800.434 | 4887.040 | 6887.450 | 4313.739 | 5843.737 | gi|161612117|gb|BC156001| Xenopus tropicalis hypothetical protein LOC100135132, mRNA (cDNA clone MGC:185267 IMAGE:7887781), complete cds | OCA25140.1 | OCA25140.1 hypothetical protein XENTR_v90020538mg [Xenopus tropicalis] | 100.0 | ⋯ | 0 | 329 | 3 | 170 | 82 | 27 | 82 | 2.6e-23 | 117.5 | 51.1 |
9 | gi|165971171|gb|BC158446| | 9095.742 | 5547.460 | 5607.002 | 6020.808 | 5813.905 | gi|165971171|gb|BC158446| Xenopus tropicalis cDNA clone IMAGE:6982245 | YP_203370.1 | YP_203370.1 NADH dehydrogenase subunit 1 (mitochondrion) [Xenopus tropicalis] | 92.8 | ⋯ | 0 | 942 | 3 | 917 | 322 | 18 | 322 | 2.9e-147 | 530.8 | 97.1 |
10 | gi|110645425|gb|BC118862| | 6556.924 | 4769.381 | 6831.703 | 4946.521 | 5751.722 | gi|110645425|gb|BC118862| Xenopus tropicalis ribosomal protein S29, mRNA (cDNA clone MGC:146816 IMAGE:7688801), complete cds | EDL36629.1 | EDL36629.1 mCG7602, partial [Mus musculus] | 100.0 | ⋯ | 0 | 356 | 1 | 171 | 60 | 4 | 60 | 4.3e-27 | 130.2 | 48.0 |
11 | gi|49900027|gb|BC077025| | 5965.670 | 4033.391 | 6379.391 | 4080.571 | 5023.121 | gi|49900027|gb|BC077025| Xenopus tropicalis MGC89823 protein, mRNA (cDNA clone MGC:89823 IMAGE:7028509), complete cds | XP_008154493.1 | XP_008154493.1 PREDICTED: 60S ribosomal protein L38 [Eptesicus fuscus] | 90.9 | ⋯ | 0 | 327 | 39 | 269 | 109 | 33 | 109 | 1.2e-28 | 135.2 | 70.6 |
12 | gi|301632465|gb|XM_002945260| | 4075.516 | 4106.196 | 4668.846 | 5222.228 | 4387.521 | gi|301632465|gb|XM_002945260| PREDICTED: Xenopus (Silurana) tropicalis RNA-binding protein FUS-like (LOC100486778), partial mRNA | NP_001090836.1 | NP_001090836.1 RNA-binding protein FUS [Xenopus tropicalis] | 100.0 | ⋯ | 0 | 235 | 1 | 201 | 539 | 326 | 392 | 2.1e-30 | 140.6 | 85.5 |
13 | gi|166006844|gb|BC158442| | 6083.260 | 4357.335 | 3985.003 | 4303.580 | 4330.458 | gi|166006844|gb|BC158442| Xenopus tropicalis cDNA clone IMAGE:6977945 | YP_203382.1 | YP_203382.1 cytochrome b (mitochondrion) [Xenopus tropicalis] | 95.5 | ⋯ | 0 | 1169 | 1 | 1122 | 380 | 6 | 379 | 4.9e-197 | 696.4 | 96.0 |
14 | gi|55295235|gb|CR855703| | 4850.520 | 3640.195 | 4586.371 | 3510.823 | 4113.283 | gi|55295235|gb|CR855703| Xenopus tropicalis finished cDNA, clone TGas119p06 | * | * | -1.0 | ⋯ | -1 | 366 | -1 | -1 | -1 | -1 | -1 | -1.0e+00 | -1.0 | -1.0 |
15 | gi|56410017|gb|CR926212| | 4427.147 | 3749.859 | 4299.251 | 3742.414 | 4024.555 | gi|56410017|gb|CR926212| Xenopus tropicalis finished cDNA, clone TNeu141f09 | NP_001005095.1 | NP_001005095.1 ribosomal protein L36a [Xenopus tropicalis] | 100.0 | ⋯ | 0 | 372 | 4 | 321 | 106 | 1 | 106 | 9.6e-54 | 218.8 | 85.5 |
16 | gi|77626264|gb|CR760503| | 4266.092 | 3634.774 | 3855.119 | 3573.410 | 3744.946 | gi|77626264|gb|CR760503| Xenopus tropicalis finished cDNA, clone TEgg006f01 | NP_001017204.1 | NP_001017204.1 60S acidic ribosomal protein P1 [Xenopus tropicalis] | 100.0 | ⋯ | 0 | 480 | 75 | 269 | 113 | 1 | 65 | 1.1e-25 | 125.9 | 40.6 |
17 | gi|50603814|gb|BC077677| | 4131.295 | 2993.271 | 3998.739 | 2838.263 | 3496.005 | gi|50603814|gb|BC077677| Xenopus tropicalis MGC89854 protein, mRNA (cDNA clone MGC:89854 IMAGE:7028962), complete cds | XP_013816247.1 | XP_013816247.1 PREDICTED: 60S ribosomal protein L37a [Apteryx australis mantelli] | 95.0 | ⋯ | 0 | 400 | 2 | 301 | 124 | 25 | 124 | 1.0e-45 | 192.2 | 75.0 |
18 | gi|39794447|gb|BC064258| | 5018.614 | 3680.770 | 3262.165 | 3139.502 | 3471.468 | gi|39794447|gb|BC064258| Xenopus tropicalis cDNA clone IMAGE:5379153, partial cds | YP_203379.1 | YP_203379.1 NADH dehydrogenase subunit 4 (mitochondrion) [Xenopus tropicalis] | 91.7 | ⋯ | 0 | 1674 | 262 | 1638 | 459 | 1 | 459 | 2.8e-217 | 764.2 | 82.3 |
19 | gi|77626886|gb|CR760857| | 3713.808 | 3192.948 | 3710.173 | 3078.162 | 3451.561 | gi|77626886|gb|CR760857| Xenopus tropicalis finished cDNA, clone TTpA018i12 | NP_001008435.1 | NP_001008435.1 40S ribosomal protein S12 [Xenopus tropicalis] | 100.0 | ⋯ | 0 | 470 | 36 | 431 | 132 | 1 | 132 | 1.1e-67 | 265.4 | 84.3 |
rpsblast -query ~/Lymnaea_CNS_transcriptome_files/7_Interspecies_comparison/7d_Xenopus/XT_non0_pep_Nov13.fa -db Kog \
-out ~/Lymnaea_CNS_transcriptome_files/7_Interspecies_comparison/7d_Xenopus/XT_CNS_pep_Nov13_KOG.txt -evalue 1E-5 \
-outfmt "6 qseqid sseqid stitle pident length mismatch gapopen qlen qstart qend slen sstart send evalue bitscore qcovhsp qcovs" \
-max_hsps 1 -max_target_seqs 1
## Replace "[" and "]" with "#" for later import as dataframe
sed -i 's/\[/#/g' XT_CNS_pep_Nov13_KOG.txt
sed -i 's/\]./#/g' XT_CNS_pep_Nov13_KOG.txt
import pandas as pd
import os
os.chdir("/home/zhanglab1/ndong/Lymnaea_CNS_transcriptome_files/7_Interspecies_comparison/7d_Xenopus")
XT_KOG = pd.read_csv("XT_CNS_pep_Nov13_KOG.txt", sep='#', header=None, engine="python")
type(XT_KOG)
pandas.core.frame.DataFrame
%get XT_KOG --from Python3
head(XT_KOG, 10)
0 | 1 | 2 | |
---|---|---|---|
0 | 106880495 gnl|CDD|231509 KOG3573, KOG3573, KOG3573, Caspase, apoptotic cysteine protease | Cell cycle control, cell division, chromosome partitioning | 37.288 177 104 4 519 330 504 195 24 195 1.08e-39 140 34 34 |
1 | 106880497 gnl|CDD|230473 KOG2534, KOG2534, KOG2534, DNA polymerase IV (family X) | Replication, recombination and repair | 38.268 358 192 4 334 1 334 353 1 353 6.26e-146 413 100 100 |
2 | 106880499 gnl|CDD|228344 KOG0395, KOG0395, KOG0395, Ras-related GTPase | General function prediction only | 36.816 201 119 3 211 10 210 196 3 195 1.86e-51 162 95 95 |
3 | 106880501 gnl|CDD|228463 KOG0515, KOG0515, KOG0515, p53-interacting protein 53BP/ASPP, contains ankyrin and SH3 domains | Cell cycle control, cell division, chromosome partitioning | 43.224 856 369 27 1101 258 1101 752 2 752 0.0 710 77 77 |
4 | 110164837 gnl|CDD|231844 KOG3913, KOG3913, KOG3913, Wnt family of developmental regulators | Signal transduction mechanisms | 46.053 380 165 7 376 10 376 356 4 356 4.92e-151 428 98 98 |
5 | 110164839 gnl|CDD|231844 KOG3913, KOG3913, KOG3913, Wnt family of developmental regulators | Signal transduction mechanisms | 53.760 359 158 4 354 1 354 356 1 356 0.0 509 100 100 |
6 | 110164841 gnl|CDD|231844 KOG3913, KOG3913, KOG3913, Wnt family of developmental regulators | Signal transduction mechanisms | 44.582 323 163 7 313 1 312 356 39 356 5.31e-138 392 99 99 |
7 | 110559940 gnl|CDD|228433 KOG0484, KOG0484, KOG0484, Transcription factor PHOX2/ARIX, contains HOX domain | Transcription | 69.841 63 19 0 311 117 179 125 14 76 2.08e-29 106 20 20 |
8 | 110628611 gnl|CDD|231846 KOG3915, KOG3915, KOG3915, Transcription regulator dachshund, contains SKI/SNO domain | Transcription | 70.185 540 106 5 692 153 692 641 157 641 0.0 706 78 78 |
9 | 110645313 gnl|CDD|228488 KOG0540, KOG0540, KOG0540, 3-Methylcrotonyl-CoA carboxylase, non-biotin containing subunit/Acetyl-CoA carboxylase carboxyl transferase, subunit beta | Amino acid transport and metabolism, Lipid transport and metabolism | 47.866 539 266 8 540 8 540 536 7 536 0.0 742 99 99 |
## Count the number of occurrences of each category
KOGs= ["RNA processing and modification", "Chromatin structure and dynamics", "Energy production and conversion", "Cell cycle control",
"Amino acid transport and metabolism", "Nucleotide transport and metabolism", "Carbohydrate transport and metabolism", "Coenzyme transport and metabolism",
"Lipid transport and metabolism", "Translation, ribosomal structure and biogenesis", "Transcription", "Replication, recombination and repair",
"Cell wall/membrane/envelope biogenesis", "Cell motility", "Posttranslational modification", "Inorganic ion transport and metabolism",
"Secondary metabolites", "General function prediction only", "Function unknown", "Signal transduction mechanisms", "Intracellular trafficking",
"Defense mechanisms", "Extracellular structures", "Nuclear structure", "Cytoskeleton"]
data = []
for KOG in KOGs:
print(KOG, XT_KOG[1].str.contains(KOG).sum())
data.append([KOG, XT_KOG[1].str.contains(KOG).sum()])
df = pd.DataFrame(data)
df.columns = ["KOG", "Count"]
df["XT_Percentage"] = df["Count"]/df["Count"].sum()*100
print(df)
df[["KOG", "XT_Percentage"]].to_csv("XT_KOG_summary.txt", sep="\t", index=None)
RNA processing and modification 475 Chromatin structure and dynamics 217 Energy production and conversion 279 Cell cycle control 375 Amino acid transport and metabolism 296 Nucleotide transport and metabolism 161 Carbohydrate transport and metabolism 360 Coenzyme transport and metabolism 89 Lipid transport and metabolism 419 Translation, ribosomal structure and biogenesis 428 Transcription 1234 Replication, recombination and repair 278 Cell wall/membrane/envelope biogenesis 164 Cell motility 28 Posttranslational modification 1145 Inorganic ion transport and metabolism 419 Secondary metabolites 178 General function prediction only 1752 Function unknown 933 Signal transduction mechanisms 2940 Intracellular trafficking 811 Defense mechanisms 124 Extracellular structures 318 Nuclear structure 85 Cytoskeleton 710 KOG Count XT_Percentage 0 RNA processing and modification 475 3.340836 1 Chromatin structure and dynamics 217 1.526234 2 Energy production and conversion 279 1.962301 3 Cell cycle control 375 2.637502 4 Amino acid transport and metabolism 296 2.081868 5 Nucleotide transport and metabolism 161 1.132367 6 Carbohydrate transport and metabolism 360 2.532002 7 Coenzyme transport and metabolism 89 0.625967 8 Lipid transport and metabolism 419 2.946969 9 Translation, ribosomal structure and biogenesis 428 3.010269 10 Transcription 1234 8.679139 11 Replication, recombination and repair 278 1.955268 12 Cell wall/membrane/envelope biogenesis 164 1.153467 13 Cell motility 28 0.196933 14 Posttranslational modification 1145 8.053172 15 Inorganic ion transport and metabolism 419 2.946969 16 Secondary metabolites 178 1.251934 17 General function prediction only 1752 12.322408 18 Function unknown 933 6.562104 19 Signal transduction mechanisms 2940 20.678014 20 Intracellular trafficking 811 5.704037 21 Defense mechanisms 124 0.872134 22 Extracellular structures 318 2.236601 23 Nuclear structure 85 0.597834 24 Cytoskeleton 710 4.993670
test_prot = pd.read_csv("xtropProtein_IDs_clean.txt", sep="|", header=None)
test_prot.columns = ["GI", "GI#", "GB", "GB#", "Name"]
print(test_prot.head(10))
test_prot["GI#"].to_csv("prot_GI.txt", header=None, index=None)
GI GI# GB GB# \ 0 gi 106635720 gb ABF82222 1 gi 106880495 ref NP_001015715 2 gi 106880497 ref NP_001006894 3 gi 106880499 ref NP_001011503 4 gi 106880501 ref NP_001016036 5 gi 106880505 ref NP_001016899 6 gi 108742162 gb AAI17669 7 gi 108860816 sp Q28CJ6 8 gi 109150091 gb AAI17670 9 gi 109892578 sp Q28BS0 Name 0 tropoelastin 1 [Xenopus (Silurana) tropicalis] 1 caspase 10, apoptosis-related cysteine peptid... 2 DNA-directed DNA polymerase beta [Xenopus (Si... 3 ras-related protein ras-dva [Xenopus (Siluran... 4 tumor protein p53 binding protein, 2 [Xenopus... 5 zinc finger, AN1-type domain 2A [Xenopus (Sil... 6 LOC734134 protein [Xenopus (Silurana) tropica... 7 RecName: Full=Nuclear distribution protein nu... 8 MGC146599 protein [Xenopus (Silurana) tropica... 9 RecName: Full=Zygotic DNA replication licensi...
#faSomeRecords XENTR_9.1.pep.fa XT_567789_non0_Jun28_IDs_clean.txt XT_non0_Nov11_pep.fa
grep -c ">" XT_non0_Nov11_pep.fa
19523
grep '^>' XENTR_9.1.pep.fa > XENTR_9.1.pep_IDs.txt
sed 's/>//' XENTR_9.1.pep_IDs.txt > XENTR_9.1.pep_IDs_clean.txt
XT_pep_names = pd.read_csv("XENTR_9.1.pep_IDs_clean.txt", sep=' ', header=None)
XT_pep_names.columns = ['Name', 'Protein name']
print(XT_pep_names.head(10))
Name Protein name 0 gnl|gene5703|XT-9_1-gene5703|rna18481| lamp5 1 gnl|gene29194|XT-9_1-gene29194|rna56249| LOC108646171 2 gnl|gene32155|XT-9_1-gene32155|rna60914| LOC100489996 3 gnl|gene5412|XT-9_1-gene5412|rna17549| slc25a42 4 gnl|gene13348|XT-9_1-gene13348|rna32485| LOC101732262 5 gnl|gene16688|XT-9_1-gene16688|rna36484| cxorf56 6 gnl|gene2364|XT-9_1-gene2364|rna7821| psat1 7 gnl|gene26570|XT-9_1-gene26570|rna52076| polr2k 8 gnl|gene25756|XT-9_1-gene25756|rna50801| LOC105945565 9 gnl|gene31412|XT-9_1-gene31412|rna59732| LOC108648088
Failed to switch to language SoS: list index out of range
XT_sorted_transcript_protein_names = XT_sorted_transcript_names.merge(XT_pep_names, on='Name')
%get XT_sorted_transcript_protein_names --from Python3
head(XT_sorted_transcript_protein_names, 50)
nrow(XT_sorted_transcript_protein_names)
Name | SRR5412261 | SRR5412262 | SRR5412263 | SRR5412264 | Median | Transcript name | Protein name | |
---|---|---|---|---|---|---|---|---|
0 | gnl|gene9400|XT-9_1-gene9400|rna28486| | 34262.40 | 21372.00 | 19923.400 | 23957.60 | 22664.800 | COX3 | COX3 |
1 | gnl|gene13358|XT-9_1-gene13358|rna32495| | 17891.20 | 19839.20 | 17838.000 | 23056.20 | 18865.200 | Xetrov90018627m.g | Xetrov90018627m.g |
2 | gnl|gene12159|XT-9_1-gene12159|rna31283| | 7685.61 | 8236.86 | 7409.810 | 9916.28 | 7961.235 | sars2 | sars2 |
3 | gnl|gene8523|XT-9_1-gene8523|rna27571| | 4686.72 | 5424.14 | 4983.960 | 4996.06 | 4990.010 | tmsb4x | tmsb4x |
4 | gnl|gene13537|XT-9_1-gene13537|rna32679| | 3684.75 | 3691.08 | 5331.400 | 4662.00 | 4176.540 | Xetrov90007052m.g | Xetrov90007052m.g |
5 | gnl|gene13769|XT-9_1-gene13769|rna32914| | 6162.55 | 3449.62 | 3759.590 | 3283.22 | 3604.605 | Xetrov90016335m.g | Xetrov90016335m.g |
6 | gnl|gene14873|XT-9_1-gene14873|rna34047| | 3293.41 | 3011.75 | 2935.460 | 3723.82 | 3152.580 | dvl2 | dvl2 |
7 | gnl|gene23564|XT-9_1-gene23564|rna47191| | 2177.18 | 3172.25 | 3111.710 | 2953.40 | 3032.555 | fkbp1b | fkbp1b |
8 | gnl|gene313|XT-9_1-gene313|rna961| | 3109.93 | 2670.48 | 2957.680 | 2773.92 | 2865.800 | eef1a1 | eef1a1 |
9 | gnl|gene11706|XT-9_1-gene11706|rna30825| | 2925.59 | 2238.96 | 2839.190 | 2412.61 | 2625.900 | Xetrov90005946m.g | Xetrov90005946m.g |
10 | gnl|gene6172|XT-9_1-gene6172|rna19929| | 2338.78 | 2710.91 | 2538.690 | 2927.91 | 2624.800 | cirbp | cirbp |
11 | gnl|gene762|XT-9_1-gene762|rna2442| | 2287.67 | 2863.72 | 2243.550 | 3005.64 | 2575.695 | actg1 | actg1 |
12 | gnl|gene33886|XT-9_1-gene33886|rna63832| | 3105.31 | 2061.37 | 2914.120 | 1773.81 | 2487.745 | rpl39 | rpl39 |
13 | gnl|gene14842|XT-9_1-gene14842|rna34016| | 1897.45 | 2241.87 | 2632.310 | 2551.92 | 2396.895 | Xetrov90018872m.g | Xetrov90018872m.g |
14 | gnl|gene16125|XT-9_1-gene16125|rna35596| | 2419.06 | 2357.58 | 2049.110 | 2242.28 | 2299.930 | cox4i2 | cox4i2 |
15 | gnl|gene3516|XT-9_1-gene3516|rna11539| | 2674.78 | 2113.82 | 2290.590 | 2019.46 | 2202.205 | rplp1 | rplp1 |
16 | gnl|gene1193|XT-9_1-gene1193|rna3828| | 1684.45 | 2493.07 | 1838.490 | 2509.48 | 2165.780 | actb | actb |
17 | gnl|gene30703|XT-9_1-gene30703|rna58670| | 2471.17 | 1542.44 | 2454.710 | 1516.01 | 1998.575 | rpl38 | rpl38 |
18 | gnl|gene17120|XT-9_1-gene17120|rna37099| | 1804.75 | 2580.81 | 1643.070 | 2050.96 | 1927.855 | acbd7 | acbd7 |
19 | gnl|gene5036|XT-9_1-gene5036|rna16400| | 2040.25 | 1791.25 | 1880.230 | 1927.44 | 1903.835 | snap25 | snap25 |
20 | gnl|gene206|XT-9_1-gene206|rna625| | 1947.24 | 1856.75 | 2087.360 | 1823.03 | 1901.995 | ppia | ppia |
21 | gnl|gene5805|XT-9_1-gene5805|rna18773| | 2400.87 | 1607.48 | 2157.750 | 1485.29 | 1882.615 | rpl37a | rpl37a |
22 | gnl|gene2815|XT-9_1-gene2815|rna9270| | 1941.56 | 1615.96 | 1950.610 | 1805.61 | 1873.585 | eef2.1 | eef2.1 |
23 | gnl|gene7201|XT-9_1-gene7201|rna23259| | 2179.98 | 1543.72 | 2148.580 | 1544.92 | 1846.750 | rps25 | rps25 |
24 | gnl|gene411|XT-9_1-gene411|rna1288| | 1651.30 | 2172.69 | 1325.560 | 1937.73 | 1794.515 | gapdh | gapdh |
25 | gnl|gene4592|XT-9_1-gene4592|rna14953| | 2415.58 | 1796.56 | 1437.620 | 1783.25 | 1789.905 | fth1 | fth1 |
26 | gnl|gene25950|XT-9_1-gene25950|rna51129| | 1711.46 | 1841.37 | 1594.450 | 1918.67 | 1776.415 | vamp2 | vamp2 |
27 | gnl|gene21347|XT-9_1-gene21347|rna43691| | 1774.86 | 1767.71 | 1843.740 | 1451.54 | 1771.285 | fxyd3 | fxyd3 |
28 | gnl|gene16546|XT-9_1-gene16546|rna36242| | 1929.19 | 1661.91 | 1835.190 | 1707.28 | 1771.235 | ndufa4 | ndufa4 |
29 | gnl|gene29740|XT-9_1-gene29740|rna57131| | 1091.21 | 1972.33 | 1879.310 | 1635.10 | 1757.205 | calm2 | calm2 |
30 | gnl|gene14332|XT-9_1-gene14332|rna33491| | 1196.21 | 1625.56 | 1708.270 | 2044.97 | 1666.915 | LOC101731835-like | LOC101731835-like |
31 | gnl|gene29484|XT-9_1-gene29484|rna56739| | 1506.27 | 1816.51 | 1516.400 | 1874.31 | 1666.455 | cfl1 | cfl1 |
32 | gnl|gene11448|XT-9_1-gene11448|rna30566| | 1589.40 | 1697.85 | 1597.670 | 1769.62 | 1647.760 | Xetrov90023860m.g | Xetrov90023860m.g |
33 | gnl|gene6576|XT-9_1-gene6576|rna21230| | 1855.76 | 1433.75 | 1859.490 | 1343.12 | 1644.755 | rpl36a | rpl36a |
34 | gnl|gene6432|XT-9_1-gene6432|rna20758| | 1520.94 | 1707.19 | 1583.110 | 1678.68 | 1630.895 | ubc | ubc |
35 | gnl|gene16958|XT-9_1-gene16958|rna36852| | 1179.88 | 1636.30 | 1615.090 | 1638.56 | 1625.695 | nptxr | nptxr |
36 | gnl|gene32637|XT-9_1-gene32637|rna61704| | 1910.73 | 1505.78 | 1716.260 | 1490.69 | 1611.020 | rps11 | rps11 |
37 | gnl|gene31967|XT-9_1-gene31967|rna60584| | 2074.05 | 1484.19 | 1737.730 | 1417.40 | 1610.960 | rpl12 | rpl12 |
38 | gnl|gene5104|XT-9_1-gene5104|rna16614| | 1518.96 | 1582.05 | 1636.660 | 1604.35 | 1593.200 | itm2b | itm2b |
39 | gnl|gene25085|XT-9_1-gene25085|rna49743| | 1360.77 | 1484.83 | 1836.480 | 1592.80 | 1538.815 | gpm6a | gpm6a |
40 | gnl|gene7696|XT-9_1-gene7696|rna24775| | 1408.28 | 1498.95 | 1560.870 | 1562.40 | 1529.910 | syp | syp |
41 | gnl|gene22504|XT-9_1-gene22504|rna45541| | 1735.81 | 1403.59 | 1652.250 | 1380.38 | 1527.920 | rpl35 | rpl35 |
42 | gnl|gene34123|XT-9_1-gene34123|rna64190| | 1223.71 | 1663.90 | 1433.740 | 1615.26 | 1524.500 | basp1 | basp1 |
43 | gnl|gene1050|XT-9_1-gene1050|rna3368| | 1430.14 | 1582.36 | 1336.670 | 1594.79 | 1506.250 | aldoc | aldoc |
44 | gnl|gene580|XT-9_1-gene580|rna1804| | 1432.96 | 1556.29 | 1446.140 | 1616.36 | 1501.215 | stmn2 | stmn2 |
45 | gnl|gene9711|XT-9_1-gene9711|rna28802| | 500.60 | 2486.31 | 414.573 | 2438.46 | 1469.530 | Xetrov90003859m.g | Xetrov90003859m.g |
46 | gnl|gene2705|XT-9_1-gene2705|rna8958| | 1106.54 | 1882.21 | 1168.030 | 1730.07 | 1449.050 | tuba1b | tuba1b |
47 | gnl|gene6005|XT-9_1-gene6005|rna19383| | 1698.20 | 1353.46 | 1534.050 | 1298.17 | 1443.755 | rpl10 | rpl10 |
48 | gnl|gene23502|XT-9_1-gene23502|rna47094| | 1638.36 | 1285.80 | 1597.680 | 1274.82 | 1441.740 | rps7 | rps7 |
49 | gnl|gene5084|XT-9_1-gene5084|rna16546| | 1552.41 | 1249.21 | 1342.130 | 1529.07 | 1435.600 | atp6v0c | atp6v0c |
SRR5412261-4_non0_Jun23.txt
---> 20,739 IDs## Identify transcripts with TPM>0 in all four libraries
> grep -Fwf SRR5412261_non0.txt SRR5412262_non0.txt > SRR5412261-2_non0_Jun23.txt
> grep -Fwf SRR5412261-2_non0_Jun23.txt SRR5412263_non0.txt > SRR5412261-3_non0_Jun23.txt
> grep -Fwf SRR5412261-3_non0_Jun23.txt SRR5412264_non0.txt > SRR5412261-4_non0_Jun23.txt
wc -l SRR5412261-4_non0_Jun23.txt
echo ""
head -n 10 SRR5412261-4_non0_Jun23.txt
20739 SRR5412261-4_non0_Jun23.txt gnl|gene30210|XT-9_1-gene30210|rna57878| gnl|gene9400|XT-9_1-gene9400|rna28486| gnl|gene13358|XT-9_1-gene13358|rna32495| gnl|gene18150|XT-9_1-gene18150|rna38736| gnl|gene12159|XT-9_1-gene12159|rna31283| gnl|gene25743|XT-9_1-gene25743|rna50783| gnl|gene8523|XT-9_1-gene8523|rna27571| gnl|gene13537|XT-9_1-gene13537|rna32679| gnl|gene17429|XT-9_1-gene17429|rna37560| gnl|gene26471|XT-9_1-gene26471|rna51932|
## Extract
filterbyname.sh \
in=XT_9.1_transcripts.fa \
out=XT_non0_transcripts.fa \
names=SRR5412261-4_non0_Jun23.txt \
include=t substring
java -ea -Xmx18549m -cp /home/zhanglab1/ndong/anaconda3/opt/bbmap-38.06/current/ driver.FilterReadsByName in=XT_9.1_transcripts.fa out=XT_non0_transcripts.fa names=SRR5412261-4_non0_Jun23.txt include=t substring Executing driver.FilterReadsByName [in=XT_9.1_transcripts.fa, out=XT_non0_transcripts.fa, names=SRR5412261-4_non0_Jun23.txt, include=t, substring] Input is being processed as unpaired Time: 98.370 seconds. Reads Processed: 34192 0.35k reads/sec Bases Processed: 109m 1.12m bases/sec Reads Out: 20739 Bases Out: 89731253
#grep '^>' XT_non0_transcripts.fa > XT_non0_transcripts_IDs.txt
wc -l XT_non0_transcripts_IDs.txt
head -n 10 XT_non0_transcripts_IDs.txt
20739 XT_non0_transcripts_IDs.txt >gnl|gene5703|XT-9_1-gene5703|rna18481| lamp5 >gnl|gene29194|XT-9_1-gene29194|rna56249| LOC108646171 >gnl|gene32155|XT-9_1-gene32155|rna60914| LOC100489996 >gnl|gene5412|XT-9_1-gene5412|rna17549| slc25a42 >gnl|gene16688|XT-9_1-gene16688|rna36484| cxorf56 >gnl|gene2364|XT-9_1-gene2364|rna7821| psat1 >gnl|gene26570|XT-9_1-gene26570|rna52076| polr2k >gnl|gene31412|XT-9_1-gene31412|rna59732| LOC108648088 >gnl|gene23209|XT-9_1-gene23209|rna46652| anxa11 >gnl|gene32165|XT-9_1-gene32165|rna60927| acp6
SRR5412261-4_non0_Jun23_pep.fa
---> 19,523 sequences## Extract protein sequences of TPM>0 transcripts
> faSomeRecords XENTR_9.1.pep.fa SRR5412261-4_non0_Jun23.txt SRR5412261-4_non0_Jun23_pep.fa
XT_CNS_pep_June23_KOG.txt
XT_CNS_pep_June23_KOG.xlsx
## KOG analysis of transcripts with TPM>0 in all 4 libraries [27921865.bc]
> rpsblast -query ~/CNS-transcriptomes/XT/SRR5412261-4_non0_Jun23_pep.fa -db Kog \
-out ~/CNS-transcriptomes/XT/XT_CNS_pep_June23_KOG.txt -evalue 1E-5 \
-outfmt "6 qseqid sseqid stitle pident length mismatch gapopen qlen qstart qend slen sstart send evalue bitscore qcovhsp qcovs" \
-max_hsps 1 -max_target_seqs 1