## Download Ensembl Release 92 ncRNA sequences
wget ftp://ftp.ensembl.org/pub/release-92/fasta/caenorhabditis_elegans/ncrna/Caenorhabditis_elegans.WBcel235.ncrna.fa.gz
mv Caenorhabditis_elegans.WBcel235.ncrna.fa CE_ncRNA.fa
## Download Ensembl Release 92 cDNA
wget ftp://ftp.ensembl.org/pub/release-92/fasta/caenorhabditis_elegans/cdna/Caenorhabditis_elegans.WBcel235.cdna.all.fa.gz
mv Caenorhabditis_elegans.WBcel235.cdna.all.fa CE_cDNA.fa
## Create Salmon index (k=31)
cat CE_cDNA.fa CE_ncRNA.fa > CE_all_RNA.fa
salmon index -t CE_all_RNA.fa -i ~/CE_all_RNA_Jun29_k31_salmon_index --type quasi -k 31
## FastQC
fastqc -o . -f fastq --extract SRR7443976.fastq -t 10
## rCorrector
perl ~/install/Rcorrector-master/run_rcorrector.pl -t 10 -s SRR7443976.fastq
## Filter
python ~/FilterUncorrectabledSEfastq.py -i SRR7443976.cor.fq -o filtered
## fastp
fastp -i filtered_SRR7443976.cor.fq -o filtered_SRR7443976_fastp.cor.fq \
-q 5 -c -p -w 10 \
-h filtered_SRR7443976_fastp.html -j filtered_SRR7443976_fastp.json \
-R "filtered_SRR7443976 report"
## FastQC
fastqc -o . -f fastq --extract filtered_SRR7443976_fastp.cor.fq -t 10
## Mapping
salmon quant -i ~/CE_all_RNA_Jun29_k31_salmon_index -l A \
-r filtered_SRR7443976_fastp.cor.fq \
-o ~/filtered_SRR7443976_fastp_CE_all_RNA_Jun29_k31_salmon_quant
## FastQC
fastqc -o . -f fastq --extract SRR7443981.fastq -t 10
## rCorrector
perl ~/install/Rcorrector-master/run_rcorrector.pl -t 10 -s SRR7443981.fastq
## Filter
python ~/FilterUncorrectabledSEfastq.py -i SRR7443981.cor.fq -o filtered
## fastp
fastp -i filtered_SRR7443981.cor.fq -o filtered_SRR7443981_fastp.cor.fq \
-q 5 -c -p -w 10 \
-h filtered_SRR7443981_fastp.html -j filtered_SRR7443981_fastp.json \
-R "filtered_SRR7443981 report"
## FastQC
fastqc -o . -f fastq --extract filtered_SRR7443981_fastp.cor.fq -t 10
## Mapping
salmon quant -i ~/CE_all_RNA_Jun29_k31_salmon_index -l A \
-r filtered_SRR7443981_fastp.cor.fq \
-o ~/filtered_SRR7443981_fastp_CE_all_RNA_Jun29_k31_salmon_quant
## FastQC
fastqc -o . -f fastq --extract SRR7443983.fastq -t 10
## rCorrector
perl ~/install/Rcorrector-master/run_rcorrector.pl -t 10 -s SRR7443983.fastq
## Filter
python ~/FilterUncorrectabledSEfastq.py -i SRR7443983.cor.fq -o filtered
## fastp
fastp -i filtered_SRR7443983.cor.fq -o filtered_SRR7443983_fastp.cor.fq \
-q 5 -c -p -w 10 \
-h filtered_SRR7443983_fastp.html -j filtered_SRR7443983_fastp.json \
-R "filtered_SRR7443983 report"
## FastQC
fastqc -o . -f fastq --extract filtered_SRR7443983_fastp.cor.fq -t 10
## Mapping (k=31)
salmon quant -i ~/CE_all_RNA_Jun29_k31_salmon_index -l A \
-r filtered_SRR7443983_fastp.cor.fq \
-o ~/filtered_SRR7443983_fastp_CE_all_RNA_Jun29_k31_salmon_quant
## FastQC
fastqc -o . -f fastq --extract SRR7443984.fastq -t 10
## rCorrector
perl ~/install/Rcorrector-master/run_rcorrector.pl -t 10 -s SRR7443984.fastq
## Filter
python ~/FilterUncorrectabledSEfastq.py -i SRR7443984.cor.fq -o filtered
## fastp
fastp -i filtered_SRR7443984.cor.fq -o filtered_SRR7443984_fastp.cor.fq \
-q 5 -c -p -w 10 \
-h filtered_SRR7443984_fastp.html -j filtered_SRR7443984_fastp.json \
-R "filtered_SRR7443984 report"
## FastQC
fastqc -o . -f fastq --extract filtered_SRR7443984_fastp.cor.fq -t 10
## Mapping (k=31)
salmon quant -i ~/CE_all_RNA_Jun29_k31_salmon_index -l A \
-r filtered_SRR7443984_fastp.cor.fq \
-o ~/filtered_SRR7443984_fastp_CE_all_RNA_Jun29_k31_salmon_quant
## MultiQC command to summarize all FastQC and Salmon output logs in the current folder
multiqc .
## Import libraries
import pandas as pd
import os
os.chdir("/home/zhanglab1/ndong/Lymnaea_CNS_transcriptome_files/7_Interspecies_comparison/7e_CElegans")
## Define function
def extract_non0(salmon_output_filename, library_ID):
with open(salmon_output_filename, "r") as infile:
lib = pd.read_csv(infile, sep='\t')
lib_non0 = lib.loc[lib["TPM"]>0]
lib_non0_counts = lib_non0[["Name", "TPM"]]
print("There are", lib_non0_counts.shape[0], "transcripts with TPM>0 in the reads library", library_ID)
return(lib_non0_counts)
## Analyses
lib76 = extract_non0("./filtered_SRR7443976_fastp_CE_all_RNA_Jun29_k31_salmon_quant/quant.sf", "SRR7443976")
lib81 = extract_non0("./filtered_SRR7443981_fastp_CE_all_RNA_Jun29_k31_salmon_quant/quant.sf", "SRR7443981")
lib83 = extract_non0("./filtered_SRR7443983_fastp_CE_all_RNA_Jun29_k31_salmon_quant/quant.sf", "SRR7443983")
lib84 = extract_non0("./filtered_SRR7443984_fastp_CE_all_RNA_Jun29_k31_salmon_quant/quant.sf", "SRR7443984")
There are 26452 transcripts with TPM>0 in the reads library SRR7443976 There are 28423 transcripts with TPM>0 in the reads library SRR7443981 There are 24779 transcripts with TPM>0 in the reads library SRR7443983 There are 28210 transcripts with TPM>0 in the reads library SRR7443984
%get lib76 --from Python3
head(lib76, 20)
Name | TPM | |
---|---|---|
0 | Y110A7A.10.1 | 4.34596000 |
2 | F27C8.1.1 | 0.00165549 |
3 | F27C8.1.2 | 0.81521500 |
4 | F07C3.7 | 0.89214800 |
5 | F52H2.2a | 3.21511000 |
6 | F52H2.2b | 0.47940200 |
7 | T13A10.10a | 0.11426700 |
8 | T13A10.10b | 0.14689100 |
9 | C55C2.5a | 2.05299000 |
10 | C55C2.5b | 0.02874830 |
11 | C55C2.5c | 0.26859100 |
14 | T11F9.4b | 0.06992060 |
16 | F28F9.4 | 0.03724270 |
18 | Y53H1C.1b | 0.16832100 |
20 | C50F2.10 | 0.41513400 |
23 | T22H6.5 | 0.87820000 |
24 | T22H6.7 | 5.13717000 |
26 | M79.1b | 3.42322000 |
27 | M79.1c | 1.02495000 |
28 | C24F3.5a | 0.10242500 |
## Define function
def sorted_merged_table(df1, df2, df3, df4,
lib_name1, lib_name2, lib_name3, lib_name4):
non0_2 = df1.merge(df2, on='Name') # Create lookup merged table of all libraries
non0_3 = non0_2.merge(df3, on='Name')
non0_4 = non0_3.merge(df4, on='Name')
non0_4['Median'] = non0_4.median(axis=1) # Calculate median read count of each transcript across all libraries
non0_4_median_sorted = non0_4.sort_values(by="Median", ascending=False) # Sort transcripts by median read count
non0_4_median_sorted.columns = ("Name", lib_name1, lib_name2, lib_name3, # Rename columns
lib_name4, "Median")
return non0_4_median_sorted
## Create median-sorted lookup table
CE_sorted_merged = sorted_merged_table(lib76, lib81, lib83, lib84,
"SRR7443976", "SRR7443981", "SRR7443983", "SRR7443984")
print("Dataframe dimensions:", CE_sorted_merged.shape)
## Output non-0 transcript IDs to file
#CE_sorted_merged["Name"].to_csv("./CE_567789_non0_Jun28.txt", index=None)
Dataframe dimensions: (18307, 6)
%get CE_sorted_merged --from Python3
head(CE_sorted_merged, 10)
Name | SRR7443976 | SRR7443981 | SRR7443983 | SRR7443984 | Median | |
---|---|---|---|---|---|---|
16730 | F31C3.11 | 254332.00 | 266963.00 | 233013.00 | 291029.0 | 260647.50 |
16731 | F31C3.9 | 186760.00 | 144294.00 | 194841.00 | 175669.0 | 181214.50 |
16729 | F31C3.8 | 85603.80 | 84404.90 | 92519.20 | 76345.5 | 85004.35 |
16728 | F31C3.7 | 85581.30 | 84402.20 | 92559.60 | 76343.1 | 84991.75 |
17128 | R144.15 | 16107.30 | 13197.50 | 11744.20 | 11604.0 | 12470.85 |
16297 | F31C3.10 | 11235.90 | 8841.23 | 11140.50 | 10827.3 | 10983.90 |
16742 | Y102A5D.11 | 8561.35 | 15142.20 | 7212.02 | 11440.7 | 10001.03 |
16743 | Y102A5D.12 | 8560.12 | 15142.90 | 7212.08 | 11441.1 | 10000.61 |
16741 | Y102A5D.10 | 8561.40 | 15139.10 | 7211.84 | 11382.0 | 9971.70 |
16740 | Y102A5D.9 | 8561.30 | 15138.50 | 7211.72 | 11381.9 | 9971.60 |
CE_median_sorted_IDs = CE_sorted_merged["Name"]
CE_median_sorted_IDs.nunique()
18307
%get CE_median_sorted_IDs --from Python3
head(CE_median_sorted_IDs, 10)
library(biomaRt)
CE_ensembl = useMart("ensembl", dataset="celegans_gene_ensembl")
CE_ensembl_attributes = listAttributes(CE_ensembl)
CE_ensembl_filters = listFilters(CE_ensembl)
## Create getBM() query for converting transcript IDs to gene IDs
CE_biomart_output <- getBM(attributes = c('ensembl_transcript_id', 'ensembl_gene_id', 'external_transcript_name', "description",
'transcript_biotype', 'gene_biotype'),
filters = 'ensembl_transcript_id',
values = CE_median_sorted_IDs,
mart = CE_ensembl)
## Preview results
head(CE_biomart_output, 20)
dim(CE_biomart_output)
Batch submitting query [=>-----------------------------] 5% eta: 13s Batch submitting query [==>----------------------------] 8% eta: 19s Batch submitting query [==>----------------------------] 11% eta: 20s Batch submitting query [===>---------------------------] 14% eta: 22s Batch submitting query [====>--------------------------] 16% eta: 22s Batch submitting query [=====>-------------------------] 19% eta: 22s Batch submitting query [======>------------------------] 22% eta: 24s Batch submitting query [=======>-----------------------] 24% eta: 24s Batch submitting query [=======>-----------------------] 27% eta: 23s Batch submitting query [========>----------------------] 30% eta: 22s Batch submitting query [=========>---------------------] 32% eta: 22s Batch submitting query [==========>--------------------] 35% eta: 21s Batch submitting query [===========>-------------------] 38% eta: 20s Batch submitting query [============>------------------] 41% eta: 19s Batch submitting query [============>------------------] 43% eta: 18s Batch submitting query [=============>-----------------] 46% eta: 19s Batch submitting query [==============>----------------] 49% eta: 18s Batch submitting query [===============>---------------] 51% eta: 17s Batch submitting query [================>--------------] 54% eta: 16s Batch submitting query [=================>-------------] 57% eta: 15s Batch submitting query [=================>-------------] 59% eta: 14s Batch submitting query [==================>------------] 62% eta: 13s Batch submitting query [===================>-----------] 65% eta: 12s Batch submitting query [====================>----------] 68% eta: 11s Batch submitting query [=====================>---------] 70% eta: 10s Batch submitting query [======================>--------] 73% eta: 9s Batch submitting query [======================>--------] 76% eta: 8s Batch submitting query [=======================>-------] 78% eta: 7s Batch submitting query [========================>------] 81% eta: 6s Batch submitting query [=========================>-----] 84% eta: 5s Batch submitting query [==========================>----] 86% eta: 5s Batch submitting query [===========================>---] 89% eta: 4s Batch submitting query [===========================>---] 92% eta: 3s Batch submitting query [============================>--] 95% eta: 2s Batch submitting query [=============================>-] 97% eta: 1s Batch submitting query [===============================] 100% eta: 0s
ensembl_transcript_id | ensembl_gene_id | external_transcript_name | description | transcript_biotype | gene_biotype |
---|---|---|---|---|---|
AH10.8 | WBGene00197630 | AH10.8 | ncRNA | ncRNA | |
B0034.3a | WBGene00000403 | B0034.3a | CAlSYntenin/Alcadein homolog; Calsyntenin [Source:UniProtKB/TrEMBL;Acc:G5ED46] | protein_coding | protein_coding |
B0034.3c | WBGene00000403 | B0034.3c | CAlSYntenin/Alcadein homolog; Calsyntenin [Source:UniProtKB/TrEMBL;Acc:G5ED46] | protein_coding | protein_coding |
B0041.4 | WBGene00004415 | B0041.4 | 60S ribosomal protein L4 [Source:UniProtKB/Swiss-Prot;Acc:O02056] | protein_coding | protein_coding |
B0244.2 | WBGene00002048 | B0244.2 | IA2; IDA-1 protein; Related to Islet cell Diabetes Autoantigen [Source:UniProtKB/TrEMBL;Acc:G5EGJ5] | protein_coding | protein_coding |
B0250.1 | WBGene00004413 | B0250.1 | 60S ribosomal protein L8 [Source:UniProtKB/Swiss-Prot;Acc:Q9XVF7] | protein_coding | protein_coding |
B0250.19 | WBGene00219782 | B0250.19 | snoRNA | snoRNA | |
B0252.8 | WBGene00044615 | B0252.8 | protein_coding | protein_coding | |
B0285.12 | WBGene00045168 | B0285.12 | ncRNA | ncRNA | |
B0294.3 | WBGene00015118 | B0294.3 | protein_coding | protein_coding | |
B0334.12 | WBGene00007539 | B0334.12 | snoRNA | snoRNA | |
B0336.10.1 | WBGene00004435 | B0336.10.1 | 60S ribosomal protein L23 [Source:UniProtKB/Swiss-Prot;Acc:P48158] | protein_coding | protein_coding |
B0350.79 | WBGene00201166 | B0350.79 | ncRNA | ncRNA | |
B0393.1.2 | WBGene00004469 | B0393.1.2 | 40S ribosomal protein SA [Source:UniProtKB/Swiss-Prot;Acc:P46769] | protein_coding | protein_coding |
B0412.4 | WBGene00004498 | B0412.4 | Ribosomal Protein, Small subunit [Source:UniProtKB/TrEMBL;Acc:P90983] | protein_coding | protein_coding |
B0412.7 | WBGene00044939 | B0412.7 | snoRNA | snoRNA | |
B0495.15 | WBGene00197163 | B0495.15 | ncRNA | ncRNA | |
B0513.3a | WBGene00004443 | B0513.3a | 60S ribosomal protein L29 [Source:UniProtKB/TrEMBL;Acc:O45226] | protein_coding | protein_coding |
B0513.3b | WBGene00004443 | B0513.3b | 60S ribosomal protein L29 [Source:UniProtKB/TrEMBL;Acc:O45226] | protein_coding | protein_coding |
C01F4.8 | WBGene00199623 | C01F4.8 | ncRNA | ncRNA |
%get CE_biomart_output --from R
## Check dataframe dimension after import
print("Dataframe dimensions:", CE_biomart_output.shape, "\n")
## Change transcript ID column header to match TPM>0 read counts table for merging
CE_biomart_output_df = CE_biomart_output.rename(columns = {'ensembl_transcript_id':'Name'})
Dataframe dimensions: (18307, 6)
%get CE_biomart_output --from R
## Merge Biomart output & read counts table sorted by median read count
CE_sorted_counts_BM = CE_sorted_merged.merge(CE_biomart_output_df, on='Name')
## Extract entries of protein-coding transcripts
CE_PC_transcripts = CE_sorted_counts_BM.loc[CE_sorted_counts_BM["transcript_biotype"].str.contains("protein_coding")]
## Check dataframe dimensions
print(CE_PC_transcripts.shape)
(16121, 11)
%get CE_PC_transcripts --from Python3
## Preview the dataframe
head(CE_PC_transcripts, 20)
## Extract to CSV
write.csv(head(CE_PC_transcripts, 20), file = "CE_top_20_PC_transcripts.csv")
Name | SRR7443976 | SRR7443981 | SRR7443983 | SRR7443984 | Median | ensembl_gene_id | external_transcript_name | description | transcript_biotype | gene_biotype | |
---|---|---|---|---|---|---|---|---|---|---|---|
30 | C26F1.9 | 1040.840 | 1264.680 | 1382.360 | 1482.910 | 1323.5200 | WBGene00004453 | C26F1.9 | 60S ribosomal protein L39 [Source:UniProtKB/Swiss-Prot;Acc:P52814] | protein_coding | protein_coding |
35 | F46A8.7 | 1425.050 | 942.086 | 1154.540 | 1061.130 | 1107.8350 | WBGene00009750 | F46A8.7 | protein_coding | protein_coding | |
36 | R102.2 | 1271.800 | 667.139 | 1170.260 | 970.778 | 1070.5190 | WBGene00011289 | R102.2 | protein_coding | protein_coding | |
37 | Y37D8A.15.1 | 1116.960 | 606.606 | 1259.220 | 898.011 | 1007.4855 | WBGene00001457 | Y37D8A.15.1 | FMRFamide-like neuropeptides 14 KHEYLRF-amide 1 KHEYLRF-amide 2 KHEYLRF-amide 3 KHEYLRF-amide 4 [Source:UniProtKB/Swiss-Prot;Acc:Q9XWV7] | protein_coding | protein_coding |
39 | B0513.3a | 925.575 | 803.537 | 697.366 | 1002.770 | 864.5560 | WBGene00004443 | B0513.3a | 60S ribosomal protein L29 [Source:UniProtKB/TrEMBL;Acc:O45226] | protein_coding | protein_coding |
41 | B0513.3b | 798.273 | 770.185 | 766.005 | 1015.770 | 784.2290 | WBGene00004443 | B0513.3b | 60S ribosomal protein L29 [Source:UniProtKB/TrEMBL;Acc:O45226] | protein_coding | protein_coding |
43 | Y41D4B.5 | 896.841 | 678.738 | 737.116 | 786.792 | 761.9540 | WBGene00004497 | Y41D4B.5 | 40S ribosomal protein S28 [Source:UniProtKB/Swiss-Prot;Acc:Q95Y04] | protein_coding | protein_coding |
47 | C06B8.8.6 | 914.892 | 606.727 | 709.529 | 701.943 | 705.7360 | WBGene00004452 | C06B8.8.6 | 60S ribosomal protein L38 [Source:UniProtKB/Swiss-Prot;Acc:O17570] | protein_coding | protein_coding |
48 | T27C4.1.2 | 786.744 | 440.050 | 887.939 | 568.244 | 677.4940 | WBGene00020854 | T27C4.1.2 | protein_coding | protein_coding | |
49 | M03D4.3 | 1038.780 | 495.797 | 738.679 | 567.486 | 653.0825 | WBGene00019750 | M03D4.3 | protein_coding | protein_coding | |
51 | M142.1a | 728.419 | 407.291 | 673.979 | 582.605 | 628.2920 | WBGene00006843 | M142.1a | protein_coding | protein_coding | |
53 | C07A12.2 | 673.709 | 439.860 | 1139.670 | 444.689 | 559.1990 | WBGene00015560 | C07A12.2 | protein_coding | protein_coding | |
54 | Y48B6A.2.2 | 577.490 | 429.544 | 561.356 | 527.960 | 544.6580 | WBGene00004456 | Y48B6A.2.2 | 60S ribosomal protein L37a [Source:UniProtKB/Swiss-Prot;Acc:Q9U2A8] | protein_coding | protein_coding |
56 | F54D7.7 | 549.278 | 493.803 | 538.284 | 642.652 | 543.7810 | WBGene00023068 | F54D7.7 | Ribosomal Protein, Large subunit [Source:UniProtKB/TrEMBL;Acc:A0A0M6VD87] | protein_coding | protein_coding |
58 | F07D3.2 | 632.213 | 321.505 | 779.954 | 398.358 | 515.2855 | WBGene00001449 | F07D3.2 | FMRFamide-like neuropeptides 6 QQDSEVEREMM KSAYMRF-amide 1 KSAYMRF-amide 2 KSAYMRF-amide 3 KSAYMRF-amide 4 KSAYMRF-amide 5 KSAYMRF-amide 6 [Source:UniProtKB/Swiss-Prot;Acc:Q19165] | protein_coding | protein_coding |
61 | T03D8.3 | 580.152 | 326.811 | 610.454 | 419.574 | 499.8630 | WBGene00011392 | T03D8.3 | Seven B Two (Mammalian 7BT prohormone convertase chaperone) homolog [Source:UniProtKB/TrEMBL;Acc:Q9XTY3] | protein_coding | protein_coding |
64 | D1007.12 | 473.486 | 378.737 | 572.033 | 494.701 | 484.0935 | WBGene00004436 | D1007.12 | 60S ribosomal protein L24 [Source:UniProtKB/Swiss-Prot;Acc:O01868] | protein_coding | protein_coding |
68 | T20B3.14 | 612.494 | 263.868 | 531.541 | 406.198 | 468.8695 | WBGene00011854 | T20B3.14 | protein_coding | protein_coding | |
70 | F02A9.2b | 426.231 | 456.445 | 622.258 | 436.734 | 446.5895 | WBGene00001385 | F02A9.2b | Fatty-acid and retinol-binding protein 1 [Source:UniProtKB/Swiss-Prot;Acc:P34382] | protein_coding | protein_coding |
72 | F02A9.3.2 | 439.619 | 531.052 | 418.940 | 315.461 | 429.2795 | WBGene00001386 | F02A9.3.2 | Fatty-acid and retinol-binding protein 2 [Source:UniProtKB/Swiss-Prot;Acc:P34383] | protein_coding | protein_coding |
CE_PC_transcripts_ID <- CE_PC_transcripts$Name
## Create getBM() query for obtaining peptide sequences
CE_BM_peptide_seqs <- getSequence(id = CE_PC_transcripts_ID,
type = 'ensembl_transcript_id',
seqType = 'peptide',
mart = CE_ensembl)
## Export to FASTA
exportFASTA(CE_BM_peptide_seqs, file='./CE_non0_pep_Nov11.fasta')
Batch submitting query [=>-----------------------------] 6% eta: 34s Batch submitting query [==>----------------------------] 9% eta: 49s Batch submitting query [===>---------------------------] 12% eta: 1m Batch submitting query [====>--------------------------] 15% eta: 1m Batch submitting query [=====>-------------------------] 18% eta: 1m Batch submitting query [======>------------------------] 21% eta: 1m Batch submitting query [=======>-----------------------] 24% eta: 1m Batch submitting query [=======>-----------------------] 27% eta: 1m Batch submitting query [========>----------------------] 30% eta: 1m Batch submitting query [=========>---------------------] 33% eta: 1m Batch submitting query [==========>--------------------] 36% eta: 1m Batch submitting query [===========>-------------------] 39% eta: 1m Batch submitting query [============>------------------] 42% eta: 1m Batch submitting query [=============>-----------------] 45% eta: 1m Batch submitting query [==============>----------------] 48% eta: 1m Batch submitting query [===============>---------------] 52% eta: 1m Batch submitting query [================>--------------] 55% eta: 1m Batch submitting query [=================>-------------] 58% eta: 1m Batch submitting query [==================>------------] 61% eta: 47s Batch submitting query [===================>-----------] 64% eta: 44s Batch submitting query [====================>----------] 67% eta: 41s Batch submitting query [=====================>---------] 70% eta: 37s Batch submitting query [======================>--------] 73% eta: 34s Batch submitting query [======================>--------] 76% eta: 30s Batch submitting query [=======================>-------] 79% eta: 26s Batch submitting query [========================>------] 82% eta: 23s Batch submitting query [=========================>-----] 85% eta: 19s Batch submitting query [==========================>----] 88% eta: 16s Batch submitting query [===========================>---] 91% eta: 12s Batch submitting query [============================>--] 94% eta: 8s Batch submitting query [=============================>-] 97% eta: 4s Batch submitting query [===============================] 100% eta: 0s
grep -c ">" CE_non0_pep_Nov11.fasta
16121
rpsblast -query ~/Lymnaea_CNS_transcriptome_files/7_Interspecies_comparison/7e_CElegans/CE_non0_pep_Nov11.fasta -db Kog \
-out ~/Lymnaea_CNS_transcriptome_files/7_Interspecies_comparison/7e_CElegans/CE_CNS_pep_Nov11_KOG.txt -evalue 1E-5 \
-outfmt "6 qseqid sseqid stitle pident length mismatch gapopen qlen qstart qend slen sstart send evalue bitscore qcovhsp qcovs" \
-max_hsps 1 -max_target_seqs 1
## Replace "[" and "]" with "#" for later import as dataframe
sed -i 's/\[/#/g' CE_CNS_pep_Nov11_KOG.txt
sed -i 's/\]./#/g' CE_CNS_pep_Nov11_KOG.txt
import pandas as pd
import os
os.chdir("/home/zhanglab1/ndong/Lymnaea_CNS_transcriptome_files/7_Interspecies_comparison/7e_CElegans")
CE_KOG = pd.read_csv("CE_CNS_pep_Nov11_KOG.txt", sep='#', header=None, engine="python")
type(CE_KOG)
pandas.core.frame.DataFrame
%get CE_KOG --from Python3
head(CE_KOG, 10)
0 | 1 | 2 | |
---|---|---|---|
0 | C23H4.1.4 gnl|CDD|231815 KOG3884, KOG3884, KOG3884, Neural proliferation, differentiation and control protein | Signal transduction mechanisms | 87.150 428 46 4 426 1 425 437 1 422 0.0 617 99 99 |
1 | B0034.3a gnl|CDD|229773 KOG1834, KOG1834, KOG1834, Calsyntenin | Extracellular structures | 50.496 1008 411 21 985 1 976 952 1 952 0.0 1333 99 99 |
2 | C24F3.6.1 gnl|CDD|231480 KOG3544, KOG3544, KOG3544, Collagens (type IV and type XIII), and related proteins | Extracellular structures | 40.323 310 154 5 292 12 291 327 14 322 1.51e-20 87.7 96 96 |
3 | B0244.2 gnl|CDD|228740 KOG0793, KOG0793, KOG0793, Protein tyrosine phosphatase | Signal transduction mechanisms | 28.947 836 472 21 768 1 766 1004 221 1004 0.0 1111 99 99 |
4 | B0336.10.1 gnl|CDD|228847 KOG0901, KOG0901, KOG0901, 60S ribosomal protein L14/L17/L23 | Translation, ribosomal structure and biogenesis | 75.172 145 31 2 141 1 140 145 1 145 6.64e-74 214 99 99 |
5 | C06B8.8.6 gnl|CDD|231437 KOG3499, KOG3499, KOG3499, 60S ribosomal protein L38 | Translation, ribosomal structure and biogenesis | 81.159 69 13 0 71 1 69 69 1 69 1.92e-31 101 97 97 |
6 | C23G10.3 gnl|CDD|231119 KOG3181, KOG3181, KOG3181, 40S ribosomal protein S3 | Translation, ribosomal structure and biogenesis | 70.492 244 72 0 248 3 246 244 1 244 3.17e-154 426 98 98 |
7 | B0365.3.2 gnl|CDD|228152 KOG0203, KOG0203, KOG0203, Na+/K+ ATPase, alpha subunit | Inorganic ion transport and metabolism | 74.326 1001 252 2 997 1 996 1019 19 1019 0.0 1929 99 99 |
8 | C04F12.4.2 gnl|CDD|231359 KOG3421, KOG3421, KOG3421, 60S ribosomal protein L14 | Translation, ribosomal structure and biogenesis | 51.852 135 65 0 136 1 135 136 1 135 2.19e-54 164 99 99 |
9 | C09D4.5.4 gnl|CDD|229637 KOG1696, KOG1696, KOG1696, 60s ribosomal protein L19 | Translation, ribosomal structure and biogenesis | 70.466 193 57 0 199 1 193 193 1 193 2.26e-73 217 97 97 |
## Count the number of occurrences of each category
KOGs= ["RNA processing and modification", "Chromatin structure and dynamics", "Energy production and conversion", "Cell cycle control",
"Amino acid transport and metabolism", "Nucleotide transport and metabolism", "Carbohydrate transport and metabolism", "Coenzyme transport and metabolism",
"Lipid transport and metabolism", "Translation, ribosomal structure and biogenesis", "Transcription", "Replication, recombination and repair",
"Cell wall/membrane/envelope biogenesis", "Cell motility", "Posttranslational modification", "Inorganic ion transport and metabolism",
"Secondary metabolites", "General function prediction only", "Function unknown", "Signal transduction mechanisms", "Intracellular trafficking",
"Defense mechanisms", "Extracellular structures", "Nuclear structure", "Cytoskeleton"]
data = []
for KOG in KOGs:
print(KOG, CE_KOG[1].str.contains(KOG).sum())
data.append([KOG, CE_KOG[1].str.contains(KOG).sum()])
df = pd.DataFrame(data)
df.columns = ["KOG", "Count"]
df["CE_Percentage"] = df["Count"]/df["Count"].sum()*100
print(df)
df[["KOG", "CE_Percentage"]].to_csv("CE_KOG_summary.txt", sep="\t", index=None)
RNA processing and modification 439 Chromatin structure and dynamics 222 Energy production and conversion 386 Cell cycle control 312 Amino acid transport and metabolism 293 Nucleotide transport and metabolism 109 Carbohydrate transport and metabolism 372 Coenzyme transport and metabolism 74 Lipid transport and metabolism 436 Translation, ribosomal structure and biogenesis 500 Transcription 964 Replication, recombination and repair 250 Cell wall/membrane/envelope biogenesis 91 Cell motility 17 Posttranslational modification 1021 Inorganic ion transport and metabolism 425 Secondary metabolites 133 General function prediction only 1414 Function unknown 836 Signal transduction mechanisms 2086 Intracellular trafficking 594 Defense mechanisms 95 Extracellular structures 302 Nuclear structure 57 Cytoskeleton 496 KOG Count CE_Percentage 0 RNA processing and modification 439 3.681650 1 Chromatin structure and dynamics 222 1.861791 2 Energy production and conversion 386 3.237169 3 Cell cycle control 312 2.616572 4 Amino acid transport and metabolism 293 2.457229 5 Nucleotide transport and metabolism 109 0.914123 6 Carbohydrate transport and metabolism 372 3.119758 7 Coenzyme transport and metabolism 74 0.620597 8 Lipid transport and metabolism 436 3.656491 9 Translation, ribosomal structure and biogenesis 500 4.193224 10 Transcription 964 8.084535 11 Replication, recombination and repair 250 2.096612 12 Cell wall/membrane/envelope biogenesis 91 0.763167 13 Cell motility 17 0.142570 14 Posttranslational modification 1021 8.562563 15 Inorganic ion transport and metabolism 425 3.564240 16 Secondary metabolites 133 1.115398 17 General function prediction only 1414 11.858437 18 Function unknown 836 7.011070 19 Signal transduction mechanisms 2086 17.494129 20 Intracellular trafficking 594 4.981550 21 Defense mechanisms 95 0.796713 22 Extracellular structures 302 2.532707 23 Nuclear structure 57 0.478028 24 Cytoskeleton 496 4.159678
CE_1346_non0_Jun30.txt
---> 18,307 IDs## Extract IDs of sequences with TPM>0 in all 4 libraries
> grep -Fwf SRR7443981_non0_Jun30.txt SRR7443984_non0_Jun30.txt > CE_14_non0_Jun30.txt
> grep -Fwf SRR7443983_non0_Jun30.txt CE_14_non0_Jun30.txt > CE_134_non0_Jun30.txt
> grep -Fwf SRR7443976_non0_Jun30.txt CE_134_non0_Jun30.txt > CE_1346_non0_Jun30.txt
## Check output
wc -l CE_1346_non0_Jun30.txt
echo ""
echo "Preview"
head -n 10 CE_1346_non0_Jun30.txt
18307 CE_1346_non0_Jun30.txt Preview F31C3.11 F31C3.9 F31C3.8 F31C3.7 R144.15 Y102A5D.12 Y102A5D.11 Y102A5D.10 Y102A5D.9 Y102A5D.8
CE_CNS_pep_Jun30.fa
---> 17,432 sequences## Extract protein sequences [27923035.bc]
filterbyname.sh in=CE_pep.fa \
out=CE_CNS_pep_Jun30.fa \
names=CE_1346_non0_Jun30.txt \
include=t substring
## KOG analysis [27923036.bc]
> rpsblast -query ~/CNS-transcriptomes/CE_2017/CE_CNS_pep_Jun30.fa -db Kog \
-out ~/CNS-transcriptomes/CE_2017/CE_CNS_pep_Jun30_KOG.txt -evalue 1E-5 \
-outfmt "6 qseqid sseqid stitle pident length mismatch gapopen qlen qstart qend slen sstart send evalue bitscore qcovhsp qcovs" \
-max_hsps 1 -max_target_seqs 1