perl -ane 'if(/\>/){$a++;print ">Locus_FX_$a\n"}else{print;}' FX.fasta > FX_renamed.fasta
trformat.pl -output FX.tr -input FX_renamed.fasta
sed -i -e 's/>Locus_FX_1/>evgLocus_FX_1/g' FX.tr
tr2aacds.pl -mrnaseq ~/FX.tr \
-NCPU=8 1>tr2aacds_FX_Mar31.log 2>tr2aacds_FX_Mar31.err \
-MINCDS=100 -logfile -debug
In ./okayset
folder
awk '/^>/ { p = ($0 ~ /complete/)} p' FX_OA.cds > FX_OA_complete.cds
awk '/^>/ { p = ($0 ~ /partial3/)} p' FX_OA.cds > FX_OA_partial3.cds
cat FX_OA_complete.cds FX_OA_partial3.cds > LS_FX_OA_CP3.cds
salmon index -t LS_FX_OA_CP3.cds -i ~/LS_FX_OA_CP3_salmon_index --type quasi -k 31
salmon quant -i ~/LS_FX_OA_CP3_salmon_index -l A -r filtered_DRR002012_1.cor.fq -o ~/LS_FX_OA_CP3_salmon_quant
## Import libraries
import pandas as pd
import os
os.chdir("/home/zhanglab1/ndong/Lymnaea_CNS_transcriptome_files/5_Compare_LS_CNS_transcriptomes/5b_De_novo_assembly")
## Extract TPM>0 transcript IDs and read counts from Salmon output
def extract_non0(salmon_output_filename):
with open(salmon_output_filename, "r") as infile:
lib = pd.read_csv(infile, sep='\t')
lib_non0 = lib.loc[lib["TPM"] > 0]
lib_non0_counts = lib_non0[["Name", "TPM"]]
return lib_non0_counts
libFX = extract_non0("./LS_FX_OA_CP3_salmon_quant/quant.sf")
libFX.shape
## Write IDs of transcripts with TPM>0 in all libraries to file
libFX.to_csv("quant_LS_FX_OA_CP3_non0.txt")
(35339, 2)
## Extract sequences with TPM>0
faSomeRecords LS_FX_evigene_Mar31/okayset/FX_OA.aa quant_LS_FX_OA_CP3_non0.txt LS_FX_OA_CP3_non0.aa
faSomeRecords LS_FX_evigene_Mar31/okayset/FX_OA.cds quant_LS_FX_OA_CP3_non0.txt LS_FX_OA_CP3_non0.cds
hmmsearch --cut_ga --tblout LS_FX_OA_CP3_non0_Pfam_cutga.txt database/Pfam-A.hmm ~/LS_FX_OA_CP3_non0.aa
import pandas as pd
import os
os.chdir("/home/zhanglab1/ndong/Lymnaea_CNS_transcriptome_files/5_Compare_LS_CNS_transcriptomes/5b_De_novo_assembly")
pfam = pd.read_csv("LS_FX_OA_CP3_non0_Pfam_cutga.txt", delim_whitespace=True, skipfooter=10, header=1, engine='python')
pfam_final = pfam.drop([0])
print(pfam_final.columns)
print(pfam_final["#"].nunique())
Index(['#', 'target', 'name', 'accession', 'query', 'name.1', 'accession.1', 'E-value', 'score', 'bias', 'E-value.1', 'score.1', 'bias.1', 'exp', 'reg', 'clu', 'ov', 'env', 'dom', 'rep', 'inc', 'description', 'of', 'target.1'], dtype='object') 10676
%get pfam_final --from Python3
head(pfam_final, 10)
# | target | name | accession | query | name.1 | accession.1 | E-value | score | bias | ⋯ | reg | clu | ov | env | dom | rep | inc | description | of | target.1 |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
evgLocus_FX_24066 | - | 1-cysPrx_C | PF10417.8 | 3.4e-13 | 50.1 | 1.0 | 4.2e-13 | 49.8 | 0.1 | ⋯ | 2 | 2 | 1 | 1 | type=protein; | aalen=252,35%,complete-utrbad; | clen=2151; | strand=-; | offs=2103-1345; | evgclass=noclass,okay; |
evgLocus_FX_32834 | - | 1-cysPrx_C | PF10417.8 | 5.3e-13 | 49.5 | 0.1 | 1.8e-12 | 47.9 | 0.0 | ⋯ | 2 | 2 | 1 | 1 | type=protein; | aalen=198,51%,complete-utrpoor; | clen=1170; | strand=-; | offs=1006-410; | evgclass=noclass,okay; |
evgLocus_FX_29797 | - | 1-cysPrx_C | PF10417.8 | 2.7e-11 | 44.0 | 0.0 | 4.6e-11 | 43.3 | 0.0 | ⋯ | 1 | 1 | 1 | 1 | type=protein; | aalen=248,52%,complete-utrpoor; | clen=1413; | strand=+; | offs=102-848; | evgclass=noclass,okay; |
evgLocus_FX_31631 | - | 1-cysPrx_C | PF10417.8 | 2.3e-10 | 41.1 | 0.1 | 4.1e-10 | 40.3 | 0.1 | ⋯ | 1 | 1 | 1 | 1 | type=protein; | aalen=220,52%,complete-utrpoor; | clen=1257; | strand=-; | offs=1149-487; | evgclass=noclass,okay; |
evgLocus_FX_18416 | - | 14-3-3 | PF00244.19 | 1.2e-105 | 352.9 | 4.6 | 1.5e-105 | 352.6 | 4.6 | ⋯ | 1 | 1 | 1 | 1 | type=protein; | aalen=258,18%,complete-utrbad; | clen=4107; | strand=+; | offs=132-908; | evgclass=noclass,okay; |
evgLocus_FX_19704 | - | 14-3-3 | PF00244.19 | 3.5e-99 | 331.7 | 11.4 | 4e-99 | 331.5 | 11.4 | ⋯ | 1 | 1 | 1 | 1 | type=protein; | aalen=261,23%,complete-utrbad; | clen=3365; | strand=-; | offs=3284-2499; | evgclass=noclass,okay; |
evgLocus_FX_21724 | - | 14-3-3 | PF00244.19 | 1e-95 | 320.4 | 8.1 | 1.2e-95 | 320.2 | 8.1 | ⋯ | 1 | 1 | 1 | 1 | type=protein; | aalen=255,28%,complete-utrbad; | clen=2662; | strand=-; | offs=2514-1747; | evgclass=noclass,okay; |
evgLocus_FX_18496 | - | 2-Hacid_dh | PF00389.29 | 3.1e-39 | 134.7 | 0.1 | 6.2e-39 | 133.7 | 0.1 | ⋯ | 1 | 1 | 1 | 1 | type=protein; | aalen=535,39%,complete-utrbad; | clen=4039; | strand=+; | offs=157-1764; | evgclass=noclass,okay; |
evgLocus_FX_17939 | - | 2-Hacid_dh | PF00389.29 | 6.6e-30 | 104.5 | 0.0 | 7.6e-30 | 104.3 | 0.0 | ⋯ | 1 | 1 | 1 | 1 | type=protein; | aalen=324,21%,complete-utrbad; | clen=4522; | strand=+; | offs=177-1151; | evgclass=noclass,okay; |
evgLocus_FX_17529 | - | 2-Hacid_dh | PF00389.29 | 4.9e-29 | 101.6 | 0.0 | 5.9e-29 | 101.4 | 0.0 | ⋯ | 1 | 1 | 1 | 1 | type=protein; | aalen=435,25%,complete-utrbad; | clen=5048; | strand=-; | offs=4771-3464; | evgclass=noclass,okay; |
## Split up file into chunks with 10,000 sequences (as required by program)
awk 'BEGIN {n_seq=0;} /^>/ {if(n_seq%10000==0){file=sprintf("myseq%d.fa",n_seq);} print >> \
file; n_seq++; next;} { print >> file; }' < ~/LS_FX_OA_CP3_non0.aa
## Analyses
../signalp -f short -l ./myseq0_log.txt -v -m ~/LS_FX_OA_CP3_non0_signalP_myseq0.fa -s best -t euk ./myseq0.fa
../signalp -f short -l ./myseq10000_log.txt -v -m ~/LS_FX_OA_CP3_non0_signalP_myseq10000.fa -s best -t euk ./myseq10000.fa
../signalp -f short -l ./myseq20000_log.txt -v -m ~/LS_FX_OA_CP3_non0_signalP_myseq20000.fa -s best -t euk ./myseq20000.fa
../signalp -f short -l ./myseq30000_log.txt -v -m ~/LS_FX_OA_CP3_non0_signalP_myseq30000.fa -s best -t euk ./myseq30000.fa
## Combine all predicted mature signaling peptides into one file
cat LS_FX_OA_CP3_non0_signalP_myseq*.fa > LS_FX_OA_CP3_non0_mature_signalP.fa
Output file: Phobius_FX_hits.txt
perl ~/install/tmp/tmpkplvHF/phobius/phobius.pl -short ~/LS_FX_OA_CP3_non0.aa
## Check the results
phobius = pd.read_csv("Phobius_FX_hits.txt", header=None)
print(phobius.shape)
print(phobius[0].nunique())
print(phobius.head(10))
(3450, 1) 3450 0 0 evgLocus_FX_146 1 evgLocus_FX_276 2 evgLocus_FX_435 3 evgLocus_FX_496 4 evgLocus_FX_503 5 evgLocus_FX_1083 6 evgLocus_FX_1596 7 evgLocus_FX_1767 8 evgLocus_FX_1932 9 evgLocus_FX_2200
## Extract IDs of all mature signaling peptides predicted by SignalP
grep '^>' LS_FX_OA_CP3_non0_mature_signalP.fa > LS_FX_OA_CP3_non0_mature_signalP_IDs.txt
## Simplify IDs of all mature signaling peptides predicted by SignalP
sed -i -e 's/type=/#/g' LS_FX_OA_CP3_non0_mature_signalP_IDs.txt
sed 's/\#.*/#/' LS_FX_OA_CP3_non0_mature_signalP_IDs.txt > LS_FX_OA_CP3_non0_mature_signalP_IDs_simplified.txt
sed -i -e 's/#//g' LS_FX_OA_CP3_non0_mature_signalP_IDs_simplified.txt
sed -i -e 's/>//g' LS_FX_OA_CP3_non0_mature_signalP_IDs_simplified.txt
## Find IDs present in both SignalP and Phobius results
fgrep -wf LS_FX_OA_CP3_non0_mature_signalP_IDs_simplified.txt Phobius_FX_hits.txt > Phobius_SignalP_match_LS_FX_non0.txt
## Extract sequences
faSomeRecords ~/LS_FX_evigene_Mar31/okayset/LS_FX_OA_CP3_non0.aa Phobius_SignalP_match_LS_FX_non0.txt Phobius_SignalP_match_LS_FX_non0.aa
## Find sequences that have either >=1 Pfam hit or signaling peptide annotation
cat Phobius_SignalP_match_LS_FX_non0.txt LS_FX_OA_CP3_non0_Pfam_cutga_hits.txt | sort | uniq > Pfam_or_SP_FX_non0.txt
## Extract sequences
faSomeRecords ~/LS_FX_OA_CP3_non0.aa Pfam_or_SP_FX_non0.txt Pfam_or_SP_FX_non0.aa
faSomeRecords ~/LS_FX_OA_CP3_non0.cds Pfam_or_SP_FX_non0.txt Pfam_or_SP_FX_non0.cds