## Current assembly
python ~/seq_length.py Pfam_SP_Apr18.aa > Pfam_SP_Apr18_length.txt
## De novo assembly
python ~/seq_length.py Pfam_or_SP_FX_non0.aa > Pfam_or_SP_FX_non0_length.txt
## EST library
python ~/seq_length.py Pfam_or_SP_FPS_Apr23.aa > Pfam_or_SP_FPS_Apr23_length.txt
import pandas as pd
import os
os.chdir("/home/zhanglab1/ndong/Lymnaea_CNS_transcriptome_files/5_Compare_LS_CNS_transcriptomes/5d_Compare_assemblies")
current_length=pd.read_csv("Pfam_SP_Apr18_length.txt", header=None, sep="\t")
current_length.columns=["Transcript", "AA length"]
de_novo_length=pd.read_csv("Pfam_or_SP_FX_non0_length.txt", header=None, sep="\t")
de_novo_length.columns=["Transcript", "AA length"]
EST_length=pd.read_csv("Pfam_or_SP_FPS_Apr23_length.txt", header=None, sep="\t")
EST_length.columns=["Transcript", "AA length"]
print(current_length.shape)
print(current_length.head(10))
print(current_length["AA length"].mean())
print(current_length["AA length"].median())
print(current_length["AA length"].max())
(17832, 2) Transcript AA length 0 evgLocus_Scallop_AE_100 253 1 evgLocus_Scallop_AE_150 312 2 evgLocus_Scallop_AE_215 829 3 evgLocus_Scallop_AE_379 571 4 evgLocus_Scallop_AE_483 499 5 evgLocus_Scallop_AE_519 435 6 evgLocus_Scallop_AE_548 86 7 evgLocus_Scallop_AE_727 455 8 evgLocus_Scallop_AE_898 148 9 evgLocus_Scallop_AE_1174 202 569.2954800358905 415.0 13109
print(de_novo_length.shape)
print(de_novo_length.head(10))
print(de_novo_length["AA length"].mean())
print(de_novo_length["AA length"].median())
print(de_novo_length["AA length"].max())
(11742, 2) Transcript AA length 0 evgLocus_FX_146 51 1 evgLocus_FX_496 42 2 evgLocus_FX_1767 42 3 evgLocus_FX_2332 35 4 evgLocus_FX_2952 36 5 evgLocus_FX_3081 48 6 evgLocus_FX_4240 52 7 evgLocus_FX_4690 39 8 evgLocus_FX_5159 41 9 evgLocus_FX_5667 51 469.95332992675867 353.0 8195
print(EST_length.shape)
print(EST_length.head(10))
print(EST_length["AA length"].mean())
print(EST_length["AA length"].median())
print(EST_length["AA length"].max())
(1946, 2) Transcript AA length 0 evgLocus_FPS_17 249 1 evgLocus_FPS_131 227 2 evgLocus_FPS_318 221 3 evgLocus_FPS_350 260 4 evgLocus_FPS_392 122 5 evgLocus_FPS_449 84 6 evgLocus_FPS_451 200 7 evgLocus_FPS_535 215 8 evgLocus_FPS_601 180 9 evgLocus_FPS_623 264 196.72096608427543 209.0 313
import seaborn as sns
sns.distplot(current_length["AA length"], hist=False, kde=True,
kde_kws = {'shade': True, 'linewidth': 1}, color = 'lightgreen')
sns.distplot(de_novo_length["AA length"], hist=False, kde=True,
kde_kws = {'shade': True,'linewidth': 1}, color = 'red')
sns.distplot(EST_length["AA length"], hist=False, kde=True,
kde_kws = {'shade': True,'linewidth': 1}, color = 'blue')
<matplotlib.axes._subplots.AxesSubplot at 0x7fd4a385da20>
## Current assembly: C:92.7%, S:82.1%, D:10.6%, F:0.3%, M:7.0%, n:978
### Results in ./run_BUSCO_LS_Pfam_SP_prot_Apr25/
python run_BUSCO.py -i ~/Pfam_SP_Apr18.aa \
-o BUSCO_LS_Pfam_SP_prot_Apr25 \
-l ~/metazoa_odb9/ -m proteins -c 10 -e 1e-05
## De novo assembly: C:91.5%, S:90.5%, D:1.0%, F:1.3%, M:7.2%, n:978
### Results in ./run_BUSCO_Pfam_or_SP_FX_non0/
python run_BUSCO.py \
-i ~/Lymnaea_CNS_transcriptome_files/5_Compare_LS_CNS_transcriptomes/5b_De_novo_assembly/Pfam_or_SP_FX_non0.aa \
-o BUSCO_Pfam_or_SP_FX_non0 \
-l ~/metazoa_odb9/ -m proteins -c 8 -e 1e-05
## EST library: C:13.4%, S:12.7%, D:0.7%, F:6.2%, M:80.4%, n:978
### Results in ./run_BUSCO_Pfam_or_SP_FPS_non0_Apr23/
python run_BUSCO.py \
-i ~/Lymnaea_CNS_transcriptome_files/5_Compare_LS_CNS_transcriptomes/5c_EST_library/Pfam_or_SP_FPS_Apr23.aa \
-o BUSCO_Pfam_or_SP_FPS_non0_Apr23 \
-l ~/metazoa_odb9/ -m proteins -c 8 -e 1e-05
## Current assembly
diamond blastp -d ~/nr_diamond.dmnd \
-q Pfam_SP_Apr18.aa -o ~/Pfam_SP_Apr18_nr.txt --max-target-seqs 1 --max-hsps 1 \
-f 6 qseqid qtitle sseqid stitle pident length mismatch gapopen qlen qstart qend slen sstart send evalue bitscore qcovhsp \
--more-sensitive --evalue 1E-5 --unal 1 -p 8
## De novo assembly
diamond blastp -d ~/nr_diamond.dmnd \
-q Pfam_or_SP_FX_non0.aa -o ~/Pfam_or_SP_FX_non0_nr.txt \
--max-target-seqs 1 --max-hsps 1 \
-f 6 qseqid qtitle sseqid stitle pident length mismatch gapopen qlen qstart qend slen sstart send evalue bitscore qcovhsp \
--more-sensitive --evalue 1E-5 --unal 1 -p 10
## EST library
diamond blastp -d ~/nr_diamond.dmnd \
-q Pfam_or_SP_FPS_Apr23.aa -o ~/Pfam_or_SP_FPS_Apr23_nr.txt \
--max-target-seqs 1 --max-hsps 1 \
-f 6 qseqid qtitle sseqid stitle pident length mismatch gapopen qlen qstart qend slen sstart send evalue bitscore qcovhsp \
--more-sensitive --evalue 1E-5 --unal 1 -p 10
current_blast=pd.read_csv("Pfam_SP_Apr18_nr.txt", header=None, sep="\t")
current_blast.columns = ["qseqid", "qtitle", "sseqid", "stitle", "pident", "length", "mismatch", "gapopen", "qlen", "qstart", "qend", "slen", "sstart",
"send", "evalue", "bitscore", "qcovhsp"]
print(current_blast.shape)
current_blast_final = current_blast[~current_blast["length"].astype(str).str.contains("-1")]
print(current_blast_final.shape)
print(current_blast_final["sseqid"].nunique())
(17832, 17) (15501, 17) 11507
%get current_blast_final --from Python3
head(current_blast_final, 10)
qseqid | qtitle | sseqid | stitle | pident | length | mismatch | gapopen | qlen | qstart | qend | slen | sstart | send | evalue | bitscore | qcovhsp | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | evgLocus_Scallop_AE_100 | evgLocus_Scallop_AE_100 type=protein; aalen=253,60%,complete; clen=1253; strand=+; offs=98-859; evgclass=althi,okay,match:evgLocus_scallop_AG_117,pct:100/96/.; | XP_013070549.1 | XP_013070549.1 PREDICTED: proteasome activator complex subunit 3-like [Biomphalaria glabrata] | 91.4 | 256 | 17 | 3 | 253 | 1 | 253 | 254 | 1 | 254 | 8.1e-123 | 449.1 | 100.0 |
1 | evgLocus_Scallop_AE_150 | evgLocus_Scallop_AE_150 type=protein; aalen=312,91%,complete; clen=1027; strand=+; offs=88-1026; evgclass=althi1,okay,match:evgLocus_Trinity_AG_5547,pct:100/100/.; | XP_013096962.1 | XP_013096962.1 PREDICTED: ribonuclease P protein subunit p30-like [Biomphalaria glabrata] | 55.8 | 312 | 123 | 2 | 312 | 1 | 312 | 302 | 1 | 297 | 6.3e-85 | 323.6 | 100.0 |
2 | evgLocus_Scallop_AE_215 | evgLocus_Scallop_AE_215 type=protein; aalen=829,98%,complete; clen=2540; strand=+; offs=42-2531; evgclass=althi1,okay,match:evgLocus_Trinity_AF_5170,pct:100/100/.; | XP_013084140.1 | XP_013084140.1 PREDICTED: elongator complex protein 2-like [Biomphalaria glabrata] | 69.7 | 795 | 235 | 5 | 829 | 1 | 792 | 800 | 1 | 792 | 0.0e+00 | 1164.8 | 95.5 |
3 | evgLocus_Scallop_AE_379 | evgLocus_Scallop_AE_379 type=protein; aalen=571,70%,complete; clen=2427; strand=+; offs=93-1808; evgclass=althi1,okay,match:evgLocus_Trinity_RF_Nov18_9867,pct:100/100/.; | XP_005108749.1 | XP_005108749.1 PREDICTED: dnaJ homolog subfamily C member 21-like [Aplysia californica] | 59.6 | 641 | 188 | 5 | 571 | 1 | 571 | 640 | 1 | 640 | 4.6e-142 | 514.2 | 100.0 |
4 | evgLocus_Scallop_AE_483 | evgLocus_Scallop_AE_483 type=protein; aalen=499,78%,complete; clen=1916; strand=+; offs=164-1663; evgclass=althi1,okay,match:evgLocus_strawberry_AE_715,pct:100/98/.; | XP_013073089.1 | XP_013073089.1 PREDICTED: tubby-related protein 3-like isoform X3 [Biomphalaria glabrata] | 76.6 | 499 | 109 | 6 | 499 | 1 | 497 | 506 | 1 | 493 | 1.1e-205 | 725.3 | 99.6 |
5 | evgLocus_Scallop_AE_519 | evgLocus_Scallop_AE_519 type=protein; aalen=435,69%,complete; clen=1871; strand=+; offs=120-1427; evgclass=althi1,okay,match:evgLocus_Trinity_AE_121368,pct:100/100/.; | XP_013087266.1 | XP_013087266.1 PREDICTED: peptidase M20 domain-containing protein 2-like [Biomphalaria glabrata] | 71.4 | 392 | 112 | 0 | 435 | 37 | 428 | 440 | 46 | 437 | 2.8e-168 | 600.9 | 90.1 |
7 | evgLocus_Scallop_AE_727 | evgLocus_Scallop_AE_727 type=protein; aalen=455,63%,complete; clen=2158; strand=+; offs=38-1405; evgclass=althi1,okay,match:evgLocus_Trinity_AF_48007,pct:100/96/.; | XP_013062936.1 | XP_013062936.1 PREDICTED: transducin beta-like protein 2 [Biomphalaria glabrata] | 81.2 | 447 | 83 | 1 | 455 | 9 | 455 | 459 | 13 | 458 | 1.7e-200 | 708.0 | 98.2 |
8 | evgLocus_Scallop_AE_898 | evgLocus_Scallop_AE_898 type=protein; aalen=148,27%,complete-utrbad; clen=1601; strand=+; offs=509-955; evgclass=altmidfrag,okay,match:evgLocus_Trinity_AG_36866,pct:98/63/./evgLocus_stringtie_AG_51898; | XP_012936084.1 | XP_012936084.1 PREDICTED: neurobeachin-like protein 1 [Aplysia californica] | 75.3 | 89 | 22 | 0 | 148 | 59 | 147 | 2868 | 954 | 1042 | 2.5e-31 | 144.4 | 60.1 |
9 | evgLocus_Scallop_AE_1174 | evgLocus_Scallop_AE_1174 type=protein; aalen=202,73%,complete; clen=829; strand=+; offs=21-629; evgclass=althi1,okay,match:evgLocus_strawberry_AF_1713,pct:100/84/.; | XP_005111914.2 | XP_005111914.2 PREDICTED: uncharacterized protein LOC101858984 [Aplysia californica] | 76.8 | 155 | 36 | 0 | 202 | 16 | 170 | 1175 | 1 | 155 | 2.4e-69 | 271.2 | 76.7 |
10 | evgLocus_Scallop_AE_1234 | evgLocus_Scallop_AE_1234 type=protein; aalen=1037,85%,complete; clen=3624; strand=+; offs=314-3427; evgclass=althi1,okay,match:evgLocus_Trinity_AH_108909,pct:100/100/.; | XP_011431159.1 | XP_011431159.1 PREDICTED: uncharacterized protein LOC105330915 isoform X3 [Crassostrea gigas] | 53.1 | 813 | 338 | 11 | 1037 | 10 | 798 | 1120 | 11 | 804 | 8.4e-227 | 796.6 | 76.1 |
de_novo_blast=pd.read_csv("Pfam_or_SP_FX_non0_nr.txt", header=None, sep="\t")
de_novo_blast.columns = ["qseqid", "qtitle", "sseqid", "stitle", "pident", "length", "mismatch", "gapopen", "qlen", "qstart", "qend", "slen", "sstart",
"send", "evalue", "bitscore", "qcovhsp"]
print(de_novo_blast.shape)
de_novo_blast_final = de_novo_blast[~de_novo_blast["length"].astype(str).str.contains("-1")]
print(de_novo_blast_final.shape)
print(de_novo_blast_final["sseqid"].nunique())
(11742, 17) (10631, 17) 9708
EST_blast=pd.read_csv("Pfam_or_SP_FPS_Apr23_nr.txt", header=None, sep="\t")
EST_blast.columns = ["qseqid", "qtitle", "sseqid", "stitle", "pident", "length", "mismatch", "gapopen", "qlen", "qstart", "qend", "slen", "sstart",
"send", "evalue", "bitscore", "qcovhsp"]
print(EST_blast.shape)
EST_blast_final = EST_blast[~EST_blast["length"].astype(str).str.contains("-1")]
print(EST_blast_final.shape)
print(EST_blast_final["sseqid"].nunique())
(1946, 17) (1820, 17) 1587
current_unique_hits = current_blast_final["sseqid"].unique()
print(current_unique_hits.shape)
de_novo_unique_hits = de_novo_blast_final["sseqid"].unique()
print(de_novo_unique_hits.shape)
EST_unique_hits = EST_blast_final["sseqid"].unique()
print(EST_unique_hits.shape)
(11507,) (9708,) (1587,)
%matplotlib inline
from matplotlib_venn import venn2, venn2_circles
from matplotlib_venn import venn3, venn3_circles
from matplotlib import pyplot as plt
lst1 = current_unique_hits
lst2 = de_novo_unique_hits
lst3 = EST_unique_hits
venn3([set(lst1), set(lst2), set(lst3)], set_labels = ('Current assembly', 'De novo assembly', 'EST library'))
plt.title('Comparison')
Text(0.5,1,'Comparison')