Pfam_channel_IDs.txt
---> 69 Pfam IDs (include "ion_trans" [PF00520])
## Pfam analysis of aggregated LS protein-coding transcripts [27853611.bc]
hmmsearch --cut_ga --tblout LS_all_CDHIT_Pfam_cutga.txt database/Pfam-A.hmm ~/LS_all_CDHIT.fasta
## Extract sequences that contain >=1 channel domains ---> 402 sequence IDs (not all unique)
grep -f Pfam_channel_IDs.txt LS_all_CDHIT_Pfam_cutga.txt > LS_all_CDHIT_Pfam_chnl_hits_Apr24.txt
## Extract unique sequence IDs ---> 247 unique sequence IDs
## NOTE(review): the command below reads LS_all_CDHIT_Pfam_chnl_hits_ID_Apr24.txt, but the grep above writes
## LS_all_CDHIT_Pfam_chnl_hits_Apr24.txt (no "_ID"). The step that creates the "_ID" file is not recorded here;
## presumably the sequence-ID column was extracted from the grep output -- confirm and add that command.
sort LS_all_CDHIT_Pfam_chnl_hits_ID_Apr24.txt | uniq > LS_all_CDHIT_Pfam_chnl_unique_hits_Apr24.txt
## Extract sequences for TMHMM and Phobius analysis ---> 247 sequences
faSomeRecords LS_all_CDHIT.fasta LS_all_CDHIT_Pfam_chnl_unique_hits_Apr24.txt LS_all_CDHIT_Pfam_chnl_unique_hits_Apr24.aa
## Confirm above analyses in Linux
import pandas as pd
import os

os.chdir("/home/zhanglab1/ndong/Lymnaea_CNS_transcriptome_files/8_Identify_LS_CNS_channels_receptors")

# Parse the hmmsearch --tblout table as whitespace-separated text.
# sep=r"\s+" is the supported spelling of the deprecated delim_whitespace=True.
# header=1 takes the second line of the file as column labels and
# skipfooter=10 ignores the last 10 lines (presumably hmmsearch's trailing
# "#" summary lines -- confirm against the file).
pfam = pd.read_csv(
    "LS_all_CDHIT_Pfam_cutga.txt",
    sep=r"\s+",
    skipfooter=10,
    header=1,
    engine="python",
)
# Drop the first parsed row (presumably the dashed separator under the
# header line -- confirm).
pfam_final = pfam.drop(0)
print(pfam_final.shape)

# Split the accession on "." so the first part can be compared with the IDs
# in Pfam_channel_IDs.txt; the second part is kept in "accession_extra"
# (presumably the Pfam version suffix).
pfam_final[["accession", "accession_extra"]] = pfam_final["accession"].str.split('.', expand=True)

# One Pfam ID per line, no header row.
pfam_IDs = pd.read_csv("Pfam_channel_IDs.txt", header=None)
IDs_list = pfam_IDs[0].tolist()

# Rows whose accession is one of the channel Pfam IDs.
chnl_transcripts = pfam_final[pfam_final['accession'].isin(IDs_list)]
print(chnl_transcripts.shape)
# NOTE(review): because the header line is split on whitespace, the column
# labelled "#" appears to hold the sequence (target) ID -- verify.
print(chnl_transcripts["#"].nunique())
(31063, 24) (402, 25) 247
perl ~/install/tmp/tmpkplvHF/phobius/phobius.pl -short ~/LS_all_CDHIT_Pfam_chnl_unique_hits_Apr24.aa > Phobius_LS_all_CDHIT_Pfam_chnl_unique_hits_Apr24.txt
import pandas as pd

# Phobius "-short" output is whitespace-separated; sep=r"\s+" is the
# supported spelling of the deprecated delim_whitespace=True.
phobius = pd.read_csv(
    "Phobius_LS_all_CDHIT_Pfam_chnl_unique_hits_Apr24.txt",
    sep=r"\s+",
    engine='python',
)
print(phobius.head(10))
print(phobius.shape)

# NOTE(review): in the printed preview the column labelled "SEQENCE" holds
# the transcript IDs and the column labelled "ID" holds an integer, while
# "PREDICTION" is all NaN. Presumably the header's "SEQENCE ID" was split
# into two labels, so "ID" is really the predicted TM count -- verify.
# Keep rows where that integer is non-zero.
phobius_TM = phobius[phobius["ID"] != 0]
print(phobius_TM.shape)

# Transcript IDs of the retained rows (see the note on shifted labels above).
phobius_TM_transcript = phobius_TM["SEQENCE"]
SEQENCE ID TM \ 0 evgLocus_FPS_1491 1 0 1 evgLocus_FX_16361 6 0 2 evgLocus_FX_16447 29 0 3 evgLocus_FX_16654 8 0 4 evgLocus_FX_16787 4 0 5 evgLocus_FX_16972 5 0 6 evgLocus_FX_17005 6 0 7 evgLocus_FX_17076 5 0 8 evgLocus_FX_17230 6 0 9 evgLocus_FX_17289 1 Y SP PREDICTION 0 o47-64i NaN 1 o4638-4658i4834-4852o4903-4925i5045-5070o5093-... NaN 2 o12-44i56-72o103-121i153-173o179-196i208-228o2... NaN 3 i244-267o279-297i352-377o389-408i435-458o591-6... NaN 4 o275-298i305-323o335-358i578-598o NaN 5 o795-814i826-847o867-885i897-922o1087-1109i NaN 6 i27-48o108-127i139-164o184-210i222-239o251-275i NaN 7 i330-349o914-935i956-973o985-1010i1172-1197o NaN 8 i194-218o259-281i288-306o322-341i362-383o419-443i NaN 9 n15-26c35/36o1251-1270i NaN (247, 5) (228, 5)
TMHMM was run on the web server ---> Results are in TMHMM_LS_all_CDHIT_Pfam_chnl_unique_hits_Apr24.csv
# Load the TMHMM results; header=1 uses the second line of the CSV as the
# column labels.
TMHMM = pd.read_csv(
    "TMHMM_LS_all_CDHIT_Pfam_chnl_unique_hits_Apr24.csv",
    sep=",",
    header=1,
)
print(TMHMM.head(10))

# Keep every row whose PredHel field is not the literal string "PredHel=0".
has_helix = TMHMM["PredHel"].ne("PredHel=0")
TMHMM_TM = TMHMM.loc[has_helix]
print(TMHMM_TM.shape)

# IDs of the retained rows, as a pandas Series.
TMHMM_TM_transcripts = TMHMM_TM.loc[:, "ID"]
print(type(TMHMM_TM_transcripts))
ID Length ExpAA First60 PredHel \ 0 evgLocus_FPS_1491 len=91 ExpAA=13.34 First60=10.16 PredHel=1 1 evgLocus_FX_16361 len=5293 ExpAA=132.82 First60=0.00 PredHel=6 2 evgLocus_FX_16447 len=1947 ExpAA=581.37 First60=32.04 PredHel=24 3 evgLocus_FX_16654 len=1157 ExpAA=158.43 First60=0.00 PredHel=7 4 evgLocus_FX_16787 len=604 ExpAA=98.57 First60=10.79 PredHel=4 5 evgLocus_FX_16972 len=1549 ExpAA=88.02 First60=0.33 PredHel=4 6 evgLocus_FX_17005 len=348 ExpAA=129.57 First60=22.50 PredHel=6 7 evgLocus_FX_17076 len=1504 ExpAA=86.72 First60=0.00 PredHel=4 8 evgLocus_FX_17230 len=549 ExpAA=122.79 First60=0.01 PredHel=6 9 evgLocus_FX_17289 len=1271 ExpAA=19.59 First60=6.55 PredHel=0 Topology 0 Topology=o47-64i 1 Topology=o4638-4660i4830-4852o4903-4925i5045-5... 2 Topology=i13-35o50-72i153-175o179-201i208-230o... 3 Topology=i242-264o279-297i352-374o389-408i435-... 4 Topology=o275-297i304-323o338-360i578-600o 5 Topology=o828-850i867-886o901-923i1079-1101o 6 Topology=i27-49o110-129i141-163o183-205i217-23... 7 Topology=o913-935i955-974o989-1011i1172-1194o 8 Topology=i194-216o259-281i288-307o322-341i362-... 9 Topology=o (219, 6) <class 'pandas.core.series.Series'>
# Keep the Phobius-retained transcript IDs that are also present among the
# TMHMM-retained IDs.
TM_transcripts = phobius_TM_transcript[phobius_TM_transcript.isin(TMHMM_TM_transcripts)]
print(TM_transcripts.shape)
# Write one ID per line with no index and no header line: this file is used
# as the name list for faSomeRecords, so a header label would be read as a
# (non-existent) sequence name. header=False is stated explicitly because the
# Series.to_csv default for it differs between pandas versions.
TM_transcripts.to_csv("LS_predicted_CNS_chnls_Apr24.txt", index=False, header=False)
(219,)
LS_predicted_CNS_chnls_Apr24.aa
---> 219 sequences
faSomeRecords LS_all_CDHIT.fasta LS_predicted_CNS_chnls_Apr24.txt LS_predicted_CNS_chnls_Apr24.aa
LS_predicted_CNS_chnls_Nr_hits.tsv
, which is taken from LS_all_CDHIT_nr_Apr25.txt
change_ID.py
is used to change the IDs in LS_predicted_CNS_chnls_Apr24.aa
to include the Nr hit.
### change_ID.py used
# change_ID.py -- rewrite the sequence IDs in a FASTA file using a
# two-column, TAB-separated mapping (original ID -> replacement ID).
import re

# The ID of a FASTA record: the first run of non-blank characters after ">".
_HEADER_ID = re.compile(r"^>(\S+)")


def load_id_map(tsv_path):
    """Return a dict mapping each original ID to its replacement ID.

    The file is read line by line and split on TAB; the first field is the
    original ID and the second field is the replacement. Lines with fewer
    than two fields (e.g. blank lines) are skipped instead of raising
    IndexError.
    """
    id_map = {}
    with open(tsv_path, "r") as id_file:
        for line in id_file:
            fields = line.strip().split("\t")
            if len(fields) < 2:
                continue
            id_map[fields[0]] = fields[1]
    return id_map


def relabel_fasta_lines(lines, id_map):
    """Return a copy of *lines* with mapped header IDs replaced.

    Only header lines (starting with ">") are touched, and only when the
    whole ID token is a key of *id_map*. Matching the whole token, rather
    than replacing substrings, keeps an ID that is a prefix of another ID
    (or that reappears inside a replacement) from corrupting other headers.
    Anything after the ID on the header line, and every sequence line, is
    returned unchanged.
    """
    def swap(match):
        seq_id = match.group(1)
        return ">" + id_map.get(seq_id, seq_id)

    return [
        _HEADER_ID.sub(swap, line) if line.startswith(">") else line
        for line in lines
    ]


def main():
    """Relabel LS_predicted_CNS_chnls_Apr24.aa and write the result to a new file."""
    id_map = load_id_map("LS_predicted_CNS_chnls_Nr_hits.tsv")
    # The input is only read, so plain "r" mode is sufficient.
    with open("LS_predicted_CNS_chnls_Apr24.aa", "r") as infile:
        content = infile.readlines()
    new_content = relabel_fasta_lines(content, id_map)
    # Write replaced content to a new file
    with open("LS_predicted_CNS_chnls_Nr_IDs_Apr24.aa", "w") as outfile:
        outfile.writelines(new_content)


if __name__ == "__main__":
    main()
LS_predicted_CNS_chnls_Nr_IDs_Apr24.fa
is the same as LS_predicted_CNS_chnls_Nr_IDs_Apr24.aa
(renamed for filterbyname.sh)
## TRP chnls
filterbyname.sh in=LS_predicted_CNS_chnls_Nr_IDs_Apr24.fa out=LS_TRP_chnls_Apr27.fa include=t names=LS_TRP_IDs_Apr27.txt substring
## K+ chnls
filterbyname.sh in=LS_predicted_CNS_chnls_Nr_IDs_Apr24.fa out=LS_K_chnls_Apr27.fa include=t names=LS_K_chnl_IDs_Apr27.txt substring
## Ca2+ chnls
filterbyname.sh in=LS_predicted_CNS_chnls_Nr_IDs_Apr24.fa out=LS_Ca_chnls_Apr27.fa include=t names=LS_Ca_chnl_IDs_Apr27.txt substring
## Cation chnls
filterbyname.sh in=LS_predicted_CNS_chnls_Nr_IDs_Apr24.fa out=LS_cation_chnls_Apr27.fa include=t names=LS_cation_chnl_IDs_Apr27.txt substring
## Cl- chnls
filterbyname.sh in=LS_predicted_CNS_chnls_Nr_IDs_Apr24.fa out=LS_Cl_chnls_Apr27.fa include=t names=LS_Cl_chnl_IDs.txt substring
## Na+ chnls
filterbyname.sh in=LS_predicted_CNS_chnls_Nr_IDs_Apr24.fa out=LS_Na_chnls_Apr27.fa include=t names=LS_Na_chnl_IDs_Apr27.txt substring
## Neurotransmitter receptors
filterbyname.sh in=LS_predicted_CNS_chnls_Nr_IDs_Apr24.fa out=LS_NTRs_Apr27.fa include=t names=LS_NTR_IDs_Apr27.txt substring
## NOTE(review): the glob LS_*_Apr27.fa also matches the output name LS_all_chnl_Apr27.fa, so if that file
## already exists (e.g. on a re-run) it is listed as an input as well -- remove it first and re-check the count.
cat LS_*_Apr27.fa > LS_all_chnl_Apr27.fa
grep -c ">" LS_all_chnl_Apr27.fa
211
## Import libraries and set working directory
import pandas as pd
import os
os.chdir("/home/zhanglab1/ndong/Lymnaea_CNS_transcriptome_files/8_Identify_LS_CNS_channels_receptors")
blastp \
-query ~/Lymnaea_CNS_transcriptome_files/8_Identify_LS_CNS_channels_receptors/LS_all_chnl_Apr27.fa \
-subject ~/Lymnaea_CNS_transcriptome_files/7_Interspecies_comparison/7a_Drosophila/DM_non0_pep_Nov11.fasta \
-out ~/Lymnaea_CNS_transcriptome_files/8_Identify_LS_CNS_channels_receptors/LS_chnls_DM_BLASTP_Nov25.txt \
-evalue 1E-5 \
-outfmt 6 \
-max_hsps 1 \
-max_target_seqs 1
# Load the tab-separated BLASTP table (no header row in the file) and label
# its twelve columns.
DM_blastp_chnls = pd.read_csv(
    "LS_chnls_DM_BLASTP_Nov25.txt",
    sep="\t",
    header=None,
)
DM_blastp_chnls.columns = [
    "qseqid", "sseqid", "pident", "length", "mismatch", "gapopen",
    "qstart", "qend", "sstart", "send", "evalue", "bitscore",
]
# Keep the query ID and bit score only, relabelled as "ID" and "DM".
DM = DM_blastp_chnls.loc[:, ["qseqid", "bitscore"]].rename(
    columns={"qseqid": "ID", "bitscore": "DM"}
)
%get DM --from Python3
dim(DM)
head(DM, 10)
ID | DM | |
---|---|---|
0 | evgLocus_FX_16361_XP_013065477.1_PREDICTED:_ryanodine_receptor_44F-like_[Biomphalaria_glabrata] | 3670.0 |
1 | evgLocus_FX_17353_AAO83841.1_voltage-dependent_non-L-type_calcium_channel_alpha-1_subunit_isoform_A_[Lymnaea_stagnalis] | 1124.0 |
2 | evgLocus_FX_19019_XP_012943292.1_PREDICTED:_two_pore_calcium_channel_protein_2-like_[Aplysia_californica] | 94.4 |
3 | evgLocus_FX_21501_XP_005089220.2_PREDICTED:_two_pore_calcium_channel_protein_1-like_[Aplysia_californica] | 102.0 |
4 | evgLocus_FX_35751_XP_005090384.1_PREDICTED:_protein_orai-2-like_[Aplysia_californica] | 206.0 |
5 | evgLocus_FX_42627_AAO83838.2_voltage-dependent_L-type_calcium_channel_alpha-1_subunit_isoform_a_[Lymnaea_stagnalis] | 299.0 |
6 | evgLocus_scallop_AG_24453_XP_013069789.1_PREDICTED:_two_pore_calcium_channel_protein_1-like_[Biomphalaria_glabrata] | 52.4 |
7 | evgLocus_strawberry_AE_27300_AAO83843.2_voltage-dependent_T-type_calcium_channel_alpha-1_subunit_isoform_A_[Lymnaea_stagnalis] | 898.0 |
8 | evgLocus_strawberry_AG_27619_AAO83843.2_voltage-dependent_T-type_calcium_channel_alpha-1_subunit_isoform_A_[Lymnaea_stagnalis] | 896.0 |
9 | evgLocus_Strawberry_DRR_59689_AAO83841.1_voltage-dependent_non-L-type_calcium_channel_alpha-1_subunit_isoform_A_[Lymnaea_stagnalis] | 1314.0 |
blastp \
-query ~/Lymnaea_CNS_transcriptome_files/8_Identify_LS_CNS_channels_receptors/LS_all_chnl_Apr27.fa \
-subject ~/Lymnaea_CNS_transcriptome_files/7_Interspecies_comparison/7b_Zebrafish/DR_non0_pep_Nov11.fasta \
-out ~/Lymnaea_CNS_transcriptome_files/8_Identify_LS_CNS_channels_receptors/LS_chnls_DR_BLASTP_Nov25.txt \
-evalue 1E-5 \
-outfmt 6 \
-max_hsps 1 \
-max_target_seqs 1
# Load the tab-separated BLASTP table (no header row in the file) and label
# its twelve columns.
DR_blastp_chnls = pd.read_csv(
    "LS_chnls_DR_BLASTP_Nov25.txt",
    sep="\t",
    header=None,
)
DR_blastp_chnls.columns = [
    "qseqid", "sseqid", "pident", "length", "mismatch", "gapopen",
    "qstart", "qend", "sstart", "send", "evalue", "bitscore",
]
# Keep the query ID and bit score only, relabelled as "ID" and "DR".
DR = DR_blastp_chnls.loc[:, ["qseqid", "bitscore"]].rename(
    columns={"qseqid": "ID", "bitscore": "DR"}
)
%get DR --from Python3
dim(DR)
head(DR, 10)
ID | DR | |
---|---|---|
0 | evgLocus_FX_16361_XP_013065477.1_PREDICTED:_ryanodine_receptor_44F-like_[Biomphalaria_glabrata] | 2325 |
1 | evgLocus_FX_17353_AAO83841.1_voltage-dependent_non-L-type_calcium_channel_alpha-1_subunit_isoform_A_[Lymnaea_stagnalis] | 1035 |
2 | evgLocus_FX_19019_XP_012943292.1_PREDICTED:_two_pore_calcium_channel_protein_2-like_[Aplysia_californica] | 523 |
3 | evgLocus_FX_21501_XP_005089220.2_PREDICTED:_two_pore_calcium_channel_protein_1-like_[Aplysia_californica] | 677 |
4 | evgLocus_FX_35751_XP_005090384.1_PREDICTED:_protein_orai-2-like_[Aplysia_californica] | 223 |
5 | evgLocus_FX_42627_AAO83838.2_voltage-dependent_L-type_calcium_channel_alpha-1_subunit_isoform_a_[Lymnaea_stagnalis] | 275 |
6 | evgLocus_scallop_AG_24453_XP_013069789.1_PREDICTED:_two_pore_calcium_channel_protein_1-like_[Biomphalaria_glabrata] | 167 |
7 | evgLocus_strawberry_AE_27300_AAO83843.2_voltage-dependent_T-type_calcium_channel_alpha-1_subunit_isoform_A_[Lymnaea_stagnalis] | 1023 |
8 | evgLocus_strawberry_AG_27619_AAO83843.2_voltage-dependent_T-type_calcium_channel_alpha-1_subunit_isoform_A_[Lymnaea_stagnalis] | 1014 |
9 | evgLocus_Strawberry_DRR_59689_AAO83841.1_voltage-dependent_non-L-type_calcium_channel_alpha-1_subunit_isoform_A_[Lymnaea_stagnalis] | 1339 |
blastp \
-query ~/Lymnaea_CNS_transcriptome_files/8_Identify_LS_CNS_channels_receptors/LS_all_chnl_Apr27.fa \
-subject ~/Lymnaea_CNS_transcriptome_files/7_Interspecies_comparison/7c_Mouse/MM_non0_pep_Nov11.fasta \
-out ~/Lymnaea_CNS_transcriptome_files/8_Identify_LS_CNS_channels_receptors/LS_chnls_MM_BLASTP_Nov25.txt \
-evalue 1E-5 \
-outfmt 6 \
-max_hsps 1 \
-max_target_seqs 1
# Load the tab-separated BLASTP table (no header row in the file) and label
# its twelve columns.
MM_blastp_chnls = pd.read_csv(
    "LS_chnls_MM_BLASTP_Nov25.txt",
    sep="\t",
    header=None,
)
MM_blastp_chnls.columns = [
    "qseqid", "sseqid", "pident", "length", "mismatch", "gapopen",
    "qstart", "qend", "sstart", "send", "evalue", "bitscore",
]
# Keep the query ID and bit score only, relabelled as "ID" and "MM".
MM = MM_blastp_chnls.loc[:, ["qseqid", "bitscore"]].rename(
    columns={"qseqid": "ID", "bitscore": "MM"}
)
%get MM --from Python3
dim(MM)
head(MM, 10)
ID | MM | |
---|---|---|
0 | evgLocus_FX_16361_XP_013065477.1_PREDICTED:_ryanodine_receptor_44F-like_[Biomphalaria_glabrata] | 2673 |
1 | evgLocus_FX_17353_AAO83841.1_voltage-dependent_non-L-type_calcium_channel_alpha-1_subunit_isoform_A_[Lymnaea_stagnalis] | 994 |
2 | evgLocus_FX_19019_XP_012943292.1_PREDICTED:_two_pore_calcium_channel_protein_2-like_[Aplysia_californica] | 182 |
3 | evgLocus_FX_21501_XP_005089220.2_PREDICTED:_two_pore_calcium_channel_protein_1-like_[Aplysia_californica] | 654 |
4 | evgLocus_FX_35751_XP_005090384.1_PREDICTED:_protein_orai-2-like_[Aplysia_californica] | 226 |
5 | evgLocus_FX_42627_AAO83838.2_voltage-dependent_L-type_calcium_channel_alpha-1_subunit_isoform_a_[Lymnaea_stagnalis] | 277 |
6 | evgLocus_scallop_AG_24453_XP_013069789.1_PREDICTED:_two_pore_calcium_channel_protein_1-like_[Biomphalaria_glabrata] | 104 |
7 | evgLocus_strawberry_AE_27300_AAO83843.2_voltage-dependent_T-type_calcium_channel_alpha-1_subunit_isoform_A_[Lymnaea_stagnalis] | 733 |
8 | evgLocus_strawberry_AG_27619_AAO83843.2_voltage-dependent_T-type_calcium_channel_alpha-1_subunit_isoform_A_[Lymnaea_stagnalis] | 733 |
9 | evgLocus_Strawberry_DRR_59689_AAO83841.1_voltage-dependent_non-L-type_calcium_channel_alpha-1_subunit_isoform_A_[Lymnaea_stagnalis] | 1363 |
blastp \
-query ~/Lymnaea_CNS_transcriptome_files/8_Identify_LS_CNS_channels_receptors/LS_all_chnl_Apr27.fa \
-subject ~/Lymnaea_CNS_transcriptome_files/7_Interspecies_comparison/7d_Xenopus/XT_non0_pep_Nov13.fa \
-out ~/Lymnaea_CNS_transcriptome_files/8_Identify_LS_CNS_channels_receptors/LS_chnls_XT_BLASTP_Nov25.txt \
-evalue 1E-5 \
-outfmt 6 \
-max_hsps 1 \
-max_target_seqs 1
# Load the tab-separated BLASTP table (no header row in the file) and label
# its twelve columns.
XT_blastp_chnls = pd.read_csv(
    "LS_chnls_XT_BLASTP_Nov25.txt",
    sep="\t",
    header=None,
)
XT_blastp_chnls.columns = [
    "qseqid", "sseqid", "pident", "length", "mismatch", "gapopen",
    "qstart", "qend", "sstart", "send", "evalue", "bitscore",
]
# Keep the query ID and bit score only, relabelled as "ID" and "XT".
XT = XT_blastp_chnls.loc[:, ["qseqid", "bitscore"]].rename(
    columns={"qseqid": "ID", "bitscore": "XT"}
)
%get XT --from Python3
dim(XT)
head(XT, 10)
ID | XT | |
---|---|---|
0 | evgLocus_FX_16361_XP_013065477.1_PREDICTED:_ryanodine_receptor_44F-like_[Biomphalaria_glabrata] | 2184 |
1 | evgLocus_FX_17353_AAO83841.1_voltage-dependent_non-L-type_calcium_channel_alpha-1_subunit_isoform_A_[Lymnaea_stagnalis] | 956 |
2 | evgLocus_FX_19019_XP_012943292.1_PREDICTED:_two_pore_calcium_channel_protein_2-like_[Aplysia_californica] | 527 |
3 | evgLocus_FX_21501_XP_005089220.2_PREDICTED:_two_pore_calcium_channel_protein_1-like_[Aplysia_californica] | 623 |
4 | evgLocus_FX_35751_XP_005090384.1_PREDICTED:_protein_orai-2-like_[Aplysia_californica] | 241 |
5 | evgLocus_FX_42627_AAO83838.2_voltage-dependent_L-type_calcium_channel_alpha-1_subunit_isoform_a_[Lymnaea_stagnalis] | 277 |
6 | evgLocus_scallop_AG_24453_XP_013069789.1_PREDICTED:_two_pore_calcium_channel_protein_1-like_[Biomphalaria_glabrata] | 187 |
7 | evgLocus_strawberry_AE_27300_AAO83843.2_voltage-dependent_T-type_calcium_channel_alpha-1_subunit_isoform_A_[Lymnaea_stagnalis] | 738 |
8 | evgLocus_strawberry_AG_27619_AAO83843.2_voltage-dependent_T-type_calcium_channel_alpha-1_subunit_isoform_A_[Lymnaea_stagnalis] | 754 |
9 | evgLocus_Strawberry_DRR_59689_AAO83841.1_voltage-dependent_non-L-type_calcium_channel_alpha-1_subunit_isoform_A_[Lymnaea_stagnalis] | 1332 |
blastp \
-query ~/Lymnaea_CNS_transcriptome_files/8_Identify_LS_CNS_channels_receptors/LS_all_chnl_Apr27.fa \
-subject ~/Lymnaea_CNS_transcriptome_files/7_Interspecies_comparison/7e_CElegans/CE_non0_pep_Nov11.fasta \
-out ~/Lymnaea_CNS_transcriptome_files/8_Identify_LS_CNS_channels_receptors/LS_chnls_CE_BLASTP_Nov25.txt \
-evalue 1E-5 \
-outfmt 6 \
-max_hsps 1 \
-max_target_seqs 1
# Load the tab-separated BLASTP table (no header row in the file) and label
# its twelve columns.
CE_blastp_chnls = pd.read_csv(
    "LS_chnls_CE_BLASTP_Nov25.txt",
    sep="\t",
    header=None,
)
CE_blastp_chnls.columns = [
    "qseqid", "sseqid", "pident", "length", "mismatch", "gapopen",
    "qstart", "qend", "sstart", "send", "evalue", "bitscore",
]
# Keep the query ID and bit score only, relabelled as "ID" and "CE".
CE = CE_blastp_chnls.loc[:, ["qseqid", "bitscore"]].rename(
    columns={"qseqid": "ID", "bitscore": "CE"}
)
%get CE --from Python3
dim(CE)
head(CE, 10)
ID | CE | |
---|---|---|
0 | evgLocus_FX_16361_XP_013065477.1_PREDICTED:_ryanodine_receptor_44F-like_[Biomphalaria_glabrata] | 2085.0 |
1 | evgLocus_FX_17353_AAO83841.1_voltage-dependent_non-L-type_calcium_channel_alpha-1_subunit_isoform_A_[Lymnaea_stagnalis] | 1045.0 |
2 | evgLocus_FX_19019_XP_012943292.1_PREDICTED:_two_pore_calcium_channel_protein_2-like_[Aplysia_californica] | 91.3 |
3 | evgLocus_FX_21501_XP_005089220.2_PREDICTED:_two_pore_calcium_channel_protein_1-like_[Aplysia_californica] | 70.1 |
4 | evgLocus_FX_35751_XP_005090384.1_PREDICTED:_protein_orai-2-like_[Aplysia_californica] | 125.0 |
5 | evgLocus_FX_42627_AAO83838.2_voltage-dependent_L-type_calcium_channel_alpha-1_subunit_isoform_a_[Lymnaea_stagnalis] | 268.0 |
6 | evgLocus_strawberry_AE_27300_AAO83843.2_voltage-dependent_T-type_calcium_channel_alpha-1_subunit_isoform_A_[Lymnaea_stagnalis] | 791.0 |
7 | evgLocus_strawberry_AG_27619_AAO83843.2_voltage-dependent_T-type_calcium_channel_alpha-1_subunit_isoform_A_[Lymnaea_stagnalis] | 791.0 |
8 | evgLocus_Strawberry_DRR_59689_AAO83841.1_voltage-dependent_non-L-type_calcium_channel_alpha-1_subunit_isoform_A_[Lymnaea_stagnalis] | 1052.0 |
9 | evgLocus_Stringtie_DRR_40665_XP_013068047.1_PREDICTED:_inositol_1,4,5-trisphosphate_receptor_type_1-like_isoform_X6_[Biomphalaria_glabrata] | 1794.0 |
# Combine the per-species bit-score tables into one table with one column
# per species, indexed by transcript ID.
data = None
# Named species_tables rather than `list` so the builtin list() stays usable
# in the rest of the session.
species_tables = [DM, DR, MM, XT, CE]
for f in species_tables:
    if data is None:
        data = f
    else:
        # merge() defaults to an inner join, so only IDs present in every
        # table merged so far are kept.
        data = data.merge(f, on="ID")
data = data.set_index("ID")
%get data --from Python3
head(data, 10)
dim(data)
write.table(data, file="test.txt", sep=",")
DM | DR | MM | XT | CE | |
---|---|---|---|---|---|
evgLocus_FX_16361_XP_013065477.1_PREDICTED:_ryanodine_receptor_44F-like_[Biomphalaria_glabrata] | 3670.0 | 2325 | 2673 | 2184 | 2085.0 |
evgLocus_FX_17353_AAO83841.1_voltage-dependent_non-L-type_calcium_channel_alpha-1_subunit_isoform_A_[Lymnaea_stagnalis] | 1124.0 | 1035 | 994 | 956 | 1045.0 |
evgLocus_FX_19019_XP_012943292.1_PREDICTED:_two_pore_calcium_channel_protein_2-like_[Aplysia_californica] | 94.4 | 523 | 182 | 527 | 91.3 |
evgLocus_FX_21501_XP_005089220.2_PREDICTED:_two_pore_calcium_channel_protein_1-like_[Aplysia_californica] | 102.0 | 677 | 654 | 623 | 70.1 |
evgLocus_FX_35751_XP_005090384.1_PREDICTED:_protein_orai-2-like_[Aplysia_californica] | 206.0 | 223 | 226 | 241 | 125.0 |
evgLocus_FX_42627_AAO83838.2_voltage-dependent_L-type_calcium_channel_alpha-1_subunit_isoform_a_[Lymnaea_stagnalis] | 299.0 | 275 | 277 | 277 | 268.0 |
evgLocus_strawberry_AE_27300_AAO83843.2_voltage-dependent_T-type_calcium_channel_alpha-1_subunit_isoform_A_[Lymnaea_stagnalis] | 898.0 | 1023 | 733 | 738 | 791.0 |
evgLocus_strawberry_AG_27619_AAO83843.2_voltage-dependent_T-type_calcium_channel_alpha-1_subunit_isoform_A_[Lymnaea_stagnalis] | 896.0 | 1014 | 733 | 754 | 791.0 |
evgLocus_Strawberry_DRR_59689_AAO83841.1_voltage-dependent_non-L-type_calcium_channel_alpha-1_subunit_isoform_A_[Lymnaea_stagnalis] | 1314.0 | 1339 | 1363 | 1332 | 1052.0 |
evgLocus_Stringtie_DRR_40665_XP_013068047.1_PREDICTED:_inositol_1,4,5-trisphosphate_receptor_type_1-like_isoform_X6_[Biomphalaria_glabrata] | 2914.0 | 3205 | 3293 | 3020 | 1794.0 |
import numpy as np

# Underlying array of the merged table.
matrix = data.values
# Report the mean and standard deviation taken over all cells, then the shape.
for summary in (matrix.mean(), matrix.std(), matrix.shape):
    print(summary)
499.1713989637306 554.7305200325252 (193, 5)
## Standardize bitscores
# Subtract the mean of all cells and divide by the standard deviation of all
# cells (one global mean/std for the whole matrix, not one per column).
data_standardized = data.sub(matrix.mean()).div(matrix.std())
DM DR \ ID evgLocus_FX_16361_XP_013065477.1_PREDICTED:_rya... 5.715980 3.291379 evgLocus_FX_17353_AAO83841.1_voltage-dependent_... 1.126364 0.965926 evgLocus_FX_19019_XP_012943292.1_PREDICTED:_two... -0.729672 0.042955 evgLocus_FX_21501_XP_005089220.2_PREDICTED:_two... -0.715972 0.320568 evgLocus_FX_35751_XP_005090384.1_PREDICTED:_pro... -0.528493 -0.497848 evgLocus_FX_42627_AAO83838.2_voltage-dependent_... -0.360844 -0.404109 evgLocus_strawberry_AE_27300_AAO83843.2_voltage... 0.718959 0.944294 evgLocus_strawberry_AG_27619_AAO83843.2_voltage... 0.715354 0.928070 evgLocus_Strawberry_DRR_59689_AAO83841.1_voltag... 1.468873 1.513940 evgLocus_Stringtie_DRR_40665_XP_013068047.1_PRE... 4.353156 4.877735 MM XT \ ID evgLocus_FX_16361_XP_013065477.1_PREDICTED:_rya... 3.918711 3.037202 evgLocus_FX_17353_AAO83841.1_voltage-dependent_... 0.892016 0.823514 evgLocus_FX_19019_XP_012943292.1_PREDICTED:_two... -0.571758 0.050166 evgLocus_FX_21501_XP_005089220.2_PREDICTED:_two... 0.279106 0.223223 evgLocus_FX_35751_XP_005090384.1_PREDICTED:_pro... -0.492440 -0.465400 evgLocus_FX_42627_AAO83838.2_voltage-dependent_... -0.400503 -0.400503 evgLocus_strawberry_AE_27300_AAO83843.2_voltage... 0.421517 0.430531 evgLocus_strawberry_AG_27619_AAO83843.2_voltage... 0.421517 0.459374 evgLocus_Strawberry_DRR_59689_AAO83841.1_voltag... 1.557204 1.501321 evgLocus_Stringtie_DRR_40665_XP_013068047.1_PRE... 5.036371 4.544240 CE ID evgLocus_FX_16361_XP_013065477.1_PREDICTED:_rya... 2.858737 evgLocus_FX_17353_AAO83841.1_voltage-dependent_... 0.983953 evgLocus_FX_19019_XP_012943292.1_PREDICTED:_two... -0.735260 evgLocus_FX_21501_XP_005089220.2_PREDICTED:_two... -0.773477 evgLocus_FX_35751_XP_005090384.1_PREDICTED:_pro... -0.674510 evgLocus_FX_42627_AAO83838.2_voltage-dependent_... -0.416727 evgLocus_strawberry_AE_27300_AAO83843.2_voltage... 0.526073 evgLocus_strawberry_AG_27619_AAO83843.2_voltage... 0.526073 evgLocus_Strawberry_DRR_59689_AAO83841.1_voltag... 
0.996571 evgLocus_Stringtie_DRR_40665_XP_013068047.1_PRE... 2.334158
%get data_standardized --from Python3
head(data_standardized, 10)
write.table(data_standardized, file = "chnl_comparison_standardized.txt", sep="\t")
DM | DR | MM | XT | CE | |
---|---|---|---|---|---|
evgLocus_FX_16361_XP_013065477.1_PREDICTED:_ryanodine_receptor_44F-like_[Biomphalaria_glabrata] | 5.7159801 | 3.29137939 | 3.9187110 | 3.03720192 | 2.8587369 |
evgLocus_FX_17353_AAO83841.1_voltage-dependent_non-L-type_calcium_channel_alpha-1_subunit_isoform_A_[Lymnaea_stagnalis] | 1.1263642 | 0.96592594 | 0.8920162 | 0.82351445 | 0.9839527 |
evgLocus_FX_19019_XP_012943292.1_PREDICTED:_two_pore_calcium_channel_protein_2-like_[Aplysia_californica] | -0.7296721 | 0.04295527 | -0.5717576 | 0.05016598 | -0.7352604 |
evgLocus_FX_21501_XP_005089220.2_PREDICTED:_two_pore_calcium_channel_protein_1-like_[Aplysia_californica] | -0.7159718 | 0.32056755 | 0.2791060 | 0.22322298 | -0.7734772 |
evgLocus_FX_35751_XP_005090384.1_PREDICTED:_protein_orai-2-like_[Aplysia_californica] | -0.5284934 | -0.49784785 | -0.4924398 | -0.46539967 | -0.6745102 |
evgLocus_FX_42627_AAO83838.2_voltage-dependent_L-type_calcium_channel_alpha-1_subunit_isoform_a_[Lymnaea_stagnalis] | -0.3608444 | -0.40410865 | -0.4005033 | -0.40050329 | -0.4167274 |
evgLocus_strawberry_AE_27300_AAO83843.2_voltage-dependent_T-type_calcium_channel_alpha-1_subunit_isoform_A_[Lymnaea_stagnalis] | 0.7189592 | 0.94429382 | 0.4215175 | 0.43053085 | 0.5260727 |
evgLocus_strawberry_AG_27619_AAO83843.2_voltage-dependent_T-type_calcium_channel_alpha-1_subunit_isoform_A_[Lymnaea_stagnalis] | 0.7153538 | 0.92806972 | 0.4215175 | 0.45937368 | 0.5260727 |
evgLocus_Strawberry_DRR_59689_AAO83841.1_voltage-dependent_non-L-type_calcium_channel_alpha-1_subunit_isoform_A_[Lymnaea_stagnalis] | 1.4688729 | 1.51393978 | 1.5572040 | 1.50132104 | 0.9965715 |
evgLocus_Stringtie_DRR_40665_XP_013068047.1_PREDICTED:_inositol_1,4,5-trisphosphate_receptor_type_1-like_isoform_X6_[Biomphalaria_glabrata] | 4.3531562 | 4.87773523 | 5.0363708 | 4.54423997 | 2.3341579 |
## Convert CSV to TSV as required by Clustergrammer
# Read the edited CSV: first line is the header, last line is skipped.
standardized_csv = pd.read_csv(
    "chnl_comparison_standardized_edited.csv",
    skipfooter=1,
    header=0,
    engine='python',
)
print(standardized_csv.head(10))
print(standardized_csv.shape)

# Blank out the labels of the first two columns and name the remaining five.
column_labels = ["", "", "DM", "DR", "MM", "XT", "CE"]
standardized_csv.columns = column_labels

# Write the table tab-separated, without the row index.
standardized_csv.to_csv(
    "chnl_comparison_final.txt",
    sep="\t",
    index=None,
)
Unnamed: 0 Unnamed: 1 \ 0 NaN NaN 1 evgLocus_FX_23187_ABA60384.1_nicotinic_acetylc... Category: Ach 2 evgLocus_FX_21433_ABA60380.1_nicotinic_acetylc... Category: Ach 3 evgLocus_FX_18804_ABA60390.1_nicotinic_acetylc... Category: Ach 4 evgLocus_stringtie_AE_5950_XP_012940025.1_PRED... Category: Ach 5 evgLocus_stringtie_AH_15026_XP_013060683.1_PRE... Category: Ach 6 evgLocus_stringtie_AH_58757_XP_013094335.1_PRE... Category: Ach 7 evgLocus_scallop_AH_33824_ABA60383.1_nicotinic... Category: Ach 8 evgLocus_Scallop_AE_39871_XP_013094335.1_PREDI... Category: Ach 9 evgLocus_FX_24558_XP_013066091.1_PREDICTED:_ac... Category: Ach DM DR MM \ 0 Type: Invertebrate Type: Vertebrate Type: Vertebrate 1 0.0880222004612374 -0.0850348002503357 -0.0832321231595901 2 0.0519686586463263 -0.117482987883756 -0.11568031079301 3 0.00870440846843307 -0.191392748604323 -0.200406134058051 4 -0.697945011103824 -0.696142334013078 -0.701550365285315 5 -0.694339656922332 -0.672707531833386 -0.672707531833386 6 -0.681720917287114 -0.667299500561149 -0.661891469288913 7 -0.667299500561149 -0.674510208924131 -0.679918240196368 8 -0.634851312927729 -0.62223257329251 -0.611416510748037 9 -0.629443281655493 -0.613219187838783 -0.62223257329251 XT CE 0 Type: Vertebrate Type: Invertebrate 1 -0.0868374773410812 -0.027349133346478 2 -0.113877633702265 -0.0940481857040634 3 -0.225643613328489 -0.153536529698667 4 -0.701550365285315 -0.708761073648297 5 -0.679918240196368 -0.714169104920534 6 -0.663694146379658 -0.696142334013078 7 -0.678115563105622 -0.685326271468605 8 -0.625837927474001 -0.651075406744439 9 -0.604205802385055 -0.651075406744439 (194, 7)
The heat map is created with the Clustergrammer web server from the chnl_comparison_final.txt
file.