List of methylation machinery gene IDs comes from this GitHub Issue:
%%bash
echo "TODAY'S DATE:"
date
echo "------------"
echo ""
#Display operating system info
lsb_release -a
echo ""
echo "------------"
echo "HOSTNAME: "; hostname
echo ""
echo "------------"
echo "Computer Specs:"
echo ""
lscpu
echo ""
echo "------------"
echo ""
echo "Memory Specs"
echo ""
free -mh
TODAY'S DATE: Fri Feb 26 11:36:59 PST 2021 ------------ Distributor ID: Ubuntu Description: Ubuntu 20.04.2 LTS Release: 20.04 Codename: focal ------------ HOSTNAME: computer ------------ Computer Specs: Architecture: x86_64 CPU op-mode(s): 32-bit, 64-bit Byte Order: Little Endian Address sizes: 45 bits physical, 48 bits virtual CPU(s): 8 On-line CPU(s) list: 0-7 Thread(s) per core: 1 Core(s) per socket: 1 Socket(s): 8 NUMA node(s): 1 Vendor ID: GenuineIntel CPU family: 6 Model: 165 Model name: Intel(R) Core(TM) i9-10885H CPU @ 2.40GHz Stepping: 2 CPU MHz: 2399.998 BogoMIPS: 4799.99 Hypervisor vendor: VMware Virtualization type: full L1d cache: 256 KiB L1i cache: 256 KiB L2 cache: 2 MiB L3 cache: 128 MiB NUMA node0 CPU(s): 0-7 Vulnerability Itlb multihit: KVM: Mitigation: VMX unsupported Vulnerability L1tf: Not affected Vulnerability Mds: Not affected Vulnerability Meltdown: Not affected Vulnerability Spec store bypass: Mitigation; Speculative Store Bypass disabled via prctl and seccomp Vulnerability Spectre v1: Mitigation; usercopy/swapgs barriers and __user pointer sanitization Vulnerability Spectre v2: Mitigation; Enhanced IBRS, IBPB conditional, RSB filling Vulnerability Srbds: Not affected Vulnerability Tsx async abort: Not affected Flags: fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush mmx fxsr sse sse2 ss syscall nx pdpe1gb rdtscp lm constant_tsc arch_perfmon nopl xtopology tsc_reliable nonstop_tsc cpuid pni pclmulqdq ssse3 fma cx16 pcid sse4_1 sse4_2 x2apic movbe popcnt tsc_deadline_timer aes xsave avx f16c rdrand hypervisor lahf_lm abm 3dnowprefetch cpuid_fault invpcid_single ssbd ibrs ibpb stibp ibrs_enhanced fsgsbase tsc_adjust bmi1 avx2 smep bmi2 invpcid rdseed adx smap clflushopt xsaveopt xsavec xgetbv1 xsaves arat pku ospke md_clear flush_l1d arch_capabilities ------------ Memory Specs total used free shared buff/cache available Mem: 53Gi 3.7Gi 42Gi 267Mi 7.2Gi 49Gi Swap: 2.0Gi 0B 2.0Gi
No LSB modules are available.
# Set data directories
%env data_dir=/home/samb/data/P_generosa
%env genes_gff=/home/samb/data/P_generosa/Panopea-generosa-vv0.74.a4.gene.gff3
%env unique_pgen_match_IDs=/home/samb/data/P_generosa/20210219_pgen_methylation-machinery_gene-IDs.txt
%env meth_machinery_list=/home/samb/data/P_generosa/20210219_methylation_list.txt
%env results_table=/home/samb/data/P_generosa/20210222_pgen_methylation-machinery_BLAST-evals.tab
data_dir="/home/samb/data/P_generosa"
env: data_dir=/home/samb/data/P_generosa env: genes_gff=/home/samb/data/P_generosa/Panopea-generosa-vv0.74.a4.gene.gff3 env: unique_pgen_match_IDs=/home/samb/data/P_generosa/20210219_pgen_methylation-machinery_gene-IDs.txt env: meth_machinery_list=/home/samb/data/P_generosa/20210219_methylation_list.txt env: results_table=/home/samb/data/P_generosa/20210222_pgen_methylation-machinery_BLAST-evals.tab
cd {data_dir}
/home/samb/data/P_generosa
%%bash
wget --quiet https://gannet.fish.washington.edu/Atumefaciens/20190928_Pgenerosa_v074.a4_gensas_annotation/Panopea-generosa-vv0.74.a4.5d951bcf45b4b-diamond_functional.tab
wget --quiet https://gannet.fish.washington.edu/Atumefaciens/20190928_Pgenerosa_v074.a4_gensas_annotation/Panopea-generosa-vv0.74.a4.5d951a9b74287-blast_functional.tab
wget --quiet https://gannet.fish.washington.edu/Atumefaciens/20190928_Pgenerosa_v074.a4_gensas_annotation/checksums.md5
wget --quiet https://gannet.fish.washington.edu/Atumefaciens/20190928_Pgenerosa_v074.a4_gensas_annotation/Panopea-generosa-vv0.74.a4.gene.gff3
ls -lh
total 14M -rw-rw-r-- 1 samb samb 147 Feb 19 10:59 20210219_methylation_list.txt -rw-rw-r-- 1 samb samb 1.5M Oct 3 2019 Panopea-generosa-vv0.74.a4.5d951a9b74287-blast_functional.tab -rw-rw-r-- 1 samb samb 1.3M Oct 3 2019 Panopea-generosa-vv0.74.a4.5d951bcf45b4b-diamond_functional.tab -rw-rw-r-- 1 samb samb 11M Oct 14 2019 Panopea-generosa-vv0.74.a4.gene.gff3 -rw-rw-r-- 1 samb samb 6.0K Feb 19 20:33 checksums.md5
%%bash
line="-----------------------------------------------------------"
for file in *
do
echo ""
echo "${line}"
echo ""
echo "${file}"
echo ""
head -n 15 "${file}"
echo ""
done
----------------------------------------------------------- 20210219_methylation_list.txt dnmt1 dnmt3a dnmt3b dnmt3l mbd1 mbd2 mbd3 mbd4 mbd5 mbd6 mecp2 Baz2a Baz2b UHRF1 UHRF2 ----------------------------------------------------------- Panopea-generosa-vv0.74.a4.5d951a9b74287-blast_functional.tab # # Output is generated by GenSAS 7.x-5.0 # #name : mRNA #start : Start of alignment in subject #end : End of alignment in subject #m_start : Start of alignment in query #m_end : End of alignment in query #al : Alignment length #score : Row score of the match #evalue : E value of the match #identity : Percentage of identical matches mame start end score Accession Match ID m_start m_end E-value identity al 21910-PGEN_.00g000010.m01 121 229 165 Q86IC9 sp|Q86IC9|CAMT1_DICDI 11 122 8.93e-14 35.652 115 21910-PGEN_.00g000020.m01 147 467 968 P04177 sp|P04177|TY3H_RAT 20 339 3.47e-127 55.140 321 ----------------------------------------------------------- Panopea-generosa-vv0.74.a4.5d951bcf45b4b-diamond_functional.tab # # Output is generated by GenSAS 7.x-5.0 # #name : mRNA #start : Start of alignment in subject #end : End of alignment in subject #m_start : Start of alignment in query #m_end : End of alignment in query #al : Alignment length #score : Row score of the match #evalue : E value of the match #identity : Percentage of identical matches mame start end score Accession Match ID m_start m_end E-value identity al 21910-PGEN_.00g000020.m01 147 467 945 P04177 sp|P04177|TY3H_RAT 20 339 7.9e-101 55.1 321 21910-PGEN_.00g000050.m01 566 722 180 Q8L840 sp|Q8L840|RQL4A_ARATH 2 167 2.4e-12 35.1 168 ----------------------------------------------------------- Panopea-generosa-vv0.74.a4.gene.gff3 ##gff-version 3 ##Generated using GenSAS, Monday 7th of October 2019 04:54:37 AM ##Project Name : Pgenerosa_v074 PGA_scaffold1__77_contigs__length_89643857 GenSAS_5d9637f372b5d-publish gene 2 4719 . + . ID=PGEN_.00g000010;Name=PGEN_.00g000010;original_ID=21510-PGEN_.00g234140;Alias=21510-PGEN_.00g234140;original_name=21510-PGEN_.00g234140;Notes=sp|Q86IC9|CAMT1_DICDI [BLAST protein vs protein (blastp) 2.7.1],PF01596.12 [Pfam 1.6] PGA_scaffold1__77_contigs__length_89643857 GenSAS_5d9637f372b5d-publish gene 19808 36739 . - . ID=PGEN_.00g000020;Name=PGEN_.00g000020;original_ID=21510-PGEN_.00g234150;Alias=21510-PGEN_.00g234150;original_name=21510-PGEN_.00g234150;Notes=sp|P04177|TY3H_RAT [BLAST protein vs protein (blastp) 2.7.1],sp|P04177|TY3H_RAT [DIAMOND Functional 0.9.22],IPR036951 [InterProScan 5.29-68.0],PF00351.16 [Pfam 1.6] PGA_scaffold1__77_contigs__length_89643857 GenSAS_5d9637f372b5d-publish gene 49248 52578 . - . ID=PGEN_.00g000030;Name=PGEN_.00g000030;original_ID=21510-PGEN_.00g234160;Alias=21510-PGEN_.00g234160;original_name=21510-PGEN_.00g234160;Notes=PF08054.6 [Pfam 1.6] PGA_scaffold1__77_contigs__length_89643857 GenSAS_5d9637f372b5d-publish gene 55792 67546 . + . ID=PGEN_.00g000040;Name=PGEN_.00g000040;original_ID=21510-PGEN_.00g234170;Alias=21510-PGEN_.00g234170;original_name=21510-PGEN_.00g234170 PGA_scaffold1__77_contigs__length_89643857 GenSAS_5d9637f372b5d-publish gene 67586 69113 . - . ID=PGEN_.00g000050;Name=PGEN_.00g000050;original_ID=21510-PGEN_.00g234180;Alias=21510-PGEN_.00g234180;original_name=21510-PGEN_.00g234180;Notes=sp|Q8L840|RQL4A_ARATH [BLAST protein vs protein (blastp) 2.7.1],sp|Q8L840|RQL4A_ARATH [DIAMOND Functional 0.9.22],PF00270.24 [Pfam 1.6] PGA_scaffold1__77_contigs__length_89643857 GenSAS_5d9637f372b5d-publish gene 70713 81099 . + . ID=PGEN_.00g000060;Name=PGEN_.00g000060;original_ID=21510-PGEN_.00g234190;Alias=21510-PGEN_.00g234190;original_name=21510-PGEN_.00g234190;Notes=sp|Q61043|NIN_MOUSE [DIAMOND Functional 0.9.22],PF04443.7 [Pfam 1.6] PGA_scaffold1__77_contigs__length_89643857 GenSAS_5d9637f372b5d-publish gene 183686 186073 . + . ID=PGEN_.00g000070;Name=PGEN_.00g000070;original_ID=21510-PGEN_.00g234200;Alias=21510-PGEN_.00g234200;original_name=21510-PGEN_.00g234200;Notes=PF15364.1 [Pfam 1.6] PGA_scaffold1__77_contigs__length_89643857 GenSAS_5d9637f372b5d-publish gene 187328 188353 . + . ID=PGEN_.00g000080;Name=PGEN_.00g000080;original_ID=21510-PGEN_.00g234210;Alias=21510-PGEN_.00g234210;original_name=21510-PGEN_.00g234210;Notes=sp|A1E2V0|BIRC3_CANLF [BLAST protein vs protein (blastp) 2.7.1],sp|Q24307|DIAP2_DROME [DIAMOND Functional 0.9.22],IPR001370 [InterProScan 5.29-68.0],PF02229.11 [Pfam 1.6] PGA_scaffold1__77_contigs__length_89643857 GenSAS_5d9637f372b5d-publish gene 189849 190460 . - . ID=PGEN_.00g000090;Name=PGEN_.00g000090;original_ID=21510-PGEN_.00g234220;Alias=21510-PGEN_.00g234220;original_name=21510-PGEN_.00g234220;Notes=sp|P34456|YMD2_CAEEL [BLAST protein vs protein (blastp) 2.7.1],PF10284.4 [Pfam 1.6] PGA_scaffold1__77_contigs__length_89643857 GenSAS_5d9637f372b5d-publish gene 191069 191410 . - . ID=PGEN_.00g000100;Name=PGEN_.00g000100;original_ID=21510-PGEN_.00g234230;Alias=21510-PGEN_.00g234230;original_name=21510-PGEN_.00g234230;Notes=PF10228.4 [Pfam 1.6] PGA_scaffold1__77_contigs__length_89643857 GenSAS_5d9637f372b5d-publish gene 191836 192282 . - . ID=PGEN_.00g000110;Name=PGEN_.00g000110;original_ID=21510-PGEN_.00g234240;Alias=21510-PGEN_.00g234240;original_name=21510-PGEN_.00g234240;Notes=IPR009044 [InterProScan 5.29-68.0],PF02229.11 [Pfam 1.6] PGA_scaffold1__77_contigs__length_89643857 GenSAS_5d9637f372b5d-publish gene 192526 193324 . - . ID=PGEN_.00g000120;Name=PGEN_.00g000120;original_ID=21510-PGEN_.00g234250;Alias=21510-PGEN_.00g234250;original_name=21510-PGEN_.00g234250;Notes=sp|P34457|YMD3_CAEEL [DIAMOND Functional 0.9.22],IPR036397 [InterProScan 5.29-68.0],PF13276.1 [Pfam 1.6] ----------------------------------------------------------- checksums.md5 fb0ed4cf6af7adaa6579da118a887cb9 ./Panopea-generosa-vv0.74.a4.5d82b23f69545-braker.genes.gff3 aed6dae0cc90216f8606077c1927e6ff ./Panopea-generosa-vv0.74.a4.5d938f378020d-pasa_refine.CDS.fna 4b3369aa246cb179ecf374f5c4f38966 ./Panopea-generosa-vv0.74.a4.5d3f05ff5e0bd-augustus.genes.fna f0afd89f974e81fcbadef8cae13423d9 ./Panopea-generosa-vv0.74.a4.5d82b316cd298-trnascan.tRNA.fna 36fbf825ef85893cc6256f2dcc2b86c2 ./Panopea-generosa-vv0.74.a4.5d82b316cd298-trnascan.tRNA.gff3 8f11ea587217243e02568aec3c59d78c ./Panopea-generosa-vv0.74.a4.5d3f05ff5e0bd-augustus.CDS.fna 8401b6372774f594f8aad52d0aa5a1d6 ./Panopea-generosa-vv0.74.a4.rRNA.gff3 acd5a79a0879f602815ace545faedab7 ./Panopea-generosa-vv0.74.a4.5d938f378020d-pasa_refine.protein.faa 4f751f896347e96241daf6006d3a3abe ./Panopea-generosa-vv0.74.a4.5d82b2fdbf06f-genemark.protein.faa 7c348d72a890232bd09676e38d6b8ada ./Panopea-generosa-vv0.74.a4.gene.gff3 a0a5cbd449cf90e8b91a188504e76f8c ./Panopea-generosa-vv0.74.a4.5d82b2fdbf06f-genemark.CDS.fna 965b78b7ebb3274edb8bab0ca1e0e8c1 ./Panopea-generosa-vv0.74.a4.5d250896def4c-repeatmasker.repeats.gff3 14d207d90085eeac0002ba8cdf7ca3aa ./Panopea-generosa-vv0.74.a4.CDS.gff3 c37fa754433cdab8d609efb0f90a83fe ./Panopea-generosa-vv0.74.a4.5d951a9b74287-blast_functional.tab d36db1a170b0f037287af6fbfe0b45dd ./Panopea-generosa-vv0.74.a4.5d938f378020d-pasa_refine.genes.gff3
%%bash
diff <(md5sum Panopea-generosa-vv0.74.a4.gene.gff3 | cut -d " " -f1) <(grep "Panopea-generosa-vv0.74.a4.gene.gff3" checksums.md5 | cut -d " " -f1)
%%bash
# Pull out unique list of pgen IDs matching methylation machinery list
while read -r line
do
# Test for empty line
[ -z ${line} ] && { echo "Empty line found in ${meth_machinery_list}."; exit 1; }
# Search GFF for methylation gene name
if grep --quiet --ignore-case "|${line}" "${genes_gff}"; then
# Loop through matches, in case of multiple matches
for match in $(grep --ignore-case "|${line}" "${genes_gff}" | awk -F'[=;]' '{print $2}')
do
# Print tab-delimited results
printf "%s\t%s\n" "${match}" "${line}"
done
fi
done < ${meth_machinery_list} | sort -k1,1 -u >> ${unique_pgen_match_IDs}
head ${unique_pgen_match_IDs}
PGEN_.00g104080 Baz2b PGEN_.00g104170 Baz2b PGEN_.00g116950 mbd5 PGEN_.00g186870 ctcf PGEN_.00g192900 UHRF1 PGEN_.00g202750 mbd2 PGEN_.00g209890 mbd2 PGEN_.00g209900 mbd4 PGEN_.00g243700 egr1 PGEN_.00g249090 egr1
%%bash
printf "%s\t%s\t%s\t%s\n" "Gene_ID" "gene_name" "BLASTp_evalue" "DIAMOND_evalue" > ${results_table}
while read -r pgen_ID meth_machinery
do
blastp=$(grep "${pgen_ID}" Panopea-generosa-vv0.74.a4.5d951a9b74287-blast_functional.tab | head -n 1 | cut -f9)
diamond=$(grep "${pgen_ID}" Panopea-generosa-vv0.74.a4.5d951bcf45b4b-diamond_functional.tab | head -n 1 | cut -f9)
printf "%s\t%s\t%s\t%s\n" "${pgen_ID}" "${meth_machinery}" "${blastp}" "${diamond}"
done < ${unique_pgen_match_IDs} >> ${results_table}
%%bash
cat ${results_table} | column -t
Gene_ID gene_name BLASTp_evalue DIAMOND_evalue PGEN_.00g104080 Baz2b 1.05e-98 5.4e-102 PGEN_.00g104170 Baz2b 3.09e-96 1.2e-109 PGEN_.00g116950 mbd5 6.40e-21 2.8e-20 PGEN_.00g186870 ctcf 1.25e-116 PGEN_.00g192900 UHRF1 2.32e-19 PGEN_.00g202750 mbd2 9.46e-82 2.6e-63 PGEN_.00g209890 mbd2 4.37e-19 9.2e-09 PGEN_.00g209900 mbd4 3.14e-32 8.0e-29 PGEN_.00g243700 egr1 6.24e-58 2.2e-23 PGEN_.00g249090 egr1 4.19e-18 2.6e-06 PGEN_.00g283000 dnmt1 5.03e-10 PGEN_.00g283010 dnmt1 0.0 7.3e-224