import numpy as np
import pandas as pd
#Load the processed PolyADB V3 site table and keep only the rows whose site_type is '3_most_exon'
df = pd.read_csv('native_data/polyadb_processed_v3_w_hg38.csv', sep=',')
is_3_most_exon = df['site_type'] == '3_most_exon'
df = df.loc[is_3_most_exon].copy().reset_index(drop=True)
#Disabled filter: would drop rows whose wide_seq_ext[105:310] contains one of the listed A-rich motifs
#df = df.loc[~df['wide_seq_ext'].str.slice(175 - 70, 175 - 70 + 205).str.contains("AAAAAAA|AAAGAAAA|AAACAAAA|AAAAGAAA|AAAACAAA")].copy().reset_index(drop=True)
print(df)
gene gene_id sitenum num_sites pas \ 0 AADACL3 AADACL3.2 2 2 0 1 ABCA4 ABCA4.1 5 5 0 2 ABCB10 ABCB10.6 2 7 0 3 ABCB10 ABCB10.5 3 7 0 4 ABCB10 ABCB10.4 4 7 2 5 ABCB10 ABCB10.3 5 7 0 6 ABCB10 ABCB10.2 6 7 0 7 ABCB10 ABCB10.1 7 7 0 8 ABCD3 ABCD3.6 6 15 3 9 ABCD3 ABCD3.7 7 15 0 10 ABCD3 ABCD3.8 8 15 2 11 ABCD3 ABCD3.9 9 15 4 12 ABCD3 ABCD3.10 10 15 4 13 ABCD3 ABCD3.11 11 15 2 14 ABCD3 ABCD3.12 12 15 3 15 ABCD3 ABCD3.13 13 15 0 16 ABCD3 ABCD3.14 14 15 0 17 ABCD3 ABCD3.15 15 15 0 18 ABL2 ABL2.18 7 24 4 19 ABL2 ABL2.17 8 24 3 20 ABL2 ABL2.16 9 24 2 21 ABL2 ABL2.15 10 24 0 22 ABL2 ABL2.14 11 24 0 23 ABL2 ABL2.13 12 24 0 24 ABL2 ABL2.12 13 24 2 25 ABL2 ABL2.11 14 24 3 26 ABL2 ABL2.10 15 24 0 27 ABL2 ABL2.9 16 24 4 28 ABL2 ABL2.8 17 24 0 29 ABL2 ABL2.7 18 24 4 ... ... ... ... ... ... 114575 UTY UTY.5 8 12 3 114576 UTY UTY.4 9 12 0 114577 UTY UTY.3 10 12 3 114578 UTY UTY.2 11 12 0 114579 UTY UTY.1 12 12 0 114580 ZFY ZFY.3 3 7 0 114581 ZFY ZFY.4 4 7 0 114582 ZFY ZFY.5 5 7 0 114583 ZFY ZFY.6 6 7 0 114584 ZFY ZFY.7 7 7 0 114585 na na.38381 3 38383 0 114586 na na.38379 5 38383 0 114587 na na.38377 7 38383 0 114588 na na.38376 8 38383 0 114589 na na.38375 9 38383 0 114590 na na.38373 11 38383 0 114591 na na.38372 12 38383 4 114592 na na.38371 13 38383 3 114593 na na.38360 24 38383 0 114594 na na.38359 25 38383 3 114595 na na.38351 33 38383 -1 114596 na na.38350 34 38383 2 114597 na na.38349 35 38383 0 114598 na na.38344 40 38383 2 114599 na na.38343 41 38383 0 114600 na na.38342 42 38383 3 114601 na na.38341 43 38383 2 114602 na na.38329 38329 38383 0 114603 na na.38330 38330 38383 0 114604 na na.38331 38331 38383 0 seq \ 0 GCTGCAGGTGGTGGTTGCTGAAGGTGGGGGAGGCTGTGGCAATTTC... 1 TTTCTGCATGTTTGTCTGTGTGTCTGCGTTGTGTGTGATTTTCATG... 2 GTTAAAGATTGAAGCTATTGTCAAATGACAACTTTAAAAAGGCAAT... 3 CTATTTCATGAAAAGCATGGAATATTATATTTTATTGTTCATAATT... 4 CATAATTAATGAATAAAATTGATATGAATGAATATAGTGTTCTTTG... 5 TCTGATACATGATGTTCAATTTTATCTTTAGGTAATATTTTATATC... 6 AACTTCTCACTACATTGTTTCTTAGTAGAATTTGGCTGTGGAGATT... 
7 TCAGGAATAAAGAAAAGACTAACATTACACATATCCAAAAACATGT... 8 CATTTAATATTATATAGGATATTGCTAATTGTGTATATGTTGGTTT... 9 ATAATATGTACTAAGAATGTCCTTATTCTTGTGGTTAAAAACCTGC... 10 GGAGTGCATTTGACTCCAGGAAAAGCCATTTTGGTTTTCCTTAACT... 11 GATTTTATGTTTAAAAAGTATGTTCTAAAATTATTATATATACATG... 12 ATGATGTCAAACGATCCTAAGCGAAGATGATTTCAGTTCATCAAAT... 13 ACTGGTTTTGTTTTTTTGCAGAATTAACTATAACAATCACTGGCTA... 14 AACTATAACAATCACTGGCTACCGAAGTAAACTGATGTACTGAATT... 15 AATTTTTACCACTTCTGTTTAGCGAACTTGTATACTTATTTTCTGT... 16 GCCTTGACTTGAAAACATAGATAGTTTAATCTTGACTTGAAAAACA... 17 ATGAGAAAATAAGTATGAAACAGCAATGGTAGTTTGTTTTGCATTA... 18 AGGAAATCAGTGATGTGGTGCAGAGGTAGCCACTGTTAGCCTGGTG... 19 CTGAGGGGAGAGGGAAAAGGACTTGTTTTCCTGTGTTCTTGTTTTC... 20 CCACAAGGCCATTGCTGCTGTAATAAGAACTGCAAATCAGAGTGCT... 21 AAGTTTCTTAGGATTCAACTGAAAATTGGCTAGTAAAATATGTCCC... 22 ACAGGGTTATCTATATTTTTGTTCTTGATTTTTGAGTTTCATCATC... 23 GACTTAAATTTTCACATATATGTATACTTAATTTTCAGCAATTATG... 24 CTGATTCTCTAATGACTGTCACTGGACAAACCCCAGTTTTATGCTC... 25 GTGTGTCACATTACCAGTGTTGTCAGGTATTTGTTCTTAATTGTTA... 26 CTCCGTCTGTTGTCTGACTGTGAACTGCTAACAGTGTTAAACTTGA... 27 GTCTGTTGTCTGACTGTGAACTGCTAACAGTGTTAAACTTGATGTA... 28 ACTGCTTTCTCTGTCTTCTCACAAGGTTTGCCAAGTTGTGTTCTGT... 29 GTCTAAGCAGCATTTCTTTCTCTCTCTGTAATGCAGATGTCTATAA... ... ... 114575 ACTTGTATTTCATATGAAAAATTTGCTAATTTAATATTAACTCATT... 114576 ATATATGGGGGTACATGTGATATGTTGTTACATATATAGAATGTGT... 114577 TTTATTTTTGTATAATACCGATTTCCACTTTGGTTTGACTTTGCTC... 114578 AGACCTGCATCCAGAGAGGAGCTTCCTGATCCAGGGCTATTCTGTC... 114579 ACAACTCTCTTCACTTTTTTCAGCTTATTTTCTCTTTCATGTTCAG... 114580 TTCAGTTTCTTTAATAGATGAGGAAAAATAGCAACAAGCAAGTTGC... 114581 CATATAGGAAGTGTGATACTATTGTAAATGTGTATTTGAGAATATG... 114582 GTAAATGTGTATTTGAGAATATGCAAAAATAAAAATAATATTTTAC... 114583 ATACCAATACTAAGTTAACTCTTTCTTTAAAACACGCAGTTTTAAG... 114584 TTTGTTTTATCTTAATGAAAGATGGTTTGGTTAGCATCTGCACTTG... 114585 GGCACTGAGCCGCTGGAGTCTGCACATTGATAAATTTACTTACAGT... 114586 GCCCAGCCCTCACCTTGATCTCAGATTCCTTGTTTCCAGTACTGGA... 114587 TGAATCTGCCGCATCTTGATACTGGACTTCCAGTCTCCAGAACTGT... 
114588 TTTTGCTATAGCAACCCAAATGGACTAAGAAAACACCAGAGGCCAT... 114589 GGAATTGCTATGTAGTAATTATTCAAATTGCATGCACAGCTAAGAA... 114590 TTTGTTCCTTAATCTGTTAGTATGGTATATTACATCGATTAATATT... 114591 TATTAAATCAGTCATGAATTCCCATTATGAGCCCCACTTGCTCATG... 114592 CATTTCCTCCAGAGGCCAACAAAACTCTCCCTACTCTCTAAGCCCC... 114593 ACCACTGCCACAGCTGCCTACAACAGACACATCAGATGACACTCCG... 114594 GATGACACTCCGGGCAAATAAATGATTTTCACTGAGGACTTACTGG... 114595 ACAGAGGGGTGCAGGACTCGGGCCCTGGCACACAAGGAGGTAGCTG... 114596 TCCTCTGTCCCACTTAACCCGAGGGACCCCAGAAGCAAGTGTCACC... 114597 CTGGCTGGCTACCAGTGAGCAGCGGCACGGTGCTTCCCCCCAATCC... 114598 GAAGTGACTTCTACCTTACTGCTTTAAAAAATGTATCTGTCTTTTG... 114599 ACAAGTGAAGATTACAGTTTTTCATCAAATTTGTGAAGTTTTCAGC... 114600 TTTGTGCCTGTCTCATTTTTTGTTGTTGTTGAAAACTGAACATTTA... 114601 ATCAGATTCTTCATTTTCCTAATGTGTGTTATGATTGGTTACTGTT... 114602 GTCCATTCTTGTATTGCTATAAAGAAATATTGGATACTGGCTAATT... 114603 TCAGTTCATCAACGAATCGGTATATTAATGTCATATTTAACAGTTA... 114604 CGGTATATTAATGTCATATTTAACAGTTATAGGAATAAACTAAGCA... seq_ext \ 0 TAAATTGTCGTAATCTTTTTGCTGGTTGATGGTCTTGCCTTGATGT... 1 CCACTAGCAGCTTTGGCCTCCATATTGCTCTCATTTCAAGCAGATC... 2 TGGAATATTTTAATTAATATAGCATGGCACCTCATTTTCTTTTGCC... 3 CATGAACTAAGCATTTATTAGTTCCCTGATTAGACTGGAAGAAGAA... 4 AGAAGAAACCACTATTTCATGAAAAGCATGGAATATTATATTTTAT... 5 TGTGCCATAGAAGTATTTACGAAATTGCATTTCATTGTTATGTTTT... 6 TTTTTTTTTATGACCCAGGAACCAAAGATGCAGTCTGTCATTTCTT... 7 TATAAAACTTCTCACTACATTGTTTCTTAGTAGAATTTGGCTGTGG... 8 AAAGCAACAAATTAACTAGATACAGAATAATGGAGAACAAGTTGTT... 9 ATTTAATATTATATAGGATATTGCTAATTGTGTATATGTTGGTTTA... 10 GGCAGATTTCTTTAGCTGCCACAGTAATACTCATTCCTTGTGTGTG... 11 TACATGAAATAATGCACTGAGTATGCAATGCTATCACTGTCTTTGA... 12 GGCACTGTTTTATCTCTGTGAATCTTGAATAACTTTTTTATATTTG... 13 TCTCAAACTGAGCTTCAGAAAGGGGCATTTTGTACTCTTGTTTTTG... 14 CATTTTGTACTCTTGTTTTTGCATAACTGGTTTTGTTTTTTTGCAG... 15 CTGGCTACCGAAGTAAACTGATGTACTGAATTCCATAATACATAAC... 16 TGTTCTTTTTATTCTGGTATCTAAATACTGAGAAGTTCATTTATAA... 17 TGCCAAGACATATCACCGTGTTCTCATAATAAGTTTTTACTTTTTA... 18 TGGTGTGCCCGGGACAAACCCTGTCCTTAATAACTTATTGTCATGT... 
19 TGTGGTGCAGAGGTAGCCACTGTTAGCCTGGTGGGAAAATGCACAC... 20 CAAGAGAAATTTTTGTTCAGGGCTGTTGGAAGTAGCTGTTAGCCTT... 21 TATAGTGGAGACCCTTTACCAGCAGCATTTTAAGATTTAAATTAGA... 22 GTCATTCTTGTTATCAGGCTGTGCCCCTACCAGGAGTTCATATTGG... 23 GCTTAAATTTGCTTCGGAATTAGGCACCATAAACTTTAGTTGGAGA... 24 CTCAGAACCTCCATCTCCTCCCAGACCCTGCACTTTTCCTTCTCTG... 25 TAAGGGGACATAGTGAGACAATGTGGGATTTAACTAAAAACACATC... 26 TCTTAATTGTTATTGTAATATATTTTCAGTTGTTTTTCTAATTTAA... 27 AATTGTTATTGTAATATATTTTCAGTTGTTTTTCTAATTTAATTCT... 28 ACTGCTAACAGTGTTAAACTTGATGTAAATAAATGAGGCCCTTGAA... 29 AAGTAGGCTGGAGATTGGAGTGTTTCTCCCATGATTTTAGTTGACA... ... ... 114575 AAGTATTAAAAAATCTATATCGCTAGTAAATTGTAATAAGTTCTAT... 114576 ATCTTATACCTACAGATTATTTTATTTTTTAAGTTGATAATATTTC... 114577 TCAATGTATTTAGAACTGTTTAATTCCCAAGTGGTGGGGAGTGGGG... 114578 CTACCCACTCCAGGGCCTCATCTCTGCTGAGAGCTGCAGAGATGAC... 114579 GTTTCTGGCCAGAAAAGCAACACCATAAGGTTCCTGAAACAACAGG... 114580 TCTAGTCCACTTTTCTTTACATTTTATTCAATACGCTGTCCTGAAT... 114581 TCCCTTCCTCTGTACAAATTGTAAAATACTTAAAATAACTATGAAG... 114582 AAATACTTAAAATAACTATGAAGAATTCATATAGGAAGTGTGATAC... 114583 GCAAAAATAAAAATAATATTTTACTGTTAATAAATGTTTACTTGTA... 114584 TTGCGCTAGTTGATGAGGCCTTTGCTTAATGTGTTCTTGAAAGTCA... 114585 AACTCAAGCAACTTTCTCCAAAGCCTAGGGTTCAGCAAGAGTAAGC... 114586 GCTGGAAGAGGCGGGAAAGATCCTCCCCTAGAGCTTCCAGGGGGAG... 114587 CAGAGGGAAGGCTCTGTCTATGAATGAGAAAGTGGGTCCCCACCAG... 114588 CTGTGGGCAATAAATGTCTGTTGTTTATTACCTGTCCAGTATCTTT... 114589 AAGTTCCTGGCTGCCTCCCCTGAGGGTTCTGATGTATTGGCTATGT... 114590 AAAAGCCTAGTTGTTGTCTCCTGCAAGATGTATTCAGTTGTAATTT... 114591 CTTTGTTCCTTAATCTGTTAGTATGGTATATTACATCGATTAATAT... 114592 GGAGGGAGAAAGAGAGCATGTGCTTAAAGAAAAGTTATGGGGCTGG... 114593 CCACTGATGGACGGACAGACGTGGGCAGGGTACGTGTCACTAAACC... 114594 TGTCACTAAACCTCCCACCACTGCCACAGCTGCCTACAACAGACAC... 114595 TCTGCGAAGTGATGGAGACACTGACAGTGCAGGAAGAGGCAAACCC... 114596 CCACCACTCTGCCCCCTCTCATCACCTGGTCTGGGCCCTCAACACC... 114597 ACCGCCAGAATTGCATGGCCCATCATACTTGCTACCCCACCATCCT... 114598 GAAAGGAGAAAAGGGAGGGAGGGAGGGAAGGAGGGAGGACTTGTAT... 
114599 AAAATGTCTTGTGTCTAAACATGGATCCCTTGTAATTCACGCAACT... 114600 TTCTGTTGGCTGTGCTTTTTCTTGCGTATAGGCTGTACTTTTTTTT... 114601 TGTTGTTGAAAACTGAACATTTAAAATAATATAATTTGGCAACTGT... 114602 AACATTATATAGGATGTCTTGTTTTAGGCATTTATTGCTAGATAAC... 114603 TGTTTGTAAAGTAGTTGTCTAGACTCTAGTGAAAATAATTACAGAT... 114604 TCTAGACTCTAGTGAAAATAATTACAGATAATCTCAGTTCATCAAC... wide_seq \ 0 TGTATGTTACATTCATGGGAATGTCTAAATTGTCGTAATCTTTTTG... 1 CCAGAACTAGAAACCCCGGGCCATCCCACTAGCAGCTTTGGCCTCC... 2 TCAGGTTTTGTATTTTCTTTTCTTGTGGAATATTTTAATTAATATA... 3 AATGTAAATCAAATGGAAGTTTTCCCATGAACTAAGCATTTATTAG... 4 TTATTAGTTCCCTGATTAGACTGGAAGAAGAAACCACTATTTCATG... 5 GGTTGAATCTGAGGAAAATAATCCTTGTGCCATAGAAGTATTTACG... 6 AACTTTATCTGTGTCTGTCACTTTTTTTTTTTTTATGACCCAGGAA... 7 ACCAAAGATGCAGTCTGTCATTTCTTATAAAACTTCTCACTACATT... 8 CTTAGTTTTTTTTAAAAAAAAAAACAAAGCAACAAATTAACTAGAT... 9 ATAATGGAGAACAAGTTGTTAAAACATTTAATATTATATAGGATAT... 10 TGCATAACAGCGTTTATTATACAGTGGCAGATTTCTTTAGCTGCCA... 11 CTCCAAATGAGCCATAGGAAGGCACTACATGAAATAATGCACTGAG... 12 ATACATGGGTGAATTATGTTTCCGAGGCACTGTTTTATCTCTGTGA... 13 CAATAAGCTAGATACGAAATCAGTTTCTCAAACTGAGCTTCAGAAA... 14 TCTCAAACTGAGCTTCAGAAAGGGGCATTTTGTACTCTTGTTTTTG... 15 TTTGCAGAATTAACTATAACAATCACTGGCTACCGAAGTAAACTGA... 16 TCAGATATCCTATACAACCTTTGCTTGTTCTTTTTATTCTGGTATC... 17 TTTCATCCATGAGCACCACGCTGCATGCCAAGACATATCACCGTGT... 18 AGCTACAGGTTTCTTCAGCAGCTGCTGGTGTGCCCGGGACAAACCC... 19 TGTCATGTGTACAGGAAATCAGTGATGTGGTGCAGAGGTAGCCACT... 20 GCAGAAAAGAAAGCTGGGAATGTACCAAGAGAAATTTTTGTTCAGG... 21 GATGACCTCTGTACATAGAGGTCATTATAGTGGAGACCCTTTACCA... 22 TGCTCCCTTCCATTTTGCTACTGATGTCATTCTTGTTATCAGGCTG... 23 TAAGGAGTTTGTGTAGTCAGAACTTGCTTAAATTTGCTTCGGAATT... 24 AGGCTGACTTAGCTGTTGATGTATACTCAGAACCTCCATCTCCTCC... 25 GAGCATCGGGCACTGTGTGCATTGCTAAGGGGACATAGTGAGACAA... 26 ATTACCAGTGTTGTCAGGTATTTGTTCTTAATTGTTATTGTAATAT... 27 CCAGTGTTGTCAGGTATTTGTTCTTAATTGTTATTGTAATATATTT... 28 CTCTCCGTCTGTTGTCTGACTGTGAACTGCTAACAGTGTTAAACTT... 29 TAAAGGTTGCAATATTTTATTGGCAAAGTAGGCTGGAGATTGGAGT... ... ... 114575 ACATGGTTTGGGGTGAAGGGGAGGAAAGTATTAAAAAATCTATATC... 
114576 TGATCAGTATGCAGATCCTTTACAGATCTTATACCTACAGATTATT... 114577 ATTTCCTTTAGGATTTTCTCTTAGATCAATGTATTTAGAACTGTTT... 114578 TCAGGTGACCAGCTGCAGAAAGGAGCTACCCACTCCAGGGCCTCAT... 114579 GGCAAAAGCACCACCACCCACAGAGGTTTCTGGCCAGAAAAGCAAC... 114580 GCTGTGTAAAAATAGAATTATTGCTTCTAGTCCACTTTTCTTTACA... 114581 AAAATAGCCTATTAGATATTGTATGTCCCTTCCTCTGTACAAATTG... 114582 TGTCCCTTCCTCTGTACAAATTGTAAAATACTTAAAATAACTATGA... 114583 ATTGTAAATGTGTATTTGAGAATATGCAAAAATAAAAATAATATTT... 114584 ATTTACAAGTAGTATTTCAGAGTACTTGCGCTAGTTGATGAGGCCT... 114585 TAAAAATTATGGAACGAATTGTAGAAACTCAAGCAACTTTCTCCAA... 114586 GCAATCATGACTGGAGCCACCAAGAGCTGGAAGAGGCGGGAAAGAT... 114587 TTGCTCCTTCCACAGTGTGAGGACACAGAGGGAAGGCTCTGTCTAT... 114588 GATACTGGACTTCCAGTCTCCAGAACTGTGGGCAATAAATGTCTGT... 114589 AAGATGTGGGGGATATTTTGCAAGAAAGTTCCTGGCTGCCTCCCCT... 114590 GACCATCTATCTGGACGAATAGGAGAAAAGCCTAGTTGTTGTCTCC... 114591 AAGATGTATTCAGTTGTAATTTCGCCTTTGTTCCTTAATCTGTTAG... 114592 GAAAGAGAGGGAAGGAAGGAAAGAGGGAGGGAGAAAGAGAGCATGT... 114593 CAGGTGCAGCCAGAGAGACACTAGCCCACTGATGGACGGACAGACG... 114594 GACGGACAGACGTGGGCAGGGTACGTGTCACTAAACCTCCCACCAC... 114595 AAACCCTCTGCCAGCAACAAGGGTGTCTGCGAAGTGATGGAGACAC... 114596 CCCCAAAGATGGCTCCGGCGCCTGCCCACCACTCTGCCCCCTCTCA... 114597 GCCATGACCAACAGTCTCTTCCTCGACCGCCAGAATTGCATGGCCC... 114598 GATGGATGGAAGAGAAGGAAAGGAAGAAAGGAGAAAAGGGAGGGAG... 114599 AAATGTATCTGTCTTTTGACAACATAAAATGTCTTGTGTCTAAACA... 114600 GTGTCTGGACTTCCTTAGAATGAATTTCTGTTGGCTGTGCTTTTTC... 114601 GTTTTGTGCCTGTCTCATTTTTTGTTGTTGTTGAAAACTGAACATT... 114602 AAAATTAAGTGCAGAAAATATGTTTAACATTATATAGGATGTCTTG... 114603 TGCATTATGCAATGATGTAGAATACTGTTTGTAAAGTAGTTGTCTA... 114604 TAGAATACTGTTTGTAAAGTAGTTGTCTAGACTCTAGTGAAAATAA... wide_seq_ext pas_pos \ 0 TGCGTGGGTTTTCTCTGAGTTCTCCAGCTTCCTCCCACATTCCAAA... 12788705 1 TGGAACACCTGATGGTGAAACCAAACAAATACAAAATCCTTCTCCA... 94458420 2 GAGTTTTAATAATTGTAACTTTTTAAATGTCTATAGCACTGAAGTT... 229653571 3 AAGTGCTTTTTCTCCATGGATGAGGCTAGACCCTAAGAAGTAATTA... 229652609 4 GTAATTAAGTCAATGTAAATCAAATGGAAGTTTTCCCATGAACTAA... 
229652570 5 TGTGTTCTTTATAAAGTGTGATTTTCAGAAAGCAAACAACACAATT... 229652351 6 CAATTTTATCTTTAGGTAATATTTTATATCATAGATTAAAATTTAT... 229652210 7 TAGTGAACTTTATCTGTGTCTGTCACTTTTTTTTTTTTTATGACCC... 229652165 8 ATTACAGAATATACTTAGAAAGGCAAAGTACATTGTAAAATAAAGT... 94982896 9 TTAGTTTTTTTTAAAAAAAAAAACAAAGCAACAAATTAACTAGATA... 94982947 10 GTGGTAGTTGGAAACAAATCATAATGTATTATTTAAATGTTTAACA... 94983300 11 GTGTGTTAGAAGCCCATTCATTAGAAGTGTGGTGGTTATTTGGTAT... 94983669 12 TCTTTGACTGTGATTTTATGTTTAAAAAGTATGTTCTAAAATTATT... 94983783 13 TAATGACTTTATGTATTATTTGCACAGGGAGAATTGAAACTGAGTA... 94983960 14 AGGGAGAATTGAAACTGAGTATAATCAATAAGCTAGATACGAAATC... 94983985 15 TCAGAAAGGGGCATTTTGTACTCTTGTTTTTGCATAACTGGTTTTG... 94984049 16 CGAACTTGTATACTTATTTTCTGTTCAGATTAAAAAAAAAAAAAAA... 94984196 17 GCAGTGGGAAATGGTAGTTTAATCCGAAGAATAAACCAAAGAATAA... 94984889 18 CAAATTTGCCTTCCGAGAGGCTGTGAGCAAACTGGAACTCAGCCTG... 179076831 19 TTCAGCAGCTGCTGGTGTGCCCGGGACAAACCCTGTCCTTAATAAC... 179076768 20 GGTACTAATGGTGATTATGCTCCAATTTACCTAATGAATTTGGTGG... 179076299 21 GATTAGCCTTAAAAAATTTGGCCCTAAGAAATTCATTCAGTGTTTT... 179073543 22 GCCTTGGTGGCCACTTCTGAATTATAGCTACATTTCATTATGACCC... 179071745 23 AAAGCTGGAGGCAAACTGTCGAATGCTAGCAGGCTTTATGAAAGGA... 179069576 24 TCTTTCCTTGGATGAACAGCAGTGGCGTGGTTGGTGTGTTGACTGT... 179068920 25 AAAGGTCCCGCCCCGCAGGTGTGTATGTGGTGCACAGAGGGTGAGG... 179068650 26 ATAGTGAGACAATGTGGGATTTAACTAAAAACACATCAATTGTGTG... 179068566 27 TGAGACAATGTGGGATTTAACTAAAAACACATCAATTGTGTGTCAC... 179068562 28 GTTCTTAATTGTTATTGTAATATATTTTCAGTTGTTTTTCTAATTT... 179068493 29 TGCTTTCTCTGTCTTCTCACAAGGTTTGCCAAGTTGTGTTCTGTTT... 179068366 ... ... ... 114575 TTTTTTTAAAGCTAGATGACTTTGAAATGCTATACTGTCCTGCTTG... 15360284 114576 TGTTTCTCTCTATTTAAGTCCTTGTTGGTTTTTTTCATTAGCTTTT... 15356350 114577 ATTTTGATATGTTGTATTTTAATTTTTATTCAGTTTTATGTATTTT... 15347429 114578 CCCTTCTGCGGCCCATAAAAGCCCTGGGTTCAGGCAGAGCTGAGCA... 15346489 114579 CTTCCAGGCTGAGTGGGTGGAGCAAGCCCAGCAGGCCTGAGCAAAA... 15345808 114580 CATTTTTTAAAGCCAGTCTTGTTCACATCCATTACTATACATTGAA... 
2848257 114581 TTTTAAAGAGGTACAAAATCTTACAAGGACATAAATTATTATTTGG... 2850501 114582 CAAGGACATAAATTATTATTTGGTTGAAAAATAGCCTATTAGATAT... 2850524 114583 GTAAAATACTTAAAATAACTATGAAGAATTCATATAGGAAGTGTGA... 2850596 114584 CATCAGTGGAATACAAAATGCTTTTTATGTAAATATTGGTAACCTT... 2850826 114585 TTTTTACCACTGATTCAGTAAATCTCCTAACTTTGCAGGAACTGGG... 21717303 114586 TATTTTTTGAGATGCGGTCTTGCTCTGTTTCCCAGTTTGGAATGCA... 21145209 114587 CCTCATGAATGGGATGAGCACTCCTACAAAAAGGATTCCAGAGAGC... 21034500 114588 CTATGAATGAGAAAGTGGGTCCCCACCAGACATTGAATCTGCCGCA... 21034408 114589 AGAAATTATTACCGATTGTAAATTTTACCAGTGGTCTCCAGAGAGG... 16905140 114590 ACCCTTGCCTGGGGTGACTGGAAGAGCTGCTTCTGTAGAATCCTGG... 14773268 114591 AGACCATCTATCTGGACGAATAGGAGAAAAGCCTAGTTGTTGTCTC... 14773219 114592 ACAAATGGATAAACAGAGGAAGGAAGGGATGGAGAAAGAGAGGAGA... 14208316 114593 GATTTCTTTTGACTCCCATGCTAATGCCGTTAACTCCTATAGACCC... 13318680 114594 TCCTATAGACCCTTCTCAGGTGCAGCCAGAGAGACACTAGCCCACT... 13318646 114595 CCTATCCCCCTGCACCCTCTCAGTGCCTCAAGAACTGCTGCAGTTC... 13313328 114596 ACGCGCGGCACAAGCTGCAGCACCACCTGCAAAAGGTGGGGCACCC... 13311189 114597 CCCTGCACAACCTCGAGGACATCCACATGAGCCTGGAGAAGGACGT... 13309498 114598 TTACAGCCCGGGCGACAGAGAAAGACTCCATCTCAAAAAAAAAAAA... 2834743 114599 AGGAGGGAGGACTTGTATGTGAGAAGTGACTTCTACCTTACTGCTT... 2834640 114600 AGGCCTTTGAACAATGTAATATAACTGATTTATAAGCTACTCTAGT... 2834108 114601 ATTTCTGTTGGCTGTGCTTTTTCTTGCGTATAGGCTGTACTTTTTT... 2834035 114602 GATTATTTTGGATAGCTTTGTTTAACGGTGAATGAAATGATTAAAA... 14802687 114603 AAATGTGTAAGTCTGTCGTTTGTTCTTGACTTCTGTCATGTTTTCA... 14804113 114604 GTTTGTTCTTGACTTCTGTCATGTTTTCAAGAATGCATTATGCAAT... 
14804130 cut_mode cut_mode_hg38 chrom strand site_type rpm 0 12788725 12728758 chr1 + 3_most_exon 3.328937 1 94458390 93992834 chr1 - 3_most_exon 0.877850 2 229653545 229517798 chr1 - 3_most_exon 0.984183 3 229652589 229516842 chr1 - 3_most_exon 1.306171 4 229652545 229516798 chr1 - 3_most_exon 1.047152 5 229652329 229516582 chr1 - 3_most_exon 2.855958 6 229652175 229516428 chr1 - 3_most_exon 0.786950 7 229652139 229516392 chr1 - 3_most_exon 0.823096 8 94982938 94517382 chr1 + 3_most_exon 0.942836 9 94982973 94517417 chr1 + 3_most_exon 1.032161 10 94983315 94517759 chr1 + 3_most_exon 0.691924 11 94983671 94518115 chr1 + 3_most_exon 0.897813 12 94983784 94518228 chr1 + 3_most_exon 1.012715 13 94983984 94518428 chr1 + 3_most_exon 1.355078 14 94984010 94518454 chr1 + 3_most_exon 1.552984 15 94984085 94518529 chr1 + 3_most_exon 2.642226 16 94984219 94518663 chr1 + 3_most_exon 2.419415 17 94984907 94519351 chr1 + 3_most_exon 1.366080 18 179076801 179107666 chr1 - 3_most_exon 1.029291 19 179076734 179107599 chr1 - 3_most_exon 1.977591 20 179076275 179107140 chr1 - 3_most_exon 11.955727 21 179073519 179104384 chr1 - 3_most_exon 1.463386 22 179071726 179102591 chr1 - 3_most_exon 1.452910 23 179069558 179100423 chr1 - 3_most_exon 1.254179 24 179068898 179099763 chr1 - 3_most_exon 1.269648 25 179068626 179099491 chr1 - 3_most_exon 1.220517 26 179068543 179099408 chr1 - 3_most_exon 26.100034 27 179068518 179099383 chr1 - 3_most_exon 0.908233 28 179068465 179099330 chr1 - 3_most_exon 39.062476 29 179068345 179099210 chr1 - 3_most_exon 1.294196 ... ... ... ... ... ... ... 
114575 15360265 13248385 chrY - 3_most_exon 1.087289 114576 15356321 13244440 chrY - 3_most_exon 1.461156 114577 15347398 13235517 chrY - 3_most_exon 2.089522 114578 15346466 13234585 chrY - 3_most_exon 4.872050 114579 15345776 13233895 chrY - 3_most_exon 1.447637 114580 2848286 2980245 chrY + 3_most_exon 0.684789 114581 2850530 2982489 chrY + 3_most_exon 4.418879 114582 2850555 2982514 chrY + 3_most_exon 1.460987 114583 2850618 2982577 chrY + 3_most_exon 1.472577 114584 2850847 2982806 chrY + 3_most_exon 1.365811 114585 21717276 19555390 chrY - 3_most_exon 1.638900 114586 21145185 18983299 chrY - 3_most_exon 1.570060 114587 21034482 18872596 chrY - 3_most_exon 1.726768 114588 21034387 18872501 chrY - 3_most_exon 1.622725 114589 16905123 14793243 chrY - 3_most_exon 1.118751 114590 14773246 12661315 chrY - 3_most_exon 3.527876 114591 14773198 12661267 chrY - 3_most_exon 1.662990 114592 14208308 12087602 chrY - 3_most_exon 2.291199 114593 13318659 11162983 chrY - 3_most_exon 8.626132 114594 13318623 11162947 chrY - 3_most_exon 0.828349 114595 13313303 11157627 chrY - 3_most_exon 1.055492 114596 13311165 11155489 chrY - 3_most_exon 1.098569 114597 13309477 11153801 chrY - 3_most_exon 3.041256 114598 2834722 2966681 chrY - 3_most_exon 1.064109 114599 2834619 2966578 chrY - 3_most_exon 3.441034 114600 2834085 2966044 chrY - 3_most_exon 0.482540 114601 2834002 2965961 chrY - 3_most_exon 0.885032 114602 14802713 12690784 chrY + 3_most_exon 3.055810 114603 14804136 12692207 chrY + 3_most_exon 11.683354 114604 14804162 12692233 chrY + 3_most_exon 3.513120 [114605 rows x 16 columns]
#Get PolyADB positions in hg38 coordinates
#NOTE: the code below is wrapped in a string literal and therefore never executes.
#If enabled, it would pair the hg19 and hg38 BED records by gene_id, join them onto df
#through a chrom/pas_pos/gene key, and shift cut_mode by the hg19-to-hg38 pas_pos offset.
#The loaded table already carries a 'cut_mode_hg38' column (see the printed dataframe).
'''
polyadb_bed_hg19 = pd.read_csv("polyadb_coordinates_utr3_hg19.bed", sep='\t', header=None, names=['chrom', 'pas_pos_hg19', 'end', 'gene', 'gene_id', 'strand'])
polyadb_bed_hg38 = pd.read_csv("polyadb_coordinates_utr3_hg38.bed", sep='\t', header=None, names=['chrom', 'pas_pos_hg38', 'end', 'gene', 'gene_id', 'strand'])
polyadb_bed_hg38 = polyadb_bed_hg38.join(polyadb_bed_hg19[['gene_id', 'pas_pos_hg19']].set_index('gene_id'), on='gene_id', how='inner').copy().reset_index(drop=True)
polyadb_bed_hg38['padb_join_id'] = polyadb_bed_hg38['chrom'] + "_" + polyadb_bed_hg38['pas_pos_hg19'].astype(str) + "_" + polyadb_bed_hg38['gene']
df['padb_join_id'] = df['chrom'] + "_" + df['pas_pos'].astype(str) + "_" + df['gene']
df = df.join(polyadb_bed_hg38[['padb_join_id', 'pas_pos_hg38']].set_index("padb_join_id"), on='padb_join_id', how='inner').copy().reset_index(drop=True)
df['cut_mode_hg38'] = df['cut_mode'] - df['pas_pos'] + df['pas_pos_hg38']
'''
#Report how many sites are in the annotation dataframe
print(len(df))
114605
#Store PolyADB cut mode intervals in hg38
#Each interval spans 35 positions on either side of the hg38 cut mode
df['start'] = df['cut_mode_hg38'] - 35
df['end'] = df['cut_mode_hg38'] + 35
polyadb_bed = df[['chrom', 'start', 'end', 'gene', 'gene_id', 'strand']].copy().reset_index(drop=True)
#Drop sites without a usable hg38 coordinate (missing value or negative start)
has_valid_start = polyadb_bed['start'].notnull() & (polyadb_bed['start'] > -1)
polyadb_bed = polyadb_bed.loc[has_valid_start].copy().reset_index(drop=True)
#A single missing coordinate upcasts the column to float, which to_csv would write as e.g. "123.0";
#cast back to integers after filtering so the BED file always holds integer positions
for coord_column in ('start', 'end'):
    polyadb_bed[coord_column] = polyadb_bed[coord_column].astype('int64')
polyadb_bed.to_csv("polyadb_cut_mode_coordinates_hg38.bed", sep='\t', header=False, index=False)
#Load perturb-seq experimental data (knockout gene-level)
#low_memory=False lets pandas infer each column's dtype from the whole file instead of chunk by chunk,
#which is the remedy pandas suggests for the mixed-type DtypeWarning this load otherwise emits
perturb_df = pd.read_csv(
    "perturb/pseudobulk_counts_de_novo_polyA_sites_by_gene.tsv",
    sep='\t',
    low_memory=False,
)
#Keep only rows not flagged as misprimed
perturb_df = perturb_df.query("misprime == False").copy().reset_index(drop=True)
#Turn each site position into a one-position-wide [start, end) interval
site_position = perturb_df['Position'].astype(int)
perturb_df['start'] = site_position
perturb_df['end'] = site_position + 1
perturb_df = perturb_df.rename(columns={'Chromosome' : 'chrom', 'symbol' : 'gene'})
#Prefix chromosome names with 'chr' (the PolyADB table uses names such as 'chr1')
perturb_df['chrom'] = 'chr' + perturb_df['chrom'].astype(str)
#Interval columns first, followed by one count column per condition
perturb_count_columns = [
    'NT', 'CDC73', 'CPSF1', 'CPSF2', 'CPSF3', 'CPSF3L', 'CPSF4', 'CPSF6', 'CSTF1',
    'CSTF3', 'CTR9', 'FIP1L1', 'LEO1', 'NUDT21', 'PABPC1', 'PABPN1', 'PAF1', 'PAPOLA',
    'PCF11', 'RBBP6', 'RPRD1A', 'RPRD1B', 'SCAF8', 'SF3A1', 'SRSF3', 'SYMPK', 'THOC5',
]
perturb_df = perturb_df[['chrom', 'start', 'end', 'gene'] + perturb_count_columns]
/home/jlinder2/anaconda3/envs/tensorflow/lib/python3.6/site-packages/IPython/core/interactiveshell.py:2785: DtypeWarning: Columns (0) have mixed types. Specify dtype option on import or set low_memory=False. interactivity=interactivity, compiler=compiler, result=result)
#Display the perturb-seq count table (notebook cell output)
perturb_df
chrom | start | end | gene | NT | CDC73 | CPSF1 | CPSF2 | CPSF3 | CPSF3L | ... | PAPOLA | PCF11 | RBBP6 | RPRD1A | RPRD1B | SCAF8 | SF3A1 | SRSF3 | SYMPK | THOC5 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | chrX | 293484 | 293485 | PLCXD1 | 349.0 | 66.0 | 49.0 | 31.0 | 21.0 | 15.0 | ... | 114.0 | 12.0 | 57.0 | 53.0 | 205.0 | 179.0 | 3.0 | 45.0 | 7.0 | 35.0 |
1 | chrX | 299444 | 299445 | PLCXD1 | 274.0 | 63.0 | 50.0 | 47.0 | 40.0 | 7.0 | ... | 34.0 | 10.0 | 39.0 | 90.0 | 135.0 | 172.0 | 2.0 | 56.0 | 7.0 | 35.0 |
2 | chrX | 303356 | 303357 | PLCXD1 | 1036.0 | 128.0 | 179.0 | 237.0 | 202.0 | 80.0 | ... | 391.0 | 88.0 | 211.0 | 223.0 | 327.0 | 565.0 | 13.0 | 158.0 | 57.0 | 174.0 |
3 | chrX | 1309922 | 1309923 | CSF2RA | 18.0 | 2.0 | 3.0 | 2.0 | 3.0 | 2.0 | ... | 12.0 | 1.0 | 5.0 | 8.0 | 5.0 | 15.0 | 0.0 | 2.0 | 1.0 | 1.0 |
4 | chrX | 1599220 | 1599221 | AKAP17A | 39.0 | 6.0 | 6.0 | 3.0 | 11.0 | 3.0 | ... | 14.0 | 5.0 | 11.0 | 6.0 | 15.0 | 22.0 | 4.0 | 7.0 | 1.0 | 4.0 |
5 | chrX | 1602520 | 1602521 | AKAP17A | 1019.0 | 162.0 | 178.0 | 209.0 | 226.0 | 140.0 | ... | 353.0 | 126.0 | 293.0 | 169.0 | 422.0 | 575.0 | 71.0 | 125.0 | 61.0 | 156.0 |
6 | chrX | 2717856 | 2717857 | CD99 | 356.0 | 67.0 | 47.0 | 58.0 | 75.0 | 25.0 | ... | 91.0 | 33.0 | 87.0 | 85.0 | 147.0 | 216.0 | 10.0 | 41.0 | 18.0 | 31.0 |
7 | chrX | 2733667 | 2733668 | CD99 | 80.0 | 18.0 | 8.0 | 8.0 | 5.0 | 0.0 | ... | 8.0 | 2.0 | 9.0 | 23.0 | 33.0 | 92.0 | 1.0 | 21.0 | 1.0 | 8.0 |
8 | chrX | 2736437 | 2736438 | CD99 | 40.0 | 4.0 | 4.0 | 8.0 | 6.0 | 1.0 | ... | 9.0 | 3.0 | 12.0 | 6.0 | 16.0 | 23.0 | 0.0 | 2.0 | 2.0 | 4.0 |
9 | chrX | 2741309 | 2741310 | CD99 | 12141.0 | 1397.0 | 1676.0 | 1970.0 | 2428.0 | 816.0 | ... | 3983.0 | 1292.0 | 3508.0 | 2336.0 | 4527.0 | 7130.0 | 273.0 | 853.0 | 544.0 | 1144.0 |
10 | chrX | 2844041 | 2844042 | GYG2 | 28.0 | 17.0 | 4.0 | 2.0 | 4.0 | 2.0 | ... | 10.0 | 1.0 | 2.0 | 4.0 | 18.0 | 18.0 | 0.0 | 5.0 | 0.0 | 0.0 |
11 | chrX | 2882288 | 2882289 | GYG2 | 81.0 | 7.0 | 10.0 | 13.0 | 7.0 | 1.0 | ... | 27.0 | 4.0 | 5.0 | 16.0 | 19.0 | 45.0 | 3.0 | 2.0 | 2.0 | 7.0 |
12 | chrX | 2882818 | 2882819 | GYG2 | 327.0 | 29.0 | 29.0 | 34.0 | 27.0 | 12.0 | ... | 103.0 | 25.0 | 77.0 | 75.0 | 129.0 | 219.0 | 7.0 | 18.0 | 10.0 | 18.0 |
13 | chrX | 7353066 | 7353067 | STS | 97.0 | 14.0 | 17.0 | 26.0 | 29.0 | 13.0 | ... | 22.0 | 9.0 | 32.0 | 24.0 | 44.0 | 54.0 | 0.0 | 4.0 | 10.0 | 15.0 |
14 | chrX | 7354641 | 7354642 | STS | 193.0 | 18.0 | 37.0 | 64.0 | 41.0 | 4.0 | ... | 68.0 | 17.0 | 51.0 | 29.0 | 97.0 | 108.0 | 3.0 | 9.0 | 11.0 | 15.0 |
15 | chrX | 8466510 | 8466511 | VCX3B | 5.0 | 3.0 | 6.0 | 7.0 | 0.0 | 1.0 | ... | 1.0 | 0.0 | 5.0 | 0.0 | 4.0 | 1.0 | 0.0 | 4.0 | 0.0 | 3.0 |
16 | chrX | 9702292 | 9702293 | TBL1X | 73.0 | 11.0 | 17.0 | 12.0 | 10.0 | 4.0 | ... | 28.0 | 6.0 | 21.0 | 25.0 | 30.0 | 46.0 | 2.0 | 5.0 | 2.0 | 8.0 |
17 | chrX | 9719743 | 9719744 | TBL1X | 667.0 | 52.0 | 64.0 | 88.0 | 99.0 | 34.0 | ... | 194.0 | 57.0 | 172.0 | 125.0 | 239.0 | 371.0 | 4.0 | 38.0 | 25.0 | 67.0 |
18 | chrX | 9741053 | 9741054 | TBL1X | 12.0 | 1.0 | 3.0 | 4.0 | 6.0 | 5.0 | ... | 6.0 | 2.0 | 4.0 | 5.0 | 5.0 | 10.0 | 0.0 | 3.0 | 2.0 | 0.0 |
19 | chrX | 9948359 | 9948360 | SHROOM2 | 129.0 | 19.0 | 23.0 | 26.0 | 28.0 | 10.0 | ... | 40.0 | 12.0 | 54.0 | 28.0 | 55.0 | 63.0 | 3.0 | 12.0 | 10.0 | 16.0 |
20 | chrX | 9949443 | 9949444 | SHROOM2 | 220.0 | 24.0 | 43.0 | 67.0 | 58.0 | 20.0 | ... | 96.0 | 26.0 | 56.0 | 46.0 | 119.0 | 144.0 | 6.0 | 17.0 | 15.0 | 18.0 |
21 | chrX | 10016763 | 10016764 | WWC3 | 222.0 | 51.0 | 45.0 | 42.0 | 48.0 | 24.0 | ... | 90.0 | 18.0 | 66.0 | 39.0 | 80.0 | 134.0 | 8.0 | 44.0 | 15.0 | 82.0 |
22 | chrX | 10017211 | 10017212 | WWC3 | 39.0 | 5.0 | 6.0 | 10.0 | 9.0 | 4.0 | ... | 19.0 | 9.0 | 12.0 | 7.0 | 8.0 | 17.0 | 1.0 | 1.0 | 2.0 | 2.0 |
23 | chrX | 10017571 | 10017572 | WWC3 | 11.0 | 4.0 | 5.0 | 14.0 | 3.0 | 1.0 | ... | 7.0 | 1.0 | 8.0 | 5.0 | 1.0 | 3.0 | 0.0 | 4.0 | 3.0 | 3.0 |
24 | chrX | 10144474 | 10144475 | WWC3 | 101.0 | 12.0 | 54.0 | 76.0 | 59.0 | 9.0 | ... | 44.0 | 25.0 | 34.0 | 31.0 | 49.0 | 14.0 | 2.0 | 3.0 | 13.0 | 27.0 |
25 | chrX | 10234779 | 10234780 | CLCN4 | 26.0 | 2.0 | 5.0 | 6.0 | 1.0 | 2.0 | ... | 5.0 | 1.0 | 9.0 | 4.0 | 7.0 | 10.0 | 0.0 | 2.0 | 4.0 | 5.0 |
26 | chrX | 10237652 | 10237653 | CLCN4 | 24.0 | 1.0 | 0.0 | 6.0 | 6.0 | 1.0 | ... | 12.0 | 1.0 | 7.0 | 7.0 | 10.0 | 16.0 | 1.0 | 7.0 | 0.0 | 3.0 |
27 | chrX | 11118730 | 11118731 | HCCS | 379.0 | 65.0 | 44.0 | 47.0 | 55.0 | 39.0 | ... | 112.0 | 25.0 | 93.0 | 62.0 | 165.0 | 249.0 | 21.0 | 34.0 | 14.0 | 47.0 |
28 | chrX | 11121879 | 11121880 | HCCS | 5005.0 | 763.0 | 423.0 | 617.0 | 676.0 | 337.0 | ... | 1190.0 | 297.0 | 1091.0 | 950.0 | 2208.0 | 3124.0 | 118.0 | 604.0 | 172.0 | 465.0 |
29 | chrX | 11122516 | 11122517 | HCCS | 51.0 | 8.0 | 7.0 | 8.0 | 19.0 | 1.0 | ... | 23.0 | 6.0 | 16.0 | 9.0 | 22.0 | 27.0 | 1.0 | 4.0 | 1.0 | 4.0 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
47534 | chr21 | 46301134 | 46301135 | C21orf58 | 229.0 | 34.0 | 46.0 | 64.0 | 58.0 | 12.0 | ... | 112.0 | 17.0 | 65.0 | 55.0 | 113.0 | 99.0 | 6.0 | 15.0 | 21.0 | 35.0 |
47535 | chr21 | 46313176 | 46313177 | C21orf58 | 20.0 | 5.0 | 5.0 | 4.0 | 2.0 | 3.0 | ... | 9.0 | 0.0 | 5.0 | 5.0 | 12.0 | 18.0 | 1.0 | 2.0 | 0.0 | 2.0 |
47536 | chr21 | 46321022 | 46321023 | C21orf58 | 121.0 | 24.0 | 20.0 | 18.0 | 26.0 | 15.0 | ... | 36.0 | 10.0 | 24.0 | 24.0 | 43.0 | 78.0 | 1.0 | 8.0 | 3.0 | 7.0 |
47537 | chr21 | 46321427 | 46321428 | C21orf58 | 32.0 | 4.0 | 1.0 | 3.0 | 4.0 | 1.0 | ... | 15.0 | 1.0 | 2.0 | 7.0 | 10.0 | 12.0 | 1.0 | 1.0 | 1.0 | 1.0 |
47538 | chr21 | 46322416 | 46322417 | C21orf58 | 43.0 | 5.0 | 6.0 | 4.0 | 3.0 | 3.0 | ... | 12.0 | 1.0 | 4.0 | 4.0 | 8.0 | 24.0 | 2.0 | 3.0 | 2.0 | 1.0 |
47539 | chr21 | 46323009 | 46323010 | C21orf58 | 51.0 | 3.0 | 8.0 | 6.0 | 8.0 | 3.0 | ... | 18.0 | 1.0 | 9.0 | 8.0 | 13.0 | 28.0 | 0.0 | 4.0 | 1.0 | 5.0 |
47540 | chrM | 4402 | 4403 | MT-ND1 | 97471.0 | 10727.0 | 17813.0 | 21401.0 | 21728.0 | 5469.0 | ... | 33724.0 | 9194.0 | 22724.0 | 37521.0 | 37739.0 | 55028.0 | 1762.0 | 8064.0 | 5025.0 | 9950.0 |
47541 | chrM | 4874 | 4875 | MT-ND2 | 3442.0 | 781.0 | 1903.0 | 1481.0 | 1072.0 | 388.0 | ... | 1089.0 | 402.0 | 949.0 | 1473.0 | 1192.0 | 1832.0 | 310.0 | 347.0 | 315.0 | 1055.0 |
47542 | chrM | 5516 | 5517 | MT-ND2 | 161152.0 | 26722.0 | 64478.0 | 59672.0 | 45752.0 | 12101.0 | ... | 51116.0 | 18076.0 | 42266.0 | 62672.0 | 56966.0 | 85135.0 | 5327.0 | 12046.0 | 12750.0 | 23927.0 |
47543 | chrM | 5900 | 5901 | MT-ND2 | 7060.0 | 881.0 | 1846.0 | 2011.0 | 2010.0 | 399.0 | ... | 2696.0 | 832.0 | 2200.0 | 1072.0 | 2765.0 | 4054.0 | 169.0 | 742.0 | 535.0 | 927.0 |
47544 | chrM | 6534 | 6535 | MT-CO1 | 69097.0 | 7881.0 | 13739.0 | 15455.0 | 16516.0 | 3338.0 | ... | 23893.0 | 8083.0 | 21089.0 | 12623.0 | 27872.0 | 38585.0 | 1440.0 | 4086.0 | 3665.0 | 7889.0 |
47545 | chrM | 7115 | 7116 | MT-CO1 | 47710.0 | 5792.0 | 10685.0 | 12721.0 | 12303.0 | 3086.0 | ... | 16290.0 | 5501.0 | 15028.0 | 8227.0 | 19079.0 | 27416.0 | 1185.0 | 3189.0 | 3202.0 | 7032.0 |
47546 | chrM | 7439 | 7440 | MT-CO1 | 712899.0 | 66308.0 | 97667.0 | 143284.0 | 162471.0 | 31702.0 | ... | 249056.0 | 76925.0 | 197512.0 | 82165.0 | 302205.0 | 422529.0 | 11362.0 | 40206.0 | 35995.0 | 62462.0 |
47547 | chrM | 8294 | 8295 | MT-CO2 | 1131025.0 | 121149.0 | 187490.0 | 224730.0 | 230681.0 | 53614.0 | ... | 367953.0 | 120726.0 | 311390.0 | 209615.0 | 433037.0 | 609931.0 | 17969.0 | 66104.0 | 50592.0 | 116558.0 |
47548 | chrM | 9207 | 9208 | MT-ATP6 | 769101.0 | 113270.0 | 195132.0 | 204341.0 | 190157.0 | 41156.0 | ... | 254845.0 | 88300.0 | 236634.0 | 232539.0 | 282356.0 | 428224.0 | 14533.0 | 43492.0 | 41806.0 | 77761.0 |
47549 | chrM | 9991 | 9992 | MT-CO3 | 942328.0 | 90177.0 | 168006.0 | 204315.0 | 205105.0 | 38004.0 | ... | 324703.0 | 93999.0 | 254561.0 | 216205.0 | 357633.0 | 516692.0 | 13692.0 | 52770.0 | 44826.0 | 97510.0 |
47550 | chrM | 10404 | 10405 | MT-ND3 | 410615.0 | 70415.0 | 149244.0 | 136603.0 | 112429.0 | 23843.0 | ... | 132119.0 | 47861.0 | 118202.0 | 193483.0 | 143076.0 | 210334.0 | 8604.0 | 27109.0 | 27945.0 | 49768.0 |
47551 | chrM | 10946 | 10947 | MT-ND4 | 24534.0 | 4203.0 | 9287.0 | 8612.0 | 7227.0 | 1450.0 | ... | 8153.0 | 2956.0 | 7533.0 | 8182.0 | 8362.0 | 12710.0 | 661.0 | 1721.0 | 1804.0 | 3891.0 |
47552 | chrM | 11326 | 11327 | MT-ND4 | 15820.0 | 2472.0 | 6056.0 | 5854.0 | 4873.0 | 1031.0 | ... | 5322.0 | 1999.0 | 5367.0 | 6627.0 | 5513.0 | 8638.0 | 539.0 | 1160.0 | 1291.0 | 2943.0 |
47553 | chrM | 12142 | 12143 | MT-ND4 | 499343.0 | 56106.0 | 84547.0 | 107740.0 | 107939.0 | 25895.0 | ... | 164071.0 | 47958.0 | 118091.0 | 105913.0 | 187450.0 | 280727.0 | 8461.0 | 36212.0 | 23365.0 | 55436.0 |
47554 | chrM | 13008 | 13009 | MT-ND5 | 39407.0 | 5570.0 | 8354.0 | 8779.0 | 9235.0 | 1791.0 | ... | 13064.0 | 4394.0 | 11105.0 | 8145.0 | 12860.0 | 22688.0 | 705.0 | 2331.0 | 2130.0 | 4393.0 |
47555 | chrM | 13992 | 13993 | MT-ND5 | 11492.0 | 1407.0 | 2310.0 | 2854.0 | 2965.0 | 686.0 | ... | 3526.0 | 1320.0 | 3074.0 | 1853.0 | 4138.0 | 6628.0 | 268.0 | 960.0 | 711.0 | 1802.0 |
47556 | chrM | 14747 | 14748 | MT-ND5 | 23396.0 | 2596.0 | 2848.0 | 5522.0 | 6476.0 | 978.0 | ... | 8422.0 | 2951.0 | 8120.0 | 2481.0 | 9710.0 | 14555.0 | 323.0 | 1560.0 | 1432.0 | 2311.0 |
47557 | chrM | 15887 | 15888 | MT-CYB | 506445.0 | 51806.0 | 90860.0 | 127597.0 | 131681.0 | 24342.0 | ... | 180604.0 | 60051.0 | 171024.0 | 129651.0 | 204620.0 | 283866.0 | 8134.0 | 30467.0 | 29304.0 | 51724.0 |
47558 | chrM | 16557 | 16558 | MT-CYB | 1544.0 | 236.0 | 416.0 | 493.0 | 480.0 | 64.0 | ... | 513.0 | 198.0 | 599.0 | 254.0 | 524.0 | 908.0 | 44.0 | 80.0 | 129.0 | 192.0 |
47559 | chrM | 12589 | 12590 | MT-ND6 | 5396.0 | 536.0 | 1331.0 | 1280.0 | 1143.0 | 200.0 | ... | 1968.0 | 522.0 | 1493.0 | 1018.0 | 1710.0 | 2749.0 | 77.0 | 269.0 | 283.0 | 602.0 |
47560 | chrM | 13043 | 13044 | MT-ND6 | 5952.0 | 620.0 | 1574.0 | 1435.0 | 1304.0 | 239.0 | ... | 2086.0 | 615.0 | 1515.0 | 1164.0 | 1763.0 | 2948.0 | 102.0 | 395.0 | 293.0 | 647.0 |
47561 | chrM | 13403 | 13404 | MT-ND6 | 5654.0 | 644.0 | 1617.0 | 1515.0 | 1307.0 | 237.0 | ... | 2010.0 | 591.0 | 1593.0 | 1202.0 | 1734.0 | 2794.0 | 93.0 | 317.0 | 278.0 | 649.0 |
47562 | chrM | 13768 | 13769 | MT-ND6 | 12441.0 | 1404.0 | 3770.0 | 3326.0 | 2827.0 | 428.0 | ... | 4441.0 | 1308.0 | 3713.0 | 2896.0 | 4107.0 | 6309.0 | 220.0 | 585.0 | 686.0 | 1150.0 |
47563 | chrM | 14123 | 14124 | MT-ND6 | 38851.0 | 4750.0 | 10819.0 | 10066.0 | 8939.0 | 1075.0 | ... | 14100.0 | 4305.0 | 12045.0 | 9608.0 | 13198.0 | 20205.0 | 518.0 | 1623.0 | 1967.0 | 2904.0 |
47564 rows × 31 columns
#Intersect perturb-seq measurement dataframe against PolyADB V3
perturb_df.to_csv("pseudobulk_counts_de_novo_polyA_sites_by_gene.coordinates.bed", sep='\t', header=False, index=False)
!bedtools intersect -a polyadb_cut_mode_coordinates_hg38.bed -b pseudobulk_counts_de_novo_polyA_sites_by_gene.coordinates.bed -wa -wb > pseudobulk_counts_de_novo_polyA_sites_by_gene_intersect.bed
perturb_bed_hg38 = pd.read_csv("pseudobulk_counts_de_novo_polyA_sites_by_gene_intersect.bed", sep='\t', error_bad_lines=False, index_col=False, names=['chrom', 'start', 'end', 'gene', 'gene_id', 'strand', 'chrom_2', 'start_2', 'end_2', 'gene_2', 'NT', 'CDC73', 'CPSF1', 'CPSF2', 'CPSF3', 'CPSF3L', 'CPSF4', 'CPSF6', 'CSTF1', 'CSTF3', 'CTR9', 'FIP1L1', 'LEO1', 'NUDT21', 'PABPC1', 'PABPN1', 'PAF1', 'PAPOLA', 'PCF11', 'RBBP6', 'RPRD1A', 'RPRD1B', 'SCAF8', 'SF3A1', 'SRSF3', 'SYMPK', 'THOC5'])
perturb_bed_hg38 = perturb_bed_hg38.query("gene == gene_2").copy().reset_index(drop=True)
#Assign count to closest annotated cleavage site
perturb_bed_hg38['se'] = (perturb_bed_hg38['start_2'] - (perturb_bed_hg38['start'] + 30))**2
perturb_bed_hg38 = perturb_bed_hg38.sort_values(by='se', ascending=True).drop_duplicates(subset=['gene_2', 'chrom_2', 'start_2'], keep='first').copy().reset_index(drop=True)
perturb_bed_hg38 = perturb_bed_hg38[['chrom', 'start', 'end', 'gene', 'gene_id', 'strand', 'NT', 'CDC73', 'CPSF1', 'CPSF2', 'CPSF3', 'CPSF3L', 'CPSF4', 'CPSF6', 'CSTF1', 'CSTF3', 'CTR9', 'FIP1L1', 'LEO1', 'NUDT21', 'PABPC1', 'PABPN1', 'PAF1', 'PAPOLA', 'PCF11', 'RBBP6', 'RPRD1A', 'RPRD1B', 'SCAF8', 'SF3A1', 'SRSF3', 'SYMPK', 'THOC5']]
perturb_bed_hg38['sort_index'] = perturb_bed_hg38['start']
perturb_bed_hg38.loc[perturb_bed_hg38['strand'] == '-', 'sort_index'] *= -1
perturb_bed_hg38 = perturb_bed_hg38.sort_values(by='sort_index', ascending=False).drop_duplicates(subset=['gene_id'], keep='first').copy().reset_index()
print("len(perturb_bed_hg38) = " + str(len(perturb_bed_hg38)))
len(perturb_bed_hg38) = 22370
#Append measurements to APA annotation dataframe

# Measurement columns to carry over from perturb_bed_hg38 (one per condition).
count_cols = [
    'NT', 'CDC73', 'CPSF1', 'CPSF2', 'CPSF3', 'CPSF3L', 'CPSF4', 'CPSF6', 'CSTF1',
    'CSTF3', 'CTR9', 'FIP1L1', 'LEO1', 'NUDT21', 'PABPC1', 'PABPN1', 'PAF1', 'PAPOLA',
    'PCF11', 'RBBP6', 'RPRD1A', 'RPRD1B', 'SCAF8', 'SF3A1', 'SRSF3', 'SYMPK', 'THOC5',
]

# Left join on gene_id: every annotated site is kept, matched or not.
df = df.join(perturb_bed_hg38[['gene_id'] + count_cols].set_index('gene_id'), on='gene_id', how='left').copy().reset_index(drop=True)

# Sites without a matching measurement come out of the left join as NaN; count them as zero.
df[count_cols] = df[count_cols].fillna(0)

# Per-site total across all conditions (row-wise sum instead of a Python-level iterrows loop).
df['total_count'] = df[count_cols].sum(axis=1)

#Remove sites with zero total count across all conditions
#(disabled alternative that would instead aggregate the total per gene:)
#df_gene = df.groupby(['gene']).agg({'total_count' : 'sum'}).reset_index().rename(columns={'total_count' : 'total_count_gene'})
#df = df.join(df_gene.set_index("gene"), on='gene', how='inner').copy().reset_index(drop=True)
df = df.query("total_count > 0.").copy().reset_index(drop=True)

print("len(df) = " + str(len(df)))
len(df) = 22370
df
gene | gene_id | sitenum | num_sites | pas | seq | seq_ext | wide_seq | wide_seq_ext | pas_pos | ... | PCF11 | RBBP6 | RPRD1A | RPRD1B | SCAF8 | SF3A1 | SRSF3 | SYMPK | THOC5 | total_count | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | ABCB10 | ABCB10.6 | 2 | 7 | 0 | GTTAAAGATTGAAGCTATTGTCAAATGACAACTTTAAAAAGGCAAT... | TGGAATATTTTAATTAATATAGCATGGCACCTCATTTTCTTTTGCC... | TCAGGTTTTGTATTTTCTTTTCTTGTGGAATATTTTAATTAATATA... | GAGTTTTAATAATTGTAACTTTTTAAATGTCTATAGCACTGAAGTT... | 229653571 | ... | 3.0 | 14.0 | 17.0 | 30.0 | 39.0 | 1.0 | 4.0 | 5.0 | 17.0 | 576.0 |
1 | ABCB10 | ABCB10.3 | 5 | 7 | 0 | TCTGATACATGATGTTCAATTTTATCTTTAGGTAATATTTTATATC... | TGTGCCATAGAAGTATTTACGAAATTGCATTTCATTGTTATGTTTT... | GGTTGAATCTGAGGAAAATAATCCTTGTGCCATAGAAGTATTTACG... | TGTGTTCTTTATAAAGTGTGATTTTCAGAAAGCAAACAACACAATT... | 229652351 | ... | 140.0 | 385.0 | 304.0 | 563.0 | 865.0 | 16.0 | 67.0 | 52.0 | 125.0 | 8657.0 |
2 | ABCD3 | ABCD3.14 | 14 | 15 | 0 | GCCTTGACTTGAAAACATAGATAGTTTAATCTTGACTTGAAAAACA... | TGTTCTTTTTATTCTGGTATCTAAATACTGAGAAGTTCATTTATAA... | TCAGATATCCTATACAACCTTTGCTTGTTCTTTTTATTCTGGTATC... | CGAACTTGTATACTTATTTTCTGTTCAGATTAAAAAAAAAAAAAAA... | 94984196 | ... | 180.0 | 508.0 | 342.0 | 655.0 | 971.0 | 23.0 | 109.0 | 67.0 | 133.0 | 11240.0 |
3 | ABCD3 | ABCD3.15 | 15 | 15 | 0 | ATGAGAAAATAAGTATGAAACAGCAATGGTAGTTTGTTTTGCATTA... | TGCCAAGACATATCACCGTGTTCTCATAATAAGTTTTTACTTTTTA... | TTTCATCCATGAGCACCACGCTGCATGCCAAGACATATCACCGTGT... | GCAGTGGGAAATGGTAGTTTAATCCGAAGAATAAACCAAAGAATAA... | 94984889 | ... | 3.0 | 18.0 | 16.0 | 31.0 | 23.0 | 3.0 | 0.0 | 1.0 | 4.0 | 329.0 |
4 | ABL2 | ABL2.17 | 8 | 24 | 3 | CTGAGGGGAGAGGGAAAAGGACTTGTTTTCCTGTGTTCTTGTTTTC... | TGTGGTGCAGAGGTAGCCACTGTTAGCCTGGTGGGAAAATGCACAC... | TGTCATGTGTACAGGAAATCAGTGATGTGGTGCAGAGGTAGCCACT... | TTCAGCAGCTGCTGGTGTGCCCGGGACAAACCCTGTCCTTAATAAC... | 179076768 | ... | 6.0 | 12.0 | 8.0 | 24.0 | 34.0 | 2.0 | 9.0 | 2.0 | 16.0 | 485.0 |
5 | ABL2 | ABL2.16 | 9 | 24 | 2 | CCACAAGGCCATTGCTGCTGTAATAAGAACTGCAAATCAGAGTGCT... | CAAGAGAAATTTTTGTTCAGGGCTGTTGGAAGTAGCTGTTAGCCTT... | GCAGAAAAGAAAGCTGGGAATGTACCAAGAGAAATTTTTGTTCAGG... | GGTACTAATGGTGATTATGCTCCAATTTACCTAATGAATTTGGTGG... | 179076299 | ... | 0.0 | 4.0 | 1.0 | 3.0 | 9.0 | 1.0 | 0.0 | 2.0 | 1.0 | 82.0 |
6 | ABL2 | ABL2.8 | 17 | 24 | 0 | ACTGCTTTCTCTGTCTTCTCACAAGGTTTGCCAAGTTGTGTTCTGT... | ACTGCTAACAGTGTTAAACTTGATGTAAATAAATGAGGCCCTTGAA... | CTCTCCGTCTGTTGTCTGACTGTGAACTGCTAACAGTGTTAAACTT... | GTTCTTAATTGTTATTGTAATATATTTTCAGTTGTTTTTCTAATTT... | 179068493 | ... | 83.0 | 124.0 | 132.0 | 254.0 | 364.0 | 13.0 | 61.0 | 29.0 | 76.0 | 3961.0 |
7 | ACADM | ACADM.14 | 14 | 19 | 0 | TCTATTGTACACAATCTCATTTCATATGTTTGCATTTTGGCAAAGA... | CTTGCCTTAAATTATTTTTATATGACTGTTGGTCTCTAGGTAGCCT... | CCTTATTTAAAATAAATCAATAAAGCTTGCCTTAAATTATTTTTAT... | CAAGAACTTTCTTGAAAATCTTATTTAATTCTGAGCCCATATTTCA... | 76229155 | ... | 405.0 | 1137.0 | 707.0 | 1187.0 | 1779.0 | 42.0 | 184.0 | 146.0 | 330.0 | 20840.0 |
8 | ACAP3 | ACAP3.3 | 3 | 5 | 2 | CCGGCCTCCTCCGGAGGCACCTTCTCCTGGTACTCGGCCCAGAGCC... | CCACGTGGCTGGCCACGAAGGTCCCCGTGCCAGACAGCCCCAGCCG... | CTGGCTGGACGCGGGCGTCCCAAGGCCACGTGGCTGGCCACGAAGG... | CTGGCGTCGCGGGTGCTGGGCGGGAGGGGCTCTGGCCTGGGTCCTC... | 1228274 | ... | 28.0 | 108.0 | 57.0 | 112.0 | 172.0 | 8.0 | 27.0 | 15.0 | 42.0 | 2090.0 |
9 | ACAP3 | ACAP3.2 | 4 | 5 | 0 | TCTTGCCCCAGGCCCCTGCTGGCGGGTCTCACCCCCCACCCCTCGC... | AAGAACAGAATTGATTCTTGCCCCTCTCCCTGTGTGAGCTTGGCCC... | CTCTTGCCTGCTGCCTGTGACCCTGAAGAACAGAATTGATTCTTGC... | TGGGGAGGCTCCCTGAGGGCACAGTGGGCGCTGGACCCGGCCCCCC... | 1227789 | ... | 49.0 | 99.0 | 73.0 | 160.0 | 187.0 | 12.0 | 44.0 | 22.0 | 40.0 | 2661.0 |
10 | ACBD3 | ACBD3.11 | 12 | 22 | 0 | GTGGATGCTGAAGTTACATGAGCTACATGTTAAATATTTAAAGTCT... | AGATTCCTCAGACTCATCCAGCCCTTGGGTGCTGACCAGCAGAGTC... | AGCATTCATACTTTGGGGTTAAAGGAGATTCCTCAGACTCATCCAG... | TGATGGTTTGTGAACTCTTGCTGGGAATCAAAATTTCCTTGAGACT... | 226334019 | ... | 12.0 | 48.0 | 24.0 | 56.0 | 75.0 | 5.0 | 25.0 | 5.0 | 40.0 | 1490.0 |
11 | ACBD3 | ACBD3.10 | 13 | 22 | 0 | TTTGTTTTGGCTTCATAGAGTATCTCAAATTGAAACTTTTCTGCAC... | TGGTATTCATACTACTAGTAGCAAAATACAGGTTTTTTGTTTTGTT... | AACTTTGAATCCTTGTATCTTTATTTGGTATTCATACTACTAGTAG... | TATCAAGATACGTAGAACACCTCAGAGATTTTTCTTCAGGAACTTC... | 226333494 | ... | 14.0 | 50.0 | 28.0 | 55.0 | 84.0 | 3.0 | 19.0 | 10.0 | 23.0 | 1392.0 |
12 | ACBD3 | ACBD3.2 | 21 | 22 | 0 | ACAGTACAAGTGCGATTTCAAAAAGATCTTGAAAGTAATATATTTA... | AGAATATTTTTGGTTTTAAACTTTCTTATTGCCTTTGGCTGTTGAT... | GCGGTTCCTGTCATGTGTTCATGTCAGAATATTTTTGGTTTTAAAC... | CCTAAAAATATCATTGTTCTTGGGAGCAGTGTATGTTACTTTACAT... | 226332399 | ... | 119.0 | 333.0 | 241.0 | 579.0 | 862.0 | 39.0 | 105.0 | 63.0 | 114.0 | 8577.0 |
13 | ACBD6 | ACBD6.6 | 7 | 12 | 0 | AACTACAAAAATAATACTTCTTTTCCACCCGTCTTTGGTATGTATT... | GACTGGAAAACTGCAGTCTGTAATAGCATAAGGCTTCCATTATGAA... | CACAACTGGCAAGGCTTAATCAAAAGACTGGAAAACTGCAGTCTGT... | CCAGAGGAGGTGACAGGCTGCAAAACAGTTTCTTTGGTGCTGCAGC... | 180257391 | ... | 253.0 | 619.0 | 698.0 | 1173.0 | 2039.0 | 65.0 | 256.0 | 83.0 | 267.0 | 20294.0 |
14 | ACOT7 | ACOT7.1 | 18 | 18 | 0 | GGGAATGCTTCCGAGCACGCTGTAGGGTATGGGAAGAACCCAGCAC... | TGCTACACAGTGTTGTCCCGAGCGCCGGGAGGCGTTGGGCAGAAAC... | TTTATTTATATCATTCCAGTATCAATGCTACACAGTGTTGTCCCGA... | GTATCACAGTGTTAACCTGTACTCTCTCCTGCAAACCTACACACCA... | 6324354 | ... | 635.0 | 1524.0 | 1207.0 | 2112.0 | 3292.0 | 110.0 | 542.0 | 325.0 | 583.0 | 37630.0 |
15 | ACTA1 | ACTA1.1 | 1 | 1 | 0 | TATTTTTCGAAACAAAGCCCTGTGGAAGAAAATGGAAAACTTGAAG... | GACACAGTGTTTATAACGTGTACATACATTAACTTATTACCTCATT... | ACTTCCGTTGCTGCCATCGTAAACTGACACAGTGTTTATAACGTGT... | TGGGGGGGCGGCTGAGCTCCAGCCACCCCGCAGTCACTTTCTTTGT... | 229567020 | ... | 26.0 | 35.0 | 10.0 | 24.0 | 38.0 | 10.0 | 8.0 | 14.0 | 7.0 | 731.0 |
16 | ACTL8 | ACTL8.1 | 1 | 1 | 0 | AGCCTGGGATGCCCTTGCCACCCGTGGTTGGATCTTGTTTTATATC... | ATTTCTGGTCCTACAGGCCCTTTCTGGCCAGGGAGGCATTGCTGCA... | GACTAGGGGATGGGGGACAGTTGACATTTCTGGTCCTACAGGCCCT... | AGTAGGTTTTAACTGGGGTAGCACTCCTGCTAGGAGTCCCAATTAT... | 18153534 | ... | 3.0 | 2.0 | 9.0 | 6.0 | 11.0 | 1.0 | 0.0 | 1.0 | 2.0 | 169.0 |
17 | ACTN2 | ACTN2.4 | 4 | 8 | 0 | GAGCGATCTGTGATGCTGAGCTTCTGTAATCACTCATCCCATCAGA... | GTGCCTGGTGCACTGGATTACGCTGCGTTCTCTTCCGCACTCTACG... | GCCCGCCTACTCGGGCCCAGGCAGTGTGCCTGGTGCACTGGATTAC... | CTGCGTCGGGAGCTGCCCCCGGATCAGGCCCAGTACTGCATCAAGA... | 236925956 | ... | 1.0 | 10.0 | 7.0 | 19.0 | 22.0 | 0.0 | 8.0 | 2.0 | 7.0 | 380.0 |
18 | ACTN2 | ACTN2.5 | 5 | 8 | 0 | TACAAAATACCCAAGATTTAAGACCGGGGGGAAAAAACCACAAATT... | TAGGAAATTAGGAGGATCTAGGGACAGAAGGAAAGTGAAAAATGTG... | TAAACAGAACAAATTACTTGAGTAATAGGAAATTAGGAGGATCTAG... | TTCTGAGTTTTTAGCAAAATGTAATGAAATATCAGGTTGATTTCTT... | 236926307 | ... | 1.0 | 14.0 | 10.0 | 17.0 | 28.0 | 0.0 | 11.0 | 4.0 | 11.0 | 390.0 |
19 | ADAM15 | ADAM15.6 | 6 | 7 | 0 | GGTTGGACGGGATTGAGGAAGGTCCGCACAGCCTGTCTCTGCTCAG... | TCTGCGGACCTGCCGGCGTAGTTGCAGCGGGGGCTTGGGGAGGGGC... | ACCGCCACGCGCTGTCAAGCAACACTCTGCGGACCTGCCGGCGTAG... | CTACCATGACTGAAGGCGCCAGAGACTGGCGGTGTCTTAAGACTCC... | 155035225 | ... | 120.0 | 346.0 | 311.0 | 457.0 | 811.0 | 15.0 | 43.0 | 66.0 | 110.0 | 8455.0 |
20 | ADAR | ADAR.3 | 8 | 10 | 0 | CTTTTATGTGTCCCTTGATAACAGTGACTTAACAATATACATTCCT... | CATAGACTCGGGTACTGTGATGATGGCTGCAGTCCAGTTTTATGAT... | GAGCAGAGTGAGGAAGACCCCCAAGCATAGACTCGGGTACTGTGAT... | CACATTGAAGGGACTTCGTTGGTTTTTTGGAGTCTTGGTTGTGACT... | 154554567 | ... | 488.0 | 1307.0 | 919.0 | 1672.0 | 2662.0 | 88.0 | 310.0 | 257.0 | 442.0 | 30159.0 |
21 | ADGRB2 | ADGRB2.2 | 3 | 4 | 0 | CTGTCCGTCCCTGTCCCGGGCTGGGGAGGGGGGAGGGGAACTTTGT... | CTTGTTTCTCAGAGGCCCCTCAGCCACTGGAACCCCATCTTCAGCC... | TTGGCCTGGGGTCCCAGGGCCCTTCCTTGTTTCTCAGAGGCCCCTC... | GGGCAGGGAGGCGCCGTGGACTCAGCCAGGCTGGGGGAGCCGGACA... | 32192737 | ... | 14.0 | 33.0 | 24.0 | 54.0 | 71.0 | 1.0 | 0.0 | 9.0 | 6.0 | 725.0 |
22 | ADGRL2 | ADGRL2.17 | 17 | 26 | 0 | ATTGCTAGGGTAAAATAAATACATTTGTGTCCAACTGAAATATAAT... | ACAATGAACTATTCTCATGAAAAATGGCTAAAGAAATTATATTTTG... | TTATTTCATATGTTTCCTCAACTGTACAATGAACTATTCTCATGAA... | AAGGCAAAGATTGAAAACATGCTTAACCACTAGCAATCAAGCCACA... | 82457511 | ... | 12.0 | 70.0 | 42.0 | 98.0 | 135.0 | 6.0 | 33.0 | 12.0 | 24.0 | 2003.0 |
23 | ADGRL2 | ADGRL2.21 | 21 | 26 | 0 | ATGAGCCCATCACTAATATCCAGTGTAAAGTTTAACACGGTTTGAC... | TTCAGCAAAATTCTGCTTTTTTTTCATCCCTTTGTGTAAACCTGTT... | AAAGTTCTAATGAAATGTAAATTGTTTCAGCAAAATTCTGCTTTTT... | CTCTTCCATATTCCTTCTGCCTATATTTAGTAATTAATTTATTTTA... | 82458083 | ... | 90.0 | 166.0 | 132.0 | 317.0 | 394.0 | 17.0 | 47.0 | 30.0 | 69.0 | 4259.0 |
24 | ADGRL2 | ADGRL2.22 | 22 | 26 | 2 | CCAGGACTAAAAAAAGAAGGATTGGAAGTTCTGCCATCAAATTTGG... | AAGTTTTAAGAGGGATATATTCTGAAAGCATTTTTGTTTGTTGCAT... | TACATATGCACAAAATCTGCTGGATAAGTTTTAAGAGGGATATATT... | GCCTGTTTGTTCTTTATGCTGAAAGGAATATATGTCTTCCAATTGC... | 82458403 | ... | 1.0 | 7.0 | 4.0 | 7.0 | 7.0 | 0.0 | 2.0 | 5.0 | 1.0 | 117.0 |
25 | ADGRL2 | ADGRL2.25 | 25 | 26 | 0 | TTCTTCCTAGAGACATTCACTCTGTCTTTATTAAAAATAATAATAA... | TCTCTTTGTGATAGTCACAACTGTGAACTTATAAAGTCACATTTTT... | TACAGTTTTCACATTCTTGATTGTTTCTCTTTGTGATAGTCACAAC... | TGTATTTATTAACCCAGTTTCTTTGGGTCTGCCTAGACTTCACCTG... | 82459593 | ... | 2.0 | 11.0 | 4.0 | 23.0 | 33.0 | 0.0 | 2.0 | 3.0 | 5.0 | 262.0 |
26 | ADIPOR1 | ADIPOR1.5 | 3 | 7 | 3 | GAGCCTTCCCACCTGCGGGGTGGAGGAGGAACTTCCCAAGTGCTTT... | GGAATTCCGTTACGGCCTAGAAGGCGGCTGTACTGATGACACCCTT... | ACTTCTATGGAGTCTCCAACCTTCAGGAATTCCGTTACGGCCTAGA... | CCAGTCTCATCAGATTTTCCATGTCCTGGTGGTGGCAGCAGCCTTT... | 202910652 | ... | 6.0 | 18.0 | 22.0 | 65.0 | 69.0 | 1.0 | 13.0 | 7.0 | 12.0 | 989.0 |
27 | ADIPOR1 | ADIPOR1.2 | 6 | 7 | 0 | AAGAAACCTGCTATCATTGCTATGTATCTTGATGCAAAGACTATGA... | GTAAAATTTAAATGGGGAAAGATATTTAATATTTAATACTAAGCTT... | TTTTTTTTTTTCTTGGCAGAGTAATGTAAAATTTAAATGGGGAAAG... | CCGGCTAATCATGGAAGTGTGTCCAGGCTTCAAGTAACTTGAGTTT... | 202909995 | ... | 575.0 | 1550.0 | 1115.0 | 2724.0 | 3575.0 | 159.0 | 633.0 | 328.0 | 784.0 | 40043.0 |
28 | ADORA1 | ADORA1.2 | 2 | 2 | 0 | TCTGTTGGAAATTGGGTGTGCCCTGGCTCCCAAGGGAGGCCCATGT... | ACCTTCTGAACATGAGTGTCAACTCCAGGACTTGCTTCCAAGCCCT... | GGGAGGGGAGGTGGCCGTCGAGTTGACCTTCTGAACATGAGTGTCA... | GTGTGGGAGGGCGAGGCGGGGGATCCTGGAGCCCCTGTGTCGGGGG... | 203136512 | ... | 5.0 | 8.0 | 9.0 | 6.0 | 11.0 | 1.0 | 0.0 | 5.0 | 0.0 | 136.0 |
29 | ADPRHL2 | ADPRHL2.3 | 3 | 3 | 0 | CTGGCTCTTGGCTGCCATGTGGCTTATTAACAGCTTCCAGTGGAAG... | TTGTGTGGATGAAGGGACAGGCACTTGCATCCAGCTGATCTAGGTC... | CCTAGCAGGGTCCCCGAGCAGCAGGTTGTGTGGATGAAGGGACAGG... | TAGCATCATTTCTCCCTGTTGGGTTTTAGCCAGTTTGCCAGCAAGC... | 36559508 | ... | 439.0 | 1126.0 | 836.0 | 1981.0 | 2681.0 | 103.0 | 475.0 | 209.0 | 527.0 | 30706.0 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
22340 | ZFX | ZFX.13 | 13 | 21 | 4 | CTTTTGCAACTTTATGTATGTAAATGTACCCTGAATTATATATATA... | TGAGGACTGCATTTTGGAATCTCCTAGAGGTAACTCATGGCTTATA... | GTTGTGTGCTACAAATGACACTTACTGAGGACTGCATTTTGGAATC... | GCTCATGTTGTATTTTGATTATTCTGTCTGTGCGGCTTCATCTTGG... | 24230364 | ... | 4.0 | 18.0 | 11.0 | 27.0 | 29.0 | 1.0 | 5.0 | 5.0 | 13.0 | 402.0 |
22341 | ZFX | ZFX.15 | 15 | 21 | 0 | AAGTTAATGCTTTTTCTCAAGGAAAAAAAATGTACAGTTTTTGTAA... | TCAGAATATTTTCCTGTTATTGAATGTTTAGTCTATTTGATACCAG... | TGAGAATAACGTGCAAAAATAAAAATCAGAATATTTTCCTGTTATT... | ATAAAAAGAAAATCATATAGGGATGTGTGACATTATTGTAATTGTG... | 24232603 | ... | 93.0 | 261.0 | 161.0 | 346.0 | 520.0 | 28.0 | 78.0 | 61.0 | 112.0 | 5657.0 |
22342 | ZFX | ZFX.17 | 17 | 21 | 0 | TGGATTTGTTATATATTGTTCCTGTTATTTTTGACATCTTTGCTAT... | TTTCATCTTTTGTTTCGTGTATATACTGTTTGCCTTTTTCATAAAA... | AATTAAGTTTACCCTATGGATTTTGTTTCATCTTTTGTTTCGTGTA... | AATAAATCATAACCATTTTGCCACATTCTGTAACTGTTTAGCTAAG... | 24234179 | ... | 54.0 | 99.0 | 72.0 | 130.0 | 226.0 | 9.0 | 12.0 | 18.0 | 21.0 | 2096.0 |
22343 | ZFX | ZFX.19 | 19 | 21 | 2 | TTTTTGTAGTCCATGATTTATCTTGTTATTGCTTATGTAATCTTTT... | GAGTTCTTTGTACATTAAGCCTATTAACTTTCCAATACATTTGCTG... | AATCCTGTGCCCCATTTTCTATTTGGAGTTCTTTGTACATTAAGCC... | TATGTATATAATAGCACTTTAAGTTGTCGCTCTGATAAACTGCCTA... | 24235904 | ... | 6.0 | 16.0 | 10.0 | 15.0 | 18.0 | 1.0 | 3.0 | 4.0 | 4.0 | 344.0 |
22344 | ZIC3 | ZIC3.13 | 13 | 13 | 0 | TTGTGTTCAGAAAGCAATACCCAGGATCCCTTTAATTTTTGTTTGA... | TAAAGTGTTTAAAACTAGAAACATGAATTGTTGTTTTGTTAATTTT... | AATACCAGCATTCTATATGTAGAACTAAAGTGTTTAAAACTAGAAA... | AGTATGCATTATCTCATGCATTGTGTTGACCTTGGTGAAAATCTTT... | 136659831 | ... | 4.0 | 3.0 | 3.0 | 6.0 | 8.0 | 1.0 | 1.0 | 1.0 | 1.0 | 121.0 |
22345 | ZMYM3 | ZMYM3.1 | 3 | 3 | 0 | GAGAAGCTGTTGTCACGACTAACCTTCTGTCTCTGAAATTGTTTGT... | GGCGCCTGGCCTGTCCCTTCAGTGAGCCATGCCCACCCTTGTGTTG... | GTTTAAATTGTATATATTGTTCTGAGGCGCCTGGCCTGTCCCTTCA... | GTTTCCCAGGCTCCTTTTTGTGTTTTTATAACTGTCACCAGTTAGC... | 70459493 | ... | 116.0 | 337.0 | 252.0 | 475.0 | 696.0 | 20.0 | 138.0 | 65.0 | 120.0 | 8211.0 |
22346 | ZNF157 | ZNF157.5 | 5 | 6 | 2 | CCTTGTTGATAAGTATAGGTCTGGCTAGTCATTTATCTCTCCATTC... | AAGTTCTGCAGCTTGCTTTTTTCATGTAATGTTACCTTTCCAAGAT... | CATAAATGGGCTCACACTGTATTTAAAGTTCTGCAGCTTGCTTTTT... | TAGCATGAAGTGTGCAATACATACTACTATATTGTCTTTATATAAT... | 47274001 | ... | 1.0 | 1.0 | 3.0 | 5.0 | 9.0 | 0.0 | 2.0 | 0.0 | 1.0 | 86.0 |
22347 | ZNF182 | ZNF182.2 | 1 | 2 | 0 | GAAATAATATGGTAGATAATTTAGACTTATTTAGTAGAAGTTCTGC... | TACAAACCTTGTTGCTTCAATACAAAGACCCGATAAACACGAATCA... | AGTTTGGAAACATACTTCATCTGAGTACAAACCTTGTTGCTTCAAT... | ATTTTCTGACAAGAAAACAATTATCACCAAGAGTGCTCGTGACTGT... | 47836945 | ... | 3.0 | 8.0 | 8.0 | 25.0 | 32.0 | 0.0 | 8.0 | 5.0 | 5.0 | 380.0 |
22348 | ZNF182 | ZNF182.1 | 2 | 2 | 0 | TTGAAAATGTGGGTGTACTATGTACTATGTGGATGTACTACCTTTT... | GGCTGGGGGGATGGAATTGAGAGGGAGACAACTGTATCCTTTCATA... | CCTTGGGAGGGGAAGGGGCAGTGGTGGCTGGGGGGATGGAATTGAG... | TTCATAAGTTATCTCTGGAAAGAGACATGAGAACCTGGTAACCCTG... | 47834270 | ... | 20.0 | 40.0 | 42.0 | 87.0 | 96.0 | 7.0 | 14.0 | 14.0 | 38.0 | 1251.0 |
22349 | ZNF185 | ZNF185.5 | 5 | 6 | 0 | TTGCTTAGTGTTTCTAATCATACTTAATCCACACTAATGTGCGCAA... | TTTTGCAGATCTGAGGAAGAGGGATGCATTACCTTTTTGCTTCTTT... | CTCTCATGTCTAAAAAGGCACAGAATTTTGCAGATCTGAGGAAGAG... | TTGAAAGAAATCTTGCAAGAGCCATTATTGACTTAGATCCAAAACA... | 152141997 | ... | 10.0 | 27.0 | 16.0 | 56.0 | 63.0 | 4.0 | 5.0 | 7.0 | 18.0 | 576.0 |
22350 | ZNF275 | ZNF275.4 | 4 | 6 | 2 | ATTTTATGTCTACGTATATTGTTCCTTTACTGAACCCACCACATGC... | TAAGATGGGTGAAAGTCGATGCCTTCTAGTCTCAGTGAATTTAACC... | GTTCAAACGTGTGTTCTCTGTTCTCTAAGATGGGTGAAAGTCGATG... | TGCCACTTGGCTGCTTCCTGGCCAAGTCGCACCTGACTGCATGAAC... | 152617885 | ... | 3.0 | 3.0 | 5.0 | 17.0 | 26.0 | 1.0 | 3.0 | 4.0 | 3.0 | 291.0 |
22351 | ZNF275 | ZNF275.6 | 6 | 6 | 0 | GTCTGATCCCCTACCAAATCTAGCACAGTGCCTTGCATCAAGTAGA... | CCCCCACCCATTAAATTGTGAGCTCTTAGAAGACAGGGGTGGCCTT... | GCTTCACCCCCTCCTCCTCAGCCCTCCCCCACCCATTAAATTGTGA... | TTCTGGCACTCACTATAATCAGCCTTGCACTAGAGCTGTTTGTGGA... | 152618362 | ... | 55.0 | 164.0 | 90.0 | 240.0 | 268.0 | 10.0 | 31.0 | 13.0 | 66.0 | 3151.0 |
22352 | ZNF280C | ZNF280C.3 | 4 | 6 | 0 | TTGGTTACATTGAATACAGATTTGCTGAACAGTTTTGATGTTATTT... | GCTCATATATATATAGATGTATATTTTTTTTAATTTCTTGTTTGTT... | GGACCAAAAAATGCACTTGTTTCTTGCTCATATATATATAGATGTA... | GCCATTAATGTAATTCCTCTGGATAAAGATAATATATTCAAAAAAT... | 129336743 | ... | 44.0 | 69.0 | 72.0 | 82.0 | 155.0 | 14.0 | 18.0 | 15.0 | 49.0 | 2016.0 |
22353 | ZNF280C | ZNF280C.1 | 6 | 6 | 0 | TGATTCAGAAATAGCCATGTCATGCATGTGTCCTTTTTTGTTTTCA... | AGCTTGACAATCTGATCCCTCTTCACCTTCAGACTGTTAGTTATTT... | GTTTTAATAGGTGTTAGTGAGTTTTAGCTTGACAATCTGATCCCTC... | TAAAACTATTTATGCAGTCAGTACTAAGCTTACTTGTTATAAGCAG... | 129335349 | ... | 5.0 | 19.0 | 11.0 | 10.0 | 21.0 | 0.0 | 1.0 | 3.0 | 3.0 | 290.0 |
22354 | ZNF41 | ZNF41.16 | 7 | 22 | 2 | CCTAGCTCTAGGCTATGTTACAGAAATATAGTCATTGAATGATACA... | AATGGAATTTTTAAATTTAAAGATATAACTTTATGAATTGAGAAAT... | CTCATGGTATATATATTTTAAGTGCAATGGAATTTTTAAATTTAAA... | ACATAGAAAAGATTTCCATGAAAAACTTTTTTCTTTTCCCTTGGGA... | 47306101 | ... | 6.0 | 19.0 | 9.0 | 22.0 | 26.0 | 0.0 | 1.0 | 3.0 | 15.0 | 331.0 |
22355 | ZNF41 | ZNF41.12 | 11 | 22 | 3 | GATAATTCATTTTCATTGTCATGTTGTATCCCATTCTGTGAATATG... | TCATTCACATTATGTATGTGAGAGTCATCCATATGTTGCATATAGT... | TATACTCTTTAGCATCTGTCTTCTGTCATTCACATTATGTATGTGA... | TTTTGTATTTTATATAAATTGAGTCAATTATATATCATATAATTGA... | 47305154 | ... | 1.0 | 0.0 | 4.0 | 12.0 | 5.0 | 0.0 | 0.0 | 0.0 | 1.0 | 68.0 |
22356 | ZNF41 | ZNF41.2 | 21 | 22 | 4 | TTTAAGTTTTACATCTAGATCTAGCATGTATTTTGAGTTATATGGT... | TTGCCTAACCAAAGATTACAATGATTTTTTCCTGTGTTTTCTTCTA... | TTTGAGTGCTGTATCTAAGAAATCCTTGCCTAACCAAAGATTACAA... | CTTATTTTGATGAAGTCCGTTCTTTCCATTTGTTCATTTCTGGGTT... | 47304199 | ... | 13.0 | 20.0 | 19.0 | 29.0 | 46.0 | 0.0 | 4.0 | 0.0 | 10.0 | 499.0 |
22357 | ZNF449 | ZNF449.3 | 3 | 11 | 0 | TCTATCAGACGTATTGATTATAGCAGTACTATAGTTATTCTGCTGT... | AAATTATGACAATCCTTTTAGAGGTAGGGTCAATATAGTGGATAAA... | TCATAGGTGTAAACATAAAGCATATAAATTATGACAATCCTTTTAG... | TCTTCTTTTTTTAATTACAATGAAAAATTTTGTGTTCCAAGGCAAC... | 134495399 | ... | 3.0 | 4.0 | 3.0 | 8.0 | 17.0 | 2.0 | 5.0 | 1.0 | 4.0 | 192.0 |
22358 | ZNF449 | ZNF449.6 | 6 | 11 | 0 | AGTGCCTTAGAATGGATGTGCCCAACTGCTCTGTATTTATGCAATA... | ATGTAATGGCTTCTCTTTTCTCTCTTGTGGAATTGCATTCAAACCA... | TGAATGTAGAGATGAAAAATACAGAATGTAATGGCTTCTCTTTTCT... | GGCTCAAATTGATATCCCAGTAGCAATAAACATATAATATAGGAGG... | 134497046 | ... | 10.0 | 24.0 | 26.0 | 48.0 | 51.0 | 4.0 | 8.0 | 5.0 | 4.0 | 571.0 |
22359 | ZNF449 | ZNF449.8 | 8 | 11 | 0 | AAGATTAAATGAAATATATTTTGCTCTGGCCCTACACACTGTAAGC... | CCTAGCTATTATAAAGGGGAAATTACAGTACCTACCTCAAAAGTAC... | TAAGGGCAAAATACAGTACCTACCTCCTAGCTATTATAAAGGGGAA... | GGATAATTATACTTCTATGTCTAATTGTACTTCTGAGCATTTCAAA... | 134497629 | ... | 2.0 | 6.0 | 2.0 | 10.0 | 10.0 | 0.0 | 0.0 | 3.0 | 7.0 | 171.0 |
22360 | ZNF630 | ZNF630.4 | 2 | 5 | 0 | AATATGGAATCAATTTGCTCACCCTCAAACTGTCTCAGCCCTCTTC... | CCTTCCACAGAATGATCTGCTTAGGCCCTCAGAATATTTTCAACCC... | TATGTTCCTCAGTATACATGATTACCCTTCCACAGAATGATCTGCT... | CTCACATTGAATAGAAATAGTGACAACTTCTCAACTGTAGAATAGA... | 47917631 | ... | 3.0 | 7.0 | 3.0 | 7.0 | 22.0 | 2.0 | 4.0 | 3.0 | 3.0 | 241.0 |
22361 | ZNF674 | ZNF674.1 | 6 | 6 | 0 | GGATTATTGATGTGTAAAAATTTTTTTGATTGTAGTCTCCAGAAAT... | CTAGGAAAGAAATACACCAAATTATTAAGTAAATTGGCATTTGAAT... | ATCTGTGTAGTGTGGAGGGAAAAAGCTAGGAAAGAAATACACCAAA... | TAGGTGAGTGTGTGTGTATAGATAAACACATGGAACAAAAAGTTAG... | 46357184 | ... | 9.0 | 34.0 | 12.0 | 46.0 | 58.0 | 1.0 | 7.0 | 4.0 | 17.0 | 654.0 |
22362 | ZNF711 | ZNF711.8 | 8 | 11 | 0 | TGTTATGTGGGATTATTATTTCTAAATGTTACTCATTGAAATGAGC... | TGTGTTATGTGGCTGTAAATGATGTACACGCTGTAAAATAAGATCG... | AAATTTGGAATATCTACTAAAATTGTGTGTTATGTGGCTGTAAATG... | AATCAGTTCCTTGAGAATAAATTTTTTATCTTTCTTAACTTCAGAA... | 84528343 | ... | 52.0 | 157.0 | 142.0 | 439.0 | 491.0 | 11.0 | 68.0 | 33.0 | 92.0 | 4818.0 |
22363 | ZNF81 | ZNF81.11 | 11 | 15 | 0 | TTTATTGCATTTCTTCCTCCACTATTCTTCTCTAACAGATGACCAA... | TTGGGCAGCATGATAACAACCCAGCAAAAAGCTAACTGATACATTG... | GTTAACTGTTTTAAGTCACTAAGTTTTGGGCAGCATGATAACAACC... | GCCATCCATCCCAGCCATCCCAAACAACTACCAGAATAATGGATTA... | 47781589 | ... | 1.0 | 9.0 | 5.0 | 14.0 | 12.0 | 1.0 | 4.0 | 1.0 | 6.0 | 168.0 |
22364 | ZNF81 | ZNF81.14 | 14 | 15 | 0 | TCCATTCATGTTTTGATGGAAATTTGTATTTCCAGCTTTTGGCTAT... | TTTTATTGCTAAGTGTTATTTCATTATATGGACATACCAGAATTTG... | GTTATATGCGTCTATGTTCATTCCTTTTTATTGCTAAGTGTTATTT... | TTGTTTGGCTCCTTCCATCCAGCATAATAAGTTTGAGATTCATTCA... | 47784996 | ... | 16.0 | 45.0 | 31.0 | 78.0 | 83.0 | 4.0 | 11.0 | 7.0 | 16.0 | 1011.0 |
22365 | ZRSR2 | ZRSR2.3 | 3 | 4 | 0 | CAGGAGGAGGTCGGGTAATAGAGACAGAACTGTTCAGAGTCCCAAA... | AGGAGCCGCCGCAGCCGGAGCCAAAGTTCCTCTAGGTCCCGAAGTC... | GAGCCGGAGCCGGAGCCGGAGCCGCAGGAGCCGCCGCAGCCGGAGC... | GGAAGAAATAGGGACCGCAGCAGGGACCGCAGCCGGGGCCGGGGCA... | 15841360 | ... | 52.0 | 142.0 | 105.0 | 210.0 | 329.0 | 26.0 | 78.0 | 20.0 | 65.0 | 3824.0 |
22366 | ZXDA | ZXDA.13 | 1 | 13 | 2 | TGTGACCCTCAGCAAGTCACTTAACCTATCTGAGCCTTAATTTCCT... | ATGAGTTTGGAGATCTAAATTCTGATCTTGAGTCTGGAACTGACAA... | TGCAAAAAGAGCACAGCCCTGGACTATGAGTTTGGAGATCTAAATT... | TTGAGGATATTCTGGACTAAATATTTAAGTGCAGTCATTTCTTTTT... | 57934228 | ... | 4.0 | 13.0 | 9.0 | 17.0 | 13.0 | 0.0 | 0.0 | 0.0 | 7.0 | 325.0 |
22367 | ZXDA | ZXDA.5 | 9 | 13 | 0 | TTGTTTATCTTCAGTCCTTGATTTATTAACATTTTGCCAATTGAAA... | CCTTTTTGTTTAAACAATGGAACATGATTTAATTTTATCTCATTTG... | TTCTGTTAGTGTTGGCATAAGCTTGCCTTTTTGTTTAAACAATGGA... | GACATGGCATGTAATTACAGACAGAAGTGTGACTGTAAACATATTA... | 57931885 | ... | 13.0 | 15.0 | 12.0 | 32.0 | 45.0 | 0.0 | 1.0 | 9.0 | 12.0 | 495.0 |
22368 | ZXDB | ZXDB.2 | 2 | 6 | 3 | TGGTTTGAATAGATTGCTTTTAAGGTCTTTCTGCTCTGTGATTCCT... | CAAGTCAGTTAACCTATCTGAGCCTTAATTTCCTTATTTATAAATT... | AACTGACAAGTTGTGTGACCCTGAGCAAGTCAGTTAACCTATCTGA... | ACAGCCCTGGACTACAAGTTTGGAGATTTAAATTCTGATCTTGAGT... | 57621181 | ... | 1.0 | 4.0 | 7.0 | 9.0 | 10.0 | 1.0 | 2.0 | 3.0 | 7.0 | 242.0 |
22369 | ZXDB | ZXDB.5 | 5 | 6 | 0 | GCAGATGAGTATGTCAAGGATTGAGATGAACACATAAGTCTTGGAA... | TGTTGTCATTCATTAAGGCCTCTTAAATAGACCACTATTTTTTGTG... | GTACTCATATAGCATATTTCAAAAATGTTGTCATTCATTAAGGCCT... | TTGATTGTCTACCCAATCAACAGTTTTCCCTCTTTGCTCTGGAAAT... | 57623885 | ... | 28.0 | 75.0 | 67.0 | 132.0 | 211.0 | 10.0 | 13.0 | 18.0 | 38.0 | 2088.0 |
22370 rows × 46 columns
#Make Valid PAS lookup hierarchy
cano_pas1 = 'AATAAA'
cano_pas2 = 'ATTAAA'

_bases = ['A', 'C', 'G', 'T']

# valid_pas[k] maps hexamer -> True for tier k of the hierarchy:
#   0: AATAAA only
#   1: ATTAAA only
#   2: four fixed single-base variants of AATAAA
#   3: every hexamer obtained by rewriting one position of AATAAA (AATAAA itself included)
#   4: every hexamer obtained by rewriting two positions of AATAAA (tier 3 keys included)
valid_pas = [
    {'AATAAA': True},
    {'ATTAAA': True},
    {hexamer: True for hexamer in ['AGTAAA', 'TATAAA', 'CATAAA', 'GATAAA']},
    {
        cano_pas1[:i] + b + cano_pas1[i+1:]: True
        for i in range(6)
        for b in _bases
    },
    {
        cano_pas1[:i] + b_i + cano_pas1[i+1:j] + b_j + cano_pas1[j+1:]: True
        for i in range(6)
        for j in range(i + 1, 6)
        for b_i in _bases
        for b_j in _bases
    },
]
#Global dataframe generation

# For each PAS tier t in {4, 3, 2, 1}, a site "qualifies" when pas != -1 and pas <= t.
# Two columns are added per tier:
#   sitenum_pas_t   : 1-based running position of the site among the qualifying sites of
#                     its gene, counted in the current row order of df; -1 if it does not qualify.
#   num_sites_pas_t : number of qualifying sites in the site's gene (same value on every
#                     row of the gene, including non-qualifying rows).

# Integer gene codes; pd.factorize maps missing gene names to the single code -1, so such
# rows are still grouped together instead of being dropped by groupby.
gene_codes = pd.factorize(df['gene'])[0]

pas_tiers = [4, 3, 2, 1]

sitenum_by_tier = {}
num_sites_by_tier = {}
for pas_tier in pas_tiers :
    qualifies = (df['pas'] != -1) & (df['pas'] <= pas_tier)
    per_gene = qualifies.astype(int).groupby(gene_codes, sort=False)

    # The running count within the gene is the 1-based rank of each qualifying row.
    sitenum_by_tier[pas_tier] = np.where(qualifies.values, per_gene.cumsum().values, -1)
    num_sites_by_tier[pas_tier] = per_gene.transform('sum').values

# Assign in two passes so the column order stays: all sitenum_pas_* first, then all num_sites_pas_*.
for pas_tier in pas_tiers :
    df['sitenum_pas_' + str(pas_tier)] = sitenum_by_tier[pas_tier]
for pas_tier in pas_tiers :
    df['num_sites_pas_' + str(pas_tier)] = num_sites_by_tier[pas_tier]
# Disabled column sub-selection, kept as an inert string literal (it is never executed).
'''
df = df[['gene',
'gene_id',
'sitenum',
'num_sites',
'sitenum_pas_4',
'num_sites_pas_4',
'sitenum_pas_3',
'num_sites_pas_3',
'sitenum_pas_2',
'num_sites_pas_2',
'sitenum_pas_1',
'num_sites_pas_1',
'pas',
'seq',
'pas_pos',
'cut_mode',
'chrom',
'strand']]
'''

# Order the sites by chromosome, then gene, then site number, and renumber the rows.
sort_keys = ['chrom', 'gene', 'sitenum']
df = df.sort_values(by=sort_keys)
df = df.copy().reset_index(drop=True)

# Show both ends of the sorted table, followed by the row count.
for preview in (df.head(), df.tail()) :
    print(preview)
print('Total number of members: ' + str(len(df)))
gene gene_id sitenum num_sites pas \ 0 ABCB10 ABCB10.6 2 7 0 1 ABCB10 ABCB10.3 5 7 0 2 ABCD3 ABCD3.14 14 15 0 3 ABCD3 ABCD3.15 15 15 0 4 ABL2 ABL2.17 8 24 3 seq \ 0 GTTAAAGATTGAAGCTATTGTCAAATGACAACTTTAAAAAGGCAAT... 1 TCTGATACATGATGTTCAATTTTATCTTTAGGTAATATTTTATATC... 2 GCCTTGACTTGAAAACATAGATAGTTTAATCTTGACTTGAAAAACA... 3 ATGAGAAAATAAGTATGAAACAGCAATGGTAGTTTGTTTTGCATTA... 4 CTGAGGGGAGAGGGAAAAGGACTTGTTTTCCTGTGTTCTTGTTTTC... seq_ext \ 0 TGGAATATTTTAATTAATATAGCATGGCACCTCATTTTCTTTTGCC... 1 TGTGCCATAGAAGTATTTACGAAATTGCATTTCATTGTTATGTTTT... 2 TGTTCTTTTTATTCTGGTATCTAAATACTGAGAAGTTCATTTATAA... 3 TGCCAAGACATATCACCGTGTTCTCATAATAAGTTTTTACTTTTTA... 4 TGTGGTGCAGAGGTAGCCACTGTTAGCCTGGTGGGAAAATGCACAC... wide_seq \ 0 TCAGGTTTTGTATTTTCTTTTCTTGTGGAATATTTTAATTAATATA... 1 GGTTGAATCTGAGGAAAATAATCCTTGTGCCATAGAAGTATTTACG... 2 TCAGATATCCTATACAACCTTTGCTTGTTCTTTTTATTCTGGTATC... 3 TTTCATCCATGAGCACCACGCTGCATGCCAAGACATATCACCGTGT... 4 TGTCATGTGTACAGGAAATCAGTGATGTGGTGCAGAGGTAGCCACT... wide_seq_ext pas_pos \ 0 GAGTTTTAATAATTGTAACTTTTTAAATGTCTATAGCACTGAAGTT... 229653571 1 TGTGTTCTTTATAAAGTGTGATTTTCAGAAAGCAAACAACACAATT... 229652351 2 CGAACTTGTATACTTATTTTCTGTTCAGATTAAAAAAAAAAAAAAA... 94984196 3 GCAGTGGGAAATGGTAGTTTAATCCGAAGAATAAACCAAAGAATAA... 94984889 4 TTCAGCAGCTGCTGGTGTGCCCGGGACAAACCCTGTCCTTAATAAC... 179076768 ... THOC5 total_count sitenum_pas_4 sitenum_pas_3 \ 0 ... 17.0 576.0 1 1 1 ... 125.0 8657.0 2 2 2 ... 133.0 11240.0 1 1 3 ... 4.0 329.0 2 2 4 ... 16.0 485.0 1 1 sitenum_pas_2 sitenum_pas_1 num_sites_pas_4 num_sites_pas_3 \ 0 1 1 2 2 1 2 2 2 2 2 1 1 2 2 3 2 2 2 2 4 -1 -1 3 3 num_sites_pas_2 num_sites_pas_1 0 2 2 1 2 2 2 2 2 3 2 2 4 2 1 [5 rows x 54 columns] gene gene_id sitenum num_sites pas \ 22365 ZRSR2 ZRSR2.3 3 4 0 22366 ZXDA ZXDA.13 1 13 2 22367 ZXDA ZXDA.5 9 13 0 22368 ZXDB ZXDB.2 2 6 3 22369 ZXDB ZXDB.5 5 6 0 seq \ 22365 CAGGAGGAGGTCGGGTAATAGAGACAGAACTGTTCAGAGTCCCAAA... 22366 TGTGACCCTCAGCAAGTCACTTAACCTATCTGAGCCTTAATTTCCT... 22367 TTGTTTATCTTCAGTCCTTGATTTATTAACATTTTGCCAATTGAAA... 
22368 TGGTTTGAATAGATTGCTTTTAAGGTCTTTCTGCTCTGTGATTCCT... 22369 GCAGATGAGTATGTCAAGGATTGAGATGAACACATAAGTCTTGGAA... seq_ext \ 22365 AGGAGCCGCCGCAGCCGGAGCCAAAGTTCCTCTAGGTCCCGAAGTC... 22366 ATGAGTTTGGAGATCTAAATTCTGATCTTGAGTCTGGAACTGACAA... 22367 CCTTTTTGTTTAAACAATGGAACATGATTTAATTTTATCTCATTTG... 22368 CAAGTCAGTTAACCTATCTGAGCCTTAATTTCCTTATTTATAAATT... 22369 TGTTGTCATTCATTAAGGCCTCTTAAATAGACCACTATTTTTTGTG... wide_seq \ 22365 GAGCCGGAGCCGGAGCCGGAGCCGCAGGAGCCGCCGCAGCCGGAGC... 22366 TGCAAAAAGAGCACAGCCCTGGACTATGAGTTTGGAGATCTAAATT... 22367 TTCTGTTAGTGTTGGCATAAGCTTGCCTTTTTGTTTAAACAATGGA... 22368 AACTGACAAGTTGTGTGACCCTGAGCAAGTCAGTTAACCTATCTGA... 22369 GTACTCATATAGCATATTTCAAAAATGTTGTCATTCATTAAGGCCT... wide_seq_ext pas_pos \ 22365 GGAAGAAATAGGGACCGCAGCAGGGACCGCAGCCGGGGCCGGGGCA... 15841360 22366 TTGAGGATATTCTGGACTAAATATTTAAGTGCAGTCATTTCTTTTT... 57934228 22367 GACATGGCATGTAATTACAGACAGAAGTGTGACTGTAAACATATTA... 57931885 22368 ACAGCCCTGGACTACAAGTTTGGAGATTTAAATTCTGATCTTGAGT... 57621181 22369 TTGATTGTCTACCCAATCAACAGTTTTCCCTCTTTGCTCTGGAAAT... 57623885 ... THOC5 total_count sitenum_pas_4 sitenum_pas_3 \ 22365 ... 65.0 3824.0 1 1 22366 ... 7.0 325.0 1 1 22367 ... 12.0 495.0 2 2 22368 ... 7.0 242.0 1 1 22369 ... 38.0 2088.0 2 2 sitenum_pas_2 sitenum_pas_1 num_sites_pas_4 num_sites_pas_3 \ 22365 1 1 1 1 22366 1 -1 2 2 22367 2 1 2 2 22368 -1 -1 2 2 22369 1 1 2 2 num_sites_pas_2 num_sites_pas_1 22365 1 1 22366 2 1 22367 2 1 22368 1 1 22369 1 1 [5 rows x 54 columns] Total number of members: 22370
df
gene | gene_id | sitenum | num_sites | pas | seq | seq_ext | wide_seq | wide_seq_ext | pas_pos | ... | THOC5 | total_count | sitenum_pas_4 | sitenum_pas_3 | sitenum_pas_2 | sitenum_pas_1 | num_sites_pas_4 | num_sites_pas_3 | num_sites_pas_2 | num_sites_pas_1 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | ABCB10 | ABCB10.6 | 2 | 7 | 0 | GTTAAAGATTGAAGCTATTGTCAAATGACAACTTTAAAAAGGCAAT... | TGGAATATTTTAATTAATATAGCATGGCACCTCATTTTCTTTTGCC... | TCAGGTTTTGTATTTTCTTTTCTTGTGGAATATTTTAATTAATATA... | GAGTTTTAATAATTGTAACTTTTTAAATGTCTATAGCACTGAAGTT... | 229653571 | ... | 17.0 | 576.0 | 1 | 1 | 1 | 1 | 2 | 2 | 2 | 2 |
1 | ABCB10 | ABCB10.3 | 5 | 7 | 0 | TCTGATACATGATGTTCAATTTTATCTTTAGGTAATATTTTATATC... | TGTGCCATAGAAGTATTTACGAAATTGCATTTCATTGTTATGTTTT... | GGTTGAATCTGAGGAAAATAATCCTTGTGCCATAGAAGTATTTACG... | TGTGTTCTTTATAAAGTGTGATTTTCAGAAAGCAAACAACACAATT... | 229652351 | ... | 125.0 | 8657.0 | 2 | 2 | 2 | 2 | 2 | 2 | 2 | 2 |
2 | ABCD3 | ABCD3.14 | 14 | 15 | 0 | GCCTTGACTTGAAAACATAGATAGTTTAATCTTGACTTGAAAAACA... | TGTTCTTTTTATTCTGGTATCTAAATACTGAGAAGTTCATTTATAA... | TCAGATATCCTATACAACCTTTGCTTGTTCTTTTTATTCTGGTATC... | CGAACTTGTATACTTATTTTCTGTTCAGATTAAAAAAAAAAAAAAA... | 94984196 | ... | 133.0 | 11240.0 | 1 | 1 | 1 | 1 | 2 | 2 | 2 | 2 |
3 | ABCD3 | ABCD3.15 | 15 | 15 | 0 | ATGAGAAAATAAGTATGAAACAGCAATGGTAGTTTGTTTTGCATTA... | TGCCAAGACATATCACCGTGTTCTCATAATAAGTTTTTACTTTTTA... | TTTCATCCATGAGCACCACGCTGCATGCCAAGACATATCACCGTGT... | GCAGTGGGAAATGGTAGTTTAATCCGAAGAATAAACCAAAGAATAA... | 94984889 | ... | 4.0 | 329.0 | 2 | 2 | 2 | 2 | 2 | 2 | 2 | 2 |
4 | ABL2 | ABL2.17 | 8 | 24 | 3 | CTGAGGGGAGAGGGAAAAGGACTTGTTTTCCTGTGTTCTTGTTTTC... | TGTGGTGCAGAGGTAGCCACTGTTAGCCTGGTGGGAAAATGCACAC... | TGTCATGTGTACAGGAAATCAGTGATGTGGTGCAGAGGTAGCCACT... | TTCAGCAGCTGCTGGTGTGCCCGGGACAAACCCTGTCCTTAATAAC... | 179076768 | ... | 16.0 | 485.0 | 1 | 1 | -1 | -1 | 3 | 3 | 2 | 1 |
5 | ABL2 | ABL2.16 | 9 | 24 | 2 | CCACAAGGCCATTGCTGCTGTAATAAGAACTGCAAATCAGAGTGCT... | CAAGAGAAATTTTTGTTCAGGGCTGTTGGAAGTAGCTGTTAGCCTT... | GCAGAAAAGAAAGCTGGGAATGTACCAAGAGAAATTTTTGTTCAGG... | GGTACTAATGGTGATTATGCTCCAATTTACCTAATGAATTTGGTGG... | 179076299 | ... | 1.0 | 82.0 | 2 | 2 | 1 | -1 | 3 | 3 | 2 | 1 |
6 | ABL2 | ABL2.8 | 17 | 24 | 0 | ACTGCTTTCTCTGTCTTCTCACAAGGTTTGCCAAGTTGTGTTCTGT... | ACTGCTAACAGTGTTAAACTTGATGTAAATAAATGAGGCCCTTGAA... | CTCTCCGTCTGTTGTCTGACTGTGAACTGCTAACAGTGTTAAACTT... | GTTCTTAATTGTTATTGTAATATATTTTCAGTTGTTTTTCTAATTT... | 179068493 | ... | 76.0 | 3961.0 | 3 | 3 | 2 | 1 | 3 | 3 | 2 | 1 |
7 | ACADM | ACADM.14 | 14 | 19 | 0 | TCTATTGTACACAATCTCATTTCATATGTTTGCATTTTGGCAAAGA... | CTTGCCTTAAATTATTTTTATATGACTGTTGGTCTCTAGGTAGCCT... | CCTTATTTAAAATAAATCAATAAAGCTTGCCTTAAATTATTTTTAT... | CAAGAACTTTCTTGAAAATCTTATTTAATTCTGAGCCCATATTTCA... | 76229155 | ... | 330.0 | 20840.0 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 |
8 | ACAP3 | ACAP3.3 | 3 | 5 | 2 | CCGGCCTCCTCCGGAGGCACCTTCTCCTGGTACTCGGCCCAGAGCC... | CCACGTGGCTGGCCACGAAGGTCCCCGTGCCAGACAGCCCCAGCCG... | CTGGCTGGACGCGGGCGTCCCAAGGCCACGTGGCTGGCCACGAAGG... | CTGGCGTCGCGGGTGCTGGGCGGGAGGGGCTCTGGCCTGGGTCCTC... | 1228274 | ... | 42.0 | 2090.0 | 1 | 1 | 1 | -1 | 2 | 2 | 2 | 1 |
9 | ACAP3 | ACAP3.2 | 4 | 5 | 0 | TCTTGCCCCAGGCCCCTGCTGGCGGGTCTCACCCCCCACCCCTCGC... | AAGAACAGAATTGATTCTTGCCCCTCTCCCTGTGTGAGCTTGGCCC... | CTCTTGCCTGCTGCCTGTGACCCTGAAGAACAGAATTGATTCTTGC... | TGGGGAGGCTCCCTGAGGGCACAGTGGGCGCTGGACCCGGCCCCCC... | 1227789 | ... | 40.0 | 2661.0 | 2 | 2 | 2 | 1 | 2 | 2 | 2 | 1 |
10 | ACBD3 | ACBD3.11 | 12 | 22 | 0 | GTGGATGCTGAAGTTACATGAGCTACATGTTAAATATTTAAAGTCT... | AGATTCCTCAGACTCATCCAGCCCTTGGGTGCTGACCAGCAGAGTC... | AGCATTCATACTTTGGGGTTAAAGGAGATTCCTCAGACTCATCCAG... | TGATGGTTTGTGAACTCTTGCTGGGAATCAAAATTTCCTTGAGACT... | 226334019 | ... | 40.0 | 1490.0 | 1 | 1 | 1 | 1 | 3 | 3 | 3 | 3 |
11 | ACBD3 | ACBD3.10 | 13 | 22 | 0 | TTTGTTTTGGCTTCATAGAGTATCTCAAATTGAAACTTTTCTGCAC... | TGGTATTCATACTACTAGTAGCAAAATACAGGTTTTTTGTTTTGTT... | AACTTTGAATCCTTGTATCTTTATTTGGTATTCATACTACTAGTAG... | TATCAAGATACGTAGAACACCTCAGAGATTTTTCTTCAGGAACTTC... | 226333494 | ... | 23.0 | 1392.0 | 2 | 2 | 2 | 2 | 3 | 3 | 3 | 3 |
12 | ACBD3 | ACBD3.2 | 21 | 22 | 0 | ACAGTACAAGTGCGATTTCAAAAAGATCTTGAAAGTAATATATTTA... | AGAATATTTTTGGTTTTAAACTTTCTTATTGCCTTTGGCTGTTGAT... | GCGGTTCCTGTCATGTGTTCATGTCAGAATATTTTTGGTTTTAAAC... | CCTAAAAATATCATTGTTCTTGGGAGCAGTGTATGTTACTTTACAT... | 226332399 | ... | 114.0 | 8577.0 | 3 | 3 | 3 | 3 | 3 | 3 | 3 | 3 |
13 | ACBD6 | ACBD6.6 | 7 | 12 | 0 | AACTACAAAAATAATACTTCTTTTCCACCCGTCTTTGGTATGTATT... | GACTGGAAAACTGCAGTCTGTAATAGCATAAGGCTTCCATTATGAA... | CACAACTGGCAAGGCTTAATCAAAAGACTGGAAAACTGCAGTCTGT... | CCAGAGGAGGTGACAGGCTGCAAAACAGTTTCTTTGGTGCTGCAGC... | 180257391 | ... | 267.0 | 20294.0 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 |
14 | ACOT7 | ACOT7.1 | 18 | 18 | 0 | GGGAATGCTTCCGAGCACGCTGTAGGGTATGGGAAGAACCCAGCAC... | TGCTACACAGTGTTGTCCCGAGCGCCGGGAGGCGTTGGGCAGAAAC... | TTTATTTATATCATTCCAGTATCAATGCTACACAGTGTTGTCCCGA... | GTATCACAGTGTTAACCTGTACTCTCTCCTGCAAACCTACACACCA... | 6324354 | ... | 583.0 | 37630.0 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 |
15 | ACTA1 | ACTA1.1 | 1 | 1 | 0 | TATTTTTCGAAACAAAGCCCTGTGGAAGAAAATGGAAAACTTGAAG... | GACACAGTGTTTATAACGTGTACATACATTAACTTATTACCTCATT... | ACTTCCGTTGCTGCCATCGTAAACTGACACAGTGTTTATAACGTGT... | TGGGGGGGCGGCTGAGCTCCAGCCACCCCGCAGTCACTTTCTTTGT... | 229567020 | ... | 7.0 | 731.0 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 |
16 | ACTL8 | ACTL8.1 | 1 | 1 | 0 | AGCCTGGGATGCCCTTGCCACCCGTGGTTGGATCTTGTTTTATATC... | ATTTCTGGTCCTACAGGCCCTTTCTGGCCAGGGAGGCATTGCTGCA... | GACTAGGGGATGGGGGACAGTTGACATTTCTGGTCCTACAGGCCCT... | AGTAGGTTTTAACTGGGGTAGCACTCCTGCTAGGAGTCCCAATTAT... | 18153534 | ... | 2.0 | 169.0 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 |
17 | ACTN2 | ACTN2.4 | 4 | 8 | 0 | GAGCGATCTGTGATGCTGAGCTTCTGTAATCACTCATCCCATCAGA... | GTGCCTGGTGCACTGGATTACGCTGCGTTCTCTTCCGCACTCTACG... | GCCCGCCTACTCGGGCCCAGGCAGTGTGCCTGGTGCACTGGATTAC... | CTGCGTCGGGAGCTGCCCCCGGATCAGGCCCAGTACTGCATCAAGA... | 236925956 | ... | 7.0 | 380.0 | 1 | 1 | 1 | 1 | 2 | 2 | 2 | 2 |
18 | ACTN2 | ACTN2.5 | 5 | 8 | 0 | TACAAAATACCCAAGATTTAAGACCGGGGGGAAAAAACCACAAATT... | TAGGAAATTAGGAGGATCTAGGGACAGAAGGAAAGTGAAAAATGTG... | TAAACAGAACAAATTACTTGAGTAATAGGAAATTAGGAGGATCTAG... | TTCTGAGTTTTTAGCAAAATGTAATGAAATATCAGGTTGATTTCTT... | 236926307 | ... | 11.0 | 390.0 | 2 | 2 | 2 | 2 | 2 | 2 | 2 | 2 |
19 | ADAM15 | ADAM15.6 | 6 | 7 | 0 | GGTTGGACGGGATTGAGGAAGGTCCGCACAGCCTGTCTCTGCTCAG... | TCTGCGGACCTGCCGGCGTAGTTGCAGCGGGGGCTTGGGGAGGGGC... | ACCGCCACGCGCTGTCAAGCAACACTCTGCGGACCTGCCGGCGTAG... | CTACCATGACTGAAGGCGCCAGAGACTGGCGGTGTCTTAAGACTCC... | 155035225 | ... | 110.0 | 8455.0 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 |
20 | ADAR | ADAR.3 | 8 | 10 | 0 | CTTTTATGTGTCCCTTGATAACAGTGACTTAACAATATACATTCCT... | CATAGACTCGGGTACTGTGATGATGGCTGCAGTCCAGTTTTATGAT... | GAGCAGAGTGAGGAAGACCCCCAAGCATAGACTCGGGTACTGTGAT... | CACATTGAAGGGACTTCGTTGGTTTTTTGGAGTCTTGGTTGTGACT... | 154554567 | ... | 442.0 | 30159.0 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 |
21 | ADGRB2 | ADGRB2.2 | 3 | 4 | 0 | CTGTCCGTCCCTGTCCCGGGCTGGGGAGGGGGGAGGGGAACTTTGT... | CTTGTTTCTCAGAGGCCCCTCAGCCACTGGAACCCCATCTTCAGCC... | TTGGCCTGGGGTCCCAGGGCCCTTCCTTGTTTCTCAGAGGCCCCTC... | GGGCAGGGAGGCGCCGTGGACTCAGCCAGGCTGGGGGAGCCGGACA... | 32192737 | ... | 6.0 | 725.0 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 |
22 | ADGRL2 | ADGRL2.17 | 17 | 26 | 0 | ATTGCTAGGGTAAAATAAATACATTTGTGTCCAACTGAAATATAAT... | ACAATGAACTATTCTCATGAAAAATGGCTAAAGAAATTATATTTTG... | TTATTTCATATGTTTCCTCAACTGTACAATGAACTATTCTCATGAA... | AAGGCAAAGATTGAAAACATGCTTAACCACTAGCAATCAAGCCACA... | 82457511 | ... | 24.0 | 2003.0 | 1 | 1 | 1 | 1 | 4 | 4 | 4 | 3 |
23 | ADGRL2 | ADGRL2.21 | 21 | 26 | 0 | ATGAGCCCATCACTAATATCCAGTGTAAAGTTTAACACGGTTTGAC... | TTCAGCAAAATTCTGCTTTTTTTTCATCCCTTTGTGTAAACCTGTT... | AAAGTTCTAATGAAATGTAAATTGTTTCAGCAAAATTCTGCTTTTT... | CTCTTCCATATTCCTTCTGCCTATATTTAGTAATTAATTTATTTTA... | 82458083 | ... | 69.0 | 4259.0 | 2 | 2 | 2 | 2 | 4 | 4 | 4 | 3 |
24 | ADGRL2 | ADGRL2.22 | 22 | 26 | 2 | CCAGGACTAAAAAAAGAAGGATTGGAAGTTCTGCCATCAAATTTGG... | AAGTTTTAAGAGGGATATATTCTGAAAGCATTTTTGTTTGTTGCAT... | TACATATGCACAAAATCTGCTGGATAAGTTTTAAGAGGGATATATT... | GCCTGTTTGTTCTTTATGCTGAAAGGAATATATGTCTTCCAATTGC... | 82458403 | ... | 1.0 | 117.0 | 3 | 3 | 3 | -1 | 4 | 4 | 4 | 3 |
25 | ADGRL2 | ADGRL2.25 | 25 | 26 | 0 | TTCTTCCTAGAGACATTCACTCTGTCTTTATTAAAAATAATAATAA... | TCTCTTTGTGATAGTCACAACTGTGAACTTATAAAGTCACATTTTT... | TACAGTTTTCACATTCTTGATTGTTTCTCTTTGTGATAGTCACAAC... | TGTATTTATTAACCCAGTTTCTTTGGGTCTGCCTAGACTTCACCTG... | 82459593 | ... | 5.0 | 262.0 | 4 | 4 | 4 | 3 | 4 | 4 | 4 | 3 |
26 | ADIPOR1 | ADIPOR1.5 | 3 | 7 | 3 | GAGCCTTCCCACCTGCGGGGTGGAGGAGGAACTTCCCAAGTGCTTT... | GGAATTCCGTTACGGCCTAGAAGGCGGCTGTACTGATGACACCCTT... | ACTTCTATGGAGTCTCCAACCTTCAGGAATTCCGTTACGGCCTAGA... | CCAGTCTCATCAGATTTTCCATGTCCTGGTGGTGGCAGCAGCCTTT... | 202910652 | ... | 12.0 | 989.0 | 1 | 1 | -1 | -1 | 2 | 2 | 1 | 1 |
27 | ADIPOR1 | ADIPOR1.2 | 6 | 7 | 0 | AAGAAACCTGCTATCATTGCTATGTATCTTGATGCAAAGACTATGA... | GTAAAATTTAAATGGGGAAAGATATTTAATATTTAATACTAAGCTT... | TTTTTTTTTTTCTTGGCAGAGTAATGTAAAATTTAAATGGGGAAAG... | CCGGCTAATCATGGAAGTGTGTCCAGGCTTCAAGTAACTTGAGTTT... | 202909995 | ... | 784.0 | 40043.0 | 2 | 2 | 1 | 1 | 2 | 2 | 1 | 1 |
28 | ADORA1 | ADORA1.2 | 2 | 2 | 0 | TCTGTTGGAAATTGGGTGTGCCCTGGCTCCCAAGGGAGGCCCATGT... | ACCTTCTGAACATGAGTGTCAACTCCAGGACTTGCTTCCAAGCCCT... | GGGAGGGGAGGTGGCCGTCGAGTTGACCTTCTGAACATGAGTGTCA... | GTGTGGGAGGGCGAGGCGGGGGATCCTGGAGCCCCTGTGTCGGGGG... | 203136512 | ... | 0.0 | 136.0 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 |
29 | ADPRHL2 | ADPRHL2.3 | 3 | 3 | 0 | CTGGCTCTTGGCTGCCATGTGGCTTATTAACAGCTTCCAGTGGAAG... | TTGTGTGGATGAAGGGACAGGCACTTGCATCCAGCTGATCTAGGTC... | CCTAGCAGGGTCCCCGAGCAGCAGGTTGTGTGGATGAAGGGACAGG... | TAGCATCATTTCTCCCTGTTGGGTTTTAGCCAGTTTGCCAGCAAGC... | 36559508 | ... | 527.0 | 30706.0 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
22340 | ZFX | ZFX.13 | 13 | 21 | 4 | CTTTTGCAACTTTATGTATGTAAATGTACCCTGAATTATATATATA... | TGAGGACTGCATTTTGGAATCTCCTAGAGGTAACTCATGGCTTATA... | GTTGTGTGCTACAAATGACACTTACTGAGGACTGCATTTTGGAATC... | GCTCATGTTGTATTTTGATTATTCTGTCTGTGCGGCTTCATCTTGG... | 24230364 | ... | 13.0 | 402.0 | 2 | -1 | -1 | -1 | 5 | 4 | 4 | 3 |
22341 | ZFX | ZFX.15 | 15 | 21 | 0 | AAGTTAATGCTTTTTCTCAAGGAAAAAAAATGTACAGTTTTTGTAA... | TCAGAATATTTTCCTGTTATTGAATGTTTAGTCTATTTGATACCAG... | TGAGAATAACGTGCAAAAATAAAAATCAGAATATTTTCCTGTTATT... | ATAAAAAGAAAATCATATAGGGATGTGTGACATTATTGTAATTGTG... | 24232603 | ... | 112.0 | 5657.0 | 3 | 2 | 2 | 2 | 5 | 4 | 4 | 3 |
22342 | ZFX | ZFX.17 | 17 | 21 | 0 | TGGATTTGTTATATATTGTTCCTGTTATTTTTGACATCTTTGCTAT... | TTTCATCTTTTGTTTCGTGTATATACTGTTTGCCTTTTTCATAAAA... | AATTAAGTTTACCCTATGGATTTTGTTTCATCTTTTGTTTCGTGTA... | AATAAATCATAACCATTTTGCCACATTCTGTAACTGTTTAGCTAAG... | 24234179 | ... | 21.0 | 2096.0 | 4 | 3 | 3 | 3 | 5 | 4 | 4 | 3 |
22343 | ZFX | ZFX.19 | 19 | 21 | 2 | TTTTTGTAGTCCATGATTTATCTTGTTATTGCTTATGTAATCTTTT... | GAGTTCTTTGTACATTAAGCCTATTAACTTTCCAATACATTTGCTG... | AATCCTGTGCCCCATTTTCTATTTGGAGTTCTTTGTACATTAAGCC... | TATGTATATAATAGCACTTTAAGTTGTCGCTCTGATAAACTGCCTA... | 24235904 | ... | 4.0 | 344.0 | 5 | 4 | 4 | -1 | 5 | 4 | 4 | 3 |
22344 | ZIC3 | ZIC3.13 | 13 | 13 | 0 | TTGTGTTCAGAAAGCAATACCCAGGATCCCTTTAATTTTTGTTTGA... | TAAAGTGTTTAAAACTAGAAACATGAATTGTTGTTTTGTTAATTTT... | AATACCAGCATTCTATATGTAGAACTAAAGTGTTTAAAACTAGAAA... | AGTATGCATTATCTCATGCATTGTGTTGACCTTGGTGAAAATCTTT... | 136659831 | ... | 1.0 | 121.0 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 |
22345 | ZMYM3 | ZMYM3.1 | 3 | 3 | 0 | GAGAAGCTGTTGTCACGACTAACCTTCTGTCTCTGAAATTGTTTGT... | GGCGCCTGGCCTGTCCCTTCAGTGAGCCATGCCCACCCTTGTGTTG... | GTTTAAATTGTATATATTGTTCTGAGGCGCCTGGCCTGTCCCTTCA... | GTTTCCCAGGCTCCTTTTTGTGTTTTTATAACTGTCACCAGTTAGC... | 70459493 | ... | 120.0 | 8211.0 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 |
22346 | ZNF157 | ZNF157.5 | 5 | 6 | 2 | CCTTGTTGATAAGTATAGGTCTGGCTAGTCATTTATCTCTCCATTC... | AAGTTCTGCAGCTTGCTTTTTTCATGTAATGTTACCTTTCCAAGAT... | CATAAATGGGCTCACACTGTATTTAAAGTTCTGCAGCTTGCTTTTT... | TAGCATGAAGTGTGCAATACATACTACTATATTGTCTTTATATAAT... | 47274001 | ... | 1.0 | 86.0 | 1 | 1 | 1 | -1 | 1 | 1 | 1 | 0 |
22347 | ZNF182 | ZNF182.2 | 1 | 2 | 0 | GAAATAATATGGTAGATAATTTAGACTTATTTAGTAGAAGTTCTGC... | TACAAACCTTGTTGCTTCAATACAAAGACCCGATAAACACGAATCA... | AGTTTGGAAACATACTTCATCTGAGTACAAACCTTGTTGCTTCAAT... | ATTTTCTGACAAGAAAACAATTATCACCAAGAGTGCTCGTGACTGT... | 47836945 | ... | 5.0 | 380.0 | 1 | 1 | 1 | 1 | 2 | 2 | 2 | 2 |
22348 | ZNF182 | ZNF182.1 | 2 | 2 | 0 | TTGAAAATGTGGGTGTACTATGTACTATGTGGATGTACTACCTTTT... | GGCTGGGGGGATGGAATTGAGAGGGAGACAACTGTATCCTTTCATA... | CCTTGGGAGGGGAAGGGGCAGTGGTGGCTGGGGGGATGGAATTGAG... | TTCATAAGTTATCTCTGGAAAGAGACATGAGAACCTGGTAACCCTG... | 47834270 | ... | 38.0 | 1251.0 | 2 | 2 | 2 | 2 | 2 | 2 | 2 | 2 |
22349 | ZNF185 | ZNF185.5 | 5 | 6 | 0 | TTGCTTAGTGTTTCTAATCATACTTAATCCACACTAATGTGCGCAA... | TTTTGCAGATCTGAGGAAGAGGGATGCATTACCTTTTTGCTTCTTT... | CTCTCATGTCTAAAAAGGCACAGAATTTTGCAGATCTGAGGAAGAG... | TTGAAAGAAATCTTGCAAGAGCCATTATTGACTTAGATCCAAAACA... | 152141997 | ... | 18.0 | 576.0 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 |
22350 | ZNF275 | ZNF275.4 | 4 | 6 | 2 | ATTTTATGTCTACGTATATTGTTCCTTTACTGAACCCACCACATGC... | TAAGATGGGTGAAAGTCGATGCCTTCTAGTCTCAGTGAATTTAACC... | GTTCAAACGTGTGTTCTCTGTTCTCTAAGATGGGTGAAAGTCGATG... | TGCCACTTGGCTGCTTCCTGGCCAAGTCGCACCTGACTGCATGAAC... | 152617885 | ... | 3.0 | 291.0 | 1 | 1 | 1 | -1 | 2 | 2 | 2 | 1 |
22351 | ZNF275 | ZNF275.6 | 6 | 6 | 0 | GTCTGATCCCCTACCAAATCTAGCACAGTGCCTTGCATCAAGTAGA... | CCCCCACCCATTAAATTGTGAGCTCTTAGAAGACAGGGGTGGCCTT... | GCTTCACCCCCTCCTCCTCAGCCCTCCCCCACCCATTAAATTGTGA... | TTCTGGCACTCACTATAATCAGCCTTGCACTAGAGCTGTTTGTGGA... | 152618362 | ... | 66.0 | 3151.0 | 2 | 2 | 2 | 1 | 2 | 2 | 2 | 1 |
22352 | ZNF280C | ZNF280C.3 | 4 | 6 | 0 | TTGGTTACATTGAATACAGATTTGCTGAACAGTTTTGATGTTATTT... | GCTCATATATATATAGATGTATATTTTTTTTAATTTCTTGTTTGTT... | GGACCAAAAAATGCACTTGTTTCTTGCTCATATATATATAGATGTA... | GCCATTAATGTAATTCCTCTGGATAAAGATAATATATTCAAAAAAT... | 129336743 | ... | 49.0 | 2016.0 | 1 | 1 | 1 | 1 | 2 | 2 | 2 | 2 |
22353 | ZNF280C | ZNF280C.1 | 6 | 6 | 0 | TGATTCAGAAATAGCCATGTCATGCATGTGTCCTTTTTTGTTTTCA... | AGCTTGACAATCTGATCCCTCTTCACCTTCAGACTGTTAGTTATTT... | GTTTTAATAGGTGTTAGTGAGTTTTAGCTTGACAATCTGATCCCTC... | TAAAACTATTTATGCAGTCAGTACTAAGCTTACTTGTTATAAGCAG... | 129335349 | ... | 3.0 | 290.0 | 2 | 2 | 2 | 2 | 2 | 2 | 2 | 2 |
22354 | ZNF41 | ZNF41.16 | 7 | 22 | 2 | CCTAGCTCTAGGCTATGTTACAGAAATATAGTCATTGAATGATACA... | AATGGAATTTTTAAATTTAAAGATATAACTTTATGAATTGAGAAAT... | CTCATGGTATATATATTTTAAGTGCAATGGAATTTTTAAATTTAAA... | ACATAGAAAAGATTTCCATGAAAAACTTTTTTCTTTTCCCTTGGGA... | 47306101 | ... | 15.0 | 331.0 | 1 | 1 | 1 | -1 | 3 | 2 | 1 | 0 |
22355 | ZNF41 | ZNF41.12 | 11 | 22 | 3 | GATAATTCATTTTCATTGTCATGTTGTATCCCATTCTGTGAATATG... | TCATTCACATTATGTATGTGAGAGTCATCCATATGTTGCATATAGT... | TATACTCTTTAGCATCTGTCTTCTGTCATTCACATTATGTATGTGA... | TTTTGTATTTTATATAAATTGAGTCAATTATATATCATATAATTGA... | 47305154 | ... | 1.0 | 68.0 | 2 | 2 | -1 | -1 | 3 | 2 | 1 | 0 |
22356 | ZNF41 | ZNF41.2 | 21 | 22 | 4 | TTTAAGTTTTACATCTAGATCTAGCATGTATTTTGAGTTATATGGT... | TTGCCTAACCAAAGATTACAATGATTTTTTCCTGTGTTTTCTTCTA... | TTTGAGTGCTGTATCTAAGAAATCCTTGCCTAACCAAAGATTACAA... | CTTATTTTGATGAAGTCCGTTCTTTCCATTTGTTCATTTCTGGGTT... | 47304199 | ... | 10.0 | 499.0 | 3 | -1 | -1 | -1 | 3 | 2 | 1 | 0 |
22357 | ZNF449 | ZNF449.3 | 3 | 11 | 0 | TCTATCAGACGTATTGATTATAGCAGTACTATAGTTATTCTGCTGT... | AAATTATGACAATCCTTTTAGAGGTAGGGTCAATATAGTGGATAAA... | TCATAGGTGTAAACATAAAGCATATAAATTATGACAATCCTTTTAG... | TCTTCTTTTTTTAATTACAATGAAAAATTTTGTGTTCCAAGGCAAC... | 134495399 | ... | 4.0 | 192.0 | 1 | 1 | 1 | 1 | 3 | 3 | 3 | 3 |
22358 | ZNF449 | ZNF449.6 | 6 | 11 | 0 | AGTGCCTTAGAATGGATGTGCCCAACTGCTCTGTATTTATGCAATA... | ATGTAATGGCTTCTCTTTTCTCTCTTGTGGAATTGCATTCAAACCA... | TGAATGTAGAGATGAAAAATACAGAATGTAATGGCTTCTCTTTTCT... | GGCTCAAATTGATATCCCAGTAGCAATAAACATATAATATAGGAGG... | 134497046 | ... | 4.0 | 571.0 | 2 | 2 | 2 | 2 | 3 | 3 | 3 | 3 |
22359 | ZNF449 | ZNF449.8 | 8 | 11 | 0 | AAGATTAAATGAAATATATTTTGCTCTGGCCCTACACACTGTAAGC... | CCTAGCTATTATAAAGGGGAAATTACAGTACCTACCTCAAAAGTAC... | TAAGGGCAAAATACAGTACCTACCTCCTAGCTATTATAAAGGGGAA... | GGATAATTATACTTCTATGTCTAATTGTACTTCTGAGCATTTCAAA... | 134497629 | ... | 7.0 | 171.0 | 3 | 3 | 3 | 3 | 3 | 3 | 3 | 3 |
22360 | ZNF630 | ZNF630.4 | 2 | 5 | 0 | AATATGGAATCAATTTGCTCACCCTCAAACTGTCTCAGCCCTCTTC... | CCTTCCACAGAATGATCTGCTTAGGCCCTCAGAATATTTTCAACCC... | TATGTTCCTCAGTATACATGATTACCCTTCCACAGAATGATCTGCT... | CTCACATTGAATAGAAATAGTGACAACTTCTCAACTGTAGAATAGA... | 47917631 | ... | 3.0 | 241.0 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 |
22361 | ZNF674 | ZNF674.1 | 6 | 6 | 0 | GGATTATTGATGTGTAAAAATTTTTTTGATTGTAGTCTCCAGAAAT... | CTAGGAAAGAAATACACCAAATTATTAAGTAAATTGGCATTTGAAT... | ATCTGTGTAGTGTGGAGGGAAAAAGCTAGGAAAGAAATACACCAAA... | TAGGTGAGTGTGTGTGTATAGATAAACACATGGAACAAAAAGTTAG... | 46357184 | ... | 17.0 | 654.0 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 |
22362 | ZNF711 | ZNF711.8 | 8 | 11 | 0 | TGTTATGTGGGATTATTATTTCTAAATGTTACTCATTGAAATGAGC... | TGTGTTATGTGGCTGTAAATGATGTACACGCTGTAAAATAAGATCG... | AAATTTGGAATATCTACTAAAATTGTGTGTTATGTGGCTGTAAATG... | AATCAGTTCCTTGAGAATAAATTTTTTATCTTTCTTAACTTCAGAA... | 84528343 | ... | 92.0 | 4818.0 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 |
22363 | ZNF81 | ZNF81.11 | 11 | 15 | 0 | TTTATTGCATTTCTTCCTCCACTATTCTTCTCTAACAGATGACCAA... | TTGGGCAGCATGATAACAACCCAGCAAAAAGCTAACTGATACATTG... | GTTAACTGTTTTAAGTCACTAAGTTTTGGGCAGCATGATAACAACC... | GCCATCCATCCCAGCCATCCCAAACAACTACCAGAATAATGGATTA... | 47781589 | ... | 6.0 | 168.0 | 1 | 1 | 1 | 1 | 2 | 2 | 2 | 2 |
22364 | ZNF81 | ZNF81.14 | 14 | 15 | 0 | TCCATTCATGTTTTGATGGAAATTTGTATTTCCAGCTTTTGGCTAT... | TTTTATTGCTAAGTGTTATTTCATTATATGGACATACCAGAATTTG... | GTTATATGCGTCTATGTTCATTCCTTTTTATTGCTAAGTGTTATTT... | TTGTTTGGCTCCTTCCATCCAGCATAATAAGTTTGAGATTCATTCA... | 47784996 | ... | 16.0 | 1011.0 | 2 | 2 | 2 | 2 | 2 | 2 | 2 | 2 |
22365 | ZRSR2 | ZRSR2.3 | 3 | 4 | 0 | CAGGAGGAGGTCGGGTAATAGAGACAGAACTGTTCAGAGTCCCAAA... | AGGAGCCGCCGCAGCCGGAGCCAAAGTTCCTCTAGGTCCCGAAGTC... | GAGCCGGAGCCGGAGCCGGAGCCGCAGGAGCCGCCGCAGCCGGAGC... | GGAAGAAATAGGGACCGCAGCAGGGACCGCAGCCGGGGCCGGGGCA... | 15841360 | ... | 65.0 | 3824.0 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 |
22366 | ZXDA | ZXDA.13 | 1 | 13 | 2 | TGTGACCCTCAGCAAGTCACTTAACCTATCTGAGCCTTAATTTCCT... | ATGAGTTTGGAGATCTAAATTCTGATCTTGAGTCTGGAACTGACAA... | TGCAAAAAGAGCACAGCCCTGGACTATGAGTTTGGAGATCTAAATT... | TTGAGGATATTCTGGACTAAATATTTAAGTGCAGTCATTTCTTTTT... | 57934228 | ... | 7.0 | 325.0 | 1 | 1 | 1 | -1 | 2 | 2 | 2 | 1 |
22367 | ZXDA | ZXDA.5 | 9 | 13 | 0 | TTGTTTATCTTCAGTCCTTGATTTATTAACATTTTGCCAATTGAAA... | CCTTTTTGTTTAAACAATGGAACATGATTTAATTTTATCTCATTTG... | TTCTGTTAGTGTTGGCATAAGCTTGCCTTTTTGTTTAAACAATGGA... | GACATGGCATGTAATTACAGACAGAAGTGTGACTGTAAACATATTA... | 57931885 | ... | 12.0 | 495.0 | 2 | 2 | 2 | 1 | 2 | 2 | 2 | 1 |
22368 | ZXDB | ZXDB.2 | 2 | 6 | 3 | TGGTTTGAATAGATTGCTTTTAAGGTCTTTCTGCTCTGTGATTCCT... | CAAGTCAGTTAACCTATCTGAGCCTTAATTTCCTTATTTATAAATT... | AACTGACAAGTTGTGTGACCCTGAGCAAGTCAGTTAACCTATCTGA... | ACAGCCCTGGACTACAAGTTTGGAGATTTAAATTCTGATCTTGAGT... | 57621181 | ... | 7.0 | 242.0 | 1 | 1 | -1 | -1 | 2 | 2 | 1 | 1 |
22369 | ZXDB | ZXDB.5 | 5 | 6 | 0 | GCAGATGAGTATGTCAAGGATTGAGATGAACACATAAGTCTTGGAA... | TGTTGTCATTCATTAAGGCCTCTTAAATAGACCACTATTTTTTGTG... | GTACTCATATAGCATATTTCAAAAATGTTGTCATTCATTAAGGCCT... | TTGATTGTCTACCCAATCAACAGTTTTCCCTCTTTGCTCTGGAAAT... | 57623885 | ... | 38.0 | 2088.0 | 2 | 2 | 1 | 1 | 2 | 2 | 1 | 1 |
22370 rows × 54 columns
# Write the table to disk and immediately read it back. The file is
# tab-separated even though its name ends in ".csv".
utr3_perturb_path = 'polyadb_processed_utr3_perturb.csv'

df.to_csv(utr3_perturb_path, sep='\t', header=True, index=False)
df = pd.read_csv(utr3_perturb_path, sep='\t')
import matplotlib.pyplot as plt
import numpy as np

# For the raw site count and for each num_sites_pas_<level> column, count how
# many distinct genes have a given number of sites (restricted to >= 2 sites),
# print the total, and draw the counts as a bar chart.
site_count_columns = ["num_sites"] + ["num_sites_pas_" + str(level) for level in (4, 3, 2, 1)]

for site_count_column in site_count_columns :
    f = plt.figure()

    # One row per distinct site count; the count itself is kept as a column
    # so that it can be used as the x coordinate below.
    t1 = (
        df.query(site_count_column + " >= 2")
        .groupby(site_count_column)
        .agg({"gene" : "nunique"})
        .reset_index()
    )

    print("n genes (total) = " + str(int(np.sum(t1['gene'].values))))

    # Place every bar at its actual site count. Numbering the bars 1..N would
    # put the ">= 2 sites" bar at x = 1 and would silently close any gap
    # where a site count does not occur in the data.
    plt.bar(t1[site_count_column].values, t1['gene'].values)

    plt.show()
n genes (total) = 11159
n genes (total) = 5622
n genes (total) = 5270
n genes (total) = 4692
n genes (total) = 3944
#Process features
def _one_hot_encode_inplace(seq, x, i, k) :
for j in range(len(seq)) :
if seq[j] == 'A' :
x[i, k, j, 0] = 1.
elif seq[j] == 'C' :
x[i, k, j, 1] = 1.
elif seq[j] == 'G' :
x[i, k, j, 2] = 1.
elif seq[j] == 'T' :
x[i, k, j, 3] = 1.
return
# Level suffix that selects which sitenum_pas_<level> / num_sites_pas_<level>
# columns of df are used.
min_pas_level = 3
# Largest per-gene site count that is kept; the arrays allocated further down
# use the same value as the size of their site axis.
max_num_sites = 10

num_sites_col = "num_sites_pas_" + str(min_pas_level)
sitenum_col = "sitenum_pas_" + str(min_pas_level)

# Keep sites of genes with 2..max_num_sites sites at this level, and drop the
# sites whose own index at this level is -1. The upper bound is taken from
# max_num_sites rather than repeated as a literal, so the filter cannot drift
# out of sync with the array size.
df_sel = df.query(
    num_sites_col + " >= 2 and "
    + num_sites_col + " <= " + str(max_num_sites) + " and "
    + sitenum_col + " != -1"
).copy().reset_index(drop=True)

# Row index of every gene in the per-gene arrays, in order of first appearance.
genes = df_sel['gene'].unique()
gene_dict = {gene : gene_i for gene_i, gene in enumerate(genes)}

# Names of the count columns read from df_sel; the position in this array is
# the index along the last axis of the count tensor. The builtin `object` is
# used as dtype because the `np.object` alias was removed in NumPy 1.24.
cell_types = np.array(['rpm', 'NT', 'CDC73', 'CPSF1', 'CPSF2', 'CPSF3', 'CPSF3L', 'CPSF4', 'CPSF6', 'CSTF1', 'CSTF3', 'CTR9', 'FIP1L1', 'LEO1', 'NUDT21', 'PABPC1', 'PABPN1', 'PAF1', 'PAPOLA', 'PCF11', 'RBBP6', 'RPRD1A', 'RPRD1B', 'SCAF8', 'SF3A1', 'SRSF3', 'SYMPK', 'THOC5'], dtype=object)
cell_type_dict = {cell_type : cell_type_i for cell_type_i, cell_type in enumerate(cell_types)}
# Zero-initialised per-gene arrays; axis 0 is the gene, axis 1 the site slot.
n_genes = genes.shape[0]

x = np.zeros((n_genes, max_num_sites, 205, 4))       # one-hot, 205 positions
x_ext = np.zeros((n_genes, max_num_sites, 356, 4))   # one-hot, 356 positions
m = np.zeros((n_genes, max_num_sites))               # set to 1 where a site is filled in
l = np.zeros((n_genes, max_num_sites))               # per-site distance values
c = np.zeros((n_genes, max_num_sites, cell_types.shape[0]))  # per-site counts

# Last cut_mode seen for each gene while the rows are processed.
prev_pos_dict = {}

# Column-wise storage for the per-gene table: three gene-level columns first,
# followed by one group of nine columns for every site slot.
gene_df_cols = ['gene', 'strand', 'chrom']
gene_df_dict = {col : ['N/A'] * n_genes for col in gene_df_cols}

# (column name stem, placeholder value) in output column order.
per_site_defaults = [
    ('gene_id', 'N/A'),
    ('site_type', 'N/A'),
    ('pas', -1),
    ('wide_seq_ext', 'N/A'),
    ('pas_exists', 0),
    ('dist', -1),
    ('pas_pos', 'N/A'),
    ('cut_mode', 'N/A'),
    ('cut_mode_hg38', 'N/A'),
]

for k in range(max_num_sites) :
    for field, default in per_site_defaults :
        col = field + "_" + str(k)
        # A fresh list per column, so that columns never share storage.
        gene_df_dict[col] = [default] * n_genes
        gene_df_cols.append(col)
# Fill the per-gene arrays and the per-gene table from df_sel, one row
# (= one site) at a time.

# The column holding the 1-based site slot does not change between rows.
sitenum_key = "sitenum_pas_" + str(min_pas_level)

for index, row in df_sel.iterrows() :
    gene = row['gene']
    gene_id = row['gene_id']
    strand = row['strand']
    chrom = row['chrom']

    i = gene_dict[gene]            # row of this gene in the arrays
    k = row[sitenum_key] - 1       # 0-based site slot
    suffix = "_" + str(k)

    if i % 1000 == 0 and k == 0 :
        print("Processing gene " + str(i) + "...")

    # x receives the 205-character window [105, 310) of wide_seq_ext,
    # x_ext the whole string.
    seq = row['wide_seq_ext'][175-70:175-70+205]
    _one_hot_encode_inplace(seq, x, i, k)
    _one_hot_encode_inplace(row['wide_seq_ext'], x_ext, i, k)

    m[i, k] = 1.

    # Absolute cut_mode difference to the previous row seen for the same
    # gene; 0 for the first row of a gene.
    if gene in prev_pos_dict :
        l[i, k] = np.abs(row['cut_mode'] - prev_pos_dict[gene])
    else :
        l[i, k] = 0.
    prev_pos_dict[gene] = row['cut_mode']

    # Gene-level columns (rewritten with the same values for every site).
    gene_df_dict['gene'][i] = gene
    gene_df_dict['strand'][i] = strand
    gene_df_dict['chrom'][i] = chrom

    # Site-level columns of slot k.
    gene_df_dict['gene_id' + suffix][i] = gene_id
    gene_df_dict['pas' + suffix][i] = row['pas']
    gene_df_dict['site_type' + suffix][i] = row['site_type']
    gene_df_dict['wide_seq_ext' + suffix][i] = row['wide_seq_ext']
    gene_df_dict['pas_exists' + suffix][i] = int(m[i, k])
    gene_df_dict['dist' + suffix][i] = l[i, k]
    gene_df_dict['pas_pos' + suffix][i] = row['pas_pos']
    gene_df_dict['cut_mode' + suffix][i] = row['cut_mode']
    gene_df_dict['cut_mode_hg38' + suffix][i] = row['cut_mode_hg38']

    # Counts of this site, one entry per column named in cell_types.
    for cell_type_i, cell_type in enumerate(cell_types) :
        c[i, k, cell_type_i] = row[cell_type]
# Turn the column lists into a table whose columns follow gene_df_cols.
gene_df = pd.DataFrame(gene_df_dict)[gene_df_cols]

# Report the number of genes and the shape of every array.
print(len(gene_df))
for array_name, array_value in [("x", x), ("x_ext", x_ext), ("m", m), ("l", l), ("c", c)] :
    print(array_name + ".shape = " + str(array_value.shape))
Processing gene 0... Processing gene 1000... Processing gene 2000... Processing gene 3000... Processing gene 4000... Processing gene 5000... 5267 x.shape = (5267, 10, 205, 4) x_ext.shape = (5267, 10, 356, 4) m.shape = (5267, 10) l.shape = (5267, 10) c.shape = (5267, 10, 28)
# Keep only the genes whose counts, summed over all site slots and all
# columns of the count tensor, are positive.
total_c = np.sum(c, axis=(1, 2))
has_counts = total_c > 0

# The arrays are replaced one after another so that each unfiltered array
# can be released before the next filtered copy is made.
x = x[has_counts]
x_ext = x_ext[has_counts]
m = m[has_counts]
l = l[has_counts]
c = c[has_counts]

gene_df = gene_df.iloc[np.nonzero(has_counts)[0]].copy().reset_index(drop=True)

# Report the number of genes and the shape of every array after filtering.
print(len(gene_df))
for array_name, array_value in [("x", x), ("x_ext", x_ext), ("m", m), ("l", l), ("c", c)] :
    print(array_name + ".shape = " + str(array_value.shape))
5267 x.shape = (5267, 10, 205, 4) x_ext.shape = (5267, 10, 356, 4) m.shape = (5267, 10) l.shape = (5267, 10) c.shape = (5267, 10, 28)
# Total count of every gene in every column of the count tensor, summed over
# the site axis; the axis is kept so that the result broadcasts against c.
site_totals = np.sum(c, axis=1, keepdims=True)

# y: share of each site in its gene's total. The division is carried out only
# where the total is non-zero and every other entry stays 0, which gives the
# same values as dividing everywhere and zeroing the NaNs afterwards but
# without triggering the "invalid value encountered" RuntimeWarning on 0/0.
y = np.divide(c, site_totals, out=np.zeros(c.shape, dtype=float), where=site_totals != 0)
# NaN can still arise if c itself contains NaN; such entries are zeroed too.
y[np.isnan(y)] = 0.

# s: 1. where the total is positive, 0. otherwise. The builtin `float` is
# used because the `np.float` alias was removed in NumPy 1.24.
s = np.array(site_totals[:, 0, :] > 0, dtype=float)
/home/jlinder2/anaconda3/envs/tensorflow/lib/python3.6/site-packages/ipykernel/__main__.py:2: RuntimeWarning: invalid value encountered in true_divide from ipykernel import kernelapp as app
# Write the per-gene table and the arrays to files that share one name prefix.
features_prefix = "polyadb_features_pas_" + str(min_pas_level) + "_utr3_perturb"

# Tab-separated table; the DataFrame index is written as the first column.
gene_df.to_csv(features_prefix + ".csv", sep='\t')

# Full archive, including the two one-hot tensors.
np.savez(features_prefix + ".npz", x=x, x_ext=x_ext, m=m, l=l, c=c, y=y, s=s)

# Same archive without x and x_ext.
np.savez(features_prefix + "_no_x.npz", m=m, l=l, c=c, y=y, s=s)
# Reload the arrays from the archive that omits the one-hot tensors.
save_dict = np.load("polyadb_features_pas_3_utr3_perturb_no_x.npz")
m, l, c, y, s = save_dict['m'], save_dict['l'], save_dict['c'], save_dict['y'], save_dict['s']

# For every gene, the highest site slot whose entry in m is non-zero.
# A gene without any non-zero entry raises IndexError here.
dist_index = np.array([np.nonzero(m[i, :])[0][-1] for i in range(m.shape[0])], dtype=int)
gene_index = np.arange(m.shape[0])

# Values of y at that slot, one row per gene: shape (genes, columns of y).
y_dist = y[gene_index, dist_index, :]

# Mask with a single 1. per gene, at that same slot.
dist_mask = np.zeros(m.shape)
dist_mask[gene_index, dist_index] = 1.

# Names belonging to the last axis of c / y, in index order. The builtin
# `object` is used as dtype because `np.object` was removed in NumPy 1.24.
cell_types = np.array(['rpm', 'NT', 'CDC73', 'CPSF1', 'CPSF2', 'CPSF3', 'CPSF3L', 'CPSF4', 'CPSF6', 'CSTF1', 'CSTF3', 'CTR9', 'FIP1L1', 'LEO1', 'NUDT21', 'PABPC1', 'PABPN1', 'PAF1', 'PAPOLA', 'PCF11', 'RBBP6', 'RPRD1A', 'RPRD1B', 'SCAF8', 'SF3A1', 'SRSF3', 'SYMPK', 'THOC5'], dtype=object)
cell_type_dict = {cell_type : cell_type_i for cell_type_i, cell_type in enumerate(cell_types)}
#PolyADB annotated distal site comparison

import matplotlib.pyplot as plt
from scipy.stats import spearmanr, pearsonr

# The two columns to compare and the minimum per-gene total required in each.
cell_type_1, cell_type_2 = 'rpm', 'NT'
min_count_1, min_count_2 = 0., 10.

cell_type_1_ix = cell_type_dict[cell_type_1]
cell_type_2_ix = cell_type_dict[cell_type_2]

# Per-gene totals over all site slots, for each of the two columns.
gene_total_1 = np.sum(c[..., cell_type_1_ix], axis=-1)
gene_total_2 = np.sum(c[..., cell_type_2_ix], axis=-1)

keep_index = np.nonzero((gene_total_1 >= min_count_1) & (gene_total_2 >= min_count_2))[0]
print("n = " + str(keep_index.shape[0]))

# y_dist values of the retained genes in both columns.
usage_1 = y_dist[keep_index, cell_type_1_ix]
usage_2 = y_dist[keep_index, cell_type_2_ix]

print(spearmanr(usage_1, usage_2))
print(pearsonr(usage_1, usage_2))

f = plt.figure(figsize=(4, 4))

plt.scatter(usage_1, usage_2, color='black', alpha=0.25, s=8)

plt.xlim(0, 1)
plt.ylim(0, 1)

plt.tight_layout()
plt.show()
n = 5267 SpearmanrResult(correlation=0.821364836472268, pvalue=0.0) (0.8027300007698112, 0.0)
#Comparison for identified sites in perturb-seq data only, all perturbations

import matplotlib.pyplot as plt
from scipy.stats import spearmanr, pearsonr

# Reference column and the minimum per-gene totals for both sides.
cell_type_1 = 'NT'
min_count_1 = 10.
min_count_2 = 10.

cell_type_1_ix = cell_type_dict[cell_type_1]

# The reference-side filter does not depend on the column it is compared to.
enough_reads_1 = np.sum(c[..., cell_type_1_ix], axis=-1) >= min_count_1

# Entries 0 and 1 of cell_types ('rpm' and 'NT') are skipped; every remaining
# column is compared against the reference.
for cell_type_2_ix, cell_type_2 in enumerate(cell_types.tolist()[2:], start=2) :
    print(cell_type_1 + " vs. " + cell_type_2)

    enough_reads_2 = np.sum(c[..., cell_type_2_ix], axis=-1) >= min_count_2
    keep_index = np.nonzero(enough_reads_1 & enough_reads_2)[0]
    print("n = " + str(keep_index.shape[0]))

    # y_dist values of the retained genes in both columns.
    usage_1 = y_dist[keep_index, cell_type_1_ix]
    usage_2 = y_dist[keep_index, cell_type_2_ix]

    print(spearmanr(usage_1, usage_2))
    print(pearsonr(usage_1, usage_2))

    f = plt.figure(figsize=(4, 4))

    plt.scatter(usage_1, usage_2, color='black', alpha=0.25, s=8)

    plt.xlim(0, 1)
    plt.ylim(0, 1)

    plt.tight_layout()
    plt.show()
NT vs. CDC73 n = 5055 SpearmanrResult(correlation=0.9750539570429886, pvalue=0.0) (0.9846600912724476, 0.0)
NT vs. CPSF1 n = 5164 SpearmanrResult(correlation=0.9689793565326417, pvalue=0.0) (0.9724669024408563, 0.0)
NT vs. CPSF2 n = 5195 SpearmanrResult(correlation=0.9741926705449473, pvalue=0.0) (0.9766110164828116, 0.0)
NT vs. CPSF3 n = 5210 SpearmanrResult(correlation=0.9784803708042666, pvalue=0.0) (0.9802802272402124, 0.0)
NT vs. CPSF3L n = 4864 SpearmanrResult(correlation=0.9610740902389276, pvalue=0.0) (0.978394238702559, 0.0)
NT vs. CPSF4 n = 4974 SpearmanrResult(correlation=0.9589531960994394, pvalue=0.0) (0.9662141466494838, 0.0)
NT vs. CPSF6 n = 5257 SpearmanrResult(correlation=0.9404204340515289, pvalue=0.0) (0.9078228071671943, 0.0)
NT vs. CSTF1 n = 5100 SpearmanrResult(correlation=0.971883942118247, pvalue=0.0) (0.9796054725105341, 0.0)
NT vs. CSTF3 n = 5216 SpearmanrResult(correlation=0.966175618302429, pvalue=0.0) (0.9683614083931555, 0.0)
NT vs. CTR9 n = 5115 SpearmanrResult(correlation=0.9784407845757428, pvalue=0.0) (0.9838629964154828, 0.0)
NT vs. FIP1L1 n = 5046 SpearmanrResult(correlation=0.9644785929270852, pvalue=0.0) (0.9687039048146195, 0.0)
NT vs. LEO1 n = 5262 SpearmanrResult(correlation=0.9927891750614752, pvalue=0.0) (0.9936795730362967, 0.0)
NT vs. NUDT21 n = 5192 SpearmanrResult(correlation=0.8925655057773901, pvalue=0.0) (0.8470834005022289, 0.0)
NT vs. PABPC1 n = 5231 SpearmanrResult(correlation=0.9907217201717271, pvalue=0.0) (0.9920970228428011, 0.0)
NT vs. PABPN1 n = 5234 SpearmanrResult(correlation=0.9868614018616282, pvalue=0.0) (0.9883314458218786, 0.0)
NT vs. PAF1 n = 4818 SpearmanrResult(correlation=0.9660302976021368, pvalue=0.0) (0.9788456726669531, 0.0)
NT vs. PAPOLA n = 5248 SpearmanrResult(correlation=0.9903297518640427, pvalue=0.0) (0.9904321751721639, 0.0)
NT vs. PCF11 n = 5087 SpearmanrResult(correlation=0.976767788247931, pvalue=0.0) (0.9806927408261735, 0.0)
NT vs. RBBP6 n = 5228 SpearmanrResult(correlation=0.9790176073417867, pvalue=0.0) (0.9775180238105726, 0.0)
NT vs. RPRD1A n = 5208 SpearmanrResult(correlation=0.9883445607859678, pvalue=0.0) (0.9921415453754213, 0.0)
NT vs. RPRD1B n = 5255 SpearmanrResult(correlation=0.992871435489016, pvalue=0.0) (0.9935379522556784, 0.0)
NT vs. SCAF8 n = 5263 SpearmanrResult(correlation=0.9950755692855895, pvalue=0.0) (0.9956598820170155, 0.0)
NT vs. SF3A1 n = 3919 SpearmanrResult(correlation=0.9369491501110663, pvalue=0.0) (0.9751583347981829, 0.0)
NT vs. SRSF3 n = 4941 SpearmanrResult(correlation=0.9628408287711419, pvalue=0.0) (0.97375495379757, 0.0)
NT vs. SYMPK n = 4736 SpearmanrResult(correlation=0.9571236331532373, pvalue=0.0) (0.9705724095187731, 0.0)
NT vs. THOC5 n = 5126 SpearmanrResult(correlation=0.962649260663901, pvalue=0.0) (0.9643536705864496, 0.0)