import numpy as np
import pandas as pd
# Load the processed PolyADB V3 site table (one row per annotated site).
df = pd.read_csv('native_data/polyadb_processed_v3_w_hg38.csv', sep=',')

# Optional filters, currently disabled:
#   1) keep only sites whose site_type is '3_most_exon';
#   2) drop sites whose wide_seq_ext slice [105, 310) contains one of the
#      listed A-rich motifs.
#df = df.query("site_type == '3_most_exon'").copy().reset_index(drop=True)
#df = df.loc[~df['wide_seq_ext'].str.slice(175 - 70, 175 - 70 + 205).str.contains("AAAAAAA|AAAGAAAA|AAACAAAA|AAAAGAAA|AAAACAAA")].copy().reset_index(drop=True)

# Show a truncated preview of the loaded table.
print(df)
gene gene_id sitenum num_sites pas \ 0 AADACL3 AADACL3.1 1 2 0 1 AADACL3 AADACL3.2 2 2 0 2 ABCA4 ABCA4.5 1 5 0 3 ABCA4 ABCA4.4 2 5 2 4 ABCA4 ABCA4.3 3 5 4 5 ABCA4 ABCA4.2 4 5 3 6 ABCA4 ABCA4.1 5 5 0 7 ABCB10 ABCB10.7 1 7 4 8 ABCB10 ABCB10.6 2 7 0 9 ABCB10 ABCB10.5 3 7 0 10 ABCB10 ABCB10.4 4 7 2 11 ABCB10 ABCB10.3 5 7 0 12 ABCB10 ABCB10.2 6 7 0 13 ABCB10 ABCB10.1 7 7 0 14 ABCD3 ABCD3.1 1 15 -1 15 ABCD3 ABCD3.2 2 15 0 16 ABCD3 ABCD3.3 3 15 0 17 ABCD3 ABCD3.4 4 15 4 18 ABCD3 ABCD3.5 5 15 4 19 ABCD3 ABCD3.6 6 15 3 20 ABCD3 ABCD3.7 7 15 0 21 ABCD3 ABCD3.8 8 15 2 22 ABCD3 ABCD3.9 9 15 4 23 ABCD3 ABCD3.10 10 15 4 24 ABCD3 ABCD3.11 11 15 2 25 ABCD3 ABCD3.12 12 15 3 26 ABCD3 ABCD3.13 13 15 0 27 ABCD3 ABCD3.14 14 15 0 28 ABCD3 ABCD3.15 15 15 0 29 ABHD17AP3 ABHD17AP3.1 1 1 0 ... ... ... ... ... ... 228470 na na.38342 42 38383 3 228471 na na.38341 43 38383 2 228472 na na.38340 44 38383 0 228473 na na.38339 45 38383 0 228474 na na.38313 38313 38383 0 228475 na na.38314 38314 38383 0 228476 na na.38315 38315 38383 0 228477 na na.38316 38316 38383 0 228478 na na.38317 38317 38383 0 228479 na na.38318 38318 38383 -1 228480 na na.38319 38319 38383 0 228481 na na.38320 38320 38383 -1 228482 na na.38321 38321 38383 4 228483 na na.38322 38322 38383 2 228484 na na.38323 38323 38383 4 228485 na na.38324 38324 38383 2 228486 na na.38325 38325 38383 0 228487 na na.38326 38326 38383 0 228488 na na.38327 38327 38383 0 228489 na na.38328 38328 38383 0 228490 na na.38329 38329 38383 0 228491 na na.38330 38330 38383 0 228492 na na.38331 38331 38383 0 228493 na na.38332 38332 38383 4 228494 na na.38333 38333 38383 0 228495 na na.38334 38334 38383 2 228496 na na.38335 38335 38383 0 228497 na na.38336 38336 38383 0 228498 na na.38337 38337 38383 -1 228499 na na.38338 38338 38383 -1 seq \ 0 GTGCTTTCTAATCTGTGGAATGCCAGGGTCCCAGTGTGGGAGCCTT... 1 GCTGCAGGTGGTGGTTGCTGAAGGTGGGGGAGGCTGTGGCAATTTC... 2 CAGTAAAATATTTTCTGCATTTGCCCAAGGACACATTCCCAACGAA... 3 GTTTATCAAATACAACTCAGACGTCAGTCTCCTGGCCCCTTTGAGA... 
4 ATTCAAATATGTGAAGAGCATCCACTTTAAAATATTTAAAATGCAG... 5 GACTTGAGGACCCATCTTTGTTTTTAGAATATTGTATGCTTTTGAG... 6 TTTCTGCATGTTTGTCTGTGTGTCTGCGTTGTGTGTGATTTTCATG... 7 TCCTCTCAGGTGCATTTCTTTACTGCTTTGTAGCAAAGTCTCTATA... 8 GTTAAAGATTGAAGCTATTGTCAAATGACAACTTTAAAAAGGCAAT... 9 CTATTTCATGAAAAGCATGGAATATTATATTTTATTGTTCATAATT... 10 CATAATTAATGAATAAAATTGATATGAATGAATATAGTGTTCTTTG... 11 TCTGATACATGATGTTCAATTTTATCTTTAGGTAATATTTTATATC... 12 AACTTCTCACTACATTGTTTCTTAGTAGAATTTGGCTGTGGAGATT... 13 TCAGGAATAAAGAAAAGACTAACATTACACATATCCAAAAACATGT... 14 TTCTTGAATTTTATAAGTATCTCTAGCTTCTTGATAACTTATAAGG... 15 TTCGAGACAAGCCTGGACAAAAAGCGAGACCCGCTTCTTTAAAAAA... 16 GTATGGTTGTTTTACATATGTGTATGTGTGTATATGCATTTCAGTT... 17 CATTTCAGTTGATTAATAAATTATTTCCATACTGGTTTTTGTTGCT... 18 TCCTTGCAGTCATATTTTTATGGTACTTAACTACATTTTTGTGATA... 19 CATTTAATATTATATAGGATATTGCTAATTGTGTATATGTTGGTTT... 20 ATAATATGTACTAAGAATGTCCTTATTCTTGTGGTTAAAAACCTGC... 21 GGAGTGCATTTGACTCCAGGAAAAGCCATTTTGGTTTTCCTTAACT... 22 GATTTTATGTTTAAAAAGTATGTTCTAAAATTATTATATATACATG... 23 ATGATGTCAAACGATCCTAAGCGAAGATGATTTCAGTTCATCAAAT... 24 ACTGGTTTTGTTTTTTTGCAGAATTAACTATAACAATCACTGGCTA... 25 AACTATAACAATCACTGGCTACCGAAGTAAACTGATGTACTGAATT... 26 AATTTTTACCACTTCTGTTTAGCGAACTTGTATACTTATTTTCTGT... 27 GCCTTGACTTGAAAACATAGATAGTTTAATCTTGACTTGAAAAACA... 28 ATGAGAAAATAAGTATGAAACAGCAATGGTAGTTTGTTTTGCATTA... 29 TACGCACTCCTTTCCTTTTGGAAGCAAGAAGAAAATACGTGAAAAC... ... ... 228470 TTTGTGCCTGTCTCATTTTTTGTTGTTGTTGAAAACTGAACATTTA... 228471 ATCAGATTCTTCATTTTCCTAATGTGTGTTATGATTGGTTACTGTT... 228472 TGTCAGTTTTGTGTAATTGTCTCATGAACAATGAATTATGCACACC... 228473 TCCTCTTAAAGTTTAAAAAGGCTTTAGAGACTGAGAGGAAAATGCA... 228474 AGGGAGGGGATGATTTGGTTTCTGGTGACTTGGCTTTTAATGGTGA... 228475 AATCTGGTACTTTGTCATGAGTTACTTGTACAACTAGTTAGAAGTT... 228476 CACAGACAAGATTTGAGCATTACCTCCAGTCTCTTTGCCAGCTGAC... 228477 CACCCTTTGATGACTACTTTTTCGGACTCAGCCCACCTGTACCCAG... 228478 ATCACGTTGTATCCCATAAATGTATATCGTTTAAATTTGTTAATTA... 228479 AAGGTTTGTGAGTTGGTCCAAGAACAGCTGGAGTTCAAATCACCTC... 
228480 ATGAGGATACTCATCATGGGATTTATTCTCCTTAAAATTTGTACTT... 228481 GAGGCCCTGTAATTGAAATGAGTCTACTTTAAATCCTTTAGGAGGA... 228482 TTGGAGGGCAAGTCTGGTGCCAGCAGCCACAGTAATTCCAGCTCCA... 228483 GAACGAAAGTCGGAGTTTCGAAGACGATCAGATACCGTCGTTGTTC... 228484 ATTGGTGGAAGGGCACCACCAGGAGTGGAGCCTGCAGCTTAATTTG... 228485 CTGCCCTGGTGGAGCACTGAGAAGACAGTCGAACTTGACTATCTAG... 228486 GAGACTTCACAACCTGCTAAAAATGAGCTTTCAGGACCTACCCATA... 228487 TTAGGACTTCCTGAGACTGTGTCACAAGTTTGTGTCCACAACTTTG... 228488 TTCTGAGGGAGGGATGTAGCTTTTTTATGTTTGGAGCTATTTTATT... 228489 GCCCACTGCAACCAGCCAGCCCCTTTCCATTCCAGTATCCCTTTCC... 228490 GTCCATTCTTGTATTGCTATAAAGAAATATTGGATACTGGCTAATT... 228491 TCAGTTCATCAACGAATCGGTATATTAATGTCATATTTAACAGTTA... 228492 CGGTATATTAATGTCATATTTAACAGTTATAGGAATAAACTAAGCA... 228493 ACCCTCTTTGGGTCCCCTCCGTTTGTATGGGAGCTCTGTTTTCATG... 228494 CCTAATAATTGGTCTGCTCAAACGTGCTAGCTGTGTGCACTCAGCC... 228495 GCCAGCAATGGATCCCCTCCTTTTGTATGGGAGCTCTGTATTCACG... 228496 CCTGAATAATGGTCAGGGAAGGTAATGTGAATGATGCAAAGATGAT... 228497 TCTTTGTACCTACTCTCTGTTCTTACACCCTCTCCCCTTTTGAAAG... 228498 CTGATCACCCAGGTGATGGGACTTTTGTCTAGGCTCTGCCTATGGG... 228499 GCAACATAGTGAGACCTGTCACAGCAAAAAAAAAAAAAAAAAAAAA... seq_ext \ 0 TTACCCCCAGGGAATTAGGGGAGAGGAAACACCTTTATTTGCTTTC... 1 TAAATTGTCGTAATCTTTTTGCTGGTTGATGGTCTTGCCTTGATGT... 2 CCCCAGTATCCCCATCTTGGTGGGACAACAGAACCCAAGAACTGGC... 3 CCTCTGATCCATCTCTGTCTGCATGAGTGACAGCTGGCAGAGTCCT... 4 CCTGGCTTTTCTCCAATAATACAAGTAGAGGATCGGGTTAAAATAG... 5 CCTATTTACTAAGAGTCCACTCCAATGTAGGAATGGTTAGGAGACC... 6 CCACTAGCAGCTTTGGCCTCCATATTGCTCTCATTTCAAGCAGATC... 7 CCGGAATTTCCCCCAAGGGTTCAACACTGTGGTTGGAGAAAAGGGT... 8 TGGAATATTTTAATTAATATAGCATGGCACCTCATTTTCTTTTGCC... 9 CATGAACTAAGCATTTATTAGTTCCCTGATTAGACTGGAAGAAGAA... 10 AGAAGAAACCACTATTTCATGAAAAGCATGGAATATTATATTTTAT... 11 TGTGCCATAGAAGTATTTACGAAATTGCATTTCATTGTTATGTTTT... 12 TTTTTTTTTATGACCCAGGAACCAAAGATGCAGTCTGTCATTTCTT... 13 TATAAAACTTCTCACTACATTGTTTCTTAGTAGAATTTGGCTGTGG... 14 GACCTCCAACTTAAATTATGAGTCATTATTTTTGTCATAAGTTTTA... 15 ATCCCAGCACTTTGGGAGGCTGAGATGGGAGGATCGCTTGAATCCA... 
16 ATTTAATATGTAATGTTATTGTTACATATTTATAACACAGCCATAT... 17 ACAGCCATATAATAGTATGGTTGTTTTACATATGTGTATGTGTGTA... 18 TTGTATTTTGTTTTACTATAACCAAGTGAATAGGCCAAATCCTTCT... 19 AAAGCAACAAATTAACTAGATACAGAATAATGGAGAACAAGTTGTT... 20 ATTTAATATTATATAGGATATTGCTAATTGTGTATATGTTGGTTTA... 21 GGCAGATTTCTTTAGCTGCCACAGTAATACTCATTCCTTGTGTGTG... 22 TACATGAAATAATGCACTGAGTATGCAATGCTATCACTGTCTTTGA... 23 GGCACTGTTTTATCTCTGTGAATCTTGAATAACTTTTTTATATTTG... 24 TCTCAAACTGAGCTTCAGAAAGGGGCATTTTGTACTCTTGTTTTTG... 25 CATTTTGTACTCTTGTTTTTGCATAACTGGTTTTGTTTTTTTGCAG... 26 CTGGCTACCGAAGTAAACTGATGTACTGAATTCCATAATACATAAC... 27 TGTTCTTTTTATTCTGGTATCTAAATACTGAGAAGTTCATTTATAA... 28 TGCCAAGACATATCACCGTGTTCTCATAATAAGTTTTTACTTTTTA... 29 GGGACCCCGCTCCAACCCAGGGGCTGTGGACGATGTACAGGCAACA... ... ... 228470 TTCTGTTGGCTGTGCTTTTTCTTGCGTATAGGCTGTACTTTTTTTT... 228471 TGTTGTTGAAAACTGAACATTTAAAATAATATAATTTGGCAACTGT... 228472 TCTAGTGTGTGTTCTTTATGGCAGTGTTTCATCTTTTGGCATCCTT... 228473 AATGTAAACTAACAATTGGTTTTGGGATTGTCTTCTTACAAAATTA... 228474 TTTTTGTCAACATGACATATACATTATAACATTTTGGAAGTTTGTT... 228475 AAATTTGAAGTCATTGAAAAACCCCAGGCCTGAAGAGATAAAGTAA... 228476 GATCTTGTCTAATTACCATGTGCTTATAAAGCCTGACCTGAAGCTC... 228477 TTAATTCCACTGCCCTTCCCAAAACCTGTAAGAACTAATGACAATC... 228478 TATTTCTTAGCTTGATGTAGCTGTTCCACCGTGTGTGCATATATCA... 228479 TAGGCTGCAGGGCTCCGTGTTCTTTATTTTCTGTTGATCATGAATC... 228480 CCCTGCTACAGTCACAACCAGAAGCTTACTGTTGTATGCATGGCAA... 228481 CTCCCGACCGGGGGAGGTACTGATGAAAAATAACAATACAGGACTC... 228482 GAGGCCCTGTAATTGAAATGAGTCTACTTTAAATCCTTTAGGAGGA... 228483 CAGTTCGGACCAGAGCTAAAGCATTTGCCAAGAATGTTTTCATTAA... 228484 GGTTTTTGGGTTCCCAGGCGAGTATGGTTGCAAAGCTGAAACTTAA... 228485 GATTGGATGGTTTAGTGAGGCCCTTGGATCGGCCCCGCCGGGTCAG... 228486 TTTGTGTGTTGTTTTTTCTCAATTCCTCCTATTTTCTCTTTACAGG... 228487 CTAAAATGTATAAAACTAAGCTGGGCCCCAACCACCTTGGGCACAT... 228488 ATTTCGAGTTCTCTGTGTTTTTTGTCCACAGGAATTTCCTTGTGGG... 228489 CACCCAACCAGTCATCTGAGTAGACTTCCTCCTCAGCCAGGGCAGT... 228490 AACATTATATAGGATGTCTTGTTTTAGGCATTTATTGCTAGATAAC... 
228491 TGTTTGTAAAGTAGTTGTCTAGACTCTAGTGAAAATAATTACAGAT... 228492 TCTAGACTCTAGTGAAAATAATTACAGATAATCTCAGTTCATCAAC... 228493 AAGGGACAATGATCGGGATATAAACCCAGGCATTCAAGCCAGCAAT... 228494 ATTCAGTAATTGATAGGGAGACTCTTGTGGAAGCAGAGTTAGGAAA... 228495 TAAGAACACAGTGGGAGGGACAATGATGGGATATGCACCCAGGCAT... 228496 CTACTGTTCCTAAATTTATGTCGATGTATATATTATATTTCTTAGA... 228497 TTGTTGGACCCTTATCAGTAGTTCTGCTTTTTGCACTTTGAAGCAT... 228498 TAACTTTTGTCTCAGCTCTGCCTACAGGGGCTTCGTGACATATCTC... 228499 AGGCCAAGGCAGGAGGTTTGCTTGAGGCCAGAACTTCAATACCAGC... wide_seq \ 0 TGAGACAATTTTACTGTCTTAGTTATTACCCCCAGGGAATTAGGGG... 1 TGTATGTTACATTCATGGGAATGTCTAAATTGTCGTAATCTTTTTG... 2 AATCATTTTATTTCCCTCTTTTTCTCCCCAGTATCCCCATCTTGGT... 3 ATCTCCTCCTGTCATCTCCACCAGGCCTCTGATCCATCTCTGTCTG... 4 CAATGTTTTTTCTACATTCTCAAAGCCTGGCTTTTCTCCAATAATA... 5 ATCTCAACAGATCCGGGACCTGTGGCCTATTTACTAAGAGTCCACT... 6 CCAGAACTAGAAACCCCGGGCCATCCCACTAGCAGCTTTGGCCTCC... 7 AAGTGGCCAATGCAGTGGCCTTCATCCGGAATTTCCCCCAAGGGTT... 8 TCAGGTTTTGTATTTTCTTTTCTTGTGGAATATTTTAATTAATATA... 9 AATGTAAATCAAATGGAAGTTTTCCCATGAACTAAGCATTTATTAG... 10 TTATTAGTTCCCTGATTAGACTGGAAGAAGAAACCACTATTTCATG... 11 GGTTGAATCTGAGGAAAATAATCCTTGTGCCATAGAAGTATTTACG... 12 AACTTTATCTGTGTCTGTCACTTTTTTTTTTTTTATGACCCAGGAA... 13 ACCAAAGATGCAGTCTGTCATTTCTTATAAAACTTCTCACTACATT... 14 AGTTAGATGTTTTAAACATCTAATTGACCTCCAACTTAAATTATGA... 15 CCAGGCGTGGTGGCTCATGCCTGTAATCCCAGCACTTTGGGAGGCT... 16 ATCTTATTGAAATGTAACTTTAGTCATTTAATATGTAATGTTATTG... 17 AATGTTATTGTTACATATTTATAACACAGCCATATAATAGTATGGT... 18 TCTTGTGATCCAGGTTTTATCATTCTTGTATTTTGTTTTACTATAA... 19 CTTAGTTTTTTTTAAAAAAAAAAACAAAGCAACAAATTAACTAGAT... 20 ATAATGGAGAACAAGTTGTTAAAACATTTAATATTATATAGGATAT... 21 TGCATAACAGCGTTTATTATACAGTGGCAGATTTCTTTAGCTGCCA... 22 CTCCAAATGAGCCATAGGAAGGCACTACATGAAATAATGCACTGAG... 23 ATACATGGGTGAATTATGTTTCCGAGGCACTGTTTTATCTCTGTGA... 24 CAATAAGCTAGATACGAAATCAGTTTCTCAAACTGAGCTTCAGAAA... 25 TCTCAAACTGAGCTTCAGAAAGGGGCATTTTGTACTCTTGTTTTTG... 26 TTTGCAGAATTAACTATAACAATCACTGGCTACCGAAGTAAACTGA... 27 TCAGATATCCTATACAACCTTTGCTTGTTCTTTTTATTCTGGTATC... 
28 TTTCATCCATGAGCACCACGCTGCATGCCAAGACATATCACCGTGT... 29 GCATGTGGACCCCCGGGCGGCCCAGGGGACCCCGCTCCAACCCAGG... ... ... 228470 GTGTCTGGACTTCCTTAGAATGAATTTCTGTTGGCTGTGCTTTTTC... 228471 GTTTTGTGCCTGTCTCATTTTTTGTTGTTGTTGAAAACTGAACATT... 228472 GGTATTTTGACCAAGATTTGTAATGTCTAGTGTGTGTTCTTTATGG... 228473 AATGAATTATGCACACCCTTTAATAAATGTAAACTAACAATTGGTT... 228474 AAGGTATCCTAGATTATTCTTTCTATTTTTGTCAACATGACATATA... 228475 TCCCCACTTCAAATTTGAAGATCCCAAATTTGAAGTCATTGAAAAA... 228476 ATCACTTCATTGCAAAGTCCAATTAGATCTTGTCTAATTACCATGT... 228477 ACTCCCTGCCAGCAAAACATTGCTTTTAATTCCACTGCCCTTCCCA... 228478 TGATAAGTATGAAATGTGACGGATATATTTCTTAGCTTGATGTAGC... 228479 GGTGGCCTGTCTTTTTGCCTTTGCTTAGGCTGCAGGGCTCCGTGTT... 228480 AAGTGACATTGAGAAAGATGGCAAGCCCTGCTACAGTCACAACCAG... 228481 AGGCAGCAGGTGCAAAAATTACCCACTCCCGACCGGGGGAGGTACT... 228482 AAAAATAACAATACAGGACTCTTTCGAGGCCCTGTAATTGAAATGA... 228483 TAGAGGTGAAATTCTTGGACTGGTGCAGTTCGGACCAGAGCTAAAG... 228484 AGCTTCCCAGAAACCAAAGTCTTTGGGTTTTTGGGTTCCCAGGCGA... 228485 GTACACACTGCCAGTCGCTACTACTGATTGGATGGTTTAGTGAGGC... 228486 TGTATCGATTTTCCAGGAGTATTCTTTTGTGTGTTGTTTTTTCTCA... 228487 ATATTGATTGATGTCTCATGTCTCCCTAAAATGTATAAAACTAAGC... 228488 CTCTCACCTGAGCATCAGAGGGATGATTTCGAGTTCTCTGTGTTTT... 228489 CTAAAACTCTAAGGCACCTTCTCCCCACCCAACCAGTCATCTGAGT... 228490 AAAATTAAGTGCAGAAAATATGTTTAACATTATATAGGATGTCTTG... 228491 TGCATTATGCAATGATGTAGAATACTGTTTGTAAAGTAGTTGTCTA... 228492 TAGAATACTGTTTGTAAAGTAGTTGTCTAGACTCTAGTGAAAATAA... 228493 CATCTATTGCCTGAGAGCACAGAGGAAGGGACAATGATCGGGATAT... 228494 GGGAAAAAACAAAAATACAATGGTTATTCAGTAATTGATAGGGAGA... 228495 AAGAAATAGCCAATCATCTATCACCTAAGAACACAGTGGGAGGGAC... 228496 TCCACACTCTAATATTCCCTAATATCTACTGTTCCTAAATTTATGT... 228497 GTTTTTGCCCTTTGCCTTGTGATCTTTGTTGGACCCTTATCAGTAG... 228498 CTATGTATTGATCATCCCGGTGATGTAACTTTTGTCTCAGCTCTGC... 228499 ACATGTATAATCCCAACATTTTGGAAGGCCAAGGCAGGAGGTTTGC... wide_seq_ext pas_pos \ 0 AGACAGACAGGAAGATTTGAGAAAAATCAATGAGAGGAAAAAGTCA... 12780316 1 TGCGTGGGTTTTCTCTGAGTTCTCCAGCTTCCTCCCACATTCCAAA... 
12788705 2 GCCAAAGAAATTTGCTCTGATCTTGCTTTTCTACAACAGAATCATT... 94546865 3 GCTCCTCATGCTGTATCCCCAGTCTCTCGGCCTGCCATGTCATCAT... 94501567 4 CTGGACCTTATACCCACATGGTCATTTCTTTCCTCAGGAGCCCCAC... 94476130 5 TGGCTGGGATGAGGGAGGGCTGCTACCACTGCCTCAATATTTCACC... 94465954 6 TGGAACACCTGATGGTGAAACCAAACAAATACAAAATCCTTCTCCA... 94458420 7 TGCTGATGACCCTTCCTCTGTGACCGCTGAGGAAATCCAGAGAGTG... 229661641 8 GAGTTTTAATAATTGTAACTTTTTAAATGTCTATAGCACTGAAGTT... 229653571 9 AAGTGCTTTTTCTCCATGGATGAGGCTAGACCCTAAGAAGTAATTA... 229652609 10 GTAATTAAGTCAATGTAAATCAAATGGAAGTTTTCCCATGAACTAA... 229652570 11 TGTGTTCTTTATAAAGTGTGATTTTCAGAAAGCAAACAACACAATT... 229652351 12 CAATTTTATCTTTAGGTAATATTTTATATCATAGATTAAAATTTAT... 229652210 13 TAGTGAACTTTATCTGTGTCTGTCACTTTTTTTTTTTTTATGACCC... 229652165 14 AGAAGAATATTAAATTAAGTACTTGTAGTGGGGCTGTACACCAGTG... 94942786 15 TAGGTACTTGGAAAAATTTTGTGGCATTAAAAACCAGACAAATGTA... 94944251 16 GTCTCTGATATTTGTGATGGCAAGAATCACTTTTAAGTTTTCTTTG... 94944623 17 GTTTTCTTTGAGTTATCTTATTGAAATGTAACTTTAGTCATTTAAT... 94944659 18 CTTTGTTTTGTTTTTCGAATTTTTCTGTTTATTTTCCTAATTCTTT... 94944865 19 ATTACAGAATATACTTAGAAAGGCAAAGTACATTGTAAAATAAAGT... 94982896 20 TTAGTTTTTTTTAAAAAAAAAAACAAAGCAACAAATTAACTAGATA... 94982947 21 GTGGTAGTTGGAAACAAATCATAATGTATTATTTAAATGTTTAACA... 94983300 22 GTGTGTTAGAAGCCCATTCATTAGAAGTGTGGTGGTTATTTGGTAT... 94983669 23 TCTTTGACTGTGATTTTATGTTTAAAAAGTATGTTCTAAAATTATT... 94983783 24 TAATGACTTTATGTATTATTTGCACAGGGAGAATTGAAACTGAGTA... 94983960 25 AGGGAGAATTGAAACTGAGTATAATCAATAAGCTAGATACGAAATC... 94983985 26 TCAGAAAGGGGCATTTTGTACTCTTGTTTTTGCATAACTGGTTTTG... 94984049 27 CGAACTTGTATACTTATTTTCTGTTCAGATTAAAAAAAAAAAAAAA... 94984196 28 GCAGTGGGAAATGGTAGTTTAATCCGAAGAATAAACCAAAGAATAA... 94984889 29 AGCAATAAGGCAGCCCCCGGACCTCACCCCGCGCCGGCCCCCCAGG... 214778817 ... ... ... 228470 AGGCCTTTGAACAATGTAATATAACTGATTTATAAGCTACTCTAGT... 2834108 228471 ATTTCTGTTGGCTGTGCTTTTTCTTGCGTATAGGCTGTACTTTTTT... 2834035 228472 TATCCTAATCAATTACTGAGCCAATTGATTTTTTTTTTTGCGTTAA... 
2802582 228473 CATCTTTTGGCATCCTTGAAATGTCAGTTTTGTGTAATTGTCTCAT... 2802478 228474 TAAAATACAAATACTACTGTCATTCTTTAGTAAGTGCATATAATTA... 2904140 228475 GGGAGCTTTTAAGCTTGAGCAAATATATGATAAAGTAGACATGAAT... 6769480 228476 CCTCCTGGCATCATCCCATGGCAAGATCCAATCAGAACATGCCTTG... 7325548 228477 CTTAAGAAGTTTCTTTATAATTCTCCCCACCCTTGAGAATTGTGAG... 7375114 228478 GGGGATTGCTAAAGGAACACCTTGTAAAGATTTCCACCCTTCCTCT... 7388596 228479 CTGATGCACTACAACACTTCTTTGTTTTTTTTTTGAGTCCCCCAAC... 8470546 228480 TACATTGTCTGAGACCTGAGAGGTGAGACTGAGCCTGGTGGACCCC... 8518516 228481 GGGTTCCACTCTGAAGAGGGAGCCTGAGAAACAGCTACCACATTCA... 10035458 228482 AGGCAGCAGGTGCAAAAATTACCCACTCCCGACCGGGGGAGGTACT... 10035508 228483 ACTGAGGCCATGATTAAGAGGGATGGCTGGGGGCATTCGTATTGTG... 10035923 228484 AACGATGCCAACTGGTGATGCAGCGGTGTTATTTCCATGACCCGCT... 10036102 228485 GAATTCCCAGTAAGTGTGGGTCATAGGCTTGCGTGAAGTCCCTACC... 10036661 228486 GCCACTTTCATACCTGCCTACTGTTGTATAGACTTCAGAGTAATGT... 14799668 228487 GCTTCACTTTATCCCTGCCTTTCTAGACTGAACCAACATACTTCTT... 14800185 228488 ATTAACTGAGACCTGTCCTAAATTTTTAGGGTTCCATCAGATATGC... 14800374 228489 TTTTCATCAGTTGGATTATTTTTCAGTTTAGTAAGTGATGCAAGCC... 14801802 228490 GATTATTTTGGATAGCTTTGTTTAACGGTGAATGAAATGATTAAAA... 14802687 228491 AAATGTGTAAGTCTGTCGTTTGTTCTTGACTTCTGTCATGTTTTCA... 14804113 228492 GTTTGTTCTTGACTTCTGTCATGTTTTCAAGAATGCATTATGCAAT... 14804130 228493 CTCACTAAAATGCTAATTAGGCAAAAACAGGAGGTAAAGAAATAGC... 21242131 228494 GAACTCCCTTCAGGACAGGAGGATAGATGGTTCATCCCAGGTGATT... 21244040 228495 AGTAGAAAAGAGAGCTCACTAAAATGATAATTAGGTAAAAACAGGA... 21248927 228496 CTGAGCATAGAATCCAACTGTTAGTTTTTCAGCCTTGCCCCCTCCA... 21249813 228497 TATCAAGACAATATGTGCACTGCTGAACATAGACCCTTATCAGTAG... 21920929 228498 GTAATGCAACTCTTCTCTAGGCTCTGCCTACAAGGTCCTTTGTGAC... 58993353 228499 AACAAAACAAAAAAAGTACTGAAAAACTGATATGATAGGTGTGGTG... 
59022296 cut_mode cut_mode_hg38 chrom strand site_type rpm 0 12780335 12720333 chr1 + Intron 0.909316 1 12788725 12728758 chr1 + 3_most_exon 3.328937 2 94546835 94081279 chr1 - Intron 1.375917 3 94501545 94035989 chr1 - Intron 1.096222 4 94476120 94010564 chr1 - Intron 1.455630 5 94465928 94000372 chr1 - Intron 0.557419 6 94458390 93992834 chr1 - 3_most_exon 0.877850 7 229661637 229525890 chr1 - Intron 0.956384 8 229653545 229517798 chr1 - 3_most_exon 0.984183 9 229652589 229516842 chr1 - 3_most_exon 1.306171 10 229652545 229516798 chr1 - 3_most_exon 1.047152 11 229652329 229516582 chr1 - 3_most_exon 2.855958 12 229652175 229516428 chr1 - 3_most_exon 0.786950 13 229652139 229516392 chr1 - 3_most_exon 0.823096 14 94942811 94477255 chr1 + Intron 0.843443 15 94944279 94478723 chr1 + Intron 3.573857 16 94944645 94479089 chr1 + Intron 1.277234 17 94944673 94479117 chr1 + Intron 2.655549 18 94944867 94479311 chr1 + Intron 1.309722 19 94982938 94517382 chr1 + 3_most_exon 0.942836 20 94982973 94517417 chr1 + 3_most_exon 1.032161 21 94983315 94517759 chr1 + 3_most_exon 0.691924 22 94983671 94518115 chr1 + 3_most_exon 0.897813 23 94983784 94518228 chr1 + 3_most_exon 1.012715 24 94983984 94518428 chr1 + 3_most_exon 1.355078 25 94984010 94518454 chr1 + 3_most_exon 1.552984 26 94984085 94518529 chr1 + 3_most_exon 2.642226 27 94984219 94518663 chr1 + 3_most_exon 2.419415 28 94984907 94519351 chr1 + 3_most_exon 1.366080 29 214778785 214605442 chr1 - Single_exon 2.484512 ... ... ... ... ... ... ... 
228470 2834085 2966044 chrY - 3_most_exon 0.482540 228471 2834002 2965961 chrY - 3_most_exon 0.885032 228472 2802564 2934523 chrY - Single_exon 4.082408 228473 2802447 2934406 chrY - Single_exon 1.428628 228474 2904164 3036123 chrY + Intron 1.286321 228475 6769514 6901473 chrY + Single_exon 0.724360 228476 7325572 7457531 chrY + Intron 2.374808 228477 7375141 7507100 chrY + Intron 1.232042 228478 7388632 7520591 chrY + Internal_exon 2.582800 228479 8470571 8602530 chrY + Intron 1.647257 228480 8518539 8650498 chrY + Intron 6.061376 228481 10035483 10197874 chrY + Intron 2.529132 228482 10035510 10197901 chrY + Intron 1.085304 228483 10035930 10198321 chrY + Intron 1.548146 228484 10036131 10198522 chrY + Intron 2.642066 228485 10036680 10199071 chrY + Single_exon 1.318344 228486 14799703 12687774 chrY + Intron 1.221395 228487 14800204 12688275 chrY + Intron 1.641890 228488 14800401 12688472 chrY + Intron 1.297871 228489 14801821 12689892 chrY + Intron 1.137537 228490 14802713 12690784 chrY + 3_most_exon 3.055810 228491 14804136 12692207 chrY + 3_most_exon 11.683354 228492 14804162 12692233 chrY + 3_most_exon 3.513120 228493 21242151 19080265 chrY + Intron 0.625966 228494 21244065 19082179 chrY + Intron 0.932752 228495 21248947 19087061 chrY + Single_exon 1.444722 228496 21249836 19087950 chrY + Single_exon 1.011533 228497 21920957 19759071 chrY + Intron 1.310305 228498 58993378 56847231 chrY + Intron 0.982032 228499 59022321 56876174 chrY + Single_exon 2.584362 [228500 rows x 16 columns]
#Get PolyADB positions in hg38 coordinates
# NOTE: the statements between the triple quotes are a bare string literal,
# i.e. they are NOT executed. Presumably the liftover was run once and its
# result baked into the '..._w_hg38.csv' file loaded above, which is why
# 'cut_mode_hg38' can be used further down without recomputing it -- confirm
# before re-enabling.
#
# What the disabled code does when enabled:
#   * reads the hg19 and hg38 PolyADB BED files and pairs them on 'gene_id';
#   * builds a '<chrom>_<hg19 position>_<gene>' key on both sides and
#     inner-joins the hg38 position onto df (sites without a match are dropped);
#   * shifts 'cut_mode' by the (hg38 - hg19) offset of 'pas_pos' to obtain
#     'cut_mode_hg38'.
'''
polyadb_bed_hg19 = pd.read_csv("polyadb_coordinates_utr3_hg19.bed", sep='\t', header=None, names=['chrom', 'pas_pos_hg19', 'end', 'gene', 'gene_id', 'strand'])
polyadb_bed_hg38 = pd.read_csv("polyadb_coordinates_utr3_hg38.bed", sep='\t', header=None, names=['chrom', 'pas_pos_hg38', 'end', 'gene', 'gene_id', 'strand'])
polyadb_bed_hg38 = polyadb_bed_hg38.join(polyadb_bed_hg19[['gene_id', 'pas_pos_hg19']].set_index('gene_id'), on='gene_id', how='inner').copy().reset_index(drop=True)
polyadb_bed_hg38['padb_join_id'] = polyadb_bed_hg38['chrom'] + "_" + polyadb_bed_hg38['pas_pos_hg19'].astype(str) + "_" + polyadb_bed_hg38['gene']
df['padb_join_id'] = df['chrom'] + "_" + df['pas_pos'].astype(str) + "_" + df['gene']
df = df.join(polyadb_bed_hg38[['padb_join_id', 'pas_pos_hg38']].set_index("padb_join_id"), on='padb_join_id', how='inner').copy().reset_index(drop=True)
df['cut_mode_hg38'] = df['cut_mode'] - df['pas_pos'] + df['pas_pos_hg38']
'''
# Sanity check: number of PolyADB sites currently held in df.
print(len(df))
228500
#Store PolyADB cut mode intervals in hg38
# Each site gets a window reaching 35 bp to either side of its hg38 cut mode.
# The two window columns are added to df itself, not only to the BED export.
df['start'] = df['cut_mode_hg38'].sub(35)
df['end'] = df['cut_mode_hg38'].add(35)

# Six-column BED view: chrom, start, end, gene symbol, PolyADB site id, strand.
polyadb_bed = df.loc[:, ['chrom', 'start', 'end', 'gene', 'gene_id', 'strand']].copy().reset_index(drop=True)

# Discard sites whose window start is missing or negative.
polyadb_bed = polyadb_bed[polyadb_bed['start'].notnull() & (polyadb_bed['start'] > -1)].copy().reset_index(drop=True)

# Tab-separated, no header and no index column, as consumed by bedtools below.
polyadb_bed.to_csv("polyadb_cut_mode_coordinates_hg38.bed", sep='\t', header=False, index=False)
#Load perturb-seq experimental data (knockout gene-level)
# 'Chromosome' holds both numeric and non-numeric labels, so letting pandas
# infer its type chunk-by-chunk yields a mixed int/str column and a
# DtypeWarning. Reading it as str up front gives one consistent type; the
# resulting 'chr…' labels are the same as before.
perturb_df = pd.read_csv(
    "perturb/pseudobulk_counts_de_novo_polyA_sites_by_gene.tsv",
    sep='\t',
    dtype={'Chromosome': str},
)

# Keep only rows whose 'misprime' flag is False.
perturb_df = perturb_df.query("misprime == False").copy().reset_index(drop=True)

# Turn each measured position into a 1-bp BED-style interval [Position, Position + 1).
perturb_df['start'] = perturb_df['Position'].astype(int)
perturb_df['end'] = perturb_df['start'] + 1

# Align column names and chromosome labels ('chr' prefix) with the PolyADB table.
perturb_df = perturb_df.rename(columns={'Chromosome' : 'chrom', 'symbol' : 'gene'})
perturb_df['chrom'] = 'chr' + perturb_df['chrom'].astype(str)

# Column order matters: this frame is written without a header and read back
# positionally after the bedtools intersect.
# Layout: 4 coordinate/gene columns, then one count column per condition
# ('NT' plus one per listed gene symbol).
perturb_df = perturb_df[[
    'chrom', 'start', 'end', 'gene',
    'NT', 'CDC73', 'CPSF1', 'CPSF2', 'CPSF3', 'CPSF3L', 'CPSF4', 'CPSF6',
    'CSTF1', 'CSTF3', 'CTR9', 'FIP1L1', 'LEO1', 'NUDT21', 'PABPC1', 'PABPN1',
    'PAF1', 'PAPOLA', 'PCF11', 'RBBP6', 'RPRD1A', 'RPRD1B', 'SCAF8', 'SF3A1',
    'SRSF3', 'SYMPK', 'THOC5',
]]
/home/jlinder2/anaconda3/envs/tensorflow/lib/python3.6/site-packages/IPython/core/interactiveshell.py:2785: DtypeWarning: Columns (0) have mixed types. Specify dtype option on import or set low_memory=False. interactivity=interactivity, compiler=compiler, result=result)
# Notebook display of the reformatted perturb-seq count table (bare expression).
perturb_df
chrom | start | end | gene | NT | CDC73 | CPSF1 | CPSF2 | CPSF3 | CPSF3L | ... | PAPOLA | PCF11 | RBBP6 | RPRD1A | RPRD1B | SCAF8 | SF3A1 | SRSF3 | SYMPK | THOC5 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | chrX | 293484 | 293485 | PLCXD1 | 349.0 | 66.0 | 49.0 | 31.0 | 21.0 | 15.0 | ... | 114.0 | 12.0 | 57.0 | 53.0 | 205.0 | 179.0 | 3.0 | 45.0 | 7.0 | 35.0 |
1 | chrX | 299444 | 299445 | PLCXD1 | 274.0 | 63.0 | 50.0 | 47.0 | 40.0 | 7.0 | ... | 34.0 | 10.0 | 39.0 | 90.0 | 135.0 | 172.0 | 2.0 | 56.0 | 7.0 | 35.0 |
2 | chrX | 303356 | 303357 | PLCXD1 | 1036.0 | 128.0 | 179.0 | 237.0 | 202.0 | 80.0 | ... | 391.0 | 88.0 | 211.0 | 223.0 | 327.0 | 565.0 | 13.0 | 158.0 | 57.0 | 174.0 |
3 | chrX | 1309922 | 1309923 | CSF2RA | 18.0 | 2.0 | 3.0 | 2.0 | 3.0 | 2.0 | ... | 12.0 | 1.0 | 5.0 | 8.0 | 5.0 | 15.0 | 0.0 | 2.0 | 1.0 | 1.0 |
4 | chrX | 1599220 | 1599221 | AKAP17A | 39.0 | 6.0 | 6.0 | 3.0 | 11.0 | 3.0 | ... | 14.0 | 5.0 | 11.0 | 6.0 | 15.0 | 22.0 | 4.0 | 7.0 | 1.0 | 4.0 |
5 | chrX | 1602520 | 1602521 | AKAP17A | 1019.0 | 162.0 | 178.0 | 209.0 | 226.0 | 140.0 | ... | 353.0 | 126.0 | 293.0 | 169.0 | 422.0 | 575.0 | 71.0 | 125.0 | 61.0 | 156.0 |
6 | chrX | 2717856 | 2717857 | CD99 | 356.0 | 67.0 | 47.0 | 58.0 | 75.0 | 25.0 | ... | 91.0 | 33.0 | 87.0 | 85.0 | 147.0 | 216.0 | 10.0 | 41.0 | 18.0 | 31.0 |
7 | chrX | 2733667 | 2733668 | CD99 | 80.0 | 18.0 | 8.0 | 8.0 | 5.0 | 0.0 | ... | 8.0 | 2.0 | 9.0 | 23.0 | 33.0 | 92.0 | 1.0 | 21.0 | 1.0 | 8.0 |
8 | chrX | 2736437 | 2736438 | CD99 | 40.0 | 4.0 | 4.0 | 8.0 | 6.0 | 1.0 | ... | 9.0 | 3.0 | 12.0 | 6.0 | 16.0 | 23.0 | 0.0 | 2.0 | 2.0 | 4.0 |
9 | chrX | 2741309 | 2741310 | CD99 | 12141.0 | 1397.0 | 1676.0 | 1970.0 | 2428.0 | 816.0 | ... | 3983.0 | 1292.0 | 3508.0 | 2336.0 | 4527.0 | 7130.0 | 273.0 | 853.0 | 544.0 | 1144.0 |
10 | chrX | 2844041 | 2844042 | GYG2 | 28.0 | 17.0 | 4.0 | 2.0 | 4.0 | 2.0 | ... | 10.0 | 1.0 | 2.0 | 4.0 | 18.0 | 18.0 | 0.0 | 5.0 | 0.0 | 0.0 |
11 | chrX | 2882288 | 2882289 | GYG2 | 81.0 | 7.0 | 10.0 | 13.0 | 7.0 | 1.0 | ... | 27.0 | 4.0 | 5.0 | 16.0 | 19.0 | 45.0 | 3.0 | 2.0 | 2.0 | 7.0 |
12 | chrX | 2882818 | 2882819 | GYG2 | 327.0 | 29.0 | 29.0 | 34.0 | 27.0 | 12.0 | ... | 103.0 | 25.0 | 77.0 | 75.0 | 129.0 | 219.0 | 7.0 | 18.0 | 10.0 | 18.0 |
13 | chrX | 7353066 | 7353067 | STS | 97.0 | 14.0 | 17.0 | 26.0 | 29.0 | 13.0 | ... | 22.0 | 9.0 | 32.0 | 24.0 | 44.0 | 54.0 | 0.0 | 4.0 | 10.0 | 15.0 |
14 | chrX | 7354641 | 7354642 | STS | 193.0 | 18.0 | 37.0 | 64.0 | 41.0 | 4.0 | ... | 68.0 | 17.0 | 51.0 | 29.0 | 97.0 | 108.0 | 3.0 | 9.0 | 11.0 | 15.0 |
15 | chrX | 8466510 | 8466511 | VCX3B | 5.0 | 3.0 | 6.0 | 7.0 | 0.0 | 1.0 | ... | 1.0 | 0.0 | 5.0 | 0.0 | 4.0 | 1.0 | 0.0 | 4.0 | 0.0 | 3.0 |
16 | chrX | 9702292 | 9702293 | TBL1X | 73.0 | 11.0 | 17.0 | 12.0 | 10.0 | 4.0 | ... | 28.0 | 6.0 | 21.0 | 25.0 | 30.0 | 46.0 | 2.0 | 5.0 | 2.0 | 8.0 |
17 | chrX | 9719743 | 9719744 | TBL1X | 667.0 | 52.0 | 64.0 | 88.0 | 99.0 | 34.0 | ... | 194.0 | 57.0 | 172.0 | 125.0 | 239.0 | 371.0 | 4.0 | 38.0 | 25.0 | 67.0 |
18 | chrX | 9741053 | 9741054 | TBL1X | 12.0 | 1.0 | 3.0 | 4.0 | 6.0 | 5.0 | ... | 6.0 | 2.0 | 4.0 | 5.0 | 5.0 | 10.0 | 0.0 | 3.0 | 2.0 | 0.0 |
19 | chrX | 9948359 | 9948360 | SHROOM2 | 129.0 | 19.0 | 23.0 | 26.0 | 28.0 | 10.0 | ... | 40.0 | 12.0 | 54.0 | 28.0 | 55.0 | 63.0 | 3.0 | 12.0 | 10.0 | 16.0 |
20 | chrX | 9949443 | 9949444 | SHROOM2 | 220.0 | 24.0 | 43.0 | 67.0 | 58.0 | 20.0 | ... | 96.0 | 26.0 | 56.0 | 46.0 | 119.0 | 144.0 | 6.0 | 17.0 | 15.0 | 18.0 |
21 | chrX | 10016763 | 10016764 | WWC3 | 222.0 | 51.0 | 45.0 | 42.0 | 48.0 | 24.0 | ... | 90.0 | 18.0 | 66.0 | 39.0 | 80.0 | 134.0 | 8.0 | 44.0 | 15.0 | 82.0 |
22 | chrX | 10017211 | 10017212 | WWC3 | 39.0 | 5.0 | 6.0 | 10.0 | 9.0 | 4.0 | ... | 19.0 | 9.0 | 12.0 | 7.0 | 8.0 | 17.0 | 1.0 | 1.0 | 2.0 | 2.0 |
23 | chrX | 10017571 | 10017572 | WWC3 | 11.0 | 4.0 | 5.0 | 14.0 | 3.0 | 1.0 | ... | 7.0 | 1.0 | 8.0 | 5.0 | 1.0 | 3.0 | 0.0 | 4.0 | 3.0 | 3.0 |
24 | chrX | 10144474 | 10144475 | WWC3 | 101.0 | 12.0 | 54.0 | 76.0 | 59.0 | 9.0 | ... | 44.0 | 25.0 | 34.0 | 31.0 | 49.0 | 14.0 | 2.0 | 3.0 | 13.0 | 27.0 |
25 | chrX | 10234779 | 10234780 | CLCN4 | 26.0 | 2.0 | 5.0 | 6.0 | 1.0 | 2.0 | ... | 5.0 | 1.0 | 9.0 | 4.0 | 7.0 | 10.0 | 0.0 | 2.0 | 4.0 | 5.0 |
26 | chrX | 10237652 | 10237653 | CLCN4 | 24.0 | 1.0 | 0.0 | 6.0 | 6.0 | 1.0 | ... | 12.0 | 1.0 | 7.0 | 7.0 | 10.0 | 16.0 | 1.0 | 7.0 | 0.0 | 3.0 |
27 | chrX | 11118730 | 11118731 | HCCS | 379.0 | 65.0 | 44.0 | 47.0 | 55.0 | 39.0 | ... | 112.0 | 25.0 | 93.0 | 62.0 | 165.0 | 249.0 | 21.0 | 34.0 | 14.0 | 47.0 |
28 | chrX | 11121879 | 11121880 | HCCS | 5005.0 | 763.0 | 423.0 | 617.0 | 676.0 | 337.0 | ... | 1190.0 | 297.0 | 1091.0 | 950.0 | 2208.0 | 3124.0 | 118.0 | 604.0 | 172.0 | 465.0 |
29 | chrX | 11122516 | 11122517 | HCCS | 51.0 | 8.0 | 7.0 | 8.0 | 19.0 | 1.0 | ... | 23.0 | 6.0 | 16.0 | 9.0 | 22.0 | 27.0 | 1.0 | 4.0 | 1.0 | 4.0 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
47534 | chr21 | 46301134 | 46301135 | C21orf58 | 229.0 | 34.0 | 46.0 | 64.0 | 58.0 | 12.0 | ... | 112.0 | 17.0 | 65.0 | 55.0 | 113.0 | 99.0 | 6.0 | 15.0 | 21.0 | 35.0 |
47535 | chr21 | 46313176 | 46313177 | C21orf58 | 20.0 | 5.0 | 5.0 | 4.0 | 2.0 | 3.0 | ... | 9.0 | 0.0 | 5.0 | 5.0 | 12.0 | 18.0 | 1.0 | 2.0 | 0.0 | 2.0 |
47536 | chr21 | 46321022 | 46321023 | C21orf58 | 121.0 | 24.0 | 20.0 | 18.0 | 26.0 | 15.0 | ... | 36.0 | 10.0 | 24.0 | 24.0 | 43.0 | 78.0 | 1.0 | 8.0 | 3.0 | 7.0 |
47537 | chr21 | 46321427 | 46321428 | C21orf58 | 32.0 | 4.0 | 1.0 | 3.0 | 4.0 | 1.0 | ... | 15.0 | 1.0 | 2.0 | 7.0 | 10.0 | 12.0 | 1.0 | 1.0 | 1.0 | 1.0 |
47538 | chr21 | 46322416 | 46322417 | C21orf58 | 43.0 | 5.0 | 6.0 | 4.0 | 3.0 | 3.0 | ... | 12.0 | 1.0 | 4.0 | 4.0 | 8.0 | 24.0 | 2.0 | 3.0 | 2.0 | 1.0 |
47539 | chr21 | 46323009 | 46323010 | C21orf58 | 51.0 | 3.0 | 8.0 | 6.0 | 8.0 | 3.0 | ... | 18.0 | 1.0 | 9.0 | 8.0 | 13.0 | 28.0 | 0.0 | 4.0 | 1.0 | 5.0 |
47540 | chrM | 4402 | 4403 | MT-ND1 | 97471.0 | 10727.0 | 17813.0 | 21401.0 | 21728.0 | 5469.0 | ... | 33724.0 | 9194.0 | 22724.0 | 37521.0 | 37739.0 | 55028.0 | 1762.0 | 8064.0 | 5025.0 | 9950.0 |
47541 | chrM | 4874 | 4875 | MT-ND2 | 3442.0 | 781.0 | 1903.0 | 1481.0 | 1072.0 | 388.0 | ... | 1089.0 | 402.0 | 949.0 | 1473.0 | 1192.0 | 1832.0 | 310.0 | 347.0 | 315.0 | 1055.0 |
47542 | chrM | 5516 | 5517 | MT-ND2 | 161152.0 | 26722.0 | 64478.0 | 59672.0 | 45752.0 | 12101.0 | ... | 51116.0 | 18076.0 | 42266.0 | 62672.0 | 56966.0 | 85135.0 | 5327.0 | 12046.0 | 12750.0 | 23927.0 |
47543 | chrM | 5900 | 5901 | MT-ND2 | 7060.0 | 881.0 | 1846.0 | 2011.0 | 2010.0 | 399.0 | ... | 2696.0 | 832.0 | 2200.0 | 1072.0 | 2765.0 | 4054.0 | 169.0 | 742.0 | 535.0 | 927.0 |
47544 | chrM | 6534 | 6535 | MT-CO1 | 69097.0 | 7881.0 | 13739.0 | 15455.0 | 16516.0 | 3338.0 | ... | 23893.0 | 8083.0 | 21089.0 | 12623.0 | 27872.0 | 38585.0 | 1440.0 | 4086.0 | 3665.0 | 7889.0 |
47545 | chrM | 7115 | 7116 | MT-CO1 | 47710.0 | 5792.0 | 10685.0 | 12721.0 | 12303.0 | 3086.0 | ... | 16290.0 | 5501.0 | 15028.0 | 8227.0 | 19079.0 | 27416.0 | 1185.0 | 3189.0 | 3202.0 | 7032.0 |
47546 | chrM | 7439 | 7440 | MT-CO1 | 712899.0 | 66308.0 | 97667.0 | 143284.0 | 162471.0 | 31702.0 | ... | 249056.0 | 76925.0 | 197512.0 | 82165.0 | 302205.0 | 422529.0 | 11362.0 | 40206.0 | 35995.0 | 62462.0 |
47547 | chrM | 8294 | 8295 | MT-CO2 | 1131025.0 | 121149.0 | 187490.0 | 224730.0 | 230681.0 | 53614.0 | ... | 367953.0 | 120726.0 | 311390.0 | 209615.0 | 433037.0 | 609931.0 | 17969.0 | 66104.0 | 50592.0 | 116558.0 |
47548 | chrM | 9207 | 9208 | MT-ATP6 | 769101.0 | 113270.0 | 195132.0 | 204341.0 | 190157.0 | 41156.0 | ... | 254845.0 | 88300.0 | 236634.0 | 232539.0 | 282356.0 | 428224.0 | 14533.0 | 43492.0 | 41806.0 | 77761.0 |
47549 | chrM | 9991 | 9992 | MT-CO3 | 942328.0 | 90177.0 | 168006.0 | 204315.0 | 205105.0 | 38004.0 | ... | 324703.0 | 93999.0 | 254561.0 | 216205.0 | 357633.0 | 516692.0 | 13692.0 | 52770.0 | 44826.0 | 97510.0 |
47550 | chrM | 10404 | 10405 | MT-ND3 | 410615.0 | 70415.0 | 149244.0 | 136603.0 | 112429.0 | 23843.0 | ... | 132119.0 | 47861.0 | 118202.0 | 193483.0 | 143076.0 | 210334.0 | 8604.0 | 27109.0 | 27945.0 | 49768.0 |
47551 | chrM | 10946 | 10947 | MT-ND4 | 24534.0 | 4203.0 | 9287.0 | 8612.0 | 7227.0 | 1450.0 | ... | 8153.0 | 2956.0 | 7533.0 | 8182.0 | 8362.0 | 12710.0 | 661.0 | 1721.0 | 1804.0 | 3891.0 |
47552 | chrM | 11326 | 11327 | MT-ND4 | 15820.0 | 2472.0 | 6056.0 | 5854.0 | 4873.0 | 1031.0 | ... | 5322.0 | 1999.0 | 5367.0 | 6627.0 | 5513.0 | 8638.0 | 539.0 | 1160.0 | 1291.0 | 2943.0 |
47553 | chrM | 12142 | 12143 | MT-ND4 | 499343.0 | 56106.0 | 84547.0 | 107740.0 | 107939.0 | 25895.0 | ... | 164071.0 | 47958.0 | 118091.0 | 105913.0 | 187450.0 | 280727.0 | 8461.0 | 36212.0 | 23365.0 | 55436.0 |
47554 | chrM | 13008 | 13009 | MT-ND5 | 39407.0 | 5570.0 | 8354.0 | 8779.0 | 9235.0 | 1791.0 | ... | 13064.0 | 4394.0 | 11105.0 | 8145.0 | 12860.0 | 22688.0 | 705.0 | 2331.0 | 2130.0 | 4393.0 |
47555 | chrM | 13992 | 13993 | MT-ND5 | 11492.0 | 1407.0 | 2310.0 | 2854.0 | 2965.0 | 686.0 | ... | 3526.0 | 1320.0 | 3074.0 | 1853.0 | 4138.0 | 6628.0 | 268.0 | 960.0 | 711.0 | 1802.0 |
47556 | chrM | 14747 | 14748 | MT-ND5 | 23396.0 | 2596.0 | 2848.0 | 5522.0 | 6476.0 | 978.0 | ... | 8422.0 | 2951.0 | 8120.0 | 2481.0 | 9710.0 | 14555.0 | 323.0 | 1560.0 | 1432.0 | 2311.0 |
47557 | chrM | 15887 | 15888 | MT-CYB | 506445.0 | 51806.0 | 90860.0 | 127597.0 | 131681.0 | 24342.0 | ... | 180604.0 | 60051.0 | 171024.0 | 129651.0 | 204620.0 | 283866.0 | 8134.0 | 30467.0 | 29304.0 | 51724.0 |
47558 | chrM | 16557 | 16558 | MT-CYB | 1544.0 | 236.0 | 416.0 | 493.0 | 480.0 | 64.0 | ... | 513.0 | 198.0 | 599.0 | 254.0 | 524.0 | 908.0 | 44.0 | 80.0 | 129.0 | 192.0 |
47559 | chrM | 12589 | 12590 | MT-ND6 | 5396.0 | 536.0 | 1331.0 | 1280.0 | 1143.0 | 200.0 | ... | 1968.0 | 522.0 | 1493.0 | 1018.0 | 1710.0 | 2749.0 | 77.0 | 269.0 | 283.0 | 602.0 |
47560 | chrM | 13043 | 13044 | MT-ND6 | 5952.0 | 620.0 | 1574.0 | 1435.0 | 1304.0 | 239.0 | ... | 2086.0 | 615.0 | 1515.0 | 1164.0 | 1763.0 | 2948.0 | 102.0 | 395.0 | 293.0 | 647.0 |
47561 | chrM | 13403 | 13404 | MT-ND6 | 5654.0 | 644.0 | 1617.0 | 1515.0 | 1307.0 | 237.0 | ... | 2010.0 | 591.0 | 1593.0 | 1202.0 | 1734.0 | 2794.0 | 93.0 | 317.0 | 278.0 | 649.0 |
47562 | chrM | 13768 | 13769 | MT-ND6 | 12441.0 | 1404.0 | 3770.0 | 3326.0 | 2827.0 | 428.0 | ... | 4441.0 | 1308.0 | 3713.0 | 2896.0 | 4107.0 | 6309.0 | 220.0 | 585.0 | 686.0 | 1150.0 |
47563 | chrM | 14123 | 14124 | MT-ND6 | 38851.0 | 4750.0 | 10819.0 | 10066.0 | 8939.0 | 1075.0 | ... | 14100.0 | 4305.0 | 12045.0 | 9608.0 | 13198.0 | 20205.0 | 518.0 | 1623.0 | 1967.0 | 2904.0 |
47564 rows × 31 columns
#Intersect perturb-seq measurement dataframe against PolyADB V3
perturb_df.to_csv("pseudobulk_counts_de_novo_polyA_sites_by_gene.coordinates.bed", sep='\t', header=False, index=False)
!bedtools intersect -a polyadb_cut_mode_coordinates_hg38.bed -b pseudobulk_counts_de_novo_polyA_sites_by_gene.coordinates.bed -wa -wb > pseudobulk_counts_de_novo_polyA_sites_by_gene_intersect.bed
perturb_bed_hg38 = pd.read_csv("pseudobulk_counts_de_novo_polyA_sites_by_gene_intersect.bed", sep='\t', error_bad_lines=False, index_col=False, names=['chrom', 'start', 'end', 'gene', 'gene_id', 'strand', 'chrom_2', 'start_2', 'end_2', 'gene_2', 'NT', 'CDC73', 'CPSF1', 'CPSF2', 'CPSF3', 'CPSF3L', 'CPSF4', 'CPSF6', 'CSTF1', 'CSTF3', 'CTR9', 'FIP1L1', 'LEO1', 'NUDT21', 'PABPC1', 'PABPN1', 'PAF1', 'PAPOLA', 'PCF11', 'RBBP6', 'RPRD1A', 'RPRD1B', 'SCAF8', 'SF3A1', 'SRSF3', 'SYMPK', 'THOC5'])
perturb_bed_hg38 = perturb_bed_hg38.query("gene == gene_2").copy().reset_index(drop=True)
#Assign count to closest annotated cleavage site
perturb_bed_hg38['se'] = (perturb_bed_hg38['start_2'] - (perturb_bed_hg38['start'] + 30))**2
perturb_bed_hg38 = perturb_bed_hg38.sort_values(by='se', ascending=True).drop_duplicates(subset=['gene_2', 'chrom_2', 'start_2'], keep='first').copy().reset_index(drop=True)
perturb_bed_hg38 = perturb_bed_hg38[['chrom', 'start', 'end', 'gene', 'gene_id', 'strand', 'NT', 'CDC73', 'CPSF1', 'CPSF2', 'CPSF3', 'CPSF3L', 'CPSF4', 'CPSF6', 'CSTF1', 'CSTF3', 'CTR9', 'FIP1L1', 'LEO1', 'NUDT21', 'PABPC1', 'PABPN1', 'PAF1', 'PAPOLA', 'PCF11', 'RBBP6', 'RPRD1A', 'RPRD1B', 'SCAF8', 'SF3A1', 'SRSF3', 'SYMPK', 'THOC5']]
perturb_bed_hg38['sort_index'] = perturb_bed_hg38['start']
perturb_bed_hg38.loc[perturb_bed_hg38['strand'] == '-', 'sort_index'] *= -1
perturb_bed_hg38 = perturb_bed_hg38.sort_values(by='sort_index', ascending=False).drop_duplicates(subset=['gene_id'], keep='first').copy().reset_index()
print("len(perturb_bed_hg38) = " + str(len(perturb_bed_hg38)))
len(perturb_bed_hg38) = 34495
#Append measurements to APA annotation dataframe

# Per-condition count columns carried over from the intersected measurements.
count_cols = [
    'NT', 'CDC73', 'CPSF1', 'CPSF2', 'CPSF3', 'CPSF3L', 'CPSF4', 'CPSF6',
    'CSTF1', 'CSTF3', 'CTR9', 'FIP1L1', 'LEO1', 'NUDT21', 'PABPC1', 'PABPN1',
    'PAF1', 'PAPOLA', 'PCF11', 'RBBP6', 'RPRD1A', 'RPRD1B', 'SCAF8', 'SF3A1',
    'SRSF3', 'SYMPK', 'THOC5',
]

# Left join on gene_id: every annotated site is kept, matched or not.
df = df.join(perturb_bed_hg38[['gene_id'] + count_cols].set_index('gene_id'), on='gene_id', how='left').copy().reset_index(drop=True)

# Sites without a matching measurement come out of the left join as NaN;
# treat them as zero counts.
df[count_cols] = df[count_cols].fillna(0)

# Total count per site across all conditions (vectorized row-wise sum).
df['total_count'] = df[count_cols].sum(axis=1)

#Remove sites with zero total count across all conditions
#(gene-level variant of the filter, kept for reference:)
#df_gene = df.groupby(['gene']).agg({'total_count' : 'sum'}).reset_index().rename(columns={'total_count' : 'total_count_gene'})
#df = df.join(df_gene.set_index("gene"), on='gene', how='inner').copy().reset_index(drop=True)
df = df.query("total_count > 0.").copy().reset_index(drop=True)

print("len(df) = " + str(len(df)))
len(df) = 34495
# Notebook display of the annotation dataframe after joining the counts and dropping zero-count sites.
df
gene | gene_id | sitenum | num_sites | pas | seq | seq_ext | wide_seq | wide_seq_ext | pas_pos | ... | PCF11 | RBBP6 | RPRD1A | RPRD1B | SCAF8 | SF3A1 | SRSF3 | SYMPK | THOC5 | total_count | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | ABCB10 | ABCB10.6 | 2 | 7 | 0 | GTTAAAGATTGAAGCTATTGTCAAATGACAACTTTAAAAAGGCAAT... | TGGAATATTTTAATTAATATAGCATGGCACCTCATTTTCTTTTGCC... | TCAGGTTTTGTATTTTCTTTTCTTGTGGAATATTTTAATTAATATA... | GAGTTTTAATAATTGTAACTTTTTAAATGTCTATAGCACTGAAGTT... | 229653571 | ... | 3.0 | 14.0 | 17.0 | 30.0 | 39.0 | 1.0 | 4.0 | 5.0 | 17.0 | 576.0 |
1 | ABCB10 | ABCB10.3 | 5 | 7 | 0 | TCTGATACATGATGTTCAATTTTATCTTTAGGTAATATTTTATATC... | TGTGCCATAGAAGTATTTACGAAATTGCATTTCATTGTTATGTTTT... | GGTTGAATCTGAGGAAAATAATCCTTGTGCCATAGAAGTATTTACG... | TGTGTTCTTTATAAAGTGTGATTTTCAGAAAGCAAACAACACAATT... | 229652351 | ... | 140.0 | 385.0 | 304.0 | 563.0 | 865.0 | 16.0 | 67.0 | 52.0 | 125.0 | 8657.0 |
2 | ABCD3 | ABCD3.2 | 2 | 15 | 0 | TTCGAGACAAGCCTGGACAAAAAGCGAGACCCGCTTCTTTAAAAAA... | ATCCCAGCACTTTGGGAGGCTGAGATGGGAGGATCGCTTGAATCCA... | CCAGGCGTGGTGGCTCATGCCTGTAATCCCAGCACTTTGGGAGGCT... | TAGGTACTTGGAAAAATTTTGTGGCATTAAAAACCAGACAAATGTA... | 94944251 | ... | 22.0 | 27.0 | 49.0 | 107.0 | 137.0 | 4.0 | 52.0 | 15.0 | 7.0 | 1291.0 |
3 | ABCD3 | ABCD3.3 | 3 | 15 | 0 | GTATGGTTGTTTTACATATGTGTATGTGTGTATATGCATTTCAGTT... | ATTTAATATGTAATGTTATTGTTACATATTTATAACACAGCCATAT... | ATCTTATTGAAATGTAACTTTAGTCATTTAATATGTAATGTTATTG... | GTCTCTGATATTTGTGATGGCAAGAATCACTTTTAAGTTTTCTTTG... | 94944623 | ... | 4.0 | 8.0 | 11.0 | 21.0 | 19.0 | 2.0 | 12.0 | 1.0 | 1.0 | 247.0 |
4 | ABCD3 | ABCD3.14 | 14 | 15 | 0 | GCCTTGACTTGAAAACATAGATAGTTTAATCTTGACTTGAAAAACA... | TGTTCTTTTTATTCTGGTATCTAAATACTGAGAAGTTCATTTATAA... | TCAGATATCCTATACAACCTTTGCTTGTTCTTTTTATTCTGGTATC... | CGAACTTGTATACTTATTTTCTGTTCAGATTAAAAAAAAAAAAAAA... | 94984196 | ... | 180.0 | 508.0 | 342.0 | 655.0 | 971.0 | 23.0 | 109.0 | 67.0 | 133.0 | 11240.0 |
5 | ABCD3 | ABCD3.15 | 15 | 15 | 0 | ATGAGAAAATAAGTATGAAACAGCAATGGTAGTTTGTTTTGCATTA... | TGCCAAGACATATCACCGTGTTCTCATAATAAGTTTTTACTTTTTA... | TTTCATCCATGAGCACCACGCTGCATGCCAAGACATATCACCGTGT... | GCAGTGGGAAATGGTAGTTTAATCCGAAGAATAAACCAAAGAATAA... | 94984889 | ... | 3.0 | 18.0 | 16.0 | 31.0 | 23.0 | 3.0 | 0.0 | 1.0 | 4.0 | 329.0 |
6 | ABL2 | ABL2.17 | 8 | 24 | 3 | CTGAGGGGAGAGGGAAAAGGACTTGTTTTCCTGTGTTCTTGTTTTC... | TGTGGTGCAGAGGTAGCCACTGTTAGCCTGGTGGGAAAATGCACAC... | TGTCATGTGTACAGGAAATCAGTGATGTGGTGCAGAGGTAGCCACT... | TTCAGCAGCTGCTGGTGTGCCCGGGACAAACCCTGTCCTTAATAAC... | 179076768 | ... | 6.0 | 12.0 | 8.0 | 24.0 | 34.0 | 2.0 | 9.0 | 2.0 | 16.0 | 485.0 |
7 | ABL2 | ABL2.16 | 9 | 24 | 2 | CCACAAGGCCATTGCTGCTGTAATAAGAACTGCAAATCAGAGTGCT... | CAAGAGAAATTTTTGTTCAGGGCTGTTGGAAGTAGCTGTTAGCCTT... | GCAGAAAAGAAAGCTGGGAATGTACCAAGAGAAATTTTTGTTCAGG... | GGTACTAATGGTGATTATGCTCCAATTTACCTAATGAATTTGGTGG... | 179076299 | ... | 0.0 | 4.0 | 1.0 | 3.0 | 9.0 | 1.0 | 0.0 | 2.0 | 1.0 | 82.0 |
8 | ABL2 | ABL2.8 | 17 | 24 | 0 | ACTGCTTTCTCTGTCTTCTCACAAGGTTTGCCAAGTTGTGTTCTGT... | ACTGCTAACAGTGTTAAACTTGATGTAAATAAATGAGGCCCTTGAA... | CTCTCCGTCTGTTGTCTGACTGTGAACTGCTAACAGTGTTAAACTT... | GTTCTTAATTGTTATTGTAATATATTTTCAGTTGTTTTTCTAATTT... | 179068493 | ... | 83.0 | 124.0 | 132.0 | 254.0 | 364.0 | 13.0 | 61.0 | 29.0 | 76.0 | 3961.0 |
9 | ACADM | ACADM.4 | 4 | 19 | 0 | TAAACTTATACATATGAAGCTTTATATGTTTTGTTTGGAATATGTT... | CCAGGATTAGGATTTAGTTTTGGTATATGTTCGGTTCTATCTTTTG... | TACAAAAGCCAATCGACAACGTGAACCAGGATTAGGATTTAGTTTT... | TCTTTACAGGTCCTGAGAAGTATTTCTCGTTTTCATTGGAGATCAC... | 76194251 | ... | 22.0 | 53.0 | 27.0 | 55.0 | 109.0 | 1.0 | 12.0 | 19.0 | 15.0 | 1167.0 |
10 | ACADM | ACADM.14 | 14 | 19 | 0 | TCTATTGTACACAATCTCATTTCATATGTTTGCATTTTGGCAAAGA... | CTTGCCTTAAATTATTTTTATATGACTGTTGGTCTCTAGGTAGCCT... | CCTTATTTAAAATAAATCAATAAAGCTTGCCTTAAATTATTTTTAT... | CAAGAACTTTCTTGAAAATCTTATTTAATTCTGAGCCCATATTTCA... | 76229155 | ... | 405.0 | 1137.0 | 707.0 | 1187.0 | 1779.0 | 42.0 | 184.0 | 146.0 | 330.0 | 20840.0 |
11 | ACAP3 | ACAP3.3 | 3 | 5 | 2 | CCGGCCTCCTCCGGAGGCACCTTCTCCTGGTACTCGGCCCAGAGCC... | CCACGTGGCTGGCCACGAAGGTCCCCGTGCCAGACAGCCCCAGCCG... | CTGGCTGGACGCGGGCGTCCCAAGGCCACGTGGCTGGCCACGAAGG... | CTGGCGTCGCGGGTGCTGGGCGGGAGGGGCTCTGGCCTGGGTCCTC... | 1228274 | ... | 28.0 | 108.0 | 57.0 | 112.0 | 172.0 | 8.0 | 27.0 | 15.0 | 42.0 | 2090.0 |
12 | ACAP3 | ACAP3.2 | 4 | 5 | 0 | TCTTGCCCCAGGCCCCTGCTGGCGGGTCTCACCCCCCACCCCTCGC... | AAGAACAGAATTGATTCTTGCCCCTCTCCCTGTGTGAGCTTGGCCC... | CTCTTGCCTGCTGCCTGTGACCCTGAAGAACAGAATTGATTCTTGC... | TGGGGAGGCTCCCTGAGGGCACAGTGGGCGCTGGACCCGGCCCCCC... | 1227789 | ... | 49.0 | 99.0 | 73.0 | 160.0 | 187.0 | 12.0 | 44.0 | 22.0 | 40.0 | 2661.0 |
13 | ACBD3 | ACBD3.18 | 5 | 22 | 2 | ATGTGTGTAGAGATATGCCAAAATATATCATTATCCCTACCCCATG... | GCATTGCTATTCATTTAAGTAATTGGCTTGTAAATGATACATTCAA... | TAACCTATATTAGCAGCAAAGAGAAGCATTGCTATTCATTTAAGTA... | TGAAAATCCAGACCTTGGAAGAAGATATCTTAAAATCTTAATTGTG... | 226344254 | ... | 4.0 | 1.0 | 6.0 | 7.0 | 19.0 | 1.0 | 6.0 | 1.0 | 0.0 | 147.0 |
14 | ACBD3 | ACBD3.17 | 6 | 22 | 4 | CACTGCTGTCAGCGTGCATGTCAGTGAGTCCAGCGATGACGACGAG... | AATTATGACATTGGGTTTGGGGTGTATTTTGAATGGACAGACTCTC... | TCTCTTTTGGGAATTTGCCACAGACAATTATGACATTGGGTTTGGG... | CGAGGAGAAGTGGTCACTGTTCGAGTACCCACCCATGAAGAAGGAT... | 226340041 | ... | 40.0 | 137.0 | 116.0 | 238.0 | 302.0 | 6.0 | 31.0 | 28.0 | 31.0 | 3479.0 |
15 | ACBD3 | ACBD3.11 | 12 | 22 | 0 | GTGGATGCTGAAGTTACATGAGCTACATGTTAAATATTTAAAGTCT... | AGATTCCTCAGACTCATCCAGCCCTTGGGTGCTGACCAGCAGAGTC... | AGCATTCATACTTTGGGGTTAAAGGAGATTCCTCAGACTCATCCAG... | TGATGGTTTGTGAACTCTTGCTGGGAATCAAAATTTCCTTGAGACT... | 226334019 | ... | 12.0 | 48.0 | 24.0 | 56.0 | 75.0 | 5.0 | 25.0 | 5.0 | 40.0 | 1490.0 |
16 | ACBD3 | ACBD3.10 | 13 | 22 | 0 | TTTGTTTTGGCTTCATAGAGTATCTCAAATTGAAACTTTTCTGCAC... | TGGTATTCATACTACTAGTAGCAAAATACAGGTTTTTTGTTTTGTT... | AACTTTGAATCCTTGTATCTTTATTTGGTATTCATACTACTAGTAG... | TATCAAGATACGTAGAACACCTCAGAGATTTTTCTTCAGGAACTTC... | 226333494 | ... | 14.0 | 50.0 | 28.0 | 55.0 | 84.0 | 3.0 | 19.0 | 10.0 | 23.0 | 1392.0 |
17 | ACBD3 | ACBD3.2 | 21 | 22 | 0 | ACAGTACAAGTGCGATTTCAAAAAGATCTTGAAAGTAATATATTTA... | AGAATATTTTTGGTTTTAAACTTTCTTATTGCCTTTGGCTGTTGAT... | GCGGTTCCTGTCATGTGTTCATGTCAGAATATTTTTGGTTTTAAAC... | CCTAAAAATATCATTGTTCTTGGGAGCAGTGTATGTTACTTTACAT... | 226332399 | ... | 119.0 | 333.0 | 241.0 | 579.0 | 862.0 | 39.0 | 105.0 | 63.0 | 114.0 | 8577.0 |
18 | ACBD6 | ACBD6.12 | 1 | 12 | 2 | ATAATGTGATGCAATGCAAAATGCAGTATTACAACTTTTATTGTGA... | GCCTGGGCAACAAGAGCGAAGAAACTCCACCTCAAATAATAATAAT... | GAAGATTGTGCGCCATTGCACTCCAGCCTGGGCAACAAGAGCGAAG... | CTGTAATCCCAGCTACTCGGGAGGCTGAGGAGGCGGAGGTTGCGGT... | 180470430 | ... | 4.0 | 9.0 | 9.0 | 12.0 | 21.0 | 1.0 | 8.0 | 1.0 | 2.0 | 248.0 |
19 | ACBD6 | ACBD6.8 | 5 | 12 | 0 | TTTTTATCTTTGTAGATATTGGCTTACTTTCTTTGGATTTTCATTC... | ATGTACCCTAGAGCCCACAAATTCCTTAGTTTGAGGAATTATGGCA... | CCTGCATATCTCTTTAAAGGAGTTTATGTACCCTAGAGCCCACAAA... | TTATCTAGAAGATGAAGAGCTGGAAATTGAGTCACTTAAAATGATC... | 180386169 | ... | 4.0 | 1.0 | 8.0 | 8.0 | 22.0 | 0.0 | 0.0 | 0.0 | 4.0 | 139.0 |
20 | ACBD6 | ACBD6.6 | 7 | 12 | 0 | AACTACAAAAATAATACTTCTTTTCCACCCGTCTTTGGTATGTATT... | GACTGGAAAACTGCAGTCTGTAATAGCATAAGGCTTCCATTATGAA... | CACAACTGGCAAGGCTTAATCAAAAGACTGGAAAACTGCAGTCTGT... | CCAGAGGAGGTGACAGGCTGCAAAACAGTTTCTTTGGTGCTGCAGC... | 180257391 | ... | 253.0 | 619.0 | 698.0 | 1173.0 | 2039.0 | 65.0 | 256.0 | 83.0 | 267.0 | 20294.0 |
21 | ACOT11 | ACOT11.3 | 3 | 9 | 2 | TCTCTGGAATCTGTCAACCCAGTTTTGGGCTCCAGGTGGATGGGTT... | TCCCTTGTTAAAGGGGCAGTGGGAGTTATGGGGTCATCAAGGACCT... | TTGGGTTATCATAAGGTGTTAAGAGTCCCTTGTTAAAGGGGCAGTG... | TCATGCCTTCTGTGTCTGGAAGAGGCGGCAGAGGCAACAGTGTTTA... | 55075774 | ... | 3.0 | 6.0 | 8.0 | 24.0 | 21.0 | 2.0 | 0.0 | 2.0 | 1.0 | 192.0 |
22 | ACOT7 | ACOT7.13 | 6 | 18 | 0 | GGAGCTTTTCCTCCGGCTGAGATATATATATAGAATACATTTTTAG... | TAGAGGGAAGGAACCAGGATTCTGAGTCTCTGCAGTGTGGGCCCCG... | CACAGCCCCCAGGAAGAGCTCTTCTTAGAGGGAAGGAACCAGGATT... | GGACAGATGTGACCAGCTCCTCCCGCAGCCTGACCCTGCGGTCCAC... | 6392222 | ... | 1.0 | 1.0 | 4.0 | 6.0 | 8.0 | 0.0 | 0.0 | 0.0 | 1.0 | 87.0 |
23 | ACOT7 | ACOT7.8 | 11 | 18 | 0 | TCCCCTGGGTGTGGCCGCGTCATGTTGATCTACGCTGTTATTTCTC... | GCAGAGGGGCAGGGAAGGGCGCCAGCTTGAATGGAGGGAATGGCAG... | AGGGCCAGCCCTGGGGGAAGGGGCTGCAGAGGGGCAGGGAAGGGCG... | CCCCTTCCAGAACTCCATACCCTGGGGGGGGGTGGTCAGAGTGCAG... | 6336907 | ... | 1.0 | 5.0 | 1.0 | 5.0 | 14.0 | 2.0 | 0.0 | 0.0 | 0.0 | 84.0 |
24 | ACOT7 | ACOT7.1 | 18 | 18 | 0 | GGGAATGCTTCCGAGCACGCTGTAGGGTATGGGAAGAACCCAGCAC... | TGCTACACAGTGTTGTCCCGAGCGCCGGGAGGCGTTGGGCAGAAAC... | TTTATTTATATCATTCCAGTATCAATGCTACACAGTGTTGTCCCGA... | GTATCACAGTGTTAACCTGTACTCTCTCCTGCAAACCTACACACCA... | 6324354 | ... | 635.0 | 1524.0 | 1207.0 | 2112.0 | 3292.0 | 110.0 | 542.0 | 325.0 | 583.0 | 37630.0 |
25 | ACTA1 | ACTA1.1 | 1 | 1 | 0 | TATTTTTCGAAACAAAGCCCTGTGGAAGAAAATGGAAAACTTGAAG... | GACACAGTGTTTATAACGTGTACATACATTAACTTATTACCTCATT... | ACTTCCGTTGCTGCCATCGTAAACTGACACAGTGTTTATAACGTGT... | TGGGGGGGCGGCTGAGCTCCAGCCACCCCGCAGTCACTTTCTTTGT... | 229567020 | ... | 26.0 | 35.0 | 10.0 | 24.0 | 38.0 | 10.0 | 8.0 | 14.0 | 7.0 | 731.0 |
26 | ACTL8 | ACTL8.1 | 1 | 1 | 0 | AGCCTGGGATGCCCTTGCCACCCGTGGTTGGATCTTGTTTTATATC... | ATTTCTGGTCCTACAGGCCCTTTCTGGCCAGGGAGGCATTGCTGCA... | GACTAGGGGATGGGGGACAGTTGACATTTCTGGTCCTACAGGCCCT... | AGTAGGTTTTAACTGGGGTAGCACTCCTGCTAGGAGTCCCAATTAT... | 18153534 | ... | 3.0 | 2.0 | 9.0 | 6.0 | 11.0 | 1.0 | 0.0 | 1.0 | 2.0 | 169.0 |
27 | ACTN2 | ACTN2.4 | 4 | 8 | 0 | GAGCGATCTGTGATGCTGAGCTTCTGTAATCACTCATCCCATCAGA... | GTGCCTGGTGCACTGGATTACGCTGCGTTCTCTTCCGCACTCTACG... | GCCCGCCTACTCGGGCCCAGGCAGTGTGCCTGGTGCACTGGATTAC... | CTGCGTCGGGAGCTGCCCCCGGATCAGGCCCAGTACTGCATCAAGA... | 236925956 | ... | 1.0 | 10.0 | 7.0 | 19.0 | 22.0 | 0.0 | 8.0 | 2.0 | 7.0 | 380.0 |
28 | ACTN2 | ACTN2.5 | 5 | 8 | 0 | TACAAAATACCCAAGATTTAAGACCGGGGGGAAAAAACCACAAATT... | TAGGAAATTAGGAGGATCTAGGGACAGAAGGAAAGTGAAAAATGTG... | TAAACAGAACAAATTACTTGAGTAATAGGAAATTAGGAGGATCTAG... | TTCTGAGTTTTTAGCAAAATGTAATGAAATATCAGGTTGATTTCTT... | 236926307 | ... | 1.0 | 14.0 | 10.0 | 17.0 | 28.0 | 0.0 | 11.0 | 4.0 | 11.0 | 390.0 |
29 | ADAM15 | ADAM15.6 | 6 | 7 | 0 | GGTTGGACGGGATTGAGGAAGGTCCGCACAGCCTGTCTCTGCTCAG... | TCTGCGGACCTGCCGGCGTAGTTGCAGCGGGGGCTTGGGGAGGGGC... | ACCGCCACGCGCTGTCAAGCAACACTCTGCGGACCTGCCGGCGTAG... | CTACCATGACTGAAGGCGCCAGAGACTGGCGGTGTCTTAAGACTCC... | 155035225 | ... | 120.0 | 346.0 | 311.0 | 457.0 | 811.0 | 15.0 | 43.0 | 66.0 | 110.0 | 8455.0 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
34465 | ZNF182 | ZNF182.2 | 1 | 2 | 0 | GAAATAATATGGTAGATAATTTAGACTTATTTAGTAGAAGTTCTGC... | TACAAACCTTGTTGCTTCAATACAAAGACCCGATAAACACGAATCA... | AGTTTGGAAACATACTTCATCTGAGTACAAACCTTGTTGCTTCAAT... | ATTTTCTGACAAGAAAACAATTATCACCAAGAGTGCTCGTGACTGT... | 47836945 | ... | 3.0 | 8.0 | 8.0 | 25.0 | 32.0 | 0.0 | 8.0 | 5.0 | 5.0 | 380.0 |
34466 | ZNF182 | ZNF182.1 | 2 | 2 | 0 | TTGAAAATGTGGGTGTACTATGTACTATGTGGATGTACTACCTTTT... | GGCTGGGGGGATGGAATTGAGAGGGAGACAACTGTATCCTTTCATA... | CCTTGGGAGGGGAAGGGGCAGTGGTGGCTGGGGGGATGGAATTGAG... | TTCATAAGTTATCTCTGGAAAGAGACATGAGAACCTGGTAACCCTG... | 47834270 | ... | 20.0 | 40.0 | 42.0 | 87.0 | 96.0 | 7.0 | 14.0 | 14.0 | 38.0 | 1251.0 |
34467 | ZNF185 | ZNF185.5 | 5 | 6 | 0 | TTGCTTAGTGTTTCTAATCATACTTAATCCACACTAATGTGCGCAA... | TTTTGCAGATCTGAGGAAGAGGGATGCATTACCTTTTTGCTTCTTT... | CTCTCATGTCTAAAAAGGCACAGAATTTTGCAGATCTGAGGAAGAG... | TTGAAAGAAATCTTGCAAGAGCCATTATTGACTTAGATCCAAAACA... | 152141997 | ... | 10.0 | 27.0 | 16.0 | 56.0 | 63.0 | 4.0 | 5.0 | 7.0 | 18.0 | 576.0 |
34468 | ZNF275 | ZNF275.4 | 4 | 6 | 2 | ATTTTATGTCTACGTATATTGTTCCTTTACTGAACCCACCACATGC... | TAAGATGGGTGAAAGTCGATGCCTTCTAGTCTCAGTGAATTTAACC... | GTTCAAACGTGTGTTCTCTGTTCTCTAAGATGGGTGAAAGTCGATG... | TGCCACTTGGCTGCTTCCTGGCCAAGTCGCACCTGACTGCATGAAC... | 152617885 | ... | 3.0 | 3.0 | 5.0 | 17.0 | 26.0 | 1.0 | 3.0 | 4.0 | 3.0 | 291.0 |
34469 | ZNF275 | ZNF275.6 | 6 | 6 | 0 | GTCTGATCCCCTACCAAATCTAGCACAGTGCCTTGCATCAAGTAGA... | CCCCCACCCATTAAATTGTGAGCTCTTAGAAGACAGGGGTGGCCTT... | GCTTCACCCCCTCCTCCTCAGCCCTCCCCCACCCATTAAATTGTGA... | TTCTGGCACTCACTATAATCAGCCTTGCACTAGAGCTGTTTGTGGA... | 152618362 | ... | 55.0 | 164.0 | 90.0 | 240.0 | 268.0 | 10.0 | 31.0 | 13.0 | 66.0 | 3151.0 |
34470 | ZNF280C | ZNF280C.3 | 4 | 6 | 0 | TTGGTTACATTGAATACAGATTTGCTGAACAGTTTTGATGTTATTT... | GCTCATATATATATAGATGTATATTTTTTTTAATTTCTTGTTTGTT... | GGACCAAAAAATGCACTTGTTTCTTGCTCATATATATATAGATGTA... | GCCATTAATGTAATTCCTCTGGATAAAGATAATATATTCAAAAAAT... | 129336743 | ... | 44.0 | 69.0 | 72.0 | 82.0 | 155.0 | 14.0 | 18.0 | 15.0 | 49.0 | 2016.0 |
34471 | ZNF280C | ZNF280C.1 | 6 | 6 | 0 | TGATTCAGAAATAGCCATGTCATGCATGTGTCCTTTTTTGTTTTCA... | AGCTTGACAATCTGATCCCTCTTCACCTTCAGACTGTTAGTTATTT... | GTTTTAATAGGTGTTAGTGAGTTTTAGCTTGACAATCTGATCCCTC... | TAAAACTATTTATGCAGTCAGTACTAAGCTTACTTGTTATAAGCAG... | 129335349 | ... | 5.0 | 19.0 | 11.0 | 10.0 | 21.0 | 0.0 | 1.0 | 3.0 | 3.0 | 290.0 |
34472 | ZNF41 | ZNF41.22 | 1 | 22 | 0 | TAGCTAATTTTCCAGCACCAGTCGTTAAATGAATACCCTTTTCTAA... | TATTTTGGTGTATGGTATGAGATGGAATCAGTTCGTATATTCCATT... | GTTAAATTTTAATCTAACTGGAATTTATTTTGGTGTATGGTATGAG... | ACAAATACGCCTTAAATTTATTTTTAATATATTTATTTCAATTTTT... | 47339710 | ... | 0.0 | 5.0 | 1.0 | 4.0 | 11.0 | 0.0 | 1.0 | 1.0 | 6.0 | 101.0 |
34473 | ZNF41 | ZNF41.16 | 7 | 22 | 2 | CCTAGCTCTAGGCTATGTTACAGAAATATAGTCATTGAATGATACA... | AATGGAATTTTTAAATTTAAAGATATAACTTTATGAATTGAGAAAT... | CTCATGGTATATATATTTTAAGTGCAATGGAATTTTTAAATTTAAA... | ACATAGAAAAGATTTCCATGAAAAACTTTTTTCTTTTCCCTTGGGA... | 47306101 | ... | 6.0 | 19.0 | 9.0 | 22.0 | 26.0 | 0.0 | 1.0 | 3.0 | 15.0 | 331.0 |
34474 | ZNF41 | ZNF41.12 | 11 | 22 | 3 | GATAATTCATTTTCATTGTCATGTTGTATCCCATTCTGTGAATATG... | TCATTCACATTATGTATGTGAGAGTCATCCATATGTTGCATATAGT... | TATACTCTTTAGCATCTGTCTTCTGTCATTCACATTATGTATGTGA... | TTTTGTATTTTATATAAATTGAGTCAATTATATATCATATAATTGA... | 47305154 | ... | 1.0 | 0.0 | 4.0 | 12.0 | 5.0 | 0.0 | 0.0 | 0.0 | 1.0 | 68.0 |
34475 | ZNF41 | ZNF41.2 | 21 | 22 | 4 | TTTAAGTTTTACATCTAGATCTAGCATGTATTTTGAGTTATATGGT... | TTGCCTAACCAAAGATTACAATGATTTTTTCCTGTGTTTTCTTCTA... | TTTGAGTGCTGTATCTAAGAAATCCTTGCCTAACCAAAGATTACAA... | CTTATTTTGATGAAGTCCGTTCTTTCCATTTGTTCATTTCTGGGTT... | 47304199 | ... | 13.0 | 20.0 | 19.0 | 29.0 | 46.0 | 0.0 | 4.0 | 0.0 | 10.0 | 499.0 |
34476 | ZNF449 | ZNF449.3 | 3 | 11 | 0 | TCTATCAGACGTATTGATTATAGCAGTACTATAGTTATTCTGCTGT... | AAATTATGACAATCCTTTTAGAGGTAGGGTCAATATAGTGGATAAA... | TCATAGGTGTAAACATAAAGCATATAAATTATGACAATCCTTTTAG... | TCTTCTTTTTTTAATTACAATGAAAAATTTTGTGTTCCAAGGCAAC... | 134495399 | ... | 3.0 | 4.0 | 3.0 | 8.0 | 17.0 | 2.0 | 5.0 | 1.0 | 4.0 | 192.0 |
34477 | ZNF449 | ZNF449.6 | 6 | 11 | 0 | AGTGCCTTAGAATGGATGTGCCCAACTGCTCTGTATTTATGCAATA... | ATGTAATGGCTTCTCTTTTCTCTCTTGTGGAATTGCATTCAAACCA... | TGAATGTAGAGATGAAAAATACAGAATGTAATGGCTTCTCTTTTCT... | GGCTCAAATTGATATCCCAGTAGCAATAAACATATAATATAGGAGG... | 134497046 | ... | 10.0 | 24.0 | 26.0 | 48.0 | 51.0 | 4.0 | 8.0 | 5.0 | 4.0 | 571.0 |
34478 | ZNF449 | ZNF449.8 | 8 | 11 | 0 | AAGATTAAATGAAATATATTTTGCTCTGGCCCTACACACTGTAAGC... | CCTAGCTATTATAAAGGGGAAATTACAGTACCTACCTCAAAAGTAC... | TAAGGGCAAAATACAGTACCTACCTCCTAGCTATTATAAAGGGGAA... | GGATAATTATACTTCTATGTCTAATTGTACTTCTGAGCATTTCAAA... | 134497629 | ... | 2.0 | 6.0 | 2.0 | 10.0 | 10.0 | 0.0 | 0.0 | 3.0 | 7.0 | 171.0 |
34479 | ZNF630 | ZNF630.4 | 2 | 5 | 0 | AATATGGAATCAATTTGCTCACCCTCAAACTGTCTCAGCCCTCTTC... | CCTTCCACAGAATGATCTGCTTAGGCCCTCAGAATATTTTCAACCC... | TATGTTCCTCAGTATACATGATTACCCTTCCACAGAATGATCTGCT... | CTCACATTGAATAGAAATAGTGACAACTTCTCAACTGTAGAATAGA... | 47917631 | ... | 3.0 | 7.0 | 3.0 | 7.0 | 22.0 | 2.0 | 4.0 | 3.0 | 3.0 | 241.0 |
34480 | ZNF674 | ZNF674.6 | 1 | 6 | 0 | GTTCACCTGTTGAAGGACATCTAGGTTGTATCCAGTGTTGGGCTAT... | ATTCTTTTTATTGCTGAGTAGCATTCCATGGTATGCATGTTTGTTT... | CAGGTTGTTGTATGTATCAAGCTTGATTCTTTTTATTGCTGAGTAG... | CTTAAGGGATTAGTTCTTTTTCACTCAGCATAATCCTCCAGAGATT... | 46379244 | ... | 0.0 | 2.0 | 3.0 | 8.0 | 3.0 | 0.0 | 1.0 | 0.0 | 2.0 | 75.0 |
34481 | ZNF674 | ZNF674.1 | 6 | 6 | 0 | GGATTATTGATGTGTAAAAATTTTTTTGATTGTAGTCTCCAGAAAT... | CTAGGAAAGAAATACACCAAATTATTAAGTAAATTGGCATTTGAAT... | ATCTGTGTAGTGTGGAGGGAAAAAGCTAGGAAAGAAATACACCAAA... | TAGGTGAGTGTGTGTGTATAGATAAACACATGGAACAAAAAGTTAG... | 46357184 | ... | 9.0 | 34.0 | 12.0 | 46.0 | 58.0 | 1.0 | 7.0 | 4.0 | 17.0 | 654.0 |
34482 | ZNF711 | ZNF711.1 | 1 | 11 | 0 | TATAAAGGTGACTAACAAATTACTTATTGTTTTATCTTATTTAACA... | TGCAGATTTTACTTTATGTGAGAAAATCTACAATTTCTTCGAGACA... | AGAAGGAAAAATAAAAAGAAATTGCTGCAGATTTTACTTTATGTGA... | TTTTTATAATAGGTAAAGAGAGCGTTTTCCCAAAGAAAATAACATA... | 84501071 | ... | 12.0 | 25.0 | 21.0 | 52.0 | 68.0 | 0.0 | 21.0 | 0.0 | 17.0 | 885.0 |
34483 | ZNF711 | ZNF711.8 | 8 | 11 | 0 | TGTTATGTGGGATTATTATTTCTAAATGTTACTCATTGAAATGAGC... | TGTGTTATGTGGCTGTAAATGATGTACACGCTGTAAAATAAGATCG... | AAATTTGGAATATCTACTAAAATTGTGTGTTATGTGGCTGTAAATG... | AATCAGTTCCTTGAGAATAAATTTTTTATCTTTCTTAACTTCAGAA... | 84528343 | ... | 52.0 | 157.0 | 142.0 | 439.0 | 491.0 | 11.0 | 68.0 | 33.0 | 92.0 | 4818.0 |
34484 | ZNF75D | ZNF75D.6 | 4 | 9 | 0 | TAATTGTTAATGAATATTAATTTTGTTAATGAATATATATTAAACC... | TTTTAGCATGTACAGCATGAAAGTTTTATATGTTTATTAATTTTTG... | TAAGGCATCTTTTGTCTGGAATATGTTTTAGCATGTACAGCATGAA... | TTAGGCTAGTTTTTTGGTATACCATTTCTAAACCAATGGTAGGAAC... | 134452509 | ... | 4.0 | 10.0 | 15.0 | 24.0 | 37.0 | 3.0 | 7.0 | 3.0 | 5.0 | 300.0 |
34485 | ZNF75D | ZNF75D.1 | 9 | 9 | 0 | TAAAATTGTACTTCGTAATGAAAATGACACATTTTATCTTAAATTT... | GTTTATAATGTTGGCCAAGGCTTATTTATATATGTTTATTTAGTAT... | GGATTTATATCTATGTCAGATCCTGGTTTATAATGTTGGCCAAGGC... | ACTATCTCTTAAGAGAAGATAATGTGACGTCAAGGGAAGTTGGAAG... | 134419749 | ... | 25.0 | 66.0 | 36.0 | 103.0 | 82.0 | 5.0 | 26.0 | 19.0 | 17.0 | 1500.0 |
34486 | ZNF81 | ZNF81.1 | 1 | 15 | 0 | TCAGGCTGTTTATCTGGCTGTTCATCTGTGTCTTTTGTAATATTCT... | CTCCTAATCCCATGCAGTTTCCTGGGTGATAGGAGCATCTTTTGTT... | TTTGACCCTTGTTCTTGACACAGAGCTCCTAATCCCATGCAGTTTC... | ATGTATGTGGATAGGGGTGCTAGGAACATCTGCTGTTCTAACATTT... | 47720423 | ... | 8.0 | 88.0 | 30.0 | 95.0 | 183.0 | 3.0 | 42.0 | 2.0 | 24.0 | 1172.0 |
34487 | ZNF81 | ZNF81.11 | 11 | 15 | 0 | TTTATTGCATTTCTTCCTCCACTATTCTTCTCTAACAGATGACCAA... | TTGGGCAGCATGATAACAACCCAGCAAAAAGCTAACTGATACATTG... | GTTAACTGTTTTAAGTCACTAAGTTTTGGGCAGCATGATAACAACC... | GCCATCCATCCCAGCCATCCCAAACAACTACCAGAATAATGGATTA... | 47781589 | ... | 1.0 | 9.0 | 5.0 | 14.0 | 12.0 | 1.0 | 4.0 | 1.0 | 6.0 | 168.0 |
34488 | ZNF81 | ZNF81.14 | 14 | 15 | 0 | TCCATTCATGTTTTGATGGAAATTTGTATTTCCAGCTTTTGGCTAT... | TTTTATTGCTAAGTGTTATTTCATTATATGGACATACCAGAATTTG... | GTTATATGCGTCTATGTTCATTCCTTTTTATTGCTAAGTGTTATTT... | TTGTTTGGCTCCTTCCATCCAGCATAATAAGTTTGAGATTCATTCA... | 47784996 | ... | 16.0 | 45.0 | 31.0 | 78.0 | 83.0 | 4.0 | 11.0 | 7.0 | 16.0 | 1011.0 |
34489 | ZRSR2 | ZRSR2.1 | 1 | 4 | 0 | AATACTGTATGACTTTATGTCCTATTTCAAAGCAGATGTATATTCA... | TTTCTTTATGAGTTGCATACTATTATTGAAAAGCAAGAAAATTATG... | TATCTTTGTGCTCCTTAGATCTTTATTTCTTTATGAGTTGCATACT... | GTAATTTTTTTCCTGTTTATTTGAAATGCTTCTTTTAAATCACATA... | 15820049 | ... | 7.0 | 4.0 | 8.0 | 23.0 | 43.0 | 3.0 | 9.0 | 7.0 | 11.0 | 354.0 |
34490 | ZRSR2 | ZRSR2.3 | 3 | 4 | 0 | CAGGAGGAGGTCGGGTAATAGAGACAGAACTGTTCAGAGTCCCAAA... | AGGAGCCGCCGCAGCCGGAGCCAAAGTTCCTCTAGGTCCCGAAGTC... | GAGCCGGAGCCGGAGCCGGAGCCGCAGGAGCCGCCGCAGCCGGAGC... | GGAAGAAATAGGGACCGCAGCAGGGACCGCAGCCGGGGCCGGGGCA... | 15841360 | ... | 52.0 | 142.0 | 105.0 | 210.0 | 329.0 | 26.0 | 78.0 | 20.0 | 65.0 | 3824.0 |
34491 | ZXDA | ZXDA.13 | 1 | 13 | 2 | TGTGACCCTCAGCAAGTCACTTAACCTATCTGAGCCTTAATTTCCT... | ATGAGTTTGGAGATCTAAATTCTGATCTTGAGTCTGGAACTGACAA... | TGCAAAAAGAGCACAGCCCTGGACTATGAGTTTGGAGATCTAAATT... | TTGAGGATATTCTGGACTAAATATTTAAGTGCAGTCATTTCTTTTT... | 57934228 | ... | 4.0 | 13.0 | 9.0 | 17.0 | 13.0 | 0.0 | 0.0 | 0.0 | 7.0 | 325.0 |
34492 | ZXDA | ZXDA.5 | 9 | 13 | 0 | TTGTTTATCTTCAGTCCTTGATTTATTAACATTTTGCCAATTGAAA... | CCTTTTTGTTTAAACAATGGAACATGATTTAATTTTATCTCATTTG... | TTCTGTTAGTGTTGGCATAAGCTTGCCTTTTTGTTTAAACAATGGA... | GACATGGCATGTAATTACAGACAGAAGTGTGACTGTAAACATATTA... | 57931885 | ... | 13.0 | 15.0 | 12.0 | 32.0 | 45.0 | 0.0 | 1.0 | 9.0 | 12.0 | 495.0 |
34493 | ZXDB | ZXDB.2 | 2 | 6 | 3 | TGGTTTGAATAGATTGCTTTTAAGGTCTTTCTGCTCTGTGATTCCT... | CAAGTCAGTTAACCTATCTGAGCCTTAATTTCCTTATTTATAAATT... | AACTGACAAGTTGTGTGACCCTGAGCAAGTCAGTTAACCTATCTGA... | ACAGCCCTGGACTACAAGTTTGGAGATTTAAATTCTGATCTTGAGT... | 57621181 | ... | 1.0 | 4.0 | 7.0 | 9.0 | 10.0 | 1.0 | 2.0 | 3.0 | 7.0 | 242.0 |
34494 | ZXDB | ZXDB.5 | 5 | 6 | 0 | GCAGATGAGTATGTCAAGGATTGAGATGAACACATAAGTCTTGGAA... | TGTTGTCATTCATTAAGGCCTCTTAAATAGACCACTATTTTTTGTG... | GTACTCATATAGCATATTTCAAAAATGTTGTCATTCATTAAGGCCT... | TTGATTGTCTACCCAATCAACAGTTTTCCCTCTTTGCTCTGGAAAT... | 57623885 | ... | 28.0 | 75.0 | 67.0 | 132.0 | 211.0 | 10.0 | 13.0 | 18.0 | 38.0 | 2088.0 |
34495 rows × 46 columns
#Make Valid PAS lookup hierarchy
cano_pas1 = 'AATAAA'
cano_pas2 = 'ATTAAA'

pas_bases = ['A', 'C', 'G', 'T']
pas_len = 6

# valid_pas[level] maps hexamer -> True. Levels, from most to least specific:
#   0: first canonical hexamer
#   1: second canonical hexamer
#   2: four fixed single-base variants of AATAAA
#   3: every hexamer obtained by rewriting one position of AATAAA
#   4: every hexamer obtained by rewriting two positions of AATAAA
# The rewritten base may equal the original one, so levels 3 and 4 also
# contain AATAAA itself (and level 4 contains all of level 3).
valid_pas = [
    {'AATAAA': True},
    {'ATTAAA': True},
    {'AGTAAA': True, 'TATAAA': True, 'CATAAA': True, 'GATAAA': True},
    {
        cano_pas1[:pos] + base + cano_pas1[pos+1:]: True
        for pos in range(pas_len)
        for base in pas_bases
    },
    {
        cano_pas1[:pos1] + base1 + cano_pas1[pos1+1:pos2] + base2 + cano_pas1[pos2+1:]: True
        for pos1 in range(pas_len)
        for pos2 in range(pos1 + 1, pas_len)
        for base1 in pas_bases
        for base2 in pas_bases
    },
]
#Global dataframe generation

def _pas_site_ranks(genes, pas_levels, max_level):
    """Rank, per gene, the sites whose PAS level is within max_level.

    A site qualifies when its level is not -1 and is <= max_level.

    Args:
        genes: Gene name of every site, in dataframe row order.
        pas_levels: Value of the 'pas' column of every site, same order.
        max_level: Largest PAS level that still qualifies.

    Returns:
        A tuple (ranks, totals, totals_by_gene) where ranks[i] is the
        1-based position of site i among the qualifying sites of its gene
        (in row order), or -1 when site i does not qualify; totals[i] is
        the number of qualifying sites of site i's gene; and totals_by_gene
        maps each gene to that number.
    """
    totals_by_gene = {}
    ranks = []
    for gene, level in zip(genes, pas_levels):
        seen = totals_by_gene.setdefault(gene, 0)
        if level != -1 and level <= max_level:
            seen += 1
            totals_by_gene[gene] = seen
            ranks.append(seen)
        else:
            ranks.append(-1)
    totals = [totals_by_gene[gene] for gene in genes]
    return ranks, totals, totals_by_gene


site_genes = df['gene'].tolist()
# NOTE(review): 'pas' is presumably the index into the valid_pas hierarchy,
# with -1 meaning no PAS was found - confirm against the upstream processing.
site_pas = df['pas'].tolist()

# One pass per PAS-level cutoff; ranks follow the current row order of df.
prox_sitenum_pas_4, num_sites_pas_4, gene_dict_pas_4 = _pas_site_ranks(site_genes, site_pas, 4)
prox_sitenum_pas_3, num_sites_pas_3, gene_dict_pas_3 = _pas_site_ranks(site_genes, site_pas, 3)
prox_sitenum_pas_2, num_sites_pas_2, gene_dict_pas_2 = _pas_site_ranks(site_genes, site_pas, 2)
prox_sitenum_pas_1, num_sites_pas_1, gene_dict_pas_1 = _pas_site_ranks(site_genes, site_pas, 1)

df['sitenum_pas_4'] = prox_sitenum_pas_4
df['sitenum_pas_3'] = prox_sitenum_pas_3
df['sitenum_pas_2'] = prox_sitenum_pas_2
df['sitenum_pas_1'] = prox_sitenum_pas_1

df['num_sites_pas_4'] = num_sites_pas_4
df['num_sites_pas_3'] = num_sites_pas_3
df['num_sites_pas_2'] = num_sites_pas_2
df['num_sites_pas_1'] = num_sites_pas_1

'''
df = df[['gene',
'gene_id',
'sitenum',
'num_sites',
'sitenum_pas_4',
'num_sites_pas_4',
'sitenum_pas_3',
'num_sites_pas_3',
'sitenum_pas_2',
'num_sites_pas_2',
'sitenum_pas_1',
'num_sites_pas_1',
'pas',
'seq',
'pas_pos',
'cut_mode',
'chrom',
'strand']]
'''

# The ranks computed above reflect the row order df had before this sort.
df = df.sort_values(by=['chrom', 'gene', 'sitenum']).copy().reset_index(drop=True)

print(df.head())
print(df.tail())
print('Total number of members: ' + str(len(df)))
gene gene_id sitenum num_sites pas \ 0 ABCB10 ABCB10.6 2 7 0 1 ABCB10 ABCB10.3 5 7 0 2 ABCD3 ABCD3.2 2 15 0 3 ABCD3 ABCD3.3 3 15 0 4 ABCD3 ABCD3.14 14 15 0 seq \ 0 GTTAAAGATTGAAGCTATTGTCAAATGACAACTTTAAAAAGGCAAT... 1 TCTGATACATGATGTTCAATTTTATCTTTAGGTAATATTTTATATC... 2 TTCGAGACAAGCCTGGACAAAAAGCGAGACCCGCTTCTTTAAAAAA... 3 GTATGGTTGTTTTACATATGTGTATGTGTGTATATGCATTTCAGTT... 4 GCCTTGACTTGAAAACATAGATAGTTTAATCTTGACTTGAAAAACA... seq_ext \ 0 TGGAATATTTTAATTAATATAGCATGGCACCTCATTTTCTTTTGCC... 1 TGTGCCATAGAAGTATTTACGAAATTGCATTTCATTGTTATGTTTT... 2 ATCCCAGCACTTTGGGAGGCTGAGATGGGAGGATCGCTTGAATCCA... 3 ATTTAATATGTAATGTTATTGTTACATATTTATAACACAGCCATAT... 4 TGTTCTTTTTATTCTGGTATCTAAATACTGAGAAGTTCATTTATAA... wide_seq \ 0 TCAGGTTTTGTATTTTCTTTTCTTGTGGAATATTTTAATTAATATA... 1 GGTTGAATCTGAGGAAAATAATCCTTGTGCCATAGAAGTATTTACG... 2 CCAGGCGTGGTGGCTCATGCCTGTAATCCCAGCACTTTGGGAGGCT... 3 ATCTTATTGAAATGTAACTTTAGTCATTTAATATGTAATGTTATTG... 4 TCAGATATCCTATACAACCTTTGCTTGTTCTTTTTATTCTGGTATC... wide_seq_ext pas_pos \ 0 GAGTTTTAATAATTGTAACTTTTTAAATGTCTATAGCACTGAAGTT... 229653571 1 TGTGTTCTTTATAAAGTGTGATTTTCAGAAAGCAAACAACACAATT... 229652351 2 TAGGTACTTGGAAAAATTTTGTGGCATTAAAAACCAGACAAATGTA... 94944251 3 GTCTCTGATATTTGTGATGGCAAGAATCACTTTTAAGTTTTCTTTG... 94944623 4 CGAACTTGTATACTTATTTTCTGTTCAGATTAAAAAAAAAAAAAAA... 94984196 ... THOC5 total_count sitenum_pas_4 sitenum_pas_3 \ 0 ... 17.0 576.0 1 1 1 ... 125.0 8657.0 2 2 2 ... 7.0 1291.0 1 1 3 ... 1.0 247.0 2 2 4 ... 133.0 11240.0 3 3 sitenum_pas_2 sitenum_pas_1 num_sites_pas_4 num_sites_pas_3 \ 0 1 1 2 2 1 2 2 2 2 2 1 1 4 4 3 2 2 4 4 4 3 3 4 4 num_sites_pas_2 num_sites_pas_1 0 2 2 1 2 2 2 4 4 3 4 4 4 4 4 [5 rows x 54 columns] gene gene_id sitenum num_sites pas \ 34490 ZRSR2 ZRSR2.3 3 4 0 34491 ZXDA ZXDA.13 1 13 2 34492 ZXDA ZXDA.5 9 13 0 34493 ZXDB ZXDB.2 2 6 3 34494 ZXDB ZXDB.5 5 6 0 seq \ 34490 CAGGAGGAGGTCGGGTAATAGAGACAGAACTGTTCAGAGTCCCAAA... 34491 TGTGACCCTCAGCAAGTCACTTAACCTATCTGAGCCTTAATTTCCT... 34492 TTGTTTATCTTCAGTCCTTGATTTATTAACATTTTGCCAATTGAAA... 
34493 TGGTTTGAATAGATTGCTTTTAAGGTCTTTCTGCTCTGTGATTCCT... 34494 GCAGATGAGTATGTCAAGGATTGAGATGAACACATAAGTCTTGGAA... seq_ext \ 34490 AGGAGCCGCCGCAGCCGGAGCCAAAGTTCCTCTAGGTCCCGAAGTC... 34491 ATGAGTTTGGAGATCTAAATTCTGATCTTGAGTCTGGAACTGACAA... 34492 CCTTTTTGTTTAAACAATGGAACATGATTTAATTTTATCTCATTTG... 34493 CAAGTCAGTTAACCTATCTGAGCCTTAATTTCCTTATTTATAAATT... 34494 TGTTGTCATTCATTAAGGCCTCTTAAATAGACCACTATTTTTTGTG... wide_seq \ 34490 GAGCCGGAGCCGGAGCCGGAGCCGCAGGAGCCGCCGCAGCCGGAGC... 34491 TGCAAAAAGAGCACAGCCCTGGACTATGAGTTTGGAGATCTAAATT... 34492 TTCTGTTAGTGTTGGCATAAGCTTGCCTTTTTGTTTAAACAATGGA... 34493 AACTGACAAGTTGTGTGACCCTGAGCAAGTCAGTTAACCTATCTGA... 34494 GTACTCATATAGCATATTTCAAAAATGTTGTCATTCATTAAGGCCT... wide_seq_ext pas_pos \ 34490 GGAAGAAATAGGGACCGCAGCAGGGACCGCAGCCGGGGCCGGGGCA... 15841360 34491 TTGAGGATATTCTGGACTAAATATTTAAGTGCAGTCATTTCTTTTT... 57934228 34492 GACATGGCATGTAATTACAGACAGAAGTGTGACTGTAAACATATTA... 57931885 34493 ACAGCCCTGGACTACAAGTTTGGAGATTTAAATTCTGATCTTGAGT... 57621181 34494 TTGATTGTCTACCCAATCAACAGTTTTCCCTCTTTGCTCTGGAAAT... 57623885 ... THOC5 total_count sitenum_pas_4 sitenum_pas_3 \ 34490 ... 65.0 3824.0 2 2 34491 ... 7.0 325.0 1 1 34492 ... 12.0 495.0 2 2 34493 ... 7.0 242.0 1 1 34494 ... 38.0 2088.0 2 2 sitenum_pas_2 sitenum_pas_1 num_sites_pas_4 num_sites_pas_3 \ 34490 2 2 2 2 34491 1 -1 2 2 34492 2 1 2 2 34493 -1 -1 2 2 34494 1 1 2 2 num_sites_pas_2 num_sites_pas_1 34490 2 2 34491 2 1 34492 2 1 34493 1 1 34494 1 1 [5 rows x 54 columns] Total number of members: 34495
# Notebook display of the dataframe after adding the per-PAS-level site ranks/counts and sorting.
df
gene | gene_id | sitenum | num_sites | pas | seq | seq_ext | wide_seq | wide_seq_ext | pas_pos | ... | THOC5 | total_count | sitenum_pas_4 | sitenum_pas_3 | sitenum_pas_2 | sitenum_pas_1 | num_sites_pas_4 | num_sites_pas_3 | num_sites_pas_2 | num_sites_pas_1 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | ABCB10 | ABCB10.6 | 2 | 7 | 0 | GTTAAAGATTGAAGCTATTGTCAAATGACAACTTTAAAAAGGCAAT... | TGGAATATTTTAATTAATATAGCATGGCACCTCATTTTCTTTTGCC... | TCAGGTTTTGTATTTTCTTTTCTTGTGGAATATTTTAATTAATATA... | GAGTTTTAATAATTGTAACTTTTTAAATGTCTATAGCACTGAAGTT... | 229653571 | ... | 17.0 | 576.0 | 1 | 1 | 1 | 1 | 2 | 2 | 2 | 2 |
1 | ABCB10 | ABCB10.3 | 5 | 7 | 0 | TCTGATACATGATGTTCAATTTTATCTTTAGGTAATATTTTATATC... | TGTGCCATAGAAGTATTTACGAAATTGCATTTCATTGTTATGTTTT... | GGTTGAATCTGAGGAAAATAATCCTTGTGCCATAGAAGTATTTACG... | TGTGTTCTTTATAAAGTGTGATTTTCAGAAAGCAAACAACACAATT... | 229652351 | ... | 125.0 | 8657.0 | 2 | 2 | 2 | 2 | 2 | 2 | 2 | 2 |
2 | ABCD3 | ABCD3.2 | 2 | 15 | 0 | TTCGAGACAAGCCTGGACAAAAAGCGAGACCCGCTTCTTTAAAAAA... | ATCCCAGCACTTTGGGAGGCTGAGATGGGAGGATCGCTTGAATCCA... | CCAGGCGTGGTGGCTCATGCCTGTAATCCCAGCACTTTGGGAGGCT... | TAGGTACTTGGAAAAATTTTGTGGCATTAAAAACCAGACAAATGTA... | 94944251 | ... | 7.0 | 1291.0 | 1 | 1 | 1 | 1 | 4 | 4 | 4 | 4 |
3 | ABCD3 | ABCD3.3 | 3 | 15 | 0 | GTATGGTTGTTTTACATATGTGTATGTGTGTATATGCATTTCAGTT... | ATTTAATATGTAATGTTATTGTTACATATTTATAACACAGCCATAT... | ATCTTATTGAAATGTAACTTTAGTCATTTAATATGTAATGTTATTG... | GTCTCTGATATTTGTGATGGCAAGAATCACTTTTAAGTTTTCTTTG... | 94944623 | ... | 1.0 | 247.0 | 2 | 2 | 2 | 2 | 4 | 4 | 4 | 4 |
4 | ABCD3 | ABCD3.14 | 14 | 15 | 0 | GCCTTGACTTGAAAACATAGATAGTTTAATCTTGACTTGAAAAACA... | TGTTCTTTTTATTCTGGTATCTAAATACTGAGAAGTTCATTTATAA... | TCAGATATCCTATACAACCTTTGCTTGTTCTTTTTATTCTGGTATC... | CGAACTTGTATACTTATTTTCTGTTCAGATTAAAAAAAAAAAAAAA... | 94984196 | ... | 133.0 | 11240.0 | 3 | 3 | 3 | 3 | 4 | 4 | 4 | 4 |
5 | ABCD3 | ABCD3.15 | 15 | 15 | 0 | ATGAGAAAATAAGTATGAAACAGCAATGGTAGTTTGTTTTGCATTA... | TGCCAAGACATATCACCGTGTTCTCATAATAAGTTTTTACTTTTTA... | TTTCATCCATGAGCACCACGCTGCATGCCAAGACATATCACCGTGT... | GCAGTGGGAAATGGTAGTTTAATCCGAAGAATAAACCAAAGAATAA... | 94984889 | ... | 4.0 | 329.0 | 4 | 4 | 4 | 4 | 4 | 4 | 4 | 4 |
6 | ABL2 | ABL2.17 | 8 | 24 | 3 | CTGAGGGGAGAGGGAAAAGGACTTGTTTTCCTGTGTTCTTGTTTTC... | TGTGGTGCAGAGGTAGCCACTGTTAGCCTGGTGGGAAAATGCACAC... | TGTCATGTGTACAGGAAATCAGTGATGTGGTGCAGAGGTAGCCACT... | TTCAGCAGCTGCTGGTGTGCCCGGGACAAACCCTGTCCTTAATAAC... | 179076768 | ... | 16.0 | 485.0 | 1 | 1 | -1 | -1 | 3 | 3 | 2 | 1 |
7 | ABL2 | ABL2.16 | 9 | 24 | 2 | CCACAAGGCCATTGCTGCTGTAATAAGAACTGCAAATCAGAGTGCT... | CAAGAGAAATTTTTGTTCAGGGCTGTTGGAAGTAGCTGTTAGCCTT... | GCAGAAAAGAAAGCTGGGAATGTACCAAGAGAAATTTTTGTTCAGG... | GGTACTAATGGTGATTATGCTCCAATTTACCTAATGAATTTGGTGG... | 179076299 | ... | 1.0 | 82.0 | 2 | 2 | 1 | -1 | 3 | 3 | 2 | 1 |
8 | ABL2 | ABL2.8 | 17 | 24 | 0 | ACTGCTTTCTCTGTCTTCTCACAAGGTTTGCCAAGTTGTGTTCTGT... | ACTGCTAACAGTGTTAAACTTGATGTAAATAAATGAGGCCCTTGAA... | CTCTCCGTCTGTTGTCTGACTGTGAACTGCTAACAGTGTTAAACTT... | GTTCTTAATTGTTATTGTAATATATTTTCAGTTGTTTTTCTAATTT... | 179068493 | ... | 76.0 | 3961.0 | 3 | 3 | 2 | 1 | 3 | 3 | 2 | 1 |
9 | ACADM | ACADM.4 | 4 | 19 | 0 | TAAACTTATACATATGAAGCTTTATATGTTTTGTTTGGAATATGTT... | CCAGGATTAGGATTTAGTTTTGGTATATGTTCGGTTCTATCTTTTG... | TACAAAAGCCAATCGACAACGTGAACCAGGATTAGGATTTAGTTTT... | TCTTTACAGGTCCTGAGAAGTATTTCTCGTTTTCATTGGAGATCAC... | 76194251 | ... | 15.0 | 1167.0 | 1 | 1 | 1 | 1 | 2 | 2 | 2 | 2 |
10 | ACADM | ACADM.14 | 14 | 19 | 0 | TCTATTGTACACAATCTCATTTCATATGTTTGCATTTTGGCAAAGA... | CTTGCCTTAAATTATTTTTATATGACTGTTGGTCTCTAGGTAGCCT... | CCTTATTTAAAATAAATCAATAAAGCTTGCCTTAAATTATTTTTAT... | CAAGAACTTTCTTGAAAATCTTATTTAATTCTGAGCCCATATTTCA... | 76229155 | ... | 330.0 | 20840.0 | 2 | 2 | 2 | 2 | 2 | 2 | 2 | 2 |
11 | ACAP3 | ACAP3.3 | 3 | 5 | 2 | CCGGCCTCCTCCGGAGGCACCTTCTCCTGGTACTCGGCCCAGAGCC... | CCACGTGGCTGGCCACGAAGGTCCCCGTGCCAGACAGCCCCAGCCG... | CTGGCTGGACGCGGGCGTCCCAAGGCCACGTGGCTGGCCACGAAGG... | CTGGCGTCGCGGGTGCTGGGCGGGAGGGGCTCTGGCCTGGGTCCTC... | 1228274 | ... | 42.0 | 2090.0 | 1 | 1 | 1 | -1 | 2 | 2 | 2 | 1 |
12 | ACAP3 | ACAP3.2 | 4 | 5 | 0 | TCTTGCCCCAGGCCCCTGCTGGCGGGTCTCACCCCCCACCCCTCGC... | AAGAACAGAATTGATTCTTGCCCCTCTCCCTGTGTGAGCTTGGCCC... | CTCTTGCCTGCTGCCTGTGACCCTGAAGAACAGAATTGATTCTTGC... | TGGGGAGGCTCCCTGAGGGCACAGTGGGCGCTGGACCCGGCCCCCC... | 1227789 | ... | 40.0 | 2661.0 | 2 | 2 | 2 | 1 | 2 | 2 | 2 | 1 |
13 | ACBD3 | ACBD3.18 | 5 | 22 | 2 | ATGTGTGTAGAGATATGCCAAAATATATCATTATCCCTACCCCATG... | GCATTGCTATTCATTTAAGTAATTGGCTTGTAAATGATACATTCAA... | TAACCTATATTAGCAGCAAAGAGAAGCATTGCTATTCATTTAAGTA... | TGAAAATCCAGACCTTGGAAGAAGATATCTTAAAATCTTAATTGTG... | 226344254 | ... | 0.0 | 147.0 | 1 | 1 | 1 | -1 | 5 | 4 | 4 | 3 |
14 | ACBD3 | ACBD3.17 | 6 | 22 | 4 | CACTGCTGTCAGCGTGCATGTCAGTGAGTCCAGCGATGACGACGAG... | AATTATGACATTGGGTTTGGGGTGTATTTTGAATGGACAGACTCTC... | TCTCTTTTGGGAATTTGCCACAGACAATTATGACATTGGGTTTGGG... | CGAGGAGAAGTGGTCACTGTTCGAGTACCCACCCATGAAGAAGGAT... | 226340041 | ... | 31.0 | 3479.0 | 2 | -1 | -1 | -1 | 5 | 4 | 4 | 3 |
15 | ACBD3 | ACBD3.11 | 12 | 22 | 0 | GTGGATGCTGAAGTTACATGAGCTACATGTTAAATATTTAAAGTCT... | AGATTCCTCAGACTCATCCAGCCCTTGGGTGCTGACCAGCAGAGTC... | AGCATTCATACTTTGGGGTTAAAGGAGATTCCTCAGACTCATCCAG... | TGATGGTTTGTGAACTCTTGCTGGGAATCAAAATTTCCTTGAGACT... | 226334019 | ... | 40.0 | 1490.0 | 3 | 2 | 2 | 1 | 5 | 4 | 4 | 3 |
16 | ACBD3 | ACBD3.10 | 13 | 22 | 0 | TTTGTTTTGGCTTCATAGAGTATCTCAAATTGAAACTTTTCTGCAC... | TGGTATTCATACTACTAGTAGCAAAATACAGGTTTTTTGTTTTGTT... | AACTTTGAATCCTTGTATCTTTATTTGGTATTCATACTACTAGTAG... | TATCAAGATACGTAGAACACCTCAGAGATTTTTCTTCAGGAACTTC... | 226333494 | ... | 23.0 | 1392.0 | 4 | 3 | 3 | 2 | 5 | 4 | 4 | 3 |
17 | ACBD3 | ACBD3.2 | 21 | 22 | 0 | ACAGTACAAGTGCGATTTCAAAAAGATCTTGAAAGTAATATATTTA... | AGAATATTTTTGGTTTTAAACTTTCTTATTGCCTTTGGCTGTTGAT... | GCGGTTCCTGTCATGTGTTCATGTCAGAATATTTTTGGTTTTAAAC... | CCTAAAAATATCATTGTTCTTGGGAGCAGTGTATGTTACTTTACAT... | 226332399 | ... | 114.0 | 8577.0 | 5 | 4 | 4 | 3 | 5 | 4 | 4 | 3 |
18 | ACBD6 | ACBD6.12 | 1 | 12 | 2 | ATAATGTGATGCAATGCAAAATGCAGTATTACAACTTTTATTGTGA... | GCCTGGGCAACAAGAGCGAAGAAACTCCACCTCAAATAATAATAAT... | GAAGATTGTGCGCCATTGCACTCCAGCCTGGGCAACAAGAGCGAAG... | CTGTAATCCCAGCTACTCGGGAGGCTGAGGAGGCGGAGGTTGCGGT... | 180470430 | ... | 2.0 | 248.0 | 1 | 1 | 1 | -1 | 3 | 3 | 3 | 2 |
19 | ACBD6 | ACBD6.8 | 5 | 12 | 0 | TTTTTATCTTTGTAGATATTGGCTTACTTTCTTTGGATTTTCATTC... | ATGTACCCTAGAGCCCACAAATTCCTTAGTTTGAGGAATTATGGCA... | CCTGCATATCTCTTTAAAGGAGTTTATGTACCCTAGAGCCCACAAA... | TTATCTAGAAGATGAAGAGCTGGAAATTGAGTCACTTAAAATGATC... | 180386169 | ... | 4.0 | 139.0 | 2 | 2 | 2 | 1 | 3 | 3 | 3 | 2 |
20 | ACBD6 | ACBD6.6 | 7 | 12 | 0 | AACTACAAAAATAATACTTCTTTTCCACCCGTCTTTGGTATGTATT... | GACTGGAAAACTGCAGTCTGTAATAGCATAAGGCTTCCATTATGAA... | CACAACTGGCAAGGCTTAATCAAAAGACTGGAAAACTGCAGTCTGT... | CCAGAGGAGGTGACAGGCTGCAAAACAGTTTCTTTGGTGCTGCAGC... | 180257391 | ... | 267.0 | 20294.0 | 3 | 3 | 3 | 2 | 3 | 3 | 3 | 2 |
21 | ACOT11 | ACOT11.3 | 3 | 9 | 2 | TCTCTGGAATCTGTCAACCCAGTTTTGGGCTCCAGGTGGATGGGTT... | TCCCTTGTTAAAGGGGCAGTGGGAGTTATGGGGTCATCAAGGACCT... | TTGGGTTATCATAAGGTGTTAAGAGTCCCTTGTTAAAGGGGCAGTG... | TCATGCCTTCTGTGTCTGGAAGAGGCGGCAGAGGCAACAGTGTTTA... | 55075774 | ... | 1.0 | 192.0 | 1 | 1 | 1 | -1 | 1 | 1 | 1 | 0 |
22 | ACOT7 | ACOT7.13 | 6 | 18 | 0 | GGAGCTTTTCCTCCGGCTGAGATATATATATAGAATACATTTTTAG... | TAGAGGGAAGGAACCAGGATTCTGAGTCTCTGCAGTGTGGGCCCCG... | CACAGCCCCCAGGAAGAGCTCTTCTTAGAGGGAAGGAACCAGGATT... | GGACAGATGTGACCAGCTCCTCCCGCAGCCTGACCCTGCGGTCCAC... | 6392222 | ... | 1.0 | 87.0 | 1 | 1 | 1 | 1 | 3 | 3 | 3 | 3 |
23 | ACOT7 | ACOT7.8 | 11 | 18 | 0 | TCCCCTGGGTGTGGCCGCGTCATGTTGATCTACGCTGTTATTTCTC... | GCAGAGGGGCAGGGAAGGGCGCCAGCTTGAATGGAGGGAATGGCAG... | AGGGCCAGCCCTGGGGGAAGGGGCTGCAGAGGGGCAGGGAAGGGCG... | CCCCTTCCAGAACTCCATACCCTGGGGGGGGGTGGTCAGAGTGCAG... | 6336907 | ... | 0.0 | 84.0 | 2 | 2 | 2 | 2 | 3 | 3 | 3 | 3 |
24 | ACOT7 | ACOT7.1 | 18 | 18 | 0 | GGGAATGCTTCCGAGCACGCTGTAGGGTATGGGAAGAACCCAGCAC... | TGCTACACAGTGTTGTCCCGAGCGCCGGGAGGCGTTGGGCAGAAAC... | TTTATTTATATCATTCCAGTATCAATGCTACACAGTGTTGTCCCGA... | GTATCACAGTGTTAACCTGTACTCTCTCCTGCAAACCTACACACCA... | 6324354 | ... | 583.0 | 37630.0 | 3 | 3 | 3 | 3 | 3 | 3 | 3 | 3 |
25 | ACTA1 | ACTA1.1 | 1 | 1 | 0 | TATTTTTCGAAACAAAGCCCTGTGGAAGAAAATGGAAAACTTGAAG... | GACACAGTGTTTATAACGTGTACATACATTAACTTATTACCTCATT... | ACTTCCGTTGCTGCCATCGTAAACTGACACAGTGTTTATAACGTGT... | TGGGGGGGCGGCTGAGCTCCAGCCACCCCGCAGTCACTTTCTTTGT... | 229567020 | ... | 7.0 | 731.0 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 |
26 | ACTL8 | ACTL8.1 | 1 | 1 | 0 | AGCCTGGGATGCCCTTGCCACCCGTGGTTGGATCTTGTTTTATATC... | ATTTCTGGTCCTACAGGCCCTTTCTGGCCAGGGAGGCATTGCTGCA... | GACTAGGGGATGGGGGACAGTTGACATTTCTGGTCCTACAGGCCCT... | AGTAGGTTTTAACTGGGGTAGCACTCCTGCTAGGAGTCCCAATTAT... | 18153534 | ... | 2.0 | 169.0 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 |
27 | ACTN2 | ACTN2.4 | 4 | 8 | 0 | GAGCGATCTGTGATGCTGAGCTTCTGTAATCACTCATCCCATCAGA... | GTGCCTGGTGCACTGGATTACGCTGCGTTCTCTTCCGCACTCTACG... | GCCCGCCTACTCGGGCCCAGGCAGTGTGCCTGGTGCACTGGATTAC... | CTGCGTCGGGAGCTGCCCCCGGATCAGGCCCAGTACTGCATCAAGA... | 236925956 | ... | 7.0 | 380.0 | 1 | 1 | 1 | 1 | 2 | 2 | 2 | 2 |
28 | ACTN2 | ACTN2.5 | 5 | 8 | 0 | TACAAAATACCCAAGATTTAAGACCGGGGGGAAAAAACCACAAATT... | TAGGAAATTAGGAGGATCTAGGGACAGAAGGAAAGTGAAAAATGTG... | TAAACAGAACAAATTACTTGAGTAATAGGAAATTAGGAGGATCTAG... | TTCTGAGTTTTTAGCAAAATGTAATGAAATATCAGGTTGATTTCTT... | 236926307 | ... | 11.0 | 390.0 | 2 | 2 | 2 | 2 | 2 | 2 | 2 | 2 |
29 | ADAM15 | ADAM15.6 | 6 | 7 | 0 | GGTTGGACGGGATTGAGGAAGGTCCGCACAGCCTGTCTCTGCTCAG... | TCTGCGGACCTGCCGGCGTAGTTGCAGCGGGGGCTTGGGGAGGGGC... | ACCGCCACGCGCTGTCAAGCAACACTCTGCGGACCTGCCGGCGTAG... | CTACCATGACTGAAGGCGCCAGAGACTGGCGGTGTCTTAAGACTCC... | 155035225 | ... | 110.0 | 8455.0 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
34465 | ZNF182 | ZNF182.2 | 1 | 2 | 0 | GAAATAATATGGTAGATAATTTAGACTTATTTAGTAGAAGTTCTGC... | TACAAACCTTGTTGCTTCAATACAAAGACCCGATAAACACGAATCA... | AGTTTGGAAACATACTTCATCTGAGTACAAACCTTGTTGCTTCAAT... | ATTTTCTGACAAGAAAACAATTATCACCAAGAGTGCTCGTGACTGT... | 47836945 | ... | 5.0 | 380.0 | 1 | 1 | 1 | 1 | 2 | 2 | 2 | 2 |
34466 | ZNF182 | ZNF182.1 | 2 | 2 | 0 | TTGAAAATGTGGGTGTACTATGTACTATGTGGATGTACTACCTTTT... | GGCTGGGGGGATGGAATTGAGAGGGAGACAACTGTATCCTTTCATA... | CCTTGGGAGGGGAAGGGGCAGTGGTGGCTGGGGGGATGGAATTGAG... | TTCATAAGTTATCTCTGGAAAGAGACATGAGAACCTGGTAACCCTG... | 47834270 | ... | 38.0 | 1251.0 | 2 | 2 | 2 | 2 | 2 | 2 | 2 | 2 |
34467 | ZNF185 | ZNF185.5 | 5 | 6 | 0 | TTGCTTAGTGTTTCTAATCATACTTAATCCACACTAATGTGCGCAA... | TTTTGCAGATCTGAGGAAGAGGGATGCATTACCTTTTTGCTTCTTT... | CTCTCATGTCTAAAAAGGCACAGAATTTTGCAGATCTGAGGAAGAG... | TTGAAAGAAATCTTGCAAGAGCCATTATTGACTTAGATCCAAAACA... | 152141997 | ... | 18.0 | 576.0 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 |
34468 | ZNF275 | ZNF275.4 | 4 | 6 | 2 | ATTTTATGTCTACGTATATTGTTCCTTTACTGAACCCACCACATGC... | TAAGATGGGTGAAAGTCGATGCCTTCTAGTCTCAGTGAATTTAACC... | GTTCAAACGTGTGTTCTCTGTTCTCTAAGATGGGTGAAAGTCGATG... | TGCCACTTGGCTGCTTCCTGGCCAAGTCGCACCTGACTGCATGAAC... | 152617885 | ... | 3.0 | 291.0 | 1 | 1 | 1 | -1 | 2 | 2 | 2 | 1 |
34469 | ZNF275 | ZNF275.6 | 6 | 6 | 0 | GTCTGATCCCCTACCAAATCTAGCACAGTGCCTTGCATCAAGTAGA... | CCCCCACCCATTAAATTGTGAGCTCTTAGAAGACAGGGGTGGCCTT... | GCTTCACCCCCTCCTCCTCAGCCCTCCCCCACCCATTAAATTGTGA... | TTCTGGCACTCACTATAATCAGCCTTGCACTAGAGCTGTTTGTGGA... | 152618362 | ... | 66.0 | 3151.0 | 2 | 2 | 2 | 1 | 2 | 2 | 2 | 1 |
34470 | ZNF280C | ZNF280C.3 | 4 | 6 | 0 | TTGGTTACATTGAATACAGATTTGCTGAACAGTTTTGATGTTATTT... | GCTCATATATATATAGATGTATATTTTTTTTAATTTCTTGTTTGTT... | GGACCAAAAAATGCACTTGTTTCTTGCTCATATATATATAGATGTA... | GCCATTAATGTAATTCCTCTGGATAAAGATAATATATTCAAAAAAT... | 129336743 | ... | 49.0 | 2016.0 | 1 | 1 | 1 | 1 | 2 | 2 | 2 | 2 |
34471 | ZNF280C | ZNF280C.1 | 6 | 6 | 0 | TGATTCAGAAATAGCCATGTCATGCATGTGTCCTTTTTTGTTTTCA... | AGCTTGACAATCTGATCCCTCTTCACCTTCAGACTGTTAGTTATTT... | GTTTTAATAGGTGTTAGTGAGTTTTAGCTTGACAATCTGATCCCTC... | TAAAACTATTTATGCAGTCAGTACTAAGCTTACTTGTTATAAGCAG... | 129335349 | ... | 3.0 | 290.0 | 2 | 2 | 2 | 2 | 2 | 2 | 2 | 2 |
34472 | ZNF41 | ZNF41.22 | 1 | 22 | 0 | TAGCTAATTTTCCAGCACCAGTCGTTAAATGAATACCCTTTTCTAA... | TATTTTGGTGTATGGTATGAGATGGAATCAGTTCGTATATTCCATT... | GTTAAATTTTAATCTAACTGGAATTTATTTTGGTGTATGGTATGAG... | ACAAATACGCCTTAAATTTATTTTTAATATATTTATTTCAATTTTT... | 47339710 | ... | 6.0 | 101.0 | 1 | 1 | 1 | 1 | 4 | 3 | 2 | 1 |
34473 | ZNF41 | ZNF41.16 | 7 | 22 | 2 | CCTAGCTCTAGGCTATGTTACAGAAATATAGTCATTGAATGATACA... | AATGGAATTTTTAAATTTAAAGATATAACTTTATGAATTGAGAAAT... | CTCATGGTATATATATTTTAAGTGCAATGGAATTTTTAAATTTAAA... | ACATAGAAAAGATTTCCATGAAAAACTTTTTTCTTTTCCCTTGGGA... | 47306101 | ... | 15.0 | 331.0 | 2 | 2 | 2 | -1 | 4 | 3 | 2 | 1 |
34474 | ZNF41 | ZNF41.12 | 11 | 22 | 3 | GATAATTCATTTTCATTGTCATGTTGTATCCCATTCTGTGAATATG... | TCATTCACATTATGTATGTGAGAGTCATCCATATGTTGCATATAGT... | TATACTCTTTAGCATCTGTCTTCTGTCATTCACATTATGTATGTGA... | TTTTGTATTTTATATAAATTGAGTCAATTATATATCATATAATTGA... | 47305154 | ... | 1.0 | 68.0 | 3 | 3 | -1 | -1 | 4 | 3 | 2 | 1 |
34475 | ZNF41 | ZNF41.2 | 21 | 22 | 4 | TTTAAGTTTTACATCTAGATCTAGCATGTATTTTGAGTTATATGGT... | TTGCCTAACCAAAGATTACAATGATTTTTTCCTGTGTTTTCTTCTA... | TTTGAGTGCTGTATCTAAGAAATCCTTGCCTAACCAAAGATTACAA... | CTTATTTTGATGAAGTCCGTTCTTTCCATTTGTTCATTTCTGGGTT... | 47304199 | ... | 10.0 | 499.0 | 4 | -1 | -1 | -1 | 4 | 3 | 2 | 1 |
34476 | ZNF449 | ZNF449.3 | 3 | 11 | 0 | TCTATCAGACGTATTGATTATAGCAGTACTATAGTTATTCTGCTGT... | AAATTATGACAATCCTTTTAGAGGTAGGGTCAATATAGTGGATAAA... | TCATAGGTGTAAACATAAAGCATATAAATTATGACAATCCTTTTAG... | TCTTCTTTTTTTAATTACAATGAAAAATTTTGTGTTCCAAGGCAAC... | 134495399 | ... | 4.0 | 192.0 | 1 | 1 | 1 | 1 | 3 | 3 | 3 | 3 |
34477 | ZNF449 | ZNF449.6 | 6 | 11 | 0 | AGTGCCTTAGAATGGATGTGCCCAACTGCTCTGTATTTATGCAATA... | ATGTAATGGCTTCTCTTTTCTCTCTTGTGGAATTGCATTCAAACCA... | TGAATGTAGAGATGAAAAATACAGAATGTAATGGCTTCTCTTTTCT... | GGCTCAAATTGATATCCCAGTAGCAATAAACATATAATATAGGAGG... | 134497046 | ... | 4.0 | 571.0 | 2 | 2 | 2 | 2 | 3 | 3 | 3 | 3 |
34478 | ZNF449 | ZNF449.8 | 8 | 11 | 0 | AAGATTAAATGAAATATATTTTGCTCTGGCCCTACACACTGTAAGC... | CCTAGCTATTATAAAGGGGAAATTACAGTACCTACCTCAAAAGTAC... | TAAGGGCAAAATACAGTACCTACCTCCTAGCTATTATAAAGGGGAA... | GGATAATTATACTTCTATGTCTAATTGTACTTCTGAGCATTTCAAA... | 134497629 | ... | 7.0 | 171.0 | 3 | 3 | 3 | 3 | 3 | 3 | 3 | 3 |
34479 | ZNF630 | ZNF630.4 | 2 | 5 | 0 | AATATGGAATCAATTTGCTCACCCTCAAACTGTCTCAGCCCTCTTC... | CCTTCCACAGAATGATCTGCTTAGGCCCTCAGAATATTTTCAACCC... | TATGTTCCTCAGTATACATGATTACCCTTCCACAGAATGATCTGCT... | CTCACATTGAATAGAAATAGTGACAACTTCTCAACTGTAGAATAGA... | 47917631 | ... | 3.0 | 241.0 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 |
34480 | ZNF674 | ZNF674.6 | 1 | 6 | 0 | GTTCACCTGTTGAAGGACATCTAGGTTGTATCCAGTGTTGGGCTAT... | ATTCTTTTTATTGCTGAGTAGCATTCCATGGTATGCATGTTTGTTT... | CAGGTTGTTGTATGTATCAAGCTTGATTCTTTTTATTGCTGAGTAG... | CTTAAGGGATTAGTTCTTTTTCACTCAGCATAATCCTCCAGAGATT... | 46379244 | ... | 2.0 | 75.0 | 1 | 1 | 1 | 1 | 2 | 2 | 2 | 2 |
34481 | ZNF674 | ZNF674.1 | 6 | 6 | 0 | GGATTATTGATGTGTAAAAATTTTTTTGATTGTAGTCTCCAGAAAT... | CTAGGAAAGAAATACACCAAATTATTAAGTAAATTGGCATTTGAAT... | ATCTGTGTAGTGTGGAGGGAAAAAGCTAGGAAAGAAATACACCAAA... | TAGGTGAGTGTGTGTGTATAGATAAACACATGGAACAAAAAGTTAG... | 46357184 | ... | 17.0 | 654.0 | 2 | 2 | 2 | 2 | 2 | 2 | 2 | 2 |
34482 | ZNF711 | ZNF711.1 | 1 | 11 | 0 | TATAAAGGTGACTAACAAATTACTTATTGTTTTATCTTATTTAACA... | TGCAGATTTTACTTTATGTGAGAAAATCTACAATTTCTTCGAGACA... | AGAAGGAAAAATAAAAAGAAATTGCTGCAGATTTTACTTTATGTGA... | TTTTTATAATAGGTAAAGAGAGCGTTTTCCCAAAGAAAATAACATA... | 84501071 | ... | 17.0 | 885.0 | 1 | 1 | 1 | 1 | 2 | 2 | 2 | 2 |
34483 | ZNF711 | ZNF711.8 | 8 | 11 | 0 | TGTTATGTGGGATTATTATTTCTAAATGTTACTCATTGAAATGAGC... | TGTGTTATGTGGCTGTAAATGATGTACACGCTGTAAAATAAGATCG... | AAATTTGGAATATCTACTAAAATTGTGTGTTATGTGGCTGTAAATG... | AATCAGTTCCTTGAGAATAAATTTTTTATCTTTCTTAACTTCAGAA... | 84528343 | ... | 92.0 | 4818.0 | 2 | 2 | 2 | 2 | 2 | 2 | 2 | 2 |
34484 | ZNF75D | ZNF75D.6 | 4 | 9 | 0 | TAATTGTTAATGAATATTAATTTTGTTAATGAATATATATTAAACC... | TTTTAGCATGTACAGCATGAAAGTTTTATATGTTTATTAATTTTTG... | TAAGGCATCTTTTGTCTGGAATATGTTTTAGCATGTACAGCATGAA... | TTAGGCTAGTTTTTTGGTATACCATTTCTAAACCAATGGTAGGAAC... | 134452509 | ... | 5.0 | 300.0 | 1 | 1 | 1 | 1 | 2 | 2 | 2 | 2 |
34485 | ZNF75D | ZNF75D.1 | 9 | 9 | 0 | TAAAATTGTACTTCGTAATGAAAATGACACATTTTATCTTAAATTT... | GTTTATAATGTTGGCCAAGGCTTATTTATATATGTTTATTTAGTAT... | GGATTTATATCTATGTCAGATCCTGGTTTATAATGTTGGCCAAGGC... | ACTATCTCTTAAGAGAAGATAATGTGACGTCAAGGGAAGTTGGAAG... | 134419749 | ... | 17.0 | 1500.0 | 2 | 2 | 2 | 2 | 2 | 2 | 2 | 2 |
34486 | ZNF81 | ZNF81.1 | 1 | 15 | 0 | TCAGGCTGTTTATCTGGCTGTTCATCTGTGTCTTTTGTAATATTCT... | CTCCTAATCCCATGCAGTTTCCTGGGTGATAGGAGCATCTTTTGTT... | TTTGACCCTTGTTCTTGACACAGAGCTCCTAATCCCATGCAGTTTC... | ATGTATGTGGATAGGGGTGCTAGGAACATCTGCTGTTCTAACATTT... | 47720423 | ... | 24.0 | 1172.0 | 1 | 1 | 1 | 1 | 3 | 3 | 3 | 3 |
34487 | ZNF81 | ZNF81.11 | 11 | 15 | 0 | TTTATTGCATTTCTTCCTCCACTATTCTTCTCTAACAGATGACCAA... | TTGGGCAGCATGATAACAACCCAGCAAAAAGCTAACTGATACATTG... | GTTAACTGTTTTAAGTCACTAAGTTTTGGGCAGCATGATAACAACC... | GCCATCCATCCCAGCCATCCCAAACAACTACCAGAATAATGGATTA... | 47781589 | ... | 6.0 | 168.0 | 2 | 2 | 2 | 2 | 3 | 3 | 3 | 3 |
34488 | ZNF81 | ZNF81.14 | 14 | 15 | 0 | TCCATTCATGTTTTGATGGAAATTTGTATTTCCAGCTTTTGGCTAT... | TTTTATTGCTAAGTGTTATTTCATTATATGGACATACCAGAATTTG... | GTTATATGCGTCTATGTTCATTCCTTTTTATTGCTAAGTGTTATTT... | TTGTTTGGCTCCTTCCATCCAGCATAATAAGTTTGAGATTCATTCA... | 47784996 | ... | 16.0 | 1011.0 | 3 | 3 | 3 | 3 | 3 | 3 | 3 | 3 |
34489 | ZRSR2 | ZRSR2.1 | 1 | 4 | 0 | AATACTGTATGACTTTATGTCCTATTTCAAAGCAGATGTATATTCA... | TTTCTTTATGAGTTGCATACTATTATTGAAAAGCAAGAAAATTATG... | TATCTTTGTGCTCCTTAGATCTTTATTTCTTTATGAGTTGCATACT... | GTAATTTTTTTCCTGTTTATTTGAAATGCTTCTTTTAAATCACATA... | 15820049 | ... | 11.0 | 354.0 | 1 | 1 | 1 | 1 | 2 | 2 | 2 | 2 |
34490 | ZRSR2 | ZRSR2.3 | 3 | 4 | 0 | CAGGAGGAGGTCGGGTAATAGAGACAGAACTGTTCAGAGTCCCAAA... | AGGAGCCGCCGCAGCCGGAGCCAAAGTTCCTCTAGGTCCCGAAGTC... | GAGCCGGAGCCGGAGCCGGAGCCGCAGGAGCCGCCGCAGCCGGAGC... | GGAAGAAATAGGGACCGCAGCAGGGACCGCAGCCGGGGCCGGGGCA... | 15841360 | ... | 65.0 | 3824.0 | 2 | 2 | 2 | 2 | 2 | 2 | 2 | 2 |
34491 | ZXDA | ZXDA.13 | 1 | 13 | 2 | TGTGACCCTCAGCAAGTCACTTAACCTATCTGAGCCTTAATTTCCT... | ATGAGTTTGGAGATCTAAATTCTGATCTTGAGTCTGGAACTGACAA... | TGCAAAAAGAGCACAGCCCTGGACTATGAGTTTGGAGATCTAAATT... | TTGAGGATATTCTGGACTAAATATTTAAGTGCAGTCATTTCTTTTT... | 57934228 | ... | 7.0 | 325.0 | 1 | 1 | 1 | -1 | 2 | 2 | 2 | 1 |
34492 | ZXDA | ZXDA.5 | 9 | 13 | 0 | TTGTTTATCTTCAGTCCTTGATTTATTAACATTTTGCCAATTGAAA... | CCTTTTTGTTTAAACAATGGAACATGATTTAATTTTATCTCATTTG... | TTCTGTTAGTGTTGGCATAAGCTTGCCTTTTTGTTTAAACAATGGA... | GACATGGCATGTAATTACAGACAGAAGTGTGACTGTAAACATATTA... | 57931885 | ... | 12.0 | 495.0 | 2 | 2 | 2 | 1 | 2 | 2 | 2 | 1 |
34493 | ZXDB | ZXDB.2 | 2 | 6 | 3 | TGGTTTGAATAGATTGCTTTTAAGGTCTTTCTGCTCTGTGATTCCT... | CAAGTCAGTTAACCTATCTGAGCCTTAATTTCCTTATTTATAAATT... | AACTGACAAGTTGTGTGACCCTGAGCAAGTCAGTTAACCTATCTGA... | ACAGCCCTGGACTACAAGTTTGGAGATTTAAATTCTGATCTTGAGT... | 57621181 | ... | 7.0 | 242.0 | 1 | 1 | -1 | -1 | 2 | 2 | 1 | 1 |
34494 | ZXDB | ZXDB.5 | 5 | 6 | 0 | GCAGATGAGTATGTCAAGGATTGAGATGAACACATAAGTCTTGGAA... | TGTTGTCATTCATTAAGGCCTCTTAAATAGACCACTATTTTTTGTG... | GTACTCATATAGCATATTTCAAAAATGTTGTCATTCATTAAGGCCT... | TTGATTGTCTACCCAATCAACAGTTTTCCCTCTTTGCTCTGGAAAT... | 57623885 | ... | 38.0 | 2088.0 | 2 | 2 | 1 | 1 | 2 | 2 | 1 | 1 |
34495 rows × 54 columns
# Write the table to a tab-separated file (header kept, index dropped) and
# immediately read it back, so `df` from here on is exactly what is on disk.
perturb_csv_path = 'polyadb_processed_perturb.csv'
df.to_csv(perturb_csv_path, sep='\t', header=True, index=False)
df = pd.read_csv(perturb_csv_path, sep='\t')
import matplotlib.pyplot as plt
import numpy as np
# For the raw `num_sites` column and then each `num_sites_pas_<level>` column
# (levels 4 down to 1): restrict to rows with at least 2 sites, count the
# unique genes per distinct site count, print the total and draw a bar chart.
site_count_columns = ["num_sites"] + ["num_sites_pas_" + str(level) for level in (4, 3, 2, 1)]

for site_count_col in site_count_columns:
    f = plt.figure()

    t1 = (
        df.query(site_count_col + " >= 2")
        .groupby(site_count_col)
        .agg({"gene" : "nunique"})
        .reset_index(drop=True)
    )
    genes_per_group = t1['gene'].values

    print("n genes (total) = " + str(int(np.sum(genes_per_group))))

    # The group key is dropped by reset_index, so bars sit at 1..len(t1) in
    # group order rather than at the actual site-count values.
    plt.bar(np.arange(len(t1)) + 1, genes_per_group)
    plt.show()
n genes (total) = 11672
n genes (total) = 8124
n genes (total) = 7804
n genes (total) = 7292
n genes (total) = 6593
#Process features
# Turn the per-site table `df` into per-gene arrays plus a per-gene table:
#   m[i, k]    -> 1. when gene i has a selected site in slot k, else 0.
#   l[i, k]    -> |cut_mode - cut_mode of the previously visited site of the
#                 same gene| (0. for the first visited site of a gene).
#   c[i, k, t] -> value of column cell_types[t] for that site.
#   gene_df    -> one row per gene; per-site fields live in columns
#                 suffixed with "_<k>".

min_pas_level = 3
max_num_sites = 10

sitenum_col = "sitenum_pas_" + str(min_pas_level)
num_sites_col = "num_sites_pas_" + str(min_pas_level)

# Keep sites that have a slot at this level (sitenum != -1) in genes with
# between 2 and max_num_sites sites. The upper bound is taken from
# max_num_sites (it used to be a hard-coded 10) so the filter always agrees
# with the width of the arrays allocated below.
df_sel = df.query(
    num_sites_col + " >= 2 and "
    + num_sites_col + " <= " + str(max_num_sites)
    + " and " + sitenum_col + " != -1"
).copy().reset_index(drop=True)

genes = df_sel['gene'].unique()
gene_dict = {gene : gene_i for gene_i, gene in enumerate(genes)}

# dtype=object (the builtin) replaces the np.object alias, which was
# deprecated in NumPy 1.20 and removed in NumPy 1.24.
cell_types = np.array(['rpm', 'NT', 'CDC73', 'CPSF1', 'CPSF2', 'CPSF3', 'CPSF3L', 'CPSF4', 'CPSF6', 'CSTF1', 'CSTF3', 'CTR9', 'FIP1L1', 'LEO1', 'NUDT21', 'PABPC1', 'PABPN1', 'PAF1', 'PAPOLA', 'PCF11', 'RBBP6', 'RPRD1A', 'RPRD1B', 'SCAF8', 'SF3A1', 'SRSF3', 'SYMPK', 'THOC5'], dtype=object)
cell_type_dict = {cell_type : cell_type_i for cell_type_i, cell_type in enumerate(cell_types)}

n_genes = genes.shape[0]

m = np.zeros((n_genes, max_num_sites))
l = np.zeros((n_genes, max_num_sites))
c = np.zeros((n_genes, max_num_sites, cell_types.shape[0]))

# cut_mode of the most recently visited site, per gene. Distances therefore
# depend on the row order of df_sel.
prev_pos_dict = {}

# Per-site fields and the placeholder stored for slots a gene does not use.
site_field_defaults = [
    ('gene_id', 'N/A'),
    ('site_type', 'N/A'),
    ('pas', -1),
    ('wide_seq_ext', 'N/A'),
    ('pas_exists', 0),
    ('dist', -1),
    ('pas_pos', 'N/A'),
    ('cut_mode', 'N/A'),
    ('cut_mode_hg38', 'N/A'),
]

gene_df_dict = {
    'gene': ['N/A'] * n_genes,
    'strand': ['N/A'] * n_genes,
    'chrom': ['N/A'] * n_genes,
}
gene_df_cols = ['gene', 'strand', 'chrom']

for k in range(max_num_sites):
    for field, default in site_field_defaults:
        col = field + "_" + str(k)
        gene_df_dict[col] = [default] * n_genes
        gene_df_cols.append(col)

for _, row in df_sel.iterrows():
    gene = row['gene']

    i = gene_dict[gene]
    # sitenum_pas_<level> is 1-based; slots are 0-based.
    k = int(row[sitenum_col]) - 1
    suffix = "_" + str(k)

    if i % 1000 == 0 and k == 0:
        print("Processing gene " + str(i) + "...")

    m[i, k] = 1.

    if gene in prev_pos_dict:
        l[i, k] = np.abs(row['cut_mode'] - prev_pos_dict[gene])
    else:
        l[i, k] = 0.
    prev_pos_dict[gene] = row['cut_mode']

    gene_df_dict['gene'][i] = gene
    gene_df_dict['strand'][i] = row['strand']
    gene_df_dict['chrom'][i] = row['chrom']

    gene_df_dict['gene_id' + suffix][i] = row['gene_id']
    gene_df_dict['pas' + suffix][i] = row['pas']
    gene_df_dict['site_type' + suffix][i] = row['site_type']
    gene_df_dict['wide_seq_ext' + suffix][i] = row['wide_seq_ext']
    gene_df_dict['pas_exists' + suffix][i] = int(m[i, k])
    gene_df_dict['dist' + suffix][i] = l[i, k]
    gene_df_dict['pas_pos' + suffix][i] = row['pas_pos']
    gene_df_dict['cut_mode' + suffix][i] = row['cut_mode']
    gene_df_dict['cut_mode_hg38' + suffix][i] = row['cut_mode_hg38']

    for cell_type_i, cell_type in enumerate(cell_types):
        c[i, k, cell_type_i] = row[cell_type]

gene_df = pd.DataFrame(gene_df_dict)
gene_df = gene_df[gene_df_cols]

print(len(gene_df))
print("m.shape = " + str(m.shape))
print("l.shape = " + str(l.shape))
print("c.shape = " + str(c.shape))
Processing gene 0... Processing gene 1000... Processing gene 2000... Processing gene 3000... Processing gene 4000... Processing gene 5000... Processing gene 6000... Processing gene 7000... 7740 m.shape = (7740, 10) l.shape = (7740, 10) c.shape = (7740, 10, 28)
# Keep only genes whose count, summed over every site slot and every column
# of c, is positive. m, l, c and gene_df are filtered with the same mask so
# their rows stay aligned.
total_c = c.sum(axis=(1, 2))
has_counts = total_c > 0

m, l, c = m[has_counts, :], l[has_counts, :], c[has_counts, ...]
gene_df = gene_df.iloc[np.nonzero(has_counts)[0]].copy().reset_index(drop=True)

print(len(gene_df))
print("m.shape = " + str(m.shape))
print("l.shape = " + str(l.shape))
print("c.shape = " + str(c.shape))
7740 m.shape = (7740, 10) l.shape = (7740, 10) c.shape = (7740, 10, 28)
# y[i, k, t]: count of site k divided by gene i's total count over all site
# slots, separately for each column t of c. Genes with a zero total in some
# column produce 0/0 there; that expected case is silenced locally instead
# of emitting a RuntimeWarning, and the resulting NaNs are set to 0.
site_totals = np.sum(c, axis=1, keepdims=True)
with np.errstate(divide='ignore', invalid='ignore'):
    y = c / site_totals
y[np.isnan(y)] = 0.

# s[i, t]: 1. when gene i has a positive total in column t, else 0.
# Builtin float replaces the np.float alias (deprecated in NumPy 1.20,
# removed in NumPy 1.24); both mean float64.
s = np.array(site_totals[:, 0, :] > 0, dtype=float)
/home/jlinder2/anaconda3/envs/tensorflow/lib/python3.6/site-packages/ipykernel/__main__.py:2: RuntimeWarning: invalid value encountered in true_divide from ipykernel import kernelapp as app
# Persist the per-gene table (tab-separated, index included) and the
# feature arrays; both file names carry the PAS level used for selection.
features_prefix = "polyadb_features_pas_" + str(min_pas_level) + "_perturb"

gene_df.to_csv(features_prefix + ".csv", sep='\t')

# Variant that also stored an `x` array, kept for reference:
#np.savez("polyadb_features_pas_" + str(min_pas_level) + "_perturb.npz", x=x, m=m, l=l, c=c, y=y, s=s)
np.savez(features_prefix + "_no_x.npz", m=m, l=l, c=c, y=y, s=s)
# Reload the saved feature arrays from disk.
save_dict = np.load("polyadb_features_pas_3_perturb_no_x.npz")
m, l, c, y, s = (save_dict[key] for key in ('m', 'l', 'c', 'y', 's'))

# Index of the last populated site slot of every gene (last nonzero entry of
# each row of m). NOTE(review): presumably the distal-most site, which
# relies on slots being ordered proximal -> distal -- confirm.
# A row of m with no populated slot raises IndexError here.
dist_index = np.array([np.nonzero(site_mask)[0][-1] for site_mask in m], dtype=int)

gene_rows = np.arange(m.shape[0])

# y_dist[i, t] = y[i, dist_index[i], t], selected with integer-array
# indexing rather than a Python loop plus np.concatenate (which fails on an
# empty list when there are no genes).
y_dist = y[gene_rows, dist_index, :]

# One-hot mask over site slots marking the selected slot of each gene.
dist_mask = np.zeros(m.shape)
dist_mask[gene_rows, dist_index] = 1.

# dtype=object (the builtin) replaces the np.object alias, which was
# deprecated in NumPy 1.20 and removed in NumPy 1.24.
cell_types = np.array(['rpm', 'NT', 'CDC73', 'CPSF1', 'CPSF2', 'CPSF3', 'CPSF3L', 'CPSF4', 'CPSF6', 'CSTF1', 'CSTF3', 'CTR9', 'FIP1L1', 'LEO1', 'NUDT21', 'PABPC1', 'PABPN1', 'PAF1', 'PAPOLA', 'PCF11', 'RBBP6', 'RPRD1A', 'RPRD1B', 'SCAF8', 'SF3A1', 'SRSF3', 'SYMPK', 'THOC5'], dtype=object)
cell_type_dict = {cell_type : cell_type_i for cell_type_i, cell_type in enumerate(cell_types)}
#PolyADB annotated distal site comparison
import matplotlib.pyplot as plt
from scipy.stats import spearmanr, pearsonr
# Correlate the y_dist values of two columns ('rpm' vs 'NT') across genes,
# keeping only genes whose total count in each column reaches its minimum,
# then scatter one against the other.
cell_type_1, cell_type_2 = 'rpm', 'NT'
min_count_1, min_count_2 = 0., 10.

cell_type_1_ix = cell_type_dict[cell_type_1]
cell_type_2_ix = cell_type_dict[cell_type_2]

# Per-gene totals over all site slots for each of the two columns.
gene_total_1 = c[..., cell_type_1_ix].sum(axis=-1)
gene_total_2 = c[..., cell_type_2_ix].sum(axis=-1)
keep_index = np.nonzero((gene_total_1 >= min_count_1) & (gene_total_2 >= min_count_2))[0]

usage_1 = y_dist[keep_index, cell_type_1_ix]
usage_2 = y_dist[keep_index, cell_type_2_ix]

print("n = " + str(keep_index.shape[0]))
print(spearmanr(usage_1, usage_2))
print(pearsonr(usage_1, usage_2))

f = plt.figure(figsize=(4, 4))

plt.scatter(usage_1, usage_2, color='black', alpha=0.25, s=8)

plt.xlim(0, 1)
plt.ylim(0, 1)

plt.tight_layout()
plt.show()
n = 7740 SpearmanrResult(correlation=0.8297174276563369, pvalue=0.0) (0.8119959825371189, 0.0)
#Comparison for identified sites in perturb-seq data only, all perturbations
import matplotlib.pyplot as plt
from scipy.stats import spearmanr, pearsonr
# Correlate the y_dist values of the 'NT' column against every other column
# of cell_types except the first two, one report and scatter plot per pair.
# A gene is used for a pair only when its total count reaches the minimum
# in both columns.
cell_type_1 = 'NT'
min_count_1 = 10.
min_count_2 = 10.

cell_type_1_ix = cell_type_dict[cell_type_1]

# The reference-column filter does not change between iterations.
enough_counts_1 = c[..., cell_type_1_ix].sum(axis=-1) >= min_count_1

for cell_type_2_ix, cell_type_2 in enumerate(cell_types.tolist()):
    # Positions 0 and 1 of cell_types ('rpm' and 'NT') are skipped.
    if cell_type_2_ix <= 1:
        continue

    print(cell_type_1 + " vs. " + cell_type_2)

    enough_counts_2 = c[..., cell_type_2_ix].sum(axis=-1) >= min_count_2
    keep_index = np.nonzero(enough_counts_1 & enough_counts_2)[0]

    usage_1 = y_dist[keep_index, cell_type_1_ix]
    usage_2 = y_dist[keep_index, cell_type_2_ix]

    print("n = " + str(keep_index.shape[0]))
    print(spearmanr(usage_1, usage_2))
    print(pearsonr(usage_1, usage_2))

    f = plt.figure(figsize=(4, 4))

    plt.scatter(usage_1, usage_2, color='black', alpha=0.25, s=8)

    plt.xlim(0, 1)
    plt.ylim(0, 1)

    plt.tight_layout()
    plt.show()
NT vs. CDC73 n = 7368 SpearmanrResult(correlation=0.9759855957124476, pvalue=0.0) (0.977429182010042, 0.0)
NT vs. CPSF1 n = 7512 SpearmanrResult(correlation=0.9707458894559108, pvalue=0.0) (0.9712552262959689, 0.0)
NT vs. CPSF2 n = 7572 SpearmanrResult(correlation=0.9757298456247357, pvalue=0.0) (0.975279275247471, 0.0)
NT vs. CPSF3 n = 7596 SpearmanrResult(correlation=0.9786991020022822, pvalue=0.0) (0.9782541411488914, 0.0)
NT vs. CPSF3L n = 7050 SpearmanrResult(correlation=0.9643231858578178, pvalue=0.0) (0.9710536764786775, 0.0)
NT vs. CPSF4 n = 7169 SpearmanrResult(correlation=0.9651319917724214, pvalue=0.0) (0.9680692405652714, 0.0)
NT vs. CPSF6 n = 7721 SpearmanrResult(correlation=0.9561261001495693, pvalue=0.0) (0.9358077955848794, 0.0)
NT vs. CSTF1 n = 7388 SpearmanrResult(correlation=0.9757753260631744, pvalue=0.0) (0.9799748426987822, 0.0)
NT vs. CSTF3 n = 7590 SpearmanrResult(correlation=0.970391938876148, pvalue=0.0) (0.9702205308817213, 0.0)
NT vs. CTR9 n = 7427 SpearmanrResult(correlation=0.9737019236985452, pvalue=0.0) (0.9716469803701628, 0.0)
NT vs. FIP1L1 n = 7292 SpearmanrResult(correlation=0.9670152745948519, pvalue=0.0) (0.968782173637769, 0.0)
NT vs. LEO1 n = 7727 SpearmanrResult(correlation=0.9908326634777129, pvalue=0.0) (0.9901086906369176, 0.0)
NT vs. NUDT21 n = 7590 SpearmanrResult(correlation=0.9226209090802132, pvalue=0.0) (0.8913509649313834, 0.0)
NT vs. PABPC1 n = 7659 SpearmanrResult(correlation=0.9907838399889082, pvalue=0.0) (0.9910951179765417, 0.0)
NT vs. PABPN1 n = 7675 SpearmanrResult(correlation=0.9814972569452343, pvalue=0.0) (0.9786606490597117, 0.0)
NT vs. PAF1 n = 7031 SpearmanrResult(correlation=0.9613290519809636, pvalue=0.0) (0.9621589110159761, 0.0)
NT vs. PAPOLA n = 7695 SpearmanrResult(correlation=0.9907384788284151, pvalue=0.0) (0.9902172229697065, 0.0)
NT vs. PCF11 n = 7347 SpearmanrResult(correlation=0.9783597430606967, pvalue=0.0) (0.9799179980774865, 0.0)
NT vs. RBBP6 n = 7646 SpearmanrResult(correlation=0.9799537785137653, pvalue=0.0) (0.9778830754060095, 0.0)
NT vs. RPRD1A n = 7616 SpearmanrResult(correlation=0.9897929678008958, pvalue=0.0) (0.9919689061410408, 0.0)
NT vs. RPRD1B n = 7706 SpearmanrResult(correlation=0.9919697086187015, pvalue=0.0) (0.9918952298812577, 0.0)
NT vs. SCAF8 n = 7732 SpearmanrResult(correlation=0.9929301487763096, pvalue=0.0) (0.9919385471179399, 0.0)
NT vs. SF3A1 n = 5652 SpearmanrResult(correlation=0.9535519570269069, pvalue=0.0) (0.9751683983677917, 0.0)
NT vs. SRSF3 n = 7185 SpearmanrResult(correlation=0.967020722169193, pvalue=0.0) (0.9692531386700382, 0.0)
NT vs. SYMPK n = 6830 SpearmanrResult(correlation=0.9640766067368118, pvalue=0.0) (0.9714010921978059, 0.0)
NT vs. THOC5 n = 7457 SpearmanrResult(correlation=0.9703668483971793, pvalue=0.0) (0.9699731954408999, 0.0)