block_size = 50000
hdf5_fname = '../raw/total-3L.h5'
vcf_fname = '../raw/total-3L.vcf.gz'
import gzip
import dask.array as da
import numpy as np
import tables
import vcf
#Preamble to get the maximum position
store = tables.open_file(hdf5_fname, 'r')
pos_array = store.get_node('/3L/variants/POS').iterrows()
num_snps = pos_array.nrows
max_pos = pos_array.read(num_snps - 1)[0] # [-1] notation not yet supported
store.close()
print(num_snps, max_pos)
9643193 41956556
#1-index vs 0-index
def compute_bin_index(position):
return (position - 1) // block_size
def compute_snps_per_bin_hdf5(in_memory=False):
# lets save memory, not a lot in this case, but an example
count_snps = np.zeros(1 + max_pos // block_size, dtype=np.uint16 if block_size < 65536 else np.uint32)
store = tables.open_file(hdf5_fname, 'r')
if in_memory:
pos_iter = store.get_node('/3L/variants/POS').read()
else:
pos_iter = store.get_node('/3L/variants/POS').iterrows()
for pos in pos_iter:
count_snps[compute_bin_index(pos)] += 1
store.close()
return count_snps
#We can do in-memory load
%time compute_snps_per_bin_hdf5(True)
CPU times: user 35.8 s, sys: 1.86 s, total: 37.6 s Wall time: 37.7 s
array([ 44, 922, 947, 1508, 16, 367, 1235, 1570, 193, 255, 471, 0, 608, 844, 1056, 1619, 1869, 2161, 513, 790, 746, 122, 1363, 560, 1118, 2746, 964, 606, 0, 796, 556, 2965, 3118, 1341, 2240, 2875, 2028, 3171, 2948, 4471, 5361, 4225, 4150, 4391, 4521, 4125, 1693, 3975, 3986, 3742, 2866, 3822, 4708, 3372, 2529, 3106, 4640, 4710, 6672, 2866, 3729, 4116, 3987, 4380, 4710, 3915, 4001, 4106, 5719, 5161, 5011, 5496, 4526, 4518, 4488, 3837, 4584, 4038, 5364, 2917, 3237, 3837, 3691, 2930, 3083, 917, 368, 156, 705, 83, 2, 112, 126, 0, 37, 223, 3992, 3508, 2452, 405, 3070, 3722, 4343, 5154, 4086, 3949, 5600, 5617, 5373, 5003, 4531, 3858, 5463, 5895, 5544, 4980, 5603, 3133, 4859, 6090, 4047, 5047, 3224, 3732, 4010, 6464, 3667, 4161, 5077, 3353, 4211, 6107, 5768, 7598, 7645, 6855, 5855, 3962, 6124, 5011, 4262, 5818, 1912, 2457, 139, 75, 2721, 2273, 802, 2750, 2063, 3023, 7881, 4446, 0, 718, 4897, 6491, 5247, 4265, 5307, 6874, 2528, 3260, 7526, 5011, 8233, 5258, 6112, 9027, 10422, 6377, 7475, 6492, 6977, 6474, 7203, 5535, 4862, 5044, 7313, 4583, 5018, 5578, 487, 3944, 8671, 9137, 1323, 7948, 3542, 2495, 7390, 6856, 7613, 5310, 5593, 2672, 94, 0, 6377, 9638, 1869, 5084, 10877, 5454, 5577, 7232, 9003, 11156, 8447, 7614, 8652, 7402, 7646, 8516, 4900, 5634, 4821, 6556, 6860, 8818, 8058, 613, 3253, 5546, 10322, 11512, 8572, 6927, 5496, 2345, 3954, 9128, 9448, 12448, 15173, 9542, 9089, 11119, 8993, 8994, 9812, 10215, 11149, 6595, 8018, 8130, 10845, 11007, 15774, 14550, 7671, 10244, 12786, 11360, 9341, 12174, 5104, 10359, 11389, 12209, 9287, 9675, 11509, 11266, 14995, 11674, 13059, 13882, 9820, 10316, 12637, 13539, 2254, 5762, 10234, 468, 1052, 9829, 14275, 14710, 14613, 18258, 17301, 13921, 11845, 12823, 8641, 8454, 7292, 8200, 7988, 15067, 16474, 15719, 7810, 17140, 14419, 14896, 18316, 12084, 16779, 12500, 15195, 8956, 11441, 17814, 18211, 8627, 11326, 2006, 3239, 12783, 8305, 11581, 13666, 11973, 2310, 4870, 8834, 5568, 3455, 518, 4658, 8508, 14035, 16461, 16358, 12676, 13480, 14489, 19479, 16009, 18529, 16847, 17870, 16099, 17109, 18108, 16293, 16167, 16217, 13641, 16917, 20694, 15810, 16748, 13495, 13814, 13936, 16500, 13654, 13691, 12080, 14253, 12127, 9302, 5118, 5593, 18194, 15659, 12246, 12029, 13922, 13134, 16186, 18298, 20525, 18659, 21030, 18924, 17696, 16517, 18570, 14149, 18232, 13296, 15713, 11750, 18306, 16713, 12147, 17511, 16945, 12972, 13601, 17045, 15020, 14985, 15522, 14177, 15084, 14432, 18245, 17672, 14977, 11461, 9786, 12393, 13718, 10137, 13898, 17502, 15773, 3541, 17, 83, 14821, 10063, 13753, 12751, 14524, 13997, 13351, 13838, 12543, 5876, 13796, 12067, 12039, 9658, 10416, 17690, 16934, 14585, 19616, 12132, 10731, 12070, 12407, 13273, 12940, 10325, 14044, 17366, 11488, 159, 12850, 11440, 14626, 17196, 13620, 18362, 14321, 16261, 19155, 16360, 10809, 13839, 19611, 19180, 17469, 18089, 11198, 14285, 17635, 20275, 18957, 15682, 13970, 8300, 11441, 9392, 8534, 3784, 0, 0, 2897, 5830, 3534, 12097, 13585, 16870, 15340, 13788, 10533, 4649, 6595, 12812, 15825, 15143, 18161, 12345, 14831, 20867, 15622, 15926, 9297, 12006, 827, 12112, 4436, 5412, 12243, 13814, 19302, 13905, 14789, 15232, 16068, 16951, 14979, 16634, 16891, 14953, 9272, 16003, 10372, 8836, 4931, 19684, 14789, 11601, 14881, 12228, 14918, 18051, 19081, 16351, 16290, 19718, 18345, 17682, 13877, 12350, 15571, 19650, 13450, 12475, 13394, 10194, 10142, 17092, 19755, 11751, 12403, 7420, 2170, 10262, 12772, 13331, 16900, 16210, 13294, 14439, 20928, 21125, 19971, 17322, 20123, 23387, 17079, 13429, 15201, 16550, 20723, 13883, 18487, 15762, 16634, 18440, 12863, 19423, 18392, 17595, 14690, 15024, 14074, 7844, 15101, 15403, 10952, 12973, 16704, 12864, 13769, 16925, 18148, 19011, 16384, 6266, 17466, 14969, 16772, 12345, 17831, 17754, 19683, 18952, 22256, 22189, 19185, 18692, 23531, 22969, 20267, 20132, 16693, 16970, 13396, 13317, 14341, 15761, 17150, 20039, 14563, 19437, 16707, 13422, 16409, 16343, 15070, 17827, 20098, 14659, 14133, 19803, 17216, 14998, 9952, 13632, 6683, 9092, 9794, 15253, 20426, 19026, 19931, 12619, 15700, 19188, 19448, 14252, 15184, 14658, 12583, 16365, 19098, 16086, 12608, 12744, 12112, 15531, 15659, 16561, 15900, 18745, 15044, 17080, 16244, 21544, 13571, 20607, 23138, 17525, 9832, 13346, 14747, 12534, 15450, 15450, 10981, 14293, 16707, 11216, 12352, 14434, 16374, 16622, 17817, 13938, 17086, 16703, 14063, 17986, 15860, 15884, 16079, 18990, 22215, 11866, 12072, 21262, 18214, 18930, 19353, 18313, 18534, 20240, 19587, 17230, 14985, 14481, 18932, 19152, 15008, 14858, 18255, 17377, 18675, 19624, 15350, 13101, 14924, 14745, 18368, 17607, 18308, 15751, 16676, 17483, 12450, 13574, 14681, 20729, 8802, 11164, 19105, 20751, 14946, 14812, 16508, 13045, 13598, 1662, 9985, 1732, 2541, 0, 5170, 13475, 20586, 6070, 18577, 18802, 11117, 17653, 17283, 17812, 16100, 21801, 20480, 11822, 16371, 4693, 3153, 10670, 18295, 9412, 10875, 8988, 13147, 17466, 18072, 14239, 18637, 11022, 12864, 13788, 13225, 12474, 16018, 11694, 11701, 14004, 8149, 17778, 18699, 11032, 17928, 18699, 9506, 1009, 13449, 19832, 17837, 21597, 17062, 15945, 17439, 10506, 12991, 16155, 17368, 19337, 19747, 21282, 19063, 23277, 19012, 17843, 12667, 13727, 15519, 16703, 13610, 19547, 21413, 18396, 21770, 20833, 16803, 23080, 23685, 23059, 24809, 15926, 23384, 26079, 24982, 23776, 24136, 23378, 19434, 15781, 17058, 15817, 21908, 21144, 11910, 5914, 13589, 18370, 20585, 22493, 20552, 18456, 20849, 19781, 16845, 17556, 17573, 16246, 9085, 9751, 5038, 4399, 4222, 586], dtype=uint16)
%time compute_snps_per_bin_hdf5(False)
CPU times: user 49.3 s, sys: 24 ms, total: 49.3 s Wall time: 49.3 s
array([ 44, 922, 947, 1508, 16, 367, 1235, 1570, 193, 255, 471, 0, 608, 844, 1056, 1619, 1869, 2161, 513, 790, 746, 122, 1363, 560, 1118, 2746, 964, 606, 0, 796, 556, 2965, 3118, 1341, 2240, 2875, 2028, 3171, 2948, 4471, 5361, 4225, 4150, 4391, 4521, 4125, 1693, 3975, 3986, 3742, 2866, 3822, 4708, 3372, 2529, 3106, 4640, 4710, 6672, 2866, 3729, 4116, 3987, 4380, 4710, 3915, 4001, 4106, 5719, 5161, 5011, 5496, 4526, 4518, 4488, 3837, 4584, 4038, 5364, 2917, 3237, 3837, 3691, 2930, 3083, 917, 368, 156, 705, 83, 2, 112, 126, 0, 37, 223, 3992, 3508, 2452, 405, 3070, 3722, 4343, 5154, 4086, 3949, 5600, 5617, 5373, 5003, 4531, 3858, 5463, 5895, 5544, 4980, 5603, 3133, 4859, 6090, 4047, 5047, 3224, 3732, 4010, 6464, 3667, 4161, 5077, 3353, 4211, 6107, 5768, 7598, 7645, 6855, 5855, 3962, 6124, 5011, 4262, 5818, 1912, 2457, 139, 75, 2721, 2273, 802, 2750, 2063, 3023, 7881, 4446, 0, 718, 4897, 6491, 5247, 4265, 5307, 6874, 2528, 3260, 7526, 5011, 8233, 5258, 6112, 9027, 10422, 6377, 7475, 6492, 6977, 6474, 7203, 5535, 4862, 5044, 7313, 4583, 5018, 5578, 487, 3944, 8671, 9137, 1323, 7948, 3542, 2495, 7390, 6856, 7613, 5310, 5593, 2672, 94, 0, 6377, 9638, 1869, 5084, 10877, 5454, 5577, 7232, 9003, 11156, 8447, 7614, 8652, 7402, 7646, 8516, 4900, 5634, 4821, 6556, 6860, 8818, 8058, 613, 3253, 5546, 10322, 11512, 8572, 6927, 5496, 2345, 3954, 9128, 9448, 12448, 15173, 9542, 9089, 11119, 8993, 8994, 9812, 10215, 11149, 6595, 8018, 8130, 10845, 11007, 15774, 14550, 7671, 10244, 12786, 11360, 9341, 12174, 5104, 10359, 11389, 12209, 9287, 9675, 11509, 11266, 14995, 11674, 13059, 13882, 9820, 10316, 12637, 13539, 2254, 5762, 10234, 468, 1052, 9829, 14275, 14710, 14613, 18258, 17301, 13921, 11845, 12823, 8641, 8454, 7292, 8200, 7988, 15067, 16474, 15719, 7810, 17140, 14419, 14896, 18316, 12084, 16779, 12500, 15195, 8956, 11441, 17814, 18211, 8627, 11326, 2006, 3239, 12783, 8305, 11581, 13666, 11973, 2310, 4870, 8834, 5568, 3455, 518, 4658, 8508, 14035, 16461, 16358, 12676, 13480, 14489, 19479, 16009, 18529, 16847, 17870, 16099, 17109, 18108, 16293, 16167, 16217, 13641, 16917, 20694, 15810, 16748, 13495, 13814, 13936, 16500, 13654, 13691, 12080, 14253, 12127, 9302, 5118, 5593, 18194, 15659, 12246, 12029, 13922, 13134, 16186, 18298, 20525, 18659, 21030, 18924, 17696, 16517, 18570, 14149, 18232, 13296, 15713, 11750, 18306, 16713, 12147, 17511, 16945, 12972, 13601, 17045, 15020, 14985, 15522, 14177, 15084, 14432, 18245, 17672, 14977, 11461, 9786, 12393, 13718, 10137, 13898, 17502, 15773, 3541, 17, 83, 14821, 10063, 13753, 12751, 14524, 13997, 13351, 13838, 12543, 5876, 13796, 12067, 12039, 9658, 10416, 17690, 16934, 14585, 19616, 12132, 10731, 12070, 12407, 13273, 12940, 10325, 14044, 17366, 11488, 159, 12850, 11440, 14626, 17196, 13620, 18362, 14321, 16261, 19155, 16360, 10809, 13839, 19611, 19180, 17469, 18089, 11198, 14285, 17635, 20275, 18957, 15682, 13970, 8300, 11441, 9392, 8534, 3784, 0, 0, 2897, 5830, 3534, 12097, 13585, 16870, 15340, 13788, 10533, 4649, 6595, 12812, 15825, 15143, 18161, 12345, 14831, 20867, 15622, 15926, 9297, 12006, 827, 12112, 4436, 5412, 12243, 13814, 19302, 13905, 14789, 15232, 16068, 16951, 14979, 16634, 16891, 14953, 9272, 16003, 10372, 8836, 4931, 19684, 14789, 11601, 14881, 12228, 14918, 18051, 19081, 16351, 16290, 19718, 18345, 17682, 13877, 12350, 15571, 19650, 13450, 12475, 13394, 10194, 10142, 17092, 19755, 11751, 12403, 7420, 2170, 10262, 12772, 13331, 16900, 16210, 13294, 14439, 20928, 21125, 19971, 17322, 20123, 23387, 17079, 13429, 15201, 16550, 20723, 13883, 18487, 15762, 16634, 18440, 12863, 19423, 18392, 17595, 14690, 15024, 14074, 7844, 15101, 15403, 10952, 12973, 16704, 12864, 13769, 16925, 18148, 19011, 16384, 6266, 17466, 14969, 16772, 12345, 17831, 17754, 19683, 18952, 22256, 22189, 19185, 18692, 23531, 22969, 20267, 20132, 16693, 16970, 13396, 13317, 14341, 15761, 17150, 20039, 14563, 19437, 16707, 13422, 16409, 16343, 15070, 17827, 20098, 14659, 14133, 19803, 17216, 14998, 9952, 13632, 6683, 9092, 9794, 15253, 20426, 19026, 19931, 12619, 15700, 19188, 19448, 14252, 15184, 14658, 12583, 16365, 19098, 16086, 12608, 12744, 12112, 15531, 15659, 16561, 15900, 18745, 15044, 17080, 16244, 21544, 13571, 20607, 23138, 17525, 9832, 13346, 14747, 12534, 15450, 15450, 10981, 14293, 16707, 11216, 12352, 14434, 16374, 16622, 17817, 13938, 17086, 16703, 14063, 17986, 15860, 15884, 16079, 18990, 22215, 11866, 12072, 21262, 18214, 18930, 19353, 18313, 18534, 20240, 19587, 17230, 14985, 14481, 18932, 19152, 15008, 14858, 18255, 17377, 18675, 19624, 15350, 13101, 14924, 14745, 18368, 17607, 18308, 15751, 16676, 17483, 12450, 13574, 14681, 20729, 8802, 11164, 19105, 20751, 14946, 14812, 16508, 13045, 13598, 1662, 9985, 1732, 2541, 0, 5170, 13475, 20586, 6070, 18577, 18802, 11117, 17653, 17283, 17812, 16100, 21801, 20480, 11822, 16371, 4693, 3153, 10670, 18295, 9412, 10875, 8988, 13147, 17466, 18072, 14239, 18637, 11022, 12864, 13788, 13225, 12474, 16018, 11694, 11701, 14004, 8149, 17778, 18699, 11032, 17928, 18699, 9506, 1009, 13449, 19832, 17837, 21597, 17062, 15945, 17439, 10506, 12991, 16155, 17368, 19337, 19747, 21282, 19063, 23277, 19012, 17843, 12667, 13727, 15519, 16703, 13610, 19547, 21413, 18396, 21770, 20833, 16803, 23080, 23685, 23059, 24809, 15926, 23384, 26079, 24982, 23776, 24136, 23378, 19434, 15781, 17058, 15817, 21908, 21144, 11910, 5914, 13589, 18370, 20585, 22493, 20552, 18456, 20849, 19781, 16845, 17556, 17573, 16246, 9085, 9751, 5038, 4399, 4222, 586], dtype=uint16)
def compute_snps_per_bin_vcf():
# lets save memory, not a lot in this case, but an example
count_snps = np.zeros(1 + max_pos // block_size, dtype=np.uint16 if block_size < 65536 else np.uint32)
vcf_file = vcf.Reader(filename=vcf_fname)
for rec in vcf_file:
count_snps[compute_bin_index(rec.POS)] += 1
store.close()
return count_snps
#This will take days!
#%time compute_snps_per_bin_vcf()
#TODO: Do partial function application of the above
def read_lines(name, num):
with gzip.open(name) as f:
for i, l in enumerate(f):
if i == num:
break
def parse_lines(name, num):
f = vcf.Reader(filename=name)
for i, rec in enumerate(f):
if i == num:
break
num_lines = 10000
%time read_lines(vcf_fname, num_lines)
CPU times: user 1.92 s, sys: 16 ms, total: 1.94 s Wall time: 1.94 s
%time parse_lines(vcf_fname, num_lines)
CPU times: user 3min 28s, sys: 1.07 s, total: 3min 29s Wall time: 3min 30s