In [54]:
import pandas as pd
import numpy as np
In [65]:
# Draw 100000 standard-normal samples and scale them by 50.
arr = 50 * np.random.standard_normal(size=100000)
arr
Out[65]:
array([ 17.05715119, -61.3171255 ,  46.61055087, ...,  27.6605187 ,
        29.83815425, -41.28542945])
In [66]:
# Reference value: the 0.1 quantile of the whole, unsplit sample.
value_column = pd.DataFrame({"value": arr})["value"]
ref_q1 = value_column.quantile(0.1)
print("ref q1 is %s" % ref_q1)
ref q1 is -64.44685728671065
In [67]:
# Split a shuffled copy of `arr` into consecutive sub-groups of random size.
#
# The shuffled data lives in `remaining`; `arr` itself is never rebound or
# consumed, so this cell (and any cell that reads `arr`) can be re-run and
# still see the full sample.
remaining = np.random.permutation(arr)  # shuffled copy; `arr` is left as-is
sub_groups = []
while len(remaining) > 0:
    # While more than 1000 rows are left, draw a chunk size from
    # randint(100, 1000) (upper bound exclusive); otherwise take everything
    # that is left as the final sub-group.
    num_rows = len(remaining) if len(remaining) <= 1000 else np.random.randint(100, 1000)
    print("will take %s rows from %s" % (num_rows, len(remaining)))
    sub_group = remaining[:num_rows]
    sub_groups.append(sub_group)
    remaining = remaining[num_rows:]
will take 798 rows from 100000
will take 555 rows from 99202
will take 609 rows from 98647
will take 436 rows from 98038
will take 930 rows from 97602
will take 840 rows from 96672
will take 649 rows from 95832
will take 256 rows from 95183
will take 657 rows from 94927
will take 743 rows from 94270
will take 978 rows from 93527
will take 559 rows from 92549
will take 204 rows from 91990
will take 902 rows from 91786
will take 937 rows from 90884
will take 879 rows from 89947
will take 339 rows from 89068
will take 622 rows from 88729
will take 811 rows from 88107
will take 140 rows from 87296
will take 101 rows from 87156
will take 407 rows from 87055
will take 435 rows from 86648
will take 602 rows from 86213
will take 507 rows from 85611
will take 715 rows from 85104
will take 285 rows from 84389
will take 923 rows from 84104
will take 868 rows from 83181
will take 495 rows from 82313
will take 288 rows from 81818
will take 266 rows from 81530
will take 281 rows from 81264
will take 704 rows from 80983
will take 824 rows from 80279
will take 913 rows from 79455
will take 420 rows from 78542
will take 581 rows from 78122
will take 726 rows from 77541
will take 400 rows from 76815
will take 995 rows from 76415
will take 618 rows from 75420
will take 483 rows from 74802
will take 913 rows from 74319
will take 515 rows from 73406
will take 685 rows from 72891
will take 280 rows from 72206
will take 224 rows from 71926
will take 836 rows from 71702
will take 196 rows from 70866
will take 706 rows from 70670
will take 928 rows from 69964
will take 440 rows from 69036
will take 958 rows from 68596
will take 314 rows from 67638
will take 195 rows from 67324
will take 134 rows from 67129
will take 140 rows from 66995
will take 328 rows from 66855
will take 272 rows from 66527
will take 657 rows from 66255
will take 836 rows from 65598
will take 872 rows from 64762
will take 425 rows from 63890
will take 114 rows from 63465
will take 272 rows from 63351
will take 617 rows from 63079
will take 939 rows from 62462
will take 281 rows from 61523
will take 215 rows from 61242
will take 392 rows from 61027
will take 805 rows from 60635
will take 708 rows from 59830
will take 387 rows from 59122
will take 265 rows from 58735
will take 709 rows from 58470
will take 477 rows from 57761
will take 149 rows from 57284
will take 286 rows from 57135
will take 896 rows from 56849
will take 230 rows from 55953
will take 101 rows from 55723
will take 122 rows from 55622
will take 684 rows from 55500
will take 279 rows from 54816
will take 296 rows from 54537
will take 943 rows from 54241
will take 520 rows from 53298
will take 710 rows from 52778
will take 530 rows from 52068
will take 315 rows from 51538
will take 450 rows from 51223
will take 893 rows from 50773
will take 248 rows from 49880
will take 579 rows from 49632
will take 187 rows from 49053
will take 898 rows from 48866
will take 774 rows from 47968
will take 872 rows from 47194
will take 861 rows from 46322
will take 596 rows from 45461
will take 506 rows from 44865
will take 828 rows from 44359
will take 790 rows from 43531
will take 948 rows from 42741
will take 899 rows from 41793
will take 177 rows from 40894
will take 519 rows from 40717
will take 440 rows from 40198
will take 286 rows from 39758
will take 535 rows from 39472
will take 708 rows from 38937
will take 519 rows from 38229
will take 216 rows from 37710
will take 759 rows from 37494
will take 807 rows from 36735
will take 117 rows from 35928
will take 484 rows from 35811
will take 484 rows from 35327
will take 744 rows from 34843
will take 902 rows from 34099
will take 843 rows from 33197
will take 886 rows from 32354
will take 488 rows from 31468
will take 356 rows from 30980
will take 102 rows from 30624
will take 869 rows from 30522
will take 391 rows from 29653
will take 993 rows from 29262
will take 888 rows from 28269
will take 381 rows from 27381
will take 759 rows from 27000
will take 218 rows from 26241
will take 622 rows from 26023
will take 942 rows from 25401
will take 759 rows from 24459
will take 940 rows from 23700
will take 441 rows from 22760
will take 788 rows from 22319
will take 368 rows from 21531
will take 917 rows from 21163
will take 336 rows from 20246
will take 469 rows from 19910
will take 983 rows from 19441
will take 374 rows from 18458
will take 538 rows from 18084
will take 512 rows from 17546
will take 520 rows from 17034
will take 570 rows from 16514
will take 250 rows from 15944
will take 929 rows from 15694
will take 303 rows from 14765
will take 299 rows from 14462
will take 356 rows from 14163
will take 625 rows from 13807
will take 746 rows from 13182
will take 969 rows from 12436
will take 657 rows from 11467
will take 485 rows from 10810
will take 222 rows from 10325
will take 519 rows from 10103
will take 866 rows from 9584
will take 138 rows from 8718
will take 100 rows from 8580
will take 581 rows from 8480
will take 863 rows from 7899
will take 794 rows from 7036
will take 632 rows from 6242
will take 981 rows from 5610
will take 877 rows from 4629
will take 926 rows from 3752
will take 510 rows from 2826
will take 975 rows from 2316
will take 577 rows from 1341
will take 764 rows from 764
In [68]:
# For every sub-group record its size plus its 0.1 and 0.5 quantiles.
computed = []
for group in sub_groups:
    values = pd.Series(group, name="value")
    row = [len(group), values.quantile(0.1), values.quantile(0.5)]
    computed.append(row)

# Combine the per-group 0.1 quantiles into one figure, weighting each group
# by its number of rows.
df = pd.DataFrame(computed, columns=["population", "q1", "q5"])
total_population = df["population"].sum()
weighted_q1_sum = (df["q1"] * df["population"]).sum()
q1 = weighted_q1_sum / total_population
print("df.population.sum() is %s" % total_population)
print("computed q1 is %s" % q1)
df.population.sum() is 100000
computed q1 is -64.18192971579693
In [69]:
# Sanity check of list slicing: a[:k] and a[k:] partition the list at index k.
a = list(range(1, 8))
split_at = 3
b, c = a[:split_at], a[split_at:]
print("b is %s and c is %s" % (b, c))
b is [1, 2, 3] and c is [4, 5, 6, 7]
In [ ]: