import pandas as pd
import numpy as np
# Synthetic sample: 100000 draws from np.random.randn, each scaled by 50.
arr = 50 * np.random.randn(100000)
# Bare expression kept so an interactive session echoes the array.
arr
array([ 17.05715119, -61.3171255 , 46.61055087, ..., 27.6605187 , 29.83815425, -41.28542945])
# Reference value: the 0.1 quantile of the whole sample, computed in one pass
# over a single-column frame.
full_frame = pd.DataFrame(arr, columns=["value"])
ref_q1 = full_frame.quantile(0.1)["value"]
print("ref q1 is %s" % ref_q1)
ref q1 is -64.44685728671065
# Shuffle the sample, then peel consecutive slices off the front until
# nothing is left; each slice becomes one sub-group.
arr = np.random.permutation(arr)
sub_groups = []
while len(arr) > 0:
    remaining = len(arr)
    if remaining <= 1000:
        # Tail case: 1000 or fewer rows left, so take them all at once.
        num_rows = remaining
    else:
        # Otherwise pick a random group size via randint(100, 1000).
        num_rows = np.random.randint(100, 1000)
    print("will take %s rows from %s" % (num_rows, remaining))
    sub_group = arr[:num_rows]
    sub_groups.append(sub_group)
    # Drop the rows just taken so the loop advances.
    arr = arr[num_rows:]
will take 798 rows from 100000 will take 555 rows from 99202 will take 609 rows from 98647 will take 436 rows from 98038 will take 930 rows from 97602 will take 840 rows from 96672 will take 649 rows from 95832 will take 256 rows from 95183 will take 657 rows from 94927 will take 743 rows from 94270 will take 978 rows from 93527 will take 559 rows from 92549 will take 204 rows from 91990 will take 902 rows from 91786 will take 937 rows from 90884 will take 879 rows from 89947 will take 339 rows from 89068 will take 622 rows from 88729 will take 811 rows from 88107 will take 140 rows from 87296 will take 101 rows from 87156 will take 407 rows from 87055 will take 435 rows from 86648 will take 602 rows from 86213 will take 507 rows from 85611 will take 715 rows from 85104 will take 285 rows from 84389 will take 923 rows from 84104 will take 868 rows from 83181 will take 495 rows from 82313 will take 288 rows from 81818 will take 266 rows from 81530 will take 281 rows from 81264 will take 704 rows from 80983 will take 824 rows from 80279 will take 913 rows from 79455 will take 420 rows from 78542 will take 581 rows from 78122 will take 726 rows from 77541 will take 400 rows from 76815 will take 995 rows from 76415 will take 618 rows from 75420 will take 483 rows from 74802 will take 913 rows from 74319 will take 515 rows from 73406 will take 685 rows from 72891 will take 280 rows from 72206 will take 224 rows from 71926 will take 836 rows from 71702 will take 196 rows from 70866 will take 706 rows from 70670 will take 928 rows from 69964 will take 440 rows from 69036 will take 958 rows from 68596 will take 314 rows from 67638 will take 195 rows from 67324 will take 134 rows from 67129 will take 140 rows from 66995 will take 328 rows from 66855 will take 272 rows from 66527 will take 657 rows from 66255 will take 836 rows from 65598 will take 872 rows from 64762 will take 425 rows from 63890 will take 114 rows from 63465 will take 272 rows from 63351 will take 617 rows 
from 63079 will take 939 rows from 62462 will take 281 rows from 61523 will take 215 rows from 61242 will take 392 rows from 61027 will take 805 rows from 60635 will take 708 rows from 59830 will take 387 rows from 59122 will take 265 rows from 58735 will take 709 rows from 58470 will take 477 rows from 57761 will take 149 rows from 57284 will take 286 rows from 57135 will take 896 rows from 56849 will take 230 rows from 55953 will take 101 rows from 55723 will take 122 rows from 55622 will take 684 rows from 55500 will take 279 rows from 54816 will take 296 rows from 54537 will take 943 rows from 54241 will take 520 rows from 53298 will take 710 rows from 52778 will take 530 rows from 52068 will take 315 rows from 51538 will take 450 rows from 51223 will take 893 rows from 50773 will take 248 rows from 49880 will take 579 rows from 49632 will take 187 rows from 49053 will take 898 rows from 48866 will take 774 rows from 47968 will take 872 rows from 47194 will take 861 rows from 46322 will take 596 rows from 45461 will take 506 rows from 44865 will take 828 rows from 44359 will take 790 rows from 43531 will take 948 rows from 42741 will take 899 rows from 41793 will take 177 rows from 40894 will take 519 rows from 40717 will take 440 rows from 40198 will take 286 rows from 39758 will take 535 rows from 39472 will take 708 rows from 38937 will take 519 rows from 38229 will take 216 rows from 37710 will take 759 rows from 37494 will take 807 rows from 36735 will take 117 rows from 35928 will take 484 rows from 35811 will take 484 rows from 35327 will take 744 rows from 34843 will take 902 rows from 34099 will take 843 rows from 33197 will take 886 rows from 32354 will take 488 rows from 31468 will take 356 rows from 30980 will take 102 rows from 30624 will take 869 rows from 30522 will take 391 rows from 29653 will take 993 rows from 29262 will take 888 rows from 28269 will take 381 rows from 27381 will take 759 rows from 27000 will take 218 rows from 26241 will 
take 622 rows from 26023 will take 942 rows from 25401 will take 759 rows from 24459 will take 940 rows from 23700 will take 441 rows from 22760 will take 788 rows from 22319 will take 368 rows from 21531 will take 917 rows from 21163 will take 336 rows from 20246 will take 469 rows from 19910 will take 983 rows from 19441 will take 374 rows from 18458 will take 538 rows from 18084 will take 512 rows from 17546 will take 520 rows from 17034 will take 570 rows from 16514 will take 250 rows from 15944 will take 929 rows from 15694 will take 303 rows from 14765 will take 299 rows from 14462 will take 356 rows from 14163 will take 625 rows from 13807 will take 746 rows from 13182 will take 969 rows from 12436 will take 657 rows from 11467 will take 485 rows from 10810 will take 222 rows from 10325 will take 519 rows from 10103 will take 866 rows from 9584 will take 138 rows from 8718 will take 100 rows from 8580 will take 581 rows from 8480 will take 863 rows from 7899 will take 794 rows from 7036 will take 632 rows from 6242 will take 981 rows from 5610 will take 877 rows from 4629 will take 926 rows from 3752 will take 510 rows from 2826 will take 975 rows from 2316 will take 577 rows from 1341 will take 764 rows from 764
# Summarise every sub-group as [size, 0.1 quantile, 0.5 quantile].
computed = []
for group in sub_groups:
    group_frame = pd.DataFrame(group, columns=["value"])
    tenth = group_frame.quantile(0.1)["value"]
    median = group_frame.quantile(0.5)["value"]
    computed.append([len(group), tenth, median])

df = pd.DataFrame(computed, columns=["population", "q1", "q5"])

# Combine the per-group 0.1 quantiles into one figure by weighting each with
# its group size. This is an estimate built from group summaries only; it is
# not the quantile of the pooled data, so expect it to differ somewhat from
# the reference value printed as "ref q1".
total_population = df["population"].sum()
weighted_q1_sum = (df["q1"] * df["population"]).sum()
q1 = weighted_q1_sum / total_population
print("df.population.sum() is %s" % total_population)
print("computed q1 is %s" % q1)
df.population.sum() is 100000 computed q1 is -64.18192971579693
# Slicing sanity check: a[:k] and a[k:] split a list at index k with no
# element lost or duplicated.
a = [1, 2, 3, 4, 5, 6, 7]
split_at = 3
b, c = a[:split_at], a[split_at:]
print("b is %s and c is %s" % (b, c))
b is [1, 2, 3] and c is [4, 5, 6, 7]