19 Tableaux et analyse de données

19.1 Tableau unidimensionnel de données

In [2]:
from sympy import isprime
L = [i for i in range(100) if isprime(i)]
L
Out[2]:
[2,
 3,
 5,
 7,
 11,
 13,
 17,
 19,
 23,
 29,
 31,
 37,
 41,
 43,
 47,
 53,
 59,
 61,
 67,
 71,
 73,
 79,
 83,
 89,
 97]
In [3]:
from pandas import Series
s = Series(L)
s
Out[3]:
0      2
1      3
2      5
3      7
4     11
5     13
6     17
7     19
8     23
9     29
10    31
11    37
12    41
13    43
14    47
15    53
16    59
17    61
18    67
19    71
20    73
21    79
22    83
23    89
24    97
dtype: int64
In [4]:
s[13]
Out[4]:
43
In [7]:
s.argmax()
Out[7]:
24
In [8]:
s.cumsum()
Out[8]:
0        2
1        5
2       10
3       17
4       28
5       41
6       58
7       77
8      100
9      129
10     160
11     197
12     238
13     281
14     328
15     381
16     440
17     501
18     568
19     639
20     712
21     791
22     874
23     963
24    1060
dtype: int64

19.2 Afficher quelques statistiques

In [9]:
s.describe()
Out[9]:
count    25.000000
mean     42.400000
std      29.475979
min       2.000000
25%      17.000000
50%      41.000000
75%      67.000000
max      97.000000
dtype: float64
In [10]:
s.mean()
Out[10]:
42.399999999999999
In [11]:
s.std()
Out[11]:
29.475978920696313
In [12]:
s.min()
Out[12]:
2
In [13]:
def f(p):
    """Return the square of ``p`` decreased by 3."""
    squared = p ** 2
    return squared - 3
s.apply(f)
Out[13]:
0        1
1        6
2       22
3       46
4      118
5      166
6      286
7      358
8      526
9      838
10     958
11    1366
12    1678
13    1846
14    2206
15    2806
16    3478
17    3718
18    4486
19    5038
20    5326
21    6238
22    6886
23    7918
24    9406
dtype: int64

19.3 Opérations sur une série

In [16]:
s * 10000 + 45
Out[16]:
0      20045
1      30045
2      50045
3      70045
4     110045
5     130045
6     170045
7     190045
8     230045
9     290045
10    310045
11    370045
12    410045
13    430045
14    470045
15    530045
16    590045
17    610045
18    670045
19    710045
20    730045
21    790045
22    830045
23    890045
24    970045
dtype: int64
In [17]:
# Share of each value in the total. Series.sum() is the vectorised reduction;
# the builtin sum() would iterate over the series element by element.
s / s.sum()
Out[17]:
0     0.001887
1     0.002830
2     0.004717
3     0.006604
4     0.010377
5     0.012264
6     0.016038
7     0.017925
8     0.021698
9     0.027358
10    0.029245
11    0.034906
12    0.038679
13    0.040566
14    0.044340
15    0.050000
16    0.055660
17    0.057547
18    0.063208
19    0.066981
20    0.068868
21    0.074528
22    0.078302
23    0.083962
24    0.091509
dtype: float64
In [18]:
t = Series([i**3 for i in range(25)])
In [19]:
t
Out[19]:
0         0
1         1
2         8
3        27
4        64
5       125
6       216
7       343
8       512
9       729
10     1000
11     1331
12     1728
13     2197
14     2744
15     3375
16     4096
17     4913
18     5832
19     6859
20     8000
21     9261
22    10648
23    12167
24    13824
dtype: int64
In [22]:
# NOTE: both series are int64, so the power is computed in fixed 64-bit
# arithmetic. From row 3 on (7**27) the true value no longer fits, and the
# displayed -9223372036854775808 (= -2**63) is an overflow artefact, not the
# real result.
s ** t
Out[22]:
0                       1
1                       3
2                  390625
3    -9223372036854775808
4    -9223372036854775808
5    -9223372036854775808
6    -9223372036854775808
7    -9223372036854775808
8    -9223372036854775808
9    -9223372036854775808
10   -9223372036854775808
11   -9223372036854775808
12   -9223372036854775808
13   -9223372036854775808
14   -9223372036854775808
15   -9223372036854775808
16   -9223372036854775808
17   -9223372036854775808
18   -9223372036854775808
19   -9223372036854775808
20   -9223372036854775808
21   -9223372036854775808
22   -9223372036854775808
23   -9223372036854775808
24   -9223372036854775808
dtype: int64

19.4 Concaténation de deux séries

In [25]:
from pandas import concat
# axis=1 places the two series side by side as columns (one row per index
# label); the series carry no name, so the columns are labelled 0 and 1.
df = concat([s,t], axis=1)
df
Out[25]:
0 1
0 2 0
1 3 1
2 5 8
3 7 27
4 11 64
5 13 125
6 17 216
7 19 343
8 23 512
9 29 729
10 31 1000
11 37 1331
12 41 1728
13 43 2197
14 47 2744
15 53 3375
16 59 4096
17 61 4913
18 67 5832
19 71 6859
20 73 8000
21 79 9261
22 83 10648
23 89 12167
24 97 13824
In [26]:
type(df)
Out[26]:
pandas.core.frame.DataFrame

19.5 Tableau bidimensionnel de données

In [34]:
import pandas as pa
# Build the same two-column table from a dict so that the columns carry
# explicit names; `columns=` fixes their left-to-right order.
d = {'nb premiers':s, 'cubes':t}
df = pa.DataFrame(d, columns=['nb premiers', 'cubes'])
In [35]:
df
Out[35]:
nb premiers cubes
0 2 0
1 3 1
2 5 8
3 7 27
4 11 64
5 13 125
6 17 216
7 19 343
8 23 512
9 29 729
10 31 1000
11 37 1331
12 41 1728
13 43 2197
14 47 2744
15 53 3375
16 59 4096
17 61 4913
18 67 5832
19 71 6859
20 73 8000
21 79 9261
22 83 10648
23 89 12167
24 97 13824
In [37]:
df.describe()
Out[37]:
nb premiers cubes
count 25.000000 25.000000
mean 42.400000 3600.000000
std 29.475979 4236.452427
min 2.000000 0.000000
25% 17.000000 216.000000
50% 41.000000 1728.000000
75% 67.000000 5832.000000
max 97.000000 13824.000000

19.6 Accéder à une colonne d'un tableau

In [36]:
df['cubes']
Out[36]:
0         0
1         1
2         8
3        27
4        64
5       125
6       216
7       343
8       512
9       729
10     1000
11     1331
12     1728
13     2197
14     2744
15     3375
16     4096
17     4913
18     5832
19     6859
20     8000
21     9261
22    10648
23    12167
24    13824
Name: cubes, dtype: int64

19.7 Afficher les premières/dernières lignes

In [38]:
L = [isprime(i) for i in range(10000)]
In [39]:
L[:10]
Out[39]:
[False, False, True, True, False, True, False, True, False, False]
In [40]:
# Materialise the result as a list: under Python 3, map() returns a lazy
# iterator that cannot be sliced (L[:10]); under Python 2 this is unchanged.
L = list(map(isprime, range(10000)))
In [43]:
L[:10]
Out[43]:
[False, False, True, True, False, True, False, True, False, False]
In [44]:
s = Series(L)
In [45]:
# Running total of the booleans in s: entry n is the number of primes <= n
# (stored below as the 'pi_x' column).
t = s.cumsum()
In [49]:
df = pa.DataFrame()
In [50]:
df['isprime'] = s
In [51]:
df['pi_x'] = t
In [53]:
df.head()
Out[53]:
isprime pi_x
0 False 0
1 False 0
2 True 1
3 True 2
4 False 2
In [55]:
df.tail(8)
Out[55]:
isprime pi_x
9992 False 1229
9993 False 1229
9994 False 1229
9995 False 1229
9996 False 1229
9997 False 1229
9998 False 1229
9999 False 1229

19.8 Sous-tableau

In [56]:
df[500:520]
Out[56]:
isprime pi_x
500 False 95
501 False 95
502 False 95
503 True 96
504 False 96
505 False 96
506 False 96
507 False 96
508 False 96
509 True 97
510 False 97
511 False 97
512 False 97
513 False 97
514 False 97
515 False 97
516 False 97
517 False 97
518 False 97
519 False 97

19.9 Ajouter une colonne dans un tableau

In [57]:
from math import log   
In [58]:
10000 / log(10000)
Out[58]:
1085.7362047581294
In [62]:
from math import sqrt
12.29*sqrt(10000)
Out[62]:
1229.0
In [63]:
def x_sur_log_x(x):
    """Return ``x / log(x)`` (natural logarithm) when ``x > 1``, else ``None``.

    The guard excludes x == 1 (where log(x) is 0) and x <= 0 (where log is
    undefined), so no division by zero or math domain error can occur.
    """
    if not x > 1:
        return None
    return x / log(x)
In [64]:
# Two candidate approximations of pi_x, evaluated for every x in 0..9999:
#   gauss: x / log(x) (None, displayed as NaN, for x <= 1)
#   nous:  12.29 * sqrt(x); the constant is such that
#          12.29 * sqrt(10000) == 1229.0, the last value of pi_x in the table
X = Series(range(10000))
gauss = X.apply(x_sur_log_x)
nous = X.apply(lambda x:12.29*sqrt(x))
In [65]:
df['x_logx'] = gauss
df['nous'] = nous
In [66]:
df.head()
Out[66]:
isprime pi_x x_logx nous
0 False 0 NaN 0.000000
1 False 0 NaN 12.290000
2 True 1 2.885390 17.380685
3 True 2 2.730718 21.286904
4 False 2 2.885390 24.580000

19.10 Visualiser les données

In [67]:
%matplotlib inline
In [68]:
df.plot()
Out[68]:
<matplotlib.axes._subplots.AxesSubplot at 0x19d80a3d0>
In [70]:
# Remove the 'nous' column from df in place (it is absent from every later
# display of df), then plot only the first 100 rows.
del df['nous']
df[:100].plot()
Out[70]:
<matplotlib.axes._subplots.AxesSubplot at 0x19d98a450>
In [79]:
from sympy import Li
# Add sympy's Li evaluated numerically (.n()) at each integer x in 0..9999 and
# stored as float64. In the output below Li(2) is 0 and Li(1) is -inf.
df['Li_x'] = Series([Li(x).n() for x in range(10000)], dtype='float64')
In [80]:
df.head()
Out[80]:
isprime pi_x x_logx Li_x
0 False 0 NaN -1.045164
1 False 0 NaN -inf
2 True 1 2.885390 0.000000
3 True 2 2.730718 1.118425
4 False 2 2.885390 1.922421
In [81]:
df.Li_x
Out[81]:
0         -1.045164
1              -inf
2          0.000000
3          1.118425
4          1.922421
5          2.589425
6          3.177059
7          3.711888
8          4.208555
9          4.676074
10         5.120436
11         5.545845
12         5.955384
13         6.351384
14         6.735662
15         7.109661
16         7.474553
17         7.831301
18         8.180711
19         8.523462
20         8.860136
21         9.191234
22         9.517189
23         9.838383
24        10.155152
25        10.467793
26        10.776570
27        11.081723
28        11.383464
29        11.681988
           ...     
9970    1241.834312
9971    1241.942921
9972    1242.051528
9973    1242.160134
9974    1242.268739
9975    1242.377343
9976    1242.485945
9977    1242.594547
9978    1242.703147
9979    1242.811746
9980    1242.920344
9981    1243.028940
9982    1243.137536
9983    1243.246130
9984    1243.354723
9985    1243.463315
9986    1243.571906
9987    1243.680495
9988    1243.789084
9989    1243.897671
9990    1244.006257
9991    1244.114842
9992    1244.223425
9993    1244.332008
9994    1244.440589
9995    1244.549169
9996    1244.657748
9997    1244.766326
9998    1244.874903
9999    1244.983478
Name: Li_x, dtype: float64
In [84]:
df.plot()
Out[84]:
<matplotlib.axes._subplots.AxesSubplot at 0x19def5250>

19.11 Exporter des données

In [85]:
from pandas import ExcelWriter
# Write df to the sheet 'Feuille 1' of tableau.xlsx. The context manager saves
# and closes the workbook on exit, replacing the explicit writer.save() call
# (deprecated in pandas 1.5, removed in pandas 2.0). The sheet name is passed
# by keyword because positional use is deprecated in recent pandas.
with ExcelWriter('tableau.xlsx') as writer:
    df.to_excel(writer, sheet_name='Feuille 1')
In [87]:
df.to_csv('tableau.csv')
In [88]:
ls
16-02-01.ipynb  16-02-16.ipynb  16-03-22.ipynb  16-04-19.ipynb  tableau.csv
16-02-03.ipynb  16-02-23.ipynb  16-04-12.ipynb  16-05-03.ipynb  tableau.xlsx
In [89]:
!head tableau.csv
,isprime,pi_x,x_logx,Li_x
0,False,0,,-1.04516378012
1,False,0,,-inf
2,True,1,2.88539008178,0.0
3,True,2,2.73071767988,1.11842481455
4,False,2,2.88539008178,1.92242131492
5,True,3,3.1066746728,2.58942452992
6,False,3,3.34866375931,3.17705861042
7,True,4,3.59728839659,3.71188798588
8,False,4,3.8471867757,4.20855451944

19.12 Importer des données

In [91]:
# The first column of tableau.xlsx holds the index written by df.to_excel();
# index_col=0 restores it as the index instead of loading it as an extra
# unnamed data column.
df2 = pa.read_excel('tableau.xlsx', index_col=0)
In [92]:
df2.head()
Out[92]:
isprime pi_x x_logx Li_x
0 False 0 NaN -1.045164
1 False 0 NaN -inf
2 True 1 2.885390 0.000000
3 True 2 2.730718 1.118425
4 False 2 2.885390 1.922421
In [ ]:
# Bare reference to the function: nothing is read here. To load the CSV file
# written earlier, call it, e.g. pa.read_csv('tableau.csv', index_col=0).
pa.read_csv

19.13 Exemple: analyser des données de data.gov.be

In [ ]: