In [1]:
import pandas as pd
In [2]:
journal = pd.DataFrame([['Alice', 'Algebra', 4],
                        ['Bob', 'Algebra', 5],
                        ['Alice', 'Calculus', 3],
                        ['Bob', 'Calculus', 2]],
                       columns=['student', 'course', 'grade'])
In [13]:
journal
Out[13]:
student course grade
0 Alice Algebra 4
1 Bob Algebra 5
2 Alice Calculus 3
3 Bob Calculus 2
In [10]:
grades = journal.pivot('student', 'course', 'grade')
In [11]:
grades['final_grade'] = 0.6 * grades['Algebra'] + 0.4 * grades['Calculus']
In [12]:
grades
Out[12]:
course Algebra Calculus final_grade
student
Alice 4 3 3.6
Bob 5 2 3.8
In [14]:
journal = pd.DataFrame([['Alice', 'Algebra', 4],
                        ['Bob', 'Algebra', 5],
                        ['Alice', 'Calculus', 3],
                        ['Bob', 'Calculus', 2],
                        ['Alice', 'Calculus', 4]],
                       columns=['student', 'course', 'grade'])
In [15]:
journal
Out[15]:
student course grade
0 Alice Algebra 4
1 Bob Algebra 5
2 Alice Calculus 3
3 Bob Calculus 2
4 Alice Calculus 4
In [29]:
grades_wide = journal.pivot_table(
    'grade', 'student', 'course', aggfunc='last').reset_index()
In [30]:
grades_wide
Out[30]:
course student Algebra Calculus
0 Alice 4 4
1 Bob 5 2
In [33]:
grades_wide.melt(id_vars='student', value_name='grade')
Out[33]:
student course grade
0 Alice Algebra 4
1 Bob Algebra 5
2 Alice Calculus 4
3 Bob Calculus 2
In [36]:
students = pd.DataFrame([['Alice', 123],
                         ['Bob', 567],
                         ['Claudia', 152]],
                         columns=['name', 'id'])
In [37]:
grades = pd.DataFrame([[567, 4], 
                       [123, 5],
                       [199, 2]],
                     columns=['student', 'grade'])
In [45]:
students\
 .merge(grades, left_on='id', right_on='student')\
 .drop("student", axis=1)
Out[45]:
name id grade
0 Alice 123 5
1 Bob 567 4
In [42]:
(students
 .merge(grades, left_on='id', right_on='student')
 .drop("student", axis=1))
Out[42]:
name id grade
0 Alice 123 5
1 Bob 567 4
In [46]:
students.merge(grades, left_on='id', right_on='student',
               how='left')
Out[46]:
name id student grade
0 Alice 123 123.0 5.0
1 Bob 567 567.0 4.0
2 Claudia 152 NaN NaN
In [47]:
students.merge(grades, left_on='id', right_on='student',
               how='right')
Out[47]:
name id student grade
0 Alice 123.0 123 5
1 Bob 567.0 567 4
2 NaN NaN 199 2
In [48]:
students.merge(grades, left_on='id', right_on='student',
               how='outer')
Out[48]:
name id student grade
0 Alice 123.0 123.0 5.0
1 Bob 567.0 567.0 4.0
2 Claudia 152.0 NaN NaN
3 NaN NaN 199.0 2.0
In [53]:
grades.set_index('student')
Out[53]:
grade
student
567 4
123 5
199 2
In [54]:
students
Out[54]:
name id
0 Alice 123
1 Bob 567
2 Claudia 152
In [56]:
students.join(grades.set_index('student'), on='id')
Out[56]:
name id grade
0 Alice 123 5.0
1 Bob 567 4.0
2 Claudia 152 NaN
In [57]:
a = pd.DataFrame([[1, 2],
                  [3, 4]],
                 index=[0, 10],
                 columns=['x', 'y'])
b = pd.DataFrame([[10, 20],
                  [30, 40]],
                 index=[0, 15],
                 columns=['x', 'z'])
In [67]:
pd.concat([a, b], sort=False)
Out[67]:
x y z
0 1 2.0 NaN
10 3 4.0 NaN
0 10 NaN 20.0
15 30 NaN 40.0
In [68]:
b.rename(columns={'z': 'y'})
Out[68]:
x y
0 10 20
15 30 40
In [65]:
pd.concat([a, b.rename(columns={'z': 'y'})], sort=False)
Out[65]:
x y
0 1 2
10 3 4
0 10 20
15 30 40
In [59]:
b
Out[59]:
x z
0 10 20
15 30 40
In [70]:
pd.concat([a, b], sort=False, axis=1)
Out[70]:
x y x z
0 1.0 2.0 10.0 20.0
10 3.0 4.0 NaN NaN
15 NaN NaN 30.0 40.0
In [71]:
electors = pd.read_csv("https://raw.githubusercontent.com/ischurov/dj-prog/master/elections_usa_2016/electors.csv")
In [74]:
results = pd.read_csv("https://raw.githubusercontent.com/ischurov/dj-prog/master/elections_usa_2016/results.csv")
In [80]:
(results
 .merge(electors, left_on='State', right_on='state')
 .drop('state', axis=1)
 .reindex(['State', 'electors'] + list(results.columns[1:]), axis=1)
)
Out[80]:
State electors Clinton Trump Johnson Stein Castle McMullin
0 Florida 29 0.216434 0.618447 0.044573 0.028206 0.060405 0.031935
1 Connecticut 7 0.335210 0.376658 0.078704 0.043349 0.082353 0.083727
2 Georgia 16 0.478620 0.349473 0.050898 0.022348 0.047795 0.050867
3 Texas 38 0.410668 0.481318 0.021310 0.037528 0.025420 0.023756
4 Vermont 3 0.535389 0.282250 0.056555 0.049578 0.036442 0.039785
5 New Mexico 5 0.415840 0.476644 0.026709 0.039428 0.023617 0.017762
6 Illinois 20 0.474358 0.357438 0.051357 0.048942 0.026575 0.041328
7 Kentucky 8 0.471646 0.401216 0.032773 0.043606 0.025530 0.025228
8 Iowa 6 0.236087 0.554566 0.055705 0.055484 0.044951 0.053208
9 Alaska 3 0.266377 0.558781 0.027315 0.045619 0.047453 0.054456
10 New York 29 0.574767 0.260585 0.037644 0.050695 0.038890 0.037420
11 Massachusetts 11 0.410697 0.382163 0.054659 0.052399 0.047773 0.052310
12 Arkansas 6 0.314170 0.507609 0.027165 0.031464 0.052778 0.066814
13 Missouri 10 0.376060 0.495095 0.040536 0.031930 0.032593 0.023787
14 Kansas 6 0.521597 0.274009 0.028772 0.063021 0.061055 0.051545
15 Idaho 4 0.446396 0.402379 0.047877 0.039520 0.024004 0.039823
16 Wisconsin 10 0.465110 0.384974 0.048064 0.045944 0.018904 0.037004
17 Mississippi 6 0.484679 0.399645 0.031280 0.040009 0.025287 0.019099
18 Washington 12 0.453101 0.295005 0.069243 0.072575 0.046492 0.063584
19 Oklahoma 7 0.415016 0.460028 0.035717 0.047230 0.020983 0.021025
20 California 55 0.361255 0.474504 0.026585 0.048928 0.049233 0.039494
21 South Carolina 9 0.284012 0.563711 0.025350 0.033313 0.057033 0.036582
22 Hawaii 4 0.340205 0.495558 0.047842 0.046553 0.022075 0.047767
23 Maryland 10 0.330668 0.454763 0.060220 0.062474 0.061841 0.030034
24 Arizona 11 0.402962 0.461755 0.032080 0.028232 0.040466 0.034505
25 Montana 3 0.301029 0.467710 0.065851 0.064865 0.059654 0.040891
26 Ohio 18 0.408565 0.442375 0.036774 0.044167 0.037256 0.030862
27 Oregon 7 0.507037 0.326198 0.033469 0.031248 0.060057 0.041991
28 Rhode Island 4 0.480796 0.375042 0.032021 0.035813 0.035697 0.040632
29 South Dakota 3 0.442753 0.405675 0.033403 0.029012 0.041692 0.047466
30 Alabama 9 0.502080 0.331939 0.025574 0.053111 0.035491 0.051805
31 North Dakota 3 0.351798 0.518499 0.022923 0.038819 0.041381 0.026581
32 Virginia 13 0.398891 0.443718 0.029539 0.046741 0.039699 0.041412
33 New Jersey 14 0.521112 0.310993 0.051805 0.054116 0.034991 0.026983
34 Wyoming 3 0.367787 0.470639 0.025698 0.064606 0.024209 0.047061
35 Maine 4 0.419599 0.468375 0.023796 0.026384 0.029827 0.032019
36 D.C. 3 0.435301 0.377277 0.039180 0.038634 0.049569 0.060040
37 Tennessee 11 0.518468 0.312364 0.041535 0.039084 0.048283 0.040266
38 Pennsylvania 20 0.368591 0.462395 0.031488 0.068597 0.035429 0.033501
39 Nebraska 5 0.389383 0.441899 0.050905 0.043413 0.036925 0.037475
40 Delaware 3 0.409732 0.460836 0.029507 0.021225 0.032677 0.046022
41 Michigan 16 0.459701 0.357738 0.050462 0.034790 0.056483 0.040826
42 New Hampshire 4 0.364389 0.473694 0.038713 0.036334 0.047538 0.039332
43 Indiana 11 0.508193 0.263173 0.059571 0.062370 0.062833 0.043860
44 North Carolina 15 0.425924 0.422739 0.031383 0.030405 0.054823 0.034726
45 Colorado 9 0.470725 0.365023 0.053773 0.045958 0.020434 0.044088
46 West Virginia 5 0.383755 0.462127 0.027529 0.045644 0.027393 0.053553
47 Utah 6 0.449015 0.406636 0.031201 0.048796 0.018637 0.045715
48 Minnesota 10 0.334120 0.499643 0.063750 0.036953 0.041495 0.024039
49 Louisiana 8 0.414652 0.425511 0.030315 0.058871 0.026953 0.043696
50 Nevada 6 0.442711 0.331313 0.062424 0.059264 0.052769 0.051520
In [81]:
df = pd.DataFrame([[1, 2], [3, 4]], columns=['x', 'y'])
In [89]:
df.reindex(['y', 'x', 'z'], axis=1)
Out[89]:
y x z
0 2 1 NaN
1 4 3 NaN
In [98]:
tables = pd.read_html("https://bit.ly/2FfPmiX", header=0)
In [99]:
type(tables)
Out[99]:
list
In [100]:
len(tables)
Out[100]:
1
In [115]:
df = (tables[0][['DATE_OBS', 'TMPMAX', 'TMPMIN', 'TMPMN']]
      .dropna().loc[366:])
In [117]:
%matplotlib inline
In [131]:
df[:3650]['TMPMN'].plot(label='tmpmn', legend=True)
df[:3650]['TMPMN'].rolling(window=365).mean().plot(label='rolling mean', 
                                                  legend=True,
                                                  lw=4)
Out[131]:
<matplotlib.axes._subplots.AxesSubplot at 0x1296829e8>
In [132]:
df['TMPMN'].rolling(window=3650).mean().plot(label='rolling mean', 
                                                  legend=True,
                                                  lw=4)
Out[132]:
<matplotlib.axes._subplots.AxesSubplot at 0x1296820b8>
In [136]:
df.index = pd.to_datetime(df['DATE_OBS'])
In [139]:
df.drop('DATE_OBS', axis=1, inplace=True)
In [141]:
df['TMPMN'].rolling(window=3650).mean().plot(label='rolling mean', 
                                                  legend=True,
                                                  lw=4)
Out[141]:
<matplotlib.axes._subplots.AxesSubplot at 0x125d6b5f8>
In [147]:
df.index.year
Out[147]:
Int64Index([1949, 1949, 1949, 1949, 1949, 1949, 1949, 1949, 1949, 1949,
            ...
            2006, 2006, 2006, 2006, 2006, 2006, 2006, 2006, 2006, 2006],
           dtype='int64', name='DATE_OBS', length=21121)
In [148]:
df['month'] = df.index.month
df['year'] = df.index.year
In [152]:
month_names = ['jan', 'feb', 'mar', 'apr', 'may', 'jun', 'jul', 'aug', 
               'sep', 
               'oct', 'nov', 'dec']
mean_by_months = df.pivot_table(values='TMPMN', 
                                index='year', 
                                columns='month')
mean_by_months.columns = month_names
In [158]:
mean_by_months.plot.scatter(x='apr', y='may')
Out[158]:
<matplotlib.axes._subplots.AxesSubplot at 0x125d89710>
In [162]:
import matplotlib.pyplot as plt
In [160]:
corr = mean_by_months.corr()
In [175]:
mean_by_months.plot.scatter(x='jan', y='jan')
Out[175]:
<matplotlib.axes._subplots.AxesSubplot at 0x1257c9780>
In [171]:
mean_by_months.plot.scatter(x='jan', y='feb')
Out[171]:
<matplotlib.axes._subplots.AxesSubplot at 0x1257560f0>
In [176]:
corr
Out[176]:
jan feb mar apr may jun jul aug sep oct nov dec
jan 1.000000 0.384755 0.257302 0.216484 0.047153 0.039691 0.206568 -0.120758 0.134955 0.117936 -0.142560 -0.059667
feb 0.384755 1.000000 0.379095 0.216300 0.017000 0.028284 0.252928 -0.097432 0.033740 0.071301 0.059700 -0.012276
mar 0.257302 0.379095 1.000000 0.328143 0.071455 0.177538 0.091281 0.022202 -0.081379 -0.037189 -0.031521 -0.075481
apr 0.216484 0.216300 0.328143 1.000000 0.041331 0.017182 0.159124 -0.117359 0.028134 0.037973 0.077483 0.000211
may 0.047153 0.017000 0.071455 0.041331 1.000000 -0.121644 0.028159 0.118944 0.038196 0.178622 0.175155 -0.156485
jun 0.039691 0.028284 0.177538 0.017182 -0.121644 1.000000 0.306204 0.093280 -0.060803 0.111169 -0.370548 0.150160
jul 0.206568 0.252928 0.091281 0.159124 0.028159 0.306204 1.000000 0.411425 0.131770 0.132230 0.000195 0.053797
aug -0.120758 -0.097432 0.022202 -0.117359 0.118944 0.093280 0.411425 1.000000 0.222137 0.179547 0.123741 0.002483
sep 0.134955 0.033740 -0.081379 0.028134 0.038196 -0.060803 0.131770 0.222137 1.000000 0.277824 0.077645 0.054884
oct 0.117936 0.071301 -0.037189 0.037973 0.178622 0.111169 0.132230 0.179547 0.277824 1.000000 0.146520 -0.031469
nov -0.142560 0.059700 -0.031521 0.077483 0.175155 -0.370548 0.000195 0.123741 0.077645 0.146520 1.000000 -0.112312
dec -0.059667 -0.012276 -0.075481 0.000211 -0.156485 0.150160 0.053797 0.002483 0.054884 -0.031469 -0.112312 1.000000
In [174]:
plt.imshow(corr, cmap='seismic', vmin=-1, vmax=1)
plt.colorbar()
plt.xticks(range(12), month_names)
plt.yticks(range(12), month_names);
In [ ]: