In [2]:
import numpy as np
import pandas as pd

Reading the file into a dataframe:

In [20]:
# Location of the tab-separated populations table (columns: year, hare, lynx, carrot).
POPULATIONS_URL = "http://facweb.cs.depaul.edu/mobasher/classes/csc478/data/populations.txt"

# Download and parse the file; fields are separated by tab characters.
pop_df = pd.read_csv(POPULATIONS_URL, sep='\t')
pop_df
Out[20]:
year hare lynx carrot
0 1900 30000.0 4000.0 48300
1 1901 47200.0 6100.0 48200
2 1902 70200.0 9800.0 41500
3 1903 77400.0 35200.0 38200
4 1904 36300.0 59400.0 40600
5 1905 20600.0 41700.0 39800
6 1906 18100.0 19000.0 38600
7 1907 21400.0 13000.0 42300
8 1908 22000.0 8300.0 44500
9 1909 25400.0 9100.0 42100
10 1910 27100.0 7400.0 46000
11 1911 40300.0 8000.0 46800
12 1912 57000.0 12300.0 43800
13 1913 76600.0 19500.0 40900
14 1914 52300.0 45700.0 39400
15 1915 19500.0 51100.0 39000
16 1916 11200.0 29700.0 36700
17 1917 7600.0 15800.0 41800
18 1918 14600.0 9700.0 43300
19 1919 16200.0 10100.0 41300
20 1920 24700.0 8600.0 47300
In [21]:
# Preview only the first five rows instead of the whole table.
pop_df.head(5)
Out[21]:
year hare lynx carrot
0 1900 30000.0 4000.0 48300
1 1901 47200.0 6100.0 48200
2 1902 70200.0 9800.0 41500
3 1903 77400.0 35200.0 38200
4 1904 36300.0 59400.0 40600
In [22]:
# Column labels of the dataframe, returned as a pandas Index.
pop_df.columns
Out[22]:
Index([u'year', u'hare', u'lynx', u'carrot'], dtype='object')
In [23]:
# The table contents as a 2-D NumPy array (no row or column labels).
# The columns mix integer and float data, so the array shows every value as a float.
pop_df.values
Out[23]:
array([[  1900.,  30000.,   4000.,  48300.],
       [  1901.,  47200.,   6100.,  48200.],
       [  1902.,  70200.,   9800.,  41500.],
       [  1903.,  77400.,  35200.,  38200.],
       [  1904.,  36300.,  59400.,  40600.],
       [  1905.,  20600.,  41700.,  39800.],
       [  1906.,  18100.,  19000.,  38600.],
       [  1907.,  21400.,  13000.,  42300.],
       [  1908.,  22000.,   8300.,  44500.],
       [  1909.,  25400.,   9100.,  42100.],
       [  1910.,  27100.,   7400.,  46000.],
       [  1911.,  40300.,   8000.,  46800.],
       [  1912.,  57000.,  12300.,  43800.],
       [  1913.,  76600.,  19500.,  40900.],
       [  1914.,  52300.,  45700.,  39400.],
       [  1915.,  19500.,  51100.,  39000.],
       [  1916.,  11200.,  29700.,  36700.],
       [  1917.,   7600.,  15800.,  41800.],
       [  1918.,  14600.,   9700.,  43300.],
       [  1919.,  16200.,  10100.,  41300.],
       [  1920.,  24700.,   8600.,  47300.]])
In [24]:
# Data type of each column.
pop_df.dtypes
Out[24]:
year        int64
hare      float64
lynx      float64
carrot      int64
dtype: object

We can access columns (Pandas series) using their labels:

In [25]:
# Selecting a single column by its label yields a pandas Series
# (despite the `_df` suffix on the variable name, this is not a DataFrame).
hare_df = pop_df["hare"]
hare_df
Out[25]:
0     30000.0
1     47200.0
2     70200.0
3     77400.0
4     36300.0
5     20600.0
6     18100.0
7     21400.0
8     22000.0
9     25400.0
10    27100.0
11    40300.0
12    57000.0
13    76600.0
14    52300.0
15    19500.0
16    11200.0
17     7600.0
18    14600.0
19    16200.0
20    24700.0
Name: hare, dtype: float64

Alternatively, a column can be accessed using its label as an attribute of the dataframe:

In [26]:
# Attribute-style access: gives the same Series as pop_df["hare"]. This form only
# works when the column label is a valid Python identifier that does not clash
# with an existing DataFrame attribute or method.
pop_df.hare
Out[26]:
0     30000.0
1     47200.0
2     70200.0
3     77400.0
4     36300.0
5     20600.0
6     18100.0
7     21400.0
8     22000.0
9     25400.0
10    27100.0
11    40300.0
12    57000.0
13    76600.0
14    52300.0
15    19500.0
16    11200.0
17     7600.0
18    14600.0
19    16200.0
20    24700.0
Name: hare, dtype: float64

The usual numeric operations are available for dataframes or series:

In [27]:
# Mean of the hare column. The parenthesised single-argument print works as a
# statement on Python 2 and as a function call on Python 3; the two spaces after
# the colon reproduce the separator the original comma-style print emitted.
print("Mean Hare Population:  %s" % hare_df.mean())
Mean Hare Population:  34080.952381
In [28]:
# Column-wise mean and standard deviation of the three population columns.
# Each print takes exactly one argument so the output is identical on Python 2
# (print statement) and Python 3 (print function); the heading and the Series
# are printed separately instead of embedding "\n" in a comma-separated print.
population_cols = ["hare", "lynx", "carrot"]

print("Mean Populations: ")
print(pop_df[population_cols].mean())
print("\n")
print("Standard Deviations: ")
print(pop_df[population_cols].std())
Mean Populations: 
hare      34080.952381
lynx      20166.666667
carrot    42400.000000
dtype: float64


Standard Deviations: 
hare      21413.981859
lynx      16655.999920
carrot     3404.555771
dtype: float64

The describe() method provides a detailed description of variables:

In [29]:
# Summary statistics (count, mean, std, min, quartiles, max) for the three
# population columns only; the year column is left out.
species_cols = ["hare", "lynx", "carrot"]
pop_df[species_cols].describe()
Out[29]:
hare lynx carrot
count 21.000000 21.000000 21.000000
mean 34080.952381 20166.666667 42400.000000
std 21413.981859 16655.999920 3404.555771
min 7600.000000 4000.000000 36700.000000
25% 19500.000000 8600.000000 39800.000000
50% 25400.000000 12300.000000 41800.000000
75% 47200.000000 29700.000000 44500.000000
max 77400.000000 59400.000000 48300.000000
In [30]:
# Same summary for every column of the dataframe, including year.
pop_df.describe()
Out[30]:
year hare lynx carrot
count 21.000000 21.000000 21.000000 21.000000
mean 1910.000000 34080.952381 20166.666667 42400.000000
std 6.204837 21413.981859 16655.999920 3404.555771
min 1900.000000 7600.000000 4000.000000 36700.000000
25% 1905.000000 19500.000000 8600.000000 39800.000000
50% 1910.000000 25400.000000 12300.000000 41800.000000
75% 1915.000000 47200.000000 29700.000000 44500.000000
max 1920.000000 77400.000000 59400.000000 48300.000000

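A better way to do correlation analysis is the dataframe's corr() method, which returns the pairwise correlation matrix of the selected columns: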

In [31]:
# Pairwise correlation matrix of the three population columns.
species_cols = ["hare", "lynx", "carrot"]
pop_df[species_cols].corr()
Out[31]:
hare lynx carrot
hare 1.000000 0.071892 -0.016604
lynx 0.071892 1.000000 -0.680577
carrot -0.016604 -0.680577 1.000000
In [32]:
# Display the full dataframe again.
pop_df
Out[32]:
year hare lynx carrot
0 1900 30000.0 4000.0 48300
1 1901 47200.0 6100.0 48200
2 1902 70200.0 9800.0 41500
3 1903 77400.0 35200.0 38200
4 1904 36300.0 59400.0 40600
5 1905 20600.0 41700.0 39800
6 1906 18100.0 19000.0 38600
7 1907 21400.0 13000.0 42300
8 1908 22000.0 8300.0 44500
9 1909 25400.0 9100.0 42100
10 1910 27100.0 7400.0 46000
11 1911 40300.0 8000.0 46800
12 1912 57000.0 12300.0 43800
13 1913 76600.0 19500.0 40900
14 1914 52300.0 45700.0 39400
15 1915 19500.0 51100.0 39000
16 1916 11200.0 29700.0 36700
17 1917 7600.0 15800.0 41800
18 1918 14600.0 9700.0 43300
19 1919 16200.0 10100.0 41300
20 1920 24700.0 8600.0 47300

Sorting is also easy:

In [35]:
# Rows ordered by ascending hare population. The result is a new dataframe that
# keeps the original index labels; pop_df itself is not reordered.
pop_df.sort_values(by=['hare'])
Out[35]:
year hare lynx carrot
17 1917 7600.0 15800.0 41800
16 1916 11200.0 29700.0 36700
18 1918 14600.0 9700.0 43300
19 1919 16200.0 10100.0 41300
6 1906 18100.0 19000.0 38600
15 1915 19500.0 51100.0 39000
5 1905 20600.0 41700.0 39800
7 1907 21400.0 13000.0 42300
8 1908 22000.0 8300.0 44500
20 1920 24700.0 8600.0 47300
9 1909 25400.0 9100.0 42100
10 1910 27100.0 7400.0 46000
0 1900 30000.0 4000.0 48300
4 1904 36300.0 59400.0 40600
11 1911 40300.0 8000.0 46800
1 1901 47200.0 6100.0 48200
14 1914 52300.0 45700.0 39400
12 1912 57000.0 12300.0 43800
2 1902 70200.0 9800.0 41500
13 1913 76600.0 19500.0 40900
3 1903 77400.0 35200.0 38200

More examples of accessing and manipulating data in dataframes:

In [37]:
# Find every year in which the hare population exceeded 50,000.
# Comparing a column with a scalar yields a boolean Series (one flag per row).
hare_above_50K = pop_df.hare > 50000
print(hare_above_50K)
print("\n")

# Indexing the dataframe with the boolean mask keeps only the rows flagged True.
# The filtered frame is computed once and reused for both printouts.
hare_boom_years = pop_df[hare_above_50K]
print(hare_boom_years)
print("\n")
print(hare_boom_years.year)
0     False
1     False
2      True
3      True
4     False
5     False
6     False
7     False
8     False
9     False
10    False
11    False
12     True
13     True
14     True
15    False
16    False
17    False
18    False
19    False
20    False
Name: hare, dtype: bool


    year     hare     lynx  carrot
2   1902  70200.0   9800.0   41500
3   1903  77400.0  35200.0   38200
12  1912  57000.0  12300.0   43800
13  1913  76600.0  19500.0   40900
14  1914  52300.0  45700.0   39400


2     1902
3     1903
12    1912
13    1913
14    1914
Name: year, dtype: int64
In [38]:
# Find every year in which at least one animal species (hare or lynx) exceeded 50,000.
# `|` is the element-wise OR of the two boolean masks; the parentheses are
# required because `|` binds more tightly than `>`.
above_50K = (pop_df["hare"] > 50000) | (pop_df["lynx"] > 50000)
print(pop_df[above_50K])
    year     hare     lynx  carrot
2   1902  70200.0   9800.0   41500
3   1903  77400.0  35200.0   38200
4   1904  36300.0  59400.0   40600
12  1912  57000.0  12300.0   43800
13  1913  76600.0  19500.0   40900
14  1914  52300.0  45700.0   39400
15  1915  19500.0  51100.0   39000
In [39]:
# Copy of the table without the "year" column (axis=1 means drop a column, not a row).
# drop() returns a new dataframe; pop_df keeps its year column.
pop2 = pop_df.drop("year", axis=1)
pop2
Out[39]:
hare lynx carrot
0 30000.0 4000.0 48300
1 47200.0 6100.0 48200
2 70200.0 9800.0 41500
3 77400.0 35200.0 38200
4 36300.0 59400.0 40600
5 20600.0 41700.0 39800
6 18100.0 19000.0 38600
7 21400.0 13000.0 42300
8 22000.0 8300.0 44500
9 25400.0 9100.0 42100
10 27100.0 7400.0 46000
11 40300.0 8000.0 46800
12 57000.0 12300.0 43800
13 76600.0 19500.0 40900
14 52300.0 45700.0 39400
15 19500.0 51100.0 39000
16 11200.0 29700.0 36700
17 7600.0 15800.0 41800
18 14600.0 9700.0 43300
19 16200.0 10100.0 41300
20 24700.0 8600.0 47300

When necessary, we can convert a dataframe (or a series) into a Numpy array:

In [40]:
# Convert the year-less dataframe into a plain 2-D NumPy array
# (one row per year, one column per species; labels are discarded).
poptable = np.array(pop2)
poptable
Out[40]:
array([[ 30000.,   4000.,  48300.],
       [ 47200.,   6100.,  48200.],
       [ 70200.,   9800.,  41500.],
       [ 77400.,  35200.,  38200.],
       [ 36300.,  59400.,  40600.],
       [ 20600.,  41700.,  39800.],
       [ 18100.,  19000.,  38600.],
       [ 21400.,  13000.,  42300.],
       [ 22000.,   8300.,  44500.],
       [ 25400.,   9100.,  42100.],
       [ 27100.,   7400.,  46000.],
       [ 40300.,   8000.,  46800.],
       [ 57000.,  12300.,  43800.],
       [ 76600.,  19500.,  40900.],
       [ 52300.,  45700.,  39400.],
       [ 19500.,  51100.,  39000.],
       [ 11200.,  29700.,  36700.],
       [  7600.,  15800.,  41800.],
       [ 14600.,   9700.,  43300.],
       [ 16200.,  10100.,  41300.],
       [ 24700.,   8600.,  47300.]])

Examples of basic visualization using Pandas together with Matplotlib:

In [41]:
%matplotlib inline
import matplotlib.pyplot as plt
In [42]:
# Line plot of the hare population (y) against year (x).
plt.plot(pop_df["year"], pop_df["hare"])
Out[42]:
[<matplotlib.lines.Line2D at 0xa732978>]
In [43]:
# One line per column of pop2 (hare, lynx, carrot), all plotted against year.
species_labels = ['Hares','Lynxes','Carrots']
plt.plot(pop_df["year"], pop2, label=species_labels)

# Legend entries are matched to the plotted lines in order.
plt.legend(tuple(species_labels))
plt.xlabel('Year')
plt.ylabel('Population')
plt.show()
In [44]:
# Histogram of the carrot counts using 8 bins; alpha=0.5 makes the bars
# semi-transparent so the grid remains visible behind them.
plt.hist(pop_df["carrot"], bins=8, alpha=0.5)
plt.xlabel('Carrots')
plt.ylabel('Count')
plt.title('Histogram of Carrot Populations')
# Fix the view limits as [xmin, xmax, ymin, ymax].
plt.axis([36000, 49000, 0, 6])
plt.grid(True)

Pandas has its own versatile "plot" method that can handle most types of charts:

In [45]:
# Line chart with year on the x-axis and every other column drawn as its own line.
pop_df.plot(x="year", title="Populations")
Out[45]:
<matplotlib.axes._subplots.AxesSubplot at 0xaac0198>
In [46]:
# Scatter plot of lynx population (y) against carrot count (x), one point per year.
pop_df.plot(x="carrot", y="lynx", kind="scatter")
Out[46]:
<matplotlib.axes._subplots.AxesSubplot at 0xbd68128>
In [51]:
# Side-by-side box plots of the three population columns.
# return_type='axes' makes the call return the matplotlib Axes it drew on.
species_cols = ["hare", "lynx", "carrot"]
pop_df.boxplot(column=species_cols, return_type='axes')
Out[51]:
<matplotlib.axes._subplots.AxesSubplot at 0xc351e80>
In [52]:
# Synthetic "fox" population: 21 random integers (one per row of the table),
# each drawn from the half-open range [5000, 20000).
# A fixed seed on a private RandomState makes the values reproducible across
# kernel restarts without touching NumPy's global random state.
FOX_SEED = 42
fox_col = np.random.RandomState(FOX_SEED).randint(low=5000, high=20000, size=21)
fox_col
Out[52]:
array([13337, 18638,  8665, 11701, 12877,  9977,  8070,  5090,  9776,
       15221, 12950, 12359,  6017,  7982, 14739,  5478,  8197,  6860,
       16206,  6910, 16556])
In [53]:
# Wrap the random values in a Series that shares pop_df's index so each value
# lines up with its row, then attach it as a new "fox" column.
# Note: this modifies pop_df in place, so every later use of pop_df sees "fox".
fox_series = pd.Series(fox_col, index=pop_df.index)
pop_df["fox"] = fox_series
pop_df
Out[53]:
year hare lynx carrot fox
0 1900 30000.0 4000.0 48300 13337
1 1901 47200.0 6100.0 48200 18638
2 1902 70200.0 9800.0 41500 8665
3 1903 77400.0 35200.0 38200 11701
4 1904 36300.0 59400.0 40600 12877
5 1905 20600.0 41700.0 39800 9977
6 1906 18100.0 19000.0 38600 8070
7 1907 21400.0 13000.0 42300 5090
8 1908 22000.0 8300.0 44500 9776
9 1909 25400.0 9100.0 42100 15221
10 1910 27100.0 7400.0 46000 12950
11 1911 40300.0 8000.0 46800 12359
12 1912 57000.0 12300.0 43800 6017
13 1913 76600.0 19500.0 40900 7982
14 1914 52300.0 45700.0 39400 14739
15 1915 19500.0 51100.0 39000 5478
16 1916 11200.0 29700.0 36700 8197
17 1917 7600.0 15800.0 41800 6860
18 1918 14600.0 9700.0 43300 16206
19 1919 16200.0 10100.0 41300 6910
20 1920 24700.0 8600.0 47300 16556
In [54]:
# Area chart of the fox column over the years.
pop_df.plot(x="year", y="fox", kind="area", title="Fox Population")
Out[54]:
<matplotlib.axes._subplots.AxesSubplot at 0xc5b3c50>
In [63]:
# Grid of pairwise scatter plots for the three species, with a histogram of each
# variable on the diagonal (8 bins each). marker and s are forwarded to the
# underlying scatter call.
# scatter_matrix lives in the pd.plotting namespace; the top-level
# pd.scatter_matrix alias was deprecated in pandas 0.20 and later removed.
pd.plotting.scatter_matrix(
    pop_df[["hare", "lynx", "carrot"]],
    figsize=(14, 14),
    hist_kwds={'bins': 8},
    alpha=.5,
    marker='o',
    s=50
)
Out[63]:
array([[<matplotlib.axes._subplots.AxesSubplot object at 0x000000001538C908>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x00000000155E64A8>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x00000000156EF278>],
       [<matplotlib.axes._subplots.AxesSubplot object at 0x00000000157A1240>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x000000001589BDA0>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x00000000159796A0>],
       [<matplotlib.axes._subplots.AxesSubplot object at 0x0000000015A4E198>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x0000000015B49A20>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x0000000015BFC898>]], dtype=object)
In [ ]: