#!/usr/bin/env python
# coding: utf-8

# [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/fonnesbeck/Bios8366/blob/master/notebooks/Section1_2-Introduction-to-Pandas.ipynb)

# # Introduction to Pandas
#
# **pandas** is a Python package providing fast, flexible, and expressive data
# structures designed to make working with *relational* or *labeled* data both
# easy and intuitive. It is a fundamental high-level building block for doing
# practical, real world data analysis in Python.
#
# pandas is well suited for:
#
# - Tabular data with heterogeneously-typed columns, as in an SQL table or Excel spreadsheet
# - Ordered and unordered (not necessarily fixed-frequency) time series data.
# - Arbitrary matrix data (homogeneously typed or heterogeneous) with row and column labels
# - Any other form of observational / statistical data sets. The data actually
#   need not be labeled at all to be placed into a pandas data structure
#
# Key features:
#
# - Easy handling of **missing data**
# - **Size mutability**: columns can be inserted and deleted from DataFrame and higher dimensional objects
# - Automatic and explicit **data alignment**: objects can be explicitly aligned
#   to a set of labels, or the data can be aligned automatically
# - Powerful, flexible **group by functionality** to perform split-apply-combine operations on data sets
# - Intelligent label-based **slicing, fancy indexing, and subsetting** of large data sets
# - Intuitive **merging and joining** data sets
# - Flexible **reshaping and pivoting** of data sets
# - **Hierarchical labeling** of axes
# - Robust **IO tools** for loading data from flat files, Excel files, databases, and HDF5
# - **Time series functionality**: date range generation and frequency conversion,
#   moving window statistics, moving window linear regressions, date shifting and lagging, etc.
# In[ ]:
import pandas as pd
import numpy as np

# Base URL for the course data files (used by the import examples below).
DATA_PATH = 'https://raw.githubusercontent.com/fonnesbeck/Bios8366/master/data/'

# ## Pandas Data Structures

# ### Series
#
# A **Series** is a single vector of data (like a NumPy array) with an *index*
# that labels each element in the vector.

# In[ ]:
counts = pd.Series([632, 1638, 569, 115])
counts

# If an index is not specified, a default sequence of integers is assigned as
# the index. A NumPy array comprises the values of the `Series`, while the
# index is a pandas `Index` object.

# In[ ]:
counts.values

# In[ ]:
counts.index

# We can assign meaningful labels to the index, if they are available:

# In[ ]:
bacteria = pd.Series([632, 1638, 569, 115],
                     index=['Firmicutes', 'Proteobacteria', 'Actinobacteria', 'Bacteroidetes'])
bacteria

# These labels can be used to refer to the values in the `Series`.

# In[ ]:
bacteria['Actinobacteria']

# In[ ]:
bacteria[[name.endswith('bacteria') for name in bacteria.index]]

# In[ ]:
[name.endswith('bacteria') for name in bacteria.index]

# Notice that the indexing operation preserved the association between the
# values and the corresponding indices.
#
# We can still use positional indexing if we wish.

# In[ ]:
# NOTE: positional access via `bacteria[0]` on a label-indexed Series is
# deprecated (removed in pandas 3.0); `iloc` is the explicit positional indexer.
bacteria.iloc[0]

# We can give both the array of values and the index meaningful labels themselves:

# In[ ]:
bacteria.name = 'counts'
bacteria.index.name = 'phylum'
bacteria

# NumPy's math functions and other operations can be applied to Series without
# losing the data structure.

# In[ ]:
np.log(bacteria)

# We can also filter according to the values in the `Series`:

# In[ ]:
bacteria[bacteria > 1000]

# A `Series` can be thought of as an ordered key-value store. In fact, we can
# create one from a `dict`:

# In[ ]:
bacteria_dict = {'Proteobacteria': 1638, 'Actinobacteria': 569,
                 'Firmicutes': 632, 'Bacteroidetes': 115}
pd.Series(bacteria_dict)

# If we pass a custom index to `Series`, it will select the corresponding
# values from the dict, and treat indices without corresponding values as missing.
# Pandas uses the `NaN` (not a number) type for missing values.

# In[ ]:
bacteria2 = pd.Series(bacteria_dict,
                      index=['Cyanobacteria', 'Firmicutes', 'Proteobacteria', 'Actinobacteria'])
bacteria2

# In[ ]:
bacteria2.isnull()

# Critically, the labels are used to **align data** when used in operations
# with other Series objects:

# In[ ]:
bacteria + bacteria2

# Contrast this with NumPy arrays, where arrays of the same length will combine
# values element-wise; adding Series combined values with the same label in the
# resulting series. Notice also that the missing values were propagated by
# addition.

# ### DataFrame
#
# Inevitably, we want to be able to store, view and manipulate data that is
# *multivariate*, where for every index there are multiple fields or columns of
# data, often of varying data type.
#
# A `DataFrame` is a tabular data structure, encapsulating multiple series like
# columns in a spreadsheet. Data are stored internally as a 2-dimensional
# object, but the `DataFrame` allows us to represent and manipulate
# higher-dimensional data.

# In[ ]:
data = pd.DataFrame({'value': [632, 1638, 569, 115, 433, 1130, 754, 555],
                     'patient': [1, 1, 1, 1, 2, 2, 2, 2],
                     'phylum': ['Firmicutes', 'Proteobacteria', 'Actinobacteria', 'Bacteroidetes',
                                'Firmicutes', 'Proteobacteria', 'Actinobacteria', 'Bacteroidetes']})
data

# We can change the order by indexing them in the order we desire:

# In[ ]:
data[['phylum', 'value', 'patient']]

# A `DataFrame` has a second index, representing the columns:

# In[ ]:
data.columns

# The `dtypes` attribute reveals the data type for each column in our DataFrame.
# - `int64` is numeric integer values
# - `object` strings (letters and numbers)
# - `float64` floating-point values

# In[ ]:
data.dtypes

# If we wish to access columns, we can do so either by dict-like indexing or
# by attribute:

# In[ ]:
data['patient']

# In[ ]:
data.patient

# In[ ]:
type(data.value)

# In[ ]:
data[['value']]

# Notice this is different than with `Series`, where dict-like indexing
# retrieved a particular element (row).
#
# If we want access to a row in a `DataFrame`, we index its `loc` attribute.

# In[ ]:
data.loc[1]

# ### Exercise
#
# Try out these commands to see what they return:
#
# - `data.head()`
# - `data.tail(3)`
# - `data.shape`

# In[ ]:


# An alternative way of initializing a `DataFrame` is with a list of dicts:

# In[ ]:
data = pd.DataFrame([{'patient': 1, 'phylum': 'Firmicutes', 'value': 632},
                     {'patient': 1, 'phylum': 'Proteobacteria', 'value': 1638},
                     {'patient': 1, 'phylum': 'Actinobacteria', 'value': 569},
                     {'patient': 1, 'phylum': 'Bacteroidetes', 'value': 115},
                     {'patient': 2, 'phylum': 'Firmicutes', 'value': 433},
                     {'patient': 2, 'phylum': 'Proteobacteria', 'value': 1130},
                     {'patient': 2, 'phylum': 'Actinobacteria', 'value': 754},
                     {'patient': 2, 'phylum': 'Bacteroidetes', 'value': 555}])

# In[ ]:
data

# It's important to note that the Series returned when a DataFrame is indexed
# is merely a **view** on the DataFrame, and not a copy of the data itself. So
# you must be cautious when manipulating this data:
#
# NOTE(review): under pandas Copy-on-Write (the default behavior from pandas
# 3.0) the assignment below no longer writes through to `data` — confirm which
# pandas version this material targets.

# In[ ]:
vals = data.value
vals

# In[ ]:
vals[5] = 0
vals

# Now look at the original `DataFrame`!

# In[ ]:
data

# If we plan on modifying an extracted Series, it's a good idea to make a copy.
# In[ ]:
vals = data.value.copy()
vals[5] = 1000
data

# We can create or modify columns by assignment:

# In[ ]:
# `.loc` with a row list and a column label replaces the original chained
# assignment `data.value[[3, 4, 6]] = ...`, which is deprecated and silently
# stops modifying the frame under pandas Copy-on-Write.
data.loc[[3, 4, 6], 'value'] = [14, 21, 5]
data

# In[ ]:
data['year'] = 2013
data

# But note, we cannot use the attribute indexing method to add a new column:

# In[ ]:
data.treatment = 1
data

# In[ ]:
data.treatment

# ### Exercise
#
# From the `data` table above, create an index to return all rows for which
# the phylum name ends in "bacteria" and the value is greater than 1000.

# In[ ]:
data[data.phylum.str.endswith('bacteria') & (data.value > 1000)]

# Specifying a `Series` as a new column causes its values to be added
# according to the `DataFrame`'s index:

# In[ ]:
treatment = pd.Series([0] * 4 + [1] * 2)
treatment

# In[ ]:
data['treatment'] = treatment
data

# Other Python data structures (ones without an index) need to be the same
# length as the `DataFrame`:

# In[ ]:
month = ['Jan', 'Feb', 'Mar', 'Apr']
try:
    # This cell intentionally fails: 4 values cannot fill 8 rows. Caught here
    # so the script can run end-to-end outside the notebook.
    data['month'] = month
except ValueError as err:
    print(err)

# In[ ]:
data['month'] = ['Jan'] * len(data)
data

# We can use the `drop` method to remove rows or columns, which by default
# drops rows. We can be explicit by using the `axis` argument:

# In[ ]:
data.drop('month', axis=1, inplace=True)

# In[ ]:
data

# We can extract the underlying data as a simple `ndarray` by accessing the
# `values` attribute:

# In[ ]:
data.values

# Notice that because of the mix of string and integer (and `NaN`) values, the
# dtype of the array is `object`. The dtype will automatically be chosen to be
# as general as needed to accommodate all the columns.

# In[ ]:
df = pd.DataFrame({'foo': [1, 2, 3], 'bar': [0.4, -1.0, 4.5]})
df.values

# Pandas uses a custom data structure to represent the indices of Series and
# DataFrames.

# In[ ]:
data.index

# Index objects are immutable:

# In[ ]:
try:
    # This cell intentionally fails: an Index rejects item assignment. Caught
    # here so the script can run end-to-end outside the notebook.
    data.index[0] = 15
except TypeError as err:
    print(err)

# This is so that Index objects can be shared between data structures without
# fear that they will be changed.
# In[ ]:
bacteria2.index = bacteria.index

# In[ ]:
bacteria2

# ## Importing data

# A key, but often under-appreciated, step in data analysis is importing the
# data that we wish to analyze. Though it is easy to load basic data
# structures into Python using built-in tools or those provided by packages
# like NumPy, it is non-trivial to import structured data well, and to easily
# convert this input into a robust data structure:
#
#     genes = np.loadtxt("genes.csv", delimiter=",", dtype=[('gene', '|S10'), ('value', '
#
# NOTE(review): the example above is truncated in this copy of the file —
# recover the full `np.loadtxt` call from the source notebook.