# coding: utf-8

# Open in Colab

# # Data Analysis with Pandas
#
# [Pandas](https://pandas.pydata.org/) is a Python library providing
# high-performance, easy-to-use data structures and data analysis tools.

# In[16]:


# Make pandas available under its conventional alias "pd"
import pandas as pd


# ## Series and DataFrames
#
# The primary data structures in *pandas* are implemented as two classes:
#
# - **`DataFrame`**, which you can think of as a relational data table, with
#   rows and named columns.
# - **`Series`**, which is a single column. A `DataFrame` contains one or more
#   `Series` and a name for each `Series`.
#
# The data frame is a commonly used abstraction for data manipulation.

# In[12]:


# Build a Series from a dict: the keys become the index labels
pd.Series(
    {
        "CAL": 38332521,
        "TEX": 26448193,
        "NY": 19651127,
    }
)


# In[24]:


# Build a DataFrame containing two Series that share the same index labels
pop = pd.Series(
    {
        "CAL": 38332521,
        "TEX": 26448193,
        "NY": 19651127,
    }
)
area = pd.Series(
    {
        "CAL": 423967,
        "TEX": 695662,
        "NY": 141297,
    }
)
pd.DataFrame(
    {
        "population": pop,
        "area": area,
    }
)


# ## Loading data
#
# Pandas is commonly used to load and analyse datasets.

# In[40]:


# Download the California housing training set (CSV) into a DataFrame
df_cal_housing = pd.read_csv(
    "https://download.mlcc.google.com/mledu-datasets/california_housing_train.csv",
    sep=",",
)

# Show the DataFrame dimensions as a (rows, columns) tuple
df_cal_housing.shape


# In[37]:


# Print a concise summary of the DataFrame
df_cal_housing.info()


# In[34]:


# Show generic descriptive statistics about the DataFrame columns
df_cal_housing.describe()


# In[35]:


# Show the first records of the DataFrame
df_cal_housing.head()


# In[26]:


# Show 10 randomly sampled records
df_cal_housing.sample(n=10)


# ## Visualizing data
#
# Leveraging Matplotlib, Pandas can easily create beautiful graphs to gain
# insights about the data.
# In[41]:


import matplotlib.pyplot as plt

# "%matplotlib inline" is an IPython magic: `get_ipython` is only injected
# into the namespace by IPython/Jupyter. When this exported notebook runs as a
# plain Python script the name does not exist, so skip the magic instead of
# aborting with a NameError before any plot is drawn.
try:
    get_ipython().run_line_magic('matplotlib', 'inline')  # noqa: F821
except NameError:
    pass

import seaborn as sns

# Apply the default seaborn theme to all subsequent Matplotlib figures
sns.set()

# Plot the distribution of values in the "housing_median_age" column as a histogram
df_cal_housing.hist('housing_median_age')


# In[33]:


# Plot the distribution of values for every column, one histogram per column
df_cal_housing.hist(figsize=(10, 8))
# Adjust subplot spacing so titles and tick labels do not overlap
plt.tight_layout()


# In[42]:


# Plot the pairwise relationships between the DataFrame columns
sns.pairplot(df_cal_housing)