# # Data Analysis with Pandas
# [Pandas](https://pandas.pydata.org/) is a Python library providing high-performance, easy-to-use data structures and data analysis tools.
# Import the pandas package under the alias "pd"
import pandas as pd
# ## Series and DataFrames
# The primary data structures in *pandas* are implemented as two classes:
# - **`DataFrame`**, which you can imagine as a relational data table, with rows and named columns.
# - **`Series`**, which is a single column. A `DataFrame` contains one or more `Series` and a name for each `Series`.
# The data frame is a commonly used abstraction for data manipulation.
# Create a Series object
# Values and their index labels are passed as two parallel lists.
pd.Series(
    data=[38332521, 26448193, 19651127],
    index=['CAL', 'TEX', 'NY'],
)
# Create a DataFrame object containing two Series
# Both Series are labelled with the same state codes, in the same order.
_state_codes = ['CAL', 'TEX', 'NY']
pop = pd.Series([38332521, 26448193, 19651127], index=_state_codes)
area = pd.Series([423967, 695662, 141297], index=_state_codes)
# Each keyword becomes a column name; rows are aligned on the shared index.
pd.DataFrame(dict(population=pop, area=area))
# ## Loading data
# Pandas is commonly used to load and analyse datasets.
# Load the California housing dataset into a DataFrame
CAL_HOUSING_URL = "https://download.mlcc.google.com/mledu-datasets/california_housing_train.csv"
df_cal_housing = pd.read_csv(CAL_HOUSING_URL, sep=",")
# Dimensions of the DataFrame
df_cal_housing.shape
# Print a concise summary of the DataFrame
df_cal_housing.info()
# Compute general statistics about the DataFrame columns
df_cal_housing.describe()
# Show the first five records of the DataFrame (five is the default for head())
df_cal_housing.head(n=5)
# Show 10 randomly sampled records
df_cal_housing.sample(10)
# ## Visualizing data
# Leveraging Matplotlib, Pandas can easily create beautiful graphs to gain insights about the data.
import matplotlib.pyplot as plt
# Enable inline plot rendering when running under IPython/Jupyter.
# get_ipython() is only defined inside an IPython session, so a plain
# `python` run of this exported notebook would otherwise stop here with a
# NameError. Outside IPython the magic is skipped and matplotlib keeps its
# default backend.
try:
    ipython_shell = get_ipython()
except NameError:
    ipython_shell = None
if ipython_shell is not None:
    ipython_shell.run_line_magic('matplotlib', 'inline')
import seaborn as sns
# Apply seaborn's default theme to the plots below
sns.set()
# Plot the distribution of values in the "housing_median_age" column as a histogram
df_cal_housing.hist(column='housing_median_age')
# Plot the distribution of values for all columns as histograms
df_cal_housing.hist(figsize=(10, 8))
# Adjust subplot spacing for the figure above
plt.tight_layout()
# Plot pairwise relationships between the columns of the DataFrame
sns.pairplot(data=df_cal_housing)
