#!/usr/bin/env python # coding: utf-8 # # Dataframes 10/7/2019 & 10/25/2019 # ## 1.A recap from last time # Recall we previously had talked about pandas series. Dataframes include columns that are like individual pandas series. They resemble tabular data in Microsoft Excel. Keep in mind it has a row and column index. Let's construct a Dataframe object. A common way with using a dict of equal length lists or numpy arrays. Each column can then be accessed as a series object. # In[2]: import pandas as pd states = ['Ohio', 'Ohio', 'Ohio', 'Nevada', 'Nevada'] year = [2000, 2001, 2002, 2001, 2002] pop = [1.5, 1.7, 3.6, 2.4, 2.9] data = {'States': states, 'Year': year, 'Population': pop} dataframe = pd.DataFrame(data) dataframe #May have to put print(dataframe) in your text editor # In[5]: #Specify columns, index dataframe2 = pd.DataFrame(data, columns = ['States', 'Population', 'Year'], index = ['one', 'two', 'three', 'four', 'five']) dataframe2 # Selecting columns will return a series object. Use the syntax below: # In[9]: dataframe2['Population'] #dataframe2[['Population']] <- this turns into a dataframe object # With this in mind, we can create new columns in a similar matter: # In[7]: dataframe2['West'] = ['False', 'False', 'False', 'True', 'True'] dataframe2 # ## 2.Indexing # Sometimes, we may choose to retrieve one or more columns and possibly assign it to some variable. Most of the time, we can select pieces of the dataframe and analyze them during exploratory data analysis, the phase before machine learning. # In[54]: salaries = pd.read_csv("Position_Salaries.csv") # In[11]: salaries.columns # < - Printing columns # Retrieve the level and salary columns # In[12]: salaries[['Level', 'Salary']] # Print first five rows # In[13]: salaries[:5] # ### 2a.Boolean Indexing # In[18]: salaries[salaries['Salary'] > 100000] # In[21]: salaries['Salary'][salaries['Salary'] > 100000] # ### 2b. iloc/loc # In[25]: salaries.iloc[0] #prints first row, returns series object # In[28]: (salaries.iloc[:, [0,1]]) #prints all rows, first two columns; We choose column 1 and column 2 (indexed at 0 & 1) # In[30]: salaries.iloc[[4,5,6], :] #prints 4th, 5th, 6th rows, all columns # In[56]: #Slightly bit more complicated: import numpy as np salaries.iloc[np.r_[1:3, 6:8, 8], np.r_[0:1, 2:3]] #last index is exclusive unless specifically stated # In[55]: salaries.iloc[4, 1] = 88888 # Change individual value salaries # LOC is fairly similar; instead of numbers, use the row/column names. for this specific example, let's change the index of the dataframe to the positions # In[57]: salaries.set_index(['Position'], inplace = True) salaries # In[59]: salaries.loc[['Business Analyst', 'Junior Consultant'], :] #prints first two rows, all columns # In[60]: salaries.loc[:, ['Level', 'Salary']] #prints all rows, second/third column # We use the 'any' function to determine if any of the values in the Salary column contain values less than $10,000. # In[61]: (salaries.Salary < 10000).values.any() # ## Basic Arithmetic # In[62]: # Let's reset the index: # multiply the salaries by 2. salaries.reset_index(inplace = True) salaries['Salary'] = salaries['Salary'] * 2 salaries # You can use other operations when dealing and maniuplating with individual columns. # ## Filling Misisng Data # We will be importing the wine dataset. This dataset contains null values (missing values). 5 columns contain some missing data, while 22 rows contain some missing data. # In[11]: wine = pd.read_csv("winequality-red.csv") wine.isnull().any(axis = 0) #add the .sum operator # In[4]: #Check the first few rows of the data frame. wine.head() # In[ ]: #wine.dropna(axis=0, inplace = True) #wine.dropna(axis = 1, inplace = True) # Sometimes, we may want to fill in the missing data with a specific value. Keep in mind that dropping null values or replacing them with a particular value may ultimately affect the accuracy of your model. It's important to utilize certain strategies on how to deal with null values depending on the context of the problem. This is referred to as data imputation. Here is an article on some approaches used: https://www.theanalysisfactor.com/seven-ways-to-make-up-data-common-methods-to-imputing-missing-data/ # In[67]: #Fill all missing data with zeros wine.fillna(0, inplace = True) wine.isnull().any(axis=0) # We can fill columns with missing data by specific values corresponding to their column. # In[13]: wine.fillna({wine.columns[2]:0, wine.columns[4]:2}, inplace = True) #Columns 2 and 4 values have been replaced. {name of dataframe}.columns provides a list of column names wine.isnull().any(axis = 0) # In[70]: wine.head()