import pandas as pd
import os
# Show which pandas release this notebook runs against.
# The parenthesised single-argument form prints the same text whether it is
# parsed as a Python 2 print statement or a Python 3 function call.
print(pd.__version__)
0.20.3
# Directory that holds the data files, relative to the working directory.
DATA_DIR = '../data'

# Load the iris data set from its comma-separated file.
# read_csv uses a comma as its default separator, so no `sep` is needed.
# header=0 marks the first row of the file as the header row, and `names`
# replaces the labels found there with the column names listed below.
df = pd.read_csv(
    os.path.abspath(os.path.join(DATA_DIR, 'day1/iris.csv')),
    header=0,
    names=['sepal_len', 'sepal_wid', 'petal_len', 'petal_wid', 'class']
)
df.head(5)
sepal_len | sepal_wid | petal_len | petal_wid | class | |
---|---|---|---|---|---|
0 | 5.1 | 3.5 | 1.4 | 0.2 | Iris-setosa |
1 | 4.9 | 3.0 | 1.4 | 0.2 | Iris-setosa |
2 | 4.7 | 3.2 | 1.3 | 0.2 | Iris-setosa |
3 | 4.6 | 3.1 | 1.5 | 0.2 | Iris-setosa |
4 | 5.0 | 3.6 | 1.4 | 0.2 | Iris-setosa |
# Inspect the dimensions of the dataset.
# shape is a (rows, columns) tuple: 150 rows and 5 columns here.
df.shape
(150, 5)
# Remove the 'class' column from the frame.
# `labels` is the list of names to drop, and axis='columns' (the same as
# axis=1) says that those names are column labels.
# inplace=True modifies df itself and returns None, so the result does not
# have to be assigned to another variable.
df.drop(labels=['class'], axis='columns', inplace=True)
df.head()
sepal_len | sepal_wid | petal_len | petal_wid | |
---|---|---|---|---|
0 | 5.1 | 3.5 | 1.4 | 0.2 |
1 | 4.9 | 3.0 | 1.4 | 0.2 |
2 | 4.7 | 3.2 | 1.3 | 0.2 |
3 | 4.6 | 3.1 | 1.5 | 0.2 |
4 | 5.0 | 3.6 | 1.4 | 0.2 |
# Remove the rows labelled 0 and 1.
# axis='index' (the same as axis=0, which is also drop's default) says that
# the labels refer to rows.
# inplace=True modifies df itself and returns None, so the result does not
# have to be assigned to another variable.
df.drop(labels=[0, 1], axis='index', inplace=True)
df.head()
sepal_len | sepal_wid | petal_len | petal_wid | |
---|---|---|---|---|
2 | 4.7 | 3.2 | 1.3 | 0.2 |
3 | 4.6 | 3.1 | 1.5 | 0.2 |
4 | 5.0 | 3.6 | 1.4 | 0.2 |
5 | 5.4 | 3.9 | 1.7 | 0.4 |
6 | 4.6 | 3.4 | 1.4 | 0.3 |
# Load the IMDb ratings data set straight from its URL.
# The file is comma-separated, which is read_csv's default separator, so no
# `sep` argument is needed.
df = pd.read_csv('http://bit.ly/imdbratings')
df.head(5)
star_rating | title | content_rating | genre | duration | actors_list | |
---|---|---|---|---|---|---|
0 | 9.3 | The Shawshank Redemption | R | Crime | 142 | [u'Tim Robbins', u'Morgan Freeman', u'Bob Gunt... |
1 | 9.2 | The Godfather | R | Crime | 175 | [u'Marlon Brando', u'Al Pacino', u'James Caan'] |
2 | 9.1 | The Godfather: Part II | R | Crime | 200 | [u'Al Pacino', u'Robert De Niro', u'Robert Duv... |
3 | 9.0 | The Dark Knight | PG-13 | Action | 152 | [u'Christian Bale', u'Heath Ledger', u'Aaron E... |
4 | 8.9 | Pulp Fiction | R | Crime | 154 | [u'John Travolta', u'Uma Thurman', u'Samuel L.... |
# sort_values() sorts in ascending order by default and returns a new Series
# (pass inplace=True to sort the existing object instead).
# Sort once and reuse the result for both ends of the ranking.
# print(...) with a single argument works on Python 2 and Python 3 alike.
ratings_ascending = df['star_rating'].sort_values()
print(ratings_ascending.head(5))
print(ratings_ascending.tail(5))
978 7.4 950 7.4 949 7.4 948 7.4 947 7.4 Name: star_rating, dtype: float64 6 8.9 3 9.0 2 9.1 1 9.2 0 9.3 Name: star_rating, dtype: float64
# The `ascending` parameter of sort_values() selects the sort direction;
# ascending=False puts the highest ratings first.
# Sort once and reuse the result for both ends of the ranking.
# print(...) with a single argument works on Python 2 and Python 3 alike.
ratings_descending = df['star_rating'].sort_values(ascending=False)
print(ratings_descending.head(5))
print(ratings_descending.tail(5))
0 9.3 1 9.2 2 9.1 3 9.0 6 8.9 Name: star_rating, dtype: float64 947 7.4 948 7.4 949 7.4 950 7.4 978 7.4 Name: star_rating, dtype: float64
# Sort whole rows of the frame rather than a single column.
# `by` lists the column(s) to order on -- add more names to the list to sort
# by several fields -- and ascending order is the default.
df.sort_values(by=['duration']).head()
star_rating | title | content_rating | genre | duration | actors_list | |
---|---|---|---|---|---|---|
389 | 8.0 | Freaks | UNRATED | Drama | 64 | [u'Wallace Ford', u'Leila Hyams', u'Olga Bacla... |
338 | 8.0 | Battleship Potemkin | UNRATED | History | 66 | [u'Aleksandr Antonov', u'Vladimir Barsky', u'G... |
258 | 8.1 | The Cabinet of Dr. Caligari | UNRATED | Crime | 67 | [u'Werner Krauss', u'Conrad Veidt', u'Friedric... |
293 | 8.1 | Duck Soup | PASSED | Comedy | 68 | [u'Groucho Marx', u'Harpo Marx', u'Chico Marx'] |
88 | 8.4 | The Kid | NOT RATED | Comedy | 68 | [u'Charles Chaplin', u'Edna Purviance', u'Jack... |
# Re-read the IMDb ratings data set from its URL so df is a freshly loaded frame.
# The file is comma-separated, which is read_csv's default separator, so no
# `sep` argument is needed.
df = pd.read_csv('http://bit.ly/imdbratings')
df.head(5)
star_rating | title | content_rating | genre | duration | actors_list | |
---|---|---|---|---|---|---|
0 | 9.3 | The Shawshank Redemption | R | Crime | 142 | [u'Tim Robbins', u'Morgan Freeman', u'Bob Gunt... |
1 | 9.2 | The Godfather | R | Crime | 175 | [u'Marlon Brando', u'Al Pacino', u'James Caan'] |
2 | 9.1 | The Godfather: Part II | R | Crime | 200 | [u'Al Pacino', u'Robert De Niro', u'Robert Duv... |
3 | 9.0 | The Dark Knight | PG-13 | Action | 152 | [u'Christian Bale', u'Heath Ledger', u'Aaron E... |
4 | 8.9 | Pulp Fiction | R | Crime | 154 | [u'John Travolta', u'Uma Thurman', u'Samuel L.... |
# Goal: keep only the movies rated above 8.5.
# Series.map calls the function once per rating and returns a boolean Series
# that can be used as a row mask on df.
df_rating_bools = df['star_rating'].map(lambda rating: rating > 8.5)
df.loc[df_rating_bools].tail()
star_rating | title | content_rating | genre | duration | actors_list | |
---|---|---|---|---|---|---|
35 | 8.6 | Modern Times | G | Comedy | 87 | [u'Charles Chaplin', u'Paulette Goddard', u'He... |
36 | 8.6 | Saving Private Ryan | R | Action | 169 | [u'Tom Hanks', u'Matt Damon', u'Tom Sizemore'] |
37 | 8.6 | Raiders of the Lost Ark | PG | Action | 115 | [u'Harrison Ford', u'Karen Allen', u'Paul Free... |
38 | 8.6 | Rear Window | APPROVED | Mystery | 112 | [u'James Stewart', u'Grace Kelly', u'Wendell C... |
39 | 8.6 | Psycho | R | Horror | 109 | [u'Anthony Perkins', u'Janet Leigh', u'Vera Mi... |
# Goal: keep only the movies rated above 8.5, building the mask by hand.
# One True/False entry per rating, in row order.
boolean = [rating > 8.5 for rating in df['star_rating']]
# Turn the list into a boolean Series to use as a row mask. Reuse df's own
# index: a bare pd.Series(boolean) would be labelled 0..n-1 and would only
# line up with df while df still has that default index.
df_rating_bools = pd.Series(boolean, index=df.index)
df[df_rating_bools].tail(5)
star_rating | title | content_rating | genre | duration | actors_list | |
---|---|---|---|---|---|---|
35 | 8.6 | Modern Times | G | Comedy | 87 | [u'Charles Chaplin', u'Paulette Goddard', u'He... |
36 | 8.6 | Saving Private Ryan | R | Action | 169 | [u'Tom Hanks', u'Matt Damon', u'Tom Sizemore'] |
37 | 8.6 | Raiders of the Lost Ark | PG | Action | 115 | [u'Harrison Ford', u'Karen Allen', u'Paul Free... |
38 | 8.6 | Rear Window | APPROVED | Mystery | 112 | [u'James Stewart', u'Grace Kelly', u'Wendell C... |
39 | 8.6 | Psycho | R | Horror | 109 | [u'Anthony Perkins', u'Janet Leigh', u'Vera Mi... |
# Comparing the column to a number checks every row at once and yields a
# boolean Series, True wherever the rating is above 8.5; indexing df with it
# keeps only those rows.
df.loc[df['star_rating'] > 8.5].tail()
star_rating | title | content_rating | genre | duration | actors_list | |
---|---|---|---|---|---|---|
35 | 8.6 | Modern Times | G | Comedy | 87 | [u'Charles Chaplin', u'Paulette Goddard', u'He... |
36 | 8.6 | Saving Private Ryan | R | Action | 169 | [u'Tom Hanks', u'Matt Damon', u'Tom Sizemore'] |
37 | 8.6 | Raiders of the Lost Ark | PG | Action | 115 | [u'Harrison Ford', u'Karen Allen', u'Paul Free... |
38 | 8.6 | Rear Window | APPROVED | Mystery | 112 | [u'James Stewart', u'Grace Kelly', u'Wendell C... |
39 | 8.6 | Psycho | R | Horror | 109 | [u'Anthony Perkins', u'Janet Leigh', u'Vera Mi... |
# Re-read the IMDb ratings data set from its URL so df is a freshly loaded frame.
# The file is comma-separated, which is read_csv's default separator, so no
# `sep` argument is needed.
df = pd.read_csv('http://bit.ly/imdbratings')
df.head(5)
star_rating | title | content_rating | genre | duration | actors_list | |
---|---|---|---|---|---|---|
0 | 9.3 | The Shawshank Redemption | R | Crime | 142 | [u'Tim Robbins', u'Morgan Freeman', u'Bob Gunt... |
1 | 9.2 | The Godfather | R | Crime | 175 | [u'Marlon Brando', u'Al Pacino', u'James Caan'] |
2 | 9.1 | The Godfather: Part II | R | Crime | 200 | [u'Al Pacino', u'Robert De Niro', u'Robert Duv... |
3 | 9.0 | The Dark Knight | PG-13 | Action | 152 | [u'Christian Bale', u'Heath Ledger', u'Aaron E... |
4 | 8.9 | Pulp Fiction | R | Crime | 154 | [u'John Travolta', u'Uma Thurman', u'Samuel L.... |
# Goal: movies rated above 8.5 that also run longer than 200 minutes.
# Build each condition as its own boolean mask, then combine the two with
# the element-wise AND operator (&).
is_top_rated = df['star_rating'] > 8.5
is_long_running = df['duration'] > 200
df[is_top_rated & is_long_running].head()
star_rating | title | content_rating | genre | duration | actors_list | |
---|---|---|---|---|---|---|
7 | 8.9 | The Lord of the Rings: The Return of the King | PG-13 | Adventure | 201 | [u'Elijah Wood', u'Viggo Mortensen', u'Ian McK... |
17 | 8.7 | Seven Samurai | UNRATED | Drama | 207 | [u'Toshir\xf4 Mifune', u'Takashi Shimura', u'K... |
# Movies rated above 8.5 with a duration above 200, this time building each
# mask by mapping a function over its column and AND-ing the masks together.
df_rating_bools = df['star_rating'].map(lambda rating: rating > 8.5)
df_duration_bools = df['duration'].map(lambda minutes: minutes > 200)
df.loc[df_rating_bools & df_duration_bools].tail()
star_rating | title | content_rating | genre | duration | actors_list | |
---|---|---|---|---|---|---|
7 | 8.9 | The Lord of the Rings: The Return of the King | PG-13 | Adventure | 201 | [u'Elijah Wood', u'Viggo Mortensen', u'Ian McK... |
17 | 8.7 | Seven Samurai | UNRATED | Drama | 207 | [u'Toshir\xf4 Mifune', u'Takashi Shimura', u'K... |
# isin() mirrors Python's `x in [a, b]` membership test, applied to every
# value of the column: True where the genre is 'Drama' or 'Action'.
bools = df['genre'].isin(['Drama', 'Action'])
df.loc[bools].head()
star_rating | title | content_rating | genre | duration | actors_list | |
---|---|---|---|---|---|---|
3 | 9.0 | The Dark Knight | PG-13 | Action | 152 | [u'Christian Bale', u'Heath Ledger', u'Aaron E... |
5 | 8.9 | 12 Angry Men | NOT RATED | Drama | 96 | [u'Henry Fonda', u'Lee J. Cobb', u'Martin Bals... |
9 | 8.9 | Fight Club | R | Drama | 139 | [u'Brad Pitt', u'Edward Norton', u'Helena Bonh... |
11 | 8.8 | Inception | PG-13 | Action | 148 | [u'Leonardo DiCaprio', u'Joseph Gordon-Levitt'... |
12 | 8.8 | Star Wars: Episode V - The Empire Strikes Back | PG | Action | 124 | [u'Mark Hamill', u'Harrison Ford', u'Carrie Fi... |