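We import the pandas package with the alias pd.
We then import the Boston dataset (from R's MASS package) by reading the CSV file published in the Rdatasets collection at the URL below.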
import pandas as pd
# Read the CSV file at this URL straight into a pandas DataFrame named Boston
Boston =pd.read_csv("http://vincentarelbundock.github.io/Rdatasets/csv/MASS/Boston.csv")
Boston.head(5) # Shows the first 5 rows. R uses the function form head(object); pandas uses the method form object.head()
Unnamed: 0 | crim | zn | indus | chas | nox | rm | age | dis | rad | tax | ptratio | black | lstat | medv | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | 0.00632 | 18 | 2.31 | 0 | 0.538 | 6.575 | 65.2 | 4.0900 | 1 | 296 | 15.3 | 396.90 | 4.98 | 24.0 |
1 | 2 | 0.02731 | 0 | 7.07 | 0 | 0.469 | 6.421 | 78.9 | 4.9671 | 2 | 242 | 17.8 | 396.90 | 9.14 | 21.6 |
2 | 3 | 0.02729 | 0 | 7.07 | 0 | 0.469 | 7.185 | 61.1 | 4.9671 | 2 | 242 | 17.8 | 392.83 | 4.03 | 34.7 |
3 | 4 | 0.03237 | 0 | 2.18 | 0 | 0.458 | 6.998 | 45.8 | 6.0622 | 3 | 222 | 18.7 | 394.63 | 2.94 | 33.4 |
4 | 5 | 0.06905 | 0 | 2.18 | 0 | 0.458 | 7.147 | 54.2 | 6.0622 | 3 | 222 | 18.7 | 396.90 | 5.33 | 36.2 |
# Drop the 'Unnamed: 0' column (the row counter that came along with the CSV).
# axis=1 means "drop a column, not a row"; it is passed by keyword because
# pandas 2.0 no longer accepts it as a second positional argument.
Boston = Boston.drop('Unnamed: 0', axis=1)
Boston.dtypes # Data type of each column
crim float64 zn float64 indus float64 chas int64 nox float64 rm float64 age float64 dis float64 rad int64 tax int64 ptratio float64 black float64 lstat float64 medv float64 dtype: object
Boston.info() # Prints a summary of the DataFrame: index range, columns, non-null counts, dtypes and memory usage. The R equivalent is str()
<class 'pandas.core.frame.DataFrame'> Int64Index: 506 entries, 0 to 505 Data columns (total 14 columns): crim 506 non-null float64 zn 506 non-null float64 indus 506 non-null float64 chas 506 non-null int64 nox 506 non-null float64 rm 506 non-null float64 age 506 non-null float64 dis 506 non-null float64 rad 506 non-null int64 tax 506 non-null int64 ptratio 506 non-null float64 black 506 non-null float64 lstat 506 non-null float64 medv 506 non-null float64 dtypes: float64(11), int64(3) memory usage: 59.3 KB
type(Boston) # Gives the type of the object. The R equivalent is class()
pandas.core.frame.DataFrame
Boston.shape # Gives the (rows, columns) tuple. The R equivalent is dim()
(506, 14)
dir(Boston) # Lists every attribute and method available on Boston; the column names (crim, zn, ...) appear in the list too
['T', '_AXIS_ALIASES', '_AXIS_IALIASES', '_AXIS_LEN', '_AXIS_NAMES', '_AXIS_NUMBERS', '_AXIS_ORDERS', '_AXIS_REVERSED', '_AXIS_SLICEMAP', '__abs__', '__add__', '__and__', '__array__', '__array_wrap__', '__bool__', '__bytes__', '__class__', '__contains__', '__delattr__', '__delitem__', '__dict__', '__dir__', '__div__', '__doc__', '__eq__', '__finalize__', '__floordiv__', '__format__', '__ge__', '__getattr__', '__getattribute__', '__getitem__', '__getstate__', '__gt__', '__hash__', '__iadd__', '__imul__', '__init__', '__invert__', '__ipow__', '__isub__', '__iter__', '__itruediv__', '__le__', '__len__', '__lt__', '__mod__', '__module__', '__mul__', '__ne__', '__neg__', '__new__', '__nonzero__', '__or__', '__pow__', '__radd__', '__rand__', '__rdiv__', '__reduce__', '__reduce_ex__', '__repr__', '__rfloordiv__', '__rmod__', '__rmul__', '__ror__', '__rpow__', '__rsub__', '__rtruediv__', '__rxor__', '__setattr__', '__setitem__', '__setstate__', '__sizeof__', '__str__', '__sub__', '__subclasshook__', '__truediv__', '__unicode__', '__weakref__', '__xor__', '_accessors', '_add_numeric_operations', '_agg_by_level', '_align_frame', '_align_series', '_apply_broadcast', '_apply_empty_result', '_apply_raw', '_apply_standard', '_at', '_auto_consolidate', '_box_col_values', '_box_item_values', '_check_inplace_setting', '_check_is_chained_assignment_possible', '_check_setitem_copy', '_clear_item_cache', '_combine_const', '_combine_frame', '_combine_match_columns', '_combine_match_index', '_combine_series', '_combine_series_infer', '_compare_frame', '_compare_frame_evaluate', '_consolidate_inplace', '_construct_axes_dict', '_construct_axes_dict_for_slice', '_construct_axes_dict_from', '_construct_axes_from_arguments', '_constructor', '_constructor_expanddim', '_constructor_sliced', '_count_level', '_create_indexer', '_dir_additions', '_dir_deletions', '_ensure_valid_index', '_expand_axes', '_flex_compare_frame', '_from_arrays', '_from_axes', '_get_agg_axis', '_get_axis', 
'_get_axis_name', '_get_axis_number', '_get_axis_resolvers', '_get_block_manager_axis', '_get_bool_data', '_get_cacher', '_get_index_resolvers', '_get_item_cache', '_get_numeric_data', '_get_values', '_getitem_array', '_getitem_column', '_getitem_frame', '_getitem_multilevel', '_getitem_slice', '_iat', '_iget_item_cache', '_iloc', '_indexed_same', '_info_axis', '_info_axis_name', '_info_axis_number', '_info_repr', '_init_dict', '_init_mgr', '_init_ndarray', '_internal_names', '_internal_names_set', '_is_cached', '_is_datelike_mixed_type', '_is_mixed_type', '_is_numeric_mixed_type', '_is_view', '_ix', '_ixs', '_join_compat', '_loc', '_maybe_cache_changed', '_maybe_update_cacher', '_metadata', '_needs_reindex_multi', '_protect_consolidate', '_reduce', '_reindex_axes', '_reindex_axis', '_reindex_columns', '_reindex_index', '_reindex_multi', '_reindex_with_indexers', '_repr_fits_horizontal_', '_repr_fits_vertical_', '_repr_html_', '_reset_cache', '_sanitize_column', '_series', '_set_as_cached', '_set_axis', '_set_is_copy', '_set_item', '_setitem_array', '_setitem_frame', '_setitem_slice', '_setup_axes', '_slice', '_stat_axis', '_stat_axis_name', '_stat_axis_number', '_typ', '_unpickle_frame_compat', '_unpickle_matrix_compat', '_update_inplace', '_validate_dtype', '_xs', 'abs', 'add', 'add_prefix', 'add_suffix', 'age', 'align', 'all', 'any', 'append', 'apply', 'applymap', 'as_blocks', 'as_matrix', 'asfreq', 'assign', 'astype', 'at', 'at_time', 'axes', 'between_time', 'bfill', 'black', 'blocks', 'bool', 'boxplot', 'chas', 'clip', 'clip_lower', 'clip_upper', 'columns', 'combine', 'combineAdd', 'combineMult', 'combine_first', 'compound', 'consolidate', 'convert_objects', 'copy', 'corr', 'corrwith', 'count', 'cov', 'crim', 'cummax', 'cummin', 'cumprod', 'cumsum', 'describe', 'diff', 'dis', 'div', 'divide', 'dot', 'drop', 'drop_duplicates', 'dropna', 'dtypes', 'duplicated', 'empty', 'eq', 'equals', 'eval', 'ffill', 'fillna', 'filter', 'first', 'first_valid_index', 
'floordiv', 'from_csv', 'from_dict', 'from_items', 'from_records', 'ftypes', 'ge', 'get', 'get_dtype_counts', 'get_ftype_counts', 'get_value', 'get_values', 'groupby', 'gt', 'head', 'hist', 'iat', 'icol', 'idxmax', 'idxmin', 'iget_value', 'iloc', 'index', 'indus', 'info', 'insert', 'interpolate', 'irow', 'is_copy', 'isin', 'isnull', 'items', 'iteritems', 'iterkv', 'iterrows', 'itertuples', 'ix', 'join', 'keys', 'kurt', 'kurtosis', 'last', 'last_valid_index', 'le', 'load', 'loc', 'lookup', 'lstat', 'lt', 'mad', 'mask', 'max', 'mean', 'median', 'medv', 'memory_usage', 'merge', 'min', 'mod', 'mode', 'mul', 'multiply', 'ndim', 'ne', 'notnull', 'nox', 'pct_change', 'pipe', 'pivot', 'pivot_table', 'plot', 'pop', 'pow', 'prod', 'product', 'ptratio', 'quantile', 'query', 'rad', 'radd', 'rank', 'rdiv', 'reindex', 'reindex_axis', 'reindex_like', 'rename', 'rename_axis', 'reorder_levels', 'replace', 'resample', 'reset_index', 'rfloordiv', 'rm', 'rmod', 'rmul', 'rpow', 'rsub', 'rtruediv', 'sample', 'save', 'select', 'select_dtypes', 'sem', 'set_axis', 'set_index', 'set_value', 'shape', 'shift', 'size', 'skew', 'slice_shift', 'sort', 'sort_index', 'sortlevel', 'squeeze', 'stack', 'std', 'sub', 'subtract', 'sum', 'swapaxes', 'swaplevel', 'tail', 'take', 'tax', 'to_clipboard', 'to_csv', 'to_dense', 'to_dict', 'to_excel', 'to_gbq', 'to_hdf', 'to_html', 'to_json', 'to_latex', 'to_msgpack', 'to_panel', 'to_period', 'to_pickle', 'to_records', 'to_sparse', 'to_sql', 'to_stata', 'to_string', 'to_timestamp', 'to_wide', 'transpose', 'truediv', 'truncate', 'tshift', 'tz_convert', 'tz_localize', 'unstack', 'update', 'values', 'var', 'where', 'xs', 'zn']
Boston[0:3] # Gives the rows at positions 0, 1 and 2 (the stop value 3 is excluded). Python indexing starts at 0, while R indexing starts at 1.
crim | zn | indus | chas | nox | rm | age | dis | rad | tax | ptratio | black | lstat | medv | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0.00632 | 18 | 2.31 | 0 | 0.538 | 6.575 | 65.2 | 4.0900 | 1 | 296 | 15.3 | 396.90 | 4.98 | 24.0 |
1 | 0.02731 | 0 | 7.07 | 0 | 0.469 | 6.421 | 78.9 | 4.9671 | 2 | 242 | 17.8 | 396.90 | 9.14 | 21.6 |
2 | 0.02729 | 0 | 7.07 | 0 | 0.469 | 7.185 | 61.1 | 4.9671 | 2 | 242 | 17.8 | 392.83 | 4.03 | 34.7 |
Boston[-5:504] # A negative start counts back from the end: -5 is index 506 (total rows) - 5 = 501. The stop value 504 is excluded, so rows 501 to 503 are returned.
#Note: Python indices start at 0, so a row's index is one less than its row number in R
crim | zn | indus | chas | nox | rm | age | dis | rad | tax | ptratio | black | lstat | medv | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
501 | 0.06263 | 0 | 11.93 | 0 | 0.573 | 6.593 | 69.1 | 2.4786 | 1 | 273 | 21 | 391.99 | 9.67 | 22.4 |
502 | 0.04527 | 0 | 11.93 | 0 | 0.573 | 6.120 | 76.7 | 2.2875 | 1 | 273 | 21 | 396.90 | 9.08 | 20.6 |
503 | 0.06076 | 0 | 11.93 | 0 | 0.573 | 6.976 | 91.0 | 2.1675 | 1 | 273 | 21 | 396.90 | 5.64 | 23.9 |
Boston[['medv','rm','chas']].head() # Several columns are selected by passing a list of names, hence the double square brackets [[ ]]
medv | rm | chas | |
---|---|---|---|
0 | 24.0 | 6.575 | 0 |
1 | 21.6 | 6.421 | 0 |
2 | 34.7 | 7.185 | 0 |
3 | 33.4 | 6.998 | 0 |
4 | 36.2 | 7.147 | 0 |
# Select rows and columns together: the row labels and the column names are
# given separately, divided by a comma. .loc selects by label, so the row slice
# 3:10 includes both ends (rows 3 through 10).
# .loc replaces the older .ix indexer, which was deprecated in pandas 0.20 and
# removed in pandas 1.0.
Boston.loc[3:10, ['medv', 'rm', 'chas', 'nox']]
medv | rm | chas | nox | |
---|---|---|---|---|
3 | 33.4 | 6.998 | 0 | 0.458 |
4 | 36.2 | 7.147 | 0 | 0.458 |
5 | 28.7 | 6.430 | 0 | 0.458 |
6 | 22.9 | 6.012 | 0 | 0.524 |
7 | 27.1 | 6.172 | 0 | 0.524 |
8 | 16.5 | 5.631 | 0 | 0.524 |
9 | 18.9 | 6.004 | 0 | 0.524 |
10 | 15.0 | 6.377 | 0 | 0.524 |
Boston['medv']>49 # Comparing a column with a value gives a boolean Series: True where medv is greater than 49
0 False 1 False 2 False 3 False 4 False 5 False 6 False 7 False 8 False 9 False 10 False 11 False 12 False 13 False 14 False 15 False 16 False 17 False 18 False 19 False 20 False 21 False 22 False 23 False 24 False 25 False 26 False 27 False 28 False 29 False ... 476 False 477 False 478 False 479 False 480 False 481 False 482 False 483 False 484 False 485 False 486 False 487 False 488 False 489 False 490 False 491 False 492 False 493 False 494 False 495 False 496 False 497 False 498 False 499 False 500 False 501 False 502 False 503 False 504 False 505 False Name: medv, dtype: bool
Boston[Boston['medv']>49] # Indexing with the boolean Series keeps only the rows where it is True
crim | zn | indus | chas | nox | rm | age | dis | rad | tax | ptratio | black | lstat | medv | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
161 | 1.46336 | 0 | 19.58 | 0 | 0.6050 | 7.489 | 90.8 | 1.9709 | 5 | 403 | 14.7 | 374.43 | 1.73 | 50 |
162 | 1.83377 | 0 | 19.58 | 1 | 0.6050 | 7.802 | 98.2 | 2.0407 | 5 | 403 | 14.7 | 389.61 | 1.92 | 50 |
163 | 1.51902 | 0 | 19.58 | 1 | 0.6050 | 8.375 | 93.9 | 2.1620 | 5 | 403 | 14.7 | 388.45 | 3.32 | 50 |
166 | 2.01019 | 0 | 19.58 | 0 | 0.6050 | 7.929 | 96.2 | 2.0459 | 5 | 403 | 14.7 | 369.30 | 3.70 | 50 |
186 | 0.05602 | 0 | 2.46 | 0 | 0.4880 | 7.831 | 53.6 | 3.1992 | 3 | 193 | 17.8 | 392.63 | 4.45 | 50 |
195 | 0.01381 | 80 | 0.46 | 0 | 0.4220 | 7.875 | 32.0 | 5.6484 | 4 | 255 | 14.4 | 394.23 | 2.97 | 50 |
204 | 0.02009 | 95 | 2.68 | 0 | 0.4161 | 8.034 | 31.9 | 5.1180 | 4 | 224 | 14.7 | 390.55 | 2.88 | 50 |
225 | 0.52693 | 0 | 6.20 | 0 | 0.5040 | 8.725 | 83.0 | 2.8944 | 8 | 307 | 17.4 | 382.00 | 4.63 | 50 |
257 | 0.61154 | 20 | 3.97 | 0 | 0.6470 | 8.704 | 86.9 | 1.8010 | 5 | 264 | 13.0 | 389.70 | 5.12 | 50 |
267 | 0.57834 | 20 | 3.97 | 0 | 0.5750 | 8.297 | 67.0 | 2.4216 | 5 | 264 | 13.0 | 384.54 | 7.44 | 50 |
283 | 0.01501 | 90 | 1.21 | 1 | 0.4010 | 7.923 | 24.8 | 5.8850 | 1 | 198 | 13.6 | 395.52 | 3.16 | 50 |
368 | 4.89822 | 0 | 18.10 | 0 | 0.6310 | 4.970 | 100.0 | 1.3325 | 24 | 666 | 20.2 | 375.52 | 3.26 | 50 |
369 | 5.66998 | 0 | 18.10 | 1 | 0.6310 | 6.683 | 96.8 | 1.3567 | 24 | 666 | 20.2 | 375.33 | 3.73 | 50 |
370 | 6.53876 | 0 | 18.10 | 1 | 0.6310 | 7.016 | 97.5 | 1.2024 | 24 | 666 | 20.2 | 392.05 | 2.96 | 50 |
371 | 9.23230 | 0 | 18.10 | 0 | 0.6310 | 6.216 | 100.0 | 1.1691 | 24 | 666 | 20.2 | 366.15 | 9.53 | 50 |
372 | 8.26725 | 0 | 18.10 | 1 | 0.6680 | 5.875 | 89.6 | 1.1296 | 24 | 666 | 20.2 | 347.88 | 8.88 | 50 |
# Filter on two conditions at once by combining the boolean masks with &
# (each comparison needs its own parentheses).
# Chaining the filters instead, Boston[Boston['medv']>49][Boston['chas']==0],
# applies a full-length mask to an already filtered frame and makes pandas emit
# "UserWarning: Boolean Series key will be reindexed to match DataFrame index."
Boston[(Boston['medv'] > 49) & (Boston['chas'] == 0)]
crim | zn | indus | chas | nox | rm | age | dis | rad | tax | ptratio | black | lstat | medv | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
161 | 1.46336 | 0 | 19.58 | 0 | 0.6050 | 7.489 | 90.8 | 1.9709 | 5 | 403 | 14.7 | 374.43 | 1.73 | 50 |
166 | 2.01019 | 0 | 19.58 | 0 | 0.6050 | 7.929 | 96.2 | 2.0459 | 5 | 403 | 14.7 | 369.30 | 3.70 | 50 |
186 | 0.05602 | 0 | 2.46 | 0 | 0.4880 | 7.831 | 53.6 | 3.1992 | 3 | 193 | 17.8 | 392.63 | 4.45 | 50 |
195 | 0.01381 | 80 | 0.46 | 0 | 0.4220 | 7.875 | 32.0 | 5.6484 | 4 | 255 | 14.4 | 394.23 | 2.97 | 50 |
204 | 0.02009 | 95 | 2.68 | 0 | 0.4161 | 8.034 | 31.9 | 5.1180 | 4 | 224 | 14.7 | 390.55 | 2.88 | 50 |
225 | 0.52693 | 0 | 6.20 | 0 | 0.5040 | 8.725 | 83.0 | 2.8944 | 8 | 307 | 17.4 | 382.00 | 4.63 | 50 |
257 | 0.61154 | 20 | 3.97 | 0 | 0.6470 | 8.704 | 86.9 | 1.8010 | 5 | 264 | 13.0 | 389.70 | 5.12 | 50 |
267 | 0.57834 | 20 | 3.97 | 0 | 0.5750 | 8.297 | 67.0 | 2.4216 | 5 | 264 | 13.0 | 384.54 | 7.44 | 50 |
368 | 4.89822 | 0 | 18.10 | 0 | 0.6310 | 4.970 | 100.0 | 1.3325 | 24 | 666 | 20.2 | 375.52 | 3.26 | 50 |
371 | 9.23230 | 0 | 18.10 | 0 | 0.6310 | 6.216 | 100.0 | 1.1691 | 24 | 666 | 20.2 | 366.15 | 9.53 | 50 |
Boston.query('medv >49 and rm >8') # query() filters rows with a condition written as a string; 'and' keeps rows that satisfy both parts
crim | zn | indus | chas | nox | rm | age | dis | rad | tax | ptratio | black | lstat | medv | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
163 | 1.51902 | 0 | 19.58 | 1 | 0.6050 | 8.375 | 93.9 | 2.1620 | 5 | 403 | 14.7 | 388.45 | 3.32 | 50 |
204 | 0.02009 | 95 | 2.68 | 0 | 0.4161 | 8.034 | 31.9 | 5.1180 | 4 | 224 | 14.7 | 390.55 | 2.88 | 50 |
225 | 0.52693 | 0 | 6.20 | 0 | 0.5040 | 8.725 | 83.0 | 2.8944 | 8 | 307 | 17.4 | 382.00 | 4.63 | 50 |
257 | 0.61154 | 20 | 3.97 | 0 | 0.6470 | 8.704 | 86.9 | 1.8010 | 5 | 264 | 13.0 | 389.70 | 5.12 | 50 |
267 | 0.57834 | 20 | 3.97 | 0 | 0.5750 | 8.297 | 67.0 | 2.4216 | 5 | 264 | 13.0 | 384.54 | 7.44 | 50 |
Boston.query('medv >49 or rm <4') # 'or' keeps rows that satisfy at least one of the two parts
crim | zn | indus | chas | nox | rm | age | dis | rad | tax | ptratio | black | lstat | medv | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
161 | 1.46336 | 0 | 19.58 | 0 | 0.6050 | 7.489 | 90.8 | 1.9709 | 5 | 403 | 14.7 | 374.43 | 1.73 | 50.0 |
162 | 1.83377 | 0 | 19.58 | 1 | 0.6050 | 7.802 | 98.2 | 2.0407 | 5 | 403 | 14.7 | 389.61 | 1.92 | 50.0 |
163 | 1.51902 | 0 | 19.58 | 1 | 0.6050 | 8.375 | 93.9 | 2.1620 | 5 | 403 | 14.7 | 388.45 | 3.32 | 50.0 |
166 | 2.01019 | 0 | 19.58 | 0 | 0.6050 | 7.929 | 96.2 | 2.0459 | 5 | 403 | 14.7 | 369.30 | 3.70 | 50.0 |
186 | 0.05602 | 0 | 2.46 | 0 | 0.4880 | 7.831 | 53.6 | 3.1992 | 3 | 193 | 17.8 | 392.63 | 4.45 | 50.0 |
195 | 0.01381 | 80 | 0.46 | 0 | 0.4220 | 7.875 | 32.0 | 5.6484 | 4 | 255 | 14.4 | 394.23 | 2.97 | 50.0 |
204 | 0.02009 | 95 | 2.68 | 0 | 0.4161 | 8.034 | 31.9 | 5.1180 | 4 | 224 | 14.7 | 390.55 | 2.88 | 50.0 |
225 | 0.52693 | 0 | 6.20 | 0 | 0.5040 | 8.725 | 83.0 | 2.8944 | 8 | 307 | 17.4 | 382.00 | 4.63 | 50.0 |
257 | 0.61154 | 20 | 3.97 | 0 | 0.6470 | 8.704 | 86.9 | 1.8010 | 5 | 264 | 13.0 | 389.70 | 5.12 | 50.0 |
267 | 0.57834 | 20 | 3.97 | 0 | 0.5750 | 8.297 | 67.0 | 2.4216 | 5 | 264 | 13.0 | 384.54 | 7.44 | 50.0 |
283 | 0.01501 | 90 | 1.21 | 1 | 0.4010 | 7.923 | 24.8 | 5.8850 | 1 | 198 | 13.6 | 395.52 | 3.16 | 50.0 |
365 | 4.55587 | 0 | 18.10 | 0 | 0.7180 | 3.561 | 87.9 | 1.6132 | 24 | 666 | 20.2 | 354.70 | 7.12 | 27.5 |
367 | 13.52220 | 0 | 18.10 | 0 | 0.6310 | 3.863 | 100.0 | 1.5106 | 24 | 666 | 20.2 | 131.42 | 13.33 | 23.1 |
368 | 4.89822 | 0 | 18.10 | 0 | 0.6310 | 4.970 | 100.0 | 1.3325 | 24 | 666 | 20.2 | 375.52 | 3.26 | 50.0 |
369 | 5.66998 | 0 | 18.10 | 1 | 0.6310 | 6.683 | 96.8 | 1.3567 | 24 | 666 | 20.2 | 375.33 | 3.73 | 50.0 |
370 | 6.53876 | 0 | 18.10 | 1 | 0.6310 | 7.016 | 97.5 | 1.2024 | 24 | 666 | 20.2 | 392.05 | 2.96 | 50.0 |
371 | 9.23230 | 0 | 18.10 | 0 | 0.6310 | 6.216 | 100.0 | 1.1691 | 24 | 666 | 20.2 | 366.15 | 9.53 | 50.0 |
372 | 8.26725 | 0 | 18.10 | 1 | 0.6680 | 5.875 | 89.6 | 1.1296 | 24 | 666 | 20.2 | 347.88 | 8.88 | 50.0 |