import os
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from IPython.core.display import display, HTML
%matplotlib inline
pd.options.display.max_columns = None
display(HTML("<style>.container { width:100% !important; }</style>"))
CSV_PATH = os.path.join('data', 'hacknight_ticket_sample_data_2015.csv')
df = pd.read_csv(CSV_PATH,low_memory=False, parse_dates=['issue_date', 'ticket_queue_date'])
CSV_PATH = os.path.join('data', 'hacknight_sample_data_geocode_cleaned.csv')
addrs_df = pd.read_csv(CSV_PATH)
geocoded_df = pd.merge(left=df, right=addrs_df, how='inner', on='address')
geocoded_df.head()
ticket_number | issue_date | violation_location | license_plate_number | license_plate_state | license_plate_type | zipcode | violation_code | violation_description | unit | unit_description | vehicle_make | fine_level1_amount | fine_level2_amount | current_amount_due | total_payments | ticket_queue | ticket_queue_date | notice_level | hearing_disposition | notice_number | officer | address | lat | lng | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 9188814621 | 2015-05-07 13:52:00 | 2134 S ARCHER AV | 7c189a16ef79db9413c1f46b7e5d1712e5c0c1575be352... | MI | PAS | 48103 | 0964190A | EXP. METER NON-CENTRAL BUSINESS DISTRICT | 498 | DOF | BUIC | 50 | 100 | 0.0 | 50.0 | Paid | 2015-05-13 | NaN | NaN | 0 | 798 | 2100 s archer av, chicago, il | 41.854262 | -87.631986 |
1 | 9188814621 | 2015-05-07 13:52:00 | 2134 S ARCHER AV | 7c189a16ef79db9413c1f46b7e5d1712e5c0c1575be352... | MI | PAS | 48103 | 0964190A | EXP. METER NON-CENTRAL BUSINESS DISTRICT | 498 | DOF | BUIC | 50 | 100 | 0.0 | 50.0 | Paid | 2015-05-13 | NaN | NaN | 0 | 798 | 2100 s archer av, chicago, il | 41.854262 | -87.631986 |
2 | 9188814621 | 2015-05-07 13:52:00 | 2134 S ARCHER AV | 7c189a16ef79db9413c1f46b7e5d1712e5c0c1575be352... | MI | PAS | 48103 | 0964190A | EXP. METER NON-CENTRAL BUSINESS DISTRICT | 498 | DOF | BUIC | 50 | 100 | 0.0 | 50.0 | Paid | 2015-05-13 | NaN | NaN | 0 | 798 | 2100 s archer av, chicago, il | 41.854262 | -87.631986 |
3 | 9188814621 | 2015-05-07 13:52:00 | 2134 S ARCHER AV | 7c189a16ef79db9413c1f46b7e5d1712e5c0c1575be352... | MI | PAS | 48103 | 0964190A | EXP. METER NON-CENTRAL BUSINESS DISTRICT | 498 | DOF | BUIC | 50 | 100 | 0.0 | 50.0 | Paid | 2015-05-13 | NaN | NaN | 0 | 798 | 2100 s archer av, chicago, il | 41.854262 | -87.631986 |
4 | 9188814621 | 2015-05-07 13:52:00 | 2134 S ARCHER AV | 7c189a16ef79db9413c1f46b7e5d1712e5c0c1575be352... | MI | PAS | 48103 | 0964190A | EXP. METER NON-CENTRAL BUSINESS DISTRICT | 498 | DOF | BUIC | 50 | 100 | 0.0 | 50.0 | Paid | 2015-05-13 | NaN | NaN | 0 | 798 | 2100 s archer av, chicago, il | 41.854262 | -87.631986 |
geocoded_df.info()
<class 'pandas.core.frame.DataFrame'> Int64Index: 385682 entries, 0 to 385681 Data columns (total 25 columns): ticket_number 385682 non-null int64 issue_date 385682 non-null datetime64[ns] violation_location 385682 non-null object license_plate_number 385682 non-null object license_plate_state 385679 non-null object license_plate_type 382750 non-null object zipcode 318542 non-null object violation_code 385682 non-null object violation_description 385682 non-null object unit 385682 non-null int64 unit_description 385682 non-null object vehicle_make 385682 non-null object fine_level1_amount 385682 non-null int64 fine_level2_amount 385682 non-null int64 current_amount_due 385682 non-null float64 total_payments 385682 non-null float64 ticket_queue 385682 non-null object ticket_queue_date 385682 non-null datetime64[ns] notice_level 275484 non-null object hearing_disposition 44913 non-null object notice_number 385682 non-null int64 officer 385682 non-null object address 385682 non-null object lat 385682 non-null float64 lng 385682 non-null float64 dtypes: datetime64[ns](2), float64(4), int64(5), object(14) memory usage: 76.5+ MB
geocoded_df['date_diff'] = geocoded_df['ticket_queue_date'] - geocoded_df['issue_date']
geocoded_df['date_diff'].astype(int)
--------------------------------------------------------------------------- TypeError Traceback (most recent call last) <ipython-input-12-285d89a0f157> in <module>() ----> 1 geocoded_df['date_diff'].astype(int) ~\Anaconda3\envs\geo36\lib\site-packages\pandas\util\_decorators.py in wrapper(*args, **kwargs) 176 else: 177 kwargs[new_arg_name] = new_arg_value --> 178 return func(*args, **kwargs) 179 return wrapper 180 return _deprecate_kwarg ~\Anaconda3\envs\geo36\lib\site-packages\pandas\core\generic.py in astype(self, dtype, copy, errors, **kwargs) 4999 # else, only a single dtype is given 5000 new_data = self._data.astype(dtype=dtype, copy=copy, errors=errors, -> 5001 **kwargs) 5002 return self._constructor(new_data).__finalize__(self) 5003 ~\Anaconda3\envs\geo36\lib\site-packages\pandas\core\internals.py in astype(self, dtype, **kwargs) 3712 3713 def astype(self, dtype, **kwargs): -> 3714 return self.apply('astype', dtype=dtype, **kwargs) 3715 3716 def convert(self, **kwargs): ~\Anaconda3\envs\geo36\lib\site-packages\pandas\core\internals.py in apply(self, f, axes, filter, do_integrity_check, consolidate, **kwargs) 3579 3580 kwargs['mgr'] = self -> 3581 applied = getattr(b, f)(**kwargs) 3582 result_blocks = _extend_blocks(applied, result_blocks) 3583 ~\Anaconda3\envs\geo36\lib\site-packages\pandas\core\internals.py in astype(self, dtype, copy, errors, values, **kwargs) 573 def astype(self, dtype, copy=False, errors='raise', values=None, **kwargs): 574 return self._astype(dtype, copy=copy, errors=errors, values=values, --> 575 **kwargs) 576 577 def _astype(self, dtype, copy=False, errors='raise', values=None, ~\Anaconda3\envs\geo36\lib\site-packages\pandas\core\internals.py in _astype(self, dtype, copy, errors, values, klass, mgr, **kwargs) 662 663 # _astype_nansafe works fine with 1-d only --> 664 values = astype_nansafe(values.ravel(), dtype, copy=True) 665 values = values.reshape(self.shape) 666 ~\Anaconda3\envs\geo36\lib\site-packages\pandas\core\dtypes\cast.py in astype_nansafe(arr, dtype, copy) 694 raise TypeError("cannot astype a timedelta from [{from_dtype}] " 695 "to [{to_dtype}]".format(from_dtype=arr.dtype, --> 696 to_dtype=dtype)) 697 698 elif (np.issubdtype(arr.dtype, np.floating) and TypeError: cannot astype a timedelta from [timedelta64[ns]] to [int32]
geocoded_df[(geocoded_df['date_diff'] > 90) & (geocoded_df['ticket_queue'] == 'Paid')]
--------------------------------------------------------------------------- TypeError Traceback (most recent call last) <ipython-input-10-84bba5fde5ad> in <module>() ----> 1 geocoded_df[(geocoded_df['date_diff'] > 90) & (geocoded_df['ticket_queue'] == 'Paid')] ~\Anaconda3\envs\geo36\lib\site-packages\pandas\core\ops.py in wrapper(self, other, axis) 1235 elif is_timedelta64_dtype(self): 1236 res_values = dispatch_to_index_op(op, self, other, -> 1237 pd.TimedeltaIndex) 1238 return self._constructor(res_values, index=self.index, 1239 name=res_name) ~\Anaconda3\envs\geo36\lib\site-packages\pandas\core\ops.py in dispatch_to_index_op(op, left, right, index_class) 1099 left_idx = left_idx._shallow_copy(freq=None) 1100 try: -> 1101 result = op(left_idx, right) 1102 except NullFrequencyError: 1103 # DatetimeIndex and TimedeltaIndex with freq == None raise ValueError ~\Anaconda3\envs\geo36\lib\site-packages\pandas\core\indexes\timedeltas.py in wrapper(self, other) 76 elif not is_list_like(other): 77 raise TypeError(msg.format(cls=type(self).__name__, ---> 78 typ=type(other).__name__)) 79 else: 80 other = TimedeltaIndex(other).values TypeError: cannot compare a TimedeltaIndex with type int