# The usual preamble
%matplotlib inline
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
# Make the graphs a bit prettier, and bigger
pd.set_option('display.mpl_style', 'default')
plt.rcParams['figure.figsize'] = (15, 5)
plt.rcParams['font.family'] = 'sans-serif'
# This is necessary to show lots of columns in pandas 0.12.
# Not necessary in pandas 0.13.
pd.set_option('display.width', 5000)
pd.set_option('display.max_columns', 60)
One of the main problems with messy data is: how do you know if it's messy or not?
We're going to use the NYC 311 service request dataset again here, since it's big and a bit unwieldy.
requests = pd.read_csv('../data/311-service-requests.csv')
We're going to look at a few columns here. I know already that there are some problems with the zip code, so let's look at that first.
To get a sense for whether a column has problems, I usually use .unique()
to look at all its values. If it's a numeric column, I'll instead plot a histogram to get a sense of the distribution.
When we look at the unique values in "Incident Zip", it quickly becomes clear that this is a mess.
Some of the problems:
nan
s29616-0759
or 83
What we can do:
requests['Incident Zip'].unique()
array([11432.0, 11378.0, 10032.0, 10023.0, 10027.0, 11372.0, 11419.0, 11417.0, 10011.0, 11225.0, 11218.0, 10003.0, 10029.0, 10466.0, 11219.0, 10025.0, 10310.0, 11236.0, nan, 10033.0, 11216.0, 10016.0, 10305.0, 10312.0, 10026.0, 10309.0, 10036.0, 11433.0, 11235.0, 11213.0, 11379.0, 11101.0, 10014.0, 11231.0, 11234.0, 10457.0, 10459.0, 10465.0, 11207.0, 10002.0, 10034.0, 11233.0, 10453.0, 10456.0, 10469.0, 11374.0, 11221.0, 11421.0, 11215.0, 10007.0, 10019.0, 11205.0, 11418.0, 11369.0, 11249.0, 10005.0, 10009.0, 11211.0, 11412.0, 10458.0, 11229.0, 10065.0, 10030.0, 11222.0, 10024.0, 10013.0, 11420.0, 11365.0, 10012.0, 11214.0, 11212.0, 10022.0, 11232.0, 11040.0, 11226.0, 10281.0, 11102.0, 11208.0, 10001.0, 10472.0, 11414.0, 11223.0, 10040.0, 11220.0, 11373.0, 11203.0, 11691.0, 11356.0, 10017.0, 10452.0, 10280.0, 11217.0, 10031.0, 11201.0, 11358.0, 10128.0, 11423.0, 10039.0, 10010.0, 11209.0, 10021.0, 10037.0, 11413.0, 11375.0, 11238.0, 10473.0, 11103.0, 11354.0, 11361.0, 11106.0, 11385.0, 10463.0, 10467.0, 11204.0, 11237.0, 11377.0, 11364.0, 11434.0, 11435.0, 11210.0, 11228.0, 11368.0, 11694.0, 10464.0, 11415.0, 10314.0, 10301.0, 10018.0, 10038.0, 11105.0, 11230.0, 10468.0, 11104.0, 10471.0, 11416.0, 10075.0, 11422.0, 11355.0, 10028.0, 10462.0, 10306.0, 10461.0, 11224.0, 11429.0, 10035.0, 11366.0, 11362.0, 11206.0, 10460.0, 10304.0, 11360.0, 11411.0, 10455.0, 10475.0, 10069.0, 10303.0, 10308.0, 10302.0, 11357.0, 10470.0, 11367.0, 11370.0, 10454.0, 10451.0, 11436.0, 11426.0, 10153.0, 11004.0, 11428.0, 11427.0, 11001.0, 11363.0, 10004.0, 10474.0, 11430.0, 10000.0, 10307.0, 11239.0, 10119.0, 10006.0, 10048.0, 11697.0, 11692.0, 11693.0, 10573.0, 83.0, 11559.0, 10020.0, 77056.0, 11776.0, 70711.0, 10282.0, 11109.0, 10044.0, '10452', '11233', '10468', '10310', '11105', '10462', '10029', '10301', '10457', '10467', '10469', '11225', '10035', '10031', '11226', '10454', '11221', '10025', '11229', '11235', '11422', '10472', '11208', '11102', '10032', '11216', '10473', '10463', '11213', '10040', '10302', '11231', '10470', '11204', '11104', '11212', '10466', '11416', '11214', '10009', '11692', '11385', '11423', '11201', '10024', '11435', '10312', '10030', '11106', '10033', '10303', '11215', '11222', '11354', '10016', '10034', '11420', '10304', '10019', '11237', '11249', '11230', '11372', '11207', '11378', '11419', '11361', '10011', '11357', '10012', '11358', '10003', '10002', '11374', '10007', '11234', '10065', '11369', '11434', '11205', '11206', '11415', '11236', '11218', '11413', '10458', '11101', '10306', '11355', '10023', '11368', '10314', '11421', '10010', '10018', '11223', '10455', '11377', '11433', '11375', '10037', '11209', '10459', '10128', '10014', '10282', '11373', '10451', '11238', '11211', '10038', '11694', '11203', '11691', '11232', '10305', '10021', '11228', '10036', '10001', '10017', '11217', '11219', '10308', '10465', '11379', '11414', '10460', '11417', '11220', '11366', '10027', '11370', '10309', '11412', '11356', '10456', '11432', '10022', '10013', '11367', '11040', '10026', '10475', '11210', '11364', '11426', '10471', '10119', '11224', '11418', '11429', '11365', '10461', '11239', '10039', '00083', '11411', '10075', '11004', '11360', '10453', '10028', '11430', '10307', '11103', '10004', '10069', '10005', '10474', '11428', '11436', '10020', '11001', '11362', '11693', '10464', '11427', '10044', '11363', '10006', '10000', '02061', '77092-2016', '10280', '11109', '14225', '55164-0737', '19711', '07306', '000000', 'NO CLUE', '90010', '10281', '11747', '23541', '11776', '11697', '11788', '07604', 10112.0, 11788.0, 11563.0, 11580.0, 7087.0, 11042.0, 7093.0, 11501.0, 92123.0, 0.0, 11575.0, 7109.0, 11797.0, '10803', '11716', '11722', '11549-3650', '10162', '92123', '23502', '11518', '07020', '08807', '11577', '07114', '11003', '07201', '11563', '61702', '10103', '29616-0759', '35209-3114', '11520', '11735', '10129', '11005', '41042', '11590', 6901.0, 7208.0, 11530.0, 13221.0, 10954.0, 11735.0, 10103.0, 7114.0, 11111.0, 10107.0], dtype=object)
We can pass a na_values
option to pd.read_csv
to clean this up a little bit. We can also specify that the type of Incident Zip is a string, not a float.
na_values = ['NO CLUE', 'N/A', '0']
requests = pd.read_csv('../data/311-service-requests.csv', na_values=na_values, dtype={'Incident Zip': str})
requests['Incident Zip'].unique()
array(['11432', '11378', '10032', '10023', '10027', '11372', '11419', '11417', '10011', '11225', '11218', '10003', '10029', '10466', '11219', '10025', '10310', '11236', nan, '10033', '11216', '10016', '10305', '10312', '10026', '10309', '10036', '11433', '11235', '11213', '11379', '11101', '10014', '11231', '11234', '10457', '10459', '10465', '11207', '10002', '10034', '11233', '10453', '10456', '10469', '11374', '11221', '11421', '11215', '10007', '10019', '11205', '11418', '11369', '11249', '10005', '10009', '11211', '11412', '10458', '11229', '10065', '10030', '11222', '10024', '10013', '11420', '11365', '10012', '11214', '11212', '10022', '11232', '11040', '11226', '10281', '11102', '11208', '10001', '10472', '11414', '11223', '10040', '11220', '11373', '11203', '11691', '11356', '10017', '10452', '10280', '11217', '10031', '11201', '11358', '10128', '11423', '10039', '10010', '11209', '10021', '10037', '11413', '11375', '11238', '10473', '11103', '11354', '11361', '11106', '11385', '10463', '10467', '11204', '11237', '11377', '11364', '11434', '11435', '11210', '11228', '11368', '11694', '10464', '11415', '10314', '10301', '10018', '10038', '11105', '11230', '10468', '11104', '10471', '11416', '10075', '11422', '11355', '10028', '10462', '10306', '10461', '11224', '11429', '10035', '11366', '11362', '11206', '10460', '10304', '11360', '11411', '10455', '10475', '10069', '10303', '10308', '10302', '11357', '10470', '11367', '11370', '10454', '10451', '11436', '11426', '10153', '11004', '11428', '11427', '11001', '11363', '10004', '10474', '11430', '10000', '10307', '11239', '10119', '10006', '10048', '11697', '11692', '11693', '10573', '00083', '11559', '10020', '77056', '11776', '70711', '10282', '11109', '10044', '02061', '77092-2016', '14225', '55164-0737', '19711', '07306', '000000', '90010', '11747', '23541', '11788', '07604', '10112', '11563', '11580', '07087', '11042', '07093', '11501', '92123', '00000', '11575', '07109', '11797', '10803', '11716', '11722', '11549-3650', '10162', '23502', '11518', '07020', '08807', '11577', '07114', '11003', '07201', '61702', '10103', '29616-0759', '35209-3114', '11520', '11735', '10129', '11005', '41042', '11590', '06901', '07208', '11530', '13221', '10954', '11111', '10107'], dtype=object)
rows_with_dashes = requests['Incident Zip'].str.contains('-').fillna(False)
len(requests[rows_with_dashes])
5
requests[rows_with_dashes]
Unique Key | Created Date | Closed Date | Agency | Agency Name | Complaint Type | Descriptor | Location Type | Incident Zip | Incident Address | Street Name | Cross Street 1 | Cross Street 2 | Intersection Street 1 | Intersection Street 2 | Address Type | City | Landmark | Facility Type | Status | Due Date | Resolution Action Updated Date | Community Board | Borough | X Coordinate (State Plane) | Y Coordinate (State Plane) | Park Facility Name | Park Borough | School Name | School Number | School Region | School Code | School Phone Number | School Address | School City | School State | School Zip | School Not Found | School or Citywide Complaint | Vehicle Type | Taxi Company Borough | Taxi Pick Up Location | Bridge Highway Name | Bridge Highway Direction | Road Ramp | Bridge Highway Segment | Garage Lot Name | Ferry Direction | Ferry Terminal Name | Latitude | Longitude | Location | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
29136 | 26550551 | 10/24/2013 06:16:34 PM | NaN | DCA | Department of Consumer Affairs | Consumer Complaint | False Advertising | NaN | 77092-2016 | 2700 EAST SELTICE WAY | EAST SELTICE WAY | NaN | NaN | NaN | NaN | NaN | HOUSTON | NaN | NaN | Assigned | 11/13/2013 11:15:20 AM | 10/29/2013 11:16:16 AM | 0 Unspecified | Unspecified | NaN | NaN | Unspecified | Unspecified | Unspecified | Unspecified | Unspecified | Unspecified | Unspecified | Unspecified | Unspecified | Unspecified | Unspecified | N | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
30939 | 26548831 | 10/24/2013 09:35:10 AM | NaN | DCA | Department of Consumer Affairs | Consumer Complaint | Harassment | NaN | 55164-0737 | P.O. BOX 64437 | 64437 | NaN | NaN | NaN | NaN | NaN | ST. PAUL | NaN | NaN | Assigned | 11/13/2013 02:30:21 PM | 10/29/2013 02:31:06 PM | 0 Unspecified | Unspecified | NaN | NaN | Unspecified | Unspecified | Unspecified | Unspecified | Unspecified | Unspecified | Unspecified | Unspecified | Unspecified | Unspecified | Unspecified | N | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
70539 | 26488417 | 10/15/2013 03:40:33 PM | NaN | TLC | Taxi and Limousine Commission | Taxi Complaint | Driver Complaint | Street | 11549-3650 | 365 HOFSTRA UNIVERSITY | HOFSTRA UNIVERSITY | NaN | NaN | NaN | NaN | NaN | HEMSTEAD | NaN | NaN | Assigned | 11/30/2013 01:20:33 PM | 10/16/2013 01:21:39 PM | 0 Unspecified | Unspecified | NaN | NaN | Unspecified | Unspecified | Unspecified | Unspecified | Unspecified | Unspecified | Unspecified | Unspecified | Unspecified | Unspecified | Unspecified | N | NaN | NaN | NaN | La Guardia Airport | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
85821 | 26468296 | 10/10/2013 12:36:43 PM | 10/26/2013 01:07:07 AM | DCA | Department of Consumer Affairs | Consumer Complaint | Debt Not Owed | NaN | 29616-0759 | PO BOX 25759 | BOX 25759 | NaN | NaN | NaN | NaN | NaN | GREENVILLE | NaN | NaN | Closed | 10/26/2013 09:20:28 AM | 10/26/2013 01:07:07 AM | 0 Unspecified | Unspecified | NaN | NaN | Unspecified | Unspecified | Unspecified | Unspecified | Unspecified | Unspecified | Unspecified | Unspecified | Unspecified | Unspecified | Unspecified | N | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
89304 | 26461137 | 10/09/2013 05:23:46 PM | 10/25/2013 01:06:41 AM | DCA | Department of Consumer Affairs | Consumer Complaint | Harassment | NaN | 35209-3114 | 600 BEACON PKWY | BEACON PKWY | NaN | NaN | NaN | NaN | NaN | BIRMINGHAM | NaN | NaN | Closed | 10/25/2013 02:43:42 PM | 10/25/2013 01:06:41 AM | 0 Unspecified | Unspecified | NaN | NaN | Unspecified | Unspecified | Unspecified | Unspecified | Unspecified | Unspecified | Unspecified | Unspecified | Unspecified | Unspecified | Unspecified | N | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
I thought these were missing data and originally deleted them like this:
requests['Incident Zip'][rows_with_dashes] = np.nan
But then my friend Dave pointed out that 9-digit zip codes are normal. Let's look at all the zip codes with more than 5 digits, make sure they're okay, and then truncate them.
long_zip_codes = requests['Incident Zip'].str.len() > 5
requests['Incident Zip'][long_zip_codes].unique()
array(['77092-2016', '55164-0737', '000000', '11549-3650', '29616-0759', '35209-3114'], dtype=object)
Those all look okay to truncate to me.
requests['Incident Zip'] = requests['Incident Zip'].str.slice(0, 5)
Done.
Earlier I thought 00083 was a broken zip code, but turns out Central Park's zip code 00083! Shows what I know. I'm still concerned about the 00000 zip codes, though: let's look at that.
requests[requests['Incident Zip'] == '00000']
Unique Key | Created Date | Closed Date | Agency | Agency Name | Complaint Type | Descriptor | Location Type | Incident Zip | Incident Address | Street Name | Cross Street 1 | Cross Street 2 | Intersection Street 1 | Intersection Street 2 | Address Type | City | Landmark | Facility Type | Status | Due Date | Resolution Action Updated Date | Community Board | Borough | X Coordinate (State Plane) | Y Coordinate (State Plane) | Park Facility Name | Park Borough | School Name | School Number | School Region | School Code | School Phone Number | School Address | School City | School State | School Zip | School Not Found | School or Citywide Complaint | Vehicle Type | Taxi Company Borough | Taxi Pick Up Location | Bridge Highway Name | Bridge Highway Direction | Road Ramp | Bridge Highway Segment | Garage Lot Name | Ferry Direction | Ferry Terminal Name | Latitude | Longitude | Location | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
42600 | 26529313 | 10/22/2013 02:51:06 PM | NaN | TLC | Taxi and Limousine Commission | Taxi Complaint | Driver Complaint | NaN | 00000 | EWR EWR | EWR | NaN | NaN | NaN | NaN | NaN | NEWARK | NaN | NaN | Assigned | 12/07/2013 09:53:51 AM | 10/23/2013 09:54:43 AM | 0 Unspecified | Unspecified | NaN | NaN | Unspecified | Unspecified | Unspecified | Unspecified | Unspecified | Unspecified | Unspecified | Unspecified | Unspecified | Unspecified | Unspecified | N | NaN | NaN | NaN | Other | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
60843 | 26507389 | 10/17/2013 05:48:44 PM | NaN | TLC | Taxi and Limousine Commission | Taxi Complaint | Driver Complaint | Street | 00000 | 1 NEWARK AIRPORT | NEWARK AIRPORT | NaN | NaN | NaN | NaN | NaN | NEWARK | NaN | NaN | Assigned | 12/02/2013 11:59:46 AM | 10/18/2013 12:01:08 PM | 0 Unspecified | Unspecified | NaN | NaN | Unspecified | Unspecified | Unspecified | Unspecified | Unspecified | Unspecified | Unspecified | Unspecified | Unspecified | Unspecified | Unspecified | N | NaN | NaN | NaN | Other | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
This looks bad to me. Let's set these to nan.
zero_zips = requests['Incident Zip'] == '00000'
requests.loc[zero_zips, 'Incident Zip'] = np.nan
Great. Let's see where we are now:
unique_zips = requests['Incident Zip'].unique()
unique_zips.sort()
unique_zips
array([nan, '00083', '02061', '06901', '07020', '07087', '07093', '07109', '07114', '07201', '07208', '07306', '07604', '08807', '10000', '10001', '10002', '10003', '10004', '10005', '10006', '10007', '10009', '10010', '10011', '10012', '10013', '10014', '10016', '10017', '10018', '10019', '10020', '10021', '10022', '10023', '10024', '10025', '10026', '10027', '10028', '10029', '10030', '10031', '10032', '10033', '10034', '10035', '10036', '10037', '10038', '10039', '10040', '10044', '10048', '10065', '10069', '10075', '10103', '10107', '10112', '10119', '10128', '10129', '10153', '10162', '10280', '10281', '10282', '10301', '10302', '10303', '10304', '10305', '10306', '10307', '10308', '10309', '10310', '10312', '10314', '10451', '10452', '10453', '10454', '10455', '10456', '10457', '10458', '10459', '10460', '10461', '10462', '10463', '10464', '10465', '10466', '10467', '10468', '10469', '10470', '10471', '10472', '10473', '10474', '10475', '10573', '10803', '10954', '11001', '11003', '11004', '11005', '11040', '11042', '11101', '11102', '11103', '11104', '11105', '11106', '11109', '11111', '11201', '11203', '11204', '11205', '11206', '11207', '11208', '11209', '11210', '11211', '11212', '11213', '11214', '11215', '11216', '11217', '11218', '11219', '11220', '11221', '11222', '11223', '11224', '11225', '11226', '11228', '11229', '11230', '11231', '11232', '11233', '11234', '11235', '11236', '11237', '11238', '11239', '11249', '11354', '11355', '11356', '11357', '11358', '11360', '11361', '11362', '11363', '11364', '11365', '11366', '11367', '11368', '11369', '11370', '11372', '11373', '11374', '11375', '11377', '11378', '11379', '11385', '11411', '11412', '11413', '11414', '11415', '11416', '11417', '11418', '11419', '11420', '11421', '11422', '11423', '11426', '11427', '11428', '11429', '11430', '11432', '11433', '11434', '11435', '11436', '11501', '11518', '11520', '11530', '11549', '11559', '11563', '11575', '11577', '11580', '11590', '11691', '11692', '11693', '11694', '11697', '11716', '11722', '11735', '11747', '11776', '11788', '11797', '13221', '14225', '19711', '23502', '23541', '29616', '35209', '41042', '55164', '61702', '70711', '77056', '77092', '90010', '92123'], dtype=object)
Amazing! This is much cleaner. There's something a bit weird here, though -- I looked up 77056 on Google maps, and that's in Texas.
Let's take a closer look:
zips = requests['Incident Zip']
# Let's say the zips starting with '0' and '1' are okay, for now. (this isn't actually true -- 13221 is in Syracuse, and why?)
is_close = zips.str.startswith('0') | zips.str.startswith('1')
# There are a bunch of NaNs, but we're not interested in them right now, so we'll say they're False
is_far = ~(is_close) & zips.notnull()
zips[is_far]
12102 77056 13450 70711 29136 77092 30939 55164 44008 90010 47048 23541 57636 92123 71001 92123 71834 23502 80573 61702 85821 29616 89304 35209 94201 41042 Name: Incident Zip, dtype: object
requests[is_far][['Incident Zip', 'Descriptor', 'City']].sort('Incident Zip')
Incident Zip | Descriptor | City | |
---|---|---|---|
71834 | 23502 | Harassment | NORFOLK |
47048 | 23541 | Harassment | NORFOLK |
85821 | 29616 | Debt Not Owed | GREENVILLE |
89304 | 35209 | Harassment | BIRMINGHAM |
94201 | 41042 | Harassment | FLORENCE |
30939 | 55164 | Harassment | ST. PAUL |
80573 | 61702 | Billing Dispute | BLOOMIGTON |
13450 | 70711 | Contract Dispute | CLIFTON |
12102 | 77056 | Debt Not Owed | HOUSTON |
29136 | 77092 | False Advertising | HOUSTON |
44008 | 90010 | Billing Dispute | LOS ANGELES |
57636 | 92123 | Harassment | SAN DIEGO |
71001 | 92123 | Billing Dispute | SAN DIEGO |
Okay, there really are requests coming from LA and Houston! Good to know. Filtering by zip code is probably a bad way to handle this -- we should really be looking at the city instead.
requests['City'].str.upper().value_counts()
BROOKLYN 31662 NEW YORK 22664 BRONX 18438 STATEN ISLAND 4766 JAMAICA 2246 FLUSHING 1803 ASTORIA 1568 RIDGEWOOD 1073 CORONA 707 OZONE PARK 693 LONG ISLAND CITY 678 FAR ROCKAWAY 652 ELMHURST 647 WOODSIDE 609 EAST ELMHURST 562 ... MELVILLE 1 PORT JEFFERSON STATION 1 NORWELL 1 EAST ROCKAWAY 1 BIRMINGHAM 1 ROSLYN 1 LOS ANGELES 1 MINEOLA 1 JERSEY CITY 1 ST. PAUL 1 CLIFTON 1 COL.ANVURES 1 EDGEWATER 1 ROSELYN 1 CENTRAL ISLIP 1 Length: 100, dtype: int64
It looks like these are legitimate complaints, so we'll just leave them alone.
Here's what we ended up doing to clean up our zip codes, all together:
na_values = ['NO CLUE', 'N/A', '0']
requests = pd.read_csv('../data/311-service-requests.csv',
na_values=na_values,
dtype={'Incident Zip': str})
def fix_zip_codes(zips):
# Truncate everything to length 5
zips = zips.str.slice(0, 5)
# Set 00000 zip codes to nan
zero_zips = zips == '00000'
zips[zero_zips] = np.nan
return zips
requests['Incident Zip'] = fix_zip_codes(requests['Incident Zip'])
requests['Incident Zip'].unique()
array(['11432', '11378', '10032', '10023', '10027', '11372', '11419', '11417', '10011', '11225', '11218', '10003', '10029', '10466', '11219', '10025', '10310', '11236', nan, '10033', '11216', '10016', '10305', '10312', '10026', '10309', '10036', '11433', '11235', '11213', '11379', '11101', '10014', '11231', '11234', '10457', '10459', '10465', '11207', '10002', '10034', '11233', '10453', '10456', '10469', '11374', '11221', '11421', '11215', '10007', '10019', '11205', '11418', '11369', '11249', '10005', '10009', '11211', '11412', '10458', '11229', '10065', '10030', '11222', '10024', '10013', '11420', '11365', '10012', '11214', '11212', '10022', '11232', '11040', '11226', '10281', '11102', '11208', '10001', '10472', '11414', '11223', '10040', '11220', '11373', '11203', '11691', '11356', '10017', '10452', '10280', '11217', '10031', '11201', '11358', '10128', '11423', '10039', '10010', '11209', '10021', '10037', '11413', '11375', '11238', '10473', '11103', '11354', '11361', '11106', '11385', '10463', '10467', '11204', '11237', '11377', '11364', '11434', '11435', '11210', '11228', '11368', '11694', '10464', '11415', '10314', '10301', '10018', '10038', '11105', '11230', '10468', '11104', '10471', '11416', '10075', '11422', '11355', '10028', '10462', '10306', '10461', '11224', '11429', '10035', '11366', '11362', '11206', '10460', '10304', '11360', '11411', '10455', '10475', '10069', '10303', '10308', '10302', '11357', '10470', '11367', '11370', '10454', '10451', '11436', '11426', '10153', '11004', '11428', '11427', '11001', '11363', '10004', '10474', '11430', '10000', '10307', '11239', '10119', '10006', '10048', '11697', '11692', '11693', '10573', '00083', '11559', '10020', '77056', '11776', '70711', '10282', '11109', '10044', '02061', '77092', '14225', '55164', '19711', '07306', '90010', '11747', '23541', '11788', '07604', '10112', '11563', '11580', '07087', '11042', '07093', '11501', '92123', '11575', '07109', '11797', '10803', '11716', '11722', '11549', '10162', '23502', '11518', '07020', '08807', '11577', '07114', '11003', '07201', '61702', '10103', '29616', '35209', '11520', '11735', '10129', '11005', '41042', '11590', '06901', '07208', '11530', '13221', '10954', '11111', '10107'], dtype=object)