import pandas as pd
import nltk
from nltk.util import ngrams
nltk.download('punkt')
nltk.download('stopwords')
#Importing the dataset
%cd C:\Users\Matt\Dropbox\Python Workspace\CROW\CROL-PDF
data = pd.read_csv("procPublicationRequest_Oct-Dec_2014_clean - procPublicationRequest_Oct-Dec_2014_clean.csv")
#Snagging the "human_readable" column
human_readableList = list(data['human_readable'])
#Turn the values into strings
strReadable = [str(a) for a in human_readableList]
#Split into individual words
listOfLists = [a.split() for a in strReadable]
[nltk_data] Downloading package punkt to [nltk_data] C:\Users\Matt\AppData\Roaming\nltk_data... [nltk_data] Package punkt is already up-to-date! [nltk_data] Downloading package stopwords to [nltk_data] C:\Users\Matt\AppData\Roaming\nltk_data... [nltk_data] Package stopwords is already up-to-date! C:\Users\Matt\Dropbox\Python Workspace\CROW\CROL-PDF
#Let's see an entry!
#A bare expression on the last line of a notebook cell is echoed as the cell's output,
#so this shows the first stringified value of the 'human_readable' column.
strReadable[0]
#It's a giant mess of text: one long string mixing notice wording, addresses and phone numbers.
'OWNERS ARE WANTED BY THE PROPERTY CLERK DIVISION OF THE NEW YORK CITY POLICE DEPARTMENT. The following listed property is in the custody, of the Property Clerk Division without claimants. Recovered, lost, abandoned property, obtained from prisoners, emotionally disturbed, intoxicated and deceased persons; and property obtained from persons incapable of caring for themselves. Motor vehicles, boats, bicycles, business machines, cameras, calculating machines, electrical and optical property, furniture, furs, handbags, hardware, jewelry, photographic equipment, radios, robes, sound systems, surgical and musical instruments, tools, wearing apparel, communications equipment, computers, and other miscellaneous articles. INQUIRIES Inquiries relating to such property should be made in the Borough concerned, at the following office of the Property Clerk. FOR MOTOR VEHICLES(All Boroughs): Springfield Gardens Auto Pound, 174-20 North Boundary Road, Queens, NY 11430, (718) 553-9555 Erie Basin Auto Pound, 700 Columbia Street, Brooklyn, NY 11231, (718) 246-2030 FOR ALL OTHER PROPERTY Manhattan - 1 Police Plaza, New York, NY 10038, (646) 610-5906 Brooklyn - 84th Precinct, 301 Gold Street, Brooklyn, NY 11201, (718) 875-6675 Bronx Property Clerk - 215 East 161 Street, Bronx, NY 10451, (718) 590-2806 Queens Property Clerk - 47-07 Pearson Place, Long Island City, NY 11101, (718) 433-2678 Staten Island Property Clerk - 1 Edgewater Plaza, Staten Island, NY 10301, (718) 876-8484'
#Here's what happens when we tokenize with NLTK
#word_tokenize takes the raw text of the first entry and returns its tokens as a list
firstEntryTokenized = nltk.word_tokenize(strReadable[0])
#It splits it into a list of individual words.
#Punctuation marks such as '.' and ',' come out as separate tokens of their own,
#which matters for the frequency counts further down.
#Only the first 50 tokens are displayed to keep the output short.
firstEntryTokenized[:50]
['OWNERS', 'ARE', 'WANTED', 'BY', 'THE', 'PROPERTY', 'CLERK', 'DIVISION', 'OF', 'THE', 'NEW', 'YORK', 'CITY', 'POLICE', 'DEPARTMENT', '.', 'The', 'following', 'listed', 'property', 'is', 'in', 'the', 'custody', ',', 'of', 'the', 'Property', 'Clerk', 'Division', 'without', 'claimants', '.', 'Recovered', ',', 'lost', ',', 'abandoned', 'property', ',', 'obtained', 'from', 'prisoners', ',', 'emotionally', 'disturbed', ',', 'intoxicated', 'and', 'deceased']
#We can use some special functions if we then convert this into NLTK's special Text format
firstEntryText = nltk.Text(firstEntryTokenized)
#For instance, we can get a list of the most common words along with how often they show up
#FreqDist tallies how many times each distinct token occurs
firstEntryFreqDist = nltk.FreqDist(firstEntryText)
#Let's see the 10 most common words!
#most_common(10) gives (token, count) pairs, highest count first.
#NOTE: punctuation tokens are counted like words, and counting is case-sensitive
#('Property' and 'property' are tallied separately), so the top of the list is
#dominated by punctuation rather than meaningful words.
firstEntryFreqDist.most_common(10)
[(',', 52), (')', 8), ('(', 8), ('NY', 7), ('718', 6), ('Property', 5), ('Clerk', 5), ('property', 5), ('the', 5), ('-', 5)]
#We can also search for phrases of different lengths
#ngrams(tokens, 2) pairs every token with the one that follows it (bigrams), as tuples.
#Wrapping it in list() stores the result so it can be sliced here and counted afterwards.
#Presumably a different second argument gives longer phrases - confirm in the NLTK docs.
firstEntryBigrams = list(ngrams(firstEntryTokenized,2))
#Peek at the first 50 bigrams; pairs that include punctuation tokens are present too
firstEntryBigrams[:50]
[('OWNERS', 'ARE'), ('ARE', 'WANTED'), ('WANTED', 'BY'), ('BY', 'THE'), ('THE', 'PROPERTY'), ('PROPERTY', 'CLERK'), ('CLERK', 'DIVISION'), ('DIVISION', 'OF'), ('OF', 'THE'), ('THE', 'NEW'), ('NEW', 'YORK'), ('YORK', 'CITY'), ('CITY', 'POLICE'), ('POLICE', 'DEPARTMENT'), ('DEPARTMENT', '.'), ('.', 'The'), ('The', 'following'), ('following', 'listed'), ('listed', 'property'), ('property', 'is'), ('is', 'in'), ('in', 'the'), ('the', 'custody'), ('custody', ','), (',', 'of'), ('of', 'the'), ('the', 'Property'), ('Property', 'Clerk'), ('Clerk', 'Division'), ('Division', 'without'), ('without', 'claimants'), ('claimants', '.'), ('.', 'Recovered'), ('Recovered', ','), (',', 'lost'), ('lost', ','), (',', 'abandoned'), ('abandoned', 'property'), ('property', ','), (',', 'obtained'), ('obtained', 'from'), ('from', 'prisoners'), ('prisoners', ','), (',', 'emotionally'), ('emotionally', 'disturbed'), ('disturbed', ','), (',', 'intoxicated'), ('intoxicated', 'and'), ('and', 'deceased'), ('deceased', 'persons')]
#And frequencies of phrases!
#Same FreqDist counting as for the single tokens, but keyed on the bigram tuples.
#(Not to be confused with firstEntryFreqDist, which holds the single-token counts.)
firstEntryFreq= nltk.FreqDist(firstEntryBigrams)
#The 50 most frequent bigrams as ((token1, token2), count) pairs, highest count first
firstEntryFreq.most_common(50)
[((',', '('), 7), ((',', 'NY'), 7), (('(', '718'), 6), (('718', ')'), 6), (('Property', 'Clerk'), 5), (('Clerk', '-'), 3), (('Street', ','), 3), ((',', 'Brooklyn'), 2), (('Staten', 'Island'), 2), (('obtained', 'from'), 2), (('Brooklyn', ','), 2), (('machines', ','), 2), (('equipment', ','), 2), (('in', 'the'), 2), (('of', 'the'), 2), (('property', ','), 2), (('Auto', 'Pound'), 2), (('the', 'Property'), 2), (('Plaza', ','), 2), (('-', '1'), 2), (('Pound', ','), 2), ((',', 'hardware'), 1), (('jewelry', ','), 1), (('MOTOR', 'VEHICLES'), 1), (('11430', ','), 1), (('East', '161'), 1), (('property', 'obtained'), 1), ((',', '700'), 1), ((')', '433-2678'), 1), (('bicycles', ','), 1), (('persons', 'incapable'), 1), ((')', '246-2030'), 1), ((')', '553-9555'), 1), (('and', 'property'), 1), (('electrical', 'and'), 1), (('NY', '11101'), 1), (('sound', 'systems'), 1), (('.', 'The'), 1), (('York', ','), 1), (('musical', 'instruments'), 1), (('Clerk', 'Division'), 1), (('433-2678', 'Staten'), 1), (('CITY', 'POLICE'), 1), (('business', 'machines'), 1), (('Place', ','), 1), (('at', 'the'), 1), (('be', 'made'), 1), (('Queens', ','), 1), (('should', 'be'), 1), (('OF', 'THE'), 1)]