# Set the working directory for the session.
import os

os.getcwd()  # REPL inspection; result is discarded when run as a script
# NOTE(review): machine-specific absolute path — adjust before running elsewhere.
os.chdir('/Users/zhanglun/Desktop/Workshop/PythonDemo')
os.getcwd()
# output: '/Users/zhanglun/Desktop/Workshop/PythonDemo'
### Import packages and build the NLP helpers used below.
import nltk
import string
import re
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.corpus import stopwords
from nltk.corpus import brown

# Train a unigram POS tagger on the full (all-category) tagged Brown corpus.
brown_tagged_sents = brown.tagged_sents(categories=None)
unigram_tagger = nltk.UnigramTagger(brown_tagged_sents)

wnl = nltk.WordNetLemmatizer()   # lemmatizer (needs a POS hint per call)
porter = nltk.PorterStemmer()    # rule-based stemmer

# NOTE(review): this rebinding shadows the `stopwords` corpus module imported
# above with a plain list of English stop words. The membership test in the
# stop-word loop below relies on it being a list, so the name is kept as-is.
stopwords = nltk.corpus.stopwords.words('english')
print(stopwords)
# output: ['i', 'me', 'my', 'myself', 'we', ..., 'won', 'wouldn']
# (NLTK's English stop-word list)
# Sample text for the pipeline demo below.
text='Trump was born and raised in Queens, New York City, and earned an economics degree from the Wharton School. Later, he took charge of The Trump Organization, the real estate and construction firm founded by his paternal grandmother, which he ran for 45 years until 2016.'
print (text)
# output: Trump was born and raised in Queens, New York City, and earned an
# economics degree from the Wharton School. Later, he took charge of The Trump
# Organization, the real estate and construction firm founded by his paternal
# grandmother, which he ran for 45 years until 2016.
# Normalize: strip all ASCII punctuation in one pass, then lowercase.
# str.translate with a deletion table leaves `text` itself unchanged.
text2=text.translate(str.maketrans("","",string.punctuation)).lower()
print (text)
print (text2)
# output (text, unchanged): Trump was born and raised in Queens, New York City, ...
# output (text2, cleaned):  trump was born and raised in queens new york city
# and earned an economics degree from the wharton school later he took charge
# of the trump organization the real estate and construction firm founded by
# his paternal grandmother which he ran for 45 years until 2016
# Tokenization: whitespace split is sufficient since punctuation is gone.
wordlist=text2.split()
print (wordlist)
# output: ['trump', 'was', 'born', 'and', 'raised', 'in', 'queens', 'new',
# 'york', 'city', ..., 'for', '45', 'years', 'until', '2016']
# Drop stop words: print each token with a keep flag
# ('0' = stop word to drop, '1' = content word to keep).
# (Indentation restored — the loop body was flattened in the original paste.)
for item in wordlist:
    if item in stopwords:
        print(item, '0')
    else:
        print(item, '1')
# output: trump 1 / was 0 / born 1 / and 0 / raised 1 / in 0 / queens 1 ...
# 45 1 / years 1 / until 0 / 2016 1
# Lemmatization examples: second argument is the POS hint ('v' verb, 'n' noun).
# Return values are discarded here (REPL-style demonstration).
wnl.lemmatize('was','v')
wnl.lemmatize('going','v')
wnl.lemmatize('boys','n')
wnl.lemmatize('women','n')
# output (last call): 'woman'
# Stemming examples with the Porter stemmer.
# Return values are discarded (REPL-style demonstration).
for word in ('going', 'amazing', 'girls', 'ponies'):
    porter.stem(word)
# POS tagging with the Brown-trained unigram tagger.
# Tokens unseen in training get tag None (e.g. lowercased proper nouns).
tag=unigram_tagger.tag(wordlist)
print (tag)
# output: [('trump', 'VB'), ('was', 'BEDZ'), ('born', 'VBN'), ('and', 'CC'),
# ..., ('york', None), ..., ('wharton', None), ..., ('2016', None)]
# Export the result (or just leave it here for the follow-up analysis).
#### Write out the tagging result as (word, tag) CSV rows.
import pandas as pd

my_df = pd.DataFrame(tag)
my_df.to_csv('out.csv', index=False, header=False)

### Alternatively, as a one-liner:
pd.DataFrame(tag).to_csv('out1.csv', index=False, header=False)