# Read the sample text from disk.
# NOTE(review): original used Python 2 `print text`, a syntax error under
# Python 3 and inconsistent with the `print(...)` call used later in this
# transcript; modernized. Explicit encoding avoids platform-dependent decoding.
with open("short.txt", 'r', encoding="utf-8") as f:
    text = f.read()
print(text)
Mr. and Mrs. Dursley, of number four, Privet Drive, were proud to say that they were perfectly normal, thank you very much. They were the last people you'd expect to be involved in anything strange or mysterious, because they just didn't hold with such nonsense.
# Baseline tokenization: str.split() breaks on whitespace only, so
# punctuation stays attached to words ('Dursley,', 'much.') — see the
# output immediately below.
text.split()
['Mr.', 'and', 'Mrs.', 'Dursley,', 'of', 'number', 'four,', 'Privet', 'Drive,', 'were', 'proud', 'to', 'say', 'that', 'they', 'were', 'perfectly', 'normal,', 'thank', 'you', 'very', 'much.', 'They', 'were', 'the', 'last', 'people', "you'd", 'expect', 'to', 'be', 'involved', 'in', 'anything', 'strange', 'or', 'mysterious,', 'because', 'they', 'just', "didn't", 'hold', 'with', 'such', 'nonsense.']
# scikit-learn's default tokenizer (CountVectorizer's token_pattern,
# r"(?u)\b\w\w+\b") keeps only word characters and requires 2+ of them,
# so punctuation is stripped and contractions are mangled — note that
# "didn't" becomes just 'didn' in the output below.
from sklearn.feature_extraction.text import CountVectorizer
CountVectorizer().build_tokenizer()(text)
['Mr', 'and', 'Mrs', 'Dursley', 'of', 'number', 'four', 'Privet', 'Drive', 'were', 'proud', 'to', 'say', 'that', 'they', 'were', 'perfectly', 'normal', 'thank', 'you', 'very', 'much', 'They', 'were', 'the', 'last', 'people', 'you', 'expect', 'to', 'be', 'involved', 'in', 'anything', 'strange', 'or', 'mysterious', 'because', 'they', 'just', 'didn', 'hold', 'with', 'such', 'nonsense']
# NLTK's word_tokenize splits punctuation into separate tokens and
# handles contractions linguistically: "didn't" -> 'did' + "n't",
# "you'd" -> 'you' + "'d" — see the output below.
from nltk.tokenize import word_tokenize
word_tokenize(text)
['Mr.', 'and', 'Mrs.', 'Dursley', ',', 'of', 'number', 'four', ',', 'Privet', 'Drive', ',', 'were', 'proud', 'to', 'say', 'that', 'they', 'were', 'perfectly', 'normal', ',', 'thank', 'you', 'very', 'much', '.', 'They', 'were', 'the', 'last', 'people', 'you', "'d", 'expect', 'to', 'be', 'involved', 'in', 'anything', 'strange', 'or', 'mysterious', ',', 'because', 'they', 'just', 'did', "n't", 'hold', 'with', 'such', 'nonsense', '.']
from nltk.stem.snowball import SnowballStemmer
# List the languages Snowball supports; the output below includes
# 'porter', the original (English-only) Porter stemmer.
print(" ".join(SnowballStemmer.languages))
danish dutch english finnish french german hungarian italian norwegian porter portuguese romanian russian spanish swedish
# Create a new instance of a language-specific subclass.
stemmer = SnowballStemmer("english")
# NOTE(review): original used Python 2 `print stemmer.stem(...)`, a syntax
# error under Python 3 and inconsistent with the `print(...)` call above;
# modernized. Stemming reduces 'generously' to its stem 'generous'.
print(stemmer.stem('generously'))
generous
"Splitting a long text into smaller samples is a common task in text analysis. As most kinds of quantitative text analysis take as inputs an unordered list of words, breaking a text up into smaller chunks allows one to preserve context that would otherwise be discarded; observing two words together in a paragraph-sized chunk of text tells us much more about the relationship between those two words than observing two words occurring together in an 100,000 word book. Or, as we will be using a selection of tragedies as our examples, we might consider the difference between knowing that two character names occur in the same scene versus knowing that the two names occur in the same play."