!pip install nltk
!pip install gensim
!pip install spacy
!python -m spacy download en
import nltk
import gensim
import spacy
import requests
# Download the 'punkt' data package into the local nltk_data directory.
# NOTE(review): presumably needed by nltk.sent_tokenize in the benchmark below — confirm.
nltk.download('punkt')
[nltk_data] Downloading package punkt to /root/nltk_data... [nltk_data] Package punkt is already up-to-date!
True
# Download the plain-text synopsis that serves as the benchmark input.
text_url = 'http://rare-technologies.com/the_matrix_synopsis.txt'
# A timeout keeps the notebook from hanging forever on an unresponsive server.
response = requests.get(text_url, timeout=30)
# Fail loudly on an HTTP error instead of silently benchmarking an error page.
response.raise_for_status()
text = response.text
words
# Approximate word count: splitting on a single space means words separated
# only by a newline stay joined, and runs of spaces contribute empty strings.
len(text.split(' '))
6348
lines
# Line count: number of '\n'-separated pieces (a trailing newline adds one
# empty piece to the total).
len(text.split('\n'))
195
%%timeit -r 10
# Time gensim's sentence splitter on the full text (-r 10: ten repeats).
# NOTE(review): gensim.summarization is not available in gensim 4.x; this cell
# appears to need gensim<4 — confirm the installed version.
gensim.summarization.textcleaner.split_sentences(text)
100 loops, best of 10: 4.27 ms per loop
%%timeit -r 10
# Time NLTK's sentence tokenizer on the same text (-r 10: ten repeats).
nltk.sent_tokenize(text)
100 loops, best of 10: 14.8 ms per loop
# Load the English pipeline once, outside the timed cell, so that model
# loading is not included in the measurement below.
# NOTE(review): the 'en' shortcut name is not supported by spaCy 3.x, which
# expects a full package name such as 'en_core_web_sm' — confirm the version.
nlp = spacy.load('en')
%%timeit
nlp(text).sents
1 loop, best of 3: 1.96 s per loop