import nltk
nltk.download()

# fetch a raw web page
from urllib import urlopen
url = "http://venturebeat.com/2014/07/04/facebooks-little-social-experiment-got-you-bummed-out-get-over-it/"
html = urlopen(url).read()
html[:500]

# strip HTML markup with NLTK (works in NLTK 2.x only)
text = nltk.clean_html(html)
text[:500]

# extract the main article text and title with readability, then parse with BeautifulSoup
from readability.readability import Document
from bs4 import BeautifulSoup

readable_article = Document(html).summary()
readable_title = Document(html).title()
soup = BeautifulSoup(readable_article)
print '*** TITLE *** \n"' + readable_title + '"\n'
print '*** CONTENT *** \n"' + soup.text[:500] + '[...]"'

# split the article into sentences, then into word tokens
tokens = [word for sent in nltk.sent_tokenize(soup.text) for word in nltk.word_tokenize(sent)]
for token in sorted(set(tokens))[:30]:
    print token + ' [' + str(tokens.count(token)) + ']'

# stem the tokens with the Snowball stemmer
from nltk.stem.snowball import SnowballStemmer
stemmer = SnowballStemmer("english")
stemmed_tokens = [stemmer.stem(t) for t in tokens]
for token in sorted(set(stemmed_tokens))[50:75]:
    print token + ' [' + str(stemmed_tokens.count(token)) + ']'

# compare stemming against WordNet lemmatization
lemmatizer = nltk.WordNetLemmatizer()
temp_sent = "Several women told me I have lying eyes."
print [stemmer.stem(t) for t in nltk.word_tokenize(temp_sent)]
print [lemmatizer.lemmatize(t) for t in nltk.word_tokenize(temp_sent)]

# frequency distribution of the stemmed tokens
fdist = nltk.FreqDist(stemmed_tokens)
for item in fdist.items()[:25]:
    print item

# the first few English stopwords
sorted(nltk.corpus.stopwords.words('english'))[:25]

# drop stopwords (the tokens are already stemmed, so no need to stem again) and recount
stemmed_tokens_no_stop = [t for t in stemmed_tokens if t not in nltk.corpus.stopwords.words('english')]
fdist2 = nltk.FreqDist(stemmed_tokens_no_stop)
for item in fdist2.items()[:25]:
    print item

# named-entity extraction with NLTK's built-in chunker
def extract_entities(text):
    entities = []
    for sentence in nltk.sent_tokenize(text):
        chunks = nltk.ne_chunk(nltk.pos_tag(nltk.word_tokenize(sentence)))
        entities.extend([chunk for chunk in chunks if hasattr(chunk, 'node')])
    return entities

for entity in extract_entities('My name is Charlie and I work for Altamira in Tysons Corner.'):
    print '[' + entity.node + '] ' + ' '.join(c[0] for c in entity.leaves())

# named-entity recognition with the Stanford NER tagger
from nltk.tag.stanford import NERTagger

# change the paths below to point to wherever you unzipped the Stanford NER download file
st = NERTagger('/Users/cgreenba/stanford-ner/classifiers/english.all.3class.distsim.crf.ser.gz',
               '/Users/cgreenba/stanford-ner/stanford-ner.jar', 'utf-8')

for i in st.tag('Up next is Tommy, who works at STPI in Washington.'.split()):
    print '[' + i[1] + '] ' + i[0]

# automatic summarization: start with the Reuters corpus
from nltk.corpus import reuters
print '** BEGIN ARTICLE: ** "' + reuters.raw(reuters.fileids()[0])[:500] + ' [...]"'

import datetime, re, sys
from sklearn.feature_extraction.text import TfidfVectorizer

def tokenize_and_stem(text):
    tokens = [word for sent in nltk.sent_tokenize(text) for word in nltk.word_tokenize(sent)]
    filtered_tokens = []
    # filter out any tokens not containing letters (e.g., numeric tokens, raw punctuation)
    for token in tokens:
        if re.search('[a-zA-Z]', token):
            filtered_tokens.append(token)
    stems = [stemmer.stem(t) for t in filtered_tokens]
    return stems

token_dict = {}
for article in reuters.fileids():
    token_dict[article] = reuters.raw(article)

# build a tf-idf term-document matrix over the whole corpus
tfidf = TfidfVectorizer(tokenizer=tokenize_and_stem, stop_words='english', decode_error='ignore')
print 'building term-document matrix... [process started: ' + str(datetime.datetime.now()) + ']'
sys.stdout.flush()

tdm = tfidf.fit_transform(token_dict.values())  # this can take some time (about 60 seconds on my machine)
print 'done! [process finished: ' + str(datetime.datetime.now()) + ']'
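# A quick aside on what fit_transform just produced (explanatory note, not part of the
# original listing): tdm is a sparse matrix with one row per Reuters document and one
# column per stemmed term. Each cell holds a tf-idf weight: the term's count in that
# document, scaled up for terms that appear in few documents and down for terms that
# appear in many. With scikit-learn's defaults this is roughly
#   tfidf(t, d) = tf(t, d) * (ln((1 + n_docs) / (1 + df(t))) + 1)
# followed by L2 normalization of each row.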
# inspect the term-document matrix
from random import randint

feature_names = tfidf.get_feature_names()
print 'TDM contains ' + str(len(feature_names)) + ' terms and ' + str(tdm.shape[0]) + ' documents'
print 'first term: ' + feature_names[0]
print 'last term: ' + feature_names[len(feature_names) - 1]
for i in range(0, 4):
    print 'random term: ' + feature_names[randint(1, len(feature_names) - 2)]

# score each sentence of a randomly chosen article by the average tf-idf weight of its terms
import math
from __future__ import division  # in a standalone script, this __future__ import must sit at the very top of the file

article_id = randint(0, tdm.shape[0] - 1)
# row order of tdm follows token_dict.values(), so pull the raw text from there
# (reuters.fileids() is not guaranteed to be in the same order as the dict's values)
article_text = token_dict.values()[article_id]

sent_scores = []
for sentence in nltk.sent_tokenize(article_text):
    score = 0
    sent_tokens = tokenize_and_stem(sentence)
    for token in (t for t in sent_tokens if t in feature_names):
        score += tdm[article_id, feature_names.index(token)]
    sent_scores.append((score / len(sent_tokens), sentence))

# keep the top-scoring 20% of sentences as the summary
summary_length = int(math.ceil(len(sent_scores) / 5))
sent_scores.sort(key=lambda sent: sent[0], reverse=True)

print '*** SUMMARY ***'
for summary_sentence in sent_scores[:summary_length]:
    print summary_sentence[1]

print '\n*** ORIGINAL ***'
print article_text
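# ---------------------------------------------------------------------------
# Porting note (not part of the original listing): the walkthrough above assumes
# Python 2 with NLTK 2.x and an older scikit-learn. Under Python 3 / NLTK 3,
# a few calls have changed: nltk.clean_html() was removed (NLTK now points you
# to BeautifulSoup's get_text()), Tree.node became Tree.label(), NERTagger was
# renamed StanfordNERTagger, FreqDist.items() is no longer a frequency-sorted
# list (use FreqDist.most_common(n)), and recent scikit-learn renames
# TfidfVectorizer.get_feature_names() to get_feature_names_out(). A minimal
# sketch of the entity-extraction step under those assumptions (the function
# name extract_entities_py3 is illustrative, not from the original):

import nltk

def extract_entities_py3(text):
    entities = []
    for sentence in nltk.sent_tokenize(text):
        chunks = nltk.ne_chunk(nltk.pos_tag(nltk.word_tokenize(sentence)))
        # in NLTK 3, named-entity chunks are nltk.Tree nodes; plain tokens are tuples
        entities.extend(chunk for chunk in chunks if isinstance(chunk, nltk.Tree))
    return entities

for entity in extract_entities_py3('My name is Charlie and I work for Altamira in Tysons Corner.'):
    print('[' + entity.label() + '] ' + ' '.join(c[0] for c in entity.leaves()))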