from sklearn.datasets import fetch_20newsgroups
news = fetch_20newsgroups(subset='train')
data = news.data[:1000]
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
data_tokenized = []
for text in data:
text = text.lower()
#tokenize
tokens = word_tokenize(text)
#remove stopwords
filtered = [word for word in tokens if word not in stopwords.words('English')]
#stemmerize
ps = PorterStemmer()
filtered = [ps.stem(w) for w in tokens]
data_tokenized.append(' '.join(filtered))
# show a sample result
print data_tokenized[:1]
[u"from : lerxst @ wam.umd.edu ( where 's my thing ) subject : what car is thi ! ? nntp-posting-host : rac3.wam.umd.edu organ : univers of maryland , colleg park line : 15 i wa wonder if anyon out there could enlighten me on thi car i saw the other day . it wa a 2-door sport car , look to be from the late 60s/ earli 70 . it wa call a bricklin . the door were realli small . in addit , the front bumper wa separ from the rest of the bodi . thi is all i know . if anyon can tellm a model name , engin spec , year of product , where thi car is made , histori , or whatev info you have on thi funki look car , pleas e-mail . thank , - il -- -- brought to you by your neighborhood lerxst -- --"]
When building the vocabulary ignore terms that have a document frequency strictly higher than the given threshold (corpus-specific stop words). If float, the parameter represents a proportion of documents, integer absolute counts. This parameter is ignored if vocabulary is not None. - min_df : float in range [0.0, 1.0] or int, default=1 When building the vocabulary ignore terms that have a document frequency strictly lower than the given threshold. This value is also called cut-off in the literature. If float, the parameter represents a proportion of documents, integer absolute counts. This parameter is ignored if vocabulary is not None. - max_features : int or None, default=None If not None, build a vocabulary that only consider the top max_features ordered by term frequency across the corpus. This parameter is ignored if vocabulary is not None.
#vectorize text
from sklearn.feature_extraction.text import CountVectorizer
tf_vectorizer = CountVectorizer(max_df=0.95, min_df=2,
stop_words='english')
tf = tf_vectorizer.fit_transform(data_tokenized)
# store the Count Vectoerizer with joblib, so when run second time, the code above can be commented away.
from sklearn.externals import joblib #也可以选择pickle等保存模型,请随意
joblib.dump(tf_vectorizer,'model.pkl' )
# #得到存储的tf_vectorizer,节省预处理时间
# tf_vectorizer = joblib.load(tf_ModelPath)
# tf = tf_vectorizer.fit_transform(docLst)
['model.pkl']
from sklearn.decomposition import LatentDirichletAllocation
n_topic = 20
lda = LatentDirichletAllocation(n_topics=n_topic,
max_iter=50,
learning_method='batch')
lda.fit(tf) #tf即为Document_word Sparse Matrix
/Users/zjm/anaconda/lib/python2.7/site-packages/sklearn/decomposition/online_lda.py:294: DeprecationWarning: n_topics has been renamed to n_components in version 0.19 and will be removed in 0.21 DeprecationWarning)
LatentDirichletAllocation(batch_size=128, doc_topic_prior=None, evaluate_every=-1, learning_decay=0.7, learning_method='batch', learning_offset=10.0, max_doc_update_iter=100, max_iter=50, mean_change_tol=0.001, n_components=10, n_jobs=1, n_topics=20, perp_tol=0.1, random_state=None, topic_word_prior=None, total_samples=1000000.0, verbose=0)
def print_top_words(model, feature_names, n_top_words):
#打印每个主题下权重较高的term
for topic_idx, topic in enumerate(model.components_):
print "Topic #%d:" % topic_idx
print " ".join([feature_names[i]
for i in topic.argsort()[:-n_top_words - 1:-1]])
print
#打印主题-词语分布矩阵
print model.components_
n_top_words=20
tf_feature_names = tf_vectorizer.get_feature_names()
print_top_words(lda, tf_feature_names, n_top_words)
Topic #0: edu com thi jake indiana new ini write univers ha use comput articl doe host secur duo opinion depart york Topic #1: edu wa columbia com cc land jew cunixa posting nntp ani host write uoknor callison arab did dyer hi articl Topic #2: wa peopl hi say thi know come becaus want brian ha man look did time way ve happen day whi Topic #3: edu toronto henri thi just zoo reserv work state spencer wa write adam alaska use ohio like colorado posting nntp Topic #4: netcom com 408 guest servic commun 241 ca 9760 list request drug clipper use electron chip lin thi wa harley Topic #5: com thi window weapon israel write attack articl stratus ani civilian arab say right know doe ha edu manag onli Topic #6: edu game ca cs team season write articl pitt pittsburgh play player cmu new nntp posting playoff host univers comput Topic #7: wa thi health use tobacco like ani 1993 year diseas smokeless report com david medic case age state person articl Topic #8: thi wa edu think peopl write articl human say god believ natur whi becaus time make like did just ha Topic #9: com key thi nasa space astronaut encrypt research chip use shuttl govern ha bit clipper applic select mission need want Topic #10: wa edu weaver com right sandvik ca thi cooper arm write apple draw object trial spenc articl kent peopl posting Topic #11: edu scsi sale host simm posting nntp control ide disk mac drive com univers thi distribut comput work new card Topic #12: period 10 12 11 pp power play 14 19 20 15 orbit 28 18 13 scorer pt 17 93 23 Topic #13: edu thi use window problem file ani com univers pleas ca thank anyon program write need comput time imag help Topic #14: edu com thi wa write articl like good thing state make onli know att want ani way think number game Topic #15: armenian turkish wa genocid peopl turk soviet argic serdar muslim greek govern armenia russian kill war 000 thi popul world Topic #16: moral edu thi father keith wa write spirit son doe say ha use know think caltech com articl engin nntp Topic #17: edu com wa articl valu write scienc gay optilink homosexu use thi cramer did ca think univers men post uchicago Topic #18: edu thi write ca articl com wa motif power univers option uiuc doe revolv rushdi look host posting nntp like Topic #19: thi write ___ univers articl com edu ca gov __ nasa ibm jpl did like just professor wa know ha Topic #20: edu com thi write cs wa ha posting nntp host year articl univers morri team did run usa ibm distribut Topic #21: ax max 145 1t 04 ql 0d 3t wm cx tm 0t 0m sl gy bj gk 34 p2 m_ Topic #22: mu mv m0 m9 mt mp __ mh m8 mw mi mz md 22 mj 1x odomet lm mf h0 Topic #23: thi know edu point com just ani like write think peopl murder ha possibl object doe onli articl univers person Topic #24: wa jesu hi thi matthew said time propheci gun peopl day ha psalm john king messiah prophet onli hear gospel Topic #25: com wa hp edu thi ani softwar write peopl use just articl ti realli process level dseg quack post veri Topic #26: good veri 50 edu excel colorado miss tn geoffrey fair thi 00 com 75 mane modul cover homicid gun wa Topic #27: thi god christian edu argument say peopl know wa believ write bibl true truth ha doe exampl becaus hi think Topic #28: den p2 p3 p1 wa com cool nuclear p4 doubl radiu thi water edu approv plant tower know au n2 Topic #29: car 00 year new edu game wa card speed state insur drive driver use 15 rate color modem price buy [[ 3.33333333e-02 3.33333333e-02 3.33333333e-02 ..., 3.33333333e-02 3.33333333e-02 3.33333333e-02] [ 3.74148502e+00 2.55934619e+00 3.33333333e-02 ..., 3.33333333e-02 3.33333333e-02 3.33333333e-02] [ 3.33333333e-02 3.33333333e-02 3.33333333e-02 ..., 3.33333333e-02 3.33333333e-02 3.33333333e-02] ..., [ 9.97698755e+00 1.26471312e+01 3.33333333e-02 ..., 3.33333333e-02 3.33333333e-02 3.33333333e-02] [ 3.33333333e-02 3.33333333e-02 3.33333333e-02 ..., 3.33333333e-02 3.33333333e-02 1.03333333e+00] [ 7.70723979e+01 1.16162932e+01 3.33333333e-02 ..., 3.33333333e-02 3.33333333e-02 3.33333333e-02]]