import os
import re
import nltk
import string
import pandas as pd
import numpy as np
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem.wordnet import WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from datetime import timedelta, datetime
# Fetch the NLTK resources used further down: the 'punkt' tokenizer models
# (word_tokenize), the stop-word lists (stopwords.words) and the WordNet
# data (WordNetLemmatizer).
for nltk_resource in ('punkt', 'stopwords', 'wordnet'):
    nltk.download(nltk_resource)
[nltk_data] Downloading package punkt to /Users/liuruizhe/nltk_data... [nltk_data] Package punkt is already up-to-date! [nltk_data] Downloading package stopwords to [nltk_data] /Users/liuruizhe/nltk_data... [nltk_data] Package stopwords is already up-to-date! [nltk_data] Downloading package wordnet to [nltk_data] /Users/liuruizhe/nltk_data... [nltk_data] Package wordnet is already up-to-date!
True
# Load the combined news / DJIA table from the local CSV export and display it.
news_csv_path = '/Users/liuruizhe/Library/Mobile Documents/com~apple~CloudDocs/TMR/Kaggle_DJIA/DJIA/after_Combined_News_DJIA.csv'
data = pd.read_csv(news_csv_path)
data
Unnamed: 0 | Date | Label | Top1 | Top2 | Top3 | Top4 | Top5 | Top6 | Top7 | ... | Top19 | Top20 | Top21 | Top22 | Top23 | Top24 | Top25 | Unnamed: 28 | Unnamed: 29 | fluctuation | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0 | 2008/8/8 | 0 | b georgia downs two russian warplanes as cou... | b breaking musharraf to be impeached | b russia today columns of troops roll into so... | b russian tanks are moving towards the capital... | b afghan children raped with impunity u n o... | b russian tanks have entered south ossetia wh... | b breaking georgia invades south ossetia rus... | ... | b this is a busy day the european union has ... | b georgia will withdraw soldiers from iraq t... | b why the pentagon thinks attacking iran is a ... | b caucasus in crisis georgia invades south os... | b indian shoe manufactory and again in a se... | b visitors suffering from mental illnesses ban... | b no help for mexico s kidnapping surge | NaN | NaN | 0.055658 |
1 | 1 | 2008/8/11 | 1 | b why wont america and nato help us if they w... | b bush puts foot down on georgian conflict | b jewish georgian minister thanks to israeli ... | b georgian army flees in disarray as russians ... | b olympic opening ceremony fireworks faked | b what were the mossad with fraudulent new zea... | b russia angered by israeli military sale to g... | ... | b china to overtake us as largest manufacturer | b war in south ossetia pics | b israeli physicians group condemns state tort... | b russia has just beaten the united states ov... | b perhaps the question about the georgia r... | b russia is so much better at war | b so this is what it s come to trading sex fo... | NaN | NaN | -1.793642 |
2 | 2 | 2008/8/12 | 0 | b remember that adorable year old who sang at... | b russia ends georgia operation | b if we had no sexual harassment we would have... | b al qa eda is losing support in iraq because ... | b ceasefire in georgia putin outmaneuvers the... | b why microsoft and intel tried to kill the xo... | b stratfor the russo georgian war and the bal... | ... | b russia georgia and nato cold war two | b remember that adorable year old who led you... | b war in georgia the israeli connection | b all signs point to the us encouraging georgi... | b christopher king argues that the us and nato... | b america the new mexico | b bbc news asia pacific extinction by man... | NaN | NaN | -0.757185 |
3 | 3 | 2008/8/13 | 0 | b u s refuses israel weapons to attack iran ... | b when the president ordered to attack tskhinv... | b israel clears troops who killed reuters cam... | b britain s policy of being tough on drugs is ... | b body of year old found in trunk latest r... | b china has moved million quake survivors ... | b bush announces operation get all up in russi... | ... | b russian convoy heads into georgia violating... | b israeli defence minister us against strike ... | b gorbachev we had no choice | b witness russian forces head towards tbilisi... | b quarter of russians blame u s for conflict... | b georgian president says us military will ta... | b nobel laureate aleksander solzhenitsyn accu... | NaN | NaN | 1.207153 |
4 | 4 | 2008/8/14 | 1 | b all the experts admit that we should legalis... | b war in south osetia pictures made by a r... | b swedish wrestler ara abrahamian throws away ... | b russia exaggerated the death toll in south o... | b missile that killed inside pakistan may ha... | b rushdie condemns random house s refusal to p... | b poland and us agree to missle defense deal ... | ... | b non media photos of south ossetia georgia c... | b georgian tv reporter shot by russian sniper ... | b saudi arabia mother moves to block child ma... | b taliban wages war on humanitarian aid workers | b russia world can forget about georgia s ... | b darfur rebels accuse sudan of mounting major... | b philippines peace advocate say muslims nee... | NaN | NaN | 1.795697 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
1984 | 1984 | 2016/6/27 | 0 | barclays and rbs shares suspended from trading... | pope says church should ask forgiveness from g... | poland shocked by xenophobic abuse of poles ... | there will be no second referendum cabinet ag... | scotland welcome to join eu merkel ally says | sterling dips below friday s year low amid br... | no negative news about south african president... | ... | turkey sorry for downing russian jet | edward snowden lawyer vows new push for pardon... | brexit opinion poll reveals majority don t wan... | conservative mp leave campaigner the leave c... | economists predict UNITED KINGDOM recession f... | new eu superstate plan by france germany cr... | pakistani clerics declare transgender marriage... | NaN | NaN | NaN |
1985 | 1985 | 2016/6/28 | 1 | scientists to australia if you want to save... | the personal details of french police office... | s amp p cuts united kingdom sovereign credit r... | huge helium deposit found in africa | ceo of the south african state broadcaster qui... | brexit cost investors trillion the worst on... | hong kong democracy activists call for return ... | ... | YEAR old skull from borneo reveals surprise f... | palestinians stone western wall worshipers po... | jean claude juncker asks farage why are you h... | romanians for remainians offering a new home... | brexit gibraltar in talks with scotland to st... | suicide bombers strike lebanon | mexico s security forces routinely use sexual... | NaN | NaN | NaN |
1986 | 1986 | 2016/6/29 | 1 | explosion at airport in istanbul | yemeni former president terrorism is the offs... | UNITED KINGDOM must accept freedom of movement... | devastated scientists too late to captive bre... | british labor party leader jeremy corbyn loses... | a muslim shop in the UNITED KINGDOM was just f... | mexican authorities sexually torture women in ... | ... | emaciated lions in taiz zoo are trapped in blo... | rupert murdoch describes brexit as wonderful ... | more than killed in yemen suicide attacks | google found disastrous symantec and norton vu... | extremist violence on the rise in germany dom... | bbc news labour mps pass corbyn no confidence... | tiny new zealand town with too many jobs lau... | NaN | NaN | NaN |
1987 | 1987 | 2016/6/30 | 1 | jamaica proposes marijuana dispensers for tour... | stephen hawking says pollution and stupidity ... | boris johnson says he will not run for tory pa... | six gay men in ivory coast were abused and for... | switzerland denies citizenship to muslim immig... | palestinian terrorist stabs israeli teen girl ... | puerto rico will default on billion of debt ... | ... | calls to suspend saudi arabia from un human ri... | more than nobel laureates call out greenpeac... | british pedophile sentenced to years in us f... | us permitted offshore fracks in gulf of mexi... | we will be swimming in ridicule french beach... | uefa says no minutes of silence for istanbul v... | law enforcement sources gun used in paris ter... | NaN | NaN | NaN |
1988 | 1988 | 2016/7/1 | 1 | a year old woman in mexico city finally recei... | imf chief backs athens as permanent olympic host | the president of france says if brexit won so... | british man who must give police hours noti... | nobel laureates urge greenpeace to stop oppo... | brazil huge spike in number of police killing... | austria s highest court annuls presidential el... | ... | u s sailors detained by iran spoke too much u... | mass fish kill in vietnam solved as taiwan ste... | philippines president rodrigo duterte urges pe... | spain arrests three pakistanis accused of prom... | venezuela where anger over food shortages is ... | a hindu temple worker has been killed by three... | ozone layer hole seems to be healing us amp... | NaN | NaN | NaN |
1989 rows × 31 columns
def preprocess(processdata):
    """Turn each row's news headlines into one cleaned, lemmatized string.

    For every row of ``processdata`` the headline columns (those named
    ``Top<number>``) are joined into a single lower-cased text, every
    non-letter character is replaced by a space, the text is tokenized,
    English stop words and single letters are removed, and each remaining
    token is lemmatized first as a noun and then as a verb.

    The headline columns are selected by name rather than by position, so
    the result does not depend on how many index/label columns precede
    them, and every ``Top<number>`` column present is used. The input
    frame is not modified.

    Parameters
    ----------
    processdata : pandas.DataFrame
        Frame containing at least one ``Top<number>`` headline column.

    Returns
    -------
    list of str
        One string per input row, in row order. Tokens are separated by a
        single space and each token is followed by a space (so a non-empty
        result ends with a trailing space); a row with no surviving tokens
        yields ``""``.

    Raises
    ------
    KeyError
        If ``processdata`` has no ``Top<number>`` column.
    """
    # Pick the headline columns by name; positional slicing breaks as soon
    # as the CSV carries an extra leading column.
    headline_cols = [
        col for col in processdata.columns
        if re.fullmatch(r'Top\d+', str(col))
    ]
    if not headline_cols:
        raise KeyError("no 'Top<number>' headline columns found")

    # Stop words plus every single letter (left behind once punctuation
    # and digits are blanked out). A set gives O(1) membership tests.
    en_stops = set(stopwords.words('english')) | set(string.ascii_lowercase)
    lemmatizer = WordNetLemmatizer()  # build once, not once per word

    final_processdata_headlines = []
    rows = processdata[headline_cols].itertuples(index=False, name=None)
    for row in rows:
        # Build one document per day; skip missing headlines so they do
        # not show up as the literal token "nan".
        text = ' '.join(str(cell) for cell in row if pd.notna(cell))
        # Lower-case before stop-word filtering so upper-case stop words
        # are removed too.
        text = text.lower()
        # Replace everything that is not an ASCII letter with a space.
        text = re.sub(r'[^A-Za-z]', " ", text)
        # Tokenize.
        tokens = word_tokenize(text)
        # Remove stop words and single letters.
        tokens = [w for w in tokens if w not in en_stops]
        # Reduce words to their base form: noun pass, then verb pass.
        tokens = [lemmatizer.lemmatize(w) for w in tokens]
        tokens = [lemmatizer.lemmatize(w, pos='v') for w in tokens]
        # Re-assemble the document; every token is followed by one space.
        final_processdata_headlines.append("".join(w + " " for w in tokens))
    return final_processdata_headlines
# Score the data with TF-IDF weights.
# The 'Date' column holds strings such as '2014/3/5'. Comparing those
# strings with '2015-01-01' / '2014-12-31' is lexicographic, and '/' sorts
# after '-', so every 2014 row would end up in BOTH train and test. Parse
# the dates and split on a single cutoff instead.
parsed_dates = pd.to_datetime(data['Date'])
split_date = pd.Timestamp('2015-01-01')
# .copy() gives independent frames rather than slices of `data`.
train = data[parsed_dates < split_date].copy()
test = data[parsed_dates >= split_date].copy()
final_traindata = preprocess(train)
final_testdata = preprocess(test)
# Bigram TF-IDF features; the vocabulary is fitted on the training set only
# and then applied unchanged to the test set.
tfidf_vector = TfidfVectorizer(min_df=0.01, max_df=0.99, max_features=160, ngram_range=(2, 2))
final_traindata_tfidf = tfidf_vector.fit_transform(final_traindata)
final_testdata_tfidf = tfidf_vector.transform(final_testdata)
/Users/liuruizhe/miniforge3/lib/python3.9/site-packages/pandas/core/frame.py:3636: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy self[k1] = value[k2]
# Print every term together with its TF-IDF weight summed over the
# training documents.
# get_feature_names() was removed in scikit-learn 1.2; prefer the
# replacement and fall back for older installations.
if hasattr(tfidf_vector, 'get_feature_names_out'):
    terms = list(tfidf_vector.get_feature_names_out())
else:
    terms = tfidf_vector.get_feature_names()
sums = final_traindata_tfidf.sum(axis=0)
# Use a dedicated name here: the original `data = []` overwrote the
# DataFrame `data` loaded from the CSV.
term_scores = [(term, sums[0, col]) for col, term in enumerate(terms)]
ranking = pd.DataFrame(term_scores, columns=['term', 'tfidf'])
print(ranking)
term tfidf 0 air force 15.454863 1 air strike 22.767039 2 al jazeera 38.015786 3 al qaeda 40.412949 4 al qaida 19.172821 .. ... ... 155 world war 28.663142 156 year ago 32.269653 157 year jail 16.555130 158 year old 110.372398 159 year prison 26.403931 [160 rows x 2 columns]
# Show the processed training set as a matrix: one row per training
# document, one column per TF-IDF term.
# Convert the TF-IDF result to a dense matrix, then to a nested Python list.
dense = final_traindata_tfidf.todense()
denselist = dense.tolist()
# Label the columns with the vectorizer's terms and display the table.
df2 = pd.DataFrame(denselist, columns=terms)
df2
air force | air strike | al jazeera | al qaeda | al qaida | amnesty international | anti gay | around world | australian government | barack obama | ... | world biggest | world cup | world first | world largest | world news | world war | year ago | year jail | year old | year prison | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0.0 | 0.00000 | 0.0 | 0.361791 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.000000 | ... | 0.0 | 0.000000 | 0.0 | 0.0 | 0.000000 | 0.400727 | 0.0 | 0.00000 | 0.228241 | 0.0 |
1 | 0.0 | 0.00000 | 0.0 | 0.000000 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.000000 | ... | 0.0 | 0.000000 | 0.0 | 0.0 | 0.000000 | 0.671348 | 0.0 | 0.00000 | 0.000000 | 0.0 |
2 | 0.0 | 0.00000 | 0.0 | 0.000000 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.000000 | ... | 0.0 | 0.000000 | 0.0 | 0.0 | 0.000000 | 0.000000 | 0.0 | 0.00000 | 0.790220 | 0.0 |
3 | 0.0 | 0.00000 | 0.0 | 0.000000 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.000000 | ... | 0.0 | 0.000000 | 0.0 | 0.0 | 0.000000 | 0.000000 | 0.0 | 0.00000 | 0.337538 | 0.0 |
4 | 0.0 | 0.00000 | 0.0 | 0.000000 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.000000 | ... | 0.0 | 0.000000 | 0.0 | 0.0 | 0.375107 | 0.000000 | 0.0 | 0.00000 | 0.000000 | 0.0 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
1606 | 0.0 | 0.00000 | 0.0 | 0.000000 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.000000 | ... | 0.0 | 0.780427 | 0.0 | 0.0 | 0.000000 | 0.000000 | 0.0 | 0.00000 | 0.000000 | 0.0 |
1607 | 0.0 | 0.00000 | 0.0 | 0.000000 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.000000 | ... | 0.0 | 0.000000 | 0.0 | 0.0 | 0.000000 | 0.000000 | 0.0 | 0.00000 | 0.000000 | 0.0 |
1608 | 0.0 | 0.00000 | 0.0 | 0.000000 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.607745 | ... | 0.0 | 0.000000 | 0.0 | 0.0 | 0.000000 | 0.000000 | 0.0 | 0.00000 | 0.000000 | 0.0 |
1609 | 0.0 | 0.00000 | 0.0 | 0.000000 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.000000 | ... | 0.0 | 0.000000 | 0.0 | 0.0 | 0.412763 | 0.000000 | 0.0 | 0.41653 | 0.000000 | 0.0 |
1610 | 0.0 | 0.38031 | 0.0 | 0.000000 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.000000 | ... | 0.0 | 0.000000 | 0.0 | 0.0 | 0.000000 | 0.000000 | 0.0 | 0.00000 | 0.000000 | 0.0 |
1611 rows × 160 columns