## 20-newsgroup
from sklearn.datasets import fetch_20newsgroups

# Downloads (and caches under ~/scikit_learn_data) the 20-newsgroups training split.
text = fetch_20newsgroups()
# print() as a function: the Python 2 print statement is a syntax error on Python 3.
print(len(text.data))  # 11314 training documents
11314
## MNIST
import pickle  # cPickle was removed in Python 3; pickle picks the C implementation itself

# Pickles are binary data: the file must be opened in 'rb' (the original's text-mode
# 'r' corrupts the stream / raises on Python 3). The context manager guarantees the
# handle is closed even if unpickling fails.
# encoding='latin1' lets Python 3 load pickles written by Python 2 (mnist.pkl is one).
with open('data/mnist.pkl', 'rb') as fh:
    train_mnist, valid_mnist, test_mnist = pickle.load(fh, encoding='latin1')
print(train_mnist[0].shape)  # (50000, 784): 50k images of 28*28 flattened pixels
(50000, 784)
## human faces
from sklearn.datasets import fetch_olivetti_faces

# 400 grayscale face images flattened to 64*64 = 4096 features each.
faces = fetch_olivetti_faces()
# A bare expression only echoes in an interactive shell; print explicitly so the
# script shows the shape when run non-interactively.
print(faces.data.shape)  # (400, 4096)
(400, 4096)
## time series
# NOTE(review): matplotlib.finance was deprecated in matplotlib 2.0 and removed in
# 2.2, and the Yahoo Finance CSV endpoint it fetched from now returns 404 (see the
# HTTPError traceback that followed this cell). The syntax below is fixed for
# Python 3, but the data source itself must be replaced (e.g. a local CSV cache)
# before this section can run end to end.
import datetime   # was missing: needed by the d1/d2 date bounds below
import numpy as np  # was missing: needed for the array construction/arithmetic below

from matplotlib import finance  # removed in modern matplotlib -- see note above

# Ticker symbol -> company name for the stocks we want daily quotes for.
symbol_dict = {
    'TOT': 'Total',
    'XOM': 'Exxon',
    'CVX': 'Chevron',
    'COP': 'ConocoPhillips',
    'VLO': 'Valero Energy',
    'MSFT': 'Microsoft',
    'IBM': 'IBM',
    'TWX': 'Time Warner',
    'CMCSA': 'Comcast',
    'CVC': 'Cablevision',
    'YHOO': 'Yahoo',
    'DELL': 'Dell',
    'HPQ': 'HP',
    'AMZN': 'Amazon',
    'TM': 'Toyota',
    'CAJ': 'Canon',
    'MTU': 'Mitsubishi',
    'SNE': 'Sony',
    'F': 'Ford',
    'HMC': 'Honda',
    'NAV': 'Navistar',
    'NOC': 'Northrop Grumman',
    'BA': 'Boeing',
    'KO': 'Coca Cola',
    'MMM': '3M',
    'MCD': 'Mc Donalds',
    'PEP': 'Pepsi',
    'MDLZ': 'Kraft Foods',
    'K': 'Kellogg',
    'UN': 'Unilever',
    'MAR': 'Marriott',
    'PG': 'Procter Gamble',
    'CL': 'Colgate-Palmolive',
    'NWS': 'News Corp',
    'GE': 'General Electrics',
    'WFC': 'Wells Fargo',
    'JPM': 'JPMorgan Chase',
    'AIG': 'AIG',
    'AXP': 'American express',
    'BAC': 'Bank of America',
    'GS': 'Goldman Sachs',
    'AAPL': 'Apple',
    'SAP': 'SAP',
    'CSCO': 'Cisco',
    'TXN': 'Texas instruments',
    'XRX': 'Xerox',
    'LMT': 'Lookheed Martin',
    'WMT': 'Wal-Mart',
    'WAG': 'Walgreen',
    'HD': 'Home Depot',
    'GSK': 'GlaxoSmithKline',
    'PFE': 'Pfizer',
    'SNY': 'Sanofi-Aventis',
    'NVS': 'Novartis',
    'KMB': 'Kimberly-Clark',
    'R': 'Ryder',
    'GD': 'General Dynamics',
    'RTN': 'Raytheon',
    'CVS': 'CVS',
    'CAT': 'Caterpillar',
    'DD': 'DuPont de Nemours'}

# A leading-zero literal like 01 is a SyntaxError on Python 3 (octal is 0o1);
# plain 1 is what was meant for January 1st.
d1 = datetime.datetime(2003, 1, 1)
d2 = datetime.datetime(2008, 1, 1)

# dict.items() is a lazy view on Python 3; materialize it so np.array gets a
# sequence of (symbol, name) pairs before transposing into two parallel arrays.
symbols, names = np.array(list(symbol_dict.items())).T

# NOTE(review): this network call is what raised HTTPError 404 below -- the Yahoo
# historical-quotes service this helper scraped no longer exists.
quotes = [finance.quotes_historical_yahoo(symbol, d1, d2, asobject=True)
          for symbol in symbols]

# np.float was removed in NumPy 1.24; the builtin float is the equivalent dtype.
open_quotes = np.array([q.open for q in quotes]).astype(float)
close_quotes = np.array([q.close for q in quotes]).astype(float)

## It seems that the daily variations of the quotes are what carry most information
variation = close_quotes - open_quotes
print(variation.shape)
--------------------------------------------------------------------------- HTTPError Traceback (most recent call last) <ipython-input-9-8c45fc130542> in <module>() 67 symbols, names = np.array(symbol_dict.items()).T 68 quotes = [finance.quotes_historical_yahoo(symbol, d1, d2, asobject=True) ---> 69 for symbol in symbols] 70 open_quotes = np.array([q.open for q in quotes]).astype(np.float) 71 close_quotes = np.array([q.close for q in quotes]).astype(np.float) /usr/lib/pymodules/python2.7/matplotlib/finance.pyc in quotes_historical_yahoo(ticker, date1, date2, asobject, adjusted, cachename) 225 # warnings.warn("Recommend changing to asobject=None") 226 --> 227 fh = fetch_historical_yahoo(ticker, date1, date2, cachename) 228 229 try: /usr/lib/pymodules/python2.7/matplotlib/finance.pyc in fetch_historical_yahoo(ticker, date1, date2, cachename, dividends) 186 else: 187 mkdirs(cachedir) --> 188 urlfh = urlopen(url) 189 190 fh = open(cachename, 'wb') /usr/lib/python2.7/urllib2.pyc in urlopen(url, data, timeout) 125 if _opener is None: 126 _opener = build_opener() --> 127 return _opener.open(url, data, timeout) 128 129 def install_opener(opener): /usr/lib/python2.7/urllib2.pyc in open(self, fullurl, data, timeout) 408 for processor in self.process_response.get(protocol, []): 409 meth = getattr(processor, meth_name) --> 410 response = meth(req, response) 411 412 return response /usr/lib/python2.7/urllib2.pyc in http_response(self, request, response) 521 if not (200 <= code < 300): 522 response = self.parent.error( --> 523 'http', request, response, code, msg, hdrs) 524 525 return response /usr/lib/python2.7/urllib2.pyc in error(self, proto, *args) 446 if http_err: 447 args = (dict, 'default', 'http_error_default') + orig_args --> 448 return self._call_chain(*args) 449 450 # XXX probably also want an abstract factory that knows when it makes /usr/lib/python2.7/urllib2.pyc in _call_chain(self, chain, kind, meth_name, *args) 380 func = getattr(handler, meth_name) 381 --> 382 
result = func(*args) 383 if result is not None: 384 return result /usr/lib/python2.7/urllib2.pyc in http_error_default(self, req, fp, code, msg, hdrs) 529 class HTTPDefaultErrorHandler(BaseHandler): 530 def http_error_default(self, req, fp, code, msg, hdrs): --> 531 raise HTTPError(req.get_full_url(), code, msg, hdrs, fp) 532 533 class HTTPRedirectHandler(BaseHandler): HTTPError: HTTP Error 404: Not Found
## evergreen data
import pandas as pd

# Tab-separated source file; loads as 7395 rows x 27 columns.
df = pd.read_csv('data/evergreen.tsv', sep='\t')
print(df.shape)
# describe() returns a summary DataFrame; print it so the script actually shows it
# (a bare trailing expression only echoes in an interactive session).
print(df.describe())
(7395, 27)
/usr/local/lib/python2.7/dist-packages/pandas/core/config.py:570: DeprecationWarning: height has been deprecated. warnings.warn(d.msg, DeprecationWarning) /usr/local/lib/python2.7/dist-packages/pandas/core/config.py:570: DeprecationWarning: height has been deprecated. warnings.warn(d.msg, DeprecationWarning) /usr/local/lib/python2.7/dist-packages/pandas/core/config.py:570: DeprecationWarning: height has been deprecated. warnings.warn(d.msg, DeprecationWarning)
<class 'pandas.core.frame.DataFrame'> Index: 8 entries, count to max Data columns (total 21 columns): urlid 8 non-null values avglinksize 8 non-null values commonlinkratio_1 8 non-null values commonlinkratio_2 8 non-null values commonlinkratio_3 8 non-null values commonlinkratio_4 8 non-null values compression_ratio 8 non-null values embed_ratio 8 non-null values framebased 8 non-null values frameTagRatio 8 non-null values hasDomainLink 8 non-null values html_ratio 8 non-null values image_ratio 8 non-null values lengthyLinkDomain 8 non-null values linkwordscore 8 non-null values non_markup_alphanum_characters 8 non-null values numberOfLinks 8 non-null values numwords_in_url 8 non-null values parametrizedLinkRatio 8 non-null values spelling_errors_ratio 8 non-null values label 8 non-null values dtypes: float64(21)
## blackbox
import pickle  # cPickle was removed in Python 3

# Binary pickle: open explicitly in 'rb' (the original relied on the default text
# mode, which breaks unpickling on Python 3). encoding='latin1' handles pickles
# written under Python 2, and the context manager closes the handle deterministically.
with open('data/blackbox.pkl', 'rb') as fh:
    black_X, black_y = pickle.load(fh, encoding='latin1')
print(black_X.shape)  # (1000, 1875)
(1000, 1875)