#!/usr/bin/env python
# coding: utf-8

# # Walkthrough for the S3 helper functions

# Forget where you put some sensitive information on your cloud database?
# Tired of downloading files from S3 only to preview them in Excel or TextPad?
# Want to touch and reshape data, but feel it's caged off from you?
# Look no further: your S3 problems are solved!

# ## Table of Contents
#
# This Jupyter notebook gives a walkthrough of several handy functions from the s3 module.
# These functions are designed to mirror the standard library where possible, while employing the backends of popular open-source projects.
#
# The notebook highlights seven functions:
#
# 1. List files (with wildcards) in an S3 bucket/key using ls()
# 2. Read files into a string or bytes using read() and open()
# 3. Read CSV and JSON files on S3 into Pandas dataframes using read_csv() and read_json()
# 4. Write CSV and JSON files from Pandas dataframes to S3 using to_csv() and to_json()
# 5. Write local files to S3 using disk_2_s3()
# 6. Save and load Scikit-Learn classifiers
# 7. Move files to new buckets and keys using mv()
#
# The only requirements are setting AWS environment variables (or configuring the AWS CLI) and installing the modules in `requirements.txt`.
#
# For this tutorial, we'll use the red wine quality dataset from the UCI Center for Machine Learning and Intelligent Systems.

# In[1]:

import os
import s3
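# For example, credentials can be supplied through the standard AWS environment variables. This is just a sketch -- the values below are placeholders, and you can skip it entirely if you've already run `aws configure`.

# In[ ]:

# Placeholder credentials -- replace with your own, or configure them once
# with the AWS CLI and skip this cell.
os.environ['AWS_ACCESS_KEY_ID'] = 'YOUR_ACCESS_KEY_ID'
os.environ['AWS_SECRET_ACCESS_KEY'] = 'YOUR_SECRET_ACCESS_KEY'
os.environ['AWS_DEFAULT_REGION'] = 'us-east-1'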

# ## Listing files in an S3 bucket and key using ls()
#
# s3.ls() lists all the files and directories in a bucket/key, akin to os.listdir().
# See the code:

# In[2]:

s3_path = 's3://prod-datalytics/playground/'

# It takes a bucket, or a bucket and key pair.

# In[3]:

s3.ls(s3_path)

# s3.ls() also supports Unix shell-style wildcard patterns, just like glob.glob().

# In[4]:

s3.ls(s3_path + '*.csv')

# With a programmatic way of getting S3 file paths, we can start doing some cool stuff.
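# For instance, here's a quick sketch (using the same s3_path as above) that groups the listing by file extension, so each format can be routed to the right reader later on.

# In[ ]:

from collections import defaultdict

# Bucket the listed paths by their file extension.
files_by_ext = defaultdict(list)
for path in s3.ls(s3_path):
    files_by_ext[path.rsplit('.', 1)[-1]].append(path)

dict(files_by_ext)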
# top

# ## Read files in S3 with open()
#
# See the code:

# In[5]:

f = s3.ls(s3_path + '*.csv')[0]
f

# We can open the file as a streaming body of bytes.

# In[6]:

s3.open(f)

# This is sometimes helpful, but typically we want to read a file the way Python's native open() does:
#
#     with open(filename, 'r') as f:
#         f.read()

# In[7]:

s3.read(f, encoding='utf-8')[:200]  # displays the first 200 characters

# For more structured data, we can leverage Pandas' parsing engines...
# top

# ## Read S3 files to memory with read_csv() and read_json()

# s3.read_csv() and s3.read_json() are identical to their Pandas ancestors and backbone.
# Using these handy functions, you get data displayed in a nice tabular format.
# See the code:

# In[8]:

df = s3.read_csv(f, sep=',')
df.head(3)

# A CSV is the simplest use case; we can handle alternative delimiters and JSON files too.

# In[9]:

files = s3.ls(s3_path)
files

# Here are tab-separated values (TSV).

# In[10]:

print("We can read the {} tsv easily.".format(files[-1]))
df = s3.read_csv(files[-1], sep='\t')
df.tail(3)

# Here's a JSON file.

# In[11]:

print("We can also read the {} file easily.".format(files[0]))
df = s3.read_json(files[0])
df.sample(3)

# They're actually all the same file -- just in different formats!
# If you're new to Pandas, you'll be happy to learn that it is the de facto tool for data manipulation.

# In[12]:

df.dtypes

# Basic stats and distributions are a function call away.

# In[13]:

df.describe().T

# Everything is indexed!
# Here we get a quick calculation of the 75th percentile of alcohol content.

# In[14]:

df.describe()['alcohol']['75%']

# It's easy to filter a dataframe:
# here we grab all the heavily alcoholic wines...

# In[15]:

df_alcoholic = df[df['alcohol'] > df.describe()['alcohol']['75%']]
df_alcoholic.head()

# It's also easy to plot, since Pandas builds on the Matplotlib package.

# In[16]:

# This line is run once, typically at the beginning of the notebook, to enable inline plotting.
get_ipython().run_line_magic('matplotlib', 'inline')

# In[17]:

df_alcoholic.plot(kind='scatter', x='residual sugar', y='density')

# What is that outlier?

# In[2]:

df_alcoholic[df_alcoholic['residual sugar'] > 12]

# After processing and normalizing the data, we may want to upload this new file to S3.
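# For example, a simple min-max normalization of one column might look like the sketch below -- purely illustrative, since the upload that follows uses the unscaled dataframe.

# In[ ]:

# Min-max scale the alcohol column into [0, 1] on a copy, leaving
# df_alcoholic itself untouched for the upload that follows.
df_scaled = df_alcoholic.copy()
alcohol = df_scaled['alcohol']
df_scaled['alcohol'] = (alcohol - alcohol.min()) / (alcohol.max() - alcohol.min())
df_scaled.head(3)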
# top

# ## Write DataFrames to S3 with to_csv() and to_json()

# s3.to_csv() and s3.to_json() are almost identical to their Pandas ancestors and backbone.
# The difference is that s3.to_csv() takes the dataframe as an argument, rather than being a method of the dataframe.
# See the code:

# In[18]:

# Where will the file get stored?
s3_target = 's3://prod-datalytics/playground/wine_list.tsv.gz'

# We can now use our filtered dataset to write a new file to S3.
# Using Pandas' to_csv args, we have a lot of control over the output format.

# In[19]:

s3.to_csv(df_alcoholic, s3_target, sep='\t', index=False, compression='gzip')

# top

# ## Write local files to S3 with disk_2_s3()
#
# We can send local files to S3 too. First, let's write a file to local disk using the built-in Pandas `to_csv()`.

# In[20]:

local_file = 'wine_list.tsv.gz'

# In[21]:

df_alcoholic.to_csv(local_file, sep='\t', index=False, compression='gzip')

# In[22]:

s3.disk_2_s3(file=local_file, s3_path=s3_target)

# In[23]:

# Purge the local copy!
os.remove(local_file)
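# As a quick sanity check, we can reuse ls() with a wildcard to confirm the gzipped file landed under the key.

# In[ ]:

s3.ls(s3_path + '*.gz')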
# ## Saving and Loading Scikit-Learn Classifiers
#
# If you're into machine learning, you're in luck!
# See the code:

# In[24]:

from sklearn.ensemble import RandomForestClassifier

# For the example, let's just use a vanilla random forest model.

# In[25]:

clf = RandomForestClassifier()
clf

# Here is where we'd train and evaluate the model...

# In[30]:

# Fit the model!
# clf.fit(X, y)

# On my first run (not shown) I got a test-set accuracy of only 61%, which is pretty bad.
# You should try to beat that score!

# In[ ]:

'''
Write some code here: look into train_test_split, GridSearchCV, and KFold from Scikit-Learn.
This is also a great dataset to practice:
    scaling values (see StandardScaler),
    dimensionality reduction (see PCA),
    and a linear model (see Lasso or LogisticRegression).
'''
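# Here's one possible starting point for that exercise -- a minimal sketch, assuming the 'quality' column is the target and the remaining columns are features; tune the grid to taste.

# In[ ]:

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression

# Assumes df holds the wine data and 'quality' is the label column.
X = df.drop(columns=['quality'])
y = df['quality']

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=42)

pipe = Pipeline([
    ('scale', StandardScaler()),   # scale features to zero mean, unit variance
    ('pca', PCA()),                # optional dimensionality reduction
    ('model', LogisticRegression(max_iter=1000)),
])

grid = GridSearchCV(
    pipe,
    param_grid={'pca__n_components': [2, 5, None],
                'model__C': [0.1, 1.0, 10.0]},
    cv=5)                          # 5-fold cross-validation

grid.fit(X_train, y_train)
print(grid.best_params_, grid.score(X_test, y_test))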
# Once you're happy with the performance, we can persist the model as a pickle file.

# In[31]:

s3.dump_clf(clf, 's3://prod-datalytics/playground/models/clf.pkl')

# And re-use it when the time is right!

# In[32]:

s3.load_clf('s3://prod-datalytics/playground/models/clf.pkl')

# top

# ## Movin' Files between buckets and keys
#
# In the interest of good file-keeping, let's move our saved classifier to its own special folder (key).
# In[33]:

s3.cp(old_path='s3://prod-datalytics/playground/models/clf.pkl',
      new_path='s3://prod-datalytics/production_space/models/clf.pkl')

# To move the file (and delete the old instance), we use `mv` instead of `cp`.

# In[34]:

s3.mv(old_path='s3://prod-datalytics/playground/models/clf.pkl',
      new_path='s3://prod-datalytics/production_space/models/clf.pkl')

# top