!nvidia-smi
# This install will show 2 prompts.
# Start with 0.11 and answer Y at the subsequent prompt (only for 0.11). If it hangs while solving dependencies for more than 2 minutes, kill it
# and restart with the 0.10 install.
# This script has some bugs for both 0.10 and 0.11, so follow the subsequent steps.
!wget -nc https://github.com/rapidsai/notebooks-extended/raw/master/utils/rapids-colab.sh
!bash rapids-colab.sh
import sys, os
# Put site-packages on sys.path immediately ahead of dist-packages so that
# modules found there take precedence over same-named ones in dist-packages.
# sys.path.index raises ValueError if dist-packages is not on the path.
dist_package_index = sys.path.index("/usr/local/lib/python3.6/dist-packages")
sys.path = (
    sys.path[:dist_package_index]
    + ["/usr/local/lib/python3.6/site-packages"]
    + sys.path[dist_package_index:]
)
sys.path
if os.path.exists('update_pyarrow.py'):  # This file only exists if you're using RAPIDS version 0.11 or higher
    # NOTE(review): exec runs whatever this file contains inside the
    # notebook's global namespace; make sure it comes from a trusted source.
    # The context manager closes the file handle once the script is read.
    with open("update_pyarrow.py") as pyarrow_script:
        exec(pyarrow_script.read(), globals())
# Also append the same directory (with trailing slash) at the end of the path.
sys.path.append('/usr/local/lib/python3.6/site-packages/')
File ‘rapids-colab.sh’ already there; not retrieving. PLEASE READ ******************************************************************************************************** Colab v0.11+ Migration Notice: There has been a NECESSARY Colab script code change for VERSION 0.11+ that MAY REQUIRE an update how you install RAPIDS into Colab! Not all Colab notebooks are updated (like personal Colabs) and while the script will install RAPIDS correctly, a neccessary script to update pyarrow to v0.15.x to be compatible with RAPIDS v0.11+ may not run, and your RAPIDS instance will BREAK Please enter in the box your desired RAPIDS version (ex: '0.10' or '0.11', between 0.9 to 0.11, without the quotes) and hit Enter. 0.10 You may not have to change anything. All versions of our script should work with this version of Colab File ‘env-check.py’ already there; not retrieving. Checking for GPU type: *********************************************************************** Woo! Your instance has the right kind of GPU, a 'Tesla P100-PCIE-16GB'! *********************************************************************** ********************************************* Your Google Colab instance is RAPIDS ready! *********************************************
#only for 0.10
!conda install -y --prefix /usr/local -c rapidsai/label/xgboost -c rapidsai -c nvidia -c conda-forge dask-cudf xgboost
Collecting package metadata (current_repodata.json): - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - done Solving environment: | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ done # All requested packages already installed.
import cudf
import numpy as np
import dask_cudf
# Read the semicolon-delimited bank dataset into a cuDF DataFrame and show
# its (rows, columns) shape.
bank_csv_url = 'https://raw.githubusercontent.com/srivatsan88/YouTubeLI/master/dataset/bank-full.csv'
bank_df = cudf.read_csv(bank_csv_url, sep=';')
bank_df.shape
(45211, 17)
# Preview the first few rows of the dataset.
bank_df.head()
age | job | marital | education | default | balance | housing | loan | contact | day | month | duration | campaign | pdays | previous | poutcome | y | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 58 | management | married | tertiary | no | 2143 | yes | no | unknown | 5 | may | 261 | 1 | -1 | 0 | unknown | no |
1 | 44 | technician | single | secondary | no | 29 | yes | no | unknown | 5 | may | 151 | 1 | -1 | 0 | unknown | no |
2 | 33 | entrepreneur | married | secondary | no | 2 | yes | yes | unknown | 5 | may | 76 | 1 | -1 | 0 | unknown | no |
3 | 47 | blue-collar | married | unknown | no | 1506 | yes | no | unknown | 5 | may | 92 | 1 | -1 | 0 | unknown | no |
4 | 33 | unknown | single | unknown | no | 1 | no | no | unknown | 5 | may | 198 | 1 | -1 | 0 | unknown | no |
# List the distinct values of the target column y.
np.unique(bank_df.y)
0 no 1 yes Name: y, dtype: object
!nvidia-smi
Sun Jan 12 22:34:58 2020 +-----------------------------------------------------------------------------+ | NVIDIA-SMI 440.44 Driver Version: 418.67 CUDA Version: 10.1 | |-------------------------------+----------------------+----------------------+ | GPU Name Persistence-M| Bus-Id Disp.A | Volatile Uncorr. ECC | | Fan Temp Perf Pwr:Usage/Cap| Memory-Usage | GPU-Util Compute M. | |===============================+======================+======================| | 0 Tesla P100-PCIE... Off | 00000000:00:04.0 Off | 0 | | N/A 40C P0 33W / 250W | 385MiB / 16280MiB | 0% Default | +-------------------------------+----------------------+----------------------+ +-----------------------------------------------------------------------------+ | Processes: GPU Memory | | GPU PID Type Process name Usage | |=============================================================================| +-----------------------------------------------------------------------------+
1 - age (numeric)
2 - job : type of job (categorical: "admin.","unknown","unemployed","management","housemaid","entrepreneur", "student","blue-collar","self-employed","retired","technician","services")
3 - marital : marital status (categorical: "married","divorced","single"; note: "divorced" means divorced or widowed)
4 - education (categorical: "unknown","secondary","primary","tertiary")
5 - default: has credit in default? (binary: "yes","no")
6 - balance: average yearly balance, in euros (numeric)
7 - housing: has housing loan? (binary: "yes","no")
8 - loan: has personal loan? (binary: "yes","no")
9 - contact: contact communication type (categorical: "unknown","telephone","cellular")
10 - day: last contact day of the month (numeric)
11 - month: last contact month of year (categorical: "jan", "feb", "mar", ..., "nov", "dec")
12 - duration: last contact duration, in seconds (numeric)
13 - campaign: number of contacts performed during this campaign and for this client (numeric, includes last contact)
14 - pdays: number of days that passed by after the client was last contacted from a previous campaign (numeric, -1 means client was not previously contacted)
15 - previous: number of contacts performed before this campaign and for this client (numeric)
16 - poutcome: outcome of the previous marketing campaign (categorical: "unknown","other","failure","success")
17 - y: has the client subscribed to a term deposit? (binary: "yes","no")
# Report the dataset dimensions; shape is (rows, columns).
n_rows, n_cols = bank_df.shape
print("Rows : ", n_rows)
print("Columns : ", n_cols)
Rows : 45211 Columns : 17
# Column dtypes, per-column null counts, and the class balance of target y.
# NOTE(review): if these three lines share one notebook cell, only the last
# expression is displayed; wrap the first two in print() to see them too.
bank_df.dtypes
bank_df.isnull().sum()
bank_df.y.value_counts()
no 39922 yes 5289 Name: y, dtype: int32
import time
# Time a describe() call on the cuDF DataFrame; the result is discarded,
# only the elapsed wall-clock time is printed.
start_time = time.time()
bank_df.describe()
elapsed = time.time() - start_time
print("Time Taken GPU %s seconds " % elapsed)
Time Taken GPU 0.07101154327392578 seconds
# Wrap the cuDF DataFrame in a dask_cudf DataFrame split into 2 partitions.
dcudf = dask_cudf.from_cudf(bank_df, npartitions=2)
start_time = time.time()
# Dask collections are lazy: describe() on its own only builds the task
# graph, so .compute() is required for the timing to include the actual work.
dcudf.describe().compute()
print("Time Taken GPU %s seconds " % (str(time.time() - start_time)))
Time Taken GPU 0.5221295356750488 seconds
# Display summary statistics of bank_df.
bank_df.describe()
age | balance | day | duration | campaign | pdays | previous | |
---|---|---|---|---|---|---|---|
count | 45211.000000 | 45211.000000 | 45211.000000 | 45211.000000 | 45211.000000 | 45211.000000 | 45211.000000 |
mean | 40.936210 | 1362.272058 | 15.806419 | 258.163080 | 2.763841 | 40.197828 | 0.580323 |
std | 10.618762 | 3044.765829 | 8.322476 | 257.527812 | 3.098021 | 100.128746 | 2.303441 |
min | 18.000000 | -8019.000000 | 1.000000 | 0.000000 | 1.000000 | -1.000000 | 0.000000 |
25% | 33.000000 | 72.000000 | 8.000000 | 103.000000 | 1.000000 | -1.000000 | 0.000000 |
50% | 39.000000 | 448.000000 | 16.000000 | 180.000000 | 2.000000 | -1.000000 | 0.000000 |
75% | 48.000000 | 1428.000000 | 21.000000 | 319.000000 | 3.000000 | -1.000000 | 0.000000 |
max | 95.000000 | 102127.000000 | 31.000000 | 4918.000000 | 63.000000 | 871.000000 | 275.000000 |
# Mean balance for each (marital, y) combination.
bank_df.groupby(['marital', 'y']).agg({'balance':'mean'})
balance | ||
---|---|---|
marital | y | |
divorced | no | 1107.095747 |
yes | 1707.964630 | |
married | no | 1370.746228 |
yes | 1915.810163 | |
single | no | 1235.869921 |
yes | 1674.875523 |
# Mean balance and count of y values for each (marital, y) combination.
bank_df.groupby(['marital', 'y']).agg({'balance':'mean','y':'count'})
balance | y | ||
---|---|---|---|
marital | y | ||
divorced | no | 1107.095747 | 4585 |
yes | 1707.964630 | 622 | |
married | no | 1370.746228 | 24459 |
yes | 1915.810163 | 2755 | |
single | no | 1235.869921 | 10878 |
yes | 1674.875523 | 1912 |
# Mean balance and count of y values for each (loan, y) combination.
bank_df.groupby(['loan', 'y']).agg({'balance':'mean','y':'count'})
balance | y | ||
---|---|---|---|
loan | y | ||
no | no | 1413.228726 | 33162 |
yes | 1897.001041 | 4805 | |
yes | no | 766.481953 | 6760 |
yes | 883.642562 | 484 |
# For each (loan, y) group: mean balance and count of y, plus the group's
# share of the total count stored in a new y_perct column.
loan_outcome = bank_df.groupby(['loan', 'y']).agg({'balance': 'mean', 'y': 'count'})
group_counts = loan_outcome['y']
loan_outcome['y_perct'] = group_counts / group_counts.sum()
loan_outcome
balance | y | y_perct | ||
---|---|---|---|---|
loan | y | |||
no | no | 1413.228726 | 33162 | 0.733494 |
yes | 1897.001041 | 4805 | 0.106279 | |
yes | no | 766.481953 | 6760 | 0.149521 |
yes | 883.642562 | 484 | 0.010705 |
def convert_hour(duration):
    """Convert a duration given in seconds to hours.

    The dataset's ``duration`` field is expressed in seconds, so the value
    is divided by 3600 (seconds per hour). Dividing by 60 would produce
    minutes, not hours.

    Args:
        duration: Duration in seconds.

    Returns:
        The duration expressed in hours.
    """
    return duration / 3600
# Derive a new column by applying convert_hour to every value of the
# duration column, then display the resulting DataFrame.
bank_df['duration_hour'] = bank_df['duration'].applymap(convert_hour)
bank_df
age | job | marital | education | default | balance | housing | loan | contact | day | month | duration | campaign | pdays | previous | poutcome | y | duration_hour | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 58 | management | married | tertiary | no | 2143 | yes | no | unknown | 5 | may | 261 | 1 | -1 | 0 | unknown | no | 4.350000 |
1 | 44 | technician | single | secondary | no | 29 | yes | no | unknown | 5 | may | 151 | 1 | -1 | 0 | unknown | no | 2.516667 |
2 | 33 | entrepreneur | married | secondary | no | 2 | yes | yes | unknown | 5 | may | 76 | 1 | -1 | 0 | unknown | no | 1.266667 |
3 | 47 | blue-collar | married | unknown | no | 1506 | yes | no | unknown | 5 | may | 92 | 1 | -1 | 0 | unknown | no | 1.533333 |
4 | 33 | unknown | single | unknown | no | 1 | no | no | unknown | 5 | may | 198 | 1 | -1 | 0 | unknown | no | 3.300000 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
45206 | 51 | technician | married | tertiary | no | 825 | no | no | cellular | 17 | nov | 977 | 3 | -1 | 0 | unknown | yes | 16.283333 |
45207 | 71 | retired | divorced | primary | no | 1729 | no | no | cellular | 17 | nov | 456 | 2 | -1 | 0 | unknown | yes | 7.600000 |
45208 | 72 | retired | married | secondary | no | 5715 | no | no | cellular | 17 | nov | 1127 | 5 | 184 | 3 | success | yes | 18.783333 |
45209 | 57 | blue-collar | married | secondary | no | 668 | no | no | telephone | 17 | nov | 508 | 4 | -1 | 0 | unknown | no | 8.466667 |
45210 | 37 | entrepreneur | married | secondary | no | 2971 | no | no | cellular | 17 | nov | 361 | 2 | 188 | 11 | other | no | 6.016667 |
45211 rows × 18 columns
# Average balance for each value of the target y.
bank_df.groupby('y').balance.mean()
y no 1303.714969 yes 1804.267915 Name: balance, dtype: float64
# Average value of the campaign column for each value of the target y.
bank_df.groupby('y').campaign.mean()
y no 2.846350 yes 2.141047 Name: campaign, dtype: float64
# Value of the campaign column at selected quantiles (q=1.0 is the maximum).
bank_df['campaign'].quantile(q=[0.25,0.5, 0.75, 0.9, 0.95, 1.0])
0.25 1.0 0.50 2.0 0.75 3.0 0.90 5.0 0.95 8.0 1.00 63.0 Name: campaign, dtype: float64
# Keep only rows whose campaign value is at most 8, then count y values
# for each (campaign, y) combination.
bank_campaign_df = bank_df.query("campaign <= 8")
campaign_groups = bank_campaign_df.groupby(['campaign', 'y'])
campaign_groups.agg({'y': 'count'})
y | ||
---|---|---|
campaign | y | |
1 | no | 14983 |
yes | 2561 | |
2 | no | 11104 |
yes | 1401 | |
3 | no | 4903 |
yes | 618 | |
4 | no | 3205 |
yes | 317 | |
5 | no | 1625 |
yes | 139 | |
6 | no | 1199 |
yes | 92 | |
7 | no | 688 |
yes | 47 | |
8 | no | 508 |
yes | 32 |
# Number of rows for each education level.
bank_df.education.value_counts()
secondary 23202 tertiary 13301 primary 6851 unknown 1857 Name: education, dtype: int32
# Count of y values for each (education, y) combination.
bank_df.groupby(['education','y']).agg({'y':'count'})
y | ||
---|---|---|
education | y | |
primary | no | 6260 |
yes | 591 | |
secondary | no | 20752 |
yes | 2450 | |
tertiary | no | 11305 |
yes | 1996 | |
unknown | no | 1605 |
yes | 252 |