import pandas as pd #importing packages
import os as os
os.getcwd() #current working directory
'/home/ajay/Desktop'
# Switch the process's working directory to the Downloads folder.
os.chdir('/home/ajay/Downloads')
# Show the working directory to confirm the change.
os.getcwd()
'/home/ajay/Downloads'
# Keep the current working directory path so it can be passed to listdir.
a=os.getcwd()
# List every entry (files and sub-directories) in that directory.
os.listdir(a)
['DAVID BOWIE - GREATEST HITS (2 CD)(2002)', 'Fedora-Live-Workstation-i686-21-5.iso', '10668959_931479900215467_5781145420016290845_o.jpg', '10668959_931479900215467_5781145420016290845_o(1).jpg', 'Welcome! | LinkedIn.html', '8.jpg', 'Skill development for the future | Blog | Michael & Susan Dell Foundation_files', '10344202_10154121676145471_6559129518343929224_o.jpg', '(2) Welcome! | LinkedIn.html', '10175075_10154039045855471_1279064786171241787_n.jpg', 'FAQ_MotorcycleMath.docx', 'dplyr_files', '10258648_10154121676135471_4503284835037000191_o.jpg', 'Photos.zip', 'Profiling_files', '10291764_10154039045790471_5415972910726259763_n.jpg', 'rmdA6xJ.jpg', '9.jpg', 'Social Media Workshop \xe2\x80\x93By Ajay Ohri.pptx', '10368373_10154122366665471_8257322375297843081_o.jpg', '10608234_931479880215469_9055872974209850319_o.jpg', 'PyCon US 2012 Presentation: Data analysis in Python with pandas.html', 'Usage_files', 'social-media-seminar.html', 'evaluate.pdf', 'alissa-coming-soon-v2-0(1).zip', 'visualization.html', 'hellboy-ron-perlman-1024x709.jpg', '10304704_10154039045795471_7980232226538706014_n.jpg', 'R Training.pdf', 'pyqt-whitepaper-a4.pdf', 'Webinar Requirement details.xlsx', '6.jpg', 'Usage.html', 'cdh5-repository_1.0_all.deb', '23#.jpg', 'Profiling.html', 'a9hMn.png', 'python for r users_files', 'Social Media Workshop \xe2\x80\x93By Ajay Ohri(1).pptx', '11075179_963029380409177_4968699437635809869_n.jpg', 'Benchmarks-:-Grouping.html', 'dropbox_2015.02.12_i386.deb', 'SAS-University-Edition', 'alissa-coming-soon-v2-0.zip', 'skype-ubuntu-precise_4.3.0.37-1_i386.deb', 'Fedora-Live-Workstation-x86_64-21', '10.jpg', 'Adele - Discography (Complete) [2008 - 2011]', 'how-to-install-oracle-java-7-in-ubuntu-12-04_files', 'social-media-seminar_files', '10177488_10154039045775471_3627498659729079825_n.jpg', 'PyCon US 2012 Presentation: Data analysis in Python with pandas_files', 'Sublime Text 2.0.2.tar.bz2', '10379813_10154121676105471_3067213873168041728_o.jpg', 
'unvbasicvapp__9411003__vmx__en__sp0__1.zip', '20150409_195037.jpg', 'dropbox_2015.02.12_amd64.deb', 'flareget_4.1-80_amd64.deb', '1966148_931479106882213_4320838953761562592_o.jpg', 'how-to-install-oracle-java-7-in-ubuntu-12-04.html', '4.jpg', '10472818_931477176882406_2076366586743689079_o.jpg', '1912293_931479710215486_4316765684077047706_o(1).jpg', 'dplyr.html', 'CAX_EMC_MotorcycleMath_Data_Dictionary.xlsx', 'visualization_files', 'R_inferno.pdf', "Def Leppard - Vault Def Leppard's Greatest Hits 1980-1995 (1995) CBR320 vtwin88cube", 'MongoDB \xe2\x80\x93 State of the R | joy of data_files', 'CAX_EMC_Journalist_Data.zip', '7.jpg', 'MongoDB \xe2\x80\x93 State of the R | joy of data.html', 'FlareGet', '10708672_931478223548968_2305655250919027359_o(1).jpg', 'intel-linux-graphics-installer_1.0.8-0intel1_amd64.deb', 'firefox-36.0.4.tar.bz2', 'MoNo 1.png', '1622513_931479090215548_1838371454120774664_o.jpg', 'VMware-Player-6.0.5-2443746.i386.bundle', 'python for r users.html', 'Sandbox_HDP_2.2_VMware.ova', 'Courses_DecisionStats.pptx', '5.jpg', 'rvest_files', 'Benchmarks-:-Grouping_files', 'rvest.html', 'IMG_20150318_185525.jpg', 'CAX_EMC_Racer_Data.zip', '10694360_931478020215655_6148332260082911335_o.jpg', 'Fedora-Live-Workstation-i686-21', 'MoNo 1(1).png', 'kali-linux-1.1.0a-i386', '1912293_931479710215486_4316765684077047706_o.jpg', 'tutorials.html', '10708672_931478223548968_2305655250919027359_o.jpg', 'Skill development for the future | Blog | Michael & Susan Dell Foundation.html', 'R Training(1).pdf', 'openSUSE-13.2-DVD-x86_64.iso', '10585533_10155299926870471_1412421882_n.jpg', 'google-chrome-stable_current_i386(1).deb', 'flareget_4.1-80_i386.deb', '1658277_931479643548826_6461630666066372009_o.jpg', 'March invoice adaptive analytics - Sheet1.pdf', 'tutorials_files', 'google-talkplugin_current_i386.deb', 'google-chrome-stable_current_i386.deb', 'cdh5-repository_1.0_all(1).deb', 
'__Social_Network_Analysis_in_Telecommunications__Wiley_and_SAS_Business_Series_.pdf', 'Step1_Submission_format.doc']
# Switch the process's working directory to the Desktop folder.
os.chdir('/home/ajay/Desktop')
# Show the working directory to confirm the change.
os.getcwd()
'/home/ajay/Desktop'
# Keep the current working directory path so it can be passed to listdir.
a=os.getcwd()
# List every entry (files and sub-directories) in that directory.
os.listdir(a)
['r-training.html', 'RcmdrMarkdown.html', 'March invoice Indicus - Sheet1.pdf', 'Untitled.ipynb', 'March AV Invoice - Sheet1.pdf', 'Silver_Surfer.jpg', 'bm-pd.ipynb.html', 'R Training.pdf', 'social media.png', 'Webinar Requirement details.xlsx', 'CDNOW_master.zip', '.ipynb_checkpoints', 'Website', 'bm-pd.ipynb_files', 'Social Media Analytics-2.pptx', 'python for analytics and r.txt', 'Untitled Folder', 'diamonds.csv', 'untitled.txt', 'Untitled1.ipynb', 'website sun.odp', '11079581_1582815845321431_8829427897815039618_o.jpg', 'aj.jpg', 'housing', '20150409_180702.jpg', 'Analytics Deck.pptx', 'datatable-faq.pdf', 'STAND OUT_poster.docx', '7d376e0f-f81a-47f4-b27a-f628e1728e15-medium.jpeg', 'CDNOW_master.txt', '"housing.csv"', 'r-training_files', 'RFM', 'Analytics Deck.pdf', '1.zip', 'Decisionstats.jpg']
# Load the diamonds dataset; the relative path is resolved against the
# current working directory.
diamonds=pd.read_csv("diamonds.csv")
# By default read_csv treats the first row as the column header (equivalent
# to header=0); pass header=None when the file has no header row.
# info() summarises the columns: non-null counts, dtypes and memory usage.
diamonds.info()
<class 'pandas.core.frame.DataFrame'> Int64Index: 53940 entries, 0 to 53939 Data columns (total 10 columns): carat 53940 non-null float64 cut 53940 non-null object color 53940 non-null object clarity 53940 non-null object depth 53940 non-null float64 table 53940 non-null float64 price 53940 non-null int64 x 53940 non-null float64 y 53940 non-null float64 z 53940 non-null float64 dtypes: float64(6), int64(1), object(3) memory usage: 3.9+ MB
diamonds.head()
carat | cut | color | clarity | depth | table | price | x | y | z | |
---|---|---|---|---|---|---|---|---|---|---|
0 | 0.23 | Ideal | E | SI2 | 61.5 | 55 | 326 | 3.95 | 3.98 | 2.43 |
1 | 0.21 | Premium | E | SI1 | 59.8 | 61 | 326 | 3.89 | 3.84 | 2.31 |
2 | 0.23 | Good | E | VS1 | 56.9 | 65 | 327 | 4.05 | 4.07 | 2.31 |
3 | 0.29 | Premium | I | VS2 | 62.4 | 58 | 334 | 4.20 | 4.23 | 2.63 |
4 | 0.31 | Good | J | SI2 | 63.3 | 58 | 335 | 4.34 | 4.35 | 2.75 |
diamonds.tail(10)
carat | cut | color | clarity | depth | table | price | x | y | z | |
---|---|---|---|---|---|---|---|---|---|---|
53930 | 0.71 | Premium | E | SI1 | 60.5 | 55 | 2756 | 5.79 | 5.74 | 3.49 |
53931 | 0.71 | Premium | F | SI1 | 59.8 | 62 | 2756 | 5.74 | 5.73 | 3.43 |
53932 | 0.70 | Very Good | E | VS2 | 60.5 | 59 | 2757 | 5.71 | 5.76 | 3.47 |
53933 | 0.70 | Very Good | E | VS2 | 61.2 | 59 | 2757 | 5.69 | 5.72 | 3.49 |
53934 | 0.72 | Premium | D | SI1 | 62.7 | 59 | 2757 | 5.69 | 5.73 | 3.58 |
53935 | 0.72 | Ideal | D | SI1 | 60.8 | 57 | 2757 | 5.75 | 5.76 | 3.50 |
53936 | 0.72 | Good | D | SI1 | 63.1 | 55 | 2757 | 5.69 | 5.75 | 3.61 |
53937 | 0.70 | Very Good | D | SI1 | 62.8 | 60 | 2757 | 5.66 | 5.68 | 3.56 |
53938 | 0.86 | Premium | H | SI2 | 61.0 | 58 | 2757 | 6.15 | 6.12 | 3.74 |
53939 | 0.75 | Ideal | D | SI2 | 62.2 | 55 | 2757 | 5.83 | 5.87 | 3.64 |
diamonds.columns
Index([u'carat', u'cut', u'color', u'clarity', u'depth', u'table', u'price', u'x', u'y', u'z'], dtype='object')
# Number of rows in the full dataset, i.e. the total population size.
b=len(diamonds)
print(b)
53940
import numpy as np
# Draw a 0.01% random sample of row labels from the dataset.
# The sample size must be an integer: current NumPy raises TypeError when
# given a float size, so the fractional row count is truncated explicitly.
# NOTE: np.random.choice samples with replacement by default, so the same
# label can be drawn more than once.
sample_size = int(0.0001*b)
rows = np.random.choice(diamonds.index.values, sample_size)
print(rows)
# Select the sampled rows by index label with .loc; the legacy .ix indexer
# was removed in pandas 1.0.
sampled_df = diamonds.loc[rows]
[45653 7503 47794 12017 46125]
sampled_df
carat | cut | color | clarity | depth | table | price | x | y | z | |
---|---|---|---|---|---|---|---|---|---|---|
45653 | 0.25 | Ideal | H | IF | 61.4 | 57 | 525 | 4.05 | 4.08 | 2.49 |
7503 | 1.05 | Premium | G | SI2 | 61.3 | 58 | 4241 | 6.55 | 6.60 | 4.03 |
47794 | 0.71 | Ideal | J | VS2 | 62.4 | 54 | 1899 | 5.72 | 5.76 | 3.58 |
12017 | 1.00 | Premium | F | SI1 | 59.8 | 59 | 5151 | 6.55 | 6.49 | 3.90 |
46125 | 0.51 | Ideal | F | VS1 | 61.7 | 54 | 1744 | 5.14 | 5.17 | 3.18 |
diamonds.describe()
carat | depth | table | price | x | y | z | |
---|---|---|---|---|---|---|---|
count | 53940.000000 | 53940.000000 | 53940.000000 | 53940.000000 | 53940.000000 | 53940.000000 | 53940.000000 |
mean | 0.797940 | 61.749405 | 57.457184 | 3932.799722 | 5.731157 | 5.734526 | 3.538734 |
std | 0.474011 | 1.432621 | 2.234491 | 3989.439738 | 1.121761 | 1.142135 | 0.705699 |
min | 0.200000 | 43.000000 | 43.000000 | 326.000000 | 0.000000 | 0.000000 | 0.000000 |
25% | 0.400000 | 61.000000 | 56.000000 | 950.000000 | 4.710000 | 4.720000 | 2.910000 |
50% | 0.700000 | 61.800000 | 57.000000 | 2401.000000 | 5.700000 | 5.710000 | 3.530000 |
75% | 1.040000 | 62.500000 | 59.000000 | 5324.250000 | 6.540000 | 6.540000 | 4.040000 |
max | 5.010000 | 79.000000 | 95.000000 | 18823.000000 | 10.740000 | 58.900000 | 31.800000 |
# Group the rows by the "cut" column; the resulting GroupBy object is reused
# for the per-group aggregations.
cut=diamonds.groupby("cut")
# Count of non-null values per column within each cut group.
cut.count()
carat | color | clarity | depth | table | price | x | y | z | |
---|---|---|---|---|---|---|---|---|---|
cut | |||||||||
Fair | 1610 | 1610 | 1610 | 1610 | 1610 | 1610 | 1610 | 1610 | 1610 |
Good | 4906 | 4906 | 4906 | 4906 | 4906 | 4906 | 4906 | 4906 | 4906 |
Ideal | 21551 | 21551 | 21551 | 21551 | 21551 | 21551 | 21551 | 21551 | 21551 |
Premium | 13791 | 13791 | 13791 | 13791 | 13791 | 13791 | 13791 | 13791 | 13791 |
Very Good | 12082 | 12082 | 12082 | 12082 | 12082 | 12082 | 12082 | 12082 | 12082 |
# Per-group mean of the numeric columns only. pandas >= 2.0 no longer drops
# the non-numeric columns ("color", "clarity") silently and raises instead,
# so the restriction is made explicit.
cut.mean(numeric_only=True)
carat | depth | table | price | x | y | z | |
---|---|---|---|---|---|---|---|
cut | |||||||
Fair | 1.046137 | 64.041677 | 59.053789 | 4358.757764 | 6.246894 | 6.182652 | 3.982770 |
Good | 0.849185 | 62.365879 | 58.694639 | 3928.864452 | 5.838785 | 5.850744 | 3.639507 |
Ideal | 0.702837 | 61.709401 | 55.951668 | 3457.541970 | 5.507451 | 5.520080 | 3.401448 |
Premium | 0.891955 | 61.264673 | 58.746095 | 4584.257704 | 5.973887 | 5.944879 | 3.647124 |
Very Good | 0.806381 | 61.818275 | 57.956150 | 3981.759891 | 5.740696 | 5.770026 | 3.559801 |
# Per-group median of the numeric columns only. pandas >= 2.0 no longer drops
# the non-numeric columns ("color", "clarity") silently and raises instead,
# so the restriction is made explicit.
cut.median(numeric_only=True)
carat | depth | table | price | x | y | z | |
---|---|---|---|---|---|---|---|
cut | |||||||
Fair | 1.00 | 65.0 | 58 | 3282.0 | 6.175 | 6.10 | 3.97 |
Good | 0.82 | 63.4 | 58 | 3050.5 | 5.980 | 5.99 | 3.70 |
Ideal | 0.54 | 61.8 | 56 | 1810.0 | 5.250 | 5.26 | 3.23 |
Premium | 0.86 | 61.4 | 59 | 3185.0 | 6.110 | 6.06 | 3.72 |
Very Good | 0.71 | 62.1 | 58 | 2648.0 | 5.740 | 5.77 | 3.56 |
pd.crosstab(diamonds.cut, diamonds.color)
color | D | E | F | G | H | I | J |
---|---|---|---|---|---|---|---|
cut | |||||||
Fair | 163 | 224 | 312 | 314 | 303 | 175 | 119 |
Good | 662 | 933 | 909 | 871 | 702 | 522 | 307 |
Ideal | 2834 | 3903 | 3826 | 4884 | 3115 | 2093 | 896 |
Premium | 1603 | 2337 | 2331 | 2924 | 2360 | 1428 | 808 |
Very Good | 1513 | 2400 | 2164 | 2299 | 1824 | 1204 | 678 |
# Pairwise correlation of the numeric columns only. pandas >= 2.0 raises on
# the string columns ("cut", "color", "clarity") unless they are excluded
# explicitly.
diamonds.corr(numeric_only=True)
carat | depth | table | price | x | y | z | |
---|---|---|---|---|---|---|---|
carat | 1.000000 | 0.028224 | 0.181618 | 0.921591 | 0.975094 | 0.951722 | 0.953387 |
depth | 0.028224 | 1.000000 | -0.295779 | -0.010647 | -0.025289 | -0.029341 | 0.094924 |
table | 0.181618 | -0.295779 | 1.000000 | 0.127134 | 0.195344 | 0.183760 | 0.150929 |
price | 0.921591 | -0.010647 | 0.127134 | 1.000000 | 0.884435 | 0.865421 | 0.861249 |
x | 0.975094 | -0.025289 | 0.195344 | 0.884435 | 1.000000 | 0.974701 | 0.970772 |
y | 0.951722 | -0.029341 | 0.183760 | 0.865421 | 0.974701 | 1.000000 | 0.952006 |
z | 0.953387 | 0.094924 | 0.150929 | 0.861249 | 0.970772 | 0.952006 | 1.000000 |
import matplotlib as mt
%matplotlib inline #this line makes sure plots are in same notebook
from ggplot import *
# Base plot object: price on the x-axis against carat on the y-axis, with no
# layers added yet.
p = ggplot(aes(x='price', y='carat'), data=diamonds)
# Evaluating the bare object makes the notebook display it as the cell output.
p
<ggplot: (-1059997756)>
p + geom_point()
<ggplot: (-1059338452)>
p + geom_point() +facet_grid('cut')
<ggplot: (-1057884332)>
# Rebind p with an added colour aesthetic mapped to the "cut" column.
p = ggplot(aes(x='price', y='carat',color="cut"), data=diamonds)
# Add a point (scatter) layer and display the result.
p + geom_point()
<ggplot: (-1059249386)>
# Rebind p with the colour aesthetic mapped to the "clarity" column instead.
p = ggplot(aes(x='price', y='carat',color="clarity"), data=diamonds)
# Add a point (scatter) layer and display the result.
p + geom_point()
<ggplot: (-1060618628)>