In [1]:

import pandas as pd #importing packages
import os as os

In [2]:

# Display the directory the kernel is currently working in.
os.getcwd() #current working directory

Out[2]:

'/home/ajay/Desktop'

In [3]:

# Make the Downloads folder the working directory.
downloads_dir = '/home/ajay/Downloads'
os.chdir(downloads_dir)

In [4]:

# Confirm that the working directory is now the one passed to os.chdir above.
os.getcwd()

Out[4]:

'/home/ajay/Downloads'

In [5]:

a = os.getcwd()  # remember the folder the kernel is currently in
os.listdir(a)    # ...and show the names of everything inside it

Out[5]:

['DAVID BOWIE - GREATEST HITS (2 CD)(2002)',
 'Fedora-Live-Workstation-i686-21-5.iso',
 '10668959_931479900215467_5781145420016290845_o.jpg',
 '10668959_931479900215467_5781145420016290845_o(1).jpg',
 'Welcome! | LinkedIn.html',
 '8.jpg',
 'Skill development for the future | Blog | Michael & Susan Dell Foundation_files',
 '10344202_10154121676145471_6559129518343929224_o.jpg',
 '(2) Welcome! | LinkedIn.html',
 '10175075_10154039045855471_1279064786171241787_n.jpg',
 'FAQ_MotorcycleMath.docx',
 'dplyr_files',
 '10258648_10154121676135471_4503284835037000191_o.jpg',
 'Photos.zip',
 'Profiling_files',
 '10291764_10154039045790471_5415972910726259763_n.jpg',
 'rmdA6xJ.jpg',
 '9.jpg',
 'Social Media Workshop \xe2\x80\x93By Ajay Ohri.pptx',
 '10368373_10154122366665471_8257322375297843081_o.jpg',
 '10608234_931479880215469_9055872974209850319_o.jpg',
 'PyCon US 2012 Presentation: Data analysis in Python with pandas.html',
 'Usage_files',
 'social-media-seminar.html',
 'evaluate.pdf',
 'alissa-coming-soon-v2-0(1).zip',
 'visualization.html',
 'hellboy-ron-perlman-1024x709.jpg',
 '10304704_10154039045795471_7980232226538706014_n.jpg',
 'R Training.pdf',
 'pyqt-whitepaper-a4.pdf',
 'Webinar Requirement details.xlsx',
 '6.jpg',
 'Usage.html',
 'cdh5-repository_1.0_all.deb',
 '23#.jpg',
 'Profiling.html',
 'a9hMn.png',
 'python for r users_files',
 'Social Media Workshop \xe2\x80\x93By Ajay Ohri(1).pptx',
 '11075179_963029380409177_4968699437635809869_n.jpg',
 'Benchmarks-:-Grouping.html',
 'dropbox_2015.02.12_i386.deb',
 'SAS-University-Edition',
 'alissa-coming-soon-v2-0.zip',
 'skype-ubuntu-precise_4.3.0.37-1_i386.deb',
 'Fedora-Live-Workstation-x86_64-21',
 '10.jpg',
 'Adele - Discography (Complete) [2008 - 2011]',
 'how-to-install-oracle-java-7-in-ubuntu-12-04_files',
 'social-media-seminar_files',
 '10177488_10154039045775471_3627498659729079825_n.jpg',
 'PyCon US 2012 Presentation: Data analysis in Python with pandas_files',
 'Sublime Text 2.0.2.tar.bz2',
 '10379813_10154121676105471_3067213873168041728_o.jpg',
 'unvbasicvapp__9411003__vmx__en__sp0__1.zip',
 '20150409_195037.jpg',
 'dropbox_2015.02.12_amd64.deb',
 'flareget_4.1-80_amd64.deb',
 '1966148_931479106882213_4320838953761562592_o.jpg',
 'how-to-install-oracle-java-7-in-ubuntu-12-04.html',
 '4.jpg',
 '10472818_931477176882406_2076366586743689079_o.jpg',
 '1912293_931479710215486_4316765684077047706_o(1).jpg',
 'dplyr.html',
 'CAX_EMC_MotorcycleMath_Data_Dictionary.xlsx',
 'visualization_files',
 'R_inferno.pdf',
 "Def Leppard - Vault Def Leppard's Greatest Hits 1980-1995 (1995) CBR320 vtwin88cube",
 'MongoDB \xe2\x80\x93 State of the R | joy of data_files',
 'CAX_EMC_Journalist_Data.zip',
 '7.jpg',
 'MongoDB \xe2\x80\x93 State of the R | joy of data.html',
 'FlareGet',
 '10708672_931478223548968_2305655250919027359_o(1).jpg',
 'intel-linux-graphics-installer_1.0.8-0intel1_amd64.deb',
 'firefox-36.0.4.tar.bz2',
 'MoNo 1.png',
 '1622513_931479090215548_1838371454120774664_o.jpg',
 'VMware-Player-6.0.5-2443746.i386.bundle',
 'python for r users.html',
 'Sandbox_HDP_2.2_VMware.ova',
 'Courses_DecisionStats.pptx',
 '5.jpg',
 'rvest_files',
 'Benchmarks-:-Grouping_files',
 'rvest.html',
 'IMG_20150318_185525.jpg',
 'CAX_EMC_Racer_Data.zip',
 '10694360_931478020215655_6148332260082911335_o.jpg',
 'Fedora-Live-Workstation-i686-21',
 'MoNo 1(1).png',
 'kali-linux-1.1.0a-i386',
 '1912293_931479710215486_4316765684077047706_o.jpg',
 'tutorials.html',
 '10708672_931478223548968_2305655250919027359_o.jpg',
 'Skill development for the future | Blog | Michael & Susan Dell Foundation.html',
 'R Training(1).pdf',
 'openSUSE-13.2-DVD-x86_64.iso',
 '10585533_10155299926870471_1412421882_n.jpg',
 'google-chrome-stable_current_i386(1).deb',
 'flareget_4.1-80_i386.deb',
 '1658277_931479643548826_6461630666066372009_o.jpg',
 'March invoice adaptive analytics   - Sheet1.pdf',
 'tutorials_files',
 'google-talkplugin_current_i386.deb',
 'google-chrome-stable_current_i386.deb',
 'cdh5-repository_1.0_all(1).deb',
 '__Social_Network_Analysis_in_Telecommunications__Wiley_and_SAS_Business_Series_.pdf',
 'Step1_Submission_format.doc']

In [8]:

# Go back to the Desktop folder and echo the new working directory.
desktop_dir = '/home/ajay/Desktop'
os.chdir(desktop_dir)
os.getcwd()

Out[8]:

'/home/ajay/Desktop'

In [9]:

a = os.getcwd()  # current working directory (Desktop after the chdir above)
os.listdir(a)    # names of the files and folders it contains

Out[9]:

['r-training.html',
 'RcmdrMarkdown.html',
 'March invoice Indicus   - Sheet1.pdf',
 'Untitled.ipynb',
 'March AV   Invoice   - Sheet1.pdf',
 'Silver_Surfer.jpg',
 'bm-pd.ipynb.html',
 'R Training.pdf',
 'social media.png',
 'Webinar Requirement details.xlsx',
 'CDNOW_master.zip',
 '.ipynb_checkpoints',
 'Website',
 'bm-pd.ipynb_files',
 'Social Media Analytics-2.pptx',
 'python for analytics and r.txt',
 'Untitled Folder',
 'diamonds.csv',
 'untitled.txt',
 'Untitled1.ipynb',
 'website sun.odp',
 '11079581_1582815845321431_8829427897815039618_o.jpg',
 'aj.jpg',
 'housing',
 '20150409_180702.jpg',
 'Analytics Deck.pptx',
 'datatable-faq.pdf',
 'STAND OUT_poster.docx',
 '7d376e0f-f81a-47f4-b27a-f628e1728e15-medium.jpeg',
 'CDNOW_master.txt',
 '"housing.csv"',
 'r-training_files',
 'RFM',
 'Analytics Deck.pdf',
 '1.zip',
 'Decisionstats.jpg']

In [105]:

# Load the diamonds data from the current working directory.
# header=0 (the default, spelled out here) uses the first row as column names;
# pass header=None instead when the file has no header row.
diamonds = pd.read_csv("diamonds.csv", header=0)

In [106]:

# Print a structural summary: index range, per-column non-null counts,
# dtypes and memory usage.
diamonds.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 53940 entries, 0 to 53939
Data columns (total 10 columns):
carat      53940 non-null float64
cut        53940 non-null object
color      53940 non-null object
clarity    53940 non-null object
depth      53940 non-null float64
table      53940 non-null float64
price      53940 non-null int64
x          53940 non-null float64
y          53940 non-null float64
z          53940 non-null float64
dtypes: float64(6), int64(1), object(3)
memory usage: 3.9+ MB

In [36]:

# Preview the first rows of the data set.
diamonds.head()

Out[36]:

	carat	cut	color	clarity	depth	table	price	x	y	z
0	0.23	Ideal	E	SI2	61.5	55	326	3.95	3.98	2.43
1	0.21	Premium	E	SI1	59.8	61	326	3.89	3.84	2.31
2	0.23	Good	E	VS1	56.9	65	327	4.05	4.07	2.31
3	0.29	Premium	I	VS2	62.4	58	334	4.20	4.23	2.63
4	0.31	Good	J	SI2	63.3	58	335	4.34	4.35	2.75

In [37]:

# Preview the last 10 rows of the data set.
diamonds.tail(10)

Out[37]:

	carat	cut	color	clarity	depth	table	price	x	y	z
53930	0.71	Premium	E	SI1	60.5	55	2756	5.79	5.74	3.49
53931	0.71	Premium	F	SI1	59.8	62	2756	5.74	5.73	3.43
53932	0.70	Very Good	E	VS2	60.5	59	2757	5.71	5.76	3.47
53933	0.70	Very Good	E	VS2	61.2	59	2757	5.69	5.72	3.49
53934	0.72	Premium	D	SI1	62.7	59	2757	5.69	5.73	3.58
53935	0.72	Ideal	D	SI1	60.8	57	2757	5.75	5.76	3.50
53936	0.72	Good	D	SI1	63.1	55	2757	5.69	5.75	3.61
53937	0.70	Very Good	D	SI1	62.8	60	2757	5.66	5.68	3.56
53938	0.86	Premium	H	SI2	61.0	58	2757	6.15	6.12	3.74
53939	0.75	Ideal	D	SI2	62.2	55	2757	5.83	5.87	3.64

In [38]:

# Show the column labels of the data set.
diamonds.columns

Out[38]:

Index([u'carat', u'cut', u'color', u'clarity', u'depth', u'table', u'price', u'x', u'y', u'z'], dtype='object')

In [92]:

# Total population size = number of rows in the full data set.
b = diamonds.shape[0]
print(b)

In [93]:

import numpy as np

In [98]:

# Draw a small random sample of row labels: 0.01% of the population.
# np.random.choice requires an integer size, so the fractional count is
# truncated explicitly instead of being passed as a float.
sample_size = int(0.0001 * b)
# NOTE: choice() samples with replacement by default, so a label may repeat.
rows = np.random.choice(diamonds.index.values, sample_size)
print(rows)
# Select the sampled rows by label; .loc replaces the removed .ix indexer.
sampled_df = diamonds.loc[rows]

[45653  7503 47794 12017 46125]

In [99]:

# Display the randomly sampled rows.
sampled_df

Out[99]:

	carat	cut	color	clarity	depth	table	price	x	y	z
45653	0.25	Ideal	H	IF	61.4	57	525	4.05	4.08	2.49
7503	1.05	Premium	G	SI2	61.3	58	4241	6.55	6.60	4.03
47794	0.71	Ideal	J	VS2	62.4	54	1899	5.72	5.76	3.58
12017	1.00	Premium	F	SI1	59.8	59	5151	6.55	6.49	3.90
46125	0.51	Ideal	F	VS1	61.7	54	1744	5.14	5.17	3.18

In [108]:

# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
diamonds.describe()

Out[108]:

	carat	depth	table	price	x	y	z
count	53940.000000	53940.000000	53940.000000	53940.000000	53940.000000	53940.000000	53940.000000
mean	0.797940	61.749405	57.457184	3932.799722	5.731157	5.734526	3.538734
std	0.474011	1.432621	2.234491	3989.439738	1.121761	1.142135	0.705699
min	0.200000	43.000000	43.000000	326.000000	0.000000	0.000000	0.000000
25%	0.400000	61.000000	56.000000	950.000000	4.710000	4.720000	2.910000
50%	0.700000	61.800000	57.000000	2401.000000	5.700000	5.710000	3.530000
75%	1.040000	62.500000	59.000000	5324.250000	6.540000	6.540000	4.040000
max	5.010000	79.000000	95.000000	18823.000000	10.740000	58.900000	31.800000

In [109]:

# Group the diamonds by cut grade; the aggregates below reuse this grouping.
cut = diamonds.groupby(by="cut")

In [110]:

# Number of non-null values in every column, per cut grade.
cut.count()

Out[110]:

	carat	color	clarity	depth	table	price	x	y	z
cut
Fair	1610	1610	1610	1610	1610	1610	1610	1610	1610
Good	4906	4906	4906	4906	4906	4906	4906	4906	4906
Ideal	21551	21551	21551	21551	21551	21551	21551	21551	21551
Premium	13791	13791	13791	13791	13791	13791	13791	13791	13791
Very Good	12082	12082	12082	12082	12082	12082	12082	12082	12082

In [114]:

# Mean of every numeric measurement within each cut grade.
# The numeric columns are selected explicitly: recent pandas releases raise
# on text columns (color, clarity) instead of silently dropping them.
numeric_cols = diamonds.select_dtypes(include=["number"]).columns.tolist()
cut[numeric_cols].mean()

Out[114]:

	carat	depth	table	price	x	y	z
cut
Fair	1.046137	64.041677	59.053789	4358.757764	6.246894	6.182652	3.982770
Good	0.849185	62.365879	58.694639	3928.864452	5.838785	5.850744	3.639507
Ideal	0.702837	61.709401	55.951668	3457.541970	5.507451	5.520080	3.401448
Premium	0.891955	61.264673	58.746095	4584.257704	5.973887	5.944879	3.647124
Very Good	0.806381	61.818275	57.956150	3981.759891	5.740696	5.770026	3.559801

In [115]:

# Median of every numeric measurement within each cut grade.
# The numeric columns are selected explicitly: recent pandas releases raise
# on text columns (color, clarity) instead of silently dropping them.
numeric_cols = diamonds.select_dtypes(include=["number"]).columns.tolist()
cut[numeric_cols].median()

Out[115]:

	carat	depth	table	price	x	y	z
cut
Fair	1.00	65.0	58	3282.0	6.175	6.10	3.97
Good	0.82	63.4	58	3050.5	5.980	5.99	3.70
Ideal	0.54	61.8	56	1810.0	5.250	5.26	3.23
Premium	0.86	61.4	59	3185.0	6.110	6.06	3.72
Very Good	0.71	62.1	58	2648.0	5.740	5.77	3.56

In [117]:

# Contingency table: how many diamonds fall into each cut / color combination.
pd.crosstab(index=diamonds["cut"], columns=diamonds["color"])

Out[117]:

color	D	E	F	G	H	I	J
cut
Fair	163	224	312	314	303	175	119
Good	662	933	909	871	702	522	307
Ideal	2834	3903	3826	4884	3115	2093	896
Premium	1603	2337	2331	2924	2360	1428	808
Very Good	1513	2400	2164	2299	1824	1204	678

In [121]:

# Pairwise correlation matrix of the numeric measurements.
# Text columns (cut, color, clarity) are filtered out first because recent
# pandas releases no longer skip them automatically and fail on the strings.
diamonds.select_dtypes(include=["number"]).corr()

Out[121]:

	carat	depth	table	price	x	y	z
carat	1.000000	0.028224	0.181618	0.921591	0.975094	0.951722	0.953387
depth	0.028224	1.000000	-0.295779	-0.010647	-0.025289	-0.029341	0.094924
table	0.181618	-0.295779	1.000000	0.127134	0.195344	0.183760	0.150929
price	0.921591	-0.010647	0.127134	1.000000	0.884435	0.865421	0.861249
x	0.975094	-0.025289	0.195344	0.884435	1.000000	0.974701	0.970772
y	0.951722	-0.029341	0.183760	0.865421	0.974701	1.000000	0.952006
z	0.953387	0.094924	0.150929	0.861249	0.970772	0.952006	1.000000

In [164]:

import matplotlib as mt
%matplotlib inline #this line makes sure plots are in same notebook

In [166]:

from ggplot import *

In [ ]:

In [169]:

# Base plot: price on the x axis, carat on the y axis (no layers added yet).
price_carat_aes = aes(x='price', y='carat')
p = ggplot(price_carat_aes, data=diamonds)
p

Out[169]:

<ggplot: (-1059997756)>

In [171]:

# Add a point layer to the base plot p to draw it as a scatter plot.
p + geom_point()

Out[171]:

<ggplot: (-1059338452)>

In [172]:

# Same scatter plot, split into one facet per value of the 'cut' column.
p + geom_point() +facet_grid('cut')

Out[172]:

<ggplot: (-1057884332)>

In [173]:

# Rebuild the plot with point colour mapped to the 'cut' column.
cut_color_aes = aes(x='price', y='carat', color="cut")
p = ggplot(cut_color_aes, data=diamonds)
p + geom_point()

Out[173]:

<ggplot: (-1059249386)>

In [174]:

# Rebuild the plot with point colour mapped to the 'clarity' column.
clarity_color_aes = aes(x='price', y='carat', color="clarity")
p = ggplot(clarity_color_aes, data=diamonds)
p + geom_point()

Out[174]:

<ggplot: (-1060618628)>

In [ ]: