################################################################
## ImportingData #1.1
## Atul Singh
## www.datagenx.net
################################################################
# import
import numpy as np
import pandas as pd
!dir
Volume in drive C is OS Volume Serial Number is D202-8009 Directory of C:\learn\Git\MachineLearning\python_DC 21-12-2016 14:57 <DIR> . 21-12-2016 14:57 <DIR> .. 21-12-2016 13:38 <DIR> .ipynb_checkpoints 21-12-2016 14:51 1,038 ImportingData_#1.ipynb 21-12-2016 14:57 108,285 titanic.csv 21-12-2016 13:29 513,536 titanic.xls 3 File(s) 622,859 bytes 3 Dir(s) 311,402,659,840 bytes free
# Read the whole file into memory with a manually managed file handle.
fh = open("dataset/titanic.txt", "r")
try:
    data = fh.read()
    # print(data)
finally:
    # closing the file handler -- done in `finally` so the handle is
    # released even if read() raises
    fh.close()
# Count the lines by iterating over the file object (one line per iteration).
fh = open("dataset/titanic.txt", "r")
try:
    count = 0
    for line in fh:
        count = count + 1
finally:
    # No context manager is used here, so the handle must be closed by hand.
    fh.close()
print(count)
1311
# Count the lines again, this time letting the `with` statement close the
# file automatically when the block exits.
with open("dataset/titanic.txt", "r") as fh:
    count = sum(1 for _ in fh)
print(count)
1311
# Print the 1st, 2nd and 3rd lines of the file (the header and two records).
# readline() keeps each line's trailing newline, so print() emits an extra
# blank line after every one.
with open("dataset/titanic.txt", "r") as fh:
    for _ in range(3):
        print(fh.readline())
pclass|survived|name|sex|age|sibsp|parch|ticket|fare|cabin|embarked|boat|body|home.dest 1|1|Allen, Miss. Elisabeth Walton|female|29|0|0|24160|211.3375|B5|S|2||St Louis, MO 1|1|Allison, Master. Hudson Trevor|male|0.9167|1|2|113781|151.5500|C22 C26|S|11||Montreal, PQ / Chesterville, ON
# Zen of python
import this
The Zen of Python, by Tim Peters Beautiful is better than ugly. Explicit is better than implicit. Simple is better than complex. Complex is better than complicated. Flat is better than nested. Sparse is better than dense. Readability counts. Special cases aren't special enough to break the rules. Although practicality beats purity. Errors should never pass silently. Unless explicitly silenced. In the face of ambiguity, refuse the temptation to guess. There should be one-- and preferably only one --obvious way to do it. Although that way may not be obvious at first unless you're Dutch. Now is better than never. Although never is often better than *right* now. If the implementation is hard to explain, it's a bad idea. If the implementation is easy to explain, it may be a good idea. Namespaces are one honking great idea -- let's do more of those!
# Reading a file with NumPy
# fhand = np.loadtxt("dataset/Employees.csv", delimiter=',')
# The call above is left disabled: after trying several files, it turned out
# that np.loadtxt cannot read non-numeric data when called this way.
fhand = np.loadtxt("dataset/numbers.txt", delimiter=',')
type(fhand)
print(fhand)
# for i in fhand:
#     print(i)
[[ 1. 2. 3. 4. 5. 6. 7.] [ 8. 9. 0. 1. 2. 3. 4.] [ 6. 7. 8. 9. 4. 2. 4.] [ 3. 4. 6. 3. 2. 5. 7.] [ 0. 9. 7. 4. 7. 5. 3.]]
# Skip the first two rows and keep only the 1st and 4th columns.
fhand = np.loadtxt(
    "dataset/numbers.txt",
    delimiter=',',
    skiprows=2,
    usecols=(0, 3),
)
print(fhand)
[[ 6. 9.] [ 3. 3.] [ 0. 4.]]
# By default np.loadtxt expects numeric data; passing dtype=str forces it to
# read every field as a string instead.
fhand = np.loadtxt(
    "dataset/titanic.txt",
    delimiter='|',
    dtype=str,
)
print(fhand[:4])
[["b'pclass'" "b'survived'" "b'name'" "b'sex'" "b'age'" "b'sibsp'" "b'parch'" "b'ticket'" "b'fare'" "b'cabin'" "b'embarked'" "b'boat'" "b'body'" "b'home.dest'"] ["b'1'" "b'1'" "b'Allen, Miss. Elisabeth Walton'" "b'female'" "b'29'" "b'0'" "b'0'" "b'24160'" "b'211.3375'" "b'B5'" "b'S'" "b'2'" "b''" "b'St Louis, MO'"] ["b'1'" "b'1'" "b'Allison, Master. Hudson Trevor'" "b'male'" "b'0.9167'" "b'1'" "b'2'" "b'113781'" "b'151.5500'" "b'C22 C26'" "b'S'" "b'11'" "b''" "b'Montreal, PQ / Chesterville, ON'"] ["b'1'" "b'0'" "b'Allison, Miss. Helen Loraine'" "b'female'" "b'2'" "b'1'" "b'2'" "b'113781'" "b'151.5500'" "b'C22 C26'" "b'S'" "b''" "b''" "b'Montreal, PQ / Chesterville, ON'"]]
# Skip the header row and keep only the first five columns.
fhand = np.loadtxt(
    "dataset/titanic.txt",
    delimiter='|',
    dtype=str,
    skiprows=1,
    usecols=(0, 1, 2, 3, 4),
)
print(fhand[:4])
[["b'1'" "b'1'" "b'Allen, Miss. Elisabeth Walton'" "b'female'" "b'29'"] ["b'1'" "b'1'" "b'Allison, Master. Hudson Trevor'" "b'male'" "b'0.9167'"] ["b'1'" "b'0'" "b'Allison, Miss. Helen Loraine'" "b'female'" "b'2'"] ["b'1'" "b'0'" "b'Allison, Mr. Hudson Joshua Creighton'" "b'male'" "b'30'"]]
# np.genfromtxt options used below:
#   dtype=None -- let NumPy identify the data type of each column
#   names=True -- treat the first line as the header
fhand = np.genfromtxt(
    "dataset/titanic.txt",
    delimiter='|',
    dtype=None,
    names=True,
    usecols=(0, 1, 2, 3, 4),
)
print(fhand[:4])
[(1, 1, b'Allen, Miss. Elisabeth Walton', b'female', 29.0) (1, 1, b'Allison, Master. Hudson Trevor', b'male', 0.9167) (1, 0, b'Allison, Miss. Helen Loraine', b'female', 2.0) (1, 0, b'Allison, Mr. Hudson Joshua Creighton', b'male', 30.0)]
In `np.recfromcsv`, the following defaults apply:
```
dtype is None
delimiter is ','
names is True
```
# Equivalent of np.recfromcsv(..., delimiter='|', names=True, usecols=[...]).
# np.recfromcsv is no longer available in the top-level namespace of
# NumPy >= 2.0, so its behaviour is spelled out here: genfromtxt with the
# recfromcsv defaults (dtype=None, lower-cased field names), then viewed as
# a record array.
fhand = np.genfromtxt(
    "dataset/titanic.txt",
    delimiter='|',
    dtype=None,
    names=True,
    case_sensitive='lower',
    usecols=[0, 1, 2, 3, 4],
).view(np.recarray)
print(fhand[0:4])
[(1, 1, b'Allen, Miss. Elisabeth Walton', b'female', 29.0) (1, 1, b'Allison, Master. Hudson Trevor', b'male', 0.9167) (1, 0, b'Allison, Miss. Helen Loraine', b'female', 2.0) (1, 0, b'Allison, Mr. Hudson Joshua Creighton', b'male', 30.0)]
# Equivalent of np.recfromcsv("dataset/Employees.csv").
# np.recfromcsv is no longer available in the top-level namespace of
# NumPy >= 2.0, so its defaults are written out explicitly: comma delimiter,
# first line as header, inferred dtypes, lower-cased field names, and a
# record-array view of the result.
fhand = np.genfromtxt(
    "dataset/Employees.csv",
    delimiter=',',
    dtype=None,
    names=True,
    case_sensitive='lower',
).view(np.recarray)
print(fhand[0:4])
[ (b'AARON', b'ELVIA J', b'WATER RATE TAKER', b'WATER MGMNT', b'$81000.00', b'$73862.00') (b'AARON', b'JEFFERY M', b'POLICE OFFICER', b'POLICE', b'$74628.00', b'$74628.00') (b'AARON', b'KIMBERLEI R', b'CHIEF CONTRACT EXPEDITER', b'FLEET MANAGEMNT', b'$77280.00', b'$70174.00') (b'ABAD JR', b'VICENTE M', b'CIVIL ENGINEER IV', b'WATER MGMNT', b'$96276.00', b'$96276.00')]
# Load the employee CSV into a pandas DataFrame and print its first five rows.
fh = pd.read_csv("dataset/Employees.csv")
print(fh[:5])
LNAME FNAME JOB TITLE DEPARTMENT \ 0 AARON ELVIA J WATER RATE TAKER WATER MGMNT 1 AARON JEFFERY M POLICE OFFICER POLICE 2 AARON KIMBERLEI R CHIEF CONTRACT EXPEDITER FLEET MANAGEMNT 3 ABAD JR VICENTE M CIVIL ENGINEER IV WATER MGMNT 4 ABBATACOLA ROBERT J ELECTRICAL MECHANIC WATER MGMNT EMPLOYEE ANNUAL SALARY ESTIMATED ANNUAL SALARY MINUS FURLOUGHS 0 $81000.00 $73862.00 1 $74628.00 $74628.00 2 $77280.00 $70174.00 3 $96276.00 $96276.00 4 $84032.00 $76627.00
# Display the first four rows of the DataFrame loaded above.
fh.head(4)
LNAME | FNAME | JOB TITLE | DEPARTMENT | EMPLOYEE ANNUAL SALARY | ESTIMATED ANNUAL SALARY MINUS FURLOUGHS | |
---|---|---|---|---|---|---|
0 | AARON | ELVIA J | WATER RATE TAKER | WATER MGMNT | $81000.00 | $73862.00 |
1 | AARON | JEFFERY M | POLICE OFFICER | POLICE | $74628.00 | $74628.00 |
2 | AARON | KIMBERLEI R | CHIEF CONTRACT EXPEDITER | FLEET MANAGEMNT | $77280.00 | $70174.00 |
3 | ABAD JR | VICENTE M | CIVIL ENGINEER IV | WATER MGMNT | $96276.00 | $96276.00 |
# Some more functions
#   header=None -- the file has no header row
#   nrows=3     -- read only the first three rows
fh_pd = pd.read_csv(
    "dataset/numbers.txt",
    header=None,
    nrows=3,
)
fh_pd
0 | 1 | 2 | 3 | 4 | 5 | 6 | |
---|---|---|---|---|---|---|---|
0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 |
1 | 8 | 9 | 0 | 1 | 2 | 3 | 4 |
2 | 6 | 7 | 8 | 9 | 4 | 2 | 4 |
# Values only: the DataFrame's data without its row/column labels
fh_pd.values
array([[1, 2, 3, 4, 5, 6, 7], [8, 9, 0, 1, 2, 3, 4], [6, 7, 8, 9, 4, 2, 4]], dtype=int64)
# Creating a NumPy array from the DataFrame's values, then displaying it
narr = np.array(fh_pd.values)
narr
array([[1, 2, 3, 4, 5, 6, 7], [8, 9, 0, 1, 2, 3, 4], [6, 7, 8, 9, 4, 2, 4]], dtype=int64)
# Compare the types of the DataFrame, its .values attribute, and the array built from it.
print(type(fh_pd), type(fh_pd.values), type(narr))
<class 'pandas.core.frame.DataFrame'> <class 'numpy.ndarray'> <class 'numpy.ndarray'>
#pd.read_csv(file, sep="\t", comment="#", na_values=["Nothing"])
# comment - character that marks a comment; anything after it on a line is not parsed
# na_values - additional strings to treat as missing; here "Nothing" is read as NaN (pandas' marker for missing data)
############################################################
## Atul Singh | www.datagenx.net | lnked.in/atulsingh
############################################################