################################################################
## ImportingData #1.2
## Atul Singh
## www.datagenx.net
################################################################
# import
import numpy as np
import pandas as pd
import os
import pickle as pkl
# Show where the notebook is running from and what that directory contains,
# so the relative "dataset/..." paths used below can be checked by eye.
wd = os.getcwd()  # current working directory
print(wd)
files = os.listdir(wd)  # names of the entries directly inside `wd`
print(files)
C:\Git\MachineLearning\python_DC ['.ipynb_checkpoints', 'dataset', 'ImportingData_#1.1.ipynb', 'ImportingData_#1.2.ipynb']
# Open the Excel workbook once; `fh` is the handle every later cell parses
# sheets from. The path is relative to the working directory printed above.
fh = pd.ExcelFile("dataset/titanic.xls")
# List the worksheet names contained in the workbook.
print(fh.sheet_names)
['titanic1', 'titanic2', 'titanic3']
# Parse one worksheet into a pandas DataFrame, selecting it by name.
# With no extra options the first line of the sheet is used as the column
# labels. The output below shows a passenger record in the header position,
# so 'titanic1' apparently has no header row of its own.
df1 = fh.parse("titanic1")
print(df1.head())
1 1.1 Allen, Miss. Elisabeth Walton female 29 \ 0 1 1 Allison, Master. Hudson Trevor male 0.9167 1 1 0 Allison, Miss. Helen Loraine female 2.0000 2 1 0 Allison, Mr. Hudson Joshua Creighton male 30.0000 3 1 0 Allison, Mrs. Hudson J C (Bessie Waldo Daniels) female 25.0000 4 1 1 Anderson, Mr. Harry male 48.0000 0 0.1 24160 211.3375 B5 S 2 Unnamed: 12 \ 0 1 2 113781 151.55 C22 C26 S 11 NaN 1 1 2 113781 151.55 C22 C26 S NaN NaN 2 1 2 113781 151.55 C22 C26 S NaN 135.0 3 1 2 113781 151.55 C22 C26 S NaN NaN 4 0 0 19952 26.55 E12 S 3 NaN St Louis, MO 0 Montreal, PQ / Chesterville, ON 1 Montreal, PQ / Chesterville, ON 2 Montreal, PQ / Chesterville, ON 3 Montreal, PQ / Chesterville, ON 4 New York, NY
# A sheet can also be selected by its zero-based position: 1 is the second
# entry of fh.sheet_names ('titanic2'). Its first line holds real column
# labels (pclass, survived, ...), as the output shows.
df1 = fh.parse(1)
print(df1.head())
pclass survived name sex \ 0 1 1 Allen, Miss. Elisabeth Walton female 1 1 1 Allison, Master. Hudson Trevor male 2 1 0 Allison, Miss. Helen Loraine female 3 1 0 Allison, Mr. Hudson Joshua Creighton male 4 1 0 Allison, Mrs. Hudson J C (Bessie Waldo Daniels) female age sibsp parch ticket fare cabin embarked boat body \ 0 29.0000 0 0 24160 211.3375 B5 S 2 NaN 1 0.9167 1 2 113781 151.5500 C22 C26 S 11 NaN 2 2.0000 1 2 113781 151.5500 C22 C26 S NaN NaN 3 30.0000 1 2 113781 151.5500 C22 C26 S NaN 135.0 4 25.0000 1 2 113781 151.5500 C22 C26 S NaN NaN home.dest 0 St Louis, MO 1 Montreal, PQ / Chesterville, ON 2 Montreal, PQ / Chesterville, ON 3 Montreal, PQ / Chesterville, ON 4 Montreal, PQ / Chesterville, ON
# Parsing with extra options.
# skiprows=[2] drops the single line at 0-based position 2 of the sheet.
# Line 0 is still used as the header, so the record dropped is the second
# one below it ('Allison, Miss. Helen Loraine' is absent from the output).
df1 = fh.parse(0, skiprows=[2])
print(df1.head())
1 1.1 Allen, Miss. Elisabeth Walton female 29 \ 0 1 1 Allison, Master. Hudson Trevor male 0.9167 1 1 0 Allison, Mr. Hudson Joshua Creighton male 30.0000 2 1 0 Allison, Mrs. Hudson J C (Bessie Waldo Daniels) female 25.0000 3 1 1 Anderson, Mr. Harry male 48.0000 4 1 1 Andrews, Miss. Kornelia Theodosia female 63.0000 0 0.1 24160 211.3375 B5 S 2 Unnamed: 12 \ 0 1 2 113781 151.5500 C22 C26 S 11 NaN 1 1 2 113781 151.5500 C22 C26 S NaN 135.0 2 1 2 113781 151.5500 C22 C26 S NaN NaN 3 0 0 19952 26.5500 E12 S 3 NaN 4 1 0 13502 77.9583 D7 S 10 NaN St Louis, MO 0 Montreal, PQ / Chesterville, ON 1 Montreal, PQ / Chesterville, ON 2 Montreal, PQ / Chesterville, ON 3 New York, NY 4 Hudson, NY
# Column labels to apply to the first sheet instead of whatever its first
# line contains.
colnames = [
    "pclass", "survived", "name", "sex", "age", "sibsp", "parch",
    "ticket", "fare", "cabin", "embarked", "boat", "body", "home.dest",
]
# skiprows=[2] drops the line at 0-based position 2; `names` supplies the labels.
# NOTE(review): line 0 is still consumed as the header row and its values are
# replaced by `names`, so the first record ('Allen, Miss. Elisabeth Walton')
# does not appear in the result. Pass header=None if it should be kept as data.
df1 = fh.parse(0, skiprows=[2], names=colnames)
print(df1.head())
pclass survived name sex \ 0 1 1 Allison, Master. Hudson Trevor male 1 1 0 Allison, Mr. Hudson Joshua Creighton male 2 1 0 Allison, Mrs. Hudson J C (Bessie Waldo Daniels) female 3 1 1 Anderson, Mr. Harry male 4 1 1 Andrews, Miss. Kornelia Theodosia female age sibsp parch ticket fare cabin embarked boat body \ 0 0.9167 1 2 113781 151.5500 C22 C26 S 11 NaN 1 30.0000 1 2 113781 151.5500 C22 C26 S NaN 135.0 2 25.0000 1 2 113781 151.5500 C22 C26 S NaN NaN 3 48.0000 0 0 19952 26.5500 E12 S 3 NaN 4 63.0000 1 0 13502 77.9583 D7 S 10 NaN home.dest 0 Montreal, PQ / Chesterville, ON 1 Montreal, PQ / Chesterville, ON 2 Montreal, PQ / Chesterville, ON 3 New York, NY 4 Hudson, NY
# Read only the first four columns of the first sheet and label them with the
# first four entries of `colnames`.
#   skiprows=[1, 2, 3]   -> drop the lines at 0-based positions 1, 2 and 3
#   usecols=[0, 1, 2, 3] -> keep columns 0-3 only
# `usecols` replaces the former `parse_cols` keyword, which was deprecated in
# pandas 0.21 and is rejected by pandas 1.0 and later.
# NOTE(review): line 0 is still consumed as the header row and replaced by
# `names`, so the first record is not part of the result; pass header=None
# if it should be kept as data.
df1 = fh.parse(0, skiprows=[1, 2, 3], usecols=[0, 1, 2, 3], names=colnames[:4])
print(df1.head())
pclass survived name sex 0 1 0 Allison, Mrs. Hudson J C (Bessie Waldo Daniels) female 1 1 1 Anderson, Mr. Harry male 2 1 1 Andrews, Miss. Kornelia Theodosia female 3 1 0 Andrews, Mr. Thomas Jr male 4 1 1 Appleton, Mrs. Edward Dale (Charlotte Lamson) female
# # Import sas7bdat package
# from sas7bdat import SAS7BDAT
# # Save file to a DataFrame: df_sas
# with SAS7BDAT('sales.sas7bdat') as file:
# df_sas = file.to_data_frame()
# # Print head of DataFrame
# print(df_sas.head())
# # Plot histogram of DataFrame features (pandas and pyplot already imported)
# pd.DataFrame.hist(df_sas[['P']])
# plt.ylabel('count')
# plt.show()
## Importing Stata files
# # Import pandas
# import pandas as pd
# # Load Stata file into a pandas DataFrame: df
# df = pd.read_stata("disarea.dta")
# # Print the head of the DataFrame df
# print(df.head())
# # Plot histogram of one column of the DataFrame
# pd.DataFrame.hist(df[['disa10']])
# plt.xlabel('Extent of disease')
# plt.ylabel('Number of countries')
# plt.show()
# # Import packages
# import numpy as np
# import h5py
# # Assign filename: file
# file = "LIGO_data.hdf5"
# # Load file: data
# data = h5py.File(file, "r")
# # Print the datatype of the loaded file
# print(type(data))
# # Print the keys of the file
# for key in data.keys():
# print(key)
# # Get the HDF5 group: group
# group = data['strain']
# # Check out keys of group
# for key in group.keys():
# print(key)
# # Set variable equal to time series data: strain
# strain = data['strain']['Strain'].value
# # Set number of time points to sample: num_samples
# num_samples=10000
# # Set time vector
# time = np.arange(0, 1, 1/num_samples)
# # Plot data
# plt.plot(time, strain[:num_samples])
# plt.xlabel('GPS Time (s)')
# plt.ylabel('strain')
# plt.show()
# # Import package
# import scipy.io
# # Load MATLAB file: mat
# mat = scipy.io.loadmat('albeck_gene_expression.mat')
# # Print the datatype of mat
# print(type(mat))
# # Print the keys of the MATLAB dictionary
# print(mat.keys())
# # Print the type of the value corresponding to the key 'CYratioCyt'
# print(type(mat['CYratioCyt']))
# # Print the shape of the value corresponding to the key 'CYratioCyt'
# print(np.shape(mat['CYratioCyt']))
# # Subset the array and plot it
# data = mat['CYratioCyt'][25, 5:]
# fig = plt.figure()
# plt.plot(data)
# plt.xlabel('time (min.)')
# plt.ylabel('normalized fluorescence (measure of expression)')
# plt.show()
############################################################
## Atul Singh | www.datagenx.net | lnked.in/atulsingh
############################################################