Notebook

In [ ]:

################################################################
##  ImportingData #1.2
##  Atul Singh
##  www.datagenx.net
################################################################

Importing Data #1¶

#1.2 Importing data from other file formats¶

In [6]:

# import 
import numpy as np
import pandas as pd
import os
import pickle as pkl

Running os library funcs¶

In [3]:

# Locate the directory the notebook is running from ...
wd = os.getcwd()
# ... and collect the names of everything stored directly inside it.
files = os.listdir(wd)
# Emit the path and the listing on separate lines.
print(wd, files, sep="\n")

C:\Git\MachineLearning\python_DC
['.ipynb_checkpoints', 'dataset', 'ImportingData_#1.1.ipynb', 'ImportingData_#1.2.ipynb']

Reading Excel using pandas¶

In [8]:

# Open the workbook once; the `fh` handle is reused by the cells below to
# parse individual sheets (path is relative to the notebook's working dir).
fh = pd.ExcelFile("dataset/titanic.xls")
# List the names of the worksheets contained in the workbook.
print(fh.sheet_names)

['titanic1', 'titanic2', 'titanic3']

In [9]:

# Parse a sheet into a pandas DataFrame, selecting the sheet by its name.
# With default options the first row of the sheet is used as the column
# header; this sheet starts directly with data, so the first record ends up
# as the column labels (see the output below).
df1 = fh.parse("titanic1")  # first line becomes header
print(df1.head())

   1  1.1                    Allen, Miss. Elisabeth Walton  female       29  \
0  1    1                   Allison, Master. Hudson Trevor    male   0.9167   
1  1    0                     Allison, Miss. Helen Loraine  female   2.0000   
2  1    0             Allison, Mr. Hudson Joshua Creighton    male  30.0000   
3  1    0  Allison, Mrs. Hudson J C (Bessie Waldo Daniels)  female  25.0000   
4  1    1                              Anderson, Mr. Harry    male  48.0000   

   0  0.1   24160  211.3375       B5  S    2  Unnamed: 12  \
0  1    2  113781    151.55  C22 C26  S   11          NaN   
1  1    2  113781    151.55  C22 C26  S  NaN          NaN   
2  1    2  113781    151.55  C22 C26  S  NaN        135.0   
3  1    2  113781    151.55  C22 C26  S  NaN          NaN   
4  0    0   19952     26.55      E12  S    3          NaN   

                      St Louis, MO  
0  Montreal, PQ / Chesterville, ON  
1  Montreal, PQ / Chesterville, ON  
2  Montreal, PQ / Chesterville, ON  
3  Montreal, PQ / Chesterville, ON  
4                     New York, NY

In [10]:

# Sheets can also be selected by 0-based position: index 1 is the second
# entry of fh.sheet_names. `df1` is overwritten with the new sheet.
df1 = fh.parse(1)
print(df1.head())

   pclass  survived                                             name     sex  \
0       1         1                    Allen, Miss. Elisabeth Walton  female   
1       1         1                   Allison, Master. Hudson Trevor    male   
2       1         0                     Allison, Miss. Helen Loraine  female   
3       1         0             Allison, Mr. Hudson Joshua Creighton    male   
4       1         0  Allison, Mrs. Hudson J C (Bessie Waldo Daniels)  female   

       age  sibsp  parch  ticket      fare    cabin embarked boat   body  \
0  29.0000      0      0   24160  211.3375       B5        S    2    NaN   
1   0.9167      1      2  113781  151.5500  C22 C26        S   11    NaN   
2   2.0000      1      2  113781  151.5500  C22 C26        S  NaN    NaN   
3  30.0000      1      2  113781  151.5500  C22 C26        S  NaN  135.0   
4  25.0000      1      2  113781  151.5500  C22 C26        S  NaN    NaN   

                         home.dest  
0                     St Louis, MO  
1  Montreal, PQ / Chesterville, ON  
2  Montreal, PQ / Chesterville, ON  
3  Montreal, PQ / Chesterville, ON  
4  Montreal, PQ / Chesterville, ON

In [13]:

# Parsing with more options.
# skiprows takes 0-based row positions within the sheet: row 0 is still used
# as the header, and row 2 (the second record after the header) is dropped.
df1 = fh.parse(0, skiprows=[2])  # 1st line becomes the header; only sheet row 2 (0-based) is skipped
print(df1.head())

   1  1.1                    Allen, Miss. Elisabeth Walton  female       29  \
0  1    1                   Allison, Master. Hudson Trevor    male   0.9167   
1  1    0             Allison, Mr. Hudson Joshua Creighton    male  30.0000   
2  1    0  Allison, Mrs. Hudson J C (Bessie Waldo Daniels)  female  25.0000   
3  1    1                              Anderson, Mr. Harry    male  48.0000   
4  1    1                Andrews, Miss. Kornelia Theodosia  female  63.0000   

   0  0.1   24160  211.3375       B5  S    2  Unnamed: 12  \
0  1    2  113781  151.5500  C22 C26  S   11          NaN   
1  1    2  113781  151.5500  C22 C26  S  NaN        135.0   
2  1    2  113781  151.5500  C22 C26  S  NaN          NaN   
3  0    0   19952   26.5500      E12  S    3          NaN   
4  1    0   13502   77.9583       D7  S   10          NaN   

                      St Louis, MO  
0  Montreal, PQ / Chesterville, ON  
1  Montreal, PQ / Chesterville, ON  
2  Montreal, PQ / Chesterville, ON  
3                     New York, NY  
4                       Hudson, NY

In [14]:

# Column labels to apply to the header-less first sheet.
colnames = [
    "pclass", "survived", "name", "sex", "age", "sibsp", "parch",
    "ticket", "fare", "cabin", "embarked", "boat", "body", "home.dest",
]
# Sheet row 2 (0-based) is skipped and the columns are relabelled via `names`.
# NOTE: header still defaults to 0, so sheet row 0 -- a real record in this
# sheet -- is consumed as the header and then replaced by `names`; it does not
# appear in the resulting frame. Pass header=None to keep that record.
df1 = fh.parse(0, names=colnames, skiprows=[2])
print(df1.head())

   pclass  survived                                             name     sex  \
0       1         1                   Allison, Master. Hudson Trevor    male   
1       1         0             Allison, Mr. Hudson Joshua Creighton    male   
2       1         0  Allison, Mrs. Hudson J C (Bessie Waldo Daniels)  female   
3       1         1                              Anderson, Mr. Harry    male   
4       1         1                Andrews, Miss. Kornelia Theodosia  female   

       age  sibsp  parch  ticket      fare    cabin embarked boat   body  \
0   0.9167      1      2  113781  151.5500  C22 C26        S   11    NaN   
1  30.0000      1      2  113781  151.5500  C22 C26        S  NaN  135.0   
2  25.0000      1      2  113781  151.5500  C22 C26        S  NaN    NaN   
3  48.0000      0      0   19952   26.5500      E12        S    3    NaN   
4  63.0000      1      0   13502   77.9583       D7        S   10    NaN   

                         home.dest  
0  Montreal, PQ / Chesterville, ON  
1  Montreal, PQ / Chesterville, ON  
2  Montreal, PQ / Chesterville, ON  
3                     New York, NY  
4                       Hudson, NY

In [45]:

# Keep only the first four columns (positions 0-3) and label them with the
# first four entries of colnames; sheet rows 1, 2 and 3 (0-based) are skipped.
# `usecols` replaces the `parse_cols` keyword, which was deprecated in
# pandas 0.21 and later removed -- the old spelling fails on current pandas.
# NOTE: header still defaults to 0, so sheet row 0 is consumed as the header
# and its values are replaced by `names`.
df1 = fh.parse(0, skiprows=[1,2,3], usecols=[0,1,2,3], names=colnames[0:4])
print(df1.head())

   pclass  survived                                             name     sex
0       1         0  Allison, Mrs. Hudson J C (Bessie Waldo Daniels)  female
1       1         1                              Anderson, Mr. Harry    male
2       1         1                Andrews, Miss. Kornelia Theodosia  female
3       1         0                           Andrews, Mr. Thomas Jr    male
4       1         1    Appleton, Mrs. Edward Dale (Charlotte Lamson)  female

Reading SAS files¶

In [46]:

# # Import sas7bdat package
# from sas7bdat import SAS7BDAT

# # Save file to a DataFrame: df_sas
# with SAS7BDAT('sales.sas7bdat') as file:
#     df_sas = file.to_data_frame()

# # Print head of DataFrame
# print(df_sas.head())

# # Plot histogram of DataFrame features (needs `import matplotlib.pyplot as plt`; this notebook only imports pandas)
# pd.DataFrame.hist(df_sas[['P']])
# plt.ylabel('count')
# plt.show()

In [47]:

## Importing Stata files

# # Import pandas
# import pandas as pd

# # Load Stata file into a pandas DataFrame: df
# df = pd.read_stata("disarea.dta")

# # Print the head of the DataFrame df
# print(df.head())

# # Plot histogram of one column of the DataFrame
# pd.DataFrame.hist(df[['disa10']])
# plt.xlabel('Extent of disease')
# plt.ylabel('Number of countries')
# plt.show()

Reading HDF5 files¶

In [48]:

# # Import packages
# import numpy as np
# import h5py

# # Assign filename: file
# file = "LIGO_data.hdf5"

# # Load file: data
# data = h5py.File(file, "r")

# # Print the datatype of the loaded file
# print(type(data))

# # Print the keys of the file
# for key in data.keys():
#     print(key)

In [ ]:

# # Get the HDF5 group: group
# group = data['strain']

# # Check out keys of group
# for key in group.keys():
#     print(key)

# # Set variable equal to time series data: strain
# # (Dataset.value was removed in h5py 3.0; index with [()] to read the whole dataset)
# strain = data['strain']['Strain'][()]

# # Set number of time points to sample: num_samples
# num_samples=10000

# # Set time vector
# time = np.arange(0, 1, 1/num_samples)

# # Plot data
# plt.plot(time, strain[:num_samples])
# plt.xlabel('GPS Time (s)')
# plt.ylabel('strain')
# plt.show()

Loading Matlab files¶

In [ ]:

# # Import package
# import scipy.io

# # Load MATLAB file: mat
# mat = scipy.io.loadmat('albeck_gene_expression.mat')

# # Print the datatype of mat
# print(type(mat))

In [ ]:

# # Print the keys of the MATLAB dictionary
# print(mat.keys())

# # Print the type of the value corresponding to the key 'CYratioCyt'
# print(type(mat['CYratioCyt']))

# # Print the shape of the value corresponding to the key 'CYratioCyt'
# print(np.shape(mat['CYratioCyt']))

# # Subset the array and plot it
# data = mat['CYratioCyt'][25, 5:]
# fig = plt.figure()
# plt.plot(data)
# plt.xlabel('time (min.)')
# plt.ylabel('normalized fluorescence (measure of expression)')
# plt.show()

In [ ]:

############################################################
## Atul Singh  | www.datagenx.net | lnked.in/atulsingh
############################################################