import re
import numpy as np
import pandas as pd
# Raw amounts in mixed formats: currency symbols, thousands separators,
# a bare int, and a string with trailing whitespace.
numlist = ["$10000", "$20,000", "30,000", 40000, "50000 "]
# Normalise every entry to a string with "$" and "," removed.
# Whitespace is deliberately left untouched at this stage.
for i, value in enumerate(numlist):
    numlist[i] = re.sub(r"[$,]", "", str(value))
numlist
['10000', '20000', '30000', '40000', '50000 ']
# Spot-check: a cleaned entry now parses as an integer.
int(numlist[1])
20000
# Convert the cleaned strings to integers in place.
# int() ignores surrounding whitespace, so an entry such as "50000 " parses.
for i, value in enumerate(numlist):
    numlist[i] = int(value)
numlist
[10000, 20000, 30000, 40000, 50000]
# Arithmetic mean of the integer amounts (NumPy returns a float).
np.mean(numlist)
30000.0
# str() of a list yields its repr as one string, brackets and commas included.
numlist2=str(numlist)
# maxsplit=0 means "perform no splits": the result is a one-element list
# holding the whole string, not the individual numbers.
numlist2.split(None,0)
['[10000, 20000, 30000, 40000, 50000]']
# Element 0 of that one-element list is the full string again.
numlist2.split(None,0)[0]
'[10000, 20000, 30000, 40000, 50000]'
# Load the Titanic passenger table from the Rdatasets mirror (requires network access).
titanic = pd.read_csv(
    "https://vincentarelbundock.github.io/Rdatasets/csv/datasets/Titanic.csv"
)
# Remove the 'Unnamed: 0' column (the name pandas assigns to a column whose
# header cell is blank). The axis is passed by keyword: newer pandas releases
# reject the positional form `drop(label, 1)`.
titanic = titanic.drop("Unnamed: 0", axis=1)
# Summarise column dtypes and non-null counts.
titanic.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 1313 entries, 0 to 1312 Data columns (total 6 columns): Name 1313 non-null object PClass 1313 non-null object Age 756 non-null float64 Sex 1313 non-null object Survived 1313 non-null int64 SexCode 1313 non-null int64 dtypes: float64(1), int64(2), object(3) memory usage: 61.6+ KB
# Preview the first five rows.
titanic.head()
Name | PClass | Age | Sex | Survived | SexCode | |
---|---|---|---|---|---|---|
0 | Allen, Miss Elisabeth Walton | 1st | 29.00 | female | 1 | 1 |
1 | Allison, Miss Helen Loraine | 1st | 2.00 | female | 0 | 1 |
2 | Allison, Mr Hudson Joshua Creighton | 1st | 30.00 | male | 0 | 0 |
3 | Allison, Mrs Hudson JC (Bessie Waldo Daniels) | 1st | 25.00 | female | 0 | 1 |
4 | Allison, Master Hudson Trevor | 1st | 0.92 | male | 1 | 0 |
# Take every column except the first one, once as a DataFrame slice (a)
# and once as the underlying NumPy array (b), then show the three types.
a = titanic.iloc[:, 1:]
b = a.values
print(*map(type, (titanic, a, b)), sep="\n")
<class 'pandas.core.frame.DataFrame'> <class 'pandas.core.frame.DataFrame'> <class 'numpy.ndarray'>
# Display the DataFrame slice (all columns after the first).
a
PClass | Age | Sex | Survived | SexCode | |
---|---|---|---|---|---|
0 | 1st | 29.00 | female | 1 | 1 |
1 | 1st | 2.00 | female | 0 | 1 |
2 | 1st | 30.00 | male | 0 | 0 |
3 | 1st | 25.00 | female | 0 | 1 |
4 | 1st | 0.92 | male | 1 | 0 |
5 | 1st | 47.00 | male | 1 | 0 |
6 | 1st | 63.00 | female | 1 | 1 |
7 | 1st | 39.00 | male | 0 | 0 |
8 | 1st | 58.00 | female | 1 | 1 |
9 | 1st | 71.00 | male | 0 | 0 |
10 | 1st | 47.00 | male | 0 | 0 |
11 | 1st | 19.00 | female | 1 | 1 |
12 | 1st | NaN | female | 1 | 1 |
13 | 1st | NaN | male | 1 | 0 |
14 | 1st | NaN | male | 0 | 0 |
15 | 1st | 50.00 | female | 1 | 1 |
16 | 1st | 24.00 | male | 0 | 0 |
17 | 1st | 36.00 | male | 0 | 0 |
18 | 1st | 37.00 | male | 1 | 0 |
19 | 1st | 47.00 | female | 1 | 1 |
20 | 1st | 26.00 | male | 1 | 0 |
21 | 1st | 25.00 | male | 0 | 0 |
22 | 1st | 25.00 | male | 1 | 0 |
23 | 1st | 19.00 | female | 1 | 1 |
24 | 1st | 28.00 | male | 1 | 0 |
25 | 1st | 45.00 | male | 0 | 0 |
26 | 1st | 39.00 | male | 1 | 0 |
27 | 1st | 30.00 | female | 1 | 1 |
28 | 1st | 58.00 | female | 1 | 1 |
29 | 1st | NaN | male | 0 | 0 |
... | ... | ... | ... | ... | ... |
1283 | 3rd | 14.00 | female | 0 | 1 |
1284 | 3rd | 22.00 | male | 0 | 0 |
1285 | 3rd | NaN | male | 0 | 0 |
1286 | 3rd | NaN | male | 0 | 0 |
1287 | 3rd | NaN | male | 0 | 0 |
1288 | 3rd | NaN | male | 0 | 0 |
1289 | 3rd | NaN | male | 1 | 0 |
1290 | 3rd | NaN | male | 0 | 0 |
1291 | 3rd | 51.00 | male | 0 | 0 |
1292 | 3rd | 18.00 | male | 0 | 0 |
1293 | 3rd | 45.00 | female | 1 | 1 |
1294 | 3rd | NaN | male | 0 | 0 |
1295 | 3rd | NaN | male | 0 | 0 |
1296 | 3rd | NaN | male | 0 | 0 |
1297 | 3rd | 28.00 | male | 0 | 0 |
1298 | 3rd | 21.00 | male | 0 | 0 |
1299 | 3rd | 27.00 | male | 0 | 0 |
1300 | 3rd | NaN | male | 0 | 0 |
1301 | 3rd | 36.00 | male | 0 | 0 |
1302 | 3rd | NaN | male | 1 | 0 |
1303 | 3rd | 27.00 | male | 0 | 0 |
1304 | 3rd | 15.00 | female | 1 | 1 |
1305 | 3rd | NaN | male | 0 | 0 |
1306 | 3rd | NaN | female | 0 | 1 |
1307 | 3rd | NaN | female | 0 | 1 |
1308 | 3rd | 27.00 | male | 0 | 0 |
1309 | 3rd | 26.00 | male | 0 | 0 |
1310 | 3rd | 22.00 | male | 0 | 0 |
1311 | 3rd | 24.00 | male | 0 | 0 |
1312 | 3rd | 29.00 | male | 0 | 0 |
1313 rows × 5 columns
# Display the same data as a NumPy array; mixed column types give dtype=object.
b
array([['1st', 29.0, 'female', 1, 1], ['1st', 2.0, 'female', 0, 1], ['1st', 30.0, 'male', 0, 0], ..., ['3rd', 22.0, 'male', 0, 0], ['3rd', 24.0, 'male', 0, 0], ['3rd', 29.0, 'male', 0, 0]], dtype=object)
# Column labels with the first one skipped.
titanic.columns[1:]
Index(['PClass', 'Age', 'Sex', 'Survived', 'SexCode'], dtype='object')
# Select all columns after the first and view them as a NumPy array.
# DataFrame.as_matrix() was removed in pandas 1.0; `.values` on the column
# subset is the documented replacement and works on older releases too.
titanic[titanic.columns[1:]].values
array([['1st', 29.0, 'female', 1, 1], ['1st', 2.0, 'female', 0, 1], ['1st', 30.0, 'male', 0, 0], ..., ['3rd', 22.0, 'male', 0, 0], ['3rd', 24.0, 'male', 0, 0], ['3rd', 29.0, 'male', 0, 0]], dtype=object)
# Keep the array of all columns after the first for the steps below.
# DataFrame.as_matrix() was removed in pandas 1.0; `.values` on the column
# subset is the documented replacement and works on older releases too.
data = titanic[titanic.columns[1:]].values
# Number of rows in the array.
len(data)
1313
# One integer label per row of the array: 0 .. len(data) - 1.
range(0,len(data))
range(0, 1313)
# Rebuild a DataFrame from the raw array.
# NOTE: `data` is an object-dtype array, so every column of `g` is object-typed
# as well; g.infer_objects() can restore numeric dtypes if they are needed.
g = pd.DataFrame(
    data=data,                    # the full array, all rows and columns
    index=range(len(data)),       # fresh 0..n-1 integer row labels
    columns=titanic.columns[1:],  # original column names, first one skipped
)
g.head()
PClass | Age | Sex | Survived | SexCode | |
---|---|---|---|---|---|
0 | 1st | 29 | female | 1 | 1 |
1 | 1st | 2 | female | 0 | 1 |
2 | 1st | 30 | male | 0 | 0 |
3 | 1st | 25 | female | 0 | 1 |
4 | 1st | 0.92 | male | 1 | 0 |