import numpy as np
import pandas as pd
from numpy.random import randn
labels = ['a','b','c']
data = [10,20,30]
arr = np.array(data)
d = {'a':10,'b':20,'c':30}
labels
['a', 'b', 'c']
data
[10, 20, 30]
arr
array([10, 20, 30])
d
{'a': 10, 'b': 20, 'c': 30}
pd.Series(data = data)
0 10 1 20 2 30 dtype: int64
pd.Series(data=data,index=labels)
a 10 b 20 c 30 dtype: int64
pd.Series(data,labels)
a 10 b 20 c 30 dtype: int64
pd.Series(arr)
0 10 1 20 2 30 dtype: int64
pd.Series(arr,labels)
a 10 b 20 c 30 dtype: int64
pd.Series(d)
a 10 b 20 c 30 dtype: int64
pd.Series(data=labels)
0 a 1 b 2 c dtype: object
pd.Series(data=[sum,print,len])
0 <built-in function sum> 1 <built-in function print> 2 <built-in function len> dtype: object
ser1 = pd.Series([1,2,3,4],['USA','Russia','China','Japan'])
ser1
USA 1 Russia 2 China 3 Japan 4 dtype: int64
ser2 = pd.Series([1,2,5,4],['USA','Russia','India','Japan'])
ser2
USA 1 Russia 2 India 5 Japan 4 dtype: int64
ser1['USA']
1
ser3 = pd.Series(data=labels)
ser3
0 a 1 b 2 c dtype: object
ser3[0]
'a'
ser1 + ser2
China NaN India NaN Japan 8.0 Russia 4.0 USA 2.0 dtype: float64
np.random.seed(101)
df = pd.DataFrame(randn(5,4),['A','B','C','D','E'],['W','X','Y','Z'])
df
W | X | Y | Z | |
---|---|---|---|---|
A | 2.706850 | 0.628133 | 0.907969 | 0.503826 |
B | 0.651118 | -0.319318 | -0.848077 | 0.605965 |
C | -2.018168 | 0.740122 | 0.528813 | -0.589001 |
D | 0.188695 | -0.758872 | -0.933237 | 0.955057 |
E | 0.190794 | 1.978757 | 2.605967 | 0.683509 |
df['W']
A 2.706850 B 0.651118 C -2.018168 D 0.188695 E 0.190794 Name: W, dtype: float64
type(df['W'])
pandas.core.series.Series
type(df)
pandas.core.frame.DataFrame
df.W
A 2.706850 B 0.651118 C -2.018168 D 0.188695 E 0.190794 Name: W, dtype: float64
df[['W','Z']]
W | Z | |
---|---|---|
A | 2.706850 | 0.503826 |
B | 0.651118 | 0.605965 |
C | -2.018168 | -0.589001 |
D | 0.188695 | 0.955057 |
E | 0.190794 | 0.683509 |
df['new'] = df['W'] + df['Y']
df['new']
A 3.614819 B -0.196959 C -1.489355 D -0.744542 E 2.796762 Name: new, dtype: float64
df
W | X | Y | Z | new | |
---|---|---|---|---|---|
A | 2.706850 | 0.628133 | 0.907969 | 0.503826 | 3.614819 |
B | 0.651118 | -0.319318 | -0.848077 | 0.605965 | -0.196959 |
C | -2.018168 | 0.740122 | 0.528813 | -0.589001 | -1.489355 |
D | 0.188695 | -0.758872 | -0.933237 | 0.955057 | -0.744542 |
E | 0.190794 | 1.978757 | 2.605967 | 0.683509 | 2.796762 |
df.drop('new', axis=1, inplace=True) # 1 refere-se ao eixo das colunas
df
W | X | Y | Z | |
---|---|---|---|---|
A | 2.706850 | 0.628133 | 0.907969 | 0.503826 |
B | 0.651118 | -0.319318 | -0.848077 | 0.605965 |
C | -2.018168 | 0.740122 | 0.528813 | -0.589001 |
D | 0.188695 | -0.758872 | -0.933237 | 0.955057 |
E | 0.190794 | 1.978757 | 2.605967 | 0.683509 |
df.drop('E') # 0 refere-se ao eixo das linhas, definido como padrão
W | X | Y | Z | |
---|---|---|---|---|
A | 2.706850 | 0.628133 | 0.907969 | 0.503826 |
B | 0.651118 | -0.319318 | -0.848077 | 0.605965 |
C | -2.018168 | 0.740122 | 0.528813 | -0.589001 |
D | 0.188695 | -0.758872 | -0.933237 | 0.955057 |
df.shape
(5, 4)
df.loc['C']
W -2.018168 X 0.740122 Y 0.528813 Z -0.589001 Name: C, dtype: float64
df.iloc[2] # Localiza por posição numérica, mesmo os índices sendo strings
W -2.018168 X 0.740122 Y 0.528813 Z -0.589001 Name: C, dtype: float64
df.loc['B','Y']
-0.8480769834036315
df.loc[['A','B'],['W','Y']]
W | Y | |
---|---|---|
A | 2.706850 | 0.907969 |
B | 0.651118 | -0.848077 |
booldf = df > 0
df[booldf]
W | X | Y | Z | |
---|---|---|---|---|
A | 2.706850 | 0.628133 | 0.907969 | 0.503826 |
B | 0.651118 | NaN | NaN | 0.605965 |
C | NaN | 0.740122 | 0.528813 | NaN |
D | 0.188695 | NaN | NaN | 0.955057 |
E | 0.190794 | 1.978757 | 2.605967 | 0.683509 |
df['W'] > 0
A True B True C False D True E True Name: W, dtype: bool
df[df['W'] > 0]
W | X | Y | Z | |
---|---|---|---|---|
A | 2.706850 | 0.628133 | 0.907969 | 0.503826 |
B | 0.651118 | -0.319318 | -0.848077 | 0.605965 |
D | 0.188695 | -0.758872 | -0.933237 | 0.955057 |
E | 0.190794 | 1.978757 | 2.605967 | 0.683509 |
df[df['Z'] < 0]
W | X | Y | Z | |
---|---|---|---|---|
C | -2.018168 | 0.740122 | 0.528813 | -0.589001 |
df[df['W'] > 0][['Y','X']]
Y | X | |
---|---|---|
A | 0.907969 | 0.628133 |
B | -0.848077 | -0.319318 |
D | -0.933237 | -0.758872 |
E | 2.605967 | 1.978757 |
df[df['W'] < 0][['Y','X']]
Y | X | |
---|---|---|
C | 0.528813 | 0.740122 |
df[(df['W'] > 0) & (df['Y'] > 1)] # and
W | X | Y | Z | |
---|---|---|---|---|
E | 0.190794 | 1.978757 | 2.605967 | 0.683509 |
df[(df['W'] > 0) | (df['Y'] > 1)] # or
W | X | Y | Z | States | |
---|---|---|---|---|---|
A | 2.706850 | 0.628133 | 0.907969 | 0.503826 | CA |
B | 0.651118 | -0.319318 | -0.848077 | 0.605965 | NY |
D | 0.188695 | -0.758872 | -0.933237 | 0.955057 | OR |
E | 0.190794 | 1.978757 | 2.605967 | 0.683509 | CO |
df.reset_index() # não ocorre in_place
index | W | X | Y | Z | |
---|---|---|---|---|---|
0 | A | 2.706850 | 0.628133 | 0.907969 | 0.503826 |
1 | B | 0.651118 | -0.319318 | -0.848077 | 0.605965 |
2 | C | -2.018168 | 0.740122 | 0.528813 | -0.589001 |
3 | D | 0.188695 | -0.758872 | -0.933237 | 0.955057 |
4 | E | 0.190794 | 1.978757 | 2.605967 | 0.683509 |
new_index = 'CA NY WY OR CO'.split()
new_index
['CA', 'NY', 'WY', 'OR', 'CO']
df['States'] = new_index
df
W | X | Y | Z | States | |
---|---|---|---|---|---|
A | 2.706850 | 0.628133 | 0.907969 | 0.503826 | CA |
B | 0.651118 | -0.319318 | -0.848077 | 0.605965 | NY |
C | -2.018168 | 0.740122 | 0.528813 | -0.589001 | WY |
D | 0.188695 | -0.758872 | -0.933237 | 0.955057 | OR |
E | 0.190794 | 1.978757 | 2.605967 | 0.683509 | CO |
df.set_index('States') # não ocorre in_place
W | X | Y | Z | |
---|---|---|---|---|
States | ||||
CA | 2.706850 | 0.628133 | 0.907969 | 0.503826 |
NY | 0.651118 | -0.319318 | -0.848077 | 0.605965 |
WY | -2.018168 | 0.740122 | 0.528813 | -0.589001 |
OR | 0.188695 | -0.758872 | -0.933237 | 0.955057 |
CO | 0.190794 | 1.978757 | 2.605967 | 0.683509 |
fora = ['G1', 'G1', 'G1', 'G2', 'G2', 'G2']
dentro = [1,2,3,1,2,3]
hier_index = list(zip(fora,dentro))
hier_index
[('G1', 1), ('G1', 2), ('G1', 3), ('G2', 1), ('G2', 2), ('G2', 3)]
hier_index = pd.MultiIndex.from_tuples(hier_index)
hier_index
MultiIndex(levels=[['G1', 'G2'], [1, 2, 3]], labels=[[0, 0, 0, 1, 1, 1], [0, 1, 2, 0, 1, 2]])
df = pd.DataFrame(randn(6,2), hier_index, ['A','B'])
df
A | B | ||
---|---|---|---|
G1 | 1 | 0.302665 | 1.693723 |
2 | -1.706086 | -1.159119 | |
3 | -0.134841 | 0.390528 | |
G2 | 1 | 0.166905 | 0.184502 |
2 | 0.807706 | 0.072960 | |
3 | 0.638787 | 0.329646 |
df.loc['G1']
A | B | |
---|---|---|
1 | 0.302665 | 1.693723 |
2 | -1.706086 | -1.159119 |
3 | -0.134841 | 0.390528 |
df.index.names = ['Grupos','Numeros']
df
A | B | ||
---|---|---|---|
Grupos | Numeros | ||
G1 | 1 | 0.302665 | 1.693723 |
2 | -1.706086 | -1.159119 | |
3 | -0.134841 | 0.390528 | |
G2 | 1 | 0.166905 | 0.184502 |
2 | 0.807706 | 0.072960 | |
3 | 0.638787 | 0.329646 |
df.loc['G2'].loc[2]['B']
0.07295967531703869
df.xs('G1')
A | B | |
---|---|---|
Numeros | ||
1 | 0.302665 | 1.693723 |
2 | -1.706086 | -1.159119 |
3 | -0.134841 | 0.390528 |
df.xs(1,level='Numeros') # Linhas com Numeros = 1
A | B | |
---|---|---|
Grupos | ||
G1 | 0.302665 | 1.693723 |
G2 | 0.166905 | 0.184502 |
dic = {'A':[1,2,np.nan],'B':[5,np.nan,np.nan],'C':[1,2,3]}
dados = pd.DataFrame(dic)
dados
A | B | C | |
---|---|---|---|
0 | 1.0 | 5.0 | 1 |
1 | 2.0 | NaN | 2 |
2 | NaN | NaN | 3 |
dados.dropna() # por padrão o eixo é 0, ou seja, linhas
A | B | C | |
---|---|---|---|
0 | 1.0 | 5.0 | 1 |
dados.dropna(axis=1)
C | |
---|---|
0 | 1 |
1 | 2 |
2 | 3 |
dados.dropna(thresh=2)
# necessário ter pelo menos 2 NaN para a linha ser removida
A | B | C | |
---|---|---|---|
0 | 1.0 | 5.0 | 1 |
1 | 2.0 | NaN | 2 |
dados.fillna(value='Nulo')
A | B | C | |
---|---|---|---|
0 | 1 | 5 | 1 |
1 | 2 | Nulo | 2 |
2 | Nulo | Nulo | 3 |
dados['A'].fillna(value=df['A'].mean())
0 1.000000 1 2.000000 2 0.012523 Name: A, dtype: float64
data = {'Company':['GOOG','GOOG','MSFT','MSFT','FB','FB'],
'Person':['Sam','Charlie','Amy','Vanessa','Carl','Sarah'],
'Sales':[200,120,340,124,243,350]}
df = pd.DataFrame(data)
df
Company | Person | Sales | |
---|---|---|---|
0 | GOOG | Sam | 200 |
1 | GOOG | Charlie | 120 |
2 | MSFT | Amy | 340 |
3 | MSFT | Vanessa | 124 |
4 | FB | Carl | 243 |
5 | FB | Sarah | 350 |
comp = df.groupby('Company')
comp.mean()
Sales | |
---|---|
Company | |
FB | 296.5 |
GOOG | 160.0 |
MSFT | 232.0 |
comp.sum()
Sales | |
---|---|
Company | |
FB | 593 |
GOOG | 320 |
MSFT | 464 |
comp.std()
Sales | |
---|---|
Company | |
FB | 75.660426 |
GOOG | 56.568542 |
MSFT | 152.735065 |
comp.sum().loc['FB']
Sales 593 Name: FB, dtype: int64
df.groupby('Company').count()
Person | Sales | |
---|---|---|
Company | ||
FB | 2 | 2 |
GOOG | 2 | 2 |
MSFT | 2 | 2 |
df.groupby('Company').max()
Person | Sales | |
---|---|---|
Company | ||
FB | Sarah | 350 |
GOOG | Sam | 200 |
MSFT | Vanessa | 340 |
df.groupby('Company').describe()
Sales | ||||||||
---|---|---|---|---|---|---|---|---|
count | mean | std | min | 25% | 50% | 75% | max | |
Company | ||||||||
FB | 2.0 | 296.5 | 75.660426 | 243.0 | 269.75 | 296.5 | 323.25 | 350.0 |
GOOG | 2.0 | 160.0 | 56.568542 | 120.0 | 140.00 | 160.0 | 180.00 | 200.0 |
MSFT | 2.0 | 232.0 | 152.735065 | 124.0 | 178.00 | 232.0 | 286.00 | 340.0 |
df.groupby('Company').describe().transpose()
Company | FB | GOOG | MSFT | |
---|---|---|---|---|
Sales | count | 2.000000 | 2.000000 | 2.000000 |
mean | 296.500000 | 160.000000 | 232.000000 | |
std | 75.660426 | 56.568542 | 152.735065 | |
min | 243.000000 | 120.000000 | 124.000000 | |
25% | 269.750000 | 140.000000 | 178.000000 | |
50% | 296.500000 | 160.000000 | 232.000000 | |
75% | 323.250000 | 180.000000 | 286.000000 | |
max | 350.000000 | 200.000000 | 340.000000 |
df1 = pd.DataFrame({'A':['A0','A1','A2','A3'],
'B':['B0','B1','B2','B3'],
'C':['C0','C1','C2','C3'],
'D':['D0','D1','D2','D3']},
index=[0,1,2,3])
df1
A | B | C | D | |
---|---|---|---|---|
0 | A0 | B0 | C0 | D0 |
1 | A1 | B1 | C1 | D1 |
2 | A2 | B2 | C2 | D2 |
3 | A3 | B3 | C3 | D3 |
df2 = pd.DataFrame({'A':['A4','A5','A6','A7'],
'B':['B4','B5','B6','B7'],
'C':['C4','C5','C6','C7'],
'D':['D4','D5','D6','D7']},
index=[4,5,6,7])
df2
A | B | C | D | |
---|---|---|---|---|
4 | A4 | B4 | C4 | D4 |
5 | A5 | B5 | C5 | D5 |
6 | A6 | B6 | C6 | D6 |
7 | A7 | B7 | C7 | D7 |
df3 = pd.DataFrame({'A':['A8','A9','A10','A11'],
'B':['B8','B9','B10','B11'],
'C':['C8','C9','C10','C11'],
'D':['D8','D9','D10','D11']},
index=[8,9,10,11])
df3
A | B | C | D | |
---|---|---|---|---|
8 | A8 | B8 | C8 | D8 |
9 | A9 | B9 | C9 | D9 |
10 | A10 | B10 | C10 | D10 |
11 | A11 | B11 | C11 | D11 |
Concatenção basicamente conecta os DataFrames. Tenha em mente que a dimensão deve corresponder ao longo do eixo que você está concatenando.
pd.concat([df1,df2,df3])
A | B | C | D | |
---|---|---|---|---|
0 | A0 | B0 | C0 | D0 |
1 | A1 | B1 | C1 | D1 |
2 | A2 | B2 | C2 | D2 |
3 | A3 | B3 | C3 | D3 |
4 | A4 | B4 | C4 | D4 |
5 | A5 | B5 | C5 | D5 |
6 | A6 | B6 | C6 | D6 |
7 | A7 | B7 | C7 | D7 |
8 | A8 | B8 | C8 | D8 |
9 | A9 | B9 | C9 | D9 |
10 | A10 | B10 | C10 | D10 |
11 | A11 | B11 | C11 | D11 |
pd.concat([df1,df2,df3], axis=1)
A | B | C | D | A | B | C | D | A | B | C | D | |
---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | A0 | B0 | C0 | D0 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
1 | A1 | B1 | C1 | D1 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
2 | A2 | B2 | C2 | D2 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
3 | A3 | B3 | C3 | D3 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
4 | NaN | NaN | NaN | NaN | A4 | B4 | C4 | D4 | NaN | NaN | NaN | NaN |
5 | NaN | NaN | NaN | NaN | A5 | B5 | C5 | D5 | NaN | NaN | NaN | NaN |
6 | NaN | NaN | NaN | NaN | A6 | B6 | C6 | D6 | NaN | NaN | NaN | NaN |
7 | NaN | NaN | NaN | NaN | A7 | B7 | C7 | D7 | NaN | NaN | NaN | NaN |
8 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | A8 | B8 | C8 | D8 |
9 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | A9 | B9 | C9 | D9 |
10 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | A10 | B10 | C10 | D10 |
11 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | A11 | B11 | C11 | D11 |
left = pd.DataFrame({'key': ['K0','K1','K2','K3'],
'A':['A0','A1','A2','A3'],
'B':['B0','B1','B2','B3']})
right = pd.DataFrame({'key': ['K0','K1','K2','K3'],
'C':['C0','C1','C2','C3'],
'D':['D0','D1','D2','D3']})
left
key | A | B | |
---|---|---|---|
0 | K0 | A0 | B0 |
1 | K1 | A1 | B1 |
2 | K2 | A2 | B2 |
3 | K3 | A3 | B3 |
right
key | C | D | |
---|---|---|---|
0 | K0 | C0 | D0 |
1 | K1 | C1 | D1 |
2 | K2 | C2 | D2 |
3 | K3 | C3 | D3 |
A função merge permite você unir DataFrames usando uma lógica similar ao SQL merging de tabelas. Exemplos:
pd.merge(left,right,how='inner',on='key')
key | A | B | C | D | |
---|---|---|---|---|---|
0 | K0 | A0 | B0 | C0 | D0 |
1 | K1 | A1 | B1 | C1 | D1 |
2 | K2 | A2 | B2 | C2 | D2 |
3 | K3 | A3 | B3 | C3 | D3 |
left = pd.DataFrame({'key1': ['K0','K0','K1','K2'],
'key2': ['K0','K1','K0','K1'],
'A':['A0','A1','A2','A3'],
'B':['B0','B1','B2','B3']})
right = pd.DataFrame({'key1': ['K0','K1','K1','K2'],
'key2': ['K0','K0','K0','K0'],
'A':['C0','C1','C2','C3'],
'B':['D0','D1','D2','D3']})
pd.merge(left,right,on=['key1','key2'])
key1 | key2 | A_x | B_x | A_y | B_y | |
---|---|---|---|---|---|---|
0 | K0 | K0 | A0 | B0 | C0 | D0 |
1 | K1 | K0 | A2 | B2 | C1 | D1 |
2 | K1 | K0 | A2 | B2 | C2 | D2 |
pd.merge(left,right,how='outer',on=['key1','key2'])
key1 | key2 | A_x | B_x | A_y | B_y | |
---|---|---|---|---|---|---|
0 | K0 | K0 | A0 | B0 | C0 | D0 |
1 | K0 | K1 | A1 | B1 | NaN | NaN |
2 | K1 | K0 | A2 | B2 | C1 | D1 |
3 | K1 | K0 | A2 | B2 | C2 | D2 |
4 | K2 | K1 | A3 | B3 | NaN | NaN |
5 | K2 | K0 | NaN | NaN | C3 | D3 |
pd.merge(left, right, how='right', on=['key1','key2'])
key1 | key2 | A_x | B_x | A_y | B_y | |
---|---|---|---|---|---|---|
0 | K0 | K0 | A0 | B0 | C0 | D0 |
1 | K1 | K0 | A2 | B2 | C1 | D1 |
2 | K1 | K0 | A2 | B2 | C2 | D2 |
3 | K2 | K0 | NaN | NaN | C3 | D3 |
pd.merge(left, right, how='left', on=['key1','key2'])
key1 | key2 | A_x | B_x | A_y | B_y | |
---|---|---|---|---|---|---|
0 | K0 | K0 | A0 | B0 | C0 | D0 |
1 | K0 | K1 | A1 | B1 | NaN | NaN |
2 | K1 | K0 | A2 | B2 | C1 | D1 |
3 | K1 | K0 | A2 | B2 | C2 | D2 |
4 | K2 | K1 | A3 | B3 | NaN | NaN |
Joining é um método conveniente para combinar as colunas em potencial dois DataFrames de índices diferentes em um único resultante DataFrame
left = pd.DataFrame({'A':['A0','A1','A2'],
'B':['B0','B1','B2']},
index=['K0','K1','K2'])
right = pd.DataFrame({'C':['C0','C1','C2'],
'D':['D0','D1','D2']},
index=['K0','K2','K3'])
left.join(right)
A | B | C | D | |
---|---|---|---|---|
K0 | A0 | B0 | C0 | D0 |
K1 | A1 | B1 | NaN | NaN |
K2 | A2 | B2 | C1 | D1 |
left.join(right,how='outer')
A | B | C | D | |
---|---|---|---|---|
K0 | A0 | B0 | C0 | D0 |
K1 | A1 | B1 | NaN | NaN |
K2 | A2 | B2 | C1 | D1 |
K3 | NaN | NaN | C2 | D2 |
df = pd.DataFrame({'col1':[1,2,3,4],
'col2':[444,555,666,444],
'col3':['abc','def','ghi','xyz']})
df.head()
col1 | col2 | col3 | |
---|---|---|---|
0 | 1 | 444 | abc |
1 | 2 | 555 | def |
2 | 3 | 666 | ghi |
3 | 4 | 444 | xyz |
df['col2'].unique()
array([444, 555, 666])
df['col2'].nunique()
3
df['col2'].value_counts()
444 2 555 1 666 1 Name: col2, dtype: int64
def times(x):
return x*2
df['col1'].apply(times) # Aplicando a função times
0 2 1 4 2 6 3 8 Name: col1, dtype: int64
df['col3'].apply(len)
0 3 1 3 2 3 3 3 Name: col3, dtype: int64
df['col2'].apply(lambda x: x*3)
0 1332 1 1665 2 1998 3 1332 Name: col2, dtype: int64
df.drop('col1',axis=1)
col2 | col3 | |
---|---|---|
0 | 444 | abc |
1 | 555 | def |
2 | 666 | ghi |
3 | 444 | xyz |
df.columns
Index(['col1', 'col2', 'col3'], dtype='object')
df.index
RangeIndex(start=0, stop=4, step=1)
df.sort_values(by='col2')
col1 | col2 | col3 | |
---|---|---|---|
0 | 1 | 444 | abc |
3 | 4 | 444 | xyz |
1 | 2 | 555 | def |
2 | 3 | 666 | ghi |
df.isnull()
col1 | col2 | col3 | |
---|---|---|---|
0 | False | False | False |
1 | False | False | False |
2 | False | False | False |
3 | False | False | False |
data = {'A':['foo','foo','foo','bar','bar','bar'],
'B':['one','one','two','two','one','one'],
'C':['x','y','x','y','x','y'],
'D':[1,3,2,5,4,1]}
df = pd.DataFrame(data)
df
A | B | C | D | |
---|---|---|---|---|
0 | foo | one | x | 1 |
1 | foo | one | y | 3 |
2 | foo | two | x | 2 |
3 | bar | two | y | 5 |
4 | bar | one | x | 4 |
5 | bar | one | y | 1 |
df.pivot_table(values='D', index=['A','B'], columns='C')
C | x | y | |
---|---|---|---|
A | B | ||
bar | one | 4.0 | 1.0 |
two | NaN | 5.0 | |
foo | one | 1.0 | 3.0 |
two | 2.0 | NaN |
pwd
'/home/talantyr/Documentos/Projetos/Python Data Science'
pd.read_csv('example')
a | b | c | d | |
---|---|---|---|---|
0 | 0 | 1 | 2 | 3 |
1 | 4 | 5 | 6 | 7 |
2 | 8 | 9 | 10 | 11 |
3 | 12 | 13 | 14 | 15 |
df = pd.read_csv('example')
df
a | b | c | d | |
---|---|---|---|---|
0 | 0 | 1 | 2 | 3 |
1 | 4 | 5 | 6 | 7 |
2 | 8 | 9 | 10 | 11 |
3 | 12 | 13 | 14 | 15 |
df.to_csv('new_example',index=False)
pd.read_csv('new_example')
a | b | c | d | |
---|---|---|---|---|
0 | 0 | 1 | 2 | 3 |
1 | 4 | 5 | 6 | 7 |
2 | 8 | 9 | 10 | 11 |
3 | 12 | 13 | 14 | 15 |
pd.read_excel('Excel_Sample.xlsx',sheet_name='Sheet1')
a | b | c | d | |
---|---|---|---|---|
0 | 0 | 1 | 2 | 3 |
1 | 4 | 5 | 6 | 7 |
2 | 8 | 9 | 10 | 11 |
3 | 12 | 13 | 14 | 15 |
df.to_excel('New_Excel_Sample.xlsx',sheet_name='NewSheet')
data = pd.read_html('https://www.fdic.gov/bank/individual/failed/banklist.html')
type(data)
list
data[0].head()
Bank Name | City | ST | CERT | Acquiring Institution | Closing Date | Updated Date | |
---|---|---|---|---|---|---|---|
0 | The Enloe State Bank | Cooper | TX | 10716 | Legend Bank, N. A. | May 31, 2019 | June 18, 2019 |
1 | Washington Federal Bank for Savings | Chicago | IL | 30570 | Royal Savings Bank | December 15, 2017 | February 1, 2019 |
2 | The Farmers and Merchants State Bank of Argonia | Argonia | KS | 17719 | Conway Bank | October 13, 2017 | February 21, 2018 |
3 | Fayette County Bank | Saint Elmo | IL | 1802 | United Fidelity Bank, fsb | May 26, 2017 | January 29, 2019 |
4 | Guaranty Bank, (d/b/a BestBank in Georgia & Mi... | Milwaukee | WI | 30003 | First-Citizens Bank & Trust Company | May 5, 2017 | March 22, 2018 |
from sqlalchemy import create_engine
engine = create_engine('sqlite:///:memory:') # Banco de dados em memória
df.to_sql('tabela',engine)
sqldf = pd.read_sql('tabela', con=engine)
sqldf
index | a | b | c | d | |
---|---|---|---|---|---|
0 | 0 | 0 | 1 | 2 | 3 |
1 | 1 | 4 | 5 | 6 | 7 |
2 | 2 | 8 | 9 | 10 | 11 |
3 | 3 | 12 | 13 | 14 | 15 |