import pandas as pd
import numpy as np
sides = pd.DataFrame(data = {"shape" : ["triangle", "square", "rectangle", "pentagon", "hexagon"],
"sides" : [3, 4, 4, 5, 6]})
sides
shape | sides | |
---|---|---|
0 | triangle | 3 |
1 | square | 4 |
2 | rectangle | 4 |
3 | pentagon | 5 |
4 | hexagon | 6 |
# here I have interchanged the pentagon and hexagon positions
angles = pd.DataFrame(data = {"shape" : ["triangle", "square", "rectangle", "hexagon", "pentagon"],
"angles_in_degrees" : [180, 360, 360, 720, 540]})
angles
angles_in_degrees | shape | |
---|---|---|
0 | 180 | triangle |
1 | 360 | square |
2 | 360 | rectangle |
3 | 720 | hexagon |
4 | 540 | pentagon |
"""simple merge - I allow pandas to decide what works best here - It identifies the shape name and merges
angles for pentagon and hexagon correctly"""
pd.merge(left = sides, right = angles)
shape | sides | angles_in_degrees | |
---|---|---|---|
0 | triangle | 3 | 180 |
1 | square | 4 | 360 |
2 | rectangle | 4 | 360 |
3 | pentagon | 5 | 540 |
4 | hexagon | 6 | 720 |
"""as present in your notebook:
merged = pd.merge(left = combined_service_cat, right = years, left_index = True, right_index = True)
And we get a different results since we said merge on both indexes. Since the values are different for index 3 and 4 in both
dataframes we get a different result"""
pd.merge(left = sides, right = angles, left_index = True, right_index = True)
shape_x | sides | angles_in_degrees | shape_y | |
---|---|---|---|---|
0 | triangle | 3 | 180 | triangle |
1 | square | 4 | 360 | square |
2 | rectangle | 4 | 360 | rectangle |
3 | pentagon | 5 | 720 | hexagon |
4 | hexagon | 6 | 540 | pentagon |
"""simple concat - I didnt assign 'axis' parameter a value - so this results in 2nd dataframe stacked below the 1st dataframe
Please Note: This is what the solution for this project actually wants! DETE below TAFE or vice-versa"""
pd.concat([sides, angles])
angles_in_degrees | shape | sides | |
---|---|---|---|
0 | NaN | triangle | 3.0 |
1 | NaN | square | 4.0 |
2 | NaN | rectangle | 4.0 |
3 | NaN | pentagon | 5.0 |
4 | NaN | hexagon | 6.0 |
0 | 180.0 | triangle | NaN |
1 | 360.0 | square | NaN |
2 | 360.0 | rectangle | NaN |
3 | 720.0 | hexagon | NaN |
4 | 540.0 | pentagon | NaN |
"""as present in your notebook:
combined_service_cat = pd.concat([combined_service_cat, years], axis = 1)
I assign 'axis' parameter a value - so the results 2nd dataframe is now next to the 1st one.
However the column name and values are now duplicated for 'shape'
"""
pd.concat([sides, angles], axis = 1)
shape | sides | angles_in_degrees | shape | |
---|---|---|---|---|
0 | triangle | 3 | 180 | triangle |
1 | square | 4 | 360 | square |
2 | rectangle | 4 | 360 | rectangle |
3 | pentagon | 5 | 720 | hexagon |
4 | hexagon | 6 | 540 | pentagon |
dummy_years = pd.DataFrame({"years": ["1-2", 1, 5, 10, "11-12", np.NaN]})
dummy_years
years | |
---|---|
0 | 1-2 |
1 | 1 |
2 | 5 |
3 | 10 |
4 | 11-12 |
5 | NaN |
pattern = r"(?P<First_Year>[0-9][0-9]?)-?(?P<Second_Year>[1-9][0-9]?)?"
# I added transform astype(str) to your code so now it's posible extract all years
dummy_extract = dummy_years["years"].astype(str).str.extractall(pattern)
dummy_extract
First_Year | Second_Year | ||
---|---|---|---|
match | |||
0 | 0 | 1 | 2 |
1 | 0 | 1 | NaN |
2 | 0 | 5 | NaN |
3 | 0 | 10 | NaN |
4 | 0 | 11 | 12 |
dummy_new = dummy_extract.reset_index("match")
dummy_new
match | First_Year | Second_Year | |
---|---|---|---|
0 | 0 | 1 | 2 |
1 | 0 | 1 | NaN |
2 | 0 | 5 | NaN |
3 | 0 | 10 | NaN |
4 | 0 | 11 | 12 |
# Now we suppose we only want the first_year column, although in my proyect I calculated a new column as
# an arithmetic mean of the other two. That's why I extracted all years in my proyect.
dummy_new = dummy_new.drop(["match", "Second_Year"], axis = 1)
dummy_new
First_Year | |
---|---|
0 | 1 |
1 | 1 |
2 | 5 |
3 | 10 |
4 | 11 |
merge = pd.merge(left = dummy_years , right = dummy_new, left_index = True, right_index = True)
merge
years | First_Year | |
---|---|---|
0 | 1-2 | 1 |
1 | 1 | 1 |
2 | 5 | 5 |
3 | 10 | 10 |
4 | 11-12 | 11 |