#!/usr/bin/env python
# coding: utf-8

# Notebook export demonstrating the difference between pd.merge and pd.concat
# on two small frames, followed by extracting year values with a regex.
# Bare expressions (e.g. ``sides``) only display their value when run as
# notebook cells; as a plain script they are evaluated and discarded.

# In[57]:

import pandas as pd
import numpy as np


# In[58]:

sides = pd.DataFrame(
    data={
        "shape": ["triangle", "square", "rectangle", "pentagon", "hexagon"],
        "sides": [3, 4, 4, 5, 6],
    }
)
sides


# In[59]:

# Here I have interchanged the pentagon and hexagon positions.
angles = pd.DataFrame(
    data={
        "shape": ["triangle", "square", "rectangle", "hexagon", "pentagon"],
        "angles_in_degrees": [180, 360, 360, 720, 540],
    }
)
angles


# In[60]:

# Simple merge - I allow pandas to decide what works best here. It identifies
# the shared "shape" column and merges the angles for pentagon and hexagon
# correctly.
pd.merge(left=sides, right=angles)


# In[61]:

# As present in your notebook:
#     merged = pd.merge(left = combined_service_cat, right = years,
#                       left_index = True, right_index = True)
# Here we get a different result because we asked to merge on both indexes.
# The values at index 3 and 4 differ between the two dataframes, so the rows
# are paired by position rather than by shape name.
pd.merge(left=sides, right=angles, left_index=True, right_index=True)


# In[62]:

# Simple concat - I didn't assign the 'axis' parameter a value, so the 2nd
# dataframe is stacked below the 1st dataframe.
# Please note: this is what the solution for this project actually wants!
# DETE below TAFE or vice-versa.
pd.concat([sides, angles])


# In[63]:

# As present in your notebook:
#     combined_service_cat = pd.concat([combined_service_cat, years], axis = 1)
# With 'axis' set to 1 the 2nd dataframe is placed next to the 1st one.
# However, the 'shape' column name and values are now duplicated.
pd.concat([sides, angles], axis=1)


# In[64]:

# np.nan (lowercase) is used because the np.NaN alias was removed in NumPy 2.0.
dummy_years = pd.DataFrame({"years": ["1-2", 1, 5, 10, "11-12", np.nan]})
dummy_years


# In[65]:

# The named groups become the column names of the extractall() result.
# NOTE(review): the group names were missing from the collapsed source;
# "Second_Year" is required by the drop() call below, "First_Year" is
# reconstructed by analogy - confirm against the original notebook.
pattern = r"(?P<First_Year>[0-9][0-9]?)-?(?P<Second_Year>[1-9][0-9]?)?"

# I added astype(str) to your code so that it is now possible to extract all
# years, including the ones stored as integers.
dummy_extract = dummy_years["years"].astype(str).str.extractall(pattern)
dummy_extract


# In[66]:

# Move the "match" level out of the MultiIndex so the remaining index lines up
# with the row labels of dummy_years again.
dummy_new = dummy_extract.reset_index("match")
dummy_new


# In[67]:

# Now we suppose we only want the First_Year column, although in my project I
# calculated a new column as the arithmetic mean of the other two. That's why
# I extracted all years in my project.
dummy_new = dummy_new.drop(["match", "Second_Year"], axis=1)
dummy_new


# In[68]:

# Index-on-index merge: rows without any extracted year (the NaN entry) have
# no counterpart in dummy_new and are therefore absent from the result.
merge = pd.merge(
    left=dummy_years, right=dummy_new, left_index=True, right_index=True
)
merge