#!/usr/bin/env python
# coding: utf-8
"""Collect certified H1B filings for data-science job titles from h1bdata.info.

For every job-title prefix in ``roleName_List`` the script downloads the
matching result page, keeps the rows whose case status is ``CERTIFIED``,
and writes the combined rows to a single CSV file.
"""

import itertools
from urllib.parse import urlencode

import pandas as pd
import requests
from bs4 import BeautifulSoup

_BASE_URL = 'https://h1bdata.info/index.php'

# Columns kept in the returned/exported DataFrame.
_COLUMNS = ['Employer', 'Title', 'Salary', 'Location', 'Start Date']

# The result table is read as a flat stream of cell strings and re-chunked
# into rows of this many cells:
# employer, title, salary, location, <dropped>, start date, case status.
_CELLS_PER_ROW = 7

# Seconds to wait for the server before giving up instead of hanging forever.
_REQUEST_TIMEOUT = 30

# NOTE: the misspelling in the file name is kept on purpose so that anything
# already consuming this file keeps working.
_OUTPUT_CSV = 'FINAL_ALL_H1B_Certified_Data_Scienitist.csv'

# Variations of 'Data Scientist'.
# NOTE(review): these look like prefix searches and some prefixes overlap
# (e.g. 'Data Scie' vs 'Applied Data Scientist'), so the combined output may
# contain duplicate rows -- confirm whether de-duplication is wanted.
roleName_List = ['Data Scie', 'Associate Data Scie', 'Junior Data Scie',
                 'Senior Data Scie', 'Sr Data Scien', 'Sr. Data Scien',
                 'Lead Data Scien', 'Principal Data Scien', 'Jr Data Scien',
                 'Jr. Data Scien', 'Graduate Data Scien',
                 'ML', 'Machine lea', 'Applied Data Scientist']


def _convert_or_keep(column, converter):
    """Return ``converter(column)``, or ``column`` unchanged if parsing fails.

    This reproduces the all-or-nothing behaviour of the deprecated
    ``errors='ignore'`` option of ``pd.to_numeric`` / ``pd.to_datetime``.
    """
    try:
        return converter(column)
    except (ValueError, TypeError):
        return column


def _salary_to_number(column):
    """Parse salary strings as numbers, tolerating thousands separators."""
    # Presumably the site renders salaries like '120,000'; stripping commas
    # is a no-op when there are none.
    return pd.to_numeric(column.str.replace(',', '', regex=False))


def get_data(roleName):
    """Get certified H1B filings for a given role name.

    Params:
        roleName -> Role (job-title search text) to retrieve the data for.

    Output:
        DataFrame with columns
        ['Employer', 'Title', 'Salary', 'Location', 'Start Date'].
        'Salary' is numeric and 'Start Date' is datetime when every value in
        the column parses; otherwise that column is left as strings.
        An empty DataFrame with the same columns is returned when the page
        has no table or no CERTIFIED rows.

    Raises:
        requests.RequestException: on timeout, connection failure or a
            non-2xx HTTP status.
    """
    # Build the URL with proper escaping so that role names containing
    # characters such as '&' or '#' cannot corrupt the query string.
    query = urlencode({'em': '', 'job': roleName,
                       'city': '', 'year': 'All Years'})
    url = _BASE_URL + '?' + query
    print(url)

    # Make soup; the context manager releases the connection even on error.
    with requests.get(url, timeout=_REQUEST_TIMEOUT) as page:
        page.raise_for_status()
        soup = BeautifulSoup(page.text, 'html.parser')

    table = soup.find('table')
    if table is None:
        return pd.DataFrame(columns=_COLUMNS)

    # Re-chunk the flat stream of non-empty cell strings into rows by
    # zipping the *same* iterator with itself.
    # NOTE(review): this assumes every cell holds exactly one non-empty
    # string; an empty cell would shift the following values -- confirm
    # against the live page.
    cell_stream = [iter(table.stripped_strings)] * _CELLS_PER_ROW
    rows = [
        (employer, title, salary, location, start_date)
        for (employer, title, salary, location,
             _dropped, start_date, status)
        in itertools.zip_longest(*cell_stream)
        # Also discards the header row and any short trailing chunk.
        if status == 'CERTIFIED'
    ]

    data = pd.DataFrame(rows, columns=_COLUMNS)
    if data.empty:
        return data

    data['Salary'] = _convert_or_keep(data['Salary'], _salary_to_number)
    data['Start Date'] = _convert_or_keep(data['Start Date'], pd.to_datetime)
    return data


def main():
    """Scrape every role in ``roleName_List`` and write the combined CSV.

    Returns the combined DataFrame that was written to disk.
    """
    frames = []
    for role in roleName_List:
        print(role)
        data = get_data(role)
        # Skip empty results so they cannot affect the dtypes of the result.
        if not data.empty:
            frames.append(data)

    # Concatenate once instead of re-copying the accumulated frame per role.
    if frames:
        combined = pd.concat(frames, ignore_index=True)
    else:
        combined = pd.DataFrame(columns=_COLUMNS)

    combined.to_csv(_OUTPUT_CSV, index=False)
    return combined


if __name__ == '__main__':
    final_df = main()