#import bread & butter
import itertools
import pandas as pd
import requests
from bs4 import BeautifulSoup
def _convert_or_keep(column, converter):
    '''
    Apply converter to a column, falling back to the untouched column.
    Params: column -> pandas Series to convert
            converter -> callable taking the Series and returning the converted Series
    Output: converted Series, or the original Series if converter raises
            ValueError/TypeError (same outcome the deprecated errors='ignore' gave)
    '''
    try:
        return converter(column)
    except (ValueError, TypeError):
        return column


def get_data(roleName, timeout=30):
    '''
    Get H1B info for a given role name
    Params: roleName -> Role to retrieve the data for
            timeout -> seconds to wait for the HTTP response (default 30)
    Output: Dataframe in format ['Employer', 'Title', 'Salary', 'Location', 'Start Date'];
            an empty Dataframe with those columns when the page has no table
            or no row whose status is 'CERTIFIED'
    Raises: requests.HTTPError when the server answers with an error status
    '''
    # Imported locally so the file's top-level import block stays untouched.
    from urllib.parse import quote_plus

    columns = ['Employer', 'Title', 'Salary', 'Location', 'Start Date']

    # build url from role name; quote_plus turns spaces into '+' (as before)
    # and also escapes characters such as '&' or '#' that would break the query
    url = 'https://h1bdata.info/index.php?em=&job=' + quote_plus(roleName) + '&city=&year=All+Years'
    print(url)

    # make soup; the context manager releases the connection even on errors
    with requests.get(url, timeout=timeout) as page:
        page.raise_for_status()
        soup = BeautifulSoup(page.text, 'html.parser')

    tables = soup.find_all('table')
    if not tables:
        return pd.DataFrame(columns=columns)

    # find relevant info: the same iterator repeated 7 times makes zip_longest
    # consume the table's text fragments in consecutive groups of 7.
    # NOTE(review): this assumes every row contributes exactly 7 non-empty
    # text fragments -- confirm against the live page markup.
    args = [iter(tables[0].stripped_strings)] * 7
    # A trailing incomplete group is padded with None and is dropped here,
    # as is any group whose 7th field is not 'CERTIFIED'.
    data_list = [x for x in itertools.zip_longest(*args) if x[6] == 'CERTIFIED']
    if not data_list:
        return pd.DataFrame(columns=columns)

    # Fields 4 and 6 (6 being the status just filtered on) are not part of the output.
    data = pd.DataFrame(data_list).drop(columns=[4, 6])
    data.columns = columns

    # Strip ',' so values like '95,000' (if the site formats salaries that way)
    # can become numbers; on failure the column is left exactly as scraped.
    data['Salary'] = _convert_or_keep(
        data['Salary'],
        lambda col: pd.to_numeric(col.astype(str).str.replace(',', '', regex=False)),
    )
    data['Start Date'] = _convert_or_keep(data['Start Date'], pd.to_datetime)
    return data
# Search terms to query: variations of 'Data Scientist' plus machine-learning titles
roleName_List = [
    'Data Scie', 'Associate Data Scie', 'Junior Data Scie', 'Senior Data Scie',
    'Sr Data Scien', 'Sr. Data Scien', 'Lead Data Scien', 'Principal Data Scien',
    'Jr Data Scien', 'Jr. Data Scien', 'Graduate Data Scien',
    'ML', 'Machine lea', 'Applied Data Scientist',
]

# Collect one Dataframe per role and concatenate once at the end, rather than
# re-copying the growing result on every iteration.
role_frames = []
for role in roleName_List:
    print(role)
    role_frames.append(get_data(role))

# Each iteration prints the role followed by the URL that was queried, e.g.:
#   Data Scie
#   https://h1bdata.info/index.php?em=&job=Data+Scie&city=&year=All+Years
#   Associate Data Scie
#   https://h1bdata.info/index.php?em=&job=Associate+Data+Scie&city=&year=All+Years
#   ...

# Result collector Dataframe; pd.concat rejects an empty list, so guard it.
final_df = pd.concat(role_frames, ignore_index=True) if role_frames else pd.DataFrame()

# File name (including its spelling) is kept as-is so existing consumers still find it.
final_df.to_csv('FINAL_ALL_H1B_Certified_Data_Scienitist.csv', index=False)