In [10]:
#import bread & butter
import itertools
from urllib.parse import quote_plus

import pandas as pd
import requests
from bs4 import BeautifulSoup
In [11]:
def get_data(roleName):
    '''
    Get H1B info for a given rolename

    Downloads the h1bdata.info search results for the role, reads the first
    table on the page and keeps only the rows whose last cell is 'CERTIFIED'.

    Params: roleName -> Role to retrieve the data for (URL-encoded before use)
    Output: Dataframe in format ['Employer', 'Title', 'Salary', 'Location', 'Start Date'].
            'Salary' is converted to a number and 'Start Date' to a datetime;
            values that cannot be parsed become NaN / NaT. When the page has no
            table or no certified rows, an empty Dataframe with the same
            columns is returned.
    Raises: requests.HTTPError when the site answers with an error status.
    '''

    columns = ['Employer', 'Title', 'Salary', 'Location', 'Start Date']

    #build url from role name; quote_plus turns spaces into '+' and escapes
    #characters such as '&' or '#' that would otherwise break the query string
    url='https://h1bdata.info/index.php?em=&job='+quote_plus(roleName)+'&city=&year=All+Years'
    print(url)

    #make soup (timeout so a stalled connection cannot hang the notebook)
    page = requests.get(url, timeout=30)
    try:
        page.raise_for_status()
        soup = BeautifulSoup(page.text, 'html.parser')
    finally:
        page.close()

    tables = soup.find_all('table')

    #find relevant info
    #Rows are read cell by cell so an empty cell cannot shift later values into
    #the wrong column. Each data row is expected to have 7 cells:
    #0 employer, 1 title, 2 salary, 3 location, 4 (dropped - presumably the
    #submit date, confirm against the site), 5 start date, 6 case status.
    records = []
    if tables:
        for row in tables[0].find_all('tr'):
            cells = [' '.join(cell.stripped_strings) for cell in row.find_all('td')]
            if len(cells) == 7 and cells[6] == 'CERTIFIED':
                records.append([cells[0], cells[1], cells[2], cells[3], cells[5]])

    data = pd.DataFrame(records, columns=columns)
    if data.empty:
        return data

    #strip currency formatting ('$', thousands separators) before converting
    data['Salary']=pd.to_numeric(data['Salary'].str.replace(r'[$,]', '', regex=True),errors='coerce')
    data['Start Date']=pd.to_datetime(data['Start Date'],errors='coerce')

    return data
In [12]:
#Variations of 'Data Scientist'
roleName_List=['Data Scie','Associate Data Scie','Junior Data Scie','Senior Data Scie','Sr Data Scien','Sr. Data Scien',
               'Lead Data Scien','Principal Data Scien','Jr Data Scien','Jr. Data Scien','Graduate Data Scien',
               'ML','Machine lea','Applied Data Scientist']

#Scrape every variation first and keep the per-role frames in a list, so the
#frames are concatenated once instead of re-copying the growing result on
#every iteration.
#NOTE(review): the search terms may return overlapping rows; nothing is
#de-duplicated here - confirm whether that is intended.
role_frames=[]
for role in roleName_List:

    print(role)

    role_frames.append(get_data(role))

#Result collector Dataframe; ignore_index gives the combined rows one
#continuous index instead of repeating 0..n for each role
final_df=pd.concat(role_frames,ignore_index=True) if role_frames else pd.DataFrame()
    
Data Scie
https://h1bdata.info/index.php?em=&job=Data+Scie&city=&year=All+Years
Associate Data Scie
https://h1bdata.info/index.php?em=&job=Associate+Data+Scie&city=&year=All+Years
Junior Data Scie
https://h1bdata.info/index.php?em=&job=Junior+Data+Scie&city=&year=All+Years
Senior Data Scie
https://h1bdata.info/index.php?em=&job=Senior+Data+Scie&city=&year=All+Years
Sr Data Scien
https://h1bdata.info/index.php?em=&job=Sr+Data+Scien&city=&year=All+Years
Sr. Data Scien
https://h1bdata.info/index.php?em=&job=Sr.+Data+Scien&city=&year=All+Years
Lead Data Scien
https://h1bdata.info/index.php?em=&job=Lead+Data+Scien&city=&year=All+Years
Principal Data Scien
https://h1bdata.info/index.php?em=&job=Principal+Data+Scien&city=&year=All+Years
Jr Data Scien
https://h1bdata.info/index.php?em=&job=Jr+Data+Scien&city=&year=All+Years
Jr. Data Scien
https://h1bdata.info/index.php?em=&job=Jr.+Data+Scien&city=&year=All+Years
Graduate Data Scien
https://h1bdata.info/index.php?em=&job=Graduate+Data+Scien&city=&year=All+Years
ML
https://h1bdata.info/index.php?em=&job=ML&city=&year=All+Years
Machine lea
https://h1bdata.info/index.php?em=&job=Machine+lea&city=&year=All+Years
Applied Data Scientist
https://h1bdata.info/index.php?em=&job=Applied+Data+Scientist&city=&year=All+Years
In [13]:
#Write the combined result to disk without the index column.
#The file name (including its spelling) is kept exactly as-is because other
#code may read it by this name.
final_df.to_csv(
    path_or_buf='FINAL_ALL_H1B_Certified_Data_Scienitist.csv',
    index=False,
)
In [ ]: