In [1]:
import itertools
import networkx as nx
import pandas as pd
import matplotlib.pyplot as plt
import re
import sys
import locale
import numpy as np
# import geopy
import plotly
import math
import urllib2
import pickle
import IPython.core.display as di
from time import sleep
from nltk.corpus import stopwords
from collections import Counter
from scipy import stats, integrate
# from geopy.geocoders import Nominatim
from bs4 import BeautifulSoup
from urllib2 import urlopen
from plotly.graph_objs import *  # Scatter, Layout, Bar, Figure, etc.
import igraph as ig

# di.display_html('<script>jQuery(function() {if (jQuery("body.notebook_app").length == 0) { jQuery(".input_area").toggle(); jQuery(".prompt").toggle();}});</script>', raw=True)
# di.display_html('''<button onclick="jQuery('.input_area').toggle(); jQuery('.prompt').toggle();">Toggle code</button>''', raw=True)

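# Load cached scrape results from disk; the paired dump calls show how each was saved.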
city_data = pickle.load(open("city_data.p", "rb"))
## pickle.dump(city_data, open("city_data.p", "wb"))

df = pickle.load(open("df.p", "rb"))
## pickle.dump(df, open("df.p", "wb"))

python_query = pickle.load(open("python_query.p", "rb"))
## pickle.dump(python_query, open("python_query.p", "wb"))

r_query = pickle.load(open("r_query.p", "rb"))
## pickle.dump(r_query, open("r_query.p", "wb"))

data_frames = pickle.load(open("data_frames.p", "rb"))
## pickle.dump(data_frames, open("data_frames.p", "wb"))


plotly.offline.init_notebook_mode()
%matplotlib inline


def get_states_from_lat_lon(df):
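    """Reverse-geocode each row's lat/lon with Nominatim and return the state
    field of the resulting address for every row ('' when the lookup fails)."""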
    geolocator = Nominatim()  # requires the geopy imports commented out at the top
    r_geo = lambda x:  geolocator.reverse(str(float(x['lat']))+","+str(float(x['lon'])))
    geo_loc = df.apply(r_geo, axis=1)
    states = []
    for i in range(len(geo_loc)):
        try:
            states.append((str(geo_loc[i]).split(',')[-3]).strip())
        except:
            states.append('') # if something goes wrong
    return states


def get_avg_salary(job="\"Data+Scientist\"", loc=["NY","NY"], radius=0, verbose=False):
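    """Estimate the mean advertised salary for a job/location from Indeed's
    salary-facet links ("$X,000+ (n)").

    Returns (avg_salary, num_jobs); both are 0 when the query has no direct
    matches in the requested radius."""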
    locale.setlocale(locale.LC_ALL, 'us')  # Windows-style locale name; use 'en_US.UTF-8' on Unix
    job = job.lower()
    loc = [i.lower() for i in loc]
    loc = '%2C+'.join(loc)
    url = 'http://www.indeed.com/jobs?q=%s&l=%s&radius=%d' % (job, loc, radius)
    soup = BeautifulSoup(urlopen(url).read(),"lxml")

    links = soup.find_all('a', title=re.compile('\+ \('))
    salaries, counts = [], []
    salary_regex = '(\d+,000)\+ \((\d+)\)'
    for a in links:
        title = a.get('title').encode('utf-8')
        results = re.search(salary_regex, title).groups()
        salaries.append( int(results[0].replace(',','')) )
        counts.append( int(results[1]) )

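    # A "Did you mean" suggestion or an expanded-radius notice means Indeed
    # found no direct matches, so the query is treated as empty below.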
    d_y_m = soup.find_all('span', style="", string=re.compile('Did you mean'))
    o_r_r = soup.find_all('div', id=re.compile('original_radius_result'))
    
    if (len(counts)==0) or (len(o_r_r) > 0) or (len(d_y_m) > 0):
        num_jobs=0
        avg_salary=0
    else:
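        # Indeed's facet counts are cumulative ("$X+ (n)"), so consecutive
        # differences give per-bucket job counts; each job contributes its
        # bucket's lower (left) and upper (right) salary bound.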
        left = []
        right = []
        jobs = []
        for i in range(len(counts)):
            if i == len(counts) - 1:
                jobs.append(counts[i])
                for ii in range(counts[i]):
                    left.append(salaries[i])
                    right.append(salaries[i])
            else:
                jobs.append(counts[i]-counts[i+1])
                for ii in range(counts[i]-counts[i+1]):
                    left.append(salaries[i])
                    right.append(salaries[i+1])

        avg_salary = np.mean([np.mean(left),np.mean(right)])
        
        num_jobs_area = soup.find(id = 'searchCount').string.encode('utf-8')
        job_numbers = re.findall('\d+', num_jobs_area)

        if len(job_numbers) > 3: 
            num_jobs = (int(job_numbers[2])*1000) + int(job_numbers[3])
        else:
            num_jobs = int(job_numbers[2]) 
        
        if verbose:
            print 'Location: %s' % ", ".join(loc.split("%2C+"))
            print 'Job: %s' % " ".join(job.split("+"))
            print 'Salaries:'
            # pair each bucket's job count with that bucket's salary bounds
            uppers = salaries[1:] + [salaries[-1]]
            for j, l, r in zip(jobs, salaries, uppers):
                print "%s jobs: $%d-$%d" % (j, l, r)
            print ''
            print "%d jobs in %s with a mean salary of: $%d" % (num_jobs,",".join(loc.split("%2C+")),avg_salary)

    return avg_salary, num_jobs


def text_cleaner(website):
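    """Fetch a job posting, strip <script>/<style> content, and return the
    deduplicated list of lowercase words with English stopwords removed.

    Returns None if the page cannot be fetched or decoded."""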
    try:
        site = urllib2.urlopen(website).read() # Connect to the job posting
    except: 
        return
    
    soup_obj = BeautifulSoup(site,"lxml")
    
    if len(soup_obj) == 0:
        soup_obj = BeautifulSoup(site, 'html5lib')
    
    for script in soup_obj(["script", "style"]):
        script.extract()

    text = soup_obj.get_text()
    lines = (line.strip() for line in text.splitlines())
    chunks = (phrase.strip() for line in lines for phrase in line.split("  "))
    text = ''.join(chunk for chunk in chunks if chunk).encode('utf-8')

    try:
        text = text.decode('unicode_escape').encode('ascii', 'ignore')
    except:
        return
   
    text = re.sub("[^a-zA-Z+3]"," ", text)
    text = re.sub(r"([a-z])([A-Z])", r"\1 \2", text)
    text = text.lower().split()
    stop_words = set(stopwords.words("english"))
    text = [w for w in text if not w in stop_words]
    text = list(set(text))
    
    return text


def skills_info(city = None, state = None, verbose=False):
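    """Scrape Indeed postings for "data scientist" (optionally limited to a
    city/state), tally which languages/tools each posting mentions, and return
    a plotly bar figure plus a DataFrame of terms and the percentage of
    postings mentioning them."""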
    final_job = 'data+scientist' 
    
    if city is not None:
        final_city = city.split() 
        final_city = '+'.join(word for word in final_city)
        final_state = state.split() 
        final_state = '+'.join(word for word in final_state)
        final_site_list = ['http://www.indeed.com/jobs?q=%22', final_job, '%22&l=', final_city,
                   '%2C+', final_state] 
    else:
        final_site_list = ['http://www.indeed.com/jobs?q="', final_job, '"']

    final_site = ''.join(final_site_list)
    base_url = 'http://www.indeed.com'

    try:
        html = urlopen(final_site).read()
    except:
        print 'That city/state combination did not have any jobs. Exiting . . .'
        return
    
    soup = BeautifulSoup(html, "lxml")
    num_jobs_area = soup.find(id = 'searchCount').string.encode('utf-8') 
    job_numbers = re.findall('\d+', num_jobs_area) 

    if len(job_numbers) > 3:
        total_num_jobs = (int(job_numbers[2])*1000) + int(job_numbers[3])
    else:
        total_num_jobs = int(job_numbers[2]) 

    city_title = city
    if city is None:
        city_title = 'Nationwide'
    
    if verbose:
        print 'There were', total_num_jobs, 'jobs found,', city_title

    num_pages = total_num_jobs/10
    job_descriptions = [] 

    for i in xrange(1,num_pages+1): 
        if verbose:
            print 'Getting page', i
        start_num = str(i*10)
        current_page = ''.join([final_site, '&start=', start_num])
        
        html_page = urllib2.urlopen(current_page).read()
        page_obj = BeautifulSoup(html_page, "lxml")
        job_link_area = page_obj.find(id = 'resultsCol') 
        job_URLS = [base_url + link.get('href') for link in job_link_area.find_all('a')]
        job_URLS = filter(lambda x:'clk' in x, job_URLS)
        
        for j in xrange(0,len(job_URLS)):
            final_description = text_cleaner(job_URLS[j])
            if final_description:
                job_descriptions.append(final_description)
            sleep(1)
            
    if verbose:
        print 'Done with collecting the job postings!'    
        print 'There were', len(job_descriptions), 'jobs successfully found.'

    doc_frequency = Counter()
    for item in job_descriptions:
        doc_frequency.update(item)

    prog_lang_dict = Counter({'R':doc_frequency['r'], 'Python':doc_frequency['python'],
                    'Java':doc_frequency['java'], 'C++':doc_frequency['c++'], 
                   'Ruby':doc_frequency['ruby'], 'Julia':doc_frequency['julia'],
                  'Perl':doc_frequency['perl'], 'Matlab':doc_frequency['matlab'],
                  'JavaScript':doc_frequency['javascript'], 'Scala': doc_frequency['scala']})
                  
    analysis_tool_dict = Counter({'Excel':doc_frequency['excel'],  'Tableau':doc_frequency['tableau'],
                      'D3.js':doc_frequency['d3.js'], 'SAS':doc_frequency['sas'],
                      'SPSS':doc_frequency['spss'], 'D3':doc_frequency['d3']})  

    hadoop_dict = Counter({'Hadoop':doc_frequency['hadoop'], 'MapReduce':doc_frequency['mapreduce'],
               'Spark':doc_frequency['spark'], 'Pig':doc_frequency['pig'],
               'Hive':doc_frequency['hive'], 'Shark':doc_frequency['shark'],
               'Oozie':doc_frequency['oozie'], 'ZooKeeper':doc_frequency['zookeeper'],
               'Flume':doc_frequency['flume'], 'Mahout':doc_frequency['mahout']})
               
    database_dict = Counter({'SQL':doc_frequency['sql'], 'NoSQL':doc_frequency['nosql'],
                 'HBase':doc_frequency['hbase'], 'Cassandra':doc_frequency['cassandra'],
                 'MongoDB':doc_frequency['mongodb']})
                     
    overall_total_skills = prog_lang_dict + analysis_tool_dict + hadoop_dict + database_dict
    final_frame = pd.DataFrame(overall_total_skills.items(), columns = ['Term', 'NumPostings']) 
    final_frame.NumPostings = (final_frame.NumPostings)*100/len(job_descriptions) 
    final_frame.sort_values(by='NumPostings', ascending = False, inplace = True)
    
    data = Data([
        Bar(x=list(final_frame.Term.values),
            y=list(final_frame.NumPostings.values),
            name='Data Science Skills')])
    
    layout = Layout(height=480,
        hovermode='closest',
        margin=Margin(r=40,
            t=50,
            b=140,
            l=40,
            pad=2),
        title=(city_title if city is None else city + ', ' + state) + ' Data Science Skills',
        width=650,
        xaxis=XAxis(nticks=200,
            tickangle=-90,
            tickfont=dict(size=7)))
    
    fig = Figure(data=data, layout=layout)
    return fig, final_frame # End of the function


def create_graph(dict_of_lists):
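    """Build an undirected networkx graph from a dict mapping each node to a
    list of its neighbors (e.g. the output of stack_graph)."""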
    G=nx.Graph()
    for i in dict_of_lists.keys():
        for ii in dict_of_lists[i]:
            G.add_edges_from([(i,ii)])
    return G


def stack_graph(search_term = "r", pages=None):
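    """Crawl Stack Overflow's most-frequent questions for a tag and, for each
    question, its "linked" questions; returns {question_href: [linked hrefs]}.

    `pages`, if given, is a (tag_page, linked_page) pair of start pages for
    resuming a partial crawl."""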
    list_of_links = {}
    base_add= "http://stackoverflow.com"
    headers = {'User-Agent' : "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/534.30 (KHTML, like Gecko) Ubuntu/11.04 Chromium/12.0.742.112 Chrome/12.0.742.112 Safari/534.30"}

    #-- get number of pages
    def num_pages(link_url = "http://stackoverflow.com/questions/tagged/r?page=1&sort=frequent&pageSize=50"):
        url_0 = link_url
        req_0 = urllib2.Request(url_0, headers=headers)
        con_0 = urllib2.urlopen(req_0)
        soup_0 = BeautifulSoup(con_0.read(), "lxml")

        counts_flag = soup_0.find_all('div', class_=re.compile('summarycount'))
        questions_tagged = re.findall('\d+', str(counts_flag))  
        total_questions = int(''.join(questions_tagged))
        pages = int(math.ceil(total_questions / 50.0))  # float division, or ceil() is a no-op
        return pages
    #--

    link_url_0 = "http://stackoverflow.com/questions/tagged/%s?page=%s&sort=frequent&pageSize=50" % (search_term,"1")
    pages_0s = 1
    pages_0 = num_pages(link_url=link_url_0)

    if pages is not None: pages_0s = pages[0]

    for i in range(pages_0s,pages_0+1):
        url = "http://stackoverflow.com/questions/tagged/%s?page=%s&sort=frequent&pageSize=50" % (search_term,str(i))
        req = urllib2.Request(url, headers=headers)
        con = urllib2.urlopen(req)
        soup = BeautifulSoup(con.read(), "lxml")
        links = soup.find_all('a', class_=re.compile('question-hyperlink'))

        for ii in links:
            question = base_add + ii['href']
            question_num = question.split('/')[4]

            link_url_1 = base_add + "/questions/linked/%s?pagesize=50&sort=hot" % (question_num)
            pages_1s = 1
            pages_1 = num_pages(link_url=link_url_1)
            linked_urls = []

            if pages is not None: pages_1s = pages[1]

            for iii in range(pages_1s,pages_1+1):
                question_url = base_add + "/questions/linked/%s?page=%s&sort=hot&pagesize=50" % (question_num,str(iii))
                request = urllib2.Request(question_url, headers=headers)
                connection = urllib2.urlopen(request)
                new_soup = BeautifulSoup(connection.read(), "lxml")
                linked_qs = new_soup.find_all('a', class_=re.compile('question-hyperlink'))


                for iiii in linked_qs:
                    linked_urls.append(iiii['href'])

            list_of_links[ii['href']]=linked_urls

        print len(list_of_links.keys())

    return list_of_links


def map_fig(df,of='Salary'):
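    """Build a plotly scattergeo figure dict of US cities, bubble-sized by
    salary and bucketed by rank into colored legend groups; `of` picks whether
    hover text and sort order use 'Salary' or 'Jobs'."""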
    if of=='Salary':
        df['text'] = df['city'] + '<br>' + of + ': ' + (df[of.lower()]/1e6).astype(str) + ' million'
        df.sort_values(by=of.lower(), ascending = False, inplace = True)
    elif of=='Jobs':
        df['text'] = df['city'] + '<br> ' + (df[of.lower()]).astype(str)+' '+ of
        df.sort_values(by=of.lower(), ascending = False, inplace = True)
    limits = [(50,3000),(20,50),(10,20),(3,10),(0,3)]
    colors = ['rgb(41, 128, 171)','rgb(104, 157, 46)','rgb(169, 140, 31)','rgb(178, 81, 28)','rgb(165, 28, 18)']
    cities = []
    scale = 25000
    start = 2
    for i in range(len(limits)):
        lim = limits[i]
        df_sub = df[lim[0]:lim[1]]
        city = dict(type = 'scattergeo',
            locationmode = 'USA-states',
            lon = df_sub['lon'],
            lat = df_sub['lat'],
            text = df_sub['text'],
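            # note: bubble size always scales with salary, even on the jobs map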
            marker = dict(size = (df_sub['salary'])/(scale/start),
                color = colors[i],
                line = dict(width=0.5, color='rgb(255,255,255)'),
                sizemode = 'area'),
            name = '{0} - {1}'.format(lim[0],lim[1]) )
        cities.append(city)
        start+=50

    layout = dict(title = '2016 Data Science '+of+' by City',
            showlegend = True,
            geo = dict(scope='usa',
                projection=dict( type='albers usa' ),
                showland = True,
                landcolor = 'rgb(67, 67, 67)',
                subunitwidth=1,
                countrywidth=1,
                subunitcolor="rgb(255, 255, 255)",
                countrycolor="rgb(255, 255, 255)"),)

    fig = dict( data=cities, layout=layout )
    return fig


def bar_fig(city_data):
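    """Build a grouped plotly bar chart comparing skill-term frequencies
    across cities; city_data is a list of (city, final_frame, color) triples,
    with final_frame as returned by skills_info."""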
    bars = {}
    for i in city_data:
        city = i[0]
        data = i[1]
        color = i[2]
        bars[city] = Bar(x=list(data.Term.values), y=list(data.NumPostings.values),
                        marker=Marker(color=color), name=city)    

    data = Data([bars[i] for i in bars.keys()])

    layout = Layout(autosize=True, bargap=0.15, bargroupgap=0, barmode='group',
        boxgap=0.3, boxgroupgap=0.3, boxmode='overlay', dragmode='zoom',
        font=Font(color='#444', family="'Open sans', verdana, arial, sans-serif", size=12),
        height=579, hidesources=False, hovermode='x',
        legend=Legend(x=0.8986784140969163, y=0.8521303258145363,
            bgcolor='#fff',bordercolor='#444',borderwidth=0,
            font=Font(color='', family='', size=0),
            traceorder='normal',xanchor='left',yanchor='top'),
        margin=Margin(r=80,t=100,autoexpand=True,b=80,l=80,pad=0),
        paper_bgcolor='#fff', plot_bgcolor='#fff', separators='.,', showlegend=True,
        smith=False, title='Tools by job description',
        titlefont=dict(color='',family='',size=0),width=1000,
        xaxis=XAxis(anchor='y' ,autorange=True, autotick=True, domain=[0, 1],
            dtick=1, exponentformat='B', gridcolor='#eee', gridwidth=1,
            linecolor='#444', linewidth=1, mirror=False, nticks=0, overlaying=False,
            position=0, range=[-0.5, 27.5], rangemode='normal', showexponent='all',
            showgrid=False, showline=False, showticklabels=True, tick0=0, tickangle='auto',
            tickcolor='#444', tickfont=dict(color='',family='',size=0),ticklen=5,
            ticks='',tickwidth=1,title='<br>Language/tool',
            titlefont=dict(color='',family='',size=0),type='category',zeroline=False,
            zerolinecolor='#444',zerolinewidth=1),
        yaxis=YAxis(anchor='x', autorange=True, autotick=True,
            domain=[0, 1], dtick=10, exponentformat='B',gridcolor='#eee',gridwidth=1,
            linecolor='#444',linewidth=1,mirror=False,nticks=0,overlaying=False,
            position=0,range=[0, 54.90526315789474],rangemode='normal',showexponent='all',
            showgrid=True,showline=False,showticklabels=True,tick0=0,tickangle='auto',
            tickcolor='#444', tickfont=dict(color='',family='',size=0),
            ticklen=5, ticks='', tickwidth=1, title='Percentage of posts',
            titlefont=dict(color='', family='', size=0), type='linear',
            zeroline=True,zerolinecolor='#444',zerolinewidth=1) )

    fig = Figure(data=data, layout=layout)
    return fig

def network_plot(list_of_edges=[("God","Bad"),("Bad","Mischievous"),("Mischievous","Good")], color=None, title="Earth"):
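    """Render an undirected graph as a 3D plotly figure, laid out with
    igraph's Kamada-Kawai algorithm; nodes are hover-labeled by name."""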
    sets_of_edges = [set(i) for i in list_of_edges]

    nodes = list(set.union(*sets_of_edges))
    edge_numbering = {node: n for n, node in enumerate(nodes)}

    edges=[]
    for i in list_of_edges:
        edges.append((edge_numbering[i[0]],edge_numbering[i[1]]))

    N = len(edge_numbering.keys())
    
    text = nodes  # hover labels, in the same order as the layout indices
    iG=ig.Graph(edges, directed=False)
    layt=iG.layout('kk', dim=3)
    
    Xn=[layt[k][0] for k in range(N)]# x-coordinates of nodes
    Yn=[layt[k][1] for k in range(N)]# y-coordinates
    Zn=[layt[k][2] for k in range(N)]# z-coordinates
    Xe=[]
    Ye=[]
    Ze=[]
    for e in edges:
        Xe+=[layt[e[0]][0],layt[e[1]][0], None]# x-coordinates of edge ends
        Ye+=[layt[e[0]][1],layt[e[1]][1], None]  
        Ze+=[layt[e[0]][2],layt[e[1]][2], None]

    t_1=Scatter3d(x=Xe, y=Ye, z=Ze, mode='lines', 
        line=Line(color='rgb(125,125,125)', width=1),hoverinfo='none')

    t_2=Scatter3d(x=Xn, y=Yn, z=Zn, mode='markers', name='actors',
        marker=Marker(symbol='dot', size=6, color=color,
            line=Line(color='rgb(50,50,50)', width=0.5)),text=text,hoverinfo='text')

    axis=dict(showbackground=False, showline=False, zeroline=False,
        showgrid=False, showticklabels=False, title='')

    layout = Layout(title=title, width=1000, height=1000, showlegend=False,
        scene=Scene(xaxis=XAxis(axis), yaxis=YAxis(axis), zaxis=ZAxis(axis),),
        margin=Margin(t=100), hovermode='closest', annotations=Annotations([
            Annotation(showarrow=False, 
                text="Data source: <a href='http://stackoverflow.com'>[1]</a>",
                xref='paper', yref='paper', x=0, y=0.1, xanchor='left',   
                yanchor='bottom', font=Font(size=14))]),)

    data=Data([t_1, t_2])
    fig=Figure(data=data, layout=layout)
    return fig