import json
import requests
from pandas import DataFrame, Series
import pandas as pd
import numpy as np

# read population in from JSON-formatted data derived from Wikipedia
pop_json_url = ("https://gist.github.com/rdhyee/8511607/"
                "raw/f16257434352916574473e63612fcea55a0c1b1c/population_of_countries.json")
pop_list = requests.get(pop_json_url).json()

df = DataFrame(pop_list)
df[:5]

df.dtypes

s1 = Series(np.arange(-1, 4))
s1

from census import Census
from us import states
import settings

c = Census(settings.CENSUS_KEY)
# total population (P0010001) for California from the 2010 Census SF1
c.sf1.get(('NAME', 'P0010001'), {'for': 'state:%s' % states.CA.fips})

# get the total population for every state
r = c.sf1.get(('NAME', 'P0010001'), {'for': 'state:*'})
df1 = DataFrame(r)
df1.head()

len(df1)

# P0010001 comes back as strings, so summing concatenates rather than adds;
# cast to int first to get a numeric total
print(df1.P0010001.sum())
print()
print(df1.P0010001.astype(int).sum())

# ~ negates a boolean Series elementwise
~Series([True, True, False, True])

# compare the total over all rows with the total over the states proper
pop1 = df1['P0010001'].astype('int').sum()
pop2 = df1[np.in1d(df1.state, [s.fips for s in states.STATES])]['P0010001'].astype('int').sum()
pop1 - pop2   # the difference comes from rows not in us.states.STATES (e.g., Puerto Rico)

import census
import us
from itertools import islice

def places(variables="NAME"):
    """generate all Census places, state by state"""
    for state in us.states.STATES:
        geo = {'for': 'place:*',
               'in': 'state:{s_fips}'.format(s_fips=state.fips)}
        for place in c.sf1.get(variables, geo=geo):
            yield place

# islice(..., None) runs the generator to exhaustion
r = list(islice(places("NAME,P0010001"), None))
places_df = DataFrame(r)
places_df.P0010001 = places_df.P0010001.astype('int')

print("number of places", len(places_df))
print("total pop", places_df.P0010001.sum())
places_df.head()

# places in California (state FIPS '06'), largest population first
places_df[places_df.state == '06'].sort_values(by='P0010001', ascending=False).head()

# for example, using lower- and uppercase English letters
import string

lower = Series(list(string.ascii_lowercase), name='lower')
upper = Series(list(string.ascii_uppercase), name='upper')
df2 = pd.concat((lower, upper), axis=1)
df2['ord'] = df2['lower'].apply(ord)
df2.head()

'b'.upper()

words = ['Berkeley', 'I', 'School']
for (i, word) in islice(enumerate(words), 1):
    print(i, word)

def g2():
    words = ['Berkeley', 'I', 'School']
    for word in words:
        if word != 'I':
            for letter in list(word):
                yield letter

my_g2 = g2()

def g3():
    # note: this yields the entire words list once per word (three times),
    # not the individual words
    words = ['Berkeley', 'I', 'School']
    for word in words:
        yield words

def states(variables='NAME'):
    """generate the state rows from the 2010 Census SF1"""
    geo = {'for': 'state:*'}
    states_fips = set(state.fips for state in us.states.STATES)
    # need to filter out non-states
    for r in c.sf1.get(variables, geo=geo, year=2010):
        if r['state'] in states_fips:
            yield r

# make a dataframe from the total populations of states in the 2010 Census
df = DataFrame(states('NAME,P0010001'))
df.P0010001 = df.P0010001.astype('int')
df['first_letter'] = df.NAME.apply(lambda s: s[0])
df.head()

print(list(df.NAME))

def normalize(s):
    """take a Series and divide each item by the sum so that the new Series adds up to 1.0"""
    total = np.sum(s)
    return s.astype('float') / total

def entropy(series):
    """Normalized Shannon index: 1.0 when all entries are equal,
    0.0 when there are fewer than two nonzero entries"""
    # eliminate 0s, which contribute nothing to the entropy
    series1 = series[series != 0]
    if len(series1) > 1:
        # maximum possible entropy for the given length of the input series
        max_s = -np.log(1.0 / len(series))
        p = series1.astype('float') / series1.sum()
        return np.sum(-p * np.log(p)) / max_s
    else:
        return 0.0
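# A quick sanity check of entropy() (an added illustration, not from the
# original notebook): a Series whose entries are all equal should have a
# normalized entropy of exactly 1.0, while a highly concentrated Series
# should score much lower.
print(entropy(Series([10, 10, 10, 10])))   # 1.0: uniform distribution
print(entropy(Series([100, 1, 1, 1])))     # well below 1.0: concentrated
print(entropy(Series([7])))                # 0.0: fewer than two nonzero entries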
def gini_simpson(s):
    """Gini-Simpson diversity index:
    https://en.wikipedia.org/wiki/Diversity_index#Gini.E2.80.93Simpson_index"""
    s1 = normalize(s)
    return 1 - np.sum(s1 * s1)
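# Hedged usage sketch (assumes the df of 2010 state populations built
# above, with its first_letter column): compare the two diversity
# measures on state population totals grouped by first letter.
by_letter = df.groupby('first_letter')['P0010001'].sum()
print(entropy(by_letter))        # normalized Shannon index, in [0, 1]
print(gini_simpson(by_letter))   # Gini-Simpson index, in [0, 1 - 1/n]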