import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib import pyplot
import seaborn as sns
%matplotlib inline
# Load the survey responses and give the columns short, readable names.
df1 = pd.read_csv('Data_Science_Topics_Survey.csv')
column_names = ['time', 'Data Visualization', 'Machine Learning',
                'Data Analysis/Statistics', 'Big Data (Spark/Hadoop)',
                'Data Journalism', 'Deep Learning']
df1.columns = column_names
# The submission timestamp is not needed for the analysis.
df1 = df1.drop(columns='time')
df1.describe(include='all')
Data Visualization | Machine Learning | Data Analysis/Statistics | Big Data (Spark/Hadoop) | Data Journalism | Deep Learning | |
---|---|---|---|---|---|---|
count | 2176 | 2180 | 2192 | 2188 | 2120 | 2169 |
unique | 3 | 3 | 3 | 3 | 3 | 3 |
top | Very interested | Very interested | Very interested | Very interested | Somewhat interested | Very interested |
freq | 1340 | 1629 | 1688 | 1332 | 1081 | 1263 |
# Create an empty slot per survey topic (one key per DataFrame column).
dic = {topic: None for topic in df1.columns}
dic
{'Data Visualization': None, 'Machine Learning': None, 'Data Analysis/Statistics': None, 'Big Data (Spark/Hadoop)': None, 'Data Journalism': None, 'Deep Learning': None}
# Tally the answers for every topic and store each count Series in the dict.
for topic in df1.columns:
    dic[topic] = df1[topic].value_counts()

# Show the collected counts.
for topic, counts in dic.items():
    print(str(topic) + ':' + str(counts))
Data Visualization:Very interested 1340 Somewhat interested 734 Not interested 102 Name: Data Visualization, dtype: int64 Machine Learning:Very interested 1629 Somewhat interested 477 Not interested 74 Name: Machine Learning, dtype: int64 Data Analysis/Statistics:Very interested 1688 Somewhat interested 444 Not interested 60 Name: Data Analysis/Statistics, dtype: int64 Big Data (Spark/Hadoop):Very interested 1332 Somewhat interested 729 Not interested 127 Name: Big Data (Spark/Hadoop), dtype: int64 Data Journalism:Somewhat interested 1081 Not interested 610 Very interested 429 Name: Data Journalism, dtype: int64 Deep Learning:Very interested 1263 Somewhat interested 770 Not interested 136 Name: Deep Learning, dtype: int64
# Assemble the per-topic counts into a DataFrame: one row per topic,
# one column per answer category.
Q1df = pd.DataFrame(dic).T
Q1df.describe()
Not interested | Somewhat interested | Very interested | |
---|---|---|---|
count | 6.000000 | 6.000000 | 6.000000 |
mean | 184.833333 | 705.833333 | 1280.166667 |
std | 210.344876 | 231.135819 | 451.455166 |
min | 60.000000 | 444.000000 | 429.000000 |
25% | 81.000000 | 540.000000 | 1280.250000 |
50% | 114.500000 | 731.500000 | 1336.000000 |
75% | 133.750000 | 761.000000 | 1556.750000 |
max | 610.000000 | 1081.000000 | 1688.000000 |
# Capture the answer categories currently used as columns.
cols = list(Q1df.columns)
cols
['Not interested', 'Somewhat interested', 'Very interested']
# Promote the topic names from the index into a regular column.
Q1df = Q1df.reset_index()
Q1df.head()
index | Not interested | Somewhat interested | Very interested | |
---|---|---|---|---|
0 | Data Visualization | 102 | 734 | 1340 |
1 | Machine Learning | 74 | 477 | 1629 |
2 | Data Analysis/Statistics | 60 | 444 | 1688 |
3 | Big Data (Spark/Hadoop) | 127 | 729 | 1332 |
4 | Data Journalism | 610 | 1081 | 429 |
# The reset column is called 'index'; give it a meaningful name.
Q1df = Q1df.rename(columns={'index': 'Type'})
Q1df.sort_values(by='Type')
Type | Not interested | Somewhat interested | Very interested | |
---|---|---|---|---|
3 | Big Data (Spark/Hadoop) | 127 | 729 | 1332 |
2 | Data Analysis/Statistics | 60 | 444 | 1688 |
4 | Data Journalism | 610 | 1081 | 429 |
0 | Data Visualization | 102 | 734 | 1340 |
5 | Deep Learning | 136 | 770 | 1263 |
1 | Machine Learning | 74 | 477 | 1629 |
# Re-order the answer columns from most to least interested.
ordered_cols = ['Type', 'Very interested', 'Somewhat interested', 'Not interested']
Q1df = Q1df[ordered_cols]
Q1df
Type | Very interested | Somewhat interested | Not interested | |
---|---|---|---|---|
0 | Data Visualization | 1340 | 734 | 102 |
1 | Machine Learning | 1629 | 477 | 74 |
2 | Data Analysis/Statistics | 1688 | 444 | 60 |
3 | Big Data (Spark/Hadoop) | 1332 | 729 | 127 |
4 | Data Journalism | 429 | 1081 | 610 |
5 | Deep Learning | 1263 | 770 | 136 |
# Use the topic name as the row label from here on.
Q1df.set_index('Type', inplace=True)
Q1df
Very interested | Somewhat interested | Not interested | |
---|---|---|---|
Type | |||
Data Visualization | 1340 | 734 | 102 |
Machine Learning | 1629 | 477 | 74 |
Data Analysis/Statistics | 1688 | 444 | 60 |
Big Data (Spark/Hadoop) | 1332 | 729 | 127 |
Data Journalism | 429 | 1081 | 610 |
Deep Learning | 1263 | 770 | 136 |
# Rank topics by 'Very interested' responses, then add a per-topic
# response total (sum of the three answer columns).
Q1df = Q1df.sort_values('Very interested', ascending=False)
Q1df['Total'] = Q1df.sum(axis=1)
Q1df
Very interested | Somewhat interested | Not interested | Total | |
---|---|---|---|---|
Type | ||||
Data Analysis/Statistics | 1688 | 444 | 60 | 2192 |
Machine Learning | 1629 | 477 | 74 | 2180 |
Data Visualization | 1340 | 734 | 102 | 2176 |
Big Data (Spark/Hadoop) | 1332 | 729 | 127 | 2188 |
Deep Learning | 1263 | 770 | 136 | 2169 |
Data Journalism | 429 | 1081 | 610 | 2120 |
# Convert the raw counts into row-wise percentages, rounded to 2 decimals.
Q2df = Q1df.copy()
answer_cols = ['Very interested', 'Somewhat interested', 'Not interested']
Q2df[answer_cols] = (Q2df[answer_cols]
                     .div(Q2df['Total'], axis=0)
                     .mul(100)
                     .round(2))
Q2df = Q2df.drop(columns='Total')
Q2df
Very interested | Somewhat interested | Not interested | |
---|---|---|---|
Type | |||
Data Analysis/Statistics | 77.01 | 20.26 | 2.74 |
Machine Learning | 74.72 | 21.88 | 3.39 |
Data Visualization | 61.58 | 33.73 | 4.69 |
Big Data (Spark/Hadoop) | 60.88 | 33.32 | 5.80 |
Deep Learning | 58.23 | 35.50 | 6.27 |
Data Journalism | 20.24 | 50.99 | 28.77 |
# --- Bar chart: percentage of respondents interested in each area ---
label_size = 14
title_size = 16
color_list = ['#5cb85c', '#5bc0de', '#d9534f']  # green / blue / red per answer

ax = Q2df.plot(kind='bar', color=color_list, figsize=(20, 8), width=0.8)
# Fixed typo in the title: "Interst" -> "Interest".
ax.set_title("Percentage of Respondents' Interest in Data Science Areas",
             size=title_size, color='black')
# Style the existing tick labels in place; calling set_xticklabels with
# get_xticklabels() before a draw is fragile and triggers a warning.
ax.tick_params(axis='x', rotation=90, labelsize=label_size, labelcolor='black')
ax.axes.get_yaxis().set_visible(False)  # hide the y axis entirely
ax.xaxis.label.set_visible(False)       # hide the x axis title
for side in ('right', 'left', 'top'):   # remove the frame around the plot
    ax.spines[side].set_color('none')
ax.legend(fontsize=title_size)

# Label each bar with its percentage, centred above the bar top.
# (Replaces the previous hard-coded x+0.16 / y+1.5 placement, which was
# only approximately right for this exact bar width.)
for rect in ax.patches:
    height = rect.get_height()
    center_x = rect.get_x() + rect.get_width() / 2  # true bar centre
    ax.annotate(
        "{:.2f}%".format(height),   # percentage label text
        (center_x, height),         # anchor at the middle of the bar top
        xytext=(0, 5),              # nudge 5 points upward
        textcoords="offset points",
        ha='center',                # horizontally centre on the bar
        va='bottom',                # sit just above the bar
        color='blue', size=label_size)
# Load the 2016 San Francisco police-incident records for the map section.
df2 = pd.read_csv('Police_Department_Incidents_-_Previous_Year__2016_.csv')
df2.head()
IncidntNum | Category | Descript | DayOfWeek | Date | Time | PdDistrict | Resolution | Address | X | Y | Location | PdId | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 120058272 | WEAPON LAWS | POSS OF PROHIBITED WEAPON | Friday | 01/29/2016 12:00:00 AM | 11:00 | SOUTHERN | ARREST, BOOKED | 800 Block of BRYANT ST | -122.403405 | 37.775421 | (37.775420706711, -122.403404791479) | 12005827212120 |
1 | 120058272 | WEAPON LAWS | FIREARM, LOADED, IN VEHICLE, POSSESSION OR USE | Friday | 01/29/2016 12:00:00 AM | 11:00 | SOUTHERN | ARREST, BOOKED | 800 Block of BRYANT ST | -122.403405 | 37.775421 | (37.775420706711, -122.403404791479) | 12005827212168 |
2 | 141059263 | WARRANTS | WARRANT ARREST | Monday | 04/25/2016 12:00:00 AM | 14:59 | BAYVIEW | ARREST, BOOKED | KEITH ST / SHAFTER AV | -122.388856 | 37.729981 | (37.7299809672996, -122.388856204292) | 14105926363010 |
3 | 160013662 | NON-CRIMINAL | LOST PROPERTY | Tuesday | 01/05/2016 12:00:00 AM | 23:50 | TENDERLOIN | NONE | JONES ST / OFARRELL ST | -122.412971 | 37.785788 | (37.7857883766888, -122.412970537591) | 16001366271000 |
4 | 160002740 | NON-CRIMINAL | LOST PROPERTY | Friday | 01/01/2016 12:00:00 AM | 00:30 | MISSION | NONE | 16TH ST / MISSION ST | -122.419672 | 37.765050 | (37.7650501214668, -122.419671780296) | 16000274071000 |
# Keep only the police district and incident number — all we need for counting.
Q3df = df2.loc[:, ['PdDistrict', 'IncidntNum']]
Q3df.head()
PdDistrict | IncidntNum | |
---|---|---|
0 | SOUTHERN | 120058272 |
1 | SOUTHERN | 120058272 |
2 | BAYVIEW | 141059263 |
3 | TENDERLOIN | 160013662 |
4 | MISSION | 160002740 |
# Count the incidents recorded in each police district.
Q3df = Q3df.groupby('PdDistrict')[['IncidntNum']].count()
Q3df
IncidntNum | |
---|---|
PdDistrict | |
BAYVIEW | 14303 |
CENTRAL | 17666 |
INGLESIDE | 11594 |
MISSION | 19503 |
NORTHERN | 20100 |
PARK | 8699 |
RICHMOND | 8922 |
SOUTHERN | 28445 |
TARAVAL | 11325 |
TENDERLOIN | 9942 |
# Summary statistics of the per-district incident counts.
Q3df.describe()
IncidntNum | |
---|---|
count | 10.000000 |
mean | 15049.900000 |
std | 6341.731352 |
min | 8699.000000 |
25% | 10287.750000 |
50% | 12948.500000 |
75% | 19043.750000 |
max | 28445.000000 |
# Move the district names out of the index into a regular column.
Q3df = Q3df.reset_index()
Q3df
PdDistrict | IncidntNum | |
---|---|---|
0 | BAYVIEW | 14303 |
1 | CENTRAL | 17666 |
2 | INGLESIDE | 11594 |
3 | MISSION | 19503 |
4 | NORTHERN | 20100 |
5 | PARK | 8699 |
6 | RICHMOND | 8922 |
7 | SOUTHERN | 28445 |
8 | TARAVAL | 11325 |
9 | TENDERLOIN | 9942 |
# Rename to the column names the choropleth step below expects.
Q3df.rename(columns={'PdDistrict': 'Neighborhood', 'IncidntNum': 'Count'},
            inplace=True)
Q3df
Neighborhood | Count | |
---|---|---|
0 | BAYVIEW | 14303 |
1 | CENTRAL | 17666 |
2 | INGLESIDE | 11594 |
3 | MISSION | 19503 |
4 | NORTHERN | 20100 |
5 | PARK | 8699 |
6 | RICHMOND | 8922 |
7 | SOUTHERN | 28445 |
8 | TARAVAL | 11325 |
9 | TENDERLOIN | 9942 |
# Sanity check: 10 districts, no missing values; Count is int64.
Q3df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 10 entries, 0 to 9 Data columns (total 2 columns): Neighborhood 10 non-null object Count 10 non-null int64 dtypes: int64(1), object(1) memory usage: 240.0+ bytes
import folium

# GeoJSON with the San Francisco police-district boundaries.
sanfrancisco_geo = r'san_francisco.json'
sanfrancisco_map = folium.Map(location=[37.77, -122.42], zoom_start=12)

# folium.Map.choropleth() was deprecated and later removed; the supported
# API is the folium.Choropleth class added onto the map.
folium.Choropleth(
    geo_data=sanfrancisco_geo,
    data=Q3df,
    columns=['Neighborhood', 'Count'],
    # key_on must match the district property name in the GeoJSON exactly;
    # a wrong key renders the base map with no choropleth colouring.
    key_on='feature.properties.DISTRICT',
    fill_color='YlOrRd',
    fill_opacity=0.7,
    line_opacity=0.2,
    # NOTE(review): the data are raw incident counts, not rates — consider
    # renaming the legend or normalising by population.
    legend_name='Crime Rate in San Francisco'
).add_to(sanfrancisco_map)
sanfrancisco_map