import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib import pyplot
import seaborn as sns
%matplotlib inline
# Load the survey responses and give the columns short, readable names.
df1 = pd.read_csv('Data_Science_Topics_Survey.csv')
column_names = ['time', 'Data Visualization', 'Machine Learning',
                'Data Analysis/Statistics', 'Big Data (Spark/Hadoop)',
                'Data Journalism', 'Deep Learning']
df1.columns = column_names
# The submission timestamp is not needed for the analysis.
df1 = df1.drop(columns='time')
df1.describe(include='all')
Data Visualization | Machine Learning | Data Analysis/Statistics | Big Data (Spark/Hadoop) | Data Journalism | Deep Learning | |
---|---|---|---|---|---|---|
count | 2176 | 2180 | 2192 | 2188 | 2120 | 2169 |
unique | 3 | 3 | 3 | 3 | 3 | 3 |
top | Very interested | Very interested | Very interested | Very interested | Somewhat interested | Very interested |
freq | 1340 | 1629 | 1688 | 1332 | 1081 | 1263 |
# Create an empty slot per survey topic (one key per DataFrame column).
dic = {topic: None for topic in df1.columns}
dic
{'Data Visualization': None, 'Machine Learning': None, 'Data Analysis/Statistics': None, 'Big Data (Spark/Hadoop)': None, 'Data Journalism': None, 'Deep Learning': None}
# Tally the answers for every topic and store each count Series in the dict.
for topic in df1.columns:
    dic[topic] = df1[topic].value_counts()

# Show the collected counts.
for topic, counts in dic.items():
    print(str(topic) + ':' + str(counts))
Data Visualization:Very interested 1340 Somewhat interested 734 Not interested 102 Name: Data Visualization, dtype: int64 Machine Learning:Very interested 1629 Somewhat interested 477 Not interested 74 Name: Machine Learning, dtype: int64 Data Analysis/Statistics:Very interested 1688 Somewhat interested 444 Not interested 60 Name: Data Analysis/Statistics, dtype: int64 Big Data (Spark/Hadoop):Very interested 1332 Somewhat interested 729 Not interested 127 Name: Big Data (Spark/Hadoop), dtype: int64 Data Journalism:Somewhat interested 1081 Not interested 610 Very interested 429 Name: Data Journalism, dtype: int64 Deep Learning:Very interested 1263 Somewhat interested 770 Not interested 136 Name: Deep Learning, dtype: int64
# Assemble the per-topic counts into a DataFrame: one row per topic,
# one column per answer category.
Q1df = pd.DataFrame(dic).T
Q1df.describe()
Not interested | Somewhat interested | Very interested | |
---|---|---|---|
count | 6.000000 | 6.000000 | 6.000000 |
mean | 184.833333 | 705.833333 | 1280.166667 |
std | 210.344876 | 231.135819 | 451.455166 |
min | 60.000000 | 444.000000 | 429.000000 |
25% | 81.000000 | 540.000000 | 1280.250000 |
50% | 114.500000 | 731.500000 | 1336.000000 |
75% | 133.750000 | 761.000000 | 1556.750000 |
max | 610.000000 | 1081.000000 | 1688.000000 |
# Capture the answer categories currently used as columns.
cols = list(Q1df.columns)
cols
['Not interested', 'Somewhat interested', 'Very interested']
# Promote the topic names from the index into a regular column.
Q1df = Q1df.reset_index()
Q1df.head()
index | Not interested | Somewhat interested | Very interested | |
---|---|---|---|---|
0 | Data Visualization | 102 | 734 | 1340 |
1 | Machine Learning | 74 | 477 | 1629 |
2 | Data Analysis/Statistics | 60 | 444 | 1688 |
3 | Big Data (Spark/Hadoop) | 127 | 729 | 1332 |
4 | Data Journalism | 610 | 1081 | 429 |
# The reset column is called 'index'; give it a meaningful name.
Q1df = Q1df.rename(columns={'index': 'Type'})
Q1df.sort_values(by='Type')
Type | Not interested | Somewhat interested | Very interested | |
---|---|---|---|---|
3 | Big Data (Spark/Hadoop) | 127 | 729 | 1332 |
2 | Data Analysis/Statistics | 60 | 444 | 1688 |
4 | Data Journalism | 610 | 1081 | 429 |
0 | Data Visualization | 102 | 734 | 1340 |
5 | Deep Learning | 136 | 770 | 1263 |
1 | Machine Learning | 74 | 477 | 1629 |
# Re-order the answer columns from most to least interested.
ordered_cols = ['Type', 'Very interested', 'Somewhat interested', 'Not interested']
Q1df = Q1df[ordered_cols]
Q1df
Type | Very interested | Somewhat interested | Not interested | |
---|---|---|---|---|
0 | Data Visualization | 1340 | 734 | 102 |
1 | Machine Learning | 1629 | 477 | 74 |
2 | Data Analysis/Statistics | 1688 | 444 | 60 |
3 | Big Data (Spark/Hadoop) | 1332 | 729 | 127 |
4 | Data Journalism | 429 | 1081 | 610 |
5 | Deep Learning | 1263 | 770 | 136 |
# Use the topic name as the row label from here on.
Q1df.set_index('Type', inplace=True)
Q1df
Very interested | Somewhat interested | Not interested | |
---|---|---|---|
Type | |||
Data Visualization | 1340 | 734 | 102 |
Machine Learning | 1629 | 477 | 74 |
Data Analysis/Statistics | 1688 | 444 | 60 |
Big Data (Spark/Hadoop) | 1332 | 729 | 127 |
Data Journalism | 429 | 1081 | 610 |
Deep Learning | 1263 | 770 | 136 |
# Rank topics by 'Very interested' responses, then add a per-topic
# response total (sum of the three answer columns).
Q1df = Q1df.sort_values('Very interested', ascending=False)
Q1df['Total'] = Q1df.sum(axis=1)
Q1df
Very interested | Somewhat interested | Not interested | Total | |
---|---|---|---|---|
Type | ||||
Data Analysis/Statistics | 1688 | 444 | 60 | 2192 |
Machine Learning | 1629 | 477 | 74 | 2180 |
Data Visualization | 1340 | 734 | 102 | 2176 |
Big Data (Spark/Hadoop) | 1332 | 729 | 127 | 2188 |
Deep Learning | 1263 | 770 | 136 | 2169 |
Data Journalism | 429 | 1081 | 610 | 2120 |
# Convert the raw counts into row-wise percentages, rounded to 2 decimals.
Q2df = Q1df.copy()
answer_cols = ['Very interested', 'Somewhat interested', 'Not interested']
Q2df[answer_cols] = (Q2df[answer_cols]
                     .div(Q2df['Total'], axis=0)
                     .mul(100)
                     .round(2))
Q2df = Q2df.drop(columns='Total')
Q2df
Very interested | Somewhat interested | Not interested | |
---|---|---|---|
Type | |||
Data Analysis/Statistics | 77.01 | 20.26 | 2.74 |
Machine Learning | 74.72 | 21.88 | 3.39 |
Data Visualization | 61.58 | 33.73 | 4.69 |
Big Data (Spark/Hadoop) | 60.88 | 33.32 | 5.80 |
Deep Learning | 58.23 | 35.50 | 6.27 |
Data Journalism | 20.24 | 50.99 | 28.77 |
# --- Bar chart: percentage of respondents interested in each area ---
label_size = 14
title_size = 16
color_list = ['#5cb85c', '#5bc0de', '#d9534f']  # green / blue / red per answer

ax = Q2df.plot(kind='bar', color=color_list, figsize=(20, 8), width=0.8)
# Fixed typo in the title: "Interst" -> "Interest".
ax.set_title("Percentage of Respondents' Interest in Data Science Areas",
             size=title_size, color='black')
# Style the existing tick labels in place; calling set_xticklabels with
# get_xticklabels() before a draw is fragile and triggers a warning.
ax.tick_params(axis='x', rotation=90, labelsize=label_size, labelcolor='black')
ax.axes.get_yaxis().set_visible(False)  # hide the y axis entirely
ax.xaxis.label.set_visible(False)       # hide the x axis title
for side in ('right', 'left', 'top'):   # remove the frame around the plot
    ax.spines[side].set_color('none')
ax.legend(fontsize=title_size)

# Label each bar with its percentage, centred above the bar top.
# (Replaces the previous hard-coded x+0.16 / y+1.5 placement, which was
# only approximately right for this exact bar width.)
for rect in ax.patches:
    height = rect.get_height()
    center_x = rect.get_x() + rect.get_width() / 2  # true bar centre
    ax.annotate(
        "{:.2f}%".format(height),   # percentage label text
        (center_x, height),         # anchor at the middle of the bar top
        xytext=(0, 5),              # nudge 5 points upward
        textcoords="offset points",
        ha='center',                # horizontally centre on the bar
        va='bottom',                # sit just above the bar
        color='blue', size=label_size)
# Load the 2016 San Francisco police-incident records for the map section.
df2 = pd.read_csv('Police_Department_Incidents_-_Previous_Year__2016_.csv')
df2.head()
IncidntNum | Category | Descript | DayOfWeek | Date | Time | PdDistrict | Resolution | Address | X | Y | Location | PdId | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 120058272 | WEAPON LAWS | POSS OF PROHIBITED WEAPON | Friday | 01/29/2016 12:00:00 AM | 11:00 | SOUTHERN | ARREST, BOOKED | 800 Block of BRYANT ST | -122.403405 | 37.775421 | (37.775420706711, -122.403404791479) | 12005827212120 |
1 | 120058272 | WEAPON LAWS | FIREARM, LOADED, IN VEHICLE, POSSESSION OR USE | Friday | 01/29/2016 12:00:00 AM | 11:00 | SOUTHERN | ARREST, BOOKED | 800 Block of BRYANT ST | -122.403405 | 37.775421 | (37.775420706711, -122.403404791479) | 12005827212168 |
2 | 141059263 | WARRANTS | WARRANT ARREST | Monday | 04/25/2016 12:00:00 AM | 14:59 | BAYVIEW | ARREST, BOOKED | KEITH ST / SHAFTER AV | -122.388856 | 37.729981 | (37.7299809672996, -122.388856204292) | 14105926363010 |
3 | 160013662 | NON-CRIMINAL | LOST PROPERTY | Tuesday | 01/05/2016 12:00:00 AM | 23:50 | TENDERLOIN | NONE | JONES ST / OFARRELL ST | -122.412971 | 37.785788 | (37.7857883766888, -122.412970537591) | 16001366271000 |
4 | 160002740 | NON-CRIMINAL | LOST PROPERTY | Friday | 01/01/2016 12:00:00 AM | 00:30 | MISSION | NONE | 16TH ST / MISSION ST | -122.419672 | 37.765050 | (37.7650501214668, -122.419671780296) | 16000274071000 |
# Keep only the police district and incident number — all we need for counting.
Q3df = df2.loc[:, ['PdDistrict', 'IncidntNum']]
Q3df.head()
PdDistrict | IncidntNum | |
---|---|---|
0 | SOUTHERN | 120058272 |
1 | SOUTHERN | 120058272 |
2 | BAYVIEW | 141059263 |
3 | TENDERLOIN | 160013662 |
4 | MISSION | 160002740 |
# Count the incidents recorded in each police district.
Q3df = Q3df.groupby('PdDistrict')[['IncidntNum']].count()
Q3df
IncidntNum | |
---|---|
PdDistrict | |
BAYVIEW | 14303 |
CENTRAL | 17666 |
INGLESIDE | 11594 |
MISSION | 19503 |
NORTHERN | 20100 |
PARK | 8699 |
RICHMOND | 8922 |
SOUTHERN | 28445 |
TARAVAL | 11325 |
TENDERLOIN | 9942 |
# Summary statistics of the per-district incident counts.
Q3df.describe()
IncidntNum | |
---|---|
count | 10.000000 |
mean | 15049.900000 |
std | 6341.731352 |
min | 8699.000000 |
25% | 10287.750000 |
50% | 12948.500000 |
75% | 19043.750000 |
max | 28445.000000 |
# Move the district names out of the index into a regular column.
Q3df = Q3df.reset_index()
Q3df
PdDistrict | IncidntNum | |
---|---|---|
0 | BAYVIEW | 14303 |
1 | CENTRAL | 17666 |
2 | INGLESIDE | 11594 |
3 | MISSION | 19503 |
4 | NORTHERN | 20100 |
5 | PARK | 8699 |
6 | RICHMOND | 8922 |
7 | SOUTHERN | 28445 |
8 | TARAVAL | 11325 |
9 | TENDERLOIN | 9942 |
# Rename to the column names the choropleth step below expects.
Q3df.rename(columns={'PdDistrict': 'Neighborhood', 'IncidntNum': 'Count'},
            inplace=True)
Q3df
Neighborhood | Count | |
---|---|---|
0 | BAYVIEW | 14303 |
1 | CENTRAL | 17666 |
2 | INGLESIDE | 11594 |
3 | MISSION | 19503 |
4 | NORTHERN | 20100 |
5 | PARK | 8699 |
6 | RICHMOND | 8922 |
7 | SOUTHERN | 28445 |
8 | TARAVAL | 11325 |
9 | TENDERLOIN | 9942 |
# Sanity check: 10 districts, no missing values; Count is int64.
Q3df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 10 entries, 0 to 9 Data columns (total 2 columns): Neighborhood 10 non-null object Count 10 non-null int64 dtypes: int64(1), object(1) memory usage: 240.0+ bytes
import folium

# GeoJSON with the San Francisco police-district boundaries.
sanfrancisco_geo = r'san_francisco.json'
sanfrancisco_map = folium.Map(location=[37.77, -122.42], zoom_start=12)

# folium.Map.choropleth() was deprecated and later removed; the supported
# API is the folium.Choropleth class added onto the map.
folium.Choropleth(
    geo_data=sanfrancisco_geo,
    data=Q3df,
    columns=['Neighborhood', 'Count'],
    # key_on must match the district property name in the GeoJSON exactly;
    # a wrong key renders the base map with no choropleth colouring.
    key_on='feature.properties.DISTRICT',
    fill_color='YlOrRd',
    fill_opacity=0.7,
    line_opacity=0.2,
    # NOTE(review): the data are raw incident counts, not rates — consider
    # renaming the legend or normalising by population.
    legend_name='Crime Rate in San Francisco'
).add_to(sanfrancisco_map)
sanfrancisco_map