import pandas as pd
# Read the TED talks CSV into a DataFrame and display one random row.
df = pd.read_csv("ted_main.csv")
df.sample(n=1)
comments | description | duration | event | film_date | languages | main_speaker | name | num_speaker | published_date | ratings | related_talks | speaker_occupation | tags | title | url | views | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
2429 | 18 | TED Fellow Zubaida Bai works with medical prof... | 404 | TED Residency | 1481587200 | 23 | Zubaida Bai | Zubaida Bai: A simple birth kit for mothers in... | 1 | 1492095971 | [{'id': 8, 'name': 'Informative', 'count': 126... | [{'id': 1289, 'hero': 'https://pe.tedcdn.com/i... | Women's health advocate | ['TED Residency', 'design', 'entrepreneur', 'f... | A simple birth kit for mothers in the developi... | https://www.ted.com/talks/zubaida_bai_a_simple... | 801279 |
# Print the column names, non-null counts and dtypes of the freshly loaded frame.
df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 2550 entries, 0 to 2549 Data columns (total 17 columns): comments 2550 non-null int64 description 2550 non-null object duration 2550 non-null int64 event 2550 non-null object film_date 2550 non-null int64 languages 2550 non-null int64 main_speaker 2550 non-null object name 2550 non-null object num_speaker 2550 non-null int64 published_date 2550 non-null int64 ratings 2550 non-null object related_talks 2550 non-null object speaker_occupation 2544 non-null object tags 2550 non-null object title 2550 non-null object url 2550 non-null object views 2550 non-null int64 dtypes: int64(7), object(10) memory usage: 338.8+ KB
# Display the column labels of the frame.
df.columns
Index(['comments', 'description', 'duration', 'event', 'film_date', 'languages', 'main_speaker', 'name', 'num_speaker', 'published_date', 'ratings', 'related_talks', 'speaker_occupation', 'tags', 'title', 'url', 'views'], dtype='object')
import datetime
# Turn the Unix-epoch second counts in the two date columns into
# 'dd-mm-YYYY' strings.
#
# pd.to_datetime(..., unit='s') reads the values as seconds since the epoch
# in UTC, so the calendar date no longer depends on the timezone of the
# machine running the notebook. datetime.fromtimestamp() converts to *local*
# time, which moves midnight-UTC values (e.g. film_date 1481587200) to the
# previous day on machines west of UTC. It is also vectorised instead of
# calling a Python lambda per row.
for date_col in ('published_date', 'film_date'):
    df[date_col] = pd.to_datetime(df[date_col], unit='s').dt.strftime('%d-%m-%Y')
df.sample()
comments | description | duration | event | film_date | languages | main_speaker | name | num_speaker | published_date | ratings | related_talks | speaker_occupation | tags | title | url | views | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
2380 | 36 | How do parents protect their children and help... | 856 | TEDxManchester | 14-02-2016 | 20 | Aala El-Khani | Aala El-Khani: What it's like to be a parent i... | 1 | 10-02-2017 | [{'id': 8, 'name': 'Informative', 'count': 203... | [{'id': 2353, 'hero': 'https://pe.tedcdn.com/i... | Humanitarian psychologist | ['Middle East', 'Syria', 'TEDx', 'children', '... | What it's like to be a parent in a war zone | https://www.ted.com/talks/aala_el_khani_what_i... | 896491 |
def create_month(x):
    """Return the English month name for every date in *x*.

    Parameters
    ----------
    x : array-like
        Dates, typically the 'dd-mm-YYYY' strings held in the date columns
        of this notebook.

    Returns
    -------
    pandas.Index
        Month names ('January', 'February', ...), one per input element.
    """
    # The date columns are formatted with '%d-%m-%Y', so values whose day is
    # <= 12 (e.g. '01-08-2006') are ambiguous. Without dayfirst=True pandas
    # reads them month-first and reports 8 January instead of 1 August.
    month = pd.DatetimeIndex(x, dayfirst=True).month_name()
    return month
# Add a `published_month` column computed from `published_date` by create_month.
df['published_month'] = create_month(df['published_date'])
def create_day(x):
    """Return the English weekday name for every date in *x*.

    Parameters
    ----------
    x : array-like
        Dates, typically the 'dd-mm-YYYY' strings held in the date columns
        of this notebook.

    Returns
    -------
    pandas.Index
        Weekday names ('Monday', 'Tuesday', ...), one per input element.
    """
    # Dates are stored day-first ('%d-%m-%Y'); parse them that way so that
    # '01-08-2006' is 1 August (a Tuesday), not 8 January (a Sunday).
    day = pd.DatetimeIndex(x, dayfirst=True).day_name()
    return day
# Add a `published_day` column computed from `published_date` by create_day.
df['published_day'] = create_day(df['published_date'])
def create_year(x):
    """Return the calendar year for every date in *x*.

    Parameters
    ----------
    x : array-like
        Dates, typically the 'dd-mm-YYYY' strings held in the date columns
        of this notebook.

    Returns
    -------
    pandas.Index
        Integer years, one per input element.
    """
    # Parse day-first to match the '%d-%m-%Y' strings stored in the date
    # columns, so the year is taken from the same interpretation of the
    # date that the month/day helpers should use.
    year = pd.DatetimeIndex(x, dayfirst=True).year
    return year
# Add a `published_year` column computed from `published_date` by create_year.
df['published_year'] = create_year(df['published_date'])
def create_year(x):
    """Return the calendar year for every date in *x*.

    This re-defines the helper of the same name from earlier in the file.

    Parameters
    ----------
    x : array-like
        Dates, typically the 'dd-mm-YYYY' strings held in the date columns
        of this notebook.

    Returns
    -------
    pandas.Index
        Integer years, one per input element.
    """
    # Parse day-first to match the '%d-%m-%Y' strings stored in the date
    # columns.
    year = pd.DatetimeIndex(x, dayfirst=True).year
    return year
# Add a `film_year` column computed from `film_date` by create_year.
df['film_year'] = create_year(df['film_date'])
def create_day(x):
    """Return the English weekday name for every date in *x*.

    This re-defines the helper of the same name from earlier in the file.

    Parameters
    ----------
    x : array-like
        Dates, typically the 'dd-mm-YYYY' strings held in the date columns
        of this notebook.

    Returns
    -------
    pandas.Index
        Weekday names ('Monday', 'Tuesday', ...), one per input element.
    """
    # Dates are stored day-first ('%d-%m-%Y'); without dayfirst=True a value
    # such as '06-02-2006' is read as 2 June instead of 6 February.
    day = pd.DatetimeIndex(x, dayfirst=True).day_name()
    return day
# Add a `film_day` column computed from `film_date` by create_day.
df['film_day'] = create_day(df['film_date'])
def create_month(x):
    """Return the English month name for every date in *x*.

    This re-defines the helper of the same name from earlier in the file.

    Parameters
    ----------
    x : array-like
        Dates, typically the 'dd-mm-YYYY' strings held in the date columns
        of this notebook.

    Returns
    -------
    pandas.Index
        Month names ('January', 'February', ...), one per input element.
    """
    # Dates are stored day-first ('%d-%m-%Y'); without dayfirst=True a value
    # such as '06-02-2006' is reported as June instead of February.
    month = pd.DatetimeIndex(x, dayfirst=True).month_name()
    return month
# Add a `film_month` column computed from `film_date` by create_month.
df['film_month'] = create_month(df['film_date'])
# Total views for every (published_year, published_month) pair, largest first.
data = (
    df.groupby(["published_year", "published_month"])["views"]
    .sum()
    .sort_values(ascending=False)
    .reset_index()
)
# Display the year column rendered as strings; the result is not stored.
data["published_year"].astype(str)
0 2006 1 2012 2 2013 3 2014 4 2013 ... 136 2007 137 2017 138 2006 139 2006 140 2006 Name: published_year, Length: 141, dtype: object
# Display the largest value in the `published_year` column.
df['published_year'].max()
2017
def create_day_number(x):
    """Return the weekday number for every date in *x*.

    Parameters
    ----------
    x : array-like
        Dates, typically the 'dd-mm-YYYY' strings held in the date columns
        of this notebook.

    Returns
    -------
    pandas.Index
        Integers from ``DatetimeIndex.dayofweek`` (Monday=0 ... Sunday=6),
        one per input element.
    """
    # Dates are stored day-first ('%d-%m-%Y'); parse them that way so that
    # '01-08-2006' is 1 August (Tuesday -> 1), not 8 January (Sunday -> 6).
    day = pd.DatetimeIndex(x, dayfirst=True).dayofweek
    return day
# Add a `published_day_no` column computed from `published_date` by
# create_day_number, then display one random row to inspect the new columns.
df['published_day_no'] = create_day_number(df['published_date'])
df.sample()
comments | description | duration | event | film_date | languages | main_speaker | name | num_speaker | published_date | ... | title | url | views | published_month | published_day | published_year | film_year | film_day | film_month | published_day_no | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
750 | 236 | Science columnist Lee Hotz describes a remarka... | 585 | TEDGlobal 2010 | 14-07-2010 | 27 | Lee Hotz | Lee Hotz: Inside an Antarctic time machine | 1 | 24-08-2010 | ... | Inside an Antarctic time machine | https://www.ted.com/talks/lee_hotz_inside_an_a... | 592082 | August | Tuesday | 2010 | 2010 | Wednesday | July | 1 |
1 rows × 24 columns
# Summed views per (speaker_occupation, comments) pair, sorted descending and
# cut to the first 100 rows; then show the last five of those.
data = (
    df.groupby(["speaker_occupation", "comments"])["views"]
    .sum()
    .sort_values(ascending=False)
    .reset_index()
    .head(100)
)
data.tail()
speaker_occupation | comments | views | |
---|---|---|---|
95 | Poet | 720 | 5316753 |
96 | Bluegrass musicians | 129 | 5199008 |
97 | Founder of the Beirut Marathon | 186 | 5182832 |
98 | Neuroscientist, Artist | 234 | 5006241 |
99 | Psychologist, author | 271 | 4984884 |
# Summed views per (published_year, published_day) pair, sorted descending and
# cut to the first 10 rows; then show the first five.
data = (
    df.groupby(["published_year", "published_day"])["views"]
    .sum()
    .sort_values(ascending=False)
    .reset_index()
    .head(10)
)
data.head()
published_year | published_day | views | |
---|---|---|---|
0 | 2006 | Tuesday | 143064168 |
1 | 2013 | Wednesday | 119379446 |
2 | 2013 | Friday | 116183651 |
3 | 2014 | Friday | 113215249 |
4 | 2010 | Monday | 106121776 |
# Rows whose publication weekday number is 6 or 0 (Sunday or Monday in
# pandas' dayofweek numbering).
df[df["published_day_no"].isin([6, 0])]
comments | description | duration | event | film_date | languages | main_speaker | name | num_speaker | published_date | ... | title | url | views | published_month | published_day | published_year | film_year | film_day | film_month | published_day_no | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
13 | 242 | Jeff Han shows off a cheap, scalable multi-tou... | 527 | TED2006 | 06-02-2006 | 27 | Jeff Han | Jeff Han: The radical promise of the multi-tou... | 1 | 01-08-2006 | ... | The radical promise of the multi-touch interface | https://www.ted.com/talks/jeff_han_demos_his_b... | 4531020 | January | Sunday | 2006 | 2006 | Friday | June | 6 |
14 | 99 | Nicholas Negroponte, founder of the MIT Media ... | 1057 | TED2006 | 23-02-2006 | 25 | Nicholas Negroponte | Nicholas Negroponte: One Laptop per Child | 1 | 01-08-2006 | ... | One Laptop per Child | https://www.ted.com/talks/nicholas_negroponte_... | 358304 | January | Sunday | 2006 | 2006 | Thursday | February | 6 |
19 | 84 | Jimmy Wales recalls how he assembled "a ragtag... | 1201 | TEDGlobal 2005 | 14-07-2005 | 32 | Jimmy Wales | Jimmy Wales: The birth of Wikipedia | 1 | 21-08-2006 | ... | The birth of Wikipedia | https://www.ted.com/talks/jimmy_wales_on_the_b... | 1106561 | August | Monday | 2006 | 2005 | Thursday | July | 0 |
20 | 108 | In 2006, open-learning visionary Richard Baran... | 1114 | TED2006 | 23-02-2006 | 27 | Richard Baraniuk | Richard Baraniuk: The birth of the open-source... | 1 | 21-08-2006 | ... | The birth of the open-source learning revolution | https://www.ted.com/talks/richard_baraniuk_on_... | 966439 | August | Monday | 2006 | 2006 | Thursday | February | 0 |
46 | 52 | Musician and activist Peter Gabriel shares his... | 848 | TED2006 | 23-02-2006 | 24 | Peter Gabriel | Peter Gabriel: Fight injustice with raw video | 1 | 06-12-2006 | ... | Fight injustice with raw video | https://www.ted.com/talks/peter_gabriel_fights... | 904215 | June | Monday | 2006 | 2006 | Thursday | February | 0 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
2523 | 20 | Another economic reality is possible -- one th... | 747 | TEDxCrenshaw | 08-10-2016 | 3 | Niki Okuk | Niki Okuk: When workers own companies, the eco... | 1 | 28-08-2017 | ... | When workers own companies, the economy is mor... | https://www.ted.com/talks/niki_okuk_when_worke... | 744793 | August | Monday | 2017 | 2016 | Wednesday | August | 0 |
2529 | 45 | We all have origin stories and identity myths,... | 1156 | TEDxExeter | 24-04-2015 | 1 | Chetan Bhatt | Chetan Bhatt: Dare to refuse the origin myths ... | 1 | 01-09-2017 | ... | Dare to refuse the origin myths that claim who... | https://www.ted.com/talks/chetan_bhatt_dare_to... | 857850 | January | Monday | 2017 | 2015 | Friday | April | 0 |
2533 | 9 | In a mind-bending talk that blurs the line bet... | 663 | TED2017 | 24-04-2017 | 2 | Tomás Saraceno | Tomás Saraceno: Would you live in a floating ... | 1 | 07-09-2017 | ... | Would you live in a floating city in the sky? | https://www.ted.com/talks/tomas_saraceno_would... | 248411 | July | Sunday | 2017 | 2017 | Monday | April | 6 |
2534 | 2 | What the astronauts felt when they saw Earth f... | 725 | TEDxSkoll | 07-04-2017 | 1 | Benjamin Grant | Benjamin Grant: What it feels like to see Eart... | 1 | 07-09-2017 | ... | What it feels like to see Earth from space | https://www.ted.com/talks/benjamin_grant_what_... | 646174 | July | Sunday | 2017 | 2017 | Tuesday | July | 6 |
2543 | 7 | What if you could know exactly how food or med... | 894 | TED2017 | 24-04-2017 | 1 | Jun Wang | Jun Wang: How digital DNA could help you make ... | 1 | 18-09-2017 | ... | How digital DNA could help you make better hea... | https://www.ted.com/talks/jun_wang_how_digital... | 534824 | September | Monday | 2017 | 2017 | Monday | April | 0 |
613 rows × 24 columns
# Total views per publication year.
data = (
    df.groupby("published_year")["views"]
    .sum()
    .reset_index()
)
# Display the year column parsed as datetimes; the result is not stored.
pd.to_datetime(df["published_year"], format="%Y")
0 2006-01-01 1 2006-01-01 2 2006-01-01 3 2006-01-01 4 2006-01-01 ... 2545 2017-01-01 2546 2017-01-01 2547 2017-01-01 2548 2017-01-01 2549 2017-01-01 Name: published_year, Length: 2550, dtype: datetime64[ns]
# Print the column summary again, now including the derived date columns.
df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 2550 entries, 0 to 2549 Data columns (total 24 columns): comments 2550 non-null int64 description 2550 non-null object duration 2550 non-null int64 event 2550 non-null object film_date 2550 non-null object languages 2550 non-null int64 main_speaker 2550 non-null object name 2550 non-null object num_speaker 2550 non-null int64 published_date 2550 non-null object ratings 2550 non-null object related_talks 2550 non-null object speaker_occupation 2544 non-null object tags 2550 non-null object title 2550 non-null object url 2550 non-null object views 2550 non-null int64 published_month 2550 non-null object published_day 2550 non-null object published_year 2550 non-null int64 film_year 2550 non-null int64 film_day 2550 non-null object film_month 2550 non-null object published_day_no 2550 non-null int64 dtypes: int64(8), object(16) memory usage: 478.2+ KB
import plotly.graph_objects as go
# Total views per event, largest first; then display one random row.
views = (
    df.groupby(["event"])["views"]
    .sum()
    .sort_values(ascending=False)
    .reset_index()
)
views.sample()
event | views | |
---|---|---|
46 | TED@NYC | 17275047 |
# Keep the first 20 rows of `views` and pull out the two series to plot.
views_top = views.head(20)
labels = views_top["event"]
values = views_top["views"]

# Plain pie chart of views per event.
fig = go.Figure(data=[go.Pie(labels=labels, values=values)])
fig.show()

# Use `hole` to create a donut-like pie chart
fig = go.Figure(data=[go.Pie(labels=labels, values=values, hole=0.2)])
fig.show()
import plotly.express as px
# Histogram of the per-event view totals.
fig = px.histogram(views, x="views")
fig.show()

# Vertical, then horizontal, bar chart of the rows in `views_top`.
fig = px.bar(views_top, x="event", y="views")
fig.show()
fig = px.bar(views_top, x="views", y="event", orientation="h")
fig.show()

# Comments vs. views for every talk.
fig = px.scatter(df, x="comments", y="views")
fig.show()

# Same scatter, coloured by talk duration.
fig = px.scatter(df, x="comments", y="views", color="duration")
fig.show()

# Marker size from duration, colour from number of speakers; log-scaled x axis.
fig = px.scatter(
    df,
    x="comments",
    y="views",
    size="duration",
    color="num_speaker",
    log_x=True,
    size_max=60,
)
fig.show()

# Same chart with a linear x axis.
fig = px.scatter(
    df,
    x="comments",
    y="views",
    size="duration",
    color="num_speaker",
    size_max=60,
)
fig.show()
# Count the non-null `event` entries per publication year and expose the count
# under the additional column name `number_of_events`.
talks = (
    df.groupby("published_year")["event"]
    .count()
    .reset_index()
)
talks["number_of_events"] = talks["event"]
talks.head()
published_year | event | number_of_events | |
---|---|---|---|
0 | 2006 | 50 | 50 |
1 | 2007 | 122 | 122 |
2 | 2008 | 188 | 188 |
3 | 2009 | 222 | 222 |
4 | 2010 | 257 | 257 |
# Line chart of the yearly counts in `talks`.
fig = px.line(talks, x="published_year", y="number_of_events")
fig.show()

# Render `views_top` as a plotly table: column names in the header, the two
# columns as cell values.
table_header = dict(values=views_top.columns, fill_color="yellow")
table_cells = dict(
    values=[views_top["event"], views_top["views"]],
    fill_color="paleturquoise",
)
fig = go.Figure(data=[go.Table(header=table_header, cells=table_cells)])
fig.show()
# Display one random row of the frame.
df.sample()
comments | description | duration | event | film_date | languages | main_speaker | name | num_speaker | published_date | ... | title | url | views | published_month | published_day | published_year | film_year | film_day | film_month | published_day_no | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
2164 | 47 | What if technology could connect us more deepl... | 654 | TED2016 | 17-02-2016 | 25 | Meron Gribetz | Meron Gribetz: A glimpse of the future through... | 1 | 18-03-2016 | ... | A glimpse of the future through an augmented r... | https://www.ted.com/talks/meron_gribetz_a_glim... | 1575381 | March | Friday | 2016 | 2016 | Wednesday | February | 4 |
1 rows × 24 columns
# Density heatmap: published_year on x, views on y, comments as z.
fig = px.density_heatmap(df, x="published_year", y="views", z="comments")
fig.show()

# Duration vs. comments, animated over publication year; saved as HTML.
fig = px.scatter(
    df,
    x="duration",
    y="comments",
    animation_frame="published_year",
    size="duration",
    color="published_day",
)
fig.show()
fig.write_html("animation.html")

# Box plot of duration for each publication weekday.
fig = px.box(df, x="published_day", y="duration")
fig.show()

# Comments vs. views, coloured by duration and labelled with the weekday.
fig = px.scatter(
    df,
    x="comments",
    y="views",
    color="duration",
    text="published_day",
)
fig.show()

# 3D scatter of comments / views / duration, coloured by views; saved as HTML.
fig = px.scatter_3d(df, x="comments", y="views", z="duration", color="views")
fig.show()
fig.write_html("3d.html")

# Surface plot whose z grid is the raw (duration, views, comments) matrix.
surface_z = df[["duration", "views", "comments"]].values
fig = go.Figure(data=[go.Surface(z=surface_z)])
fig.update_layout(
    title="3D Surface",
    autosize=False,
    width=500,
    height=500,
    margin=dict(l=65, r=50, b=65, t=90),
)
fig.show()
fig.write_html("3dsurface.html")

# Constant error value of 10 for every row, used for both x and y error bars.
df["error"] = 10
fig = px.scatter(
    df,
    x="comments",
    y="views",
    color="published_day",
    error_x="error",
    error_y="error",
)
fig.show()
# Total views per event, largest first, limited to the first 15 rows.
# `.copy()` makes the result an independent frame: assigning a new column to
# the bare result of `.head()` is the pattern that triggers pandas'
# SettingWithCopyWarning.
views_top = (
    df.groupby(['event'])['views']
    .sum()
    .sort_values(ascending=False)
    .reset_index()
    .head(15)
    .copy()
)

# Error bar length: 1/20 of each event's view total.
views_top['error'] = views_top['views'] / 20

fig = go.Figure(
    data=[
        go.Bar(
            x=views_top['event'], y=views_top['views'],
            error_y=dict(type='data', array=views_top['error'].values)
        )
    ])
fig.show()
# Duration vs. comments, one facet column per publication weekday, animated
# over publication month with points grouped by event across frames.
px.scatter(
    df,
    x="duration",
    y="comments",
    animation_frame="published_month",
    animation_group="event",
    facet_col="published_day",
    width=1500,
    height=500,
    size="views",
    color="published_day",
)