Author: Khuyen Tran
import datapane as dp
import pandas as pd
import numpy as np
# Load data from dp.Blob
medium = dp.Blob.get(name='medium', owner='khuyentran1401').download_df()
medium.head(10)
| | Title | Subtitle | Image | Author | Publication | Year | Month | Day | Tag | Reading_Time | Claps | Comment | url | Author_url |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | Apply and Lambda usage in pandas | Learn these to master Pandas | 1 | Rahul Agarwal | Towards Data Science | 2019 | 7 | 1 | data_science | 6 | 1.5K | 0 | https://towardsdatascience.com/apply-and-lambd... | https://towardsdatascience.com/@rahul_agarwal?... |
| 1 | Jupyter is the new Excel (but not for your boss) | nan | 1 | Dan Lester | Towards Data Science | 2019 | 7 | 1 | data_science | 10 | 1.5K | 0 | https://towardsdatascience.com/jupyter-is-the-... | https://towardsdatascience.com/@dan_19973?sour... |
| 2 | Fuzzy matching at scale | From 3.7 hours to 0.2 seconds. How to perform ... | 1 | Josh Taylor | Towards Data Science | 2019 | 7 | 1 | data_science | 7 | 547 | 0 | https://towardsdatascience.com/fuzzy-matching-... | https://towardsdatascience.com/@thejoshtaylor?... |
| 3 | Artificial Intelligence in Video Games | An overview of how video game A.I. has develop... | 1 | Laura E Shummon Maass | Towards Data Science | 2019 | 7 | 1 | data_science | 14 | 265 | 0 | https://towardsdatascience.com/artificial-inte... | https://towardsdatascience.com/@laurashummonma... |
| 4 | Affinity Propagation Algorithm Explained | Affinity Propagation was first published in 20... | 1 | Cory Maklin | Towards Data Science | 2019 | 7 | 1 | data_science | 6 | 92 | 0 | https://towardsdatascience.com/unsupervised-ma... | https://towardsdatascience.com/@corymaklin?sou... |
| 5 | Deploying Models to Flask | A walk-through on how to deploy machine learni... | 1 | Jeremy Chow | Towards Data Science | 2019 | 7 | 1 | data_science | 8 | 859 | 0 | https://towardsdatascience.com/deploying-model... | https://towardsdatascience.com/@jeremyrchow?so... |
| 6 | AI, Machine Learning, Deep Learning Explained ... | Supervised ML, Unsupervised ML, Reinforcement | 1 | Jun Wu | Towards Data Science | 2019 | 7 | 1 | data_science | 7 | 406 | 0 | https://towardsdatascience.com/ai-machine-lear... | https://towardsdatascience.com/@junwu_46652?so... |
| 7 | Tweepy for beginners | Using Twitters API to build your own data set | 1 | Richard Chadwick | Towards Data Science | 2019 | 7 | 1 | data_science | 7 | 260 | 0 | https://towardsdatascience.com/tweepy-for-begi... | https://towardsdatascience.com/@richchad?sourc... |
| 8 | BIRCH Clustering Algorithm Example In Python | Existing data clustering methods do not adequa... | 1 | Cory Maklin | Towards Data Science | 2019 | 7 | 1 | data_science | 6 | 100 | 0 | https://towardsdatascience.com/machine-learnin... | https://towardsdatascience.com/@corymaklin?sou... |
| 9 | Zomato, Bangalore Data Analysis | What and where to eat in Bangalorea data scien... | 1 | Shubhankar Rawat | Towards Data Science | 2019 | 7 | 1 | data_science | 15 | 190 | 0 | https://towardsdatascience.com/zomato-bangalor... | https://towardsdatascience.com/@shubhankarrawa... |
medium = medium.replace('nan', np.nan)
# Drop duplicate articles (same title, subtitle, author, date, and tag)
medium = medium.drop_duplicates(subset=['Title', 'Subtitle', 'Author', 'Year',
'Month', 'Day', 'Tag'])
medium.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 147392 entries, 0 to 148139
Data columns (total 14 columns):
 #   Column        Non-Null Count   Dtype
---  ------        --------------   -----
 0   Title         138215 non-null  object
 1   Subtitle      88691 non-null   object
 2   Image         147392 non-null  uint8
 3   Author        147294 non-null  object
 4   Publication   71402 non-null   category
 5   Year          147392 non-null  uint16
 6   Month         147392 non-null  uint8
 7   Day           147392 non-null  uint8
 8   Tag           147392 non-null  category
 9   Reading_Time  147392 non-null  uint8
 10  Claps         147392 non-null  category
 11  Comment       147392 non-null  uint8
 12  url           147392 non-null  object
 13  Author_url    147294 non-null  object
dtypes: category(3), object(5), uint16(1), uint8(5)
memory usage: 8.8+ MB
import plotly.express as px
# Save the charts to build an interactive report later
charts = []
tag_plot = px.bar(x=medium.Tag.value_counts().index,
y=medium.Tag.value_counts().values,
labels={'y': 'Number of Articles',
'x': 'Tags'},
title='Number of articles in each data science-related topic')
tag_plot
charts.append(dp.Plot(tag_plot))
# Number of rows duplicated across the first 8 columns (Title through Day),
# i.e., the same article posted under different tags
sum(medium.iloc[:, :8].duplicated())
38516
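# Aside: a toy illustration (hypothetical rows, not from the dataset) of why
# these duplicates exist - the same article listed under two tags is identical
# in every column except Tag
toy = pd.DataFrame({'Title': ['A', 'A'], 'Tag': ['data_science', 'machine_learning']})
toy[['Title']].duplicated()  # 0: False, 1: True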
# Drop articles that appear under multiple tags (same title, subtitle, author, and date)
medium = medium.drop_duplicates(subset=['Title', 'Subtitle', 'Author', 'Year',
'Month', 'Day'])
medium.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 108864 entries, 0 to 148139
Data columns (total 14 columns):
 #   Column        Non-Null Count   Dtype
---  ------        --------------   -----
 0   Title         102376 non-null  object
 1   Subtitle      64990 non-null   object
 2   Image         108864 non-null  uint8
 3   Author        108776 non-null  object
 4   Publication   49457 non-null   category
 5   Year          108864 non-null  uint16
 6   Month         108864 non-null  uint8
 7   Day           108864 non-null  uint8
 8   Tag           108864 non-null  category
 9   Reading_Time  108864 non-null  uint8
 10  Claps         108864 non-null  category
 11  Comment       108864 non-null  uint8
 12  url           108864 non-null  object
 13  Author_url    108776 non-null  object
dtypes: category(3), object(5), uint16(1), uint8(5)
memory usage: 6.6+ MB
# After groupby().count(), every column holds the row count, so any column
# (here 'Year') can serve as the values
comment = px.pie(medium.groupby('Comment').count().reset_index(),
                 values='Year',
                 names='Comment',
                 labels={'Year': 'Number of Articles'},
                 title='Number of Comments in Data Science Articles'
                 )
comment
charts.append(dp.Plot(comment))
# Claps values are strings such as '1.5K'; cast from category to object before converting
medium.Claps = medium.Claps.astype('object')
medium.Claps.describe()
count     108864
unique      1042
top            0
freq       40077
Name: Claps, dtype: object
def str_to_float(feature):
    '''Convert strings such as '1.5K' or '2M' to floats'''
    # Strip the K/M suffix, then multiply by the matching factor (K=1e3, M=1e6)
    feature = feature.replace(r'[KM]+$', '', regex=True).astype(float) * \
        feature.str.extract(r'[\d\.]+([KM]+)', expand=False).fillna(1).replace(['K', 'M'], [10**3, 10**6]).astype(int)
    return feature
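# A quick sanity check on a hand-made Series (illustrative values, not from the dataset)
str_to_float(pd.Series(['1.5K', '2M', '547']))  # expected: 1500.0, 2000000.0, 547.0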
medium.Claps = str_to_float(medium.Claps)
medium['Claps'].describe()
count    108864.000000
mean         67.885674
std         258.135934
min           0.000000
25%           0.000000
50%           3.000000
75%          55.000000
max       26000.000000
Name: Claps, dtype: float64
# Plot the lower 80,000 values so the long tail of outliers does not flatten the histogram
claps = px.histogram(medium.sort_values(by='Claps')[:80000],
                     x='Claps',
                     title='Number of Claps')
claps.show()
charts.append(dp.Plot(claps))
readingTime_claps = px.scatter(medium,
x='Reading_Time',
y='Claps',
title='Claps vs Reading Time')
readingTime_claps
medium.corr().loc['Reading_Time', 'Claps']
0.1301349558669967
charts.append(dp.Plot(readingTime_claps))
# Average reading time of highly clapped articles (63+ claps, roughly the top quartile)
medium[medium.Claps.between(63, medium.Claps.max())].Reading_Time.describe()
count    24854.000000
mean         6.580027
std          3.910119
min          0.000000
25%          4.000000
50%          6.000000
75%          8.000000
max        171.000000
Name: Reading_Time, dtype: float64
# Group df by author and count the number of articles they publish
author_groupby = medium.groupby(['Author']).count().sort_values(by='Year', ascending=False).reset_index()
fig = px.bar(author_groupby[:100],
x='Author',
y='Year',
labels={'Year': 'Number of articles'},
title='Top 100 most active authors with topics related to data science'
)
fig.update_layout({
'plot_bgcolor': 'rgba(133, 227, 239, 0.04)',
})
fig.show()
charts.append(dp.Plot(fig))
# Percentile of 'Khuyen Tran' among all authors ranked by article count
author_rank = medium.Author.value_counts().index
100 - (list(author_rank).index('Khuyen Tran') + 1) / len(author_rank) * 100
99.85892408871186
author_groupby.Year.median()
1.0
publication_groupby = medium.groupby(by='Publication').count().sort_values(by='Title', ascending=False).reset_index()
fig = px.bar(publication_groupby[:50],
             x='Publication',
             y='Title',
             labels={'Title': 'Number of articles'},
             title='Top 50 most active data science publications',
             )
fig.update_layout({
'plot_bgcolor': 'rgba(133, 227, 239, 0.04)',
'margin': dict(b=250),
'height': 600,
})
fig.update_traces(textposition='outside')
fig.update_xaxes(title_font_family="Arial",tickangle=45)
charts.append(dp.Plot(fig))
import datetime
medium['Dates'] = medium.apply(lambda row: datetime.date(row.Year,row.Month,row.Day), axis=1)
dates_groupby = medium.groupby('Dates').count().reset_index()
dates = px.line(dates_groupby,
x='Dates',
y='Year',
labels={'Year':'Number of articles'},
title='Number of articles from July 2019 to the beginning of July 2020')
dates
charts.append(dp.Plot(dates))
def date_to_weekday(year, month, day):
    '''Return the weekday index (0=Monday) for the given date'''
    return datetime.date(year, month, day).weekday()
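# Quick sanity check: July 1, 2020 was a Wednesday, so the result should be 2
date_to_weekday(2020, 7, 1)  # 2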
medium['week_days'] = medium.apply(lambda row: date_to_weekday(row.Year, row.Month, row.Day), axis=1)
# Map the number to the day of the week
day_of_week = {0: 'Monday',
1: 'Tuesday',
2: 'Wednesday',
3: 'Thursday',
4: 'Friday',
5: 'Saturday',
6: 'Sunday'}
medium['week_days'].replace(day_of_week, inplace=True)
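# Aside (a sketch, not part of the original analysis): pandas can derive the same
# labels in one vectorized call from the Year/Month/Day columns
week_days_alt = pd.to_datetime(
    medium[['Year', 'Month', 'Day']].rename(
        columns={'Year': 'year', 'Month': 'month', 'Day': 'day'})).dt.day_name()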
day_groupby = medium.groupby(by='week_days').count()
day_groupby = day_groupby.reindex(['Monday', 'Tuesday', 'Wednesday',
'Thursday', 'Friday', 'Saturday',
'Sunday']).reset_index()
publish_dates = px.bar(day_groupby,
x='week_days',
y='Year',
labels={'week_days':'Day of the week',
'Year':'Number of articles'})
publish_dates
charts.append(dp.Plot(publish_dates))
fig = px.bar(medium.groupby(by='week_days').mean()['Claps'].reset_index(),
x='week_days',
y='Claps',
labels={'week_days':'Days of the week'},
title='Average number of claps on each day of a week')
fig.show()
charts.append(dp.Plot(fig))
import re
import string
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from wordcloud import WordCloud, STOPWORDS
import matplotlib.pyplot as plt
import math
def url_to_title(url):
    '''Extract a readable title from an article url'''
    url = url.replace('https://towardsdatascience.com/', '')
    url = url.replace('https://medium.com/', '')
    url = re.sub(r'.*/', '', url)  # keep only the last path segment
    # strip the trailing alphanumeric hash that Medium appends to the slug
    url = re.sub(r'([A-Za-z]+[\d@]+[\w@]*|[\d@]+[A-Za-z]+[\w@]*).+', '', url)
    title = url.replace('-', ' ')
    return title
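# Illustrative call; the trailing hash here is made up
url_to_title('https://towardsdatascience.com/fuzzy-matching-at-scale-84f2bfd0c536')
# 'fuzzy matching at scale '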
null_urls = list(medium.loc[medium.Title.isnull(), 'url'])
null_titles = []
for url in null_urls:
null_titles.append(url_to_title(url))
medium.loc[medium.Title.isnull(), 'Title'] = null_titles
def process_text(texts: list):
    '''Lowercase each text, strip punctuation, and remove English stopwords'''
    stop_words = set(stopwords.words('english'))
    processed = []
    for text in texts:
        # lowercase
        text = text.lower()
        # remove punctuation
        text = text.translate(str.maketrans('', '', string.punctuation))
        # tokenize and drop stopwords
        tokens = word_tokenize(text)
        new_text = ' '.join(i for i in tokens if i not in stop_words)
        processed.append(new_text)
    return processed
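# Illustrative run on the first title in the dataset
process_text(['Apply and Lambda usage in pandas'])
# ['apply lambda usage pandas']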
# Fill missing subtitles with a placeholder before processing
subtitle = medium.Subtitle.fillna('None')
subtitle = process_text(list(subtitle))
titles = process_text(list(medium.Title))
combine_titles = ' '.join(titles) + ' ' + ' '.join([text for text in subtitle if text != 'none'])
def make_wordcloud(new_text):
    '''Make a word cloud from the given text'''
wordcloud = WordCloud(width = 800, height = 800,
background_color ='white',
min_font_size = 10).generate(new_text)
fig = plt.figure(figsize = (8, 8), facecolor = None)
plt.imshow(wordcloud)
plt.axis("off")
plt.tight_layout(pad = 0)
plt.show()
return fig
cloud = make_wordcloud(combine_titles)
cloud
charts.append(dp.Plot(cloud))
dp.Report(*charts).publish(name='medium_visualization',
headline='Data Science Articles on Medium from 2019/7 to 2020/7')