Author: Khuyen Tran
import datapane as dp
import pandas as pd
import numpy as np
# Load data from dp.Blob
medium = dp.Blob.get(name='medium', owner='khuyentran1401').download_df()
medium.head(10)
| | Title | Subtitle | Image | Author | Publication | Year | Month | Day | Tag | Reading_Time | Claps | Comment | url | Author_url |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | Apply and Lambda usage in pandas | Learn these to master Pandas | 1 | Rahul Agarwal | Towards Data Science | 2019 | 7 | 1 | data_science | 6 | 1.5K | 0 | https://towardsdatascience.com/apply-and-lambd... | https://towardsdatascience.com/@rahul_agarwal?... |
| 1 | Jupyter is the new Excel (but not for your boss) | nan | 1 | Dan Lester | Towards Data Science | 2019 | 7 | 1 | data_science | 10 | 1.5K | 0 | https://towardsdatascience.com/jupyter-is-the-... | https://towardsdatascience.com/@dan_19973?sour... |
| 2 | Fuzzy matching at scale | From 3.7 hours to 0.2 seconds. How to perform ... | 1 | Josh Taylor | Towards Data Science | 2019 | 7 | 1 | data_science | 7 | 547 | 0 | https://towardsdatascience.com/fuzzy-matching-... | https://towardsdatascience.com/@thejoshtaylor?... |
| 3 | Artificial Intelligence in Video Games | An overview of how video game A.I. has develop... | 1 | Laura E Shummon Maass | Towards Data Science | 2019 | 7 | 1 | data_science | 14 | 265 | 0 | https://towardsdatascience.com/artificial-inte... | https://towardsdatascience.com/@laurashummonma... |
| 4 | Affinity Propagation Algorithm Explained | Affinity Propagation was first published in 20... | 1 | Cory Maklin | Towards Data Science | 2019 | 7 | 1 | data_science | 6 | 92 | 0 | https://towardsdatascience.com/unsupervised-ma... | https://towardsdatascience.com/@corymaklin?sou... |
| 5 | Deploying Models to Flask | A walk-through on how to deploy machine learni... | 1 | Jeremy Chow | Towards Data Science | 2019 | 7 | 1 | data_science | 8 | 859 | 0 | https://towardsdatascience.com/deploying-model... | https://towardsdatascience.com/@jeremyrchow?so... |
| 6 | AI, Machine Learning, Deep Learning Explained ... | Supervised ML, Unsupervised ML, Reinforcement | 1 | Jun Wu | Towards Data Science | 2019 | 7 | 1 | data_science | 7 | 406 | 0 | https://towardsdatascience.com/ai-machine-lear... | https://towardsdatascience.com/@junwu_46652?so... |
| 7 | Tweepy for beginners | Using Twitters API to build your own data set | 1 | Richard Chadwick | Towards Data Science | 2019 | 7 | 1 | data_science | 7 | 260 | 0 | https://towardsdatascience.com/tweepy-for-begi... | https://towardsdatascience.com/@richchad?sourc... |
| 8 | BIRCH Clustering Algorithm Example In Python | Existing data clustering methods do not adequa... | 1 | Cory Maklin | Towards Data Science | 2019 | 7 | 1 | data_science | 6 | 100 | 0 | https://towardsdatascience.com/machine-learnin... | https://towardsdatascience.com/@corymaklin?sou... |
| 9 | Zomato, Bangalore Data Analysis | What and where to eat in Bangalorea data scien... | 1 | Shubhankar Rawat | Towards Data Science | 2019 | 7 | 1 | data_science | 15 | 190 | 0 | https://towardsdatascience.com/zomato-bangalor... | https://towardsdatascience.com/@shubhankarrawa... |
medium = medium.replace('nan', np.nan)
# Drop duplicate articles (same title, subtitle, author, date, and tag)
medium = medium.drop_duplicates(subset=['Title', 'Subtitle', 'Author', 'Year',
'Month', 'Day', 'Tag'])
medium.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 147392 entries, 0 to 148139
Data columns (total 14 columns):
 #   Column        Non-Null Count   Dtype
---  ------        --------------   -----
 0   Title         138215 non-null  object
 1   Subtitle      88691 non-null   object
 2   Image         147392 non-null  uint8
 3   Author        147294 non-null  object
 4   Publication   71402 non-null   category
 5   Year          147392 non-null  uint16
 6   Month         147392 non-null  uint8
 7   Day           147392 non-null  uint8
 8   Tag           147392 non-null  category
 9   Reading_Time  147392 non-null  uint8
 10  Claps         147392 non-null  category
 11  Comment       147392 non-null  uint8
 12  url           147392 non-null  object
 13  Author_url    147294 non-null  object
dtypes: category(3), object(5), uint16(1), uint8(5)
memory usage: 8.8+ MB
import plotly.express as px
# Save the charts to build an interactive report later
charts = []
tag_plot = px.bar(x=medium.Tag.value_counts().index,
y=medium.Tag.value_counts().values,
labels={'y': 'Number of Articles',
'x': 'Tags'},
title='Number of articles in each data science-related topic')
tag_plot
charts.append(dp.Plot(tag_plot))
# Number of rows duplicated across the first 8 columns (Title through Day),
# i.e., the same article posted under different tags
sum(medium.iloc[:, :8].duplicated())
38516
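# Aside: a toy illustration (hypothetical rows, not from the dataset) of why
# these duplicates exist - the same article listed under two tags is identical
# in every column except Tag
toy = pd.DataFrame({'Title': ['A', 'A'], 'Tag': ['data_science', 'machine_learning']})
toy[['Title']].duplicated()  # 0: False, 1: True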
# Drop articles that appear under multiple tags (same title, subtitle, author, and date)
medium = medium.drop_duplicates(subset=['Title', 'Subtitle', 'Author', 'Year',
'Month', 'Day'])
medium.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 108864 entries, 0 to 148139
Data columns (total 14 columns):
 #   Column        Non-Null Count   Dtype
---  ------        --------------   -----
 0   Title         102376 non-null  object
 1   Subtitle      64990 non-null   object
 2   Image         108864 non-null  uint8
 3   Author        108776 non-null  object
 4   Publication   49457 non-null   category
 5   Year          108864 non-null  uint16
 6   Month         108864 non-null  uint8
 7   Day           108864 non-null  uint8
 8   Tag           108864 non-null  category
 9   Reading_Time  108864 non-null  uint8
 10  Claps         108864 non-null  category
 11  Comment       108864 non-null  uint8
 12  url           108864 non-null  object
 13  Author_url    108776 non-null  object
dtypes: category(3), object(5), uint16(1), uint8(5)
memory usage: 6.6+ MB
# After groupby().count(), every column holds the row count, so any column
# (here 'Year') can serve as the values
comment = px.pie(medium.groupby('Comment').count().reset_index(),
                 values='Year',
                 names='Comment',
                 labels={'Year': 'Number of Articles'},
                 title='Number of Comments in Data Science Articles'
                 )
comment
charts.append(dp.Plot(comment))
# Claps values are strings such as '1.5K'; cast from category to object before converting
medium.Claps = medium.Claps.astype('object')
medium.Claps.describe()
count     108864
unique      1042
top            0
freq       40077
Name: Claps, dtype: object
def str_to_float(feature):
    '''Convert strings such as '1.5K' or '2M' to floats'''
    # Strip the K/M suffix, then multiply by the matching factor (K=1e3, M=1e6)
    feature = feature.replace(r'[KM]+$', '', regex=True).astype(float) * \
        feature.str.extract(r'[\d\.]+([KM]+)', expand=False).fillna(1).replace(['K', 'M'], [10**3, 10**6]).astype(int)
    return feature
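# A quick sanity check on a hand-made Series (illustrative values, not from the dataset)
str_to_float(pd.Series(['1.5K', '2M', '547']))  # expected: 1500.0, 2000000.0, 547.0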
medium.Claps = str_to_float(medium.Claps)
medium['Claps'].describe()
count    108864.000000
mean         67.885674
std         258.135934
min           0.000000
25%           0.000000
50%           3.000000
75%          55.000000
max       26000.000000
Name: Claps, dtype: float64
# Plot the lower 80,000 values so the long tail of outliers does not flatten the histogram
claps = px.histogram(medium.sort_values(by='Claps')[:80000],
                     x='Claps',
                     title='Number of Claps')
claps.show()
charts.append(dp.Plot(claps))
readingTime_claps = px.scatter(medium,
x='Reading_Time',
y='Claps',
title='Claps vs Reading Time')
readingTime_claps
medium.corr().loc['Reading_Time', 'Claps']
0.1301349558669967
charts.append(dp.Plot(readingTime_claps))
# Average reading time of highly clapped articles (63+ claps, roughly the top quartile)
medium[medium.Claps.between(63, medium.Claps.max())].Reading_Time.describe()
count    24854.000000
mean         6.580027
std          3.910119
min          0.000000
25%          4.000000
50%          6.000000
75%          8.000000
max        171.000000
Name: Reading_Time, dtype: float64
# Group df by author and count the number of articles they publish
author_groupby = medium.groupby(['Author']).count().sort_values(by='Year', ascending=False).reset_index()
fig = px.bar(author_groupby[:100],
x='Author',
y='Year',
labels={'Year': 'Number of articles'},
title='Top 100 most active authors with topics related to data science'
)
fig.update_layout({
'plot_bgcolor': 'rgba(133, 227, 239, 0.04)',
})
fig.show()
charts.append(dp.Plot(fig))
# Percentile of 'Khuyen Tran' among all authors ranked by article count
author_rank = medium.Author.value_counts().index
100 - (list(author_rank).index('Khuyen Tran') + 1) / len(author_rank) * 100
99.85892408871186
author_groupby.Year.median()
1.0
publication_groupby = medium.groupby(by='Publication').count().sort_values(by='Title', ascending=False).reset_index()
fig = px.bar(publication_groupby[:50],
             x='Publication',
             y='Title',
             labels={'Title': 'Number of articles'},
             title='Top 50 most active data science publications',
             )
fig.update_layout({
'plot_bgcolor': 'rgba(133, 227, 239, 0.04)',
'margin': dict(b=250),
'height': 600,
})
fig.update_traces(textposition='outside')
fig.update_xaxes(title_font_family="Arial",tickangle=45)
charts.append(dp.Plot(fig))
import datetime
medium['Dates'] = medium.apply(lambda row: datetime.date(row.Year,row.Month,row.Day), axis=1)
dates_groupby = medium.groupby('Dates').count().reset_index()
dates = px.line(dates_groupby,
x='Dates',
y='Year',
labels={'Year':'Number of articles'},
title='Number of articles from July 2019 to the beginning of July 2020')
dates
charts.append(dp.Plot(dates))
def date_to_weekday(year, month, day):
    '''Return the weekday index (0=Monday) for the given date'''
    return datetime.date(year, month, day).weekday()
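# Quick sanity check: July 1, 2020 was a Wednesday, so the result should be 2
date_to_weekday(2020, 7, 1)  # 2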
medium['week_days'] = medium.apply(lambda row: date_to_weekday(row.Year, row.Month, row.Day), axis=1)
# Map the number to the day of the week
day_of_week = {0: 'Monday',
1: 'Tuesday',
2: 'Wednesday',
3: 'Thursday',
4: 'Friday',
5: 'Saturday',
6: 'Sunday'}
medium['week_days'].replace(day_of_week, inplace=True)
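# Aside (a sketch, not part of the original analysis): pandas can derive the same
# labels in one vectorized call from the Year/Month/Day columns
week_days_alt = pd.to_datetime(
    medium[['Year', 'Month', 'Day']].rename(
        columns={'Year': 'year', 'Month': 'month', 'Day': 'day'})).dt.day_name()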
day_groupby = medium.groupby(by='week_days').count()
day_groupby = day_groupby.reindex(['Monday', 'Tuesday', 'Wednesday',
'Thursday', 'Friday', 'Saturday',
'Sunday']).reset_index()
publish_dates = px.bar(day_groupby,
x='week_days',
y='Year',
labels={'week_days':'Day of the week',
'Year':'Number of articles'})
publish_dates
charts.append(dp.Plot(publish_dates))
fig = px.bar(medium.groupby(by='week_days').mean()['Claps'].reset_index(),
x='week_days',
y='Claps',
labels={'week_days':'Days of the week'},
title='Average number of claps on each day of a week')
fig.show()
charts.append(dp.Plot(fig))
import re
import string
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from wordcloud import WordCloud, STOPWORDS
import matplotlib.pyplot as plt
import math
def url_to_title(url):
    '''Extract a readable title from an article url'''
    url = url.replace('https://towardsdatascience.com/', '')
    url = url.replace('https://medium.com/', '')
    url = re.sub(r'.*/', '', url)  # keep only the last path segment
    # strip the trailing alphanumeric hash that Medium appends to the slug
    url = re.sub(r'([A-Za-z]+[\d@]+[\w@]*|[\d@]+[A-Za-z]+[\w@]*).+', '', url)
    title = url.replace('-', ' ')
    return title
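# Illustrative call; the trailing hash here is made up
url_to_title('https://towardsdatascience.com/fuzzy-matching-at-scale-84f2bfd0c536')
# 'fuzzy matching at scale '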
null_urls = list(medium.loc[medium.Title.isnull(), 'url'])
null_titles = []
for url in null_urls:
null_titles.append(url_to_title(url))
medium.loc[medium.Title.isnull(), 'Title'] = null_titles
def process_text(texts: list):
    '''Lowercase each text, strip punctuation, and remove English stopwords'''
    stop_words = set(stopwords.words('english'))
    processed = []
    for text in texts:
        # lowercase
        text = text.lower()
        # remove punctuation
        text = text.translate(str.maketrans('', '', string.punctuation))
        # tokenize and drop stopwords
        tokens = word_tokenize(text)
        new_text = ' '.join(i for i in tokens if i not in stop_words)
        processed.append(new_text)
    return processed
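# Illustrative run on the first title in the dataset
process_text(['Apply and Lambda usage in pandas'])
# ['apply lambda usage pandas']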
# Fill missing subtitles with a placeholder before processing
subtitle = medium.Subtitle.fillna('None')
subtitle = process_text(list(subtitle))
titles = process_text(list(medium.Title))
combine_titles = ' '.join(titles) + ' ' + ' '.join([text for text in subtitle if text != 'none'])
def make_wordcloud(new_text):
    '''Make a word cloud from the given text'''
wordcloud = WordCloud(width = 800, height = 800,
background_color ='white',
min_font_size = 10).generate(new_text)
fig = plt.figure(figsize = (8, 8), facecolor = None)
plt.imshow(wordcloud)
plt.axis("off")
plt.tight_layout(pad = 0)
plt.show()
return fig
cloud = make_wordcloud(combine_titles)
cloud
charts.append(dp.Plot(cloud))
dp.Report(*charts).publish(name='medium_visualization',
headline='Data Science Articles on Medium from 2019/7 to 2020/7')