#!/usr/bin/env python
# coding: utf-8

# # Finding out what I like using Trello
#
# # Part 1: Visualizing and understanding the data
# ---
#
# Blog post: [jtp.io/2015/05/11/trello-reading-stats.html](http://www.jtp.io/2015/05/11/trello-reading-stats.html)
#
# ## Using Trello to organize articles
#
# I tend to spend a lot of time on the Internet reading articles about almost everything.
#
# But there is a problem: it is very easy to be overwhelmed by tons of stuff to read and watch.
#
# In the beginning, I did it the lazy way: I would open a new tab with the content I wanted to read, telling myself "yes, I will go back to it soon, it sounds very interesting, and if I don't read it I will miss out on something cool". But that tab would stay open for months (really). Sometimes it would be closed by accident or out of laziness, and would be forgotten in limbo.
#
# This was a bad habit. One day I discovered Trello, a genuine project management tool, and I decided to give it a try. I would use it not for work or anything related, but simply as a personal assistant to remember some of my stuff.
#
# It quickly became part of my daily routine. Along with the mobile app, it turned out to be a convenient way to access any content I had stored on Trello. Naturally, the idea of using it to save and organize articles came by itself.
#
# I created a board called *To Read and Watch*, with four lists:
# - To Read
# - To Watch
# - Done
# - Save
#
# How does it work? Every time I spot or open an article that looks interesting and that I know I will have to read sooner or later, I add it to the *To Read* list as a card. The title of the card corresponds to the title of the article, and the link goes in the description.
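# In practice I add these cards by hand in the Trello app, but the same kind of card could also be created through the API. The cell below is only an illustrative sketch, not part of the original workflow: the list id, card title and URL are placeholders, and the key/token come from the same environment variables used in the setup cell further down.

# In[ ]:

import os
import requests

# Placeholder: the real id of the "To Read" list would go here
TO_READ_LIST_ID = '<id of the To Read list>'

# Create a card whose title is the article title and whose description holds the link
requests.post('https://api.trello.com/1/cards', params={
    'key': os.getenv('TRELLO_API_KEY', ''),
    'token': os.getenv('TRELLO_TOKEN', ''),
    'idList': TO_READ_LIST_ID,
    'name': 'Title of the article',
    'desc': 'https://example.com/the-article'
})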
# I use labels to classify my articles, so when I want to read something I can easily make a choice depending on my current mood. Some label examples:
# - Programming
# - Maths
# - News
# - Culture
# - Music
# - ...
#
# When I am done reading or watching, I move the card to the *Done* list, or to the *Save* list if it was a good read that could lead to further investigation.
#
# I have been doing this for more than a year already and it has been very convenient. It is much more flexible because Trello can be accessed from my personal computer, phone, tablet, work computer... In a way, I always feel like there is something to learn and read, wherever I am, as long as I have one of these devices with me. If I have to wait for the bus, for example, and feel bored, I know there is already a selection of interesting stuff just waiting to be "processed".
#
# ## Knowing more about myself
#
# Structuring my readings with this method makes it straightforward to keep track of everything. Sometimes I recall something I read somewhere on the Internet, but it has been too long or my memories are too fuzzy. Trello provides good search functionality, so there is a way to find the related article, as long as I remember what it was about.
#
# I am usually very curious, and seeing all of this data being created and stored (by me!) makes me want to analyze it in order to know more about myself.
#
# There are a lot of questions that can be asked about reading habits. Many of them relate to personal interests and can reveal new things about personality:
# - How many articles have I read, and how many are still in the queue?
# - Which label do I use the most?
# - How long does an article stay in *To Read* on average?
# - On which day of the week are the most articles moved from *To Read* to *Done*?
# - What kind of topic is the most present in my lists?
#
# All of this is very exciting. As I write this I have not even started playing with the data, Python and graphs, but I know the results will be fascinating.
#
# ## How to do that?
#
# Trello provides an API, and that is good news. With the API it is even possible to access more data than through the standard web UI.
#
# So that's our tool! Coupled with that, I will use Python, because requests, beautifulsoup and matplotlib are wonderful tools.
#
# ## Structure
#
# The notebook is built as a collection of experiments that have been added progressively (more like a draft).
#
# I keep iterating on this, so there might be more content or modifications in the future.
#
# ## Getting started with the Trello API
#
# Let's get started!
#
# The first thing to do is to become more familiar with the API. This first part is all about data wrangling and how to arrange the data in a nice way so it becomes easy to use.

# In[1]:

import os
import requests
import json

TRELLO_API = 'https://api.trello.com/1/'

# Read the API keys from the environment variables
TRELLO_API_KEY = os.getenv('TRELLO_API_KEY', '')
TRELLO_API_SECRET = os.getenv('TRELLO_API_SECRET', '')
TRELLO_TOKEN = os.getenv('TRELLO_TOKEN', '')

auth = '?key=' + TRELLO_API_KEY + '&token=' + TRELLO_TOKEN

BOARD_ID = os.getenv('TRELLO_BOARD_ID', '')

# Now that everything is set up, let's start getting some data.
#
# ## Get the total number of cards for each of the four lists
#
# An easy one to get started.

# In[2]:

# Get the board
board = requests.get(TRELLO_API + 'boards/' + BOARD_ID + auth).json()

# Get the board labels
board_labels = board['labelNames']

# Basic information about the lists in that board
raw_lists = requests.get(TRELLO_API + 'boards/' + BOARD_ID + '/lists' + auth).json()

lists_filter = ['To Read', 'To Watch', 'Done', 'Save']

# Reformat the lists to a dict list_name -> list_id
lists_ids = {x['name']: x['id'] for x in raw_lists if x['name'] in lists_filter}


# Define a function to retrieve all actions for a given list
def get_actions(list_id):
    res = []
    page = 0
    data = [1]
    # Paging: keep requesting pages until an empty one comes back
    while len(data) != 0:
        data = requests.get(TRELLO_API + 'lists/' + list_id + '/actions' + auth
                            + '&page=' + str(page) + '&limit=1000').json()
        res += data
        page += 1
    return res


def get_list(list_id):
    return {
        'cards': requests.get(TRELLO_API + 'lists/' + list_id + '/cards/all' + auth).json(),
        'actions': get_actions(list_id)
    }


ls = {list_name: get_list(lists_ids[list_name]) for list_name in lists_ids.keys()}

number_cards_per_list = {x: len(c['cards']) for x, c in ls.items()}

for x, c in ls.items():
    print('The list "' + x + '" contains ' + str(len(c['cards'])) + ' cards')

# Visualized:

# In[3]:

# First time we plot something, let's set everything up
from matplotlib.ticker import FuncFormatter
import matplotlib.pyplot as plt
import numpy as np

get_ipython().run_line_magic('matplotlib', 'inline')
plt.rcdefaults()

# make it pretty!
import seaborn as sns
sns.set_palette("deep", desat=.6)
sns.set_context(rc={"figure.figsize": (8, 4)})

keys = list(number_cards_per_list.keys())
y = np.arange(len(number_cards_per_list.keys()))
rects = plt.barh(y, list(number_cards_per_list.values()), alpha=0.5)
plt.yticks(y + 0.4, keys)
for i, rect in enumerate(rects):
    plt.text(0.95 * rect.get_width(), rect.get_y() + rect.get_height() / 2.0,
             number_cards_per_list[keys[i]], ha='right', va='center')
plt.show()

# Now that we have retrieved almost all the data we need, we can compute statistics on the cards.
#
# One thing to notice is that some cards are added to the *Done* list without ever going through *To Read*. This happens when I spot an article, read it right away, and think it is important enough to be remembered and stored in Trello.
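# To get a rough idea of how often that happens, the `createCard` actions already fetched for the *Done* and *Save* lists can simply be counted. This is only a small sketch, and it assumes that the actions returned for a list are the ones that happened in that list (the same assumption the *To Read* computations below rely on); `created_directly` is just a helper name introduced here.

# In[ ]:

# Count the cards that were created directly in "Done" or "Save"
created_directly = {
    name: sum(1 for a in ls[name]['actions'] if a.get('type') == 'createCard')
    for name in ('Done', 'Save')
}
for name, count in created_directly.items():
    print(str(count) + ' cards were created directly in "' + name + '"')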
# ## Number of cards per label
#
# Some cards have more than one label.

# In[4]:

from collections import Counter

big_list_of_cards = [card for c in ls.values() for card in c['cards']]
print('Total number of cards: ' + str(len(big_list_of_cards)))

labels_list = [(label['name'], label['color']) for c in big_list_of_cards for label in c['labels']]
labels = {name: color for (name, color) in labels_list}
label_count = dict(Counter(labels_list).most_common())

for label, number_of_cards in label_count.items():
    print(str(label) + ' used ' + str(number_of_cards) + ' times')

# And putting the data in a graph:

# In[5]:

# matplotlib doesn't know about these colors by default
color_replacement = {
    'sky': '#00C2E0',
    'lime': '#51E898',
    None: '#EEEEEE'
}

x = np.arange(len(label_count.keys()))
plt.barh(x, list(label_count.values()),
         color=[a[1] if a[1] not in color_replacement else color_replacement[a[1]]
                for a in label_count.keys()])
plt.yticks(x + 0.4, list(a[0] for a in label_count.keys()))
plt.show()

# ## How many cards were moved from *To Read* to *Done*?

# In[6]:

print(str(len(ls['To Read']['actions'])) + ' actions on the list "To Read"')

# Keep the actions where a card left "To Read" for "Done" or "Save"
moved_to_done = [c for c in ls['To Read']['actions']
                 if 'data' in c and 'listAfter' in c['data']
                 and c['data']['listAfter']['id'] == lists_ids['Done']]
moved_to_save = [c for c in ls['To Read']['actions']
                 if 'data' in c and 'listAfter' in c['data']
                 and c['data']['listAfter']['id'] == lists_ids['Save']]

# Combine the moves to "Done" and to "Save"
moved_to_done = moved_to_done + moved_to_save

print(str(len(moved_to_done)) + ' cards were moved from "To Read" to "Done" and "Save"')

# ## What is the average time it takes for a card to move from *To Read* to *Done*?
# In[7]:

import datetime
import math
import statistics
import collections

date_format = '%Y-%m-%dT%H:%M:%S'


def parse_date(date_str):
    # Drop the fractional seconds before parsing
    return datetime.datetime.strptime(date_str[:date_str.index('.')], date_format)


# Create a dictionary card_id -> date to track the date a card was moved
dates_moved = {c['data']['card']['id']: parse_date(c['date']) for c in moved_to_done}

# Create a dictionary card_id -> date to track the date a card was created
dates_created = {c['data']['card']['id']: parse_date(c['date'])
                 for c in ls['To Read']['actions']
                 if 'card' in c['data']
                 and c['data']['card']['id'] in dates_moved.keys()
                 and c['type'] == 'createCard'}

# Calculate the difference in days, assuming date_moved > date_created
time_per_card = {cid: (d - dates_created[cid]).days for cid, d in dates_moved.items()}
times = time_per_card.values()

average_days_for_moving = math.floor(statistics.mean(times))
median = math.floor(statistics.median(times))
standard_deviation = statistics.stdev(times)
variance = statistics.variance(times)
max_days = max(times)
min_days = min(times)

print('It takes ' + str(average_days_for_moving) + ' days on average for a card to be moved from "To Read" to "Done"')
print('The median is: ' + str(median) + ', which means that half of the articles are read in less than ' + str(median) + ' days')
print('max: ' + str(max_days) + ', min: ' + str(min_days) + ', stdev: ' + str(standard_deviation) + ', variance: ' + str(variance))

times_count = {x: 0 for x in range(max(times) + 1)}
for t in times:
    times_count[t] += 1

x = np.arange(len(times_count.keys()))
plt.xlabel('Time in days')
plt.ylabel('Number of cards moved')
plt.bar(x, list(times_count.values()))
plt.show()

# The same data, but zoomed in on roughly the first two months (50 days):

# In[8]:

plt.xlim(0, 50)
plt.xlabel('Time in days')
plt.ylabel('Number of cards moved')
plt.bar(x, list(times_count.values()))
plt.show()

# ## Average processing time per label?
#
# How much time does it take to move a card, grouped by label?
# In[9]:

def labels_list_names(card_labels):
    return [l['name'] for l in card_labels]


cards_indexed_by_id = {c['id']: c for c in big_list_of_cards}

average_time_per_label = {}
median_time_per_label = {}

for label in labels.keys():
    # Basically the same code as before
    dates_moved = {
        c['data']['card']['id']: parse_date(c['date']) for c in moved_to_done
        # add one filter for the current label
        if label in labels_list_names(cards_indexed_by_id[c['data']['card']['id']]['labels'])}

    # Create a dictionary card_id -> date to track the date a card was created
    dates_created = {c['data']['card']['id']: parse_date(c['date'])
                     for c in ls['To Read']['actions']
                     if 'card' in c['data']
                     and c['data']['card']['id'] in dates_moved.keys()
                     and c['type'] == 'createCard'}

    # Calculate the difference in days, assuming date_moved > date_created
    times = [(d - dates_created[cid]).days for cid, d in dates_moved.items()]
    if len(times) == 0:
        times = [0]

    average_time_per_label[label] = math.floor(statistics.mean(times))
    median_time_per_label[label] = math.floor(statistics.median(times))

for label, time in average_time_per_label.items():
    print('For label ' + label + ': average time: ' + str(time) + ' days, median: '
          + str(median_time_per_label[label]) + ' days')

# Plotting
x = np.arange(len(labels.keys())) - 1
averages = plt.barh(x - 0.2, list(average_time_per_label.values()), height=0.2, color='r')
medians = plt.barh(x + 0.2, list(median_time_per_label.values()), height=0.2, color='b')
plt.legend((averages, medians), ('Average time', 'Median time'))
plt.yticks(x, list(label for label in average_time_per_label.keys()))
plt.show()

# ## On what day of the week is a card most likely to be moved?
#
# It can also be interesting to examine reading habits by day of the week.
#
# Are some days more productive than others?

# In[10]:

import dateutil.parser
from pytz import timezone

# Localize the times to Sweden
sweden = timezone('Europe/Stockholm')

WEEK_DAYS = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']

days_of_week_punchcard = sorted(
    Counter(
        [dateutil.parser.parse(moved_action['date']).astimezone(sweden).weekday()
         for moved_action in moved_to_done]
    ).most_common(),
    key=lambda day: day[0]
)

# Plotting
x = np.arange(len(WEEK_DAYS))
plt.xlim(x.min() - 0.5, x.max() + 1)
plt.bar(x, [d[1] for d in days_of_week_punchcard], alpha=0.5, width=0.6)
plt.xticks(x + 0.3, WEEK_DAYS, fontsize=10)
plt.show()

# Surprisingly, I manage to get things done on Mondays!

# ## And now, at what time of the day?

# In[11]:

hours_of_day_punchcard = {x: 0 for x in range(24)}
for moved_action in moved_to_done:
    hours_of_day_punchcard[dateutil.parser.parse(moved_action['date']).astimezone(sweden).hour] += 1

x = np.arange(24)
plt.xlim(-0.5, 24)
plt.bar(x, list(hours_of_day_punchcard.values()), alpha=0.5, width=0.6)
plt.xticks(x + 0.3, x, fontsize=10)
plt.show()

# Interesting results again:
# - I sleep at least between 3 and 6 a.m. (which is comforting).
# - I usually eat between 8 p.m. and 9 p.m., so no time for reading then!
# - There is a big peak at the end of the day, which could mean that I am tired after a day at work and more likely to finish the afternoon with a good read.
# ## Punchcard (days and hours combined)

# In[12]:

global_punchcard = {(x, y): 0 for x in range(24) for y in range(7)}

for moved_action in moved_to_done:
    parsed_date = dateutil.parser.parse(moved_action['date']).astimezone(sweden)
    global_punchcard[(parsed_date.hour, parsed_date.weekday())] += 1

x = [d[0] for d in global_punchcard.keys()]
y = [d[1] for d in global_punchcard.keys()]
s = [50 * v for v in global_punchcard.values()]

plt.xlim(min(x) - 0.5, max(x) + 0.5)
plt.ylim(min(y) - 0.5, max(y) + 0.5)
plt.xticks(np.arange(24))
plt.yticks(np.arange(7), WEEK_DAYS)
plt.gca().invert_yaxis()
plt.scatter(x, y, s=s, alpha=0.5)
plt.show()

# ## What is this card that I moved on a Friday between midnight and 1 a.m.?
#
# Once the data is available, it becomes natural to inspect it more closely and dissect the information.
#
# For instance, it is possible to know which cards were moved on a specific day at a specific time.

# In[13]:

day_to_search, hour_to_search = 4, 0  # Friday, between midnight and 1 a.m.

moved_fridays_at_midnight = [moved_action for moved_action in moved_to_done
                             if dateutil.parser.parse(moved_action['date']).astimezone(sweden).weekday() == day_to_search
                             and dateutil.parser.parse(moved_action['date']).astimezone(sweden).hour == hour_to_search]

print('Card name: ' + moved_fridays_at_midnight[0]['data']['card']['name'])
print('Moved on: ' + dateutil.parser.parse(moved_fridays_at_midnight[0]['date']).astimezone(sweden).strftime("%A %d. %B %Y at %X"))

# ## Export the data to a static JSON to work on a visualization
#
# Most of the cards only store a single link to the article.
#
# Given that, and thanks to the [newspaper](https://github.com/codelucas/newspaper) library, it is possible to scrape each webpage and get the article text and the corresponding keywords.
#
# The idea of this section is to go over all the articles that can be scraped and compile one big chunk of the words encountered.

# In[ ]:

# Inject the full card details into each "moved" action
for c in moved_to_done:
    c['data']['card'] = cards_indexed_by_id[c['data']['card']['id']]

# Let's compile a huge list of all the words used in the articles, without the stopwords (as they don't provide much information).

# In[ ]:

import re
from newspaper import Article

URL_REGEX = r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'

huge_text = ''

for c in moved_to_done:
    links = re.findall(URL_REGEX, c['data']['card']['desc'])
    if len(links) == 0:
        continue
    url = links[0]
    a = Article(url)
    c['data']['card']['keywords'] = []
    try:
        a.download()
        a.parse()
        a.nlp()
        huge_text += a.text
        c['data']['card']['keywords'] = a.keywords
    except Exception:
        # Some URLs can't be parsed (pdf, video, image...)
        print('Problem parsing article: ' + c['data']['card']['name'])

print('Done')

# In[ ]:

# [Optional] Save to a temp file to avoid re-running the expensive nlp analysis
with open('data.json', 'w') as f:
    json.dump({
        'cards_moved': moved_to_done,
        'labels': labels
    }, f, indent=4)

from nltk.corpus import stopwords

# Most of the articles are in English and French
all_stopwords = stopwords.words('english') + stopwords.words('french')
all_words = [w.strip().lower() for w in huge_text.split() if w.strip().lower() not in all_stopwords]

with open('all.txt', 'w') as f:
    f.write(' '.join(all_words))

# In[ ]:

# Reload from the previous dump (shortcut to avoid the long scrape)
with open('data.json', 'r') as f:
    data = json.loads(f.read())

with open('all.txt', 'r') as f:
    all_text = f.read().split()

cards_moved = data['cards_moved']
keywords = [w for c in cards_moved if 'keywords' in c['data']['card'] for w in c['data']['card']['keywords']]
distinct_keywords = set(keywords)
print('There are ' + str(len(keywords)) + ' keywords (' + str(len(distinct_keywords)) + ' distinct)')

# In[ ]:

from nltk.corpus import wordnet as wn
from nltk import pos_tag

# Tag each word with its part of speech
ref_nodes = dict(list(enumerate(keywords)))
tags = pos_tag(all_text)

# In[ ]:

# Keep only the nouns for a separate visualization
nouns = [t[0] for t in tags if t[1] == 'NN' and len(t[0]) >= 2]
print(str(len(tags)) + ' words in total, ' + str(len(nouns)) + ' nouns selected')

counter_all = Counter(all_text)
counter_nouns = Counter(nouns)
counter_all_top200 = counter_all.most_common()[:200]
counter_nouns_top200 = counter_nouns.most_common()[:200]

# Normalize the counts to [0, 1] before exporting
min_all = min(counter_all_top200, key=lambda x: x[1])[1]
max_all = max(counter_all_top200, key=lambda x: x[1])[1]
min_nouns = min(counter_nouns_top200, key=lambda x: x[1])[1]
max_nouns = max(counter_nouns_top200, key=lambda x: x[1])[1]

counter_all_top200 = [(x[0], (x[1] - min_all) / (max_all - min_all)) for x in counter_all_top200]
counter_nouns_top200 = [(x[0], (x[1] - min_nouns) / (max_nouns - min_nouns)) for x in counter_nouns_top200]

with open('words.json', 'w') as f:
    json.dump({
        'all': counter_all_top200,
        'nouns': counter_nouns_top200
    }, f, indent=4)
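# The rendering below is done outside the notebook with wordcloud2.js, but as a quick in-notebook preview the same frequencies could be drawn with the third-party Python `wordcloud` package instead. This is just a sketch of that alternative, assuming the package is installed and that `words.json` was written as above; `word_freqs` and `wc` are names introduced here.

# In[ ]:

from wordcloud import WordCloud
import matplotlib.pyplot as plt
import json

# Reload the normalized frequencies exported above, dropping zero-weight entries
with open('words.json') as f:
    word_freqs = {word: freq for word, freq in json.load(f)['all'] if freq > 0}

# Build and display the word cloud
wc = WordCloud(width=800, height=400, background_color='white')
wc.generate_from_frequencies(word_freqs)

plt.figure(figsize=(10, 5))
plt.imshow(wc, interpolation='bilinear')
plt.axis('off')
plt.show()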
# Using the handy [wordcloud2.js library](https://github.com/timdream/wordcloud2.js), the words can be plotted as a **word cloud**.
#
# ### All the words
#
# It is a bit amusing that characters like *=* and *{* are among the most frequently encountered, which makes sense since some articles embed code.
#
# ### Only the nouns
#
# # Conclusion
#
# There are many more advanced things to do from here:
#
# - More elaborate stats.
# - Given a link, title and labels for an article, predict when I will read it.
# - Cluster articles and links by similarity, as another way to visualize the data.
#
# These are just ideas at the moment. I will share my next findings in a future "Part 2"!