In [2]:

from collections import defaultdict
import inspect
import os
import re
import subprocess
import time

from bs4 import BeautifulSoup
import numpy as np
import pandas as pd
import ray
import requests
from selenium import webdriver
from selenium.webdriver.common.keys import Keys

In [2]:

ray.init()

2019-05-31 00:12:44,231	WARNING worker.py:1341 -- WARNING: Not updating worker name since `setproctitle` is not installed. Install this with `pip install setproctitle` (or ray[debug]) to enable monitoring of worker processes.
2019-05-31 00:12:44,232	INFO node.py:497 -- Process STDOUT and STDERR is being redirected to /tmp/ray/session_2019-05-31_00-12-44_232626_19160/logs.
2019-05-31 00:12:44,340	INFO services.py:409 -- Waiting for redis server at 127.0.0.1:46401 to respond...
2019-05-31 00:12:44,448	INFO services.py:409 -- Waiting for redis server at 127.0.0.1:23890 to respond...
2019-05-31 00:12:44,450	INFO services.py:806 -- Starting Redis shard with 6.58 GB max memory.
2019-05-31 00:12:44,460	INFO node.py:511 -- Process STDOUT and STDERR is being redirected to /tmp/ray/session_2019-05-31_00-12-44_232626_19160/logs.
2019-05-31 00:12:44,462	INFO services.py:1441 -- Starting the Plasma object store with 9.87 GB memory using /dev/shm.

Out[2]:

{'node_ip_address': '172.31.31.63',
 'redis_address': '172.31.31.63:46401',
 'object_store_address': '/tmp/ray/session_2019-05-31_00-12-44_232626_19160/sockets/plasma_store',
 'raylet_socket_name': '/tmp/ray/session_2019-05-31_00-12-44_232626_19160/sockets/raylet',
 'webui_url': None,
 'session_dir': '/tmp/ray/session_2019-05-31_00-12-44_232626_19160'}

Download code¶

We go through the top python kaggle entries and download the code used.

In [21]:

chrome_options = webdriver.chrome.options.Options()
chrome_options.add_argument("--headless")

In [25]:

driver = webdriver.Chrome('dependencies/chromedriver', options=chrome_options)
driver.get("https://www.kaggle.com/kernels?sortBy=voteCount&language=Python")
time.sleep(1)

elem = driver.find_element_by_tag_name("body")

# number per page varies depending on browser driver
no_of_pagedowns = 300

while no_of_pagedowns:
    elem.send_keys(Keys.PAGE_DOWN)
    time.sleep(1)
    no_of_pagedowns-=1

soup = BeautifulSoup(driver.page_source, "html.parser")
kernel_links = ['https://www.kaggle.com' + a['href'] 
                for a in soup.find_all("a", class_="block-link__anchor")]
driver.quit()

In [26]:

print(len(kernel_links))
kernel_links[:5]

Out[26]:

['https://www.kaggle.com/pmarcelino/comprehensive-data-exploration-with-python',
 'https://www.kaggle.com/arthurtok/introduction-to-ensembling-stacking-in-python',
 'https://www.kaggle.com/serigne/stacked-regressions-top-4-on-leaderboard',
 'https://www.kaggle.com/kanncaa1/data-sciencetutorial-for-beginners',
 'https://www.kaggle.com/dansbecker/how-models-work']

In [27]:

@ray.remote
def get_download_link(kernel_link):
    try:
        driver = webdriver.Chrome('dependencies/chromedriver', options=chrome_options)
        driver.get(kernel_link)
        innerHTML = driver.execute_script("return document.body.innerHTML")
        soup = BeautifulSoup(innerHTML, 'html.parser')
        link = soup.find_all("a", class_="sc-hwNDZK gNPGxN")[0]['href']
        link = "https://www.kaggle.com/kernels/scriptcontent/{}/download".format(link.split("/")[-1])
        driver.quit()
        #print("DONE")
        return link
    except Exception as e:
        print("Error: {}".format(e))
        return None

In [ ]:

code_links = []
for link in kernel_links:
    code_links.append(get_download_link.remote(link))
code_links = ray.get(code_links)
code_links = [link for link in code_links if link is not None]
print(len(code_links))

In [35]:

code_links = [link for link in code_links if link is not None]
print(len(code_links))

In [36]:

with open("code_links.txt", "w") as file:
    [file.write("{}\n".format(link)) for link in code_links]

In [ ]:

@ray.remote
def download_files(link):
    """Download the data at the link.
    
    Parameters
    ----------
    link: str
        Link to the data.
    """
    cmd = "wget --content-disposition {} -P data".format(link)
    subprocess.Popen(cmd, shell=True, executable='/bin/bash')

for code_link in code_links:
    download_files.remote(code_link)

# We block on downloading files
ray.get([download_files.remote(code_link) for code_link in code_links])

In [38]:

# Clean up directory so that only the code remains.
for filename in os.listdir("data"):
    if filename == ".DS_Store":
        pass
    
    name_parts = filename.split(".")
    if len(name_parts) == 2:
        name_parts.append("0")
        
    if name_parts[1] in ['ipynb', 'py']:
        new_name = name_parts[0] + name_parts[2] + "." + name_parts[1]
        new_name = 'r' + new_name if name_parts[1] == 'py' else new_name
        os.rename('data/' + filename, 'data/' + new_name)

    else:
        os.remove('data/' + filename)

Mine code¶

We go through each kaggle entry and mine it for the relevant pandas functions.

In [3]:

# Get all the possible functions from base pandas class, dataframes, and series.
search_tokens = set()

objects = [pd, pd.DataFrame, pd.Series]
indexers = ['iloc', 'iat', 'ix', 'loc', 'at']
for obj in objects:
    for token in dir(obj):
        # We do not consider private functions or properties
        if token[0] == "_" and token[:2] != "__":
            continue
        elif inspect.isfunction(getattr(obj, token)):
            # For functions, we search for ".function_name("
            search_tokens.add("\.{}\(".format(token))
        elif token in indexers:
            # For indexing functions, we searhc for ".indexing_function["
            search_tokens.add("\.{}\[".format(token))
        else:
            # For properties, we add only a period in front
            search_tokens.add("\.{}".format(token))

# We compile the search tokens together for improved performance
search_tokens = ray.put(search_tokens)

In [4]:

@ray.remote
def parse_script(counter, script_name, search_tokens):
    """Parse the script and search for the desired regex expressions.
    
    Parameters
    ----------
    counter: Ray actor
        Ray actor that that has the method count that takes in a list.
    script_name: str
        File name of the script.
    search_tokens: str
        Regex expression as a string. We need this to be a string because
        ray cannot pickle compiled regex expressions.
    """
    with open(script_name) as script_file:
        code = script_file.read().splitlines()

    search_tokens = re.compile("|".join(search_tokens))
    for line in code:
        return re.findall(search_tokens, line)

@ray.remote
class TokenCounter(object):
    """Ray actor class that tracks the times each token has appeared.
    """
    def __init__(self):
        self.token_counts = defaultdict(lambda: 0)
                    
    def count(self, found_token):
        """Adds to the count of each token in list.
        
        Parameters
        ----------
        found_token: list
            List of tokens
        """
        for func in found_token:
            self.token_counts[func] += 1

    def get_counts(self):
        """Return the token counts as dict."""
        return dict(self.token_counts)

In [5]:

counter = TokenCounter.remote()

# We have to block on the parser to make sure everything is passed to actor before we can call the actor method
results = ray.get([parse_script.remote(counter, "data/{}".format(filename), search_tokens) 
         for filename in os.listdir('data')])
ray.get([counter.count.remote(result) for result in results])
method_counts = ray.get(counter.get_counts.remote())

In [10]:

mFreq = pd.DataFrame(method_counts.items(), columns=["function", "count"]).sort_values('count', ascending=False)
mFreq["function"] = mFreq["function"].apply(lambda x: x.lstrip("\.").rstrip('\('))
mFreq.to_csv("results.csv")
mFreq.tail()

Out[10]:

	function	count
258	searchsorted	2
259	ExcelWriter	2
260	read_html	2
261	read_clipboard	2
262	le	2

In [ ]: