from collections import defaultdict
import inspect
import os
import re
import subprocess
import time
from bs4 import BeautifulSoup
import numpy as np
import pandas as pd
import ray
import requests
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
# Start Ray before any of the @ray.remote tasks/actors in this notebook are used.
ray.init()
2019-05-31 00:12:44,231 WARNING worker.py:1341 -- WARNING: Not updating worker name since `setproctitle` is not installed. Install this with `pip install setproctitle` (or ray[debug]) to enable monitoring of worker processes. 2019-05-31 00:12:44,232 INFO node.py:497 -- Process STDOUT and STDERR is being redirected to /tmp/ray/session_2019-05-31_00-12-44_232626_19160/logs. 2019-05-31 00:12:44,340 INFO services.py:409 -- Waiting for redis server at 127.0.0.1:46401 to respond... 2019-05-31 00:12:44,448 INFO services.py:409 -- Waiting for redis server at 127.0.0.1:23890 to respond... 2019-05-31 00:12:44,450 INFO services.py:806 -- Starting Redis shard with 6.58 GB max memory. 2019-05-31 00:12:44,460 INFO node.py:511 -- Process STDOUT and STDERR is being redirected to /tmp/ray/session_2019-05-31_00-12-44_232626_19160/logs. 2019-05-31 00:12:44,462 INFO services.py:1441 -- Starting the Plasma object store with 9.87 GB memory using /dev/shm.
{'node_ip_address': '172.31.31.63', 'redis_address': '172.31.31.63:46401', 'object_store_address': '/tmp/ray/session_2019-05-31_00-12-44_232626_19160/sockets/plasma_store', 'raylet_socket_name': '/tmp/ray/session_2019-05-31_00-12-44_232626_19160/sockets/raylet', 'webui_url': None, 'session_dir': '/tmp/ray/session_2019-05-31_00-12-44_232626_19160'}
We go through the top-voted Python kernels on Kaggle and download the code they use.
# Headless Chrome configuration. `chrome_options` stays a module-level name
# because get_download_link reuses it.
chrome_options = webdriver.chrome.options.Options()
chrome_options.add_argument("--headless")

# How many times PAGE_DOWN is sent to the listing page before it is parsed.
# The number of kernels per page varies depending on the browser driver.
no_of_pagedowns = 300

driver = webdriver.Chrome('dependencies/chromedriver', options=chrome_options)
try:
    driver.get("https://www.kaggle.com/kernels?sortBy=voteCount&language=Python")
    time.sleep(1)
    elem = driver.find_element_by_tag_name("body")
    for _ in range(no_of_pagedowns):
        elem.send_keys(Keys.PAGE_DOWN)
        # Pause between key presses (presumably to let more entries load).
        time.sleep(1)
    soup = BeautifulSoup(driver.page_source, "html.parser")
finally:
    # Always shut the browser down, even when scraping fails part-way.
    driver.quit()

# Every kernel tile is an anchor carrying a site-relative href.
kernel_links = ['https://www.kaggle.com' + a['href']
                for a in soup.find_all("a", class_="block-link__anchor")]
print(len(kernel_links))
kernel_links[:5]
2800
['https://www.kaggle.com/pmarcelino/comprehensive-data-exploration-with-python', 'https://www.kaggle.com/arthurtok/introduction-to-ensembling-stacking-in-python', 'https://www.kaggle.com/serigne/stacked-regressions-top-4-on-leaderboard', 'https://www.kaggle.com/kanncaa1/data-sciencetutorial-for-beginners', 'https://www.kaggle.com/dansbecker/how-models-work']
@ray.remote
def get_download_link(kernel_link):
    """Resolve the source-download URL for a single kernel page.

    Parameters
    ----------
    kernel_link: str
        URL of the kernel page.

    Returns
    -------
    str or None
        The "scriptcontent/<id>/download" URL, or None when anything goes
        wrong (the error is printed, not raised).
    """
    driver = None
    try:
        driver = webdriver.Chrome('dependencies/chromedriver', options=chrome_options)
        driver.get(kernel_link)
        innerHTML = driver.execute_script("return document.body.innerHTML")
        soup = BeautifulSoup(innerHTML, 'html.parser')
        # The last path segment of the first matching anchor's href is used
        # as the id in the download URL. Indexing [0] raises IndexError when
        # no anchor matches; that is handled like any other failure below.
        href = soup.find_all("a", class_="sc-hwNDZK gNPGxN")[0]['href']
        return "https://www.kaggle.com/kernels/scriptcontent/{}/download".format(href.split("/")[-1])
    except Exception as e:
        # Best effort: one bad kernel must not abort the whole crawl.
        print("Error: {}".format(e))
        return None
    finally:
        # Close the browser on both the success and the failure path so that
        # failed kernels do not leave a Chrome process behind.
        if driver is not None:
            try:
                driver.quit()
            except Exception as e:
                print("Error: {}".format(e))
# Resolve every kernel's download link in parallel, then block on all of them.
code_links = ray.get([get_download_link.remote(link) for link in kernel_links])
# get_download_link returns None for kernels it could not resolve; drop those.
code_links = [link for link in code_links if link is not None]
print(len(code_links))
2795
# Persist the resolved download links, one per line.
with open("code_links.txt", "w") as links_file:
    links_file.writelines("{}\n".format(link) for link in code_links)
@ray.remote
def download_files(link):
    """Download the data at the link into the ``data`` directory.

    The call waits for wget to exit, so the Ray task only completes once the
    download has finished (or failed).

    Parameters
    ----------
    link: str
        Link to the data.
    """
    # An argument list (no shell) keeps characters in the URL from being
    # interpreted by a shell. A non-zero wget exit status is not raised.
    subprocess.run(["wget", "--content-disposition", link, "-P", "data"])
# Submit one download task per link and block until every task has finished.
# Each link is submitted exactly once so that no file is fetched twice.
ray.get([download_files.remote(code_link) for code_link in code_links])
# Clean up directory so that only the code remains.
# Handled name shapes are "<stem>.<ext>" and "<stem>.<ext>.<suffix>"; anything
# whose second dot-separated part is not 'ipynb' or 'py' is deleted, which
# includes ".DS_Store" and names that contain no dot at all.
for filename in os.listdir("data"):
    path = os.path.join("data", filename)
    name_parts = filename.split(".")
    if len(name_parts) == 2:
        # No trailing suffix: use "0" so every kept file gets one.
        name_parts.append("0")
    if len(name_parts) >= 3 and name_parts[1] in ('ipynb', 'py'):
        # Move the suffix in front of the extension: "a.ipynb.1" -> "a1.ipynb".
        new_name = name_parts[0] + name_parts[2] + "." + name_parts[1]
        # NOTE(review): .py files additionally get an 'r' prefix; the reason
        # is not evident from this file - confirm before changing.
        if name_parts[1] == 'py':
            new_name = 'r' + new_name
        os.rename(path, os.path.join("data", new_name))
    else:
        os.remove(path)
We go through each downloaded Kaggle kernel and mine it for the relevant pandas functions.
# Get all the possible functions from base pandas class, dataframes, and series.
search_tokens = set()
objects = [pd, pd.DataFrame, pd.Series]
indexers = ['iloc', 'iat', 'ix', 'loc', 'at']
for obj in objects:
    for token in dir(obj):
        # Skip single-underscore (private) names; double-underscore names are kept.
        if token.startswith("_") and not token.startswith("__"):
            continue
        elif inspect.isfunction(getattr(obj, token)):
            # For functions, we search for ".function_name("
            search_tokens.add(r"\.{}\(".format(token))
        elif token in indexers:
            # For indexing functions, we search for ".indexing_function["
            search_tokens.add(r"\.{}\[".format(token))
        else:
            # For properties, we add only a period in front
            search_tokens.add(r"\.{}".format(token))
# Put the set of regex fragments into Ray's object store; the returned
# reference is what is later handed to the parse_script tasks.
search_tokens = ray.put(search_tokens)
@ray.remote
def parse_script(counter, script_name, search_tokens):
    """Parse the script and search for the desired regex expressions.

    Parameters
    ----------
    counter: Ray actor
        Unused here; kept so existing call sites keep working.
    script_name: str
        File name of the script.
    search_tokens: iterable of str
        Regex fragments, one per token. They are passed as plain strings and
        compiled inside the task.

    Returns
    -------
    list of str
        Every match found anywhere in the file, in order of appearance.
        Empty when the file is empty or no token matches.
    """
    # Alternation picks the first alternative that matches, so put longer
    # fragments first (e.g. "\.Timestamp\(" before "\.T"). Sorting also makes
    # the pattern independent of set iteration order.
    ordered = sorted(search_tokens, key=lambda tok: (-len(tok), tok))
    if not ordered:
        # An empty alternation would match the empty string at every position.
        return []
    pattern = re.compile("|".join(ordered))
    with open(script_name) as script_file:
        # Scan the whole file, not just its first line. No fragment can match
        # across a newline, so this equals collecting matches line by line.
        return pattern.findall(script_file.read())
@ray.remote
class TokenCounter(object):
    """Ray actor that keeps a running tally of how often each token was seen."""

    def __init__(self):
        # Missing tokens start at 0.
        self.token_counts = defaultdict(int)

    def count(self, found_token):
        """Increment the tally once for every entry of the given list.

        Parameters
        ----------
        found_token: list
            List of tokens
        """
        tallies = self.token_counts
        for token in found_token:
            tallies[token] = tallies[token] + 1

    def get_counts(self):
        """Return a plain-dict copy of the tallies."""
        return {token: seen for token, seen in self.token_counts.items()}
counter = TokenCounter.remote()
# Parse every file under data/ in parallel and block until all matches are in.
results = ray.get([parse_script.remote(counter, "data/{}".format(filename), search_tokens)
                   for filename in os.listdir('data')])
# Feed each file's matches to the actor and wait for all updates to be applied
# before reading the totals. Empty or missing results are skipped.
ray.get([counter.count.remote(result) for result in results if result])
method_counts = ray.get(counter.get_counts.remote())
mFreq = pd.DataFrame(list(method_counts.items()),
                     columns=["function", "count"]).sort_values('count', ascending=False)
# Matches look like ".name(", ".name[" or ".name"; keep only the bare name.
mFreq["function"] = mFreq["function"].apply(lambda x: x.lstrip(".").rstrip("(["))
mFreq.to_csv("results.csv")
mFreq.tail()
| | function | count |
---|---|---|
258 | searchsorted | 2 |
259 | ExcelWriter | 2 |
260 | read_html | 2 |
261 | read_clipboard | 2 |
262 | le | 2 |