#!/usr/bin/env python # coding: utf-8 # # Polyglot Data Science with IPython & friends # # A demonstration of how to use Python, Julia, Fortran and R cooperatively to analyze data, in the same process. # # This is supported by the IPython kernel and a few extensions that take advantage of IPython's magic system to provide low-level integration between Python and other languages. # # See the [companion notebook](polyglot-ds-prep.ipynb) for data preparation and setup. # In[1]: get_ipython().run_line_magic('matplotlib', 'inline') import matplotlib.pyplot as plt import numpy as np import pandas as pd import seaborn as sns # In[2]: plt.style.use("seaborn") plt.rcParams["figure.figsize"] = (10, 6) sns.set_context("talk", font_scale=1.4) # Let's begin by reading our dataset and having a quick look: # In[3]: data = pd.read_csv('data.csv') print(data.shape) data.head(3) # Ah, it looks like we have a quantitative dataset with $(x,y)$ pairs - a scatterplot is a decent starting point to get a feel for these data: # In[4]: data.plot.scatter(x='x', y='y'); # Mmh, what to do? # # Let's try to build a simple linear model for these data, with some features we'll extract from the data. There's probably: # # * a linear dependence # * something that looks broadly non-linear, let's say quadratic, to keep things simple # * and maybe an oscillatorty term: but it looks a bit like a chirp, not a simple sinusoid. Again, simplest non-linear choice: quadratic frequency dependency. # # In summary, let's try # # $$ # y \sim \theta_1 x + \theta_2 x^2 + \theta_3 \sin(x^2) # $$ # ## Oh boy! Complicated data, Julia to the rescue... # # Maybe Julia can help us efficiently compute that nasty non-linear feature, $x^2$? 
# In[5]: get_ipython().run_line_magic('load_ext', 'julia.magic') # In[6]: jxsq = get_ipython().run_line_magic('julia', 'xsq(x) = x.^2 # Simplest way to define a function in Julia') # We've defined the function `xsq` in Julia, and in Python we have it available as `jxsq`, which we can call as a normal Python function: # In[7]: x = data['x'] f2 = jxsq(x) x.shape == f2.shape # our feature should have the same shape as our data # # More complicated features, we need performance... # ## Fortran can help us! # # Let's use this oldie but goodie to assist in the task of estimating our next non-linear feature, $\sin(x^2)$: # In[8]: get_ipython().run_line_magic('load_ext', 'fortranmagic') # In[9]: get_ipython().run_cell_magic('fortran', '', 'subroutine sinx2(x, y, n)\n real, intent(in), dimension(n) :: x\n real, intent(out), dimension(n) :: y\n !intent(hide) :: n\n y = sin(x**2)\nend subroutine sinx2\n') # Now, `sinx2` can be used as a plain Python function too: # In[10]: #f3 = sinx2(x) f3 = np.sin(x**2) f3.shape == x.shape # same sanity check # # Statistical modeling? Maybe R can do that?? # # We now have our data `y` and our features $x$, $f_2 = x^2$ and $f_3 = \sin(x^2)$. This is a classic linear modeling problem, and R is awesome at fitting those! # # Let's put our features together in a nice design matrix and load up R: # In[11]: A = np.column_stack([x, f2, f3]) A.shape # In[12]: get_ipython().run_line_magic('load_ext', 'rpy2.ipython') y = data['y'] # In R, this can be written as a linear model `lm(y ~ 0 + A)`. # # Note that we'll ask for the fit coefficients `fitc` to keep moving forward: # In[13]: get_ipython().run_cell_magic('R', '-i y,A -o fitc', '\nylm = lm(y ~ 0 + A)\nfitc = coef(ylm)\nprint(summary(ylm))\npar(mfrow=c(2,2))\nplot(ylm)\n') # # Back to Python to look at the results! 
# # R gave us our fit coefficient vector `fitc`, we can now proceed using it:

# In[14]:

fitc

# We construct our fitted model and visualize our results:

# In[15]:

# Model prediction: design matrix times the coefficients fitted by R.
yfit = A @ fitc

# In[16]:

plt.plot(x, y, 'o', label='data')
plt.plot(x, yfit, label='fit', color='orange', lw=4)
plt.title('Julia, Python and R working in Jupyter')
plt.legend();

# # Polyglot Data Science FTW!!!

# # Bonus
#
# ## Cython, the happy child of C and Python

# In[17]:

def f(x):
    """Integrand for the timing baseline: x**2 - x."""
    return x**2 - x


def integrate_f(a, b, N):
    """Left-Riemann-sum approximation of the integral of f over [a, b].

    Pure-Python reference implementation, timed below against the Cython
    translation `integrate_fcy`.
    """
    s = 0
    dx = (b - a) / N
    for i in range(N):
        s += f(a + i * dx)
    return s * dx


# In[18]:

get_ipython().run_line_magic('load_ext', 'Cython')

# In[19]:

# `-a` produces the annotated HTML view showing Python/C interaction per line.
get_ipython().run_cell_magic('cython', '-a', 'cdef double fcy(double x) except? -2:\n    return x**2-x\n\ndef integrate_fcy(double a, double b, int N):\n    cdef int i\n    cdef double s, dx\n    s = 0; dx = (b-a)/N\n    for i in range(N):\n        s += fcy(a+i*dx)\n    return s * dx\n')

# In[20]:

# `-o` returns the TimeitResult object so we can compare best times below.
tpy = get_ipython().run_line_magic('timeit', '-o -n 1000 integrate_f(0, 1, 100)')
tcy = get_ipython().run_line_magic('timeit', '-o -n 1000 integrate_fcy(0, 1, 100)')

# In[21]:

# Speedup factor of the Cython version over pure Python.
tpy.best / tcy.best

# ## The Julia interoperability is pretty neat

# In[22]:

# NOTE(review): `@pyimport` comes from PyCall, and bare `linspace` below is
# pre-1.0 Julia syntax (now `range`) — confirm against the pinned Julia version.
get_ipython().run_line_magic('julia', '@pyimport numpy as np')
get_ipython().run_line_magic('julia', '@pyimport matplotlib.pyplot as plt')

# In[23]:

get_ipython().run_cell_magic('julia', '', '# Note how we mix numpy and julia:\nt = linspace(0,2*pi,1000); # use the julia linspace\ns = sin(3*t + 4*np.cos(2*t)); # use the numpy cosine and julia sine\nfig = plt.gcf()\nplt.plot(t, s, color="red", linewidth=2.0, linestyle="--")\n')

# In[24]:

# Pull the Julia-side matplotlib figure back into Python and mutate it there.
fig = get_ipython().run_line_magic('julia', 'fig')
fig.suptitle("Adding a title!")
fig

# ## And you can really use Fortran as an interactive language

# In[25]:

# BUGFIX: the original cell computed `sin(xx**2)` — `xx` is never declared, so
# under Fortran implicit typing it is an uninitialized real and the output was
# garbage. The intended input variable is `x`.
get_ipython().run_cell_magic('fortran', '', 'subroutine f1(x, y, n)\n real, intent(in), dimension(n) :: x\n real, intent(out), dimension(n) :: y\n !intent(hide) :: n\n y = sin(x**2) - cos(x)\nend subroutine f1\n')

# In[26]:

t = np.linspace(0, 2 * np.pi, 1000)
# NOTE(review): this plots f1(t) against the sample index, not against t —
# possibly intended as plt.plot(t, f1(t)); left as-is to preserve behavior.
plt.plot(f1(t));

# # Bash, Perl, Ruby, etc...

# In[27]:

get_ipython().run_cell_magic('bash', '', 'echo $HOME\n')

# In[28]:

get_ipython().run_cell_magic('perl', '', '@months = ("Jan", "Feb", "Mar");\nprint($months[1])\n')