#!/usr/bin/env python
# coding: utf-8

# # Data generation for polyglot data science example
#
# In order to run the full
# [Polyglot Data Science with IPython notebook](polyglot-ds.ipynb), you will
# need to install [Julia](https://julialang.org/downloads), and then the
# following (assuming a conda-based deployment that will automatically pull
# in R; otherwise you also need to install R):
#
# ```
# conda install jupyter cython pandas matplotlib seaborn
# conda install rpy2
# pip install julia fortran-magic
# ```
#
# NOTE: `get_ipython` is not defined or imported in this file; it is expected
# to be supplied by the IPython/Jupyter runtime the script is executed under.

# In[1]:


# Select the inline matplotlib backend before pyplot is imported.
get_ipython().run_line_magic('matplotlib', 'inline')
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd


# We generate synthetic data according to
#
# $$
# y(x) = a x + b x^2 + c \sin(x^2) + \mathcal{N}(0, \epsilon)
# $$
#
# where $\epsilon$ is passed to the generator as the *scale* (standard
# deviation) of the Gaussian noise term.

# In[2]:


npts = 300  # number of sample points
eps = 0.2  # noise
a, b, c = 1, -0.2, 1  # model coefficients

# Fixed seed so that every run produces the same noise, and hence the same
# data set.
np.random.seed(1234)

# npts evenly spaced abscissae covering [0, 2*pi], endpoints included.
x = np.linspace(0, 2*np.pi, npts)
y = a*x + b*x**2 + c*np.sin(x**2) + np.random.normal(scale=eps, size=npts)
plt.plot(x, y, 'o');


# Write it to a CSV file for convenient retrieval in a "typical" workflow;
# Pandas does the job nicely:

# In[3]:


data = pd.DataFrame({'x': x, 'y': y})
data.head(3)


# In[4]:


# index=False: store only the x and y columns, not the DataFrame row index.
data.to_csv('data.csv', index=False)
# Run `head` through the IPython system hook to show the start of the file.
get_ipython().system('head -3 data.csv')


# Sanity check: read the file back and compare it with the in-memory frame.

# In[5]:


data2 = pd.read_csv('data.csv')
data2.head(3)


# In[6]:


# Per-column sum of absolute differences between the reloaded and original
# data; values close to zero mean the CSV round trip preserved the numbers.
(data2 - data).abs().sum()