#!/usr/bin/env python
# coding: utf-8

# NOTE: this file is a script export of a Jupyter notebook. The `# In[n]:`
# markers delimit the original code cells and the remaining comments are the
# notebook's markdown cells. The code calls `get_ipython()`, which is not
# defined in this file, so it has to be run inside an IPython / Jupyter
# session rather than with a plain `python` interpreter.

# ## Example notebook for the %%stata cell magic by the IPyStata package.

# **Author:** Ties de Kok
# **Homepage:** https://github.com/TiesdeKok/ipystata
# **PyPI:** https://pypi.python.org/pypi/ipystata

# ## Note: this example notebook uses the Windows-only `Stata Automation` mode

# See GitHub for an example notebook that uses the `Stata Batch Mode` (supported for Windows, Mac OS X, and Linux).

# ## Import packages

# In[1]:


import pandas as pd


# In[2]:


import ipystata


# *Note: You can ignore the `Javascript error adding output!` warning if it pops up with the newest version of Jupyter Notebook.*

# ## Configure ipystata

# Make sure that you have registered your Stata instance. (See GitHub for instructions).

# ## Check whether IPyStata is working

# In[3]:


get_ipython().run_cell_magic('stata', '', '\ndisplay "Hello, I am printed by Stata."\n')


# # Some examples based on the Stata 13 manual

# ## Load the dataset "auto.dta" in Stata and return it to Python as a Pandas dataframe

# The code cell below runs the Stata command **`sysuse auto.dta`** to load the dataset and returns it back to Python via the **`-o car_df`** argument.

# In[4]:


get_ipython().run_cell_magic('stata', '-o car_df', 'sysuse auto.dta\n')


# **`car_df`** is a regular Pandas dataframe on which Python / Pandas actions can be performed.

# In[5]:


car_df.head()


# ## Basic descriptive statistics

# The argument **`-d or --data`** is used to define which dataframe should be set as dataset in Stata.
# In the example below the Stata function **`tabulate`** is used to generate some descriptive statistics for the dataframe **`car_df`**.

# Commented-out export call, kept for reference only. If it is ever re-enabled,
# make the path a raw string: in a normal string literal the `\t` in `\test.df`
# is read as a tab character.
# car_df.to_stata('D:\Software\stata15\test.df', version=120)

# In[6]:


get_ipython().run_cell_magic('stata', '-d car_df', 'tabulate foreign headroom\n')


# These descriptive statistics can be replicated in Pandas using the **`crosstab`** function, see the code below.

# In[7]:


pd.crosstab(car_df['foreign'], car_df['headroom'], margins=True)


# ## Stata graphs

# IPyStata will automatically check whether any new graphs were generated.
# If you want to show multiple graphs, you have to make sure to use the `, name(.., replace)` argument in your Stata code.
# **Note: the order is not guaranteed to be the same as the generation order. Recommended to use the `title()` argument when showing multiple graphs.**
#
# It is possible to prevent graphs from showing using the `-nogr` or `--nograph` arguments.

# In[8]:


get_ipython().run_cell_magic('stata', '-s graph_session', 'use https://stats.idre.ucla.edu/stat/data/hsb2.dta, clear\ngraph twoway scatter read math, name(a, replace) title("Graph a")\n\ngraph twoway scatter math science, name(b, replace) title("Graph b")\n')


# ## Use Python lists as Stata macros

# In many situations it is convenient to define values or variable names in a Python list or equivalently in a Stata macro.
# The **`-i or --input`** argument makes a Python list available for use in Stata as a local macro.
# For example, **`-i main_var`** converts the Python list **`['mpg', 'rep78']`** into the following Stata macro: **``main_var'`**.

# In[9]:


main_var = ['mpg', 'rep78']
control_var = ['gear_ratio', 'trunk', 'weight', 'displacement']


# In[10]:


get_ipython().run_cell_magic('stata', '-i main_var -i control_var -os', '\ndisplay "`main_var\'"\ndisplay "`control_var\'"\n\nregress price `main_var\' `control_var\', vce(robust)\n')


# ## Modify dataset in Stata and return it to Python

# It is possible to create new variables or modify the existing dataset in Stata and have it returned as a Pandas dataframe.
# In the example below the output **`-o car_df`** will overwrite the **`car_df`** previously created.
# Note, the argument **`-np or --noprint`** can be used to suppress any output below the code cell.

# In[11]:


get_ipython().run_cell_magic('stata', '-o car_df -np', 'generate weight_squared = weight^2\ngenerate log_weight = log(weight)\n')


# In[12]:


car_df.head(3)


# ## Retrieve macro from Stata back into Python

# The **`-gm`** or **`--getmacro`** argument allows a macro to be extracted from a Stata session. The macro will be added to the **`macro_dict`** dictionary.

# In[13]:


get_ipython().run_cell_magic('stata', '-s macro_example -gm macro_1 -gm macro_2', 'local macro_1 one two\nlocal macro_2 three four\n')


# In[14]:


macro_dict


# In[15]:


macro_dict['macro_1']


# ## Set Python working directory in Stata

# In[16]:


import os
os.chdir(r'C:/')


# In[17]:


get_ipython().run_cell_magic('stata', '-cwd', 'display "`c(pwd)\'"\n')


# ## Using Mata code

# In[18]:


get_ipython().run_cell_magic('stata', '-s mata_session', 'sysuse auto\n')


# In[19]:


get_ipython().run_cell_magic('stata', '--mata -s mata_session', 'y = st_data(., "price")\nX = st_data(., "mpg trunk")\nn = rows(X)\nX = X,J(n,1,1)\nXpX = quadcross(X, X)\nXpXi = invsym(XpX)\nb = XpXi*quadcross(X, y)\nb\'\n')


# ## Using Sessions

# IPyStata 0.2 introduces the possibility to use many different Stata sessions that by default run in the background.
# These sessions are defined using the **`-s`** or **`--session`** arguments.

# ### Session example 1

# In[20]:


get_ipython().run_cell_magic('stata', '-s session_1 -np', 'local session Hello I am session 1 and I am persistent\n')


# In[21]:


get_ipython().run_cell_magic('stata', '-s session_2 -np', 'local session Hello I am session 2 and I am persistent\n')


# In[22]:


get_ipython().run_cell_magic('stata', '-s session_1', 'display "`session\'"\n')


# In[23]:


get_ipython().run_cell_magic('stata', '-s session_2', 'display "`session\'"\n')


# ### Session example 2

# In this example a logistic regression is performed in one cell and a postestimation (predict) is performed on this regression in the next cell.

# In[24]:


get_ipython().run_cell_magic('stata', '-s auto_session', 'sysuse auto\nlogit foreign weight mpg\n')


# In[25]:


get_ipython().run_cell_magic('stata', '-s auto_session', 'predict probhat\nsummarize probhat\n')


# ## Session manager tools

# In order to avoid using unnecessary system resources several tools and automatic cleanup routines are included.

# ### Display all active Stata sessions:

# In[26]:


get_ipython().run_cell_magic('stata', '', 'sessions\n')


# ### Reveal all Stata sessions

# In[27]:


get_ipython().run_cell_magic('stata', '', 'reveal all\n')


# ### Hide all Stata sessions

# In[28]:


get_ipython().run_cell_magic('stata', '', 'hide all\n')


# ### Close all Stata sessions initiated by IPyStata

# In[29]:


get_ipython().run_cell_magic('stata', '', 'close\n')


# Close all Stata sessions (**Warning! This closes all Stata windows**)

# In[30]:


get_ipython().run_cell_magic('stata', '', 'close all\n')


# ## An example case

# Create the variable **`large`** in Python and use it as the dependent variable for a binary choice estimation by Stata.

# In[31]:


# Flag a row as large (1) only when headroom > 3 and length > 200; otherwise 0.
car_df['large'] = [
    1 if x > 3 and y > 200 else 0
    for x, y in zip(car_df['headroom'], car_df['length'])
]


# In[32]:


car_df[['headroom', 'length', 'large']].head(7)


# In[33]:


main_var = ['mpg', 'rep78']
control_var = ['gear_ratio', 'trunk', 'weight', 'displacement']


# In[34]:


get_ipython().run_cell_magic('stata', '-d car_df -i main_var -i control_var', "\nlogit large `main_var' `control_var', vce(cluster make)\n")