#!/usr/bin/env python
# coding: utf-8

# Open In Colab

# # tabula-py example notebook
#
# tabula-py is a tool for converting PDF tables into pandas DataFrames.
# tabula-py is a wrapper of
# [tabula-java](https://github.com/tabulapdf/tabula-java), which requires
# Java on your machine. tabula-py also enables you to convert tables in a
# PDF into CSV/TSV files.
#
# tabula-py's PDF extraction accuracy is the same as that of tabula-java or
# the [tabula app](https://tabula.technology/), the GUI tool of tabula, so
# if you want to know the performance of tabula-py, I highly recommend that
# you try the tabula app.
#
# tabula-py is good for:
# - automation with Python scripts
# - advanced analytics after converting to pandas DataFrames
# - casual analytics with Jupyter notebook or Google Colaboratory
#
# ## Check Java environment and install tabula-py
#
# tabula-py requires a Java environment, so let's check the Java
# environment on your machine.

# NOTE: this file is a notebook export; `get_ipython()` is only defined
# when the script is executed by IPython/Jupyter.

# In[1]:


get_ipython().system('java -version')


# After confirming the Java environment, install tabula-py by using pip.

# In[2]:


# To be more precise, it's better to use
# `{sys.executable} -m pip install tabula-py`
get_ipython().system('pip install -q tabula-py')


# Before trying tabula-py, check your environment via the tabula-py
# `environment_info()` function, which shows the Python version, Java
# version, and your OS environment.

# In[3]:


# Imported here rather than at the top of the file because the package is
# installed by the pip cell above.
import tabula

tabula.environment_info()


# ## Read a PDF with `read_pdf()` function
#
# Let's read a PDF from GitHub. tabula-py can load a PDF or file-like
# object, either local or on the internet, by using the `read_pdf()`
# function.

# In[4]:


import tabula

pdf_path = "https://github.com/chezou/tabula-py/raw/master/tests/resources/data.pdf"

dfs = tabula.read_pdf(pdf_path, stream=True)
# read_pdf returns list of DataFrames
print(len(dfs))
dfs[0]


# ## Options for `read_pdf()`
#
# Note that the `read_pdf()` function reads only page 1 by default. For
# more details, use `?read_pdf`.

# In[5]:


help(tabula.read_pdf)


# Let's set the `pages` option. Here is the extraction result of page 3:

# In[6]:


# set pages option
dfs = tabula.read_pdf(pdf_path, pages=3, stream=True)
dfs[0]


# In[7]:


# pass pages as string
tabula.read_pdf(pdf_path, pages="1-2,3", stream=True)


# You can set `pages="all"` for extracting all pages. If you hit an OOM
# error with Java, you should set an appropriate `-Xmx` option for
# `java_options`.

# In[8]:


# extract all pages
tabula.read_pdf(pdf_path, pages="all", stream=True)


# ## Read partial area of PDF
#
# If you want to set a certain part of a page, you can use the `area`
# option.
#
# Note that as of tabula-py 2.0.0, the `multiple_tables` option became
# `True`, so if you want to use multiple `area` options like
# `[[0, 0, 100, 50], [0, 50, 100, 100]]`, you need to set
# `multiple_tables=False`.

# In[9]:


# set area option
dfs = tabula.read_pdf(pdf_path, area=[126, 149, 212, 462], pages=2)
dfs[0]


# ## Read giving column information

# In[10]:


pdf_path2 = "https://github.com/chezou/tabula-py/raw/master/tests/resources/campaign_donors.pdf"

dfs = tabula.read_pdf(
    pdf_path2,
    columns=[47, 147, 256, 310, 375, 431, 504],
    guess=False,
    pages=1,
)
df = dfs[0].drop(["Unnamed: 0"], axis=1)
df


# ## Extract to JSON, TSV, or CSV
#
# tabula-py has the capability to convert not only to DataFrame but also
# to JSON, TSV, or CSV. You can set the output format with the
# `output_format` option.

# In[11]:


# read pdf as JSON
tabula.read_pdf(pdf_path, output_format="json")


# ## Convert PDF tables into CSV, TSV, or JSON files
#
# You can convert files directly rather than creating Python objects with
# the `convert_into()` function.

# In[17]:


# You can convert from pdf into JSON, CSV, TSV
tabula.convert_into(pdf_path, "test.json", output_format="json", stream=True)
get_ipython().system('cat test.json')


# In[18]:


tabula.convert_into(pdf_path, "test.tsv", output_format="tsv", stream=True)
get_ipython().system('cat test.tsv')


# In[19]:


tabula.convert_into(pdf_path, "test.csv", output_format="csv", stream=True)
get_ipython().system('cat test.csv')


# ## Use lattice mode for more accurate extraction for spreadsheet style tables
#
# If your tables have lines separating cells, you can use the `lattice`
# option. By default, tabula-py sets `guess=True`, which matches the
# default behavior of the tabula app. If your tables don't have separation
# lines, you can try the `stream` option.
#
# As mentioned, try the tabula app before struggling with tabula-py
# options. Or, [PDFplumber](https://github.com/jsvine/pdfplumber) can be an
# alternative since it has a different extraction strategy.

# In[15]:


pdf_path3 = "https://github.com/tabulapdf/tabula-java/raw/master/src/test/resources/technology/tabula/spanning_cells.pdf"
dfs = tabula.read_pdf(
    pdf_path3,
    pages="1",
    lattice=True,
    pandas_options={"header": [0, 1]},
    area=[0, 0, 50, 100],
    relative_area=True,
    multiple_tables=False,
)
dfs[0]


# ## Use tabula app template
#
# tabula-py can handle a tabula app template, which stores area options
# set in the GUI app so they can be reused.

# In[16]:


template_path = "https://github.com/chezou/tabula-py/raw/master/tests/resources/data.tabula-template.json"
tabula.read_pdf_with_template(pdf_path, template_path)


# If you have any questions, ask on
# [StackOverflow](https://stackoverflow.com/search?q=tabula-py).