In [2]:
import itertools as it
import sys
import xlrd

from os import path as p, chdir
from io import StringIO
from array import array

# If this notebook is being run from the `examples` directory, move up to
# the project root so the local `meza` package imports below resolve.
if 'examples' in p.abspath('.'):
    chdir('..')
    
from meza import io, process as pr, convert as cv, fntools as ft, stats

Load data, type cast, and write to a CSV file

In [3]:
# Create a csv file
text = (
    'col1,col2,col3\n'
    'hello,5/4/82,1\n'
    'one,1/1/15,2\n'
    'happy,7/4/92,3\n')

f = StringIO(text)
f.seek(0)

# Load the file
records = io.read_csv(f)

# Records are an iterator over the rows
row = next(records)
row
Out[3]:
{'col1': 'hello', 'col2': '5/4/82', 'col3': '1'}
In [4]:
# Prepend the consumed first row back so as not to lose any data
records = pr.prepend(records, row)

# Guess column types. Note: `detect_types` returns a new `records`
# generator since it consumes rows during type detection
records, result = pr.detect_types(records)
result['types']
Out[4]:
[{'id': 'col1', 'type': 'text'},
 {'id': 'col2', 'type': 'date'},
 {'id': 'col3', 'type': 'int'}]
In [5]:
# Now type cast the records. Note: most `meza.process` functions return
# generators, so let's wrap the result in a list to view the data
casted = list(pr.type_cast(records, **result))
casted[0]
Out[5]:
{'col1': 'hello', 'col2': datetime.date(1982, 5, 4), 'col3': 1}

Now run some operations on the type-cast data

In [6]:
# Cut out the first column of data
cut_recs = pr.cut(casted, ['col1'], exclude=True)
cut_recs
Out[6]:
<generator object cut.<locals>.<genexpr> at 0x103d0adb0>
In [7]:
# Merge the cut rows and get the max value of the remaining columns.
# Note: since `merge` (by definition) will always contain just one row,
# it is returned as is (not wrapped in a generator)
merged = pr.merge(cut_recs, pred=bool, op=max)
merged
Out[7]:
{'col2': datetime.date(2015, 1, 1), 'col3': 3}
In [8]:
# Now write merged data back to a new csv file.
f = StringIO()
io.write(f, cv.records2csv([merged]))
f.getvalue()
Out[8]:
'col2,col3\r\n2015-01-01,3\r\n'

Reading data

meza can read both filepaths and file-like objects. Additionally, all readers return equivalent records iterators, i.e., a generator of dictionaries with keys corresponding to the column names. Please see Readers for a complete list of available readers and recognized file types.

In [9]:
# Read a file-like object and de-duplicate the repeated header names
f = StringIO('col,col\nhello,world\n')
records = io.read_csv(f, dedupe=True)

# View the first row
next(records) 
Out[9]:
{'col': 'hello', 'col_2': 'world'}

Processing data

Numerical analysis (à la pandas)

In the following example, pandas equivalent methods are preceded by -->.

In [10]:
# Create some data in the same structure as what the various `read...`
# functions output
header = ['A', 'B', 'C', 'D']
data = [
    [0.5607, 0.9338, 0.4769, 0.7804],
    [0.8227, 0.2844, 0.8166, 0.7323],
    [0.4627, 0.8633, 0.3283, 0.1909],
    [0.3932, 0.5454, 0.9604, 0.6376],
    [0.3685, 0.9166, 0.9457, 0.8066],
    [0.7584, 0.6981, 0.5625, 0.3578],
    [0.8959, 0.6932, 0.2565, 0.3378]]

# `records` are simply an iterable of dicts, so a list of dicts works too
df = [dict(zip(header, d)) for d in data]
df[0]
Out[10]:
{'A': 0.5607, 'B': 0.9338, 'C': 0.4769, 'D': 0.7804}
In [11]:
# Sort records by the value of column `B` --> df.sort_values(by='B')
next(pr.sort(df, 'B')) 
Out[11]:
{'A': 0.8227, 'B': 0.2844, 'C': 0.8166, 'D': 0.7323}
In [12]:
# Select column `A` --> df['A']
next(pr.cut(df, ['A'])) 
Out[12]:
{'A': 0.5607}
In [13]:
# Select the first 3 rows of data --> df[0:3]
list(it.islice(df, 3))
Out[13]:
[{'A': 0.5607, 'B': 0.9338, 'C': 0.4769, 'D': 0.7804},
 {'A': 0.8227, 'B': 0.2844, 'C': 0.8166, 'D': 0.7323},
 {'A': 0.4627, 'B': 0.8633, 'C': 0.3283, 'D': 0.1909}]
In [14]:
# Select all data whose value for column `A` is less than 0.5 --> df[df.A < 0.5]
next(pr.tfilter(df, 'A', lambda x: x < 0.5)) 
Out[14]:
{'A': 0.4627, 'B': 0.8633, 'C': 0.3283, 'D': 0.1909}
In [15]:
# Calculate the mean of column `A` across all data --> df.mean()['A']
pr.aggregate(df, 'A', stats.mean)['A']
Out[15]:
0.6088714285714286
In [16]:
# Calculate the sum of each column across all data --> df.sum()
pr.merge(df, pred=bool, op=sum) 
Out[16]:
{'A': 4.2621, 'B': 4.9348, 'C': 4.3469, 'D': 3.8434}

Text processing (à la csvkit)

In the following example, csvkit equivalent commands are preceded by -->.

In [17]:
# First create a few simple csv files
f1 = StringIO('col_1,col_2,col_3\n1,dill,male\n2,bob,male\n3,jane,female\n')
f2 = StringIO('col_1,col_2,col_3\n4,tom,male\n5,dick,male\n6,jill,female\n')
f1.seek(0)
f2.seek(0) 
Out[17]:
0

Now we can read the files, manipulate the data, convert the manipulated data to json, and write the json back to a new file. Also, note that since all readers return equivalent records iterators, you can use them interchangeably (in place of read_csv) to open any supported file. E.g., read_xls, read_sqlite, etc.

In [18]:
# First, let's combine the files into one iterator --> csvstack file1.csv file2.csv
records = io.join(f1, f2, ext='csv')
next(records)
Out[18]:
{'col_1': '1', 'col_2': 'dill', 'col_3': 'male'}
In [19]:
# Skip ahead to the last remaining row of the combined iterator
next(it.islice(records, 4, None)) 
Out[19]:
{'col_1': '6', 'col_2': 'jill', 'col_3': 'female'}
In [20]:
# Now let's create a persistent records list so we can do some simple manipulations
f1.seek(0)
records = list(io.read_csv(f1))
In [21]:
# Sort records by the value of column `col_2` --> csvsort -c col_2 file1.csv
next(pr.sort(records, 'col_2'))
Out[21]:
{'col_1': '2', 'col_2': 'bob', 'col_3': 'male'}
In [22]:
# Select column `col_2` --> csvcut -c col_2 file1.csv
next(pr.cut(records, ['col_2'])) 
Out[22]:
{'col_2': 'dill'}
In [23]:
# Select all data whose value for column `col_2` contains `jan`
# --> csvgrep -c col_2 -m jan file1.csv
next(pr.grep(records, [{'pattern': 'jan'}], ['col_2']))
Out[23]:
{'col_1': '3', 'col_2': 'jane', 'col_3': 'female'}
In [24]:
# Convert a csv file to json --> csvjson -i 4 file1.csv
f_json = StringIO()
io.write(f_json, cv.records2json(records))
f_json.getvalue()
Out[24]:
'[{"col_2": "dill", "col_3": "male", "col_1": "1"}, {"col_2": "bob", "col_3": "male", "col_1": "2"}, {"col_2": "jane", "col_3": "female", "col_1": "3"}]'

Geo processing (à la mapbox)

In the following example, mapbox equivalent commands are preceded by -->.

In [25]:
# First create a geojson file-like object containing two point features
f = StringIO(
    '{"type": "FeatureCollection","features": ['
    '{"type": "Feature", "id": 11, "geometry": '
    '{"type": "Point", "coordinates": [10, 20]}},'
    '{"type": "Feature", "id": 12, "geometry": '
    '{"type": "Point", "coordinates": [5, 15]}}]}')

f.seek(0)
Out[25]:
0

Now we can open the file, split the data by id, and finally convert the split data to a new geojson file-like object.

In [26]:
# Load the geojson file and peek at the results
records, peek = pr.peek(io.read_geojson(f))
peek[0] 
Out[26]:
{'id': 11, 'lat': 20, 'lon': 10, 'type': 'Point'}
In [27]:
# Split the records by feature ``id`` and select the first feature --> geojsplit -k id file.geojson
splits = pr.split(records, 'id')
feature_records, name = next(splits)
name
Out[27]:
11
In [28]:
# Convert the feature records back into a GeoJSON file-like object
geojson = cv.records2geojson(feature_records)
geojson.readline()
Out[28]:
'{"type": "FeatureCollection", "bbox": [10, 20, 10, 20], "features": [{"type": "Feature", "id": 11, "geometry": {"type": "Point", "coordinates": [10, 20]}, "properties": {"id": 11}}], "crs": {"type": "name", "properties": {"name": "urn:ogc:def:crs:OGC:1.3:CRS84"}}}'

Note: you can also write back to a file as shown previously: io.write('file.geojson', geojson).

Writing Data

meza can persist records to disk via the following functions:

  • meza.convert.records2csv
  • meza.convert.records2json
  • meza.convert.records2geojson

Each function returns a file-like object that you can write to disk via meza.io.write('/path/to/file', result).

In [29]:
# First let's create a simple TSV file-like object
f = StringIO('col1\tcol2\nhello\tworld\n')
f.seek(0)

# Next create a records list so we can reuse it
records = list(io.read_tsv(f))
records[0] 
Out[29]:
{'col1': 'hello', 'col2': 'world'}

Now we're ready to write the records data to file

In [30]:
# Create a CSV file-like object
f_out = cv.records2csv(records)
f_out.readline().strip()
Out[30]:
'col1,col2'
In [31]:
# Create a JSON file-like object
f_out = cv.records2json(records)
f_out.readline()
Out[31]:
'[{"col1": "hello", "col2": "world"}]'

Interoperability

meza plays nicely with native and NumPy arrays out of the box

In [32]:
# First create some records and types. Also, convert the records to a list
# so we can reuse them.
records = [{'a': 'one', 'b': 2}, {'a': 'five', 'b': 10, 'c': 20.1}]
records, result = pr.detect_types(records)
records, types = list(records), result['types']
types
Out[32]:
[{'id': 'b', 'type': 'int'},
 {'id': 'c', 'type': 'float'},
 {'id': 'a', 'type': 'text'}]
In [33]:
# Convert records to a list of arrays
narray = cv.records2array(records, types, native=True)
narray
Out[33]:
[[array('u', 'b'), array('u', 'c'), array('u', 'a')],
 array('i', [2, 10]),
 array('f', [0.0, 20.100000381469727]),
 [array('u', 'one'), array('u', 'five')]]
In [34]:
# Convert list of arrays back to records. Note: the `c` value missing
# from the first record comes back as 0.0 after the round trip
next(cv.array2records(narray, native=True)) 
Out[34]:
{'a': 'one', 'b': 2, 'c': 0.0}
In [35]:
# Convert a 2-D array to records
arr = [(1, 4), (2, 5), (3, 6)]
data = [array('i', a) for a in arr]
data
Out[35]:
[array('i', [1, 4]), array('i', [2, 5]), array('i', [3, 6])]
In [36]:
# Each input array is treated as a column, so the first record contains
# the first element of every array
next(cv.array2records(data, native=True)) 
Out[36]:
{'column_1': 1, 'column_2': 2, 'column_3': 3}