In [1]:
# Just an idea

Original test.ldj

Valid JSON, only line by line

{"KEY3": "qux0", "KEY2": "baz0", "KEY1": "bar0", "KEY0": "foo0", "POS": 5, "CHROM": "chr3"}
{"KEY3": "qux1", "KEY2": "baz1", "KEY1": "bar1", "KEY0": "foo1", "POS": 6, "CHROM": "chr12"}
{"KEY3": "qux2", "KEY2": "baz2", "KEY1": "bar2", "KEY0": "foo2", "POS": 9, "CHROM": "chr1"}
{"KEY3": "qux3", "KEY2": "baz3", "KEY1": "bar3", "KEY0": "foo3", "POS": 8, "CHROM": "chr12"}
{"KEY3": "qux4", "KEY2": "baz4", "KEY1": "bar4", "KEY0": "foo4", "POS": 3, "CHROM": "chrX"}
{"KEY3": "qux5", "KEY2": "baz5", "KEY1": "bar5", "KEY0": "foo5", "POS": 2, "CHROM": "chrX"}
{"KEY3": "qux6", "KEY2": "baz6", "KEY1": "bar6", "KEY0": "foo6", "POS": 10, "CHROM": "chr3"}
{"KEY3": "qux7", "KEY2": "baz7", "KEY1": "bar7", "KEY0": "foo7", "POS": 9, "CHROM": "chr1"}
{"KEY3": "qux8", "KEY2": "baz8", "KEY1": "bar8", "KEY0": "foo8", "POS": 0, "CHROM": "chr12"}
{"KEY3": "qux9", "KEY2": "baz9", "KEY1": "bar9", "KEY0": "foo9", "POS": 3, "CHROM": "chr3"}
{"KEY3": "qux10", "KEY2": "baz10", "KEY1": "bar10", "KEY0": "foo10", "POS": 8, "CHROM": "chr12"}
{"KEY3": "qux11", "KEY2": "baz11", "KEY1": "bar11", "KEY0": "foo11", "POS": 7, "CHROM": "chrX"}
{"KEY3": "qux12", "KEY2": "baz12", "KEY1": "bar12", "KEY0": "foo12", "POS": 6, "CHROM": "chrX"}
{"KEY3": "qux13", "KEY2": "baz13", "KEY1": "bar13", "KEY0": "foo13", "POS": 10, "CHROM": "chr12"}
{"KEY3": "qux14", "KEY2": "baz14", "KEY1": "bar14", "KEY0": "foo14", "POS": 3, "CHROM": "chrX"}
{"KEY3": "qux15", "KEY2": "baz15", "KEY1": "bar15", "KEY0": "foo15", "POS": 10, "CHROM": "chr3"}
{"KEY3": "qux16", "KEY2": "baz16", "KEY1": "bar16", "KEY0": "foo16", "POS": 2, "CHROM": "chr12"}
{"KEY3": "qux17", "KEY2": "baz17", "KEY1": "bar17", "KEY0": "foo17", "POS": 0, "CHROM": "chr1"}
{"KEY3": "qux18", "KEY2": "baz18", "KEY1": "bar18", "KEY0": "foo18", "POS": 6, "CHROM": "chr3"}
{"KEY3": "qux19", "KEY2": "baz19", "KEY1": "bar19", "KEY0": "foo19", "POS": 0, "CHROM": "chr3"}

Converted to json

Everyline gets a comma, except the last one and it becomes a list

[{"KEY3": "qux0", "KEY2": "baz0", "KEY1": "bar0", "KEY0": "foo0", "POS": 5, "CHROM": "chr3"},
{"KEY3": "qux1", "KEY2": "baz1", "KEY1": "bar1", "KEY0": "foo1", "POS": 6, "CHROM": "chr12"},
{"KEY3": "qux2", "KEY2": "baz2", "KEY1": "bar2", "KEY0": "foo2", "POS": 9, "CHROM": "chr1"},
{"KEY3": "qux3", "KEY2": "baz3", "KEY1": "bar3", "KEY0": "foo3", "POS": 8, "CHROM": "chr12"},
{"KEY3": "qux4", "KEY2": "baz4", "KEY1": "bar4", "KEY0": "foo4", "POS": 3, "CHROM": "chrX"},
{"KEY3": "qux5", "KEY2": "baz5", "KEY1": "bar5", "KEY0": "foo5", "POS": 2, "CHROM": "chrX"},
{"KEY3": "qux6", "KEY2": "baz6", "KEY1": "bar6", "KEY0": "foo6", "POS": 10, "CHROM": "chr3"},
{"KEY3": "qux7", "KEY2": "baz7", "KEY1": "bar7", "KEY0": "foo7", "POS": 9, "CHROM": "chr1"},
{"KEY3": "qux8", "KEY2": "baz8", "KEY1": "bar8", "KEY0": "foo8", "POS": 0, "CHROM": "chr12"},
{"KEY3": "qux9", "KEY2": "baz9", "KEY1": "bar9", "KEY0": "foo9", "POS": 3, "CHROM": "chr3"},
{"KEY3": "qux10", "KEY2": "baz10", "KEY1": "bar10", "KEY0": "foo10", "POS": 8, "CHROM": "chr12"},
{"KEY3": "qux11", "KEY2": "baz11", "KEY1": "bar11", "KEY0": "foo11", "POS": 7, "CHROM": "chrX"},
{"KEY3": "qux12", "KEY2": "baz12", "KEY1": "bar12", "KEY0": "foo12", "POS": 6, "CHROM": "chrX"},
{"KEY3": "qux13", "KEY2": "baz13", "KEY1": "bar13", "KEY0": "foo13", "POS": 10, "CHROM": "chr12"},
{"KEY3": "qux14", "KEY2": "baz14", "KEY1": "bar14", "KEY0": "foo14", "POS": 3, "CHROM": "chrX"},
{"KEY3": "qux15", "KEY2": "baz15", "KEY1": "bar15", "KEY0": "foo15", "POS": 10, "CHROM": "chr3"},
{"KEY3": "qux16", "KEY2": "baz16", "KEY1": "bar16", "KEY0": "foo16", "POS": 2, "CHROM": "chr12"},
{"KEY3": "qux17", "KEY2": "baz17", "KEY1": "bar17", "KEY0": "foo17", "POS": 0, "CHROM": "chr1"},
{"KEY3": "qux18", "KEY2": "baz18", "KEY1": "bar18", "KEY0": "foo18", "POS": 6, "CHROM": "chr3"},
{"KEY3": "qux19", "KEY2": "baz19", "KEY1": "bar19", "KEY0": "foo19", "POS": 0, "CHROM": "chr3"}]
In [2]:
# Now we can put it in a pandas dataframe
In [3]:
converted_json = """[{"KEY3": "qux0", "KEY2": "baz0", "KEY1": "bar0", "KEY0": "foo0", "POS": 5, "CHROM": "chr3"},
{"KEY3": "qux1", "KEY2": "baz1", "KEY1": "bar1", "KEY0": "foo1", "POS": 6, "CHROM": "chr12"},
{"KEY3": "qux2", "KEY2": "baz2", "KEY1": "bar2", "KEY0": "foo2", "POS": 9, "CHROM": "chr1"},
{"KEY3": "qux3", "KEY2": "baz3", "KEY1": "bar3", "KEY0": "foo3", "POS": 8, "CHROM": "chr12"},
{"KEY3": "qux4", "KEY2": "baz4", "KEY1": "bar4", "KEY0": "foo4", "POS": 3, "CHROM": "chrX"},
{"KEY3": "qux5", "KEY2": "baz5", "KEY1": "bar5", "KEY0": "foo5", "POS": 2, "CHROM": "chrX"},
{"KEY3": "qux6", "KEY2": "baz6", "KEY1": "bar6", "KEY0": "foo6", "POS": 10, "CHROM": "chr3"},
{"KEY3": "qux7", "KEY2": "baz7", "KEY1": "bar7", "KEY0": "foo7", "POS": 9, "CHROM": "chr1"},
{"KEY3": "qux8", "KEY2": "baz8", "KEY1": "bar8", "KEY0": "foo8", "POS": 0, "CHROM": "chr12"},
{"KEY3": "qux9", "KEY2": "baz9", "KEY1": "bar9", "KEY0": "foo9", "POS": 3, "CHROM": "chr3"},
{"KEY3": "qux10", "KEY2": "baz10", "KEY1": "bar10", "KEY0": "foo10", "POS": 8, "CHROM": "chr12"},
{"KEY3": "qux11", "KEY2": "baz11", "KEY1": "bar11", "KEY0": "foo11", "POS": 7, "CHROM": "chrX"},
{"KEY3": "qux12", "KEY2": "baz12", "KEY1": "bar12", "KEY0": "foo12", "POS": 6, "CHROM": "chrX"},
{"KEY3": "qux13", "KEY2": "baz13", "KEY1": "bar13", "KEY0": "foo13", "POS": 10, "CHROM": "chr12"},
{"KEY3": "qux14", "KEY2": "baz14", "KEY1": "bar14", "KEY0": "foo14", "POS": 3, "CHROM": "chrX"},
{"KEY3": "qux15", "KEY2": "baz15", "KEY1": "bar15", "KEY0": "foo15", "POS": 10, "CHROM": "chr3"},
{"KEY3": "qux16", "KEY2": "baz16", "KEY1": "bar16", "KEY0": "foo16", "POS": 2, "CHROM": "chr12"},
{"KEY3": "qux17", "KEY2": "baz17", "KEY1": "bar17", "KEY0": "foo17", "POS": 0, "CHROM": "chr1"},
{"KEY3": "qux18", "KEY2": "baz18", "KEY1": "bar18", "KEY0": "foo18", "POS": 6, "CHROM": "chr3"},
{"KEY3": "qux19", "KEY2": "baz19", "KEY1": "bar19", "KEY0": "foo19", "POS": 0, "CHROM": "chr3"}]"""
In [10]:
# faking file, so everything stays in the notebook
import cStringIO

json_input = cStringIO.StringIO()
json_input.write(converted_json)
In [11]:
import pandas as pd
In [13]:
df = pd.read_json(json_input.getvalue())
In [14]:
df.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 20 entries, 0 to 19
Data columns (total 6 columns):
CHROM    20 non-null object
KEY0     20 non-null object
KEY1     20 non-null object
KEY2     20 non-null object
KEY3     20 non-null object
POS      20 non-null int64
dtypes: int64(1), object(5)
In [16]:
# sort by KEY1
df.sort(columns="KEY1")
Out[16]:
CHROM KEY0 KEY1 KEY2 KEY3 POS
0 chr3 foo0 bar0 baz0 qux0 5
1 chr12 foo1 bar1 baz1 qux1 6
10 chr12 foo10 bar10 baz10 qux10 8
11 chrX foo11 bar11 baz11 qux11 7
12 chrX foo12 bar12 baz12 qux12 6
13 chr12 foo13 bar13 baz13 qux13 10
14 chrX foo14 bar14 baz14 qux14 3
15 chr3 foo15 bar15 baz15 qux15 10
16 chr12 foo16 bar16 baz16 qux16 2
17 chr1 foo17 bar17 baz17 qux17 0
18 chr3 foo18 bar18 baz18 qux18 6
19 chr3 foo19 bar19 baz19 qux19 0
2 chr1 foo2 bar2 baz2 qux2 9
3 chr12 foo3 bar3 baz3 qux3 8
4 chrX foo4 bar4 baz4 qux4 3
5 chrX foo5 bar5 baz5 qux5 2
6 chr3 foo6 bar6 baz6 qux6 10
7 chr1 foo7 bar7 baz7 qux7 9
8 chr12 foo8 bar8 baz8 qux8 0
9 chr3 foo9 bar9 baz9 qux9 3
In [17]:
# view only chr1
df[df.CHROM=="chr1"]
Out[17]:
CHROM KEY0 KEY1 KEY2 KEY3 POS
2 chr1 foo2 bar2 baz2 qux2 9
7 chr1 foo7 bar7 baz7 qux7 9
17 chr1 foo17 bar17 baz17 qux17 0
In [ ]: