In [1]:
%matplotlib inline

import collections
import os

os.environ['SQLALCHEMY_WARN_20'] = 'true'

import pandas as pd
import sqlalchemy as sa
import matplotlib as mpl
import matplotlib.pyplot as plt

import treedb
import treedb.raw

treedb.configure_logging(log_sql=False)

treedb.print_versions()
treedb version: 2.3.1.dev0
sqlalchemy version: 1.4.15
sqlite_version: 3.35.5
csv23 version: 0.3.2

Clone the data repository

In [2]:
%%time

GLOTTOLOG_TAG = 'v4.4'

TARGET = '../glottolog/'

treedb.checkout_or_clone(GLOTTOLOG_TAG, target=TARGET)
Wall time: 33.9 s
Out[2]:
(CompletedProcess(args=['git', 'clone', '-c', 'advice.detachedHead=false', '--single-branch', '--branch', 'v4.4', '--depth', '1', 'https://github.com/glottolog/glottolog.git', WindowsPath('../glottolog')], returncode=0),
 CompletedProcess(args=['git', 'checkout', '-B', 'treedb', 'v4.4'], returncode=0))

Use the repository root as source

In [3]:
treedb.set_root(TARGET)
Out[3]:
<treedb._proxies.PathProxy path='../glottolog/languoids/tree' inode=42221246506743328>
In [4]:
next(treedb.iterfiles())
Out[4]:
FileInfo(path=('abin1243',), config=<treedb.files.ConfigParser object at 0x000001D657DD8BE0>, dentry=<DirEntry 'md.ini'>)
In [5]:
dict(treedb.iterlanguoids(limit=1))
Out[5]:
{('abin1243',): {'id': 'abin1243',
  'parent_id': None,
  'level': 'language',
  'name': 'Abinomn',
  'hid': 'bsa',
  'iso639_3': 'bsa',
  'latitude': -2.92281,
  'longitude': 138.891,
  'macroareas': ['Papunesia'],
  'countries': [{'id': 'ID', 'name': 'Indonesia'}],
  'links': [{'url': 'http://endangeredlanguages.com/lang/1763',
    'title': 'Abinomn',
    'scheme': 'http'},
   {'url': 'https://www.wikidata.org/entity/Q56648',
    'title': None,
    'scheme': 'https'},
   {'url': 'https://en.wikipedia.org/wiki/Abinomn_language',
    'title': None,
    'scheme': 'https'}],
  'timespan': None,
  'sources': {'glottolog': [{'bibfile': 'hh',
     'bibkey': 'e:Lagerberg:Moegip',
     'pages': None,
     'trigger': None},
    {'bibfile': 'hh',
     'bibkey': 'h:SilzerClouse:Index',
     'pages': None,
     'trigger': None},
    {'bibfile': 'hh',
     'bibkey': 'h:SilzerHeikkinen:Irian',
     'pages': None,
     'trigger': None},
    {'bibfile': 'hh',
     'bibkey': 'hv:Foley:Northwest-New-Guinea',
     'pages': None,
     'trigger': None},
    {'bibfile': 'hh',
     'bibkey': 'hvtyp:DonohueMusgrave:Melanesia',
     'pages': None,
     'trigger': None},
    {'bibfile': 'hh',
     'bibkey': 'w:Fiwei:Abinomn',
     'pages': None,
     'trigger': None}]},
  'altnames': {'multitree': [{'name': '"Baso"', 'lang': None},
    {'name': 'Abinomn', 'lang': None},
    {'name': 'Avinomen', 'lang': None},
    {'name': 'Foja', 'lang': None},
    {'name': 'Foya', 'lang': None}],
   'lexvo': [{'name': 'Abinomn', 'lang': 'en'},
    {'name': 'Abinomn language', 'lang': 'en'},
    {'name': 'Abinomneg', 'lang': 'br'},
    {'name': 'Lingua abinomn', 'lang': 'gl'},
    {'name': 'Llingua Abinomn', 'lang': 'ast'}],
   'hhbib_lgcode': [{'name': 'Baso', 'lang': None}],
   'elcat': [{'name': '"Baso"', 'lang': None},
    {'name': 'Abinomn', 'lang': None},
    {'name': 'Avinomen', 'lang': None},
    {'name': 'Foja', 'lang': None},
    {'name': 'Foya', 'lang': None}]},
  'triggers': {'lgcode': ['macrohistory', 'moegip']},
  'identifier': {'multitree': 'bsa', 'endangeredlanguages': '1763'},
  'classification': {'familyrefs': [{'bibfile': 'hh',
     'bibkey': 'h:SilzerClouse:Index',
     'pages': None,
     'trigger': None},
    {'bibfile': 'hh',
     'bibkey': 'hvtyp:DonohueMusgrave:Melanesia',
     'pages': None,
     'trigger': None}]},
  'endangerment': {'status': 'shifting',
   'source': {'name': 'ElCat', 'bibfile': None, 'bibkey': None, 'pages': None},
   'date': datetime.datetime(2017, 8, 19, 8, 16, 16),
   'comment': 'Abinomn (1763-bsa) = Endangered (20 percent certain, based on the evidence available)'},
  'hh_ethnologue_comment': None,
  'iso_retirement': None}}
In [6]:
%time treedb.checksum(source='files')
Wall time: 16.5 s
Out[6]:
'path_languoid:path:sha256:289247f73d5bf57f0a7dc8e0e2473b5625ace605dbad57b3c06fe93063d47599'

Set the database file and load it

In [7]:
treedb.set_engine('treedb.sqlite3')
Out[7]:
<treedb._proxies.SQLiteEngineProxy filename='treedb.sqlite3' size=None>
In [8]:
%%time

engine = treedb.load(rebuild=False, exclude_raw=False)
engine
0:00:49.079509
Wall time: 49.4 s
Out[8]:
<treedb._proxies.SQLiteEngineProxy filename='treedb.sqlite3' size=70512640>

Check the database

In [9]:
engine.file_mtime()
Out[9]:
datetime.datetime(2021, 5, 15, 16, 1, 45, 15915)
In [10]:
engine.file_size(as_megabytes=True)
Out[10]:
67.24609375
In [11]:
%time engine.file_sha256()
Wall time: 49 ms
Out[11]:
'ee6c9b4612100e941d28c02f96c804f0edfd7934298fd2d4bd1493ea6659f8d1'
In [12]:
%time treedb.raw.checksum()
Wall time: 106 ms
Out[12]:
'strong:sha256:3c95e5081c8d689980637506e363f99904798aa1a5ceb69eb1187b8eaff53709'
In [13]:
%time treedb.raw.checksum(weak=True)
Wall time: 2.91 s
Out[13]:
'weak:sha256:690dd35d5de6a0841f3d3f8cc8c9853382797c73e082a418c1b9ad77bd7495bb'
In [14]:
%time treedb.checksum(source='tables')
Wall time: 4.23 s
Out[14]:
'path_languoid:path:sha256:289247f73d5bf57f0a7dc8e0e2473b5625ace605dbad57b3c06fe93063d47599'
In [15]:
%time treedb.checksum(source='raw')
Wall time: 4.8 s
Out[15]:
'path_languoid:path:sha256:289247f73d5bf57f0a7dc8e0e2473b5625ace605dbad57b3c06fe93063d47599'
In [16]:
%time treedb.check()
valid_pseudofamily_referencesCheck: OK
pseudofamilies_are_rootsCheck: OK
valid_glottocodeCheck: OK
valid_iso639_3Check: OK
valid_hidCheck: OK
clean_nameCheck: OK
family_parentCheck: OK
language_parentCheck: OK
dialect_parentCheck: OK
family_childrenCheck: OK
family_languagesCheck: OK
bookkeeping_no_childrenCheck: OK
no_empty_filesCheck: OK
Wall time: 904 ms
Out[16]:
True

treedb.sqlite3

In [17]:
treedb.configure_logging(level='INFO', log_sql=True)

treedb.scalar(sa.select(sa.func.sqlite_version()))
[[email protected]_] treedb version: 2.3.1.dev0
BEGIN (implicit)
SELECT sqlite_version() AS sqlite_version_1
[generated in 0.00090s] ()
ROLLBACK
Out[17]:
'3.35.5'
In [18]:
application_id = treedb.scalar(sa.text('PRAGMA application_id'))

assert application_id == 1122 == 0x462
assert application_id == sum(ord(c) for c in treedb.Dataset.__tablename__)
assert treedb.Dataset.__tablename__ == '__dataset__'

application_id
BEGIN (implicit)
PRAGMA application_id
[generated in 0.00061s] ()
ROLLBACK
Out[18]:
1122
In [19]:
from treedb import pd_read_sql as read_sql

read_sql(treedb.select_tables_nrows(), index_col='table_name')
BEGIN (implicit)
SELECT sqlite_master.name 
FROM sqlite_master 
WHERE sqlite_master.type = ? AND sqlite_master.name NOT LIKE ? ORDER BY sqlite_master.name
[generated in 0.00083s] ('table', 'sqlite_%')
ROLLBACK
[[email protected]] pandas version: 1.2.4
BEGIN (implicit)
SELECT ? AS table_name, (SELECT count(*) AS n 
FROM __dataset__) AS n_rows UNION ALL SELECT ? AS table_name, (SELECT count(*) AS n 
FROM __producer__) AS n_rows UNION ALL SELECT ? AS table_name, (SELECT count(*) AS n 
FROM _config) AS n_rows UNION ALL SELECT ? AS table_name, (SELECT count(*) AS n 
FROM _file) AS n_rows UNION ALL SELECT ? AS table_name, (SELECT count(*) AS n 
FROM _option) AS n_rows UNION ALL SELECT ? AS table_name, (SELECT count(*) AS n 
FROM _value) AS n_rows UNION ALL SELECT ? AS table_name, (SELECT count(*) AS n 
FROM altname) AS n_rows UNION ALL SELECT ? AS table_name, (SELECT count(*) AS n 
FROM altnameprovider) AS n_rows UNION ALL SELECT ? AS table_name, (SELECT count(*) AS n 
FROM bibfile) AS n_rows UNION ALL SELECT ? AS table_name, (SELECT count(*) AS n 
FROM bibitem) AS n_rows UNION ALL SELECT ? AS table_name, (SELECT count(*) AS n 
FROM classificationcomment) AS n_rows UNION ALL SELECT ? AS table_name, (SELECT count(*) AS n 
FROM classificationref) AS n_rows UNION ALL SELECT ? AS table_name, (SELECT count(*) AS n 
FROM country) AS n_rows UNION ALL SELECT ? AS table_name, (SELECT count(*) AS n 
FROM endangerment) AS n_rows UNION ALL SELECT ? AS table_name, (SELECT count(*) AS n 
FROM endangermentsource) AS n_rows UNION ALL SELECT ? AS table_name, (SELECT count(*) AS n 
FROM endangermentstatus) AS n_rows UNION ALL SELECT ? AS table_name, (SELECT count(*) AS n 
FROM ethnologuecomment) AS n_rows UNION ALL SELECT ? AS table_name, (SELECT count(*) AS n 
FROM identifier) AS n_rows UNION ALL SELECT ? AS table_name, (SELECT count(*) AS n 
FROM identifiersite) AS n_rows UNION ALL SELECT ? AS table_name, (SELECT count(*) AS n 
FROM isoretirement) AS n_rows UNION ALL SELECT ? AS table_name, (SELECT count(*) AS n 
FROM isoretirement_changeto) AS n_rows UNION ALL SELECT ? AS table_name, (SELECT count(*) AS n 
FROM languoid) AS n_rows UNION ALL SELECT ? AS table_name, (SELECT count(*) AS n 
FROM languoid_country) AS n_rows UNION ALL SELECT ? AS table_name, (SELECT count(*) AS n 
FROM languoid_macroarea) AS n_rows UNION ALL SELECT ? AS table_name, (SELECT count(*) AS n 
FROM languoidlevel) AS n_rows UNION ALL SELECT ? AS table_name, (SELECT count(*) AS n 
FROM link) AS n_rows UNION ALL SELECT ? AS table_name, (SELECT count(*) AS n 
FROM macroarea) AS n_rows UNION ALL SELECT ? AS table_name, (SELECT count(*) AS n 
FROM pseudofamily) AS n_rows UNION ALL SELECT ? AS table_name, (SELECT count(*) AS n 
FROM source) AS n_rows UNION ALL SELECT ? AS table_name, (SELECT count(*) AS n 
FROM sourceprovider) AS n_rows UNION ALL SELECT ? AS table_name, (SELECT count(*) AS n 
FROM timespan) AS n_rows UNION ALL SELECT ? AS table_name, (SELECT count(*) AS n 
FROM "trigger") AS n_rows
[generated in 0.00107s] ('__dataset__', '__producer__', '_config', '_file', '_option', '_value', 'altname', 'altnameprovider', 'bibfile', 'bibitem', 'classificationcomment', 'classificationref', 'country', 'endangerment', 'endangermentsource', 'endangermentstatus', 'ethnologuecomment', 'identifier', 'identifiersite', 'isoretirement', 'isoretirement_changeto', 'languoid', 'languoid_country', 'languoid_macroarea', 'languoidlevel', 'link', 'macroarea', 'pseudofamily', 'source', 'sourceprovider', 'timespan', 'trigger')
ROLLBACK
Out[19]:
n_rows
table_name
__dataset__ 1
__producer__ 1
_config 353
_file 25900
_option 53
_value 614595
altname 154719
altnameprovider 11
bibfile 38
bibitem 145308
classificationcomment 11936
classificationref 16992
country 246
endangerment 8254
endangermentsource 44
endangermentstatus 6
ethnologuecomment 614
identifier 21406
identifiersite 4
isoretirement 350
isoretirement_changeto 441
languoid 25900
languoid_country 11377
languoid_macroarea 21280
languoidlevel 3
link 29704
macroarea 6
pseudofamily 8
source 192520
sourceprovider 1
timespan 144
trigger 30276
In [20]:
treedb.print_rows(treedb.backend.sqlite_master.select_views(),
                  format_='{name}')
BEGIN (implicit)
SELECT sqlite_master.name 
FROM sqlite_master 
WHERE sqlite_master.type = ? AND sqlite_master.name NOT LIKE ? ORDER BY sqlite_master.name
[cached since 0.05537s ago] ('view', 'sqlite_%')
ROLLBACK
example
path_languoid
stats
In [21]:
from treedb import print_table_sql as print_sql

print_sql('sqlite_master')
BEGIN (implicit)
SELECT sqlite_master.sql 
FROM sqlite_master 
WHERE sqlite_master.type = ? AND sqlite_master.name = ?
[generated in 0.00097s] ('table', 'sqlite_master')
SELECT count(*) AS n_rows 
FROM sqlite_master
[generated in 0.00053s] ()
ROLLBACK
None
66
In [22]:
treedb.print_dataset()
BEGIN (implicit)
SELECT __dataset__.id, __dataset__.title, __dataset__.git_commit, __dataset__.git_describe, __dataset__.clean, __dataset__.version, __dataset__.exclude_raw 
FROM __dataset__
[generated in 0.00085s] ()
SELECT __producer__.id, __producer__.name, __producer__.version 
FROM __producer__
[cached since 13.15s ago] ()
ROLLBACK
[[email protected]] git describe 'v4.4' clean: True
[[email protected]] __dataset__.git_commit: 'c8fd3c0891a61e0f167ceb9c5f6e9418bc2f8a34'
[[email protected]] __dataset__.version: '4.4'
[[email protected]] __producer__.name: treedb
[[email protected]] __producer__.version: 2.3.1.dev0
git describe 'v4.4' clean: True
__dataset__.title: 'Glottolog treedb''
__dataset__.git_commit: 'c8fd3c0891a61e0f167ceb9c5f6e9418bc2f8a34'
__dataset__.version: '4.4'
__dataset__.exclude_raw: False
__producer__.name: treedb
__producer__.version: 2.3.1.dev0

__dataset__

In [23]:
from treedb import Dataset

print_sql(Dataset)
BEGIN (implicit)
SELECT sqlite_master.sql 
FROM sqlite_master 
WHERE sqlite_master.type = ? AND sqlite_master.name = ?
[cached since 0.02105s ago] ('table', '__dataset__')
SELECT count(*) AS n_rows 
FROM __dataset__
[generated in 0.00046s] ()
ROLLBACK
CREATE TABLE __dataset__ (
	id INTEGER NOT NULL CHECK (id = 1), 
	title TEXT NOT NULL CHECK (title != ''), 
	git_commit VARCHAR(40) NOT NULL CHECK (length(git_commit) = 40), 
	git_describe TEXT NOT NULL CHECK (git_describe != ''), 
	clean BOOLEAN NOT NULL, 
	version TEXT CHECK (version != ''), 
	exclude_raw BOOLEAN NOT NULL, 
	PRIMARY KEY (id), 
	UNIQUE (git_commit), 
	UNIQUE (git_describe), 
	CHECK (clean IN (0, 1)), 
	CHECK (exclude_raw IN (0, 1))
)
1
In [24]:
dataset, = treedb.iterrows(sa.select(Dataset), mappings=True)

pd.DataFrame.from_dict(dataset, orient='index',
                       columns=['__dataset__'])
BEGIN (implicit)
SELECT __dataset__.id, __dataset__.title, __dataset__.git_commit, __dataset__.git_describe, __dataset__.clean, __dataset__.version, __dataset__.exclude_raw 
FROM __dataset__
[cached since 0.02269s ago] ()
ROLLBACK
Out[24]:
__dataset__
id 1
title Glottolog treedb
git_commit c8fd3c0891a61e0f167ceb9c5f6e9418bc2f8a34
git_describe v4.4
clean True
version 4.4
exclude_raw False

__producer__

In [25]:
from treedb import Producer

print_sql(Producer)
BEGIN (implicit)
SELECT sqlite_master.sql 
FROM sqlite_master 
WHERE sqlite_master.type = ? AND sqlite_master.name = ?
[cached since 0.04303s ago] ('table', '__producer__')
SELECT count(*) AS n_rows 
FROM __producer__
[generated in 0.00046s] ()
ROLLBACK
CREATE TABLE __producer__ (
	id INTEGER NOT NULL CHECK (id = 1), 
	name TEXT NOT NULL CHECK (name != ''), 
	version TEXT NOT NULL CHECK (version != ''), 
	PRIMARY KEY (id), 
	UNIQUE (name)
)
1
In [26]:
producer, = treedb.iterrows(sa.select(Producer), mappings=True)

pd.DataFrame.from_dict(producer, orient='index',
                       columns=['__producer__'])
BEGIN (implicit)
SELECT __producer__.id, __producer__.name, __producer__.version 
FROM __producer__
[cached since 13.19s ago] ()
ROLLBACK
Out[26]:
__producer__
id 1
name treedb
version 2.3.1.dev0

treedb.raw

In [27]:
from treedb.raw import File, Option, Value

for model in (File, Option, Value):
    print_sql(model)
BEGIN (implicit)
SELECT sqlite_master.sql 
FROM sqlite_master 
WHERE sqlite_master.type = ? AND sqlite_master.name = ?
[cached since 0.06334s ago] ('table', '_file')
SELECT count(*) AS n_rows 
FROM _file
[generated in 0.00044s] ()
ROLLBACK
CREATE TABLE _file (
	id INTEGER NOT NULL, 
	glottocode VARCHAR(8) NOT NULL CHECK (length(glottocode) = 8), 
	path TEXT NOT NULL CHECK (length(path) >= 8 AND (length(path) + 1) % 9 = 0), 
	size INTEGER NOT NULL CHECK (size > 0), 
	sha256 VARCHAR(64) NOT NULL CHECK (length(sha256) = 64), 
	PRIMARY KEY (id), 
	CHECK (substr(path, -length(glottocode)) = glottocode), 
	UNIQUE (glottocode), 
	UNIQUE (path), 
	UNIQUE (sha256)
)
25900
BEGIN (implicit)
SELECT sqlite_master.sql 
FROM sqlite_master 
WHERE sqlite_master.type = ? AND sqlite_master.name = ?
[cached since 0.07154s ago] ('table', '_option')
SELECT count(*) AS n_rows 
FROM _option
[generated in 0.00042s] ()
ROLLBACK
CREATE TABLE _option (
	id INTEGER NOT NULL, 
	section TEXT NOT NULL CHECK (section != ''), 
	option TEXT NOT NULL CHECK (option != ''), 
	is_lines BOOLEAN, 
	defined BOOLEAN NOT NULL, 
	defined_any_options BOOLEAN NOT NULL, 
	ord_section INTEGER CHECK (ord_section >= 1), 
	ord_option INTEGER CHECK (ord_section >= 0), 
	PRIMARY KEY (id), 
	UNIQUE (section, option), 
	CHECK ((is_lines IS NULL) = (defined = 0)), 
	CHECK (defined = 1 OR defined_any_options = 0), 
	CHECK ((defined = 0) = (ord_section IS NULL)), 
	CHECK (ord_section IS NOT NULL OR ord_section IS NOT NULL), 
	CHECK (is_lines IN (0, 1)), 
	CHECK (defined IN (0, 1)), 
	CHECK (defined_any_options IN (0, 1))
)
53
BEGIN (implicit)
SELECT sqlite_master.sql 
FROM sqlite_master 
WHERE sqlite_master.type = ? AND sqlite_master.name = ?
[cached since 0.07827s ago] ('table', '_value')
SELECT count(*) AS n_rows 
FROM _value
[generated in 0.00038s] ()
ROLLBACK
CREATE TABLE _value (
	file_id INTEGER NOT NULL, 
	option_id INTEGER NOT NULL, 
	line INTEGER NOT NULL CHECK (line > 0), 
	value TEXT NOT NULL CHECK (value != ''), 
	PRIMARY KEY (file_id, option_id, line), 
	UNIQUE (file_id, line), 
	FOREIGN KEY(file_id) REFERENCES _file (id), 
	FOREIGN KEY(option_id) REFERENCES _option (id)
) WITHOUT ROWID
614595
In [28]:
read_sql(sa.select(File).limit(5), index_col='id')
BEGIN (implicit)
SELECT _file.id, _file.glottocode, _file.path, _file.size, _file.sha256 
FROM _file
 LIMIT ? OFFSET ?
[generated in 0.00092s] (5, 0)
ROLLBACK
Out[28]:
glottocode path size sha256
id
1 abin1243 abin1243 1217 6cfb1478a46c7ffd8b555677439fcf96235dce4cfe1524...
2 abis1238 abis1238 1701 120e2da87dfeb04e40ff556b4d5600897cd8b2b68341e1...
3 abkh1242 abkh1242 306 5dd28ae1265a0967b1a236d06c7fd6e7ac3b9e32d866dd...
4 abkh1243 abkh1242/abkh1243 257 a5900355c69af22d94eaa0a147b95fc291270634bc033c...
5 abaz1241 abkh1242/abkh1243/abaz1241 2416 994a62f8e215f84b8c565e30df7667aa451b40ba945506...
In [29]:
read_sql(sa.select(Option).limit(5), index_col='id')
BEGIN (implicit)
SELECT _option.id, _option.section, _option.option, _option.is_lines, _option.defined, _option.defined_any_options, _option.ord_section, _option.ord_option 
FROM _option
 LIMIT ? OFFSET ?
[generated in 0.00097s] (5, 0)
ROLLBACK
Out[29]:
section option is_lines defined defined_any_options ord_section ord_option
id
1 core name False True False 1 1
2 core hid False True False 1 2
3 core level False True False 1 3
4 core iso639-3 False True False 1 4
5 core latitude False True False 1 5
In [30]:
read_sql(sa.select(Value).limit(5), index_col=['file_id', 'option_id'])
BEGIN (implicit)
SELECT _value.file_id, _value.option_id, _value.line, _value.value 
FROM _value
 LIMIT ? OFFSET ?
[generated in 0.00079s] (5, 0)
ROLLBACK
Out[30]:
line value
file_id option_id
1 1 1 Abinomn
2 2 bsa
3 3 language
4 4 bsa
5 5 -2.92281
In [31]:
select_file_values = (sa.select(Option.section, Option.option, Value.line, Value.value)
                      .select_from(File)
                      .filter_by(glottocode=sa.bindparam('glottocode'))
                      .join(Value).join(Option))

read_sql(select_file_values, params={'glottocode': 'abin1243'},
         index_col=['section', 'option', 'line'])
BEGIN (implicit)
SELECT _option.section, _option.option, _value.line, _value.value 
FROM _file JOIN _value ON _file.id = _value.file_id JOIN _option ON _option.id = _value.option_id 
WHERE _file.glottocode = ?
[generated in 0.00079s] ('abin1243',)
ROLLBACK
Out[31]:
value
section option line
core name 1 Abinomn
hid 2 bsa
level 3 language
iso639-3 4 bsa
latitude 5 -2.92281
longitude 6 138.891
macroareas 7 Papunesia
countries 8 ID
links 9 [Abinomn](http://endangeredlanguages.com/lang/...
10 https://www.wikidata.org/entity/Q56648
11 https://en.wikipedia.org/wiki/Abinomn_language
sources glottolog 12 **hh:e:Lagerberg:Moegip**
13 **hh:h:SilzerClouse:Index**
14 **hh:h:SilzerHeikkinen:Irian**
15 **hh:hv:Foley:Northwest-New-Guinea**
16 **hh:hvtyp:DonohueMusgrave:Melanesia**
17 **hh:w:Fiwei:Abinomn**
altnames multitree 18 "Baso"
19 Abinomn
20 Avinomen
21 Foja
22 Foya
lexvo 23 Abinomn [en]
24 Abinomn language [en]
25 Abinomneg [br]
26 Lingua abinomn [gl]
27 Llingua Abinomn [ast]
hhbib_lgcode 28 Baso
elcat 29 "Baso"
30 Abinomn
31 Avinomen
32 Foja
33 Foya
triggers lgcode 34 macrohistory
35 moegip
identifier multitree 36 bsa
endangeredlanguages 37 1763
classification familyrefs 38 **hh:h:SilzerClouse:Index**
39 **hh:hvtyp:DonohueMusgrave:Melanesia**
endangerment status 40 shifting
source 41 ElCat
date 42 2017-08-19T08:16:16
comment 43 Abinomn (1763-bsa) = Endangered (20 percent ce...
In [32]:
path_depth = File.path_depth()

select_path_depths = (sa.select(path_depth,
                                treedb.Languoid.level, sa.func.count().label('n_files'))
                      .join_from(File, treedb.Languoid, File.glottocode == treedb.Languoid.id)
                      .group_by(path_depth, treedb.Languoid.level)
                      .order_by('path_depth', 'level'))

_ = (read_sql(select_path_depths, index_col=['path_depth', 'level'])
     .unstack(fill_value=0).droplevel(0, axis='columns')[list(treedb.LEVEL)])

_.plot.bar(stacked=True, figsize=(12, 3))

(100 * _.div(_.sum(axis='columns'), axis='rows')).plot.bar(stacked=True, figsize=(12, 3));
BEGIN (implicit)
SELECT (length(_file.path) + ?) / ? AS path_depth, languoid.level, count(*) AS n_files 
FROM _file JOIN languoid ON _file.glottocode = languoid.id GROUP BY (length(_file.path) + ?) / ?, languoid.level ORDER BY path_depth, languoid.level
[generated in 0.00075s] (1, 9, 1, 9)
ROLLBACK
[[email protected]] Note: NumExpr detected 32 cores but "NUMEXPR_MAX_THREADS" not set, so enforcing safe limit of 8.
[[email protected]] NumExpr defaulting to 8 threads.
In [33]:
file_size = File.size.label('file_size')

select_file_sizes = (sa.select(file_size, sa.func.count().label('n_files'))
                     .group_by(file_size)
                     .order_by('file_size'))

_ = read_sql(select_file_sizes, index_col='file_size')

(_.plot.area(figsize=(12, 3), logx=True)
 .xaxis.set_major_formatter(mpl.ticker.ScalarFormatter()))
(_.index.to_series().describe()
 .to_frame().transpose()[['count', 'min', 'max']])
BEGIN (implicit)
SELECT _file.size AS file_size, count(*) AS n_files 
FROM _file GROUP BY _file.size ORDER BY file_size
[generated in 0.00113s] ()
ROLLBACK
Out[33]:
count min max
file_size 3225.0 65.0 47133.0
In [34]:
file_nvalues = (sa.select(File.glottocode,
                          sa.func.count(Value.option_id.distinct()).label('n_values'))
                .join_from(File, Value)
                .group_by(File.glottocode)
                .alias('file_nvalues')
                .c.n_values)

select_nvalues = (sa.select(file_nvalues, sa.func.count().label('n_files'))
                  .group_by(file_nvalues)
                  .order_by(file_nvalues))

_ = read_sql(select_nvalues, index_col='n_values')

_.plot.bar(figsize=(12, 3))
(_.index.to_series().describe()
 .to_frame().transpose()[['count', 'min', 'max']])
BEGIN (implicit)
SELECT file_nvalues.n_values, count(*) AS n_files 
FROM (SELECT _file.glottocode AS glottocode, count(DISTINCT _value.option_id) AS n_values 
FROM _file JOIN _value ON _file.id = _value.file_id GROUP BY _file.glottocode) AS file_nvalues GROUP BY file_nvalues.n_values ORDER BY file_nvalues.n_values
[generated in 0.00094s] ()
ROLLBACK
Out[34]:
count min max
n_values 36.0 2.0 37.0
In [35]:
value_length = sa.func.length(Value.value).label('value_length')

select_value_length = (sa.select(value_length, sa.func.count().label('n_values'))
                       .group_by(value_length)
                       .order_by('value_length'))

_ = read_sql(select_value_length, index_col='value_length')

(_.plot.area(figsize=(12, 3), logx=True)
 .xaxis.set_major_formatter(mpl.ticker.ScalarFormatter()))
(_.index.to_series().describe()
 .to_frame().transpose()[['count', 'min', 'max']])
BEGIN (implicit)
SELECT length(_value.value) AS value_length, count(*) AS n_values 
FROM _value GROUP BY length(_value.value) ORDER BY value_length
[generated in 0.00078s] ()
ROLLBACK
Out[35]:
count min max
value_length 633.0 1.0 6215.0
In [36]:
%time treedb.raw.print_stats()
[[email protected]] fetch statistics
BEGIN (implicit)
SELECT _option.section, _option.option, count(*) AS n 
FROM _option JOIN _value ON _option.id = _value.option_id GROUP BY _option.section, _option.option ORDER BY _option.defined DESC, _option.ord_section, _option.ord_option, _option.section, n DESC, _option.option
[generated in 0.00087s] ()
ROLLBACK
core                   name                   25,900
core                   hid                    8,780
core                   level                  25,900
core                   iso639-3               8,064
core                   latitude               8,686
core                   longitude              8,686
core                   macroareas             21,280
core                   countries              11,377
core                   name_comment           14
core                   comment                1
core                   location               1
core                   name_pronunciation     1
core                   speakers               1
core                   links                  29,704
core                   timespan               144
sources                glottolog              192,520
altnames               multitree              54,536
altnames               hhbib_lgcode           32,639
altnames               lexvo                  26,027
altnames               elcat                  24,757
altnames               aiatsis                6,542
altnames               wals                   2,442
altnames               wals other             2,097
altnames               moseley & asher (1994) 2,089
altnames               ruhlen (1987)          2,007
altnames               glottolog              1,580
altnames               ethnologue             3
triggers               lgcode                 30,089
triggers               inlg                   187
identifier             multitree              14,863
identifier             endangeredlanguages    3,584
identifier             wals                   2,620
identifier             languagelandscape      339
classification         sub                    11,726
classification         subrefs                16,286
classification         family                 210
classification         familyrefs             706
endangerment           status                 8,254
endangerment           source                 8,254
endangerment           date                   8,254
endangerment           comment                8,254
hh_ethnologue_comment  isohid                 614
hh_ethnologue_comment  comment_type           614
hh_ethnologue_comment  ethnologue_versions    614
hh_ethnologue_comment  comment                614
iso_retirement         code                   350
iso_retirement         name                   350
iso_retirement         change_request         341
iso_retirement         effective              350
iso_retirement         reason                 350
iso_retirement         change_to              441
iso_retirement         remedy                 321
iso_retirement         comment                232
Wall time: 743 ms

languoid

In [37]:
from treedb import Languoid

print_sql(Languoid)
BEGIN (implicit)
SELECT sqlite_master.sql 
FROM sqlite_master 
WHERE sqlite_master.type = ? AND sqlite_master.name = ?
[cached since 2.695s ago] ('table', 'languoid')
SELECT count(*) AS n_rows 
FROM languoid
[generated in 0.00040s] ()
ROLLBACK
CREATE TABLE languoid (
	id VARCHAR(8) NOT NULL CHECK (length(id) = 8), 
	name VARCHAR NOT NULL CHECK (name != ''), 
	level VARCHAR NOT NULL, 
	parent_id VARCHAR(8), 
	hid TEXT CHECK (length(hid) >= 3), 
	iso639_3 VARCHAR(3) CHECK (length(iso639_3) = 3), 
	latitude FLOAT CHECK (latitude BETWEEN -90 AND 90), 
	longitude FLOAT CHECK (longitude BETWEEN -180 AND 180), 
	PRIMARY KEY (id), 
	CHECK ((latitude IS NULL) = (longitude IS NULL)), 
	UNIQUE (name), 
	FOREIGN KEY(level) REFERENCES languoidlevel (name), 
	FOREIGN KEY(parent_id) REFERENCES languoid (id) DEFERRABLE INITIALLY DEFERRED, 
	UNIQUE (hid), 
	UNIQUE (iso639_3)
) WITHOUT ROWID
25900
In [38]:
from treedb import LEVEL
from treedb.models import LanguoidLevel

FAMILY, LANGUAGE, DIALECT = LEVEL

print_sql(LanguoidLevel)
read_sql(sa.select(LanguoidLevel).order_by('ordinal'), index_col='name')
BEGIN (implicit)
SELECT sqlite_master.sql 
FROM sqlite_master 
WHERE sqlite_master.type = ? AND sqlite_master.name = ?
[cached since 2.708s ago] ('table', 'languoidlevel')
SELECT count(*) AS n_rows 
FROM languoidlevel
[generated in 0.00039s] ()
ROLLBACK
CREATE TABLE languoidlevel (
	name VARCHAR NOT NULL CHECK (name != ''), 
	description TEXT NOT NULL CHECK (description != ''), 
	ordinal INTEGER NOT NULL CHECK (ordinal >= 1), 
	PRIMARY KEY (name)
) WITHOUT ROWID
3
BEGIN (implicit)
SELECT languoidlevel.name, languoidlevel.description, languoidlevel.ordinal 
FROM languoidlevel ORDER BY languoidlevel.ordinal
[generated in 0.00055s] ()
ROLLBACK
Out[38]:
description ordinal
name
family sub-grouping of languoids above the language l... 1
language defined by mutual non-intellegibility 2
dialect any variety which is not a language 3
In [39]:
from treedb.models import PseudoFamily

print_sql(PseudoFamily)
read_sql(sa.select(PseudoFamily).order_by('name'), index_col='config_section')
BEGIN (implicit)
SELECT sqlite_master.sql 
FROM sqlite_master 
WHERE sqlite_master.type = ? AND sqlite_master.name = ?
[cached since 2.726s ago] ('table', 'pseudofamily')
SELECT count(*) AS n_rows 
FROM pseudofamily
[generated in 0.00059s] ()
ROLLBACK
CREATE TABLE pseudofamily (
	languoid_id VARCHAR(8) NOT NULL, 
	name VARCHAR NOT NULL, 
	config_section VARCHAR NOT NULL CHECK (config_section != ''), 
	description TEXT CHECK (description != ''), 
	bookkeeping BOOLEAN CHECK (bookkeeping = 1), 
	PRIMARY KEY (languoid_id), 
	FOREIGN KEY(languoid_id) REFERENCES languoid (id), 
	UNIQUE (name), 
	FOREIGN KEY(name) REFERENCES languoid (name), 
	UNIQUE (config_section), 
	UNIQUE (bookkeeping)
) WITHOUT ROWID
8
BEGIN (implicit)
SELECT pseudofamily.languoid_id, pseudofamily.name, pseudofamily.config_section, pseudofamily.description, pseudofamily.bookkeeping 
FROM pseudofamily ORDER BY pseudofamily.name
[generated in 0.00072s] ()
ROLLBACK
Out[39]:
languoid_id name description bookkeeping
config_section
artificial_language arti1236 Artificial Language A language known to have been created by consc... None
bookkeeping book1242 Bookkeeping An alleged language that has been replaced wit... True
mixed_language mixe1287 Mixed Language None None
pidgin pidg1258 Pidgin A language used for interethnic communication ... None
sign_language sign1238 Sign Language A language with visible manual signs as the mo... None
speech_register spee1234 Speech Register A language not used by a community of speakers... None
unattested unat1236 Unattested A language arguably different from all other e... None
unclassifiable uncl1493 Unclassifiable A language arguably different from all other e... None
In [40]:
%time treedb.print_languoid_stats()
BEGIN (implicit)
WITH RECURSIVE tree(parent_id, child_id) AS 
(SELECT root.id AS parent_id, root.id AS child_id 
FROM languoid AS root 
WHERE root.parent_id IS NULL UNION ALL SELECT tree.parent_id AS parent_id, child.id AS child_id 
FROM tree JOIN languoid AS child ON tree.child_id = child.parent_id)
 SELECT ? AS kind, count(*) AS n 
FROM languoid UNION ALL SELECT ? AS kind, count(*) AS n 
FROM languoid 
WHERE languoid.level = ? AND languoid.parent_id IS NULL UNION ALL SELECT ? AS kind, count(*) AS n 
FROM languoid 
WHERE languoid.level = ? AND languoid.parent_id IS NULL UNION ALL SELECT ? AS kind, count(*) AS n 
FROM languoid 
WHERE languoid.parent_id IS NULL UNION ALL SELECT ? AS kind, count(*) AS n 
FROM languoid 
WHERE languoid.level = ? UNION ALL SELECT ? AS kind, count(*) AS n 
FROM languoid 
WHERE languoid.level = ? AND languoid.parent_id IS NOT NULL UNION ALL SELECT ? AS kind, count(*) AS n 
FROM languoid 
WHERE languoid.level = ? UNION ALL SELECT ? AS kind, count(*) AS n 
FROM tree JOIN languoid AS root ON tree.parent_id = root.id JOIN languoid AS child ON tree.child_id = child.id 
WHERE child.level = ? AND (root.name NOT IN (?, ?, ?, ?, ?, ?, ?, ?)) UNION ALL SELECT ? AS kind, count(*) AS n 
FROM tree JOIN languoid AS root ON tree.parent_id = root.id JOIN languoid AS child ON tree.child_id = child.id 
WHERE child.level = ? AND root.name = ? UNION ALL SELECT ? AS kind, count(*) AS n 
FROM tree JOIN languoid AS root ON tree.parent_id = root.id JOIN languoid AS child ON tree.child_id = child.id 
WHERE child.level = ? AND root.name = ? UNION ALL SELECT ? AS kind, count(*) AS n 
FROM tree JOIN languoid AS root ON tree.parent_id = root.id JOIN languoid AS child ON tree.child_id = child.id 
WHERE child.level = ? AND root.name = ? UNION ALL SELECT ? AS kind, count(*) AS n 
FROM tree JOIN languoid AS root ON tree.parent_id = root.id JOIN languoid AS child ON tree.child_id = child.id 
WHERE child.level = ? AND root.name = ? UNION ALL SELECT ? AS kind, count(*) AS n 
FROM tree JOIN languoid AS root ON tree.parent_id = root.id JOIN languoid AS child ON tree.child_id = child.id 
WHERE child.level = ? AND root.name = ? UNION ALL SELECT ? AS kind, count(*) AS n 
FROM tree JOIN languoid AS root ON tree.parent_id = root.id JOIN languoid AS child ON tree.child_id = child.id 
WHERE child.level = ? AND root.name = ? UNION ALL SELECT ? AS kind, count(*) AS n 
FROM tree JOIN languoid AS root ON tree.parent_id = root.id JOIN languoid AS child ON tree.child_id = child.id 
WHERE child.level = ? AND root.name = ? UNION ALL SELECT ? AS kind, count(*) AS n 
FROM tree JOIN languoid AS root ON tree.parent_id = root.id JOIN languoid AS child ON tree.child_id = child.id 
WHERE child.level = ? AND root.name != ? UNION ALL SELECT ? AS kind, count(*) AS n 
FROM tree JOIN languoid AS root ON tree.parent_id = root.id JOIN languoid AS child ON tree.child_id = child.id 
WHERE child.level = ? AND root.name = ?
[generated in 0.00106s] ('languoids', 'families', 'family', 'isolates', 'language', 'roots', 'languages', 'language', 'subfamilies', 'family', 'dialects', 'dialect', 'Spoken L1 Languages', 'language', 'Sign Language', 'Unclassifiable', 'Pidgin', 'Unattested', 'Artificial Language', 'Mixed Language', 'Speech Register', 'Bookkeeping', 'Sign Language', 'language', 'Sign Language', 'Unclassifiable', 'language', 'Unclassifiable', 'Pidgin', 'language', 'Pidgin', 'Unattested', 'language', 'Unattested', 'Artificial Language', 'language', 'Artificial Language', 'Mixed Language', 'language', 'Mixed Language', 'Speech Register', 'language', 'Speech Register', 'All', 'language', 'Bookkeeping', 'Bookkeeping', 'language', 'Bookkeeping')
25,900 languoids
   245 families
   182 isolates
   427 roots
 8,533 languages
 4,326 subfamilies
12,796 dialects
 7,613 Spoken L1 Languages
   202 Sign Language
   122 Unclassifiable
    83 Pidgin
    68 Unattested
    31 Artificial Language
    11 Mixed Language
    14 Speech Register
 8,144 All
ROLLBACK
   389 Bookkeeping
Wall time: 373 ms
In [41]:
read_sql(sa.select(Languoid).limit(5), index_col='id')
BEGIN (implicit)
SELECT languoid.id, languoid.name, languoid.level, languoid.parent_id, languoid.hid, languoid.iso639_3, languoid.latitude, languoid.longitude 
FROM languoid
 LIMIT ? OFFSET ?
[generated in 0.00072s] (5, 0)
ROLLBACK
Out[41]:
name level parent_id hid iso639_3 latitude longitude
id
3adt1234 3Ad-Tekles dialect nort3292 None None NaN NaN
aala1237 Aalawa dialect ramo1244 None None NaN NaN
aant1238 Aantantara dialect nort2920 None None NaN NaN
aari1238 Aari-Gayil family ahkk1235 aiz aiz NaN NaN
aari1239 Aari language aari1238 aiw aiw 5.95034 36.5721
In [42]:
read_sql(sa.select(Languoid).order_by('id').limit(5), index_col='id')
BEGIN (implicit)
SELECT languoid.id, languoid.name, languoid.level, languoid.parent_id, languoid.hid, languoid.iso639_3, languoid.latitude, languoid.longitude 
FROM languoid ORDER BY languoid.id
 LIMIT ? OFFSET ?
[generated in 0.00082s] (5, 0)
ROLLBACK
Out[42]:
name level parent_id hid iso639_3 latitude longitude
id
3adt1234 3Ad-Tekles dialect nort3292 None None NaN NaN
aala1237 Aalawa dialect ramo1244 None None NaN NaN
aant1238 Aantantara dialect nort2920 None None NaN NaN
aari1238 Aari-Gayil family ahkk1235 aiz aiz NaN NaN
aari1239 Aari language aari1238 aiw aiw 5.95034 36.5721
In [43]:
read_sql(sa.select(Languoid).order_by('name').limit(5), index_col='id')
BEGIN (implicit)
SELECT languoid.id, languoid.name, languoid.level, languoid.parent_id, languoid.hid, languoid.iso639_3, languoid.latitude, languoid.longitude 
FROM languoid ORDER BY languoid.name
 LIMIT ? OFFSET ?
[generated in 0.00099s] (5, 0)
ROLLBACK
Out[43]:
name level parent_id hid iso639_3 latitude longitude
id
gane1238 !Gã!ne language east2867 NOCODE_Gane None -31.3200 28.7500
oung1238 !O!ung language book1242 oun oun -15.3000 14.3500
kwii1241 !Ui family tuuu1241 None None NaN NaN
abda1238 'Abd Al-Kuri dialect soqo1240 None None 12.1959 52.2282
aden1242 'Aden dialect jude1267 None None NaN NaN
In [44]:
Child, Parent = (sa.orm.aliased(Languoid, name=n) for n in ('child', 'parent'))

select_parent_levels = (sa.select(Child.level.label('child_level'),
                                  Parent.level.label('parent_level'),
                                  sa.func.count().label('n_languoids'))
                        .outerjoin_from(Child, Parent, Child.parent_id == Parent.id)
                        .group_by(Child.level, Parent.level)
                        .order_by('child_level', 'parent_level'))

(read_sql(select_parent_levels, index_col=['child_level', 'parent_level'])
 .unstack(fill_value=0).assign(all=lambda x: x.sum(axis='columns'))
 .pipe(lambda x: x.append(x.sum().rename('all'))))
BEGIN (implicit)
SELECT child.level AS child_level, parent.level AS parent_level, count(*) AS n_languoids 
FROM languoid AS child LEFT OUTER JOIN languoid AS parent ON child.parent_id = parent.id GROUP BY child.level, parent.level ORDER BY child_level, parent_level
[generated in 0.00094s] ()
ROLLBACK
Out[44]:
n_languoids all
parent_level NaN dialect family language
child_level
dialect 0 2120 0 10676 12796
family 245 0 4326 0 4571
language 182 0 8351 0 8533
all 427 2120 12677 10676 25900
In [45]:
select_lang_nisos = (sa.select(Languoid.level.label('level'),
                              sa.func.count().label('n_languoids'),
                              sa.func.count(Languoid.iso639_3).label('n_isos'))
                    .group_by(Languoid.level)
                    .order_by('level'))

(read_sql(select_lang_nisos, index_col='level')
 .assign(ratio=lambda x: 100 * x['n_isos'] / x['n_languoids']))
BEGIN (implicit)
SELECT languoid.level AS level, count(*) AS n_languoids, count(languoid.iso639_3) AS n_isos 
FROM languoid GROUP BY languoid.level ORDER BY level
[generated in 0.00079s] ()
ROLLBACK
Out[45]:
n_languoids n_isos ratio
level
dialect 12796 203 1.586433
family 4571 44 0.962590
language 8533 7817 91.609047
In [46]:
select_lang_nlocations = (sa.select(Languoid.level.label('level'),
                                    sa.func.count().label('n_languoids'),
                                    sa.func.count(Languoid.latitude).label('n_locations'))
                          .group_by(Languoid.level)
                          .order_by('level'))

(read_sql(select_lang_nlocations, index_col='level')
 .assign(ratio=lambda x: 100 * x['n_locations'] / x['n_languoids']))
BEGIN (implicit)
SELECT languoid.level AS level, count(*) AS n_languoids, count(languoid.latitude) AS n_locations 
FROM languoid GROUP BY languoid.level ORDER BY level
[generated in 0.00061s] ()
ROLLBACK
Out[46]:
n_languoids n_locations ratio
level
dialect 12796 450 3.516724
family 4571 24 0.525049
language 8533 8212 96.238134
In [47]:
select_latlon = (sa.select(Languoid.latitude, Languoid.longitude)
                 .select_from(Languoid)
                 .filter_by(level=LANGUAGE))

latitudes, longitudes = zip(*treedb.iterrows(select_latlon))
    
plt.figure(figsize=(12, 6))
plt.axis([-180, 180, -90, 90])
plt.xticks(range(-180, 181, 60))
plt.yticks(range(-90, 91, 30))
plt.scatter(longitudes, latitudes, 1, 'black');
BEGIN (implicit)
SELECT languoid.latitude, languoid.longitude 
FROM languoid 
WHERE languoid.level = ?
[generated in 0.00102s] ('language',)
ROLLBACK
In [48]:
Family, Child, family_child = treedb.Languoid.parent_descendant(parent_root=True,
                                                                parent_level=FAMILY)

n_languages = sa.func.count(Child.id).label('n_languages')

select_family_nlanguages = (sa.select(Family.id.label('family_id'), n_languages)
                            .select_from(family_child)
                            .where(Child.level == LANGUAGE)
                            .group_by(Family.id))

select_top_families = (select_family_nlanguages
                       .having(n_languages >= 100)
                       .order_by('n_languages'))

(read_sql(select_top_families, index_col='family_id')
 .plot.barh());
BEGIN (implicit)
WITH RECURSIVE tree(parent_id, child_id) AS 
(SELECT root.id AS parent_id, child.id AS child_id 
FROM languoid AS root LEFT OUTER JOIN languoid AS child ON root.id = child.parent_id 
WHERE root.parent_id IS NULL AND root.level = ? UNION ALL SELECT tree.parent_id AS parent_id, child.id AS child_id 
FROM tree JOIN languoid AS child ON tree.child_id = child.parent_id)
 SELECT root.id AS family_id, count(child.id) AS n_languages 
FROM tree JOIN languoid AS root ON tree.parent_id = root.id LEFT OUTER JOIN languoid AS child ON tree.child_id = child.id 
WHERE child.level = ? GROUP BY root.id 
HAVING count(child.id) >= ? ORDER BY n_languages
[generated in 0.00114s] ('family', 'language', 100)
ROLLBACK
In [49]:
family_size = (select_family_nlanguages
               .alias('family_nlanguages')
               .c.n_languages.label('family_size'))

select_family_sizes = (sa.select(family_size, sa.func.count().label('n_families'))
                       .group_by(family_size)
                       .order_by('family_size'))

_ = read_sql(select_family_sizes, index_col='family_size')

(_.plot.area(figsize=(12, 3), logx=True)
 .xaxis.set_major_formatter(mpl.ticker.ScalarFormatter()))
(_.index.to_series().describe()
 .to_frame().transpose()[['count', 'min', 'max']])
BEGIN (implicit)
WITH RECURSIVE tree(parent_id, child_id) AS 
(SELECT root.id AS parent_id, child.id AS child_id 
FROM languoid AS root LEFT OUTER JOIN languoid AS child ON root.id = child.parent_id 
WHERE root.parent_id IS NULL AND root.level = ? UNION ALL SELECT tree.parent_id AS parent_id, child.id AS child_id 
FROM tree JOIN languoid AS child ON tree.child_id = child.parent_id)
 SELECT family_nlanguages.n_languages AS family_size, count(*) AS n_families 
FROM (SELECT root.id AS family_id, count(child.id) AS n_languages 
FROM tree JOIN languoid AS root ON tree.parent_id = root.id LEFT OUTER JOIN languoid AS child ON tree.child_id = child.id 
WHERE child.level = ? GROUP BY root.id) AS family_nlanguages GROUP BY family_nlanguages.n_languages ORDER BY family_size
[generated in 0.00085s] ('family', 'language')
ROLLBACK
Out[49]:
count min max
family_size 56.0 2.0 1403.0
In [50]:
levels = (sa.union_all(*[sa.select(sa.literal(l).label('level')) for l in LEVEL])
          .cte(name='levels'))

select_parent_nchildren = (sa.select(Parent.id.label('parent_id'),
                                     Parent.level.label('parent_level'),
                                     levels.c.level.label('child_level'),
                                     sa.func.count(Child.id).label('n_children'))
                           .join_from(Parent, levels, sa.true())
                           .outerjoin(Child, sa.and_(Parent.id == Child.parent_id, 
                                                     Child.level == levels.c.level))
                           .group_by(Parent.id, Parent.level, levels.c.level))

(read_sql(select_parent_nchildren)
 .pivot_table(index='parent_level', columns='child_level', values='n_children',
              aggfunc=['sum', 'max', 'mean'], fill_value=0))
BEGIN (implicit)
WITH levels AS 
(SELECT ? AS level UNION ALL SELECT ? AS level UNION ALL SELECT ? AS level)
 SELECT parent.id AS parent_id, parent.level AS parent_level, levels.level AS child_level, count(child.id) AS n_children 
FROM languoid AS parent JOIN levels ON 1 = 1 LEFT OUTER JOIN languoid AS child ON parent.id = child.parent_id AND child.level = levels.level GROUP BY parent.id, parent.level, levels.level
[generated in 0.00077s] ('family', 'language', 'dialect')
ROLLBACK
Out[50]:
sum max mean
child_level dialect family language dialect family language dialect family language
parent_level
dialect 2120 0 0 22 0 0 0.165677 0.000000 0.000000
family 0 4326 8351 0 46 389 0.000000 0.946401 1.826953
language 10676 0 0 32 0 0 1.251143 0.000000 0.000000
In [51]:
Parent, Child, parent_child = treedb.Languoid.parent_descendant()

select_parent_ndescendants = (sa.select(Parent.id.label('parent_id'),
                                        Parent.level.label('parent_level'),
                                        sa.func.count(Child.id).label('n_descendants'))
                              .select_from(parent_child)
                              .group_by(Parent.id, Parent.level)
                              .alias('parent_ndescendants'))

parent_level = select_parent_ndescendants.c.parent_level

n_descendants = select_parent_ndescendants.c.n_descendants

d_min = sa.func.min(n_descendants).label('min')
d_max = sa.func.max(n_descendants).label('max')
d_mean = (sa.func.sum(n_descendants).cast(sa.Float) / sa.func.count(n_descendants)).label('mean')

select_level_mean_descendants = (sa.select(parent_level, d_min, d_max, d_mean)
                                 .group_by(parent_level)
                                 .order_by(parent_level))

select_total_mean_descendants = sa.select(sa.literal('total').label('parent_level'),
                                          d_min, d_max, d_mean)

select_mean_descendants = [select_level_mean_descendants, select_total_mean_descendants]

_ = pd.concat([read_sql(q, index_col='parent_level') for q in select_mean_descendants])
_.columns = pd.MultiIndex.from_product([['n_descendants'], _.columns])
_
BEGIN (implicit)
WITH RECURSIVE tree(parent_id, child_id) AS 
(SELECT parent.id AS parent_id, child.id AS child_id 
FROM languoid AS parent LEFT OUTER JOIN languoid AS child ON parent.id = child.parent_id UNION ALL SELECT tree.parent_id AS parent_id, child.id AS child_id 
FROM tree JOIN languoid AS child ON tree.child_id = child.parent_id)
 SELECT parent_ndescendants.parent_level, min(parent_ndescendants.n_descendants) AS min, max(parent_ndescendants.n_descendants) AS max, CAST(sum(parent_ndescendants.n_descendants) AS FLOAT) / count(parent_ndescendants.n_descendants) AS mean 
FROM (SELECT parent.id AS parent_id, parent.level AS parent_level, count(child.id) AS n_descendants 
FROM tree JOIN languoid AS parent ON tree.parent_id = parent.id LEFT OUTER JOIN languoid AS child ON tree.child_id = child.id GROUP BY parent.id, parent.level) AS parent_ndescendants GROUP BY parent_ndescendants.parent_level ORDER BY parent_ndescendants.parent_level
[generated in 0.00095s] ()
ROLLBACK
BEGIN (implicit)
WITH RECURSIVE tree(parent_id, child_id) AS 
(SELECT parent.id AS parent_id, child.id AS child_id 
FROM languoid AS parent LEFT OUTER JOIN languoid AS child ON parent.id = child.parent_id UNION ALL SELECT tree.parent_id AS parent_id, child.id AS child_id 
FROM tree JOIN languoid AS child ON tree.child_id = child.parent_id)
 SELECT ? AS parent_level, min(parent_ndescendants.n_descendants) AS min, max(parent_ndescendants.n_descendants) AS max, CAST(sum(parent_ndescendants.n_descendants) AS FLOAT) / count(parent_ndescendants.n_descendants) AS mean 
FROM (SELECT parent.id AS parent_id, parent.level AS parent_level, count(child.id) AS n_descendants 
FROM tree JOIN languoid AS parent ON tree.parent_id = parent.id LEFT OUTER JOIN languoid AS child ON tree.child_id = child.id GROUP BY parent.id, parent.level) AS parent_ndescendants
[generated in 0.00082s] ('total',)
ROLLBACK
Out[51]:
n_descendants
min max mean
parent_level
dialect 0 57 0.228978
family 1 4793 34.641873
language 0 125 1.499590
total 0 4793 6.721004

macroarea

In [52]:
from treedb.models import Macroarea, languoid_macroarea

print_sql(Macroarea)
print_sql(languoid_macroarea)
read_sql(sa.select(Macroarea).order_by('name'), index_col='config_section')
BEGIN (implicit)
SELECT sqlite_master.sql 
FROM sqlite_master 
WHERE sqlite_master.type = ? AND sqlite_master.name = ?
[cached since 5.635s ago] ('table', 'macroarea')
SELECT count(*) AS n_rows 
FROM macroarea
[generated in 0.00048s] ()
ROLLBACK
CREATE TABLE macroarea (
	name VARCHAR NOT NULL CHECK (name != ''), 
	config_section VARCHAR NOT NULL CHECK (config_section != ''), 
	description TEXT NOT NULL CHECK (description != ''), 
	PRIMARY KEY (name), 
	UNIQUE (config_section)
) WITHOUT ROWID
6
BEGIN (implicit)
SELECT sqlite_master.sql 
FROM sqlite_master 
WHERE sqlite_master.type = ? AND sqlite_master.name = ?
[cached since 5.643s ago] ('table', 'languoid_macroarea')
SELECT count(*) AS n_rows 
FROM languoid_macroarea
[generated in 0.00039s] ()
ROLLBACK
CREATE TABLE languoid_macroarea (
	languoid_id VARCHAR(8) NOT NULL, 
	macroarea_name VARCHAR NOT NULL, 
	PRIMARY KEY (languoid_id, macroarea_name), 
	FOREIGN KEY(languoid_id) REFERENCES languoid (id), 
	FOREIGN KEY(macroarea_name) REFERENCES macroarea (name)
) WITHOUT ROWID
21280
BEGIN (implicit)
SELECT macroarea.name, macroarea.config_section, macroarea.description 
FROM macroarea ORDER BY macroarea.name
[generated in 0.00056s] ()
ROLLBACK
Out[52]:
name description
config_section
africa Africa The continent
australia Australia The continent
eurasia Eurasia The Eurasian landmass North of Sinai. Includes...
northamerica North America North and Middle America up to Panama. Include...
pacific Papunesia All islands between Sumatra and the Americas, ...
southamerica South America Everything South of Darién.
In [53]:
select_macroarea_nlanguages = (sa.select(Macroarea.name.label('macroarea'),
                                         sa.func.count().label('n_languages'))
                               .outerjoin_from(Macroarea, languoid_macroarea).join(Languoid)
                               .filter_by(level=LANGUAGE)
                               .group_by(Macroarea.name)
                               .order_by(sa.desc('n_languages')))

_ = read_sql(select_macroarea_nlanguages, index_col='macroarea')
_.plot.pie(y='n_languages')
_
BEGIN (implicit)
SELECT macroarea.name AS macroarea, count(*) AS n_languages 
FROM macroarea LEFT OUTER JOIN languoid_macroarea ON macroarea.name = languoid_macroarea.macroarea_name JOIN languoid ON languoid.id = languoid_macroarea.languoid_id 
WHERE languoid.level = ? GROUP BY macroarea.name ORDER BY n_languages DESC
[generated in 0.00081s] ('language',)
ROLLBACK
Out[53]:
n_languages
macroarea
Africa 2353
Papunesia 2208
Eurasia 1977
North America 788
South America 716
Australia 388
In [54]:
macroareas = sa.func.group_concat(Macroarea.name, ', ').label('macroareas')

select_multiarea_languages = (sa.select(Languoid.id, Languoid.name, macroareas)
                              .select_from(Languoid)
                              .filter_by(level=LANGUAGE)
                              .join(languoid_macroarea).join(Macroarea)
                              .group_by(Languoid.id)
                              .having(sa.func.count() > 1)
                              .order_by('id'))
    
assert read_sql(select_multiarea_languages).empty
BEGIN (implicit)
SELECT languoid.id, languoid.name, group_concat(macroarea.name, ?) AS macroareas 
FROM languoid JOIN languoid_macroarea ON languoid.id = languoid_macroarea.languoid_id JOIN macroarea ON macroarea.name = languoid_macroarea.macroarea_name 
WHERE languoid.level = ? GROUP BY languoid.id 
HAVING count(*) > ? ORDER BY languoid.id
[generated in 0.00095s] (', ', 'language', 1)
ROLLBACK

country

In [55]:
from treedb.models import Country, languoid_country

print_sql(Country)
print_sql(languoid_country)
BEGIN (implicit)
SELECT sqlite_master.sql 
FROM sqlite_master 
WHERE sqlite_master.type = ? AND sqlite_master.name = ?
[cached since 5.84s ago] ('table', 'country')
SELECT count(*) AS n_rows 
FROM country
[generated in 0.00045s] ()
ROLLBACK
CREATE TABLE country (
	id VARCHAR(2) NOT NULL CHECK (length(id) = 2), 
	name TEXT NOT NULL CHECK (name != ''), 
	PRIMARY KEY (id), 
	UNIQUE (name)
) WITHOUT ROWID
246
BEGIN (implicit)
SELECT sqlite_master.sql 
FROM sqlite_master 
WHERE sqlite_master.type = ? AND sqlite_master.name = ?
[cached since 5.847s ago] ('table', 'languoid_country')
SELECT count(*) AS n_rows 
FROM languoid_country
[generated in 0.00036s] ()
ROLLBACK
CREATE TABLE languoid_country (
	languoid_id VARCHAR(8) NOT NULL, 
	country_id VARCHAR(2) NOT NULL, 
	PRIMARY KEY (languoid_id, country_id), 
	FOREIGN KEY(languoid_id) REFERENCES languoid (id), 
	FOREIGN KEY(country_id) REFERENCES country (id)
) WITHOUT ROWID
11377
In [56]:
select_country_nlanguages = (sa.select(Country.name.label('country'),
                                       sa.func.count().label('n_languages'))
                             .outerjoin_from(Country, languoid_country).join(Languoid)
                             .filter_by(level=LANGUAGE)
                             .group_by(Country.id)
                             .order_by(sa.desc('n_languages'))
                             .limit(10))

read_sql(select_country_nlanguages, index_col='country')
BEGIN (implicit)
SELECT country.name AS country, count(*) AS n_languages 
FROM country LEFT OUTER JOIN languoid_country ON country.id = languoid_country.country_id JOIN languoid ON languoid.id = languoid_country.languoid_id 
WHERE languoid.level = ? GROUP BY country.id ORDER BY n_languages DESC
 LIMIT ? OFFSET ?
[generated in 0.00078s] ('language', 10, 0)
ROLLBACK
Out[56]:
n_languages
country
Papua New Guinea 894
Indonesia 745
Nigeria 572
India 502
China 405
Australia 398
Mexico 331
Brazil 330
United States 320
Cameroon 315
In [57]:
select_lang_country = (sa.select(Languoid.id, Languoid.name, Country.id.label('country'))
                       .select_from(Languoid)
                       .filter_by(level=LANGUAGE)
                       .join(languoid_country).join(Country)
                       .order_by(Languoid.id, 'country')
                       .alias('lang_country'))

countries = sa.func.group_concat(select_lang_country.c.country, ', ').label('countries')

select_multicountry_languages = (sa.select(select_lang_country.c.id,
                                           select_lang_country.c.name,
                                           sa.func.count().label('n_countries'),
                                           countries)
                                 .group_by(select_lang_country.c.id,
                                           select_lang_country.c.name)
                                 .having(sa.func.count() > 1)
                                 .order_by(sa.desc('n_countries'),
                                           select_lang_country.c.id)
                                 .limit(10))
    
read_sql(select_multicountry_languages, index_col='id')
BEGIN (implicit)
SELECT lang_country.id, lang_country.name, count(*) AS n_countries, group_concat(lang_country.country, ?) AS countries 
FROM (SELECT languoid.id AS id, languoid.name AS name, country.id AS country 
FROM languoid JOIN languoid_country ON languoid.id = languoid_country.languoid_id JOIN country ON country.id = languoid_country.country_id 
WHERE languoid.level = ? ORDER BY languoid.id, country) AS lang_country GROUP BY lang_country.id, lang_country.name 
HAVING count(*) > ? ORDER BY n_countries DESC, lang_country.id
 LIMIT ? OFFSET ?
[generated in 0.00100s] (', ', 'language', 1, 10, 0)
ROLLBACK
Out[57]:
name n_countries countries
id
stan1293 English 34 AU, BM, BR, BZ, CA, CC, CK, CU, CX, DO, FK, GB...
stan1288 Spanish 30 AD, AR, BO, BR, BZ, CL, CO, CR, CU, DO, EC, ES...
stan1318 Standard Arabic 25 AE, BH, DJ, DZ, EG, ER, IL, IQ, JO, KM, KW, LB...
amer1248 American Sign Language 22 BB, BF, BJ, BO, CA, CD, CF, CI, CN, GA, GH, JM...
east2295 Eastern Yiddish 22 AR, AU, BE, BY, CA, CR, EE, HU, IL, LT, LV, MD...
russ1263 Russian 20 BY, CN, EE, FI, GE, KG, KP, KZ, LT, LV, MD, MN...
stan1295 German 16 AT, BE, CH, CZ, DE, DK, FR, HU, IT, LI, LU, NL...
port1283 Portuguese 15 AD, AR, BO, BR, CO, ES, FR, GF, GY, PE, PT, PY...
sint1235 Sinte-Manus Romani 13 AT, CH, CZ, DE, FR, HR, IT, KZ, ME, NL, PL, RS...
nucl1301 Turkish 12 AL, AM, AZ, BG, CY, GE, GR, IQ, IR, MK, SY, TR
In [58]:
n_countries = (sa.select(Languoid.id, sa.func.count().label('n_countries'))
               .select_from(Languoid)
               .filter_by(level=LANGUAGE)
               .outerjoin(languoid_country)
               .group_by(Languoid.id)
               .alias('language_ncountries')
               .c.n_countries)

select_lc_dist = (sa.select(n_countries, sa.func.count().label('n_languages'))
                  .group_by(n_countries)
                  .order_by('n_countries'))

_ = read_sql(select_lc_dist, index_col='n_countries')

_.plot.bar(figsize=(12, 3))

(read_sql(sa.select(sa.literal('n_countries').label('value'),
                   sa.func.count().label('count'),
                   sa.func.sum(n_countries).label('sum'),
                   sa.func.min(n_countries).label('min'),
                   sa.func.max(n_countries).label('max'),
                   sa.func.avg(n_countries).label('mean')),
          index_col='value')
 .rename_axis(None))
BEGIN (implicit)
SELECT language_ncountries.n_countries, count(*) AS n_languages 
FROM (SELECT languoid.id AS id, count(*) AS n_countries 
FROM languoid LEFT OUTER JOIN languoid_country ON languoid.id = languoid_country.languoid_id 
WHERE languoid.level = ? GROUP BY languoid.id) AS language_ncountries GROUP BY language_ncountries.n_countries ORDER BY language_ncountries.n_countries
[generated in 0.00093s] ('language',)
ROLLBACK
BEGIN (implicit)
SELECT ? AS value, count(*) AS count, sum(language_ncountries.n_countries) AS sum, min(language_ncountries.n_countries) AS min, max(language_ncountries.n_countries) AS max, avg(language_ncountries.n_countries) AS mean 
FROM (SELECT languoid.id AS id, count(*) AS n_countries 
FROM languoid LEFT OUTER JOIN languoid_country ON languoid.id = languoid_country.languoid_id 
WHERE languoid.level = ? GROUP BY languoid.id) AS language_ncountries
[generated in 0.00095s] ('n_countries', 'language')
ROLLBACK
Out[58]:
count sum min max mean
n_countries 8533 10995 1 34 1.288527

altname

In [59]:
from treedb.models import Altname, AltnameProvider

select_provider_nlanguoids = (sa.select(AltnameProvider.name.label('altname_provider'),
                                        sa.func.count(sa.distinct(Altname.languoid_id)).label('n_languoids'))
                              .join_from(AltnameProvider, Altname)
                              .group_by(AltnameProvider.name)
                              .order_by('n_languoids', 'altname_provider'))

(read_sql(select_provider_nlanguoids, index_col='altname_provider')
 .plot.barh());
BEGIN (implicit)
SELECT altnameprovider.name AS altname_provider, count(DISTINCT altname.languoid_id) AS n_languoids 
FROM altnameprovider JOIN altname ON altnameprovider.id = altname.provider_id GROUP BY altnameprovider.name ORDER BY n_languoids, altname_provider
[generated in 0.00076s] ()
ROLLBACK
In [60]:
n_altnames = (sa.select(Languoid.id, sa.func.count().label('n_altnames'))
              .outerjoin_from(Languoid, Altname)
              .group_by(Languoid.id)
              .alias('languoid_naltnames')
              .c.n_altnames)

select_la_dist = (sa.select(n_altnames, sa.func.count().label('n_languoids'))
                  .group_by(n_altnames)
                  .order_by(n_altnames))

_ = read_sql(select_la_dist, index_col='n_altnames')

_.plot.area(figsize=(12, 3))

(read_sql(sa.select(sa.literal('n_altnames').label('value'),
                    sa.func.count().label('count'),
                    sa.func.sum(n_altnames).label('sum'),
                    sa.func.min(n_altnames).label('min'),
                    sa.func.max(n_altnames).label('max'),
                    sa.func.avg(n_altnames).label('mean')),
         index_col='value')
 .rename_axis(None))
BEGIN (implicit)
SELECT languoid_naltnames.n_altnames, count(*) AS n_languoids 
FROM (SELECT languoid.id AS id, count(*) AS n_altnames 
FROM languoid LEFT OUTER JOIN altname ON languoid.id = altname.languoid_id GROUP BY languoid.id) AS languoid_naltnames GROUP BY languoid_naltnames.n_altnames ORDER BY languoid_naltnames.n_altnames
[generated in 0.00082s] ()
ROLLBACK
BEGIN (implicit)
SELECT ? AS value, count(*) AS count, sum(languoid_naltnames.n_altnames) AS sum, min(languoid_naltnames.n_altnames) AS min, max(languoid_naltnames.n_altnames) AS max, avg(languoid_naltnames.n_altnames) AS mean 
FROM (SELECT languoid.id AS id, count(*) AS n_altnames 
FROM languoid LEFT OUTER JOIN altname ON languoid.id = altname.languoid_id GROUP BY languoid.id) AS languoid_naltnames
[generated in 0.00092s] ('n_altnames',)
ROLLBACK
Out[60]:
count sum min max mean
n_altnames 25900 164148 1 256 6.337761

source

In [61]:
from treedb.models import Source, SourceProvider

select_provider_nsources = (sa.select(SourceProvider.name.label('provider'),
                                      Languoid.level,
                                      sa.func.count().label('n_sources'))
                            .join_from(SourceProvider, Source).join(Languoid)
                            .group_by(SourceProvider.name, Languoid.level)
                            .order_by('provider', sa.desc('n_sources')))

read_sql(select_provider_nsources, index_col=['provider', 'level'])
BEGIN (implicit)
SELECT sourceprovider.name AS provider, languoid.level, count(*) AS n_sources 
FROM sourceprovider JOIN source ON sourceprovider.id = source.provider_id JOIN languoid ON languoid.id = source.languoid_id GROUP BY sourceprovider.name, languoid.level ORDER BY provider, n_sources DESC
[generated in 0.00082s] ()
ROLLBACK
Out[61]:
n_sources
provider level
glottolog language 188384
family 2417
dialect 1719
In [62]:
n_sources = (sa.select(Languoid.id,
                       sa.func.count(Source.languoid_id).label('n_sources'))
             .outerjoin_from(Languoid, Source)
             .group_by(Languoid.id)
             .alias('lang_nsources')
             .c.n_sources)

select_nsources_nlangs = (sa.select(n_sources, sa.func.count().label('n_languoids'))
                          .group_by(n_sources)
                          .order_by('n_languoids'))

_ = read_sql(select_nsources_nlangs, index_col='n_sources')
(_.groupby(_.index != 0).sum().rename_axis('n_sources')
    .rename(index={False: '= 0', True: '> 0'}).plot.bar(figsize=(4, 3)))
(_.drop(0).plot.area(figsize=(8, 3), logx=True)
    .xaxis.set_major_formatter(mpl.ticker.ScalarFormatter()))

(read_sql(sa.select(sa.literal('n_sources').label('value'),
                    sa.func.count().label('count'),
                    sa.func.sum(n_sources).label('sum'),
                    sa.func.min(n_sources).label('min'),
                    sa.func.max(n_sources).label('max'),
                    sa.func.avg(n_sources).label('mean')),
          index_col='value')
 .rename_axis(None))
BEGIN (implicit)
SELECT lang_nsources.n_sources, count(*) AS n_languoids 
FROM (SELECT languoid.id AS id, count(source.languoid_id) AS n_sources 
FROM languoid LEFT OUTER JOIN source ON languoid.id = source.languoid_id GROUP BY languoid.id) AS lang_nsources GROUP BY lang_nsources.n_sources ORDER BY n_languoids
[generated in 0.00094s] ()
ROLLBACK
BEGIN (implicit)
SELECT ? AS value, count(*) AS count, sum(lang_nsources.n_sources) AS sum, min(lang_nsources.n_sources) AS min, max(lang_nsources.n_sources) AS max, avg(lang_nsources.n_sources) AS mean 
FROM (SELECT languoid.id AS id, count(source.languoid_id) AS n_sources 
FROM languoid LEFT OUTER JOIN source ON languoid.id = source.languoid_id GROUP BY languoid.id) AS lang_nsources
[generated in 0.00119s] ('n_sources',)
ROLLBACK
Out[62]:
count sum min max mean
n_sources 25900 192520 0 1549 7.433205
In [63]:
select_lang_nsources = (sa.select(Languoid.id, Languoid.level,
                                  sa.func.count(Source.languoid_id).label('n_sources'))
                        .outerjoin_from(Languoid, Source)
                        .group_by(Languoid.id, Languoid.level)
                        .alias('lang_nsources'))

select_ln_nlangs = (sa.select(select_lang_nsources.c.level,
                              select_lang_nsources.c.n_sources,
                              sa.func.count().label('n_languoids'))
                    .group_by(select_lang_nsources.c.level, select_lang_nsources.c.n_sources)
                    .order_by('n_languoids'))

_ = read_sql(select_ln_nlangs).pivot(index='n_sources', columns='level', values='n_languoids')
(_.groupby(_.index != 0).sum()[list(treedb.LEVEL)].rename_axis('n_sources')
 .rename(index={False: '= 0', True: '> 0'}).plot.bar(figsize=(4, 3)))
(_.drop(0)[list(treedb.LEVEL)].plot.area(figsize=(8, 3), logx=True)
 .xaxis.set_major_formatter(mpl.ticker.ScalarFormatter()))
BEGIN (implicit)
SELECT lang_nsources.level, lang_nsources.n_sources, count(*) AS n_languoids 
FROM (SELECT languoid.id AS id, languoid.level AS level, count(source.languoid_id) AS n_sources 
FROM languoid LEFT OUTER JOIN source ON languoid.id = source.languoid_id GROUP BY languoid.id, languoid.level) AS lang_nsources GROUP BY lang_nsources.level, lang_nsources.n_sources ORDER BY n_languoids
[generated in 0.00074s] ()
ROLLBACK
In [64]:
from treedb.models import Bibfile, Bibitem

select_bibfile_nused = (sa.select(Bibfile.name.label('bibfile'), sa.func.count().label('n_used'))
                        .join_from(Bibfile, Bibitem)
                        .group_by(Bibfile.name)
                        .order_by(sa.desc('n_used')))

(read_sql(select_bibfile_nused, index_col='bibfile')
 .plot.bar(figsize=(12, 3)));
BEGIN (implicit)
SELECT bibfile.name AS bibfile, count(*) AS n_used 
FROM bibfile JOIN bibitem ON bibfile.id = bibitem.bibfile_id GROUP BY bibfile.name ORDER BY n_used DESC
[generated in 0.00089s] ()
ROLLBACK

classification

In [65]:
from treedb.models import ClassificationRef

select_cr_levels = (sa.select(ClassificationRef.kind, Languoid.level,
                              sa.func.count().label('n_classificationrefs'))
                    .join_from(ClassificationRef, Languoid)
                    .group_by(ClassificationRef.kind, Languoid.level)
                    .order_by('kind', 'level'))

read_sql(select_cr_levels, index_col=['kind', 'level']).unstack()
BEGIN (implicit)
SELECT classificationref.kind, languoid.level, count(*) AS n_classificationrefs 
FROM classificationref JOIN languoid ON languoid.id = classificationref.languoid_id GROUP BY classificationref.kind, languoid.level ORDER BY classificationref.kind, languoid.level
[generated in 0.00059s] ()
ROLLBACK
Out[65]:
n_classificationrefs
level dialect family language
kind
family 8 442 256
sub 101 6113 10072
In [66]:
n_crefs = (sa.select(Languoid.id, sa.func.count().label('n_crefs'))
           .outerjoin_from(Languoid, ClassificationRef)
           .group_by(Languoid.id)
           .alias('lang_ncrefs')
           .c.n_crefs)

select_lcr_dist = (sa.select(n_crefs, sa.func.count().label('n_languoids'))
                   .group_by(n_crefs)
                   .order_by(n_crefs))

_ = read_sql(select_lcr_dist, index_col='n_crefs')

_.plot.area(figsize=(12, 3))

(read_sql(sa.select(sa.literal('n_crefs').label('value'),
                    sa.func.count().label('count'),
                    sa.func.sum(n_crefs).label('sum'),
                    sa.func.min(n_crefs).label('min'),
                    sa.func.max(n_crefs).label('max'),
                    sa.func.avg(n_crefs).label('mean')),
         index_col='value')
 .rename_axis(None))
BEGIN (implicit)
SELECT lang_ncrefs.n_crefs, count(*) AS n_languoids 
FROM (SELECT languoid.id AS id, count(*) AS n_crefs 
FROM languoid LEFT OUTER JOIN classificationref ON languoid.id = classificationref.languoid_id GROUP BY languoid.id) AS lang_ncrefs GROUP BY lang_ncrefs.n_crefs ORDER BY lang_ncrefs.n_crefs
[generated in 0.00083s] ()
ROLLBACK
BEGIN (implicit)
SELECT ? AS value, count(*) AS count, sum(lang_ncrefs.n_crefs) AS sum, min(lang_ncrefs.n_crefs) AS min, max(lang_ncrefs.n_crefs) AS max, avg(lang_ncrefs.n_crefs) AS mean 
FROM (SELECT languoid.id AS id, count(*) AS n_crefs 
FROM languoid LEFT OUTER JOIN classificationref ON languoid.id = classificationref.languoid_id GROUP BY languoid.id) AS lang_ncrefs
[generated in 0.00088s] ('n_crefs',)
ROLLBACK
Out[66]:
count sum min max mean
n_crefs 25900 30888 1 11 1.192587
In [67]:
from treedb.models import Link

print_sql(Link)
BEGIN (implicit)
SELECT sqlite_master.sql 
FROM sqlite_master 
WHERE sqlite_master.type = ? AND sqlite_master.name = ?
[cached since 7.962s ago] ('table', 'link')
SELECT count(*) AS n_rows 
FROM link
[generated in 0.00041s] ()
ROLLBACK
CREATE TABLE link (
	languoid_id VARCHAR(8) NOT NULL, 
	ord INTEGER NOT NULL CHECK (ord >= 1), 
	url TEXT NOT NULL CHECK (url != ''), 
	title TEXT CHECK (title != ''), 
	scheme TEXT, 
	PRIMARY KEY (languoid_id, ord), 
	UNIQUE (languoid_id, url), 
	CHECK (substr(url, 1, length(scheme) + 3) = scheme || '://'), 
	FOREIGN KEY(languoid_id) REFERENCES languoid (id), 
	CHECK (scheme IN ('http', 'https'))
) WITHOUT ROWID
29704
In [68]:
select_scheme_nlinks = (sa.select(Link.scheme.label('link_scheme'), sa.func.count().label('n_links'))
                        .group_by(Link.scheme)
                        .order_by(sa.desc('n_links')))

(read_sql(select_scheme_nlinks, index_col='link_scheme')
 .plot.pie(y='n_links'));
BEGIN (implicit)
SELECT link.scheme AS link_scheme, count(*) AS n_links 
FROM link GROUP BY link.scheme ORDER BY n_links DESC
[generated in 0.00086s] ()
ROLLBACK
In [69]:
from urllib.parse import urlparse

hosts = collections.Counter(urlparse(url).hostname for url, in treedb.iterrows(sa.select(Link.url)))

(pd.DataFrame.from_dict(hosts, orient='index', columns=['n_links'])
 .sort_values(by='n_links')
 .plot.barh());
BEGIN (implicit)
SELECT link.url 
FROM link
[generated in 0.00089s] ()
ROLLBACK

endangerment

In [70]:
from treedb.models import Endangerment, EndangermentStatus, EndangermentSource

print_sql(Endangerment)
print_sql(EndangermentStatus)
print_sql(EndangermentSource)
read_sql(sa.select(EndangermentStatus).order_by('ordinal'), index_col='config_section')
BEGIN (implicit)
SELECT sqlite_master.sql 
FROM sqlite_master 
WHERE sqlite_master.type = ? AND sqlite_master.name = ?
[cached since 8.409s ago] ('table', 'endangerment')
SELECT count(*) AS n_rows 
FROM endangerment
[generated in 0.00040s] ()
ROLLBACK
CREATE TABLE endangerment (
	languoid_id VARCHAR(8) NOT NULL, 
	status VARCHAR NOT NULL, 
	source_id INTEGER NOT NULL, 
	date DATETIME NOT NULL, 
	comment TEXT NOT NULL CHECK (comment != ''), 
	PRIMARY KEY (languoid_id), 
	FOREIGN KEY(languoid_id) REFERENCES languoid (id), 
	FOREIGN KEY(status) REFERENCES endangermentstatus (name), 
	FOREIGN KEY(source_id) REFERENCES endangermentsource (id)
) WITHOUT ROWID
8254
BEGIN (implicit)
SELECT sqlite_master.sql 
FROM sqlite_master 
WHERE sqlite_master.type = ? AND sqlite_master.name = ?
[cached since 8.417s ago] ('table', 'endangermentstatus')
SELECT count(*) AS n_rows 
FROM endangermentstatus
[generated in 0.00037s] ()
ROLLBACK
CREATE TABLE endangermentstatus (
	name VARCHAR NOT NULL CHECK (name != ''), 
	config_section VARCHAR NOT NULL CHECK (config_section != ''), 
	ordinal INTEGER NOT NULL CHECK (ordinal >= 1), 
	egids VARCHAR NOT NULL CHECK (egids != ''), 
	unesco VARCHAR NOT NULL CHECK (unesco != ''), 
	elcat VARCHAR NOT NULL CHECK (elcat != ''), 
	icon VARCHAR NOT NULL CHECK (icon != ''), 
	bibitem_id INTEGER, 
	PRIMARY KEY (name), 
	UNIQUE (config_section), 
	FOREIGN KEY(bibitem_id) REFERENCES bibitem (id)
) WITHOUT ROWID
6
BEGIN (implicit)
SELECT sqlite_master.sql 
FROM sqlite_master 
WHERE sqlite_master.type = ? AND sqlite_master.name = ?
[cached since 8.424s ago] ('table', 'endangermentsource')
SELECT count(*) AS n_rows 
FROM endangermentsource
[generated in 0.00055s] ()
ROLLBACK
CREATE TABLE endangermentsource (
	id INTEGER NOT NULL, 
	name TEXT NOT NULL CHECK (name != ''), 
	bibitem_id INTEGER, 
	pages TEXT CHECK (pages != ''), 
	PRIMARY KEY (id), 
	UNIQUE (bibitem_id, pages), 
	CHECK ((bibitem_id IS NULL) = (pages IS NULL)), 
	UNIQUE (name), 
	FOREIGN KEY(bibitem_id) REFERENCES bibitem (id)
)
44
BEGIN (implicit)
SELECT endangermentstatus.name, endangermentstatus.config_section, endangermentstatus.ordinal, endangermentstatus.egids, endangermentstatus.unesco, endangermentstatus.elcat, endangermentstatus.icon, endangermentstatus.bibitem_id 
FROM endangermentstatus ORDER BY endangermentstatus.ordinal
[generated in 0.00058s] ()
ROLLBACK
Out[70]:
name ordinal egids unesco elcat icon bibitem_id
config_section
safe not endangered 1 <=6a safe at risk c00ff00 1
vulnerable threatened 2 6b vulnerable vulnerable ca0fb75 1
definite shifting 3 7 definitely endangered threatened/endangered sff6600 1
severe moribund 4 8a severely endangered severly endangered dff4400 1
critical nearly extinct 5 8b critically endangered critically endangered tff0000 1
extinct extinct 6 >=9 extinct dormant/awakening f000000 1
In [71]:
e_source = EndangermentSource.name.label('source')

select_source_nendangerments = (sa.select(e_source, sa.func.count().label('n_endangerments'))
                                .join_from(Endangerment, EndangermentSource)
                                .group_by(e_source)
                                .order_by('n_endangerments'))

(read_sql(select_source_nendangerments, index_col='source')
 .plot.barh());
BEGIN (implicit)
SELECT endangermentsource.name AS source, count(*) AS n_endangerments 
FROM endangerment JOIN endangermentsource ON endangermentsource.id = endangerment.source_id GROUP BY endangermentsource.name ORDER BY n_endangerments
[generated in 0.00053s] ()
ROLLBACK

Example query

In [72]:
%time treedb.hash_csv(treedb.get_example_query())
BEGIN (implicit)
WITH RECURSIVE tree(child_id, parent_id, steps, terminal) AS 
(SELECT child.id AS child_id, child.id AS parent_id, ? AS steps, child.parent_id IS NULL AS terminal 
FROM languoid AS child UNION ALL SELECT tree.child_id AS child_id, parent.parent_id AS parent_id, tree.steps + ? AS steps, grandparent.parent_id IS NULL AS terminal 
FROM tree JOIN languoid AS parent ON tree.parent_id = parent.id AND parent.parent_id IS NOT NULL LEFT OUTER JOIN languoid AS grandparent ON parent.parent_id = grandparent.id)
 SELECT languoid.id, languoid.name, languoid.level, languoid.parent_id, (SELECT group_concat(parent_path.path_part, ?) AS path 
FROM (SELECT tree.parent_id AS path_part 
FROM tree 
WHERE tree.child_id = languoid.id ORDER BY tree.steps DESC) AS parent_path) AS path, (SELECT tree.parent_id 
FROM tree 
WHERE tree.child_id = languoid.id AND tree.steps > ? AND tree.terminal = 1) AS family_id, (SELECT tree.parent_id 
FROM tree 
WHERE tree.child_id = languoid.id AND languoid.level = ? AND (EXISTS (SELECT * 
FROM languoid AS ancestor 
WHERE tree.parent_id = ancestor.id AND ancestor.level = ?))) AS dialect_language_id, languoid.hid, languoid.iso639_3, languoid.latitude, languoid.longitude, (SELECT group_concat(lang_ma.macroarea_name, ?) AS macroareas 
FROM (SELECT languoid_macroarea.macroarea_name AS macroarea_name 
FROM languoid_macroarea 
WHERE languoid_macroarea.languoid_id = languoid.id ORDER BY languoid_macroarea.macroarea_name) AS lang_ma) AS macroareas, (SELECT group_concat(lang_country.country_id, ?) AS countries 
FROM (SELECT languoid_country.country_id AS country_id 
FROM languoid_country 
WHERE languoid_country.languoid_id = languoid.id ORDER BY languoid_country.country_id) AS lang_country) AS countries, (SELECT group_concat(lang_link.printf, ?) AS links 
FROM (SELECT CASE WHEN (link.title IS NOT NULL) THEN printf(?, link.title, link.url) ELSE link.url END AS printf 
FROM link 
WHERE link.languoid_id = languoid.id ORDER BY link.ord) AS lang_link) AS links, (SELECT group_concat(lang_source_glottolog.printf, ?) AS sources_glottolog 
FROM (SELECT CASE WHEN (source_glottolog.pages IS NOT NULL AND source_glottolog."trigger" IS NOT NULL) THEN printf(?, source_bibfile.name, source_bibitem.bibkey, source_glottolog.pages, source_glottolog."trigger") WHEN (source_glottolog.pages IS NOT NULL) THEN printf(?, source_bibfile.name, source_bibitem.bibkey, source_glottolog.pages) WHEN (source_glottolog."trigger" IS NOT NULL) THEN printf(?, source_bibfile.name, source_bibitem.bibkey, source_glottolog."trigger") ELSE printf(?, source_bibfile.name, source_bibitem.bibkey) END AS printf 
FROM source AS source_glottolog, bibfile AS source_bibfile, bibitem AS source_bibitem, sourceprovider AS source_provider 
WHERE source_glottolog.languoid_id = languoid.id AND source_glottolog.provider_id = source_provider.id AND source_provider.name = ? AND source_glottolog.bibitem_id = source_bibitem.id AND source_bibitem.bibfile_id = source_bibfile.id ORDER BY source_bibfile.name, source_bibitem.bibkey) AS lang_source_glottolog) AS sources_glottolog, (SELECT group_concat(lang_altname_aiatsis.printf, ?) AS altnames_aiatsis 
FROM (SELECT CASE WHEN (altname_aiatsis.lang = ?) THEN altname_aiatsis.name ELSE printf(?, altname_aiatsis.name, altname_aiatsis.lang) END AS printf 
FROM altname AS altname_aiatsis, altnameprovider AS altname_aiatsis_provider 
WHERE altname_aiatsis.languoid_id = languoid.id AND altname_aiatsis.provider_id = altname_aiatsis_provider.id AND altname_aiatsis_provider.name = ? ORDER BY altname_aiatsis.name, altname_aiatsis.lang) AS lang_altname_aiatsis) AS altnames_aiatsis, (SELECT group_concat(lang_altname_elcat.printf, ?) AS altnames_elcat 
FROM (SELECT CASE WHEN (altname_elcat.lang = ?) THEN altname_elcat.name ELSE printf(?, altname_elcat.name, altname_elcat.lang) END AS printf 
FROM altname AS altname_elcat, altnameprovider AS altname_elcat_provider 
WHERE altname_elcat.languoid_id = languoid.id AND altname_elcat.provider_id = altname_elcat_provider.id AND altname_elcat_provider.name = ? ORDER BY altname_elcat.name, altname_elcat.lang) AS lang_altname_elcat) AS altnames_elcat, (SELECT group_concat(lang_altname_ethnologue.printf, ?) AS altnames_ethnologue 
FROM (SELECT CASE WHEN (altname_ethnologue.lang = ?) THEN altname_ethnologue.name ELSE printf(?, altname_ethnologue.name, altname_ethnologue.lang) END AS printf 
FROM altname AS altname_ethnologue, altnameprovider AS altname_ethnologue_provider 
WHERE altname_ethnologue.languoid_id = languoid.id AND altname_ethnologue.provider_id = altname_ethnologue_provider.id AND altname_ethnologue_provider.name = ? ORDER BY altname_ethnologue.name, altname_ethnologue.lang) AS lang_altname_ethnologue) AS altnames_ethnologue, (SELECT group_concat(lang_altname_glottolog.printf, ?) AS altnames_glottolog 
FROM (SELECT CASE WHEN (altname_glottolog.lang = ?) THEN altname_glottolog.name ELSE printf(?, altname_glottolog.name, altname_glottolog.lang) END AS printf 
FROM altname AS altname_glottolog, altnameprovider AS altname_glottolog_provider 
WHERE altname_glottolog.languoid_id = languoid.id AND altname_glottolog.provider_id = altname_glottolog_provider.id AND altname_glottolog_provider.name = ? ORDER BY altname_glottolog.name, altname_glottolog.lang) AS lang_altname_glottolog) AS altnames_glottolog, (SELECT group_concat(lang_altname_hhbib_lgcode.printf, ?) AS altnames_hhbib_lgcode 
FROM (SELECT CASE WHEN (altname_hhbib_lgcode.lang = ?) THEN altname_hhbib_lgcode.name ELSE printf(?, altname_hhbib_lgcode.name, altname_hhbib_lgcode.lang) END AS printf 
FROM altname AS altname_hhbib_lgcode, altnameprovider AS altname_hhbib_lgcode_provider 
WHERE altname_hhbib_lgcode.languoid_id = languoid.id AND altname_hhbib_lgcode.provider_id = altname_hhbib_lgcode_provider.id AND altname_hhbib_lgcode_provider.name = ? ORDER BY altname_hhbib_lgcode.name, altname_hhbib_lgcode.lang) AS lang_altname_hhbib_lgcode) AS altnames_hhbib_lgcode, (SELECT group_concat(lang_altname_lexvo.printf, ?) AS altnames_lexvo 
FROM (SELECT CASE WHEN (altname_lexvo.lang = ?) THEN altname_lexvo.name ELSE printf(?, altname_lexvo.name, altname_lexvo.lang) END AS printf 
FROM altname AS altname_lexvo, altnameprovider AS altname_lexvo_provider 
WHERE altname_lexvo.languoid_id = languoid.id AND altname_lexvo.provider_id = altname_lexvo_provider.id AND altname_lexvo_provider.name = ? ORDER BY altname_lexvo.name, altname_lexvo.lang) AS lang_altname_lexvo) AS altnames_lexvo, (SELECT group_concat("lang_altname_moseley & asher (1994)".printf, ?) AS "altnames_moseley & asher (1994)" 
FROM (SELECT CASE WHEN ("altname_moseley & asher (1994)".lang = ?) THEN "altname_moseley & asher (1994)".name ELSE printf(?, "altname_moseley & asher (1994)".name, "altname_moseley & asher (1994)".lang) END AS printf 
FROM altname AS "altname_moseley & asher (1994)", altnameprovider AS "altname_moseley & asher (1994)_provider" 
WHERE "altname_moseley & asher (1994)".languoid_id = languoid.id AND "altname_moseley & asher (1994)".provider_id = "altname_moseley & asher (1994)_provider".id AND "altname_moseley & asher (1994)_provider".name = ? ORDER BY "altname_moseley & asher (1994)".name, "altname_moseley & asher (1994)".lang) AS "lang_altname_moseley & asher (1994)") AS "altnames_moseley & asher (1994)", (SELECT group_concat(lang_altname_multitree.printf, ?) AS altnames_multitree 
FROM (SELECT CASE WHEN (altname_multitree.lang = ?) THEN altname_multitree.name ELSE printf(?, altname_multitree.name, altname_multitree.lang) END AS printf 
FROM altname AS altname_multitree, altnameprovider AS altname_multitree_provider 
WHERE altname_multitree.languoid_id = languoid.id AND altname_multitree.provider_id = altname_multitree_provider.id AND altname_multitree_provider.name = ? ORDER BY altname_multitree.name, altname_multitree.lang) AS lang_altname_multitree) AS altnames_multitree, (SELECT group_concat("lang_altname_ruhlen (1987)".printf, ?) AS "altnames_ruhlen (1987)" 
FROM (SELECT CASE WHEN ("altname_ruhlen (1987)".lang = ?) THEN "altname_ruhlen (1987)".name ELSE printf(?, "altname_ruhlen (1987)".name, "altname_ruhlen (1987)".lang) END AS printf 
FROM altname AS "altname_ruhlen (1987)", altnameprovider AS "altname_ruhlen (1987)_provider" 
WHERE "altname_ruhlen (1987)".languoid_id = languoid.id AND "altname_ruhlen (1987)".provider_id = "altname_ruhlen (1987)_provider".id AND "altname_ruhlen (1987)_provider".name = ? ORDER BY "altname_ruhlen (1987)".name, "altname_ruhlen (1987)".lang) AS "lang_altname_ruhlen (1987)") AS "altnames_ruhlen (1987)", (SELECT group_concat(lang_altname_wals.printf, ?) AS altnames_wals 
FROM (SELECT CASE WHEN (altname_wals.lang = ?) THEN altname_wals.name ELSE printf(?, altname_wals.name, altname_wals.lang) END AS printf 
FROM altname AS altname_wals, altnameprovider AS altname_wals_provider 
WHERE altname_wals.languoid_id = languoid.id AND altname_wals.provider_id = altname_wals_provider.id AND altname_wals_provider.name = ? ORDER BY altname_wals.name, altname_wals.lang) AS lang_altname_wals) AS altnames_wals, (SELECT group_concat("lang_altname_wals other".printf, ?) AS "altnames_wals other" 
FROM (SELECT CASE WHEN ("altname_wals other".lang = ?) THEN "altname_wals other".name ELSE printf(?, "altname_wals other".name, "altname_wals other".lang) END AS printf 
FROM altname AS "altname_wals other", altnameprovider AS "altname_wals other_provider" 
WHERE "altname_wals other".languoid_id = languoid.id AND "altname_wals other".provider_id = "altname_wals other_provider".id AND "altname_wals other_provider".name = ? ORDER BY "altname_wals other".name, "altname_wals other".lang) AS "lang_altname_wals other") AS "altnames_wals other", (SELECT group_concat(lang_trigger_lgcode."trigger", ?) AS triggers_lgcode 
FROM (SELECT trigger_lgcode."trigger" AS "trigger" 
FROM "trigger" AS trigger_lgcode 
WHERE trigger_lgcode.field = ? AND trigger_lgcode.languoid_id = languoid.id ORDER BY trigger_lgcode.ord) AS lang_trigger_lgcode) AS triggers_lgcode, (SELECT group_concat(lang_trigger_inlg."trigger", ?) AS triggers_inlg 
FROM (SELECT trigger_inlg."trigger" AS "trigger" 
FROM "trigger" AS trigger_inlg 
WHERE trigger_inlg.field = ? AND trigger_inlg.languoid_id = languoid.id ORDER BY trigger_inlg.ord) AS lang_trigger_inlg) AS triggers_inlg, ident_endangeredlanguages.identifier AS identifier_endangeredlanguages, ident_languagelandscape.identifier AS identifier_languagelandscape, ident_multitree.identifier AS identifier_multitree, ident_wals.identifier AS identifier_wals, cc_sub.comment AS classification_sub, (SELECT group_concat(lang_cref_sub.printf, ?) AS classification_subrefs 
FROM (SELECT printf(CASE WHEN (cr_sub.pages IS NOT NULL) THEN ? ELSE ? END, bibfile_cr_sub.name, bibitem_cr_sub.bibkey, cr_sub.pages) AS printf 
FROM classificationref AS cr_sub, bibfile AS bibfile_cr_sub, bibitem AS bibitem_cr_sub 
WHERE cr_sub.kind = ? AND cr_sub.languoid_id = languoid.id AND cr_sub.bibitem_id = bibitem_cr_sub.id AND bibitem_cr_sub.bibfile_id = bibfile_cr_sub.id ORDER BY cr_sub.ord) AS lang_cref_sub) AS classification_subrefs, cc_fam.comment AS classification_family, (SELECT group_concat(lang_cref_family.printf, ?) AS classification_familyrefs 
FROM (SELECT printf(CASE WHEN (cr_family.pages IS NOT NULL) THEN ? ELSE ? END, bibfile_cr_family.name, bibitem_cr_family.bibkey, cr_family.pages) AS printf 
FROM classificationref AS cr_family, bibfile AS bibfile_cr_family, bibitem AS bibitem_cr_family 
WHERE cr_family.kind = ? AND cr_family.languoid_id = languoid.id AND cr_family.bibitem_id = bibitem_cr_family.id AND bibitem_cr_family.bibfile_id = bibfile_cr_family.id ORDER BY cr_family.ord) AS lang_cref_family) AS classification_familyrefs, endangerment.status AS endangerment_status, endangerment.date AS endangerment_date, endangerment.comment AS endangerment_comment, CASE WHEN (endangermentsource.bibitem_id IS NULL) THEN endangermentsource.name ELSE printf(?, bibfile_e.name, bibitem_e.bibkey, endangermentsource.pages) END AS endangerment_source, ethnologuecomment.isohid AS elcomment_isohid, ethnologuecomment.comment_type AS elcomment_comment_type, ethnologuecomment.ethnologue_versions AS elcomment_ethnologue_versions, ethnologuecomment.comment AS elcomment_comment, isoretirement.code AS iso_retirement_code, isoretirement.name AS iso_retirement_name, isoretirement.change_request AS iso_retirement_change_request, isoretirement.effective AS iso_retirement_effective, isoretirement.reason AS iso_retirement_reason, isoretirement.remedy AS iso_retirement_remedy, isoretirement.comment AS iso_retirement_comment, (SELECT group_concat(lang_irct.code, ?) AS iso_retirement_change_to 
FROM (SELECT isoretirement_changeto.code AS code 
FROM isoretirement_changeto 
WHERE isoretirement_changeto.languoid_id = languoid.id ORDER BY isoretirement_changeto.ord) AS lang_irct) AS iso_retirement_change_to 
FROM languoid LEFT OUTER JOIN (identifier AS ident_endangeredlanguages JOIN identifiersite AS ident_endangeredlanguages_site ON ident_endangeredlanguages.site_id = ident_endangeredlanguages_site.id) ON ident_endangeredlanguages_site.name = ? AND ident_endangeredlanguages.languoid_id = languoid.id LEFT OUTER JOIN (identifier AS ident_languagelandscape JOIN identifiersite AS ident_languagelandscape_site ON ident_languagelandscape.site_id = ident_languagelandscape_site.id) ON ident_languagelandscape_site.name = ? AND ident_languagelandscape.languoid_id = languoid.id LEFT OUTER JOIN (identifier AS ident_multitree JOIN identifiersite AS ident_multitree_site ON ident_multitree.site_id = ident_multitree_site.id) ON ident_multitree_site.name = ? AND ident_multitree.languoid_id = languoid.id LEFT OUTER JOIN (identifier AS ident_wals JOIN identifiersite AS ident_wals_site ON ident_wals.site_id = ident_wals_site.id) ON ident_wals_site.name = ? AND ident_wals.languoid_id = languoid.id LEFT OUTER JOIN classificationcomment AS cc_sub ON cc_sub.kind = ? AND cc_sub.languoid_id = languoid.id LEFT OUTER JOIN classificationcomment AS cc_fam ON cc_fam.kind = ? AND cc_fam.languoid_id = languoid.id LEFT OUTER JOIN (endangerment JOIN endangermentsource ON endangermentsource.id = endangerment.source_id) ON languoid.id = endangerment.languoid_id LEFT OUTER JOIN (bibitem AS bibitem_e JOIN bibfile AS bibfile_e ON bibfile_e.id = bibitem_e.bibfile_id) ON bibitem_e.id = endangermentsource.bibitem_id LEFT OUTER JOIN ethnologuecomment ON languoid.id = ethnologuecomment.languoid_id LEFT OUTER JOIN isoretirement ON languoid.id = isoretirement.languoid_id ORDER BY languoid.id
[generated in 0.00174s] (0, 1, '/', 0, 'dialect', 'language', ', ', ', ', ', ', '[%s](%s)', ', ', '**%s:%s**:%s<trigger "%s">', '**%s:%s**:%s', '**%s:%s**<trigger "%s">', '**%s:%s**', 'glottolog', ', ', '', '%s [%s]', 'aiatsis', ', ', '', '%s [%s]', 'elcat', ', ', '', '%s [%s]', 'ethnologue', ', ', '', '%s [%s]', 'glottolog', ', ', '', '%s [%s]', 'hhbib_lgcode', ', ', '', '%s [%s]', 'lexvo', ', ', '', '%s [%s]', 'moseley & asher (1994)', ', ', '', '%s [%s]', 'multitree', ', ', '', '%s [%s]', 'ruhlen (1987)', ', ', '', '%s [%s]', 'wals', ', ', '', '%s [%s]', 'wals other', ', ', 'lgcode', ', ', 'inlg', ', ', '**%s:%s**:%s', '**%s:%s**', 'sub', ', ', '**%s:%s**:%s', '**%s:%s**', 'family', '**%s:%s**:%s', ', ', 'endangeredlanguages', 'languagelandscape', 'multitree', 'wals', 'sub', 'family')
[[email protected]] hash rows with 'sha256', csv header: ['id', 'name', 'level', 'parent_id', 'path', 'family_id', 'dialect_language_id', 'hid', 'iso639_3', 'latitude', 'longitude', 'macroareas', 'countries', 'links', 'sources_glottolog', 'altnames_aiatsis', 'altnames_elcat', 'altnames_ethnologue', 'altnames_glottolog', 'altnames_hhbib_lgcode', 'altnames_lexvo', 'altnames_moseley & asher (1994)', 'altnames_multitree', 'altnames_ruhlen (1987)', 'altnames_wals', 'altnames_wals other', 'triggers_lgcode', 'triggers_inlg', 'identifier_endangeredlanguages', 'identifier_languagelandscape', 'identifier_multitree', 'identifier_wals', 'classification_sub', 'classification_subrefs', 'classification_family', 'classification_familyrefs', 'endangerment_status', 'endangerment_date', 'endangerment_comment', 'endangerment_source', 'elcomment_isohid', 'elcomment_comment_type', 'elcomment_ethnologue_versions', 'elcomment_comment', 'iso_retirement_code', 'iso_retirement_name', 'iso_retirement_change_request', 'iso_retirement_effective', 'iso_retirement_reason', 'iso_retirement_remedy', 'iso_retirement_comment', 'iso_retirement_change_to']
ROLLBACK
Wall time: 4.23 s
Out[72]:
'224691678e1f2e18406d6dd1a278e062c683ac12ec2acf57d501931d3661142e'