#!/usr/bin/env python
# coding: utf-8
#
#
#
#
# # Cases
#
# Cases are the building blocks on the faces of tablets.
#
# What about the distribution of signs in deeply nested cases versus outer cases?
# We show here how you can begin to investigate that.
# In[1]:
# Notebook-export boilerplate: `get_ipython` is not defined in this file and is
# expected to be supplied by the IPython runtime.
# Loads the `autoreload` extension and switches it to mode 2
# (presumably "reload all modules before executing code" — see the IPython autoreload docs).
get_ipython().run_line_magic('load_ext', 'autoreload')
get_ipython().run_line_magic('autoreload', '2')
# In[2]:
import collections
from IPython.display import display, Markdown
from tf.app import use
# In[3]:
# Load the Uruk corpus app. The names `F`, `L`, `T` and `N` used further down are
# never assigned in this file; presumably `hoist=globals()` injects them into the
# global namespace — confirm against the Text-Fabric `use` documentation.
A = use("Nino-cunei/uruk",hoist=globals())
# ## `search`
#
# You might want to read the
# [docs](https://annotation.github.io/text-fabric/tf/about/searchusage.html)
# or the tutorial chapter on
# [search](search.ipynb)
# first.
#
# Here is a quick recap.
#
# ### Explanation
#
# The search template is basically
#
# ```
# line
#   case
#     case
#       sign
# ```
#
# This bare template looks for a sign within a case within a case within a line.
# Indentation acts as shorthand for embedding.
#
# But this is not enough, because a subsubcase of a case is also embedded in that case.
# We look for a situation where the first case is *directly* embedded in the line,
# and the second case is *directly* embedded in the first case.
#
# In our data we have an *edge* (relationship), called `sub`, that connects lines/cases with
# cases that are directly embedded in them.
#
# So
#
# ```
# c0 -sub> c1
# ```
#
# means that `c0` is `sub`-related to `c1`.
#
# Now it is possible to see that the result of this query will have signs that occur in
# subcases of cases of lines.
# The Cunei API provides a function to collect (sub)cases at a given level of nesting.
#
# We show how to use them, and for each task we show **how you can get things done easier with search**.
# ## Level 0
#
# If we do `casesByLevel(0, terminal=False)` we get all lines.
#
# If we do `casesByLevel(0)`, we get precisely the undivided lines.
# In[4]:
# All level-0 nodes, divided or not; these should coincide with the line nodes.
test0Cases = set(A.casesByLevel(0, terminal=False))
allLines = set(F.otype.s("line"))
# The node types that occur among the level-0 results.
types0 = {F.otype.v(n) for n in test0Cases}
print(f"test0Cases: {len(test0Cases):>5}")
print(f"allLines : {len(allLines):>5}")
print(f"test0Cases equal to allLines: {test0Cases == allLines}")
print(f"types of test0Cases: {types0}")
# Default call (no `terminal=False`): only the undivided level-0 nodes.
test0CasesT = set(A.casesByLevel(0))
print(f"test0CasesT: {len(test0CasesT):>5}")
# Divided lines = all lines minus the undivided ones.
print(f"Divided lines: {len(test0Cases) - len(test0CasesT):>5}")
# Let us compare this with doing the same by means of search.
#
# * All lines
# In[5]:
# A bare `line` atom: every line node is a result.
query = """
line
"""
results = A.search(query)
# * Undivided lines
# In[6]:
# `terminal` without a value: only lines that have the `terminal` feature.
query = """
line terminal
"""
results = A.search(query)
# * Divided lines
# In[7]:
# `terminal#`: only lines that lack the `terminal` feature.
query = """
line terminal#
"""
results = A.search(query)
# ## Level 1
#
# If we do `casesByLevel(1, terminal=False)` we get all cases (not lines) that are the first subdivision of a line.
#
# If we do `casesByLevel(1)`, we get a subset of these cases, namely the ones that are not themselves subdivided.
# In[8]:
# All level-1 cases, whether or not they are subdivided further.
test1Cases = set(A.casesByLevel(1, terminal=False))
# The node types that occur among the level-1 results.
types1 = {F.otype.v(n) for n in test1Cases}
print(f"test1Cases: {len(test1Cases):>5}")
print(f"types of test1Cases: {types1}")
# Default call: only the level-1 cases that are not subdivided themselves.
test1CasesT = set(A.casesByLevel(1))
print(f"test1CasesT: {len(test1CasesT):>5}")
# Divided cases = all level-1 cases minus the terminal ones.
print(f"Divided cases: {len(test1Cases) - len(test1CasesT):>5}")
# Or, by query:
#
# * Top-level cases
# In[9]:
# `depth=1` selects the cases that are the first subdivision of a line.
query = """
case depth=1
"""
results = A.search(query)
# * Undivided top-level cases
# In[10]:
# Adding `terminal` keeps only the cases that have the `terminal` feature.
query = """
case depth=1 terminal
"""
results = A.search(query)
# * Divided top-level cases
# In[11]:
# Adding `terminal#` keeps only the cases that lack the `terminal` feature.
query = """
case depth=1 terminal#
"""
results = A.search(query)
# ## Example tablet
# Here we show by means of an example tablet the difference between `terminal=False` and
# `terminal=True` when calling `A.casesByLevel`
#
# We'll use an example tablet `P471695`.
# In[12]:
examplePnum = "P471695"
# Look up the tablet node by its P-number, passed as a one-element section tuple.
exampleTablet = T.nodeFromSection((examplePnum,))
# Bare expression: the value is not stored; it is only displayed when run as a notebook cell.
A.getSource(exampleTablet)
A.pretty(exampleTablet)
# Above we have selected all cases of level 1 from the whole corpus, and constructed two sets:
# * terminal cases of level 1;
# * all cases of level 1.
# Now we take the intersection of these sets with the cases of the example tablet.
# In[13]:
# Every case node and line node below the example tablet, gathered in one set.
exampleCases = {
    node
    for nodeType in ("case", "line")
    for node in L.d(exampleTablet, otype=nodeType)
}
# Narrow the corpus-wide level-1 sets down to this tablet.
example2 = exampleCases.intersection(test1Cases)
example2T = exampleCases.intersection(test1CasesT)
# In[14]:
# Source text of each level-1 case on the example tablet, with a rule of 48 dashes between cases.
separator = "\n" + "-" * 48 + "\n"
print(separator.join("\n".join(A.getSource(caseNode)) for caseNode in sorted(example2)))
# In[15]:
# The same, restricted to the terminal level-1 cases.
print(separator.join("\n".join(A.getSource(caseNode)) for caseNode in sorted(example2T)))
# We can also show it with `plain()`.
# In[16]:
# Plain display of all level-1 cases of the example tablet, in node order.
for caseNode in sorted(example2):
    A.plain(caseNode)
# In[17]:
# Plain display of the terminal level-1 cases only.
for caseNode in sorted(example2T):
    A.plain(caseNode)
# We can also show it with `pretty()`.
# In[18]:
for caseNode in sorted(example2):
    A.pretty(caseNode, showGraphics=False)
# In[19]:
for caseNode in sorted(example2T):
    A.pretty(caseNode, showGraphics=False)
# What about case `1.b`?
# It is a case at level 1.
# Why is it not in `example2T`?
#
# Because it is not a terminal case: it has subcases.
# That is why `1.b` is left out.
# The parameter `terminal` specifies that only cases without children will be in the result.
# ## Level 2
#
# What if we want all signs that occur in a subcase, i.e. a case at level 2?
#
# We can call `casesByLevel(2, terminal=False)`, iterate through the resulting cases, and
# collect all signs per case.
# However, we will encounter signs multiple times.
# Because if a sign is in a subcase, it is also in its containing case and in its containing line.
# We can solve this by collecting the signs in a set.
# Then we lose the corpus order of the signs, but we can easily reorder the set into a list.
#
# There is an alternative method: a search template.
# Search delivers unordered results, so we will reorder the search results as well.
#
# Text-Fabric has an API function for sorting nodes into corpus order: `sortNodes`.
#
# Let us try out both methods and compare the outcomes.
# ### `casesByLevel`
# In[20]:
# Collect the signs of every level-2 case; a set guards against duplicates.
cases = A.casesByLevel(2, terminal=False)
signSet = {
    signNode
    for caseNode in cases
    for signNode in L.d(caseNode, otype="sign")
}
# Put the collected signs back into corpus order.
signsA = N.sortNodes(signSet)
len(signsA)
# or, by query:
# In[21]:
# Search templates express embedding through indentation, so `sign` has to be
# indented under the `case` it must be contained in. Without the indentation the
# two atoms are unrelated and the query no longer means "signs inside level-2 cases".
query = """
case depth=2
    sign
"""
results = A.search(query)
# or by a query not using the `depth` feature:
# In[22]:
# Each `-sub>` atom is indented under its parent, so the `sub` edge runs from the
# parent (line or case) to the directly embedded case; `sign` sits inside the
# innermost case.
query = """
line
    -sub> case
        -sub> case
            sign
"""
results = A.search(query)
# Every result is a tuple (line, case, case, sign); keep only the sign (index 3)
# and put the signs in corpus order.
signsB = N.sortNodes(r[3] for r in results)
# A bit about results.
# The query mentions four quantities: `line`, `case`, `case`, `sign`.
# Every result of the query is an instantiation of those 4 quantities, hence a tuple of nodes:
#
# ```
# (resultLine, resultCase1, resultCase2, resultSign)
# ```
#
# See the table view:
# In[23]:
# Tabular view of the search results; `end=10` presumably limits the display to the first 10 results.
A.table(results, end=10)
# For our purposes we are only interested in the `resultSign` part, so we select it by the
# `r[3]` when we walk through all results `r`.
# ### Check
#
# Both methods yield the same number of results, but are they exactly the same results?
# In[24]:
# Bare comparison: the outcome is only shown when this runs as a notebook cell.
signsA == signsB
# Yes!
# ### Twist
#
# Now we want to restrict ourselves to non-numerical signs.
# If you look at the feature docs (see the link at the start of the notebook),
# and read about the `type` feature for signs, you see that it can have the values
# `empty` `unknown` `numeral` `ideograph`.
# In[25]:
# Frequency list of the `type` feature without restriction on node type.
F.type.freqList()
# Ah, the feature `type` is also used for other things than signs.
# We just want a frequency list of `type` values for signs:
# In[26]:
# Same list, restricted to nodes of type `sign`.
F.type.freqList({"sign"})
# We just want the ideographs.
#
# We'll adapt both methods to get them and ignore the numerals and the less well-defined graphemes.
#
# Of course, we can just filter the result list that we have already got,
# but this is a tutorial, and it may come in handy to have a well stocked repertoire
# of direct ways to drill to your data.
# #### `casesByLevel`
# In[27]:
# Same collection as before, but keep a sign only when its `type` is "ideograph".
cases = A.casesByLevel(2, terminal=False)
signSet = {
    signNode
    for caseNode in cases
    for signNode in L.d(caseNode, otype="sign")
    if F.type.v(signNode) == "ideograph"
}
# Restore corpus order.
signsA = N.sortNodes(signSet)
len(signsA)
# ### `search`
#
# Note that it is very easy to add the desired condition to the template.
#
# This method is much easier to adapt than the first method!
# In[28]:
# `sign` must be indented under `case`: indentation is what tells the search
# engine that the sign is embedded in the level-2 case.
query = """
case depth=2
    sign type=ideograph
"""
results = A.search(query)
# Results are (case, sign) tuples; keep the sign (index 1), in corpus order.
signsB = N.sortNodes(r[1] for r in results)
# In[29]:
# Bare comparison: the outcome is only shown when this runs as a notebook cell.
signsA == signsB
# ## Supercase versus subcase
#
# We finish off with a comparison of the frequencies of signs that occur on lines and level-1 cases, and the frequencies of signs that occur on level-2 and deeper cases.
#
# From both groups we pick the top-20.
# We make a nice markdown table showing the frequencies of those top-20 signs in both groups.
#
# We do this for non-numeric ideographs only.
#
# Note that we have already collected the group of the subcases and deeper: `signsB`.
#
# We give this sequence another name: `subSigns`.
# In[30]:
# Plain alias, not a copy: `subSigns` and `signsB` refer to the same object.
subSigns = signsB
# Bare expression: the length is only shown when this runs as a notebook cell.
len(subSigns)
# We need to collect the group of signs in lines and immediate cases.
# So we have to exclude cases that are subdivided in subcases.
#
# For that, we use the feature `terminal`, which exists and is equal to `1` for undivided
# cases and lines, and which does not exist for divided cases and lines.
#
# We get this group by two queries.
# In[31]:
# Ideographs in undivided lines. `sign` is indented so that it is embedded in
# the line; without indentation the two atoms would be unrelated.
query0 = """
line terminal=1
    sign type=ideograph
"""
# Results are (line, sign) tuples; keep the sign.
signs0 = [r[1] for r in A.search(query0)]
# In[32]:
# Ideographs in undivided cases that are directly `sub`-related to a line.
# The indentation makes the line the parent of the case, and the case the
# container of the sign.
query1 = """
line
    -sub> case terminal=1
        sign type=ideograph
"""
# Results are (line, case, sign) tuples; keep the sign.
signs1 = [r[2] for r in A.search(query1)]
# Let us collect both results into `superSigns`.
# Note that `signs0` and `signs1` have no occurrences in common:
# a sign in `signs1` is part of a case, so the line that contains that case is divided,
# so it has no value for the `terminal` feature, so it is not in the results of `query0`.
# In[33]:
# List concatenation: line-level signs followed by level-1 case signs.
superSigns = signs0 + signs1
# Also note that `superSigns` and `subSigns` have nothing in common, for the same kind of reasoning as why `signs0` and `signs1` have no occurrences in common.
#
# That said, reasoning is one thing, and using data to verify assertions is another thing.
# Let us just check!
# In[34]:
# Expected to be the empty set if the two groups are disjoint.
set(signs0) & set(signs1)
# In[35]:
# Likewise expected to be the empty set.
set(subSigns) & set(superSigns)
# Check!
# Last, but not least, we want to compare the frequencies of the super and sub groups with the
# overall frequencies.
# In[36]:
# All ideographs in any line. `sign` is indented so that it is embedded in the
# line; this gives one (line, sign) result per ideograph occurrence.
queryA = """
line
    sign type=ideograph
"""
allSigns = [r[1] for r in A.search(queryA)]
# ### Frequency and rank
#
# We are going to make a frequency distribution for both groups.
# We do not want to repeat ourselves
# [DRY](https://en.wikipedia.org/wiki/Don%27t_repeat_yourself),
# so we write a function that given a list of items,
# produces a frequency list.
#
# While we're at it, we also produce a ranking list: the most frequent item has rank 1,
# the second frequent item has rank 2, and so on.
#
# When we compute the frequencies, we count the number of times a sign, identified by its
# ATF transcription (without flags), occurs.
# In[37]:
def getFreqs(items):
    """Count items by ATF transcription and rank the transcriptions.

    Each item is mapped through ``A.atfFromSign`` and occurrences are counted
    per resulting value.

    Returns a pair ``(freqs, ranks)``:

    * ``freqs``: a ``collections.Counter`` from transcription to count;
    * ``ranks``: a dict from transcription to its 1-based position when the
      transcriptions are ordered by descending count (equal counts keep
      first-seen order).
    """
    freqs = collections.Counter(A.atfFromSign(item) for item in items)
    ranks = {
        atf: position
        for position, (atf, _count) in enumerate(freqs.most_common(), start=1)
    }
    return (freqs, ranks)
# In[38]:
# Frequency and rank tables for the overall, the super and the sub group.
allFreqs, allRanks = getFreqs(allSigns)
superFreqs, superRanks = getFreqs(superSigns)
subFreqs, subRanks = getFreqs(subSigns)
# Now we want the top scorers in the super and sub teams.
# We make it customisable whether you want the top-20 or top-100, or whatever.
# In[39]:
def getTop(ranks, amount):
    """Return the first `amount` keys of `ranks`, ordered by ascending rank value.

    `ranks` maps items to their rank; the result is a list of items, best
    rank first. Fewer than `amount` items are returned when `ranks` is smaller.
    """
    byRank = sorted(ranks.items(), key=lambda pair: pair[1])
    return [item for (item, _rank) in byRank[:amount]]
# In[40]:
# How many top-ranked signs to take from each group.
AMOUNT = 20
superTop = getTop(superRanks, AMOUNT)
subTop = getTop(subRanks, AMOUNT)
# We combine the two tops without duplication ...
# In[41]:
combiTopSet = set(superTop).union(subTop)
# ... and sort them by overall rank:
# In[42]:
combiTop = sorted(combiTopSet, key=allRanks.__getitem__)
# Since we have now our top characters ready, let us just show them.
# We group them into horizontal lines.
# In[43]:
def chunk(items, chunkSize):
    """Split `items` into consecutive lists of at most `chunkSize` elements.

    `items` may be any iterable; order is preserved. Every chunk except
    possibly the last has exactly `chunkSize` elements.

    Returns a list of lists. Empty input gives an empty list, so callers
    that loop over the chunks do no work instead of handling one empty chunk.

    Raises `ValueError` when `chunkSize` is smaller than 1, because no
    meaningful chunking exists in that case.
    """
    if chunkSize < 1:
        raise ValueError(f"chunkSize must be at least 1, got {chunkSize}")
    chunks = []
    for item in items:
        # Open a new chunk at the very start and whenever the current one is full.
        if not chunks or len(chunks[-1]) == chunkSize:
            chunks.append([])
        chunks[-1].append(item)
    return chunks
# In[44]:
# Show the top signs four to a row, with a horizontal rule before each row.
for signGroup in chunk(combiTop, 4):
    display(Markdown("\n\n---\n\n"))
    A.lineart(signGroup, height=80, width=60)
# We can now compose our table.
#
# For each sign we make a row in which we report the frequency and rank of that sign in all
# groups.
# In[45]:
# Markdown heading plus the table header; one row per sign is appended below.
header = """
### Frequencies and ranks of non-numeral signs
sign | all F | all R | super F | super R | sub F | sub R
--- | --- | --- | --- | --- | --- | ---
"""
# Placeholder shown when a sign does not occur in a group.
BLANK = " "
rows = []
for sign in combiTop:
    overallCells = f"**{allFreqs[sign]}** | **{allRanks[sign]}**"
    superCells = f"{superFreqs.get(sign, BLANK)} | *{superRanks.get(sign, BLANK)}*"
    subCells = f"{subFreqs.get(sign, BLANK)} | *{subRanks.get(sign, BLANK)}*"
    rows.append(f"**{sign}** | {overallCells} | {superCells} | {subCells}\n")
table = header + "".join(rows)
display(Markdown(table))
# # Next
#
# *Ready for advanced ...*
#
# Try the
# [primers](http://nbviewer.jupyter.org/github/Nino-cunei/primers/tree/master/)
# for introductions into digital cuneiform research.
#
# All chapters:
# [start](start.ipynb)
# [imagery](imagery.ipynb)
# [steps](steps.ipynb)
# [search](search.ipynb)
# [calc](calc.ipynb)
# [signs](signs.ipynb)
# [quads](quads.ipynb)
# [jumps](jumps.ipynb)
# **cases**
#
# ---
#
# CC-BY Dirk Roorda