#!/usr/bin/env python
# coding: utf-8

# ---
# 
# To get started: consult [start](start.ipynb)
# 
# ---

# # Sharing data features
# 
# ## Explore additional data
# 
# Once you analyse a corpus, you are likely to produce data that others can reuse.
# Maybe you have defined a set of proper name occurrences, or you have computed sentiments.
# 
# It is possible to turn these insights into *new features*, i.e. new `.tf` files with values assigned to specific nodes.
# 
# ## Make your own data
# 
# New data is a product of your own methods and computations in the first place.
# But how do you turn that data into new TF features?
# It turns out that the last step is not that difficult.
# 
# If you can shape your data as a mapping (dictionary) from node numbers (integers) to values
# (strings or integers), then TF can turn that data into a feature file for you with one command.
# 
# ## Share your new data
# 
# You can then easily share your new features on GitHub, so that your colleagues everywhere
# can try them out for themselves.
# 
# You can add such data on the fly, by passing a `mod={org}/{repo}/{path}` parameter,
# or a bunch of them separated by commas.
# 
# If the data is there, it will be auto-downloaded and stored on your machine.
# 
# Let's do it.

# In[1]:


get_ipython().run_line_magic('load_ext', 'autoreload')
get_ipython().run_line_magic('autoreload', '2')


# In[2]:


import collections
import os

from tf.app import use


# In[3]:


A = use("q-ran/quran", checkout="clone", hoist=globals())


# # Making data
# 
# We illustrate the data creation part by creating a new feature, `sentiment`.
# 
# It is not a very sensible feature linguistically, but it serves to illustrate the workflow.
# 
# We consider ayas that start with a vocative particle as a positive context,
# and ayas that start with a resumptive particle as a negative context.
# 
# For each lemma of a noun, verb, or adjective in the corpus,
# we count how often it occurs in a positive context,
# and subtract how many times it occurs in a negative context.
# 
# The resulting number is the sentiment.
# 
# We use one query to fetch the positive contexts and another one for the negative contexts.

# In[4]:


contentTypes = set("verb noun adjective".split())
contentTypeCrit = "|".join(contentTypes)


# In[5]:


queryP = f"""
aya
  =: word posx=vocative
  word pos={contentTypeCrit}
"""

queryN = f"""
aya
  =: word posx=resumption
  word pos={contentTypeCrit}
"""


# In[6]:


resultsP = A.search(queryP)
resultsN = A.search(queryN)


# Here are the first few results of both:

# In[7]:


A.displaySetup(extraFeatures="translation@en")


# In[8]:


A.show(resultsP, end=2, condensed=True)


# In[9]:


A.show(resultsN, end=2, condensed=True)


# Observe how the positive results indeed have a positive sentiment, and the negative ones are indeed negative.
# 
# However, we make no attempt to weed out positive words under negation from the negative contexts.
# 
# So our sentiments have to work against massive "pollution", and are probably not very useful.

# In[10]:


sentiment = collections.Counter()

for (results, kind) in ((resultsP, 1), (resultsN, -1)):
    for (aya, particle, word) in results:
        sentiment[F.lemma.v(word)] += kind


# Let's check what we found: how many lemmas per sentiment value.

# In[11]:


sentimentDist = collections.Counter()

for (lemma, sent) in sentiment.items():
    sentimentDist[sent] += 1

for (sent, amount) in sorted(
    sentimentDist.items(),
    key=lambda x: (-x[1], x[0]),
):
    print(f"sentiment {sent:>3} is assigned to {amount:>4} lemmas")
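# As an extra sanity check (this cell is an addition, not part of the original
# workflow), we can peek at the extreme ends of the counter itself:

# In[ ]:


# sort the lemmas by their sentiment score (ascending)
ranked = sorted(sentiment.items(), key=lambda x: x[1])

# the five most negative lemmas ...
for (lemma, sent) in ranked[0:5]:
    print(f"{sent:>4} {lemma}")
print("    ...")
# ... and the five most positive ones
for (lemma, sent) in ranked[-5:]:
    print(f"{sent:>4} {lemma}")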
# We show the most negative and most positive sentiments in context.

# In[12]:


negaThreshold = -100
posiThreshold = 4

xPlemmas = {lemma for lemma in sentiment if sentiment[lemma] >= posiThreshold}
xNlemmas = {lemma for lemma in sentiment if sentiment[lemma] <= negaThreshold}

xPwords = [
    w
    for w in F.otype.s("word")
    if F.lemma.v(w) in xPlemmas and F.pos.v(w) in contentTypes
]
xNwords = [
    w
    for w in F.otype.s("word")
    if F.lemma.v(w) in xNlemmas and F.pos.v(w) in contentTypes
]

print(f"{len(xPwords)} extremely positive word occurrences")
print(f"{len(xNwords)} extremely negative word occurrences")


# We put the words in their ayas, and show a few.

# In[13]:


xPayas = collections.defaultdict(list)
xNayas = collections.defaultdict(list)

for w in xPwords:
    a = L.u(w, otype="aya")[0]
    xPayas[a].append(w)
for w in xNwords:
    a = L.u(w, otype="aya")[0]
    xNayas[a].append(w)

print(f"{len(xPayas)} ayas with extremely positive word occurrences")
print(f"{len(xNayas)} ayas with extremely negative word occurrences")

xPtuples = [(a, *words) for (a, words) in sorted(xPayas.items())]
xNtuples = [(a, *words) for (a, words) in sorted(xNayas.items())]


# We show three ayas of each category.

# In[14]:


A.show(xPtuples, end=3)


# In[15]:


A.show(xNtuples, end=3)


# Probably Allah has a negative sentiment because He occurs in many negative contexts as a punisher.
# 
# Anyway, we do not try to be sophisticated here.
# 
# We move on to export this sentiment feature.

# # Saving data
# 
# The [documentation](https://annotation.github.io/text-fabric/tf/core/fabric.html#tf.core.fabric.FabricCore.save) explains how to save this data into a Text-Fabric
# data file.
# 
# We choose a location to save it: the `exercises` repository in the `q-ran` organization, in the folder `mining`.
# 
# We pass the desired output location to the save command below by means of its `location` and `module` parameters.

# In[16]:


GITHUB = os.path.expanduser("~/github")
ORG = "q-ran"
REPO = "exercises"
PATH = "mining"
VERSION = A.version


# Note the version: we have built the feature against a specific version of the data:

# In[17]:


A.version


# Later on, we pass this version on, so that users of our data will get the shared data in exactly the same version as their core data.
# 
# We have to specify a bit of metadata for this feature:

# In[18]:


metaData = {
    "sentiment": dict(
        valueType="int",
        description="crude sentiments in the Quran",
        creator="Dirk Roorda",
    ),
}

sentimentData = {
    w: sentiment[F.lemma.v(w)]
    for w in F.otype.s("word")
    if F.lemma.v(w) in sentiment and F.pos.v(w) in contentTypes
}


# Now we can give the save command:

# In[19]:


TF.save(
    nodeFeatures=dict(sentiment=sentimentData),
    metaData=metaData,
    location=f"{GITHUB}/{ORG}/{REPO}/{PATH}/tf",
    module=VERSION,
)
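# As an extra check (again an addition to the original workflow), we can peek at
# the first lines of the file that `save` has just written. We assume here that
# the feature ends up in `{location}/{module}/sentiment.tf`, which is how the
# `location` and `module` arguments combine.

# In[ ]:


featurePath = f"{GITHUB}/{ORG}/{REPO}/{PATH}/tf/{VERSION}/sentiment.tf"

with open(featurePath) as fh:
    for (i, line) in enumerate(fh):
        # the file starts with the metadata, followed by node-value pairs
        print(line.rstrip("\n"))
        if i >= 10:
            break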
# # Sharing data
# 
# How to share your own data is explained in the
# [documentation](https://annotation.github.io/text-fabric/tf/about/datasharing.html).
# 
# Here we show it step by step for the `sentiment` feature.
# 
# ## Zip the data
# 
# We need to zip the data in exactly the right directory structure. Text-Fabric can do that for us:

# In[20]:


get_ipython().run_cell_magic('sh', '', '\ntext-fabric-zip q-ran/exercises/mining/tf\n')


# Now you have the zip file, in the desired structure, in your Downloads folder.
# 
# ## Put the data on GitHub
# 
# The next thing is: make a new release of your repository, in this case `q-ran/exercises`, and attach
# the zip file as a binary.
# 
# You have to do this in your web browser, on the GitHub website.
# 
# Here is the result for our case:
# 
# ![release](images/release.png)

# # Use the data
# 
# We can use the data by calling it up when we say `use("q-ran/quran", ...)`.
# 
# Here is how:

# In[21]:


A = use(
    "q-ran/quran",
    hoist=globals(),
    mod="q-ran/exercises/mining/tf:clone",
)


# Above you see a new section in the feature list, **q-ran/exercises/mining/tf**, with our foreign feature in it: `sentiment`.
# 
# Now, suppose we did not know much about this feature. Then we would like to do a few basic checks:

# In[22]:


F.sentiment.freqList()


# Which nodes have a sentiment feature?

# In[23]:


{F.otype.v(n) for n in N.walk() if F.sentiment.v(n)}


# Only words have the feature.
# 
# Which parts of speech do these words have?

# In[24]:


{F.pos.v(n) for n in F.otype.s("word") if F.sentiment.v(n)}


# Let's have a look at a table of some words with positive sentiments.

# In[25]:


results = A.search(
    """
word sentiment>0
"""
)


# In[26]:


A.table(results, start=1, end=5)


# In[27]:


results = A.search(
    """
word sentiment<0
"""
)


# In[28]:


A.table(results, start=1, end=5)


# Let's get ayas with both positive and negative words:

# In[29]:


results = A.search(
    """
aya
  word sentiment>0
  word sentiment<0
"""
)


# In[30]:


A.table(results, start=1, end=2, condensed=True)


# With highlights:

# In[31]:


highlights = {}

for w in F.otype.s("word"):
    sent = F.sentiment.v(w)
    if sent:
        color = "lightsalmon" if sent < 0 else "mediumaquamarine"
        highlights[w] = color


# In[32]:


A.table(results, start=1, end=10, condensed=True, highlights=highlights)


# If we do a pretty display, the `sentiment` feature shows up.

# In[33]:


A.show(results, start=1, end=3, condensed=True, withNodes=True, highlights=highlights)


# # All together!
# 
# If more researchers have shared data modules, you can draw them all in.
# 
# Then you can design queries that use features from all these different sources.
# 
# In that way, you build your own research on top of the work of others.

# ---
# 
# All chapters:
# 
# * **[start](start.ipynb)** introduction to computing with your corpus
# * **[display](display.ipynb)** become an expert in creating pretty displays of your text structures
# * **[search](search.ipynb)** turbo charge your hand-coding with search templates
# * **[exportExcel](exportExcel.ipynb)** make tailor-made spreadsheets out of your results
# * **share** draw in other people's data and let them use yours
# * **[similarAyas](similarAyas.ipynb)** spot the similarities between ayas
# * **[rings](rings.ipynb)** ring structures in sura 2
# 
# CC-BY Dirk Roorda