%%capture
%%bash

# Stop at the first failing command, unset variable or failed pipeline stage,
# so later steps never run against missing or partially downloaded files.
set -euo pipefail

# cleaning up residues from past executions and sample data folder
# The directory is removed first; the second command then deletes every
# non-hidden, non-directory entry of the current directory. This is what
# `rm *` targets, without erroring on directories or on an empty folder.
rm -rf -- sample_data
find . -maxdepth 1 ! -type d ! -name '.*' -delete

# downloading the tatoeba corpus

wget -nv http://downloads.tatoeba.org/exports/sentences_detailed.tar.bz2 \
         http://downloads.tatoeba.org/exports/user_languages.tar.bz2 \
         http://downloads.tatoeba.org/exports/links.tar.bz2

# downloading a German word frequency list
# (the archive name indicates 100000 entries)
wget -nv http://www1.ids-mannheim.de/fileadmin/kl/derewo/DeReKo-2014-II-MainArchive-STT.100000.freq.7z

# 7z is already pre-installed on hosted free Colab
# -y answers any overwrite prompt automatically, since nobody can reply here.
7z e -y DeReKo-2014-II-MainArchive-STT.100000.freq.7z
mv -- DeReKo-2014-II-MainArchive-STT.100000.freq freq.csv

# extracting tatoeba corpus

for archive in sentences_detailed user_languages links; do
  tar -xvjf "${archive}.tar.bz2"
done

# cleaning up
# -f keeps the cleanup from failing when a pattern matches nothing
# (for example when no *.readme file was extracted).
rm -f -- *.bz2 *.7z *.readme

# show files
ls -la

%%bash

# in tatoeba CSVs null is represented as a '\N' string

# Reference users per language: keep the rows of user_languages.csv whose
# first field is the language code and whose second field starts with 4 or 5
# (presumably the self-rated skill level - confirm against the export docs).
for lang in eng deu; do
  grep -P "^${lang}\t[45]" user_languages.csv > "${lang}_users.csv"
done

# German sentences: tab-separated input, field 2 = language, field 3 = text,
# field 4 = owner. A row is kept when it is German, has an owner, its text is
# longer than 40 and shorter than 100, and no '.', '?' or '!' occurs anywhere
# before the final character.
awk '
  BEGIN { FS = "\t" }
  $2 != "deu" { next }
  $4 == "\\N" { next }
  {
    n = length($3)
    if (n > 40 && n < 100 && substr($3, 1, n - 1) !~ /[\.\?\!]/)
      print
  }
' sentences_detailed.csv > deu.csv

# English sentences: only the language and the presence of an owner matter.
awk '
  BEGIN { FS = "\t" }
  $2 == "eng" && $4 != "\\N"
' sentences_detailed.csv > eng.csv

# pulling data into Pandas DataFrames

import warnings
import pandas as pd

# suppressing futurewarnings
warnings.simplefilter(action='ignore', category=FutureWarning)

# Frequency list: tab-separated, no header row; the four columns are named
# here.
freq = pd.read_csv(
    'freq.csv',
    sep='\t',
    header=None,
    names=['word', 'lemma', 'POS_tag', 'POS_confidence'])

# Options shared by all files produced from the Tatoeba exports:
# tab-separated, no header row, first selected column used as the index.
# `on_bad_lines='warn'` (pandas >= 1.3) skips malformed rows and reports them.
# It replaces `error_bad_lines=False, warn_bad_lines=True`, which were removed
# in pandas 2.0 together with `mangle_dupe_cols`.
tatoeba_opts = dict(
    delimiter='\t',
    header=None,
    index_col=0,
    on_bad_lines='warn')

# links: indexed by column 0; column 1 holds the id of the linked sentence.
links = pd.read_csv('links.csv', **tatoeba_opts)

# Sentence files: columns 0, 2 and 3 are read as id (index), text and owner.
sentence_opts = dict(tatoeba_opts, usecols=[0, 2, 3], names=['id', 'text', 'owner'])
eng_sentences = pd.read_csv('eng.csv', **sentence_opts)
deu_sentences = pd.read_csv('deu.csv', **sentence_opts)

# User files: only column 2 is read and it becomes the index ('owner'), so
# these frames carry no data columns.
# NOTE(review): neither frame is referenced again in the visible part of this
# file - confirm whether the translations were meant to be filtered by them.
user_opts = dict(tatoeba_opts, usecols=[2], names=['owner'])
eng_users = pd.read_csv('eng_users.csv', **user_opts)
deu_users = pd.read_csv('deu_users.csv', **user_opts)


# Pair every German sentence with its linked English sentence:
#   1. attach the German sentences to the links via the links index,
#      dropping German sentences that have no link;
#   2. re-index by the linked id (column 1) and attach the English sentences,
#      dropping English sentences that were not linked;
#   3. keep only the two text columns and renumber the rows from 0.
translations = links.join(deu_sentences, how='right')\
     .dropna().set_index(1)\
     .join(eng_sentences, how='right', lsuffix='_deu', rsuffix='_eng')\
     .dropna().loc[:,['text_deu','text_eng']].reset_index(drop=True)

# cleaning / trimming down the frequency list
# and also diminishing the rarity as it grows

# NumPy is imported directly: the `pd.np` alias was removed in pandas 2.0 and
# the `np.int` alias in NumPy 1.24.
import numpy as np

bad_POS = ['TRUNC','$(','$,','$.','156259594','XY', 'CARD', 'NE']
bad_lemma = ['UNKNOWN', 'unknown']

POS_filter = ~freq.POS_tag.isin(bad_POS)
lemma_filter = ~freq.lemma.isin(bad_lemma)

# Both masks are applied in one step so each is evaluated against the frame
# it was built from (applying them one after the other indexes an already
# filtered frame with a mask of the original length). `.copy()` gives an
# independent frame, so adding a column below does not write into a slice.
freq = freq[POS_filter & lemma_filter].copy()

# Rarity score per word: natural log of (row label + 1), truncated to an
# integer. The +1 avoids log(0) for the row labelled 0. The row labels are
# those of the unfiltered list, so removed rows leave gaps.
# NOTE(review): this presumably relies on the list being ordered from most to
# least frequent - confirm against the data source.
freq['log_freq'] = (
    np.log(freq.index.to_numpy(dtype=np.int64) + 1)
    .astype(np.int64)
)

# preparing the German sentences to be probed against the frequency list
# Commas and colons are stripped, then each sentence is split on whitespace.
# `regex=True` is explicit because pandas >= 2.0 treats the pattern as a
# literal string by default, which would leave the punctuation in place.
word_lists = (
    translations['text_deu']
    .str.replace(r'[,:]', '', regex=True)
    .str.split()
)

# left join the frequency list to every single word
# and deriving the median rarity of the words in the sentence
# All sentences are handled in one pass: one row per (sentence, word), a
# single left merge against the frequency list, then a per-sentence median.
# Words missing from the list get NaN and are skipped by the median.
sentence_words = (
    word_lists
      .explode()
      .dropna()
      .rename('word')
      .rename_axis('row')
      .reset_index()
)

# The result is indexed by the original row label, so the assignment aligns
# it with `translations`; sentences without any known word end up as NaN.
translations['rarity'] = (
    sentence_words
      .merge(freq[['word', 'log_freq']], on='word', how='left')
      .groupby('row')['log_freq']
      .median()
)

# calculating the complexity value:
# sentence length in characters multiplied by the median word rarity
translations['complexity'] = (
    translations.text_deu.str.len()
    *
    translations.rarity
)

# sorting sentences by complexity, then resetting index
# ready to export
# The result is assigned back to `translations`: `sort_values` returns a new
# frame, so an in-place `reset_index` on that temporary would leave
# `translations` itself unsorted.
translations = (
    translations
      .sort_values('complexity', ascending=True)
      .reset_index(drop=True)
)

%%capture
from google.colab import files

# Base of the Google Translate text-to-speech URL (German voice); the sentence
# text is appended as the value of the trailing `q=` parameter.
url = ('https://translate.google.com/' +
       'translate_tts?ie=UTF-8&tl=de-DE&client=tw-ob&q=')

# making the German text Google Translate URL compatible:
# every space, single quote and double quote becomes '+', and one more '+'
# is appended. `regex=True` is explicit because pandas >= 2.0 treats the
# pattern as a literal string by default, which would leave the text unchanged.
translations['audio'] = (url +

      translations['text_deu'].str.replace('[ \'\"]', '+', regex=True) + '+')

# taking a look at the results. Remember - grammar does not count into
# complexity, only how common the words are in the sentence
# NOTE(review): this is a bare expression that is not the last statement of
# the cell, so its value is presumably not rendered - confirm whether an
# explicit display call is wanted.
(translations[['text_deu', 'text_eng', 'complexity']]
   .sample(50)
   .set_index(['text_deu','text_eng'])
   .sort_values('complexity'))

# generating result and downloading it in chrome
# Written tab-separated without a header row; the row index is included as
# the first column.
(translations[['text_deu', 'text_eng', 'audio', 'complexity']]
    .to_csv('export.csv', sep='\t', encoding='utf-8', header=False))

# downloading script results to your machine
files.download('export.csv')