%%capture %%bash # cleaning up residues from past executions and sample data folder rm * rm -rf sample_data # downloading the tatoeba corpus wget -nv http://downloads.tatoeba.org/exports/sentences_detailed.tar.bz2 \ http://downloads.tatoeba.org/exports/user_languages.tar.bz2 \ http://downloads.tatoeba.org/exports/links.tar.bz2 # downloading a 10k German word frequency list wget -nv http://www1.ids-mannheim.de/fileadmin/kl/derewo/DeReKo-2014-II-MainArchive-STT.100000.freq.7z # 7z is already pre-installed on hosted free Colab 7z e DeReKo-2014-II-MainArchive-STT.100000.freq.7z mv DeReKo-2014-II-MainArchive-STT.100000.freq freq.csv # extracting tatoeba corpus tar xvjf sentences_detailed.tar.bz2 tar xvjf user_languages.tar.bz2 tar xvjf links.tar.bz2 # cleaning up rm *.bz2 rm *.7z rm *.readme # show files ls -la %%bash # in tatoeba CSVs null is represented as a '\N' string # selecting reference users, whose translations will be used grep -P '^eng\t[45]' user_languages.csv > eng_users.csv grep -P '^deu\t[45]' user_languages.csv > deu_users.csv # German sentences (length, owner, punctuation) awk -F, ' BEGIN {FS="\t"}; { if (($2 == "deu" && $4 != "\\N" && length($3) > 40 && length($3) < 100 && substr($3, 1, length($3)-1) !~ /[\.\?\!]/)) print $0 } ' sentences_detailed.csv > deu.csv # English sentences (owner) awk -F, ' BEGIN {FS="\t"}; { if (($2 == "eng" && $4 != "\\N")) print $0 } ' sentences_detailed.csv > eng.csv # pulling data into Pandas DataFrames import warnings import pandas as pd # suppressing futurewarnings warnings.simplefilter(action='ignore', category=FutureWarning) freq = pd.read_csv( 'freq.csv', sep='\t', header=None, names=['word', 'lemma', 'POS_tag', 'POS_confidence']) links = pd.read_csv('links.csv', delimiter='\t', error_bad_lines=False, warn_bad_lines=True, index_col=0, header=None, mangle_dupe_cols=True) eng_sentences = pd.read_csv('eng.csv', delimiter='\t', error_bad_lines=False, warn_bad_lines=True, index_col=0, usecols=[0,2,3], names=['id', 'text', 'owner'], header=None) deu_sentences = pd.read_csv('deu.csv', delimiter='\t', error_bad_lines=False, warn_bad_lines=True, index_col=0, usecols=[0,2,3], names=['id', 'text', 'owner'], header=None) eng_users = pd.read_csv('eng_users.csv', delimiter='\t', error_bad_lines=False, warn_bad_lines=True, index_col=0, usecols=[2], names=['owner'], header=None) deu_users = pd.read_csv('deu_users.csv', delimiter='\t', error_bad_lines=False, warn_bad_lines=True, index_col=0, usecols=[2], names=['owner'], header=None) translations = links.join(deu_sentences, how='right')\ .dropna().set_index(1)\ .join(eng_sentences, how='right', lsuffix='_deu', rsuffix='_eng')\ .dropna().loc[:,['text_deu','text_eng']].reset_index(drop=True) # cleaning / trimming down the frequency list # and also diminishing the rarity as it grows bad_POS = ['TRUNC','$(','$,','$.','156259594','XY', 'CARD', 'NE'] bad_lemma = ['UNKNOWN', 'unknown'] POS_filter = ~freq.POS_tag.isin(bad_POS) lemma_filter = ~freq.lemma.isin(bad_lemma) freq = freq[POS_filter] freq = freq[lemma_filter] freq['log_freq'] = ( pd.np.log( freq.index .astype(pd.np.int64) +1 # there I fixed np.log(0) with a ducktape ) .astype(pd.np.int) ) # preparing the German sentences to be probed against the frequency list word_lists = ( translations['text_deu'] .str.replace(r'[,:]', '') .str.split() ) # left join the frequency list to every single word list # and deriving the median rarity of the words in the sentence translations['rarity'] = ( word_lists .apply( lambda word_list: pd.DataFrame(word_list) .merge(freq, left_on=0, right_on='word', how='left')['log_freq'] .median() ) ) # calculating the complexity value translations['complexity'] = ( translations.text_deu.str.len() * translations.rarity ) # sorting sentences by complexity, then resetting index # ready to export ( translations .sort_values('complexity', ascending=True) .reset_index(drop=True, inplace=True) ) %%capture from google.colab import files url = ('https://translate.google.com/' + 'translate_tts?ie=UTF-8&tl=de-DE&client=tw-ob&q=') # making the German text Google Translate URL compatible translations['audio'] = (url + translations['text_deu'].str.replace('[ \'\"]', '+') + '+') # taking a look at the results. Remember - grammar does not count into # complexity, only how common the words are in the sentence (translations[['text_deu', 'text_eng', 'complexity']] .sample(50) .set_index(['text_deu','text_eng']) .sort_values('complexity')) # generating result and downloading it in chrome (translations[['text_deu', 'text_eng', 'audio', 'complexity']] .to_csv('export.csv', sep='\t', encoding='utf-8', header=False)) # downloading script results to your machine files.download('export.csv')