#!/usr/bin/env python
# coding: utf-8

# # LDA Spike 1 - Cleaning

# This notebook "cleans" the text files containing answers with the help of the Natural Language Processing
# library [spaCy](https://spacy.io/). By default the text files are expected to be found in the folder
# `Corpus` and the cleaned files are written into the folder `Cleaned`. We want to keep only useful
# information in the files and remove any "noise". Our strategy is to do the following:
#
# * Replace all words by their lemmata ('sang', 'singe', 'singt' --> 'singen').
# * Keep the capitalization of nouns and proper nouns, but otherwise change to lower case.
# * Keep only verbs, nouns, proper nouns and adjectives.
#
# Even before this more sophisticated processing, we manually cut off greeting phrases at the beginning and
# the end of each answer, as they do not contribute to the topic.
#
# The randomly picked example below will (probably) demonstrate the impact of these transformations, and the
# small illustration cell right after this one shows the idea in isolation. Nevertheless, there is still much
# room for improvement. You may try other NLP libraries as well or, on the contrary, skip this step
# altogether.
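
# A minimal sketch of the core idea (it assumes the German spaCy model loaded further down in this notebook
# is installed; the sentence is made up and the exact lemmata depend on the model version):

# In[ ]:


import spacy

nlp = spacy.load('de')
for token in nlp('Die Kühe sangen ein schönes Lied.'):
    # Print surface form, lemma and part-of-speech tag for each token.
    print(token.text, token.lemma_, token.pos_)
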

# __This notebook writes to and reads from your file system.__ By default all used directories are within
# `~/TextData/Abgeordnetenwatch`, where `~` stands for whatever your operating system considers your home
# directory. To change this configuration either change the default values in the second next cell or edit
# [LDA Spike - Configuration.ipynb](./LDA%20Spike%20-%20Configuration.ipynb) and run it before you run this
# notebook.
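
# For orientation, the configuration notebook is assumed to define and store the variables that are read
# back below roughly like this (a sketch with the same example values this notebook falls back to, not the
# actual content of that notebook):
#
# ```python
# from pathlib import Path
# project_name = 'AbgeordnetenWatch'
# text_data_dir = Path.home() / 'TextData'
# own_configuration_was_read = True
# %store project_name text_data_dir own_configuration_was_read
# ```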

# This notebook operates on text files. In our case we retrieved these texts from www.abgeordnetenwatch.de,
# guided by data that was made available under the
# [Open Database License (ODbL) v1.0](https://opendatacommons.org/licenses/odbl/1.0/) at that site.

# In[1]:


import time
import random as rnd
from pathlib import Path

import spacy


# In[2]:


get_ipython().run_line_magic('store', '-r own_configuration_was_read')
if not('own_configuration_was_read' in globals()):
    raise Exception(
        '\nReminder: You might want to run your configuration notebook before you run this notebook.' +
        '\nIf you want to manage your configuration from each notebook, just remove this check.')

get_ipython().run_line_magic('store', '-r project_name')
if not('project_name' in globals()):
    project_name = 'AbgeordnetenWatch'

get_ipython().run_line_magic('store', '-r text_data_dir')
if not('text_data_dir' in globals()):
    text_data_dir = Path.home() / 'TextData'


# In[3]:


corpus_dir = text_data_dir / project_name / 'Corpus'
cleaned_dir = text_data_dir / project_name / 'Cleaned'

assert corpus_dir.exists(), 'Directory should exist.'
assert corpus_dir.is_dir(), 'Directory should be a directory.'
assert next(corpus_dir.iterdir(), None) is not None, 'Directory should not be empty.'

cleaned_dir.mkdir(parents=True, exist_ok=True)  # Creates a local directory!


# In[4]:


update_only_missing_texts = True


# ## Manual removal of greeting phrases

# In[5]:


opening_greeting = ['sehr geehrter ', 'sehr geehrte ', 'liebe ', 'lieber ', 'hallo ']
closing_greeting = ['mit freundlichen grüßen', 'mit freundlichem gruß', 'mfg', 'freundliche grüße',
                    'viele grüße', 'beste grüße', 'mit besten grüßen', 'liebe grüße',
                    'herzliche grüße', 'vielen dank und', 'vg,', 'vg ']
max_closing_lines = 4


def without_opening_greeting(lines):
    for l, line in enumerate(lines):
        lower_line = line.strip().lower()
        for greeting in opening_greeting:
            if lower_line.startswith(greeting):
                line = ','.join(line.split(',')[1:])
                lower_line = line.strip().lower()
        lines[l] = line
    return lines


def post_scriptum(lines):
    for l, line in enumerate(lines):
        if line.startswith('P.S.') or line.startswith('PS'):
            return lines[l:]
    return []


def without_closing_greeting(lines):
    for l, line in enumerate(lines):
        lower_line = line.strip().lower()
        if any(lower_line.startswith(greeting) for greeting in closing_greeting):
            lines = lines[:l] + post_scriptum(lines[l:])
            break
    return lines


def without_greetings(text):
    lines = text.strip().splitlines()
    if len(lines) < 1:
        return ''
    lines = without_opening_greeting(lines[:1]) + lines[1:]
    closing_start = min(len(lines), max_closing_lines)
    lines = lines[:-closing_start] + without_closing_greeting(lines[-closing_start:])
    return '\n'.join(lines).strip()


# In[6]:


text = '''
Sehr geehrter Herr N.N., liebe Frau Sonnenschein,

wir freuen uns über Ihre Nachricht, die wir gerne demnächst beantworten.

Vielen Dank und herzliche Grüße
von Ihrem Abgeordneten

P.S.: Unsere Partei schätzt den Bürgerdialog
'''
print(without_greetings(text))


# ## NLP-based Cleaning

# In[7]:


notaword_pos = ['SPACE', 'PUNCT']
keepcase_pos = ['NOUN', 'PROPN']
keepword_pos = ['ADJ', 'NOUN', 'PROPN', 'VERB']


# In[8]:


german = spacy.load('de')
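
# Note: the shortcut name `'de'` works for the spaCy 2.x setup this notebook was written with. Newer spaCy
# releases (3.x) dropped these shortcuts; there you would download a concrete German pipeline once with
# `python -m spacy download de_core_news_sm` and load it under its full name (shown commented out below so
# it does not interfere with the cell above):

# In[ ]:


# german = spacy.load('de_core_news_sm')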

# In[9]:


def cleaned_text(text):
    text_model = german(text)
    lemmata = [token.lemma_ if token.pos_ in keepcase_pos else token.lemma_.lower()
               for token in text_model
               if token.pos_ in keepword_pos]
    return ' '.join(lemmata)


# In[10]:


text = 'Die Kuh rannte bis sie fiel, in die Vertiefung.'
print(text, '-->', cleaned_text(text))


# ## Load all files and remove the greetings

# In[11]:


answer_filenames = []
answer_texts = []
min_text_len = 50

files = list(corpus_dir.glob('*A*.txt'))
files.sort()
for file in files:
    text = without_greetings(file.read_text())
    if len(text) >= min_text_len:
        answer_filenames.append(file.name)
        answer_texts.append(text)
files = None


# ## Random Example Text

# In[12]:


min_len = 400
max_len = 800
example_text = ''
while (len(example_text) < min_len or len(example_text) > max_len):
    example = rnd.randint(0, len(answer_filenames) - 1)
    example_text = answer_texts[example]
print(example_text)


# In[13]:


# Create a model of the text. We use POS tagging to filter the words:
# https://spacy.io/api/annotation#pos-tagging
text_model = german(example_text)


# ### Lemmatized words with part of speech tags

# In[14]:


for token in text_model:
    if token.pos_ in notaword_pos:
        print(token, end='')
    else:
        print(token.lemma_, token.pos_, end=' ')


# ### Words by part of speech

# In[15]:


parts_of_speech = {}
for token in text_model:
    pos = token.pos_
    if pos in ['SPACE', 'PUNCT']:
        continue
    words = parts_of_speech.setdefault(pos, set())
    if pos in keepcase_pos:
        words.add(token.text)
    else:
        words.add(token.text.lower())

for key in sorted(parts_of_speech.keys()):
    words = list(parts_of_speech[key])
    words.sort()
    print('{:5}: {}'.format(key, ', '.join(words)))


# ### Lemmatizations

# In[16]:


lemmatizations = list(set(
    token.text + ' -> ' + token.lemma_
    for token in text_model
    if token.text != token.lemma_
))
lemmatizations.sort()
print(', '.join(lemmatizations))


# ### Filtered by part of speech

# In[17]:


for token in text_model:
    if token.pos_ in keepword_pos:
        print(token.lemma_, end=' ')


# ### Cleaned Example Text

# In[18]:


print(30 * '-' + ' Original text: ' + 30 * '-')
print(example_text)
print(30 * '-' + ' Cleaned text: ' + 30 * '-')
print(cleaned_text(example_text))


# ## Write all cleaned files

# In[19]:


nlp_start_time = time.perf_counter()

num_files = len(answer_texts)
success = []
failure = []
for filename, answer_text in zip(answer_filenames, answer_texts):
    target_file = cleaned_dir / filename
    if update_only_missing_texts and target_file.exists():
        continue
    try:
        target_file.write_text(cleaned_text(answer_text))
        success.append(filename)
    except Exception as exception:
        failure.append((filename, exception))
    finally:
        print('\r{}/{} files successfully processed. {} files failed.'.format(
            len(success), num_files, len(failure)), end='')

nlp_end_time = time.perf_counter()
print('\nParsing the text as natural language and cleaning took {:.2f}s'.format(nlp_end_time - nlp_start_time))


# In[20]:


if failure:
    for filename, exception in failure:
        print('Exception while processing "{}" was:'.format(filename))
        print(exception)
else:
    print('No exception during preprocessing :-)')
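
# As a quick sanity check you can look at one of the files that were just written (an optional sketch; it
# simply prints whichever cleaned file the directory listing yields first):

# In[ ]:


example_file = next(cleaned_dir.glob('*.txt'), None)
if example_file is None:
    print('No cleaned files found in', cleaned_dir)
else:
    print(example_file.name)
    print(example_file.read_text())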

# __Creative Commons License__
#
# © T. Dong, D. Speicher
#
# Licensed under a [CC BY-NC 4.0](https://creativecommons.org/licenses/by-nc/4.0/) license.

# Acknowledgments: This material was prepared within the project P3ML, which is funded by the Ministry of
# Education and Research of Germany (BMBF) under grant number 01IS17064. The authors gratefully acknowledge
# this support.