import spacy
import pandas as pd
import numpy as np
import nltk
from nltk.tokenize.toktok import ToktokTokenizer
import re
from bs4 import BeautifulSoup
from contractions import CONTRACTION_MAP
import unicodedata
nlp = spacy.load('en', parse = False, tag=False, entity=False)
tokenizer = ToktokTokenizer()
stopword_list = nltk.corpus.stopwords.words('english')
stopword_list.remove('no')
stopword_list.remove('not')
def strip_html_tags(text):
soup = BeautifulSoup(text, "html.parser")
stripped_text = soup.get_text()
return stripped_text
def remove_accented_chars(text):
text = unicodedata.normalize('NFKD', text).encode('ascii', 'ignore').decode('utf-8', 'ignore')
return text
def expand_contractions(text, contraction_mapping=CONTRACTION_MAP):
contractions_pattern = re.compile('({})'.format('|'.join(contraction_mapping.keys())),
flags=re.IGNORECASE|re.DOTALL)
def expand_match(contraction):
match = contraction.group(0)
first_char = match[0]
expanded_contraction = contraction_mapping.get(match)\
if contraction_mapping.get(match)\
else contraction_mapping.get(match.lower())
expanded_contraction = first_char+expanded_contraction[1:]
return expanded_contraction
expanded_text = contractions_pattern.sub(expand_match, text)
expanded_text = re.sub("'", "", expanded_text)
return expanded_text
def remove_special_characters(text):
text = re.sub('[^a-zA-z0-9\s]', '', text)
return text
def lemmatize_text(text):
text = nlp(text)
text = ' '.join([word.lemma_ if word.lemma_ != '-PRON-' else word.text for word in text])
return text
def remove_stopwords(text, is_lower_case=False):
tokens = tokenizer.tokenize(text)
tokens = [token.strip() for token in tokens]
if is_lower_case:
filtered_tokens = [token for token in tokens if token not in stopword_list]
else:
filtered_tokens = [token for token in tokens if token.lower() not in stopword_list]
filtered_text = ' '.join(filtered_tokens)
return filtered_text
def normalize_corpus(corpus, html_stripping=True, contraction_expansion=True,
accented_char_removal=True, text_lower_case=True,
text_lemmatization=True, special_char_removal=True,
stopword_removal=True):
normalized_corpus = []
# normalize each document in the corpus
for doc in corpus:
# strip HTML
if html_stripping:
doc = strip_html_tags(doc)
# remove accented characters
if accented_char_removal:
doc = remove_accented_chars(doc)
# expand contractions
if contraction_expansion:
doc = expand_contractions(doc)
# lowercase the text
if text_lower_case:
doc = doc.lower()
# remove extra newlines
doc = re.sub(r'[\r|\n|\r\n]+', ' ',doc)
# insert spaces between special characters to isolate them
special_char_pattern = re.compile(r'([{.(-)!}])')
doc = special_char_pattern.sub(" \\1 ", doc)
# lemmatize text
if text_lemmatization:
doc = lemmatize_text(doc)
# remove special characters
if special_char_removal:
doc = remove_special_characters(doc)
# remove extra whitespace
doc = re.sub(' +', ' ', doc)
# remove stopwords
if stopword_removal:
doc = remove_stopwords(doc, is_lower_case=text_lower_case)
normalized_corpus.append(doc)
return normalized_corpus
document = """<p>Héllo! Héllo! can you hear me! I just heard about <b>Python</b>!<br/>\r\n
It's an amazing language which can be used for Scripting, Web development,\r\n\r\n
Information Retrieval, Natural Language Processing, Machine Learning & Artificial Intelligence!\n
What are you waiting for? Go and get started.<br/> He's learning, she's learning, they've already\n\n
got a headstart!</p>
"""
document
"<p>Héllo! Héllo! can you hear me! I just heard about <b>Python</b>!<br/>\r\n \n It's an amazing language which can be used for Scripting, Web development,\r\n\r\n\n Information Retrieval, Natural Language Processing, Machine Learning & Artificial Intelligence!\n\n What are you waiting for? Go and get started.<br/> He's learning, she's learning, they've already\n\n\n got a headstart!</p>\n "
normalize_corpus([document], text_lemmatization=False, stopword_removal=False, text_lower_case=False)
['Hello Hello can you hear me I just heard about Python It is an amazing language which can be used for Scripting Web development Information Retrieval Natural Language Processing Machine Learning Artificial Intelligence What are you waiting for Go and get started He is learning she is learning they have already got a headstart ']
normalize_corpus([document])
['hello hello hear hear python amazing language use scripting web development information retrieval natural language processing machine learning artificial intelligence wait go get start learn learn already get headstart']