#!/usr/bin/env python
# coding: utf-8

# # Data retrieval

# In[1]:

import requests
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
get_ipython().run_line_magic('matplotlib', 'inline')


# In[429]:

seed_urls = ['https://inshorts.com/en/read/technology',
             'https://inshorts.com/en/read/sports',
             'https://inshorts.com/en/read/world']

def build_dataset(seed_urls):
    news_data = []
    for url in seed_urls:
        news_category = url.split('/')[-1]
        data = requests.get(url)
        soup = BeautifulSoup(data.content, 'html.parser')
        news_articles = [{'news_headline': headline.find('span', attrs={"itemprop": "headline"}).string,
                          'news_article': article.find('div', attrs={"itemprop": "articleBody"}).string,
                          'news_category': news_category}
                         for headline, article in
                         zip(soup.find_all('div', class_=["news-card-title news-right-box"]),
                             soup.find_all('div', class_=["news-card-content news-right-box"]))]
        news_data.extend(news_articles)
    df = pd.DataFrame(news_data)
    df = df[['news_headline', 'news_article', 'news_category']]
    return df


# In[430]:

news_df = build_dataset(seed_urls)
news_df.head(10)


# In[431]:

news_df.news_category.value_counts()


# # Text Wrangling and Pre-processing

# In[2]:

import spacy
import pandas as pd
import numpy as np
import nltk
from nltk.tokenize.toktok import ToktokTokenizer
import re
from bs4 import BeautifulSoup
from contractions import CONTRACTION_MAP
import unicodedata

# 'en_core' was a custom shortcut link; the standard small English model
# (tagger, parser and NER enabled by default) works here.
nlp = spacy.load('en_core_web_sm')
#nlp_vec = spacy.load('en_vecs', parse=True, tag=True, entity=True)
tokenizer = ToktokTokenizer()
stopword_list = nltk.corpus.stopwords.words('english')
stopword_list.remove('no')
stopword_list.remove('not')


# ## Remove HTML tags

# In[3]:

def strip_html_tags(text):
    soup = BeautifulSoup(text, "html.parser")
    stripped_text = soup.get_text()
    return stripped_text

strip_html_tags('<html><h2>Some important text</h2></html>')
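
# A minimal setup sketch, assuming the standard NLTK data identifiers: the
# stopword list above and the POS tagging / chunking sections below need a
# few corpora and models downloaded once.

# In[ ]:

nltk.download('stopwords')                    # for nltk.corpus.stopwords
nltk.download('averaged_perceptron_tagger')   # for nltk.pos_tag
nltk.download('conll2000')                    # for the shallow parsing section
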

# ## Remove accented characters

# In[4]:

def remove_accented_chars(text):
    text = unicodedata.normalize('NFKD', text).encode('ascii', 'ignore').decode('utf-8', 'ignore')
    return text

remove_accented_chars('Sómě Áccěntěd těxt')


# ## Expand contractions

# In[5]:

def expand_contractions(text, contraction_mapping=CONTRACTION_MAP):
    contractions_pattern = re.compile('({})'.format('|'.join(contraction_mapping.keys())),
                                      flags=re.IGNORECASE | re.DOTALL)

    def expand_match(contraction):
        match = contraction.group(0)
        first_char = match[0]
        expanded_contraction = contraction_mapping.get(match)\
            if contraction_mapping.get(match)\
            else contraction_mapping.get(match.lower())
        expanded_contraction = first_char + expanded_contraction[1:]
        return expanded_contraction

    expanded_text = contractions_pattern.sub(expand_match, text)
    expanded_text = re.sub("'", "", expanded_text)
    return expanded_text

expand_contractions("Y'all can't expand contractions I'd think")


# ## Remove special characters

# In[6]:

def remove_special_characters(text, remove_digits=False):
    # note the uppercase Z: [a-zA-z] would also match [, \, ], ^, _ and `
    pattern = r'[^a-zA-Z0-9\s]' if not remove_digits else r'[^a-zA-Z\s]'
    text = re.sub(pattern, '', text)
    return text

remove_special_characters("Well this was fun! What do you think? 123#@!", remove_digits=True)


# ## Text lemmatization

# In[7]:

def lemmatize_text(text):
    text = nlp(text)
    text = ' '.join([word.lemma_ if word.lemma_ != '-PRON-' else word.text for word in text])
    return text

lemmatize_text("My system keeps crashing! his crashed yesterday, ours crashes daily")


# ## Text stemming

# In[8]:

def simple_stemmer(text):
    ps = nltk.porter.PorterStemmer()
    text = ' '.join([ps.stem(word) for word in text.split()])
    return text

simple_stemmer("My system keeps crashing his crashed yesterday, ours crashes daily")


# ## Remove stopwords

# In[9]:

def remove_stopwords(text, is_lower_case=False):
    tokens = tokenizer.tokenize(text)
    tokens = [token.strip() for token in tokens]
    if is_lower_case:
        filtered_tokens = [token for token in tokens if token not in stopword_list]
    else:
        filtered_tokens = [token for token in tokens if token.lower() not in stopword_list]
    filtered_text = ' '.join(filtered_tokens)
    return filtered_text

remove_stopwords("The, and, if are stopwords, computer is not")


# ## Building a text normalizer

# In[10]:

def normalize_corpus(corpus, html_stripping=True, contraction_expansion=True,
                     accented_char_removal=True, text_lower_case=True,
                     text_lemmatization=True, special_char_removal=True,
                     stopword_removal=True, remove_digits=True):

    normalized_corpus = []
    # normalize each document in the corpus
    for doc in corpus:
        # strip HTML
        if html_stripping:
            doc = strip_html_tags(doc)
        # remove accented characters
        if accented_char_removal:
            doc = remove_accented_chars(doc)
        # expand contractions
        if contraction_expansion:
            doc = expand_contractions(doc)
        # lowercase the text
        if text_lower_case:
            doc = doc.lower()
        # remove extra newlines (inside a character class, | is literal, so [\r\n]+ suffices)
        doc = re.sub(r'[\r\n]+', ' ', doc)
        # lemmatize text
        if text_lemmatization:
            doc = lemmatize_text(doc)
        # remove special characters and/or digits
        if special_char_removal:
            # insert spaces between special characters to isolate them
            special_char_pattern = re.compile(r'([{.(-)!}])')
            doc = special_char_pattern.sub(" \\1 ", doc)
            doc = remove_special_characters(doc, remove_digits=remove_digits)
        # remove extra whitespace
        doc = re.sub(' +', ' ', doc)
        # remove stopwords
        if stopword_removal:
            doc = remove_stopwords(doc, is_lower_case=text_lower_case)
        normalized_corpus.append(doc)

    return normalized_corpus
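
# A quick sanity check of the full pipeline on a made-up document
# (the sample string below is hypothetical, not part of the scraped data):

# In[ ]:

sample_docs = ["<p>Héllo! I'm testing the normalizer, it shouldn't keep HTML, accents or stopwords.</p>"]
normalize_corpus(sample_docs)
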
# ## Pre-process and normalize news articles

# In[16]:

news_df['full_text'] = news_df["news_headline"].map(str) + '. ' + news_df["news_article"]


# In[442]:

news_df['clean_text'] = normalize_corpus(news_df['full_text'])
norm_corpus = list(news_df['clean_text'])
news_df.iloc[1][['full_text', 'clean_text']].to_dict()


# # Save the news articles

# In[443]:

news_df.to_csv('news.csv', index=False, encoding='utf-8')


# # Tagging Parts of Speech

# In[11]:

news_df = pd.read_csv('news.csv')


# In[12]:

corpus = normalize_corpus(news_df['full_text'], text_lower_case=False,
                          text_lemmatization=False, special_char_removal=False)
sentence = str(news_df.iloc[1].news_headline)
sentence_nlp = nlp(sentence)


# In[22]:

spacy_pos_tagged = [(word, word.tag_, word.pos_) for word in sentence_nlp]
pd.DataFrame(spacy_pos_tagged, columns=['Word', 'POS tag', 'Tag type'])


# In[24]:

nltk_pos_tagged = nltk.pos_tag(sentence.split())
pd.DataFrame(nltk_pos_tagged, columns=['Word', 'POS tag'])


# # Shallow Parsing or Chunking Text

# In[132]:

from nltk.corpus import conll2000

data = conll2000.chunked_sents()
train_data = data[:10900]
test_data = data[10900:]
print(len(train_data), len(test_data))
print(train_data[1])


# In[133]:

from nltk.chunk.util import tree2conlltags, conlltags2tree

wtc = tree2conlltags(train_data[1])
wtc


# In[134]:

tree = conlltags2tree(wtc)
print(tree)


# In[135]:

def conll_tag_chunks(chunk_sents):
    tagged_sents = [tree2conlltags(tree) for tree in chunk_sents]
    return [[(t, c) for (w, t, c) in sent] for sent in tagged_sents]

def combined_tagger(train_data, taggers, backoff=None):
    for tagger in taggers:
        backoff = tagger(train_data, backoff=backoff)
    return backoff


# In[136]:

from nltk.tag import UnigramTagger, BigramTagger
from nltk.chunk import ChunkParserI

class NGramTagChunker(ChunkParserI):

    def __init__(self, train_sentences,
                 tagger_classes=[UnigramTagger, BigramTagger]):
        train_sent_tags = conll_tag_chunks(train_sentences)
        self.chunk_tagger = combined_tagger(train_sent_tags, tagger_classes)

    def parse(self, tagged_sentence):
        if not tagged_sentence:
            return None
        pos_tags = [tag for word, tag in tagged_sentence]
        chunk_pos_tags = self.chunk_tagger.tag(pos_tags)
        chunk_tags = [chunk_tag for (pos_tag, chunk_tag) in chunk_pos_tags]
        wpc_tags = [(word, pos_tag, chunk_tag) for ((word, pos_tag), chunk_tag)
                    in zip(tagged_sentence, chunk_tags)]
        return conlltags2tree(wpc_tags)


# In[137]:

ntc = NGramTagChunker(train_data)
print(ntc.evaluate(test_data))


# In[152]:

chunk_tree = ntc.parse(nltk_pos_tagged)
print(chunk_tree)


# In[153]:

from IPython.display import display
os.environ['PATH'] = os.environ['PATH'] + ";C:\\Program Files\\gs\\gs9.09\\bin\\"
display(chunk_tree)


# # Constituency parsing

# In[446]:

# set java path
import os
java_path = r'C:\Program Files\Java\jdk1.8.0_102\bin\java.exe'
os.environ['JAVAHOME'] = java_path

from nltk.parse.stanford import StanfordParser

scp = StanfordParser(path_to_jar='E:/stanford/stanford-parser-full-2015-04-20/stanford-parser.jar',
                     path_to_models_jar='E:/stanford/stanford-parser-full-2015-04-20/stanford-parser-3.5.2-models.jar')
result = list(scp.raw_parse(sentence))
print(result[0])


# In[447]:

from IPython.display import display
os.environ['PATH'] = os.environ['PATH'] + ";C:\\Program Files\\gs\\gs9.09\\bin\\"
display(result[0])


# # Dependency parsing

# In[448]:

dependency_pattern = '{left}<---{word}[{w_type}]--->{right}\n--------'
for token in sentence_nlp:
    print(dependency_pattern.format(word=token.orth_,
                                    w_type=token.dep_,
                                    left=[t.orth_ for t in token.lefts],
                                    right=[t.orth_ for t in token.rights]))
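
# The same dependency structure can also be walked programmatically; a short
# illustrative sketch using standard spaCy token attributes (dep_, children):

# In[ ]:

root = [token for token in sentence_nlp if token.dep_ == 'ROOT'][0]
print('root:', root.text)
print('children of root:', [(child.text, child.dep_) for child in root.children])
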
# In[449]:

from spacy import displacy

displacy.render(sentence_nlp, jupyter=True,
                options={'distance': 110,
                         'arrow_stroke': 2,
                         'arrow_width': 8})


# In[450]:

from nltk.parse.stanford import StanfordDependencyParser

sdp = StanfordDependencyParser(path_to_jar='E:/stanford/stanford-parser-full-2015-04-20/stanford-parser.jar',
                               path_to_models_jar='E:/stanford/stanford-parser-full-2015-04-20/stanford-parser-3.5.2-models.jar')
result = list(sdp.raw_parse(sentence))
dep_tree = [parse.tree() for parse in result][0]
print(dep_tree)


# In[451]:

from IPython.display import display
os.environ['PATH'] = os.environ['PATH'] + ";C:\\Program Files\\gs\\gs9.09\\bin\\"
display(dep_tree)


# In[452]:

from graphviz import Source

dep_tree_dot_repr = [parse for parse in result][0].to_dot()
source = Source(dep_tree_dot_repr, filename="dep_tree", format="png")
source


# # Named Entity Recognition

# In[453]:

sentence = str(news_df.iloc[1].full_text)
sentence_nlp = nlp(sentence)


# In[454]:

print([(word, word.ent_type_) for word in sentence_nlp if word.ent_type_])


# In[455]:

displacy.render(sentence_nlp, style='ent', jupyter=True)


# In[19]:

named_entities = []
for sentence in corpus:
    temp_entity_name = ''
    temp_named_entity = None
    sentence = nlp(sentence)
    for word in sentence:
        term = word.text
        tag = word.ent_type_
        if tag:
            temp_entity_name = ' '.join([temp_entity_name, term]).strip()
            temp_named_entity = (temp_entity_name, tag)
        else:
            if temp_named_entity:
                named_entities.append(temp_named_entity)
                temp_entity_name = ''
                temp_named_entity = None

entity_frame = pd.DataFrame(named_entities,
                            columns=['Entity Name', 'Entity Type'])


# In[24]:

top_entities = (entity_frame.groupby(by=['Entity Name', 'Entity Type'])
                            .size()
                            .sort_values(ascending=False)
                            .reset_index().rename(columns={0: 'Frequency'}))
top_entities.T.iloc[:, :15]


# In[26]:

top_entities = (entity_frame.groupby(by=['Entity Type'])
                            .size()
                            .sort_values(ascending=False)
                            .reset_index().rename(columns={0: 'Frequency'}))
top_entities.T.iloc[:, :15]


# In[27]:

from nltk.tag import StanfordNERTagger
import os

java_path = r'C:\Program Files\Java\jdk1.8.0_102\bin\java.exe'
os.environ['JAVAHOME'] = java_path

sn = StanfordNERTagger('E:/stanford/stanford-ner-2014-08-27/classifiers/english.all.3class.distsim.crf.ser.gz',
                       path_to_jar='E:/stanford/stanford-ner-2014-08-27/stanford-ner.jar')
ner_tagged_sentences = [sn.tag(sent.split()) for sent in corpus]


# In[28]:

named_entities = []
for sentence in ner_tagged_sentences:
    temp_entity_name = ''
    temp_named_entity = None
    for term, tag in sentence:
        if tag != 'O':
            temp_entity_name = ' '.join([temp_entity_name, term]).strip()
            temp_named_entity = (temp_entity_name, tag)
        else:
            if temp_named_entity:
                named_entities.append(temp_named_entity)
                temp_entity_name = ''
                temp_named_entity = None

#named_entities = list(set(named_entities))
entity_frame = pd.DataFrame(named_entities,
                            columns=['Entity Name', 'Entity Type'])


# In[30]:

top_entities = (entity_frame.groupby(by=['Entity Name', 'Entity Type'])
                            .size()
                            .sort_values(ascending=False)
                            .reset_index().rename(columns={0: 'Frequency'}))
top_entities.head(15)


# In[462]:

top_entities = (entity_frame.groupby(by=['Entity Type'])
                            .size()
                            .sort_values(ascending=False)
                            .reset_index().rename(columns={0: 'Frequency'}))
top_entities.head()


# # Emotion and Sentiment Analysis

# In[13]:

from afinn import Afinn

af = Afinn()


# In[14]:

sentiment_scores = [af.score(article) for article in corpus]
sentiment_category = ['positive' if score > 0
                      else 'negative' if score < 0
                      else 'neutral'
                      for score in sentiment_scores]
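
# To make the bucketing above concrete: Afinn.score() returns a signed sum of
# word valences, which is then mapped to positive (> 0), negative (< 0) or
# neutral (== 0). A small check on a made-up sentence:

# In[ ]:

print(af.score("The results were good but the wait was terrible"))
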
# In[15]:

df = pd.DataFrame([list(news_df['news_category']), sentiment_scores, sentiment_category]).T
df.columns = ['news_category', 'sentiment_score', 'sentiment_category']
df['sentiment_score'] = df.sentiment_score.astype('float')
df.groupby(by=['news_category']).describe()


# In[39]:

f, (ax1, ax2) = plt.subplots(1, 2, figsize=(14, 4))
sp = sns.stripplot(x='news_category', y="sentiment_score",
                   hue='news_category', data=df, ax=ax1)
bp = sns.boxplot(x='news_category', y="sentiment_score",
                 hue='news_category', data=df, palette="Set2", ax=ax2)
t = f.suptitle('Visualizing News Sentiment', fontsize=14)


# In[40]:

# factorplot was renamed to catplot in newer seaborn releases
fc = sns.factorplot(x="news_category", hue="sentiment_category",
                    data=df, kind="count",
                    palette={"negative": "#FE2020",
                             "positive": "#BADD07",
                             "neutral": "#68BFF5"})


# In[41]:

pos_idx = df[(df.news_category == 'technology') & (df.sentiment_score == 6)].index[0]
neg_idx = df[(df.news_category == 'technology') & (df.sentiment_score == -15)].index[0]

print('Most Negative Tech News Article:', news_df.iloc[neg_idx][['news_article']][0])
print()
print('Most Positive Tech News Article:', news_df.iloc[pos_idx][['news_article']][0])


# In[42]:

pos_idx = df[(df.news_category == 'world') & (df.sentiment_score == 16)].index[0]
neg_idx = df[(df.news_category == 'world') & (df.sentiment_score == -12)].index[0]

print('Most Negative World News Article:', news_df.iloc[neg_idx][['news_article']][0])
print()
print('Most Positive World News Article:', news_df.iloc[pos_idx][['news_article']][0])


# In[16]:

from textblob import TextBlob

sentiment_scores_tb = [round(TextBlob(article).sentiment.polarity, 3)
                       for article in news_df['clean_text']]
sentiment_category_tb = ['positive' if score > 0
                         else 'negative' if score < 0
                         else 'neutral'
                         for score in sentiment_scores_tb]


# In[17]:

df = pd.DataFrame([list(news_df['news_category']), sentiment_scores_tb, sentiment_category_tb]).T
df.columns = ['news_category', 'sentiment_score', 'sentiment_category']
df['sentiment_score'] = df.sentiment_score.astype('float')
df.groupby(by=['news_category']).describe()


# In[18]:

df.head()


# In[74]:

fc = sns.factorplot(x="news_category", hue="sentiment_category",
                    data=df, kind="count",
                    palette={"negative": "#FE2020",
                             "positive": "#BADD07",
                             "neutral": "#68BFF5"})


# In[75]:

pos_idx = df[(df.news_category == 'world') & (df.sentiment_score == 0.7)].index[0]
neg_idx = df[(df.news_category == 'world') & (df.sentiment_score == -0.296)].index[0]

print('Most Negative World News Article:', news_df.iloc[neg_idx][['news_article']][0])
print()
print('Most Positive World News Article:', news_df.iloc[pos_idx][['news_article']][0])


# In[20]:

import model_evaluation_utils as meu

meu.display_confusion_matrix_pretty(true_labels=sentiment_category,
                                    predicted_labels=sentiment_category_tb,
                                    classes=['negative', 'neutral', 'positive'])
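
# If the custom model_evaluation_utils helper is not available, a rough
# equivalent of the comparison above can be built with scikit-learn
# (an alternative sketch, not the notebook's original helper):

# In[ ]:

from sklearn.metrics import classification_report, confusion_matrix

labels = ['negative', 'neutral', 'positive']
print(confusion_matrix(sentiment_category, sentiment_category_tb, labels=labels))
print(classification_report(sentiment_category, sentiment_category_tb, labels=labels))
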