#!/usr/bin/env python
# coding: utf-8

# In[ ]:

# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in

import numpy as np  # linear algebra
import pandas as pd  # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory

import os
print(os.listdir("../input"))

# Any results you write to the current directory are saved as output.


# In[ ]:

a1 = pd.read_csv('../input/articles1.csv', index_col=0)
a2 = pd.read_csv('../input/articles2.csv', index_col=0)
a3 = pd.read_csv('../input/articles3.csv', index_col=0)


# In[ ]:

df = pd.concat([a1, a2, a3])


# In[ ]:

# save memory
del a1, a2, a3


# In[ ]:

df.head()


# In[ ]:

import matplotlib.pyplot as plt

plt.figure(figsize=(10, 7))
df.publication.value_counts().plot(kind='bar')


# In[ ]:

doc = df.loc[0, 'content']


# In[ ]:

import spacy
from spacy import displacy

nlp = spacy.load('en')


# In[ ]:

doc = nlp(doc)


# In[ ]:

displacy.render(doc, style='ent', jupyter=True)


# In[ ]:

from tqdm import tqdm, tqdm_notebook


# In[ ]:

nlp = spacy.load('en', disable=['parser', 'tagger', 'textcat'])


# In[ ]:

frames = []
for i in tqdm_notebook(range(1000)):
    doc = df.loc[i, 'content']
    text_id = df.loc[i, 'id']
    doc = nlp(doc)
    ents = [(e.text, e.start_char, e.end_char, e.label_)
            for e in doc.ents
            if len(e.text.strip(' -—')) > 0]
    frame = pd.DataFrame(ents)
    frame['id'] = text_id
    frames.append(frame)


# In[ ]:

npf = pd.concat(frames)


# In[ ]:

npf.head()


# In[ ]:

npf.columns = ['Text', 'Start', 'Stop', 'Type', 'id']


# In[ ]:

plt.figure(figsize=(10, 7))
npf.Type.value_counts().plot(kind='bar')


# In[ ]:

orgs = npf[npf.Type == 'ORG']


# In[ ]:

plt.figure(figsize=(10, 7))
orgs.Text.value_counts()[:15].plot(kind='bar')


# In[ ]:

orgs.groupby(['id', 'Text']).size()
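
# In[ ]:

# The counts above are per (article id, organisation) pair. As a minimal sketch of one way
# to relate the extracted entities back to the article metadata, the entity frame can be
# merged with df on the shared 'id' column to see which publications mention which
# organisations most often. The variable name org_mentions is purely illustrative.

org_mentions = orgs.merge(df[['id', 'publication']], on='id')
org_mentions.groupby(['publication', 'Text']).size().sort_values(ascending=False).head(15)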

# In[ ]:

# reload the full pipeline: the parser and tagger were disabled for the extraction loop above,
# but the dependency visualisation and noun chunks below need them
nlp = spacy.load('en')

doc = 'Google to buy Apple'
doc = nlp(doc)
displacy.render(doc, style='dep', jupyter=True, options={'distance': 120})


# In[ ]:

for chunk in doc.noun_chunks:
    print(chunk.text, '|', chunk.root.text, '|', chunk.root.dep_, '|', chunk.root.head.text)


# In[ ]:

for token in doc:
    print(token.text, '|', token.lemma_, '|', token.pos_, '|', token.tag_, '|',
          token.dep_, '|', token.shape_, '|', token.is_alpha, '|', token.is_stop)


# # Fine tuning SpaCy NER

# In[ ]:

import spacy
import random


# In[ ]:

TRAIN_DATA = [
    ('Who is Shaka Khan?', {
        'entities': [(7, 17, 'PERSON')]
    }),
    ('I like London and Berlin.', {
        'entities': [(7, 13, 'LOC'), (18, 24, 'LOC')]
    })
]


# In[ ]:

nlp = spacy.load('en')


# In[ ]:

# create the built-in pipeline components and add them to the pipeline
# nlp.create_pipe works for built-ins that are registered with spaCy
if 'ner' not in nlp.pipe_names:
    ner = nlp.create_pipe('ner')
    nlp.add_pipe(ner, last=True)
# otherwise, get it so we can add labels
else:
    ner = nlp.get_pipe('ner')


# In[ ]:

# add labels
for _, annotations in TRAIN_DATA:
    for ent in annotations.get('entities'):
        ner.add_label(ent[2])


# In[ ]:

n_iter = 5

# get names of other pipes to disable them during training
other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'ner']
with nlp.disable_pipes(*other_pipes):  # only train NER
    optimizer = nlp._optimizer
    if not nlp._optimizer:
        optimizer = nlp.begin_training()
    for itn in range(n_iter):
        random.shuffle(TRAIN_DATA)
        losses = {}
        for text, annotations in TRAIN_DATA:
            nlp.update(
                [text],          # batch of texts
                [annotations],   # batch of annotations
                drop=0.5,        # dropout - make it harder to memorise data
                sgd=optimizer,   # callable to update weights
                losses=losses)
        print(losses)


# In[ ]:

# test the trained model
for text, _ in TRAIN_DATA:
    doc = nlp(text)
    print('Entities', [(ent.text, ent.label_) for ent in doc.ents])
    print('Tokens', [(t.text, t.ent_type_, t.ent_iob) for t in doc])


# # Rule based matching

# In[ ]:

import spacy
from spacy.matcher import Matcher

nlp = spacy.load('en')
matcher = Matcher(nlp.vocab)  # the matcher is built from the pipeline's shared vocab


# In[ ]:

pattern = [{'LOWER': 'hello'}, {'IS_PUNCT': True}, {'LOWER': 'world'}]
matcher.add('HelloWorld', None, pattern)

doc = nlp(u'Hello, world! Hello world!')
matches = matcher(doc)


# In[ ]:

matches


# In[ ]:

doc[0:3]


# In[ ]:

df.title = df.title.fillna('')


# In[ ]:

np.where(df.content.str.contains('iPhone'))


# In[ ]:

df.loc[14]


# In[ ]:

import spacy
from spacy.matcher import Matcher

nlp = spacy.load('en')
matcher = Matcher(nlp.vocab)


# In[ ]:

# Get the hash of the word 'PRODUCT'. This is required to set an entity.
PRODUCT = nlp.vocab.strings['PRODUCT']

def add_product_ent(matcher, doc, i, matches):
    # Get the current match and create a tuple of entity label, start and end.
    # Append the entity to the doc's entities. (Don't overwrite doc.ents!)
    match_id, start, end = matches[i]
    doc.ents += ((PRODUCT, start, end),)

pattern1 = [{'LOWER': 'iphone'}]
pattern2 = [{'ORTH': 'iPhone'}, {'IS_DIGIT': True}]

matcher.add('iPhone', add_product_ent, pattern1, pattern2)


# In[ ]:

doc = nlp(df.content.iloc[14])  # article 14 was found above to mention the iPhone
matches = matcher(doc)


# In[ ]:

displacy.render(doc, style='ent', jupyter=True)


# In[ ]:

def matcher_component(doc):
    # running the matcher fires add_product_ent, which writes the PRODUCT entities onto doc.ents
    matches = matcher(doc)
    return doc


# In[ ]:

nlp.add_pipe(matcher_component, last=True)


# In[ ]:

doc = nlp(df.content.iloc[14])
#matcher(doc)


# In[ ]:

displacy.render(doc, style='ent', jupyter=True)


# # Regex

# In[ ]:

import re


# In[ ]:

pattern = 'NL[0-9]{9}B[0-9]{2}'


# In[ ]:

my_string = 'ING Bank N.V. BTW:NL003028112B01'


# In[ ]:

re.findall(pattern, my_string)


# In[ ]:

match = re.search(pattern, my_string)


# In[ ]:

match.span()
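
# In[ ]:

# A regex hit only gives character offsets into the raw text. As a minimal sketch of one way
# to turn such a match into a spaCy entity with the spaCy 2.x API used above: doc.char_span
# maps character offsets back onto token boundaries (returning None if they don't line up),
# and the resulting span can be written to doc.ents. The 'MISC' label is an arbitrary choice
# for illustration.

doc = nlp(my_string)
for m in re.finditer(pattern, doc.text):
    span = doc.char_span(m.start(), m.end(), label='MISC')
    if span is not None:
        # drop any model entities overlapping the regex span, then add the span itself
        doc.ents = [e for e in doc.ents if e.end <= span.start or e.start >= span.end] + [span]

print([(ent.text, ent.label_) for ent in doc.ents])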