In [ ]:

# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load:

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory

import os
# Show which files are present in the input directory.
input_files = os.listdir("../input")
print(input_files)

# Any results you write to the current directory are saved as output.

In [ ]:

# Load the three article CSV files; the first CSV column becomes the index.
article_paths = (
    '../input/articles1.csv',
    '../input/articles2.csv',
    '../input/articles3.csv',
)
a1, a2, a3 = (pd.read_csv(path, index_col=0) for path in article_paths)

In [ ]:

# Stack the three article tables row-wise into one DataFrame.
# NOTE(review): the original index labels are kept (ignore_index is not
# set), so whether labels are unique depends on the CSV files — confirm
# before relying on df.loc[label].
df = pd.concat([a1,a2,a3])

In [ ]:

# Save memory: drop the names of the per-file frames now that they have been
# combined into df.
del a1, a2, a3

In [ ]:

# Preview the first rows of the combined article table.
df.head()

In [ ]:

import matplotlib.pyplot as plt

# Bar chart of the number of rows per publication.
plt.figure(figsize=(10, 7))
publication_counts = df.publication.value_counts()
publication_counts.plot(kind='bar')

In [ ]:

# Take the 'content' value of the row labelled 0 as a sample document.
# NOTE(review): assumes label 0 is unique in df so that this yields a single
# string rather than a Series — confirm.
doc = df.loc[0,'content']

In [ ]:

import spacy
from spacy import displacy
# Load the model registered under the 'en' shortcut, with no pipeline
# components disabled.
nlp = spacy.load('en')

In [ ]:

# Run the pipeline on the sample text; doc is rebound from the raw string to
# the pipeline's output.
doc = nlp(doc)

In [ ]:

# Render the document with displaCy's entity ('ent') style inside the notebook.
displacy.render(doc,style='ent',jupyter=True)

In [ ]:

from tqdm import tqdm, tqdm_notebook

In [ ]:

# Reload the model without the components named below; the entity
# extraction loop that follows only reads doc.ents.
disabled_components = ['parser', 'tagger', 'textcat']
nlp = spacy.load('en', disable=disabled_components)

In [ ]:

# Run the pipeline over the rows labelled 0..999 and collect one DataFrame of
# entity mentions per article.
# NOTE(review): df.loc[i, ...] is label-based; this assumes labels 0..999
# exist and are unique in df — confirm.
frames = []
for i in tqdm_notebook(range(1000)):
    text = df.loc[i, 'content']
    text_id = df.loc[i, 'id']
    doc = nlp(text)
    # Keep only entities that still have text after stripping spaces and
    # dashes from both ends.
    ents = [(e.text, e.start_char, e.end_char, e.label_)
            for e in doc.ents
            if e.text.strip(' -—')]
    # Name the columns explicitly so that an article without any entity still
    # produces a frame with the same columns, in the same order, as every
    # other article. With unnamed columns such a frame would contain only
    # 'id', and the column order of the concatenated table would depend on
    # which article happens to come first.
    frame = pd.DataFrame(ents, columns=['Text', 'Start', 'Stop', 'Type'])
    frame['id'] = text_id
    frames.append(frame)

In [ ]:

# Combine the per-article entity frames into a single table.
npf = pd.concat(frames)

In [ ]:

# Preview the first rows of the entity table.
npf.head()

In [ ]:

# Name the columns: entity text, start and end character offsets, entity
# label, and the id of the article the entity came from.
# NOTE(review): this is a positional rename, so it relies on npf having
# exactly these five columns in this order.
npf.columns = ['Text','Start','Stop','Type','id']

In [ ]:

# Bar chart of how often each entity type occurs.
plt.figure(figsize=(10, 7))
entity_type_counts = npf.Type.value_counts()
entity_type_counts.plot(kind='bar')

In [ ]:

# Keep only the entity mentions labelled as organisations.
is_org = npf['Type'] == 'ORG'
orgs = npf[is_org]

In [ ]:

# Bar chart of the 15 most frequent organisation texts.
plt.figure(figsize=(10, 7))
top_org_counts = orgs.Text.value_counts()[:15]
top_org_counts.plot(kind='bar')

In [ ]:

# Count how many times each organisation text occurs within each article id.
orgs.groupby(['id','Text']).size()

In [ ]:

# Demonstrate the dependency parse on a short sentence.
# The pipeline loaded earlier in this notebook has 'parser' and 'tagger'
# disabled. Reload the full model here, because the 'dep' rendering below and
# the noun-chunk / token-attribute cells that follow read parser and tagger
# output (dep_, pos_, tag_, noun_chunks).
nlp = spacy.load('en')
doc = nlp('Google to buy Apple')
displacy.render(doc, style='dep', jupyter=True, options={'distance': 120})

In [ ]:

# For every noun chunk print: chunk text, root token text, the root's
# dependency label, and the text of the root's head token.
for chunk in doc.noun_chunks:
    root = chunk.root
    print(chunk.text, root.text, root.dep_, root.head.text, sep=' | ')

In [ ]:

# For every token print its text, lemma, coarse and fine part-of-speech tags,
# dependency label, shape, and the is_alpha / is_stop flags.
for token in doc:
    fields = (token.text, token.lemma_, token.pos_, token.tag_, token.dep_,
              token.shape_, token.is_alpha, token.is_stop)
    print(*fields, sep=' | ')

Fine-tuning spaCy NER

In [ ]:

import spacy
import random

In [ ]:

# Training examples as (text, annotations) pairs. Each annotation dict holds
# an 'entities' list of (start_char, end_char, label) spans into the text.
TRAIN_DATA = [
    (text, {'entities': spans})
    for text, spans in (
        ('Who is Shaka Khan?', [(7, 17, 'PERSON')]),
        ('I like London and Berlin.', [(7, 13, 'LOC'), (18, 24, 'LOC')]),
    )
]

In [ ]:

# Reload the model with no components disabled before fine-tuning.
nlp = spacy.load('en')

In [ ]:

# Fetch the pipeline's NER component so labels can be added to it. If the
# pipeline has none, create the built-in one (nlp.create_pipe works for
# built-ins that are registered with spaCy) and append it as the last pipe.
if 'ner' in nlp.pipe_names:
    ner = nlp.get_pipe('ner')
else:
    ner = nlp.create_pipe('ner')
    nlp.add_pipe(ner, last=True)

In [ ]:

# Register every entity label that occurs in the training data with the
# NER component.
for _, annotations in TRAIN_DATA:
    for _start, _end, label in annotations.get('entities'):
        ner.add_label(label)

In [ ]:

n_iter = 5

# Only the NER component is trained: every other pipe is disabled for the
# duration of the with-block.
other_pipes = [name for name in nlp.pipe_names if name != 'ner']

with nlp.disable_pipes(*other_pipes):
    # Reuse the optimizer already stored on the pipeline when there is one;
    # otherwise obtain one from begin_training().
    # NOTE(review): _optimizer is a private attribute, and begin_training()
    # may do more than create an optimizer — confirm against the installed
    # spaCy version.
    optimizer = nlp._optimizer or nlp.begin_training()

    for itn in range(n_iter):
        random.shuffle(TRAIN_DATA)  # reorders the list in place on each pass
        losses = {}
        for text, annotations in TRAIN_DATA:
            # One update per example: a batch of one text and one annotation.
            nlp.update(
                [text],
                [annotations],
                drop=0.5,  # dropout - make it harder to memorise data
                sgd=optimizer,  # callable to update weights
                losses=losses,
            )
        print(losses)

In [ ]:

# Run the fine-tuned pipeline on the training texts and print, per text, the
# recognised entities and each token's entity type and IOB code.
for text, _ in TRAIN_DATA:
    doc = nlp(text)
    entities = [(ent.text, ent.label_) for ent in doc.ents]
    token_tags = [(t.text, t.ent_type_, t.ent_iob) for t in doc]
    print('Entities', entities)
    print('Tokens', token_tags)

Rule-based matching

In [ ]:

import spacy
from spacy.matcher import Matcher

# Reload the model with no components disabled for the matching examples.
nlp = spacy.load('en')

In [ ]:

# No matcher has been created in the cells above, so build one here on the
# vocabulary of the pipeline whose documents it will be run on.
matcher = Matcher(nlp.vocab)

# Three consecutive tokens: "hello", any punctuation token, "world"
# (both words are compared on their lower-cased form).
pattern = [{'LOWER': 'hello'}, {'IS_PUNCT': True}, {'LOWER': 'world'}]

# Arguments are: match key, on-match callback (none here), then the pattern.
matcher.add('HelloWorld', None, pattern)

doc = nlp(u'Hello, world! Hello world!')
matches = matcher(doc)

In [ ]:

# Display the matches; each entry unpacks as (match_id, start, end).
matches

In [ ]:

# Show the span made of the first three tokens of doc.
doc[0:3]

In [ ]:

# Replace missing titles with empty strings.
df['title'] = df['title'].fillna('')

In [ ]:

# Row positions (0-based, not index labels) of the articles whose content
# mentions "iPhone". na=False makes rows with missing content count as
# non-matches; without it the mask would contain NaN for those rows, which
# np.where treats as true.
np.where(df.content.str.contains('iPhone', na=False))

In [ ]:

# Inspect one article by row position. np.where in the previous cell reports
# positions rather than index labels, so select with iloc; this also matches
# the df.content.iloc[14] lookup used further down.
# NOTE(review): 14 is presumably one of the positions listed by that cell.
df.iloc[14]

In [ ]:

import spacy
from spacy.matcher import Matcher

# Fresh pipeline plus a new, empty matcher built on that pipeline's vocabulary.
nlp = spacy.load('en')
matcher = Matcher(nlp.vocab)

In [ ]:

# Look up the string 'PRODUCT' in the vocabulary's string store. The result
# is used as the entity label when a match is turned into an entity.
PRODUCT = nlp.vocab.strings['PRODUCT']

def add_product_ent(matcher, doc, i, matches):
    """Matcher callback: record the i-th match as a PRODUCT entity on doc.

    Args:
        matcher: The matcher that produced the matches (not used here).
        doc: The document being matched; its ``ents`` are extended.
        i: Index of the current match within ``matches``.
        matches: Sequence of ``(match_id, start, end)`` tuples.
    """
    _match_id, start, end = matches[i]
    product_entity = (PRODUCT, start, end)
    # Extend the existing entities instead of replacing them.
    doc.ents += (product_entity,)

# pattern1: a single token whose lower-cased form is "iphone".
# pattern2: the exact token "iPhone" followed by a token made of digits.
pattern1 = [{'LOWER': 'iphone'}]
pattern2 = [{'ORTH': 'iPhone'}, {'IS_DIGIT': True}]

# Register both patterns under one key; add_product_ent is the on-match callback.
matcher.add('iPhone', add_product_ent,pattern1, pattern2)

In [ ]:

# Run the product matcher over doc.
# NOTE(review): when the notebook is run top to bottom, doc is still the
# 'Hello, world! Hello world!' document from the earlier cell — confirm
# which document was intended here.
matches = matcher(doc)

In [ ]:

# Render doc's entities, including any added by the matcher callback.
displacy.render(doc,style='ent',jupyter=True)

In [ ]:

def matcher_component(doc):
    """Pipeline component that runs the module-level matcher over doc.

    The matches themselves are not needed here; the call is made so that the
    matcher's registered on-match callback is invoked for each match.

    Args:
        doc: The document handed over by the pipeline.

    Returns:
        The same document, so the pipeline can pass it on.
    """
    matcher(doc)
    return doc

In [ ]:

# Append the matcher component as the last step of the pipeline.
nlp.add_pipe(matcher_component,last=True)

In [ ]:

# Process the article at row position 14. The matcher component was added to
# the pipeline above, so it runs as part of nlp() and no separate
# matcher(doc) call is needed.
doc = nlp(df.content.iloc[14])

In [ ]:

# Render the entities of the processed article.
displacy.render(doc,style='ent',jupyter=True)

Regex

In [ ]:

import re

In [ ]:

# "NL", nine digits, "B", two digits.
# NOTE(review): presumably the Dutch VAT (BTW) number format — confirm.
pattern = r'NL[0-9]{9}B[0-9]{2}'

In [ ]:

# Sample text containing one number that fits the pattern.
my_string = 'ING Bank N.V. BTW:NL003028112B01'

In [ ]:

# List every non-overlapping occurrence of the pattern in the sample text.
re.findall(pattern,my_string)

In [ ]:

# Find the first occurrence as a match object (None when nothing matches).
match = re.search(pattern,my_string)

In [ ]:

# (start, end) character offsets of the match within my_string.
match.span()

In [ ]: