#!/usr/bin/env python
# coding: utf-8

# # Regular Expressions

# In[1]:


# Build the notebook table of contents via the addutils helper
# (purpose presumed from the helper's name -- confirm against addutils).
import addutils.toc
addutils.toc.js(ipy_notebook=True)


# In[2]:


# Apply the addutils notebook stylesheet
# (effect presumed from the helper's name -- confirm against addutils).
from addutils import css_notebook
css_notebook()


# ## 1 Finding wanted words and pieces of information (in a complex text)

# This notebook is about the task of searching and managing substrings (matches) of a string. This is useful to extract pieces of information from a text, for example when parsing dates, urls, e-mails, data lists, configuration files or programming scripts. Python offers some string methods for managing the simplest requirements, but the most powerful solution is offered by a language-independent pattern matching standard: [regular expressions](http://en.wikipedia.org/wiki/Regular_expression).  
# Regular expressions are a sort of very specialized programming language made of special text strings (meta-characters) designed for describing a search pattern. Python also has some packages suitable for managing regular expressions, such as re, the regular expression module contained in the Python standard distribution, or pyregex, a new external package under development (not treated in this notebook).  
#   
#   

# ## 2 Python easy solutions for simple problems

# ### 2.1 Some of the built-in python string functions may solve some of the easiest tasks:

# #### 2.1.1 find

# One of the most common requirements is to find a given word, or set of characters/numbers, in a text. The find method returns the index of the first character of the match, if a match is found; it returns -1 if not found.

# In[3]:


string = "this is string example....wow!!!"

# One substring that is present and one that is not: find() gives the
# starting index of the former and -1 for the latter.
part, part2 = "wow!!!", "strong"
for needle in (part, part2):
    print(string.find(needle))


# #### 2.1.2 strip, lstrip, rstrip

# Other functions help to clean and extract only useful information 

# In[4]:


string = "0000000this is string example....wow!!!0000000"

# Trim the '0' padding from both ends, then only the left, then only the right.
for trim in (string.strip, string.lstrip, string.rstrip):
    print(trim('0'))


# #### 2.1.3 replace

# In[5]:


string = "this is string example....wow!!!"

# replace() returns a new string with every occurrence of `old` swapped for `new`.
old, new = 'string', 'good'
spl = string.replace(old, new)
spl


# #### 2.1.4 functions for identifying the type of character

# Python offers a series of methods (and even simple idiomatic expressions using basic operators) that return True or False, such as **isalnum** (checking for alphanumeric), **isalpha** (only alphabetic), **isdigit** (numbers), **isspace** (whitespace), **islower** (lowercase), **isupper** (uppercase), **istitle** (titlecase, if all words in a string start with uppercase), **startswith**, **endswith**. 

# In[6]:


# The `in` operator tests whether the left string occurs inside the right one
# and evaluates to True or False.
"a" in 'xyxxyabcxyzzy'


# In[7]:


string = "this is string example....wow!!!"

# Each tuple holds the startswith() arguments: (prefix,) or (prefix, start).
# With start=8 the comparison begins exactly where the word 'string' begins.
for args in (('this',), ('is',), ('string', 8)):
    print(string.startswith(*args))


# In[8]:


string = 'this'

# isalpha() is True only when the string is non-empty and every character is a letter.
string.isalpha()


# In[9]:


string = 'this '  # the trailing blank is not a letter, so the check fails

only_letters = string.isalpha()
print(only_letters)


# ***Try by yourself*** &nbsp;the other methods:
# 
#     string = 'this'
#     print(string.isupper())
#     print(string.islower())
#     print(string.istitle())
#     print(string.isalnum())
#     print(string.isspace())
#     print(string.isdigit())
# 
# Try also by modifying the string:   
# 
#     mod = string.upper()
#     mod = string.title()
#     print(mod.isupper())

# In[ ]:


# #### 2.1.5 a slightly more complex example

# Let's see how we could clean a string containing some useless elements, using python built-in methods:

# In[10]:


string = "this 44444is a99999 dirty 678435 string xxxxxxexample....wow000000!!!"

# Break the text into whitespace-separated tokens.
spl = string.split()
print(spl)


# In[11]:


# Clean every token: drop leading 'x' padding where present, then remove digits.
ls = []
for item in spl:
    if 'xxx' in item:
        item = item.lstrip('x')
    result = ''.join(e for e in item if not e.isdigit())
    if result:                                             # needed to exclude empty strings
        ls.append(result)
print('The temporary cleaned list looks like this: ', ls)


# Get back to a string again, after complete cleaning and slight modifying:

# In[12]:


string = ' '.join(ls)
# The second replace must be chained on `final`; calling it on `string` again
# would throw away the '....' -> ', ' substitution made on the line before.
final = string.replace('....', ', ')
final = final.replace('dirty', 'clean')
print(final)


# > For simple string management python built-in methods are enough,
# > but when we are dealing with more complex tasks,
# > regular expressions are the best solution for pattern matching. 

# ## 2.2 The Power of Regular Expressions

# A regular expression (regex or regexp for short) is a special text string for describing a search pattern. Regular expressions may be used for retrieving parts of longer strings matching some desired criteria. Dealing with regular expressions may seem complex at the beginning, since they are made of both regular and special characters concatenated in a sequence, hard to understand at first sight. But once they are fully assimilated, they become a powerful helper while parsing any kind of text. 
# The most basic regular expressions are single literal characters, for example "a" will look for all occurrences of "a" in a text. But there are some special characters, also called meta-characters, which, combined with regular characters and concatenated, build the regular expression search patterns.
# The meta-characters used by regular expressions are: 
#   
# 
# <center>. ^  $  *  +  ?  {  [  ]  \  |  (  )</center>
#   
# 
# The following link refers to a list of regular expressions, and the description of their use:

# In[13]:


from IPython.display import HTML
# Markup for an inline frame pointing at the LibreOffice regular-expression list;
# the HTML object is the cell's last expression so the notebook renders it.
libreoffice_iframe = '<iframe src=https://help.libreoffice.org/Common/List_of_Regular_Expressions width=700 height=250>'
HTML(libreoffice_iframe)


# Online there are many tools suitable for testing the effectiveness of a regular expression, such as:

# In[14]:


# Markup for an inline frame pointing at the pythex online regex tester;
# the HTML object is the cell's last expression so the notebook renders it.
pythex_iframe = '<iframe src=http://pythex.org// width=700 height=250>'
HTML(pythex_iframe)


# ## 2.3 Python re module

# Python [re module](https://docs.python.org/3/library/re.html) is the python standard distribution module for regular expressions. It offers some methods for compiling regular expressions to pattern objects, used to search, manage and return the expected matches.

# In[15]:


import re


# re.match() is suitable to find a match at the beginning of a string.

# In[16]:


line = "Cats are smarter than dogs"

# Group 1 greedily takes everything before " are "; group 2 lazily takes the
# single word that follows it. re.match() only tries at the start of `line`.
matchObj = re.match(r'(.*) are (.*?) .*', line, re.M | re.I)
if matchObj is None:
    print("No match!!")
else:
    print("matchObj.group() : ", matchObj.group())
    print("matchObj.group(1) : ", matchObj.group(1))
    print("matchObj.group(2) : ", matchObj.group(2))


# re.search() is similar, but it finds a match anywhere inside a text.

# In[17]:


string = 'purple alice-b@google.com monkey dishwasher'

# The pattern starts at the '@' and then takes word characters and dots,
# so only the host part (with its '@') is captured.
host_pattern = r'@[\w.]+'
match = re.search(host_pattern, string)
if match is not None:
    print(match.group())  ## '@google.com'


# In[18]:


# Allowing word characters, dots and dashes on both sides of the '@'
# captures the complete address.
address_pattern = r'[\w.-]+@[\w.-]+'
match = re.search(address_pattern, string)
if match is not None:
    print(match.group())  ## 'alice-b@google.com'


# In[19]:


string = 'purple alice-b@google.com monkey dishwasher'

# The pattern is a raw string: in a plain literal '\w' is an invalid escape
# sequence, which Python reports with a warning and plans to reject.
# The two parenthesised groups capture the username and the host separately.
match = re.search(r'([\w.-]+)@([\w.-]+)', string)
if match:
    print(match.group())   ## 'alice-b@google.com' (the whole match)
    print(match.group(1))  ## 'alice-b' (the username, group 1)
    print(match.group(2))  ## 'google.com' (the host, group 2)


# re.findall() finds all the matches for a given regular expression.

# In[20]:


## Suppose we have a text with many email addresses
string = 'purple alice@google.com, blah monkey bob@abc.com blah dishwasher'

## re.findall() collects every match of the pattern into a list of strings
email_pattern = r'[\w\.-]+@[\w\.-]+'
emails = re.findall(email_pattern, string)  ## ['alice@google.com', 'bob@abc.com']
for email in emails:
    # do something with each found email string
    print(email)


# In[21]:


string = 'purple alice@google.com, blah monkey bob@abc.com blah dishwasher'

# With two capture groups, findall() returns one (username, host) tuple per match.
tuples = re.findall(r'([\w\.-]+)@([\w\.-]+)', string)
print(tuples)  ## [('alice', 'google.com'), ('bob', 'abc.com')]
for username, host in tuples:
    print(username)
    print(host)


# re.sub() is suitable for replacing occurrences of the regex pattern with a given substitute.

# In[22]:


phone = "2004-959-559 # This is Phone Number"

# Delete Python-style comments: everything from '#' to the end of the string
comment_pattern = r'#.*$'
num = re.sub(comment_pattern, "", phone)
print("Phone Num : ", num)

# Remove anything other than digits (\D matches any non-digit character)
non_digit_pattern = r'\D'
num = re.sub(non_digit_pattern, "", phone)
print("Phone Num : ", num)


# Opening and looking for matches in a .txt file, in this case finding all words preceded or followed by a - symbol.

# In[23]:


# Open file
import os.path
path = os.path.join(os.path.curdir, "example_data", "small_is_beautiful.txt")
# The context manager closes the file even if reading it raises.
with open(path, 'r') as f:
    text = f.read()
# Feed the file text into findall(); it returns a list of all the found strings
strings = re.findall(r'(\w*-\w*)', text)
print("Matches are: ", strings)
# Follows the text parsed by the findall method:
print('text is: \n', text)


# ---
# 
# Visit [www.add-for.com](<http://www.add-for.com/IT>) for more tutorials and updates.
# 
# This work is licensed under a <a rel="license" href="http://creativecommons.org/licenses/by-sa/4.0/">Creative Commons Attribution-ShareAlike 4.0 International License</a>.