import pandas as pd
import numpy as np
import re
val = 'a, b, guido , bajo'
print(val)
a, b, guido , bajo
# split the string on ',' and strip the surrounding whitespace from each piece
val2 = [x.strip() for x in val.split(',')]
val2
['a', 'b', 'guido', 'bajo']
# tuple assignment
first, second, third, four = val2
first + "::" + second + "::" + third
'a::b::guido'
# practical method is join
"::".join(val2)
'a::b::guido::bajo'
# checking if guido is in val2
'guido' in val2
True
# searching in string
print("index",val.index(','))
print("find",val.find(','))
index 1 find 1
# find and index return the same position when the substring is present;
# they differ only when it is absent, as with ':' here.
print("find",val.find(':')) # find returns -1 when the substring is not found
# print("index",val.index(':')) # index raises ValueError where find returns -1
find -1
# get string counts
# NOTE: "\n" and "a --" below are adjacent string literals, so Python joins them into
# the single argument "\na --"; print then separates its four arguments with spaces.
print(", -- ", val.count(','),"\n"
"a --", val.count('a'))
, -- 3 a -- 2
# replace will substitute occurrences of one pattern for another. This is commonly used
# to delete patterns, too, by passing an empty string:
val.replace(',', '::')
'a:: b:: guido :: bajo'
val.replace(',', '')
'a b guido bajo'
count Return the number of non-overlapping occurrences of substring in the string.
endswith, startswith Returns True if string ends with suffix (starts with prefix).
join Use string as delimiter for concatenating a sequence of other strings.
index Return position of first character in substring if found in the string. Raises ValueError if not found.
find Return position of first character of first occurrence of substring in the string. Like index, but returns -1 if not found.
rfind Return position of first character of last occurrence of substring in the string. Returns -1 if not found.
replace Replace occurrences of string with another string.
strip, rstrip, lstrip Trim whitespace, including newlines, from both ends (strip), the right end (rstrip), or the left end (lstrip) of the string.
split Break string into list of substrings using passed delimiter.
lower, upper Convert alphabet characters to lowercase or uppercase, respectively.
ljust, rjust Left justify or right justify, respectively. Pad opposite side of string with spaces (or some other fill character) to return a string with a minimum width
import re
text = "foo bar\t baz \tqux"
text
'foo bar\t baz \tqux'
# Split on runs of one or more whitespace characters.
# The pattern is a raw string: '\s' in an ordinary string literal is an invalid
# escape sequence (DeprecationWarning since Python 3.6, SyntaxWarning since 3.12).
re.split(r'\s+', text)
['foo', 'bar', 'baz', 'qux']
# compiled version of the regex: compile once, then reuse the pattern object.
# The pattern is a raw string so that '\s' reaches the regex engine verbatim instead
# of being treated as an (invalid) string escape sequence.
regex = re.compile(r'\s+')
regex.split(text)
['foo', 'bar', 'baz', 'qux']
# to get a list of all patterns matching the regex, you can use the findall method:
regex.findall(text)
[' ', '\t ', ' \t']
# findall yields every match in the string, search stops at the first one, and
# match additionally requires the pattern to begin at the very start of the string.
text = (
    "Dave dave@google.com\n"
    "Steve steve@gmail.com\n"
    "Rob rob@gmail.com\n"
    "Ryan ryan@yahoo.com\n"
)
pattern = r'[A-Z0-9._%+-]+@[A-Z0-9.-]+\.[A-Z]{2,4}'
# Compile case-insensitively (re.I is the short alias of re.IGNORECASE)
regex = re.compile(pattern, re.I)
regex.findall(text)
['dave@google.com', 'steve@gmail.com', 'rob@gmail.com', 'ryan@yahoo.com']
# search returns a special match object for the first email address in the text. For the
# above regex, the match object can only tell us the start and end position of the pattern
# in the string:
m = regex.search(text)
m
<_sre.SRE_Match object; span=(5, 20), match='dave@google.com'>
text[m.start():m.end()]
'dave@google.com'
# regex.match returns None, as it only will match if the pattern occurs at the start of the string:
print(regex.match(text))
None
# sub will return a new string with occurrences of the pattern replaced by a new string:
print (regex.sub('REDACTED', text))
Dave REDACTED Steve REDACTED Rob REDACTED Ryan REDACTED
# Wrapping parts of the pattern in parentheses creates capture groups, so one match
# both finds an address and splits it into username, domain name, and domain suffix.
pattern = r'([A-Z0-9._%+-]+)@([A-Z0-9.-]+)\.([A-Z]{2,4})'
regex = re.compile(pattern, re.I)  # re.I is the short alias of re.IGNORECASE
# groups() on the resulting match object returns the captured pieces as a tuple
m = regex.match('wesm@bright.net')
m.groups()
('wesm', 'bright', 'net')
# findall returns a list of tuples when the pattern has groups:
regex.findall(text)
[('dave', 'google', 'com'), ('steve', 'gmail', 'com'), ('rob', 'gmail', 'com'), ('ryan', 'yahoo', 'com')]
# sub also has access to groups in each match using special symbols like \1, \2, etc.:
print (regex.sub(r'Username: \1, Domain: \2, Suffix: \3', text))
Dave Username: dave, Domain: google, Suffix: com Steve Username: steve, Domain: gmail, Suffix: com Rob Username: rob, Domain: gmail, Suffix: com Ryan Username: ryan, Domain: yahoo, Suffix: com
findall, finditer Return all non-overlapping matching patterns in a string. findall returns a list of all patterns while finditer returns them one by one from an iterator.
match Match pattern at start of string and optionally segment pattern components into groups. If the pattern matches, returns a match object, otherwise None.
search Scan string for match to pattern; returning a match object if so. Unlike match, the match can be anywhere in the string as opposed to only at the beginning.
split Break string into pieces at each occurrence of pattern.
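sub, subn Replace occurrences of pattern in string with replacement expression. sub returns the new string; subn returns a tuple (new_string, number_of_substitutions). Pass count=n to either one to replace only the first n occurrences. Use symbols \1, \2, ... to refer to match group elements in the replacement string.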
# Build the Series straight from the name -> email mapping; 'Wes' has no address,
# so that entry is NaN.
data = pd.Series({
    'Dave': 'dave@google.com',
    'Steve': 'steve@gmail.com',
    'Rob': 'rob@gmail.com',
    'Wes': np.nan,
})
data
Dave dave@google.com Rob rob@gmail.com Steve steve@gmail.com Wes NaN dtype: object
# String and regular expression methods can be applied (passing a lambda or other function)
# to each value using data.map, but it will fail on the NA. To cope with this, Series
# has concise methods for string operations that skip NA values. These are accessed
# through Series’s str attribute; for example, we could check whether each email address
# has 'gmail' in it with str.contains:
data.str.contains('gmail')
Dave False Rob True Steve True Wes NaN dtype: object
pattern = '([A-Z0-9._%+-]+)@([A-Z0-9.-]+)\\.([A-Z]{2,4})'
data.str.findall(pattern, flags=re.IGNORECASE)
Dave [(dave, google, com)] Rob [(rob, gmail, com)] Steve [(steve, gmail, com)] Wes NaN dtype: object
# There are a couple of ways to do vectorized element retrieval. Either use str.get or
# index into the str attribute.
# Series.str.match returns a boolean Series in current pandas (older versions emitted
# a FutureWarning announcing this), so the captured groups cannot be read from it.
# Take the first findall hit per element instead: this yields one tuple of groups per
# row and leaves NA values as NA. Unlike match, findall is not anchored to the start of
# the string; here each value is a bare address, so the first hit is the same match.
matches = data.str.findall(pattern, flags=re.IGNORECASE).str[0]
matches
C:\tools\Anaconda3\lib\site-packages\ipykernel\__main__.py:4: FutureWarning: In future versions of pandas, match will change to always return a bool indexer.
Dave (dave, google, com) Rob (rob, gmail, com) Steve (steve, gmail, com) Wes NaN dtype: object
matches.str.get(1)
Dave google Rob gmail Steve gmail Wes NaN dtype: object
matches.str[0]
Dave dave Rob rob Steve steve Wes NaN dtype: object
data.str[:5]
Dave dave@ Rob rob@g Steve steve Wes NaN dtype: object
cat Concatenate strings element-wise with optional delimiter
contains Return boolean array indicating whether each string contains pattern/regex
count Count occurrences of pattern
endswith, startswith Equivalent to x.endswith(pattern) or x.startswith(pattern) for each element.
findall Compute list of all occurrences of pattern/regex for each string
get Index into each element (retrieve i-th element)
join Join strings in each element of the Series with passed separator
len Compute length of each string
lower, upper Convert cases; equivalent to x.lower() or x.upper() for each element.
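match Use re.match with the passed regular expression on each element, returning a boolean Series indicating whether each element matches (older pandas versions returned the matched groups; use extract or findall to retrieve groups).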
pad Add whitespace to left, right, or both sides of strings
center Equivalent to pad(side='both')
repeat Duplicate values; for example s.str.repeat(3) equivalent to x * 3 for each string.
replace Replace occurrences of pattern/regex with some other string
slice Slice each string in the Series.
split Split strings on delimiter or regular expression
strip, rstrip, lstrip Trim whitespace, including newlines; equivalent to x.strip() (and rstrip, lstrip, respectively) for each element.