re is a built-in Python module for regular expressions.
import numpy as np
import pandas as pd
import re
fruits.txt is a list of fruits distributed with R's
stringr library.
fruits_df = pd.read_csv('./fruits.txt')
# Unpack the 'fruit' column into a plain Python list so the names can be
# fed to the `re` module one at a time.
fruits = [*fruits_df['fruit'].values]
# Bare expression: the preview of the first rows is only shown when this is
# the last statement of an interactive / notebook cell.
fruits_df.head()
The pandas .str accessor provides contains, match, fullmatch, count, findall, replace, extract, split.
fruits_df[fruits_df['fruit'].str.match('^a')]
str.contains() returns a bool indicating whether a pattern
is found in each entry of a string series. It is analogous to re.search().
#[re.search(' ', fruit) is not None for fruit in fruits]
# Collect every fruit whose name contains a space.
# re.search() returns a match object (truthy) or None (falsy), so it can be
# used directly as the filter condition without comparing against None.
two_word_fruits = [fruit for fruit in fruits if re.search(' ', fruit)]
two_word_fruits
# Rows whose fruit name contains a space.
fruits_df.loc[fruits_df['fruit'].str.contains(' ')]
# Rows whose fruit name contains the letter 'a' anywhere.
fruits_df.loc[fruits_df['fruit'].str.contains('a')]
^ indicates the match must come at the
beginning of the string. fruits_df[fruits_df['fruit'].str.contains('^a')]
$ indicates the match must come at the
end of the string. fruits_df[fruits_df['fruit'].str.contains('a$')]
The same prefix/suffix tests can be written with the .startswith() and .endswith()
string methods.
fruits_df[
np.logical_or(
fruits_df['fruit'].str.startswith('a'),
fruits_df['fruit'].str.endswith('a')
)
]
| can be used as an or operator in regular
expressions. fruits_df[fruits_df['fruit'].str.contains('^a|a$')]
# Rows whose fruit name begins with a lowercase vowel.
fruits_df.loc[fruits_df['fruit'].str.contains('^[aeiou]')]
Inside square brackets, a leading ^ means to
match anything but the listed characters.
fruits_df[fruits_df['fruit'].str.contains('[^aeiounrt]$')]
[a-z] - lowercase letters; [A-Z] - uppercase letters; [0-9] - digits; [A-Za-z0-9] - any ASCII letter or digit. {} can be used to specify
a specific number (or range) of matches.
fruits_df[fruits_df['fruit'].str.contains('[^aeiounrt]{2}$')]
#fruits_df[fruits_df['fruit'].str.contains('[^aeiour]{2, 3}$')]
#fruits_df[fruits_df['fruit'].str.contains('')]
* indicates 0 or more matches, ? indicates
0 or 1 matches, and + indicates one or more matches. The . can be used to match any single character. .* matches anything but a newline (\n) character.
rgx0 = '[aeiou]{2}.[aeiou]{2}'
# Rows matching rgx0: two vowels, exactly one arbitrary character, two vowels.
fruits_df[fruits_df['fruit'].str.contains(rgx0)]

# rgx1 relaxes the middle to "one or more arbitrary characters". It is
# defined before the branch because both branches read it.
rgx1 = '[aeiou]{2}.+[aeiou]{2}'

# Toggle: True selects every rgx1 match; False selects only the rows that
# match rgx1 but do not match rgx0.
first = True
# NOTE(review): the selections below are bare expressions nested inside
# if/else, so their values are discarded; wrap them in print()/display() if
# the output is wanted.
if first:
    fruits_df[fruits_df['fruit'].str.contains(rgx1)]
else:
    fruits_df[
        np.logical_and(
            fruits_df['fruit'].str.contains(rgx1),
            ~fruits_df['fruit'].str.contains(rgx0)
        )
    ]
A literal . can be matched by escaping it
with a backslash \, e.g. \., or by placing it in a character class, [.].
fruits.append('507@umich.edu')
# Print the last element of the list.
print(fruits[-1])
# Two equivalent ways to match a literal dot: r'\.' escapes it (raw string so
# the backslash reaches the regex engine unchanged), '[.]' puts it in a
# character class. An entry containing a dot is printed once by each test,
# the second time wrapped in square brackets.
for f in fruits:
    if re.search(r'\.', f):
        print(f)
    if re.search('[.]', f):
        print('[' + f + ']')
Because \ is used as an escape character, a literal backslash
\ needs to be escaped.
fruits.append(r'C:\path\file.txt')
# Last element of the list, via negative indexing.
fruits[-1]
To avoid unwanted escaping with \ in a regular expression, use raw string literals like r'C:\x' instead of the equivalent 'C:\x'.
-- Wes McKinney
# Find entries containing a literal backslash, written two equivalent ways:
# the raw string r'\\' and the doubly-escaped plain string '\\\\' both hand
# the regex engine the two-character pattern \\ (one escaped backslash).
for f in fruits:
    if re.search(r'\\', f):
        print(f)
    if re.search('\\\\', f):
        print('ugh!')
        print(f)
# Shorthand character classes: \w is roughly [a-zA-Z0-9_] and \W is its
# complement; \d = [0-9] and \D is its complement; \s matches whitespace and
# \S matches non-whitespace.
# Raw strings are used so the backslashes reach the regex engine unchanged.
fruits_df[fruits_df['fruit'].str.contains(r'\s')]
# (.)\1 matches any character immediately followed by itself.
fruits_df[fruits_df['fruit'].str.contains(r'(.)\1')]
#fruits_df[fruits_df['fruit'].str.contains('([^r])\\1')]
#fruits_df[fruits_df['fruit'].str.contains('(.)\\1$')]