`re` is a built-in Python module for regular expressions.
import numpy as np
import pandas as pd
import re
`fruit.txt` is a list of fruits distributed with R's `stringr` library.
fruits_df = pd.read_csv('./fruits.txt')
# Pull the 'fruit' column out as a plain Python list so that each name can be
# passed to the functions in `re` one at a time.
fruits = fruits_df['fruit'].tolist()
# Preview the first rows of the table.
fruits_df.head()
`contains`, `match`, `fullmatch`, `count`, `findall`, `replace`, `extract`, `split`.
fruits_df[fruits_df['fruit'].str.match('^a')]
`str.contains()` returns a bool indicating whether a pattern is found in each entry of a string series (compare `re.search()` for a single string).
#[re.search(' ', fruit) is not None for fruit in fruits]
# Keep only the fruits whose name contains a space (i.e. two-word names).
# `re.search` returns a match object (truthy) or None (falsy), so its result
# can be used directly as the filter condition.
two_word_fruits = [fruit for fruit in fruits if re.search(' ', fruit)]
two_word_fruits
# The same selection expressed with the vectorised `str.contains`.
fruits_df[fruits_df['fruit'].str.contains(' ')]
# Fruits with an 'a' anywhere in the name.
fruits_df[fruits_df['fruit'].str.contains('a')]
`^` indicates the match must come at the beginning of the string.
fruits_df[fruits_df['fruit'].str.contains('^a')]
`$` indicates the match must come at the end of the string.
fruits_df[fruits_df['fruit'].str.contains('a$')]
The same selections can be made with the `.startswith()` and `.endswith()` methods.
fruits_df[
np.logical_or(
fruits_df['fruit'].str.startswith('a'),
fruits_df['fruit'].str.endswith('a')
)
]
`|` can be used as an or operator in regular expressions.
fruits_df[fruits_df['fruit'].str.contains('^a|a$')]
# Fruits whose name begins with a vowel: '^' anchors the character class
# [aeiou] to the start of the string.
fruits_df[fruits_df['fruit'].str.contains('^[aeiou]')]
Inside square brackets, a leading `^` means to match anything but the listed characters.
fruits_df[fruits_df['fruit'].str.contains('[^aeiounrt]$')]
`[a-z]` - lowercase letters, `[A-Z]` - uppercase letters, `[0-9]` - digits, `[A-Za-z0-9]` - letters and digits.
`{}` can be used to specify a specific number (or range) of matches.
fruits_df[fruits_df['fruit'].str.contains('[^aeiounrt]{2}$')]
#fruits_df[fruits_df['fruit'].str.contains('[^aeiour]{2, 3}$')]
#fruits_df[fruits_df['fruit'].str.contains('')]
`*` indicates 0 or more matches, `?` indicates 0 or 1 matches, and `+` indicates one or more matches. `.` can be used to match any single character. `.*` matches anything but a newline (`\n`) character.
rgx0 = '[aeiou]{2}.[aeiou]{2}'
# Fruits containing two vowels, exactly one arbitrary character, then two
# more vowels.
fruits_df[fruits_df['fruit'].str.contains(rgx0)]

# Looser pattern: one *or more* characters between the two vowel pairs.
# Defined before the branch so that both arms can use it.
rgx1 = '[aeiou]{2}.+[aeiou]{2}'
first = True
if first:
    # Everything that matches the looser pattern.
    fruits_df[fruits_df['fruit'].str.contains(rgx1)]
else:
    # Matches of the looser pattern that do not match the stricter one.
    fruits_df[
        np.logical_and(
            fruits_df['fruit'].str.contains(rgx1),
            ~fruits_df['fruit'].str.contains(rgx0)
        )
    ]
`.` can be escaped using a backslash `\`, e.g. `\.`, or matched literally with a character class, `[.]`.
fruits.append('507@umich.edu')
# Show the last entry of the list.
print(fruits[-1])
for f in fruits:
    # Two equivalent ways to match a literal dot: a backslash escape, written
    # as a raw string so Python passes the backslash through to `re` ...
    if re.search(r'\.', f):
        print(f)
    # ... or a single-character class, which needs no escaping.
    if re.search('[.]', f):
        print('[' + f + ']')
Because `\` is used as an escape character, a literal backslash `\` needs to be escaped.
fruits.append(r'C:\path\file.txt')
# Show the last entry of the list.
fruits[-1]
To avoid unwanted escaping with \ in a regular expression, use raw string literals like r'C:\x' instead of the equivalent 'C:\x'.
-- Wes McKinney
for f in fruits:
    # Raw string: the regex engine receives `\\`, which matches one literal
    # backslash.
    if re.search(r'\\', f):
        print(f)
    # The same pattern without a raw string: every backslash has to be
    # doubled once more for the Python string literal.
    if re.search('\\\\', f):
        print('ugh!')
        print(f)
`\w` roughly `[a-zA-Z0-9_]`, `\W`, `\d` = `[0-9]`, `\D`, `\s`, `\S`.
# Fruits whose name contains a whitespace character. Raw strings keep the
# backslash sequences intact for the regex engine.
fruits_df[fruits_df['fruit'].str.contains(r'\s')]
# Fruits with a doubled character: `(.)` captures any character and `\1`
# requires that same character again immediately afterwards.
fruits_df[fruits_df['fruit'].str.contains(r'(.)\1')]
#fruits_df[fruits_df['fruit'].str.contains(r'([^r])\1')]
#fruits_df[fruits_df['fruit'].str.contains(r'(.)\1$')]