-
Notifications
You must be signed in to change notification settings - Fork 1
/
regular_expressions.py
315 lines (225 loc) · 9.81 KB
/
regular_expressions.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
'''Regular Expressions using the standard module re'''
import re
# https://docs.python.org/3/library/re.html
# match()
# -----------------------------------------------------------------------------
# re.match(pattern, source) succeeds only when the pattern is found at the
# very start of the source. Here 'You' is the pattern and
# 'Young Frankenstein' is the source.
# (Match reprs below are as printed by Python 3.7+; older versions show
# <_sre.SRE_Match object; ...> instead.)
r = re.match('You', 'Young Frankenstein')
print(r)  # <re.Match object; span=(0, 3), match='You'>
print(r.group())  # You

# A pattern that is going to be reused can be compiled once up front; the
# compiled object exposes the same operations as methods:
pattern = re.compile('You')
r = pattern.match('Young Frankenstein')
print(r)  # <re.Match object; span=(0, 3), match='You'>
print(r.group())  # You

# The source does not have to be a literal; it can live in a variable:
source = 'blue red green yellow blueish blue'
pattern = re.compile('blue')
r = pattern.match(source)
print(r)  # <re.Match object; span=(0, 4), match='blue'>
print(r.group())  # blue

# 'green' is in the source but not at its start, so match() returns None:
r = re.match('green', source)
print(r)  # None
# Calling r.group() at this point would raise
# AttributeError: 'NoneType' object has no attribute 'group'

# Wildcards .*
# -----------------------------------------------------------------------------
# .  stands for any single character
# *  repeats whatever precedes it any number of times
# .* together allow any run of characters ahead of 'green'
r = re.match('.*green', source)
print(r)  # <re.Match object; span=(0, 14), match='blue red green'>
print(r.group())  # blue red green

# search()
# -----------------------------------------------------------------------------
# search() looks through the whole source and returns the first match, if any.
r = re.search('green', source)
print(r)  # <re.Match object; span=(9, 14), match='green'>
print(r.group())  # green

# findall()
# -----------------------------------------------------------------------------
# findall() collects every non-overlapping match into a list (possibly empty).
r = re.findall('blue', source)
print('Found {} matches'.format(len(r)))  # Found 3 matches

# 'e.' means: an 'e' followed by exactly one more character.
r = re.findall('e.', source)
print(r)  # ['e ', 'ed', 'ee', 'el', 'ei']

# The final 'e' of the source is missing above because nothing follows it.
# Adding '?' makes the character after 'e' optional:
r = re.findall('e.?', source)
print(r)  # ['e ', 'ed', 'ee', 'el', 'ei', 'e']

# split() and sub()
# -----------------------------------------------------------------------------
# split() cuts the source wherever the pattern matches and returns the
# pieces as a list.
r = re.split(' ', source)
print(r)  # ['blue', 'red', 'green', 'yellow', 'blueish', 'blue']

# sub() takes an extra replacement argument and returns the source with every
# match of the pattern swapped for that replacement.
r = re.sub('blue', 'black', source)
print(r)  # black red green yellow blackish black
# Special characters
# -----------------------------------------------------------------------------
# \d a single digit
# \D a single non-digit
# \w an alphanumeric character or underscore
# \W a non-alphanumeric, non-underscore character
# \s a whitespace character
# \S a non-whitespace character
# \b a word boundary (the beginning or end of a word)
# \B a non-word boundary (not the beginning or end of a word)
# testing
# -----------------------------------------------------------------------------
# Sample text used by the examples below. Note that it starts and ends with
# a newline character.
sample = """
Intro:
Is this the real Life? Is this just fantasy?
Caught in a landslide, no escape from reality.
Open your eyes, look up to the skies and see.
I'm just a poor boy, I need no sympathy.
Because I'm easy come, easy go, little high, little low.
Any way the wind blows doesn't really matter to me, to me.
For testing: (Verse 1-4, Outro) beau dish, wish, fish, surreal
"""

# which characters are digits:
r = re.findall(r'\d', sample)
print(r)  # ['1', '4']

# which characters are digits, letters or underscore (result not printed):
r = re.findall(r'\w', sample)

# note \d and \w work on whatever Unicode defines as a digit or character
# for example:
test = 'abc-/&\u00ea\u0115'
r = re.findall(r'\w', test)
print(r)  # ['a', 'b', 'c', 'ê', 'ĕ']

# There are a few cases in which the regular expression pattern rules conflict
# with the Python string rules. The following pattern should match any word
# that begins with b. It is deliberately NOT a raw string, to show the
# problem:
r = re.findall('\bb', sample)
print(r)  # []

# In the mini-language of regular expressions \b means the beginning or end of
# a word but in Python strings it means backspace. Avoid the accidental use of
# escape characters by using Python's "raw strings" when you define your
# regular expression string. Always put an r character before your regular
# expression pattern string, and Python escape characters will be disabled:
r = re.findall(r'\bb', sample)
print(r)  # ['b', 'b', 'b']

# The above isn't very helpful as we only get the 'b' part of the match.
# This says, find all complete words that start with the letter 'b'
r = re.findall(r'\bb\w*', sample)
print(r)  # ['boy', 'blows', 'beau']
# breakdown:
# \b - indicates the beginning of a word
# b - starts with b
# \w* - followed by any number of alphanumeric characters

# This says, find all words that start with the letter 'b' or 'B'
r = re.findall(r'\b[bB]\w*', sample)
print(r)  # ['boy', 'Because', 'blows', 'beau']

# This says, find all 5 letter words that start with 'b' or 'B'
r = re.findall(r'\b[bB]\w\w\w\w\b', sample)
print(r)  # ['blows']

# Same as above, using a repeat count instead of spelling out \w four times:
r = re.findall(r'\b[bB]\w{4}\b', sample)
print(r)  # ['blows']

# Find all words that end in the letter 'r':
r = re.findall(r'\b\w*r\b', sample)
print(r)  # ['your', 'poor', 'matter', 'For']

# The same approach doesn't work well for words ending in 't', because
# apostrophes aren't matched by \w. This says match any number of letters or
# apostrophes: [\w']*
r = re.findall(r"\b[\w']*t\b", sample)
print(r)  # ['just', 'Caught', 'just', "doesn't"]
# Pattern Specifiers
# -----------------------------------------------------------------------------
# abc literal abc
# (...) a group: any valid regular expression, treated as a single unit
# a|b a or b - these can be expressions too (...)|(...)
# . any character except \n
# ^ start of source string
# $ end of source string
# * zero or more of the preceding character, ab* -> a, ab, abbb
# + one or more of the preceding character, ab+ -> ab, abbb
# ? means the preceding character is optional (zero or one)
# abc? c is optional, a(bc)? means bc is optional
# abc*? zero or more c, as few as possible, will return ab
# abc+ one or more c, as many as possible
# abc+? one or more c, as few as possible
# a{m} exactly m consecutive a, a{3} is aaa
# a{m,n} m to n consecutive a, as many as possible
# a{m,n}? m to n consecutive a, as few as possible
# [abc] a or b or c (same as a|b|c)
# [^abc] not (a or b or c)
# prev(?=next) prev if followed by next
# prev(?!next) prev if not followed by next
# (?<=prev)next next if preceded by prev
# (?<!prev)next next if not preceded by prev
# More testing
# -----------------------------------------------------------------------------
# 'real' wherever it occurs, even inside a longer word:
r = re.findall('real', sample)
print(r)  # ['real', 'real', 'real', 'real']

# words that start with 'real':
r = re.findall(r'\breal\w*', sample)
print(r)  # ['real', 'reality', 'really']

# words that end with 'real':
r = re.findall(r'\w*real\b', sample)
print(r)  # ['real', 'surreal']

# 'real' as a complete word (word boundary on both sides):
r = re.findall(r'\breal\b', sample)
print(r)  # ['real']

# words that start OR end with 'real':
r = re.findall(r'\breal\w*|\w*real\b', sample)
print(r)  # ['real', 'reality', 'really', 'surreal']

# ^ and $ are known as anchors: ^ pins the pattern to the beginning of the
# string and $ pins it to the end.
# 'Intro' at the very beginning (no match: sample starts with a newline):
r = re.findall(r'^Intro', sample)
print(r)  # []

# newline + 'Intro' at the very beginning:
r = re.findall(r'^\nIntro', sample)
print(r)  # ['\nIntro']

# 'surreal' at the end:
r = re.findall(r'surreal$', sample)
print(r)  # ['surreal']

# w, f or d followed by 'ish':
r = re.findall(r'[wfd]ish', sample)
print(r)  # ['dish', 'wish', 'fish']

# a run of one or more b/c characters plus any word characters after it:
r = re.findall(r'[bc]+\w*', sample)
print(r)  # ['cape', 'boy', 'cause', 'come', 'blows', 'beau']

# 'me' followed by a non-alphanumeric character:
r = re.findall(r'me\W', sample)
print(r)  # ['me,', 'me,', 'me.']

# 'poor ' only where 'boy' comes next (the lookahead is not in the result):
r = re.findall(r'poor (?=boy)', sample)
print(r)  # ['poor ']

# ' blows' only where 'wind' comes just before (lookbehind, not in result):
r = re.findall(r'(?<=wind) blows', sample)
print(r)  # [' blows']

# words containing three vowels in a row:
r = re.findall(r'\b\w*[aeiuo]{3}\w*\b', sample)
print(r)  # ['beau']

# Match Output
# -----------------------------------------------------------------------------
# With match() or search(), the complete matched text is read from the result
# object r via r.group(). Wrapping part of a pattern in parentheses saves that
# part to a group of its own, and r.groups() returns those groups as a tuple:
r = re.search(r'(escape).*(reality)', sample)
print(r.group())  # escape from reality
print(r.groups())  # ('escape', 'reality')
# Even More testing
# -----------------------------------------------------------------------------
# Command-line check: report whether a pattern matches the start of a string.
# To use sys.argv, run the file like this:
# $ python3 regular_expressions.py 'hello' 'hello world'
# (`re` is already imported at the top of the file.)
import sys


def describe_match(pattern, search_string):
    """Return a sentence saying whether pattern matches search_string.

    The check uses re.match(), so the pattern has to match at the
    beginning of search_string.

    Args:
        pattern: regular expression to test.
        search_string: text the pattern is matched against.

    Returns:
        "'<search_string>' matches pattern '<pattern>'" on success,
        "'<search_string>' doesn't match pattern '<pattern>'" otherwise.

    Raises:
        re.error: if pattern is not a valid regular expression.
    """
    if re.match(pattern, search_string):
        template = "'{}' matches pattern '{}'"
    else:
        template = "'{}' doesn't match pattern '{}'"
    return template.format(search_string, pattern)


# Only run the check when both arguments were supplied, so the examples above
# can still be run on their own without an IndexError from sys.argv.
if len(sys.argv) >= 3:
    pattern = sys.argv[1]
    search_string = sys.argv[2]
    try:
        print(describe_match(pattern, search_string))
    except re.error as err:
        # A malformed pattern typed on the command line is reported, not
        # raised as a traceback.
        print("invalid pattern '{}': {}".format(pattern, err))