In this video we’ll continue learning about regular expressions. We’ll cover +, *, Greedy Matching, Lazy Matching, \b, String Boundaries ^ and $, (?m), Subexpressions and we’ll solve a bunch of problems.
If you missed the last regular expression video, it is here. All of the code and a transcript of the video follow the video below.
If you like videos like this, consider donating $1 on Patreon.
[googleplusone]
Code & Transcript
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 |
import re

# Regular expression tutorial script: each section builds a pattern, runs it
# against a sample string, and prints the number of matches followed by the
# matches themselves.

# Did you find a match
# if re.search("REGEX", yourString)

# Get list of matches
# print("Matches :", len(re.findall("REGEX", yourString)))

# Get a pattern object
# regex = re.compile("REGEX")

# Substitute the match
# yourString = regex.sub("substitution", yourString)

# [ ]   : Match any one of the characters in the brackets
# [^ ]  : Match any one character not in the brackets
# .     : Match any 1 character except a newline
# +     : Match 1 or more of what precedes
# \n    : Newline
# \d    : Any 1 digit
# \D    : Anything but a digit
# \w    : Same as [a-zA-Z0-9_]
# \W    : Same as [^a-zA-Z0-9_]
# \s    : Same as [ \f\n\r\t\v]
# \S    : Same as [^ \f\n\r\t\v]
# {5}   : Match exactly 5 of what precedes the curly brackets
# {5,7} : Match between 5 and 7 of what precedes the curly brackets

# ---------- Matching Zero or One ----------
# ? matches zero or one of what precedes it

randStr = "cat cats"

# Match cat or cats. The word is written literally: a character class such
# as [cat]+ would also accept "tac", "a", "ttt", ...
regex = re.compile(r"cats?")

matches = regex.findall(randStr)

print("Matches :", len(matches))

for i in matches:
    print(i)

# ---------- Matching Zero or More ----------
# * matches zero or more of what precedes it

randStr = "doctor doctors doctor's"

# Match doctor, doctors or doctor's
regex = re.compile(r"doctor['s]*")

matches = regex.findall(randStr)

print("Matches :", len(matches))

# You can do the same by setting an interval match
regex = re.compile(r"doctor['s]{0,2}")

matches = regex.findall(randStr)

print("Matches :", len(matches))

for i in matches:
    print(i)

# ---------- PROBLEM ----------
# On Windows newlines are sometimes \n and other times \r\n
# Create a regex that will grab each of the lines in this
# string, print out the number of matches and each line

longStr = '''Just some words
and some more\r
and more
'''

# \s also matches \r and \n, so [\w\s]+ would swallow every line break and
# return the whole text as a single match. Allowing only word characters
# and spaces inside a line keeps each line separate.
regex = re.compile(r"[\w ]+\r?\n")

matches = regex.findall(longStr)

print("Matches :", len(matches))

for i in matches:
    print(i)

# ---------- Greedy & Lazy Matching ----------

randStr = "<name>Life On Mars</name><name>Freaks and Geeks</name>"

# Let's try to grab everything between <name> tags
# Because * is greedy (It grabs the biggest match possible)
# we can't get what we want, which is each individual tag
# match
regex = re.compile(r"<name>.*</name>")

matches = regex.findall(randStr)

print("Matches :", len(matches))

for i in matches:
    print(i)

# To grab the smallest match we use *?, +?, or
# {n,}? instead
regex = re.compile(r"<name>.*?</name>")

matches = regex.findall(randStr)

print("Matches :", len(matches))

for i in matches:
    print(i)

# ---------- Word Boundaries ----------
# We use word boundaries to define where our matches start
# and end

# \b matches the start or end of a word

# The bare pattern r"ape" would match ape and the beginning of apex
randStr = "ape at the apex"

# If we use the word boundary only the whole word ape matches
regex = re.compile(r"\bape\b")

matches = regex.findall(randStr)

print("Matches :", len(matches))

for i in matches:
    print(i)

# ---------- String Boundaries ----------
# ^ : Matches the beginning of a string if outside of
# a [ ]
# $ : Matches the end of a string

# Grab everything from the start of the string to @
randStr = "Match everything up to @"

# [^@]* stops at the first @ no matter what follows it
regex = re.compile(r"^[^@]*")

matches = regex.findall(randStr)

print("Matches :", len(matches))

for i in matches:
    print(i)

# Grab everything from @ to the end of the line
randStr = "@ Get this string"

# The match starts at the first character that is neither @ nor whitespace
# and runs to the end of the string
regex = re.compile(r"[^@\s].*$")

matches = regex.findall(randStr)

print("Matches :", len(matches))

for i in matches:
    print(i)

# Grab the 1st word of each line using the multiline
# code which allows for the targeting of each line after
# a line break with ^
randStr = '''Ape is big
Turtle is slow
Cheetah is fast'''

# Each match includes the whitespace character that ends the word
regex = re.compile(r"(?m)^.*?\s")

matches = regex.findall(randStr)

print("Matches :", len(matches))

for i in matches:
    print(i)

# ---------- Subexpressions ----------
# Subexpressions are parts of a larger expression
# If you want to match for a large block, but
# only want to return part of it. To do that
# surround what you want with ( )

# Get just the number minus the area code
randStr = "My number is 412-555-1212"

regex = re.compile(r"412-(.*)")

matches = regex.findall(randStr)

print("Matches :", len(matches))

for i in matches:
    print(i)

# ---------- Problem ----------
# Get just the numbers minus the area codes from
# this string
randStr = "412-555-1212 412-555-1213 412-555-1214"

regex = re.compile(r"412-(.{8})")

matches = regex.findall(randStr)

print("Matches :", len(matches))

for i in matches:
    print(i)

# ---------- Multiple Subexpressions ----------
# You can have multiple subexpressions as well

# Get both numbers that follow 412 separately
randStr = "My number is 412-555-1212"

regex = re.compile(r"412-(.*)-(.*)")

# With more than one group findall returns a list of tuples,
# one tuple per match and one item per group
matches = regex.findall(randStr)

print("Matches :", len(matches))

print(matches[0][0])
print(matches[0][1])
Leave a Reply