Python: regex-based tokenizer in 4 lines of code
April 2021
import re
# Tokenizer pattern.  Alternatives are tried left to right and each one owns
# exactly one capture group, so for any match precisely one group is not None
# and its position identifies the token kind.  Ordering matters: hex literals
# precede decimal ones, two-character operators precede single-character
# punctuation, and triple-quoted strings precede ordinary strings.
#
# re.VERBOSE allows the layout and inline comments below; re.DOTALL lets the
# triple-quoted string body (and a backslash escape) span newlines.
SCANNER = re.compile(r'''
    (\s+)                     |  #  1: whitespace
    (//)[^\n]*                |  #  2: line comment (only the leading marker is captured)
    0[xX]([0-9A-Fa-f]+)       |  #  3: hexadecimal integer literal (digits without the prefix)
    (\d+)                     |  #  4: decimal integer literal
    (<<|>>)                   |  #  5: multi-char punctuation
    ([][(){}<>=,;:*+\-/])     |  #  6: single-char punctuation; the hyphen is escaped so that
                                 #     it is a literal and not a range that would also admit a dot
    ([A-Za-z_][A-Za-z0-9_]*)  |  #  7: identifier
    """(.*?)"""               |  #  8: multi-line string literal (body only, may be empty)
    "((?:[^"\n\\]|\\.)*)"     |  #  9: regular string literal (body only, may be empty)
    (.)                          # 10: any other character -- an error
''', re.DOTALL | re.VERBOSE)
# Walk the input token by token.  Every alternative in SCANNER carries one
# capture group, so match.groups() yields ten values of which one is set and
# the rest are None; unpacking them by name identifies the token kind.
for match in SCANNER.finditer(data):
    (space, comment, hexint, integer, mpunct,
     punct, word, mstringlit, stringlit, badchar) = match.groups()
    # Test against None rather than truthiness: the two string-literal groups
    # capture '' for an empty literal, which a plain `if group:` would skip.
    if space is not None: ...
    if comment is not None: ...
    # ... (handlers for the remaining token kinds go here, same pattern)
    if badchar is not None:
        # NOTE(review): FooException is the snippet's placeholder name; the
        # surrounding code must define it (or substitute its own error type).
        raise FooException(
            f"unexpected character {badchar!r} at offset {match.start()}")
Leave a Reply