re – Regular Expressions 理解完正则学会一半

理解完正则学会一半：

import re

def test_patterns(text, patterns=None):
    """Print every match of each pattern found in ``text``.

    A two-row column ruler (tens digit above units digit) is printed
    above the text so the reported offsets can be read off by eye.
    For each pattern, every match yielded by ``re.finditer`` is listed
    as ``start : last = "matched text"``, where ``last`` is the index
    of the final matched character (``match.end() - 1``).

    Args:
        text: The string to search.
        patterns: Iterable of regular expression strings. ``None``
            (the default) means "no patterns": only the ruler and the
            text itself are printed.

    Returns:
        None. All results are written to stdout.
    """
    # A None sentinel replaces the former shared mutable default ([]).
    if patterns is None:
        patterns = ()

    # Every print below is a single-argument print(...), which produces
    # the same output as a Python 2 statement and a Python 3 function.
    columns = range(len(text))
    print('')
    # Tens row: floor division keeps the digit integral on Python 3;
    # a zero tens digit is shown as a blank.
    print(''.join(str(i // 10 or ' ') for i in columns))
    # Units row.
    print(''.join(str(i % 10) for i in columns))
    print(text)

    # Look for each pattern in the text and print the results.
    for pattern in patterns:
        print('')
        print('Matching "%s"' % pattern)
        for match in re.finditer(pattern, text):
            start, end = match.span()
            # Report the inclusive index of the last matched character.
            print('  %2d : %2d = "%s"' % (start, end - 1, text[start:end]))

if __name__ == '__main__':
    # Demo driver: each section feeds a sample text and a list of
    # patterns to test_patterns(), separated by a row of asterisks.
    # Single-argument print(...) works on both Python 2 and Python 3.
    print("*" * 50)
    # Pattern Syntax
    test_patterns('abbaaabbbbaaaaa', ['ab'])
    print("*" * 50)
    # Repetition
    test_patterns('abbaaabbbbaaaaa', [
        'ab*',      # a followed by zero or more b
        'ab+',      # a followed by one or more b
        'ab?',      # a followed by zero or one b
        'ab{3}',    # a followed by three b
        'ab{2,3}',  # a followed by two to three b
    ])
    print("*" * 50)
    # Character Sets
    test_patterns('abbaaabbbbaaaaa', [
        '[ab]',     # either a or b
        'a[ab]+',   # a followed by one or more a or b
        'a[ab]+?',  # a followed by one or more a or b, not greedy
    ])
    print("*" * 50)
    test_patterns('This is some text -- with punctuation.', [
        '[^-. ]+',  # sequences without -, ., or space
    ])
    print("*" * 50)
    test_patterns('This is some text -- with punctuation.', [
        '[a-z]+',       # sequences of lower case letters
        '[A-Z]+',       # sequences of upper case letters
        '[a-zA-Z]+',    # sequences of lower or upper case letters
        '[A-Z][a-z]+',  # one upper case letter followed by lower case letters
    ])
    print("*" * 50)
    test_patterns('abbaaabbbbaaaaa', [
        'a.',     # a followed by any one character
        'b.',     # b followed by any one character
        'a.*b',   # a followed by anything, ending in b
        'a.*?b',  # a followed by anything, ending in b (not greedy)
    ])
    print("*" * 50)
    # Escape Codes
    # Code  Meaning
    # \d    a digit
    # \D    a non-digit
    # \s    whitespace (tab, space, newline, etc.)
    # \S    non-whitespace
    # \w    alphanumeric
    # \W    non-alphanumeric
    test_patterns('This is a prime #1 example!', [
        r'\d+',  # sequence of digits
        r'\D+',  # sequence of non-digits
        r'\s+',  # sequence of whitespace
        r'\S+',  # sequence of non-whitespace
        r'\w+',  # alphanumeric characters
        r'\W+',  # non-alphanumeric
    ])
    print("*" * 50)
    # Escaping the backslash and the plus sign makes each pattern match
    # the escape code itself as literal text.
    test_patterns(r'\d+ \D+ \s+ \S+ \w+ \W+', [
        r'\\d\+',
        r'\\D\+',
        r'\\s\+',
        r'\\S\+',
        r'\\w\+',
        r'\\W\+',
    ])
    # Anchoring
    # Code  Meaning
    # ^     start of string, or line
    # $     end of string, or line
    # \A    start of string
    # \Z    end of string
    # \b    empty string at the beginning or end of a word
    # \B    empty string not at the beginning or end of a word
    print("*" * 50)
    test_patterns('This is some text -- with punctuation.', [
        r'^\w+',      # word at start of string
        r'\A\w+',     # word at start of string
        r'\w+\S*$',   # word at end of string, with optional punctuation
        r'\w+\S*\Z',  # word at end of string, with optional punctuation
        r'\w*t\w*',   # word containing 't'
        r'\bt\w+',    # 't' at start of word
        r'\w+t\b',    # 't' at end of word
        r'\Bt\B',     # 't', not start or end of word
    ])

输出结果：

**************************************************

          11111
012345678901234
abbaaabbbbaaaaa

Matching "ab"
   0 :  1 = "ab"
   5 :  6 = "ab"
**************************************************

          11111
012345678901234
abbaaabbbbaaaaa

Matching "ab*"
   0 :  2 = "abb"
   3 :  3 = "a"
   4 :  4 = "a"
   5 :  9 = "abbbb"
  10 : 10 = "a"
  11 : 11 = "a"
  12 : 12 = "a"
  13 : 13 = "a"
  14 : 14 = "a"

Matching "ab+"
   0 :  2 = "abb"
   5 :  9 = "abbbb"

Matching "ab?"
   0 :  1 = "ab"
   3 :  3 = "a"
   4 :  4 = "a"
   5 :  6 = "ab"
  10 : 10 = "a"
  11 : 11 = "a"
  12 : 12 = "a"
  13 : 13 = "a"
  14 : 14 = "a"

Matching "ab{3}"
   5 :  8 = "abbb"

Matching "ab{2,3}"
   0 :  2 = "abb"
   5 :  8 = "abbb"
**************************************************

          11111
012345678901234
abbaaabbbbaaaaa

Matching "[ab]"
   0 :  0 = "a"
   1 :  1 = "b"
   2 :  2 = "b"
   3 :  3 = "a"
   4 :  4 = "a"
   5 :  5 = "a"
   6 :  6 = "b"
   7 :  7 = "b"
   8 :  8 = "b"
   9 :  9 = "b"
  10 : 10 = "a"
  11 : 11 = "a"
  12 : 12 = "a"
  13 : 13 = "a"
  14 : 14 = "a"

Matching "a[ab]+"
   0 : 14 = "abbaaabbbbaaaaa"

Matching "a[ab]+?"
   0 :  1 = "ab"
   3 :  4 = "aa"
   5 :  6 = "ab"
  10 : 11 = "aa"
  12 : 13 = "aa"
**************************************************

          1111111111222222222233333333
01234567890123456789012345678901234567
This is some text -- with punctuation.

Matching "[^-. ]+"
   0 :  3 = "This"
   5 :  6 = "is"
   8 : 11 = "some"
  13 : 16 = "text"
  21 : 24 = "with"
  26 : 36 = "punctuation"
**************************************************

          1111111111222222222233333333
01234567890123456789012345678901234567
This is some text -- with punctuation.

Matching "[a-z]+"
   1 :  3 = "his"
   5 :  6 = "is"
   8 : 11 = "some"
  13 : 16 = "text"
  21 : 24 = "with"
  26 : 36 = "punctuation"

Matching "[A-Z]+"
   0 :  0 = "T"

Matching "[a-zA-Z]+"
   0 :  3 = "This"
   5 :  6 = "is"
   8 : 11 = "some"
  13 : 16 = "text"
  21 : 24 = "with"
  26 : 36 = "punctuation"

Matching "[A-Z][a-z]+"
   0 :  3 = "This"
**************************************************

          11111
012345678901234
abbaaabbbbaaaaa

Matching "a."
   0 :  1 = "ab"
   3 :  4 = "aa"
   5 :  6 = "ab"
  10 : 11 = "aa"
  12 : 13 = "aa"

Matching "b."
   1 :  2 = "bb"
   6 :  7 = "bb"
   8 :  9 = "bb"

Matching "a.*b"
   0 :  9 = "abbaaabbbb"

Matching "a.*?b"
   0 :  1 = "ab"
   3 :  6 = "aaab"
**************************************************

          11111111112222222
012345678901234567890123456
This is a prime #1 example!

Matching "\d+"
  17 : 17 = "1"

Matching "\D+"
   0 : 16 = "This is a prime #"
  18 : 26 = " example!"

Matching "\s+"
   4 :  4 = " "
   7 :  7 = " "
   9 :  9 = " "
  15 : 15 = " "
  18 : 18 = " "

Matching "\S+"
   0 :  3 = "This"
   5 :  6 = "is"
   8 :  8 = "a"
  10 : 14 = "prime"
  16 : 17 = "#1"
  19 : 26 = "example!"

Matching "\w+"
   0 :  3 = "This"
   5 :  6 = "is"
   8 :  8 = "a"
  10 : 14 = "prime"
  17 : 17 = "1"
  19 : 25 = "example"

Matching "\W+"
   4 :  4 = " "
   7 :  7 = " "
   9 :  9 = " "
  15 : 16 = " #"
  18 : 18 = " "
  26 : 26 = "!"
**************************************************

          1111111111222
01234567890123456789012
\d+ \D+ \s+ \S+ \w+ \W+

Matching "\\d\+"
   0 :  2 = "\d+"

Matching "\\D\+"
   4 :  6 = "\D+"

Matching "\\s\+"
   8 : 10 = "\s+"

Matching "\\S\+"
  12 : 14 = "\S+"

Matching "\\w\+"
  16 : 18 = "\w+"

Matching "\\W\+"
  20 : 22 = "\W+"
**************************************************

          1111111111222222222233333333
01234567890123456789012345678901234567
This is some text -- with punctuation.

Matching "^\w+"
   0 :  3 = "This"

Matching "\A\w+"
   0 :  3 = "This"

Matching "\w+\S*$"
  26 : 37 = "punctuation."

Matching "\w+\S*\Z"
  26 : 37 = "punctuation."

Matching "\w*t\w*"
  13 : 16 = "text"
  21 : 24 = "with"
  26 : 36 = "punctuation"

Matching "\bt\w+"
  13 : 16 = "text"

Matching "\w+t\b"
  13 : 16 = "text"

Matching "\Bt\B"
  23 : 23 = "t"
  30 : 30 = "t"
  33 : 33 = "t"

待续...

re – Regular Expressions 理解完正则学会一半

相关文章