Author mark
Recipients akitada, akuchling, amaury.forgeotdarc, collinwinter, ezio.melotti, georg.brandl, gregory.p.smith, jaylogan, jimjjewett, loewis, mark, moreati, mrabarnett, nneonneo, pitrou, r.david.murray, rsc, sjmachin, timehorse, vbr
Date 2009-08-15.07:49:45
SpamBayes Score 1.81072e-07
Marked as misclassified No
Message-id <1250322589.0.0.356740161312.issue2636@psf.upfronthosting.co.za>
In-reply-to
Content
Hi,

I've noticed 3 differences between the re and regex engines. 
I don't know if they are intended or not, but thought it best to mention
them. (I used the issue2636-20090810#3.zip version.)

Python 2.6.2 (r262:71600, Apr 20 2009, 09:25:38) 
[GCC 4.3.2 20081105 (Red Hat 4.3.2-7)] on linux2
IDLE 2.6.2      
>>> import re, regex
>>> ############################################################ 1 of 3
>>> re1= re.compile(r"""
                    (?!<\w)(?P<name>[-\w]+)=
                    (?P<quote>(?P<single>')|(?P<double>"))?
                    (?P<value>(?(single)[^']+?|(?(double)[^"]+?|\S+)))
                    (?(quote)(?P=quote))
                    """, re.VERBOSE)
>>> re2= regex.compile(r"""
                    (?!<\w)(?P<name>[-\w]+)=
                    (?P<quote>(?P<single>')|(?P<double>"))?
                    (?P<value>(?(single)[^']+?|(?(double)[^"]+?|\S+)))
                    (?(quote)(?P=quote))
                    """, re.VERBOSE)
>>> text = "<table border='1'>"
>>> re1.findall(text)
[('border', "'", "'", '', '1')]
>>> re2.findall(text)
[]
>>> text = "<table border=1>"
>>> re1.findall(text)
[('border', '', '', '', '1>')]
>>> re2.findall(text)
[]
>>> ############################################################ 2 of 3
>>> re1 = re.compile(r"""^[ \t]*
                         (?P<parenthesis>\()?
                         [- ]?
                         (?P<area>\d{3})
                         (?(parenthesis)\))
                         [- ]?
                         (?P<local_a>\d{3})
                         [- ]?
                         (?P<local_b>\d{4})
                         [ \t]*$
                         """, re.VERBOSE)
>>> re2 = regex.compile(r"""^[ \t]*
                         (?P<parenthesis>\()?
                         [- ]?
                         (?P<area>\d{3})
                         (?(parenthesis)\))
                         [- ]?
                         (?P<local_a>\d{3})
                         [- ]?
                         (?P<local_b>\d{4})
                         [ \t]*$
                         """, re.VERBOSE)
>>> data = ("179-829-2116", "(187) 160 0880", "(286)-771-3878",
"(291) 835-9634", "353-896-0505", "(555) 555 5555", "(555) 555-5555",
"(555)-555-5555", "555 555 5555", "555 555-5555", "555-555-5555",
"601 805 3142", "(675) 372 3135", "810 329 7071", "(820) 951 3885",
"942 818-5280", "(983)8792282")
>>> for d in data:
	ans1 = re1.findall(d)
	ans2 = re2.findall(d)
	print "re=%s rx=%s %d" % (ans1, ans2, ans1 == ans2)

re=[('', '179', '829', '2116')] rx=[('', '179', '829', '2116')] 1
re=[('(', '187', '160', '0880')] rx=[] 0
re=[('(', '286', '771', '3878')] rx=[('(', '286', '771', '3878')] 1
re=[('(', '291', '835', '9634')] rx=[] 0
re=[('', '353', '896', '0505')] rx=[('', '353', '896', '0505')] 1
re=[('(', '555', '555', '5555')] rx=[] 0
re=[('(', '555', '555', '5555')] rx=[] 0
re=[('(', '555', '555', '5555')] rx=[('(', '555', '555', '5555')] 1
re=[('', '555', '555', '5555')] rx=[] 0
re=[('', '555', '555', '5555')] rx=[] 0
re=[('', '555', '555', '5555')] rx=[('', '555', '555', '5555')] 1
re=[('', '601', '805', '3142')] rx=[] 0
re=[('(', '675', '372', '3135')] rx=[] 0
re=[('', '810', '329', '7071')] rx=[] 0
re=[('(', '820', '951', '3885')] rx=[] 0
re=[('', '942', '818', '5280')] rx=[] 0
re=[('(', '983', '879', '2282')] rx=[('(', '983', '879', '2282')] 1
>>> ############################################################ 3 of 3
>>> re1 = re.compile(r"""
<img\s+[^>]*?src=(?:(?P<quote>["'])(?P<qimage>[^\1>]+?)   
(?P=quote)|(?P<uimage>[^"' >]+))[^>]*?>""", re.VERBOSE)
>>> re2 = regex.compile(r"""
<img\s+[^>]*?src=(?:(?P<quote>["'])(?P<qimage>[^\1>]+?)   
(?P=quote)|(?P<uimage>[^"' >]+))[^>]*?>""", re.VERBOSE)
>>> data = """<body> <img src='a.png'> <img alt='picture' src="b.png">
              <img alt="picture" src="Big C.png" other="xyx">
              <img src=icon.png alt=icon>
              <img src="I'm here!.jpg" alt="aren't I?">"""
>>> data = data.split("\n")
>>> data = [x.strip() for x in data]
>>> for d in data:
	ans1 = re1.findall(d)
	ans2 = re2.findall(d)
	print "re=%s rx=%s %d" % (ans1, ans2, ans1 == ans2)

re=[("'", 'a.png', '')] rx=[("'", 'a.png', '')] 1
re=[('"', 'b.png', '')] rx=[('"', 'b.png', '')] 1
re=[('"', 'Big C.png', '')] rx=[('"', 'Big C.png', '')] 1
re=[('', '', 'icon.png')] rx=[('', '', 'icon.png alt=icon')] 0
re=[('"', "I'm here!.jpg", '')] rx=[('"', "I'm here!.jpg", '')] 1

I'm sorry I haven't had the time to try to minimize the examples, but I
hope that at least they will prove helpful.

Number 3 looks like a problem with non-greedy matching; I don't know
about the others.
History
Date User Action Args
2009-08-15 07:49:49 mark set recipients: + mark, loewis, akuchling, georg.brandl, collinwinter, gregory.p.smith, jimjjewett, sjmachin, amaury.forgeotdarc, pitrou, nneonneo, rsc, timehorse, vbr, ezio.melotti, mrabarnett, jaylogan, akitada, moreati, r.david.murray
2009-08-15 07:49:49 mark set messageid: <1250322589.0.0.356740161312.issue2636@psf.upfronthosting.co.za>
2009-08-15 07:49:47 mark link issue2636 messages
2009-08-15 07:49:45 mark create