#!/usr/bin/env python2
"""Profile href extraction and URI parsing done with the ``re`` module.

The script reads an HTML document, repeatedly scans it for ``href``
attributes, runs every attribute value through a large URI-parsing regular
expression, and finally prints the hottest entries of the cProfile report.
"""

import codecs
import cProfile
import os
import profile
import pstats
import re
import sys

#######################################################

## Matches href="..." or href='...'.  The closing delimiter is a
## backreference to whichever quote character opened the value; the value
## itself is captured lazily in the 'url' group.
hrefRE = re.compile(
    ''.join(
        [
            r'href=',
            r'(?P<quote>["\'])',
            r'(?P<url>',
            r'.*?',
            r')',
            r'(?P=quote)',
        ],
    ),
)

#######################################################

## One path segment: an optional slash followed by one or more characters
## that are not '/', '?' or '#' (stopping before whitespace that runs to the
## end of the string), or a lone slash.
onePathSegmentMS = ''.join(
    [
        r'(?P<_pathSeg>',
        r'(',
        r'/?',
        r'(',
        r'(?!',
        r'[ \t\r\n]+',
        r'$',
        r')',
        u'[^%s]' % (re.escape(r'/?#')),
        r')+',
        r'|',
        r'/',
        r')',
        r')',
    ],
)

onePathSegmentRE = re.compile(onePathSegmentMS)

#######################################################

## NOTE(review): only 'scheme' (read by uriFunc) and the underscore-prefixed
## group names are known to be original.  The other group names below were
## missing from the reviewed copy of this file and have been reconstructed
## from the structure of the pattern -- confirm them against upstream.
uriMS = r''.join(
    (
        r'(?P<leadingWhitespace>',  ## leading whitespace is OK and ignorable; see http://dev.w3.org/html5/spec-LC/urls.html
        r'[ \t\r\n]+',
        r')?',
        r'(',
        r'(?P<scheme>',
        r'https?',
        r')',
        r':\/{0,2}',  ## accounts for encountered error: only 0 or 1 slash instead of 2
        r')?',
        r'(?P<authority>',
        r'(?P<userinfoAt>',
        r'(',
        r'(?P<_userinfo>',
        r'[^%s]+' % (re.escape(r'@/[:?#')),
        r')',
        re.escape('@'),
        r')?',
        r')',
        r'(?P<host>',
        r'(?P<openBracket>',
        re.escape(r'['),
        r')?',
        r'(',
        r'(?P<ipv4>',
        r'(',
        r'[0-9]{1,3}%s' % (re.escape(r'.')),
        r'){3}',
        r'[0-9]{1,3}',
        r')',
        r'|',
        r'(?P<ipv6>',
        r'(',
        r'[0-9A-Fa-f]{0,4}%s' % (re.escape(':')),
        r'){1,7}',
        r'[0-9A-Fa-f]{0,4}',
        r')',
        r'|',
        r'(?P<hostname>',
        r'(',
        r'[^%s]+?' % (re.escape(r']:/?#')),  ## this may have dots
        r'\.',
        r')+',
        r'(?P<tld>',  ## top-level domain, e.g. "com", "gov" etc.
        r'(',
        r'(?!',
        r'[ \t\r\n]+',
        r'$',
        r')',
        r'[^%s]' % (re.escape(r']:/?#\.')),  ## tld: no dots allowed
        r')+',
        r')',
        r')',
        r')',
        r'(?P<closeBracket>',
        re.escape(r']'),
        r')?',
        r')',  ## end of host
        r'(?P<portSpec>',
        r'(',
        re.escape(r':'),
        r'(?P<port>',
        r'[0-9]{1,5}',
        r')',
        r')?',
        r')',
        r')?',  ## authority is optional; it could be a relative URL that starts with a path
        r'(?P<path>',
        r'(',
        onePathSegmentMS,
        r')*',
        r')',
        r'(?P<querySpec>',
        re.escape(r'?',),
        r'(',
        r'(?P<_query>',
        r'(?!',
        r'[ \t\r\n]+',
        r'$',
        r')',
        r'[^%s]*' % (re.escape(r'#')),
        r')',
        r')',
        r')?',
        r'(?P<fragmentSpec>',
        r'(',
        re.escape(r'#',),
        r'(?P<_fragment>',
        r'(?!',
        r'[ \t\r\n]+',
        r'$',
        r')',
        r'.*',
        r')',
        r')?',
        r')',
        r'(?P<trailingWhitespace>',  ## trailing whitespace is OK and ignorable; see http://dev.w3.org/html5/spec-LC/urls.html
        r'[ \t\r\n]+',
        r')?',
    ),
)

## The URI pattern anchored at both ends, case-insensitive, with '.' also
## matching newlines.
uriRE = re.compile(
    ''.join(
        (
            r'^',
            uriMS,
            r'$',
        ),
    ),
    re.IGNORECASE | re.DOTALL,
)

#######################################################

def uriFunc(MO):
    """Substitution callback for uriRE.

    Builds the group dictionary of match object *MO* and looks up its
    'scheme' entry.  Nothing is returned; hrefFunc discards the result of
    the substitution anyway.
    """
    fields = MO.groupdict()
    fields['scheme']  ## lookup only; the value itself is not used

#######################################################

def hrefFunc(MO):
    """Substitution callback for hrefRE.

    Runs the 'url' group of match object *MO* through uriRE/uriFunc and
    throws the substituted text away.
    """
    re.sub(uriRE, uriFunc, MO.group('url'))

#######################################################

## Document scanned by test() when no explicit source is passed.  The script
## entry point rebinds it to the contents of the sample HTML file; the empty
## default keeps test() callable when this module is merely imported.
text = u''


def test(source=None, iterations=10000):
    """Scan a document for hrefs *iterations* times.

    source     -- string to scan; defaults to the module-level ``text``.
    iterations -- number of full passes over the string (default 10000).
    """
    if source is None:
        source = text
    for _ in range(iterations):
        re.sub(hrefRE, hrefFunc, source)

#######################################################

def _readIssue(path='/etc/issue'):
    """Return the stripped contents of *path*, or '' if it cannot be read."""
    try:
        with open(path, 'r') as issueFO:
            return issueFO.read().strip()
    except (IOError, OSError):
        ## the banner is informational only; do not lose the profile over it
        return ''

#######################################################

if __name__ == '__main__':
    with codecs.open('p17-188.htm', 'r', 'utf-8') as textFO:
        text = textFO.read()

    profilePath = '/tmp/profile-%d' % (os.getpid())
    cProfile.run('test()', profilePath)
    p = pstats.Stats(profilePath)

    print(
        '\nprofile for Python version %s\n%s\n%s' % (
            '.'.join(map(str, sys.version_info)),
            ' '.join(os.uname()).strip(),
            _readIssue(),
        )
    )
    ## alternative ordering: p.sort_stats('cumulative').print_stats(10)
    p.sort_stats('time').print_stats(10)