"""Report index entries of the Python docs that link to one URL several times.

For every letter A-Z the script downloads ``genindex-<letter>.html``, splits
the page into index entries, and prints each href that occurs at least twice
inside a single entry, followed by the link texts that share it.

NOTE(review): the copy of this script that was reviewed had lost its
indentation and the HTML tags inside its two regex literals.  The tag names in
ENTRY_PATTERN and ANCHOR_PATTERN below are a reconstruction -- confirm them
against the original script / the markup of the index pages before relying on
the output.
"""
import collections
import re
import urllib
from contextlib import closing

try:
    from urllib.request import urlopen  # Python 3
except ImportError:
    urlopen = urllib.urlopen  # Python 2

INDEX_URL_TEMPLATE = "http://docs.python.org/genindex-%c.html"

# One index entry; group(1) is the entry's inner HTML.
# NOTE(review): "<dt>" is assumed -- the tag was stripped from the reviewed copy.
ENTRY_PATTERN = re.compile('<dt>(.+?)</dt>', re.I)

# One link; group(1) is the attribute text, group(2) the link text.
# NOTE(review): "<a ...>...</a>" is assumed -- the code reads group(2) and looks
# for an href inside group(1), so a two-group anchor pattern is the likely form.
ANCHOR_PATTERN = re.compile('<a (.+?)>(.+?)</a>')

HREF_PATTERN = re.compile('href=["\'](.+?)["\']')


def find_repeated_hrefs(content, entry_pattern=ENTRY_PATTERN,
                        anchor_pattern=ANCHOR_PATTERN):
    """Yield ``(href, texts)`` for hrefs used two or more times in one entry.

    Args:
        content: HTML text of an index page.
        entry_pattern: compiled regex whose group(1) is one entry's HTML.
        anchor_pattern: compiled regex whose group(1) holds a link's
            attributes and whose group(2) holds its text.

    Yields:
        Tuples of the href string and the list of link texts, in order of
        appearance, that point at it within the same entry.
    """
    for entry in entry_pattern.finditer(content):
        texts_by_href = collections.defaultdict(list)
        for anchor in anchor_pattern.finditer(entry.group(1)):
            href = HREF_PATTERN.search(anchor.group(1))
            if href is None:
                # An anchor without a quoted href has no target to compare;
                # skipping it avoids calling .group() on None.
                continue
            texts_by_href[href.group(1)].append(anchor.group(2))
        for target, texts in texts_by_href.items():
            if len(texts) >= 2:
                yield target, texts


def fetch_page(url):
    """Download *url* and return its body as text, closing the connection."""
    with closing(urlopen(url)) as response:
        body = response.read()
    if not isinstance(body, str):
        # Python 3 returns bytes; the regexes above are text patterns.
        body = body.decode("utf-8", "replace")
    return body


def report_index_page(url):
    """Print the banner for *url* and every repeated href found on that page."""
    # Single-argument print(...) behaves the same under Python 2 and 3, so no
    # __future__ import is needed.
    print("/////////////////////////")
    print("// %s" % url)
    for target, texts in find_repeated_hrefs(fetch_page(url)):
        print(target)
        for text in texts:
            print("\t" + text)
        # NOTE(review): the blank line is assumed to follow each reported
        # group; the original indentation of the bare print was lost.
        print("")


def main():
    """Scan the index pages for the letters A through Z."""
    for code in range(ord('A'), ord('Z') + 1):
        report_index_page(INDEX_URL_TEMPLATE % chr(code))


if __name__ == "__main__":
    main()