"""Scrape torrent search results for "family guy" from btjunkie.org.

The search page is downloaded once and cached in ``page.html`` in the
current directory; later runs reuse the cached copy instead of hitting the
network. Each search hit found by ``torrent_re`` is printed as a dict of
its named groups.
"""
import os
import re
import urllib

try:
    # Python 3 moved urlopen into urllib.request.
    from urllib.request import urlopen
except ImportError:
    # Python 2 keeps it directly on the urllib module.
    urlopen = urllib.urlopen

CACHE_PATH = 'page.html'
SEARCH_URL = 'http://btjunkie.org/search?q=family+guy'

# I know it's not very readable, but the SGML parser feels in pain.
#
# NOTE(review): this pattern reached us damaged -- the "<name>" part of every
# "(?P<name>...)" group and the text holding the first group's opening
# parenthesis were missing, so it could not compile.  Only what is needed to
# compile has been restored:
#   * the group names below are best guesses (only "size" is implied by the
#     surviving "\d+MB"); confirm them against the page markup;
#   * whatever HTML delimiters surrounded the "name" group are lost, so as
#     written it always captures an empty string -- restore the real tags
#     around it to make it useful.
# Raw strings keep "\d" from being treated as a string escape sequence.
torrent_re = re.compile(
    r'(?s)(?P<url>http://.*?do=download[^"]+).*?'
    r'(?P<name>.*?).*?'
    r'color="#808080">(?P<size>\d+MB).*?'
    r'color="#32cd32">(?P<seeders>\d+).*?'
    r'color="#00ced1">(?P<leechers>\d+)'
)


def _load_page():
    """Return the search page as text, from the cache file or the network."""
    if os.path.exists(CACHE_PATH):
        with open(CACHE_PATH, 'rb') as cached:
            return cached.read().decode('utf-8')

    response = urlopen(SEARCH_URL)
    try:
        # 'replace' keeps a stray invalid byte from aborting the whole run.
        page = response.read().decode('utf8', 'replace')
    finally:
        response.close()

    # Added by AMK to save the output
    with open(CACHE_PATH, 'wb') as output:
        output.write(page.encode('utf-8'))
    return page


def main():
    """Load the page and print the named groups of every search hit."""
    print('Getting data')
    dat = _load_page()

    print('searching')
    for match in torrent_re.finditer(dat):
        print(match.groupdict())


if __name__ == '__main__':
    main()