Message49932
Logged In: YES
user_id=190903
Can't figure out how to add a file to this @#$%*@#%$ bug
reporting system.
Here's a checker script that compares the results of urljoin from urlparse with those of urljoin from uriparse:
import urllib2
import urlparse
import uriparse
import BeautifulSoup
# Compare urlparse.urljoin against uriparse.urljoin on real-world links.
#
# For every page listed below, the script downloads the page, collects the
# href of each <a> tag and the src of each <img> tag, joins each of those
# values against the page URL with both implementations, and prints every
# case where the two results differ.

# Pages whose links are used as relative-URL samples.
PAGE_URLS = (
    "http://python.org/",
    "http://www.perl.org/",
    # Disabled: they have \n in urls!
    # "http://aspn.activestate.com/ASPN/Cookbook/Python",
    "http://slashdot.org/",
    "http://cnn.com/",
    "http://bbc.co.uk/",
    "http://www.foxnews.com/",
    "http://reddit.com/",
    "http://yahoo.com/",
    "http://planetpython.org/",
    "http://www.slate.com/",
    "http://anarchaia.org/index.html",
    "http://www.ensembl.org/index.html",
)

for url in PAGE_URLS:
    # Single-argument print(...) produces the same text under the Python 2
    # print statement and also parses on Python 3.
    print("Processing %s" % url)

    f = urllib2.urlopen(url)
    try:
        # NOTE(review): assumes the BeautifulSoup constructor consumes the
        # whole response before returning, so closing right after is safe
        # -- confirm against the installed BeautifulSoup version.
        soup = BeautifulSoup.BeautifulSoup(f)
    finally:
        # Release the HTTP connection even if parsing raises.
        f.close()

    # Anchors first, then images -- same order as the report has always used.
    rel_url_list = [a["href"] for a in soup.findAll("a", href=True)]
    rel_url_list += [img["src"] for img in soup.findAll("img", src=True)]

    for rel_url in rel_url_list:
        rel_url = rel_url.strip()
        url_joined = urlparse.urljoin(url, rel_url)
        uri_joined = uriparse.urljoin(url, rel_url)
        if url_joined == uri_joined:
            continue

        # Optional filters for known differences. They are left disabled so
        # that every mismatch is reported.
        #
        # urijoin can add an extra '/'
        # if url_joined == uri_joined+"/":
        #     continue
        # if url_joined.replace("//", "/") == uri_joined.replace("//", "/"):
        #     continue
        #
        # 'http://cnn.com/' u'/cnnsi/scorecard/?cnn=yes'
        #   url_joined == u'http://cnn.com/cnnsi/scorecard/?cnn=yes'
        #   uri_joined == u'http://cnn.com/cnnsi/scorecard?cnn=yes'
        # if url_joined.replace("/?", "?") == uri_joined:
        #     continue

        print("%r %r" % (url, rel_url))
        print("  %r != %r" % (url_joined, uri_joined))
|
|
Date |
User |
Action |
Args |
2007-08-23 15:47:34 | admin | link | issue1462525 messages |
2007-08-23 15:47:34 | admin | create | |
|