Message 49932 - Python tracker

➜

This issue tracker has been migrated to GitHub, and is currently read-only.
For more information, see the GitHub FAQs in the Python's Developer Guide.

Author	dalke
Recipients
Date	2006-11-06.10:41:48
SpamBayes Score
Marked as misclassified
Message-id
In-reply-to

Content
Logged In: YES user_id=190903 Can't figure out how to add a file to this @#$%*@#%$ bug reporting system. Here's a checker to compare urljoin from urlparse and uriparse import urllib2 import urlparse import uriparse import BeautifulSoup for url in ( "http://python.org/", "http://www.perl.org/", ## "http://aspn.activestate.com/ASPN/Cookbook/Python", # they have \n in urls! "http://slashdot.org/", "http://cnn.com/", "http://bbc.co.uk/", "http://www.foxnews.com/", "http://reddit.com/", "http://yahoo.com/", "http://planetpython.org/", "http://www.slate.com/", "http://anarchaia.org/index.html", "http://www.ensembl.org/index.html", ): print "Processing", url f = urllib2.urlopen(url) soup = BeautifulSoup.BeautifulSoup(f) rel_url_list = [] for a in soup.findAll("a", href=True): rel_url_list.append(a["href"]) for img in soup.findAll("img", src=True): rel_url_list.append(img["src"]) for rel_url in rel_url_list: rel_url = rel_url.strip() url_joined = urlparse.urljoin(url, rel_url) uri_joined = uriparse.urljoin(url, rel_url) if url_joined != uri_joined: # urijoin can add an extra '/' ## if url_joined == uri_joined+"/": ## continue ## if url_joined.replace("//", "/") == uri_joined.replace("//", "/"): ## continue ## # 'http://cnn.com/' u'/cnnsi/scorecard/?cnn=yes' ## # url_joined == u'http://cnn.com/cnnsi/scorecard/?cnn=yes' ## # uri_joined == u'http://cnn.com/cnnsi/scorecard?cnn=yes' ## if url_joined.replace("/?", "?") == uri_joined: ## continue print repr(url), repr(rel_url) print " ", repr(url_joined), "!=", repr(uri_joined)

Logged In: YES 
user_id=190903

Can't figure out how to add a file to this @#$%*@#%$ bug
reporting system.

Here's a checker that compares the results of urljoin from urlparse and uriparse:

import urllib2
import urlparse
import uriparse
import BeautifulSoup

for url in (
    "http://python.org/",
    "http://www.perl.org/",
##    "http://aspn.activestate.com/ASPN/Cookbook/Python", #
they have \n in urls!
    "http://slashdot.org/",
    "http://cnn.com/",
    "http://bbc.co.uk/",
    "http://www.foxnews.com/",
    "http://reddit.com/",
    "http://yahoo.com/",
    "http://planetpython.org/",
    "http://www.slate.com/",
    "http://anarchaia.org/index.html",
    "http://www.ensembl.org/index.html",
    ):
    print "Processing", url
    f = urllib2.urlopen(url)
    soup = BeautifulSoup.BeautifulSoup(f)

    rel_url_list = []
    for a in soup.findAll("a", href=True):
        rel_url_list.append(a["href"])
    for img in soup.findAll("img", src=True):
        rel_url_list.append(img["src"])

    for rel_url in rel_url_list:
        rel_url = rel_url.strip()
        url_joined = urlparse.urljoin(url, rel_url)
        uri_joined = uriparse.urljoin(url, rel_url)
        if url_joined != uri_joined:
            # urijoin can add an extra '/'
##            if url_joined == uri_joined+"/":
##                continue
##            if url_joined.replace("//", "/") ==
uri_joined.replace("//", "/"):
##                continue
##            # 'http://cnn.com/' u'/cnnsi/scorecard/?cnn=yes'
##            # url_joined ==
u'http://cnn.com/cnnsi/scorecard/?cnn=yes'
##            # uri_joined ==
u'http://cnn.com/cnnsi/scorecard?cnn=yes'
##            if url_joined.replace("/?", "?") == uri_joined:
##                continue
            
            print repr(url), repr(rel_url)
            print "  ", repr(url_joined), "!=", repr(uri_joined)

History
Date	User	Action	Args
2007-08-23 15:47:34	admin	link	issue1462525 messages
2007-08-23 15:47:34	admin	create