# HG changeset patch # Parent 5c2d7aceb46865110857879702bed186f8aa1ad7 Accept ASCII/surrogateescape strings as hostname arguments. diff --git a/Doc/library/socket.rst b/Doc/library/socket.rst --- a/Doc/library/socket.rst +++ b/Doc/library/socket.rst @@ -132,6 +132,19 @@ ASCII-compatible encodings (see :mod:`en converts any non-ASCII bytes to the Unicode lone surrogate codes U+DC80...U+DCFF. +When a string is passed as a hostname argument, it will be +encoded as ASCII if it contains only ASCII characters; otherwise, +the :mod:`~encodings.idna` codec will be used, and if that fails +(for instance, because the string contains lone surrogate codes), +then the string will be encoded using the ``'ascii'`` codec and +the ``'surrogateescape'`` error handler. + + .. versionchanged:: XXX + Previously, hostnames were decoded as UTF-8 (except by + :func:`gethostname`, which on non-Windows platforms used the + file system encoding and the ``'surrogateescape'`` error + handler), and encoded as strict ASCII or IDNA. + All errors raise exceptions. The normal exceptions for invalid argument types and out-of-memory conditions can be raised; starting from Python 3.3, errors related to socket or address semantics raise :exc:`OSError` or one of its diff --git a/Lib/test/test_socket.py b/Lib/test/test_socket.py --- a/Lib/test/test_socket.py +++ b/Lib/test/test_socket.py @@ -1434,6 +1434,66 @@ class GeneralModuleTests(unittest.TestCa self.assertEqual(s.family, 42424) self.assertEqual(s.type, 13331) + +# This should produce the same results with or without network access, +# but can hang for some time if the upstream DNS servers are +# unreachable. 
+@unittest.skipUnless(support.is_resource_enabled('network'), + 'network is not enabled') +class TestHostnameRepresentations(unittest.TestCase): + + def tryHostnameArgs(self, function, notfounderror): + # Call the given one-argument function with various valid and + # invalid representations of nonexistent hostnames - it should + # raise notfounderror for valid representations. + + # An RFC 1123-compliant host name (".invalid" TLD is reserved + # under RFC 2606). + self.assertRaises(notfounderror, function, "host.domain.invalid.") + # Previous name as bytes and bytearray. + self.assertRaises(notfounderror, function, b"host.domain.invalid.") + self.assertRaises(notfounderror, function, + bytearray(b"host.domain.invalid.")) + # A domain name with a non-ASCII octet, as bytes and bytearray. + self.assertRaises(notfounderror, function, b"\xff.domain.invalid.") + self.assertRaises(notfounderror, function, + bytearray(b"\xff.domain.invalid.")) + # Previous domain name as ASCII/surrogateescape string representation. + self.assertRaises(notfounderror, function, "\udcff.domain.invalid.") + # A legal IDN. + self.assertRaises(notfounderror, function, "€.domain.invalid.") + # A combination of the previous two, which may make sense in + # theory, but is not accepted (the Euro sign means it must be + # interpreted as an IDN, but it is not a legal IDN, because + # it contains a surrogate character). 
+ self.assertRaises(TypeError, function, "\udcff.€.domain.invalid.") + + def testGethostbynameHostnames(self): + self.tryHostnameArgs(socket.gethostbyname, + (socket.herror, socket.gaierror)) + + def testGethostbyname_exHostnames(self): + self.tryHostnameArgs(socket.gethostbyname_ex, + (socket.herror, socket.gaierror)) + + def testGethostbyaddrHostnames(self): + self.tryHostnameArgs(socket.gethostbyaddr, + (socket.herror, socket.gaierror)) + + def testGetaddrinfoHostnames(self): + self.tryHostnameArgs(lambda host: socket.getaddrinfo(host, None), + socket.gaierror) + + def testSocketObjectHostnames(self): + def f(host): + s = socket.socket(socket.AF_INET, socket.SOCK_STREAM) + try: + s.connect((host, 80)) + finally: + s.close() + self.tryHostnameArgs(f, socket.error) + + @unittest.skipUnless(HAVE_SOCKET_CAN, 'SocketCan required for this test.') class BasicCANTest(unittest.TestCase): @@ -5305,6 +5365,7 @@ def test_main(): tests.append(TestUnixDomain) tests.append(TestLinuxAbstractNamespace) tests.extend([TIPCTest, TIPCThreadableTest]) + tests.append(TestHostnameRepresentations) tests.extend([BasicCANTest, CANTest]) tests.extend([BasicRDSTest, RDSTest]) tests.extend([ diff --git a/Modules/socketmodule.c b/Modules/socketmodule.c --- a/Modules/socketmodule.c +++ b/Modules/socketmodule.c @@ -1413,6 +1413,11 @@ idna_converter(PyObject *obj, struct may return 0; } obj3 = PyUnicode_AsEncodedString(obj2, "idna", NULL); + /* IDNA codec doesn't use UnicodeEncodeError. 
*/ + if (!obj3 && PyErr_ExceptionMatches(PyExc_UnicodeError)) { + PyErr_Clear(); + obj3 = PyUnicode_AsEncodedString(obj2, "ascii", "surrogateescape"); + } Py_DECREF(obj2); if (!obj3) { PyErr_SetString(PyExc_TypeError, "encoding of hostname failed"); @@ -4501,17 +4506,17 @@ extern int sethostname(const char *, siz static PyObject * socket_gethostbyname(PyObject *self, PyObject *args) { - char *name; + struct maybe_idna host = {NULL, NULL}; sock_addr_t addrbuf; PyObject *ret = NULL; - if (!PyArg_ParseTuple(args, "et:gethostbyname", "idna", &name)) + if (!PyArg_ParseTuple(args, "O&:gethostbyname", idna_converter, &host)) return NULL; - if (setipaddr(name, SAS2SA(&addrbuf), sizeof(addrbuf), AF_INET) < 0) + if (setipaddr(host.buf, SAS2SA(&addrbuf), sizeof(addrbuf), AF_INET) < 0) goto finally; ret = makeipaddr(SAS2SA(&addrbuf), sizeof(struct sockaddr_in)); finally: - PyMem_Free(name); + idna_cleanup(&host); return ret; } @@ -4655,7 +4660,7 @@ gethost_common(struct hostent *h, struct static PyObject * socket_gethostbyname_ex(PyObject *self, PyObject *args) { - char *name; + struct maybe_idna host = {NULL, NULL}; struct hostent *h; sock_addr_t addr; struct sockaddr *sa; @@ -4674,27 +4679,27 @@ socket_gethostbyname_ex(PyObject *self, #endif #endif /* HAVE_GETHOSTBYNAME_R */ - if (!PyArg_ParseTuple(args, "et:gethostbyname_ex", "idna", &name)) + if (!PyArg_ParseTuple(args, "O&:gethostbyname_ex", idna_converter, &host)) return NULL; - if (setipaddr(name, SAS2SA(&addr), sizeof(addr), AF_INET) < 0) + if (setipaddr(host.buf, SAS2SA(&addr), sizeof(addr), AF_INET) < 0) goto finally; Py_BEGIN_ALLOW_THREADS #ifdef HAVE_GETHOSTBYNAME_R #if defined(HAVE_GETHOSTBYNAME_R_6_ARG) - gethostbyname_r(name, &hp_allocated, buf, buf_len, + gethostbyname_r(host.buf, &hp_allocated, buf, buf_len, &h, &errnop); #elif defined(HAVE_GETHOSTBYNAME_R_5_ARG) - h = gethostbyname_r(name, &hp_allocated, buf, buf_len, &errnop); + h = gethostbyname_r(host.buf, &hp_allocated, buf, buf_len, &errnop); #else /* 
HAVE_GETHOSTBYNAME_R_3_ARG */ memset((void *) &data, '\0', sizeof(data)); - result = gethostbyname_r(name, &hp_allocated, &data); + result = gethostbyname_r(host.buf, &hp_allocated, &data); h = (result != 0) ? NULL : &hp_allocated; #endif #else /* not HAVE_GETHOSTBYNAME_R */ #ifdef USE_GETHOSTBYNAME_LOCK PyThread_acquire_lock(netdb_lock, 1); #endif - h = gethostbyname(name); + h = gethostbyname(host.buf); #endif /* HAVE_GETHOSTBYNAME_R */ Py_END_ALLOW_THREADS /* Some C libraries would require addr.__ss_family instead of @@ -4708,7 +4713,7 @@ socket_gethostbyname_ex(PyObject *self, PyThread_release_lock(netdb_lock); #endif finally: - PyMem_Free(name); + idna_cleanup(&host); return ret; } @@ -4727,7 +4732,7 @@ socket_gethostbyaddr(PyObject *self, PyO { sock_addr_t addr; struct sockaddr *sa = SAS2SA(&addr); - char *ip_num; + struct maybe_idna ip_num = {NULL, NULL}; struct hostent *h; PyObject *ret = NULL; #ifdef HAVE_GETHOSTBYNAME_R @@ -4751,10 +4756,10 @@ socket_gethostbyaddr(PyObject *self, PyO int al; int af; - if (!PyArg_ParseTuple(args, "et:gethostbyaddr", "idna", &ip_num)) + if (!PyArg_ParseTuple(args, "O&:gethostbyaddr", idna_converter, &ip_num)) return NULL; af = AF_UNSPEC; - if (setipaddr(ip_num, sa, sizeof(addr), af) < 0) + if (setipaddr(ip_num.buf, sa, sizeof(addr), af) < 0) goto finally; af = sa->sa_family; ap = NULL; @@ -4800,7 +4805,7 @@ socket_gethostbyaddr(PyObject *self, PyO PyThread_release_lock(netdb_lock); #endif finally: - PyMem_Free(ip_num); + idna_cleanup(&ip_num); return ret; } @@ -5502,11 +5507,11 @@ socket_getaddrinfo(PyObject *self, PyObj PyObject *hobj = NULL; PyObject *pobj = (PyObject *)NULL; char pbuf[30]; - char *hptr, *pptr; + char *pptr; int family, socktype, protocol, flags; int error; PyObject *all = (PyObject *)NULL; - PyObject *idna = NULL; + struct maybe_idna host = {NULL, NULL}; socktype = protocol = flags = 0; family = AF_UNSPEC; @@ -5516,20 +5521,8 @@ socket_getaddrinfo(PyObject *self, PyObj return NULL; } if (hobj == Py_None) 
{ - hptr = NULL; - } else if (PyUnicode_Check(hobj)) { - _Py_IDENTIFIER(encode); - - idna = _PyObject_CallMethodId(hobj, &PyId_encode, "s", "idna"); - if (!idna) - return NULL; - assert(PyBytes_Check(idna)); - hptr = PyBytes_AS_STRING(idna); - } else if (PyBytes_Check(hobj)) { - hptr = PyBytes_AsString(hobj); - } else { - PyErr_SetString(PyExc_TypeError, - "getaddrinfo() argument 1 must be string or None"); + host.buf = NULL; + } else if (!idna_converter(hobj, &host)) { return NULL; } if (PyLong_CheckExact(pobj)) { @@ -5566,7 +5559,7 @@ socket_getaddrinfo(PyObject *self, PyObj hints.ai_flags = flags; Py_BEGIN_ALLOW_THREADS ACQUIRE_GETADDRINFO_LOCK - error = getaddrinfo(hptr, pptr, &hints, &res0); + error = getaddrinfo(host.buf, pptr, &hints, &res0); Py_END_ALLOW_THREADS RELEASE_GETADDRINFO_LOCK /* see comment in setipaddr() */ if (error) { @@ -5595,13 +5588,13 @@ socket_getaddrinfo(PyObject *self, PyObj goto err; Py_XDECREF(single); } - Py_XDECREF(idna); + idna_cleanup(&host); if (res0) freeaddrinfo(res0); return all; err: Py_XDECREF(all); - Py_XDECREF(idna); + idna_cleanup(&host); if (res0) freeaddrinfo(res0); return (PyObject *)NULL;