Accept ASCII/surrogateescape strings as hostname arguments. diff --git a/Doc/library/socket.rst b/Doc/library/socket.rst --- a/Doc/library/socket.rst +++ b/Doc/library/socket.rst @@ -49,6 +49,28 @@ supported. The address format required b automatically selected based on the address family specified when the socket object was created. +When a hostname is returned by a system interface, it is decoded into +a string using the ``'ascii'`` codec and the ``'surrogateescape'`` +error handler; this leaves ASCII bytes as ASCII, including IDNA +ASCII-compatible encodings (see :mod:`encodings.idna`), but converts +any non-ASCII bytes to the Unicode lone surrogate codes +U+DC80...U+DCFF. + +Hostname arguments can be passed as strings or :class:`bytes` objects. +The latter are passed to the system unchanged, while strings are +encoded as follows: if a string contains only ASCII characters and/or +the Unicode lone surrogate codes U+DC80...U+DCFF, it is encoded using +the ``'ascii'`` codec and the ``'surrogateescape'`` error handler; +otherwise it is converted to IDNA ASCII-compatible form using the +``'idna'`` codec, and if this is not possible, :exc:`UnicodeError` is +raised. + +.. versionchanged:: XXX + Previously, hostnames were decoded as UTF-8 and encoded using IDNA + or UTF-8; ``surrogateescape`` was not used; some interfaces + formerly accepted :class:`bytearray` objects, or did not accept + :class:`bytes` objects. + For IPv4 addresses, two special forms are accepted instead of a host address: the empty string represents :const:`INADDR_ANY`, and the string ``''`` represents :const:`INADDR_BROADCAST`. The behavior is not diff --git a/Lib/test/test_socket.py b/Lib/test/test_socket.py --- a/Lib/test/test_socket.py +++ b/Lib/test/test_socket.py @@ -322,6 +322,55 @@ class GeneralModuleTests(unittest.TestCa except socket.error: pass + def tryHostnameArgs(self, function, notfounderror): + # Call the given one-argument function with various valid and + # invalid representations of nonexistent hostnames. Check + # that it raises notfounderror for valid representations, and + # UnicodeError for invalid ones. + + # An RFC 1123-compliant host name (".invalid" TLD is reserved + # under RFC 2606). + self.assertRaises(notfounderror, function, "host.domain.invalid") + # Previous name as a bytes object. + self.assertRaises(notfounderror, function, b"host.domain.invalid") + # A domain name with a non-ASCII octet, as bytes. + self.assertRaises(notfounderror, function, b"\xff.domain.invalid") + # Previous domain name as ASCII/surrogateescape string representation. + self.assertRaises(notfounderror, function, "\udcff.domain.invalid") + # A legal IDN. + self.assertRaises(notfounderror, function, "€.domain.invalid") + # A combination of the previous two, which may make sense in + # theory, but is not accepted (the Euro sign means it must be + # interpreted as an IDN, but it is is not a legal IDN, because + # it contains a surrogate character). + self.assertRaises(UnicodeError, function, "\udcff.€.domain.invalid") + + def testGethostbynameHostnames(self): + self.tryHostnameArgs(socket.gethostbyname, + (socket.herror, socket.gaierror)) + + def testGethostbyname_exHostnames(self): + self.tryHostnameArgs(socket.gethostbyname_ex, + (socket.herror, socket.gaierror)) + + def testGethostbyaddrHostnames(self): + self.tryHostnameArgs(socket.gethostbyaddr, + (socket.herror, socket.gaierror)) + + def testGetaddrinfoHostnames(self): + self.tryHostnameArgs(lambda host: socket.getaddrinfo(host, None), + socket.gaierror) + + def testGetnameinfoHostnames(self): + self.tryHostnameArgs(lambda host: socket.getnameinfo((host, 0), 0), + socket.gaierror) + + def testSocketObjectHostnames(self): + def f(host): + s = socket.socket(socket.AF_INET, socket.SOCK_STREAM) + s.connect((host, 80)) + self.tryHostnameArgs(f, socket.error) + def testNtoH(self): # This just checks that htons etc. are their own inverse, # when looking at the lower 16 or 32 bits. diff --git a/Modules/socketmodule.c b/Modules/socketmodule.c --- a/Modules/socketmodule.c +++ b/Modules/socketmodule.c @@ -739,6 +739,57 @@ PyThread_type_lock netdb_lock; #endif +/* "O&" converter for hostname arguments, which attempts to encode + strings as ASCII with "surrogateescape" error handler before + falling back to strict IDNA encoding. Also accepts bytes objects, + but not bytearray. On success, sets PyObject *addr to a new + reference to a bytes object, and may clear the error indicator. */ + +static int +hostname_converter(PyObject *arg, void *addr) +{ + PyObject **result = addr; + + if (arg == NULL) { + Py_DECREF(result); + return 1; + } + + if (PyUnicode_Check(arg)) { + PyObject *tmp; + + tmp = PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(arg), + PyUnicode_GET_SIZE(arg), + "surrogateescape"); + if (tmp == NULL) { + if (!PyErr_ExceptionMatches(PyExc_UnicodeEncodeError)) + return 0; + PyErr_Clear(); + tmp = PyUnicode_AsEncodedString(arg, "idna", "strict"); + if (tmp == NULL) + return 0; + } + arg = tmp; + } + else + Py_INCREF(arg); + if (!PyBytes_Check(arg)) { + PyErr_SetString(PyExc_TypeError, + "hostnames must be str or bytes objects"); + goto err; + } + if (PyBytes_GET_SIZE(arg) != strlen(PyBytes_AS_STRING(arg))) { + PyErr_SetString(PyExc_TypeError, "embedded NUL character in hostname"); + goto err; + } + *result = arg; + return Py_CLEANUP_SUPPORTED; +err: + Py_DECREF(arg); + return 0; +} + + /* Return the string representation for the given hostname. */ static PyObject * @@ -1219,6 +1270,7 @@ getsockaddrarg(PySocketSockObject *s, Py case AF_INET: { struct sockaddr_in* addr; + PyObject *hostobj; char *host; int port, result; if (!PyTuple_Check(args)) { @@ -1229,13 +1281,14 @@ getsockaddrarg(PySocketSockObject *s, Py Py_TYPE(args)->tp_name); return 0; } - if (!PyArg_ParseTuple(args, "eti:getsockaddrarg", - "idna", &host, &port)) + if (!PyArg_ParseTuple(args, "O&i:getsockaddrarg", + hostname_converter, &hostobj, &port)) return 0; + host = PyBytes_AS_STRING(hostobj); addr=(struct sockaddr_in*)addr_ret; result = setipaddr(host, (struct sockaddr *)addr, sizeof(*addr), AF_INET); - PyMem_Free(host); + Py_DECREF(hostobj); if (result < 0) return 0; if (port < 0 || port > 0xffff) { @@ -1254,6 +1307,7 @@ getsockaddrarg(PySocketSockObject *s, Py case AF_INET6: { struct sockaddr_in6* addr; + PyObject *hostobj; char *host; int port, flowinfo, scope_id, result; flowinfo = scope_id = 0; @@ -1265,15 +1319,16 @@ getsockaddrarg(PySocketSockObject *s, Py Py_TYPE(args)->tp_name); return 0; } - if (!PyArg_ParseTuple(args, "eti|ii", - "idna", &host, &port, &flowinfo, + if (!PyArg_ParseTuple(args, "O&i|ii", + hostname_converter, &hostobj, &port, &flowinfo, &scope_id)) { return 0; } + host = PyBytes_AS_STRING(hostobj); addr = (struct sockaddr_in6*)addr_ret; result = setipaddr(host, (struct sockaddr *)addr, sizeof(*addr), AF_INET6); - PyMem_Free(host); + Py_DECREF(hostobj); if (result < 0) return 0; if (port < 0 || port > 0xffff) { @@ -3009,12 +3064,18 @@ socket_gethostbyname(PyObject *self, PyO { char *name; sock_addr_t addrbuf; - - if (!PyArg_ParseTuple(args, "s:gethostbyname", &name)) + PyObject *nameobj, *ret = NULL; + + if (!PyArg_ParseTuple(args, "O&:gethostbyname", + hostname_converter, &nameobj)) return NULL; + name = PyBytes_AS_STRING(nameobj); if (setipaddr(name, SAS2SA(&addrbuf), sizeof(addrbuf), AF_INET) < 0) - return NULL; - return makeipaddr(SAS2SA(&addrbuf), sizeof(struct sockaddr_in)); + goto finally; + ret = makeipaddr(SAS2SA(&addrbuf), sizeof(struct sockaddr_in)); +finally: + Py_DECREF(nameobj); + return ret; } PyDoc_STRVAR(gethostbyname_doc, @@ -3158,6 +3219,7 @@ gethost_common(struct hostent *h, struct static PyObject * socket_gethostbyname_ex(PyObject *self, PyObject *args) { + PyObject *nameobj; char *name; struct hostent *h; #ifdef ENABLE_IPV6 @@ -3166,7 +3228,7 @@ socket_gethostbyname_ex(PyObject *self, struct sockaddr_in addr; #endif struct sockaddr *sa; - PyObject *ret; + PyObject *ret = NULL; #ifdef HAVE_GETHOSTBYNAME_R struct hostent hp_allocated; #ifdef HAVE_GETHOSTBYNAME_R_3_ARG @@ -3181,10 +3243,12 @@ socket_gethostbyname_ex(PyObject *self, #endif #endif /* HAVE_GETHOSTBYNAME_R */ - if (!PyArg_ParseTuple(args, "s:gethostbyname_ex", &name)) + if (!PyArg_ParseTuple(args, "O&:gethostbyname_ex", + hostname_converter, &nameobj)) return NULL; + name = PyBytes_AS_STRING(nameobj); if (setipaddr(name, (struct sockaddr *)&addr, sizeof(addr), AF_INET) < 0) - return NULL; + goto finally; Py_BEGIN_ALLOW_THREADS #ifdef HAVE_GETHOSTBYNAME_R #if defined(HAVE_GETHOSTBYNAME_R_6_ARG) @@ -3214,6 +3278,8 @@ socket_gethostbyname_ex(PyObject *self, #ifdef USE_GETHOSTBYNAME_LOCK PyThread_release_lock(netdb_lock); #endif +finally: + Py_DECREF(nameobj); return ret; } @@ -3236,9 +3302,10 @@ socket_gethostbyaddr(PyObject *self, PyO struct sockaddr_in addr; #endif struct sockaddr *sa = (struct sockaddr *)&addr; + PyObject *addrobj; char *ip_num; struct hostent *h; - PyObject *ret; + PyObject *ret = NULL; #ifdef HAVE_GETHOSTBYNAME_R struct hostent hp_allocated; #ifdef HAVE_GETHOSTBYNAME_R_3_ARG @@ -3260,11 +3327,13 @@ socket_gethostbyaddr(PyObject *self, PyO int al; int af; - if (!PyArg_ParseTuple(args, "s:gethostbyaddr", &ip_num)) + if (!PyArg_ParseTuple(args, "O&:gethostbyaddr", + hostname_converter, &addrobj)) return NULL; + ip_num = PyBytes_AS_STRING(addrobj); af = AF_UNSPEC; if (setipaddr(ip_num, sa, sizeof(addr), af) < 0) - return NULL; + goto finally; af = sa->sa_family; ap = NULL; al = 0; @@ -3281,7 +3350,7 @@ socket_gethostbyaddr(PyObject *self, PyO #endif default: PyErr_SetString(socket_error, "unsupported address family"); - return NULL; + goto finally; } Py_BEGIN_ALLOW_THREADS #ifdef HAVE_GETHOSTBYNAME_R @@ -3308,6 +3377,8 @@ socket_gethostbyaddr(PyObject *self, PyO #ifdef USE_GETHOSTBYNAME_LOCK PyThread_release_lock(netdb_lock); #endif +finally: + Py_DECREF(addrobj); return ret; } @@ -3859,17 +3930,9 @@ socket_getaddrinfo(PyObject *self, PyObj } if (hobj == Py_None) { hptr = NULL; - } else if (PyUnicode_Check(hobj)) { - idna = PyObject_CallMethod(hobj, "encode", "s", "idna"); - if (!idna) - return NULL; - assert(PyBytes_Check(idna)); + } else if (hostname_converter(hobj, &idna)) { hptr = PyBytes_AS_STRING(idna); - } else if (PyBytes_Check(hobj)) { - hptr = PyBytes_AsString(hobj); } else { - PyErr_SetString(PyExc_TypeError, - "getaddrinfo() argument 1 must be string or None"); return NULL; } if (PyLong_CheckExact(pobj)) { @@ -3949,6 +4012,7 @@ socket_getnameinfo(PyObject *self, PyObj { PyObject *sa = (PyObject *)NULL; int flags; + PyObject *hostobj; char *hostp; int port, flowinfo, scope_id; char hbuf[NI_MAXHOST], pbuf[NI_MAXSERV]; @@ -3964,9 +4028,10 @@ socket_getnameinfo(PyObject *self, PyObj "getnameinfo() argument 1 must be a tuple"); return NULL; } - if (!PyArg_ParseTuple(sa, "si|ii", - &hostp, &port, &flowinfo, &scope_id)) + if (!PyArg_ParseTuple(sa, "O&i|ii", hostname_converter, + &hostobj, &port, &flowinfo, &scope_id)) return NULL; + hostp = PyBytes_AS_STRING(hostobj); PyOS_snprintf(pbuf, sizeof(pbuf), "%d", port); memset(&hints, 0, sizeof(hints)); hints.ai_family = AF_UNSPEC; @@ -4017,6 +4082,7 @@ socket_getnameinfo(PyObject *self, PyObj fail: if (res) freeaddrinfo(res); + Py_DECREF(hostobj); return ret; }