Accept ASCII/surrogateescape strings as hostname arguments. diff --git a/Doc/library/socket.rst b/Doc/library/socket.rst --- a/Doc/library/socket.rst +++ b/Doc/library/socket.rst @@ -49,6 +49,27 @@ supported. The address format required b automatically selected based on the address family specified when the socket object was created. +When a hostname is returned by a system interface, it is decoded into +a string using the ``'ascii'`` codec and the ``'surrogateescape'`` +error handler; this leaves IDNA ASCII-compatible encodings in ASCII +form, but converts any non-ASCII bytes in the hostname to the Unicode +lone surrogate codes U+DC80...U+DCFF. + +Hostname arguments can be passed as strings or :class:`bytes` objects. +The latter are passed to the system unchanged, while strings are +encoded as follows: if a string contains only ASCII characters and/or +the Unicode lone surrogate codes U+DC80...U+DCFF, it is encoded using +the ``'ascii'`` codec and the ``'surrogateescape'`` error handler; +otherwise it is converted to IDNA ASCII-compatible form using the +``'idna'`` codec (see :mod:`encodings.idna`), and if this is not +possible, :exc:`UnicodeError` is raised. + +.. versionchanged:: XXX + Previously, hostnames were decoded as UTF-8 and encoded using IDNA + or UTF-8; ``surrogateescape`` was not used; some interfaces + formerly accepted :class:`bytearray` objects, or did not accept + :class:`bytes` objects. + For IPv4 addresses, two special forms are accepted instead of a host address: the empty string represents :const:`INADDR_ANY`, and the string ``'<broadcast>'`` represents :const:`INADDR_BROADCAST`. 
The behavior is not diff --git a/Lib/test/test_socket.py b/Lib/test/test_socket.py --- a/Lib/test/test_socket.py +++ b/Lib/test/test_socket.py @@ -309,6 +309,55 @@ class GeneralModuleTests(unittest.TestCa except socket.error: pass + def tryHostnameArgs(self, function, notfounderror): + # Call the given one-argument function with various valid and + # invalid representations of nonexistent hostnames. Check + # that it raises notfounderror for valid representations, and + # UnicodeError for invalid ones. + + # An RFC 1123-compliant host name (".invalid" TLD is reserved + # under RFC 2606). + self.assertRaises(notfounderror, function, "host.domain.invalid") + # Previous name as a bytes object. + self.assertRaises(notfounderror, function, b"host.domain.invalid") + # A domain name with a non-ASCII octet, as bytes. + self.assertRaises(notfounderror, function, b"\xff.domain.invalid") + # Previous domain name as ASCII/surrogateescape string representation. + self.assertRaises(notfounderror, function, "\udcff.domain.invalid") + # A legal IDN. + self.assertRaises(notfounderror, function, "€.domain.invalid") + # A combination of the previous two, which may make sense in + # theory, but is not accepted (the Euro sign means it must be + # interpreted as an IDN, but it is not a legal IDN, because + # it contains a surrogate character). 
+ self.assertRaises(UnicodeError, function, "\udcff.€.domain.invalid") + + def testGethostbynameHostnames(self): + self.tryHostnameArgs(socket.gethostbyname, + (socket.herror, socket.gaierror)) + + def testGethostbyname_exHostnames(self): + self.tryHostnameArgs(socket.gethostbyname_ex, + (socket.herror, socket.gaierror)) + + def testGethostbyaddrHostnames(self): + self.tryHostnameArgs(socket.gethostbyaddr, + (socket.herror, socket.gaierror)) + + def testGetaddrinfoHostnames(self): + self.tryHostnameArgs(lambda host: socket.getaddrinfo(host, None), + socket.gaierror) + + def testGetnameinfoHostnames(self): + self.tryHostnameArgs(lambda host: socket.getnameinfo((host, 0), 0), + socket.gaierror) + + def testSocketObjectHostnames(self): + def f(host): + s = socket.socket(socket.AF_INET, socket.SOCK_STREAM) + s.connect((host, 80)) + self.tryHostnameArgs(f, socket.error) + def testNtoH(self): # This just checks that htons etc. are their own inverse, # when looking at the lower 16 or 32 bits. diff --git a/Modules/socketmodule.c b/Modules/socketmodule.c --- a/Modules/socketmodule.c +++ b/Modules/socketmodule.c @@ -1136,6 +1136,55 @@ makesockaddr(int sockfd, struct sockaddr } +/* "O&" converter for hostname arguments, which attempts to encode + strings as ASCII with "surrogateescape" error handler before + falling back to strict IDNA encoding. Also accepts bytes objects, + but not bytearray. On success, sets PyObject *addr to a new + reference to a bytes object, and may clear the error indicator. 
*/ + +static int +hostname_to_bytes(PyObject *arg, void *addr) +{ + PyObject **result = addr; + + if (arg == NULL) { /* cleanup call: a later argument failed to parse */ + Py_DECREF(*result); /* release the bytes object stored by the earlier successful call */ + return 1; + } + + if (PyUnicode_Check(arg)) { + PyObject *tmp; + + tmp = PyUnicode_AsEncodedObject(arg, "ascii", "surrogateescape"); + if (tmp == NULL) { + if (!PyErr_ExceptionMatches(PyExc_UnicodeError)) + return 0; + PyErr_Clear(); /* not ASCII/surrogateescape: retry as a strict IDN */ + tmp = PyUnicode_AsEncodedObject(arg, "idna", "strict"); + if (tmp == NULL) + return 0; + } + arg = tmp; + } + else + Py_INCREF(arg); /* from here on we own a reference to arg on every path */ + if (!PyBytes_Check(arg)) { + PyErr_SetString(PyExc_TypeError, + "hostnames must be str or bytes objects"); + goto err; + } + if (PyBytes_GET_SIZE(arg) != strlen(PyBytes_AS_STRING(arg))) { + PyErr_SetString(PyExc_TypeError, "embedded NUL character in hostname"); + goto err; + } + *result = arg; + return Py_CLEANUP_SUPPORTED; +err: + Py_DECREF(arg); + return 0; +} + + /* Parse a socket address argument according to the socket object's address family. Return 1 if the address was in the proper format, 0 of not. 
The address is returned through addr_ret, its length @@ -1215,7 +1264,7 @@ getsockaddrarg(PySocketSockObject *s, Py case AF_INET: { struct sockaddr_in* addr; - char *host; + PyObject *host; int port, result; if (!PyTuple_Check(args)) { PyErr_Format( @@ -1225,13 +1274,13 @@ getsockaddrarg(PySocketSockObject *s, Py Py_TYPE(args)->tp_name); return 0; } - if (!PyArg_ParseTuple(args, "eti:getsockaddrarg", - "idna", &host, &port)) + if (!PyArg_ParseTuple(args, "O&i:getsockaddrarg", + hostname_to_bytes, &host, &port)) return 0; addr=(struct sockaddr_in*)addr_ret; - result = setipaddr(host, (struct sockaddr *)addr, + result = setipaddr(PyBytes_AS_STRING(host), (struct sockaddr *)addr, sizeof(*addr), AF_INET); - PyMem_Free(host); + Py_DECREF(host); if (result < 0) return 0; if (port < 0 || port > 0xffff) { @@ -1250,7 +1299,7 @@ getsockaddrarg(PySocketSockObject *s, Py case AF_INET6: { struct sockaddr_in6* addr; - char *host; + PyObject *host; int port, flowinfo, scope_id, result; flowinfo = scope_id = 0; if (!PyTuple_Check(args)) { @@ -1261,15 +1310,15 @@ getsockaddrarg(PySocketSockObject *s, Py Py_TYPE(args)->tp_name); return 0; } - if (!PyArg_ParseTuple(args, "eti|ii", - "idna", &host, &port, &flowinfo, + if (!PyArg_ParseTuple(args, "O&i|ii", + hostname_to_bytes, &host, &port, &flowinfo, &scope_id)) { return 0; } addr = (struct sockaddr_in6*)addr_ret; - result = setipaddr(host, (struct sockaddr *)addr, + result = setipaddr(PyBytes_AS_STRING(host), (struct sockaddr *)addr, sizeof(*addr), AF_INET6); - PyMem_Free(host); + Py_DECREF(host); if (result < 0) return 0; if (port < 0 || port > 0xffff) { @@ -2985,14 +3034,19 @@ Return the current host name."); static PyObject * socket_gethostbyname(PyObject *self, PyObject *args) { - char *name; + PyObject *name; sock_addr_t addrbuf; - - if (!PyArg_ParseTuple(args, "s:gethostbyname", &name)) + PyObject *ret = NULL; + + if (!PyArg_ParseTuple(args, "O&:gethostbyname", hostname_to_bytes, &name)) return NULL; - if (setipaddr(name, 
SAS2SA(&addrbuf), sizeof(addrbuf), AF_INET) < 0) - return NULL; - return makeipaddr(SAS2SA(&addrbuf), sizeof(struct sockaddr_in)); + if (setipaddr(PyBytes_AS_STRING(name), SAS2SA(&addrbuf), sizeof(addrbuf), + AF_INET) < 0) + goto finally; + ret = makeipaddr(SAS2SA(&addrbuf), sizeof(struct sockaddr_in)); +finally: + Py_DECREF(name); + return ret; } PyDoc_STRVAR(gethostbyname_doc, @@ -3136,7 +3190,7 @@ gethost_common(struct hostent *h, struct static PyObject * socket_gethostbyname_ex(PyObject *self, PyObject *args) { - char *name; + PyObject *name; struct hostent *h; #ifdef ENABLE_IPV6 struct sockaddr_storage addr; @@ -3144,7 +3198,7 @@ socket_gethostbyname_ex(PyObject *self, struct sockaddr_in addr; #endif struct sockaddr *sa; - PyObject *ret; + PyObject *ret = NULL; #ifdef HAVE_GETHOSTBYNAME_R struct hostent hp_allocated; #ifdef HAVE_GETHOSTBYNAME_R_3_ARG @@ -3159,20 +3213,23 @@ socket_gethostbyname_ex(PyObject *self, #endif #endif /* HAVE_GETHOSTBYNAME_R */ - if (!PyArg_ParseTuple(args, "s:gethostbyname_ex", &name)) + if (!PyArg_ParseTuple(args, "O&:gethostbyname_ex", + hostname_to_bytes, &name)) return NULL; - if (setipaddr(name, (struct sockaddr *)&addr, sizeof(addr), AF_INET) < 0) - return NULL; + if (setipaddr(PyBytes_AS_STRING(name), (struct sockaddr *)&addr, + sizeof(addr), AF_INET) < 0) + goto finally; Py_BEGIN_ALLOW_THREADS #ifdef HAVE_GETHOSTBYNAME_R #if defined(HAVE_GETHOSTBYNAME_R_6_ARG) - result = gethostbyname_r(name, &hp_allocated, buf, buf_len, - &h, &errnop); + result = gethostbyname_r(PyBytes_AS_STRING(name), &hp_allocated, + buf, buf_len, &h, &errnop); #elif defined(HAVE_GETHOSTBYNAME_R_5_ARG) - h = gethostbyname_r(name, &hp_allocated, buf, buf_len, &errnop); + h = gethostbyname_r(PyBytes_AS_STRING(name), &hp_allocated, + buf, buf_len, &errnop); #else /* HAVE_GETHOSTBYNAME_R_3_ARG */ memset((void *) &data, '\0', sizeof(data)); - result = gethostbyname_r(name, &hp_allocated, &data); + result = gethostbyname_r(PyBytes_AS_STRING(name), 
&hp_allocated, &data); h = (result != 0) ? NULL : &hp_allocated; #endif #else /* not HAVE_GETHOSTBYNAME_R */ @@ -3192,6 +3249,8 @@ socket_gethostbyname_ex(PyObject *self, #ifdef USE_GETHOSTBYNAME_LOCK PyThread_release_lock(netdb_lock); #endif +finally: + Py_DECREF(name); return ret; } @@ -3214,9 +3273,9 @@ socket_gethostbyaddr(PyObject *self, PyO struct sockaddr_in addr; #endif struct sockaddr *sa = (struct sockaddr *)&addr; - char *ip_num; + PyObject *bytes; struct hostent *h; - PyObject *ret; + PyObject *ret = NULL; #ifdef HAVE_GETHOSTBYNAME_R struct hostent hp_allocated; #ifdef HAVE_GETHOSTBYNAME_R_3_ARG @@ -3238,11 +3297,11 @@ socket_gethostbyaddr(PyObject *self, PyO int al; int af; - if (!PyArg_ParseTuple(args, "s:gethostbyaddr", &ip_num)) + if (!PyArg_ParseTuple(args, "O&:gethostbyaddr", hostname_to_bytes, &bytes)) return NULL; af = AF_UNSPEC; - if (setipaddr(ip_num, sa, sizeof(addr), af) < 0) - return NULL; + if (setipaddr(PyBytes_AS_STRING(bytes), sa, sizeof(addr), af) < 0) + goto finally; af = sa->sa_family; ap = NULL; al = 0; @@ -3259,7 +3318,7 @@ socket_gethostbyaddr(PyObject *self, PyO #endif default: PyErr_SetString(socket_error, "unsupported address family"); - return NULL; + goto finally; } Py_BEGIN_ALLOW_THREADS #ifdef HAVE_GETHOSTBYNAME_R @@ -3286,6 +3345,8 @@ socket_gethostbyaddr(PyObject *self, PyO #ifdef USE_GETHOSTBYNAME_LOCK PyThread_release_lock(netdb_lock); #endif +finally: + Py_DECREF(bytes); return ret; } @@ -3835,17 +3896,9 @@ socket_getaddrinfo(PyObject *self, PyObj } if (hobj == Py_None) { hptr = NULL; - } else if (PyUnicode_Check(hobj)) { - idna = PyObject_CallMethod(hobj, "encode", "s", "idna"); - if (!idna) - return NULL; - assert(PyBytes_Check(idna)); + } else if (hostname_to_bytes(hobj, &idna)) { hptr = PyBytes_AS_STRING(idna); - } else if (PyBytes_Check(hobj)) { - hptr = PyBytes_AsString(hobj); } else { - PyErr_SetString(PyExc_TypeError, - "getaddrinfo() argument 1 must be string or None"); return NULL; } if 
(PyLong_CheckExact(pobj)) { @@ -3925,7 +3978,7 @@ socket_getnameinfo(PyObject *self, PyObj { PyObject *sa = (PyObject *)NULL; int flags; - char *hostp; + PyObject *host; int port, flowinfo, scope_id; char hbuf[NI_MAXHOST], pbuf[NI_MAXSERV]; struct addrinfo hints, *res = NULL; @@ -3940,8 +3993,8 @@ socket_getnameinfo(PyObject *self, PyObj "getnameinfo() argument 1 must be a tuple"); return NULL; } - if (!PyArg_ParseTuple(sa, "si|ii", - &hostp, &port, &flowinfo, &scope_id)) + if (!PyArg_ParseTuple(sa, "O&i|ii", hostname_to_bytes, + &host, &port, &flowinfo, &scope_id)) return NULL; PyOS_snprintf(pbuf, sizeof(pbuf), "%d", port); memset(&hints, 0, sizeof(hints)); @@ -3949,7 +4002,7 @@ socket_getnameinfo(PyObject *self, PyObj hints.ai_socktype = SOCK_DGRAM; /* make numeric port happy */ Py_BEGIN_ALLOW_THREADS ACQUIRE_GETADDRINFO_LOCK - error = getaddrinfo(hostp, pbuf, &hints, &res); + error = getaddrinfo(PyBytes_AS_STRING(host), pbuf, &hints, &res); Py_END_ALLOW_THREADS RELEASE_GETADDRINFO_LOCK /* see comment in setipaddr() */ if (error) { @@ -3993,6 +4046,7 @@ socket_getnameinfo(PyObject *self, PyObj fail: if (res) freeaddrinfo(res); + Py_DECREF(host); return ret; }