Message 288510 - Python tracker

➜

This issue tracker has been migrated to GitHub, and is currently read-only.
For more information, see the GitHub FAQs in the Python's Developer Guide.

Author	Ján Stanček
Recipients	Ján Stanček
Date	2017-02-24.09:03:56
SpamBayes Score	-1.0
Marked as misclassified	Yes
Message-id	<1487927038.37.0.851881550844.issue29640@psf.upfronthosting.co.za>
In-reply-to

Content
Following crash is sporadically observed in RHEL7 anaconda: (gdb) f 0 #0 PyThread_ReInitTLS () at /usr/src/debug/Python-2.7.5/Python/thread.c:411 411 if (p->id != id) { (gdb) list 406 keymutex = PyThread_allocate_lock(); 407 408 /* Delete all keys which do not match the current thread id / 409 q = &keyhead; 410 while ((p = q) != NULL) { 411 if (p->id != id) { 412 q = p->next; 413 free((void )p); 414 /* NB This does not free p->value! / 415 } (gdb) p p $1 = (struct key ) 0x3333333333333333 (gdb) p keyhead $2 = (struct key ) 0x3333333333333333 key list is protected by keymutex (except for PyThread_ReInitTLS), but there doesn't seem to be any protection against concurrent fork(). What seems to happen is fork() at the moment when key list is not consistent. For example, if I initialize new key to 0xfe: static struct key find_key(int key, void value) // find_key with extra memset() ... p = (struct key )malloc(sizeof(struct key)); memset(p, 0xfe, sizeof(struct key)); if (p != NULL) { p->id = id; p->key = key; p->value = value; p->next = keyhead; keyhead = p; } ... 
Looking at disassembly, compiler reordered last 2 writes: 0x00000000004fcb50 <+272>: callq 0x413d10 <malloc@plt> 0x00000000004fcb55 <+277>: movabs $0xfefefefefefefefe,%rcx 0x00000000004fcb5f <+287>: test %rax,%rax 0x00000000004fcb62 <+290>: mov %rcx,(%rax) 0x00000000004fcb65 <+293>: mov %rcx,0x8(%rax) 0x00000000004fcb69 <+297>: mov %rcx,0x10(%rax) 0x00000000004fcb6d <+301>: mov %rcx,0x18(%rax) 0x00000000004fcb71 <+305>: je 0x4fcaff <PyThread_set_key_value+191> 0x00000000004fcb73 <+307>: mov 0x2f1e26(%rip),%rdx # 0x7ee9a0 <keyhead> 0x00000000004fcb7a <+314>: mov 0x2f1dff(%rip),%rdi # 0x7ee980 <keymutex> 0x00000000004fcb81 <+321>: xor %r14d,%r14d 0x00000000004fcb84 <+324>: mov %rbp,0x8(%rax) 0x00000000004fcb88 <+328>: mov %r12d,0x10(%rax) 0x00000000004fcb8c <+332>: mov %r13,0x18(%rax) 0x00000000004fcb90 <+336>: mov %rax,0x2f1e09(%rip) # 0x7ee9a0 <keyhead> 0x00000000004fcb97 <+343>: mov %rdx,(%rax) Now consider what happens, when different threads call fork() in between these 2 writes: we updated keyhead, but keyhead->next has not been updated yet. Now when anaconda hangs, I get: (gdb) list 407 keymutex = PyThread_allocate_lock(); 408 409 /* Delete all keys which do not match the current thread id / 410 q = &keyhead; 411 while ((p = q) != NULL) { 412 if (p->id != id) { 413 q = p->next; 414 free((void )p); 415 /* NB This does not free p->value! / 416 } (gdb) p p $1 = (struct key ) 0xfefefefefefefefe (gdb) p keyhead $2 = (struct key ) 0xfefefefefefefefe Here's how I think we get into this state: -------------------------> thread 1 # has GIL Thread.start _start_new_thread(self.__bootstrap, ()) PyThread_start_new_thread(t_bootstrap) # spawns thread 3 -------------------------> thread 2 ... 
# waiting for GIL -------------------------> thread 3 t_bootstrap _PyThreadState_Init # does not have GIL yet at this point _PyGILState_NoteThreadState PyThread_set_key_value(autoTLSkey, (void )tstate) find_key # key list is temporarily not consistent # due to compiler reordering couple writes in find_key -------------------------> thread 1 continuing Thread.start self.__started.wait() Event.wait() self.__cond.wait Condition.wait() waiter = _allocate_lock() waiter.acquire() lock_PyThread_acquire_lock Py_BEGIN_ALLOW_THREADS PyEval_SaveThread PyThread_release_lock(interpreter_lock); -------------------------> thread 2 ... # acquired GIL os.fork() # forks inconsistent list -------------------------> child PyOS_AfterFork() PyThread_ReInitTLS() SIGSEGV Attached patch for python makes it easier to reproduce, by adding delays to couple places to make window key list is not consistent larger.

The following crash is sporadically observed in RHEL7 anaconda:

(gdb) f 0
#0  PyThread_ReInitTLS () at /usr/src/debug/Python-2.7.5/Python/thread.c:411
411             if (p->id != id) {
(gdb) list
406         keymutex = PyThread_allocate_lock();
407
408         /* Delete all keys which do not match the current thread id */
409         q = &keyhead;
410         while ((p = *q) != NULL) {
411             if (p->id != id) {
412                 *q = p->next;
413                 free((void *)p);
414                 /* NB This does *not* free p->value! */
415             }
(gdb) p p
$1 = (struct key *) 0x3333333333333333
(gdb) p keyhead
$2 = (struct key *) 0x3333333333333333

The key list is protected by keymutex (except in PyThread_ReInitTLS), but there doesn't seem to be any protection against a concurrent fork(). What seems to happen is that fork() is called at a moment when the key list is not consistent.

For example, if I fill each newly allocated key with 0xfe bytes:

static struct key *find_key(int key, void *value)
    // find_key with extra memset()
    ...
    p = (struct key *)malloc(sizeof(struct key));
    memset(p, 0xfe, sizeof(struct key));
    if (p != NULL) {
        p->id = id;
        p->key = key;
        p->value = value;
        p->next = keyhead;
        keyhead = p;
    }
    ...

Looking at the disassembly, the compiler reordered the last two writes: the store to keyhead (<+336>) is emitted before the store to p->next (<+343>):

   0x00000000004fcb50 <+272>:   callq  0x413d10 <malloc@plt>
   0x00000000004fcb55 <+277>:   movabs $0xfefefefefefefefe,%rcx
   0x00000000004fcb5f <+287>:   test   %rax,%rax
   0x00000000004fcb62 <+290>:   mov    %rcx,(%rax)
   0x00000000004fcb65 <+293>:   mov    %rcx,0x8(%rax)
   0x00000000004fcb69 <+297>:   mov    %rcx,0x10(%rax)
   0x00000000004fcb6d <+301>:   mov    %rcx,0x18(%rax)
   0x00000000004fcb71 <+305>:   je     0x4fcaff <PyThread_set_key_value+191>
   0x00000000004fcb73 <+307>:   mov    0x2f1e26(%rip),%rdx        # 0x7ee9a0 <keyhead>
   0x00000000004fcb7a <+314>:   mov    0x2f1dff(%rip),%rdi        # 0x7ee980 <keymutex>
   0x00000000004fcb81 <+321>:   xor    %r14d,%r14d
   0x00000000004fcb84 <+324>:   mov    %rbp,0x8(%rax)
   0x00000000004fcb88 <+328>:   mov    %r12d,0x10(%rax)
   0x00000000004fcb8c <+332>:   mov    %r13,0x18(%rax)
   0x00000000004fcb90 <+336>:   mov    %rax,0x2f1e09(%rip)        # 0x7ee9a0 <keyhead>
   0x00000000004fcb97 <+343>:   mov    %rdx,(%rax)

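Now consider what happens when a different thread calls fork() in between these two writes: keyhead has already been updated, but keyhead->next has not been written yet, so the child inherits a list whose first node still has the 0xfe fill pattern in its next pointer.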

Now when anaconda hangs, I get:

(gdb) list
407         keymutex = PyThread_allocate_lock();
408
409         /* Delete all keys which do not match the current thread id */
410         q = &keyhead;
411         while ((p = *q) != NULL) {
412             if (p->id != id) {
413                 *q = p->next;
414                 free((void *)p);
415                 /* NB This does *not* free p->value! */
416             }
(gdb) p p
$1 = (struct key *) 0xfefefefefefefefe
(gdb) p keyhead
$2 = (struct key *) 0xfefefefefefefefe


Here's how I think we get into this state:

-------------------------> thread 1
# has GIL
Thread.start
  _start_new_thread(self.__bootstrap, ())
    PyThread_start_new_thread(t_bootstrap)
      # spawns thread 3

-------------------------> thread 2
...
  # waiting for GIL

-------------------------> thread 3
t_bootstrap
  _PyThreadState_Init # does not have GIL yet at this point
    _PyGILState_NoteThreadState
      PyThread_set_key_value(autoTLSkey, (void *)tstate)
        find_key
          # key list is temporarily not consistent
          # due to compiler reordering couple writes in find_key

-------------------------> thread 1
continuing Thread.start
  self.__started.wait()
    Event.wait()
      self.__cond.wait 
        Condition.wait()
          waiter = _allocate_lock()
          waiter.acquire()
            lock_PyThread_acquire_lock
              Py_BEGIN_ALLOW_THREADS
                PyEval_SaveThread
                  PyThread_release_lock(interpreter_lock);

-------------------------> thread 2
...
  # acquired GIL
  os.fork() # forks inconsistent list

-------------------------> child
PyOS_AfterFork()
  PyThread_ReInitTLS()
    SIGSEGV

The attached patch for Python makes this easier to reproduce by adding delays in a couple of places, which widens the window during which the key list is not consistent.

History
Date	User	Action	Args
2017-02-24 09:03:58	Ján Stanček	set	recipients: + Ján Stanček
2017-02-24 09:03:58	Ján Stanček	set	messageid: <1487927038.37.0.851881550844.issue29640@psf.upfronthosting.co.za>
2017-02-24 09:03:58	Ján Stanček	link	issue29640 messages
2017-02-24 09:03:56	Ján Stanček	create