This issue tracker has been migrated to GitHub, and is currently read-only.
For more information, see the GitHub FAQs in the Python's Developer Guide.

Author Ján Stanček
Recipients Ján Stanček
Date 2017-02-24.09:03:56
SpamBayes Score -1.0
Marked as misclassified Yes
Message-id <1487927038.37.0.851881550844.issue29640@psf.upfronthosting.co.za>
In-reply-to
Content
Following crash is sporadically observed in RHEL7 anaconda:

(gdb) f 0
#0  PyThread_ReInitTLS () at /usr/src/debug/Python-2.7.5/Python/thread.c:411
411             if (p->id != id) {
(gdb) list
406         keymutex = PyThread_allocate_lock();
407
408         /* Delete all keys which do not match the current thread id */
409         q = &keyhead;
410         while ((p = *q) != NULL) {
411             if (p->id != id) {
412                 *q = p->next;
413                 free((void *)p);
414                 /* NB This does *not* free p->value! */
415             }
(gdb) p p
$1 = (struct key *) 0x3333333333333333
(gdb) p keyhead
$2 = (struct key *) 0x3333333333333333

key list is protected by keymutex (except for PyThread_ReInitTLS), but there doesn't seem to be any protection against concurrent fork(). What seems to happen is fork() at the moment when key list is not consistent.

For example, if I initialize new key to 0xfe:

static struct key *find_key(int key, void *value)
    // find_key with extra memset()
    ...
    p = (struct key *)malloc(sizeof(struct key));
    memset(p, 0xfe, sizeof(struct key));
    if (p != NULL) {
        p->id = id;
        p->key = key;
        p->value = value;
        p->next = keyhead;
        keyhead = p;
    }
    ...

Looking at disassembly, compiler reordered last 2 writes:

   0x00000000004fcb50 <+272>:   callq  0x413d10 <malloc@plt>
   0x00000000004fcb55 <+277>:   movabs $0xfefefefefefefefe,%rcx
   0x00000000004fcb5f <+287>:   test   %rax,%rax
   0x00000000004fcb62 <+290>:   mov    %rcx,(%rax)
   0x00000000004fcb65 <+293>:   mov    %rcx,0x8(%rax)
   0x00000000004fcb69 <+297>:   mov    %rcx,0x10(%rax)
   0x00000000004fcb6d <+301>:   mov    %rcx,0x18(%rax)
   0x00000000004fcb71 <+305>:   je     0x4fcaff <PyThread_set_key_value+191>
   0x00000000004fcb73 <+307>:   mov    0x2f1e26(%rip),%rdx        # 0x7ee9a0 <keyhead>
   0x00000000004fcb7a <+314>:   mov    0x2f1dff(%rip),%rdi        # 0x7ee980 <keymutex>
   0x00000000004fcb81 <+321>:   xor    %r14d,%r14d
   0x00000000004fcb84 <+324>:   mov    %rbp,0x8(%rax)
   0x00000000004fcb88 <+328>:   mov    %r12d,0x10(%rax)
   0x00000000004fcb8c <+332>:   mov    %r13,0x18(%rax)
   0x00000000004fcb90 <+336>:   mov    %rax,0x2f1e09(%rip)        # 0x7ee9a0 <keyhead>
   0x00000000004fcb97 <+343>:   mov    %rdx,(%rax)

Now consider what happens, when different threads call fork() in between these 2 writes: we updated keyhead, but keyhead->next has not been updated yet.

Now when anaconda hangs, I get:

(gdb) list
407         keymutex = PyThread_allocate_lock();
408
409         /* Delete all keys which do not match the current thread id */
410         q = &keyhead;
411         while ((p = *q) != NULL) {
412             if (p->id != id) {
413                 *q = p->next;
414                 free((void *)p);
415                 /* NB This does *not* free p->value! */
416             }
(gdb) p p
$1 = (struct key *) 0xfefefefefefefefe
(gdb) p keyhead
$2 = (struct key *) 0xfefefefefefefefe


Here's how I think we get into this state:

-------------------------> thread 1
# has GIL
Thread.start
  _start_new_thread(self.__bootstrap, ())
    PyThread_start_new_thread(t_bootstrap)
      # spawns thread 3

-------------------------> thread 2
...
  # waiting for GIL

-------------------------> thread 3
t_bootstrap
  _PyThreadState_Init # does not have GIL yet at this point
    _PyGILState_NoteThreadState
      PyThread_set_key_value(autoTLSkey, (void *)tstate)
        find_key
          # key list is temporarily not consistent
          # due to compiler reordering couple writes in find_key

-------------------------> thread 1
continuing Thread.start
  self.__started.wait()
    Event.wait()
      self.__cond.wait 
        Condition.wait()
          waiter = _allocate_lock()
          waiter.acquire()
            lock_PyThread_acquire_lock
              Py_BEGIN_ALLOW_THREADS
                PyEval_SaveThread
                  PyThread_release_lock(interpreter_lock);

-------------------------> thread 2
...
  # acquired GIL
  os.fork() # forks inconsistent list

-------------------------> child
PyOS_AfterFork()
  PyThread_ReInitTLS()
    SIGSEGV

Attached patch for python makes it easier to reproduce, by adding delays to couple places to make window key list is not consistent larger.
History
Date User Action Args
2017-02-24 09:03:58Ján Stančeksetrecipients: + Ján Stanček
2017-02-24 09:03:58Ján Stančeksetmessageid: <1487927038.37.0.851881550844.issue29640@psf.upfronthosting.co.za>
2017-02-24 09:03:58Ján Stančeklinkissue29640 messages
2017-02-24 09:03:56Ján Stančekcreate