Date 2019-04-12
On x86-64, clang -O3 compiles the following function:

PyCArgObject *
PyCArgObject_new(void)
{
    PyCArgObject *p;
    p = PyObject_New(PyCArgObject, &PyCArg_Type);
    if (p == NULL)
        return NULL;
    p->pffi_type = NULL;
    p->tag = '\0';
    p->obj = NULL;
    memset(&p->value, 0, sizeof(p->value));
    return p;
}

into the following machine code:

   0x00007fffe9c6acb0 <+0>:	push   rax
   0x00007fffe9c6acb1 <+1>:	mov    rdi,QWORD PTR [rip+0xe308]        # 0x7fffe9c78fc0
   0x00007fffe9c6acb8 <+8>:	call   0x7fffe9c5e8a0 <_PyObject_New@plt>
   0x00007fffe9c6acbd <+13>:	test   rax,rax
   0x00007fffe9c6acc0 <+16>:	je     0x7fffe9c6acdf <PyCArgObject_new+47>
   0x00007fffe9c6acc2 <+18>:	mov    QWORD PTR [rax+0x20],0x0
   0x00007fffe9c6acca <+26>:	mov    BYTE PTR [rax+0x28],0x0
   0x00007fffe9c6acce <+30>:	xorps  xmm0,xmm0
   0x00007fffe9c6acd1 <+33>:	movaps XMMWORD PTR [rax+0x30],xmm0
   0x00007fffe9c6acd5 <+37>:	mov    QWORD PTR [rax+0x40],0x0
   0x00007fffe9c6acdd <+45>:	pop    rcx
   0x00007fffe9c6acde <+46>:	ret    
   0x00007fffe9c6acdf <+47>:	xor    eax,eax
   0x00007fffe9c6ace1 <+49>:	pop    rcx
   0x00007fffe9c6ace2 <+50>:	ret    

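The problem is that movaps requires its memory operand to be aligned on 16 bytes, whereas PyObject_New() uses the pymalloc allocator (the requested size is 80 bytes, and pymalloc serves allocations up to 512 bytes), and pymalloc only guarantees alignment on 8 bytes. Since the offset 0x30 is itself a multiple of 16, the movaps store to [rax+0x30] is aligned only if the address returned by PyObject_New() is aligned on 16 bytes.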

If PyObject_New() returns an address that is not aligned on 16 bytes, PyCArgObject_new() crashes immediately with a segmentation fault (SIGSEGV).

CPython must be compiled using -fmax-type-align=8 to avoid such an alignment crash. With this compiler flag, clang emits the expected machine code:

   0x00007fffe9caacb0 <+0>:	push   rax
   0x00007fffe9caacb1 <+1>:	mov    rdi,QWORD PTR [rip+0xe308]        # 0x7fffe9cb8fc0
   0x00007fffe9caacb8 <+8>:	call   0x7fffe9c9e8a0 <_PyObject_New@plt>
   0x00007fffe9caacbd <+13>:	test   rax,rax
   0x00007fffe9caacc0 <+16>:	je     0x7fffe9caacdf <PyCArgObject_new+47>
   0x00007fffe9caacc2 <+18>:	mov    QWORD PTR [rax+0x20],0x0
   0x00007fffe9caacca <+26>:	mov    BYTE PTR [rax+0x28],0x0
   0x00007fffe9caacce <+30>:	xorps  xmm0,xmm0
   0x00007fffe9caacd1 <+33>:	movups XMMWORD PTR [rax+0x30],xmm0
   0x00007fffe9caacd5 <+37>:	mov    QWORD PTR [rax+0x40],0x0
   0x00007fffe9caacdd <+45>:	pop    rcx
   0x00007fffe9caacde <+46>:	ret    
   0x00007fffe9caacdf <+47>:	xor    eax,eax
   0x00007fffe9caace1 <+49>:	pop    rcx
   0x00007fffe9caace2 <+50>:	ret    

"movaps" instruction becomes "movups" instruction: "a" stands for "aligned" in movaps, whereas "u" stands for "unaligned" in movups.
