From 45de7c94fb76820b4a96def73cf6816c3eabcea2 Mon Sep 17 00:00:00 2001 From: mancoast Date: Fri, 22 Jan 2016 13:50:55 -0500 Subject: [PATCH] k1om libffi --- Modules/_ctypes/libffi/src/raw_api.c | 16 +-- Modules/_ctypes/libffi/src/x86/ffi64.c | 129 +++++++++-------------- Modules/_ctypes/libffi/src/x86/unix64.S | 180 +++++++++++++++++++++++--------- 3 files changed, 187 insertions(+), 138 deletions(-) diff --git a/Modules/_ctypes/libffi/src/raw_api.c b/Modules/_ctypes/libffi/src/raw_api.c index ce21372..4c6af5f 100644 --- a/Modules/_ctypes/libffi/src/raw_api.c +++ b/Modules/_ctypes/libffi/src/raw_api.c @@ -29,7 +29,7 @@ #include #include -#if !FFI_NO_RAW_API +//#if !FFI_NO_RAW_API size_t ffi_raw_size (ffi_cif *cif) @@ -178,7 +178,7 @@ ffi_ptrarray_to_raw (ffi_cif *cif, void **args, ffi_raw *raw) } } -#if !FFI_NATIVE_RAW_API +//#if !FFI_NATIVE_RAW_API /* This is a generic definition of ffi_raw_call, to be used if the @@ -195,7 +195,7 @@ void ffi_raw_call (ffi_cif *cif, void (*fn)(void), void *rvalue, ffi_raw *raw) ffi_call (cif, fn, rvalue, avalue); } -#if FFI_CLOSURES /* base system provides closures */ +//#if FFI_CLOSURES /* base system provides closures */ static void ffi_translate_args (ffi_cif *cif, void *rvalue, @@ -231,10 +231,10 @@ ffi_prep_raw_closure_loc (ffi_raw_closure* cl, return status; } -#endif /* FFI_CLOSURES */ -#endif /* !FFI_NATIVE_RAW_API */ +//#endif /* FFI_CLOSURES */ +//#endif /* !FFI_NATIVE_RAW_API */ -#if FFI_CLOSURES +//#if FFI_CLOSURES /* Again, here is the generic version of ffi_prep_raw_closure, which * will install an intermediate "hub" for translation of arguments from @@ -249,6 +249,6 @@ ffi_prep_raw_closure (ffi_raw_closure* cl, return ffi_prep_raw_closure_loc (cl, cif, fun, user_data, cl); } -#endif /* FFI_CLOSURES */ +//#endif /* FFI_CLOSURES */ -#endif /* !FFI_NO_RAW_API */ +//#endif /* !FFI_NO_RAW_API */ diff --git a/Modules/_ctypes/libffi/src/x86/ffi64.c b/Modules/_ctypes/libffi/src/x86/ffi64.c index 5a5e043..d106c0b 100644 --- a/Modules/_ctypes/libffi/src/x86/ffi64.c +++ b/Modules/_ctypes/libffi/src/x86/ffi64.c @@ -1,10 +1,8 @@ /* ----------------------------------------------------------------------- - ffi64.c - Copyright (c) 2013 The Written Word, Inc. - Copyright (c) 2011 Anthony Green - Copyright (c) 2008, 2010 Red Hat, Inc. - Copyright (c) 2002, 2007 Bo Thorsen - - x86-64 Foreign Function Interface + ffi64.c - Copyright (c) 2002, 2007 Bo Thorsen + Copyright (c) 2008 Red Hat, Inc. + + x86-64 Foreign Function Interface Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the @@ -38,30 +36,13 @@ #define MAX_GPR_REGS 6 #define MAX_SSE_REGS 8 -#if defined(__INTEL_COMPILER) -#include "xmmintrin.h" -#define UINT128 __m128 -#else -#if defined(__SUNPRO_C) -#include -#define UINT128 __m128i -#else -#define UINT128 __int128_t -#endif -#endif - -union big_int_union -{ - UINT32 i32; - UINT64 i64; - UINT128 i128; -}; +typedef struct { int64_t m[8]; } __int512_t; struct register_args { /* Registers for argument passing. */ UINT64 gpr[MAX_GPR_REGS]; - union big_int_union sse[MAX_SSE_REGS]; + __int512_t sse[MAX_SSE_REGS]; }; extern void ffi_call_unix64 (void *args, unsigned long bytes, unsigned flags, @@ -152,7 +133,7 @@ merge_classes (enum x86_64_reg_class class1, enum x86_64_reg_class class2) See the x86-64 PS ABI for details. 
*/ -static size_t +static int classify_argument (ffi_type *type, enum x86_64_reg_class classes[], size_t byte_offset) { @@ -168,7 +149,7 @@ classify_argument (ffi_type *type, enum x86_64_reg_class classes[], case FFI_TYPE_SINT64: case FFI_TYPE_POINTER: { - size_t size = byte_offset + type->size; + int size = byte_offset + type->size; if (size <= 4) { @@ -203,17 +184,15 @@ classify_argument (ffi_type *type, enum x86_64_reg_class classes[], case FFI_TYPE_DOUBLE: classes[0] = X86_64_SSEDF_CLASS; return 1; -#if FFI_TYPE_LONGDOUBLE != FFI_TYPE_DOUBLE case FFI_TYPE_LONGDOUBLE: classes[0] = X86_64_X87_CLASS; classes[1] = X86_64_X87UP_CLASS; return 2; -#endif case FFI_TYPE_STRUCT: { - const size_t UNITS_PER_WORD = 8; - size_t words = (type->size + UNITS_PER_WORD - 1) / UNITS_PER_WORD; - ffi_type **ptr; + const int UNITS_PER_WORD = 8; + int words = (type->size + UNITS_PER_WORD - 1) / UNITS_PER_WORD; + ffi_type **ptr; int i; enum x86_64_reg_class subclasses[MAX_CLASSES]; @@ -235,7 +214,7 @@ classify_argument (ffi_type *type, enum x86_64_reg_class classes[], /* Merge the fields of structure. */ for (ptr = type->elements; *ptr != NULL; ptr++) { - size_t num; + int num; byte_offset = ALIGN (byte_offset, (*ptr)->alignment); @@ -244,7 +223,7 @@ classify_argument (ffi_type *type, enum x86_64_reg_class classes[], return 0; for (i = 0; i < num; i++) { - size_t pos = byte_offset / 8; + int pos = byte_offset / 8; classes[i + pos] = merge_classes (subclasses[i], classes[i + pos]); } @@ -308,12 +287,11 @@ classify_argument (ffi_type *type, enum x86_64_reg_class classes[], class. Return zero iff parameter should be passed in memory, otherwise the number of registers. */ -static size_t +static int examine_argument (ffi_type *type, enum x86_64_reg_class classes[MAX_CLASSES], _Bool in_return, int *pngpr, int *pnsse) { - size_t n; - int i, ngpr, nsse; + int i, n, ngpr, nsse; n = classify_argument (type, classes, 0); if (n == 0) @@ -354,9 +332,9 @@ examine_argument (ffi_type *type, enum x86_64_reg_class classes[MAX_CLASSES], ffi_status ffi_prep_cif_machdep (ffi_cif *cif) { - int gprcount, ssecount, i, avn, ngpr, nsse, flags; + int gprcount, ssecount, i, avn, n, ngpr, nsse, flags; enum x86_64_reg_class classes[MAX_CLASSES]; - size_t bytes, n; + size_t bytes; gprcount = ssecount = 0; @@ -402,7 +380,7 @@ ffi_prep_cif_machdep (ffi_cif *cif) if (align < 8) align = 8; - bytes = ALIGN (bytes, align); + bytes = ALIGN(bytes, align); bytes += cif->arg_types[i]->size; } else @@ -414,7 +392,7 @@ ffi_prep_cif_machdep (ffi_cif *cif) if (ssecount) flags |= 1 << 11; cif->flags = flags; - cif->bytes = (unsigned)ALIGN (bytes, 8); + cif->bytes = bytes; return FFI_OK; } @@ -450,14 +428,15 @@ ffi_call (ffi_cif *cif, void (*fn)(void), void *rvalue, void **avalue) /* If the return value is passed in memory, add the pointer as the first integer argument. */ if (ret_in_memory) - reg_args->gpr[gprcount++] = (unsigned long) rvalue; + reg_args->gpr[gprcount++] = (long) rvalue; avn = cif->nargs; arg_types = cif->arg_types; for (i = 0; i < avn; ++i) { - size_t n, size = arg_types[i]->size; + size_t size = arg_types[i]->size; + int n; n = examine_argument (arg_types[i], classes, 0, &ngpr, &nsse); if (n == 0 @@ -487,33 +466,32 @@ ffi_call (ffi_cif *cif, void (*fn)(void), void *rvalue, void **avalue) { case X86_64_INTEGER_CLASS: case X86_64_INTEGERSI_CLASS: - /* Sign-extend integer arguments passed in general - purpose registers, to cope with the fact that - LLVM incorrectly assumes that this will be done - (the x86-64 PS ABI does not specify this). 
*/ - switch (arg_types[i]->type) - { - case FFI_TYPE_SINT8: - *(SINT64 *)®_args->gpr[gprcount] = (SINT64) *((SINT8 *) a); - break; - case FFI_TYPE_SINT16: - *(SINT64 *)®_args->gpr[gprcount] = (SINT64) *((SINT16 *) a); - break; - case FFI_TYPE_SINT32: - *(SINT64 *)®_args->gpr[gprcount] = (SINT64) *((SINT32 *) a); - break; - default: - reg_args->gpr[gprcount] = 0; - memcpy (®_args->gpr[gprcount], a, size < 8 ? size : 8); - } + reg_args->gpr[gprcount] = 0; + memcpy (®_args->gpr[gprcount], a, size < 8 ? size : 8); gprcount++; break; case X86_64_SSE_CLASS: case X86_64_SSEDF_CLASS: - reg_args->sse[ssecount++].i64 = *(UINT64 *) a; + reg_args->sse[ssecount].m[0] = *(UINT64 *) a; + reg_args->sse[ssecount].m[1] = 0; + reg_args->sse[ssecount].m[2] = 0; + reg_args->sse[ssecount].m[3] = 0; + reg_args->sse[ssecount].m[4] = 0; + reg_args->sse[ssecount].m[5] = 0; + reg_args->sse[ssecount].m[6] = 0; + reg_args->sse[ssecount].m[7] = 0; + ssecount++; break; case X86_64_SSESF_CLASS: - reg_args->sse[ssecount++].i32 = *(UINT32 *) a; + reg_args->sse[ssecount].m[0] = *(UINT32 *) a; + reg_args->sse[ssecount].m[1] = 0; + reg_args->sse[ssecount].m[2] = 0; + reg_args->sse[ssecount].m[3] = 0; + reg_args->sse[ssecount].m[4] = 0; + reg_args->sse[ssecount].m[5] = 0; + reg_args->sse[ssecount].m[6] = 0; + reg_args->sse[ssecount].m[7] = 0; + ssecount++; break; default: abort(); @@ -538,21 +516,12 @@ ffi_prep_closure_loc (ffi_closure* closure, { volatile unsigned short *tramp; - /* Sanity check on the cif ABI. */ - { - int abi = cif->abi; - if (UNLIKELY (! (abi > FFI_FIRST_ABI && abi < FFI_LAST_ABI))) - return FFI_BAD_ABI; - } - tramp = (volatile unsigned short *) &closure->tramp[0]; tramp[0] = 0xbb49; /* mov , %r11 */ - *((unsigned long long * volatile) &tramp[1]) - = (unsigned long) ffi_closure_unix64; + *(void * volatile *) &tramp[1] = ffi_closure_unix64; tramp[5] = 0xba49; /* mov , %r10 */ - *((unsigned long long * volatile) &tramp[6]) - = (unsigned long) codeloc; + *(void * volatile *) &tramp[6] = codeloc; /* Set the carry bit iff the function uses any sse registers. This is clc or stc, together with the first byte of the jmp. */ @@ -586,12 +555,12 @@ ffi_closure_unix64_inner(ffi_closure *closure, void *rvalue, if (ret != FFI_TYPE_VOID) { enum x86_64_reg_class classes[MAX_CLASSES]; - size_t n = examine_argument (cif->rtype, classes, 1, &ngpr, &nsse); + int n = examine_argument (cif->rtype, classes, 1, &ngpr, &nsse); if (n == 0) { /* The return value goes in memory. Arrange for the closure return value to go directly back to the original caller. */ - rvalue = (void *) (unsigned long) reg_args->gpr[gprcount++]; + rvalue = (void *) reg_args->gpr[gprcount++]; /* We don't have to do anything in asm for the return. */ ret = FFI_TYPE_VOID; } @@ -609,11 +578,11 @@ ffi_closure_unix64_inner(ffi_closure *closure, void *rvalue, avn = cif->nargs; arg_types = cif->arg_types; - + for (i = 0; i < avn; ++i) { enum x86_64_reg_class classes[MAX_CLASSES]; - size_t n; + int n; n = examine_argument (arg_types[i], classes, 0, &ngpr, &nsse); if (n == 0 @@ -652,7 +621,7 @@ ffi_closure_unix64_inner(ffi_closure *closure, void *rvalue, /* Otherwise, allocate space to make them consecutive. 
*/ else { - char *a = alloca (16); + char *a = alloca (64); int j; avalue[i] = a; diff --git a/Modules/_ctypes/libffi/src/x86/unix64.S b/Modules/_ctypes/libffi/src/x86/unix64.S index 45a0ed7..b79fe01 100644 --- a/Modules/_ctypes/libffi/src/x86/unix64.S +++ b/Modules/_ctypes/libffi/src/x86/unix64.S @@ -1,7 +1,6 @@ /* ----------------------------------------------------------------------- - unix64.S - Copyright (c) 2013 The Written Word, Inc. - - Copyright (c) 2008 Red Hat, Inc - - Copyright (c) 2002 Bo Thorsen + unix64.S - Copyright (c) 2002 Bo Thorsen + Copyright (c) 2008 Red Hat, Inc x86-64 Foreign Function Interface @@ -24,8 +23,17 @@ WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + + + PORT TO THE INTEL MIC ARCHITECTURE: + EMILIO CASTILLO VILLAR + CRISTOBAL CAMARERO COTERILLO + + UNIVERSITY OF CANTABRIA + SPAIN ----------------------------------------------------------------------- */ +/ #ifdef __x86_64__ #define LIBFFI_ASM #include @@ -70,7 +78,7 @@ ffi_call_unix64: .Lret_from_load_sse: /* Deallocate the reg arg area. */ - leaq 176(%r10), %rsp + leaq 560(%r10), %rsp /* Call the user function. */ call *%r11 @@ -146,11 +154,20 @@ ffi_call_unix64: .align 2 .Lst_float: - movss %xmm0, (%rdi) + + movl $1, %eax + kmov %eax, %k1 + vpackstorelps %zmm0, (%rdi){%k1} + vpackstorehps %zmm0, 64(%rdi){%k1} + /*movss %xmm0, (%rdi)*/ ret .align 2 .Lst_double: - movsd %xmm0, (%rdi) + movl $1, %eax + kmov %eax, %k1 + vpackstorelpd %zmm0, (%rdi){%k1} + vpackstorehpd %zmm0, 64(%rdi){%k1} + /*movsd %xmm0, (%rdi)*/ ret .Lst_ldouble: fstpt (%rdi) @@ -165,16 +182,39 @@ ffi_call_unix64: value to a 16 byte scratch area first. Bits 8, 9, and 10 control where the values are located. Only one of the three bits will be set; see ffi_prep_cif_machdep for the pattern. 
*/ - movd %xmm0, %r10 - movd %xmm1, %r11 + + + movq %rax, %r10 + movl $1, %eax + kmov %eax, %k1 + movq %r10, %rax + + vpackstorelpd %zmm0, -200(%rsp){%k1} + vpackstorehpd %zmm0, -136(%rsp){%k1} + movq -200(%rsp), %r10 + + + vpackstorelpd %zmm1, -200(%rsp){%k1} + vpackstorehpd %zmm1, -136(%rsp){%k1} + movq -200(%rsp), %r11 + + /*movd %zmm0, %r10 + movd %zmm1, %r11*/ testl $0x100, %ecx - cmovnz %rax, %rdx - cmovnz %r10, %rax + jz .Lst_struct_n1 + movq %rax, %rdx + movq %r10, %rax +.Lst_struct_n1: + testl $0x200, %ecx - cmovnz %r10, %rdx + jz .Lst_struct_n2 + movq %r10, %rdx +.Lst_struct_n2: testl $0x400, %ecx - cmovnz %r10, %rax - cmovnz %r11, %rdx + jz .Lst_struct_n3 + movq %r10, %rax + movq %r11, %rdx +.Lst_struct_n3: movq %rax, (%rsi) movq %rdx, 8(%rsi) @@ -190,14 +230,33 @@ ffi_call_unix64: .align 2 .LUW3: .Lload_sse: - movdqa 48(%r10), %xmm0 - movdqa 64(%r10), %xmm1 - movdqa 80(%r10), %xmm2 - movdqa 96(%r10), %xmm3 - movdqa 112(%r10), %xmm4 - movdqa 128(%r10), %xmm5 - movdqa 144(%r10), %xmm6 - movdqa 160(%r10), %xmm7 + + vloadunpacklq 48(%r10), %zmm0 + vloadunpacklq 112(%r10), %zmm1 + vloadunpacklq 176(%r10), %zmm2 + vloadunpacklq 240(%r10), %zmm3 + vloadunpacklq 304(%r10), %zmm4 + vloadunpacklq 368(%r10), %zmm5 + vloadunpacklq 432(%r10), %zmm6 + vloadunpacklq 496(%r10), %zmm7 + + vloadunpackhq 112(%r10), %zmm0 + vloadunpackhq 176(%r10), %zmm1 + vloadunpackhq 240(%r10), %zmm2 + vloadunpackhq 304(%r10), %zmm3 + vloadunpackhq 368(%r10), %zmm4 + vloadunpackhq 432(%r10), %zmm5 + vloadunpackhq 496(%r10), %zmm6 + vloadunpackhq 560(%r10), %zmm7 + + /*vmovaps 48(%r10), %zmm0 + vmovaps 112(%r10), %zmm1 + vmovaps 176(%r10), %zmm2 + vmovaps 240(%r10), %zmm3 + vmovaps 304(%r10), %zmm4 + vmovaps 368(%r10), %zmm5 + vmovaps 432(%r10), %zmm6 + vmovaps 496(%r10), %zmm7*/ jmp .Lret_from_load_sse .LUW4: @@ -211,7 +270,7 @@ ffi_closure_unix64: .LUW5: /* The carry flag is set by the trampoline iff SSE registers are used. Don't clobber it before the branch instruction. */ - leaq -200(%rsp), %rsp + leaq -584(%rsp), %rsp .LUW6: movq %rdi, (%rsp) movq %rsi, 8(%rsp) @@ -223,13 +282,13 @@ ffi_closure_unix64: .Lret_from_save_sse: movq %r10, %rdi - leaq 176(%rsp), %rsi + leaq 560(%rsp), %rsi movq %rsp, %rdx - leaq 208(%rsp), %rcx + leaq 592(%rsp), %rcx call ffi_closure_unix64_inner@PLT /* Deallocate stack frame early; return value is now in redzone. */ - addq $200, %rsp + addq $584, %rsp .LUW7: /* The first byte of the return value contains the FFI_TYPE. */ @@ -279,11 +338,13 @@ ffi_closure_unix64: .align 2 .Lld_float: - movss -24(%rsp), %xmm0 + vbroadcastss -24(%rsp), %zmm0 + /*movss -24(%rsp), %xmm0*/ ret .align 2 .Lld_double: - movsd -24(%rsp), %xmm0 + vbroadcastsd -24(%rsp), %zmm0 + /*movsd -24(%rsp), %xmm0*/ ret .align 2 .Lld_ldouble: @@ -299,40 +360,61 @@ ffi_closure_unix64: that rax gets the second word. */ movq -24(%rsp), %rcx movq -16(%rsp), %rdx - movq -16(%rsp), %xmm1 + vbroadcastsd -16(%rsp), %zmm1 + /*movq -16(%rsp), %xmm1*/ testl $0x100, %eax - cmovnz %rdx, %rcx - movd %rcx, %xmm0 - testl $0x200, %eax + jz .Lld_struct_1 + + movq %rdx, %rcx +.Lld_struct_1: + subq $8, %rsp + movq %rcx, (%rsp) + addq $8, %rsp + vbroadcastss (%rsp), %zmm0 + + /*movd %rcx, %zmm0*/ movq -24(%rsp), %rax - cmovnz %rdx, %rax + testl $0x200, %eax + jz .Lld_struct_2 + movq %rdx, %rax +.Lld_struct_2: ret /* See the comment above .Lload_sse; the same logic applies here. 
*/ .align 2 .LUW8: .Lsave_sse: - movdqa %xmm0, 48(%rsp) - movdqa %xmm1, 64(%rsp) - movdqa %xmm2, 80(%rsp) - movdqa %xmm3, 96(%rsp) - movdqa %xmm4, 112(%rsp) - movdqa %xmm5, 128(%rsp) - movdqa %xmm6, 144(%rsp) - movdqa %xmm7, 160(%rsp) + vpackstorelq %zmm0, 48(%rsp) + vpackstorelq %zmm1, 112(%rsp) + vpackstorelq %zmm2, 176(%rsp) + vpackstorelq %zmm3, 240(%rsp) + vpackstorelq %zmm4, 304(%rsp) + vpackstorelq %zmm5, 368(%rsp) + vpackstorelq %zmm6, 432(%rsp) + vpackstorelq %zmm7, 496(%rsp) + + vpackstorehq %zmm0, 112(%rsp) + vpackstorehq %zmm1, 176(%rsp) + vpackstorehq %zmm2, 240(%rsp) + vpackstorehq %zmm3, 304(%rsp) + vpackstorehq %zmm4, 368(%rsp) + vpackstorehq %zmm5, 432(%rsp) + vpackstorehq %zmm6, 496(%rsp) + vpackstorehq %zmm7, 560(%rsp) + /*vmovaps %zmm0, 48(%rsp) + vmovaps %zmm1, 112(%rsp) + vmovaps %zmm2, 176(%rsp) + vmovaps %zmm3, 240(%rsp) + vmovaps %zmm4, 304(%rsp) + vmovaps %zmm5, 368(%rsp) + vmovaps %zmm6, 432(%rsp) + vmovaps %zmm7, 496(%rsp) */ jmp .Lret_from_save_sse .LUW9: .size ffi_closure_unix64,.-ffi_closure_unix64 -#ifdef __GNUC__ -/* Only emit DWARF unwind info when building with the GNU toolchain. */ - -#ifdef HAVE_AS_X86_64_UNWIND_SECTION_TYPE - .section .eh_frame,"a",@unwind -#else .section .eh_frame,"a",@progbits -#endif .Lframe1: .long .LECIE1-.LSCIE1 /* CIE Length */ .LSCIE1: @@ -366,7 +448,7 @@ ffi_closure_unix64: .byte 0x4 /* DW_CFA_advance_loc4 */ .long .LUW1-.LUW0 - /* New stack frame based off rbp. This is an itty bit of unwind + /* New stack frame based off rbp. This is a itty bit of unwind trickery in that the CFA *has* changed. There is no easy way to describe it correctly on entry to the function. Fortunately, it doesn't matter too much since at all points we can correctly @@ -423,8 +505,6 @@ ffi_closure_unix64: .align 8 .LEFDE3: -#endif /* __GNUC__ */ - #endif /* __x86_64__ */ #if defined __ELF__ && defined __linux__ -- 2.5.3.windows.1
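
Note on the new register-save area sizes (a size check derived from the constants in the patch, not part of the patch itself): with the 16-byte SSE slots widened to 64-byte ZMM images, the register-argument block grows from 6*8 + 8*16 = 176 bytes to 6*8 + 8*64 = 560 bytes, which is the constant now used both for the reg-arg area in ffi_call_unix64 and for the ZMM slot offsets 48, 112, ..., 496. The closure frame likewise grows from 200 to 584 bytes, keeping the same 24 bytes of return-value scratch and padding on top. A minimal, self-contained C sketch of that layout (type and field names illustrative, not taken verbatim from the patch):

    #include <assert.h>
    #include <stdint.h>

    #define MAX_GPR_REGS 6
    #define MAX_SSE_REGS 8

    /* 64-byte image of one k1om ZMM register; the patch spells this
       "typedef struct { int64_t m[8]; } __int512_t".  */
    typedef struct { int64_t m[8]; } zmm_image;

    struct register_args
    {
      uint64_t  gpr[MAX_GPR_REGS];   /* offsets   0..47                    */
      zmm_image sse[MAX_SSE_REGS];   /* offsets  48..559, 64 bytes per slot */
    };

    int main (void)
    {
      /* 6*8 + 8*64 = 560: the reg-arg area size used in unix64.S
         (the old SSE layout was 6*8 + 8*16 = 176).  */
      assert (sizeof (struct register_args) == 560);
      /* 560 plus 24 bytes of return-value scratch and padding gives the
         584-byte closure frame (previously 176 + 24 = 200).  */
      assert (sizeof (struct register_args) + 24 == 584);
      return 0;
    }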
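
The unrolled .m[0] ... .m[7] stores in ffi_call replace the old single i32/i64 writes: each SSE-class argument now occupies a full 64-byte slot whose upper lanes are cleared, presumably so the full-width vloadunpacklq/vloadunpackhq pair in unix64.S never carries stale data into the upper ZMM lanes. A sketch of the same operation under that assumption (store_sse_arg is a hypothetical helper, not code from the patch):

    #include <stdint.h>
    #include <string.h>

    /* Zero one 64-byte ZMM slot, then copy the scalar into its low bytes:
       4 bytes for FFI_TYPE_FLOAT, 8 bytes for FFI_TYPE_DOUBLE.  */
    void
    store_sse_arg (int64_t slot[8], const void *value, size_t size)
    {
      memset (slot, 0, 8 * sizeof slot[0]);       /* clear all eight quadwords   */
      memcpy (slot, value, size < 8 ? size : 8);  /* value lives in the low lane */
    }

In the patch the same effect is written out long-hand as reg_args->sse[ssecount].m[0] = *(UINT64 *) a; followed by explicit zeroing of m[1] through m[7].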
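
The .Lst_struct and .Lld_struct hunks drop cmovnz in favour of test/jz/mov sequences, presumably because the k1om core or toolchain does not accept CMOVcc; the selection logic itself is unchanged. As a rough C model of what both the old cmov version and the new branchy version compute (flag bits 0x100/0x200/0x400 come from ffi_prep_cif_machdep; rax/rdx carry the integer return halves, r10/r11 the low quadwords of zmm0/zmm1; this is an illustration, not the patch's code):

    #include <stdint.h>

    /* Small-struct return shuffle from .Lst_struct: choose which two 64-bit
       halves are written back to the 16-byte scratch area.  */
    void
    st_struct_model (uint64_t rax, uint64_t rdx, uint64_t r10, uint64_t r11,
                     unsigned flags, uint64_t out[2])
    {
      if (flags & 0x100) { rdx = rax; rax = r10; }   /* SSE half first, INT half second */
      if (flags & 0x200) { rdx = r10; }              /* INT half first, SSE half second */
      if (flags & 0x400) { rax = r10; rdx = r11; }   /* both halves in SSE registers    */
      out[0] = rax;
      out[1] = rdx;
    }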