diff --git a/i686-linux-gnu-4.7/usr/lib/gcc/x86_64-linux-gnu/4.7/include/avx2intrin.h b/i686-linux-gnu-4.7/usr/lib/gcc/x86_64-linux-gnu/4.7/include/avx2intrin.h
new file mode 100644
index 0000000..cebd9a2
--- /dev/null
+++ b/i686-linux-gnu-4.7/usr/lib/gcc/x86_64-linux-gnu/4.7/include/avx2intrin.h
@@ -0,0 +1,1874 @@
+/* Copyright (C) 2011
+   Free Software Foundation, Inc.
+
+   This file is part of GCC.
+
+   GCC is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 3, or (at your option)
+   any later version.
+
+   GCC is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   Under Section 7 of GPL version 3, you are granted additional
+   permissions described in the GCC Runtime Library Exception, version
+   3.1, as published by the Free Software Foundation.
+
+   You should have received a copy of the GNU General Public License and
+   a copy of the GCC Runtime Library Exception along with this program;
+   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#ifndef _IMMINTRIN_H_INCLUDED
+# error "Never use <avx2intrin.h> directly; include <immintrin.h> instead."
+#endif
+
+/* Sum absolute 8-bit integer difference of adjacent groups of 4
+   byte integers in the first 2 operands.  Starting offsets within
+   operands are determined by the 3rd mask operand.  */
+#ifdef __OPTIMIZE__
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mpsadbw_epu8 (__m256i __X, __m256i __Y, const int __M)
+{
+  return (__m256i) __builtin_ia32_mpsadbw256 ((__v32qi)__X,
+                                              (__v32qi)__Y, __M);
+}
+#else
+#define _mm256_mpsadbw_epu8(X, Y, M) \
+  ((__m256i) __builtin_ia32_mpsadbw256 ((__v32qi)(__m256i)(X), \
+                                        (__v32qi)(__m256i)(Y), (int)(M)))
+#endif
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_abs_epi8 (__m256i __A)
+{
+  return (__m256i)__builtin_ia32_pabsb256 ((__v32qi)__A);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_abs_epi16 (__m256i __A)
+{
+  return (__m256i)__builtin_ia32_pabsw256 ((__v16hi)__A);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_abs_epi32 (__m256i __A)
+{
+  return (__m256i)__builtin_ia32_pabsd256 ((__v8si)__A);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_packs_epi32 (__m256i __A, __m256i __B)
+{
+  return (__m256i)__builtin_ia32_packssdw256 ((__v8si)__A, (__v8si)__B);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_packs_epi16 (__m256i __A, __m256i __B)
+{
+  return (__m256i)__builtin_ia32_packsswb256 ((__v16hi)__A, (__v16hi)__B);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_packus_epi32 (__m256i __A, __m256i __B)
+{
+  return (__m256i)__builtin_ia32_packusdw256 ((__v8si)__A, (__v8si)__B);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_packus_epi16 (__m256i __A, __m256i __B)
+{
+  return (__m256i)__builtin_ia32_packuswb256 ((__v16hi)__A, (__v16hi)__B);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_add_epi8 (__m256i __A, __m256i __B)
+{
+  return (__m256i)__builtin_ia32_paddb256 ((__v32qi)__A, (__v32qi)__B);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_add_epi16 (__m256i __A, __m256i __B)
+{
+  return (__m256i)__builtin_ia32_paddw256 ((__v16hi)__A, (__v16hi)__B);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_add_epi32 (__m256i __A, __m256i __B)
+{
+  return (__m256i)__builtin_ia32_paddd256 ((__v8si)__A, (__v8si)__B);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_add_epi64 (__m256i __A, __m256i __B)
+{
+  return (__m256i)__builtin_ia32_paddq256 ((__v4di)__A, (__v4di)__B);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_adds_epi8 (__m256i __A, __m256i __B)
+{
+  return (__m256i)__builtin_ia32_paddsb256 ((__v32qi)__A, (__v32qi)__B);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_adds_epi16 (__m256i __A, __m256i __B)
+{
+  return (__m256i)__builtin_ia32_paddsw256 ((__v16hi)__A, (__v16hi)__B);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_adds_epu8 (__m256i __A, __m256i __B)
+{
+  return (__m256i)__builtin_ia32_paddusb256 ((__v32qi)__A, (__v32qi)__B);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_adds_epu16 (__m256i __A, __m256i __B)
+{
+  return (__m256i)__builtin_ia32_paddusw256 ((__v16hi)__A, (__v16hi)__B);
+}
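Every intrinsic in this header that takes an immediate (here _mm256_mpsadbw_epu8) comes in two forms: a true inline under __OPTIMIZE__ and a macro otherwise, so the constant reaches the builtin as a literal either way. The plain arithmetic intrinsics need no such trick. A small illustrative program of my own (not part of the header; assumes GCC with -mavx2) contrasting wrapping and saturating byte adds:

#include <immintrin.h>
#include <stdio.h>

int
main (void)
{
  unsigned char out_wrap[32], out_sat[32];
  __m256i a = _mm256_set1_epi8 (0x7f);  /* INT8_MAX in every lane */
  __m256i b = _mm256_set1_epi8 (1);
  /* _mm256_add_epi8 wraps 0x7f + 1 around to 0x80, while
     _mm256_adds_epi8 clamps the sum to 0x7f.  */
  _mm256_storeu_si256 ((__m256i *) out_wrap, _mm256_add_epi8 (a, b));
  _mm256_storeu_si256 ((__m256i *) out_sat, _mm256_adds_epi8 (a, b));
  printf ("wrap: 0x%02x  saturate: 0x%02x\n", out_wrap[0], out_sat[0]);
  return 0;
}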
+
+#ifdef __OPTIMIZE__
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_alignr_epi8 (__m256i __A, __m256i __B, const int __N)
+{
+  return (__m256i) __builtin_ia32_palignr256 ((__v4di)__A,
+                                              (__v4di)__B,
+                                              __N * 8);
+}
+#else
+/* In that case (__N * 8) would end up in a vector register and the
+   insn pattern would not match; use a macro instead.  */
+#define _mm256_alignr_epi8(A, B, N) \
+  ((__m256i) __builtin_ia32_palignr256 ((__v4di)(__m256i)(A), \
+                                        (__v4di)(__m256i)(B), \
+                                        (int)(N) * 8))
+#endif
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_and_si256 (__m256i __A, __m256i __B)
+{
+  return (__m256i) __builtin_ia32_andsi256 ((__v4di)__A, (__v4di)__B);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_andnot_si256 (__m256i __A, __m256i __B)
+{
+  return (__m256i) __builtin_ia32_andnotsi256 ((__v4di)__A, (__v4di)__B);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_avg_epu8 (__m256i __A, __m256i __B)
+{
+  return (__m256i)__builtin_ia32_pavgb256 ((__v32qi)__A, (__v32qi)__B);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_avg_epu16 (__m256i __A, __m256i __B)
+{
+  return (__m256i)__builtin_ia32_pavgw256 ((__v16hi)__A, (__v16hi)__B);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_blendv_epi8 (__m256i __X, __m256i __Y, __m256i __M)
+{
+  return (__m256i) __builtin_ia32_pblendvb256 ((__v32qi)__X,
+                                               (__v32qi)__Y,
+                                               (__v32qi)__M);
+}
+
+#ifdef __OPTIMIZE__
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_blend_epi16 (__m256i __X, __m256i __Y, const int __M)
+{
+  return (__m256i) __builtin_ia32_pblendw256 ((__v16hi)__X,
+                                              (__v16hi)__Y,
+                                              __M);
+}
+#else
+#define _mm256_blend_epi16(X, Y, M) \
+  ((__m256i) __builtin_ia32_pblendw256 ((__v16hi)(__m256i)(X), \
+                                        (__v16hi)(__m256i)(Y), (int)(M)))
+#endif
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_cmpeq_epi8 (__m256i __A, __m256i __B)
+{
+  return (__m256i)__builtin_ia32_pcmpeqb256 ((__v32qi)__A, (__v32qi)__B);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_cmpeq_epi16 (__m256i __A, __m256i __B)
+{
+  return (__m256i)__builtin_ia32_pcmpeqw256 ((__v16hi)__A, (__v16hi)__B);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_cmpeq_epi32 (__m256i __A, __m256i __B)
+{
+  return (__m256i)__builtin_ia32_pcmpeqd256 ((__v8si)__A, (__v8si)__B);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_cmpeq_epi64 (__m256i __A, __m256i __B)
+{
+  return (__m256i)__builtin_ia32_pcmpeqq256 ((__v4di)__A, (__v4di)__B);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_cmpgt_epi8 (__m256i __A, __m256i __B)
+{
+  return (__m256i)__builtin_ia32_pcmpgtb256 ((__v32qi)__A,
+                                             (__v32qi)__B);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_cmpgt_epi16 (__m256i __A, __m256i __B)
+{
+  return (__m256i)__builtin_ia32_pcmpgtw256 ((__v16hi)__A,
+                                             (__v16hi)__B);
+}
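_mm256_alignr_epi8 above is the same story: the byte count must be a literal because the builtin wants an immediate (the macro converts bytes to bits with N * 8). One AVX2 subtlety worth a sketch: VPALIGNR concatenates and shifts within each 128-bit lane, not across the whole register. Illustrative helper, assuming -mavx2:

#include <immintrin.h>

/* Drops the low 4 bytes of each 128-bit lane of LO and shifts in the
   low bytes of the matching lane of HI -- a per-lane sliding window,
   not a 256-bit one.  */
static inline __m256i
slide_window4 (__m256i hi, __m256i lo)
{
  return _mm256_alignr_epi8 (hi, lo, 4);
}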
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_cmpgt_epi32 (__m256i __A, __m256i __B)
+{
+  return (__m256i)__builtin_ia32_pcmpgtd256 ((__v8si)__A,
+                                             (__v8si)__B);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_cmpgt_epi64 (__m256i __A, __m256i __B)
+{
+  return (__m256i)__builtin_ia32_pcmpgtq256 ((__v4di)__A, (__v4di)__B);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_hadd_epi16 (__m256i __X, __m256i __Y)
+{
+  return (__m256i) __builtin_ia32_phaddw256 ((__v16hi)__X,
+                                             (__v16hi)__Y);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_hadd_epi32 (__m256i __X, __m256i __Y)
+{
+  return (__m256i) __builtin_ia32_phaddd256 ((__v8si)__X, (__v8si)__Y);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_hadds_epi16 (__m256i __X, __m256i __Y)
+{
+  return (__m256i) __builtin_ia32_phaddsw256 ((__v16hi)__X,
+                                              (__v16hi)__Y);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_hsub_epi16 (__m256i __X, __m256i __Y)
+{
+  return (__m256i) __builtin_ia32_phsubw256 ((__v16hi)__X,
+                                             (__v16hi)__Y);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_hsub_epi32 (__m256i __X, __m256i __Y)
+{
+  return (__m256i) __builtin_ia32_phsubd256 ((__v8si)__X, (__v8si)__Y);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_hsubs_epi16 (__m256i __X, __m256i __Y)
+{
+  return (__m256i) __builtin_ia32_phsubsw256 ((__v16hi)__X,
+                                              (__v16hi)__Y);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maddubs_epi16 (__m256i __X, __m256i __Y)
+{
+  return (__m256i) __builtin_ia32_pmaddubsw256 ((__v32qi)__X,
+                                                (__v32qi)__Y);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_madd_epi16 (__m256i __A, __m256i __B)
+{
+  return (__m256i)__builtin_ia32_pmaddwd256 ((__v16hi)__A,
+                                             (__v16hi)__B);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_max_epi8 (__m256i __A, __m256i __B)
+{
+  return (__m256i)__builtin_ia32_pmaxsb256 ((__v32qi)__A, (__v32qi)__B);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_max_epi16 (__m256i __A, __m256i __B)
+{
+  return (__m256i)__builtin_ia32_pmaxsw256 ((__v16hi)__A, (__v16hi)__B);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_max_epi32 (__m256i __A, __m256i __B)
+{
+  return (__m256i)__builtin_ia32_pmaxsd256 ((__v8si)__A, (__v8si)__B);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_max_epu8 (__m256i __A, __m256i __B)
+{
+  return (__m256i)__builtin_ia32_pmaxub256 ((__v32qi)__A, (__v32qi)__B);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_max_epu16 (__m256i __A, __m256i __B)
+{
+  return (__m256i)__builtin_ia32_pmaxuw256 ((__v16hi)__A, (__v16hi)__B);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_max_epu32 (__m256i __A, __m256i __B)
+{
+  return (__m256i)__builtin_ia32_pmaxud256 ((__v8si)__A, (__v8si)__B);
+}
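The horizontal and multiply-add group condenses pairs of elements: for instance _mm256_madd_epi16 multiplies corresponding 16-bit elements and sums adjacent product pairs into eight 32-bit results, one step of an integer dot product. A hypothetical demo of mine (not from the header), compiled with -mavx2:

#include <immintrin.h>
#include <stdio.h>

int
main (void)
{
  short a[16] = { 1, 2, 3, 4, 5, 6, 7, 8, 1, 1, 1, 1, 1, 1, 1, 1 };
  short b[16] = { 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2 };
  int out[8];
  int i;
  __m256i va = _mm256_loadu_si256 ((const __m256i *) a);
  __m256i vb = _mm256_loadu_si256 ((const __m256i *) b);
  /* Each 32-bit lane receives a[2i]*b[2i] + a[2i+1]*b[2i+1].  */
  _mm256_storeu_si256 ((__m256i *) out, _mm256_madd_epi16 (va, vb));
  for (i = 0; i < 8; i++)
    printf ("%d ", out[i]);     /* prints: 3 7 11 15 4 4 4 4 */
  printf ("\n");
  return 0;
}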
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_min_epi8 (__m256i __A, __m256i __B)
+{
+  return (__m256i)__builtin_ia32_pminsb256 ((__v32qi)__A, (__v32qi)__B);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_min_epi16 (__m256i __A, __m256i __B)
+{
+  return (__m256i)__builtin_ia32_pminsw256 ((__v16hi)__A, (__v16hi)__B);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_min_epi32 (__m256i __A, __m256i __B)
+{
+  return (__m256i)__builtin_ia32_pminsd256 ((__v8si)__A, (__v8si)__B);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_min_epu8 (__m256i __A, __m256i __B)
+{
+  return (__m256i)__builtin_ia32_pminub256 ((__v32qi)__A, (__v32qi)__B);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_min_epu16 (__m256i __A, __m256i __B)
+{
+  return (__m256i)__builtin_ia32_pminuw256 ((__v16hi)__A, (__v16hi)__B);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_min_epu32 (__m256i __A, __m256i __B)
+{
+  return (__m256i)__builtin_ia32_pminud256 ((__v8si)__A, (__v8si)__B);
+}
+
+extern __inline int
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_movemask_epi8 (__m256i __A)
+{
+  return __builtin_ia32_pmovmskb256 ((__v32qi)__A);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_cvtepi8_epi16 (__m128i __X)
+{
+  return (__m256i) __builtin_ia32_pmovsxbw256 ((__v16qi)__X);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_cvtepi8_epi32 (__m128i __X)
+{
+  return (__m256i) __builtin_ia32_pmovsxbd256 ((__v16qi)__X);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_cvtepi8_epi64 (__m128i __X)
+{
+  return (__m256i) __builtin_ia32_pmovsxbq256 ((__v16qi)__X);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_cvtepi16_epi32 (__m128i __X)
+{
+  return (__m256i) __builtin_ia32_pmovsxwd256 ((__v8hi)__X);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_cvtepi16_epi64 (__m128i __X)
+{
+  return (__m256i) __builtin_ia32_pmovsxwq256 ((__v8hi)__X);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_cvtepi32_epi64 (__m128i __X)
+{
+  return (__m256i) __builtin_ia32_pmovsxdq256 ((__v4si)__X);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_cvtepu8_epi16 (__m128i __X)
+{
+  return (__m256i) __builtin_ia32_pmovzxbw256 ((__v16qi)__X);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_cvtepu8_epi32 (__m128i __X)
+{
+  return (__m256i) __builtin_ia32_pmovzxbd256 ((__v16qi)__X);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_cvtepu8_epi64 (__m128i __X)
+{
+  return (__m256i) __builtin_ia32_pmovzxbq256 ((__v16qi)__X);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_cvtepu16_epi32 (__m128i __X)
+{
+  return (__m256i) __builtin_ia32_pmovzxwd256 ((__v8hi)__X);
+}
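_mm256_movemask_epi8 collapses the sign bit of each of the 32 bytes into an int, which pairs naturally with the byte compares above. A hedged example (not from the header): locating a byte in a 32-byte block by comparing, taking the mask, and counting trailing zeros. Assumes GCC (for __builtin_ctz) and -mavx2.

#include <immintrin.h>
#include <stdio.h>
#include <string.h>

int
main (void)
{
  char buf[32];
  memset (buf, 'a', sizeof buf);
  buf[13] = 'x';
  __m256i block = _mm256_loadu_si256 ((const __m256i *) buf);
  __m256i hits = _mm256_cmpeq_epi8 (block, _mm256_set1_epi8 ('x'));
  int mask = _mm256_movemask_epi8 (hits);
  if (mask != 0)
    printf ("first 'x' at offset %d\n", __builtin_ctz (mask));
  return 0;
}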
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_cvtepu16_epi64 (__m128i __X)
+{
+  return (__m256i) __builtin_ia32_pmovzxwq256 ((__v8hi)__X);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_cvtepu32_epi64 (__m128i __X)
+{
+  return (__m256i) __builtin_ia32_pmovzxdq256 ((__v4si)__X);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mul_epi32 (__m256i __X, __m256i __Y)
+{
+  return (__m256i) __builtin_ia32_pmuldq256 ((__v8si)__X, (__v8si)__Y);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mulhrs_epi16 (__m256i __X, __m256i __Y)
+{
+  return (__m256i) __builtin_ia32_pmulhrsw256 ((__v16hi)__X,
+                                               (__v16hi)__Y);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mulhi_epu16 (__m256i __A, __m256i __B)
+{
+  return (__m256i)__builtin_ia32_pmulhuw256 ((__v16hi)__A, (__v16hi)__B);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mulhi_epi16 (__m256i __A, __m256i __B)
+{
+  return (__m256i)__builtin_ia32_pmulhw256 ((__v16hi)__A, (__v16hi)__B);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mullo_epi16 (__m256i __A, __m256i __B)
+{
+  return (__m256i)__builtin_ia32_pmullw256 ((__v16hi)__A, (__v16hi)__B);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mullo_epi32 (__m256i __A, __m256i __B)
+{
+  return (__m256i)__builtin_ia32_pmulld256 ((__v8si)__A, (__v8si)__B);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mul_epu32 (__m256i __A, __m256i __B)
+{
+  return (__m256i)__builtin_ia32_pmuludq256 ((__v8si)__A, (__v8si)__B);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_or_si256 (__m256i __A, __m256i __B)
+{
+  return (__m256i)__builtin_ia32_por256 ((__v4di)__A, (__v4di)__B);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_sad_epu8 (__m256i __A, __m256i __B)
+{
+  return (__m256i)__builtin_ia32_psadbw256 ((__v32qi)__A, (__v32qi)__B);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_shuffle_epi8 (__m256i __X, __m256i __Y)
+{
+  return (__m256i) __builtin_ia32_pshufb256 ((__v32qi)__X,
+                                             (__v32qi)__Y);
+}
+
+#ifdef __OPTIMIZE__
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_shuffle_epi32 (__m256i __A, const int __mask)
+{
+  return (__m256i)__builtin_ia32_pshufd256 ((__v8si)__A, __mask);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_shufflehi_epi16 (__m256i __A, const int __mask)
+{
+  return (__m256i)__builtin_ia32_pshufhw256 ((__v16hi)__A, __mask);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_shufflelo_epi16 (__m256i __A, const int __mask)
+{
+  return (__m256i)__builtin_ia32_pshuflw256 ((__v16hi)__A, __mask);
+}
+#else
+#define _mm256_shuffle_epi32(A, N) \
+  ((__m256i)__builtin_ia32_pshufd256 ((__v8si)(__m256i)(A), (int)(N)))
+#define _mm256_shufflehi_epi16(A, N) \
+  ((__m256i)__builtin_ia32_pshufhw256 ((__v16hi)(__m256i)(A), (int)(N)))
+#define _mm256_shufflelo_epi16(A, N) \
+  ((__m256i)__builtin_ia32_pshuflw256 ((__v16hi)(__m256i)(A), (int)(N)))
+#endif
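The shuffle immediates are the usual 8-bit selector bytes; _MM_SHUFFLE from xmmintrin.h composes them. As with palignr, the selector applies to each 128-bit lane separately. A sketch of my own, assuming -mavx2:

#include <immintrin.h>

/* Reverses the four 32-bit elements inside each 128-bit lane; the
   two lanes themselves stay in place.  */
static inline __m256i
reverse_within_lanes (__m256i v)
{
  return _mm256_shuffle_epi32 (v, _MM_SHUFFLE (0, 1, 2, 3));
}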
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_sign_epi8 (__m256i __X, __m256i __Y)
+{
+  return (__m256i) __builtin_ia32_psignb256 ((__v32qi)__X, (__v32qi)__Y);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_sign_epi16 (__m256i __X, __m256i __Y)
+{
+  return (__m256i) __builtin_ia32_psignw256 ((__v16hi)__X, (__v16hi)__Y);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_sign_epi32 (__m256i __X, __m256i __Y)
+{
+  return (__m256i) __builtin_ia32_psignd256 ((__v8si)__X, (__v8si)__Y);
+}
+
+#ifdef __OPTIMIZE__
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_slli_si256 (__m256i __A, const int __N)
+{
+  return (__m256i)__builtin_ia32_pslldqi256 (__A, __N * 8);
+}
+#else
+#define _mm256_slli_si256(A, N) \
+  ((__m256i)__builtin_ia32_pslldqi256 ((__m256i)(A), (int)(N) * 8))
+#endif
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_slli_epi16 (__m256i __A, int __B)
+{
+  return (__m256i)__builtin_ia32_psllwi256 ((__v16hi)__A, __B);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_sll_epi16 (__m256i __A, __m128i __B)
+{
+  return (__m256i)__builtin_ia32_psllw256 ((__v16hi)__A, (__v8hi)__B);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_slli_epi32 (__m256i __A, int __B)
+{
+  return (__m256i)__builtin_ia32_pslldi256 ((__v8si)__A, __B);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_sll_epi32 (__m256i __A, __m128i __B)
+{
+  return (__m256i)__builtin_ia32_pslld256 ((__v8si)__A, (__v4si)__B);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_slli_epi64 (__m256i __A, int __B)
+{
+  return (__m256i)__builtin_ia32_psllqi256 ((__v4di)__A, __B);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_sll_epi64 (__m256i __A, __m128i __B)
+{
+  return (__m256i)__builtin_ia32_psllq256 ((__v4di)__A, (__v2di)__B);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_srai_epi16 (__m256i __A, int __B)
+{
+  return (__m256i)__builtin_ia32_psrawi256 ((__v16hi)__A, __B);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_sra_epi16 (__m256i __A, __m128i __B)
+{
+  return (__m256i)__builtin_ia32_psraw256 ((__v16hi)__A, (__v8hi)__B);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_srai_epi32 (__m256i __A, int __B)
+{
+  return (__m256i)__builtin_ia32_psradi256 ((__v8si)__A, __B);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_sra_epi32 (__m256i __A, __m128i __B)
+{
+  return (__m256i)__builtin_ia32_psrad256 ((__v8si)__A, (__v4si)__B);
+}
+
+#ifdef __OPTIMIZE__
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_srli_si256 (__m256i __A, const int __N)
+{
+  return (__m256i)__builtin_ia32_psrldqi256 (__A, __N * 8);
+}
+#else
+#define _mm256_srli_si256(A, N) \
+  ((__m256i)__builtin_ia32_psrldqi256 ((__m256i)(A), (int)(N) * 8))
+#endif
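Despite the _si256 suffix, the byte-granular shifts just defined (VPSLLDQ/VPSRLDQ) also operate per 128-bit lane, and the macro's N * 8 once more turns a byte count into the bit count the builtin expects. Illustrative only:

#include <immintrin.h>

/* Shifts each 128-bit lane left by two bytes; bytes do not carry
   from the low lane into the high lane.  N must be a constant.  */
static inline __m256i
bytewise_shift2 (__m256i v)
{
  return _mm256_slli_si256 (v, 2);
}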
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_srli_epi16 (__m256i __A, int __B)
+{
+  return (__m256i)__builtin_ia32_psrlwi256 ((__v16hi)__A, __B);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_srl_epi16 (__m256i __A, __m128i __B)
+{
+  return (__m256i)__builtin_ia32_psrlw256 ((__v16hi)__A, (__v8hi)__B);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_srli_epi32 (__m256i __A, int __B)
+{
+  return (__m256i)__builtin_ia32_psrldi256 ((__v8si)__A, __B);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_srl_epi32 (__m256i __A, __m128i __B)
+{
+  return (__m256i)__builtin_ia32_psrld256 ((__v8si)__A, (__v4si)__B);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_srli_epi64 (__m256i __A, int __B)
+{
+  return (__m256i)__builtin_ia32_psrlqi256 ((__v4di)__A, __B);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_srl_epi64 (__m256i __A, __m128i __B)
+{
+  return (__m256i)__builtin_ia32_psrlq256 ((__v4di)__A, (__v2di)__B);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_sub_epi8 (__m256i __A, __m256i __B)
+{
+  return (__m256i)__builtin_ia32_psubb256 ((__v32qi)__A, (__v32qi)__B);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_sub_epi16 (__m256i __A, __m256i __B)
+{
+  return (__m256i)__builtin_ia32_psubw256 ((__v16hi)__A, (__v16hi)__B);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_sub_epi32 (__m256i __A, __m256i __B)
+{
+  return (__m256i)__builtin_ia32_psubd256 ((__v8si)__A, (__v8si)__B);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_sub_epi64 (__m256i __A, __m256i __B)
+{
+  return (__m256i)__builtin_ia32_psubq256 ((__v4di)__A, (__v4di)__B);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_subs_epi8 (__m256i __A, __m256i __B)
+{
+  return (__m256i)__builtin_ia32_psubsb256 ((__v32qi)__A, (__v32qi)__B);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_subs_epi16 (__m256i __A, __m256i __B)
+{
+  return (__m256i)__builtin_ia32_psubsw256 ((__v16hi)__A, (__v16hi)__B);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_subs_epu8 (__m256i __A, __m256i __B)
+{
+  return (__m256i)__builtin_ia32_psubusb256 ((__v32qi)__A, (__v32qi)__B);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_subs_epu16 (__m256i __A, __m256i __B)
+{
+  return (__m256i)__builtin_ia32_psubusw256 ((__v16hi)__A, (__v16hi)__B);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_unpackhi_epi8 (__m256i __A, __m256i __B)
+{
+  return (__m256i)__builtin_ia32_punpckhbw256 ((__v32qi)__A, (__v32qi)__B);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_unpackhi_epi16 (__m256i __A, __m256i __B)
+{
+  return (__m256i)__builtin_ia32_punpckhwd256 ((__v16hi)__A, (__v16hi)__B);
+}
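The unpack family interleaves within each 128-bit lane as well. A common use is zero-extending bytes to words by interleaving with zero; a sketch under the same -mavx2 assumption:

#include <immintrin.h>

/* Widens 32 unsigned bytes to 32 unsigned shorts.  Note the results
   come out in lane-local order: LO holds bytes 0-7 and 16-23, HI
   holds bytes 8-15 and 24-31.  */
static inline void
widen_u8_to_u16 (__m256i v, __m256i *lo, __m256i *hi)
{
  __m256i zero = _mm256_setzero_si256 ();
  *lo = _mm256_unpacklo_epi8 (v, zero);
  *hi = _mm256_unpackhi_epi8 (v, zero);
}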
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_unpackhi_epi32 (__m256i __A, __m256i __B)
+{
+  return (__m256i)__builtin_ia32_punpckhdq256 ((__v8si)__A, (__v8si)__B);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_unpackhi_epi64 (__m256i __A, __m256i __B)
+{
+  return (__m256i)__builtin_ia32_punpckhqdq256 ((__v4di)__A, (__v4di)__B);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_unpacklo_epi8 (__m256i __A, __m256i __B)
+{
+  return (__m256i)__builtin_ia32_punpcklbw256 ((__v32qi)__A, (__v32qi)__B);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_unpacklo_epi16 (__m256i __A, __m256i __B)
+{
+  return (__m256i)__builtin_ia32_punpcklwd256 ((__v16hi)__A, (__v16hi)__B);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_unpacklo_epi32 (__m256i __A, __m256i __B)
+{
+  return (__m256i)__builtin_ia32_punpckldq256 ((__v8si)__A, (__v8si)__B);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_unpacklo_epi64 (__m256i __A, __m256i __B)
+{
+  return (__m256i)__builtin_ia32_punpcklqdq256 ((__v4di)__A, (__v4di)__B);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_xor_si256 (__m256i __A, __m256i __B)
+{
+  return (__m256i)__builtin_ia32_pxor256 ((__v4di)__A, (__v4di)__B);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_stream_load_si256 (__m256i const *__X)
+{
+  return (__m256i) __builtin_ia32_movntdqa256 ((__v4di *) __X);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_broadcastss_ps (__m128 __X)
+{
+  return (__m128) __builtin_ia32_vbroadcastss_ps ((__v4sf)__X);
+}
+
+extern __inline __m256
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_broadcastss_ps (__m128 __X)
+{
+  return (__m256) __builtin_ia32_vbroadcastss_ps256 ((__v4sf)__X);
+}
+
+extern __inline __m256d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_broadcastsd_pd (__m128d __X)
+{
+  return (__m256d) __builtin_ia32_vbroadcastsd_pd256 ((__v2df)__X);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_broadcastsi128_si256 (__m128i __X)
+{
+  return (__m256i) __builtin_ia32_vbroadcastsi256 ((__v2di)__X);
+}
+
+#ifdef __OPTIMIZE__
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_blend_epi32 (__m128i __X, __m128i __Y, const int __M)
+{
+  return (__m128i) __builtin_ia32_pblendd128 ((__v4si)__X,
+                                              (__v4si)__Y,
+                                              __M);
+}
+#else
+#define _mm_blend_epi32(X, Y, M) \
+  ((__m128i) __builtin_ia32_pblendd128 ((__v4si)(__m128i)(X), \
+                                        (__v4si)(__m128i)(Y), (int)(M)))
+#endif
+
+#ifdef __OPTIMIZE__
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_blend_epi32 (__m256i __X, __m256i __Y, const int __M)
+{
+  return (__m256i) __builtin_ia32_pblendd256 ((__v8si)__X,
+                                              (__v8si)__Y,
+                                              __M);
+}
+#else
+#define _mm256_blend_epi32(X, Y, M) \
+  ((__m256i) __builtin_ia32_pblendd256 ((__v8si)(__m256i)(X), \
+                                        (__v8si)(__m256i)(Y), (int)(M)))
+#endif
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_broadcastb_epi8 (__m128i __X)
+{
+  return (__m256i) __builtin_ia32_pbroadcastb256 ((__v16qi)__X);
+}
+
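One usage note on _mm256_stream_load_si256 above: the source pointer must be 32-byte aligned. A minimal, hypothetical harness (posix_memalign is just one way to obtain aligned storage):

#include <immintrin.h>
#include <stdlib.h>
#include <string.h>

int
main (void)
{
  void *p;
  __m256i v;
  if (posix_memalign (&p, 32, 4096) != 0)
    return 1;
  memset (p, 0, 4096);
  /* VMOVNTDQA needs 32-byte alignment; the non-temporal hint only
     pays off on write-combining memory, and on ordinary cacheable
     memory it behaves like a plain aligned load.  */
  v = _mm256_stream_load_si256 ((__m256i const *) p);
  (void) v;
  free (p);
  return 0;
}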
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_broadcastw_epi16 (__m128i __X)
+{
+  return (__m256i) __builtin_ia32_pbroadcastw256 ((__v8hi)__X);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_broadcastd_epi32 (__m128i __X)
+{
+  return (__m256i) __builtin_ia32_pbroadcastd256 ((__v4si)__X);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_broadcastq_epi64 (__m128i __X)
+{
+  return (__m256i) __builtin_ia32_pbroadcastq256 ((__v2di)__X);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_broadcastb_epi8 (__m128i __X)
+{
+  return (__m128i) __builtin_ia32_pbroadcastb128 ((__v16qi)__X);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_broadcastw_epi16 (__m128i __X)
+{
+  return (__m128i) __builtin_ia32_pbroadcastw128 ((__v8hi)__X);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_broadcastd_epi32 (__m128i __X)
+{
+  return (__m128i) __builtin_ia32_pbroadcastd128 ((__v4si)__X);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_broadcastq_epi64 (__m128i __X)
+{
+  return (__m128i) __builtin_ia32_pbroadcastq128 ((__v2di)__X);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_permutevar8x32_epi32 (__m256i __X, __m256i __Y)
+{
+  return (__m256i) __builtin_ia32_permvarsi256 ((__v8si)__X, (__v8si)__Y);
+}
+
+#ifdef __OPTIMIZE__
+extern __inline __m256d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_permute4x64_pd (__m256d __X, const int __M)
+{
+  return (__m256d) __builtin_ia32_permdf256 ((__v4df)__X, __M);
+}
+#else
+#define _mm256_permute4x64_pd(X, M) \
+  ((__m256d) __builtin_ia32_permdf256 ((__v4df)(__m256d)(X), (int)(M)))
+#endif
+
+extern __inline __m256
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_permutevar8x32_ps (__m256 __X, __m256i __Y)
+{
+  return (__m256) __builtin_ia32_permvarsf256 ((__v8sf)__X, (__v8si)__Y);
+}
+
+#ifdef __OPTIMIZE__
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_permute4x64_epi64 (__m256i __X, const int __M)
+{
+  return (__m256i) __builtin_ia32_permdi256 ((__v4di)__X, __M);
+}
+#else
+#define _mm256_permute4x64_epi64(X, M) \
+  ((__m256i) __builtin_ia32_permdi256 ((__v4di)(__m256i)(X), (int)(M)))
+#endif
+
+
+#ifdef __OPTIMIZE__
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_permute2x128_si256 (__m256i __X, __m256i __Y, const int __M)
+{
+  return (__m256i) __builtin_ia32_permti256 ((__v4di)__X, (__v4di)__Y, __M);
+}
+#else
+#define _mm256_permute2x128_si256(X, Y, M) \
+  ((__m256i) __builtin_ia32_permti256 ((__v4di)(__m256i)(X), \
+                                       (__v4di)(__m256i)(Y), (int)(M)))
+#endif
+
+#ifdef __OPTIMIZE__
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_extracti128_si256 (__m256i __X, const int __M)
+{
+  return (__m128i) __builtin_ia32_extract128i256 ((__v4di)__X, __M);
+}
+#else
+#define _mm256_extracti128_si256(X, M) \
+  ((__m128i) __builtin_ia32_extract128i256 ((__v4di)(__m256i)(X), (int)(M)))
+#endif
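_mm256_permute4x64_epi64 and _mm256_permute4x64_pd are the cross-lane counterparts of the earlier shuffles: each 2-bit field of the immediate selects a 64-bit element from anywhere in the register. An illustrative helper, again assuming -mavx2:

#include <immintrin.h>

/* Fully reverses the four 64-bit elements -- something no in-lane
   shuffle can do in a single step.  */
static inline __m256i
reverse_epi64 (__m256i v)
{
  return _mm256_permute4x64_epi64 (v, _MM_SHUFFLE (0, 1, 2, 3));
}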
+
+#ifdef __OPTIMIZE__
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_inserti128_si256 (__m256i __X, __m128i __Y, const int __M)
+{
+  return (__m256i) __builtin_ia32_insert128i256 ((__v4di)__X, (__v2di)__Y, __M);
+}
+#else
+#define _mm256_inserti128_si256(X, Y, M) \
+  ((__m256i) __builtin_ia32_insert128i256 ((__v4di)(__m256i)(X), \
+                                           (__v2di)(__m128i)(Y), \
+                                           (int)(M)))
+#endif
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskload_epi32 (int const *__X, __m256i __M)
+{
+  return (__m256i) __builtin_ia32_maskloadd256 ((const __v8si *)__X,
+                                                (__v8si)__M);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskload_epi64 (long long const *__X, __m256i __M)
+{
+  return (__m256i) __builtin_ia32_maskloadq256 ((const __v4di *)__X,
+                                                (__v4di)__M);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskload_epi32 (int const *__X, __m128i __M)
+{
+  return (__m128i) __builtin_ia32_maskloadd ((const __v4si *)__X,
+                                             (__v4si)__M);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskload_epi64 (long long const *__X, __m128i __M)
+{
+  return (__m128i) __builtin_ia32_maskloadq ((const __v2di *)__X,
+                                             (__v2di)__M);
+}
+
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskstore_epi32 (int *__X, __m256i __M, __m256i __Y)
+{
+  __builtin_ia32_maskstored256 ((__v8si *)__X, (__v8si)__M, (__v8si)__Y);
+}
+
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskstore_epi64 (long long *__X, __m256i __M, __m256i __Y)
+{
+  __builtin_ia32_maskstoreq256 ((__v4di *)__X, (__v4di)__M, (__v4di)__Y);
+}
+
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskstore_epi32 (int *__X, __m128i __M, __m128i __Y)
+{
+  __builtin_ia32_maskstored ((__v4si *)__X, (__v4si)__M, (__v4si)__Y);
+}
+
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskstore_epi64 (long long *__X, __m128i __M, __m128i __Y)
+{
+  __builtin_ia32_maskstoreq ((__v2di *)__X, (__v2di)__M, (__v2di)__Y);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_sllv_epi32 (__m256i __X, __m256i __Y)
+{
+  return (__m256i) __builtin_ia32_psllv8si ((__v8si)__X, (__v8si)__Y);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_sllv_epi32 (__m128i __X, __m128i __Y)
+{
+  return (__m128i) __builtin_ia32_psllv4si ((__v4si)__X, (__v4si)__Y);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_sllv_epi64 (__m256i __X, __m256i __Y)
+{
+  return (__m256i) __builtin_ia32_psllv4di ((__v4di)__X, (__v4di)__Y);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_sllv_epi64 (__m128i __X, __m128i __Y)
+{
+  return (__m128i) __builtin_ia32_psllv2di ((__v2di)__X, (__v2di)__Y);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_srav_epi32 (__m256i __X, __m256i __Y)
+{
+  return (__m256i) __builtin_ia32_psrav8si ((__v8si)__X, (__v8si)__Y);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_srav_epi32 (__m128i __X, __m128i __Y)
+{
+  return (__m128i) __builtin_ia32_psrav4si ((__v4si)__X, (__v4si)__Y);
+}
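The mask moves just defined condition both loads and stores on the sign bit of each mask element, and masked-off store lanes leave memory untouched, which makes them handy for ragged array tails. An illustrative (not authoritative) tail handler:

#include <immintrin.h>

/* Adds 1 to the final N (0 <= N < 8) ints of an array.  The constant
   table yields a mask whose first N elements have the sign bit set.  */
static void
add_one_tail (int *p, int n)
{
  static const int k[16] = { -1, -1, -1, -1, -1, -1, -1, -1,
                             0, 0, 0, 0, 0, 0, 0, 0 };
  __m256i mask = _mm256_loadu_si256 ((const __m256i *) (k + 8 - n));
  __m256i v = _mm256_maskload_epi32 (p, mask);
  v = _mm256_add_epi32 (v, _mm256_set1_epi32 (1));
  _mm256_maskstore_epi32 (p, mask, v);
}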
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_srlv_epi32 (__m256i __X, __m256i __Y)
+{
+  return (__m256i) __builtin_ia32_psrlv8si ((__v8si)__X, (__v8si)__Y);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_srlv_epi32 (__m128i __X, __m128i __Y)
+{
+  return (__m128i) __builtin_ia32_psrlv4si ((__v4si)__X, (__v4si)__Y);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_srlv_epi64 (__m256i __X, __m256i __Y)
+{
+  return (__m256i) __builtin_ia32_psrlv4di ((__v4di)__X, (__v4di)__Y);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_srlv_epi64 (__m128i __X, __m128i __Y)
+{
+  return (__m128i) __builtin_ia32_psrlv2di ((__v2di)__X, (__v2di)__Y);
+}
+
+#ifdef __OPTIMIZE__
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_i32gather_pd (double const *base, __m128i index, const int scale)
+{
+  __v2df src = _mm_setzero_pd ();
+  __v2df mask = _mm_cmpeq_pd (src, src);
+
+  return (__m128d) __builtin_ia32_gathersiv2df (src,
+                                                base,
+                                                (__v4si)index,
+                                                mask,
+                                                scale);
+}
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_i32gather_pd (__m128d src, double const *base, __m128i index,
+                       __m128d mask, const int scale)
+{
+  return (__m128d) __builtin_ia32_gathersiv2df ((__v2df)src,
+                                                base,
+                                                (__v4si)index,
+                                                (__v2df)mask,
+                                                scale);
+}
+
+extern __inline __m256d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_i32gather_pd (double const *base, __m128i index, const int scale)
+{
+  __v4df src = _mm256_setzero_pd ();
+  __v4df mask = _mm256_cmp_pd (src, src, _CMP_EQ_OQ);
+
+  return (__m256d) __builtin_ia32_gathersiv4df (src,
+                                                base,
+                                                (__v4si)index,
+                                                mask,
+                                                scale);
+}
+
+extern __inline __m256d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_i32gather_pd (__m256d src, double const *base,
+                          __m128i index, __m256d mask, const int scale)
+{
+  return (__m256d) __builtin_ia32_gathersiv4df ((__v4df)src,
+                                                base,
+                                                (__v4si)index,
+                                                (__v4df)mask,
+                                                scale);
+}
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_i64gather_pd (double const *base, __m128i index, const int scale)
+{
+  __v2df src = _mm_setzero_pd ();
+  __v2df mask = _mm_cmpeq_pd (src, src);
+
+  return (__m128d) __builtin_ia32_gatherdiv2df (src,
+                                                base,
+                                                (__v2di)index,
+                                                mask,
+                                                scale);
+}
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_i64gather_pd (__m128d src, double const *base, __m128i index,
+                       __m128d mask, const int scale)
+{
+  return (__m128d) __builtin_ia32_gatherdiv2df ((__v2df)src,
+                                                base,
+                                                (__v2di)index,
+                                                (__v2df)mask,
+                                                scale);
+}
+
+extern __inline __m256d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_i64gather_pd (double const *base, __m256i index, const int scale)
+{
+  __v4df src = _mm256_setzero_pd ();
+  __v4df mask = _mm256_cmp_pd (src, src, _CMP_EQ_OQ);
+
+  return (__m256d) __builtin_ia32_gatherdiv4df (src,
+                                                base,
+                                                (__v4di)index,
+                                                mask,
+                                                scale);
+}
+
+extern __inline __m256d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_i64gather_pd (__m256d src, double const *base,
+                          __m256i index, __m256d mask, const int scale)
+{
+  return (__m256d) __builtin_ia32_gatherdiv4df ((__v4df)src,
+                                                base,
+                                                (__v4di)index,
+                                                (__v4df)mask,
+                                                scale);
+}
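In the gathers above, the all-ones mask built via cmpeq (or _CMP_EQ_OQ) simply enables every lane, and scale must be a compile-time constant of 1, 2, 4, or 8. A hypothetical end-to-end use of my own, assuming -mavx2:

#include <immintrin.h>
#include <stdio.h>

int
main (void)
{
  double table[16];
  double out[4];
  __m128i idx;
  __m256d v;
  int i;
  for (i = 0; i < 16; i++)
    table[i] = (double) i;
  idx = _mm_set_epi32 (9, 6, 3, 0);           /* indices for lanes 3..0 */
  v = _mm256_i32gather_pd (table, idx, 8);    /* scale = sizeof (double) */
  _mm256_storeu_pd (out, v);
  printf ("%g %g %g %g\n", out[0], out[1], out[2], out[3]); /* 0 3 6 9 */
  return 0;
}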
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_i32gather_ps (float const *base, __m128i index, const int scale)
+{
+  __v4sf src = _mm_setzero_ps ();
+  __v4sf mask = _mm_cmpeq_ps (src, src);
+
+  return (__m128) __builtin_ia32_gathersiv4sf (src,
+                                               base,
+                                               (__v4si)index,
+                                               mask,
+                                               scale);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_i32gather_ps (__m128 src, float const *base, __m128i index,
+                       __m128 mask, const int scale)
+{
+  return (__m128) __builtin_ia32_gathersiv4sf ((__v4sf)src,
+                                               base,
+                                               (__v4si)index,
+                                               (__v4sf)mask,
+                                               scale);
+}
+
+extern __inline __m256
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_i32gather_ps (float const *base, __m256i index, const int scale)
+{
+  __v8sf src = _mm256_setzero_ps ();
+  __v8sf mask = _mm256_cmp_ps (src, src, _CMP_EQ_OQ);
+
+  return (__m256) __builtin_ia32_gathersiv8sf (src,
+                                               base,
+                                               (__v8si)index,
+                                               mask,
+                                               scale);
+}
+
+extern __inline __m256
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_i32gather_ps (__m256 src, float const *base,
+                          __m256i index, __m256 mask, const int scale)
+{
+  return (__m256) __builtin_ia32_gathersiv8sf ((__v8sf)src,
+                                               base,
+                                               (__v8si)index,
+                                               (__v8sf)mask,
+                                               scale);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_i64gather_ps (float const *base, __m128i index, const int scale)
+{
+  __v4sf src = _mm_setzero_ps ();
+  __v4sf mask = _mm_cmpeq_ps (src, src);
+
+  return (__m128) __builtin_ia32_gatherdiv4sf (src,
+                                               base,
+                                               (__v2di)index,
+                                               mask,
+                                               scale);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_i64gather_ps (__m128 src, float const *base, __m128i index,
+                       __m128 mask, const int scale)
+{
+  return (__m128) __builtin_ia32_gatherdiv4sf ((__v4sf)src,
+                                               base,
+                                               (__v2di)index,
+                                               (__v4sf)mask,
+                                               scale);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_i64gather_ps (float const *base, __m256i index, const int scale)
+{
+  __v4sf src = _mm_setzero_ps ();
+  __v4sf mask = _mm_cmpeq_ps (src, src);
+
+  return (__m128) __builtin_ia32_gatherdiv4sf256 (src,
+                                                  base,
+                                                  (__v4di)index,
+                                                  mask,
+                                                  scale);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_i64gather_ps (__m128 src, float const *base,
+                          __m256i index, __m128 mask, const int scale)
+{
+  return (__m128) __builtin_ia32_gatherdiv4sf256 ((__v4sf)src,
+                                                  base,
+                                                  (__v4di)index,
+                                                  (__v4sf)mask,
+                                                  scale);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_i32gather_epi64 (long long int const *base,
+                     __m128i index, const int scale)
+{
+  __v2di src = __extension__ (__v2di){ 0, 0 };
+  __v2di mask = __extension__ (__v2di){ ~0, ~0 };
+
+  return (__m128i) __builtin_ia32_gathersiv2di (src,
+                                                base,
+                                                (__v4si)index,
+                                                mask,
+                                                scale);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_i32gather_epi64 (__m128i src, long long int const *base,
+                          __m128i index, __m128i mask, const int scale)
+{
+  return (__m128i) __builtin_ia32_gathersiv2di ((__v2di)src,
+                                                base,
+                                                (__v4si)index,
+                                                (__v2di)mask,
+                                                scale);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_i32gather_epi64 (long long int const *base,
+                        __m128i index, const int scale)
+{
+  __v4di src = __extension__ (__v4di){ 0, 0, 0, 0 };
+  __v4di mask = __extension__ (__v4di){ ~0, ~0, ~0, ~0 };
+
+  return (__m256i) __builtin_ia32_gathersiv4di (src,
+                                                base,
+                                                (__v4si)index,
+                                                mask,
+                                                scale);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_i32gather_epi64 (__m256i src, long long int const *base,
+                             __m128i index, __m256i mask, const int scale)
+{
+  return (__m256i) __builtin_ia32_gathersiv4di ((__v4di)src,
+                                                base,
+                                                (__v4si)index,
+                                                (__v4di)mask,
+                                                scale);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_i64gather_epi64 (long long int const *base,
+                     __m128i index, const int scale)
+{
+  __v2di src = __extension__ (__v2di){ 0, 0 };
+  __v2di mask = __extension__ (__v2di){ ~0, ~0 };
+
+  return (__m128i) __builtin_ia32_gatherdiv2di (src,
+                                                base,
+                                                (__v2di)index,
+                                                mask,
+                                                scale);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_i64gather_epi64 (__m128i src, long long int const *base, __m128i index,
+                          __m128i mask, const int scale)
+{
+  return (__m128i) __builtin_ia32_gatherdiv2di ((__v2di)src,
+                                                base,
+                                                (__v2di)index,
+                                                (__v2di)mask,
+                                                scale);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_i64gather_epi64 (long long int const *base,
+                        __m256i index, const int scale)
+{
+  __v4di src = __extension__ (__v4di){ 0, 0, 0, 0 };
+  __v4di mask = __extension__ (__v4di){ ~0, ~0, ~0, ~0 };
+
+  return (__m256i) __builtin_ia32_gatherdiv4di (src,
+                                                base,
+                                                (__v4di)index,
+                                                mask,
+                                                scale);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_i64gather_epi64 (__m256i src, long long int const *base,
+                             __m256i index, __m256i mask, const int scale)
+{
+  return (__m256i) __builtin_ia32_gatherdiv4di ((__v4di)src,
+                                                base,
+                                                (__v4di)index,
+                                                (__v4di)mask,
+                                                scale);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_i32gather_epi32 (int const *base, __m128i index, const int scale)
+{
+  __v4si src = __extension__ (__v4si){ 0, 0, 0, 0 };
+  __v4si mask = __extension__ (__v4si){ ~0, ~0, ~0, ~0 };
+
+  return (__m128i) __builtin_ia32_gathersiv4si (src,
+                                                base,
+                                                (__v4si)index,
+                                                mask,
+                                                scale);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_i32gather_epi32 (__m128i src, int const *base, __m128i index,
+                          __m128i mask, const int scale)
+{
+  return (__m128i) __builtin_ia32_gathersiv4si ((__v4si)src,
+                                                base,
+                                                (__v4si)index,
+                                                (__v4si)mask,
+                                                scale);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_i32gather_epi32 (int const *base, __m256i index, const int scale)
+{
+  __v8si src = __extension__ (__v8si){ 0, 0, 0, 0, 0, 0, 0, 0 };
+  __v8si mask = __extension__ (__v8si){ ~0, ~0, ~0, ~0, ~0, ~0, ~0, ~0 };
+
+  return (__m256i) __builtin_ia32_gathersiv8si (src,
+                                                base,
+                                                (__v8si)index,
+                                                mask,
+                                                scale);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_i32gather_epi32 (__m256i src, int const *base,
+                             __m256i index, __m256i mask, const int scale)
+{
+  return (__m256i) __builtin_ia32_gathersiv8si ((__v8si)src,
+                                                base,
+                                                (__v8si)index,
+                                                (__v8si)mask,
+                                                scale);
+}
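For the masked integer gathers, a lane whose mask sign bit is clear keeps the corresponding src value and, per Intel's documentation, performs no memory access, so out-of-range indices can be masked off safely. A sketch under the usual -mavx2 assumption:

#include <immintrin.h>

/* Gathers base[idx[i]] where the matching lane of ACTIVE has its top
   bit set; inactive lanes come back as -1 from the fallback vector.  */
static __m256i
gather_or_default (const int *base, __m256i idx, __m256i active)
{
  __m256i fallback = _mm256_set1_epi32 (-1);
  return _mm256_mask_i32gather_epi32 (fallback, base, idx, active, 4);
}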
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_i64gather_epi32 (int const *base, __m128i index, const int scale)
+{
+  __v4si src = __extension__ (__v4si){ 0, 0, 0, 0 };
+  __v4si mask = __extension__ (__v4si){ ~0, ~0, ~0, ~0 };
+
+  return (__m128i) __builtin_ia32_gatherdiv4si (src,
+                                                base,
+                                                (__v2di)index,
+                                                mask,
+                                                scale);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_i64gather_epi32 (__m128i src, int const *base, __m128i index,
+                          __m128i mask, const int scale)
+{
+  return (__m128i) __builtin_ia32_gatherdiv4si ((__v4si)src,
+                                                base,
+                                                (__v2di)index,
+                                                (__v4si)mask,
+                                                scale);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_i64gather_epi32 (int const *base, __m256i index, const int scale)
+{
+  __v4si src = __extension__ (__v4si){ 0, 0, 0, 0 };
+  __v4si mask = __extension__ (__v4si){ ~0, ~0, ~0, ~0 };
+
+  return (__m128i) __builtin_ia32_gatherdiv4si256 (src,
+                                                   base,
+                                                   (__v4di)index,
+                                                   mask,
+                                                   scale);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_i64gather_epi32 (__m128i src, int const *base,
+                             __m256i index, __m128i mask, const int scale)
+{
+  return (__m128i) __builtin_ia32_gatherdiv4si256 ((__v4si)src,
+                                                   base,
+                                                   (__v4di)index,
+                                                   (__v4si)mask,
+                                                   scale);
+}
+#else /* __OPTIMIZE__ */
+#define _mm_i32gather_pd(BASE, INDEX, SCALE) \
+  (__m128d) __builtin_ia32_gathersiv2df ((__v2df) _mm_setzero_pd (), \
+                                         (double const *)BASE, \
+                                         (__v4si)(__m128i)INDEX, \
+                                         (__v2df)_mm_set1_pd ( \
+                                           (double)(long long int) -1), \
+                                         (int)SCALE)
+
+#define _mm_mask_i32gather_pd(SRC, BASE, INDEX, MASK, SCALE) \
+  (__m128d) __builtin_ia32_gathersiv2df ((__v2df)(__m128d)SRC, \
+                                         (double const *)BASE, \
+                                         (__v4si)(__m128i)INDEX, \
+                                         (__v2df)(__m128d)MASK, \
+                                         (int)SCALE)
+
+#define _mm256_i32gather_pd(BASE, INDEX, SCALE) \
+  (__m256d) __builtin_ia32_gathersiv4df ((__v4df) _mm256_setzero_pd (), \
+                                         (double const *)BASE, \
+                                         (__v4si)(__m128i)INDEX, \
+                                         (__v4df)_mm256_set1_pd ( \
+                                           (double)(long long int) -1), \
+                                         (int)SCALE)
+
+#define _mm256_mask_i32gather_pd(SRC, BASE, INDEX, MASK, SCALE) \
+  (__m256d) __builtin_ia32_gathersiv4df ((__v4df)(__m256d)SRC, \
+                                         (double const *)BASE, \
+                                         (__v4si)(__m128i)INDEX, \
+                                         (__v4df)(__m256d)MASK, \
+                                         (int)SCALE)
+
+#define _mm_i64gather_pd(BASE, INDEX, SCALE) \
+  (__m128d) __builtin_ia32_gatherdiv2df ((__v2df) _mm_setzero_pd (), \
+                                         (double const *)BASE, \
+                                         (__v2di)(__m128i)INDEX, \
+                                         (__v2df)_mm_set1_pd ( \
+                                           (double)(long long int) -1), \
+                                         (int)SCALE)
+
+#define _mm_mask_i64gather_pd(SRC, BASE, INDEX, MASK, SCALE) \
+  (__m128d) __builtin_ia32_gatherdiv2df ((__v2df)(__m128d)SRC, \
+                                         (double const *)BASE, \
+                                         (__v2di)(__m128i)INDEX, \
+                                         (__v2df)(__m128d)MASK, \
+                                         (int)SCALE)
+
+#define _mm256_i64gather_pd(BASE, INDEX, SCALE) \
+  (__m256d) __builtin_ia32_gatherdiv4df ((__v4df) _mm256_setzero_pd (), \
+                                         (double const *)BASE, \
+                                         (__v4di)(__m256i)INDEX, \
+                                         (__v4df)_mm256_set1_pd ( \
+                                           (double)(long long int) -1), \
+                                         (int)SCALE)
+
+#define _mm256_mask_i64gather_pd(SRC, BASE, INDEX, MASK, SCALE) \
+  (__m256d) __builtin_ia32_gatherdiv4df ((__v4df)(__m256d)SRC, \
+                                         (double const *)BASE, \
+                                         (__v4di)(__m256i)INDEX, \
+                                         (__v4df)(__m256d)MASK, \
+                                         (int)SCALE)
+
+#define _mm_i32gather_ps(BASE, INDEX, SCALE) \
+  (__m128) __builtin_ia32_gathersiv4sf ((__v4sf) _mm_setzero_ps (), \
+                                        (float const *)BASE, \
+                                        (__v4si)(__m128i)INDEX, \
+                                        _mm_set1_ps ((float)(int) -1), \
+                                        (int)SCALE)
+
+#define _mm_mask_i32gather_ps(SRC, BASE, INDEX, MASK, SCALE) \
+  (__m128) __builtin_ia32_gathersiv4sf ((__v4sf)(__m128)SRC, \
+                                        (float const *)BASE, \
+                                        (__v4si)(__m128i)INDEX, \
+                                        (__v4sf)(__m128)MASK, \
+                                        (int)SCALE)
+
+#define _mm256_i32gather_ps(BASE, INDEX, SCALE) \
+  (__m256) __builtin_ia32_gathersiv8sf ((__v8sf) _mm256_setzero_ps (), \
+                                        (float const *)BASE, \
+                                        (__v8si)(__m256i)INDEX, \
+                                        (__v8sf)_mm256_set1_ps ( \
+                                          (float)(int) -1), \
+                                        (int)SCALE)
+
+#define _mm256_mask_i32gather_ps(SRC, BASE, INDEX, MASK, SCALE) \
+  (__m256) __builtin_ia32_gathersiv8sf ((__v8sf)(__m256)SRC, \
+                                        (float const *)BASE, \
+                                        (__v8si)(__m256i)INDEX, \
+                                        (__v8sf)(__m256)MASK, \
+                                        (int)SCALE)
+
+#define _mm_i64gather_ps(BASE, INDEX, SCALE) \
+  (__m128) __builtin_ia32_gatherdiv4sf ((__v4sf) _mm_setzero_ps (), \
+                                        (float const *)BASE, \
+                                        (__v2di)(__m128i)INDEX, \
+                                        (__v4sf)_mm_set1_ps ( \
+                                          (float)(int) -1), \
+                                        (int)SCALE)
+
+#define _mm_mask_i64gather_ps(SRC, BASE, INDEX, MASK, SCALE) \
+  (__m128) __builtin_ia32_gatherdiv4sf ((__v4sf)(__m128)SRC, \
+                                        (float const *)BASE, \
+                                        (__v2di)(__m128i)INDEX, \
+                                        (__v4sf)(__m128)MASK, \
+                                        (int)SCALE)
+
+#define _mm256_i64gather_ps(BASE, INDEX, SCALE) \
+  (__m128) __builtin_ia32_gatherdiv4sf256 ((__v4sf) _mm_setzero_ps (), \
+                                           (float const *)BASE, \
+                                           (__v4di)(__m256i)INDEX, \
+                                           (__v4sf)_mm_set1_ps ( \
+                                             (float)(int) -1), \
+                                           (int)SCALE)
+
+#define _mm256_mask_i64gather_ps(SRC, BASE, INDEX, MASK, SCALE) \
+  (__m128) __builtin_ia32_gatherdiv4sf256 ((__v4sf)(__m128)SRC, \
+                                           (float const *)BASE, \
+                                           (__v4di)(__m256i)INDEX, \
+                                           (__v4sf)(__m128)MASK, \
+                                           (int)SCALE)
+
+#define _mm_i32gather_epi64(BASE, INDEX, SCALE) \
+  (__m128i) __builtin_ia32_gathersiv2di ((__v2di) _mm_setzero_si128 (), \
+                                         (long long const *)BASE, \
+                                         (__v4si)(__m128i)INDEX, \
+                                         (__v2di)_mm_set1_epi64x (-1), \
+                                         (int)SCALE)
+
+#define _mm_mask_i32gather_epi64(SRC, BASE, INDEX, MASK, SCALE) \
+  (__m128i) __builtin_ia32_gathersiv2di ((__v2di)(__m128i)SRC, \
+                                         (long long const *)BASE, \
+                                         (__v4si)(__m128i)INDEX, \
+                                         (__v2di)(__m128i)MASK, \
+                                         (int)SCALE)
+
+#define _mm256_i32gather_epi64(BASE, INDEX, SCALE) \
+  (__m256i) __builtin_ia32_gathersiv4di ((__v4di) _mm256_setzero_si256 (), \
+                                         (long long const *)BASE, \
+                                         (__v4si)(__m128i)INDEX, \
+                                         (__v4di)_mm256_set1_epi64x (-1), \
+                                         (int)SCALE)
+
+#define _mm256_mask_i32gather_epi64(SRC, BASE, INDEX, MASK, SCALE) \
+  (__m256i) __builtin_ia32_gathersiv4di ((__v4di)(__m256i)SRC, \
+                                         (long long const *)BASE, \
+                                         (__v4si)(__m128i)INDEX, \
+                                         (__v4di)(__m256i)MASK, \
+                                         (int)SCALE)
+
+#define _mm_i64gather_epi64(BASE, INDEX, SCALE) \
+  (__m128i) __builtin_ia32_gatherdiv2di ((__v2di) _mm_setzero_si128 (), \
+                                         (long long const *)BASE, \
+                                         (__v2di)(__m128i)INDEX, \
+                                         (__v2di)_mm_set1_epi64x (-1), \
+                                         (int)SCALE)
+
+#define _mm_mask_i64gather_epi64(SRC, BASE, INDEX, MASK, SCALE) \
+  (__m128i) __builtin_ia32_gatherdiv2di ((__v2di)(__m128i)SRC, \
+                                         (long long const *)BASE, \
+                                         (__v2di)(__m128i)INDEX, \
+                                         (__v2di)(__m128i)MASK, \
+                                         (int)SCALE)
+
+#define _mm256_i64gather_epi64(BASE, INDEX, SCALE) \
+  (__m256i) __builtin_ia32_gatherdiv4di ((__v4di) _mm256_setzero_si256 (), \
+                                         (long long const *)BASE, \
+                                         (__v4di)(__m256i)INDEX, \
+                                         (__v4di)_mm256_set1_epi64x (-1), \
+                                         (int)SCALE)
+
+#define _mm256_mask_i64gather_epi64(SRC, BASE, INDEX, MASK, SCALE) \
+  (__m256i) __builtin_ia32_gatherdiv4di ((__v4di)(__m256i)SRC, \
+                                         (long long const *)BASE, \
+                                         (__v4di)(__m256i)INDEX, \
+                                         (__v4di)(__m256i)MASK, \
+                                         (int)SCALE)
+
+#define _mm_i32gather_epi32(BASE, INDEX, SCALE) \
+  (__m128i) __builtin_ia32_gathersiv4si ((__v4si) _mm_setzero_si128 (), \
+                                         (int const *)BASE, \
+                                         (__v4si)(__m128i)INDEX, \
+                                         (__v4si)_mm_set1_epi32 (-1), \
+                                         (int)SCALE)
+
+#define _mm_mask_i32gather_epi32(SRC, BASE, INDEX, MASK, SCALE) \
+  (__m128i) __builtin_ia32_gathersiv4si ((__v4si)(__m128i)SRC, \
+                                         (int const *)BASE, \
+                                         (__v4si)(__m128i)INDEX, \
+                                         (__v4si)(__m128i)MASK, \
+                                         (int)SCALE)
+
+#define _mm256_i32gather_epi32(BASE, INDEX, SCALE) \
+  (__m256i) __builtin_ia32_gathersiv8si ((__v8si) _mm256_setzero_si256 (), \
+                                         (int const *)BASE, \
+                                         (__v8si)(__m256i)INDEX, \
+                                         (__v8si)_mm256_set1_epi32 (-1), \
+                                         (int)SCALE)
+
+#define _mm256_mask_i32gather_epi32(SRC, BASE, INDEX, MASK, SCALE) \
+  (__m256i) __builtin_ia32_gathersiv8si ((__v8si)(__m256i)SRC, \
+                                         (int const *)BASE, \
+                                         (__v8si)(__m256i)INDEX, \
+                                         (__v8si)(__m256i)MASK, \
+                                         (int)SCALE)
+
+#define _mm_i64gather_epi32(BASE, INDEX, SCALE) \
+  (__m128i) __builtin_ia32_gatherdiv4si ((__v4si) _mm_setzero_si128 (), \
+                                         (int const *)BASE, \
+                                         (__v2di)(__m128i)INDEX, \
+                                         (__v4si)_mm_set1_epi32 (-1), \
+                                         (int)SCALE)
+
+#define _mm_mask_i64gather_epi32(SRC, BASE, INDEX, MASK, SCALE) \
+  (__m128i) __builtin_ia32_gatherdiv4si ((__v4si)(__m128i)SRC, \
+                                         (int const *)BASE, \
+                                         (__v2di)(__m128i)INDEX, \
+                                         (__v4si)(__m128i)MASK, \
+                                         (int)SCALE)
+
+#define _mm256_i64gather_epi32(BASE, INDEX, SCALE) \
+  (__m128i) __builtin_ia32_gatherdiv4si256 ((__v4si) _mm_setzero_si128 (), \
+                                            (int const *)BASE, \
+                                            (__v4di)(__m256i)INDEX, \
+                                            (__v4si)_mm_set1_epi32 (-1), \
+                                            (int)SCALE)
+
+#define _mm256_mask_i64gather_epi32(SRC, BASE, INDEX, MASK, SCALE) \
+  (__m128i) __builtin_ia32_gatherdiv4si256 ((__v4si)(__m128i)SRC, \
+                                            (int const *)BASE, \
+                                            (__v4di)(__m256i)INDEX, \
+                                            (__v4si)(__m128i)MASK, \
+                                            (int)SCALE)
+#endif /* __OPTIMIZE__ */
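A closing observation on the #else branch just ended: without __OPTIMIZE__ every form expands straight to the builtin, so immediates (blend selectors, gather scales, byte-shift counts) must still be literal constants even at -O0; a runtime variable will not compile. For example (illustrative, assuming -mavx2):

#include <immintrin.h>

/* 0xf0 selects the high four 32-bit elements from Y and the low four
   from X; passing a non-constant selector would be rejected.  */
static inline __m256i
blend_halves (__m256i x, __m256i y)
{
  return _mm256_blend_epi32 (x, y, 0xf0);
}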