PNG  IHDRxsBIT|d pHYs+tEXtSoftwarewww.inkscape.org<,tEXtComment File Manager

File Manager

Path: /opt/cloudlinux/alt-php85/root/usr/include/php/Zend/

Viewing File: zend_simd.h

/********************************************************************************
 * MIT License
 * Copyright (c) 2025 Saki Takamachi <saki@sakiot.com>
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 *********************************************************************************/


 #ifndef XSSE_H
 #define XSSE_H

 #define XSSE_VERSION 10000

 #ifdef _MSC_VER
 #  define XSSE_FORCE_INLINE __forceinline
 #elif defined(__GNUC__) || defined(__clang__)
 #  define XSSE_FORCE_INLINE inline __attribute__((always_inline))
 #  define XSSE_HAS_MACRO_EXTENSION
 #else
 #  define XSSE_FORCE_INLINE inline
 #endif


 #if defined(__SSE2__) || defined(_M_X64) || defined(_M_AMD64)
 #include <emmintrin.h>
 #define XSSE2


 #elif defined(__aarch64__) || defined(_M_ARM64)
 #include <arm_neon.h>
 #define XSSE2

 typedef int8x16_t __m128i;


 /*****************************************************************************
  * Load / Store                                                              *
  *****************************************************************************/

 #define _mm_set_epi8(x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15) \
	 ((int8x16_t) { \
		 (int8_t) (x15), (int8_t) (x14), (int8_t) (x13), (int8_t) (x12), \
		 (int8_t) (x11), (int8_t) (x10), (int8_t) (x9),  (int8_t) (x8), \
		 (int8_t) (x7),  (int8_t) (x6),  (int8_t) (x5),  (int8_t) (x4), \
		 (int8_t) (x3),  (int8_t) (x2),  (int8_t) (x1),  (int8_t) (x0) })
 #define _mm_set_epi16(x0, x1, x2, x3, x4, x5, x6, x7) \
	 (vreinterpretq_s8_s16((int16x8_t) { \
		 (int16_t) (x7), (int16_t) (x6), (int16_t) (x5), (int16_t) (x4), \
		 (int16_t) (x3), (int16_t) (x2), (int16_t) (x1), (int16_t) (x0) }))
 #define _mm_set_epi32(x0, x1, x2, x3) \
	 (vreinterpretq_s8_s32((int32x4_t) { (int32_t) (x3), (int32_t) (x2), (int32_t) (x1), (int32_t) (x0) }))
 #define _mm_set_epi64x(x0, x1) (vreinterpretq_s8_s64((int64x2_t) { (int64_t) (x1), (int64_t) (x0) }))
 #define _mm_set1_epi8(x) (vdupq_n_s8((int8_t) (x)))
 #define _mm_set1_epi16(x) (vreinterpretq_s8_s16(vdupq_n_s16((int16_t) (x))))
 #define _mm_set1_epi32(x) (vreinterpretq_s8_s32(vdupq_n_s32((int32_t) (x))))
 #define _mm_set1_epi64x(x) (vreinterpretq_s8_s64(vdupq_n_s64((int64_t) (x))))

 #define _mm_setr_epi8(x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15) \
	 ((int8x16_t) { \
		 (int8_t) (x0), (int8_t) (x1), (int8_t) (x2), (int8_t) (x3), \
		 (int8_t) (x4), (int8_t) (x5), (int8_t) (x6), (int8_t) (x7), \
		 (int8_t) (x8), (int8_t) (x9), (int8_t) (x10), (int8_t) (x11), \
		 (int8_t) (x12), (int8_t) (x13), (int8_t) (x14), (int8_t) (x15) })
 #define _mm_setr_epi16(x0, x1, x2, x3, x4, x5, x6, x7) \
	 (vreinterpretq_s8_s16((int16x8_t) { \
		 (int16_t) (x0), (int16_t) (x1), (int16_t) (x2), (int16_t) (x3), \
		 (int16_t) (x4), (int16_t) (x5), (int16_t) (x6), (int16_t) (x7) }))
 #define _mm_setr_epi32(x0, x1, x2, x3) \
	 (vreinterpretq_s8_s32((int32x4_t) { (int32_t) (x0), (int32_t) (x1), (int32_t) (x2), (int32_t) (x3) }))

 #define _mm_setzero_si128() (vdupq_n_s8(0))

 #define _mm_load_si128(x) (vld1q_s8((const int8_t *) (x)))
 #define _mm_loadu_si128(x) _mm_load_si128(x)

 #define _mm_store_si128(to, x) (vst1q_s8((int8_t *) (to), x))
 #define _mm_storeu_si128(to, x) _mm_store_si128(to, x)
 #define _mm_stream_si128(to, x) _mm_store_si128(to, x)
 #define _mm_stream_si32(to, x) (*(volatile int32_t *)(to) = (int32_t)(x))


 /*****************************************************************************
  * Bit shift / Bit wise                                                      *
  *****************************************************************************/

 #define _mm_or_si128(a, b) (vorrq_s8((a), (b)))
 #define _mm_xor_si128(a, b) (veorq_s8((a), (b)))
 #define _mm_and_si128(a, b) (vandq_s8((a), (b)))
 #define _mm_andnot_si128(a, b) (vbicq_s8((b), (a)))

 #define _mm_slli_epi16(x, count) (vreinterpretq_s8_u16(vshlq_n_u16(vreinterpretq_u16_s8(x), (count))))
 #define _mm_slli_epi32(x, count) (vreinterpretq_s8_u32(vshlq_n_u32(vreinterpretq_u32_s8(x), (count))))
 #define _mm_slli_epi64(x, count) (vreinterpretq_s8_u64(vshlq_n_u64(vreinterpretq_u64_s8(x), (count))))
 static XSSE_FORCE_INLINE __m128i _mm_sll_epi16(__m128i x, __m128i count)
 {
	 uint16_t shift = (uint16_t) (vgetq_lane_s64(vreinterpretq_s64_s8(count), 0) & 0xFFFF);
	 return vreinterpretq_s8_u16(
		 vshlq_u16(vreinterpretq_u16_s8(x), vdupq_n_s16((int16_t) shift))
	 );
 }
 static XSSE_FORCE_INLINE __m128i _mm_sll_epi32(__m128i x, __m128i count)
 {
	 uint32_t shift = (uint32_t) (vgetq_lane_s64(vreinterpretq_s64_s8(count), 0) & 0xFFFFFFFF);
	 return vreinterpretq_s8_u32(
		 vshlq_u32(vreinterpretq_u32_s8(x), vdupq_n_s32((int32_t) shift))
	 );
 }
 static XSSE_FORCE_INLINE __m128i _mm_sll_epi64(__m128i x, __m128i count)
 {
	 uint64_t shift = (uint64_t) vgetq_lane_s64(vreinterpretq_s64_s8(count), 0);
	 return vreinterpretq_s8_u64(
		 vshlq_u64(vreinterpretq_u64_s8(x), vdupq_n_s64((int64_t) shift))
	 );
 }

 #define _mm_slli_si128(x, imm) \
	 ((imm) >= 16 ? vdupq_n_s8(0) : vreinterpretq_s8_u8(vextq_u8(vdupq_n_u8(0), vreinterpretq_u8_s8(x), 16 - (imm))))

 #define _mm_srai_epi16(x, count) (vreinterpretq_s8_s16(vshrq_n_s16(vreinterpretq_s16_s8(x), (count))))
 #define _mm_srai_epi32(x, count) (vreinterpretq_s8_s32(vshrq_n_s32(vreinterpretq_s32_s8(x), (count))))
 static inline __m128i _mm_sra_epi16(__m128i x, __m128i count)
 {
	 uint16_t shift = (uint16_t) (vgetq_lane_s64(vreinterpretq_s64_s8(count), 0) & 0xFFFF);
	 return vreinterpretq_s8_s16(
		 vshlq_s16(vreinterpretq_s16_s8(x), vdupq_n_s16(-(int16_t) shift))
	 );
 }
 static inline __m128i _mm_sra_epi32(__m128i x, __m128i count)
 {
	 uint32_t shift = (uint32_t) (vgetq_lane_s64(vreinterpretq_s64_s8(count), 0) & 0xFFFFFFFF);
	 return vreinterpretq_s8_s32(
		 vshlq_s32(vreinterpretq_s32_s8(x), vdupq_n_s32(-(int32_t) shift))
	 );
 }

 #define _mm_srli_epi16(x, count) (vreinterpretq_s8_u16(vshrq_n_u16(vreinterpretq_u16_s8(x), (count))))
 #define _mm_srli_epi32(x, count) (vreinterpretq_s8_u32(vshrq_n_u32(vreinterpretq_u32_s8(x), (count))))
 #define _mm_srli_epi64(x, count) (vreinterpretq_s8_u64(vshrq_n_u64(vreinterpretq_u64_s8(x), (count))))
 static XSSE_FORCE_INLINE __m128i _mm_srl_epi16(__m128i x, __m128i count)
 {
	 uint16_t shift = (uint16_t) (vgetq_lane_s64(vreinterpretq_s64_s8(count), 0) & 0xFFFF);
	 return vreinterpretq_s8_u16(
		 vshlq_u16(vreinterpretq_u16_s8(x), vdupq_n_s16(-(int16_t) shift))
	 );
 }
 static XSSE_FORCE_INLINE __m128i _mm_srl_epi32(__m128i x, __m128i count)
 {
	 uint32_t shift = (uint32_t) (vgetq_lane_s64(vreinterpretq_s64_s8(count), 0) & 0xFFFFFFFF);
	 return vreinterpretq_s8_u32(
		 vshlq_u32(vreinterpretq_u32_s8(x), vdupq_n_s32(-(int32_t) shift))
	 );
 }
 static XSSE_FORCE_INLINE __m128i _mm_srl_epi64(__m128i x, __m128i count)
 {
	 uint64_t shift = (uint64_t) vgetq_lane_s64(vreinterpretq_s64_s8(count), 0);
	 return vreinterpretq_s8_u64(
		 vshlq_u64(vreinterpretq_u64_s8(x), vdupq_n_s64(-(int64_t) shift))
	 );
 }

 #define _mm_srli_si128(x, imm) \
	 ((imm) >= 16 ? vdupq_n_s8(0) : vreinterpretq_s8_u8(vextq_u8(vreinterpretq_u8_s8(x), vdupq_n_u8(0), (imm))))


 /*****************************************************************************
  * Integer Arithmetic Operations                                             *
  *****************************************************************************/

 /**
  * In practice, there is no problem, but a runtime error for signed integer overflow is triggered by UBSAN,
  * so perform the calculation as unsigned. Since it is optimized at compile time, there are no unnecessary casts at runtime.
  */
 #define _mm_add_epi8(a, b) (vreinterpretq_s8_u8(vaddq_u8(vreinterpretq_u8_s8(a), vreinterpretq_u8_s8(b))))
 #define _mm_add_epi16(a, b) (vreinterpretq_s8_u16(vaddq_u16(vreinterpretq_u16_s8(a), vreinterpretq_u16_s8(b))))
 #define _mm_add_epi32(a, b) (vreinterpretq_s8_u32(vaddq_u32(vreinterpretq_u32_s8(a), vreinterpretq_u32_s8(b))))
 #define _mm_add_epi64(a, b) (vreinterpretq_s8_u64(vaddq_u64(vreinterpretq_u64_s8(a), vreinterpretq_u64_s8(b))))

 #define _mm_adds_epi8(a, b) (vqaddq_s8((a), (b)))
 #define _mm_adds_epi16(a, b) (vreinterpretq_s8_s16(vqaddq_s16(vreinterpretq_s16_s8(a), vreinterpretq_s16_s8(b))))
 #define _mm_adds_epu8(a, b) (vreinterpretq_s8_u8(vqaddq_u8(vreinterpretq_u8_s8(a), vreinterpretq_u8_s8(b))))
 #define _mm_adds_epu16(a, b) (vreinterpretq_s8_u16(vqaddq_u16(vreinterpretq_u16_s8(a), vreinterpretq_u16_s8(b))))

 #define _mm_avg_epu8(a, b) (vreinterpretq_s8_u8(vrhaddq_u8(vreinterpretq_u8_s8(a), vreinterpretq_u8_s8(b))))
 #define _mm_avg_epu16(a, b) (vreinterpretq_s8_u16(vrhaddq_u16(vreinterpretq_u16_s8(a), vreinterpretq_u16_s8(b))))

 static XSSE_FORCE_INLINE __m128i _mm_madd_epi16(__m128i a, __m128i b)
 {
	 int32x4_t mul_lo = vmull_s16(vget_low_s16(vreinterpretq_s16_s8(a)), vget_low_s16(vreinterpretq_s16_s8(b)));
	 int32x4_t mul_hi = vmull_s16(vget_high_s16(vreinterpretq_s16_s8(a)), vget_high_s16(vreinterpretq_s16_s8(b)));

	 return vreinterpretq_s8_s32(vcombine_s32(
		 vpadd_s32(vget_low_s32(mul_lo), vget_high_s32(mul_lo)),
		 vpadd_s32(vget_low_s32(mul_hi), vget_high_s32(mul_hi))
	 ));
 }

 #define _mm_max_epu8(a, b) (vreinterpretq_s8_u8(vmaxq_u8(vreinterpretq_u8_s8(a), vreinterpretq_u8_s8(b))))
 #define _mm_max_epi16(a, b) (vreinterpretq_s8_s16(vmaxq_s16(vreinterpretq_s16_s8(a), vreinterpretq_s16_s8(b))))
 #define _mm_min_epu8(a, b) (vreinterpretq_s8_u8(vminq_u8(vreinterpretq_u8_s8(a), vreinterpretq_u8_s8(b))))
 #define _mm_min_epi16(a, b) (vreinterpretq_s8_s16(vminq_s16(vreinterpretq_s16_s8(a), vreinterpretq_s16_s8(b))))

 static XSSE_FORCE_INLINE __m128i _mm_mulhi_epi16(__m128i a, __m128i b)
 {
	 int32x4_t lo = vmull_s16(vget_low_s16(vreinterpretq_s16_s8(a)), vget_low_s16(vreinterpretq_s16_s8(b)));
	 int32x4_t hi = vmull_s16(vget_high_s16(vreinterpretq_s16_s8(a)), vget_high_s16(vreinterpretq_s16_s8(b)));
	 return vreinterpretq_s8_s16(vcombine_s16(vshrn_n_s32(lo, 16), vshrn_n_s32(hi, 16)));
 }
 static XSSE_FORCE_INLINE __m128i _mm_mulhi_epu16(__m128i a, __m128i b)
 {
	 uint32x4_t lo = vmull_u16(vget_low_u16(vreinterpretq_u16_s8(a)), vget_low_u16(vreinterpretq_u16_s8(b)));
	 uint32x4_t hi = vmull_u16(vget_high_u16(vreinterpretq_u16_s8(a)), vget_high_u16(vreinterpretq_u16_s8(b)));
	 return vreinterpretq_s8_u16(vcombine_u16(vshrn_n_u32(lo, 16), vshrn_n_u32(hi, 16)));
 }
 static XSSE_FORCE_INLINE __m128i _mm_mullo_epi16(__m128i a, __m128i b)
 {
	 int32x4_t lo = vmull_s16(vget_low_s16(vreinterpretq_s16_s8(a)), vget_low_s16(vreinterpretq_s16_s8(b)));
	 int32x4_t hi = vmull_s16(vget_high_s16(vreinterpretq_s16_s8(a)), vget_high_s16(vreinterpretq_s16_s8(b)));
	 return vreinterpretq_s8_s16(vcombine_s16(vmovn_s32(lo), vmovn_s32(hi)));
 }
 static XSSE_FORCE_INLINE __m128i _mm_mul_epu32(__m128i a, __m128i b)
 {
	 uint32x4_t evens = vuzpq_u32(vreinterpretq_u32_s8(a), vreinterpretq_u32_s8(b)).val[0];
	 return vreinterpretq_s8_u64(vmull_u32(vget_low_u32(evens), vget_high_u32(evens)));
 }
 static XSSE_FORCE_INLINE __m128i _mm_sad_epu8(__m128i a, __m128i b)
 {
	 uint16x8_t abs_diffs_16 = vpaddlq_u8(vabdq_u8(vreinterpretq_u8_s8(a), vreinterpretq_u8_s8(b)));
	 uint32x4_t abs_diffs_32 = vpaddlq_u16(abs_diffs_16);
	 uint64x2_t abs_diffs_64 = vpaddlq_u32(abs_diffs_32);

	 return vreinterpretq_s8_u16((uint16x8_t) {
		 (int16_t) vgetq_lane_u64(abs_diffs_64, 0), 0, 0, 0,
		 (int16_t) vgetq_lane_u64(abs_diffs_64, 1), 0, 0, 0
	 });
 }

 #define _mm_sub_epi8(a, b) (vreinterpretq_s8_u8(vsubq_u8(vreinterpretq_u8_s8(a), vreinterpretq_u8_s8(b))))
 #define _mm_sub_epi16(a, b) (vreinterpretq_s8_u16(vsubq_u16(vreinterpretq_u16_s8(a), vreinterpretq_u16_s8(b))))
 #define _mm_sub_epi32(a, b) (vreinterpretq_s8_u32(vsubq_u32(vreinterpretq_u32_s8(a), vreinterpretq_u32_s8(b))))
 #define _mm_sub_epi64(a, b) (vreinterpretq_s8_u64(vsubq_u64(vreinterpretq_u64_s8(a), vreinterpretq_u64_s8(b))))

 #define _mm_subs_epi8(a, b) (vqsubq_s8((a), (b)))
 #define _mm_subs_epi16(a, b) (vreinterpretq_s8_s16(vqsubq_s16(vreinterpretq_s16_s8(a), vreinterpretq_s16_s8(b))))
 #define _mm_subs_epu8(a, b) (vreinterpretq_s8_u8(vqsubq_u8(vreinterpretq_u8_s8(a), vreinterpretq_u8_s8(b))))
 #define _mm_subs_epu16(a, b) (vreinterpretq_s8_u16(vqsubq_u16(vreinterpretq_u16_s8(a), vreinterpretq_u16_s8(b))))


 /*****************************************************************************
  * Comparison                                                                *
  *****************************************************************************/

 #define _mm_cmpeq_epi8(a, b) (vreinterpretq_s8_u8(vceqq_s8((a), (b))))
 #define _mm_cmpeq_epi16(a, b) (vreinterpretq_s8_u16(vceqq_s16(vreinterpretq_s16_s8(a), vreinterpretq_s16_s8(b))))
 #define _mm_cmpeq_epi32(a, b) (vreinterpretq_s8_u32(vceqq_s32(vreinterpretq_s32_s8(a), vreinterpretq_s32_s8(b))))

 #define _mm_cmplt_epi8(a, b) (vreinterpretq_s8_u8(vcltq_s8((a), (b))))
 #define _mm_cmplt_epi16(a, b) (vreinterpretq_s8_u16(vcltq_s16(vreinterpretq_s16_s8(a), vreinterpretq_s16_s8(b))))
 #define _mm_cmplt_epi32(a, b) (vreinterpretq_s8_u32(vcltq_s32(vreinterpretq_s32_s8(a), vreinterpretq_s32_s8(b))))

 #define _mm_cmpgt_epi8(a, b) (vreinterpretq_s8_u8(vcgtq_s8((a), (b))))
 #define _mm_cmpgt_epi16(a, b) (vreinterpretq_s8_u16(vcgtq_s16(vreinterpretq_s16_s8(a), vreinterpretq_s16_s8(b))))
 #define _mm_cmpgt_epi32(a, b) (vreinterpretq_s8_u32(vcgtq_s32(vreinterpretq_s32_s8(a), vreinterpretq_s32_s8(b))))


 /*****************************************************************************
  * Convert                                                                   *
  *****************************************************************************/

 #define _mm_cvtsi32_si128(x) (vreinterpretq_s8_s32((int32x4_t) { (int32_t) (x), 0, 0, 0 }))
 #define _mm_cvtsi64_si128(x) (vreinterpretq_s8_s64((int64x2_t) { (int64_t) (x), 0 }))
 #define _mm_cvtsi128_si32(x) (vgetq_lane_s32(vreinterpretq_s32_s8(x), 0))
 #define _mm_cvtsi128_si64(x) (vgetq_lane_s64(vreinterpretq_s64_s8(x), 0))


 /*****************************************************************************
  * Others                                                                    *
  *****************************************************************************/

 #define _mm_packs_epi16(a, b) (vcombine_s8(vqmovn_s16(vreinterpretq_s16_s8(a)), vqmovn_s16(vreinterpretq_s16_s8(b))))
 #define _mm_packs_epi32(a, b) \
	 (vreinterpretq_s8_s16(vcombine_s16(vqmovn_s32(vreinterpretq_s32_s8(a)), vqmovn_s32(vreinterpretq_s32_s8(b)))))
 #define _mm_packus_epi16(a, b) \
	 (vreinterpretq_s8_u8(vcombine_u8(vqmovun_s16(vreinterpretq_s16_s8(a)), vqmovun_s16(vreinterpretq_s16_s8(b)))))

 #define _mm_extract_epi16(x, imm) (vgetq_lane_s16(vreinterpretq_s16_s8(x), (imm)))
 #define _mm_insert_epi16(x, val, imm) (vreinterpretq_s8_s16(vsetq_lane_s16((int16_t) (val), vreinterpretq_s16_s8(x), (imm))))

 static XSSE_FORCE_INLINE int _mm_movemask_epi8(__m128i x)
 {
	 /**
	  * based on code from
	  * https://community.arm.com/arm-community-blogs/b/servers-and-cloud-computing-blog/posts/porting-x86-vector-bitmask-optimizations-to-arm-neon
	  */
	 uint16x8_t high_bits = vreinterpretq_u16_u8(vshrq_n_u8(vreinterpretq_u8_s8(x), 7));
	 uint32x4_t paired16 = vreinterpretq_u32_u16(vsraq_n_u16(high_bits, high_bits, 7));
	 uint64x2_t paired32 = vreinterpretq_u64_u32(vsraq_n_u32(paired16, paired16, 14));
	 uint8x16_t paired64 = vreinterpretq_u8_u64(vsraq_n_u64(paired32, paired32, 28));
	 return vgetq_lane_u8(paired64, 0) | ((int) vgetq_lane_u8(paired64, 8) << 8);
 }

 #define _MM_SHUFFLE(a, b, c, d) (((a) << 6) | ((b) << 4) | ((c) << 2) | (d))
 #ifdef XSSE_HAS_MACRO_EXTENSION
 #define _mm_shuffle_epi32(x, imm) __extension__({ \
		 int32x4_t __xsse_tmp = vreinterpretq_s32_s8(x); \
		 vreinterpretq_s8_s32((int32x4_t) { \
			 (int32_t) vgetq_lane_s32(__xsse_tmp, ((imm) >> 0) & 0x3), \
			 (int32_t) vgetq_lane_s32(__xsse_tmp, ((imm) >> 2) & 0x3), \
			 (int32_t) vgetq_lane_s32(__xsse_tmp, ((imm) >> 4) & 0x3), \
			 (int32_t) vgetq_lane_s32(__xsse_tmp, ((imm) >> 6) & 0x3) \
		 }); \
	 })
 #define _mm_shufflehi_epi16(x, imm) __extension__({ \
		 int16x8_t __xsse_tmp = vreinterpretq_s16_s8(x); \
		 vreinterpretq_s8_s16(vcombine_s16( \
			 vget_low_s16(__xsse_tmp), \
			 (int16x4_t) { \
				 (int16_t) vgetq_lane_s16(__xsse_tmp, (((imm) >> 0) & 0x3) + 4), \
				 (int16_t) vgetq_lane_s16(__xsse_tmp, (((imm) >> 2) & 0x3) + 4), \
				 (int16_t) vgetq_lane_s16(__xsse_tmp, (((imm) >> 4) & 0x3) + 4), \
				 (int16_t) vgetq_lane_s16(__xsse_tmp, (((imm) >> 6) & 0x3) + 4) \
			 } \
		 )); \
	 })
 #define _mm_shufflelo_epi16(x, imm) __extension__({ \
		 int16x8_t __xsse_tmp = vreinterpretq_s16_s8(x); \
		 vreinterpretq_s8_s16(vcombine_s16( \
			 (int16x4_t) { \
				 (int16_t) vgetq_lane_s16(__xsse_tmp, (((imm) >> 0) & 0x3)), \
				 (int16_t) vgetq_lane_s16(__xsse_tmp, (((imm) >> 2) & 0x3)), \
				 (int16_t) vgetq_lane_s16(__xsse_tmp, (((imm) >> 4) & 0x3)), \
				 (int16_t) vgetq_lane_s16(__xsse_tmp, (((imm) >> 6) & 0x3)) \
			 }, \
			 vget_high_s16(__xsse_tmp) \
		 )); \
	 })
 #else
 static XSSE_FORCE_INLINE __m128i _mm_shuffle_epi32(__m128i x, int imm)
 {
	 int32x4_t vec = vreinterpretq_s32_s8(x);
	 int32_t arr[4];
	 vst1q_s32(arr, vec);

	 return vreinterpretq_s8_s32((int32x4_t) {
		 arr[(imm >> 0) & 0x3],
		 arr[(imm >> 2) & 0x3],
		 arr[(imm >> 4) & 0x3],
		 arr[(imm >> 6) & 0x3]
	 });
 }
 static XSSE_FORCE_INLINE __m128i _mm_shufflehi_epi16(__m128i x, int imm)
 {
	 int16x8_t vec = vreinterpretq_s16_s8(x);
	 int16_t arr[8];
	 vst1q_s16(arr, vec);

	 return vreinterpretq_s8_s16((int16x8_t) {
		 arr[0], arr[1], arr[2], arr[3],
		 arr[((imm >> 0) & 0x3) + 4],
		 arr[((imm >> 2) & 0x3) + 4],
		 arr[((imm >> 4) & 0x3) + 4],
		 arr[((imm >> 6) & 0x3) + 4]
	 });
 }
 static XSSE_FORCE_INLINE __m128i _mm_shufflelo_epi16(__m128i x, int imm)
 {
	 int16x8_t vec = vreinterpretq_s16_s8(x);
	 int16_t arr[8];
	 vst1q_s16(arr, vec);

	 return vreinterpretq_s8_s16((int16x8_t) {
		 arr[((imm >> 0) & 0x3)],
		 arr[((imm >> 2) & 0x3)],
		 arr[((imm >> 4) & 0x3)],
		 arr[((imm >> 6) & 0x3)],
		 arr[4], arr[5], arr[6], arr[7]
	 });
 }
 #endif

 #define _mm_unpackhi_epi8(a, b) (vzip2q_s8((a), (b)))
 #define _mm_unpackhi_epi16(a, b) (vreinterpretq_s8_s16(vzip2q_s16(vreinterpretq_s16_s8(a), vreinterpretq_s16_s8(b))))
 #define _mm_unpackhi_epi32(a, b) (vreinterpretq_s8_s32(vzip2q_s32(vreinterpretq_s32_s8(a), vreinterpretq_s32_s8(b))))
 #define _mm_unpackhi_epi64(a, b) (vreinterpretq_s8_s64(vzip2q_s64(vreinterpretq_s64_s8(a), vreinterpretq_s64_s8(b))))

 #define _mm_unpacklo_epi8(a, b) (vzip1q_s8((a), (b)))
 #define _mm_unpacklo_epi16(a, b) (vreinterpretq_s8_s16(vzip1q_s16(vreinterpretq_s16_s8(a), vreinterpretq_s16_s8(b))))
 #define _mm_unpacklo_epi32(a, b) (vreinterpretq_s8_s32(vzip1q_s32(vreinterpretq_s32_s8(a), vreinterpretq_s32_s8(b))))
 #define _mm_unpacklo_epi64(a, b) (vreinterpretq_s8_s64(vzip1q_s64(vreinterpretq_s64_s8(a), vreinterpretq_s64_s8(b))))

 #define _mm_move_epi64(x) (vreinterpretq_s8_s64((int64x2_t) { vgetq_lane_s64(vreinterpretq_s64_s8(x), 0), 0 }))

 #endif

 #endif /* XSSE_H */
b IDATxytVսϓ22 A@IR :hCiZ[v*E:WũZA ^dQeQ @ !jZ'>gsV仿$|?g)&x-EIENT ;@xT.i%-X}SvS5.r/UHz^_$-W"w)Ɗ/@Z &IoX P$K}JzX:;` &, ŋui,e6mX ԵrKb1ԗ)DADADADADADADADADADADADADADADADADADADADADADADADADADADADADADADADADADADADADADADADADADA݀!I*]R;I2$eZ#ORZSrr6mteffu*((Pu'v{DIߔ4^pIm'77WEEE;vƎ4-$]'RI{\I&G :IHJ DWBB=\WR޽m o$K(V9ABB.}jѢv`^?IOȅ} ڶmG}T#FJ`56$-ھ}FI&v;0(h;Б38CӧOWf!;A i:F_m9s&|q%=#wZprrrla A &P\\СC[A#! {olF} `E2}MK/vV)i{4BffV\|ۭX`b@kɶ@%i$K z5zhmX[IXZ` 'b%$r5M4º/l ԃߖxhʔ)[@=} K6IM}^5k㏷݆z ΗÿO:gdGBmyT/@+Vɶ纽z񕏵l.y޴it뭷zV0[Y^>Wsqs}\/@$(T7f.InݺiR$푔n.~?H))\ZRW'Mo~v Ov6oԃxz! S,&xm/yɞԟ?'uaSѽb,8GלKboi&3t7Y,)JJ c[nzӳdE&KsZLӄ I?@&%ӟ۶mSMMњ0iؐSZ,|J+N ~,0A0!5%Q-YQQa3}$_vVrf9f?S8`zDADADADADADADADADAdqP,تmMmg1V?rSI꒟]u|l RCyEf٢9 jURbztѰ!m5~tGj2DhG*{H9)꒟ר3:(+3\?/;TUݭʴ~S6lڧUJ*i$d(#=Yݺd{,p|3B))q:vN0Y.jkק6;SɶVzHJJЀ-utѹսk>QUU\޲~]fFnK?&ߡ5b=z9)^|u_k-[y%ZNU6 7Mi:]ۦtk[n X(e6Bb."8cۭ|~teuuw|ήI-5"~Uk;ZicEmN/:]M> cQ^uiƞ??Ңpc#TUU3UakNwA`:Y_V-8.KKfRitv޲* 9S6ֿj,ՃNOMߤ]z^fOh|<>@Å5 _/Iu?{SY4hK/2]4%it5q]GGe2%iR| W&f*^]??vq[LgE_3f}Fxu~}qd-ږFxu~I N>\;͗O֊:̗WJ@BhW=y|GgwܷH_NY?)Tdi'?խwhlmQi !SUUsw4kӺe4rfxu-[nHtMFj}H_u~w>)oV}(T'ebʒv3_[+vn@Ȭ\S}ot}w=kHFnxg S 0eޢm~l}uqZfFoZuuEg `zt~? b;t%>WTkķh[2eG8LIWx,^\thrl^Ϊ{=dž<}qV@ ⠨Wy^LF_>0UkDuʫuCs$)Iv:IK;6ֲ4{^6եm+l3>݆uM 9u?>Zc }g~qhKwڭeFMM~pМuqǿz6Tb@8@Y|jx](^]gf}M"tG -w.@vOqh~/HII`S[l.6nØXL9vUcOoB\xoǤ'T&IǍQw_wpv[kmO{w~>#=P1Pɞa-we:iǏlHo׈꒟f9SzH?+shk%Fs:qVhqY`jvO'ρ?PyX3lх]˾uV{ݞ]1,MzYNW~̈́ joYn}ȚF߾׮mS]F z+EDxm/d{F{-W-4wY듏:??_gPf ^3ecg ҵs8R2מz@TANGj)}CNi/R~}c:5{!ZHӋӾ6}T]G]7W6^n 9*,YqOZj:P?Q DFL|?-^.Ɵ7}fFh׶xe2Pscz1&5\cn[=Vn[ĶE鎀uˌd3GII k;lNmشOuuRVfBE]ۣeӶu :X-[(er4~LHi6:Ѻ@ԅrST0trk%$Č0ez" *z"T/X9|8.C5Feg}CQ%͞ˣJvL/?j^h&9xF`њZ(&yF&Iݻfg#W;3^{Wo^4'vV[[K';+mӍִ]AC@W?1^{එyh +^]fm~iԵ]AB@WTk̏t uR?l.OIHiYyԶ]Aˀ7c:q}ힽaf6Z~қm(+sK4{^6}T*UUu]n.:kx{:2 _m=sAߤU@?Z-Vކеz왍Nэ{|5 pڶn b p-@sPg]0G7fy-M{GCF'%{4`=$-Ge\ eU:m+Zt'WjO!OAF@ik&t݆ϥ_ e}=]"Wz_.͜E3leWFih|t-wZۍ-uw=6YN{6|} |*={Ѽn.S.z1zjۻTH]흾 DuDvmvK.`V]yY~sI@t?/ϓ. m&["+P?MzovVЫG3-GRR[(!!\_,^%?v@ҵő m`Y)tem8GMx.))A]Y i`ViW`?^~!S#^+ѽGZj?Vģ0.))A꨷lzL*]OXrY`DBBLOj{-MH'ii-ϰ ok7^ )쭡b]UXSְmռY|5*cֽk0B7镹%ڽP#8nȎq}mJr23_>lE5$iwui+ H~F`IjƵ@q \ @#qG0".0" l`„.0! ,AQHN6qzkKJ#o;`Xv2>,tێJJ7Z/*A .@fفjMzkg @TvZH3Zxu6Ra'%O?/dQ5xYkU]Rֽkق@DaS^RSּ5|BeHNN͘p HvcYcC5:y #`οb;z2.!kr}gUWkyZn=f Pvsn3p~;4p˚=ē~NmI] ¾ 0lH[_L hsh_ғߤc_њec)g7VIZ5yrgk̞W#IjӪv>՞y睝M8[|]\շ8M6%|@PZڨI-m>=k='aiRo-x?>Q.}`Ȏ:Wsmu u > .@,&;+!!˱tﭧDQwRW\vF\~Q7>spYw$%A~;~}6¾ g&if_=j,v+UL1(tWake:@Ș>j$Gq2t7S?vL|]u/ .(0E6Mk6hiۺzښOrifޱxm/Gx> Lal%%~{lBsR4*}{0Z/tNIɚpV^#Lf:u@k#RSu =S^ZyuR/.@n&΃z~B=0eg뺆#,Þ[B/?H uUf7y Wy}Bwegל`Wh(||`l`.;Ws?V@"c:iɍL֯PGv6zctM̠':wuW;d=;EveD}9J@B(0iհ bvP1{\P&G7D޴Iy_$-Qjm~Yrr&]CDv%bh|Yzni_ˆR;kg}nJOIIwyuL}{ЌNj}:+3Y?:WJ/N+Rzd=hb;dj͒suݔ@NKMԄ jqzC5@y°hL m;*5ezᕏ=ep XL n?מ:r`۵tŤZ|1v`V뽧_csج'ߤ%oTuumk%%%h)uy]Nk[n 'b2 l.=͜E%gf$[c;s:V-͞WߤWh-j7]4=F-X]>ZLSi[Y*We;Zan(ӇW|e(HNNP5[= r4tP &0<pc#`vTNV GFqvTi*Tyam$ߏWyE*VJKMTfFw>'$-ؽ.Ho.8c"@DADADADADADADADADA~j*֘,N;Pi3599h=goضLgiJ5փy~}&Zd9p֚ e:|hL``b/d9p? fgg+%%hMgXosج, ΩOl0Zh=xdjLmhݻoO[g_l,8a]٭+ӧ0$I]c]:粹:Teꢢ"5a^Kgh,&= =՟^߶“ߢE ܹS J}I%:8 IDAT~,9/ʃPW'Mo}zNƍ쨓zPbNZ~^z=4mswg;5 Y~SVMRXUյڱRf?s:w ;6H:ºi5-maM&O3;1IKeamZh͛7+##v+c ~u~ca]GnF'ټL~PPPbn voC4R,ӟgg %hq}@#M4IÇ Oy^xMZx ) yOw@HkN˖-Sǎmb]X@n+i͖!++K3gd\$mt$^YfJ\8PRF)77Wא!Cl$i:@@_oG I{$# 8磌ŋ91A (Im7֭>}ߴJq7ޗt^ -[ԩSj*}%]&' -ɓ'ꫯVzzvB#;a 7@GxI{j޼ƌ.LÇWBB7`O"I$/@R @eee@۷>}0,ɒ2$53Xs|cS~rpTYYY} kHc %&k.], @ADADADADADADADADA@lT<%''*Lo^={رc5h %$+CnܸQ3fҥK}vUVVs9G R,_{xˇ3o߾;TTTd}馛]uuuG~iԩ@4bnvmvfϞ /Peeeq}}za I~,誫{UWW뮻}_~YƍSMMMYχ֝waw\ďcxꩧtEƍկ_?۷5@u?1kNׯWzz/wy>}zj3 k(ٺuq_Zvf̘:~ ABQ&r|!%KҥKgԞ={<_X-z !CyFUUz~ ABQIIIjݺW$UXXDٳZ~ ABQƍecW$<(~<RSSvZujjjԧOZQu@4 8m&&&jԩg$ď1h ͟?_{768@g =@`)))5o6m3)ѣƌJ;wҿUTT /KZR{~a=@0o<*狔iFɶ[ˎ;T]]OX@?K.ۈxN pppppppppppppppppPfl߾] ,{ァk۶mڿo5BTӦMӴiӴ|r DB2e|An!Dy'tkΝ[A $***t5' "!駟oaDnΝ:t֭[gDШQ06qD;@ x M6v(PiizmZ4ew"@̴ixf [~-Fٱc&IZ2|n!?$@{[HTɏ#@hȎI# _m(F /6Z3z'\r,r!;w2Z3j=~GY7"I$iI.p_"?pN`y DD?: _  Gÿab7J !Bx@0 Bo cG@`1C[@0G @`0C_u V1 aCX>W ` | `!<S `"<. `#c`?cAC4 ?c p#~@0?:08&_MQ1J h#?/`7;I  q 7a wQ A 1 Hp !#<8/#@1Ul7=S=K.4Z?E_$i@!1!E4?`P_  @Bă10#: "aU,xbFY1 [n|n #'vEH:`xb #vD4Y hi.i&EΖv#O H4IŶ}:Ikh @tZRF#(tXҙzZ ?I3l7q@õ|ۍ1,GpuY Ꮿ@hJv#xxk$ v#9 5 }_$c S#=+"K{F*m7`#%H:NRSp6I?sIՖ{Ap$I$I:QRv2$Z @UJ*$]<FO4IENDB`