/* CYGNUS LOCAL whole file */

#ifndef _XMMINTRIN_H_INCLUDED
#define _XMMINTRIN_H_INCLUDED

/* We need type definitions from the MMX header file.  */
#include <mmintrin.h>

/* Types used by the SSE intrinsics below.  All three are declared as
   `int' with a machine-mode attribute that selects the real layout.

   __m128 (mode TI) is the type the intrinsic macros accept and return.
   __v4sf (mode V4SF) is the type operands are cast to before they are
   handed to the __builtin_ia32_* functions.
   __v4si (mode V4SI) is declared here but not referenced by any macro or
   function visible in this header.  */
typedef int __m128 __attribute__ ((mode (TI)));
typedef int __v4sf __attribute__ ((mode (V4SF)));
typedef int __v4si __attribute__ ((mode (V4SI)));

/* Build a shuffle selector immediate from four field values.  FP3 is
   shifted to bit 6, FP2 to bit 4, FP1 to bit 2 and FP0 stays at bit 0.
   The arguments are not masked, so each one is expected to fit in two
   bits.  */
#define _MM_SHUFFLE(fp3, fp2, fp1, fp0)		\
  (((fp3) << 6)					\
   | ((fp2) << 4)				\
   | ((fp1) << 2)				\
   | ((fp0)))

/* Transpose, in place, the 4x4 matrix whose rows are the vector lvalues
   ROW0, ROW1, ROW2 and ROW3.  Every argument is read exactly once into a
   temporary before any of them is written.

   Stage one gathers half rows:
     __t0 = low  halves of rows 0 and 1    __t1 = low  halves of rows 2 and 3
     __t2 = high halves of rows 0 and 1    __t3 = high halves of rows 2 and 3
   Stage two picks the even (0x88) or odd (0xDD) elements of a low pair
   (__t0, __t1) or a high pair (__t2, __t3), which produces one column of
   the input matrix per output row.

   The previous version built __t1 from rows 0/1 and __t2 from rows 2/3,
   so stage two combined two temporaries from the same pair of rows and
   the result was not a transpose.  */
#define _MM_TRANSPOSE4_PS(row0, row1, row2, row3) do {			\
  __v4sf __r0 = (row0), __r1 = (row1), __r2 = (row2), __r3 = (row3);	\
  __v4sf __t0 = __builtin_ia32_shufps (__r0, __r1, 0x44);		\
  __v4sf __t2 = __builtin_ia32_shufps (__r0, __r1, 0xEE);		\
  __v4sf __t1 = __builtin_ia32_shufps (__r2, __r3, 0x44);		\
  __v4sf __t3 = __builtin_ia32_shufps (__r2, __r3, 0xEE);		\
  (row0) = __builtin_ia32_shufps (__t0, __t1, 0x88);			\
  (row1) = __builtin_ia32_shufps (__t0, __t1, 0xDD);			\
  (row2) = __builtin_ia32_shufps (__t2, __t3, 0x88);			\
  (row3) = __builtin_ia32_shufps (__t2, __t3, 0xDD);			\
} while (0)

/* Hint constants for use with _mm_prefetch.  They are meant to be passed
   as its second argument, which this header forwards unchanged to
   __builtin_ia32_prefetch.  */
#define _MM_HINT_T0	1
#define _MM_HINT_T1	2
#define _MM_HINT_T2	3
#define _MM_HINT_NTA	0

/* Attribute shorthand that requests 16-byte alignment for the declaration
   it is attached to.
   (this declspec not supported with 0.A or 0.B) */
#define _MM_ALIGN16 __attribute__ ((aligned (16)))

/* Bit definitions for the MXCSR value read by _mm_getcsr and written by
   _mm_setcsr.  They are the operands of the _MM_SET_xxx / _MM_GET_xxx
   macros for setting and reading the MXCSR.  */

/* Exception state field.  _MM_EXCEPT_MASK is the OR of the six
   individual flag bits listed after it.  */
#define _MM_EXCEPT_MASK       0x003f
#define _MM_EXCEPT_INVALID    0x0001
#define _MM_EXCEPT_DENORM     0x0002
#define _MM_EXCEPT_DIV_ZERO   0x0004
#define _MM_EXCEPT_OVERFLOW   0x0008
#define _MM_EXCEPT_UNDERFLOW  0x0010
#define _MM_EXCEPT_INEXACT    0x0020

/* Exception mask field.  Each bit is the matching _MM_EXCEPT_xxx bit
   shifted left by seven; _MM_MASK_MASK is the OR of all six.  */
#define _MM_MASK_MASK         0x1f80
#define _MM_MASK_INVALID      0x0080
#define _MM_MASK_DENORM       0x0100
#define _MM_MASK_DIV_ZERO     0x0200
#define _MM_MASK_OVERFLOW     0x0400
#define _MM_MASK_UNDERFLOW    0x0800
#define _MM_MASK_INEXACT      0x1000

/* Rounding mode field: a two-bit value selected by _MM_ROUND_MASK, so
   exactly one of the four modes is in effect at a time.  */
#define _MM_ROUND_MASK        0x6000
#define _MM_ROUND_NEAREST     0x0000
#define _MM_ROUND_DOWN        0x2000
#define _MM_ROUND_UP          0x4000
#define _MM_ROUND_TOWARD_ZERO 0x6000

/* Flush-to-zero field: a single bit, either on or off.  */
#define _MM_FLUSH_ZERO_MASK   0x8000
#define _MM_FLUSH_ZERO_ON     0x8000
#define _MM_FLUSH_ZERO_OFF    0x0000

/* Field accessors for the MXCSR value.

   Every _MM_SET_xxx macro is a read-modify-write: it fetches the current
   value with _mm_getcsr, clears the bits of one field, ORs in the
   caller's bits and stores the result with _mm_setcsr.  The argument is
   not masked, so bits outside the field are stored as given.

   Every _MM_GET_xxx macro returns the current value ANDed with the mask
   of its field.  */

/* Exception state flags (_MM_EXCEPT_xxx).  */
#define _MM_SET_EXCEPTION_STATE(mask) \
  _mm_setcsr((_mm_getcsr() & ~_MM_EXCEPT_MASK) | (mask))
#define _MM_GET_EXCEPTION_STATE() \
  (_mm_getcsr() & _MM_EXCEPT_MASK)

/* Exception mask bits (_MM_MASK_xxx).  */
#define _MM_SET_EXCEPTION_MASK(mask) \
  _mm_setcsr((_mm_getcsr() & ~_MM_MASK_MASK) | (mask))
#define _MM_GET_EXCEPTION_MASK() \
  (_mm_getcsr() & _MM_MASK_MASK)

/* Rounding mode (_MM_ROUND_xxx).  */
#define _MM_SET_ROUNDING_MODE(mode) \
  _mm_setcsr((_mm_getcsr() & ~_MM_ROUND_MASK) | (mode))
#define _MM_GET_ROUNDING_MODE() \
  (_mm_getcsr() & _MM_ROUND_MASK)

/* Flush-to-zero mode (_MM_FLUSH_ZERO_xxx).  The getter takes a MODE
   parameter that its expansion does not use.  */
#define _MM_SET_FLUSH_ZERO_MODE(mode) \
  _mm_setcsr((_mm_getcsr() & ~_MM_FLUSH_ZERO_MASK) | (mode))
#define _MM_GET_FLUSH_ZERO_MODE(mode) \
  (_mm_getcsr() & _MM_FLUSH_ZERO_MASK)

/*****************************************************/
/*     INTRINSICS FUNCTION PROTOTYPES START HERE     */
/*****************************************************/

/* Arithmetic intrinsics.  Each macro casts its __m128 operands to __v4sf,
   calls the builtin of the matching name and casts the result back to
   __m128.  By Intel's naming convention the _ps forms work on all four
   elements and the _ss forms on the lowest element only; that behaviour
   is implemented by the builtins, not by these wrappers.  */
#define _mm_add_ps(A,B)		((__m128)__builtin_ia32_addps ((__v4sf)(A), (__v4sf)(B)))
#define _mm_add_ss(A,B)		((__m128)__builtin_ia32_addss ((__v4sf)(A), (__v4sf)(B)))
#define _mm_sub_ps(A,B)		((__m128)__builtin_ia32_subps ((__v4sf)(A), (__v4sf)(B)))
#define _mm_sub_ss(A,B)		((__m128)__builtin_ia32_subss ((__v4sf)(A), (__v4sf)(B)))
#define _mm_mul_ps(A,B)		((__m128)__builtin_ia32_mulps ((__v4sf)(A), (__v4sf)(B)))
#define _mm_mul_ss(A,B)		((__m128)__builtin_ia32_mulss ((__v4sf)(A), (__v4sf)(B)))
#define _mm_div_ps(A,B)		((__m128)__builtin_ia32_divps ((__v4sf)(A), (__v4sf)(B)))
#define _mm_div_ss(A,B)		((__m128)__builtin_ia32_divss ((__v4sf)(A), (__v4sf)(B)))
#define _mm_sqrt_ps(A)		((__m128)__builtin_ia32_sqrtps ((__v4sf)(A)))
#define _mm_sqrt_ss(A)		((__m128)__builtin_ia32_sqrtss ((__v4sf)(A)))
#define _mm_rcp_ps(A)		((__m128)__builtin_ia32_rcpps ((__v4sf)(A)))
#define _mm_rcp_ss(A)		((__m128)__builtin_ia32_rcpss ((__v4sf)(A)))
#define _mm_rsqrt_ps(A)		((__m128)__builtin_ia32_rsqrtps ((__v4sf)(A)))
/* Misspelled name under which _mm_rsqrt_ps used to be defined; kept so
   that existing users of the old spelling keep compiling.  */
#define _mm_rqsrt_ps(A)		_mm_rsqrt_ps (A)
#define _mm_rsqrt_ss(A)		((__m128)__builtin_ia32_rsqrtss ((__v4sf)(A)))
#define _mm_min_ps(A,B)		((__m128)__builtin_ia32_minps ((__v4sf)(A), (__v4sf)(B)))
#define _mm_min_ss(A,B)		((__m128)__builtin_ia32_minss ((__v4sf)(A), (__v4sf)(B)))
#define _mm_max_ps(A,B)		((__m128)__builtin_ia32_maxps ((__v4sf)(A), (__v4sf)(B)))
/* This used to expand to the minss builtin and so returned the minimum.  */
#define _mm_max_ss(A,B)		((__m128)__builtin_ia32_maxss ((__v4sf)(A), (__v4sf)(B)))

/* Bitwise logical intrinsics.  Unlike the arithmetic wrappers these hand
   their operands to the builtin without any cast and return the
   builtin's result as is.  Note that _mm_andnot_ps maps onto the builtin
   named nandps.  */
#define _mm_and_ps(A, B) \
  (__builtin_ia32_andps ((A), (B)))
#define _mm_andnot_ps(A, B) \
  (__builtin_ia32_nandps ((A), (B)))
#define _mm_or_ps(A, B) \
  (__builtin_ia32_orps ((A), (B)))
#define _mm_xor_ps(A, B) \
  (__builtin_ia32_xorps ((A), (B)))

/* Comparison intrinsics that produce a vector result.  Every macro casts
   both operands to __v4sf, calls the builtin whose name carries the same
   predicate (eq, lt, le, gt, ge, their negations neq/nlt/nle/ngt/nge,
   ord, unord) and casts the result back to __m128.  */

/* _ps forms.  */
#define _mm_cmpeq_ps(A, B) \
  ((__m128) __builtin_ia32_cmpeqps ((__v4sf) (A), (__v4sf) (B)))
#define _mm_cmplt_ps(A, B) \
  ((__m128) __builtin_ia32_cmpltps ((__v4sf) (A), (__v4sf) (B)))
#define _mm_cmple_ps(A, B) \
  ((__m128) __builtin_ia32_cmpleps ((__v4sf) (A), (__v4sf) (B)))
#define _mm_cmpgt_ps(A, B) \
  ((__m128) __builtin_ia32_cmpgtps ((__v4sf) (A), (__v4sf) (B)))
#define _mm_cmpge_ps(A, B) \
  ((__m128) __builtin_ia32_cmpgeps ((__v4sf) (A), (__v4sf) (B)))
#define _mm_cmpneq_ps(A, B) \
  ((__m128) __builtin_ia32_cmpneqps ((__v4sf) (A), (__v4sf) (B)))
#define _mm_cmpnlt_ps(A, B) \
  ((__m128) __builtin_ia32_cmpnltps ((__v4sf) (A), (__v4sf) (B)))
#define _mm_cmpnle_ps(A, B) \
  ((__m128) __builtin_ia32_cmpnleps ((__v4sf) (A), (__v4sf) (B)))
#define _mm_cmpngt_ps(A, B) \
  ((__m128) __builtin_ia32_cmpngtps ((__v4sf) (A), (__v4sf) (B)))
#define _mm_cmpnge_ps(A, B) \
  ((__m128) __builtin_ia32_cmpngeps ((__v4sf) (A), (__v4sf) (B)))
#define _mm_cmpord_ps(A, B) \
  ((__m128) __builtin_ia32_cmpordps ((__v4sf) (A), (__v4sf) (B)))
#define _mm_cmpunord_ps(A, B) \
  ((__m128) __builtin_ia32_cmpunordps ((__v4sf) (A), (__v4sf) (B)))

/* _ss forms.  */
#define _mm_cmpeq_ss(A, B) \
  ((__m128) __builtin_ia32_cmpeqss ((__v4sf) (A), (__v4sf) (B)))
#define _mm_cmplt_ss(A, B) \
  ((__m128) __builtin_ia32_cmpltss ((__v4sf) (A), (__v4sf) (B)))
#define _mm_cmple_ss(A, B) \
  ((__m128) __builtin_ia32_cmpless ((__v4sf) (A), (__v4sf) (B)))
#define _mm_cmpgt_ss(A, B) \
  ((__m128) __builtin_ia32_cmpgtss ((__v4sf) (A), (__v4sf) (B)))
#define _mm_cmpge_ss(A, B) \
  ((__m128) __builtin_ia32_cmpgess ((__v4sf) (A), (__v4sf) (B)))
#define _mm_cmpneq_ss(A, B) \
  ((__m128) __builtin_ia32_cmpneqss ((__v4sf) (A), (__v4sf) (B)))
#define _mm_cmpnlt_ss(A, B) \
  ((__m128) __builtin_ia32_cmpnltss ((__v4sf) (A), (__v4sf) (B)))
#define _mm_cmpnle_ss(A, B) \
  ((__m128) __builtin_ia32_cmpnless ((__v4sf) (A), (__v4sf) (B)))
#define _mm_cmpngt_ss(A, B) \
  ((__m128) __builtin_ia32_cmpngtss ((__v4sf) (A), (__v4sf) (B)))
#define _mm_cmpnge_ss(A, B) \
  ((__m128) __builtin_ia32_cmpngess ((__v4sf) (A), (__v4sf) (B)))
#define _mm_cmpord_ss(A, B) \
  ((__m128) __builtin_ia32_cmpordss ((__v4sf) (A), (__v4sf) (B)))
#define _mm_cmpunord_ss(A, B) \
  ((__m128) __builtin_ia32_cmpunordss ((__v4sf) (A), (__v4sf) (B)))

/* Scalar comparison intrinsics that yield a truth value rather than a
   vector.  Intel specifies an `int' return for all of them, so the
   builtin's result is passed through unchanged; the previous definitions
   cast it to __m128, which gave every comparison the wrong result type.
   Both operands are still cast to __v4sf before the call.  */
#define _mm_comieq_ss(A,B)	(__builtin_ia32_comieq ((__v4sf)(A), (__v4sf)(B)))
#define _mm_comilt_ss(A,B)	(__builtin_ia32_comilt ((__v4sf)(A), (__v4sf)(B)))
#define _mm_comile_ss(A,B)	(__builtin_ia32_comile ((__v4sf)(A), (__v4sf)(B)))
#define _mm_comigt_ss(A,B)	(__builtin_ia32_comigt ((__v4sf)(A), (__v4sf)(B)))
#define _mm_comige_ss(A,B)	(__builtin_ia32_comige ((__v4sf)(A), (__v4sf)(B)))
#define _mm_comineq_ss(A,B)	(__builtin_ia32_comineq ((__v4sf)(A), (__v4sf)(B)))
#define _mm_ucomieq_ss(A,B)	(__builtin_ia32_ucomieq ((__v4sf)(A), (__v4sf)(B)))
#define _mm_ucomilt_ss(A,B)	(__builtin_ia32_ucomilt ((__v4sf)(A), (__v4sf)(B)))
#define _mm_ucomile_ss(A,B)	(__builtin_ia32_ucomile ((__v4sf)(A), (__v4sf)(B)))
#define _mm_ucomigt_ss(A,B)	(__builtin_ia32_ucomigt ((__v4sf)(A), (__v4sf)(B)))
#define _mm_ucomige_ss(A,B)	(__builtin_ia32_ucomige ((__v4sf)(A), (__v4sf)(B)))
#define _mm_ucomineq_ss(A,B)	(__builtin_ia32_ucomineq ((__v4sf)(A), (__v4sf)(B)))

/* Conversions between SSE values and integers / MMX values.

   The one-operand forms cast A to __v4sf and return the builtin's result
   (cast to __m64 for the _ps2pi forms).

   The two-operand forms take the SSE value A plus the integer operand B
   and return an __m128.  Both arguments belong inside the builtin's
   argument list; previously a misplaced closing parenthesis passed only A
   to the builtin and left B as the right-hand side of a comma
   expression.  */
#define _mm_cvt_ss2si(A)	__builtin_ia32_cvtss2si ((__v4sf) (A))
#define _mm_cvt_ps2pi(A)	((__m64)__builtin_ia32_cvtps2pi ((__v4sf) (A)))
#define _mm_cvtt_ss2si(A)	__builtin_ia32_cvttss2si ((__v4sf) (A))
#define _mm_cvtt_ps2pi(A)	((__m64)__builtin_ia32_cvttps2pi ((__v4sf) (A)))
#define _mm_cvt_si2ss(A,B)	((__m128)__builtin_ia32_cvtsi2ss ((__v4sf) (A), (B)))
#define _mm_cvt_pi2ps(A,B)	((__m128)__builtin_ia32_cvtpi2ps ((__v4sf) (A), (__v2si) (B)))

/* Shuffle, unpack and half-register move intrinsics.

   _mm_shuffle_ps: the selector C is now passed to the builtin as its
   third argument; a misplaced closing parenthesis used to call the
   builtin with two arguments and leave C dangling in a comma expression.

   _mm_loadh_pi / _mm_loadl_pi take an SSE value A and a pointer B that is
   cast to __v2si *; _mm_storeh_pi / _mm_storel_pi take the pointer first
   and the SSE value second.

   _mm_movemask_ps takes the single operand documented by Intel.  The
   old definition demanded an unused second argument and cast A to
   __v2sf, a type this header does not declare.  */
#define _mm_shuffle_ps(A,B,C)	((__m128)__builtin_ia32_shufps ((__v4sf)(A), (__v4sf)(B), (C)))
#define _mm_unpackhi_ps(A,B)	((__m128)__builtin_ia32_unpckhps ((__v4sf)(A), (__v4sf)(B)))
#define _mm_unpacklo_ps(A,B)	((__m128)__builtin_ia32_unpcklps ((__v4sf)(A), (__v4sf)(B)))
#define _mm_loadh_pi(A,B)	((__m128)__builtin_ia32_loadhps ((__v4sf)(A), (__v2si *)(B)))
#define _mm_movehl_ps(A,B)	((__m128)__builtin_ia32_movhlps ((__v4sf)(A), (__v4sf)(B)))
#define _mm_movelh_ps(A,B)	((__m128)__builtin_ia32_movlhps ((__v4sf)(A), (__v4sf)(B)))
#define _mm_storeh_pi(A,B)	__builtin_ia32_storehps ((__v2si *)(A), (__v4sf)(B))
#define _mm_loadl_pi(A,B)	((__m128)__builtin_ia32_loadlps ((__v4sf)(A), (__v2si *)(B)))
#define _mm_storel_pi(A,B)	__builtin_ia32_storelps ((__v2si *)(A), (__v4sf)(B))
#define _mm_movemask_ps(A)	__builtin_ia32_movmskps ((__v4sf)(A))

/* Integer (MMX register) intrinsics added alongside SSE.  The vector
   operands are cast to __v4hi or __v8qi as the builtin requires and
   vector results are cast back to __m64.

   _m_pmovmskb takes the single operand documented by Intel; the old
   definition demanded a second argument that it never used.

   _m_pshufw: B is the shuffle selector, an integer, and is passed through
   as such.  It used to be cast to __v4hi as if it were a second vector
   operand.

   _m_pextrw / _m_pinsrw likewise pass their trailing integer argument
   through uncast, and _m_maskmovq passes its third argument C through
   uncast.  */
#define _m_pextrw(A,B)		__builtin_ia32_pextrw ((__v4hi)(A), (B))
#define _m_pinsrw(A,B,C)	((__m64)__builtin_ia32_pinsrw ((__v4hi)(A), (B), (C)))
#define _m_pmaxsw(A,B)		((__m64)__builtin_ia32_pmaxsw ((__v4hi)(A), (__v4hi)(B)))
#define _m_pmaxub(A,B)		((__m64)__builtin_ia32_pmaxub ((__v8qi)(A), (__v8qi)(B)))
#define _m_pminsw(A,B)		((__m64)__builtin_ia32_pminsw ((__v4hi)(A), (__v4hi)(B)))
#define _m_pminub(A,B)		((__m64)__builtin_ia32_pminub ((__v8qi)(A), (__v8qi)(B)))
#define _m_pmovmskb(A)		__builtin_ia32_pmovmskb ((__v8qi)(A))
#define _m_pmulhuw(A,B)		((__m64)__builtin_ia32_pmulhuw ((__v4hi)(A), (__v4hi)(B)))
#define _m_pshufw(A,B)		((__m64)__builtin_ia32_pshufw ((__v4hi)(A), (B)))
#define _m_maskmovq(A,B,C)	__builtin_ia32_maskmovq ((__v8qi)(A), (__v8qi)(B), (C))
#define _m_pavgb(A,B)		((__m64)__builtin_ia32_pavgb ((__v8qi)(A), (__v8qi)(B)))
#define _m_pavgw(A,B)		((__m64)__builtin_ia32_pavgw ((__v4hi)(A), (__v4hi)(B)))
#define _m_psadbw(A,B)		((__m64)__builtin_ia32_psadbw ((__v8qi)(A), (__v8qi)(B)))


/* Everything between this #if 0 and its #endif is compiled out.
   NOTE(review): as written, _mm_set_ss expands to nothing and
   _mm_loadu_ps accepts a second parameter B that its expansion never
   uses; both need attention before this section is enabled.  */
#if 0 /* These are not documented.  */
#define _mm_set_ss(A,B)
#define _mm_loadu_ps(A,B)	((__m128)__builtin_ia32_loadups (A))
#define _mm_storeu_ps(A,B)	__builtin_ia32_storeups (A, (__v4sf)(B))

/* Allocation helpers; only the prototypes appear here.  */
extern void* _mm_malloc(int siz, int al);
extern void _mm_free(void *p);

#endif

/* Set, load and store intrinsics.

   _mm_set_ps takes four values and forwards them to the setps builtin in
   the order given; _mm_setr_ps takes the same four values in reverse
   order.  The old _mm_set_ps accepted only one argument although the
   builtin is called with four (see _mm_setr_ps), and the old _mm_setr_ps
   was missing its closing parenthesis.

   The load macros pass the address A straight to the builtin and cast the
   result to __m128.  The store macros pass the address A straight through
   and cast the value B to __v4sf.  */
#define _mm_set_ps1(A)		((__m128)__builtin_ia32_setps1 (A))
#define _mm_set_ps(A,B,C,D)	((__m128)__builtin_ia32_setps (A,B,C,D))
#define _mm_setr_ps(A,B,C,D)	((__m128)__builtin_ia32_setps (D,C,B,A))
#define _mm_setzero_ps()	((__m128)__builtin_ia32_setzerops ())
#define _mm_load_ss(A)		((__m128)__builtin_ia32_loadss (A))
#define _mm_load_ps1(A)		((__m128)__builtin_ia32_loadps1 (A))
#define _mm_load_ps(A)		((__m128)__builtin_ia32_loadps (A))
#define _mm_loadr_ps(A)		((__m128)__builtin_ia32_loadrps (A))
#define _mm_store_ss(A,B)	__builtin_ia32_storess (A, (__v4sf)(B))
#define _mm_store_ps1(A,B)	__builtin_ia32_storeps1 (A, (__v4sf)(B))
#define _mm_store_ps(A,B)	__builtin_ia32_storeps (A, (__v4sf)(B))
#define _mm_storer_ps(A,B)	__builtin_ia32_storerps (A, (__v4sf)(B))

/* Prefetch, streaming-store and scalar-move intrinsics.

   _mm_prefetch and _mm_stream_pi forward both arguments unchanged;
   _mm_stream_ps casts the value B to __v4sf.  _mm_move_ss casts both
   operands to __v4sf and the result to __m128; its definition used to
   lack the final closing parenthesis, which made every use a syntax
   error.  */
#define _mm_prefetch(A,B)	__builtin_ia32_prefetch (A, B)
#define _mm_stream_pi(A,B)	__builtin_ia32_movntq (A, B)
#define _mm_stream_ps(A,B)	__builtin_ia32_movntps (A, (__v4sf)(B))
#define _mm_move_ss(A,B)	((__m128)__builtin_ia32_movss ((__v4sf)(A), (__v4sf)(B)))

/* Direct wrappers with no casts: _mm_sfence calls the sfence builtin,
   _mm_getcsr returns whatever the getmxcsr builtin returns, and
   _mm_setcsr hands A to the setmxcsr builtin.  */
#define _mm_sfence() \
  __builtin_ia32_sfence()
#define _mm_getcsr() \
  __builtin_ia32_getmxcsr()
#define _mm_setcsr(A) \
  __builtin_ia32_setmxcsr(A)

/* Alternate intrinsic names definition */
#define _mm_cvtss_si32    _mm_cvt_ss2si
#define _mm_cvtps_pi32    _mm_cvt_ps2pi
#define _mm_cvttss_si32   _mm_cvtt_ss2si
#define _mm_cvttps_pi32   _mm_cvtt_ps2pi
#define _mm_cvtsi32_ss    _mm_cvt_si2ss
#define _mm_cvtpi32_ps    _mm_cvt_pi2ps
#define _mm_extract_pi16  _m_pextrw
#define _mm_insert_pi16   _m_pinsrw
#define _mm_max_pi16      _m_pmaxsw
#define _mm_max_pu8       _m_pmaxub
#define _mm_min_pi16      _m_pminsw
#define _mm_min_pu8       _m_pminub
#define _mm_movemask_pi8  _m_pmovmskb
#define _mm_mulhi_pu16    _m_pmulhuw
#define _mm_shuffle_pi16  _m_pshufw
#define _mm_maskmove_si64 _m_maskmovq
#define _mm_avg_pu8       _m_pavgb
#define _mm_avg_pu16      _m_pavgw
#define _mm_sad_pu8       _m_psadbw
#define _mm_set1_ps       _mm_set_ps1
#define _mm_load1_ps      _mm_load_ps1
#define _mm_store1_ps     _mm_store_ps1

/*  Convert 4 16-bit signed integer values to 4 single-precision
    float values.

    The words of A are widened to doublewords two at a time, converted
    with cvtpi2ps, and the two converted pairs are combined into one SSE
    value.  Returns the result as an __m128.  */
static __inline __m128 _mm_cvtpi16_ps(__m64 a)
{
  __v4hi va = (__v4hi)a;
  __v4hi hizero = (__v4hi)__builtin_ia32_mmx_zero ();
  __v4sf sfzero = (__v4sf)__builtin_ia32_setzerops ();
  /* This comparison against zero gives us a mask that can be used to
     fill in the missing sign bits in the unpack operations below, so
     that we get signed values after unpacking.  */
  __v4hi ext_val = __builtin_ia32_pcmpgtw (hizero, va);
  /* Convert the two high words to doublewords.  */
  __v2si highsi = (__v2si)__builtin_ia32_punpckhwd (va, ext_val);
  __v4sf highsf1 = __builtin_ia32_cvtpi2ps (sfzero, highsi);
  /* That has put the values into the lower part of the SSE register;
     move them to the high position.  The source must be highsf1: the
     previous code initialized highsf from itself, i.e. from an
     indeterminate value, and discarded the converted high words.  */
  __v4sf highsf = __builtin_ia32_movlhps (highsf1, highsf1);
  /* Convert the two low words and place them below the high pair.  */
  __v2si lowsi = (__v2si)__builtin_ia32_punpcklwd (va, ext_val);
  __v4sf converted = __builtin_ia32_cvtpi2ps (highsf, lowsi);

  return (__m128) converted;
}


/*  Convert 4 16-bit unsigned integer values to 4 single-precision
    float values.

    The words of A are widened to doublewords two at a time by unpacking
    them with zero, converted with cvtpi2ps, and the two converted pairs
    are combined into one SSE value.  Returns the result as an __m128.  */
static __inline __m128 _mm_cvtpu16_ps(__m64 a)
{
  __v4hi va = (__v4hi)a;
  __v4hi hizero = (__v4hi)__builtin_ia32_mmx_zero ();
  __v4sf sfzero = (__v4sf)__builtin_ia32_setzerops ();
  /* Convert the two high words to doublewords.  */
  __v2si highsi = (__v2si)__builtin_ia32_punpckhwd (va, hizero);
  __v4sf highsf1 = __builtin_ia32_cvtpi2ps (sfzero, highsi);
  /* That has put the values into the lower part of the SSE register;
     move them to the high position.  The source must be highsf1: the
     previous code initialized highsf from itself, i.e. from an
     indeterminate value, and discarded the converted high words.  */
  __v4sf highsf = __builtin_ia32_movlhps (highsf1, highsf1);
  /* Convert the two low words and place them below the high pair.  */
  __v2si lowsi = (__v2si)__builtin_ia32_punpcklwd (va, hizero);
  __v4sf converted = __builtin_ia32_cvtpi2ps (highsf, lowsi);

  return (__m128) converted;
}


/* Convert 4 single-precision float values to 4 16-bit integer values.

   cvtps2pi converts only the two lowest elements of its operand, so the
   two upper elements are first copied down with movhlps.  The two
   doubleword pairs are then narrowed to words with packssdw, whose first
   operand supplies the low half of the result and whose second operand
   supplies the high half.

   The previous code passed the pairs to packssdw in the opposite order
   (and named them the wrong way round), which returned the elements in
   the order 2, 3, 0, 1.  */
static __inline __m64 _mm_cvtps_pi16(__m128 a)
{
  __v4sf va = (__v4sf) a;
  /* Elements 2 and 3 of A, copied into positions 0 and 1.  */
  __v4sf va_high = __builtin_ia32_movhlps (va, va);
  /* Elements 0 and 1 of A as doublewords.  */
  __v2si silow = __builtin_ia32_cvtps2pi (va);
  /* Elements 2 and 3 of A as doublewords.  */
  __v2si sihigh = __builtin_ia32_cvtps2pi (va_high);
  __v4hi result = __builtin_ia32_packssdw (silow, sihigh);
  return (__m64) result;
}


/* Convert 4 8-bit integer values to 4 single-precision float values.

   The four low bytes of A are widened to signed words and the actual
   conversion is delegated to _mm_cvtpi16_ps.  */
static __inline __m128 _mm_cvtpi8_ps(__m64 a)
{
  __v8qi bytes = (__v8qi)a;
  __v8qi zero_bytes = (__v8qi)__builtin_ia32_mmx_zero ();
  /* Comparing zero against the input yields a per-byte mask that
     supplies the missing sign bits when it is interleaved with the
     input below, so the unpacked words come out signed.  */
  __v8qi sign_fill = __builtin_ia32_pcmpgtb (zero_bytes, bytes);
  /* Interleave the four low bytes with their sign fill to get words.  */
  __v4hi words = (__v4hi)__builtin_ia32_punpcklbw (bytes, sign_fill);

  return _mm_cvtpi16_ps((__m64)words);
}


/* Convert 4 8-bit unsigned integer values to 4 single-precision
   float values.

   The four low bytes of A are widened to words by interleaving them with
   zero and the actual conversion is delegated to _mm_cvtpu16_ps.  */
static __inline __m128 _mm_cvtpu8_ps(__m64 a)
{
  __v8qi bytes = (__v8qi)a;
  __v8qi zero_bytes = (__v8qi)__builtin_ia32_mmx_zero ();
  /* Interleave the four low bytes with zero to get words.  */
  __v4hi words = (__v4hi)__builtin_ia32_punpcklbw (bytes, zero_bytes);

  return _mm_cvtpu16_ps((__m64)words);
}


/* Convert 4 single-precision float values to 4 8-bit integer values.

   A is first converted to four words by _mm_cvtps_pi16; those words are
   then packed to bytes with packsswb, using a zero vector as the second
   pack operand.  */
static __inline __m64 _mm_cvtps_pi8(__m128 a)
{
  __v4hi words = (__v4hi) _mm_cvtps_pi16 (a);
  __v4hi zero_words = (__v4hi) __builtin_ia32_mmx_zero ();
  __v8qi packed = __builtin_ia32_packsswb (words, zero_words);

  return (__m64) packed;
}


/* Convert 4 32-bit integer values to 4 single-precision float values.

   A and B each carry two integers.  Each pair is converted separately
   with cvtpi2ps on top of a zeroed SSE value, and movlhps then combines
   the two converted pairs, with A's pair as its first operand and B's
   pair as its second.  */
static __inline __m128 _mm_cvtpi32x2_ps(__m64 a, __m64 b)
{
  __v4sf zero = (__v4sf)__builtin_ia32_setzerops ();
  __v4sf from_a = __builtin_ia32_cvtpi2ps (zero, (__v2si) a);
  __v4sf from_b = __builtin_ia32_cvtpi2ps (zero, (__v2si) b);
  __v4sf combined = __builtin_ia32_movlhps (from_a, from_b);

  return (__m128) combined;
}

#endif	/*  _XMMINTRIN_H_INCLUDED */
