448 #endif |
450 #endif |
449 |
451 |
450 #if defined(USE_SSE2) |
452 #if defined(USE_SSE2) |
451 RI_INLINE static __m128i mm_mul4x32(const __m128i a, const __m128i b) { |
453 RI_INLINE static __m128i mm_mul4x32(const __m128i a, const __m128i b) { |
452 __m128i res; |
454 __m128i res; |
453 #if (_MSC_VER > 1400 ) |
455 #if defined(__GNUG__) |
|
456 __m128i m0 = _mm_mul_epu32(a, _mm_shuffle_epi32(b, _MM_SHUFFLE(1, 1, 0, 0))); |
|
457 __m128i m1 = _mm_mul_epu32(a, _mm_shuffle_epi32(b, _MM_SHUFFLE(3, 3, 2, 2))); |
|
458 |
|
459 res = _mm_cvtps_epi32(_mm_shuffle_ps(_mm_cvtepi32_ps(m0), _mm_cvtepi32_ps(m1), _MM_SHUFFLE(2, 0, 2, 0))); |
|
460 #elif (_MSC_VER > 1400) |
454 // \todo Simpler way to do this on intel? |
461 // \todo Simpler way to do this on intel? |
455 __m128i m0 = _mm_mul_epu32(a, _mm_shuffle_epi32(b, _MM_SHUFFLE(1, 1, 0, 0))); |
462 __m128i m0 = _mm_mul_epu32(a, _mm_shuffle_epi32(b, _MM_SHUFFLE(1, 1, 0, 0))); |
456 __m128i m1 = _mm_mul_epu32(a, _mm_shuffle_epi32(b, _MM_SHUFFLE(3, 3, 2, 2))); |
463 __m128i m1 = _mm_mul_epu32(a, _mm_shuffle_epi32(b, _MM_SHUFFLE(3, 3, 2, 2))); |
457 |
464 |
458 res = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(m0), _mm_castsi128_ps(m1), _MM_SHUFFLE(2, 0, 2, 0))); |
465 res = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(m0), _mm_castsi128_ps(m1), _MM_SHUFFLE(2, 0, 2, 0))); |