123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571 |
- /******************************************************************************
- * $Id: gdalsse_priv.h 28877 2015-04-08 23:11:36Z rouault $
- *
- * Project: GDAL
- * Purpose: SSE2 helper
- * Author: Even Rouault <even dot rouault at spatialys dot com>
- *
- ******************************************************************************
- * Copyright (c) 2014, Even Rouault <even dot rouault at spatialys dot com>
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the "Software"),
- * to deal in the Software without restriction, including without limitation
- * the rights to use, copy, modify, merge, publish, distribute, sublicense,
- * and/or sell copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included
- * in all copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
- * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
- * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
- * DEALINGS IN THE SOFTWARE.
- ****************************************************************************/
- #ifndef GDALSSE_PRIV_H_INCLUDED
- #define GDALSSE_PRIV_H_INCLUDED
- /* We restrict to 64bit processors because they are guaranteed to have SSE2 */
- /* Could possibly be used too on 32bit, but we would need to check at runtime */
- #if (defined(__x86_64) || defined(_M_X64)) && !defined(USE_SSE2_EMULATION)
- /* Requires SSE2 */
- #include <emmintrin.h>
- #include <string.h>
/**
 * Pair of doubles held in one SSE2 register (__m128d).
 *
 * Lane 0 is referred to as the "low" value and lane 1 as the "high" value.
 * The LoadNVal() helpers read N consecutive elements of the pointed-to type
 * and widen them to double. Unless the name says "Aligned", no alignment is
 * required from the caller.
 */
class XMMReg2Double
{
  public:
    __m128d xmm;

    /* Deliberately leaves xmm uninitialized: use Zero() or a loader. */
    XMMReg2Double() {}

    /* Low lane = val, high lane = 0 (behaviour of _mm_load_sd). */
    XMMReg2Double(double val) { xmm = _mm_load_sd(&val); }

    XMMReg2Double(const XMMReg2Double& other) : xmm(other.xmm) {}

    /* ---- Factories: return a freshly loaded register ---- */

    /* Both lanes set to 0.0. */
    static XMMReg2Double Zero()
    {
        XMMReg2Double reg;
        reg.Zeroize();
        return reg;
    }

    /* Loads ptr[0], ptr[1] (no alignment requirement). */
    static XMMReg2Double Load2Val(const double* ptr)
    {
        XMMReg2Double reg;
        reg.nsLoad2Val(ptr);
        return reg;
    }

    /* Loads ptr[0], ptr[1] and widens them to double. */
    static XMMReg2Double Load2Val(const float* ptr)
    {
        XMMReg2Double reg;
        reg.nsLoad2Val(ptr);
        return reg;
    }

    /* Loads ptr[0], ptr[1]; ptr must be 16-byte aligned (_mm_load_pd). */
    static XMMReg2Double Load2ValAligned(const double* ptr)
    {
        XMMReg2Double reg;
        reg.nsLoad2ValAligned(ptr);
        return reg;
    }

    /* Loads ptr[0], ptr[1] and widens them to double. */
    static XMMReg2Double Load2Val(const unsigned char* ptr)
    {
        XMMReg2Double reg;
        reg.nsLoad2Val(ptr);
        return reg;
    }

    /* Loads ptr[0], ptr[1] (sign-extended) and widens them to double. */
    static XMMReg2Double Load2Val(const short* ptr)
    {
        XMMReg2Double reg;
        reg.nsLoad2Val(ptr);
        return reg;
    }

    /* Loads ptr[0], ptr[1] (zero-extended) and widens them to double. */
    static XMMReg2Double Load2Val(const unsigned short* ptr)
    {
        XMMReg2Double reg;
        reg.nsLoad2Val(ptr);
        return reg;
    }

    /* ---- Non-static loaders: overwrite this register in place ---- */

    void nsLoad2Val(const double* ptr)
    {
        xmm = _mm_loadu_pd(ptr);
    }

    void nsLoad2ValAligned(const double* pval)
    {
        xmm = _mm_load_pd(pval);
    }

    void nsLoad2Val(const float* pval)
    {
        __m128 temp1 = _mm_load_ss(pval);
        __m128 temp2 = _mm_load_ss(pval + 1);
        temp1 = _mm_shuffle_ps(temp1, temp2, _MM_SHUFFLE(1,0,1,0)); /* a|0|b|0 */
        temp1 = _mm_shuffle_ps(temp1, temp1, _MM_SHUFFLE(3,3,2,0)); /* a|b|0|0 */
        xmm = _mm_cvtps_pd(temp1);
    }

    void nsLoad2Val(const unsigned char* ptr)
    {
        /* memcpy instead of *(unsigned short*)ptr: the byte buffer may be
         * misaligned for a 16-bit read, and reading it through another
         * type breaks strict aliasing. Same approach as the short loaders. */
        unsigned short two_bytes;
        memcpy(&two_bytes, ptr, sizeof(two_bytes));
        __m128i xmm_i = _mm_cvtsi32_si128(two_bytes);
        xmm_i = _mm_unpacklo_epi8(xmm_i, _mm_setzero_si128());  /* bytes  -> 16 bit */
        xmm_i = _mm_unpacklo_epi16(xmm_i, _mm_setzero_si128()); /* 16 bit -> 32 bit */
        xmm = _mm_cvtepi32_pd(xmm_i);
    }

    void nsLoad2Val(const short* ptr)
    {
        int i;
        memcpy(&i, ptr, 4);
        __m128i xmm_i = _mm_cvtsi32_si128(i);
        xmm_i = _mm_unpacklo_epi16(xmm_i,xmm_i); /* 0|0|0|0|0|0|b|a --> 0|0|0|0|b|b|a|a */
        xmm_i = _mm_srai_epi32(xmm_i, 16);       /* 0|0|0|0|b|b|a|a --> 0|0|0|0|sign(b)|b|sign(a)|a */
        xmm = _mm_cvtepi32_pd(xmm_i);
    }

    void nsLoad2Val(const unsigned short* ptr)
    {
        int i;
        memcpy(&i, ptr, 4);
        __m128i xmm_i = _mm_cvtsi32_si128(i);
        xmm_i = _mm_unpacklo_epi16(xmm_i,xmm_i); /* 0|0|0|0|0|0|b|a --> 0|0|0|0|b|b|a|a */
        xmm_i = _mm_srli_epi32(xmm_i, 16);       /* 0|0|0|0|b|b|a|a --> 0|0|0|0|0|b|0|a */
        xmm = _mm_cvtepi32_pd(xmm_i);
    }

    /* ---- 4-value loaders: ptr[0..1] go to 'low', ptr[2..3] to 'high' ---- */

    static void Load4Val(const unsigned char* ptr, XMMReg2Double& low, XMMReg2Double& high)
    {
        /* memcpy instead of *(int*)ptr: see nsLoad2Val(const unsigned char*). */
        int four_bytes;
        memcpy(&four_bytes, ptr, sizeof(four_bytes));
        __m128i xmm_i = _mm_cvtsi32_si128(four_bytes);
        xmm_i = _mm_unpacklo_epi8(xmm_i, _mm_setzero_si128());
        xmm_i = _mm_unpacklo_epi16(xmm_i, _mm_setzero_si128());
        low.xmm = _mm_cvtepi32_pd(xmm_i);
        /* Bring 32-bit lanes 2 and 3 down to lanes 0 and 1 before converting. */
        high.xmm = _mm_cvtepi32_pd(_mm_shuffle_epi32(xmm_i,_MM_SHUFFLE(3,2,3,2)));
    }

    static void Load4Val(const short* ptr, XMMReg2Double& low, XMMReg2Double& high)
    {
        low.nsLoad2Val(ptr);
        high.nsLoad2Val(ptr+2);
    }

    static void Load4Val(const unsigned short* ptr, XMMReg2Double& low, XMMReg2Double& high)
    {
        low.nsLoad2Val(ptr);
        high.nsLoad2Val(ptr+2);
    }

    static void Load4Val(const double* ptr, XMMReg2Double& low, XMMReg2Double& high)
    {
        low.nsLoad2Val(ptr);
        high.nsLoad2Val(ptr+2);
    }

    static void Load4Val(const float* ptr, XMMReg2Double& low, XMMReg2Double& high)
    {
        __m128 temp1 = _mm_loadu_ps(ptr);
        __m128 temp2 = _mm_shuffle_ps(temp1, temp1, _MM_SHUFFLE(3,2,3,2));
        low.xmm = _mm_cvtps_pd(temp1);
        high.xmm = _mm_cvtps_pd(temp2);
    }

    /* ---- Arithmetic (lane-wise) ---- */

    void Zeroize()
    {
        xmm = _mm_setzero_pd();
    }

    const XMMReg2Double& operator= (const XMMReg2Double& other)
    {
        xmm = other.xmm;
        return *this;
    }

    const XMMReg2Double& operator+= (const XMMReg2Double& other)
    {
        xmm = _mm_add_pd(xmm, other.xmm);
        return *this;
    }

    /* The binary operators are const so that they also apply to const
     * registers; otherwise such an expression would silently go through
     * operator double() and only combine the low lanes. */
    XMMReg2Double operator+ (const XMMReg2Double& other) const
    {
        XMMReg2Double ret;
        ret.xmm = _mm_add_pd(xmm, other.xmm);
        return ret;
    }

    XMMReg2Double operator- (const XMMReg2Double& other) const
    {
        XMMReg2Double ret;
        ret.xmm = _mm_sub_pd(xmm, other.xmm);
        return ret;
    }

    XMMReg2Double operator* (const XMMReg2Double& other) const
    {
        XMMReg2Double ret;
        ret.xmm = _mm_mul_pd(xmm, other.xmm);
        return ret;
    }

    const XMMReg2Double& operator*= (const XMMReg2Double& other)
    {
        xmm = _mm_mul_pd(xmm, other.xmm);
        return *this;
    }

    /* Horizontal sum: afterwards both lanes hold low + high. */
    void AddLowAndHigh()
    {
        __m128d xmm2;
        xmm2 = _mm_shuffle_pd(xmm,xmm,_MM_SHUFFLE2(0,1)); /* transfer high word into low word of xmm2 */
        xmm = _mm_add_pd(xmm, xmm2);
    }

    /* ---- Stores ---- */

    /* Writes low to pval[0] and high to pval[1] (no alignment requirement). */
    void Store2Double(double* pval) const
    {
        _mm_storeu_pd(pval, xmm);
    }

    /* Same as Store2Double(), but pval must be 16-byte aligned (_mm_store_pd). */
    void Store2DoubleAligned(double* pval) const
    {
        _mm_store_pd(pval, xmm);
    }

    /* Returns the low lane. */
    operator double () const
    {
        double val;
        _mm_store_sd(&val, xmm);
        return val;
    }
};
- #else
- #warning "Software emulation of SSE2 !"
/**
 * Portable stand-in for the SSE2-backed XMMReg2Double: a pair of doubles
 * ("low" and "high") with the same member functions, implemented with
 * plain scalar arithmetic.
 *
 * The LoadNVal() helpers read N consecutive elements of the pointed-to type
 * and convert them to double. The "Aligned" variants behave exactly like the
 * unaligned ones here.
 */
class XMMReg2Double
{
  public:
    double low;
    double high;

    /* Deliberately leaves both values uninitialized: use Zero() or a loader. */
    XMMReg2Double() {}

    /* low = val, high = 0. */
    XMMReg2Double(double val) { low = val; high = 0.0; }

    XMMReg2Double(const XMMReg2Double& other) : low(other.low), high(other.high) {}

    /* ---- Factories: return a freshly loaded register ---- */

    /* Both values set to 0.0. */
    static XMMReg2Double Zero()
    {
        XMMReg2Double reg;
        reg.Zeroize();
        return reg;
    }

    static XMMReg2Double Load2Val(const double* ptr)
    {
        XMMReg2Double reg;
        reg.nsLoad2Val(ptr);
        return reg;
    }

    static XMMReg2Double Load2ValAligned(const double* ptr)
    {
        XMMReg2Double reg;
        reg.nsLoad2ValAligned(ptr);
        return reg;
    }

    static XMMReg2Double Load2Val(const float* ptr)
    {
        XMMReg2Double reg;
        reg.nsLoad2Val(ptr);
        return reg;
    }

    static XMMReg2Double Load2Val(const unsigned char* ptr)
    {
        XMMReg2Double reg;
        reg.nsLoad2Val(ptr);
        return reg;
    }

    static XMMReg2Double Load2Val(const short* ptr)
    {
        XMMReg2Double reg;
        reg.nsLoad2Val(ptr);
        return reg;
    }

    /* Counterpart of nsLoad2Val(const unsigned short*), so that callers of
     * the static factory also compile when the emulation is selected. */
    static XMMReg2Double Load2Val(const unsigned short* ptr)
    {
        XMMReg2Double reg;
        reg.nsLoad2Val(ptr);
        return reg;
    }

    /* ---- Non-static loaders: overwrite this register in place ---- */

    void nsLoad2Val(const double* pval)
    {
        low = pval[0];
        high = pval[1];
    }

    void nsLoad2ValAligned(const double* pval)
    {
        low = pval[0];
        high = pval[1];
    }

    void nsLoad2Val(const float* pval)
    {
        low = pval[0];
        high = pval[1];
    }

    void nsLoad2Val(const unsigned char* ptr)
    {
        low = ptr[0];
        high = ptr[1];
    }

    void nsLoad2Val(const short* ptr)
    {
        low = ptr[0];
        high = ptr[1];
    }

    void nsLoad2Val(const unsigned short* ptr)
    {
        low = ptr[0];
        high = ptr[1];
    }

    /* ---- 4-value loaders: ptr[0..1] go to 'low', ptr[2..3] to 'high' ---- */

    static void Load4Val(const unsigned char* ptr, XMMReg2Double& low, XMMReg2Double& high)
    {
        low.low = ptr[0];
        low.high = ptr[1];
        high.low = ptr[2];
        high.high = ptr[3];
    }

    static void Load4Val(const short* ptr, XMMReg2Double& low, XMMReg2Double& high)
    {
        low.nsLoad2Val(ptr);
        high.nsLoad2Val(ptr+2);
    }

    static void Load4Val(const unsigned short* ptr, XMMReg2Double& low, XMMReg2Double& high)
    {
        low.nsLoad2Val(ptr);
        high.nsLoad2Val(ptr+2);
    }

    static void Load4Val(const double* ptr, XMMReg2Double& low, XMMReg2Double& high)
    {
        low.nsLoad2Val(ptr);
        high.nsLoad2Val(ptr+2);
    }

    static void Load4Val(const float* ptr, XMMReg2Double& low, XMMReg2Double& high)
    {
        low.nsLoad2Val(ptr);
        high.nsLoad2Val(ptr+2);
    }

    /* ---- Arithmetic (element-wise) ---- */

    void Zeroize()
    {
        low = 0.0;
        high = 0.0;
    }

    const XMMReg2Double& operator= (const XMMReg2Double& other)
    {
        low = other.low;
        high = other.high;
        return *this;
    }

    const XMMReg2Double& operator+= (const XMMReg2Double& other)
    {
        low += other.low;
        high += other.high;
        return *this;
    }

    /* The binary operators are const so that they also apply to const
     * registers; otherwise such an expression would silently go through
     * operator double() and only combine the low values. */
    XMMReg2Double operator+ (const XMMReg2Double& other) const
    {
        XMMReg2Double ret;
        ret.low = low + other.low;
        ret.high = high + other.high;
        return ret;
    }

    XMMReg2Double operator- (const XMMReg2Double& other) const
    {
        XMMReg2Double ret;
        ret.low = low - other.low;
        ret.high = high - other.high;
        return ret;
    }

    XMMReg2Double operator* (const XMMReg2Double& other) const
    {
        XMMReg2Double ret;
        ret.low = low * other.low;
        ret.high = high * other.high;
        return ret;
    }

    const XMMReg2Double& operator*= (const XMMReg2Double& other)
    {
        low *= other.low;
        high *= other.high;
        return *this;
    }

    /* Horizontal sum: afterwards both values hold low + high. */
    void AddLowAndHigh()
    {
        double add = low + high;
        low = add;
        high = add;
    }

    /* ---- Stores ---- */

    /* Writes low to pval[0] and high to pval[1]. */
    void Store2Double(double* pval) const
    {
        pval[0] = low;
        pval[1] = high;
    }

    /* Identical to Store2Double() in this implementation. */
    void Store2DoubleAligned(double* pval) const
    {
        pval[0] = low;
        pval[1] = high;
    }

    /* Returns the low value. */
    operator double () const
    {
        return low;
    }
};
- #endif /* defined(__x86_64) || defined(_M_X64) */
- class XMMReg4Double
- {
- public:
- XMMReg2Double low, high;
- XMMReg4Double() {}
- XMMReg4Double(const XMMReg4Double& other) : low(other.low), high(other.high) {}
- static inline XMMReg4Double Zero()
- {
- XMMReg4Double reg;
- reg.low.Zeroize();
- reg.high.Zeroize();
- return reg;
- }
-
- static inline XMMReg4Double Load4Val(const unsigned char* ptr)
- {
- XMMReg4Double reg;
- XMMReg2Double::Load4Val(ptr, reg.low, reg.high);
- return reg;
- }
- static inline XMMReg4Double Load4Val(const short* ptr)
- {
- XMMReg4Double reg;
- reg.low.nsLoad2Val(ptr);
- reg.high.nsLoad2Val(ptr+2);
- return reg;
- }
- static inline XMMReg4Double Load4Val(const unsigned short* ptr)
- {
- XMMReg4Double reg;
- reg.low.nsLoad2Val(ptr);
- reg.high.nsLoad2Val(ptr+2);
- return reg;
- }
- static inline XMMReg4Double Load4Val(const double* ptr)
- {
- XMMReg4Double reg;
- reg.low.nsLoad2Val(ptr);
- reg.high.nsLoad2Val(ptr+2);
- return reg;
- }
- static inline XMMReg4Double Load4ValAligned(const double* ptr)
- {
- XMMReg4Double reg;
- reg.low.nsLoad2ValAligned(ptr);
- reg.high.nsLoad2ValAligned(ptr+2);
- return reg;
- }
- static inline XMMReg4Double Load4Val(const float* ptr)
- {
- XMMReg4Double reg;
- XMMReg2Double::Load4Val(ptr, reg.low, reg.high);
- return reg;
- }
-
- inline const XMMReg4Double& operator= (const XMMReg4Double& other)
- {
- low = other.low;
- high = other.high;
- return *this;
- }
- inline const XMMReg4Double& operator+= (const XMMReg4Double& other)
- {
- low += other.low;
- high += other.high;
- return *this;
- }
- inline XMMReg4Double operator+ (const XMMReg4Double& other)
- {
- XMMReg4Double ret;
- ret.low = low + other.low;
- ret.high = high + other.high;
- return ret;
- }
- inline XMMReg4Double operator- (const XMMReg4Double& other)
- {
- XMMReg4Double ret;
- ret.low = low - other.low;
- ret.high = high - other.high;
- return ret;
- }
- inline XMMReg4Double operator* (const XMMReg4Double& other)
- {
- XMMReg4Double ret;
- ret.low = low * other.low;
- ret.high = high * other.high;
- return ret;
- }
- inline const XMMReg4Double& operator*= (const XMMReg4Double& other)
- {
- low *= other.low;
- high *= other.high;
- return *this;
- }
- inline void AddLowAndHigh()
- {
- low = low + high;
- low.AddLowAndHigh();
- }
- inline XMMReg2Double& GetLow()
- {
- return low;
- }
- };
- #endif /* GDALSSE_PRIV_H_INCLUDED */
|