/******************************************************************************
 * $Id: gdalsse_priv.h 28877 2015-04-08 23:11:36Z rouault $
 *
 * Project:  GDAL
 * Purpose:  SSE2 helper
 * Author:   Even Rouault <even dot rouault at spatialys dot com>
 *
 ******************************************************************************
 * Copyright (c) 2014, Even Rouault <even dot rouault at spatialys dot com>
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included
 * in all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
 * DEALINGS IN THE SOFTWARE.
 ****************************************************************************/
#ifndef GDALSSE_PRIV_H_INCLUDED
#define GDALSSE_PRIV_H_INCLUDED

/* We restrict to 64bit processors because they are guaranteed to have SSE2 */
/* Could possibly be used too on 32bit, but we would need to check at runtime */
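/* A minimal sketch of such a runtime check (illustrative only, not part of
   this header): with GCC or Clang the SSE2 code path could be gated on
       bool bHasSSE2 = __builtin_cpu_supports("sse2");
   or on CPUID leaf 1, bit 26 of EDX, falling back to the emulated
   XMMReg2Double below when the test fails. */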
#if (defined(__x86_64) || defined(_M_X64)) && !defined(USE_SSE2_EMULATION)

/* Requires SSE2 */
#include <emmintrin.h>
#include <string.h>

class XMMReg2Double
{
  public:
    __m128d xmm;

    XMMReg2Double() {}

    XMMReg2Double(double val)  { xmm = _mm_load_sd(&val); }
    XMMReg2Double(const XMMReg2Double& other) : xmm(other.xmm) {}

    static inline XMMReg2Double Zero()
    {
        XMMReg2Double reg;
        reg.Zeroize();
        return reg;
    }

    static inline XMMReg2Double Load2Val(const double* ptr)
    {
        XMMReg2Double reg;
        reg.nsLoad2Val(ptr);
        return reg;
    }

    static inline XMMReg2Double Load2Val(const float* ptr)
    {
        XMMReg2Double reg;
        reg.nsLoad2Val(ptr);
        return reg;
    }

    static inline XMMReg2Double Load2ValAligned(const double* ptr)
    {
        XMMReg2Double reg;
        reg.nsLoad2ValAligned(ptr);
        return reg;
    }

    static inline XMMReg2Double Load2Val(const unsigned char* ptr)
    {
        XMMReg2Double reg;
        reg.nsLoad2Val(ptr);
        return reg;
    }

    static inline XMMReg2Double Load2Val(const short* ptr)
    {
        XMMReg2Double reg;
        reg.nsLoad2Val(ptr);
        return reg;
    }

    static inline XMMReg2Double Load2Val(const unsigned short* ptr)
    {
        XMMReg2Double reg;
        reg.nsLoad2Val(ptr);
        return reg;
    }

    inline void nsLoad2Val(const double* ptr)
    {
        xmm = _mm_loadu_pd(ptr);
    }

    inline void nsLoad2ValAligned(const double* pval)
    {
        xmm = _mm_load_pd(pval);
    }

    inline void nsLoad2Val(const float* pval)
    {
        /* Load each float into the low lane of a register, pack them into the
           two low lanes, then widen those two floats to double precision */
        __m128 temp1 = _mm_load_ss(pval);
        __m128 temp2 = _mm_load_ss(pval + 1);
        temp1 = _mm_shuffle_ps(temp1, temp2, _MM_SHUFFLE(1,0,1,0)); /* 0|b|0|a */
        temp1 = _mm_shuffle_ps(temp1, temp1, _MM_SHUFFLE(3,3,2,0)); /* 0|0|b|a */
        xmm = _mm_cvtps_pd(temp1);
    }

    inline void nsLoad2Val(const unsigned char* ptr)
    {
        /* Copy the two bytes through memcpy rather than a pointer cast, to
           avoid an unaligned, strict-aliasing-violating read, then
           zero-extend them to 32 bit integers before converting to double */
        unsigned short s;
        memcpy(&s, ptr, 2);
        __m128i xmm_i = _mm_cvtsi32_si128(s);
        xmm_i = _mm_unpacklo_epi8(xmm_i, _mm_setzero_si128());
        xmm_i = _mm_unpacklo_epi16(xmm_i, _mm_setzero_si128());
        xmm = _mm_cvtepi32_pd(xmm_i);
    }

    inline void nsLoad2Val(const short* ptr)
    {
        int i;
        memcpy(&i, ptr, 4);
        __m128i xmm_i = _mm_cvtsi32_si128(i);
        xmm_i = _mm_unpacklo_epi16(xmm_i, xmm_i); /* 0|0|0|0|0|0|b|a --> 0|0|0|0|b|b|a|a */
        xmm_i = _mm_srai_epi32(xmm_i, 16);        /* 0|0|0|0|b|b|a|a --> 0|0|0|0|sign(b)|b|sign(a)|a */
        xmm = _mm_cvtepi32_pd(xmm_i);
    }

    inline void nsLoad2Val(const unsigned short* ptr)
    {
        int i;
        memcpy(&i, ptr, 4);
        __m128i xmm_i = _mm_cvtsi32_si128(i);
        xmm_i = _mm_unpacklo_epi16(xmm_i, xmm_i); /* 0|0|0|0|0|0|b|a --> 0|0|0|0|b|b|a|a */
        xmm_i = _mm_srli_epi32(xmm_i, 16);        /* 0|0|0|0|b|b|a|a --> 0|0|0|0|0|b|0|a */
        xmm = _mm_cvtepi32_pd(xmm_i);
    }

    static inline void Load4Val(const unsigned char* ptr, XMMReg2Double& low, XMMReg2Double& high)
    {
        /* Copy the four bytes through memcpy to avoid an unaligned read
           through a cast pointer */
        int i;
        memcpy(&i, ptr, 4);
        __m128i xmm_i = _mm_cvtsi32_si128(i);
        xmm_i = _mm_unpacklo_epi8(xmm_i, _mm_setzero_si128());
        xmm_i = _mm_unpacklo_epi16(xmm_i, _mm_setzero_si128());
        low.xmm = _mm_cvtepi32_pd(xmm_i);
        high.xmm = _mm_cvtepi32_pd(_mm_shuffle_epi32(xmm_i, _MM_SHUFFLE(3,2,3,2)));
    }

    static inline void Load4Val(const short* ptr, XMMReg2Double& low, XMMReg2Double& high)
    {
        low.nsLoad2Val(ptr);
        high.nsLoad2Val(ptr+2);
    }

    static inline void Load4Val(const unsigned short* ptr, XMMReg2Double& low, XMMReg2Double& high)
    {
        low.nsLoad2Val(ptr);
        high.nsLoad2Val(ptr+2);
    }

    static inline void Load4Val(const double* ptr, XMMReg2Double& low, XMMReg2Double& high)
    {
        low.nsLoad2Val(ptr);
        high.nsLoad2Val(ptr+2);
    }

    static inline void Load4Val(const float* ptr, XMMReg2Double& low, XMMReg2Double& high)
    {
        __m128 temp1 = _mm_loadu_ps(ptr);
        __m128 temp2 = _mm_shuffle_ps(temp1, temp1, _MM_SHUFFLE(3,2,3,2));
        low.xmm = _mm_cvtps_pd(temp1);
        high.xmm = _mm_cvtps_pd(temp2);
    }

    inline void Zeroize()
    {
        xmm = _mm_setzero_pd();
    }

    inline const XMMReg2Double& operator= (const XMMReg2Double& other)
    {
        xmm = other.xmm;
        return *this;
    }

    inline const XMMReg2Double& operator+= (const XMMReg2Double& other)
    {
        xmm = _mm_add_pd(xmm, other.xmm);
        return *this;
    }

    inline XMMReg2Double operator+ (const XMMReg2Double& other)
    {
        XMMReg2Double ret;
        ret.xmm = _mm_add_pd(xmm, other.xmm);
        return ret;
    }

    inline XMMReg2Double operator- (const XMMReg2Double& other)
    {
        XMMReg2Double ret;
        ret.xmm = _mm_sub_pd(xmm, other.xmm);
        return ret;
    }

    inline XMMReg2Double operator* (const XMMReg2Double& other)
    {
        XMMReg2Double ret;
        ret.xmm = _mm_mul_pd(xmm, other.xmm);
        return ret;
    }

    inline const XMMReg2Double& operator*= (const XMMReg2Double& other)
    {
        xmm = _mm_mul_pd(xmm, other.xmm);
        return *this;
    }

    inline void AddLowAndHigh()
    {
        __m128d xmm2;
        xmm2 = _mm_shuffle_pd(xmm, xmm, _MM_SHUFFLE2(0,1)); /* transfer high word into low word of xmm2 */
        xmm = _mm_add_pd(xmm, xmm2);
    }

    inline void Store2Double(double* pval)
    {
        _mm_storeu_pd(pval, xmm);
    }

    inline void Store2DoubleAligned(double* pval)
    {
        _mm_store_pd(pval, xmm);
    }

    inline operator double () const
    {
        double val;
        _mm_store_sd(&val, xmm);
        return val;
    }
};

#else

#warning "Software emulation of SSE2 !"

class XMMReg2Double
{
  public:
    double low;
    double high;

    XMMReg2Double() {}

    XMMReg2Double(double val)  { low = val; high = 0.0; }
    XMMReg2Double(const XMMReg2Double& other) : low(other.low), high(other.high) {}

    static inline XMMReg2Double Zero()
    {
        XMMReg2Double reg;
        reg.Zeroize();
        return reg;
    }

    static inline XMMReg2Double Load2Val(const double* ptr)
    {
        XMMReg2Double reg;
        reg.nsLoad2Val(ptr);
        return reg;
    }

    static inline XMMReg2Double Load2ValAligned(const double* ptr)
    {
        XMMReg2Double reg;
        reg.nsLoad2ValAligned(ptr);
        return reg;
    }

    static inline XMMReg2Double Load2Val(const float* ptr)
    {
        XMMReg2Double reg;
        reg.nsLoad2Val(ptr);
        return reg;
    }

    static inline XMMReg2Double Load2Val(const unsigned char* ptr)
    {
        XMMReg2Double reg;
        reg.nsLoad2Val(ptr);
        return reg;
    }

    static inline XMMReg2Double Load2Val(const short* ptr)
    {
        XMMReg2Double reg;
        reg.nsLoad2Val(ptr);
        return reg;
    }

    inline void nsLoad2Val(const double* pval)
    {
        low = pval[0];
        high = pval[1];
    }

    inline void nsLoad2ValAligned(const double* pval)
    {
        low = pval[0];
        high = pval[1];
    }

    inline void nsLoad2Val(const float* pval)
    {
        low = pval[0];
        high = pval[1];
    }

    inline void nsLoad2Val(const unsigned char* ptr)
    {
        low = ptr[0];
        high = ptr[1];
    }

    inline void nsLoad2Val(const short* ptr)
    {
        low = ptr[0];
        high = ptr[1];
    }

    inline void nsLoad2Val(const unsigned short* ptr)
    {
        low = ptr[0];
        high = ptr[1];
    }

    static inline void Load4Val(const unsigned char* ptr, XMMReg2Double& low, XMMReg2Double& high)
    {
        low.low   = ptr[0];
        low.high  = ptr[1];
        high.low  = ptr[2];
        high.high = ptr[3];
    }

    static inline void Load4Val(const short* ptr, XMMReg2Double& low, XMMReg2Double& high)
    {
        low.nsLoad2Val(ptr);
        high.nsLoad2Val(ptr+2);
    }

    static inline void Load4Val(const unsigned short* ptr, XMMReg2Double& low, XMMReg2Double& high)
    {
        low.nsLoad2Val(ptr);
        high.nsLoad2Val(ptr+2);
    }

    static inline void Load4Val(const double* ptr, XMMReg2Double& low, XMMReg2Double& high)
    {
        low.nsLoad2Val(ptr);
        high.nsLoad2Val(ptr+2);
    }

    static inline void Load4Val(const float* ptr, XMMReg2Double& low, XMMReg2Double& high)
    {
        low.nsLoad2Val(ptr);
        high.nsLoad2Val(ptr+2);
    }

    inline void Zeroize()
    {
        low = 0.0;
        high = 0.0;
    }

    inline const XMMReg2Double& operator= (const XMMReg2Double& other)
    {
        low = other.low;
        high = other.high;
        return *this;
    }

    inline const XMMReg2Double& operator+= (const XMMReg2Double& other)
    {
        low += other.low;
        high += other.high;
        return *this;
    }

    inline XMMReg2Double operator+ (const XMMReg2Double& other)
    {
        XMMReg2Double ret;
        ret.low = low + other.low;
        ret.high = high + other.high;
        return ret;
    }

    inline XMMReg2Double operator- (const XMMReg2Double& other)
    {
        XMMReg2Double ret;
        ret.low = low - other.low;
        ret.high = high - other.high;
        return ret;
    }

    inline XMMReg2Double operator* (const XMMReg2Double& other)
    {
        XMMReg2Double ret;
        ret.low = low * other.low;
        ret.high = high * other.high;
        return ret;
    }

    inline const XMMReg2Double& operator*= (const XMMReg2Double& other)
    {
        low *= other.low;
        high *= other.high;
        return *this;
    }

    inline void AddLowAndHigh()
    {
        double add = low + high;
        low = add;
        high = add;
    }

    inline void Store2Double(double* pval)
    {
        pval[0] = low;
        pval[1] = high;
    }

    inline void Store2DoubleAligned(double* pval)
    {
        pval[0] = low;
        pval[1] = high;
    }

    inline operator double () const
    {
        return low;
    }
};

#endif /* defined(__x86_64) || defined(_M_X64) */
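
/* Four packed double values, implemented as a pair of XMMReg2Double so that
   the same interface is available whether the real SSE2 implementation or the
   software emulation above was compiled in. */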
class XMMReg4Double
{
  public:
    XMMReg2Double low, high;

    XMMReg4Double() {}
    XMMReg4Double(const XMMReg4Double& other) : low(other.low), high(other.high) {}

    static inline XMMReg4Double Zero()
    {
        XMMReg4Double reg;
        reg.low.Zeroize();
        reg.high.Zeroize();
        return reg;
    }

    static inline XMMReg4Double Load4Val(const unsigned char* ptr)
    {
        XMMReg4Double reg;
        XMMReg2Double::Load4Val(ptr, reg.low, reg.high);
        return reg;
    }

    static inline XMMReg4Double Load4Val(const short* ptr)
    {
        XMMReg4Double reg;
        reg.low.nsLoad2Val(ptr);
        reg.high.nsLoad2Val(ptr+2);
        return reg;
    }

    static inline XMMReg4Double Load4Val(const unsigned short* ptr)
    {
        XMMReg4Double reg;
        reg.low.nsLoad2Val(ptr);
        reg.high.nsLoad2Val(ptr+2);
        return reg;
    }

    static inline XMMReg4Double Load4Val(const double* ptr)
    {
        XMMReg4Double reg;
        reg.low.nsLoad2Val(ptr);
        reg.high.nsLoad2Val(ptr+2);
        return reg;
    }

    static inline XMMReg4Double Load4ValAligned(const double* ptr)
    {
        XMMReg4Double reg;
        reg.low.nsLoad2ValAligned(ptr);
        reg.high.nsLoad2ValAligned(ptr+2);
        return reg;
    }

    static inline XMMReg4Double Load4Val(const float* ptr)
    {
        XMMReg4Double reg;
        XMMReg2Double::Load4Val(ptr, reg.low, reg.high);
        return reg;
    }

    inline const XMMReg4Double& operator= (const XMMReg4Double& other)
    {
        low = other.low;
        high = other.high;
        return *this;
    }

    inline const XMMReg4Double& operator+= (const XMMReg4Double& other)
    {
        low += other.low;
        high += other.high;
        return *this;
    }

    inline XMMReg4Double operator+ (const XMMReg4Double& other)
    {
        XMMReg4Double ret;
        ret.low = low + other.low;
        ret.high = high + other.high;
        return ret;
    }

    inline XMMReg4Double operator- (const XMMReg4Double& other)
    {
        XMMReg4Double ret;
        ret.low = low - other.low;
        ret.high = high - other.high;
        return ret;
    }

    inline XMMReg4Double operator* (const XMMReg4Double& other)
    {
        XMMReg4Double ret;
        ret.low = low * other.low;
        ret.high = high * other.high;
        return ret;
    }

    inline const XMMReg4Double& operator*= (const XMMReg4Double& other)
    {
        low *= other.low;
        high *= other.high;
        return *this;
    }

    inline void AddLowAndHigh()
    {
        low = low + high;
        low.AddLowAndHigh();
    }

    inline XMMReg2Double& GetLow()
    {
        return low;
    }
};
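
/* Usage sketch (illustrative only; the function name and the scalar tail
   handling are assumptions made for this example, not GDAL code): summing a
   float array four values at a time with XMMReg4Double.

   static double SumFloats(const float* pafVals, int nCount)
   {
       XMMReg4Double oAcc = XMMReg4Double::Zero();
       int i = 0;
       for( ; i + 4 <= nCount; i += 4 )
           oAcc += XMMReg4Double::Load4Val(pafVals + i); // 4 floats -> 4 doubles
       oAcc.AddLowAndHigh();                   // total ends up in the low register
       double dfSum = (double)oAcc.GetLow();   // extract it as a scalar
       for( ; i < nCount; i++ )                // remaining 0 to 3 values
           dfSum += pafVals[i];
       return dfSum;
   }
*/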

#endif /* GDALSSE_PRIV_H_INCLUDED */