/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Copyright (C) 2013, OpenCV Foundation, all rights reserved.
// Copyright (C) 2015, Itseez Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#ifndef OPENCV_HAL_VSX_UTILS_HPP
#define OPENCV_HAL_VSX_UTILS_HPP

#include "opencv2/core/cvdef.h"

//! @addtogroup core_utils_vsx
//! @{

#if CV_VSX

#define FORCE_INLINE(tp) extern inline tp __attribute__((always_inline))

#define VSX_REDIRECT_1RG(rt, rg, fnm, fn2) \
FORCE_INLINE(rt) fnm(const rg& a) { return fn2(a); }

#define VSX_REDIRECT_2RG(rt, rg, fnm, fn2) \
FORCE_INLINE(rt) fnm(const rg& a, const rg& b) { return fn2(a, b); }

#define VSX_IMPL_PERM(rt, fnm, ...) \
FORCE_INLINE(rt) fnm(const rt& a, const rt& b) \
{ static const vec_uchar16 perm = {__VA_ARGS__}; return vec_perm(a, b, perm); }

#define __VSX_S16__(c, v) (c){v, v, v, v, v, v, v, v, v, v, v, v, v, v, v, v}
#define __VSX_S8__(c, v) (c){v, v, v, v, v, v, v, v}
#define __VSX_S4__(c, v) (c){v, v, v, v}
#define __VSX_S2__(c, v) (c){v, v}

typedef __vector unsigned char vec_uchar16;
#define vec_uchar16_set(...) (vec_uchar16){__VA_ARGS__}
#define vec_uchar16_sp(c) (__VSX_S16__(vec_uchar16, c))
#define vec_uchar16_c(v) ((vec_uchar16)(v))
#define vec_uchar16_mx vec_uchar16_sp(0xFF)
#define vec_uchar16_mn vec_uchar16_sp(0)
#define vec_uchar16_z vec_uchar16_mn

typedef __vector signed char vec_char16;
#define vec_char16_set(...) (vec_char16){__VA_ARGS__}
#define vec_char16_sp(c) (__VSX_S16__(vec_char16, c))
#define vec_char16_c(v) ((vec_char16)(v))
#define vec_char16_mx vec_char16_sp(0x7F)
#define vec_char16_mn vec_char16_sp(-0x7F-1)
#define vec_char16_z vec_char16_sp(0)

typedef __vector unsigned short vec_ushort8;
#define vec_ushort8_set(...) (vec_ushort8){__VA_ARGS__}
#define vec_ushort8_sp(c) (__VSX_S8__(vec_ushort8, c))
#define vec_ushort8_c(v) ((vec_ushort8)(v))
#define vec_ushort8_mx vec_ushort8_sp(0xFFFF)
#define vec_ushort8_mn vec_ushort8_sp(0)
#define vec_ushort8_z vec_ushort8_mn

typedef __vector signed short vec_short8;
#define vec_short8_set(...) (vec_short8){__VA_ARGS__}
#define vec_short8_sp(c) (__VSX_S8__(vec_short8, c))
#define vec_short8_c(v) ((vec_short8)(v))
#define vec_short8_mx vec_short8_sp(0x7FFF)
#define vec_short8_mn vec_short8_sp(-0x7FFF-1)
#define vec_short8_z vec_short8_sp(0)

typedef __vector unsigned int vec_uint4;
#define vec_uint4_set(...) (vec_uint4){__VA_ARGS__}
#define vec_uint4_sp(c) (__VSX_S4__(vec_uint4, c))
#define vec_uint4_c(v) ((vec_uint4)(v))
#define vec_uint4_mx vec_uint4_sp(0xFFFFFFFFU)
#define vec_uint4_mn vec_uint4_sp(0)
#define vec_uint4_z vec_uint4_mn

typedef __vector signed int vec_int4;
#define vec_int4_set(...) (vec_int4){__VA_ARGS__}
#define vec_int4_sp(c) (__VSX_S4__(vec_int4, c))
#define vec_int4_c(v) ((vec_int4)(v))
#define vec_int4_mx vec_int4_sp(0x7FFFFFFF)
#define vec_int4_mn vec_int4_sp(-0x7FFFFFFF-1)
#define vec_int4_z vec_int4_sp(0)

typedef __vector float vec_float4;
#define vec_float4_set(...) (vec_float4){__VA_ARGS__}
#define vec_float4_sp(c) (__VSX_S4__(vec_float4, c))
#define vec_float4_c(v) ((vec_float4)(v))
#define vec_float4_mx vec_float4_sp(3.40282347E+38F)
#define vec_float4_mn vec_float4_sp(1.17549435E-38F)
#define vec_float4_z vec_float4_sp(0)

typedef __vector unsigned long long vec_udword2;
#define vec_udword2_set(...) (vec_udword2){__VA_ARGS__}
#define vec_udword2_sp(c) (__VSX_S2__(vec_udword2, c))
#define vec_udword2_c(v) ((vec_udword2)(v))
#define vec_udword2_mx vec_udword2_sp(18446744073709551615ULL)
#define vec_udword2_mn vec_udword2_sp(0)
#define vec_udword2_z vec_udword2_mn

typedef __vector signed long long vec_dword2;
#define vec_dword2_set(...) (vec_dword2){__VA_ARGS__}
#define vec_dword2_sp(c) (__VSX_S2__(vec_dword2, c))
#define vec_dword2_c(v) ((vec_dword2)(v))
#define vec_dword2_mx vec_dword2_sp(9223372036854775807LL)
#define vec_dword2_mn vec_dword2_sp(-9223372036854775807LL-1)
#define vec_dword2_z vec_dword2_sp(0)

typedef __vector double vec_double2;
#define vec_double2_set(...) (vec_double2){__VA_ARGS__}
#define vec_double2_c(v) ((vec_double2)(v))
#define vec_double2_sp(c) (__VSX_S2__(vec_double2, c))
#define vec_double2_mx vec_double2_sp(1.7976931348623157E+308)
#define vec_double2_mn vec_double2_sp(2.2250738585072014E-308)
#define vec_double2_z vec_double2_sp(0)

#define vec_bchar16 __vector __bool char
#define vec_bchar16_set(...) (vec_bchar16){__VA_ARGS__}
#define vec_bchar16_c(v) ((vec_bchar16)(v))
#define vec_bchar16_f (__VSX_S16__(vec_bchar16, 0))
#define vec_bchar16_t (__VSX_S16__(vec_bchar16, 1))

#define vec_bshort8 __vector __bool short
#define vec_bshort8_set(...) (vec_bshort8){__VA_ARGS__}
#define vec_bshort8_c(v) ((vec_bshort8)(v))
#define vec_bshort8_f (__VSX_S8__(vec_bshort8, 0))
#define vec_bshort8_t (__VSX_S8__(vec_bshort8, 1))

#define vec_bint4 __vector __bool int
#define vec_bint4_set(...) (vec_bint4){__VA_ARGS__}
#define vec_bint4_c(v) ((vec_bint4)(v))
#define vec_bint4_f (__VSX_S4__(vec_bint4, 0))
#define vec_bint4_t (__VSX_S4__(vec_bint4, 1))

#define vec_bdword2 __vector __bool long long
#define vec_bdword2_set(...) (vec_bdword2){__VA_ARGS__}
#define vec_bdword2_c(v) ((vec_bdword2)(v))
#define vec_bdword2_f (__VSX_S2__(vec_bdword2, 0))
#define vec_bdword2_t (__VSX_S2__(vec_bdword2, 1))
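
/*
 * Usage sketch (illustrative only, not part of the API): for every vector type
 * the helpers above follow the same pattern -- *_set builds a vector from
 * literal elements, *_sp splats one scalar into every lane, *_c is a
 * reinterpret cast, and *_z / *_mx / *_mn are the all-zero / max / min constants:
 *
 *   vec_uint4   a = vec_uint4_set(1, 2, 3, 4); // element-wise initializer
 *   vec_uint4   b = vec_uint4_sp(7);           // splat: {7, 7, 7, 7}
 *   vec_int4    c = vec_int4_c(b);             // reinterpret as signed
 *   vec_uchar16 z = vec_uchar16_z;             // all-zero byte vector
 */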
/*
 * GCC VSX compatibility
 **/
#if defined(__GNUG__) && !defined(__IBMCPP__) && !defined(__clang__)

// inline asm helpers
#define VSX_IMPL_1RG(rt, rto, rg, rgo, opc, fnm) \
FORCE_INLINE(rt) fnm(const rg& a) \
{ rt rs; __asm__ __volatile__(#opc" %x0,%x1" : "="#rto (rs) : #rgo (a)); return rs; }

#define VSX_IMPL_1VRG(rt, rg, opc, fnm) \
FORCE_INLINE(rt) fnm(const rg& a) \
{ rt rs; __asm__ __volatile__(#opc" %0,%1" : "=v" (rs) : "v" (a)); return rs; }

#define VSX_IMPL_2VRG_F(rt, rg, fopc, fnm) \
FORCE_INLINE(rt) fnm(const rg& a, const rg& b) \
{ rt rs; __asm__ __volatile__(fopc : "=v" (rs) : "v" (a), "v" (b)); return rs; }

#define VSX_IMPL_2VRG(rt, rg, opc, fnm) VSX_IMPL_2VRG_F(rt, rg, #opc" %0,%1,%2", fnm)
#if __GNUG__ < 7
// up to GCC 6, vec_mul only supports the floating-point and long long types
# ifdef vec_mul
#   undef vec_mul
# endif
/*
 * There is no direct instruction for 16-bit multiplication in ISA 2.07;
 * XLC implements it using "multiply even", "multiply odd" and "permute".
 * TODO: do we need to support 8-bit as well?
 **/
# define VSX_IMPL_MULH(Tvec, Tcast) \
FORCE_INLINE(Tvec) vec_mul(const Tvec& a, const Tvec& b) \
{ \
    static const vec_uchar16 even_perm = {0, 1, 16, 17, 4, 5, 20, 21, \
                                          8, 9, 24, 25, 12, 13, 28, 29}; \
    return vec_perm(Tcast(vec_mule(a, b)), Tcast(vec_mulo(a, b)), even_perm); \
}
VSX_IMPL_MULH(vec_short8, vec_short8_c)
VSX_IMPL_MULH(vec_ushort8, vec_ushort8_c)
// vmuluwm can be used for both unsigned and signed 32-bit integers
VSX_IMPL_2VRG(vec_int4, vec_int4, vmuluwm, vec_mul)
VSX_IMPL_2VRG(vec_uint4, vec_uint4, vmuluwm, vec_mul)
// redirect to the GCC builtin vec_mul, since it already supports the floating-point and long long types
VSX_REDIRECT_2RG(vec_float4, vec_float4, vec_mul, __builtin_vec_mul)
VSX_REDIRECT_2RG(vec_double2, vec_double2, vec_mul, __builtin_vec_mul)
VSX_REDIRECT_2RG(vec_dword2, vec_dword2, vec_mul, __builtin_vec_mul)
VSX_REDIRECT_2RG(vec_udword2, vec_udword2, vec_mul, __builtin_vec_mul)
#endif // __GNUG__ < 7
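
/*
 * How the 16-bit fallback above works (illustrative sketch only): vec_mule and
 * vec_mulo compute full 32-bit products of the even- and odd-positioned
 * halfwords, and the permute gathers the halfword-sized parts of those products
 * back into the original lane order, giving an element-wise 16-bit multiply:
 *
 *   vec_short8 a = vec_short8_set(1, 2, 3, 4, 5, 6, 7, 8);
 *   vec_short8 b = vec_short8_sp(3);
 *   vec_short8 r = vec_mul(a, b);   // element-wise product, truncated to 16 bits
 */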
#if __GNUG__ < 6
/*
 * The "compare greater than or equal" instructions in ISA 2.07 only support single
 * and double precision.
 * XLC and newer versions of GCC implement the integer variants using "compare greater than" plus NOR.
 **/
# ifdef vec_cmpge
#   undef vec_cmpge
# endif
# ifdef vec_cmple
#   undef vec_cmple
# endif
# define vec_cmple(a, b) vec_cmpge(b, a)
# define VSX_IMPL_CMPGE(rt, rg, opc, fnm) \
VSX_IMPL_2VRG_F(rt, rg, #opc" %0,%2,%1\n\t xxlnor %x0,%x0,%x0", fnm)

VSX_IMPL_CMPGE(vec_bchar16, vec_char16, vcmpgtsb, vec_cmpge)
VSX_IMPL_CMPGE(vec_bchar16, vec_uchar16, vcmpgtub, vec_cmpge)
VSX_IMPL_CMPGE(vec_bshort8, vec_short8, vcmpgtsh, vec_cmpge)
VSX_IMPL_CMPGE(vec_bshort8, vec_ushort8, vcmpgtuh, vec_cmpge)
VSX_IMPL_CMPGE(vec_bint4, vec_int4, vcmpgtsw, vec_cmpge)
VSX_IMPL_CMPGE(vec_bint4, vec_uint4, vcmpgtuw, vec_cmpge)
VSX_IMPL_CMPGE(vec_bdword2, vec_dword2, vcmpgtsd, vec_cmpge)
VSX_IMPL_CMPGE(vec_bdword2, vec_udword2, vcmpgtud, vec_cmpge)
// redirect to the GCC builtin cmpge, since it already supports the floating-point types
VSX_REDIRECT_2RG(vec_bint4, vec_float4, vec_cmpge, __builtin_vec_cmpge)
VSX_REDIRECT_2RG(vec_bdword2, vec_double2, vec_cmpge, __builtin_vec_cmpge)

// up to GCC 5, vec_nor doesn't support bool long long
# undef vec_nor
template<typename T>
VSX_REDIRECT_2RG(T, T, vec_nor, __builtin_vec_nor)

FORCE_INLINE(vec_bdword2) vec_nor(const vec_bdword2& a, const vec_bdword2& b)
{ return vec_bdword2_c(__builtin_vec_nor(vec_dword2_c(a), vec_dword2_c(b))); }
#endif // __GNUG__ < 6
// vector population count
#ifndef vec_popcnt
VSX_IMPL_1VRG(vec_uchar16, vec_uchar16, vpopcntb, vec_popcnt)
VSX_IMPL_1VRG(vec_uchar16, vec_char16, vpopcntb, vec_popcnt)
VSX_IMPL_1VRG(vec_ushort8, vec_ushort8, vpopcnth, vec_popcnt)
VSX_IMPL_1VRG(vec_ushort8, vec_short8, vpopcnth, vec_popcnt)
VSX_IMPL_1VRG(vec_uint4, vec_uint4, vpopcntw, vec_popcnt)
VSX_IMPL_1VRG(vec_uint4, vec_int4, vpopcntw, vec_popcnt)
VSX_IMPL_1VRG(vec_udword2, vec_udword2, vpopcntd, vec_popcnt)
VSX_IMPL_1VRG(vec_udword2, vec_dword2, vpopcntd, vec_popcnt)
#endif // vec_popcnt

#if __GNUG__ < 5
// vec_xxpermdi in GCC 4 is missing little-endian support, just like clang
# define vec_permi(a, b, c) vec_xxpermdi(b, a, (3 ^ ((c & 1) << 1 | c >> 1)))
// vec_packs doesn't support doublewords in GCC 4
# undef vec_packs
VSX_REDIRECT_2RG(vec_char16, vec_short8, vec_packs, __builtin_vec_packs)
VSX_REDIRECT_2RG(vec_uchar16, vec_ushort8, vec_packs, __builtin_vec_packs)
VSX_REDIRECT_2RG(vec_short8, vec_int4, vec_packs, __builtin_vec_packs)
VSX_REDIRECT_2RG(vec_ushort8, vec_uint4, vec_packs, __builtin_vec_packs)
VSX_IMPL_2VRG_F(vec_int4, vec_dword2, "vpksdss %0,%2,%1", vec_packs)
VSX_IMPL_2VRG_F(vec_uint4, vec_udword2, "vpkudus %0,%2,%1", vec_packs)
#else
# define vec_permi vec_xxpermdi
#endif
// converts between single and double precision
#ifndef vec_cvf
VSX_REDIRECT_1RG(vec_float4, vec_double2, vec_cvf, __builtin_vsx_xvcvdpsp)
FORCE_INLINE(vec_double2) vec_cvf(const vec_float4& a)
{ return __builtin_vsx_xvcvspdp(vec_sld(a, a, 4)); }
#endif

// converts 32- and 64-bit integers to double precision
#ifndef vec_ctd
# define vec_ctd(a, b) __vec_ctd(a)
VSX_IMPL_1RG(vec_double2, wd, vec_int4, wa, xvcvsxwdp, __vec_ctd)
VSX_IMPL_1RG(vec_double2, wd, vec_uint4, wa, xvcvuxwdp, __vec_ctd)
VSX_IMPL_1RG(vec_double2, wd, vec_dword2, wi, xvcvsxddp, __vec_ctd)
VSX_IMPL_1RG(vec_double2, wd, vec_udword2, wi, xvcvuxddp, __vec_ctd)
#endif

// shift left double by word immediate
#ifndef vec_sldw
# define vec_sldw __builtin_vsx_xxsldwi
#endif

// just in case GCC doesn't define them
#ifndef vec_xl
# define vec_xl vec_vsx_ld
# define vec_xst vec_vsx_st
#endif
#endif // GCC VSX compatibility
/*
 * CLANG VSX compatibility
 **/
#if defined(__clang__) && !defined(__IBMCPP__)
/*
 * CLANG doesn't support %x<n> in inline asm templates, which is required to fix the
 * register number when any of the register constraints wa, wd, wf is used.
 *
 * For more explanation, check PowerPC and IBM RS6000 in https://gcc.gnu.org/onlinedocs/gcc/Machine-Constraints.html
 * There is also an open bug: https://bugs.llvm.org/show_bug.cgi?id=31837
 *
 * So we can't use inline asm here and rely only on the built-in functions that CLANG supports.
 */
#if __clang_major__ < 5
// implement vec_permi in a dirty way
# define VSX_IMPL_CLANG_4_PERMI(Tvec) \
FORCE_INLINE(Tvec) vec_permi(const Tvec& a, const Tvec& b, unsigned const char c) \
{ \
    switch (c) \
    { \
    case 0: \
        return vec_mergeh(a, b); \
    case 1: \
        return vec_mergel(vec_mergeh(a, a), b); \
    case 2: \
        return vec_mergeh(vec_mergel(a, a), b); \
    default: \
        return vec_mergel(a, b); \
    } \
}
VSX_IMPL_CLANG_4_PERMI(vec_udword2)
VSX_IMPL_CLANG_4_PERMI(vec_dword2)
VSX_IMPL_CLANG_4_PERMI(vec_double2)

// vec_xxsldwi is missing in clang 4
# define vec_xxsldwi(a, b, c) vec_sld(a, b, (c) * 4)
#else
// vec_xxpermdi lacks little-endian support in clang, just like in GCC 4
# define vec_permi(a, b, c) vec_xxpermdi(b, a, (3 ^ ((c & 1) << 1 | c >> 1)))
#endif // __clang_major__ < 5
// shift left double by word immediate
#ifndef vec_sldw
# define vec_sldw vec_xxsldwi
#endif

/* converts between single and double precision */
#ifndef vec_cvf
VSX_REDIRECT_1RG(vec_float4, vec_double2, vec_cvf, __builtin_vsx_xvcvdpsp)
FORCE_INLINE(vec_double2) vec_cvf(const vec_float4& a)
{ return __builtin_vsx_xvcvspdp(vec_sld(a, a, 4)); }
#endif

/* converts 32- and 64-bit integers to double precision */
#ifndef vec_ctd
# define vec_ctd(a, b) __vec_ctd(a)
VSX_REDIRECT_1RG(vec_double2, vec_int4, __vec_ctd, __builtin_vsx_xvcvsxwdp)
VSX_REDIRECT_1RG(vec_double2, vec_uint4, __vec_ctd, __builtin_vsx_xvcvuxwdp)
// implement vec_ctd for doublewords in a dirty way, since the builtins xvcvsxddp and xvcvuxddp are missing;
// please try to avoid using it for doublewords
FORCE_INLINE(vec_double2) __vec_ctd(const vec_dword2& a)
{ return vec_double2_set((double)vec_extract(a, 0), (double)vec_extract(a, 1)); }
FORCE_INLINE(vec_double2) __vec_ctd(const vec_udword2& a)
{ return vec_double2_set((double)vec_extract(a, 0), (double)vec_extract(a, 1)); }
#endif

// implement vec_rsqrt, since clang only supports vec_rsqrte
#ifndef vec_rsqrt
FORCE_INLINE(vec_float4) vec_rsqrt(const vec_float4& a)
{ return vec_div(vec_float4_sp(1), vec_sqrt(a)); }

FORCE_INLINE(vec_double2) vec_rsqrt(const vec_double2& a)
{ return vec_div(vec_double2_sp(1), vec_sqrt(a)); }
#endif
/*
 * __builtin_altivec_vctsxs, which is used by vec_cts, causes an ambiguity error in clang 5 and 6,
 * so we redefine vec_cts with explicit casts
 */
#if __clang_major__ > 4
# undef vec_cts
# define vec_cts(__a, __b) \
    _Generic((__a), vector float \
             : (vector signed int)__builtin_altivec_vctsxs((__a), (__b)), vector double \
             : __extension__({ \
                 vector double __ret = \
                     (__a) * \
                     (vector double)(vector unsigned long long)((0x3ffULL + (__b)) \
                                                                << 52); \
                 __builtin_convertvector(__ret, vector signed long long); \
               }))
#endif // __clang_major__ > 4
#endif // CLANG VSX compatibility
/*
 * implement vsx_ld(offset, pointer), vsx_st(vector, offset, pointer):
 * load and store at an offset measured in elements of the pointed-to type
 *
 * implement vsx_ldf(offset, pointer), vsx_stf(vector, offset, pointer):
 * load and store at an offset measured in bytes
 *
 * Note: in clang, vec_xl and vec_xst fail on unaligned addresses,
 * so we use vec_vsx_ld and vec_vsx_st instead
 */
#if defined(__clang__) && !defined(__IBMCPP__)
# define vsx_ldf vec_vsx_ld
# define vsx_stf vec_vsx_st
#else // GCC, XLC
# define vsx_ldf vec_xl
# define vsx_stf vec_xst
#endif

#define VSX_OFFSET(o, p) ((o) * sizeof(*(p)))
#define vsx_ld(o, p) vsx_ldf(VSX_OFFSET(o, p), p)
#define vsx_st(v, o, p) vsx_stf(v, VSX_OFFSET(o, p), p)
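
/*
 * Usage sketch (illustrative only): vsx_ld/vsx_st take the offset in elements
 * of the pointed-to type and scale it by sizeof(*p), while vsx_ldf/vsx_stf take
 * a raw byte offset. For a float pointer the two loads below touch the same
 * 16 bytes:
 *
 *   vec_float4 v0 = vsx_ld(4, ptr);    // elements ptr[4..7]
 *   vec_float4 v1 = vsx_ldf(16, ptr);  // same bytes, offset given in bytes
 *   vsx_st(v0, 8, ptr);                // store to ptr[8..11]
 */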
/*
 * implement vsx_ld2(offset, pointer), vsx_st2(vector, offset, pointer) to load and store doublewords.
 * In GCC, vec_xl and vec_xst map to vec_vsx_ld and vec_vsx_st, which don't accept long long pointers,
 * and in CLANG we use vec_vsx_ld and vec_vsx_st because vec_xl and vec_xst fail on unaligned addresses.
 *
 * In XLC, vec_xl and vec_xst fail to cast int64 (long int) to long long.
 */
#if (defined(__GNUG__) || defined(__clang__)) && !defined(__IBMCPP__)
FORCE_INLINE(vec_udword2) vsx_ld2(long o, const uint64* p)
{ return vec_udword2_c(vsx_ldf(VSX_OFFSET(o, p), (unsigned int*)p)); }

FORCE_INLINE(vec_dword2) vsx_ld2(long o, const int64* p)
{ return vec_dword2_c(vsx_ldf(VSX_OFFSET(o, p), (int*)p)); }

FORCE_INLINE(void) vsx_st2(const vec_udword2& vec, long o, uint64* p)
{ vsx_stf(vec_uint4_c(vec), VSX_OFFSET(o, p), (unsigned int*)p); }

FORCE_INLINE(void) vsx_st2(const vec_dword2& vec, long o, int64* p)
{ vsx_stf(vec_int4_c(vec), VSX_OFFSET(o, p), (int*)p); }
#else // XLC
FORCE_INLINE(vec_udword2) vsx_ld2(long o, const uint64* p)
{ return vsx_ldf(VSX_OFFSET(o, p), (unsigned long long*)p); }

FORCE_INLINE(vec_dword2) vsx_ld2(long o, const int64* p)
{ return vsx_ldf(VSX_OFFSET(o, p), (long long*)p); }

FORCE_INLINE(void) vsx_st2(const vec_udword2& vec, long o, uint64* p)
{ vsx_stf(vec, VSX_OFFSET(o, p), (unsigned long long*)p); }

FORCE_INLINE(void) vsx_st2(const vec_dword2& vec, long o, int64* p)
{ vsx_stf(vec, VSX_OFFSET(o, p), (long long*)p); }
#endif
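
/*
 * Usage sketch (illustrative only): vsx_ld2/vsx_st2 are the 64-bit counterparts
 * of vsx_ld/vsx_st, with the offset counted in 64-bit elements of the pointer
 * ("out" below is a hypothetical uint64* destination buffer):
 *
 *   const uint64* p = ...;
 *   vec_udword2 q0 = vsx_ld2(0, p);   // p[0], p[1]
 *   vec_udword2 q1 = vsx_ld2(2, p);   // p[2], p[3]
 *   vsx_st2(q0, 4, out);              // out[4], out[5]
 */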
#if defined(__clang__) || defined(__IBMCPP__)
// GCC can work out the cast from long int on its own, but XLC and CLANG report an ambiguity
FORCE_INLINE(vec_udword2) vec_splats(uint64 v)
{ return vec_splats((unsigned long long) v); }

FORCE_INLINE(vec_dword2) vec_splats(int64 v)
{ return vec_splats((long long) v); }
#endif

// Implement store of vector bool char for XLC
#if defined(__IBMCPP__) && defined(__clang__)
FORCE_INLINE(void) vec_xst(const vec_bchar16 &vec, long o, uchar* p)
{ vec_xst(vec_uchar16_c(vec), VSX_OFFSET(o, p), p); }
#endif

// Working around vec_popcnt compatibility
/*
 * vec_popcnt should return an unsigned type, but clang disagrees, just like GCC does with vec_vpopcnt;
 * use vec_popcntu instead to work around it
 */
#if defined(__clang__) && !defined(__IBMCPP__)
# define VSX_IMPL_CLANG_POPCNTU(Tvec, Tvec2, ucast) \
FORCE_INLINE(Tvec) vec_popcntu(const Tvec2& a) \
{ return ucast(vec_popcnt(a)); }

VSX_IMPL_CLANG_POPCNTU(vec_uchar16, vec_char16, vec_uchar16_c);
VSX_IMPL_CLANG_POPCNTU(vec_ushort8, vec_short8, vec_ushort8_c);
VSX_IMPL_CLANG_POPCNTU(vec_uint4, vec_int4, vec_uint4_c);
// redirect the unsigned types
VSX_REDIRECT_1RG(vec_uchar16, vec_uchar16, vec_popcntu, vec_popcnt)
VSX_REDIRECT_1RG(vec_ushort8, vec_ushort8, vec_popcntu, vec_popcnt)
VSX_REDIRECT_1RG(vec_uint4, vec_uint4, vec_popcntu, vec_popcnt)
#else
# define vec_popcntu vec_popcnt
#endif
// Working around vec_cts compatibility
/*
 * vec_cts in GCC and clang converts single precision to a signed fixed-point word
 * and double precision to a signed doubleword, and there is no implementation of vec_ctsl.
 *
 * vec_cts in XLC converts both single and double precision to a signed fixed-point word,
 * and XLC has vec_ctsl, which converts single and double precision to a signed doubleword.
 *
 * To deal with this, use vec_cts only to convert single precision to a signed fixed-point word,
 * and use vec_ctsl to convert double precision to a signed doubleword.
 *
 * We also implement vec_ctsw(a), which converts double precision to a signed fixed-point word.
 */
// converts double precision to signed doubleword for GCC and CLANG
#if !defined(vec_ctsl) && !defined(__IBMCPP__) && (defined(__GNUG__) || defined(__clang__))
// GCC 4 produces incorrect results when converting to signed doubleword
# if !defined(__clang__) && __GNUG__ < 5
#   define vec_ctsl(a, b) __vec_ctsl(a)
VSX_IMPL_1RG(vec_dword2, wi, vec_double2, wd, xvcvdpsxds, __vec_ctsl)
# else // GCC > 4, CLANG
#   define vec_ctsl vec_cts
# endif
#endif

// converts double precision to signed fixed-point word
#if defined(__IBMCPP__)
# define vec_ctsw(a) vec_cts(a, 0)
#else // GCC, CLANG
# define vec_ctsw(a) vec_int4_c(__builtin_vsx_xvcvdpsxws(a))
#endif
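
/*
 * Usage sketch (illustrative only), following the convention described above:
 *
 *   vec_float4  f = ...;
 *   vec_double2 d = ...;
 *   vec_int4   fw = vec_cts(f, 0);   // single precision -> signed words
 *   vec_dword2 dd = vec_ctsl(d, 0);  // double precision -> signed doublewords
 *   vec_int4   dw = vec_ctsw(d);     // double precision -> signed words
 */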
// load 4 unsigned bytes into a uint4 vector
#define vec_ld_buw(p) vec_uint4_set((p)[0], (p)[1], (p)[2], (p)[3])

// load 4 signed bytes into an int4 vector
#define vec_ld_bsw(p) vec_int4_set((p)[0], (p)[1], (p)[2], (p)[3])

// load 4 unsigned bytes into a float vector
#define vec_ld_bps(p) vec_ctf(vec_ld_buw(p), 0)

// store the lower 8 bytes
#define vec_st_l8(v, p) *((uint64*)(p)) = vec_extract(vec_udword2_c(v), 0)

// store the higher 8 bytes
#define vec_st_h8(v, p) *((uint64*)(p)) = vec_extract(vec_udword2_c(v), 1)

/*
 * vec_ld_l8(ptr)  -> load 64 bits of integer data into the lower part
 * vec_ldz_l8(ptr) -> load 64 bits of integer data into the lower part and zero the upper part
 **/
#if defined(__clang__) && !defined(__IBMCPP__)
# define __VSX_LOAD_L8(Tvec, p) (Tvec)((vec_udword2)*((uint64*)(p)))
#else
# define __VSX_LOAD_L8(Tvec, p) *((Tvec*)(p))
#endif

#define VSX_IMPL_LOAD_L8(Tvec, Tp) \
FORCE_INLINE(Tvec) vec_ld_l8(const Tp *p) \
{ return __VSX_LOAD_L8(Tvec, p); } \
FORCE_INLINE(Tvec) vec_ldz_l8(const Tp *p) \
{ \
    static const vec_bdword2 mask = {0xFFFFFFFFFFFFFFFF, 0x0000000000000000}; \
    return vec_and(vec_ld_l8(p), (Tvec)mask); \
}
VSX_IMPL_LOAD_L8(vec_uchar16, uchar)
VSX_IMPL_LOAD_L8(vec_char16, schar)
VSX_IMPL_LOAD_L8(vec_ushort8, ushort)
VSX_IMPL_LOAD_L8(vec_short8, short)
VSX_IMPL_LOAD_L8(vec_uint4, uint)
VSX_IMPL_LOAD_L8(vec_int4, int)
VSX_IMPL_LOAD_L8(vec_float4, float)
VSX_IMPL_LOAD_L8(vec_udword2, uint64)
VSX_IMPL_LOAD_L8(vec_dword2, int64)
VSX_IMPL_LOAD_L8(vec_double2, double)
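
/*
 * Usage sketch (illustrative only): the *_l8 helpers work on 8 bytes (half a
 * vector register) at a time, which is handy for row tails:
 *
 *   vec_uchar16 v = vec_ldz_l8(src);  // 8 bytes from src, the other half zeroed
 *   // ... process v ...
 *   vec_st_l8(v, dst);                // write the same 8-byte half back to dst
 */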
// logical not
#define vec_not(a) vec_nor(a, a)

// the following have native instructions on POWER9
// not equal
#ifndef vec_cmpne
# define vec_cmpne(a, b) vec_not(vec_cmpeq(a, b))
#endif

// absolute difference
#ifndef vec_absd
# define vec_absd(a, b) vec_sub(vec_max(a, b), vec_min(a, b))
#endif
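
/*
 * Usage sketch (illustrative only): vec_absd computes |a - b| via max/min, so
 * it stays correct for unsigned types where a plain subtraction would wrap:
 *
 *   vec_uchar16 a = vec_uchar16_sp(10);
 *   vec_uchar16 b = vec_uchar16_sp(250);
 *   vec_uchar16 d = vec_absd(a, b);   // every lane holds 240
 */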
/*
 * Implement vec_unpacklu and vec_unpackhu
 * since vec_unpackl, vec_unpackh only support signed integers
 **/
#define VSX_IMPL_UNPACKU(rt, rg, zero) \
FORCE_INLINE(rt) vec_unpacklu(const rg& a) \
{ return reinterpret_cast<rt>(vec_mergel(a, zero)); } \
FORCE_INLINE(rt) vec_unpackhu(const rg& a) \
{ return reinterpret_cast<rt>(vec_mergeh(a, zero)); }

VSX_IMPL_UNPACKU(vec_ushort8, vec_uchar16, vec_uchar16_z)
VSX_IMPL_UNPACKU(vec_uint4, vec_ushort8, vec_ushort8_z)
VSX_IMPL_UNPACKU(vec_udword2, vec_uint4, vec_uint4_z)
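
/*
 * Usage sketch (illustrative only): the unsigned unpacks zero-extend one half
 * of the lanes to the next wider element type:
 *
 *   vec_uchar16 pix = ...;
 *   vec_ushort8 w_lo = vec_unpacklu(pix);   // one half of the bytes, zero-extended
 *   vec_ushort8 w_hi = vec_unpackhu(pix);   // the other half, zero-extended
 */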
/*
 * Implement vec_mergesqe and vec_mergesqo,
 * which merge the even-indexed and odd-indexed elements of two vectors, respectively
 */
// 16 lanes
#define perm16_mergesqe 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30
#define perm16_mergesqo 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31
VSX_IMPL_PERM(vec_uchar16, vec_mergesqe, perm16_mergesqe)
VSX_IMPL_PERM(vec_uchar16, vec_mergesqo, perm16_mergesqo)
VSX_IMPL_PERM(vec_char16, vec_mergesqe, perm16_mergesqe)
VSX_IMPL_PERM(vec_char16, vec_mergesqo, perm16_mergesqo)
// 8 lanes
#define perm8_mergesqe 0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29
#define perm8_mergesqo 2, 3, 6, 7, 10, 11, 14, 15, 18, 19, 22, 23, 26, 27, 30, 31
VSX_IMPL_PERM(vec_ushort8, vec_mergesqe, perm8_mergesqe)
VSX_IMPL_PERM(vec_ushort8, vec_mergesqo, perm8_mergesqo)
VSX_IMPL_PERM(vec_short8, vec_mergesqe, perm8_mergesqe)
VSX_IMPL_PERM(vec_short8, vec_mergesqo, perm8_mergesqo)
// 4 lanes
#define perm4_mergesqe 0, 1, 2, 3, 8, 9, 10, 11, 16, 17, 18, 19, 24, 25, 26, 27
#define perm4_mergesqo 4, 5, 6, 7, 12, 13, 14, 15, 20, 21, 22, 23, 28, 29, 30, 31
VSX_IMPL_PERM(vec_uint4, vec_mergesqe, perm4_mergesqe)
VSX_IMPL_PERM(vec_uint4, vec_mergesqo, perm4_mergesqo)
VSX_IMPL_PERM(vec_int4, vec_mergesqe, perm4_mergesqe)
VSX_IMPL_PERM(vec_int4, vec_mergesqo, perm4_mergesqo)
VSX_IMPL_PERM(vec_float4, vec_mergesqe, perm4_mergesqe)
VSX_IMPL_PERM(vec_float4, vec_mergesqo, perm4_mergesqo)
// 2 lanes
VSX_REDIRECT_2RG(vec_double2, vec_double2, vec_mergesqe, vec_mergeh)
VSX_REDIRECT_2RG(vec_double2, vec_double2, vec_mergesqo, vec_mergel)
VSX_REDIRECT_2RG(vec_dword2, vec_dword2, vec_mergesqe, vec_mergeh)
VSX_REDIRECT_2RG(vec_dword2, vec_dword2, vec_mergesqo, vec_mergel)
VSX_REDIRECT_2RG(vec_udword2, vec_udword2, vec_mergesqe, vec_mergeh)
VSX_REDIRECT_2RG(vec_udword2, vec_udword2, vec_mergesqo, vec_mergel)
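
/*
 * Illustrative sketch (16-lane case, not part of the API): vec_mergesqe(a, b)
 * gathers the even-indexed elements of a followed by the even-indexed elements
 * of b, and vec_mergesqo does the same for the odd-indexed elements:
 *
 *   a = {a0, a1, ..., a15},  b = {b0, b1, ..., b15}
 *   vec_mergesqe(a, b) -> {a0, a2, ..., a14, b0, b2, ..., b14}
 *   vec_mergesqo(a, b) -> {a1, a3, ..., a15, b1, b3, ..., b15}
 *
 * This is what the channel deinterleave helpers below build on.
 */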
/*
 * Implement vec_mergesqh and vec_mergesql,
 * which merge the most and least significant halves of two vectors, respectively
 */
#define VSX_IMPL_MERGESQHL(Tvec) \
FORCE_INLINE(Tvec) vec_mergesqh(const Tvec& a, const Tvec& b) \
{ return (Tvec)vec_mergeh(vec_udword2_c(a), vec_udword2_c(b)); } \
FORCE_INLINE(Tvec) vec_mergesql(const Tvec& a, const Tvec& b) \
{ return (Tvec)vec_mergel(vec_udword2_c(a), vec_udword2_c(b)); }

VSX_IMPL_MERGESQHL(vec_uchar16)
VSX_IMPL_MERGESQHL(vec_char16)
VSX_IMPL_MERGESQHL(vec_ushort8)
VSX_IMPL_MERGESQHL(vec_short8)
VSX_IMPL_MERGESQHL(vec_uint4)
VSX_IMPL_MERGESQHL(vec_int4)
VSX_IMPL_MERGESQHL(vec_float4)
VSX_REDIRECT_2RG(vec_udword2, vec_udword2, vec_mergesqh, vec_mergeh)
VSX_REDIRECT_2RG(vec_udword2, vec_udword2, vec_mergesql, vec_mergel)
VSX_REDIRECT_2RG(vec_dword2, vec_dword2, vec_mergesqh, vec_mergeh)
VSX_REDIRECT_2RG(vec_dword2, vec_dword2, vec_mergesql, vec_mergel)
VSX_REDIRECT_2RG(vec_double2, vec_double2, vec_mergesqh, vec_mergeh)
VSX_REDIRECT_2RG(vec_double2, vec_double2, vec_mergesql, vec_mergel)
// 2- and 4-channel interleave for all types except the 2-lane ones
#define VSX_IMPL_ST_INTERLEAVE(Tp, Tvec) \
FORCE_INLINE(void) vec_st_interleave(const Tvec& a, const Tvec& b, Tp* ptr) \
{ \
    vsx_stf(vec_mergeh(a, b), 0, ptr); \
    vsx_stf(vec_mergel(a, b), 16, ptr); \
} \
FORCE_INLINE(void) vec_st_interleave(const Tvec& a, const Tvec& b, \
                                     const Tvec& c, const Tvec& d, Tp* ptr) \
{ \
    Tvec ac = vec_mergeh(a, c); \
    Tvec bd = vec_mergeh(b, d); \
    vsx_stf(vec_mergeh(ac, bd), 0, ptr); \
    vsx_stf(vec_mergel(ac, bd), 16, ptr); \
    ac = vec_mergel(a, c); \
    bd = vec_mergel(b, d); \
    vsx_stf(vec_mergeh(ac, bd), 32, ptr); \
    vsx_stf(vec_mergel(ac, bd), 48, ptr); \
}
VSX_IMPL_ST_INTERLEAVE(uchar, vec_uchar16)
VSX_IMPL_ST_INTERLEAVE(schar, vec_char16)
VSX_IMPL_ST_INTERLEAVE(ushort, vec_ushort8)
VSX_IMPL_ST_INTERLEAVE(short, vec_short8)
VSX_IMPL_ST_INTERLEAVE(uint, vec_uint4)
VSX_IMPL_ST_INTERLEAVE(int, vec_int4)
VSX_IMPL_ST_INTERLEAVE(float, vec_float4)
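
/*
 * Usage sketch (illustrative only): interleaving two channel vectors into a
 * packed buffer, e.g. writing 16 (x, y) uchar pairs:
 *
 *   vec_uchar16 x = ..., y = ...;
 *   uchar xy[32];
 *   vec_st_interleave(x, y, xy);   // xy = {x0, y0, x1, y1, ..., x15, y15}
 */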
// 2- and 4-channel deinterleave for 16 lanes
#define VSX_IMPL_ST_DINTERLEAVE_8(Tp, Tvec) \
FORCE_INLINE(void) vec_ld_deinterleave(const Tp* ptr, Tvec& a, Tvec& b) \
{ \
    Tvec v0 = vsx_ld(0, ptr); \
    Tvec v1 = vsx_ld(16, ptr); \
    a = vec_mergesqe(v0, v1); \
    b = vec_mergesqo(v0, v1); \
} \
FORCE_INLINE(void) vec_ld_deinterleave(const Tp* ptr, Tvec& a, Tvec& b, \
                                       Tvec& c, Tvec& d) \
{ \
    Tvec v0 = vsx_ld(0, ptr); \
    Tvec v1 = vsx_ld(16, ptr); \
    Tvec v2 = vsx_ld(32, ptr); \
    Tvec v3 = vsx_ld(48, ptr); \
    Tvec m0 = vec_mergesqe(v0, v1); \
    Tvec m1 = vec_mergesqe(v2, v3); \
    a = vec_mergesqe(m0, m1); \
    c = vec_mergesqo(m0, m1); \
    m0 = vec_mergesqo(v0, v1); \
    m1 = vec_mergesqo(v2, v3); \
    b = vec_mergesqe(m0, m1); \
    d = vec_mergesqo(m0, m1); \
}
VSX_IMPL_ST_DINTERLEAVE_8(uchar, vec_uchar16)
VSX_IMPL_ST_DINTERLEAVE_8(schar, vec_char16)

// 2- and 4-channel deinterleave for 8 lanes
#define VSX_IMPL_ST_DINTERLEAVE_16(Tp, Tvec) \
FORCE_INLINE(void) vec_ld_deinterleave(const Tp* ptr, Tvec& a, Tvec& b) \
{ \
    Tvec v0 = vsx_ld(0, ptr); \
    Tvec v1 = vsx_ld(8, ptr); \
    a = vec_mergesqe(v0, v1); \
    b = vec_mergesqo(v0, v1); \
} \
FORCE_INLINE(void) vec_ld_deinterleave(const Tp* ptr, Tvec& a, Tvec& b, \
                                       Tvec& c, Tvec& d) \
{ \
    Tvec v0 = vsx_ld(0, ptr); \
    Tvec v1 = vsx_ld(8, ptr); \
    Tvec m0 = vec_mergeh(v0, v1); \
    Tvec m1 = vec_mergel(v0, v1); \
    Tvec ab0 = vec_mergeh(m0, m1); \
    Tvec cd0 = vec_mergel(m0, m1); \
    v0 = vsx_ld(16, ptr); \
    v1 = vsx_ld(24, ptr); \
    m0 = vec_mergeh(v0, v1); \
    m1 = vec_mergel(v0, v1); \
    Tvec ab1 = vec_mergeh(m0, m1); \
    Tvec cd1 = vec_mergel(m0, m1); \
    a = vec_mergesqh(ab0, ab1); \
    b = vec_mergesql(ab0, ab1); \
    c = vec_mergesqh(cd0, cd1); \
    d = vec_mergesql(cd0, cd1); \
}
VSX_IMPL_ST_DINTERLEAVE_16(ushort, vec_ushort8)
VSX_IMPL_ST_DINTERLEAVE_16(short, vec_short8)

// 2- and 4-channel deinterleave for 4 lanes
#define VSX_IMPL_ST_DINTERLEAVE_32(Tp, Tvec) \
FORCE_INLINE(void) vec_ld_deinterleave(const Tp* ptr, Tvec& a, Tvec& b) \
{ \
    a = vsx_ld(0, ptr); \
    b = vsx_ld(4, ptr); \
    Tvec m0 = vec_mergeh(a, b); \
    Tvec m1 = vec_mergel(a, b); \
    a = vec_mergeh(m0, m1); \
    b = vec_mergel(m0, m1); \
} \
FORCE_INLINE(void) vec_ld_deinterleave(const Tp* ptr, Tvec& a, Tvec& b, \
                                       Tvec& c, Tvec& d) \
{ \
    Tvec v0 = vsx_ld(0, ptr); \
    Tvec v1 = vsx_ld(4, ptr); \
    Tvec v2 = vsx_ld(8, ptr); \
    Tvec v3 = vsx_ld(12, ptr); \
    Tvec m0 = vec_mergeh(v0, v2); \
    Tvec m1 = vec_mergeh(v1, v3); \
    a = vec_mergeh(m0, m1); \
    b = vec_mergel(m0, m1); \
    m0 = vec_mergel(v0, v2); \
    m1 = vec_mergel(v1, v3); \
    c = vec_mergeh(m0, m1); \
    d = vec_mergel(m0, m1); \
}
VSX_IMPL_ST_DINTERLEAVE_32(uint, vec_uint4)
VSX_IMPL_ST_DINTERLEAVE_32(int, vec_int4)
VSX_IMPL_ST_DINTERLEAVE_32(float, vec_float4)
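
/*
 * Usage sketch (illustrative only): deinterleaving 4 packed 4-channel float
 * pixels (e.g. RGBA) into per-channel vectors:
 *
 *   const float* rgba = ...;             // r0,g0,b0,a0, r1,g1,b1,a1, ...
 *   vec_float4 r, g, b, a;
 *   vec_ld_deinterleave(rgba, r, g, b, a);
 *   // r = {r0, r1, r2, r3}, g = {g0, g1, g2, g3}, and so on
 */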
// 2- and 4-channel interleave and deinterleave for 2 lanes
#define VSX_IMPL_ST_D_INTERLEAVE_64(Tp, Tvec, ld_func, st_func) \
FORCE_INLINE(void) vec_st_interleave(const Tvec& a, const Tvec& b, Tp* ptr) \
{ \
    st_func(vec_mergeh(a, b), 0, ptr); \
    st_func(vec_mergel(a, b), 2, ptr); \
} \
FORCE_INLINE(void) vec_st_interleave(const Tvec& a, const Tvec& b, \
                                     const Tvec& c, const Tvec& d, Tp* ptr) \
{ \
    st_func(vec_mergeh(a, b), 0, ptr); \
    st_func(vec_mergel(a, b), 2, ptr); \
    st_func(vec_mergeh(c, d), 4, ptr); \
    st_func(vec_mergel(c, d), 6, ptr); \
} \
FORCE_INLINE(void) vec_ld_deinterleave(const Tp* ptr, Tvec& a, Tvec& b) \
{ \
    Tvec m0 = ld_func(0, ptr); \
    Tvec m1 = ld_func(2, ptr); \
    a = vec_mergeh(m0, m1); \
    b = vec_mergel(m0, m1); \
} \
FORCE_INLINE(void) vec_ld_deinterleave(const Tp* ptr, Tvec& a, Tvec& b, \
                                       Tvec& c, Tvec& d) \
{ \
    Tvec v0 = ld_func(0, ptr); \
    Tvec v1 = ld_func(2, ptr); \
    a = vec_mergeh(v0, v1); \
    b = vec_mergel(v0, v1); \
    v0 = ld_func(4, ptr); \
    v1 = ld_func(6, ptr); \
    c = vec_mergeh(v0, v1); \
    d = vec_mergel(v0, v1); \
}
VSX_IMPL_ST_D_INTERLEAVE_64(int64, vec_dword2, vsx_ld2, vsx_st2)
VSX_IMPL_ST_D_INTERLEAVE_64(uint64, vec_udword2, vsx_ld2, vsx_st2)
VSX_IMPL_ST_D_INTERLEAVE_64(double, vec_double2, vsx_ld, vsx_st)
/* 3 channels */
#define VSX_IMPL_ST_INTERLEAVE_3CH_16(Tp, Tvec) \
FORCE_INLINE(void) vec_st_interleave(const Tvec& a, const Tvec& b, \
                                     const Tvec& c, Tp* ptr) \
{ \
    static const vec_uchar16 a12 = {0, 16, 0, 1, 17, 0, 2, 18, 0, 3, 19, 0, 4, 20, 0, 5}; \
    static const vec_uchar16 a123 = {0, 1, 16, 3, 4, 17, 6, 7, 18, 9, 10, 19, 12, 13, 20, 15}; \
    vsx_st(vec_perm(vec_perm(a, b, a12), c, a123), 0, ptr); \
    static const vec_uchar16 b12 = {21, 0, 6, 22, 0, 7, 23, 0, 8, 24, 0, 9, 25, 0, 10, 26}; \
    static const vec_uchar16 b123 = {0, 21, 2, 3, 22, 5, 6, 23, 8, 9, 24, 11, 12, 25, 14, 15}; \
    vsx_st(vec_perm(vec_perm(a, b, b12), c, b123), 16, ptr); \
    static const vec_uchar16 c12 = {0, 11, 27, 0, 12, 28, 0, 13, 29, 0, 14, 30, 0, 15, 31, 0}; \
    static const vec_uchar16 c123 = {26, 1, 2, 27, 4, 5, 28, 7, 8, 29, 10, 11, 30, 13, 14, 31}; \
    vsx_st(vec_perm(vec_perm(a, b, c12), c, c123), 32, ptr); \
} \
FORCE_INLINE(void) vec_ld_deinterleave(const Tp* ptr, Tvec& a, Tvec& b, Tvec& c) \
{ \
    Tvec v1 = vsx_ld(0, ptr); \
    Tvec v2 = vsx_ld(16, ptr); \
    Tvec v3 = vsx_ld(32, ptr); \
    static const vec_uchar16 a12_perm = {0, 3, 6, 9, 12, 15, 18, 21, 24, 27, 30, 0, 0, 0, 0, 0}; \
    static const vec_uchar16 a123_perm = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 17, 20, 23, 26, 29}; \
    a = vec_perm(vec_perm(v1, v2, a12_perm), v3, a123_perm); \
    static const vec_uchar16 b12_perm = {1, 4, 7, 10, 13, 16, 19, 22, 25, 28, 31, 0, 0, 0, 0, 0}; \
    static const vec_uchar16 b123_perm = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 18, 21, 24, 27, 30}; \
    b = vec_perm(vec_perm(v1, v2, b12_perm), v3, b123_perm); \
    static const vec_uchar16 c12_perm = {2, 5, 8, 11, 14, 17, 20, 23, 26, 29, 0, 0, 0, 0, 0, 0}; \
    static const vec_uchar16 c123_perm = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 16, 19, 22, 25, 28, 31}; \
    c = vec_perm(vec_perm(v1, v2, c12_perm), v3, c123_perm); \
}
VSX_IMPL_ST_INTERLEAVE_3CH_16(uchar, vec_uchar16)
VSX_IMPL_ST_INTERLEAVE_3CH_16(schar, vec_char16)

#define VSX_IMPL_ST_INTERLEAVE_3CH_8(Tp, Tvec) \
FORCE_INLINE(void) vec_st_interleave(const Tvec& a, const Tvec& b, \
                                     const Tvec& c, Tp* ptr) \
{ \
    static const vec_uchar16 a12 = {0, 1, 16, 17, 0, 0, 2, 3, 18, 19, 0, 0, 4, 5, 20, 21}; \
    static const vec_uchar16 a123 = {0, 1, 2, 3, 16, 17, 6, 7, 8, 9, 18, 19, 12, 13, 14, 15}; \
    vsx_st(vec_perm(vec_perm(a, b, a12), c, a123), 0, ptr); \
    static const vec_uchar16 b12 = {0, 0, 6, 7, 22, 23, 0, 0, 8, 9, 24, 25, 0, 0, 10, 11}; \
    static const vec_uchar16 b123 = {20, 21, 2, 3, 4, 5, 22, 23, 8, 9, 10, 11, 24, 25, 14, 15}; \
    vsx_st(vec_perm(vec_perm(a, b, b12), c, b123), 8, ptr); \
    static const vec_uchar16 c12 = {26, 27, 0, 0, 12, 13, 28, 29, 0, 0, 14, 15, 30, 31, 0, 0}; \
    static const vec_uchar16 c123 = {0, 1, 26, 27, 4, 5, 6, 7, 28, 29, 10, 11, 12, 13, 30, 31}; \
    vsx_st(vec_perm(vec_perm(a, b, c12), c, c123), 16, ptr); \
} \
FORCE_INLINE(void) vec_ld_deinterleave(const Tp* ptr, Tvec& a, Tvec& b, Tvec& c) \
{ \
    Tvec v1 = vsx_ld(0, ptr); \
    Tvec v2 = vsx_ld(8, ptr); \
    Tvec v3 = vsx_ld(16, ptr); \
    static const vec_uchar16 a12_perm = {0, 1, 6, 7, 12, 13, 18, 19, 24, 25, 30, 31, 0, 0, 0, 0}; \
    static const vec_uchar16 a123_perm = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 20, 21, 26, 27}; \
    a = vec_perm(vec_perm(v1, v2, a12_perm), v3, a123_perm); \
    static const vec_uchar16 b12_perm = {2, 3, 8, 9, 14, 15, 20, 21, 26, 27, 0, 0, 0, 0, 0, 0}; \
    static const vec_uchar16 b123_perm = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 16, 17, 22, 23, 28, 29}; \
    b = vec_perm(vec_perm(v1, v2, b12_perm), v3, b123_perm); \
    static const vec_uchar16 c12_perm = {4, 5, 10, 11, 16, 17, 22, 23, 28, 29, 0, 0, 0, 0, 0, 0}; \
    static const vec_uchar16 c123_perm = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 18, 19, 24, 25, 30, 31}; \
    c = vec_perm(vec_perm(v1, v2, c12_perm), v3, c123_perm); \
}
VSX_IMPL_ST_INTERLEAVE_3CH_8(ushort, vec_ushort8)
VSX_IMPL_ST_INTERLEAVE_3CH_8(short, vec_short8)

#define VSX_IMPL_ST_INTERLEAVE_3CH_4(Tp, Tvec) \
FORCE_INLINE(void) vec_st_interleave(const Tvec& a, const Tvec& b, \
                                     const Tvec& c, Tp* ptr) \
{ \
    Tvec hbc = vec_mergeh(b, c); \
    static const vec_uchar16 ahbc = {0, 1, 2, 3, 16, 17, 18, 19, 20, 21, 22, 23, 4, 5, 6, 7}; \
    vsx_st(vec_perm(a, hbc, ahbc), 0, ptr); \
    Tvec lab = vec_mergel(a, b); \
    vsx_st(vec_sld(lab, hbc, 8), 4, ptr); \
    static const vec_uchar16 clab = {8, 9, 10, 11, 24, 25, 26, 27, 28, 29, 30, 31, 12, 13, 14, 15};\
    vsx_st(vec_perm(c, lab, clab), 8, ptr); \
} \
FORCE_INLINE(void) vec_ld_deinterleave(const Tp* ptr, Tvec& a, Tvec& b, Tvec& c) \
{ \
    Tvec v1 = vsx_ld(0, ptr); \
    Tvec v2 = vsx_ld(4, ptr); \
    Tvec v3 = vsx_ld(8, ptr); \
    static const vec_uchar16 flp = {0, 1, 2, 3, 12, 13, 14, 15, 16, 17, 18, 19, 28, 29, 30, 31}; \
    a = vec_perm(v1, vec_sld(v3, v2, 8), flp); \
    static const vec_uchar16 flp2 = {28, 29, 30, 31, 0, 1, 2, 3, 12, 13, 14, 15, 16, 17, 18, 19}; \
    b = vec_perm(v2, vec_sld(v1, v3, 8), flp2); \
    c = vec_perm(vec_sld(v2, v1, 8), v3, flp); \
}
VSX_IMPL_ST_INTERLEAVE_3CH_4(uint, vec_uint4)
VSX_IMPL_ST_INTERLEAVE_3CH_4(int, vec_int4)
VSX_IMPL_ST_INTERLEAVE_3CH_4(float, vec_float4)

#define VSX_IMPL_ST_INTERLEAVE_3CH_2(Tp, Tvec, ld_func, st_func) \
FORCE_INLINE(void) vec_st_interleave(const Tvec& a, const Tvec& b, \
                                     const Tvec& c, Tp* ptr) \
{ \
    st_func(vec_mergeh(a, b), 0, ptr); \
    st_func(vec_permi(c, a, 1), 2, ptr); \
    st_func(vec_mergel(b, c), 4, ptr); \
} \
FORCE_INLINE(void) vec_ld_deinterleave(const Tp* ptr, Tvec& a, \
                                       Tvec& b, Tvec& c) \
{ \
    Tvec v1 = ld_func(0, ptr); \
    Tvec v2 = ld_func(2, ptr); \
    Tvec v3 = ld_func(4, ptr); \
    a = vec_permi(v1, v2, 1); \
    b = vec_permi(v1, v3, 2); \
    c = vec_permi(v2, v3, 1); \
}
VSX_IMPL_ST_INTERLEAVE_3CH_2(int64, vec_dword2, vsx_ld2, vsx_st2)
VSX_IMPL_ST_INTERLEAVE_3CH_2(uint64, vec_udword2, vsx_ld2, vsx_st2)
VSX_IMPL_ST_INTERLEAVE_3CH_2(double, vec_double2, vsx_ld, vsx_st)
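
/*
 * Usage sketch (illustrative only): splitting a packed 3-channel image row
 * (e.g. BGR uchar data) into per-channel vectors and packing it back
 * ("out" below is a hypothetical uchar buffer of at least 48 bytes):
 *
 *   const uchar* bgr = ...;            // b0,g0,r0, b1,g1,r1, ...
 *   vec_uchar16 b, g, r;
 *   vec_ld_deinterleave(bgr, b, g, r); // 16 pixels -> 3 planar vectors
 *   vec_st_interleave(b, g, r, out);   // and back to packed 3-channel data
 */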
#endif // CV_VSX

//! @}

#endif // OPENCV_HAL_VSX_UTILS_HPP