$treeview $search $mathjax
Eigen
3.2.5
$projectbrief
|
$projectbrief
|
$searchbox |
00001 // This file is part of Eigen, a lightweight C++ template library 00002 // for linear algebra. 00003 // 00004 // Copyright (C) 2008 Konstantinos Margaritis <markos@codex.gr> 00005 // 00006 // This Source Code Form is subject to the terms of the Mozilla 00007 // Public License v. 2.0. If a copy of the MPL was not distributed 00008 // with this file, You can obtain one at http://mozilla.org/MPL/2.0/. 00009 00010 #ifndef EIGEN_PACKET_MATH_ALTIVEC_H 00011 #define EIGEN_PACKET_MATH_ALTIVEC_H 00012 00013 namespace Eigen { 00014 00015 namespace internal { 00016 00017 #ifndef EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD 00018 #define EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD 4 00019 #endif 00020 00021 #ifndef EIGEN_HAS_FUSE_CJMADD 00022 #define EIGEN_HAS_FUSE_CJMADD 1 00023 #endif 00024 00025 // NOTE Altivec has 32 registers, but Eigen only accepts a value of 8 or 16 00026 #ifndef EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS 00027 #define EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS 16 00028 #endif 00029 00030 typedef __vector float Packet4f; 00031 typedef __vector int Packet4i; 00032 typedef __vector unsigned int Packet4ui; 00033 typedef __vector __bool int Packet4bi; 00034 typedef __vector short int Packet8i; 00035 typedef __vector unsigned char Packet16uc; 00036 00037 // We don't want to write the same code all the time, but we need to reuse the constants 00038 // and it doesn't really work to declare them global, so we define macros instead 00039 00040 #define _EIGEN_DECLARE_CONST_FAST_Packet4f(NAME,X) \ 00041 Packet4f p4f_##NAME = (Packet4f) vec_splat_s32(X) 00042 00043 #define _EIGEN_DECLARE_CONST_FAST_Packet4i(NAME,X) \ 00044 Packet4i p4i_##NAME = vec_splat_s32(X) 00045 00046 #define _EIGEN_DECLARE_CONST_Packet4f(NAME,X) \ 00047 Packet4f p4f_##NAME = pset1<Packet4f>(X) 00048 00049 #define _EIGEN_DECLARE_CONST_Packet4f_FROM_INT(NAME,X) \ 00050 Packet4f p4f_##NAME = vreinterpretq_f32_u32(pset1<int>(X)) 00051 00052 #define _EIGEN_DECLARE_CONST_Packet4i(NAME,X) \ 00053 Packet4i p4i_##NAME = pset1<Packet4i>(X) 00054 00055 #define DST_CHAN 1 00056 #define DST_CTRL(size, count, stride) (((size) << 24) | ((count) << 16) | (stride)) 00057 00058 // Define global static constants: 00059 static Packet4f p4f_COUNTDOWN = { 3.0, 2.0, 1.0, 0.0 }; 00060 static Packet4i p4i_COUNTDOWN = { 3, 2, 1, 0 }; 00061 static Packet16uc p16uc_REVERSE = {12,13,14,15, 8,9,10,11, 4,5,6,7, 0,1,2,3}; 00062 static Packet16uc p16uc_FORWARD = vec_lvsl(0, (float*)0); 00063 static Packet16uc p16uc_DUPLICATE = {0,1,2,3, 0,1,2,3, 4,5,6,7, 4,5,6,7}; 00064 00065 static _EIGEN_DECLARE_CONST_FAST_Packet4f(ZERO, 0); 00066 static _EIGEN_DECLARE_CONST_FAST_Packet4i(ZERO, 0); 00067 static _EIGEN_DECLARE_CONST_FAST_Packet4i(ONE,1); 00068 static _EIGEN_DECLARE_CONST_FAST_Packet4i(MINUS16,-16); 00069 static _EIGEN_DECLARE_CONST_FAST_Packet4i(MINUS1,-1); 00070 static Packet4f p4f_ONE = vec_ctf(p4i_ONE, 0); 00071 static Packet4f p4f_ZERO_ = (Packet4f) vec_sl((Packet4ui)p4i_MINUS1, (Packet4ui)p4i_MINUS1); 00072 00073 template<> struct packet_traits<float> : default_packet_traits 00074 { 00075 typedef Packet4f type; 00076 enum { 00077 Vectorizable = 1, 00078 AlignedOnScalar = 1, 00079 size=4, 00080 00081 // FIXME check the Has* 00082 HasSin = 0, 00083 HasCos = 0, 00084 HasLog = 0, 00085 HasExp = 0, 00086 HasSqrt = 0 00087 }; 00088 }; 00089 template<> struct packet_traits<int> : default_packet_traits 00090 { 00091 typedef Packet4i type; 00092 enum { 00093 // FIXME check the Has* 00094 Vectorizable = 1, 00095 AlignedOnScalar = 1, 00096 size=4 00097 }; 00098 }; 00099 00100 template<> struct unpacket_traits<Packet4f> { typedef float type; enum {size=4}; }; 00101 template<> struct unpacket_traits<Packet4i> { typedef int type; enum {size=4}; }; 00102 /* 00103 inline std::ostream & operator <<(std::ostream & s, const Packet4f & v) 00104 { 00105 union { 00106 Packet4f v; 00107 float n[4]; 00108 } vt; 00109 vt.v = v; 00110 s << vt.n[0] << ", " << vt.n[1] << ", " << vt.n[2] << ", " << vt.n[3]; 00111 return s; 00112 } 00113 00114 inline std::ostream & operator <<(std::ostream & s, const Packet4i & v) 00115 { 00116 union { 00117 Packet4i v; 00118 int n[4]; 00119 } vt; 00120 vt.v = v; 00121 s << vt.n[0] << ", " << vt.n[1] << ", " << vt.n[2] << ", " << vt.n[3]; 00122 return s; 00123 } 00124 00125 inline std::ostream & operator <<(std::ostream & s, const Packet4ui & v) 00126 { 00127 union { 00128 Packet4ui v; 00129 unsigned int n[4]; 00130 } vt; 00131 vt.v = v; 00132 s << vt.n[0] << ", " << vt.n[1] << ", " << vt.n[2] << ", " << vt.n[3]; 00133 return s; 00134 } 00135 00136 inline std::ostream & operator <<(std::ostream & s, const Packetbi & v) 00137 { 00138 union { 00139 Packet4bi v; 00140 unsigned int n[4]; 00141 } vt; 00142 vt.v = v; 00143 s << vt.n[0] << ", " << vt.n[1] << ", " << vt.n[2] << ", " << vt.n[3]; 00144 return s; 00145 } 00146 */ 00147 template<> EIGEN_STRONG_INLINE Packet4f pset1<Packet4f>(const float& from) { 00148 // Taken from http://developer.apple.com/hardwaredrivers/ve/alignment.html 00149 float EIGEN_ALIGN16 af[4]; 00150 af[0] = from; 00151 Packet4f vc = vec_ld(0, af); 00152 vc = vec_splat(vc, 0); 00153 return vc; 00154 } 00155 00156 template<> EIGEN_STRONG_INLINE Packet4i pset1<Packet4i>(const int& from) { 00157 int EIGEN_ALIGN16 ai[4]; 00158 ai[0] = from; 00159 Packet4i vc = vec_ld(0, ai); 00160 vc = vec_splat(vc, 0); 00161 return vc; 00162 } 00163 00164 template<> EIGEN_STRONG_INLINE Packet4f plset<float>(const float& a) { return vec_add(pset1<Packet4f>(a), p4f_COUNTDOWN); } 00165 template<> EIGEN_STRONG_INLINE Packet4i plset<int>(const int& a) { return vec_add(pset1<Packet4i>(a), p4i_COUNTDOWN); } 00166 00167 template<> EIGEN_STRONG_INLINE Packet4f padd<Packet4f>(const Packet4f& a, const Packet4f& b) { return vec_add(a,b); } 00168 template<> EIGEN_STRONG_INLINE Packet4i padd<Packet4i>(const Packet4i& a, const Packet4i& b) { return vec_add(a,b); } 00169 00170 template<> EIGEN_STRONG_INLINE Packet4f psub<Packet4f>(const Packet4f& a, const Packet4f& b) { return vec_sub(a,b); } 00171 template<> EIGEN_STRONG_INLINE Packet4i psub<Packet4i>(const Packet4i& a, const Packet4i& b) { return vec_sub(a,b); } 00172 00173 template<> EIGEN_STRONG_INLINE Packet4f pnegate(const Packet4f& a) { return psub<Packet4f>(p4f_ZERO, a); } 00174 template<> EIGEN_STRONG_INLINE Packet4i pnegate(const Packet4i& a) { return psub<Packet4i>(p4i_ZERO, a); } 00175 00176 template<> EIGEN_STRONG_INLINE Packet4f pconj(const Packet4f& a) { return a; } 00177 template<> EIGEN_STRONG_INLINE Packet4i pconj(const Packet4i& a) { return a; } 00178 00179 template<> EIGEN_STRONG_INLINE Packet4f pmul<Packet4f>(const Packet4f& a, const Packet4f& b) { return vec_madd(a,b,p4f_ZERO); } 00180 /* Commented out: it's actually slower than processing it scalar 00181 * 00182 template<> EIGEN_STRONG_INLINE Packet4i pmul<Packet4i>(const Packet4i& a, const Packet4i& b) 00183 { 00184 // Detailed in: http://freevec.org/content/32bit_signed_integer_multiplication_altivec 00185 //Set up constants, variables 00186 Packet4i a1, b1, bswap, low_prod, high_prod, prod, prod_, v1sel; 00187 00188 // Get the absolute values 00189 a1 = vec_abs(a); 00190 b1 = vec_abs(b); 00191 00192 // Get the signs using xor 00193 Packet4bi sgn = (Packet4bi) vec_cmplt(vec_xor(a, b), p4i_ZERO); 00194 00195 // Do the multiplication for the asbolute values. 00196 bswap = (Packet4i) vec_rl((Packet4ui) b1, (Packet4ui) p4i_MINUS16 ); 00197 low_prod = vec_mulo((Packet8i) a1, (Packet8i)b1); 00198 high_prod = vec_msum((Packet8i) a1, (Packet8i) bswap, p4i_ZERO); 00199 high_prod = (Packet4i) vec_sl((Packet4ui) high_prod, (Packet4ui) p4i_MINUS16); 00200 prod = vec_add( low_prod, high_prod ); 00201 00202 // NOR the product and select only the negative elements according to the sign mask 00203 prod_ = vec_nor(prod, prod); 00204 prod_ = vec_sel(p4i_ZERO, prod_, sgn); 00205 00206 // Add 1 to the result to get the negative numbers 00207 v1sel = vec_sel(p4i_ZERO, p4i_ONE, sgn); 00208 prod_ = vec_add(prod_, v1sel); 00209 00210 // Merge the results back to the final vector. 00211 prod = vec_sel(prod, prod_, sgn); 00212 00213 return prod; 00214 } 00215 */ 00216 template<> EIGEN_STRONG_INLINE Packet4f pdiv<Packet4f>(const Packet4f& a, const Packet4f& b) 00217 { 00218 Packet4f t, y_0, y_1, res; 00219 00220 // Altivec does not offer a divide instruction, we have to do a reciprocal approximation 00221 y_0 = vec_re(b); 00222 00223 // Do one Newton-Raphson iteration to get the needed accuracy 00224 t = vec_nmsub(y_0, b, p4f_ONE); 00225 y_1 = vec_madd(y_0, t, y_0); 00226 00227 res = vec_madd(a, y_1, p4f_ZERO); 00228 return res; 00229 } 00230 00231 template<> EIGEN_STRONG_INLINE Packet4i pdiv<Packet4i>(const Packet4i& /*a*/, const Packet4i& /*b*/) 00232 { eigen_assert(false && "packet integer division are not supported by AltiVec"); 00233 return pset1<Packet4i>(0); 00234 } 00235 00236 // for some weird raisons, it has to be overloaded for packet of integers 00237 template<> EIGEN_STRONG_INLINE Packet4f pmadd(const Packet4f& a, const Packet4f& b, const Packet4f& c) { return vec_madd(a, b, c); } 00238 template<> EIGEN_STRONG_INLINE Packet4i pmadd(const Packet4i& a, const Packet4i& b, const Packet4i& c) { return padd(pmul(a,b), c); } 00239 00240 template<> EIGEN_STRONG_INLINE Packet4f pmin<Packet4f>(const Packet4f& a, const Packet4f& b) { return vec_min(a, b); } 00241 template<> EIGEN_STRONG_INLINE Packet4i pmin<Packet4i>(const Packet4i& a, const Packet4i& b) { return vec_min(a, b); } 00242 00243 template<> EIGEN_STRONG_INLINE Packet4f pmax<Packet4f>(const Packet4f& a, const Packet4f& b) { return vec_max(a, b); } 00244 template<> EIGEN_STRONG_INLINE Packet4i pmax<Packet4i>(const Packet4i& a, const Packet4i& b) { return vec_max(a, b); } 00245 00246 // Logical Operations are not supported for float, so we have to reinterpret casts using NEON intrinsics 00247 template<> EIGEN_STRONG_INLINE Packet4f pand<Packet4f>(const Packet4f& a, const Packet4f& b) { return vec_and(a, b); } 00248 template<> EIGEN_STRONG_INLINE Packet4i pand<Packet4i>(const Packet4i& a, const Packet4i& b) { return vec_and(a, b); } 00249 00250 template<> EIGEN_STRONG_INLINE Packet4f por<Packet4f>(const Packet4f& a, const Packet4f& b) { return vec_or(a, b); } 00251 template<> EIGEN_STRONG_INLINE Packet4i por<Packet4i>(const Packet4i& a, const Packet4i& b) { return vec_or(a, b); } 00252 00253 template<> EIGEN_STRONG_INLINE Packet4f pxor<Packet4f>(const Packet4f& a, const Packet4f& b) { return vec_xor(a, b); } 00254 template<> EIGEN_STRONG_INLINE Packet4i pxor<Packet4i>(const Packet4i& a, const Packet4i& b) { return vec_xor(a, b); } 00255 00256 template<> EIGEN_STRONG_INLINE Packet4f pandnot<Packet4f>(const Packet4f& a, const Packet4f& b) { return vec_and(a, vec_nor(b, b)); } 00257 template<> EIGEN_STRONG_INLINE Packet4i pandnot<Packet4i>(const Packet4i& a, const Packet4i& b) { return vec_and(a, vec_nor(b, b)); } 00258 00259 template<> EIGEN_STRONG_INLINE Packet4f pload<Packet4f>(const float* from) { EIGEN_DEBUG_ALIGNED_LOAD return vec_ld(0, from); } 00260 template<> EIGEN_STRONG_INLINE Packet4i pload<Packet4i>(const int* from) { EIGEN_DEBUG_ALIGNED_LOAD return vec_ld(0, from); } 00261 00262 template<> EIGEN_STRONG_INLINE Packet4f ploadu<Packet4f>(const float* from) 00263 { 00264 EIGEN_DEBUG_ALIGNED_LOAD 00265 // Taken from http://developer.apple.com/hardwaredrivers/ve/alignment.html 00266 Packet16uc MSQ, LSQ; 00267 Packet16uc mask; 00268 MSQ = vec_ld(0, (unsigned char *)from); // most significant quadword 00269 LSQ = vec_ld(15, (unsigned char *)from); // least significant quadword 00270 mask = vec_lvsl(0, from); // create the permute mask 00271 return (Packet4f) vec_perm(MSQ, LSQ, mask); // align the data 00272 00273 } 00274 template<> EIGEN_STRONG_INLINE Packet4i ploadu<Packet4i>(const int* from) 00275 { 00276 EIGEN_DEBUG_ALIGNED_LOAD 00277 // Taken from http://developer.apple.com/hardwaredrivers/ve/alignment.html 00278 Packet16uc MSQ, LSQ; 00279 Packet16uc mask; 00280 MSQ = vec_ld(0, (unsigned char *)from); // most significant quadword 00281 LSQ = vec_ld(15, (unsigned char *)from); // least significant quadword 00282 mask = vec_lvsl(0, from); // create the permute mask 00283 return (Packet4i) vec_perm(MSQ, LSQ, mask); // align the data 00284 } 00285 00286 template<> EIGEN_STRONG_INLINE Packet4f ploaddup<Packet4f>(const float* from) 00287 { 00288 Packet4f p; 00289 if((ptrdiff_t(&from) % 16) == 0) p = pload<Packet4f>(from); 00290 else p = ploadu<Packet4f>(from); 00291 return vec_perm(p, p, p16uc_DUPLICATE); 00292 } 00293 template<> EIGEN_STRONG_INLINE Packet4i ploaddup<Packet4i>(const int* from) 00294 { 00295 Packet4i p; 00296 if((ptrdiff_t(&from) % 16) == 0) p = pload<Packet4i>(from); 00297 else p = ploadu<Packet4i>(from); 00298 return vec_perm(p, p, p16uc_DUPLICATE); 00299 } 00300 00301 template<> EIGEN_STRONG_INLINE void pstore<float>(float* to, const Packet4f& from) { EIGEN_DEBUG_ALIGNED_STORE vec_st(from, 0, to); } 00302 template<> EIGEN_STRONG_INLINE void pstore<int>(int* to, const Packet4i& from) { EIGEN_DEBUG_ALIGNED_STORE vec_st(from, 0, to); } 00303 00304 template<> EIGEN_STRONG_INLINE void pstoreu<float>(float* to, const Packet4f& from) 00305 { 00306 EIGEN_DEBUG_UNALIGNED_STORE 00307 // Taken from http://developer.apple.com/hardwaredrivers/ve/alignment.html 00308 // Warning: not thread safe! 00309 Packet16uc MSQ, LSQ, edges; 00310 Packet16uc edgeAlign, align; 00311 00312 MSQ = vec_ld(0, (unsigned char *)to); // most significant quadword 00313 LSQ = vec_ld(15, (unsigned char *)to); // least significant quadword 00314 edgeAlign = vec_lvsl(0, to); // permute map to extract edges 00315 edges=vec_perm(LSQ,MSQ,edgeAlign); // extract the edges 00316 align = vec_lvsr( 0, to ); // permute map to misalign data 00317 MSQ = vec_perm(edges,(Packet16uc)from,align); // misalign the data (MSQ) 00318 LSQ = vec_perm((Packet16uc)from,edges,align); // misalign the data (LSQ) 00319 vec_st( LSQ, 15, (unsigned char *)to ); // Store the LSQ part first 00320 vec_st( MSQ, 0, (unsigned char *)to ); // Store the MSQ part 00321 } 00322 template<> EIGEN_STRONG_INLINE void pstoreu<int>(int* to, const Packet4i& from) 00323 { 00324 EIGEN_DEBUG_UNALIGNED_STORE 00325 // Taken from http://developer.apple.com/hardwaredrivers/ve/alignment.html 00326 // Warning: not thread safe! 00327 Packet16uc MSQ, LSQ, edges; 00328 Packet16uc edgeAlign, align; 00329 00330 MSQ = vec_ld(0, (unsigned char *)to); // most significant quadword 00331 LSQ = vec_ld(15, (unsigned char *)to); // least significant quadword 00332 edgeAlign = vec_lvsl(0, to); // permute map to extract edges 00333 edges=vec_perm(LSQ, MSQ, edgeAlign); // extract the edges 00334 align = vec_lvsr( 0, to ); // permute map to misalign data 00335 MSQ = vec_perm(edges, (Packet16uc) from, align); // misalign the data (MSQ) 00336 LSQ = vec_perm((Packet16uc) from, edges, align); // misalign the data (LSQ) 00337 vec_st( LSQ, 15, (unsigned char *)to ); // Store the LSQ part first 00338 vec_st( MSQ, 0, (unsigned char *)to ); // Store the MSQ part 00339 } 00340 00341 template<> EIGEN_STRONG_INLINE void prefetch<float>(const float* addr) { vec_dstt(addr, DST_CTRL(2,2,32), DST_CHAN); } 00342 template<> EIGEN_STRONG_INLINE void prefetch<int>(const int* addr) { vec_dstt(addr, DST_CTRL(2,2,32), DST_CHAN); } 00343 00344 template<> EIGEN_STRONG_INLINE float pfirst<Packet4f>(const Packet4f& a) { float EIGEN_ALIGN16 x[4]; vec_st(a, 0, x); return x[0]; } 00345 template<> EIGEN_STRONG_INLINE int pfirst<Packet4i>(const Packet4i& a) { int EIGEN_ALIGN16 x[4]; vec_st(a, 0, x); return x[0]; } 00346 00347 template<> EIGEN_STRONG_INLINE Packet4f preverse(const Packet4f& a) { return (Packet4f)vec_perm((Packet16uc)a,(Packet16uc)a, p16uc_REVERSE); } 00348 template<> EIGEN_STRONG_INLINE Packet4i preverse(const Packet4i& a) { return (Packet4i)vec_perm((Packet16uc)a,(Packet16uc)a, p16uc_REVERSE); } 00349 00350 template<> EIGEN_STRONG_INLINE Packet4f pabs(const Packet4f& a) { return vec_abs(a); } 00351 template<> EIGEN_STRONG_INLINE Packet4i pabs(const Packet4i& a) { return vec_abs(a); } 00352 00353 template<> EIGEN_STRONG_INLINE float predux<Packet4f>(const Packet4f& a) 00354 { 00355 Packet4f b, sum; 00356 b = (Packet4f) vec_sld(a, a, 8); 00357 sum = vec_add(a, b); 00358 b = (Packet4f) vec_sld(sum, sum, 4); 00359 sum = vec_add(sum, b); 00360 return pfirst(sum); 00361 } 00362 00363 template<> EIGEN_STRONG_INLINE Packet4f preduxp<Packet4f>(const Packet4f* vecs) 00364 { 00365 Packet4f v[4], sum[4]; 00366 00367 // It's easier and faster to transpose then add as columns 00368 // Check: http://www.freevec.org/function/matrix_4x4_transpose_floats for explanation 00369 // Do the transpose, first set of moves 00370 v[0] = vec_mergeh(vecs[0], vecs[2]); 00371 v[1] = vec_mergel(vecs[0], vecs[2]); 00372 v[2] = vec_mergeh(vecs[1], vecs[3]); 00373 v[3] = vec_mergel(vecs[1], vecs[3]); 00374 // Get the resulting vectors 00375 sum[0] = vec_mergeh(v[0], v[2]); 00376 sum[1] = vec_mergel(v[0], v[2]); 00377 sum[2] = vec_mergeh(v[1], v[3]); 00378 sum[3] = vec_mergel(v[1], v[3]); 00379 00380 // Now do the summation: 00381 // Lines 0+1 00382 sum[0] = vec_add(sum[0], sum[1]); 00383 // Lines 2+3 00384 sum[1] = vec_add(sum[2], sum[3]); 00385 // Add the results 00386 sum[0] = vec_add(sum[0], sum[1]); 00387 00388 return sum[0]; 00389 } 00390 00391 template<> EIGEN_STRONG_INLINE int predux<Packet4i>(const Packet4i& a) 00392 { 00393 Packet4i sum; 00394 sum = vec_sums(a, p4i_ZERO); 00395 sum = vec_sld(sum, p4i_ZERO, 12); 00396 return pfirst(sum); 00397 } 00398 00399 template<> EIGEN_STRONG_INLINE Packet4i preduxp<Packet4i>(const Packet4i* vecs) 00400 { 00401 Packet4i v[4], sum[4]; 00402 00403 // It's easier and faster to transpose then add as columns 00404 // Check: http://www.freevec.org/function/matrix_4x4_transpose_floats for explanation 00405 // Do the transpose, first set of moves 00406 v[0] = vec_mergeh(vecs[0], vecs[2]); 00407 v[1] = vec_mergel(vecs[0], vecs[2]); 00408 v[2] = vec_mergeh(vecs[1], vecs[3]); 00409 v[3] = vec_mergel(vecs[1], vecs[3]); 00410 // Get the resulting vectors 00411 sum[0] = vec_mergeh(v[0], v[2]); 00412 sum[1] = vec_mergel(v[0], v[2]); 00413 sum[2] = vec_mergeh(v[1], v[3]); 00414 sum[3] = vec_mergel(v[1], v[3]); 00415 00416 // Now do the summation: 00417 // Lines 0+1 00418 sum[0] = vec_add(sum[0], sum[1]); 00419 // Lines 2+3 00420 sum[1] = vec_add(sum[2], sum[3]); 00421 // Add the results 00422 sum[0] = vec_add(sum[0], sum[1]); 00423 00424 return sum[0]; 00425 } 00426 00427 // Other reduction functions: 00428 // mul 00429 template<> EIGEN_STRONG_INLINE float predux_mul<Packet4f>(const Packet4f& a) 00430 { 00431 Packet4f prod; 00432 prod = pmul(a, (Packet4f)vec_sld(a, a, 8)); 00433 return pfirst(pmul(prod, (Packet4f)vec_sld(prod, prod, 4))); 00434 } 00435 00436 template<> EIGEN_STRONG_INLINE int predux_mul<Packet4i>(const Packet4i& a) 00437 { 00438 EIGEN_ALIGN16 int aux[4]; 00439 pstore(aux, a); 00440 return aux[0] * aux[1] * aux[2] * aux[3]; 00441 } 00442 00443 // min 00444 template<> EIGEN_STRONG_INLINE float predux_min<Packet4f>(const Packet4f& a) 00445 { 00446 Packet4f b, res; 00447 b = vec_min(a, vec_sld(a, a, 8)); 00448 res = vec_min(b, vec_sld(b, b, 4)); 00449 return pfirst(res); 00450 } 00451 00452 template<> EIGEN_STRONG_INLINE int predux_min<Packet4i>(const Packet4i& a) 00453 { 00454 Packet4i b, res; 00455 b = vec_min(a, vec_sld(a, a, 8)); 00456 res = vec_min(b, vec_sld(b, b, 4)); 00457 return pfirst(res); 00458 } 00459 00460 // max 00461 template<> EIGEN_STRONG_INLINE float predux_max<Packet4f>(const Packet4f& a) 00462 { 00463 Packet4f b, res; 00464 b = vec_max(a, vec_sld(a, a, 8)); 00465 res = vec_max(b, vec_sld(b, b, 4)); 00466 return pfirst(res); 00467 } 00468 00469 template<> EIGEN_STRONG_INLINE int predux_max<Packet4i>(const Packet4i& a) 00470 { 00471 Packet4i b, res; 00472 b = vec_max(a, vec_sld(a, a, 8)); 00473 res = vec_max(b, vec_sld(b, b, 4)); 00474 return pfirst(res); 00475 } 00476 00477 template<int Offset> 00478 struct palign_impl<Offset,Packet4f> 00479 { 00480 static EIGEN_STRONG_INLINE void run(Packet4f& first, const Packet4f& second) 00481 { 00482 if (Offset!=0) 00483 first = vec_sld(first, second, Offset*4); 00484 } 00485 }; 00486 00487 template<int Offset> 00488 struct palign_impl<Offset,Packet4i> 00489 { 00490 static EIGEN_STRONG_INLINE void run(Packet4i& first, const Packet4i& second) 00491 { 00492 if (Offset!=0) 00493 first = vec_sld(first, second, Offset*4); 00494 } 00495 }; 00496 00497 } // end namespace internal 00498 00499 } // end namespace Eigen 00500 00501 #endif // EIGEN_PACKET_MATH_ALTIVEC_H