DOSBox-X
|
00001 /* Copyright (C) 2003, 2004, 2005, 2006, 2008, 2009 Dean Beeler, Jerome Fisher 00002 * Copyright (C) 2011 Dean Beeler, Jerome Fisher, Sergey V. Mikayev 00003 * 00004 * This program is free software: you can redistribute it and/or modify 00005 * it under the terms of the GNU Lesser General Public License as published by 00006 * the Free Software Foundation, either version 2.1 of the License, or 00007 * (at your option) any later version. 00008 * 00009 * This program is distributed in the hope that it will be useful, 00010 * but WITHOUT ANY WARRANTY; without even the implied warranty of 00011 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 00012 * GNU Lesser General Public License for more details. 00013 * 00014 * You should have received a copy of the GNU Lesser General Public License 00015 * along with this program. If not, see <http://www.gnu.org/licenses/>. 00016 */ 00017 00018 #ifndef MT32EMU_MMATH_H 00019 #define MT32EMU_MMATH_H 00020 00021 #if !defined (__SSE2__) || (defined (_M_IX86_FP) && (_M_IX86_FP) < 2) 00022 #include <cmath> 00023 namespace MT32Emu { 00024 // Mathematical constants 00025 const double DOUBLE_PI = 3.141592653589793; 00026 const double DOUBLE_LN_10 = 2.302585092994046; 00027 const float FLOAT_PI = 3.1415927f; 00028 const float FLOAT_2PI = 6.2831853f; 00029 const float FLOAT_LN_2 = 0.6931472f; 00030 const float FLOAT_LN_10 = 2.3025851f; 00031 static inline float POWF(float x, float y) { 00032 return pow(x, y); 00033 } 00034 static inline float EXPF(float x) { 00035 return exp(x); 00036 } 00037 static inline float EXP2F(float x) { 00038 #ifdef __APPLE__ 00039 // on OSX exp2f() is 1.59 times faster than "exp() and the multiplication with FLOAT_LN_2" 00040 return exp2f(x); 00041 #else 00042 return exp(FLOAT_LN_2 * x); 00043 #endif 00044 } 00045 static inline float EXP10F(float x) { 00046 return exp(FLOAT_LN_10 * x); 00047 } 00048 static inline float LOGF(float x) { 00049 return log(x); 00050 } 00051 static inline float LOG2F(float x) { 00052 return log(x) / FLOAT_LN_2; 00053 } 00054 static inline float LOG10F(float x) { 00055 return log10(x); 00056 } 00057 } // namespace MT32Emu 00058 00059 #else 00060 #define FIXEDPOINT_UDIV(x, y, point) (((x) << (point)) / ((y))) 00061 #define FIXEDPOINT_SDIV(x, y, point) (((x) * (1 << point)) / ((y))) 00062 #define FIXEDPOINT_UMULT(x, y, point) (((x) * (y)) >> point) 00063 #define FIXEDPOINT_SMULT(x, y, point) (((x) * (y)) / (1 << point)) 00064 00065 #define FIXEDPOINT_MAKE(x, point) ((Bit32u)((1 << point) * x)) 00066 00067 #if defined(_MSC_VER) 00068 # pragma warning(disable:4244) /* const fmath::local::uint64_t to double possible loss of data */ 00069 #endif 00070 00071 // added by ykhwong (start) 00072 #pragma once 00073 00083 /* 00084 function prototype list 00085 00086 float fmath::exp(float); 00087 float fmath::log(float); 00088 00089 __m128 fmath::exp_ps(__m128); 00090 __m128 fmath::log_ps(__m128); 00091 00092 if FMATH_USE_XBYAK is defined then Xbyak version are used 00093 */ 00094 //#define FMATH_USE_XBYAK 00095 00096 #include <math.h> 00097 #include <stddef.h> 00098 #include <assert.h> 00099 #include <limits> 00100 #include <stdlib.h> 00101 #include <float.h> 00102 #if defined(_WIN32) && !defined(__GNUC__) 00103 #include <intrin.h> 00104 #ifndef MIE_ALIGN 00105 #define MIE_ALIGN(x) __declspec(align(x)) 00106 #endif 00107 #else 00108 #ifndef __GNUC_PREREQ 00109 #define __GNUC_PREREQ(major, minor) ((((__GNUC__) << 16) + (__GNUC_MINOR__)) >= (((major) << 16) + (minor))) 00110 #endif 00111 #if __GNUC_PREREQ(4, 4) || !defined(__GNUC__) 00112 /* GCC >= 4.4 and non-GCC compilers */ 00113 #include <x86intrin.h> 00114 #elif __GNUC_PREREQ(4, 1) 00115 /* GCC 4.1, 4.2, and 4.3 do not have x86intrin.h, directly include SSE2 header */ 00116 #include <emmintrin.h> 00117 #endif 00118 #ifndef MIE_ALIGN 00119 #define MIE_ALIGN(x) __attribute__((aligned(x))) 00120 #endif 00121 #endif 00122 #ifndef MIE_PACK 00123 #define MIE_PACK(x, y, z, w) ((x) * 64 + (y) * 16 + (z) * 4 + (w)) 00124 #endif 00125 #ifdef FMATH_USE_XBYAK 00126 #include "xbyak/xbyak.h" 00127 #include "xbyak/xbyak_util.h" 00128 #endif 00129 00130 #if 1//#ifdef DEBUG 00131 inline void put(const void *p) 00132 { 00133 const float *f = (const float*)p; 00134 printf("{%e, %e, %e, %e}\n", f[0], f[1], f[2], f[3]); 00135 } 00136 inline void puti(const void *p) 00137 { 00138 const unsigned int *i = (const unsigned int *)p; 00139 printf("{%d, %d, %d, %d}\n", i[0], i[1], i[2], i[3]); 00140 printf("{%x, %x, %x, %x}\n", i[0], i[1], i[2], i[3]); 00141 } 00142 #endif 00143 00144 namespace fmath { 00145 00146 namespace local { 00147 00148 const size_t EXP_TABLE_SIZE = 10; 00149 const size_t EXPD_TABLE_SIZE = 11; 00150 const size_t LOG_TABLE_SIZE = 12; 00151 00152 typedef unsigned long long uint64_t; 00153 00154 union fi { 00155 float f; 00156 unsigned int i; 00157 }; 00158 00159 union di { 00160 double d; 00161 uint64_t i; 00162 }; 00163 00164 inline unsigned int mask(int x) 00165 { 00166 return (1U << x) - 1; 00167 } 00168 00169 inline uint64_t mask64(int x) 00170 { 00171 return (1ULL << x) - 1; 00172 } 00173 00174 template<class T> 00175 inline const T* cast_to(const void *p) 00176 { 00177 return reinterpret_cast<const T*>(p); 00178 } 00179 00180 template<class T, size_t N> 00181 size_t NumOfArray(const T (&)[N]) { return N; } 00182 00183 /* 00184 exp(88.722839f) = inf ; 0x42b17218 00185 exp(-87.33655f) = 1.175491e-038f(007fffe6) denormal ; 0xc2aeac50 00186 exp(-103.972081f) = 0 ; 0xc2cff1b5 00187 */ 00188 template<size_t N = EXP_TABLE_SIZE> 00189 struct ExpVar { 00190 enum { 00191 s = N, 00192 n = 1 << s, 00193 f88 = 0x42b00000 /* 88.0 */ 00194 }; 00195 float minX[4]; 00196 float maxX[4]; 00197 float a[4]; 00198 float b[4]; 00199 float f1[4]; 00200 unsigned int i127s[4]; 00201 unsigned int mask_s[4]; 00202 unsigned int i7fffffff[4]; 00203 unsigned int tbl[n]; 00204 ExpVar() 00205 { 00206 float log_2 = ::logf(2.0f); 00207 for (int i = 0; i < 4; i++) { 00208 maxX[i] = 88; 00209 minX[i] = -88; 00210 a[i] = n / log_2; 00211 b[i] = log_2 / n; 00212 f1[i] = 1.0f; 00213 i127s[i] = 127 << s; 00214 i7fffffff[i] = 0x7fffffff; 00215 mask_s[i] = mask(s); 00216 } 00217 00218 for (int i = 0; i < n; i++) { 00219 float y = pow(2.0f, (float)i / n); 00220 fi fi; 00221 fi.f = y; 00222 tbl[i] = fi.i & mask(23); 00223 } 00224 } 00225 }; 00226 00227 template<size_t sbit_ = EXPD_TABLE_SIZE> 00228 struct ExpdVar { 00229 enum { 00230 sbit = sbit_, 00231 s = 1UL << sbit, 00232 adj = (1UL << (sbit + 10)) - (1UL << sbit) 00233 }; 00234 // A = 1, B = 1, C = 1/2, D = 1/6 00235 double C1[2]; // A 00236 double C2[2]; // D 00237 double C3[2]; // C/D 00238 uint64_t tbl[s]; 00239 const double a; 00240 const double ra; 00241 ExpdVar() 00242 : a(s / ::log(2.0)) 00243 , ra(1 / a) 00244 { 00245 for (int i = 0; i < 2; i++) { 00246 #if 0 00247 C1[i] = 1.0; 00248 C2[i] = 0.16667794882310216; 00249 C3[i] = 2.9997969303278795; 00250 #else 00251 C1[i] = 1.0; 00252 C2[i] = 0.16666666685227835064; 00253 C3[i] = 3.0000000027955394; 00254 #endif 00255 } 00256 for (int i = 0; i < s; i++) { 00257 di di; 00258 di.d = ::pow(2.0, i * (1.0 / s)); 00259 tbl[i] = di.i & mask64(52); 00260 } 00261 } 00262 }; 00263 00264 template<size_t N = LOG_TABLE_SIZE> 00265 struct LogVar { 00266 enum { 00267 LEN = N - 1 00268 }; 00269 unsigned int m1[4]; // 0 00270 unsigned int m2[4]; // 16 00271 unsigned int m3[4]; // 32 00272 float m4[4]; // 48 00273 unsigned int m5[4]; // 64 00274 struct { 00275 float app; 00276 float rev; 00277 } tbl[1 << LEN]; 00278 float c_log2; 00279 LogVar() 00280 : c_log2(::logf(2.0f) / (1 << 23)) 00281 { 00282 const double e = 1 / double(1 << 24); 00283 const double h = 1 / double(1 << LEN); 00284 const size_t n = 1U << LEN; 00285 for (size_t i = 0; i < n; i++) { 00286 double x = 1 + double(i) / n; 00287 double a = ::log(x); 00288 tbl[i].app = (float)a; 00289 if (i < n - 1) { 00290 double b = ::log(x + h - e); 00291 tbl[i].rev = (float)((b - a) / ((h - e) * (1 << 23))); 00292 } else { 00293 tbl[i].rev = (float)(1 / (x * (1 << 23))); 00294 } 00295 } 00296 for (int i = 0; i < 4; i++) { 00297 m1[i] = mask(8) << 23; 00298 m2[i] = mask(LEN) << (23 - LEN); 00299 m3[i] = mask(23 - LEN); 00300 m4[i] = c_log2; 00301 m5[i] = 127U << 23; 00302 } 00303 } 00304 }; 00305 00306 #ifdef FMATH_USE_XBYAK 00307 struct ExpCode : public Xbyak::CodeGenerator { 00308 float (*exp_)(float); 00309 __m128 (*exp_ps_)(__m128); 00310 template<size_t N> 00311 ExpCode(const ExpVar<N> *self) 00312 { 00313 Xbyak::util::Cpu cpu; 00314 try { 00315 makeExp(self, cpu); 00316 exp_ = (float(*)(float))getCode(); 00317 align(16); 00318 exp_ps_ = (__m128(*)(__m128))getCurr(); 00319 makeExpPs(self, cpu); 00320 return; 00321 } catch (Xbyak::Error err) { 00322 LOG_MSG( "ExpCode ERR:%s(%d)\n", Xbyak::ConvertErrorToString(err), err); 00323 } catch (...) { 00324 LOG_MSG( "ExpCode ERR:unknown error\n"); 00325 } 00326 ::exit(1); 00327 } 00328 template<size_t N> 00329 void makeExp(const ExpVar<N> *self, const Xbyak::util::Cpu& /*cpu*/) 00330 { 00331 typedef ExpVar<N> Self; 00332 using namespace local; 00333 using namespace Xbyak; 00334 00335 inLocalLabel(); 00336 #ifdef XBYAK64 00337 const Reg64& base = rcx; 00338 const Reg64& a = rax; 00339 #else 00340 const Reg32& base = ecx; 00341 const Reg32& a = eax; 00342 #endif 00343 00344 mov(base, (size_t)self); 00345 00346 #ifdef XBYAK32 00347 movss(xm0, ptr [esp + 4]); 00348 #endif 00349 L(".retry"); 00350 movaps(xm1, xm0); 00351 movd(edx, xm0); 00352 mulss(xm1, ptr [base + offsetof(Self, a)]); // t 00353 and(edx, 0x7fffffff); 00354 cvtss2si(eax, xm1); 00355 cmp(edx, ExpVar<N>::f88); 00356 jg(".overflow"); 00357 lea(edx, ptr [eax + (127 << self->s)]); 00358 cvtsi2ss(xm1, eax); 00359 and(eax, mask(self->s)); // v 00360 mov(eax, ptr [base + a * 4 + offsetof(Self, tbl)]); // expVar.tbl[v] 00361 shr(edx, self->s); 00362 mulss(xm1, ptr [base + offsetof(Self, b)]); 00363 shl(edx, 23); // u 00364 subss(xm0, xm1); // t 00365 or(eax, edx); // fi.f 00366 addss(xm0, ptr [base + offsetof(Self, f1)]); 00367 movd(xm1, eax); 00368 mulss(xm0, xm1); 00369 #ifdef XBYAK32 00370 movss(ptr[esp + 4], xm0); 00371 fld(dword[esp + 4]); 00372 #endif 00373 ret(); 00374 L(".overflow"); 00375 minss(xm0, ptr [base + offsetof(Self, maxX)]); 00376 maxss(xm0, ptr [base + offsetof(Self, minX)]); 00377 jmp(".retry"); 00378 outLocalLabel(); 00379 } 00380 template<size_t N> 00381 void makeExpPs(const ExpVar<N> *self, const Xbyak::util::Cpu& cpu) 00382 { 00383 typedef ExpVar<N> Self; 00384 using namespace local; 00385 using namespace Xbyak; 00386 00387 inLocalLabel(); 00388 #ifdef XBYAK64 00389 const Reg64& base = rcx; 00390 const Reg64& a = rax; 00391 const Reg64& d = rdx; 00392 #else 00393 const Reg32& base = ecx; 00394 const Reg32& a = eax; 00395 const Reg32& d = edx; 00396 #endif 00397 00398 /* 00399 if abs(x) >= maxX then x = max(min(x, maxX), -maxX) and try 00400 minps, maxps are very slow then avoid them 00401 */ 00402 const bool useSSE41 = cpu.has(Xbyak::util::Cpu::tSSE41); 00403 #if defined(XBYAK64_WIN) && !defined(__INTEL_COMPILER) 00404 movaps(xm0, ptr [rcx]); 00405 #endif 00406 mov(base, (size_t)self); 00407 L(".retry"); 00408 movaps(xm5, xm0); 00409 andps(xm5, ptr [base + offsetof(Self, i7fffffff)]); 00410 movaps(xm3, ptr [base + offsetof(Self, a)]); 00411 movaps(xm4, ptr [base + offsetof(Self, b)]); 00412 pcmpgtd(xm5, ptr [base + offsetof(Self, maxX)]); 00413 mulps(xm3, xm0); 00414 movaps(xm1, ptr [base + offsetof(Self, i127s)]); 00415 pmovmskb(eax, xm5); 00416 movaps(xm5, ptr [base + offsetof(Self, mask_s)]); 00417 cvtps2dq(xm2, xm3); 00418 pand(xm5, xm2); 00419 cvtdq2ps(xm3, xm2); 00420 test(eax, eax); 00421 jnz(".overflow"); 00422 paddd(xm1, xm2); 00423 movd(eax, xm5); 00424 mulps(xm4, xm3); 00425 pextrw(edx, xm5, 2); 00426 subps(xm0, xm4); 00427 movd(xm4, ptr [base + a * 4 + offsetof(Self, tbl)]); 00428 addps(xm0, ptr [base + offsetof(Self, f1)]); 00429 pextrw(eax, xm5, 4); 00430 if (useSSE41) { 00431 pinsrd(xm4, ptr [base + d * 4 + offsetof(Self, tbl)], 1); 00432 } else { 00433 movd(xm3, ptr [base + d * 4 + offsetof(Self, tbl)]); 00434 movlhps(xm4, xm3); 00435 } 00436 pextrw(edx, xm5, 6); 00437 psrld(xm1, self->s); 00438 pslld(xm1, 23); 00439 if (useSSE41) { 00440 pinsrd(xm4, ptr [base + a * 4 + offsetof(Self, tbl)], 2); 00441 pinsrd(xm4, ptr [base + d * 4 + offsetof(Self, tbl)], 3); 00442 } else { 00443 movd(xm2, ptr [base + a * 4 + offsetof(Self, tbl)]); 00444 movd(xm3, ptr [base + d * 4 + offsetof(Self, tbl)]); 00445 movlhps(xm2, xm3); 00446 shufps(xm4, xm2, MIE_PACK(2, 0, 2, 0)); 00447 } 00448 por(xm1, xm4); 00449 mulps(xm0, xm1); 00450 ret(); 00451 L(".overflow"); 00452 minps(xm0, ptr [base + offsetof(Self, maxX)]); 00453 maxps(xm0, ptr [base + offsetof(Self, minX)]); 00454 jmp(".retry"); 00455 outLocalLabel(); 00456 } 00457 }; 00458 #endif 00459 00460 /* to define static variables in fmath.hpp */ 00461 template<size_t EXP_N = EXP_TABLE_SIZE, size_t LOG_N = LOG_TABLE_SIZE, size_t EXPD_N = EXPD_TABLE_SIZE> 00462 struct C { 00463 static const ExpVar<EXP_N> expVar; 00464 static const LogVar<LOG_N> logVar; 00465 static const ExpdVar<EXPD_N> expdVar; 00466 #ifdef FMATH_USE_XBYAK 00467 static const ExpCode& getInstance() { 00468 static const ExpCode expCode(&expVar); 00469 return expCode; 00470 } 00471 #endif 00472 }; 00473 00474 template<size_t EXP_N, size_t LOG_N, size_t EXPD_N> 00475 MIE_ALIGN(16) const ExpVar<EXP_N> C<EXP_N, LOG_N, EXPD_N>::expVar; 00476 00477 template<size_t EXP_N, size_t LOG_N, size_t EXPD_N> 00478 MIE_ALIGN(16) const LogVar<LOG_N> C<EXP_N, LOG_N, EXPD_N>::logVar; 00479 00480 template<size_t EXP_N, size_t LOG_N, size_t EXPD_N> 00481 MIE_ALIGN(16) const ExpdVar<EXPD_N> C<EXP_N, LOG_N, EXPD_N>::expdVar; 00482 00483 } // fmath::local 00484 00485 #ifdef FMATH_USE_XBYAK 00486 inline float expC(float x) 00487 #else 00488 inline float exp(float x) 00489 #endif 00490 { 00491 using namespace local; 00492 const ExpVar<>& expVar = C<>::expVar; 00493 00494 #if 1 00495 __m128 x1 = _mm_set_ss(x); 00496 00497 int limit = _mm_cvtss_si32(x1) & 0x7fffffff; 00498 if (limit > ExpVar<>::f88) { 00499 x1 = _mm_min_ss(x1, _mm_load_ss(expVar.maxX)); 00500 x1 = _mm_max_ss(x1, _mm_load_ss(expVar.minX)); 00501 } 00502 00503 int r = _mm_cvtss_si32(_mm_mul_ss(x1, _mm_load_ss(expVar.a))); 00504 unsigned int v = r & mask(expVar.s); 00505 float t = _mm_cvtss_f32(x1) - r * expVar.b[0]; 00506 int u = r >> expVar.s; 00507 fi fi; 00508 fi.i = ((u + 127) << 23) | expVar.tbl[v]; 00509 return (1 + t) * fi.f; 00510 #else 00511 x = std::min(x, expVar.maxX[0]); 00512 x = std::max(x, expVar.minX[0]); 00513 float t = x * expVar.a[0]; 00514 const float magic = (1 << 23) + (1 << 22); // to round 00515 t += magic; 00516 fi fi; 00517 fi.f = t; 00518 t = x - (t - magic) * expVar.b[0]; 00519 int u = ((fi.i + (127 << expVar.s)) >> expVar.s) << 23; 00520 unsigned int v = fi.i & mask(expVar.s); 00521 fi.i = u | expVar.tbl[v]; 00522 return (1 + t) * fi.f; 00523 // return (1 + t) * pow(2, (float)u) * pow(2, (float)v / n); 00524 #endif 00525 } 00526 00527 /* 00528 remark : -ffast-math option of gcc may generate bad code for fmath::expd 00529 */ 00530 inline double expd(double x) 00531 { 00532 if (x <= -708.39641853226408) return 0; 00533 if (x >= 709.78271289338397) return std::numeric_limits<double>::infinity(); 00534 using namespace local; 00535 const ExpdVar<>& c = C<>::expdVar; 00536 const uint64_t b = 3ULL << 51; 00537 di di; 00538 di.d = x * c.a + b; 00539 uint64_t iax = c.tbl[di.i & mask(c.sbit)]; 00540 00541 double t = (di.d - b) * c.ra - x; 00542 uint64_t u = ((di.i + c.adj) >> c.sbit) << 52; 00543 double y = (c.C3[0] - t) * (t * t) * c.C2[0] - t + c.C1[0]; 00544 // double y = (2.999796930327879362111743 - t) * (t * t) * 0.166677948823102161853172 - t + 1.000000000000000000488181; 00545 00546 di.i = u | iax; 00547 return y * di.d; 00548 } 00549 00550 inline void expd_v(double *px, int n) 00551 { 00552 using namespace local; 00553 const ExpdVar<>& c = C<>::expdVar; 00554 const uint64_t b = 3ULL << 51; 00555 assert((n % 2) == 0); 00556 const __m128d mC1 = *cast_to<__m128d>(c.C1); 00557 const __m128d mC2 = *cast_to<__m128d>(c.C2); 00558 const __m128d mC3 = *cast_to<__m128d>(c.C3); 00559 const __m128d ma = _mm_set1_pd(c.a); 00560 const __m128d mra = _mm_set1_pd(c.ra); 00561 const __m128i madj = _mm_set1_epi32(c.adj); 00562 MIE_ALIGN(16) const double expMax[2] = { 709.78271289338397, 709.78271289338397 }; 00563 MIE_ALIGN(16) const double expMin[2] = { -708.39641853226408, -708.39641853226408 }; 00564 for (unsigned int i = 0; i < (unsigned int)n; i += 2) { 00565 __m128d x = _mm_load_pd(px); 00566 x = _mm_min_pd(x, *(const __m128d*)expMax); 00567 x = _mm_max_pd(x, *(const __m128d*)expMin); 00568 00569 __m128d d = _mm_mul_pd(x, ma); 00570 d = _mm_add_pd(d, _mm_set1_pd(b)); 00571 int adr0 = _mm_cvtsi128_si32(_mm_castpd_si128(d)) & mask(c.sbit); 00572 int adr1 = _mm_cvtsi128_si32(_mm_srli_si128(_mm_castpd_si128(d), 8)) & mask(c.sbit); 00573 00574 __m128i iaxL = _mm_castpd_si128(_mm_load_sd((const double*)&c.tbl[adr0])); 00575 __m128i iax = _mm_castpd_si128(_mm_load_sd((const double*)&c.tbl[adr1])); 00576 iax = _mm_unpacklo_epi64(iaxL, iax); 00577 00578 __m128d t = _mm_sub_pd(_mm_mul_pd(_mm_sub_pd(d, _mm_set1_pd(b)), mra), x); 00579 __m128i u = _mm_castpd_si128(d); 00580 u = _mm_add_epi64(u, madj); 00581 u = _mm_srli_epi64(u, c.sbit); 00582 u = _mm_slli_epi64(u, 52); 00583 u = _mm_or_si128(u, iax); 00584 __m128d y = _mm_mul_pd(_mm_sub_pd(mC3, t), _mm_mul_pd(t, t)); 00585 y = _mm_mul_pd(y, mC2); 00586 y = _mm_add_pd(_mm_sub_pd(y, t), mC1); 00587 _mm_store_pd(px, _mm_mul_pd(y, _mm_castsi128_pd(u))); 00588 px += 2; 00589 } 00590 } 00591 00592 #ifdef FMATH_USE_XBYAK 00593 inline __m128 exp_psC(__m128 x) 00594 #else 00595 inline __m128 exp_ps(__m128 x) 00596 #endif 00597 { 00598 using namespace local; 00599 const ExpVar<>& expVar = C<>::expVar; 00600 00601 __m128i limit = _mm_castps_si128(_mm_and_ps(x, *cast_to<__m128>(expVar.i7fffffff))); 00602 int over = _mm_movemask_epi8(_mm_cmpgt_epi32(limit, *cast_to<__m128i>(expVar.maxX))); 00603 if (over) { 00604 x = _mm_min_ps(x, _mm_load_ps(expVar.maxX)); 00605 x = _mm_max_ps(x, _mm_load_ps(expVar.minX)); 00606 } 00607 00608 __m128i r = _mm_cvtps_epi32(_mm_mul_ps(x, *cast_to<__m128>(expVar.a))); 00609 __m128 t = _mm_sub_ps(x, _mm_mul_ps(_mm_cvtepi32_ps(r), *cast_to<__m128>(expVar.b))); 00610 t = _mm_add_ps(t, *cast_to<__m128>(expVar.f1)); 00611 00612 __m128i v4 = _mm_and_si128(r, *cast_to<__m128i>(expVar.mask_s)); 00613 __m128i u4 = _mm_add_epi32(r, *cast_to<__m128i>(expVar.i127s)); 00614 u4 = _mm_srli_epi32(u4, expVar.s); 00615 u4 = _mm_slli_epi32(u4, 23); 00616 00617 unsigned int v0, v1, v2, v3; 00618 v0 = _mm_cvtsi128_si32(v4); 00619 v1 = _mm_extract_epi16(v4, 2); 00620 v2 = _mm_extract_epi16(v4, 4); 00621 v3 = _mm_extract_epi16(v4, 6); 00622 #if 1 00623 __m128 t0, t1, t2, t3; 00624 00625 #if 0 00626 t0 = _mm_castsi128_ps(_mm_set1_epi32(expVar.tbl[v0])); 00627 t1 = _mm_castsi128_ps(_mm_set1_epi32(expVar.tbl[v1])); 00628 t2 = _mm_castsi128_ps(_mm_set1_epi32(expVar.tbl[v2])); 00629 t3 = _mm_castsi128_ps(_mm_set1_epi32(expVar.tbl[v3])); 00630 #else // faster but gcc puts warnings 00631 t0 = _mm_set_ss(*(const float*)&expVar.tbl[v0]); 00632 t1 = _mm_set_ss(*(const float*)&expVar.tbl[v1]); 00633 t2 = _mm_set_ss(*(const float*)&expVar.tbl[v2]); 00634 t3 = _mm_set_ss(*(const float*)&expVar.tbl[v3]); 00635 #endif 00636 00637 t1 = _mm_movelh_ps(t1, t3); 00638 t1 = _mm_castsi128_ps(_mm_slli_epi64(_mm_castps_si128(t1), 32)); 00639 t0 = _mm_movelh_ps(t0, t2); 00640 t0 = _mm_or_ps(t0, t1); 00641 #else 00642 __m128i ti = _mm_castps_si128(_mm_load_ss((const float*)&expVar.tbl[v0])); 00643 ti = _mm_insert_epi32(ti, expVar.tbl[v1], 1); 00644 ti = _mm_insert_epi32(ti, expVar.tbl[v2], 2); 00645 ti = _mm_insert_epi32(ti, expVar.tbl[v3], 3); 00646 __m128 t0 = _mm_castsi128_ps(ti); 00647 #endif 00648 t0 = _mm_or_ps(t0, _mm_castsi128_ps(u4)); 00649 00650 t = _mm_mul_ps(t, t0); 00651 00652 return t; 00653 } 00654 00655 inline float log(float x) 00656 { 00657 using namespace local; 00658 const LogVar<>& logVar = C<>::logVar; 00659 const size_t logLen = logVar.LEN; 00660 00661 fi fi; 00662 fi.f = x; 00663 int a = fi.i & (mask(8) << 23); 00664 unsigned int b1 = fi.i & (mask(logLen) << (23 - logLen)); 00665 unsigned int b2 = fi.i & mask(23 - logLen); 00666 int idx = b1 >> (23 - logLen); 00667 float f = float(a - (127 << 23)) * logVar.c_log2 + logVar.tbl[idx].app + float(b2) * logVar.tbl[idx].rev; 00668 return f; 00669 } 00670 00671 inline __m128 log_ps(__m128 x) 00672 { 00673 using namespace local; 00674 const LogVar<>& logVar = C<>::logVar; 00675 00676 __m128i xi = _mm_castps_si128(x); 00677 __m128i idx = _mm_srli_epi32(_mm_and_si128(xi, *cast_to<__m128i>(logVar.m2)), (23 - logVar.LEN)); 00678 __m128 a = _mm_cvtepi32_ps(_mm_sub_epi32(_mm_and_si128(xi, *cast_to<__m128i>(logVar.m1)), *cast_to<__m128i>(logVar.m5))); 00679 __m128 b2 = _mm_cvtepi32_ps(_mm_and_si128(xi, *cast_to<__m128i>(logVar.m3))); 00680 00681 a = _mm_mul_ps(a, *cast_to<__m128>(logVar.m4)); // c_log2 00682 00683 unsigned int i0 = _mm_cvtsi128_si32(idx); 00684 00685 #if 1 00686 unsigned int i1 = _mm_extract_epi16(idx, 2); 00687 unsigned int i2 = _mm_extract_epi16(idx, 4); 00688 unsigned int i3 = _mm_extract_epi16(idx, 6); 00689 #else 00690 idx = _mm_srli_si128(idx, 4); 00691 unsigned int i1 = _mm_cvtsi128_si32(idx); 00692 00693 idx = _mm_srli_si128(idx, 4); 00694 unsigned int i2 = _mm_cvtsi128_si32(idx); 00695 00696 idx = _mm_srli_si128(idx, 4); 00697 unsigned int i3 = _mm_cvtsi128_si32(idx); 00698 #endif 00699 00700 __m128 app, rev; 00701 __m128i L = _mm_loadl_epi64(cast_to<__m128i>(&logVar.tbl[i0].app)); 00702 __m128i H = _mm_loadl_epi64(cast_to<__m128i>(&logVar.tbl[i1].app)); 00703 __m128 t = _mm_castsi128_ps(_mm_unpacklo_epi64(L, H)); 00704 L = _mm_loadl_epi64(cast_to<__m128i>(&logVar.tbl[i2].app)); 00705 H = _mm_loadl_epi64(cast_to<__m128i>(&logVar.tbl[i3].app)); 00706 rev = _mm_castsi128_ps(_mm_unpacklo_epi64(L, H)); 00707 app = _mm_shuffle_ps(t, rev, MIE_PACK(2, 0, 2, 0)); 00708 rev = _mm_shuffle_ps(t, rev, MIE_PACK(3, 1, 3, 1)); 00709 00710 a = _mm_add_ps(a, app); 00711 rev = _mm_mul_ps(b2, rev); 00712 return _mm_add_ps(a, rev); 00713 } 00714 00715 #ifndef __CYGWIN__ 00716 // cygwin defines log2() in global namespace! 00717 // log2(x) = log(x) / log(2) 00718 inline float log2(float x) { return fmath::log(x) * 1.442695f; } 00719 #endif 00720 00721 /* 00722 for given y > 0 00723 get f_y(x) := pow(x, y) for x >= 0 00724 */ 00725 class PowGenerator { 00726 enum { 00727 N = 11 00728 }; 00729 float tbl0_[256]; 00730 struct { 00731 float app; 00732 float rev; 00733 } tbl1_[1 << N]; 00734 public: 00735 PowGenerator(float y) 00736 { 00737 for (int i = 0; i < 256; i++) { 00738 tbl0_[i] = ::powf(2, (i - 127) * y); 00739 } 00740 const double e = 1 / double(1 << 24); 00741 const double h = 1 / double(1 << N); 00742 const size_t n = 1U << N; 00743 for (size_t i = 0; i < n; i++) { 00744 double x = 1 + double(i) / n; 00745 double a = ::pow(x, (double)y); 00746 tbl1_[i].app = (float)a; 00747 double b = ::pow(x + h - e, (double)y); 00748 tbl1_[i].rev = (float)((b - a) / (h - e) / (1 << 23)); 00749 } 00750 } 00751 float get(float x) const 00752 { 00753 using namespace local; 00754 fi fi; 00755 fi.f = x; 00756 int a = (fi.i >> 23) & mask(8); 00757 unsigned int b = fi.i & mask(23); 00758 unsigned int b1 = b & (mask(N) << (23 - N)); 00759 unsigned int b2 = b & mask(23 - N); 00760 float f; 00761 int idx = b1 >> (23 - N); 00762 f = tbl0_[a] * (tbl1_[idx].app + float(b2) * tbl1_[idx].rev); 00763 return f; 00764 } 00765 }; 00766 00767 // for Xbyak version 00768 #ifdef FMATH_USE_XBYAK 00769 float (*const exp)(float) = local::C<>::getInstance().exp_; 00770 __m128 (*const exp_ps)(__m128) = local::C<>::getInstance().exp_ps_; 00771 #endif 00772 00773 // exp2(x) = pow(2, x) 00774 inline float exp2(float x) { return fmath::exp(x * 0.6931472f); } 00775 00776 } // fmath 00777 // added by ykhwong (end) 00778 00779 namespace MT32Emu { 00780 00781 // Mathematical constants 00782 const double DOUBLE_PI = 3.141592653589793; 00783 const double DOUBLE_LN_10 = 2.302585092994046; 00784 const float FLOAT_PI = 3.1415927f; 00785 const float FLOAT_2PI = 6.2831853f; 00786 const float FLOAT_LN_2 = 0.6931472f; 00787 const float FLOAT_LN_10 = 2.3025851f; 00788 00789 static inline float POWF(float x, float y) { 00790 fmath::PowGenerator f(y); return f.get(x); 00791 } 00792 00793 static inline float EXPF(float x) { 00794 return fmath::exp(x); 00795 } 00796 00797 static inline float EXP2F(float x) { 00798 #ifdef __APPLE__ 00799 // on OSX exp2f() is 1.59 times faster than "exp() and the multiplication with FLOAT_LN_2" 00800 return exp2f(x); 00801 #else 00802 return fmath::exp2(x); 00803 #endif 00804 } 00805 00806 static inline float EXP10F(float x) { 00807 return fmath::exp(FLOAT_LN_10 * x); 00808 } 00809 00810 static inline float LOGF(float x) { 00811 return fmath::log(x); 00812 } 00813 00814 static inline float LOG2F(float x) { 00815 return fmath::log(x) / FLOAT_LN_2; 00816 } 00817 00818 static inline float LOG10F(float x) { 00819 return log10(x); 00820 } 00821 00822 } 00823 00824 #endif // #if !defined (__SSE2__) 00825 #endif // #ifndef MT32EMU_MMATH_H