DOSBox-X
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Defines
src/mt32/mmath.h
00001 /* Copyright (C) 2003, 2004, 2005, 2006, 2008, 2009 Dean Beeler, Jerome Fisher
00002  * Copyright (C) 2011 Dean Beeler, Jerome Fisher, Sergey V. Mikayev
00003  *
00004  *  This program is free software: you can redistribute it and/or modify
00005  *  it under the terms of the GNU Lesser General Public License as published by
00006  *  the Free Software Foundation, either version 2.1 of the License, or
00007  *  (at your option) any later version.
00008  *
00009  *  This program is distributed in the hope that it will be useful,
00010  *  but WITHOUT ANY WARRANTY; without even the implied warranty of
00011  *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
00012  *  GNU Lesser General Public License for more details.
00013  *
00014  *  You should have received a copy of the GNU Lesser General Public License
00015  *  along with this program.  If not, see <http://www.gnu.org/licenses/>.
00016  */
00017 
00018 #ifndef MT32EMU_MMATH_H
00019 #define MT32EMU_MMATH_H
00020 
00021 #if !defined (__SSE2__) || (defined (_M_IX86_FP) && (_M_IX86_FP) < 2)
00022 #include <cmath>
00023 namespace MT32Emu {
00024         // Mathematical constants
00025         const double DOUBLE_PI = 3.141592653589793;
00026         const double DOUBLE_LN_10 = 2.302585092994046;
00027         const float FLOAT_PI = 3.1415927f;
00028         const float FLOAT_2PI = 6.2831853f;
00029         const float FLOAT_LN_2 = 0.6931472f;
00030         const float FLOAT_LN_10 = 2.3025851f;
00031         static inline float POWF(float x, float y) {
00032                 return pow(x, y);
00033         }
00034         static inline float EXPF(float x) {
00035                 return exp(x);
00036         }
00037         static inline float EXP2F(float x) {
00038 #ifdef __APPLE__
00039         // on OSX exp2f() is 1.59 times faster than "exp() and the multiplication with FLOAT_LN_2"
00040                 return exp2f(x);
00041 #else
00042                 return exp(FLOAT_LN_2 * x);
00043 #endif
00044         }
00045         static inline float EXP10F(float x) {
00046                 return exp(FLOAT_LN_10 * x);
00047         }
00048         static inline float LOGF(float x) {
00049                 return log(x);
00050         }
00051         static inline float LOG2F(float x) {
00052                 return log(x) / FLOAT_LN_2;
00053         }
00054         static inline float LOG10F(float x) {
00055                 return log10(x);
00056         }
00057 } // namespace MT32Emu
00058 
00059 #else
00060 #define FIXEDPOINT_UDIV(x, y, point) (((x) << (point)) / ((y)))
00061 #define FIXEDPOINT_SDIV(x, y, point) (((x) * (1 << point)) / ((y)))
00062 #define FIXEDPOINT_UMULT(x, y, point) (((x) * (y)) >> point)
00063 #define FIXEDPOINT_SMULT(x, y, point) (((x) * (y)) / (1 << point))
00064 
00065 #define FIXEDPOINT_MAKE(x, point) ((Bit32u)((1 << point) * x))
00066 
00067 #if defined(_MSC_VER)
00068 # pragma warning(disable:4244) /* const fmath::local::uint64_t to double possible loss of data */
00069 #endif
00070 
00071 // added by ykhwong (start)
00072 #pragma once
00073 
00083 /*
00084         function prototype list
00085 
00086         float fmath::exp(float);
00087         float fmath::log(float);
00088 
00089         __m128 fmath::exp_ps(__m128);
00090         __m128 fmath::log_ps(__m128);
00091 
00092         if FMATH_USE_XBYAK is defined then Xbyak version are used
00093 */
00094 //#define FMATH_USE_XBYAK
00095 
00096 #include <math.h>
00097 #include <stddef.h>
00098 #include <assert.h>
00099 #include <limits>
00100 #include <stdlib.h>
00101 #include <float.h>
00102 #if defined(_WIN32) && !defined(__GNUC__)
00103         #include <intrin.h>
00104         #ifndef MIE_ALIGN
00105                 #define MIE_ALIGN(x) __declspec(align(x))
00106         #endif
00107 #else
00108         #ifndef __GNUC_PREREQ
00109         #define __GNUC_PREREQ(major, minor) ((((__GNUC__) << 16) + (__GNUC_MINOR__)) >= (((major) << 16) + (minor)))
00110         #endif
00111         #if __GNUC_PREREQ(4, 4) || !defined(__GNUC__)
00112                 /* GCC >= 4.4 and non-GCC compilers */
00113                 #include <x86intrin.h>
00114         #elif __GNUC_PREREQ(4, 1)
00115                 /* GCC 4.1, 4.2, and 4.3 do not have x86intrin.h, directly include SSE2 header */
00116                 #include <emmintrin.h>
00117         #endif
00118         #ifndef MIE_ALIGN
00119                 #define MIE_ALIGN(x) __attribute__((aligned(x)))
00120         #endif
00121 #endif
00122 #ifndef MIE_PACK
00123         #define MIE_PACK(x, y, z, w) ((x) * 64 + (y) * 16 + (z) * 4 + (w))
00124 #endif
00125 #ifdef FMATH_USE_XBYAK
00126         #include "xbyak/xbyak.h"
00127         #include "xbyak/xbyak_util.h"
00128 #endif
00129 
00130 #if 1//#ifdef DEBUG
00131 inline void put(const void *p)
00132 {
00133         const float *f = (const float*)p;
00134         printf("{%e, %e, %e, %e}\n", f[0], f[1], f[2], f[3]);
00135 }
00136 inline void puti(const void *p)
00137 {
00138         const unsigned int *i = (const unsigned int *)p;
00139         printf("{%d, %d, %d, %d}\n", i[0], i[1], i[2], i[3]);
00140         printf("{%x, %x, %x, %x}\n", i[0], i[1], i[2], i[3]);
00141 }
00142 #endif
00143 
00144 namespace fmath {
00145 
00146 namespace local {
00147 
00148 const size_t EXP_TABLE_SIZE = 10;
00149 const size_t EXPD_TABLE_SIZE = 11;
00150 const size_t LOG_TABLE_SIZE = 12;
00151 
00152 typedef unsigned long long uint64_t;
00153 
00154 union fi {
00155         float f;
00156         unsigned int i;
00157 };
00158 
00159 union di {
00160         double d;
00161         uint64_t i;
00162 };
00163 
00164 inline unsigned int mask(int x)
00165 {
00166         return (1U << x) - 1;
00167 }
00168 
00169 inline uint64_t mask64(int x)
00170 {
00171         return (1ULL << x) - 1;
00172 }
00173 
00174 template<class T>
00175 inline const T* cast_to(const void *p)
00176 {
00177         return reinterpret_cast<const T*>(p);
00178 }
00179 
00180 template<class T, size_t N>
00181 size_t NumOfArray(const T (&)[N]) { return N; }
00182 
00183 /*
00184         exp(88.722839f) = inf ; 0x42b17218
00185         exp(-87.33655f) = 1.175491e-038f(007fffe6) denormal ; 0xc2aeac50
00186         exp(-103.972081f) = 0 ; 0xc2cff1b5
00187 */
00188 template<size_t N = EXP_TABLE_SIZE>
00189 struct ExpVar {
00190         enum {
00191                 s = N,
00192                 n = 1 << s,
00193                 f88 = 0x42b00000 /* 88.0 */
00194         };
00195         float minX[4];
00196         float maxX[4];
00197         float a[4];
00198         float b[4];
00199         float f1[4];
00200         unsigned int i127s[4];
00201         unsigned int mask_s[4];
00202         unsigned int i7fffffff[4];
00203         unsigned int tbl[n];
00204         ExpVar()
00205         {
00206                 float log_2 = ::logf(2.0f);
00207                 for (int i = 0; i < 4; i++) {
00208                         maxX[i] = 88;
00209                         minX[i] = -88;
00210                         a[i] = n / log_2;
00211                         b[i] = log_2 / n;
00212                         f1[i] = 1.0f;
00213                         i127s[i] = 127 << s;
00214                         i7fffffff[i] = 0x7fffffff;
00215                         mask_s[i] = mask(s);
00216                 }
00217 
00218                 for (int i = 0; i < n; i++) {
00219                         float y = pow(2.0f, (float)i / n);
00220                         fi fi;
00221                         fi.f = y;
00222                         tbl[i] = fi.i & mask(23);
00223                 }
00224         }
00225 };
00226 
00227 template<size_t sbit_ = EXPD_TABLE_SIZE>
00228 struct ExpdVar {
00229         enum {
00230                 sbit = sbit_,
00231                 s = 1UL << sbit,
00232                 adj = (1UL << (sbit + 10)) - (1UL << sbit)
00233         };
00234         // A = 1, B = 1, C = 1/2, D = 1/6
00235         double C1[2]; // A
00236         double C2[2]; // D
00237         double C3[2]; // C/D
00238         uint64_t tbl[s];
00239         const double a;
00240         const double ra;
00241         ExpdVar()
00242                 : a(s / ::log(2.0))
00243                 , ra(1 / a)
00244         {
00245                 for (int i = 0; i < 2; i++) {
00246 #if 0
00247                         C1[i] = 1.0;
00248                         C2[i] = 0.16667794882310216;
00249                         C3[i] = 2.9997969303278795;
00250 #else
00251                         C1[i] = 1.0;
00252                         C2[i] = 0.16666666685227835064;
00253                         C3[i] = 3.0000000027955394;
00254 #endif
00255                 }
00256                 for (int i = 0; i < s; i++) {
00257                         di di;
00258                         di.d = ::pow(2.0, i * (1.0 / s));
00259                         tbl[i] = di.i & mask64(52);
00260                 }
00261         }
00262 };
00263 
00264 template<size_t N = LOG_TABLE_SIZE>
00265 struct LogVar {
00266         enum {
00267                 LEN = N - 1
00268         };
00269         unsigned int m1[4]; // 0
00270         unsigned int m2[4]; // 16
00271         unsigned int m3[4]; // 32
00272         float m4[4];            // 48
00273         unsigned int m5[4]; // 64
00274         struct {
00275                 float app;
00276                 float rev;
00277         } tbl[1 << LEN];
00278         float c_log2;
00279         LogVar()
00280                 : c_log2(::logf(2.0f) / (1 << 23))
00281         {
00282                 const double e = 1 / double(1 << 24);
00283                 const double h = 1 / double(1 << LEN);
00284                 const size_t n = 1U << LEN;
00285                 for (size_t i = 0; i < n; i++) {
00286                         double x = 1 + double(i) / n;
00287                         double a = ::log(x);
00288                         tbl[i].app = (float)a;
00289                         if (i < n - 1) {
00290                                 double b = ::log(x + h - e);
00291                                 tbl[i].rev = (float)((b - a) / ((h - e) * (1 << 23)));
00292                         } else {
00293                                 tbl[i].rev = (float)(1 / (x * (1 << 23)));
00294                         }
00295                 }
00296                 for (int i = 0; i < 4; i++) {
00297                         m1[i] = mask(8) << 23;
00298                         m2[i] = mask(LEN) << (23 - LEN);
00299                         m3[i] = mask(23 - LEN);
00300                         m4[i] = c_log2;
00301                         m5[i] = 127U << 23;
00302                 }
00303         }
00304 };
00305 
00306 #ifdef FMATH_USE_XBYAK
00307 struct ExpCode : public Xbyak::CodeGenerator {
00308         float (*exp_)(float);
00309         __m128 (*exp_ps_)(__m128);
00310         template<size_t N>
00311         ExpCode(const ExpVar<N> *self)
00312         {
00313                 Xbyak::util::Cpu cpu;
00314                 try {
00315                         makeExp(self, cpu);
00316                         exp_ = (float(*)(float))getCode();
00317                         align(16);
00318                         exp_ps_ = (__m128(*)(__m128))getCurr();
00319                         makeExpPs(self, cpu);
00320                         return;
00321                 } catch (Xbyak::Error err) {
00322                         LOG_MSG( "ExpCode ERR:%s(%d)\n", Xbyak::ConvertErrorToString(err), err);
00323                 } catch (...) {
00324                         LOG_MSG( "ExpCode ERR:unknown error\n");
00325                 }
00326                 ::exit(1);
00327         }
00328         template<size_t N>
00329         void makeExp(const ExpVar<N> *self, const Xbyak::util::Cpu& /*cpu*/)
00330         {
00331                 typedef ExpVar<N> Self;
00332                 using namespace local;
00333                 using namespace Xbyak;
00334 
00335                 inLocalLabel();
00336 #ifdef XBYAK64
00337                 const Reg64& base = rcx;
00338                 const Reg64& a = rax;
00339 #else
00340                 const Reg32& base = ecx;
00341                 const Reg32& a = eax;
00342 #endif
00343 
00344                 mov(base, (size_t)self);
00345 
00346 #ifdef XBYAK32
00347                 movss(xm0, ptr [esp + 4]);
00348 #endif
00349         L(".retry");
00350                 movaps(xm1, xm0);
00351                 movd(edx, xm0);
00352                 mulss(xm1, ptr [base + offsetof(Self, a)]); // t
00353                 and(edx, 0x7fffffff);
00354                 cvtss2si(eax, xm1);
00355                 cmp(edx, ExpVar<N>::f88);
00356                 jg(".overflow");
00357                 lea(edx, ptr [eax + (127 << self->s)]);
00358                 cvtsi2ss(xm1, eax);
00359                 and(eax, mask(self->s)); // v
00360                 mov(eax, ptr [base + a * 4 + offsetof(Self, tbl)]); // expVar.tbl[v]
00361                 shr(edx, self->s);
00362                 mulss(xm1, ptr [base + offsetof(Self, b)]);
00363                 shl(edx, 23); // u
00364                 subss(xm0, xm1); // t
00365                 or(eax, edx); // fi.f
00366                 addss(xm0, ptr [base + offsetof(Self, f1)]);
00367                 movd(xm1, eax);
00368                 mulss(xm0, xm1);
00369 #ifdef XBYAK32
00370                 movss(ptr[esp + 4], xm0);
00371                 fld(dword[esp + 4]);
00372 #endif
00373                 ret();
00374         L(".overflow");
00375                 minss(xm0, ptr [base + offsetof(Self, maxX)]);
00376                 maxss(xm0, ptr [base + offsetof(Self, minX)]);
00377                 jmp(".retry");
00378                 outLocalLabel();
00379         }
00380         template<size_t N>
00381         void makeExpPs(const ExpVar<N> *self, const Xbyak::util::Cpu& cpu)
00382         {
00383                 typedef ExpVar<N> Self;
00384                 using namespace local;
00385                 using namespace Xbyak;
00386 
00387                 inLocalLabel();
00388 #ifdef XBYAK64
00389                 const Reg64& base = rcx;
00390                 const Reg64& a = rax;
00391                 const Reg64& d = rdx;
00392 #else
00393                 const Reg32& base = ecx;
00394                 const Reg32& a = eax;
00395                 const Reg32& d = edx;
00396 #endif
00397 
00398 /*
00399         if abs(x) >= maxX then x = max(min(x, maxX), -maxX) and try
00400         minps, maxps are very slow then avoid them
00401 */
00402                 const bool useSSE41 = cpu.has(Xbyak::util::Cpu::tSSE41);
00403 #if defined(XBYAK64_WIN) && !defined(__INTEL_COMPILER)
00404                 movaps(xm0, ptr [rcx]);
00405 #endif
00406                 mov(base, (size_t)self);
00407         L(".retry");
00408                 movaps(xm5, xm0);
00409                 andps(xm5, ptr [base + offsetof(Self, i7fffffff)]);
00410                 movaps(xm3, ptr [base + offsetof(Self, a)]);
00411                 movaps(xm4, ptr [base + offsetof(Self, b)]);
00412                 pcmpgtd(xm5, ptr [base + offsetof(Self, maxX)]);
00413                 mulps(xm3, xm0);
00414                 movaps(xm1, ptr [base + offsetof(Self, i127s)]);
00415                 pmovmskb(eax, xm5);
00416                 movaps(xm5, ptr [base + offsetof(Self, mask_s)]);
00417                 cvtps2dq(xm2, xm3);
00418                 pand(xm5, xm2);
00419                 cvtdq2ps(xm3, xm2);
00420                 test(eax, eax);
00421                 jnz(".overflow");
00422                 paddd(xm1, xm2);
00423                 movd(eax, xm5);
00424                 mulps(xm4, xm3);
00425                 pextrw(edx, xm5, 2);
00426                 subps(xm0, xm4);
00427                 movd(xm4, ptr [base + a * 4 + offsetof(Self, tbl)]);
00428                 addps(xm0, ptr [base + offsetof(Self, f1)]);
00429                 pextrw(eax, xm5, 4);
00430                 if (useSSE41) {
00431                         pinsrd(xm4, ptr [base + d * 4 + offsetof(Self, tbl)], 1);
00432                 } else {
00433                         movd(xm3, ptr [base + d * 4 + offsetof(Self, tbl)]);
00434                         movlhps(xm4, xm3);
00435                 }
00436                 pextrw(edx, xm5, 6);
00437                 psrld(xm1, self->s);
00438                 pslld(xm1, 23);
00439                 if (useSSE41) {
00440                         pinsrd(xm4, ptr [base + a * 4 + offsetof(Self, tbl)], 2);
00441                         pinsrd(xm4, ptr [base + d * 4 + offsetof(Self, tbl)], 3);
00442                 } else {
00443                         movd(xm2, ptr [base + a * 4 + offsetof(Self, tbl)]);
00444                         movd(xm3, ptr [base + d * 4 + offsetof(Self, tbl)]);
00445                         movlhps(xm2, xm3);
00446                         shufps(xm4, xm2, MIE_PACK(2, 0, 2, 0));
00447                 }
00448                 por(xm1, xm4);
00449                 mulps(xm0, xm1);
00450                 ret();
00451         L(".overflow");
00452                 minps(xm0, ptr [base + offsetof(Self, maxX)]);
00453                 maxps(xm0, ptr [base + offsetof(Self, minX)]);
00454                 jmp(".retry");
00455                 outLocalLabel();
00456         }
00457 };
00458 #endif
00459 
00460 /* to define static variables in fmath.hpp */
00461 template<size_t EXP_N = EXP_TABLE_SIZE, size_t LOG_N = LOG_TABLE_SIZE, size_t EXPD_N = EXPD_TABLE_SIZE>
00462 struct C {
00463         static const ExpVar<EXP_N> expVar;
00464         static const LogVar<LOG_N> logVar;
00465         static const ExpdVar<EXPD_N> expdVar;
00466 #ifdef FMATH_USE_XBYAK
00467         static const ExpCode& getInstance() {
00468                 static const ExpCode expCode(&expVar);
00469                 return expCode;
00470         }
00471 #endif
00472 };
00473 
00474 template<size_t EXP_N, size_t LOG_N, size_t EXPD_N>
00475 MIE_ALIGN(16) const ExpVar<EXP_N> C<EXP_N, LOG_N, EXPD_N>::expVar;
00476 
00477 template<size_t EXP_N, size_t LOG_N, size_t EXPD_N>
00478 MIE_ALIGN(16) const LogVar<LOG_N> C<EXP_N, LOG_N, EXPD_N>::logVar;
00479 
00480 template<size_t EXP_N, size_t LOG_N, size_t EXPD_N>
00481 MIE_ALIGN(16) const ExpdVar<EXPD_N> C<EXP_N, LOG_N, EXPD_N>::expdVar;
00482 
00483 } // fmath::local
00484 
00485 #ifdef FMATH_USE_XBYAK
00486 inline float expC(float x)
00487 #else
00488 inline float exp(float x)
00489 #endif
00490 {
00491         using namespace local;
00492         const ExpVar<>& expVar = C<>::expVar;
00493 
00494 #if 1
00495         __m128 x1 = _mm_set_ss(x);
00496 
00497         int limit = _mm_cvtss_si32(x1) & 0x7fffffff;
00498         if (limit > ExpVar<>::f88) {
00499                 x1 = _mm_min_ss(x1, _mm_load_ss(expVar.maxX));
00500                 x1 = _mm_max_ss(x1, _mm_load_ss(expVar.minX));
00501         }
00502 
00503         int r = _mm_cvtss_si32(_mm_mul_ss(x1, _mm_load_ss(expVar.a)));
00504         unsigned int v = r & mask(expVar.s);
00505         float t = _mm_cvtss_f32(x1) - r * expVar.b[0];
00506         int u = r >> expVar.s;
00507         fi fi;
00508         fi.i = ((u + 127) << 23) | expVar.tbl[v];
00509         return (1 + t) * fi.f;
00510 #else
00511         x = std::min(x, expVar.maxX[0]);
00512         x = std::max(x, expVar.minX[0]);
00513         float t = x * expVar.a[0];
00514         const float magic = (1 << 23) + (1 << 22); // to round
00515         t += magic;
00516         fi fi;
00517         fi.f = t;
00518         t = x - (t - magic) * expVar.b[0];
00519         int u = ((fi.i + (127 << expVar.s)) >> expVar.s) << 23;
00520         unsigned int v = fi.i & mask(expVar.s);
00521         fi.i = u | expVar.tbl[v];
00522         return (1 + t) * fi.f;
00523 //      return (1 + t) * pow(2, (float)u) * pow(2, (float)v / n);
00524 #endif
00525 }
00526 
00527 /*
00528         remark : -ffast-math option of gcc may generate bad code for fmath::expd
00529 */
00530 inline double expd(double x)
00531 {
00532         if (x <= -708.39641853226408) return 0;
00533         if (x >= 709.78271289338397) return std::numeric_limits<double>::infinity();
00534         using namespace local;
00535         const ExpdVar<>& c = C<>::expdVar;
00536         const uint64_t b = 3ULL << 51;
00537         di di;
00538         di.d = x * c.a + b;
00539         uint64_t iax = c.tbl[di.i & mask(c.sbit)];
00540 
00541         double t = (di.d - b) * c.ra - x;
00542         uint64_t u = ((di.i + c.adj) >> c.sbit) << 52;
00543         double y = (c.C3[0] - t) * (t * t) * c.C2[0] - t + c.C1[0];
00544 //      double y = (2.999796930327879362111743 - t) * (t * t) * 0.166677948823102161853172 - t + 1.000000000000000000488181;
00545 
00546         di.i = u | iax;
00547         return y * di.d;
00548 }
00549 
00550 inline void expd_v(double *px, int n)
00551 {
00552         using namespace local;
00553         const ExpdVar<>& c = C<>::expdVar;
00554         const uint64_t b = 3ULL << 51;
00555         assert((n % 2) == 0);
00556         const __m128d mC1 = *cast_to<__m128d>(c.C1);
00557         const __m128d mC2 = *cast_to<__m128d>(c.C2);
00558         const __m128d mC3 = *cast_to<__m128d>(c.C3);
00559         const __m128d ma = _mm_set1_pd(c.a);
00560         const __m128d mra = _mm_set1_pd(c.ra);
00561         const __m128i madj = _mm_set1_epi32(c.adj);
00562         MIE_ALIGN(16) const double expMax[2] = { 709.78271289338397, 709.78271289338397 };
00563         MIE_ALIGN(16) const double expMin[2] = { -708.39641853226408, -708.39641853226408 };
00564         for (unsigned int i = 0; i < (unsigned int)n; i += 2) {
00565                 __m128d x = _mm_load_pd(px);
00566                 x = _mm_min_pd(x, *(const __m128d*)expMax);
00567                 x = _mm_max_pd(x, *(const __m128d*)expMin);
00568 
00569                 __m128d d = _mm_mul_pd(x, ma);
00570                 d = _mm_add_pd(d, _mm_set1_pd(b));
00571                 int adr0 = _mm_cvtsi128_si32(_mm_castpd_si128(d)) & mask(c.sbit);
00572                 int adr1 = _mm_cvtsi128_si32(_mm_srli_si128(_mm_castpd_si128(d), 8)) & mask(c.sbit);
00573 
00574                 __m128i iaxL = _mm_castpd_si128(_mm_load_sd((const double*)&c.tbl[adr0]));
00575                 __m128i iax = _mm_castpd_si128(_mm_load_sd((const double*)&c.tbl[adr1]));
00576                 iax = _mm_unpacklo_epi64(iaxL, iax);
00577 
00578                 __m128d t = _mm_sub_pd(_mm_mul_pd(_mm_sub_pd(d, _mm_set1_pd(b)), mra), x);
00579                 __m128i u = _mm_castpd_si128(d);
00580                 u = _mm_add_epi64(u, madj);
00581                 u = _mm_srli_epi64(u, c.sbit);
00582                 u = _mm_slli_epi64(u, 52);
00583                 u = _mm_or_si128(u, iax);
00584                 __m128d y = _mm_mul_pd(_mm_sub_pd(mC3, t), _mm_mul_pd(t, t));
00585                 y = _mm_mul_pd(y, mC2);
00586                 y = _mm_add_pd(_mm_sub_pd(y, t), mC1);
00587                 _mm_store_pd(px, _mm_mul_pd(y, _mm_castsi128_pd(u)));
00588                 px += 2;
00589         }
00590 }
00591 
00592 #ifdef FMATH_USE_XBYAK
00593 inline __m128 exp_psC(__m128 x)
00594 #else
00595 inline __m128 exp_ps(__m128 x)
00596 #endif
00597 {
00598         using namespace local;
00599         const ExpVar<>& expVar = C<>::expVar;
00600 
00601         __m128i limit = _mm_castps_si128(_mm_and_ps(x, *cast_to<__m128>(expVar.i7fffffff)));
00602         int over = _mm_movemask_epi8(_mm_cmpgt_epi32(limit, *cast_to<__m128i>(expVar.maxX)));
00603         if (over) {
00604                 x = _mm_min_ps(x, _mm_load_ps(expVar.maxX));
00605                 x = _mm_max_ps(x, _mm_load_ps(expVar.minX));
00606         }
00607 
00608         __m128i r = _mm_cvtps_epi32(_mm_mul_ps(x, *cast_to<__m128>(expVar.a)));
00609         __m128 t = _mm_sub_ps(x, _mm_mul_ps(_mm_cvtepi32_ps(r), *cast_to<__m128>(expVar.b)));
00610         t = _mm_add_ps(t, *cast_to<__m128>(expVar.f1));
00611 
00612         __m128i v4 = _mm_and_si128(r, *cast_to<__m128i>(expVar.mask_s));
00613         __m128i u4 = _mm_add_epi32(r, *cast_to<__m128i>(expVar.i127s));
00614         u4 = _mm_srli_epi32(u4, expVar.s);
00615         u4 = _mm_slli_epi32(u4, 23);
00616 
00617         unsigned int v0, v1, v2, v3;
00618         v0 = _mm_cvtsi128_si32(v4);
00619         v1 = _mm_extract_epi16(v4, 2);
00620         v2 = _mm_extract_epi16(v4, 4);
00621         v3 = _mm_extract_epi16(v4, 6);
00622 #if 1
00623         __m128 t0, t1, t2, t3;
00624 
00625 #if 0
00626         t0 = _mm_castsi128_ps(_mm_set1_epi32(expVar.tbl[v0]));
00627         t1 = _mm_castsi128_ps(_mm_set1_epi32(expVar.tbl[v1]));
00628         t2 = _mm_castsi128_ps(_mm_set1_epi32(expVar.tbl[v2]));
00629         t3 = _mm_castsi128_ps(_mm_set1_epi32(expVar.tbl[v3]));
00630 #else // faster but gcc puts warnings
00631         t0 = _mm_set_ss(*(const float*)&expVar.tbl[v0]);
00632         t1 = _mm_set_ss(*(const float*)&expVar.tbl[v1]);
00633         t2 = _mm_set_ss(*(const float*)&expVar.tbl[v2]);
00634         t3 = _mm_set_ss(*(const float*)&expVar.tbl[v3]);
00635 #endif
00636 
00637         t1 = _mm_movelh_ps(t1, t3);
00638         t1 = _mm_castsi128_ps(_mm_slli_epi64(_mm_castps_si128(t1), 32));
00639         t0 = _mm_movelh_ps(t0, t2);
00640         t0 = _mm_or_ps(t0, t1);
00641 #else
00642         __m128i ti = _mm_castps_si128(_mm_load_ss((const float*)&expVar.tbl[v0]));
00643         ti = _mm_insert_epi32(ti, expVar.tbl[v1], 1);
00644         ti = _mm_insert_epi32(ti, expVar.tbl[v2], 2);
00645         ti = _mm_insert_epi32(ti, expVar.tbl[v3], 3);
00646         __m128 t0 = _mm_castsi128_ps(ti);
00647 #endif
00648         t0 = _mm_or_ps(t0, _mm_castsi128_ps(u4));
00649 
00650         t = _mm_mul_ps(t, t0);
00651 
00652         return t;
00653 }
00654 
00655 inline float log(float x)
00656 {
00657         using namespace local;
00658         const LogVar<>& logVar = C<>::logVar;
00659         const size_t logLen = logVar.LEN;
00660 
00661         fi fi;
00662         fi.f = x;
00663         int a = fi.i & (mask(8) << 23);
00664         unsigned int b1 = fi.i & (mask(logLen) << (23 - logLen));
00665         unsigned int b2 = fi.i & mask(23 - logLen);
00666         int idx = b1 >> (23 - logLen);
00667         float f = float(a - (127 << 23)) * logVar.c_log2 + logVar.tbl[idx].app + float(b2) * logVar.tbl[idx].rev;
00668         return f;
00669 }
00670 
00671 inline __m128 log_ps(__m128 x)
00672 {
00673         using namespace local;
00674         const LogVar<>& logVar = C<>::logVar;
00675 
00676         __m128i xi = _mm_castps_si128(x);
00677         __m128i idx = _mm_srli_epi32(_mm_and_si128(xi, *cast_to<__m128i>(logVar.m2)), (23 - logVar.LEN));
00678         __m128 a  = _mm_cvtepi32_ps(_mm_sub_epi32(_mm_and_si128(xi, *cast_to<__m128i>(logVar.m1)), *cast_to<__m128i>(logVar.m5)));
00679         __m128 b2 = _mm_cvtepi32_ps(_mm_and_si128(xi, *cast_to<__m128i>(logVar.m3)));
00680 
00681         a = _mm_mul_ps(a, *cast_to<__m128>(logVar.m4)); // c_log2
00682 
00683         unsigned int i0 = _mm_cvtsi128_si32(idx);
00684 
00685 #if 1
00686         unsigned int i1 = _mm_extract_epi16(idx, 2);
00687         unsigned int i2 = _mm_extract_epi16(idx, 4);
00688         unsigned int i3 = _mm_extract_epi16(idx, 6);
00689 #else
00690         idx = _mm_srli_si128(idx, 4);
00691         unsigned int i1 = _mm_cvtsi128_si32(idx);
00692 
00693         idx = _mm_srli_si128(idx, 4);
00694         unsigned int i2 = _mm_cvtsi128_si32(idx);
00695 
00696         idx = _mm_srli_si128(idx, 4);
00697         unsigned int i3 = _mm_cvtsi128_si32(idx);
00698 #endif
00699 
00700         __m128 app, rev;
00701         __m128i L = _mm_loadl_epi64(cast_to<__m128i>(&logVar.tbl[i0].app));
00702         __m128i H = _mm_loadl_epi64(cast_to<__m128i>(&logVar.tbl[i1].app));
00703         __m128 t = _mm_castsi128_ps(_mm_unpacklo_epi64(L, H));
00704         L = _mm_loadl_epi64(cast_to<__m128i>(&logVar.tbl[i2].app));
00705         H = _mm_loadl_epi64(cast_to<__m128i>(&logVar.tbl[i3].app));
00706         rev = _mm_castsi128_ps(_mm_unpacklo_epi64(L, H));
00707         app = _mm_shuffle_ps(t, rev, MIE_PACK(2, 0, 2, 0));
00708         rev = _mm_shuffle_ps(t, rev, MIE_PACK(3, 1, 3, 1));
00709 
00710         a = _mm_add_ps(a, app);
00711         rev = _mm_mul_ps(b2, rev);
00712         return _mm_add_ps(a, rev);
00713 }
00714 
00715 #ifndef __CYGWIN__
00716 // cygwin defines log2() in global namespace!
00717 // log2(x) = log(x) / log(2)
00718 inline float log2(float x) { return fmath::log(x) * 1.442695f; }
00719 #endif
00720 
00721 /*
00722         for given y > 0
00723         get f_y(x) := pow(x, y) for x >= 0
00724 */
00725 class PowGenerator {
00726         enum {
00727                 N = 11
00728         };
00729         float tbl0_[256];
00730         struct {
00731                 float app;
00732                 float rev;
00733         } tbl1_[1 << N];
00734 public:
00735         PowGenerator(float y)
00736         {
00737                 for (int i = 0; i < 256; i++) {
00738                         tbl0_[i] = ::powf(2, (i - 127) * y);
00739                 }
00740                 const double e = 1 / double(1 << 24);
00741                 const double h = 1 / double(1 << N);
00742                 const size_t n = 1U << N;
00743                 for (size_t i = 0; i < n; i++) {
00744                         double x = 1 + double(i) / n;
00745                         double a = ::pow(x, (double)y);
00746                         tbl1_[i].app = (float)a;
00747                         double b = ::pow(x + h - e, (double)y);
00748                         tbl1_[i].rev = (float)((b - a) / (h - e) / (1 << 23));
00749                 }
00750         }
00751         float get(float x) const
00752         {
00753                 using namespace local;
00754                 fi fi;
00755                 fi.f = x;
00756                 int a = (fi.i >> 23) & mask(8);
00757                 unsigned int b = fi.i & mask(23);
00758                 unsigned int b1 = b & (mask(N) << (23 - N));
00759                 unsigned int b2 = b & mask(23 - N);
00760                 float f;
00761                 int idx = b1 >> (23 - N);
00762                 f = tbl0_[a] * (tbl1_[idx].app + float(b2) * tbl1_[idx].rev);
00763                 return f;
00764         }
00765 };
00766 
00767 // for Xbyak version
00768 #ifdef FMATH_USE_XBYAK
00769 float (*const exp)(float) = local::C<>::getInstance().exp_;
00770 __m128 (*const exp_ps)(__m128) = local::C<>::getInstance().exp_ps_;
00771 #endif
00772 
00773 // exp2(x) = pow(2, x)
00774 inline float exp2(float x) { return fmath::exp(x * 0.6931472f); }
00775 
00776 } // fmath
00777 // added by ykhwong (end)
00778 
00779 namespace MT32Emu {
00780 
00781 // Mathematical constants
00782 const double DOUBLE_PI = 3.141592653589793;
00783 const double DOUBLE_LN_10 = 2.302585092994046;
00784 const float FLOAT_PI = 3.1415927f;
00785 const float FLOAT_2PI = 6.2831853f;
00786 const float FLOAT_LN_2 = 0.6931472f;
00787 const float FLOAT_LN_10 = 2.3025851f;
00788 
00789 static inline float POWF(float x, float y) {
00790         fmath::PowGenerator f(y); return f.get(x);
00791 }
00792 
00793 static inline float EXPF(float x) {
00794         return fmath::exp(x);
00795 }
00796 
00797 static inline float EXP2F(float x) {
00798 #ifdef __APPLE__
00799         // on OSX exp2f() is 1.59 times faster than "exp() and the multiplication with FLOAT_LN_2"
00800         return exp2f(x);
00801 #else
00802         return fmath::exp2(x);
00803 #endif
00804 }
00805 
00806 static inline float EXP10F(float x) {
00807         return fmath::exp(FLOAT_LN_10 * x);
00808 }
00809 
00810 static inline float LOGF(float x) {
00811         return fmath::log(x);
00812 }
00813 
00814 static inline float LOG2F(float x) {
00815         return fmath::log(x) / FLOAT_LN_2;
00816 }
00817 
00818 static inline float LOG10F(float x) {
00819         return log10(x);
00820 }
00821 
00822 }
00823 
00824 #endif // #if !defined (__SSE2__)
00825 #endif // #ifndef MT32EMU_MMATH_H