DOSBox-X
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator
src/mt32/mmath.h
00001 /* Copyright (C) 2003, 2004, 2005, 2006, 2008, 2009 Dean Beeler, Jerome Fisher
00002  * Copyright (C) 2011 Dean Beeler, Jerome Fisher, Sergey V. Mikayev
00003  *
00004  *  This program is free software: you can redistribute it and/or modify
00005  *  it under the terms of the GNU Lesser General Public License as published by
00006  *  the Free Software Foundation, either version 2.1 of the License, or
00007  *  (at your option) any later version.
00008  *
00009  *  This program is distributed in the hope that it will be useful,
00010  *  but WITHOUT ANY WARRANTY; without even the implied warranty of
00011  *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
00012  *  GNU Lesser General Public License for more details.
00013  *
00014  *  You should have received a copy of the GNU Lesser General Public License
00015  *  along with this program.  If not, see <http://www.gnu.org/licenses/>.
00016  */
00017 
00018 #ifndef MT32EMU_MMATH_H
00019 #define MT32EMU_MMATH_H
00020 
00021 #define FIXEDPOINT_UDIV(x, y, point) (((x) << (point)) / ((y)))
00022 #define FIXEDPOINT_SDIV(x, y, point) (((x) * (1 << point)) / ((y)))
00023 #define FIXEDPOINT_UMULT(x, y, point) (((x) * (y)) >> point)
00024 #define FIXEDPOINT_SMULT(x, y, point) (((x) * (y)) / (1 << point))
00025 
00026 #define FIXEDPOINT_MAKE(x, point) ((Bit32u)((1 << point) * x))
00027 
00028 #if defined(_MSC_VER)
00029 # pragma warning(disable:4244) /* const fmath::local::uint64_t to double possible loss of data */
00030 #endif
00031 
00032 // added by ykhwong (start)
00033 #pragma once
00034 
00044 /*
00045         function prototype list
00046 
00047         float fmath::exp(float);
00048         float fmath::log(float);
00049 
00050         __m128 fmath::exp_ps(__m128);
00051         __m128 fmath::log_ps(__m128);
00052 
00053         if FMATH_USE_XBYAK is defined then Xbyak version are used
00054 */
00055 //#define FMATH_USE_XBYAK
00056 
00057 #include <math.h>
00058 #include <stddef.h>
00059 #include <assert.h>
00060 #include <limits>
00061 #include <stdlib.h>
00062 #include <float.h>
00063 #if defined(_WIN32) && !defined(__GNUC__)
00064         #include <intrin.h>
00065         #ifndef MIE_ALIGN
00066                 #define MIE_ALIGN(x) __declspec(align(x))
00067         #endif
00068 #else
00069         #ifndef __GNUC_PREREQ
00070         #define __GNUC_PREREQ(major, minor) ((((__GNUC__) << 16) + (__GNUC_MINOR__)) >= (((major) << 16) + (minor)))
00071         #endif
00072         #if __GNUC_PREREQ(4, 4) || !defined(__GNUC__)
00073                 /* GCC >= 4.4 and non-GCC compilers */
00074                 #include <x86intrin.h>
00075         #elif __GNUC_PREREQ(4, 1)
00076                 /* GCC 4.1, 4.2, and 4.3 do not have x86intrin.h, directly include SSE2 header */
00077                 #include <emmintrin.h>
00078         #endif
00079         #ifndef MIE_ALIGN
00080                 #define MIE_ALIGN(x) __attribute__((aligned(x)))
00081         #endif
00082 #endif
00083 #ifndef MIE_PACK
00084         #define MIE_PACK(x, y, z, w) ((x) * 64 + (y) * 16 + (z) * 4 + (w))
00085 #endif
00086 #ifdef FMATH_USE_XBYAK
00087         #include "xbyak/xbyak.h"
00088         #include "xbyak/xbyak_util.h"
00089 #endif
00090 
00091 #if 1//#ifdef DEBUG
00092 inline void put(const void *p)
00093 {
00094         const float *f = (const float*)p;
00095         printf("{%e, %e, %e, %e}\n", f[0], f[1], f[2], f[3]);
00096 }
00097 inline void puti(const void *p)
00098 {
00099         const unsigned int *i = (const unsigned int *)p;
00100         printf("{%d, %d, %d, %d}\n", i[0], i[1], i[2], i[3]);
00101         printf("{%x, %x, %x, %x}\n", i[0], i[1], i[2], i[3]);
00102 }
00103 #endif
00104 
00105 namespace fmath {
00106 
00107 namespace local {
00108 
00109 const size_t EXP_TABLE_SIZE = 10;
00110 const size_t EXPD_TABLE_SIZE = 11;
00111 const size_t LOG_TABLE_SIZE = 12;
00112 
00113 typedef unsigned long long uint64_t;
00114 
00115 union fi {
00116         float f;
00117         unsigned int i;
00118 };
00119 
00120 union di {
00121         double d;
00122         uint64_t i;
00123 };
00124 
00125 inline unsigned int mask(int x)
00126 {
00127         return (1U << x) - 1;
00128 }
00129 
00130 inline uint64_t mask64(int x)
00131 {
00132         return (1ULL << x) - 1;
00133 }
00134 
00135 template<class T>
00136 inline const T* cast_to(const void *p)
00137 {
00138         return reinterpret_cast<const T*>(p);
00139 }
00140 
00141 template<class T, size_t N>
00142 size_t NumOfArray(const T (&)[N]) { return N; }
00143 
00144 /*
00145         exp(88.722839f) = inf ; 0x42b17218
00146         exp(-87.33655f) = 1.175491e-038f(007fffe6) denormal ; 0xc2aeac50
00147         exp(-103.972081f) = 0 ; 0xc2cff1b5
00148 */
00149 template<size_t N = EXP_TABLE_SIZE>
00150 struct ExpVar {
00151         enum {
00152                 s = N,
00153                 n = 1 << s,
00154                 f88 = 0x42b00000 /* 88.0 */
00155         };
00156         float minX[4];
00157         float maxX[4];
00158         float a[4];
00159         float b[4];
00160         float f1[4];
00161         unsigned int i127s[4];
00162         unsigned int mask_s[4];
00163         unsigned int i7fffffff[4];
00164         unsigned int tbl[n];
00165         ExpVar()
00166         {
00167                 float log_2 = ::logf(2.0f);
00168                 for (int i = 0; i < 4; i++) {
00169                         maxX[i] = 88;
00170                         minX[i] = -88;
00171                         a[i] = n / log_2;
00172                         b[i] = log_2 / n;
00173                         f1[i] = 1.0f;
00174                         i127s[i] = 127 << s;
00175                         i7fffffff[i] = 0x7fffffff;
00176                         mask_s[i] = mask(s);
00177                 }
00178 
00179                 for (int i = 0; i < n; i++) {
00180                         float y = pow(2.0f, (float)i / n);
00181                         fi fi;
00182                         fi.f = y;
00183                         tbl[i] = fi.i & mask(23);
00184                 }
00185         }
00186 };
00187 
00188 template<size_t sbit_ = EXPD_TABLE_SIZE>
00189 struct ExpdVar {
00190         enum {
00191                 sbit = sbit_,
00192                 s = 1UL << sbit,
00193                 adj = (1UL << (sbit + 10)) - (1UL << sbit)
00194         };
00195         // A = 1, B = 1, C = 1/2, D = 1/6
00196         double C1[2]; // A
00197         double C2[2]; // D
00198         double C3[2]; // C/D
00199         uint64_t tbl[s];
00200         const double a;
00201         const double ra;
00202         ExpdVar()
00203                 : a(s / ::log(2.0))
00204                 , ra(1 / a)
00205         {
00206                 for (int i = 0; i < 2; i++) {
00207 #if 0
00208                         C1[i] = 1.0;
00209                         C2[i] = 0.16667794882310216;
00210                         C3[i] = 2.9997969303278795;
00211 #else
00212                         C1[i] = 1.0;
00213                         C2[i] = 0.16666666685227835064;
00214                         C3[i] = 3.0000000027955394;
00215 #endif
00216                 }
00217                 for (int i = 0; i < s; i++) {
00218                         di di;
00219                         di.d = ::pow(2.0, i * (1.0 / s));
00220                         tbl[i] = di.i & mask64(52);
00221                 }
00222         }
00223 };
00224 
00225 template<size_t N = LOG_TABLE_SIZE>
00226 struct LogVar {
00227         enum {
00228                 LEN = N - 1
00229         };
00230         unsigned int m1[4]; // 0
00231         unsigned int m2[4]; // 16
00232         unsigned int m3[4]; // 32
00233         float m4[4];            // 48
00234         unsigned int m5[4]; // 64
00235         struct {
00236                 float app;
00237                 float rev;
00238         } tbl[1 << LEN];
00239         float c_log2;
00240         LogVar()
00241                 : c_log2(::logf(2.0f) / (1 << 23))
00242         {
00243                 const double e = 1 / double(1 << 24);
00244                 const double h = 1 / double(1 << LEN);
00245                 const size_t n = 1U << LEN;
00246                 for (size_t i = 0; i < n; i++) {
00247                         double x = 1 + double(i) / n;
00248                         double a = ::log(x);
00249                         tbl[i].app = (float)a;
00250                         if (i < n - 1) {
00251                                 double b = ::log(x + h - e);
00252                                 tbl[i].rev = (float)((b - a) / ((h - e) * (1 << 23)));
00253                         } else {
00254                                 tbl[i].rev = (float)(1 / (x * (1 << 23)));
00255                         }
00256                 }
00257                 for (int i = 0; i < 4; i++) {
00258                         m1[i] = mask(8) << 23;
00259                         m2[i] = mask(LEN) << (23 - LEN);
00260                         m3[i] = mask(23 - LEN);
00261                         m4[i] = c_log2;
00262                         m5[i] = 127U << 23;
00263                 }
00264         }
00265 };
00266 
00267 #ifdef FMATH_USE_XBYAK
00268 struct ExpCode : public Xbyak::CodeGenerator {
00269         float (*exp_)(float);
00270         __m128 (*exp_ps_)(__m128);
00271         template<size_t N>
00272         ExpCode(const ExpVar<N> *self)
00273         {
00274                 Xbyak::util::Cpu cpu;
00275                 try {
00276                         makeExp(self, cpu);
00277                         exp_ = (float(*)(float))getCode();
00278                         align(16);
00279                         exp_ps_ = (__m128(*)(__m128))getCurr();
00280                         makeExpPs(self, cpu);
00281                         return;
00282                 } catch (Xbyak::Error err) {
00283                         LOG_MSG( "ExpCode ERR:%s(%d)\n", Xbyak::ConvertErrorToString(err), err);
00284                 } catch (...) {
00285                         LOG_MSG( "ExpCode ERR:unknown error\n");
00286                 }
00287                 ::exit(1);
00288         }
00289         template<size_t N>
00290         void makeExp(const ExpVar<N> *self, const Xbyak::util::Cpu& /*cpu*/)
00291         {
00292                 typedef ExpVar<N> Self;
00293                 using namespace local;
00294                 using namespace Xbyak;
00295 
00296                 inLocalLabel();
00297 #ifdef XBYAK64
00298                 const Reg64& base = rcx;
00299                 const Reg64& a = rax;
00300 #else
00301                 const Reg32& base = ecx;
00302                 const Reg32& a = eax;
00303 #endif
00304 
00305                 mov(base, (size_t)self);
00306 
00307 #ifdef XBYAK32
00308                 movss(xm0, ptr [esp + 4]);
00309 #endif
00310         L(".retry");
00311                 movaps(xm1, xm0);
00312                 movd(edx, xm0);
00313                 mulss(xm1, ptr [base + offsetof(Self, a)]); // t
00314                 and(edx, 0x7fffffff);
00315                 cvtss2si(eax, xm1);
00316                 cmp(edx, ExpVar<N>::f88);
00317                 jg(".overflow");
00318                 lea(edx, ptr [eax + (127 << self->s)]);
00319                 cvtsi2ss(xm1, eax);
00320                 and(eax, mask(self->s)); // v
00321                 mov(eax, ptr [base + a * 4 + offsetof(Self, tbl)]); // expVar.tbl[v]
00322                 shr(edx, self->s);
00323                 mulss(xm1, ptr [base + offsetof(Self, b)]);
00324                 shl(edx, 23); // u
00325                 subss(xm0, xm1); // t
00326                 or(eax, edx); // fi.f
00327                 addss(xm0, ptr [base + offsetof(Self, f1)]);
00328                 movd(xm1, eax);
00329                 mulss(xm0, xm1);
00330 #ifdef XBYAK32
00331                 movss(ptr[esp + 4], xm0);
00332                 fld(dword[esp + 4]);
00333 #endif
00334                 ret();
00335         L(".overflow");
00336                 minss(xm0, ptr [base + offsetof(Self, maxX)]);
00337                 maxss(xm0, ptr [base + offsetof(Self, minX)]);
00338                 jmp(".retry");
00339                 outLocalLabel();
00340         }
00341         template<size_t N>
00342         void makeExpPs(const ExpVar<N> *self, const Xbyak::util::Cpu& cpu)
00343         {
00344                 typedef ExpVar<N> Self;
00345                 using namespace local;
00346                 using namespace Xbyak;
00347 
00348                 inLocalLabel();
00349 #ifdef XBYAK64
00350                 const Reg64& base = rcx;
00351                 const Reg64& a = rax;
00352                 const Reg64& d = rdx;
00353 #else
00354                 const Reg32& base = ecx;
00355                 const Reg32& a = eax;
00356                 const Reg32& d = edx;
00357 #endif
00358 
00359 /*
00360         if abs(x) >= maxX then x = max(min(x, maxX), -maxX) and try
00361         minps, maxps are very slow then avoid them
00362 */
00363                 const bool useSSE41 = cpu.has(Xbyak::util::Cpu::tSSE41);
00364 #if defined(XBYAK64_WIN) && !defined(__INTEL_COMPILER)
00365                 movaps(xm0, ptr [rcx]);
00366 #endif
00367                 mov(base, (size_t)self);
00368         L(".retry");
00369                 movaps(xm5, xm0);
00370                 andps(xm5, ptr [base + offsetof(Self, i7fffffff)]);
00371                 movaps(xm3, ptr [base + offsetof(Self, a)]);
00372                 movaps(xm4, ptr [base + offsetof(Self, b)]);
00373                 pcmpgtd(xm5, ptr [base + offsetof(Self, maxX)]);
00374                 mulps(xm3, xm0);
00375                 movaps(xm1, ptr [base + offsetof(Self, i127s)]);
00376                 pmovmskb(eax, xm5);
00377                 movaps(xm5, ptr [base + offsetof(Self, mask_s)]);
00378                 cvtps2dq(xm2, xm3);
00379                 pand(xm5, xm2);
00380                 cvtdq2ps(xm3, xm2);
00381                 test(eax, eax);
00382                 jnz(".overflow");
00383                 paddd(xm1, xm2);
00384                 movd(eax, xm5);
00385                 mulps(xm4, xm3);
00386                 pextrw(edx, xm5, 2);
00387                 subps(xm0, xm4);
00388                 movd(xm4, ptr [base + a * 4 + offsetof(Self, tbl)]);
00389                 addps(xm0, ptr [base + offsetof(Self, f1)]);
00390                 pextrw(eax, xm5, 4);
00391                 if (useSSE41) {
00392                         pinsrd(xm4, ptr [base + d * 4 + offsetof(Self, tbl)], 1);
00393                 } else {
00394                         movd(xm3, ptr [base + d * 4 + offsetof(Self, tbl)]);
00395                         movlhps(xm4, xm3);
00396                 }
00397                 pextrw(edx, xm5, 6);
00398                 psrld(xm1, self->s);
00399                 pslld(xm1, 23);
00400                 if (useSSE41) {
00401                         pinsrd(xm4, ptr [base + a * 4 + offsetof(Self, tbl)], 2);
00402                         pinsrd(xm4, ptr [base + d * 4 + offsetof(Self, tbl)], 3);
00403                 } else {
00404                         movd(xm2, ptr [base + a * 4 + offsetof(Self, tbl)]);
00405                         movd(xm3, ptr [base + d * 4 + offsetof(Self, tbl)]);
00406                         movlhps(xm2, xm3);
00407                         shufps(xm4, xm2, MIE_PACK(2, 0, 2, 0));
00408                 }
00409                 por(xm1, xm4);
00410                 mulps(xm0, xm1);
00411                 ret();
00412         L(".overflow");
00413                 minps(xm0, ptr [base + offsetof(Self, maxX)]);
00414                 maxps(xm0, ptr [base + offsetof(Self, minX)]);
00415                 jmp(".retry");
00416                 outLocalLabel();
00417         }
00418 };
00419 #endif
00420 
00421 /* to define static variables in fmath.hpp */
00422 template<size_t EXP_N = EXP_TABLE_SIZE, size_t LOG_N = LOG_TABLE_SIZE, size_t EXPD_N = EXPD_TABLE_SIZE>
00423 struct C {
00424         static const ExpVar<EXP_N> expVar;
00425         static const LogVar<LOG_N> logVar;
00426         static const ExpdVar<EXPD_N> expdVar;
00427 #ifdef FMATH_USE_XBYAK
00428         static const ExpCode& getInstance() {
00429                 static const ExpCode expCode(&expVar);
00430                 return expCode;
00431         }
00432 #endif
00433 };
00434 
00435 template<size_t EXP_N, size_t LOG_N, size_t EXPD_N>
00436 MIE_ALIGN(16) const ExpVar<EXP_N> C<EXP_N, LOG_N, EXPD_N>::expVar;
00437 
00438 template<size_t EXP_N, size_t LOG_N, size_t EXPD_N>
00439 MIE_ALIGN(16) const LogVar<LOG_N> C<EXP_N, LOG_N, EXPD_N>::logVar;
00440 
00441 template<size_t EXP_N, size_t LOG_N, size_t EXPD_N>
00442 MIE_ALIGN(16) const ExpdVar<EXPD_N> C<EXP_N, LOG_N, EXPD_N>::expdVar;
00443 
00444 } // fmath::local
00445 
00446 #ifdef FMATH_USE_XBYAK
00447 inline float expC(float x)
00448 #else
00449 inline float exp(float x)
00450 #endif
00451 {
00452         using namespace local;
00453         const ExpVar<>& expVar = C<>::expVar;
00454 
00455 #if 1
00456         __m128 x1 = _mm_set_ss(x);
00457 
00458         int limit = _mm_cvtss_si32(x1) & 0x7fffffff;
00459         if (limit > ExpVar<>::f88) {
00460                 x1 = _mm_min_ss(x1, _mm_load_ss(expVar.maxX));
00461                 x1 = _mm_max_ss(x1, _mm_load_ss(expVar.minX));
00462         }
00463 
00464         int r = _mm_cvtss_si32(_mm_mul_ss(x1, _mm_load_ss(expVar.a)));
00465         unsigned int v = r & mask(expVar.s);
00466         float t = _mm_cvtss_f32(x1) - r * expVar.b[0];
00467         int u = r >> expVar.s;
00468         fi fi;
00469         fi.i = ((u + 127) << 23) | expVar.tbl[v];
00470         return (1 + t) * fi.f;
00471 #else
00472         x = std::min(x, expVar.maxX[0]);
00473         x = std::max(x, expVar.minX[0]);
00474         float t = x * expVar.a[0];
00475         const float magic = (1 << 23) + (1 << 22); // to round
00476         t += magic;
00477         fi fi;
00478         fi.f = t;
00479         t = x - (t - magic) * expVar.b[0];
00480         int u = ((fi.i + (127 << expVar.s)) >> expVar.s) << 23;
00481         unsigned int v = fi.i & mask(expVar.s);
00482         fi.i = u | expVar.tbl[v];
00483         return (1 + t) * fi.f;
00484 //      return (1 + t) * pow(2, (float)u) * pow(2, (float)v / n);
00485 #endif
00486 }
00487 
00488 /*
00489         remark : -ffast-math option of gcc may generate bad code for fmath::expd
00490 */
00491 inline double expd(double x)
00492 {
00493         if (x <= -708.39641853226408) return 0;
00494         if (x >= 709.78271289338397) return std::numeric_limits<double>::infinity();
00495         using namespace local;
00496         const ExpdVar<>& c = C<>::expdVar;
00497         const uint64_t b = 3ULL << 51;
00498         di di;
00499         di.d = x * c.a + b;
00500         uint64_t iax = c.tbl[di.i & mask(c.sbit)];
00501 
00502         double t = (di.d - b) * c.ra - x;
00503         uint64_t u = ((di.i + c.adj) >> c.sbit) << 52;
00504         double y = (c.C3[0] - t) * (t * t) * c.C2[0] - t + c.C1[0];
00505 //      double y = (2.999796930327879362111743 - t) * (t * t) * 0.166677948823102161853172 - t + 1.000000000000000000488181;
00506 
00507         di.i = u | iax;
00508         return y * di.d;
00509 }
00510 
00511 inline void expd_v(double *px, int n)
00512 {
00513         using namespace local;
00514         const ExpdVar<>& c = C<>::expdVar;
00515         const uint64_t b = 3ULL << 51;
00516         assert((n % 2) == 0);
00517         const __m128d mC1 = *cast_to<__m128d>(c.C1);
00518         const __m128d mC2 = *cast_to<__m128d>(c.C2);
00519         const __m128d mC3 = *cast_to<__m128d>(c.C3);
00520         const __m128d ma = _mm_set1_pd(c.a);
00521         const __m128d mra = _mm_set1_pd(c.ra);
00522         const __m128i madj = _mm_set1_epi32(c.adj);
00523         MIE_ALIGN(16) const double expMax[2] = { 709.78271289338397, 709.78271289338397 };
00524         MIE_ALIGN(16) const double expMin[2] = { -708.39641853226408, -708.39641853226408 };
00525         for (unsigned int i = 0; i < (unsigned int)n; i += 2) {
00526                 __m128d x = _mm_load_pd(px);
00527                 x = _mm_min_pd(x, *(const __m128d*)expMax);
00528                 x = _mm_max_pd(x, *(const __m128d*)expMin);
00529 
00530                 __m128d d = _mm_mul_pd(x, ma);
00531                 d = _mm_add_pd(d, _mm_set1_pd(b));
00532                 int adr0 = _mm_cvtsi128_si32(_mm_castpd_si128(d)) & mask(c.sbit);
00533                 int adr1 = _mm_cvtsi128_si32(_mm_srli_si128(_mm_castpd_si128(d), 8)) & mask(c.sbit);
00534 
00535                 __m128i iaxL = _mm_castpd_si128(_mm_load_sd((const double*)&c.tbl[adr0]));
00536                 __m128i iax = _mm_castpd_si128(_mm_load_sd((const double*)&c.tbl[adr1]));
00537                 iax = _mm_unpacklo_epi64(iaxL, iax);
00538 
00539                 __m128d t = _mm_sub_pd(_mm_mul_pd(_mm_sub_pd(d, _mm_set1_pd(b)), mra), x);
00540                 __m128i u = _mm_castpd_si128(d);
00541                 u = _mm_add_epi64(u, madj);
00542                 u = _mm_srli_epi64(u, c.sbit);
00543                 u = _mm_slli_epi64(u, 52);
00544                 u = _mm_or_si128(u, iax);
00545                 __m128d y = _mm_mul_pd(_mm_sub_pd(mC3, t), _mm_mul_pd(t, t));
00546                 y = _mm_mul_pd(y, mC2);
00547                 y = _mm_add_pd(_mm_sub_pd(y, t), mC1);
00548                 _mm_store_pd(px, _mm_mul_pd(y, _mm_castsi128_pd(u)));
00549                 px += 2;
00550         }
00551 }
00552 
00553 #ifdef FMATH_USE_XBYAK
00554 inline __m128 exp_psC(__m128 x)
00555 #else
00556 inline __m128 exp_ps(__m128 x)
00557 #endif
00558 {
00559         using namespace local;
00560         const ExpVar<>& expVar = C<>::expVar;
00561 
00562         __m128i limit = _mm_castps_si128(_mm_and_ps(x, *cast_to<__m128>(expVar.i7fffffff)));
00563         int over = _mm_movemask_epi8(_mm_cmpgt_epi32(limit, *cast_to<__m128i>(expVar.maxX)));
00564         if (over) {
00565                 x = _mm_min_ps(x, _mm_load_ps(expVar.maxX));
00566                 x = _mm_max_ps(x, _mm_load_ps(expVar.minX));
00567         }
00568 
00569         __m128i r = _mm_cvtps_epi32(_mm_mul_ps(x, *cast_to<__m128>(expVar.a)));
00570         __m128 t = _mm_sub_ps(x, _mm_mul_ps(_mm_cvtepi32_ps(r), *cast_to<__m128>(expVar.b)));
00571         t = _mm_add_ps(t, *cast_to<__m128>(expVar.f1));
00572 
00573         __m128i v4 = _mm_and_si128(r, *cast_to<__m128i>(expVar.mask_s));
00574         __m128i u4 = _mm_add_epi32(r, *cast_to<__m128i>(expVar.i127s));
00575         u4 = _mm_srli_epi32(u4, expVar.s);
00576         u4 = _mm_slli_epi32(u4, 23);
00577 
00578         unsigned int v0, v1, v2, v3;
00579         v0 = _mm_cvtsi128_si32(v4);
00580         v1 = _mm_extract_epi16(v4, 2);
00581         v2 = _mm_extract_epi16(v4, 4);
00582         v3 = _mm_extract_epi16(v4, 6);
00583 #if 1
00584         __m128 t0, t1, t2, t3;
00585 
00586 #if 0
00587         t0 = _mm_castsi128_ps(_mm_set1_epi32(expVar.tbl[v0]));
00588         t1 = _mm_castsi128_ps(_mm_set1_epi32(expVar.tbl[v1]));
00589         t2 = _mm_castsi128_ps(_mm_set1_epi32(expVar.tbl[v2]));
00590         t3 = _mm_castsi128_ps(_mm_set1_epi32(expVar.tbl[v3]));
00591 #else // faster but gcc puts warnings
00592         t0 = _mm_set_ss(*(const float*)&expVar.tbl[v0]);
00593         t1 = _mm_set_ss(*(const float*)&expVar.tbl[v1]);
00594         t2 = _mm_set_ss(*(const float*)&expVar.tbl[v2]);
00595         t3 = _mm_set_ss(*(const float*)&expVar.tbl[v3]);
00596 #endif
00597 
00598         t1 = _mm_movelh_ps(t1, t3);
00599         t1 = _mm_castsi128_ps(_mm_slli_epi64(_mm_castps_si128(t1), 32));
00600         t0 = _mm_movelh_ps(t0, t2);
00601         t0 = _mm_or_ps(t0, t1);
00602 #else
00603         __m128i ti = _mm_castps_si128(_mm_load_ss((const float*)&expVar.tbl[v0]));
00604         ti = _mm_insert_epi32(ti, expVar.tbl[v1], 1);
00605         ti = _mm_insert_epi32(ti, expVar.tbl[v2], 2);
00606         ti = _mm_insert_epi32(ti, expVar.tbl[v3], 3);
00607         __m128 t0 = _mm_castsi128_ps(ti);
00608 #endif
00609         t0 = _mm_or_ps(t0, _mm_castsi128_ps(u4));
00610 
00611         t = _mm_mul_ps(t, t0);
00612 
00613         return t;
00614 }
00615 
00616 inline float log(float x)
00617 {
00618         using namespace local;
00619         const LogVar<>& logVar = C<>::logVar;
00620         const size_t logLen = logVar.LEN;
00621 
00622         fi fi;
00623         fi.f = x;
00624         int a = fi.i & (mask(8) << 23);
00625         unsigned int b1 = fi.i & (mask(logLen) << (23 - logLen));
00626         unsigned int b2 = fi.i & mask(23 - logLen);
00627         int idx = b1 >> (23 - logLen);
00628         float f = float(a - (127 << 23)) * logVar.c_log2 + logVar.tbl[idx].app + float(b2) * logVar.tbl[idx].rev;
00629         return f;
00630 }
00631 
00632 inline __m128 log_ps(__m128 x)
00633 {
00634         using namespace local;
00635         const LogVar<>& logVar = C<>::logVar;
00636 
00637         __m128i xi = _mm_castps_si128(x);
00638         __m128i idx = _mm_srli_epi32(_mm_and_si128(xi, *cast_to<__m128i>(logVar.m2)), (23 - logVar.LEN));
00639         __m128 a  = _mm_cvtepi32_ps(_mm_sub_epi32(_mm_and_si128(xi, *cast_to<__m128i>(logVar.m1)), *cast_to<__m128i>(logVar.m5)));
00640         __m128 b2 = _mm_cvtepi32_ps(_mm_and_si128(xi, *cast_to<__m128i>(logVar.m3)));
00641 
00642         a = _mm_mul_ps(a, *cast_to<__m128>(logVar.m4)); // c_log2
00643 
00644         unsigned int i0 = _mm_cvtsi128_si32(idx);
00645 
00646 #if 1
00647         unsigned int i1 = _mm_extract_epi16(idx, 2);
00648         unsigned int i2 = _mm_extract_epi16(idx, 4);
00649         unsigned int i3 = _mm_extract_epi16(idx, 6);
00650 #else
00651         idx = _mm_srli_si128(idx, 4);
00652         unsigned int i1 = _mm_cvtsi128_si32(idx);
00653 
00654         idx = _mm_srli_si128(idx, 4);
00655         unsigned int i2 = _mm_cvtsi128_si32(idx);
00656 
00657         idx = _mm_srli_si128(idx, 4);
00658         unsigned int i3 = _mm_cvtsi128_si32(idx);
00659 #endif
00660 
00661         __m128 app, rev;
00662         __m128i L = _mm_loadl_epi64(cast_to<__m128i>(&logVar.tbl[i0].app));
00663         __m128i H = _mm_loadl_epi64(cast_to<__m128i>(&logVar.tbl[i1].app));
00664         __m128 t = _mm_castsi128_ps(_mm_unpacklo_epi64(L, H));
00665         L = _mm_loadl_epi64(cast_to<__m128i>(&logVar.tbl[i2].app));
00666         H = _mm_loadl_epi64(cast_to<__m128i>(&logVar.tbl[i3].app));
00667         rev = _mm_castsi128_ps(_mm_unpacklo_epi64(L, H));
00668         app = _mm_shuffle_ps(t, rev, MIE_PACK(2, 0, 2, 0));
00669         rev = _mm_shuffle_ps(t, rev, MIE_PACK(3, 1, 3, 1));
00670 
00671         a = _mm_add_ps(a, app);
00672         rev = _mm_mul_ps(b2, rev);
00673         return _mm_add_ps(a, rev);
00674 }
00675 
00676 #ifndef __CYGWIN__
00677 // cygwin defines log2() in global namespace!
00678 // log2(x) = log(x) / log(2)
00679 inline float log2(float x) { return fmath::log(x) * 1.442695f; }
00680 #endif
00681 
00682 /*
00683         for given y > 0
00684         get f_y(x) := pow(x, y) for x >= 0
00685 */
00686 class PowGenerator {
00687         enum {
00688                 N = 11
00689         };
00690         float tbl0_[256];
00691         struct {
00692                 float app;
00693                 float rev;
00694         } tbl1_[1 << N];
00695 public:
00696         PowGenerator(float y)
00697         {
00698                 for (int i = 0; i < 256; i++) {
00699                         tbl0_[i] = ::powf(2, (i - 127) * y);
00700                 }
00701                 const double e = 1 / double(1 << 24);
00702                 const double h = 1 / double(1 << N);
00703                 const size_t n = 1U << N;
00704                 for (size_t i = 0; i < n; i++) {
00705                         double x = 1 + double(i) / n;
00706                         double a = ::pow(x, (double)y);
00707                         tbl1_[i].app = (float)a;
00708                         double b = ::pow(x + h - e, (double)y);
00709                         tbl1_[i].rev = (float)((b - a) / (h - e) / (1 << 23));
00710                 }
00711         }
00712         float get(float x) const
00713         {
00714                 using namespace local;
00715                 fi fi;
00716                 fi.f = x;
00717                 int a = (fi.i >> 23) & mask(8);
00718                 unsigned int b = fi.i & mask(23);
00719                 unsigned int b1 = b & (mask(N) << (23 - N));
00720                 unsigned int b2 = b & mask(23 - N);
00721                 float f;
00722                 int idx = b1 >> (23 - N);
00723                 f = tbl0_[a] * (tbl1_[idx].app + float(b2) * tbl1_[idx].rev);
00724                 return f;
00725         }
00726 };
00727 
00728 // for Xbyak version
00729 #ifdef FMATH_USE_XBYAK
00730 float (*const exp)(float) = local::C<>::getInstance().exp_;
00731 __m128 (*const exp_ps)(__m128) = local::C<>::getInstance().exp_ps_;
00732 #endif
00733 
00734 // exp2(x) = pow(2, x)
00735 inline float exp2(float x) { return fmath::exp(x * 0.6931472f); }
00736 
00737 } // fmath
00738 // added by ykhwong (end)
00739 
00740 namespace MT32Emu {
00741 
00742 // Mathematical constants
00743 const double DOUBLE_PI = 3.141592653589793;
00744 const double DOUBLE_LN_10 = 2.302585092994046;
00745 const float FLOAT_PI = 3.1415927f;
00746 const float FLOAT_2PI = 6.2831853f;
00747 const float FLOAT_LN_2 = 0.6931472f;
00748 const float FLOAT_LN_10 = 2.3025851f;
00749 
00750 static inline float POWF(float x, float y) {
00751         fmath::PowGenerator f(y); return f.get(x);
00752 }
00753 
00754 static inline float EXPF(float x) {
00755         return fmath::exp(x);
00756 }
00757 
00758 static inline float EXP2F(float x) {
00759 #ifdef __APPLE__
00760         // on OSX exp2f() is 1.59 times faster than "exp() and the multiplication with FLOAT_LN_2"
00761         return exp2f(x);
00762 #else
00763         return fmath::exp2(x);
00764 #endif
00765 }
00766 
00767 static inline float EXP10F(float x) {
00768         return fmath::exp(FLOAT_LN_10 * x);
00769 }
00770 
00771 static inline float LOGF(float x) {
00772         return fmath::log(x);
00773 }
00774 
00775 static inline float LOG2F(float x) {
00776         return fmath::log(x) / FLOAT_LN_2;
00777 }
00778 
00779 static inline float LOG10F(float x) {
00780         return log10(x);
00781 }
00782 
00783 }
00784 
00785 #endif