DOSBox-X
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Defines
src/xBRZ/xbrz.cpp
00001 // ****************************************************************************
00002 // * This file is part of the xBRZ project. It is distributed under           *
00003 // * GNU General Public License: https://www.gnu.org/licenses/gpl-3.0         *
00004 // * Copyright (C) Zenju (zenju AT gmx DOT de) - All Rights Reserved          *
00005 // *                                                                          *
00006 // * Additionally and as a special exception, the author gives permission     *
00007 // * to link the code of this program with the following libraries            *
00008 // * (or with modified versions that use the same licenses), and distribute   *
00009 // * linked combinations including the two: MAME, FreeFileSync, Snes9x, ePSXe *
00010 // * You must obey the GNU General Public License in all respects for all of  *
00011 // * the code used other than MAME, FreeFileSync, Snes9x, ePSXe.              *
00012 // * If you modify this file, you may extend this exception to your version   *
00013 // * of the file, but you are not obligated to do so. If you do not wish to   *
00014 // * do so, delete this exception statement from your version.                *
00015 // ****************************************************************************
00016 
00017 #include "xbrz.h"
00018 #include <cassert>
00019 #include <vector>
00020 #include <algorithm>
00021 #include <cmath> //std::sqrt
00022 #include "xbrz_tools.h"
00023 
00024 using namespace xbrz;
00025 
00026 
00027 namespace
00028 {
00029 template <unsigned int M, unsigned int N> inline
00030 uint32_t gradientRGB(uint32_t pixFront, uint32_t pixBack) //blend front color with opacity M / N over opaque background: http://en.wikipedia.org/wiki/Alpha_compositing#Alpha_blending
00031 {
00032     static_assert(0 < M && M < N && N <= 1000, "");
00033 
00034     auto calcColor = [](unsigned char colFront, unsigned char colBack) -> unsigned char { return (colFront * M + colBack * (N - M)) / N; };
00035 
00036     return makePixel(calcColor(getRed  (pixFront), getRed  (pixBack)),
00037                      calcColor(getGreen(pixFront), getGreen(pixBack)),
00038                      calcColor(getBlue (pixFront), getBlue (pixBack)));
00039 }
00040 
00041 
00042 template <unsigned int M, unsigned int N> inline
00043 uint32_t gradientARGB(uint32_t pixFront, uint32_t pixBack) //find intermediate color between two colors with alpha channels (=> NO alpha blending!!!)
00044 {
00045     static_assert(0 < M && M < N && N <= 1000, "");
00046 
00047     const unsigned int weightFront = getAlpha(pixFront) * M;
00048     const unsigned int weightBack  = getAlpha(pixBack) * (N - M);
00049     const unsigned int weightSum   = weightFront + weightBack;
00050     if (weightSum == 0)
00051         return 0;
00052 
00053     auto calcColor = [=](unsigned char colFront, unsigned char colBack)
00054     {
00055         return static_cast<unsigned char>((colFront * weightFront + colBack * weightBack) / weightSum);
00056     };
00057 
00058     return makePixel(static_cast<unsigned char>(weightSum / N),
00059                      calcColor(getRed  (pixFront), getRed  (pixBack)),
00060                      calcColor(getGreen(pixFront), getGreen(pixBack)),
00061                      calcColor(getBlue (pixFront), getBlue (pixBack)));
00062 }
00063 
00064 
00065 //inline
00066 //double fastSqrt(double n)
00067 //{
00068 //    __asm //speeds up xBRZ by about 9% compared to std::sqrt which internally uses the same assembler instructions but adds some "fluff"
00069 //    {
00070 //        fld n
00071 //        fsqrt
00072 //    }
00073 //}
00074 //
00075 
00076 
//FORCE_INLINE: the strongest inlining request the active compiler understands; plain "inline" is the fallback
#if defined(_MSC_VER)
    #define FORCE_INLINE __forceinline
#elif defined(__GNUC__)
    #define FORCE_INLINE __attribute__((always_inline)) inline
#else
    #define FORCE_INLINE inline
#endif
00084 
00085 
//clock-wise rotation in steps of 90 degrees; the numeric values are consecutive because MatrixRotation steps through them with "rotDeg - 1"
enum RotationDegree
{
    ROT_0   = 0,
    ROT_90  = 1,
    ROT_180 = 2,
    ROT_270 = 3
};

//calculate input matrix coordinates after rotation at compile time
//(I, J) = (row, col) indices, N = size of (square) matrix; I_old/J_old are the coordinates before rotation
template <RotationDegree rotDeg, size_t I, size_t J, size_t N>
struct MatrixRotation
{
    //a rotation by rotDeg is one more quarter turn applied on top of the next smaller rotation
    using Previous = MatrixRotation<static_cast<RotationDegree>(rotDeg - 1), I, J, N>;

    static const size_t I_old = N - 1 - Previous::J_old;
    static const size_t J_old =         Previous::I_old;
};

//recursion anchor: without rotation the coordinates stay as they are
template <size_t I, size_t J, size_t N>
struct MatrixRotation<ROT_0, I, J, N>
{
    static const size_t I_old = I;
    static const size_t J_old = J;
};
00111 
00112 
00113 template <size_t N, RotationDegree rotDeg>
00114 class OutputMatrix
00115 {
00116 public:
00117     OutputMatrix(uint32_t* out, int outWidth) : //access matrix area, top-left at position "out" for image with given width
00118         out_(out),
00119         outWidth_(outWidth) {}
00120 
00121     template <size_t I, size_t J>
00122     uint32_t& ref() const
00123     {
00124         static const size_t I_old = MatrixRotation<rotDeg, I, J, N>::I_old;
00125         static const size_t J_old = MatrixRotation<rotDeg, I, J, N>::J_old;
00126         return *(out_ + J_old + I_old * outWidth_);
00127     }
00128 
00129 private:
00130     uint32_t* out_;
00131     const int outWidth_;
00132 };
00133 
00134 
//value multiplied by itself
template <class T> inline
T square(T value)
{
    const T squared = value * value;
    return squared;
}
00137 
00138 
#if 0
//euklidean distance between two pixels in RGB space (disabled)
inline
double distRGB(uint32_t pix1, uint32_t pix2)
{
    const double deltaR = static_cast<int>(getRed  (pix1)) - getRed  (pix2);
    const double deltaG = static_cast<int>(getGreen(pix1)) - getGreen(pix2);
    const double deltaB = static_cast<int>(getBlue (pix1)) - getBlue (pix2);

    return std::sqrt(deltaR * deltaR + deltaG * deltaG + deltaB * deltaB);
}
#endif
00151 
00152 
00153 inline
00154 double distYCbCr(uint32_t pix1, uint32_t pix2, double lumaWeight)
00155 {
00156     //http://en.wikipedia.org/wiki/YCbCr#ITU-R_BT.601_conversion
00157     //YCbCr conversion is a matrix multiplication => take advantage of linearity by subtracting first!
00158     const int r_diff = static_cast<int>(getRed  (pix1)) - getRed  (pix2); //we may delay division by 255 to after matrix multiplication
00159     const int g_diff = static_cast<int>(getGreen(pix1)) - getGreen(pix2); //
00160     const int b_diff = static_cast<int>(getBlue (pix1)) - getBlue (pix2); //substraction for int is noticeable faster than for double!
00161 
00162     //const double k_b = 0.0722; //ITU-R BT.709 conversion
00163     //const double k_r = 0.2126; //
00164     const double k_b = 0.0593; //ITU-R BT.2020 conversion
00165     const double k_r = 0.2627; //
00166     const double k_g = 1 - k_b - k_r;
00167 
00168     const double scale_b = 0.5 / (1 - k_b);
00169     const double scale_r = 0.5 / (1 - k_r);
00170 
00171     const double y   = k_r * r_diff + k_g * g_diff + k_b * b_diff; //[!], analog YCbCr!
00172     const double c_b = scale_b * (b_diff - y);
00173     const double c_r = scale_r * (r_diff - y);
00174 
00175     //we skip division by 255 to have similar range like other distance functions
00176     return std::sqrt(square(lumaWeight * y) + square(c_b) + square(c_r));
00177 }
00178 
00179 
00180 inline
00181 double distYCbCrBuffered(uint32_t pix1, uint32_t pix2)
00182 {
00183     //30% perf boost compared to plain distYCbCr()!
00184     //consumes 64 MB memory; using double is only 2% faster, but takes 128 MB
00185     static const std::vector<float> diffToDist = []
00186     {
00187         std::vector<float> tmp;
00188 
00189         for (uint32_t i = 0; i < 256 * 256 * 256; ++i) //startup time: 114 ms on Intel Core i5 (four cores)
00190         {
00191             const int r_diff = getByte<2>(i) * 2 - 0xFF;
00192             const int g_diff = getByte<1>(i) * 2 - 0xFF;
00193             const int b_diff = getByte<0>(i) * 2 - 0xFF;
00194 
00195             const double k_b = 0.0593; //ITU-R BT.2020 conversion
00196             const double k_r = 0.2627; //
00197             const double k_g = 1 - k_b - k_r;
00198 
00199             const double scale_b = 0.5 / (1 - k_b);
00200             const double scale_r = 0.5 / (1 - k_r);
00201 
00202             const double y   = k_r * r_diff + k_g * g_diff + k_b * b_diff; //[!], analog YCbCr!
00203             const double c_b = scale_b * (b_diff - y);
00204             const double c_r = scale_r * (r_diff - y);
00205 
00206             tmp.push_back(static_cast<float>(std::sqrt(square(y) + square(c_b) + square(c_r))));
00207         }
00208         return tmp;
00209     }();
00210 
00211     //if (pix1 == pix2) -> 8% perf degradation!
00212     //    return 0;
00213     //if (pix1 < pix2)
00214     //    std::swap(pix1, pix2); -> 30% perf degradation!!!
00215 #if 1
00216     const int r_diff = static_cast<int>(getRed  (pix1)) - getRed  (pix2);
00217     const int g_diff = static_cast<int>(getGreen(pix1)) - getGreen(pix2);
00218     const int b_diff = static_cast<int>(getBlue (pix1)) - getBlue (pix2);
00219 
00220     return diffToDist[size_t((((r_diff + 0xFF) / 2) << 16) | //slightly reduce precision (division by 2) to squeeze value into single byte
00221                              (((g_diff + 0xFF) / 2) <<  8) |
00222                              (((b_diff + 0xFF) / 2)      ))];
00223 #else //not noticeably faster:
00224     const int r_diff_tmp = ((pix1 & 0xFF0000) + 0xFF0000 - (pix2 & 0xFF0000)) / 2;
00225     const int g_diff_tmp = ((pix1 & 0x00FF00) + 0x00FF00 - (pix2 & 0x00FF00)) / 2; //slightly reduce precision (division by 2) to squeeze value into single byte
00226     const int b_diff_tmp = ((pix1 & 0x0000FF) + 0x0000FF - (pix2 & 0x0000FF)) / 2;
00227 
00228     return diffToDist[(r_diff_tmp & 0xFF0000) | (g_diff_tmp & 0x00FF00) | (b_diff_tmp & 0x0000FF)];
00229 #endif
00230 }
00231 
00232 
//strength of the blend indication found for one pixel corner
//attention: BlendType must fit into the value range of 2 bit!!!
enum BlendType
{
    BLEND_NONE     = 0,
    BLEND_NORMAL   = 1, //a normal indication to blend
    BLEND_DOMINANT = 2  //a strong indication to blend
};
00240 
00241 struct BlendResult
00242 {
00243     BlendType
00244     blend_f, blend_g,
00245     blend_j, blend_k;
00246 };
00247 
00248 
//kernel for preprocessing step: 4x4 pixels, named row by row
struct Kernel_4x4
{
    uint32_t a, b, c, d;
    uint32_t e, f, g, h;
    uint32_t i, j, k, l;
    uint32_t m, n, o, p;
};
00257 
00258 /*
00259 input kernel area naming convention:
00260 -----------------
00261 | A | B | C | D |
00262 ----|---|---|---|
00263 | E | F | G | H |   //evaluate the four corners between F, G, J, K
00264 ----|---|---|---|   //input pixel is at position F
00265 | I | J | K | L |
00266 ----|---|---|---|
00267 | M | N | O | P |
00268 -----------------
00269 */
00270 template <class ColorDistance>
00271 FORCE_INLINE //detect blend direction
00272 BlendResult preProcessCorners(const Kernel_4x4& ker, const xbrz::ScalerCfg& cfg) //result: F, G, J, K corners of "GradientType"
00273 {
00274     BlendResult result = {};
00275 
00276     if ((ker.f == ker.g &&
00277          ker.j == ker.k) ||
00278         (ker.f == ker.j &&
00279          ker.g == ker.k))
00280         return result;
00281 
00282     auto dist = [&](uint32_t pix1, uint32_t pix2) { return ColorDistance::dist(pix1, pix2, cfg.luminanceWeight); };
00283 
00284     const int weight = 4;
00285     double jg = dist(ker.i, ker.f) + dist(ker.f, ker.c) + dist(ker.n, ker.k) + dist(ker.k, ker.h) + weight * dist(ker.j, ker.g);
00286     double fk = dist(ker.e, ker.j) + dist(ker.j, ker.o) + dist(ker.b, ker.g) + dist(ker.g, ker.l) + weight * dist(ker.f, ker.k);
00287 
00288     if (jg < fk) //test sample: 70% of values max(jg, fk) / min(jg, fk) are between 1.1 and 3.7 with median being 1.8
00289     {
00290         const bool dominantGradient = cfg.dominantDirectionThreshold * jg < fk;
00291         if (ker.f != ker.g && ker.f != ker.j)
00292             result.blend_f = dominantGradient ? BLEND_DOMINANT : BLEND_NORMAL;
00293 
00294         if (ker.k != ker.j && ker.k != ker.g)
00295             result.blend_k = dominantGradient ? BLEND_DOMINANT : BLEND_NORMAL;
00296     }
00297     else if (fk < jg)
00298     {
00299         const bool dominantGradient = cfg.dominantDirectionThreshold * fk < jg;
00300         if (ker.j != ker.f && ker.j != ker.k)
00301             result.blend_j = dominantGradient ? BLEND_DOMINANT : BLEND_NORMAL;
00302 
00303         if (ker.g != ker.f && ker.g != ker.k)
00304             result.blend_g = dominantGradient ? BLEND_DOMINANT : BLEND_NORMAL;
00305     }
00306     return result;
00307 }
00308 
//3x3 pixel neighbourhood, named row by row
struct Kernel_3x3
{
    uint32_t a, b, c;
    uint32_t d, e, f;
    uint32_t g, h, i;
};
00316 
00317 #define DEF_GETTER(x) template <RotationDegree rotDeg> uint32_t inline get_##x(const Kernel_3x3& ker) { return ker.x; }
00318 //we cannot and NEED NOT write "ker.##x" since ## concatenates preprocessor tokens but "." is not a token
00319 DEF_GETTER(a) DEF_GETTER(b) DEF_GETTER(c)
00320 DEF_GETTER(d) DEF_GETTER(e) DEF_GETTER(f)
00321 DEF_GETTER(g) DEF_GETTER(h) DEF_GETTER(i)
00322 #undef DEF_GETTER
00323 
00324 #define DEF_GETTER(x, y) template <> inline uint32_t get_##x<ROT_90>(const Kernel_3x3& ker) { return ker.y; }
00325 DEF_GETTER(a, g) DEF_GETTER(b, d) DEF_GETTER(c, a)
00326 DEF_GETTER(d, h) DEF_GETTER(e, e) DEF_GETTER(f, b)
00327 DEF_GETTER(g, i) DEF_GETTER(h, f) DEF_GETTER(i, c)
00328 #undef DEF_GETTER
00329 
00330 #define DEF_GETTER(x, y) template <> inline uint32_t get_##x<ROT_180>(const Kernel_3x3& ker) { return ker.y; }
00331 DEF_GETTER(a, i) DEF_GETTER(b, h) DEF_GETTER(c, g)
00332 DEF_GETTER(d, f) DEF_GETTER(e, e) DEF_GETTER(f, d)
00333 DEF_GETTER(g, c) DEF_GETTER(h, b) DEF_GETTER(i, a)
00334 #undef DEF_GETTER
00335 
00336 #define DEF_GETTER(x, y) template <> inline uint32_t get_##x<ROT_270>(const Kernel_3x3& ker) { return ker.y; }
00337 DEF_GETTER(a, c) DEF_GETTER(b, f) DEF_GETTER(c, i)
00338 DEF_GETTER(d, b) DEF_GETTER(e, e) DEF_GETTER(f, h)
00339 DEF_GETTER(g, a) DEF_GETTER(h, d) DEF_GETTER(i, g)
00340 #undef DEF_GETTER
00341 
00342 
00343 //compress four blend types into a single byte
00344 //inline BlendType getTopL   (unsigned char b) { return static_cast<BlendType>(0x3 & b); }
00345 inline BlendType getTopR   (unsigned char b) { return static_cast<BlendType>(0x3 & (b >> 2)); }
00346 inline BlendType getBottomR(unsigned char b) { return static_cast<BlendType>(0x3 & (b >> 4)); }
00347 inline BlendType getBottomL(unsigned char b) { return static_cast<BlendType>(0x3 & (b >> 6)); }
00348 
00349 inline void setTopL   (unsigned char& b, BlendType bt) { b |= bt; } //buffer is assumed to be initialized before preprocessing!
00350 inline void setTopR   (unsigned char& b, BlendType bt) { b |= (bt << 2); }
00351 inline void setBottomR(unsigned char& b, BlendType bt) { b |= (bt << 4); }
00352 inline void setBottomL(unsigned char& b, BlendType bt) { b |= (bt << 6); }
00353 
00354 inline bool blendingNeeded(unsigned char b) { return b != 0; }
00355 
00356 template <RotationDegree rotDeg> inline
00357 unsigned char rotateBlendInfo(unsigned char b) { return b; }
00358 template <> inline unsigned char rotateBlendInfo<ROT_90 >(unsigned char b) { return ((b << 2) | (b >> 6)) & 0xff; }
00359 template <> inline unsigned char rotateBlendInfo<ROT_180>(unsigned char b) { return ((b << 4) | (b >> 4)) & 0xff; }
00360 template <> inline unsigned char rotateBlendInfo<ROT_270>(unsigned char b) { return ((b << 6) | (b >> 2)) & 0xff; }
00361 
00362 
#ifndef NDEBUG
    //debug builds only: source pixel position (x, y) at which blendPixel() should stop in the debugger;
    //scaleImage() compares every processed pixel against it and raises breakIntoDebugger on a match
    int debugPixelX = -1;
    int debugPixelY = 12;
    //one flag per thread; standard "thread_local" instead of the MSVC-only "__declspec(thread)" so that debug builds compile on other compilers, too
    thread_local bool breakIntoDebugger = false;
#endif
00368 
00369 
00370 /*
00371 input kernel area naming convention:
00372 -------------
00373 | A | B | C |
00374 ----|---|---|
00375 | D | E | F | //input pixel is at position E
00376 ----|---|---|
00377 | G | H | I |
00378 -------------
00379 */
00380 template <class Scaler, class ColorDistance, RotationDegree rotDeg>
00381 FORCE_INLINE //perf: quite worth it!
00382 void blendPixel(const Kernel_3x3& ker,
00383                 uint32_t* target, int trgWidth,
00384                 unsigned char blendInfo, //result of preprocessing all four corners of pixel "e"
00385                 const xbrz::ScalerCfg& cfg)
00386 {
00387 #define a get_a<rotDeg>(ker)
00388 #define b get_b<rotDeg>(ker)
00389 #define c get_c<rotDeg>(ker)
00390 #define d get_d<rotDeg>(ker)
00391 #define e get_e<rotDeg>(ker)
00392 #define f get_f<rotDeg>(ker)
00393 #define g get_g<rotDeg>(ker)
00394 #define h get_h<rotDeg>(ker)
00395 #define i get_i<rotDeg>(ker)
00396 
00397 #ifndef NDEBUG
00398     if (breakIntoDebugger)
00399         __debugbreak(); //__asm int 3;
00400 #endif
00401 
00402     (void)a; //silence Clang's -Wunused-function
00403 
00404     const unsigned char blend = rotateBlendInfo<rotDeg>(blendInfo);
00405 
00406     if (getBottomR(blend) >= BLEND_NORMAL)
00407     {
00408         auto eq   = [&](uint32_t pix1, uint32_t pix2) { return ColorDistance::dist(pix1, pix2, cfg.luminanceWeight) < cfg.equalColorTolerance; };
00409         auto dist = [&](uint32_t pix1, uint32_t pix2) { return ColorDistance::dist(pix1, pix2, cfg.luminanceWeight); };
00410 
00411         const bool doLineBlend = [&]() -> bool
00412         {
00413             if (getBottomR(blend) >= BLEND_DOMINANT)
00414                 return true;
00415 
00416             //make sure there is no second blending in an adjacent rotation for this pixel: handles insular pixels, mario eyes
00417             if (getTopR(blend) != BLEND_NONE && !eq(e, g)) //but support double-blending for 90° corners
00418                 return false;
00419             if (getBottomL(blend) != BLEND_NONE && !eq(e, c))
00420                 return false;
00421 
00422             //no full blending for L-shapes; blend corner only (handles "mario mushroom eyes")
00423             if (!eq(e, i) && eq(g, h) && eq(h, i) && eq(i, f) && eq(f, c))
00424                 return false;
00425 
00426             return true;
00427         }();
00428 
00429         const uint32_t px = dist(e, f) <= dist(e, h) ? f : h; //choose most similar color
00430 
00431         OutputMatrix<Scaler::scale, rotDeg> out(target, trgWidth);
00432 
00433         if (doLineBlend)
00434         {
00435             const double fg = dist(f, g); //test sample: 70% of values max(fg, hc) / min(fg, hc) are between 1.1 and 3.7 with median being 1.9
00436             const double hc = dist(h, c); //
00437 
00438             const bool haveShallowLine = cfg.steepDirectionThreshold * fg <= hc && e != g && d != g;
00439             const bool haveSteepLine   = cfg.steepDirectionThreshold * hc <= fg && e != c && b != c;
00440 
00441             if (haveShallowLine)
00442             {
00443                 if (haveSteepLine)
00444                     Scaler::blendLineSteepAndShallow(px, out);
00445                 else
00446                     Scaler::blendLineShallow(px, out);
00447             }
00448             else
00449             {
00450                 if (haveSteepLine)
00451                     Scaler::blendLineSteep(px, out);
00452                 else
00453                     Scaler::blendLineDiagonal(px, out);
00454             }
00455         }
00456         else
00457             Scaler::blendCorner(px, out);
00458     }
00459 
00460 #undef a
00461 #undef b
00462 #undef c
00463 #undef d
00464 #undef e
00465 #undef f
00466 #undef g
00467 #undef h
00468 #undef i
00469 }
00470 
00471 
00472 template <class Scaler, class ColorDistance> //scaler policy: see "Scaler2x" reference implementation
00473 void scaleImage(const uint32_t* src, uint32_t* trg, int srcWidth, int srcHeight, const xbrz::ScalerCfg& cfg, int yFirst, int yLast)
00474 {
00475     yFirst = std::max(yFirst, 0);
00476     yLast  = std::min(yLast, srcHeight);
00477     if (yFirst >= yLast || srcWidth <= 0)
00478         return;
00479 
00480     const int trgWidth = srcWidth * Scaler::scale;
00481 
00482     //"use" space at the end of the image as temporary buffer for "on the fly preprocessing": we even could use larger area of
00483     //"sizeof(uint32_t) * srcWidth * (yLast - yFirst)" bytes without risk of accidental overwriting before accessing
00484     const int bufferSize = srcWidth;
00485     unsigned char* preProcBuffer = reinterpret_cast<unsigned char*>(trg + (size_t)yLast * Scaler::scale * trgWidth) - bufferSize;
00486     std::fill(preProcBuffer, preProcBuffer + bufferSize, '\0');
00487     static_assert(BLEND_NONE == 0, "");
00488 
00489     //initialize preprocessing buffer for first row of current stripe: detect upper left and right corner blending
00490     //this cannot be optimized for adjacent processing stripes; we must not allow for a memory race condition!
00491     if (yFirst > 0)
00492     {
00493         const int y = yFirst - 1;
00494 
00495         const uint32_t* s_m1 = src + (size_t)srcWidth * std::max(y - 1, 0);
00496         const uint32_t* s_0  = src + (size_t)srcWidth * y; //center line
00497         const uint32_t* s_p1 = src + (size_t)srcWidth * std::min(y + 1, srcHeight - 1);
00498         const uint32_t* s_p2 = src + (size_t)srcWidth * std::min(y + 2, srcHeight - 1);
00499 
00500         for (int x = 0; x < srcWidth; ++x)
00501         {
00502             const int x_m1 = std::max(x - 1, 0);
00503             const int x_p1 = std::min(x + 1, srcWidth - 1);
00504             const int x_p2 = std::min(x + 2, srcWidth - 1);
00505 
00506             Kernel_4x4 ker = {}; //perf: initialization is negligible
00507             ker.a = s_m1[x_m1]; //read sequentially from memory as far as possible
00508             ker.b = s_m1[x];
00509             ker.c = s_m1[x_p1];
00510             ker.d = s_m1[x_p2];
00511 
00512             ker.e = s_0[x_m1];
00513             ker.f = s_0[x];
00514             ker.g = s_0[x_p1];
00515             ker.h = s_0[x_p2];
00516 
00517             ker.i = s_p1[x_m1];
00518             ker.j = s_p1[x];
00519             ker.k = s_p1[x_p1];
00520             ker.l = s_p1[x_p2];
00521 
00522             ker.m = s_p2[x_m1];
00523             ker.n = s_p2[x];
00524             ker.o = s_p2[x_p1];
00525             ker.p = s_p2[x_p2];
00526 
00527             const BlendResult res = preProcessCorners<ColorDistance>(ker, cfg);
00528             /*
00529             preprocessing blend result:
00530             ---------
00531             | F | G |   //evalute corner between F, G, J, K
00532             ----|---|   //input pixel is at position F
00533             | J | K |
00534             ---------
00535             */
00536             setTopR(preProcBuffer[x], res.blend_j);
00537 
00538             if (x + 1 < bufferSize)
00539                 setTopL(preProcBuffer[x + 1], res.blend_k);
00540         }
00541     }
00542     //------------------------------------------------------------------------------------
00543 
00544     for (int y = yFirst; y < yLast; ++y)
00545     {
00546         uint32_t* out = trg + (size_t)Scaler::scale * y * trgWidth; //consider MT "striped" access
00547 
00548         const uint32_t* s_m1 = src + (size_t)srcWidth * std::max(y - 1, 0);
00549         const uint32_t* s_0  = src + (size_t)srcWidth * y; //center line
00550         const uint32_t* s_p1 = src + (size_t)srcWidth * std::min(y + 1, srcHeight - 1);
00551         const uint32_t* s_p2 = src + (size_t)srcWidth * std::min(y + 2, srcHeight - 1);
00552 
00553         unsigned char blend_xy1 = 0; //corner blending for current (x, y + 1) position
00554 
00555         for (int x = 0; x < srcWidth; ++x, out += Scaler::scale)
00556         {
00557 #ifndef NDEBUG
00558             breakIntoDebugger = debugPixelX == x && debugPixelY == y;
00559 #endif
00560             //all those bounds checks have only insignificant impact on performance!
00561             const int x_m1 = std::max(x - 1, 0); //perf: prefer array indexing to additional pointers!
00562             const int x_p1 = std::min(x + 1, srcWidth - 1);
00563             const int x_p2 = std::min(x + 2, srcWidth - 1);
00564 
00565             Kernel_4x4 ker4 = {}; //perf: initialization is negligible
00566 
00567             ker4.a = s_m1[x_m1]; //read sequentially from memory as far as possible
00568             ker4.b = s_m1[x];
00569             ker4.c = s_m1[x_p1];
00570             ker4.d = s_m1[x_p2];
00571 
00572             ker4.e = s_0[x_m1];
00573             ker4.f = s_0[x];
00574             ker4.g = s_0[x_p1];
00575             ker4.h = s_0[x_p2];
00576 
00577             ker4.i = s_p1[x_m1];
00578             ker4.j = s_p1[x];
00579             ker4.k = s_p1[x_p1];
00580             ker4.l = s_p1[x_p2];
00581 
00582             ker4.m = s_p2[x_m1];
00583             ker4.n = s_p2[x];
00584             ker4.o = s_p2[x_p1];
00585             ker4.p = s_p2[x_p2];
00586 
00587             //evaluate the four corners on bottom-right of current pixel
00588             unsigned char blend_xy = 0; //for current (x, y) position
00589             {
00590                 const BlendResult res = preProcessCorners<ColorDistance>(ker4, cfg);
00591                 /*
00592                 preprocessing blend result:
00593                 ---------
00594                 | F | G |   //evalute corner between F, G, J, K
00595                 ----|---|   //current input pixel is at position F
00596                 | J | K |
00597                 ---------
00598                 */
00599                 blend_xy = preProcBuffer[x];
00600                 setBottomR(blend_xy, res.blend_f); //all four corners of (x, y) have been determined at this point due to processing sequence!
00601 
00602                 setTopR(blend_xy1, res.blend_j); //set 2nd known corner for (x, y + 1)
00603                 preProcBuffer[x] = blend_xy1; //store on current buffer position for use on next row
00604 
00605                 blend_xy1 = 0;
00606                 setTopL(blend_xy1, res.blend_k); //set 1st known corner for (x + 1, y + 1) and buffer for use on next column
00607 
00608                 if (x + 1 < bufferSize) //set 3rd known corner for (x + 1, y)
00609                     setBottomL(preProcBuffer[x + 1], res.blend_g);
00610             }
00611 
00612             //fill block of size scale * scale with the given color
00613             fillBlock(out, trgWidth * (int)sizeof(uint32_t), ker4.f, Scaler::scale, Scaler::scale);
00614             //place *after* preprocessing step, to not overwrite the results while processing the the last pixel!
00615 
00616             //blend four corners of current pixel
00617             if (blendingNeeded(blend_xy)) //good 5% perf-improvement
00618             {
00619                 Kernel_3x3 ker3 = {}; //perf: initialization is negligible
00620 
00621                 ker3.a = ker4.a;
00622                 ker3.b = ker4.b;
00623                 ker3.c = ker4.c;
00624 
00625                 ker3.d = ker4.e;
00626                 ker3.e = ker4.f;
00627                 ker3.f = ker4.g;
00628 
00629                 ker3.g = ker4.i;
00630                 ker3.h = ker4.j;
00631                 ker3.i = ker4.k;
00632 
00633                 blendPixel<Scaler, ColorDistance, ROT_0  >(ker3, out, trgWidth, blend_xy, cfg);
00634                 blendPixel<Scaler, ColorDistance, ROT_90 >(ker3, out, trgWidth, blend_xy, cfg);
00635                 blendPixel<Scaler, ColorDistance, ROT_180>(ker3, out, trgWidth, blend_xy, cfg);
00636                 blendPixel<Scaler, ColorDistance, ROT_270>(ker3, out, trgWidth, blend_xy, cfg);
00637             }
00638         }
00639     }
00640 }
00641 
00642 //------------------------------------------------------------------------------------
00643 
//scaler policy for factor 2: each blend function modifies the 2x2 output block reachable through "out"
//ColorGradient supplies the actual color mixing via alphaGrad<M, N>()
template <class ColorGradient>
struct Scaler2x : public ColorGradient
{
    static const int scale = 2;

    //bring template function into scope for GCC
    template <unsigned int M, unsigned int N>
    static void alphaGrad(uint32_t& pixBack, uint32_t pixFront)
    {
        ColorGradient::template alphaGrad<M, N>(pixBack, pixFront);
    }


    //bottom row: quarter blend on the left, three quarters on the right
    template <class Matrix>
    static void blendLineShallow(uint32_t col, Matrix& out)
    {
        alphaGrad<1, 4>(out.template ref<1, 0>(), col);
        alphaGrad<3, 4>(out.template ref<1, 1>(), col);
    }

    //right column: quarter blend on top, three quarters at the bottom
    template <class Matrix>
    static void blendLineSteep(uint32_t col, Matrix& out)
    {
        alphaGrad<1, 4>(out.template ref<0, 1>(), col);
        alphaGrad<3, 4>(out.template ref<1, 1>(), col);
    }

    template <class Matrix>
    static void blendLineSteepAndShallow(uint32_t col, Matrix& out)
    {
        alphaGrad<1, 4>(out.template ref<scale - 1, 0>(), col);
        alphaGrad<1, 4>(out.template ref<0, scale - 1>(), col);
        alphaGrad<5, 6>(out.template ref<scale - 1, scale - 1>(), col); //[!] fixes 7/8 used in xBR
    }

    template <class Matrix>
    static void blendLineDiagonal(uint32_t col, Matrix& out)
    {
        uint32_t& corner = out.template ref<scale - 1, scale - 1>();
        alphaGrad<1, 2>(corner, col);
    }

    //model a round corner
    template <class Matrix>
    static void blendCorner(uint32_t col, Matrix& out)
    {
        uint32_t& corner = out.template ref<scale - 1, scale - 1>();
        alphaGrad<21, 100>(corner, col); //exact: 1 - pi/4 = 0.2146018366
    }
};
00688 
00689 
//scaler policy for factor 3: each blend function modifies the 3x3 output block reachable through "out"
//ColorGradient supplies the actual color mixing via alphaGrad<M, N>()
template <class ColorGradient>
struct Scaler3x : public ColorGradient
{
    static const int scale = 3;

    //bring template function into scope for GCC
    template <unsigned int M, unsigned int N>
    static void alphaGrad(uint32_t& pixBack, uint32_t pixFront)
    {
        ColorGradient::template alphaGrad<M, N>(pixBack, pixFront);
    }


    template <class Matrix>
    static void blendLineShallow(uint32_t col, Matrix& out)
    {
        alphaGrad<1, 4>(out.template ref<2, 0>(), col);
        alphaGrad<1, 4>(out.template ref<1, 2>(), col);

        alphaGrad<3, 4>(out.template ref<2, 1>(), col);
        out.template ref<2, 2>() = col; //corner pixel takes the color completely
    }

    template <class Matrix>
    static void blendLineSteep(uint32_t col, Matrix& out)
    {
        alphaGrad<1, 4>(out.template ref<0, 2>(), col);
        alphaGrad<1, 4>(out.template ref<2, 1>(), col);

        alphaGrad<3, 4>(out.template ref<1, 2>(), col);
        out.template ref<2, 2>() = col; //corner pixel takes the color completely
    }

    template <class Matrix>
    static void blendLineSteepAndShallow(uint32_t col, Matrix& out)
    {
        alphaGrad<1, 4>(out.template ref<scale - 1, 0>(), col);
        alphaGrad<1, 4>(out.template ref<0, scale - 1>(), col);
        alphaGrad<3, 4>(out.template ref<scale - 1, 1>(), col);
        alphaGrad<3, 4>(out.template ref<1, scale - 1>(), col);
        out.template ref<scale - 1, scale - 1>() = col;
    }

    template <class Matrix>
    static void blendLineDiagonal(uint32_t col, Matrix& out)
    {
        alphaGrad<1, 8>(out.template ref<scale - 2, scale - 1>(), col); //conflict with other rotations for this odd scale
        alphaGrad<1, 8>(out.template ref<scale - 1, scale - 2>(), col);
        alphaGrad<7, 8>(out.template ref<scale - 1, scale - 1>(), col); //
    }

    //model a round corner
    template <class Matrix>
    static void blendCorner(uint32_t col, Matrix& out)
    {
        uint32_t& corner = out.template ref<scale - 1, scale - 1>();
        alphaGrad<45, 100>(corner, col); //exact: 0.4545939598
        //alphaGrad<7, 256>(out.template ref<2, 1>(), col); //0.02826017254 -> negligible + avoid conflicts with other rotations for this odd scale
        //alphaGrad<7, 256>(out.template ref<1, 2>(), col); //0.02826017254
    }
};
00746 
00747 
//Blend kernels for scale factor 4.
//Each routine mixes "col" into a fixed set of cells of the output tile: cells are addressed through
//OutputMatrix::ref<I, J>() and the M / N weights are handed on to ColorGradient::alphaGrad<M, N>().
//Coordinates are written out as literals for scale == 4; every cell is touched at most once per routine.
template <class ColorGradient>
struct Scaler4x : public ColorGradient
{
    static const int scale = 4;

    //forwarding wrapper: brings the base-class template function into scope for GCC
    template <unsigned int M, unsigned int N>
    static void alphaGrad(uint32_t& pixBack, uint32_t pixFront)
    {
        ColorGradient::template alphaGrad<M, N>(pixBack, pixFront);
    }


    //shallow line: two solid cells, two 3/4 cells, two 1/4 cells
    template <class OutputMatrix>
    static void blendLineShallow(uint32_t col, OutputMatrix& out)
    {
        out.template ref<3, 2>() = col;
        out.template ref<3, 3>() = col;

        alphaGrad<3, 4>(out.template ref<3, 1>(), col);
        alphaGrad<3, 4>(out.template ref<2, 3>(), col);

        alphaGrad<1, 4>(out.template ref<3, 0>(), col);
        alphaGrad<1, 4>(out.template ref<2, 2>(), col);
    }

    //steep line: same pattern as blendLineShallow() with both indices swapped
    template <class OutputMatrix>
    static void blendLineSteep(uint32_t col, OutputMatrix& out)
    {
        out.template ref<2, 3>() = col;
        out.template ref<3, 3>() = col;

        alphaGrad<3, 4>(out.template ref<1, 3>(), col);
        alphaGrad<3, 4>(out.template ref<3, 2>(), col);

        alphaGrad<1, 4>(out.template ref<0, 3>(), col);
        alphaGrad<1, 4>(out.template ref<2, 2>(), col);
    }

    //steep and shallow line at once
    template <class OutputMatrix>
    static void blendLineSteepAndShallow(uint32_t col, OutputMatrix& out)
    {
        out.template ref<3, 3>() = col;
        out.template ref<3, 2>() = col;
        out.template ref<2, 3>() = col;

        alphaGrad<1, 3>(out.template ref<2, 2>(), col); //[!] fixes 1/4 used in xBR

        alphaGrad<3, 4>(out.template ref<3, 1>(), col);
        alphaGrad<3, 4>(out.template ref<1, 3>(), col);

        alphaGrad<1, 4>(out.template ref<3, 0>(), col);
        alphaGrad<1, 4>(out.template ref<0, 3>(), col);
    }

    //diagonal line
    template <class OutputMatrix>
    static void blendLineDiagonal(uint32_t col, OutputMatrix& out)
    {
        out.template ref<3, 3>() = col;

        alphaGrad<1, 2>(out.template ref<3, 2>(), col);
        alphaGrad<1, 2>(out.template ref<2, 3>(), col);
    }

    //round corner model
    template <class OutputMatrix>
    static void blendCorner(uint32_t col, OutputMatrix& out)
    {
        alphaGrad<68, 100>(out.template ref<3, 3>(), col); //exact: 0.6848532563

        alphaGrad< 9, 100>(out.template ref<3, 2>(), col); //exact: 0.08677704501
        alphaGrad< 9, 100>(out.template ref<2, 3>(), col); //exact: 0.08677704501
    }
};
00815 
00816 
//Blend kernels for scale factor 5.
//Each routine mixes "col" into a fixed set of cells of the output tile: cells are addressed through
//OutputMatrix::ref<I, J>() and the M / N weights are handed on to ColorGradient::alphaGrad<M, N>().
//Coordinates are written out as literals for scale == 5; every cell is touched at most once per routine.
template <class ColorGradient>
struct Scaler5x : public ColorGradient
{
    static const int scale = 5;

    //forwarding wrapper: brings the base-class template function into scope for GCC
    template <unsigned int M, unsigned int N>
    static void alphaGrad(uint32_t& pixBack, uint32_t pixFront)
    {
        ColorGradient::template alphaGrad<M, N>(pixBack, pixFront);
    }


    //shallow line: four solid cells, two 3/4 cells, three 1/4 cells
    template <class OutputMatrix>
    static void blendLineShallow(uint32_t col, OutputMatrix& out)
    {
        out.template ref<4, 2>() = col;
        out.template ref<4, 3>() = col;
        out.template ref<4, 4>() = col;
        out.template ref<3, 4>() = col;

        alphaGrad<3, 4>(out.template ref<4, 1>(), col);
        alphaGrad<3, 4>(out.template ref<3, 3>(), col);

        alphaGrad<1, 4>(out.template ref<4, 0>(), col);
        alphaGrad<1, 4>(out.template ref<3, 2>(), col);
        alphaGrad<1, 4>(out.template ref<2, 4>(), col);
    }

    //steep line: same pattern as blendLineShallow() with both indices swapped
    template <class OutputMatrix>
    static void blendLineSteep(uint32_t col, OutputMatrix& out)
    {
        out.template ref<2, 4>() = col;
        out.template ref<3, 4>() = col;
        out.template ref<4, 4>() = col;
        out.template ref<4, 3>() = col;

        alphaGrad<3, 4>(out.template ref<1, 4>(), col);
        alphaGrad<3, 4>(out.template ref<3, 3>(), col);

        alphaGrad<1, 4>(out.template ref<0, 4>(), col);
        alphaGrad<1, 4>(out.template ref<2, 3>(), col);
        alphaGrad<1, 4>(out.template ref<4, 2>(), col);
    }

    //steep and shallow line at once
    template <class OutputMatrix>
    static void blendLineSteepAndShallow(uint32_t col, OutputMatrix& out)
    {
        //solid cells
        out.template ref<2, 4>() = col;
        out.template ref<3, 4>() = col;
        out.template ref<4, 4>() = col;
        out.template ref<4, 2>() = col;
        out.template ref<4, 3>() = col;

        //the one cell shared by both lines
        alphaGrad<2, 3>(out.template ref<3, 3>(), col);

        //steep part
        alphaGrad<3, 4>(out.template ref<1, 4>(), col);
        alphaGrad<1, 4>(out.template ref<0, 4>(), col);
        alphaGrad<1, 4>(out.template ref<2, 3>(), col);

        //shallow part
        alphaGrad<3, 4>(out.template ref<4, 1>(), col);
        alphaGrad<1, 4>(out.template ref<4, 0>(), col);
        alphaGrad<1, 4>(out.template ref<3, 2>(), col);
    }

    //diagonal line
    template <class OutputMatrix>
    static void blendLineDiagonal(uint32_t col, OutputMatrix& out)
    {
        out.template ref<4, 4>() = col;

        alphaGrad<7, 8>(out.template ref<4, 3>(), col);
        alphaGrad<7, 8>(out.template ref<3, 4>(), col);

        //the three 1/8 cells conflict with other rotations for this odd scale
        alphaGrad<1, 8>(out.template ref<4, 2>(), col);
        alphaGrad<1, 8>(out.template ref<3, 3>(), col);
        alphaGrad<1, 8>(out.template ref<2, 4>(), col);
    }

    //round corner model
    template <class OutputMatrix>
    static void blendCorner(uint32_t col, OutputMatrix& out)
    {
        alphaGrad<86, 100>(out.template ref<4, 4>(), col); //exact: 0.8631434088

        alphaGrad<23, 100>(out.template ref<4, 3>(), col); //exact: 0.2306749731
        alphaGrad<23, 100>(out.template ref<3, 4>(), col); //exact: 0.2306749731

        //intentionally not applied: 0.01676812367 is negligible + avoids conflicts with other rotations for this odd scale
        //alphaGrad<1, 64>(out.template ref<4, 2>(), col); //0.01676812367
        //alphaGrad<1, 64>(out.template ref<2, 4>(), col); //0.01676812367
    }
};
00903 
00904 
//Blend kernels for scale factor 6.
//Each routine mixes "col" into a fixed set of cells of the output tile: cells are addressed through
//OutputMatrix::ref<I, J>() and the M / N weights are handed on to ColorGradient::alphaGrad<M, N>().
//Coordinates are written out as literals for scale == 6; every cell is touched at most once per routine.
template <class ColorGradient>
struct Scaler6x : public ColorGradient
{
    static const int scale = 6;

    //forwarding wrapper: brings the base-class template function into scope for GCC
    template <unsigned int M, unsigned int N>
    static void alphaGrad(uint32_t& pixBack, uint32_t pixFront)
    {
        ColorGradient::template alphaGrad<M, N>(pixBack, pixFront);
    }


    //shallow line: six solid cells, three 3/4 cells, three 1/4 cells
    template <class OutputMatrix>
    static void blendLineShallow(uint32_t col, OutputMatrix& out)
    {
        out.template ref<5, 2>() = col;
        out.template ref<5, 3>() = col;
        out.template ref<5, 4>() = col;
        out.template ref<5, 5>() = col;
        out.template ref<4, 4>() = col;
        out.template ref<4, 5>() = col;

        alphaGrad<3, 4>(out.template ref<5, 1>(), col);
        alphaGrad<3, 4>(out.template ref<4, 3>(), col);
        alphaGrad<3, 4>(out.template ref<3, 5>(), col);

        alphaGrad<1, 4>(out.template ref<5, 0>(), col);
        alphaGrad<1, 4>(out.template ref<4, 2>(), col);
        alphaGrad<1, 4>(out.template ref<3, 4>(), col);
    }

    //steep line: same pattern as blendLineShallow() with both indices swapped
    template <class OutputMatrix>
    static void blendLineSteep(uint32_t col, OutputMatrix& out)
    {
        out.template ref<2, 5>() = col;
        out.template ref<3, 5>() = col;
        out.template ref<4, 5>() = col;
        out.template ref<5, 5>() = col;
        out.template ref<4, 4>() = col;
        out.template ref<5, 4>() = col;

        alphaGrad<3, 4>(out.template ref<1, 5>(), col);
        alphaGrad<3, 4>(out.template ref<3, 4>(), col);
        alphaGrad<3, 4>(out.template ref<5, 3>(), col);

        alphaGrad<1, 4>(out.template ref<0, 5>(), col);
        alphaGrad<1, 4>(out.template ref<2, 4>(), col);
        alphaGrad<1, 4>(out.template ref<4, 3>(), col);
    }

    //steep and shallow line at once
    template <class OutputMatrix>
    static void blendLineSteepAndShallow(uint32_t col, OutputMatrix& out)
    {
        //solid cells
        out.template ref<2, 5>() = col;
        out.template ref<3, 5>() = col;
        out.template ref<4, 5>() = col;
        out.template ref<5, 5>() = col;
        out.template ref<4, 4>() = col;
        out.template ref<5, 4>() = col;
        out.template ref<5, 2>() = col;
        out.template ref<5, 3>() = col;

        //steep part
        alphaGrad<3, 4>(out.template ref<1, 5>(), col);
        alphaGrad<3, 4>(out.template ref<3, 4>(), col);
        alphaGrad<1, 4>(out.template ref<0, 5>(), col);
        alphaGrad<1, 4>(out.template ref<2, 4>(), col);

        //shallow part
        alphaGrad<3, 4>(out.template ref<5, 1>(), col);
        alphaGrad<3, 4>(out.template ref<4, 3>(), col);
        alphaGrad<1, 4>(out.template ref<5, 0>(), col);
        alphaGrad<1, 4>(out.template ref<4, 2>(), col);
    }

    //diagonal line
    template <class OutputMatrix>
    static void blendLineDiagonal(uint32_t col, OutputMatrix& out)
    {
        out.template ref<4, 5>() = col;
        out.template ref<5, 5>() = col;
        out.template ref<5, 4>() = col;

        alphaGrad<1, 2>(out.template ref<5, 3>(), col);
        alphaGrad<1, 2>(out.template ref<4, 4>(), col);
        alphaGrad<1, 2>(out.template ref<3, 5>(), col);
    }

    //round corner model
    template <class OutputMatrix>
    static void blendCorner(uint32_t col, OutputMatrix& out)
    {
        alphaGrad<97, 100>(out.template ref<5, 5>(), col); //exact: 0.9711013910

        alphaGrad<42, 100>(out.template ref<4, 5>(), col); //exact: 0.4236372243
        alphaGrad<42, 100>(out.template ref<5, 4>(), col); //exact: 0.4236372243

        alphaGrad< 6, 100>(out.template ref<5, 3>(), col); //exact: 0.05652034508
        alphaGrad< 6, 100>(out.template ref<3, 5>(), col); //exact: 0.05652034508
    }
};
01002 
01003 //------------------------------------------------------------------------------------
01004 
01005 struct ColorDistanceRGB
01006 {
01007     static double dist(uint32_t pix1, uint32_t pix2, double luminanceWeight)
01008     {
01009         (void)luminanceWeight;
01010 
01011         return distYCbCrBuffered(pix1, pix2);
01012 
01013         //if (pix1 == pix2) //about 4% perf boost
01014         //    return 0;
01015         //return distYCbCr(pix1, pix2, luminanceWeight);
01016     }
01017 };
01018 
01019 struct ColorDistanceARGB
01020 {
01021     static double dist(uint32_t pix1, uint32_t pix2, double luminanceWeight)
01022     {
01023         (void)luminanceWeight;
01024 
01025         const double a1 = getAlpha(pix1) / 255.0 ;
01026         const double a2 = getAlpha(pix2) / 255.0 ;
01027         /*
01028         Requirements for a color distance handling alpha channel: with a1, a2 in [0, 1]
01029 
01030             1. if a1 = a2, distance should be: a1 * distYCbCr()
01031             2. if a1 = 0,  distance should be: a2 * distYCbCr(black, white) = a2 * 255
01032             3. if a1 = 1,  ??? maybe: 255 * (1 - a2) + a2 * distYCbCr()
01033         */
01034 
01035         //return std::min(a1, a2) * distYCbCrBuffered(pix1, pix2) + 255 * abs(a1 - a2);
01036         //=> following code is 15% faster:
01037         const double d = distYCbCrBuffered(pix1, pix2);
01038         if (a1 < a2)
01039             return a1 * d + 255 * (a2 - a1);
01040         else
01041             return a2 * d + 255 * (a1 - a2);
01042 
01043         //alternative? return std::sqrt(a1 * a2 * square(distYCbCrBuffered(pix1, pix2)) + square(255 * (a1 - a2)));
01044     }
01045 };
01046 
01047 
01048 struct ColorDistanceUnbufferedARGB
01049 {
01050     static double dist(uint32_t pix1, uint32_t pix2, double luminanceWeight)
01051     {
01052         const double a1 = getAlpha(pix1) / 255.0 ;
01053         const double a2 = getAlpha(pix2) / 255.0 ;
01054 
01055         const double d = distYCbCr(pix1, pix2, luminanceWeight);
01056         if (a1 < a2)
01057             return a1 * d + 255 * (a2 - a1);
01058         else
01059             return a2 * d + 255 * (a1 - a2);
01060     }
01061 };
01062 
01063 
01064 struct ColorGradientRGB
01065 {
01066     template <unsigned int M, unsigned int N>
01067     static void alphaGrad(uint32_t& pixBack, uint32_t pixFront)
01068     {
01069         pixBack = gradientRGB<M, N>(pixFront, pixBack);
01070     }
01071 };
01072 
01073 struct ColorGradientARGB
01074 {
01075     template <unsigned int M, unsigned int N>
01076     static void alphaGrad(uint32_t& pixBack, uint32_t pixFront)
01077     {
01078         pixBack = gradientARGB<M, N>(pixFront, pixBack);
01079     }
01080 };
01081 }
01082 
01083 
01084 void xbrz::scale(size_t factor, const uint32_t* src, uint32_t* trg, int srcWidth, int srcHeight, ColorFormat colFmt, const xbrz::ScalerCfg& cfg, int yFirst, int yLast)
01085 {
01086     static_assert(SCALE_FACTOR_MAX == 6, "");
01087     switch (colFmt)
01088     {
01089         case ColorFormat::RGB:
01090             switch (factor)
01091             {
01092                 case 2:
01093                     return scaleImage<Scaler2x<ColorGradientRGB>, ColorDistanceRGB>(src, trg, srcWidth, srcHeight, cfg, yFirst, yLast);
01094                 case 3:
01095                     return scaleImage<Scaler3x<ColorGradientRGB>, ColorDistanceRGB>(src, trg, srcWidth, srcHeight, cfg, yFirst, yLast);
01096                 case 4:
01097                     return scaleImage<Scaler4x<ColorGradientRGB>, ColorDistanceRGB>(src, trg, srcWidth, srcHeight, cfg, yFirst, yLast);
01098                 case 5:
01099                     return scaleImage<Scaler5x<ColorGradientRGB>, ColorDistanceRGB>(src, trg, srcWidth, srcHeight, cfg, yFirst, yLast);
01100                 case 6:
01101                     return scaleImage<Scaler6x<ColorGradientRGB>, ColorDistanceRGB>(src, trg, srcWidth, srcHeight, cfg, yFirst, yLast);
01102             }
01103             break;
01104 
01105         case ColorFormat::ARGB:
01106             switch (factor)
01107             {
01108                 case 2:
01109                     return scaleImage<Scaler2x<ColorGradientARGB>, ColorDistanceARGB>(src, trg, srcWidth, srcHeight, cfg, yFirst, yLast);
01110                 case 3:
01111                     return scaleImage<Scaler3x<ColorGradientARGB>, ColorDistanceARGB>(src, trg, srcWidth, srcHeight, cfg, yFirst, yLast);
01112                 case 4:
01113                     return scaleImage<Scaler4x<ColorGradientARGB>, ColorDistanceARGB>(src, trg, srcWidth, srcHeight, cfg, yFirst, yLast);
01114                 case 5:
01115                     return scaleImage<Scaler5x<ColorGradientARGB>, ColorDistanceARGB>(src, trg, srcWidth, srcHeight, cfg, yFirst, yLast);
01116                 case 6:
01117                     return scaleImage<Scaler6x<ColorGradientARGB>, ColorDistanceARGB>(src, trg, srcWidth, srcHeight, cfg, yFirst, yLast);
01118             }
01119             break;
01120 
01121         case ColorFormat::ARGB_UNBUFFERED:
01122             switch (factor)
01123             {
01124                 case 2:
01125                     return scaleImage<Scaler2x<ColorGradientARGB>, ColorDistanceUnbufferedARGB>(src, trg, srcWidth, srcHeight, cfg, yFirst, yLast);
01126                 case 3:
01127                     return scaleImage<Scaler3x<ColorGradientARGB>, ColorDistanceUnbufferedARGB>(src, trg, srcWidth, srcHeight, cfg, yFirst, yLast);
01128                 case 4:
01129                     return scaleImage<Scaler4x<ColorGradientARGB>, ColorDistanceUnbufferedARGB>(src, trg, srcWidth, srcHeight, cfg, yFirst, yLast);
01130                 case 5:
01131                     return scaleImage<Scaler5x<ColorGradientARGB>, ColorDistanceUnbufferedARGB>(src, trg, srcWidth, srcHeight, cfg, yFirst, yLast);
01132                 case 6:
01133                     return scaleImage<Scaler6x<ColorGradientARGB>, ColorDistanceUnbufferedARGB>(src, trg, srcWidth, srcHeight, cfg, yFirst, yLast);
01134             }
01135             break;
01136     }
01137     assert(false);
01138 }
01139 
01140 
01141 bool xbrz::equalColorTest(uint32_t col1, uint32_t col2, ColorFormat colFmt, double luminanceWeight, double equalColorTolerance)
01142 {
01143     switch (colFmt)
01144     {
01145         case ColorFormat::RGB:
01146             return ColorDistanceRGB::dist(col1, col2, luminanceWeight) < equalColorTolerance;
01147         case ColorFormat::ARGB:
01148             return ColorDistanceARGB::dist(col1, col2, luminanceWeight) < equalColorTolerance;
01149         case ColorFormat::ARGB_UNBUFFERED:
01150             return ColorDistanceUnbufferedARGB::dist(col1, col2, luminanceWeight) < equalColorTolerance;
01151     }
01152     assert(false);
01153     return false;
01154 }
01155 
01156 
01157 void xbrz::bilinearScale(const uint32_t* src, int srcWidth, int srcHeight,
01158                            uint32_t* trg, int trgWidth, int trgHeight)
01159 {
01160     bilinearScale(src, srcWidth, srcHeight, srcWidth * (int)sizeof(uint32_t),
01161                   trg, trgWidth, trgHeight, trgWidth * (int)sizeof(uint32_t),
01162     0, trgHeight, [](uint32_t pix) { return pix; });
01163 }
01164 
01165 
01166 void xbrz::nearestNeighborScale(const uint32_t* src, int srcWidth, int srcHeight,
01167                                   uint32_t* trg, int trgWidth, int trgHeight)
01168 {
01169     nearestNeighborScale(src, srcWidth, srcHeight, srcWidth * (int)sizeof(uint32_t),
01170                          trg, trgWidth, trgHeight, trgWidth * (int)sizeof(uint32_t),
01171     0, trgHeight, [](uint32_t pix) { return pix; });
01172 }
01173 
01174 
01175 #if 0
01176 //#include <ppl.h>
// Bilinear scaling split into tasks: the target rows are cut into chunks of TASK_GRANULARITY rows,
// each chunk is scaled by xbrz::bilinearScale() inside a concurrency::task_group task, and the
// function returns once all tasks have finished.
// NOTE(review): this function sits inside an "#if 0" region and is therefore not compiled; the
// <ppl.h> include it would need is commented out above.
01177 void bilinearScaleCpu(const uint32_t* src, int srcWidth, int srcHeight,
01178                         uint32_t* trg, int trgWidth, int trgHeight)
01179 {
01180     const int TASK_GRANULARITY = 16; //number of target rows handled by one task
01181 
01182     concurrency::task_group tg;
01183 
01184     for (int i = 0; i < trgHeight; i += TASK_GRANULARITY)
01185         tg.run([=] //captures by copy, so every task keeps its own "i"
01186     {
01187         const int iLast = std::min(i + TASK_GRANULARITY, trgHeight); //last chunk may be shorter
01188         xbrz::bilinearScale(src, srcWidth, srcHeight, srcWidth * sizeof(uint32_t),
01189                             trg, trgWidth, trgHeight, trgWidth * sizeof(uint32_t),
01190         i, iLast, [](uint32_t pix) { return pix; }); //rows [i, iLast), identity pixel converter
01191     });
01192     tg.wait(); //block until every chunk is done
01193 }
01194 
01195 
01196 //Perf: AMP vs CPU: merely ~10% shorter runtime (scaling 1280x800 -> 1920x1080)
01197 //#include <amp.h>
// Bilinear scaling through C++ AMP: one parallel_for_each invocation per target pixel.
// For every target pixel the two neighbouring source rows/columns are looked up, each of the four
// 8-bit channels is interpolated separately in float, rounded by adding 0.5f and repacked into a
// 32-bit pixel. Returns without doing anything if srcWidth or srcHeight is <= 0.
// Rows are assumed to be tightly packed: pitch is not supported yet (see TODO below).
// NOTE(review): this function sits inside an "#if 0" region and is therefore not compiled; the
// <amp.h> include it would need is commented out above.
01198 void bilinearScaleAmp(const uint32_t* src, int srcWidth, int srcHeight, //throw concurrency::runtime_exception
01199                         uint32_t* trg, int trgWidth, int trgHeight)
01200 {
01201     //C++ AMP reference:       https://msdn.microsoft.com/en-us/library/hh289390.aspx
01202     //introduction to C++ AMP: https://msdn.microsoft.com/en-us/magazine/hh882446.aspx
01203     using namespace concurrency;
01204     //TODO: pitch
01205 
01206     if (srcHeight <= 0 || srcWidth <= 0) return; //also protects the divisions by srcWidth/srcHeight below
01207 
01208     const float scaleX = static_cast<float>(trgWidth ) / srcWidth;
01209     const float scaleY = static_cast<float>(trgHeight) / srcHeight;
01210 
01211     array_view<const uint32_t, 2> srcView(srcHeight, srcWidth, src);
01212     array_view<      uint32_t, 2> trgView(trgHeight, trgWidth, trg);
01213     trgView.discard_data(); //NOTE(review): presumably marks the current target content as not needed on the accelerator - confirm against the AMP docs
01214 
01215     parallel_for_each(trgView.extent, [=](index<2> idx) restrict(amp) //throw ?
01216     {
01217         const int y = idx[0]; //target row
01218         const int x = idx[1]; //target column
01219         //Perf notes:
01220         //    -> float-based calculation is (almost 2x) faster than double!
01221         //    -> no noticeable improvement via tiling: https://msdn.microsoft.com/en-us/magazine/hh882447.aspx
01222         //    -> no noticeable improvement with restrict(amp,cpu)
01223         //    -> iterating over y-axis only is significantly slower!
01224         //    -> pre-calculating x,y-dependent variables in a buffer + array_view<> is ~ 20 % slower!
01225         const int y1 = srcHeight * y / trgHeight; //upper source row (integer division)
01226         int y2 = y1 + 1; //lower source row
01227         if (y2 == srcHeight) --y2; //clamp to the last source row
01228 
01229         const float yy1 = y / scaleY - y1; //vertical weight of row y2
01230         const float y2y = 1 - yy1; //vertical weight of row y1
01231         //-------------------------------------
01232         const int x1 = srcWidth * x / trgWidth; //left source column (integer division)
01233         int x2 = x1 + 1; //right source column
01234         if (x2 == srcWidth) --x2; //clamp to the last source column
01235 
01236         const float xx1 = x / scaleX - x1; //horizontal weight of column x2
01237         const float x2x = 1 - xx1; //horizontal weight of column x1
01238         //-------------------------------------
        //combined weights of the four neighbouring source pixels
01239         const float x2xy2y = x2x * y2y;
01240         const float xx1y2y = xx1 * y2y;
01241         const float x2xyy1 = x2x * yy1;
01242         const float xx1yy1 = xx1 * yy1;
01243 
        //interpolates the 8-bit channel located at bit position 8 * offset
01244         auto interpolate = [=](int offset)
01245         {
01246             /*
01247                 https://en.wikipedia.org/wiki/Bilinear_interpolation
01248                 (c11(x2 - x) + c21(x - x1)) * (y2 - y ) +
01249                 (c12(x2 - x) + c22(x - x1)) * (y  - y1)
01250             */
01251             const auto c11 = (srcView(y1, x1) >> (8 * offset)) & 0xff;
01252             const auto c21 = (srcView(y1, x2) >> (8 * offset)) & 0xff;
01253             const auto c12 = (srcView(y2, x1) >> (8 * offset)) & 0xff;
01254             const auto c22 = (srcView(y2, x2) >> (8 * offset)) & 0xff;
01255 
01256             return c11 * x2xy2y + c21 * xx1y2y +
01257                    c12 * x2xyy1 + c22 * xx1yy1;
01258         };
01259 
        //channel order from the lowest byte upwards: b, g, r, a
01260         const float bi = interpolate(0);
01261         const float gi = interpolate(1);
01262         const float ri = interpolate(2);
01263         const float ai = interpolate(3);
01264 
        //round to nearest: add 0.5, then truncate
01265         const auto b = static_cast<uint32_t>(bi + 0.5f);
01266         const auto g = static_cast<uint32_t>(gi + 0.5f);
01267         const auto r = static_cast<uint32_t>(ri + 0.5f);
01268         const auto a = static_cast<uint32_t>(ai + 0.5f);
01269 
01270         trgView(y, x) = (a << 24) | (r << 16) | (g << 8) | b; //repack the four channels into one pixel
01271     });
01272     trgView.synchronize(); //throw ?
01273 }
01274 #endif