[Frei0r] [PATCH] Optimize the balanc0r plugin using SSE2

Author: Steinar H. Gunderson
Date:
To: frei0r
Subject: [Frei0r] [PATCH] Optimize the balanc0r plugin using SSE2

Hi,

Before I abandoned the balanc0r plugin altogether, I made some efforts to
optimize it (and to make the surrounding processes in Kdenlive/MLT less slow;
see http://www.kdenlive.org/mantis/view.php?id=2727). Maybe the patch will be
useful to someone, so I'm sending it on instead of letting it rot in my own
directory.

/* Steinar */

commit acdb08c2e3c63d256a73d6db119fe9a5a0b9567c
Author: Steinar H. Gunderson <sgunderson@???>
Date: Mon Sep 10 22:03:16 2012 +0200

    Optimize the balanc0r plugin using SSE2.

    There is no good reason why this plugin should work in floating point;
    the conversions back and forth are very slow. Moving it to fixed point
    also enabled SSE2-optimizing it.

    Also make sure CLAMP0255() is inlined (it was not in my build).

    On a rough estimate, this makes the plugin four times as fast on my
    Core i5.

diff --git a/include/frei0r_math.h b/include/frei0r_math.h
index cf84eb4..ad625db 100644
--- a/include/frei0r_math.h
+++ b/include/frei0r_math.h
@@ -18,7 +18,7 @@

 /* Clamps a int32-range int between 0 and 255 inclusive. */
 #ifndef CLAMP0255
-unsigned char CLAMP0255(int32_t a)
+static inline unsigned char CLAMP0255(int32_t a)
 {
   return (unsigned char)
     ( (((-a) >> 31) & a)  // 0 if the number was negative
diff --git a/src/filter/balanc0r/balanc0r.c b/src/filter/balanc0r/balanc0r.c
index c52bc15..d08ff42 100644
--- a/src/filter/balanc0r/balanc0r.c
+++ b/src/filter/balanc0r/balanc0r.c
@@ -27,6 +27,10 @@
 #include <math.h>
 #include <stdio.h>

+#ifdef __SSE2__
+#include <emmintrin.h>
+#endif
+
 #include "frei0r.h"
 #include "frei0r_math.h"

@@ -684,19 +688,67 @@ void f0r_get_param_value(f0r_instance_t instance,

 }

+/*
+ * Clamp x to the inclusive range [low, high].
+ *
+ * Takes long so that the result of lrint() can be passed in without being
+ * truncated to int before the range check.
+ */
+static long clamp(long x, long low, long high)
+{
+    if (x < low) {
+        return low;
+    }
+    if (x > high) {
+        return high;
+    }
+    return x;
+}
+
 void f0r_update(f0r_instance_t instance, double time,
         const uint32_t* inframe, uint32_t* outframe)
 {
     assert(instance);
     balanc0r_instance_t* inst = (balanc0r_instance_t*)instance;
-    unsigned int         len  = inst->width * inst->height + 1;
+    unsigned int         len  = inst->width * inst->height;
     unsigned char*       dst = (unsigned char*)outframe;
     const unsigned char* src = (unsigned char*)inframe;
+
+    // Convert the multiplicative constants to 8.8 fixed point (256 == 1.0),
+    // limited to what fits in an unsigned 16-bit lane.
+    int mr = (int)clamp(lrint(inst->mr * 256.0), 0, 65535);
+    int mg = (int)clamp(lrint(inst->mg * 256.0), 0, 65535);
+    int mb = (int)clamp(lrint(inst->mb * 256.0), 0, 65535);
+
+#ifdef __SSE2__
+    // Four pixels (16 bytes) per iteration; alpha is multiplied by 256 (1.0).
+    const __m128i mulfac = _mm_setr_epi16((short)mr, (short)mg, (short)mb, 256,
+                                          (short)mr, (short)mg, (short)mb, 256);
+    const __m128i zero = _mm_setzero_si128();
+    const __m128i max8 = _mm_set1_epi16(255);
+    while (len >= 4) {
+        __m128i lohi = _mm_loadu_si128((const __m128i *)src);
+        // Unpacking with zero as the low byte yields channel << 8, so the
+        // high half of the 16x16 product is (channel * factor) >> 8.
+        __m128i lo = _mm_mulhi_epu16(_mm_unpacklo_epi8(zero, lohi), mulfac);
+        __m128i hi = _mm_mulhi_epu16(_mm_unpackhi_epi8(zero, lohi), mulfac);
+        // _mm_packus_epi16() reads its input as signed, so a product above
+        // 32767 would pack to 0 instead of 255. Clamp as unsigned first:
+        // x - max(x - 255, 0) == min(x, 255).
+        lo = _mm_sub_epi16(lo, _mm_subs_epu16(lo, max8));
+        hi = _mm_sub_epi16(hi, _mm_subs_epu16(hi, max8));
+        _mm_storeu_si128((__m128i *)dst, _mm_packus_epi16(lo, hi));
+        len -= 4;
+        src += 16;
+        dst += 16;
+    }
+#endif

-    while (--len) {
-        *dst++ = CLAMP0255(*src++ * inst->mr);
-        *dst++ = CLAMP0255(*src++ * inst->mg);
-        *dst++ = CLAMP0255(*src++ * inst->mb);
+    // Scalar path: whole frame without SSE2, otherwise the last 0-3 pixels.
+    while (len--) {
+        *dst++ = CLAMP0255((*src++ * mr) >> 8);
+        *dst++ = CLAMP0255((*src++ * mg) >> 8);
+        *dst++ = CLAMP0255((*src++ * mb) >> 8);
         *dst++ = *src++;  // copy alpha
     }
 }
-- 
Homepage: http://www.sesse.net/

This message is part of the following thread:
	the complete thread tree sorted by date

	Dan Dennedy at

Donate to Dyne.org