Hi,
Before I abandoned the balanc0r plugin altogether, I made some efforts to
optimize it (and to make the surrounding processes in Kdenlive/MLT less slow;
see
http://www.kdenlive.org/mantis/view.php?id=2727). Maybe the patch will be
useful to someone, so I'm sending it on instead of letting it rot in my own
directory.
/* Steinar */
commit acdb08c2e3c63d256a73d6db119fe9a5a0b9567c
Author: Steinar H. Gunderson <sgunderson@???>
Date: Mon Sep 10 22:03:16 2012 +0200
Optimize the balanc0r plugin using SSE2.
There is no good reason why this plugin should work in floating point;
the conversions back and forth are very slow. Moving it to fixed point
also enabled SSE2-optimizing it.
Also make sure CLAMP0255() is inlined (it was not in my build).
On a rough estimate, this makes the plugin four times as fast on my
Core i5.
diff --git a/include/frei0r_math.h b/include/frei0r_math.h
index cf84eb4..ad625db 100644
--- a/include/frei0r_math.h
+++ b/include/frei0r_math.h
@@ -18,7 +18,7 @@
/* Clamps a int32-range int between 0 and 255 inclusive. */
#ifndef CLAMP0255
-unsigned char CLAMP0255(int32_t a)
+static inline unsigned char CLAMP0255(int32_t a)
{
return (unsigned char)
( (((-a) >> 31) & a) // 0 if the number was negative
diff --git a/src/filter/balanc0r/balanc0r.c b/src/filter/balanc0r/balanc0r.c
index c52bc15..d08ff42 100644
--- a/src/filter/balanc0r/balanc0r.c
+++ b/src/filter/balanc0r/balanc0r.c
@@ -27,6 +27,10 @@
#include <math.h>
#include <stdio.h>
+#ifdef __SSE2__
+#include <emmintrin.h>
+#endif
+
#include "frei0r.h"
#include "frei0r_math.h"
@@ -684,19 +688,51 @@ void f0r_get_param_value(f0r_instance_t instance,
}
+int clamp(int x, int low, int high)
+{
+ if (x < low) {
+ return low;
+ }
+ if (x > high) {
+ return high;
+ }
+ return x;
+}
+
void f0r_update(f0r_instance_t instance, double time,
const uint32_t* inframe, uint32_t* outframe)
{
assert(instance);
balanc0r_instance_t* inst = (balanc0r_instance_t*)instance;
- unsigned int len = inst->width * inst->height + 1;
+ unsigned int len = inst->width * inst->height;
unsigned char* dst = (unsigned char*)outframe;
const unsigned char* src = (unsigned char*)inframe;
+
+ // Convert the multiplicative constants to 8.8 fixed point.
+ int mr = clamp(lrint(inst->mr * 256.0), 0, 65535);
+ int mg = clamp(lrint(inst->mg * 256.0), 0, 65535);
+ int mb = clamp(lrint(inst->mb * 256.0), 0, 65535);
+
+#ifdef __SSE2__
+ __m128i mulfac = _mm_setr_epi16(mr, mg, mb, 256, mr, mg, mb, 256);
+ __m128i zero = _mm_setzero_si128();
+ while (len >= 4) {
+ __m128i lohi = _mm_loadu_si128((const __m128i *)src);
+ __m128i lo = _mm_unpacklo_epi8(zero, lohi);
+ __m128i hi = _mm_unpackhi_epi8(zero, lohi);
+ lo = _mm_mulhi_epu16(lo, mulfac);
+ hi = _mm_mulhi_epu16(hi, mulfac);
+ _mm_storeu_si128((__m128i *)dst, _mm_packus_epi16(lo, hi));
+ len -= 4;
+ src += 16;
+ dst += 16;
+ }
+#endif
- while (--len) {
- *dst++ = CLAMP0255(*src++ * inst->mr);
- *dst++ = CLAMP0255(*src++ * inst->mg);
- *dst++ = CLAMP0255(*src++ * inst->mb);
+ while (len--) {
+ *dst++ = CLAMP0255((*src++ * mr) >> 8);
+ *dst++ = CLAMP0255((*src++ * mg) >> 8);
+ *dst++ = CLAMP0255((*src++ * mb) >> 8);
*dst++ = *src++; // copy alpha
}
}
--
Homepage: http://www.sesse.net/