Yes, the NASM code only includes the BSR optimisation, and I always linked libwavpack statically. The normal modes of WavPack are generally not affected by my optimisations. You'll probably measure the highest speedup with '-x6'.
I'm already working on a few alternatives to the BSR code. We'll see if I can post some positive news next week.
Ah OK, I tried -x6 and now I see a difference - but in fact the MMX version is now slower (~20%). But, as I know that gcc is a bit bitchy, I modified your patch and now it is actually faster (~10%). (I cannot test reliably, as I have background processes running.)
This is not cleaned up... but the trick is to *not* use unions; gcc unfortunately treats them differently. Perhaps someone would like to test whether my version is faster for them as well (using gcc/mingw)? I hope I didn't mess anything up...
--- extra2.c 2006-05-06 23:28:07.000000000 +0200
+++ extra2mmx.c 2006-05-06 15:42:30.000000000 +0200
@@ -57,42 +57,44 @@
if (dpp->term > 0) {
const int_mmx
delta = { dpp->delta, dpp->delta },
- msk0 = { 0x7fff, 0x7fff },
- msk1 = { 0xffff, 0xffff },
+ msk0 = { 0x00007fffL, 0x00007fffL },
+ msk1 = { 0x0000ffffL, 0x0000ffffL },
round = { 512, 512 },
zero = { 0, 0 };
int_mmx left_right, sam_AB, tmp0, tmp1;
- union {
+ /*union {
int_mmx q [MAX_TERM];
int d [2 * MAX_TERM];
- } samples_AB;
- union {
+ } samples_AB;*/
+ int_mmx samples_AB[MAX_TERM];
+ /*union {
int_mmx q;
int d [2];
- } weight_AB, sum_AB;
+ } weight_AB, sum_AB;*/
+ int_mmx weight_AB, sum_AB ={0,0};
- sum_AB.d [0] = 0;
- sum_AB.d [1] = 0;
- weight_AB.d [0] = restore_weight (store_weight (dpp->weight_A));
- weight_AB.d [1] = restore_weight (store_weight (dpp->weight_B));
+ //sum_AB.d [0] = 0;
+ //sum_AB.d [1] = 0;
+ *(int*)&weight_AB = restore_weight (store_weight (dpp->weight_A));
+ *((int*)&weight_AB+1) = restore_weight (store_weight (dpp->weight_B));
for (k = 0; k < MAX_TERM; ++k) {
- samples_AB.d [k * 2] = exp2s (log2s (dpp->samples_A [k]));
- samples_AB.d [k * 2 + 1] = exp2s (log2s (dpp->samples_B [k]));
+ *((int*)&samples_AB + k * 2) = exp2s (log2s (dpp->samples_A [k]));
+ *((int*)&samples_AB + k * 2 + 1) = exp2s (log2s (dpp->samples_B [k]));
}
if (dpp->term == 17) {
while (num_samples--) {
- sam_AB = __builtin_ia32_pslld (samples_AB.q [0], 1);
- sam_AB = __builtin_ia32_psubd (sam_AB, samples_AB.q [1]);
+ sam_AB = __builtin_ia32_pslld (samples_AB [0], 1);
+ sam_AB = __builtin_ia32_psubd (sam_AB, samples_AB [1]);
- samples_AB.q [1] = samples_AB.q [0];
- samples_AB.q [0] = left_right = *(int_mmx *) in_samples;
+ samples_AB [1] = samples_AB [0];
+ samples_AB [0] = left_right = *(int_mmx *) in_samples;
tmp0 = __builtin_ia32_psrld (sam_AB, 15);
tmp1 = __builtin_ia32_pand (sam_AB, msk0);
tmp0 = __builtin_ia32_pand (tmp0, msk1);
- tmp1 = __builtin_ia32_pmaddwd (tmp1, weight_AB.q);
- tmp0 = __builtin_ia32_pmaddwd (tmp0, weight_AB.q);
+ tmp1 = __builtin_ia32_pmaddwd (tmp1, weight_AB);
+ tmp0 = __builtin_ia32_pmaddwd (tmp0, weight_AB);
tmp1 = __builtin_ia32_paddd (tmp1, round);
tmp0 = __builtin_ia32_pslld (tmp0, 5);
tmp1 = __builtin_ia32_psrad (tmp1, 10);
@@ -107,9 +109,9 @@
tmp0 = __builtin_ia32_pcmpeqd (left_right, zero);
tmp0 = __builtin_ia32_por (tmp0, sam_AB);
tmp0 = __builtin_ia32_pandn (tmp0, tmp1);
- weight_AB.q = __builtin_ia32_paddd (weight_AB.q, tmp0);
+ weight_AB = __builtin_ia32_paddd (weight_AB, tmp0);
- sum_AB.q = __builtin_ia32_paddd (sum_AB.q, weight_AB.q);
+ sum_AB = __builtin_ia32_paddd (sum_AB, weight_AB);
*(int_mmx *) out_samples = left_right;
@@ -119,20 +121,20 @@
}
else if (dpp->term == 18) {
while (num_samples--) {
- tmp0 = samples_AB.q [0];
- sam_AB = __builtin_ia32_psubd (tmp0, samples_AB.q [1]);
+ tmp0 = samples_AB [0];
+ sam_AB = __builtin_ia32_psubd (tmp0, samples_AB [1]);
tmp0 = __builtin_ia32_pslld (tmp0, 1);
sam_AB = __builtin_ia32_paddd (sam_AB, tmp0);
sam_AB = __builtin_ia32_psrad (sam_AB, 1);
- samples_AB.q [1] = samples_AB.q [0];
- samples_AB.q [0] = left_right = *(int_mmx *) in_samples;
+ samples_AB [1] = samples_AB [0];
+ samples_AB [0] = left_right = *(int_mmx *) in_samples;
tmp0 = __builtin_ia32_psrld (sam_AB, 15);
tmp1 = __builtin_ia32_pand (sam_AB, msk0);
tmp0 = __builtin_ia32_pand (tmp0, msk1);
- tmp1 = __builtin_ia32_pmaddwd (tmp1, weight_AB.q);
- tmp0 = __builtin_ia32_pmaddwd (tmp0, weight_AB.q);
+ tmp1 = __builtin_ia32_pmaddwd (tmp1, weight_AB);
+ tmp0 = __builtin_ia32_pmaddwd (tmp0, weight_AB);
tmp1 = __builtin_ia32_paddd (tmp1, round);
tmp0 = __builtin_ia32_pslld (tmp0, 5);
tmp1 = __builtin_ia32_psrad (tmp1, 10);
@@ -147,9 +149,9 @@
tmp0 = __builtin_ia32_pcmpeqd (left_right, zero);
tmp0 = __builtin_ia32_por (tmp0, sam_AB);
tmp0 = __builtin_ia32_pandn (tmp0, tmp1);
- weight_AB.q = __builtin_ia32_paddd (weight_AB.q, tmp0);
+ weight_AB = __builtin_ia32_paddd (weight_AB, tmp0);
- sum_AB.q = __builtin_ia32_paddd (sum_AB.q, weight_AB.q);
+ sum_AB = __builtin_ia32_paddd (sum_AB, weight_AB);
*(int_mmx *) out_samples = left_right;
@@ -161,14 +163,14 @@
while (num_samples--) {
k = (m + dpp->term) & (MAX_TERM - 1);
- sam_AB = samples_AB.q [m];
- samples_AB.q [k] = left_right = *(int_mmx *) in_samples;
+ sam_AB = samples_AB [m];
+ samples_AB [k] = left_right = *(int_mmx *) in_samples;
tmp0 = __builtin_ia32_psrld (sam_AB, 15);
tmp1 = __builtin_ia32_pand (sam_AB, msk0);
tmp0 = __builtin_ia32_pand (tmp0, msk1);
- tmp1 = __builtin_ia32_pmaddwd (tmp1, weight_AB.q);
- tmp0 = __builtin_ia32_pmaddwd (tmp0, weight_AB.q);
+ tmp1 = __builtin_ia32_pmaddwd (tmp1, weight_AB);
+ tmp0 = __builtin_ia32_pmaddwd (tmp0, weight_AB);
tmp1 = __builtin_ia32_paddd (tmp1, round);
tmp0 = __builtin_ia32_pslld (tmp0, 5);
tmp1 = __builtin_ia32_psrad (tmp1, 10);
@@ -183,9 +185,9 @@
tmp0 = __builtin_ia32_pcmpeqd (left_right, zero);
tmp0 = __builtin_ia32_por (tmp0, sam_AB);
tmp0 = __builtin_ia32_pandn (tmp0, tmp1);
- weight_AB.q = __builtin_ia32_paddd (weight_AB.q, tmp0);
+ weight_AB = __builtin_ia32_paddd (weight_AB, tmp0);
- sum_AB.q = __builtin_ia32_paddd (sum_AB.q, weight_AB.q);
+ sum_AB = __builtin_ia32_paddd (sum_AB, weight_AB);
*(int_mmx *) out_samples = left_right;
@@ -194,13 +196,13 @@
m = (m + 1) & (MAX_TERM - 1);
}
}
- dpp->sum_A = sum_AB.d [0];
- dpp->sum_B = sum_AB.d [1];
- dpp->weight_A = weight_AB.d [0];
- dpp->weight_B = weight_AB.d [1];
+ dpp->sum_A = *(int*)&sum_AB;
+ dpp->sum_B = *((int*)&sum_AB+1);
+ dpp->weight_A = *(int*)&weight_AB;
+ dpp->weight_B = *((int*)&weight_AB+1);
for (k = 0; k < MAX_TERM; ++k) {
- dpp->samples_A [k] = samples_AB.d [m * 2];
- dpp->samples_B [k] = samples_AB.d [m * 2 + 1];
+ dpp->samples_A [k] = *((int*)&samples_AB+m * 2);
+ dpp->samples_B [k] = *((int*)&samples_AB+ m * 2 + 1);
m = (m + 1) & (MAX_TERM - 1);
}
__builtin_ia32_emms ();
[size=1]Moderation: CODE to CODEBOX[/size]