[quote author=he-jo link=msg=389893 date=1146931699]Yes, the NASM code only includes the BSR optimisation, and I always linked libwavpack statically. The normal modes of WavPack are generally not affected by my optimisations. You'll probably measure the highest speedup with '-x6'.

I'm already working on a few alternatives to the BSR code. Will see, if I can post some positive news next week.[/quote]

Ah OK, I tried -x6 and now I see a difference - but in fact the MMX version is now slower (~20%). However, as I know that gcc is a bit bitchy, I modified your patch, and now it is actually faster (~10%). (I cannot test reliably, as I have background processes running.)

This is not cleaned up... but the trick is to *not* use unions; gcc unfortunately treats them differently. Perhaps someone would like to test whether my version is faster for them as well (using gcc/mingw)? I hope I didn't mess anything up...

--- extra2.c 2006-05-06 23:28:07.000000000 +0200

+++ extra2mmx.c 2006-05-06 15:42:30.000000000 +0200

@@ -57,42 +57,44 @@

if (dpp->term > 0) {

const int_mmx

delta = { dpp->delta, dpp->delta },

- msk0 = { 0x7fff, 0x7fff },

- msk1 = { 0xffff, 0xffff },

+ msk0 = { 0x00007fffL, 0x00007fffL },

+ msk1 = { 0x0000ffffL, 0x0000ffffL },

round = { 512, 512 },

zero = { 0, 0 };

int_mmx left_right, sam_AB, tmp0, tmp1;

- union {

+ /*union {

int_mmx q [MAX_TERM];

int d [2 * MAX_TERM];

- } samples_AB;

- union {

+ } samples_AB;*/

+ int_mmx samples_AB[MAX_TERM];

+ /*union {

int_mmx q;

int d [2];

- } weight_AB, sum_AB;

+ } weight_AB, sum_AB;*/

+ int_mmx weight_AB, sum_AB ={0,0};

- sum_AB.d [0] = 0;

- sum_AB.d [1] = 0;

- weight_AB.d [0] = restore_weight (store_weight (dpp->weight_A));

- weight_AB.d [1] = restore_weight (store_weight (dpp->weight_B));

+ //sum_AB.d [0] = 0;

+ //sum_AB.d [1] = 0;

+ *(int*)&weight_AB = restore_weight (store_weight (dpp->weight_A));

+ *((int*)&weight_AB+1) = restore_weight (store_weight (dpp->weight_B));

for (k = 0; k < MAX_TERM; ++k) {

- samples_AB.d [k * 2] = exp2s (log2s (dpp->samples_A [k]));

- samples_AB.d [k * 2 + 1] = exp2s (log2s (dpp->samples_B [k]));

+ *((int*)&samples_AB + k * 2) = exp2s (log2s (dpp->samples_A [k]));

+ *((int*)&samples_AB + k * 2 + 1) = exp2s (log2s (dpp->samples_B [k]));

}

if (dpp->term == 17) {

while (num_samples--) {

- sam_AB = __builtin_ia32_pslld (samples_AB.q [0], 1);

- sam_AB = __builtin_ia32_psubd (sam_AB, samples_AB.q [1]);

+ sam_AB = __builtin_ia32_pslld (samples_AB [0], 1);

+ sam_AB = __builtin_ia32_psubd (sam_AB, samples_AB [1]);

- samples_AB.q [1] = samples_AB.q [0];

- samples_AB.q [0] = left_right = *(int_mmx *) in_samples;

+ samples_AB [1] = samples_AB [0];

+ samples_AB [0] = left_right = *(int_mmx *) in_samples;

tmp0 = __builtin_ia32_psrld (sam_AB, 15);

tmp1 = __builtin_ia32_pand (sam_AB, msk0);

tmp0 = __builtin_ia32_pand (tmp0, msk1);

- tmp1 = __builtin_ia32_pmaddwd (tmp1, weight_AB.q);

- tmp0 = __builtin_ia32_pmaddwd (tmp0, weight_AB.q);

+ tmp1 = __builtin_ia32_pmaddwd (tmp1, weight_AB);

+ tmp0 = __builtin_ia32_pmaddwd (tmp0, weight_AB);

tmp1 = __builtin_ia32_paddd (tmp1, round);

tmp0 = __builtin_ia32_pslld (tmp0, 5);

tmp1 = __builtin_ia32_psrad (tmp1, 10);

@@ -107,9 +109,9 @@

tmp0 = __builtin_ia32_pcmpeqd (left_right, zero);

tmp0 = __builtin_ia32_por (tmp0, sam_AB);

tmp0 = __builtin_ia32_pandn (tmp0, tmp1);

- weight_AB.q = __builtin_ia32_paddd (weight_AB.q, tmp0);

+ weight_AB = __builtin_ia32_paddd (weight_AB, tmp0);

- sum_AB.q = __builtin_ia32_paddd (sum_AB.q, weight_AB.q);

+ sum_AB = __builtin_ia32_paddd (sum_AB, weight_AB);

*(int_mmx *) out_samples = left_right;

@@ -119,20 +121,20 @@

}

else if (dpp->term == 18) {

while (num_samples--) {

- tmp0 = samples_AB.q [0];

- sam_AB = __builtin_ia32_psubd (tmp0, samples_AB.q [1]);

+ tmp0 = samples_AB [0];

+ sam_AB = __builtin_ia32_psubd (tmp0, samples_AB [1]);

tmp0 = __builtin_ia32_pslld (tmp0, 1);

sam_AB = __builtin_ia32_paddd (sam_AB, tmp0);

sam_AB = __builtin_ia32_psrad (sam_AB, 1);

- samples_AB.q [1] = samples_AB.q [0];

- samples_AB.q [0] = left_right = *(int_mmx *) in_samples;

+ samples_AB [1] = samples_AB [0];

+ samples_AB [0] = left_right = *(int_mmx *) in_samples;

tmp0 = __builtin_ia32_psrld (sam_AB, 15);

tmp1 = __builtin_ia32_pand (sam_AB, msk0);

tmp0 = __builtin_ia32_pand (tmp0, msk1);

- tmp1 = __builtin_ia32_pmaddwd (tmp1, weight_AB.q);

- tmp0 = __builtin_ia32_pmaddwd (tmp0, weight_AB.q);

+ tmp1 = __builtin_ia32_pmaddwd (tmp1, weight_AB);

+ tmp0 = __builtin_ia32_pmaddwd (tmp0, weight_AB);

tmp1 = __builtin_ia32_paddd (tmp1, round);

tmp0 = __builtin_ia32_pslld (tmp0, 5);

tmp1 = __builtin_ia32_psrad (tmp1, 10);

@@ -147,9 +149,9 @@

tmp0 = __builtin_ia32_pcmpeqd (left_right, zero);

tmp0 = __builtin_ia32_por (tmp0, sam_AB);

tmp0 = __builtin_ia32_pandn (tmp0, tmp1);

- weight_AB.q = __builtin_ia32_paddd (weight_AB.q, tmp0);

+ weight_AB = __builtin_ia32_paddd (weight_AB, tmp0);

- sum_AB.q = __builtin_ia32_paddd (sum_AB.q, weight_AB.q);

+ sum_AB = __builtin_ia32_paddd (sum_AB, weight_AB);

*(int_mmx *) out_samples = left_right;

@@ -161,14 +163,14 @@

while (num_samples--) {

k = (m + dpp->term) & (MAX_TERM - 1);

- sam_AB = samples_AB.q [m];

- samples_AB.q [k] = left_right = *(int_mmx *) in_samples;

+ sam_AB = samples_AB [m];

+ samples_AB [k] = left_right = *(int_mmx *) in_samples;

tmp0 = __builtin_ia32_psrld (sam_AB, 15);

tmp1 = __builtin_ia32_pand (sam_AB, msk0);

tmp0 = __builtin_ia32_pand (tmp0, msk1);

- tmp1 = __builtin_ia32_pmaddwd (tmp1, weight_AB.q);

- tmp0 = __builtin_ia32_pmaddwd (tmp0, weight_AB.q);

+ tmp1 = __builtin_ia32_pmaddwd (tmp1, weight_AB);

+ tmp0 = __builtin_ia32_pmaddwd (tmp0, weight_AB);

tmp1 = __builtin_ia32_paddd (tmp1, round);

tmp0 = __builtin_ia32_pslld (tmp0, 5);

tmp1 = __builtin_ia32_psrad (tmp1, 10);

@@ -183,9 +185,9 @@

tmp0 = __builtin_ia32_pcmpeqd (left_right, zero);

tmp0 = __builtin_ia32_por (tmp0, sam_AB);

tmp0 = __builtin_ia32_pandn (tmp0, tmp1);

- weight_AB.q = __builtin_ia32_paddd (weight_AB.q, tmp0);

+ weight_AB = __builtin_ia32_paddd (weight_AB, tmp0);

- sum_AB.q = __builtin_ia32_paddd (sum_AB.q, weight_AB.q);

+ sum_AB = __builtin_ia32_paddd (sum_AB, weight_AB);

*(int_mmx *) out_samples = left_right;

@@ -194,13 +196,13 @@

m = (m + 1) & (MAX_TERM - 1);

}

}

- dpp->sum_A = sum_AB.d [0];

- dpp->sum_B = sum_AB.d [1];

- dpp->weight_A = weight_AB.d [0];

- dpp->weight_B = weight_AB.d [1];

+ dpp->sum_A = *(int*)&sum_AB;

+ dpp->sum_B = *((int*)&sum_AB+1);

+ dpp->weight_A = *(int*)&weight_AB;

+ dpp->weight_B = *((int*)&weight_AB+1);

for (k = 0; k < MAX_TERM; ++k) {

- dpp->samples_A [k] = samples_AB.d [m * 2];

- dpp->samples_B [k] = samples_AB.d [m * 2 + 1];

+ dpp->samples_A [k] = *((int*)&samples_AB+m * 2);

+ dpp->samples_B [k] = *((int*)&samples_AB+ m * 2 + 1);

m = (m + 1) & (MAX_TERM - 1);

}

__builtin_ia32_emms ();

[size=1]Moderation: CODE to CODEBOX[/size]