Earlier it was this:

#if __GNUC__ && !__INTEL_COMPILER

const int_mmx

delta = { dpp->delta, dpp->delta },

msk0 = { 0x7fff, 0x7fff },

msk1 = { 0xffff, 0xffff },

round = { 512, 512 },

zero = { 0, 0 };

#else // NO GCC

const int_mmx

delta = _mm_set_pi32(dpp->delta, dpp->delta),

msk0 = _mm_set_pi32(0x7fff, 0x7fff),

msk1 = _mm_set_pi32(0xffff, 0xffff),

round = _mm_set_pi32(512, 512),

zero = _mm_set_pi32(0, 0);

#endif // __GNUC__

Now it is:

const int_mmx

delta = set_int_mmx(dpp->delta, dpp->delta),

fill = set_int_mmx(0x7bff, 0x7bff),

msk0 = set_int_mmx(0x7fff, 0x7fff),

msk1 = set_int_mmx(0xffff, 0xffff),

round = set_int_mmx(512, 512),

zero = set_int_mmx(0, 0);

int_mmx

sum_AB = set_int_mmx(0, 0),

weight_AB = set_int_mmx(

restore_weight (store_weight (dpp->weight_A)),

restore_weight (store_weight (dpp->weight_B))

),

left_right, sam_AB, tmp0, tmp1, samples_AB [MAX_TERM];

set_int_mmx takes two identical arguments (values), so switching m1 with m2 does not change anything;

the only difference is with weight_AB, where A and B are switched!

But I will try this patch.