/* SSE working registers: the four matrix rows being transposed. */
__m128 X1;
__m128 X2;
__m128 X3;
__m128 X4;

/* SSE scratch registers holding the intermediate interleaved lanes. */
__m128 X1out;
__m128 X2out;
__m128 X3out;
__m128 X4out;

/*
 * Input matrix, one row per array. Each array is 16-byte aligned because it
 * is read with the aligned load _mm_load_ps.
 */
__attribute__((aligned(16))) float X1_sse[4] = { 1.0f,  2.0f,  3.0f,  4.0f};
__attribute__((aligned(16))) float X2_sse[4] = {11.0f, 12.0f, 13.0f, 14.0f};
__attribute__((aligned(16))) float X3_sse[4] = {21.0f, 22.0f, 23.0f, 24.0f};
__attribute__((aligned(16))) float X4_sse[4] = {31.0f, 32.0f, 33.0f, 34.0f};

/*
 * Output matrix, one row per array. Each array is 16-byte aligned because it
 * is written with the aligned store _mm_store_ps.
 */
__attribute__((aligned(16))) float X1out_sse[4];
__attribute__((aligned(16))) float X2out_sse[4];
__attribute__((aligned(16))) float X3out_sse[4];
__attribute__((aligned(16))) float X4out_sse[4];
/*
 * Transposes the 4x4 float matrix stored row-wise in X1_sse..X4_sse and
 * writes the transposed rows to X1out_sse..X4out_sse.
 *
 * The transpose is done with two rounds of SSE interleave (unpack)
 * operations. The work is bracketed by two calls to dtime(), which is
 * declared outside this block (presumably a wall-clock timestamp; confirm
 * units against its definition). The elapsed value is computed but not
 * reported here.
 *
 * Parameters:
 *   argc, argv - standard program arguments; unused.
 *
 * Returns 0.
 */
int main(int argc, char **argv)
{
    double t1, t2, dt;

    (void)argc;
    (void)argv;

    t1 = dtime();

    /* Aligned loads: each source array is declared 16-byte aligned. */
    X1 = _mm_load_ps(X1_sse);
    X2 = _mm_load_ps(X2_sse);
    X3 = _mm_load_ps(X3_sse);
    X4 = _mm_load_ps(X4_sse);

    /*
     * Round 1: interleave row 1 with row 3, and row 2 with row 4.
     * With rows a, b, c, d this yields:
     *   X1out = {a0, c0, a1, c1}    X2out = {a2, c2, a3, c3}
     *   X3out = {b0, d0, b1, d1}    X4out = {b2, d2, b3, d3}
     */
    X1out = _mm_unpacklo_ps(X1, X3);
    X2out = _mm_unpackhi_ps(X1, X3);
    X3out = _mm_unpacklo_ps(X2, X4);
    X4out = _mm_unpackhi_ps(X2, X4);

    /*
     * Round 2: interleave the intermediates so that each register ends up
     * holding one column of the original matrix:
     *   X1 = {a0, b0, c0, d0}       X2 = {a1, b1, c1, d1}
     *   X3 = {a2, b2, c2, d2}       X4 = {a3, b3, c3, d3}
     */
    X1 = _mm_unpacklo_ps(X1out, X3out);
    X2 = _mm_unpackhi_ps(X1out, X3out);
    X3 = _mm_unpacklo_ps(X2out, X4out);
    X4 = _mm_unpackhi_ps(X2out, X4out);

    /* Aligned stores: each destination array is declared 16-byte aligned. */
    _mm_store_ps(X1out_sse, X1);
    _mm_store_ps(X2out_sse, X2);
    _mm_store_ps(X3out_sse, X3);
    _mm_store_ps(X4out_sse, X4);

    t2 = dtime();
    dt = t2 - t1;
    (void)dt; /* elapsed time is measured but intentionally not used here */

    return 0;
}