Submission #2281310
Source Code Expand
#include <immintrin.h>
#include <cstdio>
int N;
int a[200000] __attribute__((aligned(64)));
int b[200000] __attribute__((aligned(64)));
// Horizontally XORs the four 32-bit lanes of `ret` and returns the result.
//
// The fold is done entirely in the vector register and the final value is
// read from lane 0 with a single 32-bit move, so all 32 bits (including the
// sign bit) are carried through without any signed shift or addition of
// 16-bit halves.
int extract(__m128i ret) {
    // Swap the 64-bit halves: lane0 <-> lane2, lane1 <-> lane3.
    __m128i swapped_halves = _mm_shuffle_epi32(ret, _MM_SHUFFLE(1, 0, 3, 2));
    // Lane 0 now holds l0^l2, lane 1 holds l1^l3.
    __m128i pairs = _mm_xor_si128(ret, swapped_halves);
    // Swap neighbouring lanes so lane 0 sees the other pair.
    __m128i swapped_pairs = _mm_shuffle_epi32(pairs, _MM_SHUFFLE(2, 3, 0, 1));
    // Lane 0 = l0^l1^l2^l3.
    __m128i folded = _mm_xor_si128(pairs, swapped_pairs);
    return _mm_cvtsi128_si32(folded);
}
// process_xor(src): for one __m128i value `src`, computes the ten lane-wise
// 32-bit sums a0+src .. a9+src into c0..c9 and XORs every one of them into
// the accumulator r0.
//
// The macro is not self-contained: the expansion site must have __m128i
// variables named a0..a9 (read), c0..c9 (overwritten) and r0 (read and
// updated) in scope. `src` is evaluated ten times, so pass a plain variable.
//
// It expands to a bare sequence of statements (no do { } while (0) wrapper),
// so it must only be used where several statements are allowed, e.g. inside
// a braced block.
#define process_xor(src) \
c0 = _mm_add_epi32(a0, (src)); \
c1 = _mm_add_epi32(a1, (src)); \
c2 = _mm_add_epi32(a2, (src)); \
c3 = _mm_add_epi32(a3, (src)); \
c4 = _mm_add_epi32(a4, (src)); \
c5 = _mm_add_epi32(a5, (src)); \
c6 = _mm_add_epi32(a6, (src)); \
c7 = _mm_add_epi32(a7, (src)); \
c8 = _mm_add_epi32(a8, (src)); \
c9 = _mm_add_epi32(a9, (src)); \
r0 = _mm_xor_si128(r0, c0); \
r0 = _mm_xor_si128(r0, c1); \
r0 = _mm_xor_si128(r0, c2); \
r0 = _mm_xor_si128(r0, c3); \
r0 = _mm_xor_si128(r0, c4); \
r0 = _mm_xor_si128(r0, c5); \
r0 = _mm_xor_si128(r0, c6); \
r0 = _mm_xor_si128(r0, c7); \
r0 = _mm_xor_si128(r0, c8); \
r0 = _mm_xor_si128(r0, c9);
int solve() {
// port0 vec alu,
// port1 vec alu, vec shuf
// port5 vec alu, vec shuf
// port2, port3 load
__m128i r0 = _mm_setzero_si128(); // pxor 1l/3t
int ri = 0;
int DI = 10;
int DJ = 24;
int NI = N / DI * DI;
int NJ = N / DJ * DJ;
for(int i=0;i<NI;i+=DI) {
int aa = a[i];
int ab = a[i+1];
int ac = a[i+2];
int ad = a[i+3];
int ae = a[i+4];
int af = a[i+5];
int ag = a[i+6];
int ah = a[i+7];
int ai = a[i+8];
int aj = a[i+9];
__m128i a0 = _mm_set_epi32(aa, aa, aa, aa); // movd, pshufd
__m128i a1 = _mm_set_epi32(ab, ab, ab, ab);
__m128i a2 = _mm_set_epi32(ac, ac, ac, ac);
__m128i a3 = _mm_set_epi32(ad, ad, ad, ad);
__m128i a4 = _mm_set_epi32(ae, ae, ae, ae);
__m128i a5 = _mm_set_epi32(af, af, af, af);
__m128i a6 = _mm_set_epi32(ag, ag, ag, ag);
__m128i a7 = _mm_set_epi32(ah, ah, ah, ah);
__m128i a8 = _mm_set_epi32(ai, ai, ai, ai);
__m128i a9 = _mm_set_epi32(aj, aj, aj, aj);
for(int j=0;j<NJ;j+=DJ) {
__m128i b0, b1, b2, b3, b4, b5;
__m128i c0, c1, c2, c3, c4, c5, c6, c7, c8, c9;
b0 = _mm_load_si128((__m128i *const)&b[j]); // movdqa 1l/2t
b1 = _mm_load_si128((__m128i *const)&b[j+4]); // movdqa 1l/2t
b2 = _mm_load_si128((__m128i *const)&b[j+8]); // movdqa 1l/2t
b3 = _mm_load_si128((__m128i *const)&b[j+12]); // movdqa 1l/2t
b4 = _mm_load_si128((__m128i *const)&b[j+16]); // movdqa 1l/2t
b5 = _mm_load_si128((__m128i *const)&b[j+20]); // movdqa 1l/2t
process_xor(b0);
process_xor(b1);
process_xor(b2);
process_xor(b3);
process_xor(b4);
process_xor(b5);
}
for(int j=NJ;j<N;j++) {
ri ^= aa + b[j];
ri ^= ab + b[j];
ri ^= ac + b[j];
ri ^= ad + b[j];
ri ^= ae + b[j];
ri ^= af + b[j];
ri ^= ag + b[j];
ri ^= ah + b[j];
ri ^= ai + b[j];
ri ^= aj + b[j];
}
}
for(int i=NI;i<N;i++)
for(int j=0;j<N;j++)
ri ^= a[i] + b[j];
return ri ^ extract(r0);
}
// Reads N, then N values into a and N values into b from standard input,
// and prints the result of solve() followed by a newline.
//
// Returns 0 on success. Returns 1 (with a message on stderr) if a value
// cannot be parsed or if N exceeds the capacity of the fixed-size input
// arrays, instead of writing past their end.
int main() {
    // a and b are declared with the same length, so one bound covers both.
    const int capacity = (int)(sizeof a / sizeof a[0]);

    if (scanf("%d", &N) != 1) {
        fprintf(stderr, "failed to read element count\n");
        return 1;
    }
    if (N > capacity) {
        fprintf(stderr, "element count %d exceeds capacity %d\n", N, capacity);
        return 1;
    }

    for (int i = 0; i < N; i++) {
        if (scanf("%d", a + i) != 1) {
            fprintf(stderr, "failed to read a[%d]\n", i);
            return 1;
        }
    }
    for (int i = 0; i < N; i++) {
        if (scanf("%d", b + i) != 1) {
            fprintf(stderr, "failed to read b[%d]\n", i);
            return 1;
        }
    }

    int ans = solve();
    printf("%d\n", ans);
    return 0;
}
Submission Info

| Field | Value |
|---|---|
| Submission Time | |
| Task | D - Two Sequences |
| User | brly |
| Language | C++14 (Clang 3.8.0) |
| Score | 500 |
| Code Size | 3564 Byte |
| Status | AC |
| Exec Time | 2767 ms |
| Memory | 1792 KB |
Judge Result

| Set Name | Sample | All |
|---|---|---|
| Score / Max Score | 0 / 0 | 500 / 500 |
| Status | | |

| Set Name | Test Cases |
|---|---|
| Sample | example_0, example_1, example_2, example_3 |
| All | N100000_0, N100000_1, N150000_0, N150000_1, N200000_0, N200000_1, N200000_ex_0, N200000_ex_1, example_0, example_1, example_2, example_3, rand_0, rand_1, smallrand_0, smallrand_1 |
| Case Name | Status | Exec Time | Memory |
|---|---|---|---|
| N100000_0 | AC | 703 ms | 1024 KB |
| N100000_1 | AC | 703 ms | 1024 KB |
| N150000_0 | AC | 1562 ms | 1408 KB |
| N150000_1 | AC | 1561 ms | 1408 KB |
| N200000_0 | AC | 2761 ms | 1792 KB |
| N200000_1 | AC | 2760 ms | 1792 KB |
| N200000_ex_0 | AC | 2767 ms | 1792 KB |
| N200000_ex_1 | AC | 2758 ms | 1792 KB |
| example_0 | AC | 1 ms | 256 KB |
| example_1 | AC | 1 ms | 256 KB |
| example_2 | AC | 1 ms | 256 KB |
| example_3 | AC | 1 ms | 256 KB |
| rand_0 | AC | 6 ms | 256 KB |
| rand_1 | AC | 17 ms | 384 KB |
| smallrand_0 | AC | 1 ms | 256 KB |
| smallrand_1 | AC | 1 ms | 256 KB |