12 #if defined(__x86_64__) || defined(__amd64__) 16 void Transform(uint32_t* s,
const unsigned char* chunk,
size_t blocks)
18 static const uint32_t K256
alignas(16) [] = {
19 0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5,
20 0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5,
21 0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3,
22 0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174,
23 0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc,
24 0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da,
25 0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7,
26 0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967,
27 0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13,
28 0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85,
29 0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3,
30 0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070,
31 0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5,
32 0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3,
33 0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208,
34 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2,
36 static const uint32_t FLIP_MASK
alignas(16) [] = {0x00010203, 0x04050607, 0x08090a0b, 0x0c0d0e0f};
37 static const uint32_t SHUF_00BA
alignas(16) [] = {0x03020100, 0x0b0a0908, 0xffffffff, 0xffffffff};
38 static const uint32_t SHUF_DC00
alignas(16) [] = {0xffffffff, 0xffffffff, 0x03020100, 0x0b0a0908};
39 uint32_t a, b, c, d, f, g, h, y0, y1, y2;
41 uint64_t inp_end, inp;
42 uint32_t xfer
alignas(16) [4];
64 "pshufb %%xmm12,%%xmm4;" 65 "movdqu 0x10(%1),%%xmm5;" 66 "pshufb %%xmm12,%%xmm5;" 67 "movdqu 0x20(%1),%%xmm6;" 68 "pshufb %%xmm12,%%xmm6;" 69 "movdqu 0x30(%1),%%xmm7;" 70 "pshufb %%xmm12,%%xmm7;" 75 "movdqa 0x0(%13),%%xmm9;" 76 "paddd %%xmm4,%%xmm9;" 78 "movdqa %%xmm7,%%xmm0;" 82 "palignr $0x4,%%xmm6,%%xmm0;" 87 "movdqa %%xmm5,%%xmm1;" 90 "paddd %%xmm4,%%xmm0;" 94 "palignr $0x4,%%xmm4,%%xmm1;" 98 "movdqa %%xmm1,%%xmm2;" 102 "movdqa %%xmm1,%%xmm3;" 106 "pslld $0x19,%%xmm1;" 116 "movdqa %%xmm3,%%xmm2;" 119 "movdqa %%xmm3,%%xmm8;" 128 "psrld $0x12,%%xmm2;" 133 "pxor %%xmm3,%%xmm1;" 140 "pxor %%xmm2,%%xmm1;" 144 "pxor %%xmm8,%%xmm1;" 148 "pshufd $0xfa,%%xmm7,%%xmm2;" 151 "paddd %%xmm1,%%xmm0;" 154 "movdqa %%xmm2,%%xmm3;" 158 "movdqa %%xmm2,%%xmm8;" 164 "psrlq $0x11,%%xmm2;" 166 "psrlq $0x13,%%xmm3;" 174 "pxor %%xmm3,%%xmm2;" 178 "pxor %%xmm2,%%xmm8;" 182 "pshufb %%xmm10,%%xmm8;" 186 "paddd %%xmm8,%%xmm0;" 189 "pshufd $0x50,%%xmm0,%%xmm2;" 192 "movdqa %%xmm2,%%xmm3;" 196 "movdqa %%xmm2,%%xmm4;" 201 "psrlq $0x11,%%xmm2;" 204 "psrlq $0x13,%%xmm3;" 212 "pxor %%xmm3,%%xmm2;" 216 "pxor %%xmm2,%%xmm4;" 220 "pshufb %%xmm11,%%xmm4;" 224 "paddd %%xmm0,%%xmm4;" 229 "movdqa 0x10(%13),%%xmm9;" 230 "paddd %%xmm5,%%xmm9;" 232 "movdqa %%xmm4,%%xmm0;" 236 "palignr $0x4,%%xmm7,%%xmm0;" 241 "movdqa %%xmm6,%%xmm1;" 244 "paddd %%xmm5,%%xmm0;" 248 "palignr $0x4,%%xmm5,%%xmm1;" 252 "movdqa %%xmm1,%%xmm2;" 256 "movdqa %%xmm1,%%xmm3;" 260 "pslld $0x19,%%xmm1;" 270 "movdqa %%xmm3,%%xmm2;" 273 "movdqa %%xmm3,%%xmm8;" 282 "psrld $0x12,%%xmm2;" 287 "pxor %%xmm3,%%xmm1;" 294 "pxor %%xmm2,%%xmm1;" 298 "pxor %%xmm8,%%xmm1;" 302 "pshufd $0xfa,%%xmm4,%%xmm2;" 305 "paddd %%xmm1,%%xmm0;" 308 "movdqa %%xmm2,%%xmm3;" 312 "movdqa %%xmm2,%%xmm8;" 318 "psrlq $0x11,%%xmm2;" 320 "psrlq $0x13,%%xmm3;" 328 "pxor %%xmm3,%%xmm2;" 332 "pxor %%xmm2,%%xmm8;" 336 "pshufb %%xmm10,%%xmm8;" 340 "paddd %%xmm8,%%xmm0;" 343 "pshufd $0x50,%%xmm0,%%xmm2;" 346 "movdqa %%xmm2,%%xmm3;" 350 "movdqa %%xmm2,%%xmm5;" 355 "psrlq $0x11,%%xmm2;" 358 "psrlq $0x13,%%xmm3;" 366 "pxor %%xmm3,%%xmm2;" 370 "pxor %%xmm2,%%xmm5;" 374 "pshufb %%xmm11,%%xmm5;" 378 "paddd %%xmm0,%%xmm5;" 383 "movdqa 0x20(%13),%%xmm9;" 384 "paddd %%xmm6,%%xmm9;" 386 "movdqa %%xmm5,%%xmm0;" 390 "palignr $0x4,%%xmm4,%%xmm0;" 395 "movdqa %%xmm7,%%xmm1;" 398 "paddd %%xmm6,%%xmm0;" 402 "palignr $0x4,%%xmm6,%%xmm1;" 406 "movdqa %%xmm1,%%xmm2;" 410 "movdqa %%xmm1,%%xmm3;" 414 "pslld $0x19,%%xmm1;" 424 "movdqa %%xmm3,%%xmm2;" 427 "movdqa %%xmm3,%%xmm8;" 436 "psrld $0x12,%%xmm2;" 441 "pxor %%xmm3,%%xmm1;" 448 "pxor %%xmm2,%%xmm1;" 452 "pxor %%xmm8,%%xmm1;" 456 "pshufd $0xfa,%%xmm5,%%xmm2;" 459 "paddd %%xmm1,%%xmm0;" 462 "movdqa %%xmm2,%%xmm3;" 466 "movdqa %%xmm2,%%xmm8;" 472 "psrlq $0x11,%%xmm2;" 474 "psrlq $0x13,%%xmm3;" 482 "pxor %%xmm3,%%xmm2;" 486 "pxor %%xmm2,%%xmm8;" 490 "pshufb %%xmm10,%%xmm8;" 494 "paddd %%xmm8,%%xmm0;" 497 "pshufd $0x50,%%xmm0,%%xmm2;" 500 "movdqa %%xmm2,%%xmm3;" 504 "movdqa %%xmm2,%%xmm6;" 509 "psrlq $0x11,%%xmm2;" 512 "psrlq $0x13,%%xmm3;" 520 "pxor %%xmm3,%%xmm2;" 524 "pxor %%xmm2,%%xmm6;" 528 "pshufb %%xmm11,%%xmm6;" 532 "paddd %%xmm0,%%xmm6;" 537 "movdqa 0x30(%13),%%xmm9;" 538 "paddd %%xmm7,%%xmm9;" 541 "movdqa %%xmm6,%%xmm0;" 545 "palignr $0x4,%%xmm5,%%xmm0;" 550 "movdqa %%xmm4,%%xmm1;" 553 "paddd %%xmm7,%%xmm0;" 557 "palignr $0x4,%%xmm7,%%xmm1;" 561 "movdqa %%xmm1,%%xmm2;" 565 "movdqa %%xmm1,%%xmm3;" 569 "pslld $0x19,%%xmm1;" 579 "movdqa %%xmm3,%%xmm2;" 582 "movdqa %%xmm3,%%xmm8;" 591 "psrld $0x12,%%xmm2;" 596 "pxor %%xmm3,%%xmm1;" 603 "pxor %%xmm2,%%xmm1;" 607 "pxor %%xmm8,%%xmm1;" 611 "pshufd $0xfa,%%xmm6,%%xmm2;" 614 "paddd %%xmm1,%%xmm0;" 617 "movdqa %%xmm2,%%xmm3;" 621 "movdqa %%xmm2,%%xmm8;" 627 "psrlq $0x11,%%xmm2;" 629 "psrlq $0x13,%%xmm3;" 637 "pxor %%xmm3,%%xmm2;" 641 "pxor %%xmm2,%%xmm8;" 645 "pshufb %%xmm10,%%xmm8;" 649 "paddd %%xmm8,%%xmm0;" 652 "pshufd $0x50,%%xmm0,%%xmm2;" 655 "movdqa %%xmm2,%%xmm3;" 659 "movdqa %%xmm2,%%xmm7;" 664 "psrlq $0x11,%%xmm2;" 667 "psrlq $0x13,%%xmm3;" 675 "pxor %%xmm3,%%xmm2;" 679 "pxor %%xmm2,%%xmm7;" 683 "pshufb %%xmm11,%%xmm7;" 687 "paddd %%xmm0,%%xmm7;" 697 "paddd 0x0(%13),%%xmm4;" 811 "paddd 0x10(%13),%%xmm5;" 926 "movdqa %%xmm6,%%xmm4;" 927 "movdqa %%xmm7,%%xmm5;" 953 :
"+r"(s),
"+r"(chunk),
"+r"(blocks),
"=r"(a),
"=r"(b),
"=r"(c),
"=r"(d),
"=r"(f),
"=r"(g),
"=r"(h),
"=r"(y0),
"=r"(y1),
"=r"(y2),
"=r"(tbl),
"+m"(inp_end),
"+m"(inp),
"+m"(xfer)
954 :
"m"(K256),
"m"(FLIP_MASK),
"m"(SHUF_00BA),
"m"(SHUF_DC00)
955 :
"cc",
"memory",
"xmm0",
"xmm1",
"xmm2",
"xmm3",
"xmm4",
"xmm5",
"xmm6",
"xmm7",
"xmm8",
"xmm9",
"xmm10",
"xmm11",
"xmm12"