sha256_sse4.cpp
1 // Copyright (c) 2017 The Bitcoin Core developers
2 // Copyright (c) 2017-2019 The Raven Core developers
3 // Distributed under the MIT software license, see the accompanying
4 // file COPYING or http://www.opensource.org/licenses/mit-license.php.
5 //
6 // This is a translation to GCC extended asm syntax from YASM code by Intel
7 // (available at the bottom of this file).
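// Transform() runs the SHA-256 block compression: s points to the eight
// 32-bit state words and chunk to `blocks` consecutive 64-byte message
// blocks; the state is updated in place. Byte order is handled internally
// (each message dword is byte-swapped via FLIP_MASK below), so callers pass
// the raw message bytes. A hedged usage sketch follows the function body.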
8 
9 #include <stdint.h>
10 #include <stdlib.h>
11 
12 #if defined(__x86_64__) || defined(__amd64__)
13 
14 namespace sha256_sse4
15 {
16 void Transform(uint32_t* s, const unsigned char* chunk, size_t blocks)
17 {
18  static const uint32_t K256 alignas(16) [] = {
19  0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5,
20  0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5,
21  0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3,
22  0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174,
23  0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc,
24  0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da,
25  0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7,
26  0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967,
27  0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13,
28  0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85,
29  0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3,
30  0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070,
31  0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5,
32  0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3,
33  0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208,
34  0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2,
35  };
36  static const uint32_t FLIP_MASK alignas(16) [] = {0x00010203, 0x04050607, 0x08090a0b, 0x0c0d0e0f};
37  static const uint32_t SHUF_00BA alignas(16) [] = {0x03020100, 0x0b0a0908, 0xffffffff, 0xffffffff};
38  static const uint32_t SHUF_DC00 alignas(16) [] = {0xffffffff, 0xffffffff, 0x03020100, 0x0b0a0908};
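 // K256 holds the 64 SHA-256 round constants. FLIP_MASK is the pshufb control
 // that byte-swaps each 32-bit word of a message block. SHUF_00BA and
 // SHUF_DC00 gather the two sigma1 results from the even dword lanes into the
 // low ("00BA") or high ("DC00") half of the schedule vector, zeroing the rest.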
39  uint32_t a, b, c, d, f, g, h, y0, y1, y2;
40  uint64_t tbl;
41  uint64_t inp_end, inp;
42  uint32_t xfer alignas(16) [4];
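 // Operand map for the asm block below: %0=s, %1=chunk, %2=blocks (the SHA-256
 // 'e' value lives in this register afterwards, accessed as %k2), %3..%9 =
 // a,b,c,d,f,g,h, %10..%12 = y0,y1,y2 scratch, %13=tbl, %14=inp_end, %15=inp,
 // %16=xfer, and %17..%20 = K256, FLIP_MASK, SHUF_00BA, SHUF_DC00.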
43 
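 // Layout of the asm: Lloop0 iterates over the 64-byte blocks; Lloop1 makes
 // three passes of 16 rounds each (48 rounds) while scheduling upcoming
 // message words in xmm4..xmm7; Lloop2 runs the final 16 rounds from the
 // already-scheduled words; the working variables are then added back into
 // the state at (%0).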
44  __asm__ __volatile__(
45  "shl $0x6,%2;"
46  "je Ldone_hash_%=;"
47  "add %1,%2;"
48  "mov %2,%14;"
49  "mov (%0),%3;"
50  "mov 0x4(%0),%4;"
51  "mov 0x8(%0),%5;"
52  "mov 0xc(%0),%6;"
53  "mov 0x10(%0),%k2;"
54  "mov 0x14(%0),%7;"
55  "mov 0x18(%0),%8;"
56  "mov 0x1c(%0),%9;"
57  "movdqa %18,%%xmm12;"
58  "movdqa %19,%%xmm10;"
59  "movdqa %20,%%xmm11;"
60 
61  "Lloop0_%=:"
62  "lea %17,%13;"
63  "movdqu (%1),%%xmm4;"
64  "pshufb %%xmm12,%%xmm4;"
65  "movdqu 0x10(%1),%%xmm5;"
66  "pshufb %%xmm12,%%xmm5;"
67  "movdqu 0x20(%1),%%xmm6;"
68  "pshufb %%xmm12,%%xmm6;"
69  "movdqu 0x30(%1),%%xmm7;"
70  "pshufb %%xmm12,%%xmm7;"
71  "mov %1,%15;"
72  "mov $3,%1;"
73 
74  "Lloop1_%=:"
75  "movdqa 0x0(%13),%%xmm9;"
76  "paddd %%xmm4,%%xmm9;"
77  "movdqa %%xmm9,%16;"
78  "movdqa %%xmm7,%%xmm0;"
79  "mov %k2,%10;"
80  "ror $0xe,%10;"
81  "mov %3,%11;"
82  "palignr $0x4,%%xmm6,%%xmm0;"
83  "ror $0x9,%11;"
84  "xor %k2,%10;"
85  "mov %7,%12;"
86  "ror $0x5,%10;"
87  "movdqa %%xmm5,%%xmm1;"
88  "xor %3,%11;"
89  "xor %8,%12;"
90  "paddd %%xmm4,%%xmm0;"
91  "xor %k2,%10;"
92  "and %k2,%12;"
93  "ror $0xb,%11;"
94  "palignr $0x4,%%xmm4,%%xmm1;"
95  "xor %3,%11;"
96  "ror $0x6,%10;"
97  "xor %8,%12;"
98  "movdqa %%xmm1,%%xmm2;"
99  "ror $0x2,%11;"
100  "add %10,%12;"
101  "add %16,%12;"
102  "movdqa %%xmm1,%%xmm3;"
103  "mov %3,%10;"
104  "add %12,%9;"
105  "mov %3,%12;"
106  "pslld $0x19,%%xmm1;"
107  "or %5,%10;"
108  "add %9,%6;"
109  "and %5,%12;"
110  "psrld $0x7,%%xmm2;"
111  "and %4,%10;"
112  "add %11,%9;"
113  "por %%xmm2,%%xmm1;"
114  "or %12,%10;"
115  "add %10,%9;"
116  "movdqa %%xmm3,%%xmm2;"
117  "mov %6,%10;"
118  "mov %9,%11;"
119  "movdqa %%xmm3,%%xmm8;"
120  "ror $0xe,%10;"
121  "xor %6,%10;"
122  "mov %k2,%12;"
123  "ror $0x9,%11;"
124  "pslld $0xe,%%xmm3;"
125  "xor %9,%11;"
126  "ror $0x5,%10;"
127  "xor %7,%12;"
128  "psrld $0x12,%%xmm2;"
129  "ror $0xb,%11;"
130  "xor %6,%10;"
131  "and %6,%12;"
132  "ror $0x6,%10;"
133  "pxor %%xmm3,%%xmm1;"
134  "xor %9,%11;"
135  "xor %7,%12;"
136  "psrld $0x3,%%xmm8;"
137  "add %10,%12;"
138  "add 4+%16,%12;"
139  "ror $0x2,%11;"
140  "pxor %%xmm2,%%xmm1;"
141  "mov %9,%10;"
142  "add %12,%8;"
143  "mov %9,%12;"
144  "pxor %%xmm8,%%xmm1;"
145  "or %4,%10;"
146  "add %8,%5;"
147  "and %4,%12;"
148  "pshufd $0xfa,%%xmm7,%%xmm2;"
149  "and %3,%10;"
150  "add %11,%8;"
151  "paddd %%xmm1,%%xmm0;"
152  "or %12,%10;"
153  "add %10,%8;"
154  "movdqa %%xmm2,%%xmm3;"
155  "mov %5,%10;"
156  "mov %8,%11;"
157  "ror $0xe,%10;"
158  "movdqa %%xmm2,%%xmm8;"
159  "xor %5,%10;"
160  "ror $0x9,%11;"
161  "mov %6,%12;"
162  "xor %8,%11;"
163  "ror $0x5,%10;"
164  "psrlq $0x11,%%xmm2;"
165  "xor %k2,%12;"
166  "psrlq $0x13,%%xmm3;"
167  "xor %5,%10;"
168  "and %5,%12;"
169  "psrld $0xa,%%xmm8;"
170  "ror $0xb,%11;"
171  "xor %8,%11;"
172  "xor %k2,%12;"
173  "ror $0x6,%10;"
174  "pxor %%xmm3,%%xmm2;"
175  "add %10,%12;"
176  "ror $0x2,%11;"
177  "add 8+%16,%12;"
178  "pxor %%xmm2,%%xmm8;"
179  "mov %8,%10;"
180  "add %12,%7;"
181  "mov %8,%12;"
182  "pshufb %%xmm10,%%xmm8;"
183  "or %3,%10;"
184  "add %7,%4;"
185  "and %3,%12;"
186  "paddd %%xmm8,%%xmm0;"
187  "and %9,%10;"
188  "add %11,%7;"
189  "pshufd $0x50,%%xmm0,%%xmm2;"
190  "or %12,%10;"
191  "add %10,%7;"
192  "movdqa %%xmm2,%%xmm3;"
193  "mov %4,%10;"
194  "ror $0xe,%10;"
195  "mov %7,%11;"
196  "movdqa %%xmm2,%%xmm4;"
197  "ror $0x9,%11;"
198  "xor %4,%10;"
199  "mov %5,%12;"
200  "ror $0x5,%10;"
201  "psrlq $0x11,%%xmm2;"
202  "xor %7,%11;"
203  "xor %6,%12;"
204  "psrlq $0x13,%%xmm3;"
205  "xor %4,%10;"
206  "and %4,%12;"
207  "ror $0xb,%11;"
208  "psrld $0xa,%%xmm4;"
209  "xor %7,%11;"
210  "ror $0x6,%10;"
211  "xor %6,%12;"
212  "pxor %%xmm3,%%xmm2;"
213  "ror $0x2,%11;"
214  "add %10,%12;"
215  "add 12+%16,%12;"
216  "pxor %%xmm2,%%xmm4;"
217  "mov %7,%10;"
218  "add %12,%k2;"
219  "mov %7,%12;"
220  "pshufb %%xmm11,%%xmm4;"
221  "or %9,%10;"
222  "add %k2,%3;"
223  "and %9,%12;"
224  "paddd %%xmm0,%%xmm4;"
225  "and %8,%10;"
226  "add %11,%k2;"
227  "or %12,%10;"
228  "add %10,%k2;"
229  "movdqa 0x10(%13),%%xmm9;"
230  "paddd %%xmm5,%%xmm9;"
231  "movdqa %%xmm9,%16;"
232  "movdqa %%xmm4,%%xmm0;"
233  "mov %3,%10;"
234  "ror $0xe,%10;"
235  "mov %k2,%11;"
236  "palignr $0x4,%%xmm7,%%xmm0;"
237  "ror $0x9,%11;"
238  "xor %3,%10;"
239  "mov %4,%12;"
240  "ror $0x5,%10;"
241  "movdqa %%xmm6,%%xmm1;"
242  "xor %k2,%11;"
243  "xor %5,%12;"
244  "paddd %%xmm5,%%xmm0;"
245  "xor %3,%10;"
246  "and %3,%12;"
247  "ror $0xb,%11;"
248  "palignr $0x4,%%xmm5,%%xmm1;"
249  "xor %k2,%11;"
250  "ror $0x6,%10;"
251  "xor %5,%12;"
252  "movdqa %%xmm1,%%xmm2;"
253  "ror $0x2,%11;"
254  "add %10,%12;"
255  "add %16,%12;"
256  "movdqa %%xmm1,%%xmm3;"
257  "mov %k2,%10;"
258  "add %12,%6;"
259  "mov %k2,%12;"
260  "pslld $0x19,%%xmm1;"
261  "or %8,%10;"
262  "add %6,%9;"
263  "and %8,%12;"
264  "psrld $0x7,%%xmm2;"
265  "and %7,%10;"
266  "add %11,%6;"
267  "por %%xmm2,%%xmm1;"
268  "or %12,%10;"
269  "add %10,%6;"
270  "movdqa %%xmm3,%%xmm2;"
271  "mov %9,%10;"
272  "mov %6,%11;"
273  "movdqa %%xmm3,%%xmm8;"
274  "ror $0xe,%10;"
275  "xor %9,%10;"
276  "mov %3,%12;"
277  "ror $0x9,%11;"
278  "pslld $0xe,%%xmm3;"
279  "xor %6,%11;"
280  "ror $0x5,%10;"
281  "xor %4,%12;"
282  "psrld $0x12,%%xmm2;"
283  "ror $0xb,%11;"
284  "xor %9,%10;"
285  "and %9,%12;"
286  "ror $0x6,%10;"
287  "pxor %%xmm3,%%xmm1;"
288  "xor %6,%11;"
289  "xor %4,%12;"
290  "psrld $0x3,%%xmm8;"
291  "add %10,%12;"
292  "add 4+%16,%12;"
293  "ror $0x2,%11;"
294  "pxor %%xmm2,%%xmm1;"
295  "mov %6,%10;"
296  "add %12,%5;"
297  "mov %6,%12;"
298  "pxor %%xmm8,%%xmm1;"
299  "or %7,%10;"
300  "add %5,%8;"
301  "and %7,%12;"
302  "pshufd $0xfa,%%xmm4,%%xmm2;"
303  "and %k2,%10;"
304  "add %11,%5;"
305  "paddd %%xmm1,%%xmm0;"
306  "or %12,%10;"
307  "add %10,%5;"
308  "movdqa %%xmm2,%%xmm3;"
309  "mov %8,%10;"
310  "mov %5,%11;"
311  "ror $0xe,%10;"
312  "movdqa %%xmm2,%%xmm8;"
313  "xor %8,%10;"
314  "ror $0x9,%11;"
315  "mov %9,%12;"
316  "xor %5,%11;"
317  "ror $0x5,%10;"
318  "psrlq $0x11,%%xmm2;"
319  "xor %3,%12;"
320  "psrlq $0x13,%%xmm3;"
321  "xor %8,%10;"
322  "and %8,%12;"
323  "psrld $0xa,%%xmm8;"
324  "ror $0xb,%11;"
325  "xor %5,%11;"
326  "xor %3,%12;"
327  "ror $0x6,%10;"
328  "pxor %%xmm3,%%xmm2;"
329  "add %10,%12;"
330  "ror $0x2,%11;"
331  "add 8+%16,%12;"
332  "pxor %%xmm2,%%xmm8;"
333  "mov %5,%10;"
334  "add %12,%4;"
335  "mov %5,%12;"
336  "pshufb %%xmm10,%%xmm8;"
337  "or %k2,%10;"
338  "add %4,%7;"
339  "and %k2,%12;"
340  "paddd %%xmm8,%%xmm0;"
341  "and %6,%10;"
342  "add %11,%4;"
343  "pshufd $0x50,%%xmm0,%%xmm2;"
344  "or %12,%10;"
345  "add %10,%4;"
346  "movdqa %%xmm2,%%xmm3;"
347  "mov %7,%10;"
348  "ror $0xe,%10;"
349  "mov %4,%11;"
350  "movdqa %%xmm2,%%xmm5;"
351  "ror $0x9,%11;"
352  "xor %7,%10;"
353  "mov %8,%12;"
354  "ror $0x5,%10;"
355  "psrlq $0x11,%%xmm2;"
356  "xor %4,%11;"
357  "xor %9,%12;"
358  "psrlq $0x13,%%xmm3;"
359  "xor %7,%10;"
360  "and %7,%12;"
361  "ror $0xb,%11;"
362  "psrld $0xa,%%xmm5;"
363  "xor %4,%11;"
364  "ror $0x6,%10;"
365  "xor %9,%12;"
366  "pxor %%xmm3,%%xmm2;"
367  "ror $0x2,%11;"
368  "add %10,%12;"
369  "add 12+%16,%12;"
370  "pxor %%xmm2,%%xmm5;"
371  "mov %4,%10;"
372  "add %12,%3;"
373  "mov %4,%12;"
374  "pshufb %%xmm11,%%xmm5;"
375  "or %6,%10;"
376  "add %3,%k2;"
377  "and %6,%12;"
378  "paddd %%xmm0,%%xmm5;"
379  "and %5,%10;"
380  "add %11,%3;"
381  "or %12,%10;"
382  "add %10,%3;"
383  "movdqa 0x20(%13),%%xmm9;"
384  "paddd %%xmm6,%%xmm9;"
385  "movdqa %%xmm9,%16;"
386  "movdqa %%xmm5,%%xmm0;"
387  "mov %k2,%10;"
388  "ror $0xe,%10;"
389  "mov %3,%11;"
390  "palignr $0x4,%%xmm4,%%xmm0;"
391  "ror $0x9,%11;"
392  "xor %k2,%10;"
393  "mov %7,%12;"
394  "ror $0x5,%10;"
395  "movdqa %%xmm7,%%xmm1;"
396  "xor %3,%11;"
397  "xor %8,%12;"
398  "paddd %%xmm6,%%xmm0;"
399  "xor %k2,%10;"
400  "and %k2,%12;"
401  "ror $0xb,%11;"
402  "palignr $0x4,%%xmm6,%%xmm1;"
403  "xor %3,%11;"
404  "ror $0x6,%10;"
405  "xor %8,%12;"
406  "movdqa %%xmm1,%%xmm2;"
407  "ror $0x2,%11;"
408  "add %10,%12;"
409  "add %16,%12;"
410  "movdqa %%xmm1,%%xmm3;"
411  "mov %3,%10;"
412  "add %12,%9;"
413  "mov %3,%12;"
414  "pslld $0x19,%%xmm1;"
415  "or %5,%10;"
416  "add %9,%6;"
417  "and %5,%12;"
418  "psrld $0x7,%%xmm2;"
419  "and %4,%10;"
420  "add %11,%9;"
421  "por %%xmm2,%%xmm1;"
422  "or %12,%10;"
423  "add %10,%9;"
424  "movdqa %%xmm3,%%xmm2;"
425  "mov %6,%10;"
426  "mov %9,%11;"
427  "movdqa %%xmm3,%%xmm8;"
428  "ror $0xe,%10;"
429  "xor %6,%10;"
430  "mov %k2,%12;"
431  "ror $0x9,%11;"
432  "pslld $0xe,%%xmm3;"
433  "xor %9,%11;"
434  "ror $0x5,%10;"
435  "xor %7,%12;"
436  "psrld $0x12,%%xmm2;"
437  "ror $0xb,%11;"
438  "xor %6,%10;"
439  "and %6,%12;"
440  "ror $0x6,%10;"
441  "pxor %%xmm3,%%xmm1;"
442  "xor %9,%11;"
443  "xor %7,%12;"
444  "psrld $0x3,%%xmm8;"
445  "add %10,%12;"
446  "add 4+%16,%12;"
447  "ror $0x2,%11;"
448  "pxor %%xmm2,%%xmm1;"
449  "mov %9,%10;"
450  "add %12,%8;"
451  "mov %9,%12;"
452  "pxor %%xmm8,%%xmm1;"
453  "or %4,%10;"
454  "add %8,%5;"
455  "and %4,%12;"
456  "pshufd $0xfa,%%xmm5,%%xmm2;"
457  "and %3,%10;"
458  "add %11,%8;"
459  "paddd %%xmm1,%%xmm0;"
460  "or %12,%10;"
461  "add %10,%8;"
462  "movdqa %%xmm2,%%xmm3;"
463  "mov %5,%10;"
464  "mov %8,%11;"
465  "ror $0xe,%10;"
466  "movdqa %%xmm2,%%xmm8;"
467  "xor %5,%10;"
468  "ror $0x9,%11;"
469  "mov %6,%12;"
470  "xor %8,%11;"
471  "ror $0x5,%10;"
472  "psrlq $0x11,%%xmm2;"
473  "xor %k2,%12;"
474  "psrlq $0x13,%%xmm3;"
475  "xor %5,%10;"
476  "and %5,%12;"
477  "psrld $0xa,%%xmm8;"
478  "ror $0xb,%11;"
479  "xor %8,%11;"
480  "xor %k2,%12;"
481  "ror $0x6,%10;"
482  "pxor %%xmm3,%%xmm2;"
483  "add %10,%12;"
484  "ror $0x2,%11;"
485  "add 8+%16,%12;"
486  "pxor %%xmm2,%%xmm8;"
487  "mov %8,%10;"
488  "add %12,%7;"
489  "mov %8,%12;"
490  "pshufb %%xmm10,%%xmm8;"
491  "or %3,%10;"
492  "add %7,%4;"
493  "and %3,%12;"
494  "paddd %%xmm8,%%xmm0;"
495  "and %9,%10;"
496  "add %11,%7;"
497  "pshufd $0x50,%%xmm0,%%xmm2;"
498  "or %12,%10;"
499  "add %10,%7;"
500  "movdqa %%xmm2,%%xmm3;"
501  "mov %4,%10;"
502  "ror $0xe,%10;"
503  "mov %7,%11;"
504  "movdqa %%xmm2,%%xmm6;"
505  "ror $0x9,%11;"
506  "xor %4,%10;"
507  "mov %5,%12;"
508  "ror $0x5,%10;"
509  "psrlq $0x11,%%xmm2;"
510  "xor %7,%11;"
511  "xor %6,%12;"
512  "psrlq $0x13,%%xmm3;"
513  "xor %4,%10;"
514  "and %4,%12;"
515  "ror $0xb,%11;"
516  "psrld $0xa,%%xmm6;"
517  "xor %7,%11;"
518  "ror $0x6,%10;"
519  "xor %6,%12;"
520  "pxor %%xmm3,%%xmm2;"
521  "ror $0x2,%11;"
522  "add %10,%12;"
523  "add 12+%16,%12;"
524  "pxor %%xmm2,%%xmm6;"
525  "mov %7,%10;"
526  "add %12,%k2;"
527  "mov %7,%12;"
528  "pshufb %%xmm11,%%xmm6;"
529  "or %9,%10;"
530  "add %k2,%3;"
531  "and %9,%12;"
532  "paddd %%xmm0,%%xmm6;"
533  "and %8,%10;"
534  "add %11,%k2;"
535  "or %12,%10;"
536  "add %10,%k2;"
537  "movdqa 0x30(%13),%%xmm9;"
538  "paddd %%xmm7,%%xmm9;"
539  "movdqa %%xmm9,%16;"
540  "add $0x40,%13;"
541  "movdqa %%xmm6,%%xmm0;"
542  "mov %3,%10;"
543  "ror $0xe,%10;"
544  "mov %k2,%11;"
545  "palignr $0x4,%%xmm5,%%xmm0;"
546  "ror $0x9,%11;"
547  "xor %3,%10;"
548  "mov %4,%12;"
549  "ror $0x5,%10;"
550  "movdqa %%xmm4,%%xmm1;"
551  "xor %k2,%11;"
552  "xor %5,%12;"
553  "paddd %%xmm7,%%xmm0;"
554  "xor %3,%10;"
555  "and %3,%12;"
556  "ror $0xb,%11;"
557  "palignr $0x4,%%xmm7,%%xmm1;"
558  "xor %k2,%11;"
559  "ror $0x6,%10;"
560  "xor %5,%12;"
561  "movdqa %%xmm1,%%xmm2;"
562  "ror $0x2,%11;"
563  "add %10,%12;"
564  "add %16,%12;"
565  "movdqa %%xmm1,%%xmm3;"
566  "mov %k2,%10;"
567  "add %12,%6;"
568  "mov %k2,%12;"
569  "pslld $0x19,%%xmm1;"
570  "or %8,%10;"
571  "add %6,%9;"
572  "and %8,%12;"
573  "psrld $0x7,%%xmm2;"
574  "and %7,%10;"
575  "add %11,%6;"
576  "por %%xmm2,%%xmm1;"
577  "or %12,%10;"
578  "add %10,%6;"
579  "movdqa %%xmm3,%%xmm2;"
580  "mov %9,%10;"
581  "mov %6,%11;"
582  "movdqa %%xmm3,%%xmm8;"
583  "ror $0xe,%10;"
584  "xor %9,%10;"
585  "mov %3,%12;"
586  "ror $0x9,%11;"
587  "pslld $0xe,%%xmm3;"
588  "xor %6,%11;"
589  "ror $0x5,%10;"
590  "xor %4,%12;"
591  "psrld $0x12,%%xmm2;"
592  "ror $0xb,%11;"
593  "xor %9,%10;"
594  "and %9,%12;"
595  "ror $0x6,%10;"
596  "pxor %%xmm3,%%xmm1;"
597  "xor %6,%11;"
598  "xor %4,%12;"
599  "psrld $0x3,%%xmm8;"
600  "add %10,%12;"
601  "add 4+%16,%12;"
602  "ror $0x2,%11;"
603  "pxor %%xmm2,%%xmm1;"
604  "mov %6,%10;"
605  "add %12,%5;"
606  "mov %6,%12;"
607  "pxor %%xmm8,%%xmm1;"
608  "or %7,%10;"
609  "add %5,%8;"
610  "and %7,%12;"
611  "pshufd $0xfa,%%xmm6,%%xmm2;"
612  "and %k2,%10;"
613  "add %11,%5;"
614  "paddd %%xmm1,%%xmm0;"
615  "or %12,%10;"
616  "add %10,%5;"
617  "movdqa %%xmm2,%%xmm3;"
618  "mov %8,%10;"
619  "mov %5,%11;"
620  "ror $0xe,%10;"
621  "movdqa %%xmm2,%%xmm8;"
622  "xor %8,%10;"
623  "ror $0x9,%11;"
624  "mov %9,%12;"
625  "xor %5,%11;"
626  "ror $0x5,%10;"
627  "psrlq $0x11,%%xmm2;"
628  "xor %3,%12;"
629  "psrlq $0x13,%%xmm3;"
630  "xor %8,%10;"
631  "and %8,%12;"
632  "psrld $0xa,%%xmm8;"
633  "ror $0xb,%11;"
634  "xor %5,%11;"
635  "xor %3,%12;"
636  "ror $0x6,%10;"
637  "pxor %%xmm3,%%xmm2;"
638  "add %10,%12;"
639  "ror $0x2,%11;"
640  "add 8+%16,%12;"
641  "pxor %%xmm2,%%xmm8;"
642  "mov %5,%10;"
643  "add %12,%4;"
644  "mov %5,%12;"
645  "pshufb %%xmm10,%%xmm8;"
646  "or %k2,%10;"
647  "add %4,%7;"
648  "and %k2,%12;"
649  "paddd %%xmm8,%%xmm0;"
650  "and %6,%10;"
651  "add %11,%4;"
652  "pshufd $0x50,%%xmm0,%%xmm2;"
653  "or %12,%10;"
654  "add %10,%4;"
655  "movdqa %%xmm2,%%xmm3;"
656  "mov %7,%10;"
657  "ror $0xe,%10;"
658  "mov %4,%11;"
659  "movdqa %%xmm2,%%xmm7;"
660  "ror $0x9,%11;"
661  "xor %7,%10;"
662  "mov %8,%12;"
663  "ror $0x5,%10;"
664  "psrlq $0x11,%%xmm2;"
665  "xor %4,%11;"
666  "xor %9,%12;"
667  "psrlq $0x13,%%xmm3;"
668  "xor %7,%10;"
669  "and %7,%12;"
670  "ror $0xb,%11;"
671  "psrld $0xa,%%xmm7;"
672  "xor %4,%11;"
673  "ror $0x6,%10;"
674  "xor %9,%12;"
675  "pxor %%xmm3,%%xmm2;"
676  "ror $0x2,%11;"
677  "add %10,%12;"
678  "add 12+%16,%12;"
679  "pxor %%xmm2,%%xmm7;"
680  "mov %4,%10;"
681  "add %12,%3;"
682  "mov %4,%12;"
683  "pshufb %%xmm11,%%xmm7;"
684  "or %6,%10;"
685  "add %3,%k2;"
686  "and %6,%12;"
687  "paddd %%xmm0,%%xmm7;"
688  "and %5,%10;"
689  "add %11,%3;"
690  "or %12,%10;"
691  "add %10,%3;"
692  "sub $0x1,%1;"
693  "jne Lloop1_%=;"
694  "mov $0x2,%1;"
695 
696  "Lloop2_%=:"
697  "paddd 0x0(%13),%%xmm4;"
698  "movdqa %%xmm4,%16;"
699  "mov %k2,%10;"
700  "ror $0xe,%10;"
701  "mov %3,%11;"
702  "xor %k2,%10;"
703  "ror $0x9,%11;"
704  "mov %7,%12;"
705  "xor %3,%11;"
706  "ror $0x5,%10;"
707  "xor %8,%12;"
708  "xor %k2,%10;"
709  "ror $0xb,%11;"
710  "and %k2,%12;"
711  "xor %3,%11;"
712  "ror $0x6,%10;"
713  "xor %8,%12;"
714  "add %10,%12;"
715  "ror $0x2,%11;"
716  "add %16,%12;"
717  "mov %3,%10;"
718  "add %12,%9;"
719  "mov %3,%12;"
720  "or %5,%10;"
721  "add %9,%6;"
722  "and %5,%12;"
723  "and %4,%10;"
724  "add %11,%9;"
725  "or %12,%10;"
726  "add %10,%9;"
727  "mov %6,%10;"
728  "ror $0xe,%10;"
729  "mov %9,%11;"
730  "xor %6,%10;"
731  "ror $0x9,%11;"
732  "mov %k2,%12;"
733  "xor %9,%11;"
734  "ror $0x5,%10;"
735  "xor %7,%12;"
736  "xor %6,%10;"
737  "ror $0xb,%11;"
738  "and %6,%12;"
739  "xor %9,%11;"
740  "ror $0x6,%10;"
741  "xor %7,%12;"
742  "add %10,%12;"
743  "ror $0x2,%11;"
744  "add 4+%16,%12;"
745  "mov %9,%10;"
746  "add %12,%8;"
747  "mov %9,%12;"
748  "or %4,%10;"
749  "add %8,%5;"
750  "and %4,%12;"
751  "and %3,%10;"
752  "add %11,%8;"
753  "or %12,%10;"
754  "add %10,%8;"
755  "mov %5,%10;"
756  "ror $0xe,%10;"
757  "mov %8,%11;"
758  "xor %5,%10;"
759  "ror $0x9,%11;"
760  "mov %6,%12;"
761  "xor %8,%11;"
762  "ror $0x5,%10;"
763  "xor %k2,%12;"
764  "xor %5,%10;"
765  "ror $0xb,%11;"
766  "and %5,%12;"
767  "xor %8,%11;"
768  "ror $0x6,%10;"
769  "xor %k2,%12;"
770  "add %10,%12;"
771  "ror $0x2,%11;"
772  "add 8+%16,%12;"
773  "mov %8,%10;"
774  "add %12,%7;"
775  "mov %8,%12;"
776  "or %3,%10;"
777  "add %7,%4;"
778  "and %3,%12;"
779  "and %9,%10;"
780  "add %11,%7;"
781  "or %12,%10;"
782  "add %10,%7;"
783  "mov %4,%10;"
784  "ror $0xe,%10;"
785  "mov %7,%11;"
786  "xor %4,%10;"
787  "ror $0x9,%11;"
788  "mov %5,%12;"
789  "xor %7,%11;"
790  "ror $0x5,%10;"
791  "xor %6,%12;"
792  "xor %4,%10;"
793  "ror $0xb,%11;"
794  "and %4,%12;"
795  "xor %7,%11;"
796  "ror $0x6,%10;"
797  "xor %6,%12;"
798  "add %10,%12;"
799  "ror $0x2,%11;"
800  "add 12+%16,%12;"
801  "mov %7,%10;"
802  "add %12,%k2;"
803  "mov %7,%12;"
804  "or %9,%10;"
805  "add %k2,%3;"
806  "and %9,%12;"
807  "and %8,%10;"
808  "add %11,%k2;"
809  "or %12,%10;"
810  "add %10,%k2;"
811  "paddd 0x10(%13),%%xmm5;"
812  "movdqa %%xmm5,%16;"
813  "add $0x20,%13;"
814  "mov %3,%10;"
815  "ror $0xe,%10;"
816  "mov %k2,%11;"
817  "xor %3,%10;"
818  "ror $0x9,%11;"
819  "mov %4,%12;"
820  "xor %k2,%11;"
821  "ror $0x5,%10;"
822  "xor %5,%12;"
823  "xor %3,%10;"
824  "ror $0xb,%11;"
825  "and %3,%12;"
826  "xor %k2,%11;"
827  "ror $0x6,%10;"
828  "xor %5,%12;"
829  "add %10,%12;"
830  "ror $0x2,%11;"
831  "add %16,%12;"
832  "mov %k2,%10;"
833  "add %12,%6;"
834  "mov %k2,%12;"
835  "or %8,%10;"
836  "add %6,%9;"
837  "and %8,%12;"
838  "and %7,%10;"
839  "add %11,%6;"
840  "or %12,%10;"
841  "add %10,%6;"
842  "mov %9,%10;"
843  "ror $0xe,%10;"
844  "mov %6,%11;"
845  "xor %9,%10;"
846  "ror $0x9,%11;"
847  "mov %3,%12;"
848  "xor %6,%11;"
849  "ror $0x5,%10;"
850  "xor %4,%12;"
851  "xor %9,%10;"
852  "ror $0xb,%11;"
853  "and %9,%12;"
854  "xor %6,%11;"
855  "ror $0x6,%10;"
856  "xor %4,%12;"
857  "add %10,%12;"
858  "ror $0x2,%11;"
859  "add 4+%16,%12;"
860  "mov %6,%10;"
861  "add %12,%5;"
862  "mov %6,%12;"
863  "or %7,%10;"
864  "add %5,%8;"
865  "and %7,%12;"
866  "and %k2,%10;"
867  "add %11,%5;"
868  "or %12,%10;"
869  "add %10,%5;"
870  "mov %8,%10;"
871  "ror $0xe,%10;"
872  "mov %5,%11;"
873  "xor %8,%10;"
874  "ror $0x9,%11;"
875  "mov %9,%12;"
876  "xor %5,%11;"
877  "ror $0x5,%10;"
878  "xor %3,%12;"
879  "xor %8,%10;"
880  "ror $0xb,%11;"
881  "and %8,%12;"
882  "xor %5,%11;"
883  "ror $0x6,%10;"
884  "xor %3,%12;"
885  "add %10,%12;"
886  "ror $0x2,%11;"
887  "add 8+%16,%12;"
888  "mov %5,%10;"
889  "add %12,%4;"
890  "mov %5,%12;"
891  "or %k2,%10;"
892  "add %4,%7;"
893  "and %k2,%12;"
894  "and %6,%10;"
895  "add %11,%4;"
896  "or %12,%10;"
897  "add %10,%4;"
898  "mov %7,%10;"
899  "ror $0xe,%10;"
900  "mov %4,%11;"
901  "xor %7,%10;"
902  "ror $0x9,%11;"
903  "mov %8,%12;"
904  "xor %4,%11;"
905  "ror $0x5,%10;"
906  "xor %9,%12;"
907  "xor %7,%10;"
908  "ror $0xb,%11;"
909  "and %7,%12;"
910  "xor %4,%11;"
911  "ror $0x6,%10;"
912  "xor %9,%12;"
913  "add %10,%12;"
914  "ror $0x2,%11;"
915  "add 12+%16,%12;"
916  "mov %4,%10;"
917  "add %12,%3;"
918  "mov %4,%12;"
919  "or %6,%10;"
920  "add %3,%k2;"
921  "and %6,%12;"
922  "and %5,%10;"
923  "add %11,%3;"
924  "or %12,%10;"
925  "add %10,%3;"
926  "movdqa %%xmm6,%%xmm4;"
927  "movdqa %%xmm7,%%xmm5;"
928  "sub $0x1,%1;"
929  "jne Lloop2_%=;"
930  "add (%0),%3;"
931  "mov %3,(%0);"
932  "add 0x4(%0),%4;"
933  "mov %4,0x4(%0);"
934  "add 0x8(%0),%5;"
935  "mov %5,0x8(%0);"
936  "add 0xc(%0),%6;"
937  "mov %6,0xc(%0);"
938  "add 0x10(%0),%k2;"
939  "mov %k2,0x10(%0);"
940  "add 0x14(%0),%7;"
941  "mov %7,0x14(%0);"
942  "add 0x18(%0),%8;"
943  "mov %8,0x18(%0);"
944  "add 0x1c(%0),%9;"
945  "mov %9,0x1c(%0);"
946  "mov %15,%1;"
947  "add $0x40,%1;"
948  "cmp %14,%1;"
949  "jne Lloop0_%=;"
950 
951  "Ldone_hash_%=:"
952 
953  : "+r"(s), "+r"(chunk), "+r"(blocks), "=r"(a), "=r"(b), "=r"(c), "=r"(d), /* e = chunk */ "=r"(f), "=r"(g), "=r"(h), "=r"(y0), "=r"(y1), "=r"(y2), "=r"(tbl), "+m"(inp_end), "+m"(inp), "+m"(xfer)
954  : "m"(K256), "m"(FLIP_MASK), "m"(SHUF_00BA), "m"(SHUF_DC00)
955  : "cc", "memory", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12"
956  );
957 }
958 }
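
/*
   Minimal usage sketch (illustrative only, not part of the original file).
   The caller owns message padding and state initialisation; Transform only
   runs the compression rounds. For example, hashing the empty message takes
   the standard SHA-256 IV and a single pre-padded block:

       uint32_t s[8] = {0x6a09e667, 0xbb67ae85, 0x3c6ef372, 0xa54ff53a,
                        0x510e527f, 0x9b05688c, 0x1f83d9ab, 0x5be0cd19};
       unsigned char block[64] = {0x80};   // 0x80 pad byte, zero length field
       sha256_sse4::Transform(s, block, 1);
       // s should now hold e3b0c442 98fc1c14 ... 7852b855, the SHA-256 of "".
*/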
959 
960 /*
961 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
962 ; Copyright (c) 2012, Intel Corporation
963 ;
964 ; All rights reserved.
965 ;
966 ; Redistribution and use in source and binary forms, with or without
967 ; modification, are permitted provided that the following conditions are
968 ; met:
969 ;
970 ; * Redistributions of source code must retain the above copyright
971 ; notice, this list of conditions and the following disclaimer.
972 ;
973 ; * Redistributions in binary form must reproduce the above copyright
974 ; notice, this list of conditions and the following disclaimer in the
975 ; documentation and/or other materials provided with the
976 ; distribution.
977 ;
978 ; * Neither the name of the Intel Corporation nor the names of its
979 ; contributors may be used to endorse or promote products derived from
980 ; this software without specific prior written permission.
981 ;
982 ;
983 ; THIS SOFTWARE IS PROVIDED BY INTEL CORPORATION "AS IS" AND ANY
984 ; EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
985 ; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
986 ; PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL CORPORATION OR
987 ; CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
988 ; EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
989 ; PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
990 ; PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
991 ; LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
992 ; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
993 ; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
994 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
995 ;
996 ; Example YASM command lines:
997 ; Windows: yasm -Xvc -f x64 -rnasm -pnasm -o sha256_sse4.obj -g cv8 sha256_sse4.asm
998 ; Linux: yasm -f x64 -f elf64 -X gnu -g dwarf2 -D LINUX -o sha256_sse4.o sha256_sse4.asm
999 ;
1000 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
1001 ;
1002 ; This code is described in an Intel White-Paper:
1003 ; "Fast SHA-256 Implementations on Intel Architecture Processors"
1004 ;
1005 ; To find it, surf to http://www.intel.com/p/en_US/embedded
1006 ; and search for that title.
1007 ; The paper is expected to be released roughly at the end of April, 2012
1008 ;
1009 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
1010 ; This code schedules 1 block at a time, with 4 lanes per block
1011 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
1012 
1013 %define MOVDQ movdqu ;; assume buffers not aligned
1014 
1015 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; Define Macros
1016 
1017 ; addm [mem], reg
1018 ; Add reg to mem using reg-mem add and store
1019 %macro addm 2
1020  add %2, %1
1021  mov %1, %2
1022 %endm
1023 
1024 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
1025 
1026 ; COPY_XMM_AND_BSWAP xmm, [mem], byte_flip_mask
1027 ; Load xmm with mem and byte swap each dword
1028 %macro COPY_XMM_AND_BSWAP 3
1029  MOVDQ %1, %2
1030  pshufb %1, %3
1031 %endmacro
1032 
1033 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
1034 
1035 %define X0 xmm4
1036 %define X1 xmm5
1037 %define X2 xmm6
1038 %define X3 xmm7
1039 
1040 %define XTMP0 xmm0
1041 %define XTMP1 xmm1
1042 %define XTMP2 xmm2
1043 %define XTMP3 xmm3
1044 %define XTMP4 xmm8
1045 %define XFER xmm9
1046 
1047 %define SHUF_00BA xmm10 ; shuffle xBxA -> 00BA
1048 %define SHUF_DC00 xmm11 ; shuffle xDxC -> DC00
1049 %define BYTE_FLIP_MASK xmm12
1050 
1051 %ifdef LINUX
1052 %define NUM_BLKS rdx ; 3rd arg
1053 %define CTX rsi ; 2nd arg
1054 %define INP rdi ; 1st arg
1055 
1056 %define SRND rdi ; clobbers INP
1057 %define c ecx
1058 %define d r8d
1059 %define e edx
1060 %else
1061 %define NUM_BLKS r8 ; 3rd arg
1062 %define CTX rdx ; 2nd arg
1063 %define INP rcx ; 1st arg
1064 
1065 %define SRND rcx ; clobbers INP
1066 %define c edi
1067 %define d esi
1068 %define e r8d
1069 
1070 %endif
1071 %define TBL rbp
1072 %define a eax
1073 %define b ebx
1074 
1075 %define f r9d
1076 %define g r10d
1077 %define h r11d
1078 
1079 %define y0 r13d
1080 %define y1 r14d
1081 %define y2 r15d
1082 
1083 
1084 
1085 _INP_END_SIZE equ 8
1086 _INP_SIZE equ 8
1087 _XFER_SIZE equ 8
1088 %ifdef LINUX
1089 _XMM_SAVE_SIZE equ 0
1090 %else
1091 _XMM_SAVE_SIZE equ 7*16
1092 %endif
1093 ; STACK_SIZE plus pushes must be an odd multiple of 8
1094 _ALIGN_SIZE equ 8
1095 
1096 _INP_END equ 0
1097 _INP equ _INP_END + _INP_END_SIZE
1098 _XFER equ _INP + _INP_SIZE
1099 _XMM_SAVE equ _XFER + _XFER_SIZE + _ALIGN_SIZE
1100 STACK_SIZE equ _XMM_SAVE + _XMM_SAVE_SIZE
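; With the sizes above: _INP_END = 0, _INP = 8, _XFER = 16, _XMM_SAVE = 32,
; so STACK_SIZE is 32 on Linux and 32 + 7*16 = 144 on Windows. Adding the
; 5 (Linux) or 7 (Windows) 8-byte pushes gives 72 and 200 bytes, both odd
; multiples of 8 as required.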
1101 
1102 ; rotate_Xs
1103 ; Rotate values of symbols X0...X3
1104 %macro rotate_Xs 0
1105 %xdefine X_ X0
1106 %xdefine X0 X1
1107 %xdefine X1 X2
1108 %xdefine X2 X3
1109 %xdefine X3 X_
1110 %endm
1111 
1112 ; ROTATE_ARGS
1113 ; Rotate values of symbols a...h
1114 %macro ROTATE_ARGS 0
1115 %xdefine TMP_ h
1116 %xdefine h g
1117 %xdefine g f
1118 %xdefine f e
1119 %xdefine e d
1120 %xdefine d c
1121 %xdefine c b
1122 %xdefine b a
1123 %xdefine a TMP_
1124 %endm
1125 
1126 %macro FOUR_ROUNDS_AND_SCHED 0
1127  ;; compute s0 four at a time and s1 two at a time
1128  ;; compute W[-16] + W[-7] 4 at a time
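 ;; For reference, the schedule step implemented by this macro is
 ;;   W[t] = sigma1(W[t-2]) + W[t-7] + sigma0(W[t-15]) + W[t-16]
 ;; with sigma0(x) = ror(x,7) ^ ror(x,18) ^ (x >> 3)
 ;; and  sigma1(x) = ror(x,17) ^ ror(x,19) ^ (x >> 10)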
1129  movdqa XTMP0, X3
1130  mov y0, e ; y0 = e
1131  ror y0, (25-11) ; y0 = e >> (25-11)
1132  mov y1, a ; y1 = a
1133  palignr XTMP0, X2, 4 ; XTMP0 = W[-7]
1134  ror y1, (22-13) ; y1 = a >> (22-13)
1135  xor y0, e ; y0 = e ^ (e >> (25-11))
1136  mov y2, f ; y2 = f
1137  ror y0, (11-6) ; y0 = (e >> (11-6)) ^ (e >> (25-6))
1138  movdqa XTMP1, X1
1139  xor y1, a ; y1 = a ^ (a >> (22-13))
1140  xor y2, g ; y2 = f^g
1141  paddd XTMP0, X0 ; XTMP0 = W[-7] + W[-16]
1142  xor y0, e ; y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
1143  and y2, e ; y2 = (f^g)&e
1144  ror y1, (13-2) ; y1 = (a >> (13-2)) ^ (a >> (22-2))
1145  ;; compute s0
1146  palignr XTMP1, X0, 4 ; XTMP1 = W[-15]
1147  xor y1, a ; y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
1148  ror y0, 6 ; y0 = S1 = (e>>6) ^ (e>>11) ^ (e>>25)
1149  xor y2, g ; y2 = CH = ((f^g)&e)^g
1150  movdqa XTMP2, XTMP1 ; XTMP2 = W[-15]
1151  ror y1, 2 ; y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
1152  add y2, y0 ; y2 = S1 + CH
1153  add y2, [rsp + _XFER + 0*4] ; y2 = k + w + S1 + CH
1154  movdqa XTMP3, XTMP1 ; XTMP3 = W[-15]
1155  mov y0, a ; y0 = a
1156  add h, y2 ; h = h + S1 + CH + k + w
1157  mov y2, a ; y2 = a
1158  pslld XTMP1, (32-7)
1159  or y0, c ; y0 = a|c
1160  add d, h ; d = d + h + S1 + CH + k + w
1161  and y2, c ; y2 = a&c
1162  psrld XTMP2, 7
1163  and y0, b ; y0 = (a|c)&b
1164  add h, y1 ; h = h + S1 + CH + k + w + S0
1165  por XTMP1, XTMP2 ; XTMP1 = W[-15] ror 7
1166  or y0, y2 ; y0 = MAJ = ((a|c)&b)|(a&c)
1167  add h, y0 ; h = h + S1 + CH + k + w + S0 + MAJ
1168 
1169 ROTATE_ARGS
1170  movdqa XTMP2, XTMP3 ; XTMP2 = W[-15]
1171  mov y0, e ; y0 = e
1172  mov y1, a ; y1 = a
1173  movdqa XTMP4, XTMP3 ; XTMP4 = W[-15]
1174  ror y0, (25-11) ; y0 = e >> (25-11)
1175  xor y0, e ; y0 = e ^ (e >> (25-11))
1176  mov y2, f ; y2 = f
1177  ror y1, (22-13) ; y1 = a >> (22-13)
1178  pslld XTMP3, (32-18)
1179  xor y1, a ; y1 = a ^ (a >> (22-13))
1180  ror y0, (11-6) ; y0 = (e >> (11-6)) ^ (e >> (25-6))
1181  xor y2, g ; y2 = f^g
1182  psrld XTMP2, 18
1183  ror y1, (13-2) ; y1 = (a >> (13-2)) ^ (a >> (22-2))
1184  xor y0, e ; y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
1185  and y2, e ; y2 = (f^g)&e
1186  ror y0, 6 ; y0 = S1 = (e>>6) ^ (e>>11) ^ (e>>25)
1187  pxor XTMP1, XTMP3
1188  xor y1, a ; y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
1189  xor y2, g ; y2 = CH = ((f^g)&e)^g
1190  psrld XTMP4, 3 ; XTMP4 = W[-15] >> 3
1191  add y2, y0 ; y2 = S1 + CH
1192  add y2, [rsp + _XFER + 1*4] ; y2 = k + w + S1 + CH
1193  ror y1, 2 ; y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
1194  pxor XTMP1, XTMP2 ; XTMP1 = W[-15] ror 7 ^ W[-15] ror 18
1195  mov y0, a ; y0 = a
1196  add h, y2 ; h = h + S1 + CH + k + w
1197  mov y2, a ; y2 = a
1198  pxor XTMP1, XTMP4 ; XTMP1 = s0
1199  or y0, c ; y0 = a|c
1200  add d, h ; d = d + h + S1 + CH + k + w
1201  and y2, c ; y2 = a&c
1202  ;; compute low s1
1203  pshufd XTMP2, X3, 11111010b ; XTMP2 = W[-2] {BBAA}
1204  and y0, b ; y0 = (a|c)&b
1205  add h, y1 ; h = h + S1 + CH + k + w + S0
1206  paddd XTMP0, XTMP1 ; XTMP0 = W[-16] + W[-7] + s0
1207  or y0, y2 ; y0 = MAJ = ((a|c)&b)|(a&c)
1208  add h, y0 ; h = h + S1 + CH + k + w + S0 + MAJ
1209 
1210 ROTATE_ARGS
1211  movdqa XTMP3, XTMP2 ; XTMP3 = W[-2] {BBAA}
1212  mov y0, e ; y0 = e
1213  mov y1, a ; y1 = a
1214  ror y0, (25-11) ; y0 = e >> (25-11)
1215  movdqa XTMP4, XTMP2 ; XTMP4 = W[-2] {BBAA}
1216  xor y0, e ; y0 = e ^ (e >> (25-11))
1217  ror y1, (22-13) ; y1 = a >> (22-13)
1218  mov y2, f ; y2 = f
1219  xor y1, a ; y1 = a ^ (a >> (22-13))
1220  ror y0, (11-6) ; y0 = (e >> (11-6)) ^ (e >> (25-6))
1221  psrlq XTMP2, 17 ; XTMP2 = W[-2] ror 17 {xBxA}
1222  xor y2, g ; y2 = f^g
1223  psrlq XTMP3, 19 ; XTMP3 = W[-2] ror 19 {xBxA}
1224  xor y0, e ; y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
1225  and y2, e ; y2 = (f^g)&e
1226  psrld XTMP4, 10 ; XTMP4 = W[-2] >> 10 {BBAA}
1227  ror y1, (13-2) ; y1 = (a >> (13-2)) ^ (a >> (22-2))
1228  xor y1, a ; y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
1229  xor y2, g ; y2 = CH = ((f^g)&e)^g
1230  ror y0, 6 ; y0 = S1 = (e>>6) ^ (e>>11) ^ (e>>25)
1231  pxor XTMP2, XTMP3
1232  add y2, y0 ; y2 = S1 + CH
1233  ror y1, 2 ; y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
1234  add y2, [rsp + _XFER + 2*4] ; y2 = k + w + S1 + CH
1235  pxor XTMP4, XTMP2 ; XTMP4 = s1 {xBxA}
1236  mov y0, a ; y0 = a
1237  add h, y2 ; h = h + S1 + CH + k + w
1238  mov y2, a ; y2 = a
1239  pshufb XTMP4, SHUF_00BA ; XTMP4 = s1 {00BA}
1240  or y0, c ; y0 = a|c
1241  add d, h ; d = d + h + S1 + CH + k + w
1242  and y2, c ; y2 = a&c
1243  paddd XTMP0, XTMP4 ; XTMP0 = {..., ..., W[1], W[0]}
1244  and y0, b ; y0 = (a|c)&b
1245  add h, y1 ; h = h + S1 + CH + k + w + S0
1246  ;; compute high s1
1247  pshufd XTMP2, XTMP0, 01010000b ; XTMP2 = W[-2] {DDCC}
1248  or y0, y2 ; y0 = MAJ = ((a|c)&b)|(a&c)
1249  add h, y0 ; h = h + S1 + CH + k + w + S0 + MAJ
1250 
1251 ROTATE_ARGS
1252  movdqa XTMP3, XTMP2 ; XTMP3 = W[-2] {DDCC}
1253  mov y0, e ; y0 = e
1254  ror y0, (25-11) ; y0 = e >> (25-11)
1255  mov y1, a ; y1 = a
1256  movdqa X0, XTMP2 ; X0 = W[-2] {DDCC}
1257  ror y1, (22-13) ; y1 = a >> (22-13)
1258  xor y0, e ; y0 = e ^ (e >> (25-11))
1259  mov y2, f ; y2 = f
1260  ror y0, (11-6) ; y0 = (e >> (11-6)) ^ (e >> (25-6))
1261  psrlq XTMP2, 17 ; XTMP2 = W[-2] ror 17 {xDxC}
1262  xor y1, a ; y1 = a ^ (a >> (22-13))
1263  xor y2, g ; y2 = f^g
1264  psrlq XTMP3, 19 ; XTMP3 = W[-2] ror 19 {xDxC}
1265  xor y0, e ; y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
1266  and y2, e ; y2 = (f^g)&e
1267  ror y1, (13-2) ; y1 = (a >> (13-2)) ^ (a >> (22-2))
1268  psrld X0, 10 ; X0 = W[-2] >> 10 {DDCC}
1269  xor y1, a ; y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
1270  ror y0, 6 ; y0 = S1 = (e>>6) ^ (e>>11) ^ (e>>25)
1271  xor y2, g ; y2 = CH = ((f^g)&e)^g
1272  pxor XTMP2, XTMP3
1273  ror y1, 2 ; y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
1274  add y2, y0 ; y2 = S1 + CH
1275  add y2, [rsp + _XFER + 3*4] ; y2 = k + w + S1 + CH
1276  pxor X0, XTMP2 ; X0 = s1 {xDxC}
1277  mov y0, a ; y0 = a
1278  add h, y2 ; h = h + S1 + CH + k + w
1279  mov y2, a ; y2 = a
1280  pshufb X0, SHUF_DC00 ; X0 = s1 {DC00}
1281  or y0, c ; y0 = a|c
1282  add d, h ; d = d + h + S1 + CH + k + w
1283  and y2, c ; y2 = a&c
1284  paddd X0, XTMP0 ; X0 = {W[3], W[2], W[1], W[0]}
1285  and y0, b ; y0 = (a|c)&b
1286  add h, y1 ; h = h + S1 + CH + k + w + S0
1287  or y0, y2 ; y0 = MAJ = ((a|c)&b)|(a&c)
1288  add h, y0 ; h = h + S1 + CH + k + w + S0 + MAJ
1289 
1290 ROTATE_ARGS
1291 rotate_Xs
1292 %endm
1293 
1294 ;; input is [rsp + _XFER + %1 * 4]
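;; For reference, each DO_ROUND computes one SHA-256 round:
;;   S1 = ror(e,6) ^ ror(e,11) ^ ror(e,25)    CH  = (e & f) ^ (~e & g)
;;   S0 = ror(a,2) ^ ror(a,13) ^ ror(a,22)    MAJ = (a & b) ^ (a & c) ^ (b & c)
;;   h += S1 + CH + K[t] + W[t];  d += h;  h += S0 + MAJ
;; followed by ROTATE_ARGS to rename the working variables.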
1295 %macro DO_ROUND 1
1296  mov y0, e ; y0 = e
1297  ror y0, (25-11) ; y0 = e >> (25-11)
1298  mov y1, a ; y1 = a
1299  xor y0, e ; y0 = e ^ (e >> (25-11))
1300  ror y1, (22-13) ; y1 = a >> (22-13)
1301  mov y2, f ; y2 = f
1302  xor y1, a ; y1 = a ^ (a >> (22-13))
1303  ror y0, (11-6) ; y0 = (e >> (11-6)) ^ (e >> (25-6))
1304  xor y2, g ; y2 = f^g
1305  xor y0, e ; y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
1306  ror y1, (13-2) ; y1 = (a >> (13-2)) ^ (a >> (22-2))
1307  and y2, e ; y2 = (f^g)&e
1308  xor y1, a ; y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
1309  ror y0, 6 ; y0 = S1 = (e>>6) ^ (e>>11) ^ (e>>25)
1310  xor y2, g ; y2 = CH = ((f^g)&e)^g
1311  add y2, y0 ; y2 = S1 + CH
1312  ror y1, 2 ; y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
1313  add y2, [rsp + _XFER + %1 * 4] ; y2 = k + w + S1 + CH
1314  mov y0, a ; y0 = a
1315  add h, y2 ; h = h + S1 + CH + k + w
1316  mov y2, a ; y2 = a
1317  or y0, c ; y0 = a|c
1318  add d, h ; d = d + h + S1 + CH + k + w
1319  and y2, c ; y2 = a&c
1320  and y0, b ; y0 = (a|c)&b
1321  add h, y1 ; h = h + S1 + CH + k + w + S0
1322  or y0, y2 ; y0 = MAJ = ((a|c)&b)|(a&c)
1323  add h, y0 ; h = h + S1 + CH + k + w + S0 + MAJ
1324  ROTATE_ARGS
1325 %endm
1326 
1327 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
1328 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
1329 ;; void sha256_sse4(void *input_data, UINT32 digest[8], UINT64 num_blks)
1330 ;; arg 1 : pointer to input data
1331 ;; arg 2 : pointer to digest
1332 ;; arg 3 : Num blocks
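;; Note: this argument order (data, digest, blocks) differs from the GCC
;; translation above, where operand %0 is the state/digest and %1 the data;
;; the INP/CTX register defines earlier reflect this calling convention.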
1333 section .text
1334 global sha256_sse4
1335 align 32
1336 sha256_sse4:
1337  push rbx
1338 %ifndef LINUX
1339  push rsi
1340  push rdi
1341 %endif
1342  push rbp
1343  push r13
1344  push r14
1345  push r15
1346 
1347  sub rsp,STACK_SIZE
1348 %ifndef LINUX
1349  movdqa [rsp + _XMM_SAVE + 0*16],xmm6
1350  movdqa [rsp + _XMM_SAVE + 1*16],xmm7
1351  movdqa [rsp + _XMM_SAVE + 2*16],xmm8
1352  movdqa [rsp + _XMM_SAVE + 3*16],xmm9
1353  movdqa [rsp + _XMM_SAVE + 4*16],xmm10
1354  movdqa [rsp + _XMM_SAVE + 5*16],xmm11
1355  movdqa [rsp + _XMM_SAVE + 6*16],xmm12
1356 %endif
1357 
1358  shl NUM_BLKS, 6 ; convert to bytes
1359  jz done_hash
1360  add NUM_BLKS, INP ; pointer to end of data
1361  mov [rsp + _INP_END], NUM_BLKS
1362 
1363  ;; load initial digest
1364  mov a,[4*0 + CTX]
1365  mov b,[4*1 + CTX]
1366  mov c,[4*2 + CTX]
1367  mov d,[4*3 + CTX]
1368  mov e,[4*4 + CTX]
1369  mov f,[4*5 + CTX]
1370  mov g,[4*6 + CTX]
1371  mov h,[4*7 + CTX]
1372 
1373  movdqa BYTE_FLIP_MASK, [PSHUFFLE_BYTE_FLIP_MASK wrt rip]
1374  movdqa SHUF_00BA, [_SHUF_00BA wrt rip]
1375  movdqa SHUF_DC00, [_SHUF_DC00 wrt rip]
1376 
1377 loop0:
1378  lea TBL,[K256 wrt rip]
1379 
1380  ;; byte swap first 16 dwords
1381  COPY_XMM_AND_BSWAP X0, [INP + 0*16], BYTE_FLIP_MASK
1382  COPY_XMM_AND_BSWAP X1, [INP + 1*16], BYTE_FLIP_MASK
1383  COPY_XMM_AND_BSWAP X2, [INP + 2*16], BYTE_FLIP_MASK
1384  COPY_XMM_AND_BSWAP X3, [INP + 3*16], BYTE_FLIP_MASK
1385 
1386  mov [rsp + _INP], INP
1387 
1388  ;; schedule 48 input dwords, by doing 3 rounds of 16 each
1389  mov SRND, 3
1390 align 16
1391 loop1:
1392  movdqa XFER, [TBL + 0*16]
1393  paddd XFER, X0
1394  movdqa [rsp + _XFER], XFER
1395  FOUR_ROUNDS_AND_SCHED
1396 
1397  movdqa XFER, [TBL + 1*16]
1398  paddd XFER, X0
1399  movdqa [rsp + _XFER], XFER
1400  FOUR_ROUNDS_AND_SCHED
1401 
1402  movdqa XFER, [TBL + 2*16]
1403  paddd XFER, X0
1404  movdqa [rsp + _XFER], XFER
1405  FOUR_ROUNDS_AND_SCHED
1406 
1407  movdqa XFER, [TBL + 3*16]
1408  paddd XFER, X0
1409  movdqa [rsp + _XFER], XFER
1410  add TBL, 4*16
1411  FOUR_ROUNDS_AND_SCHED
1412 
1413  sub SRND, 1
1414  jne loop1
1415 
1416  mov SRND, 2
1417 loop2:
1418  paddd X0, [TBL + 0*16]
1419  movdqa [rsp + _XFER], X0
1420  DO_ROUND 0
1421  DO_ROUND 1
1422  DO_ROUND 2
1423  DO_ROUND 3
1424  paddd X1, [TBL + 1*16]
1425  movdqa [rsp + _XFER], X1
1426  add TBL, 2*16
1427  DO_ROUND 0
1428  DO_ROUND 1
1429  DO_ROUND 2
1430  DO_ROUND 3
1431 
1432  movdqa X0, X2
1433  movdqa X1, X3
1434 
1435  sub SRND, 1
1436  jne loop2
1437 
1438  addm [4*0 + CTX],a
1439  addm [4*1 + CTX],b
1440  addm [4*2 + CTX],c
1441  addm [4*3 + CTX],d
1442  addm [4*4 + CTX],e
1443  addm [4*5 + CTX],f
1444  addm [4*6 + CTX],g
1445  addm [4*7 + CTX],h
1446 
1447  mov INP, [rsp + _INP]
1448  add INP, 64
1449  cmp INP, [rsp + _INP_END]
1450  jne loop0
1451 
1452 done_hash:
1453 %ifndef LINUX
1454  movdqa xmm6,[rsp + _XMM_SAVE + 0*16]
1455  movdqa xmm7,[rsp + _XMM_SAVE + 1*16]
1456  movdqa xmm8,[rsp + _XMM_SAVE + 2*16]
1457  movdqa xmm9,[rsp + _XMM_SAVE + 3*16]
1458  movdqa xmm10,[rsp + _XMM_SAVE + 4*16]
1459  movdqa xmm11,[rsp + _XMM_SAVE + 5*16]
1460  movdqa xmm12,[rsp + _XMM_SAVE + 6*16]
1461 %endif
1462 
1463  add rsp, STACK_SIZE
1464 
1465  pop r15
1466  pop r14
1467  pop r13
1468  pop rbp
1469 %ifndef LINUX
1470  pop rdi
1471  pop rsi
1472 %endif
1473  pop rbx
1474 
1475  ret
1476 
1477 
1478 section .data
1479 align 64
1480 K256:
1481  dd 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
1482  dd 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
1483  dd 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
1484  dd 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
1485  dd 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
1486  dd 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
1487  dd 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
1488  dd 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
1489  dd 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
1490  dd 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
1491  dd 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
1492  dd 0xd192e819,0xd6990624,0xf40e3585,0x106aa070
1493  dd 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
1494  dd 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
1495  dd 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
1496  dd 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
1497 
1498 PSHUFFLE_BYTE_FLIP_MASK: ddq 0x0c0d0e0f08090a0b0405060700010203
1499 
1500 ; shuffle xBxA -> 00BA
1501 _SHUF_00BA: ddq 0xFFFFFFFFFFFFFFFF0b0a090803020100
1502 
1503 ; shuffle xDxC -> DC00
1504 _SHUF_DC00: ddq 0x0b0a090803020100FFFFFFFFFFFFFFFF
1505 */
1506 
1507 #endif