Raven Core  3.0.0
P2P Digital Currency
simd.c
Go to the documentation of this file.
1 /* $Id: simd.c 227 2010-06-16 17:28:38Z tp $ */
2 /*
3  * SIMD implementation.
4  *
5  * ==========================(LICENSE BEGIN)============================
6  *
7  * Copyright (c) 2007-2010 Projet RNRT SAPHIR
8  *
9  * Permission is hereby granted, free of charge, to any person obtaining
10  * a copy of this software and associated documentation files (the
11  * "Software"), to deal in the Software without restriction, including
12  * without limitation the rights to use, copy, modify, merge, publish,
13  * distribute, sublicense, and/or sell copies of the Software, and to
14  * permit persons to whom the Software is furnished to do so, subject to
15  * the following conditions:
16  *
17  * The above copyright notice and this permission notice shall be
18  * included in all copies or substantial portions of the Software.
19  *
20  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
21  * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
22  * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
23  * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
24  * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
25  * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
26  * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
27  *
28  * ===========================(LICENSE END)=============================
29  *
30  * @author Thomas Pornin <thomas.pornin@cryptolog.com>
31  */
32 
33 #include <stddef.h>
34 #include <string.h>
35 #include <limits.h>
36 
37 #include "sph_simd.h"
38 
39 #ifdef __cplusplus
40 extern "C"{
41 #endif
42 
43 #if SPH_SMALL_FOOTPRINT && !defined SPH_SMALL_FOOTPRINT_SIMD
44 #define SPH_SMALL_FOOTPRINT_SIMD 1
45 #endif
46 
47 #ifdef _MSC_VER
48 #pragma warning (disable: 4146)
49 #endif
50 
51 typedef sph_u32 u32;
52 typedef sph_s32 s32;
53 #define C32 SPH_C32
54 #define T32 SPH_T32
55 #define ROL32 SPH_ROTL32
56 
57 #define XCAT(x, y) XCAT_(x, y)
58 #define XCAT_(x, y) x ## y
59 
60 /*
61  * The powers of 41 modulo 257. We use exponents from 0 to 255, inclusive.
62  */
63 static const s32 alpha_tab[] = {
64  1, 41, 139, 45, 46, 87, 226, 14, 60, 147, 116, 130,
65  190, 80, 196, 69, 2, 82, 21, 90, 92, 174, 195, 28,
66  120, 37, 232, 3, 123, 160, 135, 138, 4, 164, 42, 180,
67  184, 91, 133, 56, 240, 74, 207, 6, 246, 63, 13, 19,
68  8, 71, 84, 103, 111, 182, 9, 112, 223, 148, 157, 12,
69  235, 126, 26, 38, 16, 142, 168, 206, 222, 107, 18, 224,
70  189, 39, 57, 24, 213, 252, 52, 76, 32, 27, 79, 155,
71  187, 214, 36, 191, 121, 78, 114, 48, 169, 247, 104, 152,
72  64, 54, 158, 53, 117, 171, 72, 125, 242, 156, 228, 96,
73  81, 237, 208, 47, 128, 108, 59, 106, 234, 85, 144, 250,
74  227, 55, 199, 192, 162, 217, 159, 94, 256, 216, 118, 212,
75  211, 170, 31, 243, 197, 110, 141, 127, 67, 177, 61, 188,
76  255, 175, 236, 167, 165, 83, 62, 229, 137, 220, 25, 254,
77  134, 97, 122, 119, 253, 93, 215, 77, 73, 166, 124, 201,
78  17, 183, 50, 251, 11, 194, 244, 238, 249, 186, 173, 154,
79  146, 75, 248, 145, 34, 109, 100, 245, 22, 131, 231, 219,
80  241, 115, 89, 51, 35, 150, 239, 33, 68, 218, 200, 233,
81  44, 5, 205, 181, 225, 230, 178, 102, 70, 43, 221, 66,
82  136, 179, 143, 209, 88, 10, 153, 105, 193, 203, 99, 204,
83  140, 86, 185, 132, 15, 101, 29, 161, 176, 20, 49, 210,
84  129, 149, 198, 151, 23, 172, 113, 7, 30, 202, 58, 65,
85  95, 40, 98, 163
86 };
87 
88 /*
89  * Ranges:
90  * REDS1: from -32768..98302 to -383..383
91  * REDS2: from -2^31..2^31-1 to -32768..98302
92  */
/*
 * Partial reductions modulo 257.
 *
 * REDS1 splits x into its low 8 bits and the remaining high part and
 * returns low - high; since 256 = -1 mod 257, the result is congruent
 * to x. REDS2 does the same with a 16-bit split and returns low + high;
 * since 65536 = 1 mod 257, the result is again congruent to x.
 *
 * Both macros evaluate their argument twice, so it must be free of
 * side effects. For negative x the congruence relies on ">>" being an
 * arithmetic (sign-propagating) shift, which C leaves
 * implementation-defined.
 */
93 #define REDS1(x) (((x) & 0xFF) - ((x) >> 8))
94 #define REDS2(x) (((x) & 0xFFFF) + ((x) >> 16))
95 
96 /*
97  * If, upon entry, the values of q[] are all in the -N..N range (where
98  * N >= 98302) then the new values of q[] are in the -2N..2N range.
99  *
100  * Since alpha_tab[v] <= 256, maximum allowed range is for N = 8388608.
101  */
/*
 * FFT_LOOP(rb, hk, as, id): one butterfly layer applied in place to the
 * array q[] that must be in scope at the expansion site.
 *
 * For every j in 0..hk-1 the pair (q[rb + j], q[rb + hk + j]) is read
 * as (m, n) and replaced by (m + t, m - t), where
 * t = REDS2(n * alpha_tab[j * as]).
 *
 * The j == 0 pair is handled before the loop without any multiplication
 * or reduction (alpha_tab[0] is 1). The "goto id" then enters the
 * 4-way unrolled loop body just after the slot for element u + 0, so
 * that the first pass only processes elements 1, 2 and 3.
 *
 * Constraints visible from the code:
 *  - hk must be a multiple of 4 (the loop always handles 4 elements);
 *  - id is emitted as a label, so it must be unique within the
 *    enclosing function;
 *  - rb, hk and as are evaluated many times and must have no side
 *    effects.
 */
102 #define FFT_LOOP(rb, hk, as, id) do { \
103  size_t u, v; \
104  s32 m = q[(rb)]; \
105  s32 n = q[(rb) + (hk)]; \
106  q[(rb)] = m + n; \
107  q[(rb) + (hk)] = m - n; \
108  u = v = 0; \
109  goto id; \
110  for (; u < (hk); u += 4, v += 4 * (as)) { \
111  s32 t; \
112  m = q[(rb) + u + 0]; \
113  n = q[(rb) + u + 0 + (hk)]; \
114  t = REDS2(n * alpha_tab[v + 0 * (as)]); \
115  q[(rb) + u + 0] = m + t; \
116  q[(rb) + u + 0 + (hk)] = m - t; \
117  id: \
118  m = q[(rb) + u + 1]; \
119  n = q[(rb) + u + 1 + (hk)]; \
120  t = REDS2(n * alpha_tab[v + 1 * (as)]); \
121  q[(rb) + u + 1] = m + t; \
122  q[(rb) + u + 1 + (hk)] = m - t; \
123  m = q[(rb) + u + 2]; \
124  n = q[(rb) + u + 2 + (hk)]; \
125  t = REDS2(n * alpha_tab[v + 2 * (as)]); \
126  q[(rb) + u + 2] = m + t; \
127  q[(rb) + u + 2 + (hk)] = m - t; \
128  m = q[(rb) + u + 3]; \
129  n = q[(rb) + u + 3 + (hk)]; \
130  t = REDS2(n * alpha_tab[v + 3 * (as)]); \
131  q[(rb) + u + 3] = m + t; \
132  q[(rb) + u + 3 + (hk)] = m - t; \
133  } \
134  } while (0)
135 
136 /*
137  * Output ranges:
138  * d0: min= 0 max= 1020
139  * d1: min= -67 max= 4587
140  * d2: min=-4335 max= 4335
141  * d3: min=-4147 max= 507
142  * d4: min= -510 max= 510
143  * d5: min= -252 max= 4402
144  * d6: min=-4335 max= 4335
145  * d7: min=-4332 max= 322
146  */
147 #define FFT8(xb, xs, d) do { \
148  s32 x0 = x[(xb)]; \
149  s32 x1 = x[(xb) + (xs)]; \
150  s32 x2 = x[(xb) + 2 * (xs)]; \
151  s32 x3 = x[(xb) + 3 * (xs)]; \
152  s32 a0 = x0 + x2; \
153  s32 a1 = x0 + (x2 << 4); \
154  s32 a2 = x0 - x2; \
155  s32 a3 = x0 - (x2 << 4); \
156  s32 b0 = x1 + x3; \
157  s32 b1 = REDS1((x1 << 2) + (x3 << 6)); \
158  s32 b2 = (x1 << 4) - (x3 << 4); \
159  s32 b3 = REDS1((x1 << 6) + (x3 << 2)); \
160  d ## 0 = a0 + b0; \
161  d ## 1 = a1 + b1; \
162  d ## 2 = a2 + b2; \
163  d ## 3 = a3 + b3; \
164  d ## 4 = a0 - b0; \
165  d ## 5 = a1 - b1; \
166  d ## 6 = a2 - b2; \
167  d ## 7 = a3 - b3; \
168  } while (0)
169 
170 /*
171  * When k=16, we have alpha=2. Multiplication by alpha^i is then reduced
172  * to some shifting.
173  *
174  * Output: within -591471..591723
175  */
/*
 * FFT16(xb, xs, rb): combines two FFT8 results into 16 outputs.
 *
 * The first FFT8 reads x[] starting at xb, the second starting at
 * xb + xs, both with a stride of 2 * xs. Outputs are stored in
 * q[rb + 0] .. q[rb + 15]: output k (k = 0..7) is d1_k + 2^k * d2_k and
 * output k + 8 is d1_k - 2^k * d2_k.
 *
 * The scaling by 2^k is written as a multiplication rather than a left
 * shift: the d2_k values may be negative, and left-shifting a negative
 * signed value is undefined behaviour in C. The products are
 * numerically identical to the shifted values.
 *
 * Both x[] and q[] must be in scope at the expansion site; xb, xs and
 * rb are evaluated several times and must have no side effects.
 */
#define FFT16(xb, xs, rb)   do { \
		s32 d1_0, d1_1, d1_2, d1_3, d1_4, d1_5, d1_6, d1_7; \
		s32 d2_0, d2_1, d2_2, d2_3, d2_4, d2_5, d2_6, d2_7; \
		FFT8(xb, (xs) << 1, d1_); \
		FFT8((xb) + (xs), (xs) << 1, d2_); \
		q[(rb) +  0] = d1_0 + d2_0; \
		q[(rb) +  1] = d1_1 + (d2_1 * 2); \
		q[(rb) +  2] = d1_2 + (d2_2 * 4); \
		q[(rb) +  3] = d1_3 + (d2_3 * 8); \
		q[(rb) +  4] = d1_4 + (d2_4 * 16); \
		q[(rb) +  5] = d1_5 + (d2_5 * 32); \
		q[(rb) +  6] = d1_6 + (d2_6 * 64); \
		q[(rb) +  7] = d1_7 + (d2_7 * 128); \
		q[(rb) +  8] = d1_0 - d2_0; \
		q[(rb) +  9] = d1_1 - (d2_1 * 2); \
		q[(rb) + 10] = d1_2 - (d2_2 * 4); \
		q[(rb) + 11] = d1_3 - (d2_3 * 8); \
		q[(rb) + 12] = d1_4 - (d2_4 * 16); \
		q[(rb) + 13] = d1_5 - (d2_5 * 32); \
		q[(rb) + 14] = d1_6 - (d2_6 * 64); \
		q[(rb) + 15] = d1_7 - (d2_7 * 128); \
	} while (0)
198 
199 /*
200  * Output range: |q| <= 1183446
201  */
202 #define FFT32(xb, xs, rb, id) do { \
203  FFT16(xb, (xs) << 1, rb); \
204  FFT16((xb) + (xs), (xs) << 1, (rb) + 16); \
205  FFT_LOOP(rb, 16, 8, id); \
206  } while (0)
207 
208 /*
209  * Output range: |q| <= 2366892
210  */
211 #define FFT64(xb, xs, rb, id) do { \
212  FFT32(xb, (xs) << 1, rb, XCAT(id, a)); \
213  FFT32((xb) + (xs), (xs) << 1, (rb) + 32, XCAT(id, b)); \
214  FFT_LOOP(rb, 32, 4, id); \
215  } while (0)
216 
217 #if SPH_SMALL_FOOTPRINT_SIMD
218 
219 static void
220 fft32(unsigned char *x, size_t xs, s32 *q)
221 {
222  size_t xd;
223 
224  xd = xs << 1;
225  FFT16(0, xd, 0);
226  FFT16(xs, xd, 16);
227  FFT_LOOP(0, 16, 8, label_);
228 }
229 
230 #define FFT128(xb, xs, rb, id) do { \
231  fft32(x + (xb) + ((xs) * 0), (xs) << 2, &q[(rb) + 0]); \
232  fft32(x + (xb) + ((xs) * 2), (xs) << 2, &q[(rb) + 32]); \
233  FFT_LOOP(rb, 32, 4, XCAT(id, aa)); \
234  fft32(x + (xb) + ((xs) * 1), (xs) << 2, &q[(rb) + 64]); \
235  fft32(x + (xb) + ((xs) * 3), (xs) << 2, &q[(rb) + 96]); \
236  FFT_LOOP((rb) + 64, 32, 4, XCAT(id, ab)); \
237  FFT_LOOP(rb, 64, 2, XCAT(id, a)); \
238  } while (0)
239 
240 #else
241 
242 /*
243  * Output range: |q| <= 4733784
244  */
245 #define FFT128(xb, xs, rb, id) do { \
246  FFT64(xb, (xs) << 1, rb, XCAT(id, a)); \
247  FFT64((xb) + (xs), (xs) << 1, (rb) + 64, XCAT(id, b)); \
248  FFT_LOOP(rb, 64, 2, id); \
249  } while (0)
250 
251 #endif
252 
253 /*
254  * For SIMD-384 / SIMD-512, the fully unrolled FFT yields a compression
255  * function which does not fit in the 32 kB L1 cache of a typical Intel
256  * x86 processor. We therefore add a function call layer at the FFT64 level.
257  */
258 
259 static void
260 fft64(unsigned char *x, size_t xs, s32 *q)
261 {
262  size_t xd;
263 
264  xd = xs << 1;
265  FFT32(0, xd, 0, label_a);
266  FFT32(xs, xd, 32, label_b);
267  FFT_LOOP(0, 32, 4, label_);
268 }
269 
270 /*
271  * Output range: |q| <= 9467568
272  */
273 #define FFT256(xb, xs, rb, id) do { \
274  fft64(x + (xb) + ((xs) * 0), (xs) << 2, &q[(rb) + 0]); \
275  fft64(x + (xb) + ((xs) * 2), (xs) << 2, &q[(rb) + 64]); \
276  FFT_LOOP(rb, 64, 2, XCAT(id, aa)); \
277  fft64(x + (xb) + ((xs) * 1), (xs) << 2, &q[(rb) + 128]); \
278  fft64(x + (xb) + ((xs) * 3), (xs) << 2, &q[(rb) + 192]); \
279  FFT_LOOP((rb) + 128, 64, 2, XCAT(id, ab)); \
280  FFT_LOOP(rb, 128, 1, XCAT(id, a)); \
281  } while (0)
282 
283 /*
284  * alpha^(127*i) mod 257
285  */
286 static const unsigned short yoff_s_n[] = {
287  1, 98, 95, 58, 30, 113, 23, 198, 129, 49, 176, 29,
288  15, 185, 140, 99, 193, 153, 88, 143, 136, 221, 70, 178,
289  225, 205, 44, 200, 68, 239, 35, 89, 241, 231, 22, 100,
290  34, 248, 146, 173, 249, 244, 11, 50, 17, 124, 73, 215,
291  253, 122, 134, 25, 137, 62, 165, 236, 255, 61, 67, 141,
292  197, 31, 211, 118, 256, 159, 162, 199, 227, 144, 234, 59,
293  128, 208, 81, 228, 242, 72, 117, 158, 64, 104, 169, 114,
294  121, 36, 187, 79, 32, 52, 213, 57, 189, 18, 222, 168,
295  16, 26, 235, 157, 223, 9, 111, 84, 8, 13, 246, 207,
296  240, 133, 184, 42, 4, 135, 123, 232, 120, 195, 92, 21,
297  2, 196, 190, 116, 60, 226, 46, 139
298 };
299 
300 /*
301  * alpha^(127*i) + alpha^(125*i) mod 257
302  */
303 static const unsigned short yoff_s_f[] = {
304  2, 156, 118, 107, 45, 212, 111, 162, 97, 249, 211, 3,
305  49, 101, 151, 223, 189, 178, 253, 204, 76, 82, 232, 65,
306  96, 176, 161, 47, 189, 61, 248, 107, 0, 131, 133, 113,
307  17, 33, 12, 111, 251, 103, 57, 148, 47, 65, 249, 143,
308  189, 8, 204, 230, 205, 151, 187, 227, 247, 111, 140, 6,
309  77, 10, 21, 149, 255, 101, 139, 150, 212, 45, 146, 95,
310  160, 8, 46, 254, 208, 156, 106, 34, 68, 79, 4, 53,
311  181, 175, 25, 192, 161, 81, 96, 210, 68, 196, 9, 150,
312  0, 126, 124, 144, 240, 224, 245, 146, 6, 154, 200, 109,
313  210, 192, 8, 114, 68, 249, 53, 27, 52, 106, 70, 30,
314  10, 146, 117, 251, 180, 247, 236, 108
315 };
316 
317 /*
318  * beta^(255*i) mod 257
319  */
320 static const unsigned short yoff_b_n[] = {
321  1, 163, 98, 40, 95, 65, 58, 202, 30, 7, 113, 172,
322  23, 151, 198, 149, 129, 210, 49, 20, 176, 161, 29, 101,
323  15, 132, 185, 86, 140, 204, 99, 203, 193, 105, 153, 10,
324  88, 209, 143, 179, 136, 66, 221, 43, 70, 102, 178, 230,
325  225, 181, 205, 5, 44, 233, 200, 218, 68, 33, 239, 150,
326  35, 51, 89, 115, 241, 219, 231, 131, 22, 245, 100, 109,
327  34, 145, 248, 75, 146, 154, 173, 186, 249, 238, 244, 194,
328  11, 251, 50, 183, 17, 201, 124, 166, 73, 77, 215, 93,
329  253, 119, 122, 97, 134, 254, 25, 220, 137, 229, 62, 83,
330  165, 167, 236, 175, 255, 188, 61, 177, 67, 127, 141, 110,
331  197, 243, 31, 170, 211, 212, 118, 216, 256, 94, 159, 217,
332  162, 192, 199, 55, 227, 250, 144, 85, 234, 106, 59, 108,
333  128, 47, 208, 237, 81, 96, 228, 156, 242, 125, 72, 171,
334  117, 53, 158, 54, 64, 152, 104, 247, 169, 48, 114, 78,
335  121, 191, 36, 214, 187, 155, 79, 27, 32, 76, 52, 252,
336  213, 24, 57, 39, 189, 224, 18, 107, 222, 206, 168, 142,
337  16, 38, 26, 126, 235, 12, 157, 148, 223, 112, 9, 182,
338  111, 103, 84, 71, 8, 19, 13, 63, 246, 6, 207, 74,
339  240, 56, 133, 91, 184, 180, 42, 164, 4, 138, 135, 160,
340  123, 3, 232, 37, 120, 28, 195, 174, 92, 90, 21, 82,
341  2, 69, 196, 80, 190, 130, 116, 147, 60, 14, 226, 87,
342  46, 45, 139, 41
343 };
344 
345 /*
346  * beta^(255*i) + beta^(253*i) mod 257
347  */
348 static const unsigned short yoff_b_f[] = {
349  2, 203, 156, 47, 118, 214, 107, 106, 45, 93, 212, 20,
350  111, 73, 162, 251, 97, 215, 249, 53, 211, 19, 3, 89,
351  49, 207, 101, 67, 151, 130, 223, 23, 189, 202, 178, 239,
352  253, 127, 204, 49, 76, 236, 82, 137, 232, 157, 65, 79,
353  96, 161, 176, 130, 161, 30, 47, 9, 189, 247, 61, 226,
354  248, 90, 107, 64, 0, 88, 131, 243, 133, 59, 113, 115,
355  17, 236, 33, 213, 12, 191, 111, 19, 251, 61, 103, 208,
356  57, 35, 148, 248, 47, 116, 65, 119, 249, 178, 143, 40,
357  189, 129, 8, 163, 204, 227, 230, 196, 205, 122, 151, 45,
358  187, 19, 227, 72, 247, 125, 111, 121, 140, 220, 6, 107,
359  77, 69, 10, 101, 21, 65, 149, 171, 255, 54, 101, 210,
360  139, 43, 150, 151, 212, 164, 45, 237, 146, 184, 95, 6,
361  160, 42, 8, 204, 46, 238, 254, 168, 208, 50, 156, 190,
362  106, 127, 34, 234, 68, 55, 79, 18, 4, 130, 53, 208,
363  181, 21, 175, 120, 25, 100, 192, 178, 161, 96, 81, 127,
364  96, 227, 210, 248, 68, 10, 196, 31, 9, 167, 150, 193,
365  0, 169, 126, 14, 124, 198, 144, 142, 240, 21, 224, 44,
366  245, 66, 146, 238, 6, 196, 154, 49, 200, 222, 109, 9,
367  210, 141, 192, 138, 8, 79, 114, 217, 68, 128, 249, 94,
368  53, 30, 27, 61, 52, 135, 106, 212, 70, 238, 30, 185,
369  10, 132, 146, 136, 117, 37, 251, 150, 180, 188, 247, 156,
370  236, 192, 108, 86
371 };
372 
/*
 * INNER(l, h, mm): multiplies l and h by mm and packs both products
 * into a single word: the low 16 bits of l * mm occupy bits 0..15, and
 * h * mm (converted to u32) is shifted left by 16 and added on top.
 * mm is evaluated twice and must have no side effects.
 */
373 #define INNER(l, h, mm) (((u32)((l) * (mm)) & 0xFFFFU) \
374  + ((u32)((h) * (mm)) << 16))
375 
376 #define W_SMALL(sb, o1, o2, mm) \
377  (INNER(q[8 * (sb) + 2 * 0 + o1], q[8 * (sb) + 2 * 0 + o2], mm), \
378  INNER(q[8 * (sb) + 2 * 1 + o1], q[8 * (sb) + 2 * 1 + o2], mm), \
379  INNER(q[8 * (sb) + 2 * 2 + o1], q[8 * (sb) + 2 * 2 + o2], mm), \
380  INNER(q[8 * (sb) + 2 * 3 + o1], q[8 * (sb) + 2 * 3 + o2], mm)
381 
382 #define WS_0_0 W_SMALL( 4, 0, 1, 185)
383 #define WS_0_1 W_SMALL( 6, 0, 1, 185)
384 #define WS_0_2 W_SMALL( 0, 0, 1, 185)
385 #define WS_0_3 W_SMALL( 2, 0, 1, 185)
386 #define WS_0_4 W_SMALL( 7, 0, 1, 185)
387 #define WS_0_5 W_SMALL( 5, 0, 1, 185)
388 #define WS_0_6 W_SMALL( 3, 0, 1, 185)
389 #define WS_0_7 W_SMALL( 1, 0, 1, 185)
390 #define WS_1_0 W_SMALL(15, 0, 1, 185)
391 #define WS_1_1 W_SMALL(11, 0, 1, 185)
392 #define WS_1_2 W_SMALL(12, 0, 1, 185)
393 #define WS_1_3 W_SMALL( 8, 0, 1, 185)
394 #define WS_1_4 W_SMALL( 9, 0, 1, 185)
395 #define WS_1_5 W_SMALL(13, 0, 1, 185)
396 #define WS_1_6 W_SMALL(10, 0, 1, 185)
397 #define WS_1_7 W_SMALL(14, 0, 1, 185)
398 #define WS_2_0 W_SMALL(17, -128, -64, 233)
399 #define WS_2_1 W_SMALL(18, -128, -64, 233)
400 #define WS_2_2 W_SMALL(23, -128, -64, 233)
401 #define WS_2_3 W_SMALL(20, -128, -64, 233)
402 #define WS_2_4 W_SMALL(22, -128, -64, 233)
403 #define WS_2_5 W_SMALL(21, -128, -64, 233)
404 #define WS_2_6 W_SMALL(16, -128, -64, 233)
405 #define WS_2_7 W_SMALL(19, -128, -64, 233)
406 #define WS_3_0 W_SMALL(30, -191, -127, 233)
407 #define WS_3_1 W_SMALL(24, -191, -127, 233)
408 #define WS_3_2 W_SMALL(25, -191, -127, 233)
409 #define WS_3_3 W_SMALL(31, -191, -127, 233)
410 #define WS_3_4 W_SMALL(27, -191, -127, 233)
411 #define WS_3_5 W_SMALL(29, -191, -127, 233)
412 #define WS_3_6 W_SMALL(28, -191, -127, 233)
413 #define WS_3_7 W_SMALL(26, -191, -127, 233)
414 
415 #define W_BIG(sb, o1, o2, mm) \
416  (INNER(q[16 * (sb) + 2 * 0 + o1], q[16 * (sb) + 2 * 0 + o2], mm), \
417  INNER(q[16 * (sb) + 2 * 1 + o1], q[16 * (sb) + 2 * 1 + o2], mm), \
418  INNER(q[16 * (sb) + 2 * 2 + o1], q[16 * (sb) + 2 * 2 + o2], mm), \
419  INNER(q[16 * (sb) + 2 * 3 + o1], q[16 * (sb) + 2 * 3 + o2], mm), \
420  INNER(q[16 * (sb) + 2 * 4 + o1], q[16 * (sb) + 2 * 4 + o2], mm), \
421  INNER(q[16 * (sb) + 2 * 5 + o1], q[16 * (sb) + 2 * 5 + o2], mm), \
422  INNER(q[16 * (sb) + 2 * 6 + o1], q[16 * (sb) + 2 * 6 + o2], mm), \
423  INNER(q[16 * (sb) + 2 * 7 + o1], q[16 * (sb) + 2 * 7 + o2], mm)
424 
425 #define WB_0_0 W_BIG( 4, 0, 1, 185)
426 #define WB_0_1 W_BIG( 6, 0, 1, 185)
427 #define WB_0_2 W_BIG( 0, 0, 1, 185)
428 #define WB_0_3 W_BIG( 2, 0, 1, 185)
429 #define WB_0_4 W_BIG( 7, 0, 1, 185)
430 #define WB_0_5 W_BIG( 5, 0, 1, 185)
431 #define WB_0_6 W_BIG( 3, 0, 1, 185)
432 #define WB_0_7 W_BIG( 1, 0, 1, 185)
433 #define WB_1_0 W_BIG(15, 0, 1, 185)
434 #define WB_1_1 W_BIG(11, 0, 1, 185)
435 #define WB_1_2 W_BIG(12, 0, 1, 185)
436 #define WB_1_3 W_BIG( 8, 0, 1, 185)
437 #define WB_1_4 W_BIG( 9, 0, 1, 185)
438 #define WB_1_5 W_BIG(13, 0, 1, 185)
439 #define WB_1_6 W_BIG(10, 0, 1, 185)
440 #define WB_1_7 W_BIG(14, 0, 1, 185)
441 #define WB_2_0 W_BIG(17, -256, -128, 233)
442 #define WB_2_1 W_BIG(18, -256, -128, 233)
443 #define WB_2_2 W_BIG(23, -256, -128, 233)
444 #define WB_2_3 W_BIG(20, -256, -128, 233)
445 #define WB_2_4 W_BIG(22, -256, -128, 233)
446 #define WB_2_5 W_BIG(21, -256, -128, 233)
447 #define WB_2_6 W_BIG(16, -256, -128, 233)
448 #define WB_2_7 W_BIG(19, -256, -128, 233)
449 #define WB_3_0 W_BIG(30, -383, -255, 233)
450 #define WB_3_1 W_BIG(24, -383, -255, 233)
451 #define WB_3_2 W_BIG(25, -383, -255, 233)
452 #define WB_3_3 W_BIG(31, -383, -255, 233)
453 #define WB_3_4 W_BIG(27, -383, -255, 233)
454 #define WB_3_5 W_BIG(29, -383, -255, 233)
455 #define WB_3_6 W_BIG(28, -383, -255, 233)
456 #define WB_3_7 W_BIG(26, -383, -255, 233)
457 
/*
 * Bitwise Boolean functions used by the step macros.
 *
 * IF(x, y, z): bitwise select; each result bit is taken from y where
 * the corresponding bit of x is 1, and from z where it is 0.
 * MAJ(x, y, z): bitwise majority; each result bit is 1 when at least
 * two of the corresponding bits of x, y and z are 1.
 *
 * Arguments are evaluated more than once (z in IF; x and y in MAJ).
 */
458 #define IF(x, y, z) ((((y) ^ (z)) & (x)) ^ (z))
459 #define MAJ(x, y, z) (((x) & (y)) | (((x) | (y)) & (z)))
460 
461 #define PP4_0_0 1
462 #define PP4_0_1 0
463 #define PP4_0_2 3
464 #define PP4_0_3 2
465 #define PP4_1_0 2
466 #define PP4_1_1 3
467 #define PP4_1_2 0
468 #define PP4_1_3 1
469 #define PP4_2_0 3
470 #define PP4_2_1 2
471 #define PP4_2_2 1
472 #define PP4_2_3 0
473 
474 #define PP8_0_0 1
475 #define PP8_0_1 0
476 #define PP8_0_2 3
477 #define PP8_0_3 2
478 #define PP8_0_4 5
479 #define PP8_0_5 4
480 #define PP8_0_6 7
481 #define PP8_0_7 6
482 
483 #define PP8_1_0 6
484 #define PP8_1_1 7
485 #define PP8_1_2 4
486 #define PP8_1_3 5
487 #define PP8_1_4 2
488 #define PP8_1_5 3
489 #define PP8_1_6 0
490 #define PP8_1_7 1
491 
492 #define PP8_2_0 2
493 #define PP8_2_1 3
494 #define PP8_2_2 0
495 #define PP8_2_3 1
496 #define PP8_2_4 6
497 #define PP8_2_5 7
498 #define PP8_2_6 4
499 #define PP8_2_7 5
500 
501 #define PP8_3_0 3
502 #define PP8_3_1 2
503 #define PP8_3_2 1
504 #define PP8_3_3 0
505 #define PP8_3_4 7
506 #define PP8_3_5 6
507 #define PP8_3_6 5
508 #define PP8_3_7 4
509 
510 #define PP8_4_0 5
511 #define PP8_4_1 4
512 #define PP8_4_2 7
513 #define PP8_4_3 6
514 #define PP8_4_4 1
515 #define PP8_4_5 0
516 #define PP8_4_6 3
517 #define PP8_4_7 2
518 
519 #define PP8_5_0 7
520 #define PP8_5_1 6
521 #define PP8_5_2 5
522 #define PP8_5_3 4
523 #define PP8_5_4 3
524 #define PP8_5_5 2
525 #define PP8_5_6 1
526 #define PP8_5_7 0
527 
528 #define PP8_6_0 4
529 #define PP8_6_1 5
530 #define PP8_6_2 6
531 #define PP8_6_3 7
532 #define PP8_6_4 0
533 #define PP8_6_5 1
534 #define PP8_6_6 2
535 #define PP8_6_7 3
536 
537 #if SPH_SIMD_NOCOPY
538 
539 #define DECL_STATE_SMALL
540 #define READ_STATE_SMALL(sc)
541 #define WRITE_STATE_SMALL(sc)
542 #define DECL_STATE_BIG
543 #define READ_STATE_BIG(sc)
544 #define WRITE_STATE_BIG(sc)
545 
546 #else
547 
548 #define DECL_STATE_SMALL \
549  u32 A0, A1, A2, A3, B0, B1, B2, B3, C0, C1, C2, C3, D0, D1, D2, D3;
550 
551 #define READ_STATE_SMALL(sc) do { \
552  A0 = (sc)->state[ 0]; \
553  A1 = (sc)->state[ 1]; \
554  A2 = (sc)->state[ 2]; \
555  A3 = (sc)->state[ 3]; \
556  B0 = (sc)->state[ 4]; \
557  B1 = (sc)->state[ 5]; \
558  B2 = (sc)->state[ 6]; \
559  B3 = (sc)->state[ 7]; \
560  C0 = (sc)->state[ 8]; \
561  C1 = (sc)->state[ 9]; \
562  C2 = (sc)->state[10]; \
563  C3 = (sc)->state[11]; \
564  D0 = (sc)->state[12]; \
565  D1 = (sc)->state[13]; \
566  D2 = (sc)->state[14]; \
567  D3 = (sc)->state[15]; \
568  } while (0)
569 
570 #define WRITE_STATE_SMALL(sc) do { \
571  (sc)->state[ 0] = A0; \
572  (sc)->state[ 1] = A1; \
573  (sc)->state[ 2] = A2; \
574  (sc)->state[ 3] = A3; \
575  (sc)->state[ 4] = B0; \
576  (sc)->state[ 5] = B1; \
577  (sc)->state[ 6] = B2; \
578  (sc)->state[ 7] = B3; \
579  (sc)->state[ 8] = C0; \
580  (sc)->state[ 9] = C1; \
581  (sc)->state[10] = C2; \
582  (sc)->state[11] = C3; \
583  (sc)->state[12] = D0; \
584  (sc)->state[13] = D1; \
585  (sc)->state[14] = D2; \
586  (sc)->state[15] = D3; \
587  } while (0)
588 
589 #define DECL_STATE_BIG \
590  u32 A0, A1, A2, A3, A4, A5, A6, A7; \
591  u32 B0, B1, B2, B3, B4, B5, B6, B7; \
592  u32 C0, C1, C2, C3, C4, C5, C6, C7; \
593  u32 D0, D1, D2, D3, D4, D5, D6, D7;
594 
595 #define READ_STATE_BIG(sc) do { \
596  A0 = (sc)->state[ 0]; \
597  A1 = (sc)->state[ 1]; \
598  A2 = (sc)->state[ 2]; \
599  A3 = (sc)->state[ 3]; \
600  A4 = (sc)->state[ 4]; \
601  A5 = (sc)->state[ 5]; \
602  A6 = (sc)->state[ 6]; \
603  A7 = (sc)->state[ 7]; \
604  B0 = (sc)->state[ 8]; \
605  B1 = (sc)->state[ 9]; \
606  B2 = (sc)->state[10]; \
607  B3 = (sc)->state[11]; \
608  B4 = (sc)->state[12]; \
609  B5 = (sc)->state[13]; \
610  B6 = (sc)->state[14]; \
611  B7 = (sc)->state[15]; \
612  C0 = (sc)->state[16]; \
613  C1 = (sc)->state[17]; \
614  C2 = (sc)->state[18]; \
615  C3 = (sc)->state[19]; \
616  C4 = (sc)->state[20]; \
617  C5 = (sc)->state[21]; \
618  C6 = (sc)->state[22]; \
619  C7 = (sc)->state[23]; \
620  D0 = (sc)->state[24]; \
621  D1 = (sc)->state[25]; \
622  D2 = (sc)->state[26]; \
623  D3 = (sc)->state[27]; \
624  D4 = (sc)->state[28]; \
625  D5 = (sc)->state[29]; \
626  D6 = (sc)->state[30]; \
627  D7 = (sc)->state[31]; \
628  } while (0)
629 
630 #define WRITE_STATE_BIG(sc) do { \
631  (sc)->state[ 0] = A0; \
632  (sc)->state[ 1] = A1; \
633  (sc)->state[ 2] = A2; \
634  (sc)->state[ 3] = A3; \
635  (sc)->state[ 4] = A4; \
636  (sc)->state[ 5] = A5; \
637  (sc)->state[ 6] = A6; \
638  (sc)->state[ 7] = A7; \
639  (sc)->state[ 8] = B0; \
640  (sc)->state[ 9] = B1; \
641  (sc)->state[10] = B2; \
642  (sc)->state[11] = B3; \
643  (sc)->state[12] = B4; \
644  (sc)->state[13] = B5; \
645  (sc)->state[14] = B6; \
646  (sc)->state[15] = B7; \
647  (sc)->state[16] = C0; \
648  (sc)->state[17] = C1; \
649  (sc)->state[18] = C2; \
650  (sc)->state[19] = C3; \
651  (sc)->state[20] = C4; \
652  (sc)->state[21] = C5; \
653  (sc)->state[22] = C6; \
654  (sc)->state[23] = C7; \
655  (sc)->state[24] = D0; \
656  (sc)->state[25] = D1; \
657  (sc)->state[26] = D2; \
658  (sc)->state[27] = D3; \
659  (sc)->state[28] = D4; \
660  (sc)->state[29] = D5; \
661  (sc)->state[30] = D6; \
662  (sc)->state[31] = D7; \
663  } while (0)
664 
665 #endif
666 
/*
 * STEP_ELT(n, w, fun, s, ppb): updates column n of the state, i.e. the
 * variables A<n>, B<n>, C<n> and D<n>.
 *
 *   tt   = D<n> + w + fun(A<n>, B<n>, C<n>)
 *   A<n> = ROL32(tt, s) + tA<p>
 *   D<n> = C<n>;  C<n> = B<n>;  B<n> = tA<n>
 *
 * The index p is obtained by token pasting: ppb and n are glued into a
 * macro name (for instance PP4_2_ and 0 give PP4_2_0) whose expansion
 * is then glued onto "tA". The tA<k> variables are not declared here;
 * the enclosing STEP_SMALL / STEP_BIG declares them as the rotated
 * copies of the A<k> words before invoking this macro.
 */
667 #define STEP_ELT(n, w, fun, s, ppb) do { \
668  u32 tt = T32(D ## n + (w) + fun(A ## n, B ## n, C ## n)); \
669  A ## n = T32(ROL32(tt, s) + XCAT(tA, XCAT(ppb, n))); \
670  D ## n = C ## n; \
671  C ## n = B ## n; \
672  B ## n = tA ## n; \
673  } while (0)
674 
675 #define STEP_SMALL(w0, w1, w2, w3, fun, r, s, pp4b) do { \
676  u32 tA0 = ROL32(A0, r); \
677  u32 tA1 = ROL32(A1, r); \
678  u32 tA2 = ROL32(A2, r); \
679  u32 tA3 = ROL32(A3, r); \
680  STEP_ELT(0, w0, fun, s, pp4b); \
681  STEP_ELT(1, w1, fun, s, pp4b); \
682  STEP_ELT(2, w2, fun, s, pp4b); \
683  STEP_ELT(3, w3, fun, s, pp4b); \
684  } while (0)
685 
686 #define STEP_BIG(w0, w1, w2, w3, w4, w5, w6, w7, fun, r, s, pp8b) do { \
687  u32 tA0 = ROL32(A0, r); \
688  u32 tA1 = ROL32(A1, r); \
689  u32 tA2 = ROL32(A2, r); \
690  u32 tA3 = ROL32(A3, r); \
691  u32 tA4 = ROL32(A4, r); \
692  u32 tA5 = ROL32(A5, r); \
693  u32 tA6 = ROL32(A6, r); \
694  u32 tA7 = ROL32(A7, r); \
695  STEP_ELT(0, w0, fun, s, pp8b); \
696  STEP_ELT(1, w1, fun, s, pp8b); \
697  STEP_ELT(2, w2, fun, s, pp8b); \
698  STEP_ELT(3, w3, fun, s, pp8b); \
699  STEP_ELT(4, w4, fun, s, pp8b); \
700  STEP_ELT(5, w5, fun, s, pp8b); \
701  STEP_ELT(6, w6, fun, s, pp8b); \
702  STEP_ELT(7, w7, fun, s, pp8b); \
703  } while (0)
704 
705 #define M3_0_0 0_
706 #define M3_1_0 1_
707 #define M3_2_0 2_
708 #define M3_3_0 0_
709 #define M3_4_0 1_
710 #define M3_5_0 2_
711 #define M3_6_0 0_
712 #define M3_7_0 1_
713 
714 #define M3_0_1 1_
715 #define M3_1_1 2_
716 #define M3_2_1 0_
717 #define M3_3_1 1_
718 #define M3_4_1 2_
719 #define M3_5_1 0_
720 #define M3_6_1 1_
721 #define M3_7_1 2_
722 
723 #define M3_0_2 2_
724 #define M3_1_2 0_
725 #define M3_2_2 1_
726 #define M3_3_2 2_
727 #define M3_4_2 0_
728 #define M3_5_2 1_
729 #define M3_6_2 2_
730 #define M3_7_2 0_
731 
732 #define STEP_SMALL_(w, fun, r, s, pp4b) STEP_SMALL w, fun, r, s, pp4b)
733 
734 #define ONE_ROUND_SMALL(ri, isp, p0, p1, p2, p3) do { \
735  STEP_SMALL_(WS_ ## ri ## 0, \
736  IF, p0, p1, XCAT(PP4_, M3_0_ ## isp)); \
737  STEP_SMALL_(WS_ ## ri ## 1, \
738  IF, p1, p2, XCAT(PP4_, M3_1_ ## isp)); \
739  STEP_SMALL_(WS_ ## ri ## 2, \
740  IF, p2, p3, XCAT(PP4_, M3_2_ ## isp)); \
741  STEP_SMALL_(WS_ ## ri ## 3, \
742  IF, p3, p0, XCAT(PP4_, M3_3_ ## isp)); \
743  STEP_SMALL_(WS_ ## ri ## 4, \
744  MAJ, p0, p1, XCAT(PP4_, M3_4_ ## isp)); \
745  STEP_SMALL_(WS_ ## ri ## 5, \
746  MAJ, p1, p2, XCAT(PP4_, M3_5_ ## isp)); \
747  STEP_SMALL_(WS_ ## ri ## 6, \
748  MAJ, p2, p3, XCAT(PP4_, M3_6_ ## isp)); \
749  STEP_SMALL_(WS_ ## ri ## 7, \
750  MAJ, p3, p0, XCAT(PP4_, M3_7_ ## isp)); \
751  } while (0)
752 
753 #define M7_0_0 0_
754 #define M7_1_0 1_
755 #define M7_2_0 2_
756 #define M7_3_0 3_
757 #define M7_4_0 4_
758 #define M7_5_0 5_
759 #define M7_6_0 6_
760 #define M7_7_0 0_
761 
762 #define M7_0_1 1_
763 #define M7_1_1 2_
764 #define M7_2_1 3_
765 #define M7_3_1 4_
766 #define M7_4_1 5_
767 #define M7_5_1 6_
768 #define M7_6_1 0_
769 #define M7_7_1 1_
770 
771 #define M7_0_2 2_
772 #define M7_1_2 3_
773 #define M7_2_2 4_
774 #define M7_3_2 5_
775 #define M7_4_2 6_
776 #define M7_5_2 0_
777 #define M7_6_2 1_
778 #define M7_7_2 2_
779 
780 #define M7_0_3 3_
781 #define M7_1_3 4_
782 #define M7_2_3 5_
783 #define M7_3_3 6_
784 #define M7_4_3 0_
785 #define M7_5_3 1_
786 #define M7_6_3 2_
787 #define M7_7_3 3_
788 
789 #define STEP_BIG_(w, fun, r, s, pp8b) STEP_BIG w, fun, r, s, pp8b)
790 
791 #define ONE_ROUND_BIG(ri, isp, p0, p1, p2, p3) do { \
792  STEP_BIG_(WB_ ## ri ## 0, \
793  IF, p0, p1, XCAT(PP8_, M7_0_ ## isp)); \
794  STEP_BIG_(WB_ ## ri ## 1, \
795  IF, p1, p2, XCAT(PP8_, M7_1_ ## isp)); \
796  STEP_BIG_(WB_ ## ri ## 2, \
797  IF, p2, p3, XCAT(PP8_, M7_2_ ## isp)); \
798  STEP_BIG_(WB_ ## ri ## 3, \
799  IF, p3, p0, XCAT(PP8_, M7_3_ ## isp)); \
800  STEP_BIG_(WB_ ## ri ## 4, \
801  MAJ, p0, p1, XCAT(PP8_, M7_4_ ## isp)); \
802  STEP_BIG_(WB_ ## ri ## 5, \
803  MAJ, p1, p2, XCAT(PP8_, M7_5_ ## isp)); \
804  STEP_BIG_(WB_ ## ri ## 6, \
805  MAJ, p2, p3, XCAT(PP8_, M7_6_ ## isp)); \
806  STEP_BIG_(WB_ ## ri ## 7, \
807  MAJ, p3, p0, XCAT(PP8_, M7_7_ ## isp)); \
808  } while (0)
809 
810 #if SPH_SMALL_FOOTPRINT_SIMD
811 
812 #define A0 state[ 0]
813 #define A1 state[ 1]
814 #define A2 state[ 2]
815 #define A3 state[ 3]
816 #define B0 state[ 4]
817 #define B1 state[ 5]
818 #define B2 state[ 6]
819 #define B3 state[ 7]
820 #define C0 state[ 8]
821 #define C1 state[ 9]
822 #define C2 state[10]
823 #define C3 state[11]
824 #define D0 state[12]
825 #define D1 state[13]
826 #define D2 state[14]
827 #define D3 state[15]
828 
829 #define STEP2_ELT(n, w, fun, s, ppb) do { \
830  u32 tt = T32(D ## n + (w) + fun(A ## n, B ## n, C ## n)); \
831  A ## n = T32(ROL32(tt, s) + tA[(ppb) ^ n]); \
832  D ## n = C ## n; \
833  C ## n = B ## n; \
834  B ## n = tA[n]; \
835  } while (0)
836 
837 #define STEP2_SMALL(w0, w1, w2, w3, fun, r, s, pp4b) do { \
838  u32 tA[4]; \
839  tA[0] = ROL32(A0, r); \
840  tA[1] = ROL32(A1, r); \
841  tA[2] = ROL32(A2, r); \
842  tA[3] = ROL32(A3, r); \
843  STEP2_ELT(0, w0, fun, s, pp4b); \
844  STEP2_ELT(1, w1, fun, s, pp4b); \
845  STEP2_ELT(2, w2, fun, s, pp4b); \
846  STEP2_ELT(3, w3, fun, s, pp4b); \
847  } while (0)
848 
849 static void
850 one_round_small(u32 *state, u32 *w, int isp, int p0, int p1, int p2, int p3)
851 {
852  static const int pp4k[] = { 1, 2, 3, 1, 2, 3, 1, 2, 3, 1, 2 };
853 
854  STEP2_SMALL(w[ 0], w[ 1], w[ 2], w[ 3], IF, p0, p1, pp4k[isp + 0]);
855  STEP2_SMALL(w[ 4], w[ 5], w[ 6], w[ 7], IF, p1, p2, pp4k[isp + 1]);
856  STEP2_SMALL(w[ 8], w[ 9], w[10], w[11], IF, p2, p3, pp4k[isp + 2]);
857  STEP2_SMALL(w[12], w[13], w[14], w[15], IF, p3, p0, pp4k[isp + 3]);
858  STEP2_SMALL(w[16], w[17], w[18], w[19], MAJ, p0, p1, pp4k[isp + 4]);
859  STEP2_SMALL(w[20], w[21], w[22], w[23], MAJ, p1, p2, pp4k[isp + 5]);
860  STEP2_SMALL(w[24], w[25], w[26], w[27], MAJ, p2, p3, pp4k[isp + 6]);
861  STEP2_SMALL(w[28], w[29], w[30], w[31], MAJ, p3, p0, pp4k[isp + 7]);
862 }
863 
864 static void
865 compress_small(sph_simd_small_context *sc, int last)
866 {
867  unsigned char *x;
868  s32 q[128];
869  int i;
870  u32 w[32];
871  u32 state[16];
872  size_t u;
873 
874  static const size_t wsp[32] = {
875  4 << 3, 6 << 3, 0 << 3, 2 << 3,
876  7 << 3, 5 << 3, 3 << 3, 1 << 3,
877  15 << 3, 11 << 3, 12 << 3, 8 << 3,
878  9 << 3, 13 << 3, 10 << 3, 14 << 3,
879  17 << 3, 18 << 3, 23 << 3, 20 << 3,
880  22 << 3, 21 << 3, 16 << 3, 19 << 3,
881  30 << 3, 24 << 3, 25 << 3, 31 << 3,
882  27 << 3, 29 << 3, 28 << 3, 26 << 3
883  };
884 
885  x = sc->buf;
886  FFT128(0, 1, 0, ll);
887  if (last) {
888  for (i = 0; i < 128; i ++) {
889  s32 tq;
890 
891  tq = q[i] + yoff_s_f[i];
892  tq = REDS2(tq);
893  tq = REDS1(tq);
894  tq = REDS1(tq);
895  q[i] = (tq <= 128 ? tq : tq - 257);
896  }
897  } else {
898  for (i = 0; i < 128; i ++) {
899  s32 tq;
900 
901  tq = q[i] + yoff_s_n[i];
902  tq = REDS2(tq);
903  tq = REDS1(tq);
904  tq = REDS1(tq);
905  q[i] = (tq <= 128 ? tq : tq - 257);
906  }
907  }
908 
909  for (i = 0; i < 16; i += 4) {
910  state[i + 0] = sc->state[i + 0]
911  ^ sph_dec32le_aligned(x + 4 * (i + 0));
912  state[i + 1] = sc->state[i + 1]
913  ^ sph_dec32le_aligned(x + 4 * (i + 1));
914  state[i + 2] = sc->state[i + 2]
915  ^ sph_dec32le_aligned(x + 4 * (i + 2));
916  state[i + 3] = sc->state[i + 3]
917  ^ sph_dec32le_aligned(x + 4 * (i + 3));
918  }
919 
920 #define WSREAD(sb, o1, o2, mm) do { \
921  for (u = 0; u < 32; u += 4) { \
922  size_t v = wsp[(u >> 2) + (sb)]; \
923  w[u + 0] = INNER(q[v + 2 * 0 + (o1)], \
924  q[v + 2 * 0 + (o2)], mm); \
925  w[u + 1] = INNER(q[v + 2 * 1 + (o1)], \
926  q[v + 2 * 1 + (o2)], mm); \
927  w[u + 2] = INNER(q[v + 2 * 2 + (o1)], \
928  q[v + 2 * 2 + (o2)], mm); \
929  w[u + 3] = INNER(q[v + 2 * 3 + (o1)], \
930  q[v + 2 * 3 + (o2)], mm); \
931  } \
932  } while (0)
933 
934  WSREAD( 0, 0, 1, 185);
935  one_round_small(state, w, 0, 3, 23, 17, 27);
936  WSREAD( 8, 0, 1, 185);
937  one_round_small(state, w, 2, 28, 19, 22, 7);
938  WSREAD(16, -128, -64, 233);
939  one_round_small(state, w, 1, 29, 9, 15, 5);
940  WSREAD(24, -191, -127, 233);
941  one_round_small(state, w, 0, 4, 13, 10, 25);
942 
943 #undef WSREAD
944 
945  STEP_SMALL(sc->state[ 0], sc->state[ 1], sc->state[ 2], sc->state[ 3],
946  IF, 4, 13, PP4_2_);
947  STEP_SMALL(sc->state[ 4], sc->state[ 5], sc->state[ 6], sc->state[ 7],
948  IF, 13, 10, PP4_0_);
949  STEP_SMALL(sc->state[ 8], sc->state[ 9], sc->state[10], sc->state[11],
950  IF, 10, 25, PP4_1_);
951  STEP_SMALL(sc->state[12], sc->state[13], sc->state[14], sc->state[15],
952  IF, 25, 4, PP4_2_);
953 
954  memcpy(sc->state, state, sizeof state);
955 }
956 
957 #undef A0
958 #undef A1
959 #undef A2
960 #undef A3
961 #undef B0
962 #undef B1
963 #undef B2
964 #undef B3
965 #undef C0
966 #undef C1
967 #undef C2
968 #undef C3
969 #undef D0
970 #undef D1
971 #undef D2
972 #undef D3
973 
974 #else
975 
976 #if SPH_SIMD_NOCOPY
977 #define A0 (sc->state[ 0])
978 #define A1 (sc->state[ 1])
979 #define A2 (sc->state[ 2])
980 #define A3 (sc->state[ 3])
981 #define B0 (sc->state[ 4])
982 #define B1 (sc->state[ 5])
983 #define B2 (sc->state[ 6])
984 #define B3 (sc->state[ 7])
985 #define C0 (sc->state[ 8])
986 #define C1 (sc->state[ 9])
987 #define C2 (sc->state[10])
988 #define C3 (sc->state[11])
989 #define D0 (sc->state[12])
990 #define D1 (sc->state[13])
991 #define D2 (sc->state[14])
992 #define D3 (sc->state[15])
993 #endif
994 
995 static void
996 compress_small(sph_simd_small_context *sc, int last)
997 {
998  unsigned char *x;
999  s32 q[128];
1000  int i;
1002 #if SPH_SIMD_NOCOPY
1003  sph_u32 saved[16];
1004 #endif
1005 
1006 #if SPH_SIMD_NOCOPY
1007  memcpy(saved, sc->state, sizeof saved);
1008 #endif
1009  x = sc->buf;
1010  FFT128(0, 1, 0, ll);
1011  if (last) {
1012  for (i = 0; i < 128; i ++) {
1013  s32 tq;
1014 
1015  tq = q[i] + yoff_s_f[i];
1016  tq = REDS2(tq);
1017  tq = REDS1(tq);
1018  tq = REDS1(tq);
1019  q[i] = (tq <= 128 ? tq : tq - 257);
1020  }
1021  } else {
1022  for (i = 0; i < 128; i ++) {
1023  s32 tq;
1024 
1025  tq = q[i] + yoff_s_n[i];
1026  tq = REDS2(tq);
1027  tq = REDS1(tq);
1028  tq = REDS1(tq);
1029  q[i] = (tq <= 128 ? tq : tq - 257);
1030  }
1031  }
1032  READ_STATE_SMALL(sc);
1033  A0 ^= sph_dec32le_aligned(x + 0);
1034  A1 ^= sph_dec32le_aligned(x + 4);
1035  A2 ^= sph_dec32le_aligned(x + 8);
1036  A3 ^= sph_dec32le_aligned(x + 12);
1037  B0 ^= sph_dec32le_aligned(x + 16);
1038  B1 ^= sph_dec32le_aligned(x + 20);
1039  B2 ^= sph_dec32le_aligned(x + 24);
1040  B3 ^= sph_dec32le_aligned(x + 28);
1041  C0 ^= sph_dec32le_aligned(x + 32);
1042  C1 ^= sph_dec32le_aligned(x + 36);
1043  C2 ^= sph_dec32le_aligned(x + 40);
1044  C3 ^= sph_dec32le_aligned(x + 44);
1045  D0 ^= sph_dec32le_aligned(x + 48);
1046  D1 ^= sph_dec32le_aligned(x + 52);
1047  D2 ^= sph_dec32le_aligned(x + 56);
1048  D3 ^= sph_dec32le_aligned(x + 60);
1049  ONE_ROUND_SMALL(0_, 0, 3, 23, 17, 27);
1050  ONE_ROUND_SMALL(1_, 2, 28, 19, 22, 7);
1051  ONE_ROUND_SMALL(2_, 1, 29, 9, 15, 5);
1052  ONE_ROUND_SMALL(3_, 0, 4, 13, 10, 25);
1053 #if SPH_SIMD_NOCOPY
1054  STEP_SMALL(saved[ 0], saved[ 1], saved[ 2], saved[ 3],
1055  IF, 4, 13, PP4_2_);
1056  STEP_SMALL(saved[ 4], saved[ 5], saved[ 6], saved[ 7],
1057  IF, 13, 10, PP4_0_);
1058  STEP_SMALL(saved[ 8], saved[ 9], saved[10], saved[11],
1059  IF, 10, 25, PP4_1_);
1060  STEP_SMALL(saved[12], saved[13], saved[14], saved[15],
1061  IF, 25, 4, PP4_2_);
1062 #else
1063  STEP_SMALL(sc->state[ 0], sc->state[ 1], sc->state[ 2], sc->state[ 3],
1064  IF, 4, 13, PP4_2_);
1065  STEP_SMALL(sc->state[ 4], sc->state[ 5], sc->state[ 6], sc->state[ 7],
1066  IF, 13, 10, PP4_0_);
1067  STEP_SMALL(sc->state[ 8], sc->state[ 9], sc->state[10], sc->state[11],
1068  IF, 10, 25, PP4_1_);
1069  STEP_SMALL(sc->state[12], sc->state[13], sc->state[14], sc->state[15],
1070  IF, 25, 4, PP4_2_);
1071  WRITE_STATE_SMALL(sc);
1072 #endif
1073 }
1074 
1075 #if SPH_SIMD_NOCOPY
1076 #undef A0
1077 #undef A1
1078 #undef A2
1079 #undef A3
1080 #undef B0
1081 #undef B1
1082 #undef B2
1083 #undef B3
1084 #undef C0
1085 #undef C1
1086 #undef C2
1087 #undef C3
1088 #undef D0
1089 #undef D1
1090 #undef D2
1091 #undef D3
1092 #endif
1093 
1094 #endif
1095 
1096 #if SPH_SMALL_FOOTPRINT_SIMD
1097 
1098 #define A0 state[ 0]
1099 #define A1 state[ 1]
1100 #define A2 state[ 2]
1101 #define A3 state[ 3]
1102 #define A4 state[ 4]
1103 #define A5 state[ 5]
1104 #define A6 state[ 6]
1105 #define A7 state[ 7]
1106 #define B0 state[ 8]
1107 #define B1 state[ 9]
1108 #define B2 state[10]
1109 #define B3 state[11]
1110 #define B4 state[12]
1111 #define B5 state[13]
1112 #define B6 state[14]
1113 #define B7 state[15]
1114 #define C0 state[16]
1115 #define C1 state[17]
1116 #define C2 state[18]
1117 #define C3 state[19]
1118 #define C4 state[20]
1119 #define C5 state[21]
1120 #define C6 state[22]
1121 #define C7 state[23]
1122 #define D0 state[24]
1123 #define D1 state[25]
1124 #define D2 state[26]
1125 #define D3 state[27]
1126 #define D4 state[28]
1127 #define D5 state[29]
1128 #define D6 state[30]
1129 #define D7 state[31]
1130 
1131 /*
1132  * Not needed -- already defined for SIMD-224 / SIMD-256
1133  *
1134 #define STEP2_ELT(n, w, fun, s, ppb) do { \
1135  u32 tt = T32(D ## n + (w) + fun(A ## n, B ## n, C ## n)); \
1136  A ## n = T32(ROL32(tt, s) + tA[(ppb) ^ n]); \
1137  D ## n = C ## n; \
1138  C ## n = B ## n; \
1139  B ## n = tA[n]; \
1140  } while (0)
1141  */
1142 
1143 #define STEP2_BIG(w0, w1, w2, w3, w4, w5, w6, w7, fun, r, s, pp8b) do { \
1144  u32 tA[8]; \
1145  tA[0] = ROL32(A0, r); \
1146  tA[1] = ROL32(A1, r); \
1147  tA[2] = ROL32(A2, r); \
1148  tA[3] = ROL32(A3, r); \
1149  tA[4] = ROL32(A4, r); \
1150  tA[5] = ROL32(A5, r); \
1151  tA[6] = ROL32(A6, r); \
1152  tA[7] = ROL32(A7, r); \
1153  STEP2_ELT(0, w0, fun, s, pp8b); \
1154  STEP2_ELT(1, w1, fun, s, pp8b); \
1155  STEP2_ELT(2, w2, fun, s, pp8b); \
1156  STEP2_ELT(3, w3, fun, s, pp8b); \
1157  STEP2_ELT(4, w4, fun, s, pp8b); \
1158  STEP2_ELT(5, w5, fun, s, pp8b); \
1159  STEP2_ELT(6, w6, fun, s, pp8b); \
1160  STEP2_ELT(7, w7, fun, s, pp8b); \
1161  } while (0)
1162 
1163 static void
1164 one_round_big(u32 *state, u32 *w, int isp, int p0, int p1, int p2, int p3)
1165 {
1166  static const int pp8k[] = { 1, 6, 2, 3, 5, 7, 4, 1, 6, 2, 3 };
1167 
1168  STEP2_BIG(w[ 0], w[ 1], w[ 2], w[ 3], w[ 4], w[ 5], w[ 6], w[ 7],
1169  IF, p0, p1, pp8k[isp + 0]);
1170  STEP2_BIG(w[ 8], w[ 9], w[10], w[11], w[12], w[13], w[14], w[15],
1171  IF, p1, p2, pp8k[isp + 1]);
1172  STEP2_BIG(w[16], w[17], w[18], w[19], w[20], w[21], w[22], w[23],
1173  IF, p2, p3, pp8k[isp + 2]);
1174  STEP2_BIG(w[24], w[25], w[26], w[27], w[28], w[29], w[30], w[31],
1175  IF, p3, p0, pp8k[isp + 3]);
1176  STEP2_BIG(w[32], w[33], w[34], w[35], w[36], w[37], w[38], w[39],
1177  MAJ, p0, p1, pp8k[isp + 4]);
1178  STEP2_BIG(w[40], w[41], w[42], w[43], w[44], w[45], w[46], w[47],
1179  MAJ, p1, p2, pp8k[isp + 5]);
1180  STEP2_BIG(w[48], w[49], w[50], w[51], w[52], w[53], w[54], w[55],
1181  MAJ, p2, p3, pp8k[isp + 6]);
1182  STEP2_BIG(w[56], w[57], w[58], w[59], w[60], w[61], w[62], w[63],
1183  MAJ, p3, p0, pp8k[isp + 7]);
1184 }
1185 
1186 static void
1187 compress_big(sph_simd_big_context *sc, int last)
1188 {
1189  unsigned char *x;
1190  s32 q[256];
1191  int i;
1192  u32 w[64];
1193  u32 state[32];
1194  size_t u;
1195 
1196  static const size_t wbp[32] = {
1197  4 << 4, 6 << 4, 0 << 4, 2 << 4,
1198  7 << 4, 5 << 4, 3 << 4, 1 << 4,
1199  15 << 4, 11 << 4, 12 << 4, 8 << 4,
1200  9 << 4, 13 << 4, 10 << 4, 14 << 4,
1201  17 << 4, 18 << 4, 23 << 4, 20 << 4,
1202  22 << 4, 21 << 4, 16 << 4, 19 << 4,
1203  30 << 4, 24 << 4, 25 << 4, 31 << 4,
1204  27 << 4, 29 << 4, 28 << 4, 26 << 4
1205  };
1206 
1207  x = sc->buf;
1208  FFT256(0, 1, 0, ll);
1209  if (last) {
1210  for (i = 0; i < 256; i ++) {
1211  s32 tq;
1212 
1213  tq = q[i] + yoff_b_f[i];
1214  tq = REDS2(tq);
1215  tq = REDS1(tq);
1216  tq = REDS1(tq);
1217  q[i] = (tq <= 128 ? tq : tq - 257);
1218  }
1219  } else {
1220  for (i = 0; i < 256; i ++) {
1221  s32 tq;
1222 
1223  tq = q[i] + yoff_b_n[i];
1224  tq = REDS2(tq);
1225  tq = REDS1(tq);
1226  tq = REDS1(tq);
1227  q[i] = (tq <= 128 ? tq : tq - 257);
1228  }
1229  }
1230 
1231  for (i = 0; i < 32; i += 8) {
1232  state[i + 0] = sc->state[i + 0]
1233  ^ sph_dec32le_aligned(x + 4 * (i + 0));
1234  state[i + 1] = sc->state[i + 1]
1235  ^ sph_dec32le_aligned(x + 4 * (i + 1));
1236  state[i + 2] = sc->state[i + 2]
1237  ^ sph_dec32le_aligned(x + 4 * (i + 2));
1238  state[i + 3] = sc->state[i + 3]
1239  ^ sph_dec32le_aligned(x + 4 * (i + 3));
1240  state[i + 4] = sc->state[i + 4]
1241  ^ sph_dec32le_aligned(x + 4 * (i + 4));
1242  state[i + 5] = sc->state[i + 5]
1243  ^ sph_dec32le_aligned(x + 4 * (i + 5));
1244  state[i + 6] = sc->state[i + 6]
1245  ^ sph_dec32le_aligned(x + 4 * (i + 6));
1246  state[i + 7] = sc->state[i + 7]
1247  ^ sph_dec32le_aligned(x + 4 * (i + 7));
1248  }
1249 
1250 #define WBREAD(sb, o1, o2, mm) do { \
1251  for (u = 0; u < 64; u += 8) { \
1252  size_t v = wbp[(u >> 3) + (sb)]; \
1253  w[u + 0] = INNER(q[v + 2 * 0 + (o1)], \
1254  q[v + 2 * 0 + (o2)], mm); \
1255  w[u + 1] = INNER(q[v + 2 * 1 + (o1)], \
1256  q[v + 2 * 1 + (o2)], mm); \
1257  w[u + 2] = INNER(q[v + 2 * 2 + (o1)], \
1258  q[v + 2 * 2 + (o2)], mm); \
1259  w[u + 3] = INNER(q[v + 2 * 3 + (o1)], \
1260  q[v + 2 * 3 + (o2)], mm); \
1261  w[u + 4] = INNER(q[v + 2 * 4 + (o1)], \
1262  q[v + 2 * 4 + (o2)], mm); \
1263  w[u + 5] = INNER(q[v + 2 * 5 + (o1)], \
1264  q[v + 2 * 5 + (o2)], mm); \
1265  w[u + 6] = INNER(q[v + 2 * 6 + (o1)], \
1266  q[v + 2 * 6 + (o2)], mm); \
1267  w[u + 7] = INNER(q[v + 2 * 7 + (o1)], \
1268  q[v + 2 * 7 + (o2)], mm); \
1269  } \
1270  } while (0)
1271 
1272  WBREAD( 0, 0, 1, 185);
1273  one_round_big(state, w, 0, 3, 23, 17, 27);
1274  WBREAD( 8, 0, 1, 185);
1275  one_round_big(state, w, 1, 28, 19, 22, 7);
1276  WBREAD(16, -256, -128, 233);
1277  one_round_big(state, w, 2, 29, 9, 15, 5);
1278  WBREAD(24, -383, -255, 233);
1279  one_round_big(state, w, 3, 4, 13, 10, 25);
1280 
1281 #undef WBREAD
1282 
1283  STEP_BIG(
1284  sc->state[ 0], sc->state[ 1], sc->state[ 2], sc->state[ 3],
1285  sc->state[ 4], sc->state[ 5], sc->state[ 6], sc->state[ 7],
1286  IF, 4, 13, PP8_4_);
1287  STEP_BIG(
1288  sc->state[ 8], sc->state[ 9], sc->state[10], sc->state[11],
1289  sc->state[12], sc->state[13], sc->state[14], sc->state[15],
1290  IF, 13, 10, PP8_5_);
1291  STEP_BIG(
1292  sc->state[16], sc->state[17], sc->state[18], sc->state[19],
1293  sc->state[20], sc->state[21], sc->state[22], sc->state[23],
1294  IF, 10, 25, PP8_6_);
1295  STEP_BIG(
1296  sc->state[24], sc->state[25], sc->state[26], sc->state[27],
1297  sc->state[28], sc->state[29], sc->state[30], sc->state[31],
1298  IF, 25, 4, PP8_0_);
1299 
1300  memcpy(sc->state, state, sizeof state);
1301 }
1302 
1303 #undef A0
1304 #undef A1
1305 #undef A2
1306 #undef A3
1307 #undef A4
1308 #undef A5
1309 #undef A6
1310 #undef A7
1311 #undef B0
1312 #undef B1
1313 #undef B2
1314 #undef B3
1315 #undef B4
1316 #undef B5
1317 #undef B6
1318 #undef B7
1319 #undef C0
1320 #undef C1
1321 #undef C2
1322 #undef C3
1323 #undef C4
1324 #undef C5
1325 #undef C6
1326 #undef C7
1327 #undef D0
1328 #undef D1
1329 #undef D2
1330 #undef D3
1331 #undef D4
1332 #undef D5
1333 #undef D6
1334 #undef D7
1335 
1336 #else
1337 
1338 #if SPH_SIMD_NOCOPY
1339 #define A0 (sc->state[ 0])
1340 #define A1 (sc->state[ 1])
1341 #define A2 (sc->state[ 2])
1342 #define A3 (sc->state[ 3])
1343 #define A4 (sc->state[ 4])
1344 #define A5 (sc->state[ 5])
1345 #define A6 (sc->state[ 6])
1346 #define A7 (sc->state[ 7])
1347 #define B0 (sc->state[ 8])
1348 #define B1 (sc->state[ 9])
1349 #define B2 (sc->state[10])
1350 #define B3 (sc->state[11])
1351 #define B4 (sc->state[12])
1352 #define B5 (sc->state[13])
1353 #define B6 (sc->state[14])
1354 #define B7 (sc->state[15])
1355 #define C0 (sc->state[16])
1356 #define C1 (sc->state[17])
1357 #define C2 (sc->state[18])
1358 #define C3 (sc->state[19])
1359 #define C4 (sc->state[20])
1360 #define C5 (sc->state[21])
1361 #define C6 (sc->state[22])
1362 #define C7 (sc->state[23])
1363 #define D0 (sc->state[24])
1364 #define D1 (sc->state[25])
1365 #define D2 (sc->state[26])
1366 #define D3 (sc->state[27])
1367 #define D4 (sc->state[28])
1368 #define D5 (sc->state[29])
1369 #define D6 (sc->state[30])
1370 #define D7 (sc->state[31])
1371 #endif
1372 
1373 static void
1374 compress_big(sph_simd_big_context *sc, int last)
1375 {
1376  unsigned char *x;
1377  s32 q[256];
1378  int i;
1380 #if SPH_SIMD_NOCOPY
1381  sph_u32 saved[32];
1382 #endif
1383 
1384 #if SPH_SIMD_NOCOPY
1385  memcpy(saved, sc->state, sizeof saved);
1386 #endif
1387 
1388  x = sc->buf;
1389  FFT256(0, 1, 0, ll);
1390  if (last) {
1391  for (i = 0; i < 256; i ++) {
1392  s32 tq;
1393 
1394  tq = q[i] + yoff_b_f[i];
1395  tq = REDS2(tq);
1396  tq = REDS1(tq);
1397  tq = REDS1(tq);
1398  q[i] = (tq <= 128 ? tq : tq - 257);
1399  }
1400  } else {
1401  for (i = 0; i < 256; i ++) {
1402  s32 tq;
1403 
1404  tq = q[i] + yoff_b_n[i];
1405  tq = REDS2(tq);
1406  tq = REDS1(tq);
1407  tq = REDS1(tq);
1408  q[i] = (tq <= 128 ? tq : tq - 257);
1409  }
1410  }
1411  READ_STATE_BIG(sc);
1412  A0 ^= sph_dec32le_aligned(x + 0);
1413  A1 ^= sph_dec32le_aligned(x + 4);
1414  A2 ^= sph_dec32le_aligned(x + 8);
1415  A3 ^= sph_dec32le_aligned(x + 12);
1416  A4 ^= sph_dec32le_aligned(x + 16);
1417  A5 ^= sph_dec32le_aligned(x + 20);
1418  A6 ^= sph_dec32le_aligned(x + 24);
1419  A7 ^= sph_dec32le_aligned(x + 28);
1420  B0 ^= sph_dec32le_aligned(x + 32);
1421  B1 ^= sph_dec32le_aligned(x + 36);
1422  B2 ^= sph_dec32le_aligned(x + 40);
1423  B3 ^= sph_dec32le_aligned(x + 44);
1424  B4 ^= sph_dec32le_aligned(x + 48);
1425  B5 ^= sph_dec32le_aligned(x + 52);
1426  B6 ^= sph_dec32le_aligned(x + 56);
1427  B7 ^= sph_dec32le_aligned(x + 60);
1428  C0 ^= sph_dec32le_aligned(x + 64);
1429  C1 ^= sph_dec32le_aligned(x + 68);
1430  C2 ^= sph_dec32le_aligned(x + 72);
1431  C3 ^= sph_dec32le_aligned(x + 76);
1432  C4 ^= sph_dec32le_aligned(x + 80);
1433  C5 ^= sph_dec32le_aligned(x + 84);
1434  C6 ^= sph_dec32le_aligned(x + 88);
1435  C7 ^= sph_dec32le_aligned(x + 92);
1436  D0 ^= sph_dec32le_aligned(x + 96);
1437  D1 ^= sph_dec32le_aligned(x + 100);
1438  D2 ^= sph_dec32le_aligned(x + 104);
1439  D3 ^= sph_dec32le_aligned(x + 108);
1440  D4 ^= sph_dec32le_aligned(x + 112);
1441  D5 ^= sph_dec32le_aligned(x + 116);
1442  D6 ^= sph_dec32le_aligned(x + 120);
1443  D7 ^= sph_dec32le_aligned(x + 124);
1444 
1445  ONE_ROUND_BIG(0_, 0, 3, 23, 17, 27);
1446  ONE_ROUND_BIG(1_, 1, 28, 19, 22, 7);
1447  ONE_ROUND_BIG(2_, 2, 29, 9, 15, 5);
1448  ONE_ROUND_BIG(3_, 3, 4, 13, 10, 25);
1449 #if SPH_SIMD_NOCOPY
1450  STEP_BIG(
1451  saved[ 0], saved[ 1], saved[ 2], saved[ 3],
1452  saved[ 4], saved[ 5], saved[ 6], saved[ 7],
1453  IF, 4, 13, PP8_4_);
1454  STEP_BIG(
1455  saved[ 8], saved[ 9], saved[10], saved[11],
1456  saved[12], saved[13], saved[14], saved[15],
1457  IF, 13, 10, PP8_5_);
1458  STEP_BIG(
1459  saved[16], saved[17], saved[18], saved[19],
1460  saved[20], saved[21], saved[22], saved[23],
1461  IF, 10, 25, PP8_6_);
1462  STEP_BIG(
1463  saved[24], saved[25], saved[26], saved[27],
1464  saved[28], saved[29], saved[30], saved[31],
1465  IF, 25, 4, PP8_0_);
1466 #else
1467  STEP_BIG(
1468  sc->state[ 0], sc->state[ 1], sc->state[ 2], sc->state[ 3],
1469  sc->state[ 4], sc->state[ 5], sc->state[ 6], sc->state[ 7],
1470  IF, 4, 13, PP8_4_);
1471  STEP_BIG(
1472  sc->state[ 8], sc->state[ 9], sc->state[10], sc->state[11],
1473  sc->state[12], sc->state[13], sc->state[14], sc->state[15],
1474  IF, 13, 10, PP8_5_);
1475  STEP_BIG(
1476  sc->state[16], sc->state[17], sc->state[18], sc->state[19],
1477  sc->state[20], sc->state[21], sc->state[22], sc->state[23],
1478  IF, 10, 25, PP8_6_);
1479  STEP_BIG(
1480  sc->state[24], sc->state[25], sc->state[26], sc->state[27],
1481  sc->state[28], sc->state[29], sc->state[30], sc->state[31],
1482  IF, 25, 4, PP8_0_);
1483  WRITE_STATE_BIG(sc);
1484 #endif
1485 }
1486 
1487 #if SPH_SIMD_NOCOPY
1488 #undef A0
1489 #undef A1
1490 #undef A2
1491 #undef A3
1492 #undef A4
1493 #undef A5
1494 #undef A6
1495 #undef A7
1496 #undef B0
1497 #undef B1
1498 #undef B2
1499 #undef B3
1500 #undef B4
1501 #undef B5
1502 #undef B6
1503 #undef B7
1504 #undef C0
1505 #undef C1
1506 #undef C2
1507 #undef C3
1508 #undef C4
1509 #undef C5
1510 #undef C6
1511 #undef C7
1512 #undef D0
1513 #undef D1
1514 #undef D2
1515 #undef D3
1516 #undef D4
1517 #undef D5
1518 #undef D6
1519 #undef D7
1520 #endif
1521 
1522 #endif
1523 
1524 static const u32 IV224[] = {
1525  C32(0x33586E9F), C32(0x12FFF033), C32(0xB2D9F64D), C32(0x6F8FEA53),
1526  C32(0xDE943106), C32(0x2742E439), C32(0x4FBAB5AC), C32(0x62B9FF96),
1527  C32(0x22E7B0AF), C32(0xC862B3A8), C32(0x33E00CDC), C32(0x236B86A6),
1528  C32(0xF64AE77C), C32(0xFA373B76), C32(0x7DC1EE5B), C32(0x7FB29CE8)
1529 };
1530 
1531 static const u32 IV256[] = {
1532  C32(0x4D567983), C32(0x07190BA9), C32(0x8474577B), C32(0x39D726E9),
1533  C32(0xAAF3D925), C32(0x3EE20B03), C32(0xAFD5E751), C32(0xC96006D3),
1534  C32(0xC2C2BA14), C32(0x49B3BCB4), C32(0xF67CAF46), C32(0x668626C9),
1535  C32(0xE2EAA8D2), C32(0x1FF47833), C32(0xD0C661A5), C32(0x55693DE1)
1536 };
1537 
1538 static const u32 IV384[] = {
1539  C32(0x8A36EEBC), C32(0x94A3BD90), C32(0xD1537B83), C32(0xB25B070B),
1540  C32(0xF463F1B5), C32(0xB6F81E20), C32(0x0055C339), C32(0xB4D144D1),
1541  C32(0x7360CA61), C32(0x18361A03), C32(0x17DCB4B9), C32(0x3414C45A),
1542  C32(0xA699A9D2), C32(0xE39E9664), C32(0x468BFE77), C32(0x51D062F8),
1543  C32(0xB9E3BFE8), C32(0x63BECE2A), C32(0x8FE506B9), C32(0xF8CC4AC2),
1544  C32(0x7AE11542), C32(0xB1AADDA1), C32(0x64B06794), C32(0x28D2F462),
1545  C32(0xE64071EC), C32(0x1DEB91A8), C32(0x8AC8DB23), C32(0x3F782AB5),
1546  C32(0x039B5CB8), C32(0x71DDD962), C32(0xFADE2CEA), C32(0x1416DF71)
1547 };
1548 
1549 static const u32 IV512[] = {
1550  C32(0x0BA16B95), C32(0x72F999AD), C32(0x9FECC2AE), C32(0xBA3264FC),
1551  C32(0x5E894929), C32(0x8E9F30E5), C32(0x2F1DAA37), C32(0xF0F2C558),
1552  C32(0xAC506643), C32(0xA90635A5), C32(0xE25B878B), C32(0xAAB7878F),
1553  C32(0x88817F7A), C32(0x0A02892B), C32(0x559A7550), C32(0x598F657E),
1554  C32(0x7EEF60A1), C32(0x6B70E3E8), C32(0x9C1714D1), C32(0xB958E2A8),
1555  C32(0xAB02675E), C32(0xED1C014F), C32(0xCD8D65BB), C32(0xFDB7A257),
1556  C32(0x09254899), C32(0xD699C7BC), C32(0x9019B6DC), C32(0x2B9022E4),
1557  C32(0x8FA14956), C32(0x21BF9BD3), C32(0xB94D0943), C32(0x6FFDDC22)
1558 };
1559 
1560 static void
1561 init_small(void *cc, const u32 *iv)
1562 {
1564 
1565  sc = cc;
1566  memcpy(sc->state, iv, sizeof sc->state);
1567  sc->count_low = sc->count_high = 0;
1568  sc->ptr = 0;
1569 }
1570 
1571 static void
1572 init_big(void *cc, const u32 *iv)
1573 {
1575 
1576  sc = cc;
1577  memcpy(sc->state, iv, sizeof sc->state);
1578  sc->count_low = sc->count_high = 0;
1579  sc->ptr = 0;
1580 }
1581 
1582 static void
1583 update_small(void *cc, const void *data, size_t len)
1584 {
1586 
1587  sc = cc;
1588  while (len > 0) {
1589  size_t clen;
1590 
1591  clen = (sizeof sc->buf) - sc->ptr;
1592  if (clen > len)
1593  clen = len;
1594  memcpy(sc->buf + sc->ptr, data, clen);
1595  data = (const unsigned char *)data + clen;
1596  len -= clen;
1597  if ((sc->ptr += clen) == sizeof sc->buf) {
1598  compress_small(sc, 0);
1599  sc->ptr = 0;
1600  sc->count_low = T32(sc->count_low + 1);
1601  if (sc->count_low == 0)
1602  sc->count_high ++;
1603  }
1604  }
1605 }
1606 
1607 static void
1608 update_big(void *cc, const void *data, size_t len)
1609 {
1611 
1612  sc = cc;
1613  while (len > 0) {
1614  size_t clen;
1615 
1616  clen = (sizeof sc->buf) - sc->ptr;
1617  if (clen > len)
1618  clen = len;
1619  memcpy(sc->buf + sc->ptr, data, clen);
1620  data = (const unsigned char *)data + clen;
1621  len -= clen;
1622  if ((sc->ptr += clen) == sizeof sc->buf) {
1623  compress_big(sc, 0);
1624  sc->ptr = 0;
1625  sc->count_low = T32(sc->count_low + 1);
1626  if (sc->count_low == 0)
1627  sc->count_high ++;
1628  }
1629  }
1630 }
1631 
1632 static void
1633 encode_count_small(unsigned char *dst,
1634  u32 low, u32 high, size_t ptr, unsigned n)
1635 {
1636  low = T32(low << 9);
1637  high = T32(high << 9) + (low >> 23);
1638  low += (ptr << 3) + n;
1639  sph_enc32le(dst, low);
1640  sph_enc32le(dst + 4, high);
1641 }
1642 
1643 static void
1644 encode_count_big(unsigned char *dst,
1645  u32 low, u32 high, size_t ptr, unsigned n)
1646 {
1647  low = T32(low << 10);
1648  high = T32(high << 10) + (low >> 22);
1649  low += (ptr << 3) + n;
1650  sph_enc32le(dst, low);
1651  sph_enc32le(dst + 4, high);
1652 }
1653 
1654 static void
1655 finalize_small(void *cc, unsigned ub, unsigned n, void *dst, size_t dst_len)
1656 {
1658  unsigned char *d;
1659  size_t u;
1660 
1661  sc = cc;
1662  if (sc->ptr > 0 || n > 0) {
1663  memset(sc->buf + sc->ptr, 0,
1664  (sizeof sc->buf) - sc->ptr);
1665  sc->buf[sc->ptr] = ub & (0xFF << (8 - n));
1666  compress_small(sc, 0);
1667  }
1668  memset(sc->buf, 0, sizeof sc->buf);
1669  encode_count_small(sc->buf, sc->count_low, sc->count_high, sc->ptr, n);
1670  compress_small(sc, 1);
1671  d = dst;
1672  for (d = dst, u = 0; u < dst_len; u ++)
1673  sph_enc32le(d + (u << 2), sc->state[u]);
1674 }
1675 
1676 static void
1677 finalize_big(void *cc, unsigned ub, unsigned n, void *dst, size_t dst_len)
1678 {
1680  unsigned char *d;
1681  size_t u;
1682 
1683  sc = cc;
1684  if (sc->ptr > 0 || n > 0) {
1685  memset(sc->buf + sc->ptr, 0,
1686  (sizeof sc->buf) - sc->ptr);
1687  sc->buf[sc->ptr] = ub & (0xFF << (8 - n));
1688  compress_big(sc, 0);
1689  }
1690  memset(sc->buf, 0, sizeof sc->buf);
1691  encode_count_big(sc->buf, sc->count_low, sc->count_high, sc->ptr, n);
1692  compress_big(sc, 1);
1693  d = dst;
1694  for (d = dst, u = 0; u < dst_len; u ++)
1695  sph_enc32le(d + (u << 2), sc->state[u]);
1696 }
1697 
1698 void
1700 {
1701  init_small(cc, IV224);
1702 }
1703 
/*
 * Process len bytes of input for SIMD-224. This simply forwards to the
 * shared small-state update routine.
 */
void
sph_simd224(void *cc, const void *data, size_t len)
{
	update_small(cc, data, len);
}
1709 
/*
 * Terminate the SIMD-224 computation with no extra bits: write the
 * 7-word digest to dst, then reinitialize the context.
 */
void
sph_simd224_close(void *cc, void *dst)
{
	finalize_small(cc, 0, 0, dst, 7);
	sph_simd224_init(cc);
}
1715 
1716 void
1717 sph_simd224_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst)
1718 {
1719  finalize_small(cc, ub, n, dst, 7);
1720  sph_simd224_init(cc);
1721 }
1722 
1723 void
1725 {
1726  init_small(cc, IV256);
1727 }
1728 
/*
 * Process len bytes of input for SIMD-256. This simply forwards to the
 * shared small-state update routine.
 */
void
sph_simd256(void *cc, const void *data, size_t len)
{
	update_small(cc, data, len);
}
1734 
/*
 * Terminate the SIMD-256 computation with no extra bits: write the
 * 8-word digest to dst, then reinitialize the context.
 */
void
sph_simd256_close(void *cc, void *dst)
{
	finalize_small(cc, 0, 0, dst, 8);
	sph_simd256_init(cc);
}
1740 
1741 void
1742 sph_simd256_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst)
1743 {
1744  finalize_small(cc, ub, n, dst, 8);
1745  sph_simd256_init(cc);
1746 }
1747 
1748 void
1750 {
1751  init_big(cc, IV384);
1752 }
1753 
/*
 * Process len bytes of input for SIMD-384. This simply forwards to the
 * shared big-state update routine.
 */
void
sph_simd384(void *cc, const void *data, size_t len)
{
	update_big(cc, data, len);
}
1759 
/*
 * Terminate the SIMD-384 computation with no extra bits: write the
 * 12-word digest to dst, then reinitialize the context.
 */
void
sph_simd384_close(void *cc, void *dst)
{
	finalize_big(cc, 0, 0, dst, 12);
	sph_simd384_init(cc);
}
1765 
1766 void
1767 sph_simd384_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst)
1768 {
1769  finalize_big(cc, ub, n, dst, 12);
1770  sph_simd384_init(cc);
1771 }
1772 
1773 void
1775 {
1776  init_big(cc, IV512);
1777 }
1778 
/*
 * Process len bytes of input for SIMD-512. This simply forwards to the
 * shared big-state update routine.
 */
void
sph_simd512(void *cc, const void *data, size_t len)
{
	update_big(cc, data, len);
}
1784 
/*
 * Terminate the SIMD-512 computation with no extra bits: write the
 * 16-word digest to dst, then reinitialize the context.
 */
void
sph_simd512_close(void *cc, void *dst)
{
	finalize_big(cc, 0, 0, dst, 16);
	sph_simd512_init(cc);
}
1790 
1791 void
1792 sph_simd512_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst)
1793 {
1794  finalize_big(cc, ub, n, dst, 16);
1795  sph_simd512_init(cc);
1796 }
1797 #ifdef __cplusplus
1798 }
1799 #endif
sph_u32 state[16]
Definition: sph_simd.h:81
sph_u32 u32
Definition: simd.c:51
void sph_simd224(void *cc, const void *data, size_t len)
Process some data bytes.
Definition: simd.c:1705
#define FFT32(xb, xs, rb, id)
Definition: simd.c:202
#define READ_STATE_BIG(sc)
Definition: simd.c:595
#define FFT_LOOP(rb, hk, as, id)
Definition: simd.c:102
void sph_simd384_close(void *cc, void *dst)
Terminate the current SIMD-384 computation and output the result into the provided buffer...
Definition: simd.c:1761
This structure is a context for SIMD computations: it contains the intermediate values and some data ...
Definition: sph_simd.h:77
sph_u32 state[32]
Definition: sph_simd.h:101
void sph_simd384_init(void *cc)
Initialize an SIMD-384 context.
Definition: simd.c:1749
void sph_simd512(void *cc, const void *data, size_t len)
Process some data bytes.
Definition: simd.c:1780
long sph_s32
Definition: sph_types.h:871
#define DECL_STATE_SMALL
Definition: simd.c:548
void sph_simd512_close(void *cc, void *dst)
Terminate the current SIMD-512 computation and output the result into the provided buffer...
Definition: simd.c:1786
void sph_simd256_init(void *cc)
Initialize an SIMD-256 context.
Definition: simd.c:1724
#define ONE_ROUND_BIG(ri, isp, p0, p1, p2, p3)
Definition: simd.c:791
#define IF(x, y, z)
Definition: simd.c:458
sph_u32 high
Definition: keccak.c:370
#define WRITE_STATE_BIG(sc)
Definition: simd.c:630
void sph_simd384_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst)
Add a few additional bits (0 to 7) to the current computation, then terminate it and output the resul...
Definition: simd.c:1767
#define STEP_BIG(w0, w1, w2, w3, w4, w5, w6, w7, fun, r, s, pp8b)
Definition: simd.c:686
#define REDS2(x)
Definition: simd.c:94
SIMD interface.
#define REDS1(x)
Definition: simd.c:93
unsigned char buf[64]
Definition: sph_simd.h:79
#define T32
Definition: simd.c:54
#define READ_STATE_SMALL(sc)
Definition: simd.c:551
#define C32
Definition: simd.c:53
unsigned char buf[128]
Definition: sph_simd.h:99
#define WRITE_STATE_SMALL(sc)
Definition: simd.c:570
This structure is a context for SIMD computations: it contains the intermediate values and some data ...
Definition: sph_simd.h:97
void sph_simd512_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst)
Add a few additional bits (0 to 7) to the current computation, then terminate it and output the resul...
Definition: simd.c:1792
#define ONE_ROUND_SMALL(ri, isp, p0, p1, p2, p3)
Definition: simd.c:734
#define STEP_SMALL(w0, w1, w2, w3, fun, r, s, pp4b)
Definition: simd.c:675
#define MAJ(x, y, z)
Definition: simd.c:459
void sph_simd256_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst)
Add a few additional bits (0 to 7) to the current computation, then terminate it and output the resul...
Definition: simd.c:1742
void sph_simd256_close(void *cc, void *dst)
Terminate the current SIMD-256 computation and output the result into the provided buffer...
Definition: simd.c:1736
void sph_simd224_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst)
Add a few additional bits (0 to 7) to the current computation, then terminate it and output the resul...
Definition: simd.c:1717
void sph_simd384(void *cc, const void *data, size_t len)
Process some data bytes.
Definition: simd.c:1755
void sph_simd224_init(void *cc)
Initialize an SIMD-224 context.
Definition: simd.c:1699
#define FFT256(xb, xs, rb, id)
Definition: simd.c:273
void sph_simd256(void *cc, const void *data, size_t len)
Process some data bytes.
Definition: simd.c:1730
void * memcpy(void *a, const void *b, size_t c)
#define FFT128(xb, xs, rb, id)
Definition: simd.c:245
sph_u32 low
Definition: keccak.c:370
unsigned long sph_u32
Definition: sph_types.h:870
#define FFT16(xb, xs, rb)
Definition: simd.c:176
void sph_simd512_init(void *cc)
Initialize an SIMD-512 context.
Definition: simd.c:1774
sph_s32 s32
Definition: simd.c:52
#define DECL_STATE_BIG
Definition: simd.c:589
std::string _(const char *psz)
Translation function: Call Translate signal on UI interface, which returns a boost::optional result...
Definition: util.h:66
void sph_simd224_close(void *cc, void *dst)
Terminate the current SIMD-224 computation and output the result into the provided buffer...
Definition: simd.c:1711