Coverage Report

Created: 2026-06-01 18:35

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/bitcoin/src/crypto/sha256_sse4.cpp
Line
Count
Source
1
// Copyright (c) 2017-present The Bitcoin Core developers
2
// Distributed under the MIT software license, see the accompanying
3
// file COPYING or http://www.opensource.org/licenses/mit-license.php.
4
//
5
// This is a translation to GCC extended asm syntax from YASM code by Intel
6
// (available at the bottom of this file).
7
8
#if defined(__x86_64__) || defined(__amd64__)
9
10
#include <cstdint>
11
#include <cstdlib>
12
13
namespace sha256_sse4
14
{
15
void Transform(uint32_t* s, const unsigned char* chunk, size_t blocks)
16
#if defined(__clang__)
17
  /*
18
  clang is unable to compile this with -O0 and -fsanitize=address.
19
  See upstream bug: https://github.com/llvm/llvm-project/issues/92182.
20
  This also fails to compile with -O2, -fcf-protection & -fsanitize=address.
21
  See https://github.com/bitcoin/bitcoin/issues/31913.
22
  */
23
#if __has_feature(address_sanitizer)
24
  __attribute__((no_sanitize("address")))
25
#endif
26
#endif
27
25.0k
{
28
25.0k
    static const uint32_t K256 alignas(16) [] = {
29
25.0k
        0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5,
30
25.0k
        0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5,
31
25.0k
        0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3,
32
25.0k
        0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174,
33
25.0k
        0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc,
34
25.0k
        0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da,
35
25.0k
        0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7,
36
25.0k
        0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967,
37
25.0k
        0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13,
38
25.0k
        0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85,
39
25.0k
        0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3,
40
25.0k
        0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070,
41
25.0k
        0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5,
42
25.0k
        0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3,
43
25.0k
        0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208,
44
25.0k
        0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2,
45
25.0k
    };
46
25.0k
    static const uint32_t FLIP_MASK alignas(16) [] = {0x00010203, 0x04050607, 0x08090a0b, 0x0c0d0e0f};
47
25.0k
    static const uint32_t SHUF_00BA alignas(16) [] = {0x03020100, 0x0b0a0908, 0xffffffff, 0xffffffff};
48
25.0k
    static const uint32_t SHUF_DC00 alignas(16) [] = {0xffffffff, 0xffffffff, 0x03020100, 0x0b0a0908};
49
25.0k
    uint32_t a, b, c, d, f, g, h, y0, y1, y2;
50
25.0k
    uint64_t tbl;
51
25.0k
    uint64_t inp_end, inp;
52
25.0k
    uint32_t xfer alignas(16) [4];
53
54
25.0k
    __asm__ __volatile__(
55
25.0k
        "shl    $0x6,%2;"
56
25.0k
        "je     Ldone_hash_%=;"
57
25.0k
        "add    %1,%2;"
58
25.0k
        "mov    %2,%14;"
59
25.0k
        "mov    (%0),%3;"
60
25.0k
        "mov    0x4(%0),%4;"
61
25.0k
        "mov    0x8(%0),%5;"
62
25.0k
        "mov    0xc(%0),%6;"
63
25.0k
        "mov    0x10(%0),%k2;"
64
25.0k
        "mov    0x14(%0),%7;"
65
25.0k
        "mov    0x18(%0),%8;"
66
25.0k
        "mov    0x1c(%0),%9;"
67
25.0k
        "movdqa %18,%%xmm12;"
68
25.0k
        "movdqa %19,%%xmm10;"
69
25.0k
        "movdqa %20,%%xmm11;"
70
71
25.0k
        "Lloop0_%=:"
72
25.0k
        "lea    %17,%13;"
73
25.0k
        "movdqu (%1),%%xmm4;"
74
25.0k
        "pshufb %%xmm12,%%xmm4;"
75
25.0k
        "movdqu 0x10(%1),%%xmm5;"
76
25.0k
        "pshufb %%xmm12,%%xmm5;"
77
25.0k
        "movdqu 0x20(%1),%%xmm6;"
78
25.0k
        "pshufb %%xmm12,%%xmm6;"
79
25.0k
        "movdqu 0x30(%1),%%xmm7;"
80
25.0k
        "pshufb %%xmm12,%%xmm7;"
81
25.0k
        "mov    %1,%15;"
82
25.0k
        "mov    $3,%1;"
83
84
25.0k
        "Lloop1_%=:"
85
25.0k
        "movdqa 0x0(%13),%%xmm9;"
86
25.0k
        "paddd  %%xmm4,%%xmm9;"
87
25.0k
        "movdqa %%xmm9,%16;"
88
25.0k
        "movdqa %%xmm7,%%xmm0;"
89
25.0k
        "mov    %k2,%10;"
90
25.0k
        "ror    $0xe,%10;"
91
25.0k
        "mov    %3,%11;"
92
25.0k
        "palignr $0x4,%%xmm6,%%xmm0;"
93
25.0k
        "ror    $0x9,%11;"
94
25.0k
        "xor    %k2,%10;"
95
25.0k
        "mov    %7,%12;"
96
25.0k
        "ror    $0x5,%10;"
97
25.0k
        "movdqa %%xmm5,%%xmm1;"
98
25.0k
        "xor    %3,%11;"
99
25.0k
        "xor    %8,%12;"
100
25.0k
        "paddd  %%xmm4,%%xmm0;"
101
25.0k
        "xor    %k2,%10;"
102
25.0k
        "and    %k2,%12;"
103
25.0k
        "ror    $0xb,%11;"
104
25.0k
        "palignr $0x4,%%xmm4,%%xmm1;"
105
25.0k
        "xor    %3,%11;"
106
25.0k
        "ror    $0x6,%10;"
107
25.0k
        "xor    %8,%12;"
108
25.0k
        "movdqa %%xmm1,%%xmm2;"
109
25.0k
        "ror    $0x2,%11;"
110
25.0k
        "add    %10,%12;"
111
25.0k
        "add    %16,%12;"
112
25.0k
        "movdqa %%xmm1,%%xmm3;"
113
25.0k
        "mov    %3,%10;"
114
25.0k
        "add    %12,%9;"
115
25.0k
        "mov    %3,%12;"
116
25.0k
        "pslld  $0x19,%%xmm1;"
117
25.0k
        "or     %5,%10;"
118
25.0k
        "add    %9,%6;"
119
25.0k
        "and    %5,%12;"
120
25.0k
        "psrld  $0x7,%%xmm2;"
121
25.0k
        "and    %4,%10;"
122
25.0k
        "add    %11,%9;"
123
25.0k
        "por    %%xmm2,%%xmm1;"
124
25.0k
        "or     %12,%10;"
125
25.0k
        "add    %10,%9;"
126
25.0k
        "movdqa %%xmm3,%%xmm2;"
127
25.0k
        "mov    %6,%10;"
128
25.0k
        "mov    %9,%11;"
129
25.0k
        "movdqa %%xmm3,%%xmm8;"
130
25.0k
        "ror    $0xe,%10;"
131
25.0k
        "xor    %6,%10;"
132
25.0k
        "mov    %k2,%12;"
133
25.0k
        "ror    $0x9,%11;"
134
25.0k
        "pslld  $0xe,%%xmm3;"
135
25.0k
        "xor    %9,%11;"
136
25.0k
        "ror    $0x5,%10;"
137
25.0k
        "xor    %7,%12;"
138
25.0k
        "psrld  $0x12,%%xmm2;"
139
25.0k
        "ror    $0xb,%11;"
140
25.0k
        "xor    %6,%10;"
141
25.0k
        "and    %6,%12;"
142
25.0k
        "ror    $0x6,%10;"
143
25.0k
        "pxor   %%xmm3,%%xmm1;"
144
25.0k
        "xor    %9,%11;"
145
25.0k
        "xor    %7,%12;"
146
25.0k
        "psrld  $0x3,%%xmm8;"
147
25.0k
        "add    %10,%12;"
148
25.0k
        "add    4+%16,%12;"
149
25.0k
        "ror    $0x2,%11;"
150
25.0k
        "pxor   %%xmm2,%%xmm1;"
151
25.0k
        "mov    %9,%10;"
152
25.0k
        "add    %12,%8;"
153
25.0k
        "mov    %9,%12;"
154
25.0k
        "pxor   %%xmm8,%%xmm1;"
155
25.0k
        "or     %4,%10;"
156
25.0k
        "add    %8,%5;"
157
25.0k
        "and    %4,%12;"
158
25.0k
        "pshufd $0xfa,%%xmm7,%%xmm2;"
159
25.0k
        "and    %3,%10;"
160
25.0k
        "add    %11,%8;"
161
25.0k
        "paddd  %%xmm1,%%xmm0;"
162
25.0k
        "or     %12,%10;"
163
25.0k
        "add    %10,%8;"
164
25.0k
        "movdqa %%xmm2,%%xmm3;"
165
25.0k
        "mov    %5,%10;"
166
25.0k
        "mov    %8,%11;"
167
25.0k
        "ror    $0xe,%10;"
168
25.0k
        "movdqa %%xmm2,%%xmm8;"
169
25.0k
        "xor    %5,%10;"
170
25.0k
        "ror    $0x9,%11;"
171
25.0k
        "mov    %6,%12;"
172
25.0k
        "xor    %8,%11;"
173
25.0k
        "ror    $0x5,%10;"
174
25.0k
        "psrlq  $0x11,%%xmm2;"
175
25.0k
        "xor    %k2,%12;"
176
25.0k
        "psrlq  $0x13,%%xmm3;"
177
25.0k
        "xor    %5,%10;"
178
25.0k
        "and    %5,%12;"
179
25.0k
        "psrld  $0xa,%%xmm8;"
180
25.0k
        "ror    $0xb,%11;"
181
25.0k
        "xor    %8,%11;"
182
25.0k
        "xor    %k2,%12;"
183
25.0k
        "ror    $0x6,%10;"
184
25.0k
        "pxor   %%xmm3,%%xmm2;"
185
25.0k
        "add    %10,%12;"
186
25.0k
        "ror    $0x2,%11;"
187
25.0k
        "add    8+%16,%12;"
188
25.0k
        "pxor   %%xmm2,%%xmm8;"
189
25.0k
        "mov    %8,%10;"
190
25.0k
        "add    %12,%7;"
191
25.0k
        "mov    %8,%12;"
192
25.0k
        "pshufb %%xmm10,%%xmm8;"
193
25.0k
        "or     %3,%10;"
194
25.0k
        "add    %7,%4;"
195
25.0k
        "and    %3,%12;"
196
25.0k
        "paddd  %%xmm8,%%xmm0;"
197
25.0k
        "and    %9,%10;"
198
25.0k
        "add    %11,%7;"
199
25.0k
        "pshufd $0x50,%%xmm0,%%xmm2;"
200
25.0k
        "or     %12,%10;"
201
25.0k
        "add    %10,%7;"
202
25.0k
        "movdqa %%xmm2,%%xmm3;"
203
25.0k
        "mov    %4,%10;"
204
25.0k
        "ror    $0xe,%10;"
205
25.0k
        "mov    %7,%11;"
206
25.0k
        "movdqa %%xmm2,%%xmm4;"
207
25.0k
        "ror    $0x9,%11;"
208
25.0k
        "xor    %4,%10;"
209
25.0k
        "mov    %5,%12;"
210
25.0k
        "ror    $0x5,%10;"
211
25.0k
        "psrlq  $0x11,%%xmm2;"
212
25.0k
        "xor    %7,%11;"
213
25.0k
        "xor    %6,%12;"
214
25.0k
        "psrlq  $0x13,%%xmm3;"
215
25.0k
        "xor    %4,%10;"
216
25.0k
        "and    %4,%12;"
217
25.0k
        "ror    $0xb,%11;"
218
25.0k
        "psrld  $0xa,%%xmm4;"
219
25.0k
        "xor    %7,%11;"
220
25.0k
        "ror    $0x6,%10;"
221
25.0k
        "xor    %6,%12;"
222
25.0k
        "pxor   %%xmm3,%%xmm2;"
223
25.0k
        "ror    $0x2,%11;"
224
25.0k
        "add    %10,%12;"
225
25.0k
        "add    12+%16,%12;"
226
25.0k
        "pxor   %%xmm2,%%xmm4;"
227
25.0k
        "mov    %7,%10;"
228
25.0k
        "add    %12,%k2;"
229
25.0k
        "mov    %7,%12;"
230
25.0k
        "pshufb %%xmm11,%%xmm4;"
231
25.0k
        "or     %9,%10;"
232
25.0k
        "add    %k2,%3;"
233
25.0k
        "and    %9,%12;"
234
25.0k
        "paddd  %%xmm0,%%xmm4;"
235
25.0k
        "and    %8,%10;"
236
25.0k
        "add    %11,%k2;"
237
25.0k
        "or     %12,%10;"
238
25.0k
        "add    %10,%k2;"
239
25.0k
        "movdqa 0x10(%13),%%xmm9;"
240
25.0k
        "paddd  %%xmm5,%%xmm9;"
241
25.0k
        "movdqa %%xmm9,%16;"
242
25.0k
        "movdqa %%xmm4,%%xmm0;"
243
25.0k
        "mov    %3,%10;"
244
25.0k
        "ror    $0xe,%10;"
245
25.0k
        "mov    %k2,%11;"
246
25.0k
        "palignr $0x4,%%xmm7,%%xmm0;"
247
25.0k
        "ror    $0x9,%11;"
248
25.0k
        "xor    %3,%10;"
249
25.0k
        "mov    %4,%12;"
250
25.0k
        "ror    $0x5,%10;"
251
25.0k
        "movdqa %%xmm6,%%xmm1;"
252
25.0k
        "xor    %k2,%11;"
253
25.0k
        "xor    %5,%12;"
254
25.0k
        "paddd  %%xmm5,%%xmm0;"
255
25.0k
        "xor    %3,%10;"
256
25.0k
        "and    %3,%12;"
257
25.0k
        "ror    $0xb,%11;"
258
25.0k
        "palignr $0x4,%%xmm5,%%xmm1;"
259
25.0k
        "xor    %k2,%11;"
260
25.0k
        "ror    $0x6,%10;"
261
25.0k
        "xor    %5,%12;"
262
25.0k
        "movdqa %%xmm1,%%xmm2;"
263
25.0k
        "ror    $0x2,%11;"
264
25.0k
        "add    %10,%12;"
265
25.0k
        "add    %16,%12;"
266
25.0k
        "movdqa %%xmm1,%%xmm3;"
267
25.0k
        "mov    %k2,%10;"
268
25.0k
        "add    %12,%6;"
269
25.0k
        "mov    %k2,%12;"
270
25.0k
        "pslld  $0x19,%%xmm1;"
271
25.0k
        "or     %8,%10;"
272
25.0k
        "add    %6,%9;"
273
25.0k
        "and    %8,%12;"
274
25.0k
        "psrld  $0x7,%%xmm2;"
275
25.0k
        "and    %7,%10;"
276
25.0k
        "add    %11,%6;"
277
25.0k
        "por    %%xmm2,%%xmm1;"
278
25.0k
        "or     %12,%10;"
279
25.0k
        "add    %10,%6;"
280
25.0k
        "movdqa %%xmm3,%%xmm2;"
281
25.0k
        "mov    %9,%10;"
282
25.0k
        "mov    %6,%11;"
283
25.0k
        "movdqa %%xmm3,%%xmm8;"
284
25.0k
        "ror    $0xe,%10;"
285
25.0k
        "xor    %9,%10;"
286
25.0k
        "mov    %3,%12;"
287
25.0k
        "ror    $0x9,%11;"
288
25.0k
        "pslld  $0xe,%%xmm3;"
289
25.0k
        "xor    %6,%11;"
290
25.0k
        "ror    $0x5,%10;"
291
25.0k
        "xor    %4,%12;"
292
25.0k
        "psrld  $0x12,%%xmm2;"
293
25.0k
        "ror    $0xb,%11;"
294
25.0k
        "xor    %9,%10;"
295
25.0k
        "and    %9,%12;"
296
25.0k
        "ror    $0x6,%10;"
297
25.0k
        "pxor   %%xmm3,%%xmm1;"
298
25.0k
        "xor    %6,%11;"
299
25.0k
        "xor    %4,%12;"
300
25.0k
        "psrld  $0x3,%%xmm8;"
301
25.0k
        "add    %10,%12;"
302
25.0k
        "add    4+%16,%12;"
303
25.0k
        "ror    $0x2,%11;"
304
25.0k
        "pxor   %%xmm2,%%xmm1;"
305
25.0k
        "mov    %6,%10;"
306
25.0k
        "add    %12,%5;"
307
25.0k
        "mov    %6,%12;"
308
25.0k
        "pxor   %%xmm8,%%xmm1;"
309
25.0k
        "or     %7,%10;"
310
25.0k
        "add    %5,%8;"
311
25.0k
        "and    %7,%12;"
312
25.0k
        "pshufd $0xfa,%%xmm4,%%xmm2;"
313
25.0k
        "and    %k2,%10;"
314
25.0k
        "add    %11,%5;"
315
25.0k
        "paddd  %%xmm1,%%xmm0;"
316
25.0k
        "or     %12,%10;"
317
25.0k
        "add    %10,%5;"
318
25.0k
        "movdqa %%xmm2,%%xmm3;"
319
25.0k
        "mov    %8,%10;"
320
25.0k
        "mov    %5,%11;"
321
25.0k
        "ror    $0xe,%10;"
322
25.0k
        "movdqa %%xmm2,%%xmm8;"
323
25.0k
        "xor    %8,%10;"
324
25.0k
        "ror    $0x9,%11;"
325
25.0k
        "mov    %9,%12;"
326
25.0k
        "xor    %5,%11;"
327
25.0k
        "ror    $0x5,%10;"
328
25.0k
        "psrlq  $0x11,%%xmm2;"
329
25.0k
        "xor    %3,%12;"
330
25.0k
        "psrlq  $0x13,%%xmm3;"
331
25.0k
        "xor    %8,%10;"
332
25.0k
        "and    %8,%12;"
333
25.0k
        "psrld  $0xa,%%xmm8;"
334
25.0k
        "ror    $0xb,%11;"
335
25.0k
        "xor    %5,%11;"
336
25.0k
        "xor    %3,%12;"
337
25.0k
        "ror    $0x6,%10;"
338
25.0k
        "pxor   %%xmm3,%%xmm2;"
339
25.0k
        "add    %10,%12;"
340
25.0k
        "ror    $0x2,%11;"
341
25.0k
        "add    8+%16,%12;"
342
25.0k
        "pxor   %%xmm2,%%xmm8;"
343
25.0k
        "mov    %5,%10;"
344
25.0k
        "add    %12,%4;"
345
25.0k
        "mov    %5,%12;"
346
25.0k
        "pshufb %%xmm10,%%xmm8;"
347
25.0k
        "or     %k2,%10;"
348
25.0k
        "add    %4,%7;"
349
25.0k
        "and    %k2,%12;"
350
25.0k
        "paddd  %%xmm8,%%xmm0;"
351
25.0k
        "and    %6,%10;"
352
25.0k
        "add    %11,%4;"
353
25.0k
        "pshufd $0x50,%%xmm0,%%xmm2;"
354
25.0k
        "or     %12,%10;"
355
25.0k
        "add    %10,%4;"
356
25.0k
        "movdqa %%xmm2,%%xmm3;"
357
25.0k
        "mov    %7,%10;"
358
25.0k
        "ror    $0xe,%10;"
359
25.0k
        "mov    %4,%11;"
360
25.0k
        "movdqa %%xmm2,%%xmm5;"
361
25.0k
        "ror    $0x9,%11;"
362
25.0k
        "xor    %7,%10;"
363
25.0k
        "mov    %8,%12;"
364
25.0k
        "ror    $0x5,%10;"
365
25.0k
        "psrlq  $0x11,%%xmm2;"
366
25.0k
        "xor    %4,%11;"
367
25.0k
        "xor    %9,%12;"
368
25.0k
        "psrlq  $0x13,%%xmm3;"
369
25.0k
        "xor    %7,%10;"
370
25.0k
        "and    %7,%12;"
371
25.0k
        "ror    $0xb,%11;"
372
25.0k
        "psrld  $0xa,%%xmm5;"
373
25.0k
        "xor    %4,%11;"
374
25.0k
        "ror    $0x6,%10;"
375
25.0k
        "xor    %9,%12;"
376
25.0k
        "pxor   %%xmm3,%%xmm2;"
377
25.0k
        "ror    $0x2,%11;"
378
25.0k
        "add    %10,%12;"
379
25.0k
        "add    12+%16,%12;"
380
25.0k
        "pxor   %%xmm2,%%xmm5;"
381
25.0k
        "mov    %4,%10;"
382
25.0k
        "add    %12,%3;"
383
25.0k
        "mov    %4,%12;"
384
25.0k
        "pshufb %%xmm11,%%xmm5;"
385
25.0k
        "or     %6,%10;"
386
25.0k
        "add    %3,%k2;"
387
25.0k
        "and    %6,%12;"
388
25.0k
        "paddd  %%xmm0,%%xmm5;"
389
25.0k
        "and    %5,%10;"
390
25.0k
        "add    %11,%3;"
391
25.0k
        "or     %12,%10;"
392
25.0k
        "add    %10,%3;"
393
25.0k
        "movdqa 0x20(%13),%%xmm9;"
394
25.0k
        "paddd  %%xmm6,%%xmm9;"
395
25.0k
        "movdqa %%xmm9,%16;"
396
25.0k
        "movdqa %%xmm5,%%xmm0;"
397
25.0k
        "mov    %k2,%10;"
398
25.0k
        "ror    $0xe,%10;"
399
25.0k
        "mov    %3,%11;"
400
25.0k
        "palignr $0x4,%%xmm4,%%xmm0;"
401
25.0k
        "ror    $0x9,%11;"
402
25.0k
        "xor    %k2,%10;"
403
25.0k
        "mov    %7,%12;"
404
25.0k
        "ror    $0x5,%10;"
405
25.0k
        "movdqa %%xmm7,%%xmm1;"
406
25.0k
        "xor    %3,%11;"
407
25.0k
        "xor    %8,%12;"
408
25.0k
        "paddd  %%xmm6,%%xmm0;"
409
25.0k
        "xor    %k2,%10;"
410
25.0k
        "and    %k2,%12;"
411
25.0k
        "ror    $0xb,%11;"
412
25.0k
        "palignr $0x4,%%xmm6,%%xmm1;"
413
25.0k
        "xor    %3,%11;"
414
25.0k
        "ror    $0x6,%10;"
415
25.0k
        "xor    %8,%12;"
416
25.0k
        "movdqa %%xmm1,%%xmm2;"
417
25.0k
        "ror    $0x2,%11;"
418
25.0k
        "add    %10,%12;"
419
25.0k
        "add    %16,%12;"
420
25.0k
        "movdqa %%xmm1,%%xmm3;"
421
25.0k
        "mov    %3,%10;"
422
25.0k
        "add    %12,%9;"
423
25.0k
        "mov    %3,%12;"
424
25.0k
        "pslld  $0x19,%%xmm1;"
425
25.0k
        "or     %5,%10;"
426
25.0k
        "add    %9,%6;"
427
25.0k
        "and    %5,%12;"
428
25.0k
        "psrld  $0x7,%%xmm2;"
429
25.0k
        "and    %4,%10;"
430
25.0k
        "add    %11,%9;"
431
25.0k
        "por    %%xmm2,%%xmm1;"
432
25.0k
        "or     %12,%10;"
433
25.0k
        "add    %10,%9;"
434
25.0k
        "movdqa %%xmm3,%%xmm2;"
435
25.0k
        "mov    %6,%10;"
436
25.0k
        "mov    %9,%11;"
437
25.0k
        "movdqa %%xmm3,%%xmm8;"
438
25.0k
        "ror    $0xe,%10;"
439
25.0k
        "xor    %6,%10;"
440
25.0k
        "mov    %k2,%12;"
441
25.0k
        "ror    $0x9,%11;"
442
25.0k
        "pslld  $0xe,%%xmm3;"
443
25.0k
        "xor    %9,%11;"
444
25.0k
        "ror    $0x5,%10;"
445
25.0k
        "xor    %7,%12;"
446
25.0k
        "psrld  $0x12,%%xmm2;"
447
25.0k
        "ror    $0xb,%11;"
448
25.0k
        "xor    %6,%10;"
449
25.0k
        "and    %6,%12;"
450
25.0k
        "ror    $0x6,%10;"
451
25.0k
        "pxor   %%xmm3,%%xmm1;"
452
25.0k
        "xor    %9,%11;"
453
25.0k
        "xor    %7,%12;"
454
25.0k
        "psrld  $0x3,%%xmm8;"
455
25.0k
        "add    %10,%12;"
456
25.0k
        "add    4+%16,%12;"
457
25.0k
        "ror    $0x2,%11;"
458
25.0k
        "pxor   %%xmm2,%%xmm1;"
459
25.0k
        "mov    %9,%10;"
460
25.0k
        "add    %12,%8;"
461
25.0k
        "mov    %9,%12;"
462
25.0k
        "pxor   %%xmm8,%%xmm1;"
463
25.0k
        "or     %4,%10;"
464
25.0k
        "add    %8,%5;"
465
25.0k
        "and    %4,%12;"
466
25.0k
        "pshufd $0xfa,%%xmm5,%%xmm2;"
467
25.0k
        "and    %3,%10;"
468
25.0k
        "add    %11,%8;"
469
25.0k
        "paddd  %%xmm1,%%xmm0;"
470
25.0k
        "or     %12,%10;"
471
25.0k
        "add    %10,%8;"
472
25.0k
        "movdqa %%xmm2,%%xmm3;"
473
25.0k
        "mov    %5,%10;"
474
25.0k
        "mov    %8,%11;"
475
25.0k
        "ror    $0xe,%10;"
476
25.0k
        "movdqa %%xmm2,%%xmm8;"
477
25.0k
        "xor    %5,%10;"
478
25.0k
        "ror    $0x9,%11;"
479
25.0k
        "mov    %6,%12;"
480
25.0k
        "xor    %8,%11;"
481
25.0k
        "ror    $0x5,%10;"
482
25.0k
        "psrlq  $0x11,%%xmm2;"
483
25.0k
        "xor    %k2,%12;"
484
25.0k
        "psrlq  $0x13,%%xmm3;"
485
25.0k
        "xor    %5,%10;"
486
25.0k
        "and    %5,%12;"
487
25.0k
        "psrld  $0xa,%%xmm8;"
488
25.0k
        "ror    $0xb,%11;"
489
25.0k
        "xor    %8,%11;"
490
25.0k
        "xor    %k2,%12;"
491
25.0k
        "ror    $0x6,%10;"
492
25.0k
        "pxor   %%xmm3,%%xmm2;"
493
25.0k
        "add    %10,%12;"
494
25.0k
        "ror    $0x2,%11;"
495
25.0k
        "add    8+%16,%12;"
496
25.0k
        "pxor   %%xmm2,%%xmm8;"
497
25.0k
        "mov    %8,%10;"
498
25.0k
        "add    %12,%7;"
499
25.0k
        "mov    %8,%12;"
500
25.0k
        "pshufb %%xmm10,%%xmm8;"
501
25.0k
        "or     %3,%10;"
502
25.0k
        "add    %7,%4;"
503
25.0k
        "and    %3,%12;"
504
25.0k
        "paddd  %%xmm8,%%xmm0;"
505
25.0k
        "and    %9,%10;"
506
25.0k
        "add    %11,%7;"
507
25.0k
        "pshufd $0x50,%%xmm0,%%xmm2;"
508
25.0k
        "or     %12,%10;"
509
25.0k
        "add    %10,%7;"
510
25.0k
        "movdqa %%xmm2,%%xmm3;"
511
25.0k
        "mov    %4,%10;"
512
25.0k
        "ror    $0xe,%10;"
513
25.0k
        "mov    %7,%11;"
514
25.0k
        "movdqa %%xmm2,%%xmm6;"
515
25.0k
        "ror    $0x9,%11;"
516
25.0k
        "xor    %4,%10;"
517
25.0k
        "mov    %5,%12;"
518
25.0k
        "ror    $0x5,%10;"
519
25.0k
        "psrlq  $0x11,%%xmm2;"
520
25.0k
        "xor    %7,%11;"
521
25.0k
        "xor    %6,%12;"
522
25.0k
        "psrlq  $0x13,%%xmm3;"
523
25.0k
        "xor    %4,%10;"
524
25.0k
        "and    %4,%12;"
525
25.0k
        "ror    $0xb,%11;"
526
25.0k
        "psrld  $0xa,%%xmm6;"
527
25.0k
        "xor    %7,%11;"
528
25.0k
        "ror    $0x6,%10;"
529
25.0k
        "xor    %6,%12;"
530
25.0k
        "pxor   %%xmm3,%%xmm2;"
531
25.0k
        "ror    $0x2,%11;"
532
25.0k
        "add    %10,%12;"
533
25.0k
        "add    12+%16,%12;"
534
25.0k
        "pxor   %%xmm2,%%xmm6;"
535
25.0k
        "mov    %7,%10;"
536
25.0k
        "add    %12,%k2;"
537
25.0k
        "mov    %7,%12;"
538
25.0k
        "pshufb %%xmm11,%%xmm6;"
539
25.0k
        "or     %9,%10;"
540
25.0k
        "add    %k2,%3;"
541
25.0k
        "and    %9,%12;"
542
25.0k
        "paddd  %%xmm0,%%xmm6;"
543
25.0k
        "and    %8,%10;"
544
25.0k
        "add    %11,%k2;"
545
25.0k
        "or     %12,%10;"
546
25.0k
        "add    %10,%k2;"
547
25.0k
        "movdqa 0x30(%13),%%xmm9;"
548
25.0k
        "paddd  %%xmm7,%%xmm9;"
549
25.0k
        "movdqa %%xmm9,%16;"
550
25.0k
        "add    $0x40,%13;"
551
25.0k
        "movdqa %%xmm6,%%xmm0;"
552
25.0k
        "mov    %3,%10;"
553
25.0k
        "ror    $0xe,%10;"
554
25.0k
        "mov    %k2,%11;"
555
25.0k
        "palignr $0x4,%%xmm5,%%xmm0;"
556
25.0k
        "ror    $0x9,%11;"
557
25.0k
        "xor    %3,%10;"
558
25.0k
        "mov    %4,%12;"
559
25.0k
        "ror    $0x5,%10;"
560
25.0k
        "movdqa %%xmm4,%%xmm1;"
561
25.0k
        "xor    %k2,%11;"
562
25.0k
        "xor    %5,%12;"
563
25.0k
        "paddd  %%xmm7,%%xmm0;"
564
25.0k
        "xor    %3,%10;"
565
25.0k
        "and    %3,%12;"
566
25.0k
        "ror    $0xb,%11;"
567
25.0k
        "palignr $0x4,%%xmm7,%%xmm1;"
568
25.0k
        "xor    %k2,%11;"
569
25.0k
        "ror    $0x6,%10;"
570
25.0k
        "xor    %5,%12;"
571
25.0k
        "movdqa %%xmm1,%%xmm2;"
572
25.0k
        "ror    $0x2,%11;"
573
25.0k
        "add    %10,%12;"
574
25.0k
        "add    %16,%12;"
575
25.0k
        "movdqa %%xmm1,%%xmm3;"
576
25.0k
        "mov    %k2,%10;"
577
25.0k
        "add    %12,%6;"
578
25.0k
        "mov    %k2,%12;"
579
25.0k
        "pslld  $0x19,%%xmm1;"
580
25.0k
        "or     %8,%10;"
581
25.0k
        "add    %6,%9;"
582
25.0k
        "and    %8,%12;"
583
25.0k
        "psrld  $0x7,%%xmm2;"
584
25.0k
        "and    %7,%10;"
585
25.0k
        "add    %11,%6;"
586
25.0k
        "por    %%xmm2,%%xmm1;"
587
25.0k
        "or     %12,%10;"
588
25.0k
        "add    %10,%6;"
589
25.0k
        "movdqa %%xmm3,%%xmm2;"
590
25.0k
        "mov    %9,%10;"
591
25.0k
        "mov    %6,%11;"
592
25.0k
        "movdqa %%xmm3,%%xmm8;"
593
25.0k
        "ror    $0xe,%10;"
594
25.0k
        "xor    %9,%10;"
595
25.0k
        "mov    %3,%12;"
596
25.0k
        "ror    $0x9,%11;"
597
25.0k
        "pslld  $0xe,%%xmm3;"
598
25.0k
        "xor    %6,%11;"
599
25.0k
        "ror    $0x5,%10;"
600
25.0k
        "xor    %4,%12;"
601
25.0k
        "psrld  $0x12,%%xmm2;"
602
25.0k
        "ror    $0xb,%11;"
603
25.0k
        "xor    %9,%10;"
604
25.0k
        "and    %9,%12;"
605
25.0k
        "ror    $0x6,%10;"
606
25.0k
        "pxor   %%xmm3,%%xmm1;"
607
25.0k
        "xor    %6,%11;"
608
25.0k
        "xor    %4,%12;"
609
25.0k
        "psrld  $0x3,%%xmm8;"
610
25.0k
        "add    %10,%12;"
611
25.0k
        "add    4+%16,%12;"
612
25.0k
        "ror    $0x2,%11;"
613
25.0k
        "pxor   %%xmm2,%%xmm1;"
614
25.0k
        "mov    %6,%10;"
615
25.0k
        "add    %12,%5;"
616
25.0k
        "mov    %6,%12;"
617
25.0k
        "pxor   %%xmm8,%%xmm1;"
618
25.0k
        "or     %7,%10;"
619
25.0k
        "add    %5,%8;"
620
25.0k
        "and    %7,%12;"
621
25.0k
        "pshufd $0xfa,%%xmm6,%%xmm2;"
622
25.0k
        "and    %k2,%10;"
623
25.0k
        "add    %11,%5;"
624
25.0k
        "paddd  %%xmm1,%%xmm0;"
625
25.0k
        "or     %12,%10;"
626
25.0k
        "add    %10,%5;"
627
25.0k
        "movdqa %%xmm2,%%xmm3;"
628
25.0k
        "mov    %8,%10;"
629
25.0k
        "mov    %5,%11;"
630
25.0k
        "ror    $0xe,%10;"
631
25.0k
        "movdqa %%xmm2,%%xmm8;"
632
25.0k
        "xor    %8,%10;"
633
25.0k
        "ror    $0x9,%11;"
634
25.0k
        "mov    %9,%12;"
635
25.0k
        "xor    %5,%11;"
636
25.0k
        "ror    $0x5,%10;"
637
25.0k
        "psrlq  $0x11,%%xmm2;"
638
25.0k
        "xor    %3,%12;"
639
25.0k
        "psrlq  $0x13,%%xmm3;"
640
25.0k
        "xor    %8,%10;"
641
25.0k
        "and    %8,%12;"
642
25.0k
        "psrld  $0xa,%%xmm8;"
643
25.0k
        "ror    $0xb,%11;"
644
25.0k
        "xor    %5,%11;"
645
25.0k
        "xor    %3,%12;"
646
25.0k
        "ror    $0x6,%10;"
647
25.0k
        "pxor   %%xmm3,%%xmm2;"
648
25.0k
        "add    %10,%12;"
649
25.0k
        "ror    $0x2,%11;"
650
25.0k
        "add    8+%16,%12;"
651
25.0k
        "pxor   %%xmm2,%%xmm8;"
652
25.0k
        "mov    %5,%10;"
653
25.0k
        "add    %12,%4;"
654
25.0k
        "mov    %5,%12;"
655
25.0k
        "pshufb %%xmm10,%%xmm8;"
656
25.0k
        "or     %k2,%10;"
657
25.0k
        "add    %4,%7;"
658
25.0k
        "and    %k2,%12;"
659
25.0k
        "paddd  %%xmm8,%%xmm0;"
660
25.0k
        "and    %6,%10;"
661
25.0k
        "add    %11,%4;"
662
25.0k
        "pshufd $0x50,%%xmm0,%%xmm2;"
663
25.0k
        "or     %12,%10;"
664
25.0k
        "add    %10,%4;"
665
25.0k
        "movdqa %%xmm2,%%xmm3;"
666
25.0k
        "mov    %7,%10;"
667
25.0k
        "ror    $0xe,%10;"
668
25.0k
        "mov    %4,%11;"
669
25.0k
        "movdqa %%xmm2,%%xmm7;"
670
25.0k
        "ror    $0x9,%11;"
671
25.0k
        "xor    %7,%10;"
672
25.0k
        "mov    %8,%12;"
673
25.0k
        "ror    $0x5,%10;"
674
25.0k
        "psrlq  $0x11,%%xmm2;"
675
25.0k
        "xor    %4,%11;"
676
25.0k
        "xor    %9,%12;"
677
25.0k
        "psrlq  $0x13,%%xmm3;"
678
25.0k
        "xor    %7,%10;"
679
25.0k
        "and    %7,%12;"
680
25.0k
        "ror    $0xb,%11;"
681
25.0k
        "psrld  $0xa,%%xmm7;"
682
25.0k
        "xor    %4,%11;"
683
25.0k
        "ror    $0x6,%10;"
684
25.0k
        "xor    %9,%12;"
685
25.0k
        "pxor   %%xmm3,%%xmm2;"
686
25.0k
        "ror    $0x2,%11;"
687
25.0k
        "add    %10,%12;"
688
25.0k
        "add    12+%16,%12;"
689
25.0k
        "pxor   %%xmm2,%%xmm7;"
690
25.0k
        "mov    %4,%10;"
691
25.0k
        "add    %12,%3;"
692
25.0k
        "mov    %4,%12;"
693
25.0k
        "pshufb %%xmm11,%%xmm7;"
694
25.0k
        "or     %6,%10;"
695
25.0k
        "add    %3,%k2;"
696
25.0k
        "and    %6,%12;"
697
25.0k
        "paddd  %%xmm0,%%xmm7;"
698
25.0k
        "and    %5,%10;"
699
25.0k
        "add    %11,%3;"
700
25.0k
        "or     %12,%10;"
701
25.0k
        "add    %10,%3;"
702
25.0k
        "sub    $0x1,%1;"
703
25.0k
        "jne    Lloop1_%=;"
704
25.0k
        "mov    $0x2,%1;"
705
706
25.0k
        "Lloop2_%=:"
707
25.0k
        "paddd  0x0(%13),%%xmm4;"
708
25.0k
        "movdqa %%xmm4,%16;"
709
25.0k
        "mov    %k2,%10;"
710
25.0k
        "ror    $0xe,%10;"
711
25.0k
        "mov    %3,%11;"
712
25.0k
        "xor    %k2,%10;"
713
25.0k
        "ror    $0x9,%11;"
714
25.0k
        "mov    %7,%12;"
715
25.0k
        "xor    %3,%11;"
716
25.0k
        "ror    $0x5,%10;"
717
25.0k
        "xor    %8,%12;"
718
25.0k
        "xor    %k2,%10;"
719
25.0k
        "ror    $0xb,%11;"
720
25.0k
        "and    %k2,%12;"
721
25.0k
        "xor    %3,%11;"
722
25.0k
        "ror    $0x6,%10;"
723
25.0k
        "xor    %8,%12;"
724
25.0k
        "add    %10,%12;"
725
25.0k
        "ror    $0x2,%11;"
726
25.0k
        "add    %16,%12;"
727
25.0k
        "mov    %3,%10;"
728
25.0k
        "add    %12,%9;"
729
25.0k
        "mov    %3,%12;"
730
25.0k
        "or     %5,%10;"
731
25.0k
        "add    %9,%6;"
732
25.0k
        "and    %5,%12;"
733
25.0k
        "and    %4,%10;"
734
25.0k
        "add    %11,%9;"
735
25.0k
        "or     %12,%10;"
736
25.0k
        "add    %10,%9;"
737
25.0k
        "mov    %6,%10;"
738
25.0k
        "ror    $0xe,%10;"
739
25.0k
        "mov    %9,%11;"
740
25.0k
        "xor    %6,%10;"
741
25.0k
        "ror    $0x9,%11;"
742
25.0k
        "mov    %k2,%12;"
743
25.0k
        "xor    %9,%11;"
744
25.0k
        "ror    $0x5,%10;"
745
25.0k
        "xor    %7,%12;"
746
25.0k
        "xor    %6,%10;"
747
25.0k
        "ror    $0xb,%11;"
748
25.0k
        "and    %6,%12;"
749
25.0k
        "xor    %9,%11;"
750
25.0k
        "ror    $0x6,%10;"
751
25.0k
        "xor    %7,%12;"
752
25.0k
        "add    %10,%12;"
753
25.0k
        "ror    $0x2,%11;"
754
25.0k
        "add    4+%16,%12;"
755
25.0k
        "mov    %9,%10;"
756
25.0k
        "add    %12,%8;"
757
25.0k
        "mov    %9,%12;"
758
25.0k
        "or     %4,%10;"
759
25.0k
        "add    %8,%5;"
760
25.0k
        "and    %4,%12;"
761
25.0k
        "and    %3,%10;"
762
25.0k
        "add    %11,%8;"
763
25.0k
        "or     %12,%10;"
764
25.0k
        "add    %10,%8;"
765
25.0k
        "mov    %5,%10;"
766
25.0k
        "ror    $0xe,%10;"
767
25.0k
        "mov    %8,%11;"
768
25.0k
        "xor    %5,%10;"
769
25.0k
        "ror    $0x9,%11;"
770
25.0k
        "mov    %6,%12;"
771
25.0k
        "xor    %8,%11;"
772
25.0k
        "ror    $0x5,%10;"
773
25.0k
        "xor    %k2,%12;"
774
25.0k
        "xor    %5,%10;"
775
25.0k
        "ror    $0xb,%11;"
776
25.0k
        "and    %5,%12;"
777
25.0k
        "xor    %8,%11;"
778
25.0k
        "ror    $0x6,%10;"
779
25.0k
        "xor    %k2,%12;"
780
25.0k
        "add    %10,%12;"
781
25.0k
        "ror    $0x2,%11;"
782
25.0k
        "add    8+%16,%12;"
783
25.0k
        "mov    %8,%10;"
784
25.0k
        "add    %12,%7;"
785
25.0k
        "mov    %8,%12;"
786
25.0k
        "or     %3,%10;"
787
25.0k
        "add    %7,%4;"
788
25.0k
        "and    %3,%12;"
789
25.0k
        "and    %9,%10;"
790
25.0k
        "add    %11,%7;"
791
25.0k
        "or     %12,%10;"
792
25.0k
        "add    %10,%7;"
793
25.0k
        "mov    %4,%10;"
794
25.0k
        "ror    $0xe,%10;"
795
25.0k
        "mov    %7,%11;"
796
25.0k
        "xor    %4,%10;"
797
25.0k
        "ror    $0x9,%11;"
798
25.0k
        "mov    %5,%12;"
799
25.0k
        "xor    %7,%11;"
800
25.0k
        "ror    $0x5,%10;"
801
25.0k
        "xor    %6,%12;"
802
25.0k
        "xor    %4,%10;"
803
25.0k
        "ror    $0xb,%11;"
804
25.0k
        "and    %4,%12;"
805
25.0k
        "xor    %7,%11;"
806
25.0k
        "ror    $0x6,%10;"
807
25.0k
        "xor    %6,%12;"
808
25.0k
        "add    %10,%12;"
809
25.0k
        "ror    $0x2,%11;"
810
25.0k
        "add    12+%16,%12;"
811
25.0k
        "mov    %7,%10;"
812
25.0k
        "add    %12,%k2;"
813
25.0k
        "mov    %7,%12;"
814
25.0k
        "or     %9,%10;"
815
25.0k
        "add    %k2,%3;"
816
25.0k
        "and    %9,%12;"
817
25.0k
        "and    %8,%10;"
818
25.0k
        "add    %11,%k2;"
819
25.0k
        "or     %12,%10;"
820
25.0k
        "add    %10,%k2;"
821
25.0k
        "paddd  0x10(%13),%%xmm5;"
822
25.0k
        "movdqa %%xmm5,%16;"
823
25.0k
        "add    $0x20,%13;"
824
25.0k
        "mov    %3,%10;"
825
25.0k
        "ror    $0xe,%10;"
826
25.0k
        "mov    %k2,%11;"
827
25.0k
        "xor    %3,%10;"
828
25.0k
        "ror    $0x9,%11;"
829
25.0k
        "mov    %4,%12;"
830
25.0k
        "xor    %k2,%11;"
831
25.0k
        "ror    $0x5,%10;"
832
25.0k
        "xor    %5,%12;"
833
25.0k
        "xor    %3,%10;"
834
25.0k
        "ror    $0xb,%11;"
835
25.0k
        "and    %3,%12;"
836
25.0k
        "xor    %k2,%11;"
837
25.0k
        "ror    $0x6,%10;"
838
25.0k
        "xor    %5,%12;"
839
25.0k
        "add    %10,%12;"
840
25.0k
        "ror    $0x2,%11;"
841
25.0k
        "add    %16,%12;"
842
25.0k
        "mov    %k2,%10;"
843
25.0k
        "add    %12,%6;"
844
25.0k
        "mov    %k2,%12;"
845
25.0k
        "or     %8,%10;"
846
25.0k
        "add    %6,%9;"
847
25.0k
        "and    %8,%12;"
848
25.0k
        "and    %7,%10;"
849
25.0k
        "add    %11,%6;"
850
25.0k
        "or     %12,%10;"
851
25.0k
        "add    %10,%6;"
852
25.0k
        "mov    %9,%10;"
853
25.0k
        "ror    $0xe,%10;"
854
25.0k
        "mov    %6,%11;"
855
25.0k
        "xor    %9,%10;"
856
25.0k
        "ror    $0x9,%11;"
857
25.0k
        "mov    %3,%12;"
858
25.0k
        "xor    %6,%11;"
859
25.0k
        "ror    $0x5,%10;"
860
25.0k
        "xor    %4,%12;"
861
25.0k
        "xor    %9,%10;"
862
25.0k
        "ror    $0xb,%11;"
863
25.0k
        "and    %9,%12;"
864
25.0k
        "xor    %6,%11;"
865
25.0k
        "ror    $0x6,%10;"
866
25.0k
        "xor    %4,%12;"
867
25.0k
        "add    %10,%12;"
868
25.0k
        "ror    $0x2,%11;"
869
25.0k
        "add    4+%16,%12;"
870
25.0k
        "mov    %6,%10;"
871
25.0k
        "add    %12,%5;"
872
25.0k
        "mov    %6,%12;"
873
25.0k
        "or     %7,%10;"
874
25.0k
        "add    %5,%8;"
875
25.0k
        "and    %7,%12;"
876
25.0k
        "and    %k2,%10;"
877
25.0k
        "add    %11,%5;"
878
25.0k
        "or     %12,%10;"
879
25.0k
        "add    %10,%5;"
880
25.0k
        "mov    %8,%10;"
881
25.0k
        "ror    $0xe,%10;"
882
25.0k
        "mov    %5,%11;"
883
25.0k
        "xor    %8,%10;"
884
25.0k
        "ror    $0x9,%11;"
885
25.0k
        "mov    %9,%12;"
886
25.0k
        "xor    %5,%11;"
887
25.0k
        "ror    $0x5,%10;"
888
25.0k
        "xor    %3,%12;"
889
25.0k
        "xor    %8,%10;"
890
25.0k
        "ror    $0xb,%11;"
891
25.0k
        "and    %8,%12;"
892
25.0k
        "xor    %5,%11;"
893
25.0k
        "ror    $0x6,%10;"
894
25.0k
        "xor    %3,%12;"
895
25.0k
        "add    %10,%12;"
896
25.0k
        "ror    $0x2,%11;"
897
25.0k
        "add    8+%16,%12;"
898
25.0k
        "mov    %5,%10;"
899
25.0k
        "add    %12,%4;"
900
25.0k
        "mov    %5,%12;"
901
25.0k
        "or     %k2,%10;"
902
25.0k
        "add    %4,%7;"
903
25.0k
        "and    %k2,%12;"
904
25.0k
        "and    %6,%10;"
905
25.0k
        "add    %11,%4;"
906
25.0k
        "or     %12,%10;"
907
25.0k
        "add    %10,%4;"
908
25.0k
        "mov    %7,%10;"
909
25.0k
        "ror    $0xe,%10;"
910
25.0k
        "mov    %4,%11;"
911
25.0k
        "xor    %7,%10;"
912
25.0k
        "ror    $0x9,%11;"
913
25.0k
        "mov    %8,%12;"
914
25.0k
        "xor    %4,%11;"
915
25.0k
        "ror    $0x5,%10;"
916
25.0k
        "xor    %9,%12;"
917
25.0k
        "xor    %7,%10;"
918
25.0k
        "ror    $0xb,%11;"
919
25.0k
        "and    %7,%12;"
920
25.0k
        "xor    %4,%11;"
921
25.0k
        "ror    $0x6,%10;"
922
25.0k
        "xor    %9,%12;"
923
25.0k
        "add    %10,%12;"
924
25.0k
        "ror    $0x2,%11;"
925
25.0k
        "add    12+%16,%12;"
926
25.0k
        "mov    %4,%10;"
927
25.0k
        "add    %12,%3;"
928
25.0k
        "mov    %4,%12;"
929
25.0k
        "or     %6,%10;"
930
25.0k
        "add    %3,%k2;"
931
25.0k
        "and    %6,%12;"
932
25.0k
        "and    %5,%10;"
933
25.0k
        "add    %11,%3;"
934
25.0k
        "or     %12,%10;"
935
25.0k
        "add    %10,%3;"
936
25.0k
        "movdqa %%xmm6,%%xmm4;"
937
25.0k
        "movdqa %%xmm7,%%xmm5;"
938
25.0k
        "sub    $0x1,%1;"
939
25.0k
        "jne    Lloop2_%=;"
940
25.0k
        "add    (%0),%3;"
941
25.0k
        "mov    %3,(%0);"
942
25.0k
        "add    0x4(%0),%4;"
943
25.0k
        "mov    %4,0x4(%0);"
944
25.0k
        "add    0x8(%0),%5;"
945
25.0k
        "mov    %5,0x8(%0);"
946
25.0k
        "add    0xc(%0),%6;"
947
25.0k
        "mov    %6,0xc(%0);"
948
25.0k
        "add    0x10(%0),%k2;"
949
25.0k
        "mov    %k2,0x10(%0);"
950
25.0k
        "add    0x14(%0),%7;"
951
25.0k
        "mov    %7,0x14(%0);"
952
25.0k
        "add    0x18(%0),%8;"
953
25.0k
        "mov    %8,0x18(%0);"
954
25.0k
        "add    0x1c(%0),%9;"
955
25.0k
        "mov    %9,0x1c(%0);"
956
25.0k
        "mov    %15,%1;"
957
25.0k
        "add    $0x40,%1;"
958
25.0k
        "cmp    %14,%1;"
959
25.0k
        "jne    Lloop0_%=;"
960
961
25.0k
        "Ldone_hash_%=:"
962
963
25.0k
        : "+r"(s), "+r"(chunk), "+r"(blocks), "=r"(a), "=r"(b), "=r"(c), "=r"(d), /* e = chunk */ "=r"(f), "=r"(g), "=r"(h), "=r"(y0), "=r"(y1), "=r"(y2), "=r"(tbl), "+m"(inp_end), "+m"(inp), "+m"(xfer)
964
25.0k
        : "m"(K256), "m"(FLIP_MASK), "m"(SHUF_00BA), "m"(SHUF_DC00)
965
25.0k
        : "cc", "memory", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12"
966
25.0k
   );
967
25.0k
}
968
}
969
970
/*
971
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
972
; Copyright (c) 2012, Intel Corporation
973
;
974
; All rights reserved.
975
;
976
; Redistribution and use in source and binary forms, with or without
977
; modification, are permitted provided that the following conditions are
978
; met:
979
;
980
; * Redistributions of source code must retain the above copyright
981
;   notice, this list of conditions and the following disclaimer.
982
;
983
; * Redistributions in binary form must reproduce the above copyright
984
;   notice, this list of conditions and the following disclaimer in the
985
;   documentation and/or other materials provided with the
986
;   distribution.
987
;
988
; * Neither the name of the Intel Corporation nor the names of its
989
;   contributors may be used to endorse or promote products derived from
990
;   this software without specific prior written permission.
991
;
992
;
993
; THIS SOFTWARE IS PROVIDED BY INTEL CORPORATION "AS IS" AND ANY
994
; EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
995
; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
996
; PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL CORPORATION OR
997
; CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
998
; EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
999
; PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
1000
; PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
1001
; LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
1002
; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
1003
; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
1004
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
1005
;
1006
; Example YASM command lines:
1007
; Windows:  yasm -Xvc -f x64 -rnasm -pnasm -o sha256_sse4.obj -g cv8 sha256_sse4.asm
1008
; Linux:    yasm -f x64 -f elf64 -X gnu -g dwarf2 -D LINUX -o sha256_sse4.o sha256_sse4.asm
1009
;
1010
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
1011
;
1012
; This code is described in an Intel White-Paper:
1013
; "Fast SHA-256 Implementations on Intel Architecture Processors"
1014
;
1015
; To find it, surf to https://www.intel.com/p/en_US/embedded
1016
; and search for that title.
1017
; The paper is expected to be released roughly at the end of April, 2012
1018
;
1019
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
1020
; This code schedules 1 block at a time, with 4 lanes per block
1021
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
1022
1023
%define MOVDQ movdqu ;; assume buffers not aligned
1024
1025
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; Define Macros
1026
1027
; addm [mem], reg
1028
; Add reg to mem using reg-mem add and store
1029
%macro addm 2
1030
    add %2, %1
1031
    mov %1, %2
1032
%endm
1033
1034
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
1035
1036
; COPY_XMM_AND_BSWAP xmm, [mem], byte_flip_mask
1037
; Load xmm with mem and byte swap each dword
1038
%macro COPY_XMM_AND_BSWAP 3
1039
    MOVDQ %1, %2
1040
    pshufb %1, %3
1041
%endmacro
1042
1043
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
1044
1045
%define X0 xmm4
1046
%define X1 xmm5
1047
%define X2 xmm6
1048
%define X3 xmm7
1049
1050
%define XTMP0 xmm0
1051
%define XTMP1 xmm1
1052
%define XTMP2 xmm2
1053
%define XTMP3 xmm3
1054
%define XTMP4 xmm8
1055
%define XFER  xmm9
1056
1057
%define SHUF_00BA   xmm10 ; shuffle xBxA -> 00BA
1058
%define SHUF_DC00   xmm11 ; shuffle xDxC -> DC00
1059
%define BYTE_FLIP_MASK  xmm12
1060
1061
%ifdef LINUX
1062
%define NUM_BLKS rdx    ; 3rd arg
1063
%define CTX rsi ; 2nd arg
1064
%define INP rdi ; 1st arg
1065
1066
%define SRND    rdi ; clobbers INP
1067
%define c   ecx
1068
%define d   r8d
1069
%define e   edx
1070
%else
1071
%define NUM_BLKS r8 ; 3rd arg
1072
%define CTX rdx     ; 2nd arg
1073
%define INP rcx     ; 1st arg
1074
1075
%define SRND    rcx ; clobbers INP
1076
%define c   edi
1077
%define d   esi
1078
%define e   r8d
1079
1080
%endif
1081
%define TBL rbp
1082
%define a eax
1083
%define b ebx
1084
1085
%define f r9d
1086
%define g r10d
1087
%define h r11d
1088
1089
%define y0 r13d
1090
%define y1 r14d
1091
%define y2 r15d
1092
1093
1094
1095
_INP_END_SIZE   equ 8
1096
_INP_SIZE   equ 8
1097
_XFER_SIZE  equ 8
1098
%ifdef LINUX
1099
_XMM_SAVE_SIZE  equ 0
1100
%else
1101
_XMM_SAVE_SIZE  equ 7*16
1102
%endif
1103
; STACK_SIZE plus pushes must be an odd multiple of 8
1104
_ALIGN_SIZE equ 8
1105
1106
_INP_END    equ 0
1107
_INP        equ _INP_END  + _INP_END_SIZE
1108
_XFER       equ _INP      + _INP_SIZE
1109
_XMM_SAVE   equ _XFER     + _XFER_SIZE + _ALIGN_SIZE
1110
STACK_SIZE  equ _XMM_SAVE + _XMM_SAVE_SIZE
1111
1112
; rotate_Xs
1113
; Rotate values of symbols X0...X3
1114
%macro rotate_Xs 0
1115
%xdefine X_ X0
1116
%xdefine X0 X1
1117
%xdefine X1 X2
1118
%xdefine X2 X3
1119
%xdefine X3 X_
1120
%endm
1121
1122
; ROTATE_ARGS
1123
; Rotate values of symbols a...h
1124
%macro ROTATE_ARGS 0
1125
%xdefine TMP_ h
1126
%xdefine h g
1127
%xdefine g f
1128
%xdefine f e
1129
%xdefine e d
1130
%xdefine d c
1131
%xdefine c b
1132
%xdefine b a
1133
%xdefine a TMP_
1134
%endm
1135
1136
%macro FOUR_ROUNDS_AND_SCHED 0
1137
    ;; compute s0 four at a time and s1 two at a time
1138
    ;; compute W[-16] + W[-7] 4 at a time
1139
    movdqa  XTMP0, X3
1140
    mov y0, e       ; y0 = e
1141
    ror y0, (25-11) ; y0 = e >> (25-11)
1142
    mov y1, a       ; y1 = a
1143
    palignr XTMP0, X2, 4    ; XTMP0 = W[-7]
1144
    ror y1, (22-13) ; y1 = a >> (22-13)
1145
    xor y0, e       ; y0 = e ^ (e >> (25-11))
1146
    mov y2, f       ; y2 = f
1147
    ror y0, (11-6)  ; y0 = (e >> (11-6)) ^ (e >> (25-6))
1148
    movdqa  XTMP1, X1
1149
    xor y1, a       ; y1 = a ^ (a >> (22-13))
1150
    xor y2, g       ; y2 = f^g
1151
    paddd   XTMP0, X0   ; XTMP0 = W[-7] + W[-16]
1152
    xor y0, e       ; y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
1153
    and y2, e       ; y2 = (f^g)&e
1154
    ror y1, (13-2)  ; y1 = (a >> (13-2)) ^ (a >> (22-2))
1155
    ;; compute s0
1156
    palignr XTMP1, X0, 4    ; XTMP1 = W[-15]
1157
    xor y1, a       ; y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
1158
    ror y0, 6       ; y0 = S1 = (e>>6) ^ (e>>11) ^ (e>>25)
1159
    xor y2, g       ; y2 = CH = ((f^g)&e)^g
1160
    movdqa  XTMP2, XTMP1    ; XTMP2 = W[-15]
1161
    ror y1, 2       ; y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
1162
    add y2, y0      ; y2 = S1 + CH
1163
    add y2, [rsp + _XFER + 0*4] ; y2 = k + w + S1 + CH
1164
    movdqa  XTMP3, XTMP1    ; XTMP3 = W[-15]
1165
    mov y0, a       ; y0 = a
1166
    add h, y2       ; h = h + S1 + CH + k + w
1167
    mov y2, a       ; y2 = a
1168
    pslld   XTMP1, (32-7)
1169
    or  y0, c       ; y0 = a|c
1170
    add d, h        ; d = d + h + S1 + CH + k + w
1171
    and y2, c       ; y2 = a&c
1172
    psrld   XTMP2, 7
1173
    and y0, b       ; y0 = (a|c)&b
1174
    add h, y1       ; h = h + S1 + CH + k + w + S0
1175
    por XTMP1, XTMP2    ; XTMP1 = W[-15] ror 7
1176
    or  y0, y2      ; y0 = MAJ = ((a|c)&b)|(a&c)
1177
    add h, y0       ; h = h + S1 + CH + k + w + S0 + MAJ
1178
1179
ROTATE_ARGS
1180
    movdqa  XTMP2, XTMP3    ; XTMP2 = W[-15]
1181
    mov y0, e       ; y0 = e
1182
    mov y1, a       ; y1 = a
1183
    movdqa  XTMP4, XTMP3    ; XTMP4 = W[-15]
1184
    ror y0, (25-11) ; y0 = e >> (25-11)
1185
    xor y0, e       ; y0 = e ^ (e >> (25-11))
1186
    mov y2, f       ; y2 = f
1187
    ror y1, (22-13) ; y1 = a >> (22-13)
1188
    pslld   XTMP3, (32-18)
1189
    xor y1, a       ; y1 = a ^ (a >> (22-13))
1190
    ror y0, (11-6)  ; y0 = (e >> (11-6)) ^ (e >> (25-6))
1191
    xor y2, g       ; y2 = f^g
1192
    psrld   XTMP2, 18
1193
    ror y1, (13-2)  ; y1 = (a >> (13-2)) ^ (a >> (22-2))
1194
    xor y0, e       ; y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
1195
    and y2, e       ; y2 = (f^g)&e
1196
    ror y0, 6       ; y0 = S1 = (e>>6) ^ (e>>11) ^ (e>>25)
1197
    pxor    XTMP1, XTMP3
1198
    xor y1, a       ; y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
1199
    xor y2, g       ; y2 = CH = ((f^g)&e)^g
1200
    psrld   XTMP4, 3    ; XTMP4 = W[-15] >> 3
1201
    add y2, y0      ; y2 = S1 + CH
1202
    add y2, [rsp + _XFER + 1*4] ; y2 = k + w + S1 + CH
1203
    ror y1, 2       ; y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
1204
    pxor    XTMP1, XTMP2    ; XTMP1 = W[-15] ror 7 ^ W[-15] ror 18
1205
    mov y0, a       ; y0 = a
1206
    add h, y2       ; h = h + S1 + CH + k + w
1207
    mov y2, a       ; y2 = a
1208
    pxor    XTMP1, XTMP4    ; XTMP1 = s0
1209
    or  y0, c       ; y0 = a|c
1210
    add d, h        ; d = d + h + S1 + CH + k + w
1211
    and y2, c       ; y2 = a&c
1212
    ;; compute low s1
1213
    pshufd  XTMP2, X3, 11111010b    ; XTMP2 = W[-2] {BBAA}
1214
    and y0, b       ; y0 = (a|c)&b
1215
    add h, y1       ; h = h + S1 + CH + k + w + S0
1216
    paddd   XTMP0, XTMP1    ; XTMP0 = W[-16] + W[-7] + s0
1217
    or  y0, y2      ; y0 = MAJ = ((a|c)&b)|(a&c)
1218
    add h, y0       ; h = h + S1 + CH + k + w + S0 + MAJ
1219
1220
ROTATE_ARGS
1221
    movdqa  XTMP3, XTMP2    ; XTMP3 = W[-2] {BBAA}
1222
    mov y0, e       ; y0 = e
1223
    mov y1, a       ; y1 = a
1224
    ror y0, (25-11) ; y0 = e >> (25-11)
1225
    movdqa  XTMP4, XTMP2    ; XTMP4 = W[-2] {BBAA}
1226
    xor y0, e       ; y0 = e ^ (e >> (25-11))
1227
    ror y1, (22-13) ; y1 = a >> (22-13)
1228
    mov y2, f       ; y2 = f
1229
    xor y1, a       ; y1 = a ^ (a >> (22-13))
1230
    ror y0, (11-6)  ; y0 = (e >> (11-6)) ^ (e >> (25-6))
1231
    psrlq   XTMP2, 17   ; XTMP2 = W[-2] ror 17 {xBxA}
1232
    xor y2, g       ; y2 = f^g
1233
    psrlq   XTMP3, 19   ; XTMP3 = W[-2] ror 19 {xBxA}
1234
    xor y0, e       ; y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
1235
    and y2, e       ; y2 = (f^g)&e
1236
    psrld   XTMP4, 10   ; XTMP4 = W[-2] >> 10 {BBAA}
1237
    ror y1, (13-2)  ; y1 = (a >> (13-2)) ^ (a >> (22-2))
1238
    xor y1, a       ; y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
1239
    xor y2, g       ; y2 = CH = ((f^g)&e)^g
1240
    ror y0, 6       ; y0 = S1 = (e>>6) ^ (e>>11) ^ (e>>25)
1241
    pxor    XTMP2, XTMP3
1242
    add y2, y0      ; y2 = S1 + CH
1243
    ror y1, 2       ; y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
1244
    add y2, [rsp + _XFER + 2*4] ; y2 = k + w + S1 + CH
1245
    pxor    XTMP4, XTMP2    ; XTMP4 = s1 {xBxA}
1246
    mov y0, a       ; y0 = a
1247
    add h, y2       ; h = h + S1 + CH + k + w
1248
    mov y2, a       ; y2 = a
1249
    pshufb  XTMP4, SHUF_00BA    ; XTMP4 = s1 {00BA}
1250
    or  y0, c       ; y0 = a|c
1251
    add d, h        ; d = d + h + S1 + CH + k + w
1252
    and y2, c       ; y2 = a&c
1253
    paddd   XTMP0, XTMP4    ; XTMP0 = {..., ..., W[1], W[0]}
1254
    and y0, b       ; y0 = (a|c)&b
1255
    add h, y1       ; h = h + S1 + CH + k + w + S0
1256
    ;; compute high s1
1257
    pshufd  XTMP2, XTMP0, 01010000b ; XTMP2 = W[-2] {DDCC}
1258
    or  y0, y2      ; y0 = MAJ = ((a|c)&b)|(a&c)
1259
    add h, y0       ; h = h + S1 + CH + k + w + S0 + MAJ
1260
1261
ROTATE_ARGS
1262
    movdqa  XTMP3, XTMP2    ; XTMP3 = W[-2] {DDCC}
1263
    mov y0, e       ; y0 = e
1264
    ror y0, (25-11) ; y0 = e >> (25-11)
1265
    mov y1, a       ; y1 = a
1266
    movdqa  X0,    XTMP2    ; X0    = W[-2] {DDCC}
1267
    ror y1, (22-13) ; y1 = a >> (22-13)
1268
    xor y0, e       ; y0 = e ^ (e >> (25-11))
1269
    mov y2, f       ; y2 = f
1270
    ror y0, (11-6)  ; y0 = (e >> (11-6)) ^ (e >> (25-6))
1271
    psrlq   XTMP2, 17   ; XTMP2 = W[-2] ror 17 {xDxC}
1272
    xor y1, a       ; y1 = a ^ (a >> (22-13))
1273
    xor y2, g       ; y2 = f^g
1274
    psrlq   XTMP3, 19   ; XTMP3 = W[-2] ror 19 {xDxC}
1275
    xor y0, e       ; y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
1276
    and y2, e       ; y2 = (f^g)&e
1277
    ror y1, (13-2)  ; y1 = (a >> (13-2)) ^ (a >> (22-2))
1278
    psrld   X0,    10   ; X0 = W[-2] >> 10 {DDCC}
1279
    xor y1, a       ; y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
1280
    ror y0, 6       ; y0 = S1 = (e>>6) ^ (e>>11) ^ (e>>25)
1281
    xor y2, g       ; y2 = CH = ((f^g)&e)^g
1282
    pxor    XTMP2, XTMP3
1283
    ror y1, 2       ; y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
1284
    add y2, y0      ; y2 = S1 + CH
1285
    add y2, [rsp + _XFER + 3*4] ; y2 = k + w + S1 + CH
1286
    pxor    X0, XTMP2   ; X0 = s1 {xDxC}
1287
    mov y0, a       ; y0 = a
1288
    add h, y2       ; h = h + S1 + CH + k + w
1289
    mov y2, a       ; y2 = a
1290
    pshufb  X0, SHUF_DC00   ; X0 = s1 {DC00}
1291
    or  y0, c       ; y0 = a|c
1292
    add d, h        ; d = d + h + S1 + CH + k + w
1293
    and y2, c       ; y2 = a&c
1294
    paddd   X0, XTMP0   ; X0 = {W[3], W[2], W[1], W[0]}
1295
    and y0, b       ; y0 = (a|c)&b
1296
    add h, y1       ; h = h + S1 + CH + k + w + S0
1297
    or  y0, y2      ; y0 = MAJ = ((a|c)&b)|(a&c)
1298
    add h, y0       ; h = h + S1 + CH + k + w + S0 + MAJ
1299
1300
ROTATE_ARGS
1301
rotate_Xs
1302
%endm
1303
1304
;; input is [rsp + _XFER + %1 * 4]
1305
%macro DO_ROUND 1
1306
    mov y0, e       ; y0 = e
1307
    ror y0, (25-11) ; y0 = e >> (25-11)
1308
    mov y1, a       ; y1 = a
1309
    xor y0, e       ; y0 = e ^ (e >> (25-11))
1310
    ror y1, (22-13) ; y1 = a >> (22-13)
1311
    mov y2, f       ; y2 = f
1312
    xor y1, a       ; y1 = a ^ (a >> (22-13))
1313
    ror y0, (11-6)  ; y0 = (e >> (11-6)) ^ (e >> (25-6))
1314
    xor y2, g       ; y2 = f^g
1315
    xor y0, e       ; y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
1316
    ror y1, (13-2)  ; y1 = (a >> (13-2)) ^ (a >> (22-2))
1317
    and y2, e       ; y2 = (f^g)&e
1318
    xor y1, a       ; y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
1319
    ror y0, 6       ; y0 = S1 = (e>>6) ^ (e>>11) ^ (e>>25)
1320
    xor y2, g       ; y2 = CH = ((f^g)&e)^g
1321
    add y2, y0      ; y2 = S1 + CH
1322
    ror y1, 2       ; y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
1323
    add y2, [rsp + _XFER + %1 * 4]  ; y2 = k + w + S1 + CH
1324
    mov y0, a       ; y0 = a
1325
    add h, y2       ; h = h + S1 + CH + k + w
1326
    mov y2, a       ; y2 = a
1327
    or  y0, c       ; y0 = a|c
1328
    add d, h        ; d = d + h + S1 + CH + k + w
1329
    and y2, c       ; y2 = a&c
1330
    and y0, b       ; y0 = (a|c)&b
1331
    add h, y1       ; h = h + S1 + CH + k + w + S0
1332
    or  y0, y2      ; y0 = MAJ = ((a|c)&b)|(a&c)
1333
    add h, y0       ; h = h + S1 + CH + k + w + S0 + MAJ
1334
    ROTATE_ARGS
1335
%endm
1336
1337
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
1338
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
1339
;; void sha256_sse4(void *input_data, UINT32 digest[8], UINT64 num_blks)
1340
;; arg 1 : pointer to input data
1341
;; arg 2 : pointer to digest
1342
;; arg 3 : Num blocks
1343
section .text
1344
global sha256_sse4
1345
align 32
1346
sha256_sse4:
1347
    push    rbx
1348
%ifndef LINUX
1349
    push    rsi
1350
    push    rdi
1351
%endif
1352
    push    rbp
1353
    push    r13
1354
    push    r14
1355
    push    r15
1356
1357
    sub rsp,STACK_SIZE
1358
%ifndef LINUX
1359
    movdqa  [rsp + _XMM_SAVE + 0*16],xmm6
1360
    movdqa  [rsp + _XMM_SAVE + 1*16],xmm7
1361
    movdqa  [rsp + _XMM_SAVE + 2*16],xmm8
1362
    movdqa  [rsp + _XMM_SAVE + 3*16],xmm9
1363
    movdqa  [rsp + _XMM_SAVE + 4*16],xmm10
1364
    movdqa  [rsp + _XMM_SAVE + 5*16],xmm11
1365
    movdqa  [rsp + _XMM_SAVE + 6*16],xmm12
1366
%endif
1367
1368
    shl NUM_BLKS, 6 ; convert to bytes
1369
    jz  done_hash
1370
    add NUM_BLKS, INP   ; pointer to end of data
1371
    mov [rsp + _INP_END], NUM_BLKS
1372
1373
    ;; load initial digest
1374
    mov a,[4*0 + CTX]
1375
    mov b,[4*1 + CTX]
1376
    mov c,[4*2 + CTX]
1377
    mov d,[4*3 + CTX]
1378
    mov e,[4*4 + CTX]
1379
    mov f,[4*5 + CTX]
1380
    mov g,[4*6 + CTX]
1381
    mov h,[4*7 + CTX]
1382
1383
    movdqa  BYTE_FLIP_MASK, [PSHUFFLE_BYTE_FLIP_MASK wrt rip]
1384
    movdqa  SHUF_00BA, [_SHUF_00BA wrt rip]
1385
    movdqa  SHUF_DC00, [_SHUF_DC00 wrt rip]
1386
1387
loop0:
1388
    lea TBL,[K256 wrt rip]
1389
1390
    ;; byte swap first 16 dwords
1391
    COPY_XMM_AND_BSWAP  X0, [INP + 0*16], BYTE_FLIP_MASK
1392
    COPY_XMM_AND_BSWAP  X1, [INP + 1*16], BYTE_FLIP_MASK
1393
    COPY_XMM_AND_BSWAP  X2, [INP + 2*16], BYTE_FLIP_MASK
1394
    COPY_XMM_AND_BSWAP  X3, [INP + 3*16], BYTE_FLIP_MASK
1395
1396
    mov [rsp + _INP], INP
1397
1398
    ;; schedule 48 input dwords, by doing 3 rounds of 16 each
1399
    mov SRND, 3
1400
align 16
1401
loop1:
1402
    movdqa  XFER, [TBL + 0*16]
1403
    paddd   XFER, X0
1404
    movdqa  [rsp + _XFER], XFER
1405
    FOUR_ROUNDS_AND_SCHED
1406
1407
    movdqa  XFER, [TBL + 1*16]
1408
    paddd   XFER, X0
1409
    movdqa  [rsp + _XFER], XFER
1410
    FOUR_ROUNDS_AND_SCHED
1411
1412
    movdqa  XFER, [TBL + 2*16]
1413
    paddd   XFER, X0
1414
    movdqa  [rsp + _XFER], XFER
1415
    FOUR_ROUNDS_AND_SCHED
1416
1417
    movdqa  XFER, [TBL + 3*16]
1418
    paddd   XFER, X0
1419
    movdqa  [rsp + _XFER], XFER
1420
    add TBL, 4*16
1421
    FOUR_ROUNDS_AND_SCHED
1422
1423
    sub SRND, 1
1424
    jne loop1
1425
1426
    mov SRND, 2
1427
loop2:
1428
    paddd   X0, [TBL + 0*16]
1429
    movdqa  [rsp + _XFER], X0
1430
    DO_ROUND    0
1431
    DO_ROUND    1
1432
    DO_ROUND    2
1433
    DO_ROUND    3
1434
    paddd   X1, [TBL + 1*16]
1435
    movdqa  [rsp + _XFER], X1
1436
    add TBL, 2*16
1437
    DO_ROUND    0
1438
    DO_ROUND    1
1439
    DO_ROUND    2
1440
    DO_ROUND    3
1441
1442
    movdqa  X0, X2
1443
    movdqa  X1, X3
1444
1445
    sub SRND, 1
1446
    jne loop2
1447
1448
    addm    [4*0 + CTX],a
1449
    addm    [4*1 + CTX],b
1450
    addm    [4*2 + CTX],c
1451
    addm    [4*3 + CTX],d
1452
    addm    [4*4 + CTX],e
1453
    addm    [4*5 + CTX],f
1454
    addm    [4*6 + CTX],g
1455
    addm    [4*7 + CTX],h
1456
1457
    mov INP, [rsp + _INP]
1458
    add INP, 64
1459
    cmp INP, [rsp + _INP_END]
1460
    jne loop0
1461
1462
done_hash:
1463
%ifndef LINUX
1464
    movdqa  xmm6,[rsp + _XMM_SAVE + 0*16]
1465
    movdqa  xmm7,[rsp + _XMM_SAVE + 1*16]
1466
    movdqa  xmm8,[rsp + _XMM_SAVE + 2*16]
1467
    movdqa  xmm9,[rsp + _XMM_SAVE + 3*16]
1468
    movdqa  xmm10,[rsp + _XMM_SAVE + 4*16]
1469
    movdqa  xmm11,[rsp + _XMM_SAVE + 5*16]
1470
    movdqa  xmm12,[rsp + _XMM_SAVE + 6*16]
1471
%endif
1472
1473
    add rsp, STACK_SIZE
1474
1475
    pop r15
1476
    pop r14
1477
    pop r13
1478
    pop rbp
1479
%ifndef LINUX
1480
    pop rdi
1481
    pop rsi
1482
%endif
1483
    pop rbx
1484
1485
    ret
1486
1487
1488
section .data
1489
align 64
1490
K256:
1491
    dd  0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
1492
    dd  0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
1493
    dd  0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
1494
    dd  0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
1495
    dd  0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
1496
    dd  0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
1497
    dd  0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
1498
    dd  0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
1499
    dd  0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
1500
    dd  0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
1501
    dd  0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
1502
    dd  0xd192e819,0xd6990624,0xf40e3585,0x106aa070
1503
    dd  0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
1504
    dd  0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
1505
    dd  0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
1506
    dd  0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
1507
1508
PSHUFFLE_BYTE_FLIP_MASK: ddq 0x0c0d0e0f08090a0b0405060700010203
1509
1510
; shuffle xBxA -> 00BA
1511
_SHUF_00BA:              ddq 0xFFFFFFFFFFFFFFFF0b0a090803020100
1512
1513
; shuffle xDxC -> DC00
1514
_SHUF_DC00:              ddq 0x0b0a090803020100FFFFFFFFFFFFFFFF
1515
*/
1516
1517
#endif