libbb/hash_md5_sha256_x86-32_shaNI.S


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281

#if ENABLE_SHA256_HWACCEL && defined(__GNUC__) && defined(__i386__)
/* The code is adapted from Linux kernel's source */

// We use shorter insns, even though they are for "wrong"
// data type (fp, not int).
// For Intel, there is no penalty for doing it at all
// (CPUs which do have such penalty do not support SHA insns).
// For AMD, the penalty is one extra cycle
// (allegedly: I failed to find measurable difference).

//#define mova128 movdqa
#define mova128 movaps
//#define movu128 movdqu
#define movu128 movups
//#define shuf128_32 pshufd
#define shuf128_32 shufps

// pshufb and palignr are SSSE3 insns.
// We do not check SSSE3 in cpuid,
// all SHA-capable CPUs support it as well.

	.section	.text.sha256_process_block64_shaNI, "ax", @progbits
	.globl	sha256_process_block64_shaNI
	.hidden	sha256_process_block64_shaNI
	.type	sha256_process_block64_shaNI, @function

#define DATA_PTR	%eax

#define SHA256CONSTANTS	%ecx

#define MSG		%xmm0
#define STATE0		%xmm1
#define STATE1		%xmm2
#define MSGTMP0		%xmm3
#define MSGTMP1		%xmm4
#define MSGTMP2		%xmm5
#define MSGTMP3		%xmm6

#define XMMTMP		%xmm7

#define SHUF(a,b,c,d) $(a+(b<<2)+(c<<4)+(d<<6))

	.balign	8	# allow decoders to fetch at least 2 first insns
sha256_process_block64_shaNI:

	movu128		76+0*16(%eax), XMMTMP /* ABCD (little-endian dword order) */
	movu128		76+1*16(%eax), STATE1 /* EFGH */
/* shufps takes dwords 0,1 from *2nd* operand, and dwords 2,3 from 1st one */
	mova128		STATE1, STATE0
	/* ---		-------------- ABCD -- EFGH */
	shufps		SHUF(1,0,1,0), XMMTMP, STATE0 /* FEBA */
	shufps		SHUF(3,2,3,2), XMMTMP, STATE1 /* HGDC */

/* XMMTMP holds flip mask from here... */
	mova128		PSHUFFLE_BSWAP32_FLIP_MASK, XMMTMP
	movl		$K256+8*16, SHA256CONSTANTS

	/* Rounds 0-3 */
	movu128		0*16(DATA_PTR), MSG
	pshufb		XMMTMP, MSG
	mova128		MSG, MSGTMP0
		paddd		0*16-8*16(SHA256CONSTANTS), MSG
		sha256rnds2	MSG, STATE0, STATE1
		shuf128_32	$0x0E, MSG, MSG
		sha256rnds2	MSG, STATE1, STATE0

	/* Rounds 4-7 */
	movu128		1*16(DATA_PTR), MSG
	pshufb		XMMTMP, MSG
	mova128		MSG, MSGTMP1
		paddd		1*16-8*16(SHA256CONSTANTS), MSG
		sha256rnds2	MSG, STATE0, STATE1
		shuf128_32	$0x0E, MSG, MSG
		sha256rnds2	MSG, STATE1, STATE0
	sha256msg1	MSGTMP1, MSGTMP0

	/* Rounds 8-11 */
	movu128		2*16(DATA_PTR), MSG
	pshufb		XMMTMP, MSG
	mova128		MSG, MSGTMP2
		paddd		2*16-8*16(SHA256CONSTANTS), MSG
		sha256rnds2	MSG, STATE0, STATE1
		shuf128_32	$0x0E, MSG, MSG
		sha256rnds2	MSG, STATE1, STATE0
	sha256msg1	MSGTMP2, MSGTMP1

	/* Rounds 12-15 */
	movu128		3*16(DATA_PTR), MSG
	pshufb		XMMTMP, MSG
/* ...to here */
	mova128		MSG, MSGTMP3
		paddd		3*16-8*16(SHA256CONSTANTS), MSG
		sha256rnds2	MSG, STATE0, STATE1
	mova128		MSGTMP3, XMMTMP
	palignr		$4, MSGTMP2, XMMTMP
	paddd		XMMTMP, MSGTMP0
	sha256msg2	MSGTMP3, MSGTMP0
		shuf128_32	$0x0E, MSG, MSG
		sha256rnds2	MSG, STATE1, STATE0
	sha256msg1	MSGTMP3, MSGTMP2

	/* Rounds 16-19 */
	mova128		MSGTMP0, MSG
		paddd		4*16-8*16(SHA256CONSTANTS), MSG
		sha256rnds2	MSG, STATE0, STATE1
	mova128		MSGTMP0, XMMTMP
	palignr		$4, MSGTMP3, XMMTMP
	paddd		XMMTMP, MSGTMP1
	sha256msg2	MSGTMP0, MSGTMP1
		shuf128_32	$0x0E, MSG, MSG
		sha256rnds2	MSG, STATE1, STATE0
	sha256msg1	MSGTMP0, MSGTMP3

	/* Rounds 20-23 */
	mova128		MSGTMP1, MSG
		paddd		5*16-8*16(SHA256CONSTANTS), MSG
		sha256rnds2	MSG, STATE0, STATE1
	mova128		MSGTMP1, XMMTMP
	palignr		$4, MSGTMP0, XMMTMP
	paddd		XMMTMP, MSGTMP2
	sha256msg2	MSGTMP1, MSGTMP2
		shuf128_32	$0x0E, MSG, MSG
		sha256rnds2	MSG, STATE1, STATE0
	sha256msg1	MSGTMP1, MSGTMP0

	/* Rounds 24-27 */
	mova128		MSGTMP2, MSG
		paddd		6*16-8*16(SHA256CONSTANTS), MSG
		sha256rnds2	MSG, STATE0, STATE1
	mova128		MSGTMP2, XMMTMP
	palignr		$4, MSGTMP1, XMMTMP
	paddd		XMMTMP, MSGTMP3
	sha256msg2	MSGTMP2, MSGTMP3
		shuf128_32	$0x0E, MSG, MSG
		sha256rnds2	MSG, STATE1, STATE0
	sha256msg1	MSGTMP2, MSGTMP1

	/* Rounds 28-31 */
	mova128		MSGTMP3, MSG
		paddd		7*16-8*16(SHA256CONSTANTS), MSG
		sha256rnds2	MSG, STATE0, STATE1
	mova128		MSGTMP3, XMMTMP
	palignr		$4, MSGTMP2, XMMTMP
	paddd		XMMTMP, MSGTMP0
	sha256msg2	MSGTMP3, MSGTMP0
		shuf128_32	$0x0E, MSG, MSG
		sha256rnds2	MSG, STATE1, STATE0
	sha256msg1	MSGTMP3, MSGTMP2

	/* Rounds 32-35 */
	mova128		MSGTMP0, MSG
		paddd		8*16-8*16(SHA256CONSTANTS), MSG
		sha256rnds2	MSG, STATE0, STATE1
	mova128		MSGTMP0, XMMTMP
	palignr		$4, MSGTMP3, XMMTMP
	paddd		XMMTMP, MSGTMP1
	sha256msg2	MSGTMP0, MSGTMP1
		shuf128_32	$0x0E, MSG, MSG
		sha256rnds2	MSG, STATE1, STATE0
	sha256msg1	MSGTMP0, MSGTMP3

	/* Rounds 36-39 */
	mova128		MSGTMP1, MSG
		paddd		9*16-8*16(SHA256CONSTANTS), MSG
		sha256rnds2	MSG, STATE0, STATE1
	mova128		MSGTMP1, XMMTMP
	palignr		$4, MSGTMP0, XMMTMP
	paddd		XMMTMP, MSGTMP2
	sha256msg2	MSGTMP1, MSGTMP2
		shuf128_32	$0x0E, MSG, MSG
		sha256rnds2	MSG, STATE1, STATE0
	sha256msg1	MSGTMP1, MSGTMP0

	/* Rounds 40-43 */
	mova128		MSGTMP2, MSG
		paddd		10*16-8*16(SHA256CONSTANTS), MSG
		sha256rnds2	MSG, STATE0, STATE1
	mova128		MSGTMP2, XMMTMP
	palignr		$4, MSGTMP1, XMMTMP
	paddd		XMMTMP, MSGTMP3
	sha256msg2	MSGTMP2, MSGTMP3
		shuf128_32	$0x0E, MSG, MSG
		sha256rnds2	MSG, STATE1, STATE0
	sha256msg1	MSGTMP2, MSGTMP1

	/* Rounds 44-47 */
	mova128		MSGTMP3, MSG
		paddd		11*16-8*16(SHA256CONSTANTS), MSG
		sha256rnds2	MSG, STATE0, STATE1
	mova128		MSGTMP3, XMMTMP
	palignr		$4, MSGTMP2, XMMTMP
	paddd		XMMTMP, MSGTMP0
	sha256msg2	MSGTMP3, MSGTMP0
		shuf128_32	$0x0E, MSG, MSG
		sha256rnds2	MSG, STATE1, STATE0
	sha256msg1	MSGTMP3, MSGTMP2

	/* Rounds 48-51 */
	mova128		MSGTMP0, MSG
		paddd		12*16-8*16(SHA256CONSTANTS), MSG
		sha256rnds2	MSG, STATE0, STATE1
	mova128		MSGTMP0, XMMTMP
	palignr		$4, MSGTMP3, XMMTMP
	paddd		XMMTMP, MSGTMP1
	sha256msg2	MSGTMP0, MSGTMP1
		shuf128_32	$0x0E, MSG, MSG
		sha256rnds2	MSG, STATE1, STATE0
	sha256msg1	MSGTMP0, MSGTMP3

	/* Rounds 52-55 */
	mova128		MSGTMP1, MSG
		paddd		13*16-8*16(SHA256CONSTANTS), MSG
		sha256rnds2	MSG, STATE0, STATE1
	mova128		MSGTMP1, XMMTMP
	palignr		$4, MSGTMP0, XMMTMP
	paddd		XMMTMP, MSGTMP2
	sha256msg2	MSGTMP1, MSGTMP2
		shuf128_32	$0x0E, MSG, MSG
		sha256rnds2	MSG, STATE1, STATE0

	/* Rounds 56-59 */
	mova128		MSGTMP2, MSG
		paddd		14*16-8*16(SHA256CONSTANTS), MSG
		sha256rnds2	MSG, STATE0, STATE1
	mova128		MSGTMP2, XMMTMP
	palignr		$4, MSGTMP1, XMMTMP
	paddd		XMMTMP, MSGTMP3
	sha256msg2	MSGTMP2, MSGTMP3
		shuf128_32	$0x0E, MSG, MSG
		sha256rnds2	MSG, STATE1, STATE0

	/* Rounds 60-63 */
	mova128		MSGTMP3, MSG
		paddd		15*16-8*16(SHA256CONSTANTS), MSG
		sha256rnds2	MSG, STATE0, STATE1
		shuf128_32	$0x0E, MSG, MSG
		sha256rnds2	MSG, STATE1, STATE0

	/* Write hash values back in the correct order */
	mova128		STATE0, XMMTMP
/* shufps takes dwords 0,1 from *2nd* operand, and dwords 2,3 from 1st one */
	/* ---		-------------- HGDC -- FEBA */
	shufps		SHUF(3,2,3,2), STATE1, STATE0 /* ABCD */
	shufps		SHUF(1,0,1,0), STATE1, XMMTMP /* EFGH */
	/* add current hash values to previous ones */
	movu128		76+1*16(%eax), STATE1
	paddd		XMMTMP, STATE1
	movu128		STATE1, 76+1*16(%eax)
	movu128		76+0*16(%eax), XMMTMP
	paddd		XMMTMP, STATE0
	movu128		STATE0, 76+0*16(%eax)

	ret
	.size	sha256_process_block64_shaNI, .-sha256_process_block64_shaNI

	.section	.rodata.cst256.K256, "aM", @progbits, 256
	.balign	16
K256:
	.long	0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
	.long	0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
	.long	0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
	.long	0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
	.long	0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
	.long	0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
	.long	0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
	.long	0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
	.long	0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
	.long	0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
	.long	0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
	.long	0xd192e819,0xd6990624,0xf40e3585,0x106aa070
	.long	0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
	.long	0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
	.long	0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
	.long	0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2

	.section	.rodata.cst16.PSHUFFLE_BSWAP32_FLIP_MASK, "aM", @progbits, 16
	.balign	16
PSHUFFLE_BSWAP32_FLIP_MASK:
	.octa	0x0c0d0e0f08090a0b0405060700010203

#endif