1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
|
#if ENABLE_SHA1_HWACCEL && defined(__GNUC__) && defined(__i386__)
/* The code is adapted from Linux kernel's source */
// We use shorter insns, even though they are for "wrong"
// data type (fp, not int).
// For Intel, there is no penalty for doing it at all
// (CPUs which do have such penalty do not support SHA1 insns).
// For AMD, the penalty is one extra cycle
// (allegedly: I failed to find measurable difference).
//#define mova128 movdqa
#define mova128 movaps
//#define movu128 movdqu
#define movu128 movups
//#define xor128 pxor
#define xor128 xorps
//#define shuf128_32 pshufd
#define shuf128_32 shufps
#define extr128_32 pextrd
//#define extr128_32 extractps # not shorter
.section .text.sha1_process_block64_shaNI, "ax", @progbits
.globl sha1_process_block64_shaNI
.hidden sha1_process_block64_shaNI
.type sha1_process_block64_shaNI, @function
#define ABCD %xmm0
#define E0 %xmm1 /* Need two E's b/c they ping pong */
#define E1 %xmm2
#define MSG0 %xmm3
#define MSG1 %xmm4
#define MSG2 %xmm5
#define MSG3 %xmm6
#define SHUF_MASK %xmm7
/*
 * sha1_process_block64_shaNI(ctx)
 *
 * Custom calling convention: the context pointer arrives in %eax
 * (it is read below without ever being loaded from the stack);
 * confirm against the C-side declaration of this function.
 * Context layout as used here:
 *   0(%eax)      = 64-byte message block (read only)
 *   76(%eax)     = hash words A,B,C,D (updated in place)
 *   76+4*4(%eax) = hash word E        (updated in place)
 * Preserves: %eax, %ebx, %ecx, %edx, %esi, %edi.
 * Clobbers:  %xmm0-%xmm7, eflags; uses 32 bytes of 16-byte-aligned
 *            stack scratch for the pre-round hash snapshot.
 * Requires CPUs with the SHA extensions
 * (sha1rnds4/sha1nexte/sha1msg1/sha1msg2).
 */
.balign 8 # allow decoders to fetch at least 3 first insns
sha1_process_block64_shaNI:
pushl %ebp
movl %esp, %ebp # %ebp keeps the original %esp: andl below loses it
subl $32, %esp # two 16-byte save slots: ABCD at (%esp), E at 16(%esp)
andl $~0xF, %esp # paddd needs aligned memory operand
/* load initial hash values */
xor128 E0, E0 # clear E0; only its top dword gets loaded next
movu128 76(%eax), ABCD
pinsrd $3, 76+4*4(%eax), E0 # load to uppermost 32-bit word
shuf128_32 $0x1B, ABCD, ABCD # DCBA -> ABCD
mova128 PSHUFFLE_BYTE_FLIP_MASK, SHUF_MASK # byte-reversal mask for pshufb (see .rodata below)
/* Save hash values for addition after rounds */
movu128 E0, 16(%esp) # slots are 16-aligned; movups on aligned data is fine
movu128 ABCD, (%esp)
/* Rounds 0-3 */
/* Pattern for each 4-round group: load+byteswap a 16-byte message
 * chunk (first 4 groups only), sha1nexte folds the message words into
 * the pending E, sha1rnds4 runs 4 rounds (immediate 0..3 selects the
 * f-function/K constant for rounds 0-19/20-39/40-59/60-79), and
 * sha1msg1/sha1msg2/xor128 compute the message schedule for later
 * groups. E0 and E1 alternate as "current E" (ping pong). */
movu128 0*16(%eax), MSG0
pshufb SHUF_MASK, MSG0
paddd MSG0, E0 # first group: plain add, no previous E to rotate in
mova128 ABCD, E1
sha1rnds4 $0, E0, ABCD
/* Rounds 4-7 */
movu128 1*16(%eax), MSG1
pshufb SHUF_MASK, MSG1
sha1nexte MSG1, E1
mova128 ABCD, E0
sha1rnds4 $0, E1, ABCD
sha1msg1 MSG1, MSG0
/* Rounds 8-11 */
movu128 2*16(%eax), MSG2
pshufb SHUF_MASK, MSG2
sha1nexte MSG2, E0
mova128 ABCD, E1
sha1rnds4 $0, E0, ABCD
sha1msg1 MSG2, MSG1
xor128 MSG2, MSG0
/* Rounds 12-15 */
movu128 3*16(%eax), MSG3
pshufb SHUF_MASK, MSG3
sha1nexte MSG3, E1
mova128 ABCD, E0
sha1msg2 MSG3, MSG0
sha1rnds4 $0, E1, ABCD
sha1msg1 MSG3, MSG2
xor128 MSG3, MSG1
/* Rounds 16-19 */
sha1nexte MSG0, E0
mova128 ABCD, E1
sha1msg2 MSG0, MSG1
sha1rnds4 $0, E0, ABCD
sha1msg1 MSG0, MSG3
xor128 MSG0, MSG2
/* Rounds 20-23 */
sha1nexte MSG1, E1
mova128 ABCD, E0
sha1msg2 MSG1, MSG2
sha1rnds4 $1, E1, ABCD
sha1msg1 MSG1, MSG0
xor128 MSG1, MSG3
/* Rounds 24-27 */
sha1nexte MSG2, E0
mova128 ABCD, E1
sha1msg2 MSG2, MSG3
sha1rnds4 $1, E0, ABCD
sha1msg1 MSG2, MSG1
xor128 MSG2, MSG0
/* Rounds 28-31 */
sha1nexte MSG3, E1
mova128 ABCD, E0
sha1msg2 MSG3, MSG0
sha1rnds4 $1, E1, ABCD
sha1msg1 MSG3, MSG2
xor128 MSG3, MSG1
/* Rounds 32-35 */
sha1nexte MSG0, E0
mova128 ABCD, E1
sha1msg2 MSG0, MSG1
sha1rnds4 $1, E0, ABCD
sha1msg1 MSG0, MSG3
xor128 MSG0, MSG2
/* Rounds 36-39 */
sha1nexte MSG1, E1
mova128 ABCD, E0
sha1msg2 MSG1, MSG2
sha1rnds4 $1, E1, ABCD
sha1msg1 MSG1, MSG0
xor128 MSG1, MSG3
/* Rounds 40-43 */
sha1nexte MSG2, E0
mova128 ABCD, E1
sha1msg2 MSG2, MSG3
sha1rnds4 $2, E0, ABCD
sha1msg1 MSG2, MSG1
xor128 MSG2, MSG0
/* Rounds 44-47 */
sha1nexte MSG3, E1
mova128 ABCD, E0
sha1msg2 MSG3, MSG0
sha1rnds4 $2, E1, ABCD
sha1msg1 MSG3, MSG2
xor128 MSG3, MSG1
/* Rounds 48-51 */
sha1nexte MSG0, E0
mova128 ABCD, E1
sha1msg2 MSG0, MSG1
sha1rnds4 $2, E0, ABCD
sha1msg1 MSG0, MSG3
xor128 MSG0, MSG2
/* Rounds 52-55 */
sha1nexte MSG1, E1
mova128 ABCD, E0
sha1msg2 MSG1, MSG2
sha1rnds4 $2, E1, ABCD
sha1msg1 MSG1, MSG0
xor128 MSG1, MSG3
/* Rounds 56-59 */
sha1nexte MSG2, E0
mova128 ABCD, E1
sha1msg2 MSG2, MSG3
sha1rnds4 $2, E0, ABCD
sha1msg1 MSG2, MSG1
xor128 MSG2, MSG0
/* Rounds 60-63 */
sha1nexte MSG3, E1
mova128 ABCD, E0
sha1msg2 MSG3, MSG0
sha1rnds4 $3, E1, ABCD
sha1msg1 MSG3, MSG2
xor128 MSG3, MSG1
/* Rounds 64-67 */
sha1nexte MSG0, E0
mova128 ABCD, E1
sha1msg2 MSG0, MSG1
sha1rnds4 $3, E0, ABCD
sha1msg1 MSG0, MSG3
xor128 MSG0, MSG2
/* Rounds 68-71 */
sha1nexte MSG1, E1
mova128 ABCD, E0
sha1msg2 MSG1, MSG2
sha1rnds4 $3, E1, ABCD
xor128 MSG1, MSG3 # schedule tail: later sha1msg1 steps no longer needed
/* Rounds 72-75 */
sha1nexte MSG2, E0
mova128 ABCD, E1
sha1msg2 MSG2, MSG3
sha1rnds4 $3, E0, ABCD
/* Rounds 76-79 */
sha1nexte MSG3, E1
mova128 ABCD, E0
sha1rnds4 $3, E1, ABCD
/* Add current hash values with previously saved */
sha1nexte 16(%esp), E0 # fold saved pre-round E into the new E (aligned slot)
paddd (%esp), ABCD # add saved pre-round A,B,C,D
/* Write hash values back in the correct order */
shuf128_32 $0x1B, ABCD, ABCD # ABCD -> DCBA memory order (undo entry shuffle)
movu128 ABCD, 76(%eax)
extr128_32 $3, E0, 76+4*4(%eax) # E lives in E0's top dword
movl %ebp, %esp # restore %esp clobbered by the andl alignment
popl %ebp
ret
.size sha1_process_block64_shaNI, .-sha1_process_block64_shaNI
.section .rodata.cst16.PSHUFFLE_BYTE_FLIP_MASK, "aM", @progbits, 16
.balign 16 # pshufb/movaps source must be 16-byte aligned
PSHUFFLE_BYTE_FLIP_MASK:
/* .octa stores little-endian: in-memory bytes are 0x0f,0x0e,...,0x00,
 * so pshufb with this mask reverses all 16 bytes of the register
 * (byte-swaps the four words and reverses their order in one step). */
.octa 0x000102030405060708090a0b0c0d0e0f
#endif
|