summaryrefslogtreecommitdiff
path: root/networking/tls_sp_c32.patch
blob: 7559586c95161194a42c691f02c381803950ebd5 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
Somehow, gcc 6+ performs this optimization as well as or better than the
hand-written optimized code below (gcc seems to eliminate the a32[] array and
uses 32-bit registers/memory for the "lower halves" of the a32[i] elements).

But there may be architectures where gcc won't be this good?

diff --git a/networking/tls_sp_c32.c b/networking/tls_sp_c32.c
index 72a3be537..e8a011ad1 100644
--- a/networking/tls_sp_c32.c
+++ b/networking/tls_sp_c32.c
@@ -228,51 +228,96 @@ static void sp_256_rshift1_10(sp_digit* r, sp_digit* a)
 static void sp_256_mod_mul_norm_10(sp_digit* r, const sp_digit* a)
 {
 	int64_t t[8];
-	int64_t a32[8];
+	uint32_t a32;
 	int64_t o;
 
-	a32[0] = a[0];
-	a32[0] |= a[1] << 26;
-	a32[0] &= 0xffffffff;
-	a32[1] = (sp_digit)(a[1] >> 6);
-	a32[1] |= a[2] << 20;
-	a32[1] &= 0xffffffff;
-	a32[2] = (sp_digit)(a[2] >> 12);
-	a32[2] |= a[3] << 14;
-	a32[2] &= 0xffffffff;
-	a32[3] = (sp_digit)(a[3] >> 18);
-	a32[3] |= a[4] << 8;
-	a32[3] &= 0xffffffff;
-	a32[4] = (sp_digit)(a[4] >> 24);
-	a32[4] |= a[5] << 2;
-	a32[4] |= a[6] << 28;
-	a32[4] &= 0xffffffff;
-	a32[5] = (sp_digit)(a[6] >> 4);
-	a32[5] |= a[7] << 22;
-	a32[5] &= 0xffffffff;
-	a32[6] = (sp_digit)(a[7] >> 10);
-	a32[6] |= a[8] << 16;
-	a32[6] &= 0xffffffff;
-	a32[7] = (sp_digit)(a[8] >> 16);
-	a32[7] |= a[9] << 10;
-	a32[7] &= 0xffffffff;
-
 	/*  1  1  0 -1 -1 -1 -1  0 */
-	t[0] = 0 + a32[0] + a32[1] - a32[3] - a32[4] - a32[5] - a32[6];
 	/*  0  1  1  0 -1 -1 -1 -1 */
-	t[1] = 0 + a32[1] + a32[2] - a32[4] - a32[5] - a32[6] - a32[7];
 	/*  0  0  1  1  0 -1 -1 -1 */
-	t[2] = 0 + a32[2] + a32[3] - a32[5] - a32[6] - a32[7];
 	/* -1 -1  0  2  2  1  0 -1 */
-	t[3] = 0 - a32[0] - a32[1] + 2 * a32[3] + 2 * a32[4] + a32[5] - a32[7];
 	/*  0 -1 -1  0  2  2  1  0 */
-	t[4] = 0 - a32[1] - a32[2] + 2 * a32[4] + 2 * a32[5] + a32[6];
 	/*  0  0 -1 -1  0  2  2  1 */
-	t[5] = 0 - a32[2] - a32[3] + 2 * a32[5] + 2 * a32[6] + a32[7];
 	/* -1 -1  0  0  0  1  3  2 */
-	t[6] = 0 - a32[0] - a32[1] + a32[5] + 3 * a32[6] + 2 * a32[7];
 	/*  1  0 -1 -1 -1 -1  0  3 */
-	t[7] = 0 + a32[0] - a32[2] - a32[3] - a32[4] - a32[5] + 3 * a32[7];
+	//t[0] = 0 + a32[0] + a32[1]            - a32[3]   - a32[4]   - a32[5]   - a32[6]             ;
+	//t[1] = 0          + a32[1] + a32[2]              - a32[4]   - a32[5]   - a32[6]   - a32[7]  ;
+	//t[2] = 0                   + a32[2]   + a32[3]              - a32[5]   - a32[6]   - a32[7]  ;
+	//t[3] = 0 - a32[0] - a32[1]            + 2*a32[3] + 2*a32[4] + a32[5]              - a32[7]  ;
+	//t[4] = 0          - a32[1] - a32[2]              + 2*a32[4] + 2*a32[5] + a32[6]             ;
+	//t[5] = 0                   - a32[2]   - a32[3]              + 2*a32[5] + 2*a32[6] + a32[7]  ;
+	//t[6] = 0 - a32[0] - a32[1]                                  + a32[5]   + 3*a32[6] + 2*a32[7];
+	//t[7] = 0 + a32[0]          - a32[2]   - a32[3]   - a32[4]   - a32[5]              + 3*a32[7];
+
+#define A32 (int64_t)a32
+	a32 = a[0];
+	a32 |= a[1] << 26;
+	t[0] = 0 + A32;
+	t[3] = 0 - A32;
+	t[6] = 0 - A32;
+	t[7] = 0 + A32;
+
+	a32 = (sp_digit)(a[1] >> 6);
+	a32 |= a[2] << 20;
+	t[0] += A32    ;
+	t[1]  = 0 + A32;
+	t[3] -= A32    ;
+	t[4]  = 0 - A32;
+	t[6] -= A32    ;
+
+	a32 = (sp_digit)(a[2] >> 12);
+	a32 |= a[3] << 14;
+	t[1] += A32    ;
+	t[2]  = 0 + A32;
+	t[4] -= A32    ;
+	t[5]  = 0 - A32;
+	t[7] -= A32    ;
+
+	a32 = (sp_digit)(a[3] >> 18);
+	a32 |= a[4] << 8;
+	t[0] -= A32  ;
+	t[2] += A32  ;
+	t[3] += 2*A32;
+	t[5] -= A32  ;
+	t[7] -= A32  ;
+
+	a32 = (sp_digit)(a[4] >> 24);
+	a32 |= a[5] << 2;
+	a32 |= a[6] << 28;
+	t[0] -= A32  ;
+	t[1] -= A32  ;
+	t[3] += 2*A32;
+	t[4] += 2*A32;
+	t[7] -= A32  ;
+
+	a32 = (sp_digit)(a[6] >> 4);
+	a32 |= a[7] << 22;
+	t[0] -= A32  ;
+	t[1] -= A32  ;
+	t[2] -= A32  ;
+	t[3] += A32  ;
+	t[4] += 2*A32;
+	t[5] += 2*A32;
+	t[6] += A32  ;
+	t[7] -= A32  ;
+
+	a32 = (sp_digit)(a[7] >> 10);
+	a32 |= a[8] << 16;
+	t[0] -= A32  ;
+	t[1] -= A32  ;
+	t[2] -= A32  ;
+	t[4] += A32  ;
+	t[5] += 2*A32;
+	t[6] += 3*A32;
+
+	a32 = (sp_digit)(a[8] >> 16);
+	a32 |= a[9] << 10;
+	t[1] -= A32  ;
+	t[2] -= A32  ;
+	t[3] -= A32  ;
+	t[5] += A32  ;
+	t[6] += 2*A32;
+	t[7] += 3*A32;
+#undef A32
 
 	t[1] += t[0] >> 32; t[0] &= 0xffffffff;
 	t[2] += t[1] >> 32; t[1] &= 0xffffffff;