/*-
 * Copyright (c) 1992, 1993
 *      The Regents of the University of California.  All rights reserved.
 *
 * This software was developed by the Computer Systems Engineering group
 * at Lawrence Berkeley Laboratory under DARPA contract BG 91-66 and
 * contributed to Berkeley.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *      This product includes software developed by the University of
 *      California, Berkeley and its contributors.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#if defined(LIBC_SCCS) && !defined(lint)
static char sccsid[] = "@(#)muldi3.c 8.1 (Berkeley) 6/4/93";
#endif /* LIBC_SCCS and not lint */

#include "quad.h"

/*
 * Multiply two quads.
 *
 * Our algorithm is based on the following.  Split incoming quad values
 * u and v (where u,v >= 0) into
 *
 *      u = 2^n u1 + u0         (n = number of bits in `u_long', usu. 32)
 *
 * and
 *
 *      v = 2^n v1 + v0
 *
 * Then
 *
 *      uv = 2^2n u1 v1  +  2^n u1 v0  +  2^n v1 u0  +  u0 v0
 *         = 2^2n u1 v1  +  2^n (u1 v0 + v1 u0)  +  u0 v0
 *
 * Now add 2^n u1 v1 to the first term and subtract it from the middle,
 * and add 2^n u0 v0 to the last term and subtract it from the middle.
 * This gives:
 *
 *      uv = (2^2n + 2^n) (u1 v1)  +
 *           (2^n) (u1 v0 - u1 v1 + u0 v1 - u0 v0)  +
 *           (2^n + 1) (u0 v0)
 *
 * Factoring the middle a bit gives us:
 *
 *      uv = (2^2n + 2^n) (u1 v1)  +            [u1v1 = high]
 *           (2^n) (u1 - u0) (v0 - v1)  +       [(u1-u0)... = mid]
 *           (2^n + 1) (u0 v0)                  [u0v0 = low]
 *
 * The terms (u1 v1), (u1 - u0) (v0 - v1), and (u0 v0) can all be done
 * in just half the precision of the original.  (Note that either or both
 * of (u1 - u0) or (v0 - v1) may be negative.)
 *
 * This algorithm is from Knuth vol. 2 (2nd ed), section 4.3.3, p. 278.
 *
 * Since C does not give us a `long * long = quad' operator, we split
 * our input quads into two longs, then split the two longs into two
 * shorts.  We can then calculate `short * short = long' in native
 * arithmetic.
 *
 * Our product should, strictly speaking, be a `long quad', with 128
 * bits, but we are going to discard the upper 64.  In other words,
 * we are not interested in uv, but rather in (uv mod 2^2n).  This
 * makes some of the terms above vanish, and we get:
 *
 *      (2^n)(high) + (2^n)(mid) + (2^n + 1)(low)
 *
 * or
 *
 *      (2^n)(high + mid + low) + low
 *
 * Furthermore, `high' and `mid' can be computed mod 2^n, as any factor
 * of 2^n in either one will also vanish.  Only `low' need be computed
 * mod 2^2n, and only because of the final term above.
 */
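/*
 * As an illustrative sanity check of the factored form (not needed by the
 * code, and with n = 4 only to keep the numbers small):  let u = 0xAB and
 * v = 0xCD, so u1 = 0xA, u0 = 0xB, v1 = 0xC, v0 = 0xD.  Then
 *
 *      (2^8 + 2^4) (u1 v1)        = 272 * 120      = 32640
 *      (2^4) (u1 - u0) (v0 - v1)  =  16 * (-1) * 1 =   -16
 *      (2^4 + 1) (u0 v0)          =  17 * 143      =  2431
 *
 * and 32640 - 16 + 2431 = 35055 = 0xAB * 0xCD, as expected.
 */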
static quad_t __lmulq(u_long, u_long);

quad_t
__muldi3(quad_t a, quad_t b)
{
        union uu u, v, low, prod;
        register u_long high, mid, udiff, vdiff;
        register int negall, negmid;
#define u1      u.ul[H]
#define u0      u.ul[L]
#define v1      v.ul[H]
#define v0      v.ul[L]

        /*
         * Get u and v such that u, v >= 0.  When this is finished,
         * u1, u0, v1, and v0 will be directly accessible through the
         * longword fields.
         */
        if (a >= 0)
                u.q = a, negall = 0;
        else
                u.q = -a, negall = 1;
        if (b >= 0)
                v.q = b;
        else
                v.q = -b, negall ^= 1;

        if (u1 == 0 && v1 == 0) {
                /*
                 * An (I hope) important optimization occurs when u1 and v1
                 * are both 0.  This should be common since most numbers
                 * are small.  Here the product is just u0*v0.
                 */
                prod.q = __lmulq(u0, v0);
        } else {
                /*
                 * Compute the three intermediate products, remembering
                 * whether the middle term is negative.  We can discard
                 * any upper bits in high and mid, so we can use native
                 * u_long * u_long => u_long arithmetic.
                 */
                low.q = __lmulq(u0, v0);

                if (u1 >= u0)
                        negmid = 0, udiff = u1 - u0;
                else
                        negmid = 1, udiff = u0 - u1;
                if (v0 >= v1)
                        vdiff = v0 - v1;
                else
                        vdiff = v1 - v0, negmid ^= 1;
                mid = udiff * vdiff;

                high = u1 * v1;

                /*
                 * Assemble the final product.
                 */
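                /*
                 * In the notation above this is (2^n)(high + mid + low) + low,
                 * taken mod 2^2n:  the low longword of the product is just
                 * the low longword of `low', while the high longword collects
                 * high, mid (negated if the middle term was negative), the
                 * low longword of `low' (from the 2^n * low term), and the
                 * high longword of `low' (from adding `low' itself, which
                 * cannot carry because the low longword of the shifted part
                 * is zero).
                 */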
                prod.ul[H] = high + (negmid ? -mid : mid) + low.ul[L] +
                    low.ul[H];
                prod.ul[L] = low.ul[L];
        }
        return (negall ? -prod.q : prod.q);
#undef u1
#undef u0
#undef v1
#undef v0
}

/*
 * Multiply two 2N-bit longs to produce a 4N-bit quad, where N is half
 * the number of bits in a long (whatever that is---the code below
 * does not care as long as quad.h does its part of the bargain---but
 * typically N==16).
 *
 * We use the same algorithm from Knuth, but this time the modulo refinement
 * does not apply.  On the other hand, since N is half the size of a long,
 * we can get away with native multiplication---none of our input terms
 * exceeds (ULONG_MAX >> 1).
 *
 * Note that, for u_long l, the quad-precision result
 *
 *      l << N
 *
 * splits into high and low longs as HHALF(l) and LHUP(l) respectively.
 */
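/*
 * Illustration (assuming the usual quad.h definitions, in which HHALF
 * extracts the upper half of a long, LHALF the lower half, and LHUP
 * shifts a half-long up into the upper half):  with N == 16 and
 * l == 0x12345678, HHALF(l) == 0x1234, LHALF(l) == 0x5678, and
 * LHUP(l) == 0x56780000, so l << 16 splits into the longs 0x1234 (high)
 * and 0x56780000 (low).
 */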
static quad_t
__lmulq(u_long u, u_long v)
{
        u_long u1, u0, v1, v0, udiff, vdiff, high, mid, low;
        u_long prodh, prodl, was;
        union uu prod;
        int neg;

        u1 = HHALF(u);
        u0 = LHALF(u);
        v1 = HHALF(v);
        v0 = LHALF(v);

        low = u0 * v0;

        /* This is the same small-number optimization as before. */
        if (u1 == 0 && v1 == 0)
                return (low);

        if (u1 >= u0)
                udiff = u1 - u0, neg = 0;
        else
                udiff = u0 - u1, neg = 1;
        if (v0 >= v1)
                vdiff = v0 - v1;
        else
                vdiff = v1 - v0, neg ^= 1;
        mid = udiff * vdiff;

        high = u1 * v1;

        /* prod = (high << 2N) + (high << N); */
        prodh = high + HHALF(high);
        prodl = LHUP(high);

        /* if (neg) prod -= mid << N; else prod += mid << N; */
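        /*
         * The comparisons against `was' below detect carry and borrow out
         * of the low longword:  after an unsigned add, the result is less
         * than the original value exactly when the add wrapped, and after
         * an unsigned subtract, the result is greater than the original
         * value exactly when a borrow occurred.
         */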
        if (neg) {
                was = prodl;
                prodl -= LHUP(mid);
                prodh -= HHALF(mid) + (prodl > was);
        } else {
                was = prodl;
                prodl += LHUP(mid);
                prodh += HHALF(mid) + (prodl < was);
        }

        /* prod += low << N */
        was = prodl;
        prodl += LHUP(low);
        prodh += HHALF(low) + (prodl < was);
        /* ... + low; */
        if ((prodl += low) < low)
                prodh++;

        /* return 4N-bit product */
        prod.ul[H] = prodh;
        prod.ul[L] = prodl;
        return (prod.q);
}
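
/*
 * A minimal usage sketch, kept out of the build behind an arbitrary guard
 * macro (MULDI3_EXAMPLE is not something quad.h defines).  It assumes a
 * hosted environment with <stdio.h> and that quad_t is a 64-bit signed
 * integer, as the code above already presumes.
 */
#ifdef MULDI3_EXAMPLE
#include <stdio.h>

int
main(void)
{
        quad_t a = 123456789, b = -987654321;
        quad_t p = __muldi3(a, b);

        /* Expected output: -121932631112635269 */
        printf("%lld\n", (long long)p);
        return (0);
}
#endif /* MULDI3_EXAMPLE */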