/*
 * include/asm-i386/xor.h
 *
 * Optimized RAID-5 checksumming functions for MMX and SSE.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2, or (at your option)
 * any later version.
 *
 * You should have received a copy of the GNU General Public License
 * (for example /usr/src/linux/COPYING); if not, write to the Free
 * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 */

/*
 * High-speed RAID5 checksumming functions utilizing MMX instructions.
 * Copyright (C) 1998 Ingo Molnar.
 */
#define LD(x,y)  " movq 8*("#x")(%1), %%mm"#y" ;\n"
#define ST(x,y)  " movq %%mm"#y", 8*("#x")(%1) ;\n"
#define XO1(x,y) " pxor 8*("#x")(%2), %%mm"#y" ;\n"
#define XO2(x,y) " pxor 8*("#x")(%3), %%mm"#y" ;\n"
#define XO3(x,y) " pxor 8*("#x")(%4), %%mm"#y" ;\n"
#define XO4(x,y) " pxor 8*("#x")(%5), %%mm"#y" ;\n"
#include <asm/i387.h>
static void
xor_pII_mmx_2(unsigned long bytes, unsigned long *p1, unsigned long *p2)
{
        unsigned long lines = bytes >> 7;

        kernel_fpu_begin();

        __asm__ __volatile__ (
#undef BLOCK
#define BLOCK(i) \
        LD(i,0)    \
        LD(i+1,1)  \
        LD(i+2,2)  \
        LD(i+3,3)  \
        XO1(i,0)   \
        ST(i,0)    \
        XO1(i+1,1) \
        ST(i+1,1)  \
        XO1(i+2,2) \
        ST(i+2,2)  \
        XO1(i+3,3) \
        ST(i+3,3)

        " .align 32 ;\n"
        " 1: ;\n"

        BLOCK(0)
        BLOCK(4)
        BLOCK(8)
        BLOCK(12)

        " addl $128, %1 ;\n"
        " addl $128, %2 ;\n"
        " decl %0 ;\n"
        " jnz 1b ;\n"
        : "+r" (lines),
          "+r" (p1), "+r" (p2)
        :
        : "memory");

        kernel_fpu_end();
}
static void
xor_pII_mmx_3(unsigned long bytes, unsigned long *p1, unsigned long *p2,
              unsigned long *p3)
{
        unsigned long lines = bytes >> 7;

        kernel_fpu_begin();

        __asm__ __volatile__ (
#undef BLOCK
#define BLOCK(i) \
        LD(i,0)    \
        LD(i+1,1)  \
        LD(i+2,2)  \
        LD(i+3,3)  \
        XO1(i,0)   \
        XO1(i+1,1) \
        XO1(i+2,2) \
        XO1(i+3,3) \
        XO2(i,0)   \
        ST(i,0)    \
        XO2(i+1,1) \
        ST(i+1,1)  \
        XO2(i+2,2) \
        ST(i+2,2)  \
        XO2(i+3,3) \
        ST(i+3,3)

        " .align 32 ;\n"
        " 1: ;\n"

        BLOCK(0)
        BLOCK(4)
        BLOCK(8)
        BLOCK(12)

        " addl $128, %1 ;\n"
        " addl $128, %2 ;\n"
        " addl $128, %3 ;\n"
        " decl %0 ;\n"
        " jnz 1b ;\n"
        : "+r" (lines),
          "+r" (p1), "+r" (p2), "+r" (p3)
        :
        : "memory");

        kernel_fpu_end();
}
static void
xor_pII_mmx_4(unsigned long bytes, unsigned long *p1, unsigned long *p2,
              unsigned long *p3, unsigned long *p4)
{
        unsigned long lines = bytes >> 7;

        kernel_fpu_begin();

        __asm__ __volatile__ (
#undef BLOCK
#define BLOCK(i) \
        LD(i,0)    \
        LD(i+1,1)  \
        LD(i+2,2)  \
        LD(i+3,3)  \
        XO1(i,0)   \
        XO1(i+1,1) \
        XO1(i+2,2) \
        XO1(i+3,3) \
        XO2(i,0)   \
        XO2(i+1,1) \
        XO2(i+2,2) \
        XO2(i+3,3) \
        XO3(i,0)   \
        ST(i,0)    \
        XO3(i+1,1) \
        ST(i+1,1)  \
        XO3(i+2,2) \
        ST(i+2,2)  \
        XO3(i+3,3) \
        ST(i+3,3)

        " .align 32 ;\n"
        " 1: ;\n"

        BLOCK(0)
        BLOCK(4)
        BLOCK(8)
        BLOCK(12)

        " addl $128, %1 ;\n"
        " addl $128, %2 ;\n"
        " addl $128, %3 ;\n"
        " addl $128, %4 ;\n"
        " decl %0 ;\n"
        " jnz 1b ;\n"
        : "+r" (lines),
          "+r" (p1), "+r" (p2), "+r" (p3), "+r" (p4)
        :
        : "memory");

        kernel_fpu_end();
}
static void
xor_pII_mmx_5(unsigned long bytes, unsigned long *p1, unsigned long *p2,
              unsigned long *p3, unsigned long *p4, unsigned long *p5)
{
        unsigned long lines = bytes >> 7;

        kernel_fpu_begin();

        /* Need to save/restore p4/p5 manually, otherwise gcc's 10-argument
           limit gets exceeded (a "+" constraint counts as two arguments). */
        __asm__ __volatile__ (
        " pushl %4\n"
        " pushl %5\n"
#undef BLOCK
#define BLOCK(i) \
        LD(i,0)    \
        LD(i+1,1)  \
        LD(i+2,2)  \
        LD(i+3,3)  \
        XO1(i,0)   \
        XO1(i+1,1) \
        XO1(i+2,2) \
        XO1(i+3,3) \
        XO2(i,0)   \
        XO2(i+1,1) \
        XO2(i+2,2) \
        XO2(i+3,3) \
        XO3(i,0)   \
        XO3(i+1,1) \
        XO3(i+2,2) \
        XO3(i+3,3) \
        XO4(i,0)   \
        ST(i,0)    \
        XO4(i+1,1) \
        ST(i+1,1)  \
        XO4(i+2,2) \
        ST(i+2,2)  \
        XO4(i+3,3) \
        ST(i+3,3)

        " .align 32 ;\n"
        " 1: ;\n"

        BLOCK(0)
        BLOCK(4)
        BLOCK(8)
        BLOCK(12)

        " addl $128, %1 ;\n"
        " addl $128, %2 ;\n"
        " addl $128, %3 ;\n"
        " addl $128, %4 ;\n"
        " addl $128, %5 ;\n"
        " decl %0 ;\n"
        " jnz 1b ;\n"
        " popl %5\n"
        " popl %4\n"
        : "+r" (lines),
          "+r" (p1), "+r" (p2), "+r" (p3)
        : "r" (p4), "r" (p5)
        : "memory");

        kernel_fpu_end();
}
#undef LD
#undef XO1
#undef XO2
#undef XO3
#undef XO4
#undef ST
#undef BLOCK
static void
xor_p5_mmx_2(unsigned long bytes, unsigned long *p1, unsigned long *p2)
{
        unsigned long lines = bytes >> 6;

        kernel_fpu_begin();

        __asm__ __volatile__ (
        " .align 32 ;\n"
        " 1: ;\n"
        " movq (%1), %%mm0 ;\n"
        " movq 8(%1), %%mm1 ;\n"
        " pxor (%2), %%mm0 ;\n"
        " movq 16(%1), %%mm2 ;\n"
        " movq %%mm0, (%1) ;\n"
        " pxor 8(%2), %%mm1 ;\n"
        " movq 24(%1), %%mm3 ;\n"
        " movq %%mm1, 8(%1) ;\n"
        " pxor 16(%2), %%mm2 ;\n"
        " movq 32(%1), %%mm4 ;\n"
        " movq %%mm2, 16(%1) ;\n"
        " pxor 24(%2), %%mm3 ;\n"
        " movq 40(%1), %%mm5 ;\n"
        " movq %%mm3, 24(%1) ;\n"
        " pxor 32(%2), %%mm4 ;\n"
        " movq 48(%1), %%mm6 ;\n"
        " movq %%mm4, 32(%1) ;\n"
        " pxor 40(%2), %%mm5 ;\n"
        " movq 56(%1), %%mm7 ;\n"
        " movq %%mm5, 40(%1) ;\n"
        " pxor 48(%2), %%mm6 ;\n"
        " pxor 56(%2), %%mm7 ;\n"
        " movq %%mm6, 48(%1) ;\n"
        " movq %%mm7, 56(%1) ;\n"

        " addl $64, %1 ;\n"
        " addl $64, %2 ;\n"
        " decl %0 ;\n"
        " jnz 1b ;\n"
        : "+r" (lines),
          "+r" (p1), "+r" (p2)
        :
        : "memory");

        kernel_fpu_end();
}
static void
xor_p5_mmx_3(unsigned long bytes, unsigned long *p1, unsigned long *p2,
             unsigned long *p3)
{
        unsigned long lines = bytes >> 6;

        kernel_fpu_begin();

        __asm__ __volatile__ (
        " .align 32,0x90 ;\n"
        " 1: ;\n"
        " movq (%1), %%mm0 ;\n"
        " movq 8(%1), %%mm1 ;\n"
        " pxor (%2), %%mm0 ;\n"
        " movq 16(%1), %%mm2 ;\n"
        " pxor 8(%2), %%mm1 ;\n"
        " pxor (%3), %%mm0 ;\n"
        " pxor 16(%2), %%mm2 ;\n"
        " movq %%mm0, (%1) ;\n"
        " pxor 8(%3), %%mm1 ;\n"
        " pxor 16(%3), %%mm2 ;\n"
        " movq 24(%1), %%mm3 ;\n"
        " movq %%mm1, 8(%1) ;\n"
        " movq 32(%1), %%mm4 ;\n"
        " movq 40(%1), %%mm5 ;\n"
        " pxor 24(%2), %%mm3 ;\n"
        " movq %%mm2, 16(%1) ;\n"
        " pxor 32(%2), %%mm4 ;\n"
        " pxor 24(%3), %%mm3 ;\n"
        " pxor 40(%2), %%mm5 ;\n"
        " movq %%mm3, 24(%1) ;\n"
        " pxor 32(%3), %%mm4 ;\n"
        " pxor 40(%3), %%mm5 ;\n"
        " movq 48(%1), %%mm6 ;\n"
        " movq %%mm4, 32(%1) ;\n"
        " movq 56(%1), %%mm7 ;\n"
        " pxor 48(%2), %%mm6 ;\n"
        " movq %%mm5, 40(%1) ;\n"
        " pxor 56(%2), %%mm7 ;\n"
        " pxor 48(%3), %%mm6 ;\n"
        " pxor 56(%3), %%mm7 ;\n"
        " movq %%mm6, 48(%1) ;\n"
        " movq %%mm7, 56(%1) ;\n"

        " addl $64, %1 ;\n"
        " addl $64, %2 ;\n"
        " addl $64, %3 ;\n"
        " decl %0 ;\n"
        " jnz 1b ;\n"
        : "+r" (lines),
          "+r" (p1), "+r" (p2), "+r" (p3)
        :
        : "memory");

        kernel_fpu_end();
}
static void
xor_p5_mmx_4(unsigned long bytes, unsigned long *p1, unsigned long *p2,
             unsigned long *p3, unsigned long *p4)
{
        unsigned long lines = bytes >> 6;

        kernel_fpu_begin();

        __asm__ __volatile__ (
        " .align 32,0x90 ;\n"
        " 1: ;\n"
        " movq (%1), %%mm0 ;\n"
        " movq 8(%1), %%mm1 ;\n"
        " pxor (%2), %%mm0 ;\n"
        " movq 16(%1), %%mm2 ;\n"
        " pxor 8(%2), %%mm1 ;\n"
        " pxor (%3), %%mm0 ;\n"
        " pxor 16(%2), %%mm2 ;\n"
        " pxor 8(%3), %%mm1 ;\n"
        " pxor (%4), %%mm0 ;\n"
        " movq 24(%1), %%mm3 ;\n"
        " pxor 16(%3), %%mm2 ;\n"
        " pxor 8(%4), %%mm1 ;\n"
        " movq %%mm0, (%1) ;\n"
        " movq 32(%1), %%mm4 ;\n"
        " pxor 24(%2), %%mm3 ;\n"
        " pxor 16(%4), %%mm2 ;\n"
        " movq %%mm1, 8(%1) ;\n"
        " movq 40(%1), %%mm5 ;\n"
        " pxor 32(%2), %%mm4 ;\n"
        " pxor 24(%3), %%mm3 ;\n"
        " movq %%mm2, 16(%1) ;\n"
        " pxor 40(%2), %%mm5 ;\n"
        " pxor 32(%3), %%mm4 ;\n"
        " pxor 24(%4), %%mm3 ;\n"
        " movq %%mm3, 24(%1) ;\n"
        " movq 56(%1), %%mm7 ;\n"
        " movq 48(%1), %%mm6 ;\n"
        " pxor 40(%3), %%mm5 ;\n"
        " pxor 32(%4), %%mm4 ;\n"
        " pxor 48(%2), %%mm6 ;\n"
        " movq %%mm4, 32(%1) ;\n"
        " pxor 56(%2), %%mm7 ;\n"
        " pxor 40(%4), %%mm5 ;\n"
        " pxor 48(%3), %%mm6 ;\n"
        " pxor 56(%3), %%mm7 ;\n"
        " movq %%mm5, 40(%1) ;\n"
        " pxor 48(%4), %%mm6 ;\n"
        " pxor 56(%4), %%mm7 ;\n"
        " movq %%mm6, 48(%1) ;\n"
        " movq %%mm7, 56(%1) ;\n"

        " addl $64, %1 ;\n"
        " addl $64, %2 ;\n"
        " addl $64, %3 ;\n"
        " addl $64, %4 ;\n"
        " decl %0 ;\n"
        " jnz 1b ;\n"
        : "+r" (lines),
          "+r" (p1), "+r" (p2), "+r" (p3), "+r" (p4)
        :
        : "memory");

        kernel_fpu_end();
}
static void
xor_p5_mmx_5(unsigned long bytes, unsigned long *p1, unsigned long *p2,
             unsigned long *p3, unsigned long *p4, unsigned long *p5)
{
        unsigned long lines = bytes >> 6;

        kernel_fpu_begin();

        /* need to save p4/p5 manually to not exceed gcc's 10 argument limit */
        __asm__ __volatile__ (
        " pushl %4\n"
        " pushl %5\n"
        " .align 32,0x90 ;\n"
        " 1: ;\n"
        " movq (%1), %%mm0 ;\n"
        " movq 8(%1), %%mm1 ;\n"
        " pxor (%2), %%mm0 ;\n"
        " pxor 8(%2), %%mm1 ;\n"
        " movq 16(%1), %%mm2 ;\n"
        " pxor (%3), %%mm0 ;\n"
        " pxor 8(%3), %%mm1 ;\n"
        " pxor 16(%2), %%mm2 ;\n"
        " pxor (%4), %%mm0 ;\n"
        " pxor 8(%4), %%mm1 ;\n"
        " pxor 16(%3), %%mm2 ;\n"
        " movq 24(%1), %%mm3 ;\n"
        " pxor (%5), %%mm0 ;\n"
        " pxor 8(%5), %%mm1 ;\n"
        " movq %%mm0, (%1) ;\n"
        " pxor 16(%4), %%mm2 ;\n"
        " pxor 24(%2), %%mm3 ;\n"
        " movq %%mm1, 8(%1) ;\n"
        " pxor 16(%5), %%mm2 ;\n"
        " pxor 24(%3), %%mm3 ;\n"
        " movq 32(%1), %%mm4 ;\n"
        " movq %%mm2, 16(%1) ;\n"
        " pxor 24(%4), %%mm3 ;\n"
        " pxor 32(%2), %%mm4 ;\n"
        " movq 40(%1), %%mm5 ;\n"
        " pxor 24(%5), %%mm3 ;\n"
        " pxor 32(%3), %%mm4 ;\n"
        " pxor 40(%2), %%mm5 ;\n"
        " movq %%mm3, 24(%1) ;\n"
        " pxor 32(%4), %%mm4 ;\n"
        " pxor 40(%3), %%mm5 ;\n"
        " movq 48(%1), %%mm6 ;\n"
        " movq 56(%1), %%mm7 ;\n"
        " pxor 32(%5), %%mm4 ;\n"
        " pxor 40(%4), %%mm5 ;\n"
        " pxor 48(%2), %%mm6 ;\n"
        " pxor 56(%2), %%mm7 ;\n"
        " movq %%mm4, 32(%1) ;\n"
        " pxor 48(%3), %%mm6 ;\n"
        " pxor 56(%3), %%mm7 ;\n"
        " pxor 40(%5), %%mm5 ;\n"
        " pxor 48(%4), %%mm6 ;\n"
        " pxor 56(%4), %%mm7 ;\n"
        " movq %%mm5, 40(%1) ;\n"
        " pxor 48(%5), %%mm6 ;\n"
        " pxor 56(%5), %%mm7 ;\n"
        " movq %%mm6, 48(%1) ;\n"
        " movq %%mm7, 56(%1) ;\n"

        " addl $64, %1 ;\n"
        " addl $64, %2 ;\n"
        " addl $64, %3 ;\n"
        " addl $64, %4 ;\n"
        " addl $64, %5 ;\n"
        " decl %0 ;\n"
        " jnz 1b ;\n"
        " popl %5\n"
        " popl %4\n"
        : "+g" (lines),
          "+r" (p1), "+r" (p2), "+r" (p3)
        : "r" (p4), "r" (p5)
        : "memory");

        kernel_fpu_end();
}
static struct xor_block_template xor_block_pII_mmx = {
        .name = "pII_mmx",
        .do_2 = xor_pII_mmx_2,
        .do_3 = xor_pII_mmx_3,
        .do_4 = xor_pII_mmx_4,
        .do_5 = xor_pII_mmx_5,
};

static struct xor_block_template xor_block_p5_mmx = {
        .name = "p5_mmx",
        .do_2 = xor_p5_mmx_2,
        .do_3 = xor_p5_mmx_3,
        .do_4 = xor_p5_mmx_4,
        .do_5 = xor_p5_mmx_5,
};
/*
 * Cache avoiding checksumming functions utilizing KNI instructions
 * Copyright (C) 1999 Zach Brown (with obvious credit due Ingo)
 */
#define XMMS_SAVE do {                          \
        preempt_disable();                      \
        __asm__ __volatile__ (                  \
                "movl %%cr0,%0          ;\n\t"  \
                "clts                   ;\n\t"  \
                "movups %%xmm0,(%1)     ;\n\t"  \
                "movups %%xmm1,0x10(%1) ;\n\t"  \
                "movups %%xmm2,0x20(%1) ;\n\t"  \
                "movups %%xmm3,0x30(%1) ;\n\t"  \
                : "=&r" (cr0)                   \
                : "r" (xmm_save)                \
                : "memory");                    \
} while(0)

#define XMMS_RESTORE do {                       \
        __asm__ __volatile__ (                  \
                "sfence                 ;\n\t"  \
                "movups (%1),%%xmm0     ;\n\t"  \
                "movups 0x10(%1),%%xmm1 ;\n\t"  \
                "movups 0x20(%1),%%xmm2 ;\n\t"  \
                "movups 0x30(%1),%%xmm3 ;\n\t"  \
                "movl %0,%%cr0          ;\n\t"  \
                :                               \
                : "r" (cr0), "r" (xmm_save)     \
                : "memory");                    \
        preempt_enable();                       \
} while(0)
#define ALIGN16 __attribute__((aligned(16)))

#define OFFS(x)    "16*("#x")"
#define PF_OFFS(x) "256+16*("#x")"
#define PF0(x)     " prefetchnta "PF_OFFS(x)"(%1) ;\n"
#define LD(x,y)    " movaps "OFFS(x)"(%1), %%xmm"#y" ;\n"
#define ST(x,y)    " movaps %%xmm"#y", "OFFS(x)"(%1) ;\n"
#define PF1(x)     " prefetchnta "PF_OFFS(x)"(%2) ;\n"
#define PF2(x)     " prefetchnta "PF_OFFS(x)"(%3) ;\n"
#define PF3(x)     " prefetchnta "PF_OFFS(x)"(%4) ;\n"
#define PF4(x)     " prefetchnta "PF_OFFS(x)"(%5) ;\n"
#define PF5(x)     " prefetchnta "PF_OFFS(x)"(%6) ;\n"
#define XO1(x,y)   " xorps "OFFS(x)"(%2), %%xmm"#y" ;\n"
#define XO2(x,y)   " xorps "OFFS(x)"(%3), %%xmm"#y" ;\n"
#define XO3(x,y)   " xorps "OFFS(x)"(%4), %%xmm"#y" ;\n"
#define XO4(x,y)   " xorps "OFFS(x)"(%5), %%xmm"#y" ;\n"
#define XO5(x,y)   " xorps "OFFS(x)"(%6), %%xmm"#y" ;\n"
static void
xor_sse_2(unsigned long bytes, unsigned long *p1, unsigned long *p2)
{
        unsigned long lines = bytes >> 8;
        char xmm_save[16*4] ALIGN16;
        int cr0;

        XMMS_SAVE;

        __asm__ __volatile__ (
#undef BLOCK
#define BLOCK(i) \
        LD(i,0)    \
        LD(i+1,1)  \
        PF1(i)     \
        PF1(i+2)   \
        LD(i+2,2)  \
        LD(i+3,3)  \
        PF0(i+4)   \
        PF0(i+6)   \
        XO1(i,0)   \
        XO1(i+1,1) \
        XO1(i+2,2) \
        XO1(i+3,3) \
        ST(i,0)    \
        ST(i+1,1)  \
        ST(i+2,2)  \
        ST(i+3,3)

        PF0(0)
        PF0(2)

        " .align 32 ;\n"
        " 1: ;\n"

        BLOCK(0)
        BLOCK(4)
        BLOCK(8)
        BLOCK(12)

        " addl $256, %1 ;\n"
        " addl $256, %2 ;\n"
        " decl %0 ;\n"
        " jnz 1b ;\n"
        : "+r" (lines),
          "+r" (p1), "+r" (p2)
        :
        : "memory");

        XMMS_RESTORE;
}
static void
xor_sse_3(unsigned long bytes, unsigned long *p1, unsigned long *p2,
          unsigned long *p3)
{
        unsigned long lines = bytes >> 8;
        char xmm_save[16*4] ALIGN16;
        int cr0;

        XMMS_SAVE;

        __asm__ __volatile__ (
#undef BLOCK
#define BLOCK(i) \
        PF1(i)     \
        PF1(i+2)   \
        LD(i,0)    \
        LD(i+1,1)  \
        LD(i+2,2)  \
        LD(i+3,3)  \
        PF2(i)     \
        PF2(i+2)   \
        PF0(i+4)   \
        PF0(i+6)   \
        XO1(i,0)   \
        XO1(i+1,1) \
        XO1(i+2,2) \
        XO1(i+3,3) \
        XO2(i,0)   \
        XO2(i+1,1) \
        XO2(i+2,2) \
        XO2(i+3,3) \
        ST(i,0)    \
        ST(i+1,1)  \
        ST(i+2,2)  \
        ST(i+3,3)

        PF0(0)
        PF0(2)

        " .align 32 ;\n"
        " 1: ;\n"

        BLOCK(0)
        BLOCK(4)
        BLOCK(8)
        BLOCK(12)

        " addl $256, %1 ;\n"
        " addl $256, %2 ;\n"
        " addl $256, %3 ;\n"
        " decl %0 ;\n"
        " jnz 1b ;\n"
        : "+r" (lines),
          "+r" (p1), "+r" (p2), "+r" (p3)
        :
        : "memory");

        XMMS_RESTORE;
}
static void
xor_sse_4(unsigned long bytes, unsigned long *p1, unsigned long *p2,
          unsigned long *p3, unsigned long *p4)
{
        unsigned long lines = bytes >> 8;
        char xmm_save[16*4] ALIGN16;
        int cr0;

        XMMS_SAVE;

        __asm__ __volatile__ (
#undef BLOCK
#define BLOCK(i) \
        PF1(i)     \
        PF1(i+2)   \
        LD(i,0)    \
        LD(i+1,1)  \
        LD(i+2,2)  \
        LD(i+3,3)  \
        PF2(i)     \
        PF2(i+2)   \
        XO1(i,0)   \
        XO1(i+1,1) \
        XO1(i+2,2) \
        XO1(i+3,3) \
        PF3(i)     \
        PF3(i+2)   \
        PF0(i+4)   \
        PF0(i+6)   \
        XO2(i,0)   \
        XO2(i+1,1) \
        XO2(i+2,2) \
        XO2(i+3,3) \
        XO3(i,0)   \
        XO3(i+1,1) \
        XO3(i+2,2) \
        XO3(i+3,3) \
        ST(i,0)    \
        ST(i+1,1)  \
        ST(i+2,2)  \
        ST(i+3,3)

        PF0(0)
        PF0(2)

        " .align 32 ;\n"
        " 1: ;\n"

        BLOCK(0)
        BLOCK(4)
        BLOCK(8)
        BLOCK(12)

        " addl $256, %1 ;\n"
        " addl $256, %2 ;\n"
        " addl $256, %3 ;\n"
        " addl $256, %4 ;\n"
        " decl %0 ;\n"
        " jnz 1b ;\n"
        : "+r" (lines),
          "+r" (p1), "+r" (p2), "+r" (p3), "+r" (p4)
        :
        : "memory");

        XMMS_RESTORE;
}
static void
xor_sse_5(unsigned long bytes, unsigned long *p1, unsigned long *p2,
          unsigned long *p3, unsigned long *p4, unsigned long *p5)
{
        unsigned long lines = bytes >> 8;
        char xmm_save[16*4] ALIGN16;
        int cr0;

        XMMS_SAVE;

        /* need to save p4/p5 manually to not exceed gcc's 10 argument limit */
        __asm__ __volatile__ (
        " pushl %4\n"
        " pushl %5\n"
#undef BLOCK
#define BLOCK(i) \
        PF1(i)     \
        PF1(i+2)   \
        LD(i,0)    \
        LD(i+1,1)  \
        LD(i+2,2)  \
        LD(i+3,3)  \
        PF2(i)     \
        PF2(i+2)   \
        XO1(i,0)   \
        XO1(i+1,1) \
        XO1(i+2,2) \
        XO1(i+3,3) \
        PF3(i)     \
        PF3(i+2)   \
        XO2(i,0)   \
        XO2(i+1,1) \
        XO2(i+2,2) \
        XO2(i+3,3) \
        PF4(i)     \
        PF4(i+2)   \
        PF0(i+4)   \
        PF0(i+6)   \
        XO3(i,0)   \
        XO3(i+1,1) \
        XO3(i+2,2) \
        XO3(i+3,3) \
        XO4(i,0)   \
        XO4(i+1,1) \
        XO4(i+2,2) \
        XO4(i+3,3) \
        ST(i,0)    \
        ST(i+1,1)  \
        ST(i+2,2)  \
        ST(i+3,3)

        PF0(0)
        PF0(2)

        " .align 32 ;\n"
        " 1: ;\n"

        BLOCK(0)
        BLOCK(4)
        BLOCK(8)
        BLOCK(12)

        " addl $256, %1 ;\n"
        " addl $256, %2 ;\n"
        " addl $256, %3 ;\n"
        " addl $256, %4 ;\n"
        " addl $256, %5 ;\n"
        " decl %0 ;\n"
        " jnz 1b ;\n"
        " popl %5\n"
        " popl %4\n"
        : "+r" (lines),
          "+r" (p1), "+r" (p2), "+r" (p3)
        : "r" (p4), "r" (p5)
        : "memory");

        XMMS_RESTORE;
}
static struct xor_block_template xor_block_pIII_sse = {
        .name = "pIII_sse",
        .do_2 = xor_sse_2,
        .do_3 = xor_sse_3,
        .do_4 = xor_sse_4,
        .do_5 = xor_sse_5,
};
/* Also try the generic routines. */
#include <asm-generic/xor.h>

#undef XOR_TRY_TEMPLATES
#define XOR_TRY_TEMPLATES                               \
        do {                                            \
                xor_speed(&xor_block_8regs);            \
                xor_speed(&xor_block_8regs_p);          \
                xor_speed(&xor_block_32regs);           \
                xor_speed(&xor_block_32regs_p);         \
                if (cpu_has_xmm)                        \
                        xor_speed(&xor_block_pIII_sse); \
                if (cpu_has_mmx) {                      \
                        xor_speed(&xor_block_pII_mmx);  \
                        xor_speed(&xor_block_p5_mmx);   \
                }                                       \
        } while (0)
/* We force the use of the SSE xor block because it can write around the
   L2 cache.  We may also be able to load into the L1 cache only, depending
   on how the cpu deals with a load to a line that is being prefetched. */
#define XOR_SELECT_TEMPLATE(FASTEST) \
        (cpu_has_xmm ? &xor_block_pIII_sse : FASTEST)
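/*
 * Sketch of how these hooks are consumed (simplified and hypothetical --
 * the real caller is the generic xor calibration code, which benchmarks
 * each candidate via xor_speed() and keeps the fastest):
 *
 *      struct xor_block_template *fastest;
 *      XOR_TRY_TEMPLATES;                      // time every candidate
 *      fastest = ...;                          // best measured template
 *      fastest = XOR_SELECT_TEMPLATE(fastest); // SSE wins if available
 */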