Subversion Repositories shark

Rev

Details | Last modification | View Log | RSS feed

Rev Author Line No. Line
70 giacomo 1
/* $Id: sse_xform3.s,v 1.1 2003-03-13 12:11:49 giacomo Exp $ */
2
 
3
/*
4
 * Mesa 3-D graphics library
5
 * Version:  3.5
6
 *
7
 * Copyright (C) 1999-2001  Brian Paul   All Rights Reserved.
8
 *
9
 * Permission is hereby granted, free of charge, to any person obtaining a
10
 * copy of this software and associated documentation files (the "Software"),
11
 * to deal in the Software without restriction, including without limitation
12
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
13
 * and/or sell copies of the Software, and to permit persons to whom the
14
 * Software is furnished to do so, subject to the following conditions:
15
 *
16
 * The above copyright notice and this permission notice shall be included
17
 * in all copies or substantial portions of the Software.
18
 *
19
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
20
 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
21
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
22
 * BRIAN PAUL BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN
23
 * AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
24
 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
25
 */
26
 
27
/** TODO:
28
  * - insert PREFETCH instructions to avoid cache-misses !
29
  * - some more optimizations are possible...
30
  * - for 40-50% more performance in the SSE-functions, the
31
  *   data (trans-matrix, src_vert, dst_vert) needs to be 16byte aligned !
32
  */
33
 
34
#include "matypes.h"
35
#include "xform_args.h"
36
 
37
   SEG_TEXT
38
 
39
/*
 * Element accessors: S(i), D(i) and M(i) address the i-th 4-byte element
 * relative to ESI, EDI and EDX respectively.  The routines below point
 * those registers at the current source vertex, the current dest vertex
 * and the matrix.  The argument is pasted unparenthesized into "i * 4",
 * so pass only a plain constant.
 */
#define S(i) 	REGOFF(i * 4, ESI)
40
#define D(i) 	REGOFF(i * 4, EDI)
41
#define M(i) 	REGOFF(i * 4, EDX)
42
 
43
 
44
/*
 * _mesa_sse_transform_points3_general
 *
 * Multiplies every 3-component source vertex (ox, oy, oz) by all sixteen
 * matrix elements and stores a 4-component result:
 *
 *     dest[k] = m[k]*ox + m[4+k]*oy + m[8+k]*oz + m[12+k]      (k = 0..3)
 *
 * Stack arguments: dest GLvector4f (OFFSET_DEST), matrix (ARG_MATRIX),
 * source GLvector4f (OFFSET_SOURCE).
 *
 * Dest bookkeeping: flags |= VEC_SIZE_4, count = source count, size = 4.
 * When the source count is zero the routine returns without touching dest.
 *
 * Loop registers:
 *   ESI        current source vertex, advanced by the source stride (EAX)
 *   EDI        current dest vertex, advanced by 16 bytes
 *   ECX        dest start + count*16 (one past the last dest vertex)
 *   XMM0-XMM3  m0..m3, m4..m7, m8..m11, m12..m15
 * Clobbers EAX, ECX, EDX, XMM0-XMM6 and the flags; ESI and EDI are
 * saved and restored.
 *
 * MOVAPS is used for the matrix loads and for the dest store, so the
 * matrix and every dest vertex must be 16-byte aligned.
 */
ALIGNTEXT4
GLOBL GLNAME(_mesa_sse_transform_points3_general)
GLNAME( _mesa_sse_transform_points3_general ):

/* ESI and EDI are pushed below, so the stack arguments sit 8 bytes further up. */
#define FRAME_OFFSET 8
    PUSH_L( ESI )
    PUSH_L( EDI )

    MOV_L( REGOFF(OFFSET_SOURCE+FRAME_OFFSET, ESP), ESI )   /* source GLvector4f */
    MOV_L( REGOFF(OFFSET_DEST+FRAME_OFFSET, ESP), EDI )     /* dest GLvector4f */

    MOV_L( ARG_MATRIX, EDX )                    /* matrix */
    MOV_L( REGOFF(V4F_COUNT, ESI), ECX )        /* number of source vertices */

    /* Same zero test as the other transform_points3 routines in this file. */
    TEST_L( ECX, ECX )
    JZ( LLBL(K_GTPGR_finish) )                  /* nothing to transform */

    MOV_L( REGOFF(V4F_STRIDE, ESI), EAX )       /* source stride in bytes */
    OR_L( CONST(VEC_SIZE_4), REGOFF(V4F_FLAGS, EDI) )

    MOV_L( ECX, REGOFF(V4F_COUNT, EDI) )        /* dest count = source count */
    MOV_L( CONST(4), REGOFF(V4F_SIZE, EDI) )    /* dest vertices have 4 components */

    SHL_L( CONST(4), ECX )                      /* count * 16 bytes per dest vertex */
    MOV_L( REGOFF(V4F_START, ESI), ESI )        /* first source vertex */

    MOV_L( REGOFF(V4F_START, EDI), EDI )        /* first dest vertex */
    ADD_L( EDI, ECX )                           /* ECX = end of dest data */


ALIGNTEXT32
    /* Matrix elements stay in registers for the whole loop. */
    MOVAPS( M(0), XMM0 )                        /* m3  | m2  | m1  | m0  */
    MOVAPS( M(4), XMM1 )                        /* m7  | m6  | m5  | m4  */
    MOVAPS( M(8), XMM2 )                        /* m11 | m10 | m9  | m8  */
    MOVAPS( M(12), XMM3 )                       /* m15 | m14 | m13 | m12 */


ALIGNTEXT32
LLBL(K_GTPGR_top):
    /* Broadcast each source component and scale its matrix group. */
    MOVSS( S(0), XMM4 )                         /* ox in lane 0 */
    SHUFPS( CONST(0x0), XMM4, XMM4 )            /* ox | ox | ox | ox */
    MULPS( XMM0, XMM4 )                         /* m3*ox | m2*ox | m1*ox | m0*ox */

    MOVSS( S(1), XMM5 )                         /* oy in lane 0 */
    SHUFPS( CONST(0x0), XMM5, XMM5 )            /* oy | oy | oy | oy */
    MULPS( XMM1, XMM5 )                         /* m7*oy | m6*oy | m5*oy | m4*oy */

    MOVSS( S(2), XMM6 )                         /* oz in lane 0 */
    SHUFPS( CONST(0x0), XMM6, XMM6 )            /* oz | oz | oz | oz */
    MULPS( XMM2, XMM6 )                         /* m11*oz | m10*oz | m9*oz | m8*oz */

    /* Sum the three products, then add m12..m15. */
    ADDPS( XMM5, XMM4 )
    ADDPS( XMM6, XMM4 )
    ADDPS( XMM3, XMM4 )

    MOVAPS( XMM4, D(0) )                        /* store all four dest components */

LLBL(K_GTPGR_skip):
    ADD_L( CONST(16), EDI )                     /* next dest vertex */
    ADD_L( EAX, ESI )                           /* next source vertex */
    CMP_L( ECX, EDI )
    JNE( LLBL(K_GTPGR_top) )                    /* loop until dest end is reached */

LLBL(K_GTPGR_finish):
    POP_L( EDI )
    POP_L( ESI )
    RET
#undef FRAME_OFFSET
111
 
112
 
113
/*
 * _mesa_sse_transform_points3_identity
 *
 * Copies the first three floats of every source vertex to the matching
 * dest vertex; the fourth dest float is not written.
 *
 * Stack arguments used: dest GLvector4f (OFFSET_DEST) and source
 * GLvector4f (OFFSET_SOURCE).
 *
 * Dest bookkeeping: flags |= VEC_SIZE_3, count = source count, size = 3.
 * A zero source count returns before dest is touched.  If the source and
 * dest vertex data start at the same address the copy loop is skipped
 * (the bookkeeping has already been written at that point).
 *
 * Clobbers EAX, ECX, XMM0 and the flags; ESI and EDI are saved and restored.
 */
ALIGNTEXT4
GLOBL GLNAME(_mesa_sse_transform_points3_identity)
GLNAME( _mesa_sse_transform_points3_identity ):

/* Two registers are pushed below: stack arguments move up by 8 bytes. */
#define FRAME_OFFSET 8
    PUSH_L( ESI )
    PUSH_L( EDI )

    MOV_L( REGOFF(OFFSET_SOURCE+FRAME_OFFSET, ESP), ESI )   /* source GLvector4f */
    MOV_L( REGOFF(OFFSET_DEST+FRAME_OFFSET, ESP), EDI )     /* dest GLvector4f */

    MOV_L( REGOFF(V4F_COUNT, ESI), ECX )        /* vertices to copy */

    TEST_L( ECX, ECX )
    JZ( LLBL(K_GTPIR_finish) )                  /* empty source: done */

    MOV_L( REGOFF(V4F_STRIDE, ESI), EAX )       /* bytes between source vertices */

    MOV_L( ECX, REGOFF(V4F_COUNT, EDI) )        /* dest count = source count */
    MOV_L( CONST(3), REGOFF(V4F_SIZE, EDI) )    /* three components per dest vertex */
    OR_L( CONST(VEC_SIZE_3), REGOFF(V4F_FLAGS, EDI) )

    MOV_L( REGOFF(V4F_START, ESI), ESI )        /* first source vertex */
    MOV_L( REGOFF(V4F_START, EDI), EDI )        /* first dest vertex */

    SHL_L( CONST(4), ECX )                      /* count * 16 bytes */
    ADD_L( EDI, ECX )                           /* ECX = one past the last dest vertex */

    CMP_L( ESI, EDI )
    JE( LLBL(K_GTPIR_finish) )                  /* same start address: nothing to copy */


ALIGNTEXT32
LLBL(K_GTPIR_top):
    MOVLPS( S(0), XMM0 )                        /* fetch x and y together */
    MOVLPS( XMM0, D(0) )                        /* dest x, y */
    MOVSS( S(2), XMM0 )                         /* fetch z */
    MOVSS( XMM0, D(2) )                         /* dest z */

LLBL(K_GTPIR_skip):
    ADD_L( CONST(16), EDI )                     /* dest vertices are 16 bytes apart */
    ADD_L( EAX, ESI )                           /* step source by its stride */
    CMP_L( ECX, EDI )
    JNE( LLBL(K_GTPIR_top) )

LLBL(K_GTPIR_finish):
    POP_L( EDI )
    POP_L( ESI )
    RET
#undef FRAME_OFFSET
163
 
164
 
165
 
166
 
167
/*
 * _mesa_sse_transform_points3_3d_no_rot
 *
 * Per-axis scale plus offset of every 3-component source vertex:
 *
 *     dest x = ox*m0  + m12
 *     dest y = oy*m5  + m13
 *     dest z = oz*m10 + m14
 *
 * The fourth dest float is not written.
 *
 * Stack arguments: dest GLvector4f (OFFSET_DEST), matrix (ARG_MATRIX),
 * source GLvector4f (OFFSET_SOURCE).
 *
 * Dest bookkeeping: flags |= VEC_SIZE_3, count = source count, size = 3.
 * A zero source count returns before dest is touched.
 *
 * Clobbers EAX, ECX, EDX, XMM0-XMM4 and the flags; ESI and EDI are
 * saved and restored.
 */
ALIGNTEXT4
GLOBL GLNAME(_mesa_sse_transform_points3_3d_no_rot)
GLNAME(_mesa_sse_transform_points3_3d_no_rot):

/* Two registers are pushed below: stack arguments move up by 8 bytes. */
#define FRAME_OFFSET 8
    PUSH_L( ESI )
    PUSH_L( EDI )

    MOV_L( REGOFF(OFFSET_SOURCE+FRAME_OFFSET, ESP), ESI )   /* source GLvector4f */
    MOV_L( REGOFF(OFFSET_DEST+FRAME_OFFSET, ESP), EDI )     /* dest GLvector4f */

    MOV_L( ARG_MATRIX, EDX )                    /* matrix */
    MOV_L( REGOFF(V4F_COUNT, ESI), ECX )        /* vertices to transform */

    TEST_L( ECX, ECX )
    JZ( LLBL(K_GTP3DNRR_finish) )               /* empty source: done */

    MOV_L( REGOFF(V4F_STRIDE, ESI), EAX )       /* bytes between source vertices */

    MOV_L( ECX, REGOFF(V4F_COUNT, EDI) )        /* dest count = source count */
    MOV_L( CONST(3), REGOFF(V4F_SIZE, EDI) )    /* three components per dest vertex */
    OR_L( CONST(VEC_SIZE_3), REGOFF(V4F_FLAGS, EDI) )

    MOV_L( REGOFF(V4F_START, ESI), ESI )        /* first source vertex */
    MOV_L( REGOFF(V4F_START, EDI), EDI )        /* first dest vertex */

    SHL_L( CONST(4), ECX )                      /* count * 16 bytes */
    ADD_L( EDI, ECX )                           /* ECX = one past the last dest vertex */


ALIGNTEXT32
    /* Loop-invariant matrix elements. */
    MOVSS( M(10), XMM3 )                        /* z scale:  m10 */
    MOVSS( M(14), XMM4 )                        /* z offset: m14 */
    MOVSS( M(5), XMM2 )                         /* m5 in lane 0, other lanes zero */
    MOVSS( M(0), XMM1 )                         /* m0 in lane 0, other lanes zero */
    UNPCKLPS( XMM2, XMM1 )                      /* x/y scale:  0 | 0 | m5  | m0  */
    MOVLPS( M(12), XMM2 )                       /* x/y offset: 0 | 0 | m13 | m12 */

ALIGNTEXT32
LLBL(K_GTP3DNRR_top):

    /* x and y are handled as a pair in the two low lanes. */
    MOVLPS( S(0), XMM0 )                        /* oy | ox */
    MULPS( XMM1, XMM0 )                         /* oy*m5 | ox*m0 */
    ADDPS( XMM2, XMM0 )                         /* ... + m13 | ... + m12 */
    MOVLPS( XMM0, D(0) )                        /* dest x, y */

    /* z is a single scalar multiply-add. */
    MOVSS( S(2), XMM0 )                         /* oz */
    MULSS( XMM3, XMM0 )                         /* oz*m10 */
    ADDSS( XMM4, XMM0 )                         /* ... + m14 */
    MOVSS( XMM0, D(2) )                         /* dest z */

LLBL(K_GTP3DNRR_skip):
    ADD_L( CONST(16), EDI )                     /* dest vertices are 16 bytes apart */
    ADD_L( EAX, ESI )                           /* step source by its stride */
    CMP_L( ECX, EDI )
    JNE( LLBL(K_GTP3DNRR_top) )

LLBL(K_GTP3DNRR_finish):
    POP_L( EDI )
    POP_L( ESI )
    RET
#undef FRAME_OFFSET
230
 
231
 
232
 
233
/*
 * _mesa_sse_transform_points3_perspective
 *
 * Transforms every 3-component source vertex (ox, oy, oz) into a
 * 4-component dest vertex:
 *
 *     dest x = ox*m0  + oz*m8
 *     dest y = oy*m5  + oz*m9
 *     dest z = oz*m10 + m14
 *     dest w = 0 - oz
 *
 * Stack arguments: dest GLvector4f (OFFSET_DEST), matrix (ARG_MATRIX),
 * source GLvector4f (OFFSET_SOURCE).
 *
 * Dest bookkeeping: flags |= VEC_SIZE_4, count = source count, size = 4.
 * A zero source count returns before dest is touched.
 *
 * Every source component is read before the dest float at the same
 * offset is written, so a vertex whose source and dest addresses coincide
 * still yields w = 0 - (original oz).
 *
 * Clobbers EAX, ECX, EDX, XMM0-XMM6 and the flags; ESI and EDI are
 * saved and restored.
 */
ALIGNTEXT4
GLOBL GLNAME(_mesa_sse_transform_points3_perspective)
GLNAME(_mesa_sse_transform_points3_perspective):

/* ESI and EDI are pushed below, so the stack arguments sit 8 bytes further up. */
#define FRAME_OFFSET 8
    PUSH_L( ESI )
    PUSH_L( EDI )

    MOV_L( REGOFF(OFFSET_SOURCE+FRAME_OFFSET, ESP), ESI )   /* source GLvector4f */
    MOV_L( REGOFF(OFFSET_DEST+FRAME_OFFSET, ESP), EDI )     /* dest GLvector4f */

    MOV_L( ARG_MATRIX, EDX )                    /* matrix */
    MOV_L( REGOFF(V4F_COUNT, ESI), ECX )        /* number of source vertices */

    TEST_L( ECX, ECX )
    JZ( LLBL(K_GTP3PR_finish) )                 /* nothing to transform */

    MOV_L( REGOFF(V4F_STRIDE, ESI), EAX )       /* source stride in bytes */
    OR_L( CONST(VEC_SIZE_4), REGOFF(V4F_FLAGS, EDI) )

    MOV_L( ECX, REGOFF(V4F_COUNT, EDI) )        /* dest count = source count */
    MOV_L( CONST(4), REGOFF(V4F_SIZE, EDI) )    /* dest vertices have 4 components */

    SHL_L( CONST(4), ECX )                      /* count * 16 bytes per dest vertex */
    MOV_L( REGOFF(V4F_START, ESI), ESI )        /* first source vertex */

    MOV_L( REGOFF(V4F_START, EDI), EDI )        /* first dest vertex */
    ADD_L( EDI, ECX )                           /* ECX = end of dest data */

ALIGNTEXT32
    /* Loop-invariant matrix elements and a zero register. */
    MOVSS( M(0), XMM1 )                         /* m0 in lane 0, other lanes zero */
    MOVSS( M(5), XMM2 )                         /* m5 in lane 0, other lanes zero */
    UNPCKLPS( XMM2, XMM1 )                      /* 0 | 0 | m5 | m0 */
    MOVLPS( M(8), XMM2 )                        /* 0 | 0 | m9 | m8 */
    MOVSS( M(10), XMM3 )                        /* m10 */
    MOVSS( M(14), XMM4 )                        /* m14 */
    XORPS( XMM6, XMM6 )                         /* all lanes 0 */

ALIGNTEXT32
LLBL(K_GTP3PR_top):
    /* x, y: scale ox/oy, then add the oz contribution. */
    MOVLPS( S(0), XMM0 )                        /* oy | ox */
    MULPS( XMM1, XMM0 )                         /* oy*m5 | ox*m0 */
    MOVSS( S(2), XMM5 )                         /* oz */
    SHUFPS( CONST(0x0), XMM5, XMM5 )            /* oz | oz | oz | oz */
    MULPS( XMM2, XMM5 )                         /* oz*m9 | oz*m8 in the low lanes */
    ADDPS( XMM5, XMM0 )                         /* oy*m5+oz*m9 | ox*m0+oz*m8 */
    MOVLPS( XMM0, D(0) )                        /* dest x, y */

    /*
     * z and w both come from one load of oz made BEFORE D(2) is stored.
     * Re-reading S(2) after that store would pick up the transformed z
     * whenever the source and dest vertex share storage.
     */
    MOVSS( S(2), XMM0 )                         /* oz */
    MOVSS( XMM6, XMM5 )                         /* lane 0 = 0 */
    SUBPS( XMM0, XMM5 )                         /* lane 0 = 0 - oz */
    MULSS( XMM3, XMM0 )                         /* oz*m10 */
    ADDSS( XMM4, XMM0 )                         /* oz*m10 + m14 */
    MOVSS( XMM0, D(2) )                         /* dest z */
    MOVSS( XMM5, D(3) )                         /* dest w */

LLBL(K_GTP3PR_skip):
    ADD_L( CONST(16), EDI )                     /* next dest vertex */
    ADD_L( EAX, ESI )                           /* next source vertex */
    CMP_L( ECX, EDI )
    JNE( LLBL(K_GTP3PR_top) )                   /* loop until dest end is reached */

LLBL(K_GTP3PR_finish):
    POP_L( EDI )
    POP_L( ESI )
    RET
#undef FRAME_OFFSET
302
 
303
 
304
 
305
/*
 * _mesa_sse_transform_points3_2d
 *
 * Applies the 2-D part of the matrix to x and y and copies z unchanged:
 *
 *     dest x = ox*m0 + oy*m4 + m12
 *     dest y = ox*m1 + oy*m5 + m13
 *     dest z = oz
 *
 * The fourth dest float is not written.
 *
 * Stack arguments: dest GLvector4f (OFFSET_DEST), matrix (ARG_MATRIX),
 * source GLvector4f (OFFSET_SOURCE).
 *
 * Dest bookkeeping: flags |= VEC_SIZE_3, count = source count, size = 3.
 * A zero source count returns before dest is touched.
 *
 * Clobbers EAX, ECX, EDX, XMM0-XMM4 and the flags; ESI and EDI are
 * saved and restored.
 */
ALIGNTEXT4
GLOBL GLNAME(_mesa_sse_transform_points3_2d)
GLNAME(_mesa_sse_transform_points3_2d):

/* Two registers are pushed below: stack arguments move up by 8 bytes. */
#define FRAME_OFFSET 8
    PUSH_L( ESI )
    PUSH_L( EDI )

    MOV_L( REGOFF(OFFSET_SOURCE+FRAME_OFFSET, ESP), ESI )   /* source GLvector4f */
    MOV_L( REGOFF(OFFSET_DEST+FRAME_OFFSET, ESP), EDI )     /* dest GLvector4f */

    MOV_L( ARG_MATRIX, EDX )                    /* matrix */
    MOV_L( REGOFF(V4F_COUNT, ESI), ECX )        /* vertices to transform */

    TEST_L( ECX, ECX )
    JZ( LLBL(K_GTP3P2DR_finish) )               /* empty source: done */

    MOV_L( REGOFF(V4F_STRIDE, ESI), EAX )       /* bytes between source vertices */

    MOV_L( ECX, REGOFF(V4F_COUNT, EDI) )        /* dest count = source count */
    MOV_L( CONST(3), REGOFF(V4F_SIZE, EDI) )    /* three components per dest vertex */
    OR_L( CONST(VEC_SIZE_3), REGOFF(V4F_FLAGS, EDI) )

    MOV_L( REGOFF(V4F_START, ESI), ESI )        /* first source vertex */
    MOV_L( REGOFF(V4F_START, EDI), EDI )        /* first dest vertex */

    SHL_L( CONST(4), ECX )                      /* count * 16 bytes */
    ADD_L( EDI, ECX )                           /* ECX = one past the last dest vertex */

ALIGNTEXT32
    /* Loop-invariant matrix pairs; only the two low lanes are meaningful. */
    MOVLPS( M(12), XMM2 )                       /* m13 | m12 */
    MOVLPS( M(4), XMM1 )                        /* m5  | m4  */
    MOVLPS( M(0), XMM0 )                        /* m1  | m0  */

ALIGNTEXT32
LLBL(K_GTP3P2DR_top):
    MOVSS( S(0), XMM3 )                         /* ox */
    MOVSS( S(1), XMM4 )                         /* oy */
    SHUFPS( CONST(0x0), XMM3, XMM3 )            /* ox in every lane */
    SHUFPS( CONST(0x0), XMM4, XMM4 )            /* oy in every lane */
    MULPS( XMM0, XMM3 )                         /* ox*m1 | ox*m0 */
    MULPS( XMM1, XMM4 )                         /* oy*m5 | oy*m4 */

    ADDPS( XMM4, XMM3 )                         /* sum the ox and oy terms */
    ADDPS( XMM2, XMM3 )                         /* ... + m13 | ... + m12 */
    MOVLPS( XMM3, D(0) )                        /* dest x, y */

    MOVSS( S(2), XMM3 )                         /* z passes through untouched */
    MOVSS( XMM3, D(2) )

LLBL(K_GTP3P2DR_skip):
    ADD_L( CONST(16), EDI )                     /* dest vertices are 16 bytes apart */
    ADD_L( EAX, ESI )                           /* step source by its stride */
    CMP_L( ECX, EDI )
    JNE( LLBL(K_GTP3P2DR_top) )

LLBL(K_GTP3P2DR_finish):
    POP_L( EDI )
    POP_L( ESI )
    RET
#undef FRAME_OFFSET
366
 
367
 
368
 
369
/*
 * _mesa_sse_transform_points3_2d_no_rot
 *
 * Scales and offsets x and y independently and copies z unchanged:
 *
 *     dest x = ox*m0 + m12
 *     dest y = oy*m5 + m13
 *     dest z = oz
 *
 * The fourth dest float is not written.
 *
 * Stack arguments: dest GLvector4f (OFFSET_DEST), matrix (ARG_MATRIX),
 * source GLvector4f (OFFSET_SOURCE).
 *
 * Dest bookkeeping: flags |= VEC_SIZE_3, count = source count, size = 3.
 * A zero source count returns before dest is touched.
 *
 * Clobbers EAX, ECX, EDX, XMM0-XMM2 and the flags; ESI and EDI are
 * saved and restored.
 */
ALIGNTEXT4
GLOBL GLNAME(_mesa_sse_transform_points3_2d_no_rot)
GLNAME(_mesa_sse_transform_points3_2d_no_rot):

/* Two registers are pushed below: stack arguments move up by 8 bytes. */
#define FRAME_OFFSET 8
    PUSH_L( ESI )
    PUSH_L( EDI )

    MOV_L( REGOFF(OFFSET_SOURCE+FRAME_OFFSET, ESP), ESI )   /* source GLvector4f */
    MOV_L( REGOFF(OFFSET_DEST+FRAME_OFFSET, ESP), EDI )     /* dest GLvector4f */

    MOV_L( ARG_MATRIX, EDX )                    /* matrix */
    MOV_L( REGOFF(V4F_COUNT, ESI), ECX )        /* vertices to transform */

    TEST_L( ECX, ECX )
    JZ( LLBL(K_GTP3P2DNRR_finish) )             /* empty source: done */

    MOV_L( REGOFF(V4F_STRIDE, ESI), EAX )       /* bytes between source vertices */

    MOV_L( ECX, REGOFF(V4F_COUNT, EDI) )        /* dest count = source count */
    MOV_L( CONST(3), REGOFF(V4F_SIZE, EDI) )    /* three components per dest vertex */
    OR_L( CONST(VEC_SIZE_3), REGOFF(V4F_FLAGS, EDI) )

    MOV_L( REGOFF(V4F_START, ESI), ESI )        /* first source vertex */
    MOV_L( REGOFF(V4F_START, EDI), EDI )        /* first dest vertex */

    SHL_L( CONST(4), ECX )                      /* count * 16 bytes */
    ADD_L( EDI, ECX )                           /* ECX = one past the last dest vertex */

ALIGNTEXT32
    /* Loop-invariant scale and offset pairs. */
    MOVSS( M(5), XMM2 )                         /* m5 in lane 0, other lanes zero */
    MOVSS( M(0), XMM1 )                         /* m0 in lane 0, other lanes zero */
    UNPCKLPS( XMM2, XMM1 )                      /* scale:  0 | 0 | m5  | m0  */
    MOVLPS( M(12), XMM2 )                       /* offset: 0 | 0 | m13 | m12 */

ALIGNTEXT32
LLBL(K_GTP3P2DNRR_top):
    MOVLPS( S(0), XMM0 )                        /* oy | ox */
    MULPS( XMM1, XMM0 )                         /* oy*m5 | ox*m0 */
    ADDPS( XMM2, XMM0 )                         /* ... + m13 | ... + m12 */
    MOVLPS( XMM0, D(0) )                        /* dest x, y */

    MOVSS( S(2), XMM0 )                         /* z passes through untouched */
    MOVSS( XMM0, D(2) )

LLBL(K_GTP3P2DNRR_skip):
    ADD_L( CONST(16), EDI )                     /* dest vertices are 16 bytes apart */
    ADD_L( EAX, ESI )                           /* step source by its stride */
    CMP_L( ECX, EDI )
    JNE( LLBL(K_GTP3P2DNRR_top) )

LLBL(K_GTP3P2DNRR_finish):
    POP_L( EDI )
    POP_L( ESI )
    RET
#undef FRAME_OFFSET
425
 
426
 
427
 
428
 
429
/*
 * _mesa_sse_transform_points3_3d
 *
 * Transforms every 3-component source vertex (ox, oy, oz) and stores the
 * first three result components:
 *
 *     dest[k] = m[k]*ox + m[4+k]*oy + m[8+k]*oz + m[12+k]      (k = 0..2)
 *
 * The fourth lane is computed along with the others but never stored, and
 * the fourth dest float is not written.
 *
 * Stack arguments: dest GLvector4f (OFFSET_DEST), matrix (ARG_MATRIX),
 * source GLvector4f (OFFSET_SOURCE).
 *
 * Dest bookkeeping: flags |= VEC_SIZE_3, count = source count, size = 3.
 * A zero source count returns before dest is touched.
 *
 * The matrix is fetched with MOVAPS and therefore must be 16-byte aligned.
 *
 * Clobbers EAX, ECX, EDX, XMM0-XMM6 and the flags; ESI and EDI are
 * saved and restored.
 */
ALIGNTEXT4
GLOBL GLNAME(_mesa_sse_transform_points3_3d)
GLNAME(_mesa_sse_transform_points3_3d):

/* Two registers are pushed below: stack arguments move up by 8 bytes. */
#define FRAME_OFFSET 8
    PUSH_L( ESI )
    PUSH_L( EDI )

    MOV_L( REGOFF(OFFSET_SOURCE+FRAME_OFFSET, ESP), ESI )   /* source GLvector4f */
    MOV_L( REGOFF(OFFSET_DEST+FRAME_OFFSET, ESP), EDI )     /* dest GLvector4f */

    MOV_L( ARG_MATRIX, EDX )                    /* matrix */
    MOV_L( REGOFF(V4F_COUNT, ESI), ECX )        /* vertices to transform */

    TEST_L( ECX, ECX )
    JZ( LLBL(K_GTP3P3DR_finish) )               /* empty source: done */

    MOV_L( REGOFF(V4F_STRIDE, ESI), EAX )       /* bytes between source vertices */

    MOV_L( ECX, REGOFF(V4F_COUNT, EDI) )        /* dest count = source count */
    MOV_L( CONST(3), REGOFF(V4F_SIZE, EDI) )    /* three components per dest vertex */
    OR_L( CONST(VEC_SIZE_3), REGOFF(V4F_FLAGS, EDI) )

    MOV_L( REGOFF(V4F_START, ESI), ESI )        /* first source vertex */
    MOV_L( REGOFF(V4F_START, EDI), EDI )        /* first dest vertex */

    SHL_L( CONST(4), ECX )                      /* count * 16 bytes */
    ADD_L( EDI, ECX )                           /* ECX = one past the last dest vertex */


ALIGNTEXT32
    /* Matrix elements stay in registers for the whole loop. */
    MOVAPS( M(0), XMM0 )                        /* m3  | m2  | m1  | m0  */
    MOVAPS( M(4), XMM1 )                        /* m7  | m6  | m5  | m4  */
    MOVAPS( M(8), XMM2 )                        /* m11 | m10 | m9  | m8  */
    MOVAPS( M(12), XMM3 )                       /* m15 | m14 | m13 | m12 */

ALIGNTEXT32
LLBL(K_GTP3P3DR_top):
    /* Fetch the three source components ... */
    MOVSS( S(0), XMM4 )                         /* ox */
    MOVSS( S(1), XMM5 )                         /* oy */
    MOVSS( S(2), XMM6 )                         /* oz */

    /* ... replicate each across its register ... */
    SHUFPS( CONST(0x0), XMM4, XMM4 )            /* ox in every lane */
    SHUFPS( CONST(0x0), XMM5, XMM5 )            /* oy in every lane */
    SHUFPS( CONST(0x0), XMM6, XMM6 )            /* oz in every lane */

    /* ... and scale the matching group of matrix elements. */
    MULPS( XMM0, XMM4 )                         /* ox*m2  | ox*m1 | ox*m0 (low lanes) */
    MULPS( XMM1, XMM5 )                         /* oy*m6  | oy*m5 | oy*m4 */
    MULPS( XMM2, XMM6 )                         /* oz*m10 | oz*m9 | oz*m8 */

    ADDPS( XMM5, XMM4 )                         /* ox terms + oy terms */
    ADDPS( XMM6, XMM4 )                         /* ... + oz terms */
    ADDPS( XMM3, XMM4 )                         /* ... + m14 | m13 | m12 */

    MOVLPS( XMM4, D(0) )                        /* dest x, y from lanes 0 and 1 */
    UNPCKHPS( XMM4, XMM4 )                      /* bring lane 2 down to lane 0 */
    MOVSS( XMM4, D(2) )                         /* dest z */

LLBL(K_GTP3P3DR_skip):
    ADD_L( CONST(16), EDI )                     /* dest vertices are 16 bytes apart */
    ADD_L( EAX, ESI )                           /* step source by its stride */
    CMP_L( ECX, EDI )
    JNE( LLBL(K_GTP3P3DR_top) )

LLBL(K_GTP3P3DR_finish):
    POP_L( EDI )
    POP_L( ESI )
    RET
#undef FRAME_OFFSET