WebSVN - shark - Blame - Rev 96 - /shark/trunk/ports/png/pnggccrd.c

Rev	Author	Line No.	Line
96	giacomo	1	/* pnggccrd.c - mixed C/assembler version of utilities to read a PNG file
		2	*
		3	* For Intel x86 CPU (Pentium-MMX or later) and GNU C compiler.
		4	*
		5	* See http://www.intel.com/drg/pentiumII/appnotes/916/916.htm
		6	* and http://www.intel.com/drg/pentiumII/appnotes/923/923.htm
		7	* for Intel's performance analysis of the MMX vs. non-MMX code.
		8	*
		9	* libpng version 1.2.5 - October 3, 2002
		10	* For conditions of distribution and use, see copyright notice in png.h
		11	* Copyright (c) 1998-2002 Glenn Randers-Pehrson
		12	* Copyright (c) 1998, Intel Corporation
		13	*
		14	* Based on MSVC code contributed by Nirav Chhatrapati, Intel Corp., 1998.
		15	* Interface to libpng contributed by Gilles Vollant, 1999.
		16	* GNU C port by Greg Roelofs, 1999-2001.
		17	*
		18	* Lines 2350-4300 converted in place with intel2gas 1.3.1:
		19	*
		20	* intel2gas -mdI pnggccrd.c.partially-msvc -o pnggccrd.c
		21	*
		22	* and then cleaned up by hand. See http://hermes.terminal.at/intel2gas/ .
		23	*
		24	* NOTE: A sufficiently recent version of GNU as (or as.exe under DOS/Windows)
		25	* is required to assemble the newer MMX instructions such as movq.
		26	* For djgpp, see
		27	*
		28	* ftp://ftp.simtel.net/pub/simtelnet/gnu/djgpp/v2gnu/bnu281b.zip
		29	*
		30	* (or a later version in the same directory). For Linux, check your
		31	* distribution's web site(s) or try these links:
		32	*
		33	* http://rufus.w3.org/linux/RPM/binutils.html
		34	* http://www.debian.org/Packages/stable/devel/binutils.html
		35	* ftp://ftp.slackware.com/pub/linux/slackware/slackware/slakware/d1/
		36	* binutils.tgz
		37	*
		38	* For other platforms, see the main GNU site:
		39	*
		40	* ftp://ftp.gnu.org/pub/gnu/binutils/
		41	*
		42	* Version 2.5.2l.15 is definitely too old...
		43	*/
		44
		45	/*
		46	* TEMPORARY PORTING NOTES AND CHANGELOG (mostly by Greg Roelofs)
		47	* =====================================
		48	*
		49	* 19991006:
		50	* - fixed sign error in post-MMX cleanup code (16- & 32-bit cases)
		51	*
		52	* 19991007:
		53	* - additional optimizations (possible or definite):
		54	* x [DONE] write MMX code for 64-bit case (pixel_bytes == 8) [not tested]
		55	* - write MMX code for 48-bit case (pixel_bytes == 6)
		56	* - figure out what's up with 24-bit case (pixel_bytes == 3):
		57	* why subtract 8 from width_mmx in the pass 4/5 case?
		58	* (only width_mmx case) (near line 1606)
		59	* x [DONE] replace pixel_bytes within each block with the true
		60	* constant value (or are compilers smart enough to do that?)
		61	* - rewrite all MMX interlacing code so it's aligned with
		62	* the beginning of the row buffer, not the end. This
		63	* would not only allow one to eliminate half of the memory
		64	* writes for odd passes (that is, pass == odd), it may also
		65	* eliminate some unaligned-data-access exceptions (assuming
		66	* there's a penalty for not aligning 64-bit accesses on
		67	* 64-bit boundaries). The only catch is that the "leftover"
		68	* pixel(s) at the end of the row would have to be saved,
		69	* but there are enough unused MMX registers in every case,
		70	* so this is not a problem. A further benefit is that the
		71	* post-MMX cleanup code (C code) in at least some of the
		72	* cases could be done within the assembler block.
		73	* x [DONE] the "v3 v2 v1 v0 v7 v6 v5 v4" comments are confusing,
		74	* inconsistent, and don't match the MMX Programmer's Reference
		75	* Manual conventions anyway. They should be changed to
		76	* "b7 b6 b5 b4 b3 b2 b1 b0," where b0 indicates the byte that
		77	* was lowest in memory (e.g., corresponding to a left pixel)
		78	* and b7 is the byte that was highest (e.g., a right pixel).
		79	*
		80	* 19991016:
		81	* - Brennan's Guide notwithstanding, gcc under Linux does not
		82	* want globals prefixed by underscores when referencing them--
		83	* i.e., if the variable is const4, then refer to it as const4,
		84	* not _const4. This seems to be a djgpp-specific requirement.
		85	* Also, such variables apparently must be declared outside
		86	* of functions; neither static nor automatic variables work if
		87	* defined within the scope of a single function, but both
		88	* static and truly global (multi-module) variables work fine.
		89	*
		90	* 19991023:
		91	* - fixed png_combine_row() non-MMX replication bug (odd passes only?)
		92	* - switched from string-concatenation-with-macros to cleaner method of
		93	* renaming global variables for djgpp--i.e., always use prefixes in
		94	* inlined assembler code (== strings) and conditionally rename the
		95	* variables, not the other way around. Hence _const4, _mask8_0, etc.
		96	*
		97	* 19991024:
		98	* - fixed mmxsupport()/png_do_read_interlace() first-row bug
		99	* This one was severely weird: even though mmxsupport() doesn't touch
		100	* ebx (where "row" pointer was stored), it nevertheless managed to zero
		101	* the register (even in static/non-fPIC code--see below), which in turn
		102	* caused png_do_read_interlace() to return prematurely on the first row of
		103	* interlaced images (i.e., without expanding the interlaced pixels).
		104	* Inspection of the generated assembly code didn't turn up any clues,
		105	* although it did point at a minor optimization (i.e., get rid of
		106	* mmx_supported_local variable and just use eax). Possibly the CPUID
		107	* instruction is more destructive than it looks? (Not yet checked.)
		108	* - "info gcc" was next to useless, so compared fPIC and non-fPIC assembly
		109	* listings... Apparently register spillage has to do with ebx, since
		110	* it's used to index the global offset table. Commenting it out of the
		111	* input-reg lists in png_combine_row() eliminated compiler barfage, so
		112	* ifdef'd with __PIC__ macro: if defined, use a global for unmask
		113	*
		114	* 19991107:
		115	* - verified CPUID clobberage: 12-char string constant ("GenuineIntel",
		116	* "AuthenticAMD", etc.) placed in ebx:ecx:edx. Still need to polish.
		117	*
		118	* 19991120:
		119	* - made "diff" variable (now "_dif") global to simplify conversion of
		120	* filtering routines (running out of regs, sigh). "diff" is still used
		121	* in interlacing routines, however.
		122	* - fixed up both versions of mmxsupport() (ORIG_THAT_USED_TO_CLOBBER_EBX
		123	* macro determines which is used); original not yet tested.
		124	*
		125	* 20000213:
		126	* - when compiling with gcc, be sure to use -fomit-frame-pointer
		127	*
		128	* 20000319:
		129	* - fixed a register-name typo in png_do_read_interlace(), default (MMX) case,
		130	* pass == 4 or 5, that caused visible corruption of interlaced images
		131	*
		132	* 20000623:
		133	* - Various problems were reported with gcc 2.95.2 in the Cygwin environment,
		134	* many of the form "forbidden register 0 (ax) was spilled for class AREG."
		135	* This is explained at http://gcc.gnu.org/fom_serv/cache/23.html, and
		136	* Chuck Wilson supplied a patch involving dummy output registers. See
		137	* http://sourceforge.net/bugs/?func=detailbug&bug_id=108741&group_id=5624
		138	* for the original (anonymous) SourceForge bug report.
		139	*
		140	* 20000706:
		141	* - Chuck Wilson passed along these remaining gcc 2.95.2 errors:
		142	* pnggccrd.c: In function `png_combine_row':
		143	* pnggccrd.c:525: more than 10 operands in `asm'
		144	* pnggccrd.c:669: more than 10 operands in `asm'
		145	* pnggccrd.c:828: more than 10 operands in `asm'
		146	* pnggccrd.c:994: more than 10 operands in `asm'
		147	* pnggccrd.c:1177: more than 10 operands in `asm'
		148	* They are all the same problem and can be worked around by using the
		149	* global _unmask variable unconditionally, not just in the -fPIC case.
		150	* Reportedly earlier versions of gcc also have the problem with more than
		151	* 10 operands; they just don't report it. Much strangeness ensues, etc.
		152	*
		153	* 20000729:
		154	* - enabled png_read_filter_row_mmx_up() (shortest remaining unconverted
		155	* MMX routine); began converting png_read_filter_row_mmx_sub()
		156	* - to finish remaining sections:
		157	* - clean up indentation and comments
		158	* - preload local variables
		159	* - add output and input regs (order of former determines numerical
		160	* mapping of latter)
		161	* - avoid all usage of ebx (including bx, bh, bl) register [20000823]
		162	* - remove "$" from addressing of Shift and Mask variables [20000823]
		163	*
		164	* 20000731:
		165	* - global union vars causing segfaults in png_read_filter_row_mmx_sub()?
		166	*
		167	* 20000822:
		168	* - ARGH, stupid png_read_filter_row_mmx_sub() segfault only happens with
		169	* shared-library (-fPIC) version! Code works just fine as part of static
		170	* library. Damn damn damn damn damn, should have tested that sooner.
		171	* ebx is getting clobbered again (explicitly this time); need to save it
		172	* on stack or rewrite asm code to avoid using it altogether. Blargh!
		173	*
		174	* 20000823:
		175	* - first section was trickiest; all remaining sections have ebx -> edx now.
		176	* (-fPIC works again.) Also added missing underscores to various Shift*
		177	* and Mask globals and got rid of leading "$" signs.
		178	*
		179	* 20000826:
		180	* - added visual separators to help navigate microscopic printed copies
		181	* (http://pobox.com/~newt/code/gpr-latest.zip, mode 10); started working
		182	* on png_read_filter_row_mmx_avg()
		183	*
		184	* 20000828:
		185	* - finished png_read_filter_row_mmx_avg(): only Paeth left! (930 lines...)
		186	* What the hell, did png_read_filter_row_mmx_paeth(), too. Comments not
		187	* cleaned up/shortened in either routine, but functionality is complete
		188	* and seems to be working fine.
		189	*
		190	* 20000829:
		191	* - ahhh, figured out last(?) bit of gcc/gas asm-fu: if register is listed
		192	* as an input reg (with dummy output variables, etc.), then it cannot
		193	* also appear in the clobber list or gcc 2.95.2 will barf. The solution
		194	* is simple enough...
		195	*
		196	* 20000914:
		197	* - bug in png_read_filter_row_mmx_avg(): 16-bit grayscale not handled
		198	* correctly (but 48-bit RGB just fine)
		199	*
		200	* 20000916:
		201	* - fixed bug in png_read_filter_row_mmx_avg(), bpp == 2 case; three errors:
		202	* - "_ShiftBpp.use = 24;" should have been "_ShiftBpp.use = 16;"
		203	* - "_ShiftRem.use = 40;" should have been "_ShiftRem.use = 48;"
		204	* - "psllq _ShiftRem, %%mm2" should have been "psrlq _ShiftRem, %%mm2"
		205	*
		206	* 20010101:
		207	* - added new png_init_mmx_flags() function (here only because it needs to
		208	* call mmxsupport(), which should probably become global png_mmxsupport());
		209	* modified other MMX routines to run conditionally (png_ptr->asm_flags)
		210	*
		211	* 20010103:
		212	* - renamed mmxsupport() to png_mmx_support(), with auto-set of mmx_supported,
		213	* and made it public; moved png_init_mmx_flags() to png.c as internal func
		214	*
		215	* 20010104:
		216	* - removed dependency on png_read_filter_row_c() (C code already duplicated
		217	* within MMX version of png_read_filter_row()) so no longer necessary to
		218	* compile it into pngrutil.o
		219	*
		220	* 20010310:
		221	* - fixed buffer-overrun bug in png_combine_row() C code (non-MMX)
		222	*
		223	* 20020304:
		224	* - eliminated incorrect use of width_mmx in pixel_bytes == 8 case
		225	*
		226	* STILL TO DO:
		227	* - test png_do_read_interlace() 64-bit case (pixel_bytes == 8)
		228	* - write MMX code for 48-bit case (pixel_bytes == 6)
		229	* - figure out what's up with 24-bit case (pixel_bytes == 3):
		230	* why subtract 8 from width_mmx in the pass 4/5 case?
		231	* (only width_mmx case) (near line 1606)
		232	* - rewrite all MMX interlacing code so it's aligned with beginning
		233	* of the row buffer, not the end (see 19991007 for details)
		234	* x pick one version of mmxsupport() and get rid of the other
		235	* - add error messages to any remaining bogus default cases
		236	* - enable pixel_depth == 8 cases in png_read_filter_row()? (test speed)
		237	* x add support for runtime enable/disable/query of various MMX routines
		238	*/
		239
		240	#define PNG_INTERNAL
		241	#include "png.h"
		242
		243	#if defined(PNG_USE_PNGGCCRD)
		244
		245	int PNGAPI png_mmx_support(void);
		246
		247	#ifdef PNG_USE_LOCAL_ARRAYS /* file-local per-pass tables; comments in png_combine_row() show png.c holding the same values */
		248	static const int FARDATA png_pass_start[7] = {0, 4, 0, 2, 0, 1, 0}; /* indexed by png_ptr->pass: offset (in pixels) of the first pixel copied */
		249	static const int FARDATA png_pass_inc[7] = {8, 8, 4, 4, 2, 2, 1}; /* indexed by png_ptr->pass: distance (in pixels) between successive copies */
		250	static const int FARDATA png_pass_width[7] = {8, 4, 4, 2, 2, 1, 1}; /* indexed by png_ptr->pass: number of pixels copied at each step */
		251	#endif /* PNG_USE_LOCAL_ARRAYS */
		252
		253	#if defined(PNG_ASSEMBLER_CODE_SUPPORTED)
		254	/* The inlined assembler always names these globals with a leading underscore.
		255	* djgpp, Win32, and Cygwin add their own underscores to global variables, so define them without: */
		256	#if defined(__DJGPP__) || defined(WIN32) || defined(__CYGWIN__)
		257	# define _mmx_supported mmx_supported
		258	# define _const4 const4
		259	# define _const6 const6
		260	# define _mask8_0 mask8_0
		261	# define _mask16_1 mask16_1
		262	# define _mask16_0 mask16_0
		263	# define _mask24_2 mask24_2
		264	# define _mask24_1 mask24_1
		265	# define _mask24_0 mask24_0
		266	# define _mask32_3 mask32_3
		267	# define _mask32_2 mask32_2
		268	# define _mask32_1 mask32_1
		269	# define _mask32_0 mask32_0
		270	# define _mask48_5 mask48_5
		271	# define _mask48_4 mask48_4
		272	# define _mask48_3 mask48_3
		273	# define _mask48_2 mask48_2
		274	# define _mask48_1 mask48_1
		275	# define _mask48_0 mask48_0
		276	# define _LBCarryMask LBCarryMask
		277	# define _HBClearMask HBClearMask
		278	# define _ActiveMask ActiveMask
		279	# define _ActiveMask2 ActiveMask2
		280	# define _ActiveMaskEnd ActiveMaskEnd
		281	# define _ShiftBpp ShiftBpp
		282	# define _ShiftRem ShiftRem
		283	#ifdef PNG_THREAD_UNSAFE_OK /* the globals below are only declared in thread-unsafe builds */
		284	# define _unmask unmask
		285	# define _FullLength FullLength
		286	# define _MMXLength MMXLength
		287	# define _dif dif
		288	# define _patemp patemp
		289	# define _pbtemp pbtemp
		290	# define _pctemp pctemp
		291	#endif /* PNG_THREAD_UNSAFE_OK */
		292	#endif /* __DJGPP__ || WIN32 || __CYGWIN__ */
		293
		294
		295	/* These constants are used in the inlined MMX assembly code.
		296	Ignore gcc's "At top level: defined but not used" warnings. */
		297
		298	/* GRR 20000706: originally _unmask was needed only when compiling with -fPIC,
		299	* since that case uses the %ebx register for indexing the Global Offset Table
		300	* and there were no other registers available. But gcc 2.95 and later emit
		301	* "more than 10 operands in `asm'" errors when %ebx is used to preload unmask
		302	* in the non-PIC case, so we'll just use the global unconditionally now.
		303	*/
		304	#ifdef PNG_THREAD_UNSAFE_OK
		305	static int _unmask; /* png_combine_row() stores ~mask here just before each MMX block, which loads it with movd */
		306	#endif /* PNG_THREAD_UNSAFE_OK */
		307
		308	static unsigned long long _mask8_0 = 0x0102040810204080LL; /* 8-bit case: one mask bit per byte, 0x80 in the least-significant byte up to 0x01 in the most-significant */
		309
		310	static unsigned long long _mask16_1 = 0x0101020204040808LL; /* 16-bit case: applied to the second 8 bytes of each 16-byte group; each bit repeated over 2 bytes */
		311	static unsigned long long _mask16_0 = 0x1010202040408080LL; /* 16-bit case: applied to the first 8 bytes of each 16-byte group */
		312
		313	static unsigned long long _mask24_2 = 0x0101010202020404LL; /* 24-bit case: applied to bytes 16..23 of each group */
		314	static unsigned long long _mask24_1 = 0x0408080810101020LL; /* 24-bit case: applied to bytes 8..15 of each group */
		315	static unsigned long long _mask24_0 = 0x2020404040808080LL; /* 24-bit case: applied to bytes 0..7 of each group */
		316
		317	static unsigned long long _mask32_3 = 0x0101010102020202LL; /* _mask32_*: presumably the 32-bit case; its use is not visible in this part of the file */
		318	static unsigned long long _mask32_2 = 0x0404040408080808LL;
		319	static unsigned long long _mask32_1 = 0x1010101020202020LL;
		320	static unsigned long long _mask32_0 = 0x4040404080808080LL;
		321
		322	static unsigned long long _mask48_5 = 0x0101010101010202LL; /* _mask48_*: presumably the 48-bit case; its use is not visible in this part of the file */
		323	static unsigned long long _mask48_4 = 0x0202020204040404LL;
		324	static unsigned long long _mask48_3 = 0x0404080808080808LL;
		325	static unsigned long long _mask48_2 = 0x1010101010102020LL;
		326	static unsigned long long _mask48_1 = 0x2020202040404040LL;
		327	static unsigned long long _mask48_0 = 0x4040808080808080LL;
		328
		329	static unsigned long long _const4 = 0x0000000000FFFFFFLL; /* low three bytes set; consumer not visible in this part of the file */
		330	//static unsigned long long _const5 = 0x000000FFFFFF0000LL; // NOT USED
		331	static unsigned long long _const6 = 0x00000000000000FFLL; /* low byte set; consumer not visible in this part of the file */
		332
		333	// These are used in the row-filter routines and should/would be local
		334	// variables if not for gcc addressing limitations.
		335	// WARNING: Their presence probably defeats the thread safety of libpng.
		336
		337	#ifdef PNG_THREAD_UNSAFE_OK
		338	static png_uint_32 _FullLength;
		339	static png_uint_32 _MMXLength;
		340	static int _dif; // made global on 19991120 to simplify the filtering routines (see changelog at top of file)
		341	static int _patemp; // temp variables for Paeth routine
		342	static int _pbtemp;
		343	static int _pctemp;
		344	#endif /* PNG_THREAD_UNSAFE_OK */
		345
		346	void /* PRIVATE */
		347	png_squelch_warnings(void) /* takes no arguments and returns nothing */
		348	{ /* every statement assigns a file-scope static to itself, so each one is referenced from C code; the comment above the constants notes that gcc otherwise reports them as "defined but not used" */
		349	#ifdef PNG_THREAD_UNSAFE_OK /* these five are only declared in thread-unsafe builds */
		350	_dif = _dif;
		351	_patemp = _patemp;
		352	_pbtemp = _pbtemp;
		353	_pctemp = _pctemp;
		354	_MMXLength = _MMXLength; /* note: _FullLength and _unmask get no self-assignment in this function */
		355	#endif /* PNG_THREAD_UNSAFE_OK */
		356	_const4 = _const4;
		357	_const6 = _const6;
		358	_mask8_0 = _mask8_0;
		359	_mask16_1 = _mask16_1;
		360	_mask16_0 = _mask16_0;
		361	_mask24_2 = _mask24_2;
		362	_mask24_1 = _mask24_1;
		363	_mask24_0 = _mask24_0;
		364	_mask32_3 = _mask32_3;
		365	_mask32_2 = _mask32_2;
		366	_mask32_1 = _mask32_1;
		367	_mask32_0 = _mask32_0;
		368	_mask48_5 = _mask48_5;
		369	_mask48_4 = _mask48_4;
		370	_mask48_3 = _mask48_3;
		371	_mask48_2 = _mask48_2;
		372	_mask48_1 = _mask48_1;
		373	_mask48_0 = _mask48_0;
		374	}
		375	#endif /* PNG_ASSEMBLER_CODE_SUPPORTED */
		376
		377
		378	static int _mmx_supported = 2; /* 2 is the initial value: while it still equals 2, png_combine_row() warns and calls png_mmx_support() (which, per the 20010103 changelog entry, sets this variable) */
		379
		380	/*===========================================================================*/
		381	/* */
		382	/* P N G _ C O M B I N E _ R O W */
		383	/* */
		384	/*===========================================================================*/
		385
		386	#if defined(PNG_HAVE_ASSEMBLER_COMBINE_ROW)
		387
		388	#define BPP2 2 /* scales the per-pass offsets in the 16-bit case of png_combine_row() */
		389	#define BPP3 3 /* bytes per pixel (a.k.a. pixel_bytes) */
		390	#define BPP4 4
		391	#define BPP6 6 /* (defined only to help avoid cut-and-paste errors) */
		392	#define BPP8 8
		393
		394	/* Combines the row recently read in with the previous row.
		395	This routine takes care of alpha and transparency if requested.
		396	This routine also handles the two methods of progressive display
		397	of interlaced images, depending on the mask value.
		398	The mask value describes which pixels are to be combined with
		399	the row. The pattern always repeats every 8 pixels, so just 8
		400	bits are needed. A one indicates the pixel is to be combined; a
		401	zero indicates the pixel is to be skipped. This is in addition
		402	to any alpha or transparency value associated with the pixel.
		403	If you want all pixels to be combined, pass 0xff (255) in mask. */
		404
		405	/* Use this routine for the x86 platform - it uses a faster MMX routine
		406	if the machine supports MMX. */
		407
		408	void /* PRIVATE */
		409	png_combine_row(png_structp png_ptr, png_bytep row, int mask)
		410	{
		411	png_debug(1, "in png_combine_row (pnggccrd.c)\n");
		412
		413	#if defined(PNG_ASSEMBLER_CODE_SUPPORTED)
		414	if (_mmx_supported == 2) {
		415	/* this should have happened in png_init_mmx_flags() already */
		416	png_warning(png_ptr, "asm_flags may not have been initialized");
		417	png_mmx_support();
		418	}
		419	#endif
		420
		421	if (mask == 0xff)
		422	{
		423	png_debug(2,"mask == 0xff: doing single png_memcpy()\n");
		424	png_memcpy(row, png_ptr->row_buf + 1,
		425	(png_size_t)((png_ptr->width * png_ptr->row_info.pixel_depth + 7) >> 3));
		426	}
		427	else /* (png_combine_row() is never called with mask == 0) */
		428	{
		429	switch (png_ptr->row_info.pixel_depth)
		430	{
		431	case 1: /* png_ptr->row_info.pixel_depth */
		432	{
		433	png_bytep sp;
		434	png_bytep dp;
		435	int s_inc, s_start, s_end;
		436	int m;
		437	int shift;
		438	png_uint_32 i;
		439
		440	sp = png_ptr->row_buf + 1;
		441	dp = row;
		442	m = 0x80;
		443	#if defined(PNG_READ_PACKSWAP_SUPPORTED)
		444	if (png_ptr->transformations & PNG_PACKSWAP)
		445	{
		446	s_start = 0;
		447	s_end = 7;
		448	s_inc = 1;
		449	}
		450	else
		451	#endif
		452	{
		453	s_start = 7;
		454	s_end = 0;
		455	s_inc = -1;
		456	}
		457
		458	shift = s_start;
		459
		460	for (i = 0; i < png_ptr->width; i++)
		461	{
		462	if (m & mask)
		463	{
		464	int value;
		465
		466	value = (*sp >> shift) & 0x1;
		467	*dp &= (png_byte)((0x7f7f >> (7 - shift)) & 0xff);
		468	*dp |= (png_byte)(value << shift);
		469	}
		470
		471	if (shift == s_end)
		472	{
		473	shift = s_start;
		474	sp++;
		475	dp++;
		476	}
		477	else
		478	shift += s_inc;
		479
		480	if (m == 1)
		481	m = 0x80;
		482	else
		483	m >>= 1;
		484	}
		485	break;
		486	}
		487
		488	case 2: /* png_ptr->row_info.pixel_depth */
		489	{
		490	png_bytep sp;
		491	png_bytep dp;
		492	int s_start, s_end, s_inc;
		493	int m;
		494	int shift;
		495	png_uint_32 i;
		496	int value;
		497
		498	sp = png_ptr->row_buf + 1;
		499	dp = row;
		500	m = 0x80;
		501	#if defined(PNG_READ_PACKSWAP_SUPPORTED)
		502	if (png_ptr->transformations & PNG_PACKSWAP)
		503	{
		504	s_start = 0;
		505	s_end = 6;
		506	s_inc = 2;
		507	}
		508	else
		509	#endif
		510	{
		511	s_start = 6;
		512	s_end = 0;
		513	s_inc = -2;
		514	}
		515
		516	shift = s_start;
		517
		518	for (i = 0; i < png_ptr->width; i++)
		519	{
		520	if (m & mask)
		521	{
		522	value = (*sp >> shift) & 0x3;
		523	*dp &= (png_byte)((0x3f3f >> (6 - shift)) & 0xff);
		524	*dp |= (png_byte)(value << shift);
		525	}
		526
		527	if (shift == s_end)
		528	{
		529	shift = s_start;
		530	sp++;
		531	dp++;
		532	}
		533	else
		534	shift += s_inc;
		535	if (m == 1)
		536	m = 0x80;
		537	else
		538	m >>= 1;
		539	}
		540	break;
		541	}
		542
		543	case 4: /* png_ptr->row_info.pixel_depth */
		544	{
		545	png_bytep sp;
		546	png_bytep dp;
		547	int s_start, s_end, s_inc;
		548	int m;
		549	int shift;
		550	png_uint_32 i;
		551	int value;
		552
		553	sp = png_ptr->row_buf + 1;
		554	dp = row;
		555	m = 0x80;
		556	#if defined(PNG_READ_PACKSWAP_SUPPORTED)
		557	if (png_ptr->transformations & PNG_PACKSWAP)
		558	{
		559	s_start = 0;
		560	s_end = 4;
		561	s_inc = 4;
		562	}
		563	else
		564	#endif
		565	{
		566	s_start = 4;
		567	s_end = 0;
		568	s_inc = -4;
		569	}
		570	shift = s_start;
		571
		572	for (i = 0; i < png_ptr->width; i++)
		573	{
		574	if (m & mask)
		575	{
		576	value = (*sp >> shift) & 0xf;
		577	*dp &= (png_byte)((0xf0f >> (4 - shift)) & 0xff);
		578	*dp |= (png_byte)(value << shift);
		579	}
		580
		581	if (shift == s_end)
		582	{
		583	shift = s_start;
		584	sp++;
		585	dp++;
		586	}
		587	else
		588	shift += s_inc;
		589	if (m == 1)
		590	m = 0x80;
		591	else
		592	m >>= 1;
		593	}
		594	break;
		595	}
		596
		597	case 8: /* png_ptr->row_info.pixel_depth */
		598	{
		599	png_bytep srcptr;
		600	png_bytep dstptr;
		601
		602	#if defined(PNG_ASSEMBLER_CODE_SUPPORTED) && defined(PNG_THREAD_UNSAFE_OK)
		603	#if !defined(PNG_1_0_X)
		604	if ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_COMBINE_ROW)
		605	/* && _mmx_supported */ )
		606	#else
		607	if (_mmx_supported)
		608	#endif
		609	{
		610	png_uint_32 len;
		611	int diff;
		612	int dummy_value_a; // fix 'forbidden register spilled' error
		613	int dummy_value_d;
		614	int dummy_value_c;
		615	int dummy_value_S;
		616	int dummy_value_D;
		617	_unmask = ~mask; // global variable for -fPIC version
		618	srcptr = png_ptr->row_buf + 1;
		619	dstptr = row;
		620	len = png_ptr->width &~7; // reduce to multiple of 8
		621	diff = (int) (png_ptr->width & 7); // amount lost
		622
		623	__asm__ __volatile__ (
		624	"movd _unmask, %%mm7 \n\t" // load bit pattern
		625	"psubb %%mm6, %%mm6 \n\t" // zero mm6
		626	"punpcklbw %%mm7, %%mm7 \n\t"
		627	"punpcklwd %%mm7, %%mm7 \n\t"
		628	"punpckldq %%mm7, %%mm7 \n\t" // fill reg with 8 masks
		629
		630	"movq _mask8_0, %%mm0 \n\t"
		631	"pand %%mm7, %%mm0 \n\t" // nonzero if keep byte
		632	"pcmpeqb %%mm6, %%mm0 \n\t" // zeros->1s, v versa
		633
		634	// preload "movl len, %%ecx \n\t" // load length of line
		635	// preload "movl srcptr, %%esi \n\t" // load source
		636	// preload "movl dstptr, %%edi \n\t" // load dest
		637
		638	"cmpl $0, %%ecx \n\t" // len == 0 ?
		639	"je mainloop8end \n\t"
		640
		641	"mainloop8: \n\t"
		642	"movq (%%esi), %%mm4 \n\t" // *srcptr
		643	"pand %%mm0, %%mm4 \n\t"
		644	"movq %%mm0, %%mm6 \n\t"
		645	"pandn (%%edi), %%mm6 \n\t" // *dstptr
		646	"por %%mm6, %%mm4 \n\t"
		647	"movq %%mm4, (%%edi) \n\t"
		648	"addl $8, %%esi \n\t" // inc by 8 bytes processed
		649	"addl $8, %%edi \n\t"
		650	"subl $8, %%ecx \n\t" // dec by 8 pixels processed
		651	"ja mainloop8 \n\t"
		652
		653	"mainloop8end: \n\t"
		654	// preload "movl diff, %%ecx \n\t" // (diff is in eax)
		655	"movl %%eax, %%ecx \n\t"
		656	"cmpl $0, %%ecx \n\t"
		657	"jz end8 \n\t"
		658	// preload "movl mask, %%edx \n\t"
		659	"sall $24, %%edx \n\t" // make low byte, high byte
		660
		661	"secondloop8: \n\t"
		662	"sall %%edx \n\t" // move high bit to CF
		663	"jnc skip8 \n\t" // if CF = 0
		664	"movb (%%esi), %%al \n\t"
		665	"movb %%al, (%%edi) \n\t"
		666
		667	"skip8: \n\t"
		668	"incl %%esi \n\t"
		669	"incl %%edi \n\t"
		670	"decl %%ecx \n\t"
		671	"jnz secondloop8 \n\t"
		672
		673	"end8: \n\t"
		674	"EMMS \n\t" // DONE
		675
		676	: "=a" (dummy_value_a), // output regs (dummy)
		677	"=d" (dummy_value_d),
		678	"=c" (dummy_value_c),
		679	"=S" (dummy_value_S),
		680	"=D" (dummy_value_D)
		681
		682	: "3" (srcptr), // esi // input regs
		683	"4" (dstptr), // edi
		684	"0" (diff), // eax
		685	// was (unmask) "b" RESERVED // ebx // Global Offset Table idx
		686	"2" (len), // ecx
		687	"1" (mask) // edx
		688
		689	#if 0 /* MMX regs (%mm0, etc.) not supported by gcc 2.7.2.3 or egcs 1.1 */
		690	: "%mm0", "%mm4", "%mm6", "%mm7" // clobber list
		691	#endif
		692	);
		693	}
		694	else /* mmx _not supported - Use modified C routine */
		695	#endif /* PNG_ASSEMBLER_CODE_SUPPORTED */
		696	{
		697	register png_uint_32 i;
		698	png_uint_32 initial_val = png_pass_start[png_ptr->pass];
		699	/* png.c: png_pass_start[] = {0, 4, 0, 2, 0, 1, 0}; */
		700	register int stride = png_pass_inc[png_ptr->pass];
		701	/* png.c: png_pass_inc[] = {8, 8, 4, 4, 2, 2, 1}; */
		702	register int rep_bytes = png_pass_width[png_ptr->pass];
		703	/* png.c: png_pass_width[] = {8, 4, 4, 2, 2, 1, 1}; */
		704	png_uint_32 len = png_ptr->width &~7; /* reduce to mult. of 8 */
		705	int diff = (int) (png_ptr->width & 7); /* amount lost */
		706	register png_uint_32 final_val = len; /* GRR bugfix */
		707
		708	srcptr = png_ptr->row_buf + 1 + initial_val;
		709	dstptr = row + initial_val;
		710
		711	for (i = initial_val; i < final_val; i += stride)
		712	{
		713	png_memcpy(dstptr, srcptr, rep_bytes);
		714	srcptr += stride;
		715	dstptr += stride;
		716	}
		717	if (diff) /* number of leftover pixels: 3 for pngtest */
		718	{
		719	final_val+=diff /* *BPP1 */ ;
		720	for (; i < final_val; i += stride)
		721	{
		722	if (rep_bytes > (int)(final_val-i))
		723	rep_bytes = (int)(final_val-i);
		724	png_memcpy(dstptr, srcptr, rep_bytes);
		725	srcptr += stride;
		726	dstptr += stride;
		727	}
		728	}
		729
		730	} /* end of else (_mmx_supported) */
		731
		732	break;
		733	} /* end 8 bpp */
		734
		735	case 16: /* png_ptr->row_info.pixel_depth */
		736	{
		737	png_bytep srcptr;
		738	png_bytep dstptr;
		739
		740	#if defined(PNG_ASSEMBLER_CODE_SUPPORTED) && defined(PNG_THREAD_UNSAFE_OK)
		741	#if !defined(PNG_1_0_X)
		742	if ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_COMBINE_ROW)
		743	/* && _mmx_supported */ )
		744	#else
		745	if (_mmx_supported)
		746	#endif
		747	{
		748	png_uint_32 len;
		749	int diff;
		750	int dummy_value_a; // fix 'forbidden register spilled' error
		751	int dummy_value_d;
		752	int dummy_value_c;
		753	int dummy_value_S;
		754	int dummy_value_D;
		755	_unmask = ~mask; // global variable for -fPIC version
		756	srcptr = png_ptr->row_buf + 1;
		757	dstptr = row;
		758	len = png_ptr->width &~7; // reduce to multiple of 8
		759	diff = (int) (png_ptr->width & 7); // amount lost //
		760
		761	__asm__ __volatile__ (
		762	"movd _unmask, %%mm7 \n\t" // load bit pattern
		763	"psubb %%mm6, %%mm6 \n\t" // zero mm6
		764	"punpcklbw %%mm7, %%mm7 \n\t"
		765	"punpcklwd %%mm7, %%mm7 \n\t"
		766	"punpckldq %%mm7, %%mm7 \n\t" // fill reg with 8 masks
		767
		768	"movq _mask16_0, %%mm0 \n\t"
		769	"movq _mask16_1, %%mm1 \n\t"
		770
		771	"pand %%mm7, %%mm0 \n\t"
		772	"pand %%mm7, %%mm1 \n\t"
		773
		774	"pcmpeqb %%mm6, %%mm0 \n\t"
		775	"pcmpeqb %%mm6, %%mm1 \n\t"
		776
		777	// preload "movl len, %%ecx \n\t" // load length of line
		778	// preload "movl srcptr, %%esi \n\t" // load source
		779	// preload "movl dstptr, %%edi \n\t" // load dest
		780
		781	"cmpl $0, %%ecx \n\t"
		782	"jz mainloop16end \n\t"
		783
		784	"mainloop16: \n\t"
		785	"movq (%%esi), %%mm4 \n\t"
		786	"pand %%mm0, %%mm4 \n\t"
		787	"movq %%mm0, %%mm6 \n\t"
		788	"movq (%%edi), %%mm7 \n\t"
		789	"pandn %%mm7, %%mm6 \n\t"
		790	"por %%mm6, %%mm4 \n\t"
		791	"movq %%mm4, (%%edi) \n\t"
		792
		793	"movq 8(%%esi), %%mm5 \n\t"
		794	"pand %%mm1, %%mm5 \n\t"
		795	"movq %%mm1, %%mm7 \n\t"
		796	"movq 8(%%edi), %%mm6 \n\t"
		797	"pandn %%mm6, %%mm7 \n\t"
		798	"por %%mm7, %%mm5 \n\t"
		799	"movq %%mm5, 8(%%edi) \n\t"
		800
		801	"addl $16, %%esi \n\t" // inc by 16 bytes processed
		802	"addl $16, %%edi \n\t"
		803	"subl $8, %%ecx \n\t" // dec by 8 pixels processed
		804	"ja mainloop16 \n\t"
		805
		806	"mainloop16end: \n\t"
		807	// preload "movl diff, %%ecx \n\t" // (diff is in eax)
		808	"movl %%eax, %%ecx \n\t"
		809	"cmpl $0, %%ecx \n\t"
		810	"jz end16 \n\t"
		811	// preload "movl mask, %%edx \n\t"
		812	"sall $24, %%edx \n\t" // make low byte, high byte
		813
		814	"secondloop16: \n\t"
		815	"sall %%edx \n\t" // move high bit to CF
		816	"jnc skip16 \n\t" // if CF = 0
		817	"movw (%%esi), %%ax \n\t"
		818	"movw %%ax, (%%edi) \n\t"
		819
		820	"skip16: \n\t"
		821	"addl $2, %%esi \n\t"
		822	"addl $2, %%edi \n\t"
		823	"decl %%ecx \n\t"
		824	"jnz secondloop16 \n\t"
		825
		826	"end16: \n\t"
		827	"EMMS \n\t" // DONE
		828
		829	: "=a" (dummy_value_a), // output regs (dummy)
		830	"=c" (dummy_value_c),
		831	"=d" (dummy_value_d),
		832	"=S" (dummy_value_S),
		833	"=D" (dummy_value_D)
		834
		835	: "0" (diff), // eax // input regs
		836	// was (unmask) " " RESERVED // ebx // Global Offset Table idx
		837	"1" (len), // ecx
		838	"2" (mask), // edx
		839	"3" (srcptr), // esi
		840	"4" (dstptr) // edi
		841
		842	#if 0 /* MMX regs (%mm0, etc.) not supported by gcc 2.7.2.3 or egcs 1.1 */
		843	: "%mm0", "%mm1", "%mm4" // clobber list
		844	, "%mm5", "%mm6", "%mm7"
		845	#endif
		846	);
		847	}
		848	else /* mmx _not supported - Use modified C routine */
		849	#endif /* PNG_ASSEMBLER_CODE_SUPPORTED */
		850	{
		851	register png_uint_32 i;
		852	png_uint_32 initial_val = BPP2 * png_pass_start[png_ptr->pass];
		853	/* png.c: png_pass_start[] = {0, 4, 0, 2, 0, 1, 0}; */
		854	register int stride = BPP2 * png_pass_inc[png_ptr->pass];
		855	/* png.c: png_pass_inc[] = {8, 8, 4, 4, 2, 2, 1}; */
		856	register int rep_bytes = BPP2 * png_pass_width[png_ptr->pass];
		857	/* png.c: png_pass_width[] = {8, 4, 4, 2, 2, 1, 1}; */
		858	png_uint_32 len = png_ptr->width &~7; /* reduce to mult. of 8 */
		859	int diff = (int) (png_ptr->width & 7); /* amount lost */
		860	register png_uint_32 final_val = BPP2 * len; /* GRR bugfix */
		861
		862	srcptr = png_ptr->row_buf + 1 + initial_val;
		863	dstptr = row + initial_val;
		864
		865	for (i = initial_val; i < final_val; i += stride)
		866	{
		867	png_memcpy(dstptr, srcptr, rep_bytes);
		868	srcptr += stride;
		869	dstptr += stride;
		870	}
		871	if (diff) /* number of leftover pixels: 3 for pngtest */
		872	{
		873	final_val+=diff*BPP2;
		874	for (; i < final_val; i += stride)
		875	{
		876	if (rep_bytes > (int)(final_val-i))
		877	rep_bytes = (int)(final_val-i);
		878	png_memcpy(dstptr, srcptr, rep_bytes);
		879	srcptr += stride;
		880	dstptr += stride;
		881	}
		882	}
		883	} /* end of else (_mmx_supported) */
		884
		885	break;
		886	} /* end 16 bpp */
		887
		888	case 24: /* png_ptr->row_info.pixel_depth */
		889	{
		890	png_bytep srcptr;
		891	png_bytep dstptr;
		892
		893	#if defined(PNG_ASSEMBLER_CODE_SUPPORTED) && defined(PNG_THREAD_UNSAFE_OK)
		894	#if !defined(PNG_1_0_X)
		895	if ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_COMBINE_ROW)
		896	/* && _mmx_supported */ )
		897	#else
		898	if (_mmx_supported)
		899	#endif
		900	{
		901	png_uint_32 len;
		902	int diff;
		903	int dummy_value_a; // fix 'forbidden register spilled' error
		904	int dummy_value_d;
		905	int dummy_value_c;
		906	int dummy_value_S;
		907	int dummy_value_D;
		908	_unmask = ~mask; // global variable for -fPIC version
		909	srcptr = png_ptr->row_buf + 1;
		910	dstptr = row;
		911	len = png_ptr->width &~7; // reduce to multiple of 8
		912	diff = (int) (png_ptr->width & 7); // amount lost //
		913
		914	__asm__ __volatile__ (
		915	"movd _unmask, %%mm7 \n\t" // load bit pattern
		916	"psubb %%mm6, %%mm6 \n\t" // zero mm6
		917	"punpcklbw %%mm7, %%mm7 \n\t"
		918	"punpcklwd %%mm7, %%mm7 \n\t"
		919	"punpckldq %%mm7, %%mm7 \n\t" // fill reg with 8 masks
		920
		921	"movq _mask24_0, %%mm0 \n\t"
		922	"movq _mask24_1, %%mm1 \n\t"
		923	"movq _mask24_2, %%mm2 \n\t"
		924
		925	"pand %%mm7, %%mm0 \n\t"
		926	"pand %%mm7, %%mm1 \n\t"
		927	"pand %%mm7, %%mm2 \n\t"
		928
		929	"pcmpeqb %%mm6, %%mm0 \n\t"
		930	"pcmpeqb %%mm6, %%mm1 \n\t"
		931	"pcmpeqb %%mm6, %%mm2 \n\t"
		932
		933	// preload "movl len, %%ecx \n\t" // load length of line
		934	// preload "movl srcptr, %%esi \n\t" // load source
		935	// preload "movl dstptr, %%edi \n\t" // load dest
		936
		937	"cmpl $0, %%ecx \n\t"
		938	"jz mainloop24end \n\t"
		939
		940	"mainloop24: \n\t"
		941	"movq (%%esi), %%mm4 \n\t"
		942	"pand %%mm0, %%mm4 \n\t"
		943	"movq %%mm0, %%mm6 \n\t"
		944	"movq (%%edi), %%mm7 \n\t"
		945	"pandn %%mm7, %%mm6 \n\t"
		946	"por %%mm6, %%mm4 \n\t"
		947	"movq %%mm4, (%%edi) \n\t"
		948
		949	"movq 8(%%esi), %%mm5 \n\t"
		950	"pand %%mm1, %%mm5 \n\t"
		951	"movq %%mm1, %%mm7 \n\t"
		952	"movq 8(%%edi), %%mm6 \n\t"
		953	"pandn %%mm6, %%mm7 \n\t"
		954	"por %%mm7, %%mm5 \n\t"
		955	"movq %%mm5, 8(%%edi) \n\t"
		956
		957	"movq 16(%%esi), %%mm6 \n\t"
		958	"pand %%mm2, %%mm6 \n\t"
		959	"movq %%mm2, %%mm4 \n\t"
		960	"movq 16(%%edi), %%mm7 \n\t"
		961	"pandn %%mm7, %%mm4 \n\t"
		962	"por %%mm4, %%mm6 \n\t"
		963	"movq %%mm6, 16(%%edi) \n\t"
		964
		965	"addl $24, %%esi \n\t" // inc by 24 bytes processed
		966	"addl $24, %%edi \n\t"
		967	"subl $8, %%ecx \n\t" // dec by 8 pixels processed
		968
		969	"ja mainloop24 \n\t"
		970
		971	"mainloop24end: \n\t"
		972	// preload "movl diff, %%ecx \n\t" // (diff is in eax)
		973	"movl %%eax, %%ecx \n\t"
		974	"cmpl $0, %%ecx \n\t"
		975	"jz end24 \n\t"
		976	// preload "movl mask, %%edx \n\t"
		977	"sall $24, %%edx \n\t" // make low byte, high byte
		978
		979	"secondloop24: \n\t"
		980	"sall %%edx \n\t" // move high bit to CF
		981	"jnc skip24 \n\t" // if CF = 0
		982	"movw (%%esi), %%ax \n\t"
		983	"movw %%ax, (%%edi) \n\t"
		984	"xorl %%eax, %%eax \n\t"
		985	"movb 2(%%esi), %%al \n\t"
		986	"movb %%al, 2(%%edi) \n\t"
		987
		988	"skip24: \n\t"
		989	"addl $3, %%esi \n\t"
		990	"addl $3, %%edi \n\t"
		991	"decl %%ecx \n\t"
		992	"jnz secondloop24 \n\t"
		993
		994	"end24: \n\t"
		995	"EMMS \n\t" // DONE
		996
		997	: "=a" (dummy_value_a), // output regs (dummy)
		998	"=d" (dummy_value_d),
		999	"=c" (dummy_value_c),
		1000	"=S" (dummy_value_S),
		1001	"=D" (dummy_value_D)
		1002
		1003	: "3" (srcptr), // esi // input regs
		1004	"4" (dstptr), // edi
		1005	"0" (diff), // eax
		1006	// was (unmask) "b" RESERVED // ebx // Global Offset Table idx
		1007	"2" (len), // ecx
		1008	"1" (mask) // edx
		1009
		1010	#if 0 /* MMX regs (%mm0, etc.) not supported by gcc 2.7.2.3 or egcs 1.1 */
		1011	: "%mm0", "%mm1", "%mm2" // clobber list
		1012	, "%mm4", "%mm5", "%mm6", "%mm7"
		1013	#endif
		1014	);
		1015	}
		1016	else /* mmx _not supported - Use modified C routine */
		1017	#endif /* PNG_ASSEMBLER_CODE_SUPPORTED */
		1018	{
		1019	register png_uint_32 i;
		1020	png_uint_32 initial_val = BPP3 * png_pass_start[png_ptr->pass];
		1021	/* png.c: png_pass_start[] = {0, 4, 0, 2, 0, 1, 0}; */
		1022	register int stride = BPP3 * png_pass_inc[png_ptr->pass];
		1023	/* png.c: png_pass_inc[] = {8, 8, 4, 4, 2, 2, 1}; */
		1024	register int rep_bytes = BPP3 * png_pass_width[png_ptr->pass];
		1025	/* png.c: png_pass_width[] = {8, 4, 4, 2, 2, 1, 1}; */
		1026	png_uint_32 len = png_ptr->width &~7; /* reduce to mult. of 8 */
		1027	int diff = (int) (png_ptr->width & 7); /* amount lost */
		1028	register png_uint_32 final_val = BPP3 * len; /* GRR bugfix */
		1029
		1030	srcptr = png_ptr->row_buf + 1 + initial_val;
		1031	dstptr = row + initial_val;
		1032
		1033	for (i = initial_val; i < final_val; i += stride)
		1034	{
		1035	png_memcpy(dstptr, srcptr, rep_bytes);
		1036	srcptr += stride;
		1037	dstptr += stride;
		1038	}
		1039	if (diff) /* number of leftover pixels: 3 for pngtest */
		1040	{
		1041	final_val+=diff*BPP3;
		1042	for (; i < final_val; i += stride)
		1043	{
		1044	if (rep_bytes > (int)(final_val-i))
		1045	rep_bytes = (int)(final_val-i);
		1046	png_memcpy(dstptr, srcptr, rep_bytes);
		1047	srcptr += stride;
		1048	dstptr += stride;
		1049	}
		1050	}
		1051	} /* end of else (_mmx_supported) */
		1052
		1053	break;
		1054	} /* end 24 bpp */
		1055
		1056	case 32: /* png_ptr->row_info.pixel_depth */
		1057	{
		1058	png_bytep srcptr;
		1059	png_bytep dstptr;
		1060
		1061	#if defined(PNG_ASSEMBLER_CODE_SUPPORTED) && defined(PNG_THREAD_UNSAFE_OK)
		1062	#if !defined(PNG_1_0_X)
		1063	if ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_COMBINE_ROW)
		1064	/* && _mmx_supported */ )
		1065	#else
		1066	if (_mmx_supported)
		1067	#endif
		1068	{
		1069	png_uint_32 len;
		1070	int diff;
		1071	int dummy_value_a; // fix 'forbidden register spilled' error
		1072	int dummy_value_d;
		1073	int dummy_value_c;
		1074	int dummy_value_S;
		1075	int dummy_value_D;
		1076	_unmask = ~mask; // global variable for -fPIC version
		1077	srcptr = png_ptr->row_buf + 1;
		1078	dstptr = row;
		1079	len = png_ptr->width &~7; // reduce to multiple of 8
		1080	diff = (int) (png_ptr->width & 7); // amount lost //
		1081
		1082	__asm__ __volatile__ (
		1083	"movd _unmask, %%mm7 \n\t" // load bit pattern
		1084	"psubb %%mm6, %%mm6 \n\t" // zero mm6
		1085	"punpcklbw %%mm7, %%mm7 \n\t"
		1086	"punpcklwd %%mm7, %%mm7 \n\t"
		1087	"punpckldq %%mm7, %%mm7 \n\t" // fill reg with 8 masks
		1088
		1089	"movq _mask32_0, %%mm0 \n\t"
		1090	"movq _mask32_1, %%mm1 \n\t"
		1091	"movq _mask32_2, %%mm2 \n\t"
		1092	"movq _mask32_3, %%mm3 \n\t"
		1093
		1094	"pand %%mm7, %%mm0 \n\t"
		1095	"pand %%mm7, %%mm1 \n\t"
		1096	"pand %%mm7, %%mm2 \n\t"
		1097	"pand %%mm7, %%mm3 \n\t"
		1098
		1099	"pcmpeqb %%mm6, %%mm0 \n\t"
		1100	"pcmpeqb %%mm6, %%mm1 \n\t"
		1101	"pcmpeqb %%mm6, %%mm2 \n\t"
		1102	"pcmpeqb %%mm6, %%mm3 \n\t"
		1103
		1104	// preload "movl len, %%ecx \n\t" // load length of line
		1105	// preload "movl srcptr, %%esi \n\t" // load source
		1106	// preload "movl dstptr, %%edi \n\t" // load dest
		1107
		1108	"cmpl $0, %%ecx \n\t" // lcr
		1109	"jz mainloop32end \n\t"
		1110
		1111	"mainloop32: \n\t"
		1112	"movq (%%esi), %%mm4 \n\t"
		1113	"pand %%mm0, %%mm4 \n\t"
		1114	"movq %%mm0, %%mm6 \n\t"
		1115	"movq (%%edi), %%mm7 \n\t"
		1116	"pandn %%mm7, %%mm6 \n\t"
		1117	"por %%mm6, %%mm4 \n\t"
		1118	"movq %%mm4, (%%edi) \n\t"
		1119
		1120	"movq 8(%%esi), %%mm5 \n\t"
		1121	"pand %%mm1, %%mm5 \n\t"
		1122	"movq %%mm1, %%mm7 \n\t"
		1123	"movq 8(%%edi), %%mm6 \n\t"
		1124	"pandn %%mm6, %%mm7 \n\t"
		1125	"por %%mm7, %%mm5 \n\t"
		1126	"movq %%mm5, 8(%%edi) \n\t"
		1127
		1128	"movq 16(%%esi), %%mm6 \n\t"
		1129	"pand %%mm2, %%mm6 \n\t"
		1130	"movq %%mm2, %%mm4 \n\t"
		1131	"movq 16(%%edi), %%mm7 \n\t"
		1132	"pandn %%mm7, %%mm4 \n\t"
		1133	"por %%mm4, %%mm6 \n\t"
		1134	"movq %%mm6, 16(%%edi) \n\t"
		1135
		1136	"movq 24(%%esi), %%mm7 \n\t"
		1137	"pand %%mm3, %%mm7 \n\t"
		1138	"movq %%mm3, %%mm5 \n\t"
		1139	"movq 24(%%edi), %%mm4 \n\t"
		1140	"pandn %%mm4, %%mm5 \n\t"
		1141	"por %%mm5, %%mm7 \n\t"
		1142	"movq %%mm7, 24(%%edi) \n\t"
		1143
		1144	"addl $32, %%esi \n\t" // inc by 32 bytes processed
		1145	"addl $32, %%edi \n\t"
		1146	"subl $8, %%ecx \n\t" // dec by 8 pixels processed
		1147	"ja mainloop32 \n\t"
		1148
		1149	"mainloop32end: \n\t"
		1150	// preload "movl diff, %%ecx \n\t" // (diff is in eax)
		1151	"movl %%eax, %%ecx \n\t"
		1152	"cmpl $0, %%ecx \n\t"
		1153	"jz end32 \n\t"
		1154	// preload "movl mask, %%edx \n\t"
		1155	"sall $24, %%edx \n\t" // low byte => high byte
		1156
		1157	"secondloop32: \n\t"
		1158	"sall %%edx \n\t" // move high bit to CF
		1159	"jnc skip32 \n\t" // if CF = 0
		1160	"movl (%%esi), %%eax \n\t"
		1161	"movl %%eax, (%%edi) \n\t"
		1162
		1163	"skip32: \n\t"
		1164	"addl $4, %%esi \n\t"
		1165	"addl $4, %%edi \n\t"
		1166	"decl %%ecx \n\t"
		1167	"jnz secondloop32 \n\t"
		1168
		1169	"end32: \n\t"
		1170	"EMMS \n\t" // DONE
		1171
		1172	: "=a" (dummy_value_a), // output regs (dummy)
		1173	"=d" (dummy_value_d),
		1174	"=c" (dummy_value_c),
		1175	"=S" (dummy_value_S),
		1176	"=D" (dummy_value_D)
		1177
		1178	: "3" (srcptr), // esi // input regs
		1179	"4" (dstptr), // edi
		1180	"0" (diff), // eax
		1181	// was (unmask) "b" RESERVED // ebx // Global Offset Table idx
		1182	"2" (len), // ecx
		1183	"1" (mask) // edx
		1184
		1185	#if 0 /* MMX regs (%mm0, etc.) not supported by gcc 2.7.2.3 or egcs 1.1 */
		1186	: "%mm0", "%mm1", "%mm2", "%mm3" // clobber list
		1187	, "%mm4", "%mm5", "%mm6", "%mm7"
		1188	#endif
		1189	);
		1190	}
		1191	else /* mmx _not supported - Use modified C routine */
		1192	#endif /* PNG_ASSEMBLER_CODE_SUPPORTED */
		1193	{
		1194	register png_uint_32 i;
		1195	png_uint_32 initial_val = BPP4 * png_pass_start[png_ptr->pass];
		1196	/* png.c: png_pass_start[] = {0, 4, 0, 2, 0, 1, 0}; */
		1197	register int stride = BPP4 * png_pass_inc[png_ptr->pass];
		1198	/* png.c: png_pass_inc[] = {8, 8, 4, 4, 2, 2, 1}; */
		1199	register int rep_bytes = BPP4 * png_pass_width[png_ptr->pass];
		1200	/* png.c: png_pass_width[] = {8, 4, 4, 2, 2, 1, 1}; */
		1201	png_uint_32 len = png_ptr->width &~7; /* reduce to mult. of 8 */
		1202	int diff = (int) (png_ptr->width & 7); /* amount lost */
		1203	register png_uint_32 final_val = BPP4 * len; /* GRR bugfix */
		1204
		1205	srcptr = png_ptr->row_buf + 1 + initial_val;
		1206	dstptr = row + initial_val;
		1207
		1208	for (i = initial_val; i < final_val; i += stride)
		1209	{
		1210	png_memcpy(dstptr, srcptr, rep_bytes);
		1211	srcptr += stride;
		1212	dstptr += stride;
		1213	}
		1214	if (diff) /* number of leftover pixels: 3 for pngtest */
		1215	{
		1216	final_val+=diff*BPP4;
		1217	for (; i < final_val; i += stride)
		1218	{
		1219	if (rep_bytes > (int)(final_val-i))
		1220	rep_bytes = (int)(final_val-i);
		1221	png_memcpy(dstptr, srcptr, rep_bytes);
		1222	srcptr += stride;
		1223	dstptr += stride;
		1224	}
		1225	}
		1226	} /* end of else (_mmx_supported) */
		1227
		1228	break;
		1229	} /* end 32 bpp */
		1230
		1231	case 48: /* png_ptr->row_info.pixel_depth */
		1232	{
		1233	png_bytep srcptr;
		1234	png_bytep dstptr;
		1235
		1236	#if defined(PNG_ASSEMBLER_CODE_SUPPORTED) && defined(PNG_THREAD_UNSAFE_OK)
		1237	#if !defined(PNG_1_0_X)
		1238	if ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_COMBINE_ROW)
		1239	/* && _mmx_supported */ )
		1240	#else
		1241	if (_mmx_supported)
		1242	#endif
		1243	{
		1244	png_uint_32 len;
		1245	int diff;
		1246	int dummy_value_a; // fix 'forbidden register spilled' error
		1247	int dummy_value_d;
		1248	int dummy_value_c;
		1249	int dummy_value_S;
		1250	int dummy_value_D;
		1251	_unmask = ~mask; // global variable for -fPIC version
		1252	srcptr = png_ptr->row_buf + 1;
		1253	dstptr = row;
		1254	len = png_ptr->width &~7; // reduce to multiple of 8
		1255	diff = (int) (png_ptr->width & 7); // amount lost //
		1256
		1257	__asm__ __volatile__ (
		1258	"movd _unmask, %%mm7 \n\t" // load bit pattern
		1259	"psubb %%mm6, %%mm6 \n\t" // zero mm6
		1260	"punpcklbw %%mm7, %%mm7 \n\t"
		1261	"punpcklwd %%mm7, %%mm7 \n\t"
		1262	"punpckldq %%mm7, %%mm7 \n\t" // fill reg with 8 masks
		1263
		1264	"movq _mask48_0, %%mm0 \n\t"
		1265	"movq _mask48_1, %%mm1 \n\t"
		1266	"movq _mask48_2, %%mm2 \n\t"
		1267	"movq _mask48_3, %%mm3 \n\t"
		1268	"movq _mask48_4, %%mm4 \n\t"
		1269	"movq _mask48_5, %%mm5 \n\t"
		1270
		1271	"pand %%mm7, %%mm0 \n\t"
		1272	"pand %%mm7, %%mm1 \n\t"
		1273	"pand %%mm7, %%mm2 \n\t"
		1274	"pand %%mm7, %%mm3 \n\t"
		1275	"pand %%mm7, %%mm4 \n\t"
		1276	"pand %%mm7, %%mm5 \n\t"
		1277
		1278	"pcmpeqb %%mm6, %%mm0 \n\t"
		1279	"pcmpeqb %%mm6, %%mm1 \n\t"
		1280	"pcmpeqb %%mm6, %%mm2 \n\t"
		1281	"pcmpeqb %%mm6, %%mm3 \n\t"
		1282	"pcmpeqb %%mm6, %%mm4 \n\t"
		1283	"pcmpeqb %%mm6, %%mm5 \n\t"
		1284
		1285	// preload "movl len, %%ecx \n\t" // load length of line
		1286	// preload "movl srcptr, %%esi \n\t" // load source
		1287	// preload "movl dstptr, %%edi \n\t" // load dest
		1288
		1289	"cmpl $0, %%ecx \n\t"
		1290	"jz mainloop48end \n\t"
		1291
		1292	"mainloop48: \n\t"
		1293	"movq (%%esi), %%mm7 \n\t"
		1294	"pand %%mm0, %%mm7 \n\t"
		1295	"movq %%mm0, %%mm6 \n\t"
		1296	"pandn (%%edi), %%mm6 \n\t"
		1297	"por %%mm6, %%mm7 \n\t"
		1298	"movq %%mm7, (%%edi) \n\t"
		1299
		1300	"movq 8(%%esi), %%mm6 \n\t"
		1301	"pand %%mm1, %%mm6 \n\t"
		1302	"movq %%mm1, %%mm7 \n\t"
		1303	"pandn 8(%%edi), %%mm7 \n\t"
		1304	"por %%mm7, %%mm6 \n\t"
		1305	"movq %%mm6, 8(%%edi) \n\t"
		1306
		1307	"movq 16(%%esi), %%mm6 \n\t"
		1308	"pand %%mm2, %%mm6 \n\t"
		1309	"movq %%mm2, %%mm7 \n\t"
		1310	"pandn 16(%%edi), %%mm7 \n\t"
		1311	"por %%mm7, %%mm6 \n\t"
		1312	"movq %%mm6, 16(%%edi) \n\t"
		1313
		1314	"movq 24(%%esi), %%mm7 \n\t"
		1315	"pand %%mm3, %%mm7 \n\t"
		1316	"movq %%mm3, %%mm6 \n\t"
		1317	"pandn 24(%%edi), %%mm6 \n\t"
		1318	"por %%mm6, %%mm7 \n\t"
		1319	"movq %%mm7, 24(%%edi) \n\t"
		1320
		1321	"movq 32(%%esi), %%mm6 \n\t"
		1322	"pand %%mm4, %%mm6 \n\t"
		1323	"movq %%mm4, %%mm7 \n\t"
		1324	"pandn 32(%%edi), %%mm7 \n\t"
		1325	"por %%mm7, %%mm6 \n\t"
		1326	"movq %%mm6, 32(%%edi) \n\t"
		1327
		1328	"movq 40(%%esi), %%mm7 \n\t"
		1329	"pand %%mm5, %%mm7 \n\t"
		1330	"movq %%mm5, %%mm6 \n\t"
		1331	"pandn 40(%%edi), %%mm6 \n\t"
		1332	"por %%mm6, %%mm7 \n\t"
		1333	"movq %%mm7, 40(%%edi) \n\t"
		1334
		1335	"addl $48, %%esi \n\t" // inc by 48 bytes processed
		1336	"addl $48, %%edi \n\t"
		1337	"subl $8, %%ecx \n\t" // dec by 8 pixels processed
		1338
		1339	"ja mainloop48 \n\t"
		1340
		1341	"mainloop48end: \n\t"
		1342	// preload "movl diff, %%ecx \n\t" // (diff is in eax)
		1343	"movl %%eax, %%ecx \n\t"
		1344	"cmpl $0, %%ecx \n\t"
		1345	"jz end48 \n\t"
		1346	// preload "movl mask, %%edx \n\t"
		1347	"sall $24, %%edx \n\t" // make low byte, high byte
		1348
		1349	"secondloop48: \n\t"
		1350	"sall %%edx \n\t" // move high bit to CF
		1351	"jnc skip48 \n\t" // if CF = 0
		1352	"movl (%%esi), %%eax \n\t"
		1353	"movl %%eax, (%%edi) \n\t"
		1354
		1355	"skip48: \n\t"
		1356	"addl $4, %%esi \n\t"
		1357	"addl $4, %%edi \n\t"
		1358	"decl %%ecx \n\t"
		1359	"jnz secondloop48 \n\t"
		1360
		1361	"end48: \n\t"
		1362	"EMMS \n\t" // DONE
		1363
		1364	: "=a" (dummy_value_a), // output regs (dummy)
		1365	"=d" (dummy_value_d),
		1366	"=c" (dummy_value_c),
		1367	"=S" (dummy_value_S),
		1368	"=D" (dummy_value_D)
		1369
		1370	: "3" (srcptr), // esi // input regs
		1371	"4" (dstptr), // edi
		1372	"0" (diff), // eax
		1373	// was (unmask) "b" RESERVED // ebx // Global Offset Table idx
		1374	"2" (len), // ecx
		1375	"1" (mask) // edx
		1376
		1377	#if 0 /* MMX regs (%mm0, etc.) not supported by gcc 2.7.2.3 or egcs 1.1 */
		1378	: "%mm0", "%mm1", "%mm2", "%mm3" // clobber list
		1379	, "%mm4", "%mm5", "%mm6", "%mm7"
		1380	#endif
		1381	);
		1382	}
		1383	else /* mmx _not supported - Use modified C routine */
		1384	#endif /* PNG_ASSEMBLER_CODE_SUPPORTED */
		1385	{
		1386	register png_uint_32 i;
		1387	png_uint_32 initial_val = BPP6 * png_pass_start[png_ptr->pass];
		1388	/* png.c: png_pass_start[] = {0, 4, 0, 2, 0, 1, 0}; */
		1389	register int stride = BPP6 * png_pass_inc[png_ptr->pass];
		1390	/* png.c: png_pass_inc[] = {8, 8, 4, 4, 2, 2, 1}; */
		1391	register int rep_bytes = BPP6 * png_pass_width[png_ptr->pass];
		1392	/* png.c: png_pass_width[] = {8, 4, 4, 2, 2, 1, 1}; */
		1393	png_uint_32 len = png_ptr->width &~7; /* reduce to mult. of 8 */
		1394	int diff = (int) (png_ptr->width & 7); /* amount lost */
		1395	register png_uint_32 final_val = BPP6 * len; /* GRR bugfix */
		1396
		1397	srcptr = png_ptr->row_buf + 1 + initial_val;
		1398	dstptr = row + initial_val;
		1399
		1400	for (i = initial_val; i < final_val; i += stride)
		1401	{
		1402	png_memcpy(dstptr, srcptr, rep_bytes);
		1403	srcptr += stride;
		1404	dstptr += stride;
		1405	}
		1406	if (diff) /* number of leftover pixels: 3 for pngtest */
		1407	{
		1408	final_val+=diff*BPP6;
		1409	for (; i < final_val; i += stride)
		1410	{
		1411	if (rep_bytes > (int)(final_val-i))
		1412	rep_bytes = (int)(final_val-i);
		1413	png_memcpy(dstptr, srcptr, rep_bytes);
		1414	srcptr += stride;
		1415	dstptr += stride;
		1416	}
		1417	}
		1418	} /* end of else (_mmx_supported) */
		1419
		1420	break;
		1421	} /* end 48 bpp */
		1422
		1423	case 64: /* png_ptr->row_info.pixel_depth */
		1424	{
		1425	png_bytep srcptr;
		1426	png_bytep dstptr;
		1427	register png_uint_32 i;
		1428	png_uint_32 initial_val = BPP8 * png_pass_start[png_ptr->pass];
		1429	/* png.c: png_pass_start[] = {0, 4, 0, 2, 0, 1, 0}; */
		1430	register int stride = BPP8 * png_pass_inc[png_ptr->pass];
		1431	/* png.c: png_pass_inc[] = {8, 8, 4, 4, 2, 2, 1}; */
		1432	register int rep_bytes = BPP8 * png_pass_width[png_ptr->pass];
		1433	/* png.c: png_pass_width[] = {8, 4, 4, 2, 2, 1, 1}; */
		1434	png_uint_32 len = png_ptr->width &~7; /* reduce to mult. of 8 */
		1435	int diff = (int) (png_ptr->width & 7); /* amount lost */
		1436	register png_uint_32 final_val = BPP8 * len; /* GRR bugfix */
		1437
		1438	srcptr = png_ptr->row_buf + 1 + initial_val;
		1439	dstptr = row + initial_val;
		1440
		1441	for (i = initial_val; i < final_val; i += stride)
		1442	{
		1443	png_memcpy(dstptr, srcptr, rep_bytes);
		1444	srcptr += stride;
		1445	dstptr += stride;
		1446	}
		1447	if (diff) /* number of leftover pixels: 3 for pngtest */
		1448	{
		1449	final_val+=diff*BPP8;
		1450	for (; i < final_val; i += stride)
		1451	{
		1452	if (rep_bytes > (int)(final_val-i))
		1453	rep_bytes = (int)(final_val-i);
		1454	png_memcpy(dstptr, srcptr, rep_bytes);
		1455	srcptr += stride;
		1456	dstptr += stride;
		1457	}
		1458	}
		1459
		1460	break;
		1461	} /* end 64 bpp */
		1462
		1463	default: /* png_ptr->row_info.pixel_depth != 1,2,4,8,16,24,32,48,64 */
		1464	{
		1465	/* this should never happen */
		1466	png_warning(png_ptr, "Invalid row_info.pixel_depth in pnggccrd");
		1467	break;
		1468	}
		1469	} /* end switch (png_ptr->row_info.pixel_depth) */
		1470
		1471	} /* end if (non-trivial mask) */
		1472
		1473	} /* end png_combine_row() */
		1474
		1475	#endif /* PNG_HAVE_ASSEMBLER_COMBINE_ROW */
		1476
		1477
		1478
		1479
		1480	/*===========================================================================*/
		1481	/* */
		1482	/* P N G _ D O _ R E A D _ I N T E R L A C E */
		1483	/* */
		1484	/*===========================================================================*/
		1485
		1486	#if defined(PNG_READ_INTERLACING_SUPPORTED)
		1487	#if defined(PNG_HAVE_ASSEMBLER_READ_INTERLACE)
		1488
		1489	/* png_do_read_interlace() is called after any 16-bit to 8-bit conversion
		1490	* has taken place. [GRR: what other steps come before and/or after?]
		1491	*/
		1492
		1493	void /* PRIVATE */
		1494	png_do_read_interlace(png_structp png_ptr)
		1495	{
		1496	png_row_infop row_info = &(png_ptr->row_info);
		1497	png_bytep row = png_ptr->row_buf + 1;
		1498	int pass = png_ptr->pass;
		1499	#if defined(PNG_READ_PACKSWAP_SUPPORTED)
		1500	png_uint_32 transformations = png_ptr->transformations;
		1501	#endif
		1502
		1503	png_debug(1, "in png_do_read_interlace (pnggccrd.c)\n");
		1504
		1505	#if defined(PNG_ASSEMBLER_CODE_SUPPORTED)
		1506	if (_mmx_supported == 2) {
		1507	#if !defined(PNG_1_0_X)
		1508	/* this should have happened in png_init_mmx_flags() already */
		1509	png_warning(png_ptr, "asm_flags may not have been initialized");
		1510	#endif
		1511	png_mmx_support();
		1512	}
		1513	#endif
		1514
		1515	if (row != NULL && row_info != NULL)
		1516	{
		1517	png_uint_32 final_width;
		1518
		1519	final_width = row_info->width * png_pass_inc[pass];
		1520
		1521	switch (row_info->pixel_depth)
		1522	{
		1523	case 1:
		1524	{
		1525	png_bytep sp, dp;
		1526	int sshift, dshift;
		1527	int s_start, s_end, s_inc;
		1528	png_byte v;
		1529	png_uint_32 i;
		1530	int j;
		1531
		1532	sp = row + (png_size_t)((row_info->width - 1) >> 3);
		1533	dp = row + (png_size_t)((final_width - 1) >> 3);
		1534	#if defined(PNG_READ_PACKSWAP_SUPPORTED)
		1535	if (transformations & PNG_PACKSWAP)
		1536	{
		1537	sshift = (int)((row_info->width + 7) & 7);
		1538	dshift = (int)((final_width + 7) & 7);
		1539	s_start = 7;
		1540	s_end = 0;
		1541	s_inc = -1;
		1542	}
		1543	else
		1544	#endif
		1545	{
		1546	sshift = 7 - (int)((row_info->width + 7) & 7);
		1547	dshift = 7 - (int)((final_width + 7) & 7);
		1548	s_start = 0;
		1549	s_end = 7;
		1550	s_inc = 1;
		1551	}
		1552
		1553	for (i = row_info->width; i; i--)
		1554	{
		1555	v = (png_byte)((*sp >> sshift) & 0x1);
		1556	for (j = 0; j < png_pass_inc[pass]; j++)
		1557	{
		1558	*dp &= (png_byte)((0x7f7f >> (7 - dshift)) & 0xff);
		1559	*dp \|= (png_byte)(v << dshift);
		1560	if (dshift == s_end)
		1561	{
		1562	dshift = s_start;
		1563	dp--;
		1564	}
		1565	else
		1566	dshift += s_inc;
		1567	}
		1568	if (sshift == s_end)
		1569	{
		1570	sshift = s_start;
		1571	sp--;
		1572	}
		1573	else
		1574	sshift += s_inc;
		1575	}
		1576	break;
		1577	}
		1578
		1579	case 2:
		1580	{
		1581	png_bytep sp, dp;
		1582	int sshift, dshift;
		1583	int s_start, s_end, s_inc;
		1584	png_uint_32 i;
		1585
		1586	sp = row + (png_size_t)((row_info->width - 1) >> 2);
		1587	dp = row + (png_size_t)((final_width - 1) >> 2);
		1588	#if defined(PNG_READ_PACKSWAP_SUPPORTED)
		1589	if (transformations & PNG_PACKSWAP)
		1590	{
		1591	sshift = (png_size_t)(((row_info->width + 3) & 3) << 1);
		1592	dshift = (png_size_t)(((final_width + 3) & 3) << 1);
		1593	s_start = 6;
		1594	s_end = 0;
		1595	s_inc = -2;
		1596	}
		1597	else
		1598	#endif
		1599	{
		1600	sshift = (png_size_t)((3 - ((row_info->width + 3) & 3)) << 1);
		1601	dshift = (png_size_t)((3 - ((final_width + 3) & 3)) << 1);
		1602	s_start = 0;
		1603	s_end = 6;
		1604	s_inc = 2;
		1605	}
		1606
		1607	for (i = row_info->width; i; i--)
		1608	{
		1609	png_byte v;
		1610	int j;
		1611
		1612	v = (png_byte)((*sp >> sshift) & 0x3);
		1613	for (j = 0; j < png_pass_inc[pass]; j++)
		1614	{
		1615	*dp &= (png_byte)((0x3f3f >> (6 - dshift)) & 0xff);
		1616	*dp \|= (png_byte)(v << dshift);
		1617	if (dshift == s_end)
		1618	{
		1619	dshift = s_start;
		1620	dp--;
		1621	}
		1622	else
		1623	dshift += s_inc;
		1624	}
		1625	if (sshift == s_end)
		1626	{
		1627	sshift = s_start;
		1628	sp--;
		1629	}
		1630	else
		1631	sshift += s_inc;
		1632	}
		1633	break;
		1634	}
		1635
		1636	case 4:
		1637	{
		1638	png_bytep sp, dp;
		1639	int sshift, dshift;
		1640	int s_start, s_end, s_inc;
		1641	png_uint_32 i;
		1642
		1643	sp = row + (png_size_t)((row_info->width - 1) >> 1);
		1644	dp = row + (png_size_t)((final_width - 1) >> 1);
		1645	#if defined(PNG_READ_PACKSWAP_SUPPORTED)
		1646	if (transformations & PNG_PACKSWAP)
		1647	{
		1648	sshift = (png_size_t)(((row_info->width + 1) & 1) << 2);
		1649	dshift = (png_size_t)(((final_width + 1) & 1) << 2);
		1650	s_start = 4;
		1651	s_end = 0;
		1652	s_inc = -4;
		1653	}
		1654	else
		1655	#endif
		1656	{
		1657	sshift = (png_size_t)((1 - ((row_info->width + 1) & 1)) << 2);
		1658	dshift = (png_size_t)((1 - ((final_width + 1) & 1)) << 2);
		1659	s_start = 0;
		1660	s_end = 4;
		1661	s_inc = 4;
		1662	}
		1663
		1664	for (i = row_info->width; i; i--)
		1665	{
		1666	png_byte v;
		1667	int j;
		1668
		1669	v = (png_byte)((*sp >> sshift) & 0xf);
		1670	for (j = 0; j < png_pass_inc[pass]; j++)
		1671	{
		1672	*dp &= (png_byte)((0xf0f >> (4 - dshift)) & 0xff);
		1673	*dp \|= (png_byte)(v << dshift);
		1674	if (dshift == s_end)
		1675	{
		1676	dshift = s_start;
		1677	dp--;
		1678	}
		1679	else
		1680	dshift += s_inc;
		1681	}
		1682	if (sshift == s_end)
		1683	{
		1684	sshift = s_start;
		1685	sp--;
		1686	}
		1687	else
		1688	sshift += s_inc;
		1689	}
		1690	break;
		1691	}
		1692
		1693	/*====================================================================*/
		1694
		1695	default: /* 8-bit or larger (this is where the routine is modified) */
		1696	{
		1697	#if 0
		1698	// static unsigned long long _const4 = 0x0000000000FFFFFFLL; no good
		1699	// static unsigned long long const4 = 0x0000000000FFFFFFLL; no good
		1700	// unsigned long long _const4 = 0x0000000000FFFFFFLL; no good
		1701	// unsigned long long const4 = 0x0000000000FFFFFFLL; no good
		1702	#endif
		1703	png_bytep sptr, dp;
		1704	png_uint_32 i;
		1705	png_size_t pixel_bytes;
		1706	int width = (int)row_info->width;
		1707
		1708	pixel_bytes = (row_info->pixel_depth >> 3);
		1709
		1710	/* point sptr at the last pixel in the pre-expanded row: */
		1711	sptr = row + (width - 1) * pixel_bytes;
		1712
		1713	/* point dp at the last pixel position in the expanded row: */
		1714	dp = row + (final_width - 1) * pixel_bytes;
		1715
		1716	/* New code by Nirav Chhatrapati - Intel Corporation */
		1717
		1718	#if defined(PNG_ASSEMBLER_CODE_SUPPORTED)
		1719	#if !defined(PNG_1_0_X)
		1720	if ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_INTERLACE)
		1721	/* && _mmx_supported */ )
		1722	#else
		1723	if (_mmx_supported)
		1724	#endif
		1725	{
		1726	//--------------------------------------------------------------
		1727	if (pixel_bytes == 3)
		1728	{
		1729	if (((pass == 0) \|\| (pass == 1)) && width)
		1730	{
		1731	int dummy_value_c; // fix 'forbidden register spilled'
		1732	int dummy_value_S;
		1733	int dummy_value_D;
		1734
		1735	__asm__ __volatile__ (
		1736	"subl $21, %%edi \n\t"
		1737	// (png_pass_inc[pass] - 1)*pixel_bytes
		1738
		1739	".loop3_pass0: \n\t"
		1740	"movd (%%esi), %%mm0 \n\t" // x x x x x 2 1 0
		1741	"pand _const4, %%mm0 \n\t" // z z z z z 2 1 0
		1742	"movq %%mm0, %%mm1 \n\t" // z z z z z 2 1 0
		1743	"psllq $16, %%mm0 \n\t" // z z z 2 1 0 z z
		1744	"movq %%mm0, %%mm2 \n\t" // z z z 2 1 0 z z
		1745	"psllq $24, %%mm0 \n\t" // 2 1 0 z z z z z
		1746	"psrlq $8, %%mm1 \n\t" // z z z z z z 2 1
		1747	"por %%mm2, %%mm0 \n\t" // 2 1 0 2 1 0 z z
		1748	"por %%mm1, %%mm0 \n\t" // 2 1 0 2 1 0 2 1
		1749	"movq %%mm0, %%mm3 \n\t" // 2 1 0 2 1 0 2 1
		1750	"psllq $16, %%mm0 \n\t" // 0 2 1 0 2 1 z z
		1751	"movq %%mm3, %%mm4 \n\t" // 2 1 0 2 1 0 2 1
		1752	"punpckhdq %%mm0, %%mm3 \n\t" // 0 2 1 0 2 1 0 2
		1753	"movq %%mm4, 16(%%edi) \n\t"
		1754	"psrlq $32, %%mm0 \n\t" // z z z z 0 2 1 0
		1755	"movq %%mm3, 8(%%edi) \n\t"
		1756	"punpckldq %%mm4, %%mm0 \n\t" // 1 0 2 1 0 2 1 0
		1757	"subl $3, %%esi \n\t"
		1758	"movq %%mm0, (%%edi) \n\t"
		1759	"subl $24, %%edi \n\t"
		1760	"decl %%ecx \n\t"
		1761	"jnz .loop3_pass0 \n\t"
		1762	"EMMS \n\t" // DONE
		1763
		1764	: "=c" (dummy_value_c), // output regs (dummy)
		1765	"=S" (dummy_value_S),
		1766	"=D" (dummy_value_D)
		1767
		1768	: "1" (sptr), // esi // input regs
		1769	"2" (dp), // edi
		1770	"0" (width) // ecx
		1771	// doesn't work "i" (0x0000000000FFFFFFLL) // %1 (a.k.a. _const4)
		1772
		1773	#if 0 /* %mm0, ..., %mm4 not supported by gcc 2.7.2.3 or egcs 1.1 */
		1774	: "%mm0", "%mm1", "%mm2" // clobber list
		1775	, "%mm3", "%mm4"
		1776	#endif
		1777	);
		1778	}
		1779	else if (((pass == 2) \|\| (pass == 3)) && width)
		1780	{
		1781	int dummy_value_c; // fix 'forbidden register spilled'
		1782	int dummy_value_S;
		1783	int dummy_value_D;
		1784
		1785	__asm__ __volatile__ (
		1786	"subl $9, %%edi \n\t"
		1787	// (png_pass_inc[pass] - 1)*pixel_bytes
		1788
		1789	".loop3_pass2: \n\t"
		1790	"movd (%%esi), %%mm0 \n\t" // x x x x x 2 1 0
		1791	"pand _const4, %%mm0 \n\t" // z z z z z 2 1 0
		1792	"movq %%mm0, %%mm1 \n\t" // z z z z z 2 1 0
		1793	"psllq $16, %%mm0 \n\t" // z z z 2 1 0 z z
		1794	"movq %%mm0, %%mm2 \n\t" // z z z 2 1 0 z z
		1795	"psllq $24, %%mm0 \n\t" // 2 1 0 z z z z z
		1796	"psrlq $8, %%mm1 \n\t" // z z z z z z 2 1
		1797	"por %%mm2, %%mm0 \n\t" // 2 1 0 2 1 0 z z
		1798	"por %%mm1, %%mm0 \n\t" // 2 1 0 2 1 0 2 1
		1799	"movq %%mm0, 4(%%edi) \n\t"
		1800	"psrlq $16, %%mm0 \n\t" // z z 2 1 0 2 1 0
		1801	"subl $3, %%esi \n\t"
		1802	"movd %%mm0, (%%edi) \n\t"
		1803	"subl $12, %%edi \n\t"
		1804	"decl %%ecx \n\t"
		1805	"jnz .loop3_pass2 \n\t"
		1806	"EMMS \n\t" // DONE
		1807
		1808	: "=c" (dummy_value_c), // output regs (dummy)
		1809	"=S" (dummy_value_S),
		1810	"=D" (dummy_value_D)
		1811
		1812	: "1" (sptr), // esi // input regs
		1813	"2" (dp), // edi
		1814	"0" (width) // ecx
		1815
		1816	#if 0 /* %mm0, ..., %mm2 not supported by gcc 2.7.2.3 or egcs 1.1 */
		1817	: "%mm0", "%mm1", "%mm2" // clobber list
		1818	#endif
		1819	);
		1820	}
		1821	else if (width) /* && ((pass == 4) \|\| (pass == 5)) */
		1822	{
		1823	int width_mmx = ((width >> 1) << 1) - 8; // GRR: huh?
		1824	if (width_mmx < 0)
		1825	width_mmx = 0;
		1826	width -= width_mmx; // 8 or 9 pix, 24 or 27 bytes
		1827	if (width_mmx)
		1828	{
		1829	// png_pass_inc[] = {8, 8, 4, 4, 2, 2, 1};
		1830	// sptr points at last pixel in pre-expanded row
		1831	// dp points at last pixel position in expanded row
		1832	int dummy_value_c; // fix 'forbidden register spilled'
		1833	int dummy_value_S;
		1834	int dummy_value_D;
		1835
		1836	__asm__ __volatile__ (
		1837	"subl $3, %%esi \n\t"
		1838	"subl $9, %%edi \n\t"
		1839	// (png_pass_inc[pass] + 1)*pixel_bytes
		1840
		1841	".loop3_pass4: \n\t"
		1842	"movq (%%esi), %%mm0 \n\t" // x x 5 4 3 2 1 0
		1843	"movq %%mm0, %%mm1 \n\t" // x x 5 4 3 2 1 0
		1844	"movq %%mm0, %%mm2 \n\t" // x x 5 4 3 2 1 0
		1845	"psllq $24, %%mm0 \n\t" // 4 3 2 1 0 z z z
		1846	"pand _const4, %%mm1 \n\t" // z z z z z 2 1 0
		1847	"psrlq $24, %%mm2 \n\t" // z z z x x 5 4 3
		1848	"por %%mm1, %%mm0 \n\t" // 4 3 2 1 0 2 1 0
		1849	"movq %%mm2, %%mm3 \n\t" // z z z x x 5 4 3
		1850	"psllq $8, %%mm2 \n\t" // z z x x 5 4 3 z
		1851	"movq %%mm0, (%%edi) \n\t"
		1852	"psrlq $16, %%mm3 \n\t" // z z z z z x x 5
		1853	"pand _const6, %%mm3 \n\t" // z z z z z z z 5
		1854	"por %%mm3, %%mm2 \n\t" // z z x x 5 4 3 5
		1855	"subl $6, %%esi \n\t"
		1856	"movd %%mm2, 8(%%edi) \n\t"
		1857	"subl $12, %%edi \n\t"
		1858	"subl $2, %%ecx \n\t"
		1859	"jnz .loop3_pass4 \n\t"
		1860	"EMMS \n\t" // DONE
		1861
		1862	: "=c" (dummy_value_c), // output regs (dummy)
		1863	"=S" (dummy_value_S),
		1864	"=D" (dummy_value_D)
		1865
		1866	: "1" (sptr), // esi // input regs
		1867	"2" (dp), // edi
		1868	"0" (width_mmx) // ecx
		1869
		1870	#if 0 /* %mm0, ..., %mm3 not supported by gcc 2.7.2.3 or egcs 1.1 */
		1871	: "%mm0", "%mm1" // clobber list
		1872	, "%mm2", "%mm3"
		1873	#endif
		1874	);
		1875	}
		1876
		1877	sptr -= width_mmx*3;
		1878	dp -= width_mmx*6;
		1879	for (i = width; i; i--)
		1880	{
		1881	png_byte v[8];
		1882	int j;
		1883
		1884	png_memcpy(v, sptr, 3);
		1885	for (j = 0; j < png_pass_inc[pass]; j++)
		1886	{
		1887	png_memcpy(dp, v, 3);
		1888	dp -= 3;
		1889	}
		1890	sptr -= 3;
		1891	}
		1892	}
		1893	} /* end of pixel_bytes == 3 */
		1894
		1895	//--------------------------------------------------------------
		1896	else if (pixel_bytes == 1)
		1897	{
		1898	if (((pass == 0) \|\| (pass == 1)) && width)
		1899	{
		1900	int width_mmx = ((width >> 2) << 2);
		1901	width -= width_mmx; // 0-3 pixels => 0-3 bytes
		1902	if (width_mmx)
		1903	{
		1904	int dummy_value_c; // fix 'forbidden register spilled'
		1905	int dummy_value_S;
		1906	int dummy_value_D;
		1907
		1908	__asm__ __volatile__ (
		1909	"subl $3, %%esi \n\t"
		1910	"subl $31, %%edi \n\t"
		1911
		1912	".loop1_pass0: \n\t"
		1913	"movd (%%esi), %%mm0 \n\t" // x x x x 3 2 1 0
		1914	"movq %%mm0, %%mm1 \n\t" // x x x x 3 2 1 0
		1915	"punpcklbw %%mm0, %%mm0 \n\t" // 3 3 2 2 1 1 0 0
		1916	"movq %%mm0, %%mm2 \n\t" // 3 3 2 2 1 1 0 0
		1917	"punpcklwd %%mm0, %%mm0 \n\t" // 1 1 1 1 0 0 0 0
		1918	"movq %%mm0, %%mm3 \n\t" // 1 1 1 1 0 0 0 0
		1919	"punpckldq %%mm0, %%mm0 \n\t" // 0 0 0 0 0 0 0 0
		1920	"punpckhdq %%mm3, %%mm3 \n\t" // 1 1 1 1 1 1 1 1
		1921	"movq %%mm0, (%%edi) \n\t"
		1922	"punpckhwd %%mm2, %%mm2 \n\t" // 3 3 3 3 2 2 2 2
		1923	"movq %%mm3, 8(%%edi) \n\t"
		1924	"movq %%mm2, %%mm4 \n\t" // 3 3 3 3 2 2 2 2
		1925	"punpckldq %%mm2, %%mm2 \n\t" // 2 2 2 2 2 2 2 2
		1926	"punpckhdq %%mm4, %%mm4 \n\t" // 3 3 3 3 3 3 3 3
		1927	"movq %%mm2, 16(%%edi) \n\t"
		1928	"subl $4, %%esi \n\t"
		1929	"movq %%mm4, 24(%%edi) \n\t"
		1930	"subl $32, %%edi \n\t"
		1931	"subl $4, %%ecx \n\t"
		1932	"jnz .loop1_pass0 \n\t"
		1933	"EMMS \n\t" // DONE
		1934
		1935	: "=c" (dummy_value_c), // output regs (dummy)
		1936	"=S" (dummy_value_S),
		1937	"=D" (dummy_value_D)
		1938
		1939	: "1" (sptr), // esi // input regs
		1940	"2" (dp), // edi
		1941	"0" (width_mmx) // ecx
		1942
		1943	#if 0 /* %mm0, ..., %mm4 not supported by gcc 2.7.2.3 or egcs 1.1 */
		1944	: "%mm0", "%mm1", "%mm2" // clobber list
		1945	, "%mm3", "%mm4"
		1946	#endif
		1947	);
		1948	}
		1949
		1950	sptr -= width_mmx;
		1951	dp -= width_mmx*8;
		1952	for (i = width; i; i--)
		1953	{
		1954	int j;
		1955
		1956	/* I simplified this part in version 1.0.4e
		1957	* here and in several other instances where
		1958	* pixel_bytes == 1 -- GR-P
		1959	*
		1960	* Original code:
		1961	*
		1962	* png_byte v[8];
		1963	* png_memcpy(v, sptr, pixel_bytes);
		1964	* for (j = 0; j < png_pass_inc[pass]; j++)
		1965	* {
		1966	* png_memcpy(dp, v, pixel_bytes);
		1967	* dp -= pixel_bytes;
		1968	* }
		1969	* sptr -= pixel_bytes;
		1970	*
		1971	* Replacement code is in the next three lines:
		1972	*/
		1973
		1974	for (j = 0; j < png_pass_inc[pass]; j++)
		1975	{
		1976	dp-- = sptr;
		1977	}
		1978	--sptr;
		1979	}
		1980	}
		1981	else if (((pass == 2) \|\| (pass == 3)) && width)
		1982	{
		1983	int width_mmx = ((width >> 2) << 2);
		1984	width -= width_mmx; // 0-3 pixels => 0-3 bytes
		1985	if (width_mmx)
		1986	{
		1987	int dummy_value_c; // fix 'forbidden register spilled'
		1988	int dummy_value_S;
		1989	int dummy_value_D;
		1990
		1991	__asm__ __volatile__ (
		1992	"subl $3, %%esi \n\t"
		1993	"subl $15, %%edi \n\t"
		1994
		1995	".loop1_pass2: \n\t"
		1996	"movd (%%esi), %%mm0 \n\t" // x x x x 3 2 1 0
		1997	"punpcklbw %%mm0, %%mm0 \n\t" // 3 3 2 2 1 1 0 0
		1998	"movq %%mm0, %%mm1 \n\t" // 3 3 2 2 1 1 0 0
		1999	"punpcklwd %%mm0, %%mm0 \n\t" // 1 1 1 1 0 0 0 0
		2000	"punpckhwd %%mm1, %%mm1 \n\t" // 3 3 3 3 2 2 2 2
		2001	"movq %%mm0, (%%edi) \n\t"
		2002	"subl $4, %%esi \n\t"
		2003	"movq %%mm1, 8(%%edi) \n\t"
		2004	"subl $16, %%edi \n\t"
		2005	"subl $4, %%ecx \n\t"
		2006	"jnz .loop1_pass2 \n\t"
		2007	"EMMS \n\t" // DONE
		2008
		2009	: "=c" (dummy_value_c), // output regs (dummy)
		2010	"=S" (dummy_value_S),
		2011	"=D" (dummy_value_D)
		2012
		2013	: "1" (sptr), // esi // input regs
		2014	"2" (dp), // edi
		2015	"0" (width_mmx) // ecx
		2016
		2017	#if 0 /* %mm0, %mm1 not supported by gcc 2.7.2.3 or egcs 1.1 */
		2018	: "%mm0", "%mm1" // clobber list
		2019	#endif
		2020	);
		2021	}
		2022
		2023	sptr -= width_mmx;
		2024	dp -= width_mmx*4;
		2025	for (i = width; i; i--)
		2026	{
		2027	int j;
		2028
		2029	for (j = 0; j < png_pass_inc[pass]; j++)
		2030	{
		2031	dp-- = sptr;
		2032	}
		2033	--sptr;
		2034	}
		2035	}
		2036	else if (width) /* && ((pass == 4) \|\| (pass == 5)) */
		2037	{
		2038	int width_mmx = ((width >> 3) << 3);
		2039	width -= width_mmx; // 0-3 pixels => 0-3 bytes
		2040	if (width_mmx)
		2041	{
		2042	int dummy_value_c; // fix 'forbidden register spilled'
		2043	int dummy_value_S;
		2044	int dummy_value_D;
		2045
		2046	__asm__ __volatile__ (
		2047	"subl $7, %%esi \n\t"
		2048	"subl $15, %%edi \n\t"
		2049
		2050	".loop1_pass4: \n\t"
		2051	"movq (%%esi), %%mm0 \n\t" // 7 6 5 4 3 2 1 0
		2052	"movq %%mm0, %%mm1 \n\t" // 7 6 5 4 3 2 1 0
		2053	"punpcklbw %%mm0, %%mm0 \n\t" // 3 3 2 2 1 1 0 0
		2054	"punpckhbw %%mm1, %%mm1 \n\t" // 7 7 6 6 5 5 4 4
		2055	"movq %%mm1, 8(%%edi) \n\t"
		2056	"subl $8, %%esi \n\t"
		2057	"movq %%mm0, (%%edi) \n\t"
		2058	"subl $16, %%edi \n\t"
		2059	"subl $8, %%ecx \n\t"
		2060	"jnz .loop1_pass4 \n\t"
		2061	"EMMS \n\t" // DONE
		2062
		2063	: "=c" (dummy_value_c), // output regs (none)
		2064	"=S" (dummy_value_S),
		2065	"=D" (dummy_value_D)
		2066
		2067	: "1" (sptr), // esi // input regs
		2068	"2" (dp), // edi
		2069	"0" (width_mmx) // ecx
		2070
		2071	#if 0 /* %mm0, %mm1 not supported by gcc 2.7.2.3 or egcs 1.1 */
		2072	: "%mm0", "%mm1" // clobber list
		2073	#endif
		2074	);
		2075	}
		2076
		2077	sptr -= width_mmx;
		2078	dp -= width_mmx*2;
		2079	for (i = width; i; i--)
		2080	{
		2081	int j;
		2082
		2083	for (j = 0; j < png_pass_inc[pass]; j++)
		2084	{
		2085	dp-- = sptr;
		2086	}
		2087	--sptr;
		2088	}
		2089	}
		2090	} /* end of pixel_bytes == 1 */
		2091
		2092	//--------------------------------------------------------------
		2093	else if (pixel_bytes == 2)
		2094	{
		2095	if (((pass == 0) \|\| (pass == 1)) && width)
		2096	{
		2097	int width_mmx = ((width >> 1) << 1);
		2098	width -= width_mmx; // 0,1 pixels => 0,2 bytes
		2099	if (width_mmx)
		2100	{
		2101	int dummy_value_c; // fix 'forbidden register spilled'
		2102	int dummy_value_S;
		2103	int dummy_value_D;
		2104
		2105	__asm__ __volatile__ (
		2106	"subl $2, %%esi \n\t"
		2107	"subl $30, %%edi \n\t"
		2108
		2109	".loop2_pass0: \n\t"
		2110	"movd (%%esi), %%mm0 \n\t" // x x x x 3 2 1 0
		2111	"punpcklwd %%mm0, %%mm0 \n\t" // 3 2 3 2 1 0 1 0
		2112	"movq %%mm0, %%mm1 \n\t" // 3 2 3 2 1 0 1 0
		2113	"punpckldq %%mm0, %%mm0 \n\t" // 1 0 1 0 1 0 1 0
		2114	"punpckhdq %%mm1, %%mm1 \n\t" // 3 2 3 2 3 2 3 2
		2115	"movq %%mm0, (%%edi) \n\t"
		2116	"movq %%mm0, 8(%%edi) \n\t"
		2117	"movq %%mm1, 16(%%edi) \n\t"
		2118	"subl $4, %%esi \n\t"
		2119	"movq %%mm1, 24(%%edi) \n\t"
		2120	"subl $32, %%edi \n\t"
		2121	"subl $2, %%ecx \n\t"
		2122	"jnz .loop2_pass0 \n\t"
		2123	"EMMS \n\t" // DONE
		2124
		2125	: "=c" (dummy_value_c), // output regs (dummy)
		2126	"=S" (dummy_value_S),
		2127	"=D" (dummy_value_D)
		2128
		2129	: "1" (sptr), // esi // input regs
		2130	"2" (dp), // edi
		2131	"0" (width_mmx) // ecx
		2132
		2133	#if 0 /* %mm0, %mm1 not supported by gcc 2.7.2.3 or egcs 1.1 */
		2134	: "%mm0", "%mm1" // clobber list
		2135	#endif
		2136	);
		2137	}
		2138
		2139	sptr -= (width_mmx*2 - 2); // sign fixed
		2140	dp -= (width_mmx*16 - 2); // sign fixed
		2141	for (i = width; i; i--)
		2142	{
		2143	png_byte v[8];
		2144	int j;
		2145	sptr -= 2;
		2146	png_memcpy(v, sptr, 2);
		2147	for (j = 0; j < png_pass_inc[pass]; j++)
		2148	{
		2149	dp -= 2;
		2150	png_memcpy(dp, v, 2);
		2151	}
		2152	}
		2153	}
		2154	else if (((pass == 2) \|\| (pass == 3)) && width)
		2155	{
		2156	int width_mmx = ((width >> 1) << 1) ;
		2157	width -= width_mmx; // 0,1 pixels => 0,2 bytes
		2158	if (width_mmx)
		2159	{
		2160	int dummy_value_c; // fix 'forbidden register spilled'
		2161	int dummy_value_S;
		2162	int dummy_value_D;
		2163
		2164	__asm__ __volatile__ (
		2165	"subl $2, %%esi \n\t"
		2166	"subl $14, %%edi \n\t"
		2167
		2168	".loop2_pass2: \n\t"
		2169	"movd (%%esi), %%mm0 \n\t" // x x x x 3 2 1 0
		2170	"punpcklwd %%mm0, %%mm0 \n\t" // 3 2 3 2 1 0 1 0
		2171	"movq %%mm0, %%mm1 \n\t" // 3 2 3 2 1 0 1 0
		2172	"punpckldq %%mm0, %%mm0 \n\t" // 1 0 1 0 1 0 1 0
		2173	"punpckhdq %%mm1, %%mm1 \n\t" // 3 2 3 2 3 2 3 2
		2174	"movq %%mm0, (%%edi) \n\t"
		2175	"subl $4, %%esi \n\t"
		2176	"movq %%mm1, 8(%%edi) \n\t"
		2177	"subl $16, %%edi \n\t"
		2178	"subl $2, %%ecx \n\t"
		2179	"jnz .loop2_pass2 \n\t"
		2180	"EMMS \n\t" // DONE
		2181
		2182	: "=c" (dummy_value_c), // output regs (dummy)
		2183	"=S" (dummy_value_S),
		2184	"=D" (dummy_value_D)
		2185
		2186	: "1" (sptr), // esi // input regs
		2187	"2" (dp), // edi
		2188	"0" (width_mmx) // ecx
		2189
		2190	#if 0 /* %mm0, %mm1 not supported by gcc 2.7.2.3 or egcs 1.1 */
		2191	: "%mm0", "%mm1" // clobber list
		2192	#endif
		2193	);
		2194	}
		2195
		2196	sptr -= (width_mmx*2 - 2); // sign fixed
		2197	dp -= (width_mmx*8 - 2); // sign fixed
		2198	for (i = width; i; i--)
		2199	{
		2200	png_byte v[8];
		2201	int j;
		2202	sptr -= 2;
		2203	png_memcpy(v, sptr, 2);
		2204	for (j = 0; j < png_pass_inc[pass]; j++)
		2205	{
		2206	dp -= 2;
		2207	png_memcpy(dp, v, 2);
		2208	}
		2209	}
		2210	}
		2211	else if (width) // pass == 4 or 5
		2212	{
		2213	int width_mmx = ((width >> 1) << 1) ;
		2214	width -= width_mmx; // 0,1 pixels => 0,2 bytes
		2215	if (width_mmx)
		2216	{
		2217	int dummy_value_c; // fix 'forbidden register spilled'
		2218	int dummy_value_S;
		2219	int dummy_value_D;
		2220
		2221	__asm__ __volatile__ (
		2222	"subl $2, %%esi \n\t"
		2223	"subl $6, %%edi \n\t"
		2224
		2225	".loop2_pass4: \n\t"
		2226	"movd (%%esi), %%mm0 \n\t" // x x x x 3 2 1 0
		2227	"punpcklwd %%mm0, %%mm0 \n\t" // 3 2 3 2 1 0 1 0
		2228	"subl $4, %%esi \n\t"
		2229	"movq %%mm0, (%%edi) \n\t"
		2230	"subl $8, %%edi \n\t"
		2231	"subl $2, %%ecx \n\t"
		2232	"jnz .loop2_pass4 \n\t"
		2233	"EMMS \n\t" // DONE
		2234
		2235	: "=c" (dummy_value_c), // output regs (dummy)
		2236	"=S" (dummy_value_S),
		2237	"=D" (dummy_value_D)
		2238
		2239	: "1" (sptr), // esi // input regs
		2240	"2" (dp), // edi
		2241	"0" (width_mmx) // ecx
		2242
		2243	#if 0 /* %mm0 not supported by gcc 2.7.2.3 or egcs 1.1 */
		2244	: "%mm0" // clobber list
		2245	#endif
		2246	);
		2247	}
		2248
		2249	sptr -= (width_mmx*2 - 2); // sign fixed
		2250	dp -= (width_mmx*4 - 2); // sign fixed
		2251	for (i = width; i; i--)
		2252	{
		2253	png_byte v[8];
		2254	int j;
		2255	sptr -= 2;
		2256	png_memcpy(v, sptr, 2);
		2257	for (j = 0; j < png_pass_inc[pass]; j++)
		2258	{
		2259	dp -= 2;
		2260	png_memcpy(dp, v, 2);
		2261	}
		2262	}
		2263	}
		2264	} /* end of pixel_bytes == 2 */
		2265
		2266	//--------------------------------------------------------------
		2267	else if (pixel_bytes == 4)
		2268	{
		2269	if (((pass == 0) \|\| (pass == 1)) && width)
		2270	{
		2271	int width_mmx = ((width >> 1) << 1);
		2272	width -= width_mmx; // 0,1 pixels => 0,4 bytes
		2273	if (width_mmx)
		2274	{
		2275	int dummy_value_c; // fix 'forbidden register spilled'
		2276	int dummy_value_S;
		2277	int dummy_value_D;
		2278
		2279	__asm__ __volatile__ (
		2280	"subl $4, %%esi \n\t"
		2281	"subl $60, %%edi \n\t"
		2282
		2283	".loop4_pass0: \n\t"
		2284	"movq (%%esi), %%mm0 \n\t" // 7 6 5 4 3 2 1 0
		2285	"movq %%mm0, %%mm1 \n\t" // 7 6 5 4 3 2 1 0
		2286	"punpckldq %%mm0, %%mm0 \n\t" // 3 2 1 0 3 2 1 0
		2287	"punpckhdq %%mm1, %%mm1 \n\t" // 7 6 5 4 7 6 5 4
		2288	"movq %%mm0, (%%edi) \n\t"
		2289	"movq %%mm0, 8(%%edi) \n\t"
		2290	"movq %%mm0, 16(%%edi) \n\t"
		2291	"movq %%mm0, 24(%%edi) \n\t"
		2292	"movq %%mm1, 32(%%edi) \n\t"
		2293	"movq %%mm1, 40(%%edi) \n\t"
		2294	"movq %%mm1, 48(%%edi) \n\t"
		2295	"subl $8, %%esi \n\t"
		2296	"movq %%mm1, 56(%%edi) \n\t"
		2297	"subl $64, %%edi \n\t"
		2298	"subl $2, %%ecx \n\t"
		2299	"jnz .loop4_pass0 \n\t"
		2300	"EMMS \n\t" // DONE
		2301
		2302	: "=c" (dummy_value_c), // output regs (dummy)
		2303	"=S" (dummy_value_S),
		2304	"=D" (dummy_value_D)
		2305
		2306	: "1" (sptr), // esi // input regs
		2307	"2" (dp), // edi
		2308	"0" (width_mmx) // ecx
		2309
		2310	#if 0 /* %mm0, %mm1 not supported by gcc 2.7.2.3 or egcs 1.1 */
		2311	: "%mm0", "%mm1" // clobber list
		2312	#endif
		2313	);
		2314	}
		2315
		2316	sptr -= (width_mmx*4 - 4); // sign fixed
		2317	dp -= (width_mmx*32 - 4); // sign fixed
		2318	for (i = width; i; i--)
		2319	{
		2320	png_byte v[8];
		2321	int j;
		2322	sptr -= 4;
		2323	png_memcpy(v, sptr, 4);
		2324	for (j = 0; j < png_pass_inc[pass]; j++)
		2325	{
		2326	dp -= 4;
		2327	png_memcpy(dp, v, 4);
		2328	}
		2329	}
		2330	}
		2331	else if (((pass == 2) \|\| (pass == 3)) && width)
		2332	{
		2333	int width_mmx = ((width >> 1) << 1);
		2334	width -= width_mmx; // 0,1 pixels => 0,4 bytes
		2335	if (width_mmx)
		2336	{
		2337	int dummy_value_c; // fix 'forbidden register spilled'
		2338	int dummy_value_S;
		2339	int dummy_value_D;
		2340
		2341	__asm__ __volatile__ (
		2342	"subl $4, %%esi \n\t"
		2343	"subl $28, %%edi \n\t"
		2344
		2345	".loop4_pass2: \n\t"
		2346	"movq (%%esi), %%mm0 \n\t" // 7 6 5 4 3 2 1 0
		2347	"movq %%mm0, %%mm1 \n\t" // 7 6 5 4 3 2 1 0
		2348	"punpckldq %%mm0, %%mm0 \n\t" // 3 2 1 0 3 2 1 0
		2349	"punpckhdq %%mm1, %%mm1 \n\t" // 7 6 5 4 7 6 5 4
		2350	"movq %%mm0, (%%edi) \n\t"
		2351	"movq %%mm0, 8(%%edi) \n\t"
		2352	"movq %%mm1, 16(%%edi) \n\t"
		2353	"movq %%mm1, 24(%%edi) \n\t"
		2354	"subl $8, %%esi \n\t"
		2355	"subl $32, %%edi \n\t"
		2356	"subl $2, %%ecx \n\t"
		2357	"jnz .loop4_pass2 \n\t"
		2358	"EMMS \n\t" // DONE
		2359
		2360	: "=c" (dummy_value_c), // output regs (dummy)
		2361	"=S" (dummy_value_S),
		2362	"=D" (dummy_value_D)
		2363
		2364	: "1" (sptr), // esi // input regs
		2365	"2" (dp), // edi
		2366	"0" (width_mmx) // ecx
		2367
		2368	#if 0 /* %mm0, %mm1 not supported by gcc 2.7.2.3 or egcs 1.1 */
		2369	: "%mm0", "%mm1" // clobber list
		2370	#endif
		2371	);
		2372	}
		2373
		2374	sptr -= (width_mmx*4 - 4); // sign fixed
		2375	dp -= (width_mmx*16 - 4); // sign fixed
		2376	for (i = width; i; i--)
		2377	{
		2378	png_byte v[8];
		2379	int j;
		2380	sptr -= 4;
		2381	png_memcpy(v, sptr, 4);
		2382	for (j = 0; j < png_pass_inc[pass]; j++)
		2383	{
		2384	dp -= 4;
		2385	png_memcpy(dp, v, 4);
		2386	}
		2387	}
		2388	}
		2389	else if (width) // pass == 4 or 5
		2390	{
		2391	int width_mmx = ((width >> 1) << 1) ;
		2392	width -= width_mmx; // 0,1 pixels => 0,4 bytes
		2393	if (width_mmx)
		2394	{
		2395	int dummy_value_c; // fix 'forbidden register spilled'
		2396	int dummy_value_S;
		2397	int dummy_value_D;
		2398
		2399	__asm__ __volatile__ (
		2400	"subl $4, %%esi \n\t"
		2401	"subl $12, %%edi \n\t"
		2402
		2403	".loop4_pass4: \n\t"
		2404	"movq (%%esi), %%mm0 \n\t" // 7 6 5 4 3 2 1 0
		2405	"movq %%mm0, %%mm1 \n\t" // 7 6 5 4 3 2 1 0
		2406	"punpckldq %%mm0, %%mm0 \n\t" // 3 2 1 0 3 2 1 0
		2407	"punpckhdq %%mm1, %%mm1 \n\t" // 7 6 5 4 7 6 5 4
		2408	"movq %%mm0, (%%edi) \n\t"
		2409	"subl $8, %%esi \n\t"
		2410	"movq %%mm1, 8(%%edi) \n\t"
		2411	"subl $16, %%edi \n\t"
		2412	"subl $2, %%ecx \n\t"
		2413	"jnz .loop4_pass4 \n\t"
		2414	"EMMS \n\t" // DONE
		2415
		2416	: "=c" (dummy_value_c), // output regs (dummy)
		2417	"=S" (dummy_value_S),
		2418	"=D" (dummy_value_D)
		2419
		2420	: "1" (sptr), // esi // input regs
		2421	"2" (dp), // edi
		2422	"0" (width_mmx) // ecx
		2423
		2424	#if 0 /* %mm0, %mm1 not supported by gcc 2.7.2.3 or egcs 1.1 */
		2425	: "%mm0", "%mm1" // clobber list
		2426	#endif
		2427	);
		2428	}
		2429
		2430	sptr -= (width_mmx*4 - 4); // sign fixed
		2431	dp -= (width_mmx*8 - 4); // sign fixed
		2432	for (i = width; i; i--)
		2433	{
		2434	png_byte v[8];
		2435	int j;
		2436	sptr -= 4;
		2437	png_memcpy(v, sptr, 4);
		2438	for (j = 0; j < png_pass_inc[pass]; j++)
		2439	{
		2440	dp -= 4;
		2441	png_memcpy(dp, v, 4);
		2442	}
		2443	}
		2444	}
		2445	} /* end of pixel_bytes == 4 */
		2446
		2447	//--------------------------------------------------------------
		2448	else if (pixel_bytes == 8)
		2449	{
		2450	// GRR TEST: should work, but needs testing (special 64-bit version of rpng2?)
		2451	// GRR NOTE: no need to combine passes here!
		2452	if (((pass == 0) \|\| (pass == 1)) && width)
		2453	{
		2454	int dummy_value_c; // fix 'forbidden register spilled'
		2455	int dummy_value_S;
		2456	int dummy_value_D;
		2457
		2458	// source is 8-byte RRGGBBAA
		2459	// dest is 64-byte RRGGBBAA RRGGBBAA RRGGBBAA RRGGBBAA ...
		2460	__asm__ __volatile__ (
		2461	"subl $56, %%edi \n\t" // start of last block
		2462
		2463	".loop8_pass0: \n\t"
		2464	"movq (%%esi), %%mm0 \n\t" // 7 6 5 4 3 2 1 0
		2465	"movq %%mm0, (%%edi) \n\t"
		2466	"movq %%mm0, 8(%%edi) \n\t"
		2467	"movq %%mm0, 16(%%edi) \n\t"
		2468	"movq %%mm0, 24(%%edi) \n\t"
		2469	"movq %%mm0, 32(%%edi) \n\t"
		2470	"movq %%mm0, 40(%%edi) \n\t"
		2471	"movq %%mm0, 48(%%edi) \n\t"
		2472	"subl $8, %%esi \n\t"
		2473	"movq %%mm0, 56(%%edi) \n\t"
		2474	"subl $64, %%edi \n\t"
		2475	"decl %%ecx \n\t"
		2476	"jnz .loop8_pass0 \n\t"
		2477	"EMMS \n\t" // DONE
		2478
		2479	: "=c" (dummy_value_c), // output regs (dummy)
		2480	"=S" (dummy_value_S),
		2481	"=D" (dummy_value_D)
		2482
		2483	: "1" (sptr), // esi // input regs
		2484	"2" (dp), // edi
		2485	"0" (width) // ecx
		2486
		2487	#if 0 /* %mm0 not supported by gcc 2.7.2.3 or egcs 1.1 */
		2488	: "%mm0" // clobber list
		2489	#endif
		2490	);
		2491	}
		2492	else if (((pass == 2) \|\| (pass == 3)) && width)
		2493	{
		2494	// source is 8-byte RRGGBBAA
		2495	// dest is 32-byte RRGGBBAA RRGGBBAA RRGGBBAA RRGGBBAA
		2496	// (recall that expansion is _in place_: sptr and dp
		2497	// both point at locations within same row buffer)
		2498	{
		2499	int dummy_value_c; // fix 'forbidden register spilled'
		2500	int dummy_value_S;
		2501	int dummy_value_D;
		2502
		2503	__asm__ __volatile__ (
		2504	"subl $24, %%edi \n\t" // start of last block
		2505
		2506	".loop8_pass2: \n\t"
		2507	"movq (%%esi), %%mm0 \n\t" // 7 6 5 4 3 2 1 0
		2508	"movq %%mm0, (%%edi) \n\t"
		2509	"movq %%mm0, 8(%%edi) \n\t"
		2510	"movq %%mm0, 16(%%edi) \n\t"
		2511	"subl $8, %%esi \n\t"
		2512	"movq %%mm0, 24(%%edi) \n\t"
		2513	"subl $32, %%edi \n\t"
		2514	"decl %%ecx \n\t"
		2515	"jnz .loop8_pass2 \n\t"
		2516	"EMMS \n\t" // DONE
		2517
		2518	: "=c" (dummy_value_c), // output regs (dummy)
		2519	"=S" (dummy_value_S),
		2520	"=D" (dummy_value_D)
		2521
		2522	: "1" (sptr), // esi // input regs
		2523	"2" (dp), // edi
		2524	"0" (width) // ecx
		2525
		2526	#if 0 /* %mm0 not supported by gcc 2.7.2.3 or egcs 1.1 */
		2527	: "%mm0" // clobber list
		2528	#endif
		2529	);
		2530	}
		2531	}
		2532	else if (width) // pass == 4 or 5
		2533	{
		2534	// source is 8-byte RRGGBBAA
		2535	// dest is 16-byte RRGGBBAA RRGGBBAA
		2536	{
		2537	int dummy_value_c; // fix 'forbidden register spilled'
		2538	int dummy_value_S;
		2539	int dummy_value_D;
		2540
		2541	__asm__ __volatile__ (
		2542	"subl $8, %%edi \n\t" // start of last block
		2543
		2544	".loop8_pass4: \n\t"
		2545	"movq (%%esi), %%mm0 \n\t" // 7 6 5 4 3 2 1 0
		2546	"movq %%mm0, (%%edi) \n\t"
		2547	"subl $8, %%esi \n\t"
		2548	"movq %%mm0, 8(%%edi) \n\t"
		2549	"subl $16, %%edi \n\t"
		2550	"decl %%ecx \n\t"
		2551	"jnz .loop8_pass4 \n\t"
		2552	"EMMS \n\t" // DONE
		2553
		2554	: "=c" (dummy_value_c), // output regs (dummy)
		2555	"=S" (dummy_value_S),
		2556	"=D" (dummy_value_D)
		2557
		2558	: "1" (sptr), // esi // input regs
		2559	"2" (dp), // edi
		2560	"0" (width) // ecx
		2561
		2562	#if 0 /* %mm0 not supported by gcc 2.7.2.3 or egcs 1.1 */
		2563	: "%mm0" // clobber list
		2564	#endif
		2565	);
		2566	}
		2567	}
		2568
		2569	} /* end of pixel_bytes == 8 */
		2570
		2571	//--------------------------------------------------------------
		2572	else if (pixel_bytes == 6)
		2573	{
		2574	for (i = width; i; i--)
		2575	{
		2576	png_byte v[8];
		2577	int j;
		2578	png_memcpy(v, sptr, 6);
		2579	for (j = 0; j < png_pass_inc[pass]; j++)
		2580	{
		2581	png_memcpy(dp, v, 6);
		2582	dp -= 6;
		2583	}
		2584	sptr -= 6;
		2585	}
		2586	} /* end of pixel_bytes == 6 */
		2587
		2588	//--------------------------------------------------------------
		2589	else
		2590	{
		2591	for (i = width; i; i--)
		2592	{
		2593	png_byte v[8];
		2594	int j;
		2595	png_memcpy(v, sptr, pixel_bytes);
		2596	for (j = 0; j < png_pass_inc[pass]; j++)
		2597	{
		2598	png_memcpy(dp, v, pixel_bytes);
		2599	dp -= pixel_bytes;
		2600	}
		2601	sptr-= pixel_bytes;
		2602	}
		2603	}
		2604	} // end of _mmx_supported ========================================
		2605
		2606	else /* MMX not supported: use modified C code - takes advantage
		2607	* of inlining of png_memcpy for a constant */
		2608	/* GRR 19991007: does it? or should pixel_bytes in each
		2609	* block be replaced with immediate value (e.g., 1)? */
		2610	/* GRR 19991017: replaced with constants in each case */
		2611	#endif /* PNG_ASSEMBLER_CODE_SUPPORTED */
		2612	{
		2613	if (pixel_bytes == 1)
		2614	{
		2615	for (i = width; i; i--)
		2616	{
		2617	int j;
		2618	for (j = 0; j < png_pass_inc[pass]; j++)
		2619	{
		2620	dp-- = sptr;
		2621	}
		2622	--sptr;
		2623	}
		2624	}
		2625	else if (pixel_bytes == 3)
		2626	{
		2627	for (i = width; i; i--)
		2628	{
		2629	png_byte v[8];
		2630	int j;
		2631	png_memcpy(v, sptr, 3);
		2632	for (j = 0; j < png_pass_inc[pass]; j++)
		2633	{
		2634	png_memcpy(dp, v, 3);
		2635	dp -= 3;
		2636	}
		2637	sptr -= 3;
		2638	}
		2639	}
		2640	else if (pixel_bytes == 2)
		2641	{
		2642	for (i = width; i; i--)
		2643	{
		2644	png_byte v[8];
		2645	int j;
		2646	png_memcpy(v, sptr, 2);
		2647	for (j = 0; j < png_pass_inc[pass]; j++)
		2648	{
		2649	png_memcpy(dp, v, 2);
		2650	dp -= 2;
		2651	}
		2652	sptr -= 2;
		2653	}
		2654	}
		2655	else if (pixel_bytes == 4)
		2656	{
		2657	for (i = width; i; i--)
		2658	{
		2659	png_byte v[8];
		2660	int j;
		2661	png_memcpy(v, sptr, 4);
		2662	for (j = 0; j < png_pass_inc[pass]; j++)
		2663	{
		2664	#ifdef PNG_DEBUG
		2665	if (dp < row \|\| dp+3 > row+png_ptr->row_buf_size)
		2666	{
		2667	printf("dp out of bounds: row=%d, dp=%d, rp=%d\n",
		2668	row, dp, row+png_ptr->row_buf_size);
		2669	printf("row_buf=%d\n",png_ptr->row_buf_size);
		2670	}
		2671	#endif
		2672	png_memcpy(dp, v, 4);
		2673	dp -= 4;
		2674	}
		2675	sptr -= 4;
		2676	}
		2677	}
		2678	else if (pixel_bytes == 6)
		2679	{
		2680	for (i = width; i; i--)
		2681	{
		2682	png_byte v[8];
		2683	int j;
		2684	png_memcpy(v, sptr, 6);
		2685	for (j = 0; j < png_pass_inc[pass]; j++)
		2686	{
		2687	png_memcpy(dp, v, 6);
		2688	dp -= 6;
		2689	}
		2690	sptr -= 6;
		2691	}
		2692	}
		2693	else if (pixel_bytes == 8)
		2694	{
		2695	for (i = width; i; i--)
		2696	{
		2697	png_byte v[8];
		2698	int j;
		2699	png_memcpy(v, sptr, 8);
		2700	for (j = 0; j < png_pass_inc[pass]; j++)
		2701	{
		2702	png_memcpy(dp, v, 8);
		2703	dp -= 8;
		2704	}
		2705	sptr -= 8;
		2706	}
		2707	}
		2708	else /* GRR: should never be reached */
		2709	{
		2710	for (i = width; i; i--)
		2711	{
		2712	png_byte v[8];
		2713	int j;
		2714	png_memcpy(v, sptr, pixel_bytes);
		2715	for (j = 0; j < png_pass_inc[pass]; j++)
		2716	{
		2717	png_memcpy(dp, v, pixel_bytes);
		2718	dp -= pixel_bytes;
		2719	}
		2720	sptr -= pixel_bytes;
		2721	}
		2722	}
		2723
		2724	} /* end if (MMX not supported) */
		2725	break;
		2726	}
		2727	} /* end switch (row_info->pixel_depth) */
		2728
		2729	row_info->width = final_width;
		2730	row_info->rowbytes = ((final_width *
		2731	(png_uint_32)row_info->pixel_depth + 7) >> 3);
		2732	}
		2733
		2734	} /* end png_do_read_interlace() */
		2735
		2736	#endif /* PNG_HAVE_ASSEMBLER_READ_INTERLACE */
		2737	#endif /* PNG_READ_INTERLACING_SUPPORTED */
		2738
		2739
		2740
		2741	#if defined(PNG_HAVE_ASSEMBLER_READ_FILTER_ROW)
		2742	#if defined(PNG_ASSEMBLER_CODE_SUPPORTED)
		2743
		2744	// These variables are utilized in the functions below. They are declared
		2745	// globally here to ensure alignment on 8-byte boundaries.
		2746
		2747	union uAll { // 64-bit cell holding a mask or shift count that the MMX asm below reads directly by symbol name
		2748	long long use; // the 64-bit payload: assigned from C as .use, fetched by the asm (e.g. movq _ActiveMask, %mm7)
		2749	double align; // never accessed in the visible code; presumably present only to request 8-byte alignment -- TODO confirm for the target ABI
		2750	} _LBCarryMask = {0x0101010101010101LL}, // bit 0 of every byte; ANDed with the Prior(x) bytes to pick out each byte's lsb
		2751	_HBClearMask = {0x7f7f7f7f7f7f7f7fLL}, // bits 0-6 of every byte; ANDed after a 64-bit psrlq $1 to clear bit 7 of each byte
		2752	_ActiveMask, _ActiveMask2, _ActiveMaskEnd, _ShiftBpp, _ShiftRem; // no initializers; in the visible bpp cases _ActiveMask/_ShiftBpp/_ShiftRem have .use assigned before the asm runs (_ActiveMask2/_ActiveMaskEnd are not referenced in the visible part)
		2753
		2754	#ifdef PNG_THREAD_UNSAFE_OK
		2755	//===========================================================================//
		2756	// //
		2757	// P N G _ R E A D _ F I L T E R _ R O W _ M M X _ A V G //
		2758	// //
		2759	//===========================================================================//
		2760
		2761	// Optimized code for PNG Average filter decoder
		2762
		2763	static void /* PRIVATE */
		2764	png_read_filter_row_mmx_avg(png_row_infop row_info, png_bytep row,
		2765	png_bytep prev_row)
		2766	{
		2767	int bpp;
		2768	int dummy_value_c; // fix 'forbidden register 2 (cx) was spilled' error
		2769	int dummy_value_S;
		2770	int dummy_value_D;
		2771
		2772	bpp = (row_info->pixel_depth + 7) >> 3; // get # bytes per pixel
		2773	_FullLength = row_info->rowbytes; // # of bytes to filter
		2774
		2775	__asm__ __volatile__ (
		2776	// initialize address pointers and offset
		2777	#ifdef __PIC__
		2778	"pushl %%ebx \n\t" // save index to Global Offset Table
		2779	#endif
		2780	//pre "movl row, %%edi \n\t" // edi: Avg(x)
		2781	"xorl %%ebx, %%ebx \n\t" // ebx: x
		2782	"movl %%edi, %%edx \n\t"
		2783	//pre "movl prev_row, %%esi \n\t" // esi: Prior(x)
		2784	//pre "subl bpp, %%edx \n\t" // (bpp is preloaded into ecx)
		2785	"subl %%ecx, %%edx \n\t" // edx: Raw(x-bpp)
		2786
		2787	"xorl %%eax,%%eax \n\t"
		2788
		2789	// Compute the Raw value for the first bpp bytes
		2790	// Raw(x) = Avg(x) + (Prior(x)/2)
		2791	"avg_rlp: \n\t"
		2792	"movb (%%esi,%%ebx,),%%al \n\t" // load al with Prior(x)
		2793	"incl %%ebx \n\t"
		2794	"shrb %%al \n\t" // divide by 2
		2795	"addb -1(%%edi,%%ebx,),%%al \n\t" // add Avg(x); -1 to offset inc ebx
		2796	//pre "cmpl bpp, %%ebx \n\t" // (bpp is preloaded into ecx)
		2797	"cmpl %%ecx, %%ebx \n\t"
		2798	"movb %%al,-1(%%edi,%%ebx,) \n\t" // write Raw(x); -1 to offset inc ebx
		2799	"jb avg_rlp \n\t" // mov does not affect flags
		2800
		2801	// get # of bytes to alignment
		2802	"movl %%edi, _dif \n\t" // take start of row
		2803	"addl %%ebx, _dif \n\t" // add bpp
		2804	"addl $0xf, _dif \n\t" // add 7+8 to incr past alignment bdry
		2805	"andl $0xfffffff8, _dif \n\t" // mask to alignment boundary
		2806	"subl %%edi, _dif \n\t" // subtract from start => value ebx at
		2807	"jz avg_go \n\t" // alignment
		2808
		2809	// fix alignment
		2810	// Compute the Raw value for the bytes up to the alignment boundary
		2811	// Raw(x) = Avg(x) + ((Raw(x-bpp) + Prior(x))/2)
		2812	"xorl %%ecx, %%ecx \n\t"
		2813
		2814	"avg_lp1: \n\t"
		2815	"xorl %%eax, %%eax \n\t"
		2816	"movb (%%esi,%%ebx,), %%cl \n\t" // load cl with Prior(x)
		2817	"movb (%%edx,%%ebx,), %%al \n\t" // load al with Raw(x-bpp)
		2818	"addw %%cx, %%ax \n\t"
		2819	"incl %%ebx \n\t"
		2820	"shrw %%ax \n\t" // divide by 2
		2821	"addb -1(%%edi,%%ebx,), %%al \n\t" // add Avg(x); -1 to offset inc ebx
		2822	"cmpl _dif, %%ebx \n\t" // check if at alignment boundary
		2823	"movb %%al, -1(%%edi,%%ebx,) \n\t" // write Raw(x); -1 to offset inc ebx
		2824	"jb avg_lp1 \n\t" // repeat until at alignment boundary
		2825
		2826	"avg_go: \n\t"
		2827	"movl _FullLength, %%eax \n\t"
		2828	"movl %%eax, %%ecx \n\t"
		2829	"subl %%ebx, %%eax \n\t" // subtract alignment fix
		2830	"andl $0x00000007, %%eax \n\t" // calc bytes over mult of 8
		2831	"subl %%eax, %%ecx \n\t" // drop over bytes from original length
		2832	"movl %%ecx, _MMXLength \n\t"
		2833	#ifdef __PIC__
		2834	"popl %%ebx \n\t" // restore index to Global Offset Table
		2835	#endif
		2836
		2837	: "=c" (dummy_value_c), // output regs (dummy)
		2838	"=S" (dummy_value_S),
		2839	"=D" (dummy_value_D)
		2840
		2841	: "0" (bpp), // ecx // input regs
		2842	"1" (prev_row), // esi
		2843	"2" (row) // edi
		2844
		2845	: "%eax", "%edx" // clobber list
		2846	#ifndef __PIC__
		2847	, "%ebx"
		2848	#endif
		2849	// GRR: INCLUDE "memory" as clobbered? (_dif, _MMXLength)
		2850	// (seems to work fine without...)
		2851	);
		2852
		2853	// now do the math for the rest of the row
		2854	switch (bpp)
		2855	{
		2856	case 3:
		2857	{
		2858	_ActiveMask.use = 0x0000000000ffffffLL;
		2859	_ShiftBpp.use = 24; // == 3 * 8
		2860	_ShiftRem.use = 40; // == 64 - 24
		2861
		2862	__asm__ __volatile__ (
		2863	// re-init address pointers and offset
		2864	"movq _ActiveMask, %%mm7 \n\t"
		2865	"movl _dif, %%ecx \n\t" // ecx: x = offset to
		2866	"movq _LBCarryMask, %%mm5 \n\t" // alignment boundary
		2867	// preload "movl row, %%edi \n\t" // edi: Avg(x)
		2868	"movq _HBClearMask, %%mm4 \n\t"
		2869	// preload "movl prev_row, %%esi \n\t" // esi: Prior(x)
		2870
		2871	// prime the pump: load the first Raw(x-bpp) data set
		2872	"movq -8(%%edi,%%ecx,), %%mm2 \n\t" // load previous aligned 8 bytes
		2873	// (correct pos. in loop below)
		2874	"avg_3lp: \n\t"
		2875	"movq (%%edi,%%ecx,), %%mm0 \n\t" // load mm0 with Avg(x)
		2876	"movq %%mm5, %%mm3 \n\t"
		2877	"psrlq _ShiftRem, %%mm2 \n\t" // correct position Raw(x-bpp)
		2878	// data
		2879	"movq (%%esi,%%ecx,), %%mm1 \n\t" // load mm1 with Prior(x)
		2880	"movq %%mm7, %%mm6 \n\t"
		2881	"pand %%mm1, %%mm3 \n\t" // get lsb for each prev_row byte
		2882	"psrlq $1, %%mm1 \n\t" // divide prev_row bytes by 2
		2883	"pand %%mm4, %%mm1 \n\t" // clear invalid bit 7 of each
		2884	// byte
		2885	"paddb %%mm1, %%mm0 \n\t" // add (Prev_row/2) to Avg for
		2886	// each byte
		2887	// add 1st active group (Raw(x-bpp)/2) to average with LBCarry
		2888	"movq %%mm3, %%mm1 \n\t" // now use mm1 for getting
		2889	// LBCarrys
		2890	"pand %%mm2, %%mm1 \n\t" // get LBCarrys for each byte
		2891	// where both
		2892	// lsb's were == 1 (only valid for active group)
		2893	"psrlq $1, %%mm2 \n\t" // divide raw bytes by 2
		2894	"pand %%mm4, %%mm2 \n\t" // clear invalid bit 7 of each
		2895	// byte
		2896	"paddb %%mm1, %%mm2 \n\t" // add LBCarrys to (Raw(x-bpp)/2)
		2897	// for each byte
		2898	"pand %%mm6, %%mm2 \n\t" // leave only Active Group 1
		2899	// bytes to add to Avg
		2900	"paddb %%mm2, %%mm0 \n\t" // add (Raw/2) + LBCarrys to
		2901	// Avg for each Active
		2902	// byte
		2903	// add 2nd active group (Raw(x-bpp)/2) to average with _LBCarry
		2904	"psllq _ShiftBpp, %%mm6 \n\t" // shift the mm6 mask to cover
		2905	// bytes 3-5
		2906	"movq %%mm0, %%mm2 \n\t" // mov updated Raws to mm2
		2907	"psllq _ShiftBpp, %%mm2 \n\t" // shift data to pos. correctly
		2908	"movq %%mm3, %%mm1 \n\t" // now use mm1 for getting
		2909	// LBCarrys
		2910	"pand %%mm2, %%mm1 \n\t" // get LBCarrys for each byte
		2911	// where both
		2912	// lsb's were == 1 (only valid for active group)
		2913	"psrlq $1, %%mm2 \n\t" // divide raw bytes by 2
		2914	"pand %%mm4, %%mm2 \n\t" // clear invalid bit 7 of each
		2915	// byte
		2916	"paddb %%mm1, %%mm2 \n\t" // add LBCarrys to (Raw(x-bpp)/2)
		2917	// for each byte
		2918	"pand %%mm6, %%mm2 \n\t" // leave only Active Group 2
		2919	// bytes to add to Avg
		2920	"paddb %%mm2, %%mm0 \n\t" // add (Raw/2) + LBCarrys to
		2921	// Avg for each Active
		2922	// byte
		2923
		2924	// add 3rd active group (Raw(x-bpp)/2) to average with _LBCarry
		2925	"psllq _ShiftBpp, %%mm6 \n\t" // shift mm6 mask to cover last
		2926	// two
		2927	// bytes
		2928	"movq %%mm0, %%mm2 \n\t" // mov updated Raws to mm2
		2929	"psllq _ShiftBpp, %%mm2 \n\t" // shift data to pos. correctly
		2930	// Data only needs to be shifted once here to
		2931	// get the correct x-bpp offset.
		2932	"movq %%mm3, %%mm1 \n\t" // now use mm1 for getting
		2933	// LBCarrys
		2934	"pand %%mm2, %%mm1 \n\t" // get LBCarrys for each byte
		2935	// where both
		2936	// lsb's were == 1 (only valid for active group)
		2937	"psrlq $1, %%mm2 \n\t" // divide raw bytes by 2
		2938	"pand %%mm4, %%mm2 \n\t" // clear invalid bit 7 of each
		2939	// byte
		2940	"paddb %%mm1, %%mm2 \n\t" // add LBCarrys to (Raw(x-bpp)/2)
		2941	// for each byte
		2942	"pand %%mm6, %%mm2 \n\t" // leave only Active Group 2
		2943	// bytes to add to Avg
		2944	"addl $8, %%ecx \n\t"
		2945	"paddb %%mm2, %%mm0 \n\t" // add (Raw/2) + LBCarrys to
		2946	// Avg for each Active
		2947	// byte
		2948	// now ready to write back to memory
		2949	"movq %%mm0, -8(%%edi,%%ecx,) \n\t"
		2950	// move updated Raw(x) to use as Raw(x-bpp) for next loop
		2951	"cmpl _MMXLength, %%ecx \n\t"
		2952	"movq %%mm0, %%mm2 \n\t" // mov updated Raw(x) to mm2
		2953	"jb avg_3lp \n\t"
		2954
		2955	: "=S" (dummy_value_S), // output regs (dummy)
		2956	"=D" (dummy_value_D)
		2957
		2958	: "0" (prev_row), // esi // input regs
		2959	"1" (row) // edi
		2960
		2961	: "%ecx" // clobber list
		2962	#if 0 /* %mm0, ..., %mm7 not supported by gcc 2.7.2.3 or egcs 1.1 */
		2963	, "%mm0", "%mm1", "%mm2", "%mm3"
		2964	, "%mm4", "%mm5", "%mm6", "%mm7"
		2965	#endif
		2966	);
		2967	}
		2968	break; // end 3 bpp
		2969
		2970	case 6:
		2971	case 4:
		2972	//case 7: // who wrote this? PNG doesn't support 5 or 7 bytes/pixel
		2973	//case 5: // GRR BOGUS
		2974	{
		2975	_ActiveMask.use = 0xffffffffffffffffLL; // use shift below to clear
		2976	// appropriate inactive bytes
		2977	_ShiftBpp.use = bpp << 3;
		2978	_ShiftRem.use = 64 - _ShiftBpp.use;
		2979
		2980	__asm__ __volatile__ (
		2981	"movq _HBClearMask, %%mm4 \n\t"
		2982
		2983	// re-init address pointers and offset
		2984	"movl _dif, %%ecx \n\t" // ecx: x = offset to
		2985	// alignment boundary
		2986
		2987	// load _ActiveMask and clear all bytes except for 1st active group
		2988	"movq _ActiveMask, %%mm7 \n\t"
		2989	// preload "movl row, %%edi \n\t" // edi: Avg(x)
		2990	"psrlq _ShiftRem, %%mm7 \n\t"
		2991	// preload "movl prev_row, %%esi \n\t" // esi: Prior(x)
		2992	"movq %%mm7, %%mm6 \n\t"
		2993	"movq _LBCarryMask, %%mm5 \n\t"
		2994	"psllq _ShiftBpp, %%mm6 \n\t" // create mask for 2nd active
		2995	// group
		2996
		2997	// prime the pump: load the first Raw(x-bpp) data set
		2998	"movq -8(%%edi,%%ecx,), %%mm2 \n\t" // load previous aligned 8 bytes
		2999	// (we correct pos. in loop below)
		3000	"avg_4lp: \n\t"
		3001	"movq (%%edi,%%ecx,), %%mm0 \n\t"
		3002	"psrlq _ShiftRem, %%mm2 \n\t" // shift data to pos. correctly
		3003	"movq (%%esi,%%ecx,), %%mm1 \n\t"
		3004	// add (Prev_row/2) to average
		3005	"movq %%mm5, %%mm3 \n\t"
		3006	"pand %%mm1, %%mm3 \n\t" // get lsb for each prev_row byte
		3007	"psrlq $1, %%mm1 \n\t" // divide prev_row bytes by 2
		3008	"pand %%mm4, %%mm1 \n\t" // clear invalid bit 7 of each
		3009	// byte
		3010	"paddb %%mm1, %%mm0 \n\t" // add (Prev_row/2) to Avg for
		3011	// each byte
		3012	// add 1st active group (Raw(x-bpp)/2) to average with _LBCarry
		3013	"movq %%mm3, %%mm1 \n\t" // now use mm1 for getting
		3014	// LBCarrys
		3015	"pand %%mm2, %%mm1 \n\t" // get LBCarrys for each byte
		3016	// where both
		3017	// lsb's were == 1 (only valid for active group)
		3018	"psrlq $1, %%mm2 \n\t" // divide raw bytes by 2
		3019	"pand %%mm4, %%mm2 \n\t" // clear invalid bit 7 of each
		3020	// byte
		3021	"paddb %%mm1, %%mm2 \n\t" // add LBCarrys to (Raw(x-bpp)/2)
		3022	// for each byte
		3023	"pand %%mm7, %%mm2 \n\t" // leave only Active Group 1
		3024	// bytes to add to Avg
		3025	"paddb %%mm2, %%mm0 \n\t" // add (Raw/2) + LBCarrys to Avg
		3026	// for each Active
		3027	// byte
		3028	// add 2nd active group (Raw(x-bpp)/2) to average with _LBCarry
		3029	"movq %%mm0, %%mm2 \n\t" // mov updated Raws to mm2
		3030	"psllq _ShiftBpp, %%mm2 \n\t" // shift data to pos. correctly
		3031	"addl $8, %%ecx \n\t"
		3032	"movq %%mm3, %%mm1 \n\t" // now use mm1 for getting
		3033	// LBCarrys
		3034	"pand %%mm2, %%mm1 \n\t" // get LBCarrys for each byte
		3035	// where both
		3036	// lsb's were == 1 (only valid for active group)
		3037	"psrlq $1, %%mm2 \n\t" // divide raw bytes by 2
		3038	"pand %%mm4, %%mm2 \n\t" // clear invalid bit 7 of each
		3039	// byte
		3040	"paddb %%mm1, %%mm2 \n\t" // add LBCarrys to (Raw(x-bpp)/2)
		3041	// for each byte
		3042	"pand %%mm6, %%mm2 \n\t" // leave only Active Group 2
		3043	// bytes to add to Avg
		3044	"paddb %%mm2, %%mm0 \n\t" // add (Raw/2) + LBCarrys to
		3045	// Avg for each Active
		3046	// byte
		3047	"cmpl _MMXLength, %%ecx \n\t"
		3048	// now ready to write back to memory
		3049	"movq %%mm0, -8(%%edi,%%ecx,) \n\t"
		3050	// prep Raw(x-bpp) for next loop
		3051	"movq %%mm0, %%mm2 \n\t" // mov updated Raws to mm2
		3052	"jb avg_4lp \n\t"
		3053
		3054	: "=S" (dummy_value_S), // output regs (dummy)
		3055	"=D" (dummy_value_D)
		3056
		3057	: "0" (prev_row), // esi // input regs
		3058	"1" (row) // edi
		3059
		3060	: "%ecx" // clobber list
		3061	#if 0 /* %mm0, ..., %mm7 not supported by gcc 2.7.2.3 or egcs 1.1 */
		3062	, "%mm0", "%mm1", "%mm2", "%mm3"
		3063	, "%mm4", "%mm5", "%mm6", "%mm7"
		3064	#endif
		3065	);
		3066	}
		3067	break; // end 4,6 bpp
		3068
		3069	case 2:
		3070	{
		3071	_ActiveMask.use = 0x000000000000ffffLL;
		3072	_ShiftBpp.use = 16; // == 2 * 8
		3073	_ShiftRem.use = 48; // == 64 - 16
		3074
		3075	__asm__ __volatile__ (
		3076	// load _ActiveMask
		3077	"movq _ActiveMask, %%mm7 \n\t"
		3078	// re-init address pointers and offset
		3079	"movl _dif, %%ecx \n\t" // ecx: x = offset to alignment
		3080	// boundary
		3081	"movq _LBCarryMask, %%mm5 \n\t"
		3082	// preload "movl row, %%edi \n\t" // edi: Avg(x)
		3083	"movq _HBClearMask, %%mm4 \n\t"
		3084	// preload "movl prev_row, %%esi \n\t" // esi: Prior(x)
		3085
		3086	// prime the pump: load the first Raw(x-bpp) data set
		3087	"movq -8(%%edi,%%ecx,), %%mm2 \n\t" // load previous aligned 8 bytes
		3088	// (we correct pos. in loop below)
		3089	"avg_2lp: \n\t"
		3090	"movq (%%edi,%%ecx,), %%mm0 \n\t"
		3091	"psrlq _ShiftRem, %%mm2 \n\t" // shift data to pos. correctly
		3092	"movq (%%esi,%%ecx,), %%mm1 \n\t" // (GRR BUGFIX: was psllq)
		3093	// add (Prev_row/2) to average
		3094	"movq %%mm5, %%mm3 \n\t"
		3095	"pand %%mm1, %%mm3 \n\t" // get lsb for each prev_row byte
		3096	"psrlq $1, %%mm1 \n\t" // divide prev_row bytes by 2
		3097	"pand %%mm4, %%mm1 \n\t" // clear invalid bit 7 of each
		3098	// byte
		3099	"movq %%mm7, %%mm6 \n\t"
		3100	"paddb %%mm1, %%mm0 \n\t" // add (Prev_row/2) to Avg for
		3101	// each byte
		3102
		3103	// add 1st active group (Raw(x-bpp)/2) to average with _LBCarry
		3104	"movq %%mm3, %%mm1 \n\t" // now use mm1 for getting
		3105	// LBCarrys
		3106	"pand %%mm2, %%mm1 \n\t" // get LBCarrys for each byte
		3107	// where both
		3108	// lsb's were == 1 (only valid
		3109	// for active group)
		3110	"psrlq $1, %%mm2 \n\t" // divide raw bytes by 2
		3111	"pand %%mm4, %%mm2 \n\t" // clear invalid bit 7 of each
		3112	// byte
		3113	"paddb %%mm1, %%mm2 \n\t" // add LBCarrys to (Raw(x-bpp)/2)
		3114	// for each byte
		3115	"pand %%mm6, %%mm2 \n\t" // leave only Active Group 1
		3116	// bytes to add to Avg
		3117	"paddb %%mm2, %%mm0 \n\t" // add (Raw/2) + LBCarrys to Avg
		3118	// for each Active byte
		3119
		3120	// add 2nd active group (Raw(x-bpp)/2) to average with _LBCarry
		3121	"psllq _ShiftBpp, %%mm6 \n\t" // shift the mm6 mask to cover
		3122	// bytes 2 & 3
		3123	"movq %%mm0, %%mm2 \n\t" // mov updated Raws to mm2
		3124	"psllq _ShiftBpp, %%mm2 \n\t" // shift data to pos. correctly
		3125	"movq %%mm3, %%mm1 \n\t" // now use mm1 for getting
		3126	// LBCarrys
		3127	"pand %%mm2, %%mm1 \n\t" // get LBCarrys for each byte
		3128	// where both
		3129	// lsb's were == 1 (only valid
		3130	// for active group)
		3131	"psrlq $1, %%mm2 \n\t" // divide raw bytes by 2
		3132	"pand %%mm4, %%mm2 \n\t" // clear invalid bit 7 of each
		3133	// byte
		3134	"paddb %%mm1, %%mm2 \n\t" // add LBCarrys to (Raw(x-bpp)/2)
		3135	// for each byte
		3136	"pand %%mm6, %%mm2 \n\t" // leave only Active Group 2
		3137	// bytes to add to Avg
		3138	"paddb %%mm2, %%mm0 \n\t" // add (Raw/2) + LBCarrys to
		3139	// Avg for each Active byte
		3140
		3141	// add 3rd active group (Raw(x-bpp)/2) to average with _LBCarry
		3142	"psllq _ShiftBpp, %%mm6 \n\t" // shift the mm6 mask to cover
		3143	// bytes 4 & 5
		3144	"movq %%mm0, %%mm2 \n\t" // mov updated Raws to mm2
		3145	"psllq _ShiftBpp, %%mm2 \n\t" // shift data to pos. correctly
		3146	"movq %%mm3, %%mm1 \n\t" // now use mm1 for getting
		3147	// LBCarrys
		3148	"pand %%mm2, %%mm1 \n\t" // get LBCarrys for each byte
		3149	// where both lsb's were == 1
		3150	// (only valid for active group)
		3151	"psrlq $1, %%mm2 \n\t" // divide raw bytes by 2
		3152	"pand %%mm4, %%mm2 \n\t" // clear invalid bit 7 of each
		3153	// byte
		3154	"paddb %%mm1, %%mm2 \n\t" // add LBCarrys to (Raw(x-bpp)/2)
		3155	// for each byte
		3156	"pand %%mm6, %%mm2 \n\t" // leave only Active Group 2
		3157	// bytes to add to Avg
		3158	"paddb %%mm2, %%mm0 \n\t" // add (Raw/2) + LBCarrys to
		3159	// Avg for each Active byte
		3160
		3161	// add 4th active group (Raw(x-bpp)/2) to average with _LBCarry
		3162	"psllq _ShiftBpp, %%mm6 \n\t" // shift the mm6 mask to cover
		3163	// bytes 6 & 7
		3164	"movq %%mm0, %%mm2 \n\t" // mov updated Raws to mm2
		3165	"psllq _ShiftBpp, %%mm2 \n\t" // shift data to pos. correctly
		3166	"addl $8, %%ecx \n\t"
		3167	"movq %%mm3, %%mm1 \n\t" // now use mm1 for getting
		3168	// LBCarrys
		3169	"pand %%mm2, %%mm1 \n\t" // get LBCarrys for each byte
		3170	// where both
		3171	// lsb's were == 1 (only valid
		3172	// for active group)
		3173	"psrlq $1, %%mm2 \n\t" // divide raw bytes by 2
		3174	"pand %%mm4, %%mm2 \n\t" // clear invalid bit 7 of each
		3175	// byte
		3176	"paddb %%mm1, %%mm2 \n\t" // add LBCarrys to (Raw(x-bpp)/2)
		3177	// for each byte
		3178	"pand %%mm6, %%mm2 \n\t" // leave only Active Group 2
		3179	// bytes to add to Avg
		3180	"paddb %%mm2, %%mm0 \n\t" // add (Raw/2) + LBCarrys to
		3181	// Avg for each Active byte
		3182
		3183	"cmpl _MMXLength, %%ecx \n\t"
		3184	// now ready to write back to memory
		3185	"movq %%mm0, -8(%%edi,%%ecx,) \n\t"
		3186	// prep Raw(x-bpp) for next loop
		3187	"movq %%mm0, %%mm2 \n\t" // mov updated Raws to mm2
		3188	"jb avg_2lp \n\t"
		3189
		3190	: "=S" (dummy_value_S), // output regs (dummy)
		3191	"=D" (dummy_value_D)
		3192
		3193	: "0" (prev_row), // esi // input regs
		3194	"1" (row) // edi
		3195
		3196	: "%ecx" // clobber list
		3197	#if 0 /* %mm0, ..., %mm7 not supported by gcc 2.7.2.3 or egcs 1.1 */
		3198	, "%mm0", "%mm1", "%mm2", "%mm3"
		3199	, "%mm4", "%mm5", "%mm6", "%mm7"
		3200	#endif
		3201	);
		3202	}
		3203	break; // end 2 bpp
		3204
		3205	case 1:
		3206	{
		3207	__asm__ __volatile__ (
		3208	// re-init address pointers and offset
		3209	#ifdef __PIC__
		3210	"pushl %%ebx \n\t" // save Global Offset Table index
		3211	#endif
		3212	"movl _dif, %%ebx \n\t" // ebx: x = offset to alignment
		3213	// boundary
		3214	// preload "movl row, %%edi \n\t" // edi: Avg(x)
		3215	"cmpl _FullLength, %%ebx \n\t" // test if offset at end of array
		3216	"jnb avg_1end \n\t"
		3217	// do Avg decode for remaining bytes
		3218	// preload "movl prev_row, %%esi \n\t" // esi: Prior(x)
		3219	"movl %%edi, %%edx \n\t"
		3220	// preload "subl bpp, %%edx \n\t" // (bpp is preloaded into ecx)
		3221	"subl %%ecx, %%edx \n\t" // edx: Raw(x-bpp)
		3222	"xorl %%ecx, %%ecx \n\t" // zero ecx before using cl & cx
		3223	// in loop below
		3224	"avg_1lp: \n\t"
		3225	// Raw(x) = Avg(x) + ((Raw(x-bpp) + Prior(x))/2)
		3226	"xorl %%eax, %%eax \n\t"
		3227	"movb (%%esi,%%ebx,), %%cl \n\t" // load cl with Prior(x)
		3228	"movb (%%edx,%%ebx,), %%al \n\t" // load al with Raw(x-bpp)
		3229	"addw %%cx, %%ax \n\t"
		3230	"incl %%ebx \n\t"
		3231	"shrw %%ax \n\t" // divide by 2
		3232	"addb -1(%%edi,%%ebx,), %%al \n\t" // add Avg(x); -1 to offset
		3233	// inc ebx
		3234	"cmpl _FullLength, %%ebx \n\t" // check if at end of array
		3235	"movb %%al, -1(%%edi,%%ebx,) \n\t" // write back Raw(x);
		3236	// mov does not affect flags; -1 to offset inc ebx
		3237	"jb avg_1lp \n\t"
		3238
		3239	"avg_1end: \n\t"
		3240	#ifdef __PIC__
		3241	"popl %%ebx \n\t" // Global Offset Table index
		3242	#endif
		3243
		3244	: "=c" (dummy_value_c), // output regs (dummy)
		3245	"=S" (dummy_value_S),
		3246	"=D" (dummy_value_D)
		3247
		3248	: "0" (bpp), // ecx // input regs
		3249	"1" (prev_row), // esi
		3250	"2" (row) // edi
		3251
		3252	: "%eax", "%edx" // clobber list
		3253	#ifndef __PIC__
		3254	, "%ebx"
		3255	#endif
		3256	);
		3257	}
		3258	return; // end 1 bpp
		3259
		3260	case 8:
		3261	{
		3262	__asm__ __volatile__ (
		3263	// re-init address pointers and offset
		3264	"movl _dif, %%ecx \n\t" // ecx: x == offset to alignment
		3265	"movq _LBCarryMask, %%mm5 \n\t" // boundary
		3266	// preload "movl row, %%edi \n\t" // edi: Avg(x)
		3267	"movq _HBClearMask, %%mm4 \n\t"
		3268	// preload "movl prev_row, %%esi \n\t" // esi: Prior(x)
		3269
		3270	// prime the pump: load the first Raw(x-bpp) data set
		3271	"movq -8(%%edi,%%ecx,), %%mm2 \n\t" // load previous aligned 8 bytes
		3272	// (NO NEED to correct pos. in loop below)
		3273
		3274	"avg_8lp: \n\t"
		3275	"movq (%%edi,%%ecx,), %%mm0 \n\t"
		3276	"movq %%mm5, %%mm3 \n\t"
		3277	"movq (%%esi,%%ecx,), %%mm1 \n\t"
		3278	"addl $8, %%ecx \n\t"
		3279	"pand %%mm1, %%mm3 \n\t" // get lsb for each prev_row byte
		3280	"psrlq $1, %%mm1 \n\t" // divide prev_row bytes by 2
		3281	"pand %%mm2, %%mm3 \n\t" // get LBCarrys for each byte
		3282	// where both lsb's were == 1
		3283	"psrlq $1, %%mm2 \n\t" // divide raw bytes by 2
		3284	"pand %%mm4, %%mm1 \n\t" // clear invalid bit 7, each byte
		3285	"paddb %%mm3, %%mm0 \n\t" // add LBCarrys to Avg, each byte
		3286	"pand %%mm4, %%mm2 \n\t" // clear invalid bit 7, each byte
		3287	"paddb %%mm1, %%mm0 \n\t" // add (Prev_row/2) to Avg, each
		3288	"paddb %%mm2, %%mm0 \n\t" // add (Raw/2) to Avg for each
		3289	"cmpl _MMXLength, %%ecx \n\t"
		3290	"movq %%mm0, -8(%%edi,%%ecx,) \n\t"
		3291	"movq %%mm0, %%mm2 \n\t" // reuse as Raw(x-bpp)
		3292	"jb avg_8lp \n\t"
		3293
		3294	: "=S" (dummy_value_S), // output regs (dummy)
		3295	"=D" (dummy_value_D)
		3296
		3297	: "0" (prev_row), // esi // input regs
		3298	"1" (row) // edi
		3299
		3300	: "%ecx" // clobber list
		3301	#if 0 /* %mm0, ..., %mm5 not supported by gcc 2.7.2.3 or egcs 1.1 */
		3302	, "%mm0", "%mm1", "%mm2"
		3303	, "%mm3", "%mm4", "%mm5"
		3304	#endif
		3305	);
		3306	}
		3307	break; // end 8 bpp
		3308
		3309	default: // bpp greater than 8 (!= 1,2,3,4,[5],6,[7],8)
		3310	{
		3311
		3312	#ifdef PNG_DEBUG
		3313	// GRR: PRINT ERROR HERE: SHOULD NEVER BE REACHED
		3314	png_debug(1,
		3315	"Internal logic error in pnggccrd (png_read_filter_row_mmx_avg())\n");
		3316	#endif
		3317
		3318	#if 0
		3319	__asm__ __volatile__ (
		3320	"movq _LBCarryMask, %%mm5 \n\t"
		3321	// re-init address pointers and offset
		3322	"movl _dif, %%ebx \n\t" // ebx: x = offset to
		3323	// alignment boundary
		3324	"movl row, %%edi \n\t" // edi: Avg(x)
		3325	"movq _HBClearMask, %%mm4 \n\t"
		3326	"movl %%edi, %%edx \n\t"
		3327	"movl prev_row, %%esi \n\t" // esi: Prior(x)
		3328	"subl bpp, %%edx \n\t" // edx: Raw(x-bpp)
		3329	"avg_Alp: \n\t"
		3330	"movq (%%edi,%%ebx,), %%mm0 \n\t"
		3331	"movq %%mm5, %%mm3 \n\t"
		3332	"movq (%%esi,%%ebx,), %%mm1 \n\t"
		3333	"pand %%mm1, %%mm3 \n\t" // get lsb for each prev_row byte
		3334	"movq (%%edx,%%ebx,), %%mm2 \n\t"
		3335	"psrlq $1, %%mm1 \n\t" // divide prev_row bytes by 2
		3336	"pand %%mm2, %%mm3 \n\t" // get LBCarrys for each byte
		3337	// where both lsb's were == 1
		3338	"psrlq $1, %%mm2 \n\t" // divide raw bytes by 2
		3339	"pand %%mm4, %%mm1 \n\t" // clear invalid bit 7 of each
		3340	// byte
		3341	"paddb %%mm3, %%mm0 \n\t" // add LBCarrys to Avg for each
		3342	// byte
		3343	"pand %%mm4, %%mm2 \n\t" // clear invalid bit 7 of each
		3344	// byte
		3345	"paddb %%mm1, %%mm0 \n\t" // add (Prev_row/2) to Avg for
		3346	// each byte
		3347	"addl $8, %%ebx \n\t"
		3348	"paddb %%mm2, %%mm0 \n\t" // add (Raw/2) to Avg for each
		3349	// byte
		3350	"cmpl _MMXLength, %%ebx \n\t"
		3351	"movq %%mm0, -8(%%edi,%%ebx,) \n\t"
		3352	"jb avg_Alp \n\t"
		3353
		3354	: // FIXASM: output regs/vars go here, e.g.: "=m" (memory_var)
		3355
		3356	: // FIXASM: input regs, e.g.: "c" (count), "S" (src), "D" (dest)
		3357
		3358	: "%ebx", "%edx", "%edi", "%esi" // CHECKASM: clobber list
		3359	);
		3360	#endif /* 0 - NEVER REACHED */
		3361	}
		3362	break;
		3363
		3364	} // end switch (bpp)
		3365
		3366	__asm__ __volatile__ (
		3367	// MMX acceleration complete; now do clean-up
		3368	// check if any remaining bytes left to decode
		3369	#ifdef __PIC__
		3370	"pushl %%ebx \n\t" // save index to Global Offset Table
		3371	#endif
		3372	"movl _MMXLength, %%ebx \n\t" // ebx: x == offset bytes after MMX
		3373	//pre "movl row, %%edi \n\t" // edi: Avg(x)
		3374	"cmpl _FullLength, %%ebx \n\t" // test if offset at end of array
		3375	"jnb avg_end \n\t"
		3376
		3377	// do Avg decode for remaining bytes
		3378	//pre "movl prev_row, %%esi \n\t" // esi: Prior(x)
		3379	"movl %%edi, %%edx \n\t"
		3380	//pre "subl bpp, %%edx \n\t" // (bpp is preloaded into ecx)
		3381	"subl %%ecx, %%edx \n\t" // edx: Raw(x-bpp)
		3382	"xorl %%ecx, %%ecx \n\t" // zero ecx before using cl & cx below
		3383
		3384	"avg_lp2: \n\t"
		3385	// Raw(x) = Avg(x) + ((Raw(x-bpp) + Prior(x))/2)
		3386	"xorl %%eax, %%eax \n\t"
		3387	"movb (%%esi,%%ebx,), %%cl \n\t" // load cl with Prior(x)
		3388	"movb (%%edx,%%ebx,), %%al \n\t" // load al with Raw(x-bpp)
		3389	"addw %%cx, %%ax \n\t"
		3390	"incl %%ebx \n\t"
		3391	"shrw %%ax \n\t" // divide by 2
		3392	"addb -1(%%edi,%%ebx,), %%al \n\t" // add Avg(x); -1 to offset inc ebx
		3393	"cmpl _FullLength, %%ebx \n\t" // check if at end of array
		3394	"movb %%al, -1(%%edi,%%ebx,) \n\t" // write back Raw(x) [mov does not
		3395	"jb avg_lp2 \n\t" // affect flags; -1 to offset inc ebx]
		3396
		3397	"avg_end: \n\t"
		3398	"EMMS \n\t" // end MMX; prep for poss. FP instrs.
		3399	#ifdef __PIC__
		3400	"popl %%ebx \n\t" // restore index to Global Offset Table
		3401	#endif
		3402
		3403	: "=c" (dummy_value_c), // output regs (dummy)
		3404	"=S" (dummy_value_S),
		3405	"=D" (dummy_value_D)
		3406
		3407	: "0" (bpp), // ecx // input regs
		3408	"1" (prev_row), // esi
		3409	"2" (row) // edi
		3410
		3411	: "%eax", "%edx" // clobber list
		3412	#ifndef __PIC__
		3413	, "%ebx"
		3414	#endif
		3415	);
		3416
		3417	} /* end png_read_filter_row_mmx_avg() */
		3418	#endif
		3419
		3420
		3421
		3422	#ifdef PNG_THREAD_UNSAFE_OK
		3423	//===========================================================================//
		3424	// //
		3425	// P N G _ R E A D _ F I L T E R _ R O W _ M M X _ P A E T H //
		3426	// //
		3427	//===========================================================================//
		3428
		3429	// Optimized code for PNG Paeth filter decoder
		3430
		3431	static void /* PRIVATE */
		3432	png_read_filter_row_mmx_paeth(png_row_infop row_info, png_bytep row,
		3433	png_bytep prev_row)
		3434	{
		3435	int bpp;
		3436	int dummy_value_c; // fix 'forbidden register 2 (cx) was spilled' error
		3437	int dummy_value_S;
		3438	int dummy_value_D;
		3439
		3440	bpp = (row_info->pixel_depth + 7) >> 3; // Get # bytes per pixel
		3441	_FullLength = row_info->rowbytes; // # of bytes to filter
		3442
		3443	__asm__ __volatile__ (
		3444	#ifdef __PIC__
		3445	"pushl %%ebx \n\t" // save index to Global Offset Table
		3446	#endif
		3447	"xorl %%ebx, %%ebx \n\t" // ebx: x offset
		3448	//pre "movl row, %%edi \n\t"
		3449	"xorl %%edx, %%edx \n\t" // edx: x-bpp offset
		3450	//pre "movl prev_row, %%esi \n\t"
		3451	"xorl %%eax, %%eax \n\t"
		3452
		3453	// Compute the Raw value for the first bpp bytes
		3454	// Note: for x < bpp the formula always works out to
		3455	// Raw(x) = Paeth(x) + Prior(x)
		3456	"paeth_rlp: \n\t"
		3457	"movb (%%edi,%%ebx,), %%al \n\t"
		3458	"addb (%%esi,%%ebx,), %%al \n\t"
		3459	"incl %%ebx \n\t"
		3460	//pre "cmpl bpp, %%ebx \n\t" (bpp is preloaded into ecx)
		3461	"cmpl %%ecx, %%ebx \n\t"
		3462	"movb %%al, -1(%%edi,%%ebx,) \n\t"
		3463	"jb paeth_rlp \n\t"
		3464	// get # of bytes to alignment
		3465	"movl %%edi, _dif \n\t" // take start of row
		3466	"addl %%ebx, _dif \n\t" // add bpp
		3467	"xorl %%ecx, %%ecx \n\t"
		3468	"addl $0xf, _dif \n\t" // add 7 + 8 to incr past alignment
		3469	// boundary
		3470	"andl $0xfffffff8, _dif \n\t" // mask to alignment boundary
		3471	"subl %%edi, _dif \n\t" // subtract from start ==> value ebx
		3472	// at alignment
		3473	"jz paeth_go \n\t"
		3474	// fix alignment
		3475
		3476	"paeth_lp1: \n\t"
		3477	"xorl %%eax, %%eax \n\t"
		3478	// pav = p - a = (a + b - c) - a = b - c
		3479	"movb (%%esi,%%ebx,), %%al \n\t" // load Prior(x) into al
		3480	"movb (%%esi,%%edx,), %%cl \n\t" // load Prior(x-bpp) into cl
		3481	"subl %%ecx, %%eax \n\t" // subtract Prior(x-bpp)
		3482	"movl %%eax, _patemp \n\t" // Save pav for later use
		3483	"xorl %%eax, %%eax \n\t"
		3484	// pbv = p - b = (a + b - c) - b = a - c
		3485	"movb (%%edi,%%edx,), %%al \n\t" // load Raw(x-bpp) into al
		3486	"subl %%ecx, %%eax \n\t" // subtract Prior(x-bpp)
		3487	"movl %%eax, %%ecx \n\t"
		3488	// pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
		3489	"addl _patemp, %%eax \n\t" // pcv = pav + pbv
		3490	// pc = abs(pcv)
		3491	"testl $0x80000000, %%eax \n\t"
		3492	"jz paeth_pca \n\t"
		3493	"negl %%eax \n\t" // reverse sign of neg values
		3494
		3495	"paeth_pca: \n\t"
		3496	"movl %%eax, _pctemp \n\t" // save pc for later use
		3497	// pb = abs(pbv)
		3498	"testl $0x80000000, %%ecx \n\t"
		3499	"jz paeth_pba \n\t"
		3500	"negl %%ecx \n\t" // reverse sign of neg values
		3501
		3502	"paeth_pba: \n\t"
		3503	"movl %%ecx, _pbtemp \n\t" // save pb for later use
		3504	// pa = abs(pav)
		3505	"movl _patemp, %%eax \n\t"
		3506	"testl $0x80000000, %%eax \n\t"
		3507	"jz paeth_paa \n\t"
		3508	"negl %%eax \n\t" // reverse sign of neg values
		3509
		3510	"paeth_paa: \n\t"
		3511	"movl %%eax, _patemp \n\t" // save pa for later use
		3512	// test if pa <= pb
		3513	"cmpl %%ecx, %%eax \n\t"
		3514	"jna paeth_abb \n\t"
		3515	// pa > pb; now test if pb <= pc
		3516	"cmpl _pctemp, %%ecx \n\t"
		3517	"jna paeth_bbc \n\t"
		3518	// pb > pc; Raw(x) = Paeth(x) + Prior(x-bpp)
		3519	"movb (%%esi,%%edx,), %%cl \n\t" // load Prior(x-bpp) into cl
		3520	"jmp paeth_paeth \n\t"
		3521
		3522	"paeth_bbc: \n\t"
		3523	// pb <= pc; Raw(x) = Paeth(x) + Prior(x)
		3524	"movb (%%esi,%%ebx,), %%cl \n\t" // load Prior(x) into cl
		3525	"jmp paeth_paeth \n\t"
		3526
		3527	"paeth_abb: \n\t"
		3528	// pa <= pb; now test if pa <= pc
		3529	"cmpl _pctemp, %%eax \n\t"
		3530	"jna paeth_abc \n\t"
		3531	// pa > pc; Raw(x) = Paeth(x) + Prior(x-bpp)
		3532	"movb (%%esi,%%edx,), %%cl \n\t" // load Prior(x-bpp) into cl
		3533	"jmp paeth_paeth \n\t"
		3534
		3535	"paeth_abc: \n\t"
		3536	// pa <= pc; Raw(x) = Paeth(x) + Raw(x-bpp)
		3537	"movb (%%edi,%%edx,), %%cl \n\t" // load Raw(x-bpp) into cl
		3538
		3539	"paeth_paeth: \n\t"
		3540	"incl %%ebx \n\t"
		3541	"incl %%edx \n\t"
		3542	// Raw(x) = (Paeth(x) + Paeth_Predictor( a, b, c )) mod 256
		3543	"addb %%cl, -1(%%edi,%%ebx,) \n\t"
		3544	"cmpl _dif, %%ebx \n\t"
		3545	"jb paeth_lp1 \n\t"
		3546
		3547	"paeth_go: \n\t"
		3548	"movl _FullLength, %%ecx \n\t"
		3549	"movl %%ecx, %%eax \n\t"
		3550	"subl %%ebx, %%eax \n\t" // subtract alignment fix
		3551	"andl $0x00000007, %%eax \n\t" // calc bytes over mult of 8
		3552	"subl %%eax, %%ecx \n\t" // drop over bytes from original length
		3553	"movl %%ecx, _MMXLength \n\t"
		3554	#ifdef __PIC__
		3555	"popl %%ebx \n\t" // restore index to Global Offset Table
		3556	#endif
		3557
		3558	: "=c" (dummy_value_c), // output regs (dummy)
		3559	"=S" (dummy_value_S),
		3560	"=D" (dummy_value_D)
		3561
		3562	: "0" (bpp), // ecx // input regs
		3563	"1" (prev_row), // esi
		3564	"2" (row) // edi
		3565
		3566	: "%eax", "%edx" // clobber list
		3567	#ifndef __PIC__
		3568	, "%ebx"
		3569	#endif
		3570	);
		3571
		3572	// now do the math for the rest of the row
		3573	switch (bpp)
		3574	{
		3575	case 3:
		3576	{
		3577	_ActiveMask.use = 0x0000000000ffffffLL;
		3578	_ActiveMaskEnd.use = 0xffff000000000000LL;
		3579	_ShiftBpp.use = 24; // == bpp(3) * 8
		3580	_ShiftRem.use = 40; // == 64 - 24
		3581
		3582	__asm__ __volatile__ (
		3583	"movl _dif, %%ecx \n\t"
		3584	// preload "movl row, %%edi \n\t"
		3585	// preload "movl prev_row, %%esi \n\t"
		3586	"pxor %%mm0, %%mm0 \n\t"
		3587	// prime the pump: load the first Raw(x-bpp) data set
		3588	"movq -8(%%edi,%%ecx,), %%mm1 \n\t"
		3589	"paeth_3lp: \n\t"
		3590	"psrlq _ShiftRem, %%mm1 \n\t" // shift last 3 bytes to 1st
		3591	// 3 bytes
		3592	"movq (%%esi,%%ecx,), %%mm2 \n\t" // load b=Prior(x)
		3593	"punpcklbw %%mm0, %%mm1 \n\t" // unpack High bytes of a
		3594	"movq -8(%%esi,%%ecx,), %%mm3 \n\t" // prep c=Prior(x-bpp) bytes
		3595	"punpcklbw %%mm0, %%mm2 \n\t" // unpack High bytes of b
		3596	"psrlq _ShiftRem, %%mm3 \n\t" // shift last 3 bytes to 1st
		3597	// 3 bytes
		3598	// pav = p - a = (a + b - c) - a = b - c
		3599	"movq %%mm2, %%mm4 \n\t"
		3600	"punpcklbw %%mm0, %%mm3 \n\t" // unpack High bytes of c
		3601	// pbv = p - b = (a + b - c) - b = a - c
		3602	"movq %%mm1, %%mm5 \n\t"
		3603	"psubw %%mm3, %%mm4 \n\t"
		3604	"pxor %%mm7, %%mm7 \n\t"
		3605	// pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
		3606	"movq %%mm4, %%mm6 \n\t"
		3607	"psubw %%mm3, %%mm5 \n\t"
		3608
		3609	// pa = abs(p-a) = abs(pav)
		3610	// pb = abs(p-b) = abs(pbv)
		3611	// pc = abs(p-c) = abs(pcv)
		3612	"pcmpgtw %%mm4, %%mm0 \n\t" // create mask pav bytes < 0
		3613	"paddw %%mm5, %%mm6 \n\t"
		3614	"pand %%mm4, %%mm0 \n\t" // only pav bytes < 0 in mm7
		3615	"pcmpgtw %%mm5, %%mm7 \n\t" // create mask pbv bytes < 0
		3616	"psubw %%mm0, %%mm4 \n\t"
		3617	"pand %%mm5, %%mm7 \n\t" // only pbv bytes < 0 in mm0
		3618	"psubw %%mm0, %%mm4 \n\t"
		3619	"psubw %%mm7, %%mm5 \n\t"
		3620	"pxor %%mm0, %%mm0 \n\t"
		3621	"pcmpgtw %%mm6, %%mm0 \n\t" // create mask pcv bytes < 0
		3622	"pand %%mm6, %%mm0 \n\t" // only pav bytes < 0 in mm7
		3623	"psubw %%mm7, %%mm5 \n\t"
		3624	"psubw %%mm0, %%mm6 \n\t"
		3625	// test pa <= pb
		3626	"movq %%mm4, %%mm7 \n\t"
		3627	"psubw %%mm0, %%mm6 \n\t"
		3628	"pcmpgtw %%mm5, %%mm7 \n\t" // pa > pb?
		3629	"movq %%mm7, %%mm0 \n\t"
		3630	// use mm7 mask to merge pa & pb
		3631	"pand %%mm7, %%mm5 \n\t"
		3632	// use mm0 mask copy to merge a & b
		3633	"pand %%mm0, %%mm2 \n\t"
		3634	"pandn %%mm4, %%mm7 \n\t"
		3635	"pandn %%mm1, %%mm0 \n\t"
		3636	"paddw %%mm5, %%mm7 \n\t"
		3637	"paddw %%mm2, %%mm0 \n\t"
		3638	// test ((pa <= pb)? pa:pb) <= pc
		3639	"pcmpgtw %%mm6, %%mm7 \n\t" // pab > pc?
		3640	"pxor %%mm1, %%mm1 \n\t"
		3641	"pand %%mm7, %%mm3 \n\t"
		3642	"pandn %%mm0, %%mm7 \n\t"
		3643	"paddw %%mm3, %%mm7 \n\t"
		3644	"pxor %%mm0, %%mm0 \n\t"
		3645	"packuswb %%mm1, %%mm7 \n\t"
		3646	"movq (%%esi,%%ecx,), %%mm3 \n\t" // load c=Prior(x-bpp)
		3647	"pand _ActiveMask, %%mm7 \n\t"
		3648	"movq %%mm3, %%mm2 \n\t" // load b=Prior(x) step 1
		3649	"paddb (%%edi,%%ecx,), %%mm7 \n\t" // add Paeth predictor with Raw(x)
		3650	"punpcklbw %%mm0, %%mm3 \n\t" // unpack High bytes of c
		3651	"movq %%mm7, (%%edi,%%ecx,) \n\t" // write back updated value
		3652	"movq %%mm7, %%mm1 \n\t" // now mm1 will be used as
		3653	// Raw(x-bpp)
		3654	// now do Paeth for 2nd set of bytes (3-5)
		3655	"psrlq _ShiftBpp, %%mm2 \n\t" // load b=Prior(x) step 2
		3656	"punpcklbw %%mm0, %%mm1 \n\t" // unpack High bytes of a
		3657	"pxor %%mm7, %%mm7 \n\t"
		3658	"punpcklbw %%mm0, %%mm2 \n\t" // unpack High bytes of b
		3659	// pbv = p - b = (a + b - c) - b = a - c
		3660	"movq %%mm1, %%mm5 \n\t"
		3661	// pav = p - a = (a + b - c) - a = b - c
		3662	"movq %%mm2, %%mm4 \n\t"
		3663	"psubw %%mm3, %%mm5 \n\t"
		3664	"psubw %%mm3, %%mm4 \n\t"
		3665	// pcv = p - c = (a + b - c) -c = (a - c) + (b - c) =
		3666	// pav + pbv = pbv + pav
		3667	"movq %%mm5, %%mm6 \n\t"
		3668	"paddw %%mm4, %%mm6 \n\t"
		3669
		3670	// pa = abs(p-a) = abs(pav)
		3671	// pb = abs(p-b) = abs(pbv)
		3672	// pc = abs(p-c) = abs(pcv)
		3673	"pcmpgtw %%mm5, %%mm0 \n\t" // create mask pbv bytes < 0
		3674	"pcmpgtw %%mm4, %%mm7 \n\t" // create mask pav bytes < 0
		3675	"pand %%mm5, %%mm0 \n\t" // only pbv bytes < 0 in mm0
		3676	"pand %%mm4, %%mm7 \n\t" // only pav bytes < 0 in mm7
		3677	"psubw %%mm0, %%mm5 \n\t"
		3678	"psubw %%mm7, %%mm4 \n\t"
		3679	"psubw %%mm0, %%mm5 \n\t"
		3680	"psubw %%mm7, %%mm4 \n\t"
		3681	"pxor %%mm0, %%mm0 \n\t"
		3682	"pcmpgtw %%mm6, %%mm0 \n\t" // create mask pcv bytes < 0
		3683	"pand %%mm6, %%mm0 \n\t" // only pav bytes < 0 in mm7
		3684	"psubw %%mm0, %%mm6 \n\t"
		3685	// test pa <= pb
		3686	"movq %%mm4, %%mm7 \n\t"
		3687	"psubw %%mm0, %%mm6 \n\t"
		3688	"pcmpgtw %%mm5, %%mm7 \n\t" // pa > pb?
		3689	"movq %%mm7, %%mm0 \n\t"
		3690	// use mm7 mask to merge pa & pb
		3691	"pand %%mm7, %%mm5 \n\t"
		3692	// use mm0 mask copy to merge a & b
		3693	"pand %%mm0, %%mm2 \n\t"
		3694	"pandn %%mm4, %%mm7 \n\t"
		3695	"pandn %%mm1, %%mm0 \n\t"
		3696	"paddw %%mm5, %%mm7 \n\t"
		3697	"paddw %%mm2, %%mm0 \n\t"
		3698	// test ((pa <= pb)? pa:pb) <= pc
		3699	"pcmpgtw %%mm6, %%mm7 \n\t" // pab > pc?
		3700	"movq (%%esi,%%ecx,), %%mm2 \n\t" // load b=Prior(x)
		3701	"pand %%mm7, %%mm3 \n\t"
		3702	"pandn %%mm0, %%mm7 \n\t"
		3703	"pxor %%mm1, %%mm1 \n\t"
		3704	"paddw %%mm3, %%mm7 \n\t"
		3705	"pxor %%mm0, %%mm0 \n\t"
		3706	"packuswb %%mm1, %%mm7 \n\t"
		3707	"movq %%mm2, %%mm3 \n\t" // load c=Prior(x-bpp) step 1
		3708	"pand _ActiveMask, %%mm7 \n\t"
		3709	"punpckhbw %%mm0, %%mm2 \n\t" // unpack High bytes of b
		3710	"psllq _ShiftBpp, %%mm7 \n\t" // shift bytes to 2nd group of
		3711	// 3 bytes
		3712	// pav = p - a = (a + b - c) - a = b - c
		3713	"movq %%mm2, %%mm4 \n\t"
		3714	"paddb (%%edi,%%ecx,), %%mm7 \n\t" // add Paeth predictor with Raw(x)
		3715	"psllq _ShiftBpp, %%mm3 \n\t" // load c=Prior(x-bpp) step 2
		3716	"movq %%mm7, (%%edi,%%ecx,) \n\t" // write back updated value
		3717	"movq %%mm7, %%mm1 \n\t"
		3718	"punpckhbw %%mm0, %%mm3 \n\t" // unpack High bytes of c
		3719	"psllq _ShiftBpp, %%mm1 \n\t" // shift bytes
		3720	// now mm1 will be used as Raw(x-bpp)
		3721	// now do Paeth for 3rd, and final, set of bytes (6-7)
		3722	"pxor %%mm7, %%mm7 \n\t"
		3723	"punpckhbw %%mm0, %%mm1 \n\t" // unpack High bytes of a
		3724	"psubw %%mm3, %%mm4 \n\t"
		3725	// pbv = p - b = (a + b - c) - b = a - c
		3726	"movq %%mm1, %%mm5 \n\t"
		3727	// pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
		3728	"movq %%mm4, %%mm6 \n\t"
		3729	"psubw %%mm3, %%mm5 \n\t"
		3730	"pxor %%mm0, %%mm0 \n\t"
		3731	"paddw %%mm5, %%mm6 \n\t"
		3732
		3733	// pa = abs(p-a) = abs(pav)
		3734	// pb = abs(p-b) = abs(pbv)
		3735	// pc = abs(p-c) = abs(pcv)
		3736	"pcmpgtw %%mm4, %%mm0 \n\t" // create mask pav bytes < 0
		3737	"pcmpgtw %%mm5, %%mm7 \n\t" // create mask pbv bytes < 0
		3738	"pand %%mm4, %%mm0 \n\t" // only pav bytes < 0 in mm7
		3739	"pand %%mm5, %%mm7 \n\t" // only pbv bytes < 0 in mm0
		3740	"psubw %%mm0, %%mm4 \n\t"
		3741	"psubw %%mm7, %%mm5 \n\t"
		3742	"psubw %%mm0, %%mm4 \n\t"
		3743	"psubw %%mm7, %%mm5 \n\t"
		3744	"pxor %%mm0, %%mm0 \n\t"
		3745	"pcmpgtw %%mm6, %%mm0 \n\t" // create mask pcv bytes < 0
		3746	"pand %%mm6, %%mm0 \n\t" // only pav bytes < 0 in mm7
		3747	"psubw %%mm0, %%mm6 \n\t"
		3748	// test pa <= pb
		3749	"movq %%mm4, %%mm7 \n\t"
		3750	"psubw %%mm0, %%mm6 \n\t"
		3751	"pcmpgtw %%mm5, %%mm7 \n\t" // pa > pb?
		3752	"movq %%mm7, %%mm0 \n\t"
		3753	// use mm0 mask copy to merge a & b
		3754	"pand %%mm0, %%mm2 \n\t"
		3755	// use mm7 mask to merge pa & pb
		3756	"pand %%mm7, %%mm5 \n\t"
		3757	"pandn %%mm1, %%mm0 \n\t"
		3758	"pandn %%mm4, %%mm7 \n\t"
		3759	"paddw %%mm2, %%mm0 \n\t"
		3760	"paddw %%mm5, %%mm7 \n\t"
		3761	// test ((pa <= pb)? pa:pb) <= pc
		3762	"pcmpgtw %%mm6, %%mm7 \n\t" // pab > pc?
		3763	"pand %%mm7, %%mm3 \n\t"
		3764	"pandn %%mm0, %%mm7 \n\t"
		3765	"paddw %%mm3, %%mm7 \n\t"
		3766	"pxor %%mm1, %%mm1 \n\t"
		3767	"packuswb %%mm7, %%mm1 \n\t"
		3768	// step ecx to next set of 8 bytes and repeat loop til done
		3769	"addl $8, %%ecx \n\t"
		3770	"pand _ActiveMaskEnd, %%mm1 \n\t"
		3771	"paddb -8(%%edi,%%ecx,), %%mm1 \n\t" // add Paeth predictor with
		3772	// Raw(x)
		3773
		3774	"cmpl _MMXLength, %%ecx \n\t"
		3775	"pxor %%mm0, %%mm0 \n\t" // pxor does not affect flags
		3776	"movq %%mm1, -8(%%edi,%%ecx,) \n\t" // write back updated value
		3777	// mm1 will be used as Raw(x-bpp) next loop
		3778	// mm3 ready to be used as Prior(x-bpp) next loop
		3779	"jb paeth_3lp \n\t"
		3780
		3781	: "=S" (dummy_value_S), // output regs (dummy)
		3782	"=D" (dummy_value_D)
		3783
		3784	: "0" (prev_row), // esi // input regs
		3785	"1" (row) // edi
		3786
		3787	: "%ecx" // clobber list
		3788	#if 0 /* %mm0, ..., %mm7 not supported by gcc 2.7.2.3 or egcs 1.1 */
		3789	, "%mm0", "%mm1", "%mm2", "%mm3"
		3790	, "%mm4", "%mm5", "%mm6", "%mm7"
		3791	#endif
		3792	);
		3793	}
		3794	break; // end 3 bpp
		3795
		3796	case 6:
		3797	//case 7: // GRR BOGUS
		3798	//case 5: // GRR BOGUS
		3799	{
		3800	_ActiveMask.use = 0x00000000ffffffffLL;
		3801	_ActiveMask2.use = 0xffffffff00000000LL;
		3802	_ShiftBpp.use = bpp << 3; // == bpp * 8
		3803	_ShiftRem.use = 64 - _ShiftBpp.use;
		3804
		3805	__asm__ __volatile__ (
		3806	"movl _dif, %%ecx \n\t"
		3807	// preload "movl row, %%edi \n\t"
		3808	// preload "movl prev_row, %%esi \n\t"
		3809	// prime the pump: load the first Raw(x-bpp) data set
		3810	"movq -8(%%edi,%%ecx,), %%mm1 \n\t"
		3811	"pxor %%mm0, %%mm0 \n\t"
		3812
		3813	"paeth_6lp: \n\t"
		3814	// must shift to position Raw(x-bpp) data
		3815	"psrlq _ShiftRem, %%mm1 \n\t"
		3816	// do first set of 4 bytes
		3817	"movq -8(%%esi,%%ecx,), %%mm3 \n\t" // read c=Prior(x-bpp) bytes
		3818	"punpcklbw %%mm0, %%mm1 \n\t" // unpack Low bytes of a
		3819	"movq (%%esi,%%ecx,), %%mm2 \n\t" // load b=Prior(x)
		3820	"punpcklbw %%mm0, %%mm2 \n\t" // unpack Low bytes of b
		3821	// must shift to position Prior(x-bpp) data
		3822	"psrlq _ShiftRem, %%mm3 \n\t"
		3823	// pav = p - a = (a + b - c) - a = b - c
		3824	"movq %%mm2, %%mm4 \n\t"
		3825	"punpcklbw %%mm0, %%mm3 \n\t" // unpack Low bytes of c
		3826	// pbv = p - b = (a + b - c) - b = a - c
		3827	"movq %%mm1, %%mm5 \n\t"
		3828	"psubw %%mm3, %%mm4 \n\t"
		3829	"pxor %%mm7, %%mm7 \n\t"
		3830	// pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
		3831	"movq %%mm4, %%mm6 \n\t"
		3832	"psubw %%mm3, %%mm5 \n\t"
		3833	// pa = abs(p-a) = abs(pav)
		3834	// pb = abs(p-b) = abs(pbv)
		3835	// pc = abs(p-c) = abs(pcv)
		3836	"pcmpgtw %%mm4, %%mm0 \n\t" // create mask pav bytes < 0
		3837	"paddw %%mm5, %%mm6 \n\t"
		3838	"pand %%mm4, %%mm0 \n\t" // only pav bytes < 0 in mm0
		3839	"pcmpgtw %%mm5, %%mm7 \n\t" // create mask pbv bytes < 0
		3840	"psubw %%mm0, %%mm4 \n\t"
		3841	"pand %%mm5, %%mm7 \n\t" // only pbv bytes < 0 in mm7
		3842	"psubw %%mm0, %%mm4 \n\t"
		3843	"psubw %%mm7, %%mm5 \n\t"
		3844	"pxor %%mm0, %%mm0 \n\t"
		3845	"pcmpgtw %%mm6, %%mm0 \n\t" // create mask pcv bytes < 0
		3846	"pand %%mm6, %%mm0 \n\t" // only pcv bytes < 0 in mm0
		3847	"psubw %%mm7, %%mm5 \n\t"
		3848	"psubw %%mm0, %%mm6 \n\t"
		3849	// test pa <= pb
		3850	"movq %%mm4, %%mm7 \n\t"
		3851	"psubw %%mm0, %%mm6 \n\t"
		3852	"pcmpgtw %%mm5, %%mm7 \n\t" // pa > pb?
		3853	"movq %%mm7, %%mm0 \n\t"
		3854	// use mm7 mask to merge pa & pb
		3855	"pand %%mm7, %%mm5 \n\t"
		3856	// use mm0 mask copy to merge a & b
		3857	"pand %%mm0, %%mm2 \n\t"
		3858	"pandn %%mm4, %%mm7 \n\t"
		3859	"pandn %%mm1, %%mm0 \n\t"
		3860	"paddw %%mm5, %%mm7 \n\t"
		3861	"paddw %%mm2, %%mm0 \n\t"
		3862	// test ((pa <= pb)? pa:pb) <= pc
		3863	"pcmpgtw %%mm6, %%mm7 \n\t" // pab > pc?
		3864	"pxor %%mm1, %%mm1 \n\t"
		3865	"pand %%mm7, %%mm3 \n\t"
		3866	"pandn %%mm0, %%mm7 \n\t"
		3867	"paddw %%mm3, %%mm7 \n\t"
		3868	"pxor %%mm0, %%mm0 \n\t"
		3869	"packuswb %%mm1, %%mm7 \n\t"
		3870	"movq -8(%%esi,%%ecx,), %%mm3 \n\t" // load c=Prior(x-bpp)
		3871	"pand _ActiveMask, %%mm7 \n\t"
		3872	"psrlq _ShiftRem, %%mm3 \n\t"
		3873	"movq (%%esi,%%ecx,), %%mm2 \n\t" // load b=Prior(x) step 1
		3874	"paddb (%%edi,%%ecx,), %%mm7 \n\t" // add Paeth predictor and Raw(x)
		3875	"movq %%mm2, %%mm6 \n\t"
		3876	"movq %%mm7, (%%edi,%%ecx,) \n\t" // write back updated value
		3877	"movq -8(%%edi,%%ecx,), %%mm1 \n\t"
		3878	"psllq _ShiftBpp, %%mm6 \n\t"
		3879	"movq %%mm7, %%mm5 \n\t"
		3880	"psrlq _ShiftRem, %%mm1 \n\t"
		3881	"por %%mm6, %%mm3 \n\t"
		3882	"psllq _ShiftBpp, %%mm5 \n\t"
		3883	"punpckhbw %%mm0, %%mm3 \n\t" // unpack High bytes of c
		3884	"por %%mm5, %%mm1 \n\t"
		3885	// do second set of 4 bytes
		3886	"punpckhbw %%mm0, %%mm2 \n\t" // unpack High bytes of b
		3887	"punpckhbw %%mm0, %%mm1 \n\t" // unpack High bytes of a
		3888	// pav = p - a = (a + b - c) - a = b - c
		3889	"movq %%mm2, %%mm4 \n\t"
		3890	// pbv = p - b = (a + b - c) - b = a - c
		3891	"movq %%mm1, %%mm5 \n\t"
		3892	"psubw %%mm3, %%mm4 \n\t"
		3893	"pxor %%mm7, %%mm7 \n\t"
		3894	// pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
		3895	"movq %%mm4, %%mm6 \n\t"
		3896	"psubw %%mm3, %%mm5 \n\t"
		3897	// pa = abs(p-a) = abs(pav)
		3898	// pb = abs(p-b) = abs(pbv)
		3899	// pc = abs(p-c) = abs(pcv)
		3900	"pcmpgtw %%mm4, %%mm0 \n\t" // create mask pav bytes < 0
		3901	"paddw %%mm5, %%mm6 \n\t"
		3902	"pand %%mm4, %%mm0 \n\t" // only pav bytes < 0 in mm0
		3903	"pcmpgtw %%mm5, %%mm7 \n\t" // create mask pbv bytes < 0
		3904	"psubw %%mm0, %%mm4 \n\t"
		3905	"pand %%mm5, %%mm7 \n\t" // only pbv bytes < 0 in mm7
		3906	"psubw %%mm0, %%mm4 \n\t"
		3907	"psubw %%mm7, %%mm5 \n\t"
		3908	"pxor %%mm0, %%mm0 \n\t"
		3909	"pcmpgtw %%mm6, %%mm0 \n\t" // create mask pcv bytes < 0
		3910	"pand %%mm6, %%mm0 \n\t" // only pcv bytes < 0 in mm0
		3911	"psubw %%mm7, %%mm5 \n\t"
		3912	"psubw %%mm0, %%mm6 \n\t"
		3913	// test pa <= pb
		3914	"movq %%mm4, %%mm7 \n\t"
		3915	"psubw %%mm0, %%mm6 \n\t"
		3916	"pcmpgtw %%mm5, %%mm7 \n\t" // pa > pb?
		3917	"movq %%mm7, %%mm0 \n\t"
		3918	// use mm7 mask to merge pa & pb
		3919	"pand %%mm7, %%mm5 \n\t"
		3920	// use mm0 mask copy to merge a & b
		3921	"pand %%mm0, %%mm2 \n\t"
		3922	"pandn %%mm4, %%mm7 \n\t"
		3923	"pandn %%mm1, %%mm0 \n\t"
		3924	"paddw %%mm5, %%mm7 \n\t"
		3925	"paddw %%mm2, %%mm0 \n\t"
		3926	// test ((pa <= pb)? pa:pb) <= pc
		3927	"pcmpgtw %%mm6, %%mm7 \n\t" // pab > pc?
		3928	"pxor %%mm1, %%mm1 \n\t"
		3929	"pand %%mm7, %%mm3 \n\t"
		3930	"pandn %%mm0, %%mm7 \n\t"
		3931	"pxor %%mm1, %%mm1 \n\t"
		3932	"paddw %%mm3, %%mm7 \n\t"
		3933	"pxor %%mm0, %%mm0 \n\t"
		3934	// step ecx to next set of 8 bytes and repeat loop til done
		3935	"addl $8, %%ecx \n\t"
		3936	"packuswb %%mm7, %%mm1 \n\t"
		3937	"paddb -8(%%edi,%%ecx,), %%mm1 \n\t" // add Paeth predictor with Raw(x)
		3938	"cmpl _MMXLength, %%ecx \n\t"
		3939	"movq %%mm1, -8(%%edi,%%ecx,) \n\t" // write back updated value
		3940	// mm1 will be used as Raw(x-bpp) next loop
		3941	"jb paeth_6lp \n\t"
		3942
		3943	: "=S" (dummy_value_S), // output regs (dummy)
		3944	"=D" (dummy_value_D)
		3945
		3946	: "0" (prev_row), // esi // input regs
		3947	"1" (row) // edi
		3948
		3949	: "%ecx" // clobber list
		3950	#if 0 /* %mm0, ..., %mm7 not supported by gcc 2.7.2.3 or egcs 1.1 */
		3951	, "%mm0", "%mm1", "%mm2", "%mm3"
		3952	, "%mm4", "%mm5", "%mm6", "%mm7"
		3953	#endif
		3954	);
		3955	}
		3956	break; // end 6 bpp
		3957
		3958	case 4:
		3959	{
		3960	_ActiveMask.use = 0x00000000ffffffffLL;
		3961
		3962	__asm__ __volatile__ (
		3963	"movl _dif, %%ecx \n\t"
		3964	// preload "movl row, %%edi \n\t"
		3965	// preload "movl prev_row, %%esi \n\t"
		3966	"pxor %%mm0, %%mm0 \n\t"
		3967	// prime the pump: load the first Raw(x-bpp) data set
		3968	"movq -8(%%edi,%%ecx,), %%mm1 \n\t" // only time should need to read
		3969	// a=Raw(x-bpp) bytes
		3970	"paeth_4lp: \n\t"
		3971	// do first set of 4 bytes
		3972	"movq -8(%%esi,%%ecx,), %%mm3 \n\t" // read c=Prior(x-bpp) bytes
		3973	"punpckhbw %%mm0, %%mm1 \n\t" // unpack High bytes of a
		3974	"movq (%%esi,%%ecx,), %%mm2 \n\t" // load b=Prior(x)
		3975	"punpcklbw %%mm0, %%mm2 \n\t" // unpack Low bytes of b
		3976	// pav = p - a = (a + b - c) - a = b - c
		3977	"movq %%mm2, %%mm4 \n\t"
		3978	"punpckhbw %%mm0, %%mm3 \n\t" // unpack High bytes of c
		3979	// pbv = p - b = (a + b - c) - b = a - c
		3980	"movq %%mm1, %%mm5 \n\t"
		3981	"psubw %%mm3, %%mm4 \n\t"
		3982	"pxor %%mm7, %%mm7 \n\t"
		3983	// pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
		3984	"movq %%mm4, %%mm6 \n\t"
		3985	"psubw %%mm3, %%mm5 \n\t"
		3986	// pa = abs(p-a) = abs(pav)
		3987	// pb = abs(p-b) = abs(pbv)
		3988	// pc = abs(p-c) = abs(pcv)
		3989	"pcmpgtw %%mm4, %%mm0 \n\t" // create mask pav bytes < 0
		3990	"paddw %%mm5, %%mm6 \n\t"
		3991	"pand %%mm4, %%mm0 \n\t" // only pav bytes < 0 in mm0
		3992	"pcmpgtw %%mm5, %%mm7 \n\t" // create mask pbv bytes < 0
		3993	"psubw %%mm0, %%mm4 \n\t"
		3994	"pand %%mm5, %%mm7 \n\t" // only pbv bytes < 0 in mm7
		3995	"psubw %%mm0, %%mm4 \n\t"
		3996	"psubw %%mm7, %%mm5 \n\t"
		3997	"pxor %%mm0, %%mm0 \n\t"
		3998	"pcmpgtw %%mm6, %%mm0 \n\t" // create mask pcv bytes < 0
		3999	"pand %%mm6, %%mm0 \n\t" // only pcv bytes < 0 in mm0
		4000	"psubw %%mm7, %%mm5 \n\t"
		4001	"psubw %%mm0, %%mm6 \n\t"
		4002	// test pa <= pb
		4003	"movq %%mm4, %%mm7 \n\t"
		4004	"psubw %%mm0, %%mm6 \n\t"
		4005	"pcmpgtw %%mm5, %%mm7 \n\t" // pa > pb?
		4006	"movq %%mm7, %%mm0 \n\t"
		4007	// use mm7 mask to merge pa & pb
		4008	"pand %%mm7, %%mm5 \n\t"
		4009	// use mm0 mask copy to merge a & b
		4010	"pand %%mm0, %%mm2 \n\t"
		4011	"pandn %%mm4, %%mm7 \n\t"
		4012	"pandn %%mm1, %%mm0 \n\t"
		4013	"paddw %%mm5, %%mm7 \n\t"
		4014	"paddw %%mm2, %%mm0 \n\t"
		4015	// test ((pa <= pb)? pa:pb) <= pc
		4016	"pcmpgtw %%mm6, %%mm7 \n\t" // pab > pc?
		4017	"pxor %%mm1, %%mm1 \n\t"
		4018	"pand %%mm7, %%mm3 \n\t"
		4019	"pandn %%mm0, %%mm7 \n\t"
		4020	"paddw %%mm3, %%mm7 \n\t"
		4021	"pxor %%mm0, %%mm0 \n\t"
		4022	"packuswb %%mm1, %%mm7 \n\t"
		4023	"movq (%%esi,%%ecx,), %%mm3 \n\t" // load c=Prior(x-bpp)
		4024	"pand _ActiveMask, %%mm7 \n\t"
		4025	"movq %%mm3, %%mm2 \n\t" // load b=Prior(x) step 1
		4026	"paddb (%%edi,%%ecx,), %%mm7 \n\t" // add Paeth predictor with Raw(x)
		4027	"punpcklbw %%mm0, %%mm3 \n\t" // unpack Low bytes of c
		4028	"movq %%mm7, (%%edi,%%ecx,) \n\t" // write back updated value
		4029	"movq %%mm7, %%mm1 \n\t" // now mm1 will be used as Raw(x-bpp)
		4030	// do second set of 4 bytes
		4031	"punpckhbw %%mm0, %%mm2 \n\t" // unpack High bytes of b
		4032	"punpcklbw %%mm0, %%mm1 \n\t" // unpack Low bytes of a
		4033	// pav = p - a = (a + b - c) - a = b - c
		4034	"movq %%mm2, %%mm4 \n\t"
		4035	// pbv = p - b = (a + b - c) - b = a - c
		4036	"movq %%mm1, %%mm5 \n\t"
		4037	"psubw %%mm3, %%mm4 \n\t"
		4038	"pxor %%mm7, %%mm7 \n\t"
		4039	// pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
		4040	"movq %%mm4, %%mm6 \n\t"
		4041	"psubw %%mm3, %%mm5 \n\t"
		4042	// pa = abs(p-a) = abs(pav)
		4043	// pb = abs(p-b) = abs(pbv)
		4044	// pc = abs(p-c) = abs(pcv)
		4045	"pcmpgtw %%mm4, %%mm0 \n\t" // create mask pav bytes < 0
		4046	"paddw %%mm5, %%mm6 \n\t"
		4047	"pand %%mm4, %%mm0 \n\t" // only pav bytes < 0 in mm0
		4048	"pcmpgtw %%mm5, %%mm7 \n\t" // create mask pbv bytes < 0
		4049	"psubw %%mm0, %%mm4 \n\t"
		4050	"pand %%mm5, %%mm7 \n\t" // only pbv bytes < 0 in mm7
		4051	"psubw %%mm0, %%mm4 \n\t"
		4052	"psubw %%mm7, %%mm5 \n\t"
		4053	"pxor %%mm0, %%mm0 \n\t"
		4054	"pcmpgtw %%mm6, %%mm0 \n\t" // create mask pcv bytes < 0
		4055	"pand %%mm6, %%mm0 \n\t" // only pcv bytes < 0 in mm0
		4056	"psubw %%mm7, %%mm5 \n\t"
		4057	"psubw %%mm0, %%mm6 \n\t"
		4058	// test pa <= pb
		4059	"movq %%mm4, %%mm7 \n\t"
		4060	"psubw %%mm0, %%mm6 \n\t"
		4061	"pcmpgtw %%mm5, %%mm7 \n\t" // pa > pb?
		4062	"movq %%mm7, %%mm0 \n\t"
		4063	// use mm7 mask to merge pa & pb
		4064	"pand %%mm7, %%mm5 \n\t"
		4065	// use mm0 mask copy to merge a & b
		4066	"pand %%mm0, %%mm2 \n\t"
		4067	"pandn %%mm4, %%mm7 \n\t"
		4068	"pandn %%mm1, %%mm0 \n\t"
		4069	"paddw %%mm5, %%mm7 \n\t"
		4070	"paddw %%mm2, %%mm0 \n\t"
		4071	// test ((pa <= pb)? pa:pb) <= pc
		4072	"pcmpgtw %%mm6, %%mm7 \n\t" // pab > pc?
		4073	"pxor %%mm1, %%mm1 \n\t"
		4074	"pand %%mm7, %%mm3 \n\t"
		4075	"pandn %%mm0, %%mm7 \n\t"
		4076	"pxor %%mm1, %%mm1 \n\t"
		4077	"paddw %%mm3, %%mm7 \n\t"
		4078	"pxor %%mm0, %%mm0 \n\t"
		4079	// step ecx to next set of 8 bytes and repeat loop til done
		4080	"addl $8, %%ecx \n\t"
		4081	"packuswb %%mm7, %%mm1 \n\t"
		4082	"paddb -8(%%edi,%%ecx,), %%mm1 \n\t" // add predictor with Raw(x)
		4083	"cmpl _MMXLength, %%ecx \n\t"
		4084	"movq %%mm1, -8(%%edi,%%ecx,) \n\t" // write back updated value
		4085	// mm1 will be used as Raw(x-bpp) next loop
		4086	"jb paeth_4lp \n\t"
		4087
		4088	: "=S" (dummy_value_S), // output regs (dummy)
		4089	"=D" (dummy_value_D)
		4090
		4091	: "0" (prev_row), // esi // input regs
		4092	"1" (row) // edi
		4093
		4094	: "%ecx" // clobber list
		4095	#if 0 /* %mm0, ..., %mm7 not supported by gcc 2.7.2.3 or egcs 1.1 */
		4096	, "%mm0", "%mm1", "%mm2", "%mm3"
		4097	, "%mm4", "%mm5", "%mm6", "%mm7"
		4098	#endif
		4099	);
		4100	}
		4101	break; // end 4 bpp
		4102
		4103	case 8: // bpp == 8
		4104	{
		4105	_ActiveMask.use = 0x00000000ffffffffLL;
		4106
		4107	__asm__ __volatile__ (
		4108	"movl _dif, %%ecx \n\t"
		4109	// preload "movl row, %%edi \n\t"
		4110	// preload "movl prev_row, %%esi \n\t"
		4111	"pxor %%mm0, %%mm0 \n\t"
		4112	// prime the pump: load the first Raw(x-bpp) data set
		4113	"movq -8(%%edi,%%ecx,), %%mm1 \n\t" // only time should need to read
		4114	// a=Raw(x-bpp) bytes
		4115	"paeth_8lp: \n\t"
		4116	// do first set of 4 bytes
		4117	"movq -8(%%esi,%%ecx,), %%mm3 \n\t" // read c=Prior(x-bpp) bytes
		4118	"punpcklbw %%mm0, %%mm1 \n\t" // unpack Low bytes of a
		4119	"movq (%%esi,%%ecx,), %%mm2 \n\t" // load b=Prior(x)
		4120	"punpcklbw %%mm0, %%mm2 \n\t" // unpack Low bytes of b
		4121	// pav = p - a = (a + b - c) - a = b - c
		4122	"movq %%mm2, %%mm4 \n\t"
		4123	"punpcklbw %%mm0, %%mm3 \n\t" // unpack Low bytes of c
		4124	// pbv = p - b = (a + b - c) - b = a - c
		4125	"movq %%mm1, %%mm5 \n\t"
		4126	"psubw %%mm3, %%mm4 \n\t"
		4127	"pxor %%mm7, %%mm7 \n\t"
		4128	// pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
		4129	"movq %%mm4, %%mm6 \n\t"
		4130	"psubw %%mm3, %%mm5 \n\t"
		4131	// pa = abs(p-a) = abs(pav)
		4132	// pb = abs(p-b) = abs(pbv)
		4133	// pc = abs(p-c) = abs(pcv)
		4134	"pcmpgtw %%mm4, %%mm0 \n\t" // create mask pav bytes < 0
		4135	"paddw %%mm5, %%mm6 \n\t"
		4136	"pand %%mm4, %%mm0 \n\t" // only pav bytes < 0 in mm0
		4137	"pcmpgtw %%mm5, %%mm7 \n\t" // create mask pbv bytes < 0
		4138	"psubw %%mm0, %%mm4 \n\t"
		4139	"pand %%mm5, %%mm7 \n\t" // only pbv bytes < 0 in mm7
		4140	"psubw %%mm0, %%mm4 \n\t"
		4141	"psubw %%mm7, %%mm5 \n\t"
		4142	"pxor %%mm0, %%mm0 \n\t"
		4143	"pcmpgtw %%mm6, %%mm0 \n\t" // create mask pcv bytes < 0
		4144	"pand %%mm6, %%mm0 \n\t" // only pcv bytes < 0 in mm0
		4145	"psubw %%mm7, %%mm5 \n\t"
		4146	"psubw %%mm0, %%mm6 \n\t"
		4147	// test pa <= pb
		4148	"movq %%mm4, %%mm7 \n\t"
		4149	"psubw %%mm0, %%mm6 \n\t"
		4150	"pcmpgtw %%mm5, %%mm7 \n\t" // pa > pb?
		4151	"movq %%mm7, %%mm0 \n\t"
		4152	// use mm7 mask to merge pa & pb
		4153	"pand %%mm7, %%mm5 \n\t"
		4154	// use mm0 mask copy to merge a & b
		4155	"pand %%mm0, %%mm2 \n\t"
		4156	"pandn %%mm4, %%mm7 \n\t"
		4157	"pandn %%mm1, %%mm0 \n\t"
		4158	"paddw %%mm5, %%mm7 \n\t"
		4159	"paddw %%mm2, %%mm0 \n\t"
		4160	// test ((pa <= pb)? pa:pb) <= pc
		4161	"pcmpgtw %%mm6, %%mm7 \n\t" // pab > pc?
		4162	"pxor %%mm1, %%mm1 \n\t"
		4163	"pand %%mm7, %%mm3 \n\t"
		4164	"pandn %%mm0, %%mm7 \n\t"
		4165	"paddw %%mm3, %%mm7 \n\t"
		4166	"pxor %%mm0, %%mm0 \n\t"
		4167	"packuswb %%mm1, %%mm7 \n\t"
		4168	"movq -8(%%esi,%%ecx,), %%mm3 \n\t" // read c=Prior(x-bpp) bytes
		4169	"pand _ActiveMask, %%mm7 \n\t"
		4170	"movq (%%esi,%%ecx,), %%mm2 \n\t" // load b=Prior(x)
		4171	"paddb (%%edi,%%ecx,), %%mm7 \n\t" // add Paeth predictor with Raw(x)
		4172	"punpckhbw %%mm0, %%mm3 \n\t" // unpack High bytes of c
		4173	"movq %%mm7, (%%edi,%%ecx,) \n\t" // write back updated value
		4174	"movq -8(%%edi,%%ecx,), %%mm1 \n\t" // read a=Raw(x-bpp) bytes
		4175
		4176	// do second set of 4 bytes
		4177	"punpckhbw %%mm0, %%mm2 \n\t" // unpack High bytes of b
		4178	"punpckhbw %%mm0, %%mm1 \n\t" // unpack High bytes of a
		4179	// pav = p - a = (a + b - c) - a = b - c
		4180	"movq %%mm2, %%mm4 \n\t"
		4181	// pbv = p - b = (a + b - c) - b = a - c
		4182	"movq %%mm1, %%mm5 \n\t"
		4183	"psubw %%mm3, %%mm4 \n\t"
		4184	"pxor %%mm7, %%mm7 \n\t"
		4185	// pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
		4186	"movq %%mm4, %%mm6 \n\t"
		4187	"psubw %%mm3, %%mm5 \n\t"
		4188	// pa = abs(p-a) = abs(pav)
		4189	// pb = abs(p-b) = abs(pbv)
		4190	// pc = abs(p-c) = abs(pcv)
		4191	"pcmpgtw %%mm4, %%mm0 \n\t" // create mask pav bytes < 0
		4192	"paddw %%mm5, %%mm6 \n\t"
		4193	"pand %%mm4, %%mm0 \n\t" // only pav bytes < 0 in mm0
		4194	"pcmpgtw %%mm5, %%mm7 \n\t" // create mask pbv bytes < 0
		4195	"psubw %%mm0, %%mm4 \n\t"
		4196	"pand %%mm5, %%mm7 \n\t" // only pbv bytes < 0 in mm7
		4197	"psubw %%mm0, %%mm4 \n\t"
		4198	"psubw %%mm7, %%mm5 \n\t"
		4199	"pxor %%mm0, %%mm0 \n\t"
		4200	"pcmpgtw %%mm6, %%mm0 \n\t" // create mask pcv bytes < 0
		4201	"pand %%mm6, %%mm0 \n\t" // only pcv bytes < 0 in mm0
		4202	"psubw %%mm7, %%mm5 \n\t"
		4203	"psubw %%mm0, %%mm6 \n\t"
		4204	// test pa <= pb
		4205	"movq %%mm4, %%mm7 \n\t"
		4206	"psubw %%mm0, %%mm6 \n\t"
		4207	"pcmpgtw %%mm5, %%mm7 \n\t" // pa > pb?
		4208	"movq %%mm7, %%mm0 \n\t"
		4209	// use mm7 mask to merge pa & pb
		4210	"pand %%mm7, %%mm5 \n\t"
		4211	// use mm0 mask copy to merge a & b
		4212	"pand %%mm0, %%mm2 \n\t"
		4213	"pandn %%mm4, %%mm7 \n\t"
		4214	"pandn %%mm1, %%mm0 \n\t"
		4215	"paddw %%mm5, %%mm7 \n\t"
		4216	"paddw %%mm2, %%mm0 \n\t"
		4217	// test ((pa <= pb)? pa:pb) <= pc
		4218	"pcmpgtw %%mm6, %%mm7 \n\t" // pab > pc?
		4219	"pxor %%mm1, %%mm1 \n\t"
		4220	"pand %%mm7, %%mm3 \n\t"
		4221	"pandn %%mm0, %%mm7 \n\t"
		4222	"pxor %%mm1, %%mm1 \n\t"
		4223	"paddw %%mm3, %%mm7 \n\t"
		4224	"pxor %%mm0, %%mm0 \n\t"
		4225	// step ecx to next set of 8 bytes and repeat loop til done
		4226	"addl $8, %%ecx \n\t"
		4227	"packuswb %%mm7, %%mm1 \n\t"
		4228	"paddb -8(%%edi,%%ecx,), %%mm1 \n\t" // add Paeth predictor with Raw(x)
		4229	"cmpl _MMXLength, %%ecx \n\t"
		4230	"movq %%mm1, -8(%%edi,%%ecx,) \n\t" // write back updated value
		4231	// mm1 will be used as Raw(x-bpp) next loop
		4232	"jb paeth_8lp \n\t"
		4233
		4234	: "=S" (dummy_value_S), // output regs (dummy)
		4235	"=D" (dummy_value_D)
		4236
		4237	: "0" (prev_row), // esi // input regs
		4238	"1" (row) // edi
		4239
		4240	: "%ecx" // clobber list
		4241	#if 0 /* %mm0, ..., %mm7 not supported by gcc 2.7.2.3 or egcs 1.1 */
		4242	, "%mm0", "%mm1", "%mm2", "%mm3"
		4243	, "%mm4", "%mm5", "%mm6", "%mm7"
		4244	#endif
		4245	);
		4246	}
		4247	break; // end 8 bpp
		4248
		4249	case 1: // bpp = 1
		4250	case 2: // bpp = 2
		4251	default: // bpp > 8
		4252	{
		4253	__asm__ __volatile__ (
		4254	#ifdef __PIC__
		4255	"pushl %%ebx \n\t" // save Global Offset Table index
		4256	#endif
		4257	"movl _dif, %%ebx \n\t"
		4258	"cmpl _FullLength, %%ebx \n\t"
		4259	"jnb paeth_dend \n\t"
		4260
		4261	// preload "movl row, %%edi \n\t"
		4262	// preload "movl prev_row, %%esi \n\t"
		4263	// do Paeth decode for remaining bytes
		4264	"movl %%ebx, %%edx \n\t"
		4265	// preload "subl bpp, %%edx \n\t" // (bpp is preloaded into ecx)
		4266	"subl %%ecx, %%edx \n\t" // edx = ebx - bpp
		4267	"xorl %%ecx, %%ecx \n\t" // zero ecx before using cl & cx
		4268
		4269	"paeth_dlp: \n\t"
		4270	"xorl %%eax, %%eax \n\t"
		4271	// pav = p - a = (a + b - c) - a = b - c
		4272	"movb (%%esi,%%ebx,), %%al \n\t" // load Prior(x) into al
		4273	"movb (%%esi,%%edx,), %%cl \n\t" // load Prior(x-bpp) into cl
		4274	"subl %%ecx, %%eax \n\t" // subtract Prior(x-bpp)
		4275	"movl %%eax, _patemp \n\t" // Save pav for later use
		4276	"xorl %%eax, %%eax \n\t"
		4277	// pbv = p - b = (a + b - c) - b = a - c
		4278	"movb (%%edi,%%edx,), %%al \n\t" // load Raw(x-bpp) into al
		4279	"subl %%ecx, %%eax \n\t" // subtract Prior(x-bpp)
		4280	"movl %%eax, %%ecx \n\t"
		4281	// pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
		4282	"addl _patemp, %%eax \n\t" // pcv = pav + pbv
		4283	// pc = abs(pcv)
		4284	"testl $0x80000000, %%eax \n\t"
		4285	"jz paeth_dpca \n\t"
		4286	"negl %%eax \n\t" // reverse sign of neg values
		4287
		4288	"paeth_dpca: \n\t"
		4289	"movl %%eax, _pctemp \n\t" // save pc for later use
		4290	// pb = abs(pbv)
		4291	"testl $0x80000000, %%ecx \n\t"
		4292	"jz paeth_dpba \n\t"
		4293	"negl %%ecx \n\t" // reverse sign of neg values
		4294
		4295	"paeth_dpba: \n\t"
		4296	"movl %%ecx, _pbtemp \n\t" // save pb for later use
		4297	// pa = abs(pav)
		4298	"movl _patemp, %%eax \n\t"
		4299	"testl $0x80000000, %%eax \n\t"
		4300	"jz paeth_dpaa \n\t"
		4301	"negl %%eax \n\t" // reverse sign of neg values
		4302
		4303	"paeth_dpaa: \n\t"
		4304	"movl %%eax, _patemp \n\t" // save pa for later use
		4305	// test if pa <= pb
		4306	"cmpl %%ecx, %%eax \n\t"
		4307	"jna paeth_dabb \n\t"
		4308	// pa > pb; now test if pb <= pc
		4309	"cmpl _pctemp, %%ecx \n\t"
		4310	"jna paeth_dbbc \n\t"
		4311	// pb > pc; Raw(x) = Paeth(x) + Prior(x-bpp)
		4312	"movb (%%esi,%%edx,), %%cl \n\t" // load Prior(x-bpp) into cl
		4313	"jmp paeth_dpaeth \n\t"
		4314
		4315	"paeth_dbbc: \n\t"
		4316	// pb <= pc; Raw(x) = Paeth(x) + Prior(x)
		4317	"movb (%%esi,%%ebx,), %%cl \n\t" // load Prior(x) into cl
		4318	"jmp paeth_dpaeth \n\t"
		4319
		4320	"paeth_dabb: \n\t"
		4321	// pa <= pb; now test if pa <= pc
		4322	"cmpl _pctemp, %%eax \n\t"
		4323	"jna paeth_dabc \n\t"
		4324	// pa > pc; Raw(x) = Paeth(x) + Prior(x-bpp)
		4325	"movb (%%esi,%%edx,), %%cl \n\t" // load Prior(x-bpp) into cl
		4326	"jmp paeth_dpaeth \n\t"
		4327
		4328	"paeth_dabc: \n\t"
		4329	// pa <= pc; Raw(x) = Paeth(x) + Raw(x-bpp)
		4330	"movb (%%edi,%%edx,), %%cl \n\t" // load Raw(x-bpp) into cl
		4331
		4332	"paeth_dpaeth: \n\t"
		4333	"incl %%ebx \n\t"
		4334	"incl %%edx \n\t"
		4335	// Raw(x) = (Paeth(x) + Paeth_Predictor( a, b, c )) mod 256
		4336	"addb %%cl, -1(%%edi,%%ebx,) \n\t"
		4337	"cmpl _FullLength, %%ebx \n\t"
		4338	"jb paeth_dlp \n\t"
		4339
		4340	"paeth_dend: \n\t"
		4341	#ifdef __PIC__
		4342	"popl %%ebx \n\t" // index to Global Offset Table
		4343	#endif
		4344
		4345	: "=c" (dummy_value_c), // output regs (dummy)
		4346	"=S" (dummy_value_S),
		4347	"=D" (dummy_value_D)
		4348
		4349	: "0" (bpp), // ecx // input regs
		4350	"1" (prev_row), // esi
		4351	"2" (row) // edi
		4352
		4353	: "%eax", "%edx" // clobber list
		4354	#ifndef __PIC__
		4355	, "%ebx"
		4356	#endif
		4357	);
		4358	}
		4359	return; // No need to go further with this one
		4360
		4361	} // end switch (bpp)
		4362
		4363	__asm__ __volatile__ (
		4364	// MMX acceleration complete; now do clean-up
		4365	// check if any remaining bytes left to decode
		4366	#ifdef __PIC__
		4367	"pushl %%ebx \n\t" // save index to Global Offset Table
		4368	#endif
		4369	"movl _MMXLength, %%ebx \n\t"
		4370	"cmpl _FullLength, %%ebx \n\t"
		4371	"jnb paeth_end \n\t"
		4372	//pre "movl row, %%edi \n\t"
		4373	//pre "movl prev_row, %%esi \n\t"
		4374	// do Paeth decode for remaining bytes
		4375	"movl %%ebx, %%edx \n\t"
		4376	//pre "subl bpp, %%edx \n\t" // (bpp is preloaded into ecx)
		4377	"subl %%ecx, %%edx \n\t" // edx = ebx - bpp
		4378	"xorl %%ecx, %%ecx \n\t" // zero ecx before using cl & cx below
		4379
		4380	"paeth_lp2: \n\t"
		4381	"xorl %%eax, %%eax \n\t"
		4382	// pav = p - a = (a + b - c) - a = b - c
		4383	"movb (%%esi,%%ebx,), %%al \n\t" // load Prior(x) into al
		4384	"movb (%%esi,%%edx,), %%cl \n\t" // load Prior(x-bpp) into cl
		4385	"subl %%ecx, %%eax \n\t" // subtract Prior(x-bpp)
		4386	"movl %%eax, _patemp \n\t" // Save pav for later use
		4387	"xorl %%eax, %%eax \n\t"
		4388	// pbv = p - b = (a + b - c) - b = a - c
		4389	"movb (%%edi,%%edx,), %%al \n\t" // load Raw(x-bpp) into al
		4390	"subl %%ecx, %%eax \n\t" // subtract Prior(x-bpp)
		4391	"movl %%eax, %%ecx \n\t"
		4392	// pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
		4393	"addl _patemp, %%eax \n\t" // pcv = pav + pbv
		4394	// pc = abs(pcv)
		4395	"testl $0x80000000, %%eax \n\t"
		4396	"jz paeth_pca2 \n\t"
		4397	"negl %%eax \n\t" // reverse sign of neg values
		4398
		4399	"paeth_pca2: \n\t"
		4400	"movl %%eax, _pctemp \n\t" // save pc for later use
		4401	// pb = abs(pbv)
		4402	"testl $0x80000000, %%ecx \n\t"
		4403	"jz paeth_pba2 \n\t"
		4404	"negl %%ecx \n\t" // reverse sign of neg values
		4405
		4406	"paeth_pba2: \n\t"
		4407	"movl %%ecx, _pbtemp \n\t" // save pb for later use
		4408	// pa = abs(pav)
		4409	"movl _patemp, %%eax \n\t"
		4410	"testl $0x80000000, %%eax \n\t"
		4411	"jz paeth_paa2 \n\t"
		4412	"negl %%eax \n\t" // reverse sign of neg values
		4413
		4414	"paeth_paa2: \n\t"
		4415	"movl %%eax, _patemp \n\t" // save pa for later use
		4416	// test if pa <= pb
		4417	"cmpl %%ecx, %%eax \n\t"
		4418	"jna paeth_abb2 \n\t"
		4419	// pa > pb; now test if pb <= pc
		4420	"cmpl _pctemp, %%ecx \n\t"
		4421	"jna paeth_bbc2 \n\t"
		4422	// pb > pc; Raw(x) = Paeth(x) + Prior(x-bpp)
		4423	"movb (%%esi,%%edx,), %%cl \n\t" // load Prior(x-bpp) into cl
		4424	"jmp paeth_paeth2 \n\t"
		4425
		4426	"paeth_bbc2: \n\t"
		4427	// pb <= pc; Raw(x) = Paeth(x) + Prior(x)
		4428	"movb (%%esi,%%ebx,), %%cl \n\t" // load Prior(x) into cl
		4429	"jmp paeth_paeth2 \n\t"
		4430
		4431	"paeth_abb2: \n\t"
		4432	// pa <= pb; now test if pa <= pc
		4433	"cmpl _pctemp, %%eax \n\t"
		4434	"jna paeth_abc2 \n\t"
		4435	// pa > pc; Raw(x) = Paeth(x) + Prior(x-bpp)
		4436	"movb (%%esi,%%edx,), %%cl \n\t" // load Prior(x-bpp) into cl
		4437	"jmp paeth_paeth2 \n\t"
		4438
		4439	"paeth_abc2: \n\t"
		4440	// pa <= pc; Raw(x) = Paeth(x) + Raw(x-bpp)
		4441	"movb (%%edi,%%edx,), %%cl \n\t" // load Raw(x-bpp) into cl
		4442
		4443	"paeth_paeth2: \n\t"
		4444	"incl %%ebx \n\t"
		4445	"incl %%edx \n\t"
		4446	// Raw(x) = (Paeth(x) + Paeth_Predictor( a, b, c )) mod 256
		4447	"addb %%cl, -1(%%edi,%%ebx,) \n\t"
		4448	"cmpl _FullLength, %%ebx \n\t"
		4449	"jb paeth_lp2 \n\t"
		4450
		4451	"paeth_end: \n\t"
		4452	"EMMS \n\t" // end MMX; prep for poss. FP instrs.
		4453	#ifdef __PIC__
		4454	"popl %%ebx \n\t" // restore index to Global Offset Table
		4455	#endif
		4456
		4457	: "=c" (dummy_value_c), // output regs (dummy)
		4458	"=S" (dummy_value_S),
		4459	"=D" (dummy_value_D)
		4460
		4461	: "0" (bpp), // ecx // input regs
		4462	"1" (prev_row), // esi
		4463	"2" (row) // edi
		4464
		4465	: "%eax", "%edx" // clobber list (no input regs!)
		4466	#ifndef __PIC__
		4467	, "%ebx"
		4468	#endif
		4469	);
		4470
		4471	} /* end png_read_filter_row_mmx_paeth() */
		4472	#endif
		4473
		4474
		4475
		4476
		4477	#ifdef PNG_THREAD_UNSAFE_OK
		4478	//===========================================================================//
		4479	// //
		4480	// P N G _ R E A D _ F I L T E R _ R O W _ M M X _ S U B //
		4481	// //
		4482	//===========================================================================//
		4483
		4484	// Optimized code for PNG Sub filter decoder
		4485
		4486	static void /* PRIVATE */
		4487	png_read_filter_row_mmx_sub(png_row_infop row_info, png_bytep row)
		4488	{
		4489	int bpp;
		4490	int dummy_value_a;
		4491	int dummy_value_D;
		4492
		4493	bpp = (row_info->pixel_depth + 7) >> 3; // calc number of bytes per pixel
		4494	_FullLength = row_info->rowbytes - bpp; // number of bytes to filter
		4495
		4496	__asm__ __volatile__ (
		4497	//pre "movl row, %%edi \n\t"
		4498	"movl %%edi, %%esi \n\t" // lp = row
		4499	//pre "movl bpp, %%eax \n\t"
		4500	"addl %%eax, %%edi \n\t" // rp = row + bpp
		4501	//irr "xorl %%eax, %%eax \n\t"
		4502	// get # of bytes to alignment
		4503	"movl %%edi, _dif \n\t" // take start of row
		4504	"addl $0xf, _dif \n\t" // add 7 + 8 to incr past
		4505	// alignment boundary
		4506	"xorl %%ecx, %%ecx \n\t"
		4507	"andl $0xfffffff8, _dif \n\t" // mask to alignment boundary
		4508	"subl %%edi, _dif \n\t" // subtract from start ==> value
		4509	"jz sub_go \n\t" // ecx at alignment
		4510
		4511	"sub_lp1: \n\t" // fix alignment
		4512	"movb (%%esi,%%ecx,), %%al \n\t"
		4513	"addb %%al, (%%edi,%%ecx,) \n\t"
		4514	"incl %%ecx \n\t"
		4515	"cmpl _dif, %%ecx \n\t"
		4516	"jb sub_lp1 \n\t"
		4517
		4518	"sub_go: \n\t"
		4519	"movl _FullLength, %%eax \n\t"
		4520	"movl %%eax, %%edx \n\t"
		4521	"subl %%ecx, %%edx \n\t" // subtract alignment fix
		4522	"andl $0x00000007, %%edx \n\t" // calc bytes over mult of 8
		4523	"subl %%edx, %%eax \n\t" // drop over bytes from length
		4524	"movl %%eax, _MMXLength \n\t"
		4525
		4526	: "=a" (dummy_value_a), // 0 // output regs (dummy)
		4527	"=D" (dummy_value_D) // 1
		4528
		4529	: "0" (bpp), // eax // input regs
		4530	"1" (row) // edi
		4531
		4532	: "%ebx", "%ecx", "%edx" // clobber list
		4533	, "%esi"
		4534
		4535	#if 0 /* MMX regs (%mm0, etc.) not supported by gcc 2.7.2.3 or egcs 1.1 */
		4536	, "%mm0", "%mm1", "%mm2", "%mm3"
		4537	, "%mm4", "%mm5", "%mm6", "%mm7"
		4538	#endif
		4539	);
		4540
		4541	// now do the math for the rest of the row
		4542	switch (bpp)
		4543	{
		4544	case 3:
		4545	{
		4546	_ActiveMask.use = 0x0000ffffff000000LL;
		4547	_ShiftBpp.use = 24; // == 3 * 8
		4548	_ShiftRem.use = 40; // == 64 - 24
		4549
		4550	__asm__ __volatile__ (
		4551	// preload "movl row, %%edi \n\t"
		4552	"movq _ActiveMask, %%mm7 \n\t" // load _ActiveMask for 2nd
		4553	// active byte group
		4554	"movl %%edi, %%esi \n\t" // lp = row
		4555	// preload "movl bpp, %%eax \n\t"
		4556	"addl %%eax, %%edi \n\t" // rp = row + bpp
		4557	"movq %%mm7, %%mm6 \n\t"
		4558	"movl _dif, %%edx \n\t"
		4559	"psllq _ShiftBpp, %%mm6 \n\t" // move mask in mm6 to cover
		4560	// 3rd active byte group
		4561	// prime the pump: load the first Raw(x-bpp) data set
		4562	"movq -8(%%edi,%%edx,), %%mm1 \n\t"
		4563
		4564	"sub_3lp: \n\t" // shift data for adding first
		4565	"psrlq _ShiftRem, %%mm1 \n\t" // bpp bytes (no need for mask;
		4566	// shift clears inactive bytes)
		4567	// add 1st active group
		4568	"movq (%%edi,%%edx,), %%mm0 \n\t"
		4569	"paddb %%mm1, %%mm0 \n\t"
		4570
		4571	// add 2nd active group
		4572	"movq %%mm0, %%mm1 \n\t" // mov updated Raws to mm1
		4573	"psllq _ShiftBpp, %%mm1 \n\t" // shift data to pos. correctly
		4574	"pand %%mm7, %%mm1 \n\t" // mask to use 2nd active group
		4575	"paddb %%mm1, %%mm0 \n\t"
		4576
		4577	// add 3rd active group
		4578	"movq %%mm0, %%mm1 \n\t" // mov updated Raws to mm1
		4579	"psllq _ShiftBpp, %%mm1 \n\t" // shift data to pos. correctly
		4580	"pand %%mm6, %%mm1 \n\t" // mask to use 3rd active group
		4581	"addl $8, %%edx \n\t"
		4582	"paddb %%mm1, %%mm0 \n\t"
		4583
		4584	"cmpl _MMXLength, %%edx \n\t"
		4585	"movq %%mm0, -8(%%edi,%%edx,) \n\t" // write updated Raws to array
		4586	"movq %%mm0, %%mm1 \n\t" // prep 1st add at top of loop
		4587	"jb sub_3lp \n\t"
		4588
		4589	: "=a" (dummy_value_a), // 0 // output regs (dummy)
		4590	"=D" (dummy_value_D) // 1
		4591
		4592	: "0" (bpp), // eax // input regs
		4593	"1" (row) // edi
		4594
		4595	: "%edx", "%esi" // clobber list
		4596	#if 0 /* MMX regs (%mm0, etc.) not supported by gcc 2.7.2.3 or egcs 1.1 */
		4597	, "%mm0", "%mm1", "%mm6", "%mm7"
		4598	#endif
		4599	);
		4600	}
		4601	break;
		4602
		4603	case 1:
		4604	{
		4605	__asm__ __volatile__ (
		4606	"movl _dif, %%edx \n\t"
		4607	// preload "movl row, %%edi \n\t"
		4608	"cmpl _FullLength, %%edx \n\t"
		4609	"jnb sub_1end \n\t"
		4610	"movl %%edi, %%esi \n\t" // lp = row
		4611	"xorl %%eax, %%eax \n\t"
		4612	// preload "movl bpp, %%eax \n\t"
		4613	"addl %%eax, %%edi \n\t" // rp = row + bpp
		4614
		4615	"sub_1lp: \n\t"
		4616	"movb (%%esi,%%edx,), %%al \n\t"
		4617	"addb %%al, (%%edi,%%edx,) \n\t"
		4618	"incl %%edx \n\t"
		4619	"cmpl _FullLength, %%edx \n\t"
		4620	"jb sub_1lp \n\t"
		4621
		4622	"sub_1end: \n\t"
		4623
		4624	: "=a" (dummy_value_a), // 0 // output regs (dummy)
		4625	"=D" (dummy_value_D) // 1
		4626
		4627	: "0" (bpp), // eax // input regs
		4628	"1" (row) // edi
		4629
		4630	: "%edx", "%esi" // clobber list
		4631	);
		4632	}
		4633	return;
		4634
		4635	case 6:
		4636	case 4:
		4637	//case 7: // GRR BOGUS
		4638	//case 5: // GRR BOGUS
		4639	{
		4640	_ShiftBpp.use = bpp << 3;
		4641	_ShiftRem.use = 64 - _ShiftBpp.use;
		4642
		4643	__asm__ __volatile__ (
		4644	// preload "movl row, %%edi \n\t"
		4645	"movl _dif, %%edx \n\t"
		4646	"movl %%edi, %%esi \n\t" // lp = row
		4647	// preload "movl bpp, %%eax \n\t"
		4648	"addl %%eax, %%edi \n\t" // rp = row + bpp
		4649
		4650	// prime the pump: load the first Raw(x-bpp) data set
		4651	"movq -8(%%edi,%%edx,), %%mm1 \n\t"
		4652
		4653	"sub_4lp: \n\t" // shift data for adding first
		4654	"psrlq _ShiftRem, %%mm1 \n\t" // bpp bytes (no need for mask;
		4655	// shift clears inactive bytes)
		4656	"movq (%%edi,%%edx,), %%mm0 \n\t"
		4657	"paddb %%mm1, %%mm0 \n\t"
		4658
		4659	// add 2nd active group
		4660	"movq %%mm0, %%mm1 \n\t" // mov updated Raws to mm1
		4661	"psllq _ShiftBpp, %%mm1 \n\t" // shift data to pos. correctly
		4662	"addl $8, %%edx \n\t"
		4663	"paddb %%mm1, %%mm0 \n\t"
		4664
		4665	"cmpl _MMXLength, %%edx \n\t"
		4666	"movq %%mm0, -8(%%edi,%%edx,) \n\t"
		4667	"movq %%mm0, %%mm1 \n\t" // prep 1st add at top of loop
		4668	"jb sub_4lp \n\t"
		4669
		4670	: "=a" (dummy_value_a), // 0 // output regs (dummy)
		4671	"=D" (dummy_value_D) // 1
		4672
		4673	: "0" (bpp), // eax // input regs
		4674	"1" (row) // edi
		4675
		4676	: "%edx", "%esi" // clobber list
		4677	#if 0 /* MMX regs (%mm0, etc.) not supported by gcc 2.7.2.3 or egcs 1.1 */
		4678	, "%mm0", "%mm1"
		4679	#endif
		4680	);
		4681	}
		4682	break;
		4683
		4684	case 2:
		4685	{
		4686	_ActiveMask.use = 0x00000000ffff0000LL;
		4687	_ShiftBpp.use = 16; // == 2 * 8
		4688	_ShiftRem.use = 48; // == 64 - 16
		4689
		4690	__asm__ __volatile__ (
		4691	"movq _ActiveMask, %%mm7 \n\t" // load _ActiveMask for 2nd
		4692	// active byte group
		4693	"movl _dif, %%edx \n\t"
		4694	"movq %%mm7, %%mm6 \n\t"
		4695	// preload "movl row, %%edi \n\t"
		4696	"psllq _ShiftBpp, %%mm6 \n\t" // move mask in mm6 to cover
		4697	// 3rd active byte group
		4698	"movl %%edi, %%esi \n\t" // lp = row
		4699	"movq %%mm6, %%mm5 \n\t"
		4700	// preload "movl bpp, %%eax \n\t"
		4701	"addl %%eax, %%edi \n\t" // rp = row + bpp
		4702	"psllq _ShiftBpp, %%mm5 \n\t" // move mask in mm5 to cover
		4703	// 4th active byte group
		4704	// prime the pump: load the first Raw(x-bpp) data set
		4705	"movq -8(%%edi,%%edx,), %%mm1 \n\t"
		4706
		4707	"sub_2lp: \n\t" // shift data for adding first
		4708	"psrlq _ShiftRem, %%mm1 \n\t" // bpp bytes (no need for mask;
		4709	// shift clears inactive bytes)
		4710	// add 1st active group
		4711	"movq (%%edi,%%edx,), %%mm0 \n\t"
		4712	"paddb %%mm1, %%mm0 \n\t"
		4713
		4714	// add 2nd active group
		4715	"movq %%mm0, %%mm1 \n\t" // mov updated Raws to mm1
		4716	"psllq _ShiftBpp, %%mm1 \n\t" // shift data to pos. correctly
		4717	"pand %%mm7, %%mm1 \n\t" // mask to use 2nd active group
		4718	"paddb %%mm1, %%mm0 \n\t"
		4719
		4720	// add 3rd active group
		4721	"movq %%mm0, %%mm1 \n\t" // mov updated Raws to mm1
		4722	"psllq _ShiftBpp, %%mm1 \n\t" // shift data to pos. correctly
		4723	"pand %%mm6, %%mm1 \n\t" // mask to use 3rd active group
		4724	"paddb %%mm1, %%mm0 \n\t"
		4725
		4726	// add 4th active group
		4727	"movq %%mm0, %%mm1 \n\t" // mov updated Raws to mm1
		4728	"psllq _ShiftBpp, %%mm1 \n\t" // shift data to pos. correctly
		4729	"pand %%mm5, %%mm1 \n\t" // mask to use 4th active group
		4730	"addl $8, %%edx \n\t"
		4731	"paddb %%mm1, %%mm0 \n\t"
		4732	"cmpl _MMXLength, %%edx \n\t"
		4733	"movq %%mm0, -8(%%edi,%%edx,) \n\t" // write updated Raws to array
		4734	"movq %%mm0, %%mm1 \n\t" // prep 1st add at top of loop
		4735	"jb sub_2lp \n\t"
		4736
		4737	: "=a" (dummy_value_a), // 0 // output regs (dummy)
		4738	"=D" (dummy_value_D) // 1
		4739
		4740	: "0" (bpp), // eax // input regs
		4741	"1" (row) // edi
		4742
		4743	: "%edx", "%esi" // clobber list
		4744	#if 0 /* MMX regs (%mm0, etc.) not supported by gcc 2.7.2.3 or egcs 1.1 */
		4745	, "%mm0", "%mm1", "%mm5", "%mm6", "%mm7"
		4746	#endif
		4747	);
		4748	}
		4749	break;
		4750
		4751	case 8:
		4752	{
		4753	__asm__ __volatile__ (
		4754	// preload "movl row, %%edi \n\t"
		4755	"movl _dif, %%edx \n\t"
		4756	"movl %%edi, %%esi \n\t" // lp = row
		4757	// preload "movl bpp, %%eax \n\t"
		4758	"addl %%eax, %%edi \n\t" // rp = row + bpp
		4759	"movl _MMXLength, %%ecx \n\t"
		4760
		4761	// prime the pump: load the first Raw(x-bpp) data set
		4762	"movq -8(%%edi,%%edx,), %%mm7 \n\t"
		4763	"andl $0x0000003f, %%ecx \n\t" // calc bytes over mult of 64
		4764
		4765	"sub_8lp: \n\t"
		4766	"movq (%%edi,%%edx,), %%mm0 \n\t" // load Sub(x) for 1st 8 bytes
		4767	"paddb %%mm7, %%mm0 \n\t"
		4768	"movq 8(%%edi,%%edx,), %%mm1 \n\t" // load Sub(x) for 2nd 8 bytes
		4769	"movq %%mm0, (%%edi,%%edx,) \n\t" // write Raw(x) for 1st 8 bytes
		4770
		4771	// Now mm0 will be used as Raw(x-bpp) for the 2nd group of 8 bytes.
		4772	// This will be repeated for each group of 8 bytes with the 8th
		4773	// group being used as the Raw(x-bpp) for the 1st group of the
		4774	// next loop.
		4775
		4776	"paddb %%mm0, %%mm1 \n\t"
		4777	"movq 16(%%edi,%%edx,), %%mm2 \n\t" // load Sub(x) for 3rd 8 bytes
		4778	"movq %%mm1, 8(%%edi,%%edx,) \n\t" // write Raw(x) for 2nd 8 bytes
		4779	"paddb %%mm1, %%mm2 \n\t"
		4780	"movq 24(%%edi,%%edx,), %%mm3 \n\t" // load Sub(x) for 4th 8 bytes
		4781	"movq %%mm2, 16(%%edi,%%edx,) \n\t" // write Raw(x) for 3rd 8 bytes
		4782	"paddb %%mm2, %%mm3 \n\t"
		4783	"movq 32(%%edi,%%edx,), %%mm4 \n\t" // load Sub(x) for 5th 8 bytes
		4784	"movq %%mm3, 24(%%edi,%%edx,) \n\t" // write Raw(x) for 4th 8 bytes
		4785	"paddb %%mm3, %%mm4 \n\t"
		4786	"movq 40(%%edi,%%edx,), %%mm5 \n\t" // load Sub(x) for 6th 8 bytes
		4787	"movq %%mm4, 32(%%edi,%%edx,) \n\t" // write Raw(x) for 5th 8 bytes
		4788	"paddb %%mm4, %%mm5 \n\t"
		4789	"movq 48(%%edi,%%edx,), %%mm6 \n\t" // load Sub(x) for 7th 8 bytes
		4790	"movq %%mm5, 40(%%edi,%%edx,) \n\t" // write Raw(x) for 6th 8 bytes
		4791	"paddb %%mm5, %%mm6 \n\t"
		4792	"movq 56(%%edi,%%edx,), %%mm7 \n\t" // load Sub(x) for 8th 8 bytes
		4793	"movq %%mm6, 48(%%edi,%%edx,) \n\t" // write Raw(x) for 7th 8 bytes
		4794	"addl $64, %%edx \n\t"
		4795	"paddb %%mm6, %%mm7 \n\t"
		4796	"cmpl %%ecx, %%edx \n\t"
		4797	"movq %%mm7, -8(%%edi,%%edx,) \n\t" // write Raw(x) for 8th 8 bytes
		4798	"jb sub_8lp \n\t"
		4799
		4800	"cmpl _MMXLength, %%edx \n\t"
		4801	"jnb sub_8lt8 \n\t"
		4802
		4803	"sub_8lpA: \n\t"
		4804	"movq (%%edi,%%edx,), %%mm0 \n\t"
		4805	"addl $8, %%edx \n\t"
		4806	"paddb %%mm7, %%mm0 \n\t"
		4807	"cmpl _MMXLength, %%edx \n\t"
		4808	"movq %%mm0, -8(%%edi,%%edx,) \n\t" // -8 to offset early addl edx
		4809	"movq %%mm0, %%mm7 \n\t" // move calculated Raw(x) data
		4810	// to mm1 to be new Raw(x-bpp)
		4811	// for next loop
		4812	"jb sub_8lpA \n\t"
		4813
		4814	"sub_8lt8: \n\t"
		4815
		4816	: "=a" (dummy_value_a), // 0 // output regs (dummy)
		4817	"=D" (dummy_value_D) // 1
		4818
		4819	: "0" (bpp), // eax // input regs
		4820	"1" (row) // edi
		4821
		4822	: "%ecx", "%edx", "%esi" // clobber list
		4823	#if 0 /* MMX regs (%mm0, etc.) not supported by gcc 2.7.2.3 or egcs 1.1 */
		4824	, "%mm0", "%mm1", "%mm2", "%mm3", "%mm4", "%mm5", "%mm6", "%mm7"
		4825	#endif
		4826	);
		4827	}
		4828	break;
		4829
		4830	default: // bpp greater than 8 bytes GRR BOGUS
		4831	{
		4832	__asm__ __volatile__ (
		4833	"movl _dif, %%edx \n\t"
		4834	// preload "movl row, %%edi \n\t"
		4835	"movl %%edi, %%esi \n\t" // lp = row
		4836	// preload "movl bpp, %%eax \n\t"
		4837	"addl %%eax, %%edi \n\t" // rp = row + bpp
		4838
		4839	"sub_Alp: \n\t"
		4840	"movq (%%edi,%%edx,), %%mm0 \n\t"
		4841	"movq (%%esi,%%edx,), %%mm1 \n\t"
		4842	"addl $8, %%edx \n\t"
		4843	"paddb %%mm1, %%mm0 \n\t"
		4844	"cmpl _MMXLength, %%edx \n\t"
		4845	"movq %%mm0, -8(%%edi,%%edx,) \n\t" // mov does not affect flags;
		4846	// -8 to offset addl edx
		4847	"jb sub_Alp \n\t"
		4848
		4849	: "=a" (dummy_value_a), // 0 // output regs (dummy)
		4850	"=D" (dummy_value_D) // 1
		4851
		4852	: "0" (bpp), // eax // input regs
		4853	"1" (row) // edi
		4854
		4855	: "%edx", "%esi" // clobber list
		4856	#if 0 /* MMX regs (%mm0, etc.) not supported by gcc 2.7.2.3 or egcs 1.1 */
		4857	, "%mm0", "%mm1"
		4858	#endif
		4859	);
		4860	}
		4861	break;
		4862
		4863	} // end switch (bpp)
		4864
		4865	__asm__ __volatile__ (
		4866	"movl _MMXLength, %%edx \n\t"
		4867	//pre "movl row, %%edi \n\t"
		4868	"cmpl _FullLength, %%edx \n\t"
		4869	"jnb sub_end \n\t"
		4870
		4871	"movl %%edi, %%esi \n\t" // lp = row
		4872	//pre "movl bpp, %%eax \n\t"
		4873	"addl %%eax, %%edi \n\t" // rp = row + bpp
		4874	"xorl %%eax, %%eax \n\t"
		4875
		4876	"sub_lp2: \n\t"
		4877	"movb (%%esi,%%edx,), %%al \n\t"
		4878	"addb %%al, (%%edi,%%edx,) \n\t"
		4879	"incl %%edx \n\t"
		4880	"cmpl _FullLength, %%edx \n\t"
		4881	"jb sub_lp2 \n\t"
		4882
		4883	"sub_end: \n\t"
		4884	"EMMS \n\t" // end MMX instructions
		4885
		4886	: "=a" (dummy_value_a), // 0 // output regs (dummy)
		4887	"=D" (dummy_value_D) // 1
		4888
		4889	: "0" (bpp), // eax // input regs
		4890	"1" (row) // edi
		4891
		4892	: "%edx", "%esi" // clobber list
		4893	);
		4894
		4895	} // end of png_read_filter_row_mmx_sub()
		4896	#endif
		4897
		4898
		4899
		4900
		4901	//===========================================================================//
		4902	// //
		4903	// P N G _ R E A D _ F I L T E R _ R O W _ M M X _ U P //
		4904	// //
		4905	//===========================================================================//
		4906
		4907	// Optimized code for PNG Up filter decoder: adds each byte of prev_row to the
		4908	// matching byte of row, in place (byte adds wrap mod 256), for row_info->rowbytes bytes.
		4909	static void /* PRIVATE */
		4910	png_read_filter_row_mmx_up(png_row_infop row_info, png_bytep row,
		4911	png_bytep prev_row)
		4912	{
		4913	png_uint_32 len; // byte count; handed to the asm in edx
		4914	int dummy_value_d; // fix 'forbidden register 3 (dx) was spilled' error
		4915	int dummy_value_S; // dummy output tied to esi (prev_row input)
		4916	int dummy_value_D; // dummy output tied to edi (row input)
		4917	
		4918	len = row_info->rowbytes; // number of bytes to filter
		4919	
		4920	__asm__ __volatile__ (
		4921	//pre "movl row, %%edi \n\t"
		4922	// get # of bytes to alignment
		4923	#ifdef __PIC__
		4924	"pushl %%ebx \n\t" // PIC builds: save ebx, restored at up_end
		4925	#endif
		4926	"movl %%edi, %%ecx \n\t" // ecx = address of row
		4927	"xorl %%ebx, %%ebx \n\t" // ebx = running byte index into row and prev_row
		4928	"addl $0x7, %%ecx \n\t"
		4929	"xorl %%eax, %%eax \n\t"
		4930	"andl $0xfffffff8, %%ecx \n\t" // round row address up to a multiple of 8
		4931	//pre "movl prev_row, %%esi \n\t"
		4932	"subl %%edi, %%ecx \n\t" // ecx = number of bytes before row is 8-byte aligned
		4933	"jz up_go \n\t" // row already aligned: skip the byte-wise prologue
		4934	
		4935	"up_lp1: \n\t" // fix alignment
		4936	"movb (%%edi,%%ebx,), %%al \n\t" // al = row[ebx]
		4937	"addb (%%esi,%%ebx,), %%al \n\t" // al += prev_row[ebx]
		4938	"incl %%ebx \n\t"
		4939	"cmpl %%ecx, %%ebx \n\t"
		4940	"movb %%al, -1(%%edi,%%ebx,) \n\t" // mov does not affect flags; -1 to
		4941	"jb up_lp1 \n\t" // offset incl ebx
		4942	
		4943	"up_go: \n\t"
		4944	//pre "movl len, %%edx \n\t"
		4945	"movl %%edx, %%ecx \n\t" // ecx = len
		4946	"subl %%ebx, %%edx \n\t" // subtract alignment fix
		4947	"andl $0x0000003f, %%edx \n\t" // calc bytes over mult of 64
		4948	"subl %%edx, %%ecx \n\t" // drop over bytes from length
		4949	
		4950	// unrolled loop - use all MMX registers and interleave to reduce
		4951	// number of branch instructions (loops) and reduce partial stalls
		4952	"up_loop: \n\t" // NOTE(review): body runs before the first bound check (jb below) -- confirm callers guarantee >= 64 bytes remain here
		4953	"movq (%%esi,%%ebx,), %%mm1 \n\t" // even regs hold row data, odd regs hold prev_row data
		4954	"movq (%%edi,%%ebx,), %%mm0 \n\t"
		4955	"movq 8(%%esi,%%ebx,), %%mm3 \n\t"
		4956	"paddb %%mm1, %%mm0 \n\t"
		4957	"movq 8(%%edi,%%ebx,), %%mm2 \n\t"
		4958	"movq %%mm0, (%%edi,%%ebx,) \n\t" // store bytes 0..7 of this 64-byte group
		4959	"paddb %%mm3, %%mm2 \n\t"
		4960	"movq 16(%%esi,%%ebx,), %%mm5 \n\t"
		4961	"movq %%mm2, 8(%%edi,%%ebx,) \n\t" // store bytes 8..15
		4962	"movq 16(%%edi,%%ebx,), %%mm4 \n\t"
		4963	"movq 24(%%esi,%%ebx,), %%mm7 \n\t"
		4964	"paddb %%mm5, %%mm4 \n\t"
		4965	"movq 24(%%edi,%%ebx,), %%mm6 \n\t"
		4966	"movq %%mm4, 16(%%edi,%%ebx,) \n\t" // store bytes 16..23
		4967	"paddb %%mm7, %%mm6 \n\t"
		4968	"movq 32(%%esi,%%ebx,), %%mm1 \n\t"
		4969	"movq %%mm6, 24(%%edi,%%ebx,) \n\t" // store bytes 24..31
		4970	"movq 32(%%edi,%%ebx,), %%mm0 \n\t"
		4971	"movq 40(%%esi,%%ebx,), %%mm3 \n\t"
		4972	"paddb %%mm1, %%mm0 \n\t"
		4973	"movq 40(%%edi,%%ebx,), %%mm2 \n\t"
		4974	"movq %%mm0, 32(%%edi,%%ebx,) \n\t" // store bytes 32..39
		4975	"paddb %%mm3, %%mm2 \n\t"
		4976	"movq 48(%%esi,%%ebx,), %%mm5 \n\t"
		4977	"movq %%mm2, 40(%%edi,%%ebx,) \n\t" // store bytes 40..47
		4978	"movq 48(%%edi,%%ebx,), %%mm4 \n\t"
		4979	"movq 56(%%esi,%%ebx,), %%mm7 \n\t"
		4980	"paddb %%mm5, %%mm4 \n\t"
		4981	"movq 56(%%edi,%%ebx,), %%mm6 \n\t"
		4982	"movq %%mm4, 48(%%edi,%%ebx,) \n\t" // store bytes 48..55
		4983	"addl $64, %%ebx \n\t"
		4984	"paddb %%mm7, %%mm6 \n\t"
		4985	"cmpl %%ecx, %%ebx \n\t"
		4986	"movq %%mm6, -8(%%edi,%%ebx,) \n\t" // (+56)movq does not affect flags;
		4987	"jb up_loop \n\t" // -8 to offset addl ebx
		4988	
		4989	"cmpl $0, %%edx \n\t" // test for bytes over mult of 64
		4990	"jz up_end \n\t"
		4991	
		4992	"cmpl $8, %%edx \n\t" // test for less than 8 bytes
		4993	"jb up_lt8 \n\t" // [added by lcreeve@netins.net]
		4994	
		4995	"addl %%edx, %%ecx \n\t" // ecx = len again (64-byte end + leftover)
		4996	"andl $0x00000007, %%edx \n\t" // calc bytes over mult of 8
		4997	"subl %%edx, %%ecx \n\t" // drop over bytes from length
		4998	"jz up_lt8 \n\t" // taken only when the subtraction left ecx == 0
		4999	
		5000	"up_lpA: \n\t" // use MMX regs to update 8 bytes sim.
		5001	"movq (%%esi,%%ebx,), %%mm1 \n\t"
		5002	"movq (%%edi,%%ebx,), %%mm0 \n\t"
		5003	"addl $8, %%ebx \n\t"
		5004	"paddb %%mm1, %%mm0 \n\t"
		5005	"cmpl %%ecx, %%ebx \n\t"
		5006	"movq %%mm0, -8(%%edi,%%ebx,) \n\t" // movq does not affect flags; -8 to
		5007	"jb up_lpA \n\t" // offset add ebx
		5008	"cmpl $0, %%edx \n\t" // test for bytes over mult of 8
		5009	"jz up_end \n\t"
		5010	
		5011	"up_lt8: \n\t"
		5012	"xorl %%eax, %%eax \n\t"
		5013	"addl %%edx, %%ecx \n\t" // move over byte count into counter
		5014	
		5015	"up_lp2: \n\t" // use x86 regs for remaining bytes
		5016	"movb (%%edi,%%ebx,), %%al \n\t" // al = row[ebx]
		5017	"addb (%%esi,%%ebx,), %%al \n\t" // al += prev_row[ebx]
		5018	"incl %%ebx \n\t"
		5019	"cmpl %%ecx, %%ebx \n\t"
		5020	"movb %%al, -1(%%edi,%%ebx,) \n\t" // mov does not affect flags; -1 to
		5021	"jb up_lp2 \n\t" // offset inc ebx
		5022	
		5023	"up_end: \n\t"
		5024	"EMMS \n\t" // conversion of filtered row complete
		5025	#ifdef __PIC__
		5026	"popl %%ebx \n\t" // matches the pushl at the top of the asm
		5027	#endif
		5028	
		5029	: "=d" (dummy_value_d), // 0 // output regs (dummy)
		5030	"=S" (dummy_value_S), // 1
		5031	"=D" (dummy_value_D) // 2
		5032	
		5033	: "0" (len), // edx // input regs
		5034	"1" (prev_row), // esi
		5035	"2" (row) // edi
		5036	
		5037	: "%eax", "%ebx", "%ecx" // clobber list (no input regs!)
		5038	
		5039	#if 0 /* MMX regs (%mm0, etc.) not supported by gcc 2.7.2.3 or egcs 1.1 */
		5040	, "%mm0", "%mm1", "%mm2", "%mm3"
		5041	, "%mm4", "%mm5", "%mm6", "%mm7"
		5042	#endif
		5043	);
		5044	
		5045	} // end of png_read_filter_row_mmx_up()
		5046
		5047	#endif /* PNG_ASSEMBLER_CODE_SUPPORTED */
		5048
		5049
		5050
		5051
		5052	/*===========================================================================*/
		5053	/* */
		5054	/* P N G _ R E A D _ F I L T E R _ R O W */
		5055	/* */
		5056	/*===========================================================================*/
		5057	/* png_read_filter_row: undo the PNG row filter selected by `filter`, rewriting `row`
		5058	 * in place. `row_info` supplies rowbytes and pixel_depth; `prev_row` is read by the
		5059	 * Up, Avg and Paeth filters. An unknown filter value warns and sets row[0] to 0.
		5060	 * Optimized png_read_filter_row routines: MMX helpers when enabled, else plain C loops. */
		5061	void /* PRIVATE */
		5062	png_read_filter_row(png_structp png_ptr, png_row_infop row_info, png_bytep
		5063	row, png_bytep prev_row, int filter)
		5064	{
		5065	#ifdef PNG_DEBUG
		5066	char filnm[10]; /* filter name for debug output; longest is "Paeth-MMX" (9 chars + NUL) */
		5067	#endif
		5068	
		5069	#if defined(PNG_ASSEMBLER_CODE_SUPPORTED)
		5070	/* GRR: these are superseded by png_ptr->asm_flags: */
		5071	#define UseMMX_sub 1 // GRR: converted 20000730
		5072	#define UseMMX_up 1 // GRR: converted 20000729
		5073	#define UseMMX_avg 1 // GRR: converted 20000828 (+ 16-bit bugfix 20000916)
		5074	#define UseMMX_paeth 1 // GRR: converted 20000828
		5075	
		5076	if (_mmx_supported == 2) {
		5077	/* this should have happened in png_init_mmx_flags() already */
		5078	#if !defined(PNG_1_0_X)
		5079	png_warning(png_ptr, "asm_flags may not have been initialized");
		5080	#endif
		5081	png_mmx_support();
		5082	}
		5083	#endif /* PNG_ASSEMBLER_CODE_SUPPORTED */
		5084	
		5085	#ifdef PNG_DEBUG
		5086	png_debug(1, "in png_read_filter_row (pnggccrd.c)\n");
		5087	switch (filter)
		5088	{
		5089	case 0: sprintf(filnm, "none");
		5090	break;
		5091	case 1: sprintf(filnm, "sub-%s",
		5092	#if defined(PNG_ASSEMBLER_CODE_SUPPORTED) && defined(PNG_THREAD_UNSAFE_OK)
		5093	#if !defined(PNG_1_0_X)
		5094	(png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_FILTER_SUB)? "MMX" :
		5095	#endif
		5096	#endif
		5097	"x86");
		5098	break;
		5099	case 2: sprintf(filnm, "up-%s",
		5100	#ifdef PNG_ASSEMBLER_CODE_SUPPORTED
		5101	#if !defined(PNG_1_0_X)
		5102	(png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_FILTER_UP)? "MMX" :
		5103	#endif
		5104	#endif
		5105	"x86");
		5106	break;
		5107	case 3: sprintf(filnm, "avg-%s",
		5108	#if defined(PNG_ASSEMBLER_CODE_SUPPORTED) && defined(PNG_THREAD_UNSAFE_OK)
		5109	#if !defined(PNG_1_0_X)
		5110	(png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_FILTER_AVG)? "MMX" :
		5111	#endif
		5112	#endif
		5113	"x86");
		5114	break;
		5115	case 4: sprintf(filnm, "Paeth-%s",
		5116	#if defined(PNG_ASSEMBLER_CODE_SUPPORTED) && defined(PNG_THREAD_UNSAFE_OK)
		5117	#if !defined(PNG_1_0_X)
		5118	(png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_FILTER_PAETH)? "MMX":
		5119	#endif
		5120	#endif
		5121	"x86");
		5122	break;
		5123	default: sprintf(filnm, "unknw");
		5124	break;
		5125	}
		5126	png_debug2(0, "row_number=%5ld, %5s, ", png_ptr->row_number, filnm);
		5127	png_debug1(0, "row=0x%08lx, ", (unsigned long)row);
		5128	png_debug2(0, "pixdepth=%2d, bytes=%d, ", (int)row_info->pixel_depth,
		5129	(int)((row_info->pixel_depth + 7) >> 3));
		5130	png_debug1(0,"rowbytes=%8ld\n", row_info->rowbytes);
		5131	#endif /* PNG_DEBUG */
		5132	
		5133	switch (filter)
		5134	{
		5135	case PNG_FILTER_VALUE_NONE:
		5136	break;
		5137	
		5138	case PNG_FILTER_VALUE_SUB:
		5139	#if defined(PNG_ASSEMBLER_CODE_SUPPORTED) && defined(PNG_THREAD_UNSAFE_OK)
		5140	#if !defined(PNG_1_0_X)
		5141	if ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_FILTER_SUB) &&
		5142	(row_info->pixel_depth >= png_ptr->mmx_bitdepth_threshold) &&
		5143	(row_info->rowbytes >= png_ptr->mmx_rowbytes_threshold))
		5144	#else
		5145	if (_mmx_supported)
		5146	#endif
		5147	{
		5148	png_read_filter_row_mmx_sub(row_info, row);
		5149	}
		5150	else
		5151	#endif /* PNG_ASSEMBLER_CODE_SUPPORTED */
		5152	{
		5153	png_uint_32 i;
		5154	png_uint_32 istop = row_info->rowbytes;
		5155	png_uint_32 bpp = (row_info->pixel_depth + 7) >> 3; /* bytes per pixel, rounded up */
		5156	png_bytep rp = row + bpp; /* byte being reconstructed */
		5157	png_bytep lp = row; /* byte bpp positions to its left */
		5158	
		5159	for (i = bpp; i < istop; i++)
		5160	{
		5161	*rp = (png_byte)(((int)(*rp) + (int)(*lp++)) & 0xff); /* store through rp; the pointer itself must not be reassigned */
		5162	rp++;
		5163	}
		5164	} /* end !UseMMX_sub */
		5165	break;
		5166	
		5167	case PNG_FILTER_VALUE_UP:
		5168	#if defined(PNG_ASSEMBLER_CODE_SUPPORTED)
		5169	#if !defined(PNG_1_0_X)
		5170	if ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_FILTER_UP) &&
		5171	(row_info->pixel_depth >= png_ptr->mmx_bitdepth_threshold) &&
		5172	(row_info->rowbytes >= png_ptr->mmx_rowbytes_threshold))
		5173	#else
		5174	if (_mmx_supported)
		5175	#endif
		5176	{
		5177	png_read_filter_row_mmx_up(row_info, row, prev_row);
		5178	}
		5179	else
		5180	#endif /* PNG_ASSEMBLER_CODE_SUPPORTED */
		5181	{
		5182	png_uint_32 i;
		5183	png_uint_32 istop = row_info->rowbytes;
		5184	png_bytep rp = row; /* byte being reconstructed */
		5185	png_bytep pp = prev_row; /* byte directly above it */
		5186	
		5187	for (i = 0; i < istop; ++i)
		5188	{
		5189	*rp = (png_byte)(((int)(*rp) + (int)(*pp++)) & 0xff);
		5190	rp++;
		5191	}
		5192	} /* end !UseMMX_up */
		5193	break;
		5194	
		5195	case PNG_FILTER_VALUE_AVG:
		5196	#if defined(PNG_ASSEMBLER_CODE_SUPPORTED) && defined(PNG_THREAD_UNSAFE_OK)
		5197	#if !defined(PNG_1_0_X)
		5198	if ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_FILTER_AVG) &&
		5199	(row_info->pixel_depth >= png_ptr->mmx_bitdepth_threshold) &&
		5200	(row_info->rowbytes >= png_ptr->mmx_rowbytes_threshold))
		5201	#else
		5202	if (_mmx_supported)
		5203	#endif
		5204	{
		5205	png_read_filter_row_mmx_avg(row_info, row, prev_row);
		5206	}
		5207	else
		5208	#endif /* PNG_ASSEMBLER_CODE_SUPPORTED */
		5209	{
		5210	png_uint_32 i;
		5211	png_bytep rp = row;
		5212	png_bytep pp = prev_row;
		5213	png_bytep lp = row;
		5214	png_uint_32 bpp = (row_info->pixel_depth + 7) >> 3;
		5215	png_uint_32 istop = row_info->rowbytes - bpp;
		5216	
		5217	for (i = 0; i < bpp; i++) /* first pixel: no left neighbour, so only half of the byte above is added */
		5218	{
		5219	*rp = (png_byte)(((int)(*rp) +
		5220	((int)(*pp++) >> 1)) & 0xff);
		5221	rp++;
		5222	}
		5223	
		5224	for (i = 0; i < istop; i++) /* remaining bytes: add the average of the above and left bytes */
		5225	{
		5226	*rp = (png_byte)(((int)(*rp) +
		5227	((int)(*pp++ + *lp++) >> 1)) & 0xff);
		5228	rp++;
		5229	}
		5230	} /* end !UseMMX_avg */
		5231	break;
		5232	
		5233	case PNG_FILTER_VALUE_PAETH:
		5234	#if defined(PNG_ASSEMBLER_CODE_SUPPORTED) && defined(PNG_THREAD_UNSAFE_OK)
		5235	#if !defined(PNG_1_0_X)
		5236	if ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_FILTER_PAETH) &&
		5237	(row_info->pixel_depth >= png_ptr->mmx_bitdepth_threshold) &&
		5238	(row_info->rowbytes >= png_ptr->mmx_rowbytes_threshold))
		5239	#else
		5240	if (_mmx_supported)
		5241	#endif
		5242	{
		5243	png_read_filter_row_mmx_paeth(row_info, row, prev_row);
		5244	}
		5245	else
		5246	#endif /* PNG_ASSEMBLER_CODE_SUPPORTED */
		5247	{
		5248	png_uint_32 i;
		5249	png_bytep rp = row; /* byte being reconstructed */
		5250	png_bytep pp = prev_row; /* byte above */
		5251	png_bytep lp = row; /* byte to the left (trails rp by bpp) */
		5252	png_bytep cp = prev_row; /* byte above-left (trails pp by bpp) */
		5253	png_uint_32 bpp = (row_info->pixel_depth + 7) >> 3;
		5254	png_uint_32 istop = row_info->rowbytes - bpp;
		5255	
		5256	for (i = 0; i < bpp; i++) /* first pixel: only the byte above is added */
		5257	{
		5258	*rp = (png_byte)(((int)(*rp) + (int)(*pp++)) & 0xff);
		5259	rp++;
		5260	}
		5261	
		5262	for (i = 0; i < istop; i++) /* use leftover rp,pp */
		5263	{
		5264	int a, b, c, pa, pb, pc, p;
		5265	
		5266	a = *lp++;
		5267	b = *pp++;
		5268	c = *cp++;
		5269	
		5270	p = b - c;
		5271	pc = a - c;
		5272	
		5273	#ifdef PNG_USE_ABS
		5274	pa = abs(p);
		5275	pb = abs(pc);
		5276	pc = abs(p + pc);
		5277	#else
		5278	pa = p < 0 ? -p : p;
		5279	pb = pc < 0 ? -pc : pc;
		5280	pc = (p + pc) < 0 ? -(p + pc) : p + pc;
		5281	#endif
		5282	
		5283	/*
		5284	if (pa <= pb && pa <= pc)
		5285	p = a;
		5286	else if (pb <= pc)
		5287	p = b;
		5288	else
		5289	p = c;
		5290	*/
		5291	
		5292	p = (pa <= pb && pa <= pc) ? a : (pb <= pc) ? b : c;
		5293	
		5294	*rp = (png_byte)(((int)(*rp) + p) & 0xff);
		5295	rp++;
		5296	}
		5297	} /* end !UseMMX_paeth */
		5298	break;
		5299	
		5300	default:
		5301	png_warning(png_ptr, "Ignoring bad row-filter type");
		5302	*row=0;
		5303	break;
		5304	}
		5305	}
		5306
		5307	#endif /* PNG_HAVE_ASSEMBLER_READ_FILTER_ROW */
		5308
		5309
		5310	/*===========================================================================*/
		5311	/* */
		5312	/* P N G _ M M X _ S U P P O R T */
		5313	/* */
		5314	/*===========================================================================*/
		5315
		5316	/* GRR NOTES: (1) the following code assumes 386 or better (pushfl/popfl)
		5317	* (2) all instructions compile with gcc 2.7.2.3 and later
		5318	* (3) the function is moved down here to prevent gcc from
		5319	* inlining it in multiple places and then barfing be-
		5320	* cause the ".NOT_SUPPORTED" label is multiply defined
		5321	* [is there a way to signal that a single function should
		5322	* not be inlined? is there a way to modify the label for
		5323	* each inlined instance, e.g., by appending _1, _2, etc.?
		5324	* maybe if don't use leading "." in label name? (nope...sigh)]
		5325	*/
		5326	/* png_mmx_support: probe the CPU via CPUID for MMX; stores 1 or 0 in _mmx_supported and returns it. */
		5327	int PNGAPI
		5328	png_mmx_support(void)
		5329	{
		5330	#if defined(PNG_MMX_CODE_SUPPORTED)
		5331	__asm__ __volatile__ (
		5332	"pushl %%ebx \n\t" // ebx gets clobbered by CPUID instruction
		5333	"pushl %%ecx \n\t" // so does ecx...
		5334	"pushl %%edx \n\t" // ...and edx (but ecx & edx safe on Linux)
		5335	// ".byte 0x66 \n\t" // convert 16-bit pushf to 32-bit pushfd
		5336	// "pushf \n\t" // 16-bit pushf
		5337	"pushfl \n\t" // save Eflag to stack
		5338	"popl %%eax \n\t" // get Eflag from stack into eax
		5339	"movl %%eax, %%ecx \n\t" // make another copy of Eflag in ecx
		5340	"xorl $0x200000, %%eax \n\t" // toggle ID bit in Eflag (i.e., bit 21)
		5341	"pushl %%eax \n\t" // save modified Eflag back to stack
		5342	// ".byte 0x66 \n\t" // convert 16-bit popf to 32-bit popfd
		5343	// "popf \n\t" // 16-bit popf
		5344	"popfl \n\t" // restore modified value to Eflag reg
		5345	"pushfl \n\t" // save Eflag to stack
		5346	"popl %%eax \n\t" // get Eflag from stack
		5347	"pushl %%ecx \n\t" // save original Eflag to stack
		5348	"popfl \n\t" // restore original Eflag
		5349	"xorl %%ecx, %%eax \n\t" // compare new Eflag with original Eflag
		5350	"jz 0f \n\t" // if same, CPUID instr. is not supported
		5351	
		5352	"xorl %%eax, %%eax \n\t" // set eax to zero
		5353	// ".byte 0x0f, 0xa2 \n\t" // CPUID instruction (two-byte opcode)
		5354	"cpuid \n\t" // get the CPU identification info
		5355	"cmpl $1, %%eax \n\t" // make sure eax return non-zero value
		5356	"jl 0f \n\t" // if eax is zero, MMX is not supported
		5357	
		5358	"xorl %%eax, %%eax \n\t" // set eax to zero and...
		5359	"incl %%eax \n\t" // ...increment eax to 1. This pair is
		5360	// faster than the instruction "mov eax, 1"
		5361	"cpuid \n\t" // get the CPU identification info again
		5362	"andl $0x800000, %%edx \n\t" // mask out all bits but MMX bit (23)
		5363	"cmpl $0, %%edx \n\t" // 0 = MMX not supported
		5364	"jz 0f \n\t" // non-zero = yes, MMX IS supported
		5365	
		5366	"movl $1, %%eax \n\t" // set return value to 1
		5367	"jmp 1f \n\t" // DONE: have MMX support
		5368	
		5369	"0: \n\t" // .NOT_SUPPORTED: target label for jump instructions
		5370	"movl $0, %%eax \n\t" // set return value to 0
		5371	"1: \n\t" // .RETURN: target label for jump instructions
		5372	"movl %%eax, _mmx_supported \n\t" // save in global static variable, too
		5373	"popl %%edx \n\t" // restore edx
		5374	"popl %%ecx \n\t" // restore ecx
		5375	"popl %%ebx \n\t" // restore ebx
		5376	
		5377	// "ret \n\t" // DONE: no MMX support
		5378	// (fall through to standard C "ret")
		5379	
		5380	: // output list (none)
		5381	
		5382	: // any variables used on input (none)
		5383	
		5384	: "%eax" // clobber list
		5385	// , "%ebx", "%ecx", "%edx" // GRR: we handle these manually
		5386	// , "memory" // if write to a variable gcc thought was in a reg
		5387	// , "cc" // "condition codes" (flag bits)
		5388	);
		5389	#else
		5390	_mmx_supported = 0; // MMX code not compiled in: always report "no MMX"
		5391	#endif /* PNG_MMX_CODE_SUPPORTED */
		5392	// NOTE(review): the asm stores to _mmx_supported by symbol name while "memory" stays out of the clobber list -- confirm the compiler in use reloads it for the return below
		5393	return _mmx_supported;
		5394	}
		5395
		5396
		5397	#endif /* PNG_USE_PNGGCCRD */

Subversion Repositories shark

(root)/shark/trunk/ports/png/pnggccrd.c @ 1038 - Rev 96