MAME SVN History

199869 Revisions

r34335 Saturday 10th January, 2015 at 20:45:26 UTC by Miodrag Milanović
Moved softfloat to 3rdparty (nw)

[3rdparty/softfloat]	README.txt* fpu_constant.h* fsincos.c* fyl2x.c* mamesf.h* milieu.h* softfloat-macros* softfloat-specialize* softfloat.c* softfloat.h*
[src/emu/cpu/i386]	i386.h
[src/emu/cpu/m68000]	m68000.h
[src/lib]	lib.mak
[src/lib/softfloat]	~~README.txt~~ ~~fpu_constant.h~~ ~~fsincos.c~~ ~~fyl2x.c~~ ~~mamesf.h~~ ~~milieu.h~~ ~~softfloat-macros~~ ~~softfloat-specialize~~ ~~softfloat.c~~ ~~softfloat.h~~

trunk/3rdparty/softfloat/README.txt
r0	r242847
	1	MAME note: this package is derived from the following original SoftFloat
	2	package and has been "re-packaged" to work with MAME's conventions and
	3	build system. The source files come from bits64/ and bits64/templates
	4	in the original distribution as MAME requires a compiler with a 64-bit
	5	integer type.
	6
	7
	8	Package Overview for SoftFloat Release 2b
	9
	10	John R. Hauser
	11	2002 May 27
	12
	13
	14	----------------------------------------------------------------------------
	15	Overview
	16
	17	SoftFloat is a software implementation of floating-point that conforms to
	18	the IEC/IEEE Standard for Binary Floating-Point Arithmetic. SoftFloat is
	19	distributed in the form of C source code. Compiling the SoftFloat sources
	20	generates two things:
	21
	22	-- A SoftFloat object file (typically `softfloat.o') containing the complete
	23	set of IEC/IEEE floating-point routines.
	24
	25	-- A `timesoftfloat' program for evaluating the speed of the SoftFloat
	26	routines. (The SoftFloat module is linked into this program.)
	27
	28	The SoftFloat package is documented in four text files:
	29
	30	SoftFloat.txt Documentation for using the SoftFloat functions.
	31	SoftFloat-source.txt Documentation for compiling SoftFloat.
	32	SoftFloat-history.txt History of major changes to SoftFloat.
	33	timesoftfloat.txt Documentation for using `timesoftfloat'.
	34
	35	Other files in the package comprise the source code for SoftFloat.
	36
	37	Please be aware that some work is involved in porting this software to other
	38	targets. It is not just a matter of getting `make' to complete without
	39	error messages. I would have written the code that way if I could, but
	40	there are fundamental differences between systems that can't be hidden.
	41	You should not attempt to compile SoftFloat without first reading both
	42	`SoftFloat.txt' and `SoftFloat-source.txt'.
	43
	44
	45	----------------------------------------------------------------------------
	46	Legal Notice
	47
	48	SoftFloat was written by me, John R. Hauser. This work was made possible in
	49	part by the International Computer Science Institute, located at Suite 600,
	50	1947 Center Street, Berkeley, California 94704. Funding was partially
	51	provided by the National Science Foundation under grant MIP-9311980. The
	52	original version of this code was written as part of a project to build
	53	a fixed-point vector processor in collaboration with the University of
	54	California at Berkeley, overseen by Profs. Nelson Morgan and John Wawrzynek.
	55
	56	THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE. Although reasonable effort
	57	has been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT
	58	TIMES RESULT IN INCORRECT BEHAVIOR. USE OF THIS SOFTWARE IS RESTRICTED TO
	59	PERSONS AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ALL
	60	LOSSES, COSTS, OR OTHER PROBLEMS THEY INCUR DUE TO THE SOFTWARE, AND WHO
	61	FURTHERMORE EFFECTIVELY INDEMNIFY JOHN HAUSER AND THE INTERNATIONAL COMPUTER
	62	SCIENCE INSTITUTE (possibly via similar legal warning) AGAINST ALL LOSSES,
	63	COSTS, OR OTHER PROBLEMS INCURRED BY THEIR CUSTOMERS AND CLIENTS DUE TO THE
	64	SOFTWARE.
	65
	66	Derivative works are acceptable, even for commercial purposes, provided
	67	that the minimal documentation requirements stated in the source code are
	68	satisfied.
	69
	70
	71	----------------------------------------------------------------------------
	72	Contact Information
	73
	74	At the time of this writing, the most up-to-date information about
	75	SoftFloat and the latest release can be found at the Web page `http://
	76	www.cs.berkeley.edu/~jhauser/arithmetic/SoftFloat.html'.
	77
	78

trunk/3rdparty/softfloat/fpu_constant.h
r0	r242847
	1	/*============================================================================
	2	This source file is an extension to the SoftFloat IEC/IEEE Floating-point
	3	Arithmetic Package, Release 2b, written for Bochs (x86 architecture simulator)
	4	floating point emulation.
	5
	6	THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE. Although reasonable effort has
	7	been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT TIMES
	8	RESULT IN INCORRECT BEHAVIOR. USE OF THIS SOFTWARE IS RESTRICTED TO PERSONS
	9	AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ALL LOSSES,
	10	COSTS, OR OTHER PROBLEMS THEY INCUR DUE TO THE SOFTWARE, AND WHO FURTHERMORE
	11	EFFECTIVELY INDEMNIFY JOHN HAUSER AND THE INTERNATIONAL COMPUTER SCIENCE
	12	INSTITUTE (possibly via similar legal warning) AGAINST ALL LOSSES, COSTS, OR
	13	OTHER PROBLEMS INCURRED BY THEIR CUSTOMERS AND CLIENTS DUE TO THE SOFTWARE.
	14
	15	Derivative works are acceptable, even for commercial purposes, so long as
	16	(1) the source code for the derivative work includes prominent notice that
	17	the work is derivative, and (2) the source code includes prominent notice with
	18	these four paragraphs for those parts of this code that are retained.
	19	=============================================================================*/
	20
	21	#ifndef _FPU_CONSTANTS_H_
	22	#define _FPU_CONSTANTS_H_
	23
	24	// Pentium CPU uses only 68-bit precision M_PI approximation
	25	#define BETTER_THAN_PENTIUM
	26
	27	/*============================================================================
	28	* Written for Bochs (x86 architecture simulator) by
	29	* Stanislav Shwartsman [sshwarts at sourceforge net]
	30	* ==========================================================================*/
	31
	32	//////////////////////////////
	33	// PI, PI/2, PI/4 constants
	34	//////////////////////////////
	35
	36	#define FLOATX80_PI_EXP (0x4000)
	37
	38	// 128-bit PI fraction
	39	#ifdef BETTER_THAN_PENTIUM
	40	#define FLOAT_PI_HI (U64(0xc90fdaa22168c234))
	41	#define FLOAT_PI_LO (U64(0xc4c6628b80dc1cd1))
	42	#else
	43	#define FLOAT_PI_HI (U64(0xc90fdaa22168c234))
	44	#define FLOAT_PI_LO (U64(0xC000000000000000))
	45	#endif
	46
	47	#define FLOATX80_PI2_EXP (0x3FFF)
	48	#define FLOATX80_PI4_EXP (0x3FFE)
	49
	50	//////////////////////////////
	51	// 3PI/4 constant
	52	//////////////////////////////
	53
	54	#define FLOATX80_3PI4_EXP (0x4000)
	55
	56	// 128-bit 3PI/4 fraction
	57	#ifdef BETTER_THAN_PENTIUM
	58	#define FLOAT_3PI4_HI (U64(0x96cbe3f9990e91a7))
	59	#define FLOAT_3PI4_LO (U64(0x9394c9e8a0a5159c))
	60	#else
	61	#define FLOAT_3PI4_HI (U64(0x96cbe3f9990e91a7))
	62	#define FLOAT_3PI4_LO (U64(0x9000000000000000))
	63	#endif
	64
	65	//////////////////////////////
	66	// 1/LN2 constant
	67	//////////////////////////////
	68
	69	#define FLOAT_LN2INV_EXP (0x3FFF)
	70
	71	// 128-bit 1/LN2 fraction
	72	#ifdef BETTER_THAN_PENTIUM
	73	#define FLOAT_LN2INV_HI (U64(0xb8aa3b295c17f0bb))
	74	#define FLOAT_LN2INV_LO (U64(0xbe87fed0691d3e89))
	75	#else
	76	#define FLOAT_LN2INV_HI (U64(0xb8aa3b295c17f0bb))
	77	#define FLOAT_LN2INV_LO (U64(0xC000000000000000))
	78	#endif
	79
	80	#endif

trunk/3rdparty/softfloat/fsincos.c
r0	r242847
	1	/*============================================================================
	2	This source file is an extension to the SoftFloat IEC/IEEE Floating-point
	3	Arithmetic Package, Release 2b, written for Bochs (x86 architecture simulator)
	4	floating point emulation.
	5
	6	THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE. Although reasonable effort has
	7	been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT TIMES
	8	RESULT IN INCORRECT BEHAVIOR. USE OF THIS SOFTWARE IS RESTRICTED TO PERSONS
	9	AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ALL LOSSES,
	10	COSTS, OR OTHER PROBLEMS THEY INCUR DUE TO THE SOFTWARE, AND WHO FURTHERMORE
	11	EFFECTIVELY INDEMNIFY JOHN HAUSER AND THE INTERNATIONAL COMPUTER SCIENCE
	12	INSTITUTE (possibly via similar legal warning) AGAINST ALL LOSSES, COSTS, OR
	13	OTHER PROBLEMS INCURRED BY THEIR CUSTOMERS AND CLIENTS DUE TO THE SOFTWARE.
	14
	15	Derivative works are acceptable, even for commercial purposes, so long as
	16	(1) the source code for the derivative work includes prominent notice that
	17	the work is derivative, and (2) the source code includes prominent notice with
	18	these four paragraphs for those parts of this code that are retained.
	19	=============================================================================*/
	20
	21	/*============================================================================
	22	* Written for Bochs (x86 architecture simulator) by
	23	* Stanislav Shwartsman [sshwarts at sourceforge net]
	24	* ==========================================================================*/
	25
	26	#define FLOAT128
	27
	28	#define USE_estimateDiv128To64
	29	#include "mamesf.h"
	30	#include "softfloat.h"
	31	//#include "softfloat-specialize"
	32	#include "fpu_constant.h"
	33
	34	static const floatx80 floatx80_one = packFloatx80(0, 0x3fff, U64(0x8000000000000000));
	35	static const floatx80 floatx80_default_nan = packFloatx80(0, 0xffff, U64(0xffffffffffffffff));
	36
	37	#define packFloat2x128m(zHi, zLo) {(zHi), (zLo)}
	38	#define PACK_FLOAT_128(hi,lo) packFloat2x128m(LIT64(hi),LIT64(lo))
	39
	40	#define EXP_BIAS 0x3FFF
	41
	42	/*----------------------------------------------------------------------------
	43	| Returns the fraction bits of the extended double-precision floating-point
	44	| value `a'.
	45	*----------------------------------------------------------------------------*/
	46
	47	INLINE bits64 extractFloatx80Frac( floatx80 a )
	48	{
	49	return a.low; /* the fraction is the whole `low' member */
	50
	51	}
	52
	53	/*----------------------------------------------------------------------------
	54	| Returns the exponent bits of the extended double-precision floating-point
	55	| value `a'.
	56	*----------------------------------------------------------------------------*/
	57
	58	INLINE int32 extractFloatx80Exp( floatx80 a )
	59	{
	60	return a.high & 0x7FFF; /* low 15 bits of `high'; bit 15 is read by extractFloatx80Sign */
	61
	62	}
	63
	64	/*----------------------------------------------------------------------------
	65	| Returns the sign bit of the extended double-precision floating-point value
	66	| `a'.
	67	*----------------------------------------------------------------------------*/
	68
	69	INLINE flag extractFloatx80Sign( floatx80 a )
	70	{
	71	return a.high>>15; /* everything above the 15 exponent bits of `high' */
	72
	73	}
	74
	75	/*----------------------------------------------------------------------------
	76	| Takes extended double-precision floating-point NaN `a' and returns the
	77	| appropriate NaN result. If `a' is a signaling NaN, the invalid exception
	78	| is raised.
	79	*----------------------------------------------------------------------------*/
	80
	81	INLINE floatx80 propagateFloatx80NaNOneArg(floatx80 a)
	82	{
	83	if (floatx80_is_signaling_nan(a))
	84	float_raise(float_flag_invalid);
	85
	86	a.low |= U64(0xC000000000000000); /* force the top two significand bits on in the result */
	87
	88	return a;
	89	}
	90
	91	/*----------------------------------------------------------------------------
	92	| Normalizes the subnormal extended double-precision floating-point value
	93	| represented by the denormalized significand `aSig'. The normalized exponent
	94	| and significand are stored at the locations pointed to by `zExpPtr' and
	95	| `zSigPtr', respectively.
	96	*----------------------------------------------------------------------------*/
	97
	98	void normalizeFloatx80Subnormal(UINT64 aSig, INT32 *zExpPtr, UINT64 *zSigPtr)
	99	{
	100	int shiftCount = countLeadingZeros64(aSig);
	101	*zSigPtr = aSig<<shiftCount; /* shift the leading one up to bit 63 */
	102	*zExpPtr = 1 - shiftCount; /* exponent drops by the shift distance */
	103	}
	104
	105	/* reduce trigonometric function argument using 128-bit precision
	106	M_PI approximation; the remainder is stored through zSig0/zSig1 and the quotient is returned */
	107	static UINT64 argument_reduction_kernel(UINT64 aSig0, int Exp, UINT64 *zSig0, UINT64 *zSig1)
	108	{
	109	UINT64 term0, term1, term2;
	110	UINT64 aSig1 = 0;
	111
	112	shortShift128Left(aSig1, aSig0, Exp, &aSig1, &aSig0);
	113	UINT64 q = estimateDiv128To64(aSig1, aSig0, FLOAT_PI_HI);
	114	mul128By64To192(FLOAT_PI_HI, FLOAT_PI_LO, q, &term0, &term1, &term2);
	115	sub128(aSig1, aSig0, term0, term1, zSig1, zSig0);
	116	while ((INT64)(*zSig1) < 0) { /* estimate was too large: step q down and add PI back */
	117	--q;
	118	add192(*zSig1, *zSig0, term2, 0, FLOAT_PI_HI, FLOAT_PI_LO, zSig1, zSig0, &term2);
	119	}
	120	*zSig1 = term2;
	121	return q;
	122	}
	123
	124	static int reduce_trig_arg(int expDiff, int &zSign, UINT64 &aSig0, UINT64 &aSig1) /* reduces aSig0:aSig1 in place, may flip zSign, returns quotient mod 4 */
	125	{
	126	UINT64 term0, term1, q = 0;
	127
	128	if (expDiff < 0) {
	129	shift128Right(aSig0, 0, 1, &aSig0, &aSig1);
	130	expDiff = 0;
	131	}
	132	if (expDiff > 0) {
	133	q = argument_reduction_kernel(aSig0, expDiff, &aSig0, &aSig1);
	134	}
	135	else {
	136	if (FLOAT_PI_HI <= aSig0) {
	137	aSig0 -= FLOAT_PI_HI;
	138	q = 1;
	139	}
	140	}
	141
	142	shift128Right(FLOAT_PI_HI, FLOAT_PI_LO, 1, &term0, &term1); /* term0:term1 = PI fraction shifted right by one */
	143	if (! lt128(aSig0, aSig1, term0, term1))
	144	{
	145	int lt = lt128(term0, term1, aSig0, aSig1);
	146	int eq = eq128(aSig0, aSig1, term0, term1);
	147
	148	if ((eq && (q & 1)) || lt) {
	149	zSign = !zSign;
	150	++q;
	151	}
	152	if (lt) sub128(FLOAT_PI_HI, FLOAT_PI_LO, aSig0, aSig1, &aSig0, &aSig1);
	153	}
	154
	155	return (int)(q & 3);
	156	}
	157
	158	#define SIN_ARR_SIZE 11 /* number of entries in sin_arr */
	159	#define COS_ARR_SIZE 11 /* number of entries in cos_arr */
	160
	161	static float128 sin_arr[SIN_ARR_SIZE] = /* coefficients handed to OddPoly() by poly_sin(); the trailing number on each row is the power of x it belongs to */
	162	{
	163	PACK_FLOAT_128(0x3fff000000000000, 0x0000000000000000), /* 1 */
	164	PACK_FLOAT_128(0xbffc555555555555, 0x5555555555555555), /* 3 */
	165	PACK_FLOAT_128(0x3ff8111111111111, 0x1111111111111111), /* 5 */
	166	PACK_FLOAT_128(0xbff2a01a01a01a01, 0xa01a01a01a01a01a), /* 7 */
	167	PACK_FLOAT_128(0x3fec71de3a556c73, 0x38faac1c88e50017), /* 9 */
	168	PACK_FLOAT_128(0xbfe5ae64567f544e, 0x38fe747e4b837dc7), /* 11 */
	169	PACK_FLOAT_128(0x3fde6124613a86d0, 0x97ca38331d23af68), /* 13 */
	170	PACK_FLOAT_128(0xbfd6ae7f3e733b81, 0xf11d8656b0ee8cb0), /* 15 */
	171	PACK_FLOAT_128(0x3fce952c77030ad4, 0xa6b2605197771b00), /* 17 */
	172	PACK_FLOAT_128(0xbfc62f49b4681415, 0x724ca1ec3b7b9675), /* 19 */
	173	PACK_FLOAT_128(0x3fbd71b8ef6dcf57, 0x18bef146fcee6e45) /* 21 */
	174	};
	175
	176	static float128 cos_arr[COS_ARR_SIZE] = /* coefficients handed to EvenPoly() by poly_cos(); the trailing number on each row is the power of x it belongs to */
	177	{
	178	PACK_FLOAT_128(0x3fff000000000000, 0x0000000000000000), /* 0 */
	179	PACK_FLOAT_128(0xbffe000000000000, 0x0000000000000000), /* 2 */
	180	PACK_FLOAT_128(0x3ffa555555555555, 0x5555555555555555), /* 4 */
	181	PACK_FLOAT_128(0xbff56c16c16c16c1, 0x6c16c16c16c16c17), /* 6 */
	182	PACK_FLOAT_128(0x3fefa01a01a01a01, 0xa01a01a01a01a01a), /* 8 */
	183	PACK_FLOAT_128(0xbfe927e4fb7789f5, 0xc72ef016d3ea6679), /* 10 */
	184	PACK_FLOAT_128(0x3fe21eed8eff8d89, 0x7b544da987acfe85), /* 12 */
	185	PACK_FLOAT_128(0xbfda93974a8c07c9, 0xd20badf145dfa3e5), /* 14 */
	186	PACK_FLOAT_128(0x3fd2ae7f3e733b81, 0xf11d8656b0ee8cb0), /* 16 */
	187	PACK_FLOAT_128(0xbfca6827863b97d9, 0x77bb004886a2c2ab), /* 18 */
	188	PACK_FLOAT_128(0x3fc1e542ba402022, 0x507a9cad2bf8f0bb) /* 20 */
	189	};
	190
	191	extern float128 OddPoly (float128 x, float128 *arr, unsigned n);
	192
	193	/* 0 <= x <= pi/4 */
	194	INLINE float128 poly_sin(float128 x)
	195	{
	196	//                 3     5     7     9     11     13     15
	197	//                x     x     x     x     x      x      x
	198	// sin (x) ~ x - --- + --- - --- + --- - ---- + ---- - ---- =
	199	//                3!    5!    7!    9!    11!    13!    15!
	200	//
	201	//                 2     4     6     8     10     12     14
	202	//                x     x     x     x     x      x      x
	203	//   = x * [ 1 - --- + --- - --- + --- - ---- + ---- - ---- ] =
	204	//                3!    5!    7!    9!    11!    13!    15!
	205	//
	206	//           3                          3
	207	//          --       4k                --        4k+2
	208	//   p(x) = >  C  * x   > 0     q(x) = >  C   * x     < 0
	209	//          --  2k                     --  2k+1
	210	//          k=0                        k=0
	211	//
	212	//                          2
	213	//   sin(x) ~ x * [ p(x) + x * q(x) ]
	214	//
	215
	216	return OddPoly(x, sin_arr, SIN_ARR_SIZE); // evaluates the odd series above with all SIN_ARR_SIZE coefficients of sin_arr
	217	}
	218
	219	extern float128 EvenPoly(float128 x, float128 *arr, unsigned n);
	220
	221	/* 0 <= x <= pi/4 */
	222	INLINE float128 poly_cos(float128 x)
	223	{
	224	//                 2     4     6     8     10     12     14
	225	//                x     x     x     x     x      x      x
	226	// cos (x) ~ 1 - --- + --- - --- + --- - ---- + ---- - ----
	227	//                2!    4!    6!    8!    10!    12!    14!
	228	//
	229	//           3                          3
	230	//          --       4k                --        4k+2
	231	//   p(x) = >  C  * x   > 0     q(x) = >  C   * x     < 0
	232	//          --  2k                     --  2k+1
	233	//          k=0                        k=0
	234	//
	235	//                      2
	236	//   cos(x) ~ [ p(x) + x * q(x) ]
	237	//
	238
	239	return EvenPoly(x, cos_arr, COS_ARR_SIZE); // evaluates the even series above with all COS_ARR_SIZE coefficients of cos_arr
	240	}
	241
	242	INLINE void sincos_invalid(floatx80 *sin_a, floatx80 *cos_a, floatx80 a) /* stores `a' through every non-null output pointer */
	243	{
	244	if (sin_a) *sin_a = a;
	245	if (cos_a) *cos_a = a;
	246	}
	247
	248	INLINE void sincos_tiny_argument(floatx80 *sin_a, floatx80 *cos_a, floatx80 a) /* null output pointers are skipped */
	249	{
	250	if (sin_a) *sin_a = a; /* sine result is the argument itself */
	251	if (cos_a) *cos_a = floatx80_one; /* cosine result is exactly one */
	252	}
	253
	254	static floatx80 sincos_approximation(int neg, float128 r, UINT64 quotient) // picks poly_sin/poly_cos and the result sign from the low two bits of `quotient'
	255	{
	256	if (quotient & 0x1) { // odd quotient: use the cosine polynomial and discard the incoming sign
	257	r = poly_cos(r);
	258	neg = 0;
	259	} else {
	260	r = poly_sin(r);
	261	}
	262
	263	floatx80 result = float128_to_floatx80(r);
	264	if (quotient & 0x2) // bit 1 of the quotient flips the sign
	265	neg = ! neg;
	266
	267	if (neg)
	268	result = floatx80_chs(result);
	269
	270	return result;
	271	}
	272
	273	// =================================================
	274	// SFFSINCOS Compute sin(x) and cos(x)
	275	// =================================================
	276
	277	//
	278	// Uses the following identities:
	279	// ----------------------------------------------------------
	280	//
	281	// sin(-x) = -sin(x)
	282	// cos(-x) = cos(x)
	283	//
	284	// sin(x+y) = sin(x)cos(y)+cos(x)sin(y)
	285	// cos(x+y) = cos(x)cos(y)-sin(x)sin(y)
	286	//
	287	// sin(x+ pi/2) = cos(x)
	288	// sin(x+ pi) = -sin(x)
	289	// sin(x+3pi/2) = -cos(x)
	290	// sin(x+2pi) = sin(x)
	291	//
	292
	293	int sf_fsincos(floatx80 a, floatx80 *sin_a, floatx80 *cos_a) /* either output pointer may be null; returns 0, or -1 when the argument is out of range */
	294	{
	295	UINT64 aSig0, aSig1 = 0;
	296	INT32 aExp, zExp, expDiff;
	297	int aSign, zSign;
	298	int q = 0;
	299
	300	aSig0 = extractFloatx80Frac(a);
	301	aExp = extractFloatx80Exp(a);
	302	aSign = extractFloatx80Sign(a);
	303
	304	/* invalid argument */
	305	if (aExp == 0x7FFF) {
	306	if ((UINT64) (aSig0<<1)) {
	307	sincos_invalid(sin_a, cos_a, propagateFloatx80NaNOneArg(a));
	308	return 0;
	309	}
	310
	311	float_raise(float_flag_invalid);
	312	sincos_invalid(sin_a, cos_a, floatx80_default_nan);
	313	return 0;
	314	}
	315
	316	if (aExp == 0) {
	317	if (aSig0 == 0) {
	318	sincos_tiny_argument(sin_a, cos_a, a);
	319	return 0;
	320	}
	321
	322	// float_raise(float_flag_denormal);
	323
	324	/* handle pseudo denormals */
	325	if (! (aSig0 & U64(0x8000000000000000)))
	326	{
	327	float_raise(float_flag_inexact);
	328	if (sin_a)
	329	float_raise(float_flag_underflow);
	330	sincos_tiny_argument(sin_a, cos_a, a);
	331	return 0;
	332	}
	333
	334	normalizeFloatx80Subnormal(aSig0, &aExp, &aSig0);
	335	}
	336
	337	zSign = aSign;
	338	zExp = EXP_BIAS;
	339	expDiff = aExp - zExp;
	340
	341	/* argument is out-of-range */
	342	if (expDiff >= 63)
	343	return -1;
	344
	345	float_raise(float_flag_inexact);
	346
	347	if (expDiff < -1) { // doesn't require reduction
	348	if (expDiff <= -68) {
	349	a = packFloatx80(aSign, aExp, aSig0);
	350	sincos_tiny_argument(sin_a, cos_a, a);
	351	return 0;
	352	}
	353	zExp = aExp;
	354	}
	355	else {
	356	q = reduce_trig_arg(expDiff, zSign, aSig0, aSig1);
	357	}
	358
	359	/* **************************** */
	360	/* argument reduction completed */
	361	/* **************************** */
	362
	363	/* using float128 for approximation */
	364	float128 r = normalizeRoundAndPackFloat128(0, zExp-0x10, aSig0, aSig1);
	365
	366	if (aSign) q = -q;
	367	if (sin_a) *sin_a = sincos_approximation(zSign, r, q);
	368	if (cos_a) *cos_a = sincos_approximation(zSign, r, q+1); /* same kernel, quotient advanced by one */
	369
	370	return 0;
	371	}
	372
	373	int floatx80_fsin(floatx80 &a) // overwrites `a' with the sine result; returns sf_fsincos()'s status
	374	{
	375	return sf_fsincos(a, &a, 0); // null cos_a: only the sine output is stored
	376	}
	377
	378	int floatx80_fcos(floatx80 &a) // overwrites `a' with the cosine result; returns sf_fsincos()'s status
	379	{
	380	return sf_fsincos(a, 0, &a); // null sin_a: only the cosine output is stored
	381	}
	382
	383	// =================================================
	384	// FPTAN Compute tan(x)
	385	// =================================================
	386
	387	//
	388	// Uses the following identities:
	389	//
	390	// 1. ----------------------------------------------------------
	391	//
	392	// sin(-x) = -sin(x)
	393	// cos(-x) = cos(x)
	394	//
	395	// sin(x+y) = sin(x)cos(y)+cos(x)sin(y)
	396	// cos(x+y) = cos(x)cos(y)-sin(x)sin(y)
	397	//
	398	// sin(x+ pi/2) = cos(x)
	399	// sin(x+ pi) = -sin(x)
	400	// sin(x+3pi/2) = -cos(x)
	401	// sin(x+2pi) = sin(x)
	402	//
	403	// 2. ----------------------------------------------------------
	404	//
	405	// sin(x)
	406	// tan(x) = ------
	407	// cos(x)
	408	//
	409
	410	int floatx80_ftan(floatx80 &a) /* overwrites `a' with the result; returns 0, or -1 when the argument is out of range */
	411	{
	412	UINT64 aSig0, aSig1 = 0;
	413	INT32 aExp, zExp, expDiff;
	414	int aSign, zSign;
	415	int q = 0;
	416
	417	aSig0 = extractFloatx80Frac(a);
	418	aExp = extractFloatx80Exp(a);
	419	aSign = extractFloatx80Sign(a);
	420
	421	/* invalid argument */
	422	if (aExp == 0x7FFF) {
	423	if ((UINT64) (aSig0<<1))
	424	{
	425	a = propagateFloatx80NaNOneArg(a);
	426	return 0;
	427	}
	428
	429	float_raise(float_flag_invalid);
	430	a = floatx80_default_nan;
	431	return 0;
	432	}
	433
	434	if (aExp == 0) {
	435	if (aSig0 == 0) return 0;
	436	// float_raise(float_flag_denormal);
	437	/* handle pseudo denormals */
	438	if (! (aSig0 & U64(0x8000000000000000)))
	439	{
	440	float_raise(float_flag_inexact | float_flag_underflow);
	441	return 0;
	442	}
	443	normalizeFloatx80Subnormal(aSig0, &aExp, &aSig0);
	444	}
	445
	446	zSign = aSign;
	447	zExp = EXP_BIAS;
	448	expDiff = aExp - zExp;
	449
	450	/* argument is out-of-range */
	451	if (expDiff >= 63)
	452	return -1;
	453
	454	float_raise(float_flag_inexact);
	455
	456	if (expDiff < -1) { // doesn't require reduction
	457	if (expDiff <= -68) {
	458	a = packFloatx80(aSign, aExp, aSig0);
	459	return 0;
	460	}
	461	zExp = aExp;
	462	}
	463	else {
	464	q = reduce_trig_arg(expDiff, zSign, aSig0, aSig1);
	465	}
	466
	467	/* **************************** */
	468	/* argument reduction completed */
	469	/* **************************** */
	470
	471	/* using float128 for approximation */
	472	float128 r = normalizeRoundAndPackFloat128(0, zExp-0x10, aSig0, aSig1);
	473
	474	float128 sin_r = poly_sin(r);
	475	float128 cos_r = poly_cos(r);
	476
	477	if (q & 0x1) { /* odd quotient: result is cos/sin with the sign flipped */
	478	r = float128_div(cos_r, sin_r);
	479	zSign = ! zSign;
	480	} else {
	481	r = float128_div(sin_r, cos_r);
	482	}
	483
	484	a = float128_to_floatx80(r);
	485	if (zSign)
	486	a = floatx80_chs(a);
	487
	488	return 0;
	489	}
	490
	491	//                            2         3         4               n
	492	// f(x) ~ C + (C * x) + (C * x) + (C * x) + (C * x) + ... + (C * x)
	493	//         0    1         2         3         4               n
	494	//
	495	//          --       2k                --        2k+1
	496	//   p(x) = >  C  * x           q(x) = >  C   * x
	497	//          --  2k                     --  2k+1
	498	//
	499	//   f(x) ~ [ p(x) + x * q(x) ]
	500	//
	501
	502	float128 EvalPoly(float128 x, float128 *arr, unsigned n) // evaluates the n-coefficient polynomial `arr' at x as two interleaved Horner passes in x*x
	503	{
	504	float128 x2 = float128_mul(x, x);
	505	unsigned i;
	506
	507	assert(n > 1); // both passes below need a starting coefficient
	508
	509	float128 r1 = arr[--n]; // first pass starts at the last coefficient and steps down by two
	510	i = n;
	511	while(i >= 2) {
	512	r1 = float128_mul(r1, x2);
	513	i -= 2;
	514	r1 = float128_add(r1, arr[i]);
	515	}
	516	if (i) r1 = float128_mul(r1, x); // pass ended on index 1: one more factor of x is owed
	517
	518	float128 r2 = arr[--n]; // second pass covers the remaining coefficients, starting one index lower
	519	i = n;
	520	while(i >= 2) {
	521	r2 = float128_mul(r2, x2);
	522	i -= 2;
	523	r2 = float128_add(r2, arr[i]);
	524	}
	525	if (i) r2 = float128_mul(r2, x);
	526
	527	return float128_add(r1, r2);
	528	}
	529
	530	//                  2         4         6         8               2n
	531	// f(x) ~ C + (C * x) + (C * x) + (C * x) + (C * x) + ... + (C * x)
	532	//         0    1         2         3         4               n
	533	//
	534	//          --       4k                --        4k+2
	535	//   p(x) = >  C  * x           q(x) = >  C   * x
	536	//          --  2k                     --  2k+1
	537	//
	538	//                    2
	539	//   f(x) ~ [ p(x) + x * q(x) ]
	540	//
	541
	542	float128 EvenPoly(float128 x, float128 *arr, unsigned n)
	543	{
	544	return EvalPoly(float128_mul(x, x), arr, n); // a polynomial in x*x, so only even powers of x appear
	545	}
	546
	547	//                        3         5         7         9               2n+1
	548	// f(x) ~ (C * x) + (C * x) + (C * x) + (C * x) + (C * x) + ... + (C * x)
	549	//          0         1         2         3         4               n
	550	//                        2         4         6         8               2n
	551	//      = x * [ C + (C * x) + (C * x) + (C * x) + (C * x) + ... + (C * x) ]
	552	//               0    1         2         3         4               n
	553	//
	554	//          --       4k                --        4k+2
	555	//   p(x) = >  C  * x           q(x) = >  C   * x
	556	//          --  2k                     --  2k+1
	557	//
	558	//                        2
	559	//   f(x) ~ x * [ p(x) + x * q(x) ]
	560	//
	561
	562	float128 OddPoly(float128 x, float128 *arr, unsigned n)
	563	{
	564	return float128_mul(x, EvenPoly(x, arr, n)); // x times the even polynomial, so only odd powers of x appear
	565	}
	566
	567	/*----------------------------------------------------------------------------
	568	| Scales extended double-precision floating-point value in operand `a' by
	569	| value `b'. The function truncates the value in the second operand 'b' to
	570	| an integral value and adds that value to the exponent of the operand 'a'.
	571	| The operation performed according to the IEC/IEEE Standard for Binary
	572	| Floating-Point Arithmetic.
	573	*----------------------------------------------------------------------------*/
	574
	575	extern floatx80 propagateFloatx80NaN( floatx80 a, floatx80 b );
	576
	577	floatx80 floatx80_scale(floatx80 a, floatx80 b)
	578	{
	579	sbits32 aExp, bExp;
	580	bits64 aSig, bSig;
	581
	582	// handle unsupported extended double-precision floating encodings
	583	/* if (floatx80_is_unsupported(a) || floatx80_is_unsupported(b))
	584	{
	585	float_raise(float_flag_invalid);
	586	return floatx80_default_nan;
	587	}*/
	588
	589	aSig = extractFloatx80Frac(a);
	590	aExp = extractFloatx80Exp(a);
	591	int aSign = extractFloatx80Sign(a);
	592	bSig = extractFloatx80Frac(b);
	593	bExp = extractFloatx80Exp(b);
	594	int bSign = extractFloatx80Sign(b);
	595
	596	if (aExp == 0x7FFF) {
	597	if ((bits64) (aSig<<1) || ((bExp == 0x7FFF) && (bits64) (bSig<<1)))
	598	{
	599	return propagateFloatx80NaN(a, b);
	600	}
	601	if ((bExp == 0x7FFF) && bSign) {
	602	float_raise(float_flag_invalid);
	603	return floatx80_default_nan;
	604	}
	605	if (bSig && (bExp == 0)) float_raise(float_flag_denormal);
	606	return a;
	607	}
	608	if (bExp == 0x7FFF) {
	609	if ((bits64) (bSig<<1)) return propagateFloatx80NaN(a, b);
	610	if ((aExp | aSig) == 0) {
	611	if (! bSign) {
	612	float_raise(float_flag_invalid);
	613	return floatx80_default_nan;
	614	}
	615	return a;
	616	}
	617	if (aSig && (aExp == 0)) float_raise(float_flag_denormal);
	618	if (bSign) return packFloatx80(aSign, 0, 0);
	619	return packFloatx80(aSign, 0x7FFF, U64(0x8000000000000000));
	620	}
	621	if (aExp == 0) {
	622	if (aSig == 0) return a;
	623	float_raise(float_flag_denormal);
	624	normalizeFloatx80Subnormal(aSig, &aExp, &aSig);
	625	}
	626	if (bExp == 0) {
	627	if (bSig == 0) return a;
	628	float_raise(float_flag_denormal);
	629	normalizeFloatx80Subnormal(bSig, &bExp, &bSig);
	630	}
	631
	632	if (bExp > 0x400E) {
	633	/* generate appropriate overflow/underflow */
	634	return roundAndPackFloatx80(80, aSign,
	635	bSign ? -0x3FFF : 0x7FFF, aSig, 0);
	636	}
	637	if (bExp < 0x3FFF) return a;
	638
	639	int shiftCount = 0x403E - bExp;
	640	bSig >>= shiftCount; /* drop the fractional bits of `b', leaving its integer part */
	641	sbits32 scale = bSig;
	642	if (bSign) scale = -scale; /* -32768..32767 */
	643	return
	644	roundAndPackFloatx80(80, aSign, aExp+scale, aSig, 0);
	645	}

trunk/3rdparty/softfloat/fyl2x.c
r0	r242847
	1	/*============================================================================
	2	This source file is an extension to the SoftFloat IEC/IEEE Floating-point
	3	Arithmetic Package, Release 2b, written for Bochs (x86 architecture simulator)
	4	floating point emulation.
	5
	6	THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE. Although reasonable effort has
	7	been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT TIMES
	8	RESULT IN INCORRECT BEHAVIOR. USE OF THIS SOFTWARE IS RESTRICTED TO PERSONS
	9	AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ALL LOSSES,
	10	COSTS, OR OTHER PROBLEMS THEY INCUR DUE TO THE SOFTWARE, AND WHO FURTHERMORE
	11	EFFECTIVELY INDEMNIFY JOHN HAUSER AND THE INTERNATIONAL COMPUTER SCIENCE
	12	INSTITUTE (possibly via similar legal warning) AGAINST ALL LOSSES, COSTS, OR
	13	OTHER PROBLEMS INCURRED BY THEIR CUSTOMERS AND CLIENTS DUE TO THE SOFTWARE.
	14
	15	Derivative works are acceptable, even for commercial purposes, so long as
	16	(1) the source code for the derivative work includes prominent notice that
	17	the work is derivative, and (2) the source code includes prominent notice with
	18	these four paragraphs for those parts of this code that are retained.
	19	=============================================================================*/
	20
	21	/*============================================================================
	22	* Written for Bochs (x86 architecture simulator) by
	23	* Stanislav Shwartsman [sshwarts at sourceforge net]
	24	* Adapted for lib/softfloat in MESS by Hans Ostermeyer (03/2012)
	25	* ==========================================================================*/
	26
	27	#define FLOAT128
	28
	29	#define USE_estimateDiv128To64
	30	#include "mamesf.h"
	31	#include "softfloat.h"
	32	//#include "softfloat-specialize"
	33	#include "fpu_constant.h"
	34
	35	static const floatx80 floatx80_log10_2 = packFloatx80(0, 0x3ffd, U64(0x9a209a84fbcff798));
	36	static const floatx80 floatx80_ln_2 = packFloatx80(0, 0x3ffe, U64(0xb17217f7d1cf79ac));
	37	static const floatx80 floatx80_one = packFloatx80(0, 0x3fff, U64(0x8000000000000000));
	38	static const floatx80 floatx80_default_nan = packFloatx80(0, 0xffff, U64(0xffffffffffffffff));
	39
	40	#define packFloat_128(zHi, zLo) {(zHi), (zLo)}
	41	#define PACK_FLOAT_128(hi,lo) packFloat_128(LIT64(hi),LIT64(lo))
	42
	43	#define EXP_BIAS 0x3FFF
	44
	45	/*----------------------------------------------------------------------------
	46	| Returns the fraction bits of the extended double-precision floating-point
	47	| value `a'.
	48	*----------------------------------------------------------------------------*/
	49
	50	INLINE bits64 extractFloatx80Frac( floatx80 a )
	51	{
	52	return a.low; /* the fraction is the whole `low' member */
	53
	54	}
	55
	56	/*----------------------------------------------------------------------------
	57	| Returns the exponent bits of the extended double-precision floating-point
	58	| value `a'.
	59	*----------------------------------------------------------------------------*/
	60
	61	INLINE int32 extractFloatx80Exp( floatx80 a )
	62	{
	63	return a.high & 0x7FFF; /* low 15 bits of `high'; bit 15 is read by extractFloatx80Sign */
	64
	65	}
	66
	67	/*----------------------------------------------------------------------------
	68	| Returns the sign bit of the extended double-precision floating-point value
	69	| `a'.
	70	*----------------------------------------------------------------------------*/
	71
	72	INLINE flag extractFloatx80Sign( floatx80 a )
	73	{
	74	return a.high>>15; /* everything above the 15 exponent bits of `high' */
	75
	76	}
	77
	78	#if 0
	79	/*----------------------------------------------------------------------------
	80	| Takes extended double-precision floating-point NaN `a' and returns the
	81	| appropriate NaN result. If `a' is a signaling NaN, the invalid exception
	82	| is raised.
	83	*----------------------------------------------------------------------------*/
	84
	85	INLINE floatx80 propagateFloatx80NaNOneArg(floatx80 a)
	86	{
	87	if (floatx80_is_signaling_nan(a))
	88	float_raise(float_flag_invalid);
	89
	90	a.low |= U64(0xC000000000000000); /* force the top two significand bits on in the result */
	91
	92	return a;
	93	}
	94	#endif
	95
	96	/*----------------------------------------------------------------------------
	97	| Normalizes the subnormal extended double-precision floating-point value
	98	| represented by the denormalized significand `aSig'. The normalized exponent
	99	| and significand are stored at the locations pointed to by `zExpPtr' and
	100	| `zSigPtr', respectively.
	101	*----------------------------------------------------------------------------*/
	102
	103	INLINE void normalizeFloatx80Subnormal(UINT64 aSig, INT32 *zExpPtr, UINT64 *zSigPtr)
	104	{
	105	int shiftCount = countLeadingZeros64(aSig);
	106	*zSigPtr = aSig<<shiftCount; /* shift the leading one up to bit 63 */
	107	*zExpPtr = 1 - shiftCount; /* exponent drops by the shift distance */
	108	}
	109
	110
	111	/*----------------------------------------------------------------------------
	112	| Returns 1 if the extended double-precision floating-point value `a' is a
	113	| NaN; otherwise returns 0.
	114	*----------------------------------------------------------------------------*/
	115
	116	INLINE int floatx80_is_nan(floatx80 a)
	117	{
	118	return ((a.high & 0x7FFF) == 0x7FFF) && (INT64) (a.low<<1); /* all-ones exponent and a non-zero significand below bit 63 */
	119	}
	120
	121	/*----------------------------------------------------------------------------
	122	\| Takes two extended double-precision floating-point values `a' and `b', one
	123	\| of which is a NaN, and returns the appropriate NaN result. If either `a' or
	124	\| `b' is a signaling NaN, the invalid exception is raised.
	125	----------------------------------------------------------------------------/
	126
	127	static floatx80 propagateFloatx80NaN(floatx80 a, floatx80 b)
	128	{
	129	int aIsNaN = floatx80_is_nan(a);
	130	int aIsSignalingNaN = floatx80_is_signaling_nan(a);
	131	int bIsNaN = floatx80_is_nan(b);
	132	int bIsSignalingNaN = floatx80_is_signaling_nan(b);
	133	a.low \|= U64(0xC000000000000000);
	134	b.low \|= U64(0xC000000000000000);
	135	if (aIsSignalingNaN \| bIsSignalingNaN) float_raise(float_flag_invalid);
	136	if (aIsSignalingNaN) {
	137	if (bIsSignalingNaN) goto returnLargerSignificand;
	138	return bIsNaN ? b : a;
	139	}
	140	else if (aIsNaN) {
	141	if (bIsSignalingNaN \| ! bIsNaN) return a;
	142	returnLargerSignificand:
	143	if (a.low < b.low) return b;
	144	if (b.low < a.low) return a;
	145	return (a.high < b.high) ? a : b;
	146	}
	147	else {
	148	return b;
	149	}
	150	}
	151
	152	static const float128 float128_one =
	153	packFloat_128(U64(0x3fff000000000000), U64(0x0000000000000000));
	154	static const float128 float128_two =
	155	packFloat_128(U64(0x4000000000000000), U64(0x0000000000000000));
	156
	157	static const float128 float128_ln2inv2 =
	158	packFloat_128(U64(0x400071547652b82f), U64(0xe1777d0ffda0d23a));
	159
	160	#define SQRT2_HALF_SIG U64(0xb504f333f9de6484)
	161
	162	extern float128 OddPoly(float128 x, float128 *arr, unsigned n);
	163
	164	#define L2_ARR_SIZE 9
	165
	166	static float128 ln_arr[L2_ARR_SIZE] =
	167	{
	168	PACK_FLOAT_128(0x3fff000000000000, 0x0000000000000000), /* 1 */
	169	PACK_FLOAT_128(0x3ffd555555555555, 0x5555555555555555), /* 3 */
	170	PACK_FLOAT_128(0x3ffc999999999999, 0x999999999999999a), /* 5 */
	171	PACK_FLOAT_128(0x3ffc249249249249, 0x2492492492492492), /* 7 */
	172	PACK_FLOAT_128(0x3ffbc71c71c71c71, 0xc71c71c71c71c71c), /* 9 */
	173	PACK_FLOAT_128(0x3ffb745d1745d174, 0x5d1745d1745d1746), /* 11 */
	174	PACK_FLOAT_128(0x3ffb3b13b13b13b1, 0x3b13b13b13b13b14), /* 13 */
	175	PACK_FLOAT_128(0x3ffb111111111111, 0x1111111111111111), /* 15 */
	176	PACK_FLOAT_128(0x3ffae1e1e1e1e1e1, 0xe1e1e1e1e1e1e1e2) /* 17 */
	177	};
	178
	179	static float128 poly_ln(float128 x1)
	180	{
	181	/*
	182	//
	183	// 3 5 7 9 11 13 15
	184	// 1+u u u u u u u u
	185	// 1/2 ln --- ~ u + --- + --- + --- + --- + ---- + ---- + ---- =
	186	// 1-u 3 5 7 9 11 13 15
	187	//
	188	// 2 4 6 8 10 12 14
	189	// u u u u u u u
	190	// = u * [ 1 + --- + --- + --- + --- + ---- + ---- + ---- ] =
	191	// 3 5 7 9 11 13 15
	192	//
	193	// 3 3
	194	// -- 4k -- 4k+2
	195	// p(u) = > C * u q(u) = > C * u
	196	// -- 2k -- 2k+1
	197	// k=0 k=0
	198	//
	199	// 1+u 2
	200	// 1/2 ln --- ~ u * [ p(u) + u * q(u) ]
	201	// 1-u
	202	//
	203	*/
	204	return OddPoly(x1, ln_arr, L2_ARR_SIZE);
	205	}
	206
	207	/* required sqrt(2)/2 < x < sqrt(2) */
	208	static float128 poly_l2(float128 x)
	209	{
	210	/* using float128 for approximation */
	211	float128 x_p1 = float128_add(x, float128_one);
	212	float128 x_m1 = float128_sub(x, float128_one);
	213	x = float128_div(x_m1, x_p1);
	214	x = poly_ln(x);
	215	x = float128_mul(x, float128_ln2inv2);
	216	return x;
	217	}
	218
	219	static float128 poly_l2p1(float128 x)
	220	{
	221	/* using float128 for approximation */
	222	float128 x_p2 = float128_add(x, float128_two);
	223	x = float128_div(x, x_p2);
	224	x = poly_ln(x);
	225	x = float128_mul(x, float128_ln2inv2);
	226	return x;
	227	}
	228
	229	// =================================================
	230	// FYL2X Compute y * log (x)
	231	// 2
	232	// =================================================
	233
	234	//
	235	// Uses the following identities:
	236	//
	237	// 1. ----------------------------------------------------------
	238	// ln(x)
	239	// log (x) = -------, ln (x*y) = ln(x) + ln(y)
	240	// 2 ln(2)
	241	//
	242	// 2. ----------------------------------------------------------
	243	// 1+u x-1
	244	// ln (x) = ln -----, when u = -----
	245	// 1-u x+1
	246	//
	247	// 3. ----------------------------------------------------------
	248	// 3 5 7 2n+1
	249	// 1+u u u u u
	250	// ln ----- = 2 [ u + --- + --- + --- + ... + ------ + ... ]
	251	// 1-u 3 5 7 2n+1
	252	//
	253
	254	static floatx80 fyl2x(floatx80 a, floatx80 b)
	255	{
	256	UINT64 aSig = extractFloatx80Frac(a);
	257	INT32 aExp = extractFloatx80Exp(a);
	258	int aSign = extractFloatx80Sign(a);
	259	UINT64 bSig = extractFloatx80Frac(b);
	260	INT32 bExp = extractFloatx80Exp(b);
	261	int bSign = extractFloatx80Sign(b);
	262
	263	int zSign = bSign ^ 1;
	264
	265	if (aExp == 0x7FFF) {
	266	if ((UINT64) (aSig<<1)
	267	\|\| ((bExp == 0x7FFF) && (UINT64) (bSig<<1)))
	268	{
	269	return propagateFloatx80NaN(a, b);
	270	}
	271	if (aSign)
	272	{
	273	invalid:
	274	float_raise(float_flag_invalid);
	275	return floatx80_default_nan;
	276	}
	277	else {
	278	if (bExp == 0) {
	279	if (bSig == 0) goto invalid;
	280	float_raise(float_flag_denormal);
	281	}
	282	return packFloatx80(bSign, 0x7FFF, U64(0x8000000000000000));
	283	}
	284	}
	285	if (bExp == 0x7FFF)
	286	{
	287	if ((UINT64) (bSig<<1)) return propagateFloatx80NaN(a, b);
	288	if (aSign && (UINT64)(aExp \| aSig)) goto invalid;
	289	if (aSig && (aExp == 0))
	290	float_raise(float_flag_denormal);
	291	if (aExp < 0x3FFF) {
	292	return packFloatx80(zSign, 0x7FFF, U64(0x8000000000000000));
	293	}
	294	if (aExp == 0x3FFF && ((UINT64) (aSig<<1) == 0)) goto invalid;
	295	return packFloatx80(bSign, 0x7FFF, U64(0x8000000000000000));
	296	}
	297	if (aExp == 0) {
	298	if (aSig == 0) {
	299	if ((bExp \| bSig) == 0) goto invalid;
	300	float_raise(float_flag_divbyzero);
	301	return packFloatx80(zSign, 0x7FFF, U64(0x8000000000000000));
	302	}
	303	if (aSign) goto invalid;
	304	float_raise(float_flag_denormal);
	305	normalizeFloatx80Subnormal(aSig, &aExp, &aSig);
	306	}
	307	if (aSign) goto invalid;
	308	if (bExp == 0) {
	309	if (bSig == 0) {
	310	if (aExp < 0x3FFF) return packFloatx80(zSign, 0, 0);
	311	return packFloatx80(bSign, 0, 0);
	312	}
	313	float_raise(float_flag_denormal);
	314	normalizeFloatx80Subnormal(bSig, &bExp, &bSig);
	315	}
	316	if (aExp == 0x3FFF && ((UINT64) (aSig<<1) == 0))
	317	return packFloatx80(bSign, 0, 0);
	318
	319	float_raise(float_flag_inexact);
	320
	321	int ExpDiff = aExp - 0x3FFF;
	322	aExp = 0;
	323	if (aSig >= SQRT2_HALF_SIG) {
	324	ExpDiff++;
	325	aExp--;
	326	}
	327
	328	/* ******************************** */
	329	/* using float128 for approximation */
	330	/* ******************************** */
	331
	332	UINT64 zSig0, zSig1;
	333	shift128Right(aSig<<1, 0, 16, &zSig0, &zSig1);
	334	float128 x = packFloat128(0, aExp+0x3FFF, zSig0, zSig1);
	335	x = poly_l2(x);
	336	x = float128_add(x, int64_to_float128((INT64) ExpDiff));
	337	return floatx80_mul(b, float128_to_floatx80(x));
	338	}
	339
	340	// =================================================
	341	// FYL2XP1 Compute y * log (x + 1)
	342	// 2
	343	// =================================================
	344
	345	//
	346	// Uses the following identities:
	347	//
	348	// 1. ----------------------------------------------------------
	349	// ln(x)
	350	// log (x) = -------
	351	// 2 ln(2)
	352	//
	353	// 2. ----------------------------------------------------------
	354	// 1+u x
	355	// ln (x+1) = ln -----, when u = -----
	356	// 1-u x+2
	357	//
	358	// 3. ----------------------------------------------------------
	359	// 3 5 7 2n+1
	360	// 1+u u u u u
	361	// ln ----- = 2 [ u + --- + --- + --- + ... + ------ + ... ]
	362	// 1-u 3 5 7 2n+1
	363	//
	364
	365	floatx80 fyl2xp1(floatx80 a, floatx80 b)
	366	{
	367	INT32 aExp, bExp;
	368	UINT64 aSig, bSig, zSig0, zSig1, zSig2;
	369	int aSign, bSign;
	370
	371	aSig = extractFloatx80Frac(a);
	372	aExp = extractFloatx80Exp(a);
	373	aSign = extractFloatx80Sign(a);
	374	bSig = extractFloatx80Frac(b);
	375	bExp = extractFloatx80Exp(b);
	376	bSign = extractFloatx80Sign(b);
	377	int zSign = aSign ^ bSign;
	378
	379	if (aExp == 0x7FFF) {
	380	if ((UINT64) (aSig<<1)
	381	\|\| ((bExp == 0x7FFF) && (UINT64) (bSig<<1)))
	382	{
	383	return propagateFloatx80NaN(a, b);
	384	}
	385	if (aSign)
	386	{
	387	invalid:
	388	float_raise(float_flag_invalid);
	389	return floatx80_default_nan;
	390	}
	391	else {
	392	if (bExp == 0) {
	393	if (bSig == 0) goto invalid;
	394	float_raise(float_flag_denormal);
	395	}
	396	return packFloatx80(bSign, 0x7FFF, U64(0x8000000000000000));
	397	}
	398	}
	399	if (bExp == 0x7FFF)
	400	{
	401	if ((UINT64) (bSig<<1))
	402	return propagateFloatx80NaN(a, b);
	403
	404	if (aExp == 0) {
	405	if (aSig == 0) goto invalid;
	406	float_raise(float_flag_denormal);
	407	}
	408
	409	return packFloatx80(zSign, 0x7FFF, U64(0x8000000000000000));
	410	}
	411	if (aExp == 0) {
	412	if (aSig == 0) {
	413	if (bSig && (bExp == 0)) float_raise(float_flag_denormal);
	414	return packFloatx80(zSign, 0, 0);
	415	}
	416	float_raise(float_flag_denormal);
	417	normalizeFloatx80Subnormal(aSig, &aExp, &aSig);
	418	}
	419	if (bExp == 0) {
	420	if (bSig == 0) return packFloatx80(zSign, 0, 0);
	421	float_raise(float_flag_denormal);
	422	normalizeFloatx80Subnormal(bSig, &bExp, &bSig);
	423	}
	424
	425	float_raise(float_flag_inexact);
	426
	427	if (aSign && aExp >= 0x3FFF)
	428	return a;
	429
	430	if (aExp >= 0x3FFC) // big argument
	431	{
	432	return fyl2x(floatx80_add(a, floatx80_one), b);
	433	}
	434
	435	// handle tiny argument
	436	if (aExp < EXP_BIAS-70)
	437	{
	438	// first order approximation, return (a*b)/ln(2)
	439	INT32 zExp = aExp + FLOAT_LN2INV_EXP - 0x3FFE;
	440
	441	mul128By64To192(FLOAT_LN2INV_HI, FLOAT_LN2INV_LO, aSig, &zSig0, &zSig1, &zSig2);
	442	if (0 < (INT64) zSig0) {
	443	shortShift128Left(zSig0, zSig1, 1, &zSig0, &zSig1);
	444	--zExp;
	445	}
	446
	447	zExp = zExp + bExp - 0x3FFE;
	448	mul128By64To192(zSig0, zSig1, bSig, &zSig0, &zSig1, &zSig2);
	449	if (0 < (INT64) zSig0) {
	450	shortShift128Left(zSig0, zSig1, 1, &zSig0, &zSig1);
	451	--zExp;
	452	}
	453
	454	return
	455	roundAndPackFloatx80(80, aSign ^ bSign, zExp, zSig0, zSig1);
	456	}
	457
	458	/* ******************************** */
	459	/* using float128 for approximation */
	460	/* ******************************** */
	461
	462	shift128Right(aSig<<1, 0, 16, &zSig0, &zSig1);
	463	float128 x = packFloat128(aSign, aExp, zSig0, zSig1);
	464	x = poly_l2p1(x);
	465	return floatx80_mul(b, float128_to_floatx80(x));
	466	}
	467
	468	floatx80 floatx80_flognp1(floatx80 a)
	469	{
	470	return fyl2xp1(a, floatx80_ln_2);
	471	}
	472
	473	floatx80 floatx80_flogn(floatx80 a)
	474	{
	475	return fyl2x(a, floatx80_ln_2);
	476	}
	477
	478	floatx80 floatx80_flog2(floatx80 a)
	479	{
	480	return fyl2x(a, floatx80_one);
	481	}
	482
	483	floatx80 floatx80_flog10(floatx80 a)
	484	{
	485	return fyl2x(a, floatx80_log10_2);
	486	}

trunk/3rdparty/softfloat/mamesf.h
r0	r242847
	1	/*----------------------------------------------------------------------------
	2	\| One of the macros `BIGENDIAN' or `LITTLEENDIAN' must be defined.
	3	----------------------------------------------------------------------------/
	4	#ifdef LSB_FIRST
	5	#define LITTLEENDIAN
	6	#else
	7	#define BIGENDIAN
	8	#endif
	9
	10	/*----------------------------------------------------------------------------
	11	\| The macro `BITS64' can be defined to indicate that 64-bit integer types are
	12	\| supported by the compiler.
	13	----------------------------------------------------------------------------/
	14	#define BITS64
	15
	16	/*----------------------------------------------------------------------------
	17	\| Each of the following `typedef's defines the most convenient type that holds
	18	\| integers of at least as many bits as specified. For example, `uint8' should
	19	\| be the most convenient type that can hold unsigned integers of as many as
	20	\| 8 bits. The `flag' type must be able to hold either a 0 or 1. For most
	21	\| implementations of C, `flag', `uint8', and `int8' should all be `typedef'ed
	22	\| to the same as `int'.
	23	----------------------------------------------------------------------------/
	24	#include "emu.h"
	25
	26	typedef INT8 flag;
	27	typedef UINT8 uint8;
	28	typedef INT8 int8;
	29	typedef UINT16 uint16;
	30	typedef INT16 int16;
	31	typedef UINT32 uint32;
	32	typedef INT32 int32;
	33	typedef UINT64 uint64;
	34	typedef INT64 int64;
	35
	36	/*----------------------------------------------------------------------------
	37	\| Each of the following `typedef's defines a type that holds integers
	38	\| of _exactly_ the number of bits specified. For instance, for most
	39	\| implementation of C, `bits16' and `sbits16' should be `typedef'ed to
	40	\| `unsigned short int' and `signed short int' (or `short int'), respectively.
	41	----------------------------------------------------------------------------/
	42	typedef UINT8 bits8;
	43	typedef INT8 sbits8;
	44	typedef UINT16 bits16;
	45	typedef INT16 sbits16;
	46	typedef UINT32 bits32;
	47	typedef INT32 sbits32;
	48	typedef UINT64 bits64;
	49	typedef INT64 sbits64;
	50
	51	/*----------------------------------------------------------------------------
	52	\| The `LIT64' macro takes as its argument a textual integer literal and
	53	\| if necessary ``marks'' the literal as having a 64-bit integer type.
	54	\| For example, the GNU C Compiler (`gcc') requires that 64-bit literals be
	55	\| appended with the letters `LL' standing for `long long', which is `gcc's
	56	\| name for the 64-bit integer type. Some compilers may allow `LIT64' to be
	57	\| defined as the identity macro: `#define LIT64( a ) a'.
	58	----------------------------------------------------------------------------/
	59	#define LIT64( a ) a##ULL
	60
	61	/*----------------------------------------------------------------------------
	62	\| The macro `INLINE' can be used before functions that should be inlined. If
	63	\| a compiler does not support explicit inlining, this macro should be defined
	64	\| to be `static'.
	65	----------------------------------------------------------------------------/
	66	// MAME defines INLINE

trunk/3rdparty/softfloat/milieu.h
r0	r242847
	1
	2	/*============================================================================
	3
	4	This C header file is part of the SoftFloat IEC/IEEE Floating-point Arithmetic
	5	Package, Release 2b.
	6
	7	Written by John R. Hauser. This work was made possible in part by the
	8	International Computer Science Institute, located at Suite 600, 1947 Center
	9	Street, Berkeley, California 94704. Funding was partially provided by the
	10	National Science Foundation under grant MIP-9311980. The original version
	11	of this code was written as part of a project to build a fixed-point vector
	12	processor in collaboration with the University of California at Berkeley,
	13	overseen by Profs. Nelson Morgan and John Wawrzynek. More information
	14	is available through the Web page `http://www.cs.berkeley.edu/~jhauser/
	15	arithmetic/SoftFloat.html'.
	16
	17	THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE. Although reasonable effort has
	18	been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT TIMES
	19	RESULT IN INCORRECT BEHAVIOR. USE OF THIS SOFTWARE IS RESTRICTED TO PERSONS
	20	AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ALL LOSSES,
	21	COSTS, OR OTHER PROBLEMS THEY INCUR DUE TO THE SOFTWARE, AND WHO FURTHERMORE
	22	EFFECTIVELY INDEMNIFY JOHN HAUSER AND THE INTERNATIONAL COMPUTER SCIENCE
	23	INSTITUTE (possibly via similar legal warning) AGAINST ALL LOSSES, COSTS, OR
	24	OTHER PROBLEMS INCURRED BY THEIR CUSTOMERS AND CLIENTS DUE TO THE SOFTWARE.
	25
	26	Derivative works are acceptable, even for commercial purposes, so long as
	27	(1) the source code for the derivative work includes prominent notice that
	28	the work is derivative, and (2) the source code includes prominent notice with
	29	these four paragraphs for those parts of this code that are retained.
	30
	31	=============================================================================*/
	32
	33	/*----------------------------------------------------------------------------
	34	\| Include common integer types and flags.
	35	----------------------------------------------------------------------------/
	36	#include "mamesf.h"
	37
	38	/*----------------------------------------------------------------------------
	39	\| Symbolic Boolean literals.
	40	----------------------------------------------------------------------------/
	41	#define FALSE 0
	42	#define TRUE 1

trunk/3rdparty/softfloat/softfloat-macros
r0	r242847
	1
	2	/*============================================================================
	3
	4	This C source fragment is part of the SoftFloat IEC/IEEE Floating-point
	5	Arithmetic Package, Release 2b.
	6
	7	Written by John R. Hauser. This work was made possible in part by the
	8	International Computer Science Institute, located at Suite 600, 1947 Center
	9	Street, Berkeley, California 94704. Funding was partially provided by the
	10	National Science Foundation under grant MIP-9311980. The original version
	11	of this code was written as part of a project to build a fixed-point vector
	12	processor in collaboration with the University of California at Berkeley,
	13	overseen by Profs. Nelson Morgan and John Wawrzynek. More information
	14	is available through the Web page `http://www.cs.berkeley.edu/~jhauser/
	15	arithmetic/SoftFloat.html'.
	16
	17	THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE. Although reasonable effort has
	18	been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT TIMES
	19	RESULT IN INCORRECT BEHAVIOR. USE OF THIS SOFTWARE IS RESTRICTED TO PERSONS
	20	AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ALL LOSSES,
	21	COSTS, OR OTHER PROBLEMS THEY INCUR DUE TO THE SOFTWARE, AND WHO FURTHERMORE
	22	EFFECTIVELY INDEMNIFY JOHN HAUSER AND THE INTERNATIONAL COMPUTER SCIENCE
	23	INSTITUTE (possibly via similar legal notice) AGAINST ALL LOSSES, COSTS, OR
	24	OTHER PROBLEMS INCURRED BY THEIR CUSTOMERS AND CLIENTS DUE TO THE SOFTWARE.
	25
	26	Derivative works are acceptable, even for commercial purposes, so long as
	27	(1) the source code for the derivative work includes prominent notice that
	28	the work is derivative, and (2) the source code includes prominent notice with
	29	these four paragraphs for those parts of this code that are retained.
	30
	31	=============================================================================*/
	32
	33	/*----------------------------------------------------------------------------
	34	\| Shifts `a' right by the number of bits given in `count'. If any nonzero
	35	\| bits are shifted off, they are ``jammed'' into the least significant bit of
	36	\| the result by setting the least significant bit to 1. The value of `count'
	37	\| can be arbitrarily large; in particular, if `count' is greater than 32, the
	38	\| result will be either 0 or 1, depending on whether `a' is zero or nonzero.
	39	\| The result is stored in the location pointed to by `zPtr'.
	40	----------------------------------------------------------------------------/
	41
	42	INLINE void shift32RightJamming( bits32 a, int16 count, bits32 *zPtr )
	43	{
	44	bits32 z;
	45
	46	if ( count == 0 ) {
	47	z = a;
	48	}
	49	else if ( count < 32 ) {
	50	z = ( a>>count ) \| ( ( a<<( ( - count ) & 31 ) ) != 0 );
	51	}
	52	else {
	53	z = ( a != 0 );
	54	}
	55	*zPtr = z;
	56
	57	}
	58
	59	/*----------------------------------------------------------------------------
	60	\| Shifts `a' right by the number of bits given in `count'. If any nonzero
	61	\| bits are shifted off, they are ``jammed'' into the least significant bit of
	62	\| the result by setting the least significant bit to 1. The value of `count'
	63	\| can be arbitrarily large; in particular, if `count' is greater than 64, the
	64	\| result will be either 0 or 1, depending on whether `a' is zero or nonzero.
	65	\| The result is stored in the location pointed to by `zPtr'.
	66	----------------------------------------------------------------------------/
	67
	68	INLINE void shift64RightJamming( bits64 a, int16 count, bits64 *zPtr )
	69	{
	70	bits64 z;
	71
	72	if ( count == 0 ) {
	73	z = a;
	74	}
	75	else if ( count < 64 ) {
	76	z = ( a>>count ) \| ( ( a<<( ( - count ) & 63 ) ) != 0 );
	77	}
	78	else {
	79	z = ( a != 0 );
	80	}
	81	*zPtr = z;
	82
	83	}
	84
	85	/*----------------------------------------------------------------------------
	86	\| Shifts the 128-bit value formed by concatenating `a0' and `a1' right by 64
	87	\| _plus_ the number of bits given in `count'. The shifted result is at most
	88	\| 64 nonzero bits; this is stored at the location pointed to by `z0Ptr'. The
	89	\| bits shifted off form a second 64-bit result as follows: The _last_ bit
	90	\| shifted off is the most-significant bit of the extra result, and the other
	91	\| 63 bits of the extra result are all zero if and only if _all_but_the_last_
	92	\| bits shifted off were all zero. This extra result is stored in the location
	93	\| pointed to by `z1Ptr'. The value of `count' can be arbitrarily large.
	94	\| (This routine makes more sense if `a0' and `a1' are considered to form
	95	\| a fixed-point value with binary point between `a0' and `a1'. This fixed-
	96	\| point value is shifted right by the number of bits given in `count', and
	97	\| the integer part of the result is returned at the location pointed to by
	98	\| `z0Ptr'. The fractional part of the result may be slightly corrupted as
	99	\| described above, and is returned at the location pointed to by `z1Ptr'.)
	100	----------------------------------------------------------------------------/
	101
	102	INLINE void
	103	shift64ExtraRightJamming(
	104	bits64 a0, bits64 a1, int16 count, bits64 z0Ptr, bits64 z1Ptr )
	105	{
	106	bits64 z0, z1;
	107	int8 negCount = ( - count ) & 63;
	108
	109	if ( count == 0 ) {
	110	z1 = a1;
	111	z0 = a0;
	112	}
	113	else if ( count < 64 ) {
	114	z1 = ( a0<<negCount ) \| ( a1 != 0 );
	115	z0 = a0>>count;
	116	}
	117	else {
	118	if ( count == 64 ) {
	119	z1 = a0 \| ( a1 != 0 );
	120	}
	121	else {
	122	z1 = ( ( a0 \| a1 ) != 0 );
	123	}
	124	z0 = 0;
	125	}
	126	*z1Ptr = z1;
	127	*z0Ptr = z0;
	128
	129	}
	130
	131	/*----------------------------------------------------------------------------
	132	\| Shifts the 128-bit value formed by concatenating `a0' and `a1' right by the
	133	\| number of bits given in `count'. Any bits shifted off are lost. The value
	134	\| of `count' can be arbitrarily large; in particular, if `count' is greater
	135	\| than 128, the result will be 0. The result is broken into two 64-bit pieces
	136	\| which are stored at the locations pointed to by `z0Ptr' and `z1Ptr'.
	137	----------------------------------------------------------------------------/
	138
	139	INLINE void
	140	shift128Right(
	141	bits64 a0, bits64 a1, int16 count, bits64 z0Ptr, bits64 z1Ptr )
	142	{
	143	bits64 z0, z1;
	144	int8 negCount = ( - count ) & 63;
	145
	146	if ( count == 0 ) {
	147	z1 = a1;
	148	z0 = a0;
	149	}
	150	else if ( count < 64 ) {
	151	z1 = ( a0<<negCount ) \| ( a1>>count );
	152	z0 = a0>>count;
	153	}
	154	else {
	155	z1 = ( count < 64 ) ? ( a0>>( count & 63 ) ) : 0;
	156	z0 = 0;
	157	}
	158	*z1Ptr = z1;
	159	*z0Ptr = z0;
	160
	161	}
	162
	163	/*----------------------------------------------------------------------------
	164	\| Shifts the 128-bit value formed by concatenating `a0' and `a1' right by the
	165	\| number of bits given in `count'. If any nonzero bits are shifted off, they
	166	\| are ``jammed'' into the least significant bit of the result by setting the
	167	\| least significant bit to 1. The value of `count' can be arbitrarily large;
	168	\| in particular, if `count' is greater than 128, the result will be either
	169	\| 0 or 1, depending on whether the concatenation of `a0' and `a1' is zero or
	170	\| nonzero. The result is broken into two 64-bit pieces which are stored at
	171	\| the locations pointed to by `z0Ptr' and `z1Ptr'.
	172	----------------------------------------------------------------------------/
	173
	174	INLINE void
	175	shift128RightJamming(
	176	bits64 a0, bits64 a1, int16 count, bits64 z0Ptr, bits64 z1Ptr )
	177	{
	178	bits64 z0, z1;
	179	int8 negCount = ( - count ) & 63;
	180
	181	if ( count == 0 ) {
	182	z1 = a1;
	183	z0 = a0;
	184	}
	185	else if ( count < 64 ) {
	186	z1 = ( a0<<negCount ) \| ( a1>>count ) \| ( ( a1<<negCount ) != 0 );
	187	z0 = a0>>count;
	188	}
	189	else {
	190	if ( count == 64 ) {
	191	z1 = a0 \| ( a1 != 0 );
	192	}
	193	else if ( count < 128 ) {
	194	z1 = ( a0>>( count & 63 ) ) \| ( ( ( a0<<negCount ) \| a1 ) != 0 );
	195	}
	196	else {
	197	z1 = ( ( a0 \| a1 ) != 0 );
	198	}
	199	z0 = 0;
	200	}
	201	*z1Ptr = z1;
	202	*z0Ptr = z0;
	203
	204	}
	205
	206	/*----------------------------------------------------------------------------
	207	\| Shifts the 192-bit value formed by concatenating `a0', `a1', and `a2' right
	208	\| by 64 _plus_ the number of bits given in `count'. The shifted result is
	209	\| at most 128 nonzero bits; these are broken into two 64-bit pieces which are
	210	\| stored at the locations pointed to by `z0Ptr' and `z1Ptr'. The bits shifted
	211	\| off form a third 64-bit result as follows: The _last_ bit shifted off is
	212	\| the most-significant bit of the extra result, and the other 63 bits of the
	213	\| extra result are all zero if and only if _all_but_the_last_ bits shifted off
	214	\| were all zero. This extra result is stored in the location pointed to by
	215	\| `z2Ptr'. The value of `count' can be arbitrarily large.
	216	\| (This routine makes more sense if `a0', `a1', and `a2' are considered
	217	\| to form a fixed-point value with binary point between `a1' and `a2'. This
	218	\| fixed-point value is shifted right by the number of bits given in `count',
	219	\| and the integer part of the result is returned at the locations pointed to
	220	\| by `z0Ptr' and `z1Ptr'. The fractional part of the result may be slightly
	221	\| corrupted as described above, and is returned at the location pointed to by
	222	\| `z2Ptr'.)
	223	----------------------------------------------------------------------------/
	224
	225	INLINE void
	226	shift128ExtraRightJamming(
	227	bits64 a0,
	228	bits64 a1,
	229	bits64 a2,
	230	int16 count,
	231	bits64 *z0Ptr,
	232	bits64 *z1Ptr,
	233	bits64 *z2Ptr
	234	)
	235	{
	236	bits64 z0, z1, z2;
	237	int8 negCount = ( - count ) & 63;
	238
	239	if ( count == 0 ) {
	240	z2 = a2;
	241	z1 = a1;
	242	z0 = a0;
	243	}
	244	else {
	245	if ( count < 64 ) {
	246	z2 = a1<<negCount;
	247	z1 = ( a0<<negCount ) \| ( a1>>count );
	248	z0 = a0>>count;
	249	}
	250	else {
	251	if ( count == 64 ) {
	252	z2 = a1;
	253	z1 = a0;
	254	}
	255	else {
	256	a2 \|= a1;
	257	if ( count < 128 ) {
	258	z2 = a0<<negCount;
	259	z1 = a0>>( count & 63 );
	260	}
	261	else {
	262	z2 = ( count == 128 ) ? a0 : ( a0 != 0 );
	263	z1 = 0;
	264	}
	265	}
	266	z0 = 0;
	267	}
	268	z2 \|= ( a2 != 0 );
	269	}
	270	*z2Ptr = z2;
	271	*z1Ptr = z1;
	272	*z0Ptr = z0;
	273
	274	}
	275
	276	/*----------------------------------------------------------------------------
	277	\| Shifts the 128-bit value formed by concatenating `a0' and `a1' left by the
	278	\| number of bits given in `count'. Any bits shifted off are lost. The value
	279	\| of `count' must be less than 64. The result is broken into two 64-bit
	280	\| pieces which are stored at the locations pointed to by `z0Ptr' and `z1Ptr'.
	281	----------------------------------------------------------------------------/
	282
	283	INLINE void
	284	shortShift128Left(
	285	bits64 a0, bits64 a1, int16 count, bits64 z0Ptr, bits64 z1Ptr )
	286	{
	287
	288	*z1Ptr = a1<<count;
	289	*z0Ptr =
	290	( count == 0 ) ? a0 : ( a0<<count ) \| ( a1>>( ( - count ) & 63 ) );
	291
	292	}
	293
	294	/*----------------------------------------------------------------------------
	295	\| Shifts the 192-bit value formed by concatenating `a0', `a1', and `a2' left
	296	\| by the number of bits given in `count'. Any bits shifted off are lost.
	297	\| The value of `count' must be less than 64. The result is broken into three
	298	\| 64-bit pieces which are stored at the locations pointed to by `z0Ptr',
	299	\| `z1Ptr', and `z2Ptr'.
	300	----------------------------------------------------------------------------/
	301
	302	INLINE void
	303	shortShift192Left(
	304	bits64 a0,
	305	bits64 a1,
	306	bits64 a2,
	307	int16 count,
	308	bits64 *z0Ptr,
	309	bits64 *z1Ptr,
	310	bits64 *z2Ptr
	311	)
	312	{
	313	bits64 z0, z1, z2;
	314	int8 negCount;
	315
	316	z2 = a2<<count;
	317	z1 = a1<<count;
	318	z0 = a0<<count;
	319	if ( 0 < count ) {
	320	negCount = ( ( - count ) & 63 );
	321	z1 \|= a2>>negCount;
	322	z0 \|= a1>>negCount;
	323	}
	324	*z2Ptr = z2;
	325	*z1Ptr = z1;
	326	*z0Ptr = z0;
	327
	328	}
	329
	330	/*----------------------------------------------------------------------------
	331	\| Adds the 128-bit value formed by concatenating `a0' and `a1' to the 128-bit
	332	\| value formed by concatenating `b0' and `b1'. Addition is modulo 2^128, so
	333	\| any carry out is lost. The result is broken into two 64-bit pieces which
	334	\| are stored at the locations pointed to by `z0Ptr' and `z1Ptr'.
	335	----------------------------------------------------------------------------/
	336
	337	INLINE void
	338	add128(
	339	bits64 a0, bits64 a1, bits64 b0, bits64 b1, bits64 z0Ptr, bits64 z1Ptr )
	340	{
	341	bits64 z1;
	342
	343	z1 = a1 + b1;
	344	*z1Ptr = z1;
	345	*z0Ptr = a0 + b0 + ( z1 < a1 );
	346
	347	}
	348
	349	/*----------------------------------------------------------------------------
	350	\| Adds the 192-bit value formed by concatenating `a0', `a1', and `a2' to the
	351	\| 192-bit value formed by concatenating `b0', `b1', and `b2'. Addition is
	352	\| modulo 2^192, so any carry out is lost. The result is broken into three
	353	\| 64-bit pieces which are stored at the locations pointed to by `z0Ptr',
	354	\| `z1Ptr', and `z2Ptr'.
	355	----------------------------------------------------------------------------/
	356
	357	INLINE void
	358	add192(
	359	bits64 a0,
	360	bits64 a1,
	361	bits64 a2,
	362	bits64 b0,
	363	bits64 b1,
	364	bits64 b2,
	365	bits64 *z0Ptr,
	366	bits64 *z1Ptr,
	367	bits64 *z2Ptr
	368	)
	369	{
	370	bits64 z0, z1, z2;
	371	uint8 carry0, carry1;
	372
	373	z2 = a2 + b2;
	374	carry1 = ( z2 < a2 );
	375	z1 = a1 + b1;
	376	carry0 = ( z1 < a1 );
	377	z0 = a0 + b0;
	378	z1 += carry1;
	379	z0 += ( z1 < carry1 );
	380	z0 += carry0;
	381	*z2Ptr = z2;
	382	*z1Ptr = z1;
	383	*z0Ptr = z0;
	384
	385	}
	386
	387	/*----------------------------------------------------------------------------
	388	\| Subtracts the 128-bit value formed by concatenating `b0' and `b1' from the
	389	\| 128-bit value formed by concatenating `a0' and `a1'. Subtraction is modulo
	390	\| 2^128, so any borrow out (carry out) is lost. The result is broken into two
	391	\| 64-bit pieces which are stored at the locations pointed to by `z0Ptr' and
	392	\| `z1Ptr'.
	393	----------------------------------------------------------------------------/
	394
	395	INLINE void
	396	sub128(
	397	bits64 a0, bits64 a1, bits64 b0, bits64 b1, bits64 z0Ptr, bits64 z1Ptr )
	398	{
	399
	400	*z1Ptr = a1 - b1;
	401	*z0Ptr = a0 - b0 - ( a1 < b1 );
	402
	403	}
	404
	405	/*----------------------------------------------------------------------------
	406	| Subtracts the 192-bit value formed by concatenating `b0', `b1', and `b2'
	407	| from the 192-bit value formed by concatenating `a0', `a1', and `a2'.
	408	| Subtraction is modulo 2^192, so any borrow out (carry out) is lost. The
	409	| result is broken into three 64-bit pieces which are stored at the locations
	410	| pointed to by `z0Ptr', `z1Ptr', and `z2Ptr'.
	411	*----------------------------------------------------------------------------*/
	412
	413	INLINE void
	414	 sub192(
	415	     bits64 a0,
	416	     bits64 a1,
	417	     bits64 a2,
	418	     bits64 b0,
	419	     bits64 b1,
	420	     bits64 b2,
	421	     bits64 *z0Ptr,
	422	     bits64 *z1Ptr,
	423	     bits64 *z2Ptr
	424	 )
	425	{
	426	    bits64 z0, z1, z2;
	427	    uint8 borrow0, borrow1;
	428	    /* A limb borrows from the next one up exactly when its minuend is smaller. */
	429	    z2 = a2 - b2;
	430	    borrow1 = ( a2 < b2 );
	431	    z1 = a1 - b1;
	432	    borrow0 = ( a1 < b1 );
	433	    z0 = a0 - b0;
	434	    z0 -= ( z1 < borrow1 );  /* tested before z1 is reduced: that step may wrap */
	435	    z1 -= borrow1;
	436	    z0 -= borrow0;
	437	    *z2Ptr = z2;
	438	    *z1Ptr = z1;
	439	    *z0Ptr = z0;
	440
	441	}
	442
	443	/*----------------------------------------------------------------------------
	444	| Multiplies `a' by `b' to obtain a 128-bit product. The product is broken
	445	| into two 64-bit pieces which are stored at the locations pointed to by
	446	| `z0Ptr' and `z1Ptr'.
	447	*----------------------------------------------------------------------------*/
	448
	449	INLINE void mul64To128( bits64 a, bits64 b, bits64 *z0Ptr, bits64 *z1Ptr )
	450	{
	451	    bits32 aHigh, aLow, bHigh, bLow;
	452	    bits64 z0, zMiddleA, zMiddleB, z1;
	453	    /* Split each operand into 32-bit halves and form the four partial products. */
	454	    aLow = a;
	455	    aHigh = a>>32;
	456	    bLow = b;
	457	    bHigh = b>>32;
	458	    z1 = ( (bits64) aLow ) * bLow;
	459	    zMiddleA = ( (bits64) aLow ) * bHigh;
	460	    zMiddleB = ( (bits64) aHigh ) * bLow;
	461	    z0 = ( (bits64) aHigh ) * bHigh;
	462	    zMiddleA += zMiddleB;
	463	    z0 += ( ( (bits64) ( zMiddleA < zMiddleB ) )<<32 ) + ( zMiddleA>>32 );
	464	    zMiddleA <<= 32;
	465	    z1 += zMiddleA;
	466	    z0 += ( z1 < zMiddleA );  /* carry out of the low 64 bits */
	467	    *z1Ptr = z1;
	468	    *z0Ptr = z0;
	469
	470	}
	471
	472	/*----------------------------------------------------------------------------
	473	| Multiplies the 128-bit value formed by concatenating `a0' and `a1' by
	474	| `b' to obtain a 192-bit product. The product is broken into three 64-bit
	475	| pieces which are stored at the locations pointed to by `z0Ptr', `z1Ptr', and
	476	| `z2Ptr'.
	477	*----------------------------------------------------------------------------*/
	478
	479	INLINE void
	480	 mul128By64To192(
	481	     bits64 a0,
	482	     bits64 a1,
	483	     bits64 b,
	484	     bits64 *z0Ptr,
	485	     bits64 *z1Ptr,
	486	     bits64 *z2Ptr
	487	 )
	488	{
	489	    bits64 z0, z1, z2, more1;
	490	    /* a1*b yields z1:z2; a0*b sits 64 bits higher, so its low half is added into z1. */
	491	    mul64To128( a1, b, &z1, &z2 );
	492	    mul64To128( a0, b, &z0, &more1 );
	493	    add128( z0, more1, 0, z1, &z0, &z1 );
	494	    *z2Ptr = z2;
	495	    *z1Ptr = z1;
	496	    *z0Ptr = z0;
	497
	498	}
	499
	500	/*----------------------------------------------------------------------------
	501	| Multiplies the 128-bit value formed by concatenating `a0' and `a1' to the
	502	| 128-bit value formed by concatenating `b0' and `b1' to obtain a 256-bit
	503	| product. The product is broken into four 64-bit pieces which are stored at
	504	| the locations pointed to by `z0Ptr', `z1Ptr', `z2Ptr', and `z3Ptr'.
	505	*----------------------------------------------------------------------------*/
	506
	507	INLINE void
	508	 mul128To256(
	509	     bits64 a0,
	510	     bits64 a1,
	511	     bits64 b0,
	512	     bits64 b1,
	513	     bits64 *z0Ptr,
	514	     bits64 *z1Ptr,
	515	     bits64 *z2Ptr,
	516	     bits64 *z3Ptr
	517	 )
	518	{
	519	    bits64 z0, z1, z2, z3;
	520	    bits64 more1, more2;
	521	    /* Four 64x64 partial products, each accumulated at its own 64-bit offset. */
	522	    mul64To128( a1, b1, &z2, &z3 );
	523	    mul64To128( a1, b0, &z1, &more2 );
	524	    add128( z1, more2, 0, z2, &z1, &z2 );
	525	    mul64To128( a0, b0, &z0, &more1 );
	526	    add128( z0, more1, 0, z1, &z0, &z1 );
	527	    mul64To128( a0, b1, &more1, &more2 );
	528	    add128( more1, more2, 0, z2, &more1, &z2 );
	529	    add128( z0, z1, 0, more1, &z0, &z1 );
	530	    *z3Ptr = z3;
	531	    *z2Ptr = z2;
	532	    *z1Ptr = z1;
	533	    *z0Ptr = z0;
	534
	535	}
	536
	537	/*----------------------------------------------------------------------------
	538	| Returns an approximation to the 64-bit integer quotient obtained by dividing
	539	| `b' into the 128-bit value formed by concatenating `a0' and `a1'. The
	540	| divisor `b' must be at least 2^63. If q is the exact quotient truncated
	541	| toward zero, the approximation returned lies between q and q + 2 inclusive.
	542	| If the exact quotient q is larger than 64 bits, the maximum positive 64-bit
	543	| unsigned integer is returned.
	544	*----------------------------------------------------------------------------*/
	545
	546	INLINE bits64 estimateDiv128To64( bits64 a0, bits64 a1, bits64 b )
	547	{
	548	    bits64 b0, b1;
	549	    bits64 rem0, rem1, term0, term1;
	550	    bits64 z;
	551	    /* b <= a0 means the quotient cannot fit in 64 bits: saturate. */
	552	    if ( b <= a0 ) return LIT64( 0xFFFFFFFFFFFFFFFF );
	553	    b0 = b>>32;
	554	    z = ( b0<<32 <= a0 ) ? LIT64( 0xFFFFFFFF00000000 ) : ( a0 / b0 )<<32;
	555	    mul64To128( b, z, &term0, &term1 );
	556	    sub128( a0, a1, term0, term1, &rem0, &rem1 );
	557	    while ( ( (sbits64) rem0 ) < 0 ) {
	558	        z -= LIT64( 0x100000000 );
	559	        b1 = b<<32;
	560	        add128( rem0, rem1, b0, b1, &rem0, &rem1 );
	561	    }
	562	    rem0 = ( rem0<<32 ) | ( rem1>>32 );
	563	    z |= ( b0<<32 <= rem0 ) ? 0xFFFFFFFF : rem0 / b0;
	564	    return z;
	565
	566	}
	567
	568	/*----------------------------------------------------------------------------
	569	| Returns an approximation to the square root of the 32-bit significand given
	570	| by `a'. Considered as an integer, `a' must be at least 2^31. If bit 0 of
	571	| `aExp' (the least significant bit) is 1, the integer returned approximates
	572	| 2^31*sqrt(`a'/2^31), where `a' is considered an integer. If bit 0 of `aExp'
	573	| is 0, the integer returned approximates 2^31*sqrt(`a'/2^30). In either
	574	| case, the approximation returned lies strictly within +/-2 of the exact
	575	| value.
	576	*----------------------------------------------------------------------------*/
	577
	578	INLINE bits32 estimateSqrt32( int16 aExp, bits32 a )
	579	{
	580	    static const bits16 sqrtOddAdjustments[] = {
	581	        0x0004, 0x0022, 0x005D, 0x00B1, 0x011D, 0x019F, 0x0236, 0x02E0,
	582	        0x039C, 0x0468, 0x0545, 0x0631, 0x072B, 0x0832, 0x0946, 0x0A67
	583	    };
	584	    static const bits16 sqrtEvenAdjustments[] = {
	585	        0x0A2D, 0x08AF, 0x075A, 0x0629, 0x051A, 0x0429, 0x0356, 0x029E,
	586	        0x0200, 0x0179, 0x0109, 0x00AF, 0x0068, 0x0034, 0x0012, 0x0002
	587	    };
	588	    int8 index;
	589	    bits32 z;
	590	    /* Bits 30..27 of `a' select one of the 16 table adjustments. */
	591	    index = ( a>>27 ) & 15;
	592	    if ( aExp & 1 ) {
	593	        z = 0x4000 + ( a>>17 ) - sqrtOddAdjustments[ index ];
	594	        z = ( ( a / z )<<14 ) + ( z<<15 );
	595	        a >>= 1;
	596	    }
	597	    else {
	598	        z = 0x8000 + ( a>>17 ) - sqrtEvenAdjustments[ index ];
	599	        z = a / z + z;
	600	        z = ( 0x20000 <= z ) ? 0xFFFF8000 : ( z<<15 );
	601	        if ( z <= a ) return (bits32) ( ( (sbits32) a )>>1 );
	602	    }
	603	    return ( (bits32) ( ( ( (bits64) a )<<31 ) / z ) ) + ( z>>1 );
	604
	605	}
	606
	607	/*----------------------------------------------------------------------------
	608	| Returns the number of leading 0 bits before the most-significant 1 bit of
	609	| `a'. If `a' is zero, 32 is returned.
	610	*----------------------------------------------------------------------------*/
	611
	612	static int8 countLeadingZeros32( bits32 a )
	613	{
	614	    static const int8 countLeadingZerosHigh[] = {
	615	        8, 7, 6, 6, 5, 5, 5, 5, 4, 4, 4, 4, 4, 4, 4, 4,
	616	        3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
	617	        2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
	618	        2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
	619	        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	620	        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	621	        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	622	        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	623	        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	624	        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	625	        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	626	        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	627	        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	628	        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	629	        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	630	        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
	631	    };
	632	    int8 shiftCount;
	633	    /* Shift the leading 1 (if any) into the top byte, then finish by table lookup. */
	634	    shiftCount = 0;
	635	    if ( a < 0x10000 ) {
	636	        shiftCount += 16;
	637	        a <<= 16;
	638	    }
	639	    if ( a < 0x1000000 ) {
	640	        shiftCount += 8;
	641	        a <<= 8;
	642	    }
	643	    shiftCount += countLeadingZerosHigh[ a>>24 ];
	644	    return shiftCount;
	645
	646	}
	647
	648	/*----------------------------------------------------------------------------
	649	| Returns the number of leading 0 bits before the most-significant 1 bit of
	650	| `a'. If `a' is zero, 64 is returned.
	651	*----------------------------------------------------------------------------*/
	652
	653	static int8 countLeadingZeros64( bits64 a )
	654	{
	655	    int8 shiftCount;
	656	    /* Count within whichever 32-bit half holds the leading 1. */
	657	    shiftCount = 0;
	658	    if ( a < ( (bits64) 1 )<<32 ) {
	659	        shiftCount += 32;
	660	    }
	661	    else {
	662	        a >>= 32;
	663	    }
	664	    shiftCount += countLeadingZeros32( a );  /* argument narrows to bits32 */
	665	    return shiftCount;
	666
	667	}
	668
	669	/*----------------------------------------------------------------------------
	670	| Returns 1 if the 128-bit value formed by concatenating `a0' and `a1'
	671	| is equal to the 128-bit value formed by concatenating `b0' and `b1'.
	672	| Otherwise, returns 0.
	673	*----------------------------------------------------------------------------*/
	674
	675	INLINE flag eq128( bits64 a0, bits64 a1, bits64 b0, bits64 b1 )
	676	{
	677
	678	    return ( a0 == b0 ) && ( a1 == b1 );
	679
	680	}
	681
	682	/*----------------------------------------------------------------------------
	683	| Returns 1 if the 128-bit value formed by concatenating `a0' and `a1' is less
	684	| than or equal to the 128-bit value formed by concatenating `b0' and `b1'.
	685	| Otherwise, returns 0.
	686	*----------------------------------------------------------------------------*/
	687
	688	INLINE flag le128( bits64 a0, bits64 a1, bits64 b0, bits64 b1 )
	689	{
	690	    /* The high halves decide; the low halves only break a tie. */
	691	    return ( a0 < b0 ) || ( ( a0 == b0 ) && ( a1 <= b1 ) );
	692
	693	}
	694
	695	/*----------------------------------------------------------------------------
	696	| Returns 1 if the 128-bit value formed by concatenating `a0' and `a1' is less
	697	| than the 128-bit value formed by concatenating `b0' and `b1'. Otherwise,
	698	| returns 0.
	699	*----------------------------------------------------------------------------*/
	700
	701	INLINE flag lt128( bits64 a0, bits64 a1, bits64 b0, bits64 b1 )
	702	{
	703	    /* The high halves decide; the low halves only break a tie. */
	704	    return ( a0 < b0 ) || ( ( a0 == b0 ) && ( a1 < b1 ) );
	705
	706	}
	707
	708	/*----------------------------------------------------------------------------
	709	| Returns 1 if the 128-bit value formed by concatenating `a0' and `a1' is
	710	| not equal to the 128-bit value formed by concatenating `b0' and `b1'.
	711	| Otherwise, returns 0.
	712	*----------------------------------------------------------------------------*/
	713
	714	INLINE flag ne128( bits64 a0, bits64 a1, bits64 b0, bits64 b1 )
	715	{
	716
	717	    return ( a0 != b0 ) || ( a1 != b1 );
	718
	719	}
	720
	721	/*-----------------------------------------------------------------------------
	722	| Changes the sign of the extended double-precision floating-point value 'a'.
	723	| The operation is performed according to the IEC/IEEE Standard for Binary
	724	| Floating-Point Arithmetic.
	725	*----------------------------------------------------------------------------*/
	726
	727	INLINE floatx80 floatx80_chs(floatx80 reg)
	728	{
	729	    reg.high ^= 0x8000;  /* toggle bit 15 of `high' in the by-value copy */
	730	    return reg;
	731	}
	732

trunk/3rdparty/softfloat/softfloat-specialize
r0	r242847
	1
	2	/*============================================================================
	3
	4	This C source fragment is part of the SoftFloat IEC/IEEE Floating-point
	5	Arithmetic Package, Release 2b.
	6
	7	Written by John R. Hauser. This work was made possible in part by the
	8	International Computer Science Institute, located at Suite 600, 1947 Center
	9	Street, Berkeley, California 94704. Funding was partially provided by the
	10	National Science Foundation under grant MIP-9311980. The original version
	11	of this code was written as part of a project to build a fixed-point vector
	12	processor in collaboration with the University of California at Berkeley,
	13	overseen by Profs. Nelson Morgan and John Wawrzynek. More information
	14	is available through the Web page `http://www.cs.berkeley.edu/~jhauser/
	15	arithmetic/SoftFloat.html'.
	16
	17	THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE. Although reasonable effort has
	18	been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT TIMES
	19	RESULT IN INCORRECT BEHAVIOR. USE OF THIS SOFTWARE IS RESTRICTED TO PERSONS
	20	AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ALL LOSSES,
	21	COSTS, OR OTHER PROBLEMS THEY INCUR DUE TO THE SOFTWARE, AND WHO FURTHERMORE
	22	EFFECTIVELY INDEMNIFY JOHN HAUSER AND THE INTERNATIONAL COMPUTER SCIENCE
	23	INSTITUTE (possibly via similar legal warning) AGAINST ALL LOSSES, COSTS, OR
	24	OTHER PROBLEMS INCURRED BY THEIR CUSTOMERS AND CLIENTS DUE TO THE SOFTWARE.
	25
	26	Derivative works are acceptable, even for commercial purposes, so long as
	27	(1) the source code for the derivative work includes prominent notice that
	28	the work is derivative, and (2) the source code includes prominent notice with
	29	these four paragraphs for those parts of this code that are retained.
	30
	31	=============================================================================*/
	32
	33	/*----------------------------------------------------------------------------
	34	| Underflow tininess-detection mode, statically initialized to default value.
	35	| (The declaration in `softfloat.h' must match the `int8' type here.)
	36	*----------------------------------------------------------------------------*/
	37	int8 float_detect_tininess = float_tininess_after_rounding;
	38
	39	/*----------------------------------------------------------------------------
	40	| Raises the exceptions specified by `flags'. Floating-point traps can be
	41	| defined here if desired. It is currently not possible for such a trap to
	42	| substitute a result value. If traps are not implemented, this routine
	43	| should be simply `float_exception_flags |= flags;'.
	44	*----------------------------------------------------------------------------*/
	45
	46	void float_raise( int8 flags )
	47	{
	48	    /* No traps here: the flags are only accumulated in the global mask. */
	49	    float_exception_flags |= flags;
	50
	51	}
	52
	53	/*----------------------------------------------------------------------------
	54	| Internal canonical NaN format.
	55	*----------------------------------------------------------------------------*/
	56	typedef struct {
	57	    flag sign;
	58	    bits64 high, low;
	59	} commonNaNT;
	60
	61	/*----------------------------------------------------------------------------
	62	\| The pattern for a default generated single-precision NaN.
	63	----------------------------------------------------------------------------/
	64	#define float32_default_nan 0xFFFFFFFF
	65
	66	/*----------------------------------------------------------------------------
	67	| Returns 1 if the single-precision floating-point value `a' is a NaN;
	68	| otherwise returns 0.
	69	*----------------------------------------------------------------------------*/
	70
	71	flag float32_is_nan( float32 a )
	72	{
	73	    /* With the sign shifted out, only all-ones exponent + nonzero fraction exceeds 0xFF000000. */
	74	    return ( 0xFF000000 < (bits32) ( a<<1 ) );
	75
	76	}
	77
	78	/*----------------------------------------------------------------------------
	79	| Returns 1 if the single-precision floating-point value `a' is a signaling
	80	| NaN; otherwise returns 0.
	81	*----------------------------------------------------------------------------*/
	82
	83	flag float32_is_signaling_nan( float32 a )
	84	{
	85	    /* Bits 30..22 must be 1_1111_1110 (bit 22 clear), with a lower fraction bit set. */
	86	    return ( ( ( a>>22 ) & 0x1FF ) == 0x1FE ) && ( a & 0x003FFFFF );
	87
	88	}
	89
	90	/*----------------------------------------------------------------------------
	91	| Returns the result of converting the single-precision floating-point NaN
	92	| `a' to the canonical NaN format. If `a' is a signaling NaN, the invalid
	93	| exception is raised.
	94	*----------------------------------------------------------------------------*/
	95
	96	static commonNaNT float32ToCommonNaN( float32 a )
	97	{
	98	    commonNaNT z;
	99
	100	    if ( float32_is_signaling_nan( a ) ) float_raise( float_flag_invalid );
	101	    z.sign = a>>31;
	102	    z.low = 0;
	103	    z.high = ( (bits64) a )<<41;  /* bit 22 of `a' lands on bit 63 of `high' */
	104	    return z;
	105
	106	}
	107
	108	/*----------------------------------------------------------------------------
	109	| Returns the result of converting the canonical NaN `a' to the single-
	110	| precision floating-point format.
	111	*----------------------------------------------------------------------------*/
	112
	113	static float32 commonNaNToFloat32( commonNaNT a )
	114	{
	115	    /* 0x7FC00000 forces bits 30..22 to ones; the top bits of `high' fill in below. */
	116	    return ( ( (bits32) a.sign )<<31 ) | 0x7FC00000 | ( a.high>>41 );
	117
	118	}
	119
	120	/*----------------------------------------------------------------------------
	121	| Takes two single-precision floating-point values `a' and `b', one of which
	122	| is a NaN, and returns the appropriate NaN result. If either `a' or `b' is a
	123	| signaling NaN, the invalid exception is raised.
	124	*----------------------------------------------------------------------------*/
	125
	126	static float32 propagateFloat32NaN( float32 a, float32 b )
	127	{
	128	    flag aIsNaN, aIsSignalingNaN, bIsNaN, bIsSignalingNaN;
	129	    /* Classify both operands before bit 22 is forced on below. */
	130	    aIsNaN = float32_is_nan( a );
	131	    aIsSignalingNaN = float32_is_signaling_nan( a );
	132	    bIsNaN = float32_is_nan( b );
	133	    bIsSignalingNaN = float32_is_signaling_nan( b );
	134	    a |= 0x00400000;
	135	    b |= 0x00400000;
	136	    if ( aIsSignalingNaN | bIsSignalingNaN ) float_raise( float_flag_invalid );
	137	    if ( aIsNaN ) {
	138	        return ( aIsSignalingNaN & bIsNaN ) ? b : a;
	139	    }
	140	    else {
	141	        return b;
	142	    }
	143
	144	}
	145
	146	/*----------------------------------------------------------------------------
	147	\| The pattern for a default generated double-precision NaN.
	148	----------------------------------------------------------------------------/
	149	#define float64_default_nan LIT64( 0xFFFFFFFFFFFFFFFF )
	150
	151	/*----------------------------------------------------------------------------
	152	| Returns 1 if the double-precision floating-point value `a' is a NaN;
	153	| otherwise returns 0.
	154	*----------------------------------------------------------------------------*/
	155
	156	flag float64_is_nan( float64 a )
	157	{
	158	    /* With the sign shifted out, only all-ones exponent + nonzero fraction exceeds the bound. */
	159	    return ( LIT64( 0xFFE0000000000000 ) < (bits64) ( a<<1 ) );
	160
	161	}
	162
	163	/*----------------------------------------------------------------------------
	164	| Returns 1 if the double-precision floating-point value `a' is a signaling
	165	| NaN; otherwise returns 0.
	166	*----------------------------------------------------------------------------*/
	167
	168	flag float64_is_signaling_nan( float64 a )
	169	{
	170	    /* Bits 62..51 must be 0xFFE (bit 51 clear), with some lower fraction bit set. */
	171	    return
	172	           ( ( ( a>>51 ) & 0xFFF ) == 0xFFE )
	173	        && ( a & LIT64( 0x0007FFFFFFFFFFFF ) );
	174
	175	}
	176
	177	/*----------------------------------------------------------------------------
	178	| Returns the result of converting the double-precision floating-point NaN
	179	| `a' to the canonical NaN format. If `a' is a signaling NaN, the invalid
	180	| exception is raised.
	181	*----------------------------------------------------------------------------*/
	182
	183	static commonNaNT float64ToCommonNaN( float64 a )
	184	{
	185	    commonNaNT z;
	186
	187	    if ( float64_is_signaling_nan( a ) ) float_raise( float_flag_invalid );
	188	    z.sign = a>>63;
	189	    z.low = 0;
	190	    z.high = a<<12;  /* bit 51 of `a' lands on bit 63 of `high' */
	191	    return z;
	192
	193	}
	194
	195	/*----------------------------------------------------------------------------
	196	| Returns the result of converting the canonical NaN `a' to the double-
	197	| precision floating-point format.
	198	*----------------------------------------------------------------------------*/
	199
	200	static float64 commonNaNToFloat64( commonNaNT a )
	201	{
	202	    /* 0x7FF8... forces bits 62..51 to ones; `high' supplies the bits below. */
	203	    return
	204	          ( ( (bits64) a.sign )<<63 )
	205	        | LIT64( 0x7FF8000000000000 )
	206	        | ( a.high>>12 );
	207
	208	}
	209
	210	/*----------------------------------------------------------------------------
	211	| Takes two double-precision floating-point values `a' and `b', one of which
	212	| is a NaN, and returns the appropriate NaN result. If either `a' or `b' is a
	213	| signaling NaN, the invalid exception is raised.
	214	*----------------------------------------------------------------------------*/
	215
	216	static float64 propagateFloat64NaN( float64 a, float64 b )
	217	{
	218	    flag aIsNaN, aIsSignalingNaN, bIsNaN, bIsSignalingNaN;
	219	    /* Classify both operands before bit 51 is forced on below. */
	220	    aIsNaN = float64_is_nan( a );
	221	    aIsSignalingNaN = float64_is_signaling_nan( a );
	222	    bIsNaN = float64_is_nan( b );
	223	    bIsSignalingNaN = float64_is_signaling_nan( b );
	224	    a |= LIT64( 0x0008000000000000 );
	225	    b |= LIT64( 0x0008000000000000 );
	226	    if ( aIsSignalingNaN | bIsSignalingNaN ) float_raise( float_flag_invalid );
	227	    if ( aIsNaN ) {
	228	        return ( aIsSignalingNaN & bIsNaN ) ? b : a;
	229	    }
	230	    else {
	231	        return b;
	232	    }
	233
	234	}
	235
	236	#ifdef FLOATX80
	237
	238	/*----------------------------------------------------------------------------
	239	\| The pattern for a default generated extended double-precision NaN. The
	240	\| `high' and `low' values hold the most- and least-significant bits,
	241	\| respectively.
	242	----------------------------------------------------------------------------/
	243	#define floatx80_default_nan_high 0xFFFF
	244	#define floatx80_default_nan_low LIT64( 0xFFFFFFFFFFFFFFFF )
	245
	246	/*----------------------------------------------------------------------------
	247	| Returns 1 if the extended double-precision floating-point value `a' is a
	248	| NaN; otherwise returns 0.
	249	*----------------------------------------------------------------------------*/
	250
	251	flag floatx80_is_nan( floatx80 a )
	252	{
	253	    /* Exponent field all ones and a nonzero `low' once bit 63 is shifted out. */
	254	    return ( ( a.high & 0x7FFF ) == 0x7FFF ) && (bits64) ( a.low<<1 );
	255
	256	}
	257
	258	/*----------------------------------------------------------------------------
	259	| Returns 1 if the extended double-precision floating-point value `a' is a
	260	| signaling NaN; otherwise returns 0.
	261	*----------------------------------------------------------------------------*/
	262
	263	flag floatx80_is_signaling_nan( floatx80 a )
	264	{
	265	    bits64 aLow;
	266	    /* aLow is `low' with bit 62 cleared; a.low == aLow means bit 62 was already clear. */
	267	    aLow = a.low & ~ LIT64( 0x4000000000000000 );
	268	    return
	269	           ( ( a.high & 0x7FFF ) == 0x7FFF )
	270	        && (bits64) ( aLow<<1 )
	271	        && ( a.low == aLow );
	272
	273	}
	274
	275	/*----------------------------------------------------------------------------
	276	| Returns the result of converting the extended double-precision floating-
	277	| point NaN `a' to the canonical NaN format. If `a' is a signaling NaN, the
	278	| invalid exception is raised.
	279	*----------------------------------------------------------------------------*/
	280
	281	static commonNaNT floatx80ToCommonNaN( floatx80 a )
	282	{
	283	    commonNaNT z;
	284
	285	    if ( floatx80_is_signaling_nan( a ) ) float_raise( float_flag_invalid );
	286	    z.sign = a.high>>15;
	287	    z.low = 0;
	288	    z.high = a.low<<1;  /* drops bit 63 of `low' */
	289	    return z;
	290
	291	}
	292
	293	/*----------------------------------------------------------------------------
	294	| Returns the result of converting the canonical NaN `a' to the extended
	295	| double-precision floating-point format.
	296	*----------------------------------------------------------------------------*/
	297
	298	static floatx80 commonNaNToFloatx80( commonNaNT a )
	299	{
	300	    floatx80 z;
	301	    /* Bits 63 and 62 of `low' are forced on; the exponent field is all ones. */
	302	    z.low = LIT64( 0xC000000000000000 ) | ( a.high>>1 );
	303	    z.high = ( ( (bits16) a.sign )<<15 ) | 0x7FFF;
	304	    return z;
	305
	306	}
	307
	308	/*----------------------------------------------------------------------------
	309	| Takes two extended double-precision floating-point values `a' and `b', one
	310	| of which is a NaN, and returns the appropriate NaN result. If either `a' or
	311	| `b' is a signaling NaN, the invalid exception is raised.
	312	*----------------------------------------------------------------------------*/
	313
	314	floatx80 propagateFloatx80NaN( floatx80 a, floatx80 b )
	315	{
	316	    flag aIsNaN, aIsSignalingNaN, bIsNaN, bIsSignalingNaN;
	317	    /* Classify both operands before bits 63 and 62 of `low' are forced on below. */
	318	    aIsNaN = floatx80_is_nan( a );
	319	    aIsSignalingNaN = floatx80_is_signaling_nan( a );
	320	    bIsNaN = floatx80_is_nan( b );
	321	    bIsSignalingNaN = floatx80_is_signaling_nan( b );
	322	    a.low |= LIT64( 0xC000000000000000 );
	323	    b.low |= LIT64( 0xC000000000000000 );
	324	    if ( aIsSignalingNaN | bIsSignalingNaN ) float_raise( float_flag_invalid );
	325	    if ( aIsNaN ) {
	326	        return ( aIsSignalingNaN & bIsNaN ) ? b : a;
	327	    }
	328	    else {
	329	        return b;
	330	    }
	331
	332	}
	333
	334	#define EXP_BIAS 0x3FFF /* NOTE(review): presumably the floatx80 exponent bias (16383) -- confirm at use sites */
	335
	336	/*----------------------------------------------------------------------------
	337	| Returns the fraction bits of the extended double-precision floating-point
	338	| value `a'.
	339	*----------------------------------------------------------------------------*/
	340
	341	INLINE bits64 extractFloatx80Frac( floatx80 a )
	342	{
	343
	344	    return a.low;
	345
	346	}
	347
	348	/*----------------------------------------------------------------------------
	349	| Returns the exponent bits of the extended double-precision floating-point
	350	| value `a'.
	351	*----------------------------------------------------------------------------*/
	352
	353	INLINE int32 extractFloatx80Exp( floatx80 a )
	354	{
	355	    /* Mask off bit 15 (the sign), keeping the low 15 bits of `high'. */
	356	    return a.high & 0x7FFF;
	357
	358	}
	359
	360	/*----------------------------------------------------------------------------
	361	| Returns the sign bit of the extended double-precision floating-point value
	362	| `a'.
	363	*----------------------------------------------------------------------------*/
	364
	365	INLINE flag extractFloatx80Sign( floatx80 a )
	366	{
	367
	368	    return a.high>>15;
	369
	370	}
	371
	372	#endif
	373
	374	#ifdef FLOAT128
	375
	376	/*----------------------------------------------------------------------------
	377	\| The pattern for a default generated quadruple-precision NaN. The `high' and
	378	\| `low' values hold the most- and least-significant bits, respectively.
	379	----------------------------------------------------------------------------/
	380	#define float128_default_nan_high LIT64( 0xFFFFFFFFFFFFFFFF )
	381	#define float128_default_nan_low LIT64( 0xFFFFFFFFFFFFFFFF )
	382
	383	/*----------------------------------------------------------------------------
	384	| Returns 1 if the quadruple-precision floating-point value `a' is a NaN;
	385	| otherwise returns 0.
	386	*----------------------------------------------------------------------------*/
	387
	388	flag float128_is_nan( float128 a )
	389	{
	390	    /* Exponent all ones (sign shifted out) and a nonzero fraction in either word. */
	391	    return
	392	           ( LIT64( 0xFFFE000000000000 ) <= (bits64) ( a.high<<1 ) )
	393	        && ( a.low || ( a.high & LIT64( 0x0000FFFFFFFFFFFF ) ) );
	394
	395	}
	396
	397	/*----------------------------------------------------------------------------
	398	| Returns 1 if the quadruple-precision floating-point value `a' is a
	399	| signaling NaN; otherwise returns 0.
	400	*----------------------------------------------------------------------------*/
	401
	402	flag float128_is_signaling_nan( float128 a )
	403	{
	404	    /* Bits 62..47 of `high' must be 0xFFFE (bit 47 clear), with a lower fraction bit set. */
	405	    return
	406	           ( ( ( a.high>>47 ) & 0xFFFF ) == 0xFFFE )
	407	        && ( a.low || ( a.high & LIT64( 0x00007FFFFFFFFFFF ) ) );
	408
	409	}
	410
	411	/*----------------------------------------------------------------------------
	412	| Returns the result of converting the quadruple-precision floating-point NaN
	413	| `a' to the canonical NaN format. If `a' is a signaling NaN, the invalid
	414	| exception is raised.
	415	*----------------------------------------------------------------------------*/
	416
	417	static commonNaNT float128ToCommonNaN( float128 a )
	418	{
	419	    commonNaNT z;
	420
	421	    if ( float128_is_signaling_nan( a ) ) float_raise( float_flag_invalid );
	422	    z.sign = a.high>>63;
	423	    shortShift128Left( a.high, a.low, 16, &z.high, &z.low );
	424	    return z;
	425
	426	}
	427
	428	/*----------------------------------------------------------------------------
	429	| Returns the result of converting the canonical NaN `a' to the quadruple-
	430	| precision floating-point format.
	431	*----------------------------------------------------------------------------*/
	432
	433	static float128 commonNaNToFloat128( commonNaNT a )
	434	{
	435	    float128 z;
	436	    /* Shift the payload down 16 bits, then force bits 62..47 of `high' to ones. */
	437	    shift128Right( a.high, a.low, 16, &z.high, &z.low );
	438	    z.high |= ( ( (bits64) a.sign )<<63 ) | LIT64( 0x7FFF800000000000 );
	439	    return z;
	440
	441	}
	442
	443	/*----------------------------------------------------------------------------
	444	| Takes two quadruple-precision floating-point values `a' and `b', one of
	445	| which is a NaN, and returns the appropriate NaN result. If either `a' or
	446	| `b' is a signaling NaN, the invalid exception is raised.
	447	*----------------------------------------------------------------------------*/
	448
	449	static float128 propagateFloat128NaN( float128 a, float128 b )
	450	{
	451	    flag aIsNaN, aIsSignalingNaN, bIsNaN, bIsSignalingNaN;
	452	    /* Classify both operands before bit 47 of `high' is forced on below. */
	453	    aIsNaN = float128_is_nan( a );
	454	    aIsSignalingNaN = float128_is_signaling_nan( a );
	455	    bIsNaN = float128_is_nan( b );
	456	    bIsSignalingNaN = float128_is_signaling_nan( b );
	457	    a.high |= LIT64( 0x0000800000000000 );
	458	    b.high |= LIT64( 0x0000800000000000 );
	459	    if ( aIsSignalingNaN | bIsSignalingNaN ) float_raise( float_flag_invalid );
	460	    if ( aIsNaN ) {
	461	        return ( aIsSignalingNaN & bIsNaN ) ? b : a;
	462	    }
	463	    else {
	464	        return b;
	465	    }
	466
	467	}
	468
	469	#endif
	470

trunk/3rdparty/softfloat/softfloat.c
r0	r242847
	1
	2	/*============================================================================
	3
	4	This C source file is part of the SoftFloat IEC/IEEE Floating-point Arithmetic
	5	Package, Release 2b.
	6
	7	Written by John R. Hauser. This work was made possible in part by the
	8	International Computer Science Institute, located at Suite 600, 1947 Center
	9	Street, Berkeley, California 94704. Funding was partially provided by the
	10	National Science Foundation under grant MIP-9311980. The original version
	11	of this code was written as part of a project to build a fixed-point vector
	12	processor in collaboration with the University of California at Berkeley,
	13	overseen by Profs. Nelson Morgan and John Wawrzynek. More information
	14	is available through the Web page `http://www.cs.berkeley.edu/~jhauser/
	15	arithmetic/SoftFloat.html'.
	16
	17	THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE. Although reasonable effort has
	18	been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT TIMES
	19	RESULT IN INCORRECT BEHAVIOR. USE OF THIS SOFTWARE IS RESTRICTED TO PERSONS
	20	AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ALL LOSSES,
	21	COSTS, OR OTHER PROBLEMS THEY INCUR DUE TO THE SOFTWARE, AND WHO FURTHERMORE
	22	EFFECTIVELY INDEMNIFY JOHN HAUSER AND THE INTERNATIONAL COMPUTER SCIENCE
	23	INSTITUTE (possibly via similar legal warning) AGAINST ALL LOSSES, COSTS, OR
	24	OTHER PROBLEMS INCURRED BY THEIR CUSTOMERS AND CLIENTS DUE TO THE SOFTWARE.
	25
	26	Derivative works are acceptable, even for commercial purposes, so long as
	27	(1) the source code for the derivative work includes prominent notice that
	28	the work is derivative, and (2) the source code includes prominent notice with
	29	these four paragraphs for those parts of this code that are retained.
	30
	31	=============================================================================*/
	32
	33	#include "milieu.h"
	34	#include "softfloat.h"
	35
	36	/*----------------------------------------------------------------------------
	37	\| Floating-point rounding mode, extended double-precision rounding precision,
	38	\| and exception flags.
	39	----------------------------------------------------------------------------/
	40	int8 float_exception_flags = 0;
	41	#ifdef FLOATX80
	42	int8 floatx80_rounding_precision = 80;
	43	#endif
	44
	45	int8 float_rounding_mode = float_round_nearest_even;
	46
	47	/*----------------------------------------------------------------------------
	48	| Functions and definitions to determine: (1) whether tininess for underflow
	49	| is detected before or after rounding by default, (2) what (if anything)
	50	| happens when exceptions are raised, (3) how signaling NaNs are distinguished
	51	| from quiet NaNs, (4) the default generated quiet NaNs, and (5) how NaNs
	52	| are propagated from function inputs to output. These details are target-
	53	| specific.
	54	*----------------------------------------------------------------------------*/
	55	#include "softfloat-specialize"
	56
	57	/*----------------------------------------------------------------------------
	58	| Takes a 64-bit fixed-point value `absZ' with binary point between bits 6
	59	| and 7, and returns the properly rounded 32-bit integer corresponding to the
	60	| input. If `zSign' is 1, the input is negated before being converted to an
	61	| integer. Bit 63 of `absZ' must be zero. Ordinarily, the fixed-point input
	62	| is simply rounded to an integer, with the inexact exception raised if the
	63	| input cannot be represented exactly as an integer. However, if the fixed-
	64	| point input is too large, the invalid exception is raised and the largest
	65	| positive or negative integer is returned.
	66	*----------------------------------------------------------------------------*/
	67
	68	static int32 roundAndPackInt32( flag zSign, bits64 absZ )
	69	{
	70	    int8 roundingMode;
	71	    flag roundNearestEven;
	72	    int8 roundIncrement, roundBits;
	73	    int32 z;
	74
	75	    roundingMode = float_rounding_mode;
	76	    roundNearestEven = ( roundingMode == float_round_nearest_even );
	77	    roundIncrement = 0x40;    /* half of the 7-bit fraction: round to nearest */
	78	    if ( ! roundNearestEven ) {
	79	        if ( roundingMode == float_round_to_zero ) {
	80	            roundIncrement = 0;    /* truncate */
	81	        }
	82	        else {
	83	            roundIncrement = 0x7F;    /* any non-zero fraction carries, unless cleared below */
	84	            if ( zSign ) {
	85	                if ( roundingMode == float_round_up ) roundIncrement = 0;
	86	            }
	87	            else {
	88	                if ( roundingMode == float_round_down ) roundIncrement = 0;
	89	            }
	90	        }
	91	    }
	92	    roundBits = absZ & 0x7F;    /* the 7 bits below the binary point */
	93	    absZ = ( absZ + roundIncrement )>>7;
	94	    absZ &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );    /* exact tie: clear bit 0 (round to even) */
	95	    z = absZ;
	96	    if ( zSign ) z = - z;
	97	    if ( ( absZ>>32 ) || ( z && ( ( z < 0 ) ^ zSign ) ) ) {    /* magnitude too wide, or sign of z disagrees with zSign */
	98	        float_raise( float_flag_invalid );
	99	        return zSign ? (sbits32) 0x80000000 : 0x7FFFFFFF;
	100	    }
	101	    if ( roundBits ) float_exception_flags |= float_flag_inexact;
	102	    return z;
	103
	104	}
	105
	106	/*----------------------------------------------------------------------------
	107	| Takes the 128-bit fixed-point value formed by concatenating `absZ0' and
	108	| `absZ1', with binary point between bits 63 and 64 (between the input words),
	109	| and returns the properly rounded 64-bit integer corresponding to the input.
	110	| If `zSign' is 1, the input is negated before being converted to an integer.
	111	| Ordinarily, the fixed-point input is simply rounded to an integer, with
	112	| the inexact exception raised if the input cannot be represented exactly as
	113	| an integer. However, if the fixed-point input is too large, the invalid
	114	| exception is raised and the largest positive or negative integer is
	115	| returned.
	116	*----------------------------------------------------------------------------*/
	117
	118	static int64 roundAndPackInt64( flag zSign, bits64 absZ0, bits64 absZ1 )
	119	{
	120	    int8 roundingMode;
	121	    flag roundNearestEven, increment;
	122	    int64 z;
	123
	124	    roundingMode = float_rounding_mode;
	125	    roundNearestEven = ( roundingMode == float_round_nearest_even );
	126	    increment = ( (sbits64) absZ1 < 0 );    /* top bit of the fraction word is set */
	127	    if ( ! roundNearestEven ) {
	128	        if ( roundingMode == float_round_to_zero ) {
	129	            increment = 0;
	130	        }
	131	        else {
	132	            if ( zSign ) {
	133	                increment = ( roundingMode == float_round_down ) && absZ1;
	134	            }
	135	            else {
	136	                increment = ( roundingMode == float_round_up ) && absZ1;
	137	            }
	138	        }
	139	    }
	140	    if ( increment ) {
	141	        ++absZ0;
	142	        if ( absZ0 == 0 ) goto overflow;    /* integer word wrapped around */
	143	        absZ0 &= ~ ( ( (bits64) ( absZ1<<1 ) == 0 ) & roundNearestEven );    /* exact tie: clear bit 0 (round to even) */
	144	    }
	145	    z = absZ0;
	146	    if ( zSign ) z = - z;
	147	    if ( z && ( ( z < 0 ) ^ zSign ) ) {    /* sign of z disagrees with zSign */
	148	 overflow:
	149	        float_raise( float_flag_invalid );
	150	        return
	151	              zSign ? (sbits64) LIT64( 0x8000000000000000 )
	152	            : LIT64( 0x7FFFFFFFFFFFFFFF );
	153	    }
	154	    if ( absZ1 ) float_exception_flags |= float_flag_inexact;
	155	    return z;
	156
	157	}
	158
	159	/*----------------------------------------------------------------------------
	160	| Returns the fraction bits of the single-precision floating-point value `a'.
	161	*----------------------------------------------------------------------------*/
	162
	163	INLINE bits32 extractFloat32Frac( float32 a )
	164	{
	165	    return a & 0x007FFFFF;    /* keep the low 23 bits */
	166
	167	}
	168
	169	/*----------------------------------------------------------------------------
	170	| Returns the exponent bits of the single-precision floating-point value `a'.
	171	*----------------------------------------------------------------------------*/
	172
	173	INLINE int16 extractFloat32Exp( float32 a )
	174	{
	175	    return ( a>>23 ) & 0xFF;    /* 8 bits starting at bit 23 */
	176
	177	}
	178
	179	/*----------------------------------------------------------------------------
	180	| Returns the sign bit of the single-precision floating-point value `a'.
	181	*----------------------------------------------------------------------------*/
	182
	183	INLINE flag extractFloat32Sign( float32 a )
	184	{
	185	    return a>>31;    /* bit 31 */
	186
	187	}
	188
	189	/*----------------------------------------------------------------------------
	190	| Normalizes the subnormal single-precision floating-point value represented
	191	| by the denormalized significand `aSig'. The normalized exponent and
	192	| significand are stored at the locations pointed to by `zExpPtr' and
	193	| `zSigPtr', respectively.
	194	*----------------------------------------------------------------------------*/
	195
	196	static void
	197	 normalizeFloat32Subnormal( bits32 aSig, int16 *zExpPtr, bits32 *zSigPtr )
	198	{
	199	    int8 shiftCount;
	200
	201	    shiftCount = countLeadingZeros32( aSig ) - 8;    /* presumably brings the leading 1 up to bit 23 (helper defined elsewhere) */
	202	    *zSigPtr = aSig<<shiftCount;
	203	    *zExpPtr = 1 - shiftCount;
	204
	205	}
	206
	207	/*----------------------------------------------------------------------------
	208	| Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
	209	| single-precision floating-point value, returning the result. After being
	210	| shifted into the proper positions, the three fields are simply added
	211	| together to form the result. This means that any integer portion of `zSig'
	212	| will be added into the exponent. Since a properly normalized significand
	213	| will have an integer portion equal to 1, the `zExp' input should be 1 less
	214	| than the desired result exponent whenever `zSig' is a complete, normalized
	215	| significand.
	216	*----------------------------------------------------------------------------*/
	217
	218	INLINE float32 packFloat32( flag zSign, int16 zExp, bits32 zSig )
	219	{
	220	    return ( ( (bits32) zSign )<<31 ) + ( ( (bits32) zExp )<<23 ) + zSig;    /* added, not OR-ed: a carry out of zSig bumps the exponent */
	221
	222	}
	223
	224	/*----------------------------------------------------------------------------
	225	| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
	226	| and significand `zSig', and returns the proper single-precision floating-
	227	| point value corresponding to the abstract input. Ordinarily, the abstract
	228	| value is simply rounded and packed into the single-precision format, with
	229	| the inexact exception raised if the abstract input cannot be represented
	230	| exactly. However, if the abstract value is too large, the overflow and
	231	| inexact exceptions are raised and an infinity or maximal finite value is
	232	| returned. If the abstract value is too small, the input value is rounded to
	233	| a subnormal number, and the underflow and inexact exceptions are raised if
	234	| the abstract input cannot be represented exactly as a subnormal single-
	235	| precision floating-point number.
	236	| The input significand `zSig' has its binary point between bits 30
	237	| and 29, which is 7 bits to the left of the usual location. This shifted
	238	| significand must be normalized or smaller. If `zSig' is not normalized,
	239	| `zExp' must be 0; in that case, the result returned is a subnormal number,
	240	| and it must not require rounding. In the usual case that `zSig' is
	241	| normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
	242	| The handling of underflow and overflow follows the IEC/IEEE Standard for
	243	| Binary Floating-Point Arithmetic.
	244	*----------------------------------------------------------------------------*/
	245
	246	static float32 roundAndPackFloat32( flag zSign, int16 zExp, bits32 zSig )
	247	{
	248	    int8 roundingMode;
	249	    flag roundNearestEven;
	250	    int8 roundIncrement, roundBits;
	251	    flag isTiny;
	252
	253	    roundingMode = float_rounding_mode;
	254	    roundNearestEven = ( roundingMode == float_round_nearest_even );
	255	    roundIncrement = 0x40;    /* half of the 7 extra low bits: round to nearest */
	256	    if ( ! roundNearestEven ) {
	257	        if ( roundingMode == float_round_to_zero ) {
	258	            roundIncrement = 0;
	259	        }
	260	        else {
	261	            roundIncrement = 0x7F;    /* any non-zero low bits carry, unless cleared below */
	262	            if ( zSign ) {
	263	                if ( roundingMode == float_round_up ) roundIncrement = 0;
	264	            }
	265	            else {
	266	                if ( roundingMode == float_round_down ) roundIncrement = 0;
	267	            }
	268	        }
	269	    }
	270	    roundBits = zSig & 0x7F;
	271	    if ( 0xFD <= (bits16) zExp ) {    /* presumably bits16 is unsigned, so negative zExp (handled below) lands here too */
	272	        if (    ( 0xFD < zExp )
	273	             || (    ( zExp == 0xFD )
	274	                  && ( (sbits32) ( zSig + roundIncrement ) < 0 ) )
	275	           ) {
	276	            float_raise( float_flag_overflow | float_flag_inexact );
	277	            return packFloat32( zSign, 0xFF, 0 ) - ( roundIncrement == 0 );    /* one below the exponent-0xFF encoding when roundIncrement is 0 */
	278	        }
	279	        if ( zExp < 0 ) {
	280	            isTiny =
	281	                   ( float_detect_tininess == float_tininess_before_rounding )
	282	                || ( zExp < -1 )
	283	                || ( zSig + roundIncrement < 0x80000000 );
	284	            shift32RightJamming( zSig, - zExp, &zSig );
	285	            zExp = 0;
	286	            roundBits = zSig & 0x7F;    /* recompute after the shift */
	287	            if ( isTiny && roundBits ) float_raise( float_flag_underflow );
	288	        }
	289	    }
	290	    if ( roundBits ) float_exception_flags |= float_flag_inexact;
	291	    zSig = ( zSig + roundIncrement )>>7;
	292	    zSig &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );    /* exact tie: clear bit 0 (round to even) */
	293	    if ( zSig == 0 ) zExp = 0;
	294	    return packFloat32( zSign, zExp, zSig );
	295
	296	}
	297
	298	/*----------------------------------------------------------------------------
	299	| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
	300	| and significand `zSig', and returns the proper single-precision floating-
	301	| point value corresponding to the abstract input. This routine is just like
	302	| `roundAndPackFloat32' except that `zSig' does not have to be normalized.
	303	| Bit 31 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
	304	| floating-point exponent.
	305	*----------------------------------------------------------------------------*/
	306
	307	static float32
	308	 normalizeRoundAndPackFloat32( flag zSign, int16 zExp, bits32 zSig )
	309	{
	310	    int8 shiftCount;
	311
	312	    shiftCount = countLeadingZeros32( zSig ) - 1;    /* presumably leaves the leading 1 at bit 30 (helper defined elsewhere) */
	313	    return roundAndPackFloat32( zSign, zExp - shiftCount, zSig<<shiftCount );
	314
	315	}
	316
	317	/*----------------------------------------------------------------------------
	318	| Returns the fraction bits of the double-precision floating-point value `a'.
	319	*----------------------------------------------------------------------------*/
	320
	321	INLINE bits64 extractFloat64Frac( float64 a )
	322	{
	323	    return a & LIT64( 0x000FFFFFFFFFFFFF );    /* keep the low 52 bits */
	324
	325	}
	326
	327	/*----------------------------------------------------------------------------
	328	| Returns the exponent bits of the double-precision floating-point value `a'.
	329	*----------------------------------------------------------------------------*/
	330
	331	INLINE int16 extractFloat64Exp( float64 a )
	332	{
	333	    return ( a>>52 ) & 0x7FF;    /* 11 bits starting at bit 52 */
	334
	335	}
	336
	337	/*----------------------------------------------------------------------------
	338	| Returns the sign bit of the double-precision floating-point value `a'.
	339	*----------------------------------------------------------------------------*/
	340
	341	INLINE flag extractFloat64Sign( float64 a )
	342	{
	343	    return a>>63;    /* bit 63 */
	344
	345	}
	346
	347	/*----------------------------------------------------------------------------
	348	| Normalizes the subnormal double-precision floating-point value represented
	349	| by the denormalized significand `aSig'. The normalized exponent and
	350	| significand are stored at the locations pointed to by `zExpPtr' and
	351	| `zSigPtr', respectively.
	352	*----------------------------------------------------------------------------*/
	353
	354	static void
	355	 normalizeFloat64Subnormal( bits64 aSig, int16 *zExpPtr, bits64 *zSigPtr )
	356	{
	357	    int8 shiftCount;
	358
	359	    shiftCount = countLeadingZeros64( aSig ) - 11;    /* presumably brings the leading 1 up to bit 52 (helper defined elsewhere) */
	360	    *zSigPtr = aSig<<shiftCount;
	361	    *zExpPtr = 1 - shiftCount;
	362
	363	}
	364
	365	/*----------------------------------------------------------------------------
	366	| Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
	367	| double-precision floating-point value, returning the result. After being
	368	| shifted into the proper positions, the three fields are simply added
	369	| together to form the result. This means that any integer portion of `zSig'
	370	| will be added into the exponent. Since a properly normalized significand
	371	| will have an integer portion equal to 1, the `zExp' input should be 1 less
	372	| than the desired result exponent whenever `zSig' is a complete, normalized
	373	| significand.
	374	*----------------------------------------------------------------------------*/
	375
	376	INLINE float64 packFloat64( flag zSign, int16 zExp, bits64 zSig )
	377	{
	378	    return ( ( (bits64) zSign )<<63 ) + ( ( (bits64) zExp )<<52 ) + zSig;    /* added, not OR-ed: a carry out of zSig bumps the exponent */
	379
	380	}
	381
	382	/*----------------------------------------------------------------------------
	383	| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
	384	| and significand `zSig', and returns the proper double-precision floating-
	385	| point value corresponding to the abstract input. Ordinarily, the abstract
	386	| value is simply rounded and packed into the double-precision format, with
	387	| the inexact exception raised if the abstract input cannot be represented
	388	| exactly. However, if the abstract value is too large, the overflow and
	389	| inexact exceptions are raised and an infinity or maximal finite value is
	390	| returned. If the abstract value is too small, the input value is rounded
	391	| to a subnormal number, and the underflow and inexact exceptions are raised
	392	| if the abstract input cannot be represented exactly as a subnormal double-
	393	| precision floating-point number.
	394	| The input significand `zSig' has its binary point between bits 62
	395	| and 61, which is 10 bits to the left of the usual location. This shifted
	396	| significand must be normalized or smaller. If `zSig' is not normalized,
	397	| `zExp' must be 0; in that case, the result returned is a subnormal number,
	398	| and it must not require rounding. In the usual case that `zSig' is
	399	| normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
	400	| The handling of underflow and overflow follows the IEC/IEEE Standard for
	401	| Binary Floating-Point Arithmetic.
	402	*----------------------------------------------------------------------------*/
	403
	404	static float64 roundAndPackFloat64( flag zSign, int16 zExp, bits64 zSig )
	405	{
	406	    int8 roundingMode;
	407	    flag roundNearestEven;
	408	    int16 roundIncrement, roundBits;
	409	    flag isTiny;
	410
	411	    roundingMode = float_rounding_mode;
	412	    roundNearestEven = ( roundingMode == float_round_nearest_even );
	413	    roundIncrement = 0x200;    /* half of the 10 extra low bits: round to nearest */
	414	    if ( ! roundNearestEven ) {
	415	        if ( roundingMode == float_round_to_zero ) {
	416	            roundIncrement = 0;
	417	        }
	418	        else {
	419	            roundIncrement = 0x3FF;    /* any non-zero low bits carry, unless cleared below */
	420	            if ( zSign ) {
	421	                if ( roundingMode == float_round_up ) roundIncrement = 0;
	422	            }
	423	            else {
	424	                if ( roundingMode == float_round_down ) roundIncrement = 0;
	425	            }
	426	        }
	427	    }
	428	    roundBits = zSig & 0x3FF;
	429	    if ( 0x7FD <= (bits16) zExp ) {    /* presumably bits16 is unsigned, so negative zExp (handled below) lands here too */
	430	        if (    ( 0x7FD < zExp )
	431	             || (    ( zExp == 0x7FD )
	432	                  && ( (sbits64) ( zSig + roundIncrement ) < 0 ) )
	433	           ) {
	434	            float_raise( float_flag_overflow | float_flag_inexact );
	435	            return packFloat64( zSign, 0x7FF, 0 ) - ( roundIncrement == 0 );    /* one below the exponent-0x7FF encoding when roundIncrement is 0 */
	436	        }
	437	        if ( zExp < 0 ) {
	438	            isTiny =
	439	                   ( float_detect_tininess == float_tininess_before_rounding )
	440	                || ( zExp < -1 )
	441	                || ( zSig + roundIncrement < LIT64( 0x8000000000000000 ) );
	442	            shift64RightJamming( zSig, - zExp, &zSig );
	443	            zExp = 0;
	444	            roundBits = zSig & 0x3FF;    /* recompute after the shift */
	445	            if ( isTiny && roundBits ) float_raise( float_flag_underflow );
	446	        }
	447	    }
	448	    if ( roundBits ) float_exception_flags |= float_flag_inexact;
	449	    zSig = ( zSig + roundIncrement )>>10;
	450	    zSig &= ~ ( ( ( roundBits ^ 0x200 ) == 0 ) & roundNearestEven );    /* exact tie: clear bit 0 (round to even) */
	451	    if ( zSig == 0 ) zExp = 0;
	452	    return packFloat64( zSign, zExp, zSig );
	453
	454	}
	455
	456	/*----------------------------------------------------------------------------
	457	| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
	458	| and significand `zSig', and returns the proper double-precision floating-
	459	| point value corresponding to the abstract input. This routine is just like
	460	| `roundAndPackFloat64' except that `zSig' does not have to be normalized.
	461	| Bit 63 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
	462	| floating-point exponent.
	463	*----------------------------------------------------------------------------*/
	464
	465	static float64
	466	 normalizeRoundAndPackFloat64( flag zSign, int16 zExp, bits64 zSig )
	467	{
	468	    int8 shiftCount;
	469
	470	    shiftCount = countLeadingZeros64( zSig ) - 1;    /* presumably leaves the leading 1 at bit 62 (helper defined elsewhere) */
	471	    return roundAndPackFloat64( zSign, zExp - shiftCount, zSig<<shiftCount );
	472
	473	}
	474
	475	#ifdef FLOATX80
	476
	477	/*----------------------------------------------------------------------------
	478	| Normalizes the subnormal extended double-precision floating-point value
	479	| represented by the denormalized significand `aSig'. The normalized exponent
	480	| and significand are stored at the locations pointed to by `zExpPtr' and
	481	| `zSigPtr', respectively.
	482	*----------------------------------------------------------------------------*/
	483
	484	static void
	485	 normalizeFloatx80Subnormal( bits64 aSig, int32 *zExpPtr, bits64 *zSigPtr )
	486	{
	487	    int8 shiftCount;
	488
	489	    shiftCount = countLeadingZeros64( aSig );    /* presumably brings the leading 1 up to bit 63 (helper defined elsewhere) */
	490	    *zSigPtr = aSig<<shiftCount;
	491	    *zExpPtr = 1 - shiftCount;
	492
	493	}
	494
	495	/*----------------------------------------------------------------------------
	496	| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
	497	| and extended significand formed by the concatenation of `zSig0' and `zSig1',
	498	| and returns the proper extended double-precision floating-point value
	499	| corresponding to the abstract input. Ordinarily, the abstract value is
	500	| rounded and packed into the extended double-precision format, with the
	501	| inexact exception raised if the abstract input cannot be represented
	502	| exactly. However, if the abstract value is too large, the overflow and
	503	| inexact exceptions are raised and an infinity or maximal finite value is
	504	| returned. If the abstract value is too small, the input value is rounded to
	505	| a subnormal number, and the underflow and inexact exceptions are raised if
	506	| the abstract input cannot be represented exactly as a subnormal extended
	507	| double-precision floating-point number.
	508	| If `roundingPrecision' is 32 or 64, the result is rounded to the same
	509	| number of bits as single or double precision, respectively. Otherwise, the
	510	| result is rounded to the full precision of the extended double-precision
	511	| format.
	512	| The input significand must be normalized or smaller. If the input
	513	| significand is not normalized, `zExp' must be 0; in that case, the result
	514	| returned is a subnormal number, and it must not require rounding. The
	515	| handling of underflow and overflow follows the IEC/IEEE Standard for Binary
	516	| Floating-Point Arithmetic.
	517	*----------------------------------------------------------------------------*/
	518
	519	// roundAndPackFloatx80 is now also used in fyl2x.c
	520
	521	/* static */ floatx80
	522	 roundAndPackFloatx80(
	523	     int8 roundingPrecision, flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1
	524	 )
	525	{
	526	    int8 roundingMode;
	527	    flag roundNearestEven, increment, isTiny;
	528	    int64 roundIncrement, roundMask, roundBits;
	529
	530	    roundingMode = float_rounding_mode;
	531	    roundNearestEven = ( roundingMode == float_round_nearest_even );
	532	    if ( roundingPrecision == 80 ) goto precision80;
	533	    if ( roundingPrecision == 64 ) {
	534	        roundIncrement = LIT64( 0x0000000000000400 );
	535	        roundMask = LIT64( 0x00000000000007FF );    /* low 11 bits are rounded away */
	536	    }
	537	    else if ( roundingPrecision == 32 ) {
	538	        roundIncrement = LIT64( 0x0000008000000000 );
	539	        roundMask = LIT64( 0x000000FFFFFFFFFF );    /* low 40 bits are rounded away */
	540	    }
	541	    else {
	542	        goto precision80;    /* any other value is treated like 80 */
	543	    }
	544	    zSig0 |= ( zSig1 != 0 );    /* fold a non-zero low word into bit 0 of zSig0 */
	545	    if ( ! roundNearestEven ) {
	546	        if ( roundingMode == float_round_to_zero ) {
	547	            roundIncrement = 0;
	548	        }
	549	        else {
	550	            roundIncrement = roundMask;
	551	            if ( zSign ) {
	552	                if ( roundingMode == float_round_up ) roundIncrement = 0;
	553	            }
	554	            else {
	555	                if ( roundingMode == float_round_down ) roundIncrement = 0;
	556	            }
	557	        }
	558	    }
	559	    roundBits = zSig0 & roundMask;
	560	    if ( 0x7FFD <= (bits32) ( zExp - 1 ) ) {
	561	        if (    ( 0x7FFE < zExp )
	562	             || ( ( zExp == 0x7FFE ) && ( zSig0 + roundIncrement < zSig0 ) )
	563	           ) {
	564	            goto overflow;    /* roundMask is still set here and is used at the label */
	565	        }
	566	        if ( zExp <= 0 ) {
	567	            isTiny =
	568	                   ( float_detect_tininess == float_tininess_before_rounding )
	569	                || ( zExp < 0 )
	570	                || ( zSig0 <= zSig0 + roundIncrement );
	571	            shift64RightJamming( zSig0, 1 - zExp, &zSig0 );
	572	            zExp = 0;
	573	            roundBits = zSig0 & roundMask;    /* recompute after the shift */
	574	            if ( isTiny && roundBits ) float_raise( float_flag_underflow );
	575	            if ( roundBits ) float_exception_flags |= float_flag_inexact;
	576	            zSig0 += roundIncrement;
	577	            if ( (sbits64) zSig0 < 0 ) zExp = 1;    /* rounding set bit 63 */
	578	            roundIncrement = roundMask + 1;
	579	            if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
	580	                roundMask |= roundIncrement;    /* exact tie: also clear the lowest kept bit */
	581	            }
	582	            zSig0 &= ~ roundMask;
	583	            return packFloatx80( zSign, zExp, zSig0 );
	584	        }
	585	    }
	586	    if ( roundBits ) float_exception_flags |= float_flag_inexact;
	587	    zSig0 += roundIncrement;
	588	    if ( zSig0 < roundIncrement ) {    /* the addition wrapped around */
	589	        ++zExp;
	590	        zSig0 = LIT64( 0x8000000000000000 );
	591	    }
	592	    roundIncrement = roundMask + 1;
	593	    if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
	594	        roundMask |= roundIncrement;    /* exact tie: also clear the lowest kept bit */
	595	    }
	596	    zSig0 &= ~ roundMask;
	597	    if ( zSig0 == 0 ) zExp = 0;
	598	    return packFloatx80( zSign, zExp, zSig0 );
	599	 precision80:
	600	    increment = ( (sbits64) zSig1 < 0 );    /* top bit of the low word is set */
	601	    if ( ! roundNearestEven ) {
	602	        if ( roundingMode == float_round_to_zero ) {
	603	            increment = 0;
	604	        }
	605	        else {
	606	            if ( zSign ) {
	607	                increment = ( roundingMode == float_round_down ) && zSig1;
	608	            }
	609	            else {
	610	                increment = ( roundingMode == float_round_up ) && zSig1;
	611	            }
	612	        }
	613	    }
	614	    if ( 0x7FFD <= (bits32) ( zExp - 1 ) ) {
	615	        if (    ( 0x7FFE < zExp )
	616	             || (    ( zExp == 0x7FFE )
	617	                  && ( zSig0 == LIT64( 0xFFFFFFFFFFFFFFFF ) )
	618	                  && increment
	619	                )
	620	           ) {
	621	            roundMask = 0;    /* so that ~ roundMask below is all ones */
	622	 overflow:
	623	            float_raise( float_flag_overflow | float_flag_inexact );
	624	            if (    ( roundingMode == float_round_to_zero )
	625	                 || ( zSign && ( roundingMode == float_round_up ) )
	626	                 || ( ! zSign && ( roundingMode == float_round_down ) )
	627	               ) {
	628	                return packFloatx80( zSign, 0x7FFE, ~ roundMask );
	629	            }
	630	            return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
	631	        }
	632	        if ( zExp <= 0 ) {
	633	            isTiny =
	634	                   ( float_detect_tininess == float_tininess_before_rounding )
	635	                || ( zExp < 0 )
	636	                || ! increment
	637	                || ( zSig0 < LIT64( 0xFFFFFFFFFFFFFFFF ) );
	638	            shift64ExtraRightJamming( zSig0, zSig1, 1 - zExp, &zSig0, &zSig1 );
	639	            zExp = 0;
	640	            if ( isTiny && zSig1 ) float_raise( float_flag_underflow );
	641	            if ( zSig1 ) float_exception_flags |= float_flag_inexact;
	642	            if ( roundNearestEven ) {    /* re-decide the increment from the shifted low word */
	643	                increment = ( (sbits64) zSig1 < 0 );
	644	            }
	645	            else {
	646	                if ( zSign ) {
	647	                    increment = ( roundingMode == float_round_down ) && zSig1;
	648	                }
	649	                else {
	650	                    increment = ( roundingMode == float_round_up ) && zSig1;
	651	                }
	652	            }
	653	            if ( increment ) {
	654	                ++zSig0;
	655	                zSig0 &=
	656	                    ~ ( ( (bits64) ( zSig1<<1 ) == 0 ) & roundNearestEven );
	657	                if ( (sbits64) zSig0 < 0 ) zExp = 1;    /* rounding set bit 63 */
	658	            }
	659	            return packFloatx80( zSign, zExp, zSig0 );
	660	        }
	661	    }
	662	    if ( zSig1 ) float_exception_flags |= float_flag_inexact;
	663	    if ( increment ) {
	664	        ++zSig0;
	665	        if ( zSig0 == 0 ) {    /* significand wrapped around */
	666	            ++zExp;
	667	            zSig0 = LIT64( 0x8000000000000000 );
	668	        }
	669	        else {
	670	            zSig0 &= ~ ( ( (bits64) ( zSig1<<1 ) == 0 ) & roundNearestEven );    /* exact tie: clear bit 0 (round to even) */
	671	        }
	672	    }
	673	    else {
	674	        if ( zSig0 == 0 ) zExp = 0;
	675	    }
	676	    return packFloatx80( zSign, zExp, zSig0 );
	677
	678	}
	679
	680	/*----------------------------------------------------------------------------
	681	| Takes an abstract floating-point value having sign `zSign', exponent
	682	| `zExp', and significand formed by the concatenation of `zSig0' and `zSig1',
	683	| and returns the proper extended double-precision floating-point value
	684	| corresponding to the abstract input. This routine is just like
	685	| `roundAndPackFloatx80' except that the input significand does not have to be
	686	| normalized.
	687	*----------------------------------------------------------------------------*/
	688
	689	static floatx80
	690	 normalizeRoundAndPackFloatx80(
	691	     int8 roundingPrecision, flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1
	692	 )
	693	{
	694	    int8 shiftCount;
	695
	696	    if ( zSig0 == 0 ) {    /* high word empty: move the low word up by 64 bits */
	697	        zSig0 = zSig1;
	698	        zSig1 = 0;
	699	        zExp -= 64;
	700	    }
	701	    shiftCount = countLeadingZeros64( zSig0 );
	702	    shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
	703	    zExp -= shiftCount;
	704	    return
	705	        roundAndPackFloatx80( roundingPrecision, zSign, zExp, zSig0, zSig1 );
	706
	707	}
	708
	709	#endif
	710
	711	#ifdef FLOAT128
	712
	713	/*----------------------------------------------------------------------------
	714	| Returns the least-significant 64 fraction bits of the quadruple-precision
	715	| floating-point value `a'.
	716	*----------------------------------------------------------------------------*/
	717
	718	INLINE bits64 extractFloat128Frac1( float128 a )
	719	{
	720	    return a.low;    /* the whole low word is fraction */
	721
	722	}
	723
	724	/*----------------------------------------------------------------------------
	725	| Returns the most-significant 48 fraction bits of the quadruple-precision
	726	| floating-point value `a'.
	727	*----------------------------------------------------------------------------*/
	728
	729	INLINE bits64 extractFloat128Frac0( float128 a )
	730	{
	731	    return a.high & LIT64( 0x0000FFFFFFFFFFFF );    /* keep the low 48 bits of the high word */
	732
	733	}
	734
	735	/*----------------------------------------------------------------------------
	736	| Returns the exponent bits of the quadruple-precision floating-point value
	737	| `a'.
	738	*----------------------------------------------------------------------------*/
	739
	740	INLINE int32 extractFloat128Exp( float128 a )
	741	{
	742	    return ( a.high>>48 ) & 0x7FFF;    /* 15 bits starting at bit 48 of the high word */
	743
	744	}
	745
	746	/*----------------------------------------------------------------------------
	747	| Returns the sign bit of the quadruple-precision floating-point value `a'.
	748	*----------------------------------------------------------------------------*/
	749
	750	INLINE flag extractFloat128Sign( float128 a )
	751	{
	752	    return a.high>>63;    /* bit 63 of the high word */
	753
	754	}
	755
	756	/*----------------------------------------------------------------------------
	757	| Normalizes the subnormal quadruple-precision floating-point value
	758	| represented by the denormalized significand formed by the concatenation of
	759	| `aSig0' and `aSig1'. The normalized exponent is stored at the location
	760	| pointed to by `zExpPtr'. The most significant 49 bits of the normalized
	761	| significand are stored at the location pointed to by `zSig0Ptr', and the
	762	| least significant 64 bits of the normalized significand are stored at the
	763	| location pointed to by `zSig1Ptr'.
	764	*----------------------------------------------------------------------------*/
	765
	766	static void
	767	 normalizeFloat128Subnormal(
	768	     bits64 aSig0,
	769	     bits64 aSig1,
	770	     int32 *zExpPtr,
	771	     bits64 *zSig0Ptr,
	772	     bits64 *zSig1Ptr
	773	 )
	774	{
	775	    int8 shiftCount;
	776
	777	    if ( aSig0 == 0 ) {    /* all significant bits are in the low word */
	778	        shiftCount = countLeadingZeros64( aSig1 ) - 15;
	779	        if ( shiftCount < 0 ) {    /* low word must be split across both outputs */
	780	            *zSig0Ptr = aSig1>>( - shiftCount );
	781	            *zSig1Ptr = aSig1<<( shiftCount & 63 );
	782	        }
	783	        else {
	784	            *zSig0Ptr = aSig1<<shiftCount;
	785	            *zSig1Ptr = 0;
	786	        }
	787	        *zExpPtr = - shiftCount - 63;
	788	    }
	789	    else {
	790	        shiftCount = countLeadingZeros64( aSig0 ) - 15;
	791	        shortShift128Left( aSig0, aSig1, shiftCount, zSig0Ptr, zSig1Ptr );
	792	        *zExpPtr = 1 - shiftCount;
	793	    }
	794
	795	}
	796
	797	#endif
	798
	799	/*----------------------------------------------------------------------------
	800	| Returns the result of converting the 32-bit two's complement integer `a'
	801	| to the single-precision floating-point format. The conversion is performed
	802	| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
	803	*----------------------------------------------------------------------------*/
	804
	805	float32 int32_to_float32( int32 a )
	806	{
	807	    flag zSign;
	808
	809	    if ( a == 0 ) return 0;
	810	    if ( a == (sbits32) 0x80000000 ) return packFloat32( 1, 0x9E, 0 );    /* returned directly, before the negation below */
	811	    zSign = ( a < 0 );
	812	    return normalizeRoundAndPackFloat32( zSign, 0x9C, zSign ? - a : a );
	813
	814	}
	815
	816	/*----------------------------------------------------------------------------
	817	| Returns the result of converting the 32-bit two's complement integer `a'
	818	| to the double-precision floating-point format. The conversion is performed
	819	| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
	820	*----------------------------------------------------------------------------*/
	821
	822	float64 int32_to_float64( int32 a )
	823	{
	824	    flag zSign;
	825	    uint32 absA;
	826	    int8 shiftCount;
	827	    bits64 zSig;
	828
	829	    if ( a == 0 ) return 0;
	830	    zSign = ( a < 0 );
	831	    absA = zSign ? - a : a;    /* NOTE(review): `- a' may overflow for the most negative input if int32 is exactly 32 bits -- confirm */
	832	    shiftCount = countLeadingZeros32( absA ) + 21;
	833	    zSig = absA;
	834	    return packFloat64( zSign, 0x432 - shiftCount, zSig<<shiftCount );    /* no rounding step: packed directly */
	835
	836	}
	837
	838	#ifdef FLOATX80
	839
	840	/*----------------------------------------------------------------------------
	841	| Returns the result of converting the 32-bit two's complement integer `a'
	842	| to the extended double-precision floating-point format. The conversion
	843	| is performed according to the IEC/IEEE Standard for Binary Floating-Point
	844	| Arithmetic.
	845	*----------------------------------------------------------------------------*/
	846
	847	floatx80 int32_to_floatx80( int32 a )
	848	{
	849	    flag zSign;
	850	    uint32 absA;
	851	    int8 shiftCount;
	852	    bits64 zSig;
	853
	854	    if ( a == 0 ) return packFloatx80( 0, 0, 0 );
	855	    zSign = ( a < 0 );
	856	    absA = zSign ? - a : a;    /* NOTE(review): `- a' may overflow for the most negative input if int32 is exactly 32 bits -- confirm */
	857	    shiftCount = countLeadingZeros32( absA ) + 32;
	858	    zSig = absA;
	859	    return packFloatx80( zSign, 0x403E - shiftCount, zSig<<shiftCount );    /* no rounding step: packed directly */
	860
	861	}
	862
	863	#endif
	864
	865	#ifdef FLOAT128
	866
	867	/*----------------------------------------------------------------------------
	868	| Returns the result of converting the 32-bit two's complement integer `a' to
	869	| the quadruple-precision floating-point format. The conversion is performed
	870	| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
	871	*----------------------------------------------------------------------------*/
	872
	873	float128 int32_to_float128( int32 a )
	874	{
	875	    flag zSign;
	876	    uint32 absA;
	877	    int8 shiftCount;
	878	    bits64 zSig0;
	879
	880	    if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
	881	    zSign = ( a < 0 );
	882	    absA = zSign ? - a : a;    /* NOTE(review): `- a' may overflow for the most negative input if int32 is exactly 32 bits -- confirm */
	883	    shiftCount = countLeadingZeros32( absA ) + 17;
	884	    zSig0 = absA;
	885	    return packFloat128( zSign, 0x402E - shiftCount, zSig0<<shiftCount, 0 );    /* low word of the result is 0 */
	886
	887	}
	888
	889	#endif
	890
	891	/*----------------------------------------------------------------------------
	892	\| Returns the result of converting the 64-bit two's complement integer `a'
	893	\| to the single-precision floating-point format. The conversion is performed
	894	\| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
	895	----------------------------------------------------------------------------/
	896
	897	float32 int64_to_float32( int64 a )
	898	{
	899	flag zSign;
	900	uint64 absA;
	901	int8 shiftCount;
	902	// bits32 zSig;
	903
	904	if ( a == 0 ) return 0;
	905	zSign = ( a < 0 );
	906	absA = zSign ? - a : a;
	907	shiftCount = countLeadingZeros64( absA ) - 40;
	908	if ( 0 <= shiftCount ) {
	909	return packFloat32( zSign, 0x95 - shiftCount, absA<<shiftCount );
	910	}
	911	else {
	912	shiftCount += 7;
	913	if ( shiftCount < 0 ) {
	914	shift64RightJamming( absA, - shiftCount, &absA );
	915	}
	916	else {
	917	absA <<= shiftCount;
	918	}
	919	return roundAndPackFloat32( zSign, 0x9C - shiftCount, absA );
	920	}
	921
	922	}
	923
	924	/*----------------------------------------------------------------------------
	925	\| Returns the result of converting the 64-bit two's complement integer `a'
	926	\| to the double-precision floating-point format. The conversion is performed
	927	\| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
	928	----------------------------------------------------------------------------/
	929
	930	float64 int64_to_float64( int64 a )
	931	{
	932	flag zSign;
	933
	934	if ( a == 0 ) return 0;
	935	if ( a == (sbits64) LIT64( 0x8000000000000000 ) ) {
	936	return packFloat64( 1, 0x43E, 0 );
	937	}
	938	zSign = ( a < 0 );
	939	return normalizeRoundAndPackFloat64( zSign, 0x43C, zSign ? - a : a );
	940
	941	}
	942
	943	#ifdef FLOATX80
	944
	945	/*----------------------------------------------------------------------------
	946	\| Returns the result of converting the 64-bit two's complement integer `a'
	947	\| to the extended double-precision floating-point format. The conversion
	948	\| is performed according to the IEC/IEEE Standard for Binary Floating-Point
	949	\| Arithmetic.
	950	----------------------------------------------------------------------------/
	951
	952	floatx80 int64_to_floatx80( int64 a )
	953	{
	954	flag zSign;
	955	uint64 absA;
	956	int8 shiftCount;
	957
	958	if ( a == 0 ) return packFloatx80( 0, 0, 0 );
	959	zSign = ( a < 0 );
	960	absA = zSign ? - a : a;
	961	shiftCount = countLeadingZeros64( absA );
	962	return packFloatx80( zSign, 0x403E - shiftCount, absA<<shiftCount );
	963
	964	}
	965
	966	#endif
	967
	968	#ifdef FLOAT128
	969
	970	/*----------------------------------------------------------------------------
	971	\| Returns the result of converting the 64-bit two's complement integer `a' to
	972	\| the quadruple-precision floating-point format. The conversion is performed
	973	\| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
	974	----------------------------------------------------------------------------/
	975
	976	float128 int64_to_float128( int64 a )
	977	{
	978	flag zSign;
	979	uint64 absA;
	980	int8 shiftCount;
	981	int32 zExp;
	982	bits64 zSig0, zSig1;
	983
	984	if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
	985	zSign = ( a < 0 );
	986	absA = zSign ? - a : a;
	987	shiftCount = countLeadingZeros64( absA ) + 49;
	988	zExp = 0x406E - shiftCount;
	989	if ( 64 <= shiftCount ) {
	990	zSig1 = 0;
	991	zSig0 = absA;
	992	shiftCount -= 64;
	993	}
	994	else {
	995	zSig1 = absA;
	996	zSig0 = 0;
	997	}
	998	shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
	999	return packFloat128( zSign, zExp, zSig0, zSig1 );
	1000
	1001	}
	1002
	1003	#endif
	1004
	1005	/*----------------------------------------------------------------------------
	1006	\| Returns the result of converting the single-precision floating-point value
	1007	\| `a' to the 32-bit two's complement integer format. The conversion is
	1008	\| performed according to the IEC/IEEE Standard for Binary Floating-Point
	1009	\| Arithmetic---which means in particular that the conversion is rounded
	1010	\| according to the current rounding mode. If `a' is a NaN, the largest
	1011	\| positive integer is returned. Otherwise, if the conversion overflows, the
	1012	\| largest integer with the same sign as `a' is returned.
	1013	----------------------------------------------------------------------------/
	1014
	1015	int32 float32_to_int32( float32 a )
	1016	{
	1017	flag aSign;
	1018	int16 aExp, shiftCount;
	1019	bits32 aSig;
	1020	bits64 aSig64;
	1021
	1022	aSig = extractFloat32Frac( a );
	1023	aExp = extractFloat32Exp( a );
	1024	aSign = extractFloat32Sign( a );
	1025	if ( ( aExp == 0xFF ) && aSig ) aSign = 0;
	1026	if ( aExp ) aSig \|= 0x00800000;
	1027	shiftCount = 0xAF - aExp;
	1028	aSig64 = aSig;
	1029	aSig64 <<= 32;
	1030	if ( 0 < shiftCount ) shift64RightJamming( aSig64, shiftCount, &aSig64 );
	1031	return roundAndPackInt32( aSign, aSig64 );
	1032
	1033	}
	1034
	1035	/*----------------------------------------------------------------------------
	1036	\| Returns the result of converting the single-precision floating-point value
	1037	\| `a' to the 32-bit two's complement integer format. The conversion is
	1038	\| performed according to the IEC/IEEE Standard for Binary Floating-Point
	1039	\| Arithmetic, except that the conversion is always rounded toward zero.
	1040	\| If `a' is a NaN, the largest positive integer is returned. Otherwise, if
	1041	\| the conversion overflows, the largest integer with the same sign as `a' is
	1042	\| returned.
	1043	----------------------------------------------------------------------------/
	1044
	1045	int32 float32_to_int32_round_to_zero( float32 a )
	1046	{
	1047	flag aSign;
	1048	int16 aExp, shiftCount;
	1049	bits32 aSig;
	1050	int32 z;
	1051
	1052	aSig = extractFloat32Frac( a );
	1053	aExp = extractFloat32Exp( a );
	1054	aSign = extractFloat32Sign( a );
	1055	shiftCount = aExp - 0x9E;
	1056	if ( 0 <= shiftCount ) {
	1057	if ( a != 0xCF000000 ) {
	1058	float_raise( float_flag_invalid );
	1059	if ( ! aSign \|\| ( ( aExp == 0xFF ) && aSig ) ) return 0x7FFFFFFF;
	1060	}
	1061	return (sbits32) 0x80000000;
	1062	}
	1063	else if ( aExp <= 0x7E ) {
	1064	if ( aExp \| aSig ) float_exception_flags \|= float_flag_inexact;
	1065	return 0;
	1066	}
	1067	aSig = ( aSig \| 0x00800000 )<<8;
	1068	z = aSig>>( - shiftCount );
	1069	if ( (bits32) ( aSig<<( shiftCount & 31 ) ) ) {
	1070	float_exception_flags \|= float_flag_inexact;
	1071	}
	1072	if ( aSign ) z = - z;
	1073	return z;
	1074
	1075	}
	1076
	1077	/*----------------------------------------------------------------------------
	1078	\| Returns the result of converting the single-precision floating-point value
	1079	\| `a' to the 64-bit two's complement integer format. The conversion is
	1080	\| performed according to the IEC/IEEE Standard for Binary Floating-Point
	1081	\| Arithmetic---which means in particular that the conversion is rounded
	1082	\| according to the current rounding mode. If `a' is a NaN, the largest
	1083	\| positive integer is returned. Otherwise, if the conversion overflows, the
	1084	\| largest integer with the same sign as `a' is returned.
	1085	----------------------------------------------------------------------------/
	1086
	1087	int64 float32_to_int64( float32 a )
	1088	{
	1089	flag aSign;
	1090	int16 aExp, shiftCount;
	1091	bits32 aSig;
	1092	bits64 aSig64, aSigExtra;
	1093
	1094	aSig = extractFloat32Frac( a );
	1095	aExp = extractFloat32Exp( a );
	1096	aSign = extractFloat32Sign( a );
	1097	shiftCount = 0xBE - aExp;
	1098	if ( shiftCount < 0 ) {
	1099	float_raise( float_flag_invalid );
	1100	if ( ! aSign \|\| ( ( aExp == 0xFF ) && aSig ) ) {
	1101	return LIT64( 0x7FFFFFFFFFFFFFFF );
	1102	}
	1103	return (sbits64) LIT64( 0x8000000000000000 );
	1104	}
	1105	if ( aExp ) aSig \|= 0x00800000;
	1106	aSig64 = aSig;
	1107	aSig64 <<= 40;
	1108	shift64ExtraRightJamming( aSig64, 0, shiftCount, &aSig64, &aSigExtra );
	1109	return roundAndPackInt64( aSign, aSig64, aSigExtra );
	1110
	1111	}
	1112
	1113	/*----------------------------------------------------------------------------
	1114	\| Returns the result of converting the single-precision floating-point value
	1115	\| `a' to the 64-bit two's complement integer format. The conversion is
	1116	\| performed according to the IEC/IEEE Standard for Binary Floating-Point
	1117	\| Arithmetic, except that the conversion is always rounded toward zero. If
	1118	\| `a' is a NaN, the largest positive integer is returned. Otherwise, if the
	1119	\| conversion overflows, the largest integer with the same sign as `a' is
	1120	\| returned.
	1121	----------------------------------------------------------------------------/
	1122
	1123	int64 float32_to_int64_round_to_zero( float32 a )
	1124	{
	1125	flag aSign;
	1126	int16 aExp, shiftCount;
	1127	bits32 aSig;
	1128	bits64 aSig64;
	1129	int64 z;
	1130
	1131	aSig = extractFloat32Frac( a );
	1132	aExp = extractFloat32Exp( a );
	1133	aSign = extractFloat32Sign( a );
	1134	shiftCount = aExp - 0xBE;
	1135	if ( 0 <= shiftCount ) {
	1136	if ( a != 0xDF000000 ) {
	1137	float_raise( float_flag_invalid );
	1138	if ( ! aSign \|\| ( ( aExp == 0xFF ) && aSig ) ) {
	1139	return LIT64( 0x7FFFFFFFFFFFFFFF );
	1140	}
	1141	}
	1142	return (sbits64) LIT64( 0x8000000000000000 );
	1143	}
	1144	else if ( aExp <= 0x7E ) {
	1145	if ( aExp \| aSig ) float_exception_flags \|= float_flag_inexact;
	1146	return 0;
	1147	}
	1148	aSig64 = aSig \| 0x00800000;
	1149	aSig64 <<= 40;
	1150	z = aSig64>>( - shiftCount );
	1151	if ( (bits64) ( aSig64<<( shiftCount & 63 ) ) ) {
	1152	float_exception_flags \|= float_flag_inexact;
	1153	}
	1154	if ( aSign ) z = - z;
	1155	return z;
	1156
	1157	}
	1158
	1159	/*----------------------------------------------------------------------------
	1160	\| Returns the result of converting the single-precision floating-point value
	1161	\| `a' to the double-precision floating-point format. The conversion is
	1162	\| performed according to the IEC/IEEE Standard for Binary Floating-Point
	1163	\| Arithmetic.
	1164	----------------------------------------------------------------------------/
	1165
	1166	float64 float32_to_float64( float32 a )
	1167	{
	1168	flag aSign;
	1169	int16 aExp;
	1170	bits32 aSig;
	1171
	1172	aSig = extractFloat32Frac( a );
	1173	aExp = extractFloat32Exp( a );
	1174	aSign = extractFloat32Sign( a );
	1175	if ( aExp == 0xFF ) {
	1176	if ( aSig ) return commonNaNToFloat64( float32ToCommonNaN( a ) );
	1177	return packFloat64( aSign, 0x7FF, 0 );
	1178	}
	1179	if ( aExp == 0 ) {
	1180	if ( aSig == 0 ) return packFloat64( aSign, 0, 0 );
	1181	normalizeFloat32Subnormal( aSig, &aExp, &aSig );
	1182	--aExp;
	1183	}
	1184	return packFloat64( aSign, aExp + 0x380, ( (bits64) aSig )<<29 );
	1185
	1186	}
	1187
	1188	#ifdef FLOATX80
	1189
	1190	/*----------------------------------------------------------------------------
	1191	\| Returns the result of converting the single-precision floating-point value
	1192	\| `a' to the extended double-precision floating-point format. The conversion
	1193	\| is performed according to the IEC/IEEE Standard for Binary Floating-Point
	1194	\| Arithmetic.
	1195	----------------------------------------------------------------------------/
	1196
	1197	floatx80 float32_to_floatx80( float32 a )
	1198	{
	1199	flag aSign;
	1200	int16 aExp;
	1201	bits32 aSig;
	1202
	1203	aSig = extractFloat32Frac( a );
	1204	aExp = extractFloat32Exp( a );
	1205	aSign = extractFloat32Sign( a );
	1206	if ( aExp == 0xFF ) {
	1207	if ( aSig ) return commonNaNToFloatx80( float32ToCommonNaN( a ) );
	1208	return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
	1209	}
	1210	if ( aExp == 0 ) {
	1211	if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
	1212	normalizeFloat32Subnormal( aSig, &aExp, &aSig );
	1213	}
	1214	aSig \|= 0x00800000;
	1215	return packFloatx80( aSign, aExp + 0x3F80, ( (bits64) aSig )<<40 );
	1216
	1217	}
	1218
	1219	#endif
	1220
	1221	#ifdef FLOAT128
	1222
	1223	/*----------------------------------------------------------------------------
	1224	\| Returns the result of converting the single-precision floating-point value
	1225	\| `a' to the quadruple-precision floating-point format. The conversion is
	1226	\| performed according to the IEC/IEEE Standard for Binary Floating-Point
	1227	\| Arithmetic.
	1228	----------------------------------------------------------------------------/
	1229
	1230	float128 float32_to_float128( float32 a )
	1231	{
	1232	flag aSign;
	1233	int16 aExp;
	1234	bits32 aSig;
	1235
	1236	aSig = extractFloat32Frac( a );
	1237	aExp = extractFloat32Exp( a );
	1238	aSign = extractFloat32Sign( a );
	1239	if ( aExp == 0xFF ) {
	1240	if ( aSig ) return commonNaNToFloat128( float32ToCommonNaN( a ) );
	1241	return packFloat128( aSign, 0x7FFF, 0, 0 );
	1242	}
	1243	if ( aExp == 0 ) {
	1244	if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 );
	1245	normalizeFloat32Subnormal( aSig, &aExp, &aSig );
	1246	--aExp;
	1247	}
	1248	return packFloat128( aSign, aExp + 0x3F80, ( (bits64) aSig )<<25, 0 );
	1249
	1250	}
	1251
	1252	#endif
	1253
	1254	/*----------------------------------------------------------------------------
	1255	\| Rounds the single-precision floating-point value `a' to an integer, and
	1256	\| returns the result as a single-precision floating-point value. The
	1257	\| operation is performed according to the IEC/IEEE Standard for Binary
	1258	\| Floating-Point Arithmetic.
	1259	----------------------------------------------------------------------------/
	1260
	1261	float32 float32_round_to_int( float32 a )
	1262	{
	1263	flag aSign;
	1264	int16 aExp;
	1265	bits32 lastBitMask, roundBitsMask;
	1266	int8 roundingMode;
	1267	float32 z;
	1268
	1269	aExp = extractFloat32Exp( a );
	1270	if ( 0x96 <= aExp ) {
	1271	if ( ( aExp == 0xFF ) && extractFloat32Frac( a ) ) {
	1272	return propagateFloat32NaN( a, a );
	1273	}
	1274	return a;
	1275	}
	1276	if ( aExp <= 0x7E ) {
	1277	if ( (bits32) ( a<<1 ) == 0 ) return a;
	1278	float_exception_flags \|= float_flag_inexact;
	1279	aSign = extractFloat32Sign( a );
	1280	switch ( float_rounding_mode ) {
	1281	case float_round_nearest_even:
	1282	if ( ( aExp == 0x7E ) && extractFloat32Frac( a ) ) {
	1283	return packFloat32( aSign, 0x7F, 0 );
	1284	}
	1285	break;
	1286	case float_round_down:
	1287	return aSign ? 0xBF800000 : 0;
	1288	case float_round_up:
	1289	return aSign ? 0x80000000 : 0x3F800000;
	1290	}
	1291	return packFloat32( aSign, 0, 0 );
	1292	}
	1293	lastBitMask = 1;
	1294	lastBitMask <<= 0x96 - aExp;
	1295	roundBitsMask = lastBitMask - 1;
	1296	z = a;
	1297	roundingMode = float_rounding_mode;
	1298	if ( roundingMode == float_round_nearest_even ) {
	1299	z += lastBitMask>>1;
	1300	if ( ( z & roundBitsMask ) == 0 ) z &= ~ lastBitMask;
	1301	}
	1302	else if ( roundingMode != float_round_to_zero ) {
	1303	if ( extractFloat32Sign( z ) ^ ( roundingMode == float_round_up ) ) {
	1304	z += roundBitsMask;
	1305	}
	1306	}
	1307	z &= ~ roundBitsMask;
	1308	if ( z != a ) float_exception_flags \|= float_flag_inexact;
	1309	return z;
	1310
	1311	}
	1312
	1313	/*----------------------------------------------------------------------------
	1314	\| Returns the result of adding the absolute values of the single-precision
	1315	\| floating-point values `a' and `b'. If `zSign' is 1, the sum is negated
	1316	\| before being returned. `zSign' is ignored if the result is a NaN.
	1317	\| The addition is performed according to the IEC/IEEE Standard for Binary
	1318	\| Floating-Point Arithmetic.
	1319	----------------------------------------------------------------------------/
	1320
	1321	static float32 addFloat32Sigs( float32 a, float32 b, flag zSign )
	1322	{
	1323	int16 aExp, bExp, zExp;
	1324	bits32 aSig, bSig, zSig;
	1325	int16 expDiff;
	1326
	1327	aSig = extractFloat32Frac( a );
	1328	aExp = extractFloat32Exp( a );
	1329	bSig = extractFloat32Frac( b );
	1330	bExp = extractFloat32Exp( b );
	1331	expDiff = aExp - bExp;
	1332	aSig <<= 6;
	1333	bSig <<= 6;
	1334	if ( 0 < expDiff ) {
	1335	if ( aExp == 0xFF ) {
	1336	if ( aSig ) return propagateFloat32NaN( a, b );
	1337	return a;
	1338	}
	1339	if ( bExp == 0 ) {
	1340	--expDiff;
	1341	}
	1342	else {
	1343	bSig \|= 0x20000000;
	1344	}
	1345	shift32RightJamming( bSig, expDiff, &bSig );
	1346	zExp = aExp;
	1347	}
	1348	else if ( expDiff < 0 ) {
	1349	if ( bExp == 0xFF ) {
	1350	if ( bSig ) return propagateFloat32NaN( a, b );
	1351	return packFloat32( zSign, 0xFF, 0 );
	1352	}
	1353	if ( aExp == 0 ) {
	1354	++expDiff;
	1355	}
	1356	else {
	1357	aSig \|= 0x20000000;
	1358	}
	1359	shift32RightJamming( aSig, - expDiff, &aSig );
	1360	zExp = bExp;
	1361	}
	1362	else {
	1363	if ( aExp == 0xFF ) {
	1364	if ( aSig \| bSig ) return propagateFloat32NaN( a, b );
	1365	return a;
	1366	}
	1367	if ( aExp == 0 ) return packFloat32( zSign, 0, ( aSig + bSig )>>6 );
	1368	zSig = 0x40000000 + aSig + bSig;
	1369	zExp = aExp;
	1370	goto roundAndPack;
	1371	}
	1372	aSig \|= 0x20000000;
	1373	zSig = ( aSig + bSig )<<1;
	1374	--zExp;
	1375	if ( (sbits32) zSig < 0 ) {
	1376	zSig = aSig + bSig;
	1377	++zExp;
	1378	}
	1379	roundAndPack:
	1380	return roundAndPackFloat32( zSign, zExp, zSig );
	1381
	1382	}
	1383
	1384	/*----------------------------------------------------------------------------
	1385	\| Returns the result of subtracting the absolute values of the single-
	1386	\| precision floating-point values `a' and `b'. If `zSign' is 1, the
	1387	\| difference is negated before being returned. `zSign' is ignored if the
	1388	\| result is a NaN. The subtraction is performed according to the IEC/IEEE
	1389	\| Standard for Binary Floating-Point Arithmetic.
	1390	----------------------------------------------------------------------------/
	1391
	1392	static float32 subFloat32Sigs( float32 a, float32 b, flag zSign )
	1393	{
	1394	int16 aExp, bExp, zExp;
	1395	bits32 aSig, bSig, zSig;
	1396	int16 expDiff;
	1397
	1398	aSig = extractFloat32Frac( a );
	1399	aExp = extractFloat32Exp( a );
	1400	bSig = extractFloat32Frac( b );
	1401	bExp = extractFloat32Exp( b );
	1402	expDiff = aExp - bExp;
	1403	aSig <<= 7;
	1404	bSig <<= 7;
	1405	if ( 0 < expDiff ) goto aExpBigger;
	1406	if ( expDiff < 0 ) goto bExpBigger;
	1407	if ( aExp == 0xFF ) {
	1408	if ( aSig \| bSig ) return propagateFloat32NaN( a, b );
	1409	float_raise( float_flag_invalid );
	1410	return float32_default_nan;
	1411	}
	1412	if ( aExp == 0 ) {
	1413	aExp = 1;
	1414	bExp = 1;
	1415	}
	1416	if ( bSig < aSig ) goto aBigger;
	1417	if ( aSig < bSig ) goto bBigger;
	1418	return packFloat32( float_rounding_mode == float_round_down, 0, 0 );
	1419	bExpBigger:
	1420	if ( bExp == 0xFF ) {
	1421	if ( bSig ) return propagateFloat32NaN( a, b );
	1422	return packFloat32( zSign ^ 1, 0xFF, 0 );
	1423	}
	1424	if ( aExp == 0 ) {
	1425	++expDiff;
	1426	}
	1427	else {
	1428	aSig \|= 0x40000000;
	1429	}
	1430	shift32RightJamming( aSig, - expDiff, &aSig );
	1431	bSig \|= 0x40000000;
	1432	bBigger:
	1433	zSig = bSig - aSig;
	1434	zExp = bExp;
	1435	zSign ^= 1;
	1436	goto normalizeRoundAndPack;
	1437	aExpBigger:
	1438	if ( aExp == 0xFF ) {
	1439	if ( aSig ) return propagateFloat32NaN( a, b );
	1440	return a;
	1441	}
	1442	if ( bExp == 0 ) {
	1443	--expDiff;
	1444	}
	1445	else {
	1446	bSig \|= 0x40000000;
	1447	}
	1448	shift32RightJamming( bSig, expDiff, &bSig );
	1449	aSig \|= 0x40000000;
	1450	aBigger:
	1451	zSig = aSig - bSig;
	1452	zExp = aExp;
	1453	normalizeRoundAndPack:
	1454	--zExp;
	1455	return normalizeRoundAndPackFloat32( zSign, zExp, zSig );
	1456
	1457	}
	1458
	1459	/*----------------------------------------------------------------------------
	1460	\| Returns the result of adding the single-precision floating-point values `a'
	1461	\| and `b'. The operation is performed according to the IEC/IEEE Standard for
	1462	\| Binary Floating-Point Arithmetic.
	1463	----------------------------------------------------------------------------/
	1464
	1465	float32 float32_add( float32 a, float32 b )
	1466	{
	1467	flag aSign, bSign;
	1468
	1469	aSign = extractFloat32Sign( a );
	1470	bSign = extractFloat32Sign( b );
	1471	if ( aSign == bSign ) {
	1472	return addFloat32Sigs( a, b, aSign );
	1473	}
	1474	else {
	1475	return subFloat32Sigs( a, b, aSign );
	1476	}
	1477
	1478	}
	1479
	1480	/*----------------------------------------------------------------------------
	1481	\| Returns the result of subtracting the single-precision floating-point values
	1482	\| `a' and `b'. The operation is performed according to the IEC/IEEE Standard
	1483	\| for Binary Floating-Point Arithmetic.
	1484	----------------------------------------------------------------------------/
	1485
	1486	float32 float32_sub( float32 a, float32 b )
	1487	{
	1488	flag aSign, bSign;
	1489
	1490	aSign = extractFloat32Sign( a );
	1491	bSign = extractFloat32Sign( b );
	1492	if ( aSign == bSign ) {
	1493	return subFloat32Sigs( a, b, aSign );
	1494	}
	1495	else {
	1496	return addFloat32Sigs( a, b, aSign );
	1497	}
	1498
	1499	}
	1500
	1501	/*----------------------------------------------------------------------------
	1502	\| Returns the result of multiplying the single-precision floating-point values
	1503	\| `a' and `b'. The operation is performed according to the IEC/IEEE Standard
	1504	\| for Binary Floating-Point Arithmetic.
	1505	----------------------------------------------------------------------------/
	1506
	1507	float32 float32_mul( float32 a, float32 b )
	1508	{
	1509	flag aSign, bSign, zSign;
	1510	int16 aExp, bExp, zExp;
	1511	bits32 aSig, bSig;
	1512	bits64 zSig64;
	1513	bits32 zSig;
	1514
	1515	aSig = extractFloat32Frac( a );
	1516	aExp = extractFloat32Exp( a );
	1517	aSign = extractFloat32Sign( a );
	1518	bSig = extractFloat32Frac( b );
	1519	bExp = extractFloat32Exp( b );
	1520	bSign = extractFloat32Sign( b );
	1521	zSign = aSign ^ bSign;
	1522	if ( aExp == 0xFF ) {
	1523	if ( aSig \|\| ( ( bExp == 0xFF ) && bSig ) ) {
	1524	return propagateFloat32NaN( a, b );
	1525	}
	1526	if ( ( bExp \| bSig ) == 0 ) {
	1527	float_raise( float_flag_invalid );
	1528	return float32_default_nan;
	1529	}
	1530	return packFloat32( zSign, 0xFF, 0 );
	1531	}
	1532	if ( bExp == 0xFF ) {
	1533	if ( bSig ) return propagateFloat32NaN( a, b );
	1534	if ( ( aExp \| aSig ) == 0 ) {
	1535	float_raise( float_flag_invalid );
	1536	return float32_default_nan;
	1537	}
	1538	return packFloat32( zSign, 0xFF, 0 );
	1539	}
	1540	if ( aExp == 0 ) {
	1541	if ( aSig == 0 ) return packFloat32( zSign, 0, 0 );
	1542	normalizeFloat32Subnormal( aSig, &aExp, &aSig );
	1543	}
	1544	if ( bExp == 0 ) {
	1545	if ( bSig == 0 ) return packFloat32( zSign, 0, 0 );
	1546	normalizeFloat32Subnormal( bSig, &bExp, &bSig );
	1547	}
	1548	zExp = aExp + bExp - 0x7F;
	1549	aSig = ( aSig \| 0x00800000 )<<7;
	1550	bSig = ( bSig \| 0x00800000 )<<8;
	1551	shift64RightJamming( ( (bits64) aSig ) * bSig, 32, &zSig64 );
	1552	zSig = zSig64;
	1553	if ( 0 <= (sbits32) ( zSig<<1 ) ) {
	1554	zSig <<= 1;
	1555	--zExp;
	1556	}
	1557	return roundAndPackFloat32( zSign, zExp, zSig );
	1558
	1559	}
	1560
	1561	/*----------------------------------------------------------------------------
	1562	\| Returns the result of dividing the single-precision floating-point value `a'
	1563	\| by the corresponding value `b'. The operation is performed according to the
	1564	\| IEC/IEEE Standard for Binary Floating-Point Arithmetic.
	1565	----------------------------------------------------------------------------/
	1566
	1567	float32 float32_div( float32 a, float32 b )
	1568	{
	1569	flag aSign, bSign, zSign;
	1570	int16 aExp, bExp, zExp;
	1571	bits32 aSig, bSig, zSig;
	1572
	1573	aSig = extractFloat32Frac( a );
	1574	aExp = extractFloat32Exp( a );
	1575	aSign = extractFloat32Sign( a );
	1576	bSig = extractFloat32Frac( b );
	1577	bExp = extractFloat32Exp( b );
	1578	bSign = extractFloat32Sign( b );
	1579	zSign = aSign ^ bSign;
	1580	if ( aExp == 0xFF ) {
	1581	if ( aSig ) return propagateFloat32NaN( a, b );
	1582	if ( bExp == 0xFF ) {
	1583	if ( bSig ) return propagateFloat32NaN( a, b );
	1584	float_raise( float_flag_invalid );
	1585	return float32_default_nan;
	1586	}
	1587	return packFloat32( zSign, 0xFF, 0 );
	1588	}
	1589	if ( bExp == 0xFF ) {
	1590	if ( bSig ) return propagateFloat32NaN( a, b );
	1591	return packFloat32( zSign, 0, 0 );
	1592	}
	1593	if ( bExp == 0 ) {
	1594	if ( bSig == 0 ) {
	1595	if ( ( aExp \| aSig ) == 0 ) {
	1596	float_raise( float_flag_invalid );
	1597	return float32_default_nan;
	1598	}
	1599	float_raise( float_flag_divbyzero );
	1600	return packFloat32( zSign, 0xFF, 0 );
	1601	}
	1602	normalizeFloat32Subnormal( bSig, &bExp, &bSig );
	1603	}
	1604	if ( aExp == 0 ) {
	1605	if ( aSig == 0 ) return packFloat32( zSign, 0, 0 );
	1606	normalizeFloat32Subnormal( aSig, &aExp, &aSig );
	1607	}
	1608	zExp = aExp - bExp + 0x7D;
	1609	aSig = ( aSig \| 0x00800000 )<<7;
	1610	bSig = ( bSig \| 0x00800000 )<<8;
	1611	if ( bSig <= ( aSig + aSig ) ) {
	1612	aSig >>= 1;
	1613	++zExp;
	1614	}
	1615	zSig = ( ( (bits64) aSig )<<32 ) / bSig;
	1616	if ( ( zSig & 0x3F ) == 0 ) {
	1617	zSig \|= ( (bits64) bSig * zSig != ( (bits64) aSig )<<32 );
	1618	}
	1619	return roundAndPackFloat32( zSign, zExp, zSig );
	1620
	1621	}
	1622
	1623	/*----------------------------------------------------------------------------
	1624	\| Returns the remainder of the single-precision floating-point value `a'
	1625	\| with respect to the corresponding value `b'. The operation is performed
	1626	\| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
	1627	----------------------------------------------------------------------------/
	1628
		/* Computes the remainder of `a' with respect to `b': special cases first, then a quotient estimate reduces the significand, and the candidate remainder nearest zero is chosen. */
	1629	float32 float32_rem( float32 a, float32 b )
	1630	{
	1631	flag aSign, zSign;
	1632	int16 aExp, bExp, expDiff;
	1633	bits32 aSig, bSig;
	1634	bits32 q;
	1635	bits64 aSig64, bSig64, q64;
	1636	bits32 alternateASig;
	1637	sbits32 sigMean;
	1638
	1639	aSig = extractFloat32Frac( a );
	1640	aExp = extractFloat32Exp( a );
	1641	aSign = extractFloat32Sign( a );
	1642	bSig = extractFloat32Frac( b );
	1643	bExp = extractFloat32Exp( b );
	1644	// bSign = extractFloat32Sign( b );
		/* `a' is an infinity or a NaN: propagate a NaN, otherwise the operation is invalid. */
	1645	if ( aExp == 0xFF ) {
	1646	if ( aSig \|\| ( ( bExp == 0xFF ) && bSig ) ) {
	1647	return propagateFloat32NaN( a, b );
	1648	}
	1649	float_raise( float_flag_invalid );
	1650	return float32_default_nan;
	1651	}
		/* `b' is an infinity (or a NaN): a finite `a' is returned unchanged. */
	1652	if ( bExp == 0xFF ) {
	1653	if ( bSig ) return propagateFloat32NaN( a, b );
	1654	return a;
	1655	}
		/* `b' is zero (invalid) or subnormal (normalized before use). */
	1656	if ( bExp == 0 ) {
	1657	if ( bSig == 0 ) {
	1658	float_raise( float_flag_invalid );
	1659	return float32_default_nan;
	1660	}
	1661	normalizeFloat32Subnormal( bSig, &bExp, &bSig );
	1662	}
		/* A zero `a' is returned as is; a subnormal `a' is normalized. */
	1663	if ( aExp == 0 ) {
	1664	if ( aSig == 0 ) return a;
	1665	normalizeFloat32Subnormal( aSig, &aExp, &aSig );
	1666	}
	1667	expDiff = aExp - bExp;
		/* Make the implicit leading bit (bit 23) explicit in both significands. */
	1668	aSig \|= 0x00800000;
	1669	bSig \|= 0x00800000;
		/* Small exponent gap: the whole reduction fits in 32/64-bit integer division. */
	1670	if ( expDiff < 32 ) {
	1671	aSig <<= 8;
	1672	bSig <<= 8;
	1673	if ( expDiff < 0 ) {
		/* The exponent of `a' is at least two below that of `b': `a' is returned unchanged. */
	1674	if ( expDiff < -1 ) return a;
	1675	aSig >>= 1;
	1676	}
		/* First quotient bit by direct comparison. */
	1677	q = ( bSig <= aSig );
	1678	if ( q ) aSig -= bSig;
	1679	if ( 0 < expDiff ) {
		/* Full 32-bit quotient, then keep only the expDiff integer bits of it. */
	1680	q = ( ( (bits64) aSig )<<32 ) / bSig;
	1681	q >>= 32 - expDiff;
	1682	bSig >>= 2;
	1683	aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
	1684	}
	1685	else {
	1686	aSig >>= 2;
	1687	bSig >>= 2;
	1688	}
	1689	}
		/* Large exponent gap: reduce in 64-bit steps, retiring 62 exponent steps per loop pass. */
	1690	else {
	1691	if ( bSig <= aSig ) aSig -= bSig;
	1692	aSig64 = ( (bits64) aSig )<<40;
	1693	bSig64 = ( (bits64) bSig )<<40;
	1694	expDiff -= 64;
	1695	while ( 0 < expDiff ) {
		/* NOTE(review): estimateDiv128To64 is defined elsewhere; the "- 2" presumably keeps the estimate from exceeding the true quotient -- confirm. */
	1696	q64 = estimateDiv128To64( aSig64, 0, bSig64 );
	1697	q64 = ( 2 < q64 ) ? q64 - 2 : 0;
	1698	aSig64 = - ( ( bSig * q64 )<<38 );
	1699	expDiff -= 62;
	1700	}
	1701	expDiff += 64;
	1702	q64 = estimateDiv128To64( aSig64, 0, bSig64 );
	1703	q64 = ( 2 < q64 ) ? q64 - 2 : 0;
	1704	q = q64>>( 64 - expDiff );
	1705	bSig <<= 6;
	1706	aSig = ( ( aSig64>>33 )<<( expDiff - 1 ) ) - bSig * q;
	1707	}
		/* Subtract bSig until the remainder goes negative; alternateASig keeps the last non-negative value and q counts the subtractions. */
	1708	do {
	1709	alternateASig = aSig;
	1710	++q;
	1711	aSig -= bSig;
	1712	} while ( 0 <= (sbits32) aSig );
		/* Pick whichever candidate is nearer zero: a negative sum means the non-negative one has the smaller magnitude; on an exact tie the parity of q decides. */
	1713	sigMean = aSig + alternateASig;
	1714	if ( ( sigMean < 0 ) \|\| ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
	1715	aSig = alternateASig;
	1716	}
		/* A negative remainder flips the result sign relative to `a'; its magnitude is then normalized and packed. */
	1717	zSign = ( (sbits32) aSig < 0 );
	1718	if ( zSign ) aSig = - aSig;
	1719	return normalizeRoundAndPackFloat32( aSign ^ zSign, bExp, aSig );
	1720
	1721	}
	1722
	1723	/*----------------------------------------------------------------------------
	1724	\| Returns the square root of the single-precision floating-point value `a'.
	1725	\| The operation is performed according to the IEC/IEEE Standard for Binary
	1726	\| Floating-Point Arithmetic.
	1727	----------------------------------------------------------------------------/
	1728
		/* Computes the square root of `a': special cases first, then an estimate from estimateSqrt32 that is corrected exactly only when it lies close to a rounding boundary. */
	1729	float32 float32_sqrt( float32 a )
	1730	{
	1731	flag aSign;
	1732	int16 aExp, zExp;
	1733	bits32 aSig, zSig;
	1734	bits64 rem, term;
	1735
	1736	aSig = extractFloat32Frac( a );
	1737	aExp = extractFloat32Exp( a );
	1738	aSign = extractFloat32Sign( a );
		/* NaN propagates; +infinity is its own square root; -infinity is invalid. */
	1739	if ( aExp == 0xFF ) {
	1740	if ( aSig ) return propagateFloat32NaN( a, 0 );
	1741	if ( ! aSign ) return a;
	1742	float_raise( float_flag_invalid );
	1743	return float32_default_nan;
	1744	}
		/* Negative inputs: negative zero is returned as is, anything else is invalid. */
	1745	if ( aSign ) {
	1746	if ( ( aExp \| aSig ) == 0 ) return a;
	1747	float_raise( float_flag_invalid );
	1748	return float32_default_nan;
	1749	}
		/* Positive zero returns zero; subnormals are normalized first. */
	1750	if ( aExp == 0 ) {
	1751	if ( aSig == 0 ) return 0;
	1752	normalizeFloat32Subnormal( aSig, &aExp, &aSig );
	1753	}
		/* Halve the unbiased exponent and re-bias it for the result. */
	1754	zExp = ( ( aExp - 0x7F )>>1 ) + 0x7E;
	1755	aSig = ( aSig \| 0x00800000 )<<8;
		/* NOTE(review): estimateSqrt32 is defined elsewhere; the "+ 2" presumably biases the estimate so it is never below the true root -- confirm. */
	1756	zSig = estimateSqrt32( aExp, aSig ) + 2;
		/* Only when the low 7 bits are small (<= 5) is the exact remainder computed. */
	1757	if ( ( zSig & 0x7F ) <= 5 ) {
		/* zSig below 2 means the "+ 2" wrapped around: saturate instead. */
	1758	if ( zSig < 2 ) {
	1759	zSig = 0x7FFFFFFF;
	1760	goto roundAndPack;
	1761	}
		/* For an odd exponent the significand is halved before squaring is compared. */
	1762	aSig >>= aExp & 1;
	1763	term = ( (bits64) zSig ) * zSig;
	1764	rem = ( ( (bits64) aSig )<<32 ) - term;
		/* Step zSig down while its square is too large; (z-1)^2 = z^2 - 2(z-1) - 1, so the remainder grows by 2*zSig + 1 after each decrement. */
	1765	while ( (sbits64) rem < 0 ) {
	1766	--zSig;
	1767	rem += ( ( (bits64) zSig )<<1 ) \| 1;
	1768	}
		/* A nonzero remainder is recorded as a sticky bit for rounding. */
	1769	zSig \|= ( rem != 0 );
	1770	}
	1771	shift32RightJamming( zSig, 1, &zSig );
	1772	roundAndPack:
	1773	return roundAndPackFloat32( 0, zExp, zSig );
	1774
	1775	}
	1776
	1777	/*----------------------------------------------------------------------------
	1778	\| Returns 1 if the single-precision floating-point value `a' is equal to
	1779	\| the corresponding value `b', and 0 otherwise. The comparison is performed
	1780	\| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
	1781	----------------------------------------------------------------------------/
	1782
	1783	flag float32_eq( float32 a, float32 b )
	1784	{
	1785	if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
	1786	\|\| ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
	1787	) {
	1788	if ( float32_is_signaling_nan( a ) \|\| float32_is_signaling_nan( b ) ) {
	1789	float_raise( float_flag_invalid );
	1790	}
	1791	return 0;
	1792	}
	1793	return ( a == b ) \|\| ( (bits32) ( ( a \| b )<<1 ) == 0 );
	1794
	1795	}
	1796
	1797	/*----------------------------------------------------------------------------
	1798	\| Returns 1 if the single-precision floating-point value `a' is less than
	1799	\| or equal to the corresponding value `b', and 0 otherwise. The comparison
	1800	\| is performed according to the IEC/IEEE Standard for Binary Floating-Point
	1801	\| Arithmetic.
	1802	----------------------------------------------------------------------------/
	1803
	1804	flag float32_le( float32 a, float32 b )
	1805	{
	1806	flag aSign, bSign;
	1807
	1808	if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
	1809	\|\| ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
	1810	) {
	1811	float_raise( float_flag_invalid );
	1812	return 0;
	1813	}
	1814	aSign = extractFloat32Sign( a );
	1815	bSign = extractFloat32Sign( b );
	1816	if ( aSign != bSign ) return aSign \|\| ( (bits32) ( ( a \| b )<<1 ) == 0 );
	1817	return ( a == b ) \|\| ( aSign ^ ( a < b ) );
	1818
	1819	}
	1820
	1821	/*----------------------------------------------------------------------------
	1822	\| Returns 1 if the single-precision floating-point value `a' is less than
	1823	\| the corresponding value `b', and 0 otherwise. The comparison is performed
	1824	\| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
	1825	----------------------------------------------------------------------------/
	1826
	1827	flag float32_lt( float32 a, float32 b )
	1828	{
	1829	flag aSign, bSign;
	1830
	1831	if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
	1832	\|\| ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
	1833	) {
	1834	float_raise( float_flag_invalid );
	1835	return 0;
	1836	}
	1837	aSign = extractFloat32Sign( a );
	1838	bSign = extractFloat32Sign( b );
	1839	if ( aSign != bSign ) return aSign && ( (bits32) ( ( a \| b )<<1 ) != 0 );
	1840	return ( a != b ) && ( aSign ^ ( a < b ) );
	1841
	1842	}
	1843
	1844	/*----------------------------------------------------------------------------
	1845	\| Returns 1 if the single-precision floating-point value `a' is equal to
	1846	\| the corresponding value `b', and 0 otherwise. The invalid exception is
	1847	\| raised if either operand is a NaN. Otherwise, the comparison is performed
	1848	\| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
	1849	----------------------------------------------------------------------------/
	1850
	1851	flag float32_eq_signaling( float32 a, float32 b )
	1852	{
	1853	if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
	1854	\|\| ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
	1855	) {
	1856	float_raise( float_flag_invalid );
	1857	return 0;
	1858	}
	1859	return ( a == b ) \|\| ( (bits32) ( ( a \| b )<<1 ) == 0 );
	1860
	1861	}
	1862
	1863	/*----------------------------------------------------------------------------
	1864	\| Returns 1 if the single-precision floating-point value `a' is less than or
	1865	\| equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
	1866	\| cause an exception. Otherwise, the comparison is performed according to the
	1867	\| IEC/IEEE Standard for Binary Floating-Point Arithmetic.
	1868	----------------------------------------------------------------------------/
	1869
	1870	flag float32_le_quiet( float32 a, float32 b )
	1871	{
	1872	flag aSign, bSign;
	1873	// int16 aExp, bExp;
	1874
	1875	if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
	1876	\|\| ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
	1877	) {
	1878	if ( float32_is_signaling_nan( a ) \|\| float32_is_signaling_nan( b ) ) {
	1879	float_raise( float_flag_invalid );
	1880	}
	1881	return 0;
	1882	}
	1883	aSign = extractFloat32Sign( a );
	1884	bSign = extractFloat32Sign( b );
	1885	if ( aSign != bSign ) return aSign \|\| ( (bits32) ( ( a \| b )<<1 ) == 0 );
	1886	return ( a == b ) \|\| ( aSign ^ ( a < b ) );
	1887
	1888	}
	1889
	1890	/*----------------------------------------------------------------------------
	1891	\| Returns 1 if the single-precision floating-point value `a' is less than
	1892	\| the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
	1893	\| exception. Otherwise, the comparison is performed according to the IEC/IEEE
	1894	\| Standard for Binary Floating-Point Arithmetic.
	1895	----------------------------------------------------------------------------/
	1896
	1897	flag float32_lt_quiet( float32 a, float32 b )
	1898	{
	1899	flag aSign, bSign;
	1900
	1901	if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
	1902	\|\| ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
	1903	) {
	1904	if ( float32_is_signaling_nan( a ) \|\| float32_is_signaling_nan( b ) ) {
	1905	float_raise( float_flag_invalid );
	1906	}
	1907	return 0;
	1908	}
	1909	aSign = extractFloat32Sign( a );
	1910	bSign = extractFloat32Sign( b );
	1911	if ( aSign != bSign ) return aSign && ( (bits32) ( ( a \| b )<<1 ) != 0 );
	1912	return ( a != b ) && ( aSign ^ ( a < b ) );
	1913
	1914	}
	1915
	1916	/*----------------------------------------------------------------------------
	1917	\| Returns the result of converting the double-precision floating-point value
	1918	\| `a' to the 32-bit two's complement integer format. The conversion is
	1919	\| performed according to the IEC/IEEE Standard for Binary Floating-Point
	1920	\| Arithmetic---which means in particular that the conversion is rounded
	1921	\| according to the current rounding mode. If `a' is a NaN, the largest
	1922	\| positive integer is returned. Otherwise, if the conversion overflows, the
	1923	\| largest integer with the same sign as `a' is returned.
	1924	----------------------------------------------------------------------------/
	1925
	1926	int32 float64_to_int32( float64 a )
	1927	{
	1928	flag aSign;
	1929	int16 aExp, shiftCount;
	1930	bits64 aSig;
	1931
	1932	aSig = extractFloat64Frac( a );
	1933	aExp = extractFloat64Exp( a );
	1934	aSign = extractFloat64Sign( a );
	1935	if ( ( aExp == 0x7FF ) && aSig ) aSign = 0;
	1936	if ( aExp ) aSig \|= LIT64( 0x0010000000000000 );
	1937	shiftCount = 0x42C - aExp;
	1938	if ( 0 < shiftCount ) shift64RightJamming( aSig, shiftCount, &aSig );
	1939	return roundAndPackInt32( aSign, aSig );
	1940
	1941	}
	1942
	1943	/*----------------------------------------------------------------------------
	1944	\| Returns the result of converting the double-precision floating-point value
	1945	\| `a' to the 32-bit two's complement integer format. The conversion is
	1946	\| performed according to the IEC/IEEE Standard for Binary Floating-Point
	1947	\| Arithmetic, except that the conversion is always rounded toward zero.
	1948	\| If `a' is a NaN, the largest positive integer is returned. Otherwise, if
	1949	\| the conversion overflows, the largest integer with the same sign as `a' is
	1950	\| returned.
	1951	----------------------------------------------------------------------------/
	1952
	1953	int32 float64_to_int32_round_to_zero( float64 a )
	1954	{
	1955	flag aSign;
	1956	int16 aExp, shiftCount;
	1957	bits64 aSig, savedASig;
	1958	int32 z;
	1959
	1960	aSig = extractFloat64Frac( a );
	1961	aExp = extractFloat64Exp( a );
	1962	aSign = extractFloat64Sign( a );
	1963	if ( 0x41E < aExp ) {
	1964	if ( ( aExp == 0x7FF ) && aSig ) aSign = 0;
	1965	goto invalid;
	1966	}
	1967	else if ( aExp < 0x3FF ) {
	1968	if ( aExp \|\| aSig ) float_exception_flags \|= float_flag_inexact;
	1969	return 0;
	1970	}
	1971	aSig \|= LIT64( 0x0010000000000000 );
	1972	shiftCount = 0x433 - aExp;
	1973	savedASig = aSig;
	1974	aSig >>= shiftCount;
	1975	z = aSig;
	1976	if ( aSign ) z = - z;
	1977	if ( ( z < 0 ) ^ aSign ) {
	1978	invalid:
	1979	float_raise( float_flag_invalid );
	1980	return aSign ? (sbits32) 0x80000000 : 0x7FFFFFFF;
	1981	}
	1982	if ( ( aSig<<shiftCount ) != savedASig ) {
	1983	float_exception_flags \|= float_flag_inexact;
	1984	}
	1985	return z;
	1986
	1987	}
	1988
	1989	/*----------------------------------------------------------------------------
	1990	\| Returns the result of converting the double-precision floating-point value
	1991	\| `a' to the 64-bit two's complement integer format. The conversion is
	1992	\| performed according to the IEC/IEEE Standard for Binary Floating-Point
	1993	\| Arithmetic---which means in particular that the conversion is rounded
	1994	\| according to the current rounding mode. If `a' is a NaN, the largest
	1995	\| positive integer is returned. Otherwise, if the conversion overflows, the
	1996	\| largest integer with the same sign as `a' is returned.
	1997	----------------------------------------------------------------------------/
	1998
	1999	int64 float64_to_int64( float64 a )
	2000	{
	2001	flag aSign;
	2002	int16 aExp, shiftCount;
	2003	bits64 aSig, aSigExtra;
	2004
	2005	aSig = extractFloat64Frac( a );
	2006	aExp = extractFloat64Exp( a );
	2007	aSign = extractFloat64Sign( a );
	2008	if ( aExp ) aSig \|= LIT64( 0x0010000000000000 );
	2009	shiftCount = 0x433 - aExp;
	2010	if ( shiftCount <= 0 ) {
	2011	if ( 0x43E < aExp ) {
	2012	float_raise( float_flag_invalid );
	2013	if ( ! aSign
	2014	\|\| ( ( aExp == 0x7FF )
	2015	&& ( aSig != LIT64( 0x0010000000000000 ) ) )
	2016	) {
	2017	return LIT64( 0x7FFFFFFFFFFFFFFF );
	2018	}
	2019	return (sbits64) LIT64( 0x8000000000000000 );
	2020	}
	2021	aSigExtra = 0;
	2022	aSig <<= - shiftCount;
	2023	}
	2024	else {
	2025	shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra );
	2026	}
	2027	return roundAndPackInt64( aSign, aSig, aSigExtra );
	2028
	2029	}
	2030
	2031	/*----------------------------------------------------------------------------
	2032	\| Returns the result of converting the double-precision floating-point value
	2033	\| `a' to the 64-bit two's complement integer format. The conversion is
	2034	\| performed according to the IEC/IEEE Standard for Binary Floating-Point
	2035	\| Arithmetic, except that the conversion is always rounded toward zero.
	2036	\| If `a' is a NaN, the largest positive integer is returned. Otherwise, if
	2037	\| the conversion overflows, the largest integer with the same sign as `a' is
	2038	\| returned.
	2039	----------------------------------------------------------------------------/
	2040
	2041	int64 float64_to_int64_round_to_zero( float64 a )
	2042	{
	2043	flag aSign;
	2044	int16 aExp, shiftCount;
	2045	bits64 aSig;
	2046	int64 z;
	2047
	2048	aSig = extractFloat64Frac( a );
	2049	aExp = extractFloat64Exp( a );
	2050	aSign = extractFloat64Sign( a );
	2051	if ( aExp ) aSig \|= LIT64( 0x0010000000000000 );
	2052	shiftCount = aExp - 0x433;
	2053	if ( 0 <= shiftCount ) {
	2054	if ( 0x43E <= aExp ) {
	2055	if ( a != LIT64( 0xC3E0000000000000 ) ) {
	2056	float_raise( float_flag_invalid );
	2057	if ( ! aSign
	2058	\|\| ( ( aExp == 0x7FF )
	2059	&& ( aSig != LIT64( 0x0010000000000000 ) ) )
	2060	) {
	2061	return LIT64( 0x7FFFFFFFFFFFFFFF );
	2062	}
	2063	}
	2064	return (sbits64) LIT64( 0x8000000000000000 );
	2065	}
	2066	z = aSig<<shiftCount;
	2067	}
	2068	else {
	2069	if ( aExp < 0x3FE ) {
	2070	if ( aExp \| aSig ) float_exception_flags \|= float_flag_inexact;
	2071	return 0;
	2072	}
	2073	z = aSig>>( - shiftCount );
	2074	if ( (bits64) ( aSig<<( shiftCount & 63 ) ) ) {
	2075	float_exception_flags \|= float_flag_inexact;
	2076	}
	2077	}
	2078	if ( aSign ) z = - z;
	2079	return z;
	2080
	2081	}
	2082
	2083	/*----------------------------------------------------------------------------
	2084	\| Returns the result of converting the double-precision floating-point value
	2085	\| `a' to the single-precision floating-point format. The conversion is
	2086	\| performed according to the IEC/IEEE Standard for Binary Floating-Point
	2087	\| Arithmetic.
	2088	----------------------------------------------------------------------------/
	2089
	2090	float32 float64_to_float32( float64 a )
	2091	{
	2092	flag aSign;
	2093	int16 aExp;
	2094	bits64 aSig;
	2095	bits32 zSig;
	2096
	2097	aSig = extractFloat64Frac( a );
	2098	aExp = extractFloat64Exp( a );
	2099	aSign = extractFloat64Sign( a );
	2100	if ( aExp == 0x7FF ) {
	2101	if ( aSig ) return commonNaNToFloat32( float64ToCommonNaN( a ) );
	2102	return packFloat32( aSign, 0xFF, 0 );
	2103	}
	2104	shift64RightJamming( aSig, 22, &aSig );
	2105	zSig = aSig;
	2106	if ( aExp \|\| zSig ) {
	2107	zSig \|= 0x40000000;
	2108	aExp -= 0x381;
	2109	}
	2110	return roundAndPackFloat32( aSign, aExp, zSig );
	2111
	2112	}
	2113
	#ifdef FLOATX80

	/*----------------------------------------------------------------------------
	| Converts the double-precision value `a' to the extended double-precision
	| format.  NaNs are converted through the common NaN representation,
	| infinities and zeros keep their sign, and subnormal inputs are normalized
	| before packing.
	*----------------------------------------------------------------------------*/

	floatx80 float64_to_floatx80( float64 a )
	{
	    flag signA;
	    int16 expA;
	    bits64 sigA, sigZ;

	    sigA = extractFloat64Frac( a );
	    expA = extractFloat64Exp( a );
	    signA = extractFloat64Sign( a );
	    if ( expA == 0x7FF ) {
	        if ( sigA == 0 ) {
	            return packFloatx80( signA, 0x7FFF, LIT64( 0x8000000000000000 ) );
	        }
	        return commonNaNToFloatx80( float64ToCommonNaN( a ) );
	    }
	    if ( expA == 0 ) {
	        if ( sigA == 0 ) return packFloatx80( signA, 0, 0 );
	        normalizeFloat64Subnormal( sigA, &expA, &sigA );
	    }
	    /* Set the leading one explicitly and left-justify in 64 bits. */
	    sigZ = ( sigA | LIT64( 0x0010000000000000 ) )<<11;
	    return packFloatx80( signA, expA + 0x3C00, sigZ );

	}

	#endif
	2147
	#ifdef FLOAT128

	/*----------------------------------------------------------------------------
	| Converts the double-precision value `a' to the quadruple-precision format.
	| NaNs are converted through the common NaN representation, infinities and
	| zeros keep their sign, and subnormal inputs are normalized before packing.
	*----------------------------------------------------------------------------*/

	float128 float64_to_float128( float64 a )
	{
	    flag signA;
	    int16 expA;
	    bits64 sigA, sigHi, sigLo;

	    sigA = extractFloat64Frac( a );
	    expA = extractFloat64Exp( a );
	    signA = extractFloat64Sign( a );
	    if ( expA == 0x7FF ) {
	        if ( sigA == 0 ) return packFloat128( signA, 0x7FFF, 0, 0 );
	        return commonNaNToFloat128( float64ToCommonNaN( a ) );
	    }
	    if ( expA == 0 ) {
	        if ( sigA == 0 ) return packFloat128( signA, 0, 0, 0 );
	        normalizeFloat64Subnormal( sigA, &expA, &sigA );
	        /* The exponent is lowered by one after normalizing a subnormal,
	           before the common shift-and-pack below. */
	        --expA;
	    }
	    shift128Right( sigA, 0, 4, &sigHi, &sigLo );
	    return packFloat128( signA, expA + 0x3C00, sigHi, sigLo );

	}

	#endif
	2181
	2182	/*----------------------------------------------------------------------------
	2183	\| Rounds the double-precision floating-point value `a' to an integer, and
	2184	\| returns the result as a double-precision floating-point value. The
	2185	\| operation is performed according to the IEC/IEEE Standard for Binary
	2186	\| Floating-Point Arithmetic.
	2187	----------------------------------------------------------------------------/
	2188
	2189	float64 float64_round_to_int( float64 a )
	2190	{
	2191	flag aSign;
	2192	int16 aExp;
	2193	bits64 lastBitMask, roundBitsMask;
	2194	int8 roundingMode;
	2195	float64 z;
	2196
	2197	aExp = extractFloat64Exp( a );
	2198	if ( 0x433 <= aExp ) {
	2199	if ( ( aExp == 0x7FF ) && extractFloat64Frac( a ) ) {
	2200	return propagateFloat64NaN( a, a );
	2201	}
	2202	return a;
	2203	}
	2204	if ( aExp < 0x3FF ) {
	2205	if ( (bits64) ( a<<1 ) == 0 ) return a;
	2206	float_exception_flags \|= float_flag_inexact;
	2207	aSign = extractFloat64Sign( a );
	2208	switch ( float_rounding_mode ) {
	2209	case float_round_nearest_even:
	2210	if ( ( aExp == 0x3FE ) && extractFloat64Frac( a ) ) {
	2211	return packFloat64( aSign, 0x3FF, 0 );
	2212	}
	2213	break;
	2214	case float_round_down:
	2215	return aSign ? LIT64( 0xBFF0000000000000 ) : 0;
	2216	case float_round_up:
	2217	return
	2218	aSign ? LIT64( 0x8000000000000000 ) : LIT64( 0x3FF0000000000000 );
	2219	}
	2220	return packFloat64( aSign, 0, 0 );
	2221	}
	2222	lastBitMask = 1;
	2223	lastBitMask <<= 0x433 - aExp;
	2224	roundBitsMask = lastBitMask - 1;
	2225	z = a;
	2226	roundingMode = float_rounding_mode;
	2227	if ( roundingMode == float_round_nearest_even ) {
	2228	z += lastBitMask>>1;
	2229	if ( ( z & roundBitsMask ) == 0 ) z &= ~ lastBitMask;
	2230	}
	2231	else if ( roundingMode != float_round_to_zero ) {
	2232	if ( extractFloat64Sign( z ) ^ ( roundingMode == float_round_up ) ) {
	2233	z += roundBitsMask;
	2234	}
	2235	}
	2236	z &= ~ roundBitsMask;
	2237	if ( z != a ) float_exception_flags \|= float_flag_inexact;
	2238	return z;
	2239
	2240	}
	2241
	2242	/*----------------------------------------------------------------------------
	2243	\| Returns the result of adding the absolute values of the double-precision
	2244	\| floating-point values `a' and `b'. If `zSign' is 1, the sum is negated
	2245	\| before being returned. `zSign' is ignored if the result is a NaN.
	2246	\| The addition is performed according to the IEC/IEEE Standard for Binary
	2247	\| Floating-Point Arithmetic.
	2248	----------------------------------------------------------------------------/
	2249
	2250	static float64 addFloat64Sigs( float64 a, float64 b, flag zSign )
	2251	{
	2252	int16 aExp, bExp, zExp;
	2253	bits64 aSig, bSig, zSig;
	2254	int16 expDiff;
	2255
	2256	aSig = extractFloat64Frac( a );
	2257	aExp = extractFloat64Exp( a );
	2258	bSig = extractFloat64Frac( b );
	2259	bExp = extractFloat64Exp( b );
	2260	expDiff = aExp - bExp;
	2261	aSig <<= 9;
	2262	bSig <<= 9;
	2263	if ( 0 < expDiff ) {
	2264	if ( aExp == 0x7FF ) {
	2265	if ( aSig ) return propagateFloat64NaN( a, b );
	2266	return a;
	2267	}
	2268	if ( bExp == 0 ) {
	2269	--expDiff;
	2270	}
	2271	else {
	2272	bSig \|= LIT64( 0x2000000000000000 );
	2273	}
	2274	shift64RightJamming( bSig, expDiff, &bSig );
	2275	zExp = aExp;
	2276	}
	2277	else if ( expDiff < 0 ) {
	2278	if ( bExp == 0x7FF ) {
	2279	if ( bSig ) return propagateFloat64NaN( a, b );
	2280	return packFloat64( zSign, 0x7FF, 0 );
	2281	}
	2282	if ( aExp == 0 ) {
	2283	++expDiff;
	2284	}
	2285	else {
	2286	aSig \|= LIT64( 0x2000000000000000 );
	2287	}
	2288	shift64RightJamming( aSig, - expDiff, &aSig );
	2289	zExp = bExp;
	2290	}
	2291	else {
	2292	if ( aExp == 0x7FF ) {
	2293	if ( aSig \| bSig ) return propagateFloat64NaN( a, b );
	2294	return a;
	2295	}
	2296	if ( aExp == 0 ) return packFloat64( zSign, 0, ( aSig + bSig )>>9 );
	2297	zSig = LIT64( 0x4000000000000000 ) + aSig + bSig;
	2298	zExp = aExp;
	2299	goto roundAndPack;
	2300	}
	2301	aSig \|= LIT64( 0x2000000000000000 );
	2302	zSig = ( aSig + bSig )<<1;
	2303	--zExp;
	2304	if ( (sbits64) zSig < 0 ) {
	2305	zSig = aSig + bSig;
	2306	++zExp;
	2307	}
	2308	roundAndPack:
	2309	return roundAndPackFloat64( zSign, zExp, zSig );
	2310
	2311	}
	2312
	2313	/*----------------------------------------------------------------------------
	2314	\| Returns the result of subtracting the absolute values of the double-
	2315	\| precision floating-point values `a' and `b'. If `zSign' is 1, the
	2316	\| difference is negated before being returned. `zSign' is ignored if the
	2317	\| result is a NaN. The subtraction is performed according to the IEC/IEEE
	2318	\| Standard for Binary Floating-Point Arithmetic.
	2319	----------------------------------------------------------------------------/
	2320
	2321	static float64 subFloat64Sigs( float64 a, float64 b, flag zSign )
	2322	{
	2323	int16 aExp, bExp, zExp;
	2324	bits64 aSig, bSig, zSig;
	2325	int16 expDiff;
	2326
	2327	aSig = extractFloat64Frac( a );
	2328	aExp = extractFloat64Exp( a );
	2329	bSig = extractFloat64Frac( b );
	2330	bExp = extractFloat64Exp( b );
	2331	expDiff = aExp - bExp;
	2332	aSig <<= 10;
	2333	bSig <<= 10;
	2334	if ( 0 < expDiff ) goto aExpBigger;
	2335	if ( expDiff < 0 ) goto bExpBigger;
	2336	if ( aExp == 0x7FF ) {
	2337	if ( aSig \| bSig ) return propagateFloat64NaN( a, b );
	2338	float_raise( float_flag_invalid );
	2339	return float64_default_nan;
	2340	}
	2341	if ( aExp == 0 ) {
	2342	aExp = 1;
	2343	bExp = 1;
	2344	}
	2345	if ( bSig < aSig ) goto aBigger;
	2346	if ( aSig < bSig ) goto bBigger;
	2347	return packFloat64( float_rounding_mode == float_round_down, 0, 0 );
	2348	bExpBigger:
	2349	if ( bExp == 0x7FF ) {
	2350	if ( bSig ) return propagateFloat64NaN( a, b );
	2351	return packFloat64( zSign ^ 1, 0x7FF, 0 );
	2352	}
	2353	if ( aExp == 0 ) {
	2354	++expDiff;
	2355	}
	2356	else {
	2357	aSig \|= LIT64( 0x4000000000000000 );
	2358	}
	2359	shift64RightJamming( aSig, - expDiff, &aSig );
	2360	bSig \|= LIT64( 0x4000000000000000 );
	2361	bBigger:
	2362	zSig = bSig - aSig;
	2363	zExp = bExp;
	2364	zSign ^= 1;
	2365	goto normalizeRoundAndPack;
	2366	aExpBigger:
	2367	if ( aExp == 0x7FF ) {
	2368	if ( aSig ) return propagateFloat64NaN( a, b );
	2369	return a;
	2370	}
	2371	if ( bExp == 0 ) {
	2372	--expDiff;
	2373	}
	2374	else {
	2375	bSig \|= LIT64( 0x4000000000000000 );
	2376	}
	2377	shift64RightJamming( bSig, expDiff, &bSig );
	2378	aSig \|= LIT64( 0x4000000000000000 );
	2379	aBigger:
	2380	zSig = aSig - bSig;
	2381	zExp = aExp;
	2382	normalizeRoundAndPack:
	2383	--zExp;
	2384	return normalizeRoundAndPackFloat64( zSign, zExp, zSig );
	2385
	2386	}
	2387
	2388	/*----------------------------------------------------------------------------
	2389	\| Returns the result of adding the double-precision floating-point values `a'
	2390	\| and `b'. The operation is performed according to the IEC/IEEE Standard for
	2391	\| Binary Floating-Point Arithmetic.
	2392	----------------------------------------------------------------------------/
	2393
	2394	float64 float64_add( float64 a, float64 b )
	2395	{
	2396	flag aSign, bSign;
	2397
	2398	aSign = extractFloat64Sign( a );
	2399	bSign = extractFloat64Sign( b );
	2400	if ( aSign == bSign ) {
	2401	return addFloat64Sigs( a, b, aSign );
	2402	}
	2403	else {
	2404	return subFloat64Sigs( a, b, aSign );
	2405	}
	2406
	2407	}
	2408
	2409	/*----------------------------------------------------------------------------
	2410	\| Returns the result of subtracting the double-precision floating-point values
	2411	\| `a' and `b'. The operation is performed according to the IEC/IEEE Standard
	2412	\| for Binary Floating-Point Arithmetic.
	2413	----------------------------------------------------------------------------/
	2414
	2415	float64 float64_sub( float64 a, float64 b )
	2416	{
	2417	flag aSign, bSign;
	2418
	2419	aSign = extractFloat64Sign( a );
	2420	bSign = extractFloat64Sign( b );
	2421	if ( aSign == bSign ) {
	2422	return subFloat64Sigs( a, b, aSign );
	2423	}
	2424	else {
	2425	return addFloat64Sigs( a, b, aSign );
	2426	}
	2427
	2428	}
	2429
	2430	/*----------------------------------------------------------------------------
	2431	\| Returns the result of multiplying the double-precision floating-point values
	2432	\| `a' and `b'. The operation is performed according to the IEC/IEEE Standard
	2433	\| for Binary Floating-Point Arithmetic.
	2434	----------------------------------------------------------------------------/
	2435
	2436	float64 float64_mul( float64 a, float64 b )
	2437	{
	2438	flag aSign, bSign, zSign;
	2439	int16 aExp, bExp, zExp;
	2440	bits64 aSig, bSig, zSig0, zSig1;
	2441
	2442	aSig = extractFloat64Frac( a );
	2443	aExp = extractFloat64Exp( a );
	2444	aSign = extractFloat64Sign( a );
	2445	bSig = extractFloat64Frac( b );
	2446	bExp = extractFloat64Exp( b );
	2447	bSign = extractFloat64Sign( b );
	2448	zSign = aSign ^ bSign;
	2449	if ( aExp == 0x7FF ) {
	2450	if ( aSig \|\| ( ( bExp == 0x7FF ) && bSig ) ) {
	2451	return propagateFloat64NaN( a, b );
	2452	}
	2453	if ( ( bExp \| bSig ) == 0 ) {
	2454	float_raise( float_flag_invalid );
	2455	return float64_default_nan;
	2456	}
	2457	return packFloat64( zSign, 0x7FF, 0 );
	2458	}
	2459	if ( bExp == 0x7FF ) {
	2460	if ( bSig ) return propagateFloat64NaN( a, b );
	2461	if ( ( aExp \| aSig ) == 0 ) {
	2462	float_raise( float_flag_invalid );
	2463	return float64_default_nan;
	2464	}
	2465	return packFloat64( zSign, 0x7FF, 0 );
	2466	}
	2467	if ( aExp == 0 ) {
	2468	if ( aSig == 0 ) return packFloat64( zSign, 0, 0 );
	2469	normalizeFloat64Subnormal( aSig, &aExp, &aSig );
	2470	}
	2471	if ( bExp == 0 ) {
	2472	if ( bSig == 0 ) return packFloat64( zSign, 0, 0 );
	2473	normalizeFloat64Subnormal( bSig, &bExp, &bSig );
	2474	}
	2475	zExp = aExp + bExp - 0x3FF;
	2476	aSig = ( aSig \| LIT64( 0x0010000000000000 ) )<<10;
	2477	bSig = ( bSig \| LIT64( 0x0010000000000000 ) )<<11;
	2478	mul64To128( aSig, bSig, &zSig0, &zSig1 );
	2479	zSig0 \|= ( zSig1 != 0 );
	2480	if ( 0 <= (sbits64) ( zSig0<<1 ) ) {
	2481	zSig0 <<= 1;
	2482	--zExp;
	2483	}
	2484	return roundAndPackFloat64( zSign, zExp, zSig0 );
	2485
	2486	}
	2487
	2488	/*----------------------------------------------------------------------------
	2489	\| Returns the result of dividing the double-precision floating-point value `a'
	2490	\| by the corresponding value `b'. The operation is performed according to
	2491	\| the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
	2492	----------------------------------------------------------------------------/
	2493
	2494	float64 float64_div( float64 a, float64 b )
	2495	{
	2496	flag aSign, bSign, zSign;
	2497	int16 aExp, bExp, zExp;
	2498	bits64 aSig, bSig, zSig;
	2499	bits64 rem0, rem1;
	2500	bits64 term0, term1;
	2501
	2502	aSig = extractFloat64Frac( a );
	2503	aExp = extractFloat64Exp( a );
	2504	aSign = extractFloat64Sign( a );
	2505	bSig = extractFloat64Frac( b );
	2506	bExp = extractFloat64Exp( b );
	2507	bSign = extractFloat64Sign( b );
	2508	zSign = aSign ^ bSign;
	2509	if ( aExp == 0x7FF ) {
	2510	if ( aSig ) return propagateFloat64NaN( a, b );
	2511	if ( bExp == 0x7FF ) {
	2512	if ( bSig ) return propagateFloat64NaN( a, b );
	2513	float_raise( float_flag_invalid );
	2514	return float64_default_nan;
	2515	}
	2516	return packFloat64( zSign, 0x7FF, 0 );
	2517	}
	2518	if ( bExp == 0x7FF ) {
	2519	if ( bSig ) return propagateFloat64NaN( a, b );
	2520	return packFloat64( zSign, 0, 0 );
	2521	}
	2522	if ( bExp == 0 ) {
	2523	if ( bSig == 0 ) {
	2524	if ( ( aExp \| aSig ) == 0 ) {
	2525	float_raise( float_flag_invalid );
	2526	return float64_default_nan;
	2527	}
	2528	float_raise( float_flag_divbyzero );
	2529	return packFloat64( zSign, 0x7FF, 0 );
	2530	}
	2531	normalizeFloat64Subnormal( bSig, &bExp, &bSig );
	2532	}
	2533	if ( aExp == 0 ) {
	2534	if ( aSig == 0 ) return packFloat64( zSign, 0, 0 );
	2535	normalizeFloat64Subnormal( aSig, &aExp, &aSig );
	2536	}
	2537	zExp = aExp - bExp + 0x3FD;
	2538	aSig = ( aSig \| LIT64( 0x0010000000000000 ) )<<10;
	2539	bSig = ( bSig \| LIT64( 0x0010000000000000 ) )<<11;
	2540	if ( bSig <= ( aSig + aSig ) ) {
	2541	aSig >>= 1;
	2542	++zExp;
	2543	}
	2544	zSig = estimateDiv128To64( aSig, 0, bSig );
	2545	if ( ( zSig & 0x1FF ) <= 2 ) {
	2546	mul64To128( bSig, zSig, &term0, &term1 );
	2547	sub128( aSig, 0, term0, term1, &rem0, &rem1 );
	2548	while ( (sbits64) rem0 < 0 ) {
	2549	--zSig;
	2550	add128( rem0, rem1, 0, bSig, &rem0, &rem1 );
	2551	}
	2552	zSig \|= ( rem1 != 0 );
	2553	}
	2554	return roundAndPackFloat64( zSign, zExp, zSig );
	2555
	2556	}
	2557
	2558	/*----------------------------------------------------------------------------
	2559	\| Returns the remainder of the double-precision floating-point value `a'
	2560	\| with respect to the corresponding value `b'. The operation is performed
	2561	\| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
	2562	----------------------------------------------------------------------------/
	2563
	2564	float64 float64_rem( float64 a, float64 b )
	2565	{
	2566	flag aSign, zSign;
	2567	int16 aExp, bExp, expDiff;
	2568	bits64 aSig, bSig;
	2569	bits64 q, alternateASig;
	2570	sbits64 sigMean;
	2571
	2572	aSig = extractFloat64Frac( a );
	2573	aExp = extractFloat64Exp( a );
	2574	aSign = extractFloat64Sign( a );
	2575	bSig = extractFloat64Frac( b );
	2576	bExp = extractFloat64Exp( b );
	2577	// bSign = extractFloat64Sign( b );
	2578	if ( aExp == 0x7FF ) {
	2579	if ( aSig \|\| ( ( bExp == 0x7FF ) && bSig ) ) {
	2580	return propagateFloat64NaN( a, b );
	2581	}
	2582	float_raise( float_flag_invalid );
	2583	return float64_default_nan;
	2584	}
	2585	if ( bExp == 0x7FF ) {
	2586	if ( bSig ) return propagateFloat64NaN( a, b );
	2587	return a;
	2588	}
	2589	if ( bExp == 0 ) {
	2590	if ( bSig == 0 ) {
	2591	float_raise( float_flag_invalid );
	2592	return float64_default_nan;
	2593	}
	2594	normalizeFloat64Subnormal( bSig, &bExp, &bSig );
	2595	}
	2596	if ( aExp == 0 ) {
	2597	if ( aSig == 0 ) return a;
	2598	normalizeFloat64Subnormal( aSig, &aExp, &aSig );
	2599	}
	2600	expDiff = aExp - bExp;
	2601	aSig = ( aSig \| LIT64( 0x0010000000000000 ) )<<11;
	2602	bSig = ( bSig \| LIT64( 0x0010000000000000 ) )<<11;
	2603	if ( expDiff < 0 ) {
	2604	if ( expDiff < -1 ) return a;
	2605	aSig >>= 1;
	2606	}
	2607	q = ( bSig <= aSig );
	2608	if ( q ) aSig -= bSig;
	2609	expDiff -= 64;
	2610	while ( 0 < expDiff ) {
	2611	q = estimateDiv128To64( aSig, 0, bSig );
	2612	q = ( 2 < q ) ? q - 2 : 0;
	2613	aSig = - ( ( bSig>>2 ) * q );
	2614	expDiff -= 62;
	2615	}
	2616	expDiff += 64;
	2617	if ( 0 < expDiff ) {
	2618	q = estimateDiv128To64( aSig, 0, bSig );
	2619	q = ( 2 < q ) ? q - 2 : 0;
	2620	q >>= 64 - expDiff;
	2621	bSig >>= 2;
	2622	aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
	2623	}
	2624	else {
	2625	aSig >>= 2;
	2626	bSig >>= 2;
	2627	}
	2628	do {
	2629	alternateASig = aSig;
	2630	++q;
	2631	aSig -= bSig;
	2632	} while ( 0 <= (sbits64) aSig );
	2633	sigMean = aSig + alternateASig;
	2634	if ( ( sigMean < 0 ) \|\| ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
	2635	aSig = alternateASig;
	2636	}
	2637	zSign = ( (sbits64) aSig < 0 );
	2638	if ( zSign ) aSig = - aSig;
	2639	return normalizeRoundAndPackFloat64( aSign ^ zSign, bExp, aSig );
	2640
	2641	}
	2642
	2643	/*----------------------------------------------------------------------------
	2644	\| Returns the square root of the double-precision floating-point value `a'.
	2645	\| The operation is performed according to the IEC/IEEE Standard for Binary
	2646	\| Floating-Point Arithmetic.
	2647	----------------------------------------------------------------------------/
	2648
	2649	float64 float64_sqrt( float64 a )
	2650	{
	2651	flag aSign;
	2652	int16 aExp, zExp;
	2653	bits64 aSig, zSig, doubleZSig;
	2654	bits64 rem0, rem1, term0, term1;
	2655	// float64 z;
	2656
	2657	aSig = extractFloat64Frac( a );
	2658	aExp = extractFloat64Exp( a );
	2659	aSign = extractFloat64Sign( a );
	2660	if ( aExp == 0x7FF ) {
	2661	if ( aSig ) return propagateFloat64NaN( a, a );
	2662	if ( ! aSign ) return a;
	2663	float_raise( float_flag_invalid );
	2664	return float64_default_nan;
	2665	}
	2666	if ( aSign ) {
	2667	if ( ( aExp \| aSig ) == 0 ) return a;
	2668	float_raise( float_flag_invalid );
	2669	return float64_default_nan;
	2670	}
	2671	if ( aExp == 0 ) {
	2672	if ( aSig == 0 ) return 0;
	2673	normalizeFloat64Subnormal( aSig, &aExp, &aSig );
	2674	}
	2675	zExp = ( ( aExp - 0x3FF )>>1 ) + 0x3FE;
	2676	aSig \|= LIT64( 0x0010000000000000 );
	2677	zSig = estimateSqrt32( aExp, aSig>>21 );
	2678	aSig <<= 9 - ( aExp & 1 );
	2679	zSig = estimateDiv128To64( aSig, 0, zSig<<32 ) + ( zSig<<30 );
	2680	if ( ( zSig & 0x1FF ) <= 5 ) {
	2681	doubleZSig = zSig<<1;
	2682	mul64To128( zSig, zSig, &term0, &term1 );
	2683	sub128( aSig, 0, term0, term1, &rem0, &rem1 );
	2684	while ( (sbits64) rem0 < 0 ) {
	2685	--zSig;
	2686	doubleZSig -= 2;
	2687	add128( rem0, rem1, zSig>>63, doubleZSig \| 1, &rem0, &rem1 );
	2688	}
	2689	zSig \|= ( ( rem0 \| rem1 ) != 0 );
	2690	}
	2691	return roundAndPackFloat64( 0, zExp, zSig );
	2692
	2693	}
	2694
	2695	/*----------------------------------------------------------------------------
	2696	\| Returns 1 if the double-precision floating-point value `a' is equal to the
	2697	\| corresponding value `b', and 0 otherwise. The comparison is performed
	2698	\| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
	2699	----------------------------------------------------------------------------/
	2700
	2701	flag float64_eq( float64 a, float64 b )
	2702	{
	2703	if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
	2704	\|\| ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
	2705	) {
	2706	if ( float64_is_signaling_nan( a ) \|\| float64_is_signaling_nan( b ) ) {
	2707	float_raise( float_flag_invalid );
	2708	}
	2709	return 0;
	2710	}
	2711	return ( a == b ) \|\| ( (bits64) ( ( a \| b )<<1 ) == 0 );
	2712
	2713	}
	2714
	2715	/*----------------------------------------------------------------------------
	2716	\| Returns 1 if the double-precision floating-point value `a' is less than or
	2717	\| equal to the corresponding value `b', and 0 otherwise. The comparison is
	2718	\| performed according to the IEC/IEEE Standard for Binary Floating-Point
	2719	\| Arithmetic.
	2720	----------------------------------------------------------------------------/
	2721
	2722	flag float64_le( float64 a, float64 b )
	2723	{
	2724	flag aSign, bSign;
	2725
	2726	if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
	2727	\|\| ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
	2728	) {
	2729	float_raise( float_flag_invalid );
	2730	return 0;
	2731	}
	2732	aSign = extractFloat64Sign( a );
	2733	bSign = extractFloat64Sign( b );
	2734	if ( aSign != bSign ) return aSign \|\| ( (bits64) ( ( a \| b )<<1 ) == 0 );
	2735	return ( a == b ) \|\| ( aSign ^ ( a < b ) );
	2736
	2737	}
	2738
	2739	/*----------------------------------------------------------------------------
	2740	\| Returns 1 if the double-precision floating-point value `a' is less than
	2741	\| the corresponding value `b', and 0 otherwise. The comparison is performed
	2742	\| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
	2743	----------------------------------------------------------------------------/
	2744
	2745	flag float64_lt( float64 a, float64 b )
	2746	{
	2747	flag aSign, bSign;
	2748
	2749	if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
	2750	\|\| ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
	2751	) {
	2752	float_raise( float_flag_invalid );
	2753	return 0;
	2754	}
	2755	aSign = extractFloat64Sign( a );
	2756	bSign = extractFloat64Sign( b );
	2757	if ( aSign != bSign ) return aSign && ( (bits64) ( ( a \| b )<<1 ) != 0 );
	2758	return ( a != b ) && ( aSign ^ ( a < b ) );
	2759
	2760	}
	2761
	2762	/*----------------------------------------------------------------------------
	2763	\| Returns 1 if the double-precision floating-point value `a' is equal to the
	2764	\| corresponding value `b', and 0 otherwise. The invalid exception is raised
	2765	\| if either operand is a NaN. Otherwise, the comparison is performed
	2766	\| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
	2767	----------------------------------------------------------------------------/
	2768
	2769	flag float64_eq_signaling( float64 a, float64 b )
	2770	{
	2771	if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
	2772	\|\| ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
	2773	) {
	2774	float_raise( float_flag_invalid );
	2775	return 0;
	2776	}
	2777	return ( a == b ) \|\| ( (bits64) ( ( a \| b )<<1 ) == 0 );
	2778
	2779	}
	2780
	2781	/*----------------------------------------------------------------------------
	2782	\| Returns 1 if the double-precision floating-point value `a' is less than or
	2783	\| equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
	2784	\| cause an exception. Otherwise, the comparison is performed according to the
	2785	\| IEC/IEEE Standard for Binary Floating-Point Arithmetic.
	2786	----------------------------------------------------------------------------/
	2787
	2788	flag float64_le_quiet( float64 a, float64 b )
	2789	{
	2790	flag aSign, bSign;
	2791	// int16 aExp, bExp;
	2792
	2793	if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
	2794	\|\| ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
	2795	) {
	2796	if ( float64_is_signaling_nan( a ) \|\| float64_is_signaling_nan( b ) ) {
	2797	float_raise( float_flag_invalid );
	2798	}
	2799	return 0;
	2800	}
	2801	aSign = extractFloat64Sign( a );
	2802	bSign = extractFloat64Sign( b );
	2803	if ( aSign != bSign ) return aSign \|\| ( (bits64) ( ( a \| b )<<1 ) == 0 );
	2804	return ( a == b ) \|\| ( aSign ^ ( a < b ) );
	2805
	2806	}
	2807
	2808	/*----------------------------------------------------------------------------
	2809	\| Returns 1 if the double-precision floating-point value `a' is less than
	2810	\| the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
	2811	\| exception. Otherwise, the comparison is performed according to the IEC/IEEE
	2812	\| Standard for Binary Floating-Point Arithmetic.
	2813	----------------------------------------------------------------------------/
	2814
	2815	flag float64_lt_quiet( float64 a, float64 b )
	2816	{
	2817	flag aSign, bSign;
	2818
	2819	if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
	2820	\|\| ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
	2821	) {
	2822	if ( float64_is_signaling_nan( a ) \|\| float64_is_signaling_nan( b ) ) {
	2823	float_raise( float_flag_invalid );
	2824	}
	2825	return 0;
	2826	}
	2827	aSign = extractFloat64Sign( a );
	2828	bSign = extractFloat64Sign( b );
	2829	if ( aSign != bSign ) return aSign && ( (bits64) ( ( a \| b )<<1 ) != 0 );
	2830	return ( a != b ) && ( aSign ^ ( a < b ) );
	2831
	2832	}
	2833
	2834	#ifdef FLOATX80
	2835
	2836	/*----------------------------------------------------------------------------
	2837	\| Returns the result of converting the extended double-precision floating-
	2838	\| point value `a' to the 32-bit two's complement integer format. The
	2839	\| conversion is performed according to the IEC/IEEE Standard for Binary
	2840	\| Floating-Point Arithmetic---which means in particular that the conversion
	2841	\| is rounded according to the current rounding mode. If `a' is a NaN, the
	2842	\| largest positive integer is returned. Otherwise, if the conversion
	2843	\| overflows, the largest integer with the same sign as `a' is returned.
	2844	----------------------------------------------------------------------------/
	2845
	2846	int32 floatx80_to_int32( floatx80 a )
	2847	{
	2848	flag aSign;
	2849	int32 aExp, shiftCount;
	2850	bits64 aSig;
	2851
	2852	aSig = extractFloatx80Frac( a );
	2853	aExp = extractFloatx80Exp( a );
	2854	aSign = extractFloatx80Sign( a );
	2855	if ( ( aExp == 0x7FFF ) && (bits64) ( aSig<<1 ) ) aSign = 0;
	2856	shiftCount = 0x4037 - aExp;
	2857	if ( shiftCount <= 0 ) shiftCount = 1;
	2858	shift64RightJamming( aSig, shiftCount, &aSig );
	2859	return roundAndPackInt32( aSign, aSig );
	2860
	2861	}
	2862
	2863	/*----------------------------------------------------------------------------
	2864	\| Returns the result of converting the extended double-precision floating-
	2865	\| point value `a' to the 32-bit two's complement integer format. The
	2866	\| conversion is performed according to the IEC/IEEE Standard for Binary
	2867	\| Floating-Point Arithmetic, except that the conversion is always rounded
	2868	\| toward zero. If `a' is a NaN, the largest positive integer is returned.
	2869	\| Otherwise, if the conversion overflows, the largest integer with the same
	2870	\| sign as `a' is returned.
	2871	----------------------------------------------------------------------------/
	2872
	2873	int32 floatx80_to_int32_round_to_zero( floatx80 a )
	2874	{
	2875	flag aSign;
	2876	int32 aExp, shiftCount;
	2877	bits64 aSig, savedASig;
	2878	int32 z;
	2879
	2880	aSig = extractFloatx80Frac( a );
	2881	aExp = extractFloatx80Exp( a );
	2882	aSign = extractFloatx80Sign( a );
	2883	if ( 0x401E < aExp ) {
	2884	if ( ( aExp == 0x7FFF ) && (bits64) ( aSig<<1 ) ) aSign = 0;
	2885	goto invalid;
	2886	}
	2887	else if ( aExp < 0x3FFF ) {
	2888	if ( aExp \|\| aSig ) float_exception_flags \|= float_flag_inexact;
	2889	return 0;
	2890	}
	2891	shiftCount = 0x403E - aExp;
	2892	savedASig = aSig;
	2893	aSig >>= shiftCount;
	2894	z = aSig;
	2895	if ( aSign ) z = - z;
	2896	if ( ( z < 0 ) ^ aSign ) {
	2897	invalid:
	2898	float_raise( float_flag_invalid );
	2899	return aSign ? (sbits32) 0x80000000 : 0x7FFFFFFF;
	2900	}
	2901	if ( ( aSig<<shiftCount ) != savedASig ) {
	2902	float_exception_flags \|= float_flag_inexact;
	2903	}
	2904	return z;
	2905
	2906	}
	2907
	2908	/*----------------------------------------------------------------------------
	2909	\| Returns the result of converting the extended double-precision floating-
	2910	\| point value `a' to the 64-bit two's complement integer format. The
	2911	\| conversion is performed according to the IEC/IEEE Standard for Binary
	2912	\| Floating-Point Arithmetic---which means in particular that the conversion
	2913	\| is rounded according to the current rounding mode. If `a' is a NaN,
	2914	\| the largest positive integer is returned. Otherwise, if the conversion
	2915	\| overflows, the largest integer with the same sign as `a' is returned.
	2916	----------------------------------------------------------------------------/
	2917
	2918	int64 floatx80_to_int64( floatx80 a )
	2919	{
	2920	flag aSign;
	2921	int32 aExp, shiftCount;
	2922	bits64 aSig, aSigExtra;
	2923
	2924	aSig = extractFloatx80Frac( a );
	2925	aExp = extractFloatx80Exp( a );
	2926	aSign = extractFloatx80Sign( a );
	2927	shiftCount = 0x403E - aExp;
	2928	if ( shiftCount <= 0 ) {
	2929	if ( shiftCount ) {
	2930	float_raise( float_flag_invalid );
	2931	if ( ! aSign
	2932	\|\| ( ( aExp == 0x7FFF )
	2933	&& ( aSig != LIT64( 0x8000000000000000 ) ) )
	2934	) {
	2935	return LIT64( 0x7FFFFFFFFFFFFFFF );
	2936	}
	2937	return (sbits64) LIT64( 0x8000000000000000 );
	2938	}
	2939	aSigExtra = 0;
	2940	}
	2941	else {
	2942	shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra );
	2943	}
	2944	return roundAndPackInt64( aSign, aSig, aSigExtra );
	2945
	2946	}
	2947
	2948	/*----------------------------------------------------------------------------
	2949	\| Returns the result of converting the extended double-precision floating-
	2950	\| point value `a' to the 64-bit two's complement integer format. The
	2951	\| conversion is performed according to the IEC/IEEE Standard for Binary
	2952	\| Floating-Point Arithmetic, except that the conversion is always rounded
	2953	\| toward zero. If `a' is a NaN, the largest positive integer is returned.
	2954	\| Otherwise, if the conversion overflows, the largest integer with the same
	2955	\| sign as `a' is returned.
	2956	----------------------------------------------------------------------------/
	2957
	2958	int64 floatx80_to_int64_round_to_zero( floatx80 a )
	2959	{
	2960	flag aSign;
	2961	int32 aExp, shiftCount;
	2962	bits64 aSig;
	2963	int64 z;
	2964
	2965	aSig = extractFloatx80Frac( a );
	2966	aExp = extractFloatx80Exp( a );
	2967	aSign = extractFloatx80Sign( a );
	2968	shiftCount = aExp - 0x403E;
	2969	if ( 0 <= shiftCount ) {
	2970	aSig &= LIT64( 0x7FFFFFFFFFFFFFFF );
	2971	if ( ( a.high != 0xC03E ) \|\| aSig ) {
	2972	float_raise( float_flag_invalid );
	2973	if ( ! aSign \|\| ( ( aExp == 0x7FFF ) && aSig ) ) {
	2974	return LIT64( 0x7FFFFFFFFFFFFFFF );
	2975	}
	2976	}
	2977	return (sbits64) LIT64( 0x8000000000000000 );
	2978	}
	2979	else if ( aExp < 0x3FFF ) {
	2980	if ( aExp \| aSig ) float_exception_flags \|= float_flag_inexact;
	2981	return 0;
	2982	}
	2983	z = aSig>>( - shiftCount );
	2984	if ( (bits64) ( aSig<<( shiftCount & 63 ) ) ) {
	2985	float_exception_flags \|= float_flag_inexact;
	2986	}
	2987	if ( aSign ) z = - z;
	2988	return z;
	2989
	2990	}
	2991
	2992	/*----------------------------------------------------------------------------
	2993	\| Returns the result of converting the extended double-precision floating-
	2994	\| point value `a' to the single-precision floating-point format. The
	2995	\| conversion is performed according to the IEC/IEEE Standard for Binary
	2996	\| Floating-Point Arithmetic.
	2997	----------------------------------------------------------------------------/
	2998
	2999	float32 floatx80_to_float32( floatx80 a )
	3000	{
	3001	flag aSign;
	3002	int32 aExp;
	3003	bits64 aSig;
	3004
	3005	aSig = extractFloatx80Frac( a );
	3006	aExp = extractFloatx80Exp( a );
	3007	aSign = extractFloatx80Sign( a );
	3008	if ( aExp == 0x7FFF ) {
	3009	if ( (bits64) ( aSig<<1 ) ) {
	3010	return commonNaNToFloat32( floatx80ToCommonNaN( a ) );
	3011	}
	3012	return packFloat32( aSign, 0xFF, 0 );
	3013	}
	3014	shift64RightJamming( aSig, 33, &aSig );
	3015	if ( aExp \|\| aSig ) aExp -= 0x3F81;
	3016	return roundAndPackFloat32( aSign, aExp, aSig );
	3017
	3018	}
	3019
	3020	/*----------------------------------------------------------------------------
	3021	\| Returns the result of converting the extended double-precision floating-
	3022	\| point value `a' to the double-precision floating-point format. The
	3023	\| conversion is performed according to the IEC/IEEE Standard for Binary
	3024	\| Floating-Point Arithmetic.
	3025	----------------------------------------------------------------------------/
	3026
	3027	float64 floatx80_to_float64( floatx80 a )
	3028	{
	3029	flag aSign;
	3030	int32 aExp;
	3031	bits64 aSig, zSig;
	3032
	3033	aSig = extractFloatx80Frac( a );
	3034	aExp = extractFloatx80Exp( a );
	3035	aSign = extractFloatx80Sign( a );
	3036	if ( aExp == 0x7FFF ) {
	3037	if ( (bits64) ( aSig<<1 ) ) {
	3038	return commonNaNToFloat64( floatx80ToCommonNaN( a ) );
	3039	}
	3040	return packFloat64( aSign, 0x7FF, 0 );
	3041	}
	3042	shift64RightJamming( aSig, 1, &zSig );
	3043	if ( aExp \|\| aSig ) aExp -= 0x3C01;
	3044	return roundAndPackFloat64( aSign, aExp, zSig );
	3045
	3046	}
	3047
	3048	#ifdef FLOAT128
	3049
	3050	/*----------------------------------------------------------------------------
	3051	\| Returns the result of converting the extended double-precision floating-
	3052	\| point value `a' to the quadruple-precision floating-point format. The
	3053	\| conversion is performed according to the IEC/IEEE Standard for Binary
	3054	\| Floating-Point Arithmetic.
	3055	----------------------------------------------------------------------------/
	3056
	3057	float128 floatx80_to_float128( floatx80 a )
	3058	{
	3059	flag aSign;
	3060	int16 aExp;
	3061	bits64 aSig, zSig0, zSig1;
	3062
	3063	aSig = extractFloatx80Frac( a );
	3064	aExp = extractFloatx80Exp( a );
	3065	aSign = extractFloatx80Sign( a );
	3066	if ( ( aExp == 0x7FFF ) && (bits64) ( aSig<<1 ) ) {
	3067	return commonNaNToFloat128( floatx80ToCommonNaN( a ) );
	3068	}
	3069	shift128Right( aSig<<1, 0, 16, &zSig0, &zSig1 );
	3070	return packFloat128( aSign, aExp, zSig0, zSig1 );
	3071
	3072	}
	3073
	3074	#endif
	3075
	3076	/*----------------------------------------------------------------------------
	3077	\| Rounds the extended double-precision floating-point value `a' to an integer,
	3078	\| and returns the result as an extended double-precision floating-point
	3079	\| value. The operation is performed according to the IEC/IEEE Standard for
	3080	\| Binary Floating-Point Arithmetic.
	3081	----------------------------------------------------------------------------/
	3082
	3083	floatx80 floatx80_round_to_int( floatx80 a )
	3084	{
		/* Rounds `a' to an integral value under float_rounding_mode and returns
		   it as a floatx80.  float_flag_inexact is set for every non-zero input
		   with exponent below 0x3FFF and whenever the rounded significand
		   differs from the input significand. */
	3085	flag aSign;
	3086	int32 aExp;
	3087	bits64 lastBitMask, roundBitsMask;
	3088	int8 roundingMode;
	3089	floatx80 z;
	3090
	3091	aExp = extractFloatx80Exp( a );
		/* Exponent 0x403E or above: returned as is, except that a NaN goes
		   through propagateFloatx80NaN.  Presumably no fraction bits remain at
		   this magnitude. */
	3092	if ( 0x403E <= aExp ) {
	3093	if ( ( aExp == 0x7FFF ) && (bits64) ( extractFloatx80Frac( a )<<1 ) ) {
	3094	return propagateFloatx80NaN( a, a );
	3095	}
	3096	return a;
	3097	}
		/* Exponent below 0x3FFF: the result is a zero or a value packed with
		   exponent 0x3FFF and significand 0x8000000000000000, picked by
		   rounding mode and sign. */
	3098	if ( aExp < 0x3FFF ) {
		/* Zero exponent with no bits below the top significand bit: returned
		   unchanged, no flag raised. */
	3099	if ( ( aExp == 0 )
	3100	&& ( (bits64) ( extractFloatx80Frac( a )<<1 ) == 0 ) ) {
	3101	return a;
	3102	}
	3103	float_exception_flags \|= float_flag_inexact;
	3104	aSign = extractFloatx80Sign( a );
	3105	switch ( float_rounding_mode ) {
	3106	case float_round_nearest_even:
		/* Only exponent 0x3FFE with a non-zero fraction rounds away from
		   zero; everything else falls out of the switch to the zero return. */
	3107	if ( ( aExp == 0x3FFE ) && (bits64) ( extractFloatx80Frac( a )<<1 )
	3108	) {
	3109	return
	3110	packFloatx80( aSign, 0x3FFF, LIT64( 0x8000000000000000 ) );
	3111	}
	3112	break;
	3113	case float_round_down:
	3114	return
	3115	aSign ?
	3116	packFloatx80( 1, 0x3FFF, LIT64( 0x8000000000000000 ) )
	3117	: packFloatx80( 0, 0, 0 );
	3118	case float_round_up:
	3119	return
	3120	aSign ? packFloatx80( 1, 0, 0 )
	3121	: packFloatx80( 0, 0x3FFF, LIT64( 0x8000000000000000 ) );
	3122	}
		/* Any other rounding mode, or nearest-even without the round-away
		   case: a zero carrying the sign of `a'. */
	3123	return packFloatx80( aSign, 0, 0 );
	3124	}
		/* General case: lastBitMask marks the lowest significand bit that is
		   kept; roundBitsMask covers every bit below it. */
	3125	lastBitMask = 1;
	3126	lastBitMask <<= 0x403E - aExp;
	3127	roundBitsMask = lastBitMask - 1;
	3128	z = a;
	3129	roundingMode = float_rounding_mode;
	3130	if ( roundingMode == float_round_nearest_even ) {
		/* Add half of the last kept bit; if the discarded bits are then all
		   zero (a tie), clear the last kept bit to land on the even value. */
	3131	z.low += lastBitMask>>1;
	3132	if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask;
	3133	}
	3134	else if ( roundingMode != float_round_to_zero ) {
		/* Directed modes: bump the magnitude when the sign and the
		   round-up test disagree (negative under round-down, positive under
		   round-up). */
	3135	if ( extractFloatx80Sign( z ) ^ ( roundingMode == float_round_up ) ) {
	3136	z.low += roundBitsMask;
	3137	}
	3138	}
	3139	z.low &= ~ roundBitsMask;
		/* A significand of zero means the addition carried out of the top
		   bit: step the exponent word and restore the top bit. */
	3140	if ( z.low == 0 ) {
	3141	++z.high;
	3142	z.low = LIT64( 0x8000000000000000 );
	3143	}
	3144	if ( z.low != a.low ) float_exception_flags \|= float_flag_inexact;
	3145	return z;
	3146
	3147	}
	3148
	3149	/*----------------------------------------------------------------------------
	3150	\| Returns the result of adding the absolute values of the extended double-
	3151	\| precision floating-point values `a' and `b'. If `zSign' is 1, the sum is
	3152	\| negated before being returned. `zSign' is ignored if the result is a NaN.
	3153	\| The addition is performed according to the IEC/IEEE Standard for Binary
	3154	\| Floating-Point Arithmetic.
	3155	----------------------------------------------------------------------------/
	3156
	3157	static floatx80 addFloatx80Sigs( floatx80 a, floatx80 b, flag zSign )
	3158	{
		/* Adds the magnitudes of `a' and `b'; `zSign' is handed to the final
		   rounding step as the sign of the sum.  The operand with the smaller
		   exponent is shifted right by the exponent difference before the add.
		   zSig1 receives the second output of shift64ExtraRightJamming
		   (presumably the shifted-out bits; that helper is defined elsewhere). */
	3159	int32 aExp, bExp, zExp;
	3160	bits64 aSig, bSig, zSig0, zSig1;
	3161	int32 expDiff;
	3162
	3163	aSig = extractFloatx80Frac( a );
	3164	aExp = extractFloatx80Exp( a );
	3165	bSig = extractFloatx80Frac( b );
	3166	bExp = extractFloatx80Exp( b );
	3167	expDiff = aExp - bExp;
		/* Case 1: `a' has the larger exponent. */
	3168	if ( 0 < expDiff ) {
	3169	if ( aExp == 0x7FFF ) {
	3170	if ( (bits64) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b );
	3171	return a;
	3172	}
		/* A zero exponent on `b' reduces the shift distance by one. */
	3173	if ( bExp == 0 ) --expDiff;
	3174	shift64ExtraRightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
	3175	zExp = aExp;
	3176	}
		/* Case 2: `b' has the larger exponent (mirror image of case 1). */
	3177	else if ( expDiff < 0 ) {
	3178	if ( bExp == 0x7FFF ) {
	3179	if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b );
		/* `b' is an infinity: the result is an infinity with sign zSign. */
	3180	return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
	3181	}
	3182	if ( aExp == 0 ) ++expDiff;
	3183	shift64ExtraRightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
	3184	zExp = bExp;
	3185	}
		/* Case 3: equal exponents, nothing to align. */
	3186	else {
	3187	if ( aExp == 0x7FFF ) {
	3188	if ( (bits64) ( ( aSig \| bSig )<<1 ) ) {
	3189	return propagateFloatx80NaN( a, b );
	3190	}
	3191	return a;
	3192	}
	3193	zSig1 = 0;
	3194	zSig0 = aSig + bSig;
		/* Both exponents zero: the sum is normalized and goes straight to
		   rounding. */
	3195	if ( aExp == 0 ) {
	3196	normalizeFloatx80Subnormal( zSig0, &zExp, &zSig0 );
	3197	goto roundAndPack;
	3198	}
	3199	zExp = aExp;
		/* Otherwise this path always takes the shift-right-by-one step. */
	3200	goto shiftRight1;
	3201	}
		/* Cases 1 and 2 join here.  If the top bit of the sum is set it is
		   rounded as is; otherwise control falls into shiftRight1. */
	3202	zSig0 = aSig + bSig;
	3203	if ( (sbits64) zSig0 < 0 ) goto roundAndPack;
	3204	shiftRight1:
		/* Shift the sum right one place, force the top bit on and bump the
		   exponent. */
	3205	shift64ExtraRightJamming( zSig0, zSig1, 1, &zSig0, &zSig1 );
	3206	zSig0 \|= LIT64( 0x8000000000000000 );
	3207	++zExp;
	3208	roundAndPack:
	3209	return
	3210	roundAndPackFloatx80(
	3211	floatx80_rounding_precision, zSign, zExp, zSig0, zSig1 );
	3212
	3213	}
	3214
	3215	/*----------------------------------------------------------------------------
	3216	\| Returns the result of subtracting the absolute values of the extended
	3217	\| double-precision floating-point values `a' and `b'. If `zSign' is 1, the
	3218	\| difference is negated before being returned. `zSign' is ignored if the
	3219	\| result is a NaN. The subtraction is performed according to the IEC/IEEE
	3220	\| Standard for Binary Floating-Point Arithmetic.
	3221	----------------------------------------------------------------------------/
	3222
	3223	static floatx80 subFloatx80Sigs( floatx80 a, floatx80 b, flag zSign )
	3224	{
		/* Subtracts the magnitude of `b' from that of `a'.  `zSign' is the sign
		   used when `a' has the larger magnitude; it is flipped when `b' does.
		   The smaller operand is aligned with shift128RightJamming and the
		   128-bit difference is passed to normalizeRoundAndPackFloatx80. */
	3225	int32 aExp, bExp, zExp;
	3226	bits64 aSig, bSig, zSig0, zSig1;
	3227	int32 expDiff;
	3228	floatx80 z;
	3229
	3230	aSig = extractFloatx80Frac( a );
	3231	aExp = extractFloatx80Exp( a );
	3232	bSig = extractFloatx80Frac( b );
	3233	bExp = extractFloatx80Exp( b );
	3234	expDiff = aExp - bExp;
	3235	if ( 0 < expDiff ) goto aExpBigger;
	3236	if ( expDiff < 0 ) goto bExpBigger;
		/* Equal exponents from here to the bExpBigger label. */
	3237	if ( aExp == 0x7FFF ) {
	3238	if ( (bits64) ( ( aSig \| bSig )<<1 ) ) {
	3239	return propagateFloatx80NaN( a, b );
	3240	}
		/* Neither operand is a NaN here, so both are infinities: invalid,
		   result is the default NaN. */
	3241	float_raise( float_flag_invalid );
	3242	z.low = floatx80_default_nan_low;
	3243	z.high = floatx80_default_nan_high;
	3244	return z;
	3245	}
		/* With both exponents zero, 1 is used as the working exponent. */
	3246	if ( aExp == 0 ) {
	3247	aExp = 1;
	3248	bExp = 1;
	3249	}
	3250	zSig1 = 0;
	3251	if ( bSig < aSig ) goto aBigger;
	3252	if ( aSig < bSig ) goto bBigger;
		/* Equal magnitudes: the result is a zero, negative only when the
		   rounding mode is float_round_down. */
	3253	return packFloatx80( float_rounding_mode == float_round_down, 0, 0 );
	3254	bExpBigger:
	3255	if ( bExp == 0x7FFF ) {
	3256	if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b );
		/* `b' is an infinity: the result is an infinity with the opposite
		   of zSign. */
	3257	return packFloatx80( zSign ^ 1, 0x7FFF, LIT64( 0x8000000000000000 ) );
	3258	}
		/* A zero exponent on `a' reduces the shift distance by one. */
	3259	if ( aExp == 0 ) ++expDiff;
	3260	shift128RightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
	3261	bBigger:
		/* b - a, with the sign flipped. */
	3262	sub128( bSig, 0, aSig, zSig1, &zSig0, &zSig1 );
	3263	zExp = bExp;
	3264	zSign ^= 1;
	3265	goto normalizeRoundAndPack;
	3266	aExpBigger:
	3267	if ( aExp == 0x7FFF ) {
	3268	if ( (bits64) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b );
	3269	return a;
	3270	}
	3271	if ( bExp == 0 ) --expDiff;
	3272	shift128RightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
	3273	aBigger:
		/* a - b, sign unchanged. */
	3274	sub128( aSig, 0, bSig, zSig1, &zSig0, &zSig1 );
	3275	zExp = aExp;
	3276	normalizeRoundAndPack:
	3277	return
	3278	normalizeRoundAndPackFloatx80(
	3279	floatx80_rounding_precision, zSign, zExp, zSig0, zSig1 );
	3280
	3281	}
	3282
	3283	/*----------------------------------------------------------------------------
	3284	\| Returns the result of adding the extended double-precision floating-point
	3285	\| values `a' and `b'. The operation is performed according to the IEC/IEEE
	3286	\| Standard for Binary Floating-Point Arithmetic.
	3287	----------------------------------------------------------------------------/
	3288
	3289	floatx80 floatx80_add( floatx80 a, floatx80 b )
	3290	{
	3291	flag aSign, bSign;
	3292
	3293	aSign = extractFloatx80Sign( a );
	3294	bSign = extractFloatx80Sign( b );
	3295	if ( aSign == bSign ) {
	3296	return addFloatx80Sigs( a, b, aSign );
	3297	}
	3298	else {
	3299	return subFloatx80Sigs( a, b, aSign );
	3300	}
	3301
	3302	}
	3303
	3304	/*----------------------------------------------------------------------------
	3305	\| Returns the result of subtracting the extended double-precision floating-
	3306	\| point values `a' and `b'. The operation is performed according to the
	3307	\| IEC/IEEE Standard for Binary Floating-Point Arithmetic.
	3308	----------------------------------------------------------------------------/
	3309
	3310	floatx80 floatx80_sub( floatx80 a, floatx80 b )
	3311	{
	3312	flag aSign, bSign;
	3313
	3314	aSign = extractFloatx80Sign( a );
	3315	bSign = extractFloatx80Sign( b );
	3316	if ( aSign == bSign ) {
	3317	return subFloatx80Sigs( a, b, aSign );
	3318	}
	3319	else {
	3320	return addFloatx80Sigs( a, b, aSign );
	3321	}
	3322
	3323	}
	3324
	3325	/*----------------------------------------------------------------------------
	3326	\| Returns the result of multiplying the extended double-precision floating-
	3327	\| point values `a' and `b'. The operation is performed according to the
	3328	\| IEC/IEEE Standard for Binary Floating-Point Arithmetic.
	3329	----------------------------------------------------------------------------/
	3330
	3331	floatx80 floatx80_mul( floatx80 a, floatx80 b )
	3332	{
	3333	flag aSign, bSign, zSign;
	3334	int32 aExp, bExp, zExp;
	3335	bits64 aSig, bSig, zSig0, zSig1;
	3336	floatx80 z;
	3337
	3338	aSig = extractFloatx80Frac( a );
	3339	aExp = extractFloatx80Exp( a );
	3340	aSign = extractFloatx80Sign( a );
	3341	bSig = extractFloatx80Frac( b );
	3342	bExp = extractFloatx80Exp( b );
	3343	bSign = extractFloatx80Sign( b );
	3344	zSign = aSign ^ bSign;
	3345	if ( aExp == 0x7FFF ) {
	3346	if ( (bits64) ( aSig<<1 )
	3347	\|\| ( ( bExp == 0x7FFF ) && (bits64) ( bSig<<1 ) ) ) {
	3348	return propagateFloatx80NaN( a, b );
	3349	}
	3350	if ( ( bExp \| bSig ) == 0 ) goto invalid;
	3351	return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
	3352	}
	3353	if ( bExp == 0x7FFF ) {
	3354	if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b );
	3355	if ( ( aExp \| aSig ) == 0 ) {
	3356	invalid:
	3357	float_raise( float_flag_invalid );
	3358	z.low = floatx80_default_nan_low;
	3359	z.high = floatx80_default_nan_high;
	3360	return z;
	3361	}
	3362	return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
	3363	}
	3364	if ( aExp == 0 ) {
	3365	if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
	3366	normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
	3367	}
	3368	if ( bExp == 0 ) {
	3369	if ( bSig == 0 ) return packFloatx80( zSign, 0, 0 );
	3370	normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
	3371	}
	3372	zExp = aExp + bExp - 0x3FFE;
	3373	mul64To128( aSig, bSig, &zSig0, &zSig1 );
	3374	if ( 0 < (sbits64) zSig0 ) {
	3375	shortShift128Left( zSig0, zSig1, 1, &zSig0, &zSig1 );
	3376	--zExp;
	3377	}
	3378	return
	3379	roundAndPackFloatx80(
	3380	floatx80_rounding_precision, zSign, zExp, zSig0, zSig1 );
	3381
	3382	}
	3383
	3384	/*----------------------------------------------------------------------------
	3385	\| Returns the result of dividing the extended double-precision floating-point
	3386	\| value `a' by the corresponding value `b'. The operation is performed
	3387	\| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
	3388	----------------------------------------------------------------------------/
	3389
	3390	floatx80 floatx80_div( floatx80 a, floatx80 b )
	3391	{
		/* Divides `a' by `b'.  Special operands are handled first.  Then a
		   128-bit quotient (zSig0:zSig1) is built from two 64-bit estimates,
		   each corrected downward until its remainder is non-negative, and
		   handed to roundAndPackFloatx80. */
	3392	flag aSign, bSign, zSign;
	3393	int32 aExp, bExp, zExp;
	3394	bits64 aSig, bSig, zSig0, zSig1;
	3395	bits64 rem0, rem1, rem2, term0, term1, term2;
	3396	floatx80 z;
	3397
	3398	aSig = extractFloatx80Frac( a );
	3399	aExp = extractFloatx80Exp( a );
	3400	aSign = extractFloatx80Sign( a );
	3401	bSig = extractFloatx80Frac( b );
	3402	bExp = extractFloatx80Exp( b );
	3403	bSign = extractFloatx80Sign( b );
	3404	zSign = aSign ^ bSign;
		/* `a' is a NaN or an infinity. */
	3405	if ( aExp == 0x7FFF ) {
	3406	if ( (bits64) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b );
	3407	if ( bExp == 0x7FFF ) {
	3408	if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b );
		/* Infinity divided by infinity is invalid. */
	3409	goto invalid;
	3410	}
	3411	return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
	3412	}
		/* `b' is a NaN or an infinity; dividing by infinity gives a signed
		   zero. */
	3413	if ( bExp == 0x7FFF ) {
	3414	if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b );
	3415	return packFloatx80( zSign, 0, 0 );
	3416	}
	3417	if ( bExp == 0 ) {
	3418	if ( bSig == 0 ) {
		/* Zero divisor: 0/0 is invalid (default NaN); anything else raises
		   divide-by-zero and returns a signed infinity. */
	3419	if ( ( aExp \| aSig ) == 0 ) {
	3420	invalid:
	3421	float_raise( float_flag_invalid );
	3422	z.low = floatx80_default_nan_low;
	3423	z.high = floatx80_default_nan_high;
	3424	return z;
	3425	}
	3426	float_raise( float_flag_divbyzero );
	3427	return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
	3428	}
	3429	normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
	3430	}
	3431	if ( aExp == 0 ) {
	3432	if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
	3433	normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
	3434	}
	3435	zExp = aExp - bExp + 0x3FFE;
	3436	rem1 = 0;
		/* If the dividend significand is not below the divisor's, halve it
		   and bump the exponent (presumably so the quotient fits 64 bits). */
	3437	if ( bSig <= aSig ) {
	3438	shift128Right( aSig, 0, 1, &aSig, &rem1 );
	3439	++zExp;
	3440	}
		/* High quotient word: estimate, form the remainder, and step the
		   estimate down while that remainder is negative. */
	3441	zSig0 = estimateDiv128To64( aSig, rem1, bSig );
	3442	mul64To128( bSig, zSig0, &term0, &term1 );
	3443	sub128( aSig, rem1, term0, term1, &rem0, &rem1 );
	3444	while ( (sbits64) rem0 < 0 ) {
	3445	--zSig0;
	3446	add128( rem0, rem1, 0, bSig, &rem0, &rem1 );
	3447	}
		/* Low quotient word.  The estimate is refined only when
		   (bits64)( zSig1<<1 ) is at most 8; in that case a non-zero final
		   remainder is also folded into its lowest bit. */
	3448	zSig1 = estimateDiv128To64( rem1, 0, bSig );
	3449	if ( (bits64) ( zSig1<<1 ) <= 8 ) {
	3450	mul64To128( bSig, zSig1, &term1, &term2 );
	3451	sub128( rem1, 0, term1, term2, &rem1, &rem2 );
	3452	while ( (sbits64) rem1 < 0 ) {
	3453	--zSig1;
	3454	add128( rem1, rem2, 0, bSig, &rem1, &rem2 );
	3455	}
	3456	zSig1 \|= ( ( rem1 \| rem2 ) != 0 );
	3457	}
	3458	return
	3459	roundAndPackFloatx80(
	3460	floatx80_rounding_precision, zSign, zExp, zSig0, zSig1 );
	3461
	3462	}
	3463
	3464	/*----------------------------------------------------------------------------
	3465	\| Returns the remainder of the extended double-precision floating-point value
	3466	\| `a' with respect to the corresponding value `b'. The operation is performed
	3467	\| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
	3468	----------------------------------------------------------------------------/
	3469
	3470	floatx80 floatx80_rem( floatx80 a, floatx80 b )
	3471	{
		/* Computes the remainder of `a' with respect to `b'.  The sign of `b'
		   is never read.  The dividend significand is reduced with partial
		   quotient estimates, then the function keeps whichever of the remainder
		   and (divisor - remainder) is smaller, flipping the sign for the
		   latter. */
	3472	flag aSign, zSign;
	3473	int32 aExp, bExp, expDiff;
	3474	bits64 aSig0, aSig1, bSig;
	3475	bits64 q, term0, term1, alternateASig0, alternateASig1;
	3476	floatx80 z;
	3477
	3478	aSig0 = extractFloatx80Frac( a );
	3479	aExp = extractFloatx80Exp( a );
	3480	aSign = extractFloatx80Sign( a );
	3481	bSig = extractFloatx80Frac( b );
	3482	bExp = extractFloatx80Exp( b );
	3483	// bSign = extractFloatx80Sign( b );
		/* `a' is a NaN or an infinity: NaNs propagate, an infinite dividend
		   is invalid. */
	3484	if ( aExp == 0x7FFF ) {
	3485	if ( (bits64) ( aSig0<<1 )
	3486	\|\| ( ( bExp == 0x7FFF ) && (bits64) ( bSig<<1 ) ) ) {
	3487	return propagateFloatx80NaN( a, b );
	3488	}
	3489	goto invalid;
	3490	}
		/* `b' is a NaN or an infinity: with an infinite divisor `a' is
		   returned unchanged. */
	3491	if ( bExp == 0x7FFF ) {
	3492	if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b );
	3493	return a;
	3494	}
	3495	if ( bExp == 0 ) {
		/* A zero divisor is invalid; the result is the default NaN. */
	3496	if ( bSig == 0 ) {
	3497	invalid:
	3498	float_raise( float_flag_invalid );
	3499	z.low = floatx80_default_nan_low;
	3500	z.high = floatx80_default_nan_high;
	3501	return z;
	3502	}
	3503	normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
	3504	}
	3505	if ( aExp == 0 ) {
		/* Zero exponent and no bits below the top significand bit: `a' is
		   returned unchanged. */
	3506	if ( (bits64) ( aSig0<<1 ) == 0 ) return a;
	3507	normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
	3508	}
	3509	bSig \|= LIT64( 0x8000000000000000 );
	3510	zSign = aSign;
	3511	expDiff = aExp - bExp;
	3512	aSig1 = 0;
	3513	if ( expDiff < 0 ) {
		/* Exponent of `a' more than one below that of `b': `a' is returned
		   as the remainder.  Exactly one below: halve the dividend
		   significand and continue with a difference of 0. */
	3514	if ( expDiff < -1 ) return a;
	3515	shift128Right( aSig0, 0, 1, &aSig0, &aSig1 );
	3516	expDiff = 0;
	3517	}
		/* First quotient bit: subtract the divisor once if it fits. */
	3518	q = ( bSig <= aSig0 );
	3519	if ( q ) aSig0 -= bSig;
	3520	expDiff -= 64;
		/* Main reduction: each pass uses a quotient estimate lowered by 2
		   (floored at 0), subtracts q * bSig and shifts the remainder left by
		   62 bits. */
	3521	while ( 0 < expDiff ) {
	3522	q = estimateDiv128To64( aSig0, aSig1, bSig );
	3523	q = ( 2 < q ) ? q - 2 : 0;
	3524	mul64To128( bSig, q, &term0, &term1 );
	3525	sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
	3526	shortShift128Left( aSig0, aSig1, 62, &aSig0, &aSig1 );
	3527	expDiff -= 62;
	3528	}
	3529	expDiff += 64;
	3530	if ( 0 < expDiff ) {
		/* Final partial step for the remaining expDiff bits: only the top
		   expDiff bits of the estimate are used; the loop below then raises q
		   while the scaled divisor still fits into the remainder. */
	3531	q = estimateDiv128To64( aSig0, aSig1, bSig );
	3532	q = ( 2 < q ) ? q - 2 : 0;
	3533	q >>= 64 - expDiff;
	3534	mul64To128( bSig, q<<( 64 - expDiff ), &term0, &term1 );
	3535	sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
	3536	shortShift128Left( 0, bSig, 64 - expDiff, &term0, &term1 );
	3537	while ( le128( term0, term1, aSig0, aSig1 ) ) {
	3538	++q;
	3539	sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
	3540	}
	3541	}
	3542	else {
		/* No bits left to process: the comparison below uses the divisor
		   itself as term0:term1. */
	3543	term1 = 0;
	3544	term0 = bSig;
	3545	}
		/* alternate = term - remainder.  It replaces the remainder, with the
		   sign flipped, when it is smaller, or when the two are equal and q is
		   odd. */
	3546	sub128( term0, term1, aSig0, aSig1, &alternateASig0, &alternateASig1 );
	3547	if ( lt128( alternateASig0, alternateASig1, aSig0, aSig1 )
	3548	\|\| ( eq128( alternateASig0, alternateASig1, aSig0, aSig1 )
	3549	&& ( q & 1 ) )
	3550	) {
	3551	aSig0 = alternateASig0;
	3552	aSig1 = alternateASig1;
	3553	zSign = ! zSign;
	3554	}
		/* The precision argument here is the literal 80, not
		   floatx80_rounding_precision. */
	3555	return
	3556	normalizeRoundAndPackFloatx80(
	3557	80, zSign, bExp + expDiff, aSig0, aSig1 );
	3558
	3559	}
	3560
	3561	/*----------------------------------------------------------------------------
	3562	\| Returns the square root of the extended double-precision floating-point
	3563	\| value `a'. The operation is performed according to the IEC/IEEE Standard
	3564	\| for Binary Floating-Point Arithmetic.
	3565	----------------------------------------------------------------------------/
	3566
	3567	floatx80 floatx80_sqrt( floatx80 a )
	3568	{
	3569	flag aSign;
	3570	int32 aExp, zExp;
	3571	bits64 aSig0, aSig1, zSig0, zSig1, doubleZSig0;
	3572	bits64 rem0, rem1, rem2, rem3, term0, term1, term2, term3;
	3573	floatx80 z;
	3574
	3575	aSig0 = extractFloatx80Frac( a );
	3576	aExp = extractFloatx80Exp( a );
	3577	aSign = extractFloatx80Sign( a );
	3578	if ( aExp == 0x7FFF ) {
	3579	if ( (bits64) ( aSig0<<1 ) ) return propagateFloatx80NaN( a, a );
	3580	if ( ! aSign ) return a;
	3581	goto invalid;
	3582	}
	3583	if ( aSign ) {
	3584	if ( ( aExp \| aSig0 ) == 0 ) return a;
	3585	invalid:
	3586	float_raise( float_flag_invalid );
	3587	z.low = floatx80_default_nan_low;
	3588	z.high = floatx80_default_nan_high;
	3589	return z;
	3590	}
	3591	if ( aExp == 0 ) {
	3592	if ( aSig0 == 0 ) return packFloatx80( 0, 0, 0 );
	3593	normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
	3594	}
	3595	zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFF;
	3596	zSig0 = estimateSqrt32( aExp, aSig0>>32 );
	3597	shift128Right( aSig0, 0, 2 + ( aExp & 1 ), &aSig0, &aSig1 );
	3598	zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );
	3599	doubleZSig0 = zSig0<<1;
	3600	mul64To128( zSig0, zSig0, &term0, &term1 );
	3601	sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
	3602	while ( (sbits64) rem0 < 0 ) {
	3603	--zSig0;
	3604	doubleZSig0 -= 2;
	3605	add128( rem0, rem1, zSig0>>63, doubleZSig0 \| 1, &rem0, &rem1 );
	3606	}
	3607	zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );
	3608	if ( ( zSig1 & LIT64( 0x3FFFFFFFFFFFFFFF ) ) <= 5 ) {
	3609	if ( zSig1 == 0 ) zSig1 = 1;
	3610	mul64To128( doubleZSig0, zSig1, &term1, &term2 );
	3611	sub128( rem1, 0, term1, term2, &rem1, &rem2 );
	3612	mul64To128( zSig1, zSig1, &term2, &term3 );
	3613	sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
	3614	while ( (sbits64) rem1 < 0 ) {
	3615	--zSig1;
	3616	shortShift128Left( 0, zSig1, 1, &term2, &term3 );
	3617	term3 \|= 1;
	3618	term2 \|= doubleZSig0;
	3619	add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
	3620	}
	3621	zSig1 \|= ( ( rem1 \| rem2 \| rem3 ) != 0 );
	3622	}
	3623	shortShift128Left( 0, zSig1, 1, &zSig0, &zSig1 );
	3624	zSig0 \|= doubleZSig0;
	3625	return
	3626	roundAndPackFloatx80(
	3627	floatx80_rounding_precision, 0, zExp, zSig0, zSig1 );
	3628
	3629	}
	3630
	3631	/*----------------------------------------------------------------------------
	3632	\| Returns 1 if the extended double-precision floating-point value `a' is
	3633	\| equal to the corresponding value `b', and 0 otherwise. The comparison is
	3634	\| performed according to the IEC/IEEE Standard for Binary Floating-Point
	3635	\| Arithmetic.
	3636	----------------------------------------------------------------------------/
	3637
	3638	flag floatx80_eq( floatx80 a, floatx80 b )
	3639	{
	3640	if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
	3641	&& (bits64) ( extractFloatx80Frac( a )<<1 ) )
	3642	\|\| ( ( extractFloatx80Exp( b ) == 0x7FFF )
	3643	&& (bits64) ( extractFloatx80Frac( b )<<1 ) )
	3644	) {
	3645	if ( floatx80_is_signaling_nan( a )
	3646	\|\| floatx80_is_signaling_nan( b ) ) {
	3647	float_raise( float_flag_invalid );
	3648	}
	3649	return 0;
	3650	}
	3651	return
	3652	( a.low == b.low )
	3653	&& ( ( a.high == b.high )
	3654	\|\| ( ( a.low == 0 )
	3655	&& ( (bits16) ( ( a.high \| b.high )<<1 ) == 0 ) )
	3656	);
	3657
	3658	}
	3659
	3660	/*----------------------------------------------------------------------------
	3661	\| Returns 1 if the extended double-precision floating-point value `a' is
	3662	\| less than or equal to the corresponding value `b', and 0 otherwise. The
	3663	\| comparison is performed according to the IEC/IEEE Standard for Binary
	3664	\| Floating-Point Arithmetic.
	3665	----------------------------------------------------------------------------/
	3666
	3667	flag floatx80_le( floatx80 a, floatx80 b )
	3668	{
	3669	flag aSign, bSign;
	3670
	3671	if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
	3672	&& (bits64) ( extractFloatx80Frac( a )<<1 ) )
	3673	\|\| ( ( extractFloatx80Exp( b ) == 0x7FFF )
	3674	&& (bits64) ( extractFloatx80Frac( b )<<1 ) )
	3675	) {
	3676	float_raise( float_flag_invalid );
	3677	return 0;
	3678	}
	3679	aSign = extractFloatx80Sign( a );
	3680	bSign = extractFloatx80Sign( b );
	3681	if ( aSign != bSign ) {
	3682	return
	3683	aSign
	3684	\|\| ( ( ( (bits16) ( ( a.high \| b.high )<<1 ) ) \| a.low \| b.low )
	3685	== 0 );
	3686	}
	3687	return
	3688	aSign ? le128( b.high, b.low, a.high, a.low )
	3689	: le128( a.high, a.low, b.high, b.low );
	3690
	3691	}
	3692
	3693	/*----------------------------------------------------------------------------
	3694	\| Returns 1 if the extended double-precision floating-point value `a' is
	3695	\| less than the corresponding value `b', and 0 otherwise. The comparison
	3696	\| is performed according to the IEC/IEEE Standard for Binary Floating-Point
	3697	\| Arithmetic.
	3698	----------------------------------------------------------------------------/
	3699
	3700	flag floatx80_lt( floatx80 a, floatx80 b )
	3701	{
	3702	flag aSign, bSign;
	3703
	3704	if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
	3705	&& (bits64) ( extractFloatx80Frac( a )<<1 ) )
	3706	\|\| ( ( extractFloatx80Exp( b ) == 0x7FFF )
	3707	&& (bits64) ( extractFloatx80Frac( b )<<1 ) )
	3708	) {
	3709	float_raise( float_flag_invalid );
	3710	return 0;
	3711	}
	3712	aSign = extractFloatx80Sign( a );
	3713	bSign = extractFloatx80Sign( b );
	3714	if ( aSign != bSign ) {
	3715	return
	3716	aSign
	3717	&& ( ( ( (bits16) ( ( a.high \| b.high )<<1 ) ) \| a.low \| b.low )
	3718	!= 0 );
	3719	}
	3720	return
	3721	aSign ? lt128( b.high, b.low, a.high, a.low )
	3722	: lt128( a.high, a.low, b.high, b.low );
	3723
	3724	}
	3725
	3726	/*----------------------------------------------------------------------------
	3727	\| Returns 1 if the extended double-precision floating-point value `a' is equal
	3728	\| to the corresponding value `b', and 0 otherwise. The invalid exception is
	3729	\| raised if either operand is a NaN. Otherwise, the comparison is performed
	3730	\| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
	3731	----------------------------------------------------------------------------/
	3732
	3733	flag floatx80_eq_signaling( floatx80 a, floatx80 b )
	3734	{
	3735	if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
	3736	&& (bits64) ( extractFloatx80Frac( a )<<1 ) )
	3737	\|\| ( ( extractFloatx80Exp( b ) == 0x7FFF )
	3738	&& (bits64) ( extractFloatx80Frac( b )<<1 ) )
	3739	) {
	3740	float_raise( float_flag_invalid );
	3741	return 0;
	3742	}
	3743	return
	3744	( a.low == b.low )
	3745	&& ( ( a.high == b.high )
	3746	\|\| ( ( a.low == 0 )
	3747	&& ( (bits16) ( ( a.high \| b.high )<<1 ) == 0 ) )
	3748	);
	3749
	3750	}
	3751
	3752	/*----------------------------------------------------------------------------
	3753	\| Returns 1 if the extended double-precision floating-point value `a' is less
	3754	\| than or equal to the corresponding value `b', and 0 otherwise. Quiet NaNs
	3755	\| do not cause an exception. Otherwise, the comparison is performed according
	3756	\| to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
	3757	----------------------------------------------------------------------------/
	3758
	3759	flag floatx80_le_quiet( floatx80 a, floatx80 b )
	3760	{
	3761	flag aSign, bSign;
	3762
	3763	if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
	3764	&& (bits64) ( extractFloatx80Frac( a )<<1 ) )
	3765	\|\| ( ( extractFloatx80Exp( b ) == 0x7FFF )
	3766	&& (bits64) ( extractFloatx80Frac( b )<<1 ) )
	3767	) {
	3768	if ( floatx80_is_signaling_nan( a )
	3769	\|\| floatx80_is_signaling_nan( b ) ) {
	3770	float_raise( float_flag_invalid );
	3771	}
	3772	return 0;
	3773	}
	3774	aSign = extractFloatx80Sign( a );
	3775	bSign = extractFloatx80Sign( b );
	3776	if ( aSign != bSign ) {
	3777	return
	3778	aSign
	3779	\|\| ( ( ( (bits16) ( ( a.high \| b.high )<<1 ) ) \| a.low \| b.low )
	3780	== 0 );
	3781	}
	3782	return
	3783	aSign ? le128( b.high, b.low, a.high, a.low )
	3784	: le128( a.high, a.low, b.high, b.low );
	3785
	3786	}
	3787
	3788	/*----------------------------------------------------------------------------
	3789	\| Returns 1 if the extended double-precision floating-point value `a' is less
	3790	\| than the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause
	3791	\| an exception. Otherwise, the comparison is performed according to the
	3792	\| IEC/IEEE Standard for Binary Floating-Point Arithmetic.
	3793	----------------------------------------------------------------------------/
	3794
	3795	flag floatx80_lt_quiet( floatx80 a, floatx80 b )
	3796	{
	3797	flag aSign, bSign;
	3798
	3799	if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
	3800	&& (bits64) ( extractFloatx80Frac( a )<<1 ) )
	3801	\|\| ( ( extractFloatx80Exp( b ) == 0x7FFF )
	3802	&& (bits64) ( extractFloatx80Frac( b )<<1 ) )
	3803	) {
	3804	if ( floatx80_is_signaling_nan( a )
	3805	\|\| floatx80_is_signaling_nan( b ) ) {
	3806	float_raise( float_flag_invalid );
	3807	}
	3808	return 0;
	3809	}
	3810	aSign = extractFloatx80Sign( a );
	3811	bSign = extractFloatx80Sign( b );
	3812	if ( aSign != bSign ) {
	3813	return
	3814	aSign
	3815	&& ( ( ( (bits16) ( ( a.high \| b.high )<<1 ) ) \| a.low \| b.low )
	3816	!= 0 );
	3817	}
	3818	return
	3819	aSign ? lt128( b.high, b.low, a.high, a.low )
	3820	: lt128( a.high, a.low, b.high, b.low );
	3821
	3822	}
	3823
	3824	#endif
	3825
	3826	#ifdef FLOAT128
	3827
	3828	/*----------------------------------------------------------------------------
	3829	\| Returns the result of converting the quadruple-precision floating-point
	3830	\| value `a' to the 32-bit two's complement integer format. The conversion
	3831	\| is performed according to the IEC/IEEE Standard for Binary Floating-Point
	3832	\| Arithmetic---which means in particular that the conversion is rounded
	3833	\| according to the current rounding mode. If `a' is a NaN, the largest
	3834	\| positive integer is returned. Otherwise, if the conversion overflows, the
	3835	\| largest integer with the same sign as `a' is returned.
	3836	----------------------------------------------------------------------------/
	3837
	3838	int32 float128_to_int32( float128 a )
	3839	{
	3840	flag aSign;
	3841	int32 aExp, shiftCount;
	3842	bits64 aSig0, aSig1;
	3843
	3844	aSig1 = extractFloat128Frac1( a );
	3845	aSig0 = extractFloat128Frac0( a );
	3846	aExp = extractFloat128Exp( a );
	3847	aSign = extractFloat128Sign( a );
	3848	if ( ( aExp == 0x7FFF ) && ( aSig0 \| aSig1 ) ) aSign = 0;
	3849	if ( aExp ) aSig0 \|= LIT64( 0x0001000000000000 );
	3850	aSig0 \|= ( aSig1 != 0 );
	3851	shiftCount = 0x4028 - aExp;
	3852	if ( 0 < shiftCount ) shift64RightJamming( aSig0, shiftCount, &aSig0 );
	3853	return roundAndPackInt32( aSign, aSig0 );
	3854
	3855	}
	3856
	3857	/*----------------------------------------------------------------------------
	3858	\| Returns the result of converting the quadruple-precision floating-point
	3859	\| value `a' to the 32-bit two's complement integer format. The conversion
	3860	\| is performed according to the IEC/IEEE Standard for Binary Floating-Point
	3861	\| Arithmetic, except that the conversion is always rounded toward zero. If
	3862	\| `a' is a NaN, the largest positive integer is returned. Otherwise, if the
	3863	\| conversion overflows, the largest integer with the same sign as `a' is
	3864	\| returned.
	3865	----------------------------------------------------------------------------/
	3866
	3867	int32 float128_to_int32_round_to_zero( float128 a )
	3868	{
	3869	flag aSign;
	3870	int32 aExp, shiftCount;
	3871	bits64 aSig0, aSig1, savedASig;
	3872	int32 z;
	3873
	3874	aSig1 = extractFloat128Frac1( a );
	3875	aSig0 = extractFloat128Frac0( a );
	3876	aExp = extractFloat128Exp( a );
	3877	aSign = extractFloat128Sign( a );
	3878	aSig0 \|= ( aSig1 != 0 );
	3879	if ( 0x401E < aExp ) {
	3880	if ( ( aExp == 0x7FFF ) && aSig0 ) aSign = 0;
	3881	goto invalid;
	3882	}
	3883	else if ( aExp < 0x3FFF ) {
	3884	if ( aExp \|\| aSig0 ) float_exception_flags \|= float_flag_inexact;
	3885	return 0;
	3886	}
	3887	aSig0 \|= LIT64( 0x0001000000000000 );
	3888	shiftCount = 0x402F - aExp;
	3889	savedASig = aSig0;
	3890	aSig0 >>= shiftCount;
	3891	z = aSig0;
	3892	if ( aSign ) z = - z;
	3893	if ( ( z < 0 ) ^ aSign ) {
	3894	invalid:
	3895	float_raise( float_flag_invalid );
	3896	return aSign ? (sbits32) 0x80000000 : 0x7FFFFFFF;
	3897	}
	3898	if ( ( aSig0<<shiftCount ) != savedASig ) {
	3899	float_exception_flags \|= float_flag_inexact;
	3900	}
	3901	return z;
	3902
	3903	}
	3904
	3905	/*----------------------------------------------------------------------------
	3906	\| Returns the result of converting the quadruple-precision floating-point
	3907	\| value `a' to the 64-bit two's complement integer format. The conversion
	3908	\| is performed according to the IEC/IEEE Standard for Binary Floating-Point
	3909	\| Arithmetic---which means in particular that the conversion is rounded
	3910	\| according to the current rounding mode. If `a' is a NaN, the largest
	3911	\| positive integer is returned. Otherwise, if the conversion overflows, the
	3912	\| largest integer with the same sign as `a' is returned.
	3913	----------------------------------------------------------------------------/
	3914
	3915	int64 float128_to_int64( float128 a )
	3916	{
	3917	flag aSign;
	3918	int32 aExp, shiftCount;
	3919	bits64 aSig0, aSig1;
	3920
	3921	aSig1 = extractFloat128Frac1( a );
	3922	aSig0 = extractFloat128Frac0( a );
	3923	aExp = extractFloat128Exp( a );
	3924	aSign = extractFloat128Sign( a );
	3925	if ( aExp ) aSig0 \|= LIT64( 0x0001000000000000 );
	3926	shiftCount = 0x402F - aExp;
	3927	if ( shiftCount <= 0 ) {
	3928	if ( 0x403E < aExp ) {
	3929	float_raise( float_flag_invalid );
	3930	if ( ! aSign
	3931	\|\| ( ( aExp == 0x7FFF )
	3932	&& ( aSig1 \|\| ( aSig0 != LIT64( 0x0001000000000000 ) ) )
	3933	)
	3934	) {
	3935	return LIT64( 0x7FFFFFFFFFFFFFFF );
	3936	}
	3937	return (sbits64) LIT64( 0x8000000000000000 );
	3938	}
	3939	shortShift128Left( aSig0, aSig1, - shiftCount, &aSig0, &aSig1 );
	3940	}
	3941	else {
	3942	shift64ExtraRightJamming( aSig0, aSig1, shiftCount, &aSig0, &aSig1 );
	3943	}
	3944	return roundAndPackInt64( aSign, aSig0, aSig1 );
	3945
	3946	}
	3947
	3948	/*----------------------------------------------------------------------------
	3949	\| Returns the result of converting the quadruple-precision floating-point
	3950	\| value `a' to the 64-bit two's complement integer format. The conversion
	3951	\| is performed according to the IEC/IEEE Standard for Binary Floating-Point
	3952	\| Arithmetic, except that the conversion is always rounded toward zero.
	3953	\| If `a' is a NaN, the largest positive integer is returned. Otherwise, if
	3954	\| the conversion overflows, the largest integer with the same sign as `a' is
	3955	\| returned.
	3956	----------------------------------------------------------------------------/
	3957
	3958	int64 float128_to_int64_round_to_zero( float128 a )
	3959	{
	3960	flag aSign;
	3961	int32 aExp, shiftCount;
	3962	bits64 aSig0, aSig1;
	3963	int64 z;
	3964
	3965	aSig1 = extractFloat128Frac1( a );
	3966	aSig0 = extractFloat128Frac0( a );
	3967	aExp = extractFloat128Exp( a );
	3968	aSign = extractFloat128Sign( a );
	3969	if ( aExp ) aSig0 \|= LIT64( 0x0001000000000000 );
	3970	shiftCount = aExp - 0x402F;
	3971	if ( 0 < shiftCount ) {
	3972	if ( 0x403E <= aExp ) {
	3973	aSig0 &= LIT64( 0x0000FFFFFFFFFFFF );
	3974	if ( ( a.high == LIT64( 0xC03E000000000000 ) )
	3975	&& ( aSig1 < LIT64( 0x0002000000000000 ) ) ) {
	3976	if ( aSig1 ) float_exception_flags \|= float_flag_inexact;
	3977	}
	3978	else {
	3979	float_raise( float_flag_invalid );
	3980	if ( ! aSign \|\| ( ( aExp == 0x7FFF ) && ( aSig0 \| aSig1 ) ) ) {
	3981	return LIT64( 0x7FFFFFFFFFFFFFFF );
	3982	}
	3983	}
	3984	return (sbits64) LIT64( 0x8000000000000000 );
	3985	}
	3986	z = ( aSig0<<shiftCount ) \| ( aSig1>>( ( - shiftCount ) & 63 ) );
	3987	if ( (bits64) ( aSig1<<shiftCount ) ) {
	3988	float_exception_flags \|= float_flag_inexact;
	3989	}
	3990	}
	3991	else {
	3992	if ( aExp < 0x3FFF ) {
	3993	if ( aExp \| aSig0 \| aSig1 ) {
	3994	float_exception_flags \|= float_flag_inexact;
	3995	}
	3996	return 0;
	3997	}
	3998	z = aSig0>>( - shiftCount );
	3999	if ( aSig1
	4000	\|\| ( shiftCount && (bits64) ( aSig0<<( shiftCount & 63 ) ) ) ) {
	4001	float_exception_flags \|= float_flag_inexact;
	4002	}
	4003	}
	4004	if ( aSign ) z = - z;
	4005	return z;
	4006
	4007	}
	4008
	4009	/*----------------------------------------------------------------------------
	4010	\| Returns the result of converting the quadruple-precision floating-point
	4011	\| value `a' to the single-precision floating-point format. The conversion
	4012	\| is performed according to the IEC/IEEE Standard for Binary Floating-Point
	4013	\| Arithmetic.
	4014	----------------------------------------------------------------------------/
	4015
	4016	float32 float128_to_float32( float128 a )
	4017	{
	4018	flag aSign;
	4019	int32 aExp;
	4020	bits64 aSig0, aSig1;
	4021	bits32 zSig;
	4022
	4023	aSig1 = extractFloat128Frac1( a );
	4024	aSig0 = extractFloat128Frac0( a );
	4025	aExp = extractFloat128Exp( a );
	4026	aSign = extractFloat128Sign( a );
	4027	if ( aExp == 0x7FFF ) {
	4028	if ( aSig0 \| aSig1 ) {
	4029	return commonNaNToFloat32( float128ToCommonNaN( a ) );
	4030	}
	4031	return packFloat32( aSign, 0xFF, 0 );
	4032	}
	4033	aSig0 \|= ( aSig1 != 0 );
	4034	shift64RightJamming( aSig0, 18, &aSig0 );
	4035	zSig = aSig0;
	4036	if ( aExp \|\| zSig ) {
	4037	zSig \|= 0x40000000;
	4038	aExp -= 0x3F81;
	4039	}
	4040	return roundAndPackFloat32( aSign, aExp, zSig );
	4041
	4042	}
	4043
	4044	/*----------------------------------------------------------------------------
	4045	\| Returns the result of converting the quadruple-precision floating-point
	4046	\| value `a' to the double-precision floating-point format. The conversion
	4047	\| is performed according to the IEC/IEEE Standard for Binary Floating-Point
	4048	\| Arithmetic.
	4049	----------------------------------------------------------------------------/
	4050
	4051	float64 float128_to_float64( float128 a )
	4052	{
	4053	flag aSign;
	4054	int32 aExp;
	4055	bits64 aSig0, aSig1;
	4056
	4057	aSig1 = extractFloat128Frac1( a );
	4058	aSig0 = extractFloat128Frac0( a );
	4059	aExp = extractFloat128Exp( a );
	4060	aSign = extractFloat128Sign( a );
	4061	if ( aExp == 0x7FFF ) {
	4062	if ( aSig0 \| aSig1 ) {
	4063	return commonNaNToFloat64( float128ToCommonNaN( a ) );
	4064	}
	4065	return packFloat64( aSign, 0x7FF, 0 );
	4066	}
	4067	shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );
	4068	aSig0 \|= ( aSig1 != 0 );
	4069	if ( aExp \|\| aSig0 ) {
	4070	aSig0 \|= LIT64( 0x4000000000000000 );
	4071	aExp -= 0x3C01;
	4072	}
	4073	return roundAndPackFloat64( aSign, aExp, aSig0 );
	4074
	4075	}
	4076
	4077	#ifdef FLOATX80
	4078
	4079	/*----------------------------------------------------------------------------
	4080	\| Returns the result of converting the quadruple-precision floating-point
	4081	\| value `a' to the extended double-precision floating-point format. The
	4082	\| conversion is performed according to the IEC/IEEE Standard for Binary
	4083	\| Floating-Point Arithmetic.
	4084	----------------------------------------------------------------------------/
	4085
	4086	floatx80 float128_to_floatx80( float128 a )
	4087	{
	4088	flag aSign;
	4089	int32 aExp;
	4090	bits64 aSig0, aSig1;
	4091
	4092	aSig1 = extractFloat128Frac1( a );
	4093	aSig0 = extractFloat128Frac0( a );
	4094	aExp = extractFloat128Exp( a );
	4095	aSign = extractFloat128Sign( a );
	4096	if ( aExp == 0x7FFF ) {
	4097	if ( aSig0 \| aSig1 ) {
	4098	return commonNaNToFloatx80( float128ToCommonNaN( a ) );
	4099	}
	4100	return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
	4101	}
	4102	if ( aExp == 0 ) {
	4103	if ( ( aSig0 \| aSig1 ) == 0 ) return packFloatx80( aSign, 0, 0 );
	4104	normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
	4105	}
	4106	else {
	4107	aSig0 \|= LIT64( 0x0001000000000000 );
	4108	}
	4109	shortShift128Left( aSig0, aSig1, 15, &aSig0, &aSig1 );
	4110	return roundAndPackFloatx80( 80, aSign, aExp, aSig0, aSig1 );
	4111
	4112	}
	4113
	4114	#endif
	4115
	4116	/*----------------------------------------------------------------------------
	4117	\| Rounds the quadruple-precision floating-point value `a' to an integer, and
	4118	\| returns the result as a quadruple-precision floating-point value. The
	4119	\| operation is performed according to the IEC/IEEE Standard for Binary
	4120	\| Floating-Point Arithmetic.
	4121	----------------------------------------------------------------------------/
	4122
	4123	float128 float128_round_to_int( float128 a )
	4124	{
	4125	flag aSign;
	4126	int32 aExp;
	4127	bits64 lastBitMask, roundBitsMask;
	4128	int8 roundingMode;
	4129	float128 z;
	4130
	4131	aExp = extractFloat128Exp( a );
	4132	if ( 0x402F <= aExp ) {
	4133	if ( 0x406F <= aExp ) {
	4134	if ( ( aExp == 0x7FFF )
	4135	&& ( extractFloat128Frac0( a ) \| extractFloat128Frac1( a ) )
	4136	) {
	4137	return propagateFloat128NaN( a, a );
	4138	}
	4139	return a;
	4140	}
	4141	lastBitMask = 1;
	4142	lastBitMask = ( lastBitMask<<( 0x406E - aExp ) )<<1;
	4143	roundBitsMask = lastBitMask - 1;
	4144	z = a;
	4145	roundingMode = float_rounding_mode;
	4146	if ( roundingMode == float_round_nearest_even ) {
	4147	if ( lastBitMask ) {
	4148	add128( z.high, z.low, 0, lastBitMask>>1, &z.high, &z.low );
	4149	if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask;
	4150	}
	4151	else {
	4152	if ( (sbits64) z.low < 0 ) {
	4153	++z.high;
	4154	if ( (bits64) ( z.low<<1 ) == 0 ) z.high &= ~1;
	4155	}
	4156	}
	4157	}
	4158	else if ( roundingMode != float_round_to_zero ) {
	4159	if ( extractFloat128Sign( z )
	4160	^ ( roundingMode == float_round_up ) ) {
	4161	add128( z.high, z.low, 0, roundBitsMask, &z.high, &z.low );
	4162	}
	4163	}
	4164	z.low &= ~ roundBitsMask;
	4165	}
	4166	else {
	4167	if ( aExp < 0x3FFF ) {
	4168	if ( ( ( (bits64) ( a.high<<1 ) ) \| a.low ) == 0 ) return a;
	4169	float_exception_flags \|= float_flag_inexact;
	4170	aSign = extractFloat128Sign( a );
	4171	switch ( float_rounding_mode ) {
	4172	case float_round_nearest_even:
	4173	if ( ( aExp == 0x3FFE )
	4174	&& ( extractFloat128Frac0( a )
	4175	\| extractFloat128Frac1( a ) )
	4176	) {
	4177	return packFloat128( aSign, 0x3FFF, 0, 0 );
	4178	}
	4179	break;
	4180	case float_round_down:
	4181	return
	4182	aSign ? packFloat128( 1, 0x3FFF, 0, 0 )
	4183	: packFloat128( 0, 0, 0, 0 );
	4184	case float_round_up:
	4185	return
	4186	aSign ? packFloat128( 1, 0, 0, 0 )
	4187	: packFloat128( 0, 0x3FFF, 0, 0 );
	4188	}
	4189	return packFloat128( aSign, 0, 0, 0 );
	4190	}
	4191	lastBitMask = 1;
	4192	lastBitMask <<= 0x402F - aExp;
	4193	roundBitsMask = lastBitMask - 1;
	4194	z.low = 0;
	4195	z.high = a.high;
	4196	roundingMode = float_rounding_mode;
	4197	if ( roundingMode == float_round_nearest_even ) {
	4198	z.high += lastBitMask>>1;
	4199	if ( ( ( z.high & roundBitsMask ) \| a.low ) == 0 ) {
	4200	z.high &= ~ lastBitMask;
	4201	}
	4202	}
	4203	else if ( roundingMode != float_round_to_zero ) {
	4204	if ( extractFloat128Sign( z )
	4205	^ ( roundingMode == float_round_up ) ) {
	4206	z.high \|= ( a.low != 0 );
	4207	z.high += roundBitsMask;
	4208	}
	4209	}
	4210	z.high &= ~ roundBitsMask;
	4211	}
	4212	if ( ( z.low != a.low ) \|\| ( z.high != a.high ) ) {
	4213	float_exception_flags \|= float_flag_inexact;
	4214	}
	4215	return z;
	4216
	4217	}
	4218
	4219	/*----------------------------------------------------------------------------
	4220	\| Returns the result of adding the absolute values of the quadruple-precision
	4221	\| floating-point values `a' and `b'. If `zSign' is 1, the sum is negated
	4222	\| before being returned. `zSign' is ignored if the result is a NaN.
	4223	\| The addition is performed according to the IEC/IEEE Standard for Binary
	4224	\| Floating-Point Arithmetic.
	4225	----------------------------------------------------------------------------/
	4226
	4227	static float128 addFloat128Sigs( float128 a, float128 b, flag zSign )
	4228	{
	4229	int32 aExp, bExp, zExp;
	4230	bits64 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
	4231	int32 expDiff;
	4232
	4233	aSig1 = extractFloat128Frac1( a );
	4234	aSig0 = extractFloat128Frac0( a );
	4235	aExp = extractFloat128Exp( a );
	4236	bSig1 = extractFloat128Frac1( b );
	4237	bSig0 = extractFloat128Frac0( b );
	4238	bExp = extractFloat128Exp( b );
	4239	expDiff = aExp - bExp;
	4240	if ( 0 < expDiff ) {
	4241	if ( aExp == 0x7FFF ) {
	4242	if ( aSig0 \| aSig1 ) return propagateFloat128NaN( a, b );
	4243	return a;
	4244	}
	4245	if ( bExp == 0 ) {
	4246	--expDiff;
	4247	}
	4248	else {
	4249	bSig0 \|= LIT64( 0x0001000000000000 );
	4250	}
	4251	shift128ExtraRightJamming(
	4252	bSig0, bSig1, 0, expDiff, &bSig0, &bSig1, &zSig2 );
	4253	zExp = aExp;
	4254	}
	4255	else if ( expDiff < 0 ) {
	4256	if ( bExp == 0x7FFF ) {
	4257	if ( bSig0 \| bSig1 ) return propagateFloat128NaN( a, b );
	4258	return packFloat128( zSign, 0x7FFF, 0, 0 );
	4259	}
	4260	if ( aExp == 0 ) {
	4261	++expDiff;
	4262	}
	4263	else {
	4264	aSig0 \|= LIT64( 0x0001000000000000 );
	4265	}
	4266	shift128ExtraRightJamming(
	4267	aSig0, aSig1, 0, - expDiff, &aSig0, &aSig1, &zSig2 );
	4268	zExp = bExp;
	4269	}
	4270	else {
	4271	if ( aExp == 0x7FFF ) {
	4272	if ( aSig0 \| aSig1 \| bSig0 \| bSig1 ) {
	4273	return propagateFloat128NaN( a, b );
	4274	}
	4275	return a;
	4276	}
	4277	add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
	4278	if ( aExp == 0 ) return packFloat128( zSign, 0, zSig0, zSig1 );
	4279	zSig2 = 0;
	4280	zSig0 \|= LIT64( 0x0002000000000000 );
	4281	zExp = aExp;
	4282	goto shiftRight1;
	4283	}
	4284	aSig0 \|= LIT64( 0x0001000000000000 );
	4285	add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
	4286	--zExp;
	4287	if ( zSig0 < LIT64( 0x0002000000000000 ) ) goto roundAndPack;
	4288	++zExp;
	4289	shiftRight1:
	4290	shift128ExtraRightJamming(
	4291	zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
	4292	roundAndPack:
	4293	return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 );
	4294
	4295	}
	4296
	4297	/*----------------------------------------------------------------------------
	4298	\| Returns the result of subtracting the absolute values of the quadruple-
	4299	\| precision floating-point values `a' and `b'. If `zSign' is 1, the
	4300	\| difference is negated before being returned. `zSign' is ignored if the
	4301	\| result is a NaN. The subtraction is performed according to the IEC/IEEE
	4302	\| Standard for Binary Floating-Point Arithmetic.
	4303	----------------------------------------------------------------------------/
	4304
	4305	static float128 subFloat128Sigs( float128 a, float128 b, flag zSign )
	4306	{
	4307	int32 aExp, bExp, zExp;
	4308	bits64 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1;
	4309	int32 expDiff;
	4310	float128 z;
	4311
	4312	aSig1 = extractFloat128Frac1( a );
	4313	aSig0 = extractFloat128Frac0( a );
	4314	aExp = extractFloat128Exp( a );
	4315	bSig1 = extractFloat128Frac1( b );
	4316	bSig0 = extractFloat128Frac0( b );
	4317	bExp = extractFloat128Exp( b );
	4318	expDiff = aExp - bExp;
	4319	shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );
	4320	shortShift128Left( bSig0, bSig1, 14, &bSig0, &bSig1 );
	4321	if ( 0 < expDiff ) goto aExpBigger;
	4322	if ( expDiff < 0 ) goto bExpBigger;
	4323	if ( aExp == 0x7FFF ) {
	4324	if ( aSig0 \| aSig1 \| bSig0 \| bSig1 ) {
	4325	return propagateFloat128NaN( a, b );
	4326	}
	4327	float_raise( float_flag_invalid );
	4328	z.low = float128_default_nan_low;
	4329	z.high = float128_default_nan_high;
	4330	return z;
	4331	}
	4332	if ( aExp == 0 ) {
	4333	aExp = 1;
	4334	bExp = 1;
	4335	}
	4336	if ( bSig0 < aSig0 ) goto aBigger;
	4337	if ( aSig0 < bSig0 ) goto bBigger;
	4338	if ( bSig1 < aSig1 ) goto aBigger;
	4339	if ( aSig1 < bSig1 ) goto bBigger;
	4340	return packFloat128( float_rounding_mode == float_round_down, 0, 0, 0 );
	4341	bExpBigger:
	4342	if ( bExp == 0x7FFF ) {
	4343	if ( bSig0 \| bSig1 ) return propagateFloat128NaN( a, b );
	4344	return packFloat128( zSign ^ 1, 0x7FFF, 0, 0 );
	4345	}
	4346	if ( aExp == 0 ) {
	4347	++expDiff;
	4348	}
	4349	else {
	4350	aSig0 \|= LIT64( 0x4000000000000000 );
	4351	}
	4352	shift128RightJamming( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
	4353	bSig0 \|= LIT64( 0x4000000000000000 );
	4354	bBigger:
	4355	sub128( bSig0, bSig1, aSig0, aSig1, &zSig0, &zSig1 );
	4356	zExp = bExp;
	4357	zSign ^= 1;
	4358	goto normalizeRoundAndPack;
	4359	aExpBigger:
	4360	if ( aExp == 0x7FFF ) {
	4361	if ( aSig0 \| aSig1 ) return propagateFloat128NaN( a, b );
	4362	return a;
	4363	}
	4364	if ( bExp == 0 ) {
	4365	--expDiff;
	4366	}
	4367	else {
	4368	bSig0 \|= LIT64( 0x4000000000000000 );
	4369	}
	4370	shift128RightJamming( bSig0, bSig1, expDiff, &bSig0, &bSig1 );
	4371	aSig0 \|= LIT64( 0x4000000000000000 );
	4372	aBigger:
	4373	sub128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
	4374	zExp = aExp;
	4375	normalizeRoundAndPack:
	4376	--zExp;
	4377	return normalizeRoundAndPackFloat128( zSign, zExp - 14, zSig0, zSig1 );
	4378
	4379	}
	4380
	4381	/*----------------------------------------------------------------------------
	4382	\| Returns the result of adding the quadruple-precision floating-point values
	4383	\| `a' and `b'. The operation is performed according to the IEC/IEEE Standard
	4384	\| for Binary Floating-Point Arithmetic.
	4385	----------------------------------------------------------------------------/
	4386
	4387	float128 float128_add( float128 a, float128 b )
	4388	{
	4389	flag aSign, bSign;
	4390
	4391	aSign = extractFloat128Sign( a );
	4392	bSign = extractFloat128Sign( b );
	4393	if ( aSign == bSign ) {
	4394	return addFloat128Sigs( a, b, aSign );
	4395	}
	4396	else {
	4397	return subFloat128Sigs( a, b, aSign );
	4398	}
	4399
	4400	}
	4401
	4402	/*----------------------------------------------------------------------------
	4403	\| Returns the result of subtracting the quadruple-precision floating-point
	4404	\| values `a' and `b'. The operation is performed according to the IEC/IEEE
	4405	\| Standard for Binary Floating-Point Arithmetic.
	4406	----------------------------------------------------------------------------/
	4407
	4408	float128 float128_sub( float128 a, float128 b )
	4409	{
	4410	flag aSign, bSign;
	4411
	4412	aSign = extractFloat128Sign( a );
	4413	bSign = extractFloat128Sign( b );
	4414	if ( aSign == bSign ) {
	4415	return subFloat128Sigs( a, b, aSign );
	4416	}
	4417	else {
	4418	return addFloat128Sigs( a, b, aSign );
	4419	}
	4420
	4421	}
	4422
	4423	/*----------------------------------------------------------------------------
	4424	\| Returns the result of multiplying the quadruple-precision floating-point
	4425	\| values `a' and `b'. The operation is performed according to the IEC/IEEE
	4426	\| Standard for Binary Floating-Point Arithmetic.
	4427	----------------------------------------------------------------------------/
	4428
	4429	float128 float128_mul( float128 a, float128 b )
	4430	{
	4431	flag aSign, bSign, zSign;
	4432	int32 aExp, bExp, zExp;
	4433	bits64 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2, zSig3;
	4434	float128 z;
	4435
	4436	aSig1 = extractFloat128Frac1( a ); // unpack both operands into sign, exponent and two fraction words
	4437	aSig0 = extractFloat128Frac0( a );
	4438	aExp = extractFloat128Exp( a );
	4439	aSign = extractFloat128Sign( a );
	4440	bSig1 = extractFloat128Frac1( b );
	4441	bSig0 = extractFloat128Frac0( b );
	4442	bExp = extractFloat128Exp( b );
	4443	bSign = extractFloat128Sign( b );
	4444	zSign = aSign ^ bSign; // sign of the product is the XOR of the operand signs
	4445	if ( aExp == 0x7FFF ) { // a has the all-ones exponent: infinity or NaN
	4446	if ( ( aSig0 \| aSig1 ) // nonzero fraction: a is a NaN
	4447	\|\| ( ( bExp == 0x7FFF ) && ( bSig0 \| bSig1 ) ) ) { // or b is a NaN
	4448	return propagateFloat128NaN( a, b );
	4449	}
	4450	if ( ( bExp \| bSig0 \| bSig1 ) == 0 ) goto invalid; // infinity times zero
	4451	return packFloat128( zSign, 0x7FFF, 0, 0 ); // infinity times nonzero: signed infinity
	4452	}
	4453	if ( bExp == 0x7FFF ) { // same special cases with the roles of a and b swapped
	4454	if ( bSig0 \| bSig1 ) return propagateFloat128NaN( a, b );
	4455	if ( ( aExp \| aSig0 \| aSig1 ) == 0 ) { // zero times infinity
	4456	invalid: // shared exit: raise invalid and return the default NaN
	4457	float_raise( float_flag_invalid );
	4458	z.low = float128_default_nan_low;
	4459	z.high = float128_default_nan_high;
	4460	return z;
	4461	}
	4462	return packFloat128( zSign, 0x7FFF, 0, 0 );
	4463	}
	4464	if ( aExp == 0 ) { // a is zero or subnormal
	4465	if ( ( aSig0 \| aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 ); // zero operand: signed zero
	4466	normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
	4467	}
	4468	if ( bExp == 0 ) { // b is zero or subnormal
	4469	if ( ( bSig0 \| bSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
	4470	normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
	4471	}
	4472	zExp = aExp + bExp - 0x4000;
	4473	aSig0 \|= LIT64( 0x0001000000000000 ); // set bit 48 of the high word of a (the leading one bit)
	4474	shortShift128Left( bSig0, bSig1, 16, &bSig0, &bSig1 ); // b is shifted without its leading bit, which would land at 2^128
	4475	mul128To256( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1, &zSig2, &zSig3 );
	4476	add128( zSig0, zSig1, aSig0, aSig1, &zSig0, &zSig1 ); // adds aSig * 2^128, i.e. the contribution of the omitted leading bit of b
	4477	zSig2 \|= ( zSig3 != 0 ); // fold the lowest word into a sticky bit
	4478	if ( LIT64( 0x0002000000000000 ) <= zSig0 ) { // product reached bit 49: shift back one place
	4479	shift128ExtraRightJamming(
	4480	zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
	4481	++zExp;
	4482	}
	4483	return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 );
	4484
	4485	}
	4486
	4487	/*----------------------------------------------------------------------------
	4488	\| Returns the result of dividing the quadruple-precision floating-point value
	4489	\| `a' by the corresponding value `b'. The operation is performed according to
	4490	\| the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
	4491	\*----------------------------------------------------------------------------\*/
	4492
	4493	float128 float128_div( float128 a, float128 b )
	4494	{
	4495	flag aSign, bSign, zSign;
	4496	int32 aExp, bExp, zExp;
	4497	bits64 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
	4498	bits64 rem0, rem1, rem2, rem3, term0, term1, term2, term3;
	4499	float128 z;
	4500
	4501	aSig1 = extractFloat128Frac1( a ); // unpack both operands into sign, exponent and two fraction words
	4502	aSig0 = extractFloat128Frac0( a );
	4503	aExp = extractFloat128Exp( a );
	4504	aSign = extractFloat128Sign( a );
	4505	bSig1 = extractFloat128Frac1( b );
	4506	bSig0 = extractFloat128Frac0( b );
	4507	bExp = extractFloat128Exp( b );
	4508	bSign = extractFloat128Sign( b );
	4509	zSign = aSign ^ bSign; // sign of the quotient is the XOR of the operand signs
	4510	if ( aExp == 0x7FFF ) { // a is infinity or NaN
	4511	if ( aSig0 \| aSig1 ) return propagateFloat128NaN( a, b );
	4512	if ( bExp == 0x7FFF ) {
	4513	if ( bSig0 \| bSig1 ) return propagateFloat128NaN( a, b );
	4514	goto invalid; // infinity divided by infinity
	4515	}
	4516	return packFloat128( zSign, 0x7FFF, 0, 0 ); // infinity divided by a finite value
	4517	}
	4518	if ( bExp == 0x7FFF ) { // b is infinity or NaN, a is finite
	4519	if ( bSig0 \| bSig1 ) return propagateFloat128NaN( a, b );
	4520	return packFloat128( zSign, 0, 0, 0 ); // finite divided by infinity: signed zero
	4521	}
	4522	if ( bExp == 0 ) { // b is zero or subnormal
	4523	if ( ( bSig0 \| bSig1 ) == 0 ) {
	4524	if ( ( aExp \| aSig0 \| aSig1 ) == 0 ) { // zero divided by zero
	4525	invalid: // shared exit: raise invalid and return the default NaN
	4526	float_raise( float_flag_invalid );
	4527	z.low = float128_default_nan_low;
	4528	z.high = float128_default_nan_high;
	4529	return z;
	4530	}
	4531	float_raise( float_flag_divbyzero ); // nonzero divided by zero
	4532	return packFloat128( zSign, 0x7FFF, 0, 0 );
	4533	}
	4534	normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
	4535	}
	4536	if ( aExp == 0 ) { // a is zero or subnormal
	4537	if ( ( aSig0 \| aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
	4538	normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
	4539	}
	4540	zExp = aExp - bExp + 0x3FFD;
	4541	shortShift128Left( // both significands get bit 48 set and are then shifted left by 15
	4542	aSig0 \| LIT64( 0x0001000000000000 ), aSig1, 15, &aSig0, &aSig1 );
	4543	shortShift128Left(
	4544	bSig0 \| LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 );
	4545	if ( le128( bSig0, bSig1, aSig0, aSig1 ) ) { // bSig <= aSig: halve aSig and compensate in the exponent
	4546	shift128Right( aSig0, aSig1, 1, &aSig0, &aSig1 );
	4547	++zExp;
	4548	}
	4549	zSig0 = estimateDiv128To64( aSig0, aSig1, bSig0 ); // first 64 quotient bits, as an estimate
	4550	mul128By64To192( bSig0, bSig1, zSig0, &term0, &term1, &term2 );
	4551	sub192( aSig0, aSig1, 0, term0, term1, term2, &rem0, &rem1, &rem2 );
	4552	while ( (sbits64) rem0 < 0 ) { // negative remainder: estimate was too large, step it down and add bSig back
	4553	--zSig0;
	4554	add192( rem0, rem1, rem2, 0, bSig0, bSig1, &rem0, &rem1, &rem2 );
	4555	}
	4556	zSig1 = estimateDiv128To64( rem1, rem2, bSig0 ); // next 64 quotient bits from the remainder
	4557	if ( ( zSig1 & 0x3FFF ) <= 4 ) { // exact correction only runs when the low 14 bits are at most 4
	4558	mul128By64To192( bSig0, bSig1, zSig1, &term1, &term2, &term3 );
	4559	sub192( rem1, rem2, 0, term1, term2, term3, &rem1, &rem2, &rem3 );
	4560	while ( (sbits64) rem1 < 0 ) {
	4561	--zSig1;
	4562	add192( rem1, rem2, rem3, 0, bSig0, bSig1, &rem1, &rem2, &rem3 );
	4563	}
	4564	zSig1 \|= ( ( rem1 \| rem2 \| rem3 ) != 0 ); // any nonzero remainder sets the lowest bit as a sticky bit
	4565	}
	4566	shift128ExtraRightJamming( zSig0, zSig1, 0, 15, &zSig0, &zSig1, &zSig2 ); // undo the 15-bit left shift, keeping shifted-out bits in zSig2
	4567	return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 );
	4568
	4569	}
	4570
	4571	/*----------------------------------------------------------------------------
	4572	\| Returns the remainder of the quadruple-precision floating-point value `a'
	4573	\| with respect to the corresponding value `b'. The operation is performed
	4574	\| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
	4575	\*----------------------------------------------------------------------------\*/
	4576
	4577	float128 float128_rem( float128 a, float128 b )
	4578	{
	4579	flag aSign, zSign;
	4580	int32 aExp, bExp, expDiff;
	4581	bits64 aSig0, aSig1, bSig0, bSig1, q, term0, term1, term2;
	4582	bits64 allZero, alternateASig0, alternateASig1, sigMean1;
	4583	sbits64 sigMean0;
	4584	float128 z;
	4585
	4586	aSig1 = extractFloat128Frac1( a ); // unpack a fully; only fraction and exponent of b are used
	4587	aSig0 = extractFloat128Frac0( a );
	4588	aExp = extractFloat128Exp( a );
	4589	aSign = extractFloat128Sign( a );
	4590	bSig1 = extractFloat128Frac1( b );
	4591	bSig0 = extractFloat128Frac0( b );
	4592	bExp = extractFloat128Exp( b );
	4593	// bSign = extractFloat128Sign( b );
	4594	if ( aExp == 0x7FFF ) { // a is infinity or NaN
	4595	if ( ( aSig0 \| aSig1 )
	4596	\|\| ( ( bExp == 0x7FFF ) && ( bSig0 \| bSig1 ) ) ) {
	4597	return propagateFloat128NaN( a, b );
	4598	}
	4599	goto invalid; // remainder of an infinite a
	4600	}
	4601	if ( bExp == 0x7FFF ) { // b is infinity or NaN, a is finite
	4602	if ( bSig0 \| bSig1 ) return propagateFloat128NaN( a, b );
	4603	return a; // finite a with infinite b: a is returned unchanged
	4604	}
	4605	if ( bExp == 0 ) { // b is zero or subnormal
	4606	if ( ( bSig0 \| bSig1 ) == 0 ) { // remainder with respect to zero
	4607	invalid: // shared exit: raise invalid and return the default NaN
	4608	float_raise( float_flag_invalid );
	4609	z.low = float128_default_nan_low;
	4610	z.high = float128_default_nan_high;
	4611	return z;
	4612	}
	4613	normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
	4614	}
	4615	if ( aExp == 0 ) { // a is zero or subnormal
	4616	if ( ( aSig0 \| aSig1 ) == 0 ) return a; // zero a is returned unchanged
	4617	normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
	4618	}
	4619	expDiff = aExp - bExp;
	4620	if ( expDiff < -1 ) return a; // a is returned unchanged when its exponent is more than one below that of b
	4621	shortShift128Left( // a is shifted by 15, or by 14 when expDiff is -1
	4622	aSig0 \| LIT64( 0x0001000000000000 ),
	4623	aSig1,
	4624	15 - ( expDiff < 0 ),
	4625	&aSig0,
	4626	&aSig1
	4627	);
	4628	shortShift128Left(
	4629	bSig0 \| LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 );
	4630	q = le128( bSig0, bSig1, aSig0, aSig1 ); // first quotient bit: 1 when bSig <= aSig
	4631	if ( q ) sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
	4632	expDiff -= 64;
	4633	while ( 0 < expDiff ) { // each pass consumes 61 bits of exponent difference
	4634	q = estimateDiv128To64( aSig0, aSig1, bSig0 );
	4635	q = ( 4 < q ) ? q - 4 : 0; // back the estimate off by 4, clamped at 0 - presumably so it cannot overshoot
	4636	mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
	4637	shortShift192Left( term0, term1, term2, 61, &term1, &term2, &allZero );
	4638	shortShift128Left( aSig0, aSig1, 61, &aSig0, &allZero );
	4639	sub128( aSig0, 0, term1, term2, &aSig0, &aSig1 );
	4640	expDiff -= 61;
	4641	}
	4642	if ( -64 < expDiff ) { // fewer than 64 quotient bits remain: one final partial step
	4643	q = estimateDiv128To64( aSig0, aSig1, bSig0 );
	4644	q = ( 4 < q ) ? q - 4 : 0;
	4645	q >>= - expDiff; // keep only the quotient bits that are still needed
	4646	shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
	4647	expDiff += 52;
	4648	if ( expDiff < 0 ) {
	4649	shift128Right( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
	4650	}
	4651	else {
	4652	shortShift128Left( aSig0, aSig1, expDiff, &aSig0, &aSig1 );
	4653	}
	4654	mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
	4655	sub128( aSig0, aSig1, term1, term2, &aSig0, &aSig1 );
	4656	}
	4657	else { // no partial step: just shift both significands right by 12
	4658	shift128Right( aSig0, aSig1, 12, &aSig0, &aSig1 );
	4659	shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
	4660	}
	4661	do { // subtract bSig until the remainder turns negative, keeping the previous value
	4662	alternateASig0 = aSig0;
	4663	alternateASig1 = aSig1;
	4664	++q;
	4665	sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
	4666	} while ( 0 <= (sbits64) aSig0 );
	4667	add128( // sum of the negative candidate and the last non-negative candidate
	4668	aSig0, aSig1, alternateASig0, alternateASig1, (bits64 *)&sigMean0, &sigMean1 );
	4669	if ( ( sigMean0 < 0 ) // negative sum: the non-negative candidate is smaller in magnitude
	4670	\|\| ( ( ( sigMean0 \| sigMean1 ) == 0 ) && ( q & 1 ) ) ) { // exact tie: decided by the parity of q
	4671	aSig0 = alternateASig0;
	4672	aSig1 = alternateASig1;
	4673	}
	4674	zSign = ( (sbits64) aSig0 < 0 );
	4675	if ( zSign ) sub128( 0, 0, aSig0, aSig1, &aSig0, &aSig1 ); // negate a negative remainder to get its magnitude
	4676	return
	4677	normalizeRoundAndPackFloat128( aSign ^ zSign, bExp - 4, aSig0, aSig1 ); // sign of a, flipped when the chosen remainder was negative
	4678
	4679	}
	4680
	4681	/*----------------------------------------------------------------------------
	4682	\| Returns the square root of the quadruple-precision floating-point value `a'.
	4683	\| The operation is performed according to the IEC/IEEE Standard for Binary
	4684	\| Floating-Point Arithmetic.
	4685	\*----------------------------------------------------------------------------\*/
	4686
	4687	float128 float128_sqrt( float128 a )
	4688	{
	4689	flag aSign;
	4690	int32 aExp, zExp;
	4691	bits64 aSig0, aSig1, zSig0, zSig1, zSig2, doubleZSig0;
	4692	bits64 rem0, rem1, rem2, rem3, term0, term1, term2, term3;
	4693	float128 z;
	4694
	4695	aSig1 = extractFloat128Frac1( a );
	4696	aSig0 = extractFloat128Frac0( a );
	4697	aExp = extractFloat128Exp( a );
	4698	aSign = extractFloat128Sign( a );
	4699	if ( aExp == 0x7FFF ) { // a is infinity or NaN
	4700	if ( aSig0 \| aSig1 ) return propagateFloat128NaN( a, a ); // NaN: a is passed for both operands
	4701	if ( ! aSign ) return a; // positive infinity is returned unchanged
	4702	goto invalid; // negative infinity
	4703	}
	4704	if ( aSign ) { // negative finite input
	4705	if ( ( aExp \| aSig0 \| aSig1 ) == 0 ) return a; // negative zero is returned unchanged
	4706	invalid: // shared exit: raise invalid and return the default NaN
	4707	float_raise( float_flag_invalid );
	4708	z.low = float128_default_nan_low;
	4709	z.high = float128_default_nan_high;
	4710	return z;
	4711	}
	4712	if ( aExp == 0 ) { // a is zero or subnormal
	4713	if ( ( aSig0 \| aSig1 ) == 0 ) return packFloat128( 0, 0, 0, 0 ); // positive zero
	4714	normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
	4715	}
	4716	zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFE; // halve aExp - 0x3FFF, then add 0x3FFE
	4717	aSig0 \|= LIT64( 0x0001000000000000 ); // set bit 48 of the high word (the leading one bit)
	4718	zSig0 = estimateSqrt32( aExp, aSig0>>17 ); // initial estimate from the top significand bits
	4719	shortShift128Left( aSig0, aSig1, 13 - ( aExp & 1 ), &aSig0, &aSig1 ); // shift by 13, or by 12 when aExp is odd
	4720	zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 ); // refine the estimate to 64 bits
	4721	doubleZSig0 = zSig0<<1;
	4722	mul64To128( zSig0, zSig0, &term0, &term1 );
	4723	sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 ); // remainder = aSig - zSig0 squared
	4724	while ( (sbits64) rem0 < 0 ) { // negative remainder: estimate was too large, step it down
	4725	--zSig0;
	4726	doubleZSig0 -= 2;
	4727	add128( rem0, rem1, zSig0>>63, doubleZSig0 \| 1, &rem0, &rem1 ); // add back 2 * zSig0 + 1 for the decremented zSig0
	4728	}
	4729	zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 ); // next 64 result bits, as an estimate
	4730	if ( ( zSig1 & 0x1FFF ) <= 5 ) { // exact correction only runs when the low 13 bits are at most 5
	4731	if ( zSig1 == 0 ) zSig1 = 1;
	4732	mul64To128( doubleZSig0, zSig1, &term1, &term2 );
	4733	sub128( rem1, 0, term1, term2, &rem1, &rem2 );
	4734	mul64To128( zSig1, zSig1, &term2, &term3 );
	4735	sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
	4736	while ( (sbits64) rem1 < 0 ) { // negative remainder: step zSig1 down and add the difference back
	4737	--zSig1;
	4738	shortShift128Left( 0, zSig1, 1, &term2, &term3 );
	4739	term3 \|= 1;
	4740	term2 \|= doubleZSig0;
	4741	add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
	4742	}
	4743	zSig1 \|= ( ( rem1 \| rem2 \| rem3 ) != 0 ); // any nonzero remainder sets the lowest bit as a sticky bit
	4744	}
	4745	shift128ExtraRightJamming( zSig0, zSig1, 0, 14, &zSig0, &zSig1, &zSig2 ); // shift right by 14, keeping shifted-out bits in zSig2
	4746	return roundAndPackFloat128( 0, zExp, zSig0, zSig1, zSig2 ); // sign argument is always 0 on this path
	4747
	4748	}
	4749
	4750	/*----------------------------------------------------------------------------
	4751	\| Returns 1 if the quadruple-precision floating-point value `a' is equal to
	4752	\| the corresponding value `b', and 0 otherwise. The comparison is performed
	4753	\| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
	4754	\*----------------------------------------------------------------------------\*/
	4755
	4756	flag float128_eq( float128 a, float128 b )
	4757	{
	4758	if ( ( ( extractFloat128Exp( a ) == 0x7FFF ) // a is a NaN: all-ones exponent with a nonzero fraction
	4759	&& ( extractFloat128Frac0( a ) \| extractFloat128Frac1( a ) ) )
	4760	\|\| ( ( extractFloat128Exp( b ) == 0x7FFF ) // or b is a NaN
	4761	&& ( extractFloat128Frac0( b ) \| extractFloat128Frac1( b ) ) )
	4762	) {
	4763	if ( float128_is_signaling_nan( a ) // only a signaling NaN raises invalid here
	4764	\|\| float128_is_signaling_nan( b ) ) {
	4765	float_raise( float_flag_invalid );
	4766	}
	4767	return 0; // any NaN operand compares unequal
	4768	}
	4769	return
	4770	( a.low == b.low )
	4771	&& ( ( a.high == b.high ) // bitwise identical
	4772	\|\| ( ( a.low == 0 ) // or both are zeros: everything except the sign bit is clear
	4773	&& ( (bits64) ( ( a.high \| b.high )<<1 ) == 0 ) )
	4774	);
	4775
	4776	}
	4777
	4778	/*----------------------------------------------------------------------------
	4779	\| Returns 1 if the quadruple-precision floating-point value `a' is less than
	4780	\| or equal to the corresponding value `b', and 0 otherwise. The comparison
	4781	\| is performed according to the IEC/IEEE Standard for Binary Floating-Point
	4782	\| Arithmetic.
	4783	\*----------------------------------------------------------------------------\*/
	4784
	4785	flag float128_le( float128 a, float128 b )
	4786	{
	4787	flag aSign, bSign;
	4788
	4789	if ( ( ( extractFloat128Exp( a ) == 0x7FFF ) // a is a NaN: all-ones exponent with a nonzero fraction
	4790	&& ( extractFloat128Frac0( a ) \| extractFloat128Frac1( a ) ) )
	4791	\|\| ( ( extractFloat128Exp( b ) == 0x7FFF ) // or b is a NaN
	4792	&& ( extractFloat128Frac0( b ) \| extractFloat128Frac1( b ) ) )
	4793	) {
	4794	float_raise( float_flag_invalid ); // any NaN operand raises invalid in this variant
	4795	return 0;
	4796	}
	4797	aSign = extractFloat128Sign( a );
	4798	bSign = extractFloat128Sign( b );
	4799	if ( aSign != bSign ) { // signs differ
	4800	return
	4801	aSign // true when a is the negative one
	4802	\|\| ( ( ( (bits64) ( ( a.high \| b.high )<<1 ) ) \| a.low \| b.low ) // or when both are zeros of opposite sign
	4803	== 0 );
	4804	}
	4805	return // same sign: compare the raw 128-bit patterns, operands swapped when negative
	4806	aSign ? le128( b.high, b.low, a.high, a.low )
	4807	: le128( a.high, a.low, b.high, b.low );
	4808
	4809	}
	4810
	4811	/*----------------------------------------------------------------------------
	4812	\| Returns 1 if the quadruple-precision floating-point value `a' is less than
	4813	\| the corresponding value `b', and 0 otherwise. The comparison is performed
	4814	\| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
	4815	\*----------------------------------------------------------------------------\*/
	4816
	4817	flag float128_lt( float128 a, float128 b )
	4818	{
	4819	flag aSign, bSign;
	4820
	4821	if ( ( ( extractFloat128Exp( a ) == 0x7FFF ) // a is a NaN: all-ones exponent with a nonzero fraction
	4822	&& ( extractFloat128Frac0( a ) \| extractFloat128Frac1( a ) ) )
	4823	\|\| ( ( extractFloat128Exp( b ) == 0x7FFF ) // or b is a NaN
	4824	&& ( extractFloat128Frac0( b ) \| extractFloat128Frac1( b ) ) )
	4825	) {
	4826	float_raise( float_flag_invalid ); // any NaN operand raises invalid in this variant
	4827	return 0;
	4828	}
	4829	aSign = extractFloat128Sign( a );
	4830	bSign = extractFloat128Sign( b );
	4831	if ( aSign != bSign ) { // signs differ
	4832	return
	4833	aSign // a must be the negative one
	4834	&& ( ( ( (bits64) ( ( a.high \| b.high )<<1 ) ) \| a.low \| b.low ) // and the operands must not both be zeros
	4835	!= 0 );
	4836	}
	4837	return // same sign: compare the raw 128-bit patterns, operands swapped when negative
	4838	aSign ? lt128( b.high, b.low, a.high, a.low )
	4839	: lt128( a.high, a.low, b.high, b.low );
	4840
	4841	}
	4842
	4843	/*----------------------------------------------------------------------------
	4844	\| Returns 1 if the quadruple-precision floating-point value `a' is equal to
	4845	\| the corresponding value `b', and 0 otherwise. The invalid exception is
	4846	\| raised if either operand is a NaN. Otherwise, the comparison is performed
	4847	\| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
	4848	\*----------------------------------------------------------------------------\*/
	4849
	4850	flag float128_eq_signaling( float128 a, float128 b )
	4851	{
	4852	if ( ( ( extractFloat128Exp( a ) == 0x7FFF ) // a is a NaN: all-ones exponent with a nonzero fraction
	4853	&& ( extractFloat128Frac0( a ) \| extractFloat128Frac1( a ) ) )
	4854	\|\| ( ( extractFloat128Exp( b ) == 0x7FFF ) // or b is a NaN
	4855	&& ( extractFloat128Frac0( b ) \| extractFloat128Frac1( b ) ) )
	4856	) {
	4857	float_raise( float_flag_invalid ); // unlike float128_eq, every NaN raises invalid, quiet or signaling
	4858	return 0;
	4859	}
	4860	return
	4861	( a.low == b.low )
	4862	&& ( ( a.high == b.high ) // bitwise identical
	4863	\|\| ( ( a.low == 0 ) // or both are zeros: everything except the sign bit is clear
	4864	&& ( (bits64) ( ( a.high \| b.high )<<1 ) == 0 ) )
	4865	);
	4866
	4867	}
	4868
	4869	/*----------------------------------------------------------------------------
	4870	\| Returns 1 if the quadruple-precision floating-point value `a' is less than
	4871	\| or equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
	4872	\| cause an exception. Otherwise, the comparison is performed according to the
	4873	\| IEC/IEEE Standard for Binary Floating-Point Arithmetic.
	4874	\*----------------------------------------------------------------------------\*/
	4875
	4876	flag float128_le_quiet( float128 a, float128 b )
	4877	{
	4878	flag aSign, bSign;
	4879
	4880	if ( ( ( extractFloat128Exp( a ) == 0x7FFF ) // a is a NaN: all-ones exponent with a nonzero fraction
	4881	&& ( extractFloat128Frac0( a ) \| extractFloat128Frac1( a ) ) )
	4882	\|\| ( ( extractFloat128Exp( b ) == 0x7FFF ) // or b is a NaN
	4883	&& ( extractFloat128Frac0( b ) \| extractFloat128Frac1( b ) ) )
	4884	) {
	4885	if ( float128_is_signaling_nan( a ) // only a signaling NaN raises invalid in this variant
	4886	\|\| float128_is_signaling_nan( b ) ) {
	4887	float_raise( float_flag_invalid );
	4888	}
	4889	return 0;
	4890	}
	4891	aSign = extractFloat128Sign( a );
	4892	bSign = extractFloat128Sign( b );
	4893	if ( aSign != bSign ) { // signs differ
	4894	return
	4895	aSign // true when a is the negative one
	4896	\|\| ( ( ( (bits64) ( ( a.high \| b.high )<<1 ) ) \| a.low \| b.low ) // or when both are zeros of opposite sign
	4897	== 0 );
	4898	}
	4899	return // same sign: compare the raw 128-bit patterns, operands swapped when negative
	4900	aSign ? le128( b.high, b.low, a.high, a.low )
	4901	: le128( a.high, a.low, b.high, b.low );
	4902
	4903	}
	4904
	4905	/*----------------------------------------------------------------------------
	4906	\| Returns 1 if the quadruple-precision floating-point value `a' is less than
	4907	\| the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
	4908	\| exception. Otherwise, the comparison is performed according to the IEC/IEEE
	4909	\| Standard for Binary Floating-Point Arithmetic.
	4910	\*----------------------------------------------------------------------------\*/
	4911
	4912	flag float128_lt_quiet( float128 a, float128 b )
	4913	{
	4914	flag aSign, bSign;
	4915
	4916	if ( ( ( extractFloat128Exp( a ) == 0x7FFF ) // a is a NaN: all-ones exponent with a nonzero fraction
	4917	&& ( extractFloat128Frac0( a ) \| extractFloat128Frac1( a ) ) )
	4918	\|\| ( ( extractFloat128Exp( b ) == 0x7FFF ) // or b is a NaN
	4919	&& ( extractFloat128Frac0( b ) \| extractFloat128Frac1( b ) ) )
	4920	) {
	4921	if ( float128_is_signaling_nan( a ) // only a signaling NaN raises invalid in this variant
	4922	\|\| float128_is_signaling_nan( b ) ) {
	4923	float_raise( float_flag_invalid );
	4924	}
	4925	return 0;
	4926	}
	4927	aSign = extractFloat128Sign( a );
	4928	bSign = extractFloat128Sign( b );
	4929	if ( aSign != bSign ) { // signs differ
	4930	return
	4931	aSign // a must be the negative one
	4932	&& ( ( ( (bits64) ( ( a.high \| b.high )<<1 ) ) \| a.low \| b.low ) // and the operands must not both be zeros
	4933	!= 0 );
	4934	}
	4935	return // same sign: compare the raw 128-bit patterns, operands swapped when negative
	4936	aSign ? lt128( b.high, b.low, a.high, a.low )
	4937	: lt128( a.high, a.low, b.high, b.low );
	4938
	4939	}
	4940
	4941	#endif

trunk/3rdparty/softfloat/softfloat.h
r0	r242847
	1
	2	/*============================================================================
	3
	4	This C header file is part of the SoftFloat IEC/IEEE Floating-point Arithmetic
	5	Package, Release 2b.
	6
	7	Written by John R. Hauser. This work was made possible in part by the
	8	International Computer Science Institute, located at Suite 600, 1947 Center
	9	Street, Berkeley, California 94704. Funding was partially provided by the
	10	National Science Foundation under grant MIP-9311980. The original version
	11	of this code was written as part of a project to build a fixed-point vector
	12	processor in collaboration with the University of California at Berkeley,
	13	overseen by Profs. Nelson Morgan and John Wawrzynek. More information
	14	is available through the Web page `http://www.cs.berkeley.edu/~jhauser/
	15	arithmetic/SoftFloat.html'.
	16
	17	THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE. Although reasonable effort has
	18	been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT TIMES
	19	RESULT IN INCORRECT BEHAVIOR. USE OF THIS SOFTWARE IS RESTRICTED TO PERSONS
	20	AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ALL LOSSES,
	21	COSTS, OR OTHER PROBLEMS THEY INCUR DUE TO THE SOFTWARE, AND WHO FURTHERMORE
	22	EFFECTIVELY INDEMNIFY JOHN HAUSER AND THE INTERNATIONAL COMPUTER SCIENCE
	23	INSTITUTE (possibly via similar legal warning) AGAINST ALL LOSSES, COSTS, OR
	24	OTHER PROBLEMS INCURRED BY THEIR CUSTOMERS AND CLIENTS DUE TO THE SOFTWARE.
	25
	26	Derivative works are acceptable, even for commercial purposes, so long as
	27	(1) the source code for the derivative work includes prominent notice that
	28	the work is derivative, and (2) the source code includes prominent notice with
	29	these four paragraphs for those parts of this code that are retained.
	30
	31	=============================================================================*/
	32
	33	/*----------------------------------------------------------------------------
	34	\| The macro `FLOATX80' must be defined to enable the extended double-precision
	35	\| floating-point format `floatx80'. If this macro is not defined, the
	36	\| `floatx80' type will not be defined, and none of the functions that either
	37	\| input or output the `floatx80' type will be defined. The same applies to
	38	\| the `FLOAT128' macro and the quadruple-precision format `float128'.
	39	\*----------------------------------------------------------------------------\*/
	40	#define FLOATX80 // defined unconditionally, so every FLOATX80-guarded declaration below is enabled
	41	#define FLOAT128 // likewise enables every FLOAT128-guarded declaration below
	42
	43	/*----------------------------------------------------------------------------
	44	\| Software IEC/IEEE floating-point types.
	45	\*----------------------------------------------------------------------------\*/
	46	typedef bits32 float32; // single precision is carried as a plain bits32
	47	typedef bits64 float64; // double precision is carried as a plain bits64
	48	#ifdef FLOATX80
	49	typedef struct {
	50	bits16 high; // packFloatx80 stores the sign (bit 15) plus the exponent here
	51	bits64 low; // packFloatx80 stores the significand here
	52	} floatx80;
	53	#endif
	54	#ifdef FLOAT128
	55	typedef struct {
	56	bits64 high, low; // packFloat128 puts sign, exponent and zSig0 in high, zSig1 in low
	57	} float128;
	58	#endif
	59
	60	/*----------------------------------------------------------------------------
	61	\| Primitive arithmetic functions, including multi-word arithmetic, and
	62	\| division and square root approximations. (Can be specialized to target if
	63	\| desired.)
	64	\*----------------------------------------------------------------------------\*/
	65	#include "softfloat-macros" // included after the typedefs above; the file name has no extension
	66
	67	/*----------------------------------------------------------------------------
	68	\| Software IEC/IEEE floating-point underflow tininess-detection mode.
	69	\*----------------------------------------------------------------------------\*/
	70	extern int8 float_detect_tininess; // declared here, defined elsewhere; presumably holds one of the two values below
	71	enum {
	72	float_tininess_after_rounding = 0,
	73	float_tininess_before_rounding = 1
	74	};
	75
	76	/*----------------------------------------------------------------------------
	77	\| Software IEC/IEEE floating-point rounding mode.
	78	\*----------------------------------------------------------------------------\*/
	79	extern int8 float_rounding_mode; // declared here, defined elsewhere; presumably holds one of the four values below
	80	enum {
	81	float_round_nearest_even = 0,
	82	float_round_to_zero = 1,
	83	float_round_down = 2,
	84	float_round_up = 3
	85	};
	86
	87	/*----------------------------------------------------------------------------
	88	\| Software IEC/IEEE floating-point exception flags.
	89	\*----------------------------------------------------------------------------\*/
	90	extern int8 float_exception_flags; // declared here, defined elsewhere
	91	enum { // each flag is a distinct single bit, so several can be combined in one int8
	92	float_flag_invalid = 0x01, float_flag_denormal = 0x02, float_flag_divbyzero = 0x04, float_flag_overflow = 0x08,
	93	float_flag_underflow = 0x10, float_flag_inexact = 0x20
	94	};
	95
	96	/*----------------------------------------------------------------------------
	97	\| Routine to raise any or all of the software IEC/IEEE floating-point
	98	\| exception flags.
	99	\*----------------------------------------------------------------------------\*/
	100	void float_raise( int8 ); // callers in softfloat.c pass float_flag_* constants, e.g. float_flag_invalid
	101
	102	/*----------------------------------------------------------------------------
	103	\| Software IEC/IEEE integer-to-floating-point conversion routines.
	104	\*----------------------------------------------------------------------------\*/
	105	float32 int32_to_float32( int32 ); // names follow source_to_destination; parameter and return types match the name
	106	float64 int32_to_float64( int32 );
	107	#ifdef FLOATX80
	108	floatx80 int32_to_floatx80( int32 ); // only declared when FLOATX80 is defined
	109	#endif
	110	#ifdef FLOAT128
	111	float128 int32_to_float128( int32 ); // only declared when FLOAT128 is defined
	112	#endif
	113	float32 int64_to_float32( int64 ); // same set again for a 64-bit integer source
	114	float64 int64_to_float64( int64 );
	115	#ifdef FLOATX80
	116	floatx80 int64_to_floatx80( int64 );
	117	#endif
	118	#ifdef FLOAT128
	119	float128 int64_to_float128( int64 );
	120	#endif
	121
	122	/*----------------------------------------------------------------------------
	123	\| Software IEC/IEEE single-precision conversion routines.
	124	\*----------------------------------------------------------------------------\*/
	125	int32 float32_to_int32( float32 ); // each integer conversion has a second _round_to_zero variant
	126	int32 float32_to_int32_round_to_zero( float32 );
	127	int64 float32_to_int64( float32 );
	128	int64 float32_to_int64_round_to_zero( float32 );
	129	float64 float32_to_float64( float32 );
	130	#ifdef FLOATX80
	131	floatx80 float32_to_floatx80( float32 ); // only declared when FLOATX80 is defined
	132	#endif
	133	#ifdef FLOAT128
	134	float128 float32_to_float128( float32 ); // only declared when FLOAT128 is defined
	135	#endif
	136
	137	/*----------------------------------------------------------------------------
	138	\| Software IEC/IEEE single-precision operations.
	139	\*----------------------------------------------------------------------------\*/
	140	float32 float32_round_to_int( float32 ); // result stays in float32 format
	141	float32 float32_add( float32, float32 ); // arithmetic: operands and result are all float32
	142	float32 float32_sub( float32, float32 );
	143	float32 float32_mul( float32, float32 );
	144	float32 float32_div( float32, float32 );
	145	float32 float32_rem( float32, float32 );
	146	float32 float32_sqrt( float32 );
	147	flag float32_eq( float32, float32 ); // comparisons return a flag
	148	flag float32_le( float32, float32 );
	149	flag float32_lt( float32, float32 );
	150	flag float32_eq_signaling( float32, float32 ); // same parameter list as the float128 variants defined in softfloat.c
	151	flag float32_le_quiet( float32, float32 );
	152	flag float32_lt_quiet( float32, float32 );
	153	flag float32_is_signaling_nan( float32 );
	154
	155	/*----------------------------------------------------------------------------
	156	\| Software IEC/IEEE double-precision conversion routines.
	157	\*----------------------------------------------------------------------------\*/
	158	int32 float64_to_int32( float64 ); // each integer conversion has a second _round_to_zero variant
	159	int32 float64_to_int32_round_to_zero( float64 );
	160	int64 float64_to_int64( float64 );
	161	int64 float64_to_int64_round_to_zero( float64 );
	162	float32 float64_to_float32( float64 );
	163	#ifdef FLOATX80
	164	floatx80 float64_to_floatx80( float64 ); // only declared when FLOATX80 is defined
	165	#endif
	166	#ifdef FLOAT128
	167	float128 float64_to_float128( float64 ); // only declared when FLOAT128 is defined
	168	#endif
	169
	170	/*----------------------------------------------------------------------------
	171	\| Software IEC/IEEE double-precision operations.
	172	\*----------------------------------------------------------------------------\*/
	173	float64 float64_round_to_int( float64 ); // result stays in float64 format
	174	float64 float64_add( float64, float64 ); // arithmetic: operands and result are all float64
	175	float64 float64_sub( float64, float64 );
	176	float64 float64_mul( float64, float64 );
	177	float64 float64_div( float64, float64 );
	178	float64 float64_rem( float64, float64 );
	179	float64 float64_sqrt( float64 );
	180	flag float64_eq( float64, float64 ); // comparisons return a flag
	181	flag float64_le( float64, float64 );
	182	flag float64_lt( float64, float64 );
	183	flag float64_eq_signaling( float64, float64 ); // same parameter list as the float128 variants defined in softfloat.c
	184	flag float64_le_quiet( float64, float64 );
	185	flag float64_lt_quiet( float64, float64 );
	186	flag float64_is_signaling_nan( float64 );
	187
	188	#ifdef FLOATX80
	189
	190	/*----------------------------------------------------------------------------
	191	\| Software IEC/IEEE extended double-precision conversion routines.
	192	\*----------------------------------------------------------------------------\*/
	193	int32 floatx80_to_int32( floatx80 ); // each integer conversion has a second _round_to_zero variant
	194	int32 floatx80_to_int32_round_to_zero( floatx80 );
	195	int64 floatx80_to_int64( floatx80 );
	196	int64 floatx80_to_int64_round_to_zero( floatx80 );
	197	float32 floatx80_to_float32( floatx80 );
	198	float64 floatx80_to_float64( floatx80 );
	199	#ifdef FLOAT128
	200	float128 floatx80_to_float128( floatx80 ); // needs both FLOATX80 (enclosing guard) and FLOAT128
	201	#endif
	202	floatx80 floatx80_scale(floatx80 a, floatx80 b); // sits outside the FLOAT128 guard; semantics are not visible in this header
	203
	204	/*----------------------------------------------------------------------------
	205	\| Packs the sign `zSign', exponent `zExp', and significand `zSig' into an
	206	\| extended double-precision floating-point value, returning the result.
	207	\*----------------------------------------------------------------------------\*/
	208
	209	INLINE floatx80 packFloatx80( flag zSign, int32 zExp, bits64 zSig )
	210	{
	211	floatx80 z;
	212
	213	z.low = zSig; // significand is stored unchanged in the 64-bit low field
	214	z.high = ( ( (bits16) zSign )<<15 ) + zExp; // sign goes to bit 15 of the 16-bit high field; zExp is added, not masked
	215	return z;
	216
	217	}
	218
	219	/*----------------------------------------------------------------------------
	220	\| Software IEC/IEEE extended double-precision rounding precision. Valid
	221	\| values are 32, 64, and 80.
	222	\*----------------------------------------------------------------------------\*/
	223	extern int8 floatx80_rounding_precision; // declared here, defined elsewhere
	224
	225	/*----------------------------------------------------------------------------
	226	\| Software IEC/IEEE extended double-precision operations.
	227	\*----------------------------------------------------------------------------\*/
	228	floatx80 floatx80_round_to_int( floatx80 ); // result stays in floatx80 format
	229	floatx80 floatx80_add( floatx80, floatx80 ); // arithmetic: operands and result are all floatx80
	230	floatx80 floatx80_sub( floatx80, floatx80 );
	231	floatx80 floatx80_mul( floatx80, floatx80 );
	232	floatx80 floatx80_div( floatx80, floatx80 );
	233	floatx80 floatx80_rem( floatx80, floatx80 );
	234	floatx80 floatx80_sqrt( floatx80 );
	235	flag floatx80_eq( floatx80, floatx80 ); // comparisons return a flag
	236	flag floatx80_le( floatx80, floatx80 );
	237	flag floatx80_lt( floatx80, floatx80 );
	238	flag floatx80_eq_signaling( floatx80, floatx80 );
	239	flag floatx80_le_quiet( floatx80, floatx80 );
	240	flag floatx80_lt_quiet( floatx80, floatx80 );
	241	flag floatx80_is_signaling_nan( floatx80 );
	242
	243	int floatx80_fsin(floatx80 &a); // parameter is a C++ reference, so this header needs a C++ compiler; meaning of the int result is not visible here
	244	int floatx80_fcos(floatx80 &a);
	245	int floatx80_ftan(floatx80 &a);
	246
	247	floatx80 floatx80_flognp1(floatx80 a); // these four take and return floatx80 by value
	248	floatx80 floatx80_flogn(floatx80 a);
	249	floatx80 floatx80_flog2(floatx80 a);
	250	floatx80 floatx80_flog10(floatx80 a);
	251
	252	// roundAndPackFloatx80 used to be in softfloat-round-pack, is now in softfloat.c
	253	floatx80 roundAndPackFloatx80(int8 roundingPrecision, flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1);
	254
	255	#endif
	256
	257	#ifdef FLOAT128
	258
	259	/*----------------------------------------------------------------------------
	260	\| Software IEC/IEEE quadruple-precision conversion routines.
	261	\*----------------------------------------------------------------------------\*/
	262	int32 float128_to_int32( float128 ); // each integer conversion has a second _round_to_zero variant
	263	int32 float128_to_int32_round_to_zero( float128 );
	264	int64 float128_to_int64( float128 );
	265	int64 float128_to_int64_round_to_zero( float128 );
	266	float32 float128_to_float32( float128 );
	267	float64 float128_to_float64( float128 );
	268	#ifdef FLOATX80
	269	floatx80 float128_to_floatx80( float128 ); // needs both FLOAT128 (enclosing guard) and FLOATX80
	270	#endif
	271
	272	/*----------------------------------------------------------------------------
	273	\| Software IEC/IEEE quadruple-precision operations.
	274	\*----------------------------------------------------------------------------\*/
	275	float128 float128_round_to_int( float128 ); // result stays in float128 format
	276	float128 float128_add( float128, float128 ); // arithmetic: operands and result are all float128
	277	float128 float128_sub( float128, float128 );
	278	float128 float128_mul( float128, float128 );
	279	float128 float128_div( float128, float128 );
	280	float128 float128_rem( float128, float128 );
	281	float128 float128_sqrt( float128 );
	282	flag float128_eq( float128, float128 ); // raises invalid only for signaling NaNs (see softfloat.c)
	283	flag float128_le( float128, float128 ); // raises invalid for any NaN operand (see softfloat.c)
	284	flag float128_lt( float128, float128 ); // raises invalid for any NaN operand (see softfloat.c)
	285	flag float128_eq_signaling( float128, float128 ); // raises invalid for any NaN operand (see softfloat.c)
	286	flag float128_le_quiet( float128, float128 ); // raises invalid only for signaling NaNs (see softfloat.c)
	287	flag float128_lt_quiet( float128, float128 ); // raises invalid only for signaling NaNs (see softfloat.c)
	288	flag float128_is_signaling_nan( float128 );
	289
	290	/*----------------------------------------------------------------------------
	291	\| Packs the sign `zSign', the exponent `zExp', and the significand formed
	292	\| by the concatenation of `zSig0' and `zSig1' into a quadruple-precision
	293	\| floating-point value, returning the result. After being shifted into the
	294	\| proper positions, the three fields `zSign', `zExp', and `zSig0' are simply
	295	\| added together to form the most significant 64 bits of the result. This
	296	\| means that any integer portion of `zSig0' will be added into the exponent.
	297	\| Since a properly normalized significand will have an integer portion equal
	298	\| to 1, the `zExp' input should be 1 less than the desired result exponent
	299	\| whenever `zSig0' and `zSig1' concatenated form a complete, normalized
	300	\| significand.
	301	*----------------------------------------------------------------------------*/
	302
	303	INLINE float128
	304	packFloat128( flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1 )
	305	{
	306	float128 z;
	307
	308	z.low = zSig1;
	309	z.high = ( ( (bits64) zSign )<<63 ) + ( ( (bits64) zExp )<<48 ) + zSig0;
	310	return z;
	311
	312	}
	313
	314	/*----------------------------------------------------------------------------
	315	\| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
	316	\| and extended significand formed by the concatenation of `zSig0', `zSig1',
	317	\| and `zSig2', and returns the proper quadruple-precision floating-point value
	318	\| corresponding to the abstract input. Ordinarily, the abstract value is
	319	\| simply rounded and packed into the quadruple-precision format, with the
	320	\| inexact exception raised if the abstract input cannot be represented
	321	\| exactly. However, if the abstract value is too large, the overflow and
	322	\| inexact exceptions are raised and an infinity or maximal finite value is
	323	\| returned. If the abstract value is too small, the input value is rounded to
	324	\| a subnormal number, and the underflow and inexact exceptions are raised if
	325	\| the abstract input cannot be represented exactly as a subnormal quadruple-
	326	\| precision floating-point number.
	327	\| The input significand must be normalized or smaller. If the input
	328	\| significand is not normalized, `zExp' must be 0; in that case, the result
	329	\| returned is a subnormal number, and it must not require rounding. In the
	330	\| usual case that the input significand is normalized, `zExp' must be 1 less
	331	\| than the ``true'' floating-point exponent. The handling of underflow and
	332	\| overflow follows the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
	333	*----------------------------------------------------------------------------*/
	334
	335	INLINE float128
	336	roundAndPackFloat128(
	337	flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1, bits64 zSig2 )
	338	{
	339	int8 roundingMode;
	340	flag roundNearestEven, increment, isTiny;
	341
	342	roundingMode = float_rounding_mode;
	343	roundNearestEven = ( roundingMode == float_round_nearest_even );
	344	increment = ( (sbits64) zSig2 < 0 );
	345	if ( ! roundNearestEven ) {
	346	if ( roundingMode == float_round_to_zero ) {
	347	increment = 0;
	348	}
	349	else {
	350	if ( zSign ) {
	351	increment = ( roundingMode == float_round_down ) && zSig2;
	352	}
	353	else {
	354	increment = ( roundingMode == float_round_up ) && zSig2;
	355	}
	356	}
	357	}
	358	if ( 0x7FFD <= (bits32) zExp ) {
	359	if ( ( 0x7FFD < zExp )
	360	\|\| ( ( zExp == 0x7FFD )
	361	&& eq128(
	362	LIT64( 0x0001FFFFFFFFFFFF ),
	363	LIT64( 0xFFFFFFFFFFFFFFFF ),
	364	zSig0,
	365	zSig1
	366	)
	367	&& increment
	368	)
	369	) {
	370	float_raise( float_flag_overflow \| float_flag_inexact );
	371	if ( ( roundingMode == float_round_to_zero )
	372	\|\| ( zSign && ( roundingMode == float_round_up ) )
	373	\|\| ( ! zSign && ( roundingMode == float_round_down ) )
	374	) {
	375	return
	376	packFloat128(
	377	zSign,
	378	0x7FFE,
	379	LIT64( 0x0000FFFFFFFFFFFF ),
	380	LIT64( 0xFFFFFFFFFFFFFFFF )
	381	);
	382	}
	383	return packFloat128( zSign, 0x7FFF, 0, 0 );
	384	}
	385	if ( zExp < 0 ) {
	386	isTiny =
	387	( float_detect_tininess == float_tininess_before_rounding )
	388	\|\| ( zExp < -1 )
	389	\|\| ! increment
	390	\|\| lt128(
	391	zSig0,
	392	zSig1,
	393	LIT64( 0x0001FFFFFFFFFFFF ),
	394	LIT64( 0xFFFFFFFFFFFFFFFF )
	395	);
	396	shift128ExtraRightJamming(
	397	zSig0, zSig1, zSig2, - zExp, &zSig0, &zSig1, &zSig2 );
	398	zExp = 0;
	399	if ( isTiny && zSig2 ) float_raise( float_flag_underflow );
	400	if ( roundNearestEven ) {
	401	increment = ( (sbits64) zSig2 < 0 );
	402	}
	403	else {
	404	if ( zSign ) {
	405	increment = ( roundingMode == float_round_down ) && zSig2;
	406	}
	407	else {
	408	increment = ( roundingMode == float_round_up ) && zSig2;
	409	}
	410	}
	411	}
	412	}
	413	if ( zSig2 ) float_exception_flags \|= float_flag_inexact;
	414	if ( increment ) {
	415	add128( zSig0, zSig1, 0, 1, &zSig0, &zSig1 );
	416	zSig1 &= ~ ( ( zSig2 + zSig2 == 0 ) & roundNearestEven );
	417	}
	418	else {
	419	if ( ( zSig0 \| zSig1 ) == 0 ) zExp = 0;
	420	}
	421	return packFloat128( zSign, zExp, zSig0, zSig1 );
	422
	423	}
	424
	425	/*----------------------------------------------------------------------------
	426	\| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
	427	\| and significand formed by the concatenation of `zSig0' and `zSig1', and
	428	\| returns the proper quadruple-precision floating-point value corresponding
	429	\| to the abstract input. This routine is just like `roundAndPackFloat128'
	430	\| except that the input significand has fewer bits and does not have to be
	431	\| normalized. In all cases, `zExp' must be 1 less than the ``true'' floating-
	432	\| point exponent.
	433	*----------------------------------------------------------------------------*/
	434
	435	INLINE float128
	436	normalizeRoundAndPackFloat128(
	437	flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1 )
	438	{
	439	int8 shiftCount;
	440	bits64 zSig2;
	441
	442	if ( zSig0 == 0 ) {
	443	zSig0 = zSig1;
	444	zSig1 = 0;
	445	zExp -= 64;
	446	}
	447	shiftCount = countLeadingZeros64( zSig0 ) - 15;
	448	if ( 0 <= shiftCount ) {
	449	zSig2 = 0;
	450	shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
	451	}
	452	else {
	453	shift128ExtraRightJamming(
	454	zSig0, zSig1, 0, - shiftCount, &zSig0, &zSig1, &zSig2 );
	455	}
	456	zExp -= shiftCount;
	457	return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 );
	458
	459	}
	460	#endif

trunk/src/emu/cpu/i386/i386.h
r242846	r242847
5	5	#ifndef __I386INTF_H__
6	6	#define __I386INTF_H__
7	7
8		#include "../../../lib/softfloat/milieu.h"
9		#include "../../../lib/softfloat/softfloat.h"
	8	#include "../../../../3rdparty/softfloat/milieu.h"
	9	#include "../../../../3rdparty/softfloat/softfloat.h"
10	10	#include "debug/debugcpu.h"
11	11	#include "cpu/vtlb.h"
12	12

trunk/src/emu/cpu/m68000/m68000.h
r242846	r242847
6	6
7	7
8	8
9		#include "../../../lib/softfloat/milieu.h"
10		#include "../../../lib/softfloat/softfloat.h"
	9	#include "../../../../3rdparty/softfloat/milieu.h"
	10	#include "../../../../3rdparty/softfloat/softfloat.h"
11	11
12	12
13	13	/* MMU constants */

trunk/src/lib/lib.mak
r242846	r242847
281	281	# SoftFloat library objects
282	282	#-------------------------------------------------
283	283
284		PROCESSOR_H = $(LIBSRC)/softfloat/processors/mamesf.h
285		SOFTFLOAT_MACROS = $(LIBSRC)/softfloat/softfloat/bits64/softfloat-macros
	284	PROCESSOR_H = $(3RDPARTY)/softfloat/processors/mamesf.h
	285	SOFTFLOAT_MACROS = $(3RDPARTY)/softfloat/softfloat/bits64/softfloat-macros
286	286
287	287	SOFTFLOATOBJS = \
288	288	$(LIBOBJ)/softfloat/softfloat.o \
r242846	r242847
291	291
292	292	$(OBJ)/libsoftfloat.a: $(SOFTFLOATOBJS)
293	293
294		$(LIBOBJ)/softfloat/softfloat.o: $(LIBSRC)/softfloat/softfloat.c $(LIBSRC)/softfloat/softfloat.h $(LIBSRC)/softfloat/softfloat-macros $(LIBSRC)/softfloat/softfloat-specialize
295		$(LIBOBJ)/softfloat/fsincos.o: $(LIBSRC)/softfloat/fsincos.c $(LIBSRC)/softfloat/fpu_constant.h $(LIBSRC)/softfloat/softfloat.h $(LIBSRC)/softfloat/softfloat-macros $(LIBSRC)/softfloat/softfloat-specialize
	294	$(LIBOBJ)/softfloat/softfloat.o: $(3RDPARTY)/softfloat/softfloat.c $(3RDPARTY)/softfloat/softfloat.h $(3RDPARTY)/softfloat/softfloat-macros $(3RDPARTY)/softfloat/softfloat-specialize
	295	$(LIBOBJ)/softfloat/fsincos.o: $(3RDPARTY)/softfloat/fsincos.c $(3RDPARTY)/softfloat/fpu_constant.h $(3RDPARTY)/softfloat/softfloat.h $(3RDPARTY)/softfloat/softfloat-macros $(3RDPARTY)/softfloat/softfloat-specialize
296	296
	297	$(LIBOBJ)/softfloat/%.o: $(3RDPARTY)/softfloat/%.c \| $(OSPREBUILD)
	298	@echo Compiling $<...
	299	$(CC) $(CDEFS) $(CFLAGS) -c $< -o $@
297	300
298
299	301	#-------------------------------------------------
300	302	# libJPEG library objects
301	303	#-------------------------------------------------

https://github.com/mamedev/mame/commit/45ac9b351e0076c371de4c7453f59567b27b1cb6

199869 Revisions