
  1. @c Copyright @copyright{} 2022 Richard Stallman and Free Software Foundation, Inc.
  2. This is part of the GNU C Intro and Reference Manual
  3. and covered by its license.
  4. @node Floating Point in Depth
  5. @chapter Floating Point in Depth
  6. @menu
  7. * Floating Representations::
  8. * Floating Type Specs::
  9. * Special Float Values::
  10. * Invalid Optimizations::
  11. * Exception Flags::
  12. * Exact Floating-Point::
  13. * Rounding::
  14. * Rounding Issues::
  15. * Significance Loss::
  16. * Fused Multiply-Add::
  17. * Error Recovery::
  18. @c * Double-Rounding Problems::
  19. * Exact Floating Constants::
  20. * Handling Infinity::
  21. * Handling NaN::
  22. * Signed Zeros::
  23. * Scaling by the Base::
  24. * Rounding Control::
  25. * Machine Epsilon::
  26. * Complex Arithmetic::
  27. * Round-Trip Base Conversion::
  28. * Further Reading::
  29. @end menu
  30. @node Floating Representations
  31. @section Floating-Point Representations
  32. @cindex floating-point representations
  33. @cindex representation of floating-point numbers
  34. @cindex IEEE 754-2008 Standard
  35. Storing numbers as @dfn{floating point} allows representation of
  36. numbers with fractional values, in a range larger than that of
  37. hardware integers. A floating-point number consists of a sign bit, a
  38. @dfn{significand} (also called the @dfn{mantissa}), and a power of a
  39. fixed base. GNU C uses the floating-point representations specified by
  40. the @cite{IEEE 754-2008 Standard for Floating-Point Arithmetic}.
  41. The IEEE 754-2008 specification defines basic binary floating-point
  42. formats of five different sizes: 16-bit, 32-bit, 64-bit, 128-bit, and
  43. 256-bit. The formats of 32, 64, and 128 bits are used for the
  44. standard C types @code{float}, @code{double}, and @code{long double}.
  45. GNU C supports the 16-bit floating point type @code{_Float16} on some
  46. platforms, but does not support the 256-bit floating point type.
Each of the formats encodes the floating-point number starting with a
sign bit. After this comes an exponent that specifies a power of 2
(with a fixed offset). Then comes the significand.
  50. The first bit of the significand, before the binary point, is always
  51. 1, so there is no need to store it in memory. It is called the
  52. @dfn{hidden bit} because it doesn't appear in the floating-point
  53. number as used in the computer itself.
  54. All of those floating-point formats are sign-magnitude representations,
  55. so +0 and @minus{}0 are different values.
  56. Besides the IEEE 754 format 128-bit float, GNU C also offers a format
  57. consisting of a pair of 64-bit floating point numbers. This lacks the
  58. full exponent range of the IEEE 128-bit format, but is useful when the
  59. underlying hardware platform does not support that.
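To make the encoding concrete, here is a minimal sketch that copies the
bytes of a @code{float} into an unsigned integer and picks apart the
three fields. It assumes the common GNU C situation in which
@code{float} is the IEEE 754 32-bit format and @code{unsigned int} is
32 bits wide.

@example
#include <stdio.h>
#include <string.h>

int
main (void)
@{
  float f = -0.15625f;    /* @r{exactly representable: -1.25 * 2**(-3)} */
  unsigned int bits;

  memcpy (&bits, &f, sizeof bits);   /* @r{reinterpret the stored bytes} */

  unsigned int sign = bits >> 31;
  unsigned int exponent = (bits >> 23) & 0xff;   /* @r{biased by 127} */
  unsigned int significand = bits & 0x7fffff;    /* @r{hidden bit not stored} */

  printf ("sign=%u exponent=%u (power %d) significand=0x%06x\n",
          sign, exponent, (int) exponent - 127, significand);
  return 0;
@}
@end example

@noindent
On such a platform this prints @code{sign=1 exponent=124 (power -3)
significand=0x200000}.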
  60. @node Floating Type Specs
  61. @section Floating-Point Type Specifications
  62. The standard library header file @file{float.h} defines a number of
  63. constants that describe the platform's implementation of
  64. floating-point types @code{float}, @code{double} and @code{long
  65. double}. They include:
  66. @findex FLT_MIN
  67. @findex DBL_MIN
  68. @findex LDBL_MIN
  69. @findex FLT_HAS_SUBNORM
  70. @findex DBL_HAS_SUBNORM
  71. @findex LDBL_HAS_SUBNORM
  72. @findex FLT_TRUE_MIN
  73. @findex DBL_TRUE_MIN
  74. @findex LDBL_TRUE_MIN
  75. @findex FLT_MAX
  76. @findex DBL_MAX
  77. @findex LDBL_MAX
  78. @findex FLT_DECIMAL_DIG
  79. @findex DBL_DECIMAL_DIG
  80. @findex LDBL_DECIMAL_DIG
  81. @table @code
  82. @item FLT_MIN
  83. @itemx DBL_MIN
  84. @itemx LDBL_MIN
  85. Defines the minimum normalized positive floating-point values that can
  86. be represented with the type.
  87. @item FLT_HAS_SUBNORM
  88. @itemx DBL_HAS_SUBNORM
  89. @itemx LDBL_HAS_SUBNORM
Defines whether the floating-point type supports subnormal (or
``denormalized'') numbers (@pxref{subnormal numbers}).
  92. @item FLT_TRUE_MIN
  93. @itemx DBL_TRUE_MIN
  94. @itemx LDBL_TRUE_MIN
  95. Defines the minimum positive values (including subnormal values) that
  96. can be represented with the type.
  97. @item FLT_MAX
  98. @itemx DBL_MAX
  99. @itemx LDBL_MAX
  100. Defines the largest values that can be represented with the type.
  101. @item FLT_DECIMAL_DIG
  102. @itemx DBL_DECIMAL_DIG
  103. @itemx LDBL_DECIMAL_DIG
  104. Defines the number of decimal digits @code{n} such that any
  105. floating-point number that can be represented in the type can be
  106. rounded to a floating-point number with @code{n} decimal digits, and
  107. back again, without losing any precision of the value.
  108. @end table
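As a quick check of what those limits are on a given platform, a
program can simply print the macros; here is a minimal sketch for the
@code{double} type (@code{DBL_TRUE_MIN} and @code{DBL_HAS_SUBNORM}
require a C11 or newer library):

@example
#include <float.h>
#include <stdio.h>

int
main (void)
@{
  printf ("DBL_MIN         = %a\n", DBL_MIN);
  printf ("DBL_TRUE_MIN    = %a\n", DBL_TRUE_MIN);
  printf ("DBL_MAX         = %a\n", DBL_MAX);
  printf ("DBL_HAS_SUBNORM = %d\n", DBL_HAS_SUBNORM);
  printf ("DBL_DECIMAL_DIG = %d\n", DBL_DECIMAL_DIG);
  return 0;
@}
@end example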
  109. @node Special Float Values
  110. @section Special Floating-Point Values
  111. @cindex special floating-point values
  112. @cindex floating-point values, special
  113. IEEE floating point provides for special values that are not ordinary
  114. numbers.
  115. @table @asis
  116. @item infinities
  117. @code{+Infinity} and @code{-Infinity} are two different infinite
  118. values, one positive and one negative. These result from
  119. operations such as @code{1 / 0}, @code{Infinity + Infinity},
  120. @code{Infinity * Infinity}, and @code{Infinity + @var{finite}}, and also
  121. from a result that is finite, but larger than the most positive possible
  122. value or smaller than the most negative possible value.
  123. @xref{Handling Infinity}, for more about working with infinities.
  124. @item NaNs (not a number)
  125. @cindex QNaN
  126. @cindex SNaN
  127. There are two special values, called Not-a-Number (NaN): a quiet
  128. NaN (QNaN), and a signaling NaN (SNaN).
  129. A QNaN is produced by operations for which the value is undefined
  130. in real arithmetic, such as @code{0 / 0}, @code{sqrt (-1)},
  131. @code{Infinity - Infinity}, and any basic operation in which an
  132. operand is a QNaN.
  133. The signaling NaN is intended for initializing
  134. otherwise-unassigned storage, and the goal is that unlike a
  135. QNaN, an SNaN @emph{does} cause an interrupt that can be caught
  136. by a software handler, diagnosed, and reported. In practice,
  137. little use has been made of signaling NaNs, because the most
  138. common CPUs in desktop and portable computers fail to implement
  139. the full IEEE 754 Standard, and supply only one kind of NaN, the
  140. quiet one. Also, programming-language standards have taken
  141. decades to catch up to the IEEE 754 standard, and implementations
of those language standards lag still further behind, which delays the
point at which programmers become willing to use these features.
  144. To enable support for signaling NaNs, use the GCC command-line option
  145. @option{-fsignaling-nans}, but this is an experimental feature and may
  146. not work as expected in every situation.
  147. A NaN has a sign bit, but its value means nothing.
  148. @xref{Handling NaN}, for more about working with NaNs.
  149. @item subnormal numbers
  150. @cindex subnormal numbers
  151. @cindex underflow, floating
  152. @cindex floating underflow
  153. @anchor{subnormal numbers}
  154. It can happen that a computed floating-point value is too small to
  155. represent, such as when two tiny numbers are multiplied. The result
  156. is then said to @dfn{underflow}. The traditional behavior before
  157. the IEEE 754 Standard was to use zero as the result, and possibly to report
  158. the underflow in some sort of program output.
  159. The IEEE 754 Standard is vague about whether rounding happens
  160. before detection of floating underflow and overflow, or after, and CPU
  161. designers may choose either.
  162. However, the Standard does something unusual compared to earlier
  163. designs, and that is that when the result is smaller than the
  164. smallest @dfn{normalized} representable value (i.e., one in
  165. which the leading significand bit is @code{1}), the normalization
  166. requirement is relaxed, leading zero bits are permitted, and
  167. precision is gradually lost until there are no more bits in the
  168. significand. That phenomenon is called @dfn{gradual underflow},
  169. and it serves important numerical purposes, although it does
  170. reduce the precision of the final result. Some floating-point
  171. designs allow you to choose at compile time, or even at
  172. run time, whether underflows are gradual, or are flushed abruptly
  173. to zero. Numbers that have entered the region of gradual
  174. underflow are called @dfn{subnormal}.
  175. You can use the library functions @code{fesetround} and
  176. @code{fegetround} to set and get the rounding mode. Rounding modes
are defined (if supported by the platform) in @file{fenv.h} as:
  178. @code{FE_UPWARD} to round toward positive infinity; @code{FE_DOWNWARD}
  179. to round toward negative infinity; @code{FE_TOWARDZERO} to round
  180. toward zero; and @code{FE_TONEAREST} to round to the nearest
  181. representable value, the default mode. It is best to use
  182. @code{FE_TONEAREST} except when there is a special need for some other
  183. mode.
  184. @end table
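To watch gradual underflow happen, you can start from the smallest
normalized @code{double} and keep halving it: every halving is exact
until the significand bits run out, and the value finally rounds to
zero. This is only a sketch, and it assumes IEEE 754 @code{double}
arithmetic with subnormal support and no flush-to-zero mode:

@example
#include <float.h>
#include <stdio.h>

int
main (void)
@{
  double x = DBL_MIN;      /* @r{smallest normalized double} */
  int steps = 0;

  while (x != 0.0)
    @{
      x /= 2.0;            /* @r{exact while any significand bits remain} */
      steps++;
    @}
  /* @r{52 halvings give nonzero subnormals; the 53rd rounds to zero.} */
  printf ("halvings until zero: %d\n", steps);
  printf ("DBL_TRUE_MIN = %a\n", DBL_TRUE_MIN);
  return 0;
@}
@end example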
  185. @node Invalid Optimizations
  186. @section Invalid Optimizations
  187. @cindex invalid optimizations in floating-point arithmetic
  188. @cindex floating-point arithmetic invalid optimizations
  189. Signed zeros, Infinity, and NaN invalidate some optimizations by
  190. programmers and compilers that might otherwise have seemed obvious:
  191. @itemize @bullet
  192. @item
  193. @code{x + 0} and @code{x - 0} are not the same as @code{x} when
  194. @code{x} is zero, because the result depends on the rounding rule.
  195. @xref{Rounding}, for more about rounding rules.
  196. @item
  197. @code{x * 0.0} is not the same as @code{0.0} when @code{x} is
  198. Infinity, a NaN, or negative zero.
  199. @item
  200. @code{x / x} is not the same as @code{1.0} when @code{x} is Infinity,
  201. a NaN, or zero.
  202. @item
  203. @code{(x - y)} is not the same as @code{-(y - x)} because when the
  204. operands are finite and equal, one evaluates to @code{+0} and the
  205. other to @code{-0}.
  206. @item
  207. @code{x - x} is not the same as @code{0.0} when @var{x} is Infinity or
  208. a NaN.
  209. @item
  210. @code{x == x} and @code{x != x} are not equivalent to @code{1} and
  211. @code{0} when @var{x} is a NaN.
  212. @item
  213. @code{x < y} and @code{isless (x, y)} are not equivalent, because the
  214. first sets a sticky exception flag (@pxref{Exception Flags}) when an
  215. operand is a NaN, whereas the second does not affect that flag. The
  216. same holds for the other @code{isxxx} functions that are companions to
  217. relational operators. @xref{FP Comparison Functions, , , libc, The
  218. GNU C Library Reference Manual}.
  219. @end itemize
  220. The @option{-funsafe-math-optimizations} option enables
  221. these optimizations.
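Here is a small sketch that exercises several of those cases. It
assumes IEEE 754 semantics, and it must be compiled @emph{without}
options such as @option{-ffast-math} that license the invalid
transformations:

@example
#include <math.h>
#include <stdio.h>

int
main (void)
@{
  double zero = 0.0;
  double pinf = 1.0 / zero;      /* @r{+Infinity} */
  double qnan = zero / zero;     /* @r{quiet NaN} */

  printf ("-0.0              = %g\n", -0.0);
  printf ("-0.0 + 0.0        = %g\n", -0.0 + 0.0);        /* @r{+0, not -0} */
  printf ("pinf * 0.0        = %g\n", pinf * 0.0);         /* @r{NaN, not 0} */
  printf ("pinf - pinf       = %g\n", pinf - pinf);        /* @r{NaN, not 0} */
  printf ("qnan == qnan      = %d\n", qnan == qnan);       /* @r{0, not 1} */
  printf ("qnan < 1.0        = %d\n", qnan < 1.0);         /* @r{0; raises invalid} */
  printf ("isless(qnan, 1.0) = %d\n", isless (qnan, 1.0)); /* @r{0; quiet} */
  return 0;
@}
@end example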
  222. @node Exception Flags
  223. @section Floating Arithmetic Exception Flags
  224. @cindex floating arithmetic exception flags
  225. @cindex exception flags (floating point)
  226. @cindex sticky exception flags (floating point)
  227. @cindex floating overflow
  228. @cindex overflow, floating
  229. @cindex floating underflow
  230. @cindex underflow, floating
  231. @dfn{Sticky exception flags} record the occurrence of particular
  232. conditions: once set, they remain set until the program explicitly
  233. clears them.
  234. The conditions include @emph{invalid operand},
@emph{division-by-zero}, @emph{inexact result} (i.e., one that
  236. required rounding), @emph{underflow}, and @emph{overflow}. Some
  237. extended floating-point designs offer several additional exception
  238. flags. The functions @code{feclearexcept}, @code{feraiseexcept},
  239. @code{fetestexcept}, @code{fegetexceptflags}, and
  240. @code{fesetexceptflags} provide a standardized interface to those
  241. flags. @xref{Status bit operations, , , libc, The GNU C Library
  242. Reference Manual}.
  243. One important use of those @anchor{fetestexcept}flags is to do a
  244. computation that is normally expected to be exact in floating-point
  245. arithmetic, but occasionally might not be, in which case, corrective
  246. action is needed. You can clear the @emph{inexact result} flag with a
  247. call to @code{feclearexcept (FE_INEXACT)}, do the computation, and
  248. then test the flag with @code{fetestexcept (FE_INEXACT)}; the result
of that call is 0 if the flag is not set (there was no rounding), and
nonzero when there was rounding (which, we presume, implies the
program has to correct for that).
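Here is a minimal sketch of that pattern: the exact division by 2 does
not set the flag, while the division by 3 does. On GNU/Linux, link
with @option{-lm} for the @file{fenv.h} functions.

@example
#include <fenv.h>
#include <stdio.h>

int
main (void)
@{
  volatile double x = 3.0, y = 8.0;
  double z;

  feclearexcept (FE_INEXACT);
  z = y / x;                 /* @r{8/3 is not exact in binary} */
  if (fetestexcept (FE_INEXACT))
    printf ("z = %g required rounding\n", z);

  feclearexcept (FE_INEXACT);
  z = y / 2.0;               /* @r{exact: power-of-two divisor} */
  if (fetestexcept (FE_INEXACT))
    printf ("this line is not reached\n");
  return 0;
@}
@end example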
  252. @c =====================================================================
  253. @ignore
  254. @node IEEE 754 Decimal Arithmetic
  255. @section IEEE 754 Decimal Arithmetic
  256. @cindex IEEE 754 decimal arithmetic
  257. One of the difficulties that users of computers for numerical
  258. work face, whether they realize it or not, is that the computer
  259. does not operate in the number base that most people are familiar
  260. with. As a result, input decimal fractions must be converted to
  261. binary floating-point values for use in computations, and then
  262. the final results converted back to decimal for humans. Because the
  263. precision is finite and limited, and because algorithms for correct
  264. round-trip conversion between number bases were not known until the
  265. 1990s, and are still not implemented on most systems and most
  266. programming languages, the result is frequent confusion for users,
  267. when a simple expression like @code{3.0*(1.0/3.0)} evaluates to
  268. 0.999999 instead of the expected 1.0. Here is an
  269. example from a floating-point calculator that allows rounding-mode
  270. control, with the mode set to @emph{round-towards-zero}:
  271. @example
  272. for (k = 1; k <= 10; ++k)
  273. (void)printf ("%2d\t%10.6f\n", k, k*(1.0/k))
  274. 1 1.000000
  275. 2 1.000000
  276. 3 0.999999
  277. 4 1.000000
  278. 5 0.999999
  279. 6 0.999999
  280. 7 0.999999
  281. 8 1.000000
  282. 9 0.999999
  283. 10 0.999999
  284. @end example
  285. Increasing working precision can sometimes help by reducing
  286. intermediate rounding errors, but the reality is that when
  287. fractional values are involved, @emph{no amount} of extra
  288. precision can suffice for some computations. For example, the
  289. nice decimal value @code{1/10} in C99-style binary representation
  290. is @code{+0x1.999999999999ap-4}; that final digit @code{a} is the
  291. rounding of an infinite string of @code{9}'s.
  292. Financial computations in particular depend critically on correct
  293. arithmetic, and the losses due to rounding errors can be
  294. large, especially for businesses with large numbers of small
  295. transactions, such as grocery stores and telephone companies.
  296. Tax authorities are particularly picky, and demand specific
  297. rounding rules, including one that instead of rounding ties to
  298. the nearest number, rounds instead in the direction that favors
  299. the taxman.
  300. Programming languages used for business applications, notably the
  301. venerable Cobol language, have therefore always implemented
  302. financial computations in @emph{fixed-point decimal arithmetic}
  303. in software, and because of the large monetary amounts that must be
  304. processed, successive Cobol standards have increased the minimum
  305. number size from 18 to 32 decimal digits, and the most recent one
  306. requires a decimal exponent range of at least @code{[-999, +999]}.
  307. The revised IEEE 754-2008 standard therefore requires decimal
  308. floating-point arithmetic, as well as the now-widely used binary
  309. formats from 1985. Like the binary formats, the decimal formats
  310. also support Infinity, NaN, and signed zero, and the five basic
  311. operations are also required to produce correctly rounded
  312. representations of infinitely precise exact results.
  313. However, the financial applications of decimal arithmetic
  314. introduce some new features:
  315. @itemize @bullet
  316. @item
  317. There are three decimal formats occupying 32, 64, and 128 bits of
  318. storage, and offering exactly 7, 16, and 34 decimal digits of
  319. precision. If one size has @code{n} digits, the next larger size
  320. has @code{2 n + 2} digits. Thus, a future 256-bit format would
  321. supply 70 decimal digits, and at least one library already
  322. supports the 256-bit binary and decimal formats.
  323. @item
  324. Decimal arithmetic has an additional rounding mode, called
  325. @emph{round-ties-to-away-from-zero}, meaning that a four-digit
  326. rounding of @code{1.2345} is @code{1.235}, and @code{-1.2345}
  327. becomes @code{-1.235}. That rounding mode is mandated by
  328. financial laws in several countries.
  329. @item
  330. The decimal significand is an
  331. @anchor{decimal-significand}@emph{integer}, instead of a fractional
  332. value, and trailing zeros are only removed at user request. That
  333. feature allows floating-point arithmetic to emulate the
  334. @emph{fixed-point arithmetic} traditionally used in financial
  335. computations.
  336. @end itemize
  337. @noindent
  338. We can easily estimate how many digits are likely to be needed for
  339. financial work: seven billion people on Earth, with an average annual
  340. income of less than US$10,000, means a world financial base that can
  341. be represented in just 15 decimal digits. Even allowing for alternate
  342. currencies, future growth, multiyear accounting, and intermediate
  343. computations, the 34 digits supplied by the 128-bit format are more
  344. than enough for financial purposes.
  345. We return to decimal arithmetic later in this chapter
  346. (@pxref{More on decimal floating-point arithmetic}), after we have
  347. covered more about floating-point arithmetic in general.
  348. @c =====================================================================
  349. @end ignore
  350. @node Exact Floating-Point
  351. @section Exact Floating-Point Arithmetic
  352. @cindex exact floating-point arithmetic
  353. @cindex floating-point arithmetic, exact
  354. As long as the numbers are exactly representable (fractions whose
  355. denominator is a power of 2), and intermediate results do not require
  356. rounding, then floating-point arithmetic is @emph{exact}. It is easy
  357. to predict how many digits are needed for the results of arithmetic
  358. operations:
  359. @itemize @bullet
  360. @item
  361. addition and subtraction of two @var{n}-digit values with the
  362. @emph{same} exponent require at most @code{@var{n} + 1} digits, but
  363. when the exponents differ, many more digits may be needed;
  364. @item
  365. multiplication of two @var{n}-digit values requires exactly
  366. 2 @var{n} digits;
  367. @item
  368. although integer division produces a quotient and a remainder of
no more than @var{n} digits, floating-point remainder and square
  370. root may require an unbounded number of digits, and the quotient
  371. can need many more digits than can be stored.
  372. @end itemize
  373. Whenever a result requires more than @var{n} digits, rounding
  374. is needed.
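For example, sums of values whose denominators are powers of 2 are
exact in binary floating point, while a familiar decimal fraction such
as 0.1 is already rounded before any arithmetic happens. This small
sketch assumes IEEE 754 @code{double}:

@example
#include <stdio.h>

int
main (void)
@{
  printf ("%d\n", 0.5 + 0.25 == 0.75);   /* @r{1: all values exact} */
  printf ("%d\n", 0.1 + 0.2 == 0.3);     /* @r{0: every value is rounded} */
  printf ("%.17g\n", 0.1 + 0.2);         /* @r{0.30000000000000004} */
  return 0;
@}
@end example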
  375. @c =====================================================================
  376. @node Rounding
  377. @section Rounding
  378. @cindex rounding
  379. When floating-point arithmetic produces a result that can't fit
  380. exactly in the significand of the type that's in use, it has to
  381. @dfn{round} the value. The basic arithmetic operations---addition,
  382. subtraction, multiplication, division, and square root---always produce
  383. a result that is equivalent to the exact, possibly infinite-precision
  384. result rounded to storage precision according to the current rounding
  385. rule.
  386. Rounding sets the @code{FE_INEXACT} exception flag (@pxref{Exception
  387. Flags}). This enables programs to determine that rounding has
  388. occurred.
  389. Rounding consists of adjusting the exponent to bring the significand
  390. back to the required base-point alignment, then applying the current
  391. @dfn{rounding rule} to squeeze the significand into the fixed
  392. available size.
  393. The current rule is selected at run time from four options. Here they
  394. are:
@itemize @bullet
  396. @item
  397. @emph{round-to-nearest}, with ties rounded to an even integer;
  398. @item
  399. @emph{round-up}, towards @code{+Infinity};
  400. @item
  401. @emph{round-down}, towards @code{-Infinity};
  402. @item
  403. @emph{round-towards-zero}.
  404. @end itemize
  405. Under those four rounding rules, a decimal value
  406. @code{-1.2345} that is to be rounded to a four-digit result would
  407. become @code{-1.234}, @code{-1.234}, @code{-1.235}, and
  408. @code{-1.234}, respectively.
  409. The default rounding rule is @emph{round-to-nearest}, because that has
  410. the least bias, and produces the lowest average error. When the true
  411. result lies exactly halfway between two representable machine numbers,
  412. the result is rounded to the one that ends with an even digit.
  413. The @emph{round-towards-zero} rule was common on many early computer
  414. designs, because it is the easiest to implement: it just requires
  415. silent truncation of all extra bits.
  416. The two other rules, @emph{round-up} and @emph{round-down}, are
  417. essential for implementing @dfn{interval arithmetic}, whereby
  418. each arithmetic operation produces lower and upper bounds that
  419. are guaranteed to enclose the exact result.
  420. @xref{Rounding Control}, for details on getting and setting the
  421. current rounding mode.
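Here is a minimal sketch of the ties-to-even behavior of the default
rule, using the @code{double} machine epsilon from @file{float.h}
(@pxref{Machine Epsilon}). It assumes that @code{double} expressions
are evaluated in @code{double} precision, as is usual on x86-64 with
SSE arithmetic:

@example
#include <float.h>
#include <stdio.h>

int
main (void)
@{
  double eps = DBL_EPSILON;      /* @r{spacing just above 1.0} */

  /* @r{1 + eps/2 is exactly halfway between 1 and 1 + eps;} */
  /* @r{the tie goes to 1, whose last significand bit is even.} */
  printf ("%d\n", 1.0 + eps / 2.0 == 1.0);                     /* @r{1} */

  /* @r{(1 + eps) + eps/2 is halfway between 1 + eps and 1 + 2*eps;} */
  /* @r{the tie goes to 1 + 2*eps, the neighbor with an even last bit.} */
  printf ("%d\n", (1.0 + eps) + eps / 2.0 == 1.0 + 2.0 * eps); /* @r{1} */
  return 0;
@}
@end example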
  422. @node Rounding Issues
  423. @section Rounding Issues
  424. @cindex rounding issues (floating point)
  425. @cindex floating-point rounding issues
  426. The default IEEE 754 rounding mode minimizes errors, and most
  427. normal computations should not suffer any serious accumulation of
  428. errors from rounding.
  429. Of course, you can contrive examples where that is not so. Here
  430. is one: iterate a square root, then attempt to recover the
  431. original value by repeated squaring.
  432. @example
#include <stdio.h>
#include <math.h>

int
main (void)
@{
  double x = 100.0;
  double y;
  int k, n;

  for (n = 10; n <= 100; n += 10)
    @{
      y = x;
      for (k = 0; k < n; ++k) y = sqrt (y);
      for (k = 0; k < n; ++k) y *= y;
      printf ("n = %3d; x = %.0f\ty = %.6f\n", n, x, y);
    @}
  return 0;
@}
  448. @end example
  449. @noindent
  450. Here is the output:
  451. @example
  452. n = 10; x = 100 y = 100.000000
  453. n = 20; x = 100 y = 100.000000
  454. n = 30; x = 100 y = 99.999977
  455. n = 40; x = 100 y = 99.981025
  456. n = 50; x = 100 y = 90.017127
  457. n = 60; x = 100 y = 1.000000
  458. n = 70; x = 100 y = 1.000000
  459. n = 80; x = 100 y = 1.000000
  460. n = 90; x = 100 y = 1.000000
  461. n = 100; x = 100 y = 1.000000
  462. @end example
  463. After 50 iterations, @code{y} has barely one correct digit, and
  464. soon after, there are no correct digits.
  465. @c =====================================================================
  466. @node Significance Loss
  467. @section Significance Loss
  468. @cindex significance loss (floating point)
  469. @cindex floating-point significance loss
  470. A much more serious source of error in floating-point computation is
  471. @dfn{significance loss} from subtraction of nearly equal values. This
  472. means that the number of bits in the significand of the result is
  473. fewer than the size of the value would permit. If the values being
  474. subtracted are close enough, but still not equal, a @emph{single
  475. subtraction} can wipe out all correct digits, possibly contaminating
  476. all future computations.
  477. Floating-point calculations can sometimes be carefully designed so
  478. that significance loss is not possible, such as summing a series where
  479. all terms have the same sign. For example, the Taylor series
  480. expansions of the trigonometric and hyperbolic sines have terms of
  481. identical magnitude, of the general form @code{@var{x}**(2*@var{n} +
  482. 1) / (2*@var{n} + 1)!}. However, those in the trigonometric sine series
  483. alternate in sign, while those in the hyperbolic sine series are all
  484. positive. Here is the output of two small programs that sum @var{k}
  485. terms of the series for @code{sin (@var{x})}, and compare the computed
  486. sums with known-to-be-accurate library functions:
  487. @example
  488. x = 10 k = 51
  489. s (x) = -0.544_021_110_889_270
  490. sin (x) = -0.544_021_110_889_370
  491. x = 20 k = 81
  492. s (x) = 0.912_945_250_749_573
  493. sin (x) = 0.912_945_250_727_628
  494. x = 30 k = 109
  495. s (x) = -0.987_813_746_058_855
  496. sin (x) = -0.988_031_624_092_862
  497. x = 40 k = 137
  498. s (x) = 0.617_400_430_980_474
  499. sin (x) = 0.745_113_160_479_349
  500. x = 50 k = 159
  501. s (x) = 57_105.187_673_745_720_532
  502. sin (x) = -0.262_374_853_703_929
  503. // sinh(x) series summation with positive signs
  504. // with k terms needed to converge to machine precision
  505. x = 10 k = 47
  506. t (x) = 1.101_323_287_470_340e+04
  507. sinh (x) = 1.101_323_287_470_339e+04
  508. x = 20 k = 69
  509. t (x) = 2.425_825_977_048_951e+08
  510. sinh (x) = 2.425_825_977_048_951e+08
  511. x = 30 k = 87
  512. t (x) = 5.343_237_290_762_229e+12
  513. sinh (x) = 5.343_237_290_762_231e+12
  514. x = 40 k = 105
  515. t (x) = 1.176_926_334_185_100e+17
  516. sinh (x) = 1.176_926_334_185_100e+17
  517. x = 50 k = 121
  518. t (x) = 2.592_352_764_293_534e+21
  519. sinh (x) = 2.592_352_764_293_536e+21
  520. @end example
  521. @noindent
  522. We have added underscores to the numbers to enhance readability.
  523. The @code{sinh (@var{x})} series with positive terms can be summed to
  524. high accuracy. By contrast, the series for @code{sin (@var{x})}
  525. suffers increasing significance loss, so that when @var{x} = 30 only
  526. two correct digits remain. Soon after, all digits are wrong, and the
  527. answers are complete nonsense.
  528. An important skill in numerical programming is to recognize when
  529. significance loss is likely to contaminate a computation, and revise
  530. the algorithm to reduce this problem. Sometimes, the only practical
  531. way to do so is to compute in higher intermediate precision, which is
  532. why the extended types like @code{long double} are important.
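The two small programs behind that table are not reproduced here, but
a minimal sketch of the alternating summation for @code{sin (@var{x})}
might look like this; the fixed count of 160 terms is an arbitrary
cutoff that is enough for the arguments shown, and the program should
be linked with @option{-lm}:

@example
#include <math.h>
#include <stdio.h>

/* @r{Sum k terms of the Taylor series x - x**3/3! + x**5/5! - ...} */
static double
sin_series (double x, int k)
@{
  double term = x;
  double sum = x;
  int n;

  for (n = 1; n < k; n++)
    @{
      term *= -x * x / ((2.0 * n) * (2.0 * n + 1.0));
      sum += term;
    @}
  return sum;
@}

int
main (void)
@{
  double x;

  for (x = 10.0; x <= 50.0; x += 10.0)
    printf ("x = %2g  series = %.15g  sin = %.15g\n",
            x, sin_series (x, 160), sin (x));
  return 0;
@}
@end example

@noindent
For the larger arguments, the huge positive and negative terms cancel
almost completely, which is precisely the significance loss described
above.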
  533. @c Formerly mentioned @code{__float128}
  534. @c =====================================================================
  535. @node Fused Multiply-Add
  536. @section Fused Multiply-Add
  537. @cindex fused multiply-add in floating-point computations
  538. @cindex floating-point fused multiply-add
  539. In 1990, when IBM introduced the POWER architecture, the CPU
  540. provided a previously unknown instruction, the @dfn{fused
  541. multiply-add} (FMA). It computes the value @code{x * y + z} with
  542. an @strong{exact} double-length product, followed by an addition with a
  543. @emph{single} rounding. Numerical computation often needs pairs of
  544. multiply and add operations, for which the FMA is well-suited.
  545. On the POWER architecture, there are two dedicated registers that
  546. hold permanent values of @code{0.0} and @code{1.0}, and the
  547. normal @emph{multiply} and @emph{add} instructions are just
  548. wrappers around the FMA that compute @code{x * y + 0.0} and
  549. @code{x * 1.0 + z}, respectively.
  550. In the early days, it appeared that the main benefit of the FMA
  551. was getting two floating-point operations for the price of one,
  552. almost doubling the performance of some algorithms. However,
  553. numerical analysts have since shown numerous uses of the FMA for
  554. significantly enhancing accuracy. We discuss one of the most
  555. important ones in the next section.
  556. A few other architectures have since included the FMA, and most
  557. provide variants for the related operations @code{x * y - z}
  558. (FMS), @code{-x * y + z} (FNMA), and @code{-x * y - z} (FNMS).
  559. @c The IEEE 754-2008 revision requires implementations to provide
  560. @c the FMA, as a sixth basic operation.
  561. The functions @code{fmaf}, @code{fma}, and @code{fmal} implement fused
  562. multiply-add for the @code{float}, @code{double}, and @code{long
  563. double} data types. Correct implementation of the FMA in software is
  564. difficult, and some systems that appear to provide those functions do
  565. not satisfy the single-rounding requirement. That situation should
  566. change as more programmers use the FMA operation, and more CPUs
  567. provide FMA in hardware.
  568. Use the @option{-ffp-contract=fast} option to allow generation of FMA
  569. instructions, or @option{-ffp-contract=off} to disallow it.
  570. @c =====================================================================
  571. @node Error Recovery
  572. @section Error Recovery
  573. @cindex error recovery (floating point)
  574. @cindex floating-point error recovery
  575. When two numbers are combined by one of the four basic
  576. operations, the result often requires rounding to storage
  577. precision. For accurate computation, one would like to be able
  578. to recover that rounding error. With historical floating-point
  579. designs, it was difficult to do so portably, but now that IEEE
  580. 754 arithmetic is almost universal, the job is much easier.
  581. For addition with the default @emph{round-to-nearest} rounding
  582. mode, we can determine the error in a sum like this:
  583. @example
volatile double err, sum, tmp, x, y;

if (fabs (x) >= fabs (y))
  @{
    sum = x + y;
    tmp = sum - x;
    err = y - tmp;
  @}
else                     /* fabs (x) < fabs (y) */
  @{
    sum = x + y;
    tmp = sum - y;
    err = x - tmp;
  @}
  597. @end example
  598. @noindent
  599. @cindex twosum
  600. Now, @code{x + y} is @emph{exactly} represented by @code{sum + err}.
  601. This basic operation, which has come to be called @dfn{twosum}
  602. in the numerical-analysis literature, is the first key to tracking,
  603. and accounting for, rounding error.
  604. To determine the error in subtraction, just swap the @code{+} and
  605. @code{-} operators.
  606. We used the @code{volatile} qualifier (@pxref{volatile}) in the
  607. declaration of the variables, which forces the compiler to store and
  608. retrieve them from memory, and prevents the compiler from optimizing
  609. @code{err = y - ((x + y) - x)} into @code{err = 0}.
  610. For multiplication, we can compute the rounding error without
  611. magnitude tests with the FMA operation (@pxref{Fused Multiply-Add}),
  612. like this:
  613. @example
  614. volatile double err, prod, x, y;
  615. prod = x * y; /* @r{rounded product} */
  616. err = fma (x, y, -prod); /* @r{exact product @code{= @var{prod} + @var{err}}} */
  617. @end example
  618. For addition, subtraction, and multiplication, we can represent the
  619. exact result with the notional sum of two values. However, the exact
  620. result of division, remainder, or square root potentially requires an
  621. infinite number of digits, so we can at best approximate it.
  622. Nevertheless, we can compute an error term that is close to the true
  623. error: it is just that error value, rounded to machine precision.
  624. For division, you can approximate @code{x / y} with @code{quo + err}
  625. like this:
  626. @example
  627. volatile double err, quo, x, y;
  628. quo = x / y;
  629. err = fma (-quo, y, x) / y;
  630. @end example
  631. For square root, we can approximate @code{sqrt (x)} with @code{root +
  632. err} like this:
  633. @example
  634. volatile double err, root, x;
  635. root = sqrt (x);
  636. err = fma (-root, root, x) / (root + root);
  637. @end example
  638. With the reliable and predictable floating-point design provided
  639. by IEEE 754 arithmetic, we now have the tools we need to track
  640. errors in the five basic floating-point operations, and we can
  641. effectively simulate computing in twice working precision, which
  642. is sometimes sufficient to remove almost all traces of arithmetic
  643. errors.
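One well-known application of the @code{twosum} idea is
@emph{compensated summation} (often credited to W. Kahan), in which
the rounding error of each addition is carried into the next one
instead of being discarded. Here is a sketch:

@example
/* @r{Sum n doubles, carrying the rounding error of each addition along.} */
double
compensated_sum (const double *v, int n)
@{
  volatile double sum = 0.0, err = 0.0, tmp, y;
  int i;

  for (i = 0; i < n; i++)
    @{
      y = v[i] - err;         /* @r{fold in the previous rounding error} */
      tmp = sum + y;
      err = (tmp - sum) - y;  /* @r{rounding error of that addition} */
      sum = tmp;
    @}
  return sum;
@}
@end example

@noindent
For long sums of values of similar magnitude, the result is usually
almost as accurate as if the sum had been accumulated in twice the
working precision.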
  644. @c =====================================================================
  645. @ignore
  646. @node Double-Rounding Problems
  647. @section Double-Rounding Problems
  648. @cindex double-rounding problems (floating point)
  649. @cindex floating-point double-rounding problems
  650. Most developers today use 64-bit x86 processors with a 64-bit
  651. operating system, with a Streaming SIMD Extensions (SSE) instruction
  652. set. In the past, using a 32-bit x87 instruction set was common,
  653. leading to issues described in this section.
  654. To offer a few more digits of precision and a wider exponent range,
  655. the IEEE 754 Standard included an optional @emph{temporary real}
  656. format, with 11 more bits in the significand, and 4 more bits in the
  657. biased exponent.
  658. Compilers are free to exploit the longer format, and most do so.
  659. That is usually a @emph{good thing}, such as in computing a
  660. lengthy sum or product, or in implementing the computation of the
  661. hypotenuse of a right-triangle as @code{sqrt (x*x + y*y)}: the
  662. wider exponent range is critical there for avoiding disastrous
  663. overflow or underflow.
  664. @findex fesetprec
  665. @findex fegetprec
  666. However, sometimes it is critical to know what the intermediate
  667. precision and rounding mode are, such as in tracking errors with
  668. the techniques of the preceding section. Some compilers provide
  669. options to prevent the use of the 80-bit format in computations
  670. with 64-bit @code{double}, but ensuring that code is always
  671. compiled that way may be difficult. The x86 architecture has the
  672. ability to force rounding of all operations in the 80-bit
  673. registers to the 64-bit storage format, and some systems provide
  674. a software interface with the functions @code{fesetprec} and
  675. @code{fegetprec}. Unfortunately, neither of those functions is
  676. defined by the ISO Standards for C and C++, and consequently,
  677. they are not standardly available on all platforms that use
  678. the x86 floating-point design.
  679. When @code{double} computations are done in the 80-bit format,
  680. results necessarily involve a @dfn{double rounding}: first to the
  681. 64-bit significand in intermediate operations in registers, and
  682. then to the 53-bit significand when the register contents are
  683. stored to memory. Here is an example in decimal arithmetic where
  684. such a double rounding results in the wrong answer: round
  685. @code{1_234_999} from seven to five to four digits. The result is
  686. @code{1_235_000}, whereas the correct representation to four
  687. significant digits is @code{1_234_000}.
  688. @cindex -ffloat-store
  689. One way to reduce the use of the 80-bit format is to declare variables
  690. as @code{volatile double}: that way, the compiler is required to store
  691. and load intermediates from memory, rather than keeping them in 80-bit
  692. registers over long sequences of floating-point instructions. Doing
  693. so does not, however, eliminate double rounding. The now-common
  694. x86-64 architecture has separate sets of 32-bit and 64-bit
floating-point registers. The option @option{-ffloat-store} says that
  696. floating-point computation should use only those registers, thus eliminating
  697. the possibility of double rounding.
  698. @end ignore
  699. @c =====================================================================
  700. @node Exact Floating Constants
  701. @section Exact Floating-Point Constants
  702. @cindex exact specification of floating-point constants
  703. @cindex floating-point constants, exact specification of
  704. One of the frustrations that numerical programmers have suffered
  705. with since the dawn of digital computers is the inability to
  706. precisely specify numbers in their programs. On the early
  707. decimal machines, that was not an issue: you could write a
  708. constant @code{1e-30} and be confident of that exact value
  709. being used in floating-point operations. However, when the
  710. hardware works in a base other than 10, then human-specified
  711. numbers have to be converted to that base, and then converted
  712. back again at output time. The two base conversions are rarely
  713. exact, and unwanted rounding errors are introduced.
  714. @cindex hexadecimal floating-point constants
C solves this problem with hexadecimal floating-point constants, which
specify a binary floating-point value exactly. For example,
@code{+0x1.fffffcp-1} is the IEEE 754 32-bit value @code{1.0 - 0x1p-23}
(@code{1.0} minus the 32-bit machine epsilon), slightly below
@code{1.0}. The significand is represented as a
  721. hexadecimal fraction, and the @emph{power of two} is written in
  722. decimal following the exponent letter @code{p} (the traditional
  723. exponent letter @code{e} is not possible, because it is a hexadecimal
  724. digit).
  725. In @code{printf} and @code{scanf} and related functions, you can use
  726. the @samp{%a} and @samp{%A} format specifiers for writing and reading
  727. hexadecimal floating-point values. @samp{%a} writes them with lower
  728. case letters and @samp{%A} writes them with upper case letters. For
  729. instance, this code reproduces our sample number:
  730. @example
  731. printf ("%a\n", 1.0 - pow (2.0, -23));
  732. @print{} 0x1.fffffcp-1
  733. @end example
  734. @noindent
  735. The @code{strtod} family was similarly extended to recognize
  736. numbers in that new format.
  737. If you want to ensure exact data representation for transfer of
  738. floating-point numbers between C programs on different
  739. computers, then hexadecimal constants are an optimum choice.
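For instance, here is a sketch that writes a @code{double} as a
hexadecimal constant and reads it back with @code{strtod}; the round
trip reproduces exactly the same value:

@example
#include <stdio.h>
#include <stdlib.h>

int
main (void)
@{
  double x = 0.1;      /* @r{not exact in binary, but fixed once stored} */
  char buf[64];
  double y;

  snprintf (buf, sizeof buf, "%a", x);   /* @r{e.g. 0x1.999999999999ap-4} */
  y = strtod (buf, NULL);                /* @r{recovers the same value} */
  printf ("%s  round trip %s\n", buf, x == y ? "exact" : "NOT exact");
  return 0;
@}
@end example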
  740. @c =====================================================================
  741. @node Handling Infinity
  742. @section Handling Infinity
  743. @cindex infinity in floating-point arithmetic
  744. @cindex floating-point infinity
  745. As we noted earlier, the IEEE 754 model of computing is not to stop
  746. the program when exceptional conditions occur. It takes note of
  747. exceptional values or conditions by setting sticky @dfn{exception
  748. flags}, or by producing results with the special values Infinity and
  749. QNaN. In this section, we discuss Infinity; @pxref{Handling NaN} for
  750. the other.
  751. In GNU C, you can create a value of negative Infinity in software like
  752. this:
  753. @verbatim
  754. double x;
  755. x = -1.0 / 0.0;
  756. @end verbatim
GNU C supplies the @code{__builtin_inf}, @code{__builtin_inff}, and
@code{__builtin_infl} built-in functions, and the GNU C Library
provides the @code{INFINITY} macro, all of which yield compile-time
constants for positive infinity.
  761. GNU C also provides a standard function to test for an Infinity:
  762. @code{isinf (x)} returns @code{1} if the argument is a signed
  763. infinity, and @code{0} if not.
  764. Infinities can be compared, and all Infinities of the same sign are
  765. equal: there is no notion in IEEE 754 arithmetic of different kinds of
  766. Infinities, as there are in some areas of mathematics. Positive
Infinity is larger than any finite value, and negative Infinity is
smaller than any finite value.
  769. Infinities propagate in addition, subtraction, multiplication,
  770. and square root, but in division, they disappear, because of the
  771. rule that @code{finite / Infinity} is @code{0.0}. Thus, an
  772. overflow in an intermediate computation that produces an Infinity
  773. is likely to be noticed later in the final results. The programmer can
  774. then decide whether the overflow is expected, and acceptable, or whether
  775. the code possibly has a bug, or needs to be run in higher
  776. precision, or redesigned to avoid the production of the Infinity.
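Here is a small sketch that produces an Infinity by overflow and then
detects it:

@example
#include <float.h>
#include <math.h>
#include <stdio.h>

int
main (void)
@{
  double x = 1e308;
  double y = x * 10.0;          /* @r{overflows: +Infinity} */

  if (isinf (y))
    printf ("y overflowed to %g\n", y);

  printf ("1.0 / y     = %g\n", 1.0 / y);      /* @r{Infinity disappears: 0} */
  printf ("y > DBL_MAX = %d\n", y > DBL_MAX);  /* @r{1} */
  return 0;
@}
@end example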
  777. @c =====================================================================
  778. @node Handling NaN
  779. @section Handling NaN
  780. @cindex NaN in floating-point arithmetic
  781. @cindex not a number
  782. @cindex floating-point NaN
  783. NaNs are not numbers: they represent values from computations that
  784. produce undefined results. They have a distinctive property that
  785. makes them unlike any other floating-point value: they are
  786. @emph{unequal to everything, including themselves}! Thus, you can
  787. write a test for a NaN like this:
  788. @example
  789. if (x != x)
  790. printf ("x is a NaN\n");
  791. @end example
  792. @noindent
  793. This test works in GNU C, but some compilers might evaluate that test
  794. expression as false without properly checking for the NaN value.
  795. A more portable way to test for NaN is to use the @code{isnan}
  796. function declared in @code{math.h}:
  797. @example
  798. if (isnan (x))
  799. printf ("x is a NaN\n");
  800. @end example
  801. @noindent
  802. @xref{Floating Point Classes, , , libc, The GNU C Library Reference Manual}.
  803. One important use of NaNs is marking of missing data. For
  804. example, in statistics, such data must be omitted from
  805. computations. Use of any particular finite value for missing
  806. data would eventually collide with real data, whereas such data
  807. could never be a NaN, so it is an ideal marker. Functions that
  808. deal with collections of data that may have holes can be written
  809. to test for, and ignore, NaN values.
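For example, here is a sketch of an average that treats NaN entries as
missing data and simply skips them (the function name is ours, not a
library function):

@example
#include <math.h>
#include <stdio.h>

/* @r{Mean of the non-NaN entries; NaN if every entry is missing.} */
static double
mean_ignoring_nan (const double *v, int n)
@{
  double sum = 0.0;
  int count = 0, i;

  for (i = 0; i < n; i++)
    if (!isnan (v[i]))
      @{
        sum += v[i];
        count++;
      @}
  return count ? sum / count : NAN;
@}

int
main (void)
@{
  double data[] = @{ 1.0, NAN, 2.0, NAN, 6.0 @};

  printf ("mean = %g\n", mean_ignoring_nan (data, 5));   /* @r{3} */
  return 0;
@}
@end example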
  810. It is easy to generate a NaN in computations: evaluating @code{0.0 /
  811. 0.0} is the commonest way, but @code{Infinity - Infinity},
  812. @code{Infinity / Infinity}, and @code{sqrt (-1.0)} also work.
  813. Functions that receive out-of-bounds arguments can choose to return a
  814. stored NaN value, such as with the @code{NAN} macro defined in
  815. @code{math.h}, but that does not set the @emph{invalid operand}
  816. exception flag, and that can fool some programs.
  817. @cindex NaNs-always-propagate rule
  818. Like Infinity, NaNs propagate in computations, but they are even
  819. stickier, because they never disappear in division. Thus, once a
  820. NaN appears in a chain of numerical operations, it is almost
  821. certain to pop out into the final results. The programmer
  822. has to decide whether that is expected, or whether there is a
  823. coding or algorithmic error that needs repair.
In general, when a function gets a NaN argument, it usually returns a
  825. NaN. However, there are some exceptions in the math-library functions
  826. that you need to be aware of, because they violate the
  827. @emph{NaNs-always-propagate} rule:
  828. @itemize @bullet
  829. @item
  830. @code{pow (x, 0.0)} always returns @code{1.0}, even if @code{x} is
  831. 0.0, Infinity, or a NaN.
  832. @item
  833. @code{pow (1, y)} always returns @code{1}, even if @code{y} is a NaN.
  834. @item
  835. @code{hypot (INFINITY, y)} and @code{hypot (-INFINITY, y)} both
always return @code{INFINITY}, even if @code{y} is a NaN.
  837. @item
  838. If just one of the arguments to @code{fmax (x, y)} or
  839. @code{fmin (x, y)} is a NaN, it returns the other argument. If
  840. both arguments are NaNs, it returns a NaN, but there is no
  841. requirement about where it comes from: it could be @code{x}, or
  842. @code{y}, or some other quiet NaN.
  843. @end itemize
  844. NaNs are also used for the return values of math-library
  845. functions where the result is not representable in real
  846. arithmetic, or is mathematically undefined or uncertain, such as
  847. @code{sqrt (-1.0)} and @code{sin (Infinity)}. However, note that a
  848. result that is merely too big to represent should always produce
  849. an Infinity, such as with @code{exp (1000.0)} (too big) and
  850. @code{exp (Infinity)} (truly infinite).
  851. @c =====================================================================
  852. @node Signed Zeros
  853. @section Signed Zeros
  854. @cindex signed zeros in floating-point arithmetic
  855. @cindex floating-point signed zeros
  856. The sign of zero is significant, and important, because it records the
  857. creation of a value that is too small to represent, but came from
  858. either the negative axis, or from the positive axis. Such fine
  859. distinctions are essential for proper handling of @dfn{branch cuts}
  860. in complex arithmetic (@pxref{Complex Arithmetic}).
  861. The key point about signed zeros is that in comparisons, their sign
  862. does not matter: @code{0.0 == -0.0} must @emph{always} evaluate to
  863. @code{1} (true). However, they are not @emph{the same number}, and
  864. @code{-0.0} in C code stands for a negative zero.
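Here is a short sketch of that distinction: the two zeros compare
equal, yet @code{signbit} (from @file{math.h}) and division results
tell them apart:

@example
#include <math.h>
#include <stdio.h>

int
main (void)
@{
  double pz = 0.0, nz = -0.0;

  printf ("pz == nz     : %d\n", pz == nz);          /* @r{1} */
  printf ("signbit (nz) : %d\n", signbit (nz) != 0); /* @r{1} */
  printf ("1.0 / pz     : %g\n", 1.0 / pz);          /* @r{inf} */
  printf ("1.0 / nz     : %g\n", 1.0 / nz);          /* @r{-inf} */
  return 0;
@}
@end example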
  865. @c =====================================================================
  866. @node Scaling by the Base
  867. @section Scaling by Powers of the Base
  868. @cindex scaling floating point by powers of the base
  869. @cindex floating-point scaling by powers of the base
  870. We have discussed rounding errors several times in this chapter,
  871. but it is important to remember that when results require no more
  872. bits than the exponent and significand bits can represent, those results
  873. are @emph{exact}.
  874. One particularly useful exact operation is scaling by a power of
  875. the base. While one, in principle, could do that with code like
  876. this:
  877. @example
  878. y = x * pow (2.0, (double)k); /* @r{Undesirable scaling: avoid!} */
  879. @end example
  880. @noindent
  881. that is not advisable, because it relies on the quality of the
  882. math-library power function, and that happens to be one of the
  883. most difficult functions in the C math library to make accurate.
  884. What is likely to happen on many systems is that the returned
  885. value from @code{pow} will be close to a power of two, but
  886. slightly different, so the subsequent multiplication introduces
  887. rounding error.
  888. The correct, and fastest, way to do the scaling is either via the
  889. traditional C library function, or with its C99 equivalent:
  890. @example
  891. y = ldexp (x, k); /* @r{Traditional pre-C99 style.} */
  892. y = scalbn (x, k); /* @r{C99 style.} */
  893. @end example
  894. @noindent
  895. Both functions return @code{x * 2**k}.
  896. @xref{Normalization Functions, , , libc, The GNU C Library Reference Manual}.
  897. @c =====================================================================
  898. @node Rounding Control
  899. @section Rounding Control
  900. @cindex rounding control (floating point)
  901. @cindex floating-point rounding control
  902. Here we describe how to specify the rounding mode at run time. System
  903. header file @file{fenv.h} provides the prototypes for these functions.
  904. @xref{Rounding, , , libc, The GNU C Library Reference Manual}.
  905. @noindent
  906. That header file also provides constant names for the four rounding modes:
  907. @code{FE_DOWNWARD}, @code{FE_TONEAREST}, @code{FE_TOWARDZERO}, and
  908. @code{FE_UPWARD}.
  909. The function @code{fegetround} examines and returns the current
  910. rounding mode. On a platform with IEEE 754 floating point,
  911. the value will always equal one of those four constants.
  912. On other platforms, it may return a negative value. The function
  913. @code{fesetround} sets the current rounding mode.
  914. Changing the rounding mode can be slow, so it is useful to minimize
  915. the number of changes. For interval arithmetic, we seem to need three
  916. changes for each operation, but we really only need two, because we
  917. can write code like this example for interval addition of two reals:
  918. @example
@{
  struct interval_double
  @{
    double hi, lo;
  @} v;
  volatile double x, y;
  int rule;

  rule = fegetround ();

  if (fesetround (FE_UPWARD) == 0)
    @{
      v.hi = x + y;
      v.lo = -(-x - y);
    @}
  else
    fatal ("ERROR: failed to change rounding rule");

  if (fesetround (rule) != 0)
    fatal ("ERROR: failed to restore rounding rule");
@}
  937. @end example
  938. @noindent
  939. The @code{volatile} qualifier (@pxref{volatile}) is essential on x86
  940. platforms to prevent an optimizing compiler from producing the same
  941. value for both bounds.
  942. @ignore We no longer discuss the double rounding issue.
  943. The code also needs to be compiled with the
  944. option @option{-ffloat-store} that prevents use of higher precision
  945. for the basic operations, because that would introduce double rounding
  946. that could spoil the bounds guarantee of interval arithmetic.
  947. @end ignore
  948. @c =====================================================================
  949. @node Machine Epsilon
  950. @section Machine Epsilon
  951. @cindex machine epsilon (floating point)
  952. @cindex floating-point machine epsilon
  953. In any floating-point system, three attributes are particularly
  954. important to know: @dfn{base} (the number that the exponent specifies
  955. a power of), @dfn{precision} (number of digits in the significand),
  956. and @dfn{range} (difference between most positive and most negative
values). The allocation of bits between exponent and significand
determines those three attributes.
  959. A measure of the precision is the answer to the question: what is
  960. the smallest number that can be added to @code{1.0} such that the
  961. sum differs from @code{1.0}? That number is called the
  962. @dfn{machine epsilon}.
  963. We could define the needed machine-epsilon constants for @code{float},
  964. @code{double}, and @code{long double} like this:
  965. @example
  966. static const float epsf = 0x1p-23; /* @r{about 1.192e-07} */
  967. static const double eps = 0x1p-52; /* @r{about 2.220e-16} */
  968. static const long double epsl = 0x1p-63; /* @r{about 1.084e-19} */
  969. @end example
  970. @noindent
  971. Instead of the hexadecimal constants, we could also have used the
  972. Standard C macros, @code{FLT_EPSILON}, @code{DBL_EPSILON}, and
  973. @code{LDBL_EPSILON}.
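
A short sketch, assuming IEEE 754 binary formats, lets you print those
macros in hexadecimal and compare them with the constants above; note
that the exact hexadecimal form chosen for @code{long double} can vary
between C libraries (for example, @code{0x8p-66} is the same number as
@code{0x1p-63}):

@example
#include <float.h>   /* @r{For FLT_EPSILON, DBL_EPSILON, LDBL_EPSILON.} */
#include <stdio.h>

int
main (void)
@{ /* @r{Print the standard epsilons in hexadecimal.} */
  printf ("FLT_EPSILON  = %a\n",  (double) FLT_EPSILON);
  printf ("DBL_EPSILON  = %a\n",  DBL_EPSILON);
  printf ("LDBL_EPSILON = %La\n", LDBL_EPSILON);
  return 0;
@}
@end example
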
It is useful to be able to compute the machine epsilons at
run time, and we can easily generalize the operation by replacing
the constant @code{1.0} with a user-supplied value:

@example
double
macheps (double x)
@{ /* @r{Return machine epsilon for @var{x},} */
  /* @r{such that @var{x} + macheps (@var{x}) > @var{x}.} */
  static const double base = 2.0;
  double eps;

  if (isnan (x))
    eps = x;
  else
    @{
      eps = (x == 0.0) ? 1.0 : x;

      while ((x + eps / base) != x)
        eps /= base;            /* @r{Always exact!} */
    @}

  return (eps);
@}
@end example
@noindent
If we call that function with arguments from @code{0} to
@code{10}, as well as Infinity and NaN, and print the returned
values in hexadecimal, we get output like this:

@example
macheps (  0) = 0x1.0000000000000p-1074
macheps (  1) = 0x1.0000000000000p-52
macheps (  2) = 0x1.0000000000000p-51
macheps (  3) = 0x1.8000000000000p-53
macheps (  4) = 0x1.0000000000000p-50
macheps (  5) = 0x1.4000000000000p-51
macheps (  6) = 0x1.8000000000000p-51
macheps (  7) = 0x1.c000000000000p-51
macheps (  8) = 0x1.0000000000000p-49
macheps (  9) = 0x1.2000000000000p-50
macheps ( 10) = 0x1.4000000000000p-50
macheps (Inf) = infinity
macheps (NaN) = nan
@end example
@noindent
Notice that @code{macheps} has a special test for a NaN to prevent an
infinite loop.

@ignore
@c We no longer discuss double rounding.
To ensure that no expressions are evaluated with an intermediate higher
precision, we can compile with the @option{-fexcess-precision=standard}
option, which tells the compiler that all calculation results, including
intermediate results, are to be put on the stack, forcing rounding.
@end ignore

Our code made another test for a zero argument to avoid getting a
zero return. The returned value in that case is the smallest
representable floating-point number, here the subnormal value
@code{2**(-1074)}, which is about @code{4.941e-324}.

No special test is needed for an Infinity, because the
@code{eps}-reduction loop then terminates at the first iteration.

Our @code{macheps} function here assumes binary floating point; some
architectures may differ.
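
One way to remove that assumption, sketched here rather than taken
from a library, is to read the base from the standard @code{FLT_RADIX}
macro in @file{float.h}; division by the native radix remains exact,
so the loop still works:

@example
#include <float.h>   /* @r{For FLT_RADIX.} */
#include <math.h>    /* @r{For isnan.} */

double
macheps_radix (double x)
@{ /* @r{Like macheps, but honor the native floating-point base.} */
  static const double base = (double) FLT_RADIX;
  double eps;

  if (isnan (x))
    eps = x;
  else
    @{
      eps = (x == 0.0) ? 1.0 : x;

      while ((x + eps / base) != x)
        eps /= base;            /* @r{Exact: only the exponent changes.} */
    @}

  return (eps);
@}
@end example
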
The C library includes some related functions that can also be used to
determine machine epsilons at run time:

@example
#include <math.h>    /* @r{Include for these prototypes.} */

double nextafter (double x, double y);
float nextafterf (float x, float y);
long double nextafterl (long double x, long double y);
@end example

@noindent
These return the machine number nearest @var{x} in the direction of
@var{y}. For example, @code{nextafter (1.0, 2.0)} produces the same
result as @code{1.0 + macheps (1.0)} and @code{1.0 + DBL_EPSILON}.
@xref{FP Bit Twiddling, , , libc, The GNU C Library Reference Manual}.
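
A small check program, assuming the @code{macheps} function defined
above is in scope, confirms that the three routes to the epsilon at
@code{1.0} agree:

@example
#include <float.h>
#include <math.h>
#include <stdio.h>

int
main (void)
@{ /* @r{Compare nextafter, macheps, and DBL_EPSILON at 1.0.} */
  double a = nextafter (1.0, 2.0) - 1.0;
  double b = macheps (1.0);
  double c = DBL_EPSILON;

  printf ("%a %a %a\n", a, b, c);   /* @r{Expect 0x1p-52 three times.} */
  return (a == b && b == c) ? 0 : 1;
@}
@end example
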
It is important to know that the machine epsilon is not symmetric
about all numbers. At the boundaries where normalization changes the
exponent, the epsilon just below @var{x} is smaller than that just
above @var{x} by a factor @code{1 / base}. For example, the machine
epsilon just above @code{1.0} is @code{+0x1p-52} (the value that
@code{macheps (1.0)} returns), whereas the epsilon just below
@code{1.0} is @code{+0x1p-53}. Some authors distinguish those cases by
calling them the @emph{positive} and @emph{negative}, or @emph{big} and
@emph{small}, machine epsilons. You can produce their values like
this:

@example
eps_neg = 1.0 - nextafter (1.0, -1.0);
eps_pos = nextafter (1.0, +2.0) - 1.0;
@end example
If @var{x} is a variable, such that you do not know its value at
compile time, then you can substitute literal @var{y} values with
either @code{-inf ()} or @code{+inf ()}, like this:

@example
eps_neg = x - nextafter (x, -inf ());
eps_pos = nextafter (x, +inf ()) - x;
@end example

@noindent
In such cases, if @var{x} is Infinity, then @emph{the @code{nextafter}
functions return @var{y} if @var{x} equals @var{y}}. Our two
assignments then produce @code{+0x1.fffffffffffffp+1023} (about
1.798e+308) for @var{eps_neg} and Infinity for @var{eps_pos}. Thus,
the call @code{nextafter (INFINITY, -INFINITY)} can be used to find
the largest representable finite number, and with the call
@code{nextafter (0.0, 1.0)}, the smallest representable number (here,
@code{0x1p-1074} (about 4.941e-324), a number that we saw before as
the output from @code{macheps (0.0)}).
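
A brief sketch shows both extremes being derived that way; on an IEEE
754 platform the first value matches @code{DBL_MAX} and the second is
the smallest subnormal:

@example
#include <float.h>
#include <math.h>
#include <stdio.h>

int
main (void)
@{ /* @r{Derive the extreme finite values from Infinity and zero.} */
  double biggest  = nextafter (INFINITY, -INFINITY);
  double smallest = nextafter (0.0, 1.0);

  printf ("largest finite   = %a (DBL_MAX = %a)\n", biggest, DBL_MAX);
  printf ("smallest nonzero = %a (expect 0x1p-1074)\n", smallest);
  return 0;
@}
@end example
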
@c =====================================================================
@node Complex Arithmetic
@section Complex Arithmetic
@cindex complex arithmetic in floating-point calculations
@cindex floating-point arithmetic with complex numbers

We've already looked at defining and referring to complex numbers
(@pxref{Complex Data Types}). What is important to discuss here are
some issues that are unlikely to be obvious to programmers without
extensive experience in both numerical computing, and in complex
arithmetic in mathematics.

The first important point is that, unlike real arithmetic, in complex
arithmetic, the danger of significance loss is @emph{pervasive}, and
affects @emph{every one} of the basic operations, and @emph{almost
all} of the math-library functions. To understand why, recall the
rules for complex multiplication and division:
@example
a = u + I*v   /* @r{First operand.} */
b = x + I*y   /* @r{Second operand.} */

prod = a * b
     = (u + I*v) * (x + I*y)
     = (u * x - v * y) + I*(v * x + u * y)

quo  = a / b
     = (u + I*v) / (x + I*y)
     = [(u + I*v) * (x - I*y)] / [(x + I*y) * (x - I*y)]
     = [(u * x + v * y) + I*(v * x - u * y)] / (x**2 + y**2)
@end example
@noindent
There are four critical observations about those formulas:

@itemize @bullet

@item
the multiplications on the right-hand side introduce the
possibility of premature underflow or overflow;

@item
the products must be accurate to twice working precision;

@item
there is @emph{always} one subtraction on the right-hand sides
that is subject to catastrophic significance loss (the
@code{fma}-based sketch after this list shows one defense); and

@item
complex multiplication has up to @emph{six} rounding errors, and
complex division has @emph{ten} rounding errors.
@end itemize
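
The second and third observations are not hopeless obstacles. If the
platform supplies a correctly rounded fused multiply-add, the critical
difference of products can be computed with only a small final error
using Kahan's well-known algorithm; here is a sketch for the real part
@code{u*x - v*y} (the same idea applies to the imaginary part):

@example
#include <math.h>   /* @r{For fma.} */

/* @r{Compute u*x - v*y accurately, using Kahan's algorithm.}
   @r{Requires a correctly rounded fma to be effective.} */
double
diff_of_products (double u, double x, double v, double y)
@{
  double w   = v * y;
  double err = fma (v, y, -w);   /* @r{Exact rounding error in w.} */
  double dop = fma (u, x, -w);   /* @r{u*x - w with a single rounding.} */

  return dop + err;
@}
@end example

@noindent
With a true hardware @code{fma}, the result is accurate to within a
couple of units in the last place; where @code{fma} is emulated in
software, the accuracy gain comes at a significant speed cost.
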
@cindex branch cuts

Another point that needs careful study is the fact that many functions
in complex arithmetic have @dfn{branch cuts}. You can view a
function with a complex argument, @code{f (z)}, as @code{f (x + I*y)},
and thus, it defines a relation between a point @code{(x, y)} on the
complex plane and an elevation value on a surface. A branch cut
looks like a tear in that surface, so approaching the cut from one
side produces a particular value, and from the other side, a quite
different value. Great care is needed to handle branch cuts properly,
and even small numerical errors can push a result from one side to the
other, radically changing the returned value. As we reported earlier,
correct handling of the sign of zero is critically important for
computing near branch cuts.

The best advice that we can give to programmers who need complex
arithmetic is to always use the @emph{highest precision available},
and then to carefully check the results of test calculations to gauge
the likely accuracy of the computed results. It is easy to supply
test values of real and imaginary parts where all five basic
operations in complex arithmetic, and almost all of the complex math
functions, lose @emph{all} significance, and fail to produce even a
single correct digit.

Even though complex arithmetic makes some programming tasks
easier, it may be numerically preferable to rework the algorithm
so that it can be carried out in real arithmetic. That is
commonly possible in matrix algebra.
GNU C can perform code optimization on complex number multiplication and
division if certain boundary checks will not be needed. The
command-line option @option{-fcx-limited-range} tells the compiler that
a range reduction step is not needed when performing complex division,
and that there is no need to check if a complex multiplication or
division results in the value @code{NaN + I*NaN}. These checks are
enabled by default; you can explicitly re-enable them with the
@option{-fno-cx-limited-range} option.
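
To see what that range-reduction step buys, compare the textbook
quotient formula with the classic scaled form due to Robert Smith.
The following is only a sketch of the idea, not the code GNU C
actually generates; it avoids forming @code{x**2 + y**2} directly:

@example
#include <math.h>   /* @r{For fabs.} */

/* @r{Divide (u + I*v) by (x + I*y); store the real and imaginary}
   @r{parts of the quotient in *qr and *qi.  Smith's scaling avoids}
   @r{the premature overflow of x*x + y*y in the textbook formula.} */
void
cdiv_smith (double u, double v, double x, double y,
            double *qr, double *qi)
@{
  if (fabs (x) >= fabs (y))
    @{
      double r = y / x;
      double d = x + y * r;
      *qr = (u + v * r) / d;
      *qi = (v - u * r) / d;
    @}
  else
    @{
      double r = x / y;
      double d = x * r + y;
      *qr = (u * r + v) / d;
      *qi = (v * r - u) / d;
    @}
@}
@end example

@noindent
Dividing @code{1e300 + I*1e300} by @code{1e308 + I*1e8}, for example,
the textbook formula produces @code{NaN + I*NaN} because
@code{x**2 + y**2} overflows, while the scaled form returns
approximately @code{1e-8 + I*1e-8}. The @option{-fcx-limited-range}
option amounts to a promise that such extreme operands cannot occur,
so the cheaper formula is acceptable.
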
@ignore
@c =====================================================================
@node More on Decimal Floating-Point Arithmetic
@section More on Decimal Floating-Point Arithmetic
@cindex decimal floating-point arithmetic
@cindex floating-point arithmetic, decimal

Proposed extensions to the C programming language call for the
inclusion of decimal floating-point arithmetic, which handles
floating-point numbers with a specified radix of 10, instead of the
unspecified traditional radix of 2.

The proposed new types are @code{_Decimal32}, @code{_Decimal64}, and
@code{_Decimal128}, with corresponding literal constant suffixes of
@code{df}, @code{dd}, and @code{dl}, respectively. For example, a
32-bit decimal floating-point variable could be defined as:

@example
_Decimal32 foo = 42.123df;
@end example
We stated earlier (@pxref{decimal-significand}) that the significand
in decimal floating-point arithmetic is an integer, rather than
fractional, value. Decimal instructions do not normally alter the
exponent by normalizing nonzero significands to remove trailing zeros.
That design feature is intentional: it allows emulation of the
fixed-point arithmetic that has commonly been used for financial
computations.

One consequence of the lack of normalization is that there are
multiple representations of any number that does not use all of the
significand digits. Thus, in the 32-bit format, the values
@code{1.DF}, @code{1.0DF}, @code{1.00DF}, @dots{}, @code{1.000_000DF},
all have different bit patterns in storage, even though they compare
equal. Thus, programmers need to be careful about trailing zero
digits, because they appear in the results, and affect scaling. For
example, @code{1.0DF * 1.0DF} evaluates to @code{1.00DF}, which is
stored as @code{100 * 10**(-2)}.

In general, when you look at a decimal expression with fractional
digits, you should mentally rewrite it in integer form with suitable
powers of ten. Thus, a multiplication like @code{1.23 * 4.56} really
means @code{123 * 10**(-2) * 456 * 10**(-2)}, which evaluates to
@code{56088 * 10**(-4)}, and would be output as @code{5.6088}.

Another consequence of the decimal significand choice is that
initializing decimal floating-point values to a pattern of
all-bits-zero does not produce the expected value @code{0.}: instead,
the result is the subnormal values @code{0.e-101}, @code{0.e-398}, and
@code{0.e-6176} in the three storage formats.

GNU C currently supports basic storage and manipulation of decimal
floating-point values on some platforms, and support is expected to
grow in future releases.

@c ??? Suggest chopping the rest of this section, at least for the time
@c ??? being. Decimal floating-point support in GNU C is not yet complete,
@c ??? and functionality discussed appears to not be available on all
@c ??? platforms, and is not obviously documented for end users of GNU C. --TJR
The exponent in decimal arithmetic is called the @emph{quantum}, and
financial computations require that the quantum always be preserved.
If it is not, then rounding may have happened, and additional scaling
is required.

The function @code{samequantumd (x,y)} for 64-bit decimal arithmetic
returns @code{1} if the arguments have the same exponent, and @code{0}
otherwise.

The function @code{quantized (x,y)} returns a value of @var{x} that has
been adjusted to have the same quantum as @var{y}; that adjustment
could require rounding of the significand. For example,
@code{quantized (5.dd, 1.00dd)} returns the value @code{5.00dd}, which
is stored as @code{500 * 10**(-2)}. As another example, a sales-tax
computation might be carried out like this:

@example
decimal_long_double amount, rate, total;

amount = 0.70DL;
rate   = 1.05DL;
total  = quantizedl (amount * rate, 1.00DL);
@end example

@noindent
Without the call to @code{quantizedl}, the result would have been
@code{0.7350}, instead of the correctly rounded @code{0.74}. That
particular example was chosen because it illustrates yet another
difference between decimal and binary arithmetic: in the latter, the
factors each require an infinite number of bits, and their product,
when converted to decimal, looks like @code{0.734_999_999@dots{}}.
Thus, rounding the product in binary format to two decimal places
always gets @code{0.73}, which is the @emph{wrong} answer for tax
laws.

In financial computations in decimal floating-point arithmetic, the
@code{quantized} function family is expected to receive wide use
whenever multiplication or division changes the desired quantum of the
result.

The function call @code{normalized (x)} returns a value that is
numerically equal to @var{x}, but with trailing zeros trimmed.
Here are some examples of its operation:

@multitable @columnfractions .5 .5
@headitem Function Call @tab Result
@item normalized (+0.00100DD) @tab +0.001DD
@item normalized (+1.00DD) @tab +1.DD
@item normalized (+1.E2DD) @tab +1E+2DD
@item normalized (+100.DD) @tab +1E+2DD
@item normalized (+100.00DD) @tab +1E+2DD
@item normalized (+NaN(0x1234)) @tab +NaN(0x1234)
@item normalized (-NaN(0x1234)) @tab -NaN(0x1234)
@item normalized (+Infinity) @tab +Infinity
@item normalized (-Infinity) @tab -Infinity
@end multitable
@noindent
The NaN examples show that payloads are preserved.

Because the @code{printf} and @code{scanf} families were designed long
before IEEE 754 decimal arithmetic, their format items do not support
distinguishing between numbers with identical values, but different
quanta, and yet, that distinction is likely to be needed in output.
The solution adopted by one early library for decimal arithmetic is to
provide a family of number-to-string conversion functions that
preserve quantization. Here is a code fragment, and its output, that
show how they work:

@example
decimal_float x;

x = 123.000DF;

printf ("%%He: x = %He\n", x);
printf ("%%Hf: x = %Hf\n", x);
printf ("%%Hg: x = %Hg\n", x);
printf ("ntosdf (x) = %s\n", ntosdf (x));

%He: x = 1.230000e+02
%Hf: x = 123.000000
%Hg: x = 123
ntosdf (x) = +123.000
@end example

@noindent
The format modifier letter @code{H} indicates a 32-bit decimal value,
and the modifiers @code{DD} and @code{DL} correspond to the two other
formats.

@c =====================================================================
@end ignore
@node Round-Trip Base Conversion
@section Round-Trip Base Conversion
@cindex round-trip base conversion
@cindex base conversion (floating point)
@cindex floating-point round-trip base conversion

Most numeric programs involve converting between base-2 floating-point
numbers, as represented by the computer, and base-10 floating-point
numbers, as entered and handled by the programmer. What might not be
obvious is the number of base-2 bits vs. base-10 digits required for
each representation. Consider the following tables showing the number of
decimal digits representable in a given number of bits, and vice versa:

@multitable @columnfractions .5 .1 .1 .1 .1 .1
@item binary in   @tab 24 @tab 53 @tab 64 @tab 113 @tab 237
@item decimal out @tab  9 @tab 17 @tab 21 @tab  36 @tab  73
@end multitable

@multitable @columnfractions .5 .1 .1 .1 .1
@item decimal in  @tab  7 @tab 16 @tab  34 @tab  70
@item binary out  @tab 25 @tab 55 @tab 114 @tab 234
@end multitable
We can compute the table numbers with these two functions:

@example
#include <math.h>   /* @r{For ceil, log2, and log10.} */

int
matula (int nbits)
@{ /* @r{Return output decimal digits needed for nbits-bits input.} */
  return ((int) ceil ((double) nbits / log2 (10.0) + 1.0));
@}

int
goldberg (int ndec)
@{ /* @r{Return output bits needed for ndec-digits input.} */
  return ((int) ceil ((double) ndec / log10 (2.0) + 1.0));
@}
@end example
One significant observation from those numbers is that we cannot
achieve correct round-trip conversion between the decimal and
binary formats of the same storage size! For example, we need 25
bits to represent a 7-digit value from the 32-bit decimal format,
but the 32-bit binary format has only 24 significand bits available.
Similar observations hold for each of the other conversion pairs.
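
Assuming the two functions above are in scope, a couple of the table
entries can be reproduced like this:

@example
#include <stdio.h>

int
main (void)
@{ /* @r{Reproduce two of the table entries.} */
  printf ("24 bits in  -> %d decimal digits out\n", matula (24));   /* @r{9} */
  printf ("7 digits in -> %d bits out\n",           goldberg (7));  /* @r{25} */
  return 0;
@}
@end example
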
The general input/output base-conversion problem is astonishingly
complicated, and solutions were not generally known until the
publication of two papers in 1990 that are listed near the end
of this chapter. For the 128-bit formats, the worst case needs more
than 11,500 decimal digits of precision to guarantee correct rounding
in a binary-to-decimal conversion!

For further details, see the references for I. Bennett Goldberg and
David W. Matula.
@c =====================================================================
@node Further Reading
@section Further Reading

The subject of floating-point arithmetic is much more complex
than many programmers seem to think, and few books on programming
languages spend much time in that area. In this chapter, we have
tried to expose the reader to some of the key ideas, and to warn
of easily overlooked pitfalls that can soon lead to nonsensical
results. There are a few good references that we recommend
for further reading, and for finding other important material
about computer arithmetic:

@c =====================================================================
@c Each bibliography item has a sort key, so the bibliography can be
@c sorted in emacs with M-x sort-paragraphs on the region with the items.
@c =====================================================================
@itemize @bullet

@item @c sort-key: Abbott
Paul H. Abbott and 15 others, @cite{Architecture and software support
in IBM S/390 Parallel Enterprise Servers for IEEE Floating-Point
arithmetic}, IBM Journal of Research and Development @b{43}(5/6)
723--760 (1999),
@uref{https://doi.org/10.1147/rd.435.0723}. This article gives
a good description of IBM's algorithm for exact decimal-to-binary
conversion, complementing earlier ones by Clinger and others.

@item @c sort-key: Beebe
Nelson H. F. Beebe, @cite{The Mathematical-Function Computation Handbook:
Programming Using the MathCW Portable Software Library},
Springer (2017), ISBN 3-319-64109-3 (hardcover), 3-319-64110-7 (e-book)
(xxxvi + 1114 pages),
@uref{https://doi.org/10.1007/978-3-319-64110-2}.
This book describes portable implementations of a large superset
of the mathematical functions available in many programming
languages, extended to a future 256-bit format (70 decimal
digits), for both binary and decimal floating point. It includes
a substantial portion of the functions described in the famous
@cite{NIST Handbook of Mathematical Functions}, Cambridge (2010),
ISBN 0-521-19225-0.
See
@uref{http://www.math.utah.edu/pub/mathcw}
for compilers and libraries.
@item @c sort-key: Clinger-1990
William D. Clinger, @cite{How to Read Floating Point Numbers
Accurately}, ACM SIGPLAN Notices @b{25}(6) 92--101 (June 1990),
@uref{https://doi.org/10.1145/93548.93557}.
See also the papers by Steele & White.

@item @c sort-key: Clinger-2004
William D. Clinger, @cite{Retrospective: How to read floating
point numbers accurately}, ACM SIGPLAN Notices @b{39}(4) 360--371 (April 2004),
@uref{http://doi.acm.org/10.1145/989393.989430}. Reprint of the 1990
paper, with additional commentary.

@item @c sort-key: Goldberg-1967
I. Bennett Goldberg, @cite{27 Bits Are Not Enough For 8-Digit Accuracy},
Communications of the ACM @b{10}(2) 105--106 (February 1967),
@uref{http://doi.acm.org/10.1145/363067.363112}. This paper,
and its companions by David Matula, address the base-conversion
problem, and show that the naive formulas are wrong by one or
two digits.

@item @c sort-key: Goldberg-1991
David Goldberg, @cite{What Every Computer Scientist Should Know
About Floating-Point Arithmetic}, ACM Computing Surveys @b{23}(1)
5--58 (March 1991), corrigendum @b{23}(3) 413 (September 1991),
@uref{https://doi.org/10.1145/103162.103163}.
This paper has been widely distributed, and reissued in vendor
programming-language documentation. It is well worth reading,
and then rereading from time to time.

@item @c sort-key: Juffa
Norbert Juffa and Nelson H. F. Beebe, @cite{A Bibliography of
Publications on Floating-Point Arithmetic},
@uref{http://www.math.utah.edu/pub/tex/bib/fparith.bib}.
This is the largest known bibliography of publications about
floating-point, and also integer, arithmetic. It is actively
maintained, and in mid-2019 contained more than 6400 references to
original research papers, reports, theses, books, and Web sites on the
subject matter. It can be used to locate the latest research in the
field, and the historical coverage dates back to a 1726 paper on
signed-digit arithmetic, and an 1837 paper by Charles Babbage, the
intellectual father of mechanical computers. The entries for the
Abbott, Clinger, and Steele & White papers cited earlier contain
pointers to several other important related papers on the
base-conversion problem.
@item @c sort-key: Kahan
William Kahan, @cite{Branch Cuts for Complex Elementary Functions, or
Much Ado About Nothing's Sign Bit} (1987),
@uref{http://people.freebsd.org/~das/kahan86branch.pdf}.
This Web document about the fine points of complex arithmetic
also appears in the volume edited by A. Iserles and
M. J. D. Powell, @cite{The State of the Art in Numerical
Analysis: Proceedings of the Joint IMA/SIAM Conference on the
State of the Art in Numerical Analysis held at the University of
Birmingham, 14--18 April 1986}, Oxford University Press (1987),
ISBN 0-19-853614-3 (xiv + 719 pages). Its author is the famous
chief architect of the IEEE 754 arithmetic system, and one of the
world's greatest experts in the field of floating-point
arithmetic. An entire generation of his students at the
University of California, Berkeley, have gone on to careers in
academia and industry, spreading the knowledge of how to do
floating-point arithmetic right.

@item @c sort-key: Knuth
Donald E. Knuth, @cite{A Simple Program Whose Proof Isn't},
in @cite{Beauty is our business: a birthday salute to Edsger
W. Dijkstra}, W. H. J. Feijen, A. J. M. van Gasteren,
D. Gries, and J. Misra (eds.), Springer (1990), ISBN
1-4612-8792-8,
@uref{https://doi.org/10.1007/978-1-4612-4476-9}. This book
chapter supplies a correctness proof of the decimal to
binary, and binary to decimal, conversions in fixed-point
arithmetic in the @TeX{} typesetting system. The proof evaded
its author for a dozen years.
@item @c sort-key: Matula-1968a
David W. Matula, @cite{In-and-out conversions},
Communications of the ACM @b{11}(1) 47--50 (January 1968),
@uref{https://doi.org/10.1145/362851.362887}.

@item @c sort-key: Matula-1968b
David W. Matula, @cite{The Base Conversion Theorem},
Proceedings of the American Mathematical Society @b{19}(3)
716--723 (June 1968). See also other papers here by this author,
and by I. Bennett Goldberg.

@item @c sort-key: Matula-1970
David W. Matula, @cite{A Formalization of Floating-Point Numeric
Base Conversion}, IEEE Transactions on Computers @b{C-19}(8)
681--692 (August 1970),
@uref{https://doi.org/10.1109/T-C.1970.223017}.

@item @c sort-key: Muller-2010
Jean-Michel Muller and eight others, @cite{Handbook of
Floating-Point Arithmetic}, Birkh@"auser-Boston (2010), ISBN
0-8176-4704-X (xxiii + 572 pages),
@uref{https://doi.org/10.1007/978-0-8176-4704-9}. This is a
comprehensive treatise from a French team who are among the
world's greatest experts in floating-point arithmetic, and among
the most prolific writers of research papers in that field. They
have much to teach, and their book deserves a place on the
shelves of every serious numerical programmer.

@item @c sort-key: Muller-2018
Jean-Michel Muller and eight others, @cite{Handbook of
Floating-Point Arithmetic}, Second edition, Birkh@"auser-Boston (2018), ISBN
3-319-76525-6 (xxv + 627 pages),
@uref{https://doi.org/10.1007/978-3-319-76526-6}. This is a new
edition of the preceding entry.
@item @c sort-key: Overton
Michael Overton, @cite{Numerical Computing with IEEE Floating
Point Arithmetic, Including One Theorem, One Rule of Thumb, and
One Hundred and One Exercises}, SIAM (2001), ISBN 0-89871-482-6
(xiv + 104 pages),
@uref{http://www.ec-securehost.com/SIAM/ot76.html}.
This is a small volume that can be covered in a few hours.

@item @c sort-key: Steele-1990
Guy L. Steele Jr. and Jon L. White, @cite{How to Print
Floating-Point Numbers Accurately}, ACM SIGPLAN Notices
@b{25}(6) 112--126 (June 1990),
@uref{https://doi.org/10.1145/93548.93559}.
See also the papers by Clinger.

@item @c sort-key: Steele-2004
Guy L. Steele Jr. and Jon L. White, @cite{Retrospective: How to
Print Floating-Point Numbers Accurately}, ACM SIGPLAN Notices
@b{39}(4) 372--389 (April 2004),
@uref{http://doi.acm.org/10.1145/989393.989431}. Reprint of 1990
paper, with additional commentary.

@item @c sort-key: Sterbenz
Pat H. Sterbenz, @cite{Floating Point Computation}, Prentice-Hall
(1974), ISBN 0-13-322495-3 (xiv + 316 pages). This often-cited book
provides solid coverage of what floating-point arithmetic was like
@emph{before} the introduction of IEEE 754 arithmetic.
@end itemize