/* * mulsf3.s * * Floating point multiply, single precision: r0r1 *= r2r3 * * The contents of this file are subject to the Mozilla Public License * Version 1.0 (the "License"); you may not use this file except in * compliance with the License. You may obtain a copy of the License at * http://www.mozilla.org/MPL/ * * Software distributed under the License is distributed on an "AS IS" * basis, WITHOUT WARRANTY OF ANY KIND, either express or implied. See the * License for the specific language governing rights and limitations * under the License. * * The Original Code is Librcx floating point code, released May 27, 1999. * * The Initial Developer of the Original Code is Kekoa Proudfoot. * Portions created by Kekoa Proudfoot are Copyright (C) 1999 * Kekoa Proudfoot. All Rights Reserved. * * Contributor(s): Kekoa Proudfoot */ ; possible optimizations: ; - use push/pop to save exponent and sign on stack ; - might it have been faster/simpler to inline a 24 iteration multiply? ; - this only requires 3b + 3b + 4b + 1b in regs to implement? .section .text ;; ;; function: mulsf3 ;; input: float in r0r1 and float at sp+2 ;; output: float in r0r1 ;; .global ___mulsf3 ___mulsf3: ; Invoke the prologue to expand input operands jsr ___startsf ; At this point, registers/stack contain the following: ; r0r1 - first operand ; r2h - second operand flags ; r2l - second operand sign ; r3h - first operand flags ; r3l - first operand sign ; r4 - first operand exponent ; r5r6 - first operand mantissa ; sp+0 - second operand exponent ; sp+2 - second operand flags (same as r2h) ; sp+3 - second operand sign (same as r2l) ; sp+4 - second operand mantissa ; sp+16 - second operand ; Note on flag bits: 0=zero, 1=inf, 2=nan ; Is the first operand a NaN? ; If yes, return the first operand (the value already in r0r1) bld #2,r3h ; if nan flag of first operand set ; Hack! bcs return_jmp ; carry set indicates true ; Is the second operand a NaN? bld #2,r2h ; if nan flag of second operand set bcc endif_0 ; carry clear indicates false ; Return the second operand (which we need to load off stack) mov.w @(16,r7),r0 ; set return value to second operand (sp+16) mov.w @(18,r7),r1 ; Hack! return_jmp: bra return endif_0: ; Is the first operand infinity and the second operand zero? ; Or is the second operand infinity and the first operand zero? bld #1,r3h ; load inf flag of first operand to carry band #0,r2h ; and carry with zero flag of second operand bcs if_1 ; carry set indicates true bld #1,r2h ; load inf flag of second operand to carry band #0,r3h ; and carry with zero flag of first operand bcc endif_1 ; carry clear indicates false if_1: ; Zero multiplied by infinity, so return NaN mov.w #0x7fff,r0 ; set return value to NaN (7fffffff) mov.w #0xffff,r1 bra return endif_1: ; At this point, we no longer need the following: ; r0r1 - first operand ; r2h - second operand flags ; r3h - first operand flags ; Compute new exponent mov.w @r7,r0 ; load second operand exponent (sp+0) to temp add.w r0,r4 ; add temp to exponent of first operand (r4) add.b #-127,r4l ; subtract bias addx.b #-1,r4h ; finish subtraction ; Compute new sign xor.b r2l,r3l ; xor second sign (r2l) into first sign (r3l) ; At this point r2l is also free (and therefore all of r2 is free) ; We now want to multiply the two mantissas ; We need r3 and r4 free to do this ; Save result exponent and sign to stack mov.w r4,@r7 ; sp+0 is result exponent mov.b r3l,@(3,r7) ; sp+3 is result sign ; Given two 1.23 mantissas, compute a 2.30 product with a sticky lsb ; Use three 24 by 8 multiplies to multiply two 24-bit mantissas ; Save first mantissa to r2r3 mov.w r5,r2 ; copy r5r6 to r2r3 mov.w r6,r3 ; note multiply done using r5r6 ; Perform first multiply (first mantissa * lower byte of second mantissa) mov.b @(7,r7),r4l ; load lower byte of second mantissa to r4l bsr mul_24_8 ; multiply r5l r6h r6l by r4l ; Save lower four result bytes in r0r1 mov.w r5,r0 ; copy r5r6 to r0r1 mov.w r6,r1 ; Perform second multiply (first mantissa * middle byte of second mantissa) mov.w r2,r5 ; copy first mantissa back to r5r6 from r2r3 mov.w r3,r6 mov.b @(6,r7),r4l ; load middle byte of second mantissa to r4l bsr mul_24_8 ; multiply r5l r6h r6l by r4l ; Combine new result into old and propagate carry into fifth result byte add.b r6l,r1h ; add corresponding bytes addx.b r6h,r0l addx.b r5l,r0h addx.b #0,r5h ; propagate carry ; Combine lower two result bytes into a single sticky byte or.b r1l,r1h ; compute new sticky byte ; Save fifth result byte mov.b r5h,r1l ; store fifth result byte ; Perform final multiply (first mantissa * upper byte of second mantissa) mov.w r2,r5 ; copy first mantissa back to r5r6 from r2r3 mov.w r3,r6 mov.b @(5,r7),r4l ; load upper byte of second mantissa to r4l bsr mul_24_8 ; multiply r5l r6h r6l by r4l ; Combine old result into new and propagate carry into upper result byte add.w r0,r6 ; add corresponding bytes addx.b r1l,r5l addx.b #0,r5h ; propagate carry ; 2.30 result now in r5r6 ; Set sticky bit if sticky byte or result lsb non-zero add.b #0xff,r1h ; set carry if sticky byte non-zero bor #0,r6l ; or carry with lsb to get sticky bit bst #0,r6l ; store sticky bit in lsb ; Sticky shift the result right one place to get a 2.29 value shlr.b r5h ; shift mantissa right 1 place rotxr.b r5l rotxr.b r6h rotxr.b r6l ; places old sticky bit in carry bor #0,r6l ; or lsb with old sticky bit to get new bit bst #0,r6l ; store new sticky bit ; Restore result exponent and sign from stack mov.w @r7,r4 ; sp+0 is result exponent mov.b @(3,r7),r3l ; sp+3 is result sign ; Pack the result jsr ___joinsf return: ; Invoke the epilogue to cleanup and return jmp ___finishsf ;; ;; function: mul_24_8: ;; input: three-byte operand in r5l r6h r6l, single-byte operand in r4l ;; output: four-byte product in r5 r6 ;; registers: assumes r4h r5h free ;; mul_24_8: ; Rearrange bytes mov.b r6h,r4h ; move middle byte of three-byte operand to r4h ; Perform bytewise multiply mulxu.b r4l,r5 ; multiply r5l r4h r6l by single-byte operand mulxu.b r4l,r6 ; result split into words in r5 r4 r6 mulxu.b r4h,r4 ; r5 r4 r6 are resp. upper middle lower words ; Combine results ; Add lower byte of middle word into upper byte of lower word ; Add upper byte of middle word into lower byte of upper word, using carry ; Propagate second carry to upper byte of upper word add.b r4l,r6h ; add corresponding bytes addx.b r4h,r5l ; add corresponding bytes, with carry addx.b #0,r5h ; propagate second carry rts