/*
    File:   _fsdiv.c
    Author: Jonathan Dale Kirwan

    Creation Date:  Tue 03-Nov-2009 12:42:28
    Last Modified:  Thu 05-Nov-2009 18:59:06    Modified commentary.
         Modified:  Tue 03-Nov-2009 12:42:28

    Copyright 2009, Jonathan Dale Kirwan, jonk@infinitefactors.org

    This source code module for the library routine ___fsdiv is distributed
    under the terms of the GNU Lesser General Public License (aka LGPL.)

    You should have received a copy of this license as COPYING.LESSER.  But
    if not, it can currently be accessed over the web at:

        http://www.gnu.org/licenses/lgpl.txt

    Or receive a copy by writing to the Free Software Foundation, Inc., the
    current address of which (at this time of writing) is:

        51 Franklin St
        Fifth Floor
        Boston, MA 02110
        USA


    DESCRIPTION

    This routine performs a floating point division for the SDCC compiler,
    for 32-bit float values.  There is no support for denormals at this
    time and this code closely follows prior code in dealing with INF and
    NaN and various other error handling, rather than forge new territory.
    The design was optimized more for speed than space, so it is lengthy.

    Inputs:

        [A,B,DPH,DPL]   32-bit floating-point dividend (in registers)
        [2,3,4,5](SP)   32-bit floating-point divisor (on the stack)
 
    Outputs:
 
        [A,B,DPH,DPL]   32-bit floating-point quotient (return value)

    The routine (like most but not all division routines) also computes
    the remainder value.  Sadly, that information is not returned but is
    instead merely used for rounding purposes and otherwise simply lost.

    This code uses non-restoring division and unrolls its looping somewhat
    -- for two reasons.  One is to minimize the impact of the loop control
    variable.  The other is to allow somewhat faster use of XCH rather
    than a pair of MOV instructions, which also saves a few cycles (and
    code space.)  There is another 'optimization' that is also done.  In
    order to avoid the use of LJMP, fragments of the coroutine sections
    are intermingled so that the conditional jumps can make their leaps,
    satisfactorily.  This may be the most unnerving part of the code to
    read.  For that, I apologize.  I'll do what I can to explain, later.


    RESTORING DIVISION

    Regular division on a micro isn't much different from division done by
    long hand with pencil and paper; except that it is in binary form, so
    that the details may be slightly unfamiliar to some.  A manual method
    uses trial values aligned to the left and subtracted, with more trial
    values to follow that are aligned successively further rightwards.  A
    computerized method isn't much different.
    
    Aside from using binary values, another difference is that computers
    don't know if the trial subtraction will succeed before it's tried.
    So once a mistake is made and the negative result discovered, the
    subtracted value must be added back in before moving again to the
    right by one digit and trying the subtraction once again.

    Non-restoring division is different from the above description because
    the algorithm instead accepts the result from the subtraction, regard-
    less of the outcome.  And simply chooses to do something different on
    the next step, rather than correct it's error first.  On 8-bit micro-
    controollers like the 8051, where the cost of a correcting a mistake
    can be time consuming, it pays to figure out how to move on without
    the need to restore the temporary result's earlier value.

    An example may make this a little clearer.  Imagine dividing a binary
    value of 1011 by 1010 to produce a 4 bit result.  (In decimal form
    this is computing 11.0/10.0 or 1.1.)  In binary, the computed result
    is expressed as 1.000110011001100... and would be rounded up to 1.001
    if only 4 bits were allowed for the result.  To do this in long hand
    form, it looks like:

        1011.00000
       -1010             tentative quotient = 1.
       -----------
           1 0000        drop some zeros down
            -1010        tentative quotient = 1.0001
          --------
              1100       drop one zero down
             -1010       tentative quotient = 1.00011
          --------
                100...               etc.

    Those new, but observant, will notice I shifted the 2nd subtraction
    over a few places before trying to sutract, again.  And in the
    process adding some zeros to the tentative quotient to account for
    that fact.  How did I know to do this?  Well, because I've been
    trained and can apply my experience to tell when the subtraction will
    succeed before attempting it.  Computers are not so fortunate.  They
    has no idea.  So an algorithm on a computer may look more clumsy:


        1011.00000
       -1010             try subtracting and get lucky           (1)
       -----------
           1 0           drop one zero down
        -101 0           try subtracting
        ----------
        -100 0           didn't get lucky, so add it back        (0)
        +101 0           add back to prepare for next attempt
        ----------
           1 00          drop one zero down
         -10 10          try subtracting
         ---------
          -1 10          didn't get lucky, so add it back        (0)
         +10 10          add back to prepare for next attempt
         ---------
           1 000         drop one zero down
          -1 010         try subtracting
          --------
             -10         didn't get lucky, so add it back        (0)
          +1 010         add back to prepare for next attempt
          --------
           1 0000        drop one zero down
            -1010        try subtracting and get lucky           (1)
          --------
              1100       drop one zero down
             -1010       try subtracting and get lucky           (1)
          --------
                100...               etc.

    Note the generated bits, as they occur, along the right side column.

    The computer algorithm works mechanically.  And corrects for mistakes
    by restoring the value before dropping down another bit (binary digit)
    and trying yet another subtraction.  Algorithms that compute quotients
    like this are called restoring division algorithms.  They check the
    arithmetic sign of the result and, if negative, fix it back to the
    original value (restoring from a copy or restoring by addition; either
    works) before proceeding.

    It's important to notice that the 'drop one zero down' step is about
    the same as multiplying the intermediate results by 2.  If the result
    of each step of the trial subtraction is called T[i], then the
    recurrence relations are:

        T[i] = 2*(T[i-1]+D)-D,  if T[i-1] < 0, or,
        T[i] = 2*T[i-1]-D,      if T[i-1] >= 0.

    There's more detail to follow.  But that summarizes restoring division
    in a nutshell.


    NON-RESTORING DIVISION

    Non-restoring division makes one simple adjustment to the recurrence
    just mentioned.  It expands the case where T[i-1] < 0 by distributing
    the factor 2 across the expression and then combining terms.  The
    resulting recurrence should then easily be seen as:

        T[i] = 2*T[i-1]+D,      if T[i-1] < 0, or,
        T[i] = 2*T[i-1]-D,      if T[i-1] >= 0.

    So, it uses both trial subtraction and trial addition instead of just
    trail subtraction alone.  Other than that very small change, it's the
    same concept merely taken to its conclusion.

    The above recurrences make an assumption that I've withheld until now.
    I'll re-insert it now:
    
        T[i] = 2*T[i-1]+D,      where, -D <= T[i-1] < 0,   or,
        T[i] = 2*T[i-1]-D,      where,  0 <= T[i-1] < D

    The additional limits follow easily from careful consideration.  The
    smallest that 2*T[i]-D can be, assuming that T[i-1] >= 0, is -D.  So,
    the lower limit on the first case is exposed.  The largest that 2*
    T[i-1]+D can be, assuming that T[i-1] < 0, is 2*(-1)+D or D-2.  But
    the very first case should be (a point addressed again, shortly):

        0 <= T[1] < D

    Which now justifies the actual range I specified above for all cases.

    By the way, there is a T[0].  But I'll get to that.


    NORMALIZATION

    Floating point numbers pack their mantissa in normalized form.  This
    fact is very helpful for division operations.
    
    Call the dividend N (for numerator) and the divisor D (obvious.)  32-
    bit floating point numbers usually include 23 bits of the mantissa and
    throw away the leading bit (it's assumed to be 1, so why keep it?) as
    a so-called "hidden bit."  Unpacking the floating point mantissa
    restores this bit for a total of 24 bits in the mantissa.  There is an
    assumed radix point (decimal point equivalent) that is just to the
    right of this hidden bit (although that isn't always followed by every
    floating point format and may just as well be considered to the left,
    so long as the exponent is maintained, accordingly.)

    Since both are in normalized form (the leading bit is 1), the range of
    N varies from 1.0 to just under 2.0.  The same is true for D.  So the
    range of the fraction N/D varies from slightly over 0.5 to slighly
    under 2.0.  In other words,

        0.5 < (N/D) < 2.0

    The result will need to also be normalized.  To achieve that easily,
    simply guarantee that the first bit computed will be a '1'.  The way
    to achieve this is to add a special requirement to T[0]:

        D <= T[0] < 2*D

    If that's required at the outset, then subtracting D must meet succeed
    and meet the imposed requirements on T[1].  The full recurrence is:

        T[0] = N                where, D <= N < 2*D,                or,
        T[0] = 2*N              where, D/2 <= N < D

        T[1] = T[0]-D           always -- generates first bit as '1'

        T[i] = 2*T[i-1]+D,      where, -D <= T[i-1] < 0 and i > 1 , or,
        T[i] = 2*T[i-1]-D,      where,  0 <= T[i-1] < D and i > 1

    Doing a check at the beginning guarantees that a 1 bit is generated at
    T[1] and this guarantees that the result will already be normalized
    when finished.  Nice.


    ADDITION, SUBTRACTION AND MULTIPLYING BY 2

    It's time to examine the above recurrences for one more detail.  Just
    how many bits are required for an addition or subtraction operation?
    The range is from -D to (D-1).  This spans a symbol space of 2*D-1.  A
    more detailed examination would show that at each successive step, the
    range reduces slightly, so that it is 2*D+1-2^(i+1).  (This fact won't
    help much until the very end when 2^(i+1) nears the value of D.  Best
    to just ignore that detail.)  The symbol space of 2*D-1 depends on D,
    but since D might possibly occupy all of the binary symbol space (for
    now assume nothing more is known about D), this suggests that the
    operations need to support about one more bit than the size of D.  In
    other words, 25 bits in this case.  However, D's symbol space is
    actually one bit less 24 bits because of the hidden bit, so this
    suggests the arithemtic operation may be 24 bits.  Good thing.

    The multiplication by 2 __before__ performing an arithmetic operation
    yields a slight complication to this idea of a 24-bit computation.
    The result of the 2*T[i-1] might actually require part of another
    bit's symbol space, temporarily, until the operation can complete.
    But this extra bit can be the carry bit.  A grid may help here:

        SUBTRACTION OPERATION
        ====================================================
                          No Borrow            Borrow
                    ,------------------+------------------,
           Carry    |                  | generate '0' and |
            from  0 |   generate '1'   |  transition to   |
            Left    |                  |  addition mode   |
           Shift    +------------------+------------------+
           prior    |                  |                  |
              to  1 |    impossible    |   generate '1'   |
             (-)    |                  |                  |
                    '------------------+------------------'

        ADDITION OPERATION
        ====================================================
                          No Carry           Carry Out
                    ,------------------+------------------,
           Carry    |                  |                  |
            from  0 |    impossible    |   generate '0'   |
            Left    |                  |                  |
           Shift    +------------------+------------------+
           prior    |                  | generate '1' and |
              to  1 |   generate '0'   |  transition to   |
             (+)    |                  | subtraction mode |
                    '------------------+------------------'

    The subtraction cases aren't too hard to follow.  If there is no carry
    (it is zero) as the result of the shift operation, then subtracting
    the divisor from the remainder may cause a borrow.  However, if there
    is a carry (it is one) out of the shift, then there is no possibility
    of a borrow when subtracting the divisor from the remainder.  It is
    always positive.  The table for subtraction clearly illustrates this.

    The addition cases aren't quite as easy to follow for those not used
    to two's complement notation and ALU operations on them, generally.
    In addition mode, there is always an assumed negative sign somewhere.
    This is an outgrowth of the fact that the only way to enter addition
    mode is by way of a subtraction that yielded a negative result, in the
    first place (first row, second column of the subtraction table entry.)
    When a left shift on such a negative number produces a zero carry bit,
    this means that the number is _very_ negative; more so than otherwise.
    As a consequence, any addition to it is guaranteed not to generate a
    positive result -- it must remain negative, but less so.  However, if
    the carry is one, this means that the leading bit was the same as the
    sign bit and that the result before addition is close enough to zero
    that there is a possibility of generating a positive value, adding the
    divisor into it.  The table for subtraction illustrates this, as well.

    Another way to look at the above two paragraphs is to mentally imagine
    prior to the shift operation that the mode, addition or subtraction,
    implies a 25th bit.  In subtraction mode it's zero, representing a
    positive number.  In addition mode it's one, representing a negative
    number.  Namely, the sign bit where the other 24 bits in the result
    represent the rest of the two's complement value.  After the shift
    operation, the sign bit is still implied but the most significant bit
    of the unshifted result is now present in the carry bit.

    It may be easier to understand the earlier subtraction/addition table
    by examining this one:

        SUBTRACTION OPERATION
        ==============================================================
          Implied  Carry after
            Sign      Shift     Effective Result After Shift (26 bits)
            ----------------------------------------------------------
              0         0           00 xxxxxxxx xxxxxxxx xxxxxxx0
              0         1           01 xxxxxxxx xxxxxxxx xxxxxxx0

        ADDITION OPERATION
        ==========================================================
          Implied  Carry after
            Sign      Shift     Effective Result After Shift (26 bits)
            ----------------------------------------------------------
              1         1           11 xxxxxxxx xxxxxxxx xxxxxxx0
              1         0           10 xxxxxxxx xxxxxxxx xxxxxxx0

    Looking closely at the effective results shown above, it is a little
    easier to see the earlier outlined situations.  Addition of the
    divisor, in the case where the upper two bits are "11," may certainly
    yield a positive outcome, where both those bits become "00" after the
    addition.  This necessitates transitioning to the other mode,
    subtraction.  But if the upper two bits are "10" the no possible
    divisor can cause the sign bit to change.  (The '0' in the carry
    absorbs any carry out of the other 24 bits and the result remains
    negative, regardless.)

    In short, if subtraction would cause the leading "00" to become "11",
    then the result after subtraction is negative; if addition would cause
    the leading "11" to become "00," then the result after addition is
    positive.

    These are the theoretically sound arguments showing why it is safe to
    proceed, with caveats noted, and use what otherwise appear to be 24-
    bit operations.


    DESIGN

    In addition to using non-restoring division, I chose to use a number
    of techniques to make the code faster.  For example, instead of using
    a register-order preserving MOV instructions as in:

          .
          .
        MOV a, R0
        ADD a, R4
        MOV R0, a
        MOV a, R1
        ADD a, R5
          .
          .

    I chose instead not to preserve register order, as in:

          .
          .
        MOV a, R0
        ADD a, R4
        XCH R1, a
        ADD a, R5
          .
          .

    To keep track of the details, I provide a running commentary to show
    what the constantly changing scenario is.  It saves instruction space
    and improves execution time.  So I deemed it worthwhile, despite the
    mental struggle it may sometimes present to humans.  A specialized
    compiler tool could make this transition easier to stomach, but I
    didn't bother writing one for this purpose.  Live with it.

    Because of this re-ordering of registers, I chose to unroll the loop
    by 4.  This was exactly enough to bring the register order back into
    line, again.  So it's all that was needed.  Unrolling like this also
    saves a few DJNZ tests, which reduces the cycle count slightly more.

    Once the unrolling was decided, there is another problem.  The range
    of T[0] must be tested and T[1] computed to produce the first bit.
    That means we need 23 more bits, not 24.  But unrolled by 4, only
    multiples of 4 bits at a time can be handled.  To fix this, the code
    jumps into the middle of the first iteration, skipping over the first
    bit of the first loop iteration.  That way, it works fine.

    But since the bit generation also re-orders registers, this requires
    that the initial order meet the assumptions of that step in the loop.
    So the unpacking of the dividend is a little unusual and takes into
    account also the reordering of registers done in generating the first
    bit.

    The resulting division code loop is best examined as paired coroutines
    shown below:
 
          start
            |       ,-------,                                ,-------,
       ,--------,   |       |                                |       |
       |firstbit|   |       v                                v       |
       '--------'   |   ,-------,                        ,-------,   |
            |       |   |rlc/sub|                        |rla/add|   |
            v       |   '-------'----------,  ,----------'-------'   |
            '------ | ----> |      borrow   \/   carry       |       |
                    |       v               /\               v       |
                    |   ,-------, <--------'  '--------> ,-------,   |
                    |   |rlc/sub|                        |rla/add|   |
                    |   '-------'----------,  ,----------'-------'   |
                    |       |      borrow   \/   carry       |       |
                    |       v               /\               v       |
                    |   ,-------, <--------'  '--------> ,-------,   |
                    |   |rlc/sub|                        |rla/add|   |
                    |   '-------'----------,  ,----------'-------'   |
                    |       |      borrow   \/   carry       |       |
                    |       v               /\               v       |
                    |   ,-------, <--------'  '--------> ,-------,   |
                    |   |rlc/sub|                        |rla/add|   |
                    |   '-------'----------,  ,----------'-------'   |
                    |       |      borrow   \/   carry       |       |
                    |       v               /\               v       |
                    |   ,-------, <--------'  '--------> ,-------,   |
                    '<--| DJNZ  |                        | DJNZ  |-->'
                   NZ   '-------'                        '-------'  NZ
                            |  Z                          Z  |
                            v                                v
                        ,-------,                        ,-------,
                        |round 1|                        |round 0|
                        '-------'                        '-------'
                            |                                |
                            v                                v
                            '-------------> end <------------'

    The "borrow" and "carry" phrases above are meant in the precise sense
    shown in the earlier SUBTRACTION/ADDITION table -- the specific boxes
    that talk about making a transition is where these meanings arrive.
    So it's not the simple case of examining a single bit from a single
    operation.  Also, the 'rla' is meant in the same way an 'rlc with
    carry=0' would be meant.  (I read it as "arithmetic left shift.")
    And "firstbit" means the establishment of T[0] and the computation of
    T[1].  Rounding is performed differently, depending on which coroutine
    was active when the DJNZ loop exits.  The above diagram illustrates
    this, but the code provides the exact details.

    The shifting operation must also be performed on the quotient value
    that is being generated.  So the 'rla' and 'rlc' actually apply to
    both the quotient and T[i].  Since T[i] needs a zero shifted in from
    the right and since the quotient starts out with zeros on the left
    side, the arrangement of "gluing" things so that the quotient is to
    the right of T[i] and performing a 48-bit shift achieves both things
    as one longer operation.  Which is how the code does things.

    Finally, the cases in the SUBTRACTION/ADDITION table can be observed
    at different points in the code.  After the shift operation, the carry
    can be examined _before_ the operation is performed and this can then
    be used to slightly hasten the remaining steps.  The code breaks out
    two different operation fragments for subtraction and addition at each
    step and this fact is sorely evident in the code.  If needed, just
    refer back to the table to think more about why I do what I do in the
    code.

    With the above exposed, another optimization was also done.  Unrolling
    the loop by 4 and the need to bounce back and forth between the two
    coroutines caused some extended jumps for some conditional carry test
    instructions.  To mitigate this effect, I intermingled the various
    steps so that these conditional jumps were entirely within their range
    of extent.  This makes the coroutines even more difficult to follow,
    but saves cycles and reduces code size.  Another pain to live with.


    MODIFICATIONS

    Thu 05-Nov-2009 18:59:06    jdk
        Made some modifications to the module's commentary section to
        remove some of the folksy speech I used, correct some minor
        errors, and slightly improve the readability.

    Tue 03-Nov-2009 12:42:28    jdk
        Initial version released under the LGPL.
*/


/*  ----------------------------------------------------------------------  */
static void dummy( void ) __naked {
/*  ----------------------------------------------------------------------
*/
    __asm
            .globl  ___fsdiv
___fsdiv:

    /*  ------------------------------------------------------------------
        Unpack the function parameters -- the dividend and divisor.  The
        dividend is passed into the function in [A,B,DPH,DPL] and the
        divisor is passed into the function on the RAM stack.  This latter
        fact about the divisor is not changed by various compiler options.
        ------------------------------------------------------------------
    */
        /*  ----- Unpack Dividend -----------------------------------
            The dividend is passed in [A,B,DPH,DPL].  Unpack it.
            ---------------------------------------------------------
        */
            mov     r5, dpl     /* least sig. byte of dividend mantissa */
            mov     r0, dph     /* middle sig. byte of dividend mantissa */
            mov     c, b.7      /* copy out lsb of exponent to carry */
            rlc     a           /* ACC = exponent and C = sign */
            mov     f0, c       /* temporary save of dividend sign */
            jz      00001$      /* re-assert mantissa hidden bit? */
            setb    b.7         /*   yes -- assert it for normals */
    00001$:                     /*      or don't for denormals */
            mov     r4, b       /* highest sig. byte of dividend mantissa */
            mov     r3, a       /* temporary save of dividend exponent */

        /*  -- Dividend mantissa=[R4,R0,R5], exponent=R3, sign=F0 -- */

        /*  ----- Unpack Divisor ------------------------------------
            The divisor is passed on the stack.  Unpack it, as well.
            ---------------------------------------------------------
        */
            mov     a, sp       /* get copy of the stack pointer and back it */
            add     a, #-5      /*   before the return address and parameter */
            mov     r1, a       /* R1 is the temporary RAM pointer to use */
            mov     a, @r1      /* fetch the least sig. byte of divisor */
            mov     r6, a       /*   mantissa and save it */
            inc     r1
            mov     a, @r1      /* fetch the middle sig. byte of divisor */
            mov     r7, a       /*   mantissa and save it */
            inc     r1
            mov     b, @r1      /* fetch the highest sig. byte of divisor */
            inc     r1          /*   (and the lsb of the exponent, too) */
            mov     a, @r1      /* fetch the rest of the exponent and sign */
            mov     c, b.7      /* copy out lsb of exponent to carry */
            rlc     a           /* ACC = exponent and C = sign */
            jnb     f0, 00002$  /* sign of the result is the XOR of the sign */
            cpl     c           /*   of the dividend and divisor */
    00002$: mov     f0, c       /* save the resulting sign */
            jz      00003$      /* re-assert mantissa hidden bit? */
            setb    b.7         /*   yes -- assert it for normals */
    00003$:                     /*      or don't for denormals */
            mov     r1, a       /* temporary save of dividend exponent */

        /*  -- Dividend mantissa=[R4,R0,R5], divisor mantissa=[B,R7,R6] -- */
        /*  -- Dividend exponent in R3 and divisor exponent in R1 -- */
        /*  -- Result sign in F0 -- */

    /*  ------------------------------------------------------------------
        Handle Denormals and other special cases.  This code follows the
        existing library behavior, so sense isn't made of it -- the path
        is merely followed.  (No sense deviating from The Path.)
        ------------------------------------------------------------------
    */
            clr     a
            cjne    a, b, 00005$
            cjne    r4, #0, 00004$
            ljmp    fs_return_nan   /* 0/0 yields a NaN (unsigned?) */
    00004$: ljmp    fs_return_inf   /* #/0 yields INF (unsigned?) */
    00005$: cjne    r4, #0, 00006$
            ljmp    fs_return_zero  /* 0/# yields ZERO */
    00006$: dec     a               /* ACC = 0xFF */
            cjne    a, 1, 00008$    /* R1 */
            cjne    a, 3, 00007$    /* R3 */
            ljmp    fs_return_nan   /* INF/INF yields NaN */
    00007$: ljmp    fs_return_zero  /* #/INF yields ZERO */
    00008$: cjne    a, 3, 00009$    /* R3 */
            ljmp    fs_return_inf   /* INF/# yields INF */
    00009$:
    /*  ------------------------------------------------------------------
        Compute tentative exponent.of the result.  Again, the code follows
        the existing library routine behavior here in order to avoid
        introducing new behaviors.
        ------------------------------------------------------------------
    */
            mov     a, r3           /* get dividend exponent and bias */
            clr     c
            subb    a, r1           /* subtract divisor exponent and bias */
            jnc     00010$
            add     a, #127
            jc      00011$
            ljmp    fs_return_zero  /* underflow to denormal */
    00010$: add     a, #128
            dec     a
            jnc     00011$
            ljmp    fs_return_inf   /* overflow into INF */
    00011$: mov     dph, a          /* save tentative exponent into DPH */

    /*  ------------------------------------------------------------------
        Prepare for the main division loop.  In this part, the quotient is
        initialized and a division check is made to guarantee that the
        first quotient bit will be a 1 (for a normalized result and proper
        division computation.)  The exponent is adjusted by that process.
        ------------------------------------------------------------------
    */
        /*  ----- Initialize the quotient ---------------------------
            The quotient is initialized here to 0, but a "1" will be
            added, first thing once the loop starts.  So make it zero
            here.  This operation now also destroys prior exponents
            for both dividend and divisor as a side-effect.
            ---------------------------------------------------------
    */
            clr     a
            mov     r3, a
            mov     r2, a
            mov     r1, a       /* quotient= [R3,R2,R1] */

        /*  -- Dividend mantissa=[R4,R0,R5], divisor mantissa=[B,R7,R6] -- */
        /*  -- Result exponent in DPH and result sign in F0 -- */
        /*  -- Result (quotient) mantissa in [R3,R2,R1] -- */

        /*  ----- First Trial Subtraction ---------------------------
            If dividend >= divisor, then the trial subtraction is the
            right partial result and we are done.  If not, we need to
            adjust it.
            ---------------------------------------------------------
        */
        /*  -- Compute dividend and divisor difference -- */

            clr     c
            mov     a, r5       /* [R4,R0,R5]    - [B,R7,R6] */
            subb    a, r6       /* [R4,R0-C,ACC] - [B,R7,0] */
            xch     a, r0       /* [R4,ACC-C,R0] - [B,R7,0] */
            subb    a, r7       /* [R4-C,ACC,R0] - [B,0,0]  */
            xch     a, r4       /* [ACC-C,R4,R0] - [B,0,0]  */
            subb    a, b        /* [ACC,R4,R0]       ... and carry out */
            jnc     00012$

        /*  -- Fixup the partial result, if dividend < divisor -- */

            clr     c
            xch     a, r0       /* [R0,R4,ACC,0] */
            rlc     a           /* [R0,R4,C,ACC] */
            xch     a, r4       /* [R0,ACC,C,R4] */
            rlc     a           /* [R0,C,ACC,R4] */
            xch     a, r0       /* [ACC,C,R0,R4] */
            rlc     a           /* [ACC,R0,R4]      ... and toss C */
            xch     a, r4       /* [R4,R0,ACC]   + [B,R7,R6] */
            add     a, r6       /* [R4,R0+C,ACC] + [B,R7,0] */
            xch     a, r0       /* [R4,ACC+C,R0] + [B,R7,0] */
            addc    a, r7       /* [R4+C,ACC,R0] + [B,0,0]  */
            xch     a, r4       /* [ACC+C,R4,R0] + [B,0,0]  */
            addc    a, b        /* [ACC,R4,R0], and carry out */
            dec     dph         /* presumption wrong, adjust exponent back */
    00012$:
    /*  ------------------------------------------------------------------
        Non-restoring division uses two coroutines: one for the case where
        a '1' is generated for the quotient and another for the case where
        a '0' is generated.  These coroutines follow.  It is always true
        that the first bit is a '1', so the code "falls" into the first
        coroutine as a result of this explicit fact.
        ------------------------------------------------------------------
    */
        /*  -- Dividend mantissa=[ACC,R4,R0], divisor mantissa=[B,R7,R6] -- */
        /*  -- Result (quotient) mantissa in [R3,R2,R1] -- */

        /*  ----- Loop Start ----------------------------------------
            Initialize the loop counter.  Each coroutine generates 4
            bits.  But since the first bit is already known to be 1
            from the above code, we really need only generate 23 more
            bits (not 24.)  Since 6*4 is 24, the counter must be set
            to 6, but the first step of the first loop needs to be
            skipped.  This code does both things.
            ---------------------------------------------------------
        */
            mov     dpl, #6
            sjmp    00014$

        /*  ----- Bit Generating Coroutines -------------------------
            The following unrolled-by-4 coroutines are intermingled
            per the text description.
            ---------------------------------------------------------
        */

        /*  -- S0B: Trial subtraction; 24-bit partial minus divisor. -- */

    00013$: clr     c
            xch     a, r4       /* [R4,R0,ACC] - [B,R7,R6] */
            subb    a, r6       /* [R4,R0-C,ACC] - [B,R7,0] */
            xch     a, r0       /* [R4,ACC-C,R0] - [B,R7,0] */
            subb    a, r7       /* [R4-C,ACC,R0] - [B,0,0] */
            xch     a, r4       /* [ACC-C,R4,R0] - [B,0,0] */
            subb    a, b        /* [C,ACC,R4,R0] */
            jc      00016$

        /*  -- S0C: Insert "1" into quotient. -- */

    00014$: inc     r1          /* quotient gets "1" bit */

        /*  -- S1A: left shift partial+quotient, [C,ACC,R4,R0,R3,R2,R1,0] -- */

            clr     c
            xch     a, r1       /* [R1,R4,R0,R3,R2,ACC,0] */
            rlc     a           /* [R1,R4,R0,R3,R2,C,ACC] */
            xch     a, r2       /* [R1,R4,R0,R3,ACC,C,R2] */
            rlc     a           /* [R1,R4,R0,R3,C,ACC,R2] */
            xch     a, r3       /* [R1,R4,R0,ACC,C,R3,R2] */
            rlc     a           /* [R1,R4,R0,C,ACC,R3,R2] */
            xch     a, r0       /* [R1,R4,ACC,C,R0,R3,R2] */
            rlc     a           /* [R1,R4,C,ACC,R0,R3,R2] */
            xch     a, r4       /* [R1,ACC,C,R4,R0,R3,R2] */
            rlc     a           /* [R1,C,ACC,R4,R0,R3,R2] */
            xch     a, r1       /* [ACC,C,R1,R4,R0,R3,R2] */
            rlc     a           /* [C,ACC,R1,R4,R0,R3,R2] */
            jnc     00017$

        /*  -- Subtraction; 24-bit partial minus 24-bit divisor. -- */

            clr     c
            xch     a, r4       /* [R4,R1,ACC] - [B,R7,R6] */
            subb    a, r6       /* [R4,R1-C,ACC] - [B,R7,0] */
            xch     a, r1       /* [R4,ACC-C,R1] - [B,R7,0] */
            subb    a, r7       /* [R4-C,ACC,R1] - [B,0,0] */
            xch     a, r4       /* [ACC-C,R4,R1] - [B,0,0] */
            subb    a, b        /* [C,ACC,R4,R1] */
            sjmp    00018$

        /*  -- A0B: Trial addition; 24-bit partial plus 24-bit divisor. -- */

    00015$: xch     a, r4       /* [R4,R0,ACC] + [B,R7,R6] */
            add     a, r6       /* [R4,R0+C,ACC] + [B,R7,0] */
            xch     a, r0       /* [R4,ACC+C,R0] + [B,R7,0] */
            addc    a, r7       /* [R4+C,ACC,R0] + [B,0,0] */
            xch     a, r4       /* [ACC+C,R4,R0] + [B,0,0] */
            addc    a, b        /* [C,ACC,R4,R0] */
            jc     00014$

        /*  -- A0C: Insert "0" into quotient by doing nothing at all. -- */

    00016$: 

        /*  -- A1A: left shift partial+quotient, [C,ACC,R4,R0,R3,R2,R1,0] -- */

            clr     c
            xch     a, r1       /* [R1,R4,R0,R3,R2,ACC,0] */
            rlc     a           /* [R1,R4,R0,R3,R2,C,ACC] */
            xch     a, r2       /* [R1,R4,R0,R3,ACC,C,R2] */
            rlc     a           /* [R1,R4,R0,R3,C,ACC,R2] */
            xch     a, r3       /* [R1,R4,R0,ACC,C,R3,R2] */
            rlc     a           /* [R1,R4,R0,C,ACC,R3,R2] */
            xch     a, r0       /* [R1,R4,ACC,C,R0,R3,R2] */
            rlc     a           /* [R1,R4,C,ACC,R0,R3,R2] */
            xch     a, r4       /* [R1,ACC,C,R4,R0,R3,R2] */
            rlc     a           /* [R1,C,ACC,R4,R0,R3,R2] */
            xch     a, r1       /* [ACC,C,R1,R4,R0,R3,R2] */
            rlc     a           /* [C,ACC,R1,R4,R0,R3,R2] */
            jc      00021$

        /*  -- Addition; 24-bit partial plus 24-bit divisor. -- */

            xch     a, r4       /* [R4,R1,ACC] + [B,R7,R6] */
            add     a, r6       /* [R4,R1+C,ACC] + [B,R7,0] */
            xch     a, r1       /* [R4,ACC+C,R1] + [B,R7,0] */
            addc    a, r7       /* [R4+C,ACC,R1] + [B,0,0] */
            xch     a, r4       /* [ACC+C,R4,R1] + [B,0,0] */
            addc    a, b        /* [C,ACC,R4,R1] */
            sjmp    00022$

        /*  -- S1B: Trial subtraction; 24-bit partial minus divisor. -- */

    00017$: clr     c
            xch     a, r4       /* [R4,R1,ACC] - [B,R7,R6] */
            subb    a, r6       /* [R4,R1-C,ACC] - [B,R7,0] */
            xch     a, r1       /* [R4,ACC-C,R1] - [B,R7,0] */
            subb    a, r7       /* [R4-C,ACC,R1] - [B,0,0] */
            xch     a, r4       /* [ACC-C,R4,R1] - [B,0,0] */
            subb    a, b        /* [C,ACC,R4,R1] */
            jc      00022$

        /*  -- S1C: Insert "1" into quotient. -- */

    00018$: inc     r2          /* quotient gets "1" bit */

        /*  -- S2A: left shift partial+quotient, [C,ACC,R4,R1,R0,R3,R2,0] -- */

            clr     c
            xch     a, r2       /* [R2,R4,R1,R0,R3,ACC,0] */
            rlc     a           /* [R2,R4,R1,R0,R3,C,ACC] */
            xch     a, r3       /* [R2,R4,R1,R0,ACC,C,R3] */
            rlc     a           /* [R2,R4,R1,R0,C,ACC,R3] */
            xch     a, r0       /* [R2,R4,R1,ACC,C,R0,R3] */
            rlc     a           /* [R2,R4,R1,C,ACC,R0,R3] */
            xch     a, r1       /* [R2,R4,ACC,C,R1,R0,R3] */
            rlc     a           /* [R2,R4,C,ACC,R1,R0,R3] */
            xch     a, r4       /* [R2,ACC,C,R4,R1,R0,R3] */
            rlc     a           /* [R2,C,ACC,R4,R1,R0,R3] */
            xch     a, r2       /* [ACC,C,R2,R4,R1,R0,R3] */
            rlc     a           /* [C,ACC,R2,R4,R1,R0,R3] */
            jnc     00023$

        /*  -- Subtraction; 24-bit partial minus 24-bit divisor. -- */

            clr     c
            xch     a, r4       /* [R4,R2,ACC] - [B,R7,R6] */
            subb    a, r6       /* [R4,R2-C,ACC] - [B,R7,0] */
            xch     a, r2       /* [R4,ACC-C,R2] - [B,R7,0] */
            subb    a, r7       /* [R4-C,ACC,R2] - [B,0,0] */
            xch     a, r4       /* [ACC-C,R4,R2] - [B,0,0] */
            subb    a, b        /* [C=1,ACC,R4,R2] */
            sjmp    00024$

        /*  -- S0A: left shift partial+quotient, [C,ACC,R4,R3,R2,R1,R0,0] -- */

    00019$: clr     c
            xch     a, r0       /* [R0,R4,R3,R2,R1,ACC,0] */
            rlc     a           /* [R0,R4,R3,R2,R1,C,ACC] */
            xch     a, r1       /* [R0,R4,R3,R2,ACC,C,R1] */
            rlc     a           /* [R0,R4,R3,R2,C,ACC,R1] */
            xch     a, r2       /* [R0,R4,R3,ACC,C,R2,R1] */
            rlc     a           /* [R0,R4,R3,C,ACC,R2,R1] */
            xch     a, r3       /* [R0,R4,ACC,C,R3,R2,R1] */
            rlc     a           /* [R0,R4,C,ACC,R3,R2,R1] */
            xch     a, r4       /* [R0,ACC,C,R4,R3,R2,R1] */
            rlc     a           /* [R0,C,ACC,R4,R3,R2,R1] */
            xch     a, r0       /* [ACC,C,R0,R4,R3,R2,R1] */
            rlc     a           /* [C,ACC,R0,R4,R3,R2,R1] */
            jnc     00013$

        /*  -- Subtraction; 24-bit partial minus 24-bit divisor. -- */

            clr     c
            xch     a, r4       /* [R4,R0,ACC] - [B,R7,R6] */
            subb    a, r6       /* [R4,R0-C,ACC] - [B,R7,0] */
            xch     a, r0       /* [R4,ACC-C,R0] - [B,R7,0] */
            subb    a, r7       /* [R4-C,ACC,R0] - [B,0,0] */
            xch     a, r4       /* [ACC-C,R4,R0] - [B,0,0] */
            subb    a, b        /* [C=1,ACC,R4,R0] */
            sjmp    00014$

        /*  -- A0A: left shift partial+quotient, [C,ACC,R4,R3,R2,R1,R0,0] -- */

    00020$: clr     c
            xch     a, r0       /* [R0,R4,R3,R2,R1,ACC,C=0] */
            rlc     a           /* [R0,R4,R3,R2,R1,C,ACC] */
            xch     a, r1       /* [R0,R4,R3,R2,ACC,C,R1] */
            rlc     a           /* [R0,R4,R3,R2,C,ACC,R1] */
            xch     a, r2       /* [R0,R4,R3,ACC,C,R2,R1] */
            rlc     a           /* [R0,R4,R3,C,ACC,R2,R1] */
            xch     a, r3       /* [R0,R4,ACC,C,R3,R2,R1] */
            rlc     a           /* [R0,R4,C,ACC,R3,R2,R1] */
            xch     a, r4       /* [R0,ACC,C,R4,R3,R2,R1] */
            rlc     a           /* [R0,C,ACC,R4,R3,R2,R1] */
            xch     a, r0       /* [ACC,C,R0,R4,R3,R2,R1] */
            rlc     a           /* [C,ACC,R0,R4,R3,R2,R1] */
            jc      00015$

        /*  -- Addition; 24-bit partial plus 24-bit divisor. -- */

            xch     a, r4       /* [R4,R0,ACC] + [B,R7,R6] */
            add     a, r6       /* [R4,R0+C,ACC] + [B,R7,0] */
            xch     a, r0       /* [R4,ACC+C,R0] + [B,R7,0] */
            addc    a, r7       /* [R4+C,ACC,R0] + [B,0,0] */
            xch     a, r4       /* [ACC+C,R4,R0] + [B,0,0] */
            addc    a, b        /* [C=1,ACC,R4,R0] */
            sjmp    00016$

        /*  -- A:B: Trial addition; 24-bit partial plus 24-bit divisor. -- */

    00021$: xch     a, r4       /* [R4,R1,ACC] + [B,R7,R6] */
            add     a, r6       /* [R4,R1+C,ACC] + [B,R7,0] */
            xch     a, r1       /* [R4,ACC+C,R1] + [B,R7,0] */
            addc    a, r7       /* [R4+C,ACC,R1] + [B,0,0] */
            xch     a, r4       /* [ACC+C,R4,R1] + [B,0,0] */
            addc    a, b        /* [C,ACC,R4,R1] */
            jc      00018$

        /*  -- A1C: Insert "0" into quotient by doing nothing at all. -- */

    00022$:

        /*  -- A2A: left shift partial+quotient, [C,ACC,R4,R1,R0,R3,R2,0] -- */

            clr     c
            xch     a, r2       /* [R2,R4,R1,R0,R3,ACC,0] */
            rlc     a           /* [R2,R4,R1,R0,R3,C,ACC] */
            xch     a, r3       /* [R2,R4,R1,R0,ACC,C,R3] */
            rlc     a           /* [R2,R4,R1,R0,C,ACC,R3] */
            xch     a, r0       /* [R2,R4,R1,ACC,C,R0,R3] */
            rlc     a           /* [R2,R4,R1,C,ACC,R0,R3] */
            xch     a, r1       /* [R2,R4,ACC,C,R1,R0,R3] */
            rlc     a           /* [R2,R4,C,ACC,R1,R0,R3] */
            xch     a, r4       /* [R2,ACC,C,R4,R1,R0,R3] */
            rlc     a           /* [R2,C,ACC,R4,R1,R0,R3] */
            xch     a, r2       /* [ACC,C,R2,R4,R1,R0,R3] */
            rlc     a           /* [C,ACC,R2,R4,R1,R0,R3] */
            jc      00025$

        /*  -- Addition; 24-bit partial plus 24-bit divisor. -- */

            xch     a, r4       /* [R4,R2,ACC] + [B,R7,R6] */
            add     a, r6       /* [R4,R2+C,ACC] + [B,R7,0] */
            xch     a, r2       /* [R4,ACC+C,R2] + [B,R7,0] */
            addc    a, r7       /* [R4+C,ACC,R2] + [B,0,0] */
            xch     a, r4       /* [ACC+C,R4,R2] + [B,0,0] */
            addc    a, b        /* [C=1,ACC,R4,R2] */
            sjmp    00026$

        /*  -- S2B: Trial subtraction; 24-bit partial minus divisor. -- */

    00023$: clr     c
            xch     a, r4       /* [R4,R2,ACC] - [B,R7,R6] */
            subb    a, r6       /* [R4,R2-C,ACC] - [B,R7,0] */
            xch     a, r2       /* [R4,ACC-C,R2] - [B,R7,0] */
            subb    a, r7       /* [R4-C,ACC,R2] - [B,0,0] */
            xch     a, r4       /* [ACC-C,R4,R2] - [B,0,0] */
            subb    a, b        /* [C,ACC,R4,R2] */
            jc      00026$

        /*  -- S2C: Insert "1" into quotient. -- */

    00024$: inc     r3          /* quotient gets "1" bit */

        /*  -- S3A: left shift partial+quotient, [C,ACC,R4,R2,R1,R0,R3,0] -- */

            clr     c
            xch     a, r3       /* [R3,R4,R2,R1,R0,ACC,0] */
            rlc     a           /* [R3,R4,R2,R1,R0,C,ACC] */
            xch     a, r0       /* [R3,R4,R2,R1,ACC,C,R0] */
            rlc     a           /* [R3,R4,R2,R1,C,ACC,R0] */
            xch     a, r1       /* [R3,R4,R2,ACC,C,R1,R0] */
            rlc     a           /* [R3,R4,R2,C,ACC,R1,R0] */
            xch     a, r2       /* [R3,R4,ACC,C,R2,R1,R0] */
            rlc     a           /* [R3,R4,C,ACC,R2,R1,R0] */
            xch     a, r4       /* [R3,ACC,C,R4,R2,R1,R0] */
            rlc     a           /* [R3,C,ACC,R4,R2,R1,R0] */
            xch     a, r3       /* [ACC,C,R3,R4,R2,R1,R0] */
            rlc     a           /* [C,ACC,R3,R4,R2,R1,R0] */
            jnc     00027$

        /*  -- Subtraction; 24-bit partial minus 24-bit divisor. -- */

            clr     c
            xch     a, r4       /* [R4,R3,ACC] - [B,R7,R6] */
            subb    a, r6       /* [R4,R3-C,ACC] - [B,R7,0] */
            xch     a, r3       /* [R4,ACC-C,R3] - [B,R7,0] */
            subb    a, r7       /* [R4-C,ACC,R3] - [B,0,0] */
            xch     a, r4       /* [ACC-C,R4,R3] - [B,0,0] */
            subb    a, b        /* [C,ACC,R4,R3] */
            clr     c
            sjmp    00028$

        /*  -- A2B: Trial addition; 24-bit partial plus 24-bit divisor. -- */

    00025$: xch     a, r4       /* [R4,R2,ACC] + [B,R7,R6] */
            add     a, r6       /* [R4,R2+C,ACC] + [B,R7,0] */
            xch     a, r2       /* [R4,ACC+C,R2] + [B,R7,0] */
            addc    a, r7       /* [R4+C,ACC,R2] + [B,0,0] */
            xch     a, r4       /* [ACC+C,R4,R2] + [B,0,0] */
            addc    a, b        /* [C,ACC,R4,R2] */
            jc      00024$

        /*  -- A2C: Insert "0" into quotient by doing nothing at all. -- */

    00026$:

        /*  -- A3A: left shift partial+quotient, [C,ACC,R4,R2,R1,R0,R3,0] -- */

            clr     c
            xch     a, r3       /* [R3,R4,R2,R1,R0,ACC,0] */
            rlc     a           /* [R3,R4,R2,R1,R0,C,ACC] */
            xch     a, r0       /* [R3,R4,R2,R1,ACC,C,R0] */
            rlc     a           /* [R3,R4,R2,R1,C,ACC,R0] */
            xch     a, r1       /* [R3,R4,R2,ACC,C,R1,R0] */
            rlc     a           /* [R3,R4,R2,C,ACC,R1,R0] */
            xch     a, r2       /* [R3,R4,ACC,C,R2,R1,R0] */
            rlc     a           /* [R3,R4,C,ACC,R2,R1,R0] */
            xch     a, r4       /* [R3,ACC,C,R4,R2,R1,R0] */
            rlc     a           /* [R3,C,ACC,R4,R2,R1,R0] */
            xch     a, r3       /* [ACC,C,R3,R4,R2,R1,R0] */
            rlc     a           /* [C,ACC,R3,R4,R2,R1,R0] */
            jc      00031$

        /*  -- Addition; 24-bit partial plus 24-bit divisor. -- */

            xch     a, r4       /* [R4,R3,ACC] + [B,R7,R6] */
            add     a, r6       /* [R4,R3+C,ACC] + [B,R7,0] */
            xch     a, r3       /* [R4,ACC+C,R3] + [B,R7,0] */
            addc    a, r7       /* [R4+C,ACC,R3] + [B,0,0] */
            xch     a, r4       /* [ACC+C,R4,R3] + [B,0,0] */
            addc    a, b        /* [C,ACC,R4,R3] */
            sjmp    00032$

        /*  -- S3B: Trial subtraction; 24-bit partial minus divisor. -- */

    00027$: clr     c
            xch     a, r4       /* [R4,R3,ACC] - [B,R7,R6] */
            subb    a, r6       /* [R4,R3-C,ACC] - [B,R7,0] */
            xch     a, r3       /* [R4,ACC-C,R3] - [B,R7,0] */
            subb    a, r7       /* [R4-C,ACC,R3] - [B,0,0] */
            xch     a, r4       /* [ACC-C,R4,R3] - [B,0,0] */
            subb    a, b        /* [C,ACC,R4,R3] */
            jc      00032$

        /*  -- S3C: Insert "1" into quotient. -- */

    00028$: inc     r0          /* quotient gets "1" bit */

        /*  -- Next Iteration of Quotient Bit "1" Coroutine -- */

            djnz    dpl, 00029$

        /*  -- Final rounding step where "1" was last bit generated -- */

            clr     c
            xch     a, r3       /* [R3,R4,ACC,0] */
            rlc     a           /* [R3,R4,C,ACC] */
            xch     a, r4       /* [R3,ACC,C,R4] */
            rlc     a           /* [R3,C,ACC,R4] */
            xch     a, r3       /* [ACC,C,R3,R4] */
            rlc     a           /* [C,ACC,R3,R4] */
            jc      00033$
            clr     c
            xch     a, r4       /* [R4,R3,ACC] - [B,R7,R6] */
            subb    a, r6       /* [R4,R3-C,ACC] - [B,R7,0] */
            xch     a, r3       /* [R4,ACC-C,R3] - [B,R7,0] */
            subb    a, r7       /* [R4-C,ACC,R3] - [B,0,0] */
            xch     a, r4       /* [ACC-C,R4,R3] - [B,0,0] */
            subb    a, b        /* [C,ACC,R4,R3] */
            cpl     c
            sjmp    00033$

        /*  -- DJNZ extenders.  Oh, well.  It's a big loop! -- */

    00029$: ljmp    00019$
    00030$: ljmp    00020$

        /*  -- A3B: Trial addition; 24-bit partial plus 24-bit divisor. -- */

    00031$: xch     a, r4       /* [R4,R3,ACC] + [B,R7,R6] */
            add     a, r6       /* [R4,R3+C,ACC] + [B,R7,0] */
            xch     a, r3       /* [R4,ACC+C,R3] + [B,R7,0] */
            addc    a, r7       /* [R4+C,ACC,R3] + [B,0,0] */
            xch     a, r4       /* [ACC+C,R4,R3] + [B,0,0] */
            addc    a, b        /* [C,ACC,R4,R3] */
            jc      00028$

        /*  -- A3C: Insert "0" into quotient by doing nothing at all. -- */

    00032$:

        /*  -- Next Iteration of Quotient Bit "0" Coroutine -- */

            djnz    dpl, 00030$

        /*  -- Final rounding step where "0" was last bit generated -- */

            clr     c
            xch     a, r3       /* [R3,R4,ACC,0] */
            rlc     a           /* [R3,R4,C,ACC] */
            xch     a, r4       /* [R3,ACC,C,R4] */
            rlc     a           /* [R3,C,ACC,R4] */
            xch     a, r3       /* [ACC,C,R3,R4] */
            rlc     a           /* [C,ACC,R3,R4] */
            jnc     00033$
            xch     a, r4       /* [R4,R3,ACC] + [B,R7,R6] */
            add     a, r6       /* [R4,R3+C,ACC] + [B,R7,0] */
            xch     a, r3       /* [R4,ACC+C,R3] + [B,R7,0] */
            addc    a, r7       /* [R4+C,ACC,R3] + [B,0,0] */
            xch     a, r4       /* [ACC+C,R4,R3] + [B,0,0] */
            addc    a, b        /* [C,ACC,R4,R3] */
    00033$:

    /*  ------------------------------------------------------------------
        Now round the quotient according to the value of C.  In this rare
        case, unlike the existing library routines, this code checks the
        resulting exponent for underflow and overflow (while still not
        embracing the idea of denormals.)
        ------------------------------------------------------------------
    */
            clr     a
            mov     r5, a
            addc    a, r0       /* [R2,R1+C,ACC] */
            xch     a, r1       /* [R2,ACC+C,R1] */
            addc    a, r5       /* [R2+C,ACC,R1] */
            xch     a, r2       /* [ACC+C,R2,R1] */
            addc    a, r5       /* [C,ACC,R2,R1] */
            jnc     00034$

        /*  -- boost exponent and check for overflow -- */

            inc     dph
            mov     r0, dph
            cjne    r0, #0xFF, 00035$
            ljmp    fs_return_inf

        /*  -- otherwise, check for underflow -- */

    00034$: mov     r0, dph
            cjne    r0, #0, 00035$
            ljmp    fs_return_zero  /* underflow */
    00035$:

        /*  -- quotient is now in [ACC,R2,R1] -- */

    /*  ------------------------------------------------------------------
        Pack the floating point value into [ACC,B,DPH,DPL].
        ------------------------------------------------------------------
    */
            rlc     a           /* toss hidden bit and add junk bit into lsb */
            mov     c, F0       /* copy result sign into carry for shift */
            xch     a, dph      /* get exponent temporarily into ACC */
            rrc     a           /* rotate sign in and lsb of exponent out */
            xch     a, dph      /* restore highest mantissa byte to ACC */
            rrc     a           /* shift lsb of exponent into that byte */

        /*  -- Return the value in the standard return value registers -- */

            mov     b, a        /* exponent lsb and mantissa's upper 7 bits */
            mov     a, dph      /* sign and exponent's upper 7 bits */
            mov     dph, r2     /* middle mantissa byte */
            mov     dpl, r1     /* least mantissa byte */

            ret

    __endasm;
    return;
}
