; mix.asm, some mixers and clippers for sb.

[BITS 32]
[GLOBAL _mixmonoloop]
[GLOBAL _mixstereoloop]
[GLOBAL _clip16bitloop]
[GLOBAL _clip8bitloop]
[EXTERN _pit]
[EXTERN _posil]
[EXTERN _posih]
[EXTERN _ma]
[EXTERN _deita]
[EXTERN _taul]
[EXTERN _rtaul]
[EXTERN _ltaul]
[EXTERN _vali]
[EXTERN _faddl]
[EXTERN _faddh]
[EXTERN _outputsamples]
[EXTERN _selektoori]

[SECTION .text]

 ; *********** mono mixer ************
 ; rather a lot of memory accesses. tried to implement some
 ; minor pentium pairing optimizations.

 ;times ($$-$) & 3 nop   ;pad with NOPs to 4-byte boundary..


;-----------------------------------------------------------------------
; _mixmonoloop -- add one 8-bit source voice into the 32-bit mono buffer.
;
; No register or stack arguments; the interface is a set of globals:
;   _pit            number of output samples to mix
;   _posih:_posil   32.32 fixed-point source position (integer:fraction)
;   _faddh:_faddl   32.32 fixed-point step added for every output sample
;   _deita          address of the source sample bytes
;   _taul           address of a dword table indexed by the unsigned
;                   sample byte (the volume table)
;   _vali, _ma      output buffer address and starting dword index
; On return _posih:_posil hold the advanced source position.
; Preserves all general registers (pushad/popad); flags are clobbered.
;-----------------------------------------------------------------------
_mixmonoloop:
   pushad                ;save every general register

   mov ebp,[_pit]        ;ebp = samples still to mix
   test ebp,ebp
   jz .done              ;zero samples: the dec/jnz loop below is bottom-tested
                         ;and would otherwise wrap around to 2^32 iterations

   mov edx,[_posih]      ;edx = integer part of the source position
   mov ebx,[_posil]      ;ebx = fractional part

   mov eax,[_ma]         ;eax = &output[_ma]; the output holds dwords,
   shl eax,2             ;so the index is scaled by 4
   add eax,[_vali]

   add edx,[_deita]      ;edx becomes a pointer to the current source byte

   mov edi,[_taul]       ;edi = volume table

   xor ecx,ecx           ;ecx is used as a table index; only cl is loaded below

.loop:
   mov cl,byte [edx]     ;fetch the unsigned sample byte

   add ebx,[_faddl]      ;advance the fraction; sets CF on wrap

   mov esi,[edi+ecx*4]   ;look the sample up in the volume table
                         ;(mov does not alter CF)
   adc edx,[_faddh]      ;advance the integer part plus the carry from above

   add [eax],esi         ;accumulate into the output buffer

   xor ecx,ecx           ;re-zero the full index register before the next cl load

   add eax,4             ;next output dword
   dec ebp
   jnz .loop

   sub edx,[_deita]      ;turn the pointer back into a position
   mov [_posih],edx      ;store the advanced position, integer part
   mov [_posil],ebx      ;and fractional part

.done:
   popad                 ;restore every general register
   ret

 ; *********** stereo mixer ************
 ; rather a lot of memory accesses. tried to implement some
 ; minor pentium pairing optimizations.

 ;times ($$-$) & 3 nop   ;pad with NOPs to 4-byte boundary..

;-----------------------------------------------------------------------
; _mixstereoloop -- add one 8-bit source voice into the 32-bit stereo
; buffer, using separate volume tables for the left and right channel.
;
; No register or stack arguments; the interface is a set of globals:
;   _pit            number of sample frames to mix (left as-is; the
;                   countdown happens in the private variable `length`)
;   _posih:_posil   32.32 fixed-point source position (integer:fraction)
;   _faddh:_faddl   32.32 fixed-point step added for every frame
;   _deita          address of the source sample bytes
;   _ltaul, _rtaul  left / right dword tables indexed by the unsigned
;                   sample byte (the volume tables)
;   _vali, _ma      output buffer address and starting dword index;
;                   each frame occupies two dwords: left, then right
; On return _posih:_posil hold the advanced source position.
; Preserves all general registers (pushad/popad); flags are clobbered.
;-----------------------------------------------------------------------
_mixstereoloop:
   pushad                      ;save every general register

   mov eax,[_pit]
   test eax,eax
   jz .done                    ;zero frames: the dec/jnz loop below is bottom-tested
                               ;and would otherwise wrap around to 2^32 iterations
   mov [length],eax            ;count down in `length` so _pit stays intact

   mov edx,[_posih]            ;edx = integer part of the source position
   mov ebx,[_posil]            ;ebx = fractional part

   mov eax,[_ma]               ;eax = &output[_ma]; the output holds dwords,
   shl eax,2                   ;so the index is scaled by 4
   add eax,[_vali]

   add edx,[_deita]            ;edx becomes a pointer to the current source byte

   xor ecx,ecx                 ;ecx is used as a table index; only cl is loaded below

   mov edi,[_ltaul]            ;edi = left volume table
   mov ebp,[_rtaul]            ;ebp = right volume table

.loop:
                               ;left channel
   mov cl,byte [edx]           ;fetch the unsigned sample byte

   add ebx,[_faddl]            ;advance the fraction; sets CF on wrap

   mov esi,[edi+ecx*4]         ;left-table lookup (mov does not alter CF)

   adc edx,[_faddh]            ;advance the integer part plus the carry from above

   add [eax],esi               ;accumulate into the left output dword

                               ;right channel
   mov esi,[ebp+ecx*4]         ;right-table lookup with the same sample byte (ecx).
                               ;ebp as base makes this an SS-relative access.
                               ;NOTE(review): relies on SS and DS addressing the
                               ;same memory -- confirm for the target environment.

   xor ecx,ecx                 ;re-zero the full index register before the next cl load

   add [eax+4],esi             ;accumulate into the right output dword

   add eax,8                   ;next output frame (two dwords)

   dec dword [length]          ;any frames left?
   jnz .loop

   sub edx,[_deita]            ;turn the pointer back into a position
   mov [_posih],edx            ;store the advanced position, integer part
   mov [_posil],ebx            ;and fractional part

.done:
   popad                       ;restore every general register
   ret

; *********** 16-bit clipper ************

 ;times ($$-$) & 3 nop   ;(disabled) pad with NOPs to a 4-byte boundary.
			;unclear how DJGPP handles this.

;-----------------------------------------------------------------------
; _clip16bitloop -- saturate 32-bit mixed samples to signed 16 bits and
; write them through a separate selector.
;
; No register or stack arguments; the interface is a set of globals:
;   _pit          number of samples to convert
;   _vali         address of the source buffer (signed dwords)
;   _selektoori   selector loaded into FS for the destination
;                 (per the original author: the DMA buffer below 1MB)
;   _posil        destination offset within that selector
; Each source dword is clamped to [-32768, 32767] and stored as a word.
; Preserves all general registers and FS; flags are clobbered.
;-----------------------------------------------------------------------
_clip16bitloop:
   pushad                ;save every general register
   push fs               ;and FS, which is repointed below

   mov edx,[_pit]        ;edx = samples left to convert
   test edx,edx
   jz .done              ;zero samples: the dec/jnz loop below is bottom-tested
                         ;and would otherwise wrap around to 2^32 iterations

   mov ax,[_selektoori]  ;FS = destination selector
   mov fs,ax

   mov esi,[_vali]       ;esi = source pointer
   mov edi,[_posil]      ;edi = destination offset

.next:
   mov eax,[esi]         ;fetch one signed dword
   cmp eax,-32768
   jl .underflow         ;below the 16-bit range
   cmp eax,32767
   jg .overflow          ;above the 16-bit range
.store:
   mov [fs:edi],ax       ;in range (or already clamped): store the low word
   add esi,4             ;next source dword
   add edi,2             ;next destination word
   dec edx
   jnz .next
.done:
   pop fs                ;restore FS and the general registers
   popad
   ret

.underflow:              ;clamp to the most negative 16-bit value
   mov eax,-32768
   jmp .store

.overflow:               ;clamp to the most positive 16-bit value
   mov eax,32767
   jmp .store

; *********** 8-bit clipper ************
 ;times ($$-$) & 3 nop   ;(disabled) pad with NOPs to a 4-byte boundary.
			;unclear how DJGPP handles this.

;-----------------------------------------------------------------------
; _clip8bitloop -- reduce 32-bit mixed samples to unsigned 8 bits with
; saturation and write them through a separate selector.
;
; No register or stack arguments; the interface is a set of globals:
;   _pit          number of samples to convert
;   _vali         address of the source buffer (signed dwords)
;   _selektoori   selector loaded into FS for the destination
;   _posil        destination offset within that selector
; Each source dword is arithmetically shifted right by 8, clamped to
; [-128, 127], biased by +128 and stored as one byte (0..255).
; Preserves all general registers and FS; flags are clobbered.
;-----------------------------------------------------------------------
_clip8bitloop:
   pushad                ;save every general register
   push fs               ;and FS, which is repointed below

   mov edx,[_pit]        ;edx = samples left to convert
   test edx,edx
   jz .done              ;zero samples: the dec/jnz loop below is bottom-tested
                         ;and would otherwise wrap around to 2^32 iterations

   mov ax,[_selektoori]  ;FS = destination selector
   mov fs,ax

   mov esi,[_vali]       ;esi = source pointer
   mov edi,[_posil]      ;edi = destination offset

.next:
   mov eax,[esi]         ;fetch one signed dword
   sar eax,8             ;drop the 8 least significant bits, keeping the sign
   cmp eax,127
   jg .overflow          ;above the signed 8-bit range

   add eax,128           ;signed -> unsigned bias; the sum is negative exactly
   js .underflow         ;when the value was below -128 (eax <= 127 here, so
                         ;the add cannot overflow and SF alone decides)
.store:
   mov [fs:edi],al       ;in range (or already clamped): store the byte
   add esi,4             ;next source dword
   inc edi               ;next destination byte
   dec edx
   jnz .next
.done:
   pop fs                ;restore FS and the general registers
   popad
   ret

.underflow:              ;too small: output the lowest unsigned value, 0
   xor eax,eax
   jmp .store

.overflow:               ;too large: output the highest unsigned value, 255
   mov eax,255
   jmp .store

[SECTION .data]

 ; Countdown of remaining frames for _mixstereoloop's inner loop. It is
 ; loaded from _pit on entry and decremented once per frame, so that _pit
 ; itself is left unmodified. Being a single static dword, it is shared by
 ; every invocation of _mixstereoloop (the routine is not re-entrant).
 length DD 0

[SECTION .bss]

