        BITS 32
        %include "helper.mac"

        %define d dword 

        SECTION .data

        ; Scratch cell: test_RDTSC stores the low dword of the time-stamp
        ; counter here just before entering its timed loop.
time:   dd      0

        ; Single-precision values used by the timed x87 sequence in
        ; test_RDTSC.  For each letter x in {a, b, c} the loop computes
        ; x1 * x2 and stores the product into x3, so the initial contents
        ; of a3/b3/c3 are overwritten on the first iteration.
a1:     dd      1.0
a2:     dd      2.3
a3:     dd      3.4

b1:     dd      3.4
b2:     dd      4.5
b3:     dd      1.3

c1:     dd      5.6
c2:     dd      6.7
c3:     dd      3.141526535

        SECTION .text
        ALIGN 16		


;-----------------------------------------------------------------------
; test_RDTSC -- time `count` iterations of a short x87 sequence via RDTSC
;
; In:    count = ARG1 (argument accessor supplied by helper.mac),
;                the number of loop iterations to run
; Out:   eax   = low 32 bits of (TSC after loop - TSC before loop),
;                i.e. elapsed cycles modulo 2^32; 0 when count == 0
; Clobb: ecx (loop counter), edx (high dword written by rdtsc), flags
; Writes: [time], [a3], [b3], [c3]
;
; NOTE(review): rdtsc is not a serializing instruction, so on an
; out-of-order CPU the two reads may be reordered relative to the loop
; body.  The per-instruction cycle annotations below are the original
; author's estimates -- confirm on the target CPU before relying on them.
;-----------------------------------------------------------------------
proc test_RDTSC
%define count   ARG1

        mov     ecx, [count]            ; ecx = iterations remaining

        ; Guard: with count == 0 the dec/jnz pair below would wrap ecx to
        ; 0xFFFFFFFF and spin ~2^32 times.  Report 0 cycles instead.  The
        ; check sits before the first rdtsc so it is not part of the
        ; measured region.
        xor     eax, eax
        test    ecx, ecx
        jz      .done

        rdtsc                           ; edx:eax = time-stamp counter
        mov     [time], eax             ; keep only the low dword as start

.looper:
        ;; Example code that is supposed to take ~12 clock cycles per
        ;; iteration.  Add the overhead of the paired dec ecx / jnz .looper
        ;; and you get ~13.  The original author measured 13200 cycles in
        ;; total; dividing by the iteration count used for that run (1000)
        ;; gives 13.2 cycles per iteration, which matches the estimate.

        FLD     d [a1]          ; clock cycle 1      st0 = a1
        FMUL    d [a2]          ; clock cycle 2-4    st0 = a1*a2
        FLD     d [b1]          ; clock cycle 3      st0 = b1
        FMUL    d [b2]          ; clock cycle 4-6    st0 = b1*b2
        FLD     d [c1]          ; clock cycle 5      st0 = c1
        FMUL    d [c2]          ; clock cycle 6-8    st0 = c1*c2
        FXCH    st2             ; clock cycle 6      bring a1*a2 to st0
        FSTP    d [a3]          ; clock cycle 7-8    a3 = a1*a2
        FSTP    d [b3]          ; clock cycle 9-10   b3 = b1*b2
        FSTP    d [c3]          ; clock cycle 11-12  c3 = c1*c2
                                ; three loads, three pops: x87 stack depth
                                ; is unchanged after each iteration

        dec     ecx
        jnz     .looper

        rdtsc
        sub     eax, [time]     ; return to C with the # of cycles in eax

.done:
endproc
