
#include <iostream.h>
#include <math.h>
#include <stdlib.h>
#include <time.h>

#define NUM_VECTORS 100000

// External routine with C linkage and the __stdcall calling convention;
// its implementation is not in this file.
// As used by main() below:
//   matrix  - 16 floats (a 4x4 matrix), read-only.
//   vectors - num_vec groups of 4 floats, read-only; the caller passes a
//             16-byte-aligned pointer.
//   num_vec - number of 4-float vectors in `vectors`.
//   output  - destination buffer; the caller passes an 8-byte-aligned
//             pointer with room for 2 ints per vector.
// The return value is reported by the caller as an elapsed cycle count.
// NOTE(review): presumably multiplies every vector by the matrix and stores
// a 2-int result per vector -- confirm against the assembly source, along
// with whether the alignment above is actually required.
extern "C" int __stdcall matbyvec (
   const float* matrix,
   const float* vectors,
   int num_vec,
   int* output
);

// External routine (C linkage, __stdcall); implemented outside this file.
// main() treats a zero return as "processor does not support 3DNow!" and
// any non-zero return as "supported".
extern "C" int __stdcall check_3dnow();

// Benchmark driver for matbyvec().
//
// Verifies 3DNow! support, builds a 4x4 matrix and NUM_VECTORS random
// 4-component vectors, calls matbyvec() once over the whole set and prints
// the cycle count it returns (total and per vector).
//
// Returns EXIT_SUCCESS after a completed run, EXIT_FAILURE when 3DNow! is
// unavailable or the work buffers cannot be allocated.
int main (void)
{
   if (!check_3dnow()) {
      cout << "Your processor does not support 3DNow! instruction set" << endl;
      return EXIT_FAILURE;
   }

   // Prepare data: a 4x4 matrix stored as 16 consecutive floats.
   const float matrix[] = { 1.0f, 2.0f, 0.94f, 2.4f,
                            4.4f, 1.0f, 0.1f,  11.2f,
                            0.1f, 0.9f, 3.1f,  0.01f,
                            0.0f, 0.0f, 0.0f,  1.0f };

   // Over-allocate so the aligned pointers computed below still have room
   // for all NUM_VECTORS entries.
   float* vectors = new float[4*NUM_VECTORS+32];
   int* output = new int[2*NUM_VECTORS+8];

   // Pre-standard compilers return NULL from new[] on failure instead of
   // throwing; the check is harmless on compilers that throw.
   if (!vectors || !output) {
      cout << "Out of memory" << endl;
      delete [] output;
      delete [] vectors;
      return EXIT_FAILURE;
   }

   // Align: round the input up to a 16-byte boundary and the output up to an
   // 8-byte boundary. The arithmetic is done in size_t rather than int so
   // the address is not truncated where pointers are wider than int.
   float* _vectors = (float*) ( ((size_t)vectors + 15) & ~(size_t)15 );
   int*   _output  = (int*  ) ( ((size_t)output  + 7 ) & ~(size_t)7  );

   // Fill vectors: x, y, z are 100*sin(t) with t drawn from [0, 6.282] in
   // steps of 0.001; the fourth component is fixed at 1.
   srand ((unsigned) time(NULL));
   for (int i=0; i < 4*NUM_VECTORS;) {
      _vectors[i++] = 100.0f*(float)sin((float)(rand() % 6283)/1000);
      _vectors[i++] = 100.0f*(float)sin((float)(rand() % 6283)/1000);
      _vectors[i++] = 100.0f*(float)sin((float)(rand() % 6283)/1000);
      _vectors[i++] = 1.0f;
   }

   // Execute
   int num_cycles = matbyvec (matrix, _vectors, NUM_VECTORS, _output);

   // Output result (per-point figure uses integer division)
   cout << "Done in " << num_cycles << " cycles (" << (num_cycles / NUM_VECTORS)
      << " cycles per point)" << endl;

   // Cleanup: release the original (unaligned) pointers returned by new[].
   delete [] output;
   delete [] vectors;

   return EXIT_SUCCESS;
}
