memcpy ist üblicherweise einiges schneller als ein "normaler" for-loop:
Code:
#include <stdio.h>
#include <string.h>
#include "cyc.h"
/* Side length of the square matrices copied by the benchmark. */
enum { MATRIX_DIM = 50 };

/* Matrix that do_memcpy() and do_loop() read from. */
double a[MATRIX_DIM][MATRIX_DIM];
/* Matrix that do_memcpy() and do_loop() write to; same shape as `a`. */
double b[MATRIX_DIM][MATRIX_DIM];
/*
 * Copy the whole matrix `a` into `b` with a single memcpy call.
 *
 * The byte count is taken from the destination array itself, so it cannot
 * drift out of sync with the declarations of `a` and `b`.
 */
void do_memcpy(void) {
    memcpy(b, a, sizeof b);
}
/*
 * Copy matrix `a` into `b` one element at a time with plain for-loops.
 *
 * The copy walks rows and columns rather than indexing a flat double*
 * obtained from &a[0][0]: such a pointer designates only the first row, and
 * subscripting it beyond that row is undefined behavior in ISO C even
 * though the storage of the whole matrix is contiguous.
 */
void do_loop(void) {
    /* Bounds are derived from the array type, not repeated as literals. */
    const size_t rows = sizeof a / sizeof a[0];
    const size_t cols = sizeof a[0] / sizeof a[0][0];

    for (size_t r = 0; r < rows; r++) {
        for (size_t c = 0; c < cols; c++) {
            b[r][c] = a[r][c];
        }
    }
}
/*
 * Benchmark driver: runs do_loop() once as a warm-up, then times one pass of
 * do_loop() and one pass of do_memcpy(), and prints the three cycle counts.
 */
int main() {
    double mark;  /* counter reading taken right before each timed call */

    /* Set the reference point that get_counter() readings are relative to. */
    start_counter();

    /* First pass over the matrices; reported separately as cache warm-up. */
    mark = get_counter();
    do_loop();
    double warmup_cycles = get_counter() - mark;

    /* Timed pass: element-wise loop copy. */
    mark = get_counter();
    do_loop();
    double loop_cycles = get_counter() - mark;

    /* Timed pass: memcpy copy. */
    mark = get_counter();
    do_memcpy();
    double memcpy_cycles = get_counter() - mark;

    printf("Copying a 50x50 matrix of doubles:\n"
           " using a for-loop: %.0lf cycles\n"
           " using memcpy: %.0lf cycles\n",
           loop_cycles, memcpy_cycles);
    printf("\nCache warmup: %.0lf cycles\n", warmup_cycles);

    return 0;
}
Resultate bei mir (Athlon X2 BE-2400, 512k cache):
Code:
Copying a 50x50 matrix of doubles:
using a for-loop: 47689 cycles
using memcpy: 17349 cycles
Cache warmup: 97446 cycles
Die Resultate ändern sich nur unwesentlich, wenn zuerst die memcpy cycles und dann die for-loop cycles gemessen werden.
cycle counter implementation:
Code:
// cyc.h
// Interface of the cycle counter used for timing measurements.
// The guard macro avoids a leading double underscore, which is reserved
// for the implementation.
#ifndef CYC_H
#define CYC_H

/* Return, as a double, the number of cycles counted since the reading
 * stored by the most recent start_counter() call. */
double get_counter(void);

/* Store the current counter reading as the reference for get_counter(). */
void start_counter(void);

#endif /* CYC_H */
Code:
// cyc.c
/*
 * Reference reading of the cycle counter, kept as separate high and low
 * halves. Written by start_counter() and subtracted from the current
 * reading in get_counter(). Objects with static storage duration start out
 * as zero, so no explicit initializer is needed.
 */
static unsigned cyc_hi;
static unsigned cyc_lo;
/*
 * Read the processor's cycle counter with the x86 RDTSC instruction.
 *
 * hi: receives the upper half of the counter (delivered in EDX).
 * lo: receives the lower half of the counter (delivered in EAX).
 *
 * The asm statement is volatile: it has outputs but no inputs, so without
 * the qualifier the compiler is allowed to treat two reads as identical and
 * merge, hoist or drop them, which would ruin any timing measurement.
 * The "=a"/"=d" constraints bind the outputs directly to EAX/EDX, so no
 * extra moves or clobber list are required. __asm__ is used instead of asm
 * so the file also builds in strict ISO modes (e.g. -std=c99).
 */
void access_counter(unsigned *hi, unsigned *lo) {
    unsigned eax_val, edx_val;

    __asm__ __volatile__("rdtsc" : "=a" (eax_val), "=d" (edx_val));

    /* Store through the pointers only after the asm has finished. */
    *hi = edx_val;
    *lo = eax_val;
}
double get_counter() {
unsigned ncyc_hi, ncyc_lo;
unsigned hi, lo, borrow;
/* Get cycle counter */
access_counter(&ncyc_hi, &ncyc_lo);
/* 64 bit subtraction */
lo = ncyc_lo - cyc_lo;
borrow = lo > ncyc_lo;
hi = ncyc_hi - cyc_hi - borrow;
return (double) hi * (1 << 30) * 4 + lo;
}
/*
 * Store the current counter reading in cyc_hi:cyc_lo so that later
 * get_counter() calls report cycles relative to this point.
 */
void start_counter() {
    access_counter(&cyc_hi, &cyc_lo);
}
Lesezeichen