Follow along with the video below to see how to install our site as a web app on your home screen.
Note: This feature may not be available in some browsers.
#ifndef lint
static char sccsid[] = "@(#)memcpy.c 1.1 90/03/23"; /* from S5R2 1.1 */
#endif
/*LINTLIBRARY*/
/*
 * memcpy: copy exactly n bytes from s2 into s1.
 * A count of zero or less copies nothing.
 * Returns the original value of s1.
 */
char *
memcpy(s1, s2, n)
register char *s1, *s2;
register int n;
{
	register char *out = s1;	/* write cursor; s1 is kept for the return */

	for (; n > 0; n--)
		*out++ = *s2++;
	return (s1);
}
#if !defined(lint) && defined(SCCSIDS)
static char sccsid[] = "@(#)strcpy.c 1.1 90/03/23"; /* from UCB 4.1 82/10/05 */
#endif
/*
 * strcpy: copy the NUL-terminated string s2, terminator included, into s1.
 * The caller must supply a destination big enough to hold it.
 * Returns s1.
 */
char *
strcpy(s1, s2)
register char *s1, *s2;
{
	register char *out = s1;	/* write cursor; s1 is kept for the return */
	register char c;

	do {
		c = *s2++;
		*out++ = c;
	} while (c != '\0');
	return (s1);
}
This article may be of interest: Optimizing Memcpy improves speed. hunghsun said: I guess more information is in order. I am dealing with a high-speed interconnect that requires the data to be copied from a source location to a buffer location before it can be sent over the network (think of it as MPI, where you copy the source data into the MPI buffer and then transfer it over the network to another computer). I did a time profile based on the size of the data, and for anything beyond 64K, memcpy (buffer, source, size) simply dominates this process. I think that beyond 64K it accounts for more than 50% of the execution time. Because I have to do this copying (there is no way around it due to a hardware limitation), I am trying to find out if I can use something else instead.
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#ifdef WIN32
#include <windows.h>
/* Tick count type used by the timing code on Windows builds. */
typedef __int64 precise_clock_t;
/* Return the current value reported by QueryPerformanceCounter(). */
precise_clock_t ReadClock ( void ) {
    LARGE_INTEGER ticks;

    QueryPerformanceCounter( &ticks );
    return ticks.QuadPart;
}
#endif
/*
 * GCC-compatible compilers: time with the x86 time-stamp counter.
 * WIN32 builds are excluded because the WIN32 section already defines
 * precise_clock_t and ReadClock(); compiling both (e.g. with MinGW)
 * would be a redefinition.
 */
#if defined(__GNUC__) && !defined(WIN32)
/*
 * Store the 64-bit time-stamp counter in the lvalue llval.
 * rdtsc returns the low half in eax and the high half in edx.  The two
 * halves are read separately because the "A" constraint only names the
 * edx:eax pair on 32-bit x86; on x86-64 it selects a single register and
 * half of the counter would be lost.
 */
#define RDTSC(llval) do { \
    unsigned int rdtsc_lo_, rdtsc_hi_; \
    __asm__ __volatile__ ( "rdtsc" : "=a" (rdtsc_lo_), "=d" (rdtsc_hi_) ); \
    (llval) = ( (unsigned long long)rdtsc_hi_ << 32 ) | rdtsc_lo_; \
} while (0)
typedef unsigned long long precise_clock_t;
/* Return the current time-stamp counter value. */
precise_clock_t ReadClock ( void ) {
    precise_clock_t result;
    RDTSC( result );
    return result;
}
#endif
/*
 * Copy len bytes from src to dst, a long at a time, with the inner loop
 * unrolled eight ways by Duff's device
 * (from http://www.lysator.liu.se/c/duffs-device.html).
 *
 * dst and src are accessed through long pointers, so both must be
 * suitably aligned for long.  The regions must not overlap.
 * Any bytes left over after the last whole long are copied one by one.
 * Returns dst.
 */
void *fast_memcpy1 ( void *dst, const void *src, size_t len ) {
    long *to = dst;
    const long *from = src;
    size_t words = len / sizeof(long);   /* whole longs to copy */
    size_t tail = len % sizeof(long);    /* bytes left after the longs */

    /*
     * Duff's device always executes the loop body at least once, so a
     * zero count has to be skipped; otherwise case 0 would copy 8 longs.
     */
    if ( words > 0 ) {
        size_t n = ( words + 7 ) / 8;    /* passes through the do-loop */
        switch ( words % 8 )
        {
        case 0: do { *to++ = *from++;
                /* fallthrough */
        case 7:      *to++ = *from++;
                /* fallthrough */
        case 6:      *to++ = *from++;
                /* fallthrough */
        case 5:      *to++ = *from++;
                /* fallthrough */
        case 4:      *to++ = *from++;
                /* fallthrough */
        case 3:      *to++ = *from++;
                /* fallthrough */
        case 2:      *to++ = *from++;
                /* fallthrough */
        case 1:      *to++ = *from++;
                } while ( --n > 0 );
        }
    }
    if ( tail > 0 ) {
        unsigned char *bto = (unsigned char *)to;
        const unsigned char *bfrom = (const unsigned char *)from;
        while ( tail-- > 0 ) *bto++ = *bfrom++;
    }
    return dst;
}
/*
 * Copy len bytes from src to dst, a long at a time, in groups of eight
 * indexed stores.
 *
 * This works well if the processor has register+offset addressing modes:
 * it avoids the overhead of incrementing pointers for each long copied.
 * However, such addressing modes tend to be slower, so overall there may
 * be little difference.
 *
 * dst and src are accessed through long pointers, so both must be
 * suitably aligned for long.  The regions must not overlap.
 * Longs that do not fill a group of eight, and then any bytes that do not
 * fill a long, are copied individually.
 * Returns dst.
 */
void *fast_memcpy2 ( void *dst, const void *src, size_t len ) {
    long *to = dst;
    const long *from = src;
    size_t words = len / sizeof(long);   /* whole longs to copy */
    size_t tail = len % sizeof(long);    /* bytes left after the longs */
    size_t n;

    /* full groups of eight longs */
    for ( n = words / 8 ; n > 0 ; n-- ) {
        to[0] = from[0];
        to[1] = from[1];
        to[2] = from[2];
        to[3] = from[3];
        to[4] = from[4];
        to[5] = from[5];
        to[6] = from[6];
        to[7] = from[7];
        to += 8;
        from += 8;
    }
    /* remaining whole longs */
    for ( n = words % 8 ; n > 0 ; n-- ) {
        *to++ = *from++;
    }
    /* remaining bytes */
    if ( tail > 0 ) {
        unsigned char *bto = (unsigned char *)to;
        const unsigned char *bfrom = (const unsigned char *)from;
        while ( tail-- > 0 ) *bto++ = *bfrom++;
    }
    return dst;
}
/*
 * Baseline copy: move len bytes from src to dst one byte at a time.
 * Returns dst.
 */
void *simple_memcpy1 ( void *dst, const void *src, size_t len ) {
    char *out = dst;
    const char *in = src;
    size_t k;

    for ( k = 0 ; k < len ; k++ ) {
        out[k] = in[k];
    }
    return dst;
}
/*
 * Copy len bytes from src to dst one long at a time.
 *
 * The long count is derived from sizeof(long) rather than a fixed 4, so
 * the routine copies the requested amount whatever the width of long.
 * Bytes that do not fill a whole long are copied individually.
 * dst and src are accessed through long pointers, so both must be
 * suitably aligned for long.  Returns dst.
 */
void *simple_memcpy2 ( void *dst, const void *src, size_t len ) {
    long *to = dst;
    const long *from = src;
    size_t words = len / sizeof(long);   /* whole longs to copy */
    size_t tail = len % sizeof(long);    /* bytes left after the longs */

    while ( words-- ) *to++ = *from++;
    if ( tail > 0 ) {
        unsigned char *bto = (unsigned char *)to;
        const unsigned char *bfrom = (const unsigned char *)from;
        while ( tail-- > 0 ) *bto++ = *bfrom++;
    }
    return dst;
}
/* TEST CODE */
/* --------- */
/* Fixture sizes, in longs: the arrays hold TEST_LEN elements and the */
/* benchmark copies the elements from LOW_COPY up to (not including) HIGH_COPY. */
#define TEST_LEN 102400
#define LOW_COPY 10000
#define HIGH_COPY 80000
long src[TEST_LEN];
long dst[TEST_LEN];
/* Reset the fixture: src[k] becomes ~k and dst[k] becomes 0 for every k. */
void prepare ( void ) {
    int k;

    for ( k = 0 ; k < TEST_LEN ; k++ ) {
        src[k] = ~k;
        dst[k] = 0;
    }
}
/*
 * Verify the result of a copy into dst: elements in [LOW_COPY, HIGH_COPY)
 * must hold ~index and every element outside that range must still be 0.
 * Returns 1 when all elements match, 0 at the first mismatch.
 */
int check ( void ) {
    int idx;

    for ( idx = 0 ; idx < TEST_LEN ; idx++ ) {
        long expected = ( idx >= LOW_COPY && idx < HIGH_COPY ) ? ~idx : 0;
        if ( dst[idx] != expected ) return 0;
    }
    return 1;
}
/* Pointer to a copy routine taking (destination, source, byte count) and returning a pointer. */
typedef void *(*cpyfunc)(void*,const void*,size_t);
/* Table of the copy routines available to the benchmark; main() picks */
/* one entry by index and passes its fn and name to do_tests(). */
struct {
cpyfunc fn; /* routine to call */
char *name; /* the routine's identifier as text, used in the report line */
} funcs[] = {
/* MAKEFN(x) builds one table entry: the function x and its name as a string literal. */
#define MAKEFN(x) { x, #x }
MAKEFN(memcpy),
MAKEFN(fast_memcpy1),
MAKEFN(fast_memcpy2),
MAKEFN(simple_memcpy1),
MAKEFN(simple_memcpy2),
};
/*
 * Time a single call of one copy routine.
 *
 * The global fixture is reset with prepare(), fn(dst, src, len) is run
 * between two ReadClock() samples, and one line is printed with the
 * routine's name, the result of check() and the elapsed clock ticks.
 */
void do_tests ( cpyfunc fn, char *name, void *dst, const void *src, size_t len ) {
    precise_clock_t started, finished;
    unsigned long elapsed;

    prepare();
    started = ReadClock();
    fn( dst, src, len );
    finished = ReadClock();
    elapsed = finished - started;
    printf( "Check %-15s=%d, time=%lu\n", name, check(), elapsed );
}
int main ( int argc, char *argv[] ) {
int i, testnum = 0;
char *from = (char*)&src[LOW_COPY];
char *to = (char*)&dst[LOW_COPY];
size_t len = (HIGH_COPY-LOW_COPY)*sizeof(long);
if ( argc >= 2 ) testnum = atoi(argv[1]);
printf( "Addresses are %p and %p\n", from, to );
for ( i = 0 ; i < 5 ; i++ ) {
do_tests( funcs[testnum].fn, funcs[testnum].name, to, from, len );
}
printf("\n");
return 0;
}