//compile with gcc -Wall -lrt -o test7 test7.c // or //compile with gcc -Wall -lpthread -o test7 test7.c //in your cpumon directory to pick up perf_event.h /* ** stupid program to test memory reads/write ** in a controlled way for perf */ #define _GNU_SOURCE #include #include #include #include #include #include #include #include #include #include #include #include #include #include "perf_event.h" int getopt(int argc, char * const argv[], const char *optstring); extern char *optarg; extern int optind; #define FORMAT PERF_FORMAT_GROUP | PERF_FORMAT_ID | PERF_FORMAT_TOTAL_TIME_ENABLED | \ PERF_FORMAT_TOTAL_TIME_RUNNING #define cpu_relax() asm volatile("rep; nop") #define SEM_NAME "offcore_sem" #define MAX_CPUS 255 #define CACHE_STRIDE 128 #define HITM_LOOP 1000000 #define BUSY_LOOP 100 #define CPU_A 1 #define CPU_B 9 enum { FALSE, TRUE }; enum { SUCCESS, FAILURE }; struct evt_t { uint64_t config; uint64_t config1; char * desc; }; struct evt_t pe[] = { { 0xc0, 0, " CYCLES" }, { 0x01b7, 0x10003c0003, "LCL HITM" }, { 0x01bb, 0x107fc00003, "RMT HITM" }, }; #define NUM_EVTS (sizeof(pe)/sizeof(struct evt_t)) volatile uint64_t *lock_A; volatile uint64_t *lock_B; char *shm_buf; int cpucnt; int fd[MAX_CPUS][NUM_EVTS]; int busy_cnt = BUSY_LOOP; int hitm_cnt = HITM_LOOP; int debug = FALSE; int test_only= FALSE; int cpu_a = CPU_A; int cpu_b = CPU_B; /* ** ** ** */ void open_events(void) { struct perf_event_attr pep; int i, cpu, grp_ldr; // // initialize the perf_event_attr structure, make // sure undefined fields are set to zero // memset(&pep, 0, sizeof(struct perf_event_attr)); pep.type = PERF_TYPE_RAW; pep.size = sizeof(struct perf_event_attr); pep.read_format = FORMAT; pep.exclude_kernel = 1; for(cpu = 0; cpu < cpucnt; cpu++) { grp_ldr = -1; for(i = 0; i < NUM_EVTS; i++) { pep.disabled = (i == 0) ? 1 : 0; pep.config = pe[i].config; pep.config1 = pe[i].config1; fd[cpu][i] = perf_event_open(&pep, -1, cpu, grp_ldr, 0); if (fd[cpu][i] == -1) { printf("Failed to open cpu%d\n", cpu); exit(FAILURE); } if (i == 0) grp_ldr = fd[cpu][i]; } } } /* ** ** ** */ void enable_events(void) { int cpu, ret; for (cpu = 0; cpu < cpucnt; cpu++) { ret = ioctl(fd[cpu][0], PERF_EVENT_IOC_ENABLE, 0); if (ret == -1) { printf("Failed to enable cpu%d\n", cpu); exit(FAILURE); } } } /* ** ** ** */ void disable_events(void) { int cpu, ret; for (cpu = 0; cpu < cpucnt; cpu++) { ret = ioctl(fd[cpu][0], PERF_EVENT_IOC_DISABLE, 0); if (ret == -1) { printf("Failed to disable cpu%d\n", cpu); exit(FAILURE); } } } /* ** ** ** */ void close_events(void) { int cpu, i, ret; for (cpu = 0; cpu < cpucnt; cpu++) { for (i = 0; i < NUM_EVTS; i++) { ret = close(fd[cpu][i]); if (ret == -1) { printf("Failed to close cpu%d\n", cpu); exit(FAILURE); } } } } /* ** ** ** */ void read_events(void) { uint64_t data[(NUM_EVTS*2)+3]; int size = sizeof(uint64_t) * ((NUM_EVTS*2)+3); int cpu,i,ret; printf(" "); for (i = 0; i < NUM_EVTS; i++) printf(" %12s\t", pe[i].desc); printf("\n"); for (cpu = 0; cpu < cpucnt; cpu++) { memset(data, 0, sizeof(data)); ret=read(fd[cpu][0],&data,size); if (ret == -1) { printf("Failed to read cpu%d\n", cpu); exit(FAILURE); } if (ret != size) { printf("Failed to do a full read (%d != %d)\n", ret, size); exit(FAILURE); } printf("cpu %2d: ", cpu); for (i = 0; i < data[0]; i++) { printf(" %12ld\t", data[3+(i*2)]); } printf("\n"); } } /* ** ** ** */ #define CPU_BIND(cpu) \ do { \ cpu_set_t cs; \ CPU_ZERO (&cs); \ CPU_SET (cpu, &cs); \ \ if (sched_setaffinity(0, sizeof (cs), &cs) < 0) { \ perror("sched_setaffinity"); \ exit(EXIT_FAILURE); \ }\ } while (0) /* ** ** spin until lock is available, then attempt to acquire ** */ void acquire_lock(volatile uint64_t *pxchg) { asm volatile ( "mov $0x1, %%rdx \n\t" "spin: mov %0, %%rax \n\t" "test %%rax, %%rax \n\t" "jnz spin \n\t" "lock \n\t" "cmpxchg %%rdx, %0 \n\t" "test %%rax, %%rax \n\t" "jnz spin \n\t" : "=m" (*pxchg) : "m" (*pxchg) : "rax", "rdx", "memory" ); } /* ** ** ** */ void release_lock(volatile uint64_t *lock) { *lock = 0; } /* ** ** ** */ int proc1(int cpu) { register int i,j; CPU_BIND(cpu); if (debug) printf("Starting thread on cpu %d\n", cpu); for (i = 0; i < hitm_cnt; i++) { acquire_lock(lock_A); if (debug) printf("Process A active\n"); for (j = 0; j < busy_cnt; j++) cpu_relax(); release_lock(lock_B); } return 0; } /* ** ** ** */ int proc2(int cpu) { register int i,j; CPU_BIND(cpu); if (debug) printf("Starting thread on cpu %d\n", cpu); for (i = 0; i < hitm_cnt; i++) { acquire_lock(lock_B); if (debug) printf("Process B active\n"); for (j = 0; j < busy_cnt; j++) cpu_relax(); release_lock(lock_A); } return 0; } /* ** ** ** */ int main(int argc, char *argv[]) { int status = 0; int wret = 0; int shmid; int shmflg; int size; int ret; int pid1,pid2; int opt; cpucnt = (int)sysconf(_SC_NPROCESSORS_ONLN); // // process command line overrides for HITM count (c2c) and RelaxCpu Loop Iterations (lock hold time) // while ((opt = getopt(argc, argv, "Dtr:c:s:")) != -1) { switch (opt) { case 's': if (sscanf(optarg, "%d,%d", &cpu_a, &cpu_b) != 2) { printf("error - unable to parse cpu sharing pair: -s %s\n", optarg); exit(FAILURE); } if ((cpu_a >= cpucnt) || (cpu_b >= cpucnt)) { printf("specified cpuid exceeds available cpus, cpu_A = %d, cpu_B = %d, cpu cnt = %d\n", cpu_a, cpu_b, cpucnt); exit(FAILURE); } break; case 'c': hitm_cnt = atoi(optarg); break; case 'r': busy_cnt = atoi(optarg); break; case 'D': debug = TRUE; break; case 't': test_only = TRUE; break; default: /* '?' */ fprintf(stderr, "Usage: %s [-c hitm count] [-r relax cpu loop count] name\n", argv[0]); exit(0); } } memset(fd,0,sizeof(fd)); // // create shared memory segment to hold the lock that will be shared // between the two processes attempting to gain lock onwership // // Create shm region. 0x180 provides owner rd/wr permissions. size = 2 * CACHE_STRIDE; shmflg = IPC_CREAT | 0x180; shmid = shmget(IPC_PRIVATE, size, shmflg); if (shmid < 0) { perror("shmget"); printf ("Unable to allocate shared memory segment\n"); printf ("size = %d, shmid = %d, errno = %d\n", size, shmid, errno); exit(FAILURE); } // // Attach to the shared memory segment. Make sure the buffer is acceptably aligned. // shmflg = 0; shm_buf = (char *)shmat(shmid, (void *)0, shmflg); if (shm_buf < 0) { perror("shmat"); printf ("Error: shmat failed, errno = %d\n", errno); exit(FAILURE); } if (debug) printf("Shared memory segment, base adrs : 0x%016lx, size : %d\n", (uint64_t)shm_buf, size ); if ((uint64_t)shm_buf & 0xfff) { printf ("Error: shmat returned a unacceptably aligned value = %016lx\n", (uint64_t)shm_buf); exit(FAILURE); } // // shared memory segment will automatically be removed when all processes are detached // ret = shmctl (shmid, IPC_RMID, NULL); if (ret < 0) { perror("shmctl"); printf ("Error: shmctl failed, errno = %d\n", errno); } // // define the control locks within shared memory segment, beware of data prefetching effects // lock_A = (volatile uint64_t *)(shm_buf + 0 ); lock_B = (volatile uint64_t *)(shm_buf + CACHE_STRIDE); if (debug) printf("lock_A 0x%016lx, Lock_B 0x%016lx\n", (uint64_t)lock_A, (uint64_t)lock_B); // // initialze to locked/blocked to keep processes waiting - spinning on lock // *lock_A = 1; *lock_B = 1; if (debug) printf("lock_A 0x%016lx, Lock_B 0x%016lx\n", *lock_A, *lock_B); // // create the child processes that will ping pong back and forth // pid1 = fork(); if (pid1 < 0) return -1; if (!pid1) { ret = proc1(cpu_a); exit(ret); } pid2 = fork(); if (pid2 < 0) return -1; if (!pid2) { ret = proc2(cpu_b); exit(ret); } if (debug) printf("child processes has been successfully created\n"); // // bind parent to cpu 0, configure and activate the PMU counters // CPU_BIND(0); if (!test_only) { open_events(); enable_events(); } if (debug) printf("parent bound to cpu 0, pmu configured and started, release lock for cpu A\n"); // // start the ping pong dance // *lock_A = 0; // // wait for children to terminate before stopping & reading PMU counters // wret = waitpid(pid1, &status, 0); if (wret < 0 || status) return -1; wret = waitpid(pid2, &status, 0); if (wret < 0 || status) return -1; if (debug) printf("child processes have completed, display the results\n"); // // collect PMU data, report results and then deallocate counters // if (!test_only) { disable_events(); read_events(); close_events(); } // // detach from shared memory segment, segment should automatically be deallocated // ret = shmdt ((void *)shm_buf); if (ret < 0) { perror("shmdt"); printf ("Error: shmdt failed, errno = %d\n", errno); } return 0; }