// burstdog: periodically samples /proc/stat and /proc/<pid>/stat and appends
// a message to a log file whenever CPU usage over the last few samples
// exceeds a configured threshold.

#define _GNU_SOURCE
#include <dirent.h>
#include <errno.h>
#include <fcntl.h>
#include <inttypes.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/types.h>
#include <time.h>
#include <unistd.h>

// settings
#define BURSTDOG_IV 10 // how often to wake up, per second
#define BURSTDOG_BURST 120 // cpu time percentage considered a burst
#define BURSTDOG_BURST_END 100 // cpu time percentage considered the end of a burst
#define BURSTDOG_SAMPLES 5 // how many samples a burst needs to persist for to be logged
#define BURSTDOG_USE_TOTAL 1 // 0: consider processes individually 1: consider total cpu usage and log top processes
#define BURSTDOG_CULPRITS 5 // for BURSTDOG_USE_TOTAL: how many processes are logged on burst

#if BURSTDOG_SAMPLES < 2
#error must consider at least 2 samples
#endif

// arbitrary limits, bump if necessary
#define MAX_PROCESS 4096
#define DIRBUFFER_SIZE 262144

// One tracked process. fd stays open on /proc/<pid>/stat between iterations;
// time[0] is the newest utime+stime reading, time[BURSTDOG_SAMPLES-1] the oldest.
struct process {
    unsigned int pid;
    int fd;
    char name[32];
    uint64_t time[BURSTDOG_SAMPLES];
};

// Table of tracked processes, filled in ascending pid order.
struct process_tab {
    size_t num;
    struct process arr[MAX_PROCESS];
};

static unsigned int pids[MAX_PROCESS];
static struct process_tab process_tabs[2];
// Aligned because it is reinterpreted as struct linux_dirent64 records,
// which contain 64-bit fields.
static _Alignas(uint64_t) char dirbuffer[DIRBUFFER_SIZE];
static char statbuffer[BUFSIZ];
static char msgbuffer[BUFSIZ];

struct linux_dirent64 {
    ino64_t d_ino; /* 64-bit inode number */
    off64_t d_off; /* Not an offset; see getdents() */
    unsigned short d_reclen; /* Size of this dirent */
    unsigned char d_type; /* File type */
    char d_name[]; /* Filename (null-terminated) */
};

// Index of each counter on the aggregate "cpu" line of /proc/stat.
enum cpu_stat_kind {
    CS_USER,
    CS_NICE,
    CS_SYSTEM,
    CS_IDLE,
    CS_IOWAIT,
    CS_IRQ,
    CS_SOFTIRQ,
    CS_STEAL,
    CS_GUEST,
    CS_GUEST_NICE,
    CS_COUNT, // number of counters parsed
};

// qsort comparator ordering pids (unsigned int) ascending.
static int compare_pid(const void *pa, const void *pb)
{
    unsigned int a = *(const unsigned int *) pa;
    unsigned int b = *(const unsigned int *) pb;
    return a > b ? 1 : a == b ? 0 : -1;
}

// True for the characters that terminate a word: space, newline, NUL.
static bool is_sep(char c)
{
    return c == ' ' || c == '\n' || c == '\0';
}

// Returns a pointer to the n-th (0-based) separator-delimited word inside
// ptr[0..size), or NULL if the buffer ends before that word is terminated.
// Separators between '(' and ')' do not end a word, so a parenthesised name
// containing spaces counts as one word. The separator following the word is
// overwritten with '\0', i.e. the buffer is modified; since '\0' is itself a
// separator, later calls on the same buffer still count words the same way.
// Every separator ends a word, so two adjacent separators yield an empty word.
static char *nth_word(unsigned int n, char *ptr, size_t size)
{
    size_t start = 0, pos = 0;
    bool paren = false;

    for (unsigned int i = 0; i <= n; i++) {
        if (i == n)
            start = pos;
        do {
            if (pos >= size)
                return NULL;
            if (ptr[pos] == '(')
                paren = true;
            if (ptr[pos] == ')')
                paren = false;
        } while (!is_sep(ptr[pos++]) || paren);
    }

    ptr[pos-1] = '\0';
    return &ptr[start];
}

// Writes all n bytes of buf to fd, continuing after short writes and
// retrying when interrupted by a signal. Returns false on any other error.
static bool write_all(int fd, const char *buf, size_t n)
{
    while (n > 0) {
        ssize_t written = write(fd, buf, n);
        if (written == -1) {
            if (errno == EINTR)
                continue;
            return false;
        }
        buf += written;
        n -= (size_t) written;
    }
    return true;
}

// Converts an accumulated snprintf() return value into the number of bytes
// actually stored in msgbuffer: snprintf reports the untruncated length, so
// the total may exceed the buffer when a message was cut short.
static size_t msg_length(int n_msg)
{
    if (n_msg < 0)
        return 0;
    if ((size_t) n_msg >= sizeof(msgbuffer))
        return sizeof(msgbuffer) - 1;
    return (size_t) n_msg;
}

// Writes the first n_msg bytes (clamped to the buffer) of msgbuffer to the
// log; a failed write is reported on stderr instead of being dropped silently.
static void log_message(int logfd, int n_msg)
{
    if (!write_all(logfd, msgbuffer, msg_length(n_msg)))
        perror("write logfile");
}

// Ticks consumed by proc between its oldest and newest retained sample.
static uint64_t recent_ticks(const struct process *proc)
{
    return proc->time[0] - proc->time[BURSTDOG_SAMPLES-1];
}

#if BURSTDOG_USE_TOTAL
// Logs a total-cpu burst line followed by up to BURSTDOG_CULPRITS lines for
// the processes in procs that consumed the most ticks over the sample window.
// total must be non-zero.
static void log_total_burst(int logfd, const char *timebuf,
                            const struct process_tab *procs,
                            uint64_t busy, uint64_t total, uint64_t share,
                            long nproc)
{
    // kept sorted by descending recent_ticks; unused slots are NULL
    const struct process *culprits[BURSTDOG_CULPRITS] = {NULL};

    for (size_t i = 0; i < procs->num; i++) {
        const struct process *proc = &procs->arr[i];
        uint64_t proc_busy = recent_ticks(proc);

        for (size_t j = 0; j < BURSTDOG_CULPRITS; j++) {
            if (!culprits[j] || recent_ticks(culprits[j]) < proc_busy) {
                // shift the tail down by one, dropping the last entry
                memmove(&culprits[j+1], &culprits[j],
                        sizeof(*culprits) * (BURSTDOG_CULPRITS-j-1));
                culprits[j] = proc;
                break;
            }
        }
    }

    int n_msg = 0;
    size_t off = msg_length(n_msg);
    n_msg += snprintf(msgbuffer+off, sizeof(msgbuffer)-off,
                      "%s: cpu is bursting: %"PRIu64" of %"PRIu64" ticks (%"PRIu64"%% of %ld cores)\n",
                      timebuf, busy, total, share, nproc);

    for (size_t i = 0; i < BURSTDOG_CULPRITS; i++) {
        const struct process *proc = culprits[i];
        if (!proc)
            break;

        uint64_t proc_busy = recent_ticks(proc);
        uint64_t proc_share = (uint64_t) nproc * proc_busy * 100 / total;

        off = msg_length(n_msg);
        n_msg += snprintf(msgbuffer+off, sizeof(msgbuffer)-off,
                          "%s: top %zu: %6u %s got: %"PRIu64" of %"PRIu64" ticks (%"PRIu64"%% of %ld cores)\n",
                          timebuf, i+1, proc->pid, proc->name, proc_busy, total, proc_share, nproc);
    }

    off = msg_length(n_msg);
    n_msg += snprintf(msgbuffer+off, sizeof(msgbuffer)-off, "\n");

    log_message(logfd, n_msg);
}
#endif

// Entry point. argv[1], if given, is the log file path (default
// "burstdog.log"). Loops forever, waking BURSTDOG_IV times per second;
// returns EXIT_FAILURE only when a required /proc access fails or more than
// MAX_PROCESS processes are found.
int main(int argc, char **argv)
{
    long nproc = sysconf(_SC_NPROCESSORS_ONLN);
    if (nproc < 1) // sysconf failed; avoid a negative multiplier below
        nproc = 1;

    char *logfile = argc > 1 ? argv[1] : "burstdog.log";

    // NOTE(review): O_DIRECT can impose alignment restrictions on the buffer
    // address and length of each write (see open(2)); msgbuffer and the
    // message lengths are not aligned. Kept as in the original - confirm that
    // this flag is intended rather than e.g. O_SYNC.
    int logfd = open(logfile, O_WRONLY | O_CREAT | O_APPEND | O_DIRECT, 0644);
    if (logfd == -1) {
        perror("open logfile");
        return EXIT_FAILURE;
    }

    int dirfd = open("/proc", O_RDONLY | O_DIRECTORY);
    if (dirfd == -1) {
        perror("open /proc");
        return EXIT_FAILURE;
    }

    int sysstatfd = openat(dirfd, "stat", O_RDONLY);
    if (sysstatfd == -1) {
        perror("open /proc/stat");
        return EXIT_FAILURE;
    }

    // number of valid entries in cpu_stats, saturating at BURSTDOG_SAMPLES
    size_t num_samples = 0;
    // system-wide tick counters, newest first
    struct {
        uint64_t busy;
        uint64_t total;
    } cpu_stats[BURSTDOG_SAMPLES] = {0};

    // total mode: current burst level (0 = not bursting)
    // per-process mode: pid of the process currently flagged as bursting
    unsigned int bursting = 0;

    struct process_tab *procs = &process_tabs[0], *oldprocs = &process_tabs[1];

    for (;;) {
        time_t time_v = time(NULL);
        struct tm time_s;
        localtime_r(&time_v, &time_s);
        char timebuf[128];
        strftime(timebuf, sizeof(timebuf), "%c", &time_s);

        // --- system-wide counters ---
        ssize_t n_read = pread(sysstatfd, statbuffer, BUFSIZ-1, 0);
        if (n_read == -1) {
            perror("pread /proc/stat");
            return EXIT_FAILURE;
        }
        statbuffer[n_read] = '\0';

        uint64_t cpu[CS_COUNT];
        for (size_t i = 0; i < CS_COUNT; i++) {
            // word 0 is the "cpu" label; the code has always skipped word 1
            // as well - presumably the empty word between the two spaces that
            // follow the label (TODO confirm against /proc/stat)
            char *str = nth_word((unsigned int) (2+i), statbuffer, (size_t) n_read + 1);
            if (!str) {
                fprintf(stderr, "failed to parse /proc/stat\n");
                return EXIT_FAILURE;
            }
            cpu[i] = strtoull(str, NULL, 10);
        }

        uint64_t idle = cpu[CS_IDLE] + cpu[CS_IOWAIT];
        uint64_t nonidle = cpu[CS_USER] + cpu[CS_NICE] + cpu[CS_SYSTEM]
                         + cpu[CS_IRQ] + cpu[CS_SOFTIRQ] + cpu[CS_STEAL];

        // shift history back by one and store the newest sample in front
        memmove(&cpu_stats[1], &cpu_stats[0], sizeof(*cpu_stats) * (BURSTDOG_SAMPLES-1));
        cpu_stats[0].busy = nonidle;
        cpu_stats[0].total = idle+nonidle;
        if (num_samples < BURSTDOG_SAMPLES)
            num_samples++;

        // Ticks elapsed over the whole sample window; 0 while the window is
        // not yet full or if no tick elapsed. Burst checks are skipped in
        // that case, which also prevents a division by zero.
        uint64_t window_total = 0;
        if (num_samples == BURSTDOG_SAMPLES)
            window_total = cpu_stats[0].total - cpu_stats[BURSTDOG_SAMPLES-1].total;

        // --- enumerate pids ---
        if (lseek(dirfd, 0, SEEK_SET) != 0) {
            perror("seek /proc");
            return EXIT_FAILURE;
        }

        size_t num_pids = 0;
        for (;;) {
            // a return value of 0 marks the end of the directory
            ssize_t n_dirent = getdents64(dirfd, dirbuffer, DIRBUFFER_SIZE);
            if (n_dirent == -1) {
                perror("getdents64 /proc");
                return EXIT_FAILURE;
            }
            if (n_dirent == 0)
                break;

            char *dirptr = dirbuffer;
            while (dirptr < dirbuffer + n_dirent) {
                struct linux_dirent64 *d = (void*) dirptr;
                dirptr += d->d_reclen;

                if (d->d_type != DT_DIR && d->d_type != DT_UNKNOWN)
                    continue;
                // only fully numeric, non-zero names are treated as pids
                if (d->d_name[0] < '0' || d->d_name[0] > '9')
                    continue;
                char *end;
                unsigned long pid = strtoul(d->d_name, &end, 10);
                if (*end != '\0' || pid == 0 || pid > UINT32_MAX)
                    continue;

                if (num_pids == MAX_PROCESS) {
                    fprintf(stderr, "too many processes\n");
                    return EXIT_FAILURE;
                }
                pids[num_pids++] = (unsigned int) pid;
            }
        }

        qsort(pids, num_pids, sizeof(*pids), &compare_pid);

        // --- per-process counters ---
        // Both pids and oldprocs are sorted by pid, so they are merged in a
        // single pass: entries of oldprocs whose pid vanished get their fd
        // closed, matching entries hand their fd and history over to procs.
        procs->num = 0;
        size_t oldproc_idx = 0;
        for (size_t i = 0; i < num_pids; i++) {
            struct process *oldproc = NULL;
            if (num_samples > 1) {
                while (oldproc_idx < oldprocs->num && oldprocs->arr[oldproc_idx].pid < pids[i])
                    close(oldprocs->arr[oldproc_idx++].fd);

                if (oldproc_idx < oldprocs->num && oldprocs->arr[oldproc_idx].pid == pids[i])
                    oldproc = &oldprocs->arr[oldproc_idx++];
            }

            int statfd;
            if (oldproc) {
                statfd = oldproc->fd;
            } else {
                char statname[20];
                snprintf(statname, sizeof(statname), "%u/stat", pids[i]);
                statfd = openat(dirfd, statname, O_RDONLY);
                if (statfd == -1) // fail silently
                    continue;
            }

            ssize_t n_stat = pread(statfd, statbuffer, BUFSIZ-1, 0);
            if (n_stat == -1) {
                close(statfd);
                continue;
            }
            statbuffer[n_stat] = '\0';

            // word 1 is the parenthesised name, words 13 and 14 utime and stime
            char *name = nth_word(1, statbuffer, (size_t) n_stat + 1);
            char *utime = nth_word(13, statbuffer, (size_t) n_stat + 1);
            char *stime = nth_word(14, statbuffer, (size_t) n_stat + 1);
            if (!name || !utime || !stime) {
                close(statfd);
                continue;
            }

            struct process *proc = &procs->arr[procs->num++];
            proc->pid = pids[i];
            proc->fd = statfd;
            proc->time[0] = strtoull(utime, NULL, 10) + strtoull(stime, NULL, 10);
            // snprintf always NUL-terminates, truncating over-long names
            snprintf(proc->name, sizeof(proc->name), "%s", name);

            if (oldproc) {
                memcpy(&proc->time[1], &oldproc->time[0], sizeof(*proc->time) * (BURSTDOG_SAMPLES-1));
            } else {
                // new process: pretend it used nothing over the window so far
                for (size_t j = 1; j < BURSTDOG_SAMPLES; j++)
                    proc->time[j] = proc->time[0];
            }

#if !BURSTDOG_USE_TOTAL
            if (window_total != 0) {
                uint64_t proc_busy = recent_ticks(proc);
                uint64_t proc_share = (uint64_t) nproc * proc_busy * 100 / window_total;

                if (bursting == proc->pid && proc_share < BURSTDOG_BURST_END) {
                    bursting = 0;
                } else if (bursting != proc->pid && proc_share >= BURSTDOG_BURST) {
                    int n_msg = snprintf(msgbuffer, sizeof(msgbuffer),
                                         "%s: %6u %s is bursting: %"PRIu64" of %"PRIu64" ticks (%"PRIu64"%% of %ld cores)\n",
                                         timebuf, proc->pid, proc->name, proc_busy, window_total, proc_share, nproc);
                    log_message(logfd, n_msg);
                    bursting = proc->pid;
                }
            }
#endif
        }

        // whatever is left in oldprocs belongs to processes that are gone
        while (oldproc_idx < oldprocs->num)
            close(oldprocs->arr[oldproc_idx++].fd);

#if BURSTDOG_USE_TOTAL
        if (window_total != 0) {
            uint64_t busy = cpu_stats[0].busy - cpu_stats[BURSTDOG_SAMPLES-1].busy;
            uint64_t share = (uint64_t) nproc * busy * 100 / window_total;

            bool do_log = false;
            if (share < BURSTDOG_BURST_END) {
                bursting = 0;
            } else if (share >= BURSTDOG_BURST) {
                // one level per 50 percentage points above the threshold;
                // log only when the level rises above the last recorded one
                uint64_t level = (share-BURSTDOG_BURST)/50+1;
                if (level > bursting)
                    do_log = true;
                bursting = level;
            }

            if (do_log)
                log_total_burst(logfd, timebuf, procs, busy, window_total, share, nproc);
        }
#endif

        // the table just filled becomes the history for the next iteration
        struct process_tab *tmp = oldprocs;
        oldprocs = procs;
        procs = tmp;

        usleep(1000000/BURSTDOG_IV);
    }
}