/*
 * CPU burst watchdog.
 *
 * Several times per second, reads /proc/<pid>/stat for every process and
 * appends a line to a log file whenever one process consumed at least
 * WATCHDOG_BURST percent of the elapsed clock ticks over the last
 * WATCHDOG_SAMPLES samples.
 *
 * Usage: watchdog [logfile]      (default logfile: watchdog.log)
 */
#define _GNU_SOURCE
#include <dirent.h>
#include <fcntl.h>
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/stat.h>
#include <sys/types.h>
#include <time.h>
#include <unistd.h>

// settings
#define WATCHDOG_IV 10 // how often to wake up, per second
#define WATCHDOG_BURST 98 // cpu time percentage considered a burst
#define WATCHDOG_BURST_END 90 // cpu time percentage considered the end of a burst
#define WATCHDOG_SAMPLES 5 // how many samples a burst needs to persist for to be logged

#if WATCHDOG_SAMPLES < 2
#error must consider at least 2 samples
#endif

// arbitrary limits, bump if necessary
#define MAX_PROCESS 4096
#define DIRBUFFER_SIZE 262144

// Positions of utime/stime counted from the "state" field, i.e. from the
// first field that follows the "(comm)" field of /proc/<pid>/stat.
enum {
    STAT_FIELD_UTIME = 11,
    STAT_FIELD_STIME = 12
};

// One tracked process.
struct process {
    unsigned int pid;
    int fd;                              // open /proc/<pid>/stat, reused across samples
    bool bursting;                       // a burst was logged and has not ended yet
    unsigned int time[WATCHDOG_SAMPLES]; // utime+stime in ticks, newest sample first
};

// Processes seen during one sampling pass, sorted by pid.
struct process_tab {
    size_t num;
    struct process arr[MAX_PROCESS];
};

static unsigned int pids[MAX_PROCESS];
static struct process_tab process_tabs[2];
// Aligned because the buffer is accessed through struct linux_dirent64 pointers.
static char dirbuffer[DIRBUFFER_SIZE] __attribute__((aligned(8)));
static char statbuffer[BUFSIZ];
static char linebuffer[BUFSIZ];

// Record layout filled in by getdents64().
struct linux_dirent64 {
    ino64_t d_ino;           /* 64-bit inode number */
    off64_t d_off;           /* Not an offset; see getdents() */
    unsigned short d_reclen; /* Size of this dirent */
    unsigned char d_type;    /* File type */
    char d_name[];           /* Filename (null-terminated) */
};

// qsort() comparator ordering pids ascending.
static int compare_pid(const void *pa, const void *pb)
{
    unsigned int a = *(const unsigned int *) pa;
    unsigned int b = *(const unsigned int *) pb;
    return a > b ? 1 : a == b ? 0 : -1;
}

// A word ends at a space or at a NUL (NULs are written by earlier nth_word calls).
static bool is_sep(char c)
{
    return c == ' ' || c == '\0';
}

/*
 * Returns a pointer to the n-th (0-based) separator-delimited word in
 * ptr[0..size) and NUL-terminates that word in place.
 *
 * size must include the buffer's terminating NUL. Returns NULL if the
 * buffer holds fewer than n+1 words. Because NUL counts as a separator,
 * the function can be called repeatedly on the same buffer.
 */
static char *nth_word(unsigned int n, char *ptr, size_t size)
{
    size_t start = 0, pos = 0;

    for (unsigned int i = 0; i <= n; i++) {
        start = pos;
        do {
            if (pos >= size)
                return NULL;
        } while (!is_sep(ptr[pos++]));
    }

    ptr[pos - 1] = '\0';
    return &ptr[start];
}

/*
 * Fills the global pids[] with every numeric directory name found in the
 * directory open at dirfd, sorted ascending.
 *
 * Returns the number of pids, or -1 after printing a message to stderr.
 */
static ssize_t scan_pids(int dirfd)
{
    if (lseek(dirfd, 0, SEEK_SET) != 0) {
        perror("seek /proc");
        return -1;
    }

    size_t num_pids = 0;
    // Keep reading until end of directory so entries are not dropped when
    // the listing does not fit into a single buffer.
    for (;;) {
        ssize_t n_dirent = getdents64(dirfd, dirbuffer, DIRBUFFER_SIZE);
        if (n_dirent == -1) {
            perror("getdents64 /proc");
            return -1;
        }
        if (n_dirent == 0)
            break;

        for (char *dirptr = dirbuffer; dirptr < dirbuffer + n_dirent; ) {
            struct linux_dirent64 *d = (void *) dirptr;
            dirptr += d->d_reclen;

            if (d->d_type != DT_DIR && d->d_type != DT_UNKNOWN)
                continue;
            // Only names made purely of digits (and not starting with 0) are pids.
            if (d->d_name[0] < '1' || d->d_name[0] > '9')
                continue;
            char *end;
            unsigned long pid = strtoul(d->d_name, &end, 10);
            if (*end != '\0')
                continue;

            if (num_pids == MAX_PROCESS) {
                fprintf(stderr, "too many processes\n");
                return -1;
            }
            pids[num_pids++] = (unsigned int) pid;
        }
    }

    qsort(pids, num_pids, sizeof pids[0], &compare_pid);
    return (ssize_t) num_pids;
}

/*
 * Appends one "is bursting" line to the log. Logging is best effort: a
 * failed write is reported on stderr but does not stop the watchdog.
 */
static void log_burst(int logfd, unsigned int pid, const char *name,
                      unsigned int subset, unsigned int total, unsigned int share)
{
    time_t time_v = time(NULL);
    struct tm time_s;
    localtime_r(&time_v, &time_s);

    char timebuf[128];
    if (strftime(timebuf, sizeof timebuf, "%c", &time_s) == 0)
        timebuf[0] = '\0'; // buffer contents are unspecified on failure

    int n_line = snprintf(linebuffer, sizeof linebuffer,
                          "%s: %5u %s is bursting: %u of %u ticks (%u%%)\n",
                          timebuf, pid, name, subset, total, share);
    if (n_line < 0)
        return;

    // snprintf returns the untruncated length; never write past the buffer.
    size_t len = (size_t) n_line;
    if (len >= sizeof linebuffer)
        len = sizeof linebuffer - 1;

    if (write(logfd, linebuffer, len) == -1)
        perror("write logfile");
}

int main(int argc, char **argv)
{
    long clock_tick = sysconf(_SC_CLK_TCK);
    if (clock_tick <= 0) {
        fprintf(stderr, "cannot determine clock tick rate\n");
        return EXIT_FAILURE;
    }

    const char *logfile = argc > 1 ? argv[1] : "watchdog.log";
    // O_SYNC rather than O_DIRECT: direct I/O requires block-aligned
    // buffers, lengths and offsets, which short appended text lines never
    // satisfy, so such writes are rejected.
    int logfd = open(logfile, O_WRONLY | O_CREAT | O_APPEND | O_SYNC, 0644);
    if (logfd == -1) {
        perror("open logfile");
        return EXIT_FAILURE;
    }

    int dirfd = open("/proc", O_RDONLY | O_DIRECTORY);
    if (dirfd == -1) {
        perror("open /proc");
        return EXIT_FAILURE;
    }

    unsigned int times[WATCHDOG_SAMPLES]; // sample timestamps in ticks, newest first
    unsigned int num_samples = 0;
    struct process_tab *procs = &process_tabs[0], *oldprocs = &process_tabs[1];

    for (;;) {
        // Shift the timestamp history by one slot; the oldest entry drops
        // off once the window is full.
        if (num_samples < WATCHDOG_SAMPLES)
            num_samples++;
        memmove(&times[1], &times[0], sizeof times[0] * (num_samples - 1));

        struct timespec ts;
        if (clock_gettime(CLOCK_BOOTTIME, &ts) == -1) {
            perror("clock_gettime");
            return EXIT_FAILURE;
        }
        // Unsigned wrap-around is fine: only differences are used.
        times[0] = (unsigned int) ((long long) ts.tv_sec * clock_tick
                 + (long long) ts.tv_nsec * clock_tick / 1000000000);

        ssize_t scanned = scan_pids(dirfd);
        if (scanned == -1)
            return EXIT_FAILURE;
        size_t num_pids = (size_t) scanned;

        // Merge the sorted pid list with the (sorted) table of the previous
        // pass. oldprocs->num is 0 on the very first pass.
        procs->num = 0;
        size_t oldproc_idx = 0;
        for (size_t i = 0; i < num_pids; i++) {
            // Processes that vanished since the previous pass.
            while (oldproc_idx < oldprocs->num && oldprocs->arr[oldproc_idx].pid < pids[i])
                close(oldprocs->arr[oldproc_idx++].fd);

            struct process *oldproc = NULL;
            if (oldproc_idx < oldprocs->num && oldprocs->arr[oldproc_idx].pid == pids[i])
                oldproc = &oldprocs->arr[oldproc_idx++];

            int statfd;
            if (oldproc) {
                statfd = oldproc->fd;
            } else {
                char statname[20];
                snprintf(statname, sizeof statname, "%u/stat", pids[i]);
                statfd = openat(dirfd, statname, O_RDONLY);
                if (statfd == -1) // fail silently
                    continue;
            }

            ssize_t n_read = pread(statfd, statbuffer, BUFSIZ - 1, 0);
            if (n_read == -1) {
                close(statfd);
                continue;
            }
            statbuffer[n_read] = '\0';

            // The comm field is "(name)" and the name itself may contain
            // spaces and parentheses, so delimit it by the first '(' and
            // the LAST ')' instead of splitting on spaces.
            char *name = strchr(statbuffer, '(');
            char *name_end = strrchr(statbuffer, ')');
            if (!name || !name_end || name_end < name) {
                close(statfd);
                continue;
            }
            name_end[1] = '\0'; // terminate "(name)"; this was the separating space

            // name_end + 2 is at most one past the terminating NUL, in which
            // case fields_size is 0 and nth_word() returns NULL.
            char *fields = name_end + 2;
            size_t fields_size = (size_t) (statbuffer + n_read + 1 - fields);
            char *utime = nth_word(STAT_FIELD_UTIME, fields, fields_size);
            char *stime = nth_word(STAT_FIELD_STIME, fields, fields_size);
            if (!utime || !stime) {
                close(statfd);
                continue;
            }

            struct process *proc = &procs->arr[procs->num++];
            proc->pid = pids[i];
            proc->fd = statfd;
            proc->time[0] = (unsigned int) strtoul(utime, NULL, 10)
                          + (unsigned int) strtoul(stime, NULL, 10);
            if (oldproc) {
                proc->bursting = oldproc->bursting;
                memcpy(&proc->time[1], &oldproc->time[0],
                       (WATCHDOG_SAMPLES - 1) * sizeof proc->time[0]);
            } else {
                // New process: pretend it has been idle for the whole window.
                proc->bursting = false;
                for (size_t j = 1; j < WATCHDOG_SAMPLES; j++)
                    proc->time[j] = proc->time[0];
            }

            if (num_samples != WATCHDOG_SAMPLES)
                continue; // window not filled yet

            unsigned int total = times[0] - times[WATCHDOG_SAMPLES - 1];
            if (total == 0)
                continue; // no measurable time elapsed; avoid dividing by zero
            unsigned int subset = proc->time[0] - proc->time[WATCHDOG_SAMPLES - 1];
            unsigned int share = (unsigned int) ((unsigned long long) subset * 100 / total);

            // Burst state is kept per process so that several processes
            // bursting at once are each logged exactly once.
            if (proc->bursting) {
                if (share < WATCHDOG_BURST_END)
                    proc->bursting = false;
            } else if (share >= WATCHDOG_BURST) {
                log_burst(logfd, proc->pid, name, subset, total, share);
                proc->bursting = true;
            }
        }

        // Whatever is left in the old table no longer exists.
        while (oldproc_idx < oldprocs->num)
            close(oldprocs->arr[oldproc_idx++].fd);

        struct process_tab *tmp = oldprocs;
        oldprocs = procs;
        procs = tmp;

        usleep(1000000 / WATCHDOG_IV);
    }
}