Gaudi Framework, version v22r2

Home   Generated: Tue May 10 2011
Classes | Typedefs | Functions

apmon_mon_utils Namespace Reference

Classes

struct  PsInfo
 Structure that holds information about a job, as obtained from the ps command. More...
struct  JobDirInfo
 Structure that holds information about the disk usage for a job. More...

Typedefs

typedef struct
apmon_mon_utils::PsInfo 
PsInfo
 Structure that holds information about a job, as obtained from the ps command.
typedef struct
apmon_mon_utils::JobDirInfo 
JobDirInfo
 Structure that holds information about the disk usage for a job.

Functions

long * getChildren (long pid, int &nChildren) throw (runtime_error)
 Determines all the descendants of a given process.
void readJobInfo (long pid, PsInfo &info) throw (runtime_error)
 Obtains monitoring information for a given job and all its sub-jobs (descendant processes) with the aid of the ps command.
double parsePSTime (char *s)
 Function that parses a time formatted like "days-hours:min:sec" and returns the corresponding number of seconds.
void readJobDiskUsage (MonitoredJob job, JobDirInfo &info) throw (runtime_error)
 If there is a work directory defined, then compute the used space in that directory and the free disk space on the partition to which that directory belongs.

Typedef Documentation

typedef struct apmon_mon_utils::JobDirInfo apmon_mon_utils::JobDirInfo

Structure that holds information about the disk usage for a job.

typedef struct apmon_mon_utils::PsInfo apmon_mon_utils::PsInfo

Structure that holds information about a job, as obtained from the ps command.


Function Documentation

long * apmon_mon_utils::getChildren ( long  pid,
int &  nChildren 
) throw (runtime_error)

Determines all the descendants of a given process.

Definition at line 883 of file monitor_utils.cpp.

                       {
#ifdef WIN32
        return 0;
#else
  /*
   * Builds the list made of `pid` and all of its descendants.
   *
   * A /bin/sh child runs "ps --no-headers -A -o ppid,pid" into a temporary
   * file and "wc -l" on that file into a second one; both files are created
   * in /tmp and named after the PID of the calling process. The (ppid, pid)
   * table is then walked starting from `pid`.
   *
   * pid       - root of the process tree to inspect.
   * nChildren - receives the number of entries in the returned array (the
   *             root included); set to 0 when the process is not found.
   *
   * Returns a malloc()-ed array whose first element is `pid`, followed by
   * its descendants; it has to be released with free().
   *
   * Throws runtime_error when fork() or waitpid() fails, when the temporary
   * files cannot be read, when memory cannot be allocated, or when `pid`
   * shows up neither as a pid nor as a ppid in the ps output.
   */
  FILE *pf;
  long *pids, *ppids, *children, *shrunk;
  int nProcesses = 0;
  int capacity, scanned;
  int i, j, status;
  pid_t cpid;
  char *argv[4], msg[MAX_STRING_LEN], sval[32];
  bool processFound;
  long mypid = getpid();
  char children_f[50], np_f[50], cmd[200];

  /* generate the names of the temporary files in which we have the output
     of some commands */
  snprintf(children_f, sizeof(children_f), "/tmp/apmon_children%ld", mypid);
  snprintf(np_f, sizeof(np_f), "/tmp/apmon_np%ld", mypid);

  switch (cpid = fork()) {
  case -1:
    throw runtime_error("[ getChildren() ] Unable to fork()");
  case 0:
    argv[0] = (char *)"/bin/sh"; argv[1] = (char *)"-c";
    snprintf(cmd, sizeof(cmd),
             "ps --no-headers -A -o ppid,pid > %s && wc -l %s > %s",
             children_f, children_f, np_f);
    argv[2] = cmd;
    argv[3] = 0;
    execv("/bin/sh", argv);
    /* only reached when execv() failed; _exit() keeps the child from
       flushing stdio buffers and running atexit handlers inherited from
       the parent */
    _exit(RET_ERROR);
  default:
    if (waitpid(cpid, &status, 0) == -1) {
      snprintf(msg, sizeof(msg), "[ getChildren() ] The number of sub-processes for %ld could not be determined", pid);
      unlink(children_f); unlink(np_f);
      throw runtime_error(msg);
    }
  }

  /* find the number of processes */
  pf = fopen(np_f, "rt");
  if (pf == NULL) {
    unlink(np_f); unlink(children_f);
    snprintf(msg, sizeof(msg), "[ getChildren() ] The number of sub-processes for %ld could not be determined",
             pid);
    throw runtime_error(msg);
  }
  scanned = fscanf(pf, "%d", &nProcesses);
  fclose(pf);
  unlink(np_f);
  if (scanned != 1 || nProcesses < 0) {
    /* an unreadable count must not be used to size the tables below */
    unlink(children_f);
    snprintf(msg, sizeof(msg), "[ getChildren() ] The number of sub-processes for %ld could not be determined",
             pid);
    throw runtime_error(msg);
  }

  /* one extra slot: the result also holds the root pid itself, which is not
     necessarily one of the rows printed by ps */
  capacity = nProcesses + 1;
  pids = (long *)malloc(capacity * sizeof(long));
  ppids = (long *)malloc(capacity * sizeof(long));
  /* estimated maximum size for the returned vector; it will be realloc'ed */
  children = (long *)malloc(capacity * sizeof(long));

  pf = (pids != NULL && ppids != NULL && children != NULL)
         ? fopen(children_f, "rt") : NULL;
  if (pf == NULL) {
    free(pids); free(ppids); free(children);
    unlink(children_f);
    snprintf(msg, sizeof(msg), "[ getChildren() ] The sub-processes for %ld could not be determined", pid);
    throw runtime_error(msg);
  }

  /* scan the output of the ps command and find the children of the process,
   and also check if the process is still running */
  children[0] = pid; nChildren = 1;
  processFound = false;
  for (i = 0; i < nProcesses; i++) {
    if (fscanf(pf, "%ld %ld", &ppids[i], &pids[i]) != 2) {
      /* fewer usable rows than wc reported: keep only what was read */
      nProcesses = i;
      break;
    }
    /* look for the given process */
    if (pids[i] == pid || ppids[i] == pid)
      processFound = true;
    if (ppids[i] == pid && nChildren < capacity)
      children[nChildren++] = pids[i];
  }
  fclose(pf);
  unlink(children_f);

  if (processFound == false) {
    free(pids); free(ppids); free(children);
    nChildren = 0;
    snprintf(msg, sizeof(msg), "[ getChildren() ] The process %ld does not exist", pid);
    throw runtime_error(msg);
  }

  /* find the PIDs of all the descendant processes: every entry appended to
     the list is later visited itself; the capacity test bounds the walk */
  for (i = 1; i < nChildren; i++) {
    /* find the children of the i-th child */
    for (j = 0; j < nProcesses; j++) {
      if (ppids[j] == children[i] && nChildren < capacity)
        children[nChildren++] = pids[j];
    }
  }

  snprintf(msg, sizeof(msg), "Sub-processes for process %ld: ", pid);
  for (i = 0; i < nChildren; i++) {
    snprintf(sval, sizeof(sval), "%ld ", children[i]);
    if (strlen(msg) + strlen(sval) < MAX_STRING_LEN - 1)
      strcat(msg, sval);
  }
  logger(DEBUG, msg);

  free(pids); free(ppids);
  /* shrink to the used size; if realloc() fails the original (larger)
     buffer is still valid and is returned as it is */
  shrunk = (long *)realloc(children, (nChildren) * sizeof(long));
  if (shrunk != NULL)
    children = shrunk;
  return children;
#endif
}
double apmon_mon_utils::parsePSTime ( char *  s )

Function that parses a time formatted like "days-hours:min:sec" and returns the corresponding number of seconds.

Definition at line 1183 of file monitor_utils.cpp.

                                           {
  /*
   * Converts a time string to a number of seconds. Three layouts are
   * recognised:
   *   "days-hours:min:sec"  (the string contains a '-')
   *   "hours:min:sec"       (at least two ':')
   *   "min:sec"             (exactly one ':')
   *
   * s - NUL-terminated string to parse.
   *
   * Returns the number of seconds, or RET_ERROR when `s` is NULL, matches
   * none of the layouts, or does not hold the numeric fields its layout
   * requires.
   */
  long days, hours, mins, secs;
  const char *firstColon;

  if (s == NULL)
    return RET_ERROR;

  if (strchr(s, '-') != NULL) {
    /* every field must convert, otherwise the locals would be read
       uninitialised */
    if (sscanf(s, "%ld-%ld:%ld:%ld", &days, &hours, &mins, &secs) != 4)
      return RET_ERROR;
    return 24. * 3600 * days + 3600 * hours + 60 * mins + secs;
  }

  firstColon = strchr(s, ':');
  if (firstColon == NULL)
    return RET_ERROR;

  if (firstColon != strrchr(s, ':')) {
    /* first and last ':' differ, so there are at least two of them */
    if (sscanf(s, "%ld:%ld:%ld", &hours, &mins, &secs) != 3)
      return RET_ERROR;
    return 3600. * hours + 60 * mins + secs;
  }

  if (sscanf(s, "%ld:%ld", &mins, &secs) != 2)
    return RET_ERROR;
  return 60. * mins + secs;
}
void apmon_mon_utils::readJobDiskUsage ( MonitoredJob  job,
JobDirInfo &  info 
) throw (runtime_error)

If there is a work directory defined, then compute the used space in that directory and the free disk space on the partition to which that directory belongs.

Sizes are given in MB.

Definition at line 1204 of file monitor_utils.cpp.

                                                                       {
#ifndef WIN32
  /*
   * Fills `info` with the size of the job's work directory and with the
   * total / used / free space and usage figure of the partition holding it.
   *
   * Two /bin/sh children are run, one wrapping "du -Lsk <workdir>" and one
   * wrapping "df -m <workdir>"; each writes to a temporary file in /tmp
   * named after the PID of the calling process. du reports KB, which is
   * divided by 1024 here, and df -m reports MB, so sizes are given in MB.
   *
   * job  - job.workdir is the directory to measure; job.pid is only used in
   *        the error messages.
   * info - receives workdir_size, disk_total, disk_used, disk_free and
   *        disk_usage.
   *
   * Throws runtime_error when no work directory is set, when fork(),
   * waitpid() or the allocation of the command buffer fails, or when an
   * output file cannot be opened or does not hold the expected numbers.
   *
   * NOTE(review): job.workdir is pasted unquoted into a shell command line,
   * and the command uses "[[ ... ]]" and "exit -1", which not every /bin/sh
   * accepts -- to be confirmed against the supported platforms.
   */
  int status;
  pid_t cpid;
  char *cmd, *argv[4], msg[256];
  FILE *fp;
  long mypid = getpid();
  char du_f[50], df_f[50];

  /* generate names for the temporary files which will hold the output of the
     du and df commands */
  snprintf(du_f, sizeof(du_f), "/tmp/apmon_du%ld", mypid);
  snprintf(df_f, sizeof(df_f), "/tmp/apmon_df%ld", mypid);

  if (strlen(job.workdir) == 0) {
    snprintf(msg, sizeof(msg), "[ readJobDiskUsage() ] The working directory for the job %ld was not specified, not monitoring disk usage", job.pid);
    throw runtime_error(msg);
  }

  cmd = (char *)malloc((300 + 2 * strlen(job.workdir)) * sizeof(char));
  if (cmd == NULL) {
    snprintf(msg, sizeof(msg), "[ readJobDiskUsage() ] The disk usage (du) information for %ld could not be determined", job.pid);
    throw runtime_error(msg);
  }
  strcpy(cmd, "PRT=`du -Lsk ");
  strcat(cmd, job.workdir);
  strcat(cmd, " ` ; if [[ $? -eq 0 ]] ; then OUT=`echo $PRT | cut -f 1` ; echo $OUT ; exit 0 ; else exit -1 ; fi > ");
  strcat(cmd, du_f);

  switch (cpid = fork()) {
  case -1:
    free(cmd);
    snprintf(msg, sizeof(msg), "[ readJobDiskUsage() ] Unable to fork(). The disk usage information could not be determined for %ld", job.pid);
    throw runtime_error(msg);
  case 0:
    argv[0] = (char *)"/bin/sh"; argv[1] = (char *)"-c";
    argv[2] = cmd; argv[3] = 0;
    execv("/bin/sh", argv);
    /* only reached when execv() failed; leave without running the exit
       handlers and stdio flushes inherited from the parent */
    _exit(RET_ERROR);
  default:
    if (waitpid(cpid, &status, 0) == -1) {
      free(cmd);
      snprintf(msg, sizeof(msg), "[ readJobDiskUsage() ] The disk usage (du) information for %ld could not be determined", job.pid);
      unlink(du_f); unlink(df_f);
      throw runtime_error(msg);
    }
  }

  /* the buffer is reused for the df command */
  strcpy(cmd, "PRT=`df -m ");
  strcat(cmd, job.workdir);
  strcat(cmd, " `; if [[ $? -eq 0 ]] ; then OUT=`echo $PRT | cut -d ' ' -f 8-` ; echo $OUT ; exit 0 ; else exit -1 ; fi > ");
  strcat(cmd, df_f);

  switch (cpid = fork()) {
  case -1:
    free(cmd);
    unlink(du_f);
    snprintf(msg, sizeof(msg), "[ readJobDiskUsage() ] Unable to fork(). The disk usage information could not be determined for %ld", job.pid);
    throw runtime_error(msg);
  case 0:
    argv[0] = (char *)"/bin/sh"; argv[1] = (char *)"-c";
    argv[2] = cmd; argv[3] = 0;
    execv("/bin/sh", argv);
    _exit(RET_ERROR);
  default:
    if (waitpid(cpid, &status, 0) == -1) {
      free(cmd);
      snprintf(msg, sizeof(msg), "[ readJobDiskUsage() ] The disk usage (df) information for %ld could not be determined", job.pid);
      unlink(du_f); unlink(df_f);
      throw runtime_error(msg);
    }
  }

  free(cmd);
  fp = fopen(du_f, "rt");
  if (fp == NULL) {
    unlink(du_f); unlink(df_f);
    snprintf(msg, sizeof(msg), "[ readJobDiskUsage() ] Error opening du output file for process %ld", job.pid);
    throw runtime_error(msg);
  }

  if (fscanf(fp, "%lf", &(info.workdir_size)) != 1) {
    /* nothing usable was written by du: report it instead of handing back
       an uninitialised size */
    fclose(fp);
    unlink(du_f); unlink(df_f);
    snprintf(msg, sizeof(msg), "[ readJobDiskUsage() ] The disk usage (du) information for %ld could not be determined", job.pid);
    throw runtime_error(msg);
  }
  /* keep the directory size in MB */
  info.workdir_size /= 1024.0;
  fclose(fp);
  unlink(du_f);

  fp = fopen(df_f, "rt");
  if (fp == NULL) {
    unlink(df_f);
    snprintf(msg, sizeof(msg), "[ readJobDiskUsage() ] Error opening df output file for process %ld", job.pid);
    throw runtime_error(msg);
  }
  /* the first column is not needed, so it is skipped with %*s rather than
     copied into a fixed-size buffer */
  if (fscanf(fp, "%*s %lf %lf %lf %lf", &(info.disk_total),
             &(info.disk_used), &(info.disk_free), &(info.disk_usage)) != 4) {
    fclose(fp);
    unlink(df_f);
    snprintf(msg, sizeof(msg), "[ readJobDiskUsage() ] The disk usage (df) information for %ld could not be determined", job.pid);
    throw runtime_error(msg);
  }
  fclose(fp);
  unlink(df_f);
#endif
}
void apmon_mon_utils::readJobInfo ( long  pid,
PsInfo &  info 
) throw (runtime_error)

Obtains monitoring information for a given job and all its sub-jobs (descendant processes) with the aid of the ps command.

Definition at line 999 of file monitor_utils.cpp.

                                                                             {
#ifndef WIN32
  /*
   * Collects monitoring figures for the job `pid` and all of its descendant
   * processes and accumulates them in `info`.
   *
   * The descendants are obtained with getChildren(); a /bin/sh child then
   * runs "ps --no-headers --pid <list> -o pid,etime,time,%cpu,%mem,rsz,vsz,comm"
   * into a temporary file in /tmp named after the PID of the calling
   * process, and that file is parsed line by line:
   *   - info.etime is the maximum elapsed time over all lines;
   *   - info.cputime and info.pcpu are summed over all lines;
   *   - info.pmem, info.rsz, info.vsz and info.open_fd are summed only once
   *     per distinct (rsz, vsz, command name) combination.
   *
   * Throws runtime_error when the job has too many sub-processes for the
   * command buffer, when fork(), waitpid() or an allocation fails, or when
   * the ps output cannot be opened or parsed; errors raised by getChildren()
   * are passed on unchanged.
   */
  long *children;
  FILE *fp;
  int i, nChildren, status, ch, ret, open_fd;
  char *cmd , *mem_cmd_s, *argv[4], *ret_s;
  char pid_s[32], msg[256];
  char cmdName[MAX_STRING_LEN1], buf[MAX_STRING_LEN1], buf2[MAX_STRING_LEN1];
  char etime_s[20], cputime_s[20];
  double rsz, vsz;
  double etime, cputime;
  double pcpu, pmem;
  /* this list contains strings of the form "rsz_vsz_command" for every pid;
     it is used to avoid adding several times processes that have multiple
     threads and appear in ps as separate processes, occupying exactly the
     same amount of memory and having the same command name. For every line
     from the output of the ps command we verify if the rsz_vsz_command
     combination is already in the list.
  */
  char **mem_cmd_list;
  int listSize;
  long cpid, crt_pid;
  long mypid = getpid();
  char ps_f[50];

  /* get the list of the process' descendants */
  children = getChildren(pid, nChildren);

  /* generate a name for the temporary file which holds the output of the
     ps command */
  snprintf(ps_f, sizeof(ps_f), "/tmp/apmon_ps%ld", mypid);

  /* 21 characters per pid: enough for any long value plus the comma */
  unsigned int cmdLen = (150 + 21 * nChildren) * sizeof(char);
  cmd = (char *)malloc (cmdLen);
  if (cmd == NULL) {
    free(children);
    snprintf(msg, sizeof(msg), "[ readJobInfo() ] The job information for %ld could not be determined", pid);
    throw runtime_error(msg);
  }

  /* issue the "ps" command to obtain information on all the descendants */
  strcpy(cmd, "ps --no-headers --pid ");
  for (i = 0; i < nChildren - 1; i++) {
    snprintf(pid_s, sizeof(pid_s), "%ld,", children[i]);
    if (strlen(cmd) + strlen(pid_s) + 1 >= cmdLen) {
      free(cmd);
      free(children);
      snprintf(msg, sizeof(msg), "[ readJobInfo() ] Job %ld has too many sub-processes to be monitored",
               pid);
      throw runtime_error(msg);
    }
    strcat(cmd, pid_s);
  }

  /* the last part of the command */
  snprintf(pid_s, sizeof(pid_s), "%ld", children[nChildren - 1]);
  snprintf(cmdName, sizeof(cmdName), " -o pid,etime,time,%%cpu,%%mem,rsz,vsz,comm > %s", ps_f);
  if (strlen(cmd) + strlen(pid_s) + strlen(cmdName) >= cmdLen) {
    free(cmd);
    free(children);
    snprintf(msg, sizeof(msg), "[ readJobInfo() ] Job %ld has too many sub-processes to be monitored",
             pid);
    throw runtime_error(msg);
  }
  strcat(cmd, pid_s);
  strcat(cmd, cmdName);

  switch (cpid = fork()) {
  case -1:
    free(cmd);
    free(children);
    snprintf(msg, sizeof(msg), "[ readJobInfo() ] Unable to fork(). The job information could not be determined for %ld", pid);
    throw runtime_error(msg);
  case 0:
    argv[0] = (char *)"/bin/sh"; argv[1] = (char *)"-c";
    argv[2] = cmd; argv[3] = 0;
    execv("/bin/sh", argv);
    /* only reached when execv() failed; leave without running the exit
       handlers and stdio flushes inherited from the parent */
    _exit(RET_ERROR);
  default:
    if (waitpid(cpid, &status, 0) == -1) {
      free(cmd);
      free(children);
      unlink(ps_f);
      snprintf(msg, sizeof(msg), "[ readJobInfo() ] The job information for %ld could not be determined", pid);
      throw runtime_error(msg);
    }
  }

  free(cmd);
  fp = fopen(ps_f, "rt");
  if (fp == NULL) {
    free(children);
    unlink(ps_f);
    snprintf(msg, sizeof(msg), "[ readJobInfo() ] Error opening the ps output file for process %ld", pid);
    throw runtime_error(msg);
  }

  /* parse the output file */
  info.etime = info.cputime = 0;
  info.pcpu = info.pmem = 0;
  info.rsz = info.vsz = 0;
  info.open_fd = 0;
  mem_cmd_list = (char **)malloc(nChildren * sizeof(char *));
  if (mem_cmd_list == NULL) {
    fclose(fp);
    unlink(ps_f);
    free(children);
    snprintf(msg, sizeof(msg), "[ readJobInfo() ] The job information for %ld could not be determined", pid);
    throw runtime_error(msg);
  }
  listSize = 0;
  cmdName[0] = 0;
  while (1) {
    ret_s = fgets(buf, MAX_STRING_LEN, fp);
    if (ret_s == NULL)
      break;
    buf[MAX_STRING_LEN - 1] = 0;

    /* if the line was too long and fgets hasn't read it entirely, keep only
       the part already read and skip the remainder of that line */
    ch = fgetc(fp); // see if we are at the end of the file
    ungetc(ch, fp);
    if ((buf[0] == 0 || buf[strlen(buf) - 1] != 10) && ch != EOF) {
      while (1) {
        char *sret = fgets(buf2, MAX_STRING_LEN, fp);
        /* stop on the chunk that carries the newline; the test has to look
           at the chunk just read (buf2), not at the first one */
        if (sret == NULL ||
            (buf2[0] != 0 && buf2[strlen(buf2) - 1] == 10))
          break;
      }
    }

    /* the two time fields go into 20-byte buffers, hence the %19s widths */
    ret = sscanf(buf, "%ld %19s %19s %lf %lf %lf %lf %s", &crt_pid, etime_s,
                 cputime_s, &pcpu, &pmem, &rsz, &vsz, cmdName);
    if (ret != 8) {
      fclose(fp);
      unlink(ps_f);
      free(children);
      for (i = 0; i < listSize; i++) {
        free(mem_cmd_list[i]);
      }
      free(mem_cmd_list);
      throw runtime_error("[ readJobInfo() ] Error parsing the output of the ps command");
    }

    /* etime is the maximum of the elapsed times for the subprocesses */
    etime = parsePSTime(etime_s);

    info.etime = (info.etime > etime) ? info.etime : etime;

    /* cputime is the sum of the cpu times for the subprocesses */
    cputime = parsePSTime(cputime_s);
    info.cputime += cputime;
    info.pcpu += pcpu;

    /* get the number of opened file descriptors */
    try {
      open_fd = ProcUtils::countOpenFiles(crt_pid);
    } catch (procutils_error& err) {
      logger(WARNING, err.what());
      /* don't throw an exception if we couldn't read the number of files */
      open_fd = PROCUTILS_ERROR;
    }

    /* see if this is a process or just a thread */
    mem_cmd_s = (char *)malloc(MAX_STRING_LEN * sizeof(char));
    if (mem_cmd_s == NULL) {
      fclose(fp);
      unlink(ps_f);
      free(children);
      for (i = 0; i < listSize; i++) {
        free(mem_cmd_list[i]);
      }
      free(mem_cmd_list);
      snprintf(msg, sizeof(msg), "[ readJobInfo() ] The job information for %ld could not be determined", pid);
      throw runtime_error(msg);
    }
    snprintf(mem_cmd_s, MAX_STRING_LEN, "%f_%f_%s", rsz, vsz, cmdName);
    if (getVectIndex(mem_cmd_s, mem_cmd_list, listSize) == -1) {
      /* another pid with the same command name, rsz and vsz was not found,
         so this is a new process and we can add the amount of memory used by
         it */
      info.pmem += pmem;
      info.vsz += vsz; info.rsz += rsz;

      if (info.open_fd >= 0) // if no error occurred so far
        info.open_fd += open_fd;
      /* add an entry in the list so that next time we see another thread of
         this process we don't add the amount of memory again; the list has
         room for nChildren entries only */
      if (listSize < nChildren)
        mem_cmd_list[listSize++] = mem_cmd_s;
      else
        free(mem_cmd_s);
    } else {
      free(mem_cmd_s);
    }

    /* if we monitor the current process, we have two extra opened files
       that we shouldn't take into account (the output file for ps and
       /proc/<pid>/fd/)
    */
    if (crt_pid == getpid())
      info.open_fd -= 2;
  }

  fclose(fp);
  unlink(ps_f);
  free(children);
  for (i = 0; i < listSize; i++) {
    free(mem_cmd_list[i]);
  }
  free(mem_cmd_list);
#endif
}
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Properties Friends Defines

Generated at Tue May 10 2011 18:55:16 for Gaudi Framework, version v22r2 by Doxygen version 1.7.2 written by Dimitri van Heesch, © 1997-2004