Gaudi Framework, version v21r7

Home   Generated: 22 Jan 2010

apmon_mon_utils Namespace Reference


Classes

struct  PsInfo
 Structure that holds information about a job, as obtained from the ps command. More...
struct  JobDirInfo
 Structure that holds information about the disk usage for a job. More...

Typedefs

typedef struct
apmon_mon_utils::PsInfo 
PsInfo
 Structure that holds information about a job, as obtained from the ps command.
typedef struct
apmon_mon_utils::JobDirInfo 
JobDirInfo
 Structure that holds information about the disk usage for a job.

Functions

long * getChildren (long pid, int &nChildren) throw (runtime_error)
 Determines all the descendants of a given process.
void readJobInfo (long pid, PsInfo &info) throw (runtime_error)
 Obtains monitoring information for a given job and all its sub-jobs (descendant processes) with the aid of the ps command.
long parsePSTime (char *s)
 Function that parses a time formatted like "days-hours:min:sec" and returns the corresponding number of seconds.
void readJobDiskUsage (MonitoredJob job, JobDirInfo &info) throw (runtime_error)
 If a work directory is defined, compute the used space in that directory and the free disk space on the partition to which that directory belongs.


Typedef Documentation

Structure that holds information about the disk usage for a job.

Structure that holds information about a job, as obtained from the ps command.


Function Documentation

long * apmon_mon_utils::getChildren ( long  pid,
int &  nChildren 
) throw (runtime_error)

Determines all the descendants of a given process.

Definition at line 885 of file monitor_utils.cpp.

00886                        {
00887 #ifdef WIN32
00888         return 0;
00889 #else
00890   FILE *pf;
00891   long *pids, *ppids, *children;
00892   int nProcesses;
00893   int i, j, status;
00894   pid_t cpid;
00895   char *argv[4], msg[MAX_STRING_LEN], sval[20];
00896   bool processFound;
00897   long mypid = getpid();
00898   char children_f[50], np_f[50], cmd[200];
00899 
00900   /* generate the names of the temporary files in which we have the output
00901      of some commands */
00902   sprintf(children_f, "/tmp/apmon_children%ld", mypid);
00903   sprintf(np_f, "/tmp/apmon_np%ld", mypid);
00904 
00905   switch (cpid = fork()) {
00906   case -1:
00907     throw runtime_error("[ getChildren() ] Unable to fork()");
00908   case 0:
00909     argv[0] = (char *)"/bin/sh"; argv[1] = (char *)"-c";
00910     sprintf(cmd, "ps --no-headers -A -o ppid,pid > %s && wc -l %s > %s",
00911             children_f, children_f, np_f);
00912     argv[2] = cmd;
00913     /*
00914     argv[2] = "ps --no-headers -eo ppid,pid > /tmp/apmon_children.txt && wc -l /tmp/out_children.txt > /tmp/out_np.txt";
00915     */
00916     argv[3] = 0;
00917     execv("/bin/sh", argv);
00918     exit(RET_ERROR);
00919   default:
00920     if (waitpid(cpid, &status, 0) == -1) {
00921       sprintf(msg, "[ getChildren() ] The number of sub-processes for %ld could not be determined", pid);
00922       unlink(children_f); unlink(np_f);
00923       throw runtime_error(msg); 
00924     }
00925   }
00926 
00927   /* find the number of processes */
00928   pf = fopen(np_f, "rt");
00929   if (pf == NULL) {
00930     unlink(np_f); unlink(children_f);
00931     sprintf(msg, "[ getChildren() ] The number of sub-processes for %ld could not be determined",
00932             pid);
00933     throw runtime_error(msg);
00934   } 
00935   fscanf(pf, "%d", &nProcesses);
00936   fclose(pf);   
00937   unlink(np_f);
00938 
00939   pids = (long *)malloc(nProcesses * sizeof(long)); 
00940   ppids = (long *)malloc(nProcesses * sizeof(long)); 
00941   /* estimated maximum size for the returned vector; it will be realloc'ed */
00942   children = (long *)malloc(nProcesses * sizeof(long));
00943 
00944   pf = fopen(children_f, "rt");
00945   if (pf == NULL) {
00946     free(pids); free(ppids); free(children);
00947     unlink(children_f);
00948     sprintf(msg, "[ getChildren() ] The sub-processes for %ld could not be determined", pid);
00949     throw runtime_error(msg);
00950   } 
00951  
00952   /* scan the output of the ps command and find the children of the process,
00953    and also check if the process is still running */
00954   children[0] = pid; nChildren = 1;
00955   processFound = false;
00956   for (i = 0; i < nProcesses; i++) {
00957     fscanf(pf, "%ld %ld", &ppids[i], &pids[i]);
00958     /* look for the given process */
00959     if (pids[i] == children[0] || ppids[i] == children[0])
00960       processFound = true;
00961     if (ppids[i] == children[0]) {
00962       children[nChildren++] = pids[i];
00963     }
00964   }
00965   fclose(pf);
00966   unlink(children_f);
00967 
00968   if (processFound == false) {
00969     free(pids); free(ppids); free(children);
00970     nChildren = 0;
00971     sprintf(msg, "[ getChildren() ] The process %ld does not exist", pid);
00972     throw runtime_error(msg);
00973   } 
00974 
00975   /* find the PIDs of all the descendant processes */
00976   i = 1;
00977   while (i < nChildren) {
00978     /* find the children of the i-th child */ 
00979     for (j = 0; j < nProcesses; j++) {
00980       if (ppids[j] == children[i]) {
00981         children[nChildren++] = pids[j];
00982       }
00983     }
00984     i++;
00985   }
00986 
00987   sprintf(msg, "Sub-processes for process %ld: ", pid);
00988   for (i = 0; i < nChildren; i++) {
00989     sprintf(sval, "%ld ", children[i]);
00990     if (strlen(msg) + strlen(sval) < MAX_STRING_LEN - 1)
00991       strcat(msg, sval);
00992   }
00993   logger(DEBUG, msg);
00994 
00995   free(pids); free(ppids);
00996   children = (long *)realloc(children, (nChildren) * sizeof(long));
00997   return children;
00998 #endif
00999 }

long apmon_mon_utils::parsePSTime ( char *  s  ) 

Function that parses a time formatted like "days-hours:min:sec" and returns the corresponding number of seconds.

Definition at line 1184 of file monitor_utils.cpp.

01184                                          {
01185   long days, hours, mins, secs;
01186 
01187   if (strchr(s, '-') != NULL) {
01188     sscanf(s, "%ld-%ld:%ld:%ld", &days, &hours, &mins, &secs);
01189     return 24 * 3600 * days + 3600 * hours + 60 * mins + secs;
01190   } else {
01191     if (strchr(s, ':') != NULL && strchr(s, ':') !=  strrchr(s, ':')) {
01192        sscanf(s, "%ld:%ld:%ld", &hours, &mins, &secs);
01193        return 3600 * hours + 60 * mins + secs;
01194     } else {
01195       if (strchr(s, ':') != NULL) {
01196         sscanf(s, "%ld:%ld", &mins, &secs);
01197         return 60 * mins + secs;
01198       } else {
01199         return RET_ERROR;
01200       }
01201     }
01202   }
01203 }

void apmon_mon_utils::readJobDiskUsage ( MonitoredJob  job,
JobDirInfo info 
) throw (runtime_error)

If a work directory is defined, compute the used space in that directory and the free disk space on the partition to which that directory belongs.

Sizes are given in MB.

Definition at line 1205 of file monitor_utils.cpp.

01206                                                                        {
01207 #ifndef WIN32
01208   int status;
01209   pid_t cpid;
01210   char *cmd, s_tmp[20], *argv[4], msg[100];
01211   FILE *fp;
01212   long mypid = getpid();
01213   char du_f[50], df_f[50]; 
01214 
01215   /* generate names for the temporary files which will hold the output of the
01216      du and df commands */
01217   sprintf(du_f, "/tmp/apmon_du%ld", mypid);
01218   sprintf(df_f, "/tmp/apmon_df%ld", mypid);
01219   
01220   if (strlen(job.workdir) == 0) {
01221     sprintf(msg, "[ readJobDiskUsage() ] The working directory for the job %ld was not specified, not monitoring disk usage", job.pid);
01222     throw runtime_error(msg);
01223   }
01224   
01225   cmd = (char *)malloc((300 + 2 * strlen(job.workdir)) * sizeof(char));
01226   strcpy(cmd, "PRT=`du -Lsk ");
01227   strcat(cmd, job.workdir);
01228   //strcat(cmd, " | tail -1 | cut -f 1 > ");
01229   strcat(cmd, " ` ; if [[ $? -eq 0 ]] ; then OUT=`echo $PRT | cut -f 1` ; echo $OUT ; exit 0 ; else exit -1 ; fi > "); 
01230   strcat(cmd, du_f);
01231 
01232 
01233   switch (cpid = fork()) {
01234   case -1:
01235     sprintf(msg, "[ readJobDiskUsage() ] Unable to fork(). The disk usage information could not be determined for %ld", job.pid);
01236     throw runtime_error(msg);
01237   case 0:
01238     argv[0] = (char *)"/bin/sh"; argv[1] = (char *)"-c";
01239     argv[2] = cmd; argv[3] = 0;
01240     execv("/bin/sh", argv);
01241     exit(RET_ERROR);
01242   default:
01243     if (waitpid(cpid, &status, 0) == -1) {
01244       free(cmd);
01245       sprintf(msg, "[ readJobDiskUsage() ] The disk usage (du) information for %ld could not be determined", job.pid);
01246       unlink(du_f); unlink(df_f);
01247       throw runtime_error(msg); 
01248     }
01249   }
01250 
01251   strcpy(cmd, "PRT=`df -m ");
01252   strcat(cmd, job.workdir);
01253   //strcat(cmd, " | tail -1 > ");
01254   strcat(cmd, " `; if [[ $? -eq 0 ]] ; then OUT=`echo $PRT | cut -d ' ' -f 8-` ; echo $OUT ; exit 0 ; else exit -1 ; fi > ");
01255 
01256   strcat(cmd, df_f);
01257   //printf("### cmd: %s\n", cmd);
01258 
01259   switch (cpid = fork()) {
01260   case -1:
01261     sprintf(msg, "[ readJobDiskUsage() ] Unable to fork(). The disk usage information could not be determined for %ld", job.pid);
01262     throw runtime_error(msg);
01263   case 0:
01264     argv[0] = (char *)"/bin/sh"; argv[1] = (char *)"-c";
01265     argv[2] = cmd; argv[3] = 0;
01266     execv("/bin/sh", argv);
01267     exit(RET_ERROR);
01268   default:
01269     if (waitpid(cpid, &status, 0) == -1) {
01270       free(cmd);
01271       sprintf(msg, "[ readJobDiskUsage() ] The disk usage (df) information for %ld could not be determined", job.pid);
01272       unlink(du_f); unlink(df_f);
01273       throw runtime_error(msg); 
01274     }
01275   }
01276 
01277   free(cmd);
01278   fp = fopen(du_f, "rt");
01279   if (fp == NULL) {
01280     sprintf(msg, "[ readJobDiskUsage() ] Error opening du output file for process %ld", job.pid);
01281     throw runtime_error(msg);
01282   }
01283 
01284   fscanf(fp, "%lf", &(info.workdir_size));
01285   /* keep the directory size in MB */
01286   info.workdir_size /= 1024.0;
01287   fclose(fp);
01288   unlink(du_f);
01289  
01290   fp = fopen(df_f, "rt");
01291   if (fp == NULL) {
01292     sprintf(msg, "[ readJobDiskUsage() ] Error opening df output file for process %ld", job.pid);
01293     throw runtime_error(msg);
01294   }
01295   fscanf(fp, "%s %lf %lf %lf %lf", s_tmp, &(info.disk_total), 
01296          &(info.disk_used), &(info.disk_free), &(info.disk_usage));
01297   fclose(fp);
01298   unlink(df_f);
01299 #endif
01300 }

void apmon_mon_utils::readJobInfo ( long  pid,
PsInfo info 
) throw (runtime_error)

Obtains monitoring information for a given job and all its sub-jobs (descendant processes) with the aid of the ps command.

Definition at line 1001 of file monitor_utils.cpp.

01001                                                                              {
01002 #ifndef WIN32
01003   long *children;
01004   FILE *fp;
01005   int i, nChildren, status, ch, ret, open_fd;
01006   char *cmd , *mem_cmd_s, *argv[4], *ret_s;
01007   char pid_s[10], msg[100];
01008   char cmdName[MAX_STRING_LEN1], buf[MAX_STRING_LEN1], buf2[MAX_STRING_LEN1];
01009   char etime_s[20], cputime_s[20];
01010   double rsz, vsz;
01011   double etime, cputime;
01012   double pcpu, pmem;
01013   /* this list contains strings of the form "rsz_vsz_command" for every pid;
01014      it is used to avoid adding several times processes that have multiple 
01015      threads and appear in ps as sepparate processes, occupying exactly the 
01016      same amount of memory and having the same command name. For every line 
01017      from the output of the ps command we verify if the rsz_vsz_command 
01018      combination is already in the list.
01019   */
01020   char **mem_cmd_list;
01021   int listSize;
01022   long cpid, crt_pid;
01023   //unsigned int maxCmdLen = 5 * MAX_STRING_LEN;
01024   long mypid = getpid();
01025   char ps_f[50];
01026 
01027   /* get the list of the process' descendants */
01028   children = getChildren(pid, nChildren);
01029 
01030   /* generate a name for the temporary file which holds the output of the 
01031      ps command */
01032   sprintf(ps_f, "/tmp/apmon_ps%ld", mypid);
01033 
01034   unsigned int cmdLen = (150 + 6 * nChildren) * sizeof(char);
01035   cmd = (char *)malloc (cmdLen);
01036 
01037   /* issue the "ps" command to obtain information on all the descendants */
01038   strcpy(cmd, "ps --no-headers --pid ");
01039   for (i = 0; i < nChildren - 1; i++) {
01040     sprintf(pid_s, "%ld,", children[i]);
01041     if (strlen(cmd) + strlen(pid_s) + 1 >= cmdLen) {
01042       free(cmd);
01043       sprintf(msg, "[ readJobInfo() ] Job %ld has too many sub-processes to be monitored",
01044               pid);
01045       throw runtime_error(msg);
01046     }
01047     strcat(cmd, pid_s);
01048     //strcat(cmd, " 2>&1");
01049   }
01050 
01051   /* the last part of the command */
01052   sprintf(pid_s, "%ld", children[nChildren - 1]);
01053   sprintf(cmdName, " -o pid,etime,time,%%cpu,%%mem,rsz,vsz,comm > %s", ps_f);
01054   if (strlen(cmd) + strlen(pid_s) + strlen(cmdName) >= cmdLen) {
01055     free(cmd);
01056     sprintf(msg, "[ readJobInfo() ] Job %ld has too many sub-processes to be monitored",
01057               pid);
01058     throw runtime_error(msg);
01059   }
01060   strcat(cmd, pid_s);
01061   strcat(cmd, cmdName);
01062   //strcat(cmd, " 2>&1");
01063 
01064   switch (cpid = fork()) {
01065   case -1:
01066     free(cmd);
01067     sprintf(msg, "[ readJobInfo() ] Unable to fork(). The job information could not be determined for %ld", pid);
01068     throw runtime_error(msg);
01069   case 0:
01070     argv[0] = (char *)"/bin/sh"; argv[1] = (char *)"-c";
01071     argv[2] = cmd; argv[3] = 0;
01072     execv("/bin/sh", argv);
01073     exit(RET_ERROR);
01074   default:
01075     if (waitpid(cpid, &status, 0) == -1) {
01076       free(cmd);
01077       sprintf(msg, "[ readJobInfo() ] The job information for %ld could not be determined", pid);
01078       throw runtime_error(msg); 
01079     }
01080   }
01081 
01082   free(cmd);
01083   fp = fopen(ps_f, "rt");
01084   if (fp == NULL) {
01085     sprintf(msg, "[ readJobInfo() ] Error opening the ps output file for process %ld", pid);
01086     throw runtime_error(msg);
01087   }
01088 
01089   /* parse the output file */
01090   info.etime = info.cputime = 0;
01091   info.pcpu = info.pmem = 0;
01092   info.rsz = info.vsz = 0;
01093   info.open_fd = 0;
01094   mem_cmd_list = (char **)malloc(nChildren * sizeof(char *));
01095   listSize = 0;
01096   cmdName[0] = 0;
01097   while (1) {
01098     ret_s = fgets(buf, MAX_STRING_LEN, fp);
01099     if (ret_s == NULL) 
01100       break;
01101     buf[MAX_STRING_LEN - 1] = 0;
01102 
01103     /* if the line was too long and fgets hasn't read it entirely, */
01104     /* keep only the first 512 chars from the line */
01105     ch = fgetc(fp); // see if we are at the end of the file
01106     ungetc(ch, fp);
01107     if (buf[strlen(buf) - 1] != 10 && ch != EOF) { 
01108       while (1) {
01109         char *sret = fgets(buf2, MAX_STRING_LEN, fp);
01110         if (sret == NULL || buf[strlen(buf) - 1] == 10)
01111           break;
01112       }
01113     }
01114 
01115     ret = sscanf(buf, "%ld %s %s %lf %lf %lf %lf %s", &crt_pid, etime_s, 
01116                  cputime_s, &pcpu, &pmem, &rsz, &vsz, cmdName);
01117     if (ret != 8) {
01118       fclose(fp);
01119       unlink(ps_f);
01120       free(children);
01121       for (i = 0; i < listSize; i++) {
01122         free(mem_cmd_list[i]);
01123       }
01124       free(mem_cmd_list);
01125       throw runtime_error("[ readJobInfo() ] Error parsing the output of the ps command");
01126     }
01127 
01128     /* etime is the maximum of the elapsed times for the subprocesses */
01129     etime = parsePSTime(etime_s);
01130     info.etime = (info.etime > etime) ? info.etime : etime;
01131 
01132     /* cputime is the sum of the cpu times for the subprocesses */
01133     cputime = parsePSTime(cputime_s);
01134     info.cputime += cputime;
01135     info.pcpu += pcpu;
01136 
01137     /* get the number of opened file descriptors */
01138     try {
01139       open_fd = ProcUtils::countOpenFiles(crt_pid);
01140     } catch (procutils_error& err) {
01141       logger(WARNING, err.what());
01142       /* don't throw an exception if we couldn't read the number of files */
01143       open_fd = PROCUTILS_ERROR;
01144     }
01145 
01146     /* see if this is a process or just a thread */
01147     mem_cmd_s = (char *)malloc(MAX_STRING_LEN * sizeof(char));
01148     sprintf(mem_cmd_s, "%f_%f_%s", rsz, vsz, cmdName);
01149     //printf("### mem_cmd_s: %s\n", mem_cmd_s);
01150     if (getVectIndex(mem_cmd_s, mem_cmd_list, listSize) == -1) {
01151       /* aonther pid with the same command name, rsz and vsz was not found,
01152          so this is a new process and we can add the amount of memory used by 
01153          it */
01154       info.pmem += pmem;
01155       info.vsz += vsz; info.rsz += rsz;
01156 
01157       if (info.open_fd >= 0) // if no error occured so far
01158         info.open_fd += open_fd;
01159       /* add an entry in the list so that next time we see another thread of
01160          this process we don't add the amount of  memory again */
01161       mem_cmd_list[listSize++] = mem_cmd_s;     
01162     } else {
01163       free(mem_cmd_s);
01164     }
01165 
01166     /* if we monitor the current process, we have two extra opened files
01167        that we shouldn't take into account (the output file for ps and
01168        /proc/<pid>/fd/)
01169     */
01170     if (crt_pid == getpid())
01171       info.open_fd -= 2;
01172   } 
01173 
01174   fclose(fp);
01175   unlink(ps_f);
01176   free(children);
01177   for (i = 0; i < listSize; i++) {
01178     free(mem_cmd_list[i]);
01179   }
01180   free(mem_cmd_list);
01181 #endif
01182 }


Generated at Fri Jan 22 20:44:10 2010 for Gaudi Framework, version v21r7 by Doxygen version 1.5.6 written by Dimitri van Heesch, © 1997-2004