Gaudi Framework, version v22r0

Home   Generated: 9 Feb 2011

apmon_mon_utils Namespace Reference

Classes

struct  PsInfo
 Structure that holds information about a job, as obtained from the ps command. More...
struct  JobDirInfo
 Structure that holds information about the disk usage for a job. More...

Typedefs

typedef struct
apmon_mon_utils::PsInfo 
PsInfo
 Structure that holds information about a job, as obtained from the ps command.
typedef struct
apmon_mon_utils::JobDirInfo 
JobDirInfo
 Structure that holds information about the disk usage for a job.

Functions

long * getChildren (long pid, int &nChildren) throw (runtime_error)
 Determines all the descendants of a given process.
void readJobInfo (long pid, PsInfo &info) throw (runtime_error)
 Obtains monitoring information for a given job and all its sub-jobs (descendant processes) with the aid of the ps command.
double parsePSTime (char *s)
 Function that parses a time formatted like "days-hours:min:sec" and returns the corresponding number of seconds.
void readJobDiskUsage (MonitoredJob job, JobDirInfo &info) throw (runtime_error)
 If there is a work directory defined, then compute the used space in that directory and the free disk space on the partition to which that directory belongs.

Typedef Documentation

typedef struct apmon_mon_utils::JobDirInfo apmon_mon_utils::JobDirInfo

Structure that holds information about the disk usage for a job.

typedef struct apmon_mon_utils::PsInfo apmon_mon_utils::PsInfo

Structure that holds information about a job, as obtained from the ps command.


Function Documentation

long * apmon_mon_utils::getChildren ( long  pid,
int &  nChildren 
) throw (runtime_error)

Determines all the descendants of a given process.

Definition at line 883 of file monitor_utils.cpp.

00884                        {
00885 #ifdef WIN32
00886         return 0;
00887 #else
00888   FILE *pf;
00889   long *pids, *ppids, *children;
00890   int nProcesses;
00891   int i, j, status;
00892   pid_t cpid;
00893   char *argv[4], msg[MAX_STRING_LEN], sval[20];
00894   bool processFound;
00895   long mypid = getpid();
00896   char children_f[50], np_f[50], cmd[200];
00897 
00898   /* generate the names of the temporary files in which we have the output
00899      of some commands */
00900   sprintf(children_f, "/tmp/apmon_children%ld", mypid);
00901   sprintf(np_f, "/tmp/apmon_np%ld", mypid);
00902 
00903   switch (cpid = fork()) {
00904   case -1:
00905     throw runtime_error("[ getChildren() ] Unable to fork()");
00906   case 0:
00907     argv[0] = (char *)"/bin/sh"; argv[1] = (char *)"-c";
00908     sprintf(cmd, "ps --no-headers -A -o ppid,pid > %s && wc -l %s > %s",
00909             children_f, children_f, np_f);
00910     argv[2] = cmd;
00911     /*
00912     argv[2] = "ps --no-headers -eo ppid,pid > /tmp/apmon_children.txt && wc -l /tmp/out_children.txt > /tmp/out_np.txt";
00913     */
00914     argv[3] = 0;
00915     execv("/bin/sh", argv);
00916     exit(RET_ERROR);
00917   default:
00918     if (waitpid(cpid, &status, 0) == -1) {
00919       sprintf(msg, "[ getChildren() ] The number of sub-processes for %ld could not be determined", pid);
00920       unlink(children_f); unlink(np_f);
00921       throw runtime_error(msg);
00922     }
00923   }
00924 
00925   /* find the number of processes */
00926   pf = fopen(np_f, "rt");
00927   if (pf == NULL) {
00928     unlink(np_f); unlink(children_f);
00929     sprintf(msg, "[ getChildren() ] The number of sub-processes for %ld could not be determined",
00930             pid);
00931     throw runtime_error(msg);
00932   }
00933   fscanf(pf, "%d", &nProcesses);
00934   fclose(pf);
00935   unlink(np_f);
00936 
00937   pids = (long *)malloc(nProcesses * sizeof(long));
00938   ppids = (long *)malloc(nProcesses * sizeof(long));
00939   /* estimated maximum size for the returned vector; it will be realloc'ed */
00940   children = (long *)malloc(nProcesses * sizeof(long));
00941 
00942   pf = fopen(children_f, "rt");
00943   if (pf == NULL) {
00944     free(pids); free(ppids); free(children);
00945     unlink(children_f);
00946     sprintf(msg, "[ getChildren() ] The sub-processes for %ld could not be determined", pid);
00947     throw runtime_error(msg);
00948   }
00949 
00950   /* scan the output of the ps command and find the children of the process,
00951    and also check if the process is still running */
00952   children[0] = pid; nChildren = 1;
00953   processFound = false;
00954   for (i = 0; i < nProcesses; i++) {
00955     fscanf(pf, "%ld %ld", &ppids[i], &pids[i]);
00956     /* look for the given process */
00957     if (pids[i] == children[0] || ppids[i] == children[0])
00958       processFound = true;
00959     if (ppids[i] == children[0]) {
00960       children[nChildren++] = pids[i];
00961     }
00962   }
00963   fclose(pf);
00964   unlink(children_f);
00965 
00966   if (processFound == false) {
00967     free(pids); free(ppids); free(children);
00968     nChildren = 0;
00969     sprintf(msg, "[ getChildren() ] The process %ld does not exist", pid);
00970     throw runtime_error(msg);
00971   }
00972 
00973   /* find the PIDs of all the descendant processes */
00974   i = 1;
00975   while (i < nChildren) {
00976     /* find the children of the i-th child */
00977     for (j = 0; j < nProcesses; j++) {
00978       if (ppids[j] == children[i]) {
00979         children[nChildren++] = pids[j];
00980       }
00981     }
00982     i++;
00983   }
00984 
00985   sprintf(msg, "Sub-processes for process %ld: ", pid);
00986   for (i = 0; i < nChildren; i++) {
00987     sprintf(sval, "%ld ", children[i]);
00988     if (strlen(msg) + strlen(sval) < MAX_STRING_LEN - 1)
00989       strcat(msg, sval);
00990   }
00991   logger(DEBUG, msg);
00992 
00993   free(pids); free(ppids);
00994   children = (long *)realloc(children, (nChildren) * sizeof(long));
00995   return children;
00996 #endif
00997 }

double apmon_mon_utils::parsePSTime ( char *  s  ) 

Function that parses a time formatted like "days-hours:min:sec" and returns the corresponding number of seconds.

Definition at line 1183 of file monitor_utils.cpp.

01183                                            {
01184   long days, hours, mins, secs;
01185 
01186   if (strchr(s, '-') != NULL) {
01187     sscanf(s, "%ld-%ld:%ld:%ld", &days, &hours, &mins, &secs);
01188     return 24. * 3600 * days + 3600 * hours + 60 * mins + secs;
01189   } else {
01190     if (strchr(s, ':') != NULL && strchr(s, ':') !=  strrchr(s, ':')) {
01191        sscanf(s, "%ld:%ld:%ld", &hours, &mins, &secs);
01192        return 3600. * hours + 60 * mins + secs;
01193     } else {
01194       if (strchr(s, ':') != NULL) {
01195         sscanf(s, "%ld:%ld", &mins, &secs);
01196         return 60. * mins + secs;
01197       } else {
01198         return RET_ERROR;
01199       }
01200     }
01201   }
01202 }

void apmon_mon_utils::readJobDiskUsage ( MonitoredJob  job,
JobDirInfo &  info 
) throw (runtime_error)

If there is a work directory defined, then compute the used space in that directory and the free disk space on the partition to which that directory belongs.

Sizes are given in MB.

Definition at line 1204 of file monitor_utils.cpp.

01205                                                                        {
01206 #ifndef WIN32
01207   int status;
01208   pid_t cpid;
01209   char *cmd, s_tmp[20], *argv[4], msg[100];
01210   FILE *fp;
01211   long mypid = getpid();
01212   char du_f[50], df_f[50];
01213 
01214   /* generate names for the temporary files which will hold the output of the
01215      du and df commands */
01216   sprintf(du_f, "/tmp/apmon_du%ld", mypid);
01217   sprintf(df_f, "/tmp/apmon_df%ld", mypid);
01218 
01219   if (strlen(job.workdir) == 0) {
01220     sprintf(msg, "[ readJobDiskUsage() ] The working directory for the job %ld was not specified, not monitoring disk usage", job.pid);
01221     throw runtime_error(msg);
01222   }
01223 
01224   cmd = (char *)malloc((300 + 2 * strlen(job.workdir)) * sizeof(char));
01225   strcpy(cmd, "PRT=`du -Lsk ");
01226   strcat(cmd, job.workdir);
01227   //strcat(cmd, " | tail -1 | cut -f 1 > ");
01228   strcat(cmd, " ` ; if [[ $? -eq 0 ]] ; then OUT=`echo $PRT | cut -f 1` ; echo $OUT ; exit 0 ; else exit -1 ; fi > ");
01229   strcat(cmd, du_f);
01230 
01231 
01232   switch (cpid = fork()) {
01233   case -1:
01234     sprintf(msg, "[ readJobDiskUsage() ] Unable to fork(). The disk usage information could not be determined for %ld", job.pid);
01235     throw runtime_error(msg);
01236   case 0:
01237     argv[0] = (char *)"/bin/sh"; argv[1] = (char *)"-c";
01238     argv[2] = cmd; argv[3] = 0;
01239     execv("/bin/sh", argv);
01240     exit(RET_ERROR);
01241   default:
01242     if (waitpid(cpid, &status, 0) == -1) {
01243       free(cmd);
01244       sprintf(msg, "[ readJobDiskUsage() ] The disk usage (du) information for %ld could not be determined", job.pid);
01245       unlink(du_f); unlink(df_f);
01246       throw runtime_error(msg);
01247     }
01248   }
01249 
01250   strcpy(cmd, "PRT=`df -m ");
01251   strcat(cmd, job.workdir);
01252   //strcat(cmd, " | tail -1 > ");
01253   strcat(cmd, " `; if [[ $? -eq 0 ]] ; then OUT=`echo $PRT | cut -d ' ' -f 8-` ; echo $OUT ; exit 0 ; else exit -1 ; fi > ");
01254 
01255   strcat(cmd, df_f);
01256   //printf("### cmd: %s\n", cmd);
01257 
01258   switch (cpid = fork()) {
01259   case -1:
01260     sprintf(msg, "[ readJobDiskUsage() ] Unable to fork(). The disk usage information could not be determined for %ld", job.pid);
01261     throw runtime_error(msg);
01262   case 0:
01263     argv[0] = (char *)"/bin/sh"; argv[1] = (char *)"-c";
01264     argv[2] = cmd; argv[3] = 0;
01265     execv("/bin/sh", argv);
01266     exit(RET_ERROR);
01267   default:
01268     if (waitpid(cpid, &status, 0) == -1) {
01269       free(cmd);
01270       sprintf(msg, "[ readJobDiskUsage() ] The disk usage (df) information for %ld could not be determined", job.pid);
01271       unlink(du_f); unlink(df_f);
01272       throw runtime_error(msg);
01273     }
01274   }
01275 
01276   free(cmd);
01277   fp = fopen(du_f, "rt");
01278   if (fp == NULL) {
01279     sprintf(msg, "[ readJobDiskUsage() ] Error opening du output file for process %ld", job.pid);
01280     throw runtime_error(msg);
01281   }
01282 
01283   fscanf(fp, "%lf", &(info.workdir_size));
01284   /* keep the directory size in MB */
01285   info.workdir_size /= 1024.0;
01286   fclose(fp);
01287   unlink(du_f);
01288 
01289   fp = fopen(df_f, "rt");
01290   if (fp == NULL) {
01291     sprintf(msg, "[ readJobDiskUsage() ] Error opening df output file for process %ld", job.pid);
01292     throw runtime_error(msg);
01293   }
01294   fscanf(fp, "%s %lf %lf %lf %lf", s_tmp, &(info.disk_total),
01295          &(info.disk_used), &(info.disk_free), &(info.disk_usage));
01296   fclose(fp);
01297   unlink(df_f);
01298 #endif
01299 }

void apmon_mon_utils::readJobInfo ( long  pid,
PsInfo &  info 
) throw (runtime_error)

Obtains monitoring information for a given job and all its sub-jobs (descendant processes) with the aid of the ps command.

Definition at line 999 of file monitor_utils.cpp.

00999                                                                              {
01000 #ifndef WIN32
01001   long *children;
01002   FILE *fp;
01003   int i, nChildren, status, ch, ret, open_fd;
01004   char *cmd , *mem_cmd_s, *argv[4], *ret_s;
01005   char pid_s[10], msg[100];
01006   char cmdName[MAX_STRING_LEN1], buf[MAX_STRING_LEN1], buf2[MAX_STRING_LEN1];
01007   char etime_s[20], cputime_s[20];
01008   double rsz, vsz;
01009   double etime, cputime;
01010   double pcpu, pmem;
01011   /* this list contains strings of the form "rsz_vsz_command" for every pid;
01012      it is used to avoid adding several times processes that have multiple
01013      threads and appear in ps as sepparate processes, occupying exactly the
01014      same amount of memory and having the same command name. For every line
01015      from the output of the ps command we verify if the rsz_vsz_command
01016      combination is already in the list.
01017   */
01018   char **mem_cmd_list;
01019   int listSize;
01020   long cpid, crt_pid;
01021   //unsigned int maxCmdLen = 5 * MAX_STRING_LEN;
01022   long mypid = getpid();
01023   char ps_f[50];
01024 
01025   /* get the list of the process' descendants */
01026   children = getChildren(pid, nChildren);
01027 
01028   /* generate a name for the temporary file which holds the output of the
01029      ps command */
01030   sprintf(ps_f, "/tmp/apmon_ps%ld", mypid);
01031 
01032   unsigned int cmdLen = (150 + 6 * nChildren) * sizeof(char);
01033   cmd = (char *)malloc (cmdLen);
01034 
01035   /* issue the "ps" command to obtain information on all the descendants */
01036   strcpy(cmd, "ps --no-headers --pid ");
01037   for (i = 0; i < nChildren - 1; i++) {
01038     sprintf(pid_s, "%ld,", children[i]);
01039     if (strlen(cmd) + strlen(pid_s) + 1 >= cmdLen) {
01040       free(cmd);
01041       sprintf(msg, "[ readJobInfo() ] Job %ld has too many sub-processes to be monitored",
01042               pid);
01043       throw runtime_error(msg);
01044     }
01045     strcat(cmd, pid_s);
01046     //strcat(cmd, " 2>&1");
01047   }
01048 
01049   /* the last part of the command */
01050   sprintf(pid_s, "%ld", children[nChildren - 1]);
01051   sprintf(cmdName, " -o pid,etime,time,%%cpu,%%mem,rsz,vsz,comm > %s", ps_f);
01052   if (strlen(cmd) + strlen(pid_s) + strlen(cmdName) >= cmdLen) {
01053     free(cmd);
01054     sprintf(msg, "[ readJobInfo() ] Job %ld has too many sub-processes to be monitored",
01055               pid);
01056     throw runtime_error(msg);
01057   }
01058   strcat(cmd, pid_s);
01059   strcat(cmd, cmdName);
01060   //strcat(cmd, " 2>&1");
01061 
01062   switch (cpid = fork()) {
01063   case -1:
01064     free(cmd);
01065     sprintf(msg, "[ readJobInfo() ] Unable to fork(). The job information could not be determined for %ld", pid);
01066     throw runtime_error(msg);
01067   case 0:
01068     argv[0] = (char *)"/bin/sh"; argv[1] = (char *)"-c";
01069     argv[2] = cmd; argv[3] = 0;
01070     execv("/bin/sh", argv);
01071     exit(RET_ERROR);
01072   default:
01073     if (waitpid(cpid, &status, 0) == -1) {
01074       free(cmd);
01075       sprintf(msg, "[ readJobInfo() ] The job information for %ld could not be determined", pid);
01076       throw runtime_error(msg);
01077     }
01078   }
01079 
01080   free(cmd);
01081   fp = fopen(ps_f, "rt");
01082   if (fp == NULL) {
01083     sprintf(msg, "[ readJobInfo() ] Error opening the ps output file for process %ld", pid);
01084     throw runtime_error(msg);
01085   }
01086 
01087   /* parse the output file */
01088   info.etime = info.cputime = 0;
01089   info.pcpu = info.pmem = 0;
01090   info.rsz = info.vsz = 0;
01091   info.open_fd = 0;
01092   mem_cmd_list = (char **)malloc(nChildren * sizeof(char *));
01093   listSize = 0;
01094   cmdName[0] = 0;
01095   while (1) {
01096     ret_s = fgets(buf, MAX_STRING_LEN, fp);
01097     if (ret_s == NULL)
01098       break;
01099     buf[MAX_STRING_LEN - 1] = 0;
01100 
01101     /* if the line was too long and fgets hasn't read it entirely, */
01102     /* keep only the first 512 chars from the line */
01103     ch = fgetc(fp); // see if we are at the end of the file
01104     ungetc(ch, fp);
01105     if (buf[strlen(buf) - 1] != 10 && ch != EOF) {
01106       while (1) {
01107         char *sret = fgets(buf2, MAX_STRING_LEN, fp);
01108         if (sret == NULL || buf[strlen(buf) - 1] == 10)
01109           break;
01110       }
01111     }
01112 
01113     ret = sscanf(buf, "%ld %s %s %lf %lf %lf %lf %s", &crt_pid, etime_s,
01114                  cputime_s, &pcpu, &pmem, &rsz, &vsz, cmdName);
01115     if (ret != 8) {
01116       fclose(fp);
01117       unlink(ps_f);
01118       free(children);
01119       for (i = 0; i < listSize; i++) {
01120         free(mem_cmd_list[i]);
01121       }
01122       free(mem_cmd_list);
01123       throw runtime_error("[ readJobInfo() ] Error parsing the output of the ps command");
01124     }
01125 
01126     /* etime is the maximum of the elapsed times for the subprocesses */
01127     etime = parsePSTime(etime_s);
01128 
01129     info.etime = (info.etime > etime) ? info.etime : etime;
01130 
01131     /* cputime is the sum of the cpu times for the subprocesses */
01132     cputime = parsePSTime(cputime_s);
01133     info.cputime += cputime;
01134     info.pcpu += pcpu;
01135 
01136     /* get the number of opened file descriptors */
01137     try {
01138       open_fd = ProcUtils::countOpenFiles(crt_pid);
01139     } catch (procutils_error& err) {
01140       logger(WARNING, err.what());
01141       /* don't throw an exception if we couldn't read the number of files */
01142       open_fd = PROCUTILS_ERROR;
01143     }
01144 
01145     /* see if this is a process or just a thread */
01146     mem_cmd_s = (char *)malloc(MAX_STRING_LEN * sizeof(char));
01147     sprintf(mem_cmd_s, "%f_%f_%s", rsz, vsz, cmdName);
01148     //printf("### mem_cmd_s: %s\n", mem_cmd_s);
01149     if (getVectIndex(mem_cmd_s, mem_cmd_list, listSize) == -1) {
01150       /* another pid with the same command name, rsz and vsz was not found,
01151          so this is a new process and we can add the amount of memory used by
01152          it */
01153       info.pmem += pmem;
01154       info.vsz += vsz; info.rsz += rsz;
01155 
01156       if (info.open_fd >= 0) // if no error occured so far
01157         info.open_fd += open_fd;
01158       /* add an entry in the list so that next time we see another thread of
01159          this process we don't add the amount of  memory again */
01160       mem_cmd_list[listSize++] = mem_cmd_s;
01161     } else {
01162       free(mem_cmd_s);
01163     }
01164 
01165     /* if we monitor the current process, we have two extra opened files
01166        that we shouldn't take into account (the output file for ps and
01167        /proc/<pid>/fd/)
01168     */
01169     if (crt_pid == getpid())
01170       info.open_fd -= 2;
01171   }
01172 
01173   fclose(fp);
01174   unlink(ps_f);
01175   free(children);
01176   for (i = 0; i < listSize; i++) {
01177     free(mem_cmd_list[i]);
01178   }
01179   free(mem_cmd_list);
01180 #endif
01181 }

 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Properties Friends Defines

Generated at Wed Feb 9 16:32:58 2011 for Gaudi Framework, version v22r0 by Doxygen version 1.6.2 written by Dimitri van Heesch, © 1997-2004