Gaudi Framework, version v21r8

Home   Generated: 17 Mar 2010

apmon_mon_utils Namespace Reference


Classes

struct  PsInfo
 Structure that holds information about a job, as obtained from the ps command. More...
struct  JobDirInfo
 Structure that holds information about the disk usage for a job. More...

Typedefs

typedef struct apmon_mon_utils::PsInfo  PsInfo
 Structure that holds information about a job, as obtained from the ps command.
typedef struct apmon_mon_utils::JobDirInfo  JobDirInfo
 Structure that holds information about the disk usage for a job.

Functions

long * getChildren (long pid, int &nChildren) throw (runtime_error)
 Determines all the descendants of a given process.
void readJobInfo (long pid, PsInfo &info) throw (runtime_error)
 Obtains monitoring information for a given job and all its sub-jobs (descendant processes) with the aid of the ps command.
long parsePSTime (char *s)
 Function that parses a time formatted like "days-hours:min:sec" and returns the corresponding number of seconds.
void readJobDiskUsage (MonitoredJob job, JobDirInfo &info) throw (runtime_error)
 If a work directory is defined, computes the used space in that directory and the free disk space on the partition to which that directory belongs.


Typedef Documentation

typedef struct apmon_mon_utils::JobDirInfo apmon_mon_utils::JobDirInfo

Structure that holds information about the disk usage for a job.

typedef struct apmon_mon_utils::PsInfo apmon_mon_utils::PsInfo

Structure that holds information about a job, as obtained from the ps command.


Function Documentation

long * apmon_mon_utils::getChildren ( long  pid,
int &  nChildren 
) throw (runtime_error)

Determines all the descendants of a given process.

Definition at line 883 of file monitor_utils.cpp.

00884                        {
00885 #ifdef WIN32
00886         return 0;
00887 #else
00888   FILE *pf;
00889   long *pids, *ppids, *children;
00890   int nProcesses;
00891   int i, j, status;
00892   pid_t cpid;
00893   char *argv[4], msg[MAX_STRING_LEN], sval[20];
00894   bool processFound;
00895   long mypid = getpid();
00896   char children_f[50], np_f[50], cmd[200];
00897 
00898   /* generate the names of the temporary files in which we have the output
00899      of some commands */
00900   sprintf(children_f, "/tmp/apmon_children%ld", mypid);
00901   sprintf(np_f, "/tmp/apmon_np%ld", mypid);
00902 
00903   switch (cpid = fork()) {
00904   case -1:
00905     throw runtime_error("[ getChildren() ] Unable to fork()");
00906   case 0:
00907     argv[0] = (char *)"/bin/sh"; argv[1] = (char *)"-c";
00908     sprintf(cmd, "ps --no-headers -A -o ppid,pid > %s && wc -l %s > %s",
00909             children_f, children_f, np_f);
00910     argv[2] = cmd;
00911     /*
00912     argv[2] = "ps --no-headers -eo ppid,pid > /tmp/apmon_children.txt && wc -l /tmp/out_children.txt > /tmp/out_np.txt";
00913     */
00914     argv[3] = 0;
00915     execv("/bin/sh", argv);
00916     exit(RET_ERROR);
00917   default:
00918     if (waitpid(cpid, &status, 0) == -1) {
00919       sprintf(msg, "[ getChildren() ] The number of sub-processes for %ld could not be determined", pid);
00920       unlink(children_f); unlink(np_f);
00921       throw runtime_error(msg); 
00922     }
00923   }
00924 
00925   /* find the number of processes */
00926   pf = fopen(np_f, "rt");
00927   if (pf == NULL) {
00928     unlink(np_f); unlink(children_f);
00929     sprintf(msg, "[ getChildren() ] The number of sub-processes for %ld could not be determined",
00930             pid);
00931     throw runtime_error(msg);
00932   } 
00933   fscanf(pf, "%d", &nProcesses);
00934   fclose(pf);   
00935   unlink(np_f);
00936 
00937   pids = (long *)malloc(nProcesses * sizeof(long)); 
00938   ppids = (long *)malloc(nProcesses * sizeof(long)); 
00939   /* estimated maximum size for the returned vector; it will be realloc'ed */
00940   children = (long *)malloc(nProcesses * sizeof(long));
00941 
00942   pf = fopen(children_f, "rt");
00943   if (pf == NULL) {
00944     free(pids); free(ppids); free(children);
00945     unlink(children_f);
00946     sprintf(msg, "[ getChildren() ] The sub-processes for %ld could not be determined", pid);
00947     throw runtime_error(msg);
00948   } 
00949  
00950   /* scan the output of the ps command and find the children of the process,
00951    and also check if the process is still running */
00952   children[0] = pid; nChildren = 1;
00953   processFound = false;
00954   for (i = 0; i < nProcesses; i++) {
00955     fscanf(pf, "%ld %ld", &ppids[i], &pids[i]);
00956     /* look for the given process */
00957     if (pids[i] == children[0] || ppids[i] == children[0])
00958       processFound = true;
00959     if (ppids[i] == children[0]) {
00960       children[nChildren++] = pids[i];
00961     }
00962   }
00963   fclose(pf);
00964   unlink(children_f);
00965 
00966   if (processFound == false) {
00967     free(pids); free(ppids); free(children);
00968     nChildren = 0;
00969     sprintf(msg, "[ getChildren() ] The process %ld does not exist", pid);
00970     throw runtime_error(msg);
00971   } 
00972 
00973   /* find the PIDs of all the descendant processes */
00974   i = 1;
00975   while (i < nChildren) {
00976     /* find the children of the i-th child */ 
00977     for (j = 0; j < nProcesses; j++) {
00978       if (ppids[j] == children[i]) {
00979         children[nChildren++] = pids[j];
00980       }
00981     }
00982     i++;
00983   }
00984 
00985   sprintf(msg, "Sub-processes for process %ld: ", pid);
00986   for (i = 0; i < nChildren; i++) {
00987     sprintf(sval, "%ld ", children[i]);
00988     if (strlen(msg) + strlen(sval) < MAX_STRING_LEN - 1)
00989       strcat(msg, sval);
00990   }
00991   logger(DEBUG, msg);
00992 
00993   free(pids); free(ppids);
00994   children = (long *)realloc(children, (nChildren) * sizeof(long));
00995   return children;
00996 #endif
00997 }

long apmon_mon_utils::parsePSTime ( char *  s  ) 

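Function that parses a time formatted like "days-hours:min:sec" and returns the corresponding number of seconds. The shorter forms "hours:min:sec" and "min:sec" are also accepted; if the string contains neither '-' nor ':', RET_ERROR is returned.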

Definition at line 1182 of file monitor_utils.cpp.

01182                                          {
01183   long days, hours, mins, secs;
01184 
01185   if (strchr(s, '-') != NULL) {
01186     sscanf(s, "%ld-%ld:%ld:%ld", &days, &hours, &mins, &secs);
01187     return 24 * 3600 * days + 3600 * hours + 60 * mins + secs;
01188   } else {
01189     if (strchr(s, ':') != NULL && strchr(s, ':') !=  strrchr(s, ':')) {
01190        sscanf(s, "%ld:%ld:%ld", &hours, &mins, &secs);
01191        return 3600 * hours + 60 * mins + secs;
01192     } else {
01193       if (strchr(s, ':') != NULL) {
01194         sscanf(s, "%ld:%ld", &mins, &secs);
01195         return 60 * mins + secs;
01196       } else {
01197         return RET_ERROR;
01198       }
01199     }
01200   }
01201 }

void apmon_mon_utils::readJobDiskUsage ( MonitoredJob  job,
JobDirInfo &  info 
) throw (runtime_error)

If a work directory is defined, computes the used space in that directory and the free disk space on the partition to which that directory belongs.

Sizes are given in MB.

Definition at line 1203 of file monitor_utils.cpp.

01204                                                                        {
01205 #ifndef WIN32
01206   int status;
01207   pid_t cpid;
01208   char *cmd, s_tmp[20], *argv[4], msg[100];
01209   FILE *fp;
01210   long mypid = getpid();
01211   char du_f[50], df_f[50]; 
01212 
01213   /* generate names for the temporary files which will hold the output of the
01214      du and df commands */
01215   sprintf(du_f, "/tmp/apmon_du%ld", mypid);
01216   sprintf(df_f, "/tmp/apmon_df%ld", mypid);
01217   
01218   if (strlen(job.workdir) == 0) {
01219     sprintf(msg, "[ readJobDiskUsage() ] The working directory for the job %ld was not specified, not monitoring disk usage", job.pid);
01220     throw runtime_error(msg);
01221   }
01222   
01223   cmd = (char *)malloc((300 + 2 * strlen(job.workdir)) * sizeof(char));
01224   strcpy(cmd, "PRT=`du -Lsk ");
01225   strcat(cmd, job.workdir);
01226   //strcat(cmd, " | tail -1 | cut -f 1 > ");
01227   strcat(cmd, " ` ; if [[ $? -eq 0 ]] ; then OUT=`echo $PRT | cut -f 1` ; echo $OUT ; exit 0 ; else exit -1 ; fi > "); 
01228   strcat(cmd, du_f);
01229 
01230 
01231   switch (cpid = fork()) {
01232   case -1:
01233     sprintf(msg, "[ readJobDiskUsage() ] Unable to fork(). The disk usage information could not be determined for %ld", job.pid);
01234     throw runtime_error(msg);
01235   case 0:
01236     argv[0] = (char *)"/bin/sh"; argv[1] = (char *)"-c";
01237     argv[2] = cmd; argv[3] = 0;
01238     execv("/bin/sh", argv);
01239     exit(RET_ERROR);
01240   default:
01241     if (waitpid(cpid, &status, 0) == -1) {
01242       free(cmd);
01243       sprintf(msg, "[ readJobDiskUsage() ] The disk usage (du) information for %ld could not be determined", job.pid);
01244       unlink(du_f); unlink(df_f);
01245       throw runtime_error(msg); 
01246     }
01247   }
01248 
01249   strcpy(cmd, "PRT=`df -m ");
01250   strcat(cmd, job.workdir);
01251   //strcat(cmd, " | tail -1 > ");
01252   strcat(cmd, " `; if [[ $? -eq 0 ]] ; then OUT=`echo $PRT | cut -d ' ' -f 8-` ; echo $OUT ; exit 0 ; else exit -1 ; fi > ");
01253 
01254   strcat(cmd, df_f);
01255   //printf("### cmd: %s\n", cmd);
01256 
01257   switch (cpid = fork()) {
01258   case -1:
01259     sprintf(msg, "[ readJobDiskUsage() ] Unable to fork(). The disk usage information could not be determined for %ld", job.pid);
01260     throw runtime_error(msg);
01261   case 0:
01262     argv[0] = (char *)"/bin/sh"; argv[1] = (char *)"-c";
01263     argv[2] = cmd; argv[3] = 0;
01264     execv("/bin/sh", argv);
01265     exit(RET_ERROR);
01266   default:
01267     if (waitpid(cpid, &status, 0) == -1) {
01268       free(cmd);
01269       sprintf(msg, "[ readJobDiskUsage() ] The disk usage (df) information for %ld could not be determined", job.pid);
01270       unlink(du_f); unlink(df_f);
01271       throw runtime_error(msg); 
01272     }
01273   }
01274 
01275   free(cmd);
01276   fp = fopen(du_f, "rt");
01277   if (fp == NULL) {
01278     sprintf(msg, "[ readJobDiskUsage() ] Error opening du output file for process %ld", job.pid);
01279     throw runtime_error(msg);
01280   }
01281 
01282   fscanf(fp, "%lf", &(info.workdir_size));
01283   /* keep the directory size in MB */
01284   info.workdir_size /= 1024.0;
01285   fclose(fp);
01286   unlink(du_f);
01287  
01288   fp = fopen(df_f, "rt");
01289   if (fp == NULL) {
01290     sprintf(msg, "[ readJobDiskUsage() ] Error opening df output file for process %ld", job.pid);
01291     throw runtime_error(msg);
01292   }
01293   fscanf(fp, "%s %lf %lf %lf %lf", s_tmp, &(info.disk_total), 
01294          &(info.disk_used), &(info.disk_free), &(info.disk_usage));
01295   fclose(fp);
01296   unlink(df_f);
01297 #endif
01298 }

void apmon_mon_utils::readJobInfo ( long  pid,
PsInfo &  info 
) throw (runtime_error)

Obtains monitoring information for a given job and all its sub-jobs (descendant processes) with the aid of the ps command.

Definition at line 999 of file monitor_utils.cpp.

00999                                                                              {
01000 #ifndef WIN32
01001   long *children;
01002   FILE *fp;
01003   int i, nChildren, status, ch, ret, open_fd;
01004   char *cmd , *mem_cmd_s, *argv[4], *ret_s;
01005   char pid_s[10], msg[100];
01006   char cmdName[MAX_STRING_LEN1], buf[MAX_STRING_LEN1], buf2[MAX_STRING_LEN1];
01007   char etime_s[20], cputime_s[20];
01008   double rsz, vsz;
01009   double etime, cputime;
01010   double pcpu, pmem;
01011   /* this list contains strings of the form "rsz_vsz_command" for every pid;
01012      it is used to avoid adding several times processes that have multiple 
01013      threads and appear in ps as sepparate processes, occupying exactly the 
01014      same amount of memory and having the same command name. For every line 
01015      from the output of the ps command we verify if the rsz_vsz_command 
01016      combination is already in the list.
01017   */
01018   char **mem_cmd_list;
01019   int listSize;
01020   long cpid, crt_pid;
01021   //unsigned int maxCmdLen = 5 * MAX_STRING_LEN;
01022   long mypid = getpid();
01023   char ps_f[50];
01024 
01025   /* get the list of the process' descendants */
01026   children = getChildren(pid, nChildren);
01027 
01028   /* generate a name for the temporary file which holds the output of the 
01029      ps command */
01030   sprintf(ps_f, "/tmp/apmon_ps%ld", mypid);
01031 
01032   unsigned int cmdLen = (150 + 6 * nChildren) * sizeof(char);
01033   cmd = (char *)malloc (cmdLen);
01034 
01035   /* issue the "ps" command to obtain information on all the descendants */
01036   strcpy(cmd, "ps --no-headers --pid ");
01037   for (i = 0; i < nChildren - 1; i++) {
01038     sprintf(pid_s, "%ld,", children[i]);
01039     if (strlen(cmd) + strlen(pid_s) + 1 >= cmdLen) {
01040       free(cmd);
01041       sprintf(msg, "[ readJobInfo() ] Job %ld has too many sub-processes to be monitored",
01042               pid);
01043       throw runtime_error(msg);
01044     }
01045     strcat(cmd, pid_s);
01046     //strcat(cmd, " 2>&1");
01047   }
01048 
01049   /* the last part of the command */
01050   sprintf(pid_s, "%ld", children[nChildren - 1]);
01051   sprintf(cmdName, " -o pid,etime,time,%%cpu,%%mem,rsz,vsz,comm > %s", ps_f);
01052   if (strlen(cmd) + strlen(pid_s) + strlen(cmdName) >= cmdLen) {
01053     free(cmd);
01054     sprintf(msg, "[ readJobInfo() ] Job %ld has too many sub-processes to be monitored",
01055               pid);
01056     throw runtime_error(msg);
01057   }
01058   strcat(cmd, pid_s);
01059   strcat(cmd, cmdName);
01060   //strcat(cmd, " 2>&1");
01061 
01062   switch (cpid = fork()) {
01063   case -1:
01064     free(cmd);
01065     sprintf(msg, "[ readJobInfo() ] Unable to fork(). The job information could not be determined for %ld", pid);
01066     throw runtime_error(msg);
01067   case 0:
01068     argv[0] = (char *)"/bin/sh"; argv[1] = (char *)"-c";
01069     argv[2] = cmd; argv[3] = 0;
01070     execv("/bin/sh", argv);
01071     exit(RET_ERROR);
01072   default:
01073     if (waitpid(cpid, &status, 0) == -1) {
01074       free(cmd);
01075       sprintf(msg, "[ readJobInfo() ] The job information for %ld could not be determined", pid);
01076       throw runtime_error(msg); 
01077     }
01078   }
01079 
01080   free(cmd);
01081   fp = fopen(ps_f, "rt");
01082   if (fp == NULL) {
01083     sprintf(msg, "[ readJobInfo() ] Error opening the ps output file for process %ld", pid);
01084     throw runtime_error(msg);
01085   }
01086 
01087   /* parse the output file */
01088   info.etime = info.cputime = 0;
01089   info.pcpu = info.pmem = 0;
01090   info.rsz = info.vsz = 0;
01091   info.open_fd = 0;
01092   mem_cmd_list = (char **)malloc(nChildren * sizeof(char *));
01093   listSize = 0;
01094   cmdName[0] = 0;
01095   while (1) {
01096     ret_s = fgets(buf, MAX_STRING_LEN, fp);
01097     if (ret_s == NULL) 
01098       break;
01099     buf[MAX_STRING_LEN - 1] = 0;
01100 
01101     /* if the line was too long and fgets hasn't read it entirely, */
01102     /* keep only the first 512 chars from the line */
01103     ch = fgetc(fp); // see if we are at the end of the file
01104     ungetc(ch, fp);
01105     if (buf[strlen(buf) - 1] != 10 && ch != EOF) { 
01106       while (1) {
01107         char *sret = fgets(buf2, MAX_STRING_LEN, fp);
01108         if (sret == NULL || buf[strlen(buf) - 1] == 10)
01109           break;
01110       }
01111     }
01112 
01113     ret = sscanf(buf, "%ld %s %s %lf %lf %lf %lf %s", &crt_pid, etime_s, 
01114                  cputime_s, &pcpu, &pmem, &rsz, &vsz, cmdName);
01115     if (ret != 8) {
01116       fclose(fp);
01117       unlink(ps_f);
01118       free(children);
01119       for (i = 0; i < listSize; i++) {
01120         free(mem_cmd_list[i]);
01121       }
01122       free(mem_cmd_list);
01123       throw runtime_error("[ readJobInfo() ] Error parsing the output of the ps command");
01124     }
01125 
01126     /* etime is the maximum of the elapsed times for the subprocesses */
01127     etime = parsePSTime(etime_s);
01128     info.etime = (info.etime > etime) ? info.etime : etime;
01129 
01130     /* cputime is the sum of the cpu times for the subprocesses */
01131     cputime = parsePSTime(cputime_s);
01132     info.cputime += cputime;
01133     info.pcpu += pcpu;
01134 
01135     /* get the number of opened file descriptors */
01136     try {
01137       open_fd = ProcUtils::countOpenFiles(crt_pid);
01138     } catch (procutils_error& err) {
01139       logger(WARNING, err.what());
01140       /* don't throw an exception if we couldn't read the number of files */
01141       open_fd = PROCUTILS_ERROR;
01142     }
01143 
01144     /* see if this is a process or just a thread */
01145     mem_cmd_s = (char *)malloc(MAX_STRING_LEN * sizeof(char));
01146     sprintf(mem_cmd_s, "%f_%f_%s", rsz, vsz, cmdName);
01147     //printf("### mem_cmd_s: %s\n", mem_cmd_s);
01148     if (getVectIndex(mem_cmd_s, mem_cmd_list, listSize) == -1) {
01149       /* aonther pid with the same command name, rsz and vsz was not found,
01150          so this is a new process and we can add the amount of memory used by 
01151          it */
01152       info.pmem += pmem;
01153       info.vsz += vsz; info.rsz += rsz;
01154 
01155       if (info.open_fd >= 0) // if no error occured so far
01156         info.open_fd += open_fd;
01157       /* add an entry in the list so that next time we see another thread of
01158          this process we don't add the amount of  memory again */
01159       mem_cmd_list[listSize++] = mem_cmd_s;     
01160     } else {
01161       free(mem_cmd_s);
01162     }
01163 
01164     /* if we monitor the current process, we have two extra opened files
01165        that we shouldn't take into account (the output file for ps and
01166        /proc/<pid>/fd/)
01167     */
01168     if (crt_pid == getpid())
01169       info.open_fd -= 2;
01170   } 
01171 
01172   fclose(fp);
01173   unlink(ps_f);
01174   free(children);
01175   for (i = 0; i < listSize; i++) {
01176     free(mem_cmd_list[i]);
01177   }
01178   free(mem_cmd_list);
01179 #endif
01180 }


Generated at Wed Mar 17 18:21:04 2010 for Gaudi Framework, version v21r8 by Doxygen version 1.5.6 written by Dimitri van Heesch, © 1997-2004