Gaudi Framework, version v20r3

Generated: 24 Nov 2008

monitor_utils.cpp

Go to the documentation of this file.
00001 
00007 /*
00008  * ApMon - Application Monitoring Tool
00009  * Version: 2.2.0
00010  *
00011  * Copyright (C) 2006 California Institute of Technology
00012  *
00013  * Permission is hereby granted, free of charge, to use, copy and modify 
00014  * this software and its documentation (the "Software") for any
00015  * purpose, provided that existing copyright notices are retained in 
00016  * all copies and that this notice is included verbatim in any distributions
00017  * or substantial portions of the Software. 
00018  * This software is a part of the MonALISA framework (http://monalisa.cacr.caltech.edu).
00019  * Users of the Software are asked to feed back problems, benefits,
00020  * and/or suggestions about the software to the MonALISA Development Team
00021  * (developers@monalisa.cern.ch). Support for this software - fixing of bugs,
00022  * incorporation of new features - is done on a best effort basis. All bug
00023  * fixes and enhancements will be made available under the same terms and
00024  * conditions as the original software,
00025 
00026  * IN NO EVENT SHALL THE AUTHORS OR DISTRIBUTORS BE LIABLE TO ANY PARTY FOR
00027  * DIRECT, INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES ARISING OUT
00028  * OF THE USE OF THIS SOFTWARE, ITS DOCUMENTATION, OR ANY DERIVATIVES THEREOF,
00029  * EVEN IF THE AUTHORS HAVE BEEN ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
00030 
00031  * THE AUTHORS AND DISTRIBUTORS SPECIFICALLY DISCLAIM ANY WARRANTIES,
00032  * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY,
00033  * FITNESS FOR A PARTICULAR PURPOSE, AND NON-INFRINGEMENT. THIS SOFTWARE IS
00034  * PROVIDED ON AN "AS IS" BASIS, AND THE AUTHORS AND DISTRIBUTORS HAVE NO
00035  * OBLIGATION TO PROVIDE MAINTENANCE, SUPPORT, UPDATES, ENHANCEMENTS, OR
00036  * MODIFICATIONS.
00037  */
00038 
00039 #include "ApMon.h"
00040 #include "monitor_utils.h"
00041 #include "proc_utils.h"
00042 #include "utils.h"
00043 #include "mon_constants.h"
00044 
00045 using namespace apmon_utils;
00046 using namespace apmon_mon_utils;
00047 
00048 void ApMon::sendJobInfo() {
00049 #ifndef WIN32
00050   int i;
00051   long crtTime;
00052 
00053  /* the apMon_free() function calls sendJobInfo() from another thread and 
00054      we need mutual exclusion */
00055   pthread_mutex_lock(&mutexBack);
00056 
00057   if (nMonJobs == 0) {
00058     logger(WARNING, "There are no jobs to be monitored, not sending job monitoring information.");
00059     pthread_mutex_unlock(&mutexBack);
00060     return;
00061   }
00062 
00063   crtTime = time(NULL);
00064   logger(INFO, "Sending job monitoring information...");
00065   lastJobInfoSend = (time_t)crtTime; 
00066 
00067   /* send monitoring information for all the jobs specified by the user */
00068   for (i = 0; i < nMonJobs; i++) 
00069     sendOneJobInfo(monJobs[i]);
00070 
00071   pthread_mutex_unlock(&mutexBack);
00072 #endif
00073 }
00074 
00075 void ApMon::updateJobInfo(MonitoredJob job) {
00076   bool needJobInfo, needDiskInfo;
00077   bool jobExists = true;
00078   char err_msg[200];
00079 
00080   PsInfo jobInfo;
00081   JobDirInfo dirInfo;
00082 
00083   /**** runtime, CPU & memory usage information ****/ 
00084   needJobInfo = actJobMonitorParams[JOB_RUN_TIME] 
00085     || actJobMonitorParams[JOB_CPU_TIME] 
00086     || actJobMonitorParams[JOB_CPU_USAGE] 
00087     || actJobMonitorParams[JOB_MEM_USAGE] 
00088     || actJobMonitorParams[JOB_VIRTUALMEM] 
00089     || actJobMonitorParams[JOB_RSS] 
00090     || actJobMonitorParams[JOB_OPEN_FILES];
00091   if (needJobInfo) {
00092     try {
00093       readJobInfo(job.pid, jobInfo);
00094       currentJobVals[JOB_RUN_TIME] = jobInfo.etime;
00095       currentJobVals[JOB_CPU_TIME] = jobInfo.cputime; 
00096       currentJobVals[JOB_CPU_USAGE] = jobInfo.pcpu;
00097       currentJobVals[JOB_MEM_USAGE] = jobInfo.pmem; 
00098       currentJobVals[JOB_VIRTUALMEM] = jobInfo.vsz;
00099       currentJobVals[JOB_RSS] = jobInfo.rsz;
00100 
00101       if (jobInfo.open_fd < 0)
00102         jobRetResults[JOB_OPEN_FILES] = RET_ERROR;
00103       currentJobVals[JOB_OPEN_FILES] = jobInfo.open_fd;
00104 
00105     } catch (runtime_error &err) {
00106       logger(WARNING, err.what());
00107       jobRetResults[JOB_RUN_TIME] = jobRetResults[JOB_CPU_TIME] = 
00108         jobRetResults[JOB_CPU_USAGE] = jobRetResults[JOB_MEM_USAGE] =
00109         jobRetResults[JOB_VIRTUALMEM] = jobRetResults[JOB_RSS] =
00110         jobRetResults[JOB_OPEN_FILES] = RET_ERROR;
00111       strcpy(err_msg, err.what());
00112       if (strstr(err_msg, "does not exist") != NULL)
00113         jobExists = false;
00114     } 
00115   }
00116 
00117   /* if the monitored job has terminated, remove it */
00118   if (!jobExists) {
00119     try {
00120       removeJobToMonitor(job.pid);
00121     } catch (runtime_error &err) {
00122       logger(WARNING, err.what());
00123     }
00124     return;
00125   }
00126 
00127   /* disk usage information */
00128   needDiskInfo = actJobMonitorParams[JOB_DISK_TOTAL] 
00129     || actJobMonitorParams[JOB_DISK_USED] 
00130     || actJobMonitorParams[JOB_DISK_FREE] 
00131     || actJobMonitorParams[JOB_DISK_USAGE] 
00132     || actJobMonitorParams[JOB_WORKDIR_SIZE];
00133   if (needDiskInfo) {
00134     try {
00135       readJobDiskUsage(job, dirInfo);
00136       currentJobVals[JOB_WORKDIR_SIZE] = dirInfo.workdir_size;
00137       currentJobVals[JOB_DISK_TOTAL] = dirInfo.disk_total; 
00138       currentJobVals[JOB_DISK_USED] = dirInfo.disk_used;
00139       currentJobVals[JOB_DISK_USAGE] = dirInfo.disk_usage; 
00140       currentJobVals[JOB_DISK_FREE] = dirInfo.disk_free;
00141     } catch (runtime_error& err) {
00142       logger(WARNING, err.what());
00143       jobRetResults[JOB_WORKDIR_SIZE] = jobRetResults[JOB_DISK_TOTAL] 
00144         = jobRetResults[JOB_DISK_USED] 
00145         = jobRetResults[JOB_DISK_USAGE] 
00146         = jobRetResults[JOB_DISK_FREE] 
00147         = RET_ERROR;
00148     }
00149   }
00150 }
00151  
00152 void ApMon::sendOneJobInfo(MonitoredJob job) {
00153   int i;
00154   int nParams = 0;
00155 
00156   char **paramNames, **paramValues;
00157   int *valueTypes;
00158 
00159   valueTypes = (int *)malloc(nJobMonitorParams * sizeof(int));
00160   paramNames = (char **)malloc(nJobMonitorParams * sizeof(char *));
00161   paramValues = (char **)malloc(nJobMonitorParams * sizeof(char *));
00162 
00163   for (i = 0; i < nJobMonitorParams; i++) {
00164     jobRetResults[i] = RET_SUCCESS;
00165     currentJobVals[i] = 0;
00166   }
00167     
00168   updateJobInfo(job);
00169 
00170   for (i = 0; i < nJobMonitorParams; i++) {
00171     if (actJobMonitorParams[i] && jobRetResults[i] != RET_ERROR) {
00172      
00173       paramNames[nParams] = jobMonitorParams[i];
00174       paramValues[nParams] = (char *)&currentJobVals[i];
00175       valueTypes[nParams] = XDR_REAL64;
00176       nParams++;
00177     } 
00178     /* don't disable the parameter (maybe for another job it can be
00179          obtained) */
00180       /*
00181         else
00182         if (autoDisableMonitoring)
00183         actJobMonitorParams[ind] = 0;
00184       */
00185   }
00186 
00187   if (nParams == 0) {
00188     free(paramNames); free(valueTypes);
00189     free(paramValues);
00190     return;
00191   }
00192 
00193   try {
00194     if (nParams > 0)
00195       sendParameters(job.clusterName, job.nodeName, nParams, 
00196                      paramNames, valueTypes, paramValues);
00197   } catch (runtime_error& err) {
00198     logger(WARNING, err.what());
00199   }
00200 
00201   free(paramNames);
00202   free(valueTypes);
00203   free(paramValues);
00204 }
00205 
00206 
00207 void ApMon::updateSysInfo() {
00208   int needCPUInfo, needSwapPagesInfo, needLoadInfo, needMemInfo,
00209     needNetInfo, needUptime, needProcessesInfo, needNetstatInfo; 
00210  
00211   /**** CPU usage information ****/ 
00212   needCPUInfo = actSysMonitorParams[SYS_CPU_USAGE] 
00213     || actSysMonitorParams[SYS_CPU_USR] 
00214     || actSysMonitorParams[SYS_CPU_SYS] 
00215     || actSysMonitorParams[SYS_CPU_NICE] 
00216     || actSysMonitorParams[SYS_CPU_IDLE];
00217   if (needCPUInfo) {
00218     try {
00219       ProcUtils::getCPUUsage(*this, currentSysVals[SYS_CPU_USAGE], 
00220                              currentSysVals[SYS_CPU_USR], 
00221                              currentSysVals[SYS_CPU_SYS],
00222                              currentSysVals[SYS_CPU_NICE], 
00223                              currentSysVals[SYS_CPU_IDLE], numCPUs);
00224     } catch (procutils_error &perr) {
00225       /* "permanent" error (the parameters could not be obtained) */
00226       logger(WARNING, perr.what());
00227       sysRetResults[SYS_CPU_USAGE] = sysRetResults[SYS_CPU_SYS] = 
00228         sysRetResults[SYS_CPU_USR] = sysRetResults[SYS_CPU_NICE] =
00229         sysRetResults[SYS_CPU_IDLE] = sysRetResults[SYS_CPU_USAGE] = PROCUTILS_ERROR;
00230     } catch (runtime_error &err) {
00231       /* temporary error (next time we might be able to get the paramerers) */
00232       logger(WARNING, err.what());
00233       sysRetResults[SYS_CPU_USAGE] = sysRetResults[SYS_CPU_SYS] 
00234         = sysRetResults[SYS_CPU_USR] 
00235         = sysRetResults[SYS_CPU_NICE] 
00236         = sysRetResults[SYS_CPU_IDLE] 
00237         = sysRetResults[SYS_CPU_USAGE] 
00238         = RET_ERROR;
00239     }
00240   }
00241 
00242   needSwapPagesInfo = actSysMonitorParams[SYS_PAGES_IN] 
00243     || actSysMonitorParams[SYS_PAGES_OUT] 
00244     || actSysMonitorParams[SYS_SWAP_IN] 
00245     || actSysMonitorParams[SYS_SWAP_OUT];
00246 
00247   if (needSwapPagesInfo) {
00248     try {
00249       ProcUtils::getSwapPages(*this, currentSysVals[SYS_PAGES_IN], 
00250                               currentSysVals[SYS_PAGES_OUT], 
00251                               currentSysVals[SYS_SWAP_IN],
00252                               currentSysVals[SYS_SWAP_OUT]);
00253     } catch (procutils_error &perr) {
00254       /* "permanent" error (the parameters could not be obtained) */
00255       logger(WARNING, perr.what());
00256       sysRetResults[SYS_PAGES_IN] = sysRetResults[SYS_PAGES_OUT] = 
00257       sysRetResults[SYS_SWAP_OUT] = sysRetResults[SYS_SWAP_IN] = PROCUTILS_ERROR;
00258     } catch (runtime_error &err) {
00259       /* temporary error (next time we might be able to get the paramerers) */
00260       logger(WARNING, err.what());
00261       sysRetResults[SYS_PAGES_IN] = sysRetResults[SYS_PAGES_OUT] 
00262         = sysRetResults[SYS_SWAP_IN] 
00263         = sysRetResults[SYS_SWAP_OUT] 
00264         = RET_ERROR;
00265     }
00266   }
00267 
00268   needLoadInfo = actSysMonitorParams[SYS_LOAD1] 
00269     || actSysMonitorParams[SYS_LOAD5] 
00270     || actSysMonitorParams[SYS_LOAD15];
00271     
00272   if (needLoadInfo) {
00273     double dummyVal;
00274     try {
00275       /* the number of processes is now obtained with the getProcesses()
00276          function, not with getLoad() */
00277       ProcUtils::getLoad(currentSysVals[SYS_LOAD1], currentSysVals[SYS_LOAD5], 
00278                          currentSysVals[SYS_LOAD15],dummyVal);
00279     } catch (procutils_error& perr) {
00280       /* "permanent" error (the parameters could not be obtained) */
00281       logger(WARNING, perr.what());
00282       sysRetResults[SYS_LOAD1] = sysRetResults[SYS_LOAD5] 
00283         = sysRetResults[SYS_LOAD15] 
00284         = PROCUTILS_ERROR;
00285     }
00286   }
00287 
00288   /**** get statistics about the current processes ****/
00289   needProcessesInfo = actSysMonitorParams[SYS_PROCESSES];
00290   if (needProcessesInfo) {
00291     try {
00292       ProcUtils::getProcesses(currentSysVals[SYS_PROCESSES], 
00293                               currentProcessStates);
00294     } catch (runtime_error& err) {
00295       logger(WARNING, err.what());
00296       sysRetResults[SYS_PROCESSES] = RET_ERROR;
00297     }
00298   }
00299 
00300   /**** get the amount of memory currently in use ****/
00301   needMemInfo = actSysMonitorParams[SYS_MEM_USED] 
00302     || actSysMonitorParams[SYS_MEM_FREE] 
00303     || actSysMonitorParams[SYS_SWAP_USED] 
00304     || actSysMonitorParams[SYS_SWAP_FREE] 
00305     || actSysMonitorParams[SYS_MEM_USAGE] 
00306     || actSysMonitorParams[SYS_SWAP_USAGE];
00307 
00308   if (needMemInfo) {
00309     try {
00310       ProcUtils::getMemUsed(currentSysVals[SYS_MEM_USED], 
00311                             currentSysVals[SYS_MEM_FREE], 
00312                             currentSysVals[SYS_SWAP_USED],
00313                             currentSysVals[SYS_SWAP_FREE]);
00314       currentSysVals[SYS_MEM_USAGE] = 100 * currentSysVals[SYS_MEM_USED] /
00315         (currentSysVals[SYS_MEM_USED] +  currentSysVals[SYS_MEM_FREE]); 
00316       currentSysVals[SYS_SWAP_USAGE] = 100 * currentSysVals[SYS_SWAP_USED] /
00317         (currentSysVals[SYS_SWAP_USED] +  currentSysVals[SYS_SWAP_FREE]); 
00318     } catch (procutils_error &perr) {
00319       logger(WARNING, perr.what());
00320       sysRetResults[SYS_MEM_USED] = sysRetResults[SYS_MEM_FREE] = 
00321         sysRetResults[SYS_SWAP_USED] = sysRetResults[SYS_SWAP_FREE] = 
00322         sysRetResults[SYS_MEM_USAGE] = sysRetResults[SYS_SWAP_USAGE] = 
00323         PROCUTILS_ERROR;
00324     }
00325   }
00326 
00327   
00328   /**** network monitoring information ****/
00329   needNetInfo = actSysMonitorParams[SYS_NET_IN] || 
00330     actSysMonitorParams[SYS_NET_OUT] || actSysMonitorParams[SYS_NET_ERRS];
00331   if (needNetInfo && this -> nInterfaces > 0) {
00332     try {
00333       ProcUtils::getNetInfo(*this, &currentNetIn, &currentNetOut, 
00334                             &currentNetErrs);
00335     } catch (procutils_error &perr) {
00336       logger(WARNING, perr.what());
00337       sysRetResults[SYS_NET_IN] = sysRetResults[SYS_NET_OUT] = 
00338         sysRetResults[SYS_NET_ERRS] = PROCUTILS_ERROR;     
00339     } catch (runtime_error &err) {
00340       logger(WARNING, err.what());
00341       sysRetResults[SYS_NET_IN] = sysRetResults[SYS_NET_OUT] = 
00342         sysRetResults[SYS_NET_ERRS] = RET_ERROR; 
00343     }
00344   }
00345 
00346   needNetstatInfo = actSysMonitorParams[SYS_NET_SOCKETS] || 
00347     actSysMonitorParams[SYS_NET_TCP_DETAILS];
00348   if (needNetstatInfo) {
00349     try {
00350       ProcUtils::getNetstatInfo(*this, this -> currentNSockets, 
00351                                 this -> currentSocketsTCP); 
00352     } catch (runtime_error &err) {
00353       logger(WARNING, err.what());
00354       sysRetResults[SYS_NET_SOCKETS] = sysRetResults[SYS_NET_TCP_DETAILS] = 
00355         RET_ERROR; 
00356     }
00357   }
00358 
00359   needUptime = actSysMonitorParams[SYS_UPTIME];
00360   if (needUptime) {
00361     try {
00362       currentSysVals[SYS_UPTIME] = ProcUtils::getUpTime();
00363     } catch (procutils_error &perr) {
00364       logger(WARNING, perr.what());
00365       sysRetResults[SYS_UPTIME] = PROCUTILS_ERROR;
00366     } 
00367   }
00368 
00369 }
00370 
00371 void ApMon::sendSysInfo() {
00372 #ifndef WIN32
00373   int nParams = 0, maxNParams;
00374   int i;
00375   long crtTime;
00376 
00377   int *valueTypes;
00378   char **paramNames, **paramValues;
00379 
00380   crtTime = time(NULL);
00381   logger(INFO, "Sending system monitoring information...");
00382 
00383   /* make some initializations only the first time this
00384      function is called */
00385   if (this -> sysInfo_first) {
00386     for (i = 0; i < this -> nInterfaces; i++) {
00387      this -> lastBytesSent[i] = this -> lastBytesReceived[i] = 0.0;
00388      this -> lastNetErrs[i] = 0;
00389      
00390     }
00391     this -> sysInfo_first = FALSE;
00392   }
00393 
00394   /* the maximum number of parameters that can be included in a datagram */
00395   /* (the last three terms are for: parameters corresponding to each possible
00396      state of the processes, parameters corresponding to the types of open 
00397      sockets, parameters corresponding to each possible state of the TCP
00398      sockets.) */
00399   maxNParams = nSysMonitorParams + (2 * nInterfaces - 1) + 15 + 4 + 
00400     N_TCP_STATES;
00401 
00402   valueTypes = (int *)malloc(maxNParams * sizeof(int));
00403   paramNames = (char **)malloc(maxNParams * sizeof(char *));
00404   paramValues = (char **)malloc(maxNParams * sizeof(char *));
00405 
00406   for (i = 0; i < nSysMonitorParams; i++) {
00407     if (actSysMonitorParams[i] > 0) /* if the parameter is enabled */
00408       sysRetResults[i] = RET_SUCCESS;
00409     else /* mark it with RET_ERROR so that it will be not included in the
00410             datagram */
00411       sysRetResults[i] = RET_ERROR;
00412   }
00413 
00414   updateSysInfo();
00415 
00416   for (i = 0; i < nSysMonitorParams; i++) {
00417     if (i == SYS_NET_IN || i == SYS_NET_OUT || i == SYS_NET_ERRS ||
00418         i == SYS_NET_SOCKETS || i == SYS_NET_TCP_DETAILS || i == SYS_PROCESSES)
00419       continue;
00420 
00421     if (sysRetResults[i] == PROCUTILS_ERROR) {
00422       /* could not read the requested information from /proc, disable this
00423          parameter */
00424       if (autoDisableMonitoring)
00425         actSysMonitorParams[i] = 0;
00426     } else if (sysRetResults[i] != RET_ERROR) {
00427       /* the parameter is enabled and there were no errors obtaining it */
00428       paramNames[nParams] = strdup(sysMonitorParams[i]);
00429       paramValues[nParams] = (char *)&currentSysVals[i];
00430       valueTypes[nParams] = XDR_REAL64;
00431       nParams++;
00432     } 
00433   }
00434 
00435   if (actSysMonitorParams[SYS_NET_IN] == 1) {
00436     if (sysRetResults[SYS_NET_IN] == PROCUTILS_ERROR) {
00437       if (autoDisableMonitoring)
00438         actSysMonitorParams[SYS_NET_IN] = 0;
00439     } else  if (sysRetResults[SYS_NET_IN] != RET_ERROR) {
00440       for (i = 0; i < nInterfaces; i++) { 
00441         paramNames[nParams] =  (char *)malloc(20 * sizeof(char));
00442         strcpy(paramNames[nParams], interfaceNames[i]);
00443         strcat(paramNames[nParams], "_in");
00444         paramValues[nParams] = (char *)&currentNetIn[i];
00445         valueTypes[nParams] = XDR_REAL64;
00446         nParams++;
00447       }
00448     }
00449   }
00450 
00451   if (actSysMonitorParams[SYS_NET_OUT] == 1) {
00452     if (sysRetResults[SYS_NET_IN] == PROCUTILS_ERROR) {
00453       if (autoDisableMonitoring)
00454         actSysMonitorParams[SYS_NET_OUT] = 0;
00455     } else  if (sysRetResults[SYS_NET_OUT] != RET_ERROR) {
00456       for (i = 0; i < nInterfaces; i++) { 
00457         paramNames[nParams] =  (char *)malloc(20 * sizeof(char));
00458         strcpy(paramNames[nParams], interfaceNames[i]);
00459         strcat(paramNames[nParams], "_out");
00460         paramValues[nParams] = (char *)&currentNetOut[i];
00461         valueTypes[nParams] = XDR_REAL64;
00462         nParams++;
00463       }
00464     }
00465   }
00466 
00467   if (actSysMonitorParams[SYS_NET_ERRS] == 1) {
00468     if (sysRetResults[SYS_NET_ERRS] == PROCUTILS_ERROR) {
00469       if (autoDisableMonitoring)
00470         actSysMonitorParams[SYS_NET_ERRS] = 0;
00471     } else  if (sysRetResults[SYS_NET_ERRS] != RET_ERROR) {
00472       for (i = 0; i < nInterfaces; i++) { 
00473         paramNames[nParams] =  (char *)malloc(20 * sizeof(char));
00474         strcpy(paramNames[nParams], interfaceNames[i]);
00475         strcat(paramNames[nParams], "_errs");
00476         paramValues[nParams] = (char *)&currentNetErrs[i];
00477         valueTypes[nParams] = XDR_REAL64;
00478         nParams++;
00479       }
00480     }
00481   }
00482 
00483 
00484   if (actSysMonitorParams[SYS_PROCESSES] == 1) {
00485     if (sysRetResults[SYS_PROCESSES] != RET_ERROR) {
00486       char act_states[] = {'D', 'R', 'S', 'T', 'Z'};
00487       for (i = 0; i < 5; i++) { 
00488         paramNames[nParams] =  (char *)malloc(20 * sizeof(char));
00489         sprintf(paramNames[nParams], "processes_%c", act_states[i]);
00490         paramValues[nParams] = (char *)&currentProcessStates[act_states[i] - 65];
00491         valueTypes[nParams] = XDR_REAL64;
00492         nParams++;
00493       }
00494     }
00495   }
00496 
00497   if (actSysMonitorParams[SYS_NET_SOCKETS] == 1) {
00498     if (sysRetResults[SYS_NET_SOCKETS] != RET_ERROR) {
00499       const char *socket_types[] = {"tcp", "udp", "icm", "unix"};
00500       for (i = 0; i < 4; i++) { 
00501         paramNames[nParams] =  (char *)malloc(30 * sizeof(char));
00502         sprintf(paramNames[nParams], "sockets_%s", socket_types[i]);
00503         paramValues[nParams] = (char *)&currentNSockets[i];
00504         valueTypes[nParams] = XDR_REAL64;
00505         nParams++;
00506       }
00507     }
00508   }
00509 
00510   if (actSysMonitorParams[SYS_NET_TCP_DETAILS] == 1) {
00511     if (sysRetResults[SYS_NET_TCP_DETAILS] != RET_ERROR) {
00512       for (i = 0; i < N_TCP_STATES; i++) { 
00513         paramNames[nParams] =  (char *)malloc(30 * sizeof(char));
00514         sprintf(paramNames[nParams], "sockets_tcp_%s", socketStatesMapTCP[i]);
00515         paramValues[nParams] = (char *)&currentSocketsTCP[i];
00516         valueTypes[nParams] = XDR_REAL64;
00517         nParams++;
00518       }
00519     }
00520   }
00521 
00522   try {
00523     if (nParams > 0)
00524       sendParameters(sysMonCluster, sysMonNode, nParams, 
00525                      paramNames, valueTypes, paramValues);
00526   } catch (runtime_error& err) {
00527     logger(WARNING, err.what());
00528   }
00529 
00530   this -> lastSysInfoSend = crtTime;
00531 
00532   if (sysRetResults[SYS_NET_IN] == RET_SUCCESS) {
00533     free(currentNetIn);
00534     free(currentNetOut);
00535     free(currentNetErrs);
00536   }
00537 
00538   for (i = 0; i < nParams; i++)
00539     free(paramNames[i]);
00540   free(paramNames);
00541   free(valueTypes);
00542   free(paramValues);
00543 #endif
00544 }
00545 
00546 void ApMon::updateGeneralInfo() {
00547 
00548   strcpy(cpuVendor, ""); strcpy(cpuFamily, "");
00549   strcpy(cpuModel, ""); strcpy(cpuModelName, "");
00550 
00551   if (actGenMonitorParams[GEN_CPU_MHZ] == 1 || 
00552       actGenMonitorParams[GEN_BOGOMIPS] == 1 || 
00553       actGenMonitorParams[GEN_CPU_VENDOR_ID] == 1 ||
00554       actGenMonitorParams[GEN_CPU_FAMILY] == 1 || 
00555       actGenMonitorParams[GEN_CPU_MODEL] == 1 ||
00556       actGenMonitorParams[GEN_CPU_MODEL_NAME] == 1) {
00557     try {
00558       ProcUtils::getCPUInfo(*this);
00559     } catch (procutils_error& err) {
00560       logger(WARNING, err.what());
00561       genRetResults[GEN_CPU_MHZ] = genRetResults[GEN_BOGOMIPS] = PROCUTILS_ERROR;
00562     }
00563   }
00564 
00565   if (actGenMonitorParams[GEN_TOTAL_MEM] == 1 || 
00566       actGenMonitorParams[GEN_TOTAL_SWAP] == 1) {
00567     try {
00568       ProcUtils::getSysMem(currentGenVals[GEN_TOTAL_MEM], 
00569                            currentGenVals[GEN_TOTAL_SWAP]);
00570     } catch (procutils_error& perr) {
00571       logger(WARNING, perr.what());
00572       genRetResults[GEN_TOTAL_MEM] = genRetResults[GEN_TOTAL_SWAP] = PROCUTILS_ERROR;
00573     }
00574   }
00575 
00576   if (this -> numCPUs > 0)
00577     currentGenVals[GEN_NO_CPUS] = this -> numCPUs;
00578   else
00579     genRetResults[GEN_NO_CPUS] = PROCUTILS_ERROR;
00580 }
00581 
00582 void ApMon::sendGeneralInfo() {
00583 #ifndef WIN32
00584   int nParams, maxNParams, i;
00585   long crtTime;
00586   char tmp_s[50];
00587   
00588   char **paramNames, **paramValues;
00589   int *valueTypes;
00590 
00591   crtTime = time(NULL);
00592   logger(INFO, "Sending general monitoring information...");
00593   
00594   maxNParams = nGenMonitorParams + numIPs;
00595   valueTypes = (int *)malloc(maxNParams * sizeof(int));
00596   paramNames = (char **)malloc(maxNParams * sizeof(char *));
00597   paramValues = (char **)malloc(maxNParams * sizeof(char *));
00598   
00599   nParams = 0;
00600 
00601   updateGeneralInfo();
00602 
00603   if (actGenMonitorParams[GEN_HOSTNAME]) {
00604     paramNames[nParams] = strdup(genMonitorParams[GEN_HOSTNAME]);
00605     valueTypes[nParams] = XDR_STRING;
00606     paramValues[nParams] = myHostname;
00607     nParams++;
00608   }
00609 
00610   if (actGenMonitorParams[GEN_IP]) {
00611     for (i = 0; i < this -> numIPs; i++) {
00612       strcpy(tmp_s, "ip_");
00613       strcat(tmp_s, interfaceNames[i]);
00614       paramNames[nParams] = strdup(tmp_s);
00615       valueTypes[nParams] = XDR_STRING;
00616       paramValues[nParams] = this -> allMyIPs[i];
00617       nParams++;
00618     }
00619   }
00620 
00621   if (actGenMonitorParams[GEN_CPU_VENDOR_ID] && strlen(cpuVendor) != 0) {
00622     paramNames[nParams] = strdup(genMonitorParams[GEN_CPU_VENDOR_ID]);
00623     valueTypes[nParams] = XDR_STRING;
00624     paramValues[nParams] = cpuVendor;
00625     nParams++;
00626   }
00627 
00628   if (actGenMonitorParams[GEN_CPU_FAMILY] && strlen(cpuFamily) != 0) {
00629     paramNames[nParams] = strdup(genMonitorParams[GEN_CPU_FAMILY]);
00630     valueTypes[nParams] = XDR_STRING;
00631     paramValues[nParams] = cpuFamily;
00632     nParams++;
00633   }
00634 
00635   if (actGenMonitorParams[GEN_CPU_MODEL] && strlen(cpuModel) != 0) {
00636     paramNames[nParams] = strdup(genMonitorParams[GEN_CPU_MODEL]);
00637     valueTypes[nParams] = XDR_STRING;
00638     paramValues[nParams] = cpuModel;
00639     nParams++;
00640   }
00641   
00642   if (actGenMonitorParams[GEN_CPU_MODEL_NAME] && strlen(cpuModelName) != 0) {
00643     paramNames[nParams] = strdup(genMonitorParams[GEN_CPU_MODEL_NAME]);
00644     valueTypes[nParams] = XDR_STRING;
00645     paramValues[nParams] = cpuModelName;
00646     nParams++;
00647   }
00648 
00649   for (i = 0; i < nGenMonitorParams; i++) {
00650     if (actGenMonitorParams[i] != 1 || i == GEN_IP || i == GEN_HOSTNAME ||
00651         i == GEN_CPU_VENDOR_ID || i == GEN_CPU_FAMILY || i == GEN_CPU_MODEL
00652         || i == GEN_CPU_MODEL_NAME)
00653       continue;
00654 
00655     if (genRetResults[i] == PROCUTILS_ERROR) {
00656       /* could not read the requested information from /proc, disable this
00657          parameter */
00658       if (autoDisableMonitoring)
00659         actGenMonitorParams[i] = 0;
00660     } else if (genRetResults[i] != RET_ERROR) {
00661       paramNames[nParams] = strdup(genMonitorParams[i]);
00662       paramValues[nParams] = (char *)&currentGenVals[i];
00663       valueTypes[nParams] = XDR_REAL64;
00664       nParams++;
00665     } 
00666   }
00667 
00668   try {
00669     if (nParams > 0)
00670       sendParameters(sysMonCluster, sysMonNode, nParams, 
00671                      paramNames, valueTypes, paramValues);
00672   } catch (runtime_error& err) {
00673     logger(WARNING, err.what());
00674   }
00675 
00676   for (i = 0; i < nParams; i++)
00677     free(paramNames[i]);
00678   free(paramNames);
00679   free(valueTypes);
00680   free(paramValues);
00681 #endif
00682 }
00683 
00684 void ApMon::initMonitoring() {
00685   int i;
00686 
00687   this -> autoDisableMonitoring = true;
00688   this -> sysMonitoring = false;
00689   this -> jobMonitoring = false;
00690   this -> genMonitoring = false;
00691   this -> confCheck = false;
00692 
00693 #ifndef WIN32
00694   pthread_mutex_init(&this -> mutex, NULL);
00695   pthread_mutex_init(&this -> mutexBack, NULL);
00696   pthread_mutex_init(&this -> mutexCond, NULL);
00697   pthread_cond_init(&this -> confChangedCond, NULL);
00698 #else
00699   logger(INFO, "init mutexes...");
00700   this -> mutex     = CreateMutex(NULL, FALSE, NULL);
00701   this -> mutexBack = CreateMutex(NULL, FALSE, NULL);
00702   this -> mutexCond = CreateMutex(NULL, FALSE, NULL);
00703   this -> confChangedCond = CreateEvent(NULL, FALSE, FALSE, NULL);
00704 
00705   // Initialize the Windows Sockets library
00706 
00707   WORD wVersionRequested;
00708   WSADATA wsaData;
00709   int err;
00710   wVersionRequested = MAKEWORD( 2, 0 );
00711   err = WSAStartup( wVersionRequested, &wsaData );
00712   if ( err != 0 ) {
00713     logger(FATAL, "Could not initialize the Windows Sockets library (WS2_32.dll)");
00714   }
00715 
00716 #endif
00717 
00718   this -> haveBkThread = false;
00719   this -> bkThreadStarted = false;
00720   this -> stopBkThread = false;
00721 
00722   this -> recheckChanged = false;
00723   this -> jobMonChanged = false;
00724   this -> sysMonChanged = false;
00725 
00726   this -> recheckInterval = RECHECK_INTERVAL;
00727   this -> crtRecheckInterval = RECHECK_INTERVAL;
00728   this -> jobMonitorInterval = JOB_MONITOR_INTERVAL;
00729   this -> sysMonitorInterval = SYS_MONITOR_INTERVAL;
00730 
00731   this -> nSysMonitorParams = initSysParams(this -> sysMonitorParams);
00732 
00733   this -> nGenMonitorParams = initGenParams(this -> genMonitorParams);
00734 
00735   this -> nJobMonitorParams = initJobParams(this -> jobMonitorParams);
00736 
00737   initSocketStatesMapTCP(this -> socketStatesMapTCP);
00738 
00739   this -> sysInfo_first = true;
00740   
00741   try {
00742     this -> lastSysInfoSend = ProcUtils::getBootTime();
00743   } catch (procutils_error& perr) {
00744     logger(WARNING, perr.what());
00745     logger(WARNING, "The first system monitoring values may be inaccurate");
00746     this -> lastSysInfoSend = 0;
00747   } 
00748 
00749   for (i = 0; i < nSysMonitorParams; i++)
00750     this -> lastSysVals[i] = 0;
00751 
00752   //this -> lastUsrTime = this -> lastSysTime = 0;
00753   //this -> lastNiceTime = this -> lastIdleTime = 0;
00754 
00755   for (i = 0; i < nSysMonitorParams; i++) {
00756     actSysMonitorParams[i] = 1;
00757     sysRetResults[i] = RET_SUCCESS;
00758   }
00759 
00760   for (i = 0; i < nGenMonitorParams; i++) {
00761     actGenMonitorParams[i] = 1;
00762     genRetResults[i] = RET_SUCCESS;
00763   }
00764 
00765   for (i = 0; i < nJobMonitorParams; i++) {
00766     actJobMonitorParams[i] = 1;
00767     jobRetResults[i] = RET_SUCCESS;
00768   }
00769 
00770   this -> maxMsgRate = MAX_MSG_RATE;
00771 }
00772 
00773 void ApMon::parseXApMonLine(char *line) {
00774   bool flag, found;
00775   int ind;
00776   char tmp[MAX_STRING_LEN], logmsg[200];
00777   char *param, *value;
00778 //  char sbuf[MAX_STRING_LEN];
00779 //  char *pbuf = sbuf;
00780   const char *sep = " =";
00781 
00782   strcpy(tmp, line);
00783   char *tmp2 = tmp + strlen("xApMon_");
00784 
00785   param = strtok/*_r*/(tmp2, sep);//, &pbuf);
00786   value = strtok/*_r*/(NULL, sep);//, &pbuf);
00787 
00788   /* if it is an on/off parameter, assign its value to flag */
00789   if (strcmp(value, "on") == 0)
00790     flag = true;
00791   else /* if it is not an on/off paramenter the value of flag doesn't matter */
00792     flag = false;
00793 
00794   pthread_mutex_lock(&mutexBack);
00795 
00796   found = false;
00797   if (strcmp(param, "job_monitoring") == 0) {
00798     this -> jobMonitoring = flag; found = true;
00799   }
00800   if (strcmp(param, "sys_monitoring") == 0) {
00801     this -> sysMonitoring = flag; found = true;
00802   }
00803   if (strcmp(param, "job_interval") == 0) {
00804     this -> jobMonitorInterval = atol(value); found = true;
00805   }
00806   if (strcmp(param, "sys_interval") == 0) {
00807     this -> sysMonitorInterval = atol(value); found = true;
00808   }
00809   if (strcmp(param, "general_info") == 0) {
00810     this -> genMonitoring = flag; found = true;
00811   }
00812   if (strcmp(param, "conf_recheck") == 0) {
00813     this -> confCheck = flag; found = true;
00814   }
00815   if (strcmp(param, "recheck_interval") == 0) {
00816     this -> recheckInterval = this -> crtRecheckInterval = atol(value); 
00817     found = true;
00818   }
00819   if (strcmp(param, "auto_disable") == 0) {
00820     this -> autoDisableMonitoring = flag;
00821     found = true;
00822   }
00823   if (strcmp(param, "maxMsgRate") == 0) {
00824     this -> maxMsgRate = atoi(value);
00825     found = true;
00826   }
00827 
00828   if (found) {
00829     pthread_mutex_unlock(&mutexBack);
00830     return;
00831   }
00832 
00833   if (strstr(param, "sys_") == param) {
00834     ind = getVectIndex(param + strlen("sys_"), sysMonitorParams, 
00835                        nSysMonitorParams);
00836     if (ind < 0) {
00837       pthread_mutex_unlock(&mutexBack);
00838       sprintf(logmsg, "Invalid parameter name in the configuration file: %s",
00839             param);
00840       logger(WARNING, logmsg);
00841       return;
00842     }
00843     found = true;
00844     this -> actSysMonitorParams[ind] = (int)flag;
00845   }
00846 
00847   if (strstr(param, "job_") == param) {
00848     ind = getVectIndex(param + strlen("job_"), jobMonitorParams, 
00849                        nJobMonitorParams);
00850     
00851     if (ind < 0) {
00852       pthread_mutex_unlock(&mutexBack);
00853       sprintf(logmsg, "Invalid parameter name in the configuration file: %s",
00854             param);
00855       logger(WARNING, logmsg);
00856       return;
00857     }
00858     found = true;
00859     this -> actJobMonitorParams[ind] = (int)flag;
00860   }
00861 
00862   if (!found) {
00863     ind = getVectIndex(param, genMonitorParams, 
00864                        nGenMonitorParams);
00865     if (ind < 0) {
00866       pthread_mutex_unlock(&mutexBack);
00867       sprintf(logmsg, "Invalid parameter name in the configuration file: %s",
00868               param);
00869       logger(WARNING, logmsg);
00870       return;
00871     } else {
00872       found = true;
00873       this -> actGenMonitorParams[ind] = (int)flag;
00874     }
00875   }
00876 
00877   if (!found) {
00878     sprintf(logmsg, "Invalid parameter name in the configuration file: %s",
00879             param);
00880     logger(WARNING, logmsg);
00881   }
00882   pthread_mutex_unlock(&mutexBack);
00883 }
00884   
00885 long *apmon_mon_utils::getChildren(long pid, int& nChildren) 
00886   throw(runtime_error) {
00887 #ifdef WIN32
00888         return 0;
00889 #else
00890   FILE *pf;
00891   long *pids, *ppids, *children;
00892   int nProcesses;
00893   int i, j, status;
00894   pid_t cpid;
00895   char *argv[4], msg[MAX_STRING_LEN], sval[20];
00896   bool processFound;
00897   long mypid = getpid();
00898   char children_f[50], np_f[50], cmd[200];
00899 
00900   /* generate the names of the temporary files in which we have the output
00901      of some commands */
00902   sprintf(children_f, "/tmp/apmon_children%ld", mypid);
00903   sprintf(np_f, "/tmp/apmon_np%ld", mypid);
00904 
00905   switch (cpid = fork()) {
00906   case -1:
00907     throw runtime_error("[ getChildren() ] Unable to fork()");
00908   case 0:
00909     argv[0] = (char *)"/bin/sh"; argv[1] = (char *)"-c";
00910     sprintf(cmd, "ps --no-headers -A -o ppid,pid > %s && wc -l %s > %s",
00911             children_f, children_f, np_f);
00912     argv[2] = cmd;
00913     /*
00914     argv[2] = "ps --no-headers -eo ppid,pid > /tmp/apmon_children.txt && wc -l /tmp/out_children.txt > /tmp/out_np.txt";
00915     */
00916     argv[3] = 0;
00917     execv("/bin/sh", argv);
00918     exit(RET_ERROR);
00919   default:
00920     if (waitpid(cpid, &status, 0) == -1) {
00921       sprintf(msg, "[ getChildren() ] The number of sub-processes for %ld could not be determined", pid);
00922       unlink(children_f); unlink(np_f);
00923       throw runtime_error(msg); 
00924     }
00925   }
00926 
00927   /* find the number of processes */
00928   pf = fopen(np_f, "rt");
00929   if (pf == NULL) {
00930     unlink(np_f); unlink(children_f);
00931     sprintf(msg, "[ getChildren() ] The number of sub-processes for %ld could not be determined",
00932             pid);
00933     throw runtime_error(msg);
00934   } 
00935   fscanf(pf, "%d", &nProcesses);
00936   fclose(pf);   
00937   unlink(np_f);
00938 
00939   pids = (long *)malloc(nProcesses * sizeof(long)); 
00940   ppids = (long *)malloc(nProcesses * sizeof(long)); 
00941   /* estimated maximum size for the returned vector; it will be realloc'ed */
00942   children = (long *)malloc(nProcesses * sizeof(long));
00943 
00944   pf = fopen(children_f, "rt");
00945   if (pf == NULL) {
00946     free(pids); free(ppids); free(children);
00947     unlink(children_f);
00948     sprintf(msg, "[ getChildren() ] The sub-processes for %ld could not be determined", pid);
00949     throw runtime_error(msg);
00950   } 
00951  
00952   /* scan the output of the ps command and find the children of the process,
00953    and also check if the process is still running */
00954   children[0] = pid; nChildren = 1;
00955   processFound = false;
00956   for (i = 0; i < nProcesses; i++) {
00957     fscanf(pf, "%ld %ld", &ppids[i], &pids[i]);
00958     /* look for the given process */
00959     if (pids[i] == children[0] || ppids[i] == children[0])
00960       processFound = true;
00961     if (ppids[i] == children[0]) {
00962       children[nChildren++] = pids[i];
00963     }
00964   }
00965   fclose(pf);
00966   unlink(children_f);
00967 
00968   if (processFound == false) {
00969     free(pids); free(ppids); free(children);
00970     nChildren = 0;
00971     sprintf(msg, "[ getChildren() ] The process %ld does not exist", pid);
00972     throw runtime_error(msg);
00973   } 
00974 
00975   /* find the PIDs of all the descendant processes */
00976   i = 1;
00977   while (i < nChildren) {
00978     /* find the children of the i-th child */ 
00979     for (j = 0; j < nProcesses; j++) {
00980       if (ppids[j] == children[i]) {
00981         children[nChildren++] = pids[j];
00982       }
00983     }
00984     i++;
00985   }
00986 
00987   sprintf(msg, "Sub-processes for process %ld: ", pid);
00988   for (i = 0; i < nChildren; i++) {
00989     sprintf(sval, "%ld ", children[i]);
00990     if (strlen(msg) + strlen(sval) < MAX_STRING_LEN - 1)
00991       strcat(msg, sval);
00992   }
00993   logger(DEBUG, msg);
00994 
00995   free(pids); free(ppids);
00996   children = (long *)realloc(children, (nChildren) * sizeof(long));
00997   return children;
00998 #endif
00999 }
01000 
01001 void apmon_mon_utils::readJobInfo(long pid, PsInfo& info) throw(runtime_error) {
01002 #ifndef WIN32
01003   long *children;
01004   FILE *fp;
01005   int i, nChildren, status, ch, ret, open_fd;
01006   char *cmd , *mem_cmd_s, *argv[4], *ret_s;
01007   char pid_s[10], msg[100];
01008   char cmdName[MAX_STRING_LEN1], buf[MAX_STRING_LEN1], buf2[100];
01009   char etime_s[20], cputime_s[20];
01010   double rsz, vsz;
01011   double etime, cputime;
01012   double pcpu, pmem;
01013   /* this list contains strings of the form "rsz_vsz_command" for every pid;
01014      it is used to avoid adding several times processes that have multiple 
01015      threads and appear in ps as sepparate processes, occupying exactly the 
01016      same amount of memory and having the same command name. For every line 
01017      from the output of the ps command we verify if the rsz_vsz_command 
01018      combination is already in the list.
01019   */
01020   char **mem_cmd_list;
01021   int listSize;
01022   long cpid, crt_pid;
01023   //unsigned int maxCmdLen = 5 * MAX_STRING_LEN;
01024   long mypid = getpid();
01025   char ps_f[50];
01026 
01027   /* get the list of the process' descendants */
01028   children = getChildren(pid, nChildren);
01029 
01030   /* generate a name for the temporary file which holds the output of the 
01031      ps command */
01032   sprintf(ps_f, "/tmp/apmon_ps%ld", mypid);
01033 
01034   unsigned int cmdLen = (150 + 6 * nChildren) * sizeof(char);
01035   cmd = (char *)malloc (cmdLen);
01036 
01037   /* issue the "ps" command to obtain information on all the descendants */
01038   strcpy(cmd, "ps --no-headers --pid ");
01039   for (i = 0; i < nChildren - 1; i++) {
01040     sprintf(pid_s, "%ld,", children[i]);
01041     if (strlen(cmd) + strlen(pid_s) + 1 >= cmdLen) {
01042       free(cmd);
01043       sprintf(msg, "[ readJobInfo() ] Job %ld has too many sub-processes to be monitored",
01044               pid);
01045       throw runtime_error(msg);
01046     }
01047     strcat(cmd, pid_s);
01048     //strcat(cmd, " 2>&1");
01049   }
01050 
01051   /* the last part of the command */
01052   sprintf(pid_s, "%ld", children[nChildren - 1]);
01053   sprintf(cmdName, " -o pid,etime,time,%%cpu,%%mem,rsz,vsz,comm > %s", ps_f);
01054   if (strlen(cmd) + strlen(pid_s) + strlen(cmdName) >= cmdLen) {
01055     free(cmd);
01056     sprintf(msg, "[ readJobInfo() ] Job %ld has too many sub-processes to be monitored",
01057               pid);
01058     throw runtime_error(msg);
01059   }
01060   strcat(cmd, pid_s);
01061   strcat(cmd, cmdName);
01062   //strcat(cmd, " 2>&1");
01063 
01064   switch (cpid = fork()) {
01065   case -1:
01066     free(cmd);
01067     sprintf(msg, "[ readJobInfo() ] Unable to fork(). The job information could not be determined for %ld", pid);
01068     throw runtime_error(msg);
01069   case 0:
01070     argv[0] = (char *)"/bin/sh"; argv[1] = (char *)"-c";
01071     argv[2] = cmd; argv[3] = 0;
01072     execv("/bin/sh", argv);
01073     exit(RET_ERROR);
01074   default:
01075     if (waitpid(cpid, &status, 0) == -1) {
01076       free(cmd);
01077       sprintf(msg, "[ readJobInfo() ] The job information for %ld could not be determined", pid);
01078       throw runtime_error(msg); 
01079     }
01080   }
01081 
01082   free(cmd);
01083   fp = fopen(ps_f, "rt");
01084   if (fp == NULL) {
01085     sprintf(msg, "[ readJobInfo() ] Error opening the ps output file for process %ld", pid);
01086     throw runtime_error(msg);
01087   }
01088 
01089   /* parse the output file */
01090   info.etime = info.cputime = 0;
01091   info.pcpu = info.pmem = 0;
01092   info.rsz = info.vsz = 0;
01093   info.open_fd = 0;
01094   mem_cmd_list = (char **)malloc(nChildren * sizeof(char *));
01095   listSize = 0;
01096   cmdName[0] = 0;
01097   while (1) {
01098     ret_s = fgets(buf, MAX_STRING_LEN, fp);
01099     if (ret_s == NULL) 
01100       break;
01101     buf[MAX_STRING_LEN - 1] = 0;
01102 
01103     /* if the line was too long and fgets hasn't read it entirely, */
01104     /* keep only the first 512 chars from the line */
01105     ch = fgetc(fp); // see if we are at the end of the file
01106     ungetc(ch, fp);
01107     if (buf[strlen(buf) - 1] != 10 && ch != EOF) { 
01108       while (1) {
01109         char *sret = fgets(buf2, MAX_STRING_LEN, fp);
01110         if (sret == NULL || buf[strlen(buf) - 1] == 10)
01111           break;
01112       }
01113     }
01114 
01115     ret = sscanf(buf, "%ld %s %s %lf %lf %lf %lf %s", &crt_pid, etime_s, 
01116                  cputime_s, &pcpu, &pmem, &rsz, &vsz, cmdName);
01117     if (ret != 8) {
01118       fclose(fp);
01119       unlink(ps_f);
01120       free(children);
01121       for (i = 0; i < listSize; i++) {
01122         free(mem_cmd_list[i]);
01123       }
01124       free(mem_cmd_list);
01125       throw runtime_error("[ readJobInfo() ] Error parsing the output of the ps command");
01126     }
01127 
01128     /* etime is the maximum of the elapsed times for the subprocesses */
01129     etime = parsePSTime(etime_s);
01130     info.etime = (info.etime > etime) ? info.etime : etime;
01131 
01132     /* cputime is the sum of the cpu times for the subprocesses */
01133     cputime = parsePSTime(cputime_s);
01134     info.cputime += cputime;
01135     info.pcpu += pcpu;
01136 
01137     /* get the number of opened file descriptors */
01138     try {
01139       open_fd = ProcUtils::countOpenFiles(crt_pid);
01140     } catch (procutils_error& err) {
01141       logger(WARNING, err.what());
01142       /* don't throw an exception if we couldn't read the number of files */
01143       open_fd = PROCUTILS_ERROR;
01144     }
01145 
01146     /* see if this is a process or just a thread */
01147     mem_cmd_s = (char *)malloc(MAX_STRING_LEN * sizeof(char));
01148     sprintf(mem_cmd_s, "%f_%f_%s", rsz, vsz, cmdName);
01149     //printf("### mem_cmd_s: %s\n", mem_cmd_s);
01150     if (getVectIndex(mem_cmd_s, mem_cmd_list, listSize) == -1) {
01151       /* aonther pid with the same command name, rsz and vsz was not found,
01152          so this is a new process and we can add the amount of memory used by 
01153          it */
01154       info.pmem += pmem;
01155       info.vsz += vsz; info.rsz += rsz;
01156 
01157       if (info.open_fd >= 0) // if no error occured so far
01158         info.open_fd += open_fd;
01159       /* add an entry in the list so that next time we see another thread of
01160          this process we don't add the amount of  memory again */
01161       mem_cmd_list[listSize++] = mem_cmd_s;     
01162     } else {
01163       free(mem_cmd_s);
01164     }
01165 
01166     /* if we monitor the current process, we have two extra opened files
01167        that we shouldn't take into account (the output file for ps and
01168        /proc/<pid>/fd/)
01169     */
01170     if (crt_pid == getpid())
01171       info.open_fd -= 2;
01172   } 
01173 
01174   fclose(fp);
01175   unlink(ps_f);
01176   free(children);
01177   for (i = 0; i < listSize; i++) {
01178     free(mem_cmd_list[i]);
01179   }
01180   free(mem_cmd_list);
01181 #endif
01182 }
01183 
01184 long apmon_mon_utils::parsePSTime(char *s) {
01185   long days, hours, mins, secs;
01186 
01187   if (strchr(s, '-') != NULL) {
01188     sscanf(s, "%ld-%ld:%ld:%ld", &days, &hours, &mins, &secs);
01189     return 24 * 3600 * days + 3600 * hours + 60 * mins + secs;
01190   } else {
01191     if (strchr(s, ':') != NULL && strchr(s, ':') !=  strrchr(s, ':')) {
01192        sscanf(s, "%ld:%ld:%ld", &hours, &mins, &secs);
01193        return 3600 * hours + 60 * mins + secs;
01194     } else {
01195       if (strchr(s, ':') != NULL) {
01196         sscanf(s, "%ld:%ld", &mins, &secs);
01197         return 60 * mins + secs;
01198       } else {
01199         return RET_ERROR;
01200       }
01201     }
01202   }
01203 }
01204 
01205 void apmon_mon_utils::readJobDiskUsage(MonitoredJob job, 
01206                                 JobDirInfo& info) throw(runtime_error) {
01207 #ifndef WIN32
01208   int status;
01209   pid_t cpid;
01210   char *cmd, s_tmp[20], *argv[4], msg[100];
01211   FILE *fp;
01212   long mypid = getpid();
01213   char du_f[50], df_f[50]; 
01214 
01215   /* generate names for the temporary files which will hold the output of the
01216      du and df commands */
01217   sprintf(du_f, "/tmp/apmon_du%ld", mypid);
01218   sprintf(df_f, "/tmp/apmon_df%ld", mypid);
01219   
01220   if (strlen(job.workdir) == 0) {
01221     sprintf(msg, "[ readJobDiskUsage() ] The working directory for the job %ld was not specified, not monitoring disk usage", job.pid);
01222     throw runtime_error(msg);
01223   }
01224   
01225   cmd = (char *)malloc((300 + 2 * strlen(job.workdir)) * sizeof(char));
01226   strcpy(cmd, "PRT=`du -Lsk ");
01227   strcat(cmd, job.workdir);
01228   //strcat(cmd, " | tail -1 | cut -f 1 > ");
01229   strcat(cmd, " ` ; if [[ $? -eq 0 ]] ; then OUT=`echo $PRT | cut -f 1` ; echo $OUT ; exit 0 ; else exit -1 ; fi > "); 
01230   strcat(cmd, du_f);
01231 
01232 
01233   switch (cpid = fork()) {
01234   case -1:
01235     sprintf(msg, "[ readJobDiskUsage() ] Unable to fork(). The disk usage information could not be determined for %ld", job.pid);
01236     throw runtime_error(msg);
01237   case 0:
01238     argv[0] = (char *)"/bin/sh"; argv[1] = (char *)"-c";
01239     argv[2] = cmd; argv[3] = 0;
01240     execv("/bin/sh", argv);
01241     exit(RET_ERROR);
01242   default:
01243     if (waitpid(cpid, &status, 0) == -1) {
01244       free(cmd);
01245       sprintf(msg, "[ readJobDiskUsage() ] The disk usage (du) information for %ld could not be determined", job.pid);
01246       unlink(du_f); unlink(df_f);
01247       throw runtime_error(msg); 
01248     }
01249   }
01250 
01251   strcpy(cmd, "PRT=`df -m ");
01252   strcat(cmd, job.workdir);
01253   //strcat(cmd, " | tail -1 > ");
01254   strcat(cmd, " `; if [[ $? -eq 0 ]] ; then OUT=`echo $PRT | cut -d ' ' -f 8-` ; echo $OUT ; exit 0 ; else exit -1 ; fi > ");
01255 
01256   strcat(cmd, df_f);
01257   //printf("### cmd: %s\n", cmd);
01258 
01259   switch (cpid = fork()) {
01260   case -1:
01261     sprintf(msg, "[ readJobDiskUsage() ] Unable to fork(). The disk usage information could not be determined for %ld", job.pid);
01262     throw runtime_error(msg);
01263   case 0:
01264     argv[0] = (char *)"/bin/sh"; argv[1] = (char *)"-c";
01265     argv[2] = cmd; argv[3] = 0;
01266     execv("/bin/sh", argv);
01267     exit(RET_ERROR);
01268   default:
01269     if (waitpid(cpid, &status, 0) == -1) {
01270       free(cmd);
01271       sprintf(msg, "[ readJobDiskUsage() ] The disk usage (df) information for %ld could not be determined", job.pid);
01272       unlink(du_f); unlink(df_f);
01273       throw runtime_error(msg); 
01274     }
01275   }
01276 
01277   free(cmd);
01278   fp = fopen(du_f, "rt");
01279   if (fp == NULL) {
01280     sprintf(msg, "[ readJobDiskUsage() ] Error opening du output file for process %ld", job.pid);
01281     throw runtime_error(msg);
01282   }
01283 
01284   fscanf(fp, "%lf", &(info.workdir_size));
01285   /* keep the directory size in MB */
01286   info.workdir_size /= 1024.0;
01287   fclose(fp);
01288   unlink(du_f);
01289  
01290   fp = fopen(df_f, "rt");
01291   if (fp == NULL) {
01292     sprintf(msg, "[ readJobDiskUsage() ] Error opening df output file for process %ld", job.pid);
01293     throw runtime_error(msg);
01294   }
01295   fscanf(fp, "%s %lf %lf %lf %lf", s_tmp, &(info.disk_total), 
01296          &(info.disk_used), &(info.disk_free), &(info.disk_usage));
01297   fclose(fp);
01298   unlink(df_f);
01299 #endif
01300 }

Generated at Mon Nov 24 14:38:47 2008 for Gaudi Framework, version v20r3 by Doxygen version 1.5.6 written by Dimitri van Heesch, © 1997-2004