00001
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022
00023
00024
00025
00026
00027
00028
00029
00030
00031
00032
00033
00034
00035
00036
00037
00038
00039 #include "ApMon.h"
00040 #include "monitor_utils.h"
00041 #include "proc_utils.h"
00042 #include "utils.h"
00043 #include "mon_constants.h"
00044
00045 using namespace apmon_utils;
00046 using namespace apmon_mon_utils;
00047
00048 void ApMon::sendJobInfo() {
00049 #ifndef WIN32
00050 int i;
00051 long crtTime;
00052
00053
00054
00055 pthread_mutex_lock(&mutexBack);
00056
00057 if (nMonJobs == 0) {
00058 logger(WARNING, "There are no jobs to be monitored, not sending job monitoring information.");
00059 pthread_mutex_unlock(&mutexBack);
00060 return;
00061 }
00062
00063 crtTime = time(NULL);
00064 logger(INFO, "Sending job monitoring information...");
00065 lastJobInfoSend = (time_t)crtTime;
00066
00067
00068 for (i = 0; i < nMonJobs; i++)
00069 sendOneJobInfo(monJobs[i]);
00070
00071 pthread_mutex_unlock(&mutexBack);
00072 #endif
00073 }
00074
00075 void ApMon::updateJobInfo(MonitoredJob job) {
00076 bool needJobInfo, needDiskInfo;
00077 bool jobExists = true;
00078 char err_msg[200];
00079
00080 PsInfo jobInfo;
00081 JobDirInfo dirInfo;
00082
00083
00084 needJobInfo = actJobMonitorParams[JOB_RUN_TIME]
00085 || actJobMonitorParams[JOB_CPU_TIME]
00086 || actJobMonitorParams[JOB_CPU_USAGE]
00087 || actJobMonitorParams[JOB_MEM_USAGE]
00088 || actJobMonitorParams[JOB_VIRTUALMEM]
00089 || actJobMonitorParams[JOB_RSS]
00090 || actJobMonitorParams[JOB_OPEN_FILES];
00091 if (needJobInfo) {
00092 try {
00093 readJobInfo(job.pid, jobInfo);
00094 currentJobVals[JOB_RUN_TIME] = jobInfo.etime;
00095 currentJobVals[JOB_CPU_TIME] = jobInfo.cputime;
00096 currentJobVals[JOB_CPU_USAGE] = jobInfo.pcpu;
00097 currentJobVals[JOB_MEM_USAGE] = jobInfo.pmem;
00098 currentJobVals[JOB_VIRTUALMEM] = jobInfo.vsz;
00099 currentJobVals[JOB_RSS] = jobInfo.rsz;
00100
00101 if (jobInfo.open_fd < 0)
00102 jobRetResults[JOB_OPEN_FILES] = RET_ERROR;
00103 currentJobVals[JOB_OPEN_FILES] = jobInfo.open_fd;
00104
00105 } catch (runtime_error &err) {
00106 logger(WARNING, err.what());
00107 jobRetResults[JOB_RUN_TIME] = jobRetResults[JOB_CPU_TIME] =
00108 jobRetResults[JOB_CPU_USAGE] = jobRetResults[JOB_MEM_USAGE] =
00109 jobRetResults[JOB_VIRTUALMEM] = jobRetResults[JOB_RSS] =
00110 jobRetResults[JOB_OPEN_FILES] = RET_ERROR;
00111 strcpy(err_msg, err.what());
00112 if (strstr(err_msg, "does not exist") != NULL)
00113 jobExists = false;
00114 }
00115 }
00116
00117
00118 if (!jobExists) {
00119 try {
00120 removeJobToMonitor(job.pid);
00121 } catch (runtime_error &err) {
00122 logger(WARNING, err.what());
00123 }
00124 return;
00125 }
00126
00127
00128 needDiskInfo = actJobMonitorParams[JOB_DISK_TOTAL]
00129 || actJobMonitorParams[JOB_DISK_USED]
00130 || actJobMonitorParams[JOB_DISK_FREE]
00131 || actJobMonitorParams[JOB_DISK_USAGE]
00132 || actJobMonitorParams[JOB_WORKDIR_SIZE];
00133 if (needDiskInfo) {
00134 try {
00135 readJobDiskUsage(job, dirInfo);
00136 currentJobVals[JOB_WORKDIR_SIZE] = dirInfo.workdir_size;
00137 currentJobVals[JOB_DISK_TOTAL] = dirInfo.disk_total;
00138 currentJobVals[JOB_DISK_USED] = dirInfo.disk_used;
00139 currentJobVals[JOB_DISK_USAGE] = dirInfo.disk_usage;
00140 currentJobVals[JOB_DISK_FREE] = dirInfo.disk_free;
00141 } catch (runtime_error& err) {
00142 logger(WARNING, err.what());
00143 jobRetResults[JOB_WORKDIR_SIZE] = jobRetResults[JOB_DISK_TOTAL]
00144 = jobRetResults[JOB_DISK_USED]
00145 = jobRetResults[JOB_DISK_USAGE]
00146 = jobRetResults[JOB_DISK_FREE]
00147 = RET_ERROR;
00148 }
00149 }
00150 }
00151
00152 void ApMon::sendOneJobInfo(MonitoredJob job) {
00153 int i;
00154 int nParams = 0;
00155
00156 char **paramNames, **paramValues;
00157 int *valueTypes;
00158
00159 valueTypes = (int *)malloc(nJobMonitorParams * sizeof(int));
00160 paramNames = (char **)malloc(nJobMonitorParams * sizeof(char *));
00161 paramValues = (char **)malloc(nJobMonitorParams * sizeof(char *));
00162
00163 for (i = 0; i < nJobMonitorParams; i++) {
00164 jobRetResults[i] = RET_SUCCESS;
00165 currentJobVals[i] = 0;
00166 }
00167
00168 updateJobInfo(job);
00169
00170 for (i = 0; i < nJobMonitorParams; i++) {
00171 if (actJobMonitorParams[i] && jobRetResults[i] != RET_ERROR) {
00172
00173 paramNames[nParams] = jobMonitorParams[i];
00174 paramValues[nParams] = (char *)¤tJobVals[i];
00175 valueTypes[nParams] = XDR_REAL64;
00176 nParams++;
00177 }
00178
00179
00180
00181
00182
00183
00184
00185 }
00186
00187 if (nParams == 0) {
00188 free(paramNames); free(valueTypes);
00189 free(paramValues);
00190 return;
00191 }
00192
00193 try {
00194 if (nParams > 0)
00195 sendParameters(job.clusterName, job.nodeName, nParams,
00196 paramNames, valueTypes, paramValues);
00197 } catch (runtime_error& err) {
00198 logger(WARNING, err.what());
00199 }
00200
00201 free(paramNames);
00202 free(valueTypes);
00203 free(paramValues);
00204 }
00205
00206
00207 void ApMon::updateSysInfo() {
00208 int needCPUInfo, needSwapPagesInfo, needLoadInfo, needMemInfo,
00209 needNetInfo, needUptime, needProcessesInfo, needNetstatInfo;
00210
00211
00212 needCPUInfo = actSysMonitorParams[SYS_CPU_USAGE]
00213 || actSysMonitorParams[SYS_CPU_USR]
00214 || actSysMonitorParams[SYS_CPU_SYS]
00215 || actSysMonitorParams[SYS_CPU_NICE]
00216 || actSysMonitorParams[SYS_CPU_IDLE];
00217 if (needCPUInfo) {
00218 try {
00219 ProcUtils::getCPUUsage(*this, currentSysVals[SYS_CPU_USAGE],
00220 currentSysVals[SYS_CPU_USR],
00221 currentSysVals[SYS_CPU_SYS],
00222 currentSysVals[SYS_CPU_NICE],
00223 currentSysVals[SYS_CPU_IDLE], numCPUs);
00224 } catch (procutils_error &perr) {
00225
00226 logger(WARNING, perr.what());
00227 sysRetResults[SYS_CPU_USAGE] = sysRetResults[SYS_CPU_SYS] =
00228 sysRetResults[SYS_CPU_USR] = sysRetResults[SYS_CPU_NICE] =
00229 sysRetResults[SYS_CPU_IDLE] = sysRetResults[SYS_CPU_USAGE] = PROCUTILS_ERROR;
00230 } catch (runtime_error &err) {
00231
00232 logger(WARNING, err.what());
00233 sysRetResults[SYS_CPU_USAGE] = sysRetResults[SYS_CPU_SYS]
00234 = sysRetResults[SYS_CPU_USR]
00235 = sysRetResults[SYS_CPU_NICE]
00236 = sysRetResults[SYS_CPU_IDLE]
00237 = sysRetResults[SYS_CPU_USAGE]
00238 = RET_ERROR;
00239 }
00240 }
00241
00242 needSwapPagesInfo = actSysMonitorParams[SYS_PAGES_IN]
00243 || actSysMonitorParams[SYS_PAGES_OUT]
00244 || actSysMonitorParams[SYS_SWAP_IN]
00245 || actSysMonitorParams[SYS_SWAP_OUT];
00246
00247 if (needSwapPagesInfo) {
00248 try {
00249 ProcUtils::getSwapPages(*this, currentSysVals[SYS_PAGES_IN],
00250 currentSysVals[SYS_PAGES_OUT],
00251 currentSysVals[SYS_SWAP_IN],
00252 currentSysVals[SYS_SWAP_OUT]);
00253 } catch (procutils_error &perr) {
00254
00255 logger(WARNING, perr.what());
00256 sysRetResults[SYS_PAGES_IN] = sysRetResults[SYS_PAGES_OUT] =
00257 sysRetResults[SYS_SWAP_OUT] = sysRetResults[SYS_SWAP_IN] = PROCUTILS_ERROR;
00258 } catch (runtime_error &err) {
00259
00260 logger(WARNING, err.what());
00261 sysRetResults[SYS_PAGES_IN] = sysRetResults[SYS_PAGES_OUT]
00262 = sysRetResults[SYS_SWAP_IN]
00263 = sysRetResults[SYS_SWAP_OUT]
00264 = RET_ERROR;
00265 }
00266 }
00267
00268 needLoadInfo = actSysMonitorParams[SYS_LOAD1]
00269 || actSysMonitorParams[SYS_LOAD5]
00270 || actSysMonitorParams[SYS_LOAD15];
00271
00272 if (needLoadInfo) {
00273 double dummyVal;
00274 try {
00275
00276
00277 ProcUtils::getLoad(currentSysVals[SYS_LOAD1], currentSysVals[SYS_LOAD5],
00278 currentSysVals[SYS_LOAD15],dummyVal);
00279 } catch (procutils_error& perr) {
00280
00281 logger(WARNING, perr.what());
00282 sysRetResults[SYS_LOAD1] = sysRetResults[SYS_LOAD5]
00283 = sysRetResults[SYS_LOAD15]
00284 = PROCUTILS_ERROR;
00285 }
00286 }
00287
00288
00289 needProcessesInfo = actSysMonitorParams[SYS_PROCESSES];
00290 if (needProcessesInfo) {
00291 try {
00292 ProcUtils::getProcesses(currentSysVals[SYS_PROCESSES],
00293 currentProcessStates);
00294 } catch (runtime_error& err) {
00295 logger(WARNING, err.what());
00296 sysRetResults[SYS_PROCESSES] = RET_ERROR;
00297 }
00298 }
00299
00300
00301 needMemInfo = actSysMonitorParams[SYS_MEM_USED]
00302 || actSysMonitorParams[SYS_MEM_FREE]
00303 || actSysMonitorParams[SYS_SWAP_USED]
00304 || actSysMonitorParams[SYS_SWAP_FREE]
00305 || actSysMonitorParams[SYS_MEM_USAGE]
00306 || actSysMonitorParams[SYS_SWAP_USAGE];
00307
00308 if (needMemInfo) {
00309 try {
00310 ProcUtils::getMemUsed(currentSysVals[SYS_MEM_USED],
00311 currentSysVals[SYS_MEM_FREE],
00312 currentSysVals[SYS_SWAP_USED],
00313 currentSysVals[SYS_SWAP_FREE]);
00314 currentSysVals[SYS_MEM_USAGE] = 100 * currentSysVals[SYS_MEM_USED] /
00315 (currentSysVals[SYS_MEM_USED] + currentSysVals[SYS_MEM_FREE]);
00316 currentSysVals[SYS_SWAP_USAGE] = 100 * currentSysVals[SYS_SWAP_USED] /
00317 (currentSysVals[SYS_SWAP_USED] + currentSysVals[SYS_SWAP_FREE]);
00318 } catch (procutils_error &perr) {
00319 logger(WARNING, perr.what());
00320 sysRetResults[SYS_MEM_USED] = sysRetResults[SYS_MEM_FREE] =
00321 sysRetResults[SYS_SWAP_USED] = sysRetResults[SYS_SWAP_FREE] =
00322 sysRetResults[SYS_MEM_USAGE] = sysRetResults[SYS_SWAP_USAGE] =
00323 PROCUTILS_ERROR;
00324 }
00325 }
00326
00327
00328
00329 needNetInfo = actSysMonitorParams[SYS_NET_IN] ||
00330 actSysMonitorParams[SYS_NET_OUT] || actSysMonitorParams[SYS_NET_ERRS];
00331 if (needNetInfo && this -> nInterfaces > 0) {
00332 try {
00333 ProcUtils::getNetInfo(*this, ¤tNetIn, ¤tNetOut,
00334 ¤tNetErrs);
00335 } catch (procutils_error &perr) {
00336 logger(WARNING, perr.what());
00337 sysRetResults[SYS_NET_IN] = sysRetResults[SYS_NET_OUT] =
00338 sysRetResults[SYS_NET_ERRS] = PROCUTILS_ERROR;
00339 } catch (runtime_error &err) {
00340 logger(WARNING, err.what());
00341 sysRetResults[SYS_NET_IN] = sysRetResults[SYS_NET_OUT] =
00342 sysRetResults[SYS_NET_ERRS] = RET_ERROR;
00343 }
00344 }
00345
00346 needNetstatInfo = actSysMonitorParams[SYS_NET_SOCKETS] ||
00347 actSysMonitorParams[SYS_NET_TCP_DETAILS];
00348 if (needNetstatInfo) {
00349 try {
00350 ProcUtils::getNetstatInfo(*this, this -> currentNSockets,
00351 this -> currentSocketsTCP);
00352 } catch (runtime_error &err) {
00353 logger(WARNING, err.what());
00354 sysRetResults[SYS_NET_SOCKETS] = sysRetResults[SYS_NET_TCP_DETAILS] =
00355 RET_ERROR;
00356 }
00357 }
00358
00359 needUptime = actSysMonitorParams[SYS_UPTIME];
00360 if (needUptime) {
00361 try {
00362 currentSysVals[SYS_UPTIME] = ProcUtils::getUpTime();
00363 } catch (procutils_error &perr) {
00364 logger(WARNING, perr.what());
00365 sysRetResults[SYS_UPTIME] = PROCUTILS_ERROR;
00366 }
00367 }
00368
00369 }
00370
00371 void ApMon::sendSysInfo() {
00372 #ifndef WIN32
00373 int nParams = 0, maxNParams;
00374 int i;
00375 long crtTime;
00376
00377 int *valueTypes;
00378 char **paramNames, **paramValues;
00379
00380 crtTime = time(NULL);
00381 logger(INFO, "Sending system monitoring information...");
00382
00383
00384
00385 if (this -> sysInfo_first) {
00386 for (i = 0; i < this -> nInterfaces; i++) {
00387 this -> lastBytesSent[i] = this -> lastBytesReceived[i] = 0.0;
00388 this -> lastNetErrs[i] = 0;
00389
00390 }
00391 this -> sysInfo_first = FALSE;
00392 }
00393
00394
00395
00396
00397
00398
00399 maxNParams = nSysMonitorParams + (2 * nInterfaces - 1) + 15 + 4 +
00400 N_TCP_STATES;
00401
00402 valueTypes = (int *)malloc(maxNParams * sizeof(int));
00403 paramNames = (char **)malloc(maxNParams * sizeof(char *));
00404 paramValues = (char **)malloc(maxNParams * sizeof(char *));
00405
00406 for (i = 0; i < nSysMonitorParams; i++) {
00407 if (actSysMonitorParams[i] > 0)
00408 sysRetResults[i] = RET_SUCCESS;
00409 else
00410
00411 sysRetResults[i] = RET_ERROR;
00412 }
00413
00414 updateSysInfo();
00415
00416 for (i = 0; i < nSysMonitorParams; i++) {
00417 if (i == SYS_NET_IN || i == SYS_NET_OUT || i == SYS_NET_ERRS ||
00418 i == SYS_NET_SOCKETS || i == SYS_NET_TCP_DETAILS || i == SYS_PROCESSES)
00419 continue;
00420
00421 if (sysRetResults[i] == PROCUTILS_ERROR) {
00422
00423
00424 if (autoDisableMonitoring)
00425 actSysMonitorParams[i] = 0;
00426 } else if (sysRetResults[i] != RET_ERROR) {
00427
00428 paramNames[nParams] = strdup(sysMonitorParams[i]);
00429 paramValues[nParams] = (char *)¤tSysVals[i];
00430 valueTypes[nParams] = XDR_REAL64;
00431 nParams++;
00432 }
00433 }
00434
00435 if (actSysMonitorParams[SYS_NET_IN] == 1) {
00436 if (sysRetResults[SYS_NET_IN] == PROCUTILS_ERROR) {
00437 if (autoDisableMonitoring)
00438 actSysMonitorParams[SYS_NET_IN] = 0;
00439 } else if (sysRetResults[SYS_NET_IN] != RET_ERROR) {
00440 for (i = 0; i < nInterfaces; i++) {
00441 paramNames[nParams] = (char *)malloc(20 * sizeof(char));
00442 strcpy(paramNames[nParams], interfaceNames[i]);
00443 strcat(paramNames[nParams], "_in");
00444 paramValues[nParams] = (char *)¤tNetIn[i];
00445 valueTypes[nParams] = XDR_REAL64;
00446 nParams++;
00447 }
00448 }
00449 }
00450
00451 if (actSysMonitorParams[SYS_NET_OUT] == 1) {
00452 if (sysRetResults[SYS_NET_IN] == PROCUTILS_ERROR) {
00453 if (autoDisableMonitoring)
00454 actSysMonitorParams[SYS_NET_OUT] = 0;
00455 } else if (sysRetResults[SYS_NET_OUT] != RET_ERROR) {
00456 for (i = 0; i < nInterfaces; i++) {
00457 paramNames[nParams] = (char *)malloc(20 * sizeof(char));
00458 strcpy(paramNames[nParams], interfaceNames[i]);
00459 strcat(paramNames[nParams], "_out");
00460 paramValues[nParams] = (char *)¤tNetOut[i];
00461 valueTypes[nParams] = XDR_REAL64;
00462 nParams++;
00463 }
00464 }
00465 }
00466
00467 if (actSysMonitorParams[SYS_NET_ERRS] == 1) {
00468 if (sysRetResults[SYS_NET_ERRS] == PROCUTILS_ERROR) {
00469 if (autoDisableMonitoring)
00470 actSysMonitorParams[SYS_NET_ERRS] = 0;
00471 } else if (sysRetResults[SYS_NET_ERRS] != RET_ERROR) {
00472 for (i = 0; i < nInterfaces; i++) {
00473 paramNames[nParams] = (char *)malloc(20 * sizeof(char));
00474 strcpy(paramNames[nParams], interfaceNames[i]);
00475 strcat(paramNames[nParams], "_errs");
00476 paramValues[nParams] = (char *)¤tNetErrs[i];
00477 valueTypes[nParams] = XDR_REAL64;
00478 nParams++;
00479 }
00480 }
00481 }
00482
00483
00484 if (actSysMonitorParams[SYS_PROCESSES] == 1) {
00485 if (sysRetResults[SYS_PROCESSES] != RET_ERROR) {
00486 char act_states[] = {'D', 'R', 'S', 'T', 'Z'};
00487 for (i = 0; i < 5; i++) {
00488 paramNames[nParams] = (char *)malloc(20 * sizeof(char));
00489 sprintf(paramNames[nParams], "processes_%c", act_states[i]);
00490 paramValues[nParams] = (char *)¤tProcessStates[act_states[i] - 65];
00491 valueTypes[nParams] = XDR_REAL64;
00492 nParams++;
00493 }
00494 }
00495 }
00496
00497 if (actSysMonitorParams[SYS_NET_SOCKETS] == 1) {
00498 if (sysRetResults[SYS_NET_SOCKETS] != RET_ERROR) {
00499 const char *socket_types[] = {"tcp", "udp", "icm", "unix"};
00500 for (i = 0; i < 4; i++) {
00501 paramNames[nParams] = (char *)malloc(30 * sizeof(char));
00502 sprintf(paramNames[nParams], "sockets_%s", socket_types[i]);
00503 paramValues[nParams] = (char *)¤tNSockets[i];
00504 valueTypes[nParams] = XDR_REAL64;
00505 nParams++;
00506 }
00507 }
00508 }
00509
00510 if (actSysMonitorParams[SYS_NET_TCP_DETAILS] == 1) {
00511 if (sysRetResults[SYS_NET_TCP_DETAILS] != RET_ERROR) {
00512 for (i = 0; i < N_TCP_STATES; i++) {
00513 paramNames[nParams] = (char *)malloc(30 * sizeof(char));
00514 sprintf(paramNames[nParams], "sockets_tcp_%s", socketStatesMapTCP[i]);
00515 paramValues[nParams] = (char *)¤tSocketsTCP[i];
00516 valueTypes[nParams] = XDR_REAL64;
00517 nParams++;
00518 }
00519 }
00520 }
00521
00522 try {
00523 if (nParams > 0)
00524 sendParameters(sysMonCluster, sysMonNode, nParams,
00525 paramNames, valueTypes, paramValues);
00526 } catch (runtime_error& err) {
00527 logger(WARNING, err.what());
00528 }
00529
00530 this -> lastSysInfoSend = crtTime;
00531
00532 if (sysRetResults[SYS_NET_IN] == RET_SUCCESS) {
00533 free(currentNetIn);
00534 free(currentNetOut);
00535 free(currentNetErrs);
00536 }
00537
00538 for (i = 0; i < nParams; i++)
00539 free(paramNames[i]);
00540 free(paramNames);
00541 free(valueTypes);
00542 free(paramValues);
00543 #endif
00544 }
00545
00546 void ApMon::updateGeneralInfo() {
00547
00548 strcpy(cpuVendor, ""); strcpy(cpuFamily, "");
00549 strcpy(cpuModel, ""); strcpy(cpuModelName, "");
00550
00551 if (actGenMonitorParams[GEN_CPU_MHZ] == 1 ||
00552 actGenMonitorParams[GEN_BOGOMIPS] == 1 ||
00553 actGenMonitorParams[GEN_CPU_VENDOR_ID] == 1 ||
00554 actGenMonitorParams[GEN_CPU_FAMILY] == 1 ||
00555 actGenMonitorParams[GEN_CPU_MODEL] == 1 ||
00556 actGenMonitorParams[GEN_CPU_MODEL_NAME] == 1) {
00557 try {
00558 ProcUtils::getCPUInfo(*this);
00559 } catch (procutils_error& err) {
00560 logger(WARNING, err.what());
00561 genRetResults[GEN_CPU_MHZ] = genRetResults[GEN_BOGOMIPS] = PROCUTILS_ERROR;
00562 }
00563 }
00564
00565 if (actGenMonitorParams[GEN_TOTAL_MEM] == 1 ||
00566 actGenMonitorParams[GEN_TOTAL_SWAP] == 1) {
00567 try {
00568 ProcUtils::getSysMem(currentGenVals[GEN_TOTAL_MEM],
00569 currentGenVals[GEN_TOTAL_SWAP]);
00570 } catch (procutils_error& perr) {
00571 logger(WARNING, perr.what());
00572 genRetResults[GEN_TOTAL_MEM] = genRetResults[GEN_TOTAL_SWAP] = PROCUTILS_ERROR;
00573 }
00574 }
00575
00576 if (this -> numCPUs > 0)
00577 currentGenVals[GEN_NO_CPUS] = this -> numCPUs;
00578 else
00579 genRetResults[GEN_NO_CPUS] = PROCUTILS_ERROR;
00580 }
00581
00582 void ApMon::sendGeneralInfo() {
00583 #ifndef WIN32
00584 int nParams, maxNParams, i;
00585 long crtTime;
00586 char tmp_s[50];
00587
00588 char **paramNames, **paramValues;
00589 int *valueTypes;
00590
00591 crtTime = time(NULL);
00592 logger(INFO, "Sending general monitoring information...");
00593
00594 maxNParams = nGenMonitorParams + numIPs;
00595 valueTypes = (int *)malloc(maxNParams * sizeof(int));
00596 paramNames = (char **)malloc(maxNParams * sizeof(char *));
00597 paramValues = (char **)malloc(maxNParams * sizeof(char *));
00598
00599 nParams = 0;
00600
00601 updateGeneralInfo();
00602
00603 if (actGenMonitorParams[GEN_HOSTNAME]) {
00604 paramNames[nParams] = strdup(genMonitorParams[GEN_HOSTNAME]);
00605 valueTypes[nParams] = XDR_STRING;
00606 paramValues[nParams] = myHostname;
00607 nParams++;
00608 }
00609
00610 if (actGenMonitorParams[GEN_IP]) {
00611 for (i = 0; i < this -> numIPs; i++) {
00612 strcpy(tmp_s, "ip_");
00613 strcat(tmp_s, interfaceNames[i]);
00614 paramNames[nParams] = strdup(tmp_s);
00615 valueTypes[nParams] = XDR_STRING;
00616 paramValues[nParams] = this -> allMyIPs[i];
00617 nParams++;
00618 }
00619 }
00620
00621 if (actGenMonitorParams[GEN_CPU_VENDOR_ID] && strlen(cpuVendor) != 0) {
00622 paramNames[nParams] = strdup(genMonitorParams[GEN_CPU_VENDOR_ID]);
00623 valueTypes[nParams] = XDR_STRING;
00624 paramValues[nParams] = cpuVendor;
00625 nParams++;
00626 }
00627
00628 if (actGenMonitorParams[GEN_CPU_FAMILY] && strlen(cpuFamily) != 0) {
00629 paramNames[nParams] = strdup(genMonitorParams[GEN_CPU_FAMILY]);
00630 valueTypes[nParams] = XDR_STRING;
00631 paramValues[nParams] = cpuFamily;
00632 nParams++;
00633 }
00634
00635 if (actGenMonitorParams[GEN_CPU_MODEL] && strlen(cpuModel) != 0) {
00636 paramNames[nParams] = strdup(genMonitorParams[GEN_CPU_MODEL]);
00637 valueTypes[nParams] = XDR_STRING;
00638 paramValues[nParams] = cpuModel;
00639 nParams++;
00640 }
00641
00642 if (actGenMonitorParams[GEN_CPU_MODEL_NAME] && strlen(cpuModelName) != 0) {
00643 paramNames[nParams] = strdup(genMonitorParams[GEN_CPU_MODEL_NAME]);
00644 valueTypes[nParams] = XDR_STRING;
00645 paramValues[nParams] = cpuModelName;
00646 nParams++;
00647 }
00648
00649 for (i = 0; i < nGenMonitorParams; i++) {
00650 if (actGenMonitorParams[i] != 1 || i == GEN_IP || i == GEN_HOSTNAME ||
00651 i == GEN_CPU_VENDOR_ID || i == GEN_CPU_FAMILY || i == GEN_CPU_MODEL
00652 || i == GEN_CPU_MODEL_NAME)
00653 continue;
00654
00655 if (genRetResults[i] == PROCUTILS_ERROR) {
00656
00657
00658 if (autoDisableMonitoring)
00659 actGenMonitorParams[i] = 0;
00660 } else if (genRetResults[i] != RET_ERROR) {
00661 paramNames[nParams] = strdup(genMonitorParams[i]);
00662 paramValues[nParams] = (char *)¤tGenVals[i];
00663 valueTypes[nParams] = XDR_REAL64;
00664 nParams++;
00665 }
00666 }
00667
00668 try {
00669 if (nParams > 0)
00670 sendParameters(sysMonCluster, sysMonNode, nParams,
00671 paramNames, valueTypes, paramValues);
00672 } catch (runtime_error& err) {
00673 logger(WARNING, err.what());
00674 }
00675
00676 for (i = 0; i < nParams; i++)
00677 free(paramNames[i]);
00678 free(paramNames);
00679 free(valueTypes);
00680 free(paramValues);
00681 #endif
00682 }
00683
00684 void ApMon::initMonitoring() {
00685 int i;
00686
00687 this -> autoDisableMonitoring = true;
00688 this -> sysMonitoring = false;
00689 this -> jobMonitoring = false;
00690 this -> genMonitoring = false;
00691 this -> confCheck = false;
00692
00693 #ifndef WIN32
00694 pthread_mutex_init(&this -> mutex, NULL);
00695 pthread_mutex_init(&this -> mutexBack, NULL);
00696 pthread_mutex_init(&this -> mutexCond, NULL);
00697 pthread_cond_init(&this -> confChangedCond, NULL);
00698 #else
00699 logger(INFO, "init mutexes...");
00700 this -> mutex = CreateMutex(NULL, FALSE, NULL);
00701 this -> mutexBack = CreateMutex(NULL, FALSE, NULL);
00702 this -> mutexCond = CreateMutex(NULL, FALSE, NULL);
00703 this -> confChangedCond = CreateEvent(NULL, FALSE, FALSE, NULL);
00704
00705
00706
00707 WORD wVersionRequested;
00708 WSADATA wsaData;
00709 int err;
00710 wVersionRequested = MAKEWORD( 2, 0 );
00711 err = WSAStartup( wVersionRequested, &wsaData );
00712 if ( err != 0 ) {
00713 logger(FATAL, "Could not initialize the Windows Sockets library (WS2_32.dll)");
00714 }
00715
00716 #endif
00717
00718 this -> haveBkThread = false;
00719 this -> bkThreadStarted = false;
00720 this -> stopBkThread = false;
00721
00722 this -> recheckChanged = false;
00723 this -> jobMonChanged = false;
00724 this -> sysMonChanged = false;
00725
00726 this -> recheckInterval = RECHECK_INTERVAL;
00727 this -> crtRecheckInterval = RECHECK_INTERVAL;
00728 this -> jobMonitorInterval = JOB_MONITOR_INTERVAL;
00729 this -> sysMonitorInterval = SYS_MONITOR_INTERVAL;
00730
00731 this -> nSysMonitorParams = initSysParams(this -> sysMonitorParams);
00732
00733 this -> nGenMonitorParams = initGenParams(this -> genMonitorParams);
00734
00735 this -> nJobMonitorParams = initJobParams(this -> jobMonitorParams);
00736
00737 initSocketStatesMapTCP(this -> socketStatesMapTCP);
00738
00739 this -> sysInfo_first = true;
00740
00741 try {
00742 this -> lastSysInfoSend = ProcUtils::getBootTime();
00743 } catch (procutils_error& perr) {
00744 logger(WARNING, perr.what());
00745 logger(WARNING, "The first system monitoring values may be inaccurate");
00746 this -> lastSysInfoSend = 0;
00747 }
00748
00749 for (i = 0; i < nSysMonitorParams; i++)
00750 this -> lastSysVals[i] = 0;
00751
00752
00753
00754
00755 for (i = 0; i < nSysMonitorParams; i++) {
00756 actSysMonitorParams[i] = 1;
00757 sysRetResults[i] = RET_SUCCESS;
00758 }
00759
00760 for (i = 0; i < nGenMonitorParams; i++) {
00761 actGenMonitorParams[i] = 1;
00762 genRetResults[i] = RET_SUCCESS;
00763 }
00764
00765 for (i = 0; i < nJobMonitorParams; i++) {
00766 actJobMonitorParams[i] = 1;
00767 jobRetResults[i] = RET_SUCCESS;
00768 }
00769
00770 this -> maxMsgRate = MAX_MSG_RATE;
00771 }
00772
00773 void ApMon::parseXApMonLine(char *line) {
00774 bool flag, found;
00775 int ind;
00776 char tmp[MAX_STRING_LEN], logmsg[200];
00777 char *param, *value;
00778
00779
00780 const char *sep = " =";
00781
00782 strcpy(tmp, line);
00783 char *tmp2 = tmp + strlen("xApMon_");
00784
00785 param = strtok(tmp2, sep);
00786 value = strtok(NULL, sep);
00787
00788
00789 if (strcmp(value, "on") == 0)
00790 flag = true;
00791 else
00792 flag = false;
00793
00794 pthread_mutex_lock(&mutexBack);
00795
00796 found = false;
00797 if (strcmp(param, "job_monitoring") == 0) {
00798 this -> jobMonitoring = flag; found = true;
00799 }
00800 if (strcmp(param, "sys_monitoring") == 0) {
00801 this -> sysMonitoring = flag; found = true;
00802 }
00803 if (strcmp(param, "job_interval") == 0) {
00804 this -> jobMonitorInterval = atol(value); found = true;
00805 }
00806 if (strcmp(param, "sys_interval") == 0) {
00807 this -> sysMonitorInterval = atol(value); found = true;
00808 }
00809 if (strcmp(param, "general_info") == 0) {
00810 this -> genMonitoring = flag; found = true;
00811 }
00812 if (strcmp(param, "conf_recheck") == 0) {
00813 this -> confCheck = flag; found = true;
00814 }
00815 if (strcmp(param, "recheck_interval") == 0) {
00816 this -> recheckInterval = this -> crtRecheckInterval = atol(value);
00817 found = true;
00818 }
00819 if (strcmp(param, "auto_disable") == 0) {
00820 this -> autoDisableMonitoring = flag;
00821 found = true;
00822 }
00823 if (strcmp(param, "maxMsgRate") == 0) {
00824 this -> maxMsgRate = atoi(value);
00825 found = true;
00826 }
00827
00828 if (found) {
00829 pthread_mutex_unlock(&mutexBack);
00830 return;
00831 }
00832
00833 if (strstr(param, "sys_") == param) {
00834 ind = getVectIndex(param + strlen("sys_"), sysMonitorParams,
00835 nSysMonitorParams);
00836 if (ind < 0) {
00837 pthread_mutex_unlock(&mutexBack);
00838 sprintf(logmsg, "Invalid parameter name in the configuration file: %s",
00839 param);
00840 logger(WARNING, logmsg);
00841 return;
00842 }
00843 found = true;
00844 this -> actSysMonitorParams[ind] = (int)flag;
00845 }
00846
00847 if (strstr(param, "job_") == param) {
00848 ind = getVectIndex(param + strlen("job_"), jobMonitorParams,
00849 nJobMonitorParams);
00850
00851 if (ind < 0) {
00852 pthread_mutex_unlock(&mutexBack);
00853 sprintf(logmsg, "Invalid parameter name in the configuration file: %s",
00854 param);
00855 logger(WARNING, logmsg);
00856 return;
00857 }
00858 found = true;
00859 this -> actJobMonitorParams[ind] = (int)flag;
00860 }
00861
00862 if (!found) {
00863 ind = getVectIndex(param, genMonitorParams,
00864 nGenMonitorParams);
00865 if (ind < 0) {
00866 pthread_mutex_unlock(&mutexBack);
00867 sprintf(logmsg, "Invalid parameter name in the configuration file: %s",
00868 param);
00869 logger(WARNING, logmsg);
00870 return;
00871 } else {
00872 found = true;
00873 this -> actGenMonitorParams[ind] = (int)flag;
00874 }
00875 }
00876
00877 if (!found) {
00878 sprintf(logmsg, "Invalid parameter name in the configuration file: %s",
00879 param);
00880 logger(WARNING, logmsg);
00881 }
00882 pthread_mutex_unlock(&mutexBack);
00883 }
00884
00885 long *apmon_mon_utils::getChildren(long pid, int& nChildren)
00886 throw(runtime_error) {
00887 #ifdef WIN32
00888 return 0;
00889 #else
00890 FILE *pf;
00891 long *pids, *ppids, *children;
00892 int nProcesses;
00893 int i, j, status;
00894 pid_t cpid;
00895 char *argv[4], msg[MAX_STRING_LEN], sval[20];
00896 bool processFound;
00897 long mypid = getpid();
00898 char children_f[50], np_f[50], cmd[200];
00899
00900
00901
00902 sprintf(children_f, "/tmp/apmon_children%ld", mypid);
00903 sprintf(np_f, "/tmp/apmon_np%ld", mypid);
00904
00905 switch (cpid = fork()) {
00906 case -1:
00907 throw runtime_error("[ getChildren() ] Unable to fork()");
00908 case 0:
00909 argv[0] = (char *)"/bin/sh"; argv[1] = (char *)"-c";
00910 sprintf(cmd, "ps --no-headers -A -o ppid,pid > %s && wc -l %s > %s",
00911 children_f, children_f, np_f);
00912 argv[2] = cmd;
00913
00914
00915
00916 argv[3] = 0;
00917 execv("/bin/sh", argv);
00918 exit(RET_ERROR);
00919 default:
00920 if (waitpid(cpid, &status, 0) == -1) {
00921 sprintf(msg, "[ getChildren() ] The number of sub-processes for %ld could not be determined", pid);
00922 unlink(children_f); unlink(np_f);
00923 throw runtime_error(msg);
00924 }
00925 }
00926
00927
00928 pf = fopen(np_f, "rt");
00929 if (pf == NULL) {
00930 unlink(np_f); unlink(children_f);
00931 sprintf(msg, "[ getChildren() ] The number of sub-processes for %ld could not be determined",
00932 pid);
00933 throw runtime_error(msg);
00934 }
00935 fscanf(pf, "%d", &nProcesses);
00936 fclose(pf);
00937 unlink(np_f);
00938
00939 pids = (long *)malloc(nProcesses * sizeof(long));
00940 ppids = (long *)malloc(nProcesses * sizeof(long));
00941
00942 children = (long *)malloc(nProcesses * sizeof(long));
00943
00944 pf = fopen(children_f, "rt");
00945 if (pf == NULL) {
00946 free(pids); free(ppids); free(children);
00947 unlink(children_f);
00948 sprintf(msg, "[ getChildren() ] The sub-processes for %ld could not be determined", pid);
00949 throw runtime_error(msg);
00950 }
00951
00952
00953
00954 children[0] = pid; nChildren = 1;
00955 processFound = false;
00956 for (i = 0; i < nProcesses; i++) {
00957 fscanf(pf, "%ld %ld", &ppids[i], &pids[i]);
00958
00959 if (pids[i] == children[0] || ppids[i] == children[0])
00960 processFound = true;
00961 if (ppids[i] == children[0]) {
00962 children[nChildren++] = pids[i];
00963 }
00964 }
00965 fclose(pf);
00966 unlink(children_f);
00967
00968 if (processFound == false) {
00969 free(pids); free(ppids); free(children);
00970 nChildren = 0;
00971 sprintf(msg, "[ getChildren() ] The process %ld does not exist", pid);
00972 throw runtime_error(msg);
00973 }
00974
00975
00976 i = 1;
00977 while (i < nChildren) {
00978
00979 for (j = 0; j < nProcesses; j++) {
00980 if (ppids[j] == children[i]) {
00981 children[nChildren++] = pids[j];
00982 }
00983 }
00984 i++;
00985 }
00986
00987 sprintf(msg, "Sub-processes for process %ld: ", pid);
00988 for (i = 0; i < nChildren; i++) {
00989 sprintf(sval, "%ld ", children[i]);
00990 if (strlen(msg) + strlen(sval) < MAX_STRING_LEN - 1)
00991 strcat(msg, sval);
00992 }
00993 logger(DEBUG, msg);
00994
00995 free(pids); free(ppids);
00996 children = (long *)realloc(children, (nChildren) * sizeof(long));
00997 return children;
00998 #endif
00999 }
01000
01001 void apmon_mon_utils::readJobInfo(long pid, PsInfo& info) throw(runtime_error) {
01002 #ifndef WIN32
01003 long *children;
01004 FILE *fp;
01005 int i, nChildren, status, ch, ret, open_fd;
01006 char *cmd , *mem_cmd_s, *argv[4], *ret_s;
01007 char pid_s[10], msg[100];
01008 char cmdName[MAX_STRING_LEN1], buf[MAX_STRING_LEN1], buf2[MAX_STRING_LEN1];
01009 char etime_s[20], cputime_s[20];
01010 double rsz, vsz;
01011 double etime, cputime;
01012 double pcpu, pmem;
01013
01014
01015
01016
01017
01018
01019
01020 char **mem_cmd_list;
01021 int listSize;
01022 long cpid, crt_pid;
01023
01024 long mypid = getpid();
01025 char ps_f[50];
01026
01027
01028 children = getChildren(pid, nChildren);
01029
01030
01031
01032 sprintf(ps_f, "/tmp/apmon_ps%ld", mypid);
01033
01034 unsigned int cmdLen = (150 + 6 * nChildren) * sizeof(char);
01035 cmd = (char *)malloc (cmdLen);
01036
01037
01038 strcpy(cmd, "ps --no-headers --pid ");
01039 for (i = 0; i < nChildren - 1; i++) {
01040 sprintf(pid_s, "%ld,", children[i]);
01041 if (strlen(cmd) + strlen(pid_s) + 1 >= cmdLen) {
01042 free(cmd);
01043 sprintf(msg, "[ readJobInfo() ] Job %ld has too many sub-processes to be monitored",
01044 pid);
01045 throw runtime_error(msg);
01046 }
01047 strcat(cmd, pid_s);
01048
01049 }
01050
01051
01052 sprintf(pid_s, "%ld", children[nChildren - 1]);
01053 sprintf(cmdName, " -o pid,etime,time,%%cpu,%%mem,rsz,vsz,comm > %s", ps_f);
01054 if (strlen(cmd) + strlen(pid_s) + strlen(cmdName) >= cmdLen) {
01055 free(cmd);
01056 sprintf(msg, "[ readJobInfo() ] Job %ld has too many sub-processes to be monitored",
01057 pid);
01058 throw runtime_error(msg);
01059 }
01060 strcat(cmd, pid_s);
01061 strcat(cmd, cmdName);
01062
01063
01064 switch (cpid = fork()) {
01065 case -1:
01066 free(cmd);
01067 sprintf(msg, "[ readJobInfo() ] Unable to fork(). The job information could not be determined for %ld", pid);
01068 throw runtime_error(msg);
01069 case 0:
01070 argv[0] = (char *)"/bin/sh"; argv[1] = (char *)"-c";
01071 argv[2] = cmd; argv[3] = 0;
01072 execv("/bin/sh", argv);
01073 exit(RET_ERROR);
01074 default:
01075 if (waitpid(cpid, &status, 0) == -1) {
01076 free(cmd);
01077 sprintf(msg, "[ readJobInfo() ] The job information for %ld could not be determined", pid);
01078 throw runtime_error(msg);
01079 }
01080 }
01081
01082 free(cmd);
01083 fp = fopen(ps_f, "rt");
01084 if (fp == NULL) {
01085 sprintf(msg, "[ readJobInfo() ] Error opening the ps output file for process %ld", pid);
01086 throw runtime_error(msg);
01087 }
01088
01089
01090 info.etime = info.cputime = 0;
01091 info.pcpu = info.pmem = 0;
01092 info.rsz = info.vsz = 0;
01093 info.open_fd = 0;
01094 mem_cmd_list = (char **)malloc(nChildren * sizeof(char *));
01095 listSize = 0;
01096 cmdName[0] = 0;
01097 while (1) {
01098 ret_s = fgets(buf, MAX_STRING_LEN, fp);
01099 if (ret_s == NULL)
01100 break;
01101 buf[MAX_STRING_LEN - 1] = 0;
01102
01103
01104
01105 ch = fgetc(fp);
01106 ungetc(ch, fp);
01107 if (buf[strlen(buf) - 1] != 10 && ch != EOF) {
01108 while (1) {
01109 char *sret = fgets(buf2, MAX_STRING_LEN, fp);
01110 if (sret == NULL || buf[strlen(buf) - 1] == 10)
01111 break;
01112 }
01113 }
01114
01115 ret = sscanf(buf, "%ld %s %s %lf %lf %lf %lf %s", &crt_pid, etime_s,
01116 cputime_s, &pcpu, &pmem, &rsz, &vsz, cmdName);
01117 if (ret != 8) {
01118 fclose(fp);
01119 unlink(ps_f);
01120 free(children);
01121 for (i = 0; i < listSize; i++) {
01122 free(mem_cmd_list[i]);
01123 }
01124 free(mem_cmd_list);
01125 throw runtime_error("[ readJobInfo() ] Error parsing the output of the ps command");
01126 }
01127
01128
01129 etime = parsePSTime(etime_s);
01130 info.etime = (info.etime > etime) ? info.etime : etime;
01131
01132
01133 cputime = parsePSTime(cputime_s);
01134 info.cputime += cputime;
01135 info.pcpu += pcpu;
01136
01137
01138 try {
01139 open_fd = ProcUtils::countOpenFiles(crt_pid);
01140 } catch (procutils_error& err) {
01141 logger(WARNING, err.what());
01142
01143 open_fd = PROCUTILS_ERROR;
01144 }
01145
01146
01147 mem_cmd_s = (char *)malloc(MAX_STRING_LEN * sizeof(char));
01148 sprintf(mem_cmd_s, "%f_%f_%s", rsz, vsz, cmdName);
01149
01150 if (getVectIndex(mem_cmd_s, mem_cmd_list, listSize) == -1) {
01151
01152
01153
01154 info.pmem += pmem;
01155 info.vsz += vsz; info.rsz += rsz;
01156
01157 if (info.open_fd >= 0)
01158 info.open_fd += open_fd;
01159
01160
01161 mem_cmd_list[listSize++] = mem_cmd_s;
01162 } else {
01163 free(mem_cmd_s);
01164 }
01165
01166
01167
01168
01169
01170 if (crt_pid == getpid())
01171 info.open_fd -= 2;
01172 }
01173
01174 fclose(fp);
01175 unlink(ps_f);
01176 free(children);
01177 for (i = 0; i < listSize; i++) {
01178 free(mem_cmd_list[i]);
01179 }
01180 free(mem_cmd_list);
01181 #endif
01182 }
01183
01184 long apmon_mon_utils::parsePSTime(char *s) {
01185 long days, hours, mins, secs;
01186
01187 if (strchr(s, '-') != NULL) {
01188 sscanf(s, "%ld-%ld:%ld:%ld", &days, &hours, &mins, &secs);
01189 return 24 * 3600 * days + 3600 * hours + 60 * mins + secs;
01190 } else {
01191 if (strchr(s, ':') != NULL && strchr(s, ':') != strrchr(s, ':')) {
01192 sscanf(s, "%ld:%ld:%ld", &hours, &mins, &secs);
01193 return 3600 * hours + 60 * mins + secs;
01194 } else {
01195 if (strchr(s, ':') != NULL) {
01196 sscanf(s, "%ld:%ld", &mins, &secs);
01197 return 60 * mins + secs;
01198 } else {
01199 return RET_ERROR;
01200 }
01201 }
01202 }
01203 }
01204
01205 void apmon_mon_utils::readJobDiskUsage(MonitoredJob job,
01206 JobDirInfo& info) throw(runtime_error) {
01207 #ifndef WIN32
01208 int status;
01209 pid_t cpid;
01210 char *cmd, s_tmp[20], *argv[4], msg[100];
01211 FILE *fp;
01212 long mypid = getpid();
01213 char du_f[50], df_f[50];
01214
01215
01216
01217 sprintf(du_f, "/tmp/apmon_du%ld", mypid);
01218 sprintf(df_f, "/tmp/apmon_df%ld", mypid);
01219
01220 if (strlen(job.workdir) == 0) {
01221 sprintf(msg, "[ readJobDiskUsage() ] The working directory for the job %ld was not specified, not monitoring disk usage", job.pid);
01222 throw runtime_error(msg);
01223 }
01224
01225 cmd = (char *)malloc((300 + 2 * strlen(job.workdir)) * sizeof(char));
01226 strcpy(cmd, "PRT=`du -Lsk ");
01227 strcat(cmd, job.workdir);
01228
01229 strcat(cmd, " ` ; if [[ $? -eq 0 ]] ; then OUT=`echo $PRT | cut -f 1` ; echo $OUT ; exit 0 ; else exit -1 ; fi > ");
01230 strcat(cmd, du_f);
01231
01232
01233 switch (cpid = fork()) {
01234 case -1:
01235 sprintf(msg, "[ readJobDiskUsage() ] Unable to fork(). The disk usage information could not be determined for %ld", job.pid);
01236 throw runtime_error(msg);
01237 case 0:
01238 argv[0] = (char *)"/bin/sh"; argv[1] = (char *)"-c";
01239 argv[2] = cmd; argv[3] = 0;
01240 execv("/bin/sh", argv);
01241 exit(RET_ERROR);
01242 default:
01243 if (waitpid(cpid, &status, 0) == -1) {
01244 free(cmd);
01245 sprintf(msg, "[ readJobDiskUsage() ] The disk usage (du) information for %ld could not be determined", job.pid);
01246 unlink(du_f); unlink(df_f);
01247 throw runtime_error(msg);
01248 }
01249 }
01250
01251 strcpy(cmd, "PRT=`df -m ");
01252 strcat(cmd, job.workdir);
01253
01254 strcat(cmd, " `; if [[ $? -eq 0 ]] ; then OUT=`echo $PRT | cut -d ' ' -f 8-` ; echo $OUT ; exit 0 ; else exit -1 ; fi > ");
01255
01256 strcat(cmd, df_f);
01257
01258
01259 switch (cpid = fork()) {
01260 case -1:
01261 sprintf(msg, "[ readJobDiskUsage() ] Unable to fork(). The disk usage information could not be determined for %ld", job.pid);
01262 throw runtime_error(msg);
01263 case 0:
01264 argv[0] = (char *)"/bin/sh"; argv[1] = (char *)"-c";
01265 argv[2] = cmd; argv[3] = 0;
01266 execv("/bin/sh", argv);
01267 exit(RET_ERROR);
01268 default:
01269 if (waitpid(cpid, &status, 0) == -1) {
01270 free(cmd);
01271 sprintf(msg, "[ readJobDiskUsage() ] The disk usage (df) information for %ld could not be determined", job.pid);
01272 unlink(du_f); unlink(df_f);
01273 throw runtime_error(msg);
01274 }
01275 }
01276
01277 free(cmd);
01278 fp = fopen(du_f, "rt");
01279 if (fp == NULL) {
01280 sprintf(msg, "[ readJobDiskUsage() ] Error opening du output file for process %ld", job.pid);
01281 throw runtime_error(msg);
01282 }
01283
01284 fscanf(fp, "%lf", &(info.workdir_size));
01285
01286 info.workdir_size /= 1024.0;
01287 fclose(fp);
01288 unlink(du_f);
01289
01290 fp = fopen(df_f, "rt");
01291 if (fp == NULL) {
01292 sprintf(msg, "[ readJobDiskUsage() ] Error opening df output file for process %ld", job.pid);
01293 throw runtime_error(msg);
01294 }
01295 fscanf(fp, "%s %lf %lf %lf %lf", s_tmp, &(info.disk_total),
01296 &(info.disk_used), &(info.disk_free), &(info.disk_usage));
01297 fclose(fp);
01298 unlink(df_f);
01299 #endif
01300 }