|
Gaudi Framework, version v22r2 |
| Home | Generated: Tue May 10 2011 |
Classes | |
| struct | PsInfo |
| Structure that holds information about a job, as obtained from the ps command. More... | |
| struct | JobDirInfo |
| Structure that holds information about the disk usage for a job. More... | |
Typedefs | |
| typedef struct apmon_mon_utils::PsInfo | PsInfo |
| Structure that holds information about a job, as obtained from the ps command. | |
| typedef struct apmon_mon_utils::JobDirInfo | JobDirInfo |
| Structure that holds information about the disk usage for a job. | |
Functions | |
| long * | getChildren (long pid, int &nChildren) throw (runtime_error) |
| Determines all the descendants of a given process. | |
| void | readJobInfo (long pid, PsInfo &info) throw (runtime_error) |
| Obtains monitoring information for a given job and all its sub-jobs (descendant processes) with the aid of the ps command. | |
| double | parsePSTime (char *s) |
| Function that parses a time formatted like "days-hours:min:sec" and returns the corresponding number of seconds. | |
| void | readJobDiskUsage (MonitoredJob job, JobDirInfo &info) throw (runtime_error) |
| If there is a work directory defined, then compute the used space in that directory and the free disk space on the partition to which that directory belongs. | |
| typedef struct apmon_mon_utils::JobDirInfo apmon_mon_utils::JobDirInfo |
Structure that holds information about the disk usage for a job.
| typedef struct apmon_mon_utils::PsInfo apmon_mon_utils::PsInfo |
Structure that holds information about a job, as obtained from the ps command.
| long * apmon_mon_utils::getChildren | ( | long | pid, |
| int & | nChildren | ||
| ) | throw (runtime_error) |
Determines all the descendants of a given process.
Definition at line 883 of file monitor_utils.cpp.
{
  /*
   * Builds the list of PIDs made of `pid` itself followed by all of its
   * descendants, as reported by `ps --no-headers -A -o ppid,pid`.
   *
   * pid       - the process whose descendants are wanted
   * nChildren - out: number of entries in the returned array (pid included)
   *
   * Returns a malloc'ed array whose first element is `pid`; the caller owns
   * it and must free() it. On WIN32 the function returns 0 and leaves
   * nChildren untouched.
   *
   * Throws runtime_error if fork()/waitpid() fails, if the temporary output
   * files cannot be read or parsed, if memory cannot be allocated, or if
   * `pid` does not show up in the ps listing (nChildren is set to 0 then).
   *
   * NOTE(review): the temporary files live under predictable names in /tmp
   * ("/tmp/apmon_children<pid>", "/tmp/apmon_np<pid>"); consider mkstemp().
   */
#ifdef WIN32
  return 0;
#else
  FILE *pf;
  long *pids, *ppids, *children, *shrunk;
  int nProcesses = 0;
  int nParsed = 0;
  int capacity;
  int i, j, status;
  pid_t cpid;
  char *argv[4], msg[MAX_STRING_LEN], sval[32];
  bool processFound;
  long mypid = getpid();
  char children_f[50], np_f[50], cmd[200];

  /* generate the names of the temporary files in which we have the output
     of some commands */
  snprintf(children_f, sizeof(children_f), "/tmp/apmon_children%ld", mypid);
  snprintf(np_f, sizeof(np_f), "/tmp/apmon_np%ld", mypid);

  switch (cpid = fork()) {
  case -1:
    throw runtime_error("[ getChildren() ] Unable to fork()");
  case 0:
    argv[0] = (char *)"/bin/sh"; argv[1] = (char *)"-c";
    snprintf(cmd, sizeof(cmd),
             "ps --no-headers -A -o ppid,pid > %s && wc -l %s > %s",
             children_f, children_f, np_f);
    argv[2] = cmd;
    argv[3] = 0;
    execv("/bin/sh", argv);
    /* only reached when execv() failed; _exit() avoids running the parent's
       atexit handlers and flushing its stdio buffers a second time */
    _exit(RET_ERROR);
  default:
    if (waitpid(cpid, &status, 0) == -1) {
      snprintf(msg, sizeof(msg),
               "[ getChildren() ] The number of sub-processes for %ld could not be determined", pid);
      unlink(children_f); unlink(np_f);
      throw runtime_error(msg);
    }
  }

  /* find the number of processes */
  pf = fopen(np_f, "rt");
  if (pf == NULL) {
    unlink(np_f); unlink(children_f);
    snprintf(msg, sizeof(msg),
             "[ getChildren() ] The number of sub-processes for %ld could not be determined", pid);
    throw runtime_error(msg);
  }
  const int nRead = fscanf(pf, "%d", &nProcesses);
  fclose(pf);
  unlink(np_f);
  if (nRead != 1 || nProcesses < 0) {
    /* the line count could not be parsed: do not size buffers from garbage */
    unlink(children_f);
    snprintf(msg, sizeof(msg),
             "[ getChildren() ] The number of sub-processes for %ld could not be determined", pid);
    throw runtime_error(msg);
  }

  /* one extra slot: children[0] holds pid itself; it also keeps the
     allocations non-empty when the listing has zero lines */
  capacity = nProcesses + 1;
  pids = (long *)malloc((size_t)capacity * sizeof(long));
  ppids = (long *)malloc((size_t)capacity * sizeof(long));
  /* estimated maximum size for the returned vector; it will be realloc'ed */
  children = (long *)malloc((size_t)capacity * sizeof(long));
  if (pids == NULL || ppids == NULL || children == NULL) {
    free(pids); free(ppids); free(children);
    unlink(children_f);
    snprintf(msg, sizeof(msg),
             "[ getChildren() ] The sub-processes for %ld could not be determined", pid);
    throw runtime_error(msg);
  }

  pf = fopen(children_f, "rt");
  if (pf == NULL) {
    free(pids); free(ppids); free(children);
    unlink(children_f);
    snprintf(msg, sizeof(msg),
             "[ getChildren() ] The sub-processes for %ld could not be determined", pid);
    throw runtime_error(msg);
  }

  /* scan the output of the ps command and find the children of the process,
     and also check if the process is still running */
  children[0] = pid; nChildren = 1;
  processFound = false;
  for (i = 0; i < nProcesses; i++) {
    /* stop at the first line that is not a "ppid pid" pair so that only
       initialised entries are ever looked at */
    if (fscanf(pf, "%ld %ld", &ppids[i], &pids[i]) != 2)
      break;
    nParsed++;
    /* look for the given process */
    if (pids[i] == pid || ppids[i] == pid)
      processFound = true;
    if (ppids[i] == pid && nChildren < capacity)
      children[nChildren++] = pids[i];
  }
  fclose(pf);
  unlink(children_f);

  if (processFound == false) {
    free(pids); free(ppids); free(children);
    nChildren = 0;
    snprintf(msg, sizeof(msg), "[ getChildren() ] The process %ld does not exist", pid);
    throw runtime_error(msg);
  }

  /* find the PIDs of all the descendant processes: children[] doubles as a
     work queue, every entry appended is later inspected for its own children */
  for (i = 1; i < nChildren; i++) {
    for (j = 0; j < nParsed; j++) {
      if (ppids[j] == children[i] && nChildren < capacity)
        children[nChildren++] = pids[j];
    }
  }

  snprintf(msg, sizeof(msg), "Sub-processes for process %ld: ", pid);
  for (i = 0; i < nChildren; i++) {
    snprintf(sval, sizeof(sval), "%ld ", children[i]);
    if (strlen(msg) + strlen(sval) < MAX_STRING_LEN - 1)
      strcat(msg, sval);
  }
  logger(DEBUG, msg);

  free(pids); free(ppids);
  /* shrink to the real size; if realloc() fails the original (larger) buffer
     is still valid, so hand that one back instead of leaking it */
  shrunk = (long *)realloc(children, (size_t)nChildren * sizeof(long));
  return (shrunk != NULL) ? shrunk : children;
#endif
}
| double apmon_mon_utils::parsePSTime | ( | char * | s ) |
Function that parses a time formatted like "days-hours:min:sec" and returns the corresponding number of seconds.
Definition at line 1183 of file monitor_utils.cpp.
{
  /*
   * Converts a time string as printed by ps into a number of seconds.
   * Recognised layouts: "days-hours:min:sec", "hours:min:sec" and "min:sec".
   *
   * s - NUL-terminated string to parse
   *
   * Returns the number of seconds, or RET_ERROR when s is NULL, contains
   * no ':' separator, or does not match the layout selected by its
   * separators (previously such input produced a value computed from
   * uninitialised variables).
   */
  long days = 0, hours = 0, mins = 0, secs = 0;

  if (s == NULL)
    return RET_ERROR;

  /* a '-' means the leading field is a day count */
  if (strchr(s, '-') != NULL) {
    if (sscanf(s, "%ld-%ld:%ld:%ld", &days, &hours, &mins, &secs) != 4)
      return RET_ERROR;
    return 24. * 3600 * days + 3600 * hours + 60 * mins + secs;
  }

  const char *firstColon = strchr(s, ':');
  if (firstColon == NULL)
    return RET_ERROR;

  /* two distinct ':' characters => hours:min:sec */
  if (firstColon != strrchr(s, ':')) {
    if (sscanf(s, "%ld:%ld:%ld", &hours, &mins, &secs) != 3)
      return RET_ERROR;
    return 3600. * hours + 60 * mins + secs;
  }

  /* a single ':' => min:sec */
  if (sscanf(s, "%ld:%ld", &mins, &secs) != 2)
    return RET_ERROR;
  return 60. * mins + secs;
}
| void apmon_mon_utils::readJobDiskUsage | ( | MonitoredJob | job, |
| JobDirInfo & | info | ||
| ) | throw (runtime_error) |
If there is a work directory defined, then compute the used space in that directory and the free disk space on the partition to which that directory belongs.
Sizes are given in MB.
Definition at line 1204 of file monitor_utils.cpp.
{
  /*
   * Fills `info` with the size of the job's working directory (du -Lsk,
   * converted from KB to MB) and with the total/used/free space and the
   * usage percentage of the partition holding it (df -m).
   *
   * job  - the monitored job; job.workdir and job.pid are read
   * info - out: workdir_size, disk_total, disk_used, disk_free, disk_usage
   *
   * Throws runtime_error when the working directory is empty, when
   * fork()/waitpid() fails, when memory cannot be allocated, or when the
   * output of du/df cannot be opened or parsed.
   * On WIN32 the function does nothing.
   *
   * NOTE(review): job.workdir is pasted unquoted into a shell command line;
   * if it can ever come from an untrusted source it must be quoted/escaped.
   * NOTE(review): the temporary files use predictable names in /tmp.
   */
#ifndef WIN32
  int status;
  pid_t cpid;
  /* msg must hold the longest format below (over 100 chars) plus a pid */
  char *cmd, *argv[4], msg[256];
  FILE *fp;
  long mypid = getpid();
  char du_f[50], df_f[50];

  /* generate names for the temporary files which will hold the output of the
     du and df commands */
  snprintf(du_f, sizeof(du_f), "/tmp/apmon_du%ld", mypid);
  snprintf(df_f, sizeof(df_f), "/tmp/apmon_df%ld", mypid);

  if (strlen(job.workdir) == 0) {
    snprintf(msg, sizeof(msg),
             "[ readJobDiskUsage() ] The working directory for the job %ld was not specified, not monitoring disk usage", job.pid);
    throw runtime_error(msg);
  }

  cmd = (char *)malloc((300 + 2 * strlen(job.workdir)) * sizeof(char));
  if (cmd == NULL) {
    snprintf(msg, sizeof(msg),
             "[ readJobDiskUsage() ] Unable to allocate memory. The disk usage information could not be determined for %ld", job.pid);
    throw runtime_error(msg);
  }

  /* The command is run by /bin/sh, so only POSIX constructs are used:
     `[ ... ]` instead of `[[ ... ]]` and a non-negative exit status. */
  strcpy(cmd, "PRT=`du -Lsk ");
  strcat(cmd, job.workdir);
  strcat(cmd, " ` ; if [ $? -eq 0 ] ; then OUT=`echo $PRT | cut -f 1` ; echo $OUT ; exit 0 ; else exit 1 ; fi > ");
  strcat(cmd, du_f);

  switch (cpid = fork()) {
  case -1:
    free(cmd);
    snprintf(msg, sizeof(msg),
             "[ readJobDiskUsage() ] Unable to fork(). The disk usage information could not be determined for %ld", job.pid);
    throw runtime_error(msg);
  case 0:
    argv[0] = (char *)"/bin/sh"; argv[1] = (char *)"-c";
    argv[2] = cmd; argv[3] = 0;
    execv("/bin/sh", argv);
    /* only reached when execv() failed; _exit() does not run the parent's
       atexit handlers nor flush its stdio buffers a second time */
    _exit(RET_ERROR);
  default:
    if (waitpid(cpid, &status, 0) == -1) {
      free(cmd);
      snprintf(msg, sizeof(msg),
               "[ readJobDiskUsage() ] The disk usage (du) information for %ld could not be determined", job.pid);
      unlink(du_f); unlink(df_f);
      throw runtime_error(msg);
    }
  }

  /* `echo $PRT` joins the df header and data line with single spaces; the
     header has 7 words, so fields 8- are the data line */
  strcpy(cmd, "PRT=`df -m ");
  strcat(cmd, job.workdir);
  strcat(cmd, " `; if [ $? -eq 0 ] ; then OUT=`echo $PRT | cut -d ' ' -f 8-` ; echo $OUT ; exit 0 ; else exit 1 ; fi > ");
  strcat(cmd, df_f);

  switch (cpid = fork()) {
  case -1:
    free(cmd);
    unlink(du_f);
    snprintf(msg, sizeof(msg),
             "[ readJobDiskUsage() ] Unable to fork(). The disk usage information could not be determined for %ld", job.pid);
    throw runtime_error(msg);
  case 0:
    argv[0] = (char *)"/bin/sh"; argv[1] = (char *)"-c";
    argv[2] = cmd; argv[3] = 0;
    execv("/bin/sh", argv);
    _exit(RET_ERROR);
  default:
    if (waitpid(cpid, &status, 0) == -1) {
      free(cmd);
      snprintf(msg, sizeof(msg),
               "[ readJobDiskUsage() ] The disk usage (df) information for %ld could not be determined", job.pid);
      unlink(du_f); unlink(df_f);
      throw runtime_error(msg);
    }
  }
  free(cmd);

  /* ---- du output: a single number, the directory size in KB ---- */
  fp = fopen(du_f, "rt");
  if (fp == NULL) {
    unlink(du_f); unlink(df_f);
    snprintf(msg, sizeof(msg),
             "[ readJobDiskUsage() ] Error opening du output file for process %ld", job.pid);
    throw runtime_error(msg);
  }
  const int duFields = fscanf(fp, "%lf", &(info.workdir_size));
  fclose(fp);
  unlink(du_f);
  if (duFields != 1) {
    /* the file is empty when du failed; do not report garbage */
    unlink(df_f);
    snprintf(msg, sizeof(msg),
             "[ readJobDiskUsage() ] Error parsing du output for process %ld", job.pid);
    throw runtime_error(msg);
  }
  /* keep the directory size in MB */
  info.workdir_size /= 1024.0;

  /* ---- df output: device total used free usage% ... ---- */
  fp = fopen(df_f, "rt");
  if (fp == NULL) {
    unlink(df_f);
    snprintf(msg, sizeof(msg),
             "[ readJobDiskUsage() ] Error opening df output file for process %ld", job.pid);
    throw runtime_error(msg);
  }
  /* "%*s" skips the device name without storing it, so an arbitrarily long
     name (e.g. /dev/mapper/...) cannot overflow a fixed buffer */
  const int dfFields = fscanf(fp, "%*s %lf %lf %lf %lf", &(info.disk_total),
                              &(info.disk_used), &(info.disk_free), &(info.disk_usage));
  fclose(fp);
  unlink(df_f);
  if (dfFields != 4) {
    snprintf(msg, sizeof(msg),
             "[ readJobDiskUsage() ] Error parsing df output for process %ld", job.pid);
    throw runtime_error(msg);
  }
#endif
}
| void apmon_mon_utils::readJobInfo | ( | long | pid, |
| PsInfo & | info | ||
| ) | throw (runtime_error) |
Obtains monitoring information for a given job and all its sub-jobs (descendant processes) with the aid of the ps command.
Definition at line 999 of file monitor_utils.cpp.
{
  /*
   * Collects monitoring data for `pid` and all of its descendants by running
   * "ps --no-headers --pid <list> -o pid,etime,time,%cpu,%mem,rsz,vsz,comm"
   * and aggregating the lines into `info`:
   *   etime   - maximum elapsed time over all listed processes
   *   cputime - sum of the CPU times
   *   pcpu    - sum of the %cpu values
   *   pmem, rsz, vsz, open_fd - summed once per distinct
   *             (rsz, vsz, command) combination, see mem_cmd_list below
   *
   * pid  - the job's main process
   * info - out: the aggregated values
   *
   * Throws runtime_error when the descendants cannot be determined, when
   * fork()/waitpid() fails, when memory cannot be allocated, or when the ps
   * output cannot be opened or parsed.
   * On WIN32 the function does nothing.
   */
#ifndef WIN32
  long *children;
  FILE *fp;
  int i, nChildren = 0, status, ch, ret, open_fd;
  char *cmd, *mem_cmd_s, *argv[4], *ret_s;
  /* large enough for any 64-bit long plus a separator */
  char pid_s[32], msg[256];
  /* NOTE(review): these buffers are sized with MAX_STRING_LEN1 while the
     reads below are limited by MAX_STRING_LEN; presumably
     MAX_STRING_LEN1 >= MAX_STRING_LEN - confirm against the header. */
  char cmdName[MAX_STRING_LEN1], buf[MAX_STRING_LEN1], buf2[MAX_STRING_LEN1];
  char etime_s[20], cputime_s[20];
  double rsz, vsz;
  double etime, cputime;
  double pcpu, pmem;
  /* this list contains strings of the form "rsz_vsz_command" for every pid;
     it is used to avoid adding several times processes that have multiple
     threads and appear in ps as separate processes, occupying exactly the
     same amount of memory and having the same command name. For every line
     from the output of the ps command we verify if the rsz_vsz_command
     combination is already in the list.
  */
  char **mem_cmd_list;
  int listSize;
  long cpid, crt_pid;
  long mypid = getpid();
  char ps_f[50];
  size_t lineLen;

  /* get the list of the process' descendants */
  children = getChildren(pid, nChildren);
  if (children == NULL || nChildren <= 0) {
    free(children);
    snprintf(msg, sizeof(msg),
             "[ readJobInfo() ] The sub-processes for %ld could not be determined", pid);
    throw runtime_error(msg);
  }

  /* generate a name for the temporary file which holds the output of the
     ps command */
  snprintf(ps_f, sizeof(ps_f), "/tmp/apmon_ps%ld", mypid);

  /* 22 bytes per pid cover the longest decimal long plus the comma, so the
     "too many sub-processes" checks below only fire on real overflow */
  unsigned int cmdLen = (150 + 22 * nChildren) * sizeof(char);
  cmd = (char *)malloc(cmdLen);
  if (cmd == NULL) {
    free(children);
    snprintf(msg, sizeof(msg),
             "[ readJobInfo() ] Unable to allocate memory. The job information could not be determined for %ld", pid);
    throw runtime_error(msg);
  }

  /* issue the "ps" command to obtain information on all the descendants */
  strcpy(cmd, "ps --no-headers --pid ");
  for (i = 0; i < nChildren - 1; i++) {
    snprintf(pid_s, sizeof(pid_s), "%ld,", children[i]);
    if (strlen(cmd) + strlen(pid_s) + 1 >= cmdLen) {
      free(cmd);
      free(children);
      snprintf(msg, sizeof(msg),
               "[ readJobInfo() ] Job %ld has too many sub-processes to be monitored", pid);
      throw runtime_error(msg);
    }
    strcat(cmd, pid_s);
  }

  /* the last part of the command */
  snprintf(pid_s, sizeof(pid_s), "%ld", children[nChildren - 1]);
  snprintf(cmdName, sizeof(cmdName),
           " -o pid,etime,time,%%cpu,%%mem,rsz,vsz,comm > %s", ps_f);
  if (strlen(cmd) + strlen(pid_s) + strlen(cmdName) >= cmdLen) {
    free(cmd);
    free(children);
    snprintf(msg, sizeof(msg),
             "[ readJobInfo() ] Job %ld has too many sub-processes to be monitored", pid);
    throw runtime_error(msg);
  }
  strcat(cmd, pid_s);
  strcat(cmd, cmdName);

  switch (cpid = fork()) {
  case -1:
    free(cmd);
    free(children);
    snprintf(msg, sizeof(msg),
             "[ readJobInfo() ] Unable to fork(). The job information could not be determined for %ld", pid);
    throw runtime_error(msg);
  case 0:
    argv[0] = (char *)"/bin/sh"; argv[1] = (char *)"-c";
    argv[2] = cmd; argv[3] = 0;
    execv("/bin/sh", argv);
    /* only reached when execv() failed; _exit() does not run the parent's
       atexit handlers nor flush its stdio buffers a second time */
    _exit(RET_ERROR);
  default:
    if (waitpid(cpid, &status, 0) == -1) {
      free(cmd);
      free(children);
      unlink(ps_f);
      snprintf(msg, sizeof(msg),
               "[ readJobInfo() ] The job information for %ld could not be determined", pid);
      throw runtime_error(msg);
    }
  }
  free(cmd);

  fp = fopen(ps_f, "rt");
  if (fp == NULL) {
    free(children);
    unlink(ps_f);
    snprintf(msg, sizeof(msg),
             "[ readJobInfo() ] Error opening the ps output file for process %ld", pid);
    throw runtime_error(msg);
  }

  /* parse the output file */
  info.etime = info.cputime = 0;
  info.pcpu = info.pmem = 0;
  info.rsz = info.vsz = 0;
  info.open_fd = 0;
  mem_cmd_list = (char **)malloc(nChildren * sizeof(char *));
  if (mem_cmd_list == NULL) {
    fclose(fp);
    unlink(ps_f);
    free(children);
    snprintf(msg, sizeof(msg),
             "[ readJobInfo() ] Unable to allocate memory. The job information could not be determined for %ld", pid);
    throw runtime_error(msg);
  }
  listSize = 0;
  cmdName[0] = 0;

  while (1) {
    ret_s = fgets(buf, MAX_STRING_LEN, fp);
    if (ret_s == NULL)
      break;
    buf[MAX_STRING_LEN - 1] = 0;

    /* if the line was too long and fgets hasn't read it entirely, */
    /* keep only the first part and discard the remainder of that line */
    ch = fgetc(fp); // see if we are at the end of the file
    ungetc(ch, fp);
    lineLen = strlen(buf);
    if (lineLen > 0 && buf[lineLen - 1] != '\n' && ch != EOF) {
      while (1) {
        char *sret = fgets(buf2, MAX_STRING_LEN, fp);
        if (sret == NULL)
          break;
        /* stop once the chunk just read (buf2, not buf) ends the line,
           otherwise the following lines would be swallowed as well */
        size_t chunkLen = strlen(buf2);
        if (chunkLen > 0 && buf2[chunkLen - 1] == '\n')
          break;
      }
    }

    /* field widths keep the two 20-byte time buffers from overflowing */
    ret = sscanf(buf, "%ld %19s %19s %lf %lf %lf %lf %s", &crt_pid, etime_s,
                 cputime_s, &pcpu, &pmem, &rsz, &vsz, cmdName);
    if (ret != 8) {
      fclose(fp);
      unlink(ps_f);
      free(children);
      for (i = 0; i < listSize; i++) {
        free(mem_cmd_list[i]);
      }
      free(mem_cmd_list);
      throw runtime_error("[ readJobInfo() ] Error parsing the output of the ps command");
    }

    /* etime is the maximum of the elapsed times for the subprocesses */
    etime = parsePSTime(etime_s);
    info.etime = (info.etime > etime) ? info.etime : etime;

    /* cputime is the sum of the cpu times for the subprocesses */
    cputime = parsePSTime(cputime_s);
    info.cputime += cputime;
    info.pcpu += pcpu;

    /* get the number of opened file descriptors */
    try {
      open_fd = ProcUtils::countOpenFiles(crt_pid);
    } catch (procutils_error& err) {
      logger(WARNING, err.what());
      /* don't throw an exception if we couldn't read the number of files */
      open_fd = PROCUTILS_ERROR;
    }

    /* see if this is a process or just a thread */
    mem_cmd_s = (char *)malloc(MAX_STRING_LEN * sizeof(char));
    if (mem_cmd_s == NULL) {
      fclose(fp);
      unlink(ps_f);
      free(children);
      for (i = 0; i < listSize; i++) {
        free(mem_cmd_list[i]);
      }
      free(mem_cmd_list);
      throw runtime_error("[ readJobInfo() ] Unable to allocate memory while parsing the output of the ps command");
    }
    snprintf(mem_cmd_s, MAX_STRING_LEN, "%f_%f_%s", rsz, vsz, cmdName);

    if (getVectIndex(mem_cmd_s, mem_cmd_list, listSize) == -1) {
      /* another pid with the same command name, rsz and vsz was not found,
         so this is a new process and we can add the amount of memory used by
         it */
      info.pmem += pmem;
      info.vsz += vsz; info.rsz += rsz;
      /* NOTE(review): open_fd may be PROCUTILS_ERROR here and is still
         added to the running sum; presumably the error code is negative
         and meant to poison the total - confirm. */
      if (info.open_fd >= 0) // if no error occurred so far
        info.open_fd += open_fd;
      /* add an entry in the list so that next time we see another thread of
         this process we don't add the amount of memory again; the list has
         room for nChildren entries only */
      if (listSize < nChildren)
        mem_cmd_list[listSize++] = mem_cmd_s;
      else
        free(mem_cmd_s);
    } else {
      free(mem_cmd_s);
    }

    /* if we monitor the current process, we have two extra opened files
       that we shouldn't take into account (the output file for ps and
       /proc/<pid>/fd/)
    */
    if (crt_pid == mypid)
      info.open_fd -= 2;
  }

  fclose(fp);
  unlink(ps_f);
  free(children);
  for (i = 0; i < listSize; i++) {
    free(mem_cmd_list[i]);
  }
  free(mem_cmd_list);
#endif
}