|
Gaudi Framework, version v22r0 |
| Home | Generated: 9 Feb 2011 |
Classes | |
| struct | PsInfo |
| Structure that holds information about a job, as obtained from the ps command. More... | |
| struct | JobDirInfo |
| Structure that holds information about the disk usage for a job. More... | |
Typedefs | |
| typedef struct apmon_mon_utils::PsInfo | PsInfo |
| Structure that holds information about a job, as obtained from the ps command. | |
| typedef struct apmon_mon_utils::JobDirInfo | JobDirInfo |
| Structure that holds information about the disk usage for a job. | |
Functions | |
| long * | getChildren (long pid, int &nChildren) throw (runtime_error) |
| Determines all the descendants of a given process. | |
| void | readJobInfo (long pid, PsInfo &info) throw (runtime_error) |
| Obtains monitoring information for a given job and all its sub-jobs (descendant processes) with the aid of the ps command. | |
| double | parsePSTime (char *s) |
| Function that parses a time formatted like "days-hours:min:sec" and returns the corresponding number of seconds. | |
| void | readJobDiskUsage (MonitoredJob job, JobDirInfo &info) throw (runtime_error) |
| If there is a work directory defined, then compute the used space in that directory and the free disk space on the partition to which that directory belongs. | |
| typedef struct apmon_mon_utils::JobDirInfo apmon_mon_utils::JobDirInfo |
Structure that holds information about the disk usage for a job.
| typedef struct apmon_mon_utils::PsInfo apmon_mon_utils::PsInfo |
Structure that holds information about a job, as obtained from the ps command.
| long * apmon_mon_utils::getChildren | ( | long | pid, | |
| int & | nChildren | |||
| ) | throw (runtime_error) |
Determines all the descendants of a given process.
Definition at line 883 of file monitor_utils.cpp.
00884 { 00885 #ifdef WIN32 00886 return 0; 00887 #else 00888 FILE *pf; 00889 long *pids, *ppids, *children; 00890 int nProcesses; 00891 int i, j, status; 00892 pid_t cpid; 00893 char *argv[4], msg[MAX_STRING_LEN], sval[20]; 00894 bool processFound; 00895 long mypid = getpid(); 00896 char children_f[50], np_f[50], cmd[200]; 00897 00898 /* generate the names of the temporary files in which we have the output 00899 of some commands */ 00900 sprintf(children_f, "/tmp/apmon_children%ld", mypid); 00901 sprintf(np_f, "/tmp/apmon_np%ld", mypid); 00902 00903 switch (cpid = fork()) { 00904 case -1: 00905 throw runtime_error("[ getChildren() ] Unable to fork()"); 00906 case 0: 00907 argv[0] = (char *)"/bin/sh"; argv[1] = (char *)"-c"; 00908 sprintf(cmd, "ps --no-headers -A -o ppid,pid > %s && wc -l %s > %s", 00909 children_f, children_f, np_f); 00910 argv[2] = cmd; 00911 /* 00912 argv[2] = "ps --no-headers -eo ppid,pid > /tmp/apmon_children.txt && wc -l /tmp/out_children.txt > /tmp/out_np.txt"; 00913 */ 00914 argv[3] = 0; 00915 execv("/bin/sh", argv); 00916 exit(RET_ERROR); 00917 default: 00918 if (waitpid(cpid, &status, 0) == -1) { 00919 sprintf(msg, "[ getChildren() ] The number of sub-processes for %ld could not be determined", pid); 00920 unlink(children_f); unlink(np_f); 00921 throw runtime_error(msg); 00922 } 00923 } 00924 00925 /* find the number of processes */ 00926 pf = fopen(np_f, "rt"); 00927 if (pf == NULL) { 00928 unlink(np_f); unlink(children_f); 00929 sprintf(msg, "[ getChildren() ] The number of sub-processes for %ld could not be determined", 00930 pid); 00931 throw runtime_error(msg); 00932 } 00933 fscanf(pf, "%d", &nProcesses); 00934 fclose(pf); 00935 unlink(np_f); 00936 00937 pids = (long *)malloc(nProcesses * sizeof(long)); 00938 ppids = (long *)malloc(nProcesses * sizeof(long)); 00939 /* estimated maximum size for the returned vector; it will be realloc'ed */ 00940 children = (long *)malloc(nProcesses * sizeof(long)); 00941 00942 pf = fopen(children_f, 
"rt"); 00943 if (pf == NULL) { 00944 free(pids); free(ppids); free(children); 00945 unlink(children_f); 00946 sprintf(msg, "[ getChildren() ] The sub-processes for %ld could not be determined", pid); 00947 throw runtime_error(msg); 00948 } 00949 00950 /* scan the output of the ps command and find the children of the process, 00951 and also check if the process is still running */ 00952 children[0] = pid; nChildren = 1; 00953 processFound = false; 00954 for (i = 0; i < nProcesses; i++) { 00955 fscanf(pf, "%ld %ld", &ppids[i], &pids[i]); 00956 /* look for the given process */ 00957 if (pids[i] == children[0] || ppids[i] == children[0]) 00958 processFound = true; 00959 if (ppids[i] == children[0]) { 00960 children[nChildren++] = pids[i]; 00961 } 00962 } 00963 fclose(pf); 00964 unlink(children_f); 00965 00966 if (processFound == false) { 00967 free(pids); free(ppids); free(children); 00968 nChildren = 0; 00969 sprintf(msg, "[ getChildren() ] The process %ld does not exist", pid); 00970 throw runtime_error(msg); 00971 } 00972 00973 /* find the PIDs of all the descendant processes */ 00974 i = 1; 00975 while (i < nChildren) { 00976 /* find the children of the i-th child */ 00977 for (j = 0; j < nProcesses; j++) { 00978 if (ppids[j] == children[i]) { 00979 children[nChildren++] = pids[j]; 00980 } 00981 } 00982 i++; 00983 } 00984 00985 sprintf(msg, "Sub-processes for process %ld: ", pid); 00986 for (i = 0; i < nChildren; i++) { 00987 sprintf(sval, "%ld ", children[i]); 00988 if (strlen(msg) + strlen(sval) < MAX_STRING_LEN - 1) 00989 strcat(msg, sval); 00990 } 00991 logger(DEBUG, msg); 00992 00993 free(pids); free(ppids); 00994 children = (long *)realloc(children, (nChildren) * sizeof(long)); 00995 return children; 00996 #endif 00997 }
| double apmon_mon_utils::parsePSTime | ( | char * | s | ) |
Function that parses a time formatted like "days-hours:min:sec" and returns the corresponding number of seconds.
Definition at line 1183 of file monitor_utils.cpp.
01183 { 01184 long days, hours, mins, secs; 01185 01186 if (strchr(s, '-') != NULL) { 01187 sscanf(s, "%ld-%ld:%ld:%ld", &days, &hours, &mins, &secs); 01188 return 24. * 3600 * days + 3600 * hours + 60 * mins + secs; 01189 } else { 01190 if (strchr(s, ':') != NULL && strchr(s, ':') != strrchr(s, ':')) { 01191 sscanf(s, "%ld:%ld:%ld", &hours, &mins, &secs); 01192 return 3600. * hours + 60 * mins + secs; 01193 } else { 01194 if (strchr(s, ':') != NULL) { 01195 sscanf(s, "%ld:%ld", &mins, &secs); 01196 return 60. * mins + secs; 01197 } else { 01198 return RET_ERROR; 01199 } 01200 } 01201 } 01202 }
| void apmon_mon_utils::readJobDiskUsage | ( | MonitoredJob | job, | |
| JobDirInfo & | info | |||
| ) | throw (runtime_error) |
If there is a work directory defined, then compute the used space in that directory and the free disk space on the partition to which that directory belongs.
Sizes are given in MB.
Definition at line 1204 of file monitor_utils.cpp.
01205 { 01206 #ifndef WIN32 01207 int status; 01208 pid_t cpid; 01209 char *cmd, s_tmp[20], *argv[4], msg[100]; 01210 FILE *fp; 01211 long mypid = getpid(); 01212 char du_f[50], df_f[50]; 01213 01214 /* generate names for the temporary files which will hold the output of the 01215 du and df commands */ 01216 sprintf(du_f, "/tmp/apmon_du%ld", mypid); 01217 sprintf(df_f, "/tmp/apmon_df%ld", mypid); 01218 01219 if (strlen(job.workdir) == 0) { 01220 sprintf(msg, "[ readJobDiskUsage() ] The working directory for the job %ld was not specified, not monitoring disk usage", job.pid); 01221 throw runtime_error(msg); 01222 } 01223 01224 cmd = (char *)malloc((300 + 2 * strlen(job.workdir)) * sizeof(char)); 01225 strcpy(cmd, "PRT=`du -Lsk "); 01226 strcat(cmd, job.workdir); 01227 //strcat(cmd, " | tail -1 | cut -f 1 > "); 01228 strcat(cmd, " ` ; if [[ $? -eq 0 ]] ; then OUT=`echo $PRT | cut -f 1` ; echo $OUT ; exit 0 ; else exit -1 ; fi > "); 01229 strcat(cmd, du_f); 01230 01231 01232 switch (cpid = fork()) { 01233 case -1: 01234 sprintf(msg, "[ readJobDiskUsage() ] Unable to fork(). The disk usage information could not be determined for %ld", job.pid); 01235 throw runtime_error(msg); 01236 case 0: 01237 argv[0] = (char *)"/bin/sh"; argv[1] = (char *)"-c"; 01238 argv[2] = cmd; argv[3] = 0; 01239 execv("/bin/sh", argv); 01240 exit(RET_ERROR); 01241 default: 01242 if (waitpid(cpid, &status, 0) == -1) { 01243 free(cmd); 01244 sprintf(msg, "[ readJobDiskUsage() ] The disk usage (du) information for %ld could not be determined", job.pid); 01245 unlink(du_f); unlink(df_f); 01246 throw runtime_error(msg); 01247 } 01248 } 01249 01250 strcpy(cmd, "PRT=`df -m "); 01251 strcat(cmd, job.workdir); 01252 //strcat(cmd, " | tail -1 > "); 01253 strcat(cmd, " `; if [[ $? 
-eq 0 ]] ; then OUT=`echo $PRT | cut -d ' ' -f 8-` ; echo $OUT ; exit 0 ; else exit -1 ; fi > "); 01254 01255 strcat(cmd, df_f); 01256 //printf("### cmd: %s\n", cmd); 01257 01258 switch (cpid = fork()) { 01259 case -1: 01260 sprintf(msg, "[ readJobDiskUsage() ] Unable to fork(). The disk usage information could not be determined for %ld", job.pid); 01261 throw runtime_error(msg); 01262 case 0: 01263 argv[0] = (char *)"/bin/sh"; argv[1] = (char *)"-c"; 01264 argv[2] = cmd; argv[3] = 0; 01265 execv("/bin/sh", argv); 01266 exit(RET_ERROR); 01267 default: 01268 if (waitpid(cpid, &status, 0) == -1) { 01269 free(cmd); 01270 sprintf(msg, "[ readJobDiskUsage() ] The disk usage (df) information for %ld could not be determined", job.pid); 01271 unlink(du_f); unlink(df_f); 01272 throw runtime_error(msg); 01273 } 01274 } 01275 01276 free(cmd); 01277 fp = fopen(du_f, "rt"); 01278 if (fp == NULL) { 01279 sprintf(msg, "[ readJobDiskUsage() ] Error opening du output file for process %ld", job.pid); 01280 throw runtime_error(msg); 01281 } 01282 01283 fscanf(fp, "%lf", &(info.workdir_size)); 01284 /* keep the directory size in MB */ 01285 info.workdir_size /= 1024.0; 01286 fclose(fp); 01287 unlink(du_f); 01288 01289 fp = fopen(df_f, "rt"); 01290 if (fp == NULL) { 01291 sprintf(msg, "[ readJobDiskUsage() ] Error opening df output file for process %ld", job.pid); 01292 throw runtime_error(msg); 01293 } 01294 fscanf(fp, "%s %lf %lf %lf %lf", s_tmp, &(info.disk_total), 01295 &(info.disk_used), &(info.disk_free), &(info.disk_usage)); 01296 fclose(fp); 01297 unlink(df_f); 01298 #endif 01299 }
| void apmon_mon_utils::readJobInfo | ( | long | pid, | |
| PsInfo & | info | |||
| ) | throw (runtime_error) |
Obtains monitoring information for a given job and all its sub-jobs (descendant processes) with the aid of the ps command.
Definition at line 999 of file monitor_utils.cpp.
00999 { 01000 #ifndef WIN32 01001 long *children; 01002 FILE *fp; 01003 int i, nChildren, status, ch, ret, open_fd; 01004 char *cmd , *mem_cmd_s, *argv[4], *ret_s; 01005 char pid_s[10], msg[100]; 01006 char cmdName[MAX_STRING_LEN1], buf[MAX_STRING_LEN1], buf2[MAX_STRING_LEN1]; 01007 char etime_s[20], cputime_s[20]; 01008 double rsz, vsz; 01009 double etime, cputime; 01010 double pcpu, pmem; 01011 /* this list contains strings of the form "rsz_vsz_command" for every pid; 01012 it is used to avoid adding several times processes that have multiple 01013 threads and appear in ps as sepparate processes, occupying exactly the 01014 same amount of memory and having the same command name. For every line 01015 from the output of the ps command we verify if the rsz_vsz_command 01016 combination is already in the list. 01017 */ 01018 char **mem_cmd_list; 01019 int listSize; 01020 long cpid, crt_pid; 01021 //unsigned int maxCmdLen = 5 * MAX_STRING_LEN; 01022 long mypid = getpid(); 01023 char ps_f[50]; 01024 01025 /* get the list of the process' descendants */ 01026 children = getChildren(pid, nChildren); 01027 01028 /* generate a name for the temporary file which holds the output of the 01029 ps command */ 01030 sprintf(ps_f, "/tmp/apmon_ps%ld", mypid); 01031 01032 unsigned int cmdLen = (150 + 6 * nChildren) * sizeof(char); 01033 cmd = (char *)malloc (cmdLen); 01034 01035 /* issue the "ps" command to obtain information on all the descendants */ 01036 strcpy(cmd, "ps --no-headers --pid "); 01037 for (i = 0; i < nChildren - 1; i++) { 01038 sprintf(pid_s, "%ld,", children[i]); 01039 if (strlen(cmd) + strlen(pid_s) + 1 >= cmdLen) { 01040 free(cmd); 01041 sprintf(msg, "[ readJobInfo() ] Job %ld has too many sub-processes to be monitored", 01042 pid); 01043 throw runtime_error(msg); 01044 } 01045 strcat(cmd, pid_s); 01046 //strcat(cmd, " 2>&1"); 01047 } 01048 01049 /* the last part of the command */ 01050 sprintf(pid_s, "%ld", children[nChildren - 1]); 01051 sprintf(cmdName, " -o 
pid,etime,time,%%cpu,%%mem,rsz,vsz,comm > %s", ps_f); 01052 if (strlen(cmd) + strlen(pid_s) + strlen(cmdName) >= cmdLen) { 01053 free(cmd); 01054 sprintf(msg, "[ readJobInfo() ] Job %ld has too many sub-processes to be monitored", 01055 pid); 01056 throw runtime_error(msg); 01057 } 01058 strcat(cmd, pid_s); 01059 strcat(cmd, cmdName); 01060 //strcat(cmd, " 2>&1"); 01061 01062 switch (cpid = fork()) { 01063 case -1: 01064 free(cmd); 01065 sprintf(msg, "[ readJobInfo() ] Unable to fork(). The job information could not be determined for %ld", pid); 01066 throw runtime_error(msg); 01067 case 0: 01068 argv[0] = (char *)"/bin/sh"; argv[1] = (char *)"-c"; 01069 argv[2] = cmd; argv[3] = 0; 01070 execv("/bin/sh", argv); 01071 exit(RET_ERROR); 01072 default: 01073 if (waitpid(cpid, &status, 0) == -1) { 01074 free(cmd); 01075 sprintf(msg, "[ readJobInfo() ] The job information for %ld could not be determined", pid); 01076 throw runtime_error(msg); 01077 } 01078 } 01079 01080 free(cmd); 01081 fp = fopen(ps_f, "rt"); 01082 if (fp == NULL) { 01083 sprintf(msg, "[ readJobInfo() ] Error opening the ps output file for process %ld", pid); 01084 throw runtime_error(msg); 01085 } 01086 01087 /* parse the output file */ 01088 info.etime = info.cputime = 0; 01089 info.pcpu = info.pmem = 0; 01090 info.rsz = info.vsz = 0; 01091 info.open_fd = 0; 01092 mem_cmd_list = (char **)malloc(nChildren * sizeof(char *)); 01093 listSize = 0; 01094 cmdName[0] = 0; 01095 while (1) { 01096 ret_s = fgets(buf, MAX_STRING_LEN, fp); 01097 if (ret_s == NULL) 01098 break; 01099 buf[MAX_STRING_LEN - 1] = 0; 01100 01101 /* if the line was too long and fgets hasn't read it entirely, */ 01102 /* keep only the first 512 chars from the line */ 01103 ch = fgetc(fp); // see if we are at the end of the file 01104 ungetc(ch, fp); 01105 if (buf[strlen(buf) - 1] != 10 && ch != EOF) { 01106 while (1) { 01107 char *sret = fgets(buf2, MAX_STRING_LEN, fp); 01108 if (sret == NULL || buf[strlen(buf) - 1] == 10) 01109 break; 
01110 } 01111 } 01112 01113 ret = sscanf(buf, "%ld %s %s %lf %lf %lf %lf %s", &crt_pid, etime_s, 01114 cputime_s, &pcpu, &pmem, &rsz, &vsz, cmdName); 01115 if (ret != 8) { 01116 fclose(fp); 01117 unlink(ps_f); 01118 free(children); 01119 for (i = 0; i < listSize; i++) { 01120 free(mem_cmd_list[i]); 01121 } 01122 free(mem_cmd_list); 01123 throw runtime_error("[ readJobInfo() ] Error parsing the output of the ps command"); 01124 } 01125 01126 /* etime is the maximum of the elapsed times for the subprocesses */ 01127 etime = parsePSTime(etime_s); 01128 01129 info.etime = (info.etime > etime) ? info.etime : etime; 01130 01131 /* cputime is the sum of the cpu times for the subprocesses */ 01132 cputime = parsePSTime(cputime_s); 01133 info.cputime += cputime; 01134 info.pcpu += pcpu; 01135 01136 /* get the number of opened file descriptors */ 01137 try { 01138 open_fd = ProcUtils::countOpenFiles(crt_pid); 01139 } catch (procutils_error& err) { 01140 logger(WARNING, err.what()); 01141 /* don't throw an exception if we couldn't read the number of files */ 01142 open_fd = PROCUTILS_ERROR; 01143 } 01144 01145 /* see if this is a process or just a thread */ 01146 mem_cmd_s = (char *)malloc(MAX_STRING_LEN * sizeof(char)); 01147 sprintf(mem_cmd_s, "%f_%f_%s", rsz, vsz, cmdName); 01148 //printf("### mem_cmd_s: %s\n", mem_cmd_s); 01149 if (getVectIndex(mem_cmd_s, mem_cmd_list, listSize) == -1) { 01150 /* another pid with the same command name, rsz and vsz was not found, 01151 so this is a new process and we can add the amount of memory used by 01152 it */ 01153 info.pmem += pmem; 01154 info.vsz += vsz; info.rsz += rsz; 01155 01156 if (info.open_fd >= 0) // if no error occured so far 01157 info.open_fd += open_fd; 01158 /* add an entry in the list so that next time we see another thread of 01159 this process we don't add the amount of memory again */ 01160 mem_cmd_list[listSize++] = mem_cmd_s; 01161 } else { 01162 free(mem_cmd_s); 01163 } 01164 01165 /* if we monitor the current 
process, we have two extra opened files 01166 that we shouldn't take into account (the output file for ps and 01167 /proc/<pid>/fd/) 01168 */ 01169 if (crt_pid == getpid()) 01170 info.open_fd -= 2; 01171 } 01172 01173 fclose(fp); 01174 unlink(ps_f); 01175 free(children); 01176 for (i = 0; i < listSize; i++) { 01177 free(mem_cmd_list[i]); 01178 } 01179 free(mem_cmd_list); 01180 #endif 01181 }