|
Gaudi Framework, version v21r7 |
| Home | Generated: 22 Jan 2010 |
Classes | |
| struct | PsInfo |
| Structure that holds information about a job, as obtained from the ps command. More... | |
| struct | JobDirInfo |
| Structure that holds information about the disk usage for a job. More... | |
Typedefs | |
| typedef struct apmon_mon_utils::PsInfo | PsInfo |
| Structure that holds information about a job, as obtained from the ps command. | |
| typedef struct apmon_mon_utils::JobDirInfo | JobDirInfo |
| Structure that holds information about the disk usage for a job. | |
Functions | |
| long * | getChildren (long pid, int &nChildren) throw (runtime_error) |
| Determines all the descendants of a given process. | |
| void | readJobInfo (long pid, PsInfo &info) throw (runtime_error) |
| Obtains monitoring information for a given job and all its sub-jobs (descendant processes) with the aid of the ps command. | |
| long | parsePSTime (char *s) |
| Function that parses a time formatted like "days-hours:min:sec" and returns the corresponding number of seconds. | |
| void | readJobDiskUsage (MonitoredJob job, JobDirInfo &info) throw (runtime_error) |
| If there is a work directory defined, then compute the used space in that directory and the free disk space on the partition to which that directory belongs. | |
| typedef struct apmon_mon_utils::JobDirInfo apmon_mon_utils::JobDirInfo |
Structure that holds information about the disk usage for a job.
| typedef struct apmon_mon_utils::PsInfo apmon_mon_utils::PsInfo |
Structure that holds information about a job, as obtained from the ps command.
| long * apmon_mon_utils::getChildren | ( | long | pid, | |
| int & | nChildren | |||
| ) | throw (runtime_error) |
Determines all the descendants of a given process.
Definition at line 885 of file monitor_utils.cpp.
00886 { 00887 #ifdef WIN32 00888 return 0; 00889 #else 00890 FILE *pf; 00891 long *pids, *ppids, *children; 00892 int nProcesses; 00893 int i, j, status; 00894 pid_t cpid; 00895 char *argv[4], msg[MAX_STRING_LEN], sval[20]; 00896 bool processFound; 00897 long mypid = getpid(); 00898 char children_f[50], np_f[50], cmd[200]; 00899 00900 /* generate the names of the temporary files in which we have the output 00901 of some commands */ 00902 sprintf(children_f, "/tmp/apmon_children%ld", mypid); 00903 sprintf(np_f, "/tmp/apmon_np%ld", mypid); 00904 00905 switch (cpid = fork()) { 00906 case -1: 00907 throw runtime_error("[ getChildren() ] Unable to fork()"); 00908 case 0: 00909 argv[0] = (char *)"/bin/sh"; argv[1] = (char *)"-c"; 00910 sprintf(cmd, "ps --no-headers -A -o ppid,pid > %s && wc -l %s > %s", 00911 children_f, children_f, np_f); 00912 argv[2] = cmd; 00913 /* 00914 argv[2] = "ps --no-headers -eo ppid,pid > /tmp/apmon_children.txt && wc -l /tmp/out_children.txt > /tmp/out_np.txt"; 00915 */ 00916 argv[3] = 0; 00917 execv("/bin/sh", argv); 00918 exit(RET_ERROR); 00919 default: 00920 if (waitpid(cpid, &status, 0) == -1) { 00921 sprintf(msg, "[ getChildren() ] The number of sub-processes for %ld could not be determined", pid); 00922 unlink(children_f); unlink(np_f); 00923 throw runtime_error(msg); 00924 } 00925 } 00926 00927 /* find the number of processes */ 00928 pf = fopen(np_f, "rt"); 00929 if (pf == NULL) { 00930 unlink(np_f); unlink(children_f); 00931 sprintf(msg, "[ getChildren() ] The number of sub-processes for %ld could not be determined", 00932 pid); 00933 throw runtime_error(msg); 00934 } 00935 fscanf(pf, "%d", &nProcesses); 00936 fclose(pf); 00937 unlink(np_f); 00938 00939 pids = (long *)malloc(nProcesses * sizeof(long)); 00940 ppids = (long *)malloc(nProcesses * sizeof(long)); 00941 /* estimated maximum size for the returned vector; it will be realloc'ed */ 00942 children = (long *)malloc(nProcesses * sizeof(long)); 00943 00944 pf = fopen(children_f, 
"rt"); 00945 if (pf == NULL) { 00946 free(pids); free(ppids); free(children); 00947 unlink(children_f); 00948 sprintf(msg, "[ getChildren() ] The sub-processes for %ld could not be determined", pid); 00949 throw runtime_error(msg); 00950 } 00951 00952 /* scan the output of the ps command and find the children of the process, 00953 and also check if the process is still running */ 00954 children[0] = pid; nChildren = 1; 00955 processFound = false; 00956 for (i = 0; i < nProcesses; i++) { 00957 fscanf(pf, "%ld %ld", &ppids[i], &pids[i]); 00958 /* look for the given process */ 00959 if (pids[i] == children[0] || ppids[i] == children[0]) 00960 processFound = true; 00961 if (ppids[i] == children[0]) { 00962 children[nChildren++] = pids[i]; 00963 } 00964 } 00965 fclose(pf); 00966 unlink(children_f); 00967 00968 if (processFound == false) { 00969 free(pids); free(ppids); free(children); 00970 nChildren = 0; 00971 sprintf(msg, "[ getChildren() ] The process %ld does not exist", pid); 00972 throw runtime_error(msg); 00973 } 00974 00975 /* find the PIDs of all the descendant processes */ 00976 i = 1; 00977 while (i < nChildren) { 00978 /* find the children of the i-th child */ 00979 for (j = 0; j < nProcesses; j++) { 00980 if (ppids[j] == children[i]) { 00981 children[nChildren++] = pids[j]; 00982 } 00983 } 00984 i++; 00985 } 00986 00987 sprintf(msg, "Sub-processes for process %ld: ", pid); 00988 for (i = 0; i < nChildren; i++) { 00989 sprintf(sval, "%ld ", children[i]); 00990 if (strlen(msg) + strlen(sval) < MAX_STRING_LEN - 1) 00991 strcat(msg, sval); 00992 } 00993 logger(DEBUG, msg); 00994 00995 free(pids); free(ppids); 00996 children = (long *)realloc(children, (nChildren) * sizeof(long)); 00997 return children; 00998 #endif 00999 }
| long apmon_mon_utils::parsePSTime | ( | char * | s | ) |
Function that parses a time formatted like "days-hours:min:sec" and returns the corresponding number of seconds.
Definition at line 1184 of file monitor_utils.cpp.
01184 { 01185 long days, hours, mins, secs; 01186 01187 if (strchr(s, '-') != NULL) { 01188 sscanf(s, "%ld-%ld:%ld:%ld", &days, &hours, &mins, &secs); 01189 return 24 * 3600 * days + 3600 * hours + 60 * mins + secs; 01190 } else { 01191 if (strchr(s, ':') != NULL && strchr(s, ':') != strrchr(s, ':')) { 01192 sscanf(s, "%ld:%ld:%ld", &hours, &mins, &secs); 01193 return 3600 * hours + 60 * mins + secs; 01194 } else { 01195 if (strchr(s, ':') != NULL) { 01196 sscanf(s, "%ld:%ld", &mins, &secs); 01197 return 60 * mins + secs; 01198 } else { 01199 return RET_ERROR; 01200 } 01201 } 01202 } 01203 }
| void apmon_mon_utils::readJobDiskUsage | ( | MonitoredJob | job, | |
| JobDirInfo & | info | |||
| ) | throw (runtime_error) |
If there is a work directory defined, then compute the used space in that directory and the free disk space on the partition to which that directory belongs.
Sizes are given in MB.
Definition at line 1205 of file monitor_utils.cpp.
01206 { 01207 #ifndef WIN32 01208 int status; 01209 pid_t cpid; 01210 char *cmd, s_tmp[20], *argv[4], msg[100]; 01211 FILE *fp; 01212 long mypid = getpid(); 01213 char du_f[50], df_f[50]; 01214 01215 /* generate names for the temporary files which will hold the output of the 01216 du and df commands */ 01217 sprintf(du_f, "/tmp/apmon_du%ld", mypid); 01218 sprintf(df_f, "/tmp/apmon_df%ld", mypid); 01219 01220 if (strlen(job.workdir) == 0) { 01221 sprintf(msg, "[ readJobDiskUsage() ] The working directory for the job %ld was not specified, not monitoring disk usage", job.pid); 01222 throw runtime_error(msg); 01223 } 01224 01225 cmd = (char *)malloc((300 + 2 * strlen(job.workdir)) * sizeof(char)); 01226 strcpy(cmd, "PRT=`du -Lsk "); 01227 strcat(cmd, job.workdir); 01228 //strcat(cmd, " | tail -1 | cut -f 1 > "); 01229 strcat(cmd, " ` ; if [[ $? -eq 0 ]] ; then OUT=`echo $PRT | cut -f 1` ; echo $OUT ; exit 0 ; else exit -1 ; fi > "); 01230 strcat(cmd, du_f); 01231 01232 01233 switch (cpid = fork()) { 01234 case -1: 01235 sprintf(msg, "[ readJobDiskUsage() ] Unable to fork(). The disk usage information could not be determined for %ld", job.pid); 01236 throw runtime_error(msg); 01237 case 0: 01238 argv[0] = (char *)"/bin/sh"; argv[1] = (char *)"-c"; 01239 argv[2] = cmd; argv[3] = 0; 01240 execv("/bin/sh", argv); 01241 exit(RET_ERROR); 01242 default: 01243 if (waitpid(cpid, &status, 0) == -1) { 01244 free(cmd); 01245 sprintf(msg, "[ readJobDiskUsage() ] The disk usage (du) information for %ld could not be determined", job.pid); 01246 unlink(du_f); unlink(df_f); 01247 throw runtime_error(msg); 01248 } 01249 } 01250 01251 strcpy(cmd, "PRT=`df -m "); 01252 strcat(cmd, job.workdir); 01253 //strcat(cmd, " | tail -1 > "); 01254 strcat(cmd, " `; if [[ $? 
-eq 0 ]] ; then OUT=`echo $PRT | cut -d ' ' -f 8-` ; echo $OUT ; exit 0 ; else exit -1 ; fi > "); 01255 01256 strcat(cmd, df_f); 01257 //printf("### cmd: %s\n", cmd); 01258 01259 switch (cpid = fork()) { 01260 case -1: 01261 sprintf(msg, "[ readJobDiskUsage() ] Unable to fork(). The disk usage information could not be determined for %ld", job.pid); 01262 throw runtime_error(msg); 01263 case 0: 01264 argv[0] = (char *)"/bin/sh"; argv[1] = (char *)"-c"; 01265 argv[2] = cmd; argv[3] = 0; 01266 execv("/bin/sh", argv); 01267 exit(RET_ERROR); 01268 default: 01269 if (waitpid(cpid, &status, 0) == -1) { 01270 free(cmd); 01271 sprintf(msg, "[ readJobDiskUsage() ] The disk usage (df) information for %ld could not be determined", job.pid); 01272 unlink(du_f); unlink(df_f); 01273 throw runtime_error(msg); 01274 } 01275 } 01276 01277 free(cmd); 01278 fp = fopen(du_f, "rt"); 01279 if (fp == NULL) { 01280 sprintf(msg, "[ readJobDiskUsage() ] Error opening du output file for process %ld", job.pid); 01281 throw runtime_error(msg); 01282 } 01283 01284 fscanf(fp, "%lf", &(info.workdir_size)); 01285 /* keep the directory size in MB */ 01286 info.workdir_size /= 1024.0; 01287 fclose(fp); 01288 unlink(du_f); 01289 01290 fp = fopen(df_f, "rt"); 01291 if (fp == NULL) { 01292 sprintf(msg, "[ readJobDiskUsage() ] Error opening df output file for process %ld", job.pid); 01293 throw runtime_error(msg); 01294 } 01295 fscanf(fp, "%s %lf %lf %lf %lf", s_tmp, &(info.disk_total), 01296 &(info.disk_used), &(info.disk_free), &(info.disk_usage)); 01297 fclose(fp); 01298 unlink(df_f); 01299 #endif 01300 }
| void apmon_mon_utils::readJobInfo | ( | long | pid, | |
| PsInfo & | info | |||
| ) | throw (runtime_error) |
Obtains monitoring information for a given job and all its sub-jobs (descendant processes) with the aid of the ps command.
Definition at line 1001 of file monitor_utils.cpp.
01001 { 01002 #ifndef WIN32 01003 long *children; 01004 FILE *fp; 01005 int i, nChildren, status, ch, ret, open_fd; 01006 char *cmd , *mem_cmd_s, *argv[4], *ret_s; 01007 char pid_s[10], msg[100]; 01008 char cmdName[MAX_STRING_LEN1], buf[MAX_STRING_LEN1], buf2[MAX_STRING_LEN1]; 01009 char etime_s[20], cputime_s[20]; 01010 double rsz, vsz; 01011 double etime, cputime; 01012 double pcpu, pmem; 01013 /* this list contains strings of the form "rsz_vsz_command" for every pid; 01014 it is used to avoid adding several times processes that have multiple 01015 threads and appear in ps as sepparate processes, occupying exactly the 01016 same amount of memory and having the same command name. For every line 01017 from the output of the ps command we verify if the rsz_vsz_command 01018 combination is already in the list. 01019 */ 01020 char **mem_cmd_list; 01021 int listSize; 01022 long cpid, crt_pid; 01023 //unsigned int maxCmdLen = 5 * MAX_STRING_LEN; 01024 long mypid = getpid(); 01025 char ps_f[50]; 01026 01027 /* get the list of the process' descendants */ 01028 children = getChildren(pid, nChildren); 01029 01030 /* generate a name for the temporary file which holds the output of the 01031 ps command */ 01032 sprintf(ps_f, "/tmp/apmon_ps%ld", mypid); 01033 01034 unsigned int cmdLen = (150 + 6 * nChildren) * sizeof(char); 01035 cmd = (char *)malloc (cmdLen); 01036 01037 /* issue the "ps" command to obtain information on all the descendants */ 01038 strcpy(cmd, "ps --no-headers --pid "); 01039 for (i = 0; i < nChildren - 1; i++) { 01040 sprintf(pid_s, "%ld,", children[i]); 01041 if (strlen(cmd) + strlen(pid_s) + 1 >= cmdLen) { 01042 free(cmd); 01043 sprintf(msg, "[ readJobInfo() ] Job %ld has too many sub-processes to be monitored", 01044 pid); 01045 throw runtime_error(msg); 01046 } 01047 strcat(cmd, pid_s); 01048 //strcat(cmd, " 2>&1"); 01049 } 01050 01051 /* the last part of the command */ 01052 sprintf(pid_s, "%ld", children[nChildren - 1]); 01053 sprintf(cmdName, " -o 
pid,etime,time,%%cpu,%%mem,rsz,vsz,comm > %s", ps_f); 01054 if (strlen(cmd) + strlen(pid_s) + strlen(cmdName) >= cmdLen) { 01055 free(cmd); 01056 sprintf(msg, "[ readJobInfo() ] Job %ld has too many sub-processes to be monitored", 01057 pid); 01058 throw runtime_error(msg); 01059 } 01060 strcat(cmd, pid_s); 01061 strcat(cmd, cmdName); 01062 //strcat(cmd, " 2>&1"); 01063 01064 switch (cpid = fork()) { 01065 case -1: 01066 free(cmd); 01067 sprintf(msg, "[ readJobInfo() ] Unable to fork(). The job information could not be determined for %ld", pid); 01068 throw runtime_error(msg); 01069 case 0: 01070 argv[0] = (char *)"/bin/sh"; argv[1] = (char *)"-c"; 01071 argv[2] = cmd; argv[3] = 0; 01072 execv("/bin/sh", argv); 01073 exit(RET_ERROR); 01074 default: 01075 if (waitpid(cpid, &status, 0) == -1) { 01076 free(cmd); 01077 sprintf(msg, "[ readJobInfo() ] The job information for %ld could not be determined", pid); 01078 throw runtime_error(msg); 01079 } 01080 } 01081 01082 free(cmd); 01083 fp = fopen(ps_f, "rt"); 01084 if (fp == NULL) { 01085 sprintf(msg, "[ readJobInfo() ] Error opening the ps output file for process %ld", pid); 01086 throw runtime_error(msg); 01087 } 01088 01089 /* parse the output file */ 01090 info.etime = info.cputime = 0; 01091 info.pcpu = info.pmem = 0; 01092 info.rsz = info.vsz = 0; 01093 info.open_fd = 0; 01094 mem_cmd_list = (char **)malloc(nChildren * sizeof(char *)); 01095 listSize = 0; 01096 cmdName[0] = 0; 01097 while (1) { 01098 ret_s = fgets(buf, MAX_STRING_LEN, fp); 01099 if (ret_s == NULL) 01100 break; 01101 buf[MAX_STRING_LEN - 1] = 0; 01102 01103 /* if the line was too long and fgets hasn't read it entirely, */ 01104 /* keep only the first 512 chars from the line */ 01105 ch = fgetc(fp); // see if we are at the end of the file 01106 ungetc(ch, fp); 01107 if (buf[strlen(buf) - 1] != 10 && ch != EOF) { 01108 while (1) { 01109 char *sret = fgets(buf2, MAX_STRING_LEN, fp); 01110 if (sret == NULL || buf[strlen(buf) - 1] == 10) 01111 break; 
01112 } 01113 } 01114 01115 ret = sscanf(buf, "%ld %s %s %lf %lf %lf %lf %s", &crt_pid, etime_s, 01116 cputime_s, &pcpu, &pmem, &rsz, &vsz, cmdName); 01117 if (ret != 8) { 01118 fclose(fp); 01119 unlink(ps_f); 01120 free(children); 01121 for (i = 0; i < listSize; i++) { 01122 free(mem_cmd_list[i]); 01123 } 01124 free(mem_cmd_list); 01125 throw runtime_error("[ readJobInfo() ] Error parsing the output of the ps command"); 01126 } 01127 01128 /* etime is the maximum of the elapsed times for the subprocesses */ 01129 etime = parsePSTime(etime_s); 01130 info.etime = (info.etime > etime) ? info.etime : etime; 01131 01132 /* cputime is the sum of the cpu times for the subprocesses */ 01133 cputime = parsePSTime(cputime_s); 01134 info.cputime += cputime; 01135 info.pcpu += pcpu; 01136 01137 /* get the number of opened file descriptors */ 01138 try { 01139 open_fd = ProcUtils::countOpenFiles(crt_pid); 01140 } catch (procutils_error& err) { 01141 logger(WARNING, err.what()); 01142 /* don't throw an exception if we couldn't read the number of files */ 01143 open_fd = PROCUTILS_ERROR; 01144 } 01145 01146 /* see if this is a process or just a thread */ 01147 mem_cmd_s = (char *)malloc(MAX_STRING_LEN * sizeof(char)); 01148 sprintf(mem_cmd_s, "%f_%f_%s", rsz, vsz, cmdName); 01149 //printf("### mem_cmd_s: %s\n", mem_cmd_s); 01150 if (getVectIndex(mem_cmd_s, mem_cmd_list, listSize) == -1) { 01151 /* aonther pid with the same command name, rsz and vsz was not found, 01152 so this is a new process and we can add the amount of memory used by 01153 it */ 01154 info.pmem += pmem; 01155 info.vsz += vsz; info.rsz += rsz; 01156 01157 if (info.open_fd >= 0) // if no error occured so far 01158 info.open_fd += open_fd; 01159 /* add an entry in the list so that next time we see another thread of 01160 this process we don't add the amount of memory again */ 01161 mem_cmd_list[listSize++] = mem_cmd_s; 01162 } else { 01163 free(mem_cmd_s); 01164 } 01165 01166 /* if we monitor the current 
process, we have two extra opened files 01167 that we shouldn't take into account (the output file for ps and 01168 /proc/<pid>/fd/) 01169 */ 01170 if (crt_pid == getpid()) 01171 info.open_fd -= 2; 01172 } 01173 01174 fclose(fp); 01175 unlink(ps_f); 01176 free(children); 01177 for (i = 0; i < listSize; i++) { 01178 free(mem_cmd_list[i]); 01179 } 01180 free(mem_cmd_list); 01181 #endif 01182 }