|
Gaudi Framework, version v21r8 |
| Home | Generated: 17 Mar 2010 |
Classes | |
| struct | PsInfo |
| Structure that holds information about a job, as obtained from the ps command. More... | |
| struct | JobDirInfo |
| Structure that holds information about the disk usage for a job. More... | |
Typedefs | |
| typedef struct apmon_mon_utils::PsInfo | PsInfo |
| Structure that holds information about a job, as obtained from the ps command. | |
| typedef struct apmon_mon_utils::JobDirInfo | JobDirInfo |
| Structure that holds information about the disk usage for a job. | |
Functions | |
| long * | getChildren (long pid, int &nChildren) throw (runtime_error) |
| Determines all the descendants of a given process. | |
| void | readJobInfo (long pid, PsInfo &info) throw (runtime_error) |
| Obtains monitoring information for a given job and all its sub-jobs (descendant processes) with the aid of the ps command. | |
| long | parsePSTime (char *s) |
| Function that parses a time formatted like "days-hours:min:sec" and returns the corresponding number of seconds. | |
| void | readJobDiskUsage (MonitoredJob job, JobDirInfo &info) throw (runtime_error) |
| If there is a work directory defined, then compute the used space in that directory and the free disk space on the partition to which that directory belongs. | |
| typedef struct apmon_mon_utils::JobDirInfo apmon_mon_utils::JobDirInfo |
Structure that holds information about the disk usage for a job.
| typedef struct apmon_mon_utils::PsInfo apmon_mon_utils::PsInfo |
Structure that holds information about a job, as obtained from the ps command.
| long * apmon_mon_utils::getChildren | ( | long | pid, | |
| int & | nChildren | |||
| ) | throw (runtime_error) |
Determines all the descendants of a given process.
Definition at line 883 of file monitor_utils.cpp.
00884 { 00885 #ifdef WIN32 00886 return 0; 00887 #else 00888 FILE *pf; 00889 long *pids, *ppids, *children; 00890 int nProcesses; 00891 int i, j, status; 00892 pid_t cpid; 00893 char *argv[4], msg[MAX_STRING_LEN], sval[20]; 00894 bool processFound; 00895 long mypid = getpid(); 00896 char children_f[50], np_f[50], cmd[200]; 00897 00898 /* generate the names of the temporary files in which we have the output 00899 of some commands */ 00900 sprintf(children_f, "/tmp/apmon_children%ld", mypid); 00901 sprintf(np_f, "/tmp/apmon_np%ld", mypid); 00902 00903 switch (cpid = fork()) { 00904 case -1: 00905 throw runtime_error("[ getChildren() ] Unable to fork()"); 00906 case 0: 00907 argv[0] = (char *)"/bin/sh"; argv[1] = (char *)"-c"; 00908 sprintf(cmd, "ps --no-headers -A -o ppid,pid > %s && wc -l %s > %s", 00909 children_f, children_f, np_f); 00910 argv[2] = cmd; 00911 /* 00912 argv[2] = "ps --no-headers -eo ppid,pid > /tmp/apmon_children.txt && wc -l /tmp/out_children.txt > /tmp/out_np.txt"; 00913 */ 00914 argv[3] = 0; 00915 execv("/bin/sh", argv); 00916 exit(RET_ERROR); 00917 default: 00918 if (waitpid(cpid, &status, 0) == -1) { 00919 sprintf(msg, "[ getChildren() ] The number of sub-processes for %ld could not be determined", pid); 00920 unlink(children_f); unlink(np_f); 00921 throw runtime_error(msg); 00922 } 00923 } 00924 00925 /* find the number of processes */ 00926 pf = fopen(np_f, "rt"); 00927 if (pf == NULL) { 00928 unlink(np_f); unlink(children_f); 00929 sprintf(msg, "[ getChildren() ] The number of sub-processes for %ld could not be determined", 00930 pid); 00931 throw runtime_error(msg); 00932 } 00933 fscanf(pf, "%d", &nProcesses); 00934 fclose(pf); 00935 unlink(np_f); 00936 00937 pids = (long *)malloc(nProcesses * sizeof(long)); 00938 ppids = (long *)malloc(nProcesses * sizeof(long)); 00939 /* estimated maximum size for the returned vector; it will be realloc'ed */ 00940 children = (long *)malloc(nProcesses * sizeof(long)); 00941 00942 pf = fopen(children_f, 
"rt"); 00943 if (pf == NULL) { 00944 free(pids); free(ppids); free(children); 00945 unlink(children_f); 00946 sprintf(msg, "[ getChildren() ] The sub-processes for %ld could not be determined", pid); 00947 throw runtime_error(msg); 00948 } 00949 00950 /* scan the output of the ps command and find the children of the process, 00951 and also check if the process is still running */ 00952 children[0] = pid; nChildren = 1; 00953 processFound = false; 00954 for (i = 0; i < nProcesses; i++) { 00955 fscanf(pf, "%ld %ld", &ppids[i], &pids[i]); 00956 /* look for the given process */ 00957 if (pids[i] == children[0] || ppids[i] == children[0]) 00958 processFound = true; 00959 if (ppids[i] == children[0]) { 00960 children[nChildren++] = pids[i]; 00961 } 00962 } 00963 fclose(pf); 00964 unlink(children_f); 00965 00966 if (processFound == false) { 00967 free(pids); free(ppids); free(children); 00968 nChildren = 0; 00969 sprintf(msg, "[ getChildren() ] The process %ld does not exist", pid); 00970 throw runtime_error(msg); 00971 } 00972 00973 /* find the PIDs of all the descendant processes */ 00974 i = 1; 00975 while (i < nChildren) { 00976 /* find the children of the i-th child */ 00977 for (j = 0; j < nProcesses; j++) { 00978 if (ppids[j] == children[i]) { 00979 children[nChildren++] = pids[j]; 00980 } 00981 } 00982 i++; 00983 } 00984 00985 sprintf(msg, "Sub-processes for process %ld: ", pid); 00986 for (i = 0; i < nChildren; i++) { 00987 sprintf(sval, "%ld ", children[i]); 00988 if (strlen(msg) + strlen(sval) < MAX_STRING_LEN - 1) 00989 strcat(msg, sval); 00990 } 00991 logger(DEBUG, msg); 00992 00993 free(pids); free(ppids); 00994 children = (long *)realloc(children, (nChildren) * sizeof(long)); 00995 return children; 00996 #endif 00997 }
| long apmon_mon_utils::parsePSTime | ( | char * | s | ) |
Function that parses a time formatted like "days-hours:min:sec" and returns the corresponding number of seconds.
Definition at line 1182 of file monitor_utils.cpp.
01182 { 01183 long days, hours, mins, secs; 01184 01185 if (strchr(s, '-') != NULL) { 01186 sscanf(s, "%ld-%ld:%ld:%ld", &days, &hours, &mins, &secs); 01187 return 24 * 3600 * days + 3600 * hours + 60 * mins + secs; 01188 } else { 01189 if (strchr(s, ':') != NULL && strchr(s, ':') != strrchr(s, ':')) { 01190 sscanf(s, "%ld:%ld:%ld", &hours, &mins, &secs); 01191 return 3600 * hours + 60 * mins + secs; 01192 } else { 01193 if (strchr(s, ':') != NULL) { 01194 sscanf(s, "%ld:%ld", &mins, &secs); 01195 return 60 * mins + secs; 01196 } else { 01197 return RET_ERROR; 01198 } 01199 } 01200 } 01201 }
| void apmon_mon_utils::readJobDiskUsage | ( | MonitoredJob | job, | |
| JobDirInfo & | info | |||
| ) | throw (runtime_error) |
If there is a work directory defined, then compute the used space in that directory and the free disk space on the partition to which that directory belongs.
Sizes are given in MB.
Definition at line 1203 of file monitor_utils.cpp.
01204 { 01205 #ifndef WIN32 01206 int status; 01207 pid_t cpid; 01208 char *cmd, s_tmp[20], *argv[4], msg[100]; 01209 FILE *fp; 01210 long mypid = getpid(); 01211 char du_f[50], df_f[50]; 01212 01213 /* generate names for the temporary files which will hold the output of the 01214 du and df commands */ 01215 sprintf(du_f, "/tmp/apmon_du%ld", mypid); 01216 sprintf(df_f, "/tmp/apmon_df%ld", mypid); 01217 01218 if (strlen(job.workdir) == 0) { 01219 sprintf(msg, "[ readJobDiskUsage() ] The working directory for the job %ld was not specified, not monitoring disk usage", job.pid); 01220 throw runtime_error(msg); 01221 } 01222 01223 cmd = (char *)malloc((300 + 2 * strlen(job.workdir)) * sizeof(char)); 01224 strcpy(cmd, "PRT=`du -Lsk "); 01225 strcat(cmd, job.workdir); 01226 //strcat(cmd, " | tail -1 | cut -f 1 > "); 01227 strcat(cmd, " ` ; if [[ $? -eq 0 ]] ; then OUT=`echo $PRT | cut -f 1` ; echo $OUT ; exit 0 ; else exit -1 ; fi > "); 01228 strcat(cmd, du_f); 01229 01230 01231 switch (cpid = fork()) { 01232 case -1: 01233 sprintf(msg, "[ readJobDiskUsage() ] Unable to fork(). The disk usage information could not be determined for %ld", job.pid); 01234 throw runtime_error(msg); 01235 case 0: 01236 argv[0] = (char *)"/bin/sh"; argv[1] = (char *)"-c"; 01237 argv[2] = cmd; argv[3] = 0; 01238 execv("/bin/sh", argv); 01239 exit(RET_ERROR); 01240 default: 01241 if (waitpid(cpid, &status, 0) == -1) { 01242 free(cmd); 01243 sprintf(msg, "[ readJobDiskUsage() ] The disk usage (du) information for %ld could not be determined", job.pid); 01244 unlink(du_f); unlink(df_f); 01245 throw runtime_error(msg); 01246 } 01247 } 01248 01249 strcpy(cmd, "PRT=`df -m "); 01250 strcat(cmd, job.workdir); 01251 //strcat(cmd, " | tail -1 > "); 01252 strcat(cmd, " `; if [[ $? 
-eq 0 ]] ; then OUT=`echo $PRT | cut -d ' ' -f 8-` ; echo $OUT ; exit 0 ; else exit -1 ; fi > "); 01253 01254 strcat(cmd, df_f); 01255 //printf("### cmd: %s\n", cmd); 01256 01257 switch (cpid = fork()) { 01258 case -1: 01259 sprintf(msg, "[ readJobDiskUsage() ] Unable to fork(). The disk usage information could not be determined for %ld", job.pid); 01260 throw runtime_error(msg); 01261 case 0: 01262 argv[0] = (char *)"/bin/sh"; argv[1] = (char *)"-c"; 01263 argv[2] = cmd; argv[3] = 0; 01264 execv("/bin/sh", argv); 01265 exit(RET_ERROR); 01266 default: 01267 if (waitpid(cpid, &status, 0) == -1) { 01268 free(cmd); 01269 sprintf(msg, "[ readJobDiskUsage() ] The disk usage (df) information for %ld could not be determined", job.pid); 01270 unlink(du_f); unlink(df_f); 01271 throw runtime_error(msg); 01272 } 01273 } 01274 01275 free(cmd); 01276 fp = fopen(du_f, "rt"); 01277 if (fp == NULL) { 01278 sprintf(msg, "[ readJobDiskUsage() ] Error opening du output file for process %ld", job.pid); 01279 throw runtime_error(msg); 01280 } 01281 01282 fscanf(fp, "%lf", &(info.workdir_size)); 01283 /* keep the directory size in MB */ 01284 info.workdir_size /= 1024.0; 01285 fclose(fp); 01286 unlink(du_f); 01287 01288 fp = fopen(df_f, "rt"); 01289 if (fp == NULL) { 01290 sprintf(msg, "[ readJobDiskUsage() ] Error opening df output file for process %ld", job.pid); 01291 throw runtime_error(msg); 01292 } 01293 fscanf(fp, "%s %lf %lf %lf %lf", s_tmp, &(info.disk_total), 01294 &(info.disk_used), &(info.disk_free), &(info.disk_usage)); 01295 fclose(fp); 01296 unlink(df_f); 01297 #endif 01298 }
| void apmon_mon_utils::readJobInfo | ( | long | pid, | |
| PsInfo & | info | |||
| ) | throw (runtime_error) |
Obtains monitoring information for a given job and all its sub-jobs (descendant processes) with the aid of the ps command.
Definition at line 999 of file monitor_utils.cpp.
00999 { 01000 #ifndef WIN32 01001 long *children; 01002 FILE *fp; 01003 int i, nChildren, status, ch, ret, open_fd; 01004 char *cmd , *mem_cmd_s, *argv[4], *ret_s; 01005 char pid_s[10], msg[100]; 01006 char cmdName[MAX_STRING_LEN1], buf[MAX_STRING_LEN1], buf2[MAX_STRING_LEN1]; 01007 char etime_s[20], cputime_s[20]; 01008 double rsz, vsz; 01009 double etime, cputime; 01010 double pcpu, pmem; 01011 /* this list contains strings of the form "rsz_vsz_command" for every pid; 01012 it is used to avoid adding several times processes that have multiple 01013 threads and appear in ps as sepparate processes, occupying exactly the 01014 same amount of memory and having the same command name. For every line 01015 from the output of the ps command we verify if the rsz_vsz_command 01016 combination is already in the list. 01017 */ 01018 char **mem_cmd_list; 01019 int listSize; 01020 long cpid, crt_pid; 01021 //unsigned int maxCmdLen = 5 * MAX_STRING_LEN; 01022 long mypid = getpid(); 01023 char ps_f[50]; 01024 01025 /* get the list of the process' descendants */ 01026 children = getChildren(pid, nChildren); 01027 01028 /* generate a name for the temporary file which holds the output of the 01029 ps command */ 01030 sprintf(ps_f, "/tmp/apmon_ps%ld", mypid); 01031 01032 unsigned int cmdLen = (150 + 6 * nChildren) * sizeof(char); 01033 cmd = (char *)malloc (cmdLen); 01034 01035 /* issue the "ps" command to obtain information on all the descendants */ 01036 strcpy(cmd, "ps --no-headers --pid "); 01037 for (i = 0; i < nChildren - 1; i++) { 01038 sprintf(pid_s, "%ld,", children[i]); 01039 if (strlen(cmd) + strlen(pid_s) + 1 >= cmdLen) { 01040 free(cmd); 01041 sprintf(msg, "[ readJobInfo() ] Job %ld has too many sub-processes to be monitored", 01042 pid); 01043 throw runtime_error(msg); 01044 } 01045 strcat(cmd, pid_s); 01046 //strcat(cmd, " 2>&1"); 01047 } 01048 01049 /* the last part of the command */ 01050 sprintf(pid_s, "%ld", children[nChildren - 1]); 01051 sprintf(cmdName, " -o 
pid,etime,time,%%cpu,%%mem,rsz,vsz,comm > %s", ps_f); 01052 if (strlen(cmd) + strlen(pid_s) + strlen(cmdName) >= cmdLen) { 01053 free(cmd); 01054 sprintf(msg, "[ readJobInfo() ] Job %ld has too many sub-processes to be monitored", 01055 pid); 01056 throw runtime_error(msg); 01057 } 01058 strcat(cmd, pid_s); 01059 strcat(cmd, cmdName); 01060 //strcat(cmd, " 2>&1"); 01061 01062 switch (cpid = fork()) { 01063 case -1: 01064 free(cmd); 01065 sprintf(msg, "[ readJobInfo() ] Unable to fork(). The job information could not be determined for %ld", pid); 01066 throw runtime_error(msg); 01067 case 0: 01068 argv[0] = (char *)"/bin/sh"; argv[1] = (char *)"-c"; 01069 argv[2] = cmd; argv[3] = 0; 01070 execv("/bin/sh", argv); 01071 exit(RET_ERROR); 01072 default: 01073 if (waitpid(cpid, &status, 0) == -1) { 01074 free(cmd); 01075 sprintf(msg, "[ readJobInfo() ] The job information for %ld could not be determined", pid); 01076 throw runtime_error(msg); 01077 } 01078 } 01079 01080 free(cmd); 01081 fp = fopen(ps_f, "rt"); 01082 if (fp == NULL) { 01083 sprintf(msg, "[ readJobInfo() ] Error opening the ps output file for process %ld", pid); 01084 throw runtime_error(msg); 01085 } 01086 01087 /* parse the output file */ 01088 info.etime = info.cputime = 0; 01089 info.pcpu = info.pmem = 0; 01090 info.rsz = info.vsz = 0; 01091 info.open_fd = 0; 01092 mem_cmd_list = (char **)malloc(nChildren * sizeof(char *)); 01093 listSize = 0; 01094 cmdName[0] = 0; 01095 while (1) { 01096 ret_s = fgets(buf, MAX_STRING_LEN, fp); 01097 if (ret_s == NULL) 01098 break; 01099 buf[MAX_STRING_LEN - 1] = 0; 01100 01101 /* if the line was too long and fgets hasn't read it entirely, */ 01102 /* keep only the first 512 chars from the line */ 01103 ch = fgetc(fp); // see if we are at the end of the file 01104 ungetc(ch, fp); 01105 if (buf[strlen(buf) - 1] != 10 && ch != EOF) { 01106 while (1) { 01107 char *sret = fgets(buf2, MAX_STRING_LEN, fp); 01108 if (sret == NULL || buf[strlen(buf) - 1] == 10) 01109 break; 
01110 } 01111 } 01112 01113 ret = sscanf(buf, "%ld %s %s %lf %lf %lf %lf %s", &crt_pid, etime_s, 01114 cputime_s, &pcpu, &pmem, &rsz, &vsz, cmdName); 01115 if (ret != 8) { 01116 fclose(fp); 01117 unlink(ps_f); 01118 free(children); 01119 for (i = 0; i < listSize; i++) { 01120 free(mem_cmd_list[i]); 01121 } 01122 free(mem_cmd_list); 01123 throw runtime_error("[ readJobInfo() ] Error parsing the output of the ps command"); 01124 } 01125 01126 /* etime is the maximum of the elapsed times for the subprocesses */ 01127 etime = parsePSTime(etime_s); 01128 info.etime = (info.etime > etime) ? info.etime : etime; 01129 01130 /* cputime is the sum of the cpu times for the subprocesses */ 01131 cputime = parsePSTime(cputime_s); 01132 info.cputime += cputime; 01133 info.pcpu += pcpu; 01134 01135 /* get the number of opened file descriptors */ 01136 try { 01137 open_fd = ProcUtils::countOpenFiles(crt_pid); 01138 } catch (procutils_error& err) { 01139 logger(WARNING, err.what()); 01140 /* don't throw an exception if we couldn't read the number of files */ 01141 open_fd = PROCUTILS_ERROR; 01142 } 01143 01144 /* see if this is a process or just a thread */ 01145 mem_cmd_s = (char *)malloc(MAX_STRING_LEN * sizeof(char)); 01146 sprintf(mem_cmd_s, "%f_%f_%s", rsz, vsz, cmdName); 01147 //printf("### mem_cmd_s: %s\n", mem_cmd_s); 01148 if (getVectIndex(mem_cmd_s, mem_cmd_list, listSize) == -1) { 01149 /* aonther pid with the same command name, rsz and vsz was not found, 01150 so this is a new process and we can add the amount of memory used by 01151 it */ 01152 info.pmem += pmem; 01153 info.vsz += vsz; info.rsz += rsz; 01154 01155 if (info.open_fd >= 0) // if no error occured so far 01156 info.open_fd += open_fd; 01157 /* add an entry in the list so that next time we see another thread of 01158 this process we don't add the amount of memory again */ 01159 mem_cmd_list[listSize++] = mem_cmd_s; 01160 } else { 01161 free(mem_cmd_s); 01162 } 01163 01164 /* if we monitor the current 
process, we have two extra opened files 01165 that we shouldn't take into account (the output file for ps and 01166 /proc/<pid>/fd/) 01167 */ 01168 if (crt_pid == getpid()) 01169 info.open_fd -= 2; 01170 } 01171 01172 fclose(fp); 01173 unlink(ps_f); 01174 free(children); 01175 for (i = 0; i < listSize; i++) { 01176 free(mem_cmd_list[i]); 01177 } 01178 free(mem_cmd_list); 01179 #endif 01180 }