![]() |
|
|
Generated: 18 Jul 2008 |
00001 00008 #ifdef _WIN32 00009 // FIXME: (MCl) the following warning tells that the usage of throw 00010 // declaration may cause trouble with VC > 7.1 00011 00012 // Disable warning C4290: C++ exception specification ignored except to indicate a function is not __declspec(nothrow) 00013 #pragma warning ( disable : 4290 ) 00014 #endif 00015 00016 /* 00017 * ApMon - Application Monitoring Tool 00018 * Version: 2.2.0 00019 * 00020 * Copyright (C) 2006 California Institute of Technology 00021 * 00022 * Permission is hereby granted, free of charge, to use, copy and modify 00023 * this software and its documentation (the "Software") for any 00024 * purpose, provided that existing copyright notices are retained in 00025 * all copies and that this notice is included verbatim in any distributions 00026 * or substantial portions of the Software. 00027 * This software is a part of the MonALISA framework (http://monalisa.cacr.caltech.edu). 00028 * Users of the Software are asked to feed back problems, benefits, 00029 * and/or suggestions about the software to the MonALISA Development Team 00030 * (developers@monalisa.cern.ch). Support for this software - fixing of bugs, 00031 * incorporation of new features - is done on a best effort basis. All bug 00032 * fixes and enhancements will be made available under the same terms and 00033 * conditions as the original software, 00034 00035 * IN NO EVENT SHALL THE AUTHORS OR DISTRIBUTORS BE LIABLE TO ANY PARTY FOR 00036 * DIRECT, INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES ARISING OUT 00037 * OF THE USE OF THIS SOFTWARE, ITS DOCUMENTATION, OR ANY DERIVATIVES THEREOF, 00038 * EVEN IF THE AUTHORS HAVE BEEN ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 00039 00040 * THE AUTHORS AND DISTRIBUTORS SPECIFICALLY DISCLAIM ANY WARRANTIES, 00041 * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY, 00042 * FITNESS FOR A PARTICULAR PURPOSE, AND NON-INFRINGEMENT. THIS SOFTWARE IS 00043 * PROVIDED ON AN "AS IS" BASIS, AND THE AUTHORS AND DISTRIBUTORS HAVE NO 00044 * OBLIGATION TO PROVIDE MAINTENANCE, SUPPORT, UPDATES, ENHANCEMENTS, OR 00045 * MODIFICATIONS. 00046 */ 00047 00048 #ifndef ApMon_h 00049 #define ApMon_h 00050 00051 #include <sys/types.h> 00052 #include <sys/stat.h> 00053 #include <errno.h> 00054 #include <stdio.h> 00055 #include <stdlib.h> 00056 #include <string.h> 00057 #include <stdexcept> 00058 #include <ctype.h> 00059 #include <time.h> 00060 00061 #ifdef WIN32 00062 #define NOMSG 00063 #define NOGDI 00064 #include <Winsock2.h> 00065 #include <string.h> 00066 #include <process.h> 00067 #include "xdr.h" 00068 #undef NOMSG 00069 #undef NOGDI 00070 #else 00071 #include <sys/socket.h> 00072 #include <sys/ioctl.h> 00073 #include <sys/utsname.h> 00074 #include <sys/time.h> 00075 #include <arpa/inet.h> 00076 #include <netinet/in.h> 00077 #include <net/if.h> 00078 #include <rpc/rpc.h> 00079 #include <netdb.h> 00080 #include <unistd.h> 00081 #include <pthread.h> 00082 #include <pwd.h> 00083 #include <grp.h> 00084 00085 #ifdef __APPLE__ 00086 #include <sys/param.h> 00087 #else 00088 #include <linux/param.h> 00089 #endif 00090 00091 #endif // ~WIN32 00092 00093 using namespace std; 00094 00095 #define XDR_STRING 0 00096 //#define XDR_INT16 1 // NOT SUPPORTED YET! 00097 #define XDR_INT32 2 00098 //#define XDR_INT64 3 // THE SAME! 00099 #define XDR_REAL32 4 00100 #define XDR_REAL64 5 00102 #define MAX_DGRAM_SIZE 8192 00103 #define MAX_STRING_LEN 512 00104 #define MAX_STRING_LEN1 (MAX_STRING_LEN + 1) 00105 #define RET_SUCCESS 0 00106 #define RET_ERROR -1 00107 #define PROCUTILS_ERROR -2 00108 #define RET_NOT_SENT -3 00111 #define MAX_N_DESTINATIONS 30 00114 #define DEFAULT_PORT 8884 00115 #define MAX_HEADER_LENGTH 40 00118 #define FILE_INIT 1 00119 00120 #ifdef __APPLE__ 00121 #define OLIST_INIT 2 00122 #else 00123 #define LIST_INIT 2 00124 #endif 00125 00126 #define DIRECT_INIT 3 00127 00128 #define JOB_MONITOR_INTERVAL 20 00129 00130 #define SYS_MONITOR_INTERVAL 20 00131 00133 #define RECHECK_INTERVAL 600 00134 00138 #define GEN_MONITOR_INTERVALS 10 00139 00140 #define MAX_MONITORED_JOBS 30 00141 00142 #define MAX_SYS_PARAMS 30 00143 00144 #define MAX_GEN_PARAMS 30 00145 00146 #define MAX_JOB_PARAMS 30 00147 00149 #define MAX_MSG_RATE 20 00150 00151 #define NLETTERS 26 00152 00153 #define TWO_BILLION 2000000000 00154 00155 #define APMON_VERSION "2.2.1" 00156 00160 typedef struct ConfURLs { 00162 int nConfURLs; 00164 char *vURLs[MAX_N_DESTINATIONS]; 00167 char *lastModifURLs[MAX_N_DESTINATIONS]; 00168 } ConfURLs; 00169 00173 typedef struct MonitoredJob { 00174 long pid; 00175 /* the job's working dierctory */ 00176 char workdir[MAX_STRING_LEN]; 00177 /* the cluster name that will be included in the monitoring datagrams */ 00178 char clusterName[50]; 00179 /* the node name that will be included in the monitoring datagrams */ 00180 char nodeName[50]; 00181 } MonitoredJob; 00182 00183 #ifdef WIN32 00184 #define pthread_mutex_lock(mutex_ref) (WaitForSingleObject(*mutex_ref, INFINITE)) 00185 #define pthread_mutex_unlock(mutex_ref) (ReleaseMutex(*mutex_ref)) 00186 #define pthread_mutex_destroy(mutex_ref) (CloseHandle(*mutex_ref)) 00187 #define pthread_cond_signal(event_ref) (SetEvent(*event_ref)) 00188 #define pthread_cond_destroy(event_ref) (CloseHandle(*event_ref)) 00189 #define ETIMEDOUT WAIT_TIMEOUT 00190 #endif 00191 00212 class ApMon { 00213 protected: 00214 char *clusterName; 00215 char *nodeName; 00218 char *sysMonCluster; 00220 char *sysMonNode; 00221 00222 int nDestinations; 00223 char **destAddresses; 00224 int *destPorts; 00225 char **destPasswds; 00227 char *buf; 00228 int dgramSize; 00229 #ifndef WIN32 00230 int sockfd; 00231 #else 00232 SOCKET sockfd; 00233 #endif 00234 00237 bool confCheck; 00238 00240 int nInitSources; 00242 char **initSources; 00243 /* The initialization type (from file / list / directly). */ 00244 int initType; 00245 00249 long recheckInterval; 00250 00255 long crtRecheckInterval; 00256 00257 #ifndef WIN32 00258 00260 pthread_t bkThread; 00261 00263 pthread_mutex_t mutex; 00264 00266 pthread_mutex_t mutexBack; 00267 00269 pthread_mutex_t mutexCond; 00270 00272 pthread_cond_t confChangedCond; 00273 #else 00274 public: 00275 HANDLE bkThread; 00276 HANDLE mutex; 00277 HANDLE mutexBack; 00278 HANDLE mutexCond; 00279 HANDLE confChangedCond; 00280 protected: 00281 #endif 00282 00283 bool recheckChanged, jobMonChanged,sysMonChanged; 00284 00287 bool haveBkThread; 00288 00290 bool bkThreadStarted; 00291 00293 bool stopBkThread; 00294 00299 bool autoDisableMonitoring; 00300 00304 bool sysMonitoring; 00305 00309 bool jobMonitoring; 00310 00314 bool genMonitoring; 00315 00319 long jobMonitorInterval, sysMonitorInterval; 00320 00324 int genMonitorIntervals; 00325 00329 int nSysMonitorParams, nJobMonitorParams, nGenMonitorParams; 00330 00331 /* The names of the parameters that can be enabled/disabled by the user in 00332 * the system/job/general monitoring datagrams. 00333 */ 00334 char *sysMonitorParams[MAX_SYS_PARAMS]; 00335 char *genMonitorParams[MAX_GEN_PARAMS]; 00336 char *jobMonitorParams[MAX_JOB_PARAMS]; 00337 00338 /* Arrays of flags that specifiy the active monitoring parameters (the 00339 * ones that are sent in the datagams). 00340 */ 00341 int actSysMonitorParams[MAX_SYS_PARAMS]; 00342 int actGenMonitorParams[MAX_GEN_PARAMS]; 00343 int actJobMonitorParams[MAX_JOB_PARAMS]; 00344 00345 ConfURLs confURLs; 00346 00348 int nMonJobs; 00349 00351 MonitoredJob *monJobs; 00352 00354 long lastModifFile; 00355 00356 /* The moment when the last datagram with job monitoring information 00357 * was sent. 00358 */ 00359 time_t lastJobInfoSend; 00360 00362 char username[MAX_STRING_LEN]; 00364 char groupname[MAX_STRING_LEN]; 00366 char myHostname[MAX_STRING_LEN]; 00368 char myIP[MAX_STRING_LEN]; 00370 int numIPs; 00372 char allMyIPs[20][20]; 00374 int numCPUs; 00375 00376 bool sysInfo_first; 00378 time_t lastSysInfoSend; 00379 /* The last recorded values for system parameters. */ 00380 double lastSysVals[MAX_SYS_PARAMS]; 00381 /* The current values for the system parameters */ 00382 double currentSysVals[MAX_SYS_PARAMS]; 00383 /* The success/error codes returned by the functions that calculate 00384 the system parameters */ 00385 int sysRetResults[MAX_SYS_PARAMS]; 00386 00387 /* The current values for the job parameters */ 00388 double currentJobVals[MAX_JOB_PARAMS]; 00389 /* The success/error codes retuprorned by the functions that calculate 00390 the job parameters */ 00391 int jobRetResults[MAX_JOB_PARAMS]; 00392 00393 /* The current values for the general parameters */ 00394 double currentGenVals[MAX_GEN_PARAMS]; 00395 /* The success/error codes returned by the functions that calculate 00396 the general parameters */ 00397 int genRetResults[MAX_GEN_PARAMS]; 00398 00399 /* Table which stores the number of processes in each state 00400 (R -runnable, S - sleeping etc.) Each entry in the table 00401 corresponds to a capital letter. */ 00402 double currentProcessStates[NLETTERS]; 00403 00404 /* CPU information: */ 00405 char cpuVendor[100]; 00406 char cpuFamily[100]; 00407 char cpuModel[100]; 00408 char cpuModelName[200]; 00409 00411 char interfaceNames[20][20]; 00413 int nInterfaces; 00416 double lastBytesSent[20]; 00417 double lastBytesReceived[20]; 00420 double lastNetErrs[20]; 00422 double *currentNetIn, *currentNetOut, *currentNetErrs; 00423 00425 double currentNSockets[4]; 00428 double currentSocketsTCP[20]; 00431 char *socketStatesMapTCP[20]; 00432 00433 /* don't allow a user to send more than MAX_MSG messages per second, in average */ 00434 int maxMsgRate; 00435 long prvTime; 00436 double prvSent; 00437 double prvDrop; 00438 long crtTime; 00439 long crtSent; 00440 long crtDrop; 00441 double hWeight; 00442 00444 int instance_id; 00448 int seq_nr; 00449 00450 public: 00457 ApMon(char *initsource) throw(runtime_error); 00458 00459 00465 ApMon(int nDestinations, char **destinationsList) throw(runtime_error); 00466 00478 ApMon(int nDestinations, char **destAddresses, int *destPorts, char **destPasswds) throw(runtime_error); 00479 00483 ~ApMon(); 00484 00500 int sendParameter(char *clusterName, char *nodeName, 00501 char *paramName, int valueType, char *paramValue) 00502 throw(runtime_error); 00503 00521 int sendTimedParameter(char *clusterName, char *nodeName, 00522 char *paramName, int valueType, char *paramValue, int timestamp) 00523 throw(runtime_error); 00524 00537 int sendParameter(char *clusterName, char *nodeName, 00538 char *paramName, int paramValue) 00539 throw(runtime_error); 00540 00553 int sendParameter(char *clusterName, char *nodeName, 00554 char *paramName, float paramValue) 00555 throw(runtime_error); 00556 00569 int sendParameter(char *clusterName, char *nodeName, 00570 char *paramName, double paramValue) 00571 throw(runtime_error); 00572 00585 int sendParameter(char *clusterName, char *nodeName, 00586 char *paramName, char *paramValue) 00587 throw(runtime_error); 00588 00589 00602 int sendParameters(char *clusterName, char *nodeName, 00603 int nParams, char **paramNames, int *valueTypes, 00604 char **paramValues) throw(runtime_error); 00605 00622 int sendTimedParameters(char *clusterName, char *nodeName, 00623 int nParams, char **paramNames, int *valueTypes, 00624 char **paramValues, int timestamp) throw(runtime_error); 00625 00631 bool getConfCheck() { return confCheck; } 00632 00639 long getRecheckInterval() { return recheckInterval; } 00640 00641 00649 void setRecheckInterval(long val); 00650 00657 void setConfRecheck(bool confRecheck, long interval); 00658 00663 void setConfRecheck(bool confRecheck) { 00664 setConfRecheck(confRecheck, RECHECK_INTERVAL); 00665 } 00666 00673 void setJobMonitoring(bool jobMonitoring, long interval); 00674 00678 void setJobMonitoring(bool jobMonitoring) { 00679 setJobMonitoring(jobMonitoring, JOB_MONITOR_INTERVAL); 00680 } 00681 00685 long getJobMonitorInterval() { 00686 long i = -1; 00687 pthread_mutex_lock(&mutexBack); 00688 if (jobMonitoring) 00689 i = jobMonitorInterval; 00690 pthread_mutex_unlock(&mutexBack); 00691 return i; 00692 } 00693 00695 bool getJobMonitoring() { 00696 bool b; 00697 pthread_mutex_lock(&mutexBack); 00698 b = jobMonitoring; 00699 pthread_mutex_unlock(&mutexBack); 00700 return b; 00701 } 00702 00709 void setSysMonitoring(bool sysMonitoring, long interval); 00710 00714 void setSysMonitoring(bool sysMonitoring) { 00715 setSysMonitoring(sysMonitoring, SYS_MONITOR_INTERVAL); 00716 } 00717 00721 long getSysMonitorInterval() { 00722 long i = -1; 00723 pthread_mutex_lock(&mutexBack); 00724 if (sysMonitoring) 00725 i = sysMonitorInterval; 00726 pthread_mutex_unlock(&mutexBack); 00727 return i; 00728 } 00729 00731 bool getSysMonitoring() { 00732 bool b; 00733 pthread_mutex_lock(&mutexBack); 00734 b = sysMonitoring; 00735 pthread_mutex_unlock(&mutexBack); 00736 return b; 00737 } 00738 00746 void setGenMonitoring(bool genMonitoring, int nIntervals); 00747 00752 void setGenMonitoring(bool genMonitoring) { 00753 setGenMonitoring(genMonitoring, GEN_MONITOR_INTERVALS); 00754 } 00755 00759 bool getGenMonitoring() { 00760 bool b; 00761 pthread_mutex_lock(&mutexBack); 00762 b = genMonitoring; 00763 pthread_mutex_unlock(&mutexBack); 00764 return b; 00765 } 00766 00777 void addJobToMonitor(long pid, char *workdir, char *clusterName, 00778 char *nodeName) throw(runtime_error); 00779 00784 void removeJobToMonitor(long pid) throw(runtime_error); 00785 00788 void setSysMonClusterNode(char *clusterName, char *nodeName); 00789 00793 static void setLogLevel(char *newLevel_s); 00794 00799 void setMaxMsgRate(int maxRate); 00800 00805 static void errExit(char *msg); 00806 00807 00808 protected: 00809 00819 void initialize(char *filename, bool firstTime) throw(runtime_error); 00820 00822 void constructFromList(int nDestinations, char **destinationsList) 00823 throw(runtime_error); 00824 00833 void initialize(int nDestinations, char **destList, bool firstTime) throw(runtime_error); 00834 00835 00847 void loadFile(char *filename, int *nDestinations, char **destAddresses, 00848 int *destPorts, char **destPasswds) throw(runtime_error); 00849 00850 00862 void arrayInit(int nDestinations, char **destAddresses, int *destPorts, 00863 char **destPasswds) 00864 throw(runtime_error); 00865 00879 void arrayInit(int nDestinations, char **destAddresses, int *destPorts, 00880 char **destPasswds, bool firstTime) 00881 throw(runtime_error); 00882 00893 void addToDestinations(char *line, int *nDestinations, 00894 char *destAddresses[], int destPorts[], char *destPasswds[]); 00895 00900 void getDestFromWeb(char *url, int *nDestinations, char *destAddresses[], 00901 int destPorts[], char *destPasswds[], 00902 ConfURLs& confURLs) throw(runtime_error); 00903 00904 00909 void encodeParams(int nParams, char **paramNames, int *valueTypes, 00910 char **paramValues, int timestamp) throw(runtime_error); 00911 00915 void initMonitoring(); 00916 00920 void sendJobInfo(); 00921 00925 void sendOneJobInfo(MonitoredJob job); 00926 00928 void updateJobInfo(MonitoredJob job); 00929 00932 void sendSysInfo(); 00933 00936 void updateSysInfo(); 00937 00940 void sendGeneralInfo(); 00941 00943 void updateGeneralInfo(); 00944 00950 void setBackgroundThread(bool val); 00951 00956 long getCrtRecheckInterval() { 00957 return crtRecheckInterval; 00958 } 00959 00960 void setCrtRecheckInterval(long val); 00961 00962 00966 void freeConf(); 00967 00974 #ifndef WIN32 00975 friend void *bkTask(void *param); 00976 #else 00977 friend DWORD WINAPI bkTask(void *param); 00978 #endif 00979 00983 void parseXApMonLine(char *line); 00984 00986 void initSocket() throw(runtime_error); 00987 00991 void parseConf(FILE *fp, int *nDestinations, char **destAddresses, 00992 int *destPorts, char **destPasswds) 00993 throw(runtime_error); 00994 01000 bool shouldSend(); 01001 01002 friend class ProcUtils; 01003 }; 01004 01010 #ifndef WIN32 01011 void *bkTask(void *param); // throw(runtime_error); 01012 #else 01013 DWORD WINAPI bkTask(void *param); 01014 #endif 01015 #endif 01016 01017 01018 01019 01020 01021 01022 01023