Gaudi Framework, version v20r2

Generated: 18 Jul 2008

ApMon.h

Go to the documentation of this file.
00001 
00008 #ifdef _WIN32
00009   // FIXME: (MCl) the following warning tells that the usage of throw
00010   // declaration may cause trouble with VC > 7.1
00011 
00012   // Disable warning C4290: C++ exception specification ignored except to indicate a function is not __declspec(nothrow)
00013   #pragma warning ( disable : 4290 )
00014 #endif
00015 
00016 /*
00017  * ApMon - Application Monitoring Tool
00018  * Version: 2.2.0
00019  *
00020  * Copyright (C) 2006 California Institute of Technology
00021  *
00022  * Permission is hereby granted, free of charge, to use, copy and modify 
00023  * this software and its documentation (the "Software") for any
00024  * purpose, provided that existing copyright notices are retained in 
00025  * all copies and that this notice is included verbatim in any distributions
00026  * or substantial portions of the Software. 
00027  * This software is a part of the MonALISA framework (http://monalisa.cacr.caltech.edu).
00028  * Users of the Software are asked to feed back problems, benefits,
00029  * and/or suggestions about the software to the MonALISA Development Team
00030  * (developers@monalisa.cern.ch). Support for this software - fixing of bugs,
00031  * incorporation of new features - is done on a best effort basis. All bug
00032  * fixes and enhancements will be made available under the same terms and
00033  * conditions as the original software,
00034 
00035  * IN NO EVENT SHALL THE AUTHORS OR DISTRIBUTORS BE LIABLE TO ANY PARTY FOR
00036  * DIRECT, INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES ARISING OUT
00037  * OF THE USE OF THIS SOFTWARE, ITS DOCUMENTATION, OR ANY DERIVATIVES THEREOF,
00038  * EVEN IF THE AUTHORS HAVE BEEN ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
00039 
00040  * THE AUTHORS AND DISTRIBUTORS SPECIFICALLY DISCLAIM ANY WARRANTIES,
00041  * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY,
00042  * FITNESS FOR A PARTICULAR PURPOSE, AND NON-INFRINGEMENT. THIS SOFTWARE IS
00043  * PROVIDED ON AN "AS IS" BASIS, AND THE AUTHORS AND DISTRIBUTORS HAVE NO
00044  * OBLIGATION TO PROVIDE MAINTENANCE, SUPPORT, UPDATES, ENHANCEMENTS, OR
00045  * MODIFICATIONS.
00046  */
00047 
00048 #ifndef ApMon_h
00049 #define ApMon_h
00050 
00051 #include <sys/types.h>
00052 #include <sys/stat.h>
00053 #include <errno.h>
00054 #include <stdio.h>
00055 #include <stdlib.h>
00056 #include <string.h>
00057 #include <stdexcept>
00058 #include <ctype.h>
00059 #include <time.h>
00060 
00061 #ifdef WIN32
00062 #define NOMSG
00063 #define NOGDI
00064 #include <Winsock2.h>
00065 #include <string.h>
00066 #include <process.h>
00067 #include "xdr.h"
00068 #undef NOMSG
00069 #undef NOGDI
00070 #else
00071 #include <sys/socket.h>
00072 #include <sys/ioctl.h>
00073 #include <sys/utsname.h>
00074 #include <sys/time.h>
00075 #include <arpa/inet.h>
00076 #include <netinet/in.h>
00077 #include <net/if.h>
00078 #include <rpc/rpc.h>
00079 #include <netdb.h>
00080 #include <unistd.h>
00081 #include <pthread.h>
00082 #include <pwd.h>
00083 #include <grp.h>
00084 
00085 #ifdef __APPLE__
00086 #include <sys/param.h>
00087 #else
00088 #include <linux/param.h>
00089 #endif
00090 
00091 #endif  // ~WIN32
00092 
00093 using namespace std;
00094 
00095 #define XDR_STRING  0  /* value-type code: string */
      /* The XDR_* constants are integer codes that name a value type.
       * Presumably they are what callers pass as the valueType / valueTypes
       * arguments of the ApMon send methods - TODO confirm in the implementation.
       * Codes 1 and 3 are unused: their definitions are commented out below. */
00096 //#define XDR_INT16   1   // NOT SUPPORTED YET!
00097 #define XDR_INT32   2  
00098 //#define XDR_INT64   3    // THE SAME!
00099 #define XDR_REAL32  4  
00100 #define XDR_REAL64  5  
00102 #define MAX_DGRAM_SIZE   8192  /* presumably the largest datagram, in bytes - TODO confirm */
00103 #define MAX_STRING_LEN 512   /* size of the fixed char buffers declared in this header (workdir, username, ...) */
00104 #define MAX_STRING_LEN1 (MAX_STRING_LEN + 1)
      /* Integer status codes. NOTE(review): which functions return which code
       * is not visible in this header - confirm in the implementation. */
00105 #define RET_SUCCESS  0  
00106 #define RET_ERROR   -1  
00107 #define PROCUTILS_ERROR -2
00108 #define RET_NOT_SENT -3 
00111 #define MAX_N_DESTINATIONS 30  /* capacity of the ConfURLs arrays */
00114 #define DEFAULT_PORT 8884 
00115 #define MAX_HEADER_LENGTH 40  
      /* Codes for the initialization type (from file / list / directly), as
       * stored in ApMon::initType. */
00118 #define FILE_INIT  1
00119 
      /* On __APPLE__ the list code is named OLIST_INIT instead of LIST_INIT.
       * NOTE(review): presumably this avoids a clash with a system macro named
       * LIST_INIT - confirm. Code using the constant must pick the right name. */
00120 #ifdef __APPLE__
00121 #define OLIST_INIT  2
00122 #else
00123 #define LIST_INIT  2
00124 #endif
00125 
00126 #define DIRECT_INIT  3
00127 
      /* Default interval passed by the one-argument ApMon::setJobMonitoring(). */
00128 #define JOB_MONITOR_INTERVAL 20
00129 
      /* Default interval passed by the one-argument ApMon::setSysMonitoring(). */
00130 #define SYS_MONITOR_INTERVAL 20
00131 
      /* Default interval passed by the one-argument ApMon::setConfRecheck(). */
00133 #define RECHECK_INTERVAL 600
00134 
      /* Default nIntervals passed by the one-argument ApMon::setGenMonitoring(). */
00138 #define GEN_MONITOR_INTERVALS 10
00139 
00140 #define MAX_MONITORED_JOBS 30
00141 
      /* Capacities of the per-category parameter arrays in class ApMon
       * (names, active flags, current values, result codes). */
00142 #define MAX_SYS_PARAMS 30
00143 
00144 #define MAX_GEN_PARAMS 30
00145 
00146 #define MAX_JOB_PARAMS 30
00147 
00149 #define MAX_MSG_RATE 20
00150 
      /* Size of ApMon::currentProcessStates: one slot per capital letter. */
00151 #define NLETTERS 26
00152 
00153 #define TWO_BILLION 2000000000
00154 
      /* NOTE(review): the banner comment at the top of this file says
       * "Version: 2.2.0" while this string says 2.2.1 - confirm which is current. */
00155 #define APMON_VERSION "2.2.1"
00156 
00160 typedef struct ConfURLs {
        /* Fixed-capacity table of configuration URLs. ApMon keeps one instance
         * as its confURLs member and passes it by reference to getDestFromWeb().
         * Both arrays have MAX_N_DESTINATIONS slots. */
        /* presumably the number of slots currently in use - TODO confirm */
00162   int nConfURLs; 
        /* the URL strings; ownership of the pointed-to memory is not visible here */
00164   char *vURLs[MAX_N_DESTINATIONS];
        /* presumably a last-modified marker for the URL at the same index - TODO confirm */
00167   char *lastModifURLs[MAX_N_DESTINATIONS];
00168 } ConfURLs;
00169 
00173 typedef struct MonitoredJob {
        /* Describes one monitored job. All strings are stored inline in
         * fixed-size buffers, so the struct owns no heap memory and can be
         * copied by value. */
        /* the job's process id */
00174   long pid;
00175   /* the job's working directory (buffer of MAX_STRING_LEN bytes) */
00176   char workdir[MAX_STRING_LEN];
00177   /* the cluster name that will be included in the monitoring datagrams */
00178   char clusterName[50]; 
00179   /* the node name that will be included in the monitoring datagrams */
00180   char nodeName[50];
00181 } MonitoredJob;
00182 
00183 #ifdef WIN32
      /* Win32 stand-ins for the pthread calls used by ApMon. Each macro takes a
       * pointer to a HANDLE (e.g. &mutexBack) and forwards the dereferenced
       * handle to the matching Win32 API.
       *
       * The argument is parenthesized before it is dereferenced, so an
       * expression argument such as (cond ? &a : &b) is dereferenced as a whole
       * instead of being mis-parsed as (*cond) ? &a : &b.
       *
       * The value of each macro is the return value of the Win32 call, NOT a
       * pthread error code. For example ReleaseMutex() returns non-zero on
       * success, whereas pthread_mutex_unlock() returns 0 on success. */
00184 #define pthread_mutex_lock(mutex_ref) (WaitForSingleObject(*(mutex_ref), INFINITE))
00185 #define pthread_mutex_unlock(mutex_ref) (ReleaseMutex(*(mutex_ref)))
00186 #define pthread_mutex_destroy(mutex_ref) (CloseHandle(*(mutex_ref)))
00187 #define pthread_cond_signal(event_ref) (SetEvent(*(event_ref)))
00188 #define pthread_cond_destroy(event_ref) (CloseHandle(*(event_ref)))
      /* Lets code that compares a wait result against ETIMEDOUT compile on Windows. */
00189 #define ETIMEDOUT WAIT_TIMEOUT
00190 #endif
00191 
00212 class ApMon {
        /* ApMon (Application Monitoring) holds the destination list, the socket
         * and datagram buffer, the background-thread handles, and the state for
         * system / job / general monitoring. It declares the sendParameter*()
         * family used to publish values. Only declarations and a few inline
         * accessors are in this header; the other members are defined elsewhere.
         *
         * NOTE(review): many members are declared with throw(runtime_error).
         * Dynamic exception specifications were deprecated in C++11 and removed
         * in C++17, so this header needs a pre-C++17 language mode as written. */
00213  protected:
        /* Cluster / node names held by the instance. The send methods also take
         * clusterName / nodeName per call. */
00214   char *clusterName; 
00215   char *nodeName; 
        /* Cluster / node names for system monitoring; presumably set through
         * setSysMonClusterNode() - TODO confirm. */
00218   char *sysMonCluster; 
00220   char *sysMonNode;
00221 
        /* Destination list: a count plus three arrays (address, port, password).
         * Presumably parallel arrays indexed by destination - TODO confirm. */
00222   int nDestinations; 
00223   char **destAddresses; 
00224   int *destPorts; 
00225   char **destPasswds; 
        /* Datagram buffer and a size; whether dgramSize is the used length or
         * the capacity is not visible in this header. */
00227   char *buf; 
00228   int dgramSize; 
        /* Socket handle: a plain int descriptor, or a Winsock SOCKET on Windows. */
00229 #ifndef WIN32
00230   int sockfd; 
00231 #else
00232   SOCKET sockfd;
00233 #endif
00234  
        /* Value returned by getConfCheck(). */
00237   bool confCheck;
00238 
00240   int nInitSources;
00242   char  **initSources;
00243   /* The initialization type (from file / list / directly). */
00244   int initType;
00245 
        /* Value returned by getRecheckInterval(). */
00249   long recheckInterval;
00250 
        /* Value returned by getCrtRecheckInterval(). */
00255   long crtRecheckInterval;
00256 
        /* Background-thread handle and synchronization objects. Non-Windows
         * builds use pthread types and keep them protected; Windows builds use
         * HANDLEs and declare them public. mutexBack is the lock taken by the
         * inline get*Monitoring() and get*MonitorInterval() accessors. */
00257 #ifndef WIN32
00258 
00260   pthread_t bkThread;
00261   
00263   pthread_mutex_t mutex;
00264 
00266   pthread_mutex_t mutexBack;
00267   
00269   pthread_mutex_t mutexCond;
00270 
00272   pthread_cond_t confChangedCond;
00273 #else
00274  public:
00275   HANDLE bkThread;
00276   HANDLE mutex;
00277   HANDLE mutexBack;
00278   HANDLE mutexCond;
00279   HANDLE confChangedCond;
00280  protected:
00281 #endif
00282 
00283   bool recheckChanged, jobMonChanged,sysMonChanged;
00284 
00287   bool haveBkThread;
00288 
00290   bool bkThreadStarted;
00291 
00293   bool stopBkThread;
00294 
00299   bool autoDisableMonitoring;
00300 
        /* Flag read (under mutexBack) by getSysMonitoring(). */
00304   bool sysMonitoring;
00305 
        /* Flag read (under mutexBack) by getJobMonitoring(). */
00309   bool jobMonitoring;
00310 
        /* Flag read (under mutexBack) by getGenMonitoring(). */
00314   bool genMonitoring;
00315 
        /* Values reported by getJobMonitorInterval() / getSysMonitorInterval()
         * while the corresponding monitoring flag is set. */
00319   long jobMonitorInterval, sysMonitorInterval; 
00320   
00324   int genMonitorIntervals;
00325 
00329   int nSysMonitorParams, nJobMonitorParams, nGenMonitorParams;
00330   
00331   /* The names of the parameters that can be enabled/disabled by the user in
00332    * the system/job/general monitoring datagrams.
00333    */
00334   char *sysMonitorParams[MAX_SYS_PARAMS];
00335   char *genMonitorParams[MAX_GEN_PARAMS];
00336   char *jobMonitorParams[MAX_JOB_PARAMS];
00337 
00338   /* Arrays of flags that specify the active monitoring parameters (the
00339    * ones that are sent in the datagrams). 
00340    */
00341   int actSysMonitorParams[MAX_SYS_PARAMS];
00342   int actGenMonitorParams[MAX_GEN_PARAMS];
00343   int actJobMonitorParams[MAX_JOB_PARAMS];
00344 
00345   ConfURLs confURLs;
00346 
00348   int nMonJobs;
00349 
        /* Pointer to MonitoredJob records; presumably an array of nMonJobs
         * entries - TODO confirm. */
00351   MonitoredJob *monJobs;
00352 
00354   long lastModifFile;
00355 
00356   /* The moment when the last datagram with job monitoring information
00357    * was sent. 
00358    */
00359   time_t lastJobInfoSend;
00360 
        /* Identity of the local user and host, each in a fixed buffer of
         * MAX_STRING_LEN bytes. */
00362   char username[MAX_STRING_LEN];
00364   char groupname[MAX_STRING_LEN];
00366   char myHostname[MAX_STRING_LEN];
00368   char myIP[MAX_STRING_LEN];
00370   int numIPs;
        /* Room for 20 strings of at most 19 characters plus terminator. */
00372   char allMyIPs[20][20];
00374   int numCPUs;
00375 
00376   bool sysInfo_first;
00378   time_t lastSysInfoSend;
00379  /* The last recorded values for system parameters. */
00380   double lastSysVals[MAX_SYS_PARAMS];
00381   /* The current values for the system parameters */
00382   double currentSysVals[MAX_SYS_PARAMS];
00383   /* The success/error codes returned by the functions that calculate
00384      the system parameters */
00385   int sysRetResults[MAX_SYS_PARAMS];
00386 
00387   /* The current values for the job parameters */
00388   double currentJobVals[MAX_JOB_PARAMS];
00389   /* The success/error codes returned by the functions that calculate
00390      the job parameters */
00391   int jobRetResults[MAX_JOB_PARAMS];
00392 
00393   /* The current values for the general parameters */
00394   double currentGenVals[MAX_GEN_PARAMS];
00395   /* The success/error codes returned by the functions that calculate
00396      the general parameters */
00397   int genRetResults[MAX_GEN_PARAMS];
00398 
00399   /* Table which stores the number of processes in each state 
00400      (R -runnable, S - sleeping etc.) Each entry in the table 
00401      corresponds to a capital letter. */
00402   double currentProcessStates[NLETTERS];
00403 
00404   /* CPU information: */
00405   char cpuVendor[100];
00406   char cpuFamily[100];
00407   char cpuModel[100];
00408   char cpuModelName[200];
00409 
        /* Network interface bookkeeping; every fixed array has room for 20
         * entries. NOTE(review): the 20 is a bare literal repeated below -
         * confirm the implementation bounds nInterfaces accordingly. */
00411   char interfaceNames[20][20];
00413   int nInterfaces;
00416   double lastBytesSent[20];
00417   double lastBytesReceived[20];
00420   double lastNetErrs[20];
00422   double *currentNetIn, *currentNetOut, *currentNetErrs;
00423 
00425   double currentNSockets[4];
00428   double currentSocketsTCP[20];
00431   char *socketStatesMapTCP[20];  
00432 
00433   /* don't allow a user to send more than maxMsgRate messages per second, on
       *  average (MAX_MSG_RATE is presumably the default - TODO confirm) */
00434   int maxMsgRate;
00435   long prvTime;
00436   double prvSent;
00437   double prvDrop;
00438   long crtTime;
00439   long crtSent;
00440   long crtDrop;
00441   double hWeight;
00442 
00444   int instance_id;
00448   int seq_nr;
00449 
00450  public:
        /* Three construction forms, matching the initialization types named at
         * initType (from file / list / directly):
         *  - one initialization source string,
         *  - a list of nDestinations destination strings,
         *  - explicit address / port / password arrays of nDestinations entries. */
00457   ApMon(char *initsource) throw(runtime_error);
00458   
00459 
00465   ApMon(int nDestinations, char **destinationsList) throw(runtime_error);
00466 
00478   ApMon(int nDestinations, char **destAddresses, int *destPorts, char **destPasswds) throw(runtime_error);
00479 
00483   ~ApMon();
00484 
        /* Sends one named parameter. valueType is an integer type code
         * (presumably one of the XDR_* constants - TODO confirm) describing
         * what paramValue points to. */
00500   int sendParameter(char *clusterName, char *nodeName,
00501                char *paramName, int valueType, char *paramValue)
00502     throw(runtime_error);
00503 
        /* As above, with an explicit integer timestamp. */
00521   int sendTimedParameter(char *clusterName, char *nodeName,
00522               char *paramName, int valueType, char *paramValue, int timestamp)
00523     throw(runtime_error);
00524 
        /* Typed convenience overloads: the value is passed directly as int,
         * float, double or C string, with no separate type code. */
00537   int sendParameter(char *clusterName, char *nodeName,
00538                char *paramName, int paramValue)
00539     throw(runtime_error);
00540 
00553   int sendParameter(char *clusterName, char *nodeName,
00554                char *paramName, float paramValue)
00555     throw(runtime_error);
00556 
00569   int sendParameter(char *clusterName, char *nodeName,
00570                char *paramName, double paramValue)
00571     throw(runtime_error);
00572 
00585   int sendParameter(char *clusterName, char *nodeName,
00586                char *paramName, char *paramValue)
00587     throw(runtime_error);
00588 
00589 
        /* Sends nParams parameters in one call; paramNames, valueTypes and
         * paramValues are arrays (presumably each of nParams entries). */
00602   int sendParameters(char *clusterName, char *nodeName,
00603                int nParams, char **paramNames, int *valueTypes, 
00604                          char **paramValues) throw(runtime_error);
00605 
        /* As above, with an explicit integer timestamp. */
00622   int sendTimedParameters(char *clusterName, char *nodeName,
00623                int nParams, char **paramNames, int *valueTypes, 
00624                char **paramValues, int timestamp) throw(runtime_error);
00625 
        /* Returns confCheck. Unlike the monitoring accessors below, this read
         * takes no lock. */
00631   bool getConfCheck() { return confCheck; }
00632 
        /* Returns recheckInterval; no lock is taken. */
00639   long getRecheckInterval() { return recheckInterval; }
00640 
00641 
00649   void setRecheckInterval(long val);
00650 
00657   void setConfRecheck(bool confRecheck, long interval);
00658 
        /* Convenience overload: forwards with interval = RECHECK_INTERVAL. */
00663   void setConfRecheck(bool confRecheck) {
00664     setConfRecheck(confRecheck, RECHECK_INTERVAL);
00665   }
00666 
00673   void setJobMonitoring(bool jobMonitoring, long interval);
00674 
        /* Convenience overload: forwards with interval = JOB_MONITOR_INTERVAL. */
00678   void setJobMonitoring(bool jobMonitoring) {
00679     setJobMonitoring(jobMonitoring, JOB_MONITOR_INTERVAL);
00680   }
00681 
        /* Returns jobMonitorInterval if jobMonitoring is set, otherwise -1.
         * Both members are read while holding mutexBack. */
00685   long getJobMonitorInterval() {
00686     long i = -1;
00687     pthread_mutex_lock(&mutexBack);
00688     if (jobMonitoring)
00689       i = jobMonitorInterval;
00690     pthread_mutex_unlock(&mutexBack);
00691     return i;
00692   }
00693 
        /* Returns a copy of jobMonitoring, taken while holding mutexBack. */
00695   bool getJobMonitoring() {
00696     bool b;
00697     pthread_mutex_lock(&mutexBack);
00698     b = jobMonitoring;
00699     pthread_mutex_unlock(&mutexBack);
00700     return b;
00701   }
00702 
00709   void setSysMonitoring(bool sysMonitoring, long interval); 
00710 
        /* Convenience overload: forwards with interval = SYS_MONITOR_INTERVAL. */
00714   void setSysMonitoring(bool sysMonitoring) {
00715     setSysMonitoring(sysMonitoring, SYS_MONITOR_INTERVAL);
00716   }
00717 
        /* Returns sysMonitorInterval if sysMonitoring is set, otherwise -1.
         * Both members are read while holding mutexBack. */
00721   long getSysMonitorInterval() {
00722     long i = -1;
00723     pthread_mutex_lock(&mutexBack);
00724     if (sysMonitoring)
00725       i = sysMonitorInterval;
00726     pthread_mutex_unlock(&mutexBack);
00727     return i;
00728   }
00729 
        /* Returns a copy of sysMonitoring, taken while holding mutexBack. */
00731   bool getSysMonitoring() {
00732     bool b;
00733     pthread_mutex_lock(&mutexBack);
00734     b = sysMonitoring;
00735     pthread_mutex_unlock(&mutexBack);
00736     return b;
00737   }
00738 
00746   void setGenMonitoring(bool genMonitoring, int nIntervals);
00747 
        /* Convenience overload: forwards with nIntervals = GEN_MONITOR_INTERVALS. */
00752   void setGenMonitoring(bool genMonitoring) {
00753     setGenMonitoring(genMonitoring, GEN_MONITOR_INTERVALS);
00754   }
00755 
        /* Returns a copy of genMonitoring, taken while holding mutexBack. */
00759   bool getGenMonitoring() {
00760     bool b;
00761     pthread_mutex_lock(&mutexBack);
00762     b = genMonitoring;
00763     pthread_mutex_unlock(&mutexBack);
00764     return b;
00765   }
00766 
        /* Registers a job for monitoring; the arguments mirror the fields of
         * MonitoredJob (pid, workdir, clusterName, nodeName). */
00777   void addJobToMonitor(long pid, char *workdir, char *clusterName,
00778                        char *nodeName) throw(runtime_error);
00779 
00784   void removeJobToMonitor(long pid) throw(runtime_error);
00785 
00788   void setSysMonClusterNode(char *clusterName, char *nodeName);
00789 
        /* Static: takes the new level as a string; the accepted level names are
         * not visible in this header. */
00793   static void setLogLevel(char *newLevel_s);
00794 
00799   void setMaxMsgRate(int maxRate);
00800 
00805   static void errExit(char *msg);
00806 
00807 
00808  protected:
00809 
00819   void initialize(char *filename, bool firstTime) throw(runtime_error);
00820 
00822   void constructFromList(int nDestinations, char **destinationsList)
00823     throw(runtime_error);
00824 
00833   void initialize(int nDestinations, char **destList, bool firstTime) throw(runtime_error);
00834 
00835 
        /* nDestinations is an in/out pointer here; destAddresses, destPorts and
         * destPasswds are caller-provided arrays to be filled (presumably with
         * room for MAX_N_DESTINATIONS entries - TODO confirm). */
00847   void loadFile(char *filename, int *nDestinations, char **destAddresses,
00848                 int *destPorts, char **destPasswds) throw(runtime_error);
00849 
00850  
00862   void arrayInit(int nDestinations, char **destAddresses, int *destPorts,
00863                  char **destPasswds)
00864     throw(runtime_error);
00865 
00879   void arrayInit(int nDestinations, char **destAddresses, int *destPorts,
00880                       char **destPasswds, bool firstTime)
00881     throw(runtime_error);
00882 
00893   void addToDestinations(char *line, int *nDestinations, 
00894                 char *destAddresses[], int destPorts[], char *destPasswds[]); 
00895 
00900   void getDestFromWeb(char *url, int *nDestinations, char *destAddresses[], 
00901                  int destPorts[], char *destPasswds[],
00902                       ConfURLs& confURLs) throw(runtime_error);
00903 
00904  
00909   void encodeParams(int nParams, char **paramNames, int *valueTypes, 
00910                  char **paramValues, int timestamp) throw(runtime_error);
00911 
00915   void initMonitoring();
00916 
00920   void sendJobInfo();
00921 
        /* NOTE(review): the next two functions take MonitoredJob by value, so
         * each call copies the whole struct including its MAX_STRING_LEN-byte
         * workdir buffer. Changes made to `job` inside are not seen by the caller. */
00925   void sendOneJobInfo(MonitoredJob job);
00926 
00928   void updateJobInfo(MonitoredJob job); 
00929 
00932   void sendSysInfo();
00933 
00936   void updateSysInfo();
00937 
00940   void sendGeneralInfo();
00941 
00943   void updateGeneralInfo();
00944 
00950   void setBackgroundThread(bool val);
00951 
        /* Returns crtRecheckInterval; no lock is taken. */
00956   long getCrtRecheckInterval() {
00957     return crtRecheckInterval;
00958   }
00959 
00960   void setCrtRecheckInterval(long val);
00961 
00962   
00966   void freeConf();
00967 
        /* The free function bkTask is a friend so it can reach the protected
         * state; its signature differs per platform. */
00974 #ifndef WIN32
00975   friend void *bkTask(void *param);
00976 #else
00977   friend DWORD WINAPI bkTask(void *param);
00978 #endif
00979   
00983   void parseXApMonLine(char *line);
00984 
00986   void initSocket() throw(runtime_error);
00987 
00991   void parseConf(FILE *fp, int *nDestinations, char **destAddresses, 
00992                      int *destPorts, char **destPasswds)
00993     throw(runtime_error);
00994 
00995   bool shouldSend();
01001 
        /* ProcUtils (declared elsewhere) is granted access to the protected members. */
01002   friend class ProcUtils;
01003 };
01004 
01010 #ifndef WIN32
      /* Free function befriended by class ApMon. Its shape is that of a thread
       * start routine: void *(*)(void *) on non-Windows builds, DWORD WINAPI
       * (void *) on Windows. Presumably it is the body of ApMon's background
       * thread and `param` is the ApMon instance - TODO confirm. */
01011 void *bkTask(void *param); // throw(runtime_error);
01012 #else
01013 DWORD WINAPI bkTask(void *param);
01014 #endif
01015 #endif
01016 
01017 
01018 
01019 
01020 
01021 
01022 
01023 

Generated at Fri Jul 18 11:59:23 2008 for Gaudi Framework, version v20r2 by Doxygen version 1.5.1 written by Dimitri van Heesch, © 1997-2004