/* This file is part of Bolixo. Bolixo is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. Bolixo is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with Bolixo. If not, see . */ /* Monitoring system for bolixo. It connects to all services and perform a test query */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include "bolixo.m" #include "bolixo.h" #define INSTRUMENT_DONOTOPEN #include "instrument.h" using namespace std; enum CONNECT_TYPE { TYPE_NONE, TYPE_CONTROL, TYPE_PIPE }; struct HANDLE_INFO: public ARRAY_OBJ{ CONNECT_TYPE type; REQUEST_INFO req; pid_t pid; time_t start; HANDLE_INFO(){ type = TYPE_NONE; pid = (pid_t)-1; start = time(NULL); } }; #include "proto/bod_admin.protoch" #define bo_sessiond_client_getsessioninfo_NOTNEED #define bo_sessiond_client_getsessioninfovars_NOTNEED #define bo_sessiond_client_getsessioninfovars_v2_NOTNEED #define bo_sessiond_client_setvar_NOTNEED #define bo_sessiond_client_delnotify_NOTNEED #define bo_sessiond_client_setnotify_NOTNEED #define bo_sessiond_client_waitevent_NOTNEED #define bo_sessiond_client_ping_NOTNEED #include "proto/bo-sessiond_client.protoch" #define bolixod_client_registernode_NOTNEED #define bolixod_client_nodelogout_NOTNEED #define bolixod_client_nodelogin_NOTNEED #define bolixod_client_nodepass_NOTNEED #define bolixod_client_publish_NOTNEED #define bolixod_client_publish_v2_NOTNEED #define bolixod_client_publish_file_NOTNEED #define bolixod_client_publish_rest_NOTNEED #define bolixod_client_remove_NOTNEED #define bolixod_client_recordemail_NOTNEED #define bolixod_client_getnode_NOTNEED #define bolixod_client_newacct_findnode_NOTNEED #define bolixod_client_pub_search_NOTNEED #define bolixod_client_pub_search_v2_NOTNEED #define bolixod_client_pub_list_NOTNEED #define bolixod_client_readfile_NOTNEED #define bolixod_client_readfile_v2_NOTNEED #include "proto/bolixod_client.protoch" #define publishd_client_sendmessage_NOTNEED #include "proto/publishd_client.protoch" #define documentd_client_save_NOTNEED #define documentd_client_savemore_NOTNEED #define documentd_client_load_NOTNEED #define documentd_client_loadmore_NOTNEED #define documentd_client_playstep_NOTNEED #define documentd_client_playstep_more_NOTNEED #define documentd_client_rename_NOTNEED #define documentd_client_startgame_NOTNEED #define documentd_client_endgame_NOTNEED #define documentd_client_is_modified_NOTNEED #define documentd_client_waitevent_NOTNEED #define documentd_client_removesession_NOTNEED #include "proto/documentd_client.protoch" #define bo_keysd_control_genkey_NOTNEED #define bo_keysd_control_setpassphrase_NOTNEED #define bo_keysd_control_checkpassphrase_NOTNEED #define bo_keysd_control_sign_NOTNEED #define bo_keysd_control_quit_NOTNEED #define bo_keysd_control_debug_NOTNEED #define bo_keysd_control_debugfile_NOTNEED #define bo_keysd_control_runstatus_NOTNEED #include "proto/bo-keysd_control.protoch" #define bo_websocket_control_status_NOTNEED #define bo_websocket_control_quit_NOTNEED #define bo_websocket_control_debug_NOTNEED #define bo_websocket_control_debugfile_NOTNEED #define bo_websocket_control_instrument_NOTNEED #define bo_websocket_control_disconnectworkers_NOTNEED #define bo_websocket_control_pause_NOTNEED #define bo_websocket_control_resume_NOTNEED #define bo_websocket_control_set_debug_header_NOTNEED #include "proto/bo-websocket_control.protoch" #include "proto/bo-mon_control.protoh" static const char *ok_marker = " "; static const char *err_marker = "-> "; static int test_system(vector &out) { glocal int ret = 0; glocal vector *out = &out; ("/proc/meminfo",true); vector tb; int n = str_splitline(line,' ',tb); if (n == 3){ if (tb[0] == "SwapFree:"){ unsigned long freekb = atol(tb[1].c_str()); bool ok = freekb > 50000; glocal.out->push_back(string_f("%sSwapFree ok=%d freekb=%lu" ,ok ? ok_marker : err_marker ,ok,freekb)); if (!ok) glocal.ret = -1; } } return 0; ("/proc/loadavg",true); vector tb; int n = str_splitline(line,' ',tb); if (n >= 3){ float avg = atof(tb[2].c_str()); bool ok = avg < 0.5; glocal.out->push_back(string_f("%sloadavg ok=%d %s" ,ok ? ok_marker : err_marker ,ok,line)); if (!ok) glocal.ret = -1; } return 0; struct statfs st; if (statfs ("/",&st)!=-1){ const unsigned long long gig = 1024*1024*1024; unsigned long long space = st.f_bfree * st.f_bsize; bool ok = space > 20*gig || st.f_bfree > (st.f_blocks/4); out.push_back(string_f("%sdiskfree ok=%d f_blocks=%lu f_bfree=%lu space=%lf" ,ok ? ok_marker : err_marker ,ok,st.f_blocks,st.f_bfree,(double)space/gig)); } return glocal.ret; } struct SECRETS{ string bod; string bolixod; string publishd; string documentd; }; static int test_loop ( const vector &socks, const SECRETS &secrets, bool debug, vector &out, time_t &last_error) { glocal last_error; glocal int ret = 0; glocal out; glocal function seterror; out.clear(); long long start = fdpass_getnow(); long long last_now = start; last_error = (time_t)0; glocal.seterror = [&](){ last_error = time(nullptr); glocal.ret = -1; }; if (test_system (out)==-1) glocal.seterror(); for (auto &s:socks){ if (debug){ long long now = fdpass_getnow(); long long diff = now - start; long long diff_now = now - last_now; out.push_back(string_f("%sdebug=%Lu.%06Lu last=%Lu.%06Lu",ok_marker ,diff/1000000,diff%1000000 ,diff_now/1000000,diff_now%1000000)); last_now = now; } glocal const char *path = s.c_str(); debug_printf ("bo-mon path=%s\n",glocal.path); CONNECT_INFO con; con.port = glocal.path; con.set_timeout (5); //out.push_back(string_f("Try to connect to %s",glocal.path)); if (strstr(glocal.path,"-sessiond")!=NULL){ if (strstr(glocal.path,"-admin")!=NULL){ con.secret = secrets.bod; (con); glocal.out.push_back(string_f("%s%s: internal_error=%d success=%d" ,success ? ok_marker : err_marker ,glocal.path,internal_error,success)); if (!success) glocal.seterror(); } }else if (strstr(glocal.path,"-bolixod")!=NULL){ con.secret = secrets.bolixod; (con); const char *marker = ok_marker; if (internal_error || !sessiond || !db || !fsok){ glocal.seterror(); marker = err_marker; } glocal.out.emplace_back(string_f("%s%s: internal_error=%d sessiond=%d db=%d fsok=%d msg=%s" ,marker,glocal.path,internal_error,sessiond,db,fsok,msg)); }else if (strstr(glocal.path,"-publishd")!=NULL){ con.secret = secrets.publishd; glocal string secret = con.secret; string testfile = string_f("/var/lib/bolixo/test-%ld",time(NULL)); (testfile,false); return 0; (con,testfile); const char *marker = ok_marker; if (internal_error || !dbfiles || !fsok){ glocal.seterror(); marker = err_marker; } glocal.out.emplace_back(string_f("%s%s: internal_error=%d dbfiles=%d fsok=%d" ,marker,glocal.path,internal_error,dbfiles,fsok)); unlink (testfile.c_str()); }else if (strstr(glocal.path,"-documentd")!=NULL){ con.secret = secrets.documentd; (con); const char *marker = ok_marker; if (internal_error || !success){ glocal.seterror(); marker = err_marker; } glocal.out.emplace_back(string_f("%s%s: internal_error=%d success=%d" ,marker,glocal.path,internal_error,success)); }else if (strstr(glocal.path,"-keysd")!=NULL){ con.secret = ""; (con); const char *msg = ""; const char *marker = ok_marker; bool found = false; for (auto l:lines){ if (strcmp(l,"passphrase set")==0){ found = true; break; } } if (internal_error){ glocal.seterror(); marker = err_marker; }else if (!found){ glocal.seterror(); marker = err_marker; msg = "passphrase NOT set"; } glocal.out.emplace_back(string_f("%s%s: internal_error=%d %s " ,marker,glocal.path,internal_error,msg)); }else if (strstr(glocal.path,"-bod")!=NULL){ if (strstr(glocal.path,"-admin")!=NULL){ con.secret = secrets.bod; (con); if (internal_error){ glocal.out.push_back(string_f("%sCan't talk to bod server: %s" ,err_marker,glocal.path)); glocal.seterror(); }else{ bool ok = true; if (!writed || !bdfiles1 || !bdfiles2 || !bdusers || !sessiond1 || !sessiond2 || !keysd || !fsok || !publish_dbfiles || !publish_fsok || !documentd || admin_sess_valid!=2){ glocal.seterror(); ok = false; } glocal.out.push_back(string_f("%s%s: internal_error1=%d writed=%d bdfiles1=%d" " bdfiles2=%d bdusers=%d sessiond1=%d sessiond2=%d keysd=%d fsok=%d" " publish_dbfiles=%d publish_fsok=%d docd=%d adm_sess=%u" ,ok ? ok_marker : err_marker ,glocal.path,internal_error1,writed,bdfiles1 ,bdfiles2,bdusers,sessiond1,sessiond2,keysd,fsok ,publish_dbfiles,publish_fsok,documentd,admin_sess_valid)); } } }else if (strstr(glocal.path,"web-80-")!=NULL || strstr(glocal.path,"web-fail-80-")!=NULL){ glocal bool okseen = false; ("unix:",glocal.path,5); send ("GET /index.hc?test=1 HTTP/1.0\r\n\r\n"); if (strcmp(line,"ok")==0){ glocal.out.push_back (string_f("%s%s: ok seen",ok_marker,glocal.path)); glocal.okseen = true; } glocal.out.push_back (string_f("%s%s: can't connect",err_marker,glocal.path)); glocal.seterror(); if (!glocal.okseen){ glocal.out.push_back(string_f("%s%s: Ok not seen",err_marker,glocal.path)); glocal.seterror(); } }else if (strstr(glocal.path,"websocket-")!=nullptr){ (con); if (internal_error || !ok){ glocal.out.push_back(string_f("%s%s: test failed",err_marker,glocal.path)); glocal.seterror(); }else if (paused){ glocal.out.push_back(string_f("%s%s: test failed, workers=%u paused=%d",err_marker,glocal.path,workers,paused)); }else{ glocal.out.push_back (string_f("%s%s: test ok, workers=%u, paused=%d",ok_marker,glocal.path,workers,paused)); } }else if (strstr(glocal.path,"udpproxy")!=nullptr){ glocal int nbreq = 0; glocal bool ok = false; ("unix:",glocal.path,5); sendf ("status\n"); const char *pt = nullptr; if (strcmp(line,"Ok")==0){ glocal.out.push_back (string_f("%s%s: test ok, nbreq=%d",ok_marker,glocal.path,glocal.nbreq)); glocal.ok = true; end = true; }else if (is_start_any_of(line,pt,"nbreq=")){ glocal.nbreq = atoi(pt); } if (!glocal.ok){ glocal.out.push_back(string_f("%s%s: test failed, nbreq=%d",err_marker,glocal.path,glocal.nbreq)); glocal.seterror(); } }else{ auto size = out.size(); time_t last; if (trlitool_mon(glocal.path,out,last)==-1){ if (last > glocal.last_error) glocal.last_error = last; glocal.ret = -1; } if (out.size() == size){ glocal.out.push_back(string_f("%sDon't know how to handle this socket; %s",err_marker,glocal.path)); } } } if (debug){ long long now = fdpass_getnow(); long long diff = now - start; long long diff_now = now - last_now; out.push_back(string_f("%sdebug=%Lu.%06Lu last=%Lu.%06Lu",ok_marker ,diff/1000000,diff%1000000 ,diff_now/1000000,diff_now%1000000)); last_now = now; } return glocal.ret; } static void trli_mon_sendline (int fd, const string &s) { write (fd,s.c_str(),s.size()); write (fd,"\n",1); } static int trli_mon_sendmail ( const char *mailserver, const char *mailport, const char *admins_conf, const char *subject, const vector &out) { glocal string admin; (admins_conf,true); if (strncmp(line,"ADMIN1=",7)==0){ glocal.admin = line+7; } return 0; int ret = -1; if (glocal.admin.size() > 0){ string body; for (auto &s:out) body += string_f("%s\r\n",s.c_str()); ret = fdpass_sendmail (mailserver,mailport,"no-reply@bolixo.org",glocal.admin,subject,body); } return ret; } int main (int argc, char *argv[]) { glocal int ret = -1; glocal const char *mailserver = "unix:"; glocal const char *mailport = "/dev/smtp.sock"; glocal const char *command = NULL; glocal const char *admins_conf = "/etc/bolixo/admins.conf"; glocal const char *control = "/var/run/blackhole/bo-mon.sock"; glocal const char *sock_dir = NULL; glocal SECRETS secrets; glocal bool verbose = false; glocal bool daemon = false; glocal const char *user = "trli"; glocal int sleepdelay=5; // Wake up every N seconds glocal int testdelay = 30; // Executes test every 30 seconds glocal int quiettime = 60*60; // Wait that amount of time before sending an alert glocal const char *pidfile = "/var/run/trli-mon.pid"; glocal.ret = (argc,argv,"bolixo"); setproginfo ("bo-mon",VERSION,MSG_U(I_BO_MON,"Monitoring service for Bolixo")); setarg ('d',"sock_dir","Directoy holding the unix socket to connect to all services",glocal.sock_dir,true); setarg (' ',"bod-secret","Secret needed to connect to bod",glocal.secrets.bod,true); setarg (' ',"bolixod-secret","Secret needed to connect to bolixod",glocal.secrets.bolixod,true); setarg (' ',"publishd-secret","Secret needed to connect to publishd",glocal.secrets.publishd,true); setarg (' ',"documentd-secret","Secret needed to connect to documentd",glocal.secrets.documentd,true); setarg ('v',"verbose","Display more information",glocal.verbose,false); setarg (' ',"control","Unix socket path",glocal.control,false); setarg (' ',"alarmcmd","Command used to send an alarm",glocal.command,false); setgrouparg ("Daemon mode"); setarg (' ',"daemon","Runs in background",glocal.daemon,false); setarg (' ',"user","Runs as this user",glocal.user,false); setarg (' ',"pidfile","PID file",glocal.pidfile,false); setgrouparg ("Misc."); setarg (' ',"testdelay","Test every N seconds",glocal.testdelay,false); setarg (' ',"wakeup","Wakeup every N seconds",glocal.sleepdelay,false); setarg (' ',"mailserver","Mail relay to use",glocal.mailserver,false); setarg (' ',"mailport","TCP port of the relay (or unix socket)",glocal.mailport,false); setarg (' ',"quiettime","Minimum delay before sendming a new mail alert",glocal.quiettime,false); if (glocal.daemon){ syslog (LOG_ERR,"%s",msg); }else{ fprintf (stderr,"%s",msg); } if (glocal.daemon){ syslog (LOG_WARNING,"%s",msg); }else{ fprintf (stderr,"%s",msg); } glocal int ret = 0; glocal vector socks; signal (SIGCHLD,SIG_IGN); (glocal.sock_dir); glocal.socks.push_back(path); sort (glocal.socks.begin(),glocal.socks.end()); if (glocal.socks.size()==0){ glocal.ret = -1; tlmp_error ("No socket found in directory %s\n",glocal.sock_dir); }else if (!glocal.daemon){ vector out; time_t last_error; glocal.ret = test_loop (glocal.socks,glocal.secrets,false,out,last_error); if (glocal.verbose) for (auto &x:out) printf ("%s\n",x.c_str()); }else{ glocal unsigned long nbtest = 0; glocal bool teston = true; glocal bool allok = true; glocal vector testout; glocal bool new_allok = true; // Reception of current test glocal time_t new_last_error = (time_t)0; glocal vector new_testout; glocal pid_t new_testpid = (pid_t)-1; glocal time_t lastmsg = (time_t)0; glocal time_t lasttest = time(NULL); glocal time_t start_time = time(nullptr); // Avoid sending email for old errors // since syslog send the last_error timestamp (string_f("unix:%s",glocal.control),5); HANDLE_INFO *n = new HANDLE_INFO; n->type = TYPE_CONTROL; info.data = n; HANDLE_INFO *n = (HANDLE_INFO*)info.data; if (n->type == TYPE_PIPE){ glocal.allok = glocal.new_allok; glocal.testout = glocal.new_testout; if (!glocal.allok && glocal.new_last_error > glocal.start_time && glocal.new_last_error > glocal.lastmsg+glocal.quiettime){ glocal.lastmsg = time(NULL); if (glocal.command != NULL){ (glocal.command,10); for (auto &s:glocal.testout){ fprintf (fout,"%s\n",s.c_str()); } end = true; return 0; } const char *subject = glocal.new_testpid == (pid_t)-1 ? "bolixo monitoring failed" : "bolixo monitoring"; trli_mon_sendmail(glocal.mailserver,glocal.mailport,glocal.admins_conf,subject,glocal.testout); } //tlmp_error ("endclient pid=%u start %lu now %lu\n",n->pid,n->start,time(NULL)); if (n->pid == glocal.new_testpid) glocal.new_testpid = (pid_t)-1; } HANDLE_INFO *c = (HANDLE_INFO*)info.data; if (c->type == TYPE_CONTROL){ (this,c->req,line, info.linelen,endserver, endclient, no,c); vector tb; tb.push_back(string_f("Version %s",VERSION)); tb.push_back(string_f("testdelay %d",glocal.testdelay)); tb.push_back(string_f("autotest %s",glocal.teston ? "On" : "Off")); tb.push_back(string_f("services %s",glocal.allok ? "OK" : "Fail")); tb.push_back(string_f("nbtest %lu",glocal.nbtest)); tb.push_back(string_f("alarm command %s",glocal.command)); instrument_status (tb); DATEASC date; date.buf[0] = '\0'; if (glocal.lastmsg != (time_t)0) fdpass_asctime(glocal.lastmsg,date); tb.push_back(string_f("alarm sent %s",date.buf)); fdpass_asctime (glocal.lasttest,date); tb.push_back(string_f("last test %s",date.buf)); fdpass_asctime (glocal.new_last_error,date); tb.push_back(string_f("last error %s",date.buf)); fdpass_asctime (glocal.start_time,date); tb.push_back(string_f("start time %s",date.buf)); tb.push_back(string_f("quiet time %d",glocal.quiettime)); for (auto &x:glocal.testout) tb.push_back(x); rep_status(tb); toggle_instrument_file(on,"/tmp/instrument-mon.log"); glocal.teston = teston; endserver = true; vector out; time_t last_error; if (test_loop(glocal.socks,glocal.secrets,debug,out,last_error)==-1){ rep_test (false,out); }else{ rep_test (true,out); } vector out; time_t last_error; int ok = test_loop(glocal.socks,glocal.secrets,false,out,last_error); out.insert (out.begin(),string_f("ok=%d",ok == -1 ? 0 : 1)); int ret = trli_mon_sendmail (glocal.mailserver,glocal.mailport,glocal.admins_conf,"test email monitoring",out); rep_testmail (ret == -1 ? false:true); glocal.lastmsg = (time_t)0; if (on){ debug_seton(); }else{ debug_setoff(); } debug_setfdebug (filename); glocal.quiettime = quiettime; endclient = true; }else if (c->type == TYPE_PIPE){ if (strncmp(line,"ok=",3)==0){ int ok = atoi(line+3); glocal.new_allok = ok == -1 ? false : true; }else if (strncmp(line,"last=",5)==0){ glocal.new_last_error = atoi(line+5); }else{ glocal.new_testout.push_back(line); } } if (o.is_ok()){ glocal o; o.setrawmode(true); daemon_init (glocal.pidfile,glocal.user); (glocal.sleepdelay); time_t now = time(NULL); if (glocal.teston && (now-glocal.lasttest) >= glocal.testdelay){ if (glocal.new_testpid != (pid_t)-1){ tlmp_error ("test_loop did not complete, pid=%u\n",glocal.new_testpid); if (kill (glocal.new_testpid,SIGKILL)==-1){ tlmp_error ("Can't kill pid %u (%s)\n",glocal.new_testpid,strerror(errno)); } glocal.new_testpid = (pid_t)-1; } glocal.nbtest++; glocal.lasttest = now; glocal.new_allok = false; glocal.new_testout.clear(); int tb[2]; if (pipe(tb)==-1){ tlmp_error ("Can't setup pipe for test loop (%s)\n",strerror(errno)); }else{ pid_t pid = fork(); if (pid == (pid_t)0){ close (tb[0]); vector out; time_t last_error; int ok = test_loop(glocal.socks,glocal.secrets,false,out,last_error); trli_mon_sendline (tb[1],string_f("ok=%d",ok)); trli_mon_sendline (tb[1],string_f("last=%ld",last_error)); for (auto l:out) trli_mon_sendline(tb[1],l); _exit (0); }else if (pid == (pid_t)-1){ tlmp_error ("Can't fork for test loop (%s)\n",strerror(errno)); close (tb[1]); close (tb[0]); }else{ close (tb[1]); HANDLE_INFO *n = new HANDLE_INFO; n->type = TYPE_PIPE; n->pid = pid; glocal.o.inject (tb[0],n); glocal.o.setrawmode(tb[0],false); glocal.new_testpid = pid; } } } netevent_loop(o,idle); } } return glocal.ret; return glocal.ret; }