/*
This file is part of Bolixo.
Bolixo is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
Bolixo is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with Bolixo. If not, see <http://www.gnu.org/licenses/>.
*/
/*
Monitoring system for bolixo. It connects to all services and performs a test query.
*/
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include "bolixo.m"
#include "bolixo.h"
#define INSTRUMENT_DONOTOPEN
#include "instrument.h"
using namespace std;
// Kind of connection a HANDLE_INFO is attached to.
enum CONNECT_TYPE {
	TYPE_NONE,		// Not assigned yet (value set by the HANDLE_INFO constructor)
	TYPE_CONTROL,	// Connection accepted on the control socket
	TYPE_PIPE		// Read end of the pipe fed by a forked test run
};
struct HANDLE_INFO: public ARRAY_OBJ{
CONNECT_TYPE type;
REQUEST_INFO req;
pid_t pid;
time_t start;
HANDLE_INFO(){
type = TYPE_NONE;
pid = (pid_t)-1;
start = time(NULL);
}
};
#include "proto/bod_admin.protoch"
#define bo_sessiond_client_getsessioninfo_NOTNEED
#define bo_sessiond_client_getsessioninfovars_NOTNEED
#define bo_sessiond_client_getsessioninfovars_v2_NOTNEED
#define bo_sessiond_client_setvar_NOTNEED
#define bo_sessiond_client_delnotify_NOTNEED
#define bo_sessiond_client_setnotify_NOTNEED
#define bo_sessiond_client_waitevent_NOTNEED
#define bo_sessiond_client_ping_NOTNEED
#include "proto/bo-sessiond_client.protoch"
#define bolixod_client_registernode_NOTNEED
#define bolixod_client_nodelogout_NOTNEED
#define bolixod_client_nodelogin_NOTNEED
#define bolixod_client_nodepass_NOTNEED
#define bolixod_client_publish_NOTNEED
#define bolixod_client_publish_v2_NOTNEED
#define bolixod_client_publish_file_NOTNEED
#define bolixod_client_publish_rest_NOTNEED
#define bolixod_client_remove_NOTNEED
#define bolixod_client_recordemail_NOTNEED
#define bolixod_client_getnode_NOTNEED
#define bolixod_client_newacct_findnode_NOTNEED
#define bolixod_client_pub_search_NOTNEED
#define bolixod_client_pub_search_v2_NOTNEED
#define bolixod_client_pub_list_NOTNEED
#define bolixod_client_readfile_NOTNEED
#define bolixod_client_readfile_v2_NOTNEED
#include "proto/bolixod_client.protoch"
#define publishd_client_sendmessage_NOTNEED
#include "proto/publishd_client.protoch"
#define documentd_client_save_NOTNEED
#define documentd_client_savemore_NOTNEED
#define documentd_client_load_NOTNEED
#define documentd_client_loadmore_NOTNEED
#define documentd_client_playstep_NOTNEED
#define documentd_client_playstep_more_NOTNEED
#define documentd_client_rename_NOTNEED
#define documentd_client_startgame_NOTNEED
#define documentd_client_endgame_NOTNEED
#define documentd_client_is_modified_NOTNEED
#define documentd_client_waitevent_NOTNEED
#define documentd_client_removesession_NOTNEED
#include "proto/documentd_client.protoch"
#define bo_keysd_control_genkey_NOTNEED
#define bo_keysd_control_setpassphrase_NOTNEED
#define bo_keysd_control_checkpassphrase_NOTNEED
#define bo_keysd_control_sign_NOTNEED
#define bo_keysd_control_quit_NOTNEED
#define bo_keysd_control_debug_NOTNEED
#define bo_keysd_control_debugfile_NOTNEED
#define bo_keysd_control_runstatus_NOTNEED
#include "proto/bo-keysd_control.protoch"
#define bo_websocket_control_status_NOTNEED
#define bo_websocket_control_quit_NOTNEED
#define bo_websocket_control_debug_NOTNEED
#define bo_websocket_control_debugfile_NOTNEED
#define bo_websocket_control_instrument_NOTNEED
#define bo_websocket_control_disconnectworkers_NOTNEED
#define bo_websocket_control_pause_NOTNEED
#define bo_websocket_control_resume_NOTNEED
#define bo_websocket_control_set_debug_header_NOTNEED
#include "proto/bo-websocket_control.protoch"
#include "proto/bo-mon_control.protoh"
// Prefixes placed in front of every report line produced by the tests:
// ok_marker for a check that passed, err_marker for a check that failed.
// NOTE(review): ok_marker may originally have been padded to the width of
// err_marker so the report stays aligned - confirm against the original file.
static const char *ok_marker = " ";
static const char *err_marker = "-> ";
/*
	Check the health of the local host: free swap, load average and free
	disk space on "/".
	One report line is appended to out for each check performed, prefixed
	with ok_marker or err_marker.
	Return -1 if the swap or the load average check failed, 0 otherwise.
	NOTE(review): the statements made only of a parenthesised argument list
	look like TLMP component calls whose markup is missing in this view;
	the code following each one (up to "return 0;") is presumably executed
	once per line read from the named file - confirm against the original.
*/
static int test_system(vector &out)
{
glocal int ret = 0;
glocal vector *out = &out;
// Swap check: look for the "SwapFree:" entry of /proc/meminfo
("/proc/meminfo",true);
vector tb;
int n = str_splitline(line,' ',tb);
if (n == 3){
if (tb[0] == "SwapFree:"){
// The check passes when more than 50000 kB of swap are free
unsigned long freekb = atol(tb[1].c_str());
bool ok = freekb > 50000;
glocal.out->push_back(string_f("%sSwapFree ok=%d freekb=%lu"
,ok ? ok_marker : err_marker
,ok,freekb));
if (!ok) glocal.ret = -1;
}
}
return 0;
// Load check: the third field of /proc/loadavg must be below 0.5
("/proc/loadavg",true);
vector tb;
int n = str_splitline(line,' ',tb);
if (n >= 3){
float avg = atof(tb[2].c_str());
bool ok = avg < 0.5;
glocal.out->push_back(string_f("%sloadavg ok=%d %s"
,ok ? ok_marker : err_marker
,ok,line));
if (!ok) glocal.ret = -1;
}
return 0;
// Disk check: passes when more than 20 gigabytes are free or when more
// than a quarter of the blocks of "/" are free.
// Nothing is reported if statfs() fails.
// NOTE(review): unlike the two checks above, a failure here is reported
// with err_marker but does not change the return value - confirm this is
// intended.
struct statfs st;
if (statfs ("/",&st)!=-1){
const unsigned long long gig = 1024*1024*1024;
unsigned long long space = st.f_bfree * st.f_bsize;
bool ok = space > 20*gig || st.f_bfree > (st.f_blocks/4);
out.push_back(string_f("%sdiskfree ok=%d f_blocks=%lu f_bfree=%lu space=%lf"
,ok ? ok_marker : err_marker
,ok,st.f_blocks,st.f_bfree,(double)space/gig));
}
return glocal.ret;
}
/*
	Secrets used by the monitor to authenticate with the services it tests.
	Each field is filled from the command line option of the same name
	(--bod-secret, --bolixod-secret, --publishd-secret, --documentd-secret).
*/
struct SECRETS{
	std::string bod;		// Secret for the bod server
	std::string bolixod;	// Secret for bolixod
	std::string publishd;	// Secret for publishd
	std::string documentd;	// Secret for documentd
};
/*
	Run one complete monitoring pass.
	The local system checks (test_system) are done first, then one test is
	performed for each socket path in socks. The kind of test is selected
	from sub-strings found in the path.
	socks: paths of the unix sockets to test.
	secrets: secrets used to connect to the services.
	debug: when true, elapsed time lines are added to the report.
	out: cleared, then receives the report, one line per check.
	last_error: reset to 0, then set to the time of the failure when a
	check fails.
	Return -1 if a check signaled an error, 0 otherwise.
	NOTE(review): the statements made only of a parenthesised argument list
	(for example "(con);") look like TLMP component calls whose markup is
	missing in this view. The variables used right after them
	(internal_error, success, ...) are presumably the fields of the reply
	received from the service - confirm against the original source.
*/
static int test_loop (
const vector &socks,
const SECRETS &secrets,
bool debug,
vector &out,
time_t &last_error)
{
glocal last_error;
glocal int ret = 0;
glocal out;
glocal function seterror;
out.clear();
long long start = fdpass_getnow();
long long last_now = start;
last_error = (time_t)0;
// Every failed check goes through seterror(): it records the time
// of the failure and makes the function return -1.
glocal.seterror = [&](){
last_error = time(nullptr);
glocal.ret = -1;
};
if (test_system (out)==-1) glocal.seterror();
for (auto &s:socks){
if (debug){
// Time elapsed since the start of the pass and since the previous
// debug line (fdpass_getnow() presumably returns microseconds,
// given the split with 1000000).
long long now = fdpass_getnow();
long long diff = now - start;
long long diff_now = now - last_now;
out.push_back(string_f("%sdebug=%Lu.%06Lu last=%Lu.%06Lu",ok_marker
,diff/1000000,diff%1000000
,diff_now/1000000,diff_now%1000000));
last_now = now;
}
glocal const char *path = s.c_str();
debug_printf ("bo-mon path=%s\n",glocal.path);
CONNECT_INFO con;
con.port = glocal.path;
con.set_timeout (5);
//out.push_back(string_f("Try to connect to %s",glocal.path));
if (strstr(glocal.path,"-sessiond")!=NULL){
// Only the admin socket of sessiond is tested, other sessiond
// sockets produce no report line.
if (strstr(glocal.path,"-admin")!=NULL){
con.secret = secrets.bod;
(con);
glocal.out.push_back(string_f("%s%s: internal_error=%d success=%d"
,success ? ok_marker : err_marker
,glocal.path,internal_error,success));
if (!success) glocal.seterror();
}
}else if (strstr(glocal.path,"-bolixod")!=NULL){
con.secret = secrets.bolixod;
(con);
const char *marker = ok_marker;
if (internal_error || !sessiond || !db || !fsok){
glocal.seterror();
marker = err_marker;
}
glocal.out.emplace_back(string_f("%s%s: internal_error=%d sessiond=%d db=%d fsok=%d msg=%s"
,marker,glocal.path,internal_error,sessiond,db,fsok,msg));
}else if (strstr(glocal.path,"-publishd")!=NULL){
con.secret = secrets.publishd;
glocal string secret = con.secret;
// A temporary file named after the current time is handed to the
// publishd test and removed once the test is done.
// NOTE(review): the code creating the file content is missing in
// this view.
string testfile = string_f("/var/lib/bolixo/test-%ld",time(NULL));
(testfile,false);
return 0;
(con,testfile);
const char *marker = ok_marker;
if (internal_error || !dbfiles || !fsok){
glocal.seterror();
marker = err_marker;
}
glocal.out.emplace_back(string_f("%s%s: internal_error=%d dbfiles=%d fsok=%d"
,marker,glocal.path,internal_error,dbfiles,fsok));
unlink (testfile.c_str());
}else if (strstr(glocal.path,"-documentd")!=NULL){
con.secret = secrets.documentd;
(con);
const char *marker = ok_marker;
if (internal_error || !success){
glocal.seterror();
marker = err_marker;
}
glocal.out.emplace_back(string_f("%s%s: internal_error=%d success=%d"
,marker,glocal.path,internal_error,success));
}else if (strstr(glocal.path,"-keysd")!=NULL){
// keysd is contacted without secret. The test passes when one of
// the lines received is exactly "passphrase set".
con.secret = "";
(con);
const char *msg = "";
const char *marker = ok_marker;
bool found = false;
for (auto l:lines){
if (strcmp(l,"passphrase set")==0){
found = true;
break;
}
}
if (internal_error){
glocal.seterror();
marker = err_marker;
}else if (!found){
glocal.seterror();
marker = err_marker;
msg = "passphrase NOT set";
}
glocal.out.emplace_back(string_f("%s%s: internal_error=%d %s "
,marker,glocal.path,internal_error,msg));
}else if (strstr(glocal.path,"-bod")!=NULL){
// Only the admin socket of bod is tested, other bod sockets
// produce no report line.
if (strstr(glocal.path,"-admin")!=NULL){
con.secret = secrets.bod;
(con);
if (internal_error){
glocal.out.push_back(string_f("%sCan't talk to bod server: %s"
,err_marker,glocal.path));
glocal.seterror();
}else{
// Every flag reported by bod must be set and admin_sess_valid
// must be 2 for the test to pass.
bool ok = true;
if (!writed
|| !bdfiles1
|| !bdfiles2
|| !bdusers
|| !sessiond1
|| !sessiond2
|| !keysd
|| !fsok
|| !publish_dbfiles
|| !publish_fsok
|| !documentd
|| admin_sess_valid!=2){
glocal.seterror();
ok = false;
}
glocal.out.push_back(string_f("%s%s: internal_error1=%d writed=%d bdfiles1=%d"
" bdfiles2=%d bdusers=%d sessiond1=%d sessiond2=%d keysd=%d fsok=%d"
" publish_dbfiles=%d publish_fsok=%d docd=%d adm_sess=%u"
,ok ? ok_marker : err_marker
,glocal.path,internal_error1,writed,bdfiles1
,bdfiles2,bdusers,sessiond1,sessiond2,keysd,fsok
,publish_dbfiles,publish_fsok,documentd,admin_sess_valid));
}
}
}else if (strstr(glocal.path,"web-80-")!=NULL || strstr(glocal.path,"web-fail-80-")!=NULL){
// Web server test: an HTTP request is sent on the unix socket and
// the test passes when a line equal to "ok" is received.
// NOTE(review): the "can't connect" report below presumably belongs
// to a connection failure handler whose markup is missing here.
glocal bool okseen = false;
("unix:",glocal.path,5);
send ("GET /index.hc?test=1 HTTP/1.0\r\n\r\n");
if (strcmp(line,"ok")==0){
glocal.out.push_back (string_f("%s%s: ok seen",ok_marker,glocal.path));
glocal.okseen = true;
}
glocal.out.push_back (string_f("%s%s: can't connect",err_marker,glocal.path));
glocal.seterror();
if (!glocal.okseen){
glocal.out.push_back(string_f("%s%s: Ok not seen",err_marker,glocal.path));
glocal.seterror();
}
}else if (strstr(glocal.path,"websocket-")!=nullptr){
(con);
if (internal_error || !ok){
glocal.out.push_back(string_f("%s%s: test failed",err_marker,glocal.path));
glocal.seterror();
}else if (paused){
// NOTE(review): a paused server is reported with err_marker but
// seterror() is not called, so it does not fail the pass -
// confirm this is intended.
glocal.out.push_back(string_f("%s%s: test failed, workers=%u paused=%d",err_marker,glocal.path,workers,paused));
}else{
glocal.out.push_back (string_f("%s%s: test ok, workers=%u, paused=%d",ok_marker,glocal.path,workers,paused));
}
}else if (strstr(glocal.path,"udpproxy")!=nullptr){
// udpproxy test: the "status" command is sent. The value following
// "nbreq=" is remembered and the test passes when a line equal to
// "Ok" is received.
glocal int nbreq = 0;
glocal bool ok = false;
("unix:",glocal.path,5);
sendf ("status\n");
const char *pt = nullptr;
if (strcmp(line,"Ok")==0){
glocal.out.push_back (string_f("%s%s: test ok, nbreq=%d",ok_marker,glocal.path,glocal.nbreq));
glocal.ok = true;
end = true;
}else if (is_start_any_of(line,pt,"nbreq=")){
glocal.nbreq = atoi(pt);
}
if (!glocal.ok){
glocal.out.push_back(string_f("%s%s: test failed, nbreq=%d",err_marker,glocal.path,glocal.nbreq));
glocal.seterror();
}
}else{
// Unknown kind of socket: let trlitool_mon() try it. If it added
// nothing to the report, the socket is flagged as not handled.
auto size = out.size();
time_t last;
if (trlitool_mon(glocal.path,out,last)==-1){
if (last > glocal.last_error) glocal.last_error = last;
glocal.ret = -1;
}
if (out.size() == size){
glocal.out.push_back(string_f("%sDon't know how to handle this socket; %s",err_marker,glocal.path));
}
}
}
if (debug){
// Final timing line, covering the last socket tested
long long now = fdpass_getnow();
long long diff = now - start;
long long diff_now = now - last_now;
out.push_back(string_f("%sdebug=%Lu.%06Lu last=%Lu.%06Lu",ok_marker
,diff/1000000,diff%1000000
,diff_now/1000000,diff_now%1000000));
last_now = now;
}
return glocal.ret;
}
/*
	Write the string s followed by a newline to the file descriptor fd.
	The line and its newline are sent as a single buffer, and the write is
	resumed after a short write or an interruption by a signal (EINTR), so
	a report line is never silently truncated.
	Any other write error stops the transfer: this is a best effort
	operation and nothing is reported to the caller.
*/
static void trli_mon_sendline (int fd, const std::string &s)
{
	std::string line;
	line.reserve (s.size()+1);
	line = s;
	line += '\n';
	const char *pt = line.data();
	size_t remain = line.size();
	while (remain > 0){
		ssize_t nb = write (fd,pt,remain);
		if (nb > 0){
			pt += nb;
			remain -= (size_t)nb;
		}else if (nb == -1 && errno == EINTR){
			// Interrupted before anything was written, try again
			continue;
		}else{
			// Real error (or no progress at all), give up
			break;
		}
	}
}
/*
	Send the report out by mail to the administrator.
	The recipient is the value of the ADMIN1= line found in the file
	admins_conf. The mail is sent from no-reply@bolixo.org using
	fdpass_sendmail() with the given mailserver and mailport. The body is
	made of the lines of out, each terminated by CR LF.
	Return the result of fdpass_sendmail(), or -1 when no administrator
	was found (nothing is sent in that case).
	NOTE(review): "(admins_conf,true);" looks like a TLMP component call
	whose markup is missing in this view; the code up to "return 0;" is
	presumably executed once per line of the file - confirm against the
	original source.
*/
static int trli_mon_sendmail (
const char *mailserver,
const char *mailport,
const char *admins_conf,
const char *subject,
const vector &out)
{
glocal string admin;
(admins_conf,true);
// Keep what follows "ADMIN1=" as the recipient
if (strncmp(line,"ADMIN1=",7)==0){
glocal.admin = line+7;
}
return 0;
int ret = -1;
if (glocal.admin.size() > 0){
string body;
for (auto &s:out) body += string_f("%s\r\n",s.c_str());
ret = fdpass_sendmail (mailserver,mailport,"no-reply@bolixo.org",glocal.admin,subject,body);
}
return ret;
}
/*
	Entry point of bo-mon.
	The sockets to test are collected from the directory sock_dir.
	Without --daemon, test_loop() is executed once, the report is printed
	when --verbose is set and the result of the test becomes the exit code.
	With --daemon, the program serves its control socket and, every
	testdelay seconds, forks a child which runs test_loop() and sends the
	result back over a pipe. When a run fails, the alarm command is run
	(if one was supplied) and a mail is sent, unless an alert was already
	sent during the last quiettime seconds.
	NOTE(review): the statements made only of a parenthesised argument list
	look like TLMP component calls whose markup is missing in this view, and
	the headers of the callbacks nested in them are missing as well. The
	comments below describe what each group of statements does, the exact
	callback each group belongs to must be confirmed against the original
	source. The second "glocal int ret" and the duplicated final return
	presumably come from that nesting.
*/
int main (int argc, char *argv[])
{
glocal int ret = -1;
glocal const char *mailserver = "unix:";
glocal const char *mailport = "/dev/smtp.sock";
glocal const char *command = NULL;
glocal const char *admins_conf = "/etc/bolixo/admins.conf";
glocal const char *control = "/var/run/blackhole/bo-mon.sock";
glocal const char *sock_dir = NULL;
glocal SECRETS secrets;
glocal bool verbose = false;
glocal bool daemon = false;
glocal const char *user = "trli";
glocal int sleepdelay=5; // Wake up every N seconds
glocal int testdelay = 30; // Executes test every 30 seconds
glocal int quiettime = 60*60; // Wait that amount of time before sending an alert
glocal const char *pidfile = "/var/run/trli-mon.pid";
glocal.ret = (argc,argv,"bolixo");
// Declaration of the command line options
setproginfo ("bo-mon",VERSION,MSG_U(I_BO_MON,"Monitoring service for Bolixo"));
setarg ('d',"sock_dir","Directoy holding the unix socket to connect to all services",glocal.sock_dir,true);
setarg (' ',"bod-secret","Secret needed to connect to bod",glocal.secrets.bod,true);
setarg (' ',"bolixod-secret","Secret needed to connect to bolixod",glocal.secrets.bolixod,true);
setarg (' ',"publishd-secret","Secret needed to connect to publishd",glocal.secrets.publishd,true);
setarg (' ',"documentd-secret","Secret needed to connect to documentd",glocal.secrets.documentd,true);
setarg ('v',"verbose","Display more information",glocal.verbose,false);
setarg (' ',"control","Unix socket path",glocal.control,false);
setarg (' ',"alarmcmd","Command used to send an alarm",glocal.command,false);
setgrouparg ("Daemon mode");
setarg (' ',"daemon","Runs in background",glocal.daemon,false);
setarg (' ',"user","Runs as this user",glocal.user,false);
setarg (' ',"pidfile","PID file",glocal.pidfile,false);
setgrouparg ("Misc.");
setarg (' ',"testdelay","Test every N seconds",glocal.testdelay,false);
setarg (' ',"wakeup","Wakeup every N seconds",glocal.sleepdelay,false);
setarg (' ',"mailserver","Mail relay to use",glocal.mailserver,false);
setarg (' ',"mailport","TCP port of the relay (or unix socket)",glocal.mailport,false);
setarg (' ',"quiettime","Minimum delay before sendming a new mail alert",glocal.quiettime,false);
// Messages go to syslog in daemon mode, to stderr otherwise.
// The first group logs at LOG_ERR, the second at LOG_WARNING.
if (glocal.daemon){
syslog (LOG_ERR,"%s",msg);
}else{
fprintf (stderr,"%s",msg);
}
if (glocal.daemon){
syslog (LOG_WARNING,"%s",msg);
}else{
fprintf (stderr,"%s",msg);
}
glocal int ret = 0;
glocal vector socks;
signal (SIGCHLD,SIG_IGN);
// Collect the paths found in sock_dir, sorted so the report
// always comes out in the same order.
(glocal.sock_dir);
glocal.socks.push_back(path);
sort (glocal.socks.begin(),glocal.socks.end());
if (glocal.socks.size()==0){
glocal.ret = -1;
tlmp_error ("No socket found in directory %s\n",glocal.sock_dir);
}else if (!glocal.daemon){
// One shot mode: a single pass, its result is the exit code
vector out;
time_t last_error;
glocal.ret = test_loop (glocal.socks,glocal.secrets,false,out,last_error);
if (glocal.verbose) for (auto &x:out) printf ("%s\n",x.c_str());
}else{
// Daemon mode.
// allok/testout hold the result of the last completed run, the new_
// variables receive the result of the run in progress.
glocal unsigned long nbtest = 0;
glocal bool teston = true;
glocal bool allok = true;
glocal vector testout;
glocal bool new_allok = true; // Reception of current test
glocal time_t new_last_error = (time_t)0;
glocal vector new_testout;
glocal pid_t new_testpid = (pid_t)-1;
glocal time_t lastmsg = (time_t)0;
glocal time_t lasttest = time(NULL);
glocal time_t start_time = time(nullptr); // Avoid sending email for old errors
// since syslog send the last_error timestamp
(string_f("unix:%s",glocal.control),5);
// A HANDLE_INFO of type TYPE_CONTROL is attached to the handle
HANDLE_INFO *n = new HANDLE_INFO;
n->type = TYPE_CONTROL;
info.data = n;
// When the handle is the pipe of a test run, the result received
// becomes the current status and an alert is sent if needed.
HANDLE_INFO *n = (HANDLE_INFO*)info.data;
if (n->type == TYPE_PIPE){
glocal.allok = glocal.new_allok;
glocal.testout = glocal.new_testout;
// Alert only for an error newer than the start of the program
// and not within quiettime seconds of the previous alert.
if (!glocal.allok
&& glocal.new_last_error > glocal.start_time
&& glocal.new_last_error > glocal.lastmsg+glocal.quiettime){
glocal.lastmsg = time(NULL);
if (glocal.command != NULL){
// The report is written to the alarm command
(glocal.command,10);
for (auto &s:glocal.testout){
fprintf (fout,"%s\n",s.c_str());
}
end = true;
return 0;
}
const char *subject = glocal.new_testpid == (pid_t)-1 ? "bolixo monitoring failed" : "bolixo monitoring";
trli_mon_sendmail(glocal.mailserver,glocal.mailport,glocal.admins_conf,subject,glocal.testout);
}
//tlmp_error ("endclient pid=%u start %lu now %lu\n",n->pid,n->start,time(NULL));
if (n->pid == glocal.new_testpid) glocal.new_testpid = (pid_t)-1;
}
// Processing of one line received on a handle
HANDLE_INFO *c = (HANDLE_INFO*)info.data;
if (c->type == TYPE_CONTROL){
// Control protocol. Each group of statements below is presumably
// the handler of one command of bo-mon_control.
(this,c->req,line, info.linelen,endserver, endclient, no,c);
// Status report
// NOTE(review): glocal.command is NULL unless --alarmcmd was given,
// passing it to %s relies on string_f() accepting a null pointer.
vector tb;
tb.push_back(string_f("Version %s",VERSION));
tb.push_back(string_f("testdelay %d",glocal.testdelay));
tb.push_back(string_f("autotest %s",glocal.teston ? "On" : "Off"));
tb.push_back(string_f("services %s",glocal.allok ? "OK" : "Fail"));
tb.push_back(string_f("nbtest %lu",glocal.nbtest));
tb.push_back(string_f("alarm command %s",glocal.command));
instrument_status (tb);
DATEASC date;
date.buf[0] = '\0';
if (glocal.lastmsg != (time_t)0) fdpass_asctime(glocal.lastmsg,date);
tb.push_back(string_f("alarm sent %s",date.buf));
fdpass_asctime (glocal.lasttest,date);
tb.push_back(string_f("last test %s",date.buf));
fdpass_asctime (glocal.new_last_error,date);
tb.push_back(string_f("last error %s",date.buf));
fdpass_asctime (glocal.start_time,date);
tb.push_back(string_f("start time %s",date.buf));
tb.push_back(string_f("quiet time %d",glocal.quiettime));
for (auto &x:glocal.testout) tb.push_back(x);
rep_status(tb);
// Instrumentation log on/off
toggle_instrument_file(on,"/tmp/instrument-mon.log");
// Periodic tests on/off
glocal.teston = teston;
// Ends the server
endserver = true;
// Test on demand, executed in this process, result sent as the reply
vector out;
time_t last_error;
if (test_loop(glocal.socks,glocal.secrets,debug,out,last_error)==-1){
rep_test (false,out);
}else{
rep_test (true,out);
}
// Test on demand with the report sent by mail whatever the result
vector out;
time_t last_error;
int ok = test_loop(glocal.socks,glocal.secrets,false,out,last_error);
out.insert (out.begin(),string_f("ok=%d",ok == -1 ? 0 : 1));
int ret = trli_mon_sendmail (glocal.mailserver,glocal.mailport,glocal.admins_conf,"test email monitoring",out);
rep_testmail (ret == -1 ? false:true);
// Forget the last alert, so the next failure is reported at once
glocal.lastmsg = (time_t)0;
// Debug output on/off
if (on){
debug_seton();
}else{
debug_setoff();
}
// File receiving the debug output
debug_setfdebug (filename);
// New quiet time
glocal.quiettime = quiettime;
endclient = true;
}else if (c->type == TYPE_PIPE){
// Result sent by the child running test_loop(): an "ok=" line
// carrying the return value of test_loop(), a "last=" line carrying
// the time of the last error, any other line is part of the report.
// NOTE(review): the time is parsed with atoi() although it is sent
// with %ld - confirm the range is sufficient.
if (strncmp(line,"ok=",3)==0){
int ok = atoi(line+3);
glocal.new_allok = ok == -1 ? false : true;
}else if (strncmp(line,"last=",5)==0){
glocal.new_last_error = atoi(line+5);
}else{
glocal.new_testout.push_back(line);
}
}
if (o.is_ok()){
glocal o;
o.setrawmode(true);
daemon_init (glocal.pidfile,glocal.user);
// Periodic processing (the sleepdelay variable is documented above
// as "Wake up every N seconds")
(glocal.sleepdelay);
time_t now = time(NULL);
if (glocal.teston && (now-glocal.lasttest) >= glocal.testdelay){
// Time for a new run. A previous run still in progress is killed.
if (glocal.new_testpid != (pid_t)-1){
tlmp_error ("test_loop did not complete, pid=%u\n",glocal.new_testpid);
if (kill (glocal.new_testpid,SIGKILL)==-1){
tlmp_error ("Can't kill pid %u (%s)\n",glocal.new_testpid,strerror(errno));
}
glocal.new_testpid = (pid_t)-1;
}
glocal.nbtest++;
glocal.lasttest = now;
// The run is considered failed until the child says otherwise
glocal.new_allok = false;
glocal.new_testout.clear();
int tb[2];
if (pipe(tb)==-1){
tlmp_error ("Can't setup pipe for test loop (%s)\n",strerror(errno));
}else{
pid_t pid = fork();
if (pid == (pid_t)0){
// Child: run the tests and send the result through the pipe
close (tb[0]);
vector out;
time_t last_error;
int ok = test_loop(glocal.socks,glocal.secrets,false,out,last_error);
trli_mon_sendline (tb[1],string_f("ok=%d",ok));
trli_mon_sendline (tb[1],string_f("last=%ld",last_error));
for (auto l:out) trli_mon_sendline(tb[1],l);
_exit (0);
}else if (pid == (pid_t)-1){
tlmp_error ("Can't fork for test loop (%s)\n",strerror(errno));
close (tb[1]);
close (tb[0]);
}else{
// Parent: the read end of the pipe is handed to the event loop,
// tagged TYPE_PIPE with the pid of the child.
close (tb[1]);
HANDLE_INFO *n = new HANDLE_INFO;
n->type = TYPE_PIPE;
n->pid = pid;
glocal.o.inject (tb[0],n);
glocal.o.setrawmode(tb[0],false);
glocal.new_testpid = pid;
}
}
}
netevent_loop(o,idle);
}
}
return glocal.ret;
return glocal.ret;
}