/* Copyright Jacques Gelinas jack@solucorp.qc.ca Distributed under the Gnu Public License, see the License file in this package. */ /* chcontext is a wrapper to user the new_s_context system call. It does little more than mapping command line option to the system call arguments. */ #include #include #include #include #include #include #include #include "vutil.h" #include #include #include "linux/context_cmd.h" #include #include #include #ifndef CAP_QUOTACTL #define CAP_QUOTACTL 29 #endif #ifndef CAP_AUDIT_WRITE #define CAP_AUDIT_WRITE 29 #endif #ifndef CAP_AUDIT_CONTROL #define CAP_AUDIT_CONTROL 30 #endif static void usage() { fprintf (stderr,"chcontext version %s\n",VERSION); fprintf (stderr ,"chcontext [ options ] command arguments ...\n" "\n" "chcontext allocate a new security context and executes\n" "a command in that context.\n" "By default, a new/unused context is allocated\n" "\n" "--debug\n" "--arch ARCH\n" "\tSet the architecture of the context (utsname.machine)\n" "\n" "--cap CAP_NAME\n" "\tAdd a capability from the command. This option may be\n" "\trepeated several time.\n" "\tSee /usr/include/linux/capability.h\n" "\tIn general, this option is used with the --secure option\n" "\t--secure removes most critical capabilities and --cap\n" "\tadds specific ones.\n" "\n" "--cap !CAP_NAME\n" "\tRemove a capability from the command. This option may be\n" "\trepeated several time.\n" "\tSee /usr/include/linux/capability.h\n" "\n" "--ctx num\n" "\tSelect the context. On root in context 0 is allowed to\n" "\tselect a specific context.\n" "\tContext number 1 is special. It can see all processes\n" "\tin any contexts, but can't kill them though.\n" "\tOption --ctx may be repeated several times to specify up to 16 contexts.\n" "--disconnect\n" "\tStart the command in background and make the process\n" "\ta child of process 1.\n" "--domainname new_domainname\n" "\tSet the domainname (NIS) in the new security context.\n" "\tUse \"none\" to unset the domain name.\n" "--flag\n" "\tSet one flag in the new or current security context. The following\n" "\tflags are supported. The option may be used several time.\n" "\n" "\tfakeinit: The new process will believe it is process number 1.\n" " Useful to run a real /sbin/init in a vserver.\n" "\tlock: The new process is trapped and can't use chcontext anymore.\n" "\tsched: The new process and its children will share a common \n" " execution priority.\n" "\tmount: Hide /proc/mounts content\n" "\tnetif: Hide network devices not owned by this context\n" "\tnproc: Limit the number of process in the vserver according to\n" " ulimit setting. Normally, ulimit is a per user thing.\n" " With this flag, it becomes a per vserver thing.\n" "\tprivate: No one can join this security context once created.\n" "\tulimit: Apply the current ulimit to the whole context\n" "--hostname new_hostname\n" "\tSet the hostname in the new security context\n" "\tThis is need because if you create a less privileged\n" "\tsecurity context, it may be unable to change its hostname\n" "--pscript command\n" "\tExecute a privileged script (usually sysctl)\n" "--sscript command\n" "\tExecute a privileged script before loosing capabilities (mount /proc for example)\n" "--secure\n" "\tRemove all the capabilities to make a virtual server trustable\n" "--silent\n" "\tDo not print the allocated context number.\n" "\n" "--controlsock unix_socket_path\n" "\tThe vserver init process will setup a unix socket, listening for command\n" "\tThe vservertalk utility is used to send command to the init process\n" "\t(Used by vserver kill)\n" "Information about context is found in /proc/self/status\n"); } extern "C" int call_newpidspace(); void debug_printf (const char *ctl, ...); extern int debug; /* * Check if a vserver context is alive */ static bool chcontext_exist(int ctx) { bool ret = false; char path[PATH_MAX]; snprintf (path,sizeof(path),"/proc/virtual/%d/status",ctx); FILE *fin = fopen (path,"r"); if (fin != NULL){ char buf[100]; if (fgets(buf,sizeof(buf)-1,fin)!=NULL){ ret = true; } fclose (fin); } return ret; } #if 0 static int fork_or_clone(int ctx) { int ret = -1; if (!chcontext_exist (ctx)){ // The context does not exist, we clone to establish a new pid namespace ret = call_newpidspace(); }else{ ret = fork(); } return ret; } #endif struct CHCONTEXT_OPTIONS{ const char *vname; // Vserver name const char *hostname; const char *domainname; struct utsname uts; const char *pscript; // Script to execute in the namespace of the vserver // but with full privilege of the host const char *sscript; // Script to execute in the context, before loosing privileges bool silent; unsigned long long flags; bool fakeinit; unsigned remove_cap; unsigned add_cap; const char *unixsocket; CHCONTEXT_OPTIONS(){ hostname = NULL; domainname = NULL; pscript = NULL; sscript = NULL; memset (&uts,0,sizeof(uts)); uname (&uts); silent = false; flags = 0; fakeinit = false; remove_cap = 0; add_cap = 0; vname = "unknown"; unixsocket = NULL; } }; static void chcontext_sethost (CHCONTEXT_OPTIONS &opt) { if (opt.hostname != NULL){ if (sethostname (opt.hostname,strlen(opt.hostname))==-1){ fprintf (stderr,"Can't set the host name (%s)\n" ,strerror(errno)); }else if (!opt.silent){ printf ("Host name is now %s\n",opt.hostname); } } if (opt.domainname != NULL){ setdomainname (opt.domainname,strlen(opt.domainname)); if (!opt.silent){ printf ("Domain name is now %s\n",opt.domainname); } } if (opt.uts.machine[0] != '\0') call_setarch(opt.uts.machine); if (opt.uts.release[0] != '\0') call_setkrelease(opt.uts.release); if (opt.sscript != NULL){ int ok = system (opt.sscript); if (ok != 0) fprintf (stderr,"Secure script %s failed\n",opt.sscript); } } static void rewrite (char *argv[], const char *s) { for (int i=0; argv[i] != NULL; i++){ char *pt = argv[i]; while (*pt != '\0') *pt++ = '\0'; } strcpy (argv[0],s); } /* * Enter an existing context */ static int chcontext_enter ( int ctx, char *argv0[], char *argv[]) { int fds[2]; int ctxs[]={ctx,-1}; int newctx = call_new_s_context(1,ctxs,0,0,0); if (newctx != ctx){ fprintf (stderr,"Can't enter context %d\n",ctx); }else if (pipe(fds)==-1){ fprintf (stderr,"can't setup pipe (%s)\n",strerror(errno)); }else{ pid_t pid = fork(); if (pid == (pid_t)0){ pid_t sub_pid = fork(); if (sub_pid == 0){ pid_t sub_sub_pid = fork(); if (sub_sub_pid == 0){ execvp (argv[0],argv); fprintf (stderr,"Can't exec %s (%s)\n",argv[0],strerror(errno)); _exit (1); }else if (sub_sub_pid == (pid_t)-1){ fprintf (stderr,"Can't fork (%s), aborting\n",strerror(errno)); exit (1); }else{ rewrite (argv0,""); int st; wait (&st); write (fds[1],&st,sizeof(st)); _exit (0); } }else if (sub_pid == (pid_t)-1){ fprintf (stderr,"Can't fork (%s), aborting\n",strerror(errno)); exit (1); }else{ // The exec process become a child of process 1 _exit (0); } }else if (pid == (pid_t)-1){ fprintf (stderr,"Can't fork (%s), aborting\n",strerror(errno)); exit (1); }else{ int st; wait (&st); // Just remove the defunct for the sub-pid close (fds[1]); if (read(fds[0],&st,sizeof(st)) != sizeof(st)){ fprintf (stderr,"chcontext enter: Can't read from pipe (%s)\n",strerror(errno)); exit (-1); } debug_printf ("enter fork ending st=%d\n",st); exit (WEXITSTATUS(st)); } } return -1; } static int fds[2]; static void fctterm (int) { FILE *fout = fopen ("/tmp/init.log","a"); if (fout != NULL){ fprintf (fout,"ending\n"); fclose (fout); } int st = 0; write (fds[1],&st,sizeof(st)); // This tells the calling chcontext to stop waiting _exit (0); } /* * Open the unix socket in listen mode */ static int chcontext_initunix(const char *sockn) { int ret = -1; unlink (sockn); int fd = socket (AF_UNIX,SOCK_STREAM,0); if (fd == -1){ perror("socket server"); }else{ struct sockaddr_un un; un.sun_family = AF_UNIX; strcpy (un.sun_path,sockn); if (bind(fd,(struct sockaddr*)&un,sizeof(un))==-1){ perror("bind"); }else{ chmod (sockn,0600); int code = ::listen (fd,10); if (code == -1){ perror ("listen"); }else{ ret = fd; } } } return ret; } static int child_events; void child_sig_handler (int x) { child_events++; int st; if (wait(&st)==-1){ fprintf (stderr,"wait -1 (%s)\n",strerror(errno)); } signal (SIGCHLD, child_sig_handler); } static void debug_init (const char *ctl, ...) { // This is the only way to see what is going on. Enable this only while developping. // After that, it is bad to keep this alive #if 0 FILE *fout = fopen ("/tmp/init.log","a"); if (fout != NULL){ va_list list; va_start (list,ctl); vfprintf (fout,ctl,list); va_end (list); fclose (fout); } #endif } /* * Create and enter a context * Return the execution status of the sub-command */ static int chcontext_create ( int ctx, bool disconnect, CHCONTEXT_OPTIONS &opt, char *argv0[], char *argv[]) { int ret = -1; /* Open a unix domain socket. This will be managed by the init process (See below for the wait loop). One sub-process of this program becomes the PID 1 of the newly created context. So it acts as the "init" process of the vserver. Its job (in a vserver) is very limited. It basically "wait" on child. When they are all dead, it exits and the vserver is gone. Now, we introduce a new functionnality. We open a unix socket with a path in the root (so invisible from process in the vserver). This socket is used to send commands to the init process. For now, the most interesting command is kill. This allows the master to kill from outside some processes in a vserver. This is useful when a vserver has maxed out its number of concurent process. It that case, you can't even "enter" a vserver since this is creating a sub-process. */ int fdsock = -1; if (opt.unixsocket != NULL){ fdsock = chcontext_initunix (opt.unixsocket); if (fdsock == -1){ fprintf (stderr,"Can't setup control unix socket %s, aborting\n",opt.unixsocket); exit (-1); } } int fdpscript[2]; if (opt.pscript != NULL){ /* How does it works... When creating a new context, the current process is instantly migrated to the new context and loose ability to fiddle with sysctl settings. So we use a trick. Before setting the new context, we fork. The child just wait, reading a pipe to learn the new context number. It then enter the newly created context namespace (not the context itself). It runs the privileged script and exits. The parent waits (see further in the source) for the child and then continue. */ if (socketpair(AF_UNIX,SOCK_STREAM,PF_UNIX,fdpscript) != -1){ pid_t pid = fork(); if (pid == (pid_t)0){ int ctx; int rt = -1; rewrite (argv0,""); int nbr = read(fdpscript[0],&ctx,sizeof(ctx)); debug_printf ("Before running pscript nbr %d ctx=%d\n",nbr,ctx); if (nbr!=sizeof(ctx)){ syslog (LOG_ERR,"pscript: failed to read context number (%m)"); }else if (call_enter_space(ctx)!=-1){ rt = system(opt.pscript); }else{ syslog (LOG_ERR,"pscript: Can't enter context %d (%m)\n",ctx); } debug_printf ("pscript done\n"); write (fdpscript[0],&rt,sizeof(rt)); _exit(rt); } }else{ fprintf (stderr,"Can't create pscript socketpair (%s). Aborting\n",strerror(errno)); exit (-1); } } /* Here is how it works. We can't rely on wait to tell if the children are ending Instead we wait on a message receive through a pipe. We have a intermediate process waiting for the child. Once the child ends, the process sends the exit status to us and we exit with it. */ pid_t master_pid=0; if (pipe(fds)==-1){ fprintf (stderr,"can't setup pipe (%s)\n",strerror(errno)); }else if ((master_pid=call_newpidspace())==(pid_t)-1){ fprintf (stderr,"can't fork (%s)\n",strerror(errno)); }else if (master_pid != 0){ int st; char tmp[100]; snprintf (tmp,sizeof(tmp),"",opt.vname); rewrite (argv0,tmp); debug_printf ("Master pid %d child %d\n",getpid(),master_pid); if (read(fds[0],&st,sizeof(st)) == -1){ fprintf (stderr,"Can't read the status of the sub-sub-process (%s)\n",strerror(errno)); }else{ debug_printf ("Master wait st=%d\n",st); } ret = st; }else{ //fprintf (stderr,"Sub-master pid %d -> %d\n",master_pid,getpid()); //system ("/bin/sh"); rewrite (argv0,""); const unsigned long long flagmask = (unsigned long long)-1; long long ADMFLG=VXF_STATE_SETUP|VXF_STATE_INIT|VXF_STATE_ADMIN; long long ADMFLG2=VXF_STATE_INIT|VXF_STATE_ADMIN; //(1ULL<<32)+(1ULL<<34)+(1ULL<<33); int ctxs[]={ctx,-1}; int newctx = call_new_s_context(1,ctxs,0,opt.flags|ADMFLG,flagmask); debug_printf ("nexctx %d pid %d\n",newctx,getpid()); if (newctx != -1){ if (opt.pscript != NULL){ write (fdpscript[1],&newctx,sizeof(newctx)); int status; if (read (fdpscript[1],&status,sizeof(status)) == -1){ syslog (LOG_ERR,"pscript: read wait error %m"); }else{ // fprintf (stderr,"Attend pscript %d\n",status); } close (fdpscript[0]); close (fdpscript[1]); } chcontext_sethost (opt); opt.remove_cap &= (~opt.add_cap); call_new_s_context (0,NULL,opt.remove_cap,opt.flags|ADMFLG2,flagmask); if (!opt.silent){ printf ("New security context is %d\n" ,ctxs[0] == -1 ? newctx : ctxs[0]); } fflush (stdout); fflush (stderr); pid_t pid = getpid(); pid_t ppid = getppid(); debug_printf ("MAIN pid=%d ppid=%d\n",pid,ppid); if (pid == 0){ // This code does not seem to happen anymore // This occurs when we are entering a running context (with pid namespace) // The current process kind of enter the PID namespace, but since this process // was not created in that namespace, it ends up with a PID=0. // Maybe it is a bug in the kernel. // Anyway, we are using this to differentiate between the first access to a namespace // and the other. During the first, we create a pseudo init process // and the other entries become a child of this one. debug_printf ("My pid is 0, forking to get a real PID\n"); // we fork() again and the current process will end // so the child will become a child of process 1 // The process 1 (below) ends when there is no more child // The goal is to have something simple to use // One can do: vserver xx enter // and then starts services, becoming child of process 1 // and then he can do another enter and kill all the service // The new shell, being a child of 1 won't die pid_t pid = fork(); if (pid == 0){ pid = fork(); if (pid == 0){ execvp (argv[0],argv); fprintf (stderr,"Can't exec %s (%s)\n",argv[0] ,strerror(errno)); _exit (1); }else if (pid == (pid_t)-1){ fprintf (stderr,"Can't triple fork (%s)\n",strerror(errno)); _exit (1); }else{ // We wait for the child and then use the pipe to talk to the grand parent // since the parent has left. debug_printf ("Process is pid %u\n",pid); rewrite (argv0,""); int st; wait(&st); write (fds[1],&st,sizeof(st)); } }else if (pid == (pid_t)-1){ fprintf (stderr,"Can't double fork (%s)\n",strerror(errno)); _exit (1); }else{ // this parents does nothing. Just end. _exit (0); } }else if (pid == 1){ // This is the creation of the context/pid_namespace // Process number one is special. If it dies, the whole namespace dies // It becomes the parent of all process loosing their parents in the namespace // So we create a special init process here. pid_t init_pid = fork(); if (init_pid==0){ pid_t sub_pid = fork(); if (sub_pid == 0){ execvp (argv[0],argv); fprintf (stderr,"Can't exec %s (%s)\n",argv[0] ,strerror(errno)); _exit(-1); }else if (sub_pid == (pid_t)-1){ fprintf (stderr,"Can't fork (%s)\n",strerror(errno)); }else{ rewrite (argv0,""); int st; wait(&st); write (fds[1],&st,sizeof(st)); _exit (st); } }else if (init_pid == (pid_t)-1){ fprintf (stderr,"Can't fork (%s)\n",strerror(errno)); }else{ close (0); close(1); close(2); setsid(); chroot ("."); debug_init ("pid 1 waiting\n"); signal (SIGINT,fctterm); if (fdsock == -1){ // We wait until all child complete int st; while (wait(&st)!=-1) debug_init ("pid1 wait %d\n",st); debug_init ("pid 1 ending\n"); //sleep (60); // No more process, ending }else{ signal (SIGCHLD, child_sig_handler); sigset_t sigmask, orig_sigmask; sigemptyset (&sigmask); sigaddset (&sigmask, SIGCHLD); sigprocmask (SIG_BLOCK, &sigmask, &orig_sigmask); while (1){ fd_set in; FD_ZERO (&in); FD_SET (fdsock,&in); int sel = pselect (fdsock+1,&in,NULL,NULL,NULL,&orig_sigmask); bool is_intr = sel == -1 && errno == EINTR; debug_init ("sel = %d is_set %d events %d is_intr %d\n",sel,FD_ISSET(fdsock,&in) ,child_events,is_intr); if (is_intr){ debug_init ("Check if there is another process\n"); bool must_end = false; while (1){ int st; int ok = waitpid (-1,&st,WNOHANG); debug_init ("ok=%d st=%d error=%s must_end %d\n",ok,st,strerror(errno),must_end); if (ok == 0){ // No process has ended, but there are still some remaining break; }else if (ok == -1){ must_end = errno == ECHILD; break; }else{ // ok > 0, a child ended // so we must check again if there are sub-processes alive #if 0 // The following code proves that it happens // The while(1) was added to remove all end status // and detect the -1 showing there is no more child ok = waitpid (-1,&st,WNOHANG); if (ok == -1 && errno == ECHILD){ FILE *fout = fopen ("/tmp/waitpid.log","a"); if (fout != NULL){ fprintf (fout,"waitpid bug\n"); fclose (fout); } } #endif } } if (must_end){ debug_init ("No more child, ending\n"); break; } }else if (sel > 0 && FD_ISSET(fdsock,&in)){ debug_init ("Message from vservertalk\n"); struct sockaddr_un sacc; socklen_t size=sizeof(sacc); int newclient = accept (fdsock,(struct sockaddr *)&sacc,&size); if (newclient != -1){ char buf[100]; int n = read (newclient,buf,sizeof(buf)); if (n > 0){ buf[n] = '\0'; debug_init ("Message is :%s:\n",buf); if (strcmp(buf,"quit")==0){ // We quit and all process are dying debug_init ("Ok message order to end\n"); break; } } } close (newclient); } } } int st = 0; write (fds[1],&st,sizeof(st)); // This tells the calling chcontext to stop waiting _exit (0); } ret = -1; }else{ fprintf (stderr,"??? My PID is %u, ending\n",pid); int st = 1; write (fds[1],&st,sizeof(st)); _exit (1); } }else{ perror ("Can't set the new security context\n"); } if (disconnect != 0) _exit(0); } return ret; } int main (int argc, char *argv0[]) { void show_nsproxy_init(); show_nsproxy_init(); int ret = -1; int i; int nbctx = 0; int ctxs[16]; int disconnect = 0; unsigned long secure = (1<= 16){ fprintf (stderr,"Too many context, max 16, ignored.\n"); }else{ ctxs[nbctx++] = atoi(opt); } i++; }else if (strcmp(arg,"--disconnect")==0){ disconnect = 1; }else if (strcmp(arg,"--silent")==0){ opts.silent = true; }else if (strcmp(arg,"--flag")==0){ if (strcmp(opt,"lock")==0){ opts.flags |= 1; }else if (strcmp(opt,"sched")==0){ opts.flags |= 2; }else if (strcmp(opt,"nproc")==0){ opts.flags |= 4; }else if (strcmp(opt,"private")==0){ opts.flags |= 8; }else if (strcmp(opt,"fakeinit")==0){ opts.fakeinit = false; opts.flags |= 16; }else if (strcmp(opt,"hideinfo")==0){ opts.flags |= 32; }else if (strcmp(opt,"ulimit")==0){ opts.flags |= 64; }else if (strcmp(opt,"mount")==0){ opts.flags |= 0x01000000; }else if (strcmp(opt,"netif")==0){ opts.flags |= 0x02000000; }else{ fprintf (stderr,"Unknown flag %s\n",opt); } i++; }else if (strcmp(arg,"--cap")==0){ static struct { const char *option; int bit; }tbcap[]={ // The following capabilities are normally available // to vservers administrator, but are place for // completeness {"CAP_CHOWN",CAP_CHOWN}, {"CAP_DAC_OVERRIDE",CAP_DAC_OVERRIDE}, {"CAP_DAC_READ_SEARCH",CAP_DAC_READ_SEARCH}, {"CAP_FOWNER",CAP_FOWNER}, {"CAP_FSETID",CAP_FSETID}, {"CAP_KILL",CAP_KILL}, {"CAP_SETGID",CAP_SETGID}, {"CAP_SETUID",CAP_SETUID}, {"CAP_SETPCAP",CAP_SETPCAP}, {"CAP_SYS_TTY_CONFIG",CAP_SYS_TTY_CONFIG}, {"CAP_LEASE",CAP_LEASE}, {"CAP_SYS_CHROOT",CAP_SYS_CHROOT}, // Those capabilities are not normally available // to vservers because they are not needed and // may represent a security risk {"CAP_LINUX_IMMUTABLE",CAP_LINUX_IMMUTABLE}, {"CAP_NET_BIND_SERVICE",CAP_NET_BIND_SERVICE}, {"CAP_NET_BROADCAST",CAP_NET_BROADCAST}, {"CAP_NET_ADMIN", CAP_NET_ADMIN}, {"CAP_NET_RAW", CAP_NET_RAW}, {"CAP_IPC_LOCK", CAP_IPC_LOCK}, {"CAP_IPC_OWNER", CAP_IPC_OWNER}, {"CAP_SYS_MODULE",CAP_SYS_MODULE}, {"CAP_SYS_RAWIO", CAP_SYS_RAWIO}, {"CAP_SYS_PACCT", CAP_SYS_PACCT}, {"CAP_SYS_ADMIN", CAP_SYS_ADMIN}, {"CAP_SYS_BOOT", CAP_SYS_BOOT}, {"CAP_SYS_NICE", CAP_SYS_NICE}, {"CAP_SYS_RESOURCE",CAP_SYS_RESOURCE}, {"CAP_SYS_TIME", CAP_SYS_TIME}, {"CAP_MKNOD", CAP_MKNOD}, {"CAP_QUOTACTL", CAP_QUOTACTL}, {"CAP_SYS_PTRACE", CAP_SYS_PTRACE}, {"CAP_AUDIT_CONTROL", CAP_AUDIT_CONTROL}, {"CAP_AUDIT_WRITE", CAP_AUDIT_WRITE}, {NULL,0} }; int j; unsigned *cap = &opts.add_cap; if (opt[0] == '!'){ cap = &opts.remove_cap; opt++; } for (j=0; tbcap[j].option != NULL; j++){ if (strcasecmp(tbcap[j].option,opt)==0){ *cap |= (1<