David van Moolenbroek 1122b28691 PM: add support for saved user/group IDs
This patch aims to synchronize the basic process user and group ID
management, as well as the set[ug]id(2) and sete[ug]id(2) behavior,
with NetBSD.  As it turns out, the main issue was missing support for
saved user and group IDs.  This support is now added.

Since NetBSD's userland, which we are importing, may rely on NetBSD
specifics when it comes to security, we choose not to deviate from
NetBSD's behavior in any way here.  A new test, test89, verifies the
correct behavior - it has been confirmed to pass on NetBSD as is.

Change-Id: I023935546d97ed01ffd8090f7793d336cceb0f4a
2016-03-12 17:46:06 +01:00
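For readers unfamiliar with the semantics being synchronized: with saved user IDs in place, a non-root set-user-ID program can drop its elevated effective UID and later regain it, because seteuid(2) also accepts the saved UID as a target. A minimal illustrative sketch of that pattern (not taken from the patch or from test89), assuming standard POSIX seteuid(2)/geteuid(2) behavior:

#include <stdio.h>
#include <unistd.h>

int
main(void)
{
    uid_t elevated = geteuid();    /* at exec time this is also recorded as the saved UID */

    /* Drop privileges: the effective UID becomes the real UID. */
    if (seteuid(getuid()) == -1)
        return 1;

    /* ... unprivileged work ... */

    /* Regain privileges: permitted only because the saved UID still holds the elevated value. */
    if (seteuid(elevated) == -1)
        return 1;

    printf("effective UID restored to %d\n", (int)geteuid());
    return 0;
}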


/* MIB service - proc.c - functionality based on service process tables */
/* Eventually, the CTL_PROC subtree might end up here as well. */
#include "mib.h"
#include <sys/exec.h>
#include <minix/sysinfo.h>
#include <machine/archtypes.h>
#include "kernel/proc.h"
#include "servers/pm/mproc.h"
#include "servers/vfs/const.h"
#include "servers/vfs/fproc.h"
typedef struct proc ixfer_proc_t;
typedef struct mproc ixfer_mproc_t;
static ixfer_proc_t proc_tab[NR_TASKS + NR_PROCS];
static ixfer_mproc_t mproc_tab[NR_PROCS];
static struct fproc_light fproc_tab[NR_PROCS];
/*
* The number of processes added to the current number of processes when doing
* a size estimation, so that the actual data retrieval does not end up with
* too little space if new processes have forked between the two calls. We do
* a process table update only once per clock tick, which means that typically
* no update will take place between the user process's size estimation request
* and its subsequent data retrieval request. On the other hand, if we do
* update process tables in between, quite a bit might have changed.
*/
#define EXTRA_PROCS 8
#define HASH_SLOTS (NR_PROCS / 4) /* expected nr. of processes in use */
#define NO_SLOT (-1)
static int hash_tab[HASH_SLOTS]; /* hash table mapping from PID.. */
static int hnext_tab[NR_PROCS]; /* ..to PM process slot */
static clock_t tabs_updated = 0; /* when the tables were last updated */
static int tabs_valid = TRUE; /* FALSE if obtaining tables failed */
/*
* Update the process tables by pulling in new copies from the kernel, PM, and
* VFS, but only every so often and only if it has not failed before. Return
* TRUE iff the tables are now valid.
*/
static int
update_tables(void)
{
clock_t now;
pid_t pid;
int r, kslot, mslot, hslot;
/*
* If retrieving the tables failed at some point, do not keep trying
* all the time. Such a failure is very unlikely to be transient.
*/
if (tabs_valid == FALSE)
return FALSE;
/*
* Update the tables once per clock tick at most. The update operation
* is rather heavy, transferring several hundreds of kilobytes between
* servers. Userland should be able to live with information that is
* outdated by at most one clock tick.
*/
now = getticks();
if (tabs_updated != 0 && tabs_updated == now)
return TRUE;
/* Perform an actual update now. */
tabs_valid = FALSE;
/* Retrieve and check the kernel process table. */
if ((r = sys_getproctab(proc_tab)) != OK) {
printf("MIB: unable to obtain kernel process table (%d)\n", r);
return FALSE;
}
for (kslot = 0; kslot < NR_TASKS + NR_PROCS; kslot++) {
if (proc_tab[kslot].p_magic != PMAGIC) {
printf("MIB: kernel process table mismatch\n");
return FALSE;
}
}
/* Retrieve and check the PM process table. */
r = getsysinfo(PM_PROC_NR, SI_PROC_TAB, mproc_tab, sizeof(mproc_tab));
if (r != OK) {
printf("MIB: unable to obtain PM process table (%d)\n", r);
return FALSE;
}
for (mslot = 0; mslot < NR_PROCS; mslot++) {
if (mproc_tab[mslot].mp_magic != MP_MAGIC) {
printf("MIB: PM process table mismatch\n");
return FALSE;
}
}
/* Retrieve an extract of the VFS process table. */
r = getsysinfo(VFS_PROC_NR, SI_PROCLIGHT_TAB, fproc_tab,
sizeof(fproc_tab));
if (r != OK) {
printf("MIB: unable to obtain VFS process table (%d)\n", r);
return FALSE;
}
tabs_valid = TRUE;
tabs_updated = now;
/*
* Build a hash table mapping from process IDs to slot numbers, for
* fast access. TODO: decide if this is better done on demand only.
*/
for (hslot = 0; hslot < HASH_SLOTS; hslot++)
hash_tab[hslot] = NO_SLOT;
for (mslot = 0; mslot < NR_PROCS; mslot++) {
if (mproc_tab[mslot].mp_flags & IN_USE) {
if ((pid = mproc_tab[mslot].mp_pid) <= 0)
continue;
hslot = mproc_tab[mslot].mp_pid % HASH_SLOTS;
hnext_tab[mslot] = hash_tab[hslot];
hash_tab[hslot] = mslot;
}
}
return TRUE;
}
/*
* Return the PM slot number for the given PID, or NO_SLOT if the PID is not in
* use by a process.
*/
static int
get_mslot(pid_t pid)
{
int mslot;
/* PID 0 identifies the kernel; checking this is up to the caller. */
if (pid <= 0)
return NO_SLOT;
for (mslot = hash_tab[pid % HASH_SLOTS]; mslot != NO_SLOT;
mslot = hnext_tab[mslot])
if (mproc_tab[mslot].mp_pid == pid)
break;
return mslot;
}
/*
* Store the given number of clock ticks as a timeval structure.
*/
static void
ticks_to_timeval(struct timeval * tv, clock_t ticks)
{
clock_t hz;
hz = sys_hz();
tv->tv_sec = ticks / hz;
tv->tv_usec = (long)((ticks % hz) * 1000000ULL / hz);
}
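/*
 * Worked example (illustrative only; the real hz value is system dependent):
 * with sys_hz() returning 100, ticks = 250 yields tv_sec = 250 / 100 = 2 and
 * tv_usec = (250 % 100) * 1000000 / 100 = 500000, i.e. 2.5 seconds.
 */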
/*
* Generate a wchan message text for the cases that the process is blocked on
* IPC with another process, of which the endpoint is given as 'endpt' here.
* The name of the other process is to be stored in 'wmesg', which is a buffer
* of size 'wmsz'. The result should be null terminated. If 'ipc' is set, the
* process is blocked on a direct IPC call, in which case the name of the other
* process is enclosed in parentheses. If 'ipc' is not set, the call is made
* indirectly through VFS, and the name of the other process should not be
* enclosed in parentheses. If no name can be obtained, we use the endpoint of
* the other process instead.
*/
static void
fill_wmesg(char * wmesg, size_t wmsz, endpoint_t endpt, int ipc)
{
const char *name;
int mslot;
switch (endpt) {
case ANY:
name = "any";
break;
case SELF:
name = "self";
break;
case NONE:
name = "none";
break;
default:
mslot = _ENDPOINT_P(endpt);
if (mslot >= -NR_TASKS && mslot < NR_PROCS &&
(mslot < 0 || (mproc_tab[mslot].mp_flags & IN_USE)))
name = proc_tab[NR_TASKS + mslot].p_name;
else
name = NULL;
}
if (name != NULL)
snprintf(wmesg, wmsz, "%s%s%s",
ipc ? "(" : "", name, ipc ? ")" : "");
else
snprintf(wmesg, wmsz, "%s%d%s",
ipc ? "(" : "", endpt, ipc ? ")" : "");
}
/*
* Return the LWP status of a process, along with additional information in
* case the process is sleeping (LSSLEEP): a wchan value and text to indicate
* what the process is sleeping on, and possibly a flag field modification to
* indicate that the sleep is interruptible.
*/
static int
get_lwp_stat(int mslot, uint64_t * wcptr, char * wmptr, size_t wmsz,
int32_t * flag)
{
struct mproc *mp;
struct fproc_light *fp;
struct proc *kp;
const char *wmesg;
uint64_t wchan;
endpoint_t endpt;
mp = &mproc_tab[mslot];
fp = &fproc_tab[mslot];
kp = &proc_tab[NR_TASKS + mslot];
/*
* First cover all the cases that the process is not sleeping. In
* those cases, we need not return additional sleep information either.
*/
if (mp->mp_flags & (TRACE_ZOMBIE | ZOMBIE))
return LSZOMB;
if (mp->mp_flags & EXITING)
return LSDEAD;
if ((mp->mp_flags & TRACE_STOPPED) || RTS_ISSET(kp, RTS_P_STOP))
return LSSTOP;
if (proc_is_runnable(kp))
return LSRUN;
/*
* The process is sleeping. In that case, we must also figure out why,
* and return an appropriate wchan value and human-readable wmesg text.
*
* The process can be blocked on either a known sleep state in PM or
* VFS, or otherwise on IPC communication with another process, or
* otherwise on a kernel RTS flag. In each case, decide what to use as
* wchan value and wmesg text, and whether the sleep is interruptible.
*
* The wchan value should be unique for the sleep reason. We use its
* lower eight bits to indicate a class:
* 0x00 = kernel task
* 0x01 = kernel RTS block
* 0x02 = PM call
* 0x03 = VFS call
* 0x04 = MIB call
* 0xff = blocked on process
* The upper bits are used for class-specific information. The actual
* value does not really matter, as long as it is nonzero and there is
* no overlap between the different values.
*/
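/*
 * For illustration, with the assignments made below (the endpoint number is
 * hypothetical): a process in PM's WAITING state gets wchan 0x102 and wmesg
 * "wait"; a process blocked on a VFS pipe gets wchan
 * (FP_BLOCKED_ON_PIPE << 8) | 0x03 and wmesg "pipe"; a process blocked
 * sending to endpoint 123 gets wchan (123 << 8) | 0xff and a wmesg of the
 * form "(name)", where name is the other process's name.
 */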
wchan = 0;
wmesg = NULL;
/*
* First see if the process is marked as blocked in the tables of PM or
* VFS. Such a block reason is always an interruptible sleep. Note
* that we do not use the kernel table at all in this case: each of the
* three tables is consistent within itself, but not necessarily
* consistent with any of the other tables, so we avoid internal
* mismatches if we can.
*/
if (mp->mp_flags & WAITING) {
wchan = 0x102;
wmesg = "wait";
} else if (mp->mp_flags & SIGSUSPENDED) {
wchan = 0x202;
wmesg = "pause";
} else if (fp->fpl_blocked_on != FP_BLOCKED_ON_NONE) {
wchan = (fp->fpl_blocked_on << 8) | 0x03;
switch (fp->fpl_blocked_on) {
case FP_BLOCKED_ON_PIPE:
wmesg = "pipe";
break;
case FP_BLOCKED_ON_LOCK:
wmesg = "lock";
break;
case FP_BLOCKED_ON_POPEN:
wmesg = "popen";
break;
case FP_BLOCKED_ON_SELECT:
wmesg = "select";
break;
case FP_BLOCKED_ON_OTHER:
/*
* Add the task (= character driver) endpoint to the
* wchan value, and use the driver's process name,
* without parentheses, as wmesg text.
*/
wchan |= (uint64_t)fp->fpl_task << 16;
fill_wmesg(wmptr, wmsz, fp->fpl_task, FALSE /*ipc*/);
break;
default:
/* A newly added flag we don't yet know about? */
wmesg = "???";
break;
}
}
if (wchan != 0) {
*wcptr = wchan;
if (wmesg != NULL) /* NULL means "already set" here */
strlcpy(wmptr, wmesg, wmsz);
*flag |= L_SINTR;
}
/*
* See if the process is blocked on sending or receiving. If not, then
* use one of the kernel RTS flags as reason.
*/
endpt = P_BLOCKEDON(kp);
switch (endpt) {
case MIB_PROC_NR:
/* This is really just aesthetics. */
wchan = 0x04;
wmesg = "sysctl";
break;
case NONE:
/*
* The process is not running, but also not blocked on IPC with
* another process. This means it must be stopped on a kernel
* RTS flag.
*/
wchan = ((uint64_t)kp->p_rts_flags << 8) | 0x01;
if (RTS_ISSET(kp, RTS_PROC_STOP))
wmesg = "kstop";
else if (RTS_ISSET(kp, RTS_SIGNALED) ||
RTS_ISSET(kp, RTS_SIG_PENDING))
wmesg = "ksignal";
else if (RTS_ISSET(kp, RTS_NO_PRIV))
wmesg = "knopriv";
else if (RTS_ISSET(kp, RTS_PAGEFAULT) ||
RTS_ISSET(kp, RTS_VMREQTARGET))
wmesg = "fault";
else if (RTS_ISSET(kp, RTS_NO_QUANTUM))
wmesg = "sched";
else
wmesg = "kflag";
break;
case ANY:
/*
* If the process is blocked receiving from ANY, mark it as
* being in an interruptible sleep. This looks nicer, even
* though "interruptible" is not applicable to services at all.
*/
*flag |= L_SINTR;
break;
}
/*
* If at this point wchan is still zero, the process is blocked sending
* or receiving. Use a wchan value based on the target endpoint, and
* use "(procname)" as wmesg text.
*/
if (wchan == 0) {
*wcptr = ((uint64_t)endpt << 8) | 0xff;
fill_wmesg(wmptr, wmsz, endpt, TRUE /*ipc*/);
} else {
*wcptr = wchan;
if (wmesg != NULL) /* NULL means "already set" here */
strlcpy(wmptr, wmesg, wmsz);
}
return LSSLEEP;
}
/*
* Fill the part of a LWP structure that is common between kernel tasks and
* user processes. Also return a CPU estimate in 'estcpu', because we generate
* the value as a side effect here, and the LWP structure has no estcpu field.
*/
static void
fill_lwp_common(struct kinfo_lwp * l, int kslot, uint32_t * estcpu)
{
struct proc *kp;
struct timeval tv;
clock_t uptime;
uint32_t hz;
kp = &proc_tab[kslot];
uptime = getticks();
hz = sys_hz();
/*
* We use the process endpoint as the LWP ID. Not only does this allow
* users to obtain process endpoints with "ps -s" (thus replacing the
* MINIX3 ps(1)'s "ps -E"), but if we ever do implement kernel threads,
* this is probably still going to be accurate.
*/
l->l_lid = kp->p_endpoint;
/*
* The time during which the process has not been swapped in or out is
* not applicable for us, and thus, we set it to the time the process
* has been running (in seconds). This value is relevant mostly for
* ps(1)'s CPU usage correction for processes that have just started.
*/
if (kslot >= NR_TASKS)
l->l_swtime = uptime - mproc_tab[kslot - NR_TASKS].mp_started;
else
l->l_swtime = uptime;
l->l_swtime /= hz;
/*
* Sleep (dequeue) times are not maintained for kernel tasks, so
* pretend they are never asleep (which is pretty accurate).
*/
if (kslot < NR_TASKS)
l->l_slptime = 0;
else
l->l_slptime = (uptime - kp->p_dequeued) / hz;
l->l_priority = kp->p_priority;
l->l_usrpri = kp->p_priority;
l->l_cpuid = kp->p_cpu;
ticks_to_timeval(&tv, kp->p_user_time + kp->p_sys_time);
l->l_rtime_sec = tv.tv_sec;
l->l_rtime_usec = tv.tv_usec;
/*
* Obtain CPU usage percentages and estimates through library code
* shared between the kernel and this service; see its source for
* details. We note that the produced estcpu value is rather different
* from the one produced by NetBSD, but this should not be a problem.
*/
l->l_pctcpu = cpuavg_getstats(&kp->p_cpuavg, &l->l_cpticks, estcpu,
uptime, hz);
}
/*
* Fill a LWP structure for a kernel task. Each kernel task has its own LWP,
* and all of them have negative PIDs.
*/
static void
fill_lwp_kern(struct kinfo_lwp * l, int kslot)
{
uint32_t estcpu;
memset(l, 0, sizeof(*l));
l->l_flag = L_INMEM | L_SINTR | L_SYSTEM;
l->l_stat = LSSLEEP;
l->l_pid = kslot - NR_TASKS;
/*
* When showing LWP entries, ps(1) uses the process name rather than
* the LWP name. All kernel tasks are therefore shown as "[kernel]"
* anyway. We use the wmesg field to show the actual kernel task name.
*/
l->l_wchan = ((uint64_t)(l->l_pid) << 8) | 0x00;
strlcpy(l->l_wmesg, proc_tab[kslot].p_name, sizeof(l->l_wmesg));
strlcpy(l->l_name, "kernel", sizeof(l->l_name));
fill_lwp_common(l, kslot, &estcpu);
}
/*
* Fill a LWP structure for a user process.
*/
static void
fill_lwp_user(struct kinfo_lwp * l, int mslot)
{
struct mproc *mp;
uint32_t estcpu;
memset(l, 0, sizeof(*l));
mp = &mproc_tab[mslot];
l->l_flag = L_INMEM;
l->l_stat = get_lwp_stat(mslot, &l->l_wchan, l->l_wmesg,
sizeof(l->l_wmesg), &l->l_flag);
l->l_pid = mp->mp_pid;
strlcpy(l->l_name, mp->mp_name, sizeof(l->l_name));
fill_lwp_common(l, NR_TASKS + mslot, &estcpu);
}
/*
* Implementation of CTL_KERN KERN_LWP.
*/
ssize_t
mib_kern_lwp(struct mib_call * call, struct mib_node * node __unused,
struct mib_oldp * oldp, struct mib_newp * newp __unused)
{
struct kinfo_lwp lwp;
struct mproc *mp;
size_t copysz;
ssize_t off;
pid_t pid;
int r, elsz, elmax, kslot, mslot, last_mslot;
if (call->call_namelen != 3)
return EINVAL;
pid = (pid_t)call->call_name[0];
elsz = call->call_name[1];
elmax = call->call_name[2]; /* redundant with the given oldlen.. */
if (pid < -1 || elsz <= 0 || elmax < 0)
return EINVAL;
if (!update_tables())
return EINVAL;
off = 0;
copysz = MIN((size_t)elsz, sizeof(lwp));
/*
* We model kernel tasks as LWP threads of the kernel (with PID 0).
* Modeling the kernel tasks as processes with negative PIDs, like
* ProcFS does, conflicts with the KERN_LWP API here: a PID of -1
* indicates that the caller wants a full listing of LWPs.
*/
if (pid <= 0) {
for (kslot = 0; kslot < NR_TASKS; kslot++) {
if (mib_inrange(oldp, off) && elmax > 0) {
fill_lwp_kern(&lwp, kslot);
if ((r = mib_copyout(oldp, off, &lwp,
copysz)) < 0)
return r;
elmax--;
}
off += elsz;
}
/* No need to add extra space here: NR_TASKS is static. */
if (pid == 0)
return off;
}
/*
* With PID 0 out of the way: the user requested the LWP for either a
* specific user process (pid > 0), or for all processes (pid < 0).
*/
if (pid > 0) {
if ((mslot = get_mslot(pid)) == NO_SLOT ||
(mproc_tab[mslot].mp_flags & (TRACE_ZOMBIE | ZOMBIE)))
return ESRCH;
last_mslot = mslot;
} else {
mslot = 0;
last_mslot = NR_PROCS - 1;
}
for (; mslot <= last_mslot; mslot++) {
mp = &mproc_tab[mslot];
if ((mp->mp_flags & (IN_USE | TRACE_ZOMBIE | ZOMBIE)) !=
IN_USE)
continue;
if (mib_inrange(oldp, off) && elmax > 0) {
fill_lwp_user(&lwp, mslot);
if ((r = mib_copyout(oldp, off, &lwp, copysz)) < 0)
return r;
elmax--;
}
off += elsz;
}
if (oldp == NULL && pid < 0)
off += EXTRA_PROCS * elsz;
return off;
}
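/*
 * A hypothetical userland sketch (not part of this service) of the name
 * layout this handler expects: the three trailing components are the PID
 * filter (-1 selects all processes), the element size, and the element
 * count.  Passing a NULL oldp triggers the size-estimation path above.
 * Assumes a NetBSD-style <sys/sysctl.h>.
 */
#if 0
#include <sys/param.h>
#include <sys/sysctl.h>
#include <err.h>
#include <stdio.h>

int
main(void)
{
    int mib[5] = { CTL_KERN, KERN_LWP, -1, sizeof(struct kinfo_lwp), 0 };
    size_t len = 0;

    if (sysctl(mib, 5, NULL, &len, NULL, 0) == -1)
        err(1, "sysctl");
    printf("room for %zu LWP entries\n", len / sizeof(struct kinfo_lwp));
    return 0;
}
#endif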
/*
* Fill the part of a process structure that is common between kernel tasks and
* user processes.
*/
static void
fill_proc2_common(struct kinfo_proc2 * p, int kslot)
{
struct vm_usage_info vui;
struct timeval tv;
struct proc *kp;
struct kinfo_lwp l;
kp = &proc_tab[kslot];
/*
* Much of the information in the LWP structure also ends up in the
* process structure. In order to avoid duplication of some important
* code, first generate LWP values and then copy them into the
* process structure.
*/
memset(&l, 0, sizeof(l));
fill_lwp_common(&l, kslot, &p->p_estcpu);
/* Obtain memory usage information from VM. Ignore failures. */
memset(&vui, 0, sizeof(vui));
(void)vm_info_usage(kp->p_endpoint, &vui);
ticks_to_timeval(&tv, kp->p_user_time + kp->p_sys_time);
p->p_rtime_sec = l.l_rtime_sec;
p->p_rtime_usec = l.l_rtime_usec;
p->p_cpticks = l.l_cpticks;
p->p_pctcpu = l.l_pctcpu;
p->p_swtime = l.l_swtime;
p->p_slptime = l.l_slptime;
p->p_uticks = kp->p_user_time;
p->p_sticks = kp->p_sys_time;
/* TODO: p->p_iticks */
ticks_to_timeval(&tv, kp->p_user_time);
p->p_uutime_sec = tv.tv_sec;
p->p_uutime_usec = tv.tv_usec;
ticks_to_timeval(&tv, kp->p_sys_time);
p->p_ustime_sec = tv.tv_sec;
p->p_ustime_usec = tv.tv_usec;
p->p_priority = l.l_priority;
p->p_usrpri = l.l_usrpri;
p->p_vm_rssize = howmany(vui.vui_total, PAGE_SIZE);
p->p_vm_vsize = howmany(vui.vui_virtual, PAGE_SIZE);
p->p_vm_msize = howmany(vui.vui_mvirtual, PAGE_SIZE);
p->p_uru_maxrss = vui.vui_maxrss;
p->p_uru_minflt = vui.vui_minflt;
p->p_uru_majflt = vui.vui_majflt;
p->p_cpuid = l.l_cpuid;
}
/*
* Fill a process structure for the kernel pseudo-process (with PID 0).
*/
static void
fill_proc2_kern(struct kinfo_proc2 * p)
{
memset(p, 0, sizeof(*p));
p->p_flag = L_INMEM | L_SYSTEM | L_SINTR;
p->p_pid = 0;
p->p_stat = LSSLEEP;
p->p_nice = NZERO;
/* Use the KERNEL task wchan, for consistency between ps and top. */
p->p_wchan = ((uint64_t)KERNEL << 8) | 0x00;
strlcpy(p->p_wmesg, "kernel", sizeof(p->p_wmesg));
strlcpy(p->p_comm, "kernel", sizeof(p->p_comm));
p->p_realflag = P_INMEM | P_SYSTEM | P_SINTR;
p->p_realstat = SACTIVE;
p->p_nlwps = NR_TASKS;
/*
* By using the KERNEL slot here, the kernel process will get a proper
* CPU usage average.
*/
fill_proc2_common(p, KERNEL + NR_TASKS);
}
/*
* Fill a process structure for a user process.
*/
static void
fill_proc2_user(struct kinfo_proc2 * p, int mslot)
{
struct mproc *mp;
struct fproc_light *fp;
time_t boottime;
dev_t tty;
struct timeval tv;
int i, r, kslot, zombie;
memset(p, 0, sizeof(*p));
if ((r = getuptime(NULL, NULL, &boottime)) != OK)
panic("getuptime failed: %d", r);
kslot = NR_TASKS + mslot;
mp = &mproc_tab[mslot];
fp = &fproc_tab[mslot];
zombie = (mp->mp_flags & (TRACE_ZOMBIE | ZOMBIE));
tty = (!zombie) ? fp->fpl_tty : NO_DEV;
p->p_eflag = 0;
if (tty != NO_DEV)
p->p_eflag |= EPROC_CTTY;
if (mp->mp_pid == mp->mp_procgrp) /* TODO: job control support */
p->p_eflag |= EPROC_SLEADER;
p->p_exitsig = SIGCHLD; /* TODO */
p->p_flag = P_INMEM;
if (mp->mp_flags & TAINTED)
p->p_flag |= P_SUGID;
if (mp->mp_tracer != NO_TRACER)
p->p_flag |= P_TRACED;
if (tty != NO_DEV)
p->p_flag |= P_CONTROLT;
p->p_pid = mp->mp_pid;
if (mp->mp_parent >= 0 && mp->mp_parent < NR_PROCS)
p->p_ppid = mproc_tab[mp->mp_parent].mp_pid;
p->p_sid = mp->mp_procgrp; /* TODO: job control support */
p->p__pgid = mp->mp_procgrp;
p->p_tpgid = (tty != NO_DEV) ? mp->mp_procgrp : 0;
p->p_uid = mp->mp_effuid;
p->p_ruid = mp->mp_realuid;
p->p_gid = mp->mp_effgid;
p->p_rgid = mp->mp_realgid;
p->p_ngroups = MIN(mp->mp_ngroups, KI_NGROUPS);
for (i = 0; i < p->p_ngroups; i++)
p->p_groups[i] = mp->mp_sgroups[i];
p->p_tdev = tty;
memcpy(&p->p_siglist, &mp->mp_sigpending, sizeof(p->p_siglist));
memcpy(&p->p_sigmask, &mp->mp_sigmask, sizeof(p->p_sigmask));
memcpy(&p->p_sigcatch, &mp->mp_catch, sizeof(p->p_sigcatch));
memcpy(&p->p_sigignore, &mp->mp_ignore, sizeof(p->p_sigignore));
p->p_nice = mp->mp_nice + NZERO;
strlcpy(p->p_comm, mp->mp_name, sizeof(p->p_comm));
p->p_uvalid = 1;
ticks_to_timeval(&tv, mp->mp_started);
p->p_ustart_sec = boottime + tv.tv_sec;
p->p_ustart_usec = tv.tv_usec;
/* TODO: other rusage fields */
ticks_to_timeval(&tv, mp->mp_child_utime + mp->mp_child_stime);
p->p_uctime_sec = tv.tv_sec;
p->p_uctime_usec = tv.tv_usec;
p->p_realflag = p->p_flag;
p->p_nlwps = (zombie) ? 0 : 1;
p->p_svuid = mp->mp_svuid;
p->p_svgid = mp->mp_svgid;
p->p_stat = get_lwp_stat(mslot, &p->p_wchan, p->p_wmesg,
sizeof(p->p_wmesg), &p->p_flag);
switch (p->p_stat) {
case LSRUN:
p->p_realstat = SACTIVE;
p->p_nrlwps = 1;
break;
case LSSLEEP:
p->p_realstat = SACTIVE;
if (p->p_flag & L_SINTR)
p->p_realflag |= P_SINTR;
break;
case LSSTOP:
p->p_realstat = SSTOP;
break;
case LSZOMB:
p->p_realstat = SZOMB;
break;
case LSDEAD:
p->p_stat = LSZOMB; /* ps(1) STAT does not know LSDEAD */
p->p_realstat = SDEAD;
break;
default:
assert(0);
}
if (!zombie)
fill_proc2_common(p, kslot);
}
/*
* Implementation of CTL_KERN KERN_PROC2.
*/
ssize_t
mib_kern_proc2(struct mib_call * call, struct mib_node * node __unused,
struct mib_oldp * oldp, struct mib_newp * newp __unused)
{
struct kinfo_proc2 proc2;
struct mproc *mp;
size_t copysz;
ssize_t off;
dev_t tty;
int r, req, arg, elsz, elmax, kmatch, zombie, mslot;
if (call->call_namelen != 4)
return EINVAL;
req = call->call_name[0];
arg = call->call_name[1];
elsz = call->call_name[2];
elmax = call->call_name[3]; /* redundant with the given oldlen.. */
/*
* The kernel is special, in that it does not have a slot in the PM or
* VFS tables. As such, it is dealt with separately. While checking
* arguments, we might as well check whether the kernel is matched.
*/
switch (req) {
case KERN_PROC_ALL:
kmatch = TRUE;
break;
case KERN_PROC_PID:
case KERN_PROC_SESSION:
case KERN_PROC_PGRP:
case KERN_PROC_UID:
case KERN_PROC_RUID:
case KERN_PROC_GID:
case KERN_PROC_RGID:
kmatch = (arg == 0);
break;
case KERN_PROC_TTY:
kmatch = ((dev_t)arg == KERN_PROC_TTY_NODEV);
break;
default:
return EINVAL;
}
if (elsz <= 0 || elmax < 0)
return EINVAL;
if (!update_tables())
return EINVAL;
off = 0;
copysz = MIN((size_t)elsz, sizeof(proc2));
if (kmatch) {
if (mib_inrange(oldp, off) && elmax > 0) {
fill_proc2_kern(&proc2);
if ((r = mib_copyout(oldp, off, &proc2, copysz)) < 0)
return r;
elmax--;
}
off += elsz;
}
for (mslot = 0; mslot < NR_PROCS; mslot++) {
mp = &mproc_tab[mslot];
if (!(mp->mp_flags & IN_USE))
continue;
switch (req) {
case KERN_PROC_PID:
if ((pid_t)arg != mp->mp_pid)
continue;
break;
case KERN_PROC_SESSION: /* TODO: job control support */
case KERN_PROC_PGRP:
if ((pid_t)arg != mp->mp_procgrp)
continue;
break;
case KERN_PROC_TTY:
if ((dev_t)arg == KERN_PROC_TTY_REVOKE)
continue; /* TODO: revoke(2) support */
/* Do not access the fproc_tab slot of zombies. */
zombie = (mp->mp_flags & (TRACE_ZOMBIE | ZOMBIE));
tty = (!zombie) ? fproc_tab[mslot].fpl_tty : NO_DEV;
if ((dev_t)arg == KERN_PROC_TTY_NODEV) {
if (tty != NO_DEV)
continue;
} else if ((dev_t)arg == NO_DEV || (dev_t)arg != tty)
continue;
break;
case KERN_PROC_UID:
if ((uid_t)arg != mp->mp_effuid)
continue;
break;
case KERN_PROC_RUID:
if ((uid_t)arg != mp->mp_realuid)
continue;
break;
case KERN_PROC_GID:
if ((gid_t)arg != mp->mp_effgid)
continue;
break;
case KERN_PROC_RGID:
if ((gid_t)arg != mp->mp_realgid)
continue;
break;
}
if (mib_inrange(oldp, off) && elmax > 0) {
fill_proc2_user(&proc2, mslot);
if ((r = mib_copyout(oldp, off, &proc2, copysz)) < 0)
return r;
elmax--;
}
off += elsz;
}
if (oldp == NULL && req != KERN_PROC_PID)
off += EXTRA_PROCS * elsz;
return off;
}
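/*
 * A hedged, hypothetical userland sketch (not part of this service) of the
 * two-call pattern that the EXTRA_PROCS headroom near the top of this file
 * serves: first obtain a size estimate with a NULL oldp, then allocate and
 * retrieve the actual entries.  Assumes a NetBSD-style <sys/sysctl.h> and
 * struct kinfo_proc2.
 */
#if 0
#include <sys/param.h>
#include <sys/sysctl.h>
#include <err.h>
#include <stdio.h>
#include <stdlib.h>

int
main(void)
{
    struct kinfo_proc2 *p;
    size_t len = 0, i;
    int mib[6] = { CTL_KERN, KERN_PROC2, KERN_PROC_ALL, 0,
        sizeof(struct kinfo_proc2), 0 };

    /* First call: size estimate only (includes EXTRA_PROCS of headroom). */
    if (sysctl(mib, 6, NULL, &len, NULL, 0) == -1)
        err(1, "sysctl estimate");
    if ((p = malloc(len)) == NULL)
        err(1, "malloc");

    /* Second call: actual retrieval, limited to the estimated count. */
    mib[5] = len / sizeof(struct kinfo_proc2);
    if (sysctl(mib, 6, p, &len, NULL, 0) == -1)
        err(1, "sysctl retrieve");

    for (i = 0; i < len / sizeof(struct kinfo_proc2); i++)
        printf("%d\t%s\n", p[i].p_pid, p[i].p_comm);
    free(p);
    return 0;
}
#endif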
/*
* Implementation of CTL_KERN KERN_PROC_ARGS.
*/
ssize_t
mib_kern_proc_args(struct mib_call * call, struct mib_node * node __unused,
struct mib_oldp * oldp, struct mib_newp * newp __unused)
{
char vbuf[PAGE_SIZE], sbuf[PAGE_SIZE], obuf[PAGE_SIZE];
struct ps_strings pss;
struct mproc *mp;
char *buf, *p, *q, *pptr;
vir_bytes vaddr, vpage, spage, paddr, ppage;
size_t max, off, olen, oleft, oldlen, bytes, pleft;
unsigned int copybudget;
pid_t pid;
int req, mslot, count, aborted, ended;
ssize_t r;
if (call->call_namelen != 2)
return EINVAL;
pid = call->call_name[0];
req = call->call_name[1];
switch (req) {
case KERN_PROC_ARGV:
case KERN_PROC_ENV:
case KERN_PROC_NARGV:
case KERN_PROC_NENV:
break;
default:
return EOPNOTSUPP;
}
if (!update_tables())
return EINVAL;
if ((mslot = get_mslot(pid)) == NO_SLOT)
return ESRCH;
mp = &mproc_tab[mslot];
if (mp->mp_flags & (TRACE_ZOMBIE | ZOMBIE))
return ESRCH;
/* We can return the count field size without copying in any data. */
if (oldp == NULL && (req == KERN_PROC_NARGV || req == KERN_PROC_NENV))
return sizeof(count);
if (sys_datacopy(mp->mp_endpoint,
mp->mp_frame_addr + mp->mp_frame_len - sizeof(pss),
SELF, (vir_bytes)&pss, sizeof(pss)) != OK)
return EINVAL;
/*
* Determine the upper size limit of the requested data. Not only may
* the size never exceed ARG_MAX, it may also not exceed the frame
* length as given in its original exec call. In fact, the frame
* length should be substantially larger: all strings for both the
* arguments and the environment are in there, along with other stuff,
* and there must be no overlap between strings. It is possible that
* the application called setproctitle(3), in which case the ps_strings
* pointers refer to data outside the frame altogether. However, this
* data should not exceed 2048 bytes, and we cover this by rounding up
* the frame length to a multiple of the page size. Anyhow, NetBSD
* blindly returns ARG_MAX when asked for a size estimate, so with this
* maximum we are already quite a bit more accurate.
*/
max = roundup(MIN(mp->mp_frame_len, ARG_MAX), PAGE_SIZE);
switch (req) {
case KERN_PROC_NARGV:
count = pss.ps_nargvstr;
return mib_copyout(oldp, 0, &count, sizeof(count));
case KERN_PROC_NENV:
count = pss.ps_nenvstr;
return mib_copyout(oldp, 0, &count, sizeof(count));
case KERN_PROC_ARGV:
if (oldp == NULL)
return max;
vaddr = (vir_bytes)pss.ps_argvstr;
count = pss.ps_nargvstr;
break;
case KERN_PROC_ENV:
if (oldp == NULL)
return max;
vaddr = (vir_bytes)pss.ps_envstr;
count = pss.ps_nenvstr;
break;
}
/*
* Go through the strings. Copy in entire, machine-aligned pages at
* once, in the hope that all data is stored consecutively, which it
* should be: we expect that the vector is followed by the strings, and
* that the strings are stored in order of vector reference. We keep
* up to two pages with copied-in data: one for the vector, and
* optionally one for string data. In addition, we keep one page with
* data to be copied out, so that we do not cause a lot of copy
* overhead for short strings.
*
* We stop whenever any of the following conditions are met:
* - copying in data from the target process fails for any reason;
* - we have processed the last index ('count') into the vector;
* - the current vector element is a NULL pointer;
* - the requested number of output bytes ('oldlen') has been reached;
* - the maximum number of output bytes ('max') has been reached;
* - the number of page copy-ins exceeds an estimated threshold;
* - copying out data fails for any reason (we then return the error).
*
* We limit the number of page copy-ins because otherwise a rogue
* process could create an argument vector consisting of only two-byte
* strings that all span two pages, causing us to copy up to 1GB of
* data with the current ARG_MAX value of 256K. No reasonable vector
* should cause more than (ARG_MAX / PAGE_SIZE) page copies for
* strings; we are nice enough to allow twice that. Vector copies do
* not count, as they are linear anyway.
*
* Unlike every other sysctl(2) call, we are supposed to truncate the
* resulting size (the returned 'oldlen') to the requested size (the
* given 'oldlen') *and* return the resulting size, rather than ENOMEM
* and the real size. Unfortunately, libkvm actually relies on this.
*
* Generally speaking, upon failure we just return a truncated result.
* In case of truncation, the data we copy out need not be null
* terminated. It is up to userland to process the data correctly.
*/
if (trunc_page(vaddr) == 0 || vaddr % sizeof(char *) != 0)
return 0;
off = 0;
olen = 0;
aborted = FALSE;
oldlen = mib_getoldlen(oldp);
if (oldlen > max)
oldlen = max;
copybudget = (ARG_MAX / PAGE_SIZE) * 2;
vpage = 0;
spage = 0;
while (count > 0 && off + olen < oldlen && !aborted) {
/*
* Start by fetching the page containing the current vector
* element, if needed. We could limit the fetch to the vector
* size, but our hope is that for the simple cases, the strings
* are on the remainder of the same page, so we save a copy
* call. TODO: since the strings should follow the vector, we
* could start the copy at the base of the vector.
*/
if (trunc_page(vaddr) != vpage) {
vpage = trunc_page(vaddr);
if (sys_datacopy(mp->mp_endpoint, vpage, SELF,
(vir_bytes)vbuf, PAGE_SIZE) != OK)
break;
}
/* Get the current vector element, pointing to a string. */
memcpy(&pptr, &vbuf[vaddr - vpage], sizeof(pptr));
paddr = (vir_bytes)pptr;
ppage = trunc_page(paddr);
if (ppage == 0)
break;
/* Fetch the string itself, one page at a time at most. */
do {
/*
* See if the string pointer falls inside either the
* vector page or the previously fetched string page
* (if any). If not, fetch a string page.
*/
if (ppage == vpage) {
buf = vbuf;
} else if (ppage == spage) {
buf = sbuf;
} else {
if (--copybudget == 0) {
aborted = TRUE;
break;
}
spage = ppage;
if (sys_datacopy(mp->mp_endpoint, spage, SELF,
(vir_bytes)sbuf, PAGE_SIZE) != OK) {
aborted = TRUE;
break;
}
buf = sbuf;
}
/*
* We now have a string fragment in a buffer. See if
* the string is null terminated. If not, all the data
* up to the buffer end is part of the string, and the
* string continues on the next page.
*/
p = &buf[paddr - ppage];
pleft = PAGE_SIZE - (paddr - ppage);
assert(pleft > 0);
if ((q = memchr(p, '\0', pleft)) != NULL) {
bytes = (size_t)(q - p + 1);
assert(bytes <= pleft);
ended = TRUE;
} else {
bytes = pleft;
ended = FALSE;
}
/* Limit the result to the requested length. */
if (off + olen + bytes > oldlen)
bytes = oldlen - off - olen;
/*
* Add 'bytes' bytes from string pointer 'p' to the
* output buffer, copying out its contents to userland
* if it has filled up.
*/
if (olen + bytes > sizeof(obuf)) {
oleft = sizeof(obuf) - olen;
memcpy(&obuf[olen], p, oleft);
if ((r = mib_copyout(oldp, off, obuf,
sizeof(obuf))) < 0)
return r;
off += sizeof(obuf);
olen = 0;
p += oleft;
bytes -= oleft;
}
if (bytes > 0) {
memcpy(&obuf[olen], p, bytes);
olen += bytes;
}
/*
* Continue as long as we have not yet found the string
* end, and we have not yet filled the output buffer.
*/
paddr += pleft;
assert(trunc_page(paddr) == paddr);
ppage = paddr;
} while (!ended && off + olen < oldlen);
vaddr += sizeof(char *);
count--;
}
/* Copy out any remainder of the output buffer. */
if (olen > 0) {
if ((r = mib_copyout(oldp, off, obuf, olen)) < 0)
return r;
off += olen;
}
assert(off <= oldlen);
return off;
}
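/*
 * A hypothetical userland sketch (not part of this service) of how the
 * KERN_PROC_ARGS output is typically consumed: the argument strings come
 * back concatenated and null terminated, and the result may be silently
 * truncated (possibly mid-string) rather than failing with ENOMEM, so the
 * caller walks the buffer while honoring the returned length.  Assumes a
 * NetBSD-style <sys/sysctl.h>.
 */
#if 0
#include <sys/param.h>
#include <sys/sysctl.h>
#include <err.h>
#include <limits.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>

int
main(void)
{
    int mib[4] = { CTL_KERN, KERN_PROC_ARGS, 0, KERN_PROC_ARGV };
    char *buf, *nul;
    size_t len, off, n;

    mib[2] = getpid();        /* our own arguments, for simplicity */
    len = ARG_MAX;
    if ((buf = malloc(len)) == NULL)
        err(1, "malloc");
    if (sysctl(mib, 4, buf, &len, NULL, 0) == -1)
        err(1, "sysctl");

    /* Walk the concatenated strings; 'len' is the (possibly truncated) size. */
    for (off = 0; off < len; off += n + 1) {
        nul = memchr(&buf[off], '\0', len - off);
        n = (nul != NULL) ? (size_t)(nul - &buf[off]) : len - off;
        printf("arg: %.*s\n", (int)n, &buf[off]);
        if (nul == NULL)
            break;    /* truncated mid-string */
    }
    free(buf);
    return 0;
}
#endif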
/*
* Implementation of CTL_MINIX MINIX_PROC PROC_LIST.
*/
ssize_t
mib_minix_proc_list(struct mib_call * call __unused,
struct mib_node * node __unused, struct mib_oldp * oldp,
struct mib_newp * newp __unused)
{
struct minix_proc_list mpl[NR_PROCS];
struct minix_proc_list *mplp;
struct mproc *mp;
unsigned int mslot;
if (oldp == NULL)
return sizeof(mpl);
if (!update_tables())
return EINVAL;
memset(&mpl, 0, sizeof(mpl));
mplp = mpl;
mp = mproc_tab;
for (mslot = 0; mslot < NR_PROCS; mslot++, mplp++, mp++) {
if (!(mp->mp_flags & IN_USE) || mp->mp_pid <= 0)
continue;
mplp->mpl_flags = MPLF_IN_USE;
if (mp->mp_flags & (TRACE_ZOMBIE | ZOMBIE))
mplp->mpl_flags |= MPLF_ZOMBIE;
mplp->mpl_pid = mp->mp_pid;
mplp->mpl_uid = mp->mp_effuid;
mplp->mpl_gid = mp->mp_effgid;
}
return mib_copyout(oldp, 0, &mpl, sizeof(mpl));
}
/*
* Implementation of CTL_MINIX MINIX_PROC PROC_DATA.
*/
ssize_t
mib_minix_proc_data(struct mib_call * call, struct mib_node * node __unused,
struct mib_oldp * oldp, struct mib_newp * newp __unused)
{
struct minix_proc_data mpd;
struct proc *kp;
int kslot, mslot = 0;
unsigned int mflags;
pid_t pid;
/*
* It is currently only possible to retrieve the process data for a
* particular PID, which must be given as the last name component.
*/
if (call->call_namelen != 1)
return EINVAL;
pid = (pid_t)call->call_name[0];
if (!update_tables())
return EINVAL;
/*
* Unlike the CTL_KERN nodes, we use the ProcFS semantics here: if the
* given PID is negative, it is a kernel task; otherwise, it identifies
* a user process. A request for PID 0 will result in ESRCH.
*/
if (pid < 0) {
if (pid < -NR_TASKS)
return ESRCH;
kslot = pid + NR_TASKS;
assert(kslot < NR_TASKS);
} else {
if ((mslot = get_mslot(pid)) == NO_SLOT)
return ESRCH;
kslot = NR_TASKS + mslot;
}
if (oldp == NULL)
return sizeof(mpd);
kp = &proc_tab[kslot];
mflags = (pid > 0) ? mproc_tab[mslot].mp_flags : 0;
memset(&mpd, 0, sizeof(mpd));
mpd.mpd_endpoint = kp->p_endpoint;
if (mflags & PRIV_PROC)
mpd.mpd_flags |= MPDF_SYSTEM;
if (mflags & (TRACE_ZOMBIE | ZOMBIE))
mpd.mpd_flags |= MPDF_ZOMBIE;
else if ((mflags & TRACE_STOPPED) || RTS_ISSET(kp, RTS_P_STOP))
mpd.mpd_flags |= MPDF_STOPPED;
else if (proc_is_runnable(kp))
mpd.mpd_flags |= MPDF_RUNNABLE;
mpd.mpd_blocked_on = P_BLOCKEDON(kp);
mpd.mpd_priority = kp->p_priority;
mpd.mpd_user_time = kp->p_user_time;
mpd.mpd_sys_time = kp->p_sys_time;
mpd.mpd_cycles = kp->p_cycles;
mpd.mpd_kipc_cycles = kp->p_kipc_cycles;
mpd.mpd_kcall_cycles = kp->p_kcall_cycles;
if (kslot >= NR_TASKS) {
mpd.mpd_nice = mproc_tab[mslot].mp_nice;
strlcpy(mpd.mpd_name, mproc_tab[mslot].mp_name,
sizeof(mpd.mpd_name));
} else
strlcpy(mpd.mpd_name, kp->p_name, sizeof(mpd.mpd_name));
return mib_copyout(oldp, 0, &mpd, sizeof(mpd));
}