phunix/minix/kernel/system/do_safecopy.c
David van Moolenbroek 10b7016b5a Fix soft faults in FSes resulting in partial I/O
In order to resolve page faults on file-mapped pages, VM may need to
communicate (through VFS) with a file system.  The file system must
therefore not be the one to cause, and thus end up being blocked on,
such page faults.  To resolve this potential deadlock, the safecopy
system was previously extended with the CPF_TRY flag, which causes the
kernel to return EFAULT to the caller of a safecopy function upon
getting a pagefault, bypassing VM and thus avoiding the loop.  VFS was
extended to repeat relevant file system calls that returned EFAULT,
after resolving the page fault, to keep these soft faults from being
exposed to applications.

However, general UNIX I/O semantics dictate that if an I/O transfer
partially succeeded before running into a failure, the partial result
is to be returned.  Proper file system implementations may therefore
end up returning partial success rather than the EFAULT code resulting
from a soft fault.  Since VFS does not get the EFAULT code in this
case, it does not know that a soft fault occurred, and thus does not
repeat the call either.  The end result is that an application may get
partial I/O results (e.g., a short read(2)) even on regular files.
Applications cannot reasonably be expected to deal with this.

Due to the fact that most of the current file system implementations
do not implement proper partial-failure semantics, this problem is not
yet widespread.  In fact, it has only occurred on direct block device
I/O so far.  However, the next generation of file system services will
be implementing proper I/O semantics, thus exacerbating the problem.

To remedy this situation, this patch changes the CPF_TRY semantics:
whenever the kernel experiences a soft fault during a safecopy call,
in addition to returning FAULT, the kernel also stores a mark in the
grant created with CPF_TRY.  Instead of testing on EFAULT, VFS checks
whether the grant was marked, as part of revoking the grant.  If the
grant was indeed marked by the kernel, VFS repeats the file system
operation, regardless of its initial return value.  Thus, the EFAULT
code now only serves to make the file system fail the call faster.

The approach is currently supported for both direct and magic grants,
but is used only with magic grants - arguably the only case where it
makes sense.  Indirect grants should not have CPF_TRY set; in a chain
of indirect grants, the original grant is marked, as it should be.
In order to avoid potential SMP issues, the mark stored in the grant
is its grant identifier, so as to discard outdated kernel writes.
Whether this is necessary or effective remains to be evaluated.

This patch also cleans up the grant structure a bit, removing reserved
space and thus making the structure slightly smaller.  The structure
is used internally between system services only, so there is no need
for binary compatibility.

Change-Id: I6bb3990dce67a80146d954546075ceda4d6567f8
2016-01-16 14:04:21 +01:00

454 lines
14 KiB
C

/* The kernel call implemented in this file:
* m_type: SYS_SAFECOPYFROM or SYS_SAFECOPYTO or SYS_VSAFECOPY
*
* The parameters for this kernel call are:
* m_lsys_kern_safecopy.from_to other endpoint
* m_lsys_kern_safecopy.gid grant id
* m_lsys_kern_safecopy.offset offset within granted space
* m_lsys_kern_safecopy.address address in own address space
* m_lsys_kern_safecopy.bytes bytes to be copied
*
* For the vectored variant (do_vsafecopy):
* m_lsys_kern_vsafecopy.vec_addr address of vector
* m_lsys_kern_vsafecopy.vec_size number of significant elements in vector
*/
#include <assert.h>
#include "kernel/system.h"
#include "kernel/kernel.h"
#include "kernel/vm.h"
#define MAX_INDIRECT_DEPTH 5 /* up to how many indirect grants to follow? */
#define MEM_TOP 0xFFFFFFFFUL
static int safecopy(struct proc *, endpoint_t, endpoint_t,
cp_grant_id_t, size_t, vir_bytes, vir_bytes, int);
#define HASGRANTTABLE(gr) \
(priv(gr) && priv(gr)->s_grant_table)
struct cp_sfinfo { /* information for handling soft faults */
int try; /* if nonzero, try copy only, stop on fault */
endpoint_t endpt; /* endpoint owning grant with CPF_TRY flag */
vir_bytes addr; /* address to write mark upon soft fault */
cp_grant_id_t value; /* grant ID to use as mark value to write */
};
/*===========================================================================*
* verify_grant *
*===========================================================================*/
int verify_grant(
endpoint_t granter, /* copyee */
endpoint_t grantee, /* copyer */
cp_grant_id_t grant, /* grant id */
vir_bytes bytes, /* copy size */
int access, /* direction (read/write) */
vir_bytes offset_in, /* copy offset within grant */
vir_bytes *offset_result, /* copy offset within virtual address space */
endpoint_t *e_granter, /* new granter (magic grants) */
struct cp_sfinfo *sfinfo /* storage for soft fault information */
)
{
cp_grant_t g;
int proc_nr;
const struct proc *granter_proc;
int grant_idx, grant_seq;
int depth = 0;
do {
/* Get granter process slot (if valid), and check range of
* grant id.
*/
if(!isokendpt(granter, &proc_nr) ) {
printf(
"grant verify failed: invalid granter %d\n", (int) granter);
return(EINVAL);
}
if(!GRANT_VALID(grant)) {
printf(
"grant verify failed: invalid grant %d\n", (int) grant);
return(EINVAL);
}
granter_proc = proc_addr(proc_nr);
/* If the granter has a temporary grant table, always allow
* requests with unspecified access and return ENOTREADY if
* no grant table is present or if the grantee's endpoint is not
* the endpoint the table belongs to. When ENOTREADY is returned
* the same verify_grant() request will be replayed again in a
* while until the grant table is final. This is necessary to
* avoid races at live update time.
*/
if(priv(granter_proc)->s_grant_endpoint != granter_proc->p_endpoint) {
if(!access) {
return OK;
}
else if(!HASGRANTTABLE(granter_proc) || grantee != priv(granter_proc)->s_grant_endpoint) {
return ENOTREADY;
}
}
/* If there is no priv. structure, or no grant table in the
* priv. structure, or the grant table in the priv. structure
* is too small for the grant, return EPERM.
*/
if(!HASGRANTTABLE(granter_proc)) {
printf(
"grant verify failed: granter %d has no grant table\n",
granter);
return(EPERM);
}
grant_idx = GRANT_IDX(grant);
grant_seq = GRANT_SEQ(grant);
if(priv(granter_proc)->s_grant_entries <= grant_idx) {
printf(
"verify_grant: grant verify failed in ep %d "
"proc %d: grant 0x%x (#%d) out of range "
"for table size %d\n",
granter, proc_nr, grant, grant_idx,
priv(granter_proc)->s_grant_entries);
return(EPERM);
}
/* Copy the grant entry corresponding to this ID's index to see
* what it looks like. If it fails, hide the fact that granter
* has (presumably) set an invalid grant table entry by
* returning EPERM, just like with an invalid grant id.
*/
if(data_copy(granter, priv(granter_proc)->s_grant_table +
sizeof(g) * grant_idx,
KERNEL, (vir_bytes) &g, sizeof(g)) != OK) {
printf(
"verify_grant: grant verify: data_copy failed\n");
return EPERM;
}
/* Check validity: flags and sequence number. */
if((g.cp_flags & (CPF_USED | CPF_VALID)) !=
(CPF_USED | CPF_VALID)) {
printf("verify_grant: grant failed: invalid flags "
"(0x%x, 0x%lx)\n", grant, g.cp_flags);
return EPERM;
}
if (g.cp_seq != grant_seq) {
printf("verify_grant: grant failed: invalid sequence "
"(0x%x, %d vs %d)\n", grant, grant_seq, g.cp_seq);
return EPERM;
}
/* The given grant may be an indirect grant, that is, a grant
* that provides permission to use a grant given to the
* granter (i.e., for which it is the grantee). This can lead
* to a chain of indirect grants which must be followed back.
*/
if((g.cp_flags & CPF_INDIRECT)) {
/* Stop after a few iterations. There may be a loop. */
if (depth == MAX_INDIRECT_DEPTH) {
printf(
"verify grant: indirect grant verify "
"failed: exceeded maximum depth\n");
return ELOOP;
}
depth++;
/* Verify actual grantee. */
if(g.cp_u.cp_indirect.cp_who_to != grantee &&
grantee != ANY &&
g.cp_u.cp_indirect.cp_who_to != ANY) {
printf(
"verify_grant: indirect grant verify "
"failed: bad grantee\n");
return EPERM;
}
/* Start over with new granter, grant, and grantee. */
grantee = granter;
granter = g.cp_u.cp_indirect.cp_who_from;
grant = g.cp_u.cp_indirect.cp_grant;
}
} while(g.cp_flags & CPF_INDIRECT);
/* Check access of grant. */
if(((g.cp_flags & access) != access)) {
printf(
"verify_grant: grant verify failed: access invalid; want 0x%x, have 0x%x\n",
access, g.cp_flags);
return EPERM;
}
if((g.cp_flags & CPF_DIRECT)) {
/* Don't fiddle around with grants that wrap, arithmetic
* below may be confused.
*/
if(MEM_TOP - g.cp_u.cp_direct.cp_len + 1 <
g.cp_u.cp_direct.cp_start) {
printf(
"verify_grant: direct grant verify failed: len too long\n");
return EPERM;
}
/* Verify actual grantee. */
if(g.cp_u.cp_direct.cp_who_to != grantee && grantee != ANY
&& g.cp_u.cp_direct.cp_who_to != ANY) {
printf(
"verify_grant: direct grant verify failed: bad grantee\n");
return EPERM;
}
/* Verify actual copy range. */
if((offset_in+bytes < offset_in) ||
offset_in+bytes > g.cp_u.cp_direct.cp_len) {
printf(
"verify_grant: direct grant verify failed: bad size or range. "
"granted %d bytes @ 0x%lx; wanted %d bytes @ 0x%lx\n",
g.cp_u.cp_direct.cp_len,
g.cp_u.cp_direct.cp_start,
bytes, offset_in);
return EPERM;
}
/* Verify successful - tell caller what address it is. */
*offset_result = g.cp_u.cp_direct.cp_start + offset_in;
*e_granter = granter;
} else if(g.cp_flags & CPF_MAGIC) {
/* Currently, it is hardcoded that only FS may do
* magic grants.
*/
if(granter != VFS_PROC_NR) {
printf(
"verify_grant: magic grant verify failed: granter (%d) "
"is not FS (%d)\n", granter, VFS_PROC_NR);
return EPERM;
}
/* Verify actual grantee. */
if(g.cp_u.cp_magic.cp_who_to != grantee && grantee != ANY
&& g.cp_u.cp_direct.cp_who_to != ANY) {
printf(
"verify_grant: magic grant verify failed: bad grantee\n");
return EPERM;
}
/* Verify actual copy range. */
if((offset_in+bytes < offset_in) ||
offset_in+bytes > g.cp_u.cp_magic.cp_len) {
printf(
"verify_grant: magic grant verify failed: bad size or range. "
"granted %d bytes @ 0x%lx; wanted %d bytes @ 0x%lx\n",
g.cp_u.cp_magic.cp_len,
g.cp_u.cp_magic.cp_start,
bytes, offset_in);
return EPERM;
}
/* Verify successful - tell caller what address it is. */
*offset_result = g.cp_u.cp_magic.cp_start + offset_in;
*e_granter = g.cp_u.cp_magic.cp_who_from;
} else {
printf(
"verify_grant: grant verify failed: unknown grant type\n");
return EPERM;
}
/* If requested, store information regarding soft faults. */
if (sfinfo != NULL && (sfinfo->try = !!(g.cp_flags & CPF_TRY))) {
sfinfo->endpt = granter;
sfinfo->addr = priv(granter_proc)->s_grant_table +
sizeof(g) * grant_idx + offsetof(cp_grant_t, cp_faulted);
sfinfo->value = grant;
}
return OK;
}
/*===========================================================================*
* safecopy *
*===========================================================================*/
static int safecopy(
struct proc * caller,
endpoint_t granter,
endpoint_t grantee,
cp_grant_id_t grantid,
size_t bytes,
vir_bytes g_offset,
vir_bytes addr,
int access /* CPF_READ for a copy from granter to grantee, CPF_WRITE
* for a copy from grantee to granter.
*/
)
{
static struct vir_addr v_src, v_dst;
static vir_bytes v_offset;
endpoint_t new_granter, *src, *dst;
struct proc *granter_p;
int r;
struct cp_sfinfo sfinfo;
#if PERF_USE_COW_SAFECOPY
vir_bytes size;
#endif
if(granter == NONE || grantee == NONE) {
printf("safecopy: nonsense processes\n");
return EFAULT;
}
/* Decide who is src and who is dst. */
if(access & CPF_READ) {
src = &granter;
dst = &grantee;
} else {
src = &grantee;
dst = &granter;
}
/* Verify permission exists. */
if((r=verify_grant(granter, grantee, grantid, bytes, access,
g_offset, &v_offset, &new_granter, &sfinfo)) != OK) {
if(r == ENOTREADY) return r;
printf(
"grant %d verify to copy %d->%d by %d failed: err %d\n",
grantid, *src, *dst, grantee, r);
return r;
}
/* verify_grant() can redirect the grantee to someone else,
* meaning the source or destination changes.
*/
granter = new_granter;
/* Now it's a regular copy. */
v_src.proc_nr_e = *src;
v_dst.proc_nr_e = *dst;
/* Now the offset in virtual addressing is known in 'offset'.
* Depending on the access, this is the source or destination
* address.
*/
if(access & CPF_READ) {
v_src.offset = v_offset;
v_dst.offset = (vir_bytes) addr;
} else {
v_src.offset = (vir_bytes) addr;
v_dst.offset = v_offset;
}
/* Do the regular copy. */
if (sfinfo.try) {
/*
* Try copying without transparently faulting in pages.
* TODO: while CPF_TRY is meant to protect against deadlocks on
* memory-mapped files in file systems, it seems that this case
* triggers faults a whole lot more often, resulting in extra
* overhead due to retried file system operations. It might be
* a good idea to go through VM even in this case, and have VM
* fail (only) if the affected page belongs to a file mapping.
*/
r = virtual_copy(&v_src, &v_dst, bytes);
if (r == EFAULT_SRC || r == EFAULT_DST) {
/*
* Mark the magic grant as having experienced a soft
* fault during its lifetime. The exact value does not
* matter, but we use the grant ID (including its
* sequence number) as a form of protection in the
* light of CPU concurrency.
*/
r = data_copy(KERNEL, (vir_bytes)&sfinfo.value,
sfinfo.endpt, sfinfo.addr, sizeof(sfinfo.value));
/*
* Failure means the creator of the magic grant messed
* up, which can only be unintentional, so report..
*/
if (r != OK)
printf("Kernel: writing soft fault marker %d "
"into %d at 0x%lx failed (%d)\n",
sfinfo.value, sfinfo.endpt, sfinfo.addr,
r);
return EFAULT;
}
return r;
}
return virtual_copy_vmcheck(caller, &v_src, &v_dst, bytes);
}
/*===========================================================================*
* do_safecopy_to *
*===========================================================================*/
int do_safecopy_to(struct proc * caller, message * m_ptr)
{
return safecopy(caller, m_ptr->m_lsys_kern_safecopy.from_to, caller->p_endpoint,
(cp_grant_id_t) m_ptr->m_lsys_kern_safecopy.gid,
m_ptr->m_lsys_kern_safecopy.bytes, m_ptr->m_lsys_kern_safecopy.offset,
(vir_bytes) m_ptr->m_lsys_kern_safecopy.address, CPF_WRITE);
}
/*===========================================================================*
* do_safecopy_from *
*===========================================================================*/
int do_safecopy_from(struct proc * caller, message * m_ptr)
{
return safecopy(caller, m_ptr->m_lsys_kern_safecopy.from_to, caller->p_endpoint,
(cp_grant_id_t) m_ptr->m_lsys_kern_safecopy.gid,
m_ptr->m_lsys_kern_safecopy.bytes, m_ptr->m_lsys_kern_safecopy.offset,
(vir_bytes) m_ptr->m_lsys_kern_safecopy.address, CPF_READ);
}
/*===========================================================================*
* do_vsafecopy *
*===========================================================================*/
int do_vsafecopy(struct proc * caller, message * m_ptr)
{
static struct vscp_vec vec[SCPVEC_NR];
static struct vir_addr src, dst;
int r, i, els;
size_t bytes;
/* Set vector copy parameters. */
src.proc_nr_e = caller->p_endpoint;
assert(src.proc_nr_e != NONE);
src.offset = (vir_bytes) m_ptr->m_lsys_kern_vsafecopy.vec_addr;
dst.proc_nr_e = KERNEL;
dst.offset = (vir_bytes) vec;
/* No. of vector elements. */
els = m_ptr->m_lsys_kern_vsafecopy.vec_size;
bytes = els * sizeof(struct vscp_vec);
/* Obtain vector of copies. */
if((r=virtual_copy_vmcheck(caller, &src, &dst, bytes)) != OK)
return r;
/* Perform safecopies. */
for(i = 0; i < els; i++) {
int access;
endpoint_t granter;
if(vec[i].v_from == SELF) {
access = CPF_WRITE;
granter = vec[i].v_to;
} else if(vec[i].v_to == SELF) {
access = CPF_READ;
granter = vec[i].v_from;
} else {
printf("vsafecopy: %d: element %d/%d: no SELF found\n",
caller->p_endpoint, i, els);
return EINVAL;
}
/* Do safecopy for this element. */
if((r=safecopy(caller, granter, caller->p_endpoint,
vec[i].v_gid,
vec[i].v_bytes, vec[i].v_offset,
vec[i].v_addr, access)) != OK) {
return r;
}
}
return OK;
}