Fix soft faults in FSes resulting in partial I/O

In order to resolve page faults on file-mapped pages, VM may need to
communicate (through VFS) with a file system.  The file system must
therefore not be the one to cause, and thus end up being blocked on,
such page faults.  To resolve this potential deadlock, the safecopy
system was previously extended with the CPF_TRY flag, which causes the
kernel to return EFAULT to the caller of a safecopy function upon
getting a pagefault, bypassing VM and thus avoiding the loop.  VFS was
extended to repeat relevant file system calls that returned EFAULT,
after resolving the page fault, to keep these soft faults from being
exposed to applications.

However, general UNIX I/O semantics dictate that if an I/O transfer
partially succeeded before running into a failure, the partial result
is to be returned.  Proper file system implementations may therefore
end up returning partial success rather than the EFAULT code resulting
from a soft fault.  Since VFS does not get the EFAULT code in this
case, it does not know that a soft fault occurred, and thus does not
repeat the call either.  The end result is that an application may get
partial I/O results (e.g., a short read(2)) even on regular files.
Applications cannot reasonably be expected to deal with this.

Due to the fact that most of the current file system implementations
do not implement proper partial-failure semantics, this problem is not
yet widespread.  In fact, it has only occurred on direct block device
I/O so far.  However, the next generation of file system services will
be implementing proper I/O semantics, thus exacerbating the problem.

To remedy this situation, this patch changes the CPF_TRY semantics:
whenever the kernel experiences a soft fault during a safecopy call,
in addition to returning EFAULT, the kernel also stores a mark in the
grant created with CPF_TRY.  Instead of testing on EFAULT, VFS checks
whether the grant was marked, as part of revoking the grant.  If the
grant was indeed marked by the kernel, VFS repeats the file system
operation, regardless of its initial return value.  Thus, the EFAULT
code now only serves to make the file system fail the call faster.

The approach is currently supported for both direct and magic grants,
but is used only with magic grants - arguably the only case where it
makes sense.  Indirect grants should not have CPF_TRY set; in a chain
of indirect grants, the original grant is marked, as it should be.
In order to avoid potential SMP issues, the mark stored in the grant
is its grant identifier, so as to discard outdated kernel writes.
Whether this is necessary or effective remains to be evaluated.

This patch also cleans up the grant structure a bit, removing reserved
space and thus making the structure slightly smaller.  The structure
is used internally between system services only, so there is no need
for binary compatibility.

Change-Id: I6bb3990dce67a80146d954546075ceda4d6567f8
This commit is contained in:
David van Moolenbroek 2015-12-30 13:17:42 +00:00 committed by Lionel Sambuc
parent efc775b4c8
commit 10b7016b5a
9 changed files with 200 additions and 41 deletions

View File

@ -8,20 +8,19 @@
typedef struct {
int cp_flags; /* CPF_* below */
int cp_seq; /* sequence number */
union ixfer_cp_u {
struct {
/* CPF_DIRECT */
endpoint_t cp_who_to; /* grantee */
vir_bytes cp_start; /* memory */
size_t cp_len; /* size in bytes */
char cp_reserved[8]; /* future use */
} cp_direct;
struct {
/* CPF_INDIRECT */
endpoint_t cp_who_to; /* grantee */
endpoint_t cp_who_from; /* previous granter */
cp_grant_id_t cp_grant; /* previous grant */
char cp_reserved[8];/* future use */
} cp_indirect;
struct {
/* CPF_MAGIC */
@ -29,15 +28,13 @@ typedef struct {
endpoint_t cp_who_to; /* grantee */
vir_bytes cp_start; /* memory */
size_t cp_len; /* size in bytes */
char cp_reserved[8]; /* future use */
} cp_magic;
struct {
/* (free slot) */
int cp_next; /* next free or -1 */
} cp_free;
} cp_u;
int cp_seq; /* sequence number */
char cp_reserved[4]; /* future use */
cp_grant_id_t cp_faulted; /* soft fault marker (CPF_TRY only) */
} cp_grant_t;
/* Vectored safecopy. */
@ -45,7 +42,6 @@ struct vscp_vec {
/* Exactly one of the following must be SELF. */
endpoint_t v_from; /* source */
endpoint_t v_to; /* destination */
cp_grant_id_t v_gid; /* grant id of other process */
size_t v_offset; /* offset in other grant */
vir_bytes v_addr; /* address in copier's space */
@ -78,6 +74,9 @@ struct vscp_vec {
#define CPF_MAGIC 0x000800 /* Grant from any to any. */
#define CPF_VALID 0x001000 /* Grant slot contains valid grant. */
/* Special cpf_revoke() return values. */
#define GRANT_FAULTED 1 /* CPF_TRY: a soft fault occurred */
/* Prototypes for functions in libsys. */
cp_grant_id_t cpf_grant_direct(endpoint_t, vir_bytes, size_t, int);
cp_grant_id_t cpf_grant_indirect(endpoint_t, endpoint_t, cp_grant_id_t);

View File

@ -156,8 +156,9 @@ void hook_ipc_clear(struct proc *proc);
#endif
/* system/do_safecopy.c */
struct cp_sfinfo; /* external callers may only provide NULL */
int verify_grant(endpoint_t, endpoint_t, cp_grant_id_t, vir_bytes, int,
vir_bytes, vir_bytes *, endpoint_t *, u32_t *);
vir_bytes, vir_bytes *, endpoint_t *, struct cp_sfinfo *);
/* system/do_diagctl.c */
int do_diagctl(struct proc * caller, message *m);

View File

@ -29,6 +29,13 @@ static int safecopy(struct proc *, endpoint_t, endpoint_t,
#define HASGRANTTABLE(gr) \
(priv(gr) && priv(gr)->s_grant_table)
/*
 * Information for handling soft faults on grants created with CPF_TRY.
 * Filled in by verify_grant() and consumed by safecopy(): when a copy on a
 * CPF_TRY grant hits a page fault, the kernel writes 'value' into the grant
 * slot at 'addr' in the address space of 'endpt', so that the granter can
 * later detect (at revocation time) that a soft fault occurred.
 */
struct cp_sfinfo { /* information for handling soft faults */
int try; /* if nonzero, try copy only, stop on fault (CPF_TRY was set) */
endpoint_t endpt; /* endpoint owning grant with CPF_TRY flag */
vir_bytes addr; /* address of the grant's cp_faulted field, to mark on fault */
cp_grant_id_t value; /* grant ID to use as mark value to write */
};
/*===========================================================================*
* verify_grant *
*===========================================================================*/
@ -41,7 +48,7 @@ int verify_grant(
vir_bytes offset_in, /* copy offset within grant */
vir_bytes *offset_result, /* copy offset within virtual address space */
endpoint_t *e_granter, /* new granter (magic grants) */
u32_t *flags /* CPF_* */
struct cp_sfinfo *sfinfo /* storage for soft fault information */
)
{
cp_grant_t g;
@ -120,8 +127,6 @@ int verify_grant(
return EPERM;
}
if(flags) *flags = g.cp_flags;
/* Check validity: flags and sequence number. */
if((g.cp_flags & (CPF_USED | CPF_VALID)) !=
(CPF_USED | CPF_VALID)) {
@ -250,6 +255,14 @@ int verify_grant(
return EPERM;
}
/* If requested, store information regarding soft faults. */
if (sfinfo != NULL && (sfinfo->try = !!(g.cp_flags & CPF_TRY))) {
sfinfo->endpt = granter;
sfinfo->addr = priv(granter_proc)->s_grant_table +
sizeof(g) * grant_idx + offsetof(cp_grant_t, cp_faulted);
sfinfo->value = grant;
}
return OK;
}
@ -274,7 +287,7 @@ static int safecopy(
endpoint_t new_granter, *src, *dst;
struct proc *granter_p;
int r;
u32_t flags;
struct cp_sfinfo sfinfo;
#if PERF_USE_COW_SAFECOPY
vir_bytes size;
#endif
@ -295,7 +308,7 @@ static int safecopy(
/* Verify permission exists. */
if((r=verify_grant(granter, grantee, grantid, bytes, access,
g_offset, &v_offset, &new_granter, &flags)) != OK) {
g_offset, &v_offset, &new_granter, &sfinfo)) != OK) {
if(r == ENOTREADY) return r;
printf(
"grant %d verify to copy %d->%d by %d failed: err %d\n",
@ -325,11 +338,39 @@ static int safecopy(
}
/* Do the regular copy. */
if(flags & CPF_TRY) {
int r;
/* Try copy without transparently faulting in pages. */
if (sfinfo.try) {
/*
* Try copying without transparently faulting in pages.
* TODO: while CPF_TRY is meant to protect against deadlocks on
* memory-mapped files in file systems, it seems that this case
* triggers faults a whole lot more often, resulting in extra
* overhead due to retried file system operations. It might be
* a good idea to go through VM even in this case, and have VM
* fail (only) if the affected page belongs to a file mapping.
*/
r = virtual_copy(&v_src, &v_dst, bytes);
if(r == EFAULT_SRC || r == EFAULT_DST) return EFAULT;
if (r == EFAULT_SRC || r == EFAULT_DST) {
/*
* Mark the magic grant as having experienced a soft
* fault during its lifetime. The exact value does not
* matter, but we use the grant ID (including its
* sequence number) as a form of protection in the
* light of CPU concurrency.
*/
r = data_copy(KERNEL, (vir_bytes)&sfinfo.value,
sfinfo.endpt, sfinfo.addr, sizeof(sfinfo.value));
/*
* Failure means the creator of the magic grant messed
* up, which can only be unintentional, so report..
*/
if (r != OK)
printf("Kernel: writing soft fault marker %d "
"into %d at 0x%lx failed (%d)\n",
sfinfo.value, sfinfo.endpt, sfinfo.addr,
r);
return EFAULT;
}
return r;
}
return virtual_copy_vmcheck(caller, &v_src, &v_dst, bytes);

View File

@ -152,6 +152,7 @@ cpf_grant_direct(endpoint_t who_to, vir_bytes addr, size_t bytes, int access)
grants[g].cp_u.cp_direct.cp_who_to = who_to;
grants[g].cp_u.cp_direct.cp_start = addr;
grants[g].cp_u.cp_direct.cp_len = bytes;
grants[g].cp_faulted = GRANT_INVALID;
__insn_barrier();
grants[g].cp_flags = access | CPF_DIRECT | CPF_USED | CPF_VALID;
@ -174,6 +175,7 @@ cpf_grant_indirect(endpoint_t who_to, endpoint_t who_from, cp_grant_id_t gr)
grants[g].cp_u.cp_indirect.cp_who_to = who_to;
grants[g].cp_u.cp_indirect.cp_who_from = who_from;
grants[g].cp_u.cp_indirect.cp_grant = gr;
grants[g].cp_faulted = GRANT_INVALID;
__insn_barrier();
grants[g].cp_flags = CPF_USED | CPF_INDIRECT | CPF_VALID;
@ -198,23 +200,39 @@ cpf_grant_magic(endpoint_t who_to, endpoint_t who_from,
grants[g].cp_u.cp_magic.cp_who_from = who_from;
grants[g].cp_u.cp_magic.cp_start = addr;
grants[g].cp_u.cp_magic.cp_len = bytes;
grants[g].cp_faulted = GRANT_INVALID;
__insn_barrier();
grants[g].cp_flags = CPF_USED | CPF_MAGIC | CPF_VALID | access;
return GRANT_ID(g, grants[g].cp_seq);
}
/*
* Revoke previously granted access, identified by grant ID. Return -1 on
* error, with errno set as appropriate. Return 0 on success, with one
* exception: return GRANT_FAULTED (1) if a grant was created with CPF_TRY and
* during its lifetime, a copy from or to the grant experienced a soft fault.
*/
int
cpf_revoke(cp_grant_id_t grant)
{
/* Revoke previously granted access, identified by grant id. */
int g;
int r, g;
GID_CHECK_USED(grant);
g = GRANT_IDX(grant);
/* Make grant invalid by setting flags to 0, clearing CPF_USED.
/*
* If a safecopy action on a (direct or magic) grant with the CPF_TRY
* flag failed on a soft fault, the kernel will have set the cp_faulted
* field to the grant identifier. Here, we test this and return
* GRANT_FAULTED (1) on a match.
*/
r = ((grants[g].cp_flags & CPF_TRY) &&
grants[g].cp_faulted == grant) ? GRANT_FAULTED : 0;
/*
* Make grant invalid by setting flags to 0, clearing CPF_USED.
* This invalidates the grant.
*/
grants[g].cp_flags = 0;
@ -239,7 +257,7 @@ cpf_revoke(cp_grant_id_t grant)
grants[g].cp_u.cp_free.cp_next = freelist;
freelist = g;
return 0;
return r;
}
/*

View File

@ -161,7 +161,10 @@ int fs_sendrec(endpoint_t fs_e, message *reqmp)
assert(self->w_sendrec == NULL);
return(reqmp->m_type);
r = reqmp->m_type;
if (r == ERESTART) /* ERESTART is used internally, so make sure it is.. */
r = EIO; /* ..not delivered as a result from a file system. */
return(r);
}
/*===========================================================================*

View File

@ -483,7 +483,7 @@ void revive(endpoint_t proc_e, int returned)
* it again now that I/O is done.
*/
if (GRANT_VALID(rfp->fp_grant)) {
if(cpf_revoke(rfp->fp_grant)) {
if(cpf_revoke(rfp->fp_grant) == -1) {
panic("VFS: revoke failed for grant: %d",
rfp->fp_grant);
}

View File

@ -49,7 +49,9 @@ static int req_breadwrite_actual(endpoint_t fs_e, endpoint_t user_e, dev_t dev,
/* Send/rec request */
r = fs_sendrec(fs_e, &m);
cpf_revoke(grant_id);
if (cpf_revoke(grant_id) == GRANT_FAULTED) return(ERESTART);
if (r != OK) return(r);
/* Fill in response structure */
@ -68,7 +70,7 @@ int req_breadwrite(endpoint_t fs_e, endpoint_t user_e, dev_t dev, off_t pos,
r = req_breadwrite_actual(fs_e, user_e, dev, pos, num_of_bytes,
user_addr, rw_flag, new_pos, cum_iop, CPF_TRY);
if(r == EFAULT) {
if (r == ERESTART) {
if((r=vm_vfs_procctl_handlemem(user_e, user_addr, num_of_bytes,
rw_flag == READING)) != OK) {
return r;
@ -324,7 +326,8 @@ static int req_getdents_actual(
}
r = fs_sendrec(fs_e, &m);
cpf_revoke(grant_id);
if (cpf_revoke(grant_id) == GRANT_FAULTED) return(ERESTART);
if (r == OK) {
*new_pos = m.m_fs_vfs_getdents.seek_pos;
@ -351,7 +354,9 @@ int req_getdents(
r = req_getdents_actual(fs_e, inode_nr, pos, buf, size, new_pos,
direct, CPF_TRY);
if(r == EFAULT && !direct) {
if (r == ERESTART) {
assert(!direct);
if((r=vm_vfs_procctl_handlemem(who_e, buf, size, 1)) != OK) {
return r;
}
@ -736,7 +741,8 @@ static int req_rdlink_actual(endpoint_t fs_e, ino_t inode_nr,
/* Send/rec request */
r = fs_sendrec(fs_e, &m);
cpf_revoke(grant_id);
if (cpf_revoke(grant_id) == GRANT_FAULTED) return(ERESTART);
if (r == OK) r = m.m_fs_vfs_rdlink.nbytes;
@ -756,7 +762,9 @@ int req_rdlink(endpoint_t fs_e, ino_t inode_nr, endpoint_t proc_e,
r = req_rdlink_actual(fs_e, inode_nr, proc_e, buf, len, direct,
CPF_TRY);
if(r == EFAULT && !direct) {
if (r == ERESTART) {
assert(!direct);
if((r=vm_vfs_procctl_handlemem(proc_e, buf, len, 1)) != OK) {
return r;
}
@ -854,7 +862,8 @@ static int req_readwrite_actual(endpoint_t fs_e, ino_t inode_nr, off_t pos,
/* Send/rec request */
r = fs_sendrec(fs_e, &m);
cpf_revoke(grant_id);
if (cpf_revoke(grant_id) == GRANT_FAULTED) return(ERESTART);
if (r == OK) {
/* Fill in response structure */
@ -877,9 +886,9 @@ int req_readwrite(endpoint_t fs_e, ino_t inode_nr, off_t pos,
r = req_readwrite_actual(fs_e, inode_nr, pos, rw_flag, user_e,
user_addr, num_of_bytes, new_posp, cum_iop, CPF_TRY);
if(r == EFAULT) {
if((r=vm_vfs_procctl_handlemem(user_e, (vir_bytes) user_addr, num_of_bytes,
rw_flag == READING)) != OK) {
if (r == ERESTART) {
if ((r=vm_vfs_procctl_handlemem(user_e, (vir_bytes) user_addr,
num_of_bytes, rw_flag == READING)) != OK) {
return r;
}
@ -1034,8 +1043,9 @@ static int req_slink_actual(
/* Send/rec request */
r = fs_sendrec(fs_e, &m);
cpf_revoke(gid_name);
cpf_revoke(gid_buf);
if (cpf_revoke(gid_buf) == GRANT_FAULTED) return(ERESTART);
return(r);
}
@ -1059,7 +1069,7 @@ int req_slink(
r = req_slink_actual(fs_e, inode_nr, lastc, proc_e, path_addr,
path_length, uid, gid, CPF_TRY);
if(r == EFAULT) {
if (r == ERESTART) {
if((r=vm_vfs_procctl_handlemem(proc_e, (vir_bytes) path_addr,
path_length, 0)) != OK) {
return r;
@ -1096,7 +1106,8 @@ int req_stat_actual(endpoint_t fs_e, ino_t inode_nr, endpoint_t proc_e,
/* Send/rec request */
r = fs_sendrec(fs_e, &m);
cpf_revoke(grant_id);
if (cpf_revoke(grant_id) == GRANT_FAULTED) return(ERESTART);
return(r);
}
@ -1112,7 +1123,7 @@ int req_stat(endpoint_t fs_e, ino_t inode_nr, endpoint_t proc_e,
r = req_stat_actual(fs_e, inode_nr, proc_e, buf, CPF_TRY);
if(r == EFAULT) {
if (r == ERESTART) {
if((r=vm_vfs_procctl_handlemem(proc_e, (vir_bytes) buf,
sizeof(struct stat), 1)) != OK) {
return r;

View File

@ -318,7 +318,7 @@ static void raw_xfer(dev_t minor, u64_t pos, iovec_s_t *iovec, int nr_req,
r = sendrec_driver(&m, exp, res);
if (cpf_revoke(grant) != OK)
if (cpf_revoke(grant) == -1)
panic("unable to revoke grant");
if (r != RESULT_OK)
@ -359,7 +359,7 @@ static void vir_xfer(dev_t minor, u64_t pos, iovec_t *iovec, int nr_req,
for (i = 0; i < nr_req; i++) {
iovec[i].iov_size = iov_s[i].iov_size;
if (cpf_revoke(iov_s[i].iov_grant) != OK)
if (cpf_revoke(iov_s[i].iov_grant) == -1)
panic("unable to revoke grant");
}
}
@ -1181,7 +1181,7 @@ static int vir_ioctl(dev_t minor, unsigned long req, void *ptr, ssize_t exp,
r = sendrec_driver(&m, exp, res);
if (cpf_revoke(grant) != OK)
if (cpf_revoke(grant) == -1)
panic("unable to revoke grant");
return r;

View File

@ -791,6 +791,90 @@ hole_regression(void)
close(fd);
}
/*
* Test that soft faults during file system I/O do not cause functions to
* return partial I/O results.
*
* We refer to the faults that are caused internally within the operating
* system as a result of the deadlock mitigation described at the top of this
* file, as a particular class of "soft faults". Such soft faults may occur in
* the middle of an I/O operation, and general I/O semantics dictate that upon
* partial success, the partial success is returned (and *not* an error). As a
* result, these soft faults, if not handled as special cases, may cause even
* common file system operations such as read(2) on a regular file to return
* fewer bytes than requested. Such unexpected short reads are typically not
* handled well by userland, and the OS must prevent them from occurring if it
* can. Note that read(2) is the most problematic, but certainly not the only,
* case where this problem can occur.
*
* Unfortunately, several file system services are not following the proper
* general I/O semantics - and this includes MFS. Therefore, for now, we have
* to test this case using block device I/O, which does do the right thing.
* In this test we hope that the root file system is mounted on a block device
* usable for (read-only!) testing purposes.
*/
static void
softfault_partial(void)
{
	struct statvfs stf;
	struct stat st;
	char *buf, *buf2;
	ssize_t size;
	int fd;

	if (statvfs("/", &stf) != 0) e(0);

	/*
	 * If the root file system is not mounted off a block device, or if we
	 * cannot open that device ourselves, simply skip this subtest.
	 */
	if (stat(stf.f_mntfromname, &st) != 0 || !S_ISBLK(st.st_mode))
		return; /* skip subtest */

	if ((fd = open(stf.f_mntfromname, O_RDONLY)) == -1)
		return; /* skip subtest */

	/*
	 * See if we can read in the first two full blocks, or two pages worth
	 * of data, whichever is larger.  If that fails, there is no point in
	 * continuing the test.
	 */
	size = MAX(stf.f_bsize, PAGE_SIZE) * 2;

	/*
	 * The buffer must be writable (PROT_WRITE): read(2) stores the data
	 * it reads into it.  The original code passed PROT_READ twice.
	 */
	if ((buf = mmap(NULL, size, PROT_READ | PROT_WRITE,
	    MAP_ANON | MAP_PRIVATE | MAP_PREALLOC, -1, 0)) == MAP_FAILED) e(0);

	if (read(fd, buf, size) != size) {
		munmap(buf, size);

		close(fd);

		return; /* skip subtest */
	}

	/* Rewind, and verify the seek succeeded, before the second read. */
	if (lseek(fd, 0, SEEK_SET) != 0) e(0);

	/*
	 * Now attempt a read to a partially faulted-in buffer.  The first
	 * time around, the I/O transfer will generate a fault and return
	 * partial success.  In that case, the entire I/O transfer should be
	 * retried after faulting in the missing page(s), thus resulting in
	 * the read succeeding in full.
	 *
	 * No MAP_PREALLOC here: only the first page is faulted in below, so
	 * the later pages trigger the soft fault this subtest exercises.
	 * PROT_WRITE is again required, both for the explicit store and for
	 * the read(2) into the buffer.
	 */
	if ((buf2 = mmap(NULL, size, PROT_READ | PROT_WRITE,
	    MAP_ANON | MAP_PRIVATE, -1, 0)) == MAP_FAILED) e(0);

	buf2[0] = '\0'; /* fault in the first page */

	if (read(fd, buf2, size) != size) e(0);

	/* The result should be correct, too. */
	if (memcmp(buf, buf2, size)) e(0);

	/* Clean up. */
	munmap(buf2, size);
	munmap(buf, size);

	close(fd);
}
int
main(int argc, char *argv[])
{
@ -814,6 +898,8 @@ main(int argc, char *argv[])
test_memory_types_vs_operations();
softfault_partial();
makefiles(MAXFILES);
cachequiet(!bigflag);