434 lines
		
	
	
		
			10 KiB
		
	
	
	
		
			C
		
	
	
	
			
		
		
	
	
			434 lines
		
	
	
		
			10 KiB
		
	
	
	
		
			C
		
	
	
	
/*
 | 
						|
 * QEMU posix-aio emulation
 | 
						|
 *
 | 
						|
 * Copyright IBM, Corp. 2008
 | 
						|
 *
 | 
						|
 * Authors:
 | 
						|
 *  Anthony Liguori   <aliguori@us.ibm.com>
 | 
						|
 *
 | 
						|
 * This work is licensed under the terms of the GNU GPL, version 2.  See
 | 
						|
 * the COPYING file in the top-level directory.
 | 
						|
 *
 | 
						|
 */
 | 
						|
 | 
						|
#include <sys/ioctl.h>
 | 
						|
#include <pthread.h>
 | 
						|
#include <unistd.h>
 | 
						|
#include <errno.h>
 | 
						|
#include <time.h>
 | 
						|
#include <string.h>
 | 
						|
#include <stdlib.h>
 | 
						|
#include <stdio.h>
 | 
						|
#include "osdep.h"
 | 
						|
#include "qemu-common.h"
 | 
						|
 | 
						|
#include "posix-aio-compat.h"
 | 
						|
 | 
						|
static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
 | 
						|
static pthread_cond_t cond = PTHREAD_COND_INITIALIZER;
 | 
						|
static pthread_t thread_id;
 | 
						|
static pthread_attr_t attr;
 | 
						|
static int max_threads = 64;
 | 
						|
static int cur_threads = 0;
 | 
						|
static int idle_threads = 0;
 | 
						|
static TAILQ_HEAD(, qemu_paiocb) request_list;
 | 
						|
 | 
						|
#ifdef HAVE_PREADV
 | 
						|
static int preadv_present = 1;
 | 
						|
#else
 | 
						|
static int preadv_present = 0;
 | 
						|
#endif
 | 
						|
 | 
						|
static void die2(int err, const char *what)
 | 
						|
{
 | 
						|
    fprintf(stderr, "%s failed: %s\n", what, strerror(err));
 | 
						|
    abort();
 | 
						|
}
 | 
						|
 | 
						|
static void die(const char *what)
 | 
						|
{
 | 
						|
    die2(errno, what);
 | 
						|
}
 | 
						|
 | 
						|
static void mutex_lock(pthread_mutex_t *mutex)
 | 
						|
{
 | 
						|
    int ret = pthread_mutex_lock(mutex);
 | 
						|
    if (ret) die2(ret, "pthread_mutex_lock");
 | 
						|
}
 | 
						|
 | 
						|
static void mutex_unlock(pthread_mutex_t *mutex)
 | 
						|
{
 | 
						|
    int ret = pthread_mutex_unlock(mutex);
 | 
						|
    if (ret) die2(ret, "pthread_mutex_unlock");
 | 
						|
}
 | 
						|
 | 
						|
static int cond_timedwait(pthread_cond_t *cond, pthread_mutex_t *mutex,
 | 
						|
                           struct timespec *ts)
 | 
						|
{
 | 
						|
    int ret = pthread_cond_timedwait(cond, mutex, ts);
 | 
						|
    if (ret && ret != ETIMEDOUT) die2(ret, "pthread_cond_timedwait");
 | 
						|
    return ret;
 | 
						|
}
 | 
						|
 | 
						|
static void cond_signal(pthread_cond_t *cond)
 | 
						|
{
 | 
						|
    int ret = pthread_cond_signal(cond);
 | 
						|
    if (ret) die2(ret, "pthread_cond_signal");
 | 
						|
}
 | 
						|
 | 
						|
static void thread_create(pthread_t *thread, pthread_attr_t *attr,
 | 
						|
                          void *(*start_routine)(void*), void *arg)
 | 
						|
{
 | 
						|
    int ret = pthread_create(thread, attr, start_routine, arg);
 | 
						|
    if (ret) die2(ret, "pthread_create");
 | 
						|
}
 | 
						|
 | 
						|
static size_t handle_aiocb_ioctl(struct qemu_paiocb *aiocb)
 | 
						|
{
 | 
						|
	int ret;
 | 
						|
 | 
						|
	ret = ioctl(aiocb->aio_fildes, aiocb->aio_ioctl_cmd, aiocb->aio_ioctl_buf);
 | 
						|
	if (ret == -1)
 | 
						|
		return -errno;
 | 
						|
 | 
						|
	/*
 | 
						|
	 * This looks weird, but the aio code only consideres a request
 | 
						|
	 * successfull if it has written the number full number of bytes.
 | 
						|
	 *
 | 
						|
	 * Now we overload aio_nbytes as aio_ioctl_cmd for the ioctl command,
 | 
						|
	 * so in fact we return the ioctl command here to make posix_aio_read()
 | 
						|
	 * happy..
 | 
						|
	 */
 | 
						|
	return aiocb->aio_nbytes;
 | 
						|
}
 | 
						|
 | 
						|
#ifdef HAVE_PREADV
 | 
						|
 | 
						|
static ssize_t
 | 
						|
qemu_preadv(int fd, const struct iovec *iov, int nr_iov, off_t offset)
 | 
						|
{
 | 
						|
    return preadv(fd, iov, nr_iov, offset);
 | 
						|
}
 | 
						|
 | 
						|
static ssize_t
 | 
						|
qemu_pwritev(int fd, const struct iovec *iov, int nr_iov, off_t offset)
 | 
						|
{
 | 
						|
    return pwritev(fd, iov, nr_iov, offset);
 | 
						|
}
 | 
						|
 | 
						|
#else
 | 
						|
 | 
						|
static ssize_t
 | 
						|
qemu_preadv(int fd, const struct iovec *iov, int nr_iov, off_t offset)
 | 
						|
{
 | 
						|
    return -ENOSYS;
 | 
						|
}
 | 
						|
 | 
						|
static ssize_t
 | 
						|
qemu_pwritev(int fd, const struct iovec *iov, int nr_iov, off_t offset)
 | 
						|
{
 | 
						|
    return -ENOSYS;
 | 
						|
}
 | 
						|
 | 
						|
#endif
 | 
						|
 | 
						|
/*
 | 
						|
 * Check if we need to copy the data in the aiocb into a new
 | 
						|
 * properly aligned buffer.
 | 
						|
 */
 | 
						|
static int aiocb_needs_copy(struct qemu_paiocb *aiocb)
 | 
						|
{
 | 
						|
    if (aiocb->aio_flags & QEMU_AIO_SECTOR_ALIGNED) {
 | 
						|
        int i;
 | 
						|
 | 
						|
        for (i = 0; i < aiocb->aio_niov; i++)
 | 
						|
            if ((uintptr_t) aiocb->aio_iov[i].iov_base % 512)
 | 
						|
                return 1;
 | 
						|
    }
 | 
						|
 | 
						|
    return 0;
 | 
						|
}
 | 
						|
 | 
						|
static size_t handle_aiocb_rw_vector(struct qemu_paiocb *aiocb)
 | 
						|
{
 | 
						|
    size_t offset = 0;
 | 
						|
    ssize_t len;
 | 
						|
 | 
						|
    do {
 | 
						|
        if (aiocb->aio_type == QEMU_PAIO_WRITE)
 | 
						|
            len = qemu_pwritev(aiocb->aio_fildes,
 | 
						|
                               aiocb->aio_iov,
 | 
						|
                               aiocb->aio_niov,
 | 
						|
                               aiocb->aio_offset + offset);
 | 
						|
         else
 | 
						|
            len = qemu_preadv(aiocb->aio_fildes,
 | 
						|
                              aiocb->aio_iov,
 | 
						|
                              aiocb->aio_niov,
 | 
						|
                              aiocb->aio_offset + offset);
 | 
						|
    } while (len == -1 && errno == EINTR);
 | 
						|
 | 
						|
    if (len == -1)
 | 
						|
        return -errno;
 | 
						|
    return len;
 | 
						|
}
 | 
						|
 | 
						|
static size_t handle_aiocb_rw_linear(struct qemu_paiocb *aiocb, char *buf)
 | 
						|
{
 | 
						|
    size_t offset = 0;
 | 
						|
    size_t len;
 | 
						|
 | 
						|
    while (offset < aiocb->aio_nbytes) {
 | 
						|
         if (aiocb->aio_type == QEMU_PAIO_WRITE)
 | 
						|
             len = pwrite(aiocb->aio_fildes,
 | 
						|
                          (const char *)buf + offset,
 | 
						|
                          aiocb->aio_nbytes - offset,
 | 
						|
                          aiocb->aio_offset + offset);
 | 
						|
         else
 | 
						|
             len = pread(aiocb->aio_fildes,
 | 
						|
                         buf + offset,
 | 
						|
                         aiocb->aio_nbytes - offset,
 | 
						|
                         aiocb->aio_offset + offset);
 | 
						|
 | 
						|
         if (len == -1 && errno == EINTR)
 | 
						|
             continue;
 | 
						|
         else if (len == -1) {
 | 
						|
             offset = -errno;
 | 
						|
             break;
 | 
						|
         } else if (len == 0)
 | 
						|
             break;
 | 
						|
 | 
						|
         offset += len;
 | 
						|
    }
 | 
						|
 | 
						|
    return offset;
 | 
						|
}
 | 
						|
 | 
						|
static size_t handle_aiocb_rw(struct qemu_paiocb *aiocb)
 | 
						|
{
 | 
						|
    size_t nbytes;
 | 
						|
    char *buf;
 | 
						|
 | 
						|
    if (!aiocb_needs_copy(aiocb)) {
 | 
						|
        /*
 | 
						|
         * If there is just a single buffer, and it is properly aligned
 | 
						|
         * we can just use plain pread/pwrite without any problems.
 | 
						|
         */
 | 
						|
        if (aiocb->aio_niov == 1)
 | 
						|
             return handle_aiocb_rw_linear(aiocb, aiocb->aio_iov->iov_base);
 | 
						|
 | 
						|
        /*
 | 
						|
         * We have more than one iovec, and all are properly aligned.
 | 
						|
         *
 | 
						|
         * Try preadv/pwritev first and fall back to linearizing the
 | 
						|
         * buffer if it's not supported.
 | 
						|
         */
 | 
						|
	if (preadv_present) {
 | 
						|
            nbytes = handle_aiocb_rw_vector(aiocb);
 | 
						|
            if (nbytes == aiocb->aio_nbytes)
 | 
						|
	        return nbytes;
 | 
						|
            if (nbytes < 0 && nbytes != -ENOSYS)
 | 
						|
                return nbytes;
 | 
						|
            preadv_present = 0;
 | 
						|
        }
 | 
						|
 | 
						|
        /*
 | 
						|
         * XXX(hch): short read/write.  no easy way to handle the reminder
 | 
						|
         * using these interfaces.  For now retry using plain
 | 
						|
         * pread/pwrite?
 | 
						|
         */
 | 
						|
    }
 | 
						|
 | 
						|
    /*
 | 
						|
     * Ok, we have to do it the hard way, copy all segments into
 | 
						|
     * a single aligned buffer.
 | 
						|
     */
 | 
						|
    buf = qemu_memalign(512, aiocb->aio_nbytes);
 | 
						|
    if (aiocb->aio_type == QEMU_PAIO_WRITE) {
 | 
						|
        char *p = buf;
 | 
						|
        int i;
 | 
						|
 | 
						|
        for (i = 0; i < aiocb->aio_niov; ++i) {
 | 
						|
            memcpy(p, aiocb->aio_iov[i].iov_base, aiocb->aio_iov[i].iov_len);
 | 
						|
            p += aiocb->aio_iov[i].iov_len;
 | 
						|
        }
 | 
						|
    }
 | 
						|
 | 
						|
    nbytes = handle_aiocb_rw_linear(aiocb, buf);
 | 
						|
    if (aiocb->aio_type != QEMU_PAIO_WRITE) {
 | 
						|
        char *p = buf;
 | 
						|
        size_t count = aiocb->aio_nbytes, copy;
 | 
						|
        int i;
 | 
						|
 | 
						|
        for (i = 0; i < aiocb->aio_niov && count; ++i) {
 | 
						|
            copy = count;
 | 
						|
            if (copy > aiocb->aio_iov[i].iov_len)
 | 
						|
                copy = aiocb->aio_iov[i].iov_len;
 | 
						|
            memcpy(aiocb->aio_iov[i].iov_base, p, copy);
 | 
						|
            p     += copy;
 | 
						|
            count -= copy;
 | 
						|
        }
 | 
						|
    }
 | 
						|
    qemu_vfree(buf);
 | 
						|
 | 
						|
    return nbytes;
 | 
						|
}
 | 
						|
 | 
						|
static void *aio_thread(void *unused)
 | 
						|
{
 | 
						|
    pid_t pid;
 | 
						|
    sigset_t set;
 | 
						|
 | 
						|
    pid = getpid();
 | 
						|
 | 
						|
    /* block all signals */
 | 
						|
    if (sigfillset(&set)) die("sigfillset");
 | 
						|
    if (sigprocmask(SIG_BLOCK, &set, NULL)) die("sigprocmask");
 | 
						|
 | 
						|
    while (1) {
 | 
						|
        struct qemu_paiocb *aiocb;
 | 
						|
        size_t ret = 0;
 | 
						|
        qemu_timeval tv;
 | 
						|
        struct timespec ts;
 | 
						|
 | 
						|
        qemu_gettimeofday(&tv);
 | 
						|
        ts.tv_sec = tv.tv_sec + 10;
 | 
						|
        ts.tv_nsec = 0;
 | 
						|
 | 
						|
        mutex_lock(&lock);
 | 
						|
 | 
						|
        while (TAILQ_EMPTY(&request_list) &&
 | 
						|
               !(ret == ETIMEDOUT)) {
 | 
						|
            ret = cond_timedwait(&cond, &lock, &ts);
 | 
						|
        }
 | 
						|
 | 
						|
        if (TAILQ_EMPTY(&request_list))
 | 
						|
            break;
 | 
						|
 | 
						|
        aiocb = TAILQ_FIRST(&request_list);
 | 
						|
        TAILQ_REMOVE(&request_list, aiocb, node);
 | 
						|
        aiocb->active = 1;
 | 
						|
        idle_threads--;
 | 
						|
        mutex_unlock(&lock);
 | 
						|
 | 
						|
        switch (aiocb->aio_type) {
 | 
						|
        case QEMU_PAIO_READ:
 | 
						|
        case QEMU_PAIO_WRITE:
 | 
						|
		ret = handle_aiocb_rw(aiocb);
 | 
						|
		break;
 | 
						|
        case QEMU_PAIO_IOCTL:
 | 
						|
		ret = handle_aiocb_ioctl(aiocb);
 | 
						|
		break;
 | 
						|
	default:
 | 
						|
		fprintf(stderr, "invalid aio request (0x%x)\n", aiocb->aio_type);
 | 
						|
		ret = -EINVAL;
 | 
						|
		break;
 | 
						|
	}
 | 
						|
 | 
						|
        mutex_lock(&lock);
 | 
						|
        aiocb->ret = ret;
 | 
						|
        idle_threads++;
 | 
						|
        mutex_unlock(&lock);
 | 
						|
 | 
						|
        if (kill(pid, aiocb->ev_signo)) die("kill failed");
 | 
						|
    }
 | 
						|
 | 
						|
    idle_threads--;
 | 
						|
    cur_threads--;
 | 
						|
    mutex_unlock(&lock);
 | 
						|
 | 
						|
    return NULL;
 | 
						|
}
 | 
						|
 | 
						|
static void spawn_thread(void)
 | 
						|
{
 | 
						|
    cur_threads++;
 | 
						|
    idle_threads++;
 | 
						|
    thread_create(&thread_id, &attr, aio_thread, NULL);
 | 
						|
}
 | 
						|
 | 
						|
int qemu_paio_init(struct qemu_paioinit *aioinit)
 | 
						|
{
 | 
						|
    int ret;
 | 
						|
 | 
						|
    ret = pthread_attr_init(&attr);
 | 
						|
    if (ret) die2(ret, "pthread_attr_init");
 | 
						|
 | 
						|
    ret = pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_DETACHED);
 | 
						|
    if (ret) die2(ret, "pthread_attr_setdetachstate");
 | 
						|
 | 
						|
    TAILQ_INIT(&request_list);
 | 
						|
 | 
						|
    return 0;
 | 
						|
}
 | 
						|
 | 
						|
static int qemu_paio_submit(struct qemu_paiocb *aiocb, int type)
 | 
						|
{
 | 
						|
    aiocb->aio_type = type;
 | 
						|
    aiocb->ret = -EINPROGRESS;
 | 
						|
    aiocb->active = 0;
 | 
						|
    mutex_lock(&lock);
 | 
						|
    if (idle_threads == 0 && cur_threads < max_threads)
 | 
						|
        spawn_thread();
 | 
						|
    TAILQ_INSERT_TAIL(&request_list, aiocb, node);
 | 
						|
    mutex_unlock(&lock);
 | 
						|
    cond_signal(&cond);
 | 
						|
 | 
						|
    return 0;
 | 
						|
}
 | 
						|
 | 
						|
int qemu_paio_read(struct qemu_paiocb *aiocb)
 | 
						|
{
 | 
						|
    return qemu_paio_submit(aiocb, QEMU_PAIO_READ);
 | 
						|
}
 | 
						|
 | 
						|
int qemu_paio_write(struct qemu_paiocb *aiocb)
 | 
						|
{
 | 
						|
    return qemu_paio_submit(aiocb, QEMU_PAIO_WRITE);
 | 
						|
}
 | 
						|
 | 
						|
int qemu_paio_ioctl(struct qemu_paiocb *aiocb)
 | 
						|
{
 | 
						|
    return qemu_paio_submit(aiocb, QEMU_PAIO_IOCTL);
 | 
						|
}
 | 
						|
 | 
						|
ssize_t qemu_paio_return(struct qemu_paiocb *aiocb)
 | 
						|
{
 | 
						|
    ssize_t ret;
 | 
						|
 | 
						|
    mutex_lock(&lock);
 | 
						|
    ret = aiocb->ret;
 | 
						|
    mutex_unlock(&lock);
 | 
						|
 | 
						|
    return ret;
 | 
						|
}
 | 
						|
 | 
						|
int qemu_paio_error(struct qemu_paiocb *aiocb)
 | 
						|
{
 | 
						|
    ssize_t ret = qemu_paio_return(aiocb);
 | 
						|
 | 
						|
    if (ret < 0)
 | 
						|
        ret = -ret;
 | 
						|
    else
 | 
						|
        ret = 0;
 | 
						|
 | 
						|
    return ret;
 | 
						|
}
 | 
						|
 | 
						|
int qemu_paio_cancel(int fd, struct qemu_paiocb *aiocb)
 | 
						|
{
 | 
						|
    int ret;
 | 
						|
 | 
						|
    mutex_lock(&lock);
 | 
						|
    if (!aiocb->active) {
 | 
						|
        TAILQ_REMOVE(&request_list, aiocb, node);
 | 
						|
        aiocb->ret = -ECANCELED;
 | 
						|
        ret = QEMU_PAIO_CANCELED;
 | 
						|
    } else if (aiocb->ret == -EINPROGRESS)
 | 
						|
        ret = QEMU_PAIO_NOTCANCELED;
 | 
						|
    else
 | 
						|
        ret = QEMU_PAIO_ALLDONE;
 | 
						|
    mutex_unlock(&lock);
 | 
						|
 | 
						|
    return ret;
 | 
						|
}
 |