/* Copyright (C) 2009 Red Hat, Inc.
 * Copyright (C) 2006 Rusty Russell IBM Corporation
 * Author: Michael S. Tsirkin <mst@redhat.com>
 * (C) Badari Pulavarty pbadari@us.ibm.com 2010 with the following comment.
 * Inspiration, some code, and most witty comments come from
 * Documentation/lguest/lguest.c, by Rusty Russell
 * This work is licensed under the terms of the GNU GPL, version 2.
 *
 * (C) Copyright 2012 Intel Corporation
 * Author: Caz Yokoyama <Caz.Yokoyama@intel.com>
 *
 * Generic code for virtio server in host kernel.
 */
#include <linux/version.h>
#if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,34)) || \
        defined(RHEL_RELEASE_CODE)
#include <linux/eventfd.h>
#include "./linux/vhost.h"
#include <linux/virtio_net.h>
#include <linux/mmu_context.h>
#include <linux/miscdevice.h>
#include <linux/rcupdate.h>
#include <linux/highmem.h>
#include <linux/kthread.h>
#include <linux/cgroup.h>
#include <linux/if_packet.h>
#include <linux/if_arp.h>
#ifndef VIRTIO_RING_F_EVENT_IDX /* virtio_ring.h of rhel6.0 does not define */
#define VIRTIO_RING_F_EVENT_IDX 29
#endif
#include "mic/micveth_dma.h"
#define mic_addr_in_host(va, pa) ((u8 *)(va) + (u64)(pa))
enum {
        VHOST_MEMORY_MAX_NREGIONS = 64,
        VHOST_MEMORY_F_LOG = 0x1,
};
static unsigned vhost_zcopy_mask __read_mostly;
static void vhost_poll_func(struct file *file, wait_queue_head_t *wqh,
                            poll_table *pt)
{
        struct vhost_poll *poll;

        poll = container_of(pt, struct vhost_poll, table);
        poll->wqh = wqh;
        add_wait_queue(wqh, &poll->wait);
}
static int vhost_poll_wakeup(wait_queue_entry_t *wait, unsigned mode, int sync,
                             void *key)
{
        struct vhost_poll *poll = container_of(wait, struct vhost_poll, wait);

        if (!((unsigned long)key & poll->mask))
                return 0;

        vhost_poll_queue(poll);
        return 0;
}
static void vhost_work_init(struct vhost_work *work, vhost_work_fn_t fn)
{
        INIT_LIST_HEAD(&work->node);
        work->fn = fn;
        init_waitqueue_head(&work->done);
        work->flushing = 0;
        work->queue_seq = work->done_seq = 0;
}
/* Init poll structure */
void vhost_poll_init(struct vhost_poll *poll, vhost_work_fn_t fn,
                     unsigned long mask, struct vhost_dev *dev)
{
        init_waitqueue_func_entry(&poll->wait, vhost_poll_wakeup);
        init_poll_funcptr(&poll->table, vhost_poll_func);
        poll->mask = mask;
        poll->dev = dev;

        vhost_work_init(&poll->work, fn);
}
/* Start polling a file. We add ourselves to file's wait queue. The caller must
 * keep a reference to the file until after vhost_poll_stop is called. */
void vhost_poll_start(struct vhost_poll *poll, struct file *file)
{
        unsigned long mask;

        mask = file->f_op->poll(file, &poll->table);
        if (mask)
                vhost_poll_wakeup(&poll->wait, 0, 0, (void *)mask);
}
/* Stop polling a file. After this function returns, it becomes safe to drop the
 * file reference. You must also flush afterwards. */
void vhost_poll_stop(struct vhost_poll *poll)
{
        remove_wait_queue(poll->wqh, &poll->wait);
}
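/*
 * Usage sketch (hypothetical backend code, not part of this file): the
 * start/stop/flush contract above amounts to the following sequence;
 * "sock" and "handle_rx" are illustrative names only.
 *
 *      vhost_poll_init(&poll, handle_rx, POLLIN, dev);
 *      vhost_poll_start(&poll, sock->file);    // begin watching the fd
 *      ...                                     // work items fire on POLLIN
 *      vhost_poll_stop(&poll);                 // detach from the wait queue
 *      vhost_poll_flush(&poll);                // wait out in-flight work
 *      fput(sock->file);                       // only now drop the file ref
 */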
static bool vhost_work_seq_done(struct vhost_dev *dev, struct vhost_work *work,
                                unsigned seq)
{
        int left;

        spin_lock_irq(&dev->work_lock);
        left = seq - work->done_seq;
        spin_unlock_irq(&dev->work_lock);
        return left <= 0;
}
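/*
 * Example of the sequence arithmetic above: if the flusher sampled
 * queue_seq == 5 and the worker has only completed through done_seq == 3,
 * then left == 2 > 0 and the flush keeps waiting; once done_seq reaches 5,
 * left == 0 and the wait_event in vhost_work_flush finishes. The signed
 * subtraction keeps the comparison correct across counter wraparound.
 */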
static void vhost_work_flush(struct vhost_dev *dev, struct vhost_work *work)
{
        unsigned seq;
        int flushing;

        spin_lock_irq(&dev->work_lock);
        seq = work->queue_seq;
        work->flushing++;
        spin_unlock_irq(&dev->work_lock);
        wait_event(work->done, vhost_work_seq_done(dev, work, seq));
        spin_lock_irq(&dev->work_lock);
        flushing = --work->flushing;
        spin_unlock_irq(&dev->work_lock);
        BUG_ON(flushing < 0);
}
/* Flush any work that has been scheduled. When calling this, don't hold any
 * locks that are also used by the callback. */
void vhost_poll_flush(struct vhost_poll *poll)
{
        vhost_work_flush(poll->dev, &poll->work);
}
static inline void vhost_work_queue(struct vhost_dev *dev,
                                    struct vhost_work *work)
{
        unsigned long flags;

        spin_lock_irqsave(&dev->work_lock, flags);
        if (list_empty(&work->node)) {
                list_add_tail(&work->node, &dev->work_list);
                work->queue_seq++;
                wake_up_process(dev->worker);
        }
        spin_unlock_irqrestore(&dev->work_lock, flags);
}
void vhost_poll_queue(struct vhost_poll *poll)
{
        vhost_work_queue(poll->dev, &poll->work);
}
static void vhost_vq_reset(struct vhost_dev *dev,
                           struct vhost_virtqueue *vq)
{
        vq->signalled_used_valid = false;
}
static void vhost_vq_free_iovecs(struct vhost_virtqueue *vq)
{
        kfree(vq->indirect);
        kfree(vq->heads);
        kfree(vq->ubuf_info);
}
void vhost_enable_zcopy(int vq)
{
        vhost_zcopy_mask |= 0x1 << vq;
}
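/*
 * Illustrative call (hypothetical caller, not in this file): a net
 * backend would flag its TX queue index as zerocopy-capable at module
 * init, e.g. vhost_enable_zcopy(VHOST_NET_VQ_TX), so that queue's bit
 * is set in vhost_zcopy_mask before any device is opened.
 */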
static void vhost_dev_free_iovecs(struct vhost_dev *dev)
{
        int i;

        for (i = 0; i < dev->nvqs; ++i)
                vhost_vq_free_iovecs(&dev->vqs[i]);
}
long vhost_dev_init(struct vhost_dev *dev,
                    struct vhost_virtqueue *vqs, int nvqs)
{
        int i;

        dev->vqs = vqs;
        dev->nvqs = nvqs;
        spin_lock_init(&dev->work_lock);
        INIT_LIST_HEAD(&dev->work_list);

        for (i = 0; i < dev->nvqs; ++i) {
                dev->vqs[i].dev = dev;
                dev->vqs[i].indirect = NULL;
                dev->vqs[i].heads = NULL;
                dev->vqs[i].ubuf_info = NULL;
                mutex_init(&dev->vqs[i].mutex);
                vhost_vq_reset(dev, dev->vqs + i);
                if (dev->vqs[i].handle_kick)
                        vhost_poll_init(&dev->vqs[i].poll,
                                        dev->vqs[i].handle_kick, POLLIN, dev);
        }

        return 0;
}
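/*
 * Setup sketch (hypothetical backend, illustrative names): a device with
 * RX and TX queues would embed the vqs in its private struct and hand
 * them to vhost_dev_init once at open time.
 *
 *      struct my_backend {
 *              struct vhost_dev dev;
 *              struct vhost_virtqueue vqs[2];
 *      } *n = kzalloc(sizeof(*n), GFP_KERNEL);
 *
 *      n->vqs[0].handle_kick = handle_rx_kick;
 *      n->vqs[1].handle_kick = handle_tx_kick;
 *      vhost_dev_init(&n->dev, n->vqs, 2);
 */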
/* Caller should have device mutex */
long vhost_dev_check_owner(struct vhost_dev *dev)
{
        /* Are you the owner? If not, I don't think you mean to do that */
        return dev->mm == current->mm ? 0 : -EPERM;
}
struct vhost_attach_cgroups_struct {
        struct vhost_work work;
        struct task_struct *owner;
        int ret;
};
/* Caller should have device mutex */
long vhost_dev_reset_owner(struct vhost_dev *dev)
{
        struct vhost_memory *memory;

        /* Restore memory to default empty mapping. */
        memory = kmalloc(offsetof(struct vhost_memory, regions), GFP_KERNEL);
        if (!memory)
                return -ENOMEM;

        vhost_dev_cleanup(dev);

        memory->nregions = 0;
        RCU_INIT_POINTER(dev->memory, memory);
        return 0;
}
/* In case DMA is done out of order in the lower device driver for some reason,
 * upend_idx is used to track the end of used idx, done_idx is used to track
 * the head of used idx. Once the lower device has completed DMA contiguously,
 * we signal the guest's used idx. */
int vhost_zerocopy_signal_used(struct vhost_virtqueue *vq)
{
        int i;
        int j = 0;

        for (i = vq->done_idx; i != vq->upend_idx; i = (i + 1) % UIO_MAXIOV) {
                if (vq->heads[i].len == VHOST_DMA_DONE_LEN) {
                        vq->heads[i].len = VHOST_DMA_CLEAR_LEN;
                        vhost_add_used_and_signal(vq->dev, vq,
                                                  vq->heads[i].id, 0);
                        ++j;
                } else
                        break;
        }
        if (j)
                vq->done_idx = i;
        return j;
}
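/*
 * Worked example of the scan above: suppose done_idx == 2, upend_idx == 6,
 * and completions landed out of order so heads[2..3].len are
 * VHOST_DMA_DONE_LEN but heads[4] is still in flight. The loop reports
 * entries 2 and 3 to the guest, stops at 4, and leaves done_idx == 4;
 * entry 5 is only reported after 4 completes, keeping the used ring
 * strictly in order.
 */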
/* Caller should have device mutex */
void vhost_dev_cleanup(struct vhost_dev *dev)
{
        int i;

        for (i = 0; i < dev->nvqs; ++i) {
                if (dev->vqs[i].kick && dev->vqs[i].handle_kick) {
                        vhost_poll_stop(&dev->vqs[i].poll);
                        vhost_poll_flush(&dev->vqs[i].poll);
                }
                BUG_ON(dev->vqs[i].ubufs != NULL);
                /* Signal guest as appropriate. */
                vhost_zerocopy_signal_used(&dev->vqs[i]);

                if (dev->vqs[i].error_ctx)
                        eventfd_ctx_put(dev->vqs[i].error_ctx);
                if (dev->vqs[i].call_ctx)
                        eventfd_ctx_put(dev->vqs[i].call_ctx);
                vhost_vq_reset(dev, dev->vqs + i);
        }
        vhost_dev_free_iovecs(dev);
        if (dev->log_ctx)
                eventfd_ctx_put(dev->log_ctx);
        dev->log_ctx = NULL;
        /* No one will access memory at this point. */
        WARN_ON(!list_empty(&dev->work_list));
        if (dev->worker) {
                kthread_stop(dev->worker);
                dev->worker = NULL;
        }
}
/* Caller must have device mutex */
long vhost_dev_ioctl(struct vhost_dev *d, unsigned int ioctl, unsigned long arg)
static int vhost_update_used_flags(struct vhost_virtqueue *vq)
{
        iowrite16(vq->used_flags, mic_addr_in_host(vq->log_addr, &vq->used->flags));
        return 0;
}
int vhost_init_used(struct vhost_virtqueue *vq)
{
        int r;

        r = vhost_update_used_flags(vq);
        if (r)
                return r;
        vq->signalled_used_valid = false;
        vq->last_used_idx = ioread16(mic_addr_in_host(vq->log_addr, &vq->used->idx));
        return 0;
}
/* Each buffer in the virtqueues is actually a chain of descriptors. This
 * function returns the next descriptor in the chain,
 * or -1U if we're at the end. */
static unsigned next_desc(struct vring_desc *desc)
{
        unsigned int next;

        /* If this descriptor says it doesn't chain, we're done. */
        if (!(desc->flags & VRING_DESC_F_NEXT))
                return -1U;

        /* Check they're not leading us off end of descriptors. */
        next = desc->next;
        /* Make sure compiler knows to grab that: we don't want it changing! */
        /* We will use the result as an index in an array, so most
         * architectures only need a compiler barrier here. */
        read_barrier_depends();

        return next;
}
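/*
 * Chain layout example: a request spanning three descriptors might look
 * like this in the descriptor table (indices are illustrative).
 *
 *      desc[0]: addr=hdr,  len=12,   flags=NEXT,       next=1   (out)
 *      desc[1]: addr=buf,  len=4096, flags=NEXT|WRITE, next=2   (in)
 *      desc[2]: addr=stat, len=1,    flags=WRITE                (in)
 *
 * next_desc() walks 0 -> 1 -> 2 and returns -1U at desc[2], whose flags
 * lack VRING_DESC_F_NEXT.
 */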
/* This looks in the virtqueue for the first available buffer, and converts
 * it to an iovec for convenient access. Since descriptors consist of some
 * number of output then some number of input descriptors, it's actually two
 * iovecs, but we pack them into one and note how many of each there were.
 *
 * This function returns the descriptor number found, or vq->num (which is
 * never a valid descriptor number) if none was found. A negative code is
 * returned on error. */
int vhost_get_vq_desc(struct vhost_dev *dev, struct vhost_virtqueue *vq,
                      struct iovec iov[], unsigned int iov_size,
                      unsigned int *out_num, unsigned int *in_num,
                      struct vhost_log *log, unsigned int *log_num)
{
        struct vring_desc desc;
        unsigned int i, head, found = 0;
        u16 last_avail_idx;

        /* Check it isn't doing very strange things with descriptor numbers. */
        last_avail_idx = vq->last_avail_idx;
        vq->avail_idx = ioread16(mic_addr_in_host(vq->log_addr, &vq->avail->idx));
        if (unlikely((u16)(vq->avail_idx - last_avail_idx) > vq->num)) {
                vq_err(vq, "Guest moved used index from %u to %u",
                       last_avail_idx, vq->avail_idx);
                return -EFAULT;
        }

        /* If there's nothing new since last we looked, return invalid. */
        if (vq->avail_idx == last_avail_idx)
                return vq->num;

        /* Only get avail ring entries after they have been exposed by guest. */
        smp_rmb();

        /* Grab the next descriptor number they're advertising, and increment
         * the index we've seen. */
        head = ioread16(mic_addr_in_host(vq->log_addr,
                        &vq->avail->ring[last_avail_idx % vq->num]));

        /* If their number is silly, that's an error. */
        if (unlikely(head >= vq->num)) {
                vq_err(vq, "Guest says index %u > %u is available",
                       head, vq->num);
                return -EINVAL;
        }

        /* When we start there are none of either input nor output. */
        *out_num = *in_num = 0;
        if (unlikely(log))
                *log_num = 0;

        i = head;
        do {
                unsigned iov_count = *in_num + *out_num;

                if (unlikely(i >= vq->num)) {
                        vq_err(vq, "Desc index is %u > %u, head = %u",
                               i, vq->num, head);
                        return -EINVAL;
                }
                if (unlikely(++found > vq->num)) {
                        vq_err(vq, "Loop detected: last one at %u "
                               "vq size %u head %u\n", i, vq->num, head);
                        return -EINVAL;
                }
                memcpy_fromio(&desc, mic_addr_in_host(vq->log_addr, vq->desc + i),
                              sizeof(desc));
                (iov + iov_count)->iov_base = (void *)desc.addr;
                (iov + iov_count)->iov_len = desc.len;
                if (desc.flags & VRING_DESC_F_WRITE) {
                        /* If this is an input descriptor,
                         * increment that count. */
                        *in_num += 1;
                        if (unlikely(log)) {
                                log[*log_num].addr = desc.addr;
                                log[*log_num].len = desc.len;
                                ++*log_num;
                        }
                } else {
                        /* If it's an output descriptor, they're all supposed
                         * to come before any input descriptors. */
                        if (unlikely(*in_num)) {
                                vq_err(vq, "Descriptor has out after in: "
                                       "idx %d\n", i);
                                return -EINVAL;
                        }
                        *out_num += 1;
                }
        } while ((i = next_desc(&desc)) != -1);

        /* On success, increment avail index. */
        vq->last_avail_idx++;

        /* Assume notifications from guest are disabled at this point,
         * if they aren't we would need to update avail_event index. */
        BUG_ON(!(vq->used_flags & VRING_USED_F_NO_NOTIFY));
        return head;
}
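/*
 * Usage sketch (hypothetical kick handler, illustrative names): a
 * backend typically drains the avail ring in a loop and retires each
 * buffer with vhost_add_used_and_signal().
 *
 *      head = vhost_get_vq_desc(dev, vq, vq->iov, ARRAY_SIZE(vq->iov),
 *                               &out, &in, NULL, NULL);
 *      if (head < 0)
 *              break;                  // ring error, already logged
 *      if (head == vq->num)
 *              break;                  // nothing more to do
 *      len = process(vq->iov, out, in);        // backend-specific work
 *      vhost_add_used_and_signal(dev, vq, head, len);
 */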
/* Reverse the effect of vhost_get_vq_desc. Useful for error handling. */
void vhost_discard_vq_desc(struct vhost_virtqueue *vq, int n)
{
        vq->last_avail_idx -= n;
}
/* After we've used one of their buffers, we tell them about it. We'll then
 * want to notify the guest, using eventfd. */
int vhost_add_used(struct vhost_virtqueue *vq, unsigned int head, int len)
{
        struct vring_used_elem __user *used;

        /* The virtqueue contains a ring of used buffers. Get a pointer to the
         * next entry in that used ring. */
        used = &vq->used->ring[vq->last_used_idx % vq->num];
        iowrite16(head, mic_addr_in_host(vq->log_addr, &used->id));
        iowrite16(len, mic_addr_in_host(vq->log_addr, &used->len));
        /* Make sure buffer is written before we update index. */
        ioread16(mic_addr_in_host(vq->log_addr, &used->id));
        iowrite16(vq->last_used_idx + 1,
                  mic_addr_in_host(vq->log_addr, &vq->used->idx));
        vq->last_used_idx++;
        /* If the driver never bothers to signal in a very long while,
         * used index might wrap around. If that happens, invalidate
         * signalled_used index we stored. TODO: make sure driver
         * signals at least once in 2^16 and remove this. */
        if (unlikely(vq->last_used_idx == vq->signalled_used))
                vq->signalled_used_valid = false;
        return 0;
}
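/*
 * Design note on the read-back above: MMIO writes to the remote ring are
 * posted, so reading a location back (here &used->id) forces the id/len
 * stores to complete before the index store that publishes them. This
 * plays the role the smp_wmb() does in the upstream vhost.c that works
 * on guest memory.
 */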
static int __vhost_add_used_n(struct vhost_virtqueue *vq,
                              struct vring_used_elem *heads,
                              unsigned count)
{
        struct vring_used_elem __user *used;
        u16 old, new;
        int start;

        start = vq->last_used_idx % vq->num;
        used = vq->used->ring + start;
        memcpy_toio(mic_addr_in_host(vq->log_addr, used), heads,
                    count * sizeof(*used));
        old = vq->last_used_idx;
        new = (vq->last_used_idx += count);
        /* If the driver never bothers to signal in a very long while,
         * used index might wrap around. If that happens, invalidate
         * signalled_used index we stored. TODO: make sure driver
         * signals at least once in 2^16 and remove this. */
        if (unlikely((u16)(new - vq->signalled_used) < (u16)(new - old)))
                vq->signalled_used_valid = false;
        return 0;
}
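/*
 * The u16 window test above asks whether signalled_used falls inside
 * (old, new], i.e. whether this batch stepped over the value we cached.
 * Example: old == 0xfffe, count == 4, so new == 0x0002. With
 * signalled_used == 0x0000: (u16)(new - signalled_used) == 2, while
 * (u16)(new - old) == 4, so 2 < 4 and the cached index is invalidated.
 * With signalled_used == 0x0005 (outside the window) the left side is
 * 0xfffd, the test fails, and the cache stays valid.
 */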
/* After we've used a batch of their buffers, we tell them about it. We'll then
 * want to notify the guest, using eventfd. */
int vhost_add_used_n(struct vhost_virtqueue *vq, struct vring_used_elem *heads,
                     unsigned count)
{
        int start, n, r;

        start = vq->last_used_idx % vq->num;
        n = vq->num - start;
        if (n < count) {
                /* The batch wraps the end of the ring: copy in two chunks. */
                r = __vhost_add_used_n(vq, heads, n);
                if (r < 0)
                        return r;
                heads += n;
                count -= n;
        }
        r = __vhost_add_used_n(vq, heads, count);

        /* Make sure buffer is written before we update index. */
        iowrite16(vq->last_used_idx,
                  mic_addr_in_host(vq->log_addr, &vq->used->idx));
        return r;
}
static bool vhost_notify(struct vhost_dev *dev, struct vhost_virtqueue *vq)
{
        __u16 old, new;
        __u16 event;
        __u16 flags;
        bool v;

        /* Flush out used index updates. This is paired
         * with the barrier that the Guest executes when enabling
         * interrupts. */
        smp_mb();

        if (vhost_has_feature(dev, VIRTIO_F_NOTIFY_ON_EMPTY) &&
            unlikely(vq->avail_idx == vq->last_avail_idx))
                return true;

        if (!vhost_has_feature(dev, VIRTIO_RING_F_EVENT_IDX)) {
                flags = ioread16(mic_addr_in_host(vq->log_addr, &vq->avail->flags));
                return !(flags & VRING_AVAIL_F_NO_INTERRUPT);
        }
        old = vq->signalled_used;
        v = vq->signalled_used_valid;
        new = vq->signalled_used = vq->last_used_idx;
        vq->signalled_used_valid = true;

        if (unlikely(!v))
                return true;

        /* used_event lives just past the avail ring. */
        event = ioread16(mic_addr_in_host(vq->log_addr,
                         &vq->avail->ring[vq->num]));
        return vring_need_event(event, new, old);
}
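/*
 * vring_need_event(event, new, old) (include/linux/virtio_ring.h)
 * reduces to (u16)(new - event - 1) < (u16)(new - old): signal only if
 * the guest's event index was crossed by this update. Example: old == 10,
 * new == 14, event == 12 gives 1 < 4, so we interrupt; with event == 20
 * the left side is 0xfff9 and we stay quiet.
 */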
/* This actually signals the guest, using eventfd. */
void vhost_signal(struct vhost_dev *dev, struct vhost_virtqueue *vq)
{
        /* Signal the Guest to tell them we used something up. */
        if (vq->log_base && vhost_notify(dev, vq))
                mic_send_virtio_intr((struct _mic_ctx_t *)vq->log_base);
}
/* And here's the combo meal deal. Supersize me! */
void vhost_add_used_and_signal(struct vhost_dev *dev,
                               struct vhost_virtqueue *vq,
                               unsigned int head, int len)
{
        vhost_add_used(vq, head, len);
        vhost_signal(dev, vq);
}
/* multi-buffer version of vhost_add_used_and_signal */
void vhost_add_used_and_signal_n(struct vhost_dev *dev,
                                 struct vhost_virtqueue *vq,
                                 struct vring_used_elem *heads, unsigned count)
{
        vhost_add_used_n(vq, heads, count);
        vhost_signal(dev, vq);
}
/* OK, now we need to know about added descriptors. */
bool vhost_enable_notify(struct vhost_dev *dev, struct vhost_virtqueue *vq)
{
        u16 avail_idx;
        int r;

        if (!(vq->used_flags & VRING_USED_F_NO_NOTIFY))
                return false;
        vq->used_flags &= ~VRING_USED_F_NO_NOTIFY;
        if (!vhost_has_feature(dev, VIRTIO_RING_F_EVENT_IDX)) {
                r = vhost_update_used_flags(vq);
                if (r) {
                        vq_err(vq, "Failed to enable notification at %p: %d\n",
                               &vq->used->flags, r);
                        return false;
                }
        }
        /* They could have slipped one in as we were doing that: make
         * sure it's written, then check again. */
        smp_mb();

        avail_idx = ioread16(mic_addr_in_host(vq->log_addr, &vq->avail->idx));

        return avail_idx != vq->avail_idx;
}
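/*
 * Typical pattern in a kick handler (hypothetical caller): descriptors
 * can arrive between the last vhost_get_vq_desc() and re-enabling
 * notifications, so the return value closes that race.
 *
 *      vhost_disable_notify(dev, vq);
 *      for (;;) {
 *              // ... drain the ring with vhost_get_vq_desc() ...
 *              if (!vhost_enable_notify(dev, vq))
 *                      break;          // still empty; sleep until kicked
 *              vhost_disable_notify(dev, vq);  // more slipped in, go again
 *      }
 */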
/* We don't need to be notified again. */
void vhost_disable_notify(struct vhost_dev *dev, struct vhost_virtqueue *vq)
{
        int r;

        if (vq->used_flags & VRING_USED_F_NO_NOTIFY)
                return;
        vq->used_flags |= VRING_USED_F_NO_NOTIFY;
        if (!vhost_has_feature(dev, VIRTIO_RING_F_EVENT_IDX)) {
                r = vhost_update_used_flags(vq);
                if (r)
                        vq_err(vq, "Failed to disable notification at %p: %d\n",
                               &vq->used->flags, r);
        }
}
#endif /* (LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,34)) || RHEL_RELEASE_CODE */