1526 lines
38 KiB
C
1526 lines
38 KiB
C
// SPDX-License-Identifier: GPL-2.0-only
|
|
/*
|
|
* Copyright (c) 2021-2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved
|
|
*/
|
|
|
|
#include <linux/device.h>
|
|
#include <linux/eventfd.h>
|
|
#include <linux/file.h>
|
|
#include <linux/interrupt.h>
|
|
#include <linux/iommu.h>
|
|
#include <linux/module.h>
|
|
#include <linux/mutex.h>
|
|
#include <linux/notifier.h>
|
|
#include <linux/pci.h>
|
|
#include <linux/pm_runtime.h>
|
|
#include <linux/types.h>
|
|
#include <linux/uaccess.h>
|
|
#include <linux/vfio.h>
|
|
#include <linux/sched/mm.h>
|
|
#include <linux/anon_inodes.h>
|
|
|
|
#include "cmd.h"
|
|
|
|
/*
 * Upper bound for a single LOAD image: the largest value representable in
 * the 'size' field of load_vhca_state_in.
 */
#define MAX_LOAD_SIZE (BIT_ULL(__mlx5_bit_sz(load_vhca_state_in, size)) - 1)

/* Cap used by mlx5vf_prep_stop_copy() when sizing chunk-mode buffers */
#define MAX_CHUNK_SIZE SZ_8M
|
|
|
|
static struct mlx5vf_pci_core_device *mlx5vf_drvdata(struct pci_dev *pdev)
|
|
{
|
|
struct vfio_pci_core_device *core_device = dev_get_drvdata(&pdev->dev);
|
|
|
|
return container_of(core_device, struct mlx5vf_pci_core_device,
|
|
core_device);
|
|
}
|
|
|
|
struct page *
|
|
mlx5vf_get_migration_page(struct mlx5_vhca_data_buffer *buf,
|
|
unsigned long offset)
|
|
{
|
|
unsigned long cur_offset = 0;
|
|
struct scatterlist *sg;
|
|
unsigned int i;
|
|
|
|
/* All accesses are sequential */
|
|
if (offset < buf->last_offset || !buf->last_offset_sg) {
|
|
buf->last_offset = 0;
|
|
buf->last_offset_sg = buf->table.sgt.sgl;
|
|
buf->sg_last_entry = 0;
|
|
}
|
|
|
|
cur_offset = buf->last_offset;
|
|
|
|
for_each_sg(buf->last_offset_sg, sg,
|
|
buf->table.sgt.orig_nents - buf->sg_last_entry, i) {
|
|
if (offset < sg->length + cur_offset) {
|
|
buf->last_offset_sg = sg;
|
|
buf->sg_last_entry += i;
|
|
buf->last_offset = cur_offset;
|
|
return nth_page(sg_page(sg),
|
|
(offset - cur_offset) / PAGE_SIZE);
|
|
}
|
|
cur_offset += sg->length;
|
|
}
|
|
return NULL;
|
|
}
|
|
|
|
int mlx5vf_add_migration_pages(struct mlx5_vhca_data_buffer *buf,
|
|
unsigned int npages)
|
|
{
|
|
unsigned int to_alloc = npages;
|
|
struct page **page_list;
|
|
unsigned long filled;
|
|
unsigned int to_fill;
|
|
int ret;
|
|
|
|
to_fill = min_t(unsigned int, npages, PAGE_SIZE / sizeof(*page_list));
|
|
page_list = kvzalloc(to_fill * sizeof(*page_list), GFP_KERNEL_ACCOUNT);
|
|
if (!page_list)
|
|
return -ENOMEM;
|
|
|
|
do {
|
|
filled = alloc_pages_bulk_array(GFP_KERNEL_ACCOUNT, to_fill,
|
|
page_list);
|
|
if (!filled) {
|
|
ret = -ENOMEM;
|
|
goto err;
|
|
}
|
|
to_alloc -= filled;
|
|
ret = sg_alloc_append_table_from_pages(
|
|
&buf->table, page_list, filled, 0,
|
|
filled << PAGE_SHIFT, UINT_MAX, SG_MAX_SINGLE_ALLOC,
|
|
GFP_KERNEL_ACCOUNT);
|
|
|
|
if (ret)
|
|
goto err;
|
|
buf->allocated_length += filled * PAGE_SIZE;
|
|
/* clean input for another bulk allocation */
|
|
memset(page_list, 0, filled * sizeof(*page_list));
|
|
to_fill = min_t(unsigned int, to_alloc,
|
|
PAGE_SIZE / sizeof(*page_list));
|
|
} while (to_alloc > 0);
|
|
|
|
kvfree(page_list);
|
|
return 0;
|
|
|
|
err:
|
|
kvfree(page_list);
|
|
return ret;
|
|
}
|
|
|
|
static void mlx5vf_disable_fd(struct mlx5_vf_migration_file *migf)
|
|
{
|
|
mutex_lock(&migf->lock);
|
|
migf->state = MLX5_MIGF_STATE_ERROR;
|
|
migf->filp->f_pos = 0;
|
|
mutex_unlock(&migf->lock);
|
|
}
|
|
|
|
static int mlx5vf_release_file(struct inode *inode, struct file *filp)
|
|
{
|
|
struct mlx5_vf_migration_file *migf = filp->private_data;
|
|
|
|
mlx5vf_disable_fd(migf);
|
|
mutex_destroy(&migf->lock);
|
|
kfree(migf);
|
|
return 0;
|
|
}
|
|
|
|
static struct mlx5_vhca_data_buffer *
|
|
mlx5vf_get_data_buff_from_pos(struct mlx5_vf_migration_file *migf, loff_t pos,
|
|
bool *end_of_data)
|
|
{
|
|
struct mlx5_vhca_data_buffer *buf;
|
|
bool found = false;
|
|
|
|
*end_of_data = false;
|
|
spin_lock_irq(&migf->list_lock);
|
|
if (list_empty(&migf->buf_list)) {
|
|
*end_of_data = true;
|
|
goto end;
|
|
}
|
|
|
|
buf = list_first_entry(&migf->buf_list, struct mlx5_vhca_data_buffer,
|
|
buf_elm);
|
|
if (pos >= buf->start_pos &&
|
|
pos < buf->start_pos + buf->length) {
|
|
found = true;
|
|
goto end;
|
|
}
|
|
|
|
/*
|
|
* As we use a stream based FD we may expect having the data always
|
|
* on first chunk
|
|
*/
|
|
migf->state = MLX5_MIGF_STATE_ERROR;
|
|
|
|
end:
|
|
spin_unlock_irq(&migf->list_lock);
|
|
return found ? buf : NULL;
|
|
}
|
|
|
|
static void mlx5vf_buf_read_done(struct mlx5_vhca_data_buffer *vhca_buf)
|
|
{
|
|
struct mlx5_vf_migration_file *migf = vhca_buf->migf;
|
|
|
|
if (vhca_buf->stop_copy_chunk_num) {
|
|
bool is_header = vhca_buf->dma_dir == DMA_NONE;
|
|
u8 chunk_num = vhca_buf->stop_copy_chunk_num;
|
|
size_t next_required_umem_size = 0;
|
|
|
|
if (is_header)
|
|
migf->buf_header[chunk_num - 1] = vhca_buf;
|
|
else
|
|
migf->buf[chunk_num - 1] = vhca_buf;
|
|
|
|
spin_lock_irq(&migf->list_lock);
|
|
list_del_init(&vhca_buf->buf_elm);
|
|
if (!is_header) {
|
|
next_required_umem_size =
|
|
migf->next_required_umem_size;
|
|
migf->next_required_umem_size = 0;
|
|
migf->num_ready_chunks--;
|
|
}
|
|
spin_unlock_irq(&migf->list_lock);
|
|
if (next_required_umem_size)
|
|
mlx5vf_mig_file_set_save_work(migf, chunk_num,
|
|
next_required_umem_size);
|
|
return;
|
|
}
|
|
|
|
spin_lock_irq(&migf->list_lock);
|
|
list_del_init(&vhca_buf->buf_elm);
|
|
list_add_tail(&vhca_buf->buf_elm, &vhca_buf->migf->avail_list);
|
|
spin_unlock_irq(&migf->list_lock);
|
|
}
|
|
|
|
static ssize_t mlx5vf_buf_read(struct mlx5_vhca_data_buffer *vhca_buf,
|
|
char __user **buf, size_t *len, loff_t *pos)
|
|
{
|
|
unsigned long offset;
|
|
ssize_t done = 0;
|
|
size_t copy_len;
|
|
|
|
copy_len = min_t(size_t,
|
|
vhca_buf->start_pos + vhca_buf->length - *pos, *len);
|
|
while (copy_len) {
|
|
size_t page_offset;
|
|
struct page *page;
|
|
size_t page_len;
|
|
u8 *from_buff;
|
|
int ret;
|
|
|
|
offset = *pos - vhca_buf->start_pos;
|
|
page_offset = offset % PAGE_SIZE;
|
|
offset -= page_offset;
|
|
page = mlx5vf_get_migration_page(vhca_buf, offset);
|
|
if (!page)
|
|
return -EINVAL;
|
|
page_len = min_t(size_t, copy_len, PAGE_SIZE - page_offset);
|
|
from_buff = kmap_local_page(page);
|
|
ret = copy_to_user(*buf, from_buff + page_offset, page_len);
|
|
kunmap_local(from_buff);
|
|
if (ret)
|
|
return -EFAULT;
|
|
*pos += page_len;
|
|
*len -= page_len;
|
|
*buf += page_len;
|
|
done += page_len;
|
|
copy_len -= page_len;
|
|
}
|
|
|
|
if (*pos >= vhca_buf->start_pos + vhca_buf->length)
|
|
mlx5vf_buf_read_done(vhca_buf);
|
|
|
|
return done;
|
|
}
|
|
|
|
/*
 * ->read() handler of the saving migration file.
 *
 * The file is a stream, so an explicit position is rejected with -ESPIPE and
 * filp->f_pos is used instead.  A blocking reader first waits until there is
 * data queued or the file reached one of the ERROR, PRE_COPY_ERROR, PRE_COPY
 * or COMPLETE states.
 *
 * Return: number of bytes read (possibly 0), or
 *   -ERESTARTSYS  the wait was interrupted,
 *   -ENODEV       the file is in the error state,
 *   -ENOMSG       no data is queued on the first pass while in PRE_COPY or
 *                 PRE_COPY_ERROR (temporary end of file),
 *   -EAGAIN       non-blocking reader, no data queued on the first pass and
 *                 the file is not COMPLETE,
 *   -EINVAL       data is queued but not for the current position,
 *   or the error reported by mlx5vf_buf_read().
 */
static ssize_t mlx5vf_save_read(struct file *filp, char __user *buf, size_t len,
				loff_t *pos)
{
	struct mlx5_vf_migration_file *migf = filp->private_data;
	bool nonblock = filp->f_flags & O_NONBLOCK;
	struct mlx5_vhca_data_buffer *cur_buf;
	bool first_pass = true;
	bool end_of_data;
	ssize_t total = 0;
	ssize_t count;

	if (pos)
		return -ESPIPE;
	pos = &filp->f_pos;

	if (!nonblock &&
	    wait_event_interruptible(migf->poll_wait,
			!list_empty(&migf->buf_list) ||
			migf->state == MLX5_MIGF_STATE_ERROR ||
			migf->state == MLX5_MIGF_STATE_PRE_COPY_ERROR ||
			migf->state == MLX5_MIGF_STATE_PRE_COPY ||
			migf->state == MLX5_MIGF_STATE_COMPLETE))
		return -ERESTARTSYS;

	mutex_lock(&migf->lock);
	if (migf->state == MLX5_MIGF_STATE_ERROR) {
		total = -ENODEV;
		goto out_unlock;
	}

	while (len) {
		cur_buf = mlx5vf_get_data_buff_from_pos(migf, *pos,
							&end_of_data);

		/* Running dry is only reported as an error on the first pass */
		if (first_pass && end_of_data) {
			/* Temporary end of file as part of PRE_COPY */
			if (migf->state == MLX5_MIGF_STATE_PRE_COPY ||
			    migf->state == MLX5_MIGF_STATE_PRE_COPY_ERROR) {
				total = -ENOMSG;
				break;
			}

			if (migf->state != MLX5_MIGF_STATE_COMPLETE &&
			    (filp->f_flags & O_NONBLOCK)) {
				total = -EAGAIN;
				break;
			}
		}
		first_pass = false;

		if (end_of_data)
			break;

		if (!cur_buf) {
			total = -EINVAL;
			break;
		}

		count = mlx5vf_buf_read(cur_buf, &buf, &len, pos);
		if (count < 0) {
			total = count;
			break;
		}
		total += count;
	}

out_unlock:
	mutex_unlock(&migf->lock);
	return total;
}
|
|
|
|
static __poll_t mlx5vf_save_poll(struct file *filp,
|
|
struct poll_table_struct *wait)
|
|
{
|
|
struct mlx5_vf_migration_file *migf = filp->private_data;
|
|
__poll_t pollflags = 0;
|
|
|
|
poll_wait(filp, &migf->poll_wait, wait);
|
|
|
|
mutex_lock(&migf->lock);
|
|
if (migf->state == MLX5_MIGF_STATE_ERROR)
|
|
pollflags = EPOLLIN | EPOLLRDNORM | EPOLLRDHUP;
|
|
else if (!list_empty(&migf->buf_list) ||
|
|
migf->state == MLX5_MIGF_STATE_COMPLETE)
|
|
pollflags = EPOLLIN | EPOLLRDNORM;
|
|
mutex_unlock(&migf->lock);
|
|
|
|
return pollflags;
|
|
}
|
|
|
|
/*
|
|
* FD is exposed and user can use it after receiving an error.
|
|
* Mark migf in error, and wake the user.
|
|
*/
|
|
static void mlx5vf_mark_err(struct mlx5_vf_migration_file *migf)
|
|
{
|
|
migf->state = MLX5_MIGF_STATE_ERROR;
|
|
wake_up_interruptible(&migf->poll_wait);
|
|
}
|
|
|
|
void mlx5vf_mig_file_set_save_work(struct mlx5_vf_migration_file *migf,
|
|
u8 chunk_num, size_t next_required_umem_size)
|
|
{
|
|
migf->save_data[chunk_num - 1].next_required_umem_size =
|
|
next_required_umem_size;
|
|
migf->save_data[chunk_num - 1].migf = migf;
|
|
get_file(migf->filp);
|
|
queue_work(migf->mvdev->cb_wq,
|
|
&migf->save_data[chunk_num - 1].work);
|
|
}
|
|
|
|
static struct mlx5_vhca_data_buffer *
|
|
mlx5vf_mig_file_get_stop_copy_buf(struct mlx5_vf_migration_file *migf,
|
|
u8 index, size_t required_length)
|
|
{
|
|
struct mlx5_vhca_data_buffer *buf = migf->buf[index];
|
|
u8 chunk_num;
|
|
|
|
WARN_ON(!buf);
|
|
chunk_num = buf->stop_copy_chunk_num;
|
|
buf->migf->buf[index] = NULL;
|
|
/* Checking whether the pre-allocated buffer can fit */
|
|
if (buf->allocated_length >= required_length)
|
|
return buf;
|
|
|
|
mlx5vf_put_data_buffer(buf);
|
|
buf = mlx5vf_get_data_buffer(buf->migf, required_length,
|
|
DMA_FROM_DEVICE);
|
|
if (IS_ERR(buf))
|
|
return buf;
|
|
|
|
buf->stop_copy_chunk_num = chunk_num;
|
|
return buf;
|
|
}
|
|
|
|
static void mlx5vf_mig_file_save_work(struct work_struct *_work)
|
|
{
|
|
struct mlx5vf_save_work_data *save_data = container_of(_work,
|
|
struct mlx5vf_save_work_data, work);
|
|
struct mlx5_vf_migration_file *migf = save_data->migf;
|
|
struct mlx5vf_pci_core_device *mvdev = migf->mvdev;
|
|
struct mlx5_vhca_data_buffer *buf;
|
|
|
|
mutex_lock(&mvdev->state_mutex);
|
|
if (migf->state == MLX5_MIGF_STATE_ERROR)
|
|
goto end;
|
|
|
|
buf = mlx5vf_mig_file_get_stop_copy_buf(migf,
|
|
save_data->chunk_num - 1,
|
|
save_data->next_required_umem_size);
|
|
if (IS_ERR(buf))
|
|
goto err;
|
|
|
|
if (mlx5vf_cmd_save_vhca_state(mvdev, migf, buf, true, false))
|
|
goto err_save;
|
|
|
|
goto end;
|
|
|
|
err_save:
|
|
mlx5vf_put_data_buffer(buf);
|
|
err:
|
|
mlx5vf_mark_err(migf);
|
|
end:
|
|
mlx5vf_state_mutex_unlock(mvdev);
|
|
fput(migf->filp);
|
|
}
|
|
|
|
static int mlx5vf_add_stop_copy_header(struct mlx5_vf_migration_file *migf,
|
|
bool track)
|
|
{
|
|
size_t size = sizeof(struct mlx5_vf_migration_header) +
|
|
sizeof(struct mlx5_vf_migration_tag_stop_copy_data);
|
|
struct mlx5_vf_migration_tag_stop_copy_data data = {};
|
|
struct mlx5_vhca_data_buffer *header_buf = NULL;
|
|
struct mlx5_vf_migration_header header = {};
|
|
unsigned long flags;
|
|
struct page *page;
|
|
u8 *to_buff;
|
|
int ret;
|
|
|
|
header_buf = mlx5vf_get_data_buffer(migf, size, DMA_NONE);
|
|
if (IS_ERR(header_buf))
|
|
return PTR_ERR(header_buf);
|
|
|
|
header.record_size = cpu_to_le64(sizeof(data));
|
|
header.flags = cpu_to_le32(MLX5_MIGF_HEADER_FLAGS_TAG_OPTIONAL);
|
|
header.tag = cpu_to_le32(MLX5_MIGF_HEADER_TAG_STOP_COPY_SIZE);
|
|
page = mlx5vf_get_migration_page(header_buf, 0);
|
|
if (!page) {
|
|
ret = -EINVAL;
|
|
goto err;
|
|
}
|
|
to_buff = kmap_local_page(page);
|
|
memcpy(to_buff, &header, sizeof(header));
|
|
header_buf->length = sizeof(header);
|
|
data.stop_copy_size = cpu_to_le64(migf->buf[0]->allocated_length);
|
|
memcpy(to_buff + sizeof(header), &data, sizeof(data));
|
|
header_buf->length += sizeof(data);
|
|
kunmap_local(to_buff);
|
|
header_buf->start_pos = header_buf->migf->max_pos;
|
|
migf->max_pos += header_buf->length;
|
|
spin_lock_irqsave(&migf->list_lock, flags);
|
|
list_add_tail(&header_buf->buf_elm, &migf->buf_list);
|
|
spin_unlock_irqrestore(&migf->list_lock, flags);
|
|
if (track)
|
|
migf->pre_copy_initial_bytes = size;
|
|
return 0;
|
|
err:
|
|
mlx5vf_put_data_buffer(header_buf);
|
|
return ret;
|
|
}
|
|
|
|
static int mlx5vf_prep_stop_copy(struct mlx5vf_pci_core_device *mvdev,
|
|
struct mlx5_vf_migration_file *migf,
|
|
size_t state_size, u64 full_size,
|
|
bool track)
|
|
{
|
|
struct mlx5_vhca_data_buffer *buf;
|
|
size_t inc_state_size;
|
|
int num_chunks;
|
|
int ret;
|
|
int i;
|
|
|
|
if (mvdev->chunk_mode) {
|
|
size_t chunk_size = min_t(size_t, MAX_CHUNK_SIZE, full_size);
|
|
|
|
/* from firmware perspective at least 'state_size' buffer should be set */
|
|
inc_state_size = max(state_size, chunk_size);
|
|
} else {
|
|
if (track) {
|
|
/* let's be ready for stop_copy size that might grow by 10 percents */
|
|
if (check_add_overflow(state_size, state_size / 10, &inc_state_size))
|
|
inc_state_size = state_size;
|
|
} else {
|
|
inc_state_size = state_size;
|
|
}
|
|
}
|
|
|
|
/* let's not overflow the device specification max SAVE size */
|
|
inc_state_size = min_t(size_t, inc_state_size,
|
|
(BIT_ULL(__mlx5_bit_sz(save_vhca_state_in, size)) - PAGE_SIZE));
|
|
|
|
num_chunks = mvdev->chunk_mode ? MAX_NUM_CHUNKS : 1;
|
|
for (i = 0; i < num_chunks; i++) {
|
|
buf = mlx5vf_get_data_buffer(migf, inc_state_size, DMA_FROM_DEVICE);
|
|
if (IS_ERR(buf)) {
|
|
ret = PTR_ERR(buf);
|
|
goto err;
|
|
}
|
|
|
|
migf->buf[i] = buf;
|
|
buf = mlx5vf_get_data_buffer(migf,
|
|
sizeof(struct mlx5_vf_migration_header), DMA_NONE);
|
|
if (IS_ERR(buf)) {
|
|
ret = PTR_ERR(buf);
|
|
goto err;
|
|
}
|
|
migf->buf_header[i] = buf;
|
|
if (mvdev->chunk_mode) {
|
|
migf->buf[i]->stop_copy_chunk_num = i + 1;
|
|
migf->buf_header[i]->stop_copy_chunk_num = i + 1;
|
|
INIT_WORK(&migf->save_data[i].work,
|
|
mlx5vf_mig_file_save_work);
|
|
migf->save_data[i].chunk_num = i + 1;
|
|
}
|
|
}
|
|
|
|
ret = mlx5vf_add_stop_copy_header(migf, track);
|
|
if (ret)
|
|
goto err;
|
|
return 0;
|
|
|
|
err:
|
|
for (i = 0; i < num_chunks; i++) {
|
|
if (migf->buf[i]) {
|
|
mlx5vf_put_data_buffer(migf->buf[i]);
|
|
migf->buf[i] = NULL;
|
|
}
|
|
if (migf->buf_header[i]) {
|
|
mlx5vf_put_data_buffer(migf->buf_header[i]);
|
|
migf->buf_header[i] = NULL;
|
|
}
|
|
}
|
|
|
|
return ret;
|
|
}
|
|
|
|
static long mlx5vf_precopy_ioctl(struct file *filp, unsigned int cmd,
|
|
unsigned long arg)
|
|
{
|
|
struct mlx5_vf_migration_file *migf = filp->private_data;
|
|
struct mlx5vf_pci_core_device *mvdev = migf->mvdev;
|
|
struct mlx5_vhca_data_buffer *buf;
|
|
struct vfio_precopy_info info = {};
|
|
loff_t *pos = &filp->f_pos;
|
|
unsigned long minsz;
|
|
size_t inc_length = 0;
|
|
bool end_of_data = false;
|
|
int ret;
|
|
|
|
if (cmd != VFIO_MIG_GET_PRECOPY_INFO)
|
|
return -ENOTTY;
|
|
|
|
minsz = offsetofend(struct vfio_precopy_info, dirty_bytes);
|
|
|
|
if (copy_from_user(&info, (void __user *)arg, minsz))
|
|
return -EFAULT;
|
|
|
|
if (info.argsz < minsz)
|
|
return -EINVAL;
|
|
|
|
mutex_lock(&mvdev->state_mutex);
|
|
if (mvdev->mig_state != VFIO_DEVICE_STATE_PRE_COPY &&
|
|
mvdev->mig_state != VFIO_DEVICE_STATE_PRE_COPY_P2P) {
|
|
ret = -EINVAL;
|
|
goto err_state_unlock;
|
|
}
|
|
|
|
/*
|
|
* We can't issue a SAVE command when the device is suspended, so as
|
|
* part of VFIO_DEVICE_STATE_PRE_COPY_P2P no reason to query for extra
|
|
* bytes that can't be read.
|
|
*/
|
|
if (mvdev->mig_state == VFIO_DEVICE_STATE_PRE_COPY) {
|
|
/*
|
|
* Once the query returns it's guaranteed that there is no
|
|
* active SAVE command.
|
|
* As so, the other code below is safe with the proper locks.
|
|
*/
|
|
ret = mlx5vf_cmd_query_vhca_migration_state(mvdev, &inc_length,
|
|
NULL, MLX5VF_QUERY_INC);
|
|
if (ret)
|
|
goto err_state_unlock;
|
|
}
|
|
|
|
mutex_lock(&migf->lock);
|
|
if (migf->state == MLX5_MIGF_STATE_ERROR) {
|
|
ret = -ENODEV;
|
|
goto err_migf_unlock;
|
|
}
|
|
|
|
if (migf->pre_copy_initial_bytes > *pos) {
|
|
info.initial_bytes = migf->pre_copy_initial_bytes - *pos;
|
|
} else {
|
|
info.dirty_bytes = migf->max_pos - *pos;
|
|
if (!info.dirty_bytes)
|
|
end_of_data = true;
|
|
info.dirty_bytes += inc_length;
|
|
}
|
|
|
|
if (!end_of_data || !inc_length) {
|
|
mutex_unlock(&migf->lock);
|
|
goto done;
|
|
}
|
|
|
|
mutex_unlock(&migf->lock);
|
|
/*
|
|
* We finished transferring the current state and the device has a
|
|
* dirty state, save a new state to be ready for.
|
|
*/
|
|
buf = mlx5vf_get_data_buffer(migf, inc_length, DMA_FROM_DEVICE);
|
|
if (IS_ERR(buf)) {
|
|
ret = PTR_ERR(buf);
|
|
mlx5vf_mark_err(migf);
|
|
goto err_state_unlock;
|
|
}
|
|
|
|
ret = mlx5vf_cmd_save_vhca_state(mvdev, migf, buf, true, true);
|
|
if (ret) {
|
|
mlx5vf_mark_err(migf);
|
|
mlx5vf_put_data_buffer(buf);
|
|
goto err_state_unlock;
|
|
}
|
|
|
|
done:
|
|
mlx5vf_state_mutex_unlock(mvdev);
|
|
if (copy_to_user((void __user *)arg, &info, minsz))
|
|
return -EFAULT;
|
|
return 0;
|
|
|
|
err_migf_unlock:
|
|
mutex_unlock(&migf->lock);
|
|
err_state_unlock:
|
|
mlx5vf_state_mutex_unlock(mvdev);
|
|
return ret;
|
|
}
|
|
|
|
static const struct file_operations mlx5vf_save_fops = {
|
|
.owner = THIS_MODULE,
|
|
.read = mlx5vf_save_read,
|
|
.poll = mlx5vf_save_poll,
|
|
.unlocked_ioctl = mlx5vf_precopy_ioctl,
|
|
.compat_ioctl = compat_ptr_ioctl,
|
|
.release = mlx5vf_release_file,
|
|
.llseek = no_llseek,
|
|
};
|
|
|
|
static int mlx5vf_pci_save_device_inc_data(struct mlx5vf_pci_core_device *mvdev)
|
|
{
|
|
struct mlx5_vf_migration_file *migf = mvdev->saving_migf;
|
|
struct mlx5_vhca_data_buffer *buf;
|
|
size_t length;
|
|
int ret;
|
|
|
|
if (migf->state == MLX5_MIGF_STATE_ERROR)
|
|
return -ENODEV;
|
|
|
|
ret = mlx5vf_cmd_query_vhca_migration_state(mvdev, &length, NULL,
|
|
MLX5VF_QUERY_INC | MLX5VF_QUERY_FINAL);
|
|
if (ret)
|
|
goto err;
|
|
|
|
buf = mlx5vf_mig_file_get_stop_copy_buf(migf, 0, length);
|
|
if (IS_ERR(buf)) {
|
|
ret = PTR_ERR(buf);
|
|
goto err;
|
|
}
|
|
|
|
ret = mlx5vf_cmd_save_vhca_state(mvdev, migf, buf, true, false);
|
|
if (ret)
|
|
goto err_save;
|
|
|
|
return 0;
|
|
|
|
err_save:
|
|
mlx5vf_put_data_buffer(buf);
|
|
err:
|
|
mlx5vf_mark_err(migf);
|
|
return ret;
|
|
}
|
|
|
|
static struct mlx5_vf_migration_file *
|
|
mlx5vf_pci_save_device_data(struct mlx5vf_pci_core_device *mvdev, bool track)
|
|
{
|
|
struct mlx5_vf_migration_file *migf;
|
|
struct mlx5_vhca_data_buffer *buf;
|
|
size_t length;
|
|
u64 full_size;
|
|
int ret;
|
|
|
|
migf = kzalloc(sizeof(*migf), GFP_KERNEL_ACCOUNT);
|
|
if (!migf)
|
|
return ERR_PTR(-ENOMEM);
|
|
|
|
migf->filp = anon_inode_getfile("mlx5vf_mig", &mlx5vf_save_fops, migf,
|
|
O_RDONLY);
|
|
if (IS_ERR(migf->filp)) {
|
|
ret = PTR_ERR(migf->filp);
|
|
goto end;
|
|
}
|
|
|
|
migf->mvdev = mvdev;
|
|
ret = mlx5vf_cmd_alloc_pd(migf);
|
|
if (ret)
|
|
goto out_free;
|
|
|
|
stream_open(migf->filp->f_inode, migf->filp);
|
|
mutex_init(&migf->lock);
|
|
init_waitqueue_head(&migf->poll_wait);
|
|
init_completion(&migf->save_comp);
|
|
/*
|
|
* save_comp is being used as a binary semaphore built from
|
|
* a completion. A normal mutex cannot be used because the lock is
|
|
* passed between kernel threads and lockdep can't model this.
|
|
*/
|
|
complete(&migf->save_comp);
|
|
mlx5_cmd_init_async_ctx(mvdev->mdev, &migf->async_ctx);
|
|
INIT_WORK(&migf->async_data.work, mlx5vf_mig_file_cleanup_cb);
|
|
INIT_LIST_HEAD(&migf->buf_list);
|
|
INIT_LIST_HEAD(&migf->avail_list);
|
|
spin_lock_init(&migf->list_lock);
|
|
ret = mlx5vf_cmd_query_vhca_migration_state(mvdev, &length, &full_size, 0);
|
|
if (ret)
|
|
goto out_pd;
|
|
|
|
ret = mlx5vf_prep_stop_copy(mvdev, migf, length, full_size, track);
|
|
if (ret)
|
|
goto out_pd;
|
|
|
|
if (track) {
|
|
/* leave the allocated buffer ready for the stop-copy phase */
|
|
buf = mlx5vf_alloc_data_buffer(migf,
|
|
migf->buf[0]->allocated_length, DMA_FROM_DEVICE);
|
|
if (IS_ERR(buf)) {
|
|
ret = PTR_ERR(buf);
|
|
goto out_pd;
|
|
}
|
|
} else {
|
|
buf = migf->buf[0];
|
|
migf->buf[0] = NULL;
|
|
}
|
|
|
|
ret = mlx5vf_cmd_save_vhca_state(mvdev, migf, buf, false, track);
|
|
if (ret)
|
|
goto out_save;
|
|
return migf;
|
|
out_save:
|
|
mlx5vf_free_data_buffer(buf);
|
|
out_pd:
|
|
mlx5fv_cmd_clean_migf_resources(migf);
|
|
out_free:
|
|
fput(migf->filp);
|
|
end:
|
|
kfree(migf);
|
|
return ERR_PTR(ret);
|
|
}
|
|
|
|
static int
|
|
mlx5vf_append_page_to_mig_buf(struct mlx5_vhca_data_buffer *vhca_buf,
|
|
const char __user **buf, size_t *len,
|
|
loff_t *pos, ssize_t *done)
|
|
{
|
|
unsigned long offset;
|
|
size_t page_offset;
|
|
struct page *page;
|
|
size_t page_len;
|
|
u8 *to_buff;
|
|
int ret;
|
|
|
|
offset = *pos - vhca_buf->start_pos;
|
|
page_offset = offset % PAGE_SIZE;
|
|
|
|
page = mlx5vf_get_migration_page(vhca_buf, offset - page_offset);
|
|
if (!page)
|
|
return -EINVAL;
|
|
page_len = min_t(size_t, *len, PAGE_SIZE - page_offset);
|
|
to_buff = kmap_local_page(page);
|
|
ret = copy_from_user(to_buff + page_offset, *buf, page_len);
|
|
kunmap_local(to_buff);
|
|
if (ret)
|
|
return -EFAULT;
|
|
|
|
*pos += page_len;
|
|
*done += page_len;
|
|
*buf += page_len;
|
|
*len -= page_len;
|
|
vhca_buf->length += page_len;
|
|
return 0;
|
|
}
|
|
|
|
static int
|
|
mlx5vf_resume_read_image_no_header(struct mlx5_vhca_data_buffer *vhca_buf,
|
|
loff_t requested_length,
|
|
const char __user **buf, size_t *len,
|
|
loff_t *pos, ssize_t *done)
|
|
{
|
|
int ret;
|
|
|
|
if (requested_length > MAX_LOAD_SIZE)
|
|
return -ENOMEM;
|
|
|
|
if (vhca_buf->allocated_length < requested_length) {
|
|
ret = mlx5vf_add_migration_pages(
|
|
vhca_buf,
|
|
DIV_ROUND_UP(requested_length - vhca_buf->allocated_length,
|
|
PAGE_SIZE));
|
|
if (ret)
|
|
return ret;
|
|
}
|
|
|
|
while (*len) {
|
|
ret = mlx5vf_append_page_to_mig_buf(vhca_buf, buf, len, pos,
|
|
done);
|
|
if (ret)
|
|
return ret;
|
|
}
|
|
|
|
return 0;
|
|
}
|
|
|
|
static ssize_t
|
|
mlx5vf_resume_read_image(struct mlx5_vf_migration_file *migf,
|
|
struct mlx5_vhca_data_buffer *vhca_buf,
|
|
size_t image_size, const char __user **buf,
|
|
size_t *len, loff_t *pos, ssize_t *done,
|
|
bool *has_work)
|
|
{
|
|
size_t copy_len, to_copy;
|
|
int ret;
|
|
|
|
to_copy = min_t(size_t, *len, image_size - vhca_buf->length);
|
|
copy_len = to_copy;
|
|
while (to_copy) {
|
|
ret = mlx5vf_append_page_to_mig_buf(vhca_buf, buf, &to_copy, pos,
|
|
done);
|
|
if (ret)
|
|
return ret;
|
|
}
|
|
|
|
*len -= copy_len;
|
|
if (vhca_buf->length == image_size) {
|
|
migf->load_state = MLX5_VF_LOAD_STATE_LOAD_IMAGE;
|
|
migf->max_pos += image_size;
|
|
*has_work = true;
|
|
}
|
|
|
|
return 0;
|
|
}
|
|
|
|
static int
|
|
mlx5vf_resume_read_header_data(struct mlx5_vf_migration_file *migf,
|
|
struct mlx5_vhca_data_buffer *vhca_buf,
|
|
const char __user **buf, size_t *len,
|
|
loff_t *pos, ssize_t *done)
|
|
{
|
|
size_t copy_len, to_copy;
|
|
size_t required_data;
|
|
u8 *to_buff;
|
|
int ret;
|
|
|
|
required_data = migf->record_size - vhca_buf->length;
|
|
to_copy = min_t(size_t, *len, required_data);
|
|
copy_len = to_copy;
|
|
while (to_copy) {
|
|
ret = mlx5vf_append_page_to_mig_buf(vhca_buf, buf, &to_copy, pos,
|
|
done);
|
|
if (ret)
|
|
return ret;
|
|
}
|
|
|
|
*len -= copy_len;
|
|
if (vhca_buf->length == migf->record_size) {
|
|
switch (migf->record_tag) {
|
|
case MLX5_MIGF_HEADER_TAG_STOP_COPY_SIZE:
|
|
{
|
|
struct page *page;
|
|
|
|
page = mlx5vf_get_migration_page(vhca_buf, 0);
|
|
if (!page)
|
|
return -EINVAL;
|
|
to_buff = kmap_local_page(page);
|
|
migf->stop_copy_prep_size = min_t(u64,
|
|
le64_to_cpup((__le64 *)to_buff), MAX_LOAD_SIZE);
|
|
kunmap_local(to_buff);
|
|
break;
|
|
}
|
|
default:
|
|
/* Optional tag */
|
|
break;
|
|
}
|
|
|
|
migf->load_state = MLX5_VF_LOAD_STATE_READ_HEADER;
|
|
migf->max_pos += migf->record_size;
|
|
vhca_buf->length = 0;
|
|
}
|
|
|
|
return 0;
|
|
}
|
|
|
|
static int
|
|
mlx5vf_resume_read_header(struct mlx5_vf_migration_file *migf,
|
|
struct mlx5_vhca_data_buffer *vhca_buf,
|
|
const char __user **buf,
|
|
size_t *len, loff_t *pos,
|
|
ssize_t *done, bool *has_work)
|
|
{
|
|
struct page *page;
|
|
size_t copy_len;
|
|
u8 *to_buff;
|
|
int ret;
|
|
|
|
copy_len = min_t(size_t, *len,
|
|
sizeof(struct mlx5_vf_migration_header) - vhca_buf->length);
|
|
page = mlx5vf_get_migration_page(vhca_buf, 0);
|
|
if (!page)
|
|
return -EINVAL;
|
|
to_buff = kmap_local_page(page);
|
|
ret = copy_from_user(to_buff + vhca_buf->length, *buf, copy_len);
|
|
if (ret) {
|
|
ret = -EFAULT;
|
|
goto end;
|
|
}
|
|
|
|
*buf += copy_len;
|
|
*pos += copy_len;
|
|
*done += copy_len;
|
|
*len -= copy_len;
|
|
vhca_buf->length += copy_len;
|
|
if (vhca_buf->length == sizeof(struct mlx5_vf_migration_header)) {
|
|
u64 record_size;
|
|
u32 flags;
|
|
|
|
record_size = le64_to_cpup((__le64 *)to_buff);
|
|
if (record_size > MAX_LOAD_SIZE) {
|
|
ret = -ENOMEM;
|
|
goto end;
|
|
}
|
|
|
|
migf->record_size = record_size;
|
|
flags = le32_to_cpup((__le32 *)(to_buff +
|
|
offsetof(struct mlx5_vf_migration_header, flags)));
|
|
migf->record_tag = le32_to_cpup((__le32 *)(to_buff +
|
|
offsetof(struct mlx5_vf_migration_header, tag)));
|
|
switch (migf->record_tag) {
|
|
case MLX5_MIGF_HEADER_TAG_FW_DATA:
|
|
migf->load_state = MLX5_VF_LOAD_STATE_PREP_IMAGE;
|
|
break;
|
|
case MLX5_MIGF_HEADER_TAG_STOP_COPY_SIZE:
|
|
migf->load_state = MLX5_VF_LOAD_STATE_PREP_HEADER_DATA;
|
|
break;
|
|
default:
|
|
if (!(flags & MLX5_MIGF_HEADER_FLAGS_TAG_OPTIONAL)) {
|
|
ret = -EOPNOTSUPP;
|
|
goto end;
|
|
}
|
|
/* We may read and skip this optional record data */
|
|
migf->load_state = MLX5_VF_LOAD_STATE_PREP_HEADER_DATA;
|
|
}
|
|
|
|
migf->max_pos += vhca_buf->length;
|
|
vhca_buf->length = 0;
|
|
*has_work = true;
|
|
}
|
|
end:
|
|
kunmap_local(to_buff);
|
|
return ret;
|
|
}
|
|
|
|
/*
 * ->write() handler of the resuming migration file.
 *
 * The file is a stream, so an explicit position is rejected with -ESPIPE and
 * filp->f_pos is used instead.  The incoming data drives a state machine
 * (migf->load_state) that reads record headers, optional header payloads
 * and device images, and loads every complete image into the device.  The
 * loop keeps running while there is user data left or a step asked for a
 * follow-up step through has_work.
 *
 * Runs under the device state mutex and migf->lock.  Any failure inside the
 * state machine puts the migration file into the error state.
 *
 * Return: number of bytes consumed, or a negative errno (-ESPIPE, -EINVAL
 * for a negative or overflowing position, -ENODEV when the file is already
 * in the error state, or the error of the failing step).
 */
static ssize_t mlx5vf_resume_write(struct file *filp, const char __user *buf,
				   size_t len, loff_t *pos)
{
	struct mlx5_vf_migration_file *migf = filp->private_data;
	struct mlx5_vhca_data_buffer *img_buf = migf->buf[0];
	struct mlx5_vhca_data_buffer *hdr_buf = migf->buf_header[0];
	loff_t requested_length;
	bool has_work = false;
	ssize_t done = 0;
	int ret = 0;

	if (pos)
		return -ESPIPE;
	pos = &filp->f_pos;

	if (*pos < 0 ||
	    check_add_overflow((loff_t)len, *pos, &requested_length))
		return -EINVAL;

	mutex_lock(&migf->mvdev->state_mutex);
	mutex_lock(&migf->lock);
	if (migf->state == MLX5_MIGF_STATE_ERROR) {
		ret = -ENODEV;
		goto out_unlock;
	}

	while (!ret && (len || has_work)) {
		has_work = false;
		switch (migf->load_state) {
		case MLX5_VF_LOAD_STATE_READ_HEADER:
			ret = mlx5vf_resume_read_header(migf, hdr_buf,
							&buf, &len, pos,
							&done, &has_work);
			break;
		case MLX5_VF_LOAD_STATE_PREP_HEADER_DATA:
			/* Swap in a bigger header buffer when the payload won't fit */
			if (hdr_buf->allocated_length < migf->record_size) {
				mlx5vf_free_data_buffer(hdr_buf);

				migf->buf_header[0] = mlx5vf_alloc_data_buffer(migf,
						migf->record_size, DMA_NONE);
				if (IS_ERR(migf->buf_header[0])) {
					ret = PTR_ERR(migf->buf_header[0]);
					migf->buf_header[0] = NULL;
					break;
				}

				hdr_buf = migf->buf_header[0];
			}

			hdr_buf->start_pos = migf->max_pos;
			migf->load_state = MLX5_VF_LOAD_STATE_READ_HEADER_DATA;
			break;
		case MLX5_VF_LOAD_STATE_READ_HEADER_DATA:
			ret = mlx5vf_resume_read_header_data(migf, hdr_buf,
							&buf, &len, pos, &done);
			break;
		case MLX5_VF_LOAD_STATE_PREP_IMAGE:
		{
			u64 size = max(migf->record_size,
				       migf->stop_copy_prep_size);

			/* Swap in a bigger image buffer when needed */
			if (img_buf->allocated_length < size) {
				mlx5vf_free_data_buffer(img_buf);

				migf->buf[0] = mlx5vf_alloc_data_buffer(migf,
							size, DMA_TO_DEVICE);
				if (IS_ERR(migf->buf[0])) {
					ret = PTR_ERR(migf->buf[0]);
					migf->buf[0] = NULL;
					break;
				}

				img_buf = migf->buf[0];
			}

			img_buf->start_pos = migf->max_pos;
			migf->load_state = MLX5_VF_LOAD_STATE_READ_IMAGE;
			break;
		}
		case MLX5_VF_LOAD_STATE_READ_IMAGE_NO_HEADER:
			ret = mlx5vf_resume_read_image_no_header(img_buf,
						requested_length,
						&buf, &len, pos, &done);
			break;
		case MLX5_VF_LOAD_STATE_READ_IMAGE:
			ret = mlx5vf_resume_read_image(migf, img_buf,
						migf->record_size,
						&buf, &len, pos, &done, &has_work);
			break;
		case MLX5_VF_LOAD_STATE_LOAD_IMAGE:
			ret = mlx5vf_cmd_load_vhca_state(migf->mvdev, migf, img_buf);
			if (ret)
				break;
			migf->load_state = MLX5_VF_LOAD_STATE_READ_HEADER;

			/* prep header buf for next image */
			hdr_buf->length = 0;
			/* prep data buf for next image */
			img_buf->length = 0;

			break;
		default:
			break;
		}
	}

out_unlock:
	if (ret)
		migf->state = MLX5_MIGF_STATE_ERROR;
	mutex_unlock(&migf->lock);
	mlx5vf_state_mutex_unlock(migf->mvdev);
	return ret ? ret : done;
}
|
|
|
|
static const struct file_operations mlx5vf_resume_fops = {
|
|
.owner = THIS_MODULE,
|
|
.write = mlx5vf_resume_write,
|
|
.release = mlx5vf_release_file,
|
|
.llseek = no_llseek,
|
|
};
|
|
|
|
static struct mlx5_vf_migration_file *
|
|
mlx5vf_pci_resume_device_data(struct mlx5vf_pci_core_device *mvdev)
|
|
{
|
|
struct mlx5_vf_migration_file *migf;
|
|
struct mlx5_vhca_data_buffer *buf;
|
|
int ret;
|
|
|
|
migf = kzalloc(sizeof(*migf), GFP_KERNEL_ACCOUNT);
|
|
if (!migf)
|
|
return ERR_PTR(-ENOMEM);
|
|
|
|
migf->filp = anon_inode_getfile("mlx5vf_mig", &mlx5vf_resume_fops, migf,
|
|
O_WRONLY);
|
|
if (IS_ERR(migf->filp)) {
|
|
ret = PTR_ERR(migf->filp);
|
|
goto end;
|
|
}
|
|
|
|
migf->mvdev = mvdev;
|
|
ret = mlx5vf_cmd_alloc_pd(migf);
|
|
if (ret)
|
|
goto out_free;
|
|
|
|
buf = mlx5vf_alloc_data_buffer(migf, 0, DMA_TO_DEVICE);
|
|
if (IS_ERR(buf)) {
|
|
ret = PTR_ERR(buf);
|
|
goto out_pd;
|
|
}
|
|
|
|
migf->buf[0] = buf;
|
|
if (MLX5VF_PRE_COPY_SUPP(mvdev)) {
|
|
buf = mlx5vf_alloc_data_buffer(migf,
|
|
sizeof(struct mlx5_vf_migration_header), DMA_NONE);
|
|
if (IS_ERR(buf)) {
|
|
ret = PTR_ERR(buf);
|
|
goto out_buf;
|
|
}
|
|
|
|
migf->buf_header[0] = buf;
|
|
migf->load_state = MLX5_VF_LOAD_STATE_READ_HEADER;
|
|
} else {
|
|
/* Initial state will be to read the image */
|
|
migf->load_state = MLX5_VF_LOAD_STATE_READ_IMAGE_NO_HEADER;
|
|
}
|
|
|
|
stream_open(migf->filp->f_inode, migf->filp);
|
|
mutex_init(&migf->lock);
|
|
INIT_LIST_HEAD(&migf->buf_list);
|
|
INIT_LIST_HEAD(&migf->avail_list);
|
|
spin_lock_init(&migf->list_lock);
|
|
return migf;
|
|
out_buf:
|
|
mlx5vf_free_data_buffer(migf->buf[0]);
|
|
out_pd:
|
|
mlx5vf_cmd_dealloc_pd(migf);
|
|
out_free:
|
|
fput(migf->filp);
|
|
end:
|
|
kfree(migf);
|
|
return ERR_PTR(ret);
|
|
}
|
|
|
|
void mlx5vf_disable_fds(struct mlx5vf_pci_core_device *mvdev)
|
|
{
|
|
if (mvdev->resuming_migf) {
|
|
mlx5vf_disable_fd(mvdev->resuming_migf);
|
|
mlx5fv_cmd_clean_migf_resources(mvdev->resuming_migf);
|
|
fput(mvdev->resuming_migf->filp);
|
|
mvdev->resuming_migf = NULL;
|
|
}
|
|
if (mvdev->saving_migf) {
|
|
mlx5_cmd_cleanup_async_ctx(&mvdev->saving_migf->async_ctx);
|
|
cancel_work_sync(&mvdev->saving_migf->async_data.work);
|
|
mlx5vf_disable_fd(mvdev->saving_migf);
|
|
wake_up_interruptible(&mvdev->saving_migf->poll_wait);
|
|
mlx5fv_cmd_clean_migf_resources(mvdev->saving_migf);
|
|
fput(mvdev->saving_migf->filp);
|
|
mvdev->saving_migf = NULL;
|
|
}
|
|
}
|
|
|
|
static struct file *
|
|
mlx5vf_pci_step_device_state_locked(struct mlx5vf_pci_core_device *mvdev,
|
|
u32 new)
|
|
{
|
|
u32 cur = mvdev->mig_state;
|
|
int ret;
|
|
|
|
if (cur == VFIO_DEVICE_STATE_RUNNING_P2P && new == VFIO_DEVICE_STATE_STOP) {
|
|
ret = mlx5vf_cmd_suspend_vhca(mvdev,
|
|
MLX5_SUSPEND_VHCA_IN_OP_MOD_SUSPEND_RESPONDER);
|
|
if (ret)
|
|
return ERR_PTR(ret);
|
|
return NULL;
|
|
}
|
|
|
|
if (cur == VFIO_DEVICE_STATE_STOP && new == VFIO_DEVICE_STATE_RUNNING_P2P) {
|
|
ret = mlx5vf_cmd_resume_vhca(mvdev,
|
|
MLX5_RESUME_VHCA_IN_OP_MOD_RESUME_RESPONDER);
|
|
if (ret)
|
|
return ERR_PTR(ret);
|
|
return NULL;
|
|
}
|
|
|
|
if ((cur == VFIO_DEVICE_STATE_RUNNING && new == VFIO_DEVICE_STATE_RUNNING_P2P) ||
|
|
(cur == VFIO_DEVICE_STATE_PRE_COPY && new == VFIO_DEVICE_STATE_PRE_COPY_P2P)) {
|
|
ret = mlx5vf_cmd_suspend_vhca(mvdev,
|
|
MLX5_SUSPEND_VHCA_IN_OP_MOD_SUSPEND_INITIATOR);
|
|
if (ret)
|
|
return ERR_PTR(ret);
|
|
return NULL;
|
|
}
|
|
|
|
if ((cur == VFIO_DEVICE_STATE_RUNNING_P2P && new == VFIO_DEVICE_STATE_RUNNING) ||
|
|
(cur == VFIO_DEVICE_STATE_PRE_COPY_P2P && new == VFIO_DEVICE_STATE_PRE_COPY)) {
|
|
ret = mlx5vf_cmd_resume_vhca(mvdev,
|
|
MLX5_RESUME_VHCA_IN_OP_MOD_RESUME_INITIATOR);
|
|
if (ret)
|
|
return ERR_PTR(ret);
|
|
return NULL;
|
|
}
|
|
|
|
if (cur == VFIO_DEVICE_STATE_STOP && new == VFIO_DEVICE_STATE_STOP_COPY) {
|
|
struct mlx5_vf_migration_file *migf;
|
|
|
|
migf = mlx5vf_pci_save_device_data(mvdev, false);
|
|
if (IS_ERR(migf))
|
|
return ERR_CAST(migf);
|
|
get_file(migf->filp);
|
|
mvdev->saving_migf = migf;
|
|
return migf->filp;
|
|
}
|
|
|
|
if ((cur == VFIO_DEVICE_STATE_STOP_COPY && new == VFIO_DEVICE_STATE_STOP) ||
|
|
(cur == VFIO_DEVICE_STATE_PRE_COPY && new == VFIO_DEVICE_STATE_RUNNING) ||
|
|
(cur == VFIO_DEVICE_STATE_PRE_COPY_P2P &&
|
|
new == VFIO_DEVICE_STATE_RUNNING_P2P)) {
|
|
mlx5vf_disable_fds(mvdev);
|
|
return NULL;
|
|
}
|
|
|
|
if (cur == VFIO_DEVICE_STATE_STOP && new == VFIO_DEVICE_STATE_RESUMING) {
|
|
struct mlx5_vf_migration_file *migf;
|
|
|
|
migf = mlx5vf_pci_resume_device_data(mvdev);
|
|
if (IS_ERR(migf))
|
|
return ERR_CAST(migf);
|
|
get_file(migf->filp);
|
|
mvdev->resuming_migf = migf;
|
|
return migf->filp;
|
|
}
|
|
|
|
if (cur == VFIO_DEVICE_STATE_RESUMING && new == VFIO_DEVICE_STATE_STOP) {
|
|
if (!MLX5VF_PRE_COPY_SUPP(mvdev)) {
|
|
ret = mlx5vf_cmd_load_vhca_state(mvdev,
|
|
mvdev->resuming_migf,
|
|
mvdev->resuming_migf->buf[0]);
|
|
if (ret)
|
|
return ERR_PTR(ret);
|
|
}
|
|
mlx5vf_disable_fds(mvdev);
|
|
return NULL;
|
|
}
|
|
|
|
if ((cur == VFIO_DEVICE_STATE_RUNNING && new == VFIO_DEVICE_STATE_PRE_COPY) ||
|
|
(cur == VFIO_DEVICE_STATE_RUNNING_P2P &&
|
|
new == VFIO_DEVICE_STATE_PRE_COPY_P2P)) {
|
|
struct mlx5_vf_migration_file *migf;
|
|
|
|
migf = mlx5vf_pci_save_device_data(mvdev, true);
|
|
if (IS_ERR(migf))
|
|
return ERR_CAST(migf);
|
|
get_file(migf->filp);
|
|
mvdev->saving_migf = migf;
|
|
return migf->filp;
|
|
}
|
|
|
|
if (cur == VFIO_DEVICE_STATE_PRE_COPY_P2P && new == VFIO_DEVICE_STATE_STOP_COPY) {
|
|
ret = mlx5vf_cmd_suspend_vhca(mvdev,
|
|
MLX5_SUSPEND_VHCA_IN_OP_MOD_SUSPEND_RESPONDER);
|
|
if (ret)
|
|
return ERR_PTR(ret);
|
|
ret = mlx5vf_pci_save_device_inc_data(mvdev);
|
|
return ret ? ERR_PTR(ret) : NULL;
|
|
}
|
|
|
|
/*
|
|
* vfio_mig_get_next_state() does not use arcs other than the above
|
|
*/
|
|
WARN_ON(true);
|
|
return ERR_PTR(-EINVAL);
|
|
}
|
|
|
|
/*
|
|
* This function is called in all state_mutex unlock cases to
|
|
* handle a 'deferred_reset' if exists.
|
|
*/
|
|
void mlx5vf_state_mutex_unlock(struct mlx5vf_pci_core_device *mvdev)
|
|
{
|
|
again:
|
|
spin_lock(&mvdev->reset_lock);
|
|
if (mvdev->deferred_reset) {
|
|
mvdev->deferred_reset = false;
|
|
spin_unlock(&mvdev->reset_lock);
|
|
mvdev->mig_state = VFIO_DEVICE_STATE_RUNNING;
|
|
mlx5vf_disable_fds(mvdev);
|
|
goto again;
|
|
}
|
|
mutex_unlock(&mvdev->state_mutex);
|
|
spin_unlock(&mvdev->reset_lock);
|
|
}
|
|
|
|
static struct file *
|
|
mlx5vf_pci_set_device_state(struct vfio_device *vdev,
|
|
enum vfio_device_mig_state new_state)
|
|
{
|
|
struct mlx5vf_pci_core_device *mvdev = container_of(
|
|
vdev, struct mlx5vf_pci_core_device, core_device.vdev);
|
|
enum vfio_device_mig_state next_state;
|
|
struct file *res = NULL;
|
|
int ret;
|
|
|
|
mutex_lock(&mvdev->state_mutex);
|
|
while (new_state != mvdev->mig_state) {
|
|
ret = vfio_mig_get_next_state(vdev, mvdev->mig_state,
|
|
new_state, &next_state);
|
|
if (ret) {
|
|
res = ERR_PTR(ret);
|
|
break;
|
|
}
|
|
res = mlx5vf_pci_step_device_state_locked(mvdev, next_state);
|
|
if (IS_ERR(res))
|
|
break;
|
|
mvdev->mig_state = next_state;
|
|
if (WARN_ON(res && new_state != mvdev->mig_state)) {
|
|
fput(res);
|
|
res = ERR_PTR(-EINVAL);
|
|
break;
|
|
}
|
|
}
|
|
mlx5vf_state_mutex_unlock(mvdev);
|
|
return res;
|
|
}
|
|
|
|
static int mlx5vf_pci_get_data_size(struct vfio_device *vdev,
|
|
unsigned long *stop_copy_length)
|
|
{
|
|
struct mlx5vf_pci_core_device *mvdev = container_of(
|
|
vdev, struct mlx5vf_pci_core_device, core_device.vdev);
|
|
size_t state_size;
|
|
u64 total_size;
|
|
int ret;
|
|
|
|
mutex_lock(&mvdev->state_mutex);
|
|
ret = mlx5vf_cmd_query_vhca_migration_state(mvdev, &state_size,
|
|
&total_size, 0);
|
|
if (!ret)
|
|
*stop_copy_length = total_size;
|
|
mlx5vf_state_mutex_unlock(mvdev);
|
|
return ret;
|
|
}
|
|
|
|
static int mlx5vf_pci_get_device_state(struct vfio_device *vdev,
|
|
enum vfio_device_mig_state *curr_state)
|
|
{
|
|
struct mlx5vf_pci_core_device *mvdev = container_of(
|
|
vdev, struct mlx5vf_pci_core_device, core_device.vdev);
|
|
|
|
mutex_lock(&mvdev->state_mutex);
|
|
*curr_state = mvdev->mig_state;
|
|
mlx5vf_state_mutex_unlock(mvdev);
|
|
return 0;
|
|
}
|
|
|
|
static void mlx5vf_pci_aer_reset_done(struct pci_dev *pdev)
|
|
{
|
|
struct mlx5vf_pci_core_device *mvdev = mlx5vf_drvdata(pdev);
|
|
|
|
if (!mvdev->migrate_cap)
|
|
return;
|
|
|
|
/*
|
|
* As the higher VFIO layers are holding locks across reset and using
|
|
* those same locks with the mm_lock we need to prevent ABBA deadlock
|
|
* with the state_mutex and mm_lock.
|
|
* In case the state_mutex was taken already we defer the cleanup work
|
|
* to the unlock flow of the other running context.
|
|
*/
|
|
spin_lock(&mvdev->reset_lock);
|
|
mvdev->deferred_reset = true;
|
|
if (!mutex_trylock(&mvdev->state_mutex)) {
|
|
spin_unlock(&mvdev->reset_lock);
|
|
return;
|
|
}
|
|
spin_unlock(&mvdev->reset_lock);
|
|
mlx5vf_state_mutex_unlock(mvdev);
|
|
}
|
|
|
|
static int mlx5vf_pci_open_device(struct vfio_device *core_vdev)
|
|
{
|
|
struct mlx5vf_pci_core_device *mvdev = container_of(
|
|
core_vdev, struct mlx5vf_pci_core_device, core_device.vdev);
|
|
struct vfio_pci_core_device *vdev = &mvdev->core_device;
|
|
int ret;
|
|
|
|
ret = vfio_pci_core_enable(vdev);
|
|
if (ret)
|
|
return ret;
|
|
|
|
if (mvdev->migrate_cap)
|
|
mvdev->mig_state = VFIO_DEVICE_STATE_RUNNING;
|
|
vfio_pci_core_finish_enable(vdev);
|
|
return 0;
|
|
}
|
|
|
|
static void mlx5vf_pci_close_device(struct vfio_device *core_vdev)
|
|
{
|
|
struct mlx5vf_pci_core_device *mvdev = container_of(
|
|
core_vdev, struct mlx5vf_pci_core_device, core_device.vdev);
|
|
|
|
mlx5vf_cmd_close_migratable(mvdev);
|
|
vfio_pci_core_close_device(core_vdev);
|
|
}
|
|
|
|
static const struct vfio_migration_ops mlx5vf_pci_mig_ops = {
|
|
.migration_set_state = mlx5vf_pci_set_device_state,
|
|
.migration_get_state = mlx5vf_pci_get_device_state,
|
|
.migration_get_data_size = mlx5vf_pci_get_data_size,
|
|
};
|
|
|
|
static const struct vfio_log_ops mlx5vf_pci_log_ops = {
|
|
.log_start = mlx5vf_start_page_tracker,
|
|
.log_stop = mlx5vf_stop_page_tracker,
|
|
.log_read_and_clear = mlx5vf_tracker_read_and_clear,
|
|
};
|
|
|
|
static int mlx5vf_pci_init_dev(struct vfio_device *core_vdev)
|
|
{
|
|
struct mlx5vf_pci_core_device *mvdev = container_of(core_vdev,
|
|
struct mlx5vf_pci_core_device, core_device.vdev);
|
|
int ret;
|
|
|
|
ret = vfio_pci_core_init_dev(core_vdev);
|
|
if (ret)
|
|
return ret;
|
|
|
|
mlx5vf_cmd_set_migratable(mvdev, &mlx5vf_pci_mig_ops,
|
|
&mlx5vf_pci_log_ops);
|
|
|
|
return 0;
|
|
}
|
|
|
|
static void mlx5vf_pci_release_dev(struct vfio_device *core_vdev)
|
|
{
|
|
struct mlx5vf_pci_core_device *mvdev = container_of(core_vdev,
|
|
struct mlx5vf_pci_core_device, core_device.vdev);
|
|
|
|
mlx5vf_cmd_remove_migratable(mvdev);
|
|
vfio_pci_core_release_dev(core_vdev);
|
|
}
|
|
|
|
static const struct vfio_device_ops mlx5vf_pci_ops = {
|
|
.name = "mlx5-vfio-pci",
|
|
.init = mlx5vf_pci_init_dev,
|
|
.release = mlx5vf_pci_release_dev,
|
|
.open_device = mlx5vf_pci_open_device,
|
|
.close_device = mlx5vf_pci_close_device,
|
|
.ioctl = vfio_pci_core_ioctl,
|
|
.device_feature = vfio_pci_core_ioctl_feature,
|
|
.read = vfio_pci_core_read,
|
|
.write = vfio_pci_core_write,
|
|
.mmap = vfio_pci_core_mmap,
|
|
.request = vfio_pci_core_request,
|
|
.match = vfio_pci_core_match,
|
|
.bind_iommufd = vfio_iommufd_physical_bind,
|
|
.unbind_iommufd = vfio_iommufd_physical_unbind,
|
|
.attach_ioas = vfio_iommufd_physical_attach_ioas,
|
|
.detach_ioas = vfio_iommufd_physical_detach_ioas,
|
|
};
|
|
|
|
static int mlx5vf_pci_probe(struct pci_dev *pdev,
|
|
const struct pci_device_id *id)
|
|
{
|
|
struct mlx5vf_pci_core_device *mvdev;
|
|
int ret;
|
|
|
|
mvdev = vfio_alloc_device(mlx5vf_pci_core_device, core_device.vdev,
|
|
&pdev->dev, &mlx5vf_pci_ops);
|
|
if (IS_ERR(mvdev))
|
|
return PTR_ERR(mvdev);
|
|
|
|
dev_set_drvdata(&pdev->dev, &mvdev->core_device);
|
|
ret = vfio_pci_core_register_device(&mvdev->core_device);
|
|
if (ret)
|
|
goto out_put_vdev;
|
|
return 0;
|
|
|
|
out_put_vdev:
|
|
vfio_put_device(&mvdev->core_device.vdev);
|
|
return ret;
|
|
}
|
|
|
|
static void mlx5vf_pci_remove(struct pci_dev *pdev)
|
|
{
|
|
struct mlx5vf_pci_core_device *mvdev = mlx5vf_drvdata(pdev);
|
|
|
|
vfio_pci_core_unregister_device(&mvdev->core_device);
|
|
vfio_put_device(&mvdev->core_device.vdev);
|
|
}
|
|
|
|
static const struct pci_device_id mlx5vf_pci_table[] = {
|
|
{ PCI_DRIVER_OVERRIDE_DEVICE_VFIO(PCI_VENDOR_ID_MELLANOX, 0x101e) }, /* ConnectX Family mlx5Gen Virtual Function */
|
|
{}
|
|
};
|
|
|
|
MODULE_DEVICE_TABLE(pci, mlx5vf_pci_table);
|
|
|
|
static const struct pci_error_handlers mlx5vf_err_handlers = {
|
|
.reset_done = mlx5vf_pci_aer_reset_done,
|
|
.error_detected = vfio_pci_core_aer_err_detected,
|
|
};
|
|
|
|
static struct pci_driver mlx5vf_pci_driver = {
|
|
.name = KBUILD_MODNAME,
|
|
.id_table = mlx5vf_pci_table,
|
|
.probe = mlx5vf_pci_probe,
|
|
.remove = mlx5vf_pci_remove,
|
|
.err_handler = &mlx5vf_err_handlers,
|
|
.driver_managed_dma = true,
|
|
};
|
|
|
|
module_pci_driver(mlx5vf_pci_driver);
|
|
|
|
MODULE_IMPORT_NS(IOMMUFD);
|
|
MODULE_LICENSE("GPL");
|
|
MODULE_AUTHOR("Max Gurtovoy <mgurtovoy@nvidia.com>");
|
|
MODULE_AUTHOR("Yishai Hadas <yishaih@nvidia.com>");
|
|
MODULE_DESCRIPTION(
|
|
"MLX5 VFIO PCI - User Level meta-driver for MLX5 device family");
|