From ddad99c98629406cfce48d9f1b7034bfd882c4b2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Fabian=20Gr=C3=BCnbichler?= Date: Mon, 4 Dec 2017 09:08:19 +0100 Subject: [PATCH] cherry-pick vhost perf regression and mem-leak fix --- ...0008-vhost-fix-skb-leak-in-handle_rx.patch | 72 ++++++++++++++++ .../0009-tun-free-skb-in-early-errors.patch | 86 +++++++++++++++++++ .../0010-tap-free-skb-if-flags-error.patch | 58 +++++++++++++ 3 files changed, 216 insertions(+) create mode 100644 patches/kernel/0008-vhost-fix-skb-leak-in-handle_rx.patch create mode 100644 patches/kernel/0009-tun-free-skb-in-early-errors.patch create mode 100644 patches/kernel/0010-tap-free-skb-if-flags-error.patch diff --git a/patches/kernel/0008-vhost-fix-skb-leak-in-handle_rx.patch b/patches/kernel/0008-vhost-fix-skb-leak-in-handle_rx.patch new file mode 100644 index 0000000..7938167 --- /dev/null +++ b/patches/kernel/0008-vhost-fix-skb-leak-in-handle_rx.patch @@ -0,0 +1,72 @@ +From 15bea51dda49a0403a9a84fb26f3376636d2a3c2 Mon Sep 17 00:00:00 2001 +From: Wei Xu +Date: Fri, 1 Dec 2017 05:10:36 -0500 +Subject: [PATCH 08/13] vhost: fix skb leak in handle_rx() +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +Matthew found a roughly 40% tcp throughput regression with commit +c67df11f(vhost_net: try batch dequing from skb array) as discussed +in the following thread: +https://www.mail-archive.com/netdev@vger.kernel.org/msg187936.html + +Eventually we figured out that it was a skb leak in handle_rx() +when sending packets to the VM. This usually happens when a guest +can not drain out vq as fast as vhost fills in, afterwards it sets +off the traffic jam and leaks skb(s) which occurs as no headcount +to send on the vq from vhost side. + +This can be avoided by making sure we have got enough headcount +before actually consuming a skb from the batched rx array while +transmitting, which is simply done by moving checking the zero +headcount a bit ahead. + +Signed-off-by: Wei Xu +Reported-by: Matthew Rosato +Signed-off-by: Fabian Grünbichler +--- + drivers/vhost/net.c | 20 ++++++++++---------- + 1 file changed, 10 insertions(+), 10 deletions(-) + +diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c +index 1c75572f5a3f..010253847022 100644 +--- a/drivers/vhost/net.c ++++ b/drivers/vhost/net.c +@@ -781,16 +781,6 @@ static void handle_rx(struct vhost_net *net) + /* On error, stop handling until the next kick. */ + if (unlikely(headcount < 0)) + goto out; +- if (nvq->rx_array) +- msg.msg_control = vhost_net_buf_consume(&nvq->rxq); +- /* On overrun, truncate and discard */ +- if (unlikely(headcount > UIO_MAXIOV)) { +- iov_iter_init(&msg.msg_iter, READ, vq->iov, 1, 1); +- err = sock->ops->recvmsg(sock, &msg, +- 1, MSG_DONTWAIT | MSG_TRUNC); +- pr_debug("Discarded rx packet: len %zd\n", sock_len); +- continue; +- } + /* OK, now we need to know about added descriptors. */ + if (!headcount) { + if (unlikely(vhost_enable_notify(&net->dev, vq))) { +@@ -803,6 +793,16 @@ static void handle_rx(struct vhost_net *net) + * they refilled. */ + goto out; + } ++ if (nvq->rx_array) ++ msg.msg_control = vhost_net_buf_consume(&nvq->rxq); ++ /* On overrun, truncate and discard */ ++ if (unlikely(headcount > UIO_MAXIOV)) { ++ iov_iter_init(&msg.msg_iter, READ, vq->iov, 1, 1); ++ err = sock->ops->recvmsg(sock, &msg, ++ 1, MSG_DONTWAIT | MSG_TRUNC); ++ pr_debug("Discarded rx packet: len %zd\n", sock_len); ++ continue; ++ } + /* We don't need to be notified again. */ + iov_iter_init(&msg.msg_iter, READ, vq->iov, in, vhost_len); + fixup = msg.msg_iter; +-- +2.14.2 + diff --git a/patches/kernel/0009-tun-free-skb-in-early-errors.patch b/patches/kernel/0009-tun-free-skb-in-early-errors.patch new file mode 100644 index 0000000..f76eab5 --- /dev/null +++ b/patches/kernel/0009-tun-free-skb-in-early-errors.patch @@ -0,0 +1,86 @@ +From ccc0b8662620d562798183b77bcd30125d2d14f3 Mon Sep 17 00:00:00 2001 +From: Wei Xu +Date: Fri, 1 Dec 2017 05:10:37 -0500 +Subject: [PATCH 09/13] tun: free skb in early errors +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +tun_recvmsg() supports accepting skb by msg_control after +commit ac77cfd4258f ("tun: support receiving skb through msg_control"), +the skb if presented should be freed no matter how far it can go +along, otherwise it would be leaked. + +This patch fixes several missed cases. + +Signed-off-by: Wei Xu +Reported-by: Matthew Rosato +Signed-off-by: Fabian Grünbichler +--- + drivers/net/tun.c | 24 ++++++++++++++++++------ + 1 file changed, 18 insertions(+), 6 deletions(-) + +diff --git a/drivers/net/tun.c b/drivers/net/tun.c +index cb1f7747adad..5143e948d7d1 100644 +--- a/drivers/net/tun.c ++++ b/drivers/net/tun.c +@@ -1519,8 +1519,11 @@ static ssize_t tun_do_read(struct tun_struct *tun, struct tun_file *tfile, + + tun_debug(KERN_INFO, tun, "tun_do_read\n"); + +- if (!iov_iter_count(to)) ++ if (!iov_iter_count(to)) { ++ if (skb) ++ kfree_skb(skb); + return 0; ++ } + + if (!skb) { + /* Read frames from ring */ +@@ -1636,22 +1639,24 @@ static int tun_recvmsg(struct socket *sock, struct msghdr *m, size_t total_len, + { + struct tun_file *tfile = container_of(sock, struct tun_file, socket); + struct tun_struct *tun = __tun_get(tfile); ++ struct sk_buff *skb = m->msg_control; + int ret; + +- if (!tun) +- return -EBADFD; ++ if (!tun) { ++ ret = -EBADFD; ++ goto out_free_skb; ++ } + + if (flags & ~(MSG_DONTWAIT|MSG_TRUNC|MSG_ERRQUEUE)) { + ret = -EINVAL; +- goto out; ++ goto out_put_tun; + } + if (flags & MSG_ERRQUEUE) { + ret = sock_recv_errqueue(sock->sk, m, total_len, + SOL_PACKET, TUN_TX_TIMESTAMP); + goto out; + } +- ret = tun_do_read(tun, tfile, &m->msg_iter, flags & MSG_DONTWAIT, +- m->msg_control); ++ ret = tun_do_read(tun, tfile, &m->msg_iter, flags & MSG_DONTWAIT, skb); + if (ret > (ssize_t)total_len) { + m->msg_flags |= MSG_TRUNC; + ret = flags & MSG_TRUNC ? ret : total_len; +@@ -1659,6 +1664,13 @@ static int tun_recvmsg(struct socket *sock, struct msghdr *m, size_t total_len, + out: + tun_put(tun); + return ret; ++ ++out_put_tun: ++ tun_put(tun); ++out_free_skb: ++ if (skb) ++ kfree_skb(skb); ++ return ret; + } + + static int tun_peek_len(struct socket *sock) +-- +2.14.2 + diff --git a/patches/kernel/0010-tap-free-skb-if-flags-error.patch b/patches/kernel/0010-tap-free-skb-if-flags-error.patch new file mode 100644 index 0000000..c9521b3 --- /dev/null +++ b/patches/kernel/0010-tap-free-skb-if-flags-error.patch @@ -0,0 +1,58 @@ +From 68783aa6989756cda8e9e305292afbb9f4f5677c Mon Sep 17 00:00:00 2001 +From: Wei Xu +Date: Fri, 1 Dec 2017 05:10:38 -0500 +Subject: [PATCH 10/13] tap: free skb if flags error +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +tap_recvmsg() supports accepting skb by msg_control after +commit 3b4ba04acca8 ("tap: support receiving skb from msg_control"), +the skb if presented should be freed within the function, otherwise +it would be leaked. + +Signed-off-by: Wei Xu +Reported-by: Matthew Rosato +Signed-off-by: Fabian Grünbichler +--- + drivers/net/tap.c | 14 ++++++++++---- + 1 file changed, 10 insertions(+), 4 deletions(-) + +diff --git a/drivers/net/tap.c b/drivers/net/tap.c +index 3570c7576993..4e04b6094f3c 100644 +--- a/drivers/net/tap.c ++++ b/drivers/net/tap.c +@@ -829,8 +829,11 @@ static ssize_t tap_do_read(struct tap_queue *q, + DEFINE_WAIT(wait); + ssize_t ret = 0; + +- if (!iov_iter_count(to)) ++ if (!iov_iter_count(to)) { ++ if (skb) ++ kfree_skb(skb); + return 0; ++ } + + if (skb) + goto put; +@@ -1155,11 +1158,14 @@ static int tap_recvmsg(struct socket *sock, struct msghdr *m, + size_t total_len, int flags) + { + struct tap_queue *q = container_of(sock, struct tap_queue, sock); ++ struct sk_buff *skb = m->msg_control; + int ret; +- if (flags & ~(MSG_DONTWAIT|MSG_TRUNC)) ++ if (flags & ~(MSG_DONTWAIT|MSG_TRUNC)) { ++ if (skb) ++ kfree_skb(skb); + return -EINVAL; +- ret = tap_do_read(q, &m->msg_iter, flags & MSG_DONTWAIT, +- m->msg_control); ++ } ++ ret = tap_do_read(q, &m->msg_iter, flags & MSG_DONTWAIT, skb); + if (ret > total_len) { + m->msg_flags |= MSG_TRUNC; + ret = flags & MSG_TRUNC ? ret : total_len; +-- +2.14.2 +