* [PATCH v2] e1000e: using bottom half to send packets
@ 2020-07-20 16:45 Li Qiang
2020-07-21 2:03 ` Jason Wang
0 siblings, 1 reply; 6+ messages in thread
From: Li Qiang @ 2020-07-20 16:45 UTC (permalink / raw
To: dmitry.fleytman, jasowang, pbonzini
Cc: alxndr, Li Qiang, liq3ea, qemu-devel, ppandit
Alexander Bulekov reported a UAF bug related to e1000e packet transmission.
-->https://bugs.launchpad.net/qemu/+bug/1886362
This is because the guest triggers an e1000e packet send and sets the
data's address to e1000e's MMIO address. So when the e1000e does DMA
it will write the MMIO again, triggering re-entrancy and finally
causing this UAF.
Paolo suggested using a bottom half whenever MMIO is doing complicated
things, here:
-->https://lists.nongnu.org/archive/html/qemu-devel/2020-07/msg03342.html
Reference here:
'The easiest solution is to delay processing of descriptors to a bottom
half whenever MMIO is doing something complicated. This is also better
for latency because it will free the vCPU thread more quickly and leave
the work to the I/O thread.'
This patch fixes this UAF.
Signed-off-by: Li Qiang <liq3ea@163.com>
---
Change since v1:
Per Jason's review here:
-- https://lists.nongnu.org/archive/html/qemu-devel/2020-07/msg05368.html
1. Cancel and schedule the tx bh when VM is stopped or resume
2. Add a tx_burst for e1000e configuration to throttle the bh execution
3. Add a tx_waiting to record whether the bh is pending or not
Don't take the BQL in the tx_bh handler, as the BQL is already held
when tx_bh is executed.
hw/net/e1000e.c | 6 +++
hw/net/e1000e_core.c | 106 ++++++++++++++++++++++++++++++++++---------
hw/net/e1000e_core.h | 8 ++++
3 files changed, 98 insertions(+), 22 deletions(-)
diff --git a/hw/net/e1000e.c b/hw/net/e1000e.c
index fda34518c9..24e35a78bf 100644
--- a/hw/net/e1000e.c
+++ b/hw/net/e1000e.c
@@ -77,10 +77,14 @@ typedef struct E1000EState {
bool disable_vnet;
+ int32_t tx_burst;
+
E1000ECore core;
} E1000EState;
+#define TX_BURST 256
+
#define E1000E_MMIO_IDX 0
#define E1000E_FLASH_IDX 1
#define E1000E_IO_IDX 2
@@ -263,6 +267,7 @@ static void e1000e_core_realize(E1000EState *s)
{
s->core.owner = &s->parent_obj;
s->core.owner_nic = s->nic;
+ s->core.tx_burst = s->tx_burst;
}
static void
@@ -665,6 +670,7 @@ static Property e1000e_properties[] = {
e1000e_prop_subsys_ven, uint16_t),
DEFINE_PROP_SIGNED("subsys", E1000EState, subsys, 0,
e1000e_prop_subsys, uint16_t),
+ DEFINE_PROP_INT32("x-txburst", E1000EState, tx_burst, TX_BURST),
DEFINE_PROP_END_OF_LIST(),
};
diff --git a/hw/net/e1000e_core.c b/hw/net/e1000e_core.c
index bcd186cac5..0a38a50cca 100644
--- a/hw/net/e1000e_core.c
+++ b/hw/net/e1000e_core.c
@@ -909,19 +909,18 @@ e1000e_rx_ring_init(E1000ECore *core, E1000E_RxRing *rxr, int idx)
rxr->i = &i[idx];
}
-static void
-e1000e_start_xmit(E1000ECore *core, const E1000E_TxRing *txr)
+static int32_t
+e1000e_start_xmit(struct e1000e_tx *q)
{
+ E1000ECore *core = q->core;
dma_addr_t base;
struct e1000_tx_desc desc;
- bool ide = false;
- const E1000E_RingInfo *txi = txr->i;
- uint32_t cause = E1000_ICS_TXQE;
+ const E1000E_RingInfo *txi;
+ E1000E_TxRing txr;
+ int32_t num_packets = 0;
- if (!(core->mac[TCTL] & E1000_TCTL_EN)) {
- trace_e1000e_tx_disabled();
- return;
- }
+ e1000e_tx_ring_init(core, &txr, q - &core->tx[0]);
+ txi = txr.i;
while (!e1000e_ring_empty(core, txi)) {
base = e1000e_ring_head_descr(core, txi);
@@ -931,15 +930,17 @@ e1000e_start_xmit(E1000ECore *core, const E1000E_TxRing *txr)
trace_e1000e_tx_descr((void *)(intptr_t)desc.buffer_addr,
desc.lower.data, desc.upper.data);
- e1000e_process_tx_desc(core, txr->tx, &desc, txi->idx);
- cause |= e1000e_txdesc_writeback(core, base, &desc, &ide, txi->idx);
+ e1000e_process_tx_desc(core, txr.tx, &desc, txi->idx);
+ q->cause |= e1000e_txdesc_writeback(core, base, &desc,
+ &q->ide, txi->idx);
e1000e_ring_advance(core, txi, 1);
+ if (++num_packets >= core->tx_burst) {
+ break;
+ }
}
- if (!ide || !e1000e_intrmgr_delay_tx_causes(core, &cause)) {
- e1000e_set_interrupt_cause(core, cause);
- }
+ return num_packets;
}
static bool
@@ -2423,32 +2424,41 @@ e1000e_set_dbal(E1000ECore *core, int index, uint32_t val)
static void
e1000e_set_tctl(E1000ECore *core, int index, uint32_t val)
{
- E1000E_TxRing txr;
core->mac[index] = val;
if (core->mac[TARC0] & E1000_TARC_ENABLE) {
- e1000e_tx_ring_init(core, &txr, 0);
- e1000e_start_xmit(core, &txr);
+ if (core->tx[0].tx_waiting) {
+ return;
+ }
+ core->tx[0].tx_waiting = 1;
+ if (!core->vm_running) {
+ return;
+ }
+ qemu_bh_schedule(core->tx[0].tx_bh);
}
if (core->mac[TARC1] & E1000_TARC_ENABLE) {
- e1000e_tx_ring_init(core, &txr, 1);
- e1000e_start_xmit(core, &txr);
+ if (core->tx[1].tx_waiting) {
+ return;
+ }
+ core->tx[1].tx_waiting = 1;
+ if (!core->vm_running) {
+ return;
+ }
+ qemu_bh_schedule(core->tx[1].tx_bh);
}
}
static void
e1000e_set_tdt(E1000ECore *core, int index, uint32_t val)
{
- E1000E_TxRing txr;
int qidx = e1000e_mq_queue_idx(TDT, index);
uint32_t tarc_reg = (qidx == 0) ? TARC0 : TARC1;
core->mac[index] = val & 0xffff;
if (core->mac[tarc_reg] & E1000_TARC_ENABLE) {
- e1000e_tx_ring_init(core, &txr, qidx);
- e1000e_start_xmit(core, &txr);
+ qemu_bh_schedule(core->tx[qidx].tx_bh);
}
}
@@ -3315,10 +3325,56 @@ e1000e_vm_state_change(void *opaque, int running, RunState state)
trace_e1000e_vm_state_running();
e1000e_intrmgr_resume(core);
e1000e_autoneg_resume(core);
+ core->vm_running = 1;
+
+ for (int i = 0; i < E1000E_NUM_QUEUES; i++) {
+ qemu_bh_schedule(core->tx[i].tx_bh);
+ }
+
} else {
trace_e1000e_vm_state_stopped();
+
+ for (int i = 0; i < E1000E_NUM_QUEUES; i++) {
+ qemu_bh_cancel(core->tx[i].tx_bh);
+ }
+
e1000e_autoneg_pause(core);
e1000e_intrmgr_pause(core);
+ core->vm_running = 0;
+ }
+}
+
+
+static void e1000e_core_tx_bh(void *opaque)
+{
+ struct e1000e_tx *q = opaque;
+ E1000ECore *core = q->core;
+ int32_t ret;
+
+ if (!core->vm_running) {
+ assert(q->tx_waiting);
+ return;
+ }
+
+ q->tx_waiting = 0;
+
+ if (!(core->mac[TCTL] & E1000_TCTL_EN)) {
+ trace_e1000e_tx_disabled();
+ return;
+ }
+
+ q->cause = E1000_ICS_TXQE;
+ q->ide = false;
+
+ ret = e1000e_start_xmit(q);
+ if (ret >= core->tx_burst) {
+ qemu_bh_schedule(q->tx_bh);
+ q->tx_waiting = 1;
+ return;
+ }
+
+ if (!q->ide || !e1000e_intrmgr_delay_tx_causes(core, &q->cause)) {
+ e1000e_set_interrupt_cause(core, q->cause);
}
}
@@ -3334,12 +3390,15 @@ e1000e_core_pci_realize(E1000ECore *core,
e1000e_autoneg_timer, core);
e1000e_intrmgr_pci_realize(core);
+ core->vm_running = runstate_is_running();
core->vmstate =
qemu_add_vm_change_state_handler(e1000e_vm_state_change, core);
for (i = 0; i < E1000E_NUM_QUEUES; i++) {
net_tx_pkt_init(&core->tx[i].tx_pkt, core->owner,
E1000E_MAX_TX_FRAGS, core->has_vnet);
+ core->tx[i].core = core;
+ core->tx[i].tx_bh = qemu_bh_new(e1000e_core_tx_bh, &core->tx[i]);
}
net_rx_pkt_init(&core->rx_pkt, core->has_vnet);
@@ -3367,6 +3426,9 @@ e1000e_core_pci_uninit(E1000ECore *core)
for (i = 0; i < E1000E_NUM_QUEUES; i++) {
net_tx_pkt_reset(core->tx[i].tx_pkt);
net_tx_pkt_uninit(core->tx[i].tx_pkt);
+ qemu_bh_cancel(core->tx[i].tx_bh);
+ qemu_bh_delete(core->tx[i].tx_bh);
+ core->tx[i].tx_bh = NULL;
}
net_rx_pkt_uninit(core->rx_pkt);
diff --git a/hw/net/e1000e_core.h b/hw/net/e1000e_core.h
index aee32f7e48..0c16dce3a6 100644
--- a/hw/net/e1000e_core.h
+++ b/hw/net/e1000e_core.h
@@ -77,10 +77,18 @@ struct E1000Core {
unsigned char sum_needed;
bool cptse;
struct NetTxPkt *tx_pkt;
+ QEMUBH *tx_bh;
+ uint32_t tx_waiting;
+ uint32_t cause;
+ bool ide;
+ E1000ECore *core;
} tx[E1000E_NUM_QUEUES];
struct NetRxPkt *rx_pkt;
+ int32_t tx_burst;
+
+ bool vm_running;
bool has_vnet;
int max_queue_num;
--
2.17.1
^ permalink raw reply related [flat|nested] 6+ messages in thread
* Re: [PATCH v2] e1000e: using bottom half to send packets
2020-07-20 16:45 [PATCH v2] e1000e: using bottom half to send packets Li Qiang
@ 2020-07-21 2:03 ` Jason Wang
2020-07-21 4:33 ` Li Qiang
0 siblings, 1 reply; 6+ messages in thread
From: Jason Wang @ 2020-07-21 2:03 UTC (permalink / raw
To: Li Qiang, dmitry.fleytman, pbonzini; +Cc: alxndr, liq3ea, qemu-devel, ppandit
On 2020/7/21 上午12:45, Li Qiang wrote:
> Alexander Bulekov reported a UAF bug related e1000e packets send.
>
> -->https://bugs.launchpad.net/qemu/+bug/1886362
>
> This is because the guest trigger a e1000e packet send and set the
> data's address to e1000e's MMIO address. So when the e1000e do DMA
> it will write the MMIO again and trigger re-entrancy and finally
> causes this UAF.
>
> Paolo suggested to use a bottom half whenever MMIO is doing complicate
> things in here:
> -->https://lists.nongnu.org/archive/html/qemu-devel/2020-07/msg03342.html
>
> Reference here:
> 'The easiest solution is to delay processing of descriptors to a bottom
> half whenever MMIO is doing something complicated. This is also better
> for latency because it will free the vCPU thread more quickly and leave
> the work to the I/O thread.'
>
> This patch fixes this UAF.
>
> Signed-off-by: Li Qiang <liq3ea@163.com>
> ---
> Change since v1:
> Per Jason's review here:
> -- https://lists.nongnu.org/archive/html/qemu-devel/2020-07/msg05368.html
> 1. Cancel and schedule the tx bh when VM is stopped or resume
> 2. Add a tx_burst for e1000e configuration to throttle the bh execution
> 3. Add a tx_waiting to record whether the bh is pending or not
> Don't use BQL in the tx_bh handler as when tx_bh is executed, the BQL is
> acquired.
>
> hw/net/e1000e.c | 6 +++
> hw/net/e1000e_core.c | 106 ++++++++++++++++++++++++++++++++++---------
> hw/net/e1000e_core.h | 8 ++++
> 3 files changed, 98 insertions(+), 22 deletions(-)
>
> diff --git a/hw/net/e1000e.c b/hw/net/e1000e.c
> index fda34518c9..24e35a78bf 100644
> --- a/hw/net/e1000e.c
> +++ b/hw/net/e1000e.c
> @@ -77,10 +77,14 @@ typedef struct E1000EState {
>
> bool disable_vnet;
>
> + int32_t tx_burst;
> +
> E1000ECore core;
>
> } E1000EState;
>
> +#define TX_BURST 256
> +
> #define E1000E_MMIO_IDX 0
> #define E1000E_FLASH_IDX 1
> #define E1000E_IO_IDX 2
> @@ -263,6 +267,7 @@ static void e1000e_core_realize(E1000EState *s)
> {
> s->core.owner = &s->parent_obj;
> s->core.owner_nic = s->nic;
> + s->core.tx_burst = s->tx_burst;
> }
>
> static void
> @@ -665,6 +670,7 @@ static Property e1000e_properties[] = {
> e1000e_prop_subsys_ven, uint16_t),
> DEFINE_PROP_SIGNED("subsys", E1000EState, subsys, 0,
> e1000e_prop_subsys, uint16_t),
> + DEFINE_PROP_INT32("x-txburst", E1000EState, tx_burst, TX_BURST),
> DEFINE_PROP_END_OF_LIST(),
> };
>
> diff --git a/hw/net/e1000e_core.c b/hw/net/e1000e_core.c
> index bcd186cac5..0a38a50cca 100644
> --- a/hw/net/e1000e_core.c
> +++ b/hw/net/e1000e_core.c
> @@ -909,19 +909,18 @@ e1000e_rx_ring_init(E1000ECore *core, E1000E_RxRing *rxr, int idx)
> rxr->i = &i[idx];
> }
>
> -static void
> -e1000e_start_xmit(E1000ECore *core, const E1000E_TxRing *txr)
> +static int32_t
> +e1000e_start_xmit(struct e1000e_tx *q)
> {
> + E1000ECore *core = q->core;
> dma_addr_t base;
> struct e1000_tx_desc desc;
> - bool ide = false;
> - const E1000E_RingInfo *txi = txr->i;
> - uint32_t cause = E1000_ICS_TXQE;
> + const E1000E_RingInfo *txi;
> + E1000E_TxRing txr;
> + int32_t num_packets = 0;
>
> - if (!(core->mac[TCTL] & E1000_TCTL_EN)) {
> - trace_e1000e_tx_disabled();
> - return;
> - }
> + e1000e_tx_ring_init(core, &txr, q - &core->tx[0]);
> + txi = txr.i;
>
> while (!e1000e_ring_empty(core, txi)) {
> base = e1000e_ring_head_descr(core, txi);
> @@ -931,15 +930,17 @@ e1000e_start_xmit(E1000ECore *core, const E1000E_TxRing *txr)
> trace_e1000e_tx_descr((void *)(intptr_t)desc.buffer_addr,
> desc.lower.data, desc.upper.data);
>
> - e1000e_process_tx_desc(core, txr->tx, &desc, txi->idx);
> - cause |= e1000e_txdesc_writeback(core, base, &desc, &ide, txi->idx);
> + e1000e_process_tx_desc(core, txr.tx, &desc, txi->idx);
> + q->cause |= e1000e_txdesc_writeback(core, base, &desc,
> + &q->ide, txi->idx);
>
> e1000e_ring_advance(core, txi, 1);
> + if (++num_packets >= core->tx_burst) {
> + break;
> + }
> }
>
> - if (!ide || !e1000e_intrmgr_delay_tx_causes(core, &cause)) {
> - e1000e_set_interrupt_cause(core, cause);
> - }
> + return num_packets;
> }
>
> static bool
> @@ -2423,32 +2424,41 @@ e1000e_set_dbal(E1000ECore *core, int index, uint32_t val)
> static void
> e1000e_set_tctl(E1000ECore *core, int index, uint32_t val)
> {
> - E1000E_TxRing txr;
> core->mac[index] = val;
>
> if (core->mac[TARC0] & E1000_TARC_ENABLE) {
> - e1000e_tx_ring_init(core, &txr, 0);
> - e1000e_start_xmit(core, &txr);
> + if (core->tx[0].tx_waiting) {
> + return;
> + }
> + core->tx[0].tx_waiting = 1;
> + if (!core->vm_running) {
> + return;
> + }
> + qemu_bh_schedule(core->tx[0].tx_bh);
> }
>
> if (core->mac[TARC1] & E1000_TARC_ENABLE) {
> - e1000e_tx_ring_init(core, &txr, 1);
> - e1000e_start_xmit(core, &txr);
> + if (core->tx[1].tx_waiting) {
> + return;
> + }
> + core->tx[1].tx_waiting = 1;
> + if (!core->vm_running) {
> + return;
> + }
> + qemu_bh_schedule(core->tx[1].tx_bh);
> }
> }
>
> static void
> e1000e_set_tdt(E1000ECore *core, int index, uint32_t val)
> {
> - E1000E_TxRing txr;
> int qidx = e1000e_mq_queue_idx(TDT, index);
> uint32_t tarc_reg = (qidx == 0) ? TARC0 : TARC1;
>
> core->mac[index] = val & 0xffff;
>
> if (core->mac[tarc_reg] & E1000_TARC_ENABLE) {
> - e1000e_tx_ring_init(core, &txr, qidx);
> - e1000e_start_xmit(core, &txr);
> + qemu_bh_schedule(core->tx[qidx].tx_bh);
> }
> }
>
> @@ -3315,10 +3325,56 @@ e1000e_vm_state_change(void *opaque, int running, RunState state)
> trace_e1000e_vm_state_running();
> e1000e_intrmgr_resume(core);
> e1000e_autoneg_resume(core);
> + core->vm_running = 1;
> +
> + for (int i = 0; i < E1000E_NUM_QUEUES; i++) {
> + qemu_bh_schedule(core->tx[i].tx_bh);
I guess the reason for the unconditional scheduling of the bh is to make
sure tx works after live migration, since we don't migrate tx_waiting.
If yes, it would be better to add a comment here. And do we need to clear tx_waiting here?
> + }
> +
> } else {
> trace_e1000e_vm_state_stopped();
> +
> + for (int i = 0; i < E1000E_NUM_QUEUES; i++) {
> + qemu_bh_cancel(core->tx[i].tx_bh);
> + }
> +
> e1000e_autoneg_pause(core);
> e1000e_intrmgr_pause(core);
> + core->vm_running = 0;
> + }
> +}
> +
> +
> +static void e1000e_core_tx_bh(void *opaque)
> +{
> + struct e1000e_tx *q = opaque;
> + E1000ECore *core = q->core;
> + int32_t ret;
> +
> + if (!core->vm_running) {
> + assert(q->tx_waiting);
> + return;
> + }
> +
> + q->tx_waiting = 0;
> +
> + if (!(core->mac[TCTL] & E1000_TCTL_EN)) {
> + trace_e1000e_tx_disabled();
> + return;
> + }
> +
> + q->cause = E1000_ICS_TXQE;
> + q->ide = false;
> +
> + ret = e1000e_start_xmit(q);
> + if (ret >= core->tx_burst) {
> + qemu_bh_schedule(q->tx_bh);
> + q->tx_waiting = 1;
> + return;
> + }
> +
> + if (!q->ide || !e1000e_intrmgr_delay_tx_causes(core, &q->cause)) {
> + e1000e_set_interrupt_cause(core, q->cause);
So I think this will cause some delay in interrupt delivery. It
looks like it's better to leave setting the ICS in e1000e_start_xmit().
> }
> }
>
> @@ -3334,12 +3390,15 @@ e1000e_core_pci_realize(E1000ECore *core,
> e1000e_autoneg_timer, core);
> e1000e_intrmgr_pci_realize(core);
>
> + core->vm_running = runstate_is_running();
> core->vmstate =
> qemu_add_vm_change_state_handler(e1000e_vm_state_change, core);
>
> for (i = 0; i < E1000E_NUM_QUEUES; i++) {
> net_tx_pkt_init(&core->tx[i].tx_pkt, core->owner,
> E1000E_MAX_TX_FRAGS, core->has_vnet);
> + core->tx[i].core = core;
> + core->tx[i].tx_bh = qemu_bh_new(e1000e_core_tx_bh, &core->tx[i]);
> }
>
> net_rx_pkt_init(&core->rx_pkt, core->has_vnet);
> @@ -3367,6 +3426,9 @@ e1000e_core_pci_uninit(E1000ECore *core)
> for (i = 0; i < E1000E_NUM_QUEUES; i++) {
> net_tx_pkt_reset(core->tx[i].tx_pkt);
> net_tx_pkt_uninit(core->tx[i].tx_pkt);
> + qemu_bh_cancel(core->tx[i].tx_bh);
> + qemu_bh_delete(core->tx[i].tx_bh);
> + core->tx[i].tx_bh = NULL;
> }
>
> net_rx_pkt_uninit(core->rx_pkt);
> diff --git a/hw/net/e1000e_core.h b/hw/net/e1000e_core.h
> index aee32f7e48..0c16dce3a6 100644
> --- a/hw/net/e1000e_core.h
> +++ b/hw/net/e1000e_core.h
> @@ -77,10 +77,18 @@ struct E1000Core {
> unsigned char sum_needed;
> bool cptse;
> struct NetTxPkt *tx_pkt;
> + QEMUBH *tx_bh;
> + uint32_t tx_waiting;
> + uint32_t cause;
> + bool ide;
> + E1000ECore *core;
> } tx[E1000E_NUM_QUEUES];
>
> struct NetRxPkt *rx_pkt;
>
> + int32_t tx_burst;
> +
> + bool vm_running;
> bool has_vnet;
> int max_queue_num;
>
Do we need to cancel the bh and reset tx_waiting in e1000e_core_reset()?
Thanks
^ permalink raw reply [flat|nested] 6+ messages in thread
* Re: [PATCH v2] e1000e: using bottom half to send packets
2020-07-21 2:03 ` Jason Wang
@ 2020-07-21 4:33 ` Li Qiang
2020-07-21 5:30 ` Jason Wang
0 siblings, 1 reply; 6+ messages in thread
From: Li Qiang @ 2020-07-21 4:33 UTC (permalink / raw
To: Jason Wang
Cc: Dmitry Fleytman, Li Qiang, Qemu Developers, P J P,
Alexander Bulekov, Paolo Bonzini
Jason Wang <jasowang@redhat.com> 于2020年7月21日周二 上午10:03写道:
>
>
> On 2020/7/21 上午12:45, Li Qiang wrote:
> > Alexander Bulekov reported a UAF bug related e1000e packets send.
> >
> > -->https://bugs.launchpad.net/qemu/+bug/1886362
> >
> > This is because the guest trigger a e1000e packet send and set the
> > data's address to e1000e's MMIO address. So when the e1000e do DMA
> > it will write the MMIO again and trigger re-entrancy and finally
> > causes this UAF.
> >
> > Paolo suggested to use a bottom half whenever MMIO is doing complicate
> > things in here:
> > -->https://lists.nongnu.org/archive/html/qemu-devel/2020-07/msg03342.html
> >
> > Reference here:
> > 'The easiest solution is to delay processing of descriptors to a bottom
> > half whenever MMIO is doing something complicated. This is also better
> > for latency because it will free the vCPU thread more quickly and leave
> > the work to the I/O thread.'
> >
> > This patch fixes this UAF.
> >
> > Signed-off-by: Li Qiang <liq3ea@163.com>
> > ---
> > Change since v1:
> > Per Jason's review here:
> > -- https://lists.nongnu.org/archive/html/qemu-devel/2020-07/msg05368.html
> > 1. Cancel and schedule the tx bh when VM is stopped or resume
> > 2. Add a tx_burst for e1000e configuration to throttle the bh execution
> > 3. Add a tx_waiting to record whether the bh is pending or not
> > Don't use BQL in the tx_bh handler as when tx_bh is executed, the BQL is
> > acquired.
> >
> > hw/net/e1000e.c | 6 +++
> > hw/net/e1000e_core.c | 106 ++++++++++++++++++++++++++++++++++---------
> > hw/net/e1000e_core.h | 8 ++++
> > 3 files changed, 98 insertions(+), 22 deletions(-)
> >
> > diff --git a/hw/net/e1000e.c b/hw/net/e1000e.c
> > index fda34518c9..24e35a78bf 100644
> > --- a/hw/net/e1000e.c
> > +++ b/hw/net/e1000e.c
> > @@ -77,10 +77,14 @@ typedef struct E1000EState {
> >
> > bool disable_vnet;
> >
> > + int32_t tx_burst;
> > +
> > E1000ECore core;
> >
> > } E1000EState;
> >
> > +#define TX_BURST 256
> > +
> > #define E1000E_MMIO_IDX 0
> > #define E1000E_FLASH_IDX 1
> > #define E1000E_IO_IDX 2
> > @@ -263,6 +267,7 @@ static void e1000e_core_realize(E1000EState *s)
> > {
> > s->core.owner = &s->parent_obj;
> > s->core.owner_nic = s->nic;
> > + s->core.tx_burst = s->tx_burst;
> > }
> >
> > static void
> > @@ -665,6 +670,7 @@ static Property e1000e_properties[] = {
> > e1000e_prop_subsys_ven, uint16_t),
> > DEFINE_PROP_SIGNED("subsys", E1000EState, subsys, 0,
> > e1000e_prop_subsys, uint16_t),
> > + DEFINE_PROP_INT32("x-txburst", E1000EState, tx_burst, TX_BURST),
> > DEFINE_PROP_END_OF_LIST(),
> > };
> >
> > diff --git a/hw/net/e1000e_core.c b/hw/net/e1000e_core.c
> > index bcd186cac5..0a38a50cca 100644
> > --- a/hw/net/e1000e_core.c
> > +++ b/hw/net/e1000e_core.c
> > @@ -909,19 +909,18 @@ e1000e_rx_ring_init(E1000ECore *core, E1000E_RxRing *rxr, int idx)
> > rxr->i = &i[idx];
> > }
> >
> > -static void
> > -e1000e_start_xmit(E1000ECore *core, const E1000E_TxRing *txr)
> > +static int32_t
> > +e1000e_start_xmit(struct e1000e_tx *q)
> > {
> > + E1000ECore *core = q->core;
> > dma_addr_t base;
> > struct e1000_tx_desc desc;
> > - bool ide = false;
> > - const E1000E_RingInfo *txi = txr->i;
> > - uint32_t cause = E1000_ICS_TXQE;
> > + const E1000E_RingInfo *txi;
> > + E1000E_TxRing txr;
> > + int32_t num_packets = 0;
> >
> > - if (!(core->mac[TCTL] & E1000_TCTL_EN)) {
> > - trace_e1000e_tx_disabled();
> > - return;
> > - }
> > + e1000e_tx_ring_init(core, &txr, q - &core->tx[0]);
> > + txi = txr.i;
> >
> > while (!e1000e_ring_empty(core, txi)) {
> > base = e1000e_ring_head_descr(core, txi);
> > @@ -931,15 +930,17 @@ e1000e_start_xmit(E1000ECore *core, const E1000E_TxRing *txr)
> > trace_e1000e_tx_descr((void *)(intptr_t)desc.buffer_addr,
> > desc.lower.data, desc.upper.data);
> >
> > - e1000e_process_tx_desc(core, txr->tx, &desc, txi->idx);
> > - cause |= e1000e_txdesc_writeback(core, base, &desc, &ide, txi->idx);
> > + e1000e_process_tx_desc(core, txr.tx, &desc, txi->idx);
> > + q->cause |= e1000e_txdesc_writeback(core, base, &desc,
> > + &q->ide, txi->idx);
> >
> > e1000e_ring_advance(core, txi, 1);
> > + if (++num_packets >= core->tx_burst) {
> > + break;
> > + }
> > }
> >
> > - if (!ide || !e1000e_intrmgr_delay_tx_causes(core, &cause)) {
> > - e1000e_set_interrupt_cause(core, cause);
> > - }
> > + return num_packets;
> > }
> >
> > static bool
> > @@ -2423,32 +2424,41 @@ e1000e_set_dbal(E1000ECore *core, int index, uint32_t val)
> > static void
> > e1000e_set_tctl(E1000ECore *core, int index, uint32_t val)
> > {
> > - E1000E_TxRing txr;
> > core->mac[index] = val;
> >
> > if (core->mac[TARC0] & E1000_TARC_ENABLE) {
> > - e1000e_tx_ring_init(core, &txr, 0);
> > - e1000e_start_xmit(core, &txr);
> > + if (core->tx[0].tx_waiting) {
> > + return;
> > + }
> > + core->tx[0].tx_waiting = 1;
> > + if (!core->vm_running) {
> > + return;
> > + }
> > + qemu_bh_schedule(core->tx[0].tx_bh);
> > }
> >
> > if (core->mac[TARC1] & E1000_TARC_ENABLE) {
> > - e1000e_tx_ring_init(core, &txr, 1);
> > - e1000e_start_xmit(core, &txr);
> > + if (core->tx[1].tx_waiting) {
> > + return;
> > + }
> > + core->tx[1].tx_waiting = 1;
> > + if (!core->vm_running) {
> > + return;
> > + }
> > + qemu_bh_schedule(core->tx[1].tx_bh);
> > }
> > }
> >
> > static void
> > e1000e_set_tdt(E1000ECore *core, int index, uint32_t val)
> > {
> > - E1000E_TxRing txr;
> > int qidx = e1000e_mq_queue_idx(TDT, index);
> > uint32_t tarc_reg = (qidx == 0) ? TARC0 : TARC1;
> >
> > core->mac[index] = val & 0xffff;
> >
> > if (core->mac[tarc_reg] & E1000_TARC_ENABLE) {
> > - e1000e_tx_ring_init(core, &txr, qidx);
> > - e1000e_start_xmit(core, &txr);
> > + qemu_bh_schedule(core->tx[qidx].tx_bh);
> > }
> > }
> >
> > @@ -3315,10 +3325,56 @@ e1000e_vm_state_change(void *opaque, int running, RunState state)
> > trace_e1000e_vm_state_running();
> > e1000e_intrmgr_resume(core);
> > e1000e_autoneg_resume(core);
> > + core->vm_running = 1;
> > +
> > + for (int i = 0; i < E1000E_NUM_QUEUES; i++) {
> > + qemu_bh_schedule(core->tx[i].tx_bh);
>
>
> I guess the reason for the unconditional scheduling of bh is to make
> sure tx work after live migration since we don't migrate tx_waiting.
>
> If yes, better add a comment here.
Ok will do in next revision.
And do we need to clear tx_waiting here?
I think there is no need as the tx_bh handler will do this.
>
>
> > + }
> > +
> > } else {
> > trace_e1000e_vm_state_stopped();
> > +
> > + for (int i = 0; i < E1000E_NUM_QUEUES; i++) {
> > + qemu_bh_cancel(core->tx[i].tx_bh);
> > + }
> > +
> > e1000e_autoneg_pause(core);
> > e1000e_intrmgr_pause(core);
> > + core->vm_running = 0;
> > + }
> > +}
> > +
> > +
> > +static void e1000e_core_tx_bh(void *opaque)
> > +{
> > + struct e1000e_tx *q = opaque;
> > + E1000ECore *core = q->core;
> > + int32_t ret;
> > +
> > + if (!core->vm_running) {
> > + assert(q->tx_waiting);
> > + return;
> > + }
> > +
> > + q->tx_waiting = 0;
> > +
> > + if (!(core->mac[TCTL] & E1000_TCTL_EN)) {
> > + trace_e1000e_tx_disabled();
> > + return;
> > + }
> > +
> > + q->cause = E1000_ICS_TXQE;
> > + q->ide = false;
> > +
> > + ret = e1000e_start_xmit(q);
> > + if (ret >= core->tx_burst) {
> > + qemu_bh_schedule(q->tx_bh);
> > + q->tx_waiting = 1;
> > + return;
> > + }
> > +
> > + if (!q->ide || !e1000e_intrmgr_delay_tx_causes(core, &q->cause)) {
> > + e1000e_set_interrupt_cause(core, q->cause);
>
>
> So I think this will cause some delay of the interrupt delivering.
Exactly. If a tx burst occurs, the interrupt won't be triggered in the
first tx_bh handler.
Since this gives the vCPU more time, maybe this is acceptable?
It
> looks to be it's better to leave the set ics in e1000e_start_xmit().
I think letting e1000e_start_xmit just send packets is cleaner.
Leaving the ICS setting in it will not improve performance as far as I can see.
What is your reason for wanting to leave it in e1000e_start_xmit?
>
>
> > }
> > }
> >
> > @@ -3334,12 +3390,15 @@ e1000e_core_pci_realize(E1000ECore *core,
> > e1000e_autoneg_timer, core);
> > e1000e_intrmgr_pci_realize(core);
> >
> > + core->vm_running = runstate_is_running();
> > core->vmstate =
> > qemu_add_vm_change_state_handler(e1000e_vm_state_change, core);
> >
> > for (i = 0; i < E1000E_NUM_QUEUES; i++) {
> > net_tx_pkt_init(&core->tx[i].tx_pkt, core->owner,
> > E1000E_MAX_TX_FRAGS, core->has_vnet);
> > + core->tx[i].core = core;
> > + core->tx[i].tx_bh = qemu_bh_new(e1000e_core_tx_bh, &core->tx[i]);
> > }
> >
> > net_rx_pkt_init(&core->rx_pkt, core->has_vnet);
> > @@ -3367,6 +3426,9 @@ e1000e_core_pci_uninit(E1000ECore *core)
> > for (i = 0; i < E1000E_NUM_QUEUES; i++) {
> > net_tx_pkt_reset(core->tx[i].tx_pkt);
> > net_tx_pkt_uninit(core->tx[i].tx_pkt);
> > + qemu_bh_cancel(core->tx[i].tx_bh);
> > + qemu_bh_delete(core->tx[i].tx_bh);
> > + core->tx[i].tx_bh = NULL;
> > }
> >
> > net_rx_pkt_uninit(core->rx_pkt);
> > diff --git a/hw/net/e1000e_core.h b/hw/net/e1000e_core.h
> > index aee32f7e48..0c16dce3a6 100644
> > --- a/hw/net/e1000e_core.h
> > +++ b/hw/net/e1000e_core.h
> > @@ -77,10 +77,18 @@ struct E1000Core {
> > unsigned char sum_needed;
> > bool cptse;
> > struct NetTxPkt *tx_pkt;
> > + QEMUBH *tx_bh;
> > + uint32_t tx_waiting;
> > + uint32_t cause;
> > + bool ide;
> > + E1000ECore *core;
> > } tx[E1000E_NUM_QUEUES];
> >
> > struct NetRxPkt *rx_pkt;
> >
> > + int32_t tx_burst;
> > +
> > + bool vm_running;
> > bool has_vnet;
> > int max_queue_num;
> >
>
>
> Do we need to cancel the bh and reset tx_waiting in e1000e_core_reset()?
I think we need to. But why doesn't virtio-net do this? Maybe it is also
missing this?
Thanks,
Li Qiang
>
> Thanks
>
^ permalink raw reply [flat|nested] 6+ messages in thread
* Re: [PATCH v2] e1000e: using bottom half to send packets
2020-07-21 4:33 ` Li Qiang
@ 2020-07-21 5:30 ` Jason Wang
2020-07-21 5:59 ` Li Qiang
0 siblings, 1 reply; 6+ messages in thread
From: Jason Wang @ 2020-07-21 5:30 UTC (permalink / raw
To: Li Qiang
Cc: Dmitry Fleytman, Li Qiang, Qemu Developers, P J P,
Alexander Bulekov, Paolo Bonzini
On 2020/7/21 下午12:33, Li Qiang wrote:
> Jason Wang <jasowang@redhat.com> 于2020年7月21日周二 上午10:03写道:
>>
>> On 2020/7/21 上午12:45, Li Qiang wrote:
>>> Alexander Bulekov reported a UAF bug related e1000e packets send.
>>>
>>> -->https://bugs.launchpad.net/qemu/+bug/1886362
>>>
>>> This is because the guest trigger a e1000e packet send and set the
>>> data's address to e1000e's MMIO address. So when the e1000e do DMA
>>> it will write the MMIO again and trigger re-entrancy and finally
>>> causes this UAF.
>>>
>>> Paolo suggested to use a bottom half whenever MMIO is doing complicate
>>> things in here:
>>> -->https://lists.nongnu.org/archive/html/qemu-devel/2020-07/msg03342.html
>>>
>>> Reference here:
>>> 'The easiest solution is to delay processing of descriptors to a bottom
>>> half whenever MMIO is doing something complicated. This is also better
>>> for latency because it will free the vCPU thread more quickly and leave
>>> the work to the I/O thread.'
>>>
>>> This patch fixes this UAF.
>>>
>>> Signed-off-by: Li Qiang <liq3ea@163.com>
>>> ---
>>> Change since v1:
>>> Per Jason's review here:
>>> -- https://lists.nongnu.org/archive/html/qemu-devel/2020-07/msg05368.html
>>> 1. Cancel and schedule the tx bh when VM is stopped or resume
>>> 2. Add a tx_burst for e1000e configuration to throttle the bh execution
>>> 3. Add a tx_waiting to record whether the bh is pending or not
>>> Don't use BQL in the tx_bh handler as when tx_bh is executed, the BQL is
>>> acquired.
>>>
>>> hw/net/e1000e.c | 6 +++
>>> hw/net/e1000e_core.c | 106 ++++++++++++++++++++++++++++++++++---------
>>> hw/net/e1000e_core.h | 8 ++++
>>> 3 files changed, 98 insertions(+), 22 deletions(-)
>>>
>>> diff --git a/hw/net/e1000e.c b/hw/net/e1000e.c
>>> index fda34518c9..24e35a78bf 100644
>>> --- a/hw/net/e1000e.c
>>> +++ b/hw/net/e1000e.c
>>> @@ -77,10 +77,14 @@ typedef struct E1000EState {
>>>
>>> bool disable_vnet;
>>>
>>> + int32_t tx_burst;
>>> +
>>> E1000ECore core;
>>>
>>> } E1000EState;
>>>
>>> +#define TX_BURST 256
>>> +
>>> #define E1000E_MMIO_IDX 0
>>> #define E1000E_FLASH_IDX 1
>>> #define E1000E_IO_IDX 2
>>> @@ -263,6 +267,7 @@ static void e1000e_core_realize(E1000EState *s)
>>> {
>>> s->core.owner = &s->parent_obj;
>>> s->core.owner_nic = s->nic;
>>> + s->core.tx_burst = s->tx_burst;
>>> }
>>>
>>> static void
>>> @@ -665,6 +670,7 @@ static Property e1000e_properties[] = {
>>> e1000e_prop_subsys_ven, uint16_t),
>>> DEFINE_PROP_SIGNED("subsys", E1000EState, subsys, 0,
>>> e1000e_prop_subsys, uint16_t),
>>> + DEFINE_PROP_INT32("x-txburst", E1000EState, tx_burst, TX_BURST),
>>> DEFINE_PROP_END_OF_LIST(),
>>> };
>>>
>>> diff --git a/hw/net/e1000e_core.c b/hw/net/e1000e_core.c
>>> index bcd186cac5..0a38a50cca 100644
>>> --- a/hw/net/e1000e_core.c
>>> +++ b/hw/net/e1000e_core.c
>>> @@ -909,19 +909,18 @@ e1000e_rx_ring_init(E1000ECore *core, E1000E_RxRing *rxr, int idx)
>>> rxr->i = &i[idx];
>>> }
>>>
>>> -static void
>>> -e1000e_start_xmit(E1000ECore *core, const E1000E_TxRing *txr)
>>> +static int32_t
>>> +e1000e_start_xmit(struct e1000e_tx *q)
>>> {
>>> + E1000ECore *core = q->core;
>>> dma_addr_t base;
>>> struct e1000_tx_desc desc;
>>> - bool ide = false;
>>> - const E1000E_RingInfo *txi = txr->i;
>>> - uint32_t cause = E1000_ICS_TXQE;
>>> + const E1000E_RingInfo *txi;
>>> + E1000E_TxRing txr;
>>> + int32_t num_packets = 0;
>>>
>>> - if (!(core->mac[TCTL] & E1000_TCTL_EN)) {
>>> - trace_e1000e_tx_disabled();
>>> - return;
>>> - }
>>> + e1000e_tx_ring_init(core, &txr, q - &core->tx[0]);
>>> + txi = txr.i;
>>>
>>> while (!e1000e_ring_empty(core, txi)) {
>>> base = e1000e_ring_head_descr(core, txi);
>>> @@ -931,15 +930,17 @@ e1000e_start_xmit(E1000ECore *core, const E1000E_TxRing *txr)
>>> trace_e1000e_tx_descr((void *)(intptr_t)desc.buffer_addr,
>>> desc.lower.data, desc.upper.data);
>>>
>>> - e1000e_process_tx_desc(core, txr->tx, &desc, txi->idx);
>>> - cause |= e1000e_txdesc_writeback(core, base, &desc, &ide, txi->idx);
>>> + e1000e_process_tx_desc(core, txr.tx, &desc, txi->idx);
>>> + q->cause |= e1000e_txdesc_writeback(core, base, &desc,
>>> + &q->ide, txi->idx);
>>>
>>> e1000e_ring_advance(core, txi, 1);
>>> + if (++num_packets >= core->tx_burst) {
>>> + break;
>>> + }
>>> }
>>>
>>> - if (!ide || !e1000e_intrmgr_delay_tx_causes(core, &cause)) {
>>> - e1000e_set_interrupt_cause(core, cause);
>>> - }
>>> + return num_packets;
>>> }
>>>
>>> static bool
>>> @@ -2423,32 +2424,41 @@ e1000e_set_dbal(E1000ECore *core, int index, uint32_t val)
>>> static void
>>> e1000e_set_tctl(E1000ECore *core, int index, uint32_t val)
>>> {
>>> - E1000E_TxRing txr;
>>> core->mac[index] = val;
>>>
>>> if (core->mac[TARC0] & E1000_TARC_ENABLE) {
>>> - e1000e_tx_ring_init(core, &txr, 0);
>>> - e1000e_start_xmit(core, &txr);
>>> + if (core->tx[0].tx_waiting) {
>>> + return;
>>> + }
>>> + core->tx[0].tx_waiting = 1;
>>> + if (!core->vm_running) {
>>> + return;
>>> + }
>>> + qemu_bh_schedule(core->tx[0].tx_bh);
>>> }
>>>
>>> if (core->mac[TARC1] & E1000_TARC_ENABLE) {
>>> - e1000e_tx_ring_init(core, &txr, 1);
>>> - e1000e_start_xmit(core, &txr);
>>> + if (core->tx[1].tx_waiting) {
>>> + return;
>>> + }
>>> + core->tx[1].tx_waiting = 1;
>>> + if (!core->vm_running) {
>>> + return;
>>> + }
>>> + qemu_bh_schedule(core->tx[1].tx_bh);
>>> }
>>> }
>>>
>>> static void
>>> e1000e_set_tdt(E1000ECore *core, int index, uint32_t val)
>>> {
>>> - E1000E_TxRing txr;
>>> int qidx = e1000e_mq_queue_idx(TDT, index);
>>> uint32_t tarc_reg = (qidx == 0) ? TARC0 : TARC1;
>>>
>>> core->mac[index] = val & 0xffff;
>>>
>>> if (core->mac[tarc_reg] & E1000_TARC_ENABLE) {
>>> - e1000e_tx_ring_init(core, &txr, qidx);
>>> - e1000e_start_xmit(core, &txr);
>>> + qemu_bh_schedule(core->tx[qidx].tx_bh);
>>> }
>>> }
>>>
>>> @@ -3315,10 +3325,56 @@ e1000e_vm_state_change(void *opaque, int running, RunState state)
>>> trace_e1000e_vm_state_running();
>>> e1000e_intrmgr_resume(core);
>>> e1000e_autoneg_resume(core);
>>> + core->vm_running = 1;
>>> +
>>> + for (int i = 0; i < E1000E_NUM_QUEUES; i++) {
>>> + qemu_bh_schedule(core->tx[i].tx_bh);
>>
>> I guess the reason for the unconditional scheduling of bh is to make
>> sure tx work after live migration since we don't migrate tx_waiting.
>>
>> If yes, better add a comment here.
> Ok will do in next revision.
>
> And do we need to clear tx_waiting here?
>
> I think there is no need as the tx_bh handler will do this.
Yes.
>
>>
>>> + }
>>> +
>>> } else {
>>> trace_e1000e_vm_state_stopped();
>>> +
>>> + for (int i = 0; i < E1000E_NUM_QUEUES; i++) {
>>> + qemu_bh_cancel(core->tx[i].tx_bh);
>>> + }
>>> +
>>> e1000e_autoneg_pause(core);
>>> e1000e_intrmgr_pause(core);
>>> + core->vm_running = 0;
>>> + }
>>> +}
>>> +
>>> +
>>> +static void e1000e_core_tx_bh(void *opaque)
>>> +{
>>> + struct e1000e_tx *q = opaque;
>>> + E1000ECore *core = q->core;
>>> + int32_t ret;
>>> +
>>> + if (!core->vm_running) {
>>> + assert(q->tx_waiting);
>>> + return;
>>> + }
>>> +
>>> + q->tx_waiting = 0;
>>> +
>>> + if (!(core->mac[TCTL] & E1000_TCTL_EN)) {
>>> + trace_e1000e_tx_disabled();
>>> + return;
>>> + }
>>> +
>>> + q->cause = E1000_ICS_TXQE;
>>> + q->ide = false;
>>> +
>>> + ret = e1000e_start_xmit(q);
>>> + if (ret >= core->tx_burst) {
>>> + qemu_bh_schedule(q->tx_bh);
>>> + q->tx_waiting = 1;
>>> + return;
>>> + }
>>> +
>>> + if (!q->ide || !e1000e_intrmgr_delay_tx_causes(core, &q->cause)) {
>>> + e1000e_set_interrupt_cause(core, q->cause);
>>
>> So I think this will cause some delay of the interrupt delivering.
> Exactly. if tx burst occurs, the interrupt won't be triggered in the
> first tx_bh handler.
> As we give the vcpu more time and this may acceptable?
I think not; e1000e has its own interrupt delay mechanism, so let's keep it
working as much as possible.
>
> It
>> looks to be it's better to leave the set ics in e1000e_start_xmit().
> I think let e1000e_start_xmit just send packets is more clean.
> Leave setting ics in it will not improve the performance as far as I can see.
> What's your idea here to leave it in e1000e_start_xmit?
I just worry about delaying the interrupt if you do it in
e1000e_core_tx_bh(); I'd rather let e1000e_start_xmit() return the number
of packets transmitted and keep most of its code untouched.
>
>>
>>> }
>>> }
>>>
>>> @@ -3334,12 +3390,15 @@ e1000e_core_pci_realize(E1000ECore *core,
>>> e1000e_autoneg_timer, core);
>>> e1000e_intrmgr_pci_realize(core);
>>>
>>> + core->vm_running = runstate_is_running();
>>> core->vmstate =
>>> qemu_add_vm_change_state_handler(e1000e_vm_state_change, core);
>>>
>>> for (i = 0; i < E1000E_NUM_QUEUES; i++) {
>>> net_tx_pkt_init(&core->tx[i].tx_pkt, core->owner,
>>> E1000E_MAX_TX_FRAGS, core->has_vnet);
>>> + core->tx[i].core = core;
>>> + core->tx[i].tx_bh = qemu_bh_new(e1000e_core_tx_bh, &core->tx[i]);
>>> }
>>>
>>> net_rx_pkt_init(&core->rx_pkt, core->has_vnet);
>>> @@ -3367,6 +3426,9 @@ e1000e_core_pci_uninit(E1000ECore *core)
>>> for (i = 0; i < E1000E_NUM_QUEUES; i++) {
>>> net_tx_pkt_reset(core->tx[i].tx_pkt);
>>> net_tx_pkt_uninit(core->tx[i].tx_pkt);
>>> + qemu_bh_cancel(core->tx[i].tx_bh);
>>> + qemu_bh_delete(core->tx[i].tx_bh);
>>> + core->tx[i].tx_bh = NULL;
>>> }
>>>
>>> net_rx_pkt_uninit(core->rx_pkt);
>>> diff --git a/hw/net/e1000e_core.h b/hw/net/e1000e_core.h
>>> index aee32f7e48..0c16dce3a6 100644
>>> --- a/hw/net/e1000e_core.h
>>> +++ b/hw/net/e1000e_core.h
>>> @@ -77,10 +77,18 @@ struct E1000Core {
>>> unsigned char sum_needed;
>>> bool cptse;
>>> struct NetTxPkt *tx_pkt;
>>> + QEMUBH *tx_bh;
>>> + uint32_t tx_waiting;
>>> + uint32_t cause;
>>> + bool ide;
>>> + E1000ECore *core;
>>> } tx[E1000E_NUM_QUEUES];
>>>
>>> struct NetRxPkt *rx_pkt;
>>>
>>> + int32_t tx_burst;
>>> +
>>> + bool vm_running;
>>> bool has_vnet;
>>> int max_queue_num;
>>>
>>
>> Do we need to cancel the bh and reset tx_waiting in e1000e_core_reset()?
> I think we need. But why the virtio net doesn't do this? Maybe it also
> lack of this?
Ok, so I think the current code is probably OK.
virtio-net checks DRIVER_OK and the code here checks TCTL_EN.
Thanks
>
> Thanks,
> Li Qiang
>
>> Thanks
>>
^ permalink raw reply [flat|nested] 6+ messages in thread
* Re: [PATCH v2] e1000e: using bottom half to send packets
2020-07-21 5:30 ` Jason Wang
@ 2020-07-21 5:59 ` Li Qiang
2020-07-21 6:05 ` Jason Wang
0 siblings, 1 reply; 6+ messages in thread
From: Li Qiang @ 2020-07-21 5:59 UTC (permalink / raw
To: Jason Wang
Cc: Dmitry Fleytman, Li Qiang, Qemu Developers, P J P,
Alexander Bulekov, Paolo Bonzini
Jason Wang <jasowang@redhat.com> 于2020年7月21日周二 下午1:30写道:
>
>
> On 2020/7/21 下午12:33, Li Qiang wrote:
> > Jason Wang <jasowang@redhat.com> 于2020年7月21日周二 上午10:03写道:
> >>
> >> On 2020/7/21 上午12:45, Li Qiang wrote:
> >>> Alexander Bulekov reported a UAF bug related e1000e packets send.
> >>>
> >>> -->https://bugs.launchpad.net/qemu/+bug/1886362
> >>>
> >>> This is because the guest trigger a e1000e packet send and set the
> >>> data's address to e1000e's MMIO address. So when the e1000e do DMA
> >>> it will write the MMIO again and trigger re-entrancy and finally
> >>> causes this UAF.
> >>>
> >>> Paolo suggested to use a bottom half whenever MMIO is doing complicate
> >>> things in here:
> >>> -->https://lists.nongnu.org/archive/html/qemu-devel/2020-07/msg03342.html
> >>>
> >>> Reference here:
> >>> 'The easiest solution is to delay processing of descriptors to a bottom
> >>> half whenever MMIO is doing something complicated. This is also better
> >>> for latency because it will free the vCPU thread more quickly and leave
> >>> the work to the I/O thread.'
> >>>
> >>> This patch fixes this UAF.
> >>>
> >>> Signed-off-by: Li Qiang <liq3ea@163.com>
> >>> ---
> >>> Change since v1:
> >>> Per Jason's review here:
> >>> -- https://lists.nongnu.org/archive/html/qemu-devel/2020-07/msg05368.html
> >>> 1. Cancel and schedule the tx bh when VM is stopped or resume
> >>> 2. Add a tx_burst for e1000e configuration to throttle the bh execution
> >>> 3. Add a tx_waiting to record whether the bh is pending or not
> >>> Don't use BQL in the tx_bh handler as when tx_bh is executed, the BQL is
> >>> acquired.
> >>>
> >>> hw/net/e1000e.c | 6 +++
> >>> hw/net/e1000e_core.c | 106 ++++++++++++++++++++++++++++++++++---------
> >>> hw/net/e1000e_core.h | 8 ++++
> >>> 3 files changed, 98 insertions(+), 22 deletions(-)
> >>>
> >>> diff --git a/hw/net/e1000e.c b/hw/net/e1000e.c
> >>> index fda34518c9..24e35a78bf 100644
> >>> --- a/hw/net/e1000e.c
> >>> +++ b/hw/net/e1000e.c
> >>> @@ -77,10 +77,14 @@ typedef struct E1000EState {
> >>>
> >>> bool disable_vnet;
> >>>
> >>> + int32_t tx_burst;
> >>> +
> >>> E1000ECore core;
> >>>
> >>> } E1000EState;
> >>>
> >>> +#define TX_BURST 256
> >>> +
> >>> #define E1000E_MMIO_IDX 0
> >>> #define E1000E_FLASH_IDX 1
> >>> #define E1000E_IO_IDX 2
> >>> @@ -263,6 +267,7 @@ static void e1000e_core_realize(E1000EState *s)
> >>> {
> >>> s->core.owner = &s->parent_obj;
> >>> s->core.owner_nic = s->nic;
> >>> + s->core.tx_burst = s->tx_burst;
> >>> }
> >>>
> >>> static void
> >>> @@ -665,6 +670,7 @@ static Property e1000e_properties[] = {
> >>> e1000e_prop_subsys_ven, uint16_t),
> >>> DEFINE_PROP_SIGNED("subsys", E1000EState, subsys, 0,
> >>> e1000e_prop_subsys, uint16_t),
> >>> + DEFINE_PROP_INT32("x-txburst", E1000EState, tx_burst, TX_BURST),
> >>> DEFINE_PROP_END_OF_LIST(),
> >>> };
> >>>
> >>> diff --git a/hw/net/e1000e_core.c b/hw/net/e1000e_core.c
> >>> index bcd186cac5..0a38a50cca 100644
> >>> --- a/hw/net/e1000e_core.c
> >>> +++ b/hw/net/e1000e_core.c
> >>> @@ -909,19 +909,18 @@ e1000e_rx_ring_init(E1000ECore *core, E1000E_RxRing *rxr, int idx)
> >>> rxr->i = &i[idx];
> >>> }
> >>>
> >>> -static void
> >>> -e1000e_start_xmit(E1000ECore *core, const E1000E_TxRing *txr)
> >>> +static int32_t
> >>> +e1000e_start_xmit(struct e1000e_tx *q)
> >>> {
> >>> + E1000ECore *core = q->core;
> >>> dma_addr_t base;
> >>> struct e1000_tx_desc desc;
> >>> - bool ide = false;
> >>> - const E1000E_RingInfo *txi = txr->i;
> >>> - uint32_t cause = E1000_ICS_TXQE;
> >>> + const E1000E_RingInfo *txi;
> >>> + E1000E_TxRing txr;
> >>> + int32_t num_packets = 0;
> >>>
> >>> - if (!(core->mac[TCTL] & E1000_TCTL_EN)) {
> >>> - trace_e1000e_tx_disabled();
> >>> - return;
> >>> - }
> >>> + e1000e_tx_ring_init(core, &txr, q - &core->tx[0]);
> >>> + txi = txr.i;
> >>>
> >>> while (!e1000e_ring_empty(core, txi)) {
> >>> base = e1000e_ring_head_descr(core, txi);
> >>> @@ -931,15 +930,17 @@ e1000e_start_xmit(E1000ECore *core, const E1000E_TxRing *txr)
> >>> trace_e1000e_tx_descr((void *)(intptr_t)desc.buffer_addr,
> >>> desc.lower.data, desc.upper.data);
> >>>
> >>> - e1000e_process_tx_desc(core, txr->tx, &desc, txi->idx);
> >>> - cause |= e1000e_txdesc_writeback(core, base, &desc, &ide, txi->idx);
> >>> + e1000e_process_tx_desc(core, txr.tx, &desc, txi->idx);
> >>> + q->cause |= e1000e_txdesc_writeback(core, base, &desc,
> >>> + &q->ide, txi->idx);
> >>>
> >>> e1000e_ring_advance(core, txi, 1);
> >>> + if (++num_packets >= core->tx_burst) {
> >>> + break;
> >>> + }
> >>> }
> >>>
> >>> - if (!ide || !e1000e_intrmgr_delay_tx_causes(core, &cause)) {
> >>> - e1000e_set_interrupt_cause(core, cause);
> >>> - }
> >>> + return num_packets;
> >>> }
> >>>
> >>> static bool
> >>> @@ -2423,32 +2424,41 @@ e1000e_set_dbal(E1000ECore *core, int index, uint32_t val)
> >>> static void
> >>> e1000e_set_tctl(E1000ECore *core, int index, uint32_t val)
> >>> {
> >>> - E1000E_TxRing txr;
> >>> core->mac[index] = val;
> >>>
> >>> if (core->mac[TARC0] & E1000_TARC_ENABLE) {
> >>> - e1000e_tx_ring_init(core, &txr, 0);
> >>> - e1000e_start_xmit(core, &txr);
> >>> + if (core->tx[0].tx_waiting) {
> >>> + return;
> >>> + }
> >>> + core->tx[0].tx_waiting = 1;
> >>> + if (!core->vm_running) {
> >>> + return;
> >>> + }
> >>> + qemu_bh_schedule(core->tx[0].tx_bh);
> >>> }
> >>>
> >>> if (core->mac[TARC1] & E1000_TARC_ENABLE) {
> >>> - e1000e_tx_ring_init(core, &txr, 1);
> >>> - e1000e_start_xmit(core, &txr);
> >>> + if (core->tx[1].tx_waiting) {
> >>> + return;
> >>> + }
> >>> + core->tx[1].tx_waiting = 1;
> >>> + if (!core->vm_running) {
> >>> + return;
> >>> + }
> >>> + qemu_bh_schedule(core->tx[1].tx_bh);
> >>> }
> >>> }
> >>>
> >>> static void
> >>> e1000e_set_tdt(E1000ECore *core, int index, uint32_t val)
> >>> {
> >>> - E1000E_TxRing txr;
> >>> int qidx = e1000e_mq_queue_idx(TDT, index);
> >>> uint32_t tarc_reg = (qidx == 0) ? TARC0 : TARC1;
> >>>
> >>> core->mac[index] = val & 0xffff;
> >>>
> >>> if (core->mac[tarc_reg] & E1000_TARC_ENABLE) {
> >>> - e1000e_tx_ring_init(core, &txr, qidx);
> >>> - e1000e_start_xmit(core, &txr);
> >>> + qemu_bh_schedule(core->tx[qidx].tx_bh);
> >>> }
> >>> }
> >>>
> >>> @@ -3315,10 +3325,56 @@ e1000e_vm_state_change(void *opaque, int running, RunState state)
> >>> trace_e1000e_vm_state_running();
> >>> e1000e_intrmgr_resume(core);
> >>> e1000e_autoneg_resume(core);
> >>> + core->vm_running = 1;
> >>> +
> >>> + for (int i = 0; i < E1000E_NUM_QUEUES; i++) {
> >>> + qemu_bh_schedule(core->tx[i].tx_bh);
> >>
> >> I guess the reason for the unconditional scheduling of bh is to make
> >> sure tx work after live migration since we don't migrate tx_waiting.
> >>
> >> If yes, better add a comment here.
> > Ok will do in next revision.
> >
> > And do we need to clear tx_waiting here?
> >
> > I think there is no need as the tx_bh handler will do this.
>
>
> Yes.
>
>
> >
> >>
> >>> + }
> >>> +
> >>> } else {
> >>> trace_e1000e_vm_state_stopped();
> >>> +
> >>> + for (int i = 0; i < E1000E_NUM_QUEUES; i++) {
> >>> + qemu_bh_cancel(core->tx[i].tx_bh);
> >>> + }
> >>> +
> >>> e1000e_autoneg_pause(core);
> >>> e1000e_intrmgr_pause(core);
> >>> + core->vm_running = 0;
> >>> + }
> >>> +}
> >>> +
> >>> +
> >>> +static void e1000e_core_tx_bh(void *opaque)
> >>> +{
> >>> + struct e1000e_tx *q = opaque;
> >>> + E1000ECore *core = q->core;
> >>> + int32_t ret;
> >>> +
> >>> + if (!core->vm_running) {
> >>> + assert(q->tx_waiting);
> >>> + return;
> >>> + }
> >>> +
> >>> + q->tx_waiting = 0;
> >>> +
> >>> + if (!(core->mac[TCTL] & E1000_TCTL_EN)) {
> >>> + trace_e1000e_tx_disabled();
> >>> + return;
> >>> + }
> >>> +
> >>> + q->cause = E1000_ICS_TXQE;
> >>> + q->ide = false;
> >>> +
> >>> + ret = e1000e_start_xmit(q);
> >>> + if (ret >= core->tx_burst) {
> >>> + qemu_bh_schedule(q->tx_bh);
> >>> + q->tx_waiting = 1;
> >>> + return;
> >>> + }
> >>> +
> >>> + if (!q->ide || !e1000e_intrmgr_delay_tx_causes(core, &q->cause)) {
> >>> + e1000e_set_interrupt_cause(core, q->cause);
> >>
> >> So I think this will cause some delay of the interrupt delivering.
> > Exactly. if tx burst occurs, the interrupt won't be triggered in the
> > first tx_bh handler.
> > As we give the vcpu more time and this may acceptable?
>
>
> I think not, e1000e has its own interrupt delay mechanism, let's keep it
> work as much as possible.
>
>
> >
> > It
> >> looks to be it's better to leave the set ics in e1000e_start_xmit().
> > I think let e1000e_start_xmit just send packets is more clean.
> > Leave setting ics in it will not improve the performance as far as I can see.
> > What's your idea here to leave it in e1000e_start_xmit?
>
>
> I just worry about the delay of the interrupt if you do it in
> e1000e_core_tx_bh() and just let e1000e_start_xmit() return the number
> of packets transmitted and keep most of its code untouched.
Sorry, let me make sure I understand your meaning.
By 'the set ics' you mean 'set interrupt cause', as in this code:
+ if (!q->ide || !e1000e_intrmgr_delay_tx_causes(core, &q->cause)) {
+ e1000e_set_interrupt_cause(core, q->cause);
Do you mean we should leave it in 'e1000e_start_xmit'? That is my
understanding.
If so, we should also add the following code to it:
+ if (ret >= core->tx_burst) {
+ qemu_bh_schedule(q->tx_bh);
+ q->tx_waiting = 1;
+ return;
+ }
Then the tx_bh handler would just call 'e1000e_start_xmit'. Right?
>
>
> >
> >>
> >>> }
> >>> }
> >>>
> >>> @@ -3334,12 +3390,15 @@ e1000e_core_pci_realize(E1000ECore *core,
> >>> e1000e_autoneg_timer, core);
> >>> e1000e_intrmgr_pci_realize(core);
> >>>
> >>> + core->vm_running = runstate_is_running();
> >>> core->vmstate =
> >>> qemu_add_vm_change_state_handler(e1000e_vm_state_change, core);
> >>>
> >>> for (i = 0; i < E1000E_NUM_QUEUES; i++) {
> >>> net_tx_pkt_init(&core->tx[i].tx_pkt, core->owner,
> >>> E1000E_MAX_TX_FRAGS, core->has_vnet);
> >>> + core->tx[i].core = core;
> >>> + core->tx[i].tx_bh = qemu_bh_new(e1000e_core_tx_bh, &core->tx[i]);
> >>> }
> >>>
> >>> net_rx_pkt_init(&core->rx_pkt, core->has_vnet);
> >>> @@ -3367,6 +3426,9 @@ e1000e_core_pci_uninit(E1000ECore *core)
> >>> for (i = 0; i < E1000E_NUM_QUEUES; i++) {
> >>> net_tx_pkt_reset(core->tx[i].tx_pkt);
> >>> net_tx_pkt_uninit(core->tx[i].tx_pkt);
> >>> + qemu_bh_cancel(core->tx[i].tx_bh);
> >>> + qemu_bh_delete(core->tx[i].tx_bh);
> >>> + core->tx[i].tx_bh = NULL;
> >>> }
> >>>
> >>> net_rx_pkt_uninit(core->rx_pkt);
> >>> diff --git a/hw/net/e1000e_core.h b/hw/net/e1000e_core.h
> >>> index aee32f7e48..0c16dce3a6 100644
> >>> --- a/hw/net/e1000e_core.h
> >>> +++ b/hw/net/e1000e_core.h
> >>> @@ -77,10 +77,18 @@ struct E1000Core {
> >>> unsigned char sum_needed;
> >>> bool cptse;
> >>> struct NetTxPkt *tx_pkt;
> >>> + QEMUBH *tx_bh;
> >>> + uint32_t tx_waiting;
> >>> + uint32_t cause;
> >>> + bool ide;
> >>> + E1000ECore *core;
> >>> } tx[E1000E_NUM_QUEUES];
> >>>
> >>> struct NetRxPkt *rx_pkt;
> >>>
> >>> + int32_t tx_burst;
> >>> +
> >>> + bool vm_running;
> >>> bool has_vnet;
> >>> int max_queue_num;
> >>>
> >>
> >> Do we need to cancel the bh and reset tx_waiting in e1000e_core_reset()?
> > I think we need. But why the virtio net doesn't do this? Maybe it also
> > lack of this?
>
>
> Ok, so I think the current code is probably OK.
So my next revision will do the following:
1. Add a comment for the tx_bh schedule when the VM resumes.
2. Leave some code in 'e1000e_start_xmit'.
3. Cancel the tx_bh and reset tx_waiting in e1000e_core_reset.
If I have missed or misunderstood anything, please let me know.
Thanks,
Li Qiang
>
> virtio-net check DRIVER_OK and the code here check TCTL_EN.
>
> Thanks
>
>
> >
> > Thanks,
> > Li Qiang
> >
> >> Thanks
> >>
>
^ permalink raw reply [flat|nested] 6+ messages in thread
* Re: [PATCH v2] e1000e: using bottom half to send packets
2020-07-21 5:59 ` Li Qiang
@ 2020-07-21 6:05 ` Jason Wang
0 siblings, 0 replies; 6+ messages in thread
From: Jason Wang @ 2020-07-21 6:05 UTC (permalink / raw
To: Li Qiang
Cc: Dmitry Fleytman, Li Qiang, Qemu Developers, P J P,
Alexander Bulekov, Paolo Bonzini
On 2020/7/21 下午1:59, Li Qiang wrote:
> Jason Wang <jasowang@redhat.com> 于2020年7月21日周二 下午1:30写道:
>>
>> On 2020/7/21 下午12:33, Li Qiang wrote:
>>> Jason Wang <jasowang@redhat.com> 于2020年7月21日周二 上午10:03写道:
>>>> On 2020/7/21 上午12:45, Li Qiang wrote:
>>>>> Alexander Bulekov reported a UAF bug related e1000e packets send.
>>>>>
>>>>> -->https://bugs.launchpad.net/qemu/+bug/1886362
>>>>>
>>>>> This is because the guest trigger a e1000e packet send and set the
>>>>> data's address to e1000e's MMIO address. So when the e1000e do DMA
>>>>> it will write the MMIO again and trigger re-entrancy and finally
>>>>> causes this UAF.
>>>>>
>>>>> Paolo suggested to use a bottom half whenever MMIO is doing complicate
>>>>> things in here:
>>>>> -->https://lists.nongnu.org/archive/html/qemu-devel/2020-07/msg03342.html
>>>>>
>>>>> Reference here:
>>>>> 'The easiest solution is to delay processing of descriptors to a bottom
>>>>> half whenever MMIO is doing something complicated. This is also better
>>>>> for latency because it will free the vCPU thread more quickly and leave
>>>>> the work to the I/O thread.'
>>>>>
>>>>> This patch fixes this UAF.
>>>>>
>>>>> Signed-off-by: Li Qiang <liq3ea@163.com>
>>>>> ---
>>>>> Change since v1:
>>>>> Per Jason's review here:
>>>>> -- https://lists.nongnu.org/archive/html/qemu-devel/2020-07/msg05368.html
>>>>> 1. Cancel and schedule the tx bh when VM is stopped or resume
>>>>> 2. Add a tx_burst for e1000e configuration to throttle the bh execution
>>>>> 3. Add a tx_waiting to record whether the bh is pending or not
>>>>> Don't use BQL in the tx_bh handler as when tx_bh is executed, the BQL is
>>>>> acquired.
>>>>>
>>>>> hw/net/e1000e.c | 6 +++
>>>>> hw/net/e1000e_core.c | 106 ++++++++++++++++++++++++++++++++++---------
>>>>> hw/net/e1000e_core.h | 8 ++++
>>>>> 3 files changed, 98 insertions(+), 22 deletions(-)
>>>>>
>>>>> diff --git a/hw/net/e1000e.c b/hw/net/e1000e.c
>>>>> index fda34518c9..24e35a78bf 100644
>>>>> --- a/hw/net/e1000e.c
>>>>> +++ b/hw/net/e1000e.c
>>>>> @@ -77,10 +77,14 @@ typedef struct E1000EState {
>>>>>
>>>>> bool disable_vnet;
>>>>>
>>>>> + int32_t tx_burst;
>>>>> +
>>>>> E1000ECore core;
>>>>>
>>>>> } E1000EState;
>>>>>
>>>>> +#define TX_BURST 256
>>>>> +
>>>>> #define E1000E_MMIO_IDX 0
>>>>> #define E1000E_FLASH_IDX 1
>>>>> #define E1000E_IO_IDX 2
>>>>> @@ -263,6 +267,7 @@ static void e1000e_core_realize(E1000EState *s)
>>>>> {
>>>>> s->core.owner = &s->parent_obj;
>>>>> s->core.owner_nic = s->nic;
>>>>> + s->core.tx_burst = s->tx_burst;
>>>>> }
>>>>>
>>>>> static void
>>>>> @@ -665,6 +670,7 @@ static Property e1000e_properties[] = {
>>>>> e1000e_prop_subsys_ven, uint16_t),
>>>>> DEFINE_PROP_SIGNED("subsys", E1000EState, subsys, 0,
>>>>> e1000e_prop_subsys, uint16_t),
>>>>> + DEFINE_PROP_INT32("x-txburst", E1000EState, tx_burst, TX_BURST),
>>>>> DEFINE_PROP_END_OF_LIST(),
>>>>> };
>>>>>
>>>>> diff --git a/hw/net/e1000e_core.c b/hw/net/e1000e_core.c
>>>>> index bcd186cac5..0a38a50cca 100644
>>>>> --- a/hw/net/e1000e_core.c
>>>>> +++ b/hw/net/e1000e_core.c
>>>>> @@ -909,19 +909,18 @@ e1000e_rx_ring_init(E1000ECore *core, E1000E_RxRing *rxr, int idx)
>>>>> rxr->i = &i[idx];
>>>>> }
>>>>>
>>>>> -static void
>>>>> -e1000e_start_xmit(E1000ECore *core, const E1000E_TxRing *txr)
>>>>> +static int32_t
>>>>> +e1000e_start_xmit(struct e1000e_tx *q)
>>>>> {
>>>>> + E1000ECore *core = q->core;
>>>>> dma_addr_t base;
>>>>> struct e1000_tx_desc desc;
>>>>> - bool ide = false;
>>>>> - const E1000E_RingInfo *txi = txr->i;
>>>>> - uint32_t cause = E1000_ICS_TXQE;
>>>>> + const E1000E_RingInfo *txi;
>>>>> + E1000E_TxRing txr;
>>>>> + int32_t num_packets = 0;
>>>>>
>>>>> - if (!(core->mac[TCTL] & E1000_TCTL_EN)) {
>>>>> - trace_e1000e_tx_disabled();
>>>>> - return;
>>>>> - }
>>>>> + e1000e_tx_ring_init(core, &txr, q - &core->tx[0]);
>>>>> + txi = txr.i;
>>>>>
>>>>> while (!e1000e_ring_empty(core, txi)) {
>>>>> base = e1000e_ring_head_descr(core, txi);
>>>>> @@ -931,15 +930,17 @@ e1000e_start_xmit(E1000ECore *core, const E1000E_TxRing *txr)
>>>>> trace_e1000e_tx_descr((void *)(intptr_t)desc.buffer_addr,
>>>>> desc.lower.data, desc.upper.data);
>>>>>
>>>>> - e1000e_process_tx_desc(core, txr->tx, &desc, txi->idx);
>>>>> - cause |= e1000e_txdesc_writeback(core, base, &desc, &ide, txi->idx);
>>>>> + e1000e_process_tx_desc(core, txr.tx, &desc, txi->idx);
>>>>> + q->cause |= e1000e_txdesc_writeback(core, base, &desc,
>>>>> + &q->ide, txi->idx);
>>>>>
>>>>> e1000e_ring_advance(core, txi, 1);
>>>>> + if (++num_packets >= core->tx_burst) {
>>>>> + break;
>>>>> + }
>>>>> }
>>>>>
>>>>> - if (!ide || !e1000e_intrmgr_delay_tx_causes(core, &cause)) {
>>>>> - e1000e_set_interrupt_cause(core, cause);
>>>>> - }
>>>>> + return num_packets;
>>>>> }
>>>>>
>>>>> static bool
>>>>> @@ -2423,32 +2424,41 @@ e1000e_set_dbal(E1000ECore *core, int index, uint32_t val)
>>>>> static void
>>>>> e1000e_set_tctl(E1000ECore *core, int index, uint32_t val)
>>>>> {
>>>>> - E1000E_TxRing txr;
>>>>> core->mac[index] = val;
>>>>>
>>>>> if (core->mac[TARC0] & E1000_TARC_ENABLE) {
>>>>> - e1000e_tx_ring_init(core, &txr, 0);
>>>>> - e1000e_start_xmit(core, &txr);
>>>>> + if (core->tx[0].tx_waiting) {
>>>>> + return;
>>>>> + }
>>>>> + core->tx[0].tx_waiting = 1;
>>>>> + if (!core->vm_running) {
>>>>> + return;
>>>>> + }
>>>>> + qemu_bh_schedule(core->tx[0].tx_bh);
>>>>> }
>>>>>
>>>>> if (core->mac[TARC1] & E1000_TARC_ENABLE) {
>>>>> - e1000e_tx_ring_init(core, &txr, 1);
>>>>> - e1000e_start_xmit(core, &txr);
>>>>> + if (core->tx[1].tx_waiting) {
>>>>> + return;
>>>>> + }
>>>>> + core->tx[1].tx_waiting = 1;
>>>>> + if (!core->vm_running) {
>>>>> + return;
>>>>> + }
>>>>> + qemu_bh_schedule(core->tx[1].tx_bh);
>>>>> }
>>>>> }
>>>>>
>>>>> static void
>>>>> e1000e_set_tdt(E1000ECore *core, int index, uint32_t val)
>>>>> {
>>>>> - E1000E_TxRing txr;
>>>>> int qidx = e1000e_mq_queue_idx(TDT, index);
>>>>> uint32_t tarc_reg = (qidx == 0) ? TARC0 : TARC1;
>>>>>
>>>>> core->mac[index] = val & 0xffff;
>>>>>
>>>>> if (core->mac[tarc_reg] & E1000_TARC_ENABLE) {
>>>>> - e1000e_tx_ring_init(core, &txr, qidx);
>>>>> - e1000e_start_xmit(core, &txr);
>>>>> + qemu_bh_schedule(core->tx[qidx].tx_bh);
>>>>> }
>>>>> }
>>>>>
>>>>> @@ -3315,10 +3325,56 @@ e1000e_vm_state_change(void *opaque, int running, RunState state)
>>>>> trace_e1000e_vm_state_running();
>>>>> e1000e_intrmgr_resume(core);
>>>>> e1000e_autoneg_resume(core);
>>>>> + core->vm_running = 1;
>>>>> +
>>>>> + for (int i = 0; i < E1000E_NUM_QUEUES; i++) {
>>>>> + qemu_bh_schedule(core->tx[i].tx_bh);
>>>> I guess the reason for the unconditional scheduling of bh is to make
>>>> sure tx work after live migration since we don't migrate tx_waiting.
>>>>
>>>> If yes, better add a comment here.
>>> Ok will do in next revision.
>>>
>>> And do we need to clear tx_waiting here?
>>>
>>> I think there is no need as the tx_bh handler will do this.
>>
>> Yes.
>>
>>
>>>>> + }
>>>>> +
>>>>> } else {
>>>>> trace_e1000e_vm_state_stopped();
>>>>> +
>>>>> + for (int i = 0; i < E1000E_NUM_QUEUES; i++) {
>>>>> + qemu_bh_cancel(core->tx[i].tx_bh);
>>>>> + }
>>>>> +
>>>>> e1000e_autoneg_pause(core);
>>>>> e1000e_intrmgr_pause(core);
>>>>> + core->vm_running = 0;
>>>>> + }
>>>>> +}
>>>>> +
>>>>> +
>>>>> +static void e1000e_core_tx_bh(void *opaque)
>>>>> +{
>>>>> + struct e1000e_tx *q = opaque;
>>>>> + E1000ECore *core = q->core;
>>>>> + int32_t ret;
>>>>> +
>>>>> + if (!core->vm_running) {
>>>>> + assert(q->tx_waiting);
>>>>> + return;
>>>>> + }
>>>>> +
>>>>> + q->tx_waiting = 0;
>>>>> +
>>>>> + if (!(core->mac[TCTL] & E1000_TCTL_EN)) {
>>>>> + trace_e1000e_tx_disabled();
>>>>> + return;
>>>>> + }
>>>>> +
>>>>> + q->cause = E1000_ICS_TXQE;
>>>>> + q->ide = false;
>>>>> +
>>>>> + ret = e1000e_start_xmit(q);
>>>>> + if (ret >= core->tx_burst) {
>>>>> + qemu_bh_schedule(q->tx_bh);
>>>>> + q->tx_waiting = 1;
>>>>> + return;
>>>>> + }
>>>>> +
>>>>> + if (!q->ide || !e1000e_intrmgr_delay_tx_causes(core, &q->cause)) {
>>>>> + e1000e_set_interrupt_cause(core, q->cause);
>>>> So I think this will cause some delay of the interrupt delivering.
>>> Exactly. if tx burst occurs, the interrupt won't be triggered in the
>>> first tx_bh handler.
>>> As we give the vcpu more time and this may acceptable?
>>
>> I think not, e1000e has its own interrupt delay mechanism, let's keep it
>> work as much as possible.
>>
>>
>>> It
>>>> looks to be it's better to leave the set ics in e1000e_start_xmit().
>>> I think let e1000e_start_xmit just send packets is more clean.
>>> Leave setting ics in it will not improve the performance as far as I can see.
>>> What's your idea here to leave it in e1000e_start_xmit?
>>
>> I just worry about the delay of the interrupt if you do it in
>> e1000e_core_tx_bh() and just let e1000e_start_xmit() return the number
>> of packets transmitted and keep most of its code untouched.
> Sorry here let's make sure I got your meaning.
> 'The set ics' is meaning 'set interrupt cause' , in code:
>
> + if (!q->ide || !e1000e_intrmgr_delay_tx_causes(core, &q->cause)) {
> + e1000e_set_interrupt_cause(core, q->cause);
>
> Do you mean we should leave it in the 'e1000e_start_xmit'? My
> understanding is this.
> if it is this, we should also add following code in it:
>
> + if (ret >= core->tx_burst) {
> + qemu_bh_schedule(q->tx_bh);
> + q->tx_waiting = 1;
> + return;
> + }
>
> Then the tx_bh handler just calls 'e1000e_start_xmit'. Right?
This should work.
>
>
>
>>
>>>>> }
>>>>> }
>>>>>
>>>>> @@ -3334,12 +3390,15 @@ e1000e_core_pci_realize(E1000ECore *core,
>>>>> e1000e_autoneg_timer, core);
>>>>> e1000e_intrmgr_pci_realize(core);
>>>>>
>>>>> + core->vm_running = runstate_is_running();
>>>>> core->vmstate =
>>>>> qemu_add_vm_change_state_handler(e1000e_vm_state_change, core);
>>>>>
>>>>> for (i = 0; i < E1000E_NUM_QUEUES; i++) {
>>>>> net_tx_pkt_init(&core->tx[i].tx_pkt, core->owner,
>>>>> E1000E_MAX_TX_FRAGS, core->has_vnet);
>>>>> + core->tx[i].core = core;
>>>>> + core->tx[i].tx_bh = qemu_bh_new(e1000e_core_tx_bh, &core->tx[i]);
>>>>> }
>>>>>
>>>>> net_rx_pkt_init(&core->rx_pkt, core->has_vnet);
>>>>> @@ -3367,6 +3426,9 @@ e1000e_core_pci_uninit(E1000ECore *core)
>>>>> for (i = 0; i < E1000E_NUM_QUEUES; i++) {
>>>>> net_tx_pkt_reset(core->tx[i].tx_pkt);
>>>>> net_tx_pkt_uninit(core->tx[i].tx_pkt);
>>>>> + qemu_bh_cancel(core->tx[i].tx_bh);
>>>>> + qemu_bh_delete(core->tx[i].tx_bh);
>>>>> + core->tx[i].tx_bh = NULL;
>>>>> }
>>>>>
>>>>> net_rx_pkt_uninit(core->rx_pkt);
>>>>> diff --git a/hw/net/e1000e_core.h b/hw/net/e1000e_core.h
>>>>> index aee32f7e48..0c16dce3a6 100644
>>>>> --- a/hw/net/e1000e_core.h
>>>>> +++ b/hw/net/e1000e_core.h
>>>>> @@ -77,10 +77,18 @@ struct E1000Core {
>>>>> unsigned char sum_needed;
>>>>> bool cptse;
>>>>> struct NetTxPkt *tx_pkt;
>>>>> + QEMUBH *tx_bh;
>>>>> + uint32_t tx_waiting;
>>>>> + uint32_t cause;
>>>>> + bool ide;
>>>>> + E1000ECore *core;
>>>>> } tx[E1000E_NUM_QUEUES];
>>>>>
>>>>> struct NetRxPkt *rx_pkt;
>>>>>
>>>>> + int32_t tx_burst;
>>>>> +
>>>>> + bool vm_running;
>>>>> bool has_vnet;
>>>>> int max_queue_num;
>>>>>
>>>> Do we need to cancel the bh and reset tx_waiting in e1000e_core_reset()?
>>> I think we need to. But why doesn't virtio-net do this? Maybe it is
>>> also missing this?
>>
>> Ok, so I think the current code is probably OK.
> So my next revision will do the following:
> 1. Add a comment for the tx_bh schedule when the VM resumes.
> 2. Leave some code in 'e1000e_start_xmit'
> 3. Cancel the tx_bh and reset tx_waiting in e1000e_core_reset
>
> If I missed or misunderstood anything, please let me know.
Your understanding is correct.
Thanks
>
> Thanks,
> Li Qiang
>
>> virtio-net check DRIVER_OK and the code here check TCTL_EN.
>>
>> Thanks
>>
>>
>>> Thanks,
>>> Li Qiang
>>>
>>>> Thanks
>>>>
^ permalink raw reply [flat|nested] 6+ messages in thread
end of thread, other threads:[~2020-07-21 6:06 UTC | newest]
Thread overview: 6+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2020-07-20 16:45 [PATCH v2] e1000e: using bottom half to send packets Li Qiang
2020-07-21 2:03 ` Jason Wang
2020-07-21 4:33 ` Li Qiang
2020-07-21 5:30 ` Jason Wang
2020-07-21 5:59 ` Li Qiang
2020-07-21 6:05 ` Jason Wang
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.