From 738b54b9b6236f573eed2453c4cbfa77326793e2 Mon Sep 17 00:00:00 2001 From: duanqiangwen Date: Thu, 14 Dec 2023 10:33:37 +0800 Subject: [PATCH 01/12] net: libwx: fix memory leak on free page ifconfig ethx up, will set page->refcount larger than 1, and then ifconfig ethx down, calling __page_frag_cache_drain() to free pages, it is not compatible with page pool. So deleting codes which changing page->refcount. Fixes: 3c47e8ae113a ("net: libwx: Support to receive packets in NAPI") Signed-off-by: duanqiangwen Signed-off-by: David S. Miller --- drivers/net/ethernet/wangxun/libwx/wx_lib.c | 82 ++------------------ drivers/net/ethernet/wangxun/libwx/wx_type.h | 1 - 2 files changed, 6 insertions(+), 77 deletions(-) diff --git a/drivers/net/ethernet/wangxun/libwx/wx_lib.c b/drivers/net/ethernet/wangxun/libwx/wx_lib.c index a5a50b5a881684..347d3cec02a363 100644 --- a/drivers/net/ethernet/wangxun/libwx/wx_lib.c +++ b/drivers/net/ethernet/wangxun/libwx/wx_lib.c @@ -160,60 +160,6 @@ static __le32 wx_test_staterr(union wx_rx_desc *rx_desc, return rx_desc->wb.upper.status_error & cpu_to_le32(stat_err_bits); } -static bool wx_can_reuse_rx_page(struct wx_rx_buffer *rx_buffer, - int rx_buffer_pgcnt) -{ - unsigned int pagecnt_bias = rx_buffer->pagecnt_bias; - struct page *page = rx_buffer->page; - - /* avoid re-using remote and pfmemalloc pages */ - if (!dev_page_is_reusable(page)) - return false; - -#if (PAGE_SIZE < 8192) - /* if we are only owner of page we can reuse it */ - if (unlikely((rx_buffer_pgcnt - pagecnt_bias) > 1)) - return false; -#endif - - /* If we have drained the page fragment pool we need to update - * the pagecnt_bias and page count so that we fully restock the - * number of references the driver holds. - */ - if (unlikely(pagecnt_bias == 1)) { - page_ref_add(page, USHRT_MAX - 1); - rx_buffer->pagecnt_bias = USHRT_MAX; - } - - return true; -} - -/** - * wx_reuse_rx_page - page flip buffer and store it back on the ring - * @rx_ring: rx descriptor ring to store buffers on - * @old_buff: donor buffer to have page reused - * - * Synchronizes page for reuse by the adapter - **/ -static void wx_reuse_rx_page(struct wx_ring *rx_ring, - struct wx_rx_buffer *old_buff) -{ - u16 nta = rx_ring->next_to_alloc; - struct wx_rx_buffer *new_buff; - - new_buff = &rx_ring->rx_buffer_info[nta]; - - /* update, and store next to alloc */ - nta++; - rx_ring->next_to_alloc = (nta < rx_ring->count) ? nta : 0; - - /* transfer page from old buffer to new buffer */ - new_buff->page = old_buff->page; - new_buff->page_dma = old_buff->page_dma; - new_buff->page_offset = old_buff->page_offset; - new_buff->pagecnt_bias = old_buff->pagecnt_bias; -} - static void wx_dma_sync_frag(struct wx_ring *rx_ring, struct wx_rx_buffer *rx_buffer) { @@ -270,8 +216,6 @@ static struct wx_rx_buffer *wx_get_rx_buffer(struct wx_ring *rx_ring, size, DMA_FROM_DEVICE); skip_sync: - rx_buffer->pagecnt_bias--; - return rx_buffer; } @@ -280,19 +224,9 @@ static void wx_put_rx_buffer(struct wx_ring *rx_ring, struct sk_buff *skb, int rx_buffer_pgcnt) { - if (wx_can_reuse_rx_page(rx_buffer, rx_buffer_pgcnt)) { - /* hand second half of page back to the ring */ - wx_reuse_rx_page(rx_ring, rx_buffer); - } else { - if (!IS_ERR(skb) && WX_CB(skb)->dma == rx_buffer->dma) - /* the page has been released from the ring */ - WX_CB(skb)->page_released = true; - else - page_pool_put_full_page(rx_ring->page_pool, rx_buffer->page, false); - - __page_frag_cache_drain(rx_buffer->page, - rx_buffer->pagecnt_bias); - } + if (!IS_ERR(skb) && WX_CB(skb)->dma == rx_buffer->dma) + /* the page has been released from the ring */ + WX_CB(skb)->page_released = true; /* clear contents of rx_buffer */ rx_buffer->page = NULL; @@ -335,11 +269,12 @@ static struct sk_buff *wx_build_skb(struct wx_ring *rx_ring, if (size <= WX_RXBUFFER_256) { memcpy(__skb_put(skb, size), page_addr, ALIGN(size, sizeof(long))); - rx_buffer->pagecnt_bias++; - + page_pool_put_full_page(rx_ring->page_pool, rx_buffer->page, true); return skb; } + skb_mark_for_recycle(skb); + if (!wx_test_staterr(rx_desc, WX_RXD_STAT_EOP)) WX_CB(skb)->dma = rx_buffer->dma; @@ -382,8 +317,6 @@ static bool wx_alloc_mapped_page(struct wx_ring *rx_ring, bi->page_dma = dma; bi->page = page; bi->page_offset = 0; - page_ref_add(page, USHRT_MAX - 1); - bi->pagecnt_bias = USHRT_MAX; return true; } @@ -723,7 +656,6 @@ static int wx_clean_rx_irq(struct wx_q_vector *q_vector, /* exit if we failed to retrieve a buffer */ if (!skb) { rx_ring->rx_stats.alloc_rx_buff_failed++; - rx_buffer->pagecnt_bias++; break; } @@ -2248,8 +2180,6 @@ static void wx_clean_rx_ring(struct wx_ring *rx_ring) /* free resources associated with mapping */ page_pool_put_full_page(rx_ring->page_pool, rx_buffer->page, false); - __page_frag_cache_drain(rx_buffer->page, - rx_buffer->pagecnt_bias); i++; rx_buffer++; diff --git a/drivers/net/ethernet/wangxun/libwx/wx_type.h b/drivers/net/ethernet/wangxun/libwx/wx_type.h index 165e82de772e6c..83f9bb7b3c22a0 100644 --- a/drivers/net/ethernet/wangxun/libwx/wx_type.h +++ b/drivers/net/ethernet/wangxun/libwx/wx_type.h @@ -787,7 +787,6 @@ struct wx_rx_buffer { dma_addr_t page_dma; struct page *page; unsigned int page_offset; - u16 pagecnt_bias; }; struct wx_queue_stats { From 8c97ab5448f2096daba11edf8d18a44e1eb6f31d Mon Sep 17 00:00:00 2001 From: Suman Ghosh Date: Wed, 13 Dec 2023 23:40:44 +0530 Subject: [PATCH 02/12] octeontx2-pf: Fix graceful exit during PFC configuration failure During PFC configuration failure the code was not handling a graceful exit. This patch fixes the same and add proper code for a graceful exit. Fixes: 99c969a83d82 ("octeontx2-pf: Add egress PFC support") Signed-off-by: Suman Ghosh Signed-off-by: David S. Miller --- .../ethernet/marvell/octeontx2/nic/otx2_dcbnl.c | 17 ++++++++++++++--- 1 file changed, 14 insertions(+), 3 deletions(-) diff --git a/drivers/net/ethernet/marvell/octeontx2/nic/otx2_dcbnl.c b/drivers/net/ethernet/marvell/octeontx2/nic/otx2_dcbnl.c index bfddbff7bcdfbf..28fb643d2917f7 100644 --- a/drivers/net/ethernet/marvell/octeontx2/nic/otx2_dcbnl.c +++ b/drivers/net/ethernet/marvell/octeontx2/nic/otx2_dcbnl.c @@ -399,9 +399,10 @@ static int otx2_dcbnl_ieee_getpfc(struct net_device *dev, struct ieee_pfc *pfc) static int otx2_dcbnl_ieee_setpfc(struct net_device *dev, struct ieee_pfc *pfc) { struct otx2_nic *pfvf = netdev_priv(dev); + u8 old_pfc_en; int err; - /* Save PFC configuration to interface */ + old_pfc_en = pfvf->pfc_en; pfvf->pfc_en = pfc->pfc_en; if (pfvf->hw.tx_queues >= NIX_PF_PFC_PRIO_MAX) @@ -411,13 +412,17 @@ static int otx2_dcbnl_ieee_setpfc(struct net_device *dev, struct ieee_pfc *pfc) * supported by the tx queue configuration */ err = otx2_check_pfc_config(pfvf); - if (err) + if (err) { + pfvf->pfc_en = old_pfc_en; return err; + } process_pfc: err = otx2_config_priority_flow_ctrl(pfvf); - if (err) + if (err) { + pfvf->pfc_en = old_pfc_en; return err; + } /* Request Per channel Bpids */ if (pfc->pfc_en) @@ -425,6 +430,12 @@ static int otx2_dcbnl_ieee_setpfc(struct net_device *dev, struct ieee_pfc *pfc) err = otx2_pfc_txschq_update(pfvf); if (err) { + if (pfc->pfc_en) + otx2_nix_config_bp(pfvf, false); + + otx2_pfc_txschq_stop(pfvf); + pfvf->pfc_en = old_pfc_en; + otx2_config_priority_flow_ctrl(pfvf); dev_err(pfvf->dev, "%s failed to update TX schedulers\n", __func__); return err; } From cac23b7d7627915d967ce25436d7aae26e88ed06 Mon Sep 17 00:00:00 2001 From: Shigeru Yoshida Date: Thu, 14 Dec 2023 14:09:22 +0900 Subject: [PATCH 03/12] net: Return error from sk_stream_wait_connect() if sk_wait_event() fails The following NULL pointer dereference issue occurred: BUG: kernel NULL pointer dereference, address: 0000000000000000 <...> RIP: 0010:ccid_hc_tx_send_packet net/dccp/ccid.h:166 [inline] RIP: 0010:dccp_write_xmit+0x49/0x140 net/dccp/output.c:356 <...> Call Trace: dccp_sendmsg+0x642/0x7e0 net/dccp/proto.c:801 inet_sendmsg+0x63/0x90 net/ipv4/af_inet.c:846 sock_sendmsg_nosec net/socket.c:730 [inline] __sock_sendmsg+0x83/0xe0 net/socket.c:745 ____sys_sendmsg+0x443/0x510 net/socket.c:2558 ___sys_sendmsg+0xe5/0x150 net/socket.c:2612 __sys_sendmsg+0xa6/0x120 net/socket.c:2641 __do_sys_sendmsg net/socket.c:2650 [inline] __se_sys_sendmsg net/socket.c:2648 [inline] __x64_sys_sendmsg+0x45/0x50 net/socket.c:2648 do_syscall_x64 arch/x86/entry/common.c:51 [inline] do_syscall_64+0x43/0x110 arch/x86/entry/common.c:82 entry_SYSCALL_64_after_hwframe+0x63/0x6b sk_wait_event() returns an error (-EPIPE) if disconnect() is called on the socket waiting for the event. However, sk_stream_wait_connect() returns success, i.e. zero, even if sk_wait_event() returns -EPIPE, so a function that waits for a connection with sk_stream_wait_connect() may misbehave. In the case of the above DCCP issue, dccp_sendmsg() is waiting for the connection. If disconnect() is called in concurrently, the above issue occurs. This patch fixes the issue by returning error from sk_stream_wait_connect() if sk_wait_event() fails. Fixes: 419ce133ab92 ("tcp: allow again tcp_disconnect() when threads are waiting") Signed-off-by: Shigeru Yoshida Reviewed-by: Kuniyuki Iwashima Reported-by: syzbot+c71bc336c5061153b502@syzkaller.appspotmail.com Reviewed-by: Eric Dumazet Reported-by: syzbot Reported-by: syzkaller Signed-off-by: David S. Miller --- net/core/stream.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/core/stream.c b/net/core/stream.c index 96fbcb9bbb30a5..b16dfa568a2d5b 100644 --- a/net/core/stream.c +++ b/net/core/stream.c @@ -79,7 +79,7 @@ int sk_stream_wait_connect(struct sock *sk, long *timeo_p) remove_wait_queue(sk_sleep(sk), &wait); sk->sk_write_pending--; } while (!done); - return 0; + return done < 0 ? done : 0; } EXPORT_SYMBOL(sk_stream_wait_connect); From 19391a2ca98baa7b80279306cdf7dd43f81fa595 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 14 Dec 2023 11:30:38 +0000 Subject: [PATCH 04/12] net: sched: ife: fix potential use-after-free ife_decode() calls pskb_may_pull() two times, we need to reload ifehdr after the second one, or risk use-after-free as reported by syzbot: BUG: KASAN: slab-use-after-free in __ife_tlv_meta_valid net/ife/ife.c:108 [inline] BUG: KASAN: slab-use-after-free in ife_tlv_meta_decode+0x1d1/0x210 net/ife/ife.c:131 Read of size 2 at addr ffff88802d7300a4 by task syz-executor.5/22323 CPU: 0 PID: 22323 Comm: syz-executor.5 Not tainted 6.7.0-rc3-syzkaller-00804-g074ac38d5b95 #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 11/10/2023 Call Trace: __dump_stack lib/dump_stack.c:88 [inline] dump_stack_lvl+0xd9/0x1b0 lib/dump_stack.c:106 print_address_description mm/kasan/report.c:364 [inline] print_report+0xc4/0x620 mm/kasan/report.c:475 kasan_report+0xda/0x110 mm/kasan/report.c:588 __ife_tlv_meta_valid net/ife/ife.c:108 [inline] ife_tlv_meta_decode+0x1d1/0x210 net/ife/ife.c:131 tcf_ife_decode net/sched/act_ife.c:739 [inline] tcf_ife_act+0x4e3/0x1cd0 net/sched/act_ife.c:879 tc_act include/net/tc_wrapper.h:221 [inline] tcf_action_exec+0x1ac/0x620 net/sched/act_api.c:1079 tcf_exts_exec include/net/pkt_cls.h:344 [inline] mall_classify+0x201/0x310 net/sched/cls_matchall.c:42 tc_classify include/net/tc_wrapper.h:227 [inline] __tcf_classify net/sched/cls_api.c:1703 [inline] tcf_classify+0x82f/0x1260 net/sched/cls_api.c:1800 hfsc_classify net/sched/sch_hfsc.c:1147 [inline] hfsc_enqueue+0x315/0x1060 net/sched/sch_hfsc.c:1546 dev_qdisc_enqueue+0x3f/0x230 net/core/dev.c:3739 __dev_xmit_skb net/core/dev.c:3828 [inline] __dev_queue_xmit+0x1de1/0x3d30 net/core/dev.c:4311 dev_queue_xmit include/linux/netdevice.h:3165 [inline] packet_xmit+0x237/0x350 net/packet/af_packet.c:276 packet_snd net/packet/af_packet.c:3081 [inline] packet_sendmsg+0x24aa/0x5200 net/packet/af_packet.c:3113 sock_sendmsg_nosec net/socket.c:730 [inline] __sock_sendmsg+0xd5/0x180 net/socket.c:745 __sys_sendto+0x255/0x340 net/socket.c:2190 __do_sys_sendto net/socket.c:2202 [inline] __se_sys_sendto net/socket.c:2198 [inline] __x64_sys_sendto+0xe0/0x1b0 net/socket.c:2198 do_syscall_x64 arch/x86/entry/common.c:51 [inline] do_syscall_64+0x40/0x110 arch/x86/entry/common.c:82 entry_SYSCALL_64_after_hwframe+0x63/0x6b RIP: 0033:0x7fe9acc7cae9 Code: 28 00 00 00 75 05 48 83 c4 28 c3 e8 e1 20 00 00 90 48 89 f8 48 89 f7 48 89 d6 48 89 ca 4d 89 c2 4d 89 c8 4c 8b 4c 24 08 0f 05 <48> 3d 01 f0 ff ff 73 01 c3 48 c7 c1 b0 ff ff ff f7 d8 64 89 01 48 RSP: 002b:00007fe9ada450c8 EFLAGS: 00000246 ORIG_RAX: 000000000000002c RAX: ffffffffffffffda RBX: 00007fe9acd9bf80 RCX: 00007fe9acc7cae9 RDX: 000000000000fce0 RSI: 00000000200002c0 RDI: 0000000000000003 RBP: 00007fe9accc847a R08: 0000000020000140 R09: 0000000000000014 R10: 0000000000000004 R11: 0000000000000246 R12: 0000000000000000 R13: 000000000000000b R14: 00007fe9acd9bf80 R15: 00007ffd5427ae78 Allocated by task 22323: kasan_save_stack+0x33/0x50 mm/kasan/common.c:45 kasan_set_track+0x25/0x30 mm/kasan/common.c:52 ____kasan_kmalloc mm/kasan/common.c:374 [inline] __kasan_kmalloc+0xa2/0xb0 mm/kasan/common.c:383 kasan_kmalloc include/linux/kasan.h:198 [inline] __do_kmalloc_node mm/slab_common.c:1007 [inline] __kmalloc_node_track_caller+0x5a/0x90 mm/slab_common.c:1027 kmalloc_reserve+0xef/0x260 net/core/skbuff.c:582 __alloc_skb+0x12b/0x330 net/core/skbuff.c:651 alloc_skb include/linux/skbuff.h:1298 [inline] alloc_skb_with_frags+0xe4/0x710 net/core/skbuff.c:6331 sock_alloc_send_pskb+0x7e4/0x970 net/core/sock.c:2780 packet_alloc_skb net/packet/af_packet.c:2930 [inline] packet_snd net/packet/af_packet.c:3024 [inline] packet_sendmsg+0x1e2a/0x5200 net/packet/af_packet.c:3113 sock_sendmsg_nosec net/socket.c:730 [inline] __sock_sendmsg+0xd5/0x180 net/socket.c:745 __sys_sendto+0x255/0x340 net/socket.c:2190 __do_sys_sendto net/socket.c:2202 [inline] __se_sys_sendto net/socket.c:2198 [inline] __x64_sys_sendto+0xe0/0x1b0 net/socket.c:2198 do_syscall_x64 arch/x86/entry/common.c:51 [inline] do_syscall_64+0x40/0x110 arch/x86/entry/common.c:82 entry_SYSCALL_64_after_hwframe+0x63/0x6b Freed by task 22323: kasan_save_stack+0x33/0x50 mm/kasan/common.c:45 kasan_set_track+0x25/0x30 mm/kasan/common.c:52 kasan_save_free_info+0x2b/0x40 mm/kasan/generic.c:522 ____kasan_slab_free mm/kasan/common.c:236 [inline] ____kasan_slab_free+0x15b/0x1b0 mm/kasan/common.c:200 kasan_slab_free include/linux/kasan.h:164 [inline] slab_free_hook mm/slub.c:1800 [inline] slab_free_freelist_hook+0x114/0x1e0 mm/slub.c:1826 slab_free mm/slub.c:3809 [inline] __kmem_cache_free+0xc0/0x180 mm/slub.c:3822 skb_kfree_head net/core/skbuff.c:950 [inline] skb_free_head+0x110/0x1b0 net/core/skbuff.c:962 pskb_expand_head+0x3c5/0x1170 net/core/skbuff.c:2130 __pskb_pull_tail+0xe1/0x1830 net/core/skbuff.c:2655 pskb_may_pull_reason include/linux/skbuff.h:2685 [inline] pskb_may_pull include/linux/skbuff.h:2693 [inline] ife_decode+0x394/0x4f0 net/ife/ife.c:82 tcf_ife_decode net/sched/act_ife.c:727 [inline] tcf_ife_act+0x43b/0x1cd0 net/sched/act_ife.c:879 tc_act include/net/tc_wrapper.h:221 [inline] tcf_action_exec+0x1ac/0x620 net/sched/act_api.c:1079 tcf_exts_exec include/net/pkt_cls.h:344 [inline] mall_classify+0x201/0x310 net/sched/cls_matchall.c:42 tc_classify include/net/tc_wrapper.h:227 [inline] __tcf_classify net/sched/cls_api.c:1703 [inline] tcf_classify+0x82f/0x1260 net/sched/cls_api.c:1800 hfsc_classify net/sched/sch_hfsc.c:1147 [inline] hfsc_enqueue+0x315/0x1060 net/sched/sch_hfsc.c:1546 dev_qdisc_enqueue+0x3f/0x230 net/core/dev.c:3739 __dev_xmit_skb net/core/dev.c:3828 [inline] __dev_queue_xmit+0x1de1/0x3d30 net/core/dev.c:4311 dev_queue_xmit include/linux/netdevice.h:3165 [inline] packet_xmit+0x237/0x350 net/packet/af_packet.c:276 packet_snd net/packet/af_packet.c:3081 [inline] packet_sendmsg+0x24aa/0x5200 net/packet/af_packet.c:3113 sock_sendmsg_nosec net/socket.c:730 [inline] __sock_sendmsg+0xd5/0x180 net/socket.c:745 __sys_sendto+0x255/0x340 net/socket.c:2190 __do_sys_sendto net/socket.c:2202 [inline] __se_sys_sendto net/socket.c:2198 [inline] __x64_sys_sendto+0xe0/0x1b0 net/socket.c:2198 do_syscall_x64 arch/x86/entry/common.c:51 [inline] do_syscall_64+0x40/0x110 arch/x86/entry/common.c:82 entry_SYSCALL_64_after_hwframe+0x63/0x6b The buggy address belongs to the object at ffff88802d730000 which belongs to the cache kmalloc-8k of size 8192 The buggy address is located 164 bytes inside of freed 8192-byte region [ffff88802d730000, ffff88802d732000) The buggy address belongs to the physical page: page:ffffea0000b5cc00 refcount:1 mapcount:0 mapping:0000000000000000 index:0x0 pfn:0x2d730 head:ffffea0000b5cc00 order:3 entire_mapcount:0 nr_pages_mapped:0 pincount:0 flags: 0xfff00000000840(slab|head|node=0|zone=1|lastcpupid=0x7ff) page_type: 0xffffffff() raw: 00fff00000000840 ffff888013042280 dead000000000122 0000000000000000 raw: 0000000000000000 0000000080020002 00000001ffffffff 0000000000000000 page dumped because: kasan: bad access detected page_owner tracks the page as allocated page last allocated via order 3, migratetype Unmovable, gfp_mask 0x1d20c0(__GFP_IO|__GFP_FS|__GFP_NOWARN|__GFP_NORETRY|__GFP_COMP|__GFP_NOMEMALLOC|__GFP_HARDWALL), pid 22323, tgid 22320 (syz-executor.5), ts 950317230369, free_ts 950233467461 set_page_owner include/linux/page_owner.h:31 [inline] post_alloc_hook+0x2d0/0x350 mm/page_alloc.c:1544 prep_new_page mm/page_alloc.c:1551 [inline] get_page_from_freelist+0xa28/0x3730 mm/page_alloc.c:3319 __alloc_pages+0x22e/0x2420 mm/page_alloc.c:4575 alloc_pages_mpol+0x258/0x5f0 mm/mempolicy.c:2133 alloc_slab_page mm/slub.c:1870 [inline] allocate_slab mm/slub.c:2017 [inline] new_slab+0x283/0x3c0 mm/slub.c:2070 ___slab_alloc+0x979/0x1500 mm/slub.c:3223 __slab_alloc.constprop.0+0x56/0xa0 mm/slub.c:3322 __slab_alloc_node mm/slub.c:3375 [inline] slab_alloc_node mm/slub.c:3468 [inline] __kmem_cache_alloc_node+0x131/0x310 mm/slub.c:3517 __do_kmalloc_node mm/slab_common.c:1006 [inline] __kmalloc_node_track_caller+0x4a/0x90 mm/slab_common.c:1027 kmalloc_reserve+0xef/0x260 net/core/skbuff.c:582 __alloc_skb+0x12b/0x330 net/core/skbuff.c:651 alloc_skb include/linux/skbuff.h:1298 [inline] alloc_skb_with_frags+0xe4/0x710 net/core/skbuff.c:6331 sock_alloc_send_pskb+0x7e4/0x970 net/core/sock.c:2780 packet_alloc_skb net/packet/af_packet.c:2930 [inline] packet_snd net/packet/af_packet.c:3024 [inline] packet_sendmsg+0x1e2a/0x5200 net/packet/af_packet.c:3113 sock_sendmsg_nosec net/socket.c:730 [inline] __sock_sendmsg+0xd5/0x180 net/socket.c:745 __sys_sendto+0x255/0x340 net/socket.c:2190 page last free stack trace: reset_page_owner include/linux/page_owner.h:24 [inline] free_pages_prepare mm/page_alloc.c:1144 [inline] free_unref_page_prepare+0x53c/0xb80 mm/page_alloc.c:2354 free_unref_page+0x33/0x3b0 mm/page_alloc.c:2494 __unfreeze_partials+0x226/0x240 mm/slub.c:2655 qlink_free mm/kasan/quarantine.c:168 [inline] qlist_free_all+0x6a/0x170 mm/kasan/quarantine.c:187 kasan_quarantine_reduce+0x18e/0x1d0 mm/kasan/quarantine.c:294 __kasan_slab_alloc+0x65/0x90 mm/kasan/common.c:305 kasan_slab_alloc include/linux/kasan.h:188 [inline] slab_post_alloc_hook mm/slab.h:763 [inline] slab_alloc_node mm/slub.c:3478 [inline] slab_alloc mm/slub.c:3486 [inline] __kmem_cache_alloc_lru mm/slub.c:3493 [inline] kmem_cache_alloc_lru+0x219/0x6f0 mm/slub.c:3509 alloc_inode_sb include/linux/fs.h:2937 [inline] ext4_alloc_inode+0x28/0x650 fs/ext4/super.c:1408 alloc_inode+0x5d/0x220 fs/inode.c:261 new_inode_pseudo fs/inode.c:1006 [inline] new_inode+0x22/0x260 fs/inode.c:1032 __ext4_new_inode+0x333/0x5200 fs/ext4/ialloc.c:958 ext4_symlink+0x5d7/0xa20 fs/ext4/namei.c:3398 vfs_symlink fs/namei.c:4464 [inline] vfs_symlink+0x3e5/0x620 fs/namei.c:4448 do_symlinkat+0x25f/0x310 fs/namei.c:4490 __do_sys_symlinkat fs/namei.c:4506 [inline] __se_sys_symlinkat fs/namei.c:4503 [inline] __x64_sys_symlinkat+0x97/0xc0 fs/namei.c:4503 do_syscall_x64 arch/x86/entry/common.c:51 [inline] do_syscall_64+0x40/0x110 arch/x86/entry/common.c:82 Fixes: d57493d6d1be ("net: sched: ife: check on metadata length") Reported-by: syzbot Signed-off-by: Eric Dumazet Cc: Jamal Hadi Salim Cc: Alexander Aring Acked-by: Jamal Hadi Salim Signed-off-by: David S. Miller --- net/ife/ife.c | 1 + 1 file changed, 1 insertion(+) diff --git a/net/ife/ife.c b/net/ife/ife.c index 13bbf8cb6a3961..be05b690b9ef29 100644 --- a/net/ife/ife.c +++ b/net/ife/ife.c @@ -82,6 +82,7 @@ void *ife_decode(struct sk_buff *skb, u16 *metalen) if (unlikely(!pskb_may_pull(skb, total_pull))) return NULL; + ifehdr = (struct ifeheadr *)(skb->data + skb->dev->hard_header_len); skb_set_mac_header(skb, total_pull); __skb_pull(skb, total_pull); *metalen = ifehdrln - IFE_METAHDRLEN; From 309fdb1c33fe726d92d0030481346f24e1b01f07 Mon Sep 17 00:00:00 2001 From: Zhipeng Lu Date: Thu, 14 Dec 2023 21:04:04 +0800 Subject: [PATCH 05/12] ethernet: atheros: fix a memleak in atl1e_setup_ring_resources In the error handling of 'offset > adapter->ring_size', the tx_ring->tx_buffer allocated by kzalloc should be freed, instead of 'goto failed' instantly. Fixes: a6a5325239c2 ("atl1e: Atheros L1E Gigabit Ethernet driver") Signed-off-by: Zhipeng Lu Reviewed-by: Suman Ghosh Signed-off-by: David S. Miller --- drivers/net/ethernet/atheros/atl1e/atl1e_main.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/atheros/atl1e/atl1e_main.c b/drivers/net/ethernet/atheros/atl1e/atl1e_main.c index 5935be190b9e22..5f2a6fcba96708 100644 --- a/drivers/net/ethernet/atheros/atl1e/atl1e_main.c +++ b/drivers/net/ethernet/atheros/atl1e/atl1e_main.c @@ -866,10 +866,13 @@ static int atl1e_setup_ring_resources(struct atl1e_adapter *adapter) netdev_err(adapter->netdev, "offset(%d) > ring size(%d) !!\n", offset, adapter->ring_size); err = -1; - goto failed; + goto free_buffer; } return 0; +free_buffer: + kfree(tx_ring->tx_buffer); + tx_ring->tx_buffer = NULL; failed: if (adapter->ring_vir_addr != NULL) { dma_free_coherent(&pdev->dev, adapter->ring_size, From 64b8bc7d5f1434c636a40bdcfcd42b278d1714be Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 14 Dec 2023 15:27:47 +0000 Subject: [PATCH 06/12] net/rose: fix races in rose_kill_by_device() syzbot found an interesting netdev refcounting issue in net/rose/af_rose.c, thanks to CONFIG_NET_DEV_REFCNT_TRACKER=y [1] Problem is that rose_kill_by_device() can change rose->device while other threads do not expect the pointer to be changed. We have to first collect sockets in a temporary array, then perform the changes while holding the socket lock and rose_list_lock spinlock (in this order) Change rose_release() to also acquire rose_list_lock before releasing the netdev refcount. [1] [ 1185.055088][ T7889] ref_tracker: reference already released. [ 1185.061476][ T7889] ref_tracker: allocated in: [ 1185.066081][ T7889] rose_bind+0x4ab/0xd10 [ 1185.070446][ T7889] __sys_bind+0x1ec/0x220 [ 1185.074818][ T7889] __x64_sys_bind+0x72/0xb0 [ 1185.079356][ T7889] do_syscall_64+0x40/0x110 [ 1185.083897][ T7889] entry_SYSCALL_64_after_hwframe+0x63/0x6b [ 1185.089835][ T7889] ref_tracker: freed in: [ 1185.094088][ T7889] rose_release+0x2f5/0x570 [ 1185.098629][ T7889] __sock_release+0xae/0x260 [ 1185.103262][ T7889] sock_close+0x1c/0x20 [ 1185.107453][ T7889] __fput+0x270/0xbb0 [ 1185.111467][ T7889] task_work_run+0x14d/0x240 [ 1185.116085][ T7889] get_signal+0x106f/0x2790 [ 1185.120622][ T7889] arch_do_signal_or_restart+0x90/0x7f0 [ 1185.126205][ T7889] exit_to_user_mode_prepare+0x121/0x240 [ 1185.131846][ T7889] syscall_exit_to_user_mode+0x1e/0x60 [ 1185.137293][ T7889] do_syscall_64+0x4d/0x110 [ 1185.141783][ T7889] entry_SYSCALL_64_after_hwframe+0x63/0x6b [ 1185.148085][ T7889] ------------[ cut here ]------------ WARNING: CPU: 1 PID: 7889 at lib/ref_tracker.c:255 ref_tracker_free+0x61a/0x810 lib/ref_tracker.c:255 Modules linked in: CPU: 1 PID: 7889 Comm: syz-executor.2 Not tainted 6.7.0-rc4-syzkaller-00162-g65c95f78917e #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 11/10/2023 RIP: 0010:ref_tracker_free+0x61a/0x810 lib/ref_tracker.c:255 Code: 00 44 8b 6b 18 31 ff 44 89 ee e8 21 62 f5 fc 45 85 ed 0f 85 a6 00 00 00 e8 a3 66 f5 fc 48 8b 34 24 48 89 ef e8 27 5f f1 05 90 <0f> 0b 90 bb ea ff ff ff e9 52 fd ff ff e8 84 66 f5 fc 4c 8d 6d 44 RSP: 0018:ffffc90004917850 EFLAGS: 00010202 RAX: 0000000000000201 RBX: ffff88802618f4c0 RCX: 0000000000000000 RDX: 0000000000000202 RSI: ffffffff8accb920 RDI: 0000000000000001 RBP: ffff8880269ea5b8 R08: 0000000000000001 R09: fffffbfff23e35f6 R10: ffffffff91f1afb7 R11: 0000000000000001 R12: 1ffff92000922f0c R13: 0000000005a2039b R14: ffff88802618f4d8 R15: 00000000ffffffff FS: 00007f0a720ef6c0(0000) GS:ffff8880b9900000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 00007f43a819d988 CR3: 0000000076c64000 CR4: 00000000003506f0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 Call Trace: netdev_tracker_free include/linux/netdevice.h:4127 [inline] netdev_put include/linux/netdevice.h:4144 [inline] netdev_put include/linux/netdevice.h:4140 [inline] rose_kill_by_device net/rose/af_rose.c:195 [inline] rose_device_event+0x25d/0x330 net/rose/af_rose.c:218 notifier_call_chain+0xb6/0x3b0 kernel/notifier.c:93 call_netdevice_notifiers_info+0xbe/0x130 net/core/dev.c:1967 call_netdevice_notifiers_extack net/core/dev.c:2005 [inline] call_netdevice_notifiers net/core/dev.c:2019 [inline] __dev_notify_flags+0x1f5/0x2e0 net/core/dev.c:8646 dev_change_flags+0x122/0x170 net/core/dev.c:8682 dev_ifsioc+0x9ad/0x1090 net/core/dev_ioctl.c:529 dev_ioctl+0x224/0x1090 net/core/dev_ioctl.c:786 sock_do_ioctl+0x198/0x270 net/socket.c:1234 sock_ioctl+0x22e/0x6b0 net/socket.c:1339 vfs_ioctl fs/ioctl.c:51 [inline] __do_sys_ioctl fs/ioctl.c:871 [inline] __se_sys_ioctl fs/ioctl.c:857 [inline] __x64_sys_ioctl+0x18f/0x210 fs/ioctl.c:857 do_syscall_x64 arch/x86/entry/common.c:52 [inline] do_syscall_64+0x40/0x110 arch/x86/entry/common.c:83 entry_SYSCALL_64_after_hwframe+0x63/0x6b RIP: 0033:0x7f0a7147cba9 Code: 28 00 00 00 75 05 48 83 c4 28 c3 e8 e1 20 00 00 90 48 89 f8 48 89 f7 48 89 d6 48 89 ca 4d 89 c2 4d 89 c8 4c 8b 4c 24 08 0f 05 <48> 3d 01 f0 ff ff 73 01 c3 48 c7 c1 b0 ff ff ff f7 d8 64 89 01 48 RSP: 002b:00007f0a720ef0c8 EFLAGS: 00000246 ORIG_RAX: 0000000000000010 RAX: ffffffffffffffda RBX: 00007f0a7159bf80 RCX: 00007f0a7147cba9 RDX: 0000000020000040 RSI: 0000000000008914 RDI: 0000000000000004 RBP: 00007f0a714c847a R08: 0000000000000000 R09: 0000000000000000 R10: 0000000000000000 R11: 0000000000000246 R12: 0000000000000000 R13: 000000000000000b R14: 00007f0a7159bf80 R15: 00007ffc8bb3a5f8 Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2") Reported-by: syzbot Signed-off-by: Eric Dumazet Cc: Bernard Pidoux Signed-off-by: David S. Miller --- net/rose/af_rose.c | 39 ++++++++++++++++++++++++++++++++++----- 1 file changed, 34 insertions(+), 5 deletions(-) diff --git a/net/rose/af_rose.c b/net/rose/af_rose.c index ecb91ad4ce639e..ef81d019b20f48 100644 --- a/net/rose/af_rose.c +++ b/net/rose/af_rose.c @@ -182,21 +182,47 @@ void rose_kill_by_neigh(struct rose_neigh *neigh) */ static void rose_kill_by_device(struct net_device *dev) { - struct sock *s; + struct sock *sk, *array[16]; + struct rose_sock *rose; + bool rescan; + int i, cnt; +start: + rescan = false; + cnt = 0; spin_lock_bh(&rose_list_lock); - sk_for_each(s, &rose_list) { - struct rose_sock *rose = rose_sk(s); + sk_for_each(sk, &rose_list) { + rose = rose_sk(sk); + if (rose->device == dev) { + if (cnt == ARRAY_SIZE(array)) { + rescan = true; + break; + } + sock_hold(sk); + array[cnt++] = sk; + } + } + spin_unlock_bh(&rose_list_lock); + for (i = 0; i < cnt; i++) { + sk = array[cnt]; + rose = rose_sk(sk); + lock_sock(sk); + spin_lock_bh(&rose_list_lock); if (rose->device == dev) { - rose_disconnect(s, ENETUNREACH, ROSE_OUT_OF_ORDER, 0); + rose_disconnect(sk, ENETUNREACH, ROSE_OUT_OF_ORDER, 0); if (rose->neighbour) rose->neighbour->use--; netdev_put(rose->device, &rose->dev_tracker); rose->device = NULL; } + spin_unlock_bh(&rose_list_lock); + release_sock(sk); + sock_put(sk); + cond_resched(); } - spin_unlock_bh(&rose_list_lock); + if (rescan) + goto start; } /* @@ -656,7 +682,10 @@ static int rose_release(struct socket *sock) break; } + spin_lock_bh(&rose_list_lock); netdev_put(rose->device, &rose->dev_tracker); + rose->device = NULL; + spin_unlock_bh(&rose_list_lock); sock->sk = NULL; release_sock(sk); sock_put(sk); From 23c93c3b6275a59f2a685f4a693944b53c31df4e Mon Sep 17 00:00:00 2001 From: Andy Gospodarek Date: Thu, 14 Dec 2023 13:31:38 -0800 Subject: [PATCH 07/12] bnxt_en: do not map packet buffers twice Remove double-mapping of DMA buffers as it can prevent page pool entries from being freed. Mapping is managed by page pool infrastructure and was previously managed by the driver in __bnxt_alloc_rx_page before allowing the page pool infrastructure to manage it. Fixes: 578fcfd26e2a ("bnxt_en: Let the page pool manage the DMA mapping") Reviewed-by: Somnath Kotur Signed-off-by: Andy Gospodarek Signed-off-by: Michael Chan Reviewed-by: David Wei Link: https://lore.kernel.org/r/20231214213138.98095-1-michael.chan@broadcom.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/broadcom/bnxt/bnxt_xdp.c | 11 ++--------- 1 file changed, 2 insertions(+), 9 deletions(-) diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt_xdp.c b/drivers/net/ethernet/broadcom/bnxt/bnxt_xdp.c index 96f5ca778c67d6..8cb9a99154aad9 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt_xdp.c +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt_xdp.c @@ -59,7 +59,6 @@ struct bnxt_sw_tx_bd *bnxt_xmit_bd(struct bnxt *bp, for (i = 0; i < num_frags ; i++) { skb_frag_t *frag = &sinfo->frags[i]; struct bnxt_sw_tx_bd *frag_tx_buf; - struct pci_dev *pdev = bp->pdev; dma_addr_t frag_mapping; int frag_len; @@ -73,16 +72,10 @@ struct bnxt_sw_tx_bd *bnxt_xmit_bd(struct bnxt *bp, txbd = &txr->tx_desc_ring[TX_RING(prod)][TX_IDX(prod)]; frag_len = skb_frag_size(frag); - frag_mapping = skb_frag_dma_map(&pdev->dev, frag, 0, - frag_len, DMA_TO_DEVICE); - - if (unlikely(dma_mapping_error(&pdev->dev, frag_mapping))) - return NULL; - - dma_unmap_addr_set(frag_tx_buf, mapping, frag_mapping); - flags = frag_len << TX_BD_LEN_SHIFT; txbd->tx_bd_len_flags_type = cpu_to_le32(flags); + frag_mapping = page_pool_get_dma_addr(skb_frag_page(frag)) + + skb_frag_off(frag); txbd->tx_bd_haddr = cpu_to_le64(frag_mapping); len = frag_len; From b1dfc0f76231bbf395c59d20a2070684620d5d0f Mon Sep 17 00:00:00 2001 From: Daniel Golle Date: Tue, 12 Dec 2023 00:05:35 +0000 Subject: [PATCH 08/12] net: phy: skip LED triggers on PHYs on SFP modules Calling led_trigger_register() when attaching a PHY located on an SFP module potentially (and practically) leads into a deadlock. Fix this by not calling led_trigger_register() for PHYs localted on SFP modules as such modules actually never got any LEDs. ====================================================== WARNING: possible circular locking dependency detected 6.7.0-rc4-next-20231208+ #0 Tainted: G O ------------------------------------------------------ kworker/u8:2/43 is trying to acquire lock: ffffffc08108c4e8 (triggers_list_lock){++++}-{3:3}, at: led_trigger_register+0x4c/0x1a8 but task is already holding lock: ffffff80c5c6f318 (&sfp->sm_mutex){+.+.}-{3:3}, at: cleanup_module+0x2ba8/0x3120 [sfp] which lock already depends on the new lock. the existing dependency chain (in reverse order) is: -> #3 (&sfp->sm_mutex){+.+.}-{3:3}: __mutex_lock+0x88/0x7a0 mutex_lock_nested+0x20/0x28 cleanup_module+0x2ae0/0x3120 [sfp] sfp_register_bus+0x5c/0x9c sfp_register_socket+0x48/0xd4 cleanup_module+0x271c/0x3120 [sfp] platform_probe+0x64/0xb8 really_probe+0x17c/0x3c0 __driver_probe_device+0x78/0x164 driver_probe_device+0x3c/0xd4 __driver_attach+0xec/0x1f0 bus_for_each_dev+0x60/0xa0 driver_attach+0x20/0x28 bus_add_driver+0x108/0x208 driver_register+0x5c/0x118 __platform_driver_register+0x24/0x2c init_module+0x28/0xa7c [sfp] do_one_initcall+0x70/0x2ec do_init_module+0x54/0x1e4 load_module+0x1b78/0x1c8c __do_sys_init_module+0x1bc/0x2cc __arm64_sys_init_module+0x18/0x20 invoke_syscall.constprop.0+0x4c/0xdc do_el0_svc+0x3c/0xbc el0_svc+0x34/0x80 el0t_64_sync_handler+0xf8/0x124 el0t_64_sync+0x150/0x154 -> #2 (rtnl_mutex){+.+.}-{3:3}: __mutex_lock+0x88/0x7a0 mutex_lock_nested+0x20/0x28 rtnl_lock+0x18/0x20 set_device_name+0x30/0x130 netdev_trig_activate+0x13c/0x1ac led_trigger_set+0x118/0x234 led_trigger_write+0x104/0x17c sysfs_kf_bin_write+0x64/0x80 kernfs_fop_write_iter+0x128/0x1b4 vfs_write+0x178/0x2a4 ksys_write+0x58/0xd4 __arm64_sys_write+0x18/0x20 invoke_syscall.constprop.0+0x4c/0xdc do_el0_svc+0x3c/0xbc el0_svc+0x34/0x80 el0t_64_sync_handler+0xf8/0x124 el0t_64_sync+0x150/0x154 -> #1 (&led_cdev->trigger_lock){++++}-{3:3}: down_write+0x4c/0x13c led_trigger_write+0xf8/0x17c sysfs_kf_bin_write+0x64/0x80 kernfs_fop_write_iter+0x128/0x1b4 vfs_write+0x178/0x2a4 ksys_write+0x58/0xd4 __arm64_sys_write+0x18/0x20 invoke_syscall.constprop.0+0x4c/0xdc do_el0_svc+0x3c/0xbc el0_svc+0x34/0x80 el0t_64_sync_handler+0xf8/0x124 el0t_64_sync+0x150/0x154 -> #0 (triggers_list_lock){++++}-{3:3}: __lock_acquire+0x12a0/0x2014 lock_acquire+0x100/0x2ac down_write+0x4c/0x13c led_trigger_register+0x4c/0x1a8 phy_led_triggers_register+0x9c/0x214 phy_attach_direct+0x154/0x36c phylink_attach_phy+0x30/0x60 phylink_sfp_connect_phy+0x140/0x510 sfp_add_phy+0x34/0x50 init_module+0x15c/0xa7c [sfp] cleanup_module+0x1d94/0x3120 [sfp] cleanup_module+0x2bb4/0x3120 [sfp] process_one_work+0x1f8/0x4ec worker_thread+0x1e8/0x3d8 kthread+0x104/0x110 ret_from_fork+0x10/0x20 other info that might help us debug this: Chain exists of: triggers_list_lock --> rtnl_mutex --> &sfp->sm_mutex Possible unsafe locking scenario: CPU0 CPU1 ---- ---- lock(&sfp->sm_mutex); lock(rtnl_mutex); lock(&sfp->sm_mutex); lock(triggers_list_lock); *** DEADLOCK *** 4 locks held by kworker/u8:2/43: #0: ffffff80c000f938 ((wq_completion)events_power_efficient){+.+.}-{0:0}, at: process_one_work+0x150/0x4ec #1: ffffffc08214bde8 ((work_completion)(&(&sfp->timeout)->work)){+.+.}-{0:0}, at: process_one_work+0x150/0x4ec #2: ffffffc0810902f8 (rtnl_mutex){+.+.}-{3:3}, at: rtnl_lock+0x18/0x20 #3: ffffff80c5c6f318 (&sfp->sm_mutex){+.+.}-{3:3}, at: cleanup_module+0x2ba8/0x3120 [sfp] stack backtrace: CPU: 0 PID: 43 Comm: kworker/u8:2 Tainted: G O 6.7.0-rc4-next-20231208+ #0 Hardware name: Bananapi BPI-R4 (DT) Workqueue: events_power_efficient cleanup_module [sfp] Call trace: dump_backtrace+0xa8/0x10c show_stack+0x14/0x1c dump_stack_lvl+0x5c/0xa0 dump_stack+0x14/0x1c print_circular_bug+0x328/0x430 check_noncircular+0x124/0x134 __lock_acquire+0x12a0/0x2014 lock_acquire+0x100/0x2ac down_write+0x4c/0x13c led_trigger_register+0x4c/0x1a8 phy_led_triggers_register+0x9c/0x214 phy_attach_direct+0x154/0x36c phylink_attach_phy+0x30/0x60 phylink_sfp_connect_phy+0x140/0x510 sfp_add_phy+0x34/0x50 init_module+0x15c/0xa7c [sfp] cleanup_module+0x1d94/0x3120 [sfp] cleanup_module+0x2bb4/0x3120 [sfp] process_one_work+0x1f8/0x4ec worker_thread+0x1e8/0x3d8 kthread+0x104/0x110 ret_from_fork+0x10/0x20 Signed-off-by: Daniel Golle Fixes: 01e5b728e9e4 ("net: phy: Add a binding for PHY LEDs") Link: https://lore.kernel.org/r/102a9dce38bdf00215735d04cd4704458273ad9c.1702339354.git.daniel@makrotopia.org Signed-off-by: Jakub Kicinski --- drivers/net/phy/phy_device.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/drivers/net/phy/phy_device.c b/drivers/net/phy/phy_device.c index 2ce74593d6e4a1..a42df2c1bd043c 100644 --- a/drivers/net/phy/phy_device.c +++ b/drivers/net/phy/phy_device.c @@ -1548,7 +1548,8 @@ int phy_attach_direct(struct net_device *dev, struct phy_device *phydev, goto error; phy_resume(phydev); - phy_led_triggers_register(phydev); + if (!phydev->is_on_sfp_module) + phy_led_triggers_register(phydev); /** * If the external phy used by current mac interface is managed by @@ -1817,7 +1818,8 @@ void phy_detach(struct phy_device *phydev) } phydev->phylink = NULL; - phy_led_triggers_unregister(phydev); + if (!phydev->is_on_sfp_module) + phy_led_triggers_unregister(phydev); if (phydev->mdio.dev.driver) module_put(phydev->mdio.dev.driver->owner); From c8f021eec5817601dbd25ab7e3ad5c720965c688 Mon Sep 17 00:00:00 2001 From: Geliang Tang Date: Fri, 15 Dec 2023 17:04:24 +0100 Subject: [PATCH 09/12] selftests: mptcp: join: fix subflow_send_ack lookup MPC backups tests will skip unexpected sometimes (For example, when compiling kernel with an older version of gcc, such as gcc-8), since static functions like mptcp_subflow_send_ack also be listed in /proc/kallsyms, with a 't' in front of it, not 'T' ('T' is for a global function): > grep "mptcp_subflow_send_ack" /proc/kallsyms 0000000000000000 T __pfx___mptcp_subflow_send_ack 0000000000000000 T __mptcp_subflow_send_ack 0000000000000000 t __pfx_mptcp_subflow_send_ack 0000000000000000 t mptcp_subflow_send_ack In this case, mptcp_lib_kallsyms_doesnt_have "mptcp_subflow_send_ack$" will be false, MPC backups tests will skip. This is not what we expected. The correct logic here should be: if mptcp_subflow_send_ack is not a global function in /proc/kallsyms, do these MPC backups tests. So a 'T' must be added in front of mptcp_subflow_send_ack. Fixes: 632978f0a961 ("selftests: mptcp: join: skip MPC backups tests if not supported") Cc: stable@vger.kernel.org Signed-off-by: Geliang Tang Reviewed-by: Mat Martineau Signed-off-by: Matthieu Baerts Signed-off-by: David S. Miller --- tools/testing/selftests/net/mptcp/mptcp_join.sh | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tools/testing/selftests/net/mptcp/mptcp_join.sh b/tools/testing/selftests/net/mptcp/mptcp_join.sh index 3c94f2f194d681..24a57b3ae2155d 100755 --- a/tools/testing/selftests/net/mptcp/mptcp_join.sh +++ b/tools/testing/selftests/net/mptcp/mptcp_join.sh @@ -2776,7 +2776,7 @@ backup_tests() fi if reset "mpc backup" && - continue_if mptcp_lib_kallsyms_doesnt_have "mptcp_subflow_send_ack$"; then + continue_if mptcp_lib_kallsyms_doesnt_have "T mptcp_subflow_send_ack$"; then pm_nl_add_endpoint $ns2 10.0.1.2 flags subflow,backup speed=slow \ run_tests $ns1 $ns2 10.0.1.1 @@ -2785,7 +2785,7 @@ backup_tests() fi if reset "mpc backup both sides" && - continue_if mptcp_lib_kallsyms_doesnt_have "mptcp_subflow_send_ack$"; then + continue_if mptcp_lib_kallsyms_doesnt_have "T mptcp_subflow_send_ack$"; then pm_nl_add_endpoint $ns1 10.0.1.1 flags subflow,backup pm_nl_add_endpoint $ns2 10.0.1.2 flags subflow,backup speed=slow \ @@ -2795,7 +2795,7 @@ backup_tests() fi if reset "mpc switch to backup" && - continue_if mptcp_lib_kallsyms_doesnt_have "mptcp_subflow_send_ack$"; then + continue_if mptcp_lib_kallsyms_doesnt_have "T mptcp_subflow_send_ack$"; then pm_nl_add_endpoint $ns2 10.0.1.2 flags subflow sflags=backup speed=slow \ run_tests $ns1 $ns2 10.0.1.1 @@ -2804,7 +2804,7 @@ backup_tests() fi if reset "mpc switch to backup both sides" && - continue_if mptcp_lib_kallsyms_doesnt_have "mptcp_subflow_send_ack$"; then + continue_if mptcp_lib_kallsyms_doesnt_have "T mptcp_subflow_send_ack$"; then pm_nl_add_endpoint $ns1 10.0.1.1 flags subflow pm_nl_add_endpoint $ns2 10.0.1.2 flags subflow sflags=backup speed=slow \ From 4fd19a30701659af5839b7bd19d1f05f05933ebe Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Fri, 15 Dec 2023 17:04:25 +0100 Subject: [PATCH 10/12] mptcp: fix inconsistent state on fastopen race The netlink PM can race with fastopen self-connect attempts, shutting down the first subflow via: MPTCP_PM_CMD_DEL_ADDR -> mptcp_nl_remove_id_zero_address -> mptcp_pm_nl_rm_subflow_received -> mptcp_close_ssk and transitioning such subflow to FIN_WAIT1 status before the syn-ack packet is processed. The MPTCP code does not react to such state change, leaving the connection in not-fallback status and the subflow handshake uncompleted, triggering the following splat: WARNING: CPU: 0 PID: 10630 at net/mptcp/subflow.c:1405 subflow_data_ready+0x39f/0x690 net/mptcp/subflow.c:1405 Modules linked in: CPU: 0 PID: 10630 Comm: kworker/u4:11 Not tainted 6.6.0-syzkaller-14500-g1c41041124bd #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 10/09/2023 Workqueue: bat_events batadv_nc_worker RIP: 0010:subflow_data_ready+0x39f/0x690 net/mptcp/subflow.c:1405 Code: 18 89 ee e8 e3 d2 21 f7 40 84 ed 75 1f e8 a9 d7 21 f7 44 89 fe bf 07 00 00 00 e8 0c d3 21 f7 41 83 ff 07 74 07 e8 91 d7 21 f7 <0f> 0b e8 8a d7 21 f7 48 89 df e8 d2 b2 ff ff 31 ff 89 c5 89 c6 e8 RSP: 0018:ffffc90000007448 EFLAGS: 00010246 RAX: 0000000000000000 RBX: ffff888031efc700 RCX: ffffffff8a65baf4 RDX: ffff888043222140 RSI: ffffffff8a65baff RDI: 0000000000000005 RBP: 0000000000000000 R08: 0000000000000005 R09: 0000000000000007 R10: 000000000000000b R11: 0000000000000000 R12: 1ffff92000000e89 R13: ffff88807a534d80 R14: ffff888021c11a00 R15: 000000000000000b FS: 0000000000000000(0000) GS:ffff8880b9800000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 00007fa19a0ffc81 CR3: 000000007a2db000 CR4: 00000000003506f0 DR0: 000000000000d8dd DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000ffff0ff0 DR7: 0000000000000400 Call Trace: tcp_data_ready+0x14c/0x5b0 net/ipv4/tcp_input.c:5128 tcp_data_queue+0x19c3/0x5190 net/ipv4/tcp_input.c:5208 tcp_rcv_state_process+0x11ef/0x4e10 net/ipv4/tcp_input.c:6844 tcp_v4_do_rcv+0x369/0xa10 net/ipv4/tcp_ipv4.c:1929 tcp_v4_rcv+0x3888/0x3b30 net/ipv4/tcp_ipv4.c:2329 ip_protocol_deliver_rcu+0x9f/0x480 net/ipv4/ip_input.c:205 ip_local_deliver_finish+0x2e4/0x510 net/ipv4/ip_input.c:233 NF_HOOK include/linux/netfilter.h:314 [inline] NF_HOOK include/linux/netfilter.h:308 [inline] ip_local_deliver+0x1b6/0x550 net/ipv4/ip_input.c:254 dst_input include/net/dst.h:461 [inline] ip_rcv_finish+0x1c4/0x2e0 net/ipv4/ip_input.c:449 NF_HOOK include/linux/netfilter.h:314 [inline] NF_HOOK include/linux/netfilter.h:308 [inline] ip_rcv+0xce/0x440 net/ipv4/ip_input.c:569 __netif_receive_skb_one_core+0x115/0x180 net/core/dev.c:5527 __netif_receive_skb+0x1f/0x1b0 net/core/dev.c:5641 process_backlog+0x101/0x6b0 net/core/dev.c:5969 __napi_poll.constprop.0+0xb4/0x540 net/core/dev.c:6531 napi_poll net/core/dev.c:6600 [inline] net_rx_action+0x956/0xe90 net/core/dev.c:6733 __do_softirq+0x21a/0x968 kernel/softirq.c:553 do_softirq kernel/softirq.c:454 [inline] do_softirq+0xaa/0xe0 kernel/softirq.c:441 __local_bh_enable_ip+0xf8/0x120 kernel/softirq.c:381 spin_unlock_bh include/linux/spinlock.h:396 [inline] batadv_nc_purge_paths+0x1ce/0x3c0 net/batman-adv/network-coding.c:471 batadv_nc_worker+0x9b1/0x10e0 net/batman-adv/network-coding.c:722 process_one_work+0x884/0x15c0 kernel/workqueue.c:2630 process_scheduled_works kernel/workqueue.c:2703 [inline] worker_thread+0x8b9/0x1290 kernel/workqueue.c:2784 kthread+0x33c/0x440 kernel/kthread.c:388 ret_from_fork+0x45/0x80 arch/x86/kernel/process.c:147 ret_from_fork_asm+0x11/0x20 arch/x86/entry/entry_64.S:242 To address the issue, catch the racing subflow state change and use it to cause the MPTCP fallback. Such fallback is also used to cause the first subflow state propagation to the msk socket via mptcp_set_connected(). After this change, the first subflow can additionally propagate the TCP_FIN_WAIT1 state, so rename the helper accordingly. Finally, if the state propagation is delayed to the msk release callback, the first subflow can change to a different state in between. Cache the relevant target state in a new msk-level field and use such value to update the msk state at release time. Fixes: 1e777f39b4d7 ("mptcp: add MSG_FASTOPEN sendmsg flag support") Cc: stable@vger.kernel.org Reported-by: Closes: https://github.com/multipath-tcp/mptcp_net-next/issues/458 Signed-off-by: Paolo Abeni Reviewed-by: Mat Martineau Signed-off-by: Matthieu Baerts Signed-off-by: David S. Miller --- net/mptcp/protocol.c | 6 +++--- net/mptcp/protocol.h | 9 ++++++--- net/mptcp/subflow.c | 28 +++++++++++++++++----------- 3 files changed, 26 insertions(+), 17 deletions(-) diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c index bc81ea53a04992..5cd5c3f535a821 100644 --- a/net/mptcp/protocol.c +++ b/net/mptcp/protocol.c @@ -3402,12 +3402,12 @@ static void mptcp_release_cb(struct sock *sk) if (__test_and_clear_bit(MPTCP_CLEAN_UNA, &msk->cb_flags)) __mptcp_clean_una_wakeup(sk); if (unlikely(msk->cb_flags)) { - /* be sure to set the current sk state before taking actions + /* be sure to sync the msk state before taking actions * depending on sk_state (MPTCP_ERROR_REPORT) * On sk release avoid actions depending on the first subflow */ - if (__test_and_clear_bit(MPTCP_CONNECTED, &msk->cb_flags) && msk->first) - __mptcp_set_connected(sk); + if (__test_and_clear_bit(MPTCP_SYNC_STATE, &msk->cb_flags) && msk->first) + __mptcp_sync_state(sk, msk->pending_state); if (__test_and_clear_bit(MPTCP_ERROR_REPORT, &msk->cb_flags)) __mptcp_error_report(sk); if (__test_and_clear_bit(MPTCP_SYNC_SNDBUF, &msk->cb_flags)) diff --git a/net/mptcp/protocol.h b/net/mptcp/protocol.h index fe6f2d399ee898..aa1a93fe40ffaf 100644 --- a/net/mptcp/protocol.h +++ b/net/mptcp/protocol.h @@ -124,7 +124,7 @@ #define MPTCP_ERROR_REPORT 3 #define MPTCP_RETRANSMIT 4 #define MPTCP_FLUSH_JOIN_LIST 5 -#define MPTCP_CONNECTED 6 +#define MPTCP_SYNC_STATE 6 #define MPTCP_SYNC_SNDBUF 7 struct mptcp_skb_cb { @@ -296,6 +296,9 @@ struct mptcp_sock { bool use_64bit_ack; /* Set when we received a 64-bit DSN */ bool csum_enabled; bool allow_infinite_fallback; + u8 pending_state; /* A subflow asked to set this sk_state, + * protected by the msk data lock + */ u8 mpc_endpoint_id; u8 recvmsg_inq:1, cork:1, @@ -728,7 +731,7 @@ void mptcp_get_options(const struct sk_buff *skb, struct mptcp_options_received *mp_opt); void mptcp_finish_connect(struct sock *sk); -void __mptcp_set_connected(struct sock *sk); +void __mptcp_sync_state(struct sock *sk, int state); void mptcp_reset_tout_timer(struct mptcp_sock *msk, unsigned long fail_tout); static inline void mptcp_stop_tout_timer(struct sock *sk) @@ -1115,7 +1118,7 @@ static inline bool subflow_simultaneous_connect(struct sock *sk) { struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk); - return sk->sk_state == TCP_ESTABLISHED && + return (1 << sk->sk_state) & (TCPF_ESTABLISHED | TCPF_FIN_WAIT1) && is_active_ssk(subflow) && !subflow->conn_finished; } diff --git a/net/mptcp/subflow.c b/net/mptcp/subflow.c index a4f3c27f0309f9..6d7684c35e9331 100644 --- a/net/mptcp/subflow.c +++ b/net/mptcp/subflow.c @@ -419,22 +419,28 @@ static bool subflow_use_different_dport(struct mptcp_sock *msk, const struct soc return inet_sk(sk)->inet_dport != inet_sk((struct sock *)msk)->inet_dport; } -void __mptcp_set_connected(struct sock *sk) +void __mptcp_sync_state(struct sock *sk, int state) { - __mptcp_propagate_sndbuf(sk, mptcp_sk(sk)->first); + struct mptcp_sock *msk = mptcp_sk(sk); + + __mptcp_propagate_sndbuf(sk, msk->first); if (sk->sk_state == TCP_SYN_SENT) { - inet_sk_state_store(sk, TCP_ESTABLISHED); + inet_sk_state_store(sk, state); sk->sk_state_change(sk); } } -static void mptcp_set_connected(struct sock *sk) +static void mptcp_propagate_state(struct sock *sk, struct sock *ssk) { + struct mptcp_sock *msk = mptcp_sk(sk); + mptcp_data_lock(sk); - if (!sock_owned_by_user(sk)) - __mptcp_set_connected(sk); - else - __set_bit(MPTCP_CONNECTED, &mptcp_sk(sk)->cb_flags); + if (!sock_owned_by_user(sk)) { + __mptcp_sync_state(sk, ssk->sk_state); + } else { + msk->pending_state = ssk->sk_state; + __set_bit(MPTCP_SYNC_STATE, &msk->cb_flags); + } mptcp_data_unlock(sk); } @@ -496,7 +502,7 @@ static void subflow_finish_connect(struct sock *sk, const struct sk_buff *skb) subflow_set_remote_key(msk, subflow, &mp_opt); MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_MPCAPABLEACTIVEACK); mptcp_finish_connect(sk); - mptcp_set_connected(parent); + mptcp_propagate_state(parent, sk); } else if (subflow->request_join) { u8 hmac[SHA256_DIGEST_SIZE]; @@ -540,7 +546,7 @@ static void subflow_finish_connect(struct sock *sk, const struct sk_buff *skb) } else if (mptcp_check_fallback(sk)) { fallback: mptcp_rcv_space_init(msk, sk); - mptcp_set_connected(parent); + mptcp_propagate_state(parent, sk); } return; @@ -1740,7 +1746,7 @@ static void subflow_state_change(struct sock *sk) mptcp_rcv_space_init(msk, sk); pr_fallback(msk); subflow->conn_finished = 1; - mptcp_set_connected(parent); + mptcp_propagate_state(parent, sk); } /* as recvmsg() does not acquire the subflow socket for ssk selection From a8f570b247972775f710375125ebabfc47b1e518 Mon Sep 17 00:00:00 2001 From: Matthieu Baerts Date: Fri, 15 Dec 2023 17:04:26 +0100 Subject: [PATCH 11/12] mptcp: fill in missing MODULE_DESCRIPTION() W=1 builds warn on missing MODULE_DESCRIPTION, add them here in MPTCP. Only two were missing: two modules with different KUnit tests for MPTCP. Reviewed-by: Mat Martineau Signed-off-by: Matthieu Baerts Signed-off-by: David S. Miller --- net/mptcp/crypto_test.c | 1 + net/mptcp/token_test.c | 1 + 2 files changed, 2 insertions(+) diff --git a/net/mptcp/crypto_test.c b/net/mptcp/crypto_test.c index 017248dea038c1..220414e5c850f4 100644 --- a/net/mptcp/crypto_test.c +++ b/net/mptcp/crypto_test.c @@ -70,3 +70,4 @@ static struct kunit_suite mptcp_crypto_suite = { kunit_test_suite(mptcp_crypto_suite); MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("KUnit tests for MPTCP Crypto"); diff --git a/net/mptcp/token_test.c b/net/mptcp/token_test.c index 0758865ab658ea..bfff53e668da68 100644 --- a/net/mptcp/token_test.c +++ b/net/mptcp/token_test.c @@ -143,3 +143,4 @@ static struct kunit_suite mptcp_token_suite = { kunit_test_suite(mptcp_token_suite); MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("KUnit tests for MPTCP Token"); From 356c71c46169d5f3ff7f9ae939d73aceb3b2e514 Mon Sep 17 00:00:00 2001 From: Geliang Tang Date: Fri, 15 Dec 2023 17:04:27 +0100 Subject: [PATCH 12/12] mailmap: add entries for Geliang Tang Map Geliang's old mail addresses to his @linux.dev one. Suggested-by: Mat Martineau Signed-off-by: Geliang Tang Reviewed-by: Matthieu Baerts Signed-off-by: Matthieu Baerts Signed-off-by: David S. Miller --- .mailmap | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/.mailmap b/.mailmap index 3ac1c12545f201..68e72a6017a0ee 100644 --- a/.mailmap +++ b/.mailmap @@ -191,6 +191,10 @@ Gao Xiang Gao Xiang Gao Xiang Gao Xiang +Geliang Tang +Geliang Tang +Geliang Tang +Geliang Tang Georgi Djakov Gerald Schaefer Gerald Schaefer