求助,使用内核定时器时遇到一些问题

内核编译和嵌入式产品的设计与开发
回复
fufen
帖子: 2
注册时间: 2024-04-19 14:07

求助,使用内核定时器时遇到一些问题

#1

帖子 fufen » 2024-04-19 14:15

各位大佬好,我想写一个内核模块,在UDP层之上实现一个可靠传输协议,但在使用内核定时器时遇到了一些问题。我的内核版本是Linux 6.5.1,系统是Ubuntu 22.04。Bug的主要表现是:关掉测试进程一段时间之后,内核会随机地报Oops,而Oops里的调用栈(call trace)我实在看不懂。以下是简化之后的代码,即使不涉及其他逻辑,这个bug依旧存在:

代码: 全选

#include <linux/init.h>
#include <linux/module.h>
#include <linux/kernel.h>
#include <net/protocol.h>
#include <net/sock.h>
#include <net/tcp.h>
#include <net/inet_common.h>
#include <net/udp.h>

#define IPPROTO_RUDP 141

MODULE_LICENSE("GPL");
MODULE_AUTHOR("Zhou hongfeng");
MODULE_DESCRIPTION("A simple example Linux module.");
MODULE_VERSION("0.01");

/* Protocol descriptor for RUDP; left zeroed here and filled in by init_RUDP(). */
static struct proto rudp_prot;
/* IPv4 receive/error hooks for RUDP; filled in by init_RUDP(). */
static struct net_protocol rudp_protocol;

/*
 * Socket-switch entry registered with inet_register_protosw(): maps
 * SOCK_DGRAM sockets created with protocol IPPROTO_RUDP onto rudp_prot,
 * reusing the generic inet_dgram_ops socket operations.
 */
static struct inet_protosw RUDP_inetsw =
{
		.type = SOCK_DGRAM,
		.protocol = IPPROTO_RUDP,
		.prot = &rudp_prot,
		.ops = &inet_dgram_ops,
		.flags = INET_PROTOSW_REUSE,
};

/*
 * Per-socket state for RUDP.
 *
 * The code in this file obtains a struct rudp_sock by casting a
 * struct sock pointer, so usock must stay the first member.
 * NOTE(review): the socket object must be allocated with at least
 * sizeof(struct rudp_sock) bytes (struct proto's obj_size); otherwise
 * the timer field lies outside the allocation.
 */
struct rudp_sock{
	struct udp_sock usock;		/* embedded UDP socket; must be first */
	struct timer_list timer;	/* periodic (retransmit) timer */
};

/*
 * retransmit_handler - periodic timer callback of an RUDP socket.
 * @t: the expired timer, embedded in a struct rudp_sock.
 *
 * Logs the expiry and re-arms the same timer to fire again HZ jiffies
 * (one second) from now, so the timer keeps running until it is deleted
 * elsewhere.
 */
void retransmit_handler(struct timer_list *t)
{
	/* Recover the enclosing rudp_sock from its embedded timer. */
	struct rudp_sock *rsock = from_timer(rsock, t, timer);

	printk("timer activated!\n");
	mod_timer(&rsock->timer, jiffies + HZ);
}

int rudp_init(struct sock *sk)
{
	printk(KERN_INFO "Init sock!\n");
	struct rudp_sock *rsock = (struct rudp_sock*)(sk);

	timer_setup(&rsock->timer,retransmit_handler,0);

	mod_timer(&rsock->timer,jiffies+HZ);
	return udp_prot.init(sk);
}

/*
 * rudp_close - socket close hook (struct proto ->close).
 * @sk: the socket being closed.
 * @timeout: passed through unchanged to udp_prot.close().
 *
 * Stops the periodic timer before handing the socket to the UDP close
 * path. The callback re-arms itself, so timer_shutdown_sync() is used:
 * it waits for a running callback to finish and turns any later attempt
 * to re-arm the timer into a no-op.
 */
void rudp_close(struct sock *sk, long timeout)
{
	struct rudp_sock *rsock = (struct rudp_sock *)sk;

	timer_shutdown_sync(&rsock->timer);

	/* refcount_read() returns an unsigned value, hence %u. */
	printk("ref: %u\n", refcount_read(&sk->sk_refcnt));

	udp_prot.close(sk, timeout);
}

int rudp_rcv(struct sk_buff *skb)
{
	struct net_protocol *udp_protocol_ref = rcu_dereference(inet_protos[IPPROTO_UDP]);
    return udp_protocol_ref->handler(skb);
}

/*
 * rudp_err - error handler installed in rudp_protocol.err_handler.
 * @skb: the packet carrying the error.
 * @info: extra error information, passed through unchanged.
 *
 * Delegates to the error handler currently registered for IPPROTO_UDP.
 *
 * Return: the UDP error handler's return value, or -ENOENT when no such
 * handler is registered.
 */
int rudp_err(struct sk_buff *skb, u32 info)
{
	const struct net_protocol *udp_proto =
		rcu_dereference(inet_protos[IPPROTO_UDP]);

	/* The slot is RCU-managed and may be empty; never call through NULL. */
	if (!udp_proto || !udp_proto->err_handler)
		return -ENOENT;

	return udp_proto->err_handler(skb, info);
}

void init_RUDP(void)
{
	rudp_prot = udp_prot;
	memcpy(rudp_prot.name,"RUDP\0",5);
	// rudp_prot.sendmsg = rudp_sendmsg;
	// rudp_prot.recvmsg = rudp_recvmsg;
	rudp_prot.init = rudp_init;
	// rudp_prot.connect = rudp_connect;
	rudp_prot.close = rudp_close;
	// rudp_prot.accept = rudp_accept;

	rudp_protocol.handler = rudp_rcv;
	rudp_protocol.err_handler = rudp_err;
	rudp_protocol.no_policy = 1;
	return ;
}

/*
 * lkm_example_init - module entry point: register the RUDP protocol.
 *
 * Registers, in order, the struct proto (with its own slab cache), the
 * IPv4 protocol handler and the socket-switch entry. A failing step
 * undoes the earlier ones and its error code is returned, so the module
 * is never left loaded in a half-registered state.
 *
 * Return: 0 on success, a negative errno on failure.
 */
static int __init lkm_example_init(void)
{
	int res;

	init_RUDP();

	res = proto_register(&rudp_prot, 1);
	if (res < 0) {
		printk(KERN_ERR "proto_register error!\n");
		return res;
	}

	res = inet_add_protocol(&rudp_protocol, IPPROTO_RUDP);
	if (res < 0) {
		printk(KERN_ERR "proto add error!\n");
		proto_unregister(&rudp_prot);
		return res;
	}

	inet_register_protosw(&RUDP_inetsw);
	printk(KERN_INFO "Hello, World!\n");
	return 0;
}
/*
 * lkm_example_exit - module exit point: unregister the RUDP protocol.
 *
 * Tears down in the reverse order of registration: first the
 * socket-switch entry (no new sockets can be created), then the IPv4
 * protocol handler (no more packets are delivered), and only then the
 * struct proto together with its slab cache.
 */
static void __exit lkm_example_exit(void)
{
	printk(KERN_INFO "Goodbye, World!\n");
	inet_unregister_protosw(&RUDP_inetsw);
	inet_del_protocol(&rudp_protocol, IPPROTO_RUDP);
	proto_unregister(&rudp_prot);
}


module_init(lkm_example_init);
module_exit(lkm_example_exit);
然后下面是报的Oops:

代码: 全选

[  363.328047] BUG: unable to handle page fault for address: 00000001000021da
[  363.328067] #PF: supervisor read access in kernel mode
[  363.328072] #PF: error_code(0x0001) - permissions violation
[  363.328076] PGD 62a37067 P4D 62a37067 PUD 62a36067 PMD ed89067 PTE 80000000bbade865
[  363.328086] Oops: 0001 [#1] PREEMPT SMP NOPTI
[  363.328095] CPU: 6 PID: 69 Comm: khungtaskd Tainted: G           OE      6.5.1 #5
[  363.328100] Hardware name: VMware, Inc. VMware Virtual Platform/440BX Desktop Reference Platform, BIOS 6.00 11/12/2020
[  363.328104] RIP: 0010:watchdog+0x133/0x570
[  363.328130] Code: 48 8b 50 10 48 83 c0 10 4c 8d a2 a8 f5 ff ff 48 39 c2 0f 84 36 03 00 00 44 8d 79 ff 85 c9 0f 84 8e 01 00 00 48 89 75 b8 eb 40 <41> 8b 44 24 18 25 02 05 00 00 83 f8 02 0f 84 11 01 00 00 48 8b 83
[  363.328135] RSP: 0018:ffffb73b4066be90 EFLAGS: 00010203
[  363.328139] RAX: 0000000000000019 RBX: ffff9b9bf6896ed0 RCX: 00000000003ffece
[  363.328143] RDX: 0000000100003e00 RSI: ffff9b9bf6896600 RDI: 0000000000000000
[  363.328146] RBP: ffffb73b4066bee0 R08: 0000000000000000 R09: 0000000000000000
[  363.328149] R10: 0000000000000000 R11: 0000000000000000 R12: 00000001000021c2
[  363.328152] R13: 0000000100003e00 R14: 0000000000000000 R15: 00000000003ffecd
[  363.328156] FS:  0000000000000000(0000) GS:ffff9b9cf9f80000(0000) knlGS:0000000000000000
[  363.328159] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[  363.328163] CR2: 00000001000021da CR3: 0000000001fc6000 CR4: 0000000000350ee0
[  363.328198] Call Trace:
[  363.328207]  <TASK>
[  363.328222]  ? show_regs+0x6d/0x80
[  363.328240]  ? __die+0x24/0x80
[  363.328246]  ? page_fault_oops+0x176/0x500
[  363.328260]  ? raw_spin_rq_unlock+0x10/0x40
[  363.328276]  ? do_user_addr_fault+0x31d/0x6b0
[  363.328282]  ? exc_page_fault+0x83/0x1b0
[  363.328306]  ? asm_exc_page_fault+0x27/0x30
[  363.328322]  ? watchdog+0x133/0x570
[  363.328330]  ? __pfx_watchdog+0x10/0x10
[  363.328336]  kthread+0xf2/0x120
[  363.328349]  ? __pfx_kthread+0x10/0x10
[  363.328355]  ret_from_fork+0x47/0x70
[  363.328363]  ? __pfx_kthread+0x10/0x10
[  363.328369]  ret_from_fork_asm+0x1b/0x30
[  363.328384]  </TASK>
[  363.328386] Modules linked in: testModel(OE) isofs bnep vsock_loopback vmw_vsock_virtio_transport_common snd_ens1371 snd_ac97_codec vmw_vsock_vmci_transport vsock gameport intel_rapl_msr ac97_bus snd_pcm btusb btrtl btbcm intel_rapl_common btintel vmw_balloon btmtk snd_seq_midi crct10dif_pclmul bluetooth binfmt_misc polyval_clmulni polyval_generic snd_seq_midi_event ghash_clmulni_intel aesni_intel crypto_simd snd_rawmidi cryptd nls_iso8859_1 snd_seq input_leds joydev serio_raw snd_seq_device snd_timer ecdh_generic snd ecc vmw_vmci soundcore sch_fq_codel mac_hid vmwgfx drm_ttm_helper ttm drm_kms_helper drm msr parport_pc ppdev lp parport efi_pstore ip_tables x_tables autofs4 hid_generic crc32_pclmul psmouse usbhid mptspi mptscsih ahci hid mptbase e1000 scsi_transport_spi libahci i2c_piix4 pata_acpi [last unloaded: testModel(OE)]
[  363.328583] CR2: 00000001000021da
[  363.328589] ---[ end trace 0000000000000000 ]---
[  363.328592] RIP: 0010:watchdog+0x133/0x570
[  363.328637] Code: 48 8b 50 10 48 83 c0 10 4c 8d a2 a8 f5 ff ff 48 39 c2 0f 84 36 03 00 00 44 8d 79 ff 85 c9 0f 84 8e 01 00 00 48 89 75 b8 eb 40 <41> 8b 44 24 18 25 02 05 00 00 83 f8 02 0f 84 11 01 00 00 48 8b 83
[  363.328641] RSP: 0018:ffffb73b4066be90 EFLAGS: 00010203
[  363.328645] RAX: 0000000000000019 RBX: ffff9b9bf6896ed0 RCX: 00000000003ffece
[  363.328648] RDX: 0000000100003e00 RSI: ffff9b9bf6896600 RDI: 0000000000000000
[  363.328651] RBP: ffffb73b4066bee0 R08: 0000000000000000 R09: 0000000000000000
[  363.328653] R10: 0000000000000000 R11: 0000000000000000 R12: 00000001000021c2
[  363.328656] R13: 0000000100003e00 R14: 0000000000000000 R15: 00000000003ffecd
[  363.328659] FS:  0000000000000000(0000) GS:ffff9b9cf9f80000(0000) knlGS:0000000000000000
[  363.328663] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[  363.328666] CR2: 00000001000021da CR3: 0000000001fc6000 CR4: 0000000000350ee0
[  363.328689] note: khungtaskd[69] exited with irqs disabled
目前能确定的是:把定时器部分去掉就不会报错;或者把sock部分去掉、只用定时器,也不会报错。求助各位大佬,这个问题卡了我两个礼拜了,一直没找到答案。
头像
astolia
论坛版主
帖子: 6465
注册时间: 2008-09-18 13:11

Re: 求助,使用内核定时器时遇到一些问题

#2

帖子 astolia » 2024-04-22 13:06

fufen 写了: 2024-04-19 14:15

代码: 全选

/* Per-socket RUDP state: a UDP socket followed by an extra timer member. */
struct rudp_sock{
	struct udp_sock usock;
	struct timer_list timer;	/* extra member placed after the udp_sock */
};

代码: 全选

int rudp_init(struct sock *sk)
{
	printk(KERN_INFO "Init sock!\n");
	struct rudp_sock *rsock = (struct rudp_sock*)(sk);
	timer_setup(&rsock->timer,retransmit_handler,0);
看出问题没有?

没看出的话猜猜下面的代码输出是啥?执行逻辑和你的代码是一样的。

代码: 全选

#include <stdio.h>
/* Demo type: larger than the int it is later overlaid on, with a trailing member. */
struct x {
 long long n;
 int extra; /* lies beyond the first member, i.e. beyond a lone int */
};
/*
 * Deliberately faulty demo: treats a pointer to a single int as a pointer
 * to a struct x, stores 123 into the 'extra' member, then doubles *n.
 * The store to p->extra goes to memory that is not part of the int that
 * n points to (undefined behavior).
 */
void x2(int *n) {
  struct x *p = (struct x *)n;
  p->extra = 123; /* write lands outside the object n points to */
  *n *= 2;
}
/*
 * Calls x2() on the first three array elements and prints the array.
 * Because x2() writes through a mis-cast pointer, neighbouring elements
 * (or memory past the end of the array) may be overwritten, so the
 * printed values need not be the plain doubled inputs.
 */
int main() {
  int a[4] = {1, 2, 3, 4};
  x2(&a[0]);
  x2(&a[1]);
  x2(&a[2]);
  printf("%d, %d, %d, %d\n", a[0], a[1], a[2], a[3]);
  return 0;
}
fufen
帖子: 2
注册时间: 2024-04-19 14:07

Re: 求助,使用内核定时器时遇到一些问题

#3

帖子 fufen » 2024-04-22 14:46

astolia 写了: 2024-04-22 13:06
fufen 写了: 2024-04-19 14:15

代码: 全选

/* Per-socket RUDP state: a UDP socket followed by an extra timer member. */
struct rudp_sock{
	struct udp_sock usock;
	struct timer_list timer;	/* extra member placed after the udp_sock */
};

代码: 全选

int rudp_init(struct sock *sk)
{
	printk(KERN_INFO "Init sock!\n");
	struct rudp_sock *rsock = (struct rudp_sock*)(sk);
	timer_setup(&rsock->timer,retransmit_handler,0);
看出问题没有?

没看出的话猜猜下面的代码输出是啥?执行逻辑和你的代码是一样的。

代码: 全选

#include <stdio.h>
/* Demo type: larger than the int it is later overlaid on, with a trailing member. */
struct x {
 long long n;
 int extra; /* lies beyond the first member, i.e. beyond a lone int */
};
/*
 * Deliberately faulty demo: treats a pointer to a single int as a pointer
 * to a struct x, stores 123 into the 'extra' member, then doubles *n.
 * The store to p->extra goes to memory that is not part of the int that
 * n points to (undefined behavior).
 */
void x2(int *n) {
  struct x *p = (struct x *)n;
  p->extra = 123; /* write lands outside the object n points to */
  *n *= 2;
}
/*
 * Calls x2() on the first three array elements and prints the array.
 * Because x2() writes through a mis-cast pointer, neighbouring elements
 * (or memory past the end of the array) may be overwritten, so the
 * printed values need not be the plain doubled inputs.
 */
int main() {
  int a[4] = {1, 2, 3, 4};
  x2(&a[0]);
  x2(&a[1]);
  x2(&a[2]);
  printf("%d, %d, %d, %d\n", a[0], a[1], a[2], a[3]);
  return 0;
}
太屌了楼主,我知道是为什么了。proto这个结构体还有个obj_size成员,我忘记设置了(最初的版本有设置,但是重构了一遍代码后就忘了),结果它沿用了从udp_prot拷贝过来的大小,导致分配sock对象时给的内存不够放下整个rudp_sock,使用timer时就会访问到不属于这个对象的内存。
太感谢您了,这个bug我找了两三个礼拜,五体投地了!!
回复