基于x86-64 Linux-5.0.1的Socket与系统调用深度分析

一、Socket API编程接口

Libc库中定义的一些应用编程接口(Application Program Interface, API)引用了封装例程(Wrapper Routine),一般一个封装例程对应一个系统调用,大部分封装例程返回一个整数,其值含义依赖于相应的系统调用,-1在多数情况下表示内核不能满足进程的请求,Libc中定义的errno变量包含特定的出错码。C语言中的Socket API就是一种涉及系统调用的API,常用的函数如下:

int socket(int domain, int type, int protocol)
//创建一个新的套接字,返回套接字描述符
int connect(int sockfd, struct sockaddr *server_addr, int sockaddr_len)
//同远程服务器主动连接,成功时返回0,失败时返回-1
int bind(int sockfd, struct sockaddr* my_addr, int addrlen)
//为套接字指明一个本地端点地址,TCP/IP协议使用sockaddr_in结构,包含IP地址和端口号,服务器使用它来指明熟知的端口号,然后等待连接
int listen(int sockfd, int input_queue_size)
//面向连接的服务器指明某个套接字,将其置为被动模式,并准备接收传入连接
int accept(int sockfd, void* addr, int* addrlen)
//获取传入连接请求,返回新的连接套接字描述符,为每个新连接请求创建一个新的套接字,服务器只对新的连接使用该套接字,原来的监听套接字接受其他的连接请求。新的连接上传输数据使用新的套接字
int sendto(int sockfd, const void* data, int data_len, unsigned int flags, struct sockaddr* remaddr,int remaddr_len)
//基于UDP发送数据报,返回实际发送的数据长度,出错时返回-1
int send(int sockfd, const void* data, int data_len, unsigned int flags)
//在TCP连接上发送数据,返回成功传送数据的长度,出错时返回-1,将外发数据复制到OS内核中
int recvfrom(int sockfd, void *buf, int buf_len,unsigned int flags,struct sockaddr *from,int *fromlen);
//从UDP接收数据,返回实际接收的字节数,失败时返回-1
int recv(int sockfd, void* buf, int buf_len,unsigned int flags) 
//从TCP接收数据,返回实际接收的数据长度,出错时返回-1。服务器使用其接收客户请求,客户使用它接收服务器的应答。如果没有数据,将阻塞;如果收到的数据大于缓存的大小,TCP(流式套接字)会把多余的数据保留在内核接收缓冲区中等待下次读取,只有数据报套接字(如UDP)才会丢弃多余的数据
close(int sockfd)
//撤销套接字,如果只有一个进程使用,立即终止连接并撤销该套接字,如果多个进程共享该套接字,将引用数减一,如果引用数降到零,则撤销它

基于x86-64 Linux-5.0.1的Socket与系统调用深度分析

图1 UDP连接涉及的Socket API

基于x86-64 Linux-5.0.1的Socket与系统调用深度分析

图2 TCP连接涉及的Socket API

二、系统调用机制及内核中相关源代码

基于x86-64 Linux-5.0.1的Socket与系统调用深度分析

 图3 应用程序、封装例程、系统调用处理程序及系统调用服务例程之间的关系

x86-64 Linux系统启动时依次调用以下过程:start_kernel --> trap_init --> cpu_init --> syscall_init,而syscall_init函数完成系统调用的初始化:它通过写MSR寄存器(如MSR_LSTAR)把系统调用入口与处理程序entry_SYSCALL_64绑定(x86-64使用syscall指令,而不是中断向量)。除此之外,还要进行系统调用表(对应于sys_call_table数组)的初始化。在linux-5.0.1/arch/x86/kernel/cpu/common.c中定义了syscall_init函数:

/*
 * syscall_init() - write the model-specific registers (MSRs) that hold the
 * system call entry points and related settings.
 *
 * Takes no arguments and returns nothing; its whole effect is the sequence
 * of MSR writes below.
 *
 * May not be marked __init: used by software suspend.
 */
void syscall_init(void)
{
    /* MSR_STAR: written with 0 and the __USER32_CS / __KERNEL_CS selectors. */
    wrmsr(MSR_STAR, 0, (__USER32_CS << 16) | __KERNEL_CS);
    /* MSR_LSTAR: address of the 64-bit entry routine entry_SYSCALL_64. */
    wrmsrl(MSR_LSTAR, (unsigned long)entry_SYSCALL_64);

#ifdef CONFIG_IA32_EMULATION
    /* With IA32 emulation: install the compat SYSCALL and SYSENTER entries. */
    wrmsrl(MSR_CSTAR, (unsigned long)entry_SYSCALL_compat);
    /*
     * This only works on Intel CPUs.
     * On AMD CPUs these MSRs are 32-bit, CPU truncates MSR_IA32_SYSENTER_EIP.
     * This does not cause SYSENTER to jump to the wrong location, because
     * AMD doesn't allow SYSENTER in long mode (either 32- or 64-bit).
     */
    wrmsrl_safe(MSR_IA32_SYSENTER_CS, (u64)__KERNEL_CS);
    wrmsrl_safe(MSR_IA32_SYSENTER_ESP,
            (unsigned long)(cpu_entry_stack(smp_processor_id()) + 1));
    wrmsrl_safe(MSR_IA32_SYSENTER_EIP, (u64)entry_SYSENTER_compat);
#else
    /*
     * Without IA32 emulation: MSR_CSTAR points at ignore_sysret and the
     * SYSENTER MSRs are set to an invalid segment / zero.
     */
    wrmsrl(MSR_CSTAR, (unsigned long)ignore_sysret);
    wrmsrl_safe(MSR_IA32_SYSENTER_CS, (u64)GDT_ENTRY_INVALID_SEG);
    wrmsrl_safe(MSR_IA32_SYSENTER_ESP, 0ULL);
    wrmsrl_safe(MSR_IA32_SYSENTER_EIP, 0ULL);
#endif

    /* Flags to clear on syscall */
    wrmsrl(MSR_SYSCALL_MASK,
           X86_EFLAGS_TF|X86_EFLAGS_DF|X86_EFLAGS_IF|
           X86_EFLAGS_IOPL|X86_EFLAGS_AC|X86_EFLAGS_NT);
}

在一个终端打开qemu启动MenuOS,在另一个终端用gdb读入linux-5.0.1的vmlinux,通过端口1234与qemu建立连接,在start_kernel,trap_init,cpu_init,syscall_init处设置断点,然后不断continue,跟踪验证内核启动及系统调用初始化过程,如下图所示:

基于x86-64 Linux-5.0.1的Socket与系统调用深度分析

基于x86-64 Linux-5.0.1的Socket与系统调用深度分析

当用户态程序进行系统调用时,CPU会切换到内核态并开始执行一个内核函数,内核实现了很多不同的系统调用,进程必须传递一个叫作系统调用号的参数来指明需要哪个系统调用。对于x86-64系统来说,用户态程序发起系统调用时,进程会跳转到entry_SYSCALL_64,在linux-5.0.1/arch/x86/entry/entry_64.S和linux-5.0.1/arch/x86/entry/common.c中定义了x86-64的系统调用服务例程:

//以下代码来自linux-5.0.1/arch/x86/entry/entry_64.S
GLOBAL(entry_SYSCALL_64_after_hwframe)
...
    /* IRQs are off. */
    movq    %rax, %rdi
    movq    %rsp, %rsi
    call    do_syscall_64        /* returns with IRQs disabled */
...
//以下代码来自linux-5.0.1/arch/x86/entry/common.c
#ifdef CONFIG_X86_64
/*
 * do_syscall_64() - dispatch one system call (excerpt; "..." marks lines
 * omitted from the original source).
 *
 * @nr:   system call number.
 * @regs: saved register frame; the selected handler receives it as its only
 *        argument and its result is stored in regs->ax.
 *
 * The assembly excerpt from entry_64.S moves %rax into %rdi and %rsp into
 * %rsi immediately before calling this function.
 */
__visible void do_syscall_64(unsigned long nr, struct pt_regs *regs)
{
...
    /* Only numbers below NR_syscalls are dispatched through the table. */
    if (likely(nr < NR_syscalls)) {
        /* Clamp the index via array_index_nospec() before using it. */
        nr = array_index_nospec(nr, NR_syscalls);
        /* Call the table entry for nr and store its result in regs->ax. */
        regs->ax = sys_call_table[nr](regs);
...
}
#endif

三、socket相关系统调用的内核处理函数

在linux-5.0.1/arch/x86/entry/syscalls/syscall_64.tbl中可查看x86-64系统调用号,及其对应API和入口(此处只摘取Socket相关的部分):

#
# 64-bit system call numbers and entry vectors
#
# The format is:
# <number> <abi> <name> <entry point>
#
# The __x64_sys_*() stubs are created on-the-fly for sys_*() system calls
#
# The abi is "common", "64" or "x32" for this file.
#
...
41    common    socket            __x64_sys_socket
42    common    connect            __x64_sys_connect
43    common    accept            __x64_sys_accept
44    common    sendto            __x64_sys_sendto
45    64    recvfrom        __x64_sys_recvfrom
46    64    sendmsg            __x64_sys_sendmsg
47    64    recvmsg            __x64_sys_recvmsg
48    common    shutdown        __x64_sys_shutdown
49    common    bind            __x64_sys_bind
50    common    listen            __x64_sys_listen
51    common    getsockname        __x64_sys_getsockname
52    common    getpeername        __x64_sys_getpeername
53    common    socketpair        __x64_sys_socketpair
54    64    setsockopt        __x64_sys_setsockopt
55    64    getsockopt        __x64_sys_getsockopt

在linux-5.0.1/net/socket.c中可以查看Socket接口对应的Linux内核系统调用处理函数:

/*
 *    System call vectors.
 *
 *    Argument checking cleaned up. Saved 20% in size.
 *  This function doesn't need to set the kernel lock because
 *  it is set by the callees.
 *
 *  socketcall(call, args): multiplexer that selects one socket operation by
 *  the integer `call` and forwards the arguments, copied from the user-space
 *  array `args`, to the matching __sys_*() helper.
 *
 *  Returns the helper's result, -EINVAL for an out-of-range or unhandled
 *  `call` (or an oversized argument length), -EFAULT if the arguments cannot
 *  be copied from user space, or the non-zero result of audit_socketcall().
 */

SYSCALL_DEFINE2(socketcall, int, call, unsigned long __user *, args)
{
    unsigned long a[AUDITSC_ARGS];
    unsigned long a0, a1;
    int err;
    unsigned int len;

    /* Accept only call numbers in [1, SYS_SENDMMSG]. */
    if (call < 1 || call > SYS_SENDMMSG)
        return -EINVAL;
    /* Clamp the index via array_index_nospec() before it indexes nargs[]. */
    call = array_index_nospec(call, SYS_SENDMMSG + 1);

    /* nargs[call] is the number of bytes of arguments to copy for this call. */
    len = nargs[call];
    if (len > sizeof(a))
        return -EINVAL;

    /* copy_from_user should be SMP safe. */
    if (copy_from_user(a, args, len))
        return -EFAULT;

    /* Pass the argument count (bytes / word size) and values to audit. */
    err = audit_socketcall(nargs[call] / sizeof(unsigned long), a);
    if (err)
        return err;

    /* The first two arguments are used by every case; name them once. */
    a0 = a[0];
    a1 = a[1];

    switch (call) {
    case SYS_SOCKET:
        err = __sys_socket(a0, a1, a[2]);
        break;
    case SYS_BIND:
        err = __sys_bind(a0, (struct sockaddr __user *)a1, a[2]);
        break;
    case SYS_CONNECT:
        err = __sys_connect(a0, (struct sockaddr __user *)a1, a[2]);
        break;
    case SYS_LISTEN:
        err = __sys_listen(a0, a1);
        break;
    case SYS_ACCEPT:
        /* accept is routed to __sys_accept4() with a fourth argument of 0. */
        err = __sys_accept4(a0, (struct sockaddr __user *)a1,
                    (int __user *)a[2], 0);
        break;
    case SYS_GETSOCKNAME:
        err =
            __sys_getsockname(a0, (struct sockaddr __user *)a1,
                      (int __user *)a[2]);
        break;
    case SYS_GETPEERNAME:
        err =
            __sys_getpeername(a0, (struct sockaddr __user *)a1,
                      (int __user *)a[2]);
        break;
    case SYS_SOCKETPAIR:
        err = __sys_socketpair(a0, a1, a[2], (int __user *)a[3]);
        break;
    case SYS_SEND:
        /* send is routed to __sys_sendto() with a NULL address and length 0. */
        err = __sys_sendto(a0, (void __user *)a1, a[2], a[3],
                   NULL, 0);
        break;
    case SYS_SENDTO:
        err = __sys_sendto(a0, (void __user *)a1, a[2], a[3],
                   (struct sockaddr __user *)a[4], a[5]);
        break;
    case SYS_RECV:
        /* recv is routed to __sys_recvfrom() with NULL address arguments. */
        err = __sys_recvfrom(a0, (void __user *)a1, a[2], a[3],
                     NULL, NULL);
        break;
    case SYS_RECVFROM:
        err = __sys_recvfrom(a0, (void __user *)a1, a[2], a[3],
                     (struct sockaddr __user *)a[4],
                     (int __user *)a[5]);
        break;
    case SYS_SHUTDOWN:
        err = __sys_shutdown(a0, a1);
        break;
    case SYS_SETSOCKOPT:
        err = __sys_setsockopt(a0, a1, a[2], (char __user *)a[3],
                       a[4]);
        break;
    case SYS_GETSOCKOPT:
        err =
            __sys_getsockopt(a0, a1, a[2], (char __user *)a[3],
                     (int __user *)a[4]);
        break;
    case SYS_SENDMSG:
        err = __sys_sendmsg(a0, (struct user_msghdr __user *)a1,
                    a[2], true);
        break;
    case SYS_SENDMMSG:
        err = __sys_sendmmsg(a0, (struct mmsghdr __user *)a1, a[2],
                     a[3], true);
        break;
    case SYS_RECVMSG:
        err = __sys_recvmsg(a0, (struct user_msghdr __user *)a1,
                    a[2], true);
        break;
    case SYS_RECVMMSG:
        /*
         * a[4] is passed either as a __kernel_timespec pointer or as an
         * old_timespec32 pointer, depending on the kernel configuration;
         * the other pointer argument is NULL.
         */
        if (IS_ENABLED(CONFIG_64BIT) || !IS_ENABLED(CONFIG_64BIT_TIME))
            err = __sys_recvmmsg(a0, (struct mmsghdr __user *)a1,
                         a[2], a[3],
                         (struct __kernel_timespec __user *)a[4],
                         NULL);
        else
            err = __sys_recvmmsg(a0, (struct mmsghdr __user *)a1,
                         a[2], a[3], NULL,
                         (struct old_timespec32 __user *)a[4]);
        break;
    case SYS_ACCEPT4:
        err = __sys_accept4(a0, (struct sockaddr __user *)a1,
                    (int __user *)a[2], a[3]);
        break;
    default:
        err = -EINVAL;
        break;
    }
    return err;
}

#endif                /* __ARCH_WANT_SYS_SOCKETCALL */

接下来我们将通过gdb跟踪MenuOS中replyhi和hello指令的执行过程来了解Socket相关系统调用的内核函数。先打开linuxnet/lab3/main.c查看指令的实现代码:

#include"syswrapper.h"
#define MAX_CONNECT_QUEUE   1024
/*
 * Replyhi - run the "reply hi" TCP service loop.
 *
 * The capitalised calls below are statement macros from syswrapper.h, not
 * functions. InitializeService() expands to PrepareSocket + InitServer: it
 * declares sockfd, serveraddr, clientaddr, addr_len and ret in this scope,
 * binds and listens, and executes `return -1` from this function if bind()
 * fails.
 *
 * Returns -1 if bind() fails. Otherwise the loop below has no exit, so
 * ShutdownService() and `return 0` are never reached.
 */
int Replyhi()
{
    char szBuf[MAX_BUF_LEN] = "\0";
    char szReplyMsg[MAX_BUF_LEN] = "hi\0";
    InitializeService();
    while (1)
    {
        /* accept() one connection; declares newfd for this iteration. */
        ServiceStart();
        /* recv() into szBuf, then send() szReplyMsg, on newfd. */
        RecvMsg(szBuf);
        SendMsg(szReplyMsg);
        /* close() the per-connection descriptor newfd. */
        ServiceStop();
    }
    ShutdownService();
    return 0;
}

/*
 * StartReplyhi - fork a child process that runs the Replyhi() service.
 *
 * argc and argv are accepted but not used.
 *
 * Returns 0 in every process that returns from this function. If fork()
 * fails, an error is printed and the process exits with status -1.
 */
int StartReplyhi(int argc, char *argv[])
{
    int pid;
    /* fork another process */
    pid = fork();
    if (pid < 0)
    {
        /* error occurred */
        fprintf(stderr, "Fork Failed!");
        exit(-1);
    }
    else if (pid == 0)
    {
        /*     child process     */
        /*
         * Replyhi() loops forever once the service is set up, so the
         * message below is printed only if Replyhi() returns early.
         */
        Replyhi();
        printf("Reply hi TCP Service Started!\n");
    }
    else
    {
        /*     parent process     */
        printf("Please input hello...\n");
    }
    /*
     * The function is declared int: return a defined value on the child and
     * parent paths instead of falling off the end.
     */
    return 0;
}

/*
 * Hello - client side: connect to the service, send "hello", read the reply.
 *
 * argc and argv are accepted but not used. The capitalised calls are
 * statement macros from syswrapper.h. OpenRemoteService() expands to
 * PrepareSocket + InitClient, declares sockfd/serveraddr/clientaddr/
 * addr_len/ret/newfd in this scope, and executes `return -1` from this
 * function if connect() fails.
 *
 * Returns 0 after the exchange, or -1 if connect() fails.
 */
int Hello(int argc, char *argv[])
{
    char szBuf[MAX_BUF_LEN] = "\0";
    char szMsg[MAX_BUF_LEN] = "hello\0";
    OpenRemoteService();
    /* send() szMsg, then recv() the answer into szBuf, on newfd. */
    SendMsg(szMsg);
    RecvMsg(szBuf);
    /* close() the connected socket. */
    CloseRemoteService();
    return 0;
}

不难发现Replyhi()和Hello()调用了封装函数InitializeService(),ServiceStart(),RecvMsg(szBuf),SendMsg(),ServiceStop(),OpenRemoteService(),CloseRemoteService(),打开linuxnet/lab3/syswrapper.h查看这些封装函数的实现:

/********************************************************************/
/* Copyright (C) SSE-USTC, 2012                                     */
/*                                                                  */
/*  FILE NAME             :  syswraper.h                            */
/*  PRINCIPAL AUTHOR      :  Mengning                               */
/*  SUBSYSTEM NAME        :  system                                 */
/*  MODULE NAME           :  syswraper                              */
/*  LANGUAGE              :  C                                      */
/*  TARGET ENVIRONMENT    :  Linux                                  */
/*  DATE OF FIRST RELEASE :  2012/11/22                             */
/*  DESCRIPTION           :  the interface to Linux system(socket)  */
/********************************************************************/

/*
 * Revision log:
 *
 * Created by Mengning,2012/11/22
 *
 */

#ifndef _SYS_WRAPER_H_
#define _SYS_WRAPER_H_

#include<stdio.h> 
#include<arpa/inet.h> /* internet socket */
#include<string.h>
#include<unistd.h> /* close() */
//#define NDEBUG
#include<assert.h>

#define PORT                5001
#define IP_ADDR             "127.0.0.1"
#define MAX_BUF_LEN         1024

/*
 * Usage notes for every macro below:
 *  - They expand to plain statement sequences, not expressions. Several of
 *    them declare locals in the caller's scope (sockfd, serveraddr,
 *    clientaddr, addr_len, ret, newfd) that the later macros use by name.
 *  - InitServer() and InitClient() execute `return -1` in the calling
 *    function on failure, so they must be used inside a function returning
 *    int.
 *  - InitServer() expects the including file to define MAX_CONNECT_QUEUE.
 *  - RecvMsg(buf) expects buf to have room for MAX_BUF_LEN bytes.
 */

/* private macro */

/*
 * Declare the socket state and create a TCP socket for addr:port.
 * clientaddr is zeroed so the logging macros never read indeterminate bytes.
 */
#define PrepareSocket(addr,port)                                \
        int sockfd = -1;                                        \
        struct sockaddr_in serveraddr;                          \
        struct sockaddr_in clientaddr;                          \
        socklen_t addr_len = sizeof(struct sockaddr);           \
        memset(&clientaddr, 0, sizeof(clientaddr));             \
        serveraddr.sin_family = AF_INET;                        \
        serveraddr.sin_port = htons(port);                      \
        serveraddr.sin_addr.s_addr = inet_addr(addr);           \
        memset(&serveraddr.sin_zero, 0, 8);                     \
        sockfd = socket(PF_INET,SOCK_STREAM,0);

/* Bind sockfd to serveraddr and listen; on bind failure close and return -1. */
#define InitServer()                                            \
        int ret = bind( sockfd,                                 \
                        (struct sockaddr *)&serveraddr,         \
                        sizeof(struct sockaddr));               \
        if(ret == -1)                                           \
        {                                                       \
            fprintf(stderr,"Bind Error,%s:%d\n",                \
                    __FILE__,__LINE__);                         \
            close(sockfd);                                      \
            return -1;                                          \
        }                                                       \
        listen(sockfd,MAX_CONNECT_QUEUE);

/*
 * Connect sockfd to serveraddr; on failure close the socket (as InitServer
 * does) and return -1 from the calling function.
 */
#define InitClient()                                            \
        int ret = connect(sockfd,                               \
            (struct sockaddr *)&serveraddr,                     \
            sizeof(struct sockaddr));                           \
        if(ret == -1)                                           \
        {                                                       \
            fprintf(stderr,"Connect Error,%s:%d\n",             \
                __FILE__,__LINE__);                             \
            close(sockfd);                                      \
            return -1;                                          \
        }

/* public macro */

/* Server side: create, bind and listen on IP_ADDR:PORT. */
#define InitializeService()                             \
        PrepareSocket(IP_ADDR,PORT);                    \
        InitServer();

/* Server side: close the listening socket. */
#define ShutdownService()                               \
        close(sockfd);

/*
 * Client side: create a socket and connect to IP_ADDR:PORT. newfd aliases
 * sockfd so RecvMsg/SendMsg work unchanged, and clientaddr is set to the
 * server address because those macros log clientaddr as the peer.
 */
#define OpenRemoteService()                             \
        PrepareSocket(IP_ADDR,PORT);                    \
        InitClient();                                   \
        int newfd = sockfd;                             \
        clientaddr = serveraddr;

/* Client side: close the connected socket. */
#define CloseRemoteService()                            \
        close(sockfd);

/* Server side: accept one connection into newfd; log if accept() fails. */
#define ServiceStart()                                          \
        int newfd = accept( sockfd,                             \
                    (struct sockaddr *)&clientaddr,             \
                    &addr_len);                                 \
        if(newfd == -1)                                         \
        {                                                       \
            fprintf(stderr,"Accept Error,%s:%d\n",              \
                    __FILE__,__LINE__);                         \
        }

/* Server side: close the per-connection socket. */
#define ServiceStop()                                   \
        close(newfd);

/*
 * Receive at most MAX_BUF_LEN - 1 bytes into buf and NUL-terminate them
 * before printing, so "%s" never reads past the received data or shows
 * stale bytes left over from an earlier, longer message.
 */
#define RecvMsg(buf)                                            \
        ret = recv(newfd,buf,MAX_BUF_LEN - 1,0);                \
        if(ret > 0)                                             \
        {                                                       \
            (buf)[ret] = '\0';                                  \
            printf("recv \"%s\" from %s:%d\n",                  \
                buf,                                            \
                (char*)inet_ntoa(clientaddr.sin_addr),          \
                ntohs(clientaddr.sin_port));                    \
        }

/* Send the NUL-terminated string buf on newfd and log what was sent. */
#define SendMsg(buf)                                            \
        ret = send(newfd,buf,strlen(buf),0);                    \
        if(ret > 0)                                             \
        {                                                       \
            printf("send \"%s\" to %s:%d\n",                    \
                buf,                                            \
                (char*)inet_ntoa(clientaddr.sin_addr),          \
                ntohs(clientaddr.sin_port));                    \
        }

#endif /* _SYS_WRAPER_H_ */

不难发现,过程中涉及到的Socket相关API有socket(),bind(),listen(),accept(),recv(),send(),close(),connect()。于是我们在一个终端打开qemu启动MenuOS(指令中去掉 -S),在另一个终端用gdb读入linux-5.0.1的vmlinux,通过端口1234与qemu建立连接,在相关的系统调用内核处理函数处设置断点,如下图所示:

基于x86-64 Linux-5.0.1的Socket与系统调用深度分析

先在gdb输入一次continue令MenuOS完成启动,然后在qemu中输入replyhi,再在gdb中不断continue,显示的调用过程如下:

基于x86-64 Linux-5.0.1的Socket与系统调用深度分析

然后在qemu中输入hello,再在gdb中不断continue,显示的调用过程如下:

基于x86-64 Linux-5.0.1的Socket与系统调用深度分析

基于x86-64 Linux-5.0.1的Socket与系统调用深度分析

最后查看qemu发现指令已经完整运行:

基于x86-64 Linux-5.0.1的Socket与系统调用深度分析

 参考文献:

1.https://github.com/torvalds/linux

相关推荐